]> git.proxmox.com Git - ceph.git/blame - ceph/src/osd/OSD.cc
buildsys: use download.ceph.com to download source tar ball
[ceph.git] / ceph / src / osd / OSD.cc
CommitLineData
7c673cae
FG
1// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
2// vim: ts=8 sw=2 smarttab
3/*
4 * Ceph - scalable distributed file system
5 *
6 * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
7 * Copyright (C) 2017 OVH
8 *
9 * This is free software; you can redistribute it and/or
10 * modify it under the terms of the GNU Lesser General Public
11 * License version 2.1, as published by the Free Software
12 * Foundation. See file COPYING.
13 *
14 */
11fdf7f2 15
7c673cae 16#include "acconfig.h"
11fdf7f2
TL
17
18#include <cctype>
7c673cae
FG
19#include <fstream>
20#include <iostream>
11fdf7f2
TL
21#include <iterator>
22
23#include <unistd.h>
7c673cae
FG
24#include <sys/stat.h>
25#include <signal.h>
7c673cae
FG
26#include <boost/scoped_ptr.hpp>
27
28#ifdef HAVE_SYS_PARAM_H
29#include <sys/param.h>
30#endif
31
32#ifdef HAVE_SYS_MOUNT_H
33#include <sys/mount.h>
34#endif
35
36#include "osd/PG.h"
37
38#include "include/types.h"
39#include "include/compat.h"
11fdf7f2 40#include "include/random.h"
7c673cae
FG
41
42#include "OSD.h"
43#include "OSDMap.h"
44#include "Watch.h"
45#include "osdc/Objecter.h"
46
47#include "common/errno.h"
48#include "common/ceph_argparse.h"
224ce89b 49#include "common/ceph_time.h"
7c673cae 50#include "common/version.h"
b5b8bbf5 51#include "common/pick_address.h"
11fdf7f2
TL
52#include "common/blkdev.h"
53#include "common/numa.h"
7c673cae
FG
54
55#include "os/ObjectStore.h"
56#ifdef HAVE_LIBFUSE
57#include "os/FuseStore.h"
58#endif
59
60#include "PrimaryLogPG.h"
61
7c673cae
FG
62#include "msg/Messenger.h"
63#include "msg/Message.h"
64
65#include "mon/MonClient.h"
66
67#include "messages/MLog.h"
68
69#include "messages/MGenericMessage.h"
7c673cae
FG
70#include "messages/MOSDPing.h"
71#include "messages/MOSDFailure.h"
72#include "messages/MOSDMarkMeDown.h"
73#include "messages/MOSDFull.h"
74#include "messages/MOSDOp.h"
75#include "messages/MOSDOpReply.h"
76#include "messages/MOSDBackoff.h"
77#include "messages/MOSDBeacon.h"
78#include "messages/MOSDRepOp.h"
79#include "messages/MOSDRepOpReply.h"
80#include "messages/MOSDBoot.h"
81#include "messages/MOSDPGTemp.h"
11fdf7f2 82#include "messages/MOSDPGReadyToMerge.h"
7c673cae
FG
83
84#include "messages/MOSDMap.h"
85#include "messages/MMonGetOSDMap.h"
86#include "messages/MOSDPGNotify.h"
87#include "messages/MOSDPGQuery.h"
88#include "messages/MOSDPGLog.h"
89#include "messages/MOSDPGRemove.h"
90#include "messages/MOSDPGInfo.h"
91#include "messages/MOSDPGCreate.h"
11fdf7f2 92#include "messages/MOSDPGCreate2.h"
7c673cae
FG
93#include "messages/MOSDPGTrim.h"
94#include "messages/MOSDPGScan.h"
7c673cae
FG
95#include "messages/MBackfillReserve.h"
96#include "messages/MRecoveryReserve.h"
c07f9fc5 97#include "messages/MOSDForceRecovery.h"
7c673cae
FG
98#include "messages/MOSDECSubOpWrite.h"
99#include "messages/MOSDECSubOpWriteReply.h"
100#include "messages/MOSDECSubOpRead.h"
101#include "messages/MOSDECSubOpReadReply.h"
102#include "messages/MOSDPGCreated.h"
103#include "messages/MOSDPGUpdateLogMissing.h"
104#include "messages/MOSDPGUpdateLogMissingReply.h"
105
11fdf7f2
TL
106#include "messages/MOSDPeeringOp.h"
107
7c673cae
FG
108#include "messages/MOSDAlive.h"
109
110#include "messages/MOSDScrub.h"
11fdf7f2 111#include "messages/MOSDScrub2.h"
7c673cae
FG
112#include "messages/MOSDRepScrub.h"
113
114#include "messages/MMonCommand.h"
115#include "messages/MCommand.h"
116#include "messages/MCommandReply.h"
117
118#include "messages/MPGStats.h"
119#include "messages/MPGStatsAck.h"
120
121#include "messages/MWatchNotify.h"
122#include "messages/MOSDPGPush.h"
123#include "messages/MOSDPGPushReply.h"
124#include "messages/MOSDPGPull.h"
125
126#include "common/perf_counters.h"
127#include "common/Timer.h"
128#include "common/LogClient.h"
129#include "common/AsyncReserver.h"
130#include "common/HeartbeatMap.h"
131#include "common/admin_socket.h"
132#include "common/ceph_context.h"
133
134#include "global/signal_handler.h"
135#include "global/pidfile.h"
136
137#include "include/color.h"
138#include "perfglue/cpu_profiler.h"
139#include "perfglue/heap_profiler.h"
140
141#include "osd/OpRequest.h"
142
143#include "auth/AuthAuthorizeHandler.h"
144#include "auth/RotatingKeyRing.h"
7c673cae
FG
145
146#include "objclass/objclass.h"
147
148#include "common/cmdparse.h"
149#include "include/str_list.h"
150#include "include/util.h"
151
11fdf7f2 152#include "include/ceph_assert.h"
7c673cae
FG
153#include "common/config.h"
154#include "common/EventTrace.h"
155
11fdf7f2
TL
156#include "json_spirit/json_spirit_reader.h"
157#include "json_spirit/json_spirit_writer.h"
158
7c673cae
FG
159#ifdef WITH_LTTNG
160#define TRACEPOINT_DEFINE
161#define TRACEPOINT_PROBE_DYNAMIC_LINKAGE
162#include "tracing/osd.h"
163#undef TRACEPOINT_PROBE_DYNAMIC_LINKAGE
164#undef TRACEPOINT_DEFINE
165#else
166#define tracepoint(...)
167#endif
168
169#define dout_context cct
170#define dout_subsys ceph_subsys_osd
171#undef dout_prefix
172#define dout_prefix _prefix(_dout, whoami, get_osdmap_epoch())
173
224ce89b 174
7c673cae
FG
175static ostream& _prefix(std::ostream* _dout, int whoami, epoch_t epoch) {
176 return *_dout << "osd." << whoami << " " << epoch << " ";
177}
178
7c673cae
FG
//Initial features in new superblock.
//Features here are also automatically upgraded
CompatSet OSD::get_osd_initial_compat_set() {
  CompatSet::FeatureSet ceph_osd_feature_compat;
  CompatSet::FeatureSet ceph_osd_feature_ro_compat;
  // Every feature below is *incompat*: an OSD lacking it cannot read this store.
  CompatSet::FeatureSet ceph_osd_feature_incompat;
  ceph_osd_feature_incompat.insert(CEPH_OSD_FEATURE_INCOMPAT_BASE);
  ceph_osd_feature_incompat.insert(CEPH_OSD_FEATURE_INCOMPAT_PGINFO);
  ceph_osd_feature_incompat.insert(CEPH_OSD_FEATURE_INCOMPAT_OLOC);
  ceph_osd_feature_incompat.insert(CEPH_OSD_FEATURE_INCOMPAT_LEC);
  ceph_osd_feature_incompat.insert(CEPH_OSD_FEATURE_INCOMPAT_CATEGORIES);
  ceph_osd_feature_incompat.insert(CEPH_OSD_FEATURE_INCOMPAT_HOBJECTPOOL);
  ceph_osd_feature_incompat.insert(CEPH_OSD_FEATURE_INCOMPAT_BIGINFO);
  ceph_osd_feature_incompat.insert(CEPH_OSD_FEATURE_INCOMPAT_LEVELDBINFO);
  ceph_osd_feature_incompat.insert(CEPH_OSD_FEATURE_INCOMPAT_LEVELDBLOG);
  ceph_osd_feature_incompat.insert(CEPH_OSD_FEATURE_INCOMPAT_SNAPMAPPER);
  ceph_osd_feature_incompat.insert(CEPH_OSD_FEATURE_INCOMPAT_HINTS);
  ceph_osd_feature_incompat.insert(CEPH_OSD_FEATURE_INCOMPAT_PGMETA);
  ceph_osd_feature_incompat.insert(CEPH_OSD_FEATURE_INCOMPAT_MISSING);
  ceph_osd_feature_incompat.insert(CEPH_OSD_FEATURE_INCOMPAT_FASTINFO);
  ceph_osd_feature_incompat.insert(CEPH_OSD_FEATURE_INCOMPAT_RECOVERY_DELETES);
  return CompatSet(ceph_osd_feature_compat, ceph_osd_feature_ro_compat,
		   ceph_osd_feature_incompat);
}
203
204//Features are added here that this OSD supports.
205CompatSet OSD::get_osd_compat_set() {
206 CompatSet compat = get_osd_initial_compat_set();
207 //Any features here can be set in code, but not in initial superblock
208 compat.incompat.insert(CEPH_OSD_FEATURE_INCOMPAT_SHARDS);
209 return compat;
210}
211
// OSDService: per-OSD helper object holding shared state (locks, timers,
// caches, reservers, objecter) for the owning OSD. The initializer list
// mirrors the member declaration order in OSD.h — do not reorder.
OSDService::OSDService(OSD *osd) :
  osd(osd),
  cct(osd->cct),
  whoami(osd->whoami), store(osd->store),
  log_client(osd->log_client), clog(osd->clog),
  pg_recovery_stats(osd->pg_recovery_stats),
  cluster_messenger(osd->cluster_messenger),
  client_messenger(osd->client_messenger),
  logger(osd->logger),
  recoverystate_perf(osd->recoverystate_perf),
  monc(osd->monc),
  class_handler(osd->class_handler),
  // md_config_cacher-style members: track live config values by key
  osd_max_object_size(cct->_conf, "osd_max_object_size"),
  osd_skip_data_digest(cct->_conf, "osd_skip_data_digest"),
  publish_lock{ceph::make_mutex("OSDService::publish_lock")},
  pre_publish_lock{ceph::make_mutex("OSDService::pre_publish_lock")},
  max_oldest_map(0),
  peer_map_epoch_lock("OSDService::peer_map_epoch_lock"),
  sched_scrub_lock("OSDService::sched_scrub_lock"), scrubs_pending(0),
  scrubs_active(0),
  agent_lock("OSDService::agent_lock"),
  agent_valid_iterator(false),
  agent_ops(0),
  flush_mode_high_count(0),
  agent_active(true),
  agent_thread(this),
  agent_stop_flag(false),
  agent_timer_lock("OSDService::agent_timer_lock"),
  agent_timer(osd->client_messenger->cct, agent_timer_lock),
  last_recalibrate(ceph_clock_now()),
  promote_max_objects(0),
  promote_max_bytes(0),
  objecter(new Objecter(osd->client_messenger->cct, osd->objecter_messenger, osd->monc, NULL, 0, 0)),
  m_objecter_finishers(cct->_conf->osd_objecter_finishers),
  watch_lock("OSDService::watch_lock"),
  watch_timer(osd->client_messenger->cct, watch_lock),
  next_notif_id(0),
  recovery_request_lock("OSDService::recovery_request_lock"),
  recovery_request_timer(cct, recovery_request_lock, false),
  sleep_lock("OSDService::sleep_lock"),
  sleep_timer(cct, sleep_lock, false),
  reserver_finisher(cct),
  // local/remote backfill reservers share the same finisher and limits
  local_reserver(cct, &reserver_finisher, cct->_conf->osd_max_backfills,
		 cct->_conf->osd_min_recovery_priority),
  remote_reserver(cct, &reserver_finisher, cct->_conf->osd_max_backfills,
		  cct->_conf->osd_min_recovery_priority),
  pg_temp_lock("OSDService::pg_temp_lock"),
  snap_reserver(cct, &reserver_finisher,
		cct->_conf->osd_max_trimming_pgs),
  recovery_lock("OSDService::recovery_lock"),
  recovery_ops_active(0),
  recovery_ops_reserved(0),
  recovery_paused(false),
  map_cache_lock("OSDService::map_cache_lock"),
  map_cache(cct, cct->_conf->osd_map_cache_size),
  map_bl_cache(cct->_conf->osd_map_cache_size),
  map_bl_inc_cache(cct->_conf->osd_map_cache_size),
  stat_lock("OSDService::stat_lock"),
  full_status_lock("OSDService::full_status_lock"),
  cur_state(NONE),
  cur_ratio(0), physical_ratio(0),
  epoch_lock("OSDService::epoch_lock"),
  boot_epoch(0), up_epoch(0), bind_epoch(0),
  is_stopping_lock("OSDService::is_stopping_lock")
#ifdef PG_DEBUG_REFS
  , pgid_lock("OSDService::pgid_lock")
#endif
{
  objecter->init();

  // Spin up the configured number of objecter finisher threads; they are
  // started later in init() and torn down in shutdown()/~OSDService().
  for (int i = 0; i < m_objecter_finishers; i++) {
    ostringstream str;
    str << "objecter-finisher-" << i;
    Finisher *fin = new Finisher(osd->client_messenger->cct, str.str(), "finisher");
    objecter_finishers.push_back(fin);
  }
}
289
290OSDService::~OSDService()
291{
292 delete objecter;
11fdf7f2
TL
293
294 for (auto f : objecter_finishers) {
295 delete f;
296 f = NULL;
297 }
7c673cae
FG
298}
299
31f18b77
FG
300
301
#ifdef PG_DEBUG_REFS
// Debug-only PG reference tracking: counts outstanding refs per pgid and
// remembers one live PG* per pgid so leaks can be dumped at shutdown.
void OSDService::add_pgid(spg_t pgid, PG *pg){
  std::lock_guard l(pgid_lock);
  if (!pgid_tracker.count(pgid)) {
    live_pgs[pgid] = pg;
  }
  pgid_tracker[pgid]++;
}
// Drop one reference for pgid; when the count hits zero, forget the PG.
void OSDService::remove_pgid(spg_t pgid, PG *pg)
{
  std::lock_guard l(pgid_lock);
  ceph_assert(pgid_tracker.count(pgid));
  ceph_assert(pgid_tracker[pgid] > 0);
  pgid_tracker[pgid]--;
  if (pgid_tracker[pgid] == 0) {
    pgid_tracker.erase(pgid);
    live_pgs.erase(pgid);
  }
}
// Print every pgid that still holds references, and ask each PG to dump the
// individual ref-holders (PG::dump_live_ids).
void OSDService::dump_live_pgids()
{
  std::lock_guard l(pgid_lock);
  derr << "live pgids:" << dendl;
  for (map<spg_t, int>::const_iterator i = pgid_tracker.cbegin();
       i != pgid_tracker.cend();
       ++i) {
    derr << "\t" << *i << dendl;
    live_pgs[i->first]->dump_live_ids();
  }
}
#endif
333
334
7c673cae 335
11fdf7f2
TL
// Walk the recorded pg_num history for pgid's pool between old_map and
// new_map, and report every PG split child (and, if merge_pgs is non-null,
// every merge participant) that pgid implies over that epoch range.
//
// split_children collects (child pgid, epoch of the pg_num change);
// merge_pgs collects both merge sources and targets with their epochs.
// Newly discovered split children are re-queued so their own later
// splits are also found.
void OSDService::identify_splits_and_merges(
  OSDMapRef old_map,
  OSDMapRef new_map,
  spg_t pgid,
  set<pair<spg_t,epoch_t>> *split_children,
  set<pair<spg_t,epoch_t>> *merge_pgs)
{
  // Pool may have been deleted; nothing to do then.
  if (!old_map->have_pg_pool(pgid.pool())) {
    return;
  }
  int old_pgnum = old_map->get_pg_num(pgid.pool());
  // pg_num_history maps pool -> (epoch -> pg_num at that epoch).
  auto p = osd->pg_num_history.pg_nums.find(pgid.pool());
  if (p == osd->pg_num_history.pg_nums.end()) {
    return;
  }
  dout(20) << __func__ << " " << pgid << " e" << old_map->get_epoch()
	   << " to e" << new_map->get_epoch()
	   << " pg_nums " << p->second << dendl;
  deque<spg_t> queue;
  queue.push_back(pgid);
  while (!queue.empty()) {
    auto cur = queue.front();
    queue.pop_front();
    unsigned pgnum = old_pgnum;
    // Replay each pg_num change in (old_map, new_map] in epoch order.
    for (auto q = p->second.lower_bound(old_map->get_epoch());
	 q != p->second.end() &&
	   q->first <= new_map->get_epoch();
	 ++q) {
      if (pgnum < q->second) {
	// split?
	if (cur.ps() < pgnum) {
	  set<spg_t> children;
	  if (cur.is_split(pgnum, q->second, &children)) {
	    dout(20) << __func__ << " " << cur << " e" << q->first
		     << " pg_num " << pgnum << " -> " << q->second
		     << " children " << children << dendl;
	    for (auto i : children) {
	      split_children->insert(make_pair(i, q->first));
	      // child may itself split at a later epoch
	      queue.push_back(i);
	    }
	  }
	} else if (cur.ps() < q->second) {
	  dout(20) << __func__ << " " << cur << " e" << q->first
		   << " pg_num " << pgnum << " -> " << q->second
		   << " is a child" << dendl;
	  // normally we'd capture this from the parent, but it's
	  // possible the parent doesn't exist yet (it will be
	  // fabricated to allow an intervening merge). note this PG
	  // as a split child here to be sure we catch it.
	  split_children->insert(make_pair(cur, q->first));
	} else {
	  dout(20) << __func__ << " " << cur << " e" << q->first
		   << " pg_num " << pgnum << " -> " << q->second
		   << " is post-split, skipping" << dendl;
	}
      } else if (merge_pgs) {
	// merge?
	if (cur.ps() >= q->second) {
	  if (cur.ps() < pgnum) {
	    spg_t parent;
	    if (cur.is_merge_source(pgnum, q->second, &parent)) {
	      set<spg_t> children;
	      parent.is_split(q->second, pgnum, &children);
	      dout(20) << __func__ << " " << cur << " e" << q->first
		       << " pg_num " << pgnum << " -> " << q->second
		       << " is merge source, target " << parent
		       << ", source(s) " << children << dendl;
	      // record the target and every source of this merge
	      merge_pgs->insert(make_pair(parent, q->first));
	      for (auto c : children) {
		merge_pgs->insert(make_pair(c, q->first));
	      }
	    }
	  } else {
	    dout(20) << __func__ << " " << cur << " e" << q->first
		     << " pg_num " << pgnum << " -> " << q->second
		     << " is beyond old pgnum, skipping" << dendl;
	  }
	} else {
	  set<spg_t> children;
	  if (cur.is_split(q->second, pgnum, &children)) {
	    dout(20) << __func__ << " " << cur << " e" << q->first
		     << " pg_num " << pgnum << " -> " << q->second
		     << " is merge target, source " << children << dendl;
	    for (auto c : children) {
	      merge_pgs->insert(make_pair(c, q->first));
	    }
	    merge_pgs->insert(make_pair(cur, q->first));
	  }
	}
      }
      pgnum = q->second;
    }
  }
}
430
7c673cae
FG
// Forward a heartbeat-peer refresh request to the owning OSD.
void OSDService::need_heartbeat_peer_update()
{
  osd->need_heartbeat_peer_update();
}
435
7c673cae
FG
// First phase of shutdown: stop the agent and recovery-sleep timers.
// Each timer is shut down under its own lock; the rest of the service is
// torn down later in shutdown().
void OSDService::start_shutdown()
{
  {
    std::lock_guard l(agent_timer_lock);
    agent_timer.shutdown();
  }

  {
    std::lock_guard l(sleep_lock);
    sleep_timer.shutdown();
  }
}
448
31f18b77 449void OSDService::shutdown_reserver()
7c673cae
FG
450{
451 reserver_finisher.wait_for_empty();
452 reserver_finisher.stop();
31f18b77
FG
453}
454
// Main shutdown: stop the watch timer, the objecter and its finishers, the
// recovery-request timer, and finally drop our OSDMap references so the
// maps can be released.
void OSDService::shutdown()
{
  {
    std::lock_guard l(watch_lock);
    watch_timer.shutdown();
  }

  objecter->shutdown();
  // Drain each finisher before stopping it so in-flight completions run.
  for (auto f : objecter_finishers) {
    f->wait_for_empty();
    f->stop();
  }

  {
    std::lock_guard l(recovery_request_lock);
    recovery_request_timer.shutdown();
  }

  // Publish an empty map ref and clear next_osdmap to release map memory.
  publish_map(OSDMapRef());
  next_osdmap = OSDMapRef();
}
476
// Start the service's worker threads and timers. Called once during OSD
// startup, before final_init().
void OSDService::init()
{
  reserver_finisher.start();
  for (auto f : objecter_finishers) {
    f->start();
  }
  objecter->set_client_incarnation(0);

  // deprioritize objecter in daemonperf output
  objecter->get_logger()->set_prio_adjust(-3);

  watch_timer.init();
  agent_timer.init();

  agent_thread.create("osd_srv_agent");

  // Optionally delay recovery at boot (osd_recovery_delay_start seconds).
  if (cct->_conf->osd_recovery_delay_start)
    defer_recovery(cct->_conf->osd_recovery_delay_start);
}
496
// Last init step: start the objecter against the current OSDMap.
void OSDService::final_init()
{
  objecter->start(osdmap.get());
}
501
502void OSDService::activate_map()
503{
504 // wake/unwake the tiering agent
505 agent_lock.Lock();
506 agent_active =
507 !osdmap->test_flag(CEPH_OSDMAP_NOTIERAGENT) &&
508 osd->is_active();
509 agent_cond.Signal();
510 agent_lock.Unlock();
511}
512
181888fb
FG
// Ask the OSD to subscribe to OSDMap epoch e (non-continuous subscription).
void OSDService::request_osdmap_update(epoch_t e)
{
  osd->osdmap_subscribe(e, false);
}
517
7c673cae
FG
// Timer callback scheduled by agent_entry() when a PG reports no agent work;
// after the delay it asks the PG to re-evaluate its agent mode.
class AgentTimeoutCB : public Context {
  PGRef pg;
public:
  explicit AgentTimeoutCB(PGRef _pg) : pg(_pg) {}
  void finish(int) override {
    pg->agent_choose_mode_restart();
  }
};
526
// Tiering-agent thread main loop. Repeatedly picks the highest-priority
// tier in agent_queue and drives flush/evict work on its PGs, respecting
// the osd_agent_max_ops / osd_agent_max_low_ops quotas. agent_lock is held
// except while calling into the PG (agent_work) and while arming the
// retry timer.
void OSDService::agent_entry()
{
  dout(10) << __func__ << " start" << dendl;
  agent_lock.Lock();

  while (!agent_stop_flag) {
    if (agent_queue.empty()) {
      dout(20) << __func__ << " empty queue" << dendl;
      // sleep until a PG is queued (or we are told to stop)
      agent_cond.Wait(agent_lock);
      continue;
    }
    // Highest key in the map is the highest-priority tier.
    uint64_t level = agent_queue.rbegin()->first;
    set<PGRef>& top = agent_queue.rbegin()->second;
    dout(10) << __func__
	     << " tiers " << agent_queue.size()
	     << ", top is " << level
	     << " with pgs " << top.size()
	     << ", ops " << agent_ops << "/"
	     << cct->_conf->osd_agent_max_ops
	     << (agent_active ? " active" : " NOT ACTIVE")
	     << dendl;
    dout(20) << __func__ << " oids " << agent_oids << dendl;
    int max = cct->_conf->osd_agent_max_ops - agent_ops;
    int agent_flush_quota = max;
    // In low-flush mode the (smaller) low-ops limit caps flushes.
    if (!flush_mode_high_count)
      agent_flush_quota = cct->_conf->osd_agent_max_low_ops - agent_ops;
    if (agent_flush_quota <= 0 || top.empty() || !agent_active) {
      agent_cond.Wait(agent_lock);
      continue;
    }

    // Round-robin over the PGs in the top tier via agent_queue_pos.
    if (!agent_valid_iterator || agent_queue_pos == top.end()) {
      agent_queue_pos = top.begin();
      agent_valid_iterator = true;
    }
    PGRef pg = *agent_queue_pos;
    dout(10) << "high_count " << flush_mode_high_count
	     << " agent_ops " << agent_ops
	     << " flush_quota " << agent_flush_quota << dendl;
    // Drop the lock while the PG does (potentially slow) agent work.
    agent_lock.Unlock();
    if (!pg->agent_work(max, agent_flush_quota)) {
      dout(10) << __func__ << " " << pg->pg_id
	<< " no agent_work, delay for " << cct->_conf->osd_agent_delay_time
	<< " seconds" << dendl;

      osd->logger->inc(l_osd_tier_delay);
      // Queue a timer to call agent_choose_mode for this pg in 5 seconds
      agent_timer_lock.Lock();
      Context *cb = new AgentTimeoutCB(pg);
      agent_timer.add_event_after(cct->_conf->osd_agent_delay_time, cb);
      agent_timer_lock.Unlock();
    }
    agent_lock.Lock();
  }
  agent_lock.Unlock();
  dout(10) << __func__ << " finish" << dendl;
}
584
// Stop the tiering-agent thread. Asserts that all agent ops have been
// cancelled and the queue drained (PGs already shut down), then signals the
// thread and joins it.
void OSDService::agent_stop()
{
  {
    std::lock_guard l(agent_lock);

    // By this time all ops should be cancelled
    ceph_assert(agent_ops == 0);
    // By this time all PGs are shutdown and dequeued
    if (!agent_queue.empty()) {
      set<PGRef>& top = agent_queue.rbegin()->second;
      derr << "agent queue not empty, for example " << (*top.begin())->get_pgid() << dendl;
      ceph_abort_msg("agent queue not empty");
    }

    agent_stop_flag = true;
    agent_cond.Signal();
  }
  agent_thread.join();
}
604
605// -------------------------------------
606
// Periodically recalibrate the cache-tier promotion throttle.
// promote_probability_millis (per-mille, 1..1000) is steered so that actual
// promotions track the configured objects/sec and bytes/sec targets, using
// a simple feedback loop: compute the ideal probability from the observed
// attempt rate, correct for skew between intended and actual rate, then
// average with the previous value and clamp.
void OSDService::promote_throttle_recalibrate()
{
  utime_t now = ceph_clock_now();
  double dur = now - last_recalibrate;
  last_recalibrate = now;
  unsigned prob = promote_probability_millis;

  uint64_t target_obj_sec = cct->_conf->osd_tier_promote_max_objects_sec;
  uint64_t target_bytes_sec = cct->_conf->osd_tier_promote_max_bytes_sec;

  unsigned min_prob = 1;

  uint64_t attempts, obj, bytes;
  promote_counter.sample_and_attenuate(&attempts, &obj, &bytes);
  dout(10) << __func__ << " " << attempts << " attempts, promoted "
	   << obj << " objects and " << byte_u_t(bytes) << "; target "
	   << target_obj_sec << " obj/sec or "
	   << byte_u_t(target_bytes_sec) << "/sec"
	   << dendl;

  // calculate what the probability *should* be, given the targets
  unsigned new_prob;
  if (attempts && dur > 0) {
    uint64_t avg_size = 1;
    if (obj)
      avg_size = std::max<uint64_t>(bytes / obj, 1);
    // po/pb: per-mille probabilities that would hit the object/byte targets
    unsigned po = (double)target_obj_sec * dur * 1000.0 / (double)attempts;
    unsigned pb = (double)target_bytes_sec / (double)avg_size * dur * 1000.0
      / (double)attempts;
    dout(20) << __func__ << " po " << po << " pb " << pb << " avg_size "
	     << avg_size << dendl;
    if (target_obj_sec && target_bytes_sec)
      new_prob = std::min(po, pb);
    else if (target_obj_sec)
      new_prob = po;
    else if (target_bytes_sec)
      new_prob = pb;
    else
      new_prob = 1000;   // no targets configured: never throttle
  } else {
    new_prob = 1000;
  }
  dout(20) << __func__ << " new_prob " << new_prob << dendl;

  // correct for persistent skew between target rate and actual rate, adjust
  double ratio = 1.0;
  unsigned actual = 0;
  if (attempts && obj) {
    actual = obj * 1000 / attempts;
    ratio = (double)actual / (double)prob;
    new_prob = (double)new_prob / ratio;
  }
  new_prob = std::max(new_prob, min_prob);
  new_prob = std::min(new_prob, 1000u);

  // adjust: move halfway toward the computed probability for damping
  prob = (prob + new_prob) / 2;
  prob = std::max(prob, min_prob);
  prob = std::min(prob, 1000u);
  dout(10) << __func__ << " actual " << actual
	   << ", actual/prob ratio " << ratio
	   << ", adjusted new_prob " << new_prob
	   << ", prob " << promote_probability_millis << " -> " << prob
	   << dendl;
  promote_probability_millis = prob;

  // set hard limits for this interval to mitigate stampedes
  promote_max_objects = target_obj_sec * osd->OSD_TICK_INTERVAL * 2;
  promote_max_bytes = target_bytes_sec * osd->OSD_TICK_INTERVAL * 2;
}
677
678// -------------------------------------
679
680float OSDService::get_failsafe_full_ratio()
681{
682 float full_ratio = cct->_conf->osd_failsafe_full_ratio;
683 if (full_ratio > 1.0) full_ratio /= 100.0;
684 return full_ratio;
685}
686
11fdf7f2 687OSDService::s_names OSDService::recalc_full_state(float ratio, float pratio, string &inject)
7c673cae 688{
7c673cae
FG
689 // The OSDMap ratios take precendence. So if the failsafe is .95 and
690 // the admin sets the cluster full to .96, the failsafe moves up to .96
691 // too. (Not that having failsafe == full is ideal, but it's better than
692 // dropping writes before the clusters appears full.)
693 OSDMapRef osdmap = get_osdmap();
694 if (!osdmap || osdmap->get_epoch() == 0) {
11fdf7f2 695 return NONE;
7c673cae
FG
696 }
697 float nearfull_ratio = osdmap->get_nearfull_ratio();
698 float backfillfull_ratio = std::max(osdmap->get_backfillfull_ratio(), nearfull_ratio);
699 float full_ratio = std::max(osdmap->get_full_ratio(), backfillfull_ratio);
700 float failsafe_ratio = std::max(get_failsafe_full_ratio(), full_ratio);
701
31f18b77 702 if (osdmap->require_osd_release < CEPH_RELEASE_LUMINOUS) {
7c673cae
FG
703 // use the failsafe for nearfull and full; the mon isn't using the
704 // flags anyway because we're mid-upgrade.
705 full_ratio = failsafe_ratio;
706 backfillfull_ratio = failsafe_ratio;
707 nearfull_ratio = failsafe_ratio;
708 } else if (full_ratio <= 0 ||
709 backfillfull_ratio <= 0 ||
710 nearfull_ratio <= 0) {
711 derr << __func__ << " full_ratio, backfillfull_ratio or nearfull_ratio is <= 0" << dendl;
712 // use failsafe flag. ick. the monitor did something wrong or the user
713 // did something stupid.
714 full_ratio = failsafe_ratio;
715 backfillfull_ratio = failsafe_ratio;
716 nearfull_ratio = failsafe_ratio;
717 }
718
7c673cae 719 if (injectfull_state > NONE && injectfull) {
7c673cae 720 inject = "(Injected)";
11fdf7f2
TL
721 return injectfull_state;
722 } else if (pratio > failsafe_ratio) {
723 return FAILSAFE;
7c673cae 724 } else if (ratio > full_ratio) {
11fdf7f2 725 return FULL;
7c673cae 726 } else if (ratio > backfillfull_ratio) {
11fdf7f2 727 return BACKFILLFULL;
7c673cae 728 } else if (ratio > nearfull_ratio) {
11fdf7f2 729 return NEARFULL;
7c673cae 730 }
11fdf7f2
TL
731 return NONE;
732}
733
// Update the cached fullness state from fresh usage ratios and log loudly
// when the FAILSAFE threshold is crossed in either direction.
void OSDService::check_full_status(float ratio, float pratio)
{
  std::lock_guard l(full_status_lock);

  cur_ratio = ratio;
  physical_ratio = pratio;

  string inject;
  s_names new_state;
  new_state = recalc_full_state(ratio, pratio, inject);

  dout(20) << __func__ << " cur ratio " << ratio
	   << ", physical ratio " << pratio
	   << ", new state " << get_full_state_name(new_state)
	   << " " << inject
	   << dendl;

  // warn
  if (cur_state != new_state) {
    dout(10) << __func__ << " " << get_full_state_name(cur_state)
	     << " -> " << get_full_state_name(new_state) << dendl;
    if (new_state == FAILSAFE) {
      clog->error() << "full status failsafe engaged, dropping updates, now "
		    << (int)roundf(ratio * 100) << "% full";
    } else if (cur_state == FAILSAFE) {
      clog->error() << "full status failsafe disengaged, no longer dropping "
		    << "updates, now " << (int)roundf(ratio * 100) << "% full";
    }
    cur_state = new_state;
  }
}
765
766bool OSDService::need_fullness_update()
767{
768 OSDMapRef osdmap = get_osdmap();
769 s_names cur = NONE;
770 if (osdmap->exists(whoami)) {
771 if (osdmap->get_state(whoami) & CEPH_OSD_FULL) {
772 cur = FULL;
773 } else if (osdmap->get_state(whoami) & CEPH_OSD_BACKFILLFULL) {
774 cur = BACKFILLFULL;
775 } else if (osdmap->get_state(whoami) & CEPH_OSD_NEARFULL) {
776 cur = NEARFULL;
777 }
778 }
779 s_names want = NONE;
780 if (is_full())
781 want = FULL;
782 else if (is_backfillfull())
783 want = BACKFILLFULL;
784 else if (is_nearfull())
785 want = NEARFULL;
786 return want != cur;
787}
788
11fdf7f2 789bool OSDService::_check_inject_full(DoutPrefixProvider *dpp, s_names type) const
7c673cae 790{
7c673cae
FG
791 if (injectfull && injectfull_state >= type) {
792 // injectfull is either a count of the number of times to return failsafe full
793 // or if -1 then always return full
794 if (injectfull > 0)
795 --injectfull;
11fdf7f2
TL
796 ldpp_dout(dpp, 10) << __func__ << " Injected " << get_full_state_name(type) << " OSD ("
797 << (injectfull < 0 ? "set" : std::to_string(injectfull)) << ")"
798 << dendl;
7c673cae
FG
799 return true;
800 }
11fdf7f2
TL
801 return false;
802}
803
// Return true if the cached fullness state is at least `type` (or an
// injected state applies), logging the current ratios when it is.
bool OSDService::_check_full(DoutPrefixProvider *dpp, s_names type) const
{
  std::lock_guard l(full_status_lock);

  if (_check_inject_full(dpp, type))
    return true;

  if (cur_state >= type)
    ldpp_dout(dpp, 10) << __func__ << " current usage is " << cur_ratio
		       << " physical " << physical_ratio << dendl;

  return cur_state >= type;
}
817
11fdf7f2
TL
// "What if" fullness check: would we be at least `type` full if usage grew
// by adjust_used bytes on top of adjusted_stat? Used to veto backfills that
// would push us over a threshold.
bool OSDService::_tentative_full(DoutPrefixProvider *dpp, s_names type, uint64_t adjust_used, osd_stat_t adjusted_stat)
{
  ldpp_dout(dpp, 20) << __func__ << " type " << get_full_state_name(type) << " adjust_used " << (adjust_used >> 10) << "KiB" << dendl;
  {
    // Only hold the lock for the injection check; the recompute below
    // works on local copies.
    std::lock_guard l(full_status_lock);
    if (_check_inject_full(dpp, type)) {
      return true;
    }
  }

  float pratio;
  float ratio = compute_adjusted_ratio(adjusted_stat, &pratio, adjust_used);

  string notused;
  s_names tentative_state = recalc_full_state(ratio, pratio, notused);

  if (tentative_state >= type)
    ldpp_dout(dpp, 10) << __func__ << " tentative usage is " << ratio << dendl;

  return tentative_state >= type;
}
839
// Convenience wrappers over _check_full()/_tentative_full(), one per
// fullness threshold (most to least severe).

bool OSDService::check_failsafe_full(DoutPrefixProvider *dpp) const
{
  return _check_full(dpp, FAILSAFE);
}

bool OSDService::check_full(DoutPrefixProvider *dpp) const
{
  return _check_full(dpp, FULL);
}

// Would adding adjust_used bytes (given stats) make us backfillfull?
bool OSDService::tentative_backfill_full(DoutPrefixProvider *dpp, uint64_t adjust_used, osd_stat_t stats)
{
  return _tentative_full(dpp, BACKFILLFULL, adjust_used, stats);
}

bool OSDService::check_backfill_full(DoutPrefixProvider *dpp) const
{
  return _check_full(dpp, BACKFILLFULL);
}

bool OSDService::check_nearfull(DoutPrefixProvider *dpp) const
{
  return _check_full(dpp, NEARFULL);
}
864
// Predicates over the cached fullness state, each taking full_status_lock.
// Note is_failsafe_full() tests equality while the others test >=, since
// FAILSAFE is the maximum state.

bool OSDService::is_failsafe_full() const
{
  std::lock_guard l(full_status_lock);
  return cur_state == FAILSAFE;
}

bool OSDService::is_full() const
{
  std::lock_guard l(full_status_lock);
  return cur_state >= FULL;
}

bool OSDService::is_backfillfull() const
{
  std::lock_guard l(full_status_lock);
  return cur_state >= BACKFILLFULL;
}

bool OSDService::is_nearfull() const
{
  std::lock_guard l(full_status_lock);
  return cur_state >= NEARFULL;
}
888
// Test hook: arm fullness injection. `count` is how many times the injected
// state fires (-1 = forever); see _check_inject_full().
void OSDService::set_injectfull(s_names type, int64_t count)
{
  std::lock_guard l(full_status_lock);
  injectfull_state = type;
  injectfull = count;
}
895
11fdf7f2
TL
// Record fresh store statfs results (and any objectstore alerts) into
// osd_stat, and update the perf counters. When fake_statfs_for_testing is
// set, synthesize total/available from the PGs' reported sizes instead.
void OSDService::set_statfs(const struct store_statfs_t &stbuf,
			    osd_alert_list_t& alerts)
{
  uint64_t bytes = stbuf.total;
  uint64_t avail = stbuf.available;
  uint64_t used = stbuf.get_used_raw();

  // For testing fake statfs values so it doesn't matter if all
  // OSDs are using the same partition.
  if (cct->_conf->fake_statfs_for_testing) {
    uint64_t total_num_bytes = 0;
    vector<PGRef> pgs;
    osd->_get_pgs(&pgs);
    for (auto p : pgs) {
      total_num_bytes += p->get_stats_num_bytes();
    }
    bytes = cct->_conf->fake_statfs_for_testing;
    if (total_num_bytes < bytes)
      avail = bytes - total_num_bytes;
    else
      avail = 0;
    dout(0) << __func__ << " fake total " << cct->_conf->fake_statfs_for_testing
	    << " adjust available " << avail
	    << dendl;
    used = bytes - avail;
  }

  osd->logger->set(l_osd_stat_bytes, bytes);
  osd->logger->set(l_osd_stat_bytes_used, used);
  osd->logger->set(l_osd_stat_bytes_avail, avail);

  std::lock_guard l(stat_lock);
  osd_stat.statfs = stbuf;
  osd_stat.os_alerts.clear();
  osd_stat.os_alerts[whoami].swap(alerts);
  if (cct->_conf->fake_statfs_for_testing) {
    osd_stat.statfs.total = bytes;
    osd_stat.statfs.available = avail;
    // For testing don't want used to go negative, so clear reserved
    osd_stat.statfs.internally_reserved = 0;
  }
}
7c673cae 938
11fdf7f2
TL
// Refresh the heartbeat-peer list, op-age histogram, and PG count in
// osd_stat, returning a copy of the updated stats.
// Note: hb_peers is swapped in, leaving the caller's vector with the old
// contents.
osd_stat_t OSDService::set_osd_stat(vector<int>& hb_peers,
				    int num_pgs)
{
  std::lock_guard l(stat_lock);
  osd_stat.hb_peers.swap(hb_peers);
  osd->op_tracker.get_age_ms_histogram(&osd_stat.op_queue_age_hist);
  osd_stat.num_pgs = num_pgs;
  return osd_stat;
}
948
949void OSDService::inc_osd_stat_repaired()
950{
951 std::lock_guard l(stat_lock);
952 osd_stat.num_shards_repaired++;
953 return;
954}
955
// Compute the logical usage ratio after (a) charging adjust_used extra
// bytes against availability and (b) letting each PG add its pending
// backfill data. Also outputs *pratio, the raw physical usage ratio before
// any adjustment. new_stat is taken by value and mutated locally.
float OSDService::compute_adjusted_ratio(osd_stat_t new_stat, float *pratio,
					 uint64_t adjust_used)
{
  *pratio =
   ((float)new_stat.statfs.get_used()) / ((float)new_stat.statfs.total);

  if (adjust_used) {
    dout(20) << __func__ << " Before kb_used() " << new_stat.statfs.kb_used() << dendl;
    // Charge the extra usage against available space, clamping at zero.
    if (new_stat.statfs.available > adjust_used)
      new_stat.statfs.available -= adjust_used;
    else
      new_stat.statfs.available = 0;
    dout(20) << __func__ << " After kb_used() " << new_stat.statfs.kb_used() << dendl;
  }

  // Check all pgs and adjust kb_used to include all pending backfill data
  int backfill_adjusted = 0;
  vector<PGRef> pgs;
  osd->_get_pgs(&pgs);
  for (auto p : pgs) {
    backfill_adjusted += p->pg_stat_adjust(&new_stat);
  }
  if (backfill_adjusted) {
    dout(20) << __func__ << " backfill adjusted " << new_stat << dendl;
  }
  return ((float)new_stat.statfs.get_used()) / ((float)new_stat.statfs.total);
}
983
984bool OSDService::check_osdmap_full(const set<pg_shard_t> &missing_on)
985{
986 OSDMapRef osdmap = get_osdmap();
987 for (auto shard : missing_on) {
988 if (osdmap->get_state(shard.osd) & CEPH_OSD_FULL)
989 return true;
990 }
991 return false;
992}
993
994void OSDService::send_message_osd_cluster(int peer, Message *m, epoch_t from_epoch)
995{
996 OSDMapRef next_map = get_nextmap_reserved();
997 // service map is always newer/newest
11fdf7f2 998 ceph_assert(from_epoch <= next_map->get_epoch());
7c673cae
FG
999
1000 if (next_map->is_down(peer) ||
1001 next_map->get_info(peer).up_from > from_epoch) {
1002 m->put();
1003 release_map(next_map);
1004 return;
1005 }
11fdf7f2
TL
1006 ConnectionRef peer_con = osd->cluster_messenger->connect_to_osd(
1007 next_map->get_cluster_addrs(peer));
7c673cae
FG
1008 share_map_peer(peer, peer_con.get(), next_map);
1009 peer_con->send_message(m);
1010 release_map(next_map);
1011}
1012
1013ConnectionRef OSDService::get_con_osd_cluster(int peer, epoch_t from_epoch)
1014{
1015 OSDMapRef next_map = get_nextmap_reserved();
1016 // service map is always newer/newest
11fdf7f2 1017 ceph_assert(from_epoch <= next_map->get_epoch());
7c673cae
FG
1018
1019 if (next_map->is_down(peer) ||
1020 next_map->get_info(peer).up_from > from_epoch) {
1021 release_map(next_map);
1022 return NULL;
1023 }
11fdf7f2
TL
1024 ConnectionRef con = osd->cluster_messenger->connect_to_osd(
1025 next_map->get_cluster_addrs(peer));
7c673cae
FG
1026 release_map(next_map);
1027 return con;
1028}
1029
1030pair<ConnectionRef,ConnectionRef> OSDService::get_con_osd_hb(int peer, epoch_t from_epoch)
1031{
1032 OSDMapRef next_map = get_nextmap_reserved();
1033 // service map is always newer/newest
11fdf7f2 1034 ceph_assert(from_epoch <= next_map->get_epoch());
7c673cae
FG
1035
1036 pair<ConnectionRef,ConnectionRef> ret;
1037 if (next_map->is_down(peer) ||
1038 next_map->get_info(peer).up_from > from_epoch) {
1039 release_map(next_map);
1040 return ret;
1041 }
11fdf7f2
TL
1042 ret.first = osd->hb_back_client_messenger->connect_to_osd(
1043 next_map->get_hb_back_addrs(peer));
1044 ret.second = osd->hb_front_client_messenger->connect_to_osd(
1045 next_map->get_hb_front_addrs(peer));
7c673cae
FG
1046 release_map(next_map);
1047 return ret;
1048}
1049
11fdf7f2
TL
1050entity_name_t OSDService::get_cluster_msgr_name() const
1051{
1052 return cluster_messenger->get_myname();
1053}
7c673cae 1054
94b18763
FG
1055void OSDService::queue_want_pg_temp(pg_t pgid,
1056 const vector<int>& want,
1057 bool forced)
7c673cae 1058{
11fdf7f2 1059 std::lock_guard l(pg_temp_lock);
94b18763 1060 auto p = pg_temp_pending.find(pgid);
7c673cae 1061 if (p == pg_temp_pending.end() ||
94b18763
FG
1062 p->second.acting != want ||
1063 forced) {
11fdf7f2 1064 pg_temp_wanted[pgid] = {want, forced};
7c673cae
FG
1065 }
1066}
1067
1068void OSDService::remove_want_pg_temp(pg_t pgid)
1069{
11fdf7f2 1070 std::lock_guard l(pg_temp_lock);
7c673cae
FG
1071 pg_temp_wanted.erase(pgid);
1072 pg_temp_pending.erase(pgid);
1073}
1074
// Move everything from pg_temp_wanted into pg_temp_pending (i.e. "these
// requests have now been sent to the mon; await acknowledgement").
// Caller must hold pg_temp_lock.
 1075void OSDService::_sent_pg_temp()
 1076{
11fdf7f2
TL
// C++17 map::merge splices nodes without copying when available;
// otherwise fall back to move-iterator insert.
 1077#ifdef HAVE_STDLIB_MAP_SPLICING
 1078 pg_temp_pending.merge(pg_temp_wanted);
 1079#else
94b18763
FG
 1080 pg_temp_pending.insert(make_move_iterator(begin(pg_temp_wanted)),
 1081 make_move_iterator(end(pg_temp_wanted)));
11fdf7f2 1082#endif
7c673cae
FG
 1083 pg_temp_wanted.clear();
 1084}
1085
1086void OSDService::requeue_pg_temp()
1087{
11fdf7f2 1088 std::lock_guard l(pg_temp_lock);
7c673cae
FG
1089 // wanted overrides pending. note that remove_want_pg_temp
1090 // clears the item out of both.
1091 unsigned old_wanted = pg_temp_wanted.size();
1092 unsigned old_pending = pg_temp_pending.size();
1093 _sent_pg_temp();
1094 pg_temp_wanted.swap(pg_temp_pending);
1095 dout(10) << __func__ << " " << old_wanted << " + " << old_pending << " -> "
1096 << pg_temp_wanted.size() << dendl;
1097}
1098
94b18763
FG
1099std::ostream& operator<<(std::ostream& out,
1100 const OSDService::pg_temp_t& pg_temp)
1101{
1102 out << pg_temp.acting;
1103 if (pg_temp.forced) {
1104 out << " (forced)";
1105 }
1106 return out;
1107}
1108
7c673cae
FG
// Flush all queued pg_temp requests to the monitor. Forced and non-forced
// requests must travel in separate MOSDPGTemp messages (the 'forced' flag
// is per-message), hence the two-slot array indexed by the bool.
 1109void OSDService::send_pg_temp()
 1110{
11fdf7f2 1111 std::lock_guard l(pg_temp_lock);
7c673cae
FG
 1112 if (pg_temp_wanted.empty())
 1113 return;
 1114 dout(10) << "send_pg_temp " << pg_temp_wanted << dendl;
// ms[0] = non-forced batch, ms[1] = forced batch; allocated lazily
94b18763 1115 MOSDPGTemp *ms[2] = {nullptr, nullptr};
11fdf7f2
TL
 1116 for (auto& [pgid, pg_temp] : pg_temp_wanted) {
 1117 auto& m = ms[pg_temp.forced];
94b18763
FG
 1118 if (!m) {
 1119 m = new MOSDPGTemp(osdmap->get_epoch());
11fdf7f2 1120 m->forced = pg_temp.forced;
94b18763 1121 }
11fdf7f2 1122 m->pg_temp.emplace(pgid, pg_temp.acting);
94b18763
FG
 1123 }
 1124 for (auto m : ms) {
 1125 if (m) {
 1126 monc->send_mon_message(m);
 1127 }
 1128 }
// mark everything just sent as pending acknowledgement
7c673cae
FG
 1129 _sent_pg_temp();
 1130}
1131
1132void OSDService::send_pg_created(pg_t pgid)
1133{
11fdf7f2 1134 std::lock_guard l(pg_created_lock);
7c673cae 1135 dout(20) << __func__ << dendl;
11fdf7f2
TL
1136 auto o = get_osdmap();
1137 if (o->require_osd_release >= CEPH_RELEASE_LUMINOUS) {
1138 pg_created.insert(pgid);
c07f9fc5
FG
1139 monc->send_mon_message(new MOSDPGCreated(pgid));
1140 }
7c673cae
FG
1141}
1142
11fdf7f2
TL
1143void OSDService::send_pg_created()
1144{
1145 std::lock_guard l(pg_created_lock);
1146 dout(20) << __func__ << dendl;
1147 auto o = get_osdmap();
1148 if (o->require_osd_release >= CEPH_RELEASE_LUMINOUS) {
1149 for (auto pgid : pg_created) {
1150 monc->send_mon_message(new MOSDPGCreated(pgid));
1151 }
1152 }
1153}
1154
1155void OSDService::prune_pg_created()
1156{
1157 std::lock_guard l(pg_created_lock);
1158 dout(20) << __func__ << dendl;
1159 auto o = get_osdmap();
1160 auto i = pg_created.begin();
1161 while (i != pg_created.end()) {
1162 auto p = o->get_pg_pool(i->pool());
1163 if (!p || !p->has_flag(pg_pool_t::FLAG_CREATING)) {
1164 dout(20) << __func__ << " pruning " << *i << dendl;
1165 i = pg_created.erase(i);
1166 } else {
1167 dout(20) << __func__ << " keeping " << *i << dendl;
1168 ++i;
1169 }
1170 }
1171}
1172
1173
7c673cae
FG
1174// --------------------------------------
1175// dispatch
1176
1177epoch_t OSDService::get_peer_epoch(int peer)
1178{
11fdf7f2 1179 std::lock_guard l(peer_map_epoch_lock);
7c673cae
FG
1180 map<int,epoch_t>::iterator p = peer_map_epoch.find(peer);
1181 if (p == peer_map_epoch.end())
1182 return 0;
1183 return p->second;
1184}
1185
1186epoch_t OSDService::note_peer_epoch(int peer, epoch_t e)
1187{
11fdf7f2 1188 std::lock_guard l(peer_map_epoch_lock);
7c673cae
FG
1189 map<int,epoch_t>::iterator p = peer_map_epoch.find(peer);
1190 if (p != peer_map_epoch.end()) {
1191 if (p->second < e) {
1192 dout(10) << "note_peer_epoch osd." << peer << " has " << e << dendl;
1193 p->second = e;
1194 } else {
1195 dout(30) << "note_peer_epoch osd." << peer << " has " << p->second << " >= " << e << dendl;
1196 }
1197 return p->second;
1198 } else {
1199 dout(10) << "note_peer_epoch osd." << peer << " now has " << e << dendl;
1200 peer_map_epoch[peer] = e;
1201 return e;
1202 }
1203}
1204
1205void OSDService::forget_peer_epoch(int peer, epoch_t as_of)
1206{
11fdf7f2 1207 std::lock_guard l(peer_map_epoch_lock);
7c673cae
FG
1208 map<int,epoch_t>::iterator p = peer_map_epoch.find(peer);
1209 if (p != peer_map_epoch.end()) {
1210 if (p->second <= as_of) {
1211 dout(10) << "forget_peer_epoch osd." << peer << " as_of " << as_of
1212 << " had " << p->second << dendl;
1213 peer_map_epoch.erase(p);
1214 } else {
1215 dout(10) << "forget_peer_epoch osd." << peer << " as_of " << as_of
1216 << " has " << p->second << " - not forgetting" << dendl;
1217 }
1218 }
1219}
1220
// Decide whether we should push an (incremental) OSDMap to this peer.
// Two cases:
//  1. A client whose session epoch (*sent_epoch_p) lags our map.
//  2. A fellow OSD reached over the cluster messenger (not loopback) that
//     is up and whose connection address matches its cluster or hb-back
//     address, and whose best-known epoch is behind ours.
// Read-only: does not update any epoch bookkeeping (share_map does that).
 1221bool OSDService::should_share_map(entity_name_t name, Connection *con,
 1222 epoch_t epoch, const OSDMapRef& osdmap,
 1223 const epoch_t *sent_epoch_p)
 1224{
 1225 dout(20) << "should_share_map "
 1226 << name << " " << con->get_peer_addr()
 1227 << " " << epoch << dendl;
 1228
 1229 // does client have old map?
 1230 if (name.is_client()) {
 1231 bool message_sendmap = epoch < osdmap->get_epoch();
 1232 if (message_sendmap && sent_epoch_p) {
 1233 dout(20) << "client session last_sent_epoch: "
 1234 << *sent_epoch_p
 1235 << " versus osdmap epoch " << osdmap->get_epoch() << dendl;
 1236 if (*sent_epoch_p < osdmap->get_epoch()) {
 1237 return true;
 1238 } // else we don't need to send it out again
 1239 }
 1240 }
 1241
// peer-OSD case: must be a genuine cluster connection to an up OSD
 1242 if (con->get_messenger() == osd->cluster_messenger &&
 1243 con != osd->cluster_messenger->get_loopback_connection() &&
 1244 osdmap->is_up(name.num()) &&
11fdf7f2
TL
 1245 (osdmap->get_cluster_addrs(name.num()) == con->get_peer_addrs() ||
 1246 osdmap->get_hb_back_addrs(name.num()) == con->get_peer_addrs())) {
7c673cae 1247 // remember
// best knowledge of the peer's epoch: what it told us now vs. our record
11fdf7f2 1248 epoch_t has = std::max(get_peer_epoch(name.num()), epoch);
7c673cae
FG
 1249
 1250 // share?
 1251 if (has < osdmap->get_epoch()) {
 1252 dout(10) << name << " " << con->get_peer_addr()
 1253 << " has old map " << epoch << " < "
 1254 << osdmap->get_epoch() << dendl;
 1255 return true;
 1256 }
 1257 }
 1258
 1259 return false;
 1260}
1261
// Actively push an incremental OSDMap to a lagging peer (client or OSD).
// Mirrors the policy in should_share_map(), and additionally updates the
// bookkeeping: the client session's *sent_epoch_p, or our recorded epoch
// for a peer OSD via note_peer_epoch(). No-op while the OSD is not active.
 1262void OSDService::share_map(
 1263 entity_name_t name,
 1264 Connection *con,
 1265 epoch_t epoch,
 1266 OSDMapRef& osdmap,
 1267 epoch_t *sent_epoch_p)
 1268{
 1269 dout(20) << "share_map "
 1270 << name << " " << con->get_peer_addr()
 1271 << " " << epoch << dendl;
 1272
 1273 if (!osd->is_active()) {
 1274 /*It is safe not to proceed as OSD is not in healthy state*/
 1275 return;
 1276 }
 1277
 1278 bool want_shared = should_share_map(name, con, epoch,
 1279 osdmap, sent_epoch_p);
 1280
 1281 if (want_shared){
 1282 if (name.is_client()) {
 1283 dout(10) << name << " has old map " << epoch
 1284 << " < " << osdmap->get_epoch() << dendl;
 1285 // we know the Session is valid or we wouldn't be sending
// record what we are about to send so we don't resend to this session
 1286 if (sent_epoch_p) {
 1287 *sent_epoch_p = osdmap->get_epoch();
 1288 }
 1289 send_incremental_map(epoch, con, osdmap);
 1290 } else if (con->get_messenger() == osd->cluster_messenger &&
 1291 osdmap->is_up(name.num()) &&
11fdf7f2
TL
 1292 (osdmap->get_cluster_addrs(name.num()) == con->get_peer_addrs() ||
 1293 osdmap->get_hb_back_addrs(name.num()) == con->get_peer_addrs())) {
 1294 dout(10) << name << " " << con->get_peer_addrs()
7c673cae
FG
 1295 << " has old map " << epoch << " < "
 1296 << osdmap->get_epoch() << dendl;
// remember the peer OSD now has our epoch, then send the delta
 1297 note_peer_epoch(name.num(), osdmap->get_epoch());
 1298 send_incremental_map(epoch, con, osdmap);
 1299 }
 1300 }
 1301}
1302
1303void OSDService::share_map_peer(int peer, Connection *con, OSDMapRef map)
1304{
1305 if (!map)
1306 map = get_osdmap();
1307
1308 // send map?
1309 epoch_t pe = get_peer_epoch(peer);
1310 if (pe) {
1311 if (pe < map->get_epoch()) {
1312 send_incremental_map(pe, con, map);
1313 note_peer_epoch(peer, map->get_epoch());
1314 } else
1315 dout(20) << "share_map_peer " << con << " already has epoch " << pe << dendl;
1316 } else {
1317 dout(20) << "share_map_peer " << con << " don't know epoch, doing nothing" << dendl;
1318 // no idea about peer's epoch.
1319 // ??? send recent ???
1320 // do nothing.
1321 }
1322}
1323
1324bool OSDService::can_inc_scrubs_pending()
1325{
1326 bool can_inc = false;
11fdf7f2 1327 std::lock_guard l(sched_scrub_lock);
7c673cae
FG
1328
1329 if (scrubs_pending + scrubs_active < cct->_conf->osd_max_scrubs) {
1330 dout(20) << __func__ << " " << scrubs_pending << " -> " << (scrubs_pending+1)
28e407b8
AA
1331 << " (max " << cct->_conf->osd_max_scrubs << ", active " << scrubs_active
1332 << ")" << dendl;
7c673cae
FG
1333 can_inc = true;
1334 } else {
28e407b8
AA
1335 dout(20) << __func__ << " " << scrubs_pending << " + " << scrubs_active
1336 << " active >= max " << cct->_conf->osd_max_scrubs << dendl;
7c673cae
FG
1337 }
1338
1339 return can_inc;
1340}
1341
1342bool OSDService::inc_scrubs_pending()
1343{
1344 bool result = false;
1345
1346 sched_scrub_lock.Lock();
1347 if (scrubs_pending + scrubs_active < cct->_conf->osd_max_scrubs) {
1348 dout(20) << "inc_scrubs_pending " << scrubs_pending << " -> " << (scrubs_pending+1)
1349 << " (max " << cct->_conf->osd_max_scrubs << ", active " << scrubs_active << ")" << dendl;
1350 result = true;
1351 ++scrubs_pending;
1352 } else {
1353 dout(20) << "inc_scrubs_pending " << scrubs_pending << " + " << scrubs_active << " active >= max " << cct->_conf->osd_max_scrubs << dendl;
1354 }
1355 sched_scrub_lock.Unlock();
1356
1357 return result;
1358}
1359
1360void OSDService::dec_scrubs_pending()
1361{
1362 sched_scrub_lock.Lock();
1363 dout(20) << "dec_scrubs_pending " << scrubs_pending << " -> " << (scrubs_pending-1)
1364 << " (max " << cct->_conf->osd_max_scrubs << ", active " << scrubs_active << ")" << dendl;
1365 --scrubs_pending;
11fdf7f2 1366 ceph_assert(scrubs_pending >= 0);
7c673cae
FG
1367 sched_scrub_lock.Unlock();
1368}
1369
1370void OSDService::inc_scrubs_active(bool reserved)
1371{
1372 sched_scrub_lock.Lock();
1373 ++(scrubs_active);
1374 if (reserved) {
1375 --(scrubs_pending);
1376 dout(20) << "inc_scrubs_active " << (scrubs_active-1) << " -> " << scrubs_active
1377 << " (max " << cct->_conf->osd_max_scrubs
1378 << ", pending " << (scrubs_pending+1) << " -> " << scrubs_pending << ")" << dendl;
11fdf7f2 1379 ceph_assert(scrubs_pending >= 0);
7c673cae
FG
1380 } else {
1381 dout(20) << "inc_scrubs_active " << (scrubs_active-1) << " -> " << scrubs_active
1382 << " (max " << cct->_conf->osd_max_scrubs
1383 << ", pending " << scrubs_pending << ")" << dendl;
1384 }
1385 sched_scrub_lock.Unlock();
1386}
1387
1388void OSDService::dec_scrubs_active()
1389{
1390 sched_scrub_lock.Lock();
1391 dout(20) << "dec_scrubs_active " << scrubs_active << " -> " << (scrubs_active-1)
1392 << " (max " << cct->_conf->osd_max_scrubs << ", pending " << scrubs_pending << ")" << dendl;
1393 --scrubs_active;
11fdf7f2 1394 ceph_assert(scrubs_active >= 0);
7c673cae
FG
1395 sched_scrub_lock.Unlock();
1396}
1397
1398void OSDService::retrieve_epochs(epoch_t *_boot_epoch, epoch_t *_up_epoch,
1399 epoch_t *_bind_epoch) const
1400{
11fdf7f2 1401 std::lock_guard l(epoch_lock);
7c673cae
FG
1402 if (_boot_epoch)
1403 *_boot_epoch = boot_epoch;
1404 if (_up_epoch)
1405 *_up_epoch = up_epoch;
1406 if (_bind_epoch)
1407 *_bind_epoch = bind_epoch;
1408}
1409
1410void OSDService::set_epochs(const epoch_t *_boot_epoch, const epoch_t *_up_epoch,
1411 const epoch_t *_bind_epoch)
1412{
11fdf7f2 1413 std::lock_guard l(epoch_lock);
7c673cae 1414 if (_boot_epoch) {
11fdf7f2 1415 ceph_assert(*_boot_epoch == 0 || *_boot_epoch >= boot_epoch);
7c673cae
FG
1416 boot_epoch = *_boot_epoch;
1417 }
1418 if (_up_epoch) {
11fdf7f2 1419 ceph_assert(*_up_epoch == 0 || *_up_epoch >= up_epoch);
7c673cae
FG
1420 up_epoch = *_up_epoch;
1421 }
1422 if (_bind_epoch) {
11fdf7f2 1423 ceph_assert(*_bind_epoch == 0 || *_bind_epoch >= bind_epoch);
7c673cae
FG
1424 bind_epoch = *_bind_epoch;
1425 }
1426}
1427
// Begin a clean shutdown. If we are up in the map, tell the mon we are
// going down (MOSDMarkMeDown with ack requested) and wait — bounded by
// osd_mon_shutdown_timeout — for got_stop_ack() to flip the state to
// STOPPING. Returns false if a shutdown was already in progress,
// true once state is STOPPING.
 1428bool OSDService::prepare_to_stop()
 1429{
11fdf7f2 1430 std::lock_guard l(is_stopping_lock);
7c673cae
FG
 1431 if (get_state() != NOT_STOPPING)
 1432 return false;
 1433
 1434 OSDMapRef osdmap = get_osdmap();
 1435 if (osdmap && osdmap->is_up(whoami)) {
 1436 dout(0) << __func__ << " telling mon we are shutting down" << dendl;
 1437 set_state(PREPARING_TO_STOP);
11fdf7f2
TL
 1438 monc->send_mon_message(
 1439 new MOSDMarkMeDown(
 1440 monc->get_fsid(),
 1441 whoami,
 1442 osdmap->get_addrs(whoami),
 1443 osdmap->get_epoch(),
 1444 true // request ack
 1445 ));
// wait (with timeout) for the mon's ack to move us to STOPPING;
// the condvar wait releases is_stopping_lock while blocked
7c673cae
FG
 1446 utime_t now = ceph_clock_now();
 1447 utime_t timeout;
 1448 timeout.set_from_double(now + cct->_conf->osd_mon_shutdown_timeout);
 1449 while ((ceph_clock_now() < timeout) &&
 1450 (get_state() != STOPPING)) {
 1451 is_stopping_cond.WaitUntil(is_stopping_lock, timeout);
 1452 }
 1453 }
// proceed with shutdown whether or not the ack arrived in time
 1454 dout(0) << __func__ << " starting shutdown" << dendl;
 1455 set_state(STOPPING);
 1456 return true;
 1457}
1458
1459void OSDService::got_stop_ack()
1460{
11fdf7f2 1461 std::lock_guard l(is_stopping_lock);
7c673cae
FG
1462 if (get_state() == PREPARING_TO_STOP) {
1463 dout(0) << __func__ << " starting shutdown" << dendl;
1464 set_state(STOPPING);
1465 is_stopping_cond.Signal();
1466 } else {
1467 dout(10) << __func__ << " ignoring msg" << dendl;
1468 }
1469}
1470
// Build an MOSDMap carrying maps (since, to], bounded by
// osd_map_message_max (count) and osd_map_message_max_bytes (size).
// Starts with a full map if we no longer have the increment the target
// needs. On a missing map ('panic') it sends whatever was collected so
// far, or as a last resort the newest map alone — aborting only if even
// that cannot be loaded.
 1471MOSDMap *OSDService::build_incremental_map_msg(epoch_t since, epoch_t to,
 1472 OSDSuperblock& sblock)
 1473{
28e407b8
AA
 1474 MOSDMap *m = new MOSDMap(monc->get_fsid(),
 1475 osdmap->get_encoding_features());
7c673cae
FG
 1476 m->oldest_map = max_oldest_map;
 1477 m->newest_map = sblock.newest_map;
 1478
11fdf7f2
TL
// per-message budget: map count and total encoded bytes
 1479 int max = cct->_conf->osd_map_message_max;
 1480 ssize_t max_bytes = cct->_conf->osd_map_message_max_bytes;
 1481
 1482 if (since < m->oldest_map) {
 1483 // we don't have the next map the target wants, so start with a
 1484 // full map.
 1485 bufferlist bl;
 1486 dout(10) << __func__ << " oldest map " << max_oldest_map << " > since "
 1487 << since << ", starting with full map" << dendl;
 1488 since = m->oldest_map;
 1489 if (!get_map_bl(since, bl)) {
 1490 derr << __func__ << " missing full map " << since << dendl;
 1491 goto panic;
 1492 }
 1493 max--;
 1494 max_bytes -= bl.length();
 1495 m->maps[since].claim(bl);
 1496 }
 1497 for (epoch_t e = since + 1; e <= to; ++e) {
7c673cae 1498 bufferlist bl;
11fdf7f2 1499 if (get_inc_map_bl(e, bl)) {
7c673cae 1500 m->incremental_maps[e].claim(bl);
11fdf7f2
TL
 1501 } else {
// fall back to the full map for this epoch if the increment is gone
 1502 derr << __func__ << " missing incremental map " << e << dendl;
 1503 if (!get_map_bl(e, bl)) {
 1504 derr << __func__ << " also missing full map " << e << dendl;
 1505 goto panic;
 1506 }
7c673cae 1507 m->maps[e].claim(bl);
11fdf7f2
TL
 1508 }
 1509 max--;
 1510 max_bytes -= bl.length();
 1511 if (max <= 0 || max_bytes <= 0) {
7c673cae 1512 break;
11fdf7f2
TL
 1513 }
 1514 }
 1515 return m;
 1516
 1517 panic:
 1518 if (!m->maps.empty() ||
 1519 !m->incremental_maps.empty()) {
 1520 // send what we have so far
 1521 return m;
 1522 }
 1523 // send something
 1524 bufferlist bl;
 1525 if (get_inc_map_bl(m->newest_map, bl)) {
 1526 m->incremental_maps[m->newest_map].claim(bl);
 1527 } else {
 1528 derr << __func__ << " unable to load latest map " << m->newest_map << dendl;
 1529 if (!get_map_bl(m->newest_map, bl)) {
 1530 derr << __func__ << " unable to load latest full map " << m->newest_map
7c673cae 1531 << dendl;
// cannot even produce our newest map: unrecoverable
11fdf7f2 1532 ceph_abort();
7c673cae 1533 }
11fdf7f2 1534 m->maps[m->newest_map].claim(bl);
7c673cae
FG
 1535 }
 1536 return m;
 1537}
1538
1539void OSDService::send_map(MOSDMap *m, Connection *con)
1540{
1541 con->send_message(m);
1542}
1543
1544void OSDService::send_incremental_map(epoch_t since, Connection *con,
1545 OSDMapRef& osdmap)
1546{
1547 epoch_t to = osdmap->get_epoch();
1548 dout(10) << "send_incremental_map " << since << " -> " << to
1549 << " to " << con << " " << con->get_peer_addr() << dendl;
1550
1551 MOSDMap *m = NULL;
1552 while (!m) {
1553 OSDSuperblock sblock(get_superblock());
1554 if (since < sblock.oldest_map) {
1555 // just send latest full map
28e407b8
AA
1556 MOSDMap *m = new MOSDMap(monc->get_fsid(),
1557 osdmap->get_encoding_features());
7c673cae
FG
1558 m->oldest_map = max_oldest_map;
1559 m->newest_map = sblock.newest_map;
1560 get_map_bl(to, m->maps[to]);
1561 send_map(m, con);
1562 return;
1563 }
1564
1565 if (to > since && (int64_t)(to - since) > cct->_conf->osd_map_share_max_epochs) {
1566 dout(10) << " " << (to - since) << " > max " << cct->_conf->osd_map_share_max_epochs
1567 << ", only sending most recent" << dendl;
1568 since = to - cct->_conf->osd_map_share_max_epochs;
1569 }
1570
7c673cae
FG
1571 m = build_incremental_map_msg(since, to, sblock);
1572 }
1573 send_map(m, con);
1574}
1575
// Fetch the encoded FULL map for epoch e, first from map_bl_cache, then
// from the store (caching it on a store hit). Returns false if the map is
// not available from either source.
// Leading underscore: callers hold map_cache_lock (see try_get_map, which
// takes the lock before calling this); get_inc_map_bl below is the
// self-locking incremental counterpart.
 1576bool OSDService::_get_map_bl(epoch_t e, bufferlist& bl)
 1577{
 1578 bool found = map_bl_cache.lookup(e, &bl);
31f18b77
FG
 1579 if (found) {
 1580 if (logger)
 1581 logger->inc(l_osd_map_bl_cache_hit);
7c673cae 1582 return true;
31f18b77
FG
 1583 }
 1584 if (logger)
 1585 logger->inc(l_osd_map_bl_cache_miss)
;
31f18b77
FG
 1587 OSD::get_osdmap_pobject_name(e), 0, 0, bl,
 1588 CEPH_OSD_OP_FLAG_FADVISE_WILLNEED) >= 0;
 1589 if (found) {
// populate the cache for subsequent lookups
7c673cae 1590 _add_map_bl(e, bl);
31f18b77 1591 }
7c673cae
FG
 1592 return found;
 1593}
1594
1595bool OSDService::get_inc_map_bl(epoch_t e, bufferlist& bl)
1596{
11fdf7f2 1597 std::lock_guard l(map_cache_lock);
7c673cae 1598 bool found = map_bl_inc_cache.lookup(e, &bl);
31f18b77
FG
1599 if (found) {
1600 if (logger)
1601 logger->inc(l_osd_map_bl_cache_hit);
7c673cae 1602 return true;
31f18b77
FG
1603 }
1604 if (logger)
1605 logger->inc(l_osd_map_bl_cache_miss);
11fdf7f2 1606 found = store->read(meta_ch,
31f18b77
FG
1607 OSD::get_inc_osdmap_pobject_name(e), 0, 0, bl,
1608 CEPH_OSD_OP_FLAG_FADVISE_WILLNEED) >= 0;
1609 if (found) {
7c673cae 1610 _add_map_inc_bl(e, bl);
31f18b77 1611 }
7c673cae
FG
1612 return found;
1613}
1614
1615void OSDService::_add_map_bl(epoch_t e, bufferlist& bl)
1616{
1617 dout(10) << "add_map_bl " << e << " " << bl.length() << " bytes" << dendl;
31f18b77
FG
1618 // cache a contiguous buffer
1619 if (bl.get_num_buffers() > 1) {
1620 bl.rebuild();
1621 }
1622 bl.try_assign_to_mempool(mempool::mempool_osd_mapbl);
7c673cae
FG
1623 map_bl_cache.add(e, bl);
1624}
1625
1626void OSDService::_add_map_inc_bl(epoch_t e, bufferlist& bl)
1627{
1628 dout(10) << "add_map_inc_bl " << e << " " << bl.length() << " bytes" << dendl;
31f18b77
FG
1629 // cache a contiguous buffer
1630 if (bl.get_num_buffers() > 1) {
1631 bl.rebuild();
1632 }
1633 bl.try_assign_to_mempool(mempool::mempool_osd_mapbl);
7c673cae
FG
1634 map_bl_inc_cache.add(e, bl);
1635}
1636
11fdf7f2 1637int OSDService::get_deleted_pool_pg_num(int64_t pool)
7c673cae 1638{
11fdf7f2
TL
1639 std::lock_guard l(map_cache_lock);
1640 auto p = deleted_pool_pg_nums.find(pool);
1641 if (p != deleted_pool_pg_nums.end()) {
1642 return p->second;
31f18b77 1643 }
11fdf7f2
TL
1644 dout(20) << __func__ << " " << pool << " loading" << dendl;
1645 ghobject_t oid = OSD::make_final_pool_info_oid(pool);
1646 bufferlist bl;
1647 int r = store->read(meta_ch, oid, 0, 0, bl);
1648 ceph_assert(r >= 0);
1649 auto blp = bl.cbegin();
1650 pg_pool_t pi;
1651 ::decode(pi, blp);
1652 deleted_pool_pg_nums[pool] = pi.get_pg_num();
1653 dout(20) << __func__ << " " << pool << " got " << pi.get_pg_num() << dendl;
1654 return pi.get_pg_num();
7c673cae
FG
1655}
1656
1657OSDMapRef OSDService::_add_map(OSDMap *o)
1658{
1659 epoch_t e = o->get_epoch();
1660
1661 if (cct->_conf->osd_map_dedup) {
1662 // Dedup against an existing map at a nearby epoch
1663 OSDMapRef for_dedup = map_cache.lower_bound(e);
1664 if (for_dedup) {
1665 OSDMap::dedup(for_dedup.get(), o);
1666 }
1667 }
1668 bool existed;
1669 OSDMapRef l = map_cache.add(e, o, &existed);
1670 if (existed) {
1671 delete o;
1672 }
1673 return l;
1674}
1675
1676OSDMapRef OSDService::try_get_map(epoch_t epoch)
1677{
11fdf7f2 1678 std::lock_guard l(map_cache_lock);
7c673cae
FG
1679 OSDMapRef retval = map_cache.lookup(epoch);
1680 if (retval) {
1681 dout(30) << "get_map " << epoch << " -cached" << dendl;
1682 if (logger) {
1683 logger->inc(l_osd_map_cache_hit);
1684 }
1685 return retval;
1686 }
1687 if (logger) {
1688 logger->inc(l_osd_map_cache_miss);
1689 epoch_t lb = map_cache.cached_key_lower_bound();
1690 if (epoch < lb) {
1691 dout(30) << "get_map " << epoch << " - miss, below lower bound" << dendl;
1692 logger->inc(l_osd_map_cache_miss_low);
1693 logger->inc(l_osd_map_cache_miss_low_avg, lb - epoch);
1694 }
1695 }
1696
1697 OSDMap *map = new OSDMap;
1698 if (epoch > 0) {
1699 dout(20) << "get_map " << epoch << " - loading and decoding " << map << dendl;
1700 bufferlist bl;
1701 if (!_get_map_bl(epoch, bl) || bl.length() == 0) {
1702 derr << "failed to load OSD map for epoch " << epoch << ", got " << bl.length() << " bytes" << dendl;
1703 delete map;
1704 return OSDMapRef();
1705 }
1706 map->decode(bl);
1707 } else {
1708 dout(20) << "get_map " << epoch << " - return initial " << map << dendl;
1709 }
1710 return _add_map(map);
1711}
1712
1713// ops
1714
1715
1716void OSDService::reply_op_error(OpRequestRef op, int err)
1717{
1718 reply_op_error(op, err, eversion_t(), 0);
1719}
1720
1721void OSDService::reply_op_error(OpRequestRef op, int err, eversion_t v,
1722 version_t uv)
1723{
1724 const MOSDOp *m = static_cast<const MOSDOp*>(op->get_req());
11fdf7f2 1725 ceph_assert(m->get_type() == CEPH_MSG_OSD_OP);
7c673cae
FG
1726 int flags;
1727 flags = m->get_flags() & (CEPH_OSD_FLAG_ACK|CEPH_OSD_FLAG_ONDISK);
1728
11fdf7f2 1729 MOSDOpReply *reply = new MOSDOpReply(m, err, osdmap->get_epoch(), flags, true);
7c673cae
FG
1730 reply->set_reply_versions(v, uv);
1731 m->get_connection()->send_message(reply);
1732}
1733
// Debug-only diagnostics for an op that arrived at a PG which is not its
// primary. Gated on osd_debug_misdirected_ops. For EC pools, legitimately
// re-targeted ops (see the inline explanation) are detected and silently
// dropped; everything else is logged to the cluster log.
 1734void OSDService::handle_misdirected_op(PG *pg, OpRequestRef op)
 1735{
31f18b77
FG
 1736 if (!cct->_conf->osd_debug_misdirected_ops) {
 1737 return;
 1738 }
 1739
7c673cae 1740 const MOSDOp *m = static_cast<const MOSDOp*>(op->get_req());
11fdf7f2 1741 ceph_assert(m->get_type() == CEPH_MSG_OSD_OP);
7c673cae 1742
11fdf7f2 1743 ceph_assert(m->get_map_epoch() >= pg->get_history().same_primary_since);
7c673cae
FG
 1744
 1745 if (pg->is_ec_pg()) {
 1746 /**
 1747 * OSD recomputes op target based on current OSDMap. With an EC pg, we
 1748 * can get this result:
 1749 * 1) client at map 512 sends an op to osd 3, pg_t 3.9 based on mapping
 1750 * [CRUSH_ITEM_NONE, 2, 3]/3
 1751 * 2) OSD 3 at map 513 remaps op to osd 3, spg_t 3.9s0 based on mapping
 1752 * [3, 2, 3]/3
 1753 * 3) PG 3.9s0 dequeues the op at epoch 512 and notices that it isn't primary
 1754 * -- misdirected op
 1755 * 4) client resends and this time PG 3.9s0 having caught up to 513 gets
 1756 * it and fulfils it
 1757 *
 1758 * We can't compute the op target based on the sending map epoch due to
 1759 * splitting. The simplest thing is to detect such cases here and drop
 1760 * them without an error (the client will resend anyway).
 1761 */
11fdf7f2 1762 ceph_assert(m->get_map_epoch() <= superblock.newest_map);
7c673cae
FG
 1763 OSDMapRef opmap = try_get_map(m->get_map_epoch());
 1764 if (!opmap) {
 1765 dout(7) << __func__ << ": " << *pg << " no longer have map for "
 1766 << m->get_map_epoch() << ", dropping" << dendl;
 1767 return;
 1768 }
// recompute the op's target under the map the client actually used
 1769 pg_t _pgid = m->get_raw_pg();
 1770 spg_t pgid;
 1771 if ((m->get_flags() & CEPH_OSD_FLAG_PGOP) == 0)
 1772 _pgid = opmap->raw_pg_to_pg(_pgid);
 1773 if (opmap->get_primary_shard(_pgid, &pgid) &&
11fdf7f2 1774 pgid.shard != pg->pg_id.shard) {
7c673cae
FG
 1775 dout(7) << __func__ << ": " << *pg << " primary changed since "
 1776 << m->get_map_epoch() << ", dropping" << dendl;
 1777 return;
 1778 }
 1779 }
 1780
// genuinely misdirected: make it visible in the cluster log
 1781 dout(7) << *pg << " misdirected op in " << m->get_map_epoch() << dendl;
 1782 clog->warn() << m->get_source_inst() << " misdirected " << m->get_reqid()
 1783 << " pg " << m->get_raw_pg()
 1784 << " to osd." << whoami
11fdf7f2 1785 << " not " << pg->get_acting()
7c673cae 1786 << " in e" << m->get_map_epoch() << "/" << osdmap->get_epoch();
7c673cae
FG
 1787}
1788
11fdf7f2 1789void OSDService::enqueue_back(OpQueueItem&& qi)
7c673cae 1790{
11fdf7f2 1791 osd->op_shardedwq.queue(std::move(qi));
7c673cae
FG
1792}
1793
11fdf7f2 1794void OSDService::enqueue_front(OpQueueItem&& qi)
7c673cae 1795{
11fdf7f2 1796 osd->op_shardedwq.queue_front(std::move(qi));
7c673cae
FG
1797}
1798
11fdf7f2
TL
1799void OSDService::queue_recovery_context(
1800 PG *pg,
1801 GenContext<ThreadPool::TPHandle&> *c)
7c673cae 1802{
11fdf7f2
TL
1803 epoch_t e = get_osdmap_epoch();
1804 enqueue_back(
1805 OpQueueItem(
1806 unique_ptr<OpQueueItem::OpQueueable>(
1807 new PGRecoveryContext(pg->get_pgid(), c, e)),
1808 cct->_conf->osd_recovery_cost,
1809 cct->_conf->osd_recovery_priority,
1810 ceph_clock_now(),
1811 0,
1812 e));
7c673cae
FG
1813}
1814
1815void OSDService::queue_for_snap_trim(PG *pg)
1816{
1817 dout(10) << "queueing " << *pg << " for snaptrim" << dendl;
11fdf7f2
TL
1818 enqueue_back(
1819 OpQueueItem(
1820 unique_ptr<OpQueueItem::OpQueueable>(
1821 new PGSnapTrim(pg->get_pgid(), pg->get_osdmap_epoch())),
1822 cct->_conf->osd_snap_trim_cost,
1823 cct->_conf->osd_snap_trim_priority,
1824 ceph_clock_now(),
1825 0,
1826 pg->get_osdmap_epoch()));
1827}
1828
1829void OSDService::queue_for_scrub(PG *pg, bool with_high_priority)
1830{
1831 unsigned scrub_queue_priority = pg->scrubber.priority;
1832 if (with_high_priority && scrub_queue_priority < cct->_conf->osd_client_op_priority) {
1833 scrub_queue_priority = cct->_conf->osd_client_op_priority;
1834 }
1835 const auto epoch = pg->get_osdmap_epoch();
1836 enqueue_back(
1837 OpQueueItem(
1838 unique_ptr<OpQueueItem::OpQueueable>(new PGScrub(pg->get_pgid(), epoch)),
1839 cct->_conf->osd_scrub_cost,
1840 scrub_queue_priority,
1841 ceph_clock_now(),
1842 0,
1843 epoch));
1844}
1845
1846void OSDService::queue_for_pg_delete(spg_t pgid, epoch_t e)
1847{
1848 dout(10) << __func__ << " on " << pgid << " e " << e << dendl;
1849 enqueue_back(
1850 OpQueueItem(
1851 unique_ptr<OpQueueItem::OpQueueable>(
1852 new PGDelete(pgid, e)),
1853 cct->_conf->osd_pg_delete_cost,
1854 cct->_conf->osd_pg_delete_priority,
1855 ceph_clock_now(),
1856 0,
1857 e));
1858}
1859
1860bool OSDService::try_finish_pg_delete(PG *pg, unsigned old_pg_num)
1861{
1862 return osd->try_finish_pg_delete(pg, old_pg_num);
1863}
1864
1865// ---
1866
1867void OSDService::set_ready_to_merge_source(PG *pg, eversion_t version)
1868{
1869 std::lock_guard l(merge_lock);
1870 dout(10) << __func__ << " " << pg->pg_id << dendl;
1871 ready_to_merge_source[pg->pg_id.pgid] = version;
1872 assert(not_ready_to_merge_source.count(pg->pg_id.pgid) == 0);
1873 _send_ready_to_merge();
1874}
1875
1876void OSDService::set_ready_to_merge_target(PG *pg,
1877 eversion_t version,
1878 epoch_t last_epoch_started,
1879 epoch_t last_epoch_clean)
1880{
1881 std::lock_guard l(merge_lock);
1882 dout(10) << __func__ << " " << pg->pg_id << dendl;
1883 ready_to_merge_target.insert(make_pair(pg->pg_id.pgid,
1884 make_tuple(version,
1885 last_epoch_started,
1886 last_epoch_clean)));
1887 assert(not_ready_to_merge_target.count(pg->pg_id.pgid) == 0);
1888 _send_ready_to_merge();
1889}
1890
1891void OSDService::set_not_ready_to_merge_source(pg_t source)
1892{
1893 std::lock_guard l(merge_lock);
1894 dout(10) << __func__ << " " << source << dendl;
1895 not_ready_to_merge_source.insert(source);
1896 assert(ready_to_merge_source.count(source) == 0);
1897 _send_ready_to_merge();
1898}
1899
1900void OSDService::set_not_ready_to_merge_target(pg_t target, pg_t source)
1901{
1902 std::lock_guard l(merge_lock);
1903 dout(10) << __func__ << " " << target << " source " << source << dendl;
1904 not_ready_to_merge_target[target] = source;
1905 assert(ready_to_merge_target.count(target) == 0);
1906 _send_ready_to_merge();
1907}
1908
1909void OSDService::send_ready_to_merge()
1910{
1911 std::lock_guard l(merge_lock);
1912 _send_ready_to_merge();
1913}
1914
1915void OSDService::_send_ready_to_merge()
1916{
1917 dout(20) << __func__
1918 << " ready_to_merge_source " << ready_to_merge_source
1919 << " not_ready_to_merge_source " << not_ready_to_merge_source
1920 << " ready_to_merge_target " << ready_to_merge_target
1921 << " not_ready_to_merge_target " << not_ready_to_merge_target
1922 << " sent_ready_to_merge_source " << sent_ready_to_merge_source
1923 << dendl;
1924 for (auto src : not_ready_to_merge_source) {
1925 if (sent_ready_to_merge_source.count(src) == 0) {
1926 monc->send_mon_message(new MOSDPGReadyToMerge(
1927 src,
1928 {}, {}, 0, 0,
1929 false,
1930 osdmap->get_epoch()));
1931 sent_ready_to_merge_source.insert(src);
1932 }
1933 }
1934 for (auto p : not_ready_to_merge_target) {
1935 if (sent_ready_to_merge_source.count(p.second) == 0) {
1936 monc->send_mon_message(new MOSDPGReadyToMerge(
1937 p.second,
1938 {}, {}, 0, 0,
1939 false,
1940 osdmap->get_epoch()));
1941 sent_ready_to_merge_source.insert(p.second);
1942 }
1943 }
1944 for (auto src : ready_to_merge_source) {
1945 if (not_ready_to_merge_source.count(src.first) ||
1946 not_ready_to_merge_target.count(src.first.get_parent())) {
1947 continue;
1948 }
1949 auto p = ready_to_merge_target.find(src.first.get_parent());
1950 if (p != ready_to_merge_target.end() &&
1951 sent_ready_to_merge_source.count(src.first) == 0) {
1952 monc->send_mon_message(new MOSDPGReadyToMerge(
1953 src.first, // source pgid
1954 src.second, // src version
1955 std::get<0>(p->second), // target version
1956 std::get<1>(p->second), // PG's last_epoch_started
1957 std::get<2>(p->second), // PG's last_epoch_clean
1958 true,
1959 osdmap->get_epoch()));
1960 sent_ready_to_merge_source.insert(src.first);
1961 }
1962 }
1963}
1964
1965void OSDService::clear_ready_to_merge(PG *pg)
1966{
1967 std::lock_guard l(merge_lock);
1968 dout(10) << __func__ << " " << pg->pg_id << dendl;
1969 ready_to_merge_source.erase(pg->pg_id.pgid);
1970 ready_to_merge_target.erase(pg->pg_id.pgid);
1971 not_ready_to_merge_source.erase(pg->pg_id.pgid);
1972 not_ready_to_merge_target.erase(pg->pg_id.pgid);
1973 sent_ready_to_merge_source.erase(pg->pg_id.pgid);
1974}
1975
1976void OSDService::clear_sent_ready_to_merge()
1977{
1978 std::lock_guard l(merge_lock);
1979 sent_ready_to_merge_source.clear();
1980}
1981
1982void OSDService::prune_sent_ready_to_merge(OSDMapRef& osdmap)
1983{
1984 std::lock_guard l(merge_lock);
1985 auto i = sent_ready_to_merge_source.begin();
1986 while (i != sent_ready_to_merge_source.end()) {
1987 if (!osdmap->pg_exists(*i)) {
1988 dout(10) << __func__ << " " << *i << dendl;
1989 i = sent_ready_to_merge_source.erase(i);
1990 } else {
1991 ++i;
1992 }
1993 }
7c673cae
FG
1994}
1995
11fdf7f2
TL
1996// ---
1997
// Build a PGRecovery work item for (epoch, pg) and append it to the op
// queue.  Caller must hold recovery_lock (asserted below).
void OSDService::_queue_for_recovery(
  std::pair<epoch_t, PGRef> p,
  uint64_t reserved_pushes)
{
  ceph_assert(recovery_lock.is_locked_by_me());
  enqueue_back(
    OpQueueItem(
      unique_ptr<OpQueueItem::OpQueueable>(
	new PGRecovery(
	  p.second->get_pgid(), p.first, reserved_pushes)),
      cct->_conf->osd_recovery_cost,      // queue cost of recovery work
      cct->_conf->osd_recovery_priority,  // relative scheduling priority
      ceph_clock_now(),
      0,        // NOTE(review): presumably "no client owner" — confirm
		// against OpQueueItem's constructor
      p.first));  // epoch the item was queued at
}
7c673cae
FG
2014
2015// ====================================================================
2016// OSD
2017
2018#undef dout_prefix
2019#define dout_prefix *_dout
2020
2021// Commands shared between OSD's console and admin console:
2022namespace ceph {
2023namespace osd_cmds {
2024
11fdf7f2 2025int heap(CephContext& cct, const cmdmap_t& cmdmap, Formatter& f, std::ostream& os);
7c673cae
FG
2026
2027}} // namespace ceph::osd_cmds
2028
11fdf7f2 2029int OSD::mkfs(CephContext *cct, ObjectStore *store, uuid_d fsid, int whoami)
7c673cae
FG
2030{
2031 int ret;
2032
7c673cae
FG
2033 OSDSuperblock sb;
2034 bufferlist sbbl;
11fdf7f2 2035 ObjectStore::CollectionHandle ch;
7c673cae
FG
2036
2037 // if we are fed a uuid for this osd, use it.
2038 store->set_fsid(cct->_conf->osd_uuid);
2039
2040 ret = store->mkfs();
2041 if (ret) {
224ce89b
WB
2042 derr << "OSD::mkfs: ObjectStore::mkfs failed with error "
2043 << cpp_strerror(ret) << dendl;
7c673cae
FG
2044 goto free_store;
2045 }
2046
31f18b77 2047 store->set_cache_shards(1); // doesn't matter for mkfs!
7c673cae
FG
2048
2049 ret = store->mount();
2050 if (ret) {
224ce89b
WB
2051 derr << "OSD::mkfs: couldn't mount ObjectStore: error "
2052 << cpp_strerror(ret) << dendl;
7c673cae
FG
2053 goto free_store;
2054 }
2055
11fdf7f2
TL
2056 ch = store->open_collection(coll_t::meta());
2057 if (ch) {
2058 ret = store->read(ch, OSD_SUPERBLOCK_GOBJECT, 0, 0, sbbl);
2059 if (ret < 0) {
2060 derr << "OSD::mkfs: have meta collection but no superblock" << dendl;
2061 goto free_store;
2062 }
7c673cae
FG
2063 /* if we already have superblock, check content of superblock */
2064 dout(0) << " have superblock" << dendl;
11fdf7f2
TL
2065 auto p = sbbl.cbegin();
2066 decode(sb, p);
7c673cae
FG
2067 if (whoami != sb.whoami) {
2068 derr << "provided osd id " << whoami << " != superblock's " << sb.whoami
2069 << dendl;
2070 ret = -EINVAL;
2071 goto umount_store;
2072 }
2073 if (fsid != sb.cluster_fsid) {
2074 derr << "provided cluster fsid " << fsid
2075 << " != superblock's " << sb.cluster_fsid << dendl;
2076 ret = -EINVAL;
2077 goto umount_store;
2078 }
2079 } else {
2080 // create superblock
2081 sb.cluster_fsid = fsid;
2082 sb.osd_fsid = store->get_fsid();
2083 sb.whoami = whoami;
2084 sb.compat_features = get_osd_initial_compat_set();
2085
2086 bufferlist bl;
11fdf7f2 2087 encode(sb, bl);
7c673cae 2088
11fdf7f2
TL
2089 ObjectStore::CollectionHandle ch = store->create_new_collection(
2090 coll_t::meta());
7c673cae
FG
2091 ObjectStore::Transaction t;
2092 t.create_collection(coll_t::meta(), 0);
2093 t.write(coll_t::meta(), OSD_SUPERBLOCK_GOBJECT, 0, bl.length(), bl);
11fdf7f2 2094 ret = store->queue_transaction(ch, std::move(t));
7c673cae
FG
2095 if (ret) {
2096 derr << "OSD::mkfs: error while writing OSD_SUPERBLOCK_GOBJECT: "
11fdf7f2 2097 << "queue_transaction returned " << cpp_strerror(ret) << dendl;
7c673cae
FG
2098 goto umount_store;
2099 }
2100 }
2101
3efd9988 2102 ret = write_meta(cct, store, sb.cluster_fsid, sb.osd_fsid, whoami);
7c673cae 2103 if (ret) {
224ce89b
WB
2104 derr << "OSD::mkfs: failed to write fsid file: error "
2105 << cpp_strerror(ret) << dendl;
7c673cae
FG
2106 goto umount_store;
2107 }
2108
2109umount_store:
11fdf7f2
TL
2110 if (ch) {
2111 ch.reset();
2112 }
7c673cae
FG
2113 store->umount();
2114free_store:
2115 delete store;
2116 return ret;
2117}
2118
3efd9988 2119int OSD::write_meta(CephContext *cct, ObjectStore *store, uuid_d& cluster_fsid, uuid_d& osd_fsid, int whoami)
7c673cae
FG
2120{
2121 char val[80];
2122 int r;
2123
2124 snprintf(val, sizeof(val), "%s", CEPH_OSD_ONDISK_MAGIC);
2125 r = store->write_meta("magic", val);
2126 if (r < 0)
2127 return r;
2128
2129 snprintf(val, sizeof(val), "%d", whoami);
2130 r = store->write_meta("whoami", val);
2131 if (r < 0)
2132 return r;
2133
2134 cluster_fsid.print(val);
2135 r = store->write_meta("ceph_fsid", val);
2136 if (r < 0)
2137 return r;
2138
11fdf7f2 2139 string key = cct->_conf.get_val<string>("key");
3efd9988
FG
2140 if (key.size()) {
2141 r = store->write_meta("osd_key", key);
2142 if (r < 0)
2143 return r;
b32b8144 2144 } else {
11fdf7f2 2145 string keyfile = cct->_conf.get_val<string>("keyfile");
b32b8144
FG
2146 if (!keyfile.empty()) {
2147 bufferlist keybl;
2148 string err;
11fdf7f2 2149 r = keybl.read_file(keyfile.c_str(), &err);
b32b8144
FG
2150 if (r < 0) {
2151 derr << __func__ << " failed to read keyfile " << keyfile << ": "
2152 << err << ": " << cpp_strerror(r) << dendl;
2153 return r;
2154 }
2155 r = store->write_meta("osd_key", keybl.to_str());
2156 if (r < 0)
2157 return r;
2158 }
3efd9988
FG
2159 }
2160
7c673cae
FG
2161 r = store->write_meta("ready", "ready");
2162 if (r < 0)
2163 return r;
2164
2165 return 0;
2166}
2167
11fdf7f2
TL
// Read the text metadata previously persisted by write_meta() without
// mounting the store.  Returns 0 on success, negative errno on failure.
// 'osd_fsid' is reset to the nil uuid when no "fsid" file exists, and
// '*require_osd_release' is only written when that key is present.
int OSD::peek_meta(ObjectStore *store,
		   std::string *magic,
		   uuid_d *cluster_fsid,
		   uuid_d *osd_fsid,
		   int *whoami,
		   int *require_osd_release)
{
  string val;

  int r = store->read_meta("magic", &val);
  if (r < 0)
    return r;
  *magic = val;

  r = store->read_meta("whoami", &val);
  if (r < 0)
    return r;
  *whoami = atoi(val.c_str());

  r = store->read_meta("ceph_fsid", &val);
  if (r < 0)
    return r;
  // uuid_d::parse() returns true on success, so !r means malformed fsid.
  r = cluster_fsid->parse(val.c_str());
  if (!r)
    return -EINVAL;

  r = store->read_meta("fsid", &val);
  if (r < 0) {
    // missing per-osd fsid file is tolerated: report the nil uuid
    *osd_fsid = uuid_d();
  } else {
    r = osd_fsid->parse(val.c_str());
    if (!r)
      return -EINVAL;
  }

  // optional key: absence is not an error
  r = store->read_meta("require_osd_release", &val);
  if (r >= 0) {
    *require_osd_release = atoi(val.c_str());
  }

  return 0;
}
2210
2211
2212#undef dout_prefix
2213#define dout_prefix _prefix(_dout, whoami, get_osdmap_epoch())
2214
2215// cons/des
2216
// OSD constructor: wires up the messengers, monitor/mgr clients, thread
// pools, work queues and per-shard op queues.  No I/O is performed here;
// the store is mounted later in init().
OSD::OSD(CephContext *cct_, ObjectStore *store_,
	 int id,
	 Messenger *internal_messenger,
	 Messenger *external_messenger,
	 Messenger *hb_client_front,
	 Messenger *hb_client_back,
	 Messenger *hb_front_serverm,
	 Messenger *hb_back_serverm,
	 Messenger *osdc_messenger,
	 MonClient *mc,
	 const std::string &dev, const std::string &jdev) :
  Dispatcher(cct_),
  osd_lock("OSD::osd_lock"),
  tick_timer(cct, osd_lock),
  tick_timer_lock("OSD::tick_timer_lock"),
  tick_timer_without_osd_lock(cct, tick_timer_lock),
  gss_ktfile_client(cct->_conf.get_val<std::string>("gss_ktab_client_file")),
  cluster_messenger(internal_messenger),
  client_messenger(external_messenger),
  objecter_messenger(osdc_messenger),
  monc(mc),
  mgrc(cct_, client_messenger),
  logger(NULL),
  recoverystate_perf(NULL),
  store(store_),
  log_client(cct, client_messenger, &mc->monmap, LogClient::NO_FLAGS),
  clog(log_client.create_channel()),
  whoami(id),
  dev_path(dev), journal_path(jdev),
  store_is_rotational(store->is_rotational()),
  trace_endpoint("0.0.0.0", 0, "osd"),
  asok_hook(NULL),
  m_osd_pg_epoch_max_lag_factor(cct->_conf.get_val<double>(
				  "osd_pg_epoch_max_lag_factor")),
  osd_compat(get_osd_compat_set()),
  osd_op_tp(cct, "OSD::osd_op_tp", "tp_osd_tp",
	    get_num_op_threads()),
  command_tp(cct, "OSD::command_tp", "tp_osd_cmd", 1),
  session_waiting_lock("OSD::session_waiting_lock"),
  osdmap_subscribe_lock("OSD::osdmap_subscribe_lock"),
  heartbeat_lock("OSD::heartbeat_lock"),
  heartbeat_stop(false),
  heartbeat_need_update(true),
  hb_front_client_messenger(hb_client_front),
  hb_back_client_messenger(hb_client_back),
  hb_front_server_messenger(hb_front_serverm),
  hb_back_server_messenger(hb_back_serverm),
  daily_loadavg(0.0),
  heartbeat_thread(this),
  heartbeat_dispatcher(this),
  op_tracker(cct, cct->_conf->osd_enable_op_tracker,
	     cct->_conf->osd_num_op_tracker_shard),
  test_ops_hook(NULL),
  op_queue(get_io_queue()),
  op_prio_cutoff(get_io_prio_cut()),
  op_shardedwq(
    this,
    cct->_conf->osd_op_thread_timeout,
    cct->_conf->osd_op_thread_suicide_timeout,
    &osd_op_tp),
  map_lock("OSD::map_lock"),
  last_pg_create_epoch(0),
  mon_report_lock("OSD::mon_report_lock"),
  boot_finisher(cct),
  up_thru_wanted(0),
  requested_full_first(0),
  requested_full_last(0),
  command_wq(
    this,
    cct->_conf->osd_command_thread_timeout,
    cct->_conf->osd_command_thread_suicide_timeout,
    &command_tp),
  service(this)
{

  // If a GSSAPI client keytab was configured, export it so the Kerberos
  // library picks it up for initial credentials.
  if (!gss_ktfile_client.empty()) {
    // Assert we can export environment variable
    /*
     The default client keytab is used, if it is present and readable,
     to automatically obtain initial credentials for GSSAPI client
     applications. The principal name of the first entry in the client
     keytab is used by default when obtaining initial credentials.
     1. The KRB5_CLIENT_KTNAME environment variable.
     2. The default_client_keytab_name profile variable in [libdefaults].
     3. The hardcoded default, DEFCKTNAME.
    */
    const int32_t set_result(setenv("KRB5_CLIENT_KTNAME",
				    gss_ktfile_client.c_str(), 1));
    ceph_assert(set_result == 0);
  }

  monc->set_messenger(client_messenger);
  // Configure op-tracker thresholds from the current config values.
  op_tracker.set_complaint_and_threshold(cct->_conf->osd_op_complaint_time,
					 cct->_conf->osd_op_log_threshold);
  op_tracker.set_history_size_and_duration(cct->_conf->osd_op_history_size,
					   cct->_conf->osd_op_history_duration);
  op_tracker.set_history_slow_op_size_and_threshold(cct->_conf->osd_op_history_slow_op_size,
						    cct->_conf->osd_op_history_slow_op_threshold);
#ifdef WITH_BLKIN
  std::stringstream ss;
  ss << "osd." << whoami;
  trace_endpoint.copy_name(ss.str());
#endif

  // initialize shards
  num_shards = get_num_op_shards();
  for (uint32_t i = 0; i < num_shards; i++) {
    OSDShard *one_shard = new OSDShard(
      i,
      cct,
      this,
      cct->_conf->osd_op_pq_max_tokens_per_priority,
      cct->_conf->osd_op_pq_min_cost,
      op_queue);
    shards.push_back(one_shard);
  }
}
2334
2335OSD::~OSD()
2336{
11fdf7f2
TL
2337 while (!shards.empty()) {
2338 delete shards.back();
2339 shards.pop_back();
2340 }
7c673cae
FG
2341 delete class_handler;
2342 cct->get_perfcounters_collection()->remove(recoverystate_perf);
2343 cct->get_perfcounters_collection()->remove(logger);
2344 delete recoverystate_perf;
2345 delete logger;
2346 delete store;
2347}
2348
91327a77
AA
2349double OSD::get_tick_interval() const
2350{
2351 // vary +/- 5% to avoid scrub scheduling livelocks
2352 constexpr auto delta = 0.05;
91327a77 2353 return (OSD_TICK_INTERVAL *
11fdf7f2 2354 ceph::util::generate_random_number(1.0 - delta, 1.0 + delta));
91327a77
AA
2355}
2356
7c673cae
FG
2357void cls_initialize(ClassHandler *ch);
2358
// Signal handler entry point.  Only SIGINT/SIGTERM are registered for
// this handler; both trigger a clean shutdown.
void OSD::handle_signal(int signum)
{
  ceph_assert(signum == SIGINT || signum == SIGTERM);
  derr << "*** Got signal " << sig_str(signum) << " ***" << dendl;
  shutdown();
}
2365
// Early initialization, run before init(): verify the object store is
// not already mounted by another process and register this OSD as a
// config observer.  Returns 0 on success (including when shutdown has
// already started), -EBUSY when the store is in use.
int OSD::pre_init()
{
  std::lock_guard lock(osd_lock);
  if (is_stopping())
    return 0;

  if (store->test_mount_in_use()) {
    derr << "OSD::pre_init: object store '" << dev_path << "' is "
	 << "currently in use. (Is ceph-osd already running?)" << dendl;
    return -EBUSY;
  }

  cct->_conf.add_observer(this);
  return 0;
}
2381
// Best-effort NUMA pinning.  Auto-affinity applies only when the storage
// device and both network interfaces sit on the same node and
// osd_numa_auto_affinity is enabled; the explicit osd_numa_node option
// overrides the detection.  Every failure degrades to "no affinity";
// always returns 0.
int OSD::set_numa_affinity()
{
  // storage numa node
  int store_node = -1;
  store->get_numa_node(&store_node, nullptr, nullptr);
  if (store_node >= 0) {
    dout(1) << __func__ << " storage numa node " << store_node << dendl;
  }

  // check network numa node(s)
  int front_node = -1, back_node = -1;
  string front_iface = pick_iface(
    cct,
    client_messenger->get_myaddrs().front().get_sockaddr_storage());
  string back_iface = pick_iface(
    cct,
    cluster_messenger->get_myaddrs().front().get_sockaddr_storage());
  int r = get_iface_numa_node(front_iface, &front_node);
  if (r >= 0) {
    dout(1) << __func__ << " public network " << front_iface << " numa node "
	    << front_node << dendl;
    // NOTE(review): a failure for the cluster iface is silently ignored
    // (no derr), unlike the public iface below — confirm intentional.
    r = get_iface_numa_node(back_iface, &back_node);
    if (r >= 0) {
      dout(1) << __func__ << " cluster network " << back_iface << " numa node "
	      << back_node << dendl;
      // auto-affinity only when storage and both networks agree
      if (front_node == back_node &&
	  front_node == store_node) {
	dout(1) << " objectstore and network numa nodes all match" << dendl;
	if (g_conf().get_val<bool>("osd_numa_auto_affinity")) {
	  numa_node = front_node;
	}
      } else {
	dout(1) << __func__ << " objectstore and network numa nodes do not match"
		<< dendl;
      }
    }
  } else {
    derr << __func__ << " unable to identify public interface '" << front_iface
	 << "' numa node: " << cpp_strerror(r) << dendl;
  }
  if (int node = g_conf().get_val<int64_t>("osd_numa_node"); node >= 0) {
    // this takes precedence over the automagic logic above
    numa_node = node;
  }
  if (numa_node >= 0) {
    // note: this inner 'r' shadows the interface-lookup 'r' above
    int r = get_numa_node_cpu_set(numa_node, &numa_cpu_set_size, &numa_cpu_set);
    if (r < 0) {
      dout(1) << __func__ << " unable to determine numa node " << numa_node
	      << " CPUs" << dendl;
      numa_node = -1;
    } else {
      dout(1) << __func__ << " setting numa affinity to node " << numa_node
	      << " cpus "
	      << cpu_set_to_str_list(numa_cpu_set_size, &numa_cpu_set)
	      << dendl;
      r = sched_setaffinity(getpid(), numa_cpu_set_size, &numa_cpu_set);
      if (r < 0) {
	r = -errno;
	derr << __func__ << " failed to set numa affinity: " << cpp_strerror(r)
	     << dendl;
	numa_node = -1;
      }
    }
  } else {
    dout(1) << __func__ << " not setting numa affinity" << dendl;
  }
  return 0;
}
2450
2451// asok
2452
2453class OSDSocketHook : public AdminSocketHook {
2454 OSD *osd;
2455public:
2456 explicit OSDSocketHook(OSD *o) : osd(o) {}
11fdf7f2
TL
2457 bool call(std::string_view admin_command, const cmdmap_t& cmdmap,
2458 std::string_view format, bufferlist& out) override {
7c673cae 2459 stringstream ss;
11fdf7f2
TL
2460 bool r = true;
2461 try {
2462 r = osd->asok_command(admin_command, cmdmap, format, ss);
2463 } catch (const bad_cmd_get& e) {
2464 ss << e.what();
2465 r = true;
2466 }
7c673cae
FG
2467 out.append(ss);
2468 return r;
2469 }
2470};
2471
11fdf7f2
TL
2472std::set<int64_t> OSD::get_mapped_pools()
2473{
2474 std::set<int64_t> pools;
2475 std::vector<spg_t> pgids;
2476 _get_pgids(&pgids);
2477 for (const auto &pgid : pgids) {
2478 pools.insert(pgid.pool());
2479 }
2480 return pools;
2481}
2482
// Admin-socket command dispatcher.  'admin_command' has already been
// matched against the registered command set, so the final else is
// unreachable and aborts.  Output is rendered with 'format' (defaulting
// to json-pretty) into 'ss'.  Always returns true once output (possibly
// an error string) has been produced.
// NOTE(review): 'f' is freed manually at the end; it would leak if any
// dump handler threw — confirm handlers are non-throwing.
bool OSD::asok_command(std::string_view admin_command, const cmdmap_t& cmdmap,
		       std::string_view format, ostream& ss)
{
  Formatter *f = Formatter::create(format, "json-pretty", "json-pretty");
  if (admin_command == "status") {
    // basic identity and map-range summary from the superblock
    f->open_object_section("status");
    f->dump_stream("cluster_fsid") << superblock.cluster_fsid;
    f->dump_stream("osd_fsid") << superblock.osd_fsid;
    f->dump_unsigned("whoami", superblock.whoami);
    f->dump_string("state", get_state_name(get_state()));
    f->dump_unsigned("oldest_map", superblock.oldest_map);
    f->dump_unsigned("newest_map", superblock.newest_map);
    f->dump_unsigned("num_pgs", num_pgs);
    f->close_section();
  } else if (admin_command == "flush_journal") {
    store->flush_journal();
  } else if (admin_command == "dump_ops_in_flight" ||
	     admin_command == "ops" ||
	     admin_command == "dump_blocked_ops" ||
	     admin_command == "dump_historic_ops" ||
	     admin_command == "dump_historic_ops_by_duration" ||
	     admin_command == "dump_historic_slow_ops") {

    const string error_str = "op_tracker tracking is not enabled now, so no ops are tracked currently, \
even those get stuck. Please enable \"osd_enable_op_tracker\", and the tracker \
will start to track new ops received afterwards.";

    // optional "filterstr" arguments narrow which ops are dumped
    set<string> filters;
    vector<string> filter_str;
    if (cmd_getval(cct, cmdmap, "filterstr", filter_str)) {
      copy(filter_str.begin(), filter_str.end(),
	   inserter(filters, filters.end()));
    }

    if (admin_command == "dump_ops_in_flight" ||
	admin_command == "ops") {
      if (!op_tracker.dump_ops_in_flight(f, false, filters)) {
	ss << error_str;
      }
    }
    if (admin_command == "dump_blocked_ops") {
      if (!op_tracker.dump_ops_in_flight(f, true, filters)) {
	ss << error_str;
      }
    }
    if (admin_command == "dump_historic_ops") {
      if (!op_tracker.dump_historic_ops(f, false, filters)) {
	ss << error_str;
      }
    }
    if (admin_command == "dump_historic_ops_by_duration") {
      if (!op_tracker.dump_historic_ops(f, true, filters)) {
	ss << error_str;
      }
    }
    if (admin_command == "dump_historic_slow_ops") {
      if (!op_tracker.dump_historic_slow_ops(f, filters)) {
	ss << error_str;
      }
    }
  } else if (admin_command == "dump_op_pq_state") {
    f->open_object_section("pq");
    op_shardedwq.dump(f);
    f->close_section();
  } else if (admin_command == "dump_blacklist") {
    // dump the blacklist from the current osdmap with expiry times
    list<pair<entity_addr_t,utime_t> > bl;
    OSDMapRef curmap = service.get_osdmap();

    f->open_array_section("blacklist");
    curmap->get_blacklist(&bl);
    for (list<pair<entity_addr_t,utime_t> >::iterator it = bl.begin();
	 it != bl.end(); ++it) {
      f->open_object_section("entry");
      f->open_object_section("entity_addr_t");
      it->first.dump(f);
      f->close_section(); //entity_addr_t
      it->second.localtime(f->dump_stream("expire_time"));
      f->close_section(); //entry
    }
    f->close_section(); //blacklist
  } else if (admin_command == "dump_watchers") {
    list<obj_watch_item_t> watchers;
    // scan pg's
    vector<PGRef> pgs;
    _get_pgs(&pgs);
    for (auto& pg : pgs) {
      list<obj_watch_item_t> pg_watchers;
      pg->get_watchers(&pg_watchers);
      watchers.splice(watchers.end(), pg_watchers);
    }

    f->open_array_section("watchers");
    for (list<obj_watch_item_t>::iterator it = watchers.begin();
	 it != watchers.end(); ++it) {

      f->open_object_section("watch");

      f->dump_string("namespace", it->obj.nspace);
      f->dump_string("object", it->obj.oid.name);

      f->open_object_section("entity_name");
      it->wi.name.dump(f);
      f->close_section(); //entity_name_t

      f->dump_unsigned("cookie", it->wi.cookie);
      f->dump_unsigned("timeout", it->wi.timeout_seconds);

      f->open_object_section("entity_addr_t");
      it->wi.addr.dump(f);
      f->close_section(); //entity_addr_t

      f->close_section(); //watch
    }

    f->close_section(); //watchers
  } else if (admin_command == "dump_reservations") {
    // scrub/recovery reservation state, local and remote
    f->open_object_section("reservations");
    f->open_object_section("local_reservations");
    service.local_reserver.dump(f);
    f->close_section();
    f->open_object_section("remote_reservations");
    service.remote_reserver.dump(f);
    f->close_section();
    f->close_section();
  } else if (admin_command == "get_latest_osdmap") {
    get_latest_osdmap();
  } else if (admin_command == "heap") {
    auto result = ceph::osd_cmds::heap(*cct, cmdmap, *f, ss);

    // Note: Failed heap profile commands won't necessarily trigger an error:
    f->open_object_section("result");
    f->dump_string("error", cpp_strerror(result));
    f->dump_bool("success", result >= 0);
    f->close_section();
  } else if (admin_command == "set_heap_property") {
    // validate, then forward a numeric property to tcmalloc
    string property;
    int64_t value = 0;
    string error;
    bool success = false;
    if (!cmd_getval(cct, cmdmap, "property", property)) {
      error = "unable to get property";
      success = false;
    } else if (!cmd_getval(cct, cmdmap, "value", value)) {
      error = "unable to get value";
      success = false;
    } else if (value < 0) {
      error = "negative value not allowed";
      success = false;
    } else if (!ceph_heap_set_numeric_property(property.c_str(), (size_t)value)) {
      error = "invalid property";
      success = false;
    } else {
      success = true;
    }
    f->open_object_section("result");
    f->dump_string("error", error);
    f->dump_bool("success", success);
    f->close_section();
  } else if (admin_command == "get_heap_property") {
    string property;
    size_t value = 0;
    string error;
    bool success = false;
    if (!cmd_getval(cct, cmdmap, "property", property)) {
      error = "unable to get property";
      success = false;
    } else if (!ceph_heap_get_numeric_property(property.c_str(), &value)) {
      error = "invalid property";
      success = false;
    } else {
      success = true;
    }
    f->open_object_section("result");
    f->dump_string("error", error);
    f->dump_bool("success", success);
    f->dump_int("value", value);
    f->close_section();
  } else if (admin_command == "dump_objectstore_kv_stats") {
    store->get_db_statistics(f);
  } else if (admin_command == "dump_scrubs") {
    service.dumps_scrub(f);
  } else if (admin_command == "calc_objectstore_db_histogram") {
    store->generate_db_histogram(f);
  } else if (admin_command == "flush_store_cache") {
    store->flush_cache(&ss);
  } else if (admin_command == "dump_pgstate_history") {
    f->open_object_section("pgstate_history");
    vector<PGRef> pgs;
    _get_pgs(&pgs);
    for (auto& pg : pgs) {
      f->dump_stream("pg") << pg->pg_id;
      pg->dump_pgstate_history(f);
    }
    f->close_section();
  } else if (admin_command == "compact") {
    // synchronous manual compaction; reports elapsed wall-clock time
    dout(1) << "triggering manual compaction" << dendl;
    auto start = ceph::coarse_mono_clock::now();
    store->compact();
    auto end = ceph::coarse_mono_clock::now();
    double duration = std::chrono::duration<double>(end-start).count();
    dout(1) << "finished manual compaction in "
	    << duration
	    << " seconds" << dendl;
    f->open_object_section("compact_result");
    f->dump_float("elapsed_time", duration);
    f->close_section();
  } else if (admin_command == "get_mapped_pools") {
    f->open_array_section("mapped_pools");
    set<int64_t> poollist = get_mapped_pools();
    for (auto pool : poollist) {
      f->dump_int("pool_id", pool);
    }
    f->close_section();
  } else if (admin_command == "smart") {
    string devid;
    cmd_getval(cct, cmdmap, "devid", devid);
    probe_smart(devid, ss);
  } else if (admin_command == "list_devices") {
    set<string> devnames;
    store->get_devices(&devnames);
    f->open_object_section("list_devices");
    for (auto dev : devnames) {
      // skip device-mapper entries; only physical devices are listed
      if (dev.find("dm-") == 0) {
	continue;
      }
      f->dump_string("device", "/dev/" + dev);
    }
    f->close_section();
  } else if (admin_command == "send_beacon") {
    if (is_active()) {
      send_beacon(ceph::coarse_mono_clock::now());
    }
  } else {
    ceph_abort_msg("broken asok registration");
  }
  f->flush(ss);
  delete f;
  return true;
}
2722
2723class TestOpsSocketHook : public AdminSocketHook {
2724 OSDService *service;
2725 ObjectStore *store;
2726public:
2727 TestOpsSocketHook(OSDService *s, ObjectStore *st) : service(s), store(st) {}
11fdf7f2
TL
2728 bool call(std::string_view command, const cmdmap_t& cmdmap,
2729 std::string_view format, bufferlist& out) override {
7c673cae 2730 stringstream ss;
11fdf7f2
TL
2731 try {
2732 test_ops(service, store, command, cmdmap, ss);
2733 } catch (const bad_cmd_get& e) {
2734 ss << e.what();
2735 }
7c673cae
FG
2736 out.append(ss);
2737 return true;
2738 }
2739 void test_ops(OSDService *service, ObjectStore *store,
11fdf7f2 2740 std::string_view command, const cmdmap_t& cmdmap, ostream &ss);
7c673cae
FG
2741
2742};
2743
// Timer callback driving the periodic OSD::tick(); scheduled on
// tick_timer.  The completion code 'r' is unused.
class OSD::C_Tick : public Context {
  OSD *osd;
  public:
  explicit C_Tick(OSD *o) : osd(o) {}
  void finish(int r) override {
    osd->tick();
  }
};
2752
// Variant of C_Tick scheduled on tick_timer_without_osd_lock; drives
// OSD::tick_without_osd_lock().  The completion code 'r' is unused.
class OSD::C_Tick_WithoutOSDLock : public Context {
  OSD *osd;
  public:
  explicit C_Tick_WithoutOSDLock(OSD *o) : osd(o) {}
  void finish(int r) override {
    osd->tick_without_osd_lock();
  }
};
2761
// Mount or unmount the debug FUSE view of the object store under
// $osd_data/fuse, following the osd_objectstore_fuse option.  With
// stop=true the mount is torn down regardless of the option (shutdown
// path).  No-op when built without libfuse.  Returns 0 or -errno.
int OSD::enable_disable_fuse(bool stop)
{
#ifdef HAVE_LIBFUSE
  int r;
  string mntpath = cct->_conf->osd_data + "/fuse";
  if (fuse_store && (stop || !cct->_conf->osd_objectstore_fuse)) {
    dout(1) << __func__ << " disabling" << dendl;
    fuse_store->stop();
    delete fuse_store;
    fuse_store = NULL;
    r = ::rmdir(mntpath.c_str());
    if (r < 0) {
      r = -errno;
      derr << __func__ << " failed to rmdir " << mntpath << ": "
	   << cpp_strerror(r) << dendl;
      return r;
    }
    return 0;
  }
  if (!fuse_store && cct->_conf->osd_objectstore_fuse) {
    dout(1) << __func__ << " enabling" << dendl;
    r = ::mkdir(mntpath.c_str(), 0700);
    if (r < 0)
      r = -errno;
    // a pre-existing mount directory is fine
    if (r < 0 && r != -EEXIST) {
      derr << __func__ << " unable to create " << mntpath << ": "
	   << cpp_strerror(r) << dendl;
      return r;
    }
    fuse_store = new FuseStore(store, mntpath);
    r = fuse_store->start();
    if (r < 0) {
      derr << __func__ << " unable to start fuse: " << cpp_strerror(r) << dendl;
      delete fuse_store;
      fuse_store = NULL;
      return r;
    }
  }
#endif // HAVE_LIBFUSE
  return 0;
}
2803
31f18b77
FG
2804int OSD::get_num_op_shards()
2805{
2806 if (cct->_conf->osd_op_num_shards)
2807 return cct->_conf->osd_op_num_shards;
2808 if (store_is_rotational)
2809 return cct->_conf->osd_op_num_shards_hdd;
2810 else
2811 return cct->_conf->osd_op_num_shards_ssd;
2812}
2813
2814int OSD::get_num_op_threads()
2815{
2816 if (cct->_conf->osd_op_num_threads_per_shard)
2817 return get_num_op_shards() * cct->_conf->osd_op_num_threads_per_shard;
2818 if (store_is_rotational)
2819 return get_num_op_shards() * cct->_conf->osd_op_num_threads_per_shard_hdd;
2820 else
2821 return get_num_op_shards() * cct->_conf->osd_op_num_threads_per_shard_ssd;
2822}
2823
c07f9fc5
FG
2824float OSD::get_osd_recovery_sleep()
2825{
2826 if (cct->_conf->osd_recovery_sleep)
2827 return cct->_conf->osd_recovery_sleep;
d2e6a577 2828 if (!store_is_rotational && !journal_is_rotational)
c07f9fc5 2829 return cct->_conf->osd_recovery_sleep_ssd;
d2e6a577 2830 else if (store_is_rotational && !journal_is_rotational)
11fdf7f2 2831 return cct->_conf.get_val<double>("osd_recovery_sleep_hybrid");
d2e6a577
FG
2832 else
2833 return cct->_conf->osd_recovery_sleep_hdd;
c07f9fc5
FG
2834}
2835
11fdf7f2
TL
2836float OSD::get_osd_delete_sleep()
2837{
2838 float osd_delete_sleep = cct->_conf.get_val<double>("osd_delete_sleep");
2839 if (osd_delete_sleep > 0)
2840 return osd_delete_sleep;
2841 if (!store_is_rotational && !journal_is_rotational)
2842 return cct->_conf.get_val<double>("osd_delete_sleep_ssd");
2843 if (store_is_rotational && !journal_is_rotational)
2844 return cct->_conf.get_val<double>("osd_delete_sleep_hybrid");
2845 return cct->_conf.get_val<double>("osd_delete_sleep_hdd");
2846}
2847
7c673cae
FG
2848int OSD::init()
2849{
2850 CompatSet initial, diff;
11fdf7f2 2851 std::lock_guard lock(osd_lock);
7c673cae
FG
2852 if (is_stopping())
2853 return 0;
2854
2855 tick_timer.init();
2856 tick_timer_without_osd_lock.init();
2857 service.recovery_request_timer.init();
11fdf7f2
TL
2858 service.sleep_timer.init();
2859
2860 boot_finisher.start();
2861
2862 {
2863 string val;
2864 store->read_meta("require_osd_release", &val);
2865 last_require_osd_release = atoi(val.c_str());
2866 }
7c673cae
FG
2867
2868 // mount.
31f18b77
FG
2869 dout(2) << "init " << dev_path
2870 << " (looks like " << (store_is_rotational ? "hdd" : "ssd") << ")"
2871 << dendl;
d2e6a577 2872 dout(2) << "journal " << journal_path << dendl;
11fdf7f2 2873 ceph_assert(store); // call pre_init() first!
7c673cae 2874
31f18b77 2875 store->set_cache_shards(get_num_op_shards());
7c673cae
FG
2876
2877 int r = store->mount();
2878 if (r < 0) {
2879 derr << "OSD:init: unable to mount object store" << dendl;
2880 return r;
2881 }
d2e6a577
FG
2882 journal_is_rotational = store->is_journal_rotational();
2883 dout(2) << "journal looks like " << (journal_is_rotational ? "hdd" : "ssd")
2884 << dendl;
7c673cae
FG
2885
2886 enable_disable_fuse(false);
2887
2888 dout(2) << "boot" << dendl;
2889
11fdf7f2
TL
2890 service.meta_ch = store->open_collection(coll_t::meta());
2891
7c673cae
FG
2892 // initialize the daily loadavg with current 15min loadavg
2893 double loadavgs[3];
2894 if (getloadavg(loadavgs, 3) == 3) {
2895 daily_loadavg = loadavgs[2];
2896 } else {
2897 derr << "OSD::init() : couldn't read loadavgs\n" << dendl;
2898 daily_loadavg = 1.0;
2899 }
2900
2901 int rotating_auth_attempts = 0;
11fdf7f2
TL
2902 auto rotating_auth_timeout =
2903 g_conf().get_val<int64_t>("rotating_keys_bootstrap_timeout");
7c673cae
FG
2904
2905 // sanity check long object name handling
2906 {
2907 hobject_t l;
2908 l.oid.name = string(cct->_conf->osd_max_object_name_len, 'n');
2909 l.set_key(string(cct->_conf->osd_max_object_name_len, 'k'));
2910 l.nspace = string(cct->_conf->osd_max_object_namespace_len, 's');
2911 r = store->validate_hobject_key(l);
2912 if (r < 0) {
2913 derr << "backend (" << store->get_type() << ") is unable to support max "
2914 << "object name[space] len" << dendl;
2915 derr << " osd max object name len = "
2916 << cct->_conf->osd_max_object_name_len << dendl;
2917 derr << " osd max object namespace len = "
2918 << cct->_conf->osd_max_object_namespace_len << dendl;
2919 derr << cpp_strerror(r) << dendl;
2920 if (cct->_conf->osd_check_max_object_name_len_on_startup) {
2921 goto out;
2922 }
2923 derr << "osd_check_max_object_name_len_on_startup = false, starting anyway"
2924 << dendl;
2925 } else {
2926 dout(20) << "configured osd_max_object_name[space]_len looks ok" << dendl;
2927 }
2928 }
2929
2930 // read superblock
2931 r = read_superblock();
2932 if (r < 0) {
2933 derr << "OSD::init() : unable to read osd superblock" << dendl;
2934 r = -EINVAL;
2935 goto out;
2936 }
2937
2938 if (osd_compat.compare(superblock.compat_features) < 0) {
2939 derr << "The disk uses features unsupported by the executable." << dendl;
2940 derr << " ondisk features " << superblock.compat_features << dendl;
2941 derr << " daemon features " << osd_compat << dendl;
2942
2943 if (osd_compat.writeable(superblock.compat_features)) {
2944 CompatSet diff = osd_compat.unsupported(superblock.compat_features);
2945 derr << "it is still writeable, though. Missing features: " << diff << dendl;
2946 r = -EOPNOTSUPP;
2947 goto out;
2948 }
2949 else {
2950 CompatSet diff = osd_compat.unsupported(superblock.compat_features);
2951 derr << "Cannot write to disk! Missing features: " << diff << dendl;
2952 r = -EOPNOTSUPP;
2953 goto out;
2954 }
2955 }
2956
2957 assert_warn(whoami == superblock.whoami);
2958 if (whoami != superblock.whoami) {
2959 derr << "OSD::init: superblock says osd"
2960 << superblock.whoami << " but I am osd." << whoami << dendl;
2961 r = -EINVAL;
2962 goto out;
2963 }
2964
11fdf7f2
TL
2965 // load up "current" osdmap
2966 assert_warn(!osdmap);
2967 if (osdmap) {
2968 derr << "OSD::init: unable to read current osdmap" << dendl;
2969 r = -EINVAL;
2970 goto out;
2971 }
2972 osdmap = get_map(superblock.current_epoch);
2973
2974 // make sure we don't have legacy pgs deleting
2975 {
2976 vector<coll_t> ls;
2977 int r = store->list_collections(ls);
2978 ceph_assert(r >= 0);
2979 for (auto c : ls) {
2980 spg_t pgid;
2981 if (c.is_pg(&pgid) &&
2982 !osdmap->have_pg_pool(pgid.pool())) {
2983 ghobject_t oid = make_final_pool_info_oid(pgid.pool());
2984 if (!store->exists(service.meta_ch, oid)) {
2985 derr << __func__ << " missing pg_pool_t for deleted pool "
2986 << pgid.pool() << " for pg " << pgid
2987 << "; please downgrade to luminous and allow "
2988 << "pg deletion to complete before upgrading" << dendl;
2989 ceph_abort();
2990 }
2991 }
2992 }
2993 }
2994
7c673cae
FG
2995 initial = get_osd_initial_compat_set();
2996 diff = superblock.compat_features.unsupported(initial);
2997 if (superblock.compat_features.merge(initial)) {
2998 // We need to persist the new compat_set before we
2999 // do anything else
3000 dout(5) << "Upgrading superblock adding: " << diff << dendl;
3001 ObjectStore::Transaction t;
3002 write_superblock(t);
11fdf7f2 3003 r = store->queue_transaction(service.meta_ch, std::move(t));
7c673cae
FG
3004 if (r < 0)
3005 goto out;
3006 }
3007
3008 // make sure snap mapper object exists
11fdf7f2 3009 if (!store->exists(service.meta_ch, OSD::make_snapmapper_oid())) {
7c673cae
FG
3010 dout(10) << "init creating/touching snapmapper object" << dendl;
3011 ObjectStore::Transaction t;
3012 t.touch(coll_t::meta(), OSD::make_snapmapper_oid());
11fdf7f2 3013 r = store->queue_transaction(service.meta_ch, std::move(t));
7c673cae
FG
3014 if (r < 0)
3015 goto out;
3016 }
3017
3018 class_handler = new ClassHandler(cct);
3019 cls_initialize(class_handler);
3020
3021 if (cct->_conf->osd_open_classes_on_start) {
3022 int r = class_handler->open_all_classes();
3023 if (r)
3024 dout(1) << "warning: got an error loading one or more classes: " << cpp_strerror(r) << dendl;
3025 }
3026
11fdf7f2 3027 check_osdmap_features();
7c673cae
FG
3028
3029 create_recoverystate_perf();
3030
3031 {
3032 epoch_t bind_epoch = osdmap->get_epoch();
3033 service.set_epochs(NULL, NULL, &bind_epoch);
3034 }
3035
3036 clear_temp_objects();
3037
d2e6a577 3038 // initialize osdmap references in sharded wq
11fdf7f2
TL
3039 for (auto& shard : shards) {
3040 std::lock_guard l(shard->osdmap_lock);
3041 shard->shard_osdmap = osdmap;
3042 }
d2e6a577 3043
7c673cae
FG
3044 // load up pgs (as they previously existed)
3045 load_pgs();
3046
3047 dout(2) << "superblock: I am osd." << superblock.whoami << dendl;
3048 dout(0) << "using " << op_queue << " op queue with priority op cut off at " <<
3049 op_prio_cutoff << "." << dendl;
3050
3051 create_logger();
3052
11fdf7f2
TL
3053 // prime osd stats
3054 {
3055 struct store_statfs_t stbuf;
3056 osd_alert_list_t alerts;
3057 int r = store->statfs(&stbuf, &alerts);
3058 ceph_assert(r == 0);
3059 service.set_statfs(stbuf, alerts);
3060 }
3061
3062 // client_messenger auth_client is already set up by monc.
3063 for (auto m : { cluster_messenger,
3064 objecter_messenger,
3065 hb_front_client_messenger,
3066 hb_back_client_messenger,
3067 hb_front_server_messenger,
3068 hb_back_server_messenger } ) {
3069 m->set_auth_client(monc);
3070 }
3071 for (auto m : { client_messenger,
3072 cluster_messenger,
3073 hb_front_server_messenger,
3074 hb_back_server_messenger }) {
3075 m->set_auth_server(monc);
3076 }
3077 monc->set_handle_authentication_dispatcher(this);
7c673cae
FG
3078
3079 monc->set_want_keys(CEPH_ENTITY_TYPE_MON | CEPH_ENTITY_TYPE_OSD
3080 | CEPH_ENTITY_TYPE_MGR);
3081 r = monc->init();
3082 if (r < 0)
3083 goto out;
3084
11fdf7f2
TL
3085 mgrc.set_pgstats_cb([this](){ return collect_pg_stats(); });
3086 mgrc.set_perf_metric_query_cb(
3087 [this](const std::map<OSDPerfMetricQuery, OSDPerfMetricLimits> &queries) {
3088 set_perf_queries(queries);
3089 },
3090 [this](std::map<OSDPerfMetricQuery, OSDPerfMetricReport> *reports) {
3091 get_perf_reports(reports);
3092 });
7c673cae 3093 mgrc.init();
7c673cae
FG
3094
3095 // tell monc about log_client so it will know about mon session resets
3096 monc->set_log_client(&log_client);
3097 update_log_config();
3098
11fdf7f2
TL
3099 // i'm ready!
3100 client_messenger->add_dispatcher_tail(&mgrc);
3101 client_messenger->add_dispatcher_tail(this);
3102 cluster_messenger->add_dispatcher_head(this);
3103
3104 hb_front_client_messenger->add_dispatcher_head(&heartbeat_dispatcher);
3105 hb_back_client_messenger->add_dispatcher_head(&heartbeat_dispatcher);
3106 hb_front_server_messenger->add_dispatcher_head(&heartbeat_dispatcher);
3107 hb_back_server_messenger->add_dispatcher_head(&heartbeat_dispatcher);
3108
3109 objecter_messenger->add_dispatcher_head(service.objecter);
3110
28e407b8
AA
3111 service.init();
3112 service.publish_map(osdmap);
3113 service.publish_superblock(superblock);
3114 service.max_oldest_map = superblock.oldest_map;
3115
11fdf7f2
TL
3116 for (auto& shard : shards) {
3117 // put PGs in a temporary set because we may modify pg_slots
3118 // unordered_map below.
3119 set<PGRef> pgs;
3120 for (auto& i : shard->pg_slots) {
3121 PGRef pg = i.second->pg;
3122 if (!pg) {
3123 continue;
3124 }
3125 pgs.insert(pg);
3126 }
3127 for (auto pg : pgs) {
3128 pg->lock();
3129 set<pair<spg_t,epoch_t>> new_children;
3130 set<pair<spg_t,epoch_t>> merge_pgs;
3131 service.identify_splits_and_merges(pg->get_osdmap(), osdmap, pg->pg_id,
3132 &new_children, &merge_pgs);
3133 if (!new_children.empty()) {
3134 for (auto shard : shards) {
3135 shard->prime_splits(osdmap, &new_children);
3136 }
3137 assert(new_children.empty());
3138 }
3139 if (!merge_pgs.empty()) {
3140 for (auto shard : shards) {
3141 shard->prime_merges(osdmap, &merge_pgs);
3142 }
3143 assert(merge_pgs.empty());
3144 }
3145 pg->unlock();
3146 }
3147 }
3148
7c673cae 3149 osd_op_tp.start();
7c673cae
FG
3150 command_tp.start();
3151
7c673cae
FG
3152 // start the heartbeat
3153 heartbeat_thread.create("osd_srv_heartbt");
3154
3155 // tick
91327a77
AA
3156 tick_timer.add_event_after(get_tick_interval(),
3157 new C_Tick(this));
7c673cae 3158 {
11fdf7f2 3159 std::lock_guard l(tick_timer_lock);
91327a77
AA
3160 tick_timer_without_osd_lock.add_event_after(get_tick_interval(),
3161 new C_Tick_WithoutOSDLock(this));
7c673cae
FG
3162 }
3163
7c673cae
FG
3164 osd_lock.Unlock();
3165
3166 r = monc->authenticate();
3167 if (r < 0) {
c07f9fc5
FG
3168 derr << __func__ << " authentication failed: " << cpp_strerror(r)
3169 << dendl;
11fdf7f2 3170 exit(1);
7c673cae
FG
3171 }
3172
11fdf7f2 3173 while (monc->wait_auth_rotating(rotating_auth_timeout) < 0) {
7c673cae
FG
3174 derr << "unable to obtain rotating service keys; retrying" << dendl;
3175 ++rotating_auth_attempts;
11fdf7f2 3176 if (rotating_auth_attempts > g_conf()->max_rotating_auth_attempts) {
c07f9fc5 3177 derr << __func__ << " wait_auth_rotating timed out" << dendl;
11fdf7f2 3178 exit(1);
7c673cae
FG
3179 }
3180 }
3181
3182 r = update_crush_device_class();
3183 if (r < 0) {
d2e6a577
FG
3184 derr << __func__ << " unable to update_crush_device_class: "
3185 << cpp_strerror(r) << dendl;
11fdf7f2 3186 exit(1);
7c673cae
FG
3187 }
3188
3189 r = update_crush_location();
3190 if (r < 0) {
d2e6a577 3191 derr << __func__ << " unable to update_crush_location: "
c07f9fc5 3192 << cpp_strerror(r) << dendl;
11fdf7f2 3193 exit(1);
7c673cae
FG
3194 }
3195
3196 osd_lock.Lock();
3197 if (is_stopping())
3198 return 0;
3199
3200 // start objecter *after* we have authenticated, so that we don't ignore
3201 // the OSDMaps it requests.
3202 service.final_init();
3203
3204 check_config();
3205
3206 dout(10) << "ensuring pgs have consumed prior maps" << dendl;
3207 consume_map();
7c673cae
FG
3208
3209 dout(0) << "done with init, starting boot process" << dendl;
3210
3211 // subscribe to any pg creations
3212 monc->sub_want("osd_pg_creates", last_pg_create_epoch, 0);
3213
3214 // MgrClient needs this (it doesn't have MonClient reference itself)
3215 monc->sub_want("mgrmap", 0, 0);
3216
3217 // we don't need to ask for an osdmap here; objecter will
3218 //monc->sub_want("osdmap", osdmap->get_epoch(), CEPH_SUBSCRIBE_ONETIME);
3219
3220 monc->renew_subs();
3221
3222 start_boot();
3223
3224 return 0;
7c673cae
FG
3225
3226out:
3227 enable_disable_fuse(true);
3228 store->umount();
3229 delete store;
3230 store = NULL;
3231 return r;
3232}
3233
3234void OSD::final_init()
3235{
3236 AdminSocket *admin_socket = cct->get_admin_socket();
3237 asok_hook = new OSDSocketHook(this);
3238 int r = admin_socket->register_command("status", "status", asok_hook,
3239 "high-level status of OSD");
11fdf7f2 3240 ceph_assert(r == 0);
7c673cae
FG
3241 r = admin_socket->register_command("flush_journal", "flush_journal",
3242 asok_hook,
3243 "flush the journal to permanent store");
11fdf7f2 3244 ceph_assert(r == 0);
7c673cae 3245 r = admin_socket->register_command("dump_ops_in_flight",
c07f9fc5
FG
3246 "dump_ops_in_flight " \
3247 "name=filterstr,type=CephString,n=N,req=false",
3248 asok_hook,
7c673cae 3249 "show the ops currently in flight");
11fdf7f2 3250 ceph_assert(r == 0);
7c673cae 3251 r = admin_socket->register_command("ops",
c07f9fc5
FG
3252 "ops " \
3253 "name=filterstr,type=CephString,n=N,req=false",
3254 asok_hook,
7c673cae 3255 "show the ops currently in flight");
11fdf7f2 3256 ceph_assert(r == 0);
7c673cae 3257 r = admin_socket->register_command("dump_blocked_ops",
c07f9fc5
FG
3258 "dump_blocked_ops " \
3259 "name=filterstr,type=CephString,n=N,req=false",
3260 asok_hook,
7c673cae 3261 "show the blocked ops currently in flight");
11fdf7f2 3262 ceph_assert(r == 0);
c07f9fc5
FG
3263 r = admin_socket->register_command("dump_historic_ops",
3264 "dump_historic_ops " \
3265 "name=filterstr,type=CephString,n=N,req=false",
7c673cae
FG
3266 asok_hook,
3267 "show recent ops");
11fdf7f2 3268 ceph_assert(r == 0);
c07f9fc5
FG
3269 r = admin_socket->register_command("dump_historic_slow_ops",
3270 "dump_historic_slow_ops " \
3271 "name=filterstr,type=CephString,n=N,req=false",
7c673cae
FG
3272 asok_hook,
3273 "show slowest recent ops");
11fdf7f2 3274 ceph_assert(r == 0);
c07f9fc5
FG
3275 r = admin_socket->register_command("dump_historic_ops_by_duration",
3276 "dump_historic_ops_by_duration " \
3277 "name=filterstr,type=CephString,n=N,req=false",
7c673cae
FG
3278 asok_hook,
3279 "show slowest recent ops, sorted by duration");
11fdf7f2 3280 ceph_assert(r == 0);
7c673cae
FG
3281 r = admin_socket->register_command("dump_op_pq_state", "dump_op_pq_state",
3282 asok_hook,
3283 "dump op priority queue state");
11fdf7f2 3284 ceph_assert(r == 0);
7c673cae
FG
3285 r = admin_socket->register_command("dump_blacklist", "dump_blacklist",
3286 asok_hook,
3287 "dump blacklisted clients and times");
11fdf7f2 3288 ceph_assert(r == 0);
7c673cae
FG
3289 r = admin_socket->register_command("dump_watchers", "dump_watchers",
3290 asok_hook,
3291 "show clients which have active watches,"
3292 " and on which objects");
11fdf7f2 3293 ceph_assert(r == 0);
7c673cae
FG
3294 r = admin_socket->register_command("dump_reservations", "dump_reservations",
3295 asok_hook,
3296 "show recovery reservations");
11fdf7f2 3297 ceph_assert(r == 0);
7c673cae
FG
3298 r = admin_socket->register_command("get_latest_osdmap", "get_latest_osdmap",
3299 asok_hook,
3300 "force osd to update the latest map from "
3301 "the mon");
11fdf7f2 3302 ceph_assert(r == 0);
7c673cae
FG
3303
3304 r = admin_socket->register_command( "heap",
3305 "heap " \
11fdf7f2
TL
3306 "name=heapcmd,type=CephString " \
3307 "name=value,type=CephString,req=false",
7c673cae
FG
3308 asok_hook,
3309 "show heap usage info (available only if "
3310 "compiled with tcmalloc)");
11fdf7f2 3311 ceph_assert(r == 0);
7c673cae
FG
3312
3313 r = admin_socket->register_command("set_heap_property",
3314 "set_heap_property " \
3315 "name=property,type=CephString " \
3316 "name=value,type=CephInt",
3317 asok_hook,
3318 "update malloc extension heap property");
11fdf7f2 3319 ceph_assert(r == 0);
7c673cae
FG
3320
3321 r = admin_socket->register_command("get_heap_property",
3322 "get_heap_property " \
3323 "name=property,type=CephString",
3324 asok_hook,
3325 "get malloc extension heap property");
11fdf7f2 3326 ceph_assert(r == 0);
7c673cae
FG
3327
3328 r = admin_socket->register_command("dump_objectstore_kv_stats",
3329 "dump_objectstore_kv_stats",
3330 asok_hook,
3331 "print statistics of kvdb which used by bluestore");
11fdf7f2 3332 ceph_assert(r == 0);
7c673cae
FG
3333
3334 r = admin_socket->register_command("dump_scrubs",
3335 "dump_scrubs",
3336 asok_hook,
3337 "print scheduled scrubs");
11fdf7f2 3338 ceph_assert(r == 0);
7c673cae
FG
3339
3340 r = admin_socket->register_command("calc_objectstore_db_histogram",
3341 "calc_objectstore_db_histogram",
3342 asok_hook,
3343 "Generate key value histogram of kvdb(rocksdb) which used by bluestore");
11fdf7f2 3344 ceph_assert(r == 0);
7c673cae
FG
3345
3346 r = admin_socket->register_command("flush_store_cache",
3347 "flush_store_cache",
3348 asok_hook,
3349 "Flush bluestore internal cache");
11fdf7f2 3350 ceph_assert(r == 0);
7c673cae
FG
3351 r = admin_socket->register_command("dump_pgstate_history", "dump_pgstate_history",
3352 asok_hook,
3353 "show recent state history");
11fdf7f2 3354 ceph_assert(r == 0);
7c673cae 3355
224ce89b
WB
3356 r = admin_socket->register_command("compact", "compact",
3357 asok_hook,
3358 "Commpact object store's omap."
3359 " WARNING: Compaction probably slows your requests");
11fdf7f2
TL
3360 ceph_assert(r == 0);
3361
3362 r = admin_socket->register_command("get_mapped_pools", "get_mapped_pools",
3363 asok_hook,
3364 "dump pools whose PG(s) are mapped to this OSD.");
3365
3366 ceph_assert(r == 0);
3367
3368 r = admin_socket->register_command("smart", "smart name=devid,type=CephString,req=False",
3369 asok_hook,
3370 "probe OSD devices for SMART data.");
3371
3372 ceph_assert(r == 0);
3373
3374 r = admin_socket->register_command("list_devices", "list_devices",
3375 asok_hook,
3376 "list OSD devices.");
3377 r = admin_socket->register_command("send_beacon", "send_beacon",
3378 asok_hook,
3379 "send OSD beacon to mon immediately");
224ce89b 3380
7c673cae
FG
3381 test_ops_hook = new TestOpsSocketHook(&(this->service), this->store);
3382 // Note: pools are CephString instead of CephPoolname because
3383 // these commands traditionally support both pool names and numbers
3384 r = admin_socket->register_command(
3385 "setomapval",
3386 "setomapval " \
3387 "name=pool,type=CephString " \
3388 "name=objname,type=CephObjectname " \
3389 "name=key,type=CephString "\
3390 "name=val,type=CephString",
3391 test_ops_hook,
3392 "set omap key");
11fdf7f2 3393 ceph_assert(r == 0);
7c673cae
FG
3394 r = admin_socket->register_command(
3395 "rmomapkey",
3396 "rmomapkey " \
3397 "name=pool,type=CephString " \
3398 "name=objname,type=CephObjectname " \
3399 "name=key,type=CephString",
3400 test_ops_hook,
3401 "remove omap key");
11fdf7f2 3402 ceph_assert(r == 0);
7c673cae
FG
3403 r = admin_socket->register_command(
3404 "setomapheader",
3405 "setomapheader " \
3406 "name=pool,type=CephString " \
3407 "name=objname,type=CephObjectname " \
3408 "name=header,type=CephString",
3409 test_ops_hook,
3410 "set omap header");
11fdf7f2 3411 ceph_assert(r == 0);
7c673cae
FG
3412
3413 r = admin_socket->register_command(
3414 "getomap",
3415 "getomap " \
3416 "name=pool,type=CephString " \
3417 "name=objname,type=CephObjectname",
3418 test_ops_hook,
3419 "output entire object map");
11fdf7f2 3420 ceph_assert(r == 0);
7c673cae
FG
3421
3422 r = admin_socket->register_command(
3423 "truncobj",
3424 "truncobj " \
3425 "name=pool,type=CephString " \
3426 "name=objname,type=CephObjectname " \
3427 "name=len,type=CephInt",
3428 test_ops_hook,
3429 "truncate object to length");
11fdf7f2 3430 ceph_assert(r == 0);
7c673cae
FG
3431
3432 r = admin_socket->register_command(
3433 "injectdataerr",
3434 "injectdataerr " \
3435 "name=pool,type=CephString " \
3436 "name=objname,type=CephObjectname " \
3437 "name=shardid,type=CephInt,req=false,range=0|255",
3438 test_ops_hook,
3439 "inject data error to an object");
11fdf7f2 3440 ceph_assert(r == 0);
7c673cae
FG
3441
3442 r = admin_socket->register_command(
3443 "injectmdataerr",
3444 "injectmdataerr " \
3445 "name=pool,type=CephString " \
3446 "name=objname,type=CephObjectname " \
3447 "name=shardid,type=CephInt,req=false,range=0|255",
3448 test_ops_hook,
3449 "inject metadata error to an object");
11fdf7f2 3450 ceph_assert(r == 0);
7c673cae
FG
3451 r = admin_socket->register_command(
3452 "set_recovery_delay",
3453 "set_recovery_delay " \
3454 "name=utime,type=CephInt,req=false",
3455 test_ops_hook,
3456 "Delay osd recovery by specified seconds");
11fdf7f2 3457 ceph_assert(r == 0);
7c673cae
FG
3458 r = admin_socket->register_command(
3459 "trigger_scrub",
3460 "trigger_scrub " \
a8e16298
TL
3461 "name=pgid,type=CephString " \
3462 "name=time,type=CephInt,req=false",
7c673cae
FG
3463 test_ops_hook,
3464 "Trigger a scheduled scrub ");
11fdf7f2 3465 ceph_assert(r == 0);
a8e16298
TL
3466 r = admin_socket->register_command(
3467 "trigger_deep_scrub",
3468 "trigger_deep_scrub " \
3469 "name=pgid,type=CephString " \
3470 "name=time,type=CephInt,req=false",
3471 test_ops_hook,
3472 "Trigger a scheduled deep scrub ");
3473 ceph_assert(r == 0);
7c673cae
FG
3474 r = admin_socket->register_command(
3475 "injectfull",
3476 "injectfull " \
3477 "name=type,type=CephString,req=false " \
3478 "name=count,type=CephInt,req=false ",
3479 test_ops_hook,
3480 "Inject a full disk (optional count times)");
11fdf7f2 3481 ceph_assert(r == 0);
7c673cae
FG
3482}
3483
3484void OSD::create_logger()
3485{
3486 dout(10) << "create_logger" << dendl;
3487
3488 PerfCountersBuilder osd_plb(cct, "osd", l_osd_first, l_osd_last);
3489
3490 // Latency axis configuration for op histograms, values are in nanoseconds
3491 PerfHistogramCommon::axis_config_d op_hist_x_axis_config{
3492 "Latency (usec)",
3493 PerfHistogramCommon::SCALE_LOG2, ///< Latency in logarithmic scale
3494 0, ///< Start at 0
3495 100000, ///< Quantization unit is 100usec
3496 32, ///< Enough to cover much longer than slow requests
3497 };
3498
3499 // Op size axis configuration for op histograms, values are in bytes
3500 PerfHistogramCommon::axis_config_d op_hist_y_axis_config{
3501 "Request size (bytes)",
3502 PerfHistogramCommon::SCALE_LOG2, ///< Request size in logarithmic scale
3503 0, ///< Start at 0
3504 512, ///< Quantization unit is 512 bytes
3505 32, ///< Enough to cover requests larger than GB
3506 };
3507
3508
3efd9988
FG
3509 // All the basic OSD operation stats are to be considered useful
3510 osd_plb.set_prio_default(PerfCountersBuilder::PRIO_USEFUL);
3511
7c673cae
FG
3512 osd_plb.add_u64(
3513 l_osd_op_wip, "op_wip",
3514 "Replication operations currently being processed (primary)");
3515 osd_plb.add_u64_counter(
3516 l_osd_op, "op",
3517 "Client operations",
3518 "ops", PerfCountersBuilder::PRIO_CRITICAL);
3519 osd_plb.add_u64_counter(
3520 l_osd_op_inb, "op_in_bytes",
3521 "Client operations total write size",
11fdf7f2 3522 "wr", PerfCountersBuilder::PRIO_INTERESTING, unit_t(UNIT_BYTES));
7c673cae
FG
3523 osd_plb.add_u64_counter(
3524 l_osd_op_outb, "op_out_bytes",
3525 "Client operations total read size",
11fdf7f2 3526 "rd", PerfCountersBuilder::PRIO_INTERESTING, unit_t(UNIT_BYTES));
7c673cae
FG
3527 osd_plb.add_time_avg(
3528 l_osd_op_lat, "op_latency",
3529 "Latency of client operations (including queue time)",
3530 "l", 9);
3531 osd_plb.add_time_avg(
3532 l_osd_op_process_lat, "op_process_latency",
3533 "Latency of client operations (excluding queue time)");
3534 osd_plb.add_time_avg(
3535 l_osd_op_prepare_lat, "op_prepare_latency",
3536 "Latency of client operations (excluding queue time and wait for finished)");
3537
3538 osd_plb.add_u64_counter(
3539 l_osd_op_r, "op_r", "Client read operations");
3540 osd_plb.add_u64_counter(
11fdf7f2 3541 l_osd_op_r_outb, "op_r_out_bytes", "Client data read", NULL, PerfCountersBuilder::PRIO_USEFUL, unit_t(UNIT_BYTES));
7c673cae
FG
3542 osd_plb.add_time_avg(
3543 l_osd_op_r_lat, "op_r_latency",
3544 "Latency of read operation (including queue time)");
31f18b77 3545 osd_plb.add_u64_counter_histogram(
7c673cae
FG
3546 l_osd_op_r_lat_outb_hist, "op_r_latency_out_bytes_histogram",
3547 op_hist_x_axis_config, op_hist_y_axis_config,
3548 "Histogram of operation latency (including queue time) + data read");
3549 osd_plb.add_time_avg(
3550 l_osd_op_r_process_lat, "op_r_process_latency",
3551 "Latency of read operation (excluding queue time)");
3552 osd_plb.add_time_avg(
3553 l_osd_op_r_prepare_lat, "op_r_prepare_latency",
3554 "Latency of read operations (excluding queue time and wait for finished)");
3555 osd_plb.add_u64_counter(
3556 l_osd_op_w, "op_w", "Client write operations");
3557 osd_plb.add_u64_counter(
3558 l_osd_op_w_inb, "op_w_in_bytes", "Client data written");
3559 osd_plb.add_time_avg(
3560 l_osd_op_w_lat, "op_w_latency",
3561 "Latency of write operation (including queue time)");
31f18b77 3562 osd_plb.add_u64_counter_histogram(
7c673cae
FG
3563 l_osd_op_w_lat_inb_hist, "op_w_latency_in_bytes_histogram",
3564 op_hist_x_axis_config, op_hist_y_axis_config,
3565 "Histogram of operation latency (including queue time) + data written");
3566 osd_plb.add_time_avg(
3567 l_osd_op_w_process_lat, "op_w_process_latency",
3568 "Latency of write operation (excluding queue time)");
3569 osd_plb.add_time_avg(
3570 l_osd_op_w_prepare_lat, "op_w_prepare_latency",
3571 "Latency of write operations (excluding queue time and wait for finished)");
3572 osd_plb.add_u64_counter(
3573 l_osd_op_rw, "op_rw",
3574 "Client read-modify-write operations");
3575 osd_plb.add_u64_counter(
3576 l_osd_op_rw_inb, "op_rw_in_bytes",
11fdf7f2 3577 "Client read-modify-write operations write in", NULL, PerfCountersBuilder::PRIO_USEFUL, unit_t(UNIT_BYTES));
7c673cae
FG
3578 osd_plb.add_u64_counter(
3579 l_osd_op_rw_outb,"op_rw_out_bytes",
11fdf7f2 3580 "Client read-modify-write operations read out ", NULL, PerfCountersBuilder::PRIO_USEFUL, unit_t(UNIT_BYTES));
7c673cae
FG
3581 osd_plb.add_time_avg(
3582 l_osd_op_rw_lat, "op_rw_latency",
3583 "Latency of read-modify-write operation (including queue time)");
31f18b77 3584 osd_plb.add_u64_counter_histogram(
7c673cae
FG
3585 l_osd_op_rw_lat_inb_hist, "op_rw_latency_in_bytes_histogram",
3586 op_hist_x_axis_config, op_hist_y_axis_config,
3587 "Histogram of rw operation latency (including queue time) + data written");
31f18b77 3588 osd_plb.add_u64_counter_histogram(
7c673cae
FG
3589 l_osd_op_rw_lat_outb_hist, "op_rw_latency_out_bytes_histogram",
3590 op_hist_x_axis_config, op_hist_y_axis_config,
3591 "Histogram of rw operation latency (including queue time) + data read");
3592 osd_plb.add_time_avg(
3593 l_osd_op_rw_process_lat, "op_rw_process_latency",
3594 "Latency of read-modify-write operation (excluding queue time)");
3595 osd_plb.add_time_avg(
3596 l_osd_op_rw_prepare_lat, "op_rw_prepare_latency",
3597 "Latency of read-modify-write operations (excluding queue time and wait for finished)");
3598
3efd9988
FG
3599 // Now we move on to some more obscure stats, revert to assuming things
3600 // are low priority unless otherwise specified.
3601 osd_plb.set_prio_default(PerfCountersBuilder::PRIO_DEBUGONLY);
3602
224ce89b
WB
3603 osd_plb.add_time_avg(l_osd_op_before_queue_op_lat, "op_before_queue_op_lat",
3604 "Latency of IO before calling queue(before really queue into ShardedOpWq)"); // client io before queue op_wq latency
3605 osd_plb.add_time_avg(l_osd_op_before_dequeue_op_lat, "op_before_dequeue_op_lat",
3606 "Latency of IO before calling dequeue_op(already dequeued and get PG lock)"); // client io before dequeue_op latency
3607
7c673cae
FG
3608 osd_plb.add_u64_counter(
3609 l_osd_sop, "subop", "Suboperations");
3610 osd_plb.add_u64_counter(
11fdf7f2 3611 l_osd_sop_inb, "subop_in_bytes", "Suboperations total size", NULL, 0, unit_t(UNIT_BYTES));
7c673cae
FG
3612 osd_plb.add_time_avg(l_osd_sop_lat, "subop_latency", "Suboperations latency");
3613
3614 osd_plb.add_u64_counter(l_osd_sop_w, "subop_w", "Replicated writes");
3615 osd_plb.add_u64_counter(
11fdf7f2 3616 l_osd_sop_w_inb, "subop_w_in_bytes", "Replicated written data size", NULL, 0, unit_t(UNIT_BYTES));
7c673cae
FG
3617 osd_plb.add_time_avg(
3618 l_osd_sop_w_lat, "subop_w_latency", "Replicated writes latency");
3619 osd_plb.add_u64_counter(
3620 l_osd_sop_pull, "subop_pull", "Suboperations pull requests");
3621 osd_plb.add_time_avg(
3622 l_osd_sop_pull_lat, "subop_pull_latency", "Suboperations pull latency");
3623 osd_plb.add_u64_counter(
3624 l_osd_sop_push, "subop_push", "Suboperations push messages");
3625 osd_plb.add_u64_counter(
11fdf7f2 3626 l_osd_sop_push_inb, "subop_push_in_bytes", "Suboperations pushed size", NULL, 0, unit_t(UNIT_BYTES));
7c673cae
FG
3627 osd_plb.add_time_avg(
3628 l_osd_sop_push_lat, "subop_push_latency", "Suboperations push latency");
3629
3630 osd_plb.add_u64_counter(l_osd_pull, "pull", "Pull requests sent");
3631 osd_plb.add_u64_counter(l_osd_push, "push", "Push messages sent");
11fdf7f2 3632 osd_plb.add_u64_counter(l_osd_push_outb, "push_out_bytes", "Pushed size", NULL, 0, unit_t(UNIT_BYTES));
7c673cae
FG
3633
3634 osd_plb.add_u64_counter(
3635 l_osd_rop, "recovery_ops",
3636 "Started recovery operations",
3637 "rop", PerfCountersBuilder::PRIO_INTERESTING);
3638
11fdf7f2
TL
3639 osd_plb.add_u64_counter(
3640 l_osd_rbytes, "recovery_bytes",
3641 "recovery bytes",
3642 "rbt", PerfCountersBuilder::PRIO_INTERESTING);
3643
7c673cae 3644 osd_plb.add_u64(l_osd_loadavg, "loadavg", "CPU load");
7c673cae
FG
3645 osd_plb.add_u64(
3646 l_osd_cached_crc, "cached_crc", "Total number getting crc from crc_cache");
3647 osd_plb.add_u64(
3648 l_osd_cached_crc_adjusted, "cached_crc_adjusted",
3649 "Total number getting crc from crc_cache with adjusting");
3650 osd_plb.add_u64(l_osd_missed_crc, "missed_crc",
3651 "Total number of crc cache misses");
3652
3653 osd_plb.add_u64(l_osd_pg, "numpg", "Placement groups",
3654 "pgs", PerfCountersBuilder::PRIO_USEFUL);
3655 osd_plb.add_u64(
3656 l_osd_pg_primary, "numpg_primary",
3657 "Placement groups for which this osd is primary");
3658 osd_plb.add_u64(
3659 l_osd_pg_replica, "numpg_replica",
3660 "Placement groups for which this osd is replica");
3661 osd_plb.add_u64(
3662 l_osd_pg_stray, "numpg_stray",
3663 "Placement groups ready to be deleted from this osd");
94b18763
FG
3664 osd_plb.add_u64(
3665 l_osd_pg_removing, "numpg_removing",
3666 "Placement groups queued for local deletion", "pgsr",
3667 PerfCountersBuilder::PRIO_USEFUL);
7c673cae
FG
3668 osd_plb.add_u64(
3669 l_osd_hb_to, "heartbeat_to_peers", "Heartbeat (ping) peers we send to");
3670 osd_plb.add_u64_counter(l_osd_map, "map_messages", "OSD map messages");
3671 osd_plb.add_u64_counter(l_osd_mape, "map_message_epochs", "OSD map epochs");
3672 osd_plb.add_u64_counter(
3673 l_osd_mape_dup, "map_message_epoch_dups", "OSD map duplicates");
3674 osd_plb.add_u64_counter(
3675 l_osd_waiting_for_map, "messages_delayed_for_map",
3676 "Operations waiting for OSD map");
31f18b77 3677
7c673cae
FG
3678 osd_plb.add_u64_counter(
3679 l_osd_map_cache_hit, "osd_map_cache_hit", "osdmap cache hit");
3680 osd_plb.add_u64_counter(
3681 l_osd_map_cache_miss, "osd_map_cache_miss", "osdmap cache miss");
3682 osd_plb.add_u64_counter(
3683 l_osd_map_cache_miss_low, "osd_map_cache_miss_low",
3684 "osdmap cache miss below cache lower bound");
3685 osd_plb.add_u64_avg(
3686 l_osd_map_cache_miss_low_avg, "osd_map_cache_miss_low_avg",
3687 "osdmap cache miss, avg distance below cache lower bound");
31f18b77
FG
3688 osd_plb.add_u64_counter(
3689 l_osd_map_bl_cache_hit, "osd_map_bl_cache_hit",
3690 "OSDMap buffer cache hits");
3691 osd_plb.add_u64_counter(
3692 l_osd_map_bl_cache_miss, "osd_map_bl_cache_miss",
3693 "OSDMap buffer cache misses");
7c673cae 3694
3efd9988
FG
3695 osd_plb.add_u64(
3696 l_osd_stat_bytes, "stat_bytes", "OSD size", "size",
11fdf7f2 3697 PerfCountersBuilder::PRIO_USEFUL, unit_t(UNIT_BYTES));
3efd9988
FG
3698 osd_plb.add_u64(
3699 l_osd_stat_bytes_used, "stat_bytes_used", "Used space", "used",
11fdf7f2
TL
3700 PerfCountersBuilder::PRIO_USEFUL, unit_t(UNIT_BYTES));
3701 osd_plb.add_u64(l_osd_stat_bytes_avail, "stat_bytes_avail", "Available space", NULL, 0, unit_t(UNIT_BYTES));
7c673cae
FG
3702
3703 osd_plb.add_u64_counter(
3704 l_osd_copyfrom, "copyfrom", "Rados \"copy-from\" operations");
3705
3706 osd_plb.add_u64_counter(l_osd_tier_promote, "tier_promote", "Tier promotions");
3707 osd_plb.add_u64_counter(l_osd_tier_flush, "tier_flush", "Tier flushes");
3708 osd_plb.add_u64_counter(
3709 l_osd_tier_flush_fail, "tier_flush_fail", "Failed tier flushes");
3710 osd_plb.add_u64_counter(
3711 l_osd_tier_try_flush, "tier_try_flush", "Tier flush attempts");
3712 osd_plb.add_u64_counter(
3713 l_osd_tier_try_flush_fail, "tier_try_flush_fail",
3714 "Failed tier flush attempts");
3715 osd_plb.add_u64_counter(
3716 l_osd_tier_evict, "tier_evict", "Tier evictions");
3717 osd_plb.add_u64_counter(
3718 l_osd_tier_whiteout, "tier_whiteout", "Tier whiteouts");
3719 osd_plb.add_u64_counter(
3720 l_osd_tier_dirty, "tier_dirty", "Dirty tier flag set");
3721 osd_plb.add_u64_counter(
3722 l_osd_tier_clean, "tier_clean", "Dirty tier flag cleaned");
3723 osd_plb.add_u64_counter(
3724 l_osd_tier_delay, "tier_delay", "Tier delays (agent waiting)");
3725 osd_plb.add_u64_counter(
3726 l_osd_tier_proxy_read, "tier_proxy_read", "Tier proxy reads");
3727 osd_plb.add_u64_counter(
3728 l_osd_tier_proxy_write, "tier_proxy_write", "Tier proxy writes");
3729
3730 osd_plb.add_u64_counter(
3731 l_osd_agent_wake, "agent_wake", "Tiering agent wake up");
3732 osd_plb.add_u64_counter(
3733 l_osd_agent_skip, "agent_skip", "Objects skipped by agent");
3734 osd_plb.add_u64_counter(
3735 l_osd_agent_flush, "agent_flush", "Tiering agent flushes");
3736 osd_plb.add_u64_counter(
3737 l_osd_agent_evict, "agent_evict", "Tiering agent evictions");
3738
3739 osd_plb.add_u64_counter(
3740 l_osd_object_ctx_cache_hit, "object_ctx_cache_hit", "Object context cache hits");
3741 osd_plb.add_u64_counter(
3742 l_osd_object_ctx_cache_total, "object_ctx_cache_total", "Object context cache lookups");
3743
3744 osd_plb.add_u64_counter(l_osd_op_cache_hit, "op_cache_hit");
3745 osd_plb.add_time_avg(
3746 l_osd_tier_flush_lat, "osd_tier_flush_lat", "Object flush latency");
3747 osd_plb.add_time_avg(
3748 l_osd_tier_promote_lat, "osd_tier_promote_lat", "Object promote latency");
3749 osd_plb.add_time_avg(
3750 l_osd_tier_r_lat, "osd_tier_r_lat", "Object proxy read latency");
3751
3752 osd_plb.add_u64_counter(
3753 l_osd_pg_info, "osd_pg_info", "PG updated its info (using any method)");
3754 osd_plb.add_u64_counter(
3755 l_osd_pg_fastinfo, "osd_pg_fastinfo",
3756 "PG updated its info using fastinfo attr");
3757 osd_plb.add_u64_counter(
3758 l_osd_pg_biginfo, "osd_pg_biginfo", "PG updated its biginfo attr");
3759
3760 logger = osd_plb.create_perf_counters();
3761 cct->get_perfcounters_collection()->add(logger);
3762}
3763
3764void OSD::create_recoverystate_perf()
3765{
3766 dout(10) << "create_recoverystate_perf" << dendl;
3767
3768 PerfCountersBuilder rs_perf(cct, "recoverystate_perf", rs_first, rs_last);
3769
3770 rs_perf.add_time_avg(rs_initial_latency, "initial_latency", "Initial recovery state latency");
3771 rs_perf.add_time_avg(rs_started_latency, "started_latency", "Started recovery state latency");
3772 rs_perf.add_time_avg(rs_reset_latency, "reset_latency", "Reset recovery state latency");
3773 rs_perf.add_time_avg(rs_start_latency, "start_latency", "Start recovery state latency");
3774 rs_perf.add_time_avg(rs_primary_latency, "primary_latency", "Primary recovery state latency");
3775 rs_perf.add_time_avg(rs_peering_latency, "peering_latency", "Peering recovery state latency");
3776 rs_perf.add_time_avg(rs_backfilling_latency, "backfilling_latency", "Backfilling recovery state latency");
3777 rs_perf.add_time_avg(rs_waitremotebackfillreserved_latency, "waitremotebackfillreserved_latency", "Wait remote backfill reserved recovery state latency");
3778 rs_perf.add_time_avg(rs_waitlocalbackfillreserved_latency, "waitlocalbackfillreserved_latency", "Wait local backfill reserved recovery state latency");
3779 rs_perf.add_time_avg(rs_notbackfilling_latency, "notbackfilling_latency", "Notbackfilling recovery state latency");
3780 rs_perf.add_time_avg(rs_repnotrecovering_latency, "repnotrecovering_latency", "Repnotrecovering recovery state latency");
3781 rs_perf.add_time_avg(rs_repwaitrecoveryreserved_latency, "repwaitrecoveryreserved_latency", "Rep wait recovery reserved recovery state latency");
3782 rs_perf.add_time_avg(rs_repwaitbackfillreserved_latency, "repwaitbackfillreserved_latency", "Rep wait backfill reserved recovery state latency");
3783 rs_perf.add_time_avg(rs_reprecovering_latency, "reprecovering_latency", "RepRecovering recovery state latency");
3784 rs_perf.add_time_avg(rs_activating_latency, "activating_latency", "Activating recovery state latency");
3785 rs_perf.add_time_avg(rs_waitlocalrecoveryreserved_latency, "waitlocalrecoveryreserved_latency", "Wait local recovery reserved recovery state latency");
3786 rs_perf.add_time_avg(rs_waitremoterecoveryreserved_latency, "waitremoterecoveryreserved_latency", "Wait remote recovery reserved recovery state latency");
3787 rs_perf.add_time_avg(rs_recovering_latency, "recovering_latency", "Recovering recovery state latency");
3788 rs_perf.add_time_avg(rs_recovered_latency, "recovered_latency", "Recovered recovery state latency");
3789 rs_perf.add_time_avg(rs_clean_latency, "clean_latency", "Clean recovery state latency");
3790 rs_perf.add_time_avg(rs_active_latency, "active_latency", "Active recovery state latency");
3791 rs_perf.add_time_avg(rs_replicaactive_latency, "replicaactive_latency", "Replicaactive recovery state latency");
3792 rs_perf.add_time_avg(rs_stray_latency, "stray_latency", "Stray recovery state latency");
3793 rs_perf.add_time_avg(rs_getinfo_latency, "getinfo_latency", "Getinfo recovery state latency");
3794 rs_perf.add_time_avg(rs_getlog_latency, "getlog_latency", "Getlog recovery state latency");
3795 rs_perf.add_time_avg(rs_waitactingchange_latency, "waitactingchange_latency", "Waitactingchange recovery state latency");
3796 rs_perf.add_time_avg(rs_incomplete_latency, "incomplete_latency", "Incomplete recovery state latency");
3797 rs_perf.add_time_avg(rs_down_latency, "down_latency", "Down recovery state latency");
3798 rs_perf.add_time_avg(rs_getmissing_latency, "getmissing_latency", "Getmissing recovery state latency");
3799 rs_perf.add_time_avg(rs_waitupthru_latency, "waitupthru_latency", "Waitupthru recovery state latency");
3800 rs_perf.add_time_avg(rs_notrecovering_latency, "notrecovering_latency", "Notrecovering recovery state latency");
3801
3802 recoverystate_perf = rs_perf.create_perf_counters();
3803 cct->get_perfcounters_collection()->add(recoverystate_perf);
3804}
3805
// Cleanly shut the OSD down.  The ordering here is deliberate and fragile:
// work queues are drained before PGs are shut down, PGs are released before
// the store is unmounted, and messengers are shut down last.  Returns the
// result of the final superblock write (0 on success).
int OSD::shutdown()
{
  if (!service.prepare_to_stop())
    return 0; // already shutting down
  osd_lock.Lock();
  if (is_stopping()) {
    // lost the race with a concurrent shutdown
    osd_lock.Unlock();
    return 0;
  }
  dout(0) << "shutdown" << dendl;

  set_state(STATE_STOPPING);

  // Debugging: optionally crank all relevant debug levels so the shutdown
  // sequence itself is fully logged.
  if (cct->_conf.get_val<bool>("osd_debug_shutdown")) {
    cct->_conf.set_val("debug_osd", "100");
    cct->_conf.set_val("debug_journal", "100");
    cct->_conf.set_val("debug_filestore", "100");
    cct->_conf.set_val("debug_bluestore", "100");
    cct->_conf.set_val("debug_ms", "100");
    cct->_conf.apply_changes(nullptr);
  }

  // stop MgrClient earlier as it's more like an internal consumer of OSD
  mgrc.shutdown();

  service.start_shutdown();

  // stop sending work to pgs. this just prevents any new work in _process
  // from racing with on_shutdown and potentially entering the pg after.
  op_shardedwq.drain();

  // Shutdown PGs
  {
    vector<PGRef> pgs;
    _get_pgs(&pgs);
    for (auto pg : pgs) {
      pg->shutdown();
    }
  }

  // drain op queue again (in case PGs requeued something)
  op_shardedwq.drain();
  {
    finished.clear(); // zap waiters (bleh, this is messy)
    waiting_for_osdmap.clear();
  }

  // unregister commands
  cct->get_admin_socket()->unregister_commands(asok_hook);
  delete asok_hook;
  asok_hook = NULL;

  cct->get_admin_socket()->unregister_commands(test_ops_hook);
  delete test_ops_hook;
  test_ops_hook = NULL;

  // drop osd_lock while joining the heartbeat thread so it can make progress
  osd_lock.Unlock();

  heartbeat_lock.Lock();
  heartbeat_stop = true;
  heartbeat_cond.Signal();
  heartbeat_lock.Unlock();
  heartbeat_thread.join();

  // stop the op and command thread pools (all queued work finishes first)
  osd_op_tp.drain();
  osd_op_tp.stop();
  dout(10) << "op sharded tp stopped" << dendl;

  command_tp.drain();
  command_tp.stop();
  dout(10) << "command tp stopped" << dendl;

  dout(10) << "stopping agent" << dendl;
  service.agent_stop();

  // let any in-flight boot work complete before re-taking osd_lock
  boot_finisher.wait_for_empty();

  osd_lock.Lock();

  boot_finisher.stop();
  reset_heartbeat_peers();

  tick_timer.shutdown();

  {
    std::lock_guard l(tick_timer_lock);
    tick_timer_without_osd_lock.shutdown();
  }

  // note unmount epoch
  dout(10) << "noting clean unmount in epoch " << osdmap->get_epoch() << dendl;
  superblock.mounted = service.get_boot_epoch();
  superblock.clean_thru = osdmap->get_epoch();
  ObjectStore::Transaction t;
  write_superblock(t);
  int r = store->queue_transaction(service.meta_ch, std::move(t));
  if (r) {
    derr << "OSD::shutdown: error writing superblock: "
	 << cpp_strerror(r) << dendl;
  }


  service.shutdown_reserver();

  // Remove PGs
#ifdef PG_DEBUG_REFS
  service.dump_live_pgids();
#endif
  // Detach every PG from its shard slot and drop our references.  Loop
  // because detaching can release refs that expose more PGs to clean up.
  while (true) {
    vector<PGRef> pgs;
    _get_pgs(&pgs, true);
    if (pgs.empty()) {
      break;
    }
    for (auto& pg : pgs) {
      if (pg->is_deleted()) {
	continue;
      }
      dout(20) << " kicking pg " << pg << dendl;
      pg->lock();
      if (pg->get_num_ref() != 1) {
	// someone else still holds a ref; this would leak the PG past
	// store shutdown, so optionally abort for debugging
	derr << "pgid " << pg->get_pgid() << " has ref count of "
	     << pg->get_num_ref() << dendl;
#ifdef PG_DEBUG_REFS
	pg->dump_live_ids();
#endif
	if (cct->_conf->osd_shutdown_pgref_assert) {
	  ceph_abort();
	}
      }
      pg->ch.reset();
      pg->unlock();
    }
  }
#ifdef PG_DEBUG_REFS
  service.dump_live_pgids();
#endif

  // remove_observer may call back into us; don't hold osd_lock across it
  osd_lock.Unlock();
  cct->_conf.remove_observer(this);
  osd_lock.Lock();

  service.meta_ch.reset();

  dout(10) << "syncing store" << dendl;
  enable_disable_fuse(true);

  if (cct->_conf->osd_journal_flush_on_shutdown) {
    dout(10) << "flushing journal" << dendl;
    store->flush_journal();
  }

  monc->shutdown();
  osd_lock.Unlock();

  // drop our osdmap references (per-OSD and per-shard) before the store goes
  map_lock.get_write();
  osdmap = OSDMapRef();
  map_lock.put_write();

  for (auto s : shards) {
    std::lock_guard l(s->osdmap_lock);
    s->shard_osdmap = OSDMapRef();
  }
  service.shutdown();

  // re-take osd_lock (RAII this time) for the final store teardown
  std::lock_guard lock(osd_lock);
  store->umount();
  delete store;
  store = nullptr;
  dout(10) << "Store synced" << dendl;

  op_tracker.on_shutdown();

  class_handler->shutdown();
  client_messenger->shutdown();
  cluster_messenger->shutdown();
  hb_front_client_messenger->shutdown();
  hb_back_client_messenger->shutdown();
  objecter_messenger->shutdown();
  hb_front_server_messenger->shutdown();
  hb_back_server_messenger->shutdown();

  return r;
}
3991
3992int OSD::mon_cmd_maybe_osd_create(string &cmd)
3993{
3994 bool created = false;
3995 while (true) {
3996 dout(10) << __func__ << " cmd: " << cmd << dendl;
3997 vector<string> vcmd{cmd};
3998 bufferlist inbl;
3999 C_SaferCond w;
4000 string outs;
4001 monc->start_mon_command(vcmd, inbl, NULL, &outs, &w);
4002 int r = w.wait();
4003 if (r < 0) {
4004 if (r == -ENOENT && !created) {
4005 string newcmd = "{\"prefix\": \"osd create\", \"id\": " + stringify(whoami)
4006 + ", \"uuid\": \"" + stringify(superblock.osd_fsid) + "\"}";
4007 vector<string> vnewcmd{newcmd};
4008 bufferlist inbl;
4009 C_SaferCond w;
4010 string outs;
4011 monc->start_mon_command(vnewcmd, inbl, NULL, &outs, &w);
4012 int r = w.wait();
4013 if (r < 0) {
4014 derr << __func__ << " fail: osd does not exist and created failed: "
4015 << cpp_strerror(r) << dendl;
4016 return r;
4017 }
4018 created = true;
4019 continue;
4020 }
4021 derr << __func__ << " fail: '" << outs << "': " << cpp_strerror(r) << dendl;
4022 return r;
4023 }
4024 break;
4025 }
4026
4027 return 0;
4028}
4029
4030int OSD::update_crush_location()
4031{
4032 if (!cct->_conf->osd_crush_update_on_start) {
4033 dout(10) << __func__ << " osd_crush_update_on_start = false" << dendl;
4034 return 0;
4035 }
4036
4037 char weight[32];
4038 if (cct->_conf->osd_crush_initial_weight >= 0) {
4039 snprintf(weight, sizeof(weight), "%.4lf", cct->_conf->osd_crush_initial_weight);
4040 } else {
4041 struct store_statfs_t st;
11fdf7f2
TL
4042 osd_alert_list_t alerts;
4043 int r = store->statfs(&st, &alerts);
7c673cae
FG
4044 if (r < 0) {
4045 derr << "statfs: " << cpp_strerror(r) << dendl;
4046 return r;
4047 }
4048 snprintf(weight, sizeof(weight), "%.4lf",
11fdf7f2
TL
4049 std::max(.00001,
4050 double(st.total) /
4051 double(1ull << 40 /* TB */)));
7c673cae
FG
4052 }
4053
4054 std::multimap<string,string> loc = cct->crush_location.get_location();
4055 dout(10) << __func__ << " crush location is " << loc << dendl;
4056
4057 string cmd =
4058 string("{\"prefix\": \"osd crush create-or-move\", ") +
4059 string("\"id\": ") + stringify(whoami) + string(", ") +
4060 string("\"weight\":") + weight + string(", ") +
4061 string("\"args\": [");
4062 for (multimap<string,string>::iterator p = loc.begin(); p != loc.end(); ++p) {
4063 if (p != loc.begin())
4064 cmd += ", ";
4065 cmd += "\"" + p->first + "=" + p->second + "\"";
4066 }
4067 cmd += "]}";
4068
4069 return mon_cmd_maybe_osd_create(cmd);
4070}
4071
4072int OSD::update_crush_device_class()
4073{
224ce89b
WB
4074 if (!cct->_conf->osd_class_update_on_start) {
4075 dout(10) << __func__ << " osd_class_update_on_start = false" << dendl;
4076 return 0;
4077 }
4078
7c673cae
FG
4079 string device_class;
4080 int r = store->read_meta("crush_device_class", &device_class);
224ce89b
WB
4081 if (r < 0 || device_class.empty()) {
4082 device_class = store->get_default_device_class();
4083 }
4084
4085 if (device_class.empty()) {
d2e6a577 4086 dout(20) << __func__ << " no device class stored locally" << dendl;
7c673cae 4087 return 0;
224ce89b 4088 }
7c673cae
FG
4089
4090 string cmd =
4091 string("{\"prefix\": \"osd crush set-device-class\", ") +
224ce89b
WB
4092 string("\"class\": \"") + device_class + string("\", ") +
4093 string("\"ids\": [\"") + stringify(whoami) + string("\"]}");
7c673cae 4094
224ce89b 4095 r = mon_cmd_maybe_osd_create(cmd);
11fdf7f2
TL
4096 if (r == -EBUSY) {
4097 // good, already bound to a device-class
4098 return 0;
4099 } else {
4100 return r;
4101 }
7c673cae
FG
4102}
4103
4104void OSD::write_superblock(ObjectStore::Transaction& t)
4105{
4106 dout(10) << "write_superblock " << superblock << dendl;
4107
4108 //hack: at minimum it's using the baseline feature set
4109 if (!superblock.compat_features.incompat.contains(CEPH_OSD_FEATURE_INCOMPAT_BASE))
4110 superblock.compat_features.incompat.insert(CEPH_OSD_FEATURE_INCOMPAT_BASE);
4111
4112 bufferlist bl;
11fdf7f2 4113 encode(superblock, bl);
7c673cae
FG
4114 t.write(coll_t::meta(), OSD_SUPERBLOCK_GOBJECT, 0, bl.length(), bl);
4115}
4116
4117int OSD::read_superblock()
4118{
4119 bufferlist bl;
11fdf7f2 4120 int r = store->read(service.meta_ch, OSD_SUPERBLOCK_GOBJECT, 0, 0, bl);
7c673cae
FG
4121 if (r < 0)
4122 return r;
4123
11fdf7f2
TL
4124 auto p = bl.cbegin();
4125 decode(superblock, p);
7c673cae
FG
4126
4127 dout(10) << "read_superblock " << superblock << dendl;
4128
4129 return 0;
4130}
4131
4132void OSD::clear_temp_objects()
4133{
4134 dout(10) << __func__ << dendl;
4135 vector<coll_t> ls;
4136 store->list_collections(ls);
4137 for (vector<coll_t>::iterator p = ls.begin(); p != ls.end(); ++p) {
4138 spg_t pgid;
4139 if (!p->is_pg(&pgid))
4140 continue;
4141
4142 // list temp objects
4143 dout(20) << " clearing temps in " << *p << " pgid " << pgid << dendl;
4144
4145 vector<ghobject_t> temps;
4146 ghobject_t next;
4147 while (1) {
4148 vector<ghobject_t> objects;
11fdf7f2
TL
4149 auto ch = store->open_collection(*p);
4150 ceph_assert(ch);
4151 store->collection_list(ch, next, ghobject_t::get_max(),
7c673cae
FG
4152 store->get_ideal_list_max(),
4153 &objects, &next);
4154 if (objects.empty())
4155 break;
4156 vector<ghobject_t>::iterator q;
4157 for (q = objects.begin(); q != objects.end(); ++q) {
4158 // Hammer set pool for temps to -1, so check for clean-up
4159 if (q->hobj.is_temp() || (q->hobj.pool == -1)) {
4160 temps.push_back(*q);
4161 } else {
4162 break;
4163 }
4164 }
4165 // If we saw a non-temp object and hit the break above we can
4166 // break out of the while loop too.
4167 if (q != objects.end())
4168 break;
4169 }
4170 if (!temps.empty()) {
4171 ObjectStore::Transaction t;
4172 int removed = 0;
4173 for (vector<ghobject_t>::iterator q = temps.begin(); q != temps.end(); ++q) {
4174 dout(20) << " removing " << *p << " object " << *q << dendl;
4175 t.remove(*p, *q);
4176 if (++removed > cct->_conf->osd_target_transaction_size) {
11fdf7f2 4177 store->queue_transaction(service.meta_ch, std::move(t));
7c673cae
FG
4178 t = ObjectStore::Transaction();
4179 removed = 0;
4180 }
4181 }
4182 if (removed) {
11fdf7f2 4183 store->queue_transaction(service.meta_ch, std::move(t));
7c673cae
FG
4184 }
4185 }
4186 }
4187}
4188
// Delete every object in collection 'tmp' — including each object's snap
// mapping entry — in bounded-size transactions, then remove the collection
// itself.  Blocks until the final transaction commits.
void OSD::recursive_remove_collection(CephContext* cct,
				      ObjectStore *store, spg_t pgid,
				      coll_t tmp)
{
  OSDriver driver(
    store,
    coll_t(),
    make_snapmapper_oid());

  ObjectStore::CollectionHandle ch = store->open_collection(tmp);
  ObjectStore::Transaction t;
  SnapMapper mapper(cct, &driver, 0, 0, 0, pgid.shard);

  ghobject_t next;
  // batch size: objects removed per queued transaction
  int max = cct->_conf->osd_target_transaction_size;
  vector<ghobject_t> objects;
  objects.reserve(max);
  while (true) {
    objects.clear();
    // 'next' carries the listing cursor across batches
    store->collection_list(ch, next, ghobject_t::get_max(),
			   max, &objects, &next);
    generic_dout(10) << __func__ << " " << objects << dendl;
    if (objects.empty())
      break;
    for (auto& p: objects) {
      // drop the object's snap mapping first (-ENOENT is fine: the object
      // may simply have no snap entries), then the object itself
      OSDriver::OSTransaction _t(driver.get_transaction(&t));
      int r = mapper.remove_oid(p.hobj, &_t);
      if (r != 0 && r != -ENOENT)
	ceph_abort();
      t.remove(tmp, p);
    }
    int r = store->queue_transaction(ch, std::move(t));
    ceph_assert(r == 0);
    t = ObjectStore::Transaction();
  }
  // all objects gone; now remove the (empty) collection
  t.remove_collection(tmp);
  int r = store->queue_transaction(ch, std::move(t));
  ceph_assert(r == 0);

  C_SaferCond waiter;
  if (!ch->flush_commit(&waiter)) {
    // not yet durable; wait for the commit callback
    waiter.wait();
  }
}
4233
4234
4235// ======================================================
4236// PG's
4237
7c673cae
FG
4238PG* OSD::_make_pg(
4239 OSDMapRef createmap,
4240 spg_t pgid)
4241{
11fdf7f2
TL
4242 dout(10) << __func__ << " " << pgid << dendl;
4243 pg_pool_t pi;
4244 map<string,string> ec_profile;
4245 string name;
4246 if (createmap->have_pg_pool(pgid.pool())) {
4247 pi = *createmap->get_pg_pool(pgid.pool());
4248 name = createmap->get_pool_name(pgid.pool());
4249 if (pi.is_erasure()) {
4250 ec_profile = createmap->get_erasure_code_profile(pi.erasure_code_profile);
4251 }
4252 } else {
4253 // pool was deleted; grab final pg_pool_t off disk.
4254 ghobject_t oid = make_final_pool_info_oid(pgid.pool());
4255 bufferlist bl;
4256 int r = store->read(service.meta_ch, oid, 0, 0, bl);
4257 if (r < 0) {
4258 derr << __func__ << " missing pool " << pgid.pool() << " tombstone"
4259 << dendl;
4260 return nullptr;
4261 }
4262 ceph_assert(r >= 0);
4263 auto p = bl.cbegin();
4264 decode(pi, p);
4265 decode(name, p);
4266 if (p.end()) { // dev release v13.0.2 did not include ec_profile
4267 derr << __func__ << " missing ec_profile from pool " << pgid.pool()
4268 << " tombstone" << dendl;
4269 return nullptr;
4270 }
4271 decode(ec_profile, p);
4272 }
4273 PGPool pool(cct, createmap, pgid.pool(), pi, name);
7c673cae 4274 PG *pg;
11fdf7f2
TL
4275 if (pi.type == pg_pool_t::TYPE_REPLICATED ||
4276 pi.type == pg_pool_t::TYPE_ERASURE)
4277 pg = new PrimaryLogPG(&service, createmap, pool, ec_profile, pgid);
7c673cae
FG
4278 else
4279 ceph_abort();
7c673cae
FG
4280 return pg;
4281}
4282
11fdf7f2 4283void OSD::_get_pgs(vector<PGRef> *v, bool clear_too)
7c673cae 4284{
11fdf7f2
TL
4285 v->clear();
4286 v->reserve(get_num_pgs());
4287 for (auto& s : shards) {
4288 std::lock_guard l(s->shard_lock);
4289 for (auto& j : s->pg_slots) {
4290 if (j.second->pg &&
4291 !j.second->pg->is_deleted()) {
4292 v->push_back(j.second->pg);
4293 if (clear_too) {
4294 s->_detach_pg(j.second.get());
4295 }
4296 }
7c673cae 4297 }
7c673cae 4298 }
7c673cae
FG
4299}
4300
11fdf7f2 4301void OSD::_get_pgids(vector<spg_t> *v)
7c673cae 4302{
11fdf7f2
TL
4303 v->clear();
4304 v->reserve(get_num_pgs());
4305 for (auto& s : shards) {
4306 std::lock_guard l(s->shard_lock);
4307 for (auto& j : s->pg_slots) {
4308 if (j.second->pg &&
4309 !j.second->pg->is_deleted()) {
4310 v->push_back(j.first);
4311 }
7c673cae
FG
4312 }
4313 }
7c673cae
FG
4314}
4315
11fdf7f2 4316void OSD::register_pg(PGRef pg)
7c673cae 4317{
11fdf7f2
TL
4318 spg_t pgid = pg->get_pgid();
4319 uint32_t shard_index = pgid.hash_to_shard(num_shards);
4320 auto sdata = shards[shard_index];
4321 std::lock_guard l(sdata->shard_lock);
4322 auto r = sdata->pg_slots.emplace(pgid, make_unique<OSDShardPGSlot>());
4323 ceph_assert(r.second);
4324 auto *slot = r.first->second.get();
4325 dout(20) << __func__ << " " << pgid << " " << pg << dendl;
4326 sdata->_attach_pg(slot, pg.get());
4327}
7c673cae 4328
// Called when a PG has finished deleting itself: detach it from its shard
// slot, unprime any primed split children on every shard, and decrement the
// primary/replica/stray gauge.  Returns false (caller retries later) if the
// slot is already gone/empty or the slot is waiting on a merge epoch.
bool OSD::try_finish_pg_delete(PG *pg, unsigned old_pg_num)
{
  auto sdata = pg->osd_shard;
  ceph_assert(sdata);
  {
    std::lock_guard l(sdata->shard_lock);
    auto p = sdata->pg_slots.find(pg->pg_id);
    if (p == sdata->pg_slots.end() ||
	!p->second->pg) {
      // already detached by someone else
      dout(20) << __func__ << " " << pg->pg_id << " not found" << dendl;
      return false;
    }
    if (p->second->waiting_for_merge_epoch) {
      // a merge is pending against this slot; cannot finalize deletion yet
      dout(20) << __func__ << " " << pg->pg_id << " waiting for merge" << dendl;
      return false;
    }
    dout(20) << __func__ << " " << pg->pg_id << " " << pg << dendl;
    sdata->_detach_pg(p->second.get());
  }

  // children of this pg may have been primed on any shard
  for (auto shard : shards) {
    shard->unprime_split_children(pg->pg_id, old_pg_num);
  }

  // update pg count now since we might not get an osdmap any time soon.
  if (pg->is_primary())
    service.logger->dec(l_osd_pg_primary);
  else if (pg->is_replica())
    service.logger->dec(l_osd_pg_replica);
  else
    service.logger->dec(l_osd_pg_stray);

  return true;
}
4363
11fdf7f2 4364PGRef OSD::_lookup_pg(spg_t pgid)
7c673cae 4365{
11fdf7f2
TL
4366 uint32_t shard_index = pgid.hash_to_shard(num_shards);
4367 auto sdata = shards[shard_index];
4368 std::lock_guard l(sdata->shard_lock);
4369 auto p = sdata->pg_slots.find(pgid);
4370 if (p == sdata->pg_slots.end()) {
7c673cae 4371 return nullptr;
11fdf7f2
TL
4372 }
4373 return p->second->pg;
7c673cae
FG
4374}
4375
11fdf7f2 4376PGRef OSD::_lookup_lock_pg(spg_t pgid)
31f18b77 4377{
11fdf7f2
TL
4378 PGRef pg = _lookup_pg(pgid);
4379 if (!pg) {
4380 return nullptr;
4381 }
4382 pg->lock();
4383 if (!pg->is_deleted()) {
4384 return pg;
4385 }
4386 pg->unlock();
4387 return nullptr;
31f18b77
FG
4388}
4389
// Public wrapper around _lookup_lock_pg(): return the pg locked, or null.
PGRef OSD::lookup_lock_pg(spg_t pgid)
{
  return _lookup_lock_pg(pgid);
}
4394
// Startup: scan every collection in the store, clean up leftovers, and
// instantiate + register a PG object for each surviving pg collection.
// Must be called with osd_lock held.
void OSD::load_pgs()
{
  ceph_assert(osd_lock.is_locked());
  dout(0) << "load_pgs" << dendl;

  {
    // load the pg_num change history (needed to reason about splits/merges)
    auto pghist = make_pg_num_history_oid();
    bufferlist bl;
    int r = store->read(service.meta_ch, pghist, 0, 0, bl, 0);
    if (r >= 0 && bl.length() > 0) {
      auto p = bl.cbegin();
      decode(pg_num_history, p);
    }
    dout(20) << __func__ << " pg_num_history " << pg_num_history << dendl;
  }

  vector<coll_t> ls;
  int r = store->list_collections(ls);
  if (r < 0) {
    derr << "failed to list pgs: " << cpp_strerror(-r) << dendl;
  }

  int num = 0;
  for (vector<coll_t>::iterator it = ls.begin();
       it != ls.end();
       ++it) {
    spg_t pgid;
    // leftover temp collections and pgs flagged for removal are deleted now
    if (it->is_temp(&pgid) ||
	(it->is_pg(&pgid) && PG::_has_removal_flag(store, pgid))) {
      dout(10) << "load_pgs " << *it
	       << " removing, legacy or flagged for removal pg" << dendl;
      recursive_remove_collection(cct, store, pgid, *it);
      continue;
    }

    if (!it->is_pg(&pgid)) {
      // e.g. the meta collection
      dout(10) << "load_pgs ignoring unrecognized " << *it << dendl;
      continue;
    }

    dout(10) << "pgid " << pgid << " coll " << coll_t(pgid) << dendl;
    epoch_t map_epoch = 0;
    int r = PG::peek_map_epoch(store, pgid, &map_epoch);
    if (r < 0) {
      derr << __func__ << " unable to peek at " << pgid << " metadata, skipping"
	   << dendl;
      continue;
    }

    // build the PG against the map it was last written with, when available
    PGRef pg;
    if (map_epoch > 0) {
      OSDMapRef pgosdmap = service.try_get_map(map_epoch);
      if (!pgosdmap) {
	if (!osdmap->have_pg_pool(pgid.pool())) {
	  derr << __func__ << ": could not find map for epoch " << map_epoch
	       << " on pg " << pgid << ", but the pool is not present in the "
	       << "current map, so this is probably a result of bug 10617. "
	       << "Skipping the pg for now, you can use ceph-objectstore-tool "
	       << "to clean it up later." << dendl;
	  continue;
	} else {
	  derr << __func__ << ": have pgid " << pgid << " at epoch "
	       << map_epoch << ", but missing map. Crashing."
	       << dendl;
	  ceph_abort_msg("Missing map in load_pgs");
	}
      }
      pg = _make_pg(pgosdmap, pgid);
    } else {
      pg = _make_pg(osdmap, pgid);
    }
    if (!pg) {
      // unreadable/incomplete pool tombstone; nothing usable remains here
      recursive_remove_collection(cct, store, pgid, *it);
      continue;
    }

    // there can be no waiters here, so we don't call _wake_pg_slot

    pg->lock();
    pg->ch = store->open_collection(pg->coll);

    // read pg state, log
    pg->read_state(store);

    if (pg->dne()) {
      // pg recorded itself as "does not exist"; remove its collection
      dout(10) << "load_pgs " << *it << " deleting dne" << dendl;
      pg->ch = nullptr;
      pg->unlock();
      recursive_remove_collection(cct, store, pgid, *it);
      continue;
    }
    {
      // route on-commit completions to the owning shard's context queue
      uint32_t shard_index = pgid.hash_to_shard(shards.size());
      assert(NULL != shards[shard_index]);
      store->set_collection_commit_queue(pg->coll, &(shards[shard_index]->context_queue));
    }

    pg->reg_next_scrub();

    dout(10) << __func__ << " loaded " << *pg << dendl;
    pg->unlock();

    register_pg(pg);
    ++num;
  }
  dout(0) << __func__ << " opened " << num << " pgs" << dendl;
}
4502
4503
// Instantiate a brand-new PG described by 'info' (from the mon or a peer).
// Returns the initialized pg, or nullptr when the create is dropped (pg
// limit reached, pool gone, or a stale mon create).
PGRef OSD::handle_pg_create_info(const OSDMapRef& osdmap,
				 const PGCreateInfo *info)
{
  spg_t pgid = info->pgid;

  // respect the per-osd pg hard cap; the create is deferred, not lost
  if (maybe_wait_for_max_pg(osdmap, pgid, info->by_mon)) {
    dout(10) << __func__ << " hit max pg, dropping" << dendl;
    return nullptr;
  }

  PG::RecoveryCtx rctx = create_context();

  // use the map as of the creation epoch, not the current one
  OSDMapRef startmap = get_map(info->epoch);

  if (info->by_mon) {
    int64_t pool_id = pgid.pgid.pool();
    const pg_pool_t *pool = osdmap->get_pg_pool(pool_id);
    if (!pool) {
      dout(10) << __func__ << " ignoring " << pgid << ", pool dne" << dendl;
      return nullptr;
    }
    if (osdmap->require_osd_release >= CEPH_RELEASE_NAUTILUS &&
	!pool->has_flag(pg_pool_t::FLAG_CREATING)) {
      // this ensures we do not process old creating messages after the
      // pool's initial pgs have been created (and pg are subsequently
      // allowed to split or merge).
      dout(20) << __func__ << " dropping " << pgid
	       << "create, pool does not have CREATING flag set" << dendl;
      return nullptr;
    }
  }

  int up_primary, acting_primary;
  vector<int> up, acting;
  startmap->pg_to_up_acting_osds(
    pgid.pgid, &up, &up_primary, &acting, &acting_primary);

  const pg_pool_t* pp = startmap->get_pg_pool(pgid.pool());
  if (pp->has_flag(pg_pool_t::FLAG_EC_OVERWRITES) &&
      store->get_type() != "bluestore") {
    clog->warn() << "pg " << pgid
		 << " is at risk of silent data corruption: "
		 << "the pool allows ec overwrites but is not stored in "
		 << "bluestore, so deep scrubbing will not detect bitrot";
  }
  // create the on-disk collection and pg metadata in the recovery txn
  PG::_create(*rctx.transaction, pgid, pgid.get_split_bits(pp->get_pg_num()));
  PG::_init(*rctx.transaction, pgid, pp);

  int role = startmap->calc_pg_role(whoami, acting, acting.size());
  if (!pp->is_replicated() && role != pgid.shard) {
    // for ec pools the role must match our shard id
    role = -1;
  }

  PGRef pg = _make_pg(startmap, pgid);
  pg->ch = store->create_new_collection(pg->coll);

  {
    // route on-commit completions to the owning shard's context queue
    uint32_t shard_index = pgid.hash_to_shard(shards.size());
    assert(NULL != shards[shard_index]);
    store->set_collection_commit_queue(pg->coll, &(shards[shard_index]->context_queue));
  }

  pg->lock(true);

  // we are holding the shard lock
  ceph_assert(!pg->is_deleted());

  pg->init(
    role,
    up,
    up_primary,
    acting,
    acting_primary,
    info->history,
    info->past_intervals,
    false,
    rctx.transaction);

  if (pg->is_primary()) {
    // new primaries pick up any active dynamic perf-stats queries
    Mutex::Locker locker(m_perf_queries_lock);
    pg->set_dynamic_perf_stats_queries(m_perf_queries);
  }

  // drive the state machine through initialize + activate against this map
  pg->handle_initialize(&rctx);
  pg->handle_activate_map(&rctx);

  dispatch_context(rctx, pg.get(), osdmap, nullptr);

  dout(10) << __func__ << " new pg " << *pg << dendl;
  return pg;
}
4595
// Returns true — and records the create as pending — when instantiating
// this pg would exceed the per-osd hard pg cap; resume_creating_pg() will
// re-trigger the creation later once pgs have been removed.
bool OSD::maybe_wait_for_max_pg(const OSDMapRef& osdmap,
				spg_t pgid,
				bool is_mon_create)
{
  // hard cap = mon_max_pg_per_osd * osd_max_pg_per_osd_hard_ratio
  // (kept as a double so the ratio is applied without truncation)
  const auto max_pgs_per_osd =
    (cct->_conf.get_val<uint64_t>("mon_max_pg_per_osd") *
     cct->_conf.get_val<double>("osd_max_pg_per_osd_hard_ratio"));

  if (num_pgs < max_pgs_per_osd) {
    // still below the cap; no need to defer
    return false;
  }

  std::lock_guard l(pending_creates_lock);
  if (is_mon_create) {
    // mon-initiated creates are only counted; the mon is re-solicited later
    pending_creates_from_mon++;
  } else {
    // peer-initiated creates remember the pgid and whether we'd be primary
    bool is_primary = osdmap->get_pg_acting_rank(pgid.pgid, whoami) == 0;
    pending_creates_from_osd.emplace(pgid.pgid, is_primary);
  }
  dout(1) << __func__ << " withhold creation of pg " << pgid
	  << ": " << num_pgs << " >= "<< max_pgs_per_osd << dendl;
  return true;
}
4619
// to re-trigger a peering, we have to twiddle the pg mapping a little bit,
// see PG::should_restart_peering(). OSDMap::pg_to_up_acting_osds() will turn
// to up set if pg_temp is empty. so an empty pg_temp won't work.
static std::vector<int32_t> twiddle(const std::vector<int>& acting) {
  std::vector<int32_t> twiddled;
  if (acting.size() > 1) {
    // shrink the set down to just its first osd
    twiddled.push_back(acting.front());
  } else {
    // pad with a "none" (-1) entry so the temp mapping differs from up
    twiddled.assign(acting.begin(), acting.end());
    twiddled.push_back(-1);
  }
  return twiddled;
}
4632
// Retry PG creations that maybe_wait_for_max_pg() previously withheld, now
// that we may have spare capacity below the hard PG limit.  Re-solicits
// pg-create messages from the mon and re-triggers peering (via pg_temp
// twiddling) for peer-initiated creates, then fixes up our osdmap/pg_creates
// subscriptions accordingly.
void OSD::resume_creating_pg()
{
  bool do_sub_pg_creates = false;
  bool have_pending_creates = false;
  {
    const auto max_pgs_per_osd =
      (cct->_conf.get_val<uint64_t>("mon_max_pg_per_osd") *
       cct->_conf.get_val<double>("osd_max_pg_per_osd_hard_ratio"));
    if (max_pgs_per_osd <= num_pgs) {
      // this could happen if admin decreases this setting before a PG is removed
      return;
    }
    // how many more PGs we can accept before hitting the cap again
    unsigned spare_pgs = max_pgs_per_osd - num_pgs;
    std::lock_guard l(pending_creates_lock);
    if (pending_creates_from_mon > 0) {
      dout(20) << __func__ << " pending_creates_from_mon "
               << pending_creates_from_mon << dendl;
      do_sub_pg_creates = true;
      // consume spare capacity with mon-side creates first
      if (pending_creates_from_mon >= spare_pgs) {
        spare_pgs = pending_creates_from_mon = 0;
      } else {
        spare_pgs -= pending_creates_from_mon;
        pending_creates_from_mon = 0;
      }
    }
    // spend remaining capacity on peer-initiated creates: nudge pg_temp so
    // the PG re-peers (see twiddle() above)
    auto pg = pending_creates_from_osd.cbegin();
    while (spare_pgs > 0 && pg != pending_creates_from_osd.cend()) {
      dout(20) << __func__ << " pg " << pg->first << dendl;
      vector<int> acting;
      osdmap->pg_to_up_acting_osds(pg->first, nullptr, nullptr, &acting, nullptr);
      service.queue_want_pg_temp(pg->first, twiddle(acting), true);
      pg = pending_creates_from_osd.erase(pg);
      do_sub_pg_creates = true;
      spare_pgs--;
    }
    have_pending_creates = (pending_creates_from_mon > 0 ||
                            !pending_creates_from_osd.empty());
  }

  bool do_renew_subs = false;
  if (do_sub_pg_creates) {
    if (monc->sub_want("osd_pg_creates", last_pg_create_epoch, 0)) {
      dout(4) << __func__ << ": resolicit pg creates from mon since "
              << last_pg_create_epoch << dendl;
      do_renew_subs = true;
    }
  }
  version_t start = osdmap->get_epoch() + 1;
  if (have_pending_creates) {
    // don't miss any new osdmap deleting PGs
    if (monc->sub_want("osdmap", start, 0)) {
      dout(4) << __func__ << ": resolicit osdmap from mon since "
              << start << dendl;
      do_renew_subs = true;
    }
  } else if (do_sub_pg_creates) {
    // no need to subscribe the osdmap continuously anymore
    // once the pgtemp and/or mon_subscribe(pg_creates) is sent
    if (monc->sub_want_increment("osdmap", start, CEPH_SUBSCRIBE_ONETIME)) {
      dout(4) << __func__ << ": re-subscribe osdmap(onetime) since "
              << start << dendl;
      do_renew_subs = true;
    }
  }

  if (do_renew_subs) {
    monc->renew_subs();
  }

  // flush any pg_temp requests queued above
  service.send_pg_temp();
}
7c673cae
FG
4704
// Build the initial pg_history_t and PastIntervals for a PG created in epoch
// `created`, by replaying every osdmap from creation up to the current epoch
// and recording each interval change (up/acting set or primary changes,
// splits) along the way.
//
// pgid          - the PG in question
// created       - epoch the PG (and pool) was created in
// created_stamp - creation time, used to seed the scrub stamps
// h             - [out] history to populate
// pi            - [out] past intervals to populate
void OSD::build_initial_pg_history(
  spg_t pgid,
  epoch_t created,
  utime_t created_stamp,
  pg_history_t *h,
  PastIntervals *pi)
{
  dout(10) << __func__ << " " << pgid << " created " << created << dendl;
  h->epoch_created = created;
  h->epoch_pool_created = created;
  h->same_interval_since = created;
  h->same_up_since = created;
  h->same_primary_since = created;
  h->last_scrub_stamp = created_stamp;
  h->last_deep_scrub_stamp = created_stamp;
  h->last_clean_scrub_stamp = created_stamp;

  OSDMapRef lastmap = service.get_map(created);
  int up_primary, acting_primary;
  vector<int> up, acting;
  lastmap->pg_to_up_acting_osds(
    pgid.pgid, &up, &up_primary, &acting, &acting_primary);

  ostringstream debug;
  // walk each successive map, comparing mappings between lastmap and osdmap
  // (note: the loop-local `osdmap` shadows the OSD member of the same name)
  for (epoch_t e = created + 1; e <= osdmap->get_epoch(); ++e) {
    OSDMapRef osdmap = service.get_map(e);
    int new_up_primary, new_acting_primary;
    vector<int> new_up, new_acting;
    osdmap->pg_to_up_acting_osds(
      pgid.pgid, &new_up, &new_up_primary, &new_acting, &new_acting_primary);

    // this is a bit imprecise, but sufficient?
    struct min_size_predicate_t : public IsPGRecoverablePredicate {
      const pg_pool_t *pi;
      bool operator()(const set<pg_shard_t> &have) const {
        return have.size() >= pi->min_size;
      }
      explicit min_size_predicate_t(const pg_pool_t *i) : pi(i) {}
    } min_size_predicate(osdmap->get_pg_pool(pgid.pgid.pool()));

    bool new_interval = PastIntervals::check_new_interval(
      acting_primary,
      new_acting_primary,
      acting, new_acting,
      up_primary,
      new_up_primary,
      up, new_up,
      h->same_interval_since,
      h->last_epoch_clean,
      osdmap,
      lastmap,
      pgid.pgid,
      &min_size_predicate,
      pi,
      &debug);
    if (new_interval) {
      // a new interval started at epoch e; update the history markers
      h->same_interval_since = e;
      if (up != new_up) {
        h->same_up_since = e;
      }
      if (acting_primary != new_acting_primary) {
        h->same_primary_since = e;
      }
      if (pgid.pgid.is_split(lastmap->get_pg_num(pgid.pgid.pool()),
                             osdmap->get_pg_num(pgid.pgid.pool()),
                             nullptr)) {
        h->last_epoch_split = e;
      }
      // carry the new mapping forward as the comparison baseline
      up = new_up;
      acting = new_acting;
      up_primary = new_up_primary;
      acting_primary = new_acting_primary;
    }
    lastmap = osdmap;
  }
  dout(20) << __func__ << " " << debug.str() << dendl;
  dout(10) << __func__ << " " << *h << " " << *pi
           << " [" << (pi->empty() ? pair<epoch_t,epoch_t>(0,0) :
                       pi->get_bounds()) << ")"
           << dendl;
}
4786
7c673cae
FG
// Ensure osd.p is in heartbeat_peers, opening back (and optionally front)
// heartbeat connections if it is new, and stamp the entry with the current
// osdmap epoch.  No-op for ourselves or when no back connection can be
// obtained.
void OSD::_add_heartbeat_peer(int p)
{
  if (p == whoami)
    return;
  HeartbeatInfo *hi;

  map<int,HeartbeatInfo>::iterator i = heartbeat_peers.find(p);
  if (i == heartbeat_peers.end()) {
    // new peer: get (back, front) heartbeat connections for it
    pair<ConnectionRef,ConnectionRef> cons = service.get_con_osd_hb(p, osdmap->get_epoch());
    if (!cons.first)
      return;
    hi = &heartbeat_peers[p];
    hi->peer = p;
    // one shared session object is attached to both connections so
    // heartbeat_reset() can map a failed con back to this peer
    RefCountedPtr s{new HeartbeatSession{p}, false};
    hi->con_back = cons.first.get();
    hi->con_back->set_priv(s);
    if (cons.second) {
      hi->con_front = cons.second.get();
      hi->con_front->set_priv(s);
      dout(10) << "_add_heartbeat_peer: new peer osd." << p
               << " " << hi->con_back->get_peer_addr()
               << " " << hi->con_front->get_peer_addr()
               << dendl;
    } else {
      // no separate front network; leave con_front null
      hi->con_front.reset(NULL);
      dout(10) << "_add_heartbeat_peer: new peer osd." << p
               << " " << hi->con_back->get_peer_addr()
               << dendl;
    }
  } else {
    hi = &i->second;
  }
  // refresh the epoch even for already-known peers so the prune pass in
  // maybe_update_heartbeat_peers() keeps them
  hi->epoch = osdmap->get_epoch();
}
4821
4822void OSD::_remove_heartbeat_peer(int n)
4823{
4824 map<int,HeartbeatInfo>::iterator q = heartbeat_peers.find(n);
11fdf7f2 4825 ceph_assert(q != heartbeat_peers.end());
7c673cae
FG
4826 dout(20) << " removing heartbeat peer osd." << n
4827 << " " << q->second.con_back->get_peer_addr()
4828 << " " << (q->second.con_front ? q->second.con_front->get_peer_addr() : entity_addr_t())
4829 << dendl;
4830 q->second.con_back->mark_down();
4831 if (q->second.con_front) {
4832 q->second.con_front->mark_down();
4833 }
4834 heartbeat_peers.erase(q);
4835}
4836
4837void OSD::need_heartbeat_peer_update()
4838{
4839 if (is_stopping())
4840 return;
4841 dout(20) << "need_heartbeat_peer_update" << dendl;
4842 heartbeat_set_peers_need_update();
4843}
4844
// Recompute the set of OSDs we exchange heartbeats with: PG peers, our
// ring neighbors, and enough subtree-diverse reporters, then prune stale or
// excess entries.  Runs only when the need-update flag is set (or is forced
// after osd_heartbeat_grace without an update).  Caller holds osd_lock.
void OSD::maybe_update_heartbeat_peers()
{
  ceph_assert(osd_lock.is_locked());

  if (is_waiting_for_healthy() || is_active()) {
    utime_t now = ceph_clock_now();
    if (last_heartbeat_resample == utime_t()) {
      // first pass: force an update
      last_heartbeat_resample = now;
      heartbeat_set_peers_need_update();
    } else if (!heartbeat_peers_need_update()) {
      utime_t dur = now - last_heartbeat_resample;
      if (dur > cct->_conf->osd_heartbeat_grace) {
        dout(10) << "maybe_update_heartbeat_peers forcing update after " << dur << " seconds" << dendl;
        heartbeat_set_peers_need_update();
        last_heartbeat_resample = now;
        if (is_waiting_for_healthy()) {
          reset_heartbeat_peers();   // we want *new* peers!
        }
      }
    }
  }

  if (!heartbeat_peers_need_update())
    return;
  heartbeat_clear_peers_need_update();

  std::lock_guard l(heartbeat_lock);

  dout(10) << "maybe_update_heartbeat_peers updating" << dendl;


  // build heartbeat from set
  if (is_active()) {
    vector<PGRef> pgs;
    _get_pgs(&pgs);
    for (auto& pg : pgs) {
      pg->with_heartbeat_peers([&](int peer) {
        if (osdmap->is_up(peer)) {
          _add_heartbeat_peer(peer);
        }
      });
    }
  }

  // include next and previous up osds to ensure we have a fully-connected set
  set<int> want, extras;
  const int next = osdmap->get_next_up_osd_after(whoami);
  if (next >= 0)
    want.insert(next);
  int prev = osdmap->get_previous_up_osd_before(whoami);
  if (prev >= 0 && prev != next)
    want.insert(prev);

  // make sure we have at least **min_down** osds coming from different
  // subtree level (e.g., hosts) for fast failure detection.
  auto min_down = cct->_conf.get_val<uint64_t>("mon_osd_min_down_reporters");
  auto subtree = cct->_conf.get_val<string>("mon_osd_reporter_subtree_level");
  osdmap->get_random_up_osds_by_subtree(
    whoami, subtree, min_down, want, &want);

  for (set<int>::iterator p = want.begin(); p != want.end(); ++p) {
    dout(10) << " adding neighbor peer osd." << *p << dendl;
    extras.insert(*p);
    _add_heartbeat_peer(*p);
  }

  // remove down peers; enumerate extras
  map<int,HeartbeatInfo>::iterator p = heartbeat_peers.begin();
  while (p != heartbeat_peers.end()) {
    if (!osdmap->is_up(p->first)) {
      int o = p->first;
      ++p;
      _remove_heartbeat_peer(o);
      continue;
    }
    if (p->second.epoch < osdmap->get_epoch()) {
      // not refreshed by _add_heartbeat_peer() this round: removal candidate
      extras.insert(p->first);
    }
    ++p;
  }

  // too few?  walk the up-osd ring starting at `next` until we reach
  // osd_heartbeat_min_peers or come back around
  for (int n = next; n >= 0; ) {
    if ((int)heartbeat_peers.size() >= cct->_conf->osd_heartbeat_min_peers)
      break;
    if (!extras.count(n) && !want.count(n) && n != whoami) {
      dout(10) << " adding random peer osd." << n << dendl;
      extras.insert(n);
      _add_heartbeat_peer(n);
    }
    n = osdmap->get_next_up_osd_after(n);
    if (n == next)
      break;  // came full circle; stop
  }

  // too many?  drop extras (but never wanted peers) down to the minimum
  for (set<int>::iterator p = extras.begin();
       (int)heartbeat_peers.size() > cct->_conf->osd_heartbeat_min_peers && p != extras.end();
       ++p) {
    if (want.count(*p))
      continue;
    _remove_heartbeat_peer(*p);
  }

  dout(10) << "maybe_update_heartbeat_peers " << heartbeat_peers.size() << " peers, extras " << extras << dendl;
}
4951
4952void OSD::reset_heartbeat_peers()
4953{
11fdf7f2 4954 ceph_assert(osd_lock.is_locked());
7c673cae 4955 dout(10) << "reset_heartbeat_peers" << dendl;
11fdf7f2 4956 std::lock_guard l(heartbeat_lock);
7c673cae
FG
4957 while (!heartbeat_peers.empty()) {
4958 HeartbeatInfo& hi = heartbeat_peers.begin()->second;
4959 hi.con_back->mark_down();
4960 if (hi.con_front) {
4961 hi.con_front->mark_down();
4962 }
4963 heartbeat_peers.erase(heartbeat_peers.begin());
4964 }
4965 failure_queue.clear();
4966}
4967
// Handle an incoming heartbeat message (PING, PING_REPLY, or YOU_DIED) from
// a peer OSD.  Runs under heartbeat_lock and always consumes (puts) m.
void OSD::handle_osd_ping(MOSDPing *m)
{
  // ignore pings from a different cluster
  if (superblock.cluster_fsid != m->fsid) {
    dout(20) << "handle_osd_ping from " << m->get_source_inst()
             << " bad fsid " << m->fsid << " != " << superblock.cluster_fsid << dendl;
    m->put();
    return;
  }

  int from = m->get_source().num();

  heartbeat_lock.Lock();
  if (is_stopping()) {
    heartbeat_lock.Unlock();
    m->put();
    return;
  }

  OSDMapRef curmap = service.get_osdmap();
  if (!curmap) {
    // no map yet; nothing useful we can do
    heartbeat_lock.Unlock();
    m->put();
    return;
  }

  switch (m->op) {

  case MOSDPing::PING:
    {
      // debug hook: optionally drop a configurable run of pings per peer
      if (cct->_conf->osd_debug_drop_ping_probability > 0) {
        auto heartbeat_drop = debug_heartbeat_drops_remaining.find(from);
        if (heartbeat_drop != debug_heartbeat_drops_remaining.end()) {
          if (heartbeat_drop->second == 0) {
            debug_heartbeat_drops_remaining.erase(heartbeat_drop);
          } else {
            --heartbeat_drop->second;
            dout(5) << "Dropping heartbeat from " << from
                    << ", " << heartbeat_drop->second
                    << " remaining to drop" << dendl;
            break;
          }
        } else if (cct->_conf->osd_debug_drop_ping_probability >
                   ((((double)(rand()%100))/100.0))) {
          heartbeat_drop =
            debug_heartbeat_drops_remaining.insert(std::make_pair(from,
                             cct->_conf->osd_debug_drop_ping_duration)).first;
          dout(5) << "Dropping heartbeat from " << from
                  << ", " << heartbeat_drop->second
                  << " remaining to drop" << dendl;
          break;
        }
      }

      // don't answer if our own internal threads are stuck; letting the
      // ping time out is how peers learn we are unhealthy
      if (!cct->get_heartbeat_map()->is_healthy()) {
        dout(10) << "internal heartbeat not healthy, dropping ping request" << dendl;
        break;
      }

      // echo the sender's stamp back so it can match our reply to its ping
      Message *r = new MOSDPing(monc->get_fsid(),
                                curmap->get_epoch(),
                                MOSDPing::PING_REPLY, m->stamp,
                                cct->_conf->osd_heartbeat_min_size);
      m->get_connection()->send_message(r);

      if (curmap->is_up(from)) {
        service.note_peer_epoch(from, m->map_epoch);
        if (is_active()) {
          ConnectionRef con = service.get_con_osd_cluster(from, curmap->get_epoch());
          if (con) {
            service.share_map_peer(from, con.get());
          }
        }
      } else if (!curmap->exists(from) ||
                 curmap->get_down_at(from) > m->map_epoch) {
        // tell them they have died
        Message *r = new MOSDPing(monc->get_fsid(),
                                  curmap->get_epoch(),
                                  MOSDPing::YOU_DIED,
                                  m->stamp,
                                  cct->_conf->osd_heartbeat_min_size);
        m->get_connection()->send_message(r);
      }
    }
    break;

  case MOSDPing::PING_REPLY:
    {
      map<int,HeartbeatInfo>::iterator i = heartbeat_peers.find(from);
      if (i != heartbeat_peers.end()) {
        // look up the outstanding ping this reply corresponds to
        auto acked = i->second.ping_history.find(m->stamp);
        if (acked != i->second.ping_history.end()) {
          utime_t now = ceph_clock_now();
          // number of connections (back/front) still awaiting this reply
          int &unacknowledged = acked->second.second;
          if (m->get_connection() == i->second.con_back) {
            dout(25) << "handle_osd_ping got reply from osd." << from
                     << " first_tx " << i->second.first_tx
                     << " last_tx " << i->second.last_tx
                     << " last_rx_back " << i->second.last_rx_back << " -> " << now
                     << " last_rx_front " << i->second.last_rx_front
                     << dendl;
            i->second.last_rx_back = now;
            ceph_assert(unacknowledged > 0);
            --unacknowledged;
            // if there is no front con, set both stamps.
            if (i->second.con_front == NULL) {
              i->second.last_rx_front = now;
              ceph_assert(unacknowledged > 0);
              --unacknowledged;
            }
          } else if (m->get_connection() == i->second.con_front) {
            dout(25) << "handle_osd_ping got reply from osd." << from
                     << " first_tx " << i->second.first_tx
                     << " last_tx " << i->second.last_tx
                     << " last_rx_back " << i->second.last_rx_back
                     << " last_rx_front " << i->second.last_rx_front << " -> " << now
                     << dendl;
            i->second.last_rx_front = now;
            ceph_assert(unacknowledged > 0);
            --unacknowledged;
          }

          if (unacknowledged == 0) {
            // succeeded in getting all replies
            dout(25) << "handle_osd_ping got all replies from osd." << from
                     << " , erase pending ping(sent at " << m->stamp << ")"
                     << " and older pending ping(s)"
                     << dendl;
            i->second.ping_history.erase(i->second.ping_history.begin(), ++acked);
          }

          if (i->second.is_healthy(now)) {
            // Cancel false reports
            auto failure_queue_entry = failure_queue.find(from);
            if (failure_queue_entry != failure_queue.end()) {
              dout(10) << "handle_osd_ping canceling queued "
                       << "failure report for osd." << from << dendl;
              failure_queue.erase(failure_queue_entry);
            }

            auto failure_pending_entry = failure_pending.find(from);
            if (failure_pending_entry != failure_pending.end()) {
              dout(10) << "handle_osd_ping canceling in-flight "
                       << "failure report for osd." << from << dendl;
              send_still_alive(curmap->get_epoch(),
                               from,
                               failure_pending_entry->second.second);
              failure_pending.erase(failure_pending_entry);
            }
          }
        } else {
          // old replies, deprecated by newly sent pings.
          dout(10) << "handle_osd_ping no pending ping(sent at " << m->stamp
                   << ") is found, treat as covered by newly sent pings "
                   << "and ignore"
                   << dendl;
        }
      }

      if (m->map_epoch &&
          curmap->is_up(from)) {
        service.note_peer_epoch(from, m->map_epoch);
        if (is_active()) {
          ConnectionRef con = service.get_con_osd_cluster(from, curmap->get_epoch());
          if (con) {
            service.share_map_peer(from, con.get());
          }
        }
      }
    }
    break;

  case MOSDPing::YOU_DIED:
    // a peer's map shows us down; fetch newer maps so we can react
    dout(10) << "handle_osd_ping " << m->get_source_inst()
             << " says i am down in " << m->map_epoch << dendl;
    osdmap_subscribe(curmap->get_epoch()+1, false);
    break;
  }

  heartbeat_lock.Unlock();
  m->put();
}
5149
// Body of the heartbeat thread: send a round of pings, then sleep a
// randomized interval (0.5x..1.5x of osd_heartbeat_interval, roughly) until
// signaled, repeating until heartbeat_stop is set or the OSD stops.
void OSD::heartbeat_entry()
{
  // heartbeat_lock is held for the loop; heartbeat_cond.WaitInterval()
  // releases it while sleeping so other threads can touch heartbeat state
  std::lock_guard l(heartbeat_lock);
  if (is_stopping())
    return;
  while (!heartbeat_stop) {
    heartbeat();

    // randomize the sleep so peers' pings don't synchronize
    double wait = .5 + ((float)(rand() % 10)/10.0) * (float)cct->_conf->osd_heartbeat_interval;
    utime_t w;
    w.set_from_double(wait);
    dout(30) << "heartbeat_entry sleeping for " << wait << dendl;
    heartbeat_cond.WaitInterval(heartbeat_lock, w);
    if (is_stopping())
      return;
    dout(30) << "heartbeat_entry woke up" << dendl;
  }
}
5168
5169void OSD::heartbeat_check()
5170{
11fdf7f2 5171 ceph_assert(heartbeat_lock.is_locked());
7c673cae
FG
5172 utime_t now = ceph_clock_now();
5173
11fdf7f2 5174 // check for incoming heartbeats (move me elsewhere?)
7c673cae
FG
5175 for (map<int,HeartbeatInfo>::iterator p = heartbeat_peers.begin();
5176 p != heartbeat_peers.end();
5177 ++p) {
5178
5179 if (p->second.first_tx == utime_t()) {
5180 dout(25) << "heartbeat_check we haven't sent ping to osd." << p->first
11fdf7f2 5181 << " yet, skipping" << dendl;
7c673cae
FG
5182 continue;
5183 }
5184
5185 dout(25) << "heartbeat_check osd." << p->first
5186 << " first_tx " << p->second.first_tx
5187 << " last_tx " << p->second.last_tx
5188 << " last_rx_back " << p->second.last_rx_back
5189 << " last_rx_front " << p->second.last_rx_front
5190 << dendl;
11fdf7f2
TL
5191 if (p->second.is_unhealthy(now)) {
5192 utime_t oldest_deadline = p->second.ping_history.begin()->second.first;
7c673cae
FG
5193 if (p->second.last_rx_back == utime_t() ||
5194 p->second.last_rx_front == utime_t()) {
11fdf7f2
TL
5195 derr << "heartbeat_check: no reply from "
5196 << p->second.con_front->get_peer_addr().get_sockaddr()
5197 << " osd." << p->first
5198 << " ever on either front or back, first ping sent "
5199 << p->second.first_tx
5200 << " (oldest deadline " << oldest_deadline << ")"
5201 << dendl;
7c673cae 5202 // fail
11fdf7f2 5203 failure_queue[p->first] = p->second.first_tx;
7c673cae 5204 } else {
11fdf7f2
TL
5205 derr << "heartbeat_check: no reply from "
5206 << p->second.con_front->get_peer_addr().get_sockaddr()
7c673cae
FG
5207 << " osd." << p->first << " since back " << p->second.last_rx_back
5208 << " front " << p->second.last_rx_front
11fdf7f2
TL
5209 << " (oldest deadline " << oldest_deadline << ")"
5210 << dendl;
7c673cae 5211 // fail
11fdf7f2 5212 failure_queue[p->first] = std::min(p->second.last_rx_back, p->second.last_rx_front);
7c673cae
FG
5213 }
5214 }
5215 }
5216}
5217
// Send one round of heartbeats: update load/usage stats, then ping every
// peer on its back (and front, if present) connection, recording a deadline
// for each ping in ping_history.  Falls back to polling the mon for a new
// map if we have no peers at all.  Called from heartbeat_entry() with
// heartbeat_lock held.
void OSD::heartbeat()
{
  dout(30) << "heartbeat" << dendl;

  // get CPU load avg
  double loadavgs[1];
  int hb_interval = cct->_conf->osd_heartbeat_interval;
  // number of heartbeat samples per day, for the daily load average
  int n_samples = 86400;
  if (hb_interval > 1) {
    n_samples /= hb_interval;
    if (n_samples < 1)
      n_samples = 1;
  }

  if (getloadavg(loadavgs, 1) == 1) {
    logger->set(l_osd_loadavg, 100 * loadavgs[0]);
    // exponential-style moving average over roughly one day of samples
    daily_loadavg = (daily_loadavg * (n_samples - 1) + loadavgs[0]) / n_samples;
    dout(30) << "heartbeat: daily_loadavg " << daily_loadavg << dendl;
  }

  dout(30) << "heartbeat checking stats" << dendl;

  // refresh peer list and osd stats
  vector<int> hb_peers;
  for (map<int,HeartbeatInfo>::iterator p = heartbeat_peers.begin();
       p != heartbeat_peers.end();
       ++p)
    hb_peers.push_back(p->first);

  auto new_stat = service.set_osd_stat(hb_peers, get_num_pgs());
  dout(5) << __func__ << " " << new_stat << dendl;
  ceph_assert(new_stat.statfs.total);

  float pratio;
  float ratio = service.compute_adjusted_ratio(new_stat, &pratio);

  service.check_full_status(ratio, pratio);

  utime_t now = ceph_clock_now();
  // every ping sent this round shares the same reply deadline
  utime_t deadline = now;
  deadline += cct->_conf->osd_heartbeat_grace;

  // send heartbeats
  for (map<int,HeartbeatInfo>::iterator i = heartbeat_peers.begin();
       i != heartbeat_peers.end();
       ++i) {
    int peer = i->first;
    i->second.last_tx = now;
    if (i->second.first_tx == utime_t())
      i->second.first_tx = now;
    // expect HEARTBEAT_MAX_CONN acks (back + front) for this stamp
    i->second.ping_history[now] = make_pair(deadline,
      HeartbeatInfo::HEARTBEAT_MAX_CONN);
    dout(30) << "heartbeat sending ping to osd." << peer << dendl;
    i->second.con_back->send_message(new MOSDPing(monc->get_fsid(),
                                          service.get_osdmap_epoch(),
                                          MOSDPing::PING, now,
                                          cct->_conf->osd_heartbeat_min_size));

    if (i->second.con_front)
      i->second.con_front->send_message(new MOSDPing(monc->get_fsid(),
                                             service.get_osdmap_epoch(),
                                             MOSDPing::PING, now,
                                             cct->_conf->osd_heartbeat_min_size));
  }

  logger->set(l_osd_hb_to, heartbeat_peers.size());

  // hmm.. am i all alone?
  dout(30) << "heartbeat lonely?" << dendl;
  if (heartbeat_peers.empty()) {
    if (now - last_mon_heartbeat > cct->_conf->osd_mon_heartbeat_interval && is_active()) {
      last_mon_heartbeat = now;
      dout(10) << "i have no heartbeat peers; checking mon for new map" << dendl;
      osdmap_subscribe(osdmap->get_epoch() + 1, false);
    }
  }

  dout(30) << "heartbeat done" << dendl;
}
5297
// Messenger callback for a reset (failed) heartbeat connection.  If the
// connection belongs to a current heartbeat peer, tear down both of that
// peer's connections and try to reopen them; if the peer vanished from the
// map in the meantime, drop it.  Always returns true (connection handled).
bool OSD::heartbeat_reset(Connection *con)
{
  std::lock_guard l(heartbeat_lock);
  // detach the HeartbeatSession we attached in _add_heartbeat_peer()
  auto s = con->get_priv();
  con->set_priv(nullptr);
  if (s) {
    if (is_stopping()) {
      return true;
    }
    auto heartbeat_session = static_cast<HeartbeatSession*>(s.get());
    auto p = heartbeat_peers.find(heartbeat_session->peer);
    // only act if this con is still one of the peer's current connections
    if (p != heartbeat_peers.end() &&
        (p->second.con_back == con ||
         p->second.con_front == con)) {
      dout(10) << "heartbeat_reset failed hb con " << con << " for osd." << p->second.peer
               << ", reopening" << dendl;
      // mark down the sibling connection too (but not `con` itself, which
      // the messenger is already resetting), then drop both refs
      if (con != p->second.con_back) {
        p->second.con_back->mark_down();
      }
      p->second.con_back.reset(NULL);
      if (p->second.con_front && con != p->second.con_front) {
        p->second.con_front->mark_down();
      }
      p->second.con_front.reset(NULL);
      pair<ConnectionRef,ConnectionRef> newcon = service.get_con_osd_hb(p->second.peer, p->second.epoch);
      if (newcon.first) {
        // reattach the same session to the fresh connections
        p->second.con_back = newcon.first.get();
        p->second.con_back->set_priv(s);
        if (newcon.second) {
          p->second.con_front = newcon.second.get();
          p->second.con_front->set_priv(s);
        }
        // outstanding pings were sent on the dead connections; forget them
        p->second.ping_history.clear();
      } else {
        dout(10) << "heartbeat_reset failed hb con " << con << " for osd." << p->second.peer
                 << ", raced with osdmap update, closing out peer" << dendl;
        heartbeat_peers.erase(p);
      }
    } else {
      dout(10) << "heartbeat_reset closing (old) failed hb con " << con << dendl;
    }
  }
  return true;
}
5342
5343
5344
5345// =========================================
5346
// Periodic timer callback that runs under osd_lock: refresh heartbeat
// peers, retry boot while waiting to become healthy, drain waiters, and
// re-arm itself.
void OSD::tick()
{
  ceph_assert(osd_lock.is_locked());
  dout(10) << "tick" << dendl;

  if (is_active() || is_waiting_for_healthy()) {
    maybe_update_heartbeat_peers();
  }

  if (is_waiting_for_healthy()) {
    start_boot();
    if (is_waiting_for_healthy()) {
      // failed to boot
      std::lock_guard l(heartbeat_lock);
      utime_t now = ceph_clock_now();
      if (now - last_mon_heartbeat > cct->_conf->osd_mon_heartbeat_interval) {
        last_mon_heartbeat = now;
        dout(1) << __func__ << " checking mon for new map" << dendl;
        osdmap_subscribe(osdmap->get_epoch() + 1, false);
      }
    }
  }

  do_waiters();

  // schedule the next tick
  tick_timer.add_event_after(get_tick_interval(), new C_Tick(this));
}
5374
// Periodic timer callback that deliberately does NOT take osd_lock: updates
// perf counters and store statfs, checks heartbeats, sends mon reports and
// beacons, kicks scrub/recovery, and re-arms itself.  Runs under
// tick_timer_lock.
void OSD::tick_without_osd_lock()
{
  ceph_assert(tick_timer_lock.is_locked());
  dout(10) << "tick_without_osd_lock" << dendl;

  logger->set(l_osd_cached_crc, buffer::get_cached_crc());
  logger->set(l_osd_cached_crc_adjusted, buffer::get_cached_crc_adjusted());
  logger->set(l_osd_missed_crc, buffer::get_missed_crc());

  // refresh osd stats
  struct store_statfs_t stbuf;
  osd_alert_list_t alerts;
  int r = store->statfs(&stbuf, &alerts);
  ceph_assert(r == 0);
  service.set_statfs(stbuf, alerts);

  // osd_lock is not being held, which means the OSD state
  // might change when doing the monitor report
  if (is_active() || is_waiting_for_healthy()) {
    heartbeat_lock.Lock();
    heartbeat_check();
    heartbeat_lock.Unlock();

    map_lock.get_read();
    std::lock_guard l(mon_report_lock);

    // mon report?
    utime_t now = ceph_clock_now();
    if (service.need_fullness_update() ||
        now - last_mon_report > cct->_conf->osd_mon_report_interval) {
      last_mon_report = now;
      send_full_update();
      send_failures();
    }
    map_lock.put_read();

    // if any shard is waiting for a newer map than we have, go get it
    epoch_t max_waiting_epoch = 0;
    for (auto s : shards) {
      max_waiting_epoch = std::max(max_waiting_epoch,
                                   s->get_max_waiting_epoch());
    }
    if (max_waiting_epoch > get_osdmap()->get_epoch()) {
      dout(20) << __func__ << " max_waiting_epoch " << max_waiting_epoch
               << ", requesting new map" << dendl;
      osdmap_subscribe(superblock.newest_map + 1, false);
    }
  }

  if (is_active()) {
    if (!scrub_random_backoff()) {
      sched_scrub();
    }
    service.promote_throttle_recalibrate();
    resume_creating_pg();

    bool need_send_beacon = false;
    const auto now = ceph::coarse_mono_clock::now();
    {
      // borrow lec lock to protect last_sent_beacon from changing
      std::lock_guard l{min_last_epoch_clean_lock};
      const auto elapsed = now - last_sent_beacon;
      if (chrono::duration_cast<chrono::seconds>(elapsed).count() >
          cct->_conf->osd_beacon_report_interval) {
        need_send_beacon = true;
      }
    }
    if (need_send_beacon) {
      send_beacon(now);
    }
  }

  mgrc.update_daemon_health(get_health_metrics());
  service.kick_recovery_queue();
  // schedule the next tick
  tick_timer_without_osd_lock.add_event_after(get_tick_interval(),
                                              new C_Tick_WithoutOSDLock(this));
}
5450
7c673cae
FG
5451// Usage:
5452// setomapval <pool-id> [namespace/]<obj-name> <key> <val>
5453// rmomapkey <pool-id> [namespace/]<obj-name> <key>
5454// setomapheader <pool-id> [namespace/]<obj-name> <header>
5455// getomap <pool> [namespace/]<obj-name>
5456// truncobj <pool-id> [namespace/]<obj-name> <newlen>
5457// injectmdataerr [namespace/]<obj-name> [shardid]
5458// injectdataerr [namespace/]<obj-name> [shardid]
5459//
5460// set_recovery_delay [utime]
5461void TestOpsSocketHook::test_ops(OSDService *service, ObjectStore *store,
11fdf7f2
TL
5462 std::string_view command,
5463 const cmdmap_t& cmdmap, ostream &ss)
7c673cae
FG
5464{
5465 //Test support
5466 //Support changing the omap on a single osd by using the Admin Socket to
5467 //directly request the osd make a change.
5468 if (command == "setomapval" || command == "rmomapkey" ||
5469 command == "setomapheader" || command == "getomap" ||
5470 command == "truncobj" || command == "injectmdataerr" ||
5471 command == "injectdataerr"
5472 ) {
5473 pg_t rawpg;
5474 int64_t pool;
5475 OSDMapRef curmap = service->get_osdmap();
5476 int r = -1;
5477
5478 string poolstr;
5479
5480 cmd_getval(service->cct, cmdmap, "pool", poolstr);
5481 pool = curmap->lookup_pg_pool_name(poolstr);
5482 //If we can't find it by name then maybe id specified
5483 if (pool < 0 && isdigit(poolstr[0]))
5484 pool = atoll(poolstr.c_str());
5485 if (pool < 0) {
b5b8bbf5 5486 ss << "Invalid pool '" << poolstr << "''";
7c673cae
FG
5487 return;
5488 }
5489
5490 string objname, nspace;
5491 cmd_getval(service->cct, cmdmap, "objname", objname);
5492 std::size_t found = objname.find_first_of('/');
5493 if (found != string::npos) {
5494 nspace = objname.substr(0, found);
5495 objname = objname.substr(found+1);
5496 }
5497 object_locator_t oloc(pool, nspace);
5498 r = curmap->object_locator_to_pg(object_t(objname), oloc, rawpg);
5499
5500 if (r < 0) {
5501 ss << "Invalid namespace/objname";
5502 return;
5503 }
5504
5505 int64_t shardid;
5506 cmd_getval(service->cct, cmdmap, "shardid", shardid, int64_t(shard_id_t::NO_SHARD));
5507 hobject_t obj(object_t(objname), string(""), CEPH_NOSNAP, rawpg.ps(), pool, nspace);
5508 ghobject_t gobj(obj, ghobject_t::NO_GEN, shard_id_t(uint8_t(shardid)));
5509 spg_t pgid(curmap->raw_pg_to_pg(rawpg), shard_id_t(shardid));
5510 if (curmap->pg_is_ec(rawpg)) {
5511 if ((command != "injectdataerr") && (command != "injectmdataerr")) {
5512 ss << "Must not call on ec pool, except injectdataerr or injectmdataerr";
5513 return;
5514 }
5515 }
5516
5517 ObjectStore::Transaction t;
5518
5519 if (command == "setomapval") {
5520 map<string, bufferlist> newattrs;
5521 bufferlist val;
5522 string key, valstr;
5523 cmd_getval(service->cct, cmdmap, "key", key);
5524 cmd_getval(service->cct, cmdmap, "val", valstr);
5525
5526 val.append(valstr);
5527 newattrs[key] = val;
5528 t.omap_setkeys(coll_t(pgid), ghobject_t(obj), newattrs);
11fdf7f2 5529 r = store->queue_transaction(service->meta_ch, std::move(t));
7c673cae
FG
5530 if (r < 0)
5531 ss << "error=" << r;
5532 else
5533 ss << "ok";
5534 } else if (command == "rmomapkey") {
5535 string key;
5536 set<string> keys;
5537 cmd_getval(service->cct, cmdmap, "key", key);
5538
5539 keys.insert(key);
5540 t.omap_rmkeys(coll_t(pgid), ghobject_t(obj), keys);
11fdf7f2 5541 r = store->queue_transaction(service->meta_ch, std::move(t));
7c673cae
FG
5542 if (r < 0)
5543 ss << "error=" << r;
5544 else
5545 ss << "ok";
5546 } else if (command == "setomapheader") {
5547 bufferlist newheader;
5548 string headerstr;
5549
5550 cmd_getval(service->cct, cmdmap, "header", headerstr);
5551 newheader.append(headerstr);
5552 t.omap_setheader(coll_t(pgid), ghobject_t(obj), newheader);
11fdf7f2 5553 r = store->queue_transaction(service->meta_ch, std::move(t));
7c673cae
FG
5554 if (r < 0)
5555 ss << "error=" << r;
5556 else
5557 ss << "ok";
5558 } else if (command == "getomap") {
5559 //Debug: Output entire omap
5560 bufferlist hdrbl;
5561 map<string, bufferlist> keyvals;
11fdf7f2
TL
5562 auto ch = store->open_collection(coll_t(pgid));
5563 if (!ch) {
5564 ss << "unable to open collection for " << pgid;
5565 r = -ENOENT;
5566 } else {
5567 r = store->omap_get(ch, ghobject_t(obj), &hdrbl, &keyvals);
5568 if (r >= 0) {
7c673cae
FG
5569 ss << "header=" << string(hdrbl.c_str(), hdrbl.length());
5570 for (map<string, bufferlist>::iterator it = keyvals.begin();
11fdf7f2 5571 it != keyvals.end(); ++it)
7c673cae
FG
5572 ss << " key=" << (*it).first << " val="
5573 << string((*it).second.c_str(), (*it).second.length());
11fdf7f2 5574 } else {
7c673cae 5575 ss << "error=" << r;
11fdf7f2 5576 }
7c673cae
FG
5577 }
5578 } else if (command == "truncobj") {
5579 int64_t trunclen;
5580 cmd_getval(service->cct, cmdmap, "len", trunclen);
5581 t.truncate(coll_t(pgid), ghobject_t(obj), trunclen);
11fdf7f2 5582 r = store->queue_transaction(service->meta_ch, std::move(t));
7c673cae
FG
5583 if (r < 0)
5584 ss << "error=" << r;
5585 else
5586 ss << "ok";
5587 } else if (command == "injectdataerr") {
5588 store->inject_data_error(gobj);
5589 ss << "ok";
5590 } else if (command == "injectmdataerr") {
5591 store->inject_mdata_error(gobj);
5592 ss << "ok";
5593 }
5594 return;
5595 }
5596 if (command == "set_recovery_delay") {
5597 int64_t delay;
5598 cmd_getval(service->cct, cmdmap, "utime", delay, (int64_t)0);
5599 ostringstream oss;
5600 oss << delay;
11fdf7f2 5601 int r = service->cct->_conf.set_val("osd_recovery_delay_start",
7c673cae
FG
5602 oss.str().c_str());
5603 if (r != 0) {
5604 ss << "set_recovery_delay: error setting "
5605 << "osd_recovery_delay_start to '" << delay << "': error "
5606 << r;
5607 return;
5608 }
11fdf7f2 5609 service->cct->_conf.apply_changes(nullptr);
7c673cae
FG
5610 ss << "set_recovery_delay: set osd_recovery_delay_start "
5611 << "to " << service->cct->_conf->osd_recovery_delay_start;
5612 return;
5613 }
a8e16298 5614 if (command == "trigger_scrub" || command == "trigger_deep_scrub") {
7c673cae 5615 spg_t pgid;
a8e16298 5616 bool deep = (command == "trigger_deep_scrub");
7c673cae
FG
5617 OSDMapRef curmap = service->get_osdmap();
5618
5619 string pgidstr;
5620
5621 cmd_getval(service->cct, cmdmap, "pgid", pgidstr);
5622 if (!pgid.parse(pgidstr.c_str())) {
5623 ss << "Invalid pgid specified";
5624 return;
5625 }
5626
a8e16298
TL
5627 int64_t time;
5628 cmd_getval(service->cct, cmdmap, "time", time, (int64_t)0);
5629
11fdf7f2 5630 PGRef pg = service->osd->_lookup_lock_pg(pgid);
7c673cae
FG
5631 if (pg == nullptr) {
5632 ss << "Can't find pg " << pgid;
5633 return;
5634 }
5635
5636 if (pg->is_primary()) {
5637 pg->unreg_next_scrub();
5638 const pg_pool_t *p = curmap->get_pg_pool(pgid.pool());
5639 double pool_scrub_max_interval = 0;
a8e16298
TL
5640 double scrub_max_interval;
5641 if (deep) {
5642 p->opts.get(pool_opts_t::DEEP_SCRUB_INTERVAL, &pool_scrub_max_interval);
5643 scrub_max_interval = pool_scrub_max_interval > 0 ?
11fdf7f2 5644 pool_scrub_max_interval : g_conf()->osd_deep_scrub_interval;
a8e16298
TL
5645 } else {
5646 p->opts.get(pool_opts_t::SCRUB_MAX_INTERVAL, &pool_scrub_max_interval);
5647 scrub_max_interval = pool_scrub_max_interval > 0 ?
11fdf7f2 5648 pool_scrub_max_interval : g_conf()->osd_scrub_max_interval;
a8e16298 5649 }
7c673cae
FG
5650 // Instead of marking must_scrub force a schedule scrub
5651 utime_t stamp = ceph_clock_now();
a8e16298
TL
5652 if (time == 0)
5653 stamp -= scrub_max_interval;
5654 else
5655 stamp -= (float)time;
5656 stamp -= 100.0; // push back last scrub more for good measure
5657 if (deep) {
5658 pg->set_last_deep_scrub_stamp(stamp);
5659 } else {
5660 pg->set_last_scrub_stamp(stamp);
5661 }
7c673cae 5662 pg->reg_next_scrub();
a8e16298
TL
5663 pg->publish_stats_to_osd();
5664 ss << "ok - set" << (deep ? " deep" : "" ) << " stamp " << stamp;
7c673cae
FG
5665 } else {
5666 ss << "Not primary";
5667 }
5668 pg->unlock();
5669 return;
5670 }
5671 if (command == "injectfull") {
5672 int64_t count;
5673 string type;
5674 OSDService::s_names state;
5675 cmd_getval(service->cct, cmdmap, "type", type, string("full"));
5676 cmd_getval(service->cct, cmdmap, "count", count, (int64_t)-1);
5677 if (type == "none" || count == 0) {
5678 type = "none";
5679 count = 0;
5680 }
5681 state = service->get_full_state(type);
5682 if (state == OSDService::s_names::INVALID) {
5683 ss << "Invalid type use (none, nearfull, backfillfull, full, failsafe)";
5684 return;
5685 }
5686 service->set_injectfull(state, count);
5687 return;
5688 }
5689 ss << "Internal error - command=" << command;
5690}
5691
7c673cae
FG
5692// =========================================
5693
// Messenger callback: a connection we initiated is now established.
// Only mon connections matter here: a (re)connect means a brand-new mon
// session, so resend whatever the mon needs based on our boot state
// (preboot -> restart boot, booting -> resend MOSDBoot, otherwise ->
// resend fullness/alive/pg_temp/failure state and, if active, a beacon).
5694void OSD::ms_handle_connect(Connection *con)
5695{
5696  dout(10) << __func__ << " con " << con << dendl;
5697  if (con->get_peer_type() == CEPH_ENTITY_TYPE_MON) {
11fdf7f2 5698    std::lock_guard l(osd_lock);
7c673cae
FG
5699    if (is_stopping())
5700      return;
5701    dout(10) << __func__ << " on mon" << dendl;
5702
5703    if (is_preboot()) {
5704      start_boot();
5705    } else if (is_booting()) {
5706      _send_boot(); // resend boot message
5707    } else {
      // Lock order: osd_lock (held) -> map_lock (read) -> mon_report_lock.
5708      map_lock.get_read();
11fdf7f2 5709      std::lock_guard l2(mon_report_lock);
7c673cae
FG
5710
5711      utime_t now = ceph_clock_now();
5712      last_mon_report = now;
5713
5714      // resend everything, it's a new session
5715      send_full_update();
5716      send_alive();
5717      service.requeue_pg_temp();
11fdf7f2 5718      service.clear_sent_ready_to_merge();
7c673cae 5719      service.send_pg_temp();
11fdf7f2
TL
5720      service.send_ready_to_merge();
5721      service.send_pg_created();
7c673cae
FG
5722      requeue_failures();
5723      send_failures();
7c673cae
FG
5724
5725      map_lock.put_read();
5726      if (is_active()) {
5727        send_beacon(ceph::coarse_mono_clock::now());
5728      }
5729    }
5730
5731    // full map requests may happen while active or pre-boot
5732    if (requested_full_first) {
5733      rerequest_full_maps();
5734    }
5735  }
5736}
5737
5738void OSD::ms_handle_fast_connect(Connection *con)
5739{
5740 if (con->get_peer_type() != CEPH_ENTITY_TYPE_MON &&
5741 con->get_peer_type() != CEPH_ENTITY_TYPE_MGR) {
11fdf7f2
TL
5742 auto priv = con->get_priv();
5743 auto s = static_cast<Session*>(priv.get());
7c673cae 5744 if (!s) {
11fdf7f2
TL
5745 s = new Session{cct, con};
5746 con->set_priv(RefCountedPtr{s, false});
7c673cae
FG
5747 dout(10) << " new session (outgoing) " << s << " con=" << s->con
5748 << " addr=" << s->con->get_peer_addr() << dendl;
5749 // we don't connect to clients
11fdf7f2 5750 ceph_assert(con->get_peer_type() == CEPH_ENTITY_TYPE_OSD);
7c673cae
FG
5751 s->entity_name.set_type(CEPH_ENTITY_TYPE_OSD);
5752 }
7c673cae
FG
5753 }
5754}
5755
5756void OSD::ms_handle_fast_accept(Connection *con)
5757{
5758 if (con->get_peer_type() != CEPH_ENTITY_TYPE_MON &&
5759 con->get_peer_type() != CEPH_ENTITY_TYPE_MGR) {
11fdf7f2
TL
5760 auto priv = con->get_priv();
5761 auto s = static_cast<Session*>(priv.get());
7c673cae 5762 if (!s) {
11fdf7f2
TL
5763 s = new Session{cct, con};
5764 con->set_priv(RefCountedPtr{s, false});
7c673cae
FG
5765 dout(10) << "new session (incoming)" << s << " con=" << con
5766 << " addr=" << con->get_peer_addr()
5767 << " must have raced with connect" << dendl;
11fdf7f2 5768 ceph_assert(con->get_peer_type() == CEPH_ENTITY_TYPE_OSD);
7c673cae
FG
5769 s->entity_name.set_type(CEPH_ENTITY_TYPE_OSD);
5770 }
7c673cae
FG
5771 }
5772}
5773
5774bool OSD::ms_handle_reset(Connection *con)
5775{
11fdf7f2
TL
5776 auto s = con->get_priv();
5777 auto session = static_cast<Session*>(s.get());
7c673cae
FG
5778 dout(2) << "ms_handle_reset con " << con << " session " << session << dendl;
5779 if (!session)
5780 return false;
5781 session->wstate.reset(con);
11fdf7f2
TL
5782 session->con->set_priv(nullptr);
5783 session->con.reset(); // break con <-> session ref cycle
7c673cae
FG
5784 // note that we break session->con *before* the session_handle_reset
5785 // cleanup below. this avoids a race between us and
5786 // PG::add_backoff, Session::check_backoff, etc.
11fdf7f2 5787 session_handle_reset(SessionRef{session});
7c673cae
FG
5788 return true;
5789}
5790
// Messenger callback: the peer actively refused our connection (e.g.
// ECONNREFUSED).  When osd_fast_fail_on_connection_refused is enabled,
// immediately report a still-"up" peer OSD as failed to the mon instead
// of waiting for the normal heartbeat grace to expire.
5791bool OSD::ms_handle_refused(Connection *con)
5792{
5793  if (!cct->_conf->osd_fast_fail_on_connection_refused)
5794    return false;
5795
11fdf7f2
TL
5796  auto priv = con->get_priv();
5797  auto session = static_cast<Session*>(priv.get());
7c673cae
FG
5798  dout(2) << "ms_handle_refused con " << con << " session " << session << dendl;
5799  if (!session)
5800    return false;
5801  int type = con->get_peer_type();
5802  // handle only OSD failures here
5803  if (monc && (type == CEPH_ENTITY_TYPE_OSD)) {
5804    OSDMapRef osdmap = get_osdmap();
5805    if (osdmap) {
5806      int id = osdmap->identify_osd_on_all_channels(con->get_peer_addr());
5807      if (id >= 0 && osdmap->is_up(id)) {
5808	// I'm cheating mon heartbeat grace logic, because we know it's not going
5809	// to respawn alone. +1 so we won't hit any boundary case.
11fdf7f2
TL
5810	monc->send_mon_message(
5811	  new MOSDFailure(
5812	    monc->get_fsid(),
5813	    id,
5814	    osdmap->get_addrs(id),
5815	    cct->_conf->osd_heartbeat_grace + 1,
5816	    osdmap->get_epoch(),
5817	    MOSDFailure::FLAG_IMMEDIATE | MOSDFailure::FLAG_FAILED
5818	    ));
7c673cae
FG
5819      }
5820    }
5821  }
7c673cae
FG
5822  return true;
5823}
5824
5825struct C_OSD_GetVersion : public Context {
5826 OSD *osd;
5827 uint64_t oldest, newest;
5828 explicit C_OSD_GetVersion(OSD *o) : osd(o), oldest(0), newest(0) {}
5829 void finish(int r) override {
5830 if (r >= 0)
5831 osd->_got_mon_epochs(oldest, newest);
5832 }
5833};
5834
5835void OSD::start_boot()
5836{
5837 if (!_is_healthy()) {
5838 // if we are not healthy, do not mark ourselves up (yet)
5839 dout(1) << "not healthy; waiting to boot" << dendl;
5840 if (!is_waiting_for_healthy())
5841 start_waiting_for_healthy();
5842 // send pings sooner rather than later
5843 heartbeat_kick();
5844 return;
5845 }
5846 dout(1) << __func__ << dendl;
5847 set_state(STATE_PREBOOT);
5848 dout(10) << "start_boot - have maps " << superblock.oldest_map
5849 << ".." << superblock.newest_map << dendl;
5850 C_OSD_GetVersion *c = new C_OSD_GetVersion(this);
5851 monc->get_version("osdmap", &c->newest, &c->oldest, c);
5852}
5853
5854void OSD::_got_mon_epochs(epoch_t oldest, epoch_t newest)
5855{
11fdf7f2 5856 std::lock_guard l(osd_lock);
7c673cae
FG
5857 if (is_preboot()) {
5858 _preboot(oldest, newest);
5859 }
5860}
5861
// Decide whether we can send MOSDBoot yet.  Called with osd_lock held,
// in PREBOOT state, after the mon told us its [oldest, newest] osdmap
// range.  Each branch below is a reason we cannot boot right now; the
// fall-through at the end subscribes to newer maps so we retry later.
5862void OSD::_preboot(epoch_t oldest, epoch_t newest)
5863{
11fdf7f2 5864  ceph_assert(is_preboot());
7c673cae
FG
5865  dout(10) << __func__ << " _preboot mon has osdmaps "
5866	   << oldest << ".." << newest << dendl;
5867
5868  // ensure our local fullness awareness is accurate
5869  heartbeat();
5870
5871  // if our map within recent history, try to add ourselves to the osdmap.
31f18b77
FG
5872  if (osdmap->get_epoch() == 0) {
5873    derr << "waiting for initial osdmap" << dendl;
c07f9fc5 5874  } else if (osdmap->is_destroyed(whoami)) {
b32b8144
FG
5875    derr << "osdmap says I am destroyed" << dendl;
5876    // provide a small margin so we don't livelock seeing if we
5877    // un-destroyed ourselves.
5878    if (osdmap->get_epoch() > newest - 1) {
5879      exit(0);
5880    }
31f18b77 5881  } else if (osdmap->test_flag(CEPH_OSDMAP_NOUP) || osdmap->is_noup(whoami)) {
7c673cae
FG
5882    derr << "osdmap NOUP flag is set, waiting for it to clear" << dendl;
5883  } else if (!osdmap->test_flag(CEPH_OSDMAP_SORTBITWISE)) {
5884    derr << "osdmap SORTBITWISE OSDMap flag is NOT set; please set it"
5885	 << dendl;
11fdf7f2
TL
5886  } else if (osdmap->require_osd_release < CEPH_RELEASE_LUMINOUS) {
5887    derr << "osdmap require_osd_release < luminous; please upgrade to luminous"
7c673cae 5888	 << dendl;
7c673cae
FG
5889  } else if (service.need_fullness_update()) {
5890    derr << "osdmap fullness state needs update" << dendl;
5891    send_full_update();
  // Our map is recent enough (within osd_map_message_max of the mon's
  // newest): we can boot once peering work has drained.
5892  } else if (osdmap->get_epoch() >= oldest - 1 &&
5893	     osdmap->get_epoch() + cct->_conf->osd_map_message_max > newest) {
11fdf7f2
TL
5894
5895    // wait for pgs to fully catch up in a different thread, since
5896    // this thread might be required for splitting and merging PGs to
5897    // make progress.
5898    boot_finisher.queue(
5899      new FunctionContext(
5900	[this](int r) {
5901	  std::lock_guard l(osd_lock);
5902	  if (is_preboot()) {
5903	    dout(10) << __func__ << " waiting for peering work to drain"
5904		     << dendl;
	    // Drop osd_lock while blocking on the shards; re-check
	    // is_preboot() afterwards since state may have changed.
5905	    osd_lock.Unlock();
5906	    for (auto shard : shards) {
5907	      shard->wait_min_pg_epoch(osdmap->get_epoch());
5908	    }
5909	    osd_lock.Lock();
5910	  }
5911	  if (is_preboot()) {
5912	    _send_boot();
5913	  }
5914	}));
5915    return;
7c673cae
FG
5916  }
5917
5918  // get all the latest maps
5919  if (osdmap->get_epoch() + 1 >= oldest)
5920    osdmap_subscribe(osdmap->get_epoch() + 1, false);
5921  else
5922    osdmap_subscribe(oldest - 1, true);
5923}
5924
5925void OSD::send_full_update()
5926{
5927 if (!service.need_fullness_update())
5928 return;
5929 unsigned state = 0;
5930 if (service.is_full()) {
5931 state = CEPH_OSD_FULL;
5932 } else if (service.is_backfillfull()) {
5933 state = CEPH_OSD_BACKFILLFULL;
5934 } else if (service.is_nearfull()) {
5935 state = CEPH_OSD_NEARFULL;
5936 }
5937 set<string> s;
5938 OSDMap::calc_state_set(state, s);
5939 dout(10) << __func__ << " want state " << s << dendl;
5940 monc->send_mon_message(new MOSDFull(osdmap->get_epoch(), state));
5941}
5942
5943void OSD::start_waiting_for_healthy()
5944{
5945 dout(1) << "start_waiting_for_healthy" << dendl;
5946 set_state(STATE_WAITING_FOR_HEALTHY);
5947 last_heartbeat_resample = utime_t();
181888fb
FG
5948
5949 // subscribe to osdmap updates, in case our peers really are known to be dead
5950 osdmap_subscribe(osdmap->get_epoch() + 1, false);
7c673cae
FG
5951}
5952
// Return true when this OSD looks healthy enough to (re)boot: the
// internal heartbeat map is healthy and, if we have been marked down
// repeatedly within osd_max_markdown_period, enough of our heartbeat
// peers are reachable (>= osd_heartbeat_min_healthy_ratio).
5953bool OSD::_is_healthy()
5954{
5955  if (!cct->get_heartbeat_map()->is_healthy()) {
5956    dout(1) << "is_healthy false -- internal heartbeat failed" << dendl;
5957    return false;
5958  }
5959
5960  if (is_waiting_for_healthy()) {
11fdf7f2
TL
5961    utime_t now = ceph_clock_now();
5962    utime_t grace = utime_t(cct->_conf->osd_max_markdown_period, 0);
    // Trim markdown events older than the grace window.
5963    while (!osd_markdown_log.empty() &&
5964           osd_markdown_log.front() + grace < now)
5965      osd_markdown_log.pop_front();
5966    if (osd_markdown_log.size() <= 1) {
5967      dout(5) << __func__ << " first time marked as down,"
5968              << " try reboot unconditionally" << dendl;
5969      return true;
5970    }
5971    std::lock_guard l(heartbeat_lock);
7c673cae
FG
5972    int num = 0, up = 0;
5973    for (map<int,HeartbeatInfo>::iterator p = heartbeat_peers.begin();
5974	 p != heartbeat_peers.end();
5975	 ++p) {
11fdf7f2 5976      if (p->second.is_healthy(now))
7c673cae
FG
5977	++up;
5978      ++num;
5979    }
5980    if ((float)up < (float)num * cct->_conf->osd_heartbeat_min_healthy_ratio) {
5981      dout(1) << "is_healthy false -- only " << up << "/" << num << " up peers (less than "
5982	      << int(cct->_conf->osd_heartbeat_min_healthy_ratio * 100.0) << "%)" << dendl;
5983      return false;
5984    }
5985  }
5986
5987  return true;
5988}
5989
// Build and send the MOSDBoot message to the mon.  First resolve our
// client/cluster/heartbeat addresses -- each messenger that was bound
// to an unknown address inherits it from the already-known one -- then
// make sure each loopback connection has a session, record numa
// metadata, and finally send the boot message.
5990void OSD::_send_boot()
5991{
5992  dout(10) << "_send_boot" << dendl;
11fdf7f2
TL
5993  Connection *local_connection =
5994    cluster_messenger->get_loopback_connection().get();
5995  entity_addrvec_t client_addrs = client_messenger->get_myaddrs();
5996  entity_addrvec_t cluster_addrs = cluster_messenger->get_myaddrs();
5997  entity_addrvec_t hb_back_addrs = hb_back_server_messenger->get_myaddrs();
5998  entity_addrvec_t hb_front_addrs = hb_front_server_messenger->get_myaddrs();
5999
6000  dout(20) << " initial client_addrs " << client_addrs
6001	   << ", cluster_addrs " << cluster_addrs
6002	   << ", hb_back_addrs " << hb_back_addrs
6003	   << ", hb_front_addrs " << hb_front_addrs
6004	   << dendl;
  // cluster addrs default to the client (public) addrs if still unknown.
6005  if (cluster_messenger->set_addr_unknowns(client_addrs)) {
6006    dout(10) << " assuming cluster_addrs match client_addrs "
6007	     << client_addrs << dendl;
6008    cluster_addrs = cluster_messenger->get_myaddrs();
6009  }
  // Ensure the loopback connection has a Session attached.
6010  if (auto session = local_connection->get_priv(); !session) {
6011    cluster_messenger->ms_deliver_handle_fast_connect(local_connection);
7c673cae
FG
6012  }
6013
  // hb_back addrs default to the (now-resolved) cluster addrs.
7c673cae 6014  local_connection = hb_back_server_messenger->get_loopback_connection().get();
11fdf7f2
TL
6015  if (hb_back_server_messenger->set_addr_unknowns(cluster_addrs)) {
6016    dout(10) << " assuming hb_back_addrs match cluster_addrs "
6017	     << cluster_addrs << dendl;
6018    hb_back_addrs = hb_back_server_messenger->get_myaddrs();
7c673cae 6019  }
11fdf7f2
TL
6020  if (auto session = local_connection->get_priv(); !session) {
6021    hb_back_server_messenger->ms_deliver_handle_fast_connect(local_connection);
7c673cae
FG
6022  }
6023
  // hb_front addrs default to the client (public) addrs.
11fdf7f2
TL
6024  local_connection = hb_front_server_messenger->get_loopback_connection().get();
6025  if (hb_front_server_messenger->set_addr_unknowns(client_addrs)) {
6026    dout(10) << " assuming hb_front_addrs match client_addrs "
6027	     << client_addrs << dendl;
6028    hb_front_addrs = hb_front_server_messenger->get_myaddrs();
6029  }
6030  if (auto session = local_connection->get_priv(); !session) {
6031    hb_front_server_messenger->ms_deliver_handle_fast_connect(local_connection);
6032  }
6033
6034  // we now know what our front and back addrs will be, and we are
6035  // about to tell the mon what our metadata (including numa bindings)
6036  // are, so now is a good time!
6037  set_numa_affinity();
6038
6039  MOSDBoot *mboot = new MOSDBoot(
6040    superblock, get_osdmap_epoch(), service.get_boot_epoch(),
6041    hb_back_addrs, hb_front_addrs, cluster_addrs,
6042    CEPH_FEATURES_ALL);
6043  dout(10) << " final client_addrs " << client_addrs
6044	   << ", cluster_addrs " << cluster_addrs
6045	   << ", hb_back_addrs " << hb_back_addrs
6046	   << ", hb_front_addrs " << hb_front_addrs
7c673cae
FG
6047	   << dendl;
6048  _collect_metadata(&mboot->metadata);
6049  monc->send_mon_message(mboot);
6050  set_state(STATE_BOOTING);
6051}
6052
// Fill *pm with this OSD's metadata key/value pairs for the mon:
// config paths, messenger addresses, object store properties, system
// info, network interface + numa topology, and backing device ids.
6053void OSD::_collect_metadata(map<string,string> *pm)
6054{
6055  // config info
6056  (*pm)["osd_data"] = dev_path;
c07f9fc5
FG
6057  if (store->get_type() == "filestore") {
6058    // not applicable for bluestore
6059    (*pm)["osd_journal"] = journal_path;
6060  }
11fdf7f2
TL
6061  (*pm)["front_addr"] = stringify(client_messenger->get_myaddrs());
6062  (*pm)["back_addr"] = stringify(cluster_messenger->get_myaddrs());
6063  (*pm)["hb_front_addr"] = stringify(hb_front_server_messenger->get_myaddrs());
6064  (*pm)["hb_back_addr"] = stringify(hb_back_server_messenger->get_myaddrs());
7c673cae
FG
6065
6066  // backend
6067  (*pm)["osd_objectstore"] = store->get_type();
31f18b77 6068  (*pm)["rotational"] = store_is_rotational ? "1" : "0";
d2e6a577 6069  (*pm)["journal_rotational"] = journal_is_rotational ? "1" : "0";
c07f9fc5 6070  (*pm)["default_device_class"] = store->get_default_device_class();
7c673cae
FG
6071  store->collect_metadata(pm);
6072
6073  collect_sys_info(pm, cct);
6074
  // which local interfaces carry our public/cluster traffic
11fdf7f2
TL
6075  (*pm)["front_iface"] = pick_iface(
6076    cct,
6077    client_messenger->get_myaddrs().front().get_sockaddr_storage());
6078  (*pm)["back_iface"] = pick_iface(
6079    cct,
6080    cluster_messenger->get_myaddrs().front().get_sockaddr_storage());
6081
6082  // network numa
6083  {
6084    int node = -1;
6085    set<int> nodes;
6086    set<string> unknown;
6087    for (auto nm : { "front_iface", "back_iface" }) {
6088      if (!(*pm)[nm].size()) {
6089	unknown.insert(nm);
6090	continue;
6091      }
6092      int n = -1;
6093      int r = get_iface_numa_node((*pm)[nm], &n);
6094      if (r < 0) {
6095	unknown.insert((*pm)[nm]);
6096	continue;
6097      }
6098      nodes.insert(n);
6099      if (node < 0) {
6100	node = n;
6101      }
6102    }
6103    if (unknown.size()) {
6104      (*pm)["network_numa_unknown_ifaces"] = stringify(unknown);
6105    }
6106    if (!nodes.empty()) {
6107      (*pm)["network_numa_nodes"] = stringify(nodes);
6108    }
    // Only report a single numa node when both ifaces resolved to it.
6109    if (node >= 0 && nodes.size() == 1 && unknown.empty()) {
6110      (*pm)["network_numa_node"] = stringify(node);
6111    }
6112  }
6113
6114  if (numa_node >= 0) {
6115    (*pm)["numa_node"] = stringify(numa_node);
6116    (*pm)["numa_node_cpus"] = cpu_set_to_str_list(numa_cpu_set_size,
6117						  &numa_cpu_set);
6118  }
6119
  // "devices" lists device names; "device_ids" maps dev=unique-id pairs,
  // skipping devices for which no unique id could be determined.
6120  set<string> devnames;
6121  store->get_devices(&devnames);
6122  (*pm)["devices"] = stringify(devnames);
6123  string devids;
6124  for (auto& dev : devnames) {
6125    string err;
6126    string id = get_device_id(dev, &err);
6127    if (id.size()) {
6128      if (!devids.empty()) {
6129	devids += ",";
6130      }
6131      devids += dev + "=" + id;
6132    } else {
6133      dout(10) << __func__ << " no unique device id for " << dev << ": "
6134	       << err << dendl;
6135    }
6136  }
6137  (*pm)["device_ids"] = devids;
b5b8bbf5 6138
7c673cae
FG
6139  dout(10) << __func__ << " " << *pm << dendl;
6140}
6141
6142void OSD::queue_want_up_thru(epoch_t want)
6143{
6144 map_lock.get_read();
6145 epoch_t cur = osdmap->get_up_thru(whoami);
11fdf7f2 6146 std::lock_guard l(mon_report_lock);
7c673cae
FG
6147 if (want > up_thru_wanted) {
6148 dout(10) << "queue_want_up_thru now " << want << " (was " << up_thru_wanted << ")"
6149 << ", currently " << cur
6150 << dendl;
6151 up_thru_wanted = want;
6152 send_alive();
6153 } else {
6154 dout(10) << "queue_want_up_thru want " << want << " <= queued " << up_thru_wanted
6155 << ", currently " << cur
6156 << dendl;
6157 }
6158 map_lock.put_read();
6159}
6160
6161void OSD::send_alive()
6162{
11fdf7f2 6163 ceph_assert(mon_report_lock.is_locked());
7c673cae
FG
6164 if (!osdmap->exists(whoami))
6165 return;
6166 epoch_t up_thru = osdmap->get_up_thru(whoami);
6167 dout(10) << "send_alive up_thru currently " << up_thru << " want " << up_thru_wanted << dendl;
6168 if (up_thru_wanted > up_thru) {
6169 dout(10) << "send_alive want " << up_thru_wanted << dendl;
6170 monc->send_mon_message(new MOSDAlive(osdmap->get_epoch(), up_thru_wanted));
6171 }
6172}
6173
6174void OSD::request_full_map(epoch_t first, epoch_t last)
6175{
6176 dout(10) << __func__ << " " << first << ".." << last
6177 << ", previously requested "
6178 << requested_full_first << ".." << requested_full_last << dendl;
11fdf7f2
TL
6179 ceph_assert(osd_lock.is_locked());
6180 ceph_assert(first > 0 && last > 0);
6181 ceph_assert(first <= last);
6182 ceph_assert(first >= requested_full_first); // we shouldn't ever ask for older maps
7c673cae
FG
6183 if (requested_full_first == 0) {
6184 // first request
6185 requested_full_first = first;
6186 requested_full_last = last;
6187 } else if (last <= requested_full_last) {
6188 // dup
6189 return;
6190 } else {
6191 // additional request
6192 first = requested_full_last + 1;
6193 requested_full_last = last;
6194 }
6195 MMonGetOSDMap *req = new MMonGetOSDMap;
6196 req->request_full(first, last);
6197 monc->send_mon_message(req);
6198}
6199
6200void OSD::got_full_map(epoch_t e)
6201{
11fdf7f2
TL
6202 ceph_assert(requested_full_first <= requested_full_last);
6203 ceph_assert(osd_lock.is_locked());
7c673cae
FG
6204 if (requested_full_first == 0) {
6205 dout(20) << __func__ << " " << e << ", nothing requested" << dendl;
6206 return;
6207 }
6208 if (e < requested_full_first) {
6209 dout(10) << __func__ << " " << e << ", requested " << requested_full_first
6210 << ".." << requested_full_last
6211 << ", ignoring" << dendl;
6212 return;
6213 }
6214 if (e >= requested_full_last) {
6215 dout(10) << __func__ << " " << e << ", requested " << requested_full_first
6216 << ".." << requested_full_last << ", resetting" << dendl;
6217 requested_full_first = requested_full_last = 0;
6218 return;
6219 }
6220
6221 requested_full_first = e + 1;
6222
6223 dout(10) << __func__ << " " << e << ", requested " << requested_full_first
6224 << ".." << requested_full_last
6225 << ", still need more" << dendl;
6226}
6227
6228void OSD::requeue_failures()
6229{
11fdf7f2 6230 std::lock_guard l(heartbeat_lock);
7c673cae
FG
6231 unsigned old_queue = failure_queue.size();
6232 unsigned old_pending = failure_pending.size();
11fdf7f2 6233 for (auto p = failure_pending.begin(); p != failure_pending.end(); ) {
7c673cae
FG
6234 failure_queue[p->first] = p->second.first;
6235 failure_pending.erase(p++);
6236 }
6237 dout(10) << __func__ << " " << old_queue << " + " << old_pending << " -> "
6238 << failure_queue.size() << dendl;
6239}
6240
// Drain failure_queue, sending an MOSDFailure to the mon for each OSD
// not already reported, and remember each report in failure_pending so
// it can be canceled later (see cancel_pending_failures).  Caller must
// hold map_lock (read) and mon_report_lock; heartbeat_lock protects
// the two failure maps.
6241void OSD::send_failures()
6242{
11fdf7f2
TL
6243  ceph_assert(map_lock.is_locked());
6244  ceph_assert(mon_report_lock.is_locked());
6245  std::lock_guard l(heartbeat_lock);
7c673cae
FG
6246  utime_t now = ceph_clock_now();
6247  while (!failure_queue.empty()) {
6248    int osd = failure_queue.begin()->first;
7c673cae
FG
6249    if (!failure_pending.count(osd)) {
      // failure_queue maps osd -> time the failure was first observed.
6250      int failed_for = (int)(double)(now - failure_queue.begin()->second);
11fdf7f2
TL
6251      monc->send_mon_message(
6252	new MOSDFailure(
6253	  monc->get_fsid(),
6254	  osd,
6255	  osdmap->get_addrs(osd),
6256	  failed_for,
6257	  osdmap->get_epoch()));
      // Record (failure time, addrs) so we can retract the report later.
6258      failure_pending[osd] = make_pair(failure_queue.begin()->second,
6259				       osdmap->get_addrs(osd));
7c673cae
FG
6260    }
6261    failure_queue.erase(osd);
6262  }
6263}
6264
11fdf7f2 6265void OSD::send_still_alive(epoch_t epoch, int osd, const entity_addrvec_t &addrs)
7c673cae 6266{
11fdf7f2
TL
6267 MOSDFailure *m = new MOSDFailure(monc->get_fsid(), osd, addrs, 0, epoch,
6268 MOSDFailure::FLAG_ALIVE);
7c673cae
FG
6269 monc->send_mon_message(m);
6270}
6271
11fdf7f2 6272void OSD::cancel_pending_failures()
7c673cae 6273{
11fdf7f2
TL
6274 std::lock_guard l(heartbeat_lock);
6275 auto it = failure_pending.begin();
6276 while (it != failure_pending.end()) {
6277 dout(10) << __func__ << " canceling in-flight failure report for osd."
6278 << it->first << dendl;
6279 send_still_alive(osdmap->get_epoch(), it->first, it->second.second);
6280 failure_pending.erase(it++);
7c673cae 6281 }
7c673cae
FG
6282}
6283
// Send an MOSDBeacon to the mon carrying our min_last_epoch_clean and
// the pg list accumulated since the last beacon.  Requires a monmap
// with the LUMINOUS feature; monmap.epoch > 0 guards against a
// monmap that is not initialized yet.
6284void OSD::send_beacon(const ceph::coarse_mono_clock::time_point& now)
6285{
6286  const auto& monmap = monc->monmap;
6287  // send beacon to mon even if we are just connected, and the monmap is not
6288  // initialized yet by then.
6289  if (monmap.epoch > 0 &&
6290      monmap.get_required_features().contains_all(
6291        ceph::features::mon::FEATURE_LUMINOUS)) {
6292    dout(20) << __func__ << " sending" << dendl;
7c673cae
FG
6293    MOSDBeacon* beacon = nullptr;
6294    {
      // Lock only around building the message; the swap hands the
      // accumulated pg list to the beacon and resets the local list.
11fdf7f2 6295      std::lock_guard l{min_last_epoch_clean_lock};
7c673cae
FG
6296      beacon = new MOSDBeacon(osdmap->get_epoch(), min_last_epoch_clean);
6297      std::swap(beacon->pgs, min_last_epoch_clean_pgs);
224ce89b 6298      last_sent_beacon = now;
7c673cae
FG
6299    }
6300    monc->send_mon_message(beacon);
6301  } else {
6302    dout(20) << __func__ << " not sending" << dendl;
6303  }
6304}
6305
6306void OSD::handle_command(MMonCommand *m)
6307{
6308 if (!require_mon_peer(m)) {
6309 m->put();
6310 return;
6311 }
6312
6313 Command *c = new Command(m->cmd, m->get_tid(), m->get_data(), NULL);
6314 command_wq.queue(c);
6315 m->put();
6316}
6317
6318void OSD::handle_command(MCommand *m)
6319{
6320 ConnectionRef con = m->get_connection();
11fdf7f2
TL
6321 auto priv = con->get_priv();
6322 auto session = static_cast<Session *>(priv.get());
7c673cae
FG
6323 if (!session) {
6324 con->send_message(new MCommandReply(m, -EPERM));
6325 m->put();
6326 return;
6327 }
6328
6329 OSDCap& caps = session->caps;
11fdf7f2 6330 priv.reset();
7c673cae
FG
6331
6332 if (!caps.allow_all() || m->get_source().is_mon()) {
6333 con->send_message(new MCommandReply(m, -EPERM));
6334 m->put();
6335 return;
6336 }
6337
6338 Command *c = new Command(m->cmd, m->get_tid(), m->get_data(), con.get());
6339 command_wq.queue(c);
6340
6341 m->put();
6342}
6343
// Table of command descriptions advertised by the OSD (served via the
// "get_command_descriptions" command) and used to validate commands in
// _do_command().  Each COMMAND() entry expands to one OSDCommand
// aggregate: parse signature, help text, module, and required perms.
6344struct OSDCommand {
6345  string cmdstring;
6346  string helpstring;
6347  string module;
6348  string perm;
7c673cae
FG
6349} osd_commands[] = {
6350
11fdf7f2
TL
6351#define COMMAND(parsesig, helptext, module, perm) \
6352  {parsesig, helptext, module, perm},
7c673cae
FG
6353
6354// yes, these are really pg commands, but there's a limit to how
6355// much work it's worth.  The OSD returns all of them.  Make this
6356// form (pg <pgid> <cmd>) valid only for the cli.
6357// Rest uses "tell <pgid> <cmd>"
6358
6359COMMAND("pg " \
	"name=pgid,type=CephPgid " \
	"name=cmd,type=CephChoices,strings=query", \
11fdf7f2 6362	"show details of a specific pg", "osd", "r")
7c673cae
FG
6363COMMAND("pg " \
	"name=pgid,type=CephPgid " \
	"name=cmd,type=CephChoices,strings=mark_unfound_lost " \
	"name=mulcmd,type=CephChoices,strings=revert|delete", \
	"mark all unfound objects in this pg as lost, either removing or reverting to a prior version if one is available",
11fdf7f2 6368	"osd", "rw")
7c673cae
FG
6369COMMAND("pg " \
	"name=pgid,type=CephPgid " \
11fdf7f2 6371	"name=cmd,type=CephChoices,strings=list_unfound " \
7c673cae 6372	"name=offset,type=CephString,req=false",
11fdf7f2
TL
6373	"list unfound objects on this pg, perhaps starting at an offset given in JSON",
6374	"osd", "r")
7c673cae
FG
6375
6376// new form: tell <pgid> <cmd> for both cli and rest
6377
6378COMMAND("query",
11fdf7f2 6379	"show details of a specific pg", "osd", "r")
7c673cae
FG
6380COMMAND("mark_unfound_lost " \
	"name=mulcmd,type=CephChoices,strings=revert|delete", \
	"mark all unfound objects in this pg as lost, either removing or reverting to a prior version if one is available",
11fdf7f2
TL
6383	"osd", "rw")
6384COMMAND("list_unfound " \
7c673cae 6385	"name=offset,type=CephString,req=false",
11fdf7f2
TL
6386	"list unfound objects on this pg, perhaps starting at an offset given in JSON",
6387	"osd", "r")
31f18b77
FG
6388COMMAND("perf histogram dump "
        "name=logger,type=CephString,req=false "
        "name=counter,type=CephString,req=false",
	"Get histogram data",
11fdf7f2 6392	"osd", "r")
7c673cae
FG
6393
6394// tell <osd.n> commands.  Validation of osd.n must be special-cased in client
11fdf7f2
TL
6395COMMAND("version", "report version of OSD", "osd", "r")
6396COMMAND("get_command_descriptions", "list commands descriptions", "osd", "r")
7c673cae
FG
6397COMMAND("injectargs " \
	"name=injected_args,type=CephString,n=N",
	"inject configuration arguments into running OSD",
11fdf7f2 6400	"osd", "rw")
c07f9fc5
FG
6401COMMAND("config set " \
	"name=key,type=CephString name=value,type=CephString",
	"Set a configuration option at runtime (not persistent)",
11fdf7f2
TL
6404	"osd", "rw")
6405COMMAND("config get " \
	"name=key,type=CephString",
	"Get a configuration option at runtime",
6408	"osd", "r")
6409COMMAND("config unset " \
	"name=key,type=CephString",
	"Unset a configuration option at runtime (not persistent)",
6412	"osd", "rw")
7c673cae
FG
6413COMMAND("cluster_log " \
	"name=level,type=CephChoices,strings=error,warning,info,debug " \
	"name=message,type=CephString,n=N",
	"log a message to the cluster log",
11fdf7f2 6417	"osd", "rw")
7c673cae
FG
6418COMMAND("bench " \
	"name=count,type=CephInt,req=false " \
	"name=size,type=CephInt,req=false " \
	"name=object_size,type=CephInt,req=false " \
	"name=object_num,type=CephInt,req=false ", \
	"OSD benchmark: write <count> <size>-byte objects, " \
	"(default 1G size 4MB). Results in log.",
11fdf7f2
TL
6425	"osd", "rw")
6426COMMAND("flush_pg_stats", "flush pg stats", "osd", "rw")
7c673cae 6427COMMAND("heap " \
11fdf7f2
TL
6428	"name=heapcmd,type=CephChoices,strings="\
	"dump|start_profiler|stop_profiler|release|get_release_rate|set_release_rate|stats " \
	"name=value,type=CephString,req=false",
	"show heap usage info (available only if compiled with tcmalloc)",
6432	"osd", "rw")
7c673cae
FG
6433COMMAND("debug dump_missing " \
	"name=filename,type=CephFilepath",
11fdf7f2 6435	"dump missing objects to a named file", "osd", "r")
7c673cae
FG
6436COMMAND("debug kick_recovery_wq " \
	"name=delay,type=CephInt,range=0",
11fdf7f2 6438	"set osd_recovery_delay_start to <val>", "osd", "rw")
7c673cae
FG
6439COMMAND("cpu_profiler " \
	"name=arg,type=CephChoices,strings=status|flush",
11fdf7f2 6441	"run cpu profiling on daemon", "osd", "rw")
7c673cae 6442COMMAND("dump_pg_recovery_stats", "dump pg recovery statistics",
11fdf7f2 6443	"osd", "r")
7c673cae 6444COMMAND("reset_pg_recovery_stats", "reset pg recovery statistics",
11fdf7f2 6445	"osd", "rw")
224ce89b
WB
6446COMMAND("compact",
        "compact object store's omap. "
        "WARNING: Compaction probably slows your requests",
11fdf7f2
TL
6449        "osd", "rw")
6450COMMAND("smart name=devid,type=CephString,req=False",
        "runs smartctl on this osd devices. ",
6452        "osd", "rw")
6453COMMAND("cache drop",
        "Drop all OSD caches",
6455        "osd", "rwx")
6456COMMAND("cache status",
        "Get OSD caches statistics",
6458        "osd", "r")
6459COMMAND("send_beacon",
        "Send OSD beacon to mon immediately",
6461        "osd", "r")
7c673cae
FG
6462};
6463
11fdf7f2
TL
// Entry point for queued Command work items: parse the JSON command
// vector into a cmdmap, dispatch to _do_command(), and send an
// MCommandReply on the originating connection (when there is one).
// A return of -EAGAIN from _do_command means the command was requeued
// and no reply must be sent yet.
6464void OSD::do_command(
6465  Connection *con, ceph_tid_t tid, vector<string>& cmd, bufferlist& data)
6466{
6467  dout(20) << "do_command tid " << tid << " " << cmd << dendl;
6468
6469  int r = 0;
6470  stringstream ss, ds;
6471  bufferlist odata;
6472  cmdmap_t cmdmap;
6473  if (cmd.empty()) {
6474    ss << "no command given";
6475    goto out;
6476  }
6477  if (!cmdmap_from_json(cmd, &cmdmap, ss)) {
6478    r = -EINVAL;
6479    goto out;
6480  }
6481
6482  try {
6483    r = _do_command(con, cmdmap, tid, data, odata, ss, ds);
6484  } catch (const bad_cmd_get& e) {
    // thrown by cmd_getval() on a missing/mistyped argument
6485    r = -EINVAL;
6486    ss << e.what();
6487  }
6488  if (r == -EAGAIN) {
6489    return;
6490  }
6491 out:
6492  string rs = ss.str();
6493  odata.append(ds);
6494  dout(0) << "do_command r=" << r << " " << rs << dendl;
6495  clog->info() << rs;
6496  if (con) {
6497    MCommandReply *reply = new MCommandReply(r, rs);
6498    reply->set_tid(tid);
6499    reply->set_data(odata);
6500    con->send_message(reply);
6501  }
6502}
6503
f64942e4
AA
6504namespace {
6505 class unlock_guard {
6506 Mutex& m;
6507 public:
6508 explicit unlock_guard(Mutex& mutex)
6509 : m(mutex)
6510 {
11fdf7f2 6511 m.unlock();
f64942e4
AA
6512 }
6513 unlock_guard(unlock_guard&) = delete;
6514 ~unlock_guard() {
11fdf7f2 6515 m.lock();
f64942e4
AA
6516 }
6517 };
6518}
6519
11fdf7f2
TL
// Execute one admin/tell command against this OSD.
//
// Called from do_command() with the command already parsed into 'cmdmap'.
// Human-readable status goes to 'ss', formatted/data output to 'ds',
// binary payload to 'odata'.  Returns 0 on success, a negative errno on
// error, or -EAGAIN when the reply will be sent asynchronously (caller
// must not reply).
//
// Runs with osd_lock held (unlock_guard below temporarily drops it for
// config operations that may take other locks).
int OSD::_do_command(
  Connection *con, cmdmap_t& cmdmap, ceph_tid_t tid, bufferlist& data,
  bufferlist& odata, stringstream& ss, stringstream& ds)
{
  int r = 0;
  string prefix;
  string format;
  string pgidstr;
  boost::scoped_ptr<Formatter> f;

  cmd_getval(cct, cmdmap, "prefix", prefix);

  if (prefix == "get_command_descriptions") {
    // dump the static osd_commands table as JSON (used by clients for
    // command validation/completion)
    int cmdnum = 0;
    JSONFormatter *f = new JSONFormatter();  // note: shadows the scoped_ptr above
    f->open_object_section("command_descriptions");
    for (OSDCommand *cp = osd_commands;
	 cp < &osd_commands[std::size(osd_commands)]; cp++) {

      ostringstream secname;
      secname << "cmd" << setfill('0') << std::setw(3) << cmdnum;
      dump_cmddesc_to_json(f, con->get_features(),
                           secname.str(), cp->cmdstring, cp->helpstring,
			   cp->module, cp->perm, 0);
      cmdnum++;
    }
    f->close_section();	// command_descriptions

    f->flush(ds);
    delete f;
    goto out;
  }

  cmd_getval(cct, cmdmap, "format", format);
  f.reset(Formatter::create(format));

  if (prefix == "version") {
    // report daemon version, formatted if a formatter was requested
    if (f) {
      f->open_object_section("version");
      f->dump_string("version", pretty_version_to_str());
      f->close_section();
      f->flush(ds);
    } else {
      ds << pretty_version_to_str();
    }
    goto out;
  }
  else if (prefix == "injectargs") {
    // apply config options given as a whitespace-joined argument list
    vector<string> argsvec;
    cmd_getval(cct, cmdmap, "injected_args", argsvec);

    if (argsvec.empty()) {
      r = -EINVAL;
      ss << "ignoring empty injectargs";
      goto out;
    }
    string args = argsvec.front();
    for (vector<string>::iterator a = ++argsvec.begin(); a != argsvec.end(); ++a)
      args += " " + *a;
    // drop osd_lock while the config system runs observers
    unlock_guard unlock{osd_lock};
    r = cct->_conf.injectargs(args, &ss);
  }
  else if (prefix == "config set") {
    std::string key;
    std::string val;
    cmd_getval(cct, cmdmap, "key", key);
    cmd_getval(cct, cmdmap, "value", val);
    unlock_guard unlock{osd_lock};
    r = cct->_conf.set_val(key, val, &ss);
    if (r == 0) {
      cct->_conf.apply_changes(nullptr);
    }
  }
  else if (prefix == "config get") {
    std::string key;
    cmd_getval(cct, cmdmap, "key", key);
    unlock_guard unlock{osd_lock};
    std::string val;
    r = cct->_conf.get_val(key, &val);
    if (r == 0) {
      ds << val;
    }
  }
  else if (prefix == "config unset") {
    std::string key;
    cmd_getval(cct, cmdmap, "key", key);
    unlock_guard unlock{osd_lock};
    r = cct->_conf.rm_val(key);
    if (r == 0) {
      cct->_conf.apply_changes(nullptr);
    }
    if (r == -ENOENT) {
      r = 0; // make command idempotent
    }
  }
  else if (prefix == "cluster_log") {
    // emit a message into the cluster log at the requested level
    vector<string> msg;
    cmd_getval(cct, cmdmap, "message", msg);
    if (msg.empty()) {
      r = -EINVAL;
      ss << "ignoring empty log message";
      goto out;
    }
    string message = msg.front();
    for (vector<string>::iterator a = ++msg.begin(); a != msg.end(); ++a)
      message += " " + *a;
    string lvl;
    cmd_getval(cct, cmdmap, "level", lvl);
    clog_type level = string_to_clog_type(lvl);
    if (level < 0) {
      r = -EINVAL;
      ss << "unknown level '" << lvl << "'";
      goto out;
    }
    clog->do_log(level, message);
  }

  // either 'pg <pgid> <command>' or
  // 'tell <pgid>' (which comes in without any of that prefix)?

  else if (prefix == "pg" ||
	   prefix == "query" ||
	   prefix == "mark_unfound_lost" ||
	   prefix == "list_unfound"
	   ) {
    pg_t pgid;

    if (!cmd_getval(cct, cmdmap, "pgid", pgidstr)) {
      ss << "no pgid specified";
      r = -EINVAL;
    } else if (!pgid.parse(pgidstr.c_str())) {
      ss << "couldn't parse pgid '" << pgidstr << "'";
      r = -EINVAL;
    } else {
      spg_t pcand;
      PGRef pg;
      // map raw pgid to our primary shard and take the pg lock
      if (osdmap->get_primary_shard(pgid, &pcand) &&
	  (pg = _lookup_lock_pg(pcand))) {
	if (pg->is_primary()) {
	  // simulate pg <pgid> cmd= for pg->do-command
	  if (prefix != "pg")
	    cmd_putval(cct, cmdmap, "cmd", prefix);
	  try {
	    r = pg->do_command(cmdmap, ss, data, odata, con, tid);
	  } catch (const bad_cmd_get& e) {
	    pg->unlock();
	    ss << e.what();
	    return -EINVAL;
	  }
	  if (r == -EAGAIN) {
	    pg->unlock();
	    // don't reply, pg will do so async
	    return -EAGAIN;
	  }
	} else {
	  ss << "not primary for pgid " << pgid;

	  // send them the latest diff to ensure they realize the mapping
	  // has changed.
	  service.send_incremental_map(osdmap->get_epoch() - 1, con, osdmap);

	  // do not reply; they will get newer maps and realize they
	  // need to resend.
	  pg->unlock();
	  return -EAGAIN;
	}
	pg->unlock();
      } else {
	ss << "i don't have pgid " << pgid;
	r = -ENOENT;
      }
    }
  }

  else if (prefix == "bench") {
    // synthetic write benchmark against the object store's meta collection
    int64_t count;
    int64_t bsize;
    int64_t osize, onum;
    // default count 1G, size 4MB
    cmd_getval(cct, cmdmap, "count", count, (int64_t)1 << 30);
    cmd_getval(cct, cmdmap, "size", bsize, (int64_t)4 << 20);
    cmd_getval(cct, cmdmap, "object_size", osize, (int64_t)0);
    cmd_getval(cct, cmdmap, "object_num", onum, (int64_t)0);

    uint32_t duration = cct->_conf->osd_bench_duration;

    if (bsize > (int64_t) cct->_conf->osd_bench_max_block_size) {
      // let us limit the block size because the next checks rely on it
      // having a sane value. If we allow any block size to be set things
      // can still go sideways.
      ss << "block 'size' values are capped at "
	 << byte_u_t(cct->_conf->osd_bench_max_block_size) << ". If you wish to use"
	 << " a higher value, please adjust 'osd_bench_max_block_size'";
      r = -EINVAL;
      goto out;
    } else if (bsize < (int64_t) (1 << 20)) {
      // entering the realm of small block sizes.
      // limit the count to a sane value, assuming a configurable amount of
      // IOPS and duration, so that the OSD doesn't get hung up on this,
      // preventing timeouts from going off
      int64_t max_count =
	bsize * duration * cct->_conf->osd_bench_small_size_max_iops;
      if (count > max_count) {
	ss << "'count' values greater than " << max_count
	   << " for a block size of " << byte_u_t(bsize) << ", assuming "
	   << cct->_conf->osd_bench_small_size_max_iops << " IOPS,"
	   << " for " << duration << " seconds,"
	   << " can cause ill effects on osd. "
	   << " Please adjust 'osd_bench_small_size_max_iops' with a higher"
	   << " value if you wish to use a higher 'count'.";
	r = -EINVAL;
	goto out;
      }
    } else {
      // 1MB block sizes are big enough so that we get more stuff done.
      // However, to avoid the osd from getting hung on this and having
      // timers being triggered, we are going to limit the count assuming
      // a configurable throughput and duration.
      // NOTE: max_count is the total amount of bytes that we believe we
      //       will be able to write during 'duration' for the given
      //       throughput.  The block size hardly impacts this unless it's
      //       way too big.  Given we already check how big the block size
      //       is, it's safe to assume everything will check out.
      int64_t max_count =
	cct->_conf->osd_bench_large_size_max_throughput * duration;
      if (count > max_count) {
	ss << "'count' values greater than " << max_count
	   << " for a block size of " << byte_u_t(bsize) << ", assuming "
	   << byte_u_t(cct->_conf->osd_bench_large_size_max_throughput) << "/s,"
	   << " for " << duration << " seconds,"
	   << " can cause ill effects on osd. "
	   << " Please adjust 'osd_bench_large_size_max_throughput'"
	   << " with a higher value if you wish to use a higher 'count'.";
	r = -EINVAL;
	goto out;
      }
    }

    // a block can never be larger than the object it is written into
    if (osize && bsize > osize)
      bsize = osize;

    dout(1) << " bench count " << count
            << " bsize " << byte_u_t(bsize) << dendl;

    ObjectStore::Transaction cleanupt;

    if (osize && onum) {
      // pre-create 'onum' objects of 'osize' bytes so the timed loop below
      // performs overwrites rather than allocations
      bufferlist bl;
      bufferptr bp(osize);
      bp.zero();
      bl.push_back(std::move(bp));
      bl.rebuild_page_aligned();
      for (int i=0; i<onum; ++i) {
	char nm[30];
	snprintf(nm, sizeof(nm), "disk_bw_test_%d", i);
	object_t oid(nm);
	hobject_t soid(sobject_t(oid, 0));
	ObjectStore::Transaction t;
	t.write(coll_t(), ghobject_t(soid), 0, osize, bl);
	store->queue_transaction(service.meta_ch, std::move(t), NULL);
	cleanupt.remove(coll_t(), ghobject_t(soid));
      }
    }

    // zero-filled, page-aligned write buffer reused for every block
    bufferlist bl;
    bufferptr bp(bsize);
    bp.zero();
    bl.push_back(std::move(bp));
    bl.rebuild_page_aligned();

    {
      // wait for the pre-created objects to commit before timing starts
      C_SaferCond waiter;
      if (!service.meta_ch->flush_commit(&waiter)) {
	waiter.wait();
      }
    }

    utime_t start = ceph_clock_now();
    for (int64_t pos = 0; pos < count; pos += bsize) {
      char nm[30];
      unsigned offset = 0;
      if (onum && osize) {
	// overwrite mode: random object, random block-aligned offset
	// (bsize <= osize is guaranteed above, so the modulus is nonzero)
	snprintf(nm, sizeof(nm), "disk_bw_test_%d", (int)(rand() % onum));
	offset = rand() % (osize / bsize) * bsize;
      } else {
	// allocation mode: one fresh object per block
	snprintf(nm, sizeof(nm), "disk_bw_test_%lld", (long long)pos);
      }
      object_t oid(nm);
      hobject_t soid(sobject_t(oid, 0));
      ObjectStore::Transaction t;
      t.write(coll_t::meta(), ghobject_t(soid), offset, bsize, bl);
      store->queue_transaction(service.meta_ch, std::move(t), NULL);
      if (!onum || !osize)
	cleanupt.remove(coll_t::meta(), ghobject_t(soid));
    }

    {
      // wait for all benchmark writes to commit before stopping the clock
      C_SaferCond waiter;
      if (!service.meta_ch->flush_commit(&waiter)) {
	waiter.wait();
      }
    }
    utime_t end = ceph_clock_now();

    // clean up
    store->queue_transaction(service.meta_ch, std::move(cleanupt), NULL);
    {
      C_SaferCond waiter;
      if (!service.meta_ch->flush_commit(&waiter)) {
	waiter.wait();
      }
    }

    double elapsed = end - start;
    double rate = count / elapsed;      // bytes per second
    double iops = rate / bsize;         // blocks per second
    if (f) {
      f->open_object_section("osd_bench_results");
      f->dump_int("bytes_written", count);
      f->dump_int("blocksize", bsize);
      f->dump_float("elapsed_sec", elapsed);
      f->dump_float("bytes_per_sec", rate);
      f->dump_float("iops", iops);
      f->close_section();
      f->flush(ds);
    } else {
      ds << "bench: wrote " << byte_u_t(count)
	 << " in blocks of " << byte_u_t(bsize) << " in "
	 << elapsed << " sec at " << byte_u_t(rate) << "/sec "
	 << si_u_t(iops) << " IOPS";
    }
  }

  else if (prefix == "flush_pg_stats") {
    // push pg stats to the mgr and report the current stat sequence number
    mgrc.send_pgstats();
    ds << service.get_osd_stat_seq() << "\n";
  }

  else if (prefix == "heap") {
    // NOTE(review): '*f' dereferences the scoped_ptr; if no/unknown format
    // was requested, Formatter::create may have returned null — confirm.
    r = ceph::osd_cmds::heap(*cct, cmdmap, *f, ds);
  }

  else if (prefix == "debug dump_missing") {
    // dump each pg's missing set; defaults to pretty JSON when no
    // formatter was requested
    if (!f) {
      f.reset(new JSONFormatter(true));
    }
    f->open_array_section("pgs");
    vector<PGRef> pgs;
    _get_pgs(&pgs);
    for (auto& pg : pgs) {
      string s = stringify(pg->pg_id);
      f->open_array_section(s.c_str());
      pg->lock();
      pg->dump_missing(f.get());
      pg->unlock();
      f->close_section();
    }
    f->close_section();
    f->flush(ds);
  }
  else if (prefix == "debug kick_recovery_wq") {
    // adjust osd_recovery_delay_start, which nudges the recovery queue
    int64_t delay;
    cmd_getval(cct, cmdmap, "delay", delay);
    ostringstream oss;
    oss << delay;
    unlock_guard unlock{osd_lock};
    r = cct->_conf.set_val("osd_recovery_delay_start", oss.str().c_str());
    if (r != 0) {
      ss << "kick_recovery_wq: error setting "
	 << "osd_recovery_delay_start to '" << delay << "': error "
	 << r;
      goto out;
    }
    cct->_conf.apply_changes(nullptr);
    ss << "kicking recovery queue. set osd_recovery_delay_start "
       << "to " << cct->_conf->osd_recovery_delay_start;
  }

  else if (prefix == "cpu_profiler") {
    string arg;
    cmd_getval(cct, cmdmap, "arg", arg);
    vector<string> argvec;
    get_str_vec(arg, argvec);
    cpu_profiler_handle_command(argvec, ds);
  }

  else if (prefix == "dump_pg_recovery_stats") {
    stringstream s;
    if (f) {
      pg_recovery_stats.dump_formatted(f.get());
      f->flush(ds);
    } else {
      pg_recovery_stats.dump(s);
      ds << "dump pg recovery stats: " << s.str();
    }
  }

  else if (prefix == "reset_pg_recovery_stats") {
    ss << "reset pg recovery stats";
    pg_recovery_stats.reset();
  }

  else if (prefix == "perf histogram dump") {
    // NOTE(review): silently produces no output when no formatter was
    // requested — confirm that is intentional.
    std::string logger;
    std::string counter;
    cmd_getval(cct, cmdmap, "logger", logger);
    cmd_getval(cct, cmdmap, "counter", counter);
    if (f) {
      cct->get_perfcounters_collection()->dump_formatted_histograms(
	  f.get(), false, logger, counter);
      f->flush(ds);
    }
  }

  else if (prefix == "compact") {
    // synchronous objectstore compaction; can take a long time and slow
    // down in-flight requests (the COMMAND help warns about this)
    dout(1) << "triggering manual compaction" << dendl;
    auto start = ceph::coarse_mono_clock::now();
    store->compact();
    auto end = ceph::coarse_mono_clock::now();
    double duration = std::chrono::duration<double>(end-start).count();
    dout(1) << "finished manual compaction in "
            << duration
            << " seconds" << dendl;
    ss << "compacted omap in " << duration << " seconds";
  }

  else if (prefix == "smart") {
    string devid;
    cmd_getval(cct, cmdmap, "devid", devid);
    probe_smart(devid, ds);
  }

  else if (prefix == "cache drop") {
    dout(20) << "clearing all caches" << dendl;
    // Clear the objectstore's cache - onode and buffer for Bluestore,
    // system's pagecache for Filestore
    r = store->flush_cache(&ss);
    if (r < 0) {
      ds << "Error flushing objectstore cache: " << cpp_strerror(r);
      goto out;
    }
    // Clear the objectcontext cache (per PG)
    vector<PGRef> pgs;
    _get_pgs(&pgs);
    for (auto& pg: pgs) {
      pg->clear_cache();
    }
  }

  else if (prefix == "cache status") {
    // aggregate per-pg object context counts plus objectstore cache stats
    int obj_ctx_count = 0;
    vector<PGRef> pgs;
    _get_pgs(&pgs);
    for (auto& pg: pgs) {
      obj_ctx_count += pg->get_cache_obj_count();
    }
    if (f) {
      f->open_object_section("cache_status");
      f->dump_int("object_ctx", obj_ctx_count);
      store->dump_cache_stats(f.get());
      f->close_section();
      f->flush(ds);
    } else {
      ds << "object_ctx: " << obj_ctx_count;
      store->dump_cache_stats(ds);
    }
  }
  else if (prefix == "send_beacon") {
    if (is_active()) {
      send_beacon(ceph::coarse_mono_clock::now());
    }
  } else {
    ss << "unrecognized command '" << prefix << "'";
    r = -EINVAL;
  }

 out:
  return r;
}
6999
7000void OSD::probe_smart(const string& only_devid, ostream& ss)
7001{
7002 set<string> devnames;
7003 store->get_devices(&devnames);
7004 uint64_t smart_timeout = cct->_conf.get_val<uint64_t>(
7005 "osd_smart_report_timeout");
7006
7007 // == typedef std::map<std::string, mValue> mObject;
7008 json_spirit::mObject json_map;
7009
7010 for (auto dev : devnames) {
7011 // smartctl works only on physical devices; filter out any logical device
7012 if (dev.find("dm-") == 0) {
7013 continue;
7014 }
7015
7016 string err;
7017 string devid = get_device_id(dev, &err);
7018 if (devid.size() == 0) {
7019 dout(10) << __func__ << " no unique id for dev " << dev << " ("
7020 << err << "), skipping" << dendl;
7021 continue;
7022 }
7023 if (only_devid.size() && devid != only_devid) {
7024 continue;
7025 }
7026
7027 json_spirit::mValue smart_json;
7028 if (block_device_get_metrics(dev, smart_timeout,
7029 &smart_json)) {
7030 dout(10) << "block_device_get_metrics failed for /dev/" << dev << dendl;
7031 continue;
7032 }
7033 json_map[devid] = smart_json;
7c673cae 7034 }
11fdf7f2 7035 json_spirit::write(json_map, ss, json_spirit::pretty_print);
7c673cae
FG
7036}
7037
7038bool OSD::heartbeat_dispatch(Message *m)
7039{
7040 dout(30) << "heartbeat_dispatch " << m << dendl;
7041 switch (m->get_type()) {
7042
7043 case CEPH_MSG_PING:
7044 dout(10) << "ping from " << m->get_source_inst() << dendl;
7045 m->put();
7046 break;
7047
7048 case MSG_OSD_PING:
7049 handle_osd_ping(static_cast<MOSDPing*>(m));
7050 break;
7051
7052 default:
7053 dout(0) << "dropping unexpected message " << *m << " from " << m->get_source_inst() << dendl;
7054 m->put();
7055 }
7056
7057 return true;
7058}
7059
// Slow-path (non-fast-dispatch) message entry point; serializes message
// handling under osd_lock.  Always returns true (message consumed).
bool OSD::ms_dispatch(Message *m)
{
  dout(20) << "OSD::ms_dispatch: " << *m << dendl;
  // mark-me-down ack is handled lock-free before anything else
  if (m->get_type() == MSG_OSD_MARK_ME_DOWN) {
    service.got_stop_ack();
    m->put();
    return true;
  }

  // lock!

  osd_lock.Lock();
  if (is_stopping()) {
    osd_lock.Unlock();
    m->put();
    return true;
  }

  // flush any previously-deferred ops before handling the new message
  do_waiters();
  _dispatch(m);

  osd_lock.Unlock();

  return true;
}
7085
// Share recent osdmap epochs with the peer that sent 'op', at most once
// per op (guarded by op->check_send_map).  'session' tracks the last
// epoch we believe the peer has seen.
void OSD::maybe_share_map(
  Session *session,
  OpRequestRef op,
  OSDMapRef osdmap)
{
  if (!op->check_send_map) {
    return;
  }
  epoch_t last_sent_epoch = 0;

  session->sent_epoch_lock.lock();
  last_sent_epoch = session->last_sent_epoch;
  session->sent_epoch_lock.unlock();

  // assume the peer has the newer of the op's sent_epoch and what
  // we think we sent them.
  epoch_t from = std::max(last_sent_epoch, op->sent_epoch);

  const Message *m = op->get_req();
  // NOTE(review): 'session' is dereferenced unconditionally above, yet the
  // ternary below still guards against a null session — one of the two is
  // redundant (or the null check belongs earlier); confirm callers never
  // pass a null session.
  service.share_map(
    m->get_source(),
    m->get_connection().get(),
    from,
    osdmap,
    session ? &last_sent_epoch : NULL);

  // share_map may have advanced last_sent_epoch via the out-pointer;
  // fold the larger value back into the session.
  session->sent_epoch_lock.lock();
  if (session->last_sent_epoch < last_sent_epoch) {
    session->last_sent_epoch = last_sent_epoch;
  }
  session->sent_epoch_lock.unlock();

  op->check_send_map = false;
}
7120
11fdf7f2 7121void OSD::dispatch_session_waiting(SessionRef session, OSDMapRef osdmap)
7c673cae 7122{
11fdf7f2 7123 ceph_assert(session->session_dispatch_lock.is_locked());
7c673cae
FG
7124
7125 auto i = session->waiting_on_map.begin();
7126 while (i != session->waiting_on_map.end()) {
7127 OpRequestRef op = &(*i);
11fdf7f2 7128 ceph_assert(ms_can_fast_dispatch(op->get_req()));
7c673cae
FG
7129 const MOSDFastDispatchOp *m = static_cast<const MOSDFastDispatchOp*>(
7130 op->get_req());
7131 if (m->get_min_epoch() > osdmap->get_epoch()) {
7132 break;
7133 }
7134 session->waiting_on_map.erase(i++);
7135 op->put();
7136
7137 spg_t pgid;
7138 if (m->get_type() == CEPH_MSG_OSD_OP) {
7139 pg_t actual_pgid = osdmap->raw_pg_to_pg(
7140 static_cast<const MOSDOp*>(m)->get_pg());
7141 if (!osdmap->get_primary_shard(actual_pgid, &pgid)) {
7142 continue;
7143 }
7144 } else {
7145 pgid = m->get_spg();
7146 }
11fdf7f2 7147 enqueue_op(pgid, std::move(op), m->get_map_epoch());
7c673cae
FG
7148 }
7149
7150 if (session->waiting_on_map.empty()) {
7151 clear_session_waiting_on_map(session);
7152 } else {
7153 register_session_waiting_on_map(session);
7154 }
7155}
7156
// Fast-dispatch entry point: handles messages that must not block on
// osd_lock.  Peering/control messages are routed directly; client ops are
// wrapped in an OpRequest and queued to the op shards.
void OSD::ms_fast_dispatch(Message *m)
{
  FUNCTRACE(cct);
  if (service.is_stopping()) {
    m->put();
    return;
  }

  // peering event?
  switch (m->get_type()) {
  case CEPH_MSG_PING:
    dout(10) << "ping from " << m->get_source() << dendl;
    m->put();
    return;
  case MSG_MON_COMMAND:
    handle_command(static_cast<MMonCommand*>(m));
    return;
  case MSG_OSD_FORCE_RECOVERY:
    handle_fast_force_recovery(static_cast<MOSDForceRecovery*>(m));
    return;
  case MSG_OSD_SCRUB2:
    handle_fast_scrub(static_cast<MOSDScrub2*>(m));
    return;

  case MSG_OSD_PG_CREATE2:
    return handle_fast_pg_create(static_cast<MOSDPGCreate2*>(m));
  case MSG_OSD_PG_QUERY:
    return handle_fast_pg_query(static_cast<MOSDPGQuery*>(m));
  case MSG_OSD_PG_NOTIFY:
    return handle_fast_pg_notify(static_cast<MOSDPGNotify*>(m));
  case MSG_OSD_PG_INFO:
    return handle_fast_pg_info(static_cast<MOSDPGInfo*>(m));
  case MSG_OSD_PG_REMOVE:
    return handle_fast_pg_remove(static_cast<MOSDPGRemove*>(m));

  // these are single-pg messages that handle themselves
  case MSG_OSD_PG_LOG:
  case MSG_OSD_PG_TRIM:
  case MSG_OSD_BACKFILL_RESERVE:
  case MSG_OSD_RECOVERY_RESERVE:
    {
      MOSDPeeringOp *pm = static_cast<MOSDPeeringOp*>(m);
      if (require_osd_peer(pm)) {
	enqueue_peering_evt(
	  pm->get_spg(),
	  PGPeeringEventRef(pm->get_event()));
      }
      pm->put();
      return;
    }
  }

  // everything below is a client/replica op: wrap it in an OpRequest
  OpRequestRef op = op_tracker.create_request<OpRequest, Message*>(m);
  {
#ifdef WITH_LTTNG
    osd_reqid_t reqid = op->get_reqid();
#endif
    tracepoint(osd, ms_fast_dispatch, reqid.name._type,
        reqid.name._num, reqid.tid, reqid.inc);
  }

  if (m->trace)
    op->osd_trace.init("osd op", &trace_endpoint, &m->trace);

  // note sender epoch, min req's epoch
  op->sent_epoch = static_cast<MOSDFastDispatchOp*>(m)->get_map_epoch();
  op->min_epoch = static_cast<MOSDFastDispatchOp*>(m)->get_min_epoch();
  ceph_assert(op->min_epoch <= op->sent_epoch); // sanity check!

  service.maybe_inject_dispatch_delay();

  if (m->get_connection()->has_features(CEPH_FEATUREMASK_RESEND_ON_SPLIT) ||
      m->get_type() != CEPH_MSG_OSD_OP) {
    // queue it directly
    enqueue_op(
      static_cast<MOSDFastDispatchOp*>(m)->get_spg(),
      std::move(op),
      static_cast<MOSDFastDispatchOp*>(m)->get_map_epoch());
  } else {
    // legacy client, and this is an MOSDOp (the *only* fast dispatch
    // message that didn't have an explicit spg_t); we need to map
    // them to an spg_t while preserving delivery order.
    auto priv = m->get_connection()->get_priv();
    if (auto session = static_cast<Session*>(priv.get()); session) {
      std::lock_guard l{session->session_dispatch_lock};
      // extra ref for the intrusive waiting_on_map list; released in
      // dispatch_session_waiting when the op is unlinked
      op->get();
      session->waiting_on_map.push_back(*op);
      OSDMapRef nextmap = service.get_nextmap_reserved();
      dispatch_session_waiting(session, nextmap);
      service.release_map(nextmap);
    }
  }
  OID_EVENT_TRACE_WITH_MSG(m, "MS_FAST_DISPATCH_END", false);
}
7251
11fdf7f2 7252bool OSD::ms_get_authorizer(int dest_type, AuthAuthorizer **authorizer)
7c673cae
FG
7253{
7254 dout(10) << "OSD::ms_get_authorizer type=" << ceph_entity_type_name(dest_type) << dendl;
7255
31f18b77
FG
7256 if (is_stopping()) {
7257 dout(10) << __func__ << " bailing, we are shutting down" << dendl;
7258 return false;
7259 }
7260
7c673cae
FG
7261 if (dest_type == CEPH_ENTITY_TYPE_MON)
7262 return true;
7263
7c673cae
FG
7264 *authorizer = monc->build_authorizer(dest_type);
7265 return *authorizer != NULL;
7266}
7267
11fdf7f2
TL
// Keystore used to verify incoming auth v1 authorizers.
// Returns a non-owning pointer; monc retains ownership of the secrets.
KeyStore *OSD::ms_get_auth1_authorizer_keystore()
{
  return monc->rotating_secrets.get();
}
7c673cae 7272
11fdf7f2 7273int OSD::ms_handle_authentication(Connection *con)
7c673cae 7274{
11fdf7f2
TL
7275 int ret = 0;
7276 auto priv = con->get_priv();
7277 Session *s = static_cast<Session*>(priv.get());
7278 if (!s) {
7279 s = new Session(cct, con);
7280 con->set_priv(RefCountedPtr{s, false});
7281 s->entity_name = con->get_peer_entity_name();
7282 dout(10) << __func__ << " new session " << s << " con " << s->con
7283 << " entity " << s->entity_name
7284 << " addr " << con->get_peer_addrs() << dendl;
7285 } else {
7286 dout(10) << __func__ << " existing session " << s << " con " << s->con
7287 << " entity " << s->entity_name
7288 << " addr " << con->get_peer_addrs() << dendl;
7c673cae
FG
7289 }
7290
11fdf7f2
TL
7291 AuthCapsInfo &caps_info = con->get_peer_caps_info();
7292 if (caps_info.allow_all)
7293 s->caps.set_allow_all();
7c673cae 7294
11fdf7f2
TL
7295 if (caps_info.caps.length() > 0) {
7296 bufferlist::const_iterator p = caps_info.caps.cbegin();
7297 string str;
7298 try {
7299 decode(str, p);
7300 }
7301 catch (buffer::error& e) {
7302 dout(10) << __func__ << " session " << s << " " << s->entity_name
7303 << " failed to decode caps string" << dendl;
7304 ret = -EPERM;
7305 }
7306 if (!ret) {
7c673cae 7307 bool success = s->caps.parse(str);
11fdf7f2
TL
7308 if (success) {
7309 dout(10) << __func__ << " session " << s
7310 << " " << s->entity_name
7311 << " has caps " << s->caps << " '" << str << "'" << dendl;
7312 ret = 1;
7313 } else {
7314 dout(10) << __func__ << " session " << s << " " << s->entity_name
7315 << " failed to parse caps '" << str << "'" << dendl;
7316 ret = -EPERM;
7317 }
7c673cae 7318 }
7c673cae 7319 }
11fdf7f2 7320 return ret;
7c673cae
FG
7321}
7322
7323void OSD::do_waiters()
7324{
11fdf7f2 7325 ceph_assert(osd_lock.is_locked());
7c673cae
FG
7326
7327 dout(10) << "do_waiters -- start" << dendl;
7328 while (!finished.empty()) {
7329 OpRequestRef next = finished.front();
7330 finished.pop_front();
7331 dispatch_op(next);
7332 }
7333 dout(10) << "do_waiters -- finish" << dendl;
7334}
7335
7336void OSD::dispatch_op(OpRequestRef op)
7337{
7338 switch (op->get_req()->get_type()) {
7339
7340 case MSG_OSD_PG_CREATE:
7341 handle_pg_create(op);
7342 break;
7c673cae
FG
7343 }
7344}
7345
// Handle one slow-path message.  Caller (ms_dispatch) holds osd_lock.
// NOTE(review): there is no default case — message types not listed here
// are neither handled nor put(); confirm every type routed to _dispatch
// is covered.
void OSD::_dispatch(Message *m)
{
  ceph_assert(osd_lock.is_locked());
  dout(20) << "_dispatch " << m << " " << *m << dendl;

  switch (m->get_type()) {
  // -- don't need OSDMap --

  // map and replication
  case CEPH_MSG_OSD_MAP:
    handle_osd_map(static_cast<MOSDMap*>(m));
    break;

  // osd
  case MSG_OSD_SCRUB:
    handle_scrub(static_cast<MOSDScrub*>(m));
    break;

  case MSG_COMMAND:
    handle_command(static_cast<MCommand*>(m));
    return;

  // -- need OSDMap --

  case MSG_OSD_PG_CREATE:
    {
      // wrap in an OpRequest so it can be deferred until a map arrives
      OpRequestRef op = op_tracker.create_request<OpRequest, Message*>(m);
      if (m->trace)
        op->osd_trace.init("osd op", &trace_endpoint, &m->trace);
      // no map? starting up?
      if (!osdmap) {
	dout(7) << "no OSDMap, not booted" << dendl;
	logger->inc(l_osd_waiting_for_map);
	waiting_for_osdmap.push_back(op);
	op->mark_delayed("no osdmap");
	break;
      }

      // need OSDMap
      dispatch_op(op);
    }
  }
}
7389
11fdf7f2 7390// remove me post-nautilus
7c673cae
FG
// Legacy (pre-MOSDScrub2) scrub request from the mon/mgr: queue a
// RequestScrub peering event for the requested pgs (or all local pgs
// when none are listed).  The file marks this path "remove me
// post-nautilus".
void OSD::handle_scrub(MOSDScrub *m)
{
  dout(10) << "handle_scrub " << *m << dendl;
  // only mon/mgr peers may trigger scrubs
  if (!require_mon_or_mgr_peer(m)) {
    m->put();
    return;
  }
  if (m->fsid != monc->get_fsid()) {
    dout(0) << "handle_scrub fsid " << m->fsid << " != " << monc->get_fsid()
	    << dendl;
    m->put();
    return;
  }

  // start from all pgs we host
  vector<spg_t> spgs;
  _get_pgids(&spgs);

  if (!m->scrub_pgs.empty()) {
    // restrict to the requested pgs that map to a local primary shard
    vector<spg_t> v;
    for (auto pgid : m->scrub_pgs) {
      spg_t pcand;
      if (osdmap->get_primary_shard(pgid, &pcand) &&
	  std::find(spgs.begin(), spgs.end(), pcand) != spgs.end()) {
	v.push_back(pcand);
      }
    }
    spgs.swap(v);
  }

  for (auto pgid : spgs) {
    enqueue_peering_evt(
      pgid,
      PGPeeringEventRef(
	std::make_shared<PGPeeringEvent>(
	  get_osdmap_epoch(),
	  get_osdmap_epoch(),
	  PG::RequestScrub(m->deep, m->repair))));
  }

  m->put();
}
7432
11fdf7f2
TL
7433void OSD::handle_fast_scrub(MOSDScrub2 *m)
7434{
7435 dout(10) << __func__ << " " << *m << dendl;
7436 if (!require_mon_or_mgr_peer(m)) {
7437 m->put();
7438 return;
7439 }
7440 if (m->fsid != monc->get_fsid()) {
7441 dout(0) << __func__ << " fsid " << m->fsid << " != " << monc->get_fsid()
7442 << dendl;
7443 m->put();
7444 return;
7445 }
7446 for (auto pgid : m->scrub_pgs) {
7447 enqueue_peering_evt(
7448 pgid,
7449 PGPeeringEventRef(
7450 std::make_shared<PGPeeringEvent>(
7451 m->epoch,
7452 m->epoch,
7453 PG::RequestScrub(m->deep, m->repair))));
7454 }
7455 m->put();
7456}
7457
7c673cae
FG
7458bool OSD::scrub_random_backoff()
7459{
7460 bool coin_flip = (rand() / (double)RAND_MAX >=
7461 cct->_conf->osd_scrub_backoff_ratio);
7462 if (!coin_flip) {
7463 dout(20) << "scrub_random_backoff lost coin flip, randomly backing off" << dendl;
7464 return true;
7465 }
7466 return false;
7467}
7468
// Build a scrub-schedule entry for pg 'pg'.
//
// timestamp: base time the schedule is computed from (both sched_time
//   and deadline start here).
// pool_scrub_{min,max}_interval: per-pool overrides; values <= 0 fall
//   back to the global osd_scrub_{min,max}_interval config.
// must: when true, the scrub was explicitly requested and runs at
//   'timestamp' with no added delay.
OSDService::ScrubJob::ScrubJob(CephContext* cct,
			       const spg_t& pg, const utime_t& timestamp,
			       double pool_scrub_min_interval,
			       double pool_scrub_max_interval, bool must)
  : cct(cct),
    pgid(pg),
    sched_time(timestamp),
    deadline(timestamp)
{
  // if not explicitly requested, postpone the scrub with a random delay
  if (!must) {
    double scrub_min_interval = pool_scrub_min_interval > 0 ?
      pool_scrub_min_interval : cct->_conf->osd_scrub_min_interval;
    double scrub_max_interval = pool_scrub_max_interval > 0 ?
      pool_scrub_max_interval : cct->_conf->osd_scrub_max_interval;

    // earliest start = base + min interval + random jitter of up to
    // (min interval * osd_scrub_interval_randomize_ratio)
    sched_time += scrub_min_interval;
    double r = rand() / (double)RAND_MAX;
    sched_time +=
      scrub_min_interval * cct->_conf->osd_scrub_interval_randomize_ratio * r;
    // a max interval of 0 means "no deadline"
    if (scrub_max_interval == 0) {
      deadline = utime_t();
    } else {
      deadline += scrub_max_interval;
    }

  }
}
7497
7498bool OSDService::ScrubJob::ScrubJob::operator<(const OSDService::ScrubJob& rhs) const {
7499 if (sched_time < rhs.sched_time)
7500 return true;
7501 if (sched_time > rhs.sched_time)
7502 return false;
7503 return pgid < rhs.pgid;
7504}
7505
7506bool OSD::scrub_time_permit(utime_t now)
7507{
7508 struct tm bdt;
7509 time_t tt = now.sec();
7510 localtime_r(&tt, &bdt);
28e407b8
AA
7511
7512 bool day_permit = false;
7513 if (cct->_conf->osd_scrub_begin_week_day < cct->_conf->osd_scrub_end_week_day) {
7514 if (bdt.tm_wday >= cct->_conf->osd_scrub_begin_week_day && bdt.tm_wday < cct->_conf->osd_scrub_end_week_day) {
7515 day_permit = true;
7516 }
7517 } else {
7518 if (bdt.tm_wday >= cct->_conf->osd_scrub_begin_week_day || bdt.tm_wday < cct->_conf->osd_scrub_end_week_day) {
7519 day_permit = true;
7520 }
7521 }
7522
7523 if (!day_permit) {
7524 dout(20) << __func__ << " should run between week day " << cct->_conf->osd_scrub_begin_week_day
7525 << " - " << cct->_conf->osd_scrub_end_week_day
7526 << " now " << bdt.tm_wday << " = no" << dendl;
7527 return false;
7528 }
7529
7c673cae
FG
7530 bool time_permit = false;
7531 if (cct->_conf->osd_scrub_begin_hour < cct->_conf->osd_scrub_end_hour) {
7532 if (bdt.tm_hour >= cct->_conf->osd_scrub_begin_hour && bdt.tm_hour < cct->_conf->osd_scrub_end_hour) {
7533 time_permit = true;
7534 }
7535 } else {
7536 if (bdt.tm_hour >= cct->_conf->osd_scrub_begin_hour || bdt.tm_hour < cct->_conf->osd_scrub_end_hour) {
7537 time_permit = true;
7538 }
7539 }
7540 if (!time_permit) {
7541 dout(20) << __func__ << " should run between " << cct->_conf->osd_scrub_begin_hour
7542 << " - " << cct->_conf->osd_scrub_end_hour
7543 << " now " << bdt.tm_hour << " = no" << dendl;
7544 } else {
7545 dout(20) << __func__ << " should run between " << cct->_conf->osd_scrub_begin_hour
7546 << " - " << cct->_conf->osd_scrub_end_hour
7547 << " now " << bdt.tm_hour << " = yes" << dendl;
7548 }
7549 return time_permit;
7550}
7551
7552bool OSD::scrub_load_below_threshold()
7553{
7554 double loadavgs[3];
7555 if (getloadavg(loadavgs, 3) != 3) {
7556 dout(10) << __func__ << " couldn't read loadavgs\n" << dendl;
7557 return false;
7558 }
7559
7560 // allow scrub if below configured threshold
91327a77
AA
7561 long cpus = sysconf(_SC_NPROCESSORS_ONLN);
7562 double loadavg_per_cpu = cpus > 0 ? loadavgs[0] / cpus : loadavgs[0];
7563 if (loadavg_per_cpu < cct->_conf->osd_scrub_load_threshold) {
7564 dout(20) << __func__ << " loadavg per cpu " << loadavg_per_cpu
7c673cae
FG
7565 << " < max " << cct->_conf->osd_scrub_load_threshold
7566 << " = yes" << dendl;
7567 return true;
7568 }
7569
7570 // allow scrub if below daily avg and currently decreasing
7571 if (loadavgs[0] < daily_loadavg && loadavgs[0] < loadavgs[2]) {
7572 dout(20) << __func__ << " loadavg " << loadavgs[0]
7573 << " < daily_loadavg " << daily_loadavg
7574 << " and < 15m avg " << loadavgs[2]
7575 << " = yes" << dendl;
7576 return true;
7577 }
7578
7579 dout(20) << __func__ << " loadavg " << loadavgs[0]
7580 << " >= max " << cct->_conf->osd_scrub_load_threshold
7581 << " and ( >= daily_loadavg " << daily_loadavg
7582 << " or >= 15m avg " << loadavgs[2]
7583 << ") = no" << dendl;
7584 return false;
7585}
7586
/**
 * Walk the scrub schedule (ordered by sched_time) and kick off scrubs
 * for PGs whose time has come, subject to the global scrub-slot limit,
 * recovery state, the time window, and system load.
 *
 * Iteration stops at the first job scheduled in the future (the schedule
 * is time-ordered) or once a PG accepts the scrub.  Jobs that are merely
 * blocked by time-of-day/load (and not past their deadline) are skipped
 * with `continue` so later, possibly overdue, jobs still get a chance.
 */
void OSD::sched_scrub()
{
  // if not permitted, fail fast
  if (!service.can_inc_scrubs_pending()) {
    return;
  }
  if (!cct->_conf->osd_scrub_during_recovery && service.is_recovery_active()) {
    dout(20) << __func__ << " not scheduling scrubs due to active recovery" << dendl;
    return;
  }

  utime_t now = ceph_clock_now();
  bool time_permit = scrub_time_permit(now);
  bool load_is_low = scrub_load_below_threshold();
  dout(20) << "sched_scrub load_is_low=" << (int)load_is_low << dendl;

  OSDService::ScrubJob scrub;
  if (service.first_scrub_stamp(&scrub)) {
    do {
      dout(30) << "sched_scrub examine " << scrub.pgid << " at " << scrub.sched_time << dendl;

      if (scrub.sched_time > now) {
	// save ourselves some effort
	// (schedule is sorted, so everything after this is also in the future)
	dout(10) << "sched_scrub " << scrub.pgid << " scheduled at " << scrub.sched_time
		 << " > " << now << dendl;
	break;
      }

      // a job past its (non-zero) deadline may run even outside the
      // time window / under high load; otherwise require both
      if ((scrub.deadline.is_zero() || scrub.deadline >= now) && !(time_permit && load_is_low)) {
	dout(10) << __func__ << " not scheduling scrub for " << scrub.pgid << " due to "
		 << (!time_permit ? "time not permit" : "high load") << dendl;
	continue;
      }

      // returns the PG locked; may be null if the PG went away
      PGRef pg = _lookup_lock_pg(scrub.pgid);
      if (!pg)
	continue;
      dout(10) << "sched_scrub scrubbing " << scrub.pgid << " at " << scrub.sched_time
	       << (pg->get_must_scrub() ? ", explicitly requested" :
		   (load_is_low ? ", load_is_low" : " deadline < now"))
	       << dendl;
      if (pg->sched_scrub()) {
	// the PG accepted the scrub; one per tick, so stop here
	pg->unlock();
	break;
      }
      pg->unlock();
    } while (service.next_scrub_stamp(scrub, &scrub));
  }
  dout(20) << "sched_scrub done" << dendl;
}
7638
11fdf7f2
TL
7639MPGStats* OSD::collect_pg_stats()
7640{
7641 // This implementation unconditionally sends every is_primary PG's
7642 // stats every time we're called. This has equivalent cost to the
7643 // previous implementation's worst case where all PGs are busy and
7644 // their stats are always enqueued for sending.
7645 RWLock::RLocker l(map_lock);
7646
7647 utime_t had_for = ceph_clock_now() - had_map_since;
7648 osd_stat_t cur_stat = service.get_osd_stat();
7649 cur_stat.os_perf_stat = store->get_cur_stats();
7650
7651 auto m = new MPGStats(monc->get_fsid(), osdmap->get_epoch(), had_for);
7652 m->osd_stat = cur_stat;
7653
7654 std::lock_guard lec{min_last_epoch_clean_lock};
7655 min_last_epoch_clean = osdmap->get_epoch();
7656 min_last_epoch_clean_pgs.clear();
7657
7658 std::set<int64_t> pool_set;
7659 vector<PGRef> pgs;
7660 _get_pgs(&pgs);
7661 for (auto& pg : pgs) {
7662 auto pool = pg->pg_id.pgid.pool();
7663 pool_set.emplace((int64_t)pool);
7664 if (!pg->is_primary()) {
7665 continue;
7666 }
7667 pg->get_pg_stats([&](const pg_stat_t& s, epoch_t lec) {
7668 m->pg_stat[pg->pg_id.pgid] = s;
7669 min_last_epoch_clean = min(min_last_epoch_clean, lec);
7670 min_last_epoch_clean_pgs.push_back(pg->pg_id.pgid);
7671 });
7672 }
7673 store_statfs_t st;
7674 for (auto p : pool_set) {
7675 int r = store->pool_statfs(p, &st);
7676 if (r == -ENOTSUP) {
7677 break;
7678 } else {
7679 assert(r >= 0);
7680 m->pool_stat[p] = st;
7681 }
7682 }
7c673cae 7683
11fdf7f2
TL
7684 return m;
7685}
7c673cae 7686
/**
 * Collect daemon health metrics for reporting (to the mgr, presumably —
 * confirm against callers): the number of in-flight ops older than
 * osd_op_complaint_time, and the number of pending PG creations.
 */
vector<DaemonHealthMetric> OSD::get_health_metrics()
{
  vector<DaemonHealthMetric> metrics;
  {
    utime_t oldest_secs;
    const utime_t now = ceph_clock_now();
    auto too_old = now;
    too_old -= cct->_conf.get_val<double>("osd_op_complaint_time");
    int slow = 0;
    TrackedOpRef oldest_op;
    // counts ops initiated before the complaint cutoff and remembers the
    // oldest one; the return value feeds visit_ops_in_flight's protocol
    auto count_slow_ops = [&](TrackedOp& op) {
      if (op.get_initiated() < too_old) {
	lgeneric_subdout(cct,osd,20) << "slow op " << op.get_desc()
	                             << " initiated "
	                             << op.get_initiated() << dendl;
	slow++;
	if (!oldest_op || op.get_initiated() < oldest_op->get_initiated()) {
	  oldest_op = &op;
	}
	return true;
      } else {
	return false;
      }
    };
    if (op_tracker.visit_ops_in_flight(&oldest_secs, count_slow_ops)) {
      if (slow) {
	derr << __func__ << " reporting " << slow << " slow ops, oldest is "
	     << oldest_op->get_desc() << dendl;
      }
      metrics.emplace_back(daemon_metric::SLOW_OPS, slow, oldest_secs);
    } else {
      // no news is not good news.
      // (op tracking disabled/unavailable — report zero rather than nothing)
      metrics.emplace_back(daemon_metric::SLOW_OPS, 0, 0);
    }
  }
  {
    std::lock_guard l(pending_creates_lock);
    // primaries pending from the mon, plus OSD-side creates flagged primary
    auto n_primaries = pending_creates_from_mon;
    for (const auto& create : pending_creates_from_osd) {
      if (create.second) {
	n_primaries++;
      }
    }
    metrics.emplace_back(daemon_metric::PENDING_CREATING_PGS, n_primaries);
  }
  return metrics;
}
7734
7c673cae
FG
7735// =====================================================
7736// MAP
7737
/**
 * Park an op until a newer OSDMap arrives.  Subscribes for the next
 * epoch only when this is the first waiter (an empty queue means no
 * subscription is outstanding yet), then queues the op and marks it
 * delayed for op tracking.
 */
void OSD::wait_for_new_map(OpRequestRef op)
{
  // ask?  (only the first waiter triggers the subscription)
  if (waiting_for_osdmap.empty()) {
    osdmap_subscribe(osdmap->get_epoch() + 1, false);
  }

  logger->inc(l_osd_waiting_for_map);
  waiting_for_osdmap.push_back(op);
  op->mark_delayed("wait for new map");
}
7749
7750
7751/** update_map
7752 * assimilate new OSDMap(s). scan pgs, etc.
7753 */
7754
7755void OSD::note_down_osd(int peer)
7756{
11fdf7f2
TL
7757 ceph_assert(osd_lock.is_locked());
7758 cluster_messenger->mark_down_addrs(osdmap->get_cluster_addrs(peer));
7c673cae
FG
7759
7760 heartbeat_lock.Lock();
7761 failure_queue.erase(peer);
7762 failure_pending.erase(peer);
7763 map<int,HeartbeatInfo>::iterator p = heartbeat_peers.find(peer);
7764 if (p != heartbeat_peers.end()) {
7765 p->second.con_back->mark_down();
7766 if (p->second.con_front) {
7767 p->second.con_front->mark_down();
7768 }
7769 heartbeat_peers.erase(p);
7770 }
7771 heartbeat_lock.Unlock();
7772}
7773
/**
 * React to a peer OSD coming (back) up in the newly consumed map: drop
 * our cached notion of its epoch (it may have restarted) and flag the
 * heartbeat peer set for refresh.
 */
void OSD::note_up_osd(int peer)
{
  service.forget_peer_epoch(peer, osdmap->get_epoch() - 1);
  heartbeat_set_peers_need_update();
}
7779
/**
 * Completion context queued on the map-storing transaction in
 * handle_osd_map(); once the maps are durable it calls back into
 * OSD::_committed_osd_maps() and releases the message reference held
 * across the commit.
 */
struct C_OnMapCommit : public Context {
  OSD *osd;
  epoch_t first, last;   // epoch range that was persisted
  MOSDMap *msg;          // ref held until commit completes
  C_OnMapCommit(OSD *o, epoch_t f, epoch_t l, MOSDMap *m)
    : osd(o), first(f), last(l), msg(m) {}
  void finish(int r) override {
    osd->_committed_osd_maps(first, last, msg);
    msg->put();
  }
};
7791
7c673cae
FG
7792void OSD::osdmap_subscribe(version_t epoch, bool force_request)
7793{
11fdf7f2 7794 std::lock_guard l(osdmap_subscribe_lock);
181888fb 7795 if (latest_subscribed_epoch >= epoch && !force_request)
7c673cae
FG
7796 return;
7797
11fdf7f2 7798 latest_subscribed_epoch = std::max<uint64_t>(epoch, latest_subscribed_epoch);
181888fb 7799
7c673cae
FG
7800 if (monc->sub_want_increment("osdmap", epoch, CEPH_SUBSCRIBE_ONETIME) ||
7801 force_request) {
7802 monc->renew_subs();
7803 }
7804}
7805
/**
 * Delete stored OSDMaps (full + incremental) older than both @oldest and
 * the lowest epoch still pinned in the map cache, advancing
 * superblock.oldest_map as we go.  Deletions are batched into
 * transactions of at most osd_target_transaction_size entries (and at
 * least @nreceived, so trimming keeps pace with the incoming map rate).
 *
 * @param oldest    oldest epoch the sender still has (m->oldest_map)
 * @param nreceived number of maps in the message that triggered the trim
 * @param skip_maps true when the incoming message skipped epochs; in that
 *                  case keep looping to remove the whole stale range
 *                  instead of stopping after one batch
 */
void OSD::trim_maps(epoch_t oldest, int nreceived, bool skip_maps)
{
  epoch_t min = std::min(oldest, service.map_cache.cached_key_lower_bound());
  if (min <= superblock.oldest_map)
    return;

  int num = 0;
  ObjectStore::Transaction t;
  for (epoch_t e = superblock.oldest_map; e < min; ++e) {
    dout(20) << " removing old osdmap epoch " << e << dendl;
    t.remove(coll_t::meta(), get_osdmap_pobject_name(e));
    t.remove(coll_t::meta(), get_inc_osdmap_pobject_name(e));
    superblock.oldest_map = e + 1;
    num++;
    if (num >= cct->_conf->osd_target_transaction_size && num >= nreceived) {
      // flush this batch: superblock first in the same transaction so
      // oldest_map stays consistent with what is actually deleted
      service.publish_superblock(superblock);
      write_superblock(t);
      int tr = store->queue_transaction(service.meta_ch, std::move(t), nullptr);
      ceph_assert(tr == 0);
      num = 0;
      if (!skip_maps) {
	// skip_maps leaves us with a range of old maps if we fail to remove all
	// of them before moving superblock.oldest_map forward to the first map
	// in the incoming MOSDMap msg.  so we should continue removing them in
	// this case, even we could do huge series of delete transactions all at
	// once.
	break;
      }
    }
  }
  if (num > 0) {
    // final partial batch
    service.publish_superblock(superblock);
    write_superblock(t);
    int tr = store->queue_transaction(service.meta_ch, std::move(t), nullptr);
    ceph_assert(tr == 0);
  }
  // we should not remove the cached maps
  ceph_assert(min <= service.map_cache.cached_key_lower_bound());
}
7845
/**
 * Ingest an MOSDMap message: validate the sender, decode/store the new
 * full and incremental maps, record pg_num/pool history, trim old maps,
 * update the superblock, and queue everything in one transaction whose
 * commit callback (C_OnMapCommit) advances the live osdmap.
 *
 * Consumes the message reference on every early-return path; otherwise
 * the ref is handed to C_OnMapCommit.  Caller holds osd_lock (asserted
 * below, after the possibly-blocking catch-up wait).
 */
void OSD::handle_osd_map(MOSDMap *m)
{
  // wait for pgs to catch up
  {
    // we extend the map cache pins to accommodate pgs slow to consume maps
    // for some period, until we hit the max_lag_factor bound, at which point
    // we block here to stop ingesting more maps than they are able to keep
    // up with.
    epoch_t max_lag = cct->_conf->osd_map_cache_size *
      m_osd_pg_epoch_max_lag_factor;
    ceph_assert(max_lag > 0);
    epoch_t osd_min = 0;
    for (auto shard : shards) {
      epoch_t min = shard->get_min_pg_epoch();
      if (osd_min == 0 || min < osd_min) {
	osd_min = min;
      }
    }
    if (osd_min > 0 &&
	osdmap->get_epoch() > max_lag &&
	osdmap->get_epoch() - max_lag > osd_min) {
      epoch_t need = osdmap->get_epoch() - max_lag;
      dout(10) << __func__ << " waiting for pgs to catch up (need " << need
	       << " max_lag " << max_lag << ")" << dendl;
      for (auto shard : shards) {
	epoch_t min = shard->get_min_pg_epoch();
	if (need > min) {
	  dout(10) << __func__ << " waiting for pgs to consume " << need
		   << " (shard " << shard->shard_id << " min " << min
		   << ", map cache is " << cct->_conf->osd_map_cache_size
		   << ", max_lag_factor " << m_osd_pg_epoch_max_lag_factor
		   << ")" << dendl;
	  // drop osd_lock while blocking so PGs can make progress
	  unlock_guard unlock{osd_lock};
	  shard->wait_min_pg_epoch(need);
	}
      }
    }
  }

  ceph_assert(osd_lock.is_locked());
  map<epoch_t,OSDMapRef> added_maps;
  map<epoch_t,bufferlist> added_maps_bl;
  if (m->fsid != monc->get_fsid()) {
    dout(0) << "handle_osd_map fsid " << m->fsid << " != "
	    << monc->get_fsid() << dendl;
    m->put();
    return;
  }
  if (is_initializing()) {
    dout(0) << "ignoring osdmap until we have initialized" << dendl;
    m->put();
    return;
  }

  // only accept maps from monitors or other OSDs
  auto priv = m->get_connection()->get_priv();
  if (auto session = static_cast<Session *>(priv.get());
      session && !(session->entity_name.is_mon() ||
		   session->entity_name.is_osd())) {
    //not enough perms!
    dout(10) << "got osd map from Session " << session
             << " which we can't take maps from (not a mon or osd)" << dendl;
    m->put();
    return;
  }

  // share with the objecter
  if (!is_preboot())
    service.objecter->handle_osd_map(m);

  epoch_t first = m->get_first();
  epoch_t last = m->get_last();
  dout(3) << "handle_osd_map epochs [" << first << "," << last << "], i have "
	  << superblock.newest_map
	  << ", src has [" << m->oldest_map << "," << m->newest_map << "]"
	  << dendl;

  logger->inc(l_osd_map);
  logger->inc(l_osd_mape, last - first + 1);
  if (first <= superblock.newest_map)
    logger->inc(l_osd_mape_dup, superblock.newest_map - first + 1);
  if (service.max_oldest_map < m->oldest_map) {
    service.max_oldest_map = m->oldest_map;
    ceph_assert(service.max_oldest_map >= superblock.oldest_map);
  }

  // make sure there is something new, here, before we bother flushing
  // the queues and such
  if (last <= superblock.newest_map) {
    dout(10) << " no new maps here, dropping" << dendl;
    m->put();
    return;
  }

  // missing some?
  bool skip_maps = false;
  if (first > superblock.newest_map + 1) {
    dout(10) << "handle_osd_map message skips epochs "
	     << superblock.newest_map + 1 << ".." << (first-1) << dendl;
    if (m->oldest_map <= superblock.newest_map + 1) {
      // the sender still has the gap; re-subscribe to fill it
      osdmap_subscribe(superblock.newest_map + 1, false);
      m->put();
      return;
    }
    // always try to get the full range of maps--as many as we can.  this
    //  1- is good to have
    //  2- is at present the only way to ensure that we get a *full* map as
    //     the first map!
    if (m->oldest_map < first) {
      osdmap_subscribe(m->oldest_map - 1, true);
      m->put();
      return;
    }
    skip_maps = true;
  }

  ObjectStore::Transaction t;
  uint64_t txn_size = 0;

  // store new maps: queue for disk and put in the osdmap cache
  epoch_t start = std::max(superblock.newest_map + 1, first);
  for (epoch_t e = start; e <= last; e++) {
    // sanity: the transaction's byte count must grow monotonically
    if (txn_size >= t.get_num_bytes()) {
      derr << __func__ << " transaction size overflowed" << dendl;
      ceph_assert(txn_size < t.get_num_bytes());
    }
    txn_size = t.get_num_bytes();
    map<epoch_t,bufferlist>::iterator p;
    p = m->maps.find(e);
    if (p != m->maps.end()) {
      // full map supplied: decode, persist, and cache it
      dout(10) << "handle_osd_map got full map for epoch " << e << dendl;
      OSDMap *o = new OSDMap;
      bufferlist& bl = p->second;

      o->decode(bl);

      ghobject_t fulloid = get_osdmap_pobject_name(e);
      t.write(coll_t::meta(), fulloid, 0, bl.length(), bl);
      added_maps[e] = add_map(o);
      added_maps_bl[e] = bl;
      got_full_map(e);
      continue;
    }

    p = m->incremental_maps.find(e);
    if (p != m->incremental_maps.end()) {
      // incremental supplied: persist it, then rebuild the full map by
      // applying it to the previous epoch's full map
      dout(10) << "handle_osd_map got inc map for epoch " << e << dendl;
      bufferlist& bl = p->second;
      ghobject_t oid = get_inc_osdmap_pobject_name(e);
      t.write(coll_t::meta(), oid, 0, bl.length(), bl);

      OSDMap *o = new OSDMap;
      if (e > 1) {
	bufferlist obl;
	bool got = get_map_bl(e - 1, obl);
	if (!got) {
	  // previous full map was added earlier in this same message
	  auto p = added_maps_bl.find(e - 1);
	  ceph_assert(p != added_maps_bl.end());
	  obl = p->second;
	}
	o->decode(obl);
      }

      OSDMap::Incremental inc;
      auto p = bl.cbegin();
      inc.decode(p);
      if (o->apply_incremental(inc) < 0) {
	derr << "ERROR: bad fsid?  i have " << osdmap->get_fsid() << " and inc has " << inc.fsid << dendl;
	ceph_abort_msg("bad fsid");
      }

      bufferlist fbl;
      o->encode(fbl, inc.encode_features | CEPH_FEATURE_RESERVED);

      // optional fault injection for crc-mismatch handling (config-driven)
      bool injected_failure = false;
      if (cct->_conf->osd_inject_bad_map_crc_probability > 0 &&
	  (rand() % 10000) < cct->_conf->osd_inject_bad_map_crc_probability*10000.0) {
	derr << __func__ << " injecting map crc failure" << dendl;
	injected_failure = true;
      }

      if ((inc.have_crc && o->get_crc() != inc.full_crc) || injected_failure) {
	// our reconstructed full map doesn't match the mon's crc; fall
	// back to requesting full maps from epoch e onward and stop here
	dout(2) << "got incremental " << e
		<< " but failed to encode full with correct crc; requesting"
		<< dendl;
	clog->warn() << "failed to encode map e" << e << " with expected crc";
	dout(20) << "my encoded map was:\n";
	fbl.hexdump(*_dout);
	*_dout << dendl;
	delete o;
	request_full_map(e, last);
	last = e - 1;
	break;
      }
      got_full_map(e);

      ghobject_t fulloid = get_osdmap_pobject_name(e);
      t.write(coll_t::meta(), fulloid, 0, fbl.length(), fbl);
      added_maps[e] = add_map(o);
      added_maps_bl[e] = fbl;
      continue;
    }

    ceph_abort_msg("MOSDMap lied about what maps it had?");
  }

  // even if this map isn't from a mon, we may have satisfied our subscription
  monc->sub_got("osdmap", last);

  if (!m->maps.empty() && requested_full_first) {
    dout(10) << __func__ << " still missing full maps " << requested_full_first
	     << ".." << requested_full_last << dendl;
    rerequest_full_maps();
  }

  if (superblock.oldest_map) {
    // make sure we at least keep pace with incoming maps
    trim_maps(m->oldest_map, last - first + 1, skip_maps);
    pg_num_history.prune(superblock.oldest_map);
  }

  if (!superblock.oldest_map || skip_maps)
    superblock.oldest_map = first;
  superblock.newest_map = last;
  superblock.current_epoch = last;

  // note in the superblock that we were clean thru the prior epoch
  epoch_t boot_epoch = service.get_boot_epoch();
  if (boot_epoch && boot_epoch >= superblock.mounted) {
    superblock.mounted = boot_epoch;
    superblock.clean_thru = last;
  }

  // check for pg_num changes and deleted pools
  OSDMapRef lastmap;
  for (auto& i : added_maps) {
    if (!lastmap) {
      if (!(lastmap = service.try_get_map(i.first - 1))) {
	dout(10) << __func__ << " can't get previous map " << i.first - 1
		 << " probably first start of this osd" << dendl;
	continue;
      }
    }
    ceph_assert(lastmap->get_epoch() + 1 == i.second->get_epoch());
    for (auto& j : lastmap->get_pools()) {
      if (!i.second->have_pg_pool(j.first)) {
	pg_num_history.log_pool_delete(i.first, j.first);
	dout(10) << __func__ << " recording final pg_pool_t for pool "
		 << j.first << dendl;
	// this information is needed by _make_pg() if have to restart before
	// the pool is deleted and need to instantiate a new (zombie) PG[Pool].
	ghobject_t obj = make_final_pool_info_oid(j.first);
	bufferlist bl;
	encode(j.second, bl, CEPH_FEATURES_ALL);
	string name = lastmap->get_pool_name(j.first);
	encode(name, bl);
	map<string,string> profile;
	if (lastmap->get_pg_pool(j.first)->is_erasure()) {
	  profile = lastmap->get_erasure_code_profile(
	    lastmap->get_pg_pool(j.first)->erasure_code_profile);
	}
	encode(profile, bl);
	t.write(coll_t::meta(), obj, 0, bl.length(), bl);
	service.store_deleted_pool_pg_num(j.first, j.second.get_pg_num());
      } else if (unsigned new_pg_num = i.second->get_pg_num(j.first);
		 new_pg_num != j.second.get_pg_num()) {
	dout(10) << __func__ << " recording pool " << j.first << " pg_num "
		 << j.second.get_pg_num() << " -> " << new_pg_num << dendl;
	pg_num_history.log_pg_num_change(i.first, j.first, new_pg_num);
      }
    }
    for (auto& j : i.second->get_pools()) {
      if (!lastmap->have_pg_pool(j.first)) {
	dout(10) << __func__ << " recording new pool " << j.first << " pg_num "
		 << j.second.get_pg_num() << dendl;
	pg_num_history.log_pg_num_change(i.first, j.first,
					 j.second.get_pg_num());
      }
    }
    lastmap = i.second;
  }
  pg_num_history.epoch = last;
  {
    bufferlist bl;
    ::encode(pg_num_history, bl);
    t.write(coll_t::meta(), make_pg_num_history_oid(), 0, bl.length(), bl);
    dout(20) << __func__ << " pg_num_history " << pg_num_history << dendl;
  }

  // superblock and commit
  write_superblock(t);
  // C_OnMapCommit consumes the message ref once the maps are durable
  t.register_on_commit(new C_OnMapCommit(this, start, last, m));
  store->queue_transaction(
    service.meta_ch,
    std::move(t));
  service.publish_superblock(superblock);
}
8142
/**
 * Called (via C_OnMapCommit) once newly received OSDMaps [first..last]
 * are durable.  Advances the in-memory osdmap epoch by epoch, reacting
 * to peers going up/down and NOUP flag changes, then checks whether the
 * latest map agrees with our own identity/addresses.  Depending on the
 * outcome this may mark us active, rebind messengers and re-boot, or
 * schedule a full shutdown via an async signal.
 *
 * Does NOT consume the message ref; the caller (C_OnMapCommit) does.
 */
void OSD::_committed_osd_maps(epoch_t first, epoch_t last, MOSDMap *m)
{
  dout(10) << __func__ << " " << first << ".." << last << dendl;
  if (is_stopping()) {
    dout(10) << __func__ << " bailing, we are shutting down" << dendl;
    return;
  }
  std::lock_guard l(osd_lock);
  // re-check after acquiring osd_lock: shutdown may have raced us
  if (is_stopping()) {
    dout(10) << __func__ << " bailing, we are shutting down" << dendl;
    return;
  }
  map_lock.get_write();

  bool do_shutdown = false;
  bool do_restart = false;
  bool network_error = false;

  // advance through the new maps
  for (epoch_t cur = first; cur <= last; cur++) {
    dout(10) << " advance to epoch " << cur
	     << " (<= last " << last
	     << " <= newest_map " << superblock.newest_map
	     << ")" << dendl;

    OSDMapRef newmap = get_map(cur);
    ceph_assert(newmap);  // we just cached it above!

    // start blacklisting messages sent to peers that go down.
    service.pre_publish_map(newmap);

    // kill connections to newly down osds
    bool waited_for_reservations = false;
    set<int> old;
    osdmap->get_all_osds(old);
    for (set<int>::iterator p = old.begin(); p != old.end(); ++p) {
      if (*p != whoami &&
	  osdmap->is_up(*p) && // in old map
	  newmap->is_down(*p)) { // but not the new one
	if (!waited_for_reservations) {
	  // wait once per epoch before tearing down connections
	  service.await_reserved_maps();
	  waited_for_reservations = true;
	}
	note_down_osd(*p);
      } else if (*p != whoami &&
		 osdmap->is_down(*p) &&
		 newmap->is_up(*p)) {
	note_up_osd(*p);
      }
    }

    // a NOUP change (global flag or per-osd) while booting means our
    // boot attempt may have been dropped; retry it
    if ((osdmap->test_flag(CEPH_OSDMAP_NOUP) !=
	 newmap->test_flag(CEPH_OSDMAP_NOUP)) ||
	(osdmap->is_noup(whoami) != newmap->is_noup(whoami))) {
      dout(10) << __func__ << " NOUP flag changed in " << newmap->get_epoch()
	       << dendl;
      if (is_booting()) {
	// this captures the case where we sent the boot message while
	// NOUP was being set on the mon and our boot request was
	// dropped, and then later it is cleared.  it imperfectly
	// handles the case where our original boot message was not
	// dropped and we restart even though we might have booted, but
	// that is harmless (boot will just take slightly longer).
	do_restart = true;
      }
    }

    osdmap = newmap;
    epoch_t up_epoch;
    epoch_t boot_epoch;
    service.retrieve_epochs(&boot_epoch, &up_epoch, NULL);
    // first epoch in which the map shows us up at our current address
    if (!up_epoch &&
	osdmap->is_up(whoami) &&
	osdmap->get_addrs(whoami) == client_messenger->get_myaddrs()) {
      up_epoch = osdmap->get_epoch();
      dout(10) << "up_epoch is " << up_epoch << dendl;
      if (!boot_epoch) {
	boot_epoch = osdmap->get_epoch();
	dout(10) << "boot_epoch is " << boot_epoch << dendl;
      }
      service.set_epochs(&boot_epoch, &up_epoch, NULL);
    }
  }

  had_map_since = ceph_clock_now();

  // booting -> active transition: the map now shows us up at our
  // address, marked up after our last bind
  epoch_t _bind_epoch = service.get_bind_epoch();
  if (osdmap->is_up(whoami) &&
      osdmap->get_addrs(whoami).legacy_equals(
	client_messenger->get_myaddrs()) &&
      _bind_epoch < osdmap->get_up_from(whoami)) {

    if (is_booting()) {
      dout(1) << "state: booting -> active" << dendl;
      set_state(STATE_ACTIVE);
      do_restart = false;

      // set incarnation so that osd_reqid_t's we generate for our
      // objecter requests are unique across restarts.
      service.objecter->set_client_incarnation(osdmap->get_epoch());
      cancel_pending_failures();
    }
  }

  if (osdmap->get_epoch() > 0 &&
      is_active()) {
    if (!osdmap->exists(whoami)) {
      dout(0) << "map says i do not exist.  shutting down." << dendl;
      do_shutdown = true;   // don't call shutdown() while we have
			    // everything paused
    } else if (!osdmap->is_up(whoami) ||
	       !osdmap->get_addrs(whoami).legacy_equals(
		 client_messenger->get_myaddrs()) ||
	       !osdmap->get_cluster_addrs(whoami).legacy_equals(
		 cluster_messenger->get_myaddrs()) ||
	       !osdmap->get_hb_back_addrs(whoami).legacy_equals(
		 hb_back_server_messenger->get_myaddrs()) ||
	       !osdmap->get_hb_front_addrs(whoami).legacy_equals(
		 hb_front_server_messenger->get_myaddrs())) {
      // the map disagrees with our identity: either we were marked down
      // or one of our recorded addresses is stale.  log why, then (unless
      // stopping) rebind and try to boot again.
      if (!osdmap->is_up(whoami)) {
	if (service.is_preparing_to_stop() || service.is_stopping()) {
	  service.got_stop_ack();
	} else {
	  clog->warn() << "Monitor daemon marked osd." << whoami << " down, "
			  "but it is still running";
	  clog->debug() << "map e" << osdmap->get_epoch()
			<< " wrongly marked me down at e"
			<< osdmap->get_down_at(whoami);
	}
      } else if (!osdmap->get_addrs(whoami).legacy_equals(
		   client_messenger->get_myaddrs())) {
	clog->error() << "map e" << osdmap->get_epoch()
		      << " had wrong client addr (" << osdmap->get_addrs(whoami)
		      << " != my " << client_messenger->get_myaddrs() << ")";
      } else if (!osdmap->get_cluster_addrs(whoami).legacy_equals(
		   cluster_messenger->get_myaddrs())) {
	clog->error() << "map e" << osdmap->get_epoch()
		      << " had wrong cluster addr ("
		      << osdmap->get_cluster_addrs(whoami)
		      << " != my " << cluster_messenger->get_myaddrs() << ")";
      } else if (!osdmap->get_hb_back_addrs(whoami).legacy_equals(
		   hb_back_server_messenger->get_myaddrs())) {
	clog->error() << "map e" << osdmap->get_epoch()
		      << " had wrong heartbeat back addr ("
		      << osdmap->get_hb_back_addrs(whoami)
		      << " != my " << hb_back_server_messenger->get_myaddrs()
		      << ")";
      } else if (!osdmap->get_hb_front_addrs(whoami).legacy_equals(
		   hb_front_server_messenger->get_myaddrs())) {
	clog->error() << "map e" << osdmap->get_epoch()
		      << " had wrong heartbeat front addr ("
		      << osdmap->get_hb_front_addrs(whoami)
		      << " != my " << hb_front_server_messenger->get_myaddrs()
		      << ")";
      }

      if (!service.is_stopping()) {
	epoch_t up_epoch = 0;
	epoch_t bind_epoch = osdmap->get_epoch();
	service.set_epochs(NULL,&up_epoch, &bind_epoch);
	do_restart = true;

	//add markdown log
	utime_t now = ceph_clock_now();
	utime_t grace = utime_t(cct->_conf->osd_max_markdown_period, 0);
	osd_markdown_log.push_back(now);
	//clear all out-of-date log
	while (!osd_markdown_log.empty() &&
	       osd_markdown_log.front() + grace < now)
	  osd_markdown_log.pop_front();
	// too many markdowns inside the grace window: give up and shut down
	if ((int)osd_markdown_log.size() > cct->_conf->osd_max_markdown_count) {
	  dout(0) << __func__ << " marked down "
		  << osd_markdown_log.size()
		  << " > osd_max_markdown_count "
		  << cct->_conf->osd_max_markdown_count
		  << " in last " << grace << " seconds, shutting down"
		  << dendl;
	  do_restart = false;
	  do_shutdown = true;
	}

	start_waiting_for_healthy();

	set<int> avoid_ports;
#if defined(__FreeBSD__)
	// prevent FreeBSD from grabbing the client_messenger port during
	// rebinding. In which case a cluster_messenger will connect also
	// to the same port
	client_messenger->get_myaddrs().get_ports(&avoid_ports);
#endif
	cluster_messenger->get_myaddrs().get_ports(&avoid_ports);
	hb_back_server_messenger->get_myaddrs().get_ports(&avoid_ports);
	hb_front_server_messenger->get_myaddrs().get_ports(&avoid_ports);

	int r = cluster_messenger->rebind(avoid_ports);
	if (r != 0) {
	  do_shutdown = true;  // FIXME: do_restart?
	  network_error = true;
	  dout(0) << __func__ << " marked down:"
		  << " rebind cluster_messenger failed" << dendl;
	}

	r = hb_back_server_messenger->rebind(avoid_ports);
	if (r != 0) {
	  do_shutdown = true;  // FIXME: do_restart?
	  network_error = true;
	  dout(0) << __func__ << " marked down:"
		  << " rebind hb_back_server_messenger failed" << dendl;
	}

	r = hb_front_server_messenger->rebind(avoid_ports);
	if (r != 0) {
	  do_shutdown = true;  // FIXME: do_restart?
	  network_error = true;
	  dout(0) << __func__ << " marked down:"
		  << " rebind hb_front_server_messenger failed" << dendl;
	}

	hb_front_client_messenger->mark_down_all();
	hb_back_client_messenger->mark_down_all();

	reset_heartbeat_peers();
      }
    }
  }

  map_lock.put_write();

  check_osdmap_features();

  // yay!
  consume_map();

  if (is_active() || is_waiting_for_healthy())
    maybe_update_heartbeat_peers();

  if (is_active()) {
    activate_map();
  }

  if (do_shutdown) {
    if (network_error) {
      cancel_pending_failures();
    }
    // trigger shutdown in a different thread
    dout(0) << __func__ << " shutdown OSD via async signal" << dendl;
    queue_async_signal(SIGINT);
  }
  else if (m->newest_map && m->newest_map > last) {
    // sender has newer maps; keep pulling
    dout(10) << " msg say newest map is " << m->newest_map
	     << ", requesting more" << dendl;
    osdmap_subscribe(osdmap->get_epoch()+1, false);
  }
  else if (is_preboot()) {
    if (m->get_source().is_mon())
      _preboot(m->oldest_map, m->newest_map);
    else
      start_boot();
  }
  else if (do_restart)
    start_boot();

}
8406
// Recompute the msgr feature bits we require from clients, mons and peer
// OSDs based on the current OSDMap, and update on-disk state that tracks
// map-derived requirements (SHARDS compat bit, require_osd_release meta).
void OSD::check_osdmap_features()
{
  // adjust required feature bits?

  // we have to be a bit careful here, because we are accessing the
  // Policy structures without taking any lock.  in particular, only
  // modify integer values that can safely be read by a racing CPU.
  // since we are only accessing existing Policy structures a their
  // current memory location, and setting or clearing bits in integer
  // fields, and we are the only writer, this is not a problem.

  {
    // clients: adjust the default policy.
    Messenger::Policy p = client_messenger->get_default_policy();
    uint64_t mask;
    uint64_t features = osdmap->get_features(entity_name_t::TYPE_CLIENT, &mask);
    if ((p.features_required & mask) != features) {
      dout(0) << "crush map has features " << features
              << ", adjusting msgr requires for clients" << dendl;
      p.features_required = (p.features_required & ~mask) | features;
      client_messenger->set_default_policy(p);
    }
  }
  {
    // mons: adjust only the TYPE_MON policy on the client messenger.
    Messenger::Policy p = client_messenger->get_policy(entity_name_t::TYPE_MON);
    uint64_t mask;
    uint64_t features = osdmap->get_features(entity_name_t::TYPE_MON, &mask);
    if ((p.features_required & mask) != features) {
      dout(0) << "crush map has features " << features
              << " was " << p.features_required
              << ", adjusting msgr requires for mons" << dendl;
      p.features_required = (p.features_required & ~mask) | features;
      client_messenger->set_policy(entity_name_t::TYPE_MON, p);
    }
  }
  {
    // peer OSDs: adjust the TYPE_OSD policy on the cluster messenger.
    Messenger::Policy p = cluster_messenger->get_policy(entity_name_t::TYPE_OSD);
    uint64_t mask;
    uint64_t features = osdmap->get_features(entity_name_t::TYPE_OSD, &mask);

    if ((p.features_required & mask) != features) {
      dout(0) << "crush map has features " << features
              << ", adjusting msgr requires for osds" << dendl;
      p.features_required = (p.features_required & ~mask) | features;
      cluster_messenger->set_policy(entity_name_t::TYPE_OSD, p);
    }

    // Persist the SHARDS incompat bit in the superblock the first time we
    // get here without it; queued (not synchronously flushed) via meta_ch.
    if (!superblock.compat_features.incompat.contains(CEPH_OSD_FEATURE_INCOMPAT_SHARDS)) {
      dout(0) << __func__ << " enabling on-disk ERASURE CODES compat feature" << dendl;
      superblock.compat_features.incompat.insert(CEPH_OSD_FEATURE_INCOMPAT_SHARDS);
      ObjectStore::Transaction t;
      write_superblock(t);
      int err = store->queue_transaction(service.meta_ch, std::move(t), NULL);
      ceph_assert(err == 0);
    }
  }

  // pre-nautilus clusters do not require an authorizer on heartbeats
  if (osdmap->require_osd_release < CEPH_RELEASE_NAUTILUS) {
    heartbeat_dispatcher.ms_set_require_authorizer(false);
  }

  // record require_osd_release in the store's meta so it can be read
  // before the OSD has a map on the next start
  if (osdmap->require_osd_release != last_require_osd_release) {
    dout(1) << __func__ << " require_osd_release " << last_require_osd_release
            << " -> " << to_string(osdmap->require_osd_release) << dendl;
    store->write_meta("require_osd_release",
                      stringify((int)osdmap->require_osd_release));
    last_require_osd_release = osdmap->require_osd_release;
  }
}
8475
11fdf7f2
TL
8476struct C_FinishSplits : public Context {
8477 OSD *osd;
8478 set<PGRef> pgs;
8479 C_FinishSplits(OSD *osd, const set<PGRef> &in)
8480 : osd(osd), pgs(in) {}
8481 void finish(int r) override {
8482 osd->_finish_splits(pgs);
8483 }
8484};
8485
8486void OSD::_finish_splits(set<PGRef>& pgs)
7c673cae 8487{
11fdf7f2
TL
8488 dout(10) << __func__ << " " << pgs << dendl;
8489 if (is_stopping())
8490 return;
8491 PG::RecoveryCtx rctx = create_context();
8492 for (set<PGRef>::iterator i = pgs.begin();
8493 i != pgs.end();
8494 ++i) {
8495 PG *pg = i->get();
7c673cae 8496
11fdf7f2
TL
8497 pg->lock();
8498 dout(10) << __func__ << " " << *pg << dendl;
8499 epoch_t e = pg->get_osdmap_epoch();
8500 pg->handle_initialize(&rctx);
8501 pg->queue_null(e, e);
8502 dispatch_context_transaction(rctx, pg);
8503 pg->unlock();
7c673cae 8504
11fdf7f2
TL
8505 unsigned shard_index = pg->pg_id.hash_to_shard(num_shards);
8506 shards[shard_index]->register_and_wake_split_child(pg);
7c673cae
FG
8507 }
8508
11fdf7f2
TL
8509 dispatch_context(rctx, 0, service.get_osdmap());
8510};
8511
8512bool OSD::add_merge_waiter(OSDMapRef nextmap, spg_t target, PGRef src,
8513 unsigned need)
8514{
8515 std::lock_guard l(merge_lock);
8516 auto& p = merge_waiters[nextmap->get_epoch()][target];
8517 p[src->pg_id] = src;
8518 dout(10) << __func__ << " added merge_waiter " << src->pg_id
8519 << " for " << target << ", have " << p.size() << "/" << need
8520 << dendl;
8521 return p.size() == need;
8522}
8523
// Roll a PG's OSDMap forward, one epoch at a time, up to (at most)
// osd_epoch, handling any pool pg_num changes (splits and merges) seen
// along the way.
//
// Returns true when the PG was advanced normally; returns false when the
// PG was consumed by a merge (as a source) or must wait for merge sources
// to arrive (as a target) — in the source case the PG has been detached
// from its shard and unlocked.
bool OSD::advance_pg(
  epoch_t osd_epoch,
  PG *pg,
  ThreadPool::TPHandle &handle,
  PG::RecoveryCtx *rctx)
{
  // already caught up?
  if (osd_epoch <= pg->get_osdmap_epoch()) {
    return true;
  }
  ceph_assert(pg->is_locked());
  OSDMapRef lastmap = pg->get_osdmap();
  ceph_assert(lastmap->get_epoch() < osd_epoch);
  set<PGRef> new_pgs;  // any split children
  bool ret = true;

  // pg_num of our pool in the map the PG currently has (0 if pool gone)
  unsigned old_pg_num = lastmap->have_pg_pool(pg->pg_id.pool()) ?
    lastmap->get_pg_num(pg->pg_id.pool()) : 0;
  for (epoch_t next_epoch = pg->get_osdmap_epoch() + 1;
       next_epoch <= osd_epoch;
       ++next_epoch) {
    OSDMapRef nextmap = service.try_get_map(next_epoch);
    if (!nextmap) {
      // gap in our map cache; skip this epoch
      dout(20) << __func__ << " missing map " << next_epoch << dendl;
      continue;
    }

    unsigned new_pg_num =
      (old_pg_num && nextmap->have_pg_pool(pg->pg_id.pool())) ?
      nextmap->get_pg_num(pg->pg_id.pool()) : 0;
    if (old_pg_num && new_pg_num && old_pg_num != new_pg_num) {
      // check for merge
      if (nextmap->have_pg_pool(pg->pg_id.pool())) {
        spg_t parent;
        if (pg->pg_id.is_merge_source(
              old_pg_num,
              new_pg_num,
              &parent)) {
          // we are merge source
          PGRef spg = pg; // carry a ref
          dout(1) << __func__ << " " << pg->pg_id
                  << " is merge source, target is " << parent
                  << dendl;
          pg->write_if_dirty(rctx);
          dispatch_context_transaction(*rctx, pg, &handle);
          pg->ch->flush();
          // shut this PG down and detach it from its shard; the target
          // will absorb it via merge_from().
          pg->on_shutdown();
          OSDShard *sdata = pg->osd_shard;
          {
            std::lock_guard l(sdata->shard_lock);
            if (pg->pg_slot) {
              sdata->_detach_pg(pg->pg_slot);
              // update pg count now since we might not get an osdmap
              // any time soon.
              if (pg->is_primary())
                logger->dec(l_osd_pg_primary);
              else if (pg->is_replica())
                logger->dec(l_osd_pg_replica);
              else
                logger->dec(l_osd_pg_stray);
            }
          }
          pg->unlock();

          set<spg_t> children;
          parent.is_split(new_pg_num, old_pg_num, &children);
          // if we are the last source to arrive, wake the merge target
          if (add_merge_waiter(nextmap, parent, pg, children.size())) {
            enqueue_peering_evt(
              parent,
              PGPeeringEventRef(
                std::make_shared<PGPeeringEvent>(
                  nextmap->get_epoch(),
                  nextmap->get_epoch(),
                  NullEvt())));
          }
          ret = false;
          goto out;
        } else if (pg->pg_id.is_merge_target(old_pg_num, new_pg_num)) {
          // we are merge target
          set<spg_t> children;
          pg->pg_id.is_split(new_pg_num, old_pg_num, &children);
          dout(20) << __func__ << " " << pg->pg_id
                   << " is merge target, sources are " << children
                   << dendl;
          map<spg_t,PGRef> sources;
          {
            // grab the sources only if *all* of them have arrived
            std::lock_guard l(merge_lock);
            auto& s = merge_waiters[nextmap->get_epoch()][pg->pg_id];
            unsigned need = children.size();
            dout(20) << __func__ << " have " << s.size() << "/"
                     << need << dendl;
            if (s.size() == need) {
              sources.swap(s);
              merge_waiters[nextmap->get_epoch()].erase(pg->pg_id);
              if (merge_waiters[nextmap->get_epoch()].empty()) {
                merge_waiters.erase(nextmap->get_epoch());
              }
            }
          }
          if (!sources.empty()) {
            unsigned new_pg_num = nextmap->get_pg_num(pg->pg_id.pool());
            unsigned split_bits = pg->pg_id.get_split_bits(new_pg_num);
            dout(1) << __func__ << " merging " << pg->pg_id << dendl;
            pg->merge_from(
              sources, rctx, split_bits,
              nextmap->get_pg_pool(
                pg->pg_id.pool())->last_pg_merge_meta);
            pg->pg_slot->waiting_for_merge_epoch = 0;
          } else {
            dout(20) << __func__ << " not ready to merge yet" << dendl;
            pg->write_if_dirty(rctx);
            pg->unlock();
            // kick source(s) to get them ready
            for (auto& i : children) {
              dout(20) << __func__ << " kicking source " << i << dendl;
              enqueue_peering_evt(
                i,
                PGPeeringEventRef(
                  std::make_shared<PGPeeringEvent>(
                    nextmap->get_epoch(),
                    nextmap->get_epoch(),
                    NullEvt())));
            }
            ret = false;
            goto out;
          }
        }
      }
    }

    // advance the PG itself to nextmap
    vector<int> newup, newacting;
    int up_primary, acting_primary;
    nextmap->pg_to_up_acting_osds(
      pg->pg_id.pgid,
      &newup, &up_primary,
      &newacting, &acting_primary);
    pg->handle_advance_map(
      nextmap, lastmap, newup, up_primary,
      newacting, acting_primary, rctx);

    if (new_pg_num && old_pg_num != new_pg_num) {
      // check for split
      set<spg_t> children;
      if (pg->pg_id.is_split(
            old_pg_num,
            new_pg_num,
            &children)) {
        split_pgs(
          pg, children, &new_pgs, lastmap, nextmap,
          rctx);
      }
    }

    lastmap = nextmap;
    old_pg_num = new_pg_num;
    handle.reset_tp_timeout();
  }
  pg->handle_activate_map(rctx);

  ret = true;
 out:
  // finish initializing any split children once the transaction applies
  if (!new_pgs.empty()) {
    rctx->transaction->register_on_applied(new C_FinishSplits(this, new_pgs));
  }
  return ret;
}
8689
// Publish the (already installed) osdmap to the rest of the OSD: prime
// splits/merges on the shards, push the map into each shard's queue,
// requeue map-waiting sessions, queue null peering events so every PG
// advances, and refresh PG-count perf counters.  Caller holds osd_lock.
void OSD::consume_map()
{
  ceph_assert(osd_lock.is_locked());
  dout(7) << "consume_map version " << osdmap->get_epoch() << dendl;

  /** make sure the cluster is speaking in SORTBITWISE, because we don't
   * speak the older sorting version any more. Be careful not to force
   * a shutdown if we are merely processing old maps, though.
   */
  if (!osdmap->test_flag(CEPH_OSDMAP_SORTBITWISE) && is_active()) {
    derr << __func__ << " SORTBITWISE flag is not set" << dendl;
    ceph_abort();
  }

  service.pre_publish_map(osdmap);
  service.await_reserved_maps();
  service.publish_map(osdmap);

  // prime splits and merges
  set<pair<spg_t,epoch_t>> newly_split;  // splits, and when
  set<pair<spg_t,epoch_t>> merge_pgs;    // merge participants, and when
  for (auto& shard : shards) {
    shard->identify_splits_and_merges(osdmap, &newly_split, &merge_pgs);
  }
  if (!newly_split.empty()) {
    for (auto& shard : shards) {
      shard->prime_splits(osdmap, &newly_split);
    }
    // every split child must have been claimed by some shard
    ceph_assert(newly_split.empty());
  }

  // prune sent_ready_to_merge
  service.prune_sent_ready_to_merge(osdmap);

  // FIXME, maybe: We could race against an incoming peering message
  // that instantiates a merge PG after identify_merges() below and
  // never set up its peer to complete the merge.  An OSD restart
  // would clear it up.  This is a hard race to resolve,
  // extraordinarily rare (we only merge PGs that are stable and
  // clean, so it'd have to be an imported PG to an OSD with a
  // slightly stale OSDMap...), so I'm ignoring it for now.  We plan to
  // replace all of this with a seastar-based code soon anyway.
  if (!merge_pgs.empty()) {
    // mark the pgs we already have, or create new and empty merge
    // participants for those we are missing.  do this all under the
    // shard lock so we don't have to worry about racing pg creates
    // via _process.
    for (auto& shard : shards) {
      shard->prime_merges(osdmap, &merge_pgs);
    }
    ceph_assert(merge_pgs.empty());
  }

  service.prune_pg_created();

  // let each shard consume the map; they report how many reserved
  // pushes can be released
  unsigned pushes_to_free = 0;
  for (auto& shard : shards) {
    shard->consume_map(osdmap, &pushes_to_free);
  }

  vector<spg_t> pgids;
  _get_pgids(&pgids);

  // count (FIXME, probably during seastar rewrite)
  int num_pg_primary = 0, num_pg_replica = 0, num_pg_stray = 0;
  vector<PGRef> pgs;
  _get_pgs(&pgs);
  for (auto& pg : pgs) {
    // FIXME (probably during seastar rewrite): this is lockless and
    // racy, but we don't want to take pg lock here.
    if (pg->is_primary())
      num_pg_primary++;
    else if (pg->is_replica())
      num_pg_replica++;
    else
      num_pg_stray++;
  }

  {
    // FIXME (as part of seastar rewrite): move to OSDShard
    // drop pending peer-requested creates that no longer map to us
    std::lock_guard l(pending_creates_lock);
    for (auto pg = pending_creates_from_osd.begin();
         pg != pending_creates_from_osd.end();) {
      if (osdmap->get_pg_acting_rank(pg->first, whoami) < 0) {
        dout(10) << __func__ << " pg " << pg->first << " doesn't map here, "
                 << "discarding pending_create_from_osd" << dendl;
        pg = pending_creates_from_osd.erase(pg);
      } else {
        ++pg;
      }
    }
  }

  service.maybe_inject_dispatch_delay();

  dispatch_sessions_waiting_on_map();

  service.maybe_inject_dispatch_delay();

  service.release_reserved_pushes(pushes_to_free);

  // queue null events to push maps down to individual PGs
  for (auto pgid : pgids) {
    enqueue_peering_evt(
      pgid,
      PGPeeringEventRef(
        std::make_shared<PGPeeringEvent>(
          osdmap->get_epoch(),
          osdmap->get_epoch(),
          NullEvt())));
  }
  logger->set(l_osd_pg, pgids.size());
  logger->set(l_osd_pg_primary, num_pg_primary);
  logger->set(l_osd_pg_replica, num_pg_replica);
  logger->set(l_osd_pg_stray, num_pg_stray);
}
8806
8807void OSD::activate_map()
8808{
11fdf7f2 8809 ceph_assert(osd_lock.is_locked());
7c673cae
FG
8810
8811 dout(7) << "activate_map version " << osdmap->get_epoch() << dendl;
8812
7c673cae
FG
8813 if (osdmap->test_flag(CEPH_OSDMAP_FULL)) {
8814 dout(10) << " osdmap flagged full, doing onetime osdmap subscribe" << dendl;
8815 osdmap_subscribe(osdmap->get_epoch() + 1, false);
8816 }
8817
8818 // norecover?
8819 if (osdmap->test_flag(CEPH_OSDMAP_NORECOVER)) {
8820 if (!service.recovery_is_paused()) {
8821 dout(1) << "pausing recovery (NORECOVER flag set)" << dendl;
8822 service.pause_recovery();
8823 }
8824 } else {
8825 if (service.recovery_is_paused()) {
8826 dout(1) << "unpausing recovery (NORECOVER flag unset)" << dendl;
8827 service.unpause_recovery();
8828 }
8829 }
8830
8831 service.activate_map();
8832
8833 // process waiters
8834 take_waiters(waiting_for_osdmap);
8835}
8836
8837bool OSD::require_mon_peer(const Message *m)
8838{
8839 if (!m->get_connection()->peer_is_mon()) {
8840 dout(0) << "require_mon_peer received from non-mon "
8841 << m->get_connection()->get_peer_addr()
8842 << " " << *m << dendl;
8843 return false;
8844 }
8845 return true;
8846}
8847
8848bool OSD::require_mon_or_mgr_peer(const Message *m)
8849{
8850 if (!m->get_connection()->peer_is_mon() &&
8851 !m->get_connection()->peer_is_mgr()) {
8852 dout(0) << "require_mon_or_mgr_peer received from non-mon, non-mgr "
8853 << m->get_connection()->get_peer_addr()
8854 << " " << *m << dendl;
8855 return false;
8856 }
8857 return true;
8858}
8859
8860bool OSD::require_osd_peer(const Message *m)
8861{
8862 if (!m->get_connection()->peer_is_osd()) {
8863 dout(0) << "require_osd_peer received from non-osd "
8864 << m->get_connection()->get_peer_addr()
8865 << " " << *m << dendl;
8866 return false;
8867 }
8868 return true;
8869}
8870
8871bool OSD::require_self_aliveness(const Message *m, epoch_t epoch)
8872{
8873 epoch_t up_epoch = service.get_up_epoch();
8874 if (epoch < up_epoch) {
8875 dout(7) << "from pre-up epoch " << epoch << " < " << up_epoch << dendl;
8876 return false;
8877 }
8878
8879 if (!is_active()) {
8880 dout(7) << "still in boot state, dropping message " << *m << dendl;
8881 return false;
8882 }
8883
8884 return true;
8885}
8886
// Verify the sending OSD is still the instance `map` says it is: it must
// be up and its cluster addresses must match the message's source.  On a
// mismatch (the peer died or was re-instantiated) the connection is torn
// down and its Session detached; returns false in that case.
bool OSD::require_same_peer_instance(const Message *m, OSDMapRef& map,
                                     bool is_fast_dispatch)
{
  int from = m->get_source().num();

  if (map->is_down(from) ||
      (map->get_cluster_addrs(from) != m->get_source_addrs())) {
    dout(5) << "from dead osd." << from << ", marking down, "
            << " msg was " << m->get_source_inst().addr
            << " expected "
            << (map->is_up(from) ?
                map->get_cluster_addrs(from) : entity_addrvec_t())
            << dendl;
    ConnectionRef con = m->get_connection();
    con->mark_down();
    auto priv = con->get_priv();
    if (auto s = static_cast<Session*>(priv.get()); s) {
      // fast dispatch paths already hold the session dispatch lock;
      // only take it ourselves on the slow path
      if (!is_fast_dispatch)
        s->session_dispatch_lock.Lock();
      clear_session_waiting_on_map(s);
      con->set_priv(nullptr);   // break ref <-> session cycle, if any
      s->con.reset();
      if (!is_fast_dispatch)
        s->session_dispatch_lock.Unlock();
    }
    return false;
  }
  return true;
}
8916
8917
8918/*
8919 * require that we have same (or newer) map, and that
8920 * the source is the pg primary.
8921 */
8922bool OSD::require_same_or_newer_map(OpRequestRef& op, epoch_t epoch,
8923 bool is_fast_dispatch)
8924{
8925 const Message *m = op->get_req();
8926 dout(15) << "require_same_or_newer_map " << epoch
8927 << " (i am " << osdmap->get_epoch() << ") " << m << dendl;
8928
11fdf7f2 8929 ceph_assert(osd_lock.is_locked());
7c673cae
FG
8930
8931 // do they have a newer map?
8932 if (epoch > osdmap->get_epoch()) {
8933 dout(7) << "waiting for newer map epoch " << epoch
8934 << " > my " << osdmap->get_epoch() << " with " << m << dendl;
8935 wait_for_new_map(op);
8936 return false;
8937 }
8938
8939 if (!require_self_aliveness(op->get_req(), epoch)) {
8940 return false;
8941 }
8942
8943 // ok, our map is same or newer.. do they still exist?
8944 if (m->get_connection()->get_messenger() == cluster_messenger &&
8945 !require_same_peer_instance(op->get_req(), osdmap, is_fast_dispatch)) {
8946 return false;
8947 }
8948
8949 return true;
8950}
8951
8952
8953
8954
8955
8956// ----------------------------------------
8957// pg creation
8958
8959void OSD::split_pgs(
8960 PG *parent,
31f18b77 8961 const set<spg_t> &childpgids, set<PGRef> *out_pgs,
7c673cae
FG
8962 OSDMapRef curmap,
8963 OSDMapRef nextmap,
8964 PG::RecoveryCtx *rctx)
8965{
11fdf7f2
TL
8966 unsigned pg_num = nextmap->get_pg_num(parent->pg_id.pool());
8967 parent->update_snap_mapper_bits(parent->get_pgid().get_split_bits(pg_num));
7c673cae 8968
11fdf7f2
TL
8969 vector<object_stat_sum_t> updated_stats;
8970 parent->start_split_stats(childpgids, &updated_stats);
7c673cae
FG
8971
8972 vector<object_stat_sum_t>::iterator stat_iter = updated_stats.begin();
8973 for (set<spg_t>::const_iterator i = childpgids.begin();
8974 i != childpgids.end();
8975 ++i, ++stat_iter) {
11fdf7f2
TL
8976 ceph_assert(stat_iter != updated_stats.end());
8977 dout(10) << __func__ << " splitting " << *parent << " into " << *i << dendl;
7c673cae
FG
8978 PG* child = _make_pg(nextmap, *i);
8979 child->lock(true);
8980 out_pgs->insert(child);
11fdf7f2 8981 child->ch = store->create_new_collection(child->coll);
7c673cae 8982
11fdf7f2
TL
8983 {
8984 uint32_t shard_index = i->hash_to_shard(shards.size());
8985 assert(NULL != shards[shard_index]);
8986 store->set_collection_commit_queue(child->coll, &(shards[shard_index]->context_queue));
8987 }
7c673cae 8988
11fdf7f2
TL
8989 unsigned split_bits = i->get_split_bits(pg_num);
8990 dout(10) << " pg_num is " << pg_num
8991 << ", m_seed " << i->ps()
8992 << ", split_bits is " << split_bits << dendl;
7c673cae
FG
8993 parent->split_colls(
8994 *i,
8995 split_bits,
8996 i->ps(),
11fdf7f2 8997 &child->get_pool().info,
7c673cae
FG
8998 rctx->transaction);
8999 parent->split_into(
9000 i->pgid,
9001 child,
9002 split_bits);
7c673cae 9003
11fdf7f2 9004 child->finish_split_stats(*stat_iter, rctx->transaction);
7c673cae
FG
9005 child->unlock();
9006 }
11fdf7f2
TL
9007 ceph_assert(stat_iter != updated_stats.end());
9008 parent->finish_split_stats(*stat_iter, rctx->transaction);
7c673cae
FG
9009}
9010
/*
 * holding osd_lock
 *
 * Handle a (legacy) MOSDPGCreate from the monitor: for each requested
 * PG that still maps to us as acting primary in the current osdmap,
 * queue a peering event carrying a PGCreateInfo so the PG gets
 * instantiated.  Entries for deleted pools, split PGs, or PGs whose
 * primary has since changed are skipped.
 */
void OSD::handle_pg_create(OpRequestRef op)
{
  const MOSDPGCreate *m = static_cast<const MOSDPGCreate*>(op->get_req());
  ceph_assert(m->get_type() == MSG_OSD_PG_CREATE);

  dout(10) << "handle_pg_create " << *m << dendl;

  if (!require_mon_peer(op->get_req())) {
    return;
  }

  if (!require_same_or_newer_map(op, m->epoch, false))
    return;

  op->mark_started();

  // mkpg and ctimes are parallel maps keyed by pg_t
  map<pg_t,utime_t>::const_iterator ci = m->ctimes.begin();
  for (map<pg_t,pg_create_t>::const_iterator p = m->mkpg.begin();
       p != m->mkpg.end();
       ++p, ++ci) {
    ceph_assert(ci != m->ctimes.end() && ci->first == p->first);
    epoch_t created = p->second.created;
    if (p->second.split_bits) // Skip split pgs
      continue;
    pg_t on = p->first;

    if (!osdmap->have_pg_pool(on.pool())) {
      dout(20) << "ignoring pg on deleted pool " << on << dendl;
      continue;
    }

    dout(20) << "mkpg " << on << " e" << created << "@" << ci->second << dendl;

    // is it still ours?
    vector<int> up, acting;
    int up_primary = -1;
    int acting_primary = -1;
    osdmap->pg_to_up_acting_osds(on, &up, &up_primary, &acting, &acting_primary);
    int role = osdmap->calc_pg_role(whoami, acting, acting.size());

    if (acting_primary != whoami) {
      dout(10) << "mkpg " << on << " not acting_primary (" << acting_primary
               << "), my role=" << role << ", skipping" << dendl;
      continue;
    }

    spg_t pgid;
    bool mapped = osdmap->get_primary_shard(on, &pgid);
    ceph_assert(mapped);

    PastIntervals pi;
    pg_history_t history;
    build_initial_pg_history(pgid, created, ci->second, &history, &pi);

    // The mon won't resend unless the primary changed, so we ignore
    // same_interval_since.  We'll pass this history with the current
    // epoch as the event.
    if (history.same_primary_since > m->epoch) {
      dout(10) << __func__ << ": got obsolete pg create on pgid "
               << pgid << " from epoch " << m->epoch
               << ", primary changed in " << history.same_primary_since
               << dendl;
      continue;
    }
    enqueue_peering_evt(
      pgid,
      PGPeeringEventRef(
        std::make_shared<PGPeeringEvent>(
          osdmap->get_epoch(),
          osdmap->get_epoch(),
          NullEvt(),
          true,
          new PGCreateInfo(
            pgid,
            osdmap->get_epoch(),
            history,
            pi,
            true)
          )));
  }

  {
    std::lock_guard l(pending_creates_lock);
    // only advance last_pg_create_epoch once nothing from the mon is
    // still pending
    if (pending_creates_from_mon == 0) {
      last_pg_create_epoch = m->epoch;
    }
  }

  maybe_update_heartbeat_peers();
}
9104
9105
9106// ----------------------------------------
9107// peering and recovery
9108
9109PG::RecoveryCtx OSD::create_context()
9110{
9111 ObjectStore::Transaction *t = new ObjectStore::Transaction;
7c673cae
FG
9112 map<int, map<spg_t,pg_query_t> > *query_map =
9113 new map<int, map<spg_t, pg_query_t> >;
9114 map<int,vector<pair<pg_notify_t, PastIntervals> > > *notify_list =
9115 new map<int, vector<pair<pg_notify_t, PastIntervals> > >;
9116 map<int,vector<pair<pg_notify_t, PastIntervals> > > *info_map =
9117 new map<int,vector<pair<pg_notify_t, PastIntervals> > >;
11fdf7f2 9118 PG::RecoveryCtx rctx(query_map, info_map, notify_list, t);
7c673cae
FG
9119 return rctx;
9120}
9121
7c673cae
FG
9122void OSD::dispatch_context_transaction(PG::RecoveryCtx &ctx, PG *pg,
9123 ThreadPool::TPHandle *handle)
9124{
11fdf7f2 9125 if (!ctx.transaction->empty() || ctx.transaction->has_contexts()) {
7c673cae 9126 int tr = store->queue_transaction(
11fdf7f2
TL
9127 pg->ch,
9128 std::move(*ctx.transaction), TrackedOpRef(), handle);
9129 ceph_assert(tr == 0);
7c673cae 9130 delete (ctx.transaction);
7c673cae 9131 ctx.transaction = new ObjectStore::Transaction;
7c673cae
FG
9132 }
9133}
9134
9135void OSD::dispatch_context(PG::RecoveryCtx &ctx, PG *pg, OSDMapRef curmap,
9136 ThreadPool::TPHandle *handle)
9137{
11fdf7f2
TL
9138 if (!service.get_osdmap()->is_up(whoami)) {
9139 dout(20) << __func__ << " not up in osdmap" << dendl;
9140 } else if (!is_active()) {
9141 dout(20) << __func__ << " not active" << dendl;
9142 } else {
7c673cae
FG
9143 do_notifies(*ctx.notify_list, curmap);
9144 do_queries(*ctx.query_map, curmap);
9145 do_infos(*ctx.info_map, curmap);
9146 }
11fdf7f2 9147 if ((!ctx.transaction->empty() || ctx.transaction->has_contexts()) && pg) {
7c673cae 9148 int tr = store->queue_transaction(
11fdf7f2
TL
9149 pg->ch,
9150 std::move(*ctx.transaction), TrackedOpRef(),
7c673cae 9151 handle);
11fdf7f2 9152 ceph_assert(tr == 0);
7c673cae 9153 }
11fdf7f2
TL
9154 delete ctx.notify_list;
9155 delete ctx.query_map;
9156 delete ctx.info_map;
9157 delete ctx.transaction;
9158}
9159
9160void OSD::discard_context(PG::RecoveryCtx& ctx)
9161{
9162 delete ctx.notify_list;
9163 delete ctx.query_map;
9164 delete ctx.info_map;
9165 delete ctx.transaction;
7c673cae
FG
9166}
9167
11fdf7f2 9168
7c673cae
FG
9169/** do_notifies
9170 * Send an MOSDPGNotify to a primary, with a list of PGs that I have
9171 * content for, and they are primary for.
9172 */
9173
9174void OSD::do_notifies(
9175 map<int,vector<pair<pg_notify_t,PastIntervals> > >& notify_list,
9176 OSDMapRef curmap)
9177{
9178 for (map<int,
9179 vector<pair<pg_notify_t,PastIntervals> > >::iterator it =
9180 notify_list.begin();
9181 it != notify_list.end();
9182 ++it) {
9183 if (!curmap->is_up(it->first)) {
9184 dout(20) << __func__ << " skipping down osd." << it->first << dendl;
9185 continue;
9186 }
9187 ConnectionRef con = service.get_con_osd_cluster(
9188 it->first, curmap->get_epoch());
9189 if (!con) {
9190 dout(20) << __func__ << " skipping osd." << it->first
9191 << " (NULL con)" << dendl;
9192 continue;
9193 }
9194 service.share_map_peer(it->first, con.get(), curmap);
3efd9988 9195 dout(7) << __func__ << " osd." << it->first
7c673cae
FG
9196 << " on " << it->second.size() << " PGs" << dendl;
9197 MOSDPGNotify *m = new MOSDPGNotify(curmap->get_epoch(),
9198 it->second);
9199 con->send_message(m);
9200 }
9201}
9202
9203
9204/** do_queries
9205 * send out pending queries for info | summaries
9206 */
9207void OSD::do_queries(map<int, map<spg_t,pg_query_t> >& query_map,
9208 OSDMapRef curmap)
9209{
9210 for (map<int, map<spg_t,pg_query_t> >::iterator pit = query_map.begin();
9211 pit != query_map.end();
9212 ++pit) {
9213 if (!curmap->is_up(pit->first)) {
9214 dout(20) << __func__ << " skipping down osd." << pit->first << dendl;
9215 continue;
9216 }
9217 int who = pit->first;
9218 ConnectionRef con = service.get_con_osd_cluster(who, curmap->get_epoch());
9219 if (!con) {
9220 dout(20) << __func__ << " skipping osd." << who
9221 << " (NULL con)" << dendl;
9222 continue;
9223 }
9224 service.share_map_peer(who, con.get(), curmap);
9225 dout(7) << __func__ << " querying osd." << who
9226 << " on " << pit->second.size() << " PGs" << dendl;
9227 MOSDPGQuery *m = new MOSDPGQuery(curmap->get_epoch(), pit->second);
9228 con->send_message(m);
9229 }
9230}
9231
9232
9233void OSD::do_infos(map<int,
9234 vector<pair<pg_notify_t, PastIntervals> > >& info_map,
9235 OSDMapRef curmap)
9236{
9237 for (map<int,
9238 vector<pair<pg_notify_t, PastIntervals> > >::iterator p =
9239 info_map.begin();
9240 p != info_map.end();
9241 ++p) {
9242 if (!curmap->is_up(p->first)) {
9243 dout(20) << __func__ << " skipping down osd." << p->first << dendl;
9244 continue;
9245 }
9246 for (vector<pair<pg_notify_t,PastIntervals> >::iterator i = p->second.begin();
9247 i != p->second.end();
9248 ++i) {
9249 dout(20) << __func__ << " sending info " << i->first.info
9250 << " to shard " << p->first << dendl;
9251 }
9252 ConnectionRef con = service.get_con_osd_cluster(
9253 p->first, curmap->get_epoch());
9254 if (!con) {
9255 dout(20) << __func__ << " skipping osd." << p->first
9256 << " (NULL con)" << dendl;
9257 continue;
9258 }
9259 service.share_map_peer(p->first, con.get(), curmap);
9260 MOSDPGInfo *m = new MOSDPGInfo(curmap->get_epoch());
9261 m->pg_list = p->second;
9262 con->send_message(m);
9263 }
9264 info_map.clear();
9265}
9266
11fdf7f2 9267void OSD::handle_fast_pg_create(MOSDPGCreate2 *m)
7c673cae 9268{
11fdf7f2
TL
9269 dout(7) << __func__ << " " << *m << " from " << m->get_source() << dendl;
9270 if (!require_mon_peer(m)) {
9271 m->put();
7c673cae 9272 return;
7c673cae 9273 }
11fdf7f2
TL
9274 for (auto& p : m->pgs) {
9275 spg_t pgid = p.first;
9276 epoch_t created = p.second.first;
9277 utime_t created_stamp = p.second.second;
9278 dout(20) << __func__ << " " << pgid << " e" << created
9279 << "@" << created_stamp << dendl;
9280 pg_history_t h;
9281 h.epoch_created = created;
9282 h.epoch_pool_created = created;
9283 h.same_up_since = created;
9284 h.same_interval_since = created;
9285 h.same_primary_since = created;
9286 h.last_scrub_stamp = created_stamp;
9287 h.last_deep_scrub_stamp = created_stamp;
9288 h.last_clean_scrub_stamp = created_stamp;
9289
9290 enqueue_peering_evt(
9291 pgid,
9292 PGPeeringEventRef(
9293 std::make_shared<PGPeeringEvent>(
9294 m->epoch,
9295 m->epoch,
9296 NullEvt(),
9297 true,
9298 new PGCreateInfo(
9299 pgid,
9300 created,
9301 h,
9302 PastIntervals(),
9303 true)
9304 )));
9305 }
7c673cae 9306
11fdf7f2
TL
9307 {
9308 std::lock_guard l(pending_creates_lock);
9309 if (pending_creates_from_mon == 0) {
9310 last_pg_create_epoch = m->epoch;
9311 }
7c673cae
FG
9312 }
9313
11fdf7f2 9314 m->put();
7c673cae
FG
9315}
9316
11fdf7f2 9317void OSD::handle_fast_pg_query(MOSDPGQuery *m)
7c673cae 9318{
11fdf7f2
TL
9319 dout(7) << __func__ << " " << *m << " from " << m->get_source() << dendl;
9320 if (!require_osd_peer(m)) {
9321 m->put();
7c673cae 9322 return;
11fdf7f2 9323 }
7c673cae 9324 int from = m->get_source().num();
11fdf7f2
TL
9325 for (auto& p : m->pg_list) {
9326 enqueue_peering_evt(
9327 p.first,
9328 PGPeeringEventRef(
9329 std::make_shared<PGPeeringEvent>(
9330 p.second.epoch_sent, p.second.epoch_sent,
9331 MQuery(
9332 p.first,
9333 pg_shard_t(from, p.second.from),
9334 p.second,
9335 p.second.epoch_sent),
9336 false))
7c673cae
FG
9337 );
9338 }
11fdf7f2 9339 m->put();
7c673cae
FG
9340}
9341
11fdf7f2 9342void OSD::handle_fast_pg_notify(MOSDPGNotify* m)
7c673cae 9343{
11fdf7f2
TL
9344 dout(7) << __func__ << " " << *m << " from " << m->get_source() << dendl;
9345 if (!require_osd_peer(m)) {
9346 m->put();
7c673cae
FG
9347 return;
9348 }
11fdf7f2
TL
9349 int from = m->get_source().num();
9350 for (auto& p : m->get_pg_list()) {
9351 spg_t pgid(p.first.info.pgid.pgid, p.first.to);
9352 enqueue_peering_evt(
9353 pgid,
9354 PGPeeringEventRef(
9355 std::make_shared<PGPeeringEvent>(
9356 p.first.epoch_sent,
9357 p.first.query_epoch,
9358 MNotifyRec(
9359 pgid, pg_shard_t(from, p.first.from),
9360 p.first,
9361 m->get_connection()->get_features(),
9362 p.second),
9363 true,
9364 new PGCreateInfo(
9365 pgid,
9366 p.first.query_epoch,
9367 p.first.info.history,
9368 p.second,
9369 false)
9370 )));
7c673cae 9371 }
11fdf7f2 9372 m->put();
7c673cae
FG
9373}
9374
11fdf7f2 9375void OSD::handle_fast_pg_info(MOSDPGInfo* m)
7c673cae 9376{
11fdf7f2
TL
9377 dout(7) << __func__ << " " << *m << " from " << m->get_source() << dendl;
9378 if (!require_osd_peer(m)) {
9379 m->put();
7c673cae
FG
9380 return;
9381 }
11fdf7f2
TL
9382 int from = m->get_source().num();
9383 for (auto& p : m->pg_list) {
9384 enqueue_peering_evt(
9385 spg_t(p.first.info.pgid.pgid, p.first.to),
9386 PGPeeringEventRef(
9387 std::make_shared<PGPeeringEvent>(
9388 p.first.epoch_sent, p.first.query_epoch,
9389 MInfoRec(
9390 pg_shard_t(from, p.first.from),
9391 p.first.info,
9392 p.first.epoch_sent)))
9393 );
7c673cae 9394 }
11fdf7f2 9395 m->put();
7c673cae
FG
9396}
9397
11fdf7f2 9398void OSD::handle_fast_pg_remove(MOSDPGRemove *m)
7c673cae 9399{
11fdf7f2
TL
9400 dout(7) << __func__ << " " << *m << " from " << m->get_source() << dendl;
9401 if (!require_osd_peer(m)) {
9402 m->put();
7c673cae
FG
9403 return;
9404 }
11fdf7f2
TL
9405 for (auto& pgid : m->pg_list) {
9406 enqueue_peering_evt(
9407 pgid,
9408 PGPeeringEventRef(
9409 std::make_shared<PGPeeringEvent>(
9410 m->get_epoch(), m->get_epoch(),
9411 PG::DeleteStart())));
7c673cae 9412 }
11fdf7f2 9413 m->put();
7c673cae
FG
9414}
9415
11fdf7f2 9416void OSD::handle_fast_force_recovery(MOSDForceRecovery *m)
c07f9fc5 9417{
11fdf7f2
TL
9418 dout(10) << __func__ << " " << *m << dendl;
9419 if (!require_mon_or_mgr_peer(m)) {
9420 m->put();
9421 return;
9422 }
9423 epoch_t epoch = get_osdmap_epoch();
9424 for (auto pgid : m->forced_pgs) {
9425 if (m->options & OFR_BACKFILL) {
9426 if (m->options & OFR_CANCEL) {
9427 enqueue_peering_evt(
9428 pgid,
9429 PGPeeringEventRef(
9430 std::make_shared<PGPeeringEvent>(
9431 epoch, epoch,
9432 PG::UnsetForceBackfill())));
9433 } else {
9434 enqueue_peering_evt(
9435 pgid,
9436 PGPeeringEventRef(
9437 std::make_shared<PGPeeringEvent>(
9438 epoch, epoch,
9439 PG::SetForceBackfill())));
9440 }
9441 } else if (m->options & OFR_RECOVERY) {
9442 if (m->options & OFR_CANCEL) {
9443 enqueue_peering_evt(
9444 pgid,
9445 PGPeeringEventRef(
9446 std::make_shared<PGPeeringEvent>(
9447 epoch, epoch,
9448 PG::UnsetForceRecovery())));
9449 } else {
9450 enqueue_peering_evt(
9451 pgid,
9452 PGPeeringEventRef(
9453 std::make_shared<PGPeeringEvent>(
9454 epoch, epoch,
9455 PG::SetForceRecovery())));
c07f9fc5
FG
9456 }
9457 }
9458 }
11fdf7f2 9459 m->put();
c07f9fc5 9460}
7c673cae 9461
11fdf7f2 9462void OSD::handle_pg_query_nopg(const MQuery& q)
7c673cae 9463{
11fdf7f2
TL
9464 spg_t pgid = q.pgid;
9465 dout(10) << __func__ << " " << pgid << dendl;
7c673cae 9466
11fdf7f2
TL
9467 OSDMapRef osdmap = get_osdmap();
9468 if (!osdmap->have_pg_pool(pgid.pool()))
7c673cae
FG
9469 return;
9470
11fdf7f2
TL
9471 dout(10) << " pg " << pgid << " dne" << dendl;
9472 pg_info_t empty(spg_t(pgid.pgid, q.query.to));
9473 ConnectionRef con = service.get_con_osd_cluster(q.from.osd, osdmap->get_epoch());
9474 if (con) {
9475 Message *m;
9476 if (q.query.type == pg_query_t::LOG ||
9477 q.query.type == pg_query_t::FULLLOG) {
9478 m = new MOSDPGLog(
9479 q.query.from, q.query.to,
9480 osdmap->get_epoch(), empty,
9481 q.query.epoch_sent);
7c673cae 9482 } else {
11fdf7f2
TL
9483 vector<pair<pg_notify_t,PastIntervals>> ls;
9484 ls.push_back(
7c673cae
FG
9485 make_pair(
9486 pg_notify_t(
11fdf7f2
TL
9487 q.query.from, q.query.to,
9488 q.query.epoch_sent,
7c673cae
FG
9489 osdmap->get_epoch(),
9490 empty),
11fdf7f2
TL
9491 PastIntervals()));
9492 m = new MOSDPGNotify(osdmap->get_epoch(), ls);
7c673cae 9493 }
11fdf7f2
TL
9494 service.share_map_peer(q.from.osd, con.get(), osdmap);
9495 con->send_message(m);
7c673cae
FG
9496 }
9497}
9498
7c673cae 9499
7c673cae
FG
9500// =========================================================
9501// RECOVERY
9502
9503void OSDService::_maybe_queue_recovery() {
11fdf7f2 9504 ceph_assert(recovery_lock.is_locked_by_me());
7c673cae
FG
9505 uint64_t available_pushes;
9506 while (!awaiting_throttle.empty() &&
9507 _recover_now(&available_pushes)) {
11fdf7f2 9508 uint64_t to_start = std::min(
7c673cae
FG
9509 available_pushes,
9510 cct->_conf->osd_recovery_max_single_start);
9511 _queue_for_recovery(awaiting_throttle.front(), to_start);
9512 awaiting_throttle.pop_front();
11fdf7f2
TL
9513 dout(10) << __func__ << " starting " << to_start
9514 << ", recovery_ops_reserved " << recovery_ops_reserved
9515 << " -> " << (recovery_ops_reserved + to_start) << dendl;
7c673cae
FG
9516 recovery_ops_reserved += to_start;
9517 }
9518}
9519
9520bool OSDService::_recover_now(uint64_t *available_pushes)
9521{
9522 if (available_pushes)
9523 *available_pushes = 0;
9524
9525 if (ceph_clock_now() < defer_recovery_until) {
9526 dout(15) << __func__ << " defer until " << defer_recovery_until << dendl;
9527 return false;
9528 }
9529
9530 if (recovery_paused) {
9531 dout(15) << __func__ << " paused" << dendl;
9532 return false;
9533 }
9534
9535 uint64_t max = cct->_conf->osd_recovery_max_active;
9536 if (max <= recovery_ops_active + recovery_ops_reserved) {
9537 dout(15) << __func__ << " active " << recovery_ops_active
9538 << " + reserved " << recovery_ops_reserved
9539 << " >= max " << max << dendl;
9540 return false;
9541 }
9542
9543 if (available_pushes)
9544 *available_pushes = max - recovery_ops_active - recovery_ops_reserved;
9545
9546 return true;
9547}
9548
9549void OSD::do_recovery(
9550 PG *pg, epoch_t queued, uint64_t reserved_pushes,
9551 ThreadPool::TPHandle &handle)
9552{
9553 uint64_t started = 0;
31f18b77
FG
9554
9555 /*
9556 * When the value of osd_recovery_sleep is set greater than zero, recovery
9557 * ops are scheduled after osd_recovery_sleep amount of time from the previous
9558 * recovery event's schedule time. This is done by adding a
9559 * recovery_requeue_callback event, which re-queues the recovery op using
9560 * queue_recovery_after_sleep.
9561 */
c07f9fc5 9562 float recovery_sleep = get_osd_recovery_sleep();
b32b8144 9563 {
11fdf7f2 9564 std::lock_guard l(service.sleep_lock);
b32b8144
FG
9565 if (recovery_sleep > 0 && service.recovery_needs_sleep) {
9566 PGRef pgref(pg);
9567 auto recovery_requeue_callback = new FunctionContext([this, pgref, queued, reserved_pushes](int r) {
9568 dout(20) << "do_recovery wake up at "
9569 << ceph_clock_now()
9570 << ", re-queuing recovery" << dendl;
11fdf7f2 9571 std::lock_guard l(service.sleep_lock);
b32b8144
FG
9572 service.recovery_needs_sleep = false;
9573 service.queue_recovery_after_sleep(pgref.get(), queued, reserved_pushes);
9574 });
9575
9576 // This is true for the first recovery op and when the previous recovery op
9577 // has been scheduled in the past. The next recovery op is scheduled after
9578 // completing the sleep from now.
9579 if (service.recovery_schedule_time < ceph_clock_now()) {
9580 service.recovery_schedule_time = ceph_clock_now();
9581 }
9582 service.recovery_schedule_time += recovery_sleep;
11fdf7f2 9583 service.sleep_timer.add_event_at(service.recovery_schedule_time,
b32b8144
FG
9584 recovery_requeue_callback);
9585 dout(20) << "Recovery event scheduled at "
9586 << service.recovery_schedule_time << dendl;
9587 return;
9588 }
7c673cae
FG
9589 }
9590
9591 {
b32b8144 9592 {
11fdf7f2 9593 std::lock_guard l(service.sleep_lock);
b32b8144
FG
9594 service.recovery_needs_sleep = true;
9595 }
9596
7c673cae
FG
9597 if (pg->pg_has_reset_since(queued)) {
9598 goto out;
9599 }
9600
7c673cae
FG
9601 dout(10) << "do_recovery starting " << reserved_pushes << " " << *pg << dendl;
9602#ifdef DEBUG_RECOVERY_OIDS
11fdf7f2 9603 dout(20) << " active was " << service.recovery_oids[pg->pg_id] << dendl;
7c673cae
FG
9604#endif
9605
11fdf7f2 9606 bool do_unfound = pg->start_recovery_ops(reserved_pushes, handle, &started);
7c673cae
FG
9607 dout(10) << "do_recovery started " << started << "/" << reserved_pushes
9608 << " on " << *pg << dendl;
9609
11fdf7f2
TL
9610 if (do_unfound) {
9611 PG::RecoveryCtx rctx = create_context();
9612 rctx.handle = &handle;
9613 pg->find_unfound(queued, &rctx);
9614 dispatch_context(rctx, pg, pg->get_osdmap());
7c673cae 9615 }
7c673cae
FG
9616 }
9617
9618 out:
11fdf7f2 9619 ceph_assert(started <= reserved_pushes);
7c673cae
FG
9620 service.release_reserved_pushes(reserved_pushes);
9621}
9622
9623void OSDService::start_recovery_op(PG *pg, const hobject_t& soid)
9624{
11fdf7f2 9625 std::lock_guard l(recovery_lock);
7c673cae
FG
9626 dout(10) << "start_recovery_op " << *pg << " " << soid
9627 << " (" << recovery_ops_active << "/"
9628 << cct->_conf->osd_recovery_max_active << " rops)"
9629 << dendl;
9630 recovery_ops_active++;
9631
9632#ifdef DEBUG_RECOVERY_OIDS
11fdf7f2
TL
9633 dout(20) << " active was " << recovery_oids[pg->pg_id] << dendl;
9634 ceph_assert(recovery_oids[pg->pg_id].count(soid) == 0);
9635 recovery_oids[pg->pg_id].insert(soid);
7c673cae
FG
9636#endif
9637}
9638
9639void OSDService::finish_recovery_op(PG *pg, const hobject_t& soid, bool dequeue)
9640{
11fdf7f2 9641 std::lock_guard l(recovery_lock);
7c673cae
FG
9642 dout(10) << "finish_recovery_op " << *pg << " " << soid
9643 << " dequeue=" << dequeue
9644 << " (" << recovery_ops_active << "/" << cct->_conf->osd_recovery_max_active << " rops)"
9645 << dendl;
9646
9647 // adjust count
11fdf7f2 9648 ceph_assert(recovery_ops_active > 0);
7c673cae
FG
9649 recovery_ops_active--;
9650
9651#ifdef DEBUG_RECOVERY_OIDS
11fdf7f2
TL
9652 dout(20) << " active oids was " << recovery_oids[pg->pg_id] << dendl;
9653 ceph_assert(recovery_oids[pg->pg_id].count(soid));
9654 recovery_oids[pg->pg_id].erase(soid);
7c673cae
FG
9655#endif
9656
9657 _maybe_queue_recovery();
9658}
9659
9660bool OSDService::is_recovery_active()
9661{
b5b8bbf5 9662 return local_reserver.has_reservation() || remote_reserver.has_reservation();
7c673cae
FG
9663}
9664
11fdf7f2
TL
9665void OSDService::release_reserved_pushes(uint64_t pushes)
9666{
9667 std::lock_guard l(recovery_lock);
9668 dout(10) << __func__ << "(" << pushes << "), recovery_ops_reserved "
9669 << recovery_ops_reserved << " -> " << (recovery_ops_reserved-pushes)
9670 << dendl;
9671 ceph_assert(recovery_ops_reserved >= pushes);
9672 recovery_ops_reserved -= pushes;
9673 _maybe_queue_recovery();
9674}
9675
7c673cae
FG
9676// =========================================================
9677// OPS
9678
9679bool OSD::op_is_discardable(const MOSDOp *op)
9680{
9681 // drop client request if they are not connected and can't get the
9682 // reply anyway.
9683 if (!op->get_connection()->is_connected()) {
9684 return true;
9685 }
9686 return false;
9687}
9688
11fdf7f2 9689void OSD::enqueue_op(spg_t pg, OpRequestRef&& op, epoch_t epoch)
7c673cae 9690{
11fdf7f2
TL
9691 const utime_t stamp = op->get_req()->get_recv_stamp();
9692 const utime_t latency = ceph_clock_now() - stamp;
9693 const unsigned priority = op->get_req()->get_priority();
9694 const int cost = op->get_req()->get_cost();
9695 const uint64_t owner = op->get_req()->get_source().num();
9696
9697 dout(15) << "enqueue_op " << op << " prio " << priority
9698 << " cost " << cost
7c673cae
FG
9699 << " latency " << latency
9700 << " epoch " << epoch
9701 << " " << *(op->get_req()) << dendl;
9702 op->osd_trace.event("enqueue op");
11fdf7f2
TL
9703 op->osd_trace.keyval("priority", priority);
9704 op->osd_trace.keyval("cost", cost);
7c673cae 9705 op->mark_queued_for_pg();
224ce89b 9706 logger->tinc(l_osd_op_before_queue_op_lat, latency);
11fdf7f2
TL
9707 op_shardedwq.queue(
9708 OpQueueItem(
9709 unique_ptr<OpQueueItem::OpQueueable>(new PGOpItem(pg, std::move(op))),
9710 cost, priority, stamp, owner, epoch));
7c673cae
FG
9711}
9712
11fdf7f2
TL
9713void OSD::enqueue_peering_evt(spg_t pgid, PGPeeringEventRef evt)
9714{
9715 dout(15) << __func__ << " " << pgid << " " << evt->get_desc() << dendl;
9716 op_shardedwq.queue(
9717 OpQueueItem(
9718 unique_ptr<OpQueueItem::OpQueueable>(new PGPeeringItem(pgid, evt)),
9719 10,
9720 cct->_conf->osd_peering_op_priority,
9721 utime_t(),
9722 0,
9723 evt->get_epoch_sent()));
9724}
7c673cae 9725
11fdf7f2
TL
9726void OSD::enqueue_peering_evt_front(spg_t pgid, PGPeeringEventRef evt)
9727{
9728 dout(15) << __func__ << " " << pgid << " " << evt->get_desc() << dendl;
9729 op_shardedwq.queue_front(
9730 OpQueueItem(
9731 unique_ptr<OpQueueItem::OpQueueable>(new PGPeeringItem(pgid, evt)),
9732 10,
9733 cct->_conf->osd_peering_op_priority,
9734 utime_t(),
9735 0,
9736 evt->get_epoch_sent()));
9737}
7c673cae
FG
9738
9739/*
9740 * NOTE: dequeue called in worker thread, with pg lock
9741 */
9742void OSD::dequeue_op(
9743 PGRef pg, OpRequestRef op,
9744 ThreadPool::TPHandle &handle)
9745{
11fdf7f2 9746 FUNCTRACE(cct);
7c673cae
FG
9747 OID_EVENT_TRACE_WITH_MSG(op->get_req(), "DEQUEUE_OP_BEGIN", false);
9748
9749 utime_t now = ceph_clock_now();
9750 op->set_dequeued_time(now);
9751 utime_t latency = now - op->get_req()->get_recv_stamp();
9752 dout(10) << "dequeue_op " << op << " prio " << op->get_req()->get_priority()
9753 << " cost " << op->get_req()->get_cost()
9754 << " latency " << latency
9755 << " " << *(op->get_req())
9756 << " pg " << *pg << dendl;
9757
224ce89b
WB
9758 logger->tinc(l_osd_op_before_dequeue_op_lat, latency);
9759
11fdf7f2
TL
9760 auto priv = op->get_req()->get_connection()->get_priv();
9761 if (auto session = static_cast<Session *>(priv.get()); session) {
7c673cae 9762 maybe_share_map(session, op, pg->get_osdmap());
7c673cae
FG
9763 }
9764
11fdf7f2 9765 if (pg->is_deleting())
7c673cae
FG
9766 return;
9767
9768 op->mark_reached_pg();
9769 op->osd_trace.event("dequeue_op");
9770
9771 pg->do_request(op, handle);
9772
9773 // finish
9774 dout(10) << "dequeue_op " << op << " finish" << dendl;
9775 OID_EVENT_TRACE_WITH_MSG(op->get_req(), "DEQUEUE_OP_END", false);
9776}
9777
9778
11fdf7f2
TL
9779void OSD::dequeue_peering_evt(
9780 OSDShard *sdata,
9781 PG *pg,
9782 PGPeeringEventRef evt,
9783 ThreadPool::TPHandle& handle)
7c673cae 9784{
7c673cae 9785 PG::RecoveryCtx rctx = create_context();
11fdf7f2
TL
9786 auto curmap = sdata->get_osdmap();
9787 epoch_t need_up_thru = 0, same_interval_since = 0;
9788 if (!pg) {
9789 if (const MQuery *q = dynamic_cast<const MQuery*>(evt->evt.get())) {
9790 handle_pg_query_nopg(*q);
7c673cae 9791 } else {
11fdf7f2
TL
9792 derr << __func__ << " unrecognized pg-less event " << evt->get_desc() << dendl;
9793 ceph_abort();
9794 }
9795 } else if (advance_pg(curmap->get_epoch(), pg, handle, &rctx)) {
9796 pg->do_peering_event(evt, &rctx);
9797 if (pg->is_deleted()) {
9798 // do not dispatch rctx; the final _delete_some already did it.
9799 discard_context(rctx);
9800 pg->unlock();
9801 return;
7c673cae
FG
9802 }
9803 dispatch_context_transaction(rctx, pg, &handle);
11fdf7f2
TL
9804 need_up_thru = pg->get_need_up_thru();
9805 same_interval_since = pg->get_same_interval_since();
7c673cae
FG
9806 pg->unlock();
9807 }
11fdf7f2
TL
9808
9809 if (need_up_thru) {
7c673cae 9810 queue_want_up_thru(same_interval_since);
11fdf7f2
TL
9811 }
9812 dispatch_context(rctx, pg, curmap, &handle);
7c673cae
FG
9813
9814 service.send_pg_temp();
9815}
9816
11fdf7f2
TL
9817void OSD::dequeue_delete(
9818 OSDShard *sdata,
9819 PG *pg,
9820 epoch_t e,
9821 ThreadPool::TPHandle& handle)
9822{
9823 dequeue_peering_evt(
9824 sdata,
9825 pg,
9826 PGPeeringEventRef(
9827 std::make_shared<PGPeeringEvent>(
9828 e, e,
9829 PG::DeleteSome())),
9830 handle);
9831}
9832
9833
9834
7c673cae
FG
9835// --------------------------------
9836
9837const char** OSD::get_tracked_conf_keys() const
9838{
9839 static const char* KEYS[] = {
9840 "osd_max_backfills",
9841 "osd_min_recovery_priority",
224ce89b
WB
9842 "osd_max_trimming_pgs",
9843 "osd_op_complaint_time",
9844 "osd_op_log_threshold",
9845 "osd_op_history_size",
9846 "osd_op_history_duration",
9847 "osd_op_history_slow_op_size",
9848 "osd_op_history_slow_op_threshold",
7c673cae
FG
9849 "osd_enable_op_tracker",
9850 "osd_map_cache_size",
11fdf7f2 9851 "osd_pg_epoch_max_lag_factor",
7c673cae 9852 "osd_pg_epoch_persisted_max_stale",
7c673cae
FG
9853 // clog & admin clog
9854 "clog_to_monitors",
9855 "clog_to_syslog",
9856 "clog_to_syslog_facility",
9857 "clog_to_syslog_level",
9858 "osd_objectstore_fuse",
9859 "clog_to_graylog",
9860 "clog_to_graylog_host",
9861 "clog_to_graylog_port",
9862 "host",
9863 "fsid",
9864 "osd_recovery_delay_start",
9865 "osd_client_message_size_cap",
9866 "osd_client_message_cap",
31f18b77
FG
9867 "osd_heartbeat_min_size",
9868 "osd_heartbeat_interval",
7c673cae
FG
9869 NULL
9870 };
9871 return KEYS;
9872}
9873
11fdf7f2 9874void OSD::handle_conf_change(const ConfigProxy& conf,
7c673cae
FG
9875 const std::set <std::string> &changed)
9876{
f64942e4 9877 Mutex::Locker l(osd_lock);
7c673cae
FG
9878 if (changed.count("osd_max_backfills")) {
9879 service.local_reserver.set_max(cct->_conf->osd_max_backfills);
9880 service.remote_reserver.set_max(cct->_conf->osd_max_backfills);
9881 }
9882 if (changed.count("osd_min_recovery_priority")) {
9883 service.local_reserver.set_min_priority(cct->_conf->osd_min_recovery_priority);
9884 service.remote_reserver.set_min_priority(cct->_conf->osd_min_recovery_priority);
9885 }
9886 if (changed.count("osd_max_trimming_pgs")) {
9887 service.snap_reserver.set_max(cct->_conf->osd_max_trimming_pgs);
9888 }
9889 if (changed.count("osd_op_complaint_time") ||
9890 changed.count("osd_op_log_threshold")) {
9891 op_tracker.set_complaint_and_threshold(cct->_conf->osd_op_complaint_time,
9892 cct->_conf->osd_op_log_threshold);
9893 }
9894 if (changed.count("osd_op_history_size") ||
9895 changed.count("osd_op_history_duration")) {
9896 op_tracker.set_history_size_and_duration(cct->_conf->osd_op_history_size,
9897 cct->_conf->osd_op_history_duration);
9898 }
9899 if (changed.count("osd_op_history_slow_op_size") ||
9900 changed.count("osd_op_history_slow_op_threshold")) {
9901 op_tracker.set_history_slow_op_size_and_threshold(cct->_conf->osd_op_history_slow_op_size,
9902 cct->_conf->osd_op_history_slow_op_threshold);
9903 }
9904 if (changed.count("osd_enable_op_tracker")) {
9905 op_tracker.set_tracking(cct->_conf->osd_enable_op_tracker);
9906 }
7c673cae
FG
9907 if (changed.count("osd_map_cache_size")) {
9908 service.map_cache.set_size(cct->_conf->osd_map_cache_size);
9909 service.map_bl_cache.set_size(cct->_conf->osd_map_cache_size);
9910 service.map_bl_inc_cache.set_size(cct->_conf->osd_map_cache_size);
9911 }
9912 if (changed.count("clog_to_monitors") ||
9913 changed.count("clog_to_syslog") ||
9914 changed.count("clog_to_syslog_level") ||
9915 changed.count("clog_to_syslog_facility") ||
9916 changed.count("clog_to_graylog") ||
9917 changed.count("clog_to_graylog_host") ||
9918 changed.count("clog_to_graylog_port") ||
9919 changed.count("host") ||
9920 changed.count("fsid")) {
9921 update_log_config();
9922 }
11fdf7f2
TL
9923 if (changed.count("osd_pg_epoch_max_lag_factor")) {
9924 m_osd_pg_epoch_max_lag_factor = conf.get_val<double>(
9925 "osd_pg_epoch_max_lag_factor");
9926 }
7c673cae
FG
9927
9928#ifdef HAVE_LIBFUSE
9929 if (changed.count("osd_objectstore_fuse")) {
9930 if (store) {
9931 enable_disable_fuse(false);
9932 }
9933 }
9934#endif
9935
9936 if (changed.count("osd_recovery_delay_start")) {
9937 service.defer_recovery(cct->_conf->osd_recovery_delay_start);
9938 service.kick_recovery_queue();
9939 }
9940
9941 if (changed.count("osd_client_message_cap")) {
9942 uint64_t newval = cct->_conf->osd_client_message_cap;
9943 Messenger::Policy pol = client_messenger->get_policy(entity_name_t::TYPE_CLIENT);
9944 if (pol.throttler_messages && newval > 0) {
9945 pol.throttler_messages->reset_max(newval);
9946 }
9947 }
9948 if (changed.count("osd_client_message_size_cap")) {
9949 uint64_t newval = cct->_conf->osd_client_message_size_cap;
9950 Messenger::Policy pol = client_messenger->get_policy(entity_name_t::TYPE_CLIENT);
9951 if (pol.throttler_bytes && newval > 0) {
9952 pol.throttler_bytes->reset_max(newval);
9953 }
9954 }
9955
9956 check_config();
9957}
9958
9959void OSD::update_log_config()
9960{
9961 map<string,string> log_to_monitors;
9962 map<string,string> log_to_syslog;
9963 map<string,string> log_channel;
9964 map<string,string> log_prio;
9965 map<string,string> log_to_graylog;
9966 map<string,string> log_to_graylog_host;
9967 map<string,string> log_to_graylog_port;
9968 uuid_d fsid;
9969 string host;
9970
9971 if (parse_log_client_options(cct, log_to_monitors, log_to_syslog,
9972 log_channel, log_prio, log_to_graylog,
9973 log_to_graylog_host, log_to_graylog_port,
9974 fsid, host) == 0)
9975 clog->update_config(log_to_monitors, log_to_syslog,
9976 log_channel, log_prio, log_to_graylog,
9977 log_to_graylog_host, log_to_graylog_port,
9978 fsid, host);
9979 derr << "log_to_monitors " << log_to_monitors << dendl;
9980}
9981
9982void OSD::check_config()
9983{
9984 // some sanity checks
7c673cae
FG
9985 if (cct->_conf->osd_map_cache_size <= (int)cct->_conf->osd_pg_epoch_persisted_max_stale + 2) {
9986 clog->warn() << "osd_map_cache_size (" << cct->_conf->osd_map_cache_size << ")"
9987 << " is not > osd_pg_epoch_persisted_max_stale ("
9988 << cct->_conf->osd_pg_epoch_persisted_max_stale << ")";
9989 }
9990}
9991
7c673cae
FG
9992// --------------------------------
9993
9994void OSD::get_latest_osdmap()
9995{
9996 dout(10) << __func__ << " -- start" << dendl;
9997
9998 C_SaferCond cond;
9999 service.objecter->wait_for_latest_osdmap(&cond);
10000 cond.wait();
10001
10002 dout(10) << __func__ << " -- finish" << dendl;
10003}
10004
10005// --------------------------------
10006
10007int OSD::init_op_flags(OpRequestRef& op)
10008{
10009 const MOSDOp *m = static_cast<const MOSDOp*>(op->get_req());
10010 vector<OSDOp>::const_iterator iter;
10011
10012 // client flags have no bearing on whether an op is a read, write, etc.
10013 op->rmw_flags = 0;
10014
10015 if (m->has_flag(CEPH_OSD_FLAG_RWORDERED)) {
10016 op->set_force_rwordered();
10017 }
10018
10019 // set bits based on op codes, called methods.
10020 for (iter = m->ops.begin(); iter != m->ops.end(); ++iter) {
10021 if ((iter->op.op == CEPH_OSD_OP_WATCH &&
10022 iter->op.watch.op == CEPH_OSD_WATCH_OP_PING)) {
10023 /* This a bit odd. PING isn't actually a write. It can't
11fdf7f2
TL
10024 * result in an update to the object_info. PINGs also aren't
10025 * resent, so there's no reason to write out a log entry.
7c673cae
FG
10026 *
10027 * However, we pipeline them behind writes, so let's force
10028 * the write_ordered flag.
10029 */
10030 op->set_force_rwordered();
10031 } else {
10032 if (ceph_osd_op_mode_modify(iter->op.op))
10033 op->set_write();
10034 }
10035 if (ceph_osd_op_mode_read(iter->op.op))
10036 op->set_read();
10037
10038 // set READ flag if there are src_oids
10039 if (iter->soid.oid.name.length())
10040 op->set_read();
10041
10042 // set PGOP flag if there are PG ops
10043 if (ceph_osd_op_type_pg(iter->op.op))
10044 op->set_pg_op();
10045
10046 if (ceph_osd_op_mode_cache(iter->op.op))
10047 op->set_cache();
10048
10049 // check for ec base pool
10050 int64_t poolid = m->get_pg().pool();
10051 const pg_pool_t *pool = osdmap->get_pg_pool(poolid);
10052 if (pool && pool->is_tier()) {
10053 const pg_pool_t *base_pool = osdmap->get_pg_pool(pool->tier_of);
10054 if (base_pool && base_pool->require_rollback()) {
10055 if ((iter->op.op != CEPH_OSD_OP_READ) &&
10056 (iter->op.op != CEPH_OSD_OP_CHECKSUM) &&
c07f9fc5 10057 (iter->op.op != CEPH_OSD_OP_CMPEXT) &&
7c673cae
FG
10058 (iter->op.op != CEPH_OSD_OP_STAT) &&
10059 (iter->op.op != CEPH_OSD_OP_ISDIRTY) &&
10060 (iter->op.op != CEPH_OSD_OP_UNDIRTY) &&
10061 (iter->op.op != CEPH_OSD_OP_GETXATTR) &&
10062 (iter->op.op != CEPH_OSD_OP_GETXATTRS) &&
10063 (iter->op.op != CEPH_OSD_OP_CMPXATTR) &&
10064 (iter->op.op != CEPH_OSD_OP_ASSERT_VER) &&
10065 (iter->op.op != CEPH_OSD_OP_LIST_WATCHERS) &&
10066 (iter->op.op != CEPH_OSD_OP_LIST_SNAPS) &&
10067 (iter->op.op != CEPH_OSD_OP_SETALLOCHINT) &&
10068 (iter->op.op != CEPH_OSD_OP_WRITEFULL) &&
10069 (iter->op.op != CEPH_OSD_OP_ROLLBACK) &&
10070 (iter->op.op != CEPH_OSD_OP_CREATE) &&
10071 (iter->op.op != CEPH_OSD_OP_DELETE) &&
10072 (iter->op.op != CEPH_OSD_OP_SETXATTR) &&
10073 (iter->op.op != CEPH_OSD_OP_RMXATTR) &&
10074 (iter->op.op != CEPH_OSD_OP_STARTSYNC) &&
10075 (iter->op.op != CEPH_OSD_OP_COPY_GET) &&
10076 (iter->op.op != CEPH_OSD_OP_COPY_FROM)) {
10077 op->set_promote();
10078 }
10079 }
10080 }
10081
10082 switch (iter->op.op) {
10083 case CEPH_OSD_OP_CALL:
10084 {
10085 bufferlist::iterator bp = const_cast<bufferlist&>(iter->indata).begin();
10086 int is_write, is_read;
10087 string cname, mname;
10088 bp.copy(iter->op.cls.class_len, cname);
10089 bp.copy(iter->op.cls.method_len, mname);
10090
10091 ClassHandler::ClassData *cls;
10092 int r = class_handler->open_class(cname, &cls);
10093 if (r) {
10094 derr << "class " << cname << " open got " << cpp_strerror(r) << dendl;
10095 if (r == -ENOENT)
10096 r = -EOPNOTSUPP;
10097 else if (r != -EPERM) // propagate permission errors
10098 r = -EIO;
10099 return r;
10100 }
10101 int flags = cls->get_method_flags(mname.c_str());
10102 if (flags < 0) {
10103 if (flags == -ENOENT)
10104 r = -EOPNOTSUPP;
10105 else
10106 r = flags;
10107 return r;
10108 }
10109 is_read = flags & CLS_METHOD_RD;
10110 is_write = flags & CLS_METHOD_WR;
10111 bool is_promote = flags & CLS_METHOD_PROMOTE;
10112
10113 dout(10) << "class " << cname << " method " << mname << " "
10114 << "flags=" << (is_read ? "r" : "")
10115 << (is_write ? "w" : "")
10116 << (is_promote ? "p" : "")
10117 << dendl;
10118 if (is_read)
10119 op->set_class_read();
10120 if (is_write)
10121 op->set_class_write();
10122 if (is_promote)
10123 op->set_promote();
11fdf7f2
TL
10124 op->add_class(std::move(cname), std::move(mname), is_read, is_write,
10125 cls->whitelisted);
7c673cae
FG
10126 break;
10127 }
10128
10129 case CEPH_OSD_OP_WATCH:
10130 // force the read bit for watch since it is depends on previous
10131 // watch state (and may return early if the watch exists) or, in
10132 // the case of ping, is simply a read op.
10133 op->set_read();
10134 // fall through
10135 case CEPH_OSD_OP_NOTIFY:
10136 case CEPH_OSD_OP_NOTIFY_ACK:
10137 {
10138 op->set_promote();
10139 break;
10140 }
10141
10142 case CEPH_OSD_OP_DELETE:
10143 // if we get a delete with FAILOK we can skip handle cache. without
10144 // FAILOK we still need to promote (or do something smarter) to
10145 // determine whether to return ENOENT or 0.
10146 if (iter == m->ops.begin() &&
10147 iter->op.flags == CEPH_OSD_OP_FLAG_FAILOK) {
10148 op->set_skip_handle_cache();
10149 }
10150 // skip promotion when proxying a delete op
10151 if (m->ops.size() == 1) {
10152 op->set_skip_promote();
10153 }
10154 break;
10155
10156 case CEPH_OSD_OP_CACHE_TRY_FLUSH:
10157 case CEPH_OSD_OP_CACHE_FLUSH:
10158 case CEPH_OSD_OP_CACHE_EVICT:
10159 // If try_flush/flush/evict is the only op, can skip handle cache.
10160 if (m->ops.size() == 1) {
10161 op->set_skip_handle_cache();
10162 }
10163 break;
10164
10165 case CEPH_OSD_OP_READ:
10166 case CEPH_OSD_OP_SYNC_READ:
10167 case CEPH_OSD_OP_SPARSE_READ:
10168 case CEPH_OSD_OP_CHECKSUM:
10169 case CEPH_OSD_OP_WRITEFULL:
10170 if (m->ops.size() == 1 &&
10171 (iter->op.flags & CEPH_OSD_OP_FLAG_FADVISE_NOCACHE ||
10172 iter->op.flags & CEPH_OSD_OP_FLAG_FADVISE_DONTNEED)) {
10173 op->set_skip_promote();
10174 }
10175 break;
10176
10177 // force promotion when pin an object in cache tier
10178 case CEPH_OSD_OP_CACHE_PIN:
10179 op->set_promote();
10180 break;
10181
10182 default:
10183 break;
10184 }
10185 }
10186
10187 if (op->rmw_flags == 0)
10188 return -EINVAL;
10189
10190 return 0;
10191}
10192
11fdf7f2
TL
10193void OSD::set_perf_queries(
10194 const std::map<OSDPerfMetricQuery, OSDPerfMetricLimits> &queries) {
10195 dout(10) << "setting " << queries.size() << " queries" << dendl;
10196
10197 std::list<OSDPerfMetricQuery> supported_queries;
10198 for (auto &it : queries) {
10199 auto &query = it.first;
10200 if (!query.key_descriptor.empty()) {
10201 supported_queries.push_back(query);
10202 }
10203 }
10204 if (supported_queries.size() < queries.size()) {
10205 dout(1) << queries.size() - supported_queries.size()
10206 << " unsupported queries" << dendl;
10207 }
10208
10209 {
10210 Mutex::Locker locker(m_perf_queries_lock);
10211 m_perf_queries = supported_queries;
10212 m_perf_limits = queries;
10213 }
10214
10215 std::vector<PGRef> pgs;
10216 _get_pgs(&pgs);
10217 for (auto& pg : pgs) {
10218 if (pg->is_primary()) {
10219 pg->lock();
10220 pg->set_dynamic_perf_stats_queries(supported_queries);
10221 pg->unlock();
10222 }
7c673cae 10223 }
7c673cae
FG
10224}
10225
11fdf7f2
TL
10226void OSD::get_perf_reports(
10227 std::map<OSDPerfMetricQuery, OSDPerfMetricReport> *reports) {
10228 std::vector<PGRef> pgs;
10229 _get_pgs(&pgs);
10230 DynamicPerfStats dps;
10231 for (auto& pg : pgs) {
10232 if (pg->is_primary()) {
10233 // m_perf_queries can be modified only in set_perf_queries by mgr client
10234 // request, and it is protected by by mgr client's lock, which is held
10235 // when set_perf_queries/get_perf_reports are called, so we may not hold
10236 // m_perf_queries_lock here.
10237 DynamicPerfStats pg_dps(m_perf_queries);
10238 pg->lock();
10239 pg->get_dynamic_perf_stats(&pg_dps);
10240 pg->unlock();
10241 dps.merge(pg_dps);
10242 }
10243 }
10244 dps.add_to_reports(m_perf_limits, reports);
10245 dout(20) << "reports for " << reports->size() << " queries" << dendl;
10246}
224ce89b 10247
7c673cae
FG
10248// =============================================================
10249
10250#undef dout_context
11fdf7f2 10251#define dout_context cct
7c673cae 10252#undef dout_prefix
11fdf7f2 10253#define dout_prefix *_dout << "osd." << osd->get_nodeid() << ":" << shard_id << "." << __func__ << " "
7c673cae 10254
11fdf7f2 10255void OSDShard::_attach_pg(OSDShardPGSlot *slot, PG *pg)
7c673cae 10256{
11fdf7f2
TL
10257 dout(10) << pg->pg_id << " " << pg << dendl;
10258 slot->pg = pg;
10259 pg->osd_shard = this;
10260 pg->pg_slot = slot;
10261 osd->inc_num_pgs();
10262
10263 slot->epoch = pg->get_osdmap_epoch();
10264 pg_slots_by_epoch.insert(*slot);
10265}
10266
10267void OSDShard::_detach_pg(OSDShardPGSlot *slot)
10268{
10269 dout(10) << slot->pg->pg_id << " " << slot->pg << dendl;
10270 slot->pg->osd_shard = nullptr;
10271 slot->pg->pg_slot = nullptr;
10272 slot->pg = nullptr;
10273 osd->dec_num_pgs();
10274
10275 pg_slots_by_epoch.erase(pg_slots_by_epoch.iterator_to(*slot));
10276 slot->epoch = 0;
10277 if (waiting_for_min_pg_epoch) {
10278 min_pg_epoch_cond.notify_all();
10279 }
10280}
10281
10282void OSDShard::update_pg_epoch(OSDShardPGSlot *slot, epoch_t e)
10283{
10284 std::lock_guard l(shard_lock);
10285 dout(30) << "min was " << pg_slots_by_epoch.begin()->epoch
10286 << " on " << pg_slots_by_epoch.begin()->pg->pg_id << dendl;
10287 pg_slots_by_epoch.erase(pg_slots_by_epoch.iterator_to(*slot));
10288 dout(20) << slot->pg->pg_id << " " << slot->epoch << " -> " << e << dendl;
10289 slot->epoch = e;
10290 pg_slots_by_epoch.insert(*slot);
10291 dout(30) << "min is now " << pg_slots_by_epoch.begin()->epoch
10292 << " on " << pg_slots_by_epoch.begin()->pg->pg_id << dendl;
10293 if (waiting_for_min_pg_epoch) {
10294 min_pg_epoch_cond.notify_all();
10295 }
10296}
10297
10298epoch_t OSDShard::get_min_pg_epoch()
10299{
10300 std::lock_guard l(shard_lock);
10301 auto p = pg_slots_by_epoch.begin();
10302 if (p == pg_slots_by_epoch.end()) {
10303 return 0;
10304 }
10305 return p->epoch;
10306}
10307
10308void OSDShard::wait_min_pg_epoch(epoch_t need)
10309{
10310 std::unique_lock l{shard_lock};
10311 ++waiting_for_min_pg_epoch;
10312 min_pg_epoch_cond.wait(l, [need, this] {
10313 if (pg_slots_by_epoch.empty()) {
10314 return true;
10315 } else if (pg_slots_by_epoch.begin()->epoch >= need) {
10316 return true;
10317 } else {
10318 dout(10) << need << " waiting on "
10319 << pg_slots_by_epoch.begin()->epoch << dendl;
10320 return false;
10321 }
10322 });
10323 --waiting_for_min_pg_epoch;
10324}
10325
10326epoch_t OSDShard::get_max_waiting_epoch()
10327{
10328 std::lock_guard l(shard_lock);
10329 epoch_t r = 0;
10330 for (auto& i : pg_slots) {
10331 if (!i.second->waiting_peering.empty()) {
10332 r = std::max(r, i.second->waiting_peering.rbegin()->first);
10333 }
10334 }
10335 return r;
10336}
10337
// Install new_osdmap as this shard's map, then walk every PG slot:
// requeue parked peering events the new map unblocks, drop queued
// items that are stale or no longer map to this OSD (crediting their
// reserved pushes to *pushes_to_free), and prune slots left empty.
10338void OSDShard::consume_map(
10339  OSDMapRef& new_osdmap,
10340  unsigned *pushes_to_free)
10341{
10342  std::lock_guard l(shard_lock);
10343  OSDMapRef old_osdmap;
// swap in the new map under osdmap_lock so concurrent readers of
// shard_osdmap always see a consistent reference
7c673cae 10344 {
11fdf7f2
TL
10345    std::lock_guard l(osdmap_lock);
10346    old_osdmap = std::move(shard_osdmap);
10347    shard_osdmap = new_osdmap;
10348  }
10349  dout(10) << new_osdmap->get_epoch()
10350           << " (was " << (old_osdmap ? old_osdmap->get_epoch() : 0) << ")"
10351           << dendl;
10352  bool queued = false;
10353
10354  // check slots
10355  auto p = pg_slots.begin();
10356  while (p != pg_slots.end()) {
10357    OSDShardPGSlot *slot = p->second.get();
10358    const spg_t& pgid = p->first;
10359    dout(20) << __func__ << " " << pgid << dendl;
// slots held back for a split or a future merge are left untouched
10360    if (!slot->waiting_for_split.empty()) {
10361      dout(20) << __func__ << " " << pgid
10362               << " waiting for split " << slot->waiting_for_split << dendl;
10363      ++p;
10364      continue;
10365    }
10366    if (slot->waiting_for_merge_epoch > new_osdmap->get_epoch()) {
10367      dout(20) << __func__ << " " << pgid
10368               << " waiting for merge by epoch " << slot->waiting_for_merge_epoch
10369               << dendl;
10370      ++p;
10371      continue;
10372    }
// the new map may satisfy the epoch the oldest parked peering event needs
10373    if (!slot->waiting_peering.empty()) {
10374      epoch_t first = slot->waiting_peering.begin()->first;
10375      if (first <= new_osdmap->get_epoch()) {
10376        dout(20) << __func__ << " " << pgid
10377                 << " pending_peering first epoch " << first
10378                 << " <= " << new_osdmap->get_epoch() << ", requeueing" << dendl;
10379        _wake_pg_slot(pgid, slot);
10380        queued = true;
10381      }
10382      ++p;
10383      continue;
10384    }
10385    if (!slot->waiting.empty()) {
10386      if (new_osdmap->is_up_acting_osd_shard(pgid, osd->get_nodeid())) {
10387        dout(20) << __func__ << " " << pgid << " maps to us, keeping"
10388                 << dendl;
10389        ++p;
10390        continue;
7c673cae 10391 }
// pg no longer maps here: items at or below the new epoch are
// provably stale/misdirected; drop them and refund reserved pushes
11fdf7f2
TL
10392      while (!slot->waiting.empty() &&
10393             slot->waiting.front().get_map_epoch() <= new_osdmap->get_epoch()) {
10394        auto& qi = slot->waiting.front();
10395        dout(20) << __func__ << " " << pgid
10396                 << " waiting item " << qi
10397                 << " epoch " << qi.get_map_epoch()
10398                 << " <= " << new_osdmap->get_epoch()
10399                 << ", "
10400                 << (qi.get_map_epoch() < new_osdmap->get_epoch() ? "stale" :
10401                     "misdirected")
10402                 << ", dropping" << dendl;
10403        *pushes_to_free += qi.get_reserved_pushes();
10404        slot->waiting.pop_front();
10405      }
10406    }
// nothing queued, running, splitting, or instantiated -- free the slot
10407    if (slot->waiting.empty() &&
10408        slot->num_running == 0 &&
10409        slot->waiting_for_split.empty() &&
10410        !slot->pg) {
10411      dout(20) << __func__ << " " << pgid << " empty, pruning" << dendl;
10412      p = pg_slots.erase(p);
10413      continue;
7c673cae 10414 }
11fdf7f2
TL
10415
10416    ++p;
7c673cae 10417 }
// wake a worker thread if we requeued anything above
7c673cae 10418 if (queued) {
11fdf7f2
TL
10419    std::lock_guard l{sdata_wait_lock};
10420    sdata_cond.notify_one();
7c673cae
FG
10421  }
10422}
10423
11fdf7f2
TL
// Requeue everything parked on a PG slot back onto the shard's pqueue.
// Caller must hold shard_lock.  Each list is walked in *reverse* and
// pushed to the queue front, which preserves the items' original
// relative order; requeue_seq is bumped so a racing _process() notices
// its snapshot of the slot is out of date.
10424void OSDShard::_wake_pg_slot(
10425  spg_t pgid,
10426  OSDShardPGSlot *slot)
10427{
10428  dout(20) << __func__ << " " << pgid
10429	   << " to_process " << slot->to_process
10430	   << " waiting " << slot->waiting
10431	   << " waiting_peering " << slot->waiting_peering << dendl;
// in-flight items first: they must run before anything newly waiting
10432  for (auto i = slot->to_process.rbegin();
10433       i != slot->to_process.rend();
10434       ++i) {
10435    _enqueue_front(std::move(*i), osd->op_prio_cutoff);
10436  }
10437  slot->to_process.clear();
10438  for (auto i = slot->waiting.rbegin();
10439       i != slot->waiting.rend();
10440       ++i) {
10441    _enqueue_front(std::move(*i), osd->op_prio_cutoff);
10442  }
10443  slot->waiting.clear();
10444  for (auto i = slot->waiting_peering.rbegin();
10445       i != slot->waiting_peering.rend();
10446       ++i) {
10447    // this is overkill; we requeue everything, even if some of these
10448    // items are waiting for maps we don't have yet.  FIXME, maybe,
10449    // someday, if we decide this inefficiency matters
10450    for (auto j = i->second.rbegin(); j != i->second.rend(); ++j) {
10451      _enqueue_front(std::move(*j), osd->op_prio_cutoff);
10452    }
10453  }
10454  slot->waiting_peering.clear();
// invalidate any requeue_seq snapshots taken by concurrent _process()
10455  ++slot->requeue_seq;
10456}
10457
10458void OSDShard::identify_splits_and_merges(
10459 const OSDMapRef& as_of_osdmap,
10460 set<pair<spg_t,epoch_t>> *split_pgs,
10461 set<pair<spg_t,epoch_t>> *merge_pgs)
10462{
10463 std::lock_guard l(shard_lock);
10464 if (shard_osdmap) {
10465 for (auto& i : pg_slots) {
10466 const spg_t& pgid = i.first;
10467 auto *slot = i.second.get();
10468 if (slot->pg) {
10469 osd->service.identify_splits_and_merges(
10470 shard_osdmap, as_of_osdmap, pgid,
10471 split_pgs, merge_pgs);
10472 } else if (!slot->waiting_for_split.empty()) {
10473 osd->service.identify_splits_and_merges(
10474 shard_osdmap, as_of_osdmap, pgid,
10475 split_pgs, nullptr);
10476 } else {
10477 dout(20) << __func__ << " slot " << pgid
10478 << " has no pg and waiting_for_split "
10479 << slot->waiting_for_split << dendl;
7c673cae 10480 }
11fdf7f2
TL
10481 }
10482 }
10483}
10484
10485void OSDShard::prime_splits(const OSDMapRef& as_of_osdmap,
10486 set<pair<spg_t,epoch_t>> *pgids)
10487{
10488 std::lock_guard l(shard_lock);
10489 _prime_splits(pgids);
10490 if (shard_osdmap->get_epoch() > as_of_osdmap->get_epoch()) {
10491 set<pair<spg_t,epoch_t>> newer_children;
10492 for (auto i : *pgids) {
10493 osd->service.identify_splits_and_merges(
10494 as_of_osdmap, shard_osdmap, i.first,
10495 &newer_children, nullptr);
10496 }
10497 newer_children.insert(pgids->begin(), pgids->end());
10498 dout(10) << "as_of_osdmap " << as_of_osdmap->get_epoch() << " < shard "
10499 << shard_osdmap->get_epoch() << ", new children " << newer_children
10500 << dendl;
10501 _prime_splits(&newer_children);
10502 // note: we don't care what is left over here for other shards.
10503 // if this shard is ahead of us and one isn't, e.g., one thread is
10504 // calling into prime_splits via _process (due to a newly created
10505 // pg) and this shard has a newer map due to a racing consume_map,
10506 // then any grandchildren left here will be identified (or were
10507 // identified) when the slower shard's osdmap is advanced.
10508 // _prime_splits() will tolerate the case where the pgid is
10509 // already primed.
10510 }
10511}
10512
10513void OSDShard::_prime_splits(set<pair<spg_t,epoch_t>> *pgids)
10514{
10515 dout(10) << *pgids << dendl;
10516 auto p = pgids->begin();
10517 while (p != pgids->end()) {
10518 unsigned shard_index = p->first.hash_to_shard(osd->num_shards);
10519 if (shard_index == shard_id) {
10520 auto r = pg_slots.emplace(p->first, nullptr);
10521 if (r.second) {
10522 dout(10) << "priming slot " << p->first << " e" << p->second << dendl;
10523 r.first->second = make_unique<OSDShardPGSlot>();
10524 r.first->second->waiting_for_split.insert(p->second);
7c673cae 10525 } else {
11fdf7f2
TL
10526 auto q = r.first;
10527 ceph_assert(q != pg_slots.end());
10528 dout(10) << "priming (existing) slot " << p->first << " e" << p->second
10529 << dendl;
10530 q->second->waiting_for_split.insert(p->second);
7c673cae 10531 }
11fdf7f2
TL
10532 p = pgids->erase(p);
10533 } else {
10534 ++p;
7c673cae
FG
10535 }
10536 }
11fdf7f2
TL
10537}
10538
// For each merge participant in *merge_pgs that hashes to this shard:
// ensure a slot exists, instantiate an empty placeholder PG if the
// slot has none (unless a pre-merge split is still pending), and stamp
// the slot with the merge epoch so queued work is held back until the
// merge completes.  Consumed entries are erased from *merge_pgs.
10539void OSDShard::prime_merges(const OSDMapRef& as_of_osdmap,
10540			    set<pair<spg_t,epoch_t>> *merge_pgs)
10541{
10542  std::lock_guard l(shard_lock);
10543  dout(20) << __func__ << " checking shard " << shard_id
10544	   << " for remaining merge pgs " << merge_pgs << dendl;
10545  auto p = merge_pgs->begin();
10546  while (p != merge_pgs->end()) {
10547    spg_t pgid = p->first;
10548    epoch_t epoch = p->second;
10549    unsigned shard_index = pgid.hash_to_shard(osd->num_shards);
// not ours; another shard's prime_merges() will consume it
10550    if (shard_index != shard_id) {
10551      ++p;
10552      continue;
10553    }
// create the slot on demand
10554    OSDShardPGSlot *slot;
10555    auto r = pg_slots.emplace(pgid, nullptr);
10556    if (r.second) {
10557      r.first->second = make_unique<OSDShardPGSlot>();
10558    }
10559    slot = r.first->second.get();
10560    if (slot->pg) {
10561      // already have pg
10562      dout(20) << __func__ << " have merge participant pg " << pgid
10563	       << " " << slot->pg << dendl;
10564    } else if (!slot->waiting_for_split.empty() &&
10565	       *slot->waiting_for_split.begin() < epoch) {
// a split must materialize this pg before the merge epoch; don't
// create a placeholder over it
10566      dout(20) << __func__ << " pending split on merge participant pg " << pgid
10567	       << " " << slot->waiting_for_split << dendl;
10568    } else {
10569      dout(20) << __func__ << " creating empty merge participant " << pgid
10570	       << " for merge in " << epoch << dendl;
10571      // leave history zeroed; PG::merge_from() will fill it in.
10572      pg_history_t history;
10573      PGCreateInfo cinfo(pgid, epoch - 1,
10574			 history, PastIntervals(), false);
10575      PGRef pg = osd->handle_pg_create_info(shard_osdmap, &cinfo);
// handle_pg_create_info returns the pg locked (we unlock it below
// after attaching it and requeueing the slot's deferred work)
10576      _attach_pg(r.first->second.get(), pg.get());
10577      _wake_pg_slot(pgid, slot);
10578      pg->unlock();
10579    }
10580    // mark slot for merge
10581    dout(20) << __func__ << " marking merge participant " << pgid << dendl;
10582    slot->waiting_for_merge_epoch = epoch;
10583    p = merge_pgs->erase(p);
7c673cae
FG
10584  }
10585}
10586
11fdf7f2 10587void OSDShard::register_and_wake_split_child(PG *pg)
7c673cae 10588{
11fdf7f2
TL
10589 epoch_t epoch;
10590 {
10591 std::lock_guard l(shard_lock);
10592 dout(10) << pg->pg_id << " " << pg << dendl;
10593 auto p = pg_slots.find(pg->pg_id);
10594 ceph_assert(p != pg_slots.end());
10595 auto *slot = p->second.get();
10596 dout(20) << pg->pg_id << " waiting_for_split " << slot->waiting_for_split
10597 << dendl;
10598 ceph_assert(!slot->pg);
10599 ceph_assert(!slot->waiting_for_split.empty());
10600 _attach_pg(slot, pg);
10601
10602 epoch = pg->get_osdmap_epoch();
10603 ceph_assert(slot->waiting_for_split.count(epoch));
10604 slot->waiting_for_split.erase(epoch);
10605 if (slot->waiting_for_split.empty()) {
10606 _wake_pg_slot(pg->pg_id, slot);
10607 } else {
10608 dout(10) << __func__ << " still waiting for split on "
10609 << slot->waiting_for_split << dendl;
10610 }
7c673cae 10611 }
11fdf7f2
TL
10612
10613 // kick child to ensure it pulls up to the latest osdmap
10614 osd->enqueue_peering_evt(
10615 pg->pg_id,
10616 PGPeeringEventRef(
10617 std::make_shared<PGPeeringEvent>(
10618 epoch,
10619 epoch,
10620 NullEvt())));
10621
10622 std::lock_guard l{sdata_wait_lock};
10623 sdata_cond.notify_one();
7c673cae
FG
10624}
10625
11fdf7f2 10626void OSDShard::unprime_split_children(spg_t parent, unsigned old_pg_num)
7c673cae 10627{
11fdf7f2
TL
10628 std::lock_guard l(shard_lock);
10629 vector<spg_t> to_delete;
10630 for (auto& i : pg_slots) {
10631 if (i.first != parent &&
10632 i.first.get_ancestor(old_pg_num) == parent) {
10633 dout(10) << __func__ << " parent " << parent << " clearing " << i.first
10634 << dendl;
10635 _wake_pg_slot(i.first, i.second.get());
10636 to_delete.push_back(i.first);
10637 }
10638 }
10639 for (auto pgid : to_delete) {
10640 pg_slots.erase(pgid);
10641 }
10642}
10643
10644
10645// =============================================================
10646
10647#undef dout_context
10648#define dout_context osd->cct
10649#undef dout_prefix
10650#define dout_prefix *_dout << "osd." << osd->whoami << " op_wq "
10651
10652void OSD::ShardedOpWQ::_add_slot_waiter(
10653 spg_t pgid,
10654 OSDShardPGSlot *slot,
10655 OpQueueItem&& qi)
10656{
10657 if (qi.is_peering()) {
10658 dout(20) << __func__ << " " << pgid
10659 << " peering, item epoch is "
10660 << qi.get_map_epoch()
10661 << ", will wait on " << qi << dendl;
10662 slot->waiting_peering[qi.get_map_epoch()].push_back(std::move(qi));
10663 } else {
10664 dout(20) << __func__ << " " << pgid
10665 << " item epoch is "
10666 << qi.get_map_epoch()
10667 << ", will wait on " << qi << dendl;
10668 slot->waiting.push_back(std::move(qi));
7c673cae
FG
10669 }
10670}
10671
10672#undef dout_prefix
10673#define dout_prefix *_dout << "osd." << osd->whoami << " op_wq(" << shard_index << ") "
10674
// Worker-thread entry point: pull one item for this thread's shard and
// run it.  Responsibilities visible below: idle waiting on sdata_cond;
// draining oncommit contexts (only the lowest-indexed thread of each
// shard, to keep completions ordered); per-slot bookkeeping; the
// lock-ordering dance (shard_lock must be dropped to take the pg lock,
// then revalidate the slot); and the pg-less cases (create / wait /
// drop) while the slot has no PG attached.
10675void OSD::ShardedOpWQ::_process(uint32_t thread_index, heartbeat_handle_d *hb)
10676{
11fdf7f2
TL
10677  uint32_t shard_index = thread_index % osd->num_shards;
10678  auto& sdata = osd->shards[shard_index];
10679  ceph_assert(sdata);
10680
10681  // If all threads of shards do oncommits, there is a out-of-order
10682  // problem.  So we choose the thread which has the smallest
10683  // thread_index(thread_index < num_shards) of shard to do oncommit
10684  // callback.
10685  bool is_smallest_thread_index = thread_index < osd->num_shards;
7c673cae
FG
10686
10687  // peek at spg_t
11fdf7f2
TL
10688  sdata->shard_lock.lock();
10689  if (sdata->pqueue->empty() &&
10690      (!is_smallest_thread_index || sdata->context_queue.empty())) {
10691    std::unique_lock wait_lock{sdata->sdata_wait_lock};
10692    if (is_smallest_thread_index && !sdata->context_queue.empty()) {
10693      // we raced with a context_queue addition, don't wait
10694      wait_lock.unlock();
10695    } else if (!sdata->stop_waiting) {
// suspend the heartbeat timeout while idle so we aren't flagged stuck
10696      dout(20) << __func__ << " empty q, waiting" << dendl;
10697      osd->cct->get_heartbeat_map()->clear_timeout(hb);
10698      sdata->shard_lock.unlock();
10699      sdata->sdata_cond.wait(wait_lock);
10700      wait_lock.unlock();
10701      sdata->shard_lock.lock();
10702      if (sdata->pqueue->empty() &&
10703         !(is_smallest_thread_index && !sdata->context_queue.empty())) {
10704	sdata->shard_lock.unlock();
10705	return;
10706      }
10707      osd->cct->get_heartbeat_map()->reset_timeout(hb,
10708	  osd->cct->_conf->threadpool_default_timeout, 0);
10709    } else {
10710      dout(20) << __func__ << " need return immediately" << dendl;
10711      wait_lock.unlock();
10712      sdata->shard_lock.unlock();
7c673cae
FG
10713      return;
10714    }
10715  }
11fdf7f2
TL
10716
// grab any pending oncommit contexts (lowest-indexed thread only)
10717  list<Context *> oncommits;
10718  if (is_smallest_thread_index && !sdata->context_queue.empty()) {
10719    sdata->context_queue.swap(oncommits);
7c673cae 10720 }
11fdf7f2
TL
10721
10722  if (sdata->pqueue->empty()) {
10723    if (osd->is_stopping()) {
10724      sdata->shard_lock.unlock();
10725      for (auto c : oncommits) {
10726	dout(10) << __func__ << " discarding in-flight oncommit " << c << dendl;
10727	delete c;
10728      }
10729      return;    // OSD shutdown, discard.
7c673cae 10730 }
11fdf7f2
TL
10731    sdata->shard_lock.unlock();
10732    handle_oncommits(oncommits);
10733    return;
7c673cae 10734 }
7c673cae 10735
11fdf7f2
TL
10736  OpQueueItem item = sdata->pqueue->dequeue();
10737  if (osd->is_stopping()) {
10738    sdata->shard_lock.unlock();
10739    for (auto c : oncommits) {
10740      dout(10) << __func__ << " discarding in-flight oncommit " << c << dendl;
10741      delete c;
10742    }
10743    return;    // OSD shutdown, discard.
10744  }
7c673cae 10745
// stage the item on its ordering slot (created on demand)
11fdf7f2
TL
10746  const auto token = item.get_ordering_token();
10747  auto r = sdata->pg_slots.emplace(token, nullptr);
10748  if (r.second) {
10749    r.first->second = make_unique<OSDShardPGSlot>();
7c673cae 10750 }
11fdf7f2
TL
10751  OSDShardPGSlot *slot = r.first->second.get();
10752  dout(20) << __func__ << " " << token
10753	   << (r.second ? " (new)" : "")
10754	   << " to_process " << slot->to_process
10755	   << " waiting " << slot->waiting
10756	   << " waiting_peering " << slot->waiting_peering
10757	   << dendl;
10758  slot->to_process.push_back(std::move(item));
10759  dout(20) << __func__ << " " << slot->to_process.back()
10760	   << " queued" << dendl;
7c673cae 10761
// shard_lock is held here; the pg lock (if any) is taken below, which
// requires dropping shard_lock and then revalidating the slot
11fdf7f2
TL
10762 retry_pg:
10763  PGRef pg = slot->pg;
7c673cae 10764
11fdf7f2
TL
10765  // lock pg (if we have it)
10766  if (pg) {
10767    // note the requeue seq now...
10768    uint64_t requeue_seq = slot->requeue_seq;
10769    ++slot->num_running;
7c673cae 10770
11fdf7f2
TL
10771    sdata->shard_lock.unlock();
10772    osd->service.maybe_inject_dispatch_delay();
10773    pg->lock();
10774    osd->service.maybe_inject_dispatch_delay();
10775    sdata->shard_lock.lock();
7c673cae 10776
// revalidate: the slot may have been removed, emptied, requeued, or
// re-attached while we held only the pg lock
11fdf7f2
TL
10777    auto q = sdata->pg_slots.find(token);
10778    if (q == sdata->pg_slots.end()) {
10779      // this can happen if we race with pg removal.
10780      dout(20) << __func__ << " slot " << token << " no longer there" << dendl;
10781      pg->unlock();
10782      sdata->shard_lock.unlock();
10783      handle_oncommits(oncommits);
10784      return;
10785    }
10786    slot = q->second.get();
10787    --slot->num_running;
7c673cae 10788
11fdf7f2
TL
10789    if (slot->to_process.empty()) {
10790      // raced with _wake_pg_slot or consume_map
10791      dout(20) << __func__ << " " << token
10792	       << " nothing queued" << dendl;
7c673cae 10793 pg->unlock();
11fdf7f2
TL
10794      sdata->shard_lock.unlock();
10795      handle_oncommits(oncommits);
10796      return;
7c673cae 10797 }
11fdf7f2
TL
10798    if (requeue_seq != slot->requeue_seq) {
10799      dout(20) << __func__ << " " << token
10800	       << " requeue_seq " << slot->requeue_seq << " > our "
10801	       << requeue_seq << ", we raced with _wake_pg_slot"
10802	       << dendl;
7c673cae 10803 pg->unlock();
11fdf7f2
TL
10804      sdata->shard_lock.unlock();
10805      handle_oncommits(oncommits);
10806      return;
7c673cae 10807 }
11fdf7f2
TL
10808    if (slot->pg != pg) {
10809      // this can happen if we race with pg removal.
10810      dout(20) << __func__ << " slot " << token << " no longer attached to "
10811	       << pg << dendl;
7c673cae 10812 pg->unlock();
11fdf7f2 10813 goto retry_pg;
7c673cae 10814 }
7c673cae
FG
10815  }
10816
11fdf7f2
TL
10817  dout(20) << __func__ << " " << token
10818	   << " to_process " << slot->to_process
10819	   << " waiting " << slot->waiting
10820	   << " waiting_peering " << slot->waiting_peering << dendl;
10821
10822  ThreadPool::TPHandle tp_handle(osd->cct, hb, timeout_interval,
10823				 suicide_interval);
10824
7c673cae 10825 // take next item
11fdf7f2
TL
10826  auto qi = std::move(slot->to_process.front());
10827  slot->to_process.pop_front();
10828  dout(20) << __func__ << " " << qi << " pg " << pg << dendl;
10829  set<pair<spg_t,epoch_t>> new_children;
10830  OSDMapRef osdmap;
7c673cae 10831
// no PG attached: decide whether to wait, create, run pg-less, or drop
11fdf7f2 10832 while (!pg) {
7c673cae 10833 // should this pg shard exist on this osd in this (or a later) epoch?
11fdf7f2
TL
10834    osdmap = sdata->shard_osdmap;
10835    const PGCreateInfo *create_info = qi.creates_pg();
10836    if (!slot->waiting_for_split.empty()) {
10837      dout(20) << __func__ << " " << token
10838	       << " splitting " << slot->waiting_for_split << dendl;
10839      _add_slot_waiter(token, slot, std::move(qi));
10840    } else if (qi.get_map_epoch() > osdmap->get_epoch()) {
10841      dout(20) << __func__ << " " << token
10842	       << " map " << qi.get_map_epoch() << " > "
10843	       << osdmap->get_epoch() << dendl;
10844      _add_slot_waiter(token, slot, std::move(qi));
10845    } else if (qi.is_peering()) {
10846      if (!qi.peering_requires_pg()) {
10847	// for pg-less events, we run them under the ordering lock, since
10848	// we don't have the pg lock to keep them ordered.
10849	qi.run(osd, sdata, pg, tp_handle);
10850      } else if (osdmap->is_up_acting_osd_shard(token, osd->whoami)) {
10851	if (create_info) {
10852	  if (create_info->by_mon &&
10853	      osdmap->get_pg_acting_primary(token.pgid) != osd->whoami) {
10854	    dout(20) << __func__ << " " << token
10855		     << " no pg, no longer primary, ignoring mon create on "
10856		     << qi << dendl;
10857	  } else {
10858	    dout(20) << __func__ << " " << token
10859		     << " no pg, should create on " << qi << dendl;
10860	    pg = osd->handle_pg_create_info(osdmap, create_info);
10861	    if (pg) {
10862	      // we created the pg!  drop out and continue "normally"!
10863	      sdata->_attach_pg(slot, pg.get());
10864	      sdata->_wake_pg_slot(token, slot);
10865
10866	      // identify split children between create epoch and shard epoch.
10867	      osd->service.identify_splits_and_merges(
10868		pg->get_osdmap(), osdmap, pg->pg_id, &new_children, nullptr);
10869	      sdata->_prime_splits(&new_children);
10870	      // distribute remaining split children to other shards below!
10871	      break;
10872	    }
10873	    dout(20) << __func__ << " ignored create on " << qi << dendl;
10874	  }
10875	} else {
10876	  dout(20) << __func__ << " " << token
10877		   << " no pg, peering, !create, discarding " << qi << dendl;
10878	}
10879      } else {
10880	dout(20) << __func__ << " " << token
10881		 << " no pg, peering, doesn't map here e" << osdmap->get_epoch()
10882		 << ", discarding " << qi
10883		 << dendl;
10884      }
10885    } else if (osdmap->is_up_acting_osd_shard(token, osd->whoami)) {
10886      dout(20) << __func__ << " " << token
10887	       << " no pg, should exist e" << osdmap->get_epoch()
10888	       << ", will wait on " << qi << dendl;
10889      _add_slot_waiter(token, slot, std::move(qi));
7c673cae 10890 } else {
11fdf7f2
TL
10891      dout(20) << __func__ << " " << token
10892	       << " no pg, shouldn't exist e" << osdmap->get_epoch()
10893	       << ", dropping " << qi << dendl;
7c673cae 10894 // share map with client?
11fdf7f2
TL
10895      if (boost::optional<OpRequestRef> _op = qi.maybe_get_op()) {
10896	auto priv = (*_op)->get_req()->get_connection()->get_priv();
10897	if (auto session = static_cast<Session *>(priv.get()); session) {
10898	  osd->maybe_share_map(session, *_op, sdata->shard_osdmap);
7c673cae
FG
10899	}
10900      }
// refund any recovery pushes the dropped item had reserved
11fdf7f2 10901 unsigned pushes_to_free = qi.get_reserved_pushes();
7c673cae 10902 if (pushes_to_free > 0) {
11fdf7f2 10903 sdata->shard_lock.unlock();
7c673cae 10904 osd->service.release_reserved_pushes(pushes_to_free);
11fdf7f2 10905 handle_oncommits(oncommits);
7c673cae
FG
10906	return;
10907      }
10908    }
11fdf7f2
TL
10909    sdata->shard_lock.unlock();
10910    handle_oncommits(oncommits);
7c673cae
FG
10911    return;
10912  }
// a peering item may still need a newer map than the shard has
11fdf7f2
TL
10913  if (qi.is_peering()) {
10914    OSDMapRef osdmap = sdata->shard_osdmap;
10915    if (qi.get_map_epoch() > osdmap->get_epoch()) {
10916      _add_slot_waiter(token, slot, std::move(qi));
10917      sdata->shard_lock.unlock();
10918      pg->unlock();
10919      handle_oncommits(oncommits);
10920      return;
10921    }
10922  }
10923  sdata->shard_lock.unlock();
7c673cae 10924
// hand split children discovered at pg-create time to their shards
11fdf7f2
TL
10925  if (!new_children.empty()) {
10926    for (auto shard : osd->shards) {
10927      shard->prime_splits(osdmap, &new_children);
10928    }
10929    ceph_assert(new_children.empty());
10930  }
7c673cae
FG
10931
10932  // osd_opwq_process marks the point at which an operation has been dequeued
10933  // and will begin to be handled by a worker thread.
10934  {
10935#ifdef WITH_LTTNG
10936    osd_reqid_t reqid;
11fdf7f2 10937 if (boost::optional<OpRequestRef> _op = qi.maybe_get_op()) {
7c673cae
FG
10938      reqid = (*_op)->get_reqid();
10939    }
10940#endif
10941    tracepoint(osd, opwq_process_start, reqid.name._type,
10942        reqid.name._num, reqid.tid, reqid.inc);
10943  }
10944
10945  lgeneric_subdout(osd->cct, osd, 30) << "dequeue status: ";
10946  Formatter *f = Formatter::create("json");
10947  f->open_object_section("q");
10948  dump(f);
10949  f->close_section();
10950  f->flush(*_dout);
10951  delete f;
10952  *_dout << dendl;
10953
// run the item (with the pg lock held if a pg is attached)
11fdf7f2 10954 qi.run(osd, sdata, pg, tp_handle);
7c673cae
FG
10955
10956  {
10957#ifdef WITH_LTTNG
10958    osd_reqid_t reqid;
11fdf7f2 10959 if (boost::optional<OpRequestRef> _op = qi.maybe_get_op()) {
7c673cae
FG
10960      reqid = (*_op)->get_reqid();
10961    }
10962#endif
10963    tracepoint(osd, opwq_process_finish, reqid.name._type,
10964        reqid.name._num, reqid.tid, reqid.inc);
10965  }
10966
11fdf7f2 10967 handle_oncommits(oncommits);
7c673cae
FG
10968}
10969
11fdf7f2 10970void OSD::ShardedOpWQ::_enqueue(OpQueueItem&& item) {
7c673cae 10971 uint32_t shard_index =
11fdf7f2 10972 item.get_ordering_token().hash_to_shard(osd->shards.size());
7c673cae 10973
11fdf7f2 10974 OSDShard* sdata = osd->shards[shard_index];
7c673cae 10975 assert (NULL != sdata);
11fdf7f2
TL
10976 unsigned priority = item.get_priority();
10977 unsigned cost = item.get_cost();
10978 sdata->shard_lock.lock();
7c673cae 10979
11fdf7f2 10980 dout(20) << __func__ << " " << item << dendl;
7c673cae
FG
10981 if (priority >= osd->op_prio_cutoff)
10982 sdata->pqueue->enqueue_strict(
11fdf7f2 10983 item.get_owner(), priority, std::move(item));
7c673cae
FG
10984 else
10985 sdata->pqueue->enqueue(
11fdf7f2
TL
10986 item.get_owner(), priority, cost, std::move(item));
10987 sdata->shard_lock.unlock();
7c673cae 10988
11fdf7f2
TL
10989 std::lock_guard l{sdata->sdata_wait_lock};
10990 sdata->sdata_cond.notify_one();
7c673cae
FG
10991}
10992
// Requeue an item at the *front* of its shard's queue, used when work
// must be retried ahead of newer arrivals.  Caller does not hold the
// shard lock.
11fdf7f2 10993void OSD::ShardedOpWQ::_enqueue_front(OpQueueItem&& item)
7c673cae 10994{
11fdf7f2
TL
10995  auto shard_index = item.get_ordering_token().hash_to_shard(osd->shards.size());
10996  auto& sdata = osd->shards[shard_index];
10997  ceph_assert(sdata);
10998  sdata->shard_lock.lock();
10999  auto p = sdata->pg_slots.find(item.get_ordering_token());
11000  if (p != sdata->pg_slots.end() &&
11001      !p->second->to_process.empty()) {
7c673cae
FG
11002    // we may be racing with _process, which has dequeued a new item
11003    // from pqueue, put it on to_process, and is now busy taking the
11004    // pg lock.  ensure this old requeued item is ordered before any
11005    // such newer item in to_process.
11fdf7f2
TL
11006    p->second->to_process.push_front(std::move(item));
11007    item = std::move(p->second->to_process.back());
11008    p->second->to_process.pop_back();
// (the younger back item takes our place on the pqueue front instead,
// so the per-slot ordering is preserved)
11009    dout(20) << __func__
11010	     << " " << p->second->to_process.front()
11011	     << " shuffled w/ " << item << dendl;
7c673cae 11012 } else {
11fdf7f2 11013 dout(20) << __func__ << " " << item << dendl;
7c673cae 11014 }
11fdf7f2
TL
11015  sdata->_enqueue_front(std::move(item), osd->op_prio_cutoff);
11016  sdata->shard_lock.unlock();
// wake a worker on that shard
11017  std::lock_guard l{sdata->sdata_wait_lock};
11018  sdata->sdata_cond.notify_one();
7c673cae
FG
11019}
11020
11021namespace ceph {
11022namespace osd_cmds {
11023
11fdf7f2
TL
11024int heap(CephContext& cct, const cmdmap_t& cmdmap, Formatter& f,
11025 std::ostream& os)
7c673cae
FG
11026{
11027 if (!ceph_using_tcmalloc()) {
11028 os << "could not issue heap profiler command -- not using tcmalloc!";
11029 return -EOPNOTSUPP;
11030 }
11031
11032 string cmd;
11033 if (!cmd_getval(&cct, cmdmap, "heapcmd", cmd)) {
11034 os << "unable to get value for command \"" << cmd << "\"";
11035 return -EINVAL;
11fdf7f2 11036 }
7c673cae
FG
11037
11038 std::vector<std::string> cmd_vec;
11039 get_str_vec(cmd, cmd_vec);
11fdf7f2
TL
11040
11041 string val;
11042 if (cmd_getval(&cct, cmdmap, "value", val)) {
11043 cmd_vec.push_back(val);
11044 }
7c673cae
FG
11045
11046 ceph_heap_profiler_handle_command(cmd_vec, os);
11047
11048 return 0;
11049}
11050
11051}} // namespace ceph::osd_cmds
11052
224ce89b 11053
11fdf7f2 11054std::ostream& operator<<(std::ostream& out, const io_queue& q) {
224ce89b 11055 switch(q) {
11fdf7f2 11056 case io_queue::prioritized:
224ce89b
WB
11057 out << "prioritized";
11058 break;
11fdf7f2 11059 case io_queue::weightedpriority:
224ce89b
WB
11060 out << "weightedpriority";
11061 break;
11fdf7f2 11062 case io_queue::mclock_opclass:
224ce89b
WB
11063 out << "mclock_opclass";
11064 break;
11fdf7f2 11065 case io_queue::mclock_client:
224ce89b
WB
11066 out << "mclock_client";
11067 break;
11068 }
11069 return out;
11070}