]> git.proxmox.com Git - ceph.git/blob - ceph/src/osd/OSD.cc
import new upstream nautilus stable release 14.2.8
[ceph.git] / ceph / src / osd / OSD.cc
1 // -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
2 // vim: ts=8 sw=2 smarttab
3 /*
4 * Ceph - scalable distributed file system
5 *
6 * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
7 * Copyright (C) 2017 OVH
8 *
9 * This is free software; you can redistribute it and/or
10 * modify it under the terms of the GNU Lesser General Public
11 * License version 2.1, as published by the Free Software
12 * Foundation. See file COPYING.
13 *
14 */
15
16 #include "acconfig.h"
17
18 #include <cctype>
19 #include <fstream>
20 #include <iostream>
21 #include <iterator>
22
23 #include <unistd.h>
24 #include <sys/stat.h>
25 #include <signal.h>
26 #include <time.h>
27 #include <boost/scoped_ptr.hpp>
28 #include <boost/range/adaptor/reversed.hpp>
29
30 #ifdef HAVE_SYS_PARAM_H
31 #include <sys/param.h>
32 #endif
33
34 #ifdef HAVE_SYS_MOUNT_H
35 #include <sys/mount.h>
36 #endif
37
38 #include "osd/PG.h"
39
40 #include "include/types.h"
41 #include "include/compat.h"
42 #include "include/random.h"
43
44 #include "OSD.h"
45 #include "OSDMap.h"
46 #include "Watch.h"
47 #include "osdc/Objecter.h"
48
49 #include "common/errno.h"
50 #include "common/ceph_argparse.h"
51 #include "common/ceph_time.h"
52 #include "common/version.h"
53 #include "common/pick_address.h"
54 #include "common/blkdev.h"
55 #include "common/numa.h"
56
57 #include "os/ObjectStore.h"
58 #ifdef HAVE_LIBFUSE
59 #include "os/FuseStore.h"
60 #endif
61
62 #include "PrimaryLogPG.h"
63
64 #include "msg/Messenger.h"
65 #include "msg/Message.h"
66
67 #include "mon/MonClient.h"
68
69 #include "messages/MLog.h"
70
71 #include "messages/MGenericMessage.h"
72 #include "messages/MOSDPing.h"
73 #include "messages/MOSDFailure.h"
74 #include "messages/MOSDMarkMeDown.h"
75 #include "messages/MOSDFull.h"
76 #include "messages/MOSDOp.h"
77 #include "messages/MOSDOpReply.h"
78 #include "messages/MOSDBackoff.h"
79 #include "messages/MOSDBeacon.h"
80 #include "messages/MOSDRepOp.h"
81 #include "messages/MOSDRepOpReply.h"
82 #include "messages/MOSDBoot.h"
83 #include "messages/MOSDPGTemp.h"
84 #include "messages/MOSDPGReadyToMerge.h"
85
86 #include "messages/MOSDMap.h"
87 #include "messages/MMonGetOSDMap.h"
88 #include "messages/MOSDPGNotify.h"
89 #include "messages/MOSDPGQuery.h"
90 #include "messages/MOSDPGLog.h"
91 #include "messages/MOSDPGRemove.h"
92 #include "messages/MOSDPGInfo.h"
93 #include "messages/MOSDPGCreate.h"
94 #include "messages/MOSDPGCreate2.h"
95 #include "messages/MOSDPGTrim.h"
96 #include "messages/MOSDPGScan.h"
97 #include "messages/MBackfillReserve.h"
98 #include "messages/MRecoveryReserve.h"
99 #include "messages/MOSDForceRecovery.h"
100 #include "messages/MOSDECSubOpWrite.h"
101 #include "messages/MOSDECSubOpWriteReply.h"
102 #include "messages/MOSDECSubOpRead.h"
103 #include "messages/MOSDECSubOpReadReply.h"
104 #include "messages/MOSDPGCreated.h"
105 #include "messages/MOSDPGUpdateLogMissing.h"
106 #include "messages/MOSDPGUpdateLogMissingReply.h"
107
108 #include "messages/MOSDPeeringOp.h"
109
110 #include "messages/MOSDAlive.h"
111
112 #include "messages/MOSDScrub.h"
113 #include "messages/MOSDScrub2.h"
114 #include "messages/MOSDRepScrub.h"
115
116 #include "messages/MMonCommand.h"
117 #include "messages/MCommand.h"
118 #include "messages/MCommandReply.h"
119
120 #include "messages/MPGStats.h"
121 #include "messages/MPGStatsAck.h"
122
123 #include "messages/MWatchNotify.h"
124 #include "messages/MOSDPGPush.h"
125 #include "messages/MOSDPGPushReply.h"
126 #include "messages/MOSDPGPull.h"
127
128 #include "common/perf_counters.h"
129 #include "common/Timer.h"
130 #include "common/LogClient.h"
131 #include "common/AsyncReserver.h"
132 #include "common/HeartbeatMap.h"
133 #include "common/admin_socket.h"
134 #include "common/ceph_context.h"
135
136 #include "global/signal_handler.h"
137 #include "global/pidfile.h"
138
139 #include "include/color.h"
140 #include "perfglue/cpu_profiler.h"
141 #include "perfglue/heap_profiler.h"
142
143 #include "osd/OpRequest.h"
144
145 #include "auth/AuthAuthorizeHandler.h"
146 #include "auth/RotatingKeyRing.h"
147
148 #include "objclass/objclass.h"
149
150 #include "common/cmdparse.h"
151 #include "include/str_list.h"
152 #include "include/util.h"
153
154 #include "include/ceph_assert.h"
155 #include "common/config.h"
156 #include "common/EventTrace.h"
157
158 #include "json_spirit/json_spirit_reader.h"
159 #include "json_spirit/json_spirit_writer.h"
160
161 #ifdef WITH_LTTNG
162 #define TRACEPOINT_DEFINE
163 #define TRACEPOINT_PROBE_DYNAMIC_LINKAGE
164 #include "tracing/osd.h"
165 #undef TRACEPOINT_PROBE_DYNAMIC_LINKAGE
166 #undef TRACEPOINT_DEFINE
167 #else
168 #define tracepoint(...)
169 #endif
170
171 #define dout_context cct
172 #define dout_subsys ceph_subsys_osd
173 #undef dout_prefix
174 #define dout_prefix _prefix(_dout, whoami, get_osdmap_epoch())
175
176
177 static ostream& _prefix(std::ostream* _dout, int whoami, epoch_t epoch) {
178 return *_dout << "osd." << whoami << " " << epoch << " ";
179 }
180
181 //Initial features in new superblock.
182 //Features here are also automatically upgraded
183 CompatSet OSD::get_osd_initial_compat_set() {
184 CompatSet::FeatureSet ceph_osd_feature_compat;
185 CompatSet::FeatureSet ceph_osd_feature_ro_compat;
186 CompatSet::FeatureSet ceph_osd_feature_incompat;
187 ceph_osd_feature_incompat.insert(CEPH_OSD_FEATURE_INCOMPAT_BASE);
188 ceph_osd_feature_incompat.insert(CEPH_OSD_FEATURE_INCOMPAT_PGINFO);
189 ceph_osd_feature_incompat.insert(CEPH_OSD_FEATURE_INCOMPAT_OLOC);
190 ceph_osd_feature_incompat.insert(CEPH_OSD_FEATURE_INCOMPAT_LEC);
191 ceph_osd_feature_incompat.insert(CEPH_OSD_FEATURE_INCOMPAT_CATEGORIES);
192 ceph_osd_feature_incompat.insert(CEPH_OSD_FEATURE_INCOMPAT_HOBJECTPOOL);
193 ceph_osd_feature_incompat.insert(CEPH_OSD_FEATURE_INCOMPAT_BIGINFO);
194 ceph_osd_feature_incompat.insert(CEPH_OSD_FEATURE_INCOMPAT_LEVELDBINFO);
195 ceph_osd_feature_incompat.insert(CEPH_OSD_FEATURE_INCOMPAT_LEVELDBLOG);
196 ceph_osd_feature_incompat.insert(CEPH_OSD_FEATURE_INCOMPAT_SNAPMAPPER);
197 ceph_osd_feature_incompat.insert(CEPH_OSD_FEATURE_INCOMPAT_HINTS);
198 ceph_osd_feature_incompat.insert(CEPH_OSD_FEATURE_INCOMPAT_PGMETA);
199 ceph_osd_feature_incompat.insert(CEPH_OSD_FEATURE_INCOMPAT_MISSING);
200 ceph_osd_feature_incompat.insert(CEPH_OSD_FEATURE_INCOMPAT_FASTINFO);
201 ceph_osd_feature_incompat.insert(CEPH_OSD_FEATURE_INCOMPAT_RECOVERY_DELETES);
202 return CompatSet(ceph_osd_feature_compat, ceph_osd_feature_ro_compat,
203 ceph_osd_feature_incompat);
204 }
205
206 //Features are added here that this OSD supports.
207 CompatSet OSD::get_osd_compat_set() {
208 CompatSet compat = get_osd_initial_compat_set();
209 //Any features here can be set in code, but not in initial superblock
210 compat.incompat.insert(CEPH_OSD_FEATURE_INCOMPAT_SHARDS);
211 return compat;
212 }
213
// OSDService: per-OSD shared services object.  Everything here is
// initialized from the owning OSD; member order follows the declaration
// order in the header and must not be rearranged.
OSDService::OSDService(OSD *osd) :
  osd(osd),
  cct(osd->cct),
  whoami(osd->whoami), store(osd->store),
  log_client(osd->log_client), clog(osd->clog),
  pg_recovery_stats(osd->pg_recovery_stats),
  cluster_messenger(osd->cluster_messenger),
  client_messenger(osd->client_messenger),
  logger(osd->logger),
  recoverystate_perf(osd->recoverystate_perf),
  monc(osd->monc),
  class_handler(osd->class_handler),
  // tracked config values (auto-refresh on config change)
  osd_max_object_size(cct->_conf, "osd_max_object_size"),
  osd_skip_data_digest(cct->_conf, "osd_skip_data_digest"),
  publish_lock{ceph::make_mutex("OSDService::publish_lock")},
  pre_publish_lock{ceph::make_mutex("OSDService::pre_publish_lock")},
  max_oldest_map(0),
  peer_map_epoch_lock("OSDService::peer_map_epoch_lock"),
  sched_scrub_lock("OSDService::sched_scrub_lock"),
  scrubs_local(0),
  scrubs_remote(0),
  // cache-tiering agent state; the agent thread is created in init()
  agent_lock("OSDService::agent_lock"),
  agent_valid_iterator(false),
  agent_ops(0),
  flush_mode_high_count(0),
  agent_active(true),
  agent_thread(this),
  agent_stop_flag(false),
  agent_timer_lock("OSDService::agent_timer_lock"),
  agent_timer(osd->client_messenger->cct, agent_timer_lock),
  // promotion throttle (see promote_throttle_recalibrate())
  last_recalibrate(ceph_clock_now()),
  promote_max_objects(0),
  promote_max_bytes(0),
  objecter(new Objecter(osd->client_messenger->cct, osd->objecter_messenger, osd->monc, NULL, 0, 0)),
  m_objecter_finishers(cct->_conf->osd_objecter_finishers),
  watch_lock("OSDService::watch_lock"),
  watch_timer(osd->client_messenger->cct, watch_lock),
  next_notif_id(0),
  recovery_request_lock("OSDService::recovery_request_lock"),
  recovery_request_timer(cct, recovery_request_lock, false),
  sleep_lock("OSDService::sleep_lock"),
  sleep_timer(cct, sleep_lock, false),
  // reservers for backfill/recovery/snap-trim slots
  reserver_finisher(cct),
  local_reserver(cct, &reserver_finisher, cct->_conf->osd_max_backfills,
		 cct->_conf->osd_min_recovery_priority),
  remote_reserver(cct, &reserver_finisher, cct->_conf->osd_max_backfills,
		  cct->_conf->osd_min_recovery_priority),
  pg_temp_lock("OSDService::pg_temp_lock"),
  snap_reserver(cct, &reserver_finisher,
		cct->_conf->osd_max_trimming_pgs),
  recovery_lock("OSDService::recovery_lock"),
  recovery_ops_active(0),
  recovery_ops_reserved(0),
  recovery_paused(false),
  // OSDMap caches (full maps, full-map bufferlists, incremental bufferlists)
  map_cache_lock("OSDService::map_cache_lock"),
  map_cache(cct, cct->_conf->osd_map_cache_size),
  map_bl_cache(cct->_conf->osd_map_cache_size),
  map_bl_inc_cache(cct->_conf->osd_map_cache_size),
  stat_lock("OSDService::stat_lock"),
  // fullness state (see check_full_status())
  full_status_lock("OSDService::full_status_lock"),
  cur_state(NONE),
  cur_ratio(0), physical_ratio(0),
  epoch_lock("OSDService::epoch_lock"),
  boot_epoch(0), up_epoch(0), bind_epoch(0),
  is_stopping_lock("OSDService::is_stopping_lock")
#ifdef PG_DEBUG_REFS
  , pgid_lock("OSDService::pgid_lock")
#endif
{
  objecter->init();

  // Create the configured number of objecter finisher threads; they are
  // started later in OSDService::init() and torn down in shutdown().
  for (int i = 0; i < m_objecter_finishers; i++) {
    ostringstream str;
    str << "objecter-finisher-" << i;
    Finisher *fin = new Finisher(osd->client_messenger->cct, str.str(), "finisher");
    objecter_finishers.push_back(fin);
  }
}
292
293 OSDService::~OSDService()
294 {
295 delete objecter;
296
297 for (auto f : objecter_finishers) {
298 delete f;
299 f = NULL;
300 }
301 }
302
303
304
#ifdef PG_DEBUG_REFS
// Debug-only PG reference tracking (compiled in with PG_DEBUG_REFS):
// pgid_tracker counts outstanding references per spg_t and live_pgs
// remembers the PG object so its holders can be dumped.

// Record one more reference to pg; remember the PG on first reference.
void OSDService::add_pgid(spg_t pgid, PG *pg){
  std::lock_guard l(pgid_lock);
  if (!pgid_tracker.count(pgid)) {
    live_pgs[pgid] = pg;
  }
  pgid_tracker[pgid]++;
}
// Drop one reference; forget the PG entirely when the count hits zero.
// Asserts that a reference actually exists.
void OSDService::remove_pgid(spg_t pgid, PG *pg)
{
  std::lock_guard l(pgid_lock);
  ceph_assert(pgid_tracker.count(pgid));
  ceph_assert(pgid_tracker[pgid] > 0);
  pgid_tracker[pgid]--;
  if (pgid_tracker[pgid] == 0) {
    pgid_tracker.erase(pgid);
    live_pgs.erase(pgid);
  }
}
// Dump every tracked pgid with its refcount, then ask each live PG to
// dump who holds its references.  Diagnostic aid for leaked PG refs.
void OSDService::dump_live_pgids()
{
  std::lock_guard l(pgid_lock);
  derr << "live pgids:" << dendl;
  for (map<spg_t, int>::const_iterator i = pgid_tracker.cbegin();
       i != pgid_tracker.cend();
       ++i) {
    derr << "\t" << *i << dendl;
    live_pgs[i->first]->dump_live_ids();
  }
}
#endif
336
337
338
// Walk the recorded pg_num history for pgid's pool between old_map and
// new_map and report every split child (and, when merge_pgs is non-null,
// every merge participant) this OSD must know about, each tagged with the
// epoch at which the pg_num change takes effect.  Newly discovered PGs
// are re-queued and re-scanned so chains of splits/merges across several
// pg_num changes are all found.
void OSDService::identify_splits_and_merges(
  OSDMapRef old_map,
  OSDMapRef new_map,
  spg_t pgid,
  set<pair<spg_t,epoch_t>> *split_children,
  set<pair<spg_t,epoch_t>> *merge_pgs)
{
  // Pool must exist in the old map; otherwise there is no history to walk.
  if (!old_map->have_pg_pool(pgid.pool())) {
    return;
  }
  int old_pgnum = old_map->get_pg_num(pgid.pool());
  auto p = osd->pg_num_history.pg_nums.find(pgid.pool());
  if (p == osd->pg_num_history.pg_nums.end()) {
    return;
  }
  dout(20) << __func__ << " " << pgid << " e" << old_map->get_epoch()
	   << " to e" << new_map->get_epoch()
	   << " pg_nums " << p->second << dendl;
  // BFS over PGs: start from pgid; any parent/child/sibling discovered
  // below is pushed and scanned with the same epoch walk.
  deque<spg_t> queue;
  queue.push_back(pgid);
  set<spg_t> did;
  while (!queue.empty()) {
    auto cur = queue.front();
    queue.pop_front();
    did.insert(cur);
    unsigned pgnum = old_pgnum;
    // Replay each pg_num change (epoch -> new pg_num) in (old, new] order.
    for (auto q = p->second.lower_bound(old_map->get_epoch());
	 q != p->second.end() &&
	   q->first <= new_map->get_epoch();
	 ++q) {
      if (pgnum < q->second) {
	// split?
	if (cur.ps() < pgnum) {
	  set<spg_t> children;
	  if (cur.is_split(pgnum, q->second, &children)) {
	    dout(20) << __func__ << " " << cur << " e" << q->first
		     << " pg_num " << pgnum << " -> " << q->second
		     << " children " << children << dendl;
	    for (auto i : children) {
	      split_children->insert(make_pair(i, q->first));
	      if (!did.count(i))
		queue.push_back(i);
	    }
	  }
	} else if (cur.ps() < q->second) {
	  dout(20) << __func__ << " " << cur << " e" << q->first
		   << " pg_num " << pgnum << " -> " << q->second
		   << " is a child" << dendl;
	  // normally we'd capture this from the parent, but it's
	  // possible the parent doesn't exist yet (it will be
	  // fabricated to allow an intervening merge).  note this PG
	  // as a split child here to be sure we catch it.
	  split_children->insert(make_pair(cur, q->first));
	} else {
	  dout(20) << __func__ << " " << cur << " e" << q->first
		   << " pg_num " << pgnum << " -> " << q->second
		   << " is post-split, skipping" << dendl;
	}
      } else if (merge_pgs) {
	// merge?
	if (cur.ps() >= q->second) {
	  if (cur.ps() < pgnum) {
	    // cur disappears in this merge; record the target parent and
	    // all of its sources.
	    spg_t parent;
	    if (cur.is_merge_source(pgnum, q->second, &parent)) {
	      set<spg_t> children;
	      parent.is_split(q->second, pgnum, &children);
	      dout(20) << __func__ << " " << cur << " e" << q->first
		       << " pg_num " << pgnum << " -> " << q->second
		       << " is merge source, target " << parent
		       << ", source(s) " << children << dendl;
	      merge_pgs->insert(make_pair(parent, q->first));
	      if (!did.count(parent)) {
		// queue (and re-scan) parent in case it might not exist yet
		// and there are some future splits pending on it
		queue.push_back(parent);
	      }
	      for (auto c : children) {
		merge_pgs->insert(make_pair(c, q->first));
		if (!did.count(c))
		  queue.push_back(c);
	      }
	    }
	  } else {
	    dout(20) << __func__ << " " << cur << " e" << q->first
		     << " pg_num " << pgnum << " -> " << q->second
		     << " is beyond old pgnum, skipping" << dendl;
	  }
	} else {
	  // cur survives the merge: it is the target; record it and its
	  // sources.
	  set<spg_t> children;
	  if (cur.is_split(q->second, pgnum, &children)) {
	    dout(20) << __func__ << " " << cur << " e" << q->first
		     << " pg_num " << pgnum << " -> " << q->second
		     << " is merge target, source " << children << dendl;
	    for (auto c : children) {
	      merge_pgs->insert(make_pair(c, q->first));
	      if (!did.count(c))
		queue.push_back(c);
	    }
	    merge_pgs->insert(make_pair(cur, q->first));
	  }
	}
      }
      pgnum = q->second;
    }
  }
}
445
// Forward heartbeat-peer recalculation requests to the owning OSD.
void OSDService::need_heartbeat_peer_update()
{
  osd->need_heartbeat_peer_update();
}

// First phase of shutdown: stop the timers that can schedule new work
// (agent restarts, recovery sleeps, recovery requests).  Each timer is
// shut down under its own lock, matching how events are queued.
void OSDService::start_shutdown()
{
  {
    std::lock_guard l(agent_timer_lock);
    agent_timer.shutdown();
  }

  {
    std::lock_guard l(sleep_lock);
    sleep_timer.shutdown();
  }

  {
    std::lock_guard l(recovery_request_lock);
    recovery_request_timer.shutdown();
  }
}

// Drain and stop the finisher shared by the backfill/recovery reservers.
void OSDService::shutdown_reserver()
{
  reserver_finisher.wait_for_empty();
  reserver_finisher.stop();
}
474
// Final shutdown: stop the watch timer, the objecter and its finishers,
// then drop our references to the published and next osdmaps.
void OSDService::shutdown()
{
  {
    std::lock_guard l(watch_lock);
    watch_timer.shutdown();
  }

  objecter->shutdown();
  // Finishers must drain before stopping so queued completions run.
  for (auto f : objecter_finishers) {
    f->wait_for_empty();
    f->stop();
  }

  // Publish an empty map ref and clear next_osdmap so no new map epochs
  // are handed out.
  publish_map(OSDMapRef());
  next_osdmap = OSDMapRef();
}
491
// Start the service threads created by the constructor: the reserver
// finisher, the objecter finishers, the watch/agent timers and the
// tiering agent thread.
void OSDService::init()
{
  reserver_finisher.start();
  for (auto f : objecter_finishers) {
    f->start();
  }
  objecter->set_client_incarnation(0);

  // deprioritize objecter in daemonperf output
  objecter->get_logger()->set_prio_adjust(-3);

  watch_timer.init();
  agent_timer.init();

  agent_thread.create("osd_srv_agent");

  // Optionally hold off recovery at startup (config: osd_recovery_delay_start).
  if (cct->_conf->osd_recovery_delay_start)
    defer_recovery(cct->_conf->osd_recovery_delay_start);
}
511
// Late initialization: start the objecter once the initial osdmap is known.
void OSDService::final_init()
{
  objecter->start(osdmap.get());
}
516
517 void OSDService::activate_map()
518 {
519 // wake/unwake the tiering agent
520 agent_lock.Lock();
521 agent_active =
522 !osdmap->test_flag(CEPH_OSDMAP_NOTIERAGENT) &&
523 osd->is_active();
524 agent_cond.Signal();
525 agent_lock.Unlock();
526 }
527
// Ask the monitor for osdmaps starting at epoch e (non-forced subscribe).
void OSDService::request_osdmap_update(epoch_t e)
{
  osd->osdmap_subscribe(e, false);
}
532
// Timer callback used by agent_entry(): after the configured delay,
// ask the PG to re-evaluate its agent mode.  Holds a PGRef so the PG
// stays alive until the callback fires.
class AgentTimeoutCB : public Context {
  PGRef pg;
public:
  explicit AgentTimeoutCB(PGRef _pg) : pg(_pg) {}
  void finish(int) override {
    pg->agent_choose_mode_restart();
  }
};
541
// Main loop of the cache-tiering agent thread.  Repeatedly picks the
// highest-priority tier in agent_queue, selects a PG from it, and asks
// the PG to do flush/evict work.  agent_lock is held except around the
// (potentially slow) call into PG::agent_work().
void OSDService::agent_entry()
{
  dout(10) << __func__ << " start" << dendl;
  agent_lock.Lock();

  while (!agent_stop_flag) {
    // Sleep until a PG is queued.
    if (agent_queue.empty()) {
      dout(20) << __func__ << " empty queue" << dendl;
      agent_cond.Wait(agent_lock);
      continue;
    }
    // Work on the highest evict-effort level (rbegin = largest key).
    uint64_t level = agent_queue.rbegin()->first;
    set<PGRef>& top = agent_queue.rbegin()->second;
    dout(10) << __func__
	     << " tiers " << agent_queue.size()
	     << ", top is " << level
	     << " with pgs " << top.size()
	     << ", ops " << agent_ops << "/"
	     << cct->_conf->osd_agent_max_ops
	     << (agent_active ? " active" : " NOT ACTIVE")
	     << dendl;
    dout(20) << __func__ << " oids " << agent_oids << dendl;
    // Op budget: full limit normally; the lower "low ops" limit when no
    // PG is in high-speed flush mode.
    int max = cct->_conf->osd_agent_max_ops - agent_ops;
    int agent_flush_quota = max;
    if (!flush_mode_high_count)
      agent_flush_quota = cct->_conf->osd_agent_max_low_ops - agent_ops;
    if (agent_flush_quota <= 0 || top.empty() || !agent_active) {
      agent_cond.Wait(agent_lock);
      continue;
    }

    // Round-robin across the PGs in the top tier; the iterator is
    // invalidated (agent_valid_iterator cleared) when the set changes.
    if (!agent_valid_iterator || agent_queue_pos == top.end()) {
      agent_queue_pos = top.begin();
      agent_valid_iterator = true;
    }
    PGRef pg = *agent_queue_pos;
    dout(10) << "high_count " << flush_mode_high_count
	     << " agent_ops " << agent_ops
	     << " flush_quota " << agent_flush_quota << dendl;
    // Drop the lock while the PG does real work.
    agent_lock.Unlock();
    if (!pg->agent_work(max, agent_flush_quota)) {
      dout(10) << __func__ << " " << pg->pg_id
	<< " no agent_work, delay for " << cct->_conf->osd_agent_delay_time
	<< " seconds" << dendl;

      osd->logger->inc(l_osd_tier_delay);
      // Queue a timer to call agent_choose_mode for this pg in 5 seconds
      // (osd_agent_delay_time).
      agent_timer_lock.Lock();
      Context *cb = new AgentTimeoutCB(pg);
      agent_timer.add_event_after(cct->_conf->osd_agent_delay_time, cb);
      agent_timer_lock.Unlock();
    }
    agent_lock.Lock();
  }
  agent_lock.Unlock();
  dout(10) << __func__ << " finish" << dendl;
}
599
// Stop the tiering agent thread.  Expects all agent ops to be cancelled
// and all PGs to be dequeued already; aborts otherwise.
void OSDService::agent_stop()
{
  {
    std::lock_guard l(agent_lock);

    // By this time all ops should be cancelled
    ceph_assert(agent_ops == 0);
    // By this time all PGs are shutdown and dequeued
    if (!agent_queue.empty()) {
      set<PGRef>& top = agent_queue.rbegin()->second;
      derr << "agent queue not empty, for example " << (*top.begin())->get_pgid() << dendl;
      ceph_abort_msg("agent queue not empty");
    }

    agent_stop_flag = true;
    agent_cond.Signal();
  }
  // Wait for agent_entry() to observe the flag and exit.
  agent_thread.join();
}
619
620 // -------------------------------------
621
// Periodically recompute promote_probability_millis, the per-mille
// probability that a read triggers a cache-tier promotion, so that the
// achieved promotion rate tracks the configured object/sec and bytes/sec
// targets.  Also sets hard per-interval caps (promote_max_objects/bytes).
void OSDService::promote_throttle_recalibrate()
{
  utime_t now = ceph_clock_now();
  double dur = now - last_recalibrate;
  last_recalibrate = now;
  unsigned prob = promote_probability_millis;

  uint64_t target_obj_sec = cct->_conf->osd_tier_promote_max_objects_sec;
  uint64_t target_bytes_sec = cct->_conf->osd_tier_promote_max_bytes_sec;

  unsigned min_prob = 1;

  uint64_t attempts, obj, bytes;
  promote_counter.sample_and_attenuate(&attempts, &obj, &bytes);
  dout(10) << __func__ << " " << attempts << " attempts, promoted "
	   << obj << " objects and " << byte_u_t(bytes) << "; target "
	   << target_obj_sec << " obj/sec or "
	   << byte_u_t(target_bytes_sec) << "/sec"
	   << dendl;

  // calculate what the probability *should* be, given the targets
  unsigned new_prob;
  if (attempts && dur > 0) {
    uint64_t avg_size = 1;
    if (obj)
      avg_size = std::max<uint64_t>(bytes / obj, 1);
    // po/pb: probability (per mille) implied by the object-rate and
    // byte-rate targets respectively.
    unsigned po = (double)target_obj_sec * dur * 1000.0 / (double)attempts;
    unsigned pb = (double)target_bytes_sec / (double)avg_size * dur * 1000.0
      / (double)attempts;
    dout(20) << __func__ << " po " << po << " pb " << pb << " avg_size "
	     << avg_size << dendl;
    if (target_obj_sec && target_bytes_sec)
      new_prob = std::min(po, pb);
    else if (target_obj_sec)
      new_prob = po;
    else if (target_bytes_sec)
      new_prob = pb;
    else
      new_prob = 1000;
  } else {
    // No data (or zero interval): no throttling, promote everything.
    new_prob = 1000;
  }
  dout(20) << __func__ << " new_prob " << new_prob << dendl;

  // correct for persistent skew between target rate and actual rate, adjust
  double ratio = 1.0;
  unsigned actual = 0;
  if (attempts && obj) {
    actual = obj * 1000 / attempts;
    ratio = (double)actual / (double)prob;
    new_prob = (double)new_prob / ratio;
  }
  new_prob = std::max(new_prob, min_prob);
  new_prob = std::min(new_prob, 1000u);

  // adjust: move halfway toward the computed probability, clamped to
  // [min_prob, 1000].
  prob = (prob + new_prob) / 2;
  prob = std::max(prob, min_prob);
  prob = std::min(prob, 1000u);
  dout(10) << __func__ << " actual " << actual
	   << ", actual/prob ratio " << ratio
	   << ", adjusted new_prob " << new_prob
	   << ", prob " << promote_probability_millis << " -> " << prob
	   << dendl;
  promote_probability_millis = prob;

  // set hard limits for this interval to mitigate stampedes
  promote_max_objects = target_obj_sec * osd->OSD_TICK_INTERVAL * 2;
  promote_max_bytes = target_bytes_sec * osd->OSD_TICK_INTERVAL * 2;
}
692
693 // -------------------------------------
694
695 float OSDService::get_failsafe_full_ratio()
696 {
697 float full_ratio = cct->_conf->osd_failsafe_full_ratio;
698 if (full_ratio > 1.0) full_ratio /= 100.0;
699 return full_ratio;
700 }
701
// Classify the OSD's fullness (NONE..FAILSAFE) from the logical usage
// ratio and the physical usage pratio, using the thresholds published in
// the current osdmap.  `inject` is set to a marker string when the state
// comes from the failure-injection knobs rather than real usage.
OSDService::s_names OSDService::recalc_full_state(float ratio, float pratio, string &inject)
{
  // The OSDMap ratios take precedence. So if the failsafe is .95 and
  // the admin sets the cluster full to .96, the failsafe moves up to .96
  // too. (Not that having failsafe == full is ideal, but it's better than
  // dropping writes before the cluster appears full.)
  OSDMapRef osdmap = get_osdmap();
  if (!osdmap || osdmap->get_epoch() == 0) {
    return NONE;
  }
  // Enforce nearfull <= backfillfull <= full <= failsafe ordering.
  float nearfull_ratio = osdmap->get_nearfull_ratio();
  float backfillfull_ratio = std::max(osdmap->get_backfillfull_ratio(), nearfull_ratio);
  float full_ratio = std::max(osdmap->get_full_ratio(), backfillfull_ratio);
  float failsafe_ratio = std::max(get_failsafe_full_ratio(), full_ratio);

  if (osdmap->require_osd_release < CEPH_RELEASE_LUMINOUS) {
    // use the failsafe for nearfull and full; the mon isn't using the
    // flags anyway because we're mid-upgrade.
    full_ratio = failsafe_ratio;
    backfillfull_ratio = failsafe_ratio;
    nearfull_ratio = failsafe_ratio;
  } else if (full_ratio <= 0 ||
	     backfillfull_ratio <= 0 ||
	     nearfull_ratio <= 0) {
    derr << __func__ << " full_ratio, backfillfull_ratio or nearfull_ratio is <= 0" << dendl;
    // use failsafe flag.  ick.  the monitor did something wrong or the user
    // did something stupid.
    full_ratio = failsafe_ratio;
    backfillfull_ratio = failsafe_ratio;
    nearfull_ratio = failsafe_ratio;
  }

  // Note: FAILSAFE and NEARFULL are judged on the *physical* ratio,
  // FULL and BACKFILLFULL on the (possibly adjusted) logical ratio.
  if (injectfull_state > NONE && injectfull) {
    inject = "(Injected)";
    return injectfull_state;
  } else if (pratio > failsafe_ratio) {
    return FAILSAFE;
  } else if (ratio > full_ratio) {
    return FULL;
  } else if (ratio > backfillfull_ratio) {
    return BACKFILLFULL;
  } else if (pratio > nearfull_ratio) {
    return NEARFULL;
  }
  return NONE;
}
748
// Update the cached fullness state (cur_state/cur_ratio/physical_ratio)
// from fresh usage ratios, logging a cluster-level error when the
// failsafe threshold is crossed in either direction.
void OSDService::check_full_status(float ratio, float pratio)
{
  std::lock_guard l(full_status_lock);

  cur_ratio = ratio;
  physical_ratio = pratio;

  string inject;
  s_names new_state;
  new_state = recalc_full_state(ratio, pratio, inject);

  dout(20) << __func__ << " cur ratio " << ratio
	   << ", physical ratio " << pratio
	   << ", new state " << get_full_state_name(new_state)
	   << " " << inject
	   << dendl;

  // warn on state transitions; failsafe transitions go to the cluster log
  if (cur_state != new_state) {
    dout(10) << __func__ << " " << get_full_state_name(cur_state)
	     << " -> " << get_full_state_name(new_state) << dendl;
    if (new_state == FAILSAFE) {
      clog->error() << "full status failsafe engaged, dropping updates, now "
		    << (int)roundf(ratio * 100) << "% full";
    } else if (cur_state == FAILSAFE) {
      clog->error() << "full status failsafe disengaged, no longer dropping "
		    << "updates, now " << (int)roundf(ratio * 100) << "% full";
    }
    cur_state = new_state;
  }
}
780
781 bool OSDService::need_fullness_update()
782 {
783 OSDMapRef osdmap = get_osdmap();
784 s_names cur = NONE;
785 if (osdmap->exists(whoami)) {
786 if (osdmap->get_state(whoami) & CEPH_OSD_FULL) {
787 cur = FULL;
788 } else if (osdmap->get_state(whoami) & CEPH_OSD_BACKFILLFULL) {
789 cur = BACKFILLFULL;
790 } else if (osdmap->get_state(whoami) & CEPH_OSD_NEARFULL) {
791 cur = NEARFULL;
792 }
793 }
794 s_names want = NONE;
795 if (is_full())
796 want = FULL;
797 else if (is_backfillfull())
798 want = BACKFILLFULL;
799 else if (is_nearfull())
800 want = NEARFULL;
801 return want != cur;
802 }
803
// Test hook: report an artificial fullness of at least `type` while the
// injection knobs (set_injectfull) are armed.  Caller must hold
// full_status_lock.  Note this const method decrements injectfull —
// presumably the member is declared mutable in the header (not visible
// here) — so a positive count acts as "return full N more times".
bool OSDService::_check_inject_full(DoutPrefixProvider *dpp, s_names type) const
{
  if (injectfull && injectfull_state >= type) {
    // injectfull is either a count of the number of times to return failsafe full
    // or if -1 then always return full
    if (injectfull > 0)
      --injectfull;
    ldpp_dout(dpp, 10) << __func__ << " Injected " << get_full_state_name(type) << " OSD ("
		       << (injectfull < 0 ? "set" : std::to_string(injectfull)) << ")"
		       << dendl;
    return true;
  }
  return false;
}
818
// Return true when the OSD's cached fullness state is at least `type`
// (or an injection is active).  Logs the current usage when it is.
bool OSDService::_check_full(DoutPrefixProvider *dpp, s_names type) const
{
  std::lock_guard l(full_status_lock);

  if (_check_inject_full(dpp, type))
    return true;

  if (cur_state >= type)
    ldpp_dout(dpp, 10) << __func__ << " current usage is " << cur_ratio
		       << " physical " << physical_ratio << dendl;

  return cur_state >= type;
}
832
// Like _check_full(), but evaluates a *hypothetical* state: would the
// OSD be at least `type` full if adjust_used extra bytes were consumed
// on top of adjusted_stat?  Used to vet prospective backfill targets.
bool OSDService::_tentative_full(DoutPrefixProvider *dpp, s_names type, uint64_t adjust_used, osd_stat_t adjusted_stat)
{
  ldpp_dout(dpp, 20) << __func__ << " type " << get_full_state_name(type) << " adjust_used " << (adjust_used >> 10) << "KiB" << dendl;
  {
    std::lock_guard l(full_status_lock);
    if (_check_inject_full(dpp, type)) {
      return true;
    }
  }

  // Recompute the ratios with the extra usage folded in, then classify.
  float pratio;
  float ratio = compute_adjusted_ratio(adjusted_stat, &pratio, adjust_used);

  string notused;
  s_names tentative_state = recalc_full_state(ratio, pratio, notused);

  if (tentative_state >= type)
    ldpp_dout(dpp, 10) << __func__ << " tentative usage is " << ratio << dendl;

  return tentative_state >= type;
}
854
// --- Fullness query helpers -------------------------------------------
// Thin wrappers over _check_full()/_tentative_full() for each severity,
// plus lock-protected accessors for the cached state and the injection
// knob setter.

bool OSDService::check_failsafe_full(DoutPrefixProvider *dpp) const
{
  return _check_full(dpp, FAILSAFE);
}

bool OSDService::check_full(DoutPrefixProvider *dpp) const
{
  return _check_full(dpp, FULL);
}

// Would we be backfillfull after adding adjust_used bytes to stats?
bool OSDService::tentative_backfill_full(DoutPrefixProvider *dpp, uint64_t adjust_used, osd_stat_t stats)
{
  return _tentative_full(dpp, BACKFILLFULL, adjust_used, stats);
}

bool OSDService::check_backfill_full(DoutPrefixProvider *dpp) const
{
  return _check_full(dpp, BACKFILLFULL);
}

bool OSDService::check_nearfull(DoutPrefixProvider *dpp) const
{
  return _check_full(dpp, NEARFULL);
}

// Note: is_failsafe_full() tests equality, the rest test "at least".
bool OSDService::is_failsafe_full() const
{
  std::lock_guard l(full_status_lock);
  return cur_state == FAILSAFE;
}

bool OSDService::is_full() const
{
  std::lock_guard l(full_status_lock);
  return cur_state >= FULL;
}

bool OSDService::is_backfillfull() const
{
  std::lock_guard l(full_status_lock);
  return cur_state >= BACKFILLFULL;
}

bool OSDService::is_nearfull() const
{
  std::lock_guard l(full_status_lock);
  return cur_state >= NEARFULL;
}

// Arm the fullness injection used by _check_inject_full():
// report `type` for the next `count` checks (or always, if count == -1).
void OSDService::set_injectfull(s_names type, int64_t count)
{
  std::lock_guard l(full_status_lock);
  injectfull_state = type;
  injectfull = count;
}
910
// Record fresh store statfs results and OS alerts into osd_stat and the
// perf counters.  When fake_statfs_for_testing is set, total/available
// are synthesized from the configured fake total minus the bytes the
// PGs report, so multiple test OSDs can share one partition.
void OSDService::set_statfs(const struct store_statfs_t &stbuf,
			    osd_alert_list_t& alerts)
{
  uint64_t bytes = stbuf.total;
  uint64_t avail = stbuf.available;
  uint64_t used = stbuf.get_used_raw();

  // For testing fake statfs values so it doesn't matter if all
  // OSDs are using the same partition.
  if (cct->_conf->fake_statfs_for_testing) {
    uint64_t total_num_bytes = 0;
    vector<PGRef> pgs;
    osd->_get_pgs(&pgs);
    for (auto p : pgs) {
      total_num_bytes += p->get_stats_num_bytes();
    }
    bytes = cct->_conf->fake_statfs_for_testing;
    if (total_num_bytes < bytes)
      avail = bytes - total_num_bytes;
    else
      avail = 0;
    dout(0) << __func__ << " fake total " << cct->_conf->fake_statfs_for_testing
	    << " adjust available " << avail
	    << dendl;
    used = bytes - avail;
  }

  osd->logger->set(l_osd_stat_bytes, bytes);
  osd->logger->set(l_osd_stat_bytes_used, used);
  osd->logger->set(l_osd_stat_bytes_avail, avail);

  std::lock_guard l(stat_lock);
  osd_stat.statfs = stbuf;
  osd_stat.os_alerts.clear();
  osd_stat.os_alerts[whoami].swap(alerts);
  if (cct->_conf->fake_statfs_for_testing) {
    osd_stat.statfs.total = bytes;
    osd_stat.statfs.available = avail;
    // For testing don't want used to go negative, so clear reserved
    osd_stat.statfs.internally_reserved = 0;
  }
}
953
954 osd_stat_t OSDService::set_osd_stat(vector<int>& hb_peers,
955 int num_pgs)
956 {
957 utime_t now = ceph_clock_now();
958 auto stale_time = g_conf().get_val<int64_t>("osd_mon_heartbeat_stat_stale");
959 std::lock_guard l(stat_lock);
960 osd_stat.hb_peers.swap(hb_peers);
961 osd->op_tracker.get_age_ms_histogram(&osd_stat.op_queue_age_hist);
962 osd_stat.num_pgs = num_pgs;
963 // Clean entries that aren't updated
964 // This is called often enough that we can just remove 1 at a time
965 for (auto i: osd_stat.hb_pingtime) {
966 if (i.second.last_update == 0)
967 continue;
968 if (stale_time && now.sec() - i.second.last_update > stale_time) {
969 dout(20) << __func__ << " time out heartbeat for osd " << i.first
970 << " last_update " << i.second.last_update << dendl;
971 osd_stat.hb_pingtime.erase(i.first);
972 break;
973 }
974 }
975 return osd_stat;
976 }
977
978 void OSDService::inc_osd_stat_repaired()
979 {
980 std::lock_guard l(stat_lock);
981 osd_stat.num_shards_repaired++;
982 return;
983 }
984
// Compute the logical usage ratio of new_stat after (a) pretending an
// extra adjust_used bytes are consumed and (b) letting each PG fold its
// pending backfill data into the stats.  *pratio receives the raw
// physical ratio before any adjustment.
float OSDService::compute_adjusted_ratio(osd_stat_t new_stat, float *pratio,
					 uint64_t adjust_used)
{
  // Physical ratio: actual used / total, untouched by adjustments.
  *pratio =
   ((float)new_stat.statfs.get_used()) / ((float)new_stat.statfs.total);

  // Shrink "available" by the hypothetical extra usage (clamped at 0),
  // which raises get_used() correspondingly.
  if (adjust_used) {
    dout(20) << __func__ << " Before kb_used() " << new_stat.statfs.kb_used() << dendl;
    if (new_stat.statfs.available > adjust_used)
      new_stat.statfs.available -= adjust_used;
    else
      new_stat.statfs.available = 0;
    dout(20) << __func__ << " After kb_used() " << new_stat.statfs.kb_used() << dendl;
  }

  // Check all pgs and adjust kb_used to include all pending backfill data
  int backfill_adjusted = 0;
  vector<PGRef> pgs;
  osd->_get_pgs(&pgs);
  for (auto p : pgs) {
    backfill_adjusted += p->pg_stat_adjust(&new_stat);
  }
  if (backfill_adjusted) {
    dout(20) << __func__ << " backfill adjusted " << new_stat << dendl;
  }
  return ((float)new_stat.statfs.get_used()) / ((float)new_stat.statfs.total);
}
1012
1013 bool OSDService::check_osdmap_full(const set<pg_shard_t> &missing_on)
1014 {
1015 OSDMapRef osdmap = get_osdmap();
1016 for (auto shard : missing_on) {
1017 if (osdmap->get_state(shard.osd) & CEPH_OSD_FULL)
1018 return true;
1019 }
1020 return false;
1021 }
1022
1023 void OSDService::send_message_osd_cluster(int peer, Message *m, epoch_t from_epoch)
1024 {
1025 OSDMapRef next_map = get_nextmap_reserved();
1026 // service map is always newer/newest
1027 ceph_assert(from_epoch <= next_map->get_epoch());
1028
1029 if (next_map->is_down(peer) ||
1030 next_map->get_info(peer).up_from > from_epoch) {
1031 m->put();
1032 release_map(next_map);
1033 return;
1034 }
1035 ConnectionRef peer_con = osd->cluster_messenger->connect_to_osd(
1036 next_map->get_cluster_addrs(peer));
1037 share_map_peer(peer, peer_con.get(), next_map);
1038 peer_con->send_message(m);
1039 release_map(next_map);
1040 }
1041
1042 ConnectionRef OSDService::get_con_osd_cluster(int peer, epoch_t from_epoch)
1043 {
1044 OSDMapRef next_map = get_nextmap_reserved();
1045 // service map is always newer/newest
1046 ceph_assert(from_epoch <= next_map->get_epoch());
1047
1048 if (next_map->is_down(peer) ||
1049 next_map->get_info(peer).up_from > from_epoch) {
1050 release_map(next_map);
1051 return NULL;
1052 }
1053 ConnectionRef con = osd->cluster_messenger->connect_to_osd(
1054 next_map->get_cluster_addrs(peer));
1055 release_map(next_map);
1056 return con;
1057 }
1058
1059 pair<ConnectionRef,ConnectionRef> OSDService::get_con_osd_hb(int peer, epoch_t from_epoch)
1060 {
1061 OSDMapRef next_map = get_nextmap_reserved();
1062 // service map is always newer/newest
1063 ceph_assert(from_epoch <= next_map->get_epoch());
1064
1065 pair<ConnectionRef,ConnectionRef> ret;
1066 if (next_map->is_down(peer) ||
1067 next_map->get_info(peer).up_from > from_epoch) {
1068 release_map(next_map);
1069 return ret;
1070 }
1071 ret.first = osd->hb_back_client_messenger->connect_to_osd(
1072 next_map->get_hb_back_addrs(peer));
1073 ret.second = osd->hb_front_client_messenger->connect_to_osd(
1074 next_map->get_hb_front_addrs(peer));
1075 release_map(next_map);
1076 return ret;
1077 }
1078
1079 entity_name_t OSDService::get_cluster_msgr_name() const
1080 {
1081 return cluster_messenger->get_myname();
1082 }
1083
1084 void OSDService::queue_want_pg_temp(pg_t pgid,
1085 const vector<int>& want,
1086 bool forced)
1087 {
1088 std::lock_guard l(pg_temp_lock);
1089 auto p = pg_temp_pending.find(pgid);
1090 if (p == pg_temp_pending.end() ||
1091 p->second.acting != want ||
1092 forced) {
1093 pg_temp_wanted[pgid] = {want, forced};
1094 }
1095 }
1096
1097 void OSDService::remove_want_pg_temp(pg_t pgid)
1098 {
1099 std::lock_guard l(pg_temp_lock);
1100 pg_temp_wanted.erase(pgid);
1101 pg_temp_pending.erase(pgid);
1102 }
1103
1104 void OSDService::_sent_pg_temp()
1105 {
1106 #ifdef HAVE_STDLIB_MAP_SPLICING
1107 pg_temp_pending.merge(pg_temp_wanted);
1108 #else
1109 pg_temp_pending.insert(make_move_iterator(begin(pg_temp_wanted)),
1110 make_move_iterator(end(pg_temp_wanted)));
1111 #endif
1112 pg_temp_wanted.clear();
1113 }
1114
1115 void OSDService::requeue_pg_temp()
1116 {
1117 std::lock_guard l(pg_temp_lock);
1118 // wanted overrides pending. note that remove_want_pg_temp
1119 // clears the item out of both.
1120 unsigned old_wanted = pg_temp_wanted.size();
1121 unsigned old_pending = pg_temp_pending.size();
1122 _sent_pg_temp();
1123 pg_temp_wanted.swap(pg_temp_pending);
1124 dout(10) << __func__ << " " << old_wanted << " + " << old_pending << " -> "
1125 << pg_temp_wanted.size() << dendl;
1126 }
1127
1128 std::ostream& operator<<(std::ostream& out,
1129 const OSDService::pg_temp_t& pg_temp)
1130 {
1131 out << pg_temp.acting;
1132 if (pg_temp.forced) {
1133 out << " (forced)";
1134 }
1135 return out;
1136 }
1137
1138 void OSDService::send_pg_temp()
1139 {
1140 std::lock_guard l(pg_temp_lock);
1141 if (pg_temp_wanted.empty())
1142 return;
1143 dout(10) << "send_pg_temp " << pg_temp_wanted << dendl;
1144 MOSDPGTemp *ms[2] = {nullptr, nullptr};
1145 for (auto& [pgid, pg_temp] : pg_temp_wanted) {
1146 auto& m = ms[pg_temp.forced];
1147 if (!m) {
1148 m = new MOSDPGTemp(osdmap->get_epoch());
1149 m->forced = pg_temp.forced;
1150 }
1151 m->pg_temp.emplace(pgid, pg_temp.acting);
1152 }
1153 for (auto m : ms) {
1154 if (m) {
1155 monc->send_mon_message(m);
1156 }
1157 }
1158 _sent_pg_temp();
1159 }
1160
1161 void OSDService::send_pg_created(pg_t pgid)
1162 {
1163 std::lock_guard l(pg_created_lock);
1164 dout(20) << __func__ << dendl;
1165 auto o = get_osdmap();
1166 if (o->require_osd_release >= CEPH_RELEASE_LUMINOUS) {
1167 pg_created.insert(pgid);
1168 monc->send_mon_message(new MOSDPGCreated(pgid));
1169 }
1170 }
1171
1172 void OSDService::send_pg_created()
1173 {
1174 std::lock_guard l(pg_created_lock);
1175 dout(20) << __func__ << dendl;
1176 auto o = get_osdmap();
1177 if (o->require_osd_release >= CEPH_RELEASE_LUMINOUS) {
1178 for (auto pgid : pg_created) {
1179 monc->send_mon_message(new MOSDPGCreated(pgid));
1180 }
1181 }
1182 }
1183
1184 void OSDService::prune_pg_created()
1185 {
1186 std::lock_guard l(pg_created_lock);
1187 dout(20) << __func__ << dendl;
1188 auto o = get_osdmap();
1189 auto i = pg_created.begin();
1190 while (i != pg_created.end()) {
1191 auto p = o->get_pg_pool(i->pool());
1192 if (!p || !p->has_flag(pg_pool_t::FLAG_CREATING)) {
1193 dout(20) << __func__ << " pruning " << *i << dendl;
1194 i = pg_created.erase(i);
1195 } else {
1196 dout(20) << __func__ << " keeping " << *i << dendl;
1197 ++i;
1198 }
1199 }
1200 }
1201
1202
1203 // --------------------------------------
1204 // dispatch
1205
1206 epoch_t OSDService::get_peer_epoch(int peer)
1207 {
1208 std::lock_guard l(peer_map_epoch_lock);
1209 map<int,epoch_t>::iterator p = peer_map_epoch.find(peer);
1210 if (p == peer_map_epoch.end())
1211 return 0;
1212 return p->second;
1213 }
1214
1215 epoch_t OSDService::note_peer_epoch(int peer, epoch_t e)
1216 {
1217 std::lock_guard l(peer_map_epoch_lock);
1218 map<int,epoch_t>::iterator p = peer_map_epoch.find(peer);
1219 if (p != peer_map_epoch.end()) {
1220 if (p->second < e) {
1221 dout(10) << "note_peer_epoch osd." << peer << " has " << e << dendl;
1222 p->second = e;
1223 } else {
1224 dout(30) << "note_peer_epoch osd." << peer << " has " << p->second << " >= " << e << dendl;
1225 }
1226 return p->second;
1227 } else {
1228 dout(10) << "note_peer_epoch osd." << peer << " now has " << e << dendl;
1229 peer_map_epoch[peer] = e;
1230 return e;
1231 }
1232 }
1233
1234 void OSDService::forget_peer_epoch(int peer, epoch_t as_of)
1235 {
1236 std::lock_guard l(peer_map_epoch_lock);
1237 map<int,epoch_t>::iterator p = peer_map_epoch.find(peer);
1238 if (p != peer_map_epoch.end()) {
1239 if (p->second <= as_of) {
1240 dout(10) << "forget_peer_epoch osd." << peer << " as_of " << as_of
1241 << " had " << p->second << dendl;
1242 peer_map_epoch.erase(p);
1243 } else {
1244 dout(10) << "forget_peer_epoch osd." << peer << " as_of " << as_of
1245 << " has " << p->second << " - not forgetting" << dendl;
1246 }
1247 }
1248 }
1249
1250 bool OSDService::should_share_map(entity_name_t name, Connection *con,
1251 epoch_t epoch, const OSDMapRef& osdmap,
1252 const epoch_t *sent_epoch_p)
1253 {
1254 dout(20) << "should_share_map "
1255 << name << " " << con->get_peer_addr()
1256 << " " << epoch << dendl;
1257
1258 // does client have old map?
1259 if (name.is_client()) {
1260 bool message_sendmap = epoch < osdmap->get_epoch();
1261 if (message_sendmap && sent_epoch_p) {
1262 dout(20) << "client session last_sent_epoch: "
1263 << *sent_epoch_p
1264 << " versus osdmap epoch " << osdmap->get_epoch() << dendl;
1265 if (*sent_epoch_p < osdmap->get_epoch()) {
1266 return true;
1267 } // else we don't need to send it out again
1268 }
1269 }
1270
1271 if (con->get_messenger() == osd->cluster_messenger &&
1272 con != osd->cluster_messenger->get_loopback_connection() &&
1273 osdmap->is_up(name.num()) &&
1274 (osdmap->get_cluster_addrs(name.num()) == con->get_peer_addrs() ||
1275 osdmap->get_hb_back_addrs(name.num()) == con->get_peer_addrs())) {
1276 // remember
1277 epoch_t has = std::max(get_peer_epoch(name.num()), epoch);
1278
1279 // share?
1280 if (has < osdmap->get_epoch()) {
1281 dout(10) << name << " " << con->get_peer_addr()
1282 << " has old map " << epoch << " < "
1283 << osdmap->get_epoch() << dendl;
1284 return true;
1285 }
1286 }
1287
1288 return false;
1289 }
1290
1291 void OSDService::share_map(
1292 entity_name_t name,
1293 Connection *con,
1294 epoch_t epoch,
1295 OSDMapRef& osdmap,
1296 epoch_t *sent_epoch_p)
1297 {
1298 dout(20) << "share_map "
1299 << name << " " << con->get_peer_addr()
1300 << " " << epoch << dendl;
1301
1302 if (!osd->is_active()) {
1303 /*It is safe not to proceed as OSD is not in healthy state*/
1304 return;
1305 }
1306
1307 bool want_shared = should_share_map(name, con, epoch,
1308 osdmap, sent_epoch_p);
1309
1310 if (want_shared){
1311 if (name.is_client()) {
1312 dout(10) << name << " has old map " << epoch
1313 << " < " << osdmap->get_epoch() << dendl;
1314 // we know the Session is valid or we wouldn't be sending
1315 if (sent_epoch_p) {
1316 *sent_epoch_p = osdmap->get_epoch();
1317 }
1318 send_incremental_map(epoch, con, osdmap);
1319 } else if (con->get_messenger() == osd->cluster_messenger &&
1320 osdmap->is_up(name.num()) &&
1321 (osdmap->get_cluster_addrs(name.num()) == con->get_peer_addrs() ||
1322 osdmap->get_hb_back_addrs(name.num()) == con->get_peer_addrs())) {
1323 dout(10) << name << " " << con->get_peer_addrs()
1324 << " has old map " << epoch << " < "
1325 << osdmap->get_epoch() << dendl;
1326 note_peer_epoch(name.num(), osdmap->get_epoch());
1327 send_incremental_map(epoch, con, osdmap);
1328 }
1329 }
1330 }
1331
1332 void OSDService::share_map_peer(int peer, Connection *con, OSDMapRef map)
1333 {
1334 if (!map)
1335 map = get_osdmap();
1336
1337 // send map?
1338 epoch_t pe = get_peer_epoch(peer);
1339 if (pe) {
1340 if (pe < map->get_epoch()) {
1341 send_incremental_map(pe, con, map);
1342 note_peer_epoch(peer, map->get_epoch());
1343 } else
1344 dout(20) << "share_map_peer " << con << " already has epoch " << pe << dendl;
1345 } else {
1346 dout(20) << "share_map_peer " << con << " don't know epoch, doing nothing" << dendl;
1347 // no idea about peer's epoch.
1348 // ??? send recent ???
1349 // do nothing.
1350 }
1351 }
1352
1353 bool OSDService::can_inc_scrubs()
1354 {
1355 bool can_inc = false;
1356 std::lock_guard l(sched_scrub_lock);
1357
1358 if (scrubs_local + scrubs_remote < cct->_conf->osd_max_scrubs) {
1359 dout(20) << __func__ << " == true " << scrubs_local << " local + " << scrubs_remote
1360 << " remote < max " << cct->_conf->osd_max_scrubs << dendl;
1361 can_inc = true;
1362 } else {
1363 dout(20) << __func__ << " == false " << scrubs_local << " local + " << scrubs_remote
1364 << " remote >= max " << cct->_conf->osd_max_scrubs << dendl;
1365 }
1366
1367 return can_inc;
1368 }
1369
1370 bool OSDService::inc_scrubs_local()
1371 {
1372 bool result = false;
1373 std::lock_guard l{sched_scrub_lock};
1374 if (scrubs_local + scrubs_remote < cct->_conf->osd_max_scrubs) {
1375 dout(20) << __func__ << " " << scrubs_local << " -> " << (scrubs_local+1)
1376 << " (max " << cct->_conf->osd_max_scrubs << ", remote " << scrubs_remote << ")" << dendl;
1377 result = true;
1378 ++scrubs_local;
1379 } else {
1380 dout(20) << __func__ << " " << scrubs_local << " local + " << scrubs_remote << " remote >= max " << cct->_conf->osd_max_scrubs << dendl;
1381 }
1382 return result;
1383 }
1384
1385 void OSDService::dec_scrubs_local()
1386 {
1387 std::lock_guard l{sched_scrub_lock};
1388 dout(20) << __func__ << " " << scrubs_local << " -> " << (scrubs_local-1)
1389 << " (max " << cct->_conf->osd_max_scrubs << ", remote " << scrubs_remote << ")" << dendl;
1390 --scrubs_local;
1391 ceph_assert(scrubs_local >= 0);
1392 }
1393
1394 bool OSDService::inc_scrubs_remote()
1395 {
1396 bool result = false;
1397 std::lock_guard l{sched_scrub_lock};
1398 if (scrubs_local + scrubs_remote < cct->_conf->osd_max_scrubs) {
1399 dout(20) << __func__ << " " << scrubs_remote << " -> " << (scrubs_remote+1)
1400 << " (max " << cct->_conf->osd_max_scrubs << ", local " << scrubs_local << ")" << dendl;
1401 result = true;
1402 ++scrubs_remote;
1403 } else {
1404 dout(20) << __func__ << " " << scrubs_local << " local + " << scrubs_remote << " remote >= max " << cct->_conf->osd_max_scrubs << dendl;
1405 }
1406 return result;
1407 }
1408
1409 void OSDService::dec_scrubs_remote()
1410 {
1411 std::lock_guard l{sched_scrub_lock};
1412 dout(20) << __func__ << " " << scrubs_remote << " -> " << (scrubs_remote-1)
1413 << " (max " << cct->_conf->osd_max_scrubs << ", local " << scrubs_local << ")" << dendl;
1414 --scrubs_remote;
1415 ceph_assert(scrubs_remote >= 0);
1416 }
1417
1418 void OSDService::dump_scrub_reservations(Formatter *f)
1419 {
1420 std::lock_guard l{sched_scrub_lock};
1421 f->dump_int("scrubs_local", scrubs_local);
1422 f->dump_int("scrubs_remote", scrubs_remote);
1423 f->dump_int("osd_max_scrubs", cct->_conf->osd_max_scrubs);
1424 }
1425
1426 void OSDService::retrieve_epochs(epoch_t *_boot_epoch, epoch_t *_up_epoch,
1427 epoch_t *_bind_epoch) const
1428 {
1429 std::lock_guard l(epoch_lock);
1430 if (_boot_epoch)
1431 *_boot_epoch = boot_epoch;
1432 if (_up_epoch)
1433 *_up_epoch = up_epoch;
1434 if (_bind_epoch)
1435 *_bind_epoch = bind_epoch;
1436 }
1437
1438 void OSDService::set_epochs(const epoch_t *_boot_epoch, const epoch_t *_up_epoch,
1439 const epoch_t *_bind_epoch)
1440 {
1441 std::lock_guard l(epoch_lock);
1442 if (_boot_epoch) {
1443 ceph_assert(*_boot_epoch == 0 || *_boot_epoch >= boot_epoch);
1444 boot_epoch = *_boot_epoch;
1445 }
1446 if (_up_epoch) {
1447 ceph_assert(*_up_epoch == 0 || *_up_epoch >= up_epoch);
1448 up_epoch = *_up_epoch;
1449 }
1450 if (_bind_epoch) {
1451 ceph_assert(*_bind_epoch == 0 || *_bind_epoch >= bind_epoch);
1452 bind_epoch = *_bind_epoch;
1453 }
1454 }
1455
1456 bool OSDService::prepare_to_stop()
1457 {
1458 std::lock_guard l(is_stopping_lock);
1459 if (get_state() != NOT_STOPPING)
1460 return false;
1461
1462 OSDMapRef osdmap = get_osdmap();
1463 if (osdmap && osdmap->is_up(whoami)) {
1464 dout(0) << __func__ << " telling mon we are shutting down" << dendl;
1465 set_state(PREPARING_TO_STOP);
1466 monc->send_mon_message(
1467 new MOSDMarkMeDown(
1468 monc->get_fsid(),
1469 whoami,
1470 osdmap->get_addrs(whoami),
1471 osdmap->get_epoch(),
1472 true // request ack
1473 ));
1474 utime_t now = ceph_clock_now();
1475 utime_t timeout;
1476 timeout.set_from_double(now + cct->_conf->osd_mon_shutdown_timeout);
1477 while ((ceph_clock_now() < timeout) &&
1478 (get_state() != STOPPING)) {
1479 is_stopping_cond.WaitUntil(is_stopping_lock, timeout);
1480 }
1481 }
1482 dout(0) << __func__ << " starting shutdown" << dendl;
1483 set_state(STOPPING);
1484 return true;
1485 }
1486
1487 void OSDService::got_stop_ack()
1488 {
1489 std::lock_guard l(is_stopping_lock);
1490 if (get_state() == PREPARING_TO_STOP) {
1491 dout(0) << __func__ << " starting shutdown" << dendl;
1492 set_state(STOPPING);
1493 is_stopping_cond.Signal();
1494 } else {
1495 dout(10) << __func__ << " ignoring msg" << dendl;
1496 }
1497 }
1498
1499 MOSDMap *OSDService::build_incremental_map_msg(epoch_t since, epoch_t to,
1500 OSDSuperblock& sblock)
1501 {
1502 MOSDMap *m = new MOSDMap(monc->get_fsid(),
1503 osdmap->get_encoding_features());
1504 m->oldest_map = max_oldest_map;
1505 m->newest_map = sblock.newest_map;
1506
1507 int max = cct->_conf->osd_map_message_max;
1508 ssize_t max_bytes = cct->_conf->osd_map_message_max_bytes;
1509
1510 if (since < m->oldest_map) {
1511 // we don't have the next map the target wants, so start with a
1512 // full map.
1513 bufferlist bl;
1514 dout(10) << __func__ << " oldest map " << max_oldest_map << " > since "
1515 << since << ", starting with full map" << dendl;
1516 since = m->oldest_map;
1517 if (!get_map_bl(since, bl)) {
1518 derr << __func__ << " missing full map " << since << dendl;
1519 goto panic;
1520 }
1521 max--;
1522 max_bytes -= bl.length();
1523 m->maps[since].claim(bl);
1524 }
1525 for (epoch_t e = since + 1; e <= to; ++e) {
1526 bufferlist bl;
1527 if (get_inc_map_bl(e, bl)) {
1528 m->incremental_maps[e].claim(bl);
1529 } else {
1530 derr << __func__ << " missing incremental map " << e << dendl;
1531 if (!get_map_bl(e, bl)) {
1532 derr << __func__ << " also missing full map " << e << dendl;
1533 goto panic;
1534 }
1535 m->maps[e].claim(bl);
1536 }
1537 max--;
1538 max_bytes -= bl.length();
1539 if (max <= 0 || max_bytes <= 0) {
1540 break;
1541 }
1542 }
1543 return m;
1544
1545 panic:
1546 if (!m->maps.empty() ||
1547 !m->incremental_maps.empty()) {
1548 // send what we have so far
1549 return m;
1550 }
1551 // send something
1552 bufferlist bl;
1553 if (get_inc_map_bl(m->newest_map, bl)) {
1554 m->incremental_maps[m->newest_map].claim(bl);
1555 } else {
1556 derr << __func__ << " unable to load latest map " << m->newest_map << dendl;
1557 if (!get_map_bl(m->newest_map, bl)) {
1558 derr << __func__ << " unable to load latest full map " << m->newest_map
1559 << dendl;
1560 ceph_abort();
1561 }
1562 m->maps[m->newest_map].claim(bl);
1563 }
1564 return m;
1565 }
1566
1567 void OSDService::send_map(MOSDMap *m, Connection *con)
1568 {
1569 con->send_message(m);
1570 }
1571
1572 void OSDService::send_incremental_map(epoch_t since, Connection *con,
1573 OSDMapRef& osdmap)
1574 {
1575 epoch_t to = osdmap->get_epoch();
1576 dout(10) << "send_incremental_map " << since << " -> " << to
1577 << " to " << con << " " << con->get_peer_addr() << dendl;
1578
1579 MOSDMap *m = NULL;
1580 while (!m) {
1581 OSDSuperblock sblock(get_superblock());
1582 if (since < sblock.oldest_map) {
1583 // just send latest full map
1584 MOSDMap *m = new MOSDMap(monc->get_fsid(),
1585 osdmap->get_encoding_features());
1586 m->oldest_map = max_oldest_map;
1587 m->newest_map = sblock.newest_map;
1588 get_map_bl(to, m->maps[to]);
1589 send_map(m, con);
1590 return;
1591 }
1592
1593 if (to > since && (int64_t)(to - since) > cct->_conf->osd_map_share_max_epochs) {
1594 dout(10) << " " << (to - since) << " > max " << cct->_conf->osd_map_share_max_epochs
1595 << ", only sending most recent" << dendl;
1596 since = to - cct->_conf->osd_map_share_max_epochs;
1597 }
1598
1599 m = build_incremental_map_msg(since, to, sblock);
1600 }
1601 send_map(m, con);
1602 }
1603
1604 bool OSDService::_get_map_bl(epoch_t e, bufferlist& bl)
1605 {
1606 bool found = map_bl_cache.lookup(e, &bl);
1607 if (found) {
1608 if (logger)
1609 logger->inc(l_osd_map_bl_cache_hit);
1610 return true;
1611 }
1612 if (logger)
1613 logger->inc(l_osd_map_bl_cache_miss);
1614 found = store->read(meta_ch,
1615 OSD::get_osdmap_pobject_name(e), 0, 0, bl,
1616 CEPH_OSD_OP_FLAG_FADVISE_WILLNEED) >= 0;
1617 if (found) {
1618 _add_map_bl(e, bl);
1619 }
1620 return found;
1621 }
1622
1623 bool OSDService::get_inc_map_bl(epoch_t e, bufferlist& bl)
1624 {
1625 std::lock_guard l(map_cache_lock);
1626 bool found = map_bl_inc_cache.lookup(e, &bl);
1627 if (found) {
1628 if (logger)
1629 logger->inc(l_osd_map_bl_cache_hit);
1630 return true;
1631 }
1632 if (logger)
1633 logger->inc(l_osd_map_bl_cache_miss);
1634 found = store->read(meta_ch,
1635 OSD::get_inc_osdmap_pobject_name(e), 0, 0, bl,
1636 CEPH_OSD_OP_FLAG_FADVISE_WILLNEED) >= 0;
1637 if (found) {
1638 _add_map_inc_bl(e, bl);
1639 }
1640 return found;
1641 }
1642
1643 void OSDService::_add_map_bl(epoch_t e, bufferlist& bl)
1644 {
1645 dout(10) << "add_map_bl " << e << " " << bl.length() << " bytes" << dendl;
1646 // cache a contiguous buffer
1647 if (bl.get_num_buffers() > 1) {
1648 bl.rebuild();
1649 }
1650 bl.try_assign_to_mempool(mempool::mempool_osd_mapbl);
1651 map_bl_cache.add(e, bl);
1652 }
1653
1654 void OSDService::_add_map_inc_bl(epoch_t e, bufferlist& bl)
1655 {
1656 dout(10) << "add_map_inc_bl " << e << " " << bl.length() << " bytes" << dendl;
1657 // cache a contiguous buffer
1658 if (bl.get_num_buffers() > 1) {
1659 bl.rebuild();
1660 }
1661 bl.try_assign_to_mempool(mempool::mempool_osd_mapbl);
1662 map_bl_inc_cache.add(e, bl);
1663 }
1664
1665 int OSDService::get_deleted_pool_pg_num(int64_t pool)
1666 {
1667 std::lock_guard l(map_cache_lock);
1668 auto p = deleted_pool_pg_nums.find(pool);
1669 if (p != deleted_pool_pg_nums.end()) {
1670 return p->second;
1671 }
1672 dout(20) << __func__ << " " << pool << " loading" << dendl;
1673 ghobject_t oid = OSD::make_final_pool_info_oid(pool);
1674 bufferlist bl;
1675 int r = store->read(meta_ch, oid, 0, 0, bl);
1676 ceph_assert(r >= 0);
1677 auto blp = bl.cbegin();
1678 pg_pool_t pi;
1679 ::decode(pi, blp);
1680 deleted_pool_pg_nums[pool] = pi.get_pg_num();
1681 dout(20) << __func__ << " " << pool << " got " << pi.get_pg_num() << dendl;
1682 return pi.get_pg_num();
1683 }
1684
1685 OSDMapRef OSDService::_add_map(OSDMap *o)
1686 {
1687 epoch_t e = o->get_epoch();
1688
1689 if (cct->_conf->osd_map_dedup) {
1690 // Dedup against an existing map at a nearby epoch
1691 OSDMapRef for_dedup = map_cache.lower_bound(e);
1692 if (for_dedup) {
1693 OSDMap::dedup(for_dedup.get(), o);
1694 }
1695 }
1696 bool existed;
1697 OSDMapRef l = map_cache.add(e, o, &existed);
1698 if (existed) {
1699 delete o;
1700 }
1701 return l;
1702 }
1703
1704 OSDMapRef OSDService::try_get_map(epoch_t epoch)
1705 {
1706 std::lock_guard l(map_cache_lock);
1707 OSDMapRef retval = map_cache.lookup(epoch);
1708 if (retval) {
1709 dout(30) << "get_map " << epoch << " -cached" << dendl;
1710 if (logger) {
1711 logger->inc(l_osd_map_cache_hit);
1712 }
1713 return retval;
1714 }
1715 if (logger) {
1716 logger->inc(l_osd_map_cache_miss);
1717 epoch_t lb = map_cache.cached_key_lower_bound();
1718 if (epoch < lb) {
1719 dout(30) << "get_map " << epoch << " - miss, below lower bound" << dendl;
1720 logger->inc(l_osd_map_cache_miss_low);
1721 logger->inc(l_osd_map_cache_miss_low_avg, lb - epoch);
1722 }
1723 }
1724
1725 OSDMap *map = new OSDMap;
1726 if (epoch > 0) {
1727 dout(20) << "get_map " << epoch << " - loading and decoding " << map << dendl;
1728 bufferlist bl;
1729 if (!_get_map_bl(epoch, bl) || bl.length() == 0) {
1730 derr << "failed to load OSD map for epoch " << epoch << ", got " << bl.length() << " bytes" << dendl;
1731 delete map;
1732 return OSDMapRef();
1733 }
1734 map->decode(bl);
1735 } else {
1736 dout(20) << "get_map " << epoch << " - return initial " << map << dendl;
1737 }
1738 return _add_map(map);
1739 }
1740
1741 // ops
1742
1743
1744 void OSDService::reply_op_error(OpRequestRef op, int err)
1745 {
1746 reply_op_error(op, err, eversion_t(), 0);
1747 }
1748
1749 void OSDService::reply_op_error(OpRequestRef op, int err, eversion_t v,
1750 version_t uv)
1751 {
1752 const MOSDOp *m = static_cast<const MOSDOp*>(op->get_req());
1753 ceph_assert(m->get_type() == CEPH_MSG_OSD_OP);
1754 int flags;
1755 flags = m->get_flags() & (CEPH_OSD_FLAG_ACK|CEPH_OSD_FLAG_ONDISK);
1756
1757 MOSDOpReply *reply = new MOSDOpReply(m, err, osdmap->get_epoch(), flags, true);
1758 reply->set_reply_versions(v, uv);
1759 m->get_connection()->send_message(reply);
1760 }
1761
1762 void OSDService::handle_misdirected_op(PG *pg, OpRequestRef op)
1763 {
1764 if (!cct->_conf->osd_debug_misdirected_ops) {
1765 return;
1766 }
1767
1768 const MOSDOp *m = static_cast<const MOSDOp*>(op->get_req());
1769 ceph_assert(m->get_type() == CEPH_MSG_OSD_OP);
1770
1771 ceph_assert(m->get_map_epoch() >= pg->get_history().same_primary_since);
1772
1773 if (pg->is_ec_pg()) {
1774 /**
1775 * OSD recomputes op target based on current OSDMap. With an EC pg, we
1776 * can get this result:
1777 * 1) client at map 512 sends an op to osd 3, pg_t 3.9 based on mapping
1778 * [CRUSH_ITEM_NONE, 2, 3]/3
1779 * 2) OSD 3 at map 513 remaps op to osd 3, spg_t 3.9s0 based on mapping
1780 * [3, 2, 3]/3
1781 * 3) PG 3.9s0 dequeues the op at epoch 512 and notices that it isn't primary
1782 * -- misdirected op
1783 * 4) client resends and this time PG 3.9s0 having caught up to 513 gets
1784 * it and fulfils it
1785 *
1786 * We can't compute the op target based on the sending map epoch due to
1787 * splitting. The simplest thing is to detect such cases here and drop
1788 * them without an error (the client will resend anyway).
1789 */
1790 ceph_assert(m->get_map_epoch() <= superblock.newest_map);
1791 OSDMapRef opmap = try_get_map(m->get_map_epoch());
1792 if (!opmap) {
1793 dout(7) << __func__ << ": " << *pg << " no longer have map for "
1794 << m->get_map_epoch() << ", dropping" << dendl;
1795 return;
1796 }
1797 pg_t _pgid = m->get_raw_pg();
1798 spg_t pgid;
1799 if ((m->get_flags() & CEPH_OSD_FLAG_PGOP) == 0)
1800 _pgid = opmap->raw_pg_to_pg(_pgid);
1801 if (opmap->get_primary_shard(_pgid, &pgid) &&
1802 pgid.shard != pg->pg_id.shard) {
1803 dout(7) << __func__ << ": " << *pg << " primary changed since "
1804 << m->get_map_epoch() << ", dropping" << dendl;
1805 return;
1806 }
1807 }
1808
1809 dout(7) << *pg << " misdirected op in " << m->get_map_epoch() << dendl;
1810 clog->warn() << m->get_source_inst() << " misdirected " << m->get_reqid()
1811 << " pg " << m->get_raw_pg()
1812 << " to osd." << whoami
1813 << " not " << pg->get_acting()
1814 << " in e" << m->get_map_epoch() << "/" << osdmap->get_epoch();
1815 }
1816
1817 void OSDService::enqueue_back(OpQueueItem&& qi)
1818 {
1819 osd->op_shardedwq.queue(std::move(qi));
1820 }
1821
1822 void OSDService::enqueue_front(OpQueueItem&& qi)
1823 {
1824 osd->op_shardedwq.queue_front(std::move(qi));
1825 }
1826
1827 void OSDService::queue_recovery_context(
1828 PG *pg,
1829 GenContext<ThreadPool::TPHandle&> *c)
1830 {
1831 epoch_t e = get_osdmap_epoch();
1832 enqueue_back(
1833 OpQueueItem(
1834 unique_ptr<OpQueueItem::OpQueueable>(
1835 new PGRecoveryContext(pg->get_pgid(), c, e)),
1836 cct->_conf->osd_recovery_cost,
1837 cct->_conf->osd_recovery_priority,
1838 ceph_clock_now(),
1839 0,
1840 e));
1841 }
1842
1843 void OSDService::queue_for_snap_trim(PG *pg)
1844 {
1845 dout(10) << "queueing " << *pg << " for snaptrim" << dendl;
1846 enqueue_back(
1847 OpQueueItem(
1848 unique_ptr<OpQueueItem::OpQueueable>(
1849 new PGSnapTrim(pg->get_pgid(), pg->get_osdmap_epoch())),
1850 cct->_conf->osd_snap_trim_cost,
1851 cct->_conf->osd_snap_trim_priority,
1852 ceph_clock_now(),
1853 0,
1854 pg->get_osdmap_epoch()));
1855 }
1856
1857 void OSDService::queue_for_scrub(PG *pg, bool with_high_priority)
1858 {
1859 unsigned scrub_queue_priority = pg->scrubber.priority;
1860 if (with_high_priority && scrub_queue_priority < cct->_conf->osd_client_op_priority) {
1861 scrub_queue_priority = cct->_conf->osd_client_op_priority;
1862 }
1863 const auto epoch = pg->get_osdmap_epoch();
1864 enqueue_back(
1865 OpQueueItem(
1866 unique_ptr<OpQueueItem::OpQueueable>(new PGScrub(pg->get_pgid(), epoch)),
1867 cct->_conf->osd_scrub_cost,
1868 scrub_queue_priority,
1869 ceph_clock_now(),
1870 0,
1871 epoch));
1872 }
1873
1874 void OSDService::queue_for_pg_delete(spg_t pgid, epoch_t e)
1875 {
1876 dout(10) << __func__ << " on " << pgid << " e " << e << dendl;
1877 enqueue_back(
1878 OpQueueItem(
1879 unique_ptr<OpQueueItem::OpQueueable>(
1880 new PGDelete(pgid, e)),
1881 cct->_conf->osd_pg_delete_cost,
1882 cct->_conf->osd_pg_delete_priority,
1883 ceph_clock_now(),
1884 0,
1885 e));
1886 }
1887
1888 bool OSDService::try_finish_pg_delete(PG *pg, unsigned old_pg_num)
1889 {
1890 return osd->try_finish_pg_delete(pg, old_pg_num);
1891 }
1892
1893 // ---
1894
1895 void OSDService::set_ready_to_merge_source(PG *pg, eversion_t version)
1896 {
1897 std::lock_guard l(merge_lock);
1898 dout(10) << __func__ << " " << pg->pg_id << dendl;
1899 ready_to_merge_source[pg->pg_id.pgid] = version;
1900 assert(not_ready_to_merge_source.count(pg->pg_id.pgid) == 0);
1901 _send_ready_to_merge();
1902 }
1903
1904 void OSDService::set_ready_to_merge_target(PG *pg,
1905 eversion_t version,
1906 epoch_t last_epoch_started,
1907 epoch_t last_epoch_clean)
1908 {
1909 std::lock_guard l(merge_lock);
1910 dout(10) << __func__ << " " << pg->pg_id << dendl;
1911 ready_to_merge_target.insert(make_pair(pg->pg_id.pgid,
1912 make_tuple(version,
1913 last_epoch_started,
1914 last_epoch_clean)));
1915 assert(not_ready_to_merge_target.count(pg->pg_id.pgid) == 0);
1916 _send_ready_to_merge();
1917 }
1918
1919 void OSDService::set_not_ready_to_merge_source(pg_t source)
1920 {
1921 std::lock_guard l(merge_lock);
1922 dout(10) << __func__ << " " << source << dendl;
1923 not_ready_to_merge_source.insert(source);
1924 assert(ready_to_merge_source.count(source) == 0);
1925 _send_ready_to_merge();
1926 }
1927
1928 void OSDService::set_not_ready_to_merge_target(pg_t target, pg_t source)
1929 {
1930 std::lock_guard l(merge_lock);
1931 dout(10) << __func__ << " " << target << " source " << source << dendl;
1932 not_ready_to_merge_target[target] = source;
1933 assert(ready_to_merge_target.count(target) == 0);
1934 _send_ready_to_merge();
1935 }
1936
1937 void OSDService::send_ready_to_merge()
1938 {
1939 std::lock_guard l(merge_lock);
1940 _send_ready_to_merge();
1941 }
1942
1943 void OSDService::_send_ready_to_merge()
1944 {
1945 dout(20) << __func__
1946 << " ready_to_merge_source " << ready_to_merge_source
1947 << " not_ready_to_merge_source " << not_ready_to_merge_source
1948 << " ready_to_merge_target " << ready_to_merge_target
1949 << " not_ready_to_merge_target " << not_ready_to_merge_target
1950 << " sent_ready_to_merge_source " << sent_ready_to_merge_source
1951 << dendl;
1952 for (auto src : not_ready_to_merge_source) {
1953 if (sent_ready_to_merge_source.count(src) == 0) {
1954 monc->send_mon_message(new MOSDPGReadyToMerge(
1955 src,
1956 {}, {}, 0, 0,
1957 false,
1958 osdmap->get_epoch()));
1959 sent_ready_to_merge_source.insert(src);
1960 }
1961 }
1962 for (auto p : not_ready_to_merge_target) {
1963 if (sent_ready_to_merge_source.count(p.second) == 0) {
1964 monc->send_mon_message(new MOSDPGReadyToMerge(
1965 p.second,
1966 {}, {}, 0, 0,
1967 false,
1968 osdmap->get_epoch()));
1969 sent_ready_to_merge_source.insert(p.second);
1970 }
1971 }
1972 for (auto src : ready_to_merge_source) {
1973 if (not_ready_to_merge_source.count(src.first) ||
1974 not_ready_to_merge_target.count(src.first.get_parent())) {
1975 continue;
1976 }
1977 auto p = ready_to_merge_target.find(src.first.get_parent());
1978 if (p != ready_to_merge_target.end() &&
1979 sent_ready_to_merge_source.count(src.first) == 0) {
1980 monc->send_mon_message(new MOSDPGReadyToMerge(
1981 src.first, // source pgid
1982 src.second, // src version
1983 std::get<0>(p->second), // target version
1984 std::get<1>(p->second), // PG's last_epoch_started
1985 std::get<2>(p->second), // PG's last_epoch_clean
1986 true,
1987 osdmap->get_epoch()));
1988 sent_ready_to_merge_source.insert(src.first);
1989 }
1990 }
1991 }
1992
1993 void OSDService::clear_ready_to_merge(PG *pg)
1994 {
1995 std::lock_guard l(merge_lock);
1996 dout(10) << __func__ << " " << pg->pg_id << dendl;
1997 ready_to_merge_source.erase(pg->pg_id.pgid);
1998 ready_to_merge_target.erase(pg->pg_id.pgid);
1999 not_ready_to_merge_source.erase(pg->pg_id.pgid);
2000 not_ready_to_merge_target.erase(pg->pg_id.pgid);
2001 sent_ready_to_merge_source.erase(pg->pg_id.pgid);
2002 }
2003
2004 void OSDService::clear_sent_ready_to_merge()
2005 {
2006 std::lock_guard l(merge_lock);
2007 sent_ready_to_merge_source.clear();
2008 }
2009
2010 void OSDService::prune_sent_ready_to_merge(OSDMapRef& osdmap)
2011 {
2012 std::lock_guard l(merge_lock);
2013 auto i = sent_ready_to_merge_source.begin();
2014 while (i != sent_ready_to_merge_source.end()) {
2015 if (!osdmap->pg_exists(*i)) {
2016 dout(10) << __func__ << " " << *i << dendl;
2017 i = sent_ready_to_merge_source.erase(i);
2018 } else {
2019 ++i;
2020 }
2021 }
2022 }
2023
2024 // ---
2025
// Push a recovery work item for the given PG onto the back of the op
// queue, carrying the number of recovery pushes already reserved for it.
// p.first is the queueing epoch, p.second the PG.
void OSDService::_queue_for_recovery(
  std::pair<epoch_t, PGRef> p,
  uint64_t reserved_pushes)
{
  // caller must already hold recovery_lock
  ceph_assert(recovery_lock.is_locked_by_me());
  enqueue_back(
    OpQueueItem(
      unique_ptr<OpQueueItem::OpQueueable>(
	new PGRecovery(
	  p.second->get_pgid(), p.first, reserved_pushes)),
      cct->_conf->osd_recovery_cost,      // queue cost of the item
      cct->_conf->osd_recovery_priority,  // queue priority of the item
      ceph_clock_now(),
      0,        // owner id — presumably "no client session"; confirm against OpQueueItem ctor
      p.first));  // epoch the item was queued at
}
2042
2043 // ====================================================================
2044 // OSD
2045
2046 #undef dout_prefix
2047 #define dout_prefix *_dout
2048
2049 // Commands shared between OSD's console and admin console:
2050 namespace ceph {
2051 namespace osd_cmds {
2052
2053 int heap(CephContext& cct, const cmdmap_t& cmdmap, Formatter& f, std::ostream& os);
2054
2055 }} // namespace ceph::osd_cmds
2056
2057 int OSD::mkfs(CephContext *cct, ObjectStore *store, uuid_d fsid, int whoami)
2058 {
2059 int ret;
2060
2061 OSDSuperblock sb;
2062 bufferlist sbbl;
2063 ObjectStore::CollectionHandle ch;
2064
2065 // if we are fed a uuid for this osd, use it.
2066 store->set_fsid(cct->_conf->osd_uuid);
2067
2068 ret = store->mkfs();
2069 if (ret) {
2070 derr << "OSD::mkfs: ObjectStore::mkfs failed with error "
2071 << cpp_strerror(ret) << dendl;
2072 goto free_store;
2073 }
2074
2075 store->set_cache_shards(1); // doesn't matter for mkfs!
2076
2077 ret = store->mount();
2078 if (ret) {
2079 derr << "OSD::mkfs: couldn't mount ObjectStore: error "
2080 << cpp_strerror(ret) << dendl;
2081 goto free_store;
2082 }
2083
2084 ch = store->open_collection(coll_t::meta());
2085 if (ch) {
2086 ret = store->read(ch, OSD_SUPERBLOCK_GOBJECT, 0, 0, sbbl);
2087 if (ret < 0) {
2088 derr << "OSD::mkfs: have meta collection but no superblock" << dendl;
2089 goto free_store;
2090 }
2091 /* if we already have superblock, check content of superblock */
2092 dout(0) << " have superblock" << dendl;
2093 auto p = sbbl.cbegin();
2094 decode(sb, p);
2095 if (whoami != sb.whoami) {
2096 derr << "provided osd id " << whoami << " != superblock's " << sb.whoami
2097 << dendl;
2098 ret = -EINVAL;
2099 goto umount_store;
2100 }
2101 if (fsid != sb.cluster_fsid) {
2102 derr << "provided cluster fsid " << fsid
2103 << " != superblock's " << sb.cluster_fsid << dendl;
2104 ret = -EINVAL;
2105 goto umount_store;
2106 }
2107 } else {
2108 // create superblock
2109 sb.cluster_fsid = fsid;
2110 sb.osd_fsid = store->get_fsid();
2111 sb.whoami = whoami;
2112 sb.compat_features = get_osd_initial_compat_set();
2113
2114 bufferlist bl;
2115 encode(sb, bl);
2116
2117 ObjectStore::CollectionHandle ch = store->create_new_collection(
2118 coll_t::meta());
2119 ObjectStore::Transaction t;
2120 t.create_collection(coll_t::meta(), 0);
2121 t.write(coll_t::meta(), OSD_SUPERBLOCK_GOBJECT, 0, bl.length(), bl);
2122 ret = store->queue_transaction(ch, std::move(t));
2123 if (ret) {
2124 derr << "OSD::mkfs: error while writing OSD_SUPERBLOCK_GOBJECT: "
2125 << "queue_transaction returned " << cpp_strerror(ret) << dendl;
2126 goto umount_store;
2127 }
2128 }
2129
2130 ret = write_meta(cct, store, sb.cluster_fsid, sb.osd_fsid, whoami);
2131 if (ret) {
2132 derr << "OSD::mkfs: failed to write fsid file: error "
2133 << cpp_strerror(ret) << dendl;
2134 goto umount_store;
2135 }
2136
2137 umount_store:
2138 if (ch) {
2139 ch.reset();
2140 }
2141 store->umount();
2142 free_store:
2143 delete store;
2144 return ret;
2145 }
2146
2147 int OSD::write_meta(CephContext *cct, ObjectStore *store, uuid_d& cluster_fsid, uuid_d& osd_fsid, int whoami)
2148 {
2149 char val[80];
2150 int r;
2151
2152 snprintf(val, sizeof(val), "%s", CEPH_OSD_ONDISK_MAGIC);
2153 r = store->write_meta("magic", val);
2154 if (r < 0)
2155 return r;
2156
2157 snprintf(val, sizeof(val), "%d", whoami);
2158 r = store->write_meta("whoami", val);
2159 if (r < 0)
2160 return r;
2161
2162 cluster_fsid.print(val);
2163 r = store->write_meta("ceph_fsid", val);
2164 if (r < 0)
2165 return r;
2166
2167 string key = cct->_conf.get_val<string>("key");
2168 if (key.size()) {
2169 r = store->write_meta("osd_key", key);
2170 if (r < 0)
2171 return r;
2172 } else {
2173 string keyfile = cct->_conf.get_val<string>("keyfile");
2174 if (!keyfile.empty()) {
2175 bufferlist keybl;
2176 string err;
2177 r = keybl.read_file(keyfile.c_str(), &err);
2178 if (r < 0) {
2179 derr << __func__ << " failed to read keyfile " << keyfile << ": "
2180 << err << ": " << cpp_strerror(r) << dendl;
2181 return r;
2182 }
2183 r = store->write_meta("osd_key", keybl.to_str());
2184 if (r < 0)
2185 return r;
2186 }
2187 }
2188
2189 r = store->write_meta("ready", "ready");
2190 if (r < 0)
2191 return r;
2192
2193 return 0;
2194 }
2195
2196 int OSD::peek_meta(ObjectStore *store,
2197 std::string *magic,
2198 uuid_d *cluster_fsid,
2199 uuid_d *osd_fsid,
2200 int *whoami,
2201 int *require_osd_release)
2202 {
2203 string val;
2204
2205 int r = store->read_meta("magic", &val);
2206 if (r < 0)
2207 return r;
2208 *magic = val;
2209
2210 r = store->read_meta("whoami", &val);
2211 if (r < 0)
2212 return r;
2213 *whoami = atoi(val.c_str());
2214
2215 r = store->read_meta("ceph_fsid", &val);
2216 if (r < 0)
2217 return r;
2218 r = cluster_fsid->parse(val.c_str());
2219 if (!r)
2220 return -EINVAL;
2221
2222 r = store->read_meta("fsid", &val);
2223 if (r < 0) {
2224 *osd_fsid = uuid_d();
2225 } else {
2226 r = osd_fsid->parse(val.c_str());
2227 if (!r)
2228 return -EINVAL;
2229 }
2230
2231 r = store->read_meta("require_osd_release", &val);
2232 if (r >= 0) {
2233 *require_osd_release = atoi(val.c_str());
2234 }
2235
2236 return 0;
2237 }
2238
2239
2240 #undef dout_prefix
2241 #define dout_prefix _prefix(_dout, whoami, get_osdmap_epoch())
2242
2243 // cons/des
2244
// OSD constructor: wires up the messengers, mon/mgr clients, thread
// pools, work queues and per-shard op queues. Heavy initialization
// (mounting the store, joining the cluster) happens later in init().
// NOTE: member-initializer order must match declaration order in OSD.h.
OSD::OSD(CephContext *cct_, ObjectStore *store_,
	 int id,
	 Messenger *internal_messenger,
	 Messenger *external_messenger,
	 Messenger *hb_client_front,
	 Messenger *hb_client_back,
	 Messenger *hb_front_serverm,
	 Messenger *hb_back_serverm,
	 Messenger *osdc_messenger,
	 MonClient *mc,
	 const std::string &dev, const std::string &jdev) :
  Dispatcher(cct_),
  osd_lock("OSD::osd_lock"),
  tick_timer(cct, osd_lock),
  tick_timer_lock("OSD::tick_timer_lock"),
  // second tick timer that does not require osd_lock
  tick_timer_without_osd_lock(cct, tick_timer_lock),
  gss_ktfile_client(cct->_conf.get_val<std::string>("gss_ktab_client_file")),
  cluster_messenger(internal_messenger),
  client_messenger(external_messenger),
  objecter_messenger(osdc_messenger),
  monc(mc),
  mgrc(cct_, client_messenger),
  logger(NULL),
  recoverystate_perf(NULL),
  store(store_),
  log_client(cct, client_messenger, &mc->monmap, LogClient::NO_FLAGS),
  clog(log_client.create_channel()),
  whoami(id),
  dev_path(dev), journal_path(jdev),
  store_is_rotational(store->is_rotational()),
  trace_endpoint("0.0.0.0", 0, "osd"),
  asok_hook(NULL),
  m_osd_pg_epoch_max_lag_factor(cct->_conf.get_val<double>(
				  "osd_pg_epoch_max_lag_factor")),
  osd_compat(get_osd_compat_set()),
  osd_op_tp(cct, "OSD::osd_op_tp", "tp_osd_tp",
	    get_num_op_threads()),
  command_tp(cct, "OSD::command_tp", "tp_osd_cmd", 1),
  session_waiting_lock("OSD::session_waiting_lock"),
  osdmap_subscribe_lock("OSD::osdmap_subscribe_lock"),
  heartbeat_lock("OSD::heartbeat_lock"),
  heartbeat_stop(false),
  heartbeat_need_update(true),
  hb_front_client_messenger(hb_client_front),
  hb_back_client_messenger(hb_client_back),
  hb_front_server_messenger(hb_front_serverm),
  hb_back_server_messenger(hb_back_serverm),
  daily_loadavg(0.0),
  heartbeat_thread(this),
  heartbeat_dispatcher(this),
  op_tracker(cct, cct->_conf->osd_enable_op_tracker,
	     cct->_conf->osd_num_op_tracker_shard),
  test_ops_hook(NULL),
  op_queue(get_io_queue()),
  op_prio_cutoff(get_io_prio_cut()),
  op_shardedwq(
    this,
    cct->_conf->osd_op_thread_timeout,
    cct->_conf->osd_op_thread_suicide_timeout,
    &osd_op_tp),
  map_lock("OSD::map_lock"),
  last_pg_create_epoch(0),
  mon_report_lock("OSD::mon_report_lock"),
  boot_finisher(cct),
  up_thru_wanted(0),
  requested_full_first(0),
  requested_full_last(0),
  command_wq(
    this,
    cct->_conf->osd_command_thread_timeout,
    cct->_conf->osd_command_thread_suicide_timeout,
    &command_tp),
  service(this)
{

  // Export the GSSAPI client keytab path so libkrb5 can find it.
  if (!gss_ktfile_client.empty()) {
    // Assert we can export environment variable
    /*
        The default client keytab is used, if it is present and readable,
        to automatically obtain initial credentials for GSSAPI client
        applications. The principal name of the first entry in the client
        keytab is used by default when obtaining initial credentials.
        1. The KRB5_CLIENT_KTNAME environment variable.
        2. The default_client_keytab_name profile variable in [libdefaults].
        3. The hardcoded default, DEFCKTNAME.
    */
    const int32_t set_result(setenv("KRB5_CLIENT_KTNAME",
                                    gss_ktfile_client.c_str(), 1));
    ceph_assert(set_result == 0);
  }

  monc->set_messenger(client_messenger);
  // configure op-tracker thresholds from current config values
  op_tracker.set_complaint_and_threshold(cct->_conf->osd_op_complaint_time,
                                         cct->_conf->osd_op_log_threshold);
  op_tracker.set_history_size_and_duration(cct->_conf->osd_op_history_size,
                                           cct->_conf->osd_op_history_duration);
  op_tracker.set_history_slow_op_size_and_threshold(cct->_conf->osd_op_history_slow_op_size,
                                                    cct->_conf->osd_op_history_slow_op_threshold);
#ifdef WITH_BLKIN
  std::stringstream ss;
  ss << "osd." << whoami;
  trace_endpoint.copy_name(ss.str());
#endif

  // initialize shards
  num_shards = get_num_op_shards();
  for (uint32_t i = 0; i < num_shards; i++) {
    OSDShard *one_shard = new OSDShard(
      i,
      cct,
      this,
      cct->_conf->osd_op_pq_max_tokens_per_priority,
      cct->_conf->osd_op_pq_min_cost,
      op_queue);
    shards.push_back(one_shard);
  }
}
2362
2363 OSD::~OSD()
2364 {
2365 while (!shards.empty()) {
2366 delete shards.back();
2367 shards.pop_back();
2368 }
2369 delete class_handler;
2370 cct->get_perfcounters_collection()->remove(recoverystate_perf);
2371 cct->get_perfcounters_collection()->remove(logger);
2372 delete recoverystate_perf;
2373 delete logger;
2374 delete store;
2375 }
2376
2377 double OSD::get_tick_interval() const
2378 {
2379 // vary +/- 5% to avoid scrub scheduling livelocks
2380 constexpr auto delta = 0.05;
2381 return (OSD_TICK_INTERVAL *
2382 ceph::util::generate_random_number(1.0 - delta, 1.0 + delta));
2383 }
2384
2385 void cls_initialize(ClassHandler *ch);
2386
// Signal handler entry: only SIGINT/SIGTERM are expected; both trigger
// a clean shutdown of the daemon.
void OSD::handle_signal(int signum)
{
  ceph_assert(signum == SIGINT || signum == SIGTERM);
  derr << "*** Got signal " << sig_str(signum) << " ***" << dendl;
  shutdown();
}
2393
2394 int OSD::pre_init()
2395 {
2396 std::lock_guard lock(osd_lock);
2397 if (is_stopping())
2398 return 0;
2399
2400 if (store->test_mount_in_use()) {
2401 derr << "OSD::pre_init: object store '" << dev_path << "' is "
2402 << "currently in use. (Is ceph-osd already running?)" << dendl;
2403 return -EBUSY;
2404 }
2405
2406 cct->_conf.add_observer(this);
2407 return 0;
2408 }
2409
// Determine the NUMA node shared by the object store and both network
// interfaces, and pin all OSD threads to that node's CPUs. The
// automatic choice only applies when store, public and cluster network
// all agree; the osd_numa_node config option overrides it. Always
// returns 0 (failures are logged, not fatal).
int OSD::set_numa_affinity()
{
  // storage numa node
  int store_node = -1;
  store->get_numa_node(&store_node, nullptr, nullptr);
  if (store_node >= 0) {
    dout(1) << __func__ << " storage numa node " << store_node << dendl;
  }

  // check network numa node(s)
  int front_node = -1, back_node = -1;
  string front_iface = pick_iface(
    cct,
    client_messenger->get_myaddrs().front().get_sockaddr_storage());
  string back_iface = pick_iface(
    cct,
    cluster_messenger->get_myaddrs().front().get_sockaddr_storage());
  int r = get_iface_numa_node(front_iface, &front_node);
  if (r >= 0 && front_node >= 0) {
    dout(1) << __func__ << " public network " << front_iface << " numa node "
	    << front_node << dendl;
    r = get_iface_numa_node(back_iface, &back_node);
    if (r >= 0 && back_node >= 0) {
      dout(1) << __func__ << " cluster network " << back_iface << " numa node "
	      << back_node << dendl;
      if (front_node == back_node &&
	  front_node == store_node) {
	// all three agree -> candidate for automatic pinning
	dout(1) << " objectstore and network numa nodes all match" << dendl;
	if (g_conf().get_val<bool>("osd_numa_auto_affinity")) {
	  numa_node = front_node;
	}
      } else if (front_node != back_node) {
	dout(1) << __func__ << " public and cluster network numa nodes do not match"
		<< dendl;
      } else {
	dout(1) << __func__ << " objectstore and network numa nodes do not match"
		<< dendl;
      }
    } else if (back_node == -2) {
      // -2: the interface's ports span multiple numa nodes (per message below)
      dout(1) << __func__ << " cluster network " << back_iface
	      << " ports numa nodes do not match" << dendl;
    } else {
      derr << __func__ << " unable to identify cluster interface '" << back_iface
	   << "' numa node: " << cpp_strerror(r) << dendl;
    }
  } else if (front_node == -2) {
    dout(1) << __func__ << " public network " << front_iface
	    << " ports numa nodes do not match" << dendl;
  } else {
    derr << __func__ << " unable to identify public interface '" << front_iface
	 << "' numa node: " << cpp_strerror(r) << dendl;
  }
  if (int node = g_conf().get_val<int64_t>("osd_numa_node"); node >= 0) {
    // this takes precedence over the automagic logic above
    numa_node = node;
  }
  if (numa_node >= 0) {
    // resolve the node's CPU set and apply it to every thread
    int r = get_numa_node_cpu_set(numa_node, &numa_cpu_set_size, &numa_cpu_set);
    if (r < 0) {
      dout(1) << __func__ << " unable to determine numa node " << numa_node
	      << " CPUs" << dendl;
      numa_node = -1;
    } else {
      dout(1) << __func__ << " setting numa affinity to node " << numa_node
	      << " cpus "
	      << cpu_set_to_str_list(numa_cpu_set_size, &numa_cpu_set)
	      << dendl;
      r = set_cpu_affinity_all_threads(numa_cpu_set_size, &numa_cpu_set);
      if (r < 0) {
	r = -errno;
	derr << __func__ << " failed to set numa affinity: " << cpp_strerror(r)
	     << dendl;
	numa_node = -1;
      }
    }
  } else {
    dout(1) << __func__ << " not setting numa affinity" << dendl;
  }
  return 0;
}
2490
2491 // asok
2492
2493 class OSDSocketHook : public AdminSocketHook {
2494 OSD *osd;
2495 public:
2496 explicit OSDSocketHook(OSD *o) : osd(o) {}
2497 bool call(std::string_view admin_command, const cmdmap_t& cmdmap,
2498 std::string_view format, bufferlist& out) override {
2499 stringstream ss;
2500 bool r = true;
2501 try {
2502 r = osd->asok_command(admin_command, cmdmap, format, ss);
2503 } catch (const bad_cmd_get& e) {
2504 ss << e.what();
2505 r = true;
2506 }
2507 out.append(ss);
2508 return r;
2509 }
2510 };
2511
2512 std::set<int64_t> OSD::get_mapped_pools()
2513 {
2514 std::set<int64_t> pools;
2515 std::vector<spg_t> pgids;
2516 _get_pgids(&pgids);
2517 for (const auto &pgid : pgids) {
2518 pools.insert(pgid.pool());
2519 }
2520 return pools;
2521 }
2522
2523 bool OSD::asok_command(std::string_view admin_command, const cmdmap_t& cmdmap,
2524 std::string_view format, ostream& ss)
2525 {
2526 Formatter *f = Formatter::create(format, "json-pretty", "json-pretty");
2527 if (admin_command == "status") {
2528 f->open_object_section("status");
2529 f->dump_stream("cluster_fsid") << superblock.cluster_fsid;
2530 f->dump_stream("osd_fsid") << superblock.osd_fsid;
2531 f->dump_unsigned("whoami", superblock.whoami);
2532 f->dump_string("state", get_state_name(get_state()));
2533 f->dump_unsigned("oldest_map", superblock.oldest_map);
2534 f->dump_unsigned("newest_map", superblock.newest_map);
2535 f->dump_unsigned("num_pgs", num_pgs);
2536 f->close_section();
2537 } else if (admin_command == "flush_journal") {
2538 store->flush_journal();
2539 } else if (admin_command == "dump_ops_in_flight" ||
2540 admin_command == "ops" ||
2541 admin_command == "dump_blocked_ops" ||
2542 admin_command == "dump_historic_ops" ||
2543 admin_command == "dump_historic_ops_by_duration" ||
2544 admin_command == "dump_historic_slow_ops") {
2545
2546 const string error_str = "op_tracker tracking is not enabled now, so no ops are tracked currently, \
2547 even those get stuck. Please enable \"osd_enable_op_tracker\", and the tracker \
2548 will start to track new ops received afterwards.";
2549
2550 set<string> filters;
2551 vector<string> filter_str;
2552 if (cmd_getval(cct, cmdmap, "filterstr", filter_str)) {
2553 copy(filter_str.begin(), filter_str.end(),
2554 inserter(filters, filters.end()));
2555 }
2556
2557 if (admin_command == "dump_ops_in_flight" ||
2558 admin_command == "ops") {
2559 if (!op_tracker.dump_ops_in_flight(f, false, filters)) {
2560 ss << error_str;
2561 }
2562 }
2563 if (admin_command == "dump_blocked_ops") {
2564 if (!op_tracker.dump_ops_in_flight(f, true, filters)) {
2565 ss << error_str;
2566 }
2567 }
2568 if (admin_command == "dump_historic_ops") {
2569 if (!op_tracker.dump_historic_ops(f, false, filters)) {
2570 ss << error_str;
2571 }
2572 }
2573 if (admin_command == "dump_historic_ops_by_duration") {
2574 if (!op_tracker.dump_historic_ops(f, true, filters)) {
2575 ss << error_str;
2576 }
2577 }
2578 if (admin_command == "dump_historic_slow_ops") {
2579 if (!op_tracker.dump_historic_slow_ops(f, filters)) {
2580 ss << error_str;
2581 }
2582 }
2583 } else if (admin_command == "dump_op_pq_state") {
2584 f->open_object_section("pq");
2585 op_shardedwq.dump(f);
2586 f->close_section();
2587 } else if (admin_command == "dump_blacklist") {
2588 list<pair<entity_addr_t,utime_t> > bl;
2589 OSDMapRef curmap = service.get_osdmap();
2590
2591 f->open_array_section("blacklist");
2592 curmap->get_blacklist(&bl);
2593 for (list<pair<entity_addr_t,utime_t> >::iterator it = bl.begin();
2594 it != bl.end(); ++it) {
2595 f->open_object_section("entry");
2596 f->open_object_section("entity_addr_t");
2597 it->first.dump(f);
2598 f->close_section(); //entity_addr_t
2599 it->second.localtime(f->dump_stream("expire_time"));
2600 f->close_section(); //entry
2601 }
2602 f->close_section(); //blacklist
2603 } else if (admin_command == "dump_watchers") {
2604 list<obj_watch_item_t> watchers;
2605 // scan pg's
2606 vector<PGRef> pgs;
2607 _get_pgs(&pgs);
2608 for (auto& pg : pgs) {
2609 list<obj_watch_item_t> pg_watchers;
2610 pg->get_watchers(&pg_watchers);
2611 watchers.splice(watchers.end(), pg_watchers);
2612 }
2613
2614 f->open_array_section("watchers");
2615 for (list<obj_watch_item_t>::iterator it = watchers.begin();
2616 it != watchers.end(); ++it) {
2617
2618 f->open_object_section("watch");
2619
2620 f->dump_string("namespace", it->obj.nspace);
2621 f->dump_string("object", it->obj.oid.name);
2622
2623 f->open_object_section("entity_name");
2624 it->wi.name.dump(f);
2625 f->close_section(); //entity_name_t
2626
2627 f->dump_unsigned("cookie", it->wi.cookie);
2628 f->dump_unsigned("timeout", it->wi.timeout_seconds);
2629
2630 f->open_object_section("entity_addr_t");
2631 it->wi.addr.dump(f);
2632 f->close_section(); //entity_addr_t
2633
2634 f->close_section(); //watch
2635 }
2636
2637 f->close_section(); //watchers
2638 } else if (admin_command == "dump_recovery_reservations") {
2639 f->open_object_section("reservations");
2640 f->open_object_section("local_reservations");
2641 service.local_reserver.dump(f);
2642 f->close_section();
2643 f->open_object_section("remote_reservations");
2644 service.remote_reserver.dump(f);
2645 f->close_section();
2646 f->close_section();
2647 } else if (admin_command == "dump_scrub_reservations") {
2648 f->open_object_section("scrub_reservations");
2649 service.dump_scrub_reservations(f);
2650 f->close_section();
2651 } else if (admin_command == "get_latest_osdmap") {
2652 get_latest_osdmap();
2653 } else if (admin_command == "heap") {
2654 auto result = ceph::osd_cmds::heap(*cct, cmdmap, *f, ss);
2655
2656 // Note: Failed heap profile commands won't necessarily trigger an error:
2657 f->open_object_section("result");
2658 f->dump_string("error", cpp_strerror(result));
2659 f->dump_bool("success", result >= 0);
2660 f->close_section();
2661 } else if (admin_command == "set_heap_property") {
2662 string property;
2663 int64_t value = 0;
2664 string error;
2665 bool success = false;
2666 if (!cmd_getval(cct, cmdmap, "property", property)) {
2667 error = "unable to get property";
2668 success = false;
2669 } else if (!cmd_getval(cct, cmdmap, "value", value)) {
2670 error = "unable to get value";
2671 success = false;
2672 } else if (value < 0) {
2673 error = "negative value not allowed";
2674 success = false;
2675 } else if (!ceph_heap_set_numeric_property(property.c_str(), (size_t)value)) {
2676 error = "invalid property";
2677 success = false;
2678 } else {
2679 success = true;
2680 }
2681 f->open_object_section("result");
2682 f->dump_string("error", error);
2683 f->dump_bool("success", success);
2684 f->close_section();
2685 } else if (admin_command == "get_heap_property") {
2686 string property;
2687 size_t value = 0;
2688 string error;
2689 bool success = false;
2690 if (!cmd_getval(cct, cmdmap, "property", property)) {
2691 error = "unable to get property";
2692 success = false;
2693 } else if (!ceph_heap_get_numeric_property(property.c_str(), &value)) {
2694 error = "invalid property";
2695 success = false;
2696 } else {
2697 success = true;
2698 }
2699 f->open_object_section("result");
2700 f->dump_string("error", error);
2701 f->dump_bool("success", success);
2702 f->dump_int("value", value);
2703 f->close_section();
2704 } else if (admin_command == "dump_objectstore_kv_stats") {
2705 store->get_db_statistics(f);
2706 } else if (admin_command == "dump_scrubs") {
2707 service.dumps_scrub(f);
2708 } else if (admin_command == "calc_objectstore_db_histogram") {
2709 store->generate_db_histogram(f);
2710 } else if (admin_command == "flush_store_cache") {
2711 store->flush_cache(&ss);
2712 } else if (admin_command == "dump_pgstate_history") {
2713 f->open_object_section("pgstate_history");
2714 vector<PGRef> pgs;
2715 _get_pgs(&pgs);
2716 for (auto& pg : pgs) {
2717 f->dump_stream("pg") << pg->pg_id;
2718 pg->dump_pgstate_history(f);
2719 }
2720 f->close_section();
2721 } else if (admin_command == "compact") {
2722 dout(1) << "triggering manual compaction" << dendl;
2723 auto start = ceph::coarse_mono_clock::now();
2724 store->compact();
2725 auto end = ceph::coarse_mono_clock::now();
2726 double duration = std::chrono::duration<double>(end-start).count();
2727 dout(1) << "finished manual compaction in "
2728 << duration
2729 << " seconds" << dendl;
2730 f->open_object_section("compact_result");
2731 f->dump_float("elapsed_time", duration);
2732 f->close_section();
2733 } else if (admin_command == "get_mapped_pools") {
2734 f->open_array_section("mapped_pools");
2735 set<int64_t> poollist = get_mapped_pools();
2736 for (auto pool : poollist) {
2737 f->dump_int("pool_id", pool);
2738 }
2739 f->close_section();
2740 } else if (admin_command == "smart") {
2741 string devid;
2742 cmd_getval(cct, cmdmap, "devid", devid);
2743 probe_smart(devid, ss);
2744 } else if (admin_command == "list_devices") {
2745 set<string> devnames;
2746 store->get_devices(&devnames);
2747 f->open_object_section("list_devices");
2748 for (auto dev : devnames) {
2749 if (dev.find("dm-") == 0) {
2750 continue;
2751 }
2752 f->dump_string("device", "/dev/" + dev);
2753 }
2754 f->close_section();
2755 } else if (admin_command == "send_beacon") {
2756 if (is_active()) {
2757 send_beacon(ceph::coarse_mono_clock::now());
2758 }
2759 } else if (admin_command == "dump_osd_network") {
2760 int64_t value = 0;
2761 if (!(cmd_getval(cct, cmdmap, "value", value))) {
2762 // Convert milliseconds to microseconds
2763 value = static_cast<int64_t>(g_conf().get_val<double>("mon_warn_on_slow_ping_time")) * 1000;
2764 if (value == 0) {
2765 double ratio = g_conf().get_val<double>("mon_warn_on_slow_ping_ratio");
2766 value = g_conf().get_val<int64_t>("osd_heartbeat_grace");
2767 value *= 1000000 * ratio; // Seconds of grace to microseconds at ratio
2768 }
2769 } else {
2770 // Convert user input to microseconds
2771 value *= 1000;
2772 }
2773 if (value < 0) value = 0;
2774
2775 struct osd_ping_time_t {
2776 uint32_t pingtime;
2777 int to;
2778 bool back;
2779 std::array<uint32_t,3> times;
2780 std::array<uint32_t,3> min;
2781 std::array<uint32_t,3> max;
2782 uint32_t last;
2783 uint32_t last_update;
2784
2785 bool operator<(const osd_ping_time_t& rhs) const {
2786 if (pingtime < rhs.pingtime)
2787 return true;
2788 if (pingtime > rhs.pingtime)
2789 return false;
2790 if (to < rhs.to)
2791 return true;
2792 if (to > rhs.to)
2793 return false;
2794 return back;
2795 }
2796 };
2797
2798 set<osd_ping_time_t> sorted;
2799 // Get pingtimes under lock and not on the stack
2800 map<int, osd_stat_t::Interfaces> *pingtimes = new map<int, osd_stat_t::Interfaces>;
2801 service.get_hb_pingtime(pingtimes);
2802 for (auto j : *pingtimes) {
2803 if (j.second.last_update == 0)
2804 continue;
2805 osd_ping_time_t item;
2806 item.pingtime = std::max(j.second.back_pingtime[0], j.second.back_pingtime[1]);
2807 item.pingtime = std::max(item.pingtime, j.second.back_pingtime[2]);
2808 if (item.pingtime >= value) {
2809 item.to = j.first;
2810 item.times[0] = j.second.back_pingtime[0];
2811 item.times[1] = j.second.back_pingtime[1];
2812 item.times[2] = j.second.back_pingtime[2];
2813 item.min[0] = j.second.back_min[0];
2814 item.min[1] = j.second.back_min[1];
2815 item.min[2] = j.second.back_min[2];
2816 item.max[0] = j.second.back_max[0];
2817 item.max[1] = j.second.back_max[1];
2818 item.max[2] = j.second.back_max[2];
2819 item.last = j.second.back_last;
2820 item.back = true;
2821 item.last_update = j.second.last_update;
2822 sorted.emplace(item);
2823 }
2824 if (j.second.front_last == 0)
2825 continue;
2826 item.pingtime = std::max(j.second.front_pingtime[0], j.second.front_pingtime[1]);
2827 item.pingtime = std::max(item.pingtime, j.second.front_pingtime[2]);
2828 if (item.pingtime >= value) {
2829 item.to = j.first;
2830 item.times[0] = j.second.front_pingtime[0];
2831 item.times[1] = j.second.front_pingtime[1];
2832 item.times[2] = j.second.front_pingtime[2];
2833 item.min[0] = j.second.front_min[0];
2834 item.min[1] = j.second.front_min[1];
2835 item.min[2] = j.second.front_min[2];
2836 item.max[0] = j.second.front_max[0];
2837 item.max[1] = j.second.front_max[1];
2838 item.max[2] = j.second.front_max[2];
2839 item.last = j.second.front_last;
2840 item.last_update = j.second.last_update;
2841 item.back = false;
2842 sorted.emplace(item);
2843 }
2844 }
2845 delete pingtimes;
2846 //
2847 // Network ping times (1min 5min 15min)
2848 f->open_object_section("network_ping_times");
2849 f->dump_int("threshold", value / 1000);
2850 f->open_array_section("entries");
2851 for (auto &sitem : boost::adaptors::reverse(sorted)) {
2852 ceph_assert(sitem.pingtime >= value);
2853 f->open_object_section("entry");
2854
2855 const time_t lu(sitem.last_update);
2856 char buffer[26];
2857 string lustr(ctime_r(&lu, buffer));
2858 lustr.pop_back(); // Remove trailing \n
2859 auto stale = cct->_conf.get_val<int64_t>("osd_heartbeat_stale");
2860 f->dump_string("last update", lustr);
2861 f->dump_bool("stale", ceph_clock_now().sec() - sitem.last_update > stale);
2862 f->dump_int("from osd", whoami);
2863 f->dump_int("to osd", sitem.to);
2864 f->dump_string("interface", (sitem.back ? "back" : "front"));
2865 f->open_object_section("average");
2866 f->dump_format_unquoted("1min", "%s", fixed_u_to_string(sitem.times[0],3).c_str());
2867 f->dump_format_unquoted("5min", "%s", fixed_u_to_string(sitem.times[1],3).c_str());
2868 f->dump_format_unquoted("15min", "%s", fixed_u_to_string(sitem.times[2],3).c_str());
2869 f->close_section(); // average
2870 f->open_object_section("min");
2871 f->dump_format_unquoted("1min", "%s", fixed_u_to_string(sitem.max[0],3).c_str());
2872 f->dump_format_unquoted("5min", "%s", fixed_u_to_string(sitem.max[1],3).c_str());
2873 f->dump_format_unquoted("15min", "%s", fixed_u_to_string(sitem.max[2],3).c_str());
2874 f->close_section(); // min
2875 f->open_object_section("max");
2876 f->dump_format_unquoted("1min", "%s", fixed_u_to_string(sitem.max[0],3).c_str());
2877 f->dump_format_unquoted("5min", "%s", fixed_u_to_string(sitem.max[1],3).c_str());
2878 f->dump_format_unquoted("15min", "%s", fixed_u_to_string(sitem.max[2],3).c_str());
2879 f->close_section(); // max
2880 f->dump_format_unquoted("last", "%s", fixed_u_to_string(sitem.last,3).c_str());
2881 f->close_section(); // entry
2882 }
2883 f->close_section(); // entries
2884 f->close_section(); // network_ping_times
2885 } else {
2886 ceph_abort_msg("broken asok registration");
2887 }
2888 f->flush(ss);
2889 delete f;
2890 return true;
2891 }
2892
// Admin-socket hook for test/debug operations ("setomapval", error
// injection, etc.); forwards each command to test_ops() and appends the
// textual result (or a bad_cmd_get error) to the output buffer.
class TestOpsSocketHook : public AdminSocketHook {
  OSDService *service;
  ObjectStore *store;
public:
  TestOpsSocketHook(OSDService *s, ObjectStore *st) : service(s), store(st) {}
  bool call(std::string_view command, const cmdmap_t& cmdmap,
	    std::string_view format, bufferlist& out) override {
    stringstream ss;
    try {
      test_ops(service, store, command, cmdmap, ss);
    } catch (const bad_cmd_get& e) {
      // report argument errors as command output rather than failing
      ss << e.what();
    }
    out.append(ss);
    return true;
  }
  // implemented further down in this file
  void test_ops(OSDService *service, ObjectStore *store,
		std::string_view command, const cmdmap_t& cmdmap, ostream &ss);

};
2913
// Timer callback driving OSD::tick() (scheduled on tick_timer, which
// runs under osd_lock).
class OSD::C_Tick : public Context {
  OSD *osd;
public:
  explicit C_Tick(OSD *o) : osd(o) {}
  void finish(int r) override {
    osd->tick();
  }
};
2922
// Timer callback driving OSD::tick_without_osd_lock() (scheduled on
// tick_timer_without_osd_lock, which does not take osd_lock).
class OSD::C_Tick_WithoutOSDLock : public Context {
  OSD *osd;
public:
  explicit C_Tick_WithoutOSDLock(OSD *o) : osd(o) {}
  void finish(int r) override {
    osd->tick_without_osd_lock();
  }
};
2931
// Start or stop the FUSE view of the object store at $osd_data/fuse,
// reconciling the running state with the osd_objectstore_fuse option.
// With stop==true the mount is always torn down. Returns 0 on success
// or a negative errno; a no-op when built without libfuse.
int OSD::enable_disable_fuse(bool stop)
{
#ifdef HAVE_LIBFUSE
  int r;
  string mntpath = cct->_conf->osd_data + "/fuse";
  if (fuse_store && (stop || !cct->_conf->osd_objectstore_fuse)) {
    // currently mounted but should not be: stop and remove mountpoint
    dout(1) << __func__ << " disabling" << dendl;
    fuse_store->stop();
    delete fuse_store;
    fuse_store = NULL;
    r = ::rmdir(mntpath.c_str());
    if (r < 0) {
      r = -errno;
      derr << __func__ << " failed to rmdir " << mntpath << ": "
	   << cpp_strerror(r) << dendl;
      return r;
    }
    return 0;
  }
  if (!fuse_store && cct->_conf->osd_objectstore_fuse) {
    // not mounted but should be: create mountpoint and start FuseStore
    dout(1) << __func__ << " enabling" << dendl;
    r = ::mkdir(mntpath.c_str(), 0700);
    if (r < 0)
      r = -errno;
    if (r < 0 && r != -EEXIST) {  // pre-existing mountpoint dir is fine
      derr << __func__ << " unable to create " << mntpath << ": "
	   << cpp_strerror(r) << dendl;
      return r;
    }
    fuse_store = new FuseStore(store, mntpath);
    r = fuse_store->start();
    if (r < 0) {
      derr << __func__ << " unable to start fuse: " << cpp_strerror(r) << dendl;
      delete fuse_store;
      fuse_store = NULL;
      return r;
    }
  }
#endif  // HAVE_LIBFUSE
  return 0;
}
2973
2974 int OSD::get_num_op_shards()
2975 {
2976 if (cct->_conf->osd_op_num_shards)
2977 return cct->_conf->osd_op_num_shards;
2978 if (store_is_rotational)
2979 return cct->_conf->osd_op_num_shards_hdd;
2980 else
2981 return cct->_conf->osd_op_num_shards_ssd;
2982 }
2983
2984 int OSD::get_num_op_threads()
2985 {
2986 if (cct->_conf->osd_op_num_threads_per_shard)
2987 return get_num_op_shards() * cct->_conf->osd_op_num_threads_per_shard;
2988 if (store_is_rotational)
2989 return get_num_op_shards() * cct->_conf->osd_op_num_threads_per_shard_hdd;
2990 else
2991 return get_num_op_shards() * cct->_conf->osd_op_num_threads_per_shard_ssd;
2992 }
2993
2994 float OSD::get_osd_recovery_sleep()
2995 {
2996 if (cct->_conf->osd_recovery_sleep)
2997 return cct->_conf->osd_recovery_sleep;
2998 if (!store_is_rotational && !journal_is_rotational)
2999 return cct->_conf->osd_recovery_sleep_ssd;
3000 else if (store_is_rotational && !journal_is_rotational)
3001 return cct->_conf.get_val<double>("osd_recovery_sleep_hybrid");
3002 else
3003 return cct->_conf->osd_recovery_sleep_hdd;
3004 }
3005
3006 float OSD::get_osd_delete_sleep()
3007 {
3008 float osd_delete_sleep = cct->_conf.get_val<double>("osd_delete_sleep");
3009 if (osd_delete_sleep > 0)
3010 return osd_delete_sleep;
3011 if (!store_is_rotational && !journal_is_rotational)
3012 return cct->_conf.get_val<double>("osd_delete_sleep_ssd");
3013 if (store_is_rotational && !journal_is_rotational)
3014 return cct->_conf.get_val<double>("osd_delete_sleep_hybrid");
3015 return cct->_conf.get_val<double>("osd_delete_sleep_hdd");
3016 }
3017
3018 float OSD::get_osd_snap_trim_sleep()
3019 {
3020 float osd_snap_trim_sleep = cct->_conf.get_val<double>("osd_snap_trim_sleep");
3021 if (osd_snap_trim_sleep > 0)
3022 return osd_snap_trim_sleep;
3023 if (!store_is_rotational && !journal_is_rotational)
3024 return cct->_conf.get_val<double>("osd_snap_trim_sleep_ssd");
3025 if (store_is_rotational && !journal_is_rotational)
3026 return cct->_conf.get_val<double>("osd_snap_trim_sleep_hybrid");
3027 return cct->_conf.get_val<double>("osd_snap_trim_sleep_hdd");
3028 }
3029
// Bring the OSD daemon up: mount the object store, validate the
// superblock/compat features, load PGs, wire up messengers and the
// mon/mgr clients, authenticate, and kick off the boot process.
// Returns 0 on success (or if the daemon is already stopping); on
// failure unwinds via the `out` label, which unmounts and deletes the
// store.  Note that unrecoverable post-unlock failures (auth, crush
// updates) call exit(1) instead of returning.
int OSD::init()
{
  CompatSet initial, diff;
  std::lock_guard lock(osd_lock);
  if (is_stopping())
    return 0;

  tick_timer.init();
  tick_timer_without_osd_lock.init();
  service.recovery_request_timer.init();
  service.sleep_timer.init();

  boot_finisher.start();

  {
    // persisted min-release gate; empty/unparseable meta yields 0 via atoi
    string val;
    store->read_meta("require_osd_release", &val);
    last_require_osd_release = atoi(val.c_str());
  }

  // mount.
  dout(2) << "init " << dev_path
	  << " (looks like " << (store_is_rotational ? "hdd" : "ssd") << ")"
	  << dendl;
  dout(2) << "journal " << journal_path << dendl;
  ceph_assert(store); // call pre_init() first!

  // shard count must be set before mount so store caches are sized correctly
  store->set_cache_shards(get_num_op_shards());

  int r = store->mount();
  if (r < 0) {
    derr << "OSD:init: unable to mount object store" << dendl;
    return r;
  }
  // journal rotational status is only known after mount
  journal_is_rotational = store->is_journal_rotational();
  dout(2) << "journal looks like " << (journal_is_rotational ? "hdd" : "ssd")
	  << dendl;

  enable_disable_fuse(false);

  dout(2) << "boot" << dendl;

  service.meta_ch = store->open_collection(coll_t::meta());

  // initialize the daily loadavg with current 15min loadavg
  double loadavgs[3];
  if (getloadavg(loadavgs, 3) == 3) {
    daily_loadavg = loadavgs[2];
  } else {
    derr << "OSD::init() : couldn't read loadavgs\n" << dendl;
    daily_loadavg = 1.0;
  }

  int rotating_auth_attempts = 0;
  auto rotating_auth_timeout =
    g_conf().get_val<int64_t>("rotating_keys_bootstrap_timeout");

  // sanity check long object name handling
  {
    // build a maximal-length key to probe the backend's filename limits
    hobject_t l;
    l.oid.name = string(cct->_conf->osd_max_object_name_len, 'n');
    l.set_key(string(cct->_conf->osd_max_object_name_len, 'k'));
    l.nspace = string(cct->_conf->osd_max_object_namespace_len, 's');
    r = store->validate_hobject_key(l);
    if (r < 0) {
      derr << "backend (" << store->get_type() << ") is unable to support max "
	   << "object name[space] len" << dendl;
      derr << " osd max object name len = "
	   << cct->_conf->osd_max_object_name_len << dendl;
      derr << " osd max object namespace len = "
	   << cct->_conf->osd_max_object_namespace_len << dendl;
      derr << cpp_strerror(r) << dendl;
      if (cct->_conf->osd_check_max_object_name_len_on_startup) {
	goto out;
      }
      derr << "osd_check_max_object_name_len_on_startup = false, starting anyway"
	   << dendl;
    } else {
      dout(20) << "configured osd_max_object_name[space]_len looks ok" << dendl;
    }
  }

  // read superblock
  r = read_superblock();
  if (r < 0) {
    derr << "OSD::init() : unable to read osd superblock" << dendl;
    r = -EINVAL;
    goto out;
  }

  // refuse to run against an on-disk format newer than this binary supports
  if (osd_compat.compare(superblock.compat_features) < 0) {
    derr << "The disk uses features unsupported by the executable." << dendl;
    derr << " ondisk features " << superblock.compat_features << dendl;
    derr << " daemon features " << osd_compat << dendl;

    if (osd_compat.writeable(superblock.compat_features)) {
      CompatSet diff = osd_compat.unsupported(superblock.compat_features);
      derr << "it is still writeable, though. Missing features: " << diff << dendl;
      r = -EOPNOTSUPP;
      goto out;
    }
    else {
      CompatSet diff = osd_compat.unsupported(superblock.compat_features);
      derr << "Cannot write to disk! Missing features: " << diff << dendl;
      r = -EOPNOTSUPP;
      goto out;
    }
  }

  // the superblock must belong to this OSD id
  assert_warn(whoami == superblock.whoami);
  if (whoami != superblock.whoami) {
    derr << "OSD::init: superblock says osd"
	 << superblock.whoami << " but I am osd." << whoami << dendl;
    r = -EINVAL;
    goto out;
  }

  // load up "current" osdmap
  assert_warn(!osdmap);
  if (osdmap) {
    derr << "OSD::init: unable to read current osdmap" << dendl;
    r = -EINVAL;
    goto out;
  }
  osdmap = get_map(superblock.current_epoch);

  // make sure we don't have legacy pgs deleting
  {
    vector<coll_t> ls;
    int r = store->list_collections(ls);
    ceph_assert(r >= 0);
    for (auto c : ls) {
      spg_t pgid;
      if (c.is_pg(&pgid) &&
	  !osdmap->have_pg_pool(pgid.pool())) {
	// a PG collection for a deleted pool with no saved pg_pool_t means a
	// pre-mimic deletion was interrupted; we cannot proceed safely
	ghobject_t oid = make_final_pool_info_oid(pgid.pool());
	if (!store->exists(service.meta_ch, oid)) {
	  derr << __func__ << " missing pg_pool_t for deleted pool "
	       << pgid.pool() << " for pg " << pgid
	       << "; please downgrade to luminous and allow "
	       << "pg deletion to complete before upgrading" << dendl;
	  ceph_abort();
	}
      }
    }
  }

  initial = get_osd_initial_compat_set();
  diff = superblock.compat_features.unsupported(initial);
  if (superblock.compat_features.merge(initial)) {
    // We need to persist the new compat_set before we
    // do anything else
    dout(5) << "Upgrading superblock adding: " << diff << dendl;
    ObjectStore::Transaction t;
    write_superblock(t);
    r = store->queue_transaction(service.meta_ch, std::move(t));
    if (r < 0)
      goto out;
  }

  // make sure snap mapper object exists
  if (!store->exists(service.meta_ch, OSD::make_snapmapper_oid())) {
    dout(10) << "init creating/touching snapmapper object" << dendl;
    ObjectStore::Transaction t;
    t.touch(coll_t::meta(), OSD::make_snapmapper_oid());
    r = store->queue_transaction(service.meta_ch, std::move(t));
    if (r < 0)
      goto out;
  }

  // rados class (cls_*) plugin infrastructure
  class_handler = new ClassHandler(cct);
  cls_initialize(class_handler);

  if (cct->_conf->osd_open_classes_on_start) {
    int r = class_handler->open_all_classes();
    if (r)
      dout(1) << "warning: got an error loading one or more classes: " << cpp_strerror(r) << dendl;
  }

  check_osdmap_features();

  create_recoverystate_perf();

  {
    epoch_t bind_epoch = osdmap->get_epoch();
    service.set_epochs(NULL, NULL, &bind_epoch);
  }

  clear_temp_objects();

  // initialize osdmap references in sharded wq
  for (auto& shard : shards) {
    std::lock_guard l(shard->osdmap_lock);
    shard->shard_osdmap = osdmap;
  }

  // load up pgs (as they previously existed)
  load_pgs();

  dout(2) << "superblock: I am osd." << superblock.whoami << dendl;
  dout(0) << "using " << op_queue << " op queue with priority op cut off at " <<
    op_prio_cutoff << "." << dendl;

  create_logger();

  // prime osd stats
  {
    struct store_statfs_t stbuf;
    osd_alert_list_t alerts;
    int r = store->statfs(&stbuf, &alerts);
    ceph_assert(r == 0);
    service.set_statfs(stbuf, alerts);
  }

  // client_messenger auth_client is already set up by monc.
  for (auto m : { cluster_messenger,
	objecter_messenger,
	hb_front_client_messenger,
	hb_back_client_messenger,
	hb_front_server_messenger,
	hb_back_server_messenger } ) {
    m->set_auth_client(monc);
  }
  for (auto m : { client_messenger,
	cluster_messenger,
	hb_front_server_messenger,
	hb_back_server_messenger }) {
    m->set_auth_server(monc);
  }
  monc->set_handle_authentication_dispatcher(this);

  monc->set_want_keys(CEPH_ENTITY_TYPE_MON | CEPH_ENTITY_TYPE_OSD
		      | CEPH_ENTITY_TYPE_MGR);
  r = monc->init();
  if (r < 0)
    goto out;

  // hook mgr stat/perf-query callbacks before mgrc starts
  mgrc.set_pgstats_cb([this](){ return collect_pg_stats(); });
  mgrc.set_perf_metric_query_cb(
    [this](const std::map<OSDPerfMetricQuery, OSDPerfMetricLimits> &queries) {
      set_perf_queries(queries);
    },
    [this](std::map<OSDPerfMetricQuery, OSDPerfMetricReport> *reports) {
      get_perf_reports(reports);
    });
  mgrc.init();

  // tell monc about log_client so it will know about mon session resets
  monc->set_log_client(&log_client);
  update_log_config();

  // i'm ready!
  client_messenger->add_dispatcher_tail(&mgrc);
  client_messenger->add_dispatcher_tail(this);
  cluster_messenger->add_dispatcher_head(this);

  hb_front_client_messenger->add_dispatcher_head(&heartbeat_dispatcher);
  hb_back_client_messenger->add_dispatcher_head(&heartbeat_dispatcher);
  hb_front_server_messenger->add_dispatcher_head(&heartbeat_dispatcher);
  hb_back_server_messenger->add_dispatcher_head(&heartbeat_dispatcher);

  objecter_messenger->add_dispatcher_head(service.objecter);

  service.init();
  service.publish_map(osdmap);
  service.publish_superblock(superblock);
  service.max_oldest_map = superblock.oldest_map;

  // prime split/merge state for loaded PGs against the current map
  for (auto& shard : shards) {
    // put PGs in a temporary set because we may modify pg_slots
    // unordered_map below.
    set<PGRef> pgs;
    for (auto& i : shard->pg_slots) {
      PGRef pg = i.second->pg;
      if (!pg) {
	continue;
      }
      pgs.insert(pg);
    }
    for (auto pg : pgs) {
      pg->lock();
      set<pair<spg_t,epoch_t>> new_children;
      set<pair<spg_t,epoch_t>> merge_pgs;
      service.identify_splits_and_merges(pg->get_osdmap(), osdmap, pg->pg_id,
					 &new_children, &merge_pgs);
      if (!new_children.empty()) {
	for (auto shard : shards) {
	  shard->prime_splits(osdmap, &new_children);
	}
	// prime_splits consumes the entries it handles
	assert(new_children.empty());
      }
      if (!merge_pgs.empty()) {
	for (auto shard : shards) {
	  shard->prime_merges(osdmap, &merge_pgs);
	}
	assert(merge_pgs.empty());
      }
      pg->unlock();
    }
  }

  osd_op_tp.start();
  command_tp.start();

  // start the heartbeat
  heartbeat_thread.create("osd_srv_heartbt");

  // tick
  tick_timer.add_event_after(get_tick_interval(),
			     new C_Tick(this));
  {
    std::lock_guard l(tick_timer_lock);
    tick_timer_without_osd_lock.add_event_after(get_tick_interval(),
						new C_Tick_WithoutOSDLock(this));
  }

  // drop osd_lock while blocking on the monitor for auth/crush updates
  osd_lock.Unlock();

  r = monc->authenticate();
  if (r < 0) {
    derr << __func__ << " authentication failed: " << cpp_strerror(r)
	 << dendl;
    exit(1);
  }

  while (monc->wait_auth_rotating(rotating_auth_timeout) < 0) {
    derr << "unable to obtain rotating service keys; retrying" << dendl;
    ++rotating_auth_attempts;
    if (rotating_auth_attempts > g_conf()->max_rotating_auth_attempts) {
      derr << __func__ << " wait_auth_rotating timed out" << dendl;
      exit(1);
    }
  }

  r = update_crush_device_class();
  if (r < 0) {
    derr << __func__ << " unable to update_crush_device_class: "
	 << cpp_strerror(r) << dendl;
    exit(1);
  }

  r = update_crush_location();
  if (r < 0) {
    derr << __func__ << " unable to update_crush_location: "
	 << cpp_strerror(r) << dendl;
    exit(1);
  }

  osd_lock.Lock();
  // we may have begun shutting down while the lock was dropped
  if (is_stopping())
    return 0;

  // start objecter *after* we have authenticated, so that we don't ignore
  // the OSDMaps it requests.
  service.final_init();

  check_config();

  dout(10) << "ensuring pgs have consumed prior maps" << dendl;
  consume_map();

  dout(0) << "done with init, starting boot process" << dendl;

  // subscribe to any pg creations
  monc->sub_want("osd_pg_creates", last_pg_create_epoch, 0);

  // MgrClient needs this (it doesn't have MonClient reference itself)
  monc->sub_want("mgrmap", 0, 0);

  // we don't need to ask for an osdmap here; objecter will
  //monc->sub_want("osdmap", osdmap->get_epoch(), CEPH_SUBSCRIBE_ONETIME);

  monc->renew_subs();

  start_boot();

  return 0;

out:
  // error unwind: drop fuse mount, unmount and release the object store
  enable_disable_fuse(true);
  store->umount();
  delete store;
  store = NULL;
  return r;
}
3415
3416 void OSD::final_init()
3417 {
3418 AdminSocket *admin_socket = cct->get_admin_socket();
3419 asok_hook = new OSDSocketHook(this);
3420 int r = admin_socket->register_command("status", "status", asok_hook,
3421 "high-level status of OSD");
3422 ceph_assert(r == 0);
3423 r = admin_socket->register_command("flush_journal", "flush_journal",
3424 asok_hook,
3425 "flush the journal to permanent store");
3426 ceph_assert(r == 0);
3427 r = admin_socket->register_command("dump_ops_in_flight",
3428 "dump_ops_in_flight " \
3429 "name=filterstr,type=CephString,n=N,req=false",
3430 asok_hook,
3431 "show the ops currently in flight");
3432 ceph_assert(r == 0);
3433 r = admin_socket->register_command("ops",
3434 "ops " \
3435 "name=filterstr,type=CephString,n=N,req=false",
3436 asok_hook,
3437 "show the ops currently in flight");
3438 ceph_assert(r == 0);
3439 r = admin_socket->register_command("dump_blocked_ops",
3440 "dump_blocked_ops " \
3441 "name=filterstr,type=CephString,n=N,req=false",
3442 asok_hook,
3443 "show the blocked ops currently in flight");
3444 ceph_assert(r == 0);
3445 r = admin_socket->register_command("dump_historic_ops",
3446 "dump_historic_ops " \
3447 "name=filterstr,type=CephString,n=N,req=false",
3448 asok_hook,
3449 "show recent ops");
3450 ceph_assert(r == 0);
3451 r = admin_socket->register_command("dump_historic_slow_ops",
3452 "dump_historic_slow_ops " \
3453 "name=filterstr,type=CephString,n=N,req=false",
3454 asok_hook,
3455 "show slowest recent ops");
3456 ceph_assert(r == 0);
3457 r = admin_socket->register_command("dump_historic_ops_by_duration",
3458 "dump_historic_ops_by_duration " \
3459 "name=filterstr,type=CephString,n=N,req=false",
3460 asok_hook,
3461 "show slowest recent ops, sorted by duration");
3462 ceph_assert(r == 0);
3463 r = admin_socket->register_command("dump_op_pq_state", "dump_op_pq_state",
3464 asok_hook,
3465 "dump op priority queue state");
3466 ceph_assert(r == 0);
3467 r = admin_socket->register_command("dump_blacklist", "dump_blacklist",
3468 asok_hook,
3469 "dump blacklisted clients and times");
3470 ceph_assert(r == 0);
3471 r = admin_socket->register_command("dump_watchers", "dump_watchers",
3472 asok_hook,
3473 "show clients which have active watches,"
3474 " and on which objects");
3475 ceph_assert(r == 0);
3476 r = admin_socket->register_command("dump_recovery_reservations", "dump_recovery_reservations",
3477 asok_hook,
3478 "show recovery reservations");
3479 ceph_assert(r == 0);
3480 r = admin_socket->register_command("dump_scrub_reservations", "dump_scrub_reservations",
3481 asok_hook,
3482 "show scrub reservations");
3483 ceph_assert(r == 0);
3484 r = admin_socket->register_command("get_latest_osdmap", "get_latest_osdmap",
3485 asok_hook,
3486 "force osd to update the latest map from "
3487 "the mon");
3488 ceph_assert(r == 0);
3489
3490 r = admin_socket->register_command( "heap",
3491 "heap " \
3492 "name=heapcmd,type=CephString " \
3493 "name=value,type=CephString,req=false",
3494 asok_hook,
3495 "show heap usage info (available only if "
3496 "compiled with tcmalloc)");
3497 ceph_assert(r == 0);
3498
3499 r = admin_socket->register_command("set_heap_property",
3500 "set_heap_property " \
3501 "name=property,type=CephString " \
3502 "name=value,type=CephInt",
3503 asok_hook,
3504 "update malloc extension heap property");
3505 ceph_assert(r == 0);
3506
3507 r = admin_socket->register_command("get_heap_property",
3508 "get_heap_property " \
3509 "name=property,type=CephString",
3510 asok_hook,
3511 "get malloc extension heap property");
3512 ceph_assert(r == 0);
3513
3514 r = admin_socket->register_command("dump_objectstore_kv_stats",
3515 "dump_objectstore_kv_stats",
3516 asok_hook,
3517 "print statistics of kvdb which used by bluestore");
3518 ceph_assert(r == 0);
3519
3520 r = admin_socket->register_command("dump_scrubs",
3521 "dump_scrubs",
3522 asok_hook,
3523 "print scheduled scrubs");
3524 ceph_assert(r == 0);
3525
3526 r = admin_socket->register_command("calc_objectstore_db_histogram",
3527 "calc_objectstore_db_histogram",
3528 asok_hook,
3529 "Generate key value histogram of kvdb(rocksdb) which used by bluestore");
3530 ceph_assert(r == 0);
3531
3532 r = admin_socket->register_command("flush_store_cache",
3533 "flush_store_cache",
3534 asok_hook,
3535 "Flush bluestore internal cache");
3536 ceph_assert(r == 0);
3537 r = admin_socket->register_command("dump_pgstate_history", "dump_pgstate_history",
3538 asok_hook,
3539 "show recent state history");
3540 ceph_assert(r == 0);
3541
3542 r = admin_socket->register_command("compact", "compact",
3543 asok_hook,
3544 "Commpact object store's omap."
3545 " WARNING: Compaction probably slows your requests");
3546 ceph_assert(r == 0);
3547
3548 r = admin_socket->register_command("get_mapped_pools", "get_mapped_pools",
3549 asok_hook,
3550 "dump pools whose PG(s) are mapped to this OSD.");
3551
3552 ceph_assert(r == 0);
3553
3554 r = admin_socket->register_command("smart", "smart name=devid,type=CephString,req=False",
3555 asok_hook,
3556 "probe OSD devices for SMART data.");
3557
3558 ceph_assert(r == 0);
3559
3560 r = admin_socket->register_command("list_devices", "list_devices",
3561 asok_hook,
3562 "list OSD devices.");
3563 r = admin_socket->register_command("send_beacon", "send_beacon",
3564 asok_hook,
3565 "send OSD beacon to mon immediately");
3566
3567 r = admin_socket->register_command("dump_osd_network", "dump_osd_network name=value,type=CephInt,req=false", asok_hook,
3568 "Dump osd heartbeat network ping times");
3569 ceph_assert(r == 0);
3570
3571 test_ops_hook = new TestOpsSocketHook(&(this->service), this->store);
3572 // Note: pools are CephString instead of CephPoolname because
3573 // these commands traditionally support both pool names and numbers
3574 r = admin_socket->register_command(
3575 "setomapval",
3576 "setomapval " \
3577 "name=pool,type=CephString " \
3578 "name=objname,type=CephObjectname " \
3579 "name=key,type=CephString "\
3580 "name=val,type=CephString",
3581 test_ops_hook,
3582 "set omap key");
3583 ceph_assert(r == 0);
3584 r = admin_socket->register_command(
3585 "rmomapkey",
3586 "rmomapkey " \
3587 "name=pool,type=CephString " \
3588 "name=objname,type=CephObjectname " \
3589 "name=key,type=CephString",
3590 test_ops_hook,
3591 "remove omap key");
3592 ceph_assert(r == 0);
3593 r = admin_socket->register_command(
3594 "setomapheader",
3595 "setomapheader " \
3596 "name=pool,type=CephString " \
3597 "name=objname,type=CephObjectname " \
3598 "name=header,type=CephString",
3599 test_ops_hook,
3600 "set omap header");
3601 ceph_assert(r == 0);
3602
3603 r = admin_socket->register_command(
3604 "getomap",
3605 "getomap " \
3606 "name=pool,type=CephString " \
3607 "name=objname,type=CephObjectname",
3608 test_ops_hook,
3609 "output entire object map");
3610 ceph_assert(r == 0);
3611
3612 r = admin_socket->register_command(
3613 "truncobj",
3614 "truncobj " \
3615 "name=pool,type=CephString " \
3616 "name=objname,type=CephObjectname " \
3617 "name=len,type=CephInt",
3618 test_ops_hook,
3619 "truncate object to length");
3620 ceph_assert(r == 0);
3621
3622 r = admin_socket->register_command(
3623 "injectdataerr",
3624 "injectdataerr " \
3625 "name=pool,type=CephString " \
3626 "name=objname,type=CephObjectname " \
3627 "name=shardid,type=CephInt,req=false,range=0|255",
3628 test_ops_hook,
3629 "inject data error to an object");
3630 ceph_assert(r == 0);
3631
3632 r = admin_socket->register_command(
3633 "injectmdataerr",
3634 "injectmdataerr " \
3635 "name=pool,type=CephString " \
3636 "name=objname,type=CephObjectname " \
3637 "name=shardid,type=CephInt,req=false,range=0|255",
3638 test_ops_hook,
3639 "inject metadata error to an object");
3640 ceph_assert(r == 0);
3641 r = admin_socket->register_command(
3642 "set_recovery_delay",
3643 "set_recovery_delay " \
3644 "name=utime,type=CephInt,req=false",
3645 test_ops_hook,
3646 "Delay osd recovery by specified seconds");
3647 ceph_assert(r == 0);
3648 r = admin_socket->register_command(
3649 "trigger_scrub",
3650 "trigger_scrub " \
3651 "name=pgid,type=CephString " \
3652 "name=time,type=CephInt,req=false",
3653 test_ops_hook,
3654 "Trigger a scheduled scrub ");
3655 ceph_assert(r == 0);
3656 r = admin_socket->register_command(
3657 "trigger_deep_scrub",
3658 "trigger_deep_scrub " \
3659 "name=pgid,type=CephString " \
3660 "name=time,type=CephInt,req=false",
3661 test_ops_hook,
3662 "Trigger a scheduled deep scrub ");
3663 ceph_assert(r == 0);
3664 r = admin_socket->register_command(
3665 "injectfull",
3666 "injectfull " \
3667 "name=type,type=CephString,req=false " \
3668 "name=count,type=CephInt,req=false ",
3669 test_ops_hook,
3670 "Inject a full disk (optional count times)");
3671 ceph_assert(r == 0);
3672 }
3673
3674 void OSD::create_logger()
3675 {
3676 dout(10) << "create_logger" << dendl;
3677
3678 PerfCountersBuilder osd_plb(cct, "osd", l_osd_first, l_osd_last);
3679
3680 // Latency axis configuration for op histograms, values are in nanoseconds
3681 PerfHistogramCommon::axis_config_d op_hist_x_axis_config{
3682 "Latency (usec)",
3683 PerfHistogramCommon::SCALE_LOG2, ///< Latency in logarithmic scale
3684 0, ///< Start at 0
3685 100000, ///< Quantization unit is 100usec
3686 32, ///< Enough to cover much longer than slow requests
3687 };
3688
3689 // Op size axis configuration for op histograms, values are in bytes
3690 PerfHistogramCommon::axis_config_d op_hist_y_axis_config{
3691 "Request size (bytes)",
3692 PerfHistogramCommon::SCALE_LOG2, ///< Request size in logarithmic scale
3693 0, ///< Start at 0
3694 512, ///< Quantization unit is 512 bytes
3695 32, ///< Enough to cover requests larger than GB
3696 };
3697
3698
3699 // All the basic OSD operation stats are to be considered useful
3700 osd_plb.set_prio_default(PerfCountersBuilder::PRIO_USEFUL);
3701
3702 osd_plb.add_u64(
3703 l_osd_op_wip, "op_wip",
3704 "Replication operations currently being processed (primary)");
3705 osd_plb.add_u64_counter(
3706 l_osd_op, "op",
3707 "Client operations",
3708 "ops", PerfCountersBuilder::PRIO_CRITICAL);
3709 osd_plb.add_u64_counter(
3710 l_osd_op_inb, "op_in_bytes",
3711 "Client operations total write size",
3712 "wr", PerfCountersBuilder::PRIO_INTERESTING, unit_t(UNIT_BYTES));
3713 osd_plb.add_u64_counter(
3714 l_osd_op_outb, "op_out_bytes",
3715 "Client operations total read size",
3716 "rd", PerfCountersBuilder::PRIO_INTERESTING, unit_t(UNIT_BYTES));
3717 osd_plb.add_time_avg(
3718 l_osd_op_lat, "op_latency",
3719 "Latency of client operations (including queue time)",
3720 "l", 9);
3721 osd_plb.add_time_avg(
3722 l_osd_op_process_lat, "op_process_latency",
3723 "Latency of client operations (excluding queue time)");
3724 osd_plb.add_time_avg(
3725 l_osd_op_prepare_lat, "op_prepare_latency",
3726 "Latency of client operations (excluding queue time and wait for finished)");
3727
3728 osd_plb.add_u64_counter(
3729 l_osd_op_r, "op_r", "Client read operations");
3730 osd_plb.add_u64_counter(
3731 l_osd_op_r_outb, "op_r_out_bytes", "Client data read", NULL, PerfCountersBuilder::PRIO_USEFUL, unit_t(UNIT_BYTES));
3732 osd_plb.add_time_avg(
3733 l_osd_op_r_lat, "op_r_latency",
3734 "Latency of read operation (including queue time)");
3735 osd_plb.add_u64_counter_histogram(
3736 l_osd_op_r_lat_outb_hist, "op_r_latency_out_bytes_histogram",
3737 op_hist_x_axis_config, op_hist_y_axis_config,
3738 "Histogram of operation latency (including queue time) + data read");
3739 osd_plb.add_time_avg(
3740 l_osd_op_r_process_lat, "op_r_process_latency",
3741 "Latency of read operation (excluding queue time)");
3742 osd_plb.add_time_avg(
3743 l_osd_op_r_prepare_lat, "op_r_prepare_latency",
3744 "Latency of read operations (excluding queue time and wait for finished)");
3745 osd_plb.add_u64_counter(
3746 l_osd_op_w, "op_w", "Client write operations");
3747 osd_plb.add_u64_counter(
3748 l_osd_op_w_inb, "op_w_in_bytes", "Client data written");
3749 osd_plb.add_time_avg(
3750 l_osd_op_w_lat, "op_w_latency",
3751 "Latency of write operation (including queue time)");
3752 osd_plb.add_u64_counter_histogram(
3753 l_osd_op_w_lat_inb_hist, "op_w_latency_in_bytes_histogram",
3754 op_hist_x_axis_config, op_hist_y_axis_config,
3755 "Histogram of operation latency (including queue time) + data written");
3756 osd_plb.add_time_avg(
3757 l_osd_op_w_process_lat, "op_w_process_latency",
3758 "Latency of write operation (excluding queue time)");
3759 osd_plb.add_time_avg(
3760 l_osd_op_w_prepare_lat, "op_w_prepare_latency",
3761 "Latency of write operations (excluding queue time and wait for finished)");
3762 osd_plb.add_u64_counter(
3763 l_osd_op_rw, "op_rw",
3764 "Client read-modify-write operations");
3765 osd_plb.add_u64_counter(
3766 l_osd_op_rw_inb, "op_rw_in_bytes",
3767 "Client read-modify-write operations write in", NULL, PerfCountersBuilder::PRIO_USEFUL, unit_t(UNIT_BYTES));
3768 osd_plb.add_u64_counter(
3769 l_osd_op_rw_outb,"op_rw_out_bytes",
3770 "Client read-modify-write operations read out ", NULL, PerfCountersBuilder::PRIO_USEFUL, unit_t(UNIT_BYTES));
3771 osd_plb.add_time_avg(
3772 l_osd_op_rw_lat, "op_rw_latency",
3773 "Latency of read-modify-write operation (including queue time)");
3774 osd_plb.add_u64_counter_histogram(
3775 l_osd_op_rw_lat_inb_hist, "op_rw_latency_in_bytes_histogram",
3776 op_hist_x_axis_config, op_hist_y_axis_config,
3777 "Histogram of rw operation latency (including queue time) + data written");
3778 osd_plb.add_u64_counter_histogram(
3779 l_osd_op_rw_lat_outb_hist, "op_rw_latency_out_bytes_histogram",
3780 op_hist_x_axis_config, op_hist_y_axis_config,
3781 "Histogram of rw operation latency (including queue time) + data read");
3782 osd_plb.add_time_avg(
3783 l_osd_op_rw_process_lat, "op_rw_process_latency",
3784 "Latency of read-modify-write operation (excluding queue time)");
3785 osd_plb.add_time_avg(
3786 l_osd_op_rw_prepare_lat, "op_rw_prepare_latency",
3787 "Latency of read-modify-write operations (excluding queue time and wait for finished)");
3788
3789 // Now we move on to some more obscure stats, revert to assuming things
3790 // are low priority unless otherwise specified.
3791 osd_plb.set_prio_default(PerfCountersBuilder::PRIO_DEBUGONLY);
3792
3793 osd_plb.add_time_avg(l_osd_op_before_queue_op_lat, "op_before_queue_op_lat",
3794 "Latency of IO before calling queue(before really queue into ShardedOpWq)"); // client io before queue op_wq latency
3795 osd_plb.add_time_avg(l_osd_op_before_dequeue_op_lat, "op_before_dequeue_op_lat",
3796 "Latency of IO before calling dequeue_op(already dequeued and get PG lock)"); // client io before dequeue_op latency
3797
3798 osd_plb.add_u64_counter(
3799 l_osd_sop, "subop", "Suboperations");
3800 osd_plb.add_u64_counter(
3801 l_osd_sop_inb, "subop_in_bytes", "Suboperations total size", NULL, 0, unit_t(UNIT_BYTES));
3802 osd_plb.add_time_avg(l_osd_sop_lat, "subop_latency", "Suboperations latency");
3803
3804 osd_plb.add_u64_counter(l_osd_sop_w, "subop_w", "Replicated writes");
3805 osd_plb.add_u64_counter(
3806 l_osd_sop_w_inb, "subop_w_in_bytes", "Replicated written data size", NULL, 0, unit_t(UNIT_BYTES));
3807 osd_plb.add_time_avg(
3808 l_osd_sop_w_lat, "subop_w_latency", "Replicated writes latency");
3809 osd_plb.add_u64_counter(
3810 l_osd_sop_pull, "subop_pull", "Suboperations pull requests");
3811 osd_plb.add_time_avg(
3812 l_osd_sop_pull_lat, "subop_pull_latency", "Suboperations pull latency");
3813 osd_plb.add_u64_counter(
3814 l_osd_sop_push, "subop_push", "Suboperations push messages");
3815 osd_plb.add_u64_counter(
3816 l_osd_sop_push_inb, "subop_push_in_bytes", "Suboperations pushed size", NULL, 0, unit_t(UNIT_BYTES));
3817 osd_plb.add_time_avg(
3818 l_osd_sop_push_lat, "subop_push_latency", "Suboperations push latency");
3819
3820 osd_plb.add_u64_counter(l_osd_pull, "pull", "Pull requests sent");
3821 osd_plb.add_u64_counter(l_osd_push, "push", "Push messages sent");
3822 osd_plb.add_u64_counter(l_osd_push_outb, "push_out_bytes", "Pushed size", NULL, 0, unit_t(UNIT_BYTES));
3823
3824 osd_plb.add_u64_counter(
3825 l_osd_rop, "recovery_ops",
3826 "Started recovery operations",
3827 "rop", PerfCountersBuilder::PRIO_INTERESTING);
3828
3829 osd_plb.add_u64_counter(
3830 l_osd_rbytes, "recovery_bytes",
3831 "recovery bytes",
3832 "rbt", PerfCountersBuilder::PRIO_INTERESTING);
3833
3834 osd_plb.add_u64(l_osd_loadavg, "loadavg", "CPU load");
3835 osd_plb.add_u64(
3836 l_osd_cached_crc, "cached_crc", "Total number getting crc from crc_cache");
3837 osd_plb.add_u64(
3838 l_osd_cached_crc_adjusted, "cached_crc_adjusted",
3839 "Total number getting crc from crc_cache with adjusting");
3840 osd_plb.add_u64(l_osd_missed_crc, "missed_crc",
3841 "Total number of crc cache misses");
3842
3843 osd_plb.add_u64(l_osd_pg, "numpg", "Placement groups",
3844 "pgs", PerfCountersBuilder::PRIO_USEFUL);
3845 osd_plb.add_u64(
3846 l_osd_pg_primary, "numpg_primary",
3847 "Placement groups for which this osd is primary");
3848 osd_plb.add_u64(
3849 l_osd_pg_replica, "numpg_replica",
3850 "Placement groups for which this osd is replica");
3851 osd_plb.add_u64(
3852 l_osd_pg_stray, "numpg_stray",
3853 "Placement groups ready to be deleted from this osd");
3854 osd_plb.add_u64(
3855 l_osd_pg_removing, "numpg_removing",
3856 "Placement groups queued for local deletion", "pgsr",
3857 PerfCountersBuilder::PRIO_USEFUL);
3858 osd_plb.add_u64(
3859 l_osd_hb_to, "heartbeat_to_peers", "Heartbeat (ping) peers we send to");
3860 osd_plb.add_u64_counter(l_osd_map, "map_messages", "OSD map messages");
3861 osd_plb.add_u64_counter(l_osd_mape, "map_message_epochs", "OSD map epochs");
3862 osd_plb.add_u64_counter(
3863 l_osd_mape_dup, "map_message_epoch_dups", "OSD map duplicates");
3864 osd_plb.add_u64_counter(
3865 l_osd_waiting_for_map, "messages_delayed_for_map",
3866 "Operations waiting for OSD map");
3867
3868 osd_plb.add_u64_counter(
3869 l_osd_map_cache_hit, "osd_map_cache_hit", "osdmap cache hit");
3870 osd_plb.add_u64_counter(
3871 l_osd_map_cache_miss, "osd_map_cache_miss", "osdmap cache miss");
3872 osd_plb.add_u64_counter(
3873 l_osd_map_cache_miss_low, "osd_map_cache_miss_low",
3874 "osdmap cache miss below cache lower bound");
3875 osd_plb.add_u64_avg(
3876 l_osd_map_cache_miss_low_avg, "osd_map_cache_miss_low_avg",
3877 "osdmap cache miss, avg distance below cache lower bound");
3878 osd_plb.add_u64_counter(
3879 l_osd_map_bl_cache_hit, "osd_map_bl_cache_hit",
3880 "OSDMap buffer cache hits");
3881 osd_plb.add_u64_counter(
3882 l_osd_map_bl_cache_miss, "osd_map_bl_cache_miss",
3883 "OSDMap buffer cache misses");
3884
3885 osd_plb.add_u64(
3886 l_osd_stat_bytes, "stat_bytes", "OSD size", "size",
3887 PerfCountersBuilder::PRIO_USEFUL, unit_t(UNIT_BYTES));
3888 osd_plb.add_u64(
3889 l_osd_stat_bytes_used, "stat_bytes_used", "Used space", "used",
3890 PerfCountersBuilder::PRIO_USEFUL, unit_t(UNIT_BYTES));
3891 osd_plb.add_u64(l_osd_stat_bytes_avail, "stat_bytes_avail", "Available space", NULL, 0, unit_t(UNIT_BYTES));
3892
3893 osd_plb.add_u64_counter(
3894 l_osd_copyfrom, "copyfrom", "Rados \"copy-from\" operations");
3895
3896 osd_plb.add_u64_counter(l_osd_tier_promote, "tier_promote", "Tier promotions");
3897 osd_plb.add_u64_counter(l_osd_tier_flush, "tier_flush", "Tier flushes");
3898 osd_plb.add_u64_counter(
3899 l_osd_tier_flush_fail, "tier_flush_fail", "Failed tier flushes");
3900 osd_plb.add_u64_counter(
3901 l_osd_tier_try_flush, "tier_try_flush", "Tier flush attempts");
3902 osd_plb.add_u64_counter(
3903 l_osd_tier_try_flush_fail, "tier_try_flush_fail",
3904 "Failed tier flush attempts");
3905 osd_plb.add_u64_counter(
3906 l_osd_tier_evict, "tier_evict", "Tier evictions");
3907 osd_plb.add_u64_counter(
3908 l_osd_tier_whiteout, "tier_whiteout", "Tier whiteouts");
3909 osd_plb.add_u64_counter(
3910 l_osd_tier_dirty, "tier_dirty", "Dirty tier flag set");
3911 osd_plb.add_u64_counter(
3912 l_osd_tier_clean, "tier_clean", "Dirty tier flag cleaned");
3913 osd_plb.add_u64_counter(
3914 l_osd_tier_delay, "tier_delay", "Tier delays (agent waiting)");
3915 osd_plb.add_u64_counter(
3916 l_osd_tier_proxy_read, "tier_proxy_read", "Tier proxy reads");
3917 osd_plb.add_u64_counter(
3918 l_osd_tier_proxy_write, "tier_proxy_write", "Tier proxy writes");
3919
3920 osd_plb.add_u64_counter(
3921 l_osd_agent_wake, "agent_wake", "Tiering agent wake up");
3922 osd_plb.add_u64_counter(
3923 l_osd_agent_skip, "agent_skip", "Objects skipped by agent");
3924 osd_plb.add_u64_counter(
3925 l_osd_agent_flush, "agent_flush", "Tiering agent flushes");
3926 osd_plb.add_u64_counter(
3927 l_osd_agent_evict, "agent_evict", "Tiering agent evictions");
3928
3929 osd_plb.add_u64_counter(
3930 l_osd_object_ctx_cache_hit, "object_ctx_cache_hit", "Object context cache hits");
3931 osd_plb.add_u64_counter(
3932 l_osd_object_ctx_cache_total, "object_ctx_cache_total", "Object context cache lookups");
3933
3934 osd_plb.add_u64_counter(l_osd_op_cache_hit, "op_cache_hit");
3935 osd_plb.add_time_avg(
3936 l_osd_tier_flush_lat, "osd_tier_flush_lat", "Object flush latency");
3937 osd_plb.add_time_avg(
3938 l_osd_tier_promote_lat, "osd_tier_promote_lat", "Object promote latency");
3939 osd_plb.add_time_avg(
3940 l_osd_tier_r_lat, "osd_tier_r_lat", "Object proxy read latency");
3941
3942 osd_plb.add_u64_counter(
3943 l_osd_pg_info, "osd_pg_info", "PG updated its info (using any method)");
3944 osd_plb.add_u64_counter(
3945 l_osd_pg_fastinfo, "osd_pg_fastinfo",
3946 "PG updated its info using fastinfo attr");
3947 osd_plb.add_u64_counter(
3948 l_osd_pg_biginfo, "osd_pg_biginfo", "PG updated its biginfo attr");
3949
3950 logger = osd_plb.create_perf_counters();
3951 cct->get_perfcounters_collection()->add(logger);
3952 }
3953
// Build and register the "recoverystate_perf" PerfCounters instance: one
// time-averaged latency counter per state of the PG recovery/peering state
// machine, spanning the [rs_first, rs_last) counter-id range.  The result
// is stored in OSD::recoverystate_perf and added to the process-wide
// perf-counters collection.
3954 void OSD::create_recoverystate_perf()
3955 {
3956 dout(10) << "create_recoverystate_perf" << dendl;
3957
3958 PerfCountersBuilder rs_perf(cct, "recoverystate_perf", rs_first, rs_last);
3959
// One add_time_avg() per recovery-state-machine state; each tracks the
// average time a PG spends in that state.
3960 rs_perf.add_time_avg(rs_initial_latency, "initial_latency", "Initial recovery state latency");
3961 rs_perf.add_time_avg(rs_started_latency, "started_latency", "Started recovery state latency");
3962 rs_perf.add_time_avg(rs_reset_latency, "reset_latency", "Reset recovery state latency");
3963 rs_perf.add_time_avg(rs_start_latency, "start_latency", "Start recovery state latency");
3964 rs_perf.add_time_avg(rs_primary_latency, "primary_latency", "Primary recovery state latency");
3965 rs_perf.add_time_avg(rs_peering_latency, "peering_latency", "Peering recovery state latency");
3966 rs_perf.add_time_avg(rs_backfilling_latency, "backfilling_latency", "Backfilling recovery state latency");
3967 rs_perf.add_time_avg(rs_waitremotebackfillreserved_latency, "waitremotebackfillreserved_latency", "Wait remote backfill reserved recovery state latency");
3968 rs_perf.add_time_avg(rs_waitlocalbackfillreserved_latency, "waitlocalbackfillreserved_latency", "Wait local backfill reserved recovery state latency");
3969 rs_perf.add_time_avg(rs_notbackfilling_latency, "notbackfilling_latency", "Notbackfilling recovery state latency");
3970 rs_perf.add_time_avg(rs_repnotrecovering_latency, "repnotrecovering_latency", "Repnotrecovering recovery state latency");
3971 rs_perf.add_time_avg(rs_repwaitrecoveryreserved_latency, "repwaitrecoveryreserved_latency", "Rep wait recovery reserved recovery state latency");
3972 rs_perf.add_time_avg(rs_repwaitbackfillreserved_latency, "repwaitbackfillreserved_latency", "Rep wait backfill reserved recovery state latency");
3973 rs_perf.add_time_avg(rs_reprecovering_latency, "reprecovering_latency", "RepRecovering recovery state latency");
3974 rs_perf.add_time_avg(rs_activating_latency, "activating_latency", "Activating recovery state latency");
3975 rs_perf.add_time_avg(rs_waitlocalrecoveryreserved_latency, "waitlocalrecoveryreserved_latency", "Wait local recovery reserved recovery state latency");
3976 rs_perf.add_time_avg(rs_waitremoterecoveryreserved_latency, "waitremoterecoveryreserved_latency", "Wait remote recovery reserved recovery state latency");
3977 rs_perf.add_time_avg(rs_recovering_latency, "recovering_latency", "Recovering recovery state latency");
3978 rs_perf.add_time_avg(rs_recovered_latency, "recovered_latency", "Recovered recovery state latency");
3979 rs_perf.add_time_avg(rs_clean_latency, "clean_latency", "Clean recovery state latency");
3980 rs_perf.add_time_avg(rs_active_latency, "active_latency", "Active recovery state latency");
3981 rs_perf.add_time_avg(rs_replicaactive_latency, "replicaactive_latency", "Replicaactive recovery state latency");
3982 rs_perf.add_time_avg(rs_stray_latency, "stray_latency", "Stray recovery state latency");
3983 rs_perf.add_time_avg(rs_getinfo_latency, "getinfo_latency", "Getinfo recovery state latency");
3984 rs_perf.add_time_avg(rs_getlog_latency, "getlog_latency", "Getlog recovery state latency");
3985 rs_perf.add_time_avg(rs_waitactingchange_latency, "waitactingchange_latency", "Waitactingchange recovery state latency");
3986 rs_perf.add_time_avg(rs_incomplete_latency, "incomplete_latency", "Incomplete recovery state latency");
3987 rs_perf.add_time_avg(rs_down_latency, "down_latency", "Down recovery state latency");
3988 rs_perf.add_time_avg(rs_getmissing_latency, "getmissing_latency", "Getmissing recovery state latency");
3989 rs_perf.add_time_avg(rs_waitupthru_latency, "waitupthru_latency", "Waitupthru recovery state latency");
3990 rs_perf.add_time_avg(rs_notrecovering_latency, "notrecovering_latency", "Notrecovering recovery state latency");
3991
3992 recoverystate_perf = rs_perf.create_perf_counters();
3993 cct->get_perfcounters_collection()->add(recoverystate_perf);
3994 }
3995
// Cleanly stop the OSD: drain the op queues, shut down every PG, stop the
// heartbeat/op/command threads and timers, record a clean unmount epoch in
// the superblock, unmount the store and shut down all messengers.
// Returns 0 on success (or if a shutdown is already in progress), otherwise
// the error from queueing the superblock write transaction.
// NOTE: the teardown steps below are order-sensitive (queues are drained
// before PGs are torn down, threads are stopped before their state is
// freed); do not reorder casually.
3996 int OSD::shutdown()
3997 {
// Fast path: skip the orderly teardown entirely and exit the process.
3998 if (cct->_conf->osd_fast_shutdown) {
3999 derr << "*** Immediate shutdown (osd_fast_shutdown=true) ***" << dendl;
4000 cct->_log->flush();
4001 _exit(0);
4002 }
4003
4004 if (!service.prepare_to_stop())
4005 return 0; // already shutting down
4006 osd_lock.Lock();
4007 if (is_stopping()) {
4008 osd_lock.Unlock();
4009 return 0;
4010 }
4011 dout(0) << "shutdown" << dendl;
4012
4013 set_state(STATE_STOPPING);
4014
// Optionally crank up debug logging so the remainder of the shutdown is
// fully traced.
4015 // Debugging
4016 if (cct->_conf.get_val<bool>("osd_debug_shutdown")) {
4017 cct->_conf.set_val("debug_osd", "100");
4018 cct->_conf.set_val("debug_journal", "100");
4019 cct->_conf.set_val("debug_filestore", "100");
4020 cct->_conf.set_val("debug_bluestore", "100");
4021 cct->_conf.set_val("debug_ms", "100");
4022 cct->_conf.apply_changes(nullptr);
4023 }
4024
4025 // stop MgrClient earlier as it's more like an internal consumer of OSD
4026 mgrc.shutdown();
4027
4028 service.start_shutdown();
4029
4030 // stop sending work to pgs. this just prevents any new work in _process
4031 // from racing with on_shutdown and potentially entering the pg after.
4032 op_shardedwq.drain();
4033
4034 // Shutdown PGs
4035 {
4036 vector<PGRef> pgs;
4037 _get_pgs(&pgs);
4038 for (auto pg : pgs) {
4039 pg->shutdown();
4040 }
4041 }
4042
4043 // drain op queue again (in case PGs requeued something)
4044 op_shardedwq.drain();
4045 {
4046 finished.clear(); // zap waiters (bleh, this is messy)
4047 waiting_for_osdmap.clear();
4048 }
4049
4050 // unregister commands
4051 cct->get_admin_socket()->unregister_commands(asok_hook);
4052 delete asok_hook;
4053 asok_hook = NULL;
4054
4055 cct->get_admin_socket()->unregister_commands(test_ops_hook);
4056 delete test_ops_hook;
4057 test_ops_hook = NULL;
4058
// Drop osd_lock while stopping the heartbeat thread and thread pools;
// they may need the lock to finish in-flight work.
4059 osd_lock.Unlock();
4060
4061 heartbeat_lock.Lock();
4062 heartbeat_stop = true;
4063 heartbeat_cond.Signal();
4064 heartbeat_lock.Unlock();
4065 heartbeat_thread.join();
4066
4067 osd_op_tp.drain();
4068 osd_op_tp.stop();
4069 dout(10) << "op sharded tp stopped" << dendl;
4070
4071 command_tp.drain();
4072 command_tp.stop();
4073 dout(10) << "command tp stopped" << dendl;
4074
4075 dout(10) << "stopping agent" << dendl;
4076 service.agent_stop();
4077
4078 boot_finisher.wait_for_empty();
4079
4080 osd_lock.Lock();
4081
4082 boot_finisher.stop();
4083 reset_heartbeat_peers(true);
4084
4085 tick_timer.shutdown();
4086
4087 {
4088 std::lock_guard l(tick_timer_lock);
4089 tick_timer_without_osd_lock.shutdown();
4090 }
4091
// Persist the clean-unmount marker; `r` below is also this function's
// return value.
4092 // note unmount epoch
4093 dout(10) << "noting clean unmount in epoch " << osdmap->get_epoch() << dendl;
4094 superblock.mounted = service.get_boot_epoch();
4095 superblock.clean_thru = osdmap->get_epoch();
4096 ObjectStore::Transaction t;
4097 write_superblock(t);
4098 int r = store->queue_transaction(service.meta_ch, std::move(t));
4099 if (r) {
4100 derr << "OSD::shutdown: error writing superblock: "
4101 << cpp_strerror(r) << dendl;
4102 }
4103
4104
4105 service.shutdown_reserver();
4106
// Detach every PG from its shard slot and drop our collection handles,
// looping until no PGs remain.  A PG still referenced elsewhere is a
// leak; optionally abort on it for debugging.
4107 // Remove PGs
4108 #ifdef PG_DEBUG_REFS
4109 service.dump_live_pgids();
4110 #endif
4111 while (true) {
4112 vector<PGRef> pgs;
4113 _get_pgs(&pgs, true);
4114 if (pgs.empty()) {
4115 break;
4116 }
4117 for (auto& pg : pgs) {
4118 if (pg->is_deleted()) {
4119 continue;
4120 }
4121 dout(20) << " kicking pg " << pg << dendl;
4122 pg->lock();
4123 if (pg->get_num_ref() != 1) {
4124 derr << "pgid " << pg->get_pgid() << " has ref count of "
4125 << pg->get_num_ref() << dendl;
4126 #ifdef PG_DEBUG_REFS
4127 pg->dump_live_ids();
4128 #endif
4129 if (cct->_conf->osd_shutdown_pgref_assert) {
4130 ceph_abort();
4131 }
4132 }
4133 pg->ch.reset();
4134 pg->unlock();
4135 }
4136 }
4137 #ifdef PG_DEBUG_REFS
4138 service.dump_live_pgids();
4139 #endif
4140
// remove_observer must be called without osd_lock held.
4141 osd_lock.Unlock();
4142 cct->_conf.remove_observer(this);
4143 osd_lock.Lock();
4144
4145 service.meta_ch.reset();
4146
4147 dout(10) << "syncing store" << dendl;
4148 enable_disable_fuse(true);
4149
4150 if (cct->_conf->osd_journal_flush_on_shutdown) {
4151 dout(10) << "flushing journal" << dendl;
4152 store->flush_journal();
4153 }
4154
4155 monc->shutdown();
4156 osd_lock.Unlock();
4157
// Drop our OSDMap references (global and per-shard) before tearing down
// the service and the store.
4158 map_lock.get_write();
4159 osdmap = OSDMapRef();
4160 map_lock.put_write();
4161
4162 for (auto s : shards) {
4163 std::lock_guard l(s->osdmap_lock);
4164 s->shard_osdmap = OSDMapRef();
4165 }
4166 service.shutdown();
4167
// Re-acquire osd_lock (held via guard until return) while unmounting and
// deleting the object store.
4168 std::lock_guard lock(osd_lock);
4169 store->umount();
4170 delete store;
4171 store = nullptr;
4172 dout(10) << "Store synced" << dendl;
4173
4174 op_tracker.on_shutdown();
4175
4176 class_handler->shutdown();
4177 client_messenger->shutdown();
4178 cluster_messenger->shutdown();
4179 hb_front_client_messenger->shutdown();
4180 hb_back_client_messenger->shutdown();
4181 objecter_messenger->shutdown();
4182 hb_front_server_messenger->shutdown();
4183 hb_back_server_messenger->shutdown();
4184
4185 return r;
4186 }
4187
// Synchronously send a monitor command.  If the command fails with -ENOENT
// (this OSD id is not yet known to the cluster), issue a one-time
// "osd create" for our id/fsid and retry the original command.
// Returns 0 on success, or a negative errno from the mon.
4188 int OSD::mon_cmd_maybe_osd_create(string &cmd)
4189 {
4190 bool created = false;
4191 while (true) {
4192 dout(10) << __func__ << " cmd: " << cmd << dendl;
4193 vector<string> vcmd{cmd};
4194 bufferlist inbl;
4195 C_SaferCond w;
4196 string outs;
4197 monc->start_mon_command(vcmd, inbl, NULL, &outs, &w);
// Block until the mon replies.
4198 int r = w.wait();
4199 if (r < 0) {
// Only attempt the implicit "osd create" once; -ENOENT a second time
// falls through to the generic failure path below.
4200 if (r == -ENOENT && !created) {
4201 string newcmd = "{\"prefix\": \"osd create\", \"id\": " + stringify(whoami)
4202 + ", \"uuid\": \"" + stringify(superblock.osd_fsid) + "\"}";
4203 vector<string> vnewcmd{newcmd};
4204 bufferlist inbl;
4205 C_SaferCond w;
4206 string outs;
4207 monc->start_mon_command(vnewcmd, inbl, NULL, &outs, &w);
// NOTE: this inner `r` intentionally shadows the outer one; it holds
// the result of the "osd create" command only.
4208 int r = w.wait();
4209 if (r < 0) {
4210 derr << __func__ << " fail: osd does not exist and created failed: "
4211 << cpp_strerror(r) << dendl;
4212 return r;
4213 }
4214 created = true;
// Retry the original command now that the osd id exists.
4215 continue;
4216 }
4217 derr << __func__ << " fail: '" << outs << "': " << cpp_strerror(r) << dendl;
4218 return r;
4219 }
4220 break;
4221 }
4222
4223 return 0;
4224 }
4225
// Register/refresh this OSD's position and weight in the CRUSH map via an
// "osd crush create-or-move" mon command.  The weight comes from
// osd_crush_initial_weight when set (>= 0), otherwise it is derived from
// the store's total capacity in TiB (floored at 0.00001 so a tiny device
// never gets weight 0).  No-op when osd_crush_update_on_start is false.
// Returns 0 on success or a negative errno.
4226 int OSD::update_crush_location()
4227 {
4228 if (!cct->_conf->osd_crush_update_on_start) {
4229 dout(10) << __func__ << " osd_crush_update_on_start = false" << dendl;
4230 return 0;
4231 }
4232
// Format the weight as a fixed 4-decimal string for the JSON command.
4233 char weight[32];
4234 if (cct->_conf->osd_crush_initial_weight >= 0) {
4235 snprintf(weight, sizeof(weight), "%.4lf", cct->_conf->osd_crush_initial_weight);
4236 } else {
4237 struct store_statfs_t st;
4238 osd_alert_list_t alerts;
4239 int r = store->statfs(&st, &alerts);
4240 if (r < 0) {
4241 derr << "statfs: " << cpp_strerror(r) << dendl;
4242 return r;
4243 }
4244 snprintf(weight, sizeof(weight), "%.4lf",
4245 std::max(.00001,
4246 double(st.total) /
4247 double(1ull << 40 /* TB */)));
4248 }
4249
// The configured crush location (e.g. host=..., rack=...) becomes the
// "args" array of the command.
4250 std::multimap<string,string> loc = cct->crush_location.get_location();
4251 dout(10) << __func__ << " crush location is " << loc << dendl;
4252
4253 string cmd =
4254 string("{\"prefix\": \"osd crush create-or-move\", ") +
4255 string("\"id\": ") + stringify(whoami) + string(", ") +
4256 string("\"weight\":") + weight + string(", ") +
4257 string("\"args\": [");
4258 for (multimap<string,string>::iterator p = loc.begin(); p != loc.end(); ++p) {
4259 if (p != loc.begin())
4260 cmd += ", ";
4261 cmd += "\"" + p->first + "=" + p->second + "\"";
4262 }
4263 cmd += "]}";
4264
4265 return mon_cmd_maybe_osd_create(cmd);
4266 }
4267
// Publish this OSD's device class (hdd/ssd/...) to the CRUSH map via an
// "osd crush set-device-class" mon command.  The class is read from the
// locally stored "crush_device_class" metadata, falling back to the
// store's default class.  No-op if osd_class_update_on_start is false or
// no class is known.  -EBUSY from the mon means the OSD is already bound
// to a class and is treated as success.
4268 int OSD::update_crush_device_class()
4269 {
4270 if (!cct->_conf->osd_class_update_on_start) {
4271 dout(10) << __func__ << " osd_class_update_on_start = false" << dendl;
4272 return 0;
4273 }
4274
4275 string device_class;
4276 int r = store->read_meta("crush_device_class", &device_class);
4277 if (r < 0 || device_class.empty()) {
4278 device_class = store->get_default_device_class();
4279 }
4280
4281 if (device_class.empty()) {
4282 dout(20) << __func__ << " no device class stored locally" << dendl;
4283 return 0;
4284 }
4285
4286 string cmd =
4287 string("{\"prefix\": \"osd crush set-device-class\", ") +
4288 string("\"class\": \"") + device_class + string("\", ") +
4289 string("\"ids\": [\"") + stringify(whoami) + string("\"]}");
4290
4291 r = mon_cmd_maybe_osd_create(cmd);
4292 if (r == -EBUSY) {
4293 // good, already bound to a device-class
4294 return 0;
4295 } else {
4296 return r;
4297 }
4298 }
4299
// Append a write of the encoded OSD superblock (to the meta collection's
// superblock object) onto transaction `t`.  The caller is responsible for
// queueing the transaction.  Always stamps the baseline incompat feature
// first so the superblock is never written without it.
4300 void OSD::write_superblock(ObjectStore::Transaction& t)
4301 {
4302 dout(10) << "write_superblock " << superblock << dendl;
4303
4304 //hack: at minimum it's using the baseline feature set
4305 if (!superblock.compat_features.incompat.contains(CEPH_OSD_FEATURE_INCOMPAT_BASE))
4306 superblock.compat_features.incompat.insert(CEPH_OSD_FEATURE_INCOMPAT_BASE);
4307
4308 bufferlist bl;
4309 encode(superblock, bl);
4310 t.write(coll_t::meta(), OSD_SUPERBLOCK_GOBJECT, 0, bl.length(), bl);
4311 }
4312
// Read and decode the on-disk superblock object into OSD::superblock.
// Returns 0 on success or the negative errno from the store read.
// (A decode failure on corrupt data would throw; it is not mapped to an
// error return here.)
4313 int OSD::read_superblock()
4314 {
4315 bufferlist bl;
// length 0 == read the whole object.
4316 int r = store->read(service.meta_ch, OSD_SUPERBLOCK_GOBJECT, 0, 0, bl);
4317 if (r < 0)
4318 return r;
4319
4320 auto p = bl.cbegin();
4321 decode(superblock, p);
4322
4323 dout(10) << "read_superblock " << superblock << dendl;
4324
4325 return 0;
4326 }
4327
// Scan every PG collection on the store and remove leftover temporary
// objects (plus pool==-1 objects, which pre-Jewel "Hammer" used for
// temps).  Relies on temp objects sorting before regular objects in
// collection_list() output: the scan stops at the first non-temp object.
// Deletions are batched into transactions of at most
// osd_target_transaction_size removes each.
4328 void OSD::clear_temp_objects()
4329 {
4330 dout(10) << __func__ << dendl;
4331 vector<coll_t> ls;
4332 store->list_collections(ls);
4333 for (vector<coll_t>::iterator p = ls.begin(); p != ls.end(); ++p) {
4334 spg_t pgid;
4335 if (!p->is_pg(&pgid))
4336 continue;
4337
4338 // list temp objects
4339 dout(20) << " clearing temps in " << *p << " pgid " << pgid << dendl;
4340
4341 vector<ghobject_t> temps;
4342 ghobject_t next;
4343 while (1) {
4344 vector<ghobject_t> objects;
// NOTE(review): the collection handle is re-opened on every listing
// pass; presumably cheap, but it could be hoisted above the loop.
4345 auto ch = store->open_collection(*p);
4346 ceph_assert(ch);
4347 store->collection_list(ch, next, ghobject_t::get_max(),
4348 store->get_ideal_list_max(),
4349 &objects, &next);
4350 if (objects.empty())
4351 break;
4352 vector<ghobject_t>::iterator q;
4353 for (q = objects.begin(); q != objects.end(); ++q) {
4354 // Hammer set pool for temps to -1, so check for clean-up
4355 if (q->hobj.is_temp() || (q->hobj.pool == -1)) {
4356 temps.push_back(*q);
4357 } else {
4358 break;
4359 }
4360 }
4361 // If we saw a non-temp object and hit the break above we can
4362 // break out of the while loop too.
4363 if (q != objects.end())
4364 break;
4365 }
// Queue the batched removals, flushing a transaction every
// osd_target_transaction_size deletes.
4366 if (!temps.empty()) {
4367 ObjectStore::Transaction t;
4368 int removed = 0;
4369 for (vector<ghobject_t>::iterator q = temps.begin(); q != temps.end(); ++q) {
4370 dout(20) << " removing " << *p << " object " << *q << dendl;
4371 t.remove(*p, *q);
4372 if (++removed > cct->_conf->osd_target_transaction_size) {
4373 store->queue_transaction(service.meta_ch, std::move(t));
4374 t = ObjectStore::Transaction();
4375 removed = 0;
4376 }
4377 }
4378 if (removed) {
4379 store->queue_transaction(service.meta_ch, std::move(t));
4380 }
4381 }
4382 }
4383 }
4384
// Delete every object in collection `tmp` (a defunct/legacy PG collection
// for `pgid`), removing each object's snap-mapper entry as well, then
// remove the collection itself.  Work is chunked into transactions of
// osd_target_transaction_size objects, and the function blocks until the
// final transaction has committed.
4385 void OSD::recursive_remove_collection(CephContext* cct,
4386 ObjectStore *store, spg_t pgid,
4387 coll_t tmp)
4388 {
// The snap mapper's records live in the meta collection (coll_t()),
// keyed by the snapmapper oid.
4389 OSDriver driver(
4390 store,
4391 coll_t(),
4392 make_snapmapper_oid());
4393
4394 ObjectStore::CollectionHandle ch = store->open_collection(tmp);
4395 ObjectStore::Transaction t;
4396 SnapMapper mapper(cct, &driver, 0, 0, 0, pgid.shard);
4397
4398 ghobject_t next;
4399 int max = cct->_conf->osd_target_transaction_size;
4400 vector<ghobject_t> objects;
4401 objects.reserve(max);
4402 while (true) {
4403 objects.clear();
// List the next batch of up to `max` objects, resuming at `next`.
4404 store->collection_list(ch, next, ghobject_t::get_max(),
4405 max, &objects, &next);
4406 generic_dout(10) << __func__ << " " << objects << dendl;
4407 if (objects.empty())
4408 break;
4409 for (auto& p: objects) {
4410 OSDriver::OSTransaction _t(driver.get_transaction(&t));
// -ENOENT is fine (object had no snap-mapper entry); anything else
// is corruption.
4411 int r = mapper.remove_oid(p.hobj, &_t);
4412 if (r != 0 && r != -ENOENT)
4413 ceph_abort();
4414 t.remove(tmp, p);
4415 }
4416 int r = store->queue_transaction(ch, std::move(t));
4417 ceph_assert(r == 0);
4418 t = ObjectStore::Transaction();
4419 }
4420 t.remove_collection(tmp);
4421 int r = store->queue_transaction(ch, std::move(t));
4422 ceph_assert(r == 0);
4423
// Wait for the removal to commit before returning.
4424 C_SaferCond waiter;
4425 if (!ch->flush_commit(&waiter)) {
4426 waiter.wait();
4427 }
4428 }
4429
4430
4431 // ======================================================
4432 // PG's
4433
// Construct a PG object for `pgid`.  Pool info comes from `createmap` if
// the pool still exists there; otherwise it is decoded from the on-disk
// "final pool info" tombstone written when the pool was deleted.  Returns
// nullptr if the tombstone is missing or predates the ec_profile field
// (dev release v13.0.2); aborts on an unknown pool type.
4434 PG* OSD::_make_pg(
4435 OSDMapRef createmap,
4436 spg_t pgid)
4437 {
4438 dout(10) << __func__ << " " << pgid << dendl;
4439 pg_pool_t pi;
4440 map<string,string> ec_profile;
4441 string name;
4442 if (createmap->have_pg_pool(pgid.pool())) {
4443 pi = *createmap->get_pg_pool(pgid.pool());
4444 name = createmap->get_pool_name(pgid.pool());
4445 if (pi.is_erasure()) {
4446 ec_profile = createmap->get_erasure_code_profile(pi.erasure_code_profile);
4447 }
4448 } else {
4449 // pool was deleted; grab final pg_pool_t off disk.
4450 ghobject_t oid = make_final_pool_info_oid(pgid.pool());
4451 bufferlist bl;
4452 int r = store->read(service.meta_ch, oid, 0, 0, bl);
4453 if (r < 0) {
4454 derr << __func__ << " missing pool " << pgid.pool() << " tombstone"
4455 << dendl;
4456 return nullptr;
4457 }
4458 ceph_assert(r >= 0);
// Decode order must match the tombstone's encode order:
// pg_pool_t, pool name, then (if present) the ec profile.
4459 auto p = bl.cbegin();
4460 decode(pi, p);
4461 decode(name, p);
4462 if (p.end()) { // dev release v13.0.2 did not include ec_profile
4463 derr << __func__ << " missing ec_profile from pool " << pgid.pool()
4464 << " tombstone" << dendl;
4465 return nullptr;
4466 }
4467 decode(ec_profile, p);
4468 }
4469 PGPool pool(cct, createmap, pgid.pool(), pi, name);
4470 PG *pg;
// Both replicated and EC pools are backed by PrimaryLogPG.
4471 if (pi.type == pg_pool_t::TYPE_REPLICATED ||
4472 pi.type == pg_pool_t::TYPE_ERASURE)
4473 pg = new PrimaryLogPG(&service, createmap, pool, ec_profile, pgid);
4474 else
4475 ceph_abort();
4476 return pg;
4477 }
4478
// Collect references to all live (non-deleted) PGs across every shard
// into *v, replacing its previous contents.  With clear_too=true each
// collected PG is also detached from its shard slot (used during
// shutdown).  Each shard's slot map is walked under that shard's lock.
4479 void OSD::_get_pgs(vector<PGRef> *v, bool clear_too)
4480 {
4481 v->clear();
4482 v->reserve(get_num_pgs());
4483 for (auto& s : shards) {
4484 std::lock_guard l(s->shard_lock);
4485 for (auto& j : s->pg_slots) {
4486 if (j.second->pg &&
4487 !j.second->pg->is_deleted()) {
4488 v->push_back(j.second->pg);
4489 if (clear_too) {
4490 s->_detach_pg(j.second.get());
4491 }
4492 }
4493 }
4494 }
4495 }
4496
4497 void OSD::_get_pgids(vector<spg_t> *v)
4498 {
4499 v->clear();
4500 v->reserve(get_num_pgs());
4501 for (auto& s : shards) {
4502 std::lock_guard l(s->shard_lock);
4503 for (auto& j : s->pg_slots) {
4504 if (j.second->pg &&
4505 !j.second->pg->is_deleted()) {
4506 v->push_back(j.first);
4507 }
4508 }
4509 }
4510 }
4511
// Create a new slot for `pg` in its owning shard (chosen by hashing the
// pgid over num_shards) and attach the PG to it.  Asserts that no slot
// already exists for this pgid.
4512 void OSD::register_pg(PGRef pg)
4513 {
4514 spg_t pgid = pg->get_pgid();
4515 uint32_t shard_index = pgid.hash_to_shard(num_shards);
4516 auto sdata = shards[shard_index];
4517 std::lock_guard l(sdata->shard_lock);
// emplace must succeed: registering an already-registered pgid is a bug.
4518 auto r = sdata->pg_slots.emplace(pgid, make_unique<OSDShardPGSlot>());
4519 ceph_assert(r.second);
4520 auto *slot = r.first->second.get();
4521 dout(20) << __func__ << " " << pgid << " " << pg << dendl;
4522 sdata->_attach_pg(slot, pg.get());
4523 }
4524
// Final step of PG deletion: detach `pg` from its shard slot, un-prime any
// split children that were pre-created from it (based on old_pg_num), and
// decrement the primary/replica/stray PG perf counter.  Returns false if
// the slot is already gone/empty or the PG is still waiting on a merge
// epoch (deletion must not race with a merge).
4525 bool OSD::try_finish_pg_delete(PG *pg, unsigned old_pg_num)
4526 {
4527 auto sdata = pg->osd_shard;
4528 ceph_assert(sdata);
4529 {
4530 std::lock_guard l(sdata->shard_lock);
4531 auto p = sdata->pg_slots.find(pg->pg_id);
4532 if (p == sdata->pg_slots.end() ||
4533 !p->second->pg) {
4534 dout(20) << __func__ << " " << pg->pg_id << " not found" << dendl;
4535 return false;
4536 }
4537 if (p->second->waiting_for_merge_epoch) {
4538 dout(20) << __func__ << " " << pg->pg_id << " waiting for merge" << dendl;
4539 return false;
4540 }
4541 dout(20) << __func__ << " " << pg->pg_id << " " << pg << dendl;
4542 sdata->_detach_pg(p->second.get());
4543 }
4544
// Children of this pg (under the old pg_num) may have been primed on any
// shard; clear them everywhere.
4545 for (auto shard : shards) {
4546 shard->unprime_split_children(pg->pg_id, old_pg_num);
4547 }
4548
4549 // update pg count now since we might not get an osdmap any time soon.
4550 if (pg->is_primary())
4551 service.logger->dec(l_osd_pg_primary);
4552 else if (pg->is_replica())
4553 service.logger->dec(l_osd_pg_replica);
4554 else
4555 service.logger->dec(l_osd_pg_stray);
4556
4557 return true;
4558 }
4559
4560 PGRef OSD::_lookup_pg(spg_t pgid)
4561 {
4562 uint32_t shard_index = pgid.hash_to_shard(num_shards);
4563 auto sdata = shards[shard_index];
4564 std::lock_guard l(sdata->shard_lock);
4565 auto p = sdata->pg_slots.find(pgid);
4566 if (p == sdata->pg_slots.end()) {
4567 return nullptr;
4568 }
4569 return p->second->pg;
4570 }
4571
4572 PGRef OSD::_lookup_lock_pg(spg_t pgid)
4573 {
4574 PGRef pg = _lookup_pg(pgid);
4575 if (!pg) {
4576 return nullptr;
4577 }
4578 pg->lock();
4579 if (!pg->is_deleted()) {
4580 return pg;
4581 }
4582 pg->unlock();
4583 return nullptr;
4584 }
4585
// Public wrapper around _lookup_lock_pg(): returns the PG locked, or
// nullptr if it does not exist (or is deleted).  Caller must unlock.
4586 PGRef OSD::lookup_lock_pg(spg_t pgid)
4587 {
4588 return _lookup_lock_pg(pgid);
4589 }
4590
// Called at startup (with osd_lock held) to load every PG from the object
// store: reads the persisted pg_num history, walks all collections,
// removes temp/flagged-for-removal collections, instantiates each PG from
// the osdmap of its stored epoch, reads its state, and registers it with
// its shard.
4591 void OSD::load_pgs()
4592 {
4593 ceph_assert(osd_lock.is_locked());
4594 dout(0) << "load_pgs" << dendl;
4595
// Load the persisted pg_num change history (used for split/merge
// tracking); absence just leaves pg_num_history empty.
4596 {
4597 auto pghist = make_pg_num_history_oid();
4598 bufferlist bl;
4599 int r = store->read(service.meta_ch, pghist, 0, 0, bl, 0);
4600 if (r >= 0 && bl.length() > 0) {
4601 auto p = bl.cbegin();
4602 decode(pg_num_history, p);
4603 }
4604 dout(20) << __func__ << " pg_num_history " << pg_num_history << dendl;
4605 }
4606
4607 vector<coll_t> ls;
4608 int r = store->list_collections(ls);
4609 if (r < 0) {
4610 derr << "failed to list pgs: " << cpp_strerror(-r) << dendl;
4611 }
4612
4613 int num = 0;
4614 for (vector<coll_t>::iterator it = ls.begin();
4615 it != ls.end();
4616 ++it) {
4617 spg_t pgid;
// Temp collections and PGs flagged for removal are garbage from a
// previous run; delete them outright.
4618 if (it->is_temp(&pgid) ||
4619 (it->is_pg(&pgid) && PG::_has_removal_flag(store, pgid))) {
4620 dout(10) << "load_pgs " << *it
4621 << " removing, legacy or flagged for removal pg" << dendl;
4622 recursive_remove_collection(cct, store, pgid, *it);
4623 continue;
4624 }
4625
4626 if (!it->is_pg(&pgid)) {
4627 dout(10) << "load_pgs ignoring unrecognized " << *it << dendl;
4628 continue;
4629 }
4630
4631 dout(10) << "pgid " << pgid << " coll " << coll_t(pgid) << dendl;
4632 epoch_t map_epoch = 0;
4633 int r = PG::peek_map_epoch(store, pgid, &map_epoch);
4634 if (r < 0) {
4635 derr << __func__ << " unable to peek at " << pgid << " metadata, skipping"
4636 << dendl;
4637 continue;
4638 }
4639
// Build the PG against the osdmap it was last persisted under (or the
// current map if no epoch was recorded).
4640 PGRef pg;
4641 if (map_epoch > 0) {
4642 OSDMapRef pgosdmap = service.try_get_map(map_epoch);
4643 if (!pgosdmap) {
4644 if (!osdmap->have_pg_pool(pgid.pool())) {
4645 derr << __func__ << ": could not find map for epoch " << map_epoch
4646 << " on pg " << pgid << ", but the pool is not present in the "
4647 << "current map, so this is probably a result of bug 10617. "
4648 << "Skipping the pg for now, you can use ceph-objectstore-tool "
4649 << "to clean it up later." << dendl;
4650 continue;
4651 } else {
4652 derr << __func__ << ": have pgid " << pgid << " at epoch "
4653 << map_epoch << ", but missing map. Crashing."
4654 << dendl;
4655 ceph_abort_msg("Missing map in load_pgs");
4656 }
4657 }
4658 pg = _make_pg(pgosdmap, pgid);
4659 } else {
4660 pg = _make_pg(osdmap, pgid);
4661 }
// _make_pg returns nullptr when the pool tombstone is missing/partial;
// the collection is unusable, so remove it.
4662 if (!pg) {
4663 recursive_remove_collection(cct, store, pgid, *it);
4664 continue;
4665 }
4666
4667 // there can be no waiters here, so we don't call _wake_pg_slot
4668
4669 pg->lock();
4670 pg->ch = store->open_collection(pg->coll);
4671
4672 // read pg state, log
4673 pg->read_state(store);
4674
// A PG whose on-disk state says it no longer exists is removed.
4675 if (pg->dne()) {
4676 dout(10) << "load_pgs " << *it << " deleting dne" << dendl;
4677 pg->ch = nullptr;
4678 pg->unlock();
4679 recursive_remove_collection(cct, store, pgid, *it);
4680 continue;
4681 }
// Route this collection's commit completions to the owning shard's
// context queue.
4682 {
4683 uint32_t shard_index = pgid.hash_to_shard(shards.size());
4684 assert(NULL != shards[shard_index]);
4685 store->set_collection_commit_queue(pg->coll, &(shards[shard_index]->context_queue));
4686 }
4687
4688 pg->reg_next_scrub();
4689
4690 dout(10) << __func__ << " loaded " << *pg << dendl;
4691 pg->unlock();
4692
4693 register_pg(pg);
4694 ++num;
4695 }
4696 dout(0) << __func__ << " opened " << num << " pgs" << dendl;
4697 }
4698
4699
// Instantiate a brand-new PG described by `info`: create its collection,
// initialize its metadata/history, and run it through the initial peering
// events.  Returns the new PG, or nullptr when creation is withheld (max
// PG limit hit, pool gone, or a stale mon create message).
4700 PGRef OSD::handle_pg_create_info(const OSDMapRef& osdmap,
4701 const PGCreateInfo *info)
4702 {
4703 spg_t pgid = info->pgid;
4704
4705 if (maybe_wait_for_max_pg(osdmap, pgid, info->by_mon)) {
4706 dout(10) << __func__ << " hit max pg, dropping" << dendl;
4707 return nullptr;
4708 }
4709
4710 PG::RecoveryCtx rctx = create_context();
4711
// The map at the creation epoch, which may be older than `osdmap`.
4712 OSDMapRef startmap = get_map(info->epoch);
4713
4714 if (info->by_mon) {
4715 int64_t pool_id = pgid.pgid.pool();
4716 const pg_pool_t *pool = osdmap->get_pg_pool(pool_id);
4717 if (!pool) {
4718 dout(10) << __func__ << " ignoring " << pgid << ", pool dne" << dendl;
4719 return nullptr;
4720 }
4721 if (osdmap->require_osd_release >= CEPH_RELEASE_NAUTILUS &&
4722 !pool->has_flag(pg_pool_t::FLAG_CREATING)) {
4723 // this ensures we do not process old creating messages after the
4724 // pool's initial pgs have been created (and pg are subsequently
4725 // allowed to split or merge).
4726 dout(20) << __func__ << " dropping " << pgid
4727 << "create, pool does not have CREATING flag set" << dendl;
4728 return nullptr;
4729 }
4730 }
4731
4732 int up_primary, acting_primary;
4733 vector<int> up, acting;
4734 startmap->pg_to_up_acting_osds(
4735 pgid.pgid, &up, &up_primary, &acting, &acting_primary);
4736
4737 const pg_pool_t* pp = startmap->get_pg_pool(pgid.pool());
// EC-overwrite pools on non-bluestore backends cannot detect bitrot via
// deep scrub; warn loudly but proceed.
4738 if (pp->has_flag(pg_pool_t::FLAG_EC_OVERWRITES) &&
4739 store->get_type() != "bluestore") {
4740 clog->warn() << "pg " << pgid
4741 << " is at risk of silent data corruption: "
4742 << "the pool allows ec overwrites but is not stored in "
4743 << "bluestore, so deep scrubbing will not detect bitrot";
4744 }
4745 PG::_create(*rctx.transaction, pgid, pgid.get_split_bits(pp->get_pg_num()));
4746 PG::_init(*rctx.transaction, pgid, pp);
4747
// For EC pools the role must match the shard id; otherwise we are not a
// member of this PG's acting set.
4748 int role = startmap->calc_pg_role(whoami, acting, acting.size());
4749 if (!pp->is_replicated() && role != pgid.shard) {
4750 role = -1;
4751 }
4752
4753 PGRef pg = _make_pg(startmap, pgid);
4754 pg->ch = store->create_new_collection(pg->coll);
4755
// Route this collection's commit completions to the owning shard's
// context queue.
4756 {
4757 uint32_t shard_index = pgid.hash_to_shard(shards.size());
4758 assert(NULL != shards[shard_index]);
4759 store->set_collection_commit_queue(pg->coll, &(shards[shard_index]->context_queue));
4760 }
4761
4762 pg->lock(true);
4763
4764 // we are holding the shard lock
4765 ceph_assert(!pg->is_deleted());
4766
4767 pg->init(
4768 role,
4769 up,
4770 up_primary,
4771 acting,
4772 acting_primary,
4773 info->history,
4774 info->past_intervals,
4775 false,
4776 rctx.transaction);
4777
4778 pg->init_collection_pool_opts();
4779
// New primaries inherit any currently-active dynamic perf-stat queries.
4780 if (pg->is_primary()) {
4781 Mutex::Locker locker(m_perf_queries_lock);
4782 pg->set_dynamic_perf_stats_queries(m_perf_queries);
4783 }
4784
4785 pg->handle_initialize(&rctx);
4786 pg->handle_activate_map(&rctx);
4787
4788 dispatch_context(rctx, pg.get(), osdmap, nullptr);
4789
4790 dout(10) << __func__ << " new pg " << *pg << dendl;
4791 return pg;
4792 }
4793
// Enforce the hard per-OSD PG limit (mon_max_pg_per_osd *
// osd_max_pg_per_osd_hard_ratio).  Returns false when the new PG may be
// created now; returns true when creation must be deferred, in which case
// the request is parked on the pending-creates queues (a counter for
// mon-initiated creates, a pgid set for peer-initiated ones) to be
// retried by resume_creating_pg().
4794 bool OSD::maybe_wait_for_max_pg(const OSDMapRef& osdmap,
4795 spg_t pgid,
4796 bool is_mon_create)
4797 {
4798 const auto max_pgs_per_osd =
4799 (cct->_conf.get_val<uint64_t>("mon_max_pg_per_osd") *
4800 cct->_conf.get_val<double>("osd_max_pg_per_osd_hard_ratio"));
4801
4802 if (num_pgs < max_pgs_per_osd) {
4803 return false;
4804 }
4805
4806 std::lock_guard l(pending_creates_lock);
4807 if (is_mon_create) {
4808 pending_creates_from_mon++;
4809 } else {
// Remember whether we would have been the primary so pg_temp can be
// twiddled appropriately later.
4810 bool is_primary = osdmap->get_pg_acting_rank(pgid.pgid, whoami) == 0;
4811 pending_creates_from_osd.emplace(pgid.pgid, is_primary);
4812 }
4813 dout(1) << __func__ << " withhold creation of pg " << pgid
4814 << ": " << num_pgs << " >= "<< max_pgs_per_osd << dendl;
4815 return true;
4816 }
4817
4818 // to re-trigger a peering, we have to twiddle the pg mapping a little bit,
4819 // see PG::should_restart_peering(). OSDMap::pg_to_up_acting_osds() will turn
4820 // to up set if pg_temp is empty. so an empty pg_temp won't work.
static std::vector<int32_t> twiddle(const std::vector<int>& acting) {
  // Produce a pg_temp mapping that is guaranteed to differ from `acting`
  // so that peering restarts: a multi-OSD set collapses to just its first
  // member, while an empty or single-OSD set gets a trailing -1 appended
  // (an empty pg_temp would be ignored — see the comment above).
  if (acting.size() <= 1) {
    std::vector<int32_t> tweaked(acting.begin(), acting.end());
    tweaked.push_back(-1);
    return tweaked;
  }
  return {acting[0]};
}
4830
// Retry PG creations previously withheld by maybe_wait_for_max_pg().
// Computes how many PG slots are spare under the hard limit, consumes the
// pending mon-create count first, then re-triggers peering for pending
// peer-creates by queueing a twiddled pg_temp.  Finally refreshes the
// mon subscriptions (pg_creates / osdmap) needed to make progress.
4831 void OSD::resume_creating_pg()
4832 {
4833 bool do_sub_pg_creates = false;
4834 bool have_pending_creates = false;
4835 {
4836 const auto max_pgs_per_osd =
4837 (cct->_conf.get_val<uint64_t>("mon_max_pg_per_osd") *
4838 cct->_conf.get_val<double>("osd_max_pg_per_osd_hard_ratio"));
4839 if (max_pgs_per_osd <= num_pgs) {
4840 // this could happen if admin decreases this setting before a PG is removed
4841 return;
4842 }
4843 unsigned spare_pgs = max_pgs_per_osd - num_pgs;
4844 std::lock_guard l(pending_creates_lock);
// Mon-initiated creates are consumed first, up to the spare capacity.
4845 if (pending_creates_from_mon > 0) {
4846 dout(20) << __func__ << " pending_creates_from_mon "
4847 << pending_creates_from_mon << dendl;
4848 do_sub_pg_creates = true;
4849 if (pending_creates_from_mon >= spare_pgs) {
4850 spare_pgs = pending_creates_from_mon = 0;
4851 } else {
4852 spare_pgs -= pending_creates_from_mon;
4853 pending_creates_from_mon = 0;
4854 }
4855 }
// For peer-initiated creates, force a re-peering by publishing a
// twiddled pg_temp for each pgid we can now afford.
4856 auto pg = pending_creates_from_osd.cbegin();
4857 while (spare_pgs > 0 && pg != pending_creates_from_osd.cend()) {
4858 dout(20) << __func__ << " pg " << pg->first << dendl;
4859 vector<int> acting;
4860 osdmap->pg_to_up_acting_osds(pg->first, nullptr, nullptr, &acting, nullptr);
4861 service.queue_want_pg_temp(pg->first, twiddle(acting), true);
4862 pg = pending_creates_from_osd.erase(pg);
4863 do_sub_pg_creates = true;
4864 spare_pgs--;
4865 }
4866 have_pending_creates = (pending_creates_from_mon > 0 ||
4867 !pending_creates_from_osd.empty());
4868 }
4869
4870 bool do_renew_subs = false;
4871 if (do_sub_pg_creates) {
4872 if (monc->sub_want("osd_pg_creates", last_pg_create_epoch, 0)) {
4873 dout(4) << __func__ << ": resolicit pg creates from mon since "
4874 << last_pg_create_epoch << dendl;
4875 do_renew_subs = true;
4876 }
4877 }
4878 version_t start = osdmap->get_epoch() + 1;
4879 if (have_pending_creates) {
4880 // don't miss any new osdmap deleting PGs
4881 if (monc->sub_want("osdmap", start, 0)) {
4882 dout(4) << __func__ << ": resolicit osdmap from mon since "
4883 << start << dendl;
4884 do_renew_subs = true;
4885 }
4886 } else if (do_sub_pg_creates) {
4887 // no need to subscribe the osdmap continuously anymore
4888 // once the pgtemp and/or mon_subscribe(pg_creates) is sent
4889 if (monc->sub_want_increment("osdmap", start, CEPH_SUBSCRIBE_ONETIME)) {
4890 dout(4) << __func__ << ": re-subscribe osdmap(onetime) since "
4891 << start << dendl;
4892 do_renew_subs = true;
4893 }
4894 }
4895
4896 if (do_renew_subs) {
4897 monc->renew_subs();
4898 }
4899
// Flush any pg_temp requests queued above to the mon.
4900 service.send_pg_temp();
4901 }
4902
// Construct the initial pg_history_t and PastIntervals for a newly created
// PG by replaying every osdmap epoch from `created` up to the current one
// and recording where the PG's interval, up set, or primary changed.
//
// pgid          - the PG being created
// created       - epoch the PG (and its pool) was created in
// created_stamp - creation time, used to seed the scrub stamps
// h             - out: populated history
// pi            - out: populated past intervals
4903 void OSD::build_initial_pg_history(
4904 spg_t pgid,
4905 epoch_t created,
4906 utime_t created_stamp,
4907 pg_history_t *h,
4908 PastIntervals *pi)
4909 {
4910 dout(10) << __func__ << " " << pgid << " created " << created << dendl;
4911 h->epoch_created = created;
4912 h->epoch_pool_created = created;
4913 h->same_interval_since = created;
4914 h->same_up_since = created;
4915 h->same_primary_since = created;
4916 h->last_scrub_stamp = created_stamp;
4917 h->last_deep_scrub_stamp = created_stamp;
4918 h->last_clean_scrub_stamp = created_stamp;
4919
// mapping as of the creation epoch; updated as we walk forward
4920 OSDMapRef lastmap = service.get_map(created);
4921 int up_primary, acting_primary;
4922 vector<int> up, acting;
4923 lastmap->pg_to_up_acting_osds(
4924 pgid.pgid, &up, &up_primary, &acting, &acting_primary);
4925
4926 ostringstream debug;
// NOTE: the loop-local `osdmap` deliberately shadows the OSD member of
// the same name; inside the loop it is the map for epoch e.
4927 for (epoch_t e = created + 1; e <= osdmap->get_epoch(); ++e) {
4928 OSDMapRef osdmap = service.get_map(e);
4929 int new_up_primary, new_acting_primary;
4930 vector<int> new_up, new_acting;
4931 osdmap->pg_to_up_acting_osds(
4932 pgid.pgid, &new_up, &new_up_primary, &new_acting, &new_acting_primary);
4933
4934 // this is a bit imprecise, but sufficient?
4935 struct min_size_predicate_t : public IsPGRecoverablePredicate {
4936 const pg_pool_t *pi;
4937 bool operator()(const set<pg_shard_t> &have) const {
4938 return have.size() >= pi->min_size;
4939 }
4940 explicit min_size_predicate_t(const pg_pool_t *i) : pi(i) {}
4941 } min_size_predicate(osdmap->get_pg_pool(pgid.pgid.pool()));
4942
// detect interval boundaries and append them to *pi
4943 bool new_interval = PastIntervals::check_new_interval(
4944 acting_primary,
4945 new_acting_primary,
4946 acting, new_acting,
4947 up_primary,
4948 new_up_primary,
4949 up, new_up,
4950 h->same_interval_since,
4951 h->last_epoch_clean,
4952 osdmap,
4953 lastmap,
4954 pgid.pgid,
4955 &min_size_predicate,
4956 pi,
4957 &debug);
4958 if (new_interval) {
4959 h->same_interval_since = e;
4960 if (up != new_up) {
4961 h->same_up_since = e;
4962 }
4963 if (acting_primary != new_acting_primary) {
4964 h->same_primary_since = e;
4965 }
// record pool pg_num splits that happened in this epoch
4966 if (pgid.pgid.is_split(lastmap->get_pg_num(pgid.pgid.pool()),
4967 osdmap->get_pg_num(pgid.pgid.pool()),
4968 nullptr)) {
4969 h->last_epoch_split = e;
4970 }
4971 up = new_up;
4972 acting = new_acting;
4973 up_primary = new_up_primary;
4974 acting_primary = new_acting_primary;
4975 }
4976 lastmap = osdmap;
4977 }
4978 dout(20) << __func__ << " " << debug.str() << dendl;
4979 dout(10) << __func__ << " " << *h << " " << *pi
4980 << " [" << (pi->empty() ? pair<epoch_t,epoch_t>(0,0) :
4981 pi->get_bounds()) << ")"
4982 << dendl;
4983 }
4984
// Register osd.p as a heartbeat peer (no-op for ourselves).  For a new
// peer this opens the back (cluster) and, when configured, front (public)
// heartbeat connections, and attaches a shared refcounted HeartbeatSession
// as the connections' priv so heartbeat_reset() can map a failed
// Connection back to the peer.  Existing peers just get their epoch
// refreshed so maybe_update_heartbeat_peers() won't treat them as extras.
// Caller must hold heartbeat_lock.
4985 void OSD::_add_heartbeat_peer(int p)
4986 {
4987 if (p == whoami)
4988 return;
4989 HeartbeatInfo *hi;
4990
4991 map<int,HeartbeatInfo>::iterator i = heartbeat_peers.find(p);
4992 if (i == heartbeat_peers.end()) {
// (back, front) connections; front may be absent if there is no
// separate public heartbeat network
4993 pair<ConnectionRef,ConnectionRef> cons = service.get_con_osd_hb(p, osdmap->get_epoch());
4994 if (!cons.first)
4995 return;
4996 hi = &heartbeat_peers[p];
4997 hi->peer = p;
// second arg false: we hand our initial ref to the smart pointer
4998 RefCountedPtr s{new HeartbeatSession{p}, false};
4999 hi->hb_interval_start = ceph_clock_now();
5000 hi->con_back = cons.first.get();
5001 hi->con_back->set_priv(s);
5002 if (cons.second) {
5003 hi->con_front = cons.second.get();
5004 hi->con_front->set_priv(s);
5005 dout(10) << "_add_heartbeat_peer: new peer osd." << p
5006 << " " << hi->con_back->get_peer_addr()
5007 << " " << hi->con_front->get_peer_addr()
5008 << dendl;
5009 } else {
5010 hi->con_front.reset(NULL);
5011 dout(10) << "_add_heartbeat_peer: new peer osd." << p
5012 << " " << hi->con_back->get_peer_addr()
5013 << dendl;
5014 }
5015 } else {
5016 hi = &i->second;
5017 }
// stamp with the current epoch; peers with an older epoch are pruning
// candidates in maybe_update_heartbeat_peers()
5018 hi->epoch = osdmap->get_epoch();
5019 }
5020
5021 void OSD::_remove_heartbeat_peer(int n)
5022 {
5023 map<int,HeartbeatInfo>::iterator q = heartbeat_peers.find(n);
5024 ceph_assert(q != heartbeat_peers.end());
5025 dout(20) << " removing heartbeat peer osd." << n
5026 << " " << q->second.con_back->get_peer_addr()
5027 << " " << (q->second.con_front ? q->second.con_front->get_peer_addr() : entity_addr_t())
5028 << dendl;
5029 q->second.con_back->mark_down();
5030 if (q->second.con_front) {
5031 q->second.con_front->mark_down();
5032 }
5033 heartbeat_peers.erase(q);
5034 }
5035
5036 void OSD::need_heartbeat_peer_update()
5037 {
5038 if (is_stopping())
5039 return;
5040 dout(20) << "need_heartbeat_peer_update" << dendl;
5041 heartbeat_set_peers_need_update();
5042 }
5043
// Rebuild the heartbeat peer set when it has been flagged stale (or force
// a resample every osd_heartbeat_grace seconds).  The final set is:
// PG peers + ring neighbors + enough OSDs from distinct failure-domain
// subtrees for mon failure reporting, topped up to
// osd_heartbeat_min_peers and trimmed of extras and down OSDs.
// Caller must hold osd_lock.
5044 void OSD::maybe_update_heartbeat_peers()
5045 {
5046 ceph_assert(osd_lock.is_locked());
5047
5048 if (is_waiting_for_healthy() || is_active()) {
5049 utime_t now = ceph_clock_now();
5050 if (last_heartbeat_resample == utime_t()) {
5051 last_heartbeat_resample = now;
5052 heartbeat_set_peers_need_update();
5053 } else if (!heartbeat_peers_need_update()) {
// periodically force a refresh even if nothing requested one
5054 utime_t dur = now - last_heartbeat_resample;
5055 if (dur > cct->_conf->osd_heartbeat_grace) {
5056 dout(10) << "maybe_update_heartbeat_peers forcing update after " << dur << " seconds" << dendl;
5057 heartbeat_set_peers_need_update();
5058 last_heartbeat_resample = now;
5059 // automatically clean up any stale heartbeat peers
5060 // if we are unhealthy, then clean all
5061 reset_heartbeat_peers(is_waiting_for_healthy());
5062 }
5063 }
5064 }
5065
5066 if (!heartbeat_peers_need_update())
5067 return;
5068 heartbeat_clear_peers_need_update();
5069
5070 std::lock_guard l(heartbeat_lock);
5071
5072 dout(10) << "maybe_update_heartbeat_peers updating" << dendl;
5073
5074
5075 // build heartbeat from set
5076 if (is_active()) {
5077 vector<PGRef> pgs;
5078 _get_pgs(&pgs);
5079 for (auto& pg : pgs) {
5080 pg->with_heartbeat_peers([&](int peer) {
5081 if (osdmap->is_up(peer)) {
5082 _add_heartbeat_peer(peer);
5083 }
5084 });
5085 }
5086 }
5087
5088 // include next and previous up osds to ensure we have a fully-connected set
5089 set<int> want, extras;
5090 const int next = osdmap->get_next_up_osd_after(whoami);
5091 if (next >= 0)
5092 want.insert(next);
5093 int prev = osdmap->get_previous_up_osd_before(whoami);
5094 if (prev >= 0 && prev != next)
5095 want.insert(prev);
5096
5097 // make sure we have at least **min_down** osds coming from different
5098 // subtree level (e.g., hosts) for fast failure detection.
5099 auto min_down = cct->_conf.get_val<uint64_t>("mon_osd_min_down_reporters");
5100 auto subtree = cct->_conf.get_val<string>("mon_osd_reporter_subtree_level");
5101 osdmap->get_random_up_osds_by_subtree(
5102 whoami, subtree, min_down, want, &want);
5103
// `want` peers are mandatory; they are also recorded in extras so the
// trimming loop below knows they were added by us, not by PGs
5104 for (set<int>::iterator p = want.begin(); p != want.end(); ++p) {
5105 dout(10) << " adding neighbor peer osd." << *p << dendl;
5106 extras.insert(*p);
5107 _add_heartbeat_peer(*p);
5108 }
5109
5110 // remove down peers; enumerate extras
5111 map<int,HeartbeatInfo>::iterator p = heartbeat_peers.begin();
5112 while (p != heartbeat_peers.end()) {
5113 if (!osdmap->is_up(p->first)) {
// advance before erasing; _remove_heartbeat_peer invalidates p's entry
5114 int o = p->first;
5115 ++p;
5116 _remove_heartbeat_peer(o);
5117 continue;
5118 }
// entries not refreshed this epoch were not re-added above: extras
5119 if (p->second.epoch < osdmap->get_epoch()) {
5120 extras.insert(p->first);
5121 }
5122 ++p;
5123 }
5124
5125 // too few?
// walk the up-OSD ring starting at `next` until we reach the minimum
5126 for (int n = next; n >= 0; ) {
5127 if ((int)heartbeat_peers.size() >= cct->_conf->osd_heartbeat_min_peers)
5128 break;
5129 if (!extras.count(n) && !want.count(n) && n != whoami) {
5130 dout(10) << " adding random peer osd." << n << dendl;
5131 extras.insert(n);
5132 _add_heartbeat_peer(n);
5133 }
5134 n = osdmap->get_next_up_osd_after(n);
5135 if (n == next)
5136 break; // came full circle; stop
5137 }
5138
5139 // too many?
5140 for (set<int>::iterator p = extras.begin();
5141 (int)heartbeat_peers.size() > cct->_conf->osd_heartbeat_min_peers && p != extras.end();
5142 ++p) {
5143 if (want.count(*p))
5144 continue;
5145 _remove_heartbeat_peer(*p);
5146 }
5147
5148 dout(10) << "maybe_update_heartbeat_peers " << heartbeat_peers.size() << " peers, extras " << extras << dendl;
5149 }
5150
// Drop heartbeat peers: all of them when `all` is true, otherwise only
// those that have not been heard from within osd_heartbeat_stale seconds.
// Removed peers have their connections closed and any queued failure
// report against them cancelled.  Caller must hold osd_lock.
5151 void OSD::reset_heartbeat_peers(bool all)
5152 {
5153 ceph_assert(osd_lock.is_locked());
5154 dout(10) << "reset_heartbeat_peers" << dendl;
// cutoff: anything last seen before (now - osd_heartbeat_stale) is stale
5155 utime_t stale = ceph_clock_now();
5156 stale -= cct->_conf.get_val<int64_t>("osd_heartbeat_stale");
5157 std::lock_guard l(heartbeat_lock);
5158 for (auto it = heartbeat_peers.begin(); it != heartbeat_peers.end();) {
5159 HeartbeatInfo& hi = it->second;
5160 if (all || hi.is_stale(stale)) {
5161 hi.con_back->mark_down();
5162 if (hi.con_front) {
5163 hi.con_front->mark_down();
5164 }
5165 // stop sending failure_report to mon too
5166 failure_queue.erase(it->first);
// post-increment erase keeps the iterator valid
5167 heartbeat_peers.erase(it++);
5168 } else {
5169 it++;
5170 }
5171 }
5172 }
5173
// Handle a heartbeat message (PING / PING_REPLY / YOU_DIED) from another
// OSD.  PING: reply and opportunistically share our osdmap.  PING_REPLY:
// account the ack against ping_history, maintain per-peer ping-time
// statistics, and cancel failure reports for peers that are healthy
// again.  YOU_DIED: fetch a newer osdmap to learn we were marked down.
// Consumes (puts) m.  Runs with heartbeat_lock held for the duration.
5174 void OSD::handle_osd_ping(MOSDPing *m)
5175 {
// ignore heartbeats from other clusters
5176 if (superblock.cluster_fsid != m->fsid) {
5177 dout(20) << "handle_osd_ping from " << m->get_source_inst()
5178 << " bad fsid " << m->fsid << " != " << superblock.cluster_fsid << dendl;
5179 m->put();
5180 return;
5181 }
5182
5183 int from = m->get_source().num();
5184
5185 heartbeat_lock.Lock();
5186 if (is_stopping()) {
5187 heartbeat_lock.Unlock();
5188 m->put();
5189 return;
5190 }
5191
5192 OSDMapRef curmap = service.get_osdmap();
5193 if (!curmap) {
5194 heartbeat_lock.Unlock();
5195 m->put();
5196 return;
5197 }
5198
5199 switch (m->op) {
5200
5201 case MOSDPing::PING:
5202 {
// test hook: probabilistically drop a run of incoming pings
// (osd_debug_drop_ping_probability / _duration)
5203 if (cct->_conf->osd_debug_drop_ping_probability > 0) {
5204 auto heartbeat_drop = debug_heartbeat_drops_remaining.find(from);
5205 if (heartbeat_drop != debug_heartbeat_drops_remaining.end()) {
5206 if (heartbeat_drop->second == 0) {
5207 debug_heartbeat_drops_remaining.erase(heartbeat_drop);
5208 } else {
5209 --heartbeat_drop->second;
5210 dout(5) << "Dropping heartbeat from " << from
5211 << ", " << heartbeat_drop->second
5212 << " remaining to drop" << dendl;
5213 break;
5214 }
5215 } else if (cct->_conf->osd_debug_drop_ping_probability >
5216 ((((double)(rand()%100))/100.0))) {
5217 heartbeat_drop =
5218 debug_heartbeat_drops_remaining.insert(std::make_pair(from,
5219 cct->_conf->osd_debug_drop_ping_duration)).first;
5220 dout(5) << "Dropping heartbeat from " << from
5221 << ", " << heartbeat_drop->second
5222 << " remaining to drop" << dendl;
5223 break;
5224 }
5225 }
5226
// don't vouch for our own liveness if our internal worker threads
// are stuck; silence makes the peer report us instead
5227 if (!cct->get_heartbeat_map()->is_healthy()) {
5228 dout(10) << "internal heartbeat not healthy, dropping ping request" << dendl;
5229 break;
5230 }
5231
// echo the sender's stamp back so it can match our reply to its ping
5232 Message *r = new MOSDPing(monc->get_fsid(),
5233 curmap->get_epoch(),
5234 MOSDPing::PING_REPLY, m->stamp,
5235 cct->_conf->osd_heartbeat_min_size);
5236 m->get_connection()->send_message(r);
5237
5238 if (curmap->is_up(from)) {
5239 service.note_peer_epoch(from, m->map_epoch);
5240 if (is_active()) {
5241 ConnectionRef con = service.get_con_osd_cluster(from, curmap->get_epoch());
5242 if (con) {
5243 service.share_map_peer(from, con.get());
5244 }
5245 }
5246 } else if (!curmap->exists(from) ||
5247 curmap->get_down_at(from) > m->map_epoch) {
5248 // tell them they have died
5249 Message *r = new MOSDPing(monc->get_fsid(),
5250 curmap->get_epoch(),
5251 MOSDPing::YOU_DIED,
5252 m->stamp,
5253 cct->_conf->osd_heartbeat_min_size);
5254 m->get_connection()->send_message(r);
5255 }
5256 }
5257 break;
5258
5259 case MOSDPing::PING_REPLY:
5260 {
5261 map<int,HeartbeatInfo>::iterator i = heartbeat_peers.find(from);
5262 if (i != heartbeat_peers.end()) {
// ping_history maps send-stamp -> (deadline, acks still expected);
// each ping expects one ack per connection (back and front)
5263 auto acked = i->second.ping_history.find(m->stamp);
5264 if (acked != i->second.ping_history.end()) {
5265 utime_t now = ceph_clock_now();
5266 int &unacknowledged = acked->second.second;
5267 if (m->get_connection() == i->second.con_back) {
5268 dout(25) << "handle_osd_ping got reply from osd." << from
5269 << " first_tx " << i->second.first_tx
5270 << " last_tx " << i->second.last_tx
5271 << " last_rx_back " << i->second.last_rx_back << " -> " << now
5272 << " last_rx_front " << i->second.last_rx_front
5273 << dendl;
5274 i->second.last_rx_back = now;
5275 ceph_assert(unacknowledged > 0);
5276 --unacknowledged;
5277 // if there is no front con, set both stamps.
5278 if (i->second.con_front == NULL) {
5279 i->second.last_rx_front = now;
5280 ceph_assert(unacknowledged > 0);
5281 --unacknowledged;
5282 }
5283 } else if (m->get_connection() == i->second.con_front) {
5284 dout(25) << "handle_osd_ping got reply from osd." << from
5285 << " first_tx " << i->second.first_tx
5286 << " last_tx " << i->second.last_tx
5287 << " last_rx_back " << i->second.last_rx_back
5288 << " last_rx_front " << i->second.last_rx_front << " -> " << now
5289 << dendl;
5290 i->second.last_rx_front = now;
5291 ceph_assert(unacknowledged > 0);
5292 --unacknowledged;
5293 }
5294
5295 if (unacknowledged == 0) {
5296 // succeeded in getting all replies
5297 dout(25) << "handle_osd_ping got all replies from osd." << from
5298 << " , erase pending ping(sent at " << m->stamp << ")"
5299 << " and older pending ping(s)"
5300 << dendl;
5301
// accumulate round-trip times (in usec) for this averaging window
5302 #define ROUND_S_TO_USEC(sec) (uint32_t)((sec) * 1000 * 1000 + 0.5)
5303 ++i->second.hb_average_count;
5304 uint32_t back_pingtime = ROUND_S_TO_USEC(i->second.last_rx_back - m->stamp);
5305 i->second.hb_total_back += back_pingtime;
5306 if (back_pingtime < i->second.hb_min_back)
5307 i->second.hb_min_back = back_pingtime;
5308 if (back_pingtime > i->second.hb_max_back)
5309 i->second.hb_max_back = back_pingtime;
5310 uint32_t front_pingtime = ROUND_S_TO_USEC(i->second.last_rx_front - m->stamp);
5311 i->second.hb_total_front += front_pingtime;
5312 if (front_pingtime < i->second.hb_min_front)
5313 i->second.hb_min_front = front_pingtime;
5314 if (front_pingtime > i->second.hb_max_front)
5315 i->second.hb_max_front = front_pingtime;
5316
// NOTE(review): the `if` below is unreachable — the assert on the
// previous line already guarantees hb_interval_start != utime_t()
5317 ceph_assert(i->second.hb_interval_start != utime_t());
5318 if (i->second.hb_interval_start == utime_t())
5319 i->second.hb_interval_start = now;
// averaging window: 60s, or debug_heartbeat_testing_span in tests
5320 int64_t hb_avg_time_period = 60;
5321 if (cct->_conf.get_val<int64_t>("debug_heartbeat_testing_span")) {
5322 hb_avg_time_period = cct->_conf.get_val<int64_t>("debug_heartbeat_testing_span");
5323 }
5324 if (now - i->second.hb_interval_start >= utime_t(hb_avg_time_period, 0)) {
// window complete: snapshot avg/min/max, then reset accumulators
5325 uint32_t back_avg = i->second.hb_total_back / i->second.hb_average_count;
5326 uint32_t back_min = i->second.hb_min_back;
5327 uint32_t back_max = i->second.hb_max_back;
5328 uint32_t front_avg = i->second.hb_total_front / i->second.hb_average_count;
5329 uint32_t front_min = i->second.hb_min_front;
5330 uint32_t front_max = i->second.hb_max_front;
5331
5332 // Reset for new interval
5333 i->second.hb_average_count = 0;
5334 i->second.hb_interval_start = now;
5335 i->second.hb_total_back = i->second.hb_max_back = 0;
5336 i->second.hb_min_back = UINT_MAX;
5337 i->second.hb_total_front = i->second.hb_max_front = 0;
5338 i->second.hb_min_front = UINT_MAX;
5339
5340 // Record per osd interace ping times
5341 // Based on osd_heartbeat_interval ignoring that it is randomly short than this interval
5342 if (i->second.hb_back_pingtime.size() == 0) {
// first completed window: seed the whole ring buffer with it
5343 ceph_assert(i->second.hb_front_pingtime.size() == 0);
5344 for (unsigned k = 0 ; k < hb_vector_size; ++k) {
5345 i->second.hb_back_pingtime.push_back(back_avg);
5346 i->second.hb_back_min.push_back(back_min);
5347 i->second.hb_back_max.push_back(back_max);
5348 i->second.hb_front_pingtime.push_back(front_avg);
5349 i->second.hb_front_min.push_back(front_min);
5350 i->second.hb_front_max.push_back(front_max);
5351 ++i->second.hb_index;
5352 }
5353 } else {
// ring buffer; hb_vector_size is a power of two, so & masks the index
5354 int index = i->second.hb_index & (hb_vector_size - 1);
5355 i->second.hb_back_pingtime[index] = back_avg;
5356 i->second.hb_back_min[index] = back_min;
5357 i->second.hb_back_max[index] = back_max;
5358 i->second.hb_front_pingtime[index] = front_avg;
5359 i->second.hb_front_min[index] = front_min;
5360 i->second.hb_front_max[index] = front_max;
5361 ++i->second.hb_index;
5362 }
5363
// publish 1/5/15-window aggregates into osd_stat for mon/mgr reporting
5364 {
5365 std::lock_guard l(service.stat_lock);
5366 service.osd_stat.hb_pingtime[from].last_update = now.sec();
5367 service.osd_stat.hb_pingtime[from].back_last = back_pingtime;
5368
5369 uint32_t total = 0;
5370 uint32_t min = UINT_MAX;
5371 uint32_t max = 0;
5372 uint32_t count = 0;
5373 uint32_t which = 0;
5374 uint32_t size = (uint32_t)i->second.hb_back_pingtime.size();
5375 for (int32_t k = size - 1 ; k >= 0; --k) {
5376 ++count;
5377 int index = (i->second.hb_index + k) % size;
5378 total += i->second.hb_back_pingtime[index];
5379 if (i->second.hb_back_min[index] < min)
5380 min = i->second.hb_back_min[index];
5381 if (i->second.hb_back_max[index] > max)
5382 max = i->second.hb_back_max[index];
5383 if (count == 1 || count == 5 || count == 15) {
5384 service.osd_stat.hb_pingtime[from].back_pingtime[which] = total / count;
5385 service.osd_stat.hb_pingtime[from].back_min[which] = min;
5386 service.osd_stat.hb_pingtime[from].back_max[which] = max;
5387 which++;
5388 if (count == 15)
5389 break;
5390 }
5391 }
5392
5393 if (i->second.con_front != NULL) {
5394 service.osd_stat.hb_pingtime[from].front_last = front_pingtime;
5395
5396 total = 0;
5397 min = UINT_MAX;
5398 max = 0;
5399 count = 0;
5400 which = 0;
5401 for (int32_t k = size - 1 ; k >= 0; --k) {
5402 ++count;
5403 int index = (i->second.hb_index + k) % size;
5404 total += i->second.hb_front_pingtime[index];
5405 if (i->second.hb_front_min[index] < min)
5406 min = i->second.hb_front_min[index];
5407 if (i->second.hb_front_max[index] > max)
5408 max = i->second.hb_front_max[index];
5409 if (count == 1 || count == 5 || count == 15) {
5410 service.osd_stat.hb_pingtime[from].front_pingtime[which] = total / count;
5411 service.osd_stat.hb_pingtime[from].front_min[which] = min;
5412 service.osd_stat.hb_pingtime[from].front_max[which] = max;
5413 which++;
5414 if (count == 15)
5415 break;
5416 }
5417 }
5418 }
5419 }
5420 } else {
// window not complete yet: only refresh the instantaneous values
5421 std::lock_guard l(service.stat_lock);
5422 service.osd_stat.hb_pingtime[from].back_last = back_pingtime;
5423 if (i->second.con_front != NULL)
5424 service.osd_stat.hb_pingtime[from].front_last = front_pingtime;
5425 }
// this ping and all older ones are now fully acknowledged
5426 i->second.ping_history.erase(i->second.ping_history.begin(), ++acked);
5427 }
5428
5429 if (i->second.is_healthy(now)) {
5430 // Cancel false reports
5431 auto failure_queue_entry = failure_queue.find(from);
5432 if (failure_queue_entry != failure_queue.end()) {
5433 dout(10) << "handle_osd_ping canceling queued "
5434 << "failure report for osd." << from << dendl;
5435 failure_queue.erase(failure_queue_entry);
5436 }
5437
// a report already sent to the mon needs an explicit retraction
5438 auto failure_pending_entry = failure_pending.find(from);
5439 if (failure_pending_entry != failure_pending.end()) {
5440 dout(10) << "handle_osd_ping canceling in-flight "
5441 << "failure report for osd." << from << dendl;
5442 send_still_alive(curmap->get_epoch(),
5443 from,
5444 failure_pending_entry->second.second);
5445 failure_pending.erase(failure_pending_entry);
5446 }
5447 }
5448 } else {
5449 // old replies, deprecated by newly sent pings.
5450 dout(10) << "handle_osd_ping no pending ping(sent at " << m->stamp
5451 << ") is found, treat as covered by newly sent pings "
5452 << "and ignore"
5453 << dendl;
5454 }
5455 }
5456
5457 if (m->map_epoch &&
5458 curmap->is_up(from)) {
5459 service.note_peer_epoch(from, m->map_epoch);
5460 if (is_active()) {
5461 ConnectionRef con = service.get_con_osd_cluster(from, curmap->get_epoch());
5462 if (con) {
5463 service.share_map_peer(from, con.get());
5464 }
5465 }
5466 }
5467 }
5468 break;
5469
5470 case MOSDPing::YOU_DIED:
// a peer's map says we are down; fetch newer maps so we notice
5471 dout(10) << "handle_osd_ping " << m->get_source_inst()
5472 << " says i am down in " << m->map_epoch << dendl;
5473 osdmap_subscribe(curmap->get_epoch()+1, false);
5474 break;
5475 }
5476
5477 heartbeat_lock.Unlock();
5478 m->put();
5479 }
5480
// Body of the heartbeat thread: repeatedly send pings via heartbeat()
// and sleep a randomized fraction of osd_heartbeat_interval (fixed
// interval when debug_disable_randomized_ping is set).  The lock_guard
// holds heartbeat_lock for the whole loop; WaitInterval releases it
// while sleeping and reacquires it on wakeup.
5481 void OSD::heartbeat_entry()
5482 {
5483 std::lock_guard l(heartbeat_lock);
5484 if (is_stopping())
5485 return;
5486 while (!heartbeat_stop) {
5487 heartbeat();
5488
// sleep 0.5-1.5x nominal interval to avoid synchronized ping bursts
5489 double wait;
5490 if (cct->_conf.get_val<bool>("debug_disable_randomized_ping")) {
5491 wait = (float)cct->_conf->osd_heartbeat_interval;
5492 } else {
5493 wait = .5 + ((float)(rand() % 10)/10.0) * (float)cct->_conf->osd_heartbeat_interval;
5494 }
5495 utime_t w;
5496 w.set_from_double(wait);
5497 dout(30) << "heartbeat_entry sleeping for " << wait << dendl;
5498 heartbeat_cond.WaitInterval(heartbeat_lock, w);
5499 if (is_stopping())
5500 return;
5501 dout(30) << "heartbeat_entry woke up" << dendl;
5502 }
5503 }
5504
5505 void OSD::heartbeat_check()
5506 {
5507 ceph_assert(heartbeat_lock.is_locked());
5508 utime_t now = ceph_clock_now();
5509
5510 // check for incoming heartbeats (move me elsewhere?)
5511 for (map<int,HeartbeatInfo>::iterator p = heartbeat_peers.begin();
5512 p != heartbeat_peers.end();
5513 ++p) {
5514
5515 if (p->second.first_tx == utime_t()) {
5516 dout(25) << "heartbeat_check we haven't sent ping to osd." << p->first
5517 << " yet, skipping" << dendl;
5518 continue;
5519 }
5520
5521 dout(25) << "heartbeat_check osd." << p->first
5522 << " first_tx " << p->second.first_tx
5523 << " last_tx " << p->second.last_tx
5524 << " last_rx_back " << p->second.last_rx_back
5525 << " last_rx_front " << p->second.last_rx_front
5526 << dendl;
5527 if (p->second.is_unhealthy(now)) {
5528 utime_t oldest_deadline = p->second.ping_history.begin()->second.first;
5529 if (p->second.last_rx_back == utime_t() ||
5530 p->second.last_rx_front == utime_t()) {
5531 derr << "heartbeat_check: no reply from "
5532 << p->second.con_front->get_peer_addr().get_sockaddr()
5533 << " osd." << p->first
5534 << " ever on either front or back, first ping sent "
5535 << p->second.first_tx
5536 << " (oldest deadline " << oldest_deadline << ")"
5537 << dendl;
5538 // fail
5539 failure_queue[p->first] = p->second.first_tx;
5540 } else {
5541 derr << "heartbeat_check: no reply from "
5542 << p->second.con_front->get_peer_addr().get_sockaddr()
5543 << " osd." << p->first << " since back " << p->second.last_rx_back
5544 << " front " << p->second.last_rx_front
5545 << " (oldest deadline " << oldest_deadline << ")"
5546 << dendl;
5547 // fail
5548 failure_queue[p->first] = std::min(p->second.last_rx_back, p->second.last_rx_front);
5549 }
5550 }
5551 }
5552 }
5553
// Send one round of pings to every heartbeat peer (back and, when
// present, front connection), refresh load/usage stats, and record each
// ping in ping_history with its ack deadline so heartbeat_check() can
// detect silence.  Caller (heartbeat_entry) must hold heartbeat_lock.
5554 void OSD::heartbeat()
5555 {
5556 ceph_assert(heartbeat_lock.is_locked_by_me());
5557 dout(30) << "heartbeat" << dendl;
5558
5559 // get CPU load avg
// daily_loadavg is an exponential-style average over ~one day's worth
// of heartbeat samples (86400s / interval)
5560 double loadavgs[1];
5561 int hb_interval = cct->_conf->osd_heartbeat_interval;
5562 int n_samples = 86400;
5563 if (hb_interval > 1) {
5564 n_samples /= hb_interval;
5565 if (n_samples < 1)
5566 n_samples = 1;
5567 }
5568
5569 if (getloadavg(loadavgs, 1) == 1) {
5570 logger->set(l_osd_loadavg, 100 * loadavgs[0]);
5571 daily_loadavg = (daily_loadavg * (n_samples - 1) + loadavgs[0]) / n_samples;
5572 dout(30) << "heartbeat: daily_loadavg " << daily_loadavg << dendl;
5573 }
5574
5575 dout(30) << "heartbeat checking stats" << dendl;
5576
5577 // refresh peer list and osd stats
5578 vector<int> hb_peers;
5579 for (map<int,HeartbeatInfo>::iterator p = heartbeat_peers.begin();
5580 p != heartbeat_peers.end();
5581 ++p)
5582 hb_peers.push_back(p->first);
5583
5584 auto new_stat = service.set_osd_stat(hb_peers, get_num_pgs());
5585 dout(5) << __func__ << " " << new_stat << dendl;
5586 ceph_assert(new_stat.statfs.total);
5587
// re-evaluate nearfull/full state from the fresh stats
5588 float pratio;
5589 float ratio = service.compute_adjusted_ratio(new_stat, &pratio);
5590
5591 service.check_full_status(ratio, pratio);
5592
5593 utime_t now = ceph_clock_now();
5594 utime_t deadline = now;
5595 deadline += cct->_conf->osd_heartbeat_grace;
5596
5597 // send heartbeats
5598 for (map<int,HeartbeatInfo>::iterator i = heartbeat_peers.begin();
5599 i != heartbeat_peers.end();
5600 ++i) {
5601 int peer = i->first;
5602 i->second.last_tx = now;
5603 if (i->second.first_tx == utime_t())
5604 i->second.first_tx = now;
// expect HEARTBEAT_MAX_CONN acks (one per connection) by `deadline`
5605 i->second.ping_history[now] = make_pair(deadline,
5606 HeartbeatInfo::HEARTBEAT_MAX_CONN);
5607 if (i->second.hb_interval_start == utime_t())
5608 i->second.hb_interval_start = now;
5609 dout(30) << "heartbeat sending ping to osd." << peer << dendl;
5610 i->second.con_back->send_message(new MOSDPing(monc->get_fsid(),
5611 service.get_osdmap_epoch(),
5612 MOSDPing::PING, now,
5613 cct->_conf->osd_heartbeat_min_size));
5614
5615 if (i->second.con_front)
5616 i->second.con_front->send_message(new MOSDPing(monc->get_fsid(),
5617 service.get_osdmap_epoch(),
5618 MOSDPing::PING, now,
5619 cct->_conf->osd_heartbeat_min_size));
5620 }
5621
5622 logger->set(l_osd_hb_to, heartbeat_peers.size());
5623
5624 // hmm.. am i all alone?
// with no peers we can't learn of map changes from pings, so poll the
// mon for a newer map periodically
5625 dout(30) << "heartbeat lonely?" << dendl;
5626 if (heartbeat_peers.empty()) {
5627 if (now - last_mon_heartbeat > cct->_conf->osd_mon_heartbeat_interval && is_active()) {
5628 last_mon_heartbeat = now;
5629 dout(10) << "i have no heartbeat peers; checking mon for new map" << dendl;
5630 osdmap_subscribe(osdmap->get_epoch() + 1, false);
5631 }
5632 }
5633
5634 dout(30) << "heartbeat done" << dendl;
5635 }
5636
// Messenger callback for a reset heartbeat connection.  Uses the
// HeartbeatSession stashed in the connection's priv to find the peer,
// then replaces BOTH of the peer's heartbeat connections with fresh ones
// (clearing ping_history, since in-flight pings are lost).  If no new
// connection can be opened — the peer vanished in a newer osdmap — the
// peer is dropped entirely.  Always returns true (event handled).
5637 bool OSD::heartbeat_reset(Connection *con)
5638 {
5639 std::lock_guard l(heartbeat_lock);
5640 auto s = con->get_priv();
// detach the session from the dead connection
5641 con->set_priv(nullptr);
5642 if (s) {
5643 if (is_stopping()) {
5644 return true;
5645 }
5646 auto heartbeat_session = static_cast<HeartbeatSession*>(s.get());
5647 auto p = heartbeat_peers.find(heartbeat_session->peer);
// only act if this connection is still the peer's current back/front
// con; otherwise it is a stale connection we already replaced
5648 if (p != heartbeat_peers.end() &&
5649 (p->second.con_back == con ||
5650 p->second.con_front == con)) {
5651 dout(10) << "heartbeat_reset failed hb con " << con << " for osd." << p->second.peer
5652 << ", reopening" << dendl;
// tear down the surviving sibling connection too so both get rebuilt
5653 if (con != p->second.con_back) {
5654 p->second.con_back->mark_down();
5655 }
5656 p->second.con_back.reset(NULL);
5657 if (p->second.con_front && con != p->second.con_front) {
5658 p->second.con_front->mark_down();
5659 }
5660 p->second.con_front.reset(NULL);
5661 pair<ConnectionRef,ConnectionRef> newcon = service.get_con_osd_hb(p->second.peer, p->second.epoch);
5662 if (newcon.first) {
5663 p->second.con_back = newcon.first.get();
5664 p->second.con_back->set_priv(s);
5665 if (newcon.second) {
5666 p->second.con_front = newcon.second.get();
5667 p->second.con_front->set_priv(s);
5668 }
5669 p->second.ping_history.clear();
5670 } else {
5671 dout(10) << "heartbeat_reset failed hb con " << con << " for osd." << p->second.peer
5672 << ", raced with osdmap update, closing out peer" << dendl;
5673 heartbeat_peers.erase(p);
5674 }
5675 } else {
5676 dout(10) << "heartbeat_reset closing (old) failed hb con " << con << dendl;
5677 }
5678 }
5679 return true;
5680 }
5681
5682
5683
5684 // =========================================
5685
// Periodic timer callback that runs WITH osd_lock held: refresh the
// heartbeat peer set, retry boot while waiting to become healthy, poll
// the mon for new maps while booting, and drain deferred waiters.
// Re-arms itself via tick_timer.
5686 void OSD::tick()
5687 {
5688 ceph_assert(osd_lock.is_locked());
5689 dout(10) << "tick" << dendl;
5690
5691 if (is_active() || is_waiting_for_healthy()) {
5692 maybe_update_heartbeat_peers();
5693 }
5694
// keep retrying boot until our internal heartbeats look healthy
5695 if (is_waiting_for_healthy()) {
5696 start_boot();
5697 }
5698
5699 if (is_waiting_for_healthy() || is_booting()) {
5700 std::lock_guard l(heartbeat_lock);
5701 utime_t now = ceph_clock_now();
5702 if (now - last_mon_heartbeat > cct->_conf->osd_mon_heartbeat_interval) {
5703 last_mon_heartbeat = now;
5704 dout(1) << __func__ << " checking mon for new map" << dendl;
5705 osdmap_subscribe(osdmap->get_epoch() + 1, false);
5706 }
5707 }
5708
5709 do_waiters();
5710
// schedule the next tick
5711 tick_timer.add_event_after(get_tick_interval(), new C_Tick(this));
5712 }
5713
// Periodic timer callback that deliberately runs WITHOUT osd_lock:
// refresh statfs/crc counters, run heartbeat_check, send periodic mon
// reports (fullness, failures), request newer maps if shards are
// waiting on one, and drive scrub scheduling / beacons / mgr health.
// Re-arms itself via tick_timer_without_osd_lock.
5714 void OSD::tick_without_osd_lock()
5715 {
5716 ceph_assert(tick_timer_lock.is_locked());
5717 dout(10) << "tick_without_osd_lock" << dendl;
5718
5719 logger->set(l_osd_cached_crc, buffer::get_cached_crc());
5720 logger->set(l_osd_cached_crc_adjusted, buffer::get_cached_crc_adjusted());
5721 logger->set(l_osd_missed_crc, buffer::get_missed_crc());
5722
5723 // refresh osd stats
5724 struct store_statfs_t stbuf;
5725 osd_alert_list_t alerts;
5726 int r = store->statfs(&stbuf, &alerts);
5727 ceph_assert(r == 0);
5728 service.set_statfs(stbuf, alerts);
5729
5730 // osd_lock is not being held, which means the OSD state
5731 // might change when doing the monitor report
5732 if (is_active() || is_waiting_for_healthy()) {
5733 heartbeat_lock.Lock();
5734 heartbeat_check();
5735 heartbeat_lock.Unlock();
5736
// mon reporting happens under map_lock (read) + mon_report_lock
5737 map_lock.get_read();
5738 std::lock_guard l(mon_report_lock);
5739
5740 // mon report?
5741 utime_t now = ceph_clock_now();
5742 if (service.need_fullness_update() ||
5743 now - last_mon_report > cct->_conf->osd_mon_report_interval) {
5744 last_mon_report = now;
5745 send_full_update();
5746 send_failures();
5747 }
5748 map_lock.put_read();
5749
// if any op shard queued work for a future epoch, go fetch it
5750 epoch_t max_waiting_epoch = 0;
5751 for (auto s : shards) {
5752 max_waiting_epoch = std::max(max_waiting_epoch,
5753 s->get_max_waiting_epoch());
5754 }
5755 if (max_waiting_epoch > get_osdmap()->get_epoch()) {
5756 dout(20) << __func__ << " max_waiting_epoch " << max_waiting_epoch
5757 << ", requesting new map" << dendl;
5758 osdmap_subscribe(superblock.newest_map + 1, false);
5759 }
5760 }
5761
5762 if (is_active()) {
5763 if (!scrub_random_backoff()) {
5764 sched_scrub();
5765 }
5766 service.promote_throttle_recalibrate();
5767 resume_creating_pg();
// send a beacon if osd_beacon_report_interval has elapsed
5768 bool need_send_beacon = false;
5769 const auto now = ceph::coarse_mono_clock::now();
5770 {
5771 // borrow lec lock to pretect last_sent_beacon from changing
5772 std::lock_guard l{min_last_epoch_clean_lock};
5773 const auto elapsed = now - last_sent_beacon;
5774 if (chrono::duration_cast<chrono::seconds>(elapsed).count() >
5775 cct->_conf->osd_beacon_report_interval) {
5776 need_send_beacon = true;
5777 }
5778 }
5779 if (need_send_beacon) {
5780 send_beacon(now);
5781 }
5782 }
5783
5784 mgrc.update_daemon_health(get_health_metrics());
5785 service.kick_recovery_queue();
// schedule the next lock-free tick
5786 tick_timer_without_osd_lock.add_event_after(get_tick_interval(),
5787 new C_Tick_WithoutOSDLock(this));
5788 }
5789
// Usage:
//  setomapval <pool-id> [namespace/]<obj-name> <key> <val>
//  rmomapkey <pool-id> [namespace/]<obj-name> <key>
//  setomapheader <pool-id> [namespace/]<obj-name> <header>
//  getomap <pool> [namespace/]<obj-name>
//  truncobj <pool-id> [namespace/]<obj-name> <newlen>
//  injectmdataerr [namespace/]<obj-name> [shardid]
//  injectdataerr [namespace/]<obj-name> [shardid]
//
//  set_recovery_delay [utime]
//
// Admin-socket entry point for test/debug commands.  These bypass the
// normal op path and mutate the object store (or OSD service state)
// directly, so they are strictly for testing.  Errors and results are
// reported through 'ss'; the function returns nothing.
void TestOpsSocketHook::test_ops(OSDService *service, ObjectStore *store,
                                 std::string_view command,
                                 const cmdmap_t& cmdmap, ostream &ss)
{
  //Test support
  //Support changing the omap on a single osd by using the Admin Socket to
  //directly request the osd make a change.
  if (command == "setomapval" || command == "rmomapkey" ||
      command == "setomapheader" || command == "getomap" ||
      command == "truncobj" || command == "injectmdataerr" ||
      command == "injectdataerr"
    ) {
    pg_t rawpg;
    int64_t pool;
    OSDMapRef curmap = service->get_osdmap();
    int r = -1;

    string poolstr;

    cmd_getval(service->cct, cmdmap, "pool", poolstr);
    pool = curmap->lookup_pg_pool_name(poolstr);
    //If we can't find it by name then maybe id specified
    if (pool < 0 && isdigit(poolstr[0]))
      pool = atoll(poolstr.c_str());
    if (pool < 0) {
      ss << "Invalid pool '" << poolstr << "''";
      return;
    }

    // objname may carry a "namespace/" prefix; split it off if present.
    string objname, nspace;
    cmd_getval(service->cct, cmdmap, "objname", objname);
    std::size_t found = objname.find_first_of('/');
    if (found != string::npos) {
      nspace = objname.substr(0, found);
      objname = objname.substr(found+1);
    }
    object_locator_t oloc(pool, nspace);
    r = curmap->object_locator_to_pg(object_t(objname), oloc, rawpg);

    if (r < 0) {
      ss << "Invalid namespace/objname";
      return;
    }

    // Optional shard id (for EC pools); defaults to NO_SHARD.
    int64_t shardid;
    cmd_getval(service->cct, cmdmap, "shardid", shardid, int64_t(shard_id_t::NO_SHARD));
    hobject_t obj(object_t(objname), string(""), CEPH_NOSNAP, rawpg.ps(), pool, nspace);
    ghobject_t gobj(obj, ghobject_t::NO_GEN, shard_id_t(uint8_t(shardid)));
    spg_t pgid(curmap->raw_pg_to_pg(rawpg), shard_id_t(shardid));
    // Only the error-injection commands are meaningful on EC pools;
    // the omap/truncate commands assume a replicated layout.
    if (curmap->pg_is_ec(rawpg)) {
      if ((command != "injectdataerr") && (command != "injectmdataerr")) {
        ss << "Must not call on ec pool, except injectdataerr or injectmdataerr";
        return;
      }
    }

    ObjectStore::Transaction t;

    if (command == "setomapval") {
      map<string, bufferlist> newattrs;
      bufferlist val;
      string key, valstr;
      cmd_getval(service->cct, cmdmap, "key", key);
      cmd_getval(service->cct, cmdmap, "val", valstr);

      val.append(valstr);
      newattrs[key] = val;
      t.omap_setkeys(coll_t(pgid), ghobject_t(obj), newattrs);
      r = store->queue_transaction(service->meta_ch, std::move(t));
      if (r < 0)
        ss << "error=" << r;
      else
        ss << "ok";
    } else if (command == "rmomapkey") {
      string key;
      set<string> keys;
      cmd_getval(service->cct, cmdmap, "key", key);

      keys.insert(key);
      t.omap_rmkeys(coll_t(pgid), ghobject_t(obj), keys);
      r = store->queue_transaction(service->meta_ch, std::move(t));
      if (r < 0)
        ss << "error=" << r;
      else
        ss << "ok";
    } else if (command == "setomapheader") {
      bufferlist newheader;
      string headerstr;

      cmd_getval(service->cct, cmdmap, "header", headerstr);
      newheader.append(headerstr);
      t.omap_setheader(coll_t(pgid), ghobject_t(obj), newheader);
      r = store->queue_transaction(service->meta_ch, std::move(t));
      if (r < 0)
        ss << "error=" << r;
      else
        ss << "ok";
    } else if (command == "getomap") {
      //Debug: Output entire omap
      bufferlist hdrbl;
      map<string, bufferlist> keyvals;
      auto ch = store->open_collection(coll_t(pgid));
      if (!ch) {
        ss << "unable to open collection for " << pgid;
        r = -ENOENT;
      } else {
        r = store->omap_get(ch, ghobject_t(obj), &hdrbl, &keyvals);
        if (r >= 0) {
          ss << "header=" << string(hdrbl.c_str(), hdrbl.length());
          for (map<string, bufferlist>::iterator it = keyvals.begin();
               it != keyvals.end(); ++it)
            ss << " key=" << (*it).first << " val="
               << string((*it).second.c_str(), (*it).second.length());
        } else {
          ss << "error=" << r;
        }
      }
    } else if (command == "truncobj") {
      int64_t trunclen;
      cmd_getval(service->cct, cmdmap, "len", trunclen);
      t.truncate(coll_t(pgid), ghobject_t(obj), trunclen);
      r = store->queue_transaction(service->meta_ch, std::move(t));
      if (r < 0)
        ss << "error=" << r;
      else
        ss << "ok";
    } else if (command == "injectdataerr") {
      store->inject_data_error(gobj);
      ss << "ok";
    } else if (command == "injectmdataerr") {
      store->inject_mdata_error(gobj);
      ss << "ok";
    }
    return;
  }
  if (command == "set_recovery_delay") {
    // Route the value through the config subsystem so normal observers fire.
    int64_t delay;
    cmd_getval(service->cct, cmdmap, "utime", delay, (int64_t)0);
    ostringstream oss;
    oss << delay;
    int r = service->cct->_conf.set_val("osd_recovery_delay_start",
                                        oss.str().c_str());
    if (r != 0) {
      ss << "set_recovery_delay: error setting "
         << "osd_recovery_delay_start to '" << delay << "': error "
         << r;
      return;
    }
    service->cct->_conf.apply_changes(nullptr);
    ss << "set_recovery_delay: set osd_recovery_delay_start "
       << "to " << service->cct->_conf->osd_recovery_delay_start;
    return;
  }
  if (command == "trigger_scrub" || command == "trigger_deep_scrub") {
    spg_t pgid;
    bool deep = (command == "trigger_deep_scrub");
    OSDMapRef curmap = service->get_osdmap();

    string pgidstr;

    cmd_getval(service->cct, cmdmap, "pgid", pgidstr);
    if (!pgid.parse(pgidstr.c_str())) {
      ss << "Invalid pgid specified";
      return;
    }

    // Optional age (seconds); 0 means "use the pool/global max interval".
    int64_t time;
    cmd_getval(service->cct, cmdmap, "time", time, (int64_t)0);

    // _lookup_lock_pg returns the PG locked; unlock on every exit below.
    PGRef pg = service->osd->_lookup_lock_pg(pgid);
    if (pg == nullptr) {
      ss << "Can't find pg " << pgid;
      return;
    }

    if (pg->is_primary()) {
      pg->unreg_next_scrub();
      const pg_pool_t *p = curmap->get_pg_pool(pgid.pool());
      double pool_scrub_max_interval = 0;
      double scrub_max_interval;
      if (deep) {
        p->opts.get(pool_opts_t::DEEP_SCRUB_INTERVAL, &pool_scrub_max_interval);
        scrub_max_interval = pool_scrub_max_interval > 0 ?
          pool_scrub_max_interval : g_conf()->osd_deep_scrub_interval;
      } else {
        p->opts.get(pool_opts_t::SCRUB_MAX_INTERVAL, &pool_scrub_max_interval);
        scrub_max_interval = pool_scrub_max_interval > 0 ?
          pool_scrub_max_interval : g_conf()->osd_scrub_max_interval;
      }
      // Instead of marking must_scrub force a schedule scrub
      utime_t stamp = ceph_clock_now();
      if (time == 0)
        stamp -= scrub_max_interval;
      else
        stamp -= (float)time;
      stamp -= 100.0;  // push back last scrub more for good measure
      if (deep) {
        pg->set_last_deep_scrub_stamp(stamp);
      } else {
        pg->set_last_scrub_stamp(stamp);
      }
      pg->reg_next_scrub();
      pg->publish_stats_to_osd();
      ss << "ok - set" << (deep ? " deep" : "" ) << " stamp " << stamp;
    } else {
      ss << "Not primary";
    }
    pg->unlock();
    return;
  }
  if (command == "injectfull") {
    int64_t count;
    string type;
    OSDService::s_names state;
    cmd_getval(service->cct, cmdmap, "type", type, string("full"));
    cmd_getval(service->cct, cmdmap, "count", count, (int64_t)-1);
    // A zero count (or explicit "none") clears the injected fullness.
    if (type == "none" || count == 0) {
      type = "none";
      count = 0;
    }
    state = service->get_full_state(type);
    if (state == OSDService::s_names::INVALID) {
      ss << "Invalid type use (none, nearfull, backfillfull, full, failsafe)";
      return;
    }
    service->set_injectfull(state, count);
    return;
  }
  ss << "Internal error - command=" << command;
}
6030
6031 // =========================================
6032
// Messenger callback: a connection we initiated is (re)established.
// We only care about the mon connection here: a new mon session means
// all previously reported state (fullness, pg_temp, failures, ...) has
// been forgotten by the mon and must be resent.
void OSD::ms_handle_connect(Connection *con)
{
  dout(10) << __func__ << " con " << con << dendl;
  if (con->get_peer_type() == CEPH_ENTITY_TYPE_MON) {
    std::lock_guard l(osd_lock);
    if (is_stopping())
      return;
    dout(10) << __func__ << " on mon" << dendl;

    if (is_preboot()) {
      // restart the boot handshake from the version probe
      start_boot();
    } else if (is_booting()) {
      _send_boot();  // resend boot message
    } else {
      // lock order: osd_lock -> map_lock (read) -> mon_report_lock
      map_lock.get_read();
      std::lock_guard l2(mon_report_lock);

      utime_t now = ceph_clock_now();
      last_mon_report = now;

      // resend everything, it's a new session
      send_full_update();
      send_alive();
      service.requeue_pg_temp();
      service.clear_sent_ready_to_merge();
      service.send_pg_temp();
      service.send_ready_to_merge();
      service.send_pg_created();
      requeue_failures();
      send_failures();

      map_lock.put_read();
      if (is_active()) {
        send_beacon(ceph::coarse_mono_clock::now());
      }
    }

    // full map requests may happen while active or pre-boot
    if (requested_full_first) {
      rerequest_full_maps();
    }
  }
}
6076
6077 void OSD::ms_handle_fast_connect(Connection *con)
6078 {
6079 if (con->get_peer_type() != CEPH_ENTITY_TYPE_MON &&
6080 con->get_peer_type() != CEPH_ENTITY_TYPE_MGR) {
6081 auto priv = con->get_priv();
6082 auto s = static_cast<Session*>(priv.get());
6083 if (!s) {
6084 s = new Session{cct, con};
6085 con->set_priv(RefCountedPtr{s, false});
6086 dout(10) << " new session (outgoing) " << s << " con=" << s->con
6087 << " addr=" << s->con->get_peer_addr() << dendl;
6088 // we don't connect to clients
6089 ceph_assert(con->get_peer_type() == CEPH_ENTITY_TYPE_OSD);
6090 s->entity_name.set_type(CEPH_ENTITY_TYPE_OSD);
6091 }
6092 }
6093 }
6094
6095 void OSD::ms_handle_fast_accept(Connection *con)
6096 {
6097 if (con->get_peer_type() != CEPH_ENTITY_TYPE_MON &&
6098 con->get_peer_type() != CEPH_ENTITY_TYPE_MGR) {
6099 auto priv = con->get_priv();
6100 auto s = static_cast<Session*>(priv.get());
6101 if (!s) {
6102 s = new Session{cct, con};
6103 con->set_priv(RefCountedPtr{s, false});
6104 dout(10) << "new session (incoming)" << s << " con=" << con
6105 << " addr=" << con->get_peer_addr()
6106 << " must have raced with connect" << dendl;
6107 ceph_assert(con->get_peer_type() == CEPH_ENTITY_TYPE_OSD);
6108 s->entity_name.set_type(CEPH_ENTITY_TYPE_OSD);
6109 }
6110 }
6111 }
6112
// Messenger callback: the remote side reset the connection.  Detach the
// session from the connection and clean up session-scoped state.
// Returns true if we owned a session on this connection.
bool OSD::ms_handle_reset(Connection *con)
{
  auto s = con->get_priv();
  auto session = static_cast<Session*>(s.get());
  dout(2) << "ms_handle_reset con " << con << " session " << session << dendl;
  if (!session)
    return false;
  session->wstate.reset(con);
  session->con->set_priv(nullptr);
  session->con.reset();  // break con <-> session ref cycle
  // note that we break session->con *before* the session_handle_reset
  // cleanup below. this avoids a race between us and
  // PG::add_backoff, Session::check_backoff, etc.
  session_handle_reset(SessionRef{session});
  return true;
}
6129
// Messenger callback: the peer actively refused our connection (e.g.
// ECONNREFUSED).  If fast-fail is enabled and the peer is an OSD that
// the map still considers up, report it failed to the mon immediately
// rather than waiting for the heartbeat grace to expire.
bool OSD::ms_handle_refused(Connection *con)
{
  if (!cct->_conf->osd_fast_fail_on_connection_refused)
    return false;

  auto priv = con->get_priv();
  auto session = static_cast<Session*>(priv.get());
  dout(2) << "ms_handle_refused con " << con << " session " << session << dendl;
  if (!session)
    return false;
  int type = con->get_peer_type();
  // handle only OSD failures here
  if (monc && (type == CEPH_ENTITY_TYPE_OSD)) {
    OSDMapRef osdmap = get_osdmap();
    if (osdmap) {
      int id = osdmap->identify_osd_on_all_channels(con->get_peer_addr());
      if (id >= 0 && osdmap->is_up(id)) {
        // I'm cheating mon heartbeat grace logic, because we know it's not going
        // to respawn alone. +1 so we won't hit any boundary case.
        monc->send_mon_message(
          new MOSDFailure(
            monc->get_fsid(),
            id,
            osdmap->get_addrs(id),
            cct->_conf->osd_heartbeat_grace + 1,
            osdmap->get_epoch(),
            MOSDFailure::FLAG_IMMEDIATE | MOSDFailure::FLAG_FAILED
            ));
      }
    }
  }
  return true;
}
6163
6164 struct C_OSD_GetVersion : public Context {
6165 OSD *osd;
6166 uint64_t oldest, newest;
6167 explicit C_OSD_GetVersion(OSD *o) : osd(o), oldest(0), newest(0) {}
6168 void finish(int r) override {
6169 if (r >= 0)
6170 osd->_got_mon_epochs(oldest, newest);
6171 }
6172 };
6173
// Begin the boot sequence: if we look healthy, enter PREBOOT and ask the
// mon which osdmap epochs it has; the C_OSD_GetVersion completion then
// continues via _got_mon_epochs -> _preboot.  If we are unhealthy, defer
// and keep heartbeating until enough peers respond.
void OSD::start_boot()
{
  if (!_is_healthy()) {
    // if we are not healthy, do not mark ourselves up (yet)
    dout(1) << "not healthy; waiting to boot" << dendl;
    if (!is_waiting_for_healthy())
      start_waiting_for_healthy();
    // send pings sooner rather than later
    heartbeat_kick();
    return;
  }
  dout(1) << __func__ << dendl;
  set_state(STATE_PREBOOT);
  dout(10) << "start_boot - have maps " << superblock.oldest_map
           << ".." << superblock.newest_map << dendl;
  C_OSD_GetVersion *c = new C_OSD_GetVersion(this);
  monc->get_version("osdmap", &c->newest, &c->oldest, c);
}
6192
6193 void OSD::_got_mon_epochs(epoch_t oldest, epoch_t newest)
6194 {
6195 std::lock_guard l(osd_lock);
6196 if (is_preboot()) {
6197 _preboot(oldest, newest);
6198 }
6199 }
6200
// Continue the boot sequence given the mon's osdmap epoch range
// [oldest..newest].  Either (a) a precondition blocks booting and we log
// why, (b) our map is recent enough and we queue _send_boot on the boot
// finisher, or (c) we subscribe to newer maps and wait to be called again.
// Called with osd_lock held.
void OSD::_preboot(epoch_t oldest, epoch_t newest)
{
  ceph_assert(is_preboot());
  dout(10) << __func__ << " _preboot mon has osdmaps "
           << oldest << ".." << newest << dendl;

  // ensure our local fullness awareness is accurate
  {
    std::lock_guard l(heartbeat_lock);
    heartbeat();
  }

  // if our map within recent history, try to add ourselves to the osdmap.
  if (osdmap->get_epoch() == 0) {
    derr << "waiting for initial osdmap" << dendl;
  } else if (osdmap->is_destroyed(whoami)) {
    derr << "osdmap says I am destroyed" << dendl;
    // provide a small margin so we don't livelock seeing if we
    // un-destroyed ourselves.
    if (osdmap->get_epoch() > newest - 1) {
      exit(0);
    }
  } else if (osdmap->is_noup(whoami)) {
    derr << "osdmap NOUP flag is set, waiting for it to clear" << dendl;
  } else if (!osdmap->test_flag(CEPH_OSDMAP_SORTBITWISE)) {
    derr << "osdmap SORTBITWISE OSDMap flag is NOT set; please set it"
         << dendl;
  } else if (osdmap->require_osd_release < CEPH_RELEASE_LUMINOUS) {
    derr << "osdmap require_osd_release < luminous; please upgrade to luminous"
         << dendl;
  } else if (service.need_fullness_update()) {
    derr << "osdmap fullness state needs update" << dendl;
    send_full_update();
  } else if (osdmap->get_epoch() >= oldest - 1 &&
             osdmap->get_epoch() + cct->_conf->osd_map_message_max > newest) {

    // wait for pgs to fully catch up in a different thread, since
    // this thread might be required for splitting and merging PGs to
    // make progress.
    boot_finisher.queue(
      new FunctionContext(
        [this](int r) {
          std::lock_guard l(osd_lock);
          if (is_preboot()) {
            dout(10) << __func__ << " waiting for peering work to drain"
                     << dendl;
            // drop osd_lock while blocking so PG work can progress
            osd_lock.Unlock();
            for (auto shard : shards) {
              shard->wait_min_pg_epoch(osdmap->get_epoch());
            }
            osd_lock.Lock();
          }
          // re-check: state may have changed while osd_lock was dropped
          if (is_preboot()) {
            _send_boot();
          }
        }));
    return;
  }

  // get all the latest maps
  if (osdmap->get_epoch() + 1 >= oldest)
    osdmap_subscribe(osdmap->get_epoch() + 1, false);
  else
    osdmap_subscribe(oldest - 1, true);
}
6266
6267 void OSD::send_full_update()
6268 {
6269 if (!service.need_fullness_update())
6270 return;
6271 unsigned state = 0;
6272 if (service.is_full()) {
6273 state = CEPH_OSD_FULL;
6274 } else if (service.is_backfillfull()) {
6275 state = CEPH_OSD_BACKFILLFULL;
6276 } else if (service.is_nearfull()) {
6277 state = CEPH_OSD_NEARFULL;
6278 }
6279 set<string> s;
6280 OSDMap::calc_state_set(state, s);
6281 dout(10) << __func__ << " want state " << s << dendl;
6282 monc->send_mon_message(new MOSDFull(osdmap->get_epoch(), state));
6283 }
6284
// Enter WAITING_FOR_HEALTHY: boot is deferred until enough heartbeat
// peers respond (see _is_healthy).  Resetting last_heartbeat_resample
// forces the heartbeat peer set to be re-picked.
void OSD::start_waiting_for_healthy()
{
  dout(1) << "start_waiting_for_healthy" << dendl;
  set_state(STATE_WAITING_FOR_HEALTHY);
  last_heartbeat_resample = utime_t();

  // subscribe to osdmap updates, in case our peers really are known to be dead
  osdmap_subscribe(osdmap->get_epoch() + 1, false);
}
6294
6295 bool OSD::_is_healthy()
6296 {
6297 if (!cct->get_heartbeat_map()->is_healthy()) {
6298 dout(1) << "is_healthy false -- internal heartbeat failed" << dendl;
6299 return false;
6300 }
6301
6302 if (is_waiting_for_healthy()) {
6303 utime_t now = ceph_clock_now();
6304 utime_t grace = utime_t(cct->_conf->osd_max_markdown_period, 0);
6305 while (!osd_markdown_log.empty() &&
6306 osd_markdown_log.front() + grace < now)
6307 osd_markdown_log.pop_front();
6308 if (osd_markdown_log.size() <= 1) {
6309 dout(5) << __func__ << " first time marked as down,"
6310 << " try reboot unconditionally" << dendl;
6311 return true;
6312 }
6313 std::lock_guard l(heartbeat_lock);
6314 int num = 0, up = 0;
6315 for (map<int,HeartbeatInfo>::iterator p = heartbeat_peers.begin();
6316 p != heartbeat_peers.end();
6317 ++p) {
6318 if (p->second.is_healthy(now))
6319 ++up;
6320 ++num;
6321 }
6322 if ((float)up < (float)num * cct->_conf->osd_heartbeat_min_healthy_ratio) {
6323 dout(1) << "is_healthy false -- only " << up << "/" << num << " up peers (less than "
6324 << int(cct->_conf->osd_heartbeat_min_healthy_ratio * 100.0) << "%)" << dendl;
6325 return false;
6326 }
6327 }
6328
6329 return true;
6330 }
6331
// Send MOSDBoot to the mon: finalize our client/cluster/heartbeat
// addresses (filling in any still-unknown ones from the already-bound
// messengers), make sure each server messenger's loopback connection has
// a session, collect metadata, and transition to BOOTING.
void OSD::_send_boot()
{
  dout(10) << "_send_boot" << dendl;
  Connection *local_connection =
    cluster_messenger->get_loopback_connection().get();
  entity_addrvec_t client_addrs = client_messenger->get_myaddrs();
  entity_addrvec_t cluster_addrs = cluster_messenger->get_myaddrs();
  entity_addrvec_t hb_back_addrs = hb_back_server_messenger->get_myaddrs();
  entity_addrvec_t hb_front_addrs = hb_front_server_messenger->get_myaddrs();

  dout(20) << " initial client_addrs " << client_addrs
           << ", cluster_addrs " << cluster_addrs
           << ", hb_back_addrs " << hb_back_addrs
           << ", hb_front_addrs " << hb_front_addrs
           << dendl;
  if (cluster_messenger->set_addr_unknowns(client_addrs)) {
    dout(10) << " assuming cluster_addrs match client_addrs "
             << client_addrs << dendl;
    cluster_addrs = cluster_messenger->get_myaddrs();
  }
  // give the loopback connection a session if it doesn't have one yet
  if (auto session = local_connection->get_priv(); !session) {
    cluster_messenger->ms_deliver_handle_fast_connect(local_connection);
  }

  local_connection = hb_back_server_messenger->get_loopback_connection().get();
  if (hb_back_server_messenger->set_addr_unknowns(cluster_addrs)) {
    dout(10) << " assuming hb_back_addrs match cluster_addrs "
             << cluster_addrs << dendl;
    hb_back_addrs = hb_back_server_messenger->get_myaddrs();
  }
  if (auto session = local_connection->get_priv(); !session) {
    hb_back_server_messenger->ms_deliver_handle_fast_connect(local_connection);
  }

  local_connection = hb_front_server_messenger->get_loopback_connection().get();
  if (hb_front_server_messenger->set_addr_unknowns(client_addrs)) {
    dout(10) << " assuming hb_front_addrs match client_addrs "
             << client_addrs << dendl;
    hb_front_addrs = hb_front_server_messenger->get_myaddrs();
  }
  if (auto session = local_connection->get_priv(); !session) {
    hb_front_server_messenger->ms_deliver_handle_fast_connect(local_connection);
  }

  // we now know what our front and back addrs will be, and we are
  // about to tell the mon what our metadata (including numa bindings)
  // are, so now is a good time!
  set_numa_affinity();

  MOSDBoot *mboot = new MOSDBoot(
    superblock, get_osdmap_epoch(), service.get_boot_epoch(),
    hb_back_addrs, hb_front_addrs, cluster_addrs,
    CEPH_FEATURES_ALL);
  dout(10) << " final client_addrs " << client_addrs
           << ", cluster_addrs " << cluster_addrs
           << ", hb_back_addrs " << hb_back_addrs
           << ", hb_front_addrs " << hb_front_addrs
           << dendl;
  _collect_metadata(&mboot->metadata);
  monc->send_mon_message(mboot);
  set_state(STATE_BOOTING);
}
6394
// Populate *pm with this OSD's metadata key/value pairs for the mon:
// paths, addresses, objectstore info, system info, network interfaces,
// NUMA placement, and device identifiers.
void OSD::_collect_metadata(map<string,string> *pm)
{
  // config info
  (*pm)["osd_data"] = dev_path;
  if (store->get_type() == "filestore") {
    // not applicable for bluestore
    (*pm)["osd_journal"] = journal_path;
  }
  (*pm)["front_addr"] = stringify(client_messenger->get_myaddrs());
  (*pm)["back_addr"] = stringify(cluster_messenger->get_myaddrs());
  (*pm)["hb_front_addr"] = stringify(hb_front_server_messenger->get_myaddrs());
  (*pm)["hb_back_addr"] = stringify(hb_back_server_messenger->get_myaddrs());

  // backend
  (*pm)["osd_objectstore"] = store->get_type();
  (*pm)["rotational"] = store_is_rotational ? "1" : "0";
  (*pm)["journal_rotational"] = journal_is_rotational ? "1" : "0";
  (*pm)["default_device_class"] = store->get_default_device_class();
  store->collect_metadata(pm);

  collect_sys_info(pm, cct);

  (*pm)["front_iface"] = pick_iface(
    cct,
    client_messenger->get_myaddrs().front().get_sockaddr_storage());
  (*pm)["back_iface"] = pick_iface(
    cct,
    cluster_messenger->get_myaddrs().front().get_sockaddr_storage());

  // network numa
  {
    // Determine which NUMA node(s) our network interfaces sit on.  Only
    // publish a single "network_numa_node" when everything is known and
    // agrees on one node.
    int node = -1;
    set<int> nodes;
    set<string> unknown;
    for (auto nm : { "front_iface", "back_iface" }) {
      if (!(*pm)[nm].size()) {
        unknown.insert(nm);
        continue;
      }
      int n = -1;
      int r = get_iface_numa_node((*pm)[nm], &n);
      if (r < 0) {
        unknown.insert((*pm)[nm]);
        continue;
      }
      nodes.insert(n);
      if (node < 0) {
        node = n;
      }
    }
    if (unknown.size()) {
      (*pm)["network_numa_unknown_ifaces"] = stringify(unknown);
    }
    if (!nodes.empty()) {
      (*pm)["network_numa_nodes"] = stringify(nodes);
    }
    if (node >= 0 && nodes.size() == 1 && unknown.empty()) {
      (*pm)["network_numa_node"] = stringify(node);
    }
  }

  if (numa_node >= 0) {
    (*pm)["numa_node"] = stringify(numa_node);
    (*pm)["numa_node_cpus"] = cpu_set_to_str_list(numa_cpu_set_size,
                                                  &numa_cpu_set);
  }

  // device list plus "dev=id" pairs for devices with a unique id
  set<string> devnames;
  store->get_devices(&devnames);
  (*pm)["devices"] = stringify(devnames);
  string devids;
  for (auto& dev : devnames) {
    string err;
    string id = get_device_id(dev, &err);
    if (id.size()) {
      if (!devids.empty()) {
        devids += ",";
      }
      devids += dev + "=" + id;
    } else {
      dout(10) << __func__ << " no unique device id for " << dev << ": "
               << err << dendl;
    }
  }
  (*pm)["device_ids"] = devids;

  dout(10) << __func__ << " " << *pm << dendl;
}
6483
// Record that we want the mon to bump our up_thru to at least 'want',
// and (if this raises the wanted value) send the request now.
// Takes map_lock (read) for the osdmap lookup and holds it across
// mon_report_lock, matching the lock order used elsewhere.
void OSD::queue_want_up_thru(epoch_t want)
{
  map_lock.get_read();
  epoch_t cur = osdmap->get_up_thru(whoami);
  std::lock_guard l(mon_report_lock);
  if (want > up_thru_wanted) {
    dout(10) << "queue_want_up_thru now " << want << " (was " << up_thru_wanted << ")"
             << ", currently " << cur
             << dendl;
    up_thru_wanted = want;
    send_alive();
  } else {
    // already asking for an equal or newer epoch; nothing to do
    dout(10) << "queue_want_up_thru want " << want << " <= queued " << up_thru_wanted
             << ", currently " << cur
             << dendl;
  }
  map_lock.put_read();
}
6502
6503 void OSD::send_alive()
6504 {
6505 ceph_assert(mon_report_lock.is_locked());
6506 if (!osdmap->exists(whoami))
6507 return;
6508 epoch_t up_thru = osdmap->get_up_thru(whoami);
6509 dout(10) << "send_alive up_thru currently " << up_thru << " want " << up_thru_wanted << dendl;
6510 if (up_thru_wanted > up_thru) {
6511 dout(10) << "send_alive want " << up_thru_wanted << dendl;
6512 monc->send_mon_message(new MOSDAlive(osdmap->get_epoch(), up_thru_wanted));
6513 }
6514 }
6515
6516 void OSD::request_full_map(epoch_t first, epoch_t last)
6517 {
6518 dout(10) << __func__ << " " << first << ".." << last
6519 << ", previously requested "
6520 << requested_full_first << ".." << requested_full_last << dendl;
6521 ceph_assert(osd_lock.is_locked());
6522 ceph_assert(first > 0 && last > 0);
6523 ceph_assert(first <= last);
6524 ceph_assert(first >= requested_full_first); // we shouldn't ever ask for older maps
6525 if (requested_full_first == 0) {
6526 // first request
6527 requested_full_first = first;
6528 requested_full_last = last;
6529 } else if (last <= requested_full_last) {
6530 // dup
6531 return;
6532 } else {
6533 // additional request
6534 first = requested_full_last + 1;
6535 requested_full_last = last;
6536 }
6537 MMonGetOSDMap *req = new MMonGetOSDMap;
6538 req->request_full(first, last);
6539 monc->send_mon_message(req);
6540 }
6541
6542 void OSD::got_full_map(epoch_t e)
6543 {
6544 ceph_assert(requested_full_first <= requested_full_last);
6545 ceph_assert(osd_lock.is_locked());
6546 if (requested_full_first == 0) {
6547 dout(20) << __func__ << " " << e << ", nothing requested" << dendl;
6548 return;
6549 }
6550 if (e < requested_full_first) {
6551 dout(10) << __func__ << " " << e << ", requested " << requested_full_first
6552 << ".." << requested_full_last
6553 << ", ignoring" << dendl;
6554 return;
6555 }
6556 if (e >= requested_full_last) {
6557 dout(10) << __func__ << " " << e << ", requested " << requested_full_first
6558 << ".." << requested_full_last << ", resetting" << dendl;
6559 requested_full_first = requested_full_last = 0;
6560 return;
6561 }
6562
6563 requested_full_first = e + 1;
6564
6565 dout(10) << __func__ << " " << e << ", requested " << requested_full_first
6566 << ".." << requested_full_last
6567 << ", still need more" << dendl;
6568 }
6569
6570 void OSD::requeue_failures()
6571 {
6572 std::lock_guard l(heartbeat_lock);
6573 unsigned old_queue = failure_queue.size();
6574 unsigned old_pending = failure_pending.size();
6575 for (auto p = failure_pending.begin(); p != failure_pending.end(); ) {
6576 failure_queue[p->first] = p->second.first;
6577 failure_pending.erase(p++);
6578 }
6579 dout(10) << __func__ << " " << old_queue << " + " << old_pending << " -> "
6580 << failure_queue.size() << dendl;
6581 }
6582
// Drain failure_queue: send an MOSDFailure to the mon for each queued
// peer (unless a report for it is already pending) and remember it in
// failure_pending so it can be cancelled or requeued later.
// Caller holds map_lock and mon_report_lock; we take heartbeat_lock.
void OSD::send_failures()
{
  ceph_assert(map_lock.is_locked());
  ceph_assert(mon_report_lock.is_locked());
  std::lock_guard l(heartbeat_lock);
  utime_t now = ceph_clock_now();
  while (!failure_queue.empty()) {
    int osd = failure_queue.begin()->first;
    if (!failure_pending.count(osd)) {
      // how long the peer has been unresponsive, in whole seconds
      int failed_for = (int)(double)(now - failure_queue.begin()->second);
      monc->send_mon_message(
        new MOSDFailure(
          monc->get_fsid(),
          osd,
          osdmap->get_addrs(osd),
          failed_for,
          osdmap->get_epoch()));
      // remember failure time and addrs so we can cancel/requeue later
      failure_pending[osd] = make_pair(failure_queue.begin()->second,
                                       osdmap->get_addrs(osd));
    }
    failure_queue.erase(osd);
  }
}
6606
6607 void OSD::send_still_alive(epoch_t epoch, int osd, const entity_addrvec_t &addrs)
6608 {
6609 MOSDFailure *m = new MOSDFailure(monc->get_fsid(), osd, addrs, 0, epoch,
6610 MOSDFailure::FLAG_ALIVE);
6611 monc->send_mon_message(m);
6612 }
6613
6614 void OSD::cancel_pending_failures()
6615 {
6616 std::lock_guard l(heartbeat_lock);
6617 auto it = failure_pending.begin();
6618 while (it != failure_pending.end()) {
6619 dout(10) << __func__ << " canceling in-flight failure report for osd."
6620 << it->first << dendl;
6621 send_still_alive(osdmap->get_epoch(), it->first, it->second.second);
6622 failure_pending.erase(it++);
6623 }
6624 }
6625
// Send an MOSDBeacon (with our min_last_epoch_clean info) to the mon,
// provided the mon quorum supports it (luminous feature).  'now' is the
// caller's coarse monotonic timestamp, recorded as last_sent_beacon.
void OSD::send_beacon(const ceph::coarse_mono_clock::time_point& now)
{
  const auto& monmap = monc->monmap;
  // send beacon to mon even if we are just connected, and the monmap is not
  // initialized yet by then.
  if (monmap.epoch > 0 &&
      monmap.get_required_features().contains_all(
        ceph::features::mon::FEATURE_LUMINOUS)) {
    dout(20) << __func__ << " sending" << dendl;
    MOSDBeacon* beacon = nullptr;
    {
      // build the beacon under the lock; send it after releasing
      std::lock_guard l{min_last_epoch_clean_lock};
      beacon = new MOSDBeacon(osdmap->get_epoch(), min_last_epoch_clean);
      beacon->pgs = min_last_epoch_clean_pgs;
      last_sent_beacon = now;
    }
    monc->send_mon_message(beacon);
  } else {
    dout(20) << __func__ << " not sending" << dendl;
  }
}
6647
6648 void OSD::handle_command(MMonCommand *m)
6649 {
6650 if (!require_mon_peer(m)) {
6651 m->put();
6652 return;
6653 }
6654
6655 Command *c = new Command(m->cmd, m->get_tid(), m->get_data(), NULL);
6656 command_wq.queue(c);
6657 m->put();
6658 }
6659
// Handle an MCommand sent directly over a client connection.  Requires
// an authenticated session with allow-all caps; mons must use
// MMonCommand instead, so MCommand from a mon source is rejected too.
void OSD::handle_command(MCommand *m)
{
  ConnectionRef con = m->get_connection();
  auto priv = con->get_priv();
  auto session = static_cast<Session *>(priv.get());
  if (!session) {
    con->send_message(new MCommandReply(m, -EPERM));
    m->put();
    return;
  }

  OSDCap& caps = session->caps;
  priv.reset();  // drop the session ref; we only needed the caps

  // reject unless caps allow everything, and reject mon-sourced MCommand
  if (!caps.allow_all() || m->get_source().is_mon()) {
    con->send_message(new MCommandReply(m, -EPERM));
    m->put();
    return;
  }

  Command *c = new Command(m->cmd, m->get_tid(), m->get_data(), con.get());
  command_wq.queue(c);

  m->put();
}
6685
// Descriptor for one OSD tell/pg command: the parse signature consumed
// by the command parser, help text, owning module, and required
// permission string.
struct OSDCommand {
  string cmdstring;    // command signature (name + typed parameters)
  string helpstring;   // help text shown to the user
  string module;       // owning module ("osd")
  string perm;         // required permissions, e.g. "r", "rw", "rwx"
} osd_commands[] = {

#define COMMAND(parsesig, helptext, module, perm) \
  {parsesig, helptext, module, perm},

// yes, these are really pg commands, but there's a limit to how
// much work it's worth.  The OSD returns all of them.  Make this
// form (pg <pgid> <cmd>) valid only for the cli.
// Rest uses "tell <pgid> <cmd>"

COMMAND("pg " \
        "name=pgid,type=CephPgid " \
        "name=cmd,type=CephChoices,strings=query", \
        "show details of a specific pg", "osd", "r")
COMMAND("pg " \
        "name=pgid,type=CephPgid " \
        "name=cmd,type=CephChoices,strings=mark_unfound_lost " \
        "name=mulcmd,type=CephChoices,strings=revert|delete", \
        "mark all unfound objects in this pg as lost, either removing or reverting to a prior version if one is available",
        "osd", "rw")
COMMAND("pg " \
        "name=pgid,type=CephPgid " \
        "name=cmd,type=CephChoices,strings=list_unfound " \
        "name=offset,type=CephString,req=false",
        "list unfound objects on this pg, perhaps starting at an offset given in JSON",
        "osd", "r")

// new form: tell <pgid> <cmd> for both cli and rest

COMMAND("query",
        "show details of a specific pg", "osd", "r")
COMMAND("mark_unfound_lost " \
        "name=mulcmd,type=CephChoices,strings=revert|delete", \
        "mark all unfound objects in this pg as lost, either removing or reverting to a prior version if one is available",
        "osd", "rw")
COMMAND("list_unfound " \
        "name=offset,type=CephString,req=false",
        "list unfound objects on this pg, perhaps starting at an offset given in JSON",
        "osd", "r")
COMMAND("perf histogram dump "
        "name=logger,type=CephString,req=false "
        "name=counter,type=CephString,req=false",
        "Get histogram data",
        "osd", "r")

// tell <osd.n> commands. Validation of osd.n must be special-cased in client
COMMAND("version", "report version of OSD", "osd", "r")
COMMAND("get_command_descriptions", "list commands descriptions", "osd", "r")
COMMAND("injectargs " \
        "name=injected_args,type=CephString,n=N",
        "inject configuration arguments into running OSD",
        "osd", "rw")
COMMAND("config set " \
        "name=key,type=CephString name=value,type=CephString",
        "Set a configuration option at runtime (not persistent)",
        "osd", "rw")
COMMAND("config get " \
        "name=key,type=CephString",
        "Get a configuration option at runtime",
        "osd", "r")
COMMAND("config unset " \
        "name=key,type=CephString",
        "Unset a configuration option at runtime (not persistent)",
        "osd", "rw")
COMMAND("cluster_log " \
        "name=level,type=CephChoices,strings=error,warning,info,debug " \
        "name=message,type=CephString,n=N",
        "log a message to the cluster log",
        "osd", "rw")
COMMAND("bench " \
        "name=count,type=CephInt,req=false " \
        "name=size,type=CephInt,req=false " \
        "name=object_size,type=CephInt,req=false " \
        "name=object_num,type=CephInt,req=false ", \
        "OSD benchmark: write <count> <size>-byte objects(with <obj_size> <obj_num>), " \
        "(default count=1G default size=4MB). Results in log.",
        "osd", "rw")
COMMAND("flush_pg_stats", "flush pg stats", "osd", "rw")
COMMAND("heap " \
        "name=heapcmd,type=CephChoices,strings="\
        "dump|start_profiler|stop_profiler|release|get_release_rate|set_release_rate|stats " \
        "name=value,type=CephString,req=false",
        "show heap usage info (available only if compiled with tcmalloc)",
        "osd", "rw")
COMMAND("debug dump_missing " \
        "name=filename,type=CephFilepath",
        "dump missing objects to a named file", "osd", "r")
COMMAND("debug kick_recovery_wq " \
        "name=delay,type=CephInt,range=0",
        "set osd_recovery_delay_start to <val>", "osd", "rw")
COMMAND("cpu_profiler " \
        "name=arg,type=CephChoices,strings=status|flush",
        "run cpu profiling on daemon", "osd", "rw")
COMMAND("dump_pg_recovery_stats", "dump pg recovery statistics",
        "osd", "r")
COMMAND("reset_pg_recovery_stats", "reset pg recovery statistics",
        "osd", "rw")
COMMAND("compact",
        "compact object store's omap. "
        "WARNING: Compaction probably slows your requests",
        "osd", "rw")
COMMAND("smart name=devid,type=CephString,req=False",
        "runs smartctl on this osd devices. ",
        "osd", "rw")
COMMAND("cache drop",
        "Drop all OSD caches",
        "osd", "rwx")
COMMAND("cache status",
        "Get OSD caches statistics",
        "osd", "r")
COMMAND("send_beacon",
        "Send OSD beacon to mon immediately",
        "osd", "r")
};
6805
// Entry point for MCommand-style 'ceph tell osd.N ...' commands.
// Parses the raw command vector into a cmdmap, executes it via
// _do_command(), and sends an MCommandReply back on @con (if any).
// @data carries any input payload; command output accumulates in odata.
void OSD::do_command(
  Connection *con, ceph_tid_t tid, vector<string>& cmd, bufferlist& data)
{
  dout(20) << "do_command tid " << tid << " " << cmd << dendl;

  int r = 0;
  stringstream ss, ds;  // ss: status text; ds: command output (appended to odata)
  bufferlist odata;
  cmdmap_t cmdmap;
  if (cmd.empty()) {
    // NOTE(review): replies with r=0 despite the error text — confirm no
    // caller depends on a nonzero code for an empty command.
    ss << "no command given";
    goto out;
  }
  if (!cmdmap_from_json(cmd, &cmdmap, ss)) {
    r = -EINVAL;
    goto out;
  }

  try {
    r = _do_command(con, cmdmap, tid, data, odata, ss, ds);
  } catch (const bad_cmd_get& e) {
    // thrown by cmd_getval() on a missing/mistyped command argument
    r = -EINVAL;
    ss << e.what();
  }
  if (r == -EAGAIN) {
    // reply will be sent asynchronously (e.g. by the PG); do not reply here
    return;
  }
 out:
  string rs = ss.str();
  odata.append(ds);
  dout(0) << "do_command r=" << r << " " << rs << dendl;
  clog->info() << rs;
  if (con) {
    MCommandReply *reply = new MCommandReply(r, rs);
    reply->set_tid(tid);
    reply->set_data(odata);
    con->send_message(reply);
  }
}
6845
6846 namespace {
6847 class unlock_guard {
6848 Mutex& m;
6849 public:
6850 explicit unlock_guard(Mutex& mutex)
6851 : m(mutex)
6852 {
6853 m.unlock();
6854 }
6855 unlock_guard(unlock_guard&) = delete;
6856 ~unlock_guard() {
6857 m.lock();
6858 }
6859 };
6860 }
6861
// Execute one parsed OSD command from @cmdmap.
// Returns 0 or a negative errno; -EAGAIN means the reply will be sent
// asynchronously and the caller must not reply.  Status text goes to @ss,
// command output to @ds / @odata.  Runs with osd_lock held; commands that
// mutate configuration drop it temporarily via unlock_guard.
int OSD::_do_command(
  Connection *con, cmdmap_t& cmdmap, ceph_tid_t tid, bufferlist& data,
  bufferlist& odata, stringstream& ss, stringstream& ds)
{
  int r = 0;
  string prefix;
  string format;
  string pgidstr;
  boost::scoped_ptr<Formatter> f;

  cmd_getval(cct, cmdmap, "prefix", prefix);

  if (prefix == "get_command_descriptions") {
    // dump the osd_commands[] table as JSON for the CLI
    int cmdnum = 0;
    JSONFormatter *f = new JSONFormatter();  // deliberately shadows outer 'f'
    f->open_object_section("command_descriptions");
    for (OSDCommand *cp = osd_commands;
         cp < &osd_commands[std::size(osd_commands)]; cp++) {

      ostringstream secname;
      secname << "cmd" << setfill('0') << std::setw(3) << cmdnum;
      dump_cmddesc_to_json(f, con->get_features(),
                           secname.str(), cp->cmdstring, cp->helpstring,
                           cp->module, cp->perm, 0);
      cmdnum++;
    }
    f->close_section(); // command_descriptions

    f->flush(ds);
    delete f;
    goto out;
  }

  // optional output formatter; f stays null when no format was requested
  cmd_getval(cct, cmdmap, "format", format);
  f.reset(Formatter::create(format));

  if (prefix == "version") {
    if (f) {
      f->open_object_section("version");
      f->dump_string("version", pretty_version_to_str());
      f->close_section();
      f->flush(ds);
    } else {
      ds << pretty_version_to_str();
    }
    goto out;
  }
  else if (prefix == "injectargs") {
    vector<string> argsvec;
    cmd_getval(cct, cmdmap, "injected_args", argsvec);

    if (argsvec.empty()) {
      r = -EINVAL;
      ss << "ignoring empty injectargs";
      goto out;
    }
    // re-join the arg vector into a single space-separated string
    string args = argsvec.front();
    for (vector<string>::iterator a = ++argsvec.begin(); a != argsvec.end(); ++a)
      args += " " + *a;
    // drop osd_lock while applying config to avoid lock-order issues
    unlock_guard unlock{osd_lock};
    r = cct->_conf.injectargs(args, &ss);
  }
  else if (prefix == "config set") {
    // runtime-only config override (not persisted to the mon)
    std::string key;
    std::string val;
    cmd_getval(cct, cmdmap, "key", key);
    cmd_getval(cct, cmdmap, "value", val);
    unlock_guard unlock{osd_lock};
    r = cct->_conf.set_val(key, val, &ss);
    if (r == 0) {
      cct->_conf.apply_changes(nullptr);
    }
  }
  else if (prefix == "config get") {
    std::string key;
    cmd_getval(cct, cmdmap, "key", key);
    unlock_guard unlock{osd_lock};
    std::string val;
    r = cct->_conf.get_val(key, &val);
    if (r == 0) {
      ds << val;
    }
  }
  else if (prefix == "config unset") {
    std::string key;
    cmd_getval(cct, cmdmap, "key", key);
    unlock_guard unlock{osd_lock};
    r = cct->_conf.rm_val(key);
    if (r == 0) {
      cct->_conf.apply_changes(nullptr);
    }
    if (r == -ENOENT) {
      r = 0; // make command idempotent
    }
  }
  else if (prefix == "cluster_log") {
    // emit an arbitrary message into the cluster log at a chosen level
    vector<string> msg;
    cmd_getval(cct, cmdmap, "message", msg);
    if (msg.empty()) {
      r = -EINVAL;
      ss << "ignoring empty log message";
      goto out;
    }
    string message = msg.front();
    for (vector<string>::iterator a = ++msg.begin(); a != msg.end(); ++a)
      message += " " + *a;
    string lvl;
    cmd_getval(cct, cmdmap, "level", lvl);
    clog_type level = string_to_clog_type(lvl);
    if (level < 0) {
      r = -EINVAL;
      ss << "unknown level '" << lvl << "'";
      goto out;
    }
    clog->do_log(level, message);
  }

  // either 'pg <pgid> <command>' or
  // 'tell <pgid>' (which comes in without any of that prefix)?

  else if (prefix == "pg" ||
           prefix == "query" ||
           prefix == "mark_unfound_lost" ||
           prefix == "list_unfound"
           ) {
    pg_t pgid;

    if (!cmd_getval(cct, cmdmap, "pgid", pgidstr)) {
      ss << "no pgid specified";
      r = -EINVAL;
    } else if (!pgid.parse(pgidstr.c_str())) {
      ss << "couldn't parse pgid '" << pgidstr << "'";
      r = -EINVAL;
    } else {
      spg_t pcand;
      PGRef pg;
      // resolve the raw pg to its primary shard and take the PG lock
      if (osdmap->get_primary_shard(pgid, &pcand) &&
          (pg = _lookup_lock_pg(pcand))) {
        if (pg->is_primary()) {
          // simulate pg <pgid> cmd= for pg->do-command
          if (prefix != "pg")
            cmd_putval(cct, cmdmap, "cmd", prefix);
          try {
            r = pg->do_command(cmdmap, ss, data, odata, con, tid);
          } catch (const bad_cmd_get& e) {
            pg->unlock();
            ss << e.what();
            return -EINVAL;
          }
          if (r == -EAGAIN) {
            pg->unlock();
            // don't reply, pg will do so async
            return -EAGAIN;
          }
        } else {
          ss << "not primary for pgid " << pgid;

          // send them the latest diff to ensure they realize the mapping
          // has changed.
          service.send_incremental_map(osdmap->get_epoch() - 1, con, osdmap);

          // do not reply; they will get newer maps and realize they
          // need to resend.
          pg->unlock();
          return -EAGAIN;
        }
        pg->unlock();
      } else {
        ss << "i don't have pgid " << pgid;
        r = -ENOENT;
      }
    }
  }

  else if (prefix == "bench") {
    // synthetic objectstore write benchmark; results go to the log/output
    int64_t count;
    int64_t bsize;
    int64_t osize, onum;
    // default count 1G, size 4MB
    cmd_getval(cct, cmdmap, "count", count, (int64_t)1 << 30);
    cmd_getval(cct, cmdmap, "size", bsize, (int64_t)4 << 20);
    cmd_getval(cct, cmdmap, "object_size", osize, (int64_t)0);
    cmd_getval(cct, cmdmap, "object_num", onum, (int64_t)0);

    uint32_t duration = cct->_conf->osd_bench_duration;

    if (bsize > (int64_t) cct->_conf->osd_bench_max_block_size) {
      // let us limit the block size because the next checks rely on it
      // having a sane value. If we allow any block size to be set things
      // can still go sideways.
      ss << "block 'size' values are capped at "
         << byte_u_t(cct->_conf->osd_bench_max_block_size) << ". If you wish to use"
         << " a higher value, please adjust 'osd_bench_max_block_size'";
      r = -EINVAL;
      goto out;
    } else if (bsize < (int64_t) (1 << 20)) {
      // entering the realm of small block sizes.
      // limit the count to a sane value, assuming a configurable amount of
      // IOPS and duration, so that the OSD doesn't get hung up on this,
      // preventing timeouts from going off
      int64_t max_count =
        bsize * duration * cct->_conf->osd_bench_small_size_max_iops;
      if (count > max_count) {
        ss << "'count' values greater than " << max_count
           << " for a block size of " << byte_u_t(bsize) << ", assuming "
           << cct->_conf->osd_bench_small_size_max_iops << " IOPS,"
           << " for " << duration << " seconds,"
           << " can cause ill effects on osd. "
           << " Please adjust 'osd_bench_small_size_max_iops' with a higher"
           << " value if you wish to use a higher 'count'.";
        r = -EINVAL;
        goto out;
      }
    } else {
      // 1MB block sizes are big enough so that we get more stuff done.
      // However, to avoid the osd from getting hung on this and having
      // timers being triggered, we are going to limit the count assuming
      // a configurable throughput and duration.
      // NOTE: max_count is the total amount of bytes that we believe we
      //       will be able to write during 'duration' for the given
      //       throughput.  The block size hardly impacts this unless it's
      //       way too big.  Given we already check how big the block size
      //       is, it's safe to assume everything will check out.
      int64_t max_count =
        cct->_conf->osd_bench_large_size_max_throughput * duration;
      if (count > max_count) {
        ss << "'count' values greater than " << max_count
           << " for a block size of " << byte_u_t(bsize) << ", assuming "
           << byte_u_t(cct->_conf->osd_bench_large_size_max_throughput) << "/s,"
           << " for " << duration << " seconds,"
           << " can cause ill effects on osd. "
           << " Please adjust 'osd_bench_large_size_max_throughput'"
           << " with a higher value if you wish to use a higher 'count'.";
        r = -EINVAL;
        goto out;
      }
    }

    // clamp block size to the object size so offsets stay in range below
    if (osize && bsize > osize)
      bsize = osize;

    dout(1) << " bench count " << count
            << " bsize " << byte_u_t(bsize) << dendl;

    ObjectStore::Transaction cleanupt;

    if (osize && onum) {
      // pre-create a fixed pool of objects to rewrite at random offsets
      bufferlist bl;
      bufferptr bp(osize);
      bp.zero();
      bl.push_back(std::move(bp));
      bl.rebuild_page_aligned();
      for (int i=0; i<onum; ++i) {
        char nm[30];
        snprintf(nm, sizeof(nm), "disk_bw_test_%d", i);
        object_t oid(nm);
        hobject_t soid(sobject_t(oid, 0));
        ObjectStore::Transaction t;
        t.write(coll_t(), ghobject_t(soid), 0, osize, bl);
        store->queue_transaction(service.meta_ch, std::move(t), NULL);
        cleanupt.remove(coll_t(), ghobject_t(soid));
      }
    }

    bufferlist bl;
    bufferptr bp(bsize);
    bp.zero();
    bl.push_back(std::move(bp));
    bl.rebuild_page_aligned();

    // wait for any pending transactions (incl. the pre-created objects)
    // to commit before starting the clock
    {
      C_SaferCond waiter;
      if (!service.meta_ch->flush_commit(&waiter)) {
        waiter.wait();
      }
    }

    utime_t start = ceph_clock_now();
    for (int64_t pos = 0; pos < count; pos += bsize) {
      char nm[30];
      unsigned offset = 0;
      if (onum && osize) {
        // rewrite random offsets within the pre-created object pool
        snprintf(nm, sizeof(nm), "disk_bw_test_%d", (int)(rand() % onum));
        offset = rand() % (osize / bsize) * bsize;
      } else {
        // otherwise write one fresh object per block
        snprintf(nm, sizeof(nm), "disk_bw_test_%lld", (long long)pos);
      }
      object_t oid(nm);
      hobject_t soid(sobject_t(oid, 0));
      ObjectStore::Transaction t;
      t.write(coll_t::meta(), ghobject_t(soid), offset, bsize, bl);
      store->queue_transaction(service.meta_ch, std::move(t), NULL);
      if (!onum || !osize)
        cleanupt.remove(coll_t::meta(), ghobject_t(soid));
    }

    // wait for all benchmark writes to commit, then stop the clock
    {
      C_SaferCond waiter;
      if (!service.meta_ch->flush_commit(&waiter)) {
        waiter.wait();
      }
    }
    utime_t end = ceph_clock_now();

    // clean up
    store->queue_transaction(service.meta_ch, std::move(cleanupt), NULL);
    {
      C_SaferCond waiter;
      if (!service.meta_ch->flush_commit(&waiter)) {
        waiter.wait();
      }
    }

    double elapsed = end - start;
    double rate = count / elapsed;      // bytes per second
    double iops = rate / bsize;         // block writes per second
    if (f) {
      f->open_object_section("osd_bench_results");
      f->dump_int("bytes_written", count);
      f->dump_int("blocksize", bsize);
      f->dump_float("elapsed_sec", elapsed);
      f->dump_float("bytes_per_sec", rate);
      f->dump_float("iops", iops);
      f->close_section();
      f->flush(ds);
    } else {
      ds << "bench: wrote " << byte_u_t(count)
         << " in blocks of " << byte_u_t(bsize) << " in "
         << elapsed << " sec at " << byte_u_t(rate) << "/sec "
         << si_u_t(iops) << " IOPS";
    }
  }

  else if (prefix == "flush_pg_stats") {
    mgrc.send_pgstats();
    ds << service.get_osd_stat_seq() << "\n";
  }

  else if (prefix == "heap") {
    // NOTE(review): dereferences f unconditionally — if no 'format' was
    // given and Formatter::create() returned null this is a null deref;
    // confirm Formatter::create's behavior for an empty format string.
    r = ceph::osd_cmds::heap(*cct, cmdmap, *f, ds);
  }

  else if (prefix == "debug dump_missing") {
    if (!f) {
      f.reset(new JSONFormatter(true));
    }
    f->open_array_section("pgs");
    vector<PGRef> pgs;
    _get_pgs(&pgs);
    for (auto& pg : pgs) {
      string s = stringify(pg->pg_id);
      f->open_array_section(s.c_str());
      pg->lock();
      pg->dump_missing(f.get());
      pg->unlock();
      f->close_section();
    }
    f->close_section();
    f->flush(ds);
  }
  else if (prefix == "debug kick_recovery_wq") {
    int64_t delay;
    cmd_getval(cct, cmdmap, "delay", delay);
    ostringstream oss;
    oss << delay;
    unlock_guard unlock{osd_lock};
    r = cct->_conf.set_val("osd_recovery_delay_start", oss.str().c_str());
    if (r != 0) {
      ss << "kick_recovery_wq: error setting "
         << "osd_recovery_delay_start to '" << delay << "': error "
         << r;
      goto out;
    }
    cct->_conf.apply_changes(nullptr);
    ss << "kicking recovery queue. set osd_recovery_delay_start "
       << "to " << cct->_conf->osd_recovery_delay_start;
  }

  else if (prefix == "cpu_profiler") {
    string arg;
    cmd_getval(cct, cmdmap, "arg", arg);
    vector<string> argvec;
    get_str_vec(arg, argvec);
    cpu_profiler_handle_command(argvec, ds);
  }

  else if (prefix == "dump_pg_recovery_stats") {
    stringstream s;
    if (f) {
      pg_recovery_stats.dump_formatted(f.get());
      f->flush(ds);
    } else {
      pg_recovery_stats.dump(s);
      ds << "dump pg recovery stats: " << s.str();
    }
  }

  else if (prefix == "reset_pg_recovery_stats") {
    ss << "reset pg recovery stats";
    pg_recovery_stats.reset();
  }

  else if (prefix == "perf histogram dump") {
    // only produces output when a formatter was requested
    std::string logger;
    std::string counter;
    cmd_getval(cct, cmdmap, "logger", logger);
    cmd_getval(cct, cmdmap, "counter", counter);
    if (f) {
      cct->get_perfcounters_collection()->dump_formatted_histograms(
        f.get(), false, logger, counter);
      f->flush(ds);
    }
  }

  else if (prefix == "compact") {
    // synchronous objectstore (omap) compaction; can take a long time
    dout(1) << "triggering manual compaction" << dendl;
    auto start = ceph::coarse_mono_clock::now();
    store->compact();
    auto end = ceph::coarse_mono_clock::now();
    double duration = std::chrono::duration<double>(end-start).count();
    dout(1) << "finished manual compaction in "
            << duration
            << " seconds" << dendl;
    ss << "compacted omap in " << duration << " seconds";
  }

  else if (prefix == "smart") {
    string devid;
    cmd_getval(cct, cmdmap, "devid", devid);
    probe_smart(devid, ds);
  }

  else if (prefix == "cache drop") {
    dout(20) << "clearing all caches" << dendl;
    // Clear the objectstore's cache - onode and buffer for Bluestore,
    // system's pagecache for Filestore
    r = store->flush_cache(&ss);
    if (r < 0) {
      ds << "Error flushing objectstore cache: " << cpp_strerror(r);
      goto out;
    }
    // Clear the objectcontext cache (per PG)
    vector<PGRef> pgs;
    _get_pgs(&pgs);
    for (auto& pg: pgs) {
      pg->clear_cache();
    }
  }

  else if (prefix == "cache status") {
    // report objectcontext counts (summed across PGs) plus store stats
    int obj_ctx_count = 0;
    vector<PGRef> pgs;
    _get_pgs(&pgs);
    for (auto& pg: pgs) {
      obj_ctx_count += pg->get_cache_obj_count();
    }
    if (f) {
      f->open_object_section("cache_status");
      f->dump_int("object_ctx", obj_ctx_count);
      store->dump_cache_stats(f.get());
      f->close_section();
      f->flush(ds);
    } else {
      ds << "object_ctx: " << obj_ctx_count;
      store->dump_cache_stats(ds);
    }
  }
  else if (prefix == "send_beacon") {
    if (is_active()) {
      send_beacon(ceph::coarse_mono_clock::now());
    }
  } else {
    ss << "unrecognized command '" << prefix << "'";
    r = -EINVAL;
  }

 out:
  return r;
}
7341
7342 void OSD::probe_smart(const string& only_devid, ostream& ss)
7343 {
7344 set<string> devnames;
7345 store->get_devices(&devnames);
7346 uint64_t smart_timeout = cct->_conf.get_val<uint64_t>(
7347 "osd_smart_report_timeout");
7348
7349 // == typedef std::map<std::string, mValue> mObject;
7350 json_spirit::mObject json_map;
7351
7352 for (auto dev : devnames) {
7353 // smartctl works only on physical devices; filter out any logical device
7354 if (dev.find("dm-") == 0) {
7355 continue;
7356 }
7357
7358 string err;
7359 string devid = get_device_id(dev, &err);
7360 if (devid.size() == 0) {
7361 dout(10) << __func__ << " no unique id for dev " << dev << " ("
7362 << err << "), skipping" << dendl;
7363 continue;
7364 }
7365 if (only_devid.size() && devid != only_devid) {
7366 continue;
7367 }
7368
7369 json_spirit::mValue smart_json;
7370 if (block_device_get_metrics(dev, smart_timeout,
7371 &smart_json)) {
7372 dout(10) << "block_device_get_metrics failed for /dev/" << dev << dendl;
7373 continue;
7374 }
7375 json_map[devid] = smart_json;
7376 }
7377 json_spirit::write(json_map, ss, json_spirit::pretty_print);
7378 }
7379
7380 bool OSD::heartbeat_dispatch(Message *m)
7381 {
7382 dout(30) << "heartbeat_dispatch " << m << dendl;
7383 switch (m->get_type()) {
7384
7385 case CEPH_MSG_PING:
7386 dout(10) << "ping from " << m->get_source_inst() << dendl;
7387 m->put();
7388 break;
7389
7390 case MSG_OSD_PING:
7391 handle_osd_ping(static_cast<MOSDPing*>(m));
7392 break;
7393
7394 default:
7395 dout(0) << "dropping unexpected message " << *m << " from " << m->get_source_inst() << dendl;
7396 m->put();
7397 }
7398
7399 return true;
7400 }
7401
// Slow-path dispatcher for messages that are not fast-dispatchable.
// Always claims the message (returns true).
bool OSD::ms_dispatch(Message *m)
{
  dout(20) << "OSD::ms_dispatch: " << *m << dendl;
  // MARK_ME_DOWN ack is handled without taking osd_lock so shutdown can
  // make progress even while the lock is held elsewhere
  if (m->get_type() == MSG_OSD_MARK_ME_DOWN) {
    service.got_stop_ack();
    m->put();
    return true;
  }

  // lock!

  osd_lock.Lock();
  if (is_stopping()) {
    // shutting down: drop the message
    osd_lock.Unlock();
    m->put();
    return true;
  }

  // drain previously-deferred ops first so ordering is preserved,
  // then handle this message under osd_lock
  do_waiters();
  _dispatch(m);

  osd_lock.Unlock();

  return true;
}
7427
7428 void OSD::maybe_share_map(
7429 Session *session,
7430 OpRequestRef op,
7431 OSDMapRef osdmap)
7432 {
7433 if (!op->check_send_map) {
7434 return;
7435 }
7436 epoch_t last_sent_epoch = 0;
7437
7438 session->sent_epoch_lock.lock();
7439 last_sent_epoch = session->last_sent_epoch;
7440 session->sent_epoch_lock.unlock();
7441
7442 // assume the peer has the newer of the op's sent_epoch and what
7443 // we think we sent them.
7444 epoch_t from = std::max(last_sent_epoch, op->sent_epoch);
7445
7446 const Message *m = op->get_req();
7447 service.share_map(
7448 m->get_source(),
7449 m->get_connection().get(),
7450 from,
7451 osdmap,
7452 session ? &last_sent_epoch : NULL);
7453
7454 session->sent_epoch_lock.lock();
7455 if (session->last_sent_epoch < last_sent_epoch) {
7456 session->last_sent_epoch = last_sent_epoch;
7457 }
7458 session->sent_epoch_lock.unlock();
7459
7460 op->check_send_map = false;
7461 }
7462
// Drain the session's queue of ops that were waiting for a newer osdmap,
// enqueueing every op whose min_epoch is now satisfied by @osdmap.
// Caller must hold session->session_dispatch_lock.
void OSD::dispatch_session_waiting(SessionRef session, OSDMapRef osdmap)
{
  ceph_assert(session->session_dispatch_lock.is_locked());

  auto i = session->waiting_on_map.begin();
  while (i != session->waiting_on_map.end()) {
    // constructing the OpRequestRef from the list element takes a new ref
    OpRequestRef op = &(*i);
    ceph_assert(ms_can_fast_dispatch(op->get_req()));
    const MOSDFastDispatchOp *m = static_cast<const MOSDFastDispatchOp*>(
      op->get_req());
    if (m->get_min_epoch() > osdmap->get_epoch()) {
      // queue is ordered; everything after this also needs a newer map
      break;
    }
    session->waiting_on_map.erase(i++);
    // drop the reference the intrusive list held (taken via op->get()
    // when the op was queued in ms_fast_dispatch)
    op->put();

    spg_t pgid;
    if (m->get_type() == CEPH_MSG_OSD_OP) {
      // legacy MOSDOp has no spg_t; map the raw pg through the osdmap
      pg_t actual_pgid = osdmap->raw_pg_to_pg(
	static_cast<const MOSDOp*>(m)->get_pg());
      if (!osdmap->get_primary_shard(actual_pgid, &pgid)) {
	// no primary shard in this map; drop the op (client will resend)
	continue;
      }
    } else {
      pgid = m->get_spg();
    }
    enqueue_op(pgid, std::move(op), m->get_map_epoch());
  }

  // keep the session registered while ops are still waiting on a map
  if (session->waiting_on_map.empty()) {
    clear_session_waiting_on_map(session);
  } else {
    register_session_waiting_on_map(session);
  }
}
7498
// Fast-dispatch entry point: handles messages directly on the messenger
// thread without taking osd_lock.  Peering messages are turned into
// peering events; client ops are tracked and enqueued to the op queue.
// Takes ownership of @m on all paths.
void OSD::ms_fast_dispatch(Message *m)
{
  FUNCTRACE(cct);
  if (service.is_stopping()) {
    m->put();
    return;
  }

  // peering event?
  switch (m->get_type()) {
  case CEPH_MSG_PING:
    dout(10) << "ping from " << m->get_source() << dendl;
    m->put();
    return;
  case MSG_MON_COMMAND:
    handle_command(static_cast<MMonCommand*>(m));
    return;
  case MSG_OSD_FORCE_RECOVERY:
    handle_fast_force_recovery(static_cast<MOSDForceRecovery*>(m));
    return;
  case MSG_OSD_SCRUB2:
    handle_fast_scrub(static_cast<MOSDScrub2*>(m));
    return;

  case MSG_OSD_PG_CREATE2:
    return handle_fast_pg_create(static_cast<MOSDPGCreate2*>(m));
  case MSG_OSD_PG_QUERY:
    return handle_fast_pg_query(static_cast<MOSDPGQuery*>(m));
  case MSG_OSD_PG_NOTIFY:
    return handle_fast_pg_notify(static_cast<MOSDPGNotify*>(m));
  case MSG_OSD_PG_INFO:
    return handle_fast_pg_info(static_cast<MOSDPGInfo*>(m));
  case MSG_OSD_PG_REMOVE:
    return handle_fast_pg_remove(static_cast<MOSDPGRemove*>(m));

  // these are single-pg messages that handle themselves
  case MSG_OSD_PG_LOG:
  case MSG_OSD_PG_TRIM:
  case MSG_OSD_BACKFILL_RESERVE:
  case MSG_OSD_RECOVERY_RESERVE:
    {
      MOSDPeeringOp *pm = static_cast<MOSDPeeringOp*>(m);
      if (require_osd_peer(pm)) {
	enqueue_peering_evt(
	  pm->get_spg(),
	  PGPeeringEventRef(pm->get_event()));
      }
      pm->put();
      return;
    }
  }

  // everything else is a client/replica op: register it with the op tracker
  OpRequestRef op = op_tracker.create_request<OpRequest, Message*>(m);
  {
#ifdef WITH_LTTNG
    osd_reqid_t reqid = op->get_reqid();
#endif
    tracepoint(osd, ms_fast_dispatch, reqid.name._type,
        reqid.name._num, reqid.tid, reqid.inc);
  }

  if (m->trace)
    op->osd_trace.init("osd op", &trace_endpoint, &m->trace);

  // note sender epoch, min req's epoch
  op->sent_epoch = static_cast<MOSDFastDispatchOp*>(m)->get_map_epoch();
  op->min_epoch = static_cast<MOSDFastDispatchOp*>(m)->get_min_epoch();
  ceph_assert(op->min_epoch <= op->sent_epoch); // sanity check!

  service.maybe_inject_dispatch_delay();

  if (m->get_connection()->has_features(CEPH_FEATUREMASK_RESEND_ON_SPLIT) ||
      m->get_type() != CEPH_MSG_OSD_OP) {
    // queue it directly
    enqueue_op(
      static_cast<MOSDFastDispatchOp*>(m)->get_spg(),
      std::move(op),
      static_cast<MOSDFastDispatchOp*>(m)->get_map_epoch());
  } else {
    // legacy client, and this is an MOSDOp (the *only* fast dispatch
    // message that didn't have an explicit spg_t); we need to map
    // them to an spg_t while preserving delivery order.
    auto priv = m->get_connection()->get_priv();
    if (auto session = static_cast<Session*>(priv.get()); session) {
      std::lock_guard l{session->session_dispatch_lock};
      // extra ref held by the session's intrusive waiting list; dropped in
      // dispatch_session_waiting() when the op is removed from the list
      op->get();
      session->waiting_on_map.push_back(*op);
      OSDMapRef nextmap = service.get_nextmap_reserved();
      dispatch_session_waiting(session, nextmap);
      service.release_map(nextmap);
    }
  }
  OID_EVENT_TRACE_WITH_MSG(m, "MS_FAST_DISPATCH_END", false);
}
7593
7594 bool OSD::ms_get_authorizer(int dest_type, AuthAuthorizer **authorizer)
7595 {
7596 dout(10) << "OSD::ms_get_authorizer type=" << ceph_entity_type_name(dest_type) << dendl;
7597
7598 if (is_stopping()) {
7599 dout(10) << __func__ << " bailing, we are shutting down" << dendl;
7600 return false;
7601 }
7602
7603 if (dest_type == CEPH_ENTITY_TYPE_MON)
7604 return true;
7605
7606 *authorizer = monc->build_authorizer(dest_type);
7607 return *authorizer != NULL;
7608 }
7609
7610 KeyStore *OSD::ms_get_auth1_authorizer_keystore()
7611 {
7612 return monc->rotating_secrets.get();
7613 }
7614
// Called after a connection authenticates: ensure it has a Session and
// populate the session's caps from the peer's AuthCapsInfo.
// Returns 1 when caps were parsed successfully, -EPERM when the caps
// string could not be decoded or parsed, and 0 otherwise (no caps string
// present — presumably the caller treats that as "not yet authorized";
// confirm against the messenger's handling).
int OSD::ms_handle_authentication(Connection *con)
{
  int ret = 0;
  auto priv = con->get_priv();
  Session *s = static_cast<Session*>(priv.get());
  if (!s) {
    // first time we see this connection: create and attach a session
    s = new Session(cct, con);
    con->set_priv(RefCountedPtr{s, false});
    s->entity_name = con->get_peer_entity_name();
    dout(10) << __func__ << " new session " << s << " con " << s->con
	     << " entity " << s->entity_name
	     << " addr " << con->get_peer_addrs() << dendl;
  } else {
    dout(10) << __func__ << " existing session " << s << " con " << s->con
	     << " entity " << s->entity_name
	     << " addr " << con->get_peer_addrs() << dendl;
  }

  AuthCapsInfo &caps_info = con->get_peer_caps_info();
  if (caps_info.allow_all)
    s->caps.set_allow_all();

  if (caps_info.caps.length() > 0) {
    // decode the encoded caps string, then parse it into OSDCaps
    bufferlist::const_iterator p = caps_info.caps.cbegin();
    string str;
    try {
      decode(str, p);
    }
    catch (buffer::error& e) {
      dout(10) << __func__ << " session " << s << " " << s->entity_name
	       << " failed to decode caps string" << dendl;
      ret = -EPERM;
    }
    if (!ret) {
      bool success = s->caps.parse(str);
      if (success) {
	dout(10) << __func__ << " session " << s
		 << " " << s->entity_name
		 << " has caps " << s->caps << " '" << str << "'" << dendl;
	ret = 1;
      } else {
	dout(10) << __func__ << " session " << s << " " << s->entity_name
		 << " failed to parse caps '" << str << "'" << dendl;
	ret = -EPERM;
      }
    }
  }
  return ret;
}
7664
7665 void OSD::do_waiters()
7666 {
7667 ceph_assert(osd_lock.is_locked());
7668
7669 dout(10) << "do_waiters -- start" << dendl;
7670 while (!finished.empty()) {
7671 OpRequestRef next = finished.front();
7672 finished.pop_front();
7673 dispatch_op(next);
7674 }
7675 dout(10) << "do_waiters -- finish" << dendl;
7676 }
7677
7678 void OSD::dispatch_op(OpRequestRef op)
7679 {
7680 switch (op->get_req()->get_type()) {
7681
7682 case MSG_OSD_PG_CREATE:
7683 handle_pg_create(op);
7684 break;
7685 }
7686 }
7687
// Handle a slow-path message under osd_lock.  Messages are split into
// those that need no OSDMap and those that must wait until we have one.
void OSD::_dispatch(Message *m)
{
  ceph_assert(osd_lock.is_locked());
  dout(20) << "_dispatch " << m << " " << *m << dendl;

  switch (m->get_type()) {
    // -- don't need OSDMap --

    // map and replication
  case CEPH_MSG_OSD_MAP:
    handle_osd_map(static_cast<MOSDMap*>(m));
    break;

    // osd
  case MSG_OSD_SCRUB:
    // legacy scrub request (see handle_scrub; removed post-nautilus)
    handle_scrub(static_cast<MOSDScrub*>(m));
    break;

  case MSG_COMMAND:
    handle_command(static_cast<MCommand*>(m));
    return;

    // -- need OSDMap --

  case MSG_OSD_PG_CREATE:
    {
      // wrap in an OpRequest so it can be tracked/deferred
      OpRequestRef op = op_tracker.create_request<OpRequest, Message*>(m);
      if (m->trace)
        op->osd_trace.init("osd op", &trace_endpoint, &m->trace);
      // no map?  starting up?
      if (!osdmap) {
        // park the op until the first osdmap arrives
        dout(7) << "no OSDMap, not booted" << dendl;
        logger->inc(l_osd_waiting_for_map);
        waiting_for_osdmap.push_back(op);
        op->mark_delayed("no osdmap");
        break;
      }

      // need OSDMap
      dispatch_op(op);
    }
  }
}
7731
// remove me post-nautilus
// Legacy (pre-MOSDScrub2) scrub request from a mon or mgr: queue a
// RequestScrub peering event for the requested PGs (or all local PGs
// when none were named).  Consumes the message.
void OSD::handle_scrub(MOSDScrub *m)
{
  dout(10) << "handle_scrub " << *m << dendl;
  if (!require_mon_or_mgr_peer(m)) {
    // ignore scrub requests from anything but a mon or mgr
    m->put();
    return;
  }
  if (m->fsid != monc->get_fsid()) {
    // wrong cluster
    dout(0) << "handle_scrub fsid " << m->fsid << " != " << monc->get_fsid()
	    << dendl;
    m->put();
    return;
  }

  vector<spg_t> spgs;
  _get_pgids(&spgs);

  if (!m->scrub_pgs.empty()) {
    // restrict to the requested pgs that map to a primary shard we host
    vector<spg_t> v;
    for (auto pgid : m->scrub_pgs) {
      spg_t pcand;
      if (osdmap->get_primary_shard(pgid, &pcand) &&
	  std::find(spgs.begin(), spgs.end(), pcand) != spgs.end()) {
	v.push_back(pcand);
      }
    }
    spgs.swap(v);
  }

  for (auto pgid : spgs) {
    enqueue_peering_evt(
      pgid,
      PGPeeringEventRef(
	std::make_shared<PGPeeringEvent>(
	  get_osdmap_epoch(),
	  get_osdmap_epoch(),
	  PG::RequestScrub(m->deep, m->repair))));
  }

  m->put();
}
7774
7775 void OSD::handle_fast_scrub(MOSDScrub2 *m)
7776 {
7777 dout(10) << __func__ << " " << *m << dendl;
7778 if (!require_mon_or_mgr_peer(m)) {
7779 m->put();
7780 return;
7781 }
7782 if (m->fsid != monc->get_fsid()) {
7783 dout(0) << __func__ << " fsid " << m->fsid << " != " << monc->get_fsid()
7784 << dendl;
7785 m->put();
7786 return;
7787 }
7788 for (auto pgid : m->scrub_pgs) {
7789 enqueue_peering_evt(
7790 pgid,
7791 PGPeeringEventRef(
7792 std::make_shared<PGPeeringEvent>(
7793 m->epoch,
7794 m->epoch,
7795 PG::RequestScrub(m->deep, m->repair))));
7796 }
7797 m->put();
7798 }
7799
7800 bool OSD::scrub_random_backoff()
7801 {
7802 bool coin_flip = (rand() / (double)RAND_MAX >=
7803 cct->_conf->osd_scrub_backoff_ratio);
7804 if (!coin_flip) {
7805 dout(20) << "scrub_random_backoff lost coin flip, randomly backing off" << dendl;
7806 return true;
7807 }
7808 return false;
7809 }
7810
// Construct a scrub job for @pg whose last scrub stamp is @timestamp.
// Unless the scrub was explicitly requested (@must), push sched_time
// forward by the min interval plus a random jitter fraction of it, and
// set the deadline to last-stamp + max interval (pool-level intervals
// override the global osd_scrub_* config when > 0).
OSDService::ScrubJob::ScrubJob(CephContext* cct,
			       const spg_t& pg, const utime_t& timestamp,
			       double pool_scrub_min_interval,
			       double pool_scrub_max_interval, bool must)
  : cct(cct),
    pgid(pg),
    sched_time(timestamp),
    deadline(timestamp)
{
  // if not explicitly requested, postpone the scrub with a random delay
  if (!must) {
    double scrub_min_interval = pool_scrub_min_interval > 0 ?
      pool_scrub_min_interval : cct->_conf->osd_scrub_min_interval;
    double scrub_max_interval = pool_scrub_max_interval > 0 ?
      pool_scrub_max_interval : cct->_conf->osd_scrub_max_interval;

    // sched_time = stamp + min_interval * (1 + randomize_ratio * r),
    // spreading scrubs out instead of clustering them at the interval
    sched_time += scrub_min_interval;
    double r = rand() / (double)RAND_MAX;
    sched_time +=
      scrub_min_interval * cct->_conf->osd_scrub_interval_randomize_ratio * r;
    if (scrub_max_interval == 0) {
      // max interval disabled: no deadline
      deadline = utime_t();
    } else {
      deadline += scrub_max_interval;
    }

  }
}
7839
7840 bool OSDService::ScrubJob::ScrubJob::operator<(const OSDService::ScrubJob& rhs) const {
7841 if (sched_time < rhs.sched_time)
7842 return true;
7843 if (sched_time > rhs.sched_time)
7844 return false;
7845 return pgid < rhs.pgid;
7846 }
7847
// Return true if @now (local time) falls within both the configured
// scrub weekday window and the scrub hour window.  Each window supports
// wraparound: when begin >= end the allowed range wraps past the end of
// the week / day (e.g. hours 22..4).
bool OSD::scrub_time_permit(utime_t now)
{
  struct tm bdt;
  time_t tt = now.sec();
  localtime_r(&tt, &bdt);

  bool day_permit = false;
  if (cct->_conf->osd_scrub_begin_week_day < cct->_conf->osd_scrub_end_week_day) {
    // simple window: [begin, end)
    if (bdt.tm_wday >= cct->_conf->osd_scrub_begin_week_day && bdt.tm_wday < cct->_conf->osd_scrub_end_week_day) {
      day_permit = true;
    }
  } else {
    // wrapped window: [begin, 7) U [0, end)
    if (bdt.tm_wday >= cct->_conf->osd_scrub_begin_week_day || bdt.tm_wday < cct->_conf->osd_scrub_end_week_day) {
      day_permit = true;
    }
  }

  if (!day_permit) {
    dout(20) << __func__ << " should run between week day " << cct->_conf->osd_scrub_begin_week_day
	     << " - " << cct->_conf->osd_scrub_end_week_day
	     << " now " << bdt.tm_wday << " = no" << dendl;
    return false;
  }

  bool time_permit = false;
  if (cct->_conf->osd_scrub_begin_hour < cct->_conf->osd_scrub_end_hour) {
    // simple window: [begin, end)
    if (bdt.tm_hour >= cct->_conf->osd_scrub_begin_hour && bdt.tm_hour < cct->_conf->osd_scrub_end_hour) {
      time_permit = true;
    }
  } else {
    // wrapped window: [begin, 24) U [0, end)
    if (bdt.tm_hour >= cct->_conf->osd_scrub_begin_hour || bdt.tm_hour < cct->_conf->osd_scrub_end_hour) {
      time_permit = true;
    }
  }
  if (!time_permit) {
    dout(20) << __func__ << " should run between " << cct->_conf->osd_scrub_begin_hour
	     << " - " << cct->_conf->osd_scrub_end_hour
	     << " now " << bdt.tm_hour << " = no" << dendl;
  } else {
    dout(20) << __func__ << " should run between " << cct->_conf->osd_scrub_begin_hour
	     << " - " << cct->_conf->osd_scrub_end_hour
	     << " now " << bdt.tm_hour << " = yes" << dendl;
  }
  return time_permit;
}
7893
7894 bool OSD::scrub_load_below_threshold()
7895 {
7896 double loadavgs[3];
7897 if (getloadavg(loadavgs, 3) != 3) {
7898 dout(10) << __func__ << " couldn't read loadavgs\n" << dendl;
7899 return false;
7900 }
7901
7902 // allow scrub if below configured threshold
7903 long cpus = sysconf(_SC_NPROCESSORS_ONLN);
7904 double loadavg_per_cpu = cpus > 0 ? loadavgs[0] / cpus : loadavgs[0];
7905 if (loadavg_per_cpu < cct->_conf->osd_scrub_load_threshold) {
7906 dout(20) << __func__ << " loadavg per cpu " << loadavg_per_cpu
7907 << " < max " << cct->_conf->osd_scrub_load_threshold
7908 << " = yes" << dendl;
7909 return true;
7910 }
7911
7912 // allow scrub if below daily avg and currently decreasing
7913 if (loadavgs[0] < daily_loadavg && loadavgs[0] < loadavgs[2]) {
7914 dout(20) << __func__ << " loadavg " << loadavgs[0]
7915 << " < daily_loadavg " << daily_loadavg
7916 << " and < 15m avg " << loadavgs[2]
7917 << " = yes" << dendl;
7918 return true;
7919 }
7920
7921 dout(20) << __func__ << " loadavg " << loadavgs[0]
7922 << " >= max " << cct->_conf->osd_scrub_load_threshold
7923 << " and ( >= daily_loadavg " << daily_loadavg
7924 << " or >= 15m avg " << loadavgs[2]
7925 << ") = no" << dendl;
7926 return false;
7927 }
7928
void OSD::sched_scrub()
{
  // Walk the scrub-job registry (ordered by sched_time) and start the first
  // eligible job.  Called periodically; relies on
  // OSDService::{first,next}_scrub_stamp() for iteration.

  // if not permitted, fail fast
  if (!service.can_inc_scrubs()) {
    return;
  }
  bool allow_requested_repair_only = false;
  if (service.is_recovery_active()) {
    // During recovery, config decides between: repair-only scheduling,
    // no scheduling at all, or (scrub_during_recovery) normal scheduling.
    if (!cct->_conf->osd_scrub_during_recovery && cct->_conf->osd_repair_during_recovery) {
      dout(10) << __func__
               << " will only schedule explicitly requested repair due to active recovery"
               << dendl;
      allow_requested_repair_only = true;
    } else if (!cct->_conf->osd_scrub_during_recovery && !cct->_conf->osd_repair_during_recovery) {
      dout(20) << __func__ << " not scheduling scrubs due to active recovery" << dendl;
      return;
    }
  }

  utime_t now = ceph_clock_now();
  bool time_permit = scrub_time_permit(now);
  bool load_is_low = scrub_load_below_threshold();
  dout(20) << "sched_scrub load_is_low=" << (int)load_is_low << dendl;

  OSDService::ScrubJob scrub;
  if (service.first_scrub_stamp(&scrub)) {
    do {
      dout(30) << "sched_scrub examine " << scrub.pgid << " at " << scrub.sched_time << dendl;

      // Jobs are ordered by sched_time, so once we hit one in the future
      // nothing later can be due either.
      if (scrub.sched_time > now) {
	// save ourselves some effort
	dout(10) << "sched_scrub " << scrub.pgid << " scheduled at " << scrub.sched_time
		 << " > " << now << dendl;
	break;
      }

      // A job whose deadline has already passed may run even outside the
      // permitted time window and under high load; otherwise both gates
      // must allow it.
      if ((scrub.deadline.is_zero() || scrub.deadline >= now) && !(time_permit && load_is_low)) {
	dout(10) << __func__ << " not scheduling scrub for " << scrub.pgid << " due to "
		 << (!time_permit ? "time not permit" : "high load") << dendl;
	continue;
      }

      PGRef pg = _lookup_lock_pg(scrub.pgid);
      if (!pg)
	continue;
      // This has already started, so go on to the next scrub job
      if (pg->scrubber.active) {
	pg->unlock();
	dout(30) << __func__ << ": already in progress pgid " << scrub.pgid << dendl;
	continue;
      }
      // Skip other kinds of scrubing if only explicitly requested repairing is allowed
      if (allow_requested_repair_only && !pg->scrubber.must_repair) {
	pg->unlock();
	dout(10) << __func__ << " skip " << scrub.pgid
		 << " because repairing is not explicitly requested on it"
		 << dendl;
	continue;
      }
      // If it is reserving, let it resolve before going to the next scrub job
      if (pg->scrubber.local_reserved && !pg->scrubber.active) {
	pg->unlock();
	dout(30) << __func__ << ": reserve in progress pgid " << scrub.pgid << dendl;
	break;
      }
      dout(10) << "sched_scrub scrubbing " << scrub.pgid << " at " << scrub.sched_time
	       << (pg->get_must_scrub() ? ", explicitly requested" :
		   (load_is_low ? ", load_is_low" : " deadline < now"))
	       << dendl;
      // sched_scrub() returning true means the scrub was actually kicked
      // off; we only start one per invocation.
      if (pg->sched_scrub()) {
	pg->unlock();
	break;
      }
      pg->unlock();
    } while (service.next_scrub_stamp(scrub, &scrub));
  }
  dout(20) << "sched_scrub done" << dendl;
}
8007
8008 void OSD::resched_all_scrubs()
8009 {
8010 dout(10) << __func__ << ": start" << dendl;
8011 OSDService::ScrubJob scrub;
8012 if (service.first_scrub_stamp(&scrub)) {
8013 do {
8014 dout(20) << __func__ << ": examine " << scrub.pgid << dendl;
8015
8016 PGRef pg = _lookup_lock_pg(scrub.pgid);
8017 if (!pg)
8018 continue;
8019 if (!pg->scrubber.must_scrub && !pg->scrubber.need_auto) {
8020 dout(20) << __func__ << ": reschedule " << scrub.pgid << dendl;
8021 pg->on_info_history_change();
8022 }
8023 pg->unlock();
8024 } while (service.next_scrub_stamp(scrub, &scrub));
8025 }
8026 dout(10) << __func__ << ": done" << dendl;
8027 }
8028
8029 MPGStats* OSD::collect_pg_stats()
8030 {
8031 // This implementation unconditionally sends every is_primary PG's
8032 // stats every time we're called. This has equivalent cost to the
8033 // previous implementation's worst case where all PGs are busy and
8034 // their stats are always enqueued for sending.
8035 RWLock::RLocker l(map_lock);
8036
8037 utime_t had_for = ceph_clock_now() - had_map_since;
8038 osd_stat_t cur_stat = service.get_osd_stat();
8039 cur_stat.os_perf_stat = store->get_cur_stats();
8040
8041 auto m = new MPGStats(monc->get_fsid(), osdmap->get_epoch(), had_for);
8042 m->osd_stat = cur_stat;
8043
8044 std::lock_guard lec{min_last_epoch_clean_lock};
8045 min_last_epoch_clean = osdmap->get_epoch();
8046 min_last_epoch_clean_pgs.clear();
8047
8048 std::set<int64_t> pool_set;
8049 vector<PGRef> pgs;
8050 _get_pgs(&pgs);
8051 for (auto& pg : pgs) {
8052 auto pool = pg->pg_id.pgid.pool();
8053 pool_set.emplace((int64_t)pool);
8054 if (!pg->is_primary()) {
8055 continue;
8056 }
8057 pg->get_pg_stats([&](const pg_stat_t& s, epoch_t lec) {
8058 m->pg_stat[pg->pg_id.pgid] = s;
8059 min_last_epoch_clean = min(min_last_epoch_clean, lec);
8060 min_last_epoch_clean_pgs.push_back(pg->pg_id.pgid);
8061 });
8062 }
8063 store_statfs_t st;
8064 bool per_pool_stats = false;
8065 for (auto p : pool_set) {
8066 int r = store->pool_statfs(p, &st);
8067 if (r == -ENOTSUP) {
8068 break;
8069 } else {
8070 assert(r >= 0);
8071 m->pool_stat[p] = st;
8072 per_pool_stats = true;
8073 }
8074 }
8075
8076 // indicate whether we are reporting per-pool stats
8077 m->osd_stat.num_osds = 1;
8078 m->osd_stat.num_per_pool_osds = per_pool_stats ? 1 : 0;
8079
8080 return m;
8081 }
8082
8083 vector<DaemonHealthMetric> OSD::get_health_metrics()
8084 {
8085 vector<DaemonHealthMetric> metrics;
8086 {
8087 utime_t oldest_secs;
8088 const utime_t now = ceph_clock_now();
8089 auto too_old = now;
8090 too_old -= cct->_conf.get_val<double>("osd_op_complaint_time");
8091 int slow = 0;
8092 TrackedOpRef oldest_op;
8093 auto count_slow_ops = [&](TrackedOp& op) {
8094 if (op.get_initiated() < too_old) {
8095 lgeneric_subdout(cct,osd,20) << "slow op " << op.get_desc()
8096 << " initiated "
8097 << op.get_initiated() << dendl;
8098 slow++;
8099 if (!oldest_op || op.get_initiated() < oldest_op->get_initiated()) {
8100 oldest_op = &op;
8101 }
8102 return true;
8103 } else {
8104 return false;
8105 }
8106 };
8107 if (op_tracker.visit_ops_in_flight(&oldest_secs, count_slow_ops)) {
8108 if (slow) {
8109 derr << __func__ << " reporting " << slow << " slow ops, oldest is "
8110 << oldest_op->get_desc() << dendl;
8111 }
8112 metrics.emplace_back(daemon_metric::SLOW_OPS, slow, oldest_secs);
8113 } else {
8114 // no news is not good news.
8115 metrics.emplace_back(daemon_metric::SLOW_OPS, 0, 0);
8116 }
8117 }
8118 {
8119 std::lock_guard l(pending_creates_lock);
8120 auto n_primaries = pending_creates_from_mon;
8121 for (const auto& create : pending_creates_from_osd) {
8122 if (create.second) {
8123 n_primaries++;
8124 }
8125 }
8126 metrics.emplace_back(daemon_metric::PENDING_CREATING_PGS, n_primaries);
8127 }
8128 return metrics;
8129 }
8130
8131 // =====================================================
8132 // MAP
8133
8134 void OSD::wait_for_new_map(OpRequestRef op)
8135 {
8136 // ask?
8137 if (waiting_for_osdmap.empty()) {
8138 osdmap_subscribe(osdmap->get_epoch() + 1, false);
8139 }
8140
8141 logger->inc(l_osd_waiting_for_map);
8142 waiting_for_osdmap.push_back(op);
8143 op->mark_delayed("wait for new map");
8144 }
8145
8146
8147 /** update_map
8148 * assimilate new OSDMap(s). scan pgs, etc.
8149 */
8150
8151 void OSD::note_down_osd(int peer)
8152 {
8153 ceph_assert(osd_lock.is_locked());
8154 cluster_messenger->mark_down_addrs(osdmap->get_cluster_addrs(peer));
8155
8156 heartbeat_lock.Lock();
8157 failure_queue.erase(peer);
8158 failure_pending.erase(peer);
8159 map<int,HeartbeatInfo>::iterator p = heartbeat_peers.find(peer);
8160 if (p != heartbeat_peers.end()) {
8161 p->second.con_back->mark_down();
8162 if (p->second.con_front) {
8163 p->second.con_front->mark_down();
8164 }
8165 heartbeat_peers.erase(p);
8166 }
8167 heartbeat_lock.Unlock();
8168 }
8169
void OSD::note_up_osd(int peer)
{
  // A peer came (back) up: forget any stale per-peer epoch we cached for it
  // and flag the heartbeat peer set for a refresh.
  service.forget_peer_epoch(peer, osdmap->get_epoch() - 1);
  heartbeat_set_peers_need_update();
}
8175
// Completion context registered on the objectstore transaction that persists
// a batch of new osdmaps; once the maps are durable it lets the OSD advance
// to them via _committed_osd_maps().
struct C_OnMapCommit : public Context {
  OSD *osd;
  epoch_t first, last;  // inclusive epoch range that was committed
  MOSDMap *msg;         // message ref held until finish(); released there
  C_OnMapCommit(OSD *o, epoch_t f, epoch_t l, MOSDMap *m)
    : osd(o), first(f), last(l), msg(m) {}
  void finish(int r) override {
    osd->_committed_osd_maps(first, last, msg);
    msg->put();
  }
};
8187
8188 void OSD::osdmap_subscribe(version_t epoch, bool force_request)
8189 {
8190 std::lock_guard l(osdmap_subscribe_lock);
8191 if (latest_subscribed_epoch >= epoch && !force_request)
8192 return;
8193
8194 latest_subscribed_epoch = std::max<uint64_t>(epoch, latest_subscribed_epoch);
8195
8196 if (monc->sub_want_increment("osdmap", epoch, CEPH_SUBSCRIBE_ONETIME) ||
8197 force_request) {
8198 monc->renew_subs();
8199 }
8200 }
8201
void OSD::trim_maps(epoch_t oldest, int nreceived, bool skip_maps)
{
  // Delete stored full+incremental osdmaps older than both `oldest` and the
  // lower bound still pinned by the map cache, advancing
  // superblock.oldest_map as we go.  `nreceived` is how many maps just
  // arrived; trimming at least that many per call keeps deletion in pace
  // with ingest.
  epoch_t min = std::min(oldest, service.map_cache.cached_key_lower_bound());
  if (min <= superblock.oldest_map)
    return;

  int num = 0;
  ObjectStore::Transaction t;
  for (epoch_t e = superblock.oldest_map; e < min; ++e) {
    dout(20) << " removing old osdmap epoch " << e << dendl;
    t.remove(coll_t::meta(), get_osdmap_pobject_name(e));
    t.remove(coll_t::meta(), get_inc_osdmap_pobject_name(e));
    superblock.oldest_map = e + 1;
    num++;
    // Flush a batch once it is both big enough for a transaction and has
    // kept pace with the maps we just received.
    if (num >= cct->_conf->osd_target_transaction_size && num >= nreceived) {
      service.publish_superblock(superblock);
      write_superblock(t);
      int tr = store->queue_transaction(service.meta_ch, std::move(t), nullptr);
      ceph_assert(tr == 0);
      num = 0;
      if (!skip_maps) {
	// skip_maps leaves us with a range of old maps if we fail to remove all
	// of them before moving superblock.oldest_map forward to the first map
	// in the incoming MOSDMap msg. so we should continue removing them in
	// this case, even we could do huge series of delete transactions all at
	// once.
	break;
      }
    }
  }
  // Flush whatever is left in the final partial batch.
  if (num > 0) {
    service.publish_superblock(superblock);
    write_superblock(t);
    int tr = store->queue_transaction(service.meta_ch, std::move(t), nullptr);
    ceph_assert(tr == 0);
  }
  // we should not remove the cached maps
  ceph_assert(min <= service.map_cache.cached_key_lower_bound());
}
8241
void OSD::handle_osd_map(MOSDMap *m)
{
  // Ingest a batch of new osdmaps: throttle against slow PGs, validate the
  // sender, persist the full/incremental maps to the meta collection,
  // record pool-deletion and pg_num history, then queue the transaction.
  // The actual switch to the new maps happens in _committed_osd_maps()
  // once the transaction commits (via C_OnMapCommit).  Called with
  // osd_lock held; consumes the message ref on every return path.

  // wait for pgs to catch up
  {
    // we extend the map cache pins to accomodate pgs slow to consume maps
    // for some period, until we hit the max_lag_factor bound, at which point
    // we block here to stop injesting more maps than they are able to keep
    // up with.
    epoch_t max_lag = cct->_conf->osd_map_cache_size *
      m_osd_pg_epoch_max_lag_factor;
    ceph_assert(max_lag > 0);
    epoch_t osd_min = 0;
    for (auto shard : shards) {
      epoch_t min = shard->get_min_pg_epoch();
      if (osd_min == 0 || min < osd_min) {
	osd_min = min;
      }
    }
    if (osd_min > 0 &&
	osdmap->get_epoch() > max_lag &&
	osdmap->get_epoch() - max_lag > osd_min) {
      epoch_t need = osdmap->get_epoch() - max_lag;
      dout(10) << __func__ << " waiting for pgs to catch up (need " << need
	       << " max_lag " << max_lag << ")" << dendl;
      for (auto shard : shards) {
	epoch_t min = shard->get_min_pg_epoch();
	if (need > min) {
	  dout(10) << __func__ << " waiting for pgs to consume " << need
		   << " (shard " << shard->shard_id << " min " << min
		   << ", map cache is " << cct->_conf->osd_map_cache_size
		   << ", max_lag_factor " << m_osd_pg_epoch_max_lag_factor
		   << ")" << dendl;
	  // drop osd_lock while blocked so PGs can make progress
	  unlock_guard unlock{osd_lock};
	  shard->wait_min_pg_epoch(need);
	}
      }
    }
  }

  ceph_assert(osd_lock.is_locked());
  map<epoch_t,OSDMapRef> added_maps;
  map<epoch_t,bufferlist> added_maps_bl;
  if (m->fsid != monc->get_fsid()) {
    dout(0) << "handle_osd_map fsid " << m->fsid << " != "
	    << monc->get_fsid() << dendl;
    m->put();
    return;
  }
  if (is_initializing()) {
    dout(0) << "ignoring osdmap until we have initialized" << dendl;
    m->put();
    return;
  }

  // only mons and osds are trusted sources of osdmaps
  auto priv = m->get_connection()->get_priv();
  if (auto session = static_cast<Session *>(priv.get());
      session && !(session->entity_name.is_mon() ||
		   session->entity_name.is_osd())) {
    //not enough perms!
    dout(10) << "got osd map from Session " << session
             << " which we can't take maps from (not a mon or osd)" << dendl;
    m->put();
    return;
  }

  // share with the objecter
  if (!is_preboot())
    service.objecter->handle_osd_map(m);

  epoch_t first = m->get_first();
  epoch_t last = m->get_last();
  dout(3) << "handle_osd_map epochs [" << first << "," << last << "], i have "
	  << superblock.newest_map
	  << ", src has [" << m->oldest_map << "," << m->newest_map << "]"
	  << dendl;

  logger->inc(l_osd_map);
  logger->inc(l_osd_mape, last - first + 1);
  if (first <= superblock.newest_map)
    logger->inc(l_osd_mape_dup, superblock.newest_map - first + 1);
  if (service.max_oldest_map < m->oldest_map) {
    service.max_oldest_map = m->oldest_map;
    ceph_assert(service.max_oldest_map >= superblock.oldest_map);
  }

  // make sure there is something new, here, before we bother flushing
  // the queues and such
  if (last <= superblock.newest_map) {
    dout(10) << " no new maps here, dropping" << dendl;
    m->put();
    return;
  }

  // missing some?
  bool skip_maps = false;
  if (first > superblock.newest_map + 1) {
    dout(10) << "handle_osd_map message skips epochs "
	     << superblock.newest_map + 1 << ".." << (first-1) << dendl;
    if (m->oldest_map <= superblock.newest_map + 1) {
      osdmap_subscribe(superblock.newest_map + 1, false);
      m->put();
      return;
    }
    // always try to get the full range of maps--as many as we can. this
    // 1- is good to have
    // 2- is at present the only way to ensure that we get a *full* map as
    // the first map!
    if (m->oldest_map < first) {
      osdmap_subscribe(m->oldest_map - 1, true);
      m->put();
      return;
    }
    skip_maps = true;
  }

  ObjectStore::Transaction t;
  uint64_t txn_size = 0;

  // store new maps: queue for disk and put in the osdmap cache
  epoch_t start = std::max(superblock.newest_map + 1, first);
  for (epoch_t e = start; e <= last; e++) {
    // t.get_num_bytes() must grow monotonically as we append writes;
    // a decrease would indicate an internal overflow.
    if (txn_size >= t.get_num_bytes()) {
      derr << __func__ << " transaction size overflowed" << dendl;
      ceph_assert(txn_size < t.get_num_bytes());
    }
    txn_size = t.get_num_bytes();
    map<epoch_t,bufferlist>::iterator p;
    p = m->maps.find(e);
    if (p != m->maps.end()) {
      // full map: persist and cache it as-is
      dout(10) << "handle_osd_map got full map for epoch " << e << dendl;
      OSDMap *o = new OSDMap;
      bufferlist& bl = p->second;

      o->decode(bl);

      ghobject_t fulloid = get_osdmap_pobject_name(e);
      t.write(coll_t::meta(), fulloid, 0, bl.length(), bl);
      added_maps[e] = add_map(o);
      added_maps_bl[e] = bl;
      got_full_map(e);
      continue;
    }

    p = m->incremental_maps.find(e);
    if (p != m->incremental_maps.end()) {
      // incremental: apply on top of e-1 (from store or this same batch)
      // and persist both the inc and the re-derived full map.
      dout(10) << "handle_osd_map got inc map for epoch " << e << dendl;
      bufferlist& bl = p->second;
      ghobject_t oid = get_inc_osdmap_pobject_name(e);
      t.write(coll_t::meta(), oid, 0, bl.length(), bl);

      OSDMap *o = new OSDMap;
      if (e > 1) {
	bufferlist obl;
	bool got = get_map_bl(e - 1, obl);
	if (!got) {
	  auto p = added_maps_bl.find(e - 1);
	  ceph_assert(p != added_maps_bl.end());
	  obl = p->second;
	}
	o->decode(obl);
      }

      OSDMap::Incremental inc;
      auto p = bl.cbegin();
      inc.decode(p);

      if (o->apply_incremental(inc) < 0) {
	derr << "ERROR: bad fsid? i have " << osdmap->get_fsid() << " and inc has " << inc.fsid << dendl;
	ceph_abort_msg("bad fsid");
      }

      bufferlist fbl;
      o->encode(fbl, inc.encode_features | CEPH_FEATURE_RESERVED);

      // optional fault injection for crc-mismatch handling below
      bool injected_failure = false;
      if (cct->_conf->osd_inject_bad_map_crc_probability > 0 &&
	  (rand() % 10000) < cct->_conf->osd_inject_bad_map_crc_probability*10000.0) {
	derr << __func__ << " injecting map crc failure" << dendl;
	injected_failure = true;
      }

      if ((inc.have_crc && o->get_crc() != inc.full_crc) || injected_failure) {
	// our re-derived full map doesn't match the mon's crc; fall back
	// to requesting full maps from e onward and stop ingesting here.
	dout(2) << "got incremental " << e
		<< " but failed to encode full with correct crc; requesting"
		<< dendl;
	clog->warn() << "failed to encode map e" << e << " with expected crc";
	dout(20) << "my encoded map was:\n";
	fbl.hexdump(*_dout);
	*_dout << dendl;
	delete o;
	request_full_map(e, last);
	last = e - 1;
	break;
      }
      got_full_map(e);

      ghobject_t fulloid = get_osdmap_pobject_name(e);
      t.write(coll_t::meta(), fulloid, 0, fbl.length(), fbl);
      added_maps[e] = add_map(o);
      added_maps_bl[e] = fbl;
      continue;
    }

    ceph_abort_msg("MOSDMap lied about what maps it had?");
  }

  // even if this map isn't from a mon, we may have satisfied our subscription
  monc->sub_got("osdmap", last);

  if (!m->maps.empty() && requested_full_first) {
    dout(10) << __func__ << " still missing full maps " << requested_full_first
	     << ".." << requested_full_last << dendl;
    rerequest_full_maps();
  }

  if (superblock.oldest_map) {
    // make sure we at least keep pace with incoming maps
    trim_maps(m->oldest_map, last - first + 1, skip_maps);
    pg_num_history.prune(superblock.oldest_map);
  }

  if (!superblock.oldest_map || skip_maps)
    superblock.oldest_map = first;
  superblock.newest_map = last;
  superblock.current_epoch = last;

  // note in the superblock that we were clean thru the prior epoch
  epoch_t boot_epoch = service.get_boot_epoch();
  if (boot_epoch && boot_epoch >= superblock.mounted) {
    superblock.mounted = boot_epoch;
    superblock.clean_thru = last;
  }

  // check for pg_num changes and deleted pools
  OSDMapRef lastmap;
  for (auto& i : added_maps) {
    if (!lastmap) {
      if (!(lastmap = service.try_get_map(i.first - 1))) {
	dout(10) << __func__ << " can't get previous map " << i.first - 1
		 << " probably first start of this osd" << dendl;
	continue;
      }
    }
    ceph_assert(lastmap->get_epoch() + 1 == i.second->get_epoch());
    for (auto& j : lastmap->get_pools()) {
      if (!i.second->have_pg_pool(j.first)) {
	pg_num_history.log_pool_delete(i.first, j.first);
	dout(10) << __func__ << " recording final pg_pool_t for pool "
		 << j.first << dendl;
	// this information is needed by _make_pg() if have to restart before
	// the pool is deleted and need to instantiate a new (zombie) PG[Pool].
	ghobject_t obj = make_final_pool_info_oid(j.first);
	bufferlist bl;
	encode(j.second, bl, CEPH_FEATURES_ALL);
	string name = lastmap->get_pool_name(j.first);
	encode(name, bl);
	map<string,string> profile;
	if (lastmap->get_pg_pool(j.first)->is_erasure()) {
	  profile = lastmap->get_erasure_code_profile(
	    lastmap->get_pg_pool(j.first)->erasure_code_profile);
	}
	encode(profile, bl);
	t.write(coll_t::meta(), obj, 0, bl.length(), bl);
	service.store_deleted_pool_pg_num(j.first, j.second.get_pg_num());
      } else if (unsigned new_pg_num = i.second->get_pg_num(j.first);
		 new_pg_num != j.second.get_pg_num()) {
	dout(10) << __func__ << " recording pool " << j.first << " pg_num "
		 << j.second.get_pg_num() << " -> " << new_pg_num << dendl;
	pg_num_history.log_pg_num_change(i.first, j.first, new_pg_num);
      }
    }
    // pools that exist in the new map but not the old one
    for (auto& j : i.second->get_pools()) {
      if (!lastmap->have_pg_pool(j.first)) {
	dout(10) << __func__ << " recording new pool " << j.first << " pg_num "
		 << j.second.get_pg_num() << dendl;
	pg_num_history.log_pg_num_change(i.first, j.first,
					 j.second.get_pg_num());
      }
    }
    lastmap = i.second;
  }
  pg_num_history.epoch = last;
  {
    bufferlist bl;
    ::encode(pg_num_history, bl);
    t.write(coll_t::meta(), make_pg_num_history_oid(), 0, bl.length(), bl);
    dout(20) << __func__ << " pg_num_history " << pg_num_history << dendl;
  }

  // superblock and commit
  write_superblock(t);
  t.register_on_commit(new C_OnMapCommit(this, start, last, m));
  store->queue_transaction(
    service.meta_ch,
    std::move(t));
  service.publish_superblock(superblock);
}
8539
void OSD::_committed_osd_maps(epoch_t first, epoch_t last, MOSDMap *m)
{
  // Called (from C_OnMapCommit) once the osdmaps [first,last] are durable.
  // Advances the in-memory osdmap epoch by epoch, reacting to peers going
  // up/down, then handles what the newest map says about *us*: become
  // active, restart (rebind and reboot), or shut down.  Caller still holds
  // a ref on `m`; it is released by C_OnMapCommit::finish after we return.
  dout(10) << __func__ << " " << first << ".." << last << dendl;
  if (is_stopping()) {
    dout(10) << __func__ << " bailing, we are shutting down" << dendl;
    return;
  }
  std::lock_guard l(osd_lock);
  // re-check under osd_lock: shutdown may have raced with the commit
  if (is_stopping()) {
    dout(10) << __func__ << " bailing, we are shutting down" << dendl;
    return;
  }
  map_lock.get_write();

  bool do_shutdown = false;
  bool do_restart = false;
  bool network_error = false;

  // advance through the new maps
  for (epoch_t cur = first; cur <= last; cur++) {
    dout(10) << " advance to epoch " << cur
	     << " (<= last " << last
	     << " <= newest_map " << superblock.newest_map
	     << ")" << dendl;

    OSDMapRef newmap = get_map(cur);
    ceph_assert(newmap); // we just cached it above!

    // start blacklisting messages sent to peers that go down.
    service.pre_publish_map(newmap);

    // kill connections to newly down osds
    bool waited_for_reservations = false;
    set<int> old;
    osdmap->get_all_osds(old);
    for (set<int>::iterator p = old.begin(); p != old.end(); ++p) {
      if (*p != whoami &&
	  osdmap->is_up(*p) && // in old map
	  newmap->is_down(*p)) { // but not the new one
	if (!waited_for_reservations) {
	  // only wait once per epoch, before the first note_down_osd()
	  service.await_reserved_maps();
	  waited_for_reservations = true;
	}
	note_down_osd(*p);
      } else if (*p != whoami &&
		 osdmap->is_down(*p) &&
		 newmap->is_up(*p)) {
	note_up_osd(*p);
      }
    }

    if (osdmap->is_noup(whoami) != newmap->is_noup(whoami)) {
      dout(10) << __func__ << " NOUP flag changed in " << newmap->get_epoch()
	       << dendl;
      if (is_booting()) {
	// this captures the case where we sent the boot message while
	// NOUP was being set on the mon and our boot request was
	// dropped, and then later it is cleared. it imperfectly
	// handles the case where our original boot message was not
	// dropped and we restart even though we might have booted, but
	// that is harmless (boot will just take slightly longer).
	do_restart = true;
      }
    }

    osdmap = newmap;
    // record our up/boot epochs the first time the map shows us up at our
    // current address
    epoch_t up_epoch;
    epoch_t boot_epoch;
    service.retrieve_epochs(&boot_epoch, &up_epoch, NULL);
    if (!up_epoch &&
	osdmap->is_up(whoami) &&
	osdmap->get_addrs(whoami) == client_messenger->get_myaddrs()) {
      up_epoch = osdmap->get_epoch();
      dout(10) << "up_epoch is " << up_epoch << dendl;
      if (!boot_epoch) {
	boot_epoch = osdmap->get_epoch();
	dout(10) << "boot_epoch is " << boot_epoch << dendl;
      }
      service.set_epochs(&boot_epoch, &up_epoch, NULL);
    }
  }

  had_map_since = ceph_clock_now();

  // booting -> active once the map shows us up at our bound address
  epoch_t _bind_epoch = service.get_bind_epoch();
  if (osdmap->is_up(whoami) &&
      osdmap->get_addrs(whoami).legacy_equals(
	client_messenger->get_myaddrs()) &&
      _bind_epoch < osdmap->get_up_from(whoami)) {

    if (is_booting()) {
      dout(1) << "state: booting -> active" << dendl;
      set_state(STATE_ACTIVE);
      do_restart = false;

      // set incarnation so that osd_reqid_t's we generate for our
      // objecter requests are unique across restarts.
      service.objecter->set_client_incarnation(osdmap->get_epoch());
      cancel_pending_failures();
    }
  }

  if (osdmap->get_epoch() > 0 &&
      is_active()) {
    if (!osdmap->exists(whoami)) {
      dout(0) << "map says i do not exist. shutting down." << dendl;
      do_shutdown = true; // don't call shutdown() while we have
			  // everything paused
    } else if (!osdmap->is_up(whoami) ||
	       !osdmap->get_addrs(whoami).legacy_equals(
		 client_messenger->get_myaddrs()) ||
	       !osdmap->get_cluster_addrs(whoami).legacy_equals(
		 cluster_messenger->get_myaddrs()) ||
	       !osdmap->get_hb_back_addrs(whoami).legacy_equals(
		 hb_back_server_messenger->get_myaddrs()) ||
	       !osdmap->get_hb_front_addrs(whoami).legacy_equals(
		 hb_front_server_messenger->get_myaddrs())) {
      // marked down, or one of our advertised addresses is wrong: log why
      if (!osdmap->is_up(whoami)) {
	if (service.is_preparing_to_stop() || service.is_stopping()) {
	  service.got_stop_ack();
	} else {
	  clog->warn() << "Monitor daemon marked osd." << whoami << " down, "
	    "but it is still running";
	  clog->debug() << "map e" << osdmap->get_epoch()
			<< " wrongly marked me down at e"
			<< osdmap->get_down_at(whoami);
	}
      } else if (!osdmap->get_addrs(whoami).legacy_equals(
		   client_messenger->get_myaddrs())) {
	clog->error() << "map e" << osdmap->get_epoch()
		      << " had wrong client addr (" << osdmap->get_addrs(whoami)
		      << " != my " << client_messenger->get_myaddrs() << ")";
      } else if (!osdmap->get_cluster_addrs(whoami).legacy_equals(
		   cluster_messenger->get_myaddrs())) {
	clog->error() << "map e" << osdmap->get_epoch()
		      << " had wrong cluster addr ("
		      << osdmap->get_cluster_addrs(whoami)
		      << " != my " << cluster_messenger->get_myaddrs() << ")";
      } else if (!osdmap->get_hb_back_addrs(whoami).legacy_equals(
		   hb_back_server_messenger->get_myaddrs())) {
	clog->error() << "map e" << osdmap->get_epoch()
		      << " had wrong heartbeat back addr ("
		      << osdmap->get_hb_back_addrs(whoami)
		      << " != my " << hb_back_server_messenger->get_myaddrs()
		      << ")";
      } else if (!osdmap->get_hb_front_addrs(whoami).legacy_equals(
		   hb_front_server_messenger->get_myaddrs())) {
	clog->error() << "map e" << osdmap->get_epoch()
		      << " had wrong heartbeat front addr ("
		      << osdmap->get_hb_front_addrs(whoami)
		      << " != my " << hb_front_server_messenger->get_myaddrs()
		      << ")";
      }

      if (!service.is_stopping()) {
	// prepare to rebind and reboot
	epoch_t up_epoch = 0;
	epoch_t bind_epoch = osdmap->get_epoch();
	service.set_epochs(NULL,&up_epoch, &bind_epoch);
	do_restart = true;

	//add markdown log
	utime_t now = ceph_clock_now();
	utime_t grace = utime_t(cct->_conf->osd_max_markdown_period, 0);
	osd_markdown_log.push_back(now);
	//clear all out-of-date log
	while (!osd_markdown_log.empty() &&
	       osd_markdown_log.front() + grace < now)
	  osd_markdown_log.pop_front();
	// too many markdowns within the grace window: give up and shut down
	if ((int)osd_markdown_log.size() > cct->_conf->osd_max_markdown_count) {
	  dout(0) << __func__ << " marked down "
		  << osd_markdown_log.size()
		  << " > osd_max_markdown_count "
		  << cct->_conf->osd_max_markdown_count
		  << " in last " << grace << " seconds, shutting down"
		  << dendl;
	  do_restart = false;
	  do_shutdown = true;
	}

	start_waiting_for_healthy();

	set<int> avoid_ports;
#if defined(__FreeBSD__)
	// prevent FreeBSD from grabbing the client_messenger port during
	// rebinding. In which case a cluster_meesneger will connect also
	// to the same port
	client_messenger->get_myaddrs().get_ports(&avoid_ports);
#endif
	cluster_messenger->get_myaddrs().get_ports(&avoid_ports);
	hb_back_server_messenger->get_myaddrs().get_ports(&avoid_ports);
	hb_front_server_messenger->get_myaddrs().get_ports(&avoid_ports);

	int r = cluster_messenger->rebind(avoid_ports);
	if (r != 0) {
	  do_shutdown = true; // FIXME: do_restart?
	  network_error = true;
	  dout(0) << __func__ << " marked down:"
		  << " rebind cluster_messenger failed" << dendl;
	}

	r = hb_back_server_messenger->rebind(avoid_ports);
	if (r != 0) {
	  do_shutdown = true; // FIXME: do_restart?
	  network_error = true;
	  dout(0) << __func__ << " marked down:"
		  << " rebind hb_back_server_messenger failed" << dendl;
	}

	r = hb_front_server_messenger->rebind(avoid_ports);
	if (r != 0) {
	  do_shutdown = true; // FIXME: do_restart?
	  network_error = true;
	  dout(0) << __func__ << " marked down:"
		  << " rebind hb_front_server_messenger failed" << dendl;
	}

	hb_front_client_messenger->mark_down_all();
	hb_back_client_messenger->mark_down_all();

	reset_heartbeat_peers(true);
      }
    }
  }

  map_lock.put_write();

  check_osdmap_features();

  // yay!
  consume_map();

  if (is_active() || is_waiting_for_healthy())
    maybe_update_heartbeat_peers();

  if (is_active()) {
    activate_map();
  }

  // post-advance actions; at most one of these branches fires
  if (do_shutdown) {
    if (network_error) {
      cancel_pending_failures();
    }
    // trigger shutdown in a different thread
    dout(0) << __func__ << " shutdown OSD via async signal" << dendl;
    queue_async_signal(SIGINT);
  }
  else if (m->newest_map && m->newest_map > last) {
    dout(10) << " msg say newest map is " << m->newest_map
	     << ", requesting more" << dendl;
    osdmap_subscribe(osdmap->get_epoch()+1, false);
  }
  else if (is_preboot()) {
    if (m->get_source().is_mon())
      _preboot(m->oldest_map, m->newest_map);
    else
      start_boot();
  }
  else if (do_restart)
    start_boot();

}
8801
// Recompute messenger feature requirements and on-disk compat bits after an
// osdmap change: adjust required feature masks for client, mon, and osd
// messenger policies; persist the SHARDS (erasure-code) incompat flag once
// required; relax heartbeat authorizer requirements on pre-nautilus
// clusters; and record require_osd_release in the store meta when it changes.
8802 void OSD::check_osdmap_features()
8803 {
8804 // adjust required feature bits?
8805 
8806 // we have to be a bit careful here, because we are accessing the
8807 // Policy structures without taking any lock. in particular, only
8808 // modify integer values that can safely be read by a racing CPU.
8809 // since we are only accessing existing Policy structures at their
8810 // current memory location, and setting or clearing bits in integer
8811 // fields, and we are the only writer, this is not a problem.
8812 
// clients: adjust the required-features bits on the default policy
8813 {
8814 Messenger::Policy p = client_messenger->get_default_policy();
8815 uint64_t mask;
8816 uint64_t features = osdmap->get_features(entity_name_t::TYPE_CLIENT, &mask);
8817 if ((p.features_required & mask) != features) {
8818 dout(0) << "crush map has features " << features
8819 << ", adjusting msgr requires for clients" << dendl;
8820 p.features_required = (p.features_required & ~mask) | features;
8821 client_messenger->set_default_policy(p);
8822 }
8823 }
// mons: adjust the per-type policy on the client messenger
8824 {
8825 Messenger::Policy p = client_messenger->get_policy(entity_name_t::TYPE_MON);
8826 uint64_t mask;
8827 uint64_t features = osdmap->get_features(entity_name_t::TYPE_MON, &mask);
8828 if ((p.features_required & mask) != features) {
8829 dout(0) << "crush map has features " << features
8830 << " was " << p.features_required
8831 << ", adjusting msgr requires for mons" << dendl;
8832 p.features_required = (p.features_required & ~mask) | features;
8833 client_messenger->set_policy(entity_name_t::TYPE_MON, p);
8834 }
8835 }
// osds: adjust the cluster messenger policy, and persist the SHARDS
// superblock compat bit (written once, via a meta-collection transaction)
8836 {
8837 Messenger::Policy p = cluster_messenger->get_policy(entity_name_t::TYPE_OSD);
8838 uint64_t mask;
8839 uint64_t features = osdmap->get_features(entity_name_t::TYPE_OSD, &mask);
8840 
8841 if ((p.features_required & mask) != features) {
8842 dout(0) << "crush map has features " << features
8843 << ", adjusting msgr requires for osds" << dendl;
8844 p.features_required = (p.features_required & ~mask) | features;
8845 cluster_messenger->set_policy(entity_name_t::TYPE_OSD, p);
8846 }
8847 
8848 if (!superblock.compat_features.incompat.contains(CEPH_OSD_FEATURE_INCOMPAT_SHARDS)) {
8849 dout(0) << __func__ << " enabling on-disk ERASURE CODES compat feature" << dendl;
8850 superblock.compat_features.incompat.insert(CEPH_OSD_FEATURE_INCOMPAT_SHARDS);
8851 ObjectStore::Transaction t;
8852 write_superblock(t);
8853 int err = store->queue_transaction(service.meta_ch, std::move(t), NULL);
8854 ceph_assert(err == 0);
8855 }
8856 }
8857 
// pre-nautilus clusters cannot supply heartbeat authorizers
8858 if (osdmap->require_osd_release < CEPH_RELEASE_NAUTILUS) {
8859 heartbeat_dispatcher.ms_set_require_authorizer(false);
8860 }
8861 
// persist require_osd_release to store meta so external tools can read it
// without decoding a full osdmap
8862 if (osdmap->require_osd_release != last_require_osd_release) {
8863 dout(1) << __func__ << " require_osd_release " << last_require_osd_release
8864 << " -> " << to_string(osdmap->require_osd_release) << dendl;
8865 store->write_meta("require_osd_release",
8866 stringify((int)osdmap->require_osd_release));
8867 last_require_osd_release = osdmap->require_osd_release;
8868 }
8869 }
8870
8871 struct C_FinishSplits : public Context {
8872 OSD *osd;
8873 set<PGRef> pgs;
8874 C_FinishSplits(OSD *osd, const set<PGRef> &in)
8875 : osd(osd), pgs(in) {}
8876 void finish(int r) override {
8877 osd->_finish_splits(pgs);
8878 }
8879 };
8880
8881 void OSD::_finish_splits(set<PGRef>& pgs)
8882 {
8883 dout(10) << __func__ << " " << pgs << dendl;
8884 if (is_stopping())
8885 return;
8886 PG::RecoveryCtx rctx = create_context();
8887 for (set<PGRef>::iterator i = pgs.begin();
8888 i != pgs.end();
8889 ++i) {
8890 PG *pg = i->get();
8891
8892 pg->lock();
8893 dout(10) << __func__ << " " << *pg << dendl;
8894 epoch_t e = pg->get_osdmap_epoch();
8895 pg->handle_initialize(&rctx);
8896 pg->queue_null(e, e);
8897 dispatch_context_transaction(rctx, pg);
8898 pg->unlock();
8899
8900 unsigned shard_index = pg->pg_id.hash_to_shard(num_shards);
8901 shards[shard_index]->register_and_wake_split_child(pg);
8902 }
8903
8904 dispatch_context(rctx, 0, service.get_osdmap());
8905 };
8906
8907 bool OSD::add_merge_waiter(OSDMapRef nextmap, spg_t target, PGRef src,
8908 unsigned need)
8909 {
8910 std::lock_guard l(merge_lock);
8911 auto& p = merge_waiters[nextmap->get_epoch()][target];
8912 p[src->pg_id] = src;
8913 dout(10) << __func__ << " added merge_waiter " << src->pg_id
8914 << " for " << target << ", have " << p.size() << "/" << need
8915 << dendl;
8916 return p.size() == need;
8917 }
8918
// Walk `pg` forward one map at a time from its current epoch to
// osd_epoch, handling pool pg_num changes along the way: a merge source
// detaches itself and parks on the merge_waiters list; a merge target
// absorbs its sources once they have all arrived; a split spawns child
// PGs (finished later via C_FinishSplits).  Returns true when the PG
// was fully advanced, false when it was consumed (merge) or must wait.
// Caller must hold the PG lock; on the false paths the lock has been
// released before return.
8919 bool OSD::advance_pg(
8920 epoch_t osd_epoch,
8921 PG *pg,
8922 ThreadPool::TPHandle &handle,
8923 PG::RecoveryCtx *rctx)
8924 {
8925 if (osd_epoch <= pg->get_osdmap_epoch()) {
8926 return true;
8927 }
8928 ceph_assert(pg->is_locked());
8929 OSDMapRef lastmap = pg->get_osdmap();
8930 ceph_assert(lastmap->get_epoch() < osd_epoch);
8931 set<PGRef> new_pgs; // any split children
8932 bool ret = true;
8933 
8934 unsigned old_pg_num = lastmap->have_pg_pool(pg->pg_id.pool()) ?
8935 lastmap->get_pg_num(pg->pg_id.pool()) : 0;
8936 for (epoch_t next_epoch = pg->get_osdmap_epoch() + 1;
8937 next_epoch <= osd_epoch;
8938 ++next_epoch) {
// skip epochs whose full map is not available locally; the PG simply
// jumps to the next map we do have
8939 OSDMapRef nextmap = service.try_get_map(next_epoch);
8940 if (!nextmap) {
8941 dout(20) << __func__ << " missing map " << next_epoch << dendl;
8942 continue;
8943 }
8944 
8945 unsigned new_pg_num =
8946 (old_pg_num && nextmap->have_pg_pool(pg->pg_id.pool())) ?
8947 nextmap->get_pg_num(pg->pg_id.pool()) : 0;
8948 if (old_pg_num && new_pg_num && old_pg_num != new_pg_num) {
8949 // check for merge
8950 if (nextmap->have_pg_pool(pg->pg_id.pool())) {
8951 spg_t parent;
8952 if (pg->pg_id.is_merge_source(
8953 old_pg_num,
8954 new_pg_num,
8955 &parent)) {
8956 // we are merge source
8957 PGRef spg = pg; // carry a ref
8958 dout(1) << __func__ << " " << pg->pg_id
8959 << " is merge source, target is " << parent
8960 << dendl;
8961 pg->write_if_dirty(rctx);
8962 dispatch_context_transaction(*rctx, pg, &handle);
8963 pg->ch->flush();
8964 // release backoffs explicitly, since the on_shutdown path
8965 // aggressively tears down backoff state.
8966 if (pg->is_primary()) {
8967 pg->release_pg_backoffs();
8968 }
8969 pg->on_shutdown();
8970 OSDShard *sdata = pg->osd_shard;
8971 {
8972 std::lock_guard l(sdata->shard_lock);
8973 if (pg->pg_slot) {
8974 sdata->_detach_pg(pg->pg_slot);
8975 // update pg count now since we might not get an osdmap
8976 // any time soon.
8977 if (pg->is_primary())
8978 logger->dec(l_osd_pg_primary);
8979 else if (pg->is_replica())
8980 logger->dec(l_osd_pg_replica);
8981 else
8982 logger->dec(l_osd_pg_stray);
8983 }
8984 }
8985 pg->unlock();
8986 
// park ourselves on the target's waiter list; the last source to
// arrive wakes the target with a null event
8987 set<spg_t> children;
8988 parent.is_split(new_pg_num, old_pg_num, &children);
8989 if (add_merge_waiter(nextmap, parent, pg, children.size())) {
8990 enqueue_peering_evt(
8991 parent,
8992 PGPeeringEventRef(
8993 std::make_shared<PGPeeringEvent>(
8994 nextmap->get_epoch(),
8995 nextmap->get_epoch(),
8996 NullEvt())));
8997 }
8998 ret = false;
8999 goto out;
9000 } else if (pg->pg_id.is_merge_target(old_pg_num, new_pg_num)) {
9001 // we are merge target
9002 set<spg_t> children;
9003 pg->pg_id.is_split(new_pg_num, old_pg_num, &children);
9004 dout(20) << __func__ << " " << pg->pg_id
9005 << " is merge target, sources are " << children
9006 << dendl;
9007 map<spg_t,PGRef> sources;
9008 {
// claim the sources atomically; only proceed when all have arrived
9009 std::lock_guard l(merge_lock);
9010 auto& s = merge_waiters[nextmap->get_epoch()][pg->pg_id];
9011 unsigned need = children.size();
9012 dout(20) << __func__ << " have " << s.size() << "/"
9013 << need << dendl;
9014 if (s.size() == need) {
9015 sources.swap(s);
9016 merge_waiters[nextmap->get_epoch()].erase(pg->pg_id);
9017 if (merge_waiters[nextmap->get_epoch()].empty()) {
9018 merge_waiters.erase(nextmap->get_epoch());
9019 }
9020 }
9021 }
9022 if (!sources.empty()) {
9023 unsigned new_pg_num = nextmap->get_pg_num(pg->pg_id.pool());
9024 unsigned split_bits = pg->pg_id.get_split_bits(new_pg_num);
9025 dout(1) << __func__ << " merging " << pg->pg_id << dendl;
9026 pg->merge_from(
9027 sources, rctx, split_bits,
9028 nextmap->get_pg_pool(
9029 pg->pg_id.pool())->last_pg_merge_meta);
9030 pg->pg_slot->waiting_for_merge_epoch = 0;
9031 } else {
9032 dout(20) << __func__ << " not ready to merge yet" << dendl;
9033 pg->write_if_dirty(rctx);
9034 pg->unlock();
9035 // kick source(s) to get them ready
9036 for (auto& i : children) {
9037 dout(20) << __func__ << " kicking source " << i << dendl;
9038 enqueue_peering_evt(
9039 i,
9040 PGPeeringEventRef(
9041 std::make_shared<PGPeeringEvent>(
9042 nextmap->get_epoch(),
9043 nextmap->get_epoch(),
9044 NullEvt())));
9045 }
9046 ret = false;
9047 goto out;
9048 }
9049 }
9050 }
9051 }
9052 
// normal advance: feed the new map (and new up/acting sets) to the PG
9053 vector<int> newup, newacting;
9054 int up_primary, acting_primary;
9055 nextmap->pg_to_up_acting_osds(
9056 pg->pg_id.pgid,
9057 &newup, &up_primary,
9058 &newacting, &acting_primary);
9059 pg->handle_advance_map(
9060 nextmap, lastmap, newup, up_primary,
9061 newacting, acting_primary, rctx);
9062 
// reschedule scrubs if the pool's scrub interval options changed
9063 auto oldpool = lastmap->get_pools().find(pg->pg_id.pool());
9064 auto newpool = nextmap->get_pools().find(pg->pg_id.pool());
9065 if (oldpool != lastmap->get_pools().end()
9066 && newpool != nextmap->get_pools().end()) {
9067 dout(20) << __func__
9068 << " new pool opts " << newpool->second.opts
9069 << " old pool opts " << oldpool->second.opts
9070 << dendl;
9071 
9072 double old_min_interval = 0, new_min_interval = 0;
9073 oldpool->second.opts.get(pool_opts_t::SCRUB_MIN_INTERVAL, &old_min_interval);
9074 newpool->second.opts.get(pool_opts_t::SCRUB_MIN_INTERVAL, &new_min_interval);
9075 
9076 double old_max_interval = 0, new_max_interval = 0;
9077 oldpool->second.opts.get(pool_opts_t::SCRUB_MAX_INTERVAL, &old_max_interval);
9078 newpool->second.opts.get(pool_opts_t::SCRUB_MAX_INTERVAL, &new_max_interval);
9079 
9080 // Assume if an interval is changed from set to unset or vice versa the actual config
9081 // is different. Keep it simple even if it is possible to call resched_all_scrub()
9082 // unnecessarily.
9083 if (old_min_interval != new_min_interval || old_max_interval != new_max_interval) {
9084 pg->on_info_history_change();
9085 }
9086 }
9087 
9088 if (new_pg_num && old_pg_num != new_pg_num) {
9089 // check for split
9090 set<spg_t> children;
9091 if (pg->pg_id.is_split(
9092 old_pg_num,
9093 new_pg_num,
9094 &children)) {
9095 split_pgs(
9096 pg, children, &new_pgs, lastmap, nextmap,
9097 rctx);
9098 }
9099 }
9100 
9101 lastmap = nextmap;
9102 old_pg_num = new_pg_num;
9103 handle.reset_tp_timeout();
9104 }
9105 pg->handle_activate_map(rctx);
9106 
9107 ret = true;
9108 out:
// split children are finished asynchronously once the transaction applies
9109 if (!new_pgs.empty()) {
9110 rctx->transaction->register_on_applied(new C_FinishSplits(this, new_pgs));
9111 }
9112 return ret;
9113 }
9114
// Publish the current osdmap to the rest of the OSD: prime pending
// splits/merges on every shard, prune stale create/merge bookkeeping,
// let each shard consume the map, refresh PG counters, wake sessions
// waiting on the map, and queue null peering events so every PG
// advances to the new epoch.  Caller must hold osd_lock.
9115 void OSD::consume_map()
9116 {
9117 ceph_assert(osd_lock.is_locked());
9118 dout(7) << "consume_map version " << osdmap->get_epoch() << dendl;
9119 
9120 /** make sure the cluster is speaking in SORTBITWISE, because we don't
9121 * speak the older sorting version any more. Be careful not to force
9122 * a shutdown if we are merely processing old maps, though.
9123 */
9124 if (!osdmap->test_flag(CEPH_OSDMAP_SORTBITWISE) && is_active()) {
9125 derr << __func__ << " SORTBITWISE flag is not set" << dendl;
9126 ceph_abort();
9127 }
9128 
9129 service.pre_publish_map(osdmap);
9130 service.await_reserved_maps();
9131 service.publish_map(osdmap);
9132 
9133 // prime splits and merges
9134 set<pair<spg_t,epoch_t>> newly_split; // splits, and when
9135 set<pair<spg_t,epoch_t>> merge_pgs; // merge participants, and when
9136 for (auto& shard : shards) {
9137 shard->identify_splits_and_merges(osdmap, &newly_split, &merge_pgs);
9138 }
9139 if (!newly_split.empty()) {
9140 for (auto& shard : shards) {
9141 shard->prime_splits(osdmap, &newly_split);
9142 }
// prime_splits consumes the entries that map to each shard; by the end
// every split must have been claimed by some shard
9143 ceph_assert(newly_split.empty());
9144 }
9145 
9146 // prune sent_ready_to_merge
9147 service.prune_sent_ready_to_merge(osdmap);
9148 
9149 // FIXME, maybe: We could race against an incoming peering message
9150 // that instantiates a merge PG after identify_merges() below and
9151 // never set up its peer to complete the merge. An OSD restart
9152 // would clear it up. This is a hard race to resolve,
9153 // extraordinarily rare (we only merge PGs that are stable and
9154 // clean, so it'd have to be an imported PG to an OSD with a
9155 // slightly stale OSDMap...), so I'm ignoring it for now. We plan to
9156 // replace all of this with a seastar-based code soon anyway.
9157 if (!merge_pgs.empty()) {
9158 // mark the pgs we already have, or create new and empty merge
9159 // participants for those we are missing. do this all under the
9160 // shard lock so we don't have to worry about racing pg creates
9161 // via _process.
9162 for (auto& shard : shards) {
9163 shard->prime_merges(osdmap, &merge_pgs);
9164 }
9165 ceph_assert(merge_pgs.empty());
9166 }
9167 
9168 service.prune_pg_created();
9169 
9170 unsigned pushes_to_free = 0;
9171 for (auto& shard : shards) {
9172 shard->consume_map(osdmap, &pushes_to_free);
9173 }
9174 
9175 vector<spg_t> pgids;
9176 _get_pgids(&pgids);
9177 
9178 // count (FIXME, probably during seastar rewrite)
9179 int num_pg_primary = 0, num_pg_replica = 0, num_pg_stray = 0;
9180 vector<PGRef> pgs;
9181 _get_pgs(&pgs);
9182 for (auto& pg : pgs) {
9183 // FIXME (probably during seastar rewrite): this is lockless and
9184 // racy, but we don't want to take pg lock here.
9185 if (pg->is_primary())
9186 num_pg_primary++;
9187 else if (pg->is_replica())
9188 num_pg_replica++;
9189 else
9190 num_pg_stray++;
9191 }
9192 
9193 {
9194 // FIXME (as part of seastar rewrite): move to OSDShard
// drop pending creates for PGs that no longer map to this OSD
9195 std::lock_guard l(pending_creates_lock);
9196 for (auto pg = pending_creates_from_osd.begin();
9197 pg != pending_creates_from_osd.end();) {
9198 if (osdmap->get_pg_acting_rank(pg->first, whoami) < 0) {
9199 dout(10) << __func__ << " pg " << pg->first << " doesn't map here, "
9200 << "discarding pending_create_from_osd" << dendl;
9201 pg = pending_creates_from_osd.erase(pg);
9202 } else {
9203 ++pg;
9204 }
9205 }
9206 }
9207 
9208 service.maybe_inject_dispatch_delay();
9209 
9210 dispatch_sessions_waiting_on_map();
9211 
9212 service.maybe_inject_dispatch_delay();
9213 
9214 service.release_reserved_pushes(pushes_to_free);
9215 
9216 // queue null events to push maps down to individual PGs
9217 for (auto pgid : pgids) {
9218 enqueue_peering_evt(
9219 pgid,
9220 PGPeeringEventRef(
9221 std::make_shared<PGPeeringEvent>(
9222 osdmap->get_epoch(),
9223 osdmap->get_epoch(),
9224 NullEvt())));
9225 }
9226 logger->set(l_osd_pg, pgids.size());
9227 logger->set(l_osd_pg_primary, num_pg_primary);
9228 logger->set(l_osd_pg_replica, num_pg_replica);
9229 logger->set(l_osd_pg_stray, num_pg_stray);
9230 }
9231
9232 void OSD::activate_map()
9233 {
9234 ceph_assert(osd_lock.is_locked());
9235
9236 dout(7) << "activate_map version " << osdmap->get_epoch() << dendl;
9237
9238 if (osdmap->test_flag(CEPH_OSDMAP_FULL)) {
9239 dout(10) << " osdmap flagged full, doing onetime osdmap subscribe" << dendl;
9240 osdmap_subscribe(osdmap->get_epoch() + 1, false);
9241 }
9242
9243 // norecover?
9244 if (osdmap->test_flag(CEPH_OSDMAP_NORECOVER)) {
9245 if (!service.recovery_is_paused()) {
9246 dout(1) << "pausing recovery (NORECOVER flag set)" << dendl;
9247 service.pause_recovery();
9248 }
9249 } else {
9250 if (service.recovery_is_paused()) {
9251 dout(1) << "unpausing recovery (NORECOVER flag unset)" << dendl;
9252 service.unpause_recovery();
9253 }
9254 }
9255
9256 service.activate_map();
9257
9258 // process waiters
9259 take_waiters(waiting_for_osdmap);
9260 }
9261
9262 bool OSD::require_mon_peer(const Message *m)
9263 {
9264 if (!m->get_connection()->peer_is_mon()) {
9265 dout(0) << "require_mon_peer received from non-mon "
9266 << m->get_connection()->get_peer_addr()
9267 << " " << *m << dendl;
9268 return false;
9269 }
9270 return true;
9271 }
9272
9273 bool OSD::require_mon_or_mgr_peer(const Message *m)
9274 {
9275 if (!m->get_connection()->peer_is_mon() &&
9276 !m->get_connection()->peer_is_mgr()) {
9277 dout(0) << "require_mon_or_mgr_peer received from non-mon, non-mgr "
9278 << m->get_connection()->get_peer_addr()
9279 << " " << *m << dendl;
9280 return false;
9281 }
9282 return true;
9283 }
9284
9285 bool OSD::require_osd_peer(const Message *m)
9286 {
9287 if (!m->get_connection()->peer_is_osd()) {
9288 dout(0) << "require_osd_peer received from non-osd "
9289 << m->get_connection()->get_peer_addr()
9290 << " " << *m << dendl;
9291 return false;
9292 }
9293 return true;
9294 }
9295
9296 bool OSD::require_self_aliveness(const Message *m, epoch_t epoch)
9297 {
9298 epoch_t up_epoch = service.get_up_epoch();
9299 if (epoch < up_epoch) {
9300 dout(7) << "from pre-up epoch " << epoch << " < " << up_epoch << dendl;
9301 return false;
9302 }
9303
9304 if (!is_active()) {
9305 dout(7) << "still in boot state, dropping message " << *m << dendl;
9306 return false;
9307 }
9308
9309 return true;
9310 }
9311
9312 bool OSD::require_same_peer_instance(const Message *m, OSDMapRef& map,
9313 bool is_fast_dispatch)
9314 {
9315 int from = m->get_source().num();
9316
9317 if (map->is_down(from) ||
9318 (map->get_cluster_addrs(from) != m->get_source_addrs())) {
9319 dout(5) << "from dead osd." << from << ", marking down, "
9320 << " msg was " << m->get_source_inst().addr
9321 << " expected "
9322 << (map->is_up(from) ?
9323 map->get_cluster_addrs(from) : entity_addrvec_t())
9324 << dendl;
9325 ConnectionRef con = m->get_connection();
9326 con->mark_down();
9327 auto priv = con->get_priv();
9328 if (auto s = static_cast<Session*>(priv.get()); s) {
9329 if (!is_fast_dispatch)
9330 s->session_dispatch_lock.Lock();
9331 clear_session_waiting_on_map(s);
9332 con->set_priv(nullptr); // break ref <-> session cycle, if any
9333 s->con.reset();
9334 if (!is_fast_dispatch)
9335 s->session_dispatch_lock.Unlock();
9336 }
9337 return false;
9338 }
9339 return true;
9340 }
9341
9342
9343 /*
9344 * require that we have a same (or newer) map, that we are alive and
9345 * active, and that the sending OSD instance matches the current map.
9346 */
9347 bool OSD::require_same_or_newer_map(OpRequestRef& op, epoch_t epoch,
9348 bool is_fast_dispatch)
9349 {
9350 const Message *m = op->get_req();
9351 dout(15) << "require_same_or_newer_map " << epoch
9352 << " (i am " << osdmap->get_epoch() << ") " << m << dendl;
9353
9354 ceph_assert(osd_lock.is_locked());
9355
9356 // do they have a newer map?
9357 if (epoch > osdmap->get_epoch()) {
9358 dout(7) << "waiting for newer map epoch " << epoch
9359 << " > my " << osdmap->get_epoch() << " with " << m << dendl;
9360 wait_for_new_map(op);
9361 return false;
9362 }
9363
9364 if (!require_self_aliveness(op->get_req(), epoch)) {
9365 return false;
9366 }
9367
9368 // ok, our map is same or newer.. do they still exist?
9369 if (m->get_connection()->get_messenger() == cluster_messenger &&
9370 !require_same_peer_instance(op->get_req(), osdmap, is_fast_dispatch)) {
9371 return false;
9372 }
9373
9374 return true;
9375 }
9376
9377
9378
9379
9380
9381 // ----------------------------------------
9382 // pg creation
9383
9384 void OSD::split_pgs(
9385 PG *parent,
9386 const set<spg_t> &childpgids, set<PGRef> *out_pgs,
9387 OSDMapRef curmap,
9388 OSDMapRef nextmap,
9389 PG::RecoveryCtx *rctx)
9390 {
9391 unsigned pg_num = nextmap->get_pg_num(parent->pg_id.pool());
9392 parent->update_snap_mapper_bits(parent->get_pgid().get_split_bits(pg_num));
9393
9394 vector<object_stat_sum_t> updated_stats;
9395 parent->start_split_stats(childpgids, &updated_stats);
9396
9397 vector<object_stat_sum_t>::iterator stat_iter = updated_stats.begin();
9398 for (set<spg_t>::const_iterator i = childpgids.begin();
9399 i != childpgids.end();
9400 ++i, ++stat_iter) {
9401 ceph_assert(stat_iter != updated_stats.end());
9402 dout(10) << __func__ << " splitting " << *parent << " into " << *i << dendl;
9403 PG* child = _make_pg(nextmap, *i);
9404 child->lock(true);
9405 out_pgs->insert(child);
9406 child->ch = store->create_new_collection(child->coll);
9407
9408 {
9409 uint32_t shard_index = i->hash_to_shard(shards.size());
9410 assert(NULL != shards[shard_index]);
9411 store->set_collection_commit_queue(child->coll, &(shards[shard_index]->context_queue));
9412 }
9413
9414 unsigned split_bits = i->get_split_bits(pg_num);
9415 dout(10) << " pg_num is " << pg_num
9416 << ", m_seed " << i->ps()
9417 << ", split_bits is " << split_bits << dendl;
9418 parent->split_colls(
9419 *i,
9420 split_bits,
9421 i->ps(),
9422 &child->get_pool().info,
9423 rctx->transaction);
9424 parent->split_into(
9425 i->pgid,
9426 child,
9427 split_bits);
9428
9429 child->init_collection_pool_opts();
9430
9431 child->finish_split_stats(*stat_iter, rctx->transaction);
9432 child->unlock();
9433 }
9434 ceph_assert(stat_iter != updated_stats.end());
9435 parent->finish_split_stats(*stat_iter, rctx->transaction);
9436 }
9437
9438 /*
9439 * holding osd_lock
9440 */
// Handle a (legacy, pre-nautilus) MOSDPGCreate from the mon: for each
// requested pg, skip it if it was created by splitting, its pool is
// gone, we are not the acting primary, or the create is obsolete;
// otherwise build an initial history and queue a peering event that
// will instantiate the PG.  Caller holds osd_lock.
9441 void OSD::handle_pg_create(OpRequestRef op)
9442 {
9443 const MOSDPGCreate *m = static_cast<const MOSDPGCreate*>(op->get_req());
9444 ceph_assert(m->get_type() == MSG_OSD_PG_CREATE);
9445 
9446 dout(10) << "handle_pg_create " << *m << dendl;
9447 
9448 if (!require_mon_peer(op->get_req())) {
9449 return;
9450 }
9451 
9452 if (!require_same_or_newer_map(op, m->epoch, false))
9453 return;
9454 
9455 op->mark_started();
9456 
// mkpg and ctimes are parallel maps keyed by pg_t; walk them in lockstep
9457 map<pg_t,utime_t>::const_iterator ci = m->ctimes.begin();
9458 for (map<pg_t,pg_create_t>::const_iterator p = m->mkpg.begin();
9459 p != m->mkpg.end();
9460 ++p, ++ci) {
9461 ceph_assert(ci != m->ctimes.end() && ci->first == p->first);
9462 epoch_t created = p->second.created;
9463 if (p->second.split_bits) // Skip split pgs
9464 continue;
9465 pg_t on = p->first;
9466 
9467 if (!osdmap->have_pg_pool(on.pool())) {
9468 dout(20) << "ignoring pg on deleted pool " << on << dendl;
9469 continue;
9470 }
9471 
9472 dout(20) << "mkpg " << on << " e" << created << "@" << ci->second << dendl;
9473 
9474 // is it still ours?
9475 vector<int> up, acting;
9476 int up_primary = -1;
9477 int acting_primary = -1;
9478 osdmap->pg_to_up_acting_osds(on, &up, &up_primary, &acting, &acting_primary);
9479 int role = osdmap->calc_pg_role(whoami, acting, acting.size());
9480 
9481 if (acting_primary != whoami) {
9482 dout(10) << "mkpg " << on << " not acting_primary (" << acting_primary
9483 << "), my role=" << role << ", skipping" << dendl;
9484 continue;
9485 }
9486 
9487 spg_t pgid;
9488 bool mapped = osdmap->get_primary_shard(on, &pgid);
9489 ceph_assert(mapped);
9490 
9491 PastIntervals pi;
9492 pg_history_t history;
9493 build_initial_pg_history(pgid, created, ci->second, &history, &pi);
9494 
9495 // The mon won't resend unless the primary changed, so we ignore
9496 // same_interval_since. We'll pass this history with the current
9497 // epoch as the event.
9498 if (history.same_primary_since > m->epoch) {
9499 dout(10) << __func__ << ": got obsolete pg create on pgid "
9500 << pgid << " from epoch " << m->epoch
9501 << ", primary changed in " << history.same_primary_since
9502 << dendl;
9503 continue;
9504 }
// queue a null event carrying PGCreateInfo; the peering machinery will
// instantiate the PG when it processes the event
9505 enqueue_peering_evt(
9506 pgid,
9507 PGPeeringEventRef(
9508 std::make_shared<PGPeeringEvent>(
9509 osdmap->get_epoch(),
9510 osdmap->get_epoch(),
9511 NullEvt(),
9512 true,
9513 new PGCreateInfo(
9514 pgid,
9515 osdmap->get_epoch(),
9516 history,
9517 pi,
9518 true)
9519 )));
9520 }
9521 
9522 {
9523 std::lock_guard l(pending_creates_lock);
9524 if (pending_creates_from_mon == 0) {
9525 last_pg_create_epoch = m->epoch;
9526 }
9527 }
9528 
9529 maybe_update_heartbeat_peers();
9530 }
9531
9532
9533 // ----------------------------------------
9534 // peering and recovery
9535
9536 PG::RecoveryCtx OSD::create_context()
9537 {
9538 ObjectStore::Transaction *t = new ObjectStore::Transaction;
9539 map<int, map<spg_t,pg_query_t> > *query_map =
9540 new map<int, map<spg_t, pg_query_t> >;
9541 map<int,vector<pair<pg_notify_t, PastIntervals> > > *notify_list =
9542 new map<int, vector<pair<pg_notify_t, PastIntervals> > >;
9543 map<int,vector<pair<pg_notify_t, PastIntervals> > > *info_map =
9544 new map<int,vector<pair<pg_notify_t, PastIntervals> > >;
9545 PG::RecoveryCtx rctx(query_map, info_map, notify_list, t);
9546 return rctx;
9547 }
9548
9549 void OSD::dispatch_context_transaction(PG::RecoveryCtx &ctx, PG *pg,
9550 ThreadPool::TPHandle *handle)
9551 {
9552 if (!ctx.transaction->empty() || ctx.transaction->has_contexts()) {
9553 int tr = store->queue_transaction(
9554 pg->ch,
9555 std::move(*ctx.transaction), TrackedOpRef(), handle);
9556 ceph_assert(tr == 0);
9557 delete (ctx.transaction);
9558 ctx.transaction = new ObjectStore::Transaction;
9559 }
9560 }
9561
9562 void OSD::dispatch_context(PG::RecoveryCtx &ctx, PG *pg, OSDMapRef curmap,
9563 ThreadPool::TPHandle *handle)
9564 {
9565 if (!service.get_osdmap()->is_up(whoami)) {
9566 dout(20) << __func__ << " not up in osdmap" << dendl;
9567 } else if (!is_active()) {
9568 dout(20) << __func__ << " not active" << dendl;
9569 } else {
9570 do_notifies(*ctx.notify_list, curmap);
9571 do_queries(*ctx.query_map, curmap);
9572 do_infos(*ctx.info_map, curmap);
9573 }
9574 if ((!ctx.transaction->empty() || ctx.transaction->has_contexts()) && pg) {
9575 int tr = store->queue_transaction(
9576 pg->ch,
9577 std::move(*ctx.transaction), TrackedOpRef(),
9578 handle);
9579 ceph_assert(tr == 0);
9580 }
9581 delete ctx.notify_list;
9582 delete ctx.query_map;
9583 delete ctx.info_map;
9584 delete ctx.transaction;
9585 }
9586
9587 void OSD::discard_context(PG::RecoveryCtx& ctx)
9588 {
9589 delete ctx.notify_list;
9590 delete ctx.query_map;
9591 delete ctx.info_map;
9592 delete ctx.transaction;
9593 }
9594
9595
9596 /** do_notifies
9597 * Send an MOSDPGNotify to a primary, with a list of PGs that I have
9598 * content for, and they are primary for.
9599 */
9600
9601 void OSD::do_notifies(
9602 map<int,vector<pair<pg_notify_t,PastIntervals> > >& notify_list,
9603 OSDMapRef curmap)
9604 {
9605 for (map<int,
9606 vector<pair<pg_notify_t,PastIntervals> > >::iterator it =
9607 notify_list.begin();
9608 it != notify_list.end();
9609 ++it) {
9610 if (!curmap->is_up(it->first)) {
9611 dout(20) << __func__ << " skipping down osd." << it->first << dendl;
9612 continue;
9613 }
9614 ConnectionRef con = service.get_con_osd_cluster(
9615 it->first, curmap->get_epoch());
9616 if (!con) {
9617 dout(20) << __func__ << " skipping osd." << it->first
9618 << " (NULL con)" << dendl;
9619 continue;
9620 }
9621 service.share_map_peer(it->first, con.get(), curmap);
9622 dout(7) << __func__ << " osd." << it->first
9623 << " on " << it->second.size() << " PGs" << dendl;
9624 MOSDPGNotify *m = new MOSDPGNotify(curmap->get_epoch(),
9625 it->second);
9626 con->send_message(m);
9627 }
9628 }
9629
9630
9631 /** do_queries
9632 * send out pending queries for info | summaries
9633 */
9634 void OSD::do_queries(map<int, map<spg_t,pg_query_t> >& query_map,
9635 OSDMapRef curmap)
9636 {
9637 for (map<int, map<spg_t,pg_query_t> >::iterator pit = query_map.begin();
9638 pit != query_map.end();
9639 ++pit) {
9640 if (!curmap->is_up(pit->first)) {
9641 dout(20) << __func__ << " skipping down osd." << pit->first << dendl;
9642 continue;
9643 }
9644 int who = pit->first;
9645 ConnectionRef con = service.get_con_osd_cluster(who, curmap->get_epoch());
9646 if (!con) {
9647 dout(20) << __func__ << " skipping osd." << who
9648 << " (NULL con)" << dendl;
9649 continue;
9650 }
9651 service.share_map_peer(who, con.get(), curmap);
9652 dout(7) << __func__ << " querying osd." << who
9653 << " on " << pit->second.size() << " PGs" << dendl;
9654 MOSDPGQuery *m = new MOSDPGQuery(curmap->get_epoch(), pit->second);
9655 con->send_message(m);
9656 }
9657 }
9658
9659
9660 void OSD::do_infos(map<int,
9661 vector<pair<pg_notify_t, PastIntervals> > >& info_map,
9662 OSDMapRef curmap)
9663 {
9664 for (map<int,
9665 vector<pair<pg_notify_t, PastIntervals> > >::iterator p =
9666 info_map.begin();
9667 p != info_map.end();
9668 ++p) {
9669 if (!curmap->is_up(p->first)) {
9670 dout(20) << __func__ << " skipping down osd." << p->first << dendl;
9671 continue;
9672 }
9673 for (vector<pair<pg_notify_t,PastIntervals> >::iterator i = p->second.begin();
9674 i != p->second.end();
9675 ++i) {
9676 dout(20) << __func__ << " sending info " << i->first.info
9677 << " to shard " << p->first << dendl;
9678 }
9679 ConnectionRef con = service.get_con_osd_cluster(
9680 p->first, curmap->get_epoch());
9681 if (!con) {
9682 dout(20) << __func__ << " skipping osd." << p->first
9683 << " (NULL con)" << dendl;
9684 continue;
9685 }
9686 service.share_map_peer(p->first, con.get(), curmap);
9687 MOSDPGInfo *m = new MOSDPGInfo(curmap->get_epoch());
9688 m->pg_list = p->second;
9689 con->send_message(m);
9690 }
9691 info_map.clear();
9692 }
9693
9694 void OSD::handle_fast_pg_create(MOSDPGCreate2 *m)
9695 {
9696 dout(7) << __func__ << " " << *m << " from " << m->get_source() << dendl;
9697 if (!require_mon_peer(m)) {
9698 m->put();
9699 return;
9700 }
9701 for (auto& p : m->pgs) {
9702 spg_t pgid = p.first;
9703 epoch_t created = p.second.first;
9704 utime_t created_stamp = p.second.second;
9705 dout(20) << __func__ << " " << pgid << " e" << created
9706 << "@" << created_stamp << dendl;
9707 pg_history_t h;
9708 h.epoch_created = created;
9709 h.epoch_pool_created = created;
9710 h.same_up_since = created;
9711 h.same_interval_since = created;
9712 h.same_primary_since = created;
9713 h.last_scrub_stamp = created_stamp;
9714 h.last_deep_scrub_stamp = created_stamp;
9715 h.last_clean_scrub_stamp = created_stamp;
9716
9717 enqueue_peering_evt(
9718 pgid,
9719 PGPeeringEventRef(
9720 std::make_shared<PGPeeringEvent>(
9721 m->epoch,
9722 m->epoch,
9723 NullEvt(),
9724 true,
9725 new PGCreateInfo(
9726 pgid,
9727 created,
9728 h,
9729 PastIntervals(),
9730 true)
9731 )));
9732 }
9733
9734 {
9735 std::lock_guard l(pending_creates_lock);
9736 if (pending_creates_from_mon == 0) {
9737 last_pg_create_epoch = m->epoch;
9738 }
9739 }
9740
9741 m->put();
9742 }
9743
9744 void OSD::handle_fast_pg_query(MOSDPGQuery *m)
9745 {
9746 dout(7) << __func__ << " " << *m << " from " << m->get_source() << dendl;
9747 if (!require_osd_peer(m)) {
9748 m->put();
9749 return;
9750 }
9751 int from = m->get_source().num();
9752 for (auto& p : m->pg_list) {
9753 enqueue_peering_evt(
9754 p.first,
9755 PGPeeringEventRef(
9756 std::make_shared<PGPeeringEvent>(
9757 p.second.epoch_sent, p.second.epoch_sent,
9758 MQuery(
9759 p.first,
9760 pg_shard_t(from, p.second.from),
9761 p.second,
9762 p.second.epoch_sent),
9763 false))
9764 );
9765 }
9766 m->put();
9767 }
9768
9769 void OSD::handle_fast_pg_notify(MOSDPGNotify* m)
9770 {
9771 dout(7) << __func__ << " " << *m << " from " << m->get_source() << dendl;
9772 if (!require_osd_peer(m)) {
9773 m->put();
9774 return;
9775 }
9776 int from = m->get_source().num();
9777 for (auto& p : m->get_pg_list()) {
9778 spg_t pgid(p.first.info.pgid.pgid, p.first.to);
9779 enqueue_peering_evt(
9780 pgid,
9781 PGPeeringEventRef(
9782 std::make_shared<PGPeeringEvent>(
9783 p.first.epoch_sent,
9784 p.first.query_epoch,
9785 MNotifyRec(
9786 pgid, pg_shard_t(from, p.first.from),
9787 p.first,
9788 m->get_connection()->get_features(),
9789 p.second),
9790 true,
9791 new PGCreateInfo(
9792 pgid,
9793 p.first.query_epoch,
9794 p.first.info.history,
9795 p.second,
9796 false)
9797 )));
9798 }
9799 m->put();
9800 }
9801
9802 void OSD::handle_fast_pg_info(MOSDPGInfo* m)
9803 {
9804 dout(7) << __func__ << " " << *m << " from " << m->get_source() << dendl;
9805 if (!require_osd_peer(m)) {
9806 m->put();
9807 return;
9808 }
9809 int from = m->get_source().num();
9810 for (auto& p : m->pg_list) {
9811 enqueue_peering_evt(
9812 spg_t(p.first.info.pgid.pgid, p.first.to),
9813 PGPeeringEventRef(
9814 std::make_shared<PGPeeringEvent>(
9815 p.first.epoch_sent, p.first.query_epoch,
9816 MInfoRec(
9817 pg_shard_t(from, p.first.from),
9818 p.first.info,
9819 p.first.epoch_sent)))
9820 );
9821 }
9822 m->put();
9823 }
9824
9825 void OSD::handle_fast_pg_remove(MOSDPGRemove *m)
9826 {
9827 dout(7) << __func__ << " " << *m << " from " << m->get_source() << dendl;
9828 if (!require_osd_peer(m)) {
9829 m->put();
9830 return;
9831 }
9832 for (auto& pgid : m->pg_list) {
9833 enqueue_peering_evt(
9834 pgid,
9835 PGPeeringEventRef(
9836 std::make_shared<PGPeeringEvent>(
9837 m->get_epoch(), m->get_epoch(),
9838 PG::DeleteStart())));
9839 }
9840 m->put();
9841 }
9842
9843 void OSD::handle_fast_force_recovery(MOSDForceRecovery *m)
9844 {
9845 dout(10) << __func__ << " " << *m << dendl;
9846 if (!require_mon_or_mgr_peer(m)) {
9847 m->put();
9848 return;
9849 }
9850 epoch_t epoch = get_osdmap_epoch();
9851 for (auto pgid : m->forced_pgs) {
9852 if (m->options & OFR_BACKFILL) {
9853 if (m->options & OFR_CANCEL) {
9854 enqueue_peering_evt(
9855 pgid,
9856 PGPeeringEventRef(
9857 std::make_shared<PGPeeringEvent>(
9858 epoch, epoch,
9859 PG::UnsetForceBackfill())));
9860 } else {
9861 enqueue_peering_evt(
9862 pgid,
9863 PGPeeringEventRef(
9864 std::make_shared<PGPeeringEvent>(
9865 epoch, epoch,
9866 PG::SetForceBackfill())));
9867 }
9868 } else if (m->options & OFR_RECOVERY) {
9869 if (m->options & OFR_CANCEL) {
9870 enqueue_peering_evt(
9871 pgid,
9872 PGPeeringEventRef(
9873 std::make_shared<PGPeeringEvent>(
9874 epoch, epoch,
9875 PG::UnsetForceRecovery())));
9876 } else {
9877 enqueue_peering_evt(
9878 pgid,
9879 PGPeeringEventRef(
9880 std::make_shared<PGPeeringEvent>(
9881 epoch, epoch,
9882 PG::SetForceRecovery())));
9883 }
9884 }
9885 }
9886 m->put();
9887 }
9888
// Answer a pg_query_t that arrived for a PG this OSD does not have.  If the
// pool still exists, reply with an "empty" info so the querying peer learns
// the PG does not exist here: LOG/FULLLOG queries get an empty MOSDPGLog,
// all other query types get an MOSDPGNotify carrying an empty pg_info_t.
void OSD::handle_pg_query_nopg(const MQuery& q)
{
  spg_t pgid = q.pgid;
  dout(10) << __func__ << " " << pgid << dendl;

  OSDMapRef osdmap = get_osdmap();
  // pool is gone: there is nothing useful to tell the peer
  if (!osdmap->have_pg_pool(pgid.pool()))
    return;

  dout(10) << " pg " << pgid << " dne" << dendl;
  pg_info_t empty(spg_t(pgid.pgid, q.query.to));
  ConnectionRef con = service.get_con_osd_cluster(q.from.osd, osdmap->get_epoch());
  if (con) {
    Message *m;
    if (q.query.type == pg_query_t::LOG ||
	q.query.type == pg_query_t::FULLLOG) {
      // empty log reply
      m = new MOSDPGLog(
	q.query.from, q.query.to,
	osdmap->get_epoch(), empty,
	q.query.epoch_sent);
    } else {
      // notify with empty info and empty past intervals
      vector<pair<pg_notify_t,PastIntervals>> ls;
      ls.push_back(
	make_pair(
	  pg_notify_t(
	    q.query.from, q.query.to,
	    q.query.epoch_sent,
	    osdmap->get_epoch(),
	    empty),
	  PastIntervals()));
      m = new MOSDPGNotify(osdmap->get_epoch(), ls);
    }
    // make sure the peer is caught up on maps before it sees our reply
    service.share_map_peer(q.from.osd, con.get(), osdmap);
    con->send_message(m);
  }
}
9925
9926
9927 // =========================================================
9928 // RECOVERY
9929
// Drain awaiting_throttle while the recovery throttle allows it, queueing at
// most osd_recovery_max_single_start pushes per PG and accounting for them
// in recovery_ops_reserved.  Caller must hold recovery_lock (asserted).
void OSDService::_maybe_queue_recovery() {
  ceph_assert(recovery_lock.is_locked_by_me());
  uint64_t available_pushes;
  while (!awaiting_throttle.empty() &&
	 _recover_now(&available_pushes)) {
    // cap per-PG start count at the configured single-start limit
    uint64_t to_start = std::min(
      available_pushes,
      cct->_conf->osd_recovery_max_single_start);
    _queue_for_recovery(awaiting_throttle.front(), to_start);
    awaiting_throttle.pop_front();
    dout(10) << __func__ << " starting " << to_start
	     << ", recovery_ops_reserved " << recovery_ops_reserved
	     << " -> " << (recovery_ops_reserved + to_start) << dendl;
    // reserve the slots now; released via release_reserved_pushes()
    recovery_ops_reserved += to_start;
  }
}
9946
9947 bool OSDService::_recover_now(uint64_t *available_pushes)
9948 {
9949 if (available_pushes)
9950 *available_pushes = 0;
9951
9952 if (ceph_clock_now() < defer_recovery_until) {
9953 dout(15) << __func__ << " defer until " << defer_recovery_until << dendl;
9954 return false;
9955 }
9956
9957 if (recovery_paused) {
9958 dout(15) << __func__ << " paused" << dendl;
9959 return false;
9960 }
9961
9962 uint64_t max = cct->_conf->osd_recovery_max_active;
9963 if (max <= recovery_ops_active + recovery_ops_reserved) {
9964 dout(15) << __func__ << " active " << recovery_ops_active
9965 << " + reserved " << recovery_ops_reserved
9966 << " >= max " << max << dendl;
9967 return false;
9968 }
9969
9970 if (available_pushes)
9971 *available_pushes = max - recovery_ops_active - recovery_ops_reserved;
9972
9973 return true;
9974 }
9975
// Run up to reserved_pushes recovery operations on pg.  If osd_recovery_sleep
// is in effect, the work is instead re-queued via a timer callback so the
// worker thread never blocks.  The reserved pushes are always released on
// exit from the work path (whether or not anything actually started).
void OSD::do_recovery(
  PG *pg, epoch_t queued, uint64_t reserved_pushes,
  ThreadPool::TPHandle &handle)
{
  uint64_t started = 0;

  /*
   * When the value of osd_recovery_sleep is set greater than zero, recovery
   * ops are scheduled after osd_recovery_sleep amount of time from the previous
   * recovery event's schedule time. This is done by adding a
   * recovery_requeue_callback event, which re-queues the recovery op using
   * queue_recovery_after_sleep.
   */
  float recovery_sleep = get_osd_recovery_sleep();
  {
    std::lock_guard l(service.sleep_lock);
    if (recovery_sleep > 0 && service.recovery_needs_sleep) {
      PGRef pgref(pg);  // keep the PG alive until the timer fires
      auto recovery_requeue_callback = new FunctionContext([this, pgref, queued, reserved_pushes](int r) {
        dout(20) << "do_recovery wake up at "
                 << ceph_clock_now()
	         << ", re-queuing recovery" << dendl;
	std::lock_guard l(service.sleep_lock);
        service.recovery_needs_sleep = false;
        service.queue_recovery_after_sleep(pgref.get(), queued, reserved_pushes);
      });

      // This is true for the first recovery op and when the previous recovery op
      // has been scheduled in the past. The next recovery op is scheduled after
      // completing the sleep from now.
      if (service.recovery_schedule_time < ceph_clock_now()) {
        service.recovery_schedule_time = ceph_clock_now();
      }
      service.recovery_schedule_time += recovery_sleep;
      service.sleep_timer.add_event_at(service.recovery_schedule_time,
				       recovery_requeue_callback);
      dout(20) << "Recovery event scheduled at "
               << service.recovery_schedule_time << dendl;
      // note: reserved pushes travel with the requeued work, so they are
      // NOT released here
      return;
    }
  }

  {
    {
      std::lock_guard l(service.sleep_lock);
      // the next pass should sleep again before doing more work
      service.recovery_needs_sleep = true;
    }

    // the PG was reset after this work was queued; the request is stale
    if (pg->pg_has_reset_since(queued)) {
      goto out;
    }

    dout(10) << "do_recovery starting " << reserved_pushes << " " << *pg << dendl;
#ifdef DEBUG_RECOVERY_OIDS
    dout(20) << " active was " << service.recovery_oids[pg->pg_id] << dendl;
#endif

    bool do_unfound = pg->start_recovery_ops(reserved_pushes, handle, &started);
    dout(10) << "do_recovery started " << started << "/" << reserved_pushes
	     << " on " << *pg << dendl;

    // recovery is blocked on missing objects: go ask peers for them
    if (do_unfound) {
      PG::RecoveryCtx rctx = create_context();
      rctx.handle = &handle;
      pg->find_unfound(queued, &rctx);
      dispatch_context(rctx, pg, pg->get_osdmap());
    }
  }

 out:
  ceph_assert(started <= reserved_pushes);
  service.release_reserved_pushes(reserved_pushes);
}
10049
// Account for one recovery op starting on the given object; bumps
// recovery_ops_active (and, in debug builds, tracks the oid so duplicate
// starts are caught by assertion).
void OSDService::start_recovery_op(PG *pg, const hobject_t& soid)
{
  std::lock_guard l(recovery_lock);
  dout(10) << "start_recovery_op " << *pg << " " << soid
	   << " (" << recovery_ops_active << "/"
	   << cct->_conf->osd_recovery_max_active << " rops)"
	   << dendl;
  recovery_ops_active++;

#ifdef DEBUG_RECOVERY_OIDS
  dout(20) << " active was " << recovery_oids[pg->pg_id] << dendl;
  // starting the same object twice is a bug
  ceph_assert(recovery_oids[pg->pg_id].count(soid) == 0);
  recovery_oids[pg->pg_id].insert(soid);
#endif
}
10065
// Account for one recovery op completing; decrements recovery_ops_active
// and kicks the throttle so queued PGs waiting for capacity can proceed.
// The 'dequeue' flag is only logged here; it is not otherwise used in this
// function.
void OSDService::finish_recovery_op(PG *pg, const hobject_t& soid, bool dequeue)
{
  std::lock_guard l(recovery_lock);
  dout(10) << "finish_recovery_op " << *pg << " " << soid
	   << " dequeue=" << dequeue
	   << " (" << recovery_ops_active << "/" << cct->_conf->osd_recovery_max_active << " rops)"
	   << dendl;

  // adjust count
  ceph_assert(recovery_ops_active > 0);
  recovery_ops_active--;

#ifdef DEBUG_RECOVERY_OIDS
  dout(20) << " active oids was " << recovery_oids[pg->pg_id] << dendl;
  // finishing an op that was never started is a bug
  ceph_assert(recovery_oids[pg->pg_id].count(soid));
  recovery_oids[pg->pg_id].erase(soid);
#endif

  // freed capacity may let a throttled PG start recovering
  _maybe_queue_recovery();
}
10086
10087 bool OSDService::is_recovery_active()
10088 {
10089 if (cct->_conf->osd_debug_pretend_recovery_active) {
10090 return true;
10091 }
10092 return local_reserver.has_reservation() || remote_reserver.has_reservation();
10093 }
10094
// Return previously reserved push slots to the recovery throttle and let
// any waiting PGs start.  Counterpart of the reservation made in
// _maybe_queue_recovery().
void OSDService::release_reserved_pushes(uint64_t pushes)
{
  std::lock_guard l(recovery_lock);
  dout(10) << __func__ << "(" << pushes << "), recovery_ops_reserved "
	   << recovery_ops_reserved << " -> " << (recovery_ops_reserved-pushes)
	   << dendl;
  // releasing more than was reserved indicates an accounting bug
  ceph_assert(recovery_ops_reserved >= pushes);
  recovery_ops_reserved -= pushes;
  _maybe_queue_recovery();
}
10105
10106 // =========================================================
10107 // OPS
10108
10109 bool OSD::op_is_discardable(const MOSDOp *op)
10110 {
10111 // drop client request if they are not connected and can't get the
10112 // reply anyway.
10113 if (!op->get_connection()->is_connected()) {
10114 return true;
10115 }
10116 return false;
10117 }
10118
// Place an op on the sharded op work queue, keyed by its PG.  Records the
// time spent between receive and enqueue (l_osd_op_before_queue_op_lat) and
// annotates the op's trace with priority/cost.
void OSD::enqueue_op(spg_t pg, OpRequestRef&& op, epoch_t epoch)
{
  const utime_t stamp = op->get_req()->get_recv_stamp();
  const utime_t latency = ceph_clock_now() - stamp;
  const unsigned priority = op->get_req()->get_priority();
  const int cost = op->get_req()->get_cost();
  // queue-item "owner" is the sender's entity number
  const uint64_t owner = op->get_req()->get_source().num();

  dout(15) << "enqueue_op " << op << " prio " << priority
	   << " cost " << cost
	   << " latency " << latency
	   << " epoch " << epoch
	   << " " << *(op->get_req()) << dendl;
  op->osd_trace.event("enqueue op");
  op->osd_trace.keyval("priority", priority);
  op->osd_trace.keyval("cost", cost);
  op->mark_queued_for_pg();
  logger->tinc(l_osd_op_before_queue_op_lat, latency);
  op_shardedwq.queue(
    OpQueueItem(
      unique_ptr<OpQueueItem::OpQueueable>(new PGOpItem(pg, std::move(op))),
      cost, priority, stamp, owner, epoch));
}
10142
10143 void OSD::enqueue_peering_evt(spg_t pgid, PGPeeringEventRef evt)
10144 {
10145 dout(15) << __func__ << " " << pgid << " " << evt->get_desc() << dendl;
10146 op_shardedwq.queue(
10147 OpQueueItem(
10148 unique_ptr<OpQueueItem::OpQueueable>(new PGPeeringItem(pgid, evt)),
10149 10,
10150 cct->_conf->osd_peering_op_priority,
10151 utime_t(),
10152 0,
10153 evt->get_epoch_sent()));
10154 }
10155
10156 void OSD::enqueue_peering_evt_front(spg_t pgid, PGPeeringEventRef evt)
10157 {
10158 dout(15) << __func__ << " " << pgid << " " << evt->get_desc() << dendl;
10159 op_shardedwq.queue_front(
10160 OpQueueItem(
10161 unique_ptr<OpQueueItem::OpQueueable>(new PGPeeringItem(pgid, evt)),
10162 10,
10163 cct->_conf->osd_peering_op_priority,
10164 utime_t(),
10165 0,
10166 evt->get_epoch_sent()));
10167 }
10168
/*
 * NOTE: dequeue called in worker thread, with pg lock
 */
// Process one dequeued op against its (already locked) PG: record dequeue
// latency, opportunistically share our osdmap with the sending session, and
// hand the request to the PG.  Ops targeting a deleting PG are dropped.
void OSD::dequeue_op(
  PGRef pg, OpRequestRef op,
  ThreadPool::TPHandle &handle)
{
  FUNCTRACE(cct);
  OID_EVENT_TRACE_WITH_MSG(op->get_req(), "DEQUEUE_OP_BEGIN", false);

  utime_t now = ceph_clock_now();
  op->set_dequeued_time(now);
  // time spent from receive to dequeue
  utime_t latency = now - op->get_req()->get_recv_stamp();
  dout(10) << "dequeue_op " << op << " prio " << op->get_req()->get_priority()
	   << " cost " << op->get_req()->get_cost()
	   << " latency " << latency
	   << " " << *(op->get_req())
	   << " pg " << *pg << dendl;

  logger->tinc(l_osd_op_before_dequeue_op_lat, latency);

  // let a lagging sender catch up on osdmaps
  auto priv = op->get_req()->get_connection()->get_priv();
  if (auto session = static_cast<Session *>(priv.get()); session) {
    maybe_share_map(session, op, pg->get_osdmap());
  }

  // PG is being deleted; drop the op on the floor
  if (pg->is_deleting())
    return;

  op->mark_reached_pg();
  op->osd_trace.event("dequeue_op");

  pg->do_request(op, handle);

  // finish
  dout(10) << "dequeue_op " << op << " finish" << dendl;
  OID_EVENT_TRACE_WITH_MSG(op->get_req(), "DEQUEUE_OP_END", false);
}
10207
10208
// Dispatch one peering event.  A pg-less event must be an MQuery (answered
// with an empty info/log); any other pg-less event is a bug.  With a PG, we
// first advance it to the shard's current map epoch, deliver the event, and
// then flush the resulting transaction and messages.
void OSD::dequeue_peering_evt(
  OSDShard *sdata,
  PG *pg,
  PGPeeringEventRef evt,
  ThreadPool::TPHandle& handle)
{
  PG::RecoveryCtx rctx = create_context();
  auto curmap = sdata->get_osdmap();
  epoch_t need_up_thru = 0, same_interval_since = 0;
  if (!pg) {
    if (const MQuery *q = dynamic_cast<const MQuery*>(evt->evt.get())) {
      handle_pg_query_nopg(*q);
    } else {
      derr << __func__ << " unrecognized pg-less event " << evt->get_desc() << dendl;
      ceph_abort();
    }
  } else if (advance_pg(curmap->get_epoch(), pg, handle, &rctx)) {
    pg->do_peering_event(evt, &rctx);
    if (pg->is_deleted()) {
      // do not dispatch rctx; the final _delete_some already did it.
      discard_context(rctx);
      pg->unlock();
      return;
    }
    dispatch_context_transaction(rctx, pg, &handle);
    // sample these while still holding the pg lock; used below after unlock
    need_up_thru = pg->get_need_up_thru();
    same_interval_since = pg->get_same_interval_since();
    pg->unlock();
  }

  if (need_up_thru) {
    queue_want_up_thru(same_interval_since);
  }
  dispatch_context(rctx, pg, curmap, &handle);

  // flush any pg_temp requests the event generated
  service.send_pg_temp();
}
10246
10247 void OSD::dequeue_delete(
10248 OSDShard *sdata,
10249 PG *pg,
10250 epoch_t e,
10251 ThreadPool::TPHandle& handle)
10252 {
10253 dequeue_peering_evt(
10254 sdata,
10255 pg,
10256 PGPeeringEventRef(
10257 std::make_shared<PGPeeringEvent>(
10258 e, e,
10259 PG::DeleteSome())),
10260 handle);
10261 }
10262
10263
10264
10265 // --------------------------------
10266
// Config options this OSD observes at runtime; handle_conf_change() applies
// each key when the config system reports a change.
const char** OSD::get_tracked_conf_keys() const
{
  static const char* KEYS[] = {
    "osd_max_backfills",
    "osd_min_recovery_priority",
    "osd_max_trimming_pgs",
    "osd_op_complaint_time",
    "osd_op_log_threshold",
    "osd_op_history_size",
    "osd_op_history_duration",
    "osd_op_history_slow_op_size",
    "osd_op_history_slow_op_threshold",
    "osd_enable_op_tracker",
    "osd_map_cache_size",
    "osd_pg_epoch_max_lag_factor",
    "osd_pg_epoch_persisted_max_stale",
    // clog & admin clog
    "clog_to_monitors",
    "clog_to_syslog",
    "clog_to_syslog_facility",
    "clog_to_syslog_level",
    "osd_objectstore_fuse",
    "clog_to_graylog",
    "clog_to_graylog_host",
    "clog_to_graylog_port",
    "host",
    "fsid",
    "osd_recovery_delay_start",
    "osd_client_message_size_cap",
    "osd_client_message_cap",
    "osd_heartbeat_min_size",
    "osd_heartbeat_interval",
    "osd_scrub_min_interval",
    "osd_scrub_max_interval",
    NULL  // terminating sentinel
  };
  return KEYS;
}
10305
// React to runtime config changes for the keys listed in
// get_tracked_conf_keys(): propagate new limits to the reservers, op
// tracker, map caches, log client, throttles, and scrub scheduling.
// Holds osd_lock for the duration.
void OSD::handle_conf_change(const ConfigProxy& conf,
			     const std::set <std::string> &changed)
{
  Mutex::Locker l(osd_lock);
  // backfill/recovery reservation limits
  if (changed.count("osd_max_backfills")) {
    service.local_reserver.set_max(cct->_conf->osd_max_backfills);
    service.remote_reserver.set_max(cct->_conf->osd_max_backfills);
  }
  if (changed.count("osd_min_recovery_priority")) {
    service.local_reserver.set_min_priority(cct->_conf->osd_min_recovery_priority);
    service.remote_reserver.set_min_priority(cct->_conf->osd_min_recovery_priority);
  }
  if (changed.count("osd_max_trimming_pgs")) {
    service.snap_reserver.set_max(cct->_conf->osd_max_trimming_pgs);
  }
  // op tracker tuning
  if (changed.count("osd_op_complaint_time") ||
      changed.count("osd_op_log_threshold")) {
    op_tracker.set_complaint_and_threshold(cct->_conf->osd_op_complaint_time,
					   cct->_conf->osd_op_log_threshold);
  }
  if (changed.count("osd_op_history_size") ||
      changed.count("osd_op_history_duration")) {
    op_tracker.set_history_size_and_duration(cct->_conf->osd_op_history_size,
					     cct->_conf->osd_op_history_duration);
  }
  if (changed.count("osd_op_history_slow_op_size") ||
      changed.count("osd_op_history_slow_op_threshold")) {
    op_tracker.set_history_slow_op_size_and_threshold(cct->_conf->osd_op_history_slow_op_size,
						      cct->_conf->osd_op_history_slow_op_threshold);
  }
  if (changed.count("osd_enable_op_tracker")) {
    op_tracker.set_tracking(cct->_conf->osd_enable_op_tracker);
  }
  // the three map caches are sized identically
  if (changed.count("osd_map_cache_size")) {
    service.map_cache.set_size(cct->_conf->osd_map_cache_size);
    service.map_bl_cache.set_size(cct->_conf->osd_map_cache_size);
    service.map_bl_inc_cache.set_size(cct->_conf->osd_map_cache_size);
  }
  // any clog destination change requires a full log-client reconfig
  if (changed.count("clog_to_monitors") ||
      changed.count("clog_to_syslog") ||
      changed.count("clog_to_syslog_level") ||
      changed.count("clog_to_syslog_facility") ||
      changed.count("clog_to_graylog") ||
      changed.count("clog_to_graylog_host") ||
      changed.count("clog_to_graylog_port") ||
      changed.count("host") ||
      changed.count("fsid")) {
    update_log_config();
  }
  if (changed.count("osd_pg_epoch_max_lag_factor")) {
    m_osd_pg_epoch_max_lag_factor = conf.get_val<double>(
      "osd_pg_epoch_max_lag_factor");
  }

#ifdef HAVE_LIBFUSE
  if (changed.count("osd_objectstore_fuse")) {
    if (store) {
      enable_disable_fuse(false);
    }
  }
#endif

  if (changed.count("osd_recovery_delay_start")) {
    service.defer_recovery(cct->_conf->osd_recovery_delay_start);
    service.kick_recovery_queue();
  }

  // client message throttles: only raise/adjust when the new value is > 0
  if (changed.count("osd_client_message_cap")) {
    uint64_t newval = cct->_conf->osd_client_message_cap;
    Messenger::Policy pol = client_messenger->get_policy(entity_name_t::TYPE_CLIENT);
    if (pol.throttler_messages && newval > 0) {
      pol.throttler_messages->reset_max(newval);
    }
  }
  if (changed.count("osd_client_message_size_cap")) {
    uint64_t newval = cct->_conf->osd_client_message_size_cap;
    Messenger::Policy pol = client_messenger->get_policy(entity_name_t::TYPE_CLIENT);
    if (pol.throttler_bytes && newval > 0) {
      pol.throttler_bytes->reset_max(newval);
    }
  }

  // scrub interval changes invalidate existing scrub schedules
  if (changed.count("osd_scrub_min_interval") ||
      changed.count("osd_scrub_max_interval")) {
    resched_all_scrubs();
    dout(0) << __func__ << ": scrub interval change" << dendl;
  }
  check_config();
}
10395
// Re-parse the clog-related config options and apply them to the cluster
// log client.  If parsing fails the existing clog configuration is left
// untouched (only the parse-success branch calls update_config).
void OSD::update_log_config()
{
  map<string,string> log_to_monitors;
  map<string,string> log_to_syslog;
  map<string,string> log_channel;
  map<string,string> log_prio;
  map<string,string> log_to_graylog;
  map<string,string> log_to_graylog_host;
  map<string,string> log_to_graylog_port;
  uuid_d fsid;
  string host;

  if (parse_log_client_options(cct, log_to_monitors, log_to_syslog,
			       log_channel, log_prio, log_to_graylog,
			       log_to_graylog_host, log_to_graylog_port,
			       fsid, host) == 0)
    clog->update_config(log_to_monitors, log_to_syslog,
			log_channel, log_prio, log_to_graylog,
			log_to_graylog_host, log_to_graylog_port,
			fsid, host);
  // NOTE(review): logged unconditionally, even if parsing failed above
  derr << "log_to_monitors " << log_to_monitors << dendl;
}
10418
10419 void OSD::check_config()
10420 {
10421 // some sanity checks
10422 if (cct->_conf->osd_map_cache_size <= (int)cct->_conf->osd_pg_epoch_persisted_max_stale + 2) {
10423 clog->warn() << "osd_map_cache_size (" << cct->_conf->osd_map_cache_size << ")"
10424 << " is not > osd_pg_epoch_persisted_max_stale ("
10425 << cct->_conf->osd_pg_epoch_persisted_max_stale << ")";
10426 }
10427 }
10428
10429 // --------------------------------
10430
// Block the calling thread until the objecter has fetched the newest osdmap
// from the monitors.
void OSD::get_latest_osdmap()
{
  dout(10) << __func__ << " -- start" << dendl;

  // synchronous wait on the objecter's completion
  C_SaferCond cond;
  service.objecter->wait_for_latest_osdmap(&cond);
  cond.wait();

  dout(10) << __func__ << " -- finish" << dendl;
}
10441
10442 // --------------------------------
10443
// Classify an incoming MOSDOp: walk its sub-ops and set the op's rmw flags
// (read/write/cache/pg-op/promote/ordering) that the rest of the pipeline
// keys off.  Returns 0 on success, -EINVAL if no flag could be derived, or
// a negative error from class-method lookup for CEPH_OSD_OP_CALL.
int OSD::init_op_flags(OpRequestRef& op)
{
  const MOSDOp *m = static_cast<const MOSDOp*>(op->get_req());
  vector<OSDOp>::const_iterator iter;

  // client flags have no bearing on whether an op is a read, write, etc.
  op->rmw_flags = 0;

  if (m->has_flag(CEPH_OSD_FLAG_RWORDERED)) {
    op->set_force_rwordered();
  }

  // set bits based on op codes, called methods.
  for (iter = m->ops.begin(); iter != m->ops.end(); ++iter) {
    if ((iter->op.op == CEPH_OSD_OP_WATCH &&
	 iter->op.watch.op == CEPH_OSD_WATCH_OP_PING)) {
      /* This a bit odd.  PING isn't actually a write.  It can't
       * result in an update to the object_info.  PINGs also aren't
       * resent, so there's no reason to write out a log entry.
       *
       * However, we pipeline them behind writes, so let's force
       * the write_ordered flag.
       */
      op->set_force_rwordered();
    } else {
      if (ceph_osd_op_mode_modify(iter->op.op))
	op->set_write();
    }
    if (ceph_osd_op_mode_read(iter->op.op))
      op->set_read();

    // set READ flag if there are src_oids
    if (iter->soid.oid.name.length())
      op->set_read();

    // set PGOP flag if there are PG ops
    if (ceph_osd_op_type_pg(iter->op.op))
      op->set_pg_op();

    if (ceph_osd_op_mode_cache(iter->op.op))
      op->set_cache();

    // check for ec base pool
    int64_t poolid = m->get_pg().pool();
    const pg_pool_t *pool = osdmap->get_pg_pool(poolid);
    if (pool && pool->is_tier()) {
      const pg_pool_t *base_pool = osdmap->get_pg_pool(pool->tier_of);
      if (base_pool && base_pool->require_rollback()) {
	// on an EC base pool, anything outside this allow-list must be
	// promoted into the cache tier before it can be executed
	if ((iter->op.op != CEPH_OSD_OP_READ) &&
	    (iter->op.op != CEPH_OSD_OP_CHECKSUM) &&
	    (iter->op.op != CEPH_OSD_OP_CMPEXT) &&
	    (iter->op.op != CEPH_OSD_OP_STAT) &&
	    (iter->op.op != CEPH_OSD_OP_ISDIRTY) &&
	    (iter->op.op != CEPH_OSD_OP_UNDIRTY) &&
	    (iter->op.op != CEPH_OSD_OP_GETXATTR) &&
	    (iter->op.op != CEPH_OSD_OP_GETXATTRS) &&
	    (iter->op.op != CEPH_OSD_OP_CMPXATTR) &&
	    (iter->op.op != CEPH_OSD_OP_ASSERT_VER) &&
	    (iter->op.op != CEPH_OSD_OP_LIST_WATCHERS) &&
	    (iter->op.op != CEPH_OSD_OP_LIST_SNAPS) &&
	    (iter->op.op != CEPH_OSD_OP_SETALLOCHINT) &&
	    (iter->op.op != CEPH_OSD_OP_WRITEFULL) &&
	    (iter->op.op != CEPH_OSD_OP_ROLLBACK) &&
	    (iter->op.op != CEPH_OSD_OP_CREATE) &&
	    (iter->op.op != CEPH_OSD_OP_DELETE) &&
	    (iter->op.op != CEPH_OSD_OP_SETXATTR) &&
	    (iter->op.op != CEPH_OSD_OP_RMXATTR) &&
	    (iter->op.op != CEPH_OSD_OP_STARTSYNC) &&
	    (iter->op.op != CEPH_OSD_OP_COPY_GET) &&
	    (iter->op.op != CEPH_OSD_OP_COPY_FROM)) {
	  op->set_promote();
	}
      }
    }

    switch (iter->op.op) {
    case CEPH_OSD_OP_CALL:
      {
	// decode class/method names from the op payload and derive flags
	// from the registered method's declared RD/WR/PROMOTE bits
	bufferlist::iterator bp = const_cast<bufferlist&>(iter->indata).begin();
	int is_write, is_read;
	string cname, mname;
	bp.copy(iter->op.cls.class_len, cname);
	bp.copy(iter->op.cls.method_len, mname);

	ClassHandler::ClassData *cls;
	int r = class_handler->open_class(cname, &cls);
	if (r) {
	  derr << "class " << cname << " open got " << cpp_strerror(r) << dendl;
	  if (r == -ENOENT)
	    r = -EOPNOTSUPP;
	  else if (r != -EPERM) // propagate permission errors
	    r = -EIO;
	  return r;
	}
	int flags = cls->get_method_flags(mname.c_str());
	if (flags < 0) {
	  if (flags == -ENOENT)
	    r = -EOPNOTSUPP;
	  else
	    r = flags;
	  return r;
	}
	is_read = flags & CLS_METHOD_RD;
	is_write = flags & CLS_METHOD_WR;
	bool is_promote = flags & CLS_METHOD_PROMOTE;

	dout(10) << "class " << cname << " method " << mname << " "
		 << "flags=" << (is_read ? "r" : "")
		 << (is_write ? "w" : "")
		 << (is_promote ? "p" : "")
		 << dendl;
	if (is_read)
	  op->set_class_read();
	if (is_write)
	  op->set_class_write();
	if (is_promote)
	  op->set_promote();
	op->add_class(std::move(cname), std::move(mname), is_read, is_write,
		      cls->whitelisted);
	break;
      }

    case CEPH_OSD_OP_WATCH:
      // force the read bit for watch since it is depends on previous
      // watch state (and may return early if the watch exists) or, in
      // the case of ping, is simply a read op.
      op->set_read();
      // fall through
    case CEPH_OSD_OP_NOTIFY:
    case CEPH_OSD_OP_NOTIFY_ACK:
      {
	op->set_promote();
	break;
      }

    case CEPH_OSD_OP_DELETE:
      // if we get a delete with FAILOK we can skip handle cache. without
      // FAILOK we still need to promote (or do something smarter) to
      // determine whether to return ENOENT or 0.
      if (iter == m->ops.begin() &&
	  iter->op.flags == CEPH_OSD_OP_FLAG_FAILOK) {
	op->set_skip_handle_cache();
      }
      // skip promotion when proxying a delete op
      if (m->ops.size() == 1) {
	op->set_skip_promote();
      }
      break;

    case CEPH_OSD_OP_CACHE_TRY_FLUSH:
    case CEPH_OSD_OP_CACHE_FLUSH:
    case CEPH_OSD_OP_CACHE_EVICT:
      // If try_flush/flush/evict is the only op, can skip handle cache.
      if (m->ops.size() == 1) {
	op->set_skip_handle_cache();
      }
      break;

    case CEPH_OSD_OP_READ:
    case CEPH_OSD_OP_SYNC_READ:
    case CEPH_OSD_OP_SPARSE_READ:
    case CEPH_OSD_OP_CHECKSUM:
    case CEPH_OSD_OP_WRITEFULL:
      // client hinted the data is cold; don't pull it into the cache tier
      if (m->ops.size() == 1 &&
	  (iter->op.flags & CEPH_OSD_OP_FLAG_FADVISE_NOCACHE ||
	   iter->op.flags & CEPH_OSD_OP_FLAG_FADVISE_DONTNEED)) {
	op->set_skip_promote();
      }
      break;

    // force promotion when pin an object in cache tier
    case CEPH_OSD_OP_CACHE_PIN:
      op->set_promote();
      break;

    default:
      break;
    }
  }

  // an op that classified as neither read nor write nor anything else is
  // malformed
  if (op->rmw_flags == 0)
    return -EINVAL;

  return 0;
}
10629
// Install the mgr-provided dynamic perf metric queries: filter out the
// unsupported ones (those without a key descriptor), store the result under
// m_perf_queries_lock, and push the supported set down to every PG.
void OSD::set_perf_queries(
    const std::map<OSDPerfMetricQuery, OSDPerfMetricLimits> &queries) {
  dout(10) << "setting " << queries.size() << " queries" << dendl;

  std::list<OSDPerfMetricQuery> supported_queries;
  for (auto &it : queries) {
    auto &query = it.first;
    // queries without a key descriptor cannot be evaluated here
    if (!query.key_descriptor.empty()) {
      supported_queries.push_back(query);
    }
  }
  if (supported_queries.size() < queries.size()) {
    dout(1) << queries.size() - supported_queries.size()
	    << " unsupported queries" << dendl;
  }

  {
    Mutex::Locker locker(m_perf_queries_lock);
    m_perf_queries = supported_queries;
    // limits are kept for all queries, including unsupported ones
    m_perf_limits = queries;
  }

  std::vector<PGRef> pgs;
  _get_pgs(&pgs);
  for (auto& pg : pgs) {
    pg->lock();
    pg->set_dynamic_perf_stats_queries(supported_queries);
    pg->unlock();
  }
}
10660
// Collect dynamic perf stats from every PG, merge them into a single
// accumulator, and emit per-query reports bounded by the stored limits.
void OSD::get_perf_reports(
    std::map<OSDPerfMetricQuery, OSDPerfMetricReport> *reports) {
  std::vector<PGRef> pgs;
  _get_pgs(&pgs);
  DynamicPerfStats dps;
  for (auto& pg : pgs) {
    // m_perf_queries can be modified only in set_perf_queries by mgr client
    // request, and it is protected by by mgr client's lock, which is held
    // when set_perf_queries/get_perf_reports are called, so we may not hold
    // m_perf_queries_lock here.
    DynamicPerfStats pg_dps(m_perf_queries);
    pg->lock();
    pg->get_dynamic_perf_stats(&pg_dps);
    pg->unlock();
    dps.merge(pg_dps);
  }
  dps.add_to_reports(m_perf_limits, reports);
  dout(20) << "reports for " << reports->size() << " queries" << dendl;
}
10680
10681 // =============================================================
10682
10683 #undef dout_context
10684 #define dout_context cct
10685 #undef dout_prefix
10686 #define dout_prefix *_dout << "osd." << osd->get_nodeid() << ":" << shard_id << "." << __func__ << " "
10687
// Bind a PG to its shard slot: wire the back-pointers, bump the OSD's PG
// count, and index the slot by the PG's current map epoch.
// NOTE(review): no locking here — caller presumably holds shard_lock, as
// other accessors of pg_slots_by_epoch do; confirm at call sites.
void OSDShard::_attach_pg(OSDShardPGSlot *slot, PG *pg)
{
  dout(10) << pg->pg_id << " " << pg << dendl;
  slot->pg = pg;
  pg->osd_shard = this;
  pg->pg_slot = slot;
  osd->inc_num_pgs();

  slot->epoch = pg->get_osdmap_epoch();
  pg_slots_by_epoch.insert(*slot);
}
10699
// Undo _attach_pg(): clear the slot/PG back-pointers, drop the OSD's PG
// count, and remove the slot from the by-epoch index.  Wakes any thread
// blocked in wait_min_pg_epoch(), since the minimum epoch may have changed.
void OSDShard::_detach_pg(OSDShardPGSlot *slot)
{
  dout(10) << slot->pg->pg_id << " " << slot->pg << dendl;
  slot->pg->osd_shard = nullptr;
  slot->pg->pg_slot = nullptr;
  slot->pg = nullptr;
  osd->dec_num_pgs();

  // iterator_to locates the intrusive-set node for this slot
  pg_slots_by_epoch.erase(pg_slots_by_epoch.iterator_to(*slot));
  slot->epoch = 0;
  if (waiting_for_min_pg_epoch) {
    min_pg_epoch_cond.notify_all();
  }
}
10714
// Move a slot to a new epoch in the by-epoch index (erase + re-insert so
// the intrusive set stays ordered) and wake waiters, since the shard's
// minimum PG epoch may have advanced.
void OSDShard::update_pg_epoch(OSDShardPGSlot *slot, epoch_t e)
{
  std::lock_guard l(shard_lock);
  dout(30) << "min was " << pg_slots_by_epoch.begin()->epoch
	   << " on " << pg_slots_by_epoch.begin()->pg->pg_id << dendl;
  pg_slots_by_epoch.erase(pg_slots_by_epoch.iterator_to(*slot));
  dout(20) << slot->pg->pg_id << " " << slot->epoch << " -> " << e << dendl;
  slot->epoch = e;
  pg_slots_by_epoch.insert(*slot);
  dout(30) << "min is now " << pg_slots_by_epoch.begin()->epoch
	   << " on " << pg_slots_by_epoch.begin()->pg->pg_id << dendl;
  if (waiting_for_min_pg_epoch) {
    min_pg_epoch_cond.notify_all();
  }
}
10730
10731 epoch_t OSDShard::get_min_pg_epoch()
10732 {
10733 std::lock_guard l(shard_lock);
10734 auto p = pg_slots_by_epoch.begin();
10735 if (p == pg_slots_by_epoch.end()) {
10736 return 0;
10737 }
10738 return p->epoch;
10739 }
10740
// Block until every PG on this shard has advanced to at least epoch 'need'
// (or the shard has no PGs).  _detach_pg()/update_pg_epoch() signal the
// condition as the minimum changes.
void OSDShard::wait_min_pg_epoch(epoch_t need)
{
  std::unique_lock l{shard_lock};
  // counter (not bool) so nested/concurrent waiters keep signaling enabled
  ++waiting_for_min_pg_epoch;
  min_pg_epoch_cond.wait(l, [need, this] {
    if (pg_slots_by_epoch.empty()) {
      return true;
    } else if (pg_slots_by_epoch.begin()->epoch >= need) {
      return true;
    } else {
      dout(10) << need << " waiting on "
	       << pg_slots_by_epoch.begin()->epoch << dendl;
      return false;
    }
  });
  --waiting_for_min_pg_epoch;
}
10758
10759 epoch_t OSDShard::get_max_waiting_epoch()
10760 {
10761 std::lock_guard l(shard_lock);
10762 epoch_t r = 0;
10763 for (auto& i : pg_slots) {
10764 if (!i.second->waiting_peering.empty()) {
10765 r = std::max(r, i.second->waiting_peering.rbegin()->first);
10766 }
10767 }
10768 return r;
10769 }
10770
// Swap in a newer osdmap for this shard and re-evaluate every PG slot:
// requeue peering work that the new map unblocks, drop waiting ops that are
// now stale or misdirected (crediting their reserved pushes back via
// *pushes_to_free), and prune slots that have become completely empty.
void OSDShard::consume_map(
  OSDMapRef& new_osdmap,
  unsigned *pushes_to_free)
{
  std::lock_guard l(shard_lock);
  OSDMapRef old_osdmap;
  {
    // osdmap_lock guards shard_osdmap readers that don't take shard_lock
    std::lock_guard l(osdmap_lock);
    old_osdmap = std::move(shard_osdmap);
    shard_osdmap = new_osdmap;
  }
  dout(10) << new_osdmap->get_epoch()
           << " (was " << (old_osdmap ? old_osdmap->get_epoch() : 0) << ")"
	   << dendl;
  bool queued = false;

  // check slots
  auto p = pg_slots.begin();
  while (p != pg_slots.end()) {
    OSDShardPGSlot *slot = p->second.get();
    const spg_t& pgid = p->first;
    dout(20) << __func__ << " " << pgid << dendl;
    // a slot waiting for a split or merge is left untouched
    if (!slot->waiting_for_split.empty()) {
      dout(20) << __func__ << " " << pgid
	       << " waiting for split " << slot->waiting_for_split << dendl;
      ++p;
      continue;
    }
    if (slot->waiting_for_merge_epoch > new_osdmap->get_epoch()) {
      dout(20) << __func__ << " " << pgid
	       << " waiting for merge by epoch " << slot->waiting_for_merge_epoch
	       << dendl;
      ++p;
      continue;
    }
    if (!slot->waiting_peering.empty()) {
      epoch_t first = slot->waiting_peering.begin()->first;
      // the new map satisfies the earliest waiting epoch: requeue everything
      if (first <= new_osdmap->get_epoch()) {
	dout(20) << __func__ << " " << pgid
		 << " pending_peering first epoch " << first
		 << " <= " << new_osdmap->get_epoch() << ", requeueing" << dendl;
	_wake_pg_slot(pgid, slot);
	queued = true;
      }
      ++p;
      continue;
    }
    if (!slot->waiting.empty()) {
      if (new_osdmap->is_up_acting_osd_shard(pgid, osd->get_nodeid())) {
	dout(20) << __func__ << " " << pgid << " maps to us, keeping"
		 << dendl;
	++p;
	continue;
      }
      // PG no longer maps here: drop waiting items whose map epoch the new
      // map covers, returning their reserved recovery pushes
      while (!slot->waiting.empty() &&
	     slot->waiting.front().get_map_epoch() <= new_osdmap->get_epoch()) {
	auto& qi = slot->waiting.front();
	dout(20) << __func__ << " " << pgid
		 << " waiting item " << qi
		 << " epoch " << qi.get_map_epoch()
		 << " <= " << new_osdmap->get_epoch()
		 << ", "
		 << (qi.get_map_epoch() < new_osdmap->get_epoch() ? "stale" :
		     "misdirected")
		 << ", dropping" << dendl;
        *pushes_to_free += qi.get_reserved_pushes();
	slot->waiting.pop_front();
      }
    }
    // nothing queued, nothing running, no PG attached: the slot is garbage
    if (slot->waiting.empty() &&
	slot->num_running == 0 &&
	slot->waiting_for_split.empty() &&
	!slot->pg) {
      dout(20) << __func__ << " " << pgid << " empty, pruning" << dendl;
      p = pg_slots.erase(p);
      continue;
    }

    ++p;
  }
  if (queued) {
    // wake a worker to pick up the requeued items
    std::lock_guard l{sdata_wait_lock};
    sdata_cond.notify_one();
  }
}
10856
// Requeue everything parked on a pg slot back onto the front of the shard's
// pqueue: the in-flight to_process items, the generic waiting list, and all
// waiting_peering buckets.  Each list is walked in *reverse* and pushed via
// _enqueue_front, which preserves the original relative order at the head of
// the queue.  Finally requeue_seq is bumped so that any _process() thread
// that sampled the slot before we ran will notice the race and restart.
//
// Caller must hold shard_lock.
void OSDShard::_wake_pg_slot(
  spg_t pgid,
  OSDShardPGSlot *slot)
{
  dout(20) << __func__ << " " << pgid
	   << " to_process " << slot->to_process
	   << " waiting " << slot->waiting
	   << " waiting_peering " << slot->waiting_peering << dendl;
  // reverse iteration + enqueue_front keeps items in their original order
  for (auto i = slot->to_process.rbegin();
       i != slot->to_process.rend();
       ++i) {
    _enqueue_front(std::move(*i), osd->op_prio_cutoff);
  }
  slot->to_process.clear();
  for (auto i = slot->waiting.rbegin();
       i != slot->waiting.rend();
       ++i) {
    _enqueue_front(std::move(*i), osd->op_prio_cutoff);
  }
  slot->waiting.clear();
  for (auto i = slot->waiting_peering.rbegin();
       i != slot->waiting_peering.rend();
       ++i) {
    // this is overkill; we requeue everything, even if some of these
    // items are waiting for maps we don't have yet.  FIXME, maybe,
    // someday, if we decide this inefficiency matters
    for (auto j = i->second.rbegin(); j != i->second.rend(); ++j) {
      _enqueue_front(std::move(*j), osd->op_prio_cutoff);
    }
  }
  slot->waiting_peering.clear();
  // invalidate any requeue_seq snapshots taken by racing _process() calls
  ++slot->requeue_seq;
}
10890
10891 void OSDShard::identify_splits_and_merges(
10892 const OSDMapRef& as_of_osdmap,
10893 set<pair<spg_t,epoch_t>> *split_pgs,
10894 set<pair<spg_t,epoch_t>> *merge_pgs)
10895 {
10896 std::lock_guard l(shard_lock);
10897 if (shard_osdmap) {
10898 for (auto& i : pg_slots) {
10899 const spg_t& pgid = i.first;
10900 auto *slot = i.second.get();
10901 if (slot->pg) {
10902 osd->service.identify_splits_and_merges(
10903 shard_osdmap, as_of_osdmap, pgid,
10904 split_pgs, merge_pgs);
10905 } else if (!slot->waiting_for_split.empty()) {
10906 osd->service.identify_splits_and_merges(
10907 shard_osdmap, as_of_osdmap, pgid,
10908 split_pgs, nullptr);
10909 } else {
10910 dout(20) << __func__ << " slot " << pgid
10911 << " has no pg and waiting_for_split "
10912 << slot->waiting_for_split << dendl;
10913 }
10914 }
10915 }
10916 }
10917
10918 void OSDShard::prime_splits(const OSDMapRef& as_of_osdmap,
10919 set<pair<spg_t,epoch_t>> *pgids)
10920 {
10921 std::lock_guard l(shard_lock);
10922 _prime_splits(pgids);
10923 if (shard_osdmap->get_epoch() > as_of_osdmap->get_epoch()) {
10924 set<pair<spg_t,epoch_t>> newer_children;
10925 for (auto i : *pgids) {
10926 osd->service.identify_splits_and_merges(
10927 as_of_osdmap, shard_osdmap, i.first,
10928 &newer_children, nullptr);
10929 }
10930 newer_children.insert(pgids->begin(), pgids->end());
10931 dout(10) << "as_of_osdmap " << as_of_osdmap->get_epoch() << " < shard "
10932 << shard_osdmap->get_epoch() << ", new children " << newer_children
10933 << dendl;
10934 _prime_splits(&newer_children);
10935 // note: we don't care what is left over here for other shards.
10936 // if this shard is ahead of us and one isn't, e.g., one thread is
10937 // calling into prime_splits via _process (due to a newly created
10938 // pg) and this shard has a newer map due to a racing consume_map,
10939 // then any grandchildren left here will be identified (or were
10940 // identified) when the slower shard's osdmap is advanced.
10941 // _prime_splits() will tolerate the case where the pgid is
10942 // already primed.
10943 }
10944 }
10945
10946 void OSDShard::_prime_splits(set<pair<spg_t,epoch_t>> *pgids)
10947 {
10948 dout(10) << *pgids << dendl;
10949 auto p = pgids->begin();
10950 while (p != pgids->end()) {
10951 unsigned shard_index = p->first.hash_to_shard(osd->num_shards);
10952 if (shard_index == shard_id) {
10953 auto r = pg_slots.emplace(p->first, nullptr);
10954 if (r.second) {
10955 dout(10) << "priming slot " << p->first << " e" << p->second << dendl;
10956 r.first->second = make_unique<OSDShardPGSlot>();
10957 r.first->second->waiting_for_split.insert(p->second);
10958 } else {
10959 auto q = r.first;
10960 ceph_assert(q != pg_slots.end());
10961 dout(10) << "priming (existing) slot " << p->first << " e" << p->second
10962 << dendl;
10963 q->second->waiting_for_split.insert(p->second);
10964 }
10965 p = pgids->erase(p);
10966 } else {
10967 ++p;
10968 }
10969 }
10970 }
10971
// Claim the merge-participant pgs in *merge_pgs that hash to this shard:
// ensure each has a slot, instantiate an empty placeholder PG when the
// participant does not exist locally (so the merge has something to fold),
// and record the merge epoch in waiting_for_merge_epoch.  Claimed entries
// are erased from *merge_pgs so the caller can pass the rest to other
// shards.
void OSDShard::prime_merges(const OSDMapRef& as_of_osdmap,
			    set<pair<spg_t,epoch_t>> *merge_pgs)
{
  std::lock_guard l(shard_lock);
  dout(20) << __func__ << " checking shard " << shard_id
	   << " for remaining merge pgs " << merge_pgs << dendl;
  auto p = merge_pgs->begin();
  while (p != merge_pgs->end()) {
    spg_t pgid = p->first;
    epoch_t epoch = p->second;
    unsigned shard_index = pgid.hash_to_shard(osd->num_shards);
    if (shard_index != shard_id) {
      // some other shard's responsibility; skip
      ++p;
      continue;
    }
    OSDShardPGSlot *slot;
    auto r = pg_slots.emplace(pgid, nullptr);
    if (r.second) {
      r.first->second = make_unique<OSDShardPGSlot>();
    }
    slot = r.first->second.get();
    if (slot->pg) {
      // already have pg
      dout(20) << __func__ << " have merge participant pg " << pgid
	       << " " << slot->pg << dendl;
    } else if (!slot->waiting_for_split.empty() &&
	       *slot->waiting_for_split.begin() < epoch) {
      // a split that predates the merge epoch is still pending; the split
      // will produce the participant, so don't create a placeholder here.
      dout(20) << __func__ << " pending split on merge participant pg " << pgid
	       << " " << slot->waiting_for_split << dendl;
    } else {
      dout(20) << __func__ << " creating empty merge participant " << pgid
	       << " for merge in " << epoch << dendl;
      // leave history zeroed; PG::merge_from() will fill it in.
      pg_history_t history;
      // epoch - 1: the participant must exist in the epoch just before the
      // merge takes effect.
      PGCreateInfo cinfo(pgid, epoch - 1,
			 history, PastIntervals(), false);
      PGRef pg = osd->handle_pg_create_info(shard_osdmap, &cinfo);
      _attach_pg(r.first->second.get(), pg.get());
      _wake_pg_slot(pgid, slot);
      // handle_pg_create_info returns the pg locked; release it now
      pg->unlock();
    }
    // mark slot for merge
    dout(20) << __func__ << " marking merge participant " << pgid << dendl;
    slot->waiting_for_merge_epoch = epoch;
    p = merge_pgs->erase(p);
  }
}
11019
// Called when a newly-split child PG is ready: attach it to its primed slot,
// clear the matching waiting_for_split epoch, requeue any parked work, and
// kick both the child (via a null peering event) and the shard's workers so
// the child advances to the latest osdmap.
void OSDShard::register_and_wake_split_child(PG *pg)
{
  epoch_t epoch;
  {
    std::lock_guard l(shard_lock);
    dout(10) << pg->pg_id << " " << pg << dendl;
    auto p = pg_slots.find(pg->pg_id);
    ceph_assert(p != pg_slots.end());
    auto *slot = p->second.get();
    dout(20) << pg->pg_id << " waiting_for_split " << slot->waiting_for_split
	     << dendl;
    // the slot must have been primed for this split and not yet attached
    ceph_assert(!slot->pg);
    ceph_assert(!slot->waiting_for_split.empty());
    _attach_pg(slot, pg);

    epoch = pg->get_osdmap_epoch();
    ceph_assert(slot->waiting_for_split.count(epoch));
    slot->waiting_for_split.erase(epoch);
    if (slot->waiting_for_split.empty()) {
      // no further splits pending; release everything parked on the slot
      _wake_pg_slot(pg->pg_id, slot);
    } else {
      dout(10) << __func__ << " still waiting for split on "
	       << slot->waiting_for_split << dendl;
    }
  }

  // kick child to ensure it pulls up to the latest osdmap
  osd->enqueue_peering_evt(
    pg->pg_id,
    PGPeeringEventRef(
      std::make_shared<PGPeeringEvent>(
	epoch,
	epoch,
	NullEvt())));

  // wake a worker thread on this shard (taken after shard_lock is dropped
  // to respect the sdata_wait_lock -> no-shard_lock ordering used elsewhere)
  std::lock_guard l{sdata_wait_lock};
  sdata_cond.notify_one();
}
11058
11059 void OSDShard::unprime_split_children(spg_t parent, unsigned old_pg_num)
11060 {
11061 std::lock_guard l(shard_lock);
11062 vector<spg_t> to_delete;
11063 for (auto& i : pg_slots) {
11064 if (i.first != parent &&
11065 i.first.get_ancestor(old_pg_num) == parent) {
11066 dout(10) << __func__ << " parent " << parent << " clearing " << i.first
11067 << dendl;
11068 _wake_pg_slot(i.first, i.second.get());
11069 to_delete.push_back(i.first);
11070 }
11071 }
11072 for (auto pgid : to_delete) {
11073 pg_slots.erase(pgid);
11074 }
11075 }
11076
11077
11078 // =============================================================
11079
11080 #undef dout_context
11081 #define dout_context osd->cct
11082 #undef dout_prefix
11083 #define dout_prefix *_dout << "osd." << osd->whoami << " op_wq "
11084
11085 void OSD::ShardedOpWQ::_add_slot_waiter(
11086 spg_t pgid,
11087 OSDShardPGSlot *slot,
11088 OpQueueItem&& qi)
11089 {
11090 if (qi.is_peering()) {
11091 dout(20) << __func__ << " " << pgid
11092 << " peering, item epoch is "
11093 << qi.get_map_epoch()
11094 << ", will wait on " << qi << dendl;
11095 slot->waiting_peering[qi.get_map_epoch()].push_back(std::move(qi));
11096 } else {
11097 dout(20) << __func__ << " " << pgid
11098 << " item epoch is "
11099 << qi.get_map_epoch()
11100 << ", will wait on " << qi << dendl;
11101 slot->waiting.push_back(std::move(qi));
11102 }
11103 }
11104
11105 #undef dout_prefix
11106 #define dout_prefix *_dout << "osd." << osd->whoami << " op_wq(" << shard_index << ") "
11107
// Worker-thread entry point: dequeue one OpQueueItem from this thread's
// shard, resolve it against the pg slot table (running, parking, creating a
// pg for, or dropping it as the shard's osdmap dictates), and execute it.
// Also drains the shard's context_queue of oncommit callbacks -- but only on
// the lowest-indexed thread per shard, to keep commit callbacks ordered.
void OSD::ShardedOpWQ::_process(uint32_t thread_index, heartbeat_handle_d *hb)
{
  uint32_t shard_index = thread_index % osd->num_shards;
  auto& sdata = osd->shards[shard_index];
  ceph_assert(sdata);

  // If all threads of shards do oncommits, there is a out-of-order
  // problem.  So we choose the thread which has the smallest
  // thread_index(thread_index < num_shards) of shard to do oncommit
  // callback.
  bool is_smallest_thread_index = thread_index < osd->num_shards;

  // peek at spg_t
  sdata->shard_lock.lock();
  if (sdata->pqueue->empty() &&
      (!is_smallest_thread_index || sdata->context_queue.empty())) {
    // nothing to do: sleep on the shard condvar.  note the lock dance --
    // sdata_wait_lock is taken before shard_lock is dropped so a wakeup
    // between the two cannot be lost.
    std::unique_lock wait_lock{sdata->sdata_wait_lock};
    if (is_smallest_thread_index && !sdata->context_queue.empty()) {
      // we raced with a context_queue addition, don't wait
      wait_lock.unlock();
    } else if (!sdata->stop_waiting) {
      dout(20) << __func__ << " empty q, waiting" << dendl;
      osd->cct->get_heartbeat_map()->clear_timeout(hb);
      sdata->shard_lock.unlock();
      sdata->sdata_cond.wait(wait_lock);
      wait_lock.unlock();
      sdata->shard_lock.lock();
      // spurious or stale wakeup: if there is still nothing for us, return
      // and let the thread pool call us again
      if (sdata->pqueue->empty() &&
	  !(is_smallest_thread_index && !sdata->context_queue.empty())) {
	sdata->shard_lock.unlock();
	return;
      }
      osd->cct->get_heartbeat_map()->reset_timeout(hb,
	osd->cct->_conf->threadpool_default_timeout, 0);
    } else {
      // shard is draining (stop_waiting set); bail out immediately
      dout(20) << __func__ << " need return immediately" << dendl;
      wait_lock.unlock();
      sdata->shard_lock.unlock();
      return;
    }
  }

  // grab any pending oncommit contexts (lowest-indexed thread only)
  list<Context *> oncommits;
  if (is_smallest_thread_index && !sdata->context_queue.empty()) {
    sdata->context_queue.swap(oncommits);
  }

  if (sdata->pqueue->empty()) {
    if (osd->is_stopping()) {
      sdata->shard_lock.unlock();
      for (auto c : oncommits) {
	dout(10) << __func__ << " discarding in-flight oncommit " << c << dendl;
	delete c;
      }
      return;    // OSD shutdown, discard.
    }
    sdata->shard_lock.unlock();
    handle_oncommits(oncommits);
    return;
  }

  OpQueueItem item = sdata->pqueue->dequeue();
  if (osd->is_stopping()) {
    sdata->shard_lock.unlock();
    for (auto c : oncommits) {
      dout(10) << __func__ << " discarding in-flight oncommit " << c << dendl;
      delete c;
    }
    return;    // OSD shutdown, discard.
  }

  // find (or create) the pg slot for this item's ordering token (pgid)
  const auto token = item.get_ordering_token();
  auto r = sdata->pg_slots.emplace(token, nullptr);
  if (r.second) {
    r.first->second = make_unique<OSDShardPGSlot>();
  }
  OSDShardPGSlot *slot = r.first->second.get();
  dout(20) << __func__ << " " << token
	   << (r.second ? " (new)" : "")
	   << " to_process " << slot->to_process
	   << " waiting " << slot->waiting
	   << " waiting_peering " << slot->waiting_peering
	   << dendl;
  slot->to_process.push_back(std::move(item));
  dout(20) << __func__ << " " << slot->to_process.back()
	   << " queued" << dendl;

 retry_pg:
  PGRef pg = slot->pg;

  // lock pg (if we have it)
  if (pg) {
    // note the requeue seq now...  we must drop shard_lock to take the pg
    // lock (lock ordering), so anything observed through `slot` has to be
    // re-validated after we reacquire shard_lock below.
    uint64_t requeue_seq = slot->requeue_seq;
    ++slot->num_running;

    sdata->shard_lock.unlock();
    osd->service.maybe_inject_dispatch_delay();
    pg->lock();
    osd->service.maybe_inject_dispatch_delay();
    sdata->shard_lock.lock();

    auto q = sdata->pg_slots.find(token);
    if (q == sdata->pg_slots.end()) {
      // this can happen if we race with pg removal.
      dout(20) << __func__ << " slot " << token << " no longer there" << dendl;
      pg->unlock();
      sdata->shard_lock.unlock();
      handle_oncommits(oncommits);
      return;
    }
    slot = q->second.get();
    --slot->num_running;

    if (slot->to_process.empty()) {
      // raced with _wake_pg_slot or consume_map
      dout(20) << __func__ << " " << token
	       << " nothing queued" << dendl;
      pg->unlock();
      sdata->shard_lock.unlock();
      handle_oncommits(oncommits);
      return;
    }
    if (requeue_seq != slot->requeue_seq) {
      // slot contents were requeued while we were taking the pg lock; our
      // item is back in the pqueue and will be picked up again
      dout(20) << __func__ << " " << token
	       << " requeue_seq " << slot->requeue_seq << " > our "
	       << requeue_seq << ", we raced with _wake_pg_slot"
	       << dendl;
      pg->unlock();
      sdata->shard_lock.unlock();
      handle_oncommits(oncommits);
      return;
    }
    if (slot->pg != pg) {
      // this can happen if we race with pg removal.
      dout(20) << __func__ << " slot " << token << " no longer attached to "
	       << pg << dendl;
      pg->unlock();
      goto retry_pg;
    }
  }

  dout(20) << __func__ << " " << token
	   << " to_process " << slot->to_process
	   << " waiting " << slot->waiting
	   << " waiting_peering " << slot->waiting_peering << dendl;

  ThreadPool::TPHandle tp_handle(osd->cct, hb, timeout_interval,
				 suicide_interval);

  // take next item
  auto qi = std::move(slot->to_process.front());
  slot->to_process.pop_front();
  dout(20) << __func__ << " " << qi << " pg " << pg << dendl;
  set<pair<spg_t,epoch_t>> new_children;
  OSDMapRef osdmap;

  // no pg attached to the slot: decide whether to wait, create, run
  // pg-less, or drop, based on the shard's osdmap.  (while (!pg) rather
  // than if: the create path below can attach a pg and `break` out.)
  while (!pg) {
    // should this pg shard exist on this osd in this (or a later) epoch?
    osdmap = sdata->shard_osdmap;
    const PGCreateInfo *create_info = qi.creates_pg();
    if (!slot->waiting_for_split.empty()) {
      // pg is in mid-split; park the item until the split completes
      dout(20) << __func__ << " " << token
	       << " splitting " << slot->waiting_for_split << dendl;
      _add_slot_waiter(token, slot, std::move(qi));
    } else if (qi.get_map_epoch() > osdmap->get_epoch()) {
      // item is from the future; park until our map catches up
      dout(20) << __func__ << " " << token
	       << " map " << qi.get_map_epoch() << " > "
	       << osdmap->get_epoch() << dendl;
      _add_slot_waiter(token, slot, std::move(qi));
    } else if (qi.is_peering()) {
      if (!qi.peering_requires_pg()) {
	// for pg-less events, we run them under the ordering lock, since
	// we don't have the pg lock to keep them ordered.
	qi.run(osd, sdata, pg, tp_handle);
      } else if (osdmap->is_up_acting_osd_shard(token, osd->whoami)) {
	if (create_info) {
	  if (create_info->by_mon &&
	      osdmap->get_pg_acting_primary(token.pgid) != osd->whoami) {
	    dout(20) << __func__ << " " << token
		     << " no pg, no longer primary, ignoring mon create on "
		     << qi << dendl;
	  } else {
	    dout(20) << __func__ << " " << token
		     << " no pg, should create on " << qi << dendl;
	    pg = osd->handle_pg_create_info(osdmap, create_info);
	    if (pg) {
	      // we created the pg! drop out and continue "normally"!
	      sdata->_attach_pg(slot, pg.get());
	      sdata->_wake_pg_slot(token, slot);

	      // identify split children between create epoch and shard epoch.
	      osd->service.identify_splits_and_merges(
		pg->get_osdmap(), osdmap, pg->pg_id, &new_children, nullptr);
	      sdata->_prime_splits(&new_children);
	      // distribute remaining split children to other shards below!
	      break;
	    }
	    dout(20) << __func__ << " ignored create on " << qi << dendl;
	  }
	} else {
	  dout(20) << __func__ << " " << token
		   << " no pg, peering, !create, discarding " << qi << dendl;
	}
      } else {
	dout(20) << __func__ << " " << token
		 << " no pg, peering, doesn't map here e" << osdmap->get_epoch()
		 << ", discarding " << qi
		 << dendl;
      }
    } else if (osdmap->is_up_acting_osd_shard(token, osd->whoami)) {
      // pg should exist here but hasn't been instantiated yet; wait for it
      dout(20) << __func__ << " " << token
	       << " no pg, should exist e" << osdmap->get_epoch()
	       << ", will wait on " << qi << dendl;
      _add_slot_waiter(token, slot, std::move(qi));
    } else {
      // misdirected: this osd is not in the up/acting set for the pg
      dout(20) << __func__ << " " << token
	       << " no pg, shouldn't exist e" << osdmap->get_epoch()
	       << ", dropping " << qi << dendl;
      // share map with client?
      if (boost::optional<OpRequestRef> _op = qi.maybe_get_op()) {
	auto priv = (*_op)->get_req()->get_connection()->get_priv();
	if (auto session = static_cast<Session *>(priv.get()); session) {
	  osd->maybe_share_map(session, *_op, sdata->shard_osdmap);
	}
      }
      // give back any recovery-push reservations held by the dropped item
      unsigned pushes_to_free = qi.get_reserved_pushes();
      if (pushes_to_free > 0) {
	sdata->shard_lock.unlock();
	osd->service.release_reserved_pushes(pushes_to_free);
	handle_oncommits(oncommits);
	return;
      }
    }
    sdata->shard_lock.unlock();
    handle_oncommits(oncommits);
    return;
  }
  if (qi.is_peering()) {
    OSDMapRef osdmap = sdata->shard_osdmap;
    if (qi.get_map_epoch() > osdmap->get_epoch()) {
      // peering event from a newer epoch than our shard map; park it
      _add_slot_waiter(token, slot, std::move(qi));
      sdata->shard_lock.unlock();
      pg->unlock();
      handle_oncommits(oncommits);
      return;
    }
  }
  sdata->shard_lock.unlock();

  // hand any split children we primed above to their owning shards
  if (!new_children.empty()) {
    for (auto shard : osd->shards) {
      shard->prime_splits(osdmap, &new_children);
    }
    ceph_assert(new_children.empty());
  }

  // osd_opwq_process marks the point at which an operation has been dequeued
  // and will begin to be handled by a worker thread.
  {
#ifdef WITH_LTTNG
    osd_reqid_t reqid;
    if (boost::optional<OpRequestRef> _op = qi.maybe_get_op()) {
      reqid = (*_op)->get_reqid();
    }
#endif
    tracepoint(osd, opwq_process_start, reqid.name._type,
	       reqid.name._num, reqid.tid, reqid.inc);
  }

  lgeneric_subdout(osd->cct, osd, 30) << "dequeue status: ";
  Formatter *f = Formatter::create("json");
  f->open_object_section("q");
  dump(f);
  f->close_section();
  f->flush(*_dout);
  delete f;
  *_dout << dendl;

  // run the item with the pg lock held (if the item has a pg)
  qi.run(osd, sdata, pg, tp_handle);

  {
#ifdef WITH_LTTNG
    osd_reqid_t reqid;
    if (boost::optional<OpRequestRef> _op = qi.maybe_get_op()) {
      reqid = (*_op)->get_reqid();
    }
#endif
    tracepoint(osd, opwq_process_finish, reqid.name._type,
	       reqid.name._num, reqid.tid, reqid.inc);
  }

  handle_oncommits(oncommits);
}
11402
11403 void OSD::ShardedOpWQ::_enqueue(OpQueueItem&& item) {
11404 uint32_t shard_index =
11405 item.get_ordering_token().hash_to_shard(osd->shards.size());
11406
11407 OSDShard* sdata = osd->shards[shard_index];
11408 assert (NULL != sdata);
11409 unsigned priority = item.get_priority();
11410 unsigned cost = item.get_cost();
11411 sdata->shard_lock.lock();
11412
11413 dout(20) << __func__ << " " << item << dendl;
11414 if (priority >= osd->op_prio_cutoff)
11415 sdata->pqueue->enqueue_strict(
11416 item.get_owner(), priority, std::move(item));
11417 else
11418 sdata->pqueue->enqueue(
11419 item.get_owner(), priority, cost, std::move(item));
11420 sdata->shard_lock.unlock();
11421
11422 std::lock_guard l{sdata->sdata_wait_lock};
11423 sdata->sdata_cond.notify_one();
11424 }
11425
// Requeue an item at the *front* of its shard's pqueue, used for items that
// must keep their position ahead of newer work.  If a racing _process() has
// already moved a newer item onto the slot's to_process list, swap with it
// so the older (requeued) item still runs first.
void OSD::ShardedOpWQ::_enqueue_front(OpQueueItem&& item)
{
  auto shard_index = item.get_ordering_token().hash_to_shard(osd->shards.size());
  auto& sdata = osd->shards[shard_index];
  ceph_assert(sdata);
  sdata->shard_lock.lock();
  auto p = sdata->pg_slots.find(item.get_ordering_token());
  if (p != sdata->pg_slots.end() &&
      !p->second->to_process.empty()) {
    // we may be racing with _process, which has dequeued a new item
    // from pqueue, put it on to_process, and is now busy taking the
    // pg lock.  ensure this old requeued item is ordered before any
    // such newer item in to_process.
    // (push our older item on the front of to_process, then pull the
    // newest item off the back and put *that* at the front of the
    // pqueue instead.)
    p->second->to_process.push_front(std::move(item));
    item = std::move(p->second->to_process.back());
    p->second->to_process.pop_back();
    dout(20) << __func__
	     << " " << p->second->to_process.front()
	     << " shuffled w/ " << item << dendl;
  } else {
    dout(20) << __func__ << " " << item << dendl;
  }
  sdata->_enqueue_front(std::move(item), osd->op_prio_cutoff);
  sdata->shard_lock.unlock();
  // wake a worker after dropping shard_lock
  std::lock_guard l{sdata->sdata_wait_lock};
  sdata->sdata_cond.notify_one();
}
11453
11454 namespace ceph {
11455 namespace osd_cmds {
11456
11457 int heap(CephContext& cct, const cmdmap_t& cmdmap, Formatter& f,
11458 std::ostream& os)
11459 {
11460 if (!ceph_using_tcmalloc()) {
11461 os << "could not issue heap profiler command -- not using tcmalloc!";
11462 return -EOPNOTSUPP;
11463 }
11464
11465 string cmd;
11466 if (!cmd_getval(&cct, cmdmap, "heapcmd", cmd)) {
11467 os << "unable to get value for command \"" << cmd << "\"";
11468 return -EINVAL;
11469 }
11470
11471 std::vector<std::string> cmd_vec;
11472 get_str_vec(cmd, cmd_vec);
11473
11474 string val;
11475 if (cmd_getval(&cct, cmdmap, "value", val)) {
11476 cmd_vec.push_back(val);
11477 }
11478
11479 ceph_heap_profiler_handle_command(cmd_vec, os);
11480
11481 return 0;
11482 }
11483
11484 }} // namespace ceph::osd_cmds
11485
11486
11487 std::ostream& operator<<(std::ostream& out, const io_queue& q) {
11488 switch(q) {
11489 case io_queue::prioritized:
11490 out << "prioritized";
11491 break;
11492 case io_queue::weightedpriority:
11493 out << "weightedpriority";
11494 break;
11495 case io_queue::mclock_opclass:
11496 out << "mclock_opclass";
11497 break;
11498 case io_queue::mclock_client:
11499 out << "mclock_client";
11500 break;
11501 }
11502 return out;
11503 }