]> git.proxmox.com Git - ceph.git/blob - ceph/src/osd/OSD.cc
import ceph 14.2.5
[ceph.git] / ceph / src / osd / OSD.cc
1 // -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
2 // vim: ts=8 sw=2 smarttab
3 /*
4 * Ceph - scalable distributed file system
5 *
6 * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
7 * Copyright (C) 2017 OVH
8 *
9 * This is free software; you can redistribute it and/or
10 * modify it under the terms of the GNU Lesser General Public
11 * License version 2.1, as published by the Free Software
12 * Foundation. See file COPYING.
13 *
14 */
15
16 #include "acconfig.h"
17
18 #include <cctype>
19 #include <fstream>
20 #include <iostream>
21 #include <iterator>
22
23 #include <unistd.h>
24 #include <sys/stat.h>
25 #include <signal.h>
26 #include <time.h>
27 #include <boost/scoped_ptr.hpp>
28 #include <boost/range/adaptor/reversed.hpp>
29
30 #ifdef HAVE_SYS_PARAM_H
31 #include <sys/param.h>
32 #endif
33
34 #ifdef HAVE_SYS_MOUNT_H
35 #include <sys/mount.h>
36 #endif
37
38 #include "osd/PG.h"
39
40 #include "include/types.h"
41 #include "include/compat.h"
42 #include "include/random.h"
43
44 #include "OSD.h"
45 #include "OSDMap.h"
46 #include "Watch.h"
47 #include "osdc/Objecter.h"
48
49 #include "common/errno.h"
50 #include "common/ceph_argparse.h"
51 #include "common/ceph_time.h"
52 #include "common/version.h"
53 #include "common/pick_address.h"
54 #include "common/blkdev.h"
55 #include "common/numa.h"
56
57 #include "os/ObjectStore.h"
58 #ifdef HAVE_LIBFUSE
59 #include "os/FuseStore.h"
60 #endif
61
62 #include "PrimaryLogPG.h"
63
64 #include "msg/Messenger.h"
65 #include "msg/Message.h"
66
67 #include "mon/MonClient.h"
68
69 #include "messages/MLog.h"
70
71 #include "messages/MGenericMessage.h"
72 #include "messages/MOSDPing.h"
73 #include "messages/MOSDFailure.h"
74 #include "messages/MOSDMarkMeDown.h"
75 #include "messages/MOSDFull.h"
76 #include "messages/MOSDOp.h"
77 #include "messages/MOSDOpReply.h"
78 #include "messages/MOSDBackoff.h"
79 #include "messages/MOSDBeacon.h"
80 #include "messages/MOSDRepOp.h"
81 #include "messages/MOSDRepOpReply.h"
82 #include "messages/MOSDBoot.h"
83 #include "messages/MOSDPGTemp.h"
84 #include "messages/MOSDPGReadyToMerge.h"
85
86 #include "messages/MOSDMap.h"
87 #include "messages/MMonGetOSDMap.h"
88 #include "messages/MOSDPGNotify.h"
89 #include "messages/MOSDPGQuery.h"
90 #include "messages/MOSDPGLog.h"
91 #include "messages/MOSDPGRemove.h"
92 #include "messages/MOSDPGInfo.h"
93 #include "messages/MOSDPGCreate.h"
94 #include "messages/MOSDPGCreate2.h"
95 #include "messages/MOSDPGTrim.h"
96 #include "messages/MOSDPGScan.h"
97 #include "messages/MBackfillReserve.h"
98 #include "messages/MRecoveryReserve.h"
99 #include "messages/MOSDForceRecovery.h"
100 #include "messages/MOSDECSubOpWrite.h"
101 #include "messages/MOSDECSubOpWriteReply.h"
102 #include "messages/MOSDECSubOpRead.h"
103 #include "messages/MOSDECSubOpReadReply.h"
104 #include "messages/MOSDPGCreated.h"
105 #include "messages/MOSDPGUpdateLogMissing.h"
106 #include "messages/MOSDPGUpdateLogMissingReply.h"
107
108 #include "messages/MOSDPeeringOp.h"
109
110 #include "messages/MOSDAlive.h"
111
112 #include "messages/MOSDScrub.h"
113 #include "messages/MOSDScrub2.h"
114 #include "messages/MOSDRepScrub.h"
115
116 #include "messages/MMonCommand.h"
117 #include "messages/MCommand.h"
118 #include "messages/MCommandReply.h"
119
120 #include "messages/MPGStats.h"
121 #include "messages/MPGStatsAck.h"
122
123 #include "messages/MWatchNotify.h"
124 #include "messages/MOSDPGPush.h"
125 #include "messages/MOSDPGPushReply.h"
126 #include "messages/MOSDPGPull.h"
127
128 #include "common/perf_counters.h"
129 #include "common/Timer.h"
130 #include "common/LogClient.h"
131 #include "common/AsyncReserver.h"
132 #include "common/HeartbeatMap.h"
133 #include "common/admin_socket.h"
134 #include "common/ceph_context.h"
135
136 #include "global/signal_handler.h"
137 #include "global/pidfile.h"
138
139 #include "include/color.h"
140 #include "perfglue/cpu_profiler.h"
141 #include "perfglue/heap_profiler.h"
142
143 #include "osd/OpRequest.h"
144
145 #include "auth/AuthAuthorizeHandler.h"
146 #include "auth/RotatingKeyRing.h"
147
148 #include "objclass/objclass.h"
149
150 #include "common/cmdparse.h"
151 #include "include/str_list.h"
152 #include "include/util.h"
153
154 #include "include/ceph_assert.h"
155 #include "common/config.h"
156 #include "common/EventTrace.h"
157
158 #include "json_spirit/json_spirit_reader.h"
159 #include "json_spirit/json_spirit_writer.h"
160
161 #ifdef WITH_LTTNG
162 #define TRACEPOINT_DEFINE
163 #define TRACEPOINT_PROBE_DYNAMIC_LINKAGE
164 #include "tracing/osd.h"
165 #undef TRACEPOINT_PROBE_DYNAMIC_LINKAGE
166 #undef TRACEPOINT_DEFINE
167 #else
168 #define tracepoint(...)
169 #endif
170
171 #define dout_context cct
172 #define dout_subsys ceph_subsys_osd
173 #undef dout_prefix
174 #define dout_prefix _prefix(_dout, whoami, get_osdmap_epoch())
175
176
// Build the dout prefix for every log line from this file:
// "osd.<id> <epoch> " (epoch is the OSD's current map epoch).
static ostream& _prefix(std::ostream* _dout, int whoami, epoch_t epoch) {
  return *_dout << "osd." << whoami << " " << epoch << " ";
}
180
181 //Initial features in new superblock.
182 //Features here are also automatically upgraded
183 CompatSet OSD::get_osd_initial_compat_set() {
184 CompatSet::FeatureSet ceph_osd_feature_compat;
185 CompatSet::FeatureSet ceph_osd_feature_ro_compat;
186 CompatSet::FeatureSet ceph_osd_feature_incompat;
187 ceph_osd_feature_incompat.insert(CEPH_OSD_FEATURE_INCOMPAT_BASE);
188 ceph_osd_feature_incompat.insert(CEPH_OSD_FEATURE_INCOMPAT_PGINFO);
189 ceph_osd_feature_incompat.insert(CEPH_OSD_FEATURE_INCOMPAT_OLOC);
190 ceph_osd_feature_incompat.insert(CEPH_OSD_FEATURE_INCOMPAT_LEC);
191 ceph_osd_feature_incompat.insert(CEPH_OSD_FEATURE_INCOMPAT_CATEGORIES);
192 ceph_osd_feature_incompat.insert(CEPH_OSD_FEATURE_INCOMPAT_HOBJECTPOOL);
193 ceph_osd_feature_incompat.insert(CEPH_OSD_FEATURE_INCOMPAT_BIGINFO);
194 ceph_osd_feature_incompat.insert(CEPH_OSD_FEATURE_INCOMPAT_LEVELDBINFO);
195 ceph_osd_feature_incompat.insert(CEPH_OSD_FEATURE_INCOMPAT_LEVELDBLOG);
196 ceph_osd_feature_incompat.insert(CEPH_OSD_FEATURE_INCOMPAT_SNAPMAPPER);
197 ceph_osd_feature_incompat.insert(CEPH_OSD_FEATURE_INCOMPAT_HINTS);
198 ceph_osd_feature_incompat.insert(CEPH_OSD_FEATURE_INCOMPAT_PGMETA);
199 ceph_osd_feature_incompat.insert(CEPH_OSD_FEATURE_INCOMPAT_MISSING);
200 ceph_osd_feature_incompat.insert(CEPH_OSD_FEATURE_INCOMPAT_FASTINFO);
201 ceph_osd_feature_incompat.insert(CEPH_OSD_FEATURE_INCOMPAT_RECOVERY_DELETES);
202 return CompatSet(ceph_osd_feature_compat, ceph_osd_feature_ro_compat,
203 ceph_osd_feature_incompat);
204 }
205
206 //Features are added here that this OSD supports.
207 CompatSet OSD::get_osd_compat_set() {
208 CompatSet compat = get_osd_initial_compat_set();
209 //Any features here can be set in code, but not in initial superblock
210 compat.incompat.insert(CEPH_OSD_FEATURE_INCOMPAT_SHARDS);
211 return compat;
212 }
213
// OSDService wraps the state the OSD shares with its PGs and worker
// threads.  Everything here is borrowed from (or sized by) the owning OSD
// and its configuration; member-initializer order must match the
// declaration order in OSD.h.
OSDService::OSDService(OSD *osd) :
  osd(osd),
  cct(osd->cct),
  whoami(osd->whoami), store(osd->store),
  log_client(osd->log_client), clog(osd->clog),
  pg_recovery_stats(osd->pg_recovery_stats),
  cluster_messenger(osd->cluster_messenger),
  client_messenger(osd->client_messenger),
  logger(osd->logger),
  recoverystate_perf(osd->recoverystate_perf),
  monc(osd->monc),
  class_handler(osd->class_handler),
  // tracked config values; updated automatically on config change
  osd_max_object_size(cct->_conf, "osd_max_object_size"),
  osd_skip_data_digest(cct->_conf, "osd_skip_data_digest"),
  publish_lock{ceph::make_mutex("OSDService::publish_lock")},
  pre_publish_lock{ceph::make_mutex("OSDService::pre_publish_lock")},
  max_oldest_map(0),
  peer_map_epoch_lock("OSDService::peer_map_epoch_lock"),
  sched_scrub_lock("OSDService::sched_scrub_lock"),
  scrubs_local(0),
  scrubs_remote(0),
  // cache-tiering agent state
  agent_lock("OSDService::agent_lock"),
  agent_valid_iterator(false),
  agent_ops(0),
  flush_mode_high_count(0),
  agent_active(true),
  agent_thread(this),
  agent_stop_flag(false),
  agent_timer_lock("OSDService::agent_timer_lock"),
  agent_timer(osd->client_messenger->cct, agent_timer_lock),
  last_recalibrate(ceph_clock_now()),
  promote_max_objects(0),
  promote_max_bytes(0),
  objecter(new Objecter(osd->client_messenger->cct, osd->objecter_messenger, osd->monc, NULL, 0, 0)),
  m_objecter_finishers(cct->_conf->osd_objecter_finishers),
  watch_lock("OSDService::watch_lock"),
  watch_timer(osd->client_messenger->cct, watch_lock),
  next_notif_id(0),
  recovery_request_lock("OSDService::recovery_request_lock"),
  recovery_request_timer(cct, recovery_request_lock, false),
  sleep_lock("OSDService::sleep_lock"),
  sleep_timer(cct, sleep_lock, false),
  // backfill/recovery/snap-trim reservations share one finisher
  reserver_finisher(cct),
  local_reserver(cct, &reserver_finisher, cct->_conf->osd_max_backfills,
		 cct->_conf->osd_min_recovery_priority),
  remote_reserver(cct, &reserver_finisher, cct->_conf->osd_max_backfills,
		  cct->_conf->osd_min_recovery_priority),
  pg_temp_lock("OSDService::pg_temp_lock"),
  snap_reserver(cct, &reserver_finisher,
		cct->_conf->osd_max_trimming_pgs),
  recovery_lock("OSDService::recovery_lock"),
  recovery_ops_active(0),
  recovery_ops_reserved(0),
  recovery_paused(false),
  // OSDMap caches (full maps, full-map buffers, incremental buffers)
  map_cache_lock("OSDService::map_cache_lock"),
  map_cache(cct, cct->_conf->osd_map_cache_size),
  map_bl_cache(cct->_conf->osd_map_cache_size),
  map_bl_inc_cache(cct->_conf->osd_map_cache_size),
  stat_lock("OSDService::stat_lock"),
  full_status_lock("OSDService::full_status_lock"),
  cur_state(NONE),
  cur_ratio(0), physical_ratio(0),
  epoch_lock("OSDService::epoch_lock"),
  boot_epoch(0), up_epoch(0), bind_epoch(0),
  is_stopping_lock("OSDService::is_stopping_lock")
#ifdef PG_DEBUG_REFS
  , pgid_lock("OSDService::pgid_lock")
#endif
{
  objecter->init();

  // one finisher thread per configured objecter finisher shard
  for (int i = 0; i < m_objecter_finishers; i++) {
    ostringstream str;
    str << "objecter-finisher-" << i;
    Finisher *fin = new Finisher(osd->client_messenger->cct, str.str(), "finisher");
    objecter_finishers.push_back(fin);
  }
}
292
293 OSDService::~OSDService()
294 {
295 delete objecter;
296
297 for (auto f : objecter_finishers) {
298 delete f;
299 f = NULL;
300 }
301 }
302
303
304
305 #ifdef PG_DEBUG_REFS
306 void OSDService::add_pgid(spg_t pgid, PG *pg){
307 std::lock_guard l(pgid_lock);
308 if (!pgid_tracker.count(pgid)) {
309 live_pgs[pgid] = pg;
310 }
311 pgid_tracker[pgid]++;
312 }
313 void OSDService::remove_pgid(spg_t pgid, PG *pg)
314 {
315 std::lock_guard l(pgid_lock);
316 ceph_assert(pgid_tracker.count(pgid));
317 ceph_assert(pgid_tracker[pgid] > 0);
318 pgid_tracker[pgid]--;
319 if (pgid_tracker[pgid] == 0) {
320 pgid_tracker.erase(pgid);
321 live_pgs.erase(pgid);
322 }
323 }
324 void OSDService::dump_live_pgids()
325 {
326 std::lock_guard l(pgid_lock);
327 derr << "live pgids:" << dendl;
328 for (map<spg_t, int>::const_iterator i = pgid_tracker.cbegin();
329 i != pgid_tracker.cend();
330 ++i) {
331 derr << "\t" << *i << dendl;
332 live_pgs[i->first]->dump_live_ids();
333 }
334 }
335 #endif
336
337
338
// Replay the pool's recorded pg_num history between old_map and new_map
// and collect, for the lineage of 'pgid', every split child (into
// *split_children) and, when merge_pgs is non-null, every PG participating
// in a merge, each paired with the epoch the pg_num change took effect.
void OSDService::identify_splits_and_merges(
  OSDMapRef old_map,
  OSDMapRef new_map,
  spg_t pgid,
  set<pair<spg_t,epoch_t>> *split_children,
  set<pair<spg_t,epoch_t>> *merge_pgs)
{
  // pool unknown as of old_map: nothing to trace
  if (!old_map->have_pg_pool(pgid.pool())) {
    return;
  }
  int old_pgnum = old_map->get_pg_num(pgid.pool());
  // per-epoch pg_num values recorded for this pool
  auto p = osd->pg_num_history.pg_nums.find(pgid.pool());
  if (p == osd->pg_num_history.pg_nums.end()) {
    return;
  }
  dout(20) << __func__ << " " << pgid << " e" << old_map->get_epoch()
	   << " to e" << new_map->get_epoch()
	   << " pg_nums " << p->second << dendl;
  // BFS over related PGs; 'did' prevents re-queueing a pg already scanned
  deque<spg_t> queue;
  queue.push_back(pgid);
  set<spg_t> did;
  while (!queue.empty()) {
    auto cur = queue.front();
    queue.pop_front();
    did.insert(cur);
    unsigned pgnum = old_pgnum;
    // walk pg_num changes in epoch order within (old_map, new_map]
    for (auto q = p->second.lower_bound(old_map->get_epoch());
	 q != p->second.end() &&
	   q->first <= new_map->get_epoch();
	 ++q) {
      if (pgnum < q->second) {
	// pg_num increased: split?
	if (cur.ps() < pgnum) {
	  // cur existed before this change; does it fan out children?
	  set<spg_t> children;
	  if (cur.is_split(pgnum, q->second, &children)) {
	    dout(20) << __func__ << " " << cur << " e" << q->first
		     << " pg_num " << pgnum << " -> " << q->second
		     << " children " << children << dendl;
	    for (auto i : children) {
	      split_children->insert(make_pair(i, q->first));
	      if (!did.count(i))
		queue.push_back(i);
	    }
	  }
	} else if (cur.ps() < q->second) {
	  dout(20) << __func__ << " " << cur << " e" << q->first
		   << " pg_num " << pgnum << " -> " << q->second
		   << " is a child" << dendl;
	  // normally we'd capture this from the parent, but it's
	  // possible the parent doesn't exist yet (it will be
	  // fabricated to allow an intervening merge). note this PG
	  // as a split child here to be sure we catch it.
	  split_children->insert(make_pair(cur, q->first));
	} else {
	  dout(20) << __func__ << " " << cur << " e" << q->first
		   << " pg_num " << pgnum << " -> " << q->second
		   << " is post-split, skipping" << dendl;
	}
      } else if (merge_pgs) {
	// pg_num decreased: merge? (only tracked when caller asked)
	if (cur.ps() >= q->second) {
	  if (cur.ps() < pgnum) {
	    // cur disappears in this change: it is a merge source
	    spg_t parent;
	    if (cur.is_merge_source(pgnum, q->second, &parent)) {
	      set<spg_t> children;
	      parent.is_split(q->second, pgnum, &children);
	      dout(20) << __func__ << " " << cur << " e" << q->first
		       << " pg_num " << pgnum << " -> " << q->second
		       << " is merge source, target " << parent
		       << ", source(s) " << children << dendl;
	      merge_pgs->insert(make_pair(parent, q->first));
	      if (!did.count(parent)) {
		// queue (and re-scan) parent in case it might not exist yet
		// and there are some future splits pending on it
		queue.push_back(parent);
	      }
	      for (auto c : children) {
		merge_pgs->insert(make_pair(c, q->first));
		if (!did.count(c))
		  queue.push_back(c);
	      }
	    }
	  } else {
	    dout(20) << __func__ << " " << cur << " e" << q->first
		     << " pg_num " << pgnum << " -> " << q->second
		     << " is beyond old pgnum, skipping" << dendl;
	  }
	} else {
	  // cur survives the shrink: it may be the merge target
	  set<spg_t> children;
	  if (cur.is_split(q->second, pgnum, &children)) {
	    dout(20) << __func__ << " " << cur << " e" << q->first
		     << " pg_num " << pgnum << " -> " << q->second
		     << " is merge target, source " << children << dendl;
	    for (auto c : children) {
	      merge_pgs->insert(make_pair(c, q->first));
	      if (!did.count(c))
		queue.push_back(c);
	    }
	    merge_pgs->insert(make_pair(cur, q->first));
	  }
	}
      }
      pgnum = q->second;
    }
  }
}
445
// Forward to the owning OSD: ask it to recompute its heartbeat peer set.
void OSDService::need_heartbeat_peer_update()
{
  osd->need_heartbeat_peer_update();
}
450
// First phase of shutdown: stop the timers that can schedule new work
// (tiering agent, recovery-sleep, recovery-request).  Each SafeTimer is
// shut down under its own lock.
void OSDService::start_shutdown()
{
  {
    std::lock_guard l(agent_timer_lock);
    agent_timer.shutdown();
  }

  {
    std::lock_guard l(sleep_lock);
    sleep_timer.shutdown();
  }

  {
    std::lock_guard l(recovery_request_lock);
    recovery_request_timer.shutdown();
  }
}
468
// Drain all queued reservation callbacks, then stop the finisher thread.
// Order matters: wait_for_empty() before stop().
void OSDService::shutdown_reserver()
{
  reserver_finisher.wait_for_empty();
  reserver_finisher.stop();
}
474
// Final phase of shutdown: stop the watch timer and the objecter (with its
// finishers), then drop our OSDMap references so the maps can be freed.
void OSDService::shutdown()
{
  {
    std::lock_guard l(watch_lock);
    watch_timer.shutdown();
  }

  objecter->shutdown();
  for (auto f : objecter_finishers) {
    // drain queued completions before stopping each finisher thread
    f->wait_for_empty();
    f->stop();
  }

  // release our map references (publish an empty ref, clear next map)
  publish_map(OSDMapRef());
  next_osdmap = OSDMapRef();
}
491
// Start the service's helper threads and timers.  Called once during OSD
// startup, before the OSD goes active.
void OSDService::init()
{
  reserver_finisher.start();
  for (auto f : objecter_finishers) {
    f->start();
  }
  objecter->set_client_incarnation(0);

  // deprioritize objecter in daemonperf output
  objecter->get_logger()->set_prio_adjust(-3);

  watch_timer.init();
  agent_timer.init();

  agent_thread.create("osd_srv_agent");

  // optionally hold off recovery for a configured grace period after boot
  if (cct->_conf->osd_recovery_delay_start)
    defer_recovery(cct->_conf->osd_recovery_delay_start);
}
511
// Last init step: start the objecter with the current OSDMap.
void OSDService::final_init()
{
  objecter->start(osdmap.get());
}
516
517 void OSDService::activate_map()
518 {
519 // wake/unwake the tiering agent
520 agent_lock.Lock();
521 agent_active =
522 !osdmap->test_flag(CEPH_OSDMAP_NOTIERAGENT) &&
523 osd->is_active();
524 agent_cond.Signal();
525 agent_lock.Unlock();
526 }
527
// Ask the OSD to subscribe to OSDMaps starting at epoch e (non-forced).
void OSDService::request_osdmap_update(epoch_t e)
{
  osd->osdmap_subscribe(e, false);
}
532
533 class AgentTimeoutCB : public Context {
534 PGRef pg;
535 public:
536 explicit AgentTimeoutCB(PGRef _pg) : pg(_pg) {}
537 void finish(int) override {
538 pg->agent_choose_mode_restart();
539 }
540 };
541
// Tiering agent thread body.  Repeatedly picks the highest-priority tier
// level with queued PGs and lets one PG do flush/evict work, throttled by
// osd_agent_max_ops (or osd_agent_max_low_ops when no pool is in
// high-flush mode).  agent_lock is held except around the actual PG work;
// the explicit Lock()/Unlock() pairs mark those hand-off points.
void OSDService::agent_entry()
{
  dout(10) << __func__ << " start" << dendl;
  agent_lock.Lock();

  while (!agent_stop_flag) {
    if (agent_queue.empty()) {
      dout(20) << __func__ << " empty queue" << dendl;
      agent_cond.Wait(agent_lock);
      continue;
    }
    // highest key in the map = highest-priority tier level
    uint64_t level = agent_queue.rbegin()->first;
    set<PGRef>& top = agent_queue.rbegin()->second;
    dout(10) << __func__
	     << " tiers " << agent_queue.size()
	     << ", top is " << level
	     << " with pgs " << top.size()
	     << ", ops " << agent_ops << "/"
	     << cct->_conf->osd_agent_max_ops
	     << (agent_active ? " active" : " NOT ACTIVE")
	     << dendl;
    dout(20) << __func__ << " oids " << agent_oids << dendl;
    // remaining op budget; the flush quota drops to the "low" limit when
    // no pool is currently in high-flush mode
    int max = cct->_conf->osd_agent_max_ops - agent_ops;
    int agent_flush_quota = max;
    if (!flush_mode_high_count)
      agent_flush_quota = cct->_conf->osd_agent_max_low_ops - agent_ops;
    if (agent_flush_quota <= 0 || top.empty() || !agent_active) {
      agent_cond.Wait(agent_lock);
      continue;
    }

    // round-robin across the PGs of the top level; the iterator is
    // invalidated (agent_valid_iterator cleared) when the set changes
    if (!agent_valid_iterator || agent_queue_pos == top.end()) {
      agent_queue_pos = top.begin();
      agent_valid_iterator = true;
    }
    PGRef pg = *agent_queue_pos;
    dout(10) << "high_count " << flush_mode_high_count
	     << " agent_ops " << agent_ops
	     << " flush_quota " << agent_flush_quota << dendl;
    // drop the lock while the PG does (possibly slow) agent work
    agent_lock.Unlock();
    if (!pg->agent_work(max, agent_flush_quota)) {
      dout(10) << __func__ << " " << pg->pg_id
	<< " no agent_work, delay for " << cct->_conf->osd_agent_delay_time
	<< " seconds" << dendl;

      osd->logger->inc(l_osd_tier_delay);
      // Queue a timer to call agent_choose_mode for this pg in 5 seconds
      agent_timer_lock.Lock();
      Context *cb = new AgentTimeoutCB(pg);
      agent_timer.add_event_after(cct->_conf->osd_agent_delay_time, cb);
      agent_timer_lock.Unlock();
    }
    agent_lock.Lock();
  }
  agent_lock.Unlock();
  dout(10) << __func__ << " finish" << dendl;
}
599
// Stop the tiering agent thread.  Must be called after all agent ops are
// cancelled and all PGs are dequeued; asserts both, then signals the
// thread to exit and joins it (join happens outside the lock).
void OSDService::agent_stop()
{
  {
    std::lock_guard l(agent_lock);

    // By this time all ops should be cancelled
    ceph_assert(agent_ops == 0);
    // By this time all PGs are shutdown and dequeued
    if (!agent_queue.empty()) {
      set<PGRef>& top = agent_queue.rbegin()->second;
      derr << "agent queue not empty, for example " << (*top.begin())->get_pgid() << dendl;
      ceph_abort_msg("agent queue not empty");
    }

    agent_stop_flag = true;
    agent_cond.Signal();
  }
  agent_thread.join();
}
619
620 // -------------------------------------
621
// Periodically recompute promote_probability_millis (the per-mille chance
// of promoting an object into the cache tier) so the observed promote rate
// converges on the configured object/sec and bytes/sec targets, and set
// hard per-interval caps to prevent promotion stampedes.
void OSDService::promote_throttle_recalibrate()
{
  utime_t now = ceph_clock_now();
  double dur = now - last_recalibrate;
  last_recalibrate = now;
  unsigned prob = promote_probability_millis;

  uint64_t target_obj_sec = cct->_conf->osd_tier_promote_max_objects_sec;
  uint64_t target_bytes_sec = cct->_conf->osd_tier_promote_max_bytes_sec;

  unsigned min_prob = 1;

  // sample (and decay) counters since the last recalibration
  uint64_t attempts, obj, bytes;
  promote_counter.sample_and_attenuate(&attempts, &obj, &bytes);
  dout(10) << __func__ << " " << attempts << " attempts, promoted "
	   << obj << " objects and " << byte_u_t(bytes) << "; target "
	   << target_obj_sec << " obj/sec or "
	   << byte_u_t(target_bytes_sec) << "/sec"
	   << dendl;

  // calculate what the probability *should* be, given the targets
  unsigned new_prob;
  if (attempts && dur > 0) {
    uint64_t avg_size = 1;
    if (obj)
      avg_size = std::max<uint64_t>(bytes / obj, 1);
    // per-mille probabilities that would hit the object and byte targets
    unsigned po = (double)target_obj_sec * dur * 1000.0 / (double)attempts;
    unsigned pb = (double)target_bytes_sec / (double)avg_size * dur * 1000.0
      / (double)attempts;
    dout(20) << __func__ << " po " << po << " pb " << pb << " avg_size "
	     << avg_size << dendl;
    if (target_obj_sec && target_bytes_sec)
      new_prob = std::min(po, pb);
    else if (target_obj_sec)
      new_prob = po;
    else if (target_bytes_sec)
      new_prob = pb;
    else
      new_prob = 1000;   // no target configured: always promote
  } else {
    new_prob = 1000;
  }
  dout(20) << __func__ << " new_prob " << new_prob << dendl;

  // correct for persistent skew between target rate and actual rate, adjust
  double ratio = 1.0;
  unsigned actual = 0;
  if (attempts && obj) {
    actual = obj * 1000 / attempts;
    ratio = (double)actual / (double)prob;
    new_prob = (double)new_prob / ratio;
  }
  new_prob = std::max(new_prob, min_prob);
  new_prob = std::min(new_prob, 1000u);

  // adjust: move halfway toward the computed value, clamped to [min_prob, 1000]
  prob = (prob + new_prob) / 2;
  prob = std::max(prob, min_prob);
  prob = std::min(prob, 1000u);
  dout(10) << __func__ << " actual " << actual
	   << ", actual/prob ratio " << ratio
	   << ", adjusted new_prob " << new_prob
	   << ", prob " << promote_probability_millis << " -> " << prob
	   << dendl;
  promote_probability_millis = prob;

  // set hard limits for this interval to mitigate stampedes
  promote_max_objects = target_obj_sec * osd->OSD_TICK_INTERVAL * 2;
  promote_max_bytes = target_bytes_sec * osd->OSD_TICK_INTERVAL * 2;
}
692
693 // -------------------------------------
694
695 float OSDService::get_failsafe_full_ratio()
696 {
697 float full_ratio = cct->_conf->osd_failsafe_full_ratio;
698 if (full_ratio > 1.0) full_ratio /= 100.0;
699 return full_ratio;
700 }
701
// Classify the OSD's fullness (NONE..FAILSAFE) from the given usage ratio
// and physical ratio, using thresholds from the current OSDMap.  On an
// injected-full test the injected state is returned and 'inject' is set.
OSDService::s_names OSDService::recalc_full_state(float ratio, float pratio, string &inject)
{
  // The OSDMap ratios take precendence. So if the failsafe is .95 and
  // the admin sets the cluster full to .96, the failsafe moves up to .96
  // too. (Not that having failsafe == full is ideal, but it's better than
  // dropping writes before the clusters appears full.)
  OSDMapRef osdmap = get_osdmap();
  if (!osdmap || osdmap->get_epoch() == 0) {
    return NONE;
  }
  // enforce nearfull <= backfillfull <= full <= failsafe ordering
  float nearfull_ratio = osdmap->get_nearfull_ratio();
  float backfillfull_ratio = std::max(osdmap->get_backfillfull_ratio(), nearfull_ratio);
  float full_ratio = std::max(osdmap->get_full_ratio(), backfillfull_ratio);
  float failsafe_ratio = std::max(get_failsafe_full_ratio(), full_ratio);

  if (osdmap->require_osd_release < CEPH_RELEASE_LUMINOUS) {
    // use the failsafe for nearfull and full; the mon isn't using the
    // flags anyway because we're mid-upgrade.
    full_ratio = failsafe_ratio;
    backfillfull_ratio = failsafe_ratio;
    nearfull_ratio = failsafe_ratio;
  } else if (full_ratio <= 0 ||
	     backfillfull_ratio <= 0 ||
	     nearfull_ratio <= 0) {
    derr << __func__ << " full_ratio, backfillfull_ratio or nearfull_ratio is <= 0" << dendl;
    // use failsafe flag. ick. the monitor did something wrong or the user
    // did something stupid.
    full_ratio = failsafe_ratio;
    backfillfull_ratio = failsafe_ratio;
    nearfull_ratio = failsafe_ratio;
  }

  // injected state (testing) wins; otherwise compare from most to least severe.
  // note: FAILSAFE uses the physical ratio, the rest use the adjusted ratio.
  if (injectfull_state > NONE && injectfull) {
    inject = "(Injected)";
    return injectfull_state;
  } else if (pratio > failsafe_ratio) {
    return FAILSAFE;
  } else if (ratio > full_ratio) {
    return FULL;
  } else if (ratio > backfillfull_ratio) {
    return BACKFILLFULL;
  } else if (ratio > nearfull_ratio) {
    return NEARFULL;
  }
  return NONE;
}
748
// Update the cached fullness state from fresh usage ratios, logging and
// raising a cluster-log error when the FAILSAFE state is entered or left.
void OSDService::check_full_status(float ratio, float pratio)
{
  std::lock_guard l(full_status_lock);

  cur_ratio = ratio;
  physical_ratio = pratio;

  string inject;
  s_names new_state;
  new_state = recalc_full_state(ratio, pratio, inject);

  dout(20) << __func__ << " cur ratio " << ratio
	   << ", physical ratio " << pratio
	   << ", new state " << get_full_state_name(new_state)
	   << " " << inject
	   << dendl;

  // warn
  if (cur_state != new_state) {
    dout(10) << __func__ << " " << get_full_state_name(cur_state)
	     << " -> " << get_full_state_name(new_state) << dendl;
    if (new_state == FAILSAFE) {
      clog->error() << "full status failsafe engaged, dropping updates, now "
		    << (int)roundf(ratio * 100) << "% full";
    } else if (cur_state == FAILSAFE) {
      clog->error() << "full status failsafe disengaged, no longer dropping "
		    << "updates, now " << (int)roundf(ratio * 100) << "% full";
    }
    cur_state = new_state;
  }
}
780
781 bool OSDService::need_fullness_update()
782 {
783 OSDMapRef osdmap = get_osdmap();
784 s_names cur = NONE;
785 if (osdmap->exists(whoami)) {
786 if (osdmap->get_state(whoami) & CEPH_OSD_FULL) {
787 cur = FULL;
788 } else if (osdmap->get_state(whoami) & CEPH_OSD_BACKFILLFULL) {
789 cur = BACKFILLFULL;
790 } else if (osdmap->get_state(whoami) & CEPH_OSD_NEARFULL) {
791 cur = NEARFULL;
792 }
793 }
794 s_names want = NONE;
795 if (is_full())
796 want = FULL;
797 else if (is_backfillfull())
798 want = BACKFILLFULL;
799 else if (is_nearfull())
800 want = NEARFULL;
801 return want != cur;
802 }
803
// Testing hook: report full if an injected-full state at least as severe
// as 'type' is active.  Caller must hold full_status_lock.
// NOTE(review): this const method decrements injectfull (a countdown of
// how many times to report full; -1 means forever) — presumably the member
// is declared mutable in the header; confirm there.
bool OSDService::_check_inject_full(DoutPrefixProvider *dpp, s_names type) const
{
  if (injectfull && injectfull_state >= type) {
    // injectfull is either a count of the number of times to return failsafe full
    // or if -1 then always return full
    if (injectfull > 0)
      --injectfull;
    ldpp_dout(dpp, 10) << __func__ << " Injected " << get_full_state_name(type) << " OSD ("
		       << (injectfull < 0 ? "set" : std::to_string(injectfull)) << ")"
		       << dendl;
    return true;
  }
  return false;
}
818
819 bool OSDService::_check_full(DoutPrefixProvider *dpp, s_names type) const
820 {
821 std::lock_guard l(full_status_lock);
822
823 if (_check_inject_full(dpp, type))
824 return true;
825
826 if (cur_state >= type)
827 ldpp_dout(dpp, 10) << __func__ << " current usage is " << cur_ratio
828 << " physical " << physical_ratio << dendl;
829
830 return cur_state >= type;
831 }
832
// Like _check_full, but evaluates a hypothetical: would we be at least
// 'type' full if 'adjust_used' extra bytes were consumed on top of the
// given (already adjusted) stats?  Used e.g. before accepting a backfill.
bool OSDService::_tentative_full(DoutPrefixProvider *dpp, s_names type, uint64_t adjust_used, osd_stat_t adjusted_stat)
{
  ldpp_dout(dpp, 20) << __func__ << " type " << get_full_state_name(type) << " adjust_used " << (adjust_used >> 10) << "KiB" << dendl;
  {
    // injected-full test state still wins; only hold the lock for that check
    std::lock_guard l(full_status_lock);
    if (_check_inject_full(dpp, type)) {
      return true;
    }
  }

  // recompute ratios with the extra usage applied
  float pratio;
  float ratio = compute_adjusted_ratio(adjusted_stat, &pratio, adjust_used);

  string notused;
  s_names tentative_state = recalc_full_state(ratio, pratio, notused);

  if (tentative_state >= type)
    ldpp_dout(dpp, 10) << __func__ << " tentative usage is " << ratio << dendl;

  return tentative_state >= type;
}
854
// True when usage is at or beyond the failsafe threshold (updates dropped).
bool OSDService::check_failsafe_full(DoutPrefixProvider *dpp) const
{
  return _check_full(dpp, FAILSAFE);
}
859
// True when usage is at or beyond the full threshold.
bool OSDService::check_full(DoutPrefixProvider *dpp) const
{
  return _check_full(dpp, FULL);
}
864
// Would accepting 'adjust_used' more bytes push us past backfillfull?
bool OSDService::tentative_backfill_full(DoutPrefixProvider *dpp, uint64_t adjust_used, osd_stat_t stats)
{
  return _tentative_full(dpp, BACKFILLFULL, adjust_used, stats);
}
869
// True when usage is at or beyond the backfillfull threshold.
bool OSDService::check_backfill_full(DoutPrefixProvider *dpp) const
{
  return _check_full(dpp, BACKFILLFULL);
}
874
// True when usage is at or beyond the nearfull threshold.
bool OSDService::check_nearfull(DoutPrefixProvider *dpp) const
{
  return _check_full(dpp, NEARFULL);
}
879
// Cached-state accessor: exactly FAILSAFE (no injection check, no logging).
bool OSDService::is_failsafe_full() const
{
  std::lock_guard l(full_status_lock);
  return cur_state == FAILSAFE;
}
885
// Cached-state accessor: FULL or worse.
bool OSDService::is_full() const
{
  std::lock_guard l(full_status_lock);
  return cur_state >= FULL;
}
891
// Cached-state accessor: BACKFILLFULL or worse.
bool OSDService::is_backfillfull() const
{
  std::lock_guard l(full_status_lock);
  return cur_state >= BACKFILLFULL;
}
897
// Cached-state accessor: NEARFULL or worse.
bool OSDService::is_nearfull() const
{
  std::lock_guard l(full_status_lock);
  return cur_state >= NEARFULL;
}
903
// Testing hook: inject fullness state 'type' for 'count' checks
// (count == -1 means until cleared).
void OSDService::set_injectfull(s_names type, int64_t count)
{
  std::lock_guard l(full_status_lock);
  injectfull_state = type;
  injectfull = count;
}
910
// Record fresh store statfs results (and store alerts) into osd_stat and
// the perf counters.  When fake_statfs_for_testing is set, total/available
// are synthesized from per-PG byte counts so co-located test OSDs report
// plausible, independent usage.
void OSDService::set_statfs(const struct store_statfs_t &stbuf,
			    osd_alert_list_t& alerts)
{
  uint64_t bytes = stbuf.total;
  uint64_t avail = stbuf.available;
  uint64_t used = stbuf.get_used_raw();

  // For testing fake statfs values so it doesn't matter if all
  // OSDs are using the same partition.
  if (cct->_conf->fake_statfs_for_testing) {
    uint64_t total_num_bytes = 0;
    vector<PGRef> pgs;
    osd->_get_pgs(&pgs);
    for (auto p : pgs) {
      total_num_bytes += p->get_stats_num_bytes();
    }
    bytes = cct->_conf->fake_statfs_for_testing;
    if (total_num_bytes < bytes)
      avail = bytes - total_num_bytes;
    else
      avail = 0;
    dout(0) << __func__ << " fake total " << cct->_conf->fake_statfs_for_testing
	    << " adjust available " << avail
	    << dendl;
    used = bytes - avail;
  }

  osd->logger->set(l_osd_stat_bytes, bytes);
  osd->logger->set(l_osd_stat_bytes_used, used);
  osd->logger->set(l_osd_stat_bytes_avail, avail);

  std::lock_guard l(stat_lock);
  osd_stat.statfs = stbuf;
  osd_stat.os_alerts.clear();
  // alerts are keyed by our own OSD id; swap to take ownership cheaply
  osd_stat.os_alerts[whoami].swap(alerts);
  if (cct->_conf->fake_statfs_for_testing) {
    osd_stat.statfs.total = bytes;
    osd_stat.statfs.available = avail;
    // For testing don't want used to go negative, so clear reserved
    osd_stat.statfs.internally_reserved = 0;
  }
}
953
954 osd_stat_t OSDService::set_osd_stat(vector<int>& hb_peers,
955 int num_pgs)
956 {
957 utime_t now = ceph_clock_now();
958 auto stale_time = g_conf().get_val<int64_t>("osd_mon_heartbeat_stat_stale");
959 std::lock_guard l(stat_lock);
960 osd_stat.hb_peers.swap(hb_peers);
961 osd->op_tracker.get_age_ms_histogram(&osd_stat.op_queue_age_hist);
962 osd_stat.num_pgs = num_pgs;
963 // Clean entries that aren't updated
964 // This is called often enough that we can just remove 1 at a time
965 for (auto i: osd_stat.hb_pingtime) {
966 if (i.second.last_update == 0)
967 continue;
968 if (stale_time && now.sec() - i.second.last_update > stale_time) {
969 dout(20) << __func__ << " time out heartbeat for osd " << i.first
970 << " last_update " << i.second.last_update << dendl;
971 osd_stat.hb_pingtime.erase(i.first);
972 break;
973 }
974 }
975 return osd_stat;
976 }
977
978 void OSDService::inc_osd_stat_repaired()
979 {
980 std::lock_guard l(stat_lock);
981 osd_stat.num_shards_repaired++;
982 return;
983 }
984
// Compute the usage ratio after applying two adjustments to a copy of the
// stats: an explicit extra 'adjust_used' bytes, and each PG's pending
// backfill data.  *pratio receives the unadjusted (physical) ratio.
float OSDService::compute_adjusted_ratio(osd_stat_t new_stat, float *pratio,
					 uint64_t adjust_used)
{
  // physical ratio, before any adjustment
  *pratio =
    ((float)new_stat.statfs.get_used()) / ((float)new_stat.statfs.total);

  if (adjust_used) {
    dout(20) << __func__ << " Before kb_used() " << new_stat.statfs.kb_used() << dendl;
    // consume adjust_used from 'available', clamping at zero
    if (new_stat.statfs.available > adjust_used)
      new_stat.statfs.available -= adjust_used;
    else
      new_stat.statfs.available = 0;
    dout(20) << __func__ << " After kb_used() " << new_stat.statfs.kb_used() << dendl;
  }

  // Check all pgs and adjust kb_used to include all pending backfill data
  int backfill_adjusted = 0;
  vector<PGRef> pgs;
  osd->_get_pgs(&pgs);
  for (auto p : pgs) {
    backfill_adjusted += p->pg_stat_adjust(&new_stat);
  }
  if (backfill_adjusted) {
    dout(20) << __func__ << " backfill adjusted " << new_stat << dendl;
  }
  return ((float)new_stat.statfs.get_used()) / ((float)new_stat.statfs.total);
}
1012
1013 bool OSDService::check_osdmap_full(const set<pg_shard_t> &missing_on)
1014 {
1015 OSDMapRef osdmap = get_osdmap();
1016 for (auto shard : missing_on) {
1017 if (osdmap->get_state(shard.osd) & CEPH_OSD_FULL)
1018 return true;
1019 }
1020 return false;
1021 }
1022
// Send a cluster message to a peer OSD, but only if the peer is still up
// and has been continuously up since from_epoch; otherwise the message is
// dropped.  Consumes one reference to m on every path (send or put).
void OSDService::send_message_osd_cluster(int peer, Message *m, epoch_t from_epoch)
{
  // reserve the "next" map so the liveness check and the address lookup
  // are made against a single consistent epoch
  OSDMapRef next_map = get_nextmap_reserved();
  // service map is always newer/newest
  ceph_assert(from_epoch <= next_map->get_epoch());

  if (next_map->is_down(peer) ||
      next_map->get_info(peer).up_from > from_epoch) {
    // peer went down (or restarted) since from_epoch: drop the message
    m->put();
    release_map(next_map);
    return;
  }
  ConnectionRef peer_con = osd->cluster_messenger->connect_to_osd(
    next_map->get_cluster_addrs(peer));
  // make sure the peer is not behind on maps before it sees this message
  share_map_peer(peer, peer_con.get(), next_map);
  peer_con->send_message(m);
  release_map(next_map);
}
1041
1042 ConnectionRef OSDService::get_con_osd_cluster(int peer, epoch_t from_epoch)
1043 {
1044 OSDMapRef next_map = get_nextmap_reserved();
1045 // service map is always newer/newest
1046 ceph_assert(from_epoch <= next_map->get_epoch());
1047
1048 if (next_map->is_down(peer) ||
1049 next_map->get_info(peer).up_from > from_epoch) {
1050 release_map(next_map);
1051 return NULL;
1052 }
1053 ConnectionRef con = osd->cluster_messenger->connect_to_osd(
1054 next_map->get_cluster_addrs(peer));
1055 release_map(next_map);
1056 return con;
1057 }
1058
// Return (back, front) heartbeat connections to a peer OSD, or a pair of
// null refs if the peer is down or restarted after from_epoch.
pair<ConnectionRef,ConnectionRef> OSDService::get_con_osd_hb(int peer, epoch_t from_epoch)
{
  OSDMapRef next_map = get_nextmap_reserved();
  // service map is always newer/newest
  ceph_assert(from_epoch <= next_map->get_epoch());

  pair<ConnectionRef,ConnectionRef> ret;
  if (next_map->is_down(peer) ||
      next_map->get_info(peer).up_from > from_epoch) {
    release_map(next_map);
    return ret;  // both refs null
  }
  ret.first = osd->hb_back_client_messenger->connect_to_osd(
    next_map->get_hb_back_addrs(peer));
  ret.second = osd->hb_front_client_messenger->connect_to_osd(
    next_map->get_hb_front_addrs(peer));
  release_map(next_map);
  return ret;
}
1078
// Identity this OSD uses on the cluster-internal messenger.
entity_name_t OSDService::get_cluster_msgr_name() const
{
  return cluster_messenger->get_myname();
}
1083
1084 void OSDService::queue_want_pg_temp(pg_t pgid,
1085 const vector<int>& want,
1086 bool forced)
1087 {
1088 std::lock_guard l(pg_temp_lock);
1089 auto p = pg_temp_pending.find(pgid);
1090 if (p == pg_temp_pending.end() ||
1091 p->second.acting != want ||
1092 forced) {
1093 pg_temp_wanted[pgid] = {want, forced};
1094 }
1095 }
1096
// Drop any pg_temp request for this pgid, both queued (wanted) and
// already sent but unacked (pending).
void OSDService::remove_want_pg_temp(pg_t pgid)
{
  std::lock_guard l(pg_temp_lock);
  pg_temp_wanted.erase(pgid);
  pg_temp_pending.erase(pgid);
}
1103
// Move everything in pg_temp_wanted into pg_temp_pending (sent to the mon,
// now awaiting ack).  Caller holds pg_temp_lock (see send_pg_temp /
// requeue_pg_temp).
// NOTE: both merge() and insert() leave an existing pending entry for the
// same pgid untouched; the colliding wanted entry is dropped by clear().
void OSDService::_sent_pg_temp()
{
#ifdef HAVE_STDLIB_MAP_SPLICING
  // splice nodes over without reallocating (C++17 map::merge)
  pg_temp_pending.merge(pg_temp_wanted);
#else
  pg_temp_pending.insert(make_move_iterator(begin(pg_temp_wanted)),
			 make_move_iterator(end(pg_temp_wanted)));
#endif
  pg_temp_wanted.clear();
}
1114
// Move all pending (sent-but-unacked) pg_temp requests back into the
// wanted set so they will be resent, e.g. after a mon session reset.
void OSDService::requeue_pg_temp()
{
  std::lock_guard l(pg_temp_lock);
  // wanted overrides pending. note that remove_want_pg_temp
  // clears the item out of both.
  unsigned old_wanted = pg_temp_wanted.size();
  unsigned old_pending = pg_temp_pending.size();
  // merge wanted into pending, then swap: everything ends up in wanted
  _sent_pg_temp();
  pg_temp_wanted.swap(pg_temp_pending);
  dout(10) << __func__ << " " << old_wanted << " + " << old_pending << " -> "
	   << pg_temp_wanted.size() << dendl;
}
1127
1128 std::ostream& operator<<(std::ostream& out,
1129 const OSDService::pg_temp_t& pg_temp)
1130 {
1131 out << pg_temp.acting;
1132 if (pg_temp.forced) {
1133 out << " (forced)";
1134 }
1135 return out;
1136 }
1137
// Send all wanted pg_temp mappings to the mon.  Forced and non-forced
// requests are batched into (up to) two separate MOSDPGTemp messages,
// since 'forced' is a per-message flag.
void OSDService::send_pg_temp()
{
  std::lock_guard l(pg_temp_lock);
  if (pg_temp_wanted.empty())
    return;
  dout(10) << "send_pg_temp " << pg_temp_wanted << dendl;
  // ms[0]: normal requests, ms[1]: forced requests
  MOSDPGTemp *ms[2] = {nullptr, nullptr};
  for (auto& [pgid, pg_temp] : pg_temp_wanted) {
    auto& m = ms[pg_temp.forced];
    if (!m) {
      m = new MOSDPGTemp(osdmap->get_epoch());
      m->forced = pg_temp.forced;
    }
    m->pg_temp.emplace(pgid, pg_temp.acting);
  }
  for (auto m : ms) {
    if (m) {
      monc->send_mon_message(m);
    }
  }
  // move what we just sent into the pending (awaiting-ack) set
  _sent_pg_temp();
}
1160
1161 void OSDService::send_pg_created(pg_t pgid)
1162 {
1163 std::lock_guard l(pg_created_lock);
1164 dout(20) << __func__ << dendl;
1165 auto o = get_osdmap();
1166 if (o->require_osd_release >= CEPH_RELEASE_LUMINOUS) {
1167 pg_created.insert(pgid);
1168 monc->send_mon_message(new MOSDPGCreated(pgid));
1169 }
1170 }
1171
1172 void OSDService::send_pg_created()
1173 {
1174 std::lock_guard l(pg_created_lock);
1175 dout(20) << __func__ << dendl;
1176 auto o = get_osdmap();
1177 if (o->require_osd_release >= CEPH_RELEASE_LUMINOUS) {
1178 for (auto pgid : pg_created) {
1179 monc->send_mon_message(new MOSDPGCreated(pgid));
1180 }
1181 }
1182 }
1183
// Drop tracked pgids whose pool is gone or no longer in the CREATING
// state; notifications only need to be resent while the mon still has the
// pool flagged as creating.
void OSDService::prune_pg_created()
{
  std::lock_guard l(pg_created_lock);
  dout(20) << __func__ << dendl;
  auto o = get_osdmap();
  auto i = pg_created.begin();
  while (i != pg_created.end()) {
    auto p = o->get_pg_pool(i->pool());
    if (!p || !p->has_flag(pg_pool_t::FLAG_CREATING)) {
      dout(20) << __func__ << " pruning " << *i << dendl;
      i = pg_created.erase(i);
    } else {
      dout(20) << __func__ << " keeping " << *i << dendl;
      ++i;
    }
  }
}
1201
1202
1203 // --------------------------------------
1204 // dispatch
1205
1206 epoch_t OSDService::get_peer_epoch(int peer)
1207 {
1208 std::lock_guard l(peer_map_epoch_lock);
1209 map<int,epoch_t>::iterator p = peer_map_epoch.find(peer);
1210 if (p == peer_map_epoch.end())
1211 return 0;
1212 return p->second;
1213 }
1214
1215 epoch_t OSDService::note_peer_epoch(int peer, epoch_t e)
1216 {
1217 std::lock_guard l(peer_map_epoch_lock);
1218 map<int,epoch_t>::iterator p = peer_map_epoch.find(peer);
1219 if (p != peer_map_epoch.end()) {
1220 if (p->second < e) {
1221 dout(10) << "note_peer_epoch osd." << peer << " has " << e << dendl;
1222 p->second = e;
1223 } else {
1224 dout(30) << "note_peer_epoch osd." << peer << " has " << p->second << " >= " << e << dendl;
1225 }
1226 return p->second;
1227 } else {
1228 dout(10) << "note_peer_epoch osd." << peer << " now has " << e << dendl;
1229 peer_map_epoch[peer] = e;
1230 return e;
1231 }
1232 }
1233
// Forget what we know about a peer's map epoch, but only if our record is
// not newer than as_of -- a newer record means we have heard from the peer
// again since and should keep that knowledge.
void OSDService::forget_peer_epoch(int peer, epoch_t as_of)
{
  std::lock_guard l(peer_map_epoch_lock);
  map<int,epoch_t>::iterator p = peer_map_epoch.find(peer);
  if (p != peer_map_epoch.end()) {
    if (p->second <= as_of) {
      dout(10) << "forget_peer_epoch osd." << peer << " as_of " << as_of
	       << " had " << p->second << dendl;
      peer_map_epoch.erase(p);
    } else {
      dout(10) << "forget_peer_epoch osd." << peer << " as_of " << as_of
	       << " has " << p->second << " - not forgetting" << dendl;
    }
  }
}
1249
// Decide whether we should push our (newer) osdmap to the sender of a
// message.  Returns true when the peer appears to be behind our epoch.
// For clients, sent_epoch_p (the session's last_sent_epoch) suppresses
// re-sending a map we already shared on this session.
bool OSDService::should_share_map(entity_name_t name, Connection *con,
                                  epoch_t epoch, const OSDMapRef& osdmap,
                                  const epoch_t *sent_epoch_p)
{
  dout(20) << "should_share_map "
           << name << " " << con->get_peer_addr()
           << " " << epoch << dendl;

  // does client have old map?
  if (name.is_client()) {
    bool message_sendmap = epoch < osdmap->get_epoch();
    if (message_sendmap && sent_epoch_p) {
      dout(20) << "client session last_sent_epoch: "
               << *sent_epoch_p
               << " versus osdmap epoch " << osdmap->get_epoch() << dendl;
      if (*sent_epoch_p < osdmap->get_epoch()) {
        return true;
      } // else we don't need to send it out again
    }
  }

  // Peer OSD on the cluster messenger (but never the loopback connection,
  // i.e. ourselves)?  Verify the connection's address matches the map's
  // notion of the peer before trusting name.num() as the osd id.
  if (con->get_messenger() == osd->cluster_messenger &&
      con != osd->cluster_messenger->get_loopback_connection() &&
      osdmap->is_up(name.num()) &&
      (osdmap->get_cluster_addrs(name.num()) == con->get_peer_addrs() ||
       osdmap->get_hb_back_addrs(name.num()) == con->get_peer_addrs())) {
    // remember
    epoch_t has = std::max(get_peer_epoch(name.num()), epoch);

    // share?
    if (has < osdmap->get_epoch()) {
      dout(10) << name << " " << con->get_peer_addr()
               << " has old map " << epoch << " < "
               << osdmap->get_epoch() << dendl;
      return true;
    }
  }

  return false;
}
1290
// Share our osdmap with a peer (client or OSD) that sent us a message at
// an older epoch.  No-op if the OSD is not active or the peer is already
// up to date (see should_share_map).  For clients, *sent_epoch_p is
// updated so the same map is not sent twice on one session.
void OSDService::share_map(
    entity_name_t name,
    Connection *con,
    epoch_t epoch,
    OSDMapRef& osdmap,
    epoch_t *sent_epoch_p)
{
  dout(20) << "share_map "
	   << name << " " << con->get_peer_addr()
	   << " " << epoch << dendl;

  if (!osd->is_active()) {
    /*It is safe not to proceed as OSD is not in healthy state*/
    return;
  }

  bool want_shared = should_share_map(name, con, epoch,
                                      osdmap, sent_epoch_p);

  if (want_shared){
    if (name.is_client()) {
      dout(10) << name << " has old map " << epoch
          << " < " << osdmap->get_epoch() << dendl;
      // we know the Session is valid or we wouldn't be sending
      if (sent_epoch_p) {
	*sent_epoch_p = osdmap->get_epoch();
      }
      send_incremental_map(epoch, con, osdmap);
    } else if (con->get_messenger() == osd->cluster_messenger &&
        osdmap->is_up(name.num()) &&
        (osdmap->get_cluster_addrs(name.num()) == con->get_peer_addrs() ||
         osdmap->get_hb_back_addrs(name.num()) == con->get_peer_addrs())) {
      dout(10) << name << " " << con->get_peer_addrs()
	       << " has old map " << epoch << " < "
	       << osdmap->get_epoch() << dendl;
      // remember the peer now has our epoch so we don't resend later
      note_peer_epoch(name.num(), osdmap->get_epoch());
      send_incremental_map(epoch, con, osdmap);
    }
  }
}
1331
// Push newer maps to a peer OSD over an existing connection, based on our
// cached notion of the peer's epoch.  If we have no cached epoch for the
// peer, do nothing.
void OSDService::share_map_peer(int peer, Connection *con, OSDMapRef map)
{
  if (!map)
    map = get_osdmap();

  // send map?
  epoch_t pe = get_peer_epoch(peer);
  if (pe) {
    if (pe < map->get_epoch()) {
      send_incremental_map(pe, con, map);
      note_peer_epoch(peer, map->get_epoch());
    } else
      dout(20) << "share_map_peer " << con << " already has epoch " << pe << dendl;
  } else {
    dout(20) << "share_map_peer " << con << " don't know epoch, doing nothing" << dendl;
    // no idea about peer's epoch.
    // ??? send recent ???
    // do nothing.
  }
}
1352
1353 bool OSDService::can_inc_scrubs()
1354 {
1355 bool can_inc = false;
1356 std::lock_guard l(sched_scrub_lock);
1357
1358 if (scrubs_local + scrubs_remote < cct->_conf->osd_max_scrubs) {
1359 dout(20) << __func__ << " == true " << scrubs_local << " local + " << scrubs_remote
1360 << " remote < max " << cct->_conf->osd_max_scrubs << dendl;
1361 can_inc = true;
1362 } else {
1363 dout(20) << __func__ << " == false " << scrubs_local << " local + " << scrubs_remote
1364 << " remote >= max " << cct->_conf->osd_max_scrubs << dendl;
1365 }
1366
1367 return can_inc;
1368 }
1369
1370 bool OSDService::inc_scrubs_local()
1371 {
1372 bool result = false;
1373 std::lock_guard l{sched_scrub_lock};
1374 if (scrubs_local + scrubs_remote < cct->_conf->osd_max_scrubs) {
1375 dout(20) << __func__ << " " << scrubs_local << " -> " << (scrubs_local+1)
1376 << " (max " << cct->_conf->osd_max_scrubs << ", remote " << scrubs_remote << ")" << dendl;
1377 result = true;
1378 ++scrubs_local;
1379 } else {
1380 dout(20) << __func__ << " " << scrubs_local << " local + " << scrubs_remote << " remote >= max " << cct->_conf->osd_max_scrubs << dendl;
1381 }
1382 return result;
1383 }
1384
// Release one local scrub reservation taken via inc_scrubs_local().
void OSDService::dec_scrubs_local()
{
  std::lock_guard l{sched_scrub_lock};
  dout(20) << __func__ << " " << scrubs_local << " -> " << (scrubs_local-1)
	   << " (max " << cct->_conf->osd_max_scrubs << ", remote " << scrubs_remote << ")" << dendl;
  --scrubs_local;
  // must never release more reservations than were taken
  ceph_assert(scrubs_local >= 0);
}
1393
1394 bool OSDService::inc_scrubs_remote()
1395 {
1396 bool result = false;
1397 std::lock_guard l{sched_scrub_lock};
1398 if (scrubs_local + scrubs_remote < cct->_conf->osd_max_scrubs) {
1399 dout(20) << __func__ << " " << scrubs_remote << " -> " << (scrubs_remote+1)
1400 << " (max " << cct->_conf->osd_max_scrubs << ", local " << scrubs_local << ")" << dendl;
1401 result = true;
1402 ++scrubs_remote;
1403 } else {
1404 dout(20) << __func__ << " " << scrubs_local << " local + " << scrubs_remote << " remote >= max " << cct->_conf->osd_max_scrubs << dendl;
1405 }
1406 return result;
1407 }
1408
// Release one remote scrub reservation taken via inc_scrubs_remote().
void OSDService::dec_scrubs_remote()
{
  std::lock_guard l{sched_scrub_lock};
  dout(20) << __func__ << " " << scrubs_remote << " -> " << (scrubs_remote-1)
	   << " (max " << cct->_conf->osd_max_scrubs << ", local " << scrubs_local << ")" << dendl;
  --scrubs_remote;
  // must never release more reservations than were taken
  ceph_assert(scrubs_remote >= 0);
}
1417
// Dump the current scrub reservation counters (for the admin socket).
void OSDService::dump_scrub_reservations(Formatter *f)
{
  std::lock_guard l{sched_scrub_lock};
  f->dump_int("scrubs_local", scrubs_local);
  f->dump_int("scrubs_remote", scrubs_remote);
  f->dump_int("osd_max_scrubs", cct->_conf->osd_max_scrubs);
}
1425
// Read boot/up/bind epochs atomically under epoch_lock.  Each out-param
// may be nullptr if the caller does not need that value.
void OSDService::retrieve_epochs(epoch_t *_boot_epoch, epoch_t *_up_epoch,
                                 epoch_t *_bind_epoch) const
{
  std::lock_guard l(epoch_lock);
  if (_boot_epoch)
    *_boot_epoch = boot_epoch;
  if (_up_epoch)
    *_up_epoch = up_epoch;
  if (_bind_epoch)
    *_bind_epoch = bind_epoch;
}
1437
// Update boot/up/bind epochs atomically.  Each in-param may be nullptr to
// leave that epoch unchanged.  Values must move forward monotonically; a
// value of 0 is allowed as an explicit reset.
void OSDService::set_epochs(const epoch_t *_boot_epoch, const epoch_t *_up_epoch,
                            const epoch_t *_bind_epoch)
{
  std::lock_guard l(epoch_lock);
  if (_boot_epoch) {
    ceph_assert(*_boot_epoch == 0 || *_boot_epoch >= boot_epoch);
    boot_epoch = *_boot_epoch;
  }
  if (_up_epoch) {
    ceph_assert(*_up_epoch == 0 || *_up_epoch >= up_epoch);
    up_epoch = *_up_epoch;
  }
  if (_bind_epoch) {
    ceph_assert(*_bind_epoch == 0 || *_bind_epoch >= bind_epoch);
    bind_epoch = *_bind_epoch;
  }
}
1455
// Begin an orderly shutdown: NOT_STOPPING -> PREPARING_TO_STOP ->
// STOPPING.  If we are marked up in the osdmap, tell the mon we are going
// down and wait (up to osd_mon_shutdown_timeout seconds) for its ack so
// peers learn about our exit promptly.  Returns false if a stop is
// already under way.
bool OSDService::prepare_to_stop()
{
  std::lock_guard l(is_stopping_lock);
  if (get_state() != NOT_STOPPING)
    return false;

  OSDMapRef osdmap = get_osdmap();
  if (osdmap && osdmap->is_up(whoami)) {
    dout(0) << __func__ << " telling mon we are shutting down" << dendl;
    set_state(PREPARING_TO_STOP);
    monc->send_mon_message(
      new MOSDMarkMeDown(
	monc->get_fsid(),
	whoami,
	osdmap->get_addrs(whoami),
	osdmap->get_epoch(),
	true  // request ack
	));
    utime_t now = ceph_clock_now();
    utime_t timeout;
    timeout.set_from_double(now + cct->_conf->osd_mon_shutdown_timeout);
    // wait for got_stop_ack() to flip us to STOPPING, or the timeout;
    // the cond releases is_stopping_lock while waiting
    while ((ceph_clock_now() < timeout) &&
	   (get_state() != STOPPING)) {
      is_stopping_cond.WaitUntil(is_stopping_lock, timeout);
    }
  }
  // proceed with shutdown whether or not the ack arrived
  dout(0) << __func__ << " starting shutdown" << dendl;
  set_state(STOPPING);
  return true;
}
1486
1487 void OSDService::got_stop_ack()
1488 {
1489 std::lock_guard l(is_stopping_lock);
1490 if (get_state() == PREPARING_TO_STOP) {
1491 dout(0) << __func__ << " starting shutdown" << dendl;
1492 set_state(STOPPING);
1493 is_stopping_cond.Signal();
1494 } else {
1495 dout(10) << __func__ << " ignoring msg" << dendl;
1496 }
1497 }
1498
// Build an MOSDMap carrying maps in the range (since, to].  Starts with a
// full map if 'since' predates the oldest map we still have on disk.
// Output is capped by osd_map_message_max (count) and
// osd_map_message_max_bytes; the recipient re-requests from where we
// stopped.  If a map is missing ('panic'), send whatever was collected so
// far, or at minimum the newest map; abort only if even that fails.
MOSDMap *OSDService::build_incremental_map_msg(epoch_t since, epoch_t to,
                                               OSDSuperblock& sblock)
{
  MOSDMap *m = new MOSDMap(monc->get_fsid(),
			   osdmap->get_encoding_features());
  m->oldest_map = max_oldest_map;
  m->newest_map = sblock.newest_map;

  int max = cct->_conf->osd_map_message_max;
  ssize_t max_bytes = cct->_conf->osd_map_message_max_bytes;

  if (since < m->oldest_map) {
    // we don't have the next map the target wants, so start with a
    // full map.
    bufferlist bl;
    dout(10) << __func__ << " oldest map " << max_oldest_map << " > since "
	     << since << ", starting with full map" << dendl;
    since = m->oldest_map;
    if (!get_map_bl(since, bl)) {
      derr << __func__ << " missing full map " << since << dendl;
      goto panic;
    }
    max--;
    max_bytes -= bl.length();
    m->maps[since].claim(bl);
  }
  for (epoch_t e = since + 1; e <= to; ++e) {
    bufferlist bl;
    if (get_inc_map_bl(e, bl)) {
      m->incremental_maps[e].claim(bl);
    } else {
      // incremental missing; fall back to the full map for this epoch
      derr << __func__ << " missing incremental map " << e << dendl;
      if (!get_map_bl(e, bl)) {
	derr << __func__ << " also missing full map " << e << dendl;
	goto panic;
      }
      m->maps[e].claim(bl);
    }
    max--;
    max_bytes -= bl.length();
    // limits are checked after adding, so at least one map always goes out
    if (max <= 0 || max_bytes <= 0) {
      break;
    }
  }
  return m;

 panic:
  if (!m->maps.empty() ||
      !m->incremental_maps.empty()) {
    // send what we have so far
    return m;
  }
  // send something
  bufferlist bl;
  if (get_inc_map_bl(m->newest_map, bl)) {
    m->incremental_maps[m->newest_map].claim(bl);
  } else {
    derr << __func__ << " unable to load latest map " << m->newest_map << dendl;
    if (!get_map_bl(m->newest_map, bl)) {
      derr << __func__ << " unable to load latest full map " << m->newest_map
	   << dendl;
      // nothing at all to send -- the store is unusable
      ceph_abort();
    }
    m->maps[m->newest_map].claim(bl);
  }
  return m;
}
1566
// Queue an MOSDMap on the given connection (ownership of m passes to the
// messenger).
void OSDService::send_map(MOSDMap *m, Connection *con)
{
  con->send_message(m);
}
1571
// Send the peer the maps it is missing, from 'since' up to our current
// epoch.  If 'since' predates our oldest stored map, just send the latest
// full map instead; the range is also clamped to
// osd_map_share_max_epochs.
void OSDService::send_incremental_map(epoch_t since, Connection *con,
                                      OSDMapRef& osdmap)
{
  epoch_t to = osdmap->get_epoch();
  dout(10) << "send_incremental_map " << since << " -> " << to
           << " to " << con << " " << con->get_peer_addr() << dendl;

  MOSDMap *m = NULL;
  while (!m) {
    OSDSuperblock sblock(get_superblock());
    if (since < sblock.oldest_map) {
      // just send latest full map
      // NOTE: this inner 'm' deliberately shadows the outer one -- this
      // branch sends and returns without reaching the loop condition
      MOSDMap *m = new MOSDMap(monc->get_fsid(),
			       osdmap->get_encoding_features());
      m->oldest_map = max_oldest_map;
      m->newest_map = sblock.newest_map;
      get_map_bl(to, m->maps[to]);
      send_map(m, con);
      return;
    }

    if (to > since && (int64_t)(to - since) > cct->_conf->osd_map_share_max_epochs) {
      dout(10) << "  " << (to - since) << " > max " << cct->_conf->osd_map_share_max_epochs
	       << ", only sending most recent" << dendl;
      since = to - cct->_conf->osd_map_share_max_epochs;
    }

    m = build_incremental_map_msg(since, to, sblock);
  }
  send_map(m, con);
}
1603
// Fetch the encoded full map for epoch e, from the LRU cache or, on a
// miss, from the meta collection on disk (populating the cache).
// NOTE(review): unlike get_inc_map_bl() this does not take map_cache_lock
// itself; the leading underscore suggests callers hold it -- confirm at
// call sites.
bool OSDService::_get_map_bl(epoch_t e, bufferlist& bl)
{
  bool found = map_bl_cache.lookup(e, &bl);
  if (found) {
    if (logger)
      logger->inc(l_osd_map_bl_cache_hit);
    return true;
  }
  if (logger)
    logger->inc(l_osd_map_bl_cache_miss);
  found = store->read(meta_ch,
		      OSD::get_osdmap_pobject_name(e), 0, 0, bl,
		      CEPH_OSD_OP_FLAG_FADVISE_WILLNEED) >= 0;
  if (found) {
    _add_map_bl(e, bl);
  }
  return found;
}
1622
// Fetch the encoded incremental map for epoch e, from the LRU cache or,
// on a miss, from the meta collection on disk (populating the cache).
bool OSDService::get_inc_map_bl(epoch_t e, bufferlist& bl)
{
  std::lock_guard l(map_cache_lock);
  bool found = map_bl_inc_cache.lookup(e, &bl);
  if (found) {
    if (logger)
      logger->inc(l_osd_map_bl_cache_hit);
    return true;
  }
  if (logger)
    logger->inc(l_osd_map_bl_cache_miss);
  found = store->read(meta_ch,
		      OSD::get_inc_osdmap_pobject_name(e), 0, 0, bl,
		      CEPH_OSD_OP_FLAG_FADVISE_WILLNEED) >= 0;
  if (found) {
    _add_map_inc_bl(e, bl);
  }
  return found;
}
1642
// Cache the encoded full map for epoch e.
void OSDService::_add_map_bl(epoch_t e, bufferlist& bl)
{
  dout(10) << "add_map_bl " << e << " " << bl.length() << " bytes" << dendl;
  // cache a contiguous buffer
  if (bl.get_num_buffers() > 1) {
    bl.rebuild();
  }
  // account the cached buffer to the osd_mapbl mempool
  bl.try_assign_to_mempool(mempool::mempool_osd_mapbl);
  map_bl_cache.add(e, bl);
}
1653
// Cache the encoded incremental map for epoch e.
void OSDService::_add_map_inc_bl(epoch_t e, bufferlist& bl)
{
  dout(10) << "add_map_inc_bl " << e << " " << bl.length() << " bytes" << dendl;
  // cache a contiguous buffer
  if (bl.get_num_buffers() > 1) {
    bl.rebuild();
  }
  // account the cached buffer to the osd_mapbl mempool
  bl.try_assign_to_mempool(mempool::mempool_osd_mapbl);
  map_bl_inc_cache.add(e, bl);
}
1664
1665 int OSDService::get_deleted_pool_pg_num(int64_t pool)
1666 {
1667 std::lock_guard l(map_cache_lock);
1668 auto p = deleted_pool_pg_nums.find(pool);
1669 if (p != deleted_pool_pg_nums.end()) {
1670 return p->second;
1671 }
1672 dout(20) << __func__ << " " << pool << " loading" << dendl;
1673 ghobject_t oid = OSD::make_final_pool_info_oid(pool);
1674 bufferlist bl;
1675 int r = store->read(meta_ch, oid, 0, 0, bl);
1676 ceph_assert(r >= 0);
1677 auto blp = bl.cbegin();
1678 pg_pool_t pi;
1679 ::decode(pi, blp);
1680 deleted_pool_pg_nums[pool] = pi.get_pg_num();
1681 dout(20) << __func__ << " " << pool << " got " << pi.get_pg_num() << dendl;
1682 return pi.get_pg_num();
1683 }
1684
// Insert a decoded OSDMap into the map cache, deduplicating shared
// substructure against a nearby cached epoch to save memory.  Takes
// ownership of o: if an entry for this epoch already existed, o is
// deleted and the previously cached map is returned instead.
// NOTE(review): presumably requires map_cache_lock held by the caller
// (see try_get_map) -- confirm at other call sites.
OSDMapRef OSDService::_add_map(OSDMap *o)
{
  epoch_t e = o->get_epoch();

  if (cct->_conf->osd_map_dedup) {
    // Dedup against an existing map at a nearby epoch
    OSDMapRef for_dedup = map_cache.lower_bound(e);
    if (for_dedup) {
      OSDMap::dedup(for_dedup.get(), o);
    }
  }
  bool existed;
  OSDMapRef l = map_cache.add(e, o, &existed);
  if (existed) {
    delete o;
  }
  return l;
}
1703
// Return the OSDMap for the given epoch, from the cache or by loading and
// decoding it from disk.  Returns a null ref if the map is not available.
// Epoch 0 returns a fresh, empty initial map.
OSDMapRef OSDService::try_get_map(epoch_t epoch)
{
  std::lock_guard l(map_cache_lock);
  OSDMapRef retval = map_cache.lookup(epoch);
  if (retval) {
    dout(30) << "get_map " << epoch << " -cached" << dendl;
    if (logger) {
      logger->inc(l_osd_map_cache_hit);
    }
    return retval;
  }
  if (logger) {
    logger->inc(l_osd_map_cache_miss);
    // track how far below the cached window the miss was
    epoch_t lb = map_cache.cached_key_lower_bound();
    if (epoch < lb) {
      dout(30) << "get_map " << epoch << " - miss, below lower bound" << dendl;
      logger->inc(l_osd_map_cache_miss_low);
      logger->inc(l_osd_map_cache_miss_low_avg, lb - epoch);
    }
  }

  OSDMap *map = new OSDMap;
  if (epoch > 0) {
    dout(20) << "get_map " << epoch << " - loading and decoding " << map << dendl;
    bufferlist bl;
    if (!_get_map_bl(epoch, bl) || bl.length() == 0) {
      derr << "failed to load OSD map for epoch " << epoch << ", got " << bl.length() << " bytes" << dendl;
      delete map;
      return OSDMapRef();
    }
    map->decode(bl);
  } else {
    dout(20) << "get_map " << epoch << " - return initial " << map << dendl;
  }
  // _add_map takes ownership of map
  return _add_map(map);
}
1740
1741 // ops
1742
1743
// Convenience overload: error reply with zero version information.
void OSDService::reply_op_error(OpRequestRef op, int err)
{
  reply_op_error(op, err, eversion_t(), 0);
}
1748
1749 void OSDService::reply_op_error(OpRequestRef op, int err, eversion_t v,
1750 version_t uv)
1751 {
1752 const MOSDOp *m = static_cast<const MOSDOp*>(op->get_req());
1753 ceph_assert(m->get_type() == CEPH_MSG_OSD_OP);
1754 int flags;
1755 flags = m->get_flags() & (CEPH_OSD_FLAG_ACK|CEPH_OSD_FLAG_ONDISK);
1756
1757 MOSDOpReply *reply = new MOSDOpReply(m, err, osdmap->get_epoch(), flags, true);
1758 reply->set_reply_versions(v, uv);
1759 m->get_connection()->send_message(reply);
1760 }
1761
// Debugging aid, active only with osd_debug_misdirected_ops: log a
// cluster warning when an op arrives at a PG that is not its primary,
// unless the misdirection is an expected EC shard-remap race (see the
// comment below), in which case it is silently dropped.
void OSDService::handle_misdirected_op(PG *pg, OpRequestRef op)
{
  if (!cct->_conf->osd_debug_misdirected_ops) {
    return;
  }

  const MOSDOp *m = static_cast<const MOSDOp*>(op->get_req());
  ceph_assert(m->get_type() == CEPH_MSG_OSD_OP);

  ceph_assert(m->get_map_epoch() >= pg->get_history().same_primary_since);

  if (pg->is_ec_pg()) {
    /**
     * OSD recomputes op target based on current OSDMap. With an EC pg, we
     * can get this result:
     * 1) client at map 512 sends an op to osd 3, pg_t 3.9 based on mapping
     *    [CRUSH_ITEM_NONE, 2, 3]/3
     * 2) OSD 3 at map 513 remaps op to osd 3, spg_t 3.9s0 based on mapping
     *    [3, 2, 3]/3
     * 3) PG 3.9s0 dequeues the op at epoch 512 and notices that it isn't primary
     *    -- misdirected op
     * 4) client resends and this time PG 3.9s0 having caught up to 513 gets
     *    it and fulfils it
     *
     * We can't compute the op target based on the sending map epoch due to
     * splitting.  The simplest thing is to detect such cases here and drop
     * them without an error (the client will resend anyway).
     */
    ceph_assert(m->get_map_epoch() <= superblock.newest_map);
    OSDMapRef opmap = try_get_map(m->get_map_epoch());
    if (!opmap) {
      dout(7) << __func__ << ": " << *pg << " no longer have map for "
	      << m->get_map_epoch() << ", dropping" << dendl;
      return;
    }
    pg_t _pgid = m->get_raw_pg();
    spg_t pgid;
    if ((m->get_flags() & CEPH_OSD_FLAG_PGOP) == 0)
      _pgid = opmap->raw_pg_to_pg(_pgid);
    // if the primary shard differed at the client's epoch, this is the
    // expected race: drop without warning
    if (opmap->get_primary_shard(_pgid, &pgid) &&
	pgid.shard != pg->pg_id.shard) {
      dout(7) << __func__ << ": " << *pg << " primary changed since "
	      << m->get_map_epoch() << ", dropping" << dendl;
      return;
    }
  }

  dout(7) << *pg << " misdirected op in " << m->get_map_epoch() << dendl;
  clog->warn() << m->get_source_inst() << " misdirected " << m->get_reqid()
	       << " pg " << m->get_raw_pg()
	       << " to osd." << whoami
	       << " not " << pg->get_acting()
	       << " in e" << m->get_map_epoch() << "/" << osdmap->get_epoch();
}
1816
// Queue a work item at the back of the sharded op workqueue.
void OSDService::enqueue_back(OpQueueItem&& qi)
{
  osd->op_shardedwq.queue(std::move(qi));
}
1821
// Queue a work item at the front of the sharded op workqueue (used for
// requeues that must retain ordering).
void OSDService::enqueue_front(OpQueueItem&& qi)
{
  osd->op_shardedwq.queue_front(std::move(qi));
}
1826
// Schedule a recovery continuation for the given PG on the op workqueue,
// tagged with the current map epoch so stale items can be discarded.
void OSDService::queue_recovery_context(
  PG *pg,
  GenContext<ThreadPool::TPHandle&> *c)
{
  epoch_t e = get_osdmap_epoch();
  enqueue_back(
    OpQueueItem(
      unique_ptr<OpQueueItem::OpQueueable>(
	new PGRecoveryContext(pg->get_pgid(), c, e)),
      cct->_conf->osd_recovery_cost,
      cct->_conf->osd_recovery_priority,
      ceph_clock_now(),
      0,
      e));
}
1842
// Schedule snap trimming work for the given PG on the op workqueue.
void OSDService::queue_for_snap_trim(PG *pg)
{
  dout(10) << "queueing " << *pg << " for snaptrim" << dendl;
  enqueue_back(
    OpQueueItem(
      unique_ptr<OpQueueItem::OpQueueable>(
	new PGSnapTrim(pg->get_pgid(), pg->get_osdmap_epoch())),
      cct->_conf->osd_snap_trim_cost,
      cct->_conf->osd_snap_trim_priority,
      ceph_clock_now(),
      0,
      pg->get_osdmap_epoch()));
}
1856
// Schedule scrub work for the given PG.  With with_high_priority, the
// queue priority is raised to at least osd_client_op_priority (used for
// operator-requested scrubs).
void OSDService::queue_for_scrub(PG *pg, bool with_high_priority)
{
  unsigned scrub_queue_priority = pg->scrubber.priority;
  if (with_high_priority && scrub_queue_priority < cct->_conf->osd_client_op_priority) {
    scrub_queue_priority = cct->_conf->osd_client_op_priority;
  }
  const auto epoch = pg->get_osdmap_epoch();
  enqueue_back(
    OpQueueItem(
      unique_ptr<OpQueueItem::OpQueueable>(new PGScrub(pg->get_pgid(), epoch)),
      cct->_conf->osd_scrub_cost,
      scrub_queue_priority,
      ceph_clock_now(),
      0,
      epoch));
}
1873
// Schedule (continued) deletion work for a PG on the op workqueue.
void OSDService::queue_for_pg_delete(spg_t pgid, epoch_t e)
{
  dout(10) << __func__ << " on " << pgid << " e " << e << dendl;
  enqueue_back(
    OpQueueItem(
      unique_ptr<OpQueueItem::OpQueueable>(
	new PGDelete(pgid, e)),
      cct->_conf->osd_pg_delete_cost,
      cct->_conf->osd_pg_delete_priority,
      ceph_clock_now(),
      0,
      e));
}
1887
// Forward to OSD: attempt to finalize a PG deletion (remove it from the
// OSD's pg map).  Returns the OSD's verdict.
bool OSDService::try_finish_pg_delete(PG *pg, unsigned old_pg_num)
{
  return osd->try_finish_pg_delete(pg, old_pg_num);
}
1892
1893 // ---
1894
1895 void OSDService::set_ready_to_merge_source(PG *pg, eversion_t version)
1896 {
1897 std::lock_guard l(merge_lock);
1898 dout(10) << __func__ << " " << pg->pg_id << dendl;
1899 ready_to_merge_source[pg->pg_id.pgid] = version;
1900 assert(not_ready_to_merge_source.count(pg->pg_id.pgid) == 0);
1901 _send_ready_to_merge();
1902 }
1903
1904 void OSDService::set_ready_to_merge_target(PG *pg,
1905 eversion_t version,
1906 epoch_t last_epoch_started,
1907 epoch_t last_epoch_clean)
1908 {
1909 std::lock_guard l(merge_lock);
1910 dout(10) << __func__ << " " << pg->pg_id << dendl;
1911 ready_to_merge_target.insert(make_pair(pg->pg_id.pgid,
1912 make_tuple(version,
1913 last_epoch_started,
1914 last_epoch_clean)));
1915 assert(not_ready_to_merge_target.count(pg->pg_id.pgid) == 0);
1916 _send_ready_to_merge();
1917 }
1918
1919 void OSDService::set_not_ready_to_merge_source(pg_t source)
1920 {
1921 std::lock_guard l(merge_lock);
1922 dout(10) << __func__ << " " << source << dendl;
1923 not_ready_to_merge_source.insert(source);
1924 assert(ready_to_merge_source.count(source) == 0);
1925 _send_ready_to_merge();
1926 }
1927
1928 void OSDService::set_not_ready_to_merge_target(pg_t target, pg_t source)
1929 {
1930 std::lock_guard l(merge_lock);
1931 dout(10) << __func__ << " " << target << " source " << source << dendl;
1932 not_ready_to_merge_target[target] = source;
1933 assert(ready_to_merge_target.count(target) == 0);
1934 _send_ready_to_merge();
1935 }
1936
// Public locking wrapper around _send_ready_to_merge().
void OSDService::send_ready_to_merge()
{
  std::lock_guard l(merge_lock);
  _send_ready_to_merge();
}
1942
// Push merge readiness to the mon.  Caller holds merge_lock.
// Not-ready notifications (ready=false) go out first for both sources and
// targets; a ready notification is sent only once both the source and its
// merge target (source.get_parent()) are ready.  sent_ready_to_merge_source
// deduplicates so each source pgid is reported at most once.
void OSDService::_send_ready_to_merge()
{
  dout(20) << __func__
	   << " ready_to_merge_source " << ready_to_merge_source
	   << " not_ready_to_merge_source " << not_ready_to_merge_source
	   << " ready_to_merge_target " << ready_to_merge_target
	   << " not_ready_to_merge_target " << not_ready_to_merge_target
	   << " sent_ready_to_merge_source " << sent_ready_to_merge_source
	   << dendl;
  for (auto src : not_ready_to_merge_source) {
    if (sent_ready_to_merge_source.count(src) == 0) {
      monc->send_mon_message(new MOSDPGReadyToMerge(
			       src,
			       {}, {}, 0, 0,
			       false,
			       osdmap->get_epoch()));
      sent_ready_to_merge_source.insert(src);
    }
  }
  for (auto p : not_ready_to_merge_target) {
    // p.second is the source pgid recorded for this not-ready target
    if (sent_ready_to_merge_source.count(p.second) == 0) {
      monc->send_mon_message(new MOSDPGReadyToMerge(
			       p.second,
			       {}, {}, 0, 0,
			       false,
			       osdmap->get_epoch()));
      sent_ready_to_merge_source.insert(p.second);
    }
  }
  for (auto src : ready_to_merge_source) {
    // skip if either side has since been declared not ready
    if (not_ready_to_merge_source.count(src.first) ||
	not_ready_to_merge_target.count(src.first.get_parent())) {
      continue;
    }
    auto p = ready_to_merge_target.find(src.first.get_parent());
    if (p != ready_to_merge_target.end() &&
	sent_ready_to_merge_source.count(src.first) == 0) {
      monc->send_mon_message(new MOSDPGReadyToMerge(
			       src.first,           // source pgid
			       src.second,          // src version
			       std::get<0>(p->second),	 // target version
			       std::get<1>(p->second),	 // PG's last_epoch_started
			       std::get<2>(p->second),	 // PG's last_epoch_clean
			       true,
			       osdmap->get_epoch()));
      sent_ready_to_merge_source.insert(src.first);
    }
  }
}
1992
1993 void OSDService::clear_ready_to_merge(PG *pg)
1994 {
1995 std::lock_guard l(merge_lock);
1996 dout(10) << __func__ << " " << pg->pg_id << dendl;
1997 ready_to_merge_source.erase(pg->pg_id.pgid);
1998 ready_to_merge_target.erase(pg->pg_id.pgid);
1999 not_ready_to_merge_source.erase(pg->pg_id.pgid);
2000 not_ready_to_merge_target.erase(pg->pg_id.pgid);
2001 sent_ready_to_merge_source.erase(pg->pg_id.pgid);
2002 }
2003
// Forget which ready-to-merge notifications were already sent, so they
// will all be resent (e.g. after a mon session reset).
void OSDService::clear_sent_ready_to_merge()
{
  std::lock_guard l(merge_lock);
  sent_ready_to_merge_source.clear();
}
2009
2010 void OSDService::prune_sent_ready_to_merge(OSDMapRef& osdmap)
2011 {
2012 std::lock_guard l(merge_lock);
2013 auto i = sent_ready_to_merge_source.begin();
2014 while (i != sent_ready_to_merge_source.end()) {
2015 if (!osdmap->pg_exists(*i)) {
2016 dout(10) << __func__ << " " << *i << dendl;
2017 i = sent_ready_to_merge_source.erase(i);
2018 } else {
2019 ++i;
2020 }
2021 }
2022 }
2023
2024 // ---
2025
// Schedule recovery for a PG with a number of reserved push slots.
// Caller holds recovery_lock (enforced by the assert).
void OSDService::_queue_for_recovery(
  std::pair<epoch_t, PGRef> p,
  uint64_t reserved_pushes)
{
  ceph_assert(recovery_lock.is_locked_by_me());
  enqueue_back(
    OpQueueItem(
      unique_ptr<OpQueueItem::OpQueueable>(
	new PGRecovery(
	  p.second->get_pgid(), p.first, reserved_pushes)),
      cct->_conf->osd_recovery_cost,
      cct->_conf->osd_recovery_priority,
      ceph_clock_now(),
      0,
      p.first));
}
2042
2043 // ====================================================================
2044 // OSD
2045
2046 #undef dout_prefix
2047 #define dout_prefix *_dout
2048
// Commands shared between OSD's console and admin console:
namespace ceph {
namespace osd_cmds {

// Handler for the "heap" asok command (called from OSD::asok_command).
// Forward declaration only; the definition is not in this section.
int heap(CephContext& cct, const cmdmap_t& cmdmap, Formatter& f, std::ostream& os);

}} // namespace ceph::osd_cmds
2056
2057 int OSD::mkfs(CephContext *cct, ObjectStore *store, uuid_d fsid, int whoami)
2058 {
2059 int ret;
2060
2061 OSDSuperblock sb;
2062 bufferlist sbbl;
2063 ObjectStore::CollectionHandle ch;
2064
2065 // if we are fed a uuid for this osd, use it.
2066 store->set_fsid(cct->_conf->osd_uuid);
2067
2068 ret = store->mkfs();
2069 if (ret) {
2070 derr << "OSD::mkfs: ObjectStore::mkfs failed with error "
2071 << cpp_strerror(ret) << dendl;
2072 goto free_store;
2073 }
2074
2075 store->set_cache_shards(1); // doesn't matter for mkfs!
2076
2077 ret = store->mount();
2078 if (ret) {
2079 derr << "OSD::mkfs: couldn't mount ObjectStore: error "
2080 << cpp_strerror(ret) << dendl;
2081 goto free_store;
2082 }
2083
2084 ch = store->open_collection(coll_t::meta());
2085 if (ch) {
2086 ret = store->read(ch, OSD_SUPERBLOCK_GOBJECT, 0, 0, sbbl);
2087 if (ret < 0) {
2088 derr << "OSD::mkfs: have meta collection but no superblock" << dendl;
2089 goto free_store;
2090 }
2091 /* if we already have superblock, check content of superblock */
2092 dout(0) << " have superblock" << dendl;
2093 auto p = sbbl.cbegin();
2094 decode(sb, p);
2095 if (whoami != sb.whoami) {
2096 derr << "provided osd id " << whoami << " != superblock's " << sb.whoami
2097 << dendl;
2098 ret = -EINVAL;
2099 goto umount_store;
2100 }
2101 if (fsid != sb.cluster_fsid) {
2102 derr << "provided cluster fsid " << fsid
2103 << " != superblock's " << sb.cluster_fsid << dendl;
2104 ret = -EINVAL;
2105 goto umount_store;
2106 }
2107 } else {
2108 // create superblock
2109 sb.cluster_fsid = fsid;
2110 sb.osd_fsid = store->get_fsid();
2111 sb.whoami = whoami;
2112 sb.compat_features = get_osd_initial_compat_set();
2113
2114 bufferlist bl;
2115 encode(sb, bl);
2116
2117 ObjectStore::CollectionHandle ch = store->create_new_collection(
2118 coll_t::meta());
2119 ObjectStore::Transaction t;
2120 t.create_collection(coll_t::meta(), 0);
2121 t.write(coll_t::meta(), OSD_SUPERBLOCK_GOBJECT, 0, bl.length(), bl);
2122 ret = store->queue_transaction(ch, std::move(t));
2123 if (ret) {
2124 derr << "OSD::mkfs: error while writing OSD_SUPERBLOCK_GOBJECT: "
2125 << "queue_transaction returned " << cpp_strerror(ret) << dendl;
2126 goto umount_store;
2127 }
2128 }
2129
2130 ret = write_meta(cct, store, sb.cluster_fsid, sb.osd_fsid, whoami);
2131 if (ret) {
2132 derr << "OSD::mkfs: failed to write fsid file: error "
2133 << cpp_strerror(ret) << dendl;
2134 goto umount_store;
2135 }
2136
2137 umount_store:
2138 if (ch) {
2139 ch.reset();
2140 }
2141 store->umount();
2142 free_store:
2143 delete store;
2144 return ret;
2145 }
2146
2147 int OSD::write_meta(CephContext *cct, ObjectStore *store, uuid_d& cluster_fsid, uuid_d& osd_fsid, int whoami)
2148 {
2149 char val[80];
2150 int r;
2151
2152 snprintf(val, sizeof(val), "%s", CEPH_OSD_ONDISK_MAGIC);
2153 r = store->write_meta("magic", val);
2154 if (r < 0)
2155 return r;
2156
2157 snprintf(val, sizeof(val), "%d", whoami);
2158 r = store->write_meta("whoami", val);
2159 if (r < 0)
2160 return r;
2161
2162 cluster_fsid.print(val);
2163 r = store->write_meta("ceph_fsid", val);
2164 if (r < 0)
2165 return r;
2166
2167 string key = cct->_conf.get_val<string>("key");
2168 if (key.size()) {
2169 r = store->write_meta("osd_key", key);
2170 if (r < 0)
2171 return r;
2172 } else {
2173 string keyfile = cct->_conf.get_val<string>("keyfile");
2174 if (!keyfile.empty()) {
2175 bufferlist keybl;
2176 string err;
2177 r = keybl.read_file(keyfile.c_str(), &err);
2178 if (r < 0) {
2179 derr << __func__ << " failed to read keyfile " << keyfile << ": "
2180 << err << ": " << cpp_strerror(r) << dendl;
2181 return r;
2182 }
2183 r = store->write_meta("osd_key", keybl.to_str());
2184 if (r < 0)
2185 return r;
2186 }
2187 }
2188
2189 r = store->write_meta("ready", "ready");
2190 if (r < 0)
2191 return r;
2192
2193 return 0;
2194 }
2195
2196 int OSD::peek_meta(ObjectStore *store,
2197 std::string *magic,
2198 uuid_d *cluster_fsid,
2199 uuid_d *osd_fsid,
2200 int *whoami,
2201 int *require_osd_release)
2202 {
2203 string val;
2204
2205 int r = store->read_meta("magic", &val);
2206 if (r < 0)
2207 return r;
2208 *magic = val;
2209
2210 r = store->read_meta("whoami", &val);
2211 if (r < 0)
2212 return r;
2213 *whoami = atoi(val.c_str());
2214
2215 r = store->read_meta("ceph_fsid", &val);
2216 if (r < 0)
2217 return r;
2218 r = cluster_fsid->parse(val.c_str());
2219 if (!r)
2220 return -EINVAL;
2221
2222 r = store->read_meta("fsid", &val);
2223 if (r < 0) {
2224 *osd_fsid = uuid_d();
2225 } else {
2226 r = osd_fsid->parse(val.c_str());
2227 if (!r)
2228 return -EINVAL;
2229 }
2230
2231 r = store->read_meta("require_osd_release", &val);
2232 if (r >= 0) {
2233 *require_osd_release = atoi(val.c_str());
2234 }
2235
2236 return 0;
2237 }
2238
2239
2240 #undef dout_prefix
2241 #define dout_prefix _prefix(_dout, whoami, get_osdmap_epoch())
2242
2243 // cons/des
2244
// OSD constructor: wires together the messengers, mon/mgr/log clients,
// timers, thread pools and the sharded op work queue, then sizes and
// allocates the op shards.  Heavy startup work (mounting the store,
// booting) happens later in pre_init()/init(), not here.
OSD::OSD(CephContext *cct_, ObjectStore *store_,
	 int id,
	 Messenger *internal_messenger,
	 Messenger *external_messenger,
	 Messenger *hb_client_front,
	 Messenger *hb_client_back,
	 Messenger *hb_front_serverm,
	 Messenger *hb_back_serverm,
	 Messenger *osdc_messenger,
	 MonClient *mc,
	 const std::string &dev, const std::string &jdev) :
  Dispatcher(cct_),
  osd_lock("OSD::osd_lock"),
  tick_timer(cct, osd_lock),
  tick_timer_lock("OSD::tick_timer_lock"),
  tick_timer_without_osd_lock(cct, tick_timer_lock),
  gss_ktfile_client(cct->_conf.get_val<std::string>("gss_ktab_client_file")),
  cluster_messenger(internal_messenger),
  client_messenger(external_messenger),
  objecter_messenger(osdc_messenger),
  monc(mc),
  mgrc(cct_, client_messenger),
  logger(NULL),
  recoverystate_perf(NULL),
  store(store_),
  log_client(cct, client_messenger, &mc->monmap, LogClient::NO_FLAGS),
  clog(log_client.create_channel()),
  whoami(id),
  dev_path(dev), journal_path(jdev),
  store_is_rotational(store->is_rotational()),
  trace_endpoint("0.0.0.0", 0, "osd"),
  asok_hook(NULL),
  m_osd_pg_epoch_max_lag_factor(cct->_conf.get_val<double>(
				  "osd_pg_epoch_max_lag_factor")),
  osd_compat(get_osd_compat_set()),
  osd_op_tp(cct, "OSD::osd_op_tp", "tp_osd_tp",
	    get_num_op_threads()),
  command_tp(cct, "OSD::command_tp", "tp_osd_cmd", 1),
  session_waiting_lock("OSD::session_waiting_lock"),
  osdmap_subscribe_lock("OSD::osdmap_subscribe_lock"),
  heartbeat_lock("OSD::heartbeat_lock"),
  heartbeat_stop(false),
  heartbeat_need_update(true),
  hb_front_client_messenger(hb_client_front),
  hb_back_client_messenger(hb_client_back),
  hb_front_server_messenger(hb_front_serverm),
  hb_back_server_messenger(hb_back_serverm),
  daily_loadavg(0.0),
  heartbeat_thread(this),
  heartbeat_dispatcher(this),
  op_tracker(cct, cct->_conf->osd_enable_op_tracker,
	     cct->_conf->osd_num_op_tracker_shard),
  test_ops_hook(NULL),
  op_queue(get_io_queue()),
  op_prio_cutoff(get_io_prio_cut()),
  op_shardedwq(
    this,
    cct->_conf->osd_op_thread_timeout,
    cct->_conf->osd_op_thread_suicide_timeout,
    &osd_op_tp),
  map_lock("OSD::map_lock"),
  last_pg_create_epoch(0),
  mon_report_lock("OSD::mon_report_lock"),
  boot_finisher(cct),
  up_thru_wanted(0),
  requested_full_first(0),
  requested_full_last(0),
  command_wq(
    this,
    cct->_conf->osd_command_thread_timeout,
    cct->_conf->osd_command_thread_suicide_timeout,
    &command_tp),
  service(this)
{

  // Export the client keytab path for GSSAPI before any auth happens.
  if (!gss_ktfile_client.empty()) {
    // Assert we can export environment variable
    /*
     The default client keytab is used, if it is present and readable,
     to automatically obtain initial credentials for GSSAPI client
     applications. The principal name of the first entry in the client
     keytab is used by default when obtaining initial credentials.
     1. The KRB5_CLIENT_KTNAME environment variable.
     2. The default_client_keytab_name profile variable in [libdefaults].
     3. The hardcoded default, DEFCKTNAME.
    */
    const int32_t set_result(setenv("KRB5_CLIENT_KTNAME",
				    gss_ktfile_client.c_str(), 1));
    ceph_assert(set_result == 0);
  }

  monc->set_messenger(client_messenger);
  // Op-tracker thresholds and history sizing come straight from config.
  op_tracker.set_complaint_and_threshold(cct->_conf->osd_op_complaint_time,
					 cct->_conf->osd_op_log_threshold);
  op_tracker.set_history_size_and_duration(cct->_conf->osd_op_history_size,
					   cct->_conf->osd_op_history_duration);
  op_tracker.set_history_slow_op_size_and_threshold(cct->_conf->osd_op_history_slow_op_size,
						    cct->_conf->osd_op_history_slow_op_threshold);
#ifdef WITH_BLKIN
  std::stringstream ss;
  ss << "osd." << whoami;
  trace_endpoint.copy_name(ss.str());
#endif

  // initialize shards
  num_shards = get_num_op_shards();
  for (uint32_t i = 0; i < num_shards; i++) {
    OSDShard *one_shard = new OSDShard(
      i,
      cct,
      this,
      cct->_conf->osd_op_pq_max_tokens_per_priority,
      cct->_conf->osd_op_pq_min_cost,
      op_queue);
    shards.push_back(one_shard);
  }
}
2362
OSD::~OSD()
{
  // Tear-down order matters: free the op shards first, then unregister
  // both perf counters from the collection before deleting them, and
  // delete the ObjectStore last (the OSD owns it).
  while (!shards.empty()) {
    delete shards.back();
    shards.pop_back();
  }
  delete class_handler;
  cct->get_perfcounters_collection()->remove(recoverystate_perf);
  cct->get_perfcounters_collection()->remove(logger);
  delete recoverystate_perf;
  delete logger;
  delete store;
}
2376
2377 double OSD::get_tick_interval() const
2378 {
2379 // vary +/- 5% to avoid scrub scheduling livelocks
2380 constexpr auto delta = 0.05;
2381 return (OSD_TICK_INTERVAL *
2382 ceph::util::generate_random_number(1.0 - delta, 1.0 + delta));
2383 }
2384
// Forward declaration (implemented elsewhere); hands the OSD's
// ClassHandler to the objclass subsystem.
void cls_initialize(ClassHandler *ch);
2386
2387 void OSD::handle_signal(int signum)
2388 {
2389 ceph_assert(signum == SIGINT || signum == SIGTERM);
2390 derr << "*** Got signal " << sig_str(signum) << " ***" << dendl;
2391 shutdown();
2392 }
2393
2394 int OSD::pre_init()
2395 {
2396 std::lock_guard lock(osd_lock);
2397 if (is_stopping())
2398 return 0;
2399
2400 if (store->test_mount_in_use()) {
2401 derr << "OSD::pre_init: object store '" << dev_path << "' is "
2402 << "currently in use. (Is ceph-osd already running?)" << dendl;
2403 return -EBUSY;
2404 }
2405
2406 cct->_conf.add_observer(this);
2407 return 0;
2408 }
2409
// Best-effort NUMA pinning: if the objectstore and both network
// interfaces resolve to the same NUMA node (and osd_numa_auto_affinity
// is enabled), or osd_numa_node is configured explicitly, restrict this
// process's CPU affinity to that node's CPUs.  All failures are logged
// and degrade to "no affinity"; the function always returns 0.
int OSD::set_numa_affinity()
{
  // storage numa node
  int store_node = -1;
  store->get_numa_node(&store_node, nullptr, nullptr);
  if (store_node >= 0) {
    dout(1) << __func__ << " storage numa node " << store_node << dendl;
  }

  // check network numa node(s)
  int front_node = -1, back_node = -1;
  string front_iface = pick_iface(
    cct,
    client_messenger->get_myaddrs().front().get_sockaddr_storage());
  string back_iface = pick_iface(
    cct,
    cluster_messenger->get_myaddrs().front().get_sockaddr_storage());
  int r = get_iface_numa_node(front_iface, &front_node);
  if (r >= 0) {
    dout(1) << __func__ << " public network " << front_iface << " numa node "
	    << front_node << dendl;
    // the cluster interface is only probed when the public one resolved
    r = get_iface_numa_node(back_iface, &back_node);
    if (r >= 0) {
      dout(1) << __func__ << " cluster network " << back_iface << " numa node "
	      << back_node << dendl;
      if (front_node == back_node &&
	  front_node == store_node) {
	dout(1) << " objectstore and network numa nodes all match" << dendl;
	if (g_conf().get_val<bool>("osd_numa_auto_affinity")) {
	  numa_node = front_node;
	}
      } else {
	dout(1) << __func__ << " objectstore and network numa nodes do not match"
		<< dendl;
      }
    }
  } else {
    derr << __func__ << " unable to identify public interface '" << front_iface
	 << "' numa node: " << cpp_strerror(r) << dendl;
  }
  if (int node = g_conf().get_val<int64_t>("osd_numa_node"); node >= 0) {
    // this takes precedence over the automagic logic above
    numa_node = node;
  }
  if (numa_node >= 0) {
    // NOTE: this block-local r shadows the one above.
    int r = get_numa_node_cpu_set(numa_node, &numa_cpu_set_size, &numa_cpu_set);
    if (r < 0) {
      dout(1) << __func__ << " unable to determine numa node " << numa_node
	      << " CPUs" << dendl;
      numa_node = -1;
    } else {
      dout(1) << __func__ << " setting numa affinity to node " << numa_node
	      << " cpus "
	      << cpu_set_to_str_list(numa_cpu_set_size, &numa_cpu_set)
	      << dendl;
      r = sched_setaffinity(getpid(), numa_cpu_set_size, &numa_cpu_set);
      if (r < 0) {
	r = -errno;
	derr << __func__ << " failed to set numa affinity: " << cpp_strerror(r)
	     << dendl;
	numa_node = -1;
      }
    }
  } else {
    dout(1) << __func__ << " not setting numa affinity" << dendl;
  }
  return 0;
}
2478
2479 // asok
2480
2481 class OSDSocketHook : public AdminSocketHook {
2482 OSD *osd;
2483 public:
2484 explicit OSDSocketHook(OSD *o) : osd(o) {}
2485 bool call(std::string_view admin_command, const cmdmap_t& cmdmap,
2486 std::string_view format, bufferlist& out) override {
2487 stringstream ss;
2488 bool r = true;
2489 try {
2490 r = osd->asok_command(admin_command, cmdmap, format, ss);
2491 } catch (const bad_cmd_get& e) {
2492 ss << e.what();
2493 r = true;
2494 }
2495 out.append(ss);
2496 return r;
2497 }
2498 };
2499
2500 std::set<int64_t> OSD::get_mapped_pools()
2501 {
2502 std::set<int64_t> pools;
2503 std::vector<spg_t> pgids;
2504 _get_pgids(&pgids);
2505 for (const auto &pgid : pgids) {
2506 pools.insert(pgid.pool());
2507 }
2508 return pools;
2509 }
2510
2511 bool OSD::asok_command(std::string_view admin_command, const cmdmap_t& cmdmap,
2512 std::string_view format, ostream& ss)
2513 {
2514 Formatter *f = Formatter::create(format, "json-pretty", "json-pretty");
2515 if (admin_command == "status") {
2516 f->open_object_section("status");
2517 f->dump_stream("cluster_fsid") << superblock.cluster_fsid;
2518 f->dump_stream("osd_fsid") << superblock.osd_fsid;
2519 f->dump_unsigned("whoami", superblock.whoami);
2520 f->dump_string("state", get_state_name(get_state()));
2521 f->dump_unsigned("oldest_map", superblock.oldest_map);
2522 f->dump_unsigned("newest_map", superblock.newest_map);
2523 f->dump_unsigned("num_pgs", num_pgs);
2524 f->close_section();
2525 } else if (admin_command == "flush_journal") {
2526 store->flush_journal();
2527 } else if (admin_command == "dump_ops_in_flight" ||
2528 admin_command == "ops" ||
2529 admin_command == "dump_blocked_ops" ||
2530 admin_command == "dump_historic_ops" ||
2531 admin_command == "dump_historic_ops_by_duration" ||
2532 admin_command == "dump_historic_slow_ops") {
2533
2534 const string error_str = "op_tracker tracking is not enabled now, so no ops are tracked currently, \
2535 even those get stuck. Please enable \"osd_enable_op_tracker\", and the tracker \
2536 will start to track new ops received afterwards.";
2537
2538 set<string> filters;
2539 vector<string> filter_str;
2540 if (cmd_getval(cct, cmdmap, "filterstr", filter_str)) {
2541 copy(filter_str.begin(), filter_str.end(),
2542 inserter(filters, filters.end()));
2543 }
2544
2545 if (admin_command == "dump_ops_in_flight" ||
2546 admin_command == "ops") {
2547 if (!op_tracker.dump_ops_in_flight(f, false, filters)) {
2548 ss << error_str;
2549 }
2550 }
2551 if (admin_command == "dump_blocked_ops") {
2552 if (!op_tracker.dump_ops_in_flight(f, true, filters)) {
2553 ss << error_str;
2554 }
2555 }
2556 if (admin_command == "dump_historic_ops") {
2557 if (!op_tracker.dump_historic_ops(f, false, filters)) {
2558 ss << error_str;
2559 }
2560 }
2561 if (admin_command == "dump_historic_ops_by_duration") {
2562 if (!op_tracker.dump_historic_ops(f, true, filters)) {
2563 ss << error_str;
2564 }
2565 }
2566 if (admin_command == "dump_historic_slow_ops") {
2567 if (!op_tracker.dump_historic_slow_ops(f, filters)) {
2568 ss << error_str;
2569 }
2570 }
2571 } else if (admin_command == "dump_op_pq_state") {
2572 f->open_object_section("pq");
2573 op_shardedwq.dump(f);
2574 f->close_section();
2575 } else if (admin_command == "dump_blacklist") {
2576 list<pair<entity_addr_t,utime_t> > bl;
2577 OSDMapRef curmap = service.get_osdmap();
2578
2579 f->open_array_section("blacklist");
2580 curmap->get_blacklist(&bl);
2581 for (list<pair<entity_addr_t,utime_t> >::iterator it = bl.begin();
2582 it != bl.end(); ++it) {
2583 f->open_object_section("entry");
2584 f->open_object_section("entity_addr_t");
2585 it->first.dump(f);
2586 f->close_section(); //entity_addr_t
2587 it->second.localtime(f->dump_stream("expire_time"));
2588 f->close_section(); //entry
2589 }
2590 f->close_section(); //blacklist
2591 } else if (admin_command == "dump_watchers") {
2592 list<obj_watch_item_t> watchers;
2593 // scan pg's
2594 vector<PGRef> pgs;
2595 _get_pgs(&pgs);
2596 for (auto& pg : pgs) {
2597 list<obj_watch_item_t> pg_watchers;
2598 pg->get_watchers(&pg_watchers);
2599 watchers.splice(watchers.end(), pg_watchers);
2600 }
2601
2602 f->open_array_section("watchers");
2603 for (list<obj_watch_item_t>::iterator it = watchers.begin();
2604 it != watchers.end(); ++it) {
2605
2606 f->open_object_section("watch");
2607
2608 f->dump_string("namespace", it->obj.nspace);
2609 f->dump_string("object", it->obj.oid.name);
2610
2611 f->open_object_section("entity_name");
2612 it->wi.name.dump(f);
2613 f->close_section(); //entity_name_t
2614
2615 f->dump_unsigned("cookie", it->wi.cookie);
2616 f->dump_unsigned("timeout", it->wi.timeout_seconds);
2617
2618 f->open_object_section("entity_addr_t");
2619 it->wi.addr.dump(f);
2620 f->close_section(); //entity_addr_t
2621
2622 f->close_section(); //watch
2623 }
2624
2625 f->close_section(); //watchers
2626 } else if (admin_command == "dump_recovery_reservations") {
2627 f->open_object_section("reservations");
2628 f->open_object_section("local_reservations");
2629 service.local_reserver.dump(f);
2630 f->close_section();
2631 f->open_object_section("remote_reservations");
2632 service.remote_reserver.dump(f);
2633 f->close_section();
2634 f->close_section();
2635 } else if (admin_command == "dump_scrub_reservations") {
2636 f->open_object_section("scrub_reservations");
2637 service.dump_scrub_reservations(f);
2638 f->close_section();
2639 } else if (admin_command == "get_latest_osdmap") {
2640 get_latest_osdmap();
2641 } else if (admin_command == "heap") {
2642 auto result = ceph::osd_cmds::heap(*cct, cmdmap, *f, ss);
2643
2644 // Note: Failed heap profile commands won't necessarily trigger an error:
2645 f->open_object_section("result");
2646 f->dump_string("error", cpp_strerror(result));
2647 f->dump_bool("success", result >= 0);
2648 f->close_section();
2649 } else if (admin_command == "set_heap_property") {
2650 string property;
2651 int64_t value = 0;
2652 string error;
2653 bool success = false;
2654 if (!cmd_getval(cct, cmdmap, "property", property)) {
2655 error = "unable to get property";
2656 success = false;
2657 } else if (!cmd_getval(cct, cmdmap, "value", value)) {
2658 error = "unable to get value";
2659 success = false;
2660 } else if (value < 0) {
2661 error = "negative value not allowed";
2662 success = false;
2663 } else if (!ceph_heap_set_numeric_property(property.c_str(), (size_t)value)) {
2664 error = "invalid property";
2665 success = false;
2666 } else {
2667 success = true;
2668 }
2669 f->open_object_section("result");
2670 f->dump_string("error", error);
2671 f->dump_bool("success", success);
2672 f->close_section();
2673 } else if (admin_command == "get_heap_property") {
2674 string property;
2675 size_t value = 0;
2676 string error;
2677 bool success = false;
2678 if (!cmd_getval(cct, cmdmap, "property", property)) {
2679 error = "unable to get property";
2680 success = false;
2681 } else if (!ceph_heap_get_numeric_property(property.c_str(), &value)) {
2682 error = "invalid property";
2683 success = false;
2684 } else {
2685 success = true;
2686 }
2687 f->open_object_section("result");
2688 f->dump_string("error", error);
2689 f->dump_bool("success", success);
2690 f->dump_int("value", value);
2691 f->close_section();
2692 } else if (admin_command == "dump_objectstore_kv_stats") {
2693 store->get_db_statistics(f);
2694 } else if (admin_command == "dump_scrubs") {
2695 service.dumps_scrub(f);
2696 } else if (admin_command == "calc_objectstore_db_histogram") {
2697 store->generate_db_histogram(f);
2698 } else if (admin_command == "flush_store_cache") {
2699 store->flush_cache(&ss);
2700 } else if (admin_command == "dump_pgstate_history") {
2701 f->open_object_section("pgstate_history");
2702 vector<PGRef> pgs;
2703 _get_pgs(&pgs);
2704 for (auto& pg : pgs) {
2705 f->dump_stream("pg") << pg->pg_id;
2706 pg->dump_pgstate_history(f);
2707 }
2708 f->close_section();
2709 } else if (admin_command == "compact") {
2710 dout(1) << "triggering manual compaction" << dendl;
2711 auto start = ceph::coarse_mono_clock::now();
2712 store->compact();
2713 auto end = ceph::coarse_mono_clock::now();
2714 double duration = std::chrono::duration<double>(end-start).count();
2715 dout(1) << "finished manual compaction in "
2716 << duration
2717 << " seconds" << dendl;
2718 f->open_object_section("compact_result");
2719 f->dump_float("elapsed_time", duration);
2720 f->close_section();
2721 } else if (admin_command == "get_mapped_pools") {
2722 f->open_array_section("mapped_pools");
2723 set<int64_t> poollist = get_mapped_pools();
2724 for (auto pool : poollist) {
2725 f->dump_int("pool_id", pool);
2726 }
2727 f->close_section();
2728 } else if (admin_command == "smart") {
2729 string devid;
2730 cmd_getval(cct, cmdmap, "devid", devid);
2731 probe_smart(devid, ss);
2732 } else if (admin_command == "list_devices") {
2733 set<string> devnames;
2734 store->get_devices(&devnames);
2735 f->open_object_section("list_devices");
2736 for (auto dev : devnames) {
2737 if (dev.find("dm-") == 0) {
2738 continue;
2739 }
2740 f->dump_string("device", "/dev/" + dev);
2741 }
2742 f->close_section();
2743 } else if (admin_command == "send_beacon") {
2744 if (is_active()) {
2745 send_beacon(ceph::coarse_mono_clock::now());
2746 }
2747 } else if (admin_command == "dump_osd_network") {
2748 int64_t value = 0;
2749 if (!(cmd_getval(cct, cmdmap, "value", value))) {
2750 // Convert milliseconds to microseconds
2751 value = static_cast<int64_t>(g_conf().get_val<double>("mon_warn_on_slow_ping_time")) * 1000;
2752 if (value == 0) {
2753 double ratio = g_conf().get_val<double>("mon_warn_on_slow_ping_ratio");
2754 value = g_conf().get_val<int64_t>("osd_heartbeat_grace");
2755 value *= 1000000 * ratio; // Seconds of grace to microseconds at ratio
2756 }
2757 } else {
2758 // Convert user input to microseconds
2759 value *= 1000;
2760 }
2761 if (value < 0) value = 0;
2762
2763 struct osd_ping_time_t {
2764 uint32_t pingtime;
2765 int to;
2766 bool back;
2767 std::array<uint32_t,3> times;
2768 std::array<uint32_t,3> min;
2769 std::array<uint32_t,3> max;
2770 uint32_t last;
2771 uint32_t last_update;
2772
2773 bool operator<(const osd_ping_time_t& rhs) const {
2774 if (pingtime < rhs.pingtime)
2775 return true;
2776 if (pingtime > rhs.pingtime)
2777 return false;
2778 if (to < rhs.to)
2779 return true;
2780 if (to > rhs.to)
2781 return false;
2782 return back;
2783 }
2784 };
2785
2786 set<osd_ping_time_t> sorted;
2787 // Get pingtimes under lock and not on the stack
2788 map<int, osd_stat_t::Interfaces> *pingtimes = new map<int, osd_stat_t::Interfaces>;
2789 service.get_hb_pingtime(pingtimes);
2790 for (auto j : *pingtimes) {
2791 if (j.second.last_update == 0)
2792 continue;
2793 osd_ping_time_t item;
2794 item.pingtime = std::max(j.second.back_pingtime[0], j.second.back_pingtime[1]);
2795 item.pingtime = std::max(item.pingtime, j.second.back_pingtime[2]);
2796 if (item.pingtime >= value) {
2797 item.to = j.first;
2798 item.times[0] = j.second.back_pingtime[0];
2799 item.times[1] = j.second.back_pingtime[1];
2800 item.times[2] = j.second.back_pingtime[2];
2801 item.min[0] = j.second.back_min[0];
2802 item.min[1] = j.second.back_min[1];
2803 item.min[2] = j.second.back_min[2];
2804 item.max[0] = j.second.back_max[0];
2805 item.max[1] = j.second.back_max[1];
2806 item.max[2] = j.second.back_max[2];
2807 item.last = j.second.back_last;
2808 item.back = true;
2809 item.last_update = j.second.last_update;
2810 sorted.emplace(item);
2811 }
2812 if (j.second.front_last == 0)
2813 continue;
2814 item.pingtime = std::max(j.second.front_pingtime[0], j.second.front_pingtime[1]);
2815 item.pingtime = std::max(item.pingtime, j.second.front_pingtime[2]);
2816 if (item.pingtime >= value) {
2817 item.to = j.first;
2818 item.times[0] = j.second.front_pingtime[0];
2819 item.times[1] = j.second.front_pingtime[1];
2820 item.times[2] = j.second.front_pingtime[2];
2821 item.min[0] = j.second.front_min[0];
2822 item.min[1] = j.second.front_min[1];
2823 item.min[2] = j.second.front_min[2];
2824 item.max[0] = j.second.front_max[0];
2825 item.max[1] = j.second.front_max[1];
2826 item.max[2] = j.second.front_max[2];
2827 item.last = j.second.front_last;
2828 item.last_update = j.second.last_update;
2829 item.back = false;
2830 sorted.emplace(item);
2831 }
2832 }
2833 delete pingtimes;
2834 //
2835 // Network ping times (1min 5min 15min)
2836 f->open_object_section("network_ping_times");
2837 f->dump_int("threshold", value / 1000);
2838 f->open_array_section("entries");
2839 for (auto &sitem : boost::adaptors::reverse(sorted)) {
2840 ceph_assert(sitem.pingtime >= value);
2841 f->open_object_section("entry");
2842
2843 const time_t lu(sitem.last_update);
2844 char buffer[26];
2845 string lustr(ctime_r(&lu, buffer));
2846 lustr.pop_back(); // Remove trailing \n
2847 auto stale = cct->_conf.get_val<int64_t>("osd_heartbeat_stale");
2848 f->dump_string("last update", lustr);
2849 f->dump_bool("stale", ceph_clock_now().sec() - sitem.last_update > stale);
2850 f->dump_int("from osd", whoami);
2851 f->dump_int("to osd", sitem.to);
2852 f->dump_string("interface", (sitem.back ? "back" : "front"));
2853 f->open_object_section("average");
2854 f->dump_format_unquoted("1min", "%s", fixed_u_to_string(sitem.times[0],3).c_str());
2855 f->dump_format_unquoted("5min", "%s", fixed_u_to_string(sitem.times[1],3).c_str());
2856 f->dump_format_unquoted("15min", "%s", fixed_u_to_string(sitem.times[2],3).c_str());
2857 f->close_section(); // average
2858 f->open_object_section("min");
2859 f->dump_format_unquoted("1min", "%s", fixed_u_to_string(sitem.max[0],3).c_str());
2860 f->dump_format_unquoted("5min", "%s", fixed_u_to_string(sitem.max[1],3).c_str());
2861 f->dump_format_unquoted("15min", "%s", fixed_u_to_string(sitem.max[2],3).c_str());
2862 f->close_section(); // min
2863 f->open_object_section("max");
2864 f->dump_format_unquoted("1min", "%s", fixed_u_to_string(sitem.max[0],3).c_str());
2865 f->dump_format_unquoted("5min", "%s", fixed_u_to_string(sitem.max[1],3).c_str());
2866 f->dump_format_unquoted("15min", "%s", fixed_u_to_string(sitem.max[2],3).c_str());
2867 f->close_section(); // max
2868 f->dump_format_unquoted("last", "%s", fixed_u_to_string(sitem.last,3).c_str());
2869 f->close_section(); // entry
2870 }
2871 f->close_section(); // entries
2872 f->close_section(); // network_ping_times
2873 } else {
2874 ceph_abort_msg("broken asok registration");
2875 }
2876 f->flush(ss);
2877 delete f;
2878 return true;
2879 }
2880
2881 class TestOpsSocketHook : public AdminSocketHook {
2882 OSDService *service;
2883 ObjectStore *store;
2884 public:
2885 TestOpsSocketHook(OSDService *s, ObjectStore *st) : service(s), store(st) {}
2886 bool call(std::string_view command, const cmdmap_t& cmdmap,
2887 std::string_view format, bufferlist& out) override {
2888 stringstream ss;
2889 try {
2890 test_ops(service, store, command, cmdmap, ss);
2891 } catch (const bad_cmd_get& e) {
2892 ss << e.what();
2893 }
2894 out.append(ss);
2895 return true;
2896 }
2897 void test_ops(OSDService *service, ObjectStore *store,
2898 std::string_view command, const cmdmap_t& cmdmap, ostream &ss);
2899
2900 };
2901
// Timer callback that drives OSD::tick() (scheduled via tick_timer,
// which was constructed against osd_lock).
class OSD::C_Tick : public Context {
  OSD *osd;
public:
  explicit C_Tick(OSD *o) : osd(o) {}
  void finish(int r) override {
    osd->tick();
  }
};
2910
// Timer callback that drives OSD::tick_without_osd_lock() (scheduled
// via tick_timer_without_osd_lock, which uses its own tick_timer_lock).
class OSD::C_Tick_WithoutOSDLock : public Context {
  OSD *osd;
public:
  explicit C_Tick_WithoutOSDLock(OSD *o) : osd(o) {}
  void finish(int r) override {
    osd->tick_without_osd_lock();
  }
};
2919
// Mount or unmount the FuseStore view of the objectstore at
// $osd_data/fuse, driven by the osd_objectstore_fuse config option;
// 'stop' forces teardown regardless of config.  Compiled to a no-op
// (returning 0) when built without libfuse.
int OSD::enable_disable_fuse(bool stop)
{
#ifdef HAVE_LIBFUSE
  int r;
  string mntpath = cct->_conf->osd_data + "/fuse";
  if (fuse_store && (stop || !cct->_conf->osd_objectstore_fuse)) {
    // teardown: stop and delete the FuseStore, then remove the mountpoint
    dout(1) << __func__ << " disabling" << dendl;
    fuse_store->stop();
    delete fuse_store;
    fuse_store = NULL;
    r = ::rmdir(mntpath.c_str());
    if (r < 0) {
      r = -errno;
      derr << __func__ << " failed to rmdir " << mntpath << ": "
	   << cpp_strerror(r) << dendl;
      return r;
    }
    return 0;
  }
  if (!fuse_store && cct->_conf->osd_objectstore_fuse) {
    dout(1) << __func__ << " enabling" << dendl;
    r = ::mkdir(mntpath.c_str(), 0700);
    if (r < 0)
      r = -errno;
    // a pre-existing mount directory is fine
    if (r < 0 && r != -EEXIST) {
      derr << __func__ << " unable to create " << mntpath << ": "
	   << cpp_strerror(r) << dendl;
      return r;
    }
    fuse_store = new FuseStore(store, mntpath);
    r = fuse_store->start();
    if (r < 0) {
      derr << __func__ << " unable to start fuse: " << cpp_strerror(r) << dendl;
      delete fuse_store;
      fuse_store = NULL;
      return r;
    }
  }
#endif  // HAVE_LIBFUSE
  return 0;
}
2961
2962 int OSD::get_num_op_shards()
2963 {
2964 if (cct->_conf->osd_op_num_shards)
2965 return cct->_conf->osd_op_num_shards;
2966 if (store_is_rotational)
2967 return cct->_conf->osd_op_num_shards_hdd;
2968 else
2969 return cct->_conf->osd_op_num_shards_ssd;
2970 }
2971
2972 int OSD::get_num_op_threads()
2973 {
2974 if (cct->_conf->osd_op_num_threads_per_shard)
2975 return get_num_op_shards() * cct->_conf->osd_op_num_threads_per_shard;
2976 if (store_is_rotational)
2977 return get_num_op_shards() * cct->_conf->osd_op_num_threads_per_shard_hdd;
2978 else
2979 return get_num_op_shards() * cct->_conf->osd_op_num_threads_per_shard_ssd;
2980 }
2981
2982 float OSD::get_osd_recovery_sleep()
2983 {
2984 if (cct->_conf->osd_recovery_sleep)
2985 return cct->_conf->osd_recovery_sleep;
2986 if (!store_is_rotational && !journal_is_rotational)
2987 return cct->_conf->osd_recovery_sleep_ssd;
2988 else if (store_is_rotational && !journal_is_rotational)
2989 return cct->_conf.get_val<double>("osd_recovery_sleep_hybrid");
2990 else
2991 return cct->_conf->osd_recovery_sleep_hdd;
2992 }
2993
2994 float OSD::get_osd_delete_sleep()
2995 {
2996 float osd_delete_sleep = cct->_conf.get_val<double>("osd_delete_sleep");
2997 if (osd_delete_sleep > 0)
2998 return osd_delete_sleep;
2999 if (!store_is_rotational && !journal_is_rotational)
3000 return cct->_conf.get_val<double>("osd_delete_sleep_ssd");
3001 if (store_is_rotational && !journal_is_rotational)
3002 return cct->_conf.get_val<double>("osd_delete_sleep_hybrid");
3003 return cct->_conf.get_val<double>("osd_delete_sleep_hdd");
3004 }
3005
3006 float OSD::get_osd_snap_trim_sleep()
3007 {
3008 float osd_snap_trim_sleep = cct->_conf.get_val<double>("osd_snap_trim_sleep");
3009 if (osd_snap_trim_sleep > 0)
3010 return osd_snap_trim_sleep;
3011 if (!store_is_rotational && !journal_is_rotational)
3012 return cct->_conf.get_val<double>("osd_snap_trim_sleep_ssd");
3013 if (store_is_rotational && !journal_is_rotational)
3014 return cct->_conf.get_val<double>("osd_snap_trim_sleep_hybrid");
3015 return cct->_conf.get_val<double>("osd_snap_trim_sleep_hdd");
3016 }
3017
// Bring the OSD daemon up: mount the object store, validate the
// superblock and compat features, load PGs, wire up messengers and
// the mon/mgr clients, authenticate, and kick off the boot process.
//
// Called once at startup with no locks held; takes osd_lock for the
// duration except around the blocking authentication phase (see the
// explicit Unlock()/Lock() below).  Returns 0 on success or when the
// OSD is already stopping; on failure returns a negative errno after
// unmounting and deleting the store (via the `out:` label).
int OSD::init()
{
  CompatSet initial, diff;
  std::lock_guard lock(osd_lock);
  if (is_stopping())
    return 0;

  // timers and the boot finisher must exist before anything can
  // schedule work on them
  tick_timer.init();
  tick_timer_without_osd_lock.init();
  service.recovery_request_timer.init();
  service.sleep_timer.init();

  boot_finisher.start();

  {
    string val;
    store->read_meta("require_osd_release", &val);
    last_require_osd_release = atoi(val.c_str());
  }

  // mount.
  dout(2) << "init " << dev_path
	  << " (looks like " << (store_is_rotational ? "hdd" : "ssd") << ")"
	  << dendl;
  dout(2) << "journal " << journal_path << dendl;
  ceph_assert(store);  // call pre_init() first!

  store->set_cache_shards(get_num_op_shards());

  int r = store->mount();
  if (r < 0) {
    derr << "OSD:init: unable to mount object store" << dendl;
    return r;
  }
  // only known after mount()
  journal_is_rotational = store->is_journal_rotational();
  dout(2) << "journal looks like " << (journal_is_rotational ? "hdd" : "ssd")
          << dendl;

  enable_disable_fuse(false);

  dout(2) << "boot" << dendl;

  service.meta_ch = store->open_collection(coll_t::meta());

  // initialize the daily loadavg with current 15min loadavg
  double loadavgs[3];
  if (getloadavg(loadavgs, 3) == 3) {
    daily_loadavg = loadavgs[2];
  } else {
    derr << "OSD::init() : couldn't read loadavgs\n" << dendl;
    daily_loadavg = 1.0;
  }

  int rotating_auth_attempts = 0;
  auto rotating_auth_timeout =
    g_conf().get_val<int64_t>("rotating_keys_bootstrap_timeout");

  // sanity check long object name handling: build a maximally-long
  // hobject and let the backend veto it up front rather than failing
  // on a client write later
  {
    hobject_t l;
    l.oid.name = string(cct->_conf->osd_max_object_name_len, 'n');
    l.set_key(string(cct->_conf->osd_max_object_name_len, 'k'));
    l.nspace = string(cct->_conf->osd_max_object_namespace_len, 's');
    r = store->validate_hobject_key(l);
    if (r < 0) {
      derr << "backend (" << store->get_type() << ") is unable to support max "
	   << "object name[space] len" << dendl;
      derr << "   osd max object name len = "
	   << cct->_conf->osd_max_object_name_len << dendl;
      derr << "   osd max object namespace len = "
	   << cct->_conf->osd_max_object_namespace_len << dendl;
      derr << cpp_strerror(r) << dendl;
      if (cct->_conf->osd_check_max_object_name_len_on_startup) {
	goto out;
      }
      derr << "osd_check_max_object_name_len_on_startup = false, starting anyway"
	   << dendl;
    } else {
      dout(20) << "configured osd_max_object_name[space]_len looks ok" << dendl;
    }
  }

  // read superblock
  r = read_superblock();
  if (r < 0) {
    derr << "OSD::init() : unable to read osd superblock" << dendl;
    r = -EINVAL;
    goto out;
  }

  // refuse to run against a store written with features this binary
  // does not understand
  if (osd_compat.compare(superblock.compat_features) < 0) {
    derr << "The disk uses features unsupported by the executable." << dendl;
    derr << " ondisk features " << superblock.compat_features << dendl;
    derr << " daemon features " << osd_compat << dendl;

    if (osd_compat.writeable(superblock.compat_features)) {
      CompatSet diff = osd_compat.unsupported(superblock.compat_features);
      derr << "it is still writeable, though. Missing features: " << diff << dendl;
      r = -EOPNOTSUPP;
      goto out;
    }
    else {
      CompatSet diff = osd_compat.unsupported(superblock.compat_features);
      derr << "Cannot write to disk! Missing features: " << diff << dendl;
      r = -EOPNOTSUPP;
      goto out;
    }
  }

  // sanity: the store must belong to this osd id
  assert_warn(whoami == superblock.whoami);
  if (whoami != superblock.whoami) {
    derr << "OSD::init: superblock says osd"
	 << superblock.whoami << " but I am osd." << whoami << dendl;
    r = -EINVAL;
    goto out;
  }

  // load up "current" osdmap
  assert_warn(!osdmap);
  if (osdmap) {
    derr << "OSD::init: unable to read current osdmap" << dendl;
    r = -EINVAL;
    goto out;
  }
  osdmap = get_map(superblock.current_epoch);

  // make sure we don't have legacy pgs deleting
  {
    vector<coll_t> ls;
    int r = store->list_collections(ls);
    ceph_assert(r >= 0);
    for (auto c : ls) {
      spg_t pgid;
      if (c.is_pg(&pgid) &&
	  !osdmap->have_pg_pool(pgid.pool())) {
	ghobject_t oid = make_final_pool_info_oid(pgid.pool());
	if (!store->exists(service.meta_ch, oid)) {
	  derr << __func__ << " missing pg_pool_t for deleted pool "
	       << pgid.pool() << " for pg " << pgid
	       << "; please downgrade to luminous and allow "
	       << "pg deletion to complete before upgrading" << dendl;
	  ceph_abort();
	}
      }
    }
  }

  initial = get_osd_initial_compat_set();
  diff = superblock.compat_features.unsupported(initial);
  if (superblock.compat_features.merge(initial)) {
    // We need to persist the new compat_set before we
    // do anything else
    dout(5) << "Upgrading superblock adding: " << diff << dendl;
    ObjectStore::Transaction t;
    write_superblock(t);
    r = store->queue_transaction(service.meta_ch, std::move(t));
    if (r < 0)
      goto out;
  }

  // make sure snap mapper object exists
  if (!store->exists(service.meta_ch, OSD::make_snapmapper_oid())) {
    dout(10) << "init creating/touching snapmapper object" << dendl;
    ObjectStore::Transaction t;
    t.touch(coll_t::meta(), OSD::make_snapmapper_oid());
    r = store->queue_transaction(service.meta_ch, std::move(t));
    if (r < 0)
      goto out;
  }

  // rados class (cls_*) plugin infrastructure
  class_handler = new ClassHandler(cct);
  cls_initialize(class_handler);

  if (cct->_conf->osd_open_classes_on_start) {
    int r = class_handler->open_all_classes();
    if (r)
      dout(1) << "warning: got an error loading one or more classes: " << cpp_strerror(r) << dendl;
  }

  check_osdmap_features();

  create_recoverystate_perf();

  {
    epoch_t bind_epoch = osdmap->get_epoch();
    service.set_epochs(NULL, NULL, &bind_epoch);
  }

  clear_temp_objects();

  // initialize osdmap references in sharded wq
  for (auto& shard : shards) {
    std::lock_guard l(shard->osdmap_lock);
    shard->shard_osdmap = osdmap;
  }

  // load up pgs (as they previously existed)
  load_pgs();

  dout(2) << "superblock: I am osd." << superblock.whoami << dendl;
  dout(0) << "using " << op_queue << " op queue with priority op cut off at " <<
    op_prio_cutoff << "." << dendl;

  create_logger();

  // prime osd stats
  {
    struct store_statfs_t stbuf;
    osd_alert_list_t alerts;
    int r = store->statfs(&stbuf, &alerts);
    ceph_assert(r == 0);
    service.set_statfs(stbuf, alerts);
  }

  // client_messenger auth_client is already set up by monc.
  for (auto m : { cluster_messenger,
	objecter_messenger,
	hb_front_client_messenger,
	hb_back_client_messenger,
	hb_front_server_messenger,
	hb_back_server_messenger } ) {
    m->set_auth_client(monc);
  }
  for (auto m : { client_messenger,
	cluster_messenger,
	hb_front_server_messenger,
	hb_back_server_messenger }) {
    m->set_auth_server(monc);
  }
  monc->set_handle_authentication_dispatcher(this);

  // we need mon, osd, and mgr keys to operate
  monc->set_want_keys(CEPH_ENTITY_TYPE_MON | CEPH_ENTITY_TYPE_OSD
                      | CEPH_ENTITY_TYPE_MGR);
  r = monc->init();
  if (r < 0)
    goto out;

  // wire the mgr client callbacks before starting it
  mgrc.set_pgstats_cb([this](){ return collect_pg_stats(); });
  mgrc.set_perf_metric_query_cb(
    [this](const std::map<OSDPerfMetricQuery, OSDPerfMetricLimits> &queries) {
      set_perf_queries(queries);
    },
    [this](std::map<OSDPerfMetricQuery, OSDPerfMetricReport> *reports) {
      get_perf_reports(reports);
    });
  mgrc.init();

  // tell monc about log_client so it will know about mon session resets
  monc->set_log_client(&log_client);
  update_log_config();

  // i'm ready!
  client_messenger->add_dispatcher_tail(&mgrc);
  client_messenger->add_dispatcher_tail(this);
  cluster_messenger->add_dispatcher_head(this);

  hb_front_client_messenger->add_dispatcher_head(&heartbeat_dispatcher);
  hb_back_client_messenger->add_dispatcher_head(&heartbeat_dispatcher);
  hb_front_server_messenger->add_dispatcher_head(&heartbeat_dispatcher);
  hb_back_server_messenger->add_dispatcher_head(&heartbeat_dispatcher);

  objecter_messenger->add_dispatcher_head(service.objecter);

  service.init();
  service.publish_map(osdmap);
  service.publish_superblock(superblock);
  service.max_oldest_map = superblock.oldest_map;

  // prime each shard with any pending PG splits/merges implied by the
  // gap between each PG's map epoch and the current osdmap
  for (auto& shard : shards) {
    // put PGs in a temporary set because we may modify pg_slots
    // unordered_map below.
    set<PGRef> pgs;
    for (auto& i : shard->pg_slots) {
      PGRef pg = i.second->pg;
      if (!pg) {
	continue;
      }
      pgs.insert(pg);
    }
    for (auto pg : pgs) {
      pg->lock();
      set<pair<spg_t,epoch_t>> new_children;
      set<pair<spg_t,epoch_t>> merge_pgs;
      service.identify_splits_and_merges(pg->get_osdmap(), osdmap, pg->pg_id,
					 &new_children, &merge_pgs);
      if (!new_children.empty()) {
	for (auto shard : shards) {
	  shard->prime_splits(osdmap, &new_children);
	}
	// every child must have been claimed by some shard
	assert(new_children.empty());
      }
      if (!merge_pgs.empty()) {
	for (auto shard : shards) {
	  shard->prime_merges(osdmap, &merge_pgs);
	}
	// every merge participant must have been claimed by some shard
	assert(merge_pgs.empty());
      }
      pg->unlock();
    }
  }

  osd_op_tp.start();
  command_tp.start();

  // start the heartbeat
  heartbeat_thread.create("osd_srv_heartbt");

  // tick
  tick_timer.add_event_after(get_tick_interval(),
			     new C_Tick(this));
  {
    std::lock_guard l(tick_timer_lock);
    tick_timer_without_osd_lock.add_event_after(get_tick_interval(),
						new C_Tick_WithoutOSDLock(this));
  }

  // drop osd_lock across the blocking auth/crush setup phase so we do
  // not stall message dispatch; reacquired below
  osd_lock.Unlock();

  r = monc->authenticate();
  if (r < 0) {
    derr << __func__ << " authentication failed: " << cpp_strerror(r)
         << dendl;
    exit(1);
  }

  while (monc->wait_auth_rotating(rotating_auth_timeout) < 0) {
    derr << "unable to obtain rotating service keys; retrying" << dendl;
    ++rotating_auth_attempts;
    if (rotating_auth_attempts > g_conf()->max_rotating_auth_attempts) {
      derr << __func__ << " wait_auth_rotating timed out" << dendl;
      exit(1);
    }
  }

  r = update_crush_device_class();
  if (r < 0) {
    derr << __func__ << " unable to update_crush_device_class: "
	 << cpp_strerror(r) << dendl;
    exit(1);
  }

  r = update_crush_location();
  if (r < 0) {
    derr << __func__ << " unable to update_crush_location: "
	 << cpp_strerror(r) << dendl;
    exit(1);
  }

  osd_lock.Lock();
  // we may have been asked to stop while the lock was dropped
  if (is_stopping())
    return 0;

  // start objecter *after* we have authenticated, so that we don't ignore
  // the OSDMaps it requests.
  service.final_init();

  check_config();

  dout(10) << "ensuring pgs have consumed prior maps" << dendl;
  consume_map();

  dout(0) << "done with init, starting boot process" << dendl;

  // subscribe to any pg creations
  monc->sub_want("osd_pg_creates", last_pg_create_epoch, 0);

  // MgrClient needs this (it doesn't have MonClient reference itself)
  monc->sub_want("mgrmap", 0, 0);

  // we don't need to ask for an osdmap here; objecter will
  //monc->sub_want("osdmap", osdmap->get_epoch(), CEPH_SUBSCRIBE_ONETIME);

  monc->renew_subs();

  start_boot();

  return 0;

out:
  // error path: tear down everything mounted/opened above
  enable_disable_fuse(true);
  store->umount();
  delete store;
  store = NULL;
  return r;
}
3403
3404 void OSD::final_init()
3405 {
3406 AdminSocket *admin_socket = cct->get_admin_socket();
3407 asok_hook = new OSDSocketHook(this);
3408 int r = admin_socket->register_command("status", "status", asok_hook,
3409 "high-level status of OSD");
3410 ceph_assert(r == 0);
3411 r = admin_socket->register_command("flush_journal", "flush_journal",
3412 asok_hook,
3413 "flush the journal to permanent store");
3414 ceph_assert(r == 0);
3415 r = admin_socket->register_command("dump_ops_in_flight",
3416 "dump_ops_in_flight " \
3417 "name=filterstr,type=CephString,n=N,req=false",
3418 asok_hook,
3419 "show the ops currently in flight");
3420 ceph_assert(r == 0);
3421 r = admin_socket->register_command("ops",
3422 "ops " \
3423 "name=filterstr,type=CephString,n=N,req=false",
3424 asok_hook,
3425 "show the ops currently in flight");
3426 ceph_assert(r == 0);
3427 r = admin_socket->register_command("dump_blocked_ops",
3428 "dump_blocked_ops " \
3429 "name=filterstr,type=CephString,n=N,req=false",
3430 asok_hook,
3431 "show the blocked ops currently in flight");
3432 ceph_assert(r == 0);
3433 r = admin_socket->register_command("dump_historic_ops",
3434 "dump_historic_ops " \
3435 "name=filterstr,type=CephString,n=N,req=false",
3436 asok_hook,
3437 "show recent ops");
3438 ceph_assert(r == 0);
3439 r = admin_socket->register_command("dump_historic_slow_ops",
3440 "dump_historic_slow_ops " \
3441 "name=filterstr,type=CephString,n=N,req=false",
3442 asok_hook,
3443 "show slowest recent ops");
3444 ceph_assert(r == 0);
3445 r = admin_socket->register_command("dump_historic_ops_by_duration",
3446 "dump_historic_ops_by_duration " \
3447 "name=filterstr,type=CephString,n=N,req=false",
3448 asok_hook,
3449 "show slowest recent ops, sorted by duration");
3450 ceph_assert(r == 0);
3451 r = admin_socket->register_command("dump_op_pq_state", "dump_op_pq_state",
3452 asok_hook,
3453 "dump op priority queue state");
3454 ceph_assert(r == 0);
3455 r = admin_socket->register_command("dump_blacklist", "dump_blacklist",
3456 asok_hook,
3457 "dump blacklisted clients and times");
3458 ceph_assert(r == 0);
3459 r = admin_socket->register_command("dump_watchers", "dump_watchers",
3460 asok_hook,
3461 "show clients which have active watches,"
3462 " and on which objects");
3463 ceph_assert(r == 0);
3464 r = admin_socket->register_command("dump_recovery_reservations", "dump_recovery_reservations",
3465 asok_hook,
3466 "show recovery reservations");
3467 ceph_assert(r == 0);
3468 r = admin_socket->register_command("dump_scrub_reservations", "dump_scrub_reservations",
3469 asok_hook,
3470 "show scrub reservations");
3471 ceph_assert(r == 0);
3472 r = admin_socket->register_command("get_latest_osdmap", "get_latest_osdmap",
3473 asok_hook,
3474 "force osd to update the latest map from "
3475 "the mon");
3476 ceph_assert(r == 0);
3477
3478 r = admin_socket->register_command( "heap",
3479 "heap " \
3480 "name=heapcmd,type=CephString " \
3481 "name=value,type=CephString,req=false",
3482 asok_hook,
3483 "show heap usage info (available only if "
3484 "compiled with tcmalloc)");
3485 ceph_assert(r == 0);
3486
3487 r = admin_socket->register_command("set_heap_property",
3488 "set_heap_property " \
3489 "name=property,type=CephString " \
3490 "name=value,type=CephInt",
3491 asok_hook,
3492 "update malloc extension heap property");
3493 ceph_assert(r == 0);
3494
3495 r = admin_socket->register_command("get_heap_property",
3496 "get_heap_property " \
3497 "name=property,type=CephString",
3498 asok_hook,
3499 "get malloc extension heap property");
3500 ceph_assert(r == 0);
3501
3502 r = admin_socket->register_command("dump_objectstore_kv_stats",
3503 "dump_objectstore_kv_stats",
3504 asok_hook,
3505 "print statistics of kvdb which used by bluestore");
3506 ceph_assert(r == 0);
3507
3508 r = admin_socket->register_command("dump_scrubs",
3509 "dump_scrubs",
3510 asok_hook,
3511 "print scheduled scrubs");
3512 ceph_assert(r == 0);
3513
3514 r = admin_socket->register_command("calc_objectstore_db_histogram",
3515 "calc_objectstore_db_histogram",
3516 asok_hook,
3517 "Generate key value histogram of kvdb(rocksdb) which used by bluestore");
3518 ceph_assert(r == 0);
3519
3520 r = admin_socket->register_command("flush_store_cache",
3521 "flush_store_cache",
3522 asok_hook,
3523 "Flush bluestore internal cache");
3524 ceph_assert(r == 0);
3525 r = admin_socket->register_command("dump_pgstate_history", "dump_pgstate_history",
3526 asok_hook,
3527 "show recent state history");
3528 ceph_assert(r == 0);
3529
3530 r = admin_socket->register_command("compact", "compact",
3531 asok_hook,
3532 "Commpact object store's omap."
3533 " WARNING: Compaction probably slows your requests");
3534 ceph_assert(r == 0);
3535
3536 r = admin_socket->register_command("get_mapped_pools", "get_mapped_pools",
3537 asok_hook,
3538 "dump pools whose PG(s) are mapped to this OSD.");
3539
3540 ceph_assert(r == 0);
3541
3542 r = admin_socket->register_command("smart", "smart name=devid,type=CephString,req=False",
3543 asok_hook,
3544 "probe OSD devices for SMART data.");
3545
3546 ceph_assert(r == 0);
3547
3548 r = admin_socket->register_command("list_devices", "list_devices",
3549 asok_hook,
3550 "list OSD devices.");
3551 r = admin_socket->register_command("send_beacon", "send_beacon",
3552 asok_hook,
3553 "send OSD beacon to mon immediately");
3554
3555 r = admin_socket->register_command("dump_osd_network", "dump_osd_network name=value,type=CephInt,req=false", asok_hook,
3556 "Dump osd heartbeat network ping times");
3557 ceph_assert(r == 0);
3558
3559 test_ops_hook = new TestOpsSocketHook(&(this->service), this->store);
3560 // Note: pools are CephString instead of CephPoolname because
3561 // these commands traditionally support both pool names and numbers
3562 r = admin_socket->register_command(
3563 "setomapval",
3564 "setomapval " \
3565 "name=pool,type=CephString " \
3566 "name=objname,type=CephObjectname " \
3567 "name=key,type=CephString "\
3568 "name=val,type=CephString",
3569 test_ops_hook,
3570 "set omap key");
3571 ceph_assert(r == 0);
3572 r = admin_socket->register_command(
3573 "rmomapkey",
3574 "rmomapkey " \
3575 "name=pool,type=CephString " \
3576 "name=objname,type=CephObjectname " \
3577 "name=key,type=CephString",
3578 test_ops_hook,
3579 "remove omap key");
3580 ceph_assert(r == 0);
3581 r = admin_socket->register_command(
3582 "setomapheader",
3583 "setomapheader " \
3584 "name=pool,type=CephString " \
3585 "name=objname,type=CephObjectname " \
3586 "name=header,type=CephString",
3587 test_ops_hook,
3588 "set omap header");
3589 ceph_assert(r == 0);
3590
3591 r = admin_socket->register_command(
3592 "getomap",
3593 "getomap " \
3594 "name=pool,type=CephString " \
3595 "name=objname,type=CephObjectname",
3596 test_ops_hook,
3597 "output entire object map");
3598 ceph_assert(r == 0);
3599
3600 r = admin_socket->register_command(
3601 "truncobj",
3602 "truncobj " \
3603 "name=pool,type=CephString " \
3604 "name=objname,type=CephObjectname " \
3605 "name=len,type=CephInt",
3606 test_ops_hook,
3607 "truncate object to length");
3608 ceph_assert(r == 0);
3609
3610 r = admin_socket->register_command(
3611 "injectdataerr",
3612 "injectdataerr " \
3613 "name=pool,type=CephString " \
3614 "name=objname,type=CephObjectname " \
3615 "name=shardid,type=CephInt,req=false,range=0|255",
3616 test_ops_hook,
3617 "inject data error to an object");
3618 ceph_assert(r == 0);
3619
3620 r = admin_socket->register_command(
3621 "injectmdataerr",
3622 "injectmdataerr " \
3623 "name=pool,type=CephString " \
3624 "name=objname,type=CephObjectname " \
3625 "name=shardid,type=CephInt,req=false,range=0|255",
3626 test_ops_hook,
3627 "inject metadata error to an object");
3628 ceph_assert(r == 0);
3629 r = admin_socket->register_command(
3630 "set_recovery_delay",
3631 "set_recovery_delay " \
3632 "name=utime,type=CephInt,req=false",
3633 test_ops_hook,
3634 "Delay osd recovery by specified seconds");
3635 ceph_assert(r == 0);
3636 r = admin_socket->register_command(
3637 "trigger_scrub",
3638 "trigger_scrub " \
3639 "name=pgid,type=CephString " \
3640 "name=time,type=CephInt,req=false",
3641 test_ops_hook,
3642 "Trigger a scheduled scrub ");
3643 ceph_assert(r == 0);
3644 r = admin_socket->register_command(
3645 "trigger_deep_scrub",
3646 "trigger_deep_scrub " \
3647 "name=pgid,type=CephString " \
3648 "name=time,type=CephInt,req=false",
3649 test_ops_hook,
3650 "Trigger a scheduled deep scrub ");
3651 ceph_assert(r == 0);
3652 r = admin_socket->register_command(
3653 "injectfull",
3654 "injectfull " \
3655 "name=type,type=CephString,req=false " \
3656 "name=count,type=CephInt,req=false ",
3657 test_ops_hook,
3658 "Inject a full disk (optional count times)");
3659 ceph_assert(r == 0);
3660 }
3661
// Build and register the "osd" PerfCounters instance (stored in
// `logger`).  Counters are declared in priority order: the basic
// client-op counters first at PRIO_USEFUL (or higher where noted),
// then the more obscure internals at PRIO_DEBUGONLY.  Called once
// from init(); the counters are published via the cct perf
// counters collection at the end.
void OSD::create_logger()
{
  dout(10) << "create_logger" << dendl;

  PerfCountersBuilder osd_plb(cct, "osd", l_osd_first, l_osd_last);

  // Latency axis configuration for op histograms, values are in nanoseconds
  PerfHistogramCommon::axis_config_d op_hist_x_axis_config{
    "Latency (usec)",
    PerfHistogramCommon::SCALE_LOG2, ///< Latency in logarithmic scale
    0,                               ///< Start at 0
    100000,                          ///< Quantization unit is 100usec
    32,                              ///< Enough to cover much longer than slow requests
  };

  // Op size axis configuration for op histograms, values are in bytes
  PerfHistogramCommon::axis_config_d op_hist_y_axis_config{
    "Request size (bytes)",
    PerfHistogramCommon::SCALE_LOG2, ///< Request size in logarithmic scale
    0,                               ///< Start at 0
    512,                             ///< Quantization unit is 512 bytes
    32,                              ///< Enough to cover requests larger than GB
  };


  // All the basic OSD operation stats are to be considered useful
  osd_plb.set_prio_default(PerfCountersBuilder::PRIO_USEFUL);

  // ---- aggregate client op counters and latencies ----
  osd_plb.add_u64(
    l_osd_op_wip, "op_wip",
    "Replication operations currently being processed (primary)");
  osd_plb.add_u64_counter(
    l_osd_op, "op",
    "Client operations",
    "ops", PerfCountersBuilder::PRIO_CRITICAL);
  osd_plb.add_u64_counter(
    l_osd_op_inb, "op_in_bytes",
    "Client operations total write size",
    "wr", PerfCountersBuilder::PRIO_INTERESTING, unit_t(UNIT_BYTES));
  osd_plb.add_u64_counter(
    l_osd_op_outb, "op_out_bytes",
    "Client operations total read size",
    "rd", PerfCountersBuilder::PRIO_INTERESTING, unit_t(UNIT_BYTES));
  osd_plb.add_time_avg(
    l_osd_op_lat, "op_latency",
    "Latency of client operations (including queue time)",
    "l", 9);
  osd_plb.add_time_avg(
    l_osd_op_process_lat, "op_process_latency",
    "Latency of client operations (excluding queue time)");
  osd_plb.add_time_avg(
    l_osd_op_prepare_lat, "op_prepare_latency",
    "Latency of client operations (excluding queue time and wait for finished)");

  // ---- per-op-type (read / write / read-modify-write) breakdowns ----
  osd_plb.add_u64_counter(
    l_osd_op_r, "op_r", "Client read operations");
  osd_plb.add_u64_counter(
    l_osd_op_r_outb, "op_r_out_bytes", "Client data read", NULL, PerfCountersBuilder::PRIO_USEFUL, unit_t(UNIT_BYTES));
  osd_plb.add_time_avg(
    l_osd_op_r_lat, "op_r_latency",
    "Latency of read operation (including queue time)");
  osd_plb.add_u64_counter_histogram(
    l_osd_op_r_lat_outb_hist, "op_r_latency_out_bytes_histogram",
    op_hist_x_axis_config, op_hist_y_axis_config,
    "Histogram of operation latency (including queue time) + data read");
  osd_plb.add_time_avg(
    l_osd_op_r_process_lat, "op_r_process_latency",
    "Latency of read operation (excluding queue time)");
  osd_plb.add_time_avg(
    l_osd_op_r_prepare_lat, "op_r_prepare_latency",
    "Latency of read operations (excluding queue time and wait for finished)");
  osd_plb.add_u64_counter(
    l_osd_op_w, "op_w", "Client write operations");
  osd_plb.add_u64_counter(
    l_osd_op_w_inb, "op_w_in_bytes", "Client data written");
  osd_plb.add_time_avg(
    l_osd_op_w_lat, "op_w_latency",
    "Latency of write operation (including queue time)");
  osd_plb.add_u64_counter_histogram(
    l_osd_op_w_lat_inb_hist, "op_w_latency_in_bytes_histogram",
    op_hist_x_axis_config, op_hist_y_axis_config,
    "Histogram of operation latency (including queue time) + data written");
  osd_plb.add_time_avg(
    l_osd_op_w_process_lat, "op_w_process_latency",
    "Latency of write operation (excluding queue time)");
  osd_plb.add_time_avg(
    l_osd_op_w_prepare_lat, "op_w_prepare_latency",
    "Latency of write operations (excluding queue time and wait for finished)");
  osd_plb.add_u64_counter(
    l_osd_op_rw, "op_rw",
    "Client read-modify-write operations");
  osd_plb.add_u64_counter(
    l_osd_op_rw_inb, "op_rw_in_bytes",
    "Client read-modify-write operations write in", NULL, PerfCountersBuilder::PRIO_USEFUL, unit_t(UNIT_BYTES));
  osd_plb.add_u64_counter(
    l_osd_op_rw_outb,"op_rw_out_bytes",
    "Client read-modify-write operations read out ", NULL, PerfCountersBuilder::PRIO_USEFUL, unit_t(UNIT_BYTES));
  osd_plb.add_time_avg(
    l_osd_op_rw_lat, "op_rw_latency",
    "Latency of read-modify-write operation (including queue time)");
  osd_plb.add_u64_counter_histogram(
    l_osd_op_rw_lat_inb_hist, "op_rw_latency_in_bytes_histogram",
    op_hist_x_axis_config, op_hist_y_axis_config,
    "Histogram of rw operation latency (including queue time) + data written");
  osd_plb.add_u64_counter_histogram(
    l_osd_op_rw_lat_outb_hist, "op_rw_latency_out_bytes_histogram",
    op_hist_x_axis_config, op_hist_y_axis_config,
    "Histogram of rw operation latency (including queue time) + data read");
  osd_plb.add_time_avg(
    l_osd_op_rw_process_lat, "op_rw_process_latency",
    "Latency of read-modify-write operation (excluding queue time)");
  osd_plb.add_time_avg(
    l_osd_op_rw_prepare_lat, "op_rw_prepare_latency",
    "Latency of read-modify-write operations (excluding queue time and wait for finished)");

  // Now we move on to some more obscure stats, revert to assuming things
  // are low priority unless otherwise specified.
  osd_plb.set_prio_default(PerfCountersBuilder::PRIO_DEBUGONLY);

  osd_plb.add_time_avg(l_osd_op_before_queue_op_lat, "op_before_queue_op_lat",
                       "Latency of IO before calling queue(before really queue into ShardedOpWq)"); // client io before queue op_wq latency
  osd_plb.add_time_avg(l_osd_op_before_dequeue_op_lat, "op_before_dequeue_op_lat",
                       "Latency of IO before calling dequeue_op(already dequeued and get PG lock)"); // client io before dequeue_op latency

  // ---- replication suboperations ----
  osd_plb.add_u64_counter(
    l_osd_sop, "subop", "Suboperations");
  osd_plb.add_u64_counter(
    l_osd_sop_inb, "subop_in_bytes", "Suboperations total size", NULL, 0, unit_t(UNIT_BYTES));
  osd_plb.add_time_avg(l_osd_sop_lat, "subop_latency", "Suboperations latency");

  osd_plb.add_u64_counter(l_osd_sop_w, "subop_w", "Replicated writes");
  osd_plb.add_u64_counter(
    l_osd_sop_w_inb, "subop_w_in_bytes", "Replicated written data size", NULL, 0, unit_t(UNIT_BYTES));
  osd_plb.add_time_avg(
    l_osd_sop_w_lat, "subop_w_latency", "Replicated writes latency");
  osd_plb.add_u64_counter(
    l_osd_sop_pull, "subop_pull", "Suboperations pull requests");
  osd_plb.add_time_avg(
    l_osd_sop_pull_lat, "subop_pull_latency", "Suboperations pull latency");
  osd_plb.add_u64_counter(
    l_osd_sop_push, "subop_push", "Suboperations push messages");
  osd_plb.add_u64_counter(
    l_osd_sop_push_inb, "subop_push_in_bytes", "Suboperations pushed size", NULL, 0, unit_t(UNIT_BYTES));
  osd_plb.add_time_avg(
    l_osd_sop_push_lat, "subop_push_latency", "Suboperations push latency");

  // ---- recovery traffic ----
  osd_plb.add_u64_counter(l_osd_pull, "pull", "Pull requests sent");
  osd_plb.add_u64_counter(l_osd_push, "push", "Push messages sent");
  osd_plb.add_u64_counter(l_osd_push_outb, "push_out_bytes", "Pushed size", NULL, 0, unit_t(UNIT_BYTES));

  osd_plb.add_u64_counter(
    l_osd_rop, "recovery_ops",
    "Started recovery operations",
    "rop", PerfCountersBuilder::PRIO_INTERESTING);

  osd_plb.add_u64_counter(
    l_osd_rbytes, "recovery_bytes",
    "recovery bytes",
    "rbt", PerfCountersBuilder::PRIO_INTERESTING);

  // ---- host load and crc cache ----
  osd_plb.add_u64(l_osd_loadavg, "loadavg", "CPU load");
  osd_plb.add_u64(
    l_osd_cached_crc, "cached_crc", "Total number getting crc from crc_cache");
  osd_plb.add_u64(
    l_osd_cached_crc_adjusted, "cached_crc_adjusted",
    "Total number getting crc from crc_cache with adjusting");
  osd_plb.add_u64(l_osd_missed_crc, "missed_crc",
                  "Total number of crc cache misses");

  // ---- PG population gauges ----
  osd_plb.add_u64(l_osd_pg, "numpg", "Placement groups",
                  "pgs", PerfCountersBuilder::PRIO_USEFUL);
  osd_plb.add_u64(
    l_osd_pg_primary, "numpg_primary",
    "Placement groups for which this osd is primary");
  osd_plb.add_u64(
    l_osd_pg_replica, "numpg_replica",
    "Placement groups for which this osd is replica");
  osd_plb.add_u64(
    l_osd_pg_stray, "numpg_stray",
    "Placement groups ready to be deleted from this osd");
  osd_plb.add_u64(
    l_osd_pg_removing, "numpg_removing",
    "Placement groups queued for local deletion", "pgsr",
    PerfCountersBuilder::PRIO_USEFUL);
  osd_plb.add_u64(
    l_osd_hb_to, "heartbeat_to_peers", "Heartbeat (ping) peers we send to");
  osd_plb.add_u64_counter(l_osd_map, "map_messages", "OSD map messages");
  osd_plb.add_u64_counter(l_osd_mape, "map_message_epochs", "OSD map epochs");
  osd_plb.add_u64_counter(
    l_osd_mape_dup, "map_message_epoch_dups", "OSD map duplicates");
  osd_plb.add_u64_counter(
    l_osd_waiting_for_map, "messages_delayed_for_map",
    "Operations waiting for OSD map");

  // ---- osdmap cache effectiveness ----
  osd_plb.add_u64_counter(
    l_osd_map_cache_hit, "osd_map_cache_hit", "osdmap cache hit");
  osd_plb.add_u64_counter(
    l_osd_map_cache_miss, "osd_map_cache_miss", "osdmap cache miss");
  osd_plb.add_u64_counter(
    l_osd_map_cache_miss_low, "osd_map_cache_miss_low",
    "osdmap cache miss below cache lower bound");
  osd_plb.add_u64_avg(
    l_osd_map_cache_miss_low_avg, "osd_map_cache_miss_low_avg",
    "osdmap cache miss, avg distance below cache lower bound");
  osd_plb.add_u64_counter(
    l_osd_map_bl_cache_hit, "osd_map_bl_cache_hit",
    "OSDMap buffer cache hits");
  osd_plb.add_u64_counter(
    l_osd_map_bl_cache_miss, "osd_map_bl_cache_miss",
    "OSDMap buffer cache misses");

  // ---- storage capacity gauges ----
  osd_plb.add_u64(
    l_osd_stat_bytes, "stat_bytes", "OSD size", "size",
    PerfCountersBuilder::PRIO_USEFUL, unit_t(UNIT_BYTES));
  osd_plb.add_u64(
    l_osd_stat_bytes_used, "stat_bytes_used", "Used space", "used",
    PerfCountersBuilder::PRIO_USEFUL, unit_t(UNIT_BYTES));
  osd_plb.add_u64(l_osd_stat_bytes_avail, "stat_bytes_avail", "Available space", NULL, 0, unit_t(UNIT_BYTES));

  osd_plb.add_u64_counter(
    l_osd_copyfrom, "copyfrom", "Rados \"copy-from\" operations");

  // ---- cache tiering ----
  osd_plb.add_u64_counter(l_osd_tier_promote, "tier_promote", "Tier promotions");
  osd_plb.add_u64_counter(l_osd_tier_flush, "tier_flush", "Tier flushes");
  osd_plb.add_u64_counter(
    l_osd_tier_flush_fail, "tier_flush_fail", "Failed tier flushes");
  osd_plb.add_u64_counter(
    l_osd_tier_try_flush, "tier_try_flush", "Tier flush attempts");
  osd_plb.add_u64_counter(
    l_osd_tier_try_flush_fail, "tier_try_flush_fail",
    "Failed tier flush attempts");
  osd_plb.add_u64_counter(
    l_osd_tier_evict, "tier_evict", "Tier evictions");
  osd_plb.add_u64_counter(
    l_osd_tier_whiteout, "tier_whiteout", "Tier whiteouts");
  osd_plb.add_u64_counter(
    l_osd_tier_dirty, "tier_dirty", "Dirty tier flag set");
  osd_plb.add_u64_counter(
    l_osd_tier_clean, "tier_clean", "Dirty tier flag cleaned");
  osd_plb.add_u64_counter(
    l_osd_tier_delay, "tier_delay", "Tier delays (agent waiting)");
  osd_plb.add_u64_counter(
    l_osd_tier_proxy_read, "tier_proxy_read", "Tier proxy reads");
  osd_plb.add_u64_counter(
    l_osd_tier_proxy_write, "tier_proxy_write", "Tier proxy writes");

  // ---- tiering agent activity ----
  osd_plb.add_u64_counter(
    l_osd_agent_wake, "agent_wake", "Tiering agent wake up");
  osd_plb.add_u64_counter(
    l_osd_agent_skip, "agent_skip", "Objects skipped by agent");
  osd_plb.add_u64_counter(
    l_osd_agent_flush, "agent_flush", "Tiering agent flushes");
  osd_plb.add_u64_counter(
    l_osd_agent_evict, "agent_evict", "Tiering agent evictions");

  // ---- object context cache ----
  osd_plb.add_u64_counter(
    l_osd_object_ctx_cache_hit, "object_ctx_cache_hit", "Object context cache hits");
  osd_plb.add_u64_counter(
    l_osd_object_ctx_cache_total, "object_ctx_cache_total", "Object context cache lookups");

  osd_plb.add_u64_counter(l_osd_op_cache_hit, "op_cache_hit");
  osd_plb.add_time_avg(
    l_osd_tier_flush_lat, "osd_tier_flush_lat", "Object flush latency");
  osd_plb.add_time_avg(
    l_osd_tier_promote_lat, "osd_tier_promote_lat", "Object promote latency");
  osd_plb.add_time_avg(
    l_osd_tier_r_lat, "osd_tier_r_lat", "Object proxy read latency");

  // ---- PG info persistence paths ----
  osd_plb.add_u64_counter(
    l_osd_pg_info, "osd_pg_info", "PG updated its info (using any method)");
  osd_plb.add_u64_counter(
    l_osd_pg_fastinfo, "osd_pg_fastinfo",
    "PG updated its info using fastinfo attr");
  osd_plb.add_u64_counter(
    l_osd_pg_biginfo, "osd_pg_biginfo", "PG updated its biginfo attr");

  // build the counters and publish them to the cct collection
  logger = osd_plb.create_perf_counters();
  cct->get_perfcounters_collection()->add(logger);
}
3941
3942 void OSD::create_recoverystate_perf()
3943 {
3944 dout(10) << "create_recoverystate_perf" << dendl;
3945
3946 PerfCountersBuilder rs_perf(cct, "recoverystate_perf", rs_first, rs_last);
3947
3948 rs_perf.add_time_avg(rs_initial_latency, "initial_latency", "Initial recovery state latency");
3949 rs_perf.add_time_avg(rs_started_latency, "started_latency", "Started recovery state latency");
3950 rs_perf.add_time_avg(rs_reset_latency, "reset_latency", "Reset recovery state latency");
3951 rs_perf.add_time_avg(rs_start_latency, "start_latency", "Start recovery state latency");
3952 rs_perf.add_time_avg(rs_primary_latency, "primary_latency", "Primary recovery state latency");
3953 rs_perf.add_time_avg(rs_peering_latency, "peering_latency", "Peering recovery state latency");
3954 rs_perf.add_time_avg(rs_backfilling_latency, "backfilling_latency", "Backfilling recovery state latency");
3955 rs_perf.add_time_avg(rs_waitremotebackfillreserved_latency, "waitremotebackfillreserved_latency", "Wait remote backfill reserved recovery state latency");
3956 rs_perf.add_time_avg(rs_waitlocalbackfillreserved_latency, "waitlocalbackfillreserved_latency", "Wait local backfill reserved recovery state latency");
3957 rs_perf.add_time_avg(rs_notbackfilling_latency, "notbackfilling_latency", "Notbackfilling recovery state latency");
3958 rs_perf.add_time_avg(rs_repnotrecovering_latency, "repnotrecovering_latency", "Repnotrecovering recovery state latency");
3959 rs_perf.add_time_avg(rs_repwaitrecoveryreserved_latency, "repwaitrecoveryreserved_latency", "Rep wait recovery reserved recovery state latency");
3960 rs_perf.add_time_avg(rs_repwaitbackfillreserved_latency, "repwaitbackfillreserved_latency", "Rep wait backfill reserved recovery state latency");
3961 rs_perf.add_time_avg(rs_reprecovering_latency, "reprecovering_latency", "RepRecovering recovery state latency");
3962 rs_perf.add_time_avg(rs_activating_latency, "activating_latency", "Activating recovery state latency");
3963 rs_perf.add_time_avg(rs_waitlocalrecoveryreserved_latency, "waitlocalrecoveryreserved_latency", "Wait local recovery reserved recovery state latency");
3964 rs_perf.add_time_avg(rs_waitremoterecoveryreserved_latency, "waitremoterecoveryreserved_latency", "Wait remote recovery reserved recovery state latency");
3965 rs_perf.add_time_avg(rs_recovering_latency, "recovering_latency", "Recovering recovery state latency");
3966 rs_perf.add_time_avg(rs_recovered_latency, "recovered_latency", "Recovered recovery state latency");
3967 rs_perf.add_time_avg(rs_clean_latency, "clean_latency", "Clean recovery state latency");
3968 rs_perf.add_time_avg(rs_active_latency, "active_latency", "Active recovery state latency");
3969 rs_perf.add_time_avg(rs_replicaactive_latency, "replicaactive_latency", "Replicaactive recovery state latency");
3970 rs_perf.add_time_avg(rs_stray_latency, "stray_latency", "Stray recovery state latency");
3971 rs_perf.add_time_avg(rs_getinfo_latency, "getinfo_latency", "Getinfo recovery state latency");
3972 rs_perf.add_time_avg(rs_getlog_latency, "getlog_latency", "Getlog recovery state latency");
3973 rs_perf.add_time_avg(rs_waitactingchange_latency, "waitactingchange_latency", "Waitactingchange recovery state latency");
3974 rs_perf.add_time_avg(rs_incomplete_latency, "incomplete_latency", "Incomplete recovery state latency");
3975 rs_perf.add_time_avg(rs_down_latency, "down_latency", "Down recovery state latency");
3976 rs_perf.add_time_avg(rs_getmissing_latency, "getmissing_latency", "Getmissing recovery state latency");
3977 rs_perf.add_time_avg(rs_waitupthru_latency, "waitupthru_latency", "Waitupthru recovery state latency");
3978 rs_perf.add_time_avg(rs_notrecovering_latency, "notrecovering_latency", "Notrecovering recovery state latency");
3979
3980 recoverystate_perf = rs_perf.create_perf_counters();
3981 cct->get_perfcounters_collection()->add(recoverystate_perf);
3982 }
3983
// Orderly shutdown of the entire OSD: stop accepting new work, drain
// queues, shut down PGs, persist the superblock, then tear down threads,
// timers, messengers and the object store.  Returns the result of the
// superblock write (0 on success).  Safe to call concurrently; only the
// first caller proceeds past the STATE_STOPPING check.
int OSD::shutdown()
{
  if (!service.prepare_to_stop())
    return 0; // already shutting down
  osd_lock.Lock();
  if (is_stopping()) {
    osd_lock.Unlock();
    return 0;
  }
  dout(0) << "shutdown" << dendl;

  set_state(STATE_STOPPING);

  // Debugging
  if (cct->_conf.get_val<bool>("osd_debug_shutdown")) {
    cct->_conf.set_val("debug_osd", "100");
    cct->_conf.set_val("debug_journal", "100");
    cct->_conf.set_val("debug_filestore", "100");
    cct->_conf.set_val("debug_bluestore", "100");
    cct->_conf.set_val("debug_ms", "100");
    cct->_conf.apply_changes(nullptr);
  }

  // stop MgrClient earlier as it's more like an internal consumer of OSD
  mgrc.shutdown();

  service.start_shutdown();

  // stop sending work to pgs.  this just prevents any new work in _process
  // from racing with on_shutdown and potentially entering the pg after.
  op_shardedwq.drain();

  // Shutdown PGs
  {
    vector<PGRef> pgs;
    _get_pgs(&pgs);
    for (auto pg : pgs) {
      pg->shutdown();
    }
  }

  // drain op queue again (in case PGs requeued something)
  op_shardedwq.drain();
  {
    finished.clear(); // zap waiters (bleh, this is messy)
    waiting_for_osdmap.clear();
  }

  // unregister commands
  cct->get_admin_socket()->unregister_commands(asok_hook);
  delete asok_hook;
  asok_hook = NULL;

  cct->get_admin_socket()->unregister_commands(test_ops_hook);
  delete test_ops_hook;
  test_ops_hook = NULL;

  osd_lock.Unlock();

  // stop the heartbeat thread (osd_lock must be dropped: the thread may
  // be blocked acquiring locks of its own)
  heartbeat_lock.Lock();
  heartbeat_stop = true;
  heartbeat_cond.Signal();
  heartbeat_lock.Unlock();
  heartbeat_thread.join();

  // stop worker thread pools once no more work can be queued
  osd_op_tp.drain();
  osd_op_tp.stop();
  dout(10) << "op sharded tp stopped" << dendl;

  command_tp.drain();
  command_tp.stop();
  dout(10) << "command tp stopped" << dendl;

  dout(10) << "stopping agent" << dendl;
  service.agent_stop();

  boot_finisher.wait_for_empty();

  osd_lock.Lock();

  boot_finisher.stop();
  reset_heartbeat_peers(true);

  tick_timer.shutdown();

  {
    std::lock_guard l(tick_timer_lock);
    tick_timer_without_osd_lock.shutdown();
  }

  // note unmount epoch
  dout(10) << "noting clean unmount in epoch " << osdmap->get_epoch() << dendl;
  superblock.mounted = service.get_boot_epoch();
  superblock.clean_thru = osdmap->get_epoch();
  ObjectStore::Transaction t;
  write_superblock(t);
  int r = store->queue_transaction(service.meta_ch, std::move(t));
  if (r) {
    derr << "OSD::shutdown: error writing superblock: "
	 << cpp_strerror(r) << dendl;
  }


  service.shutdown_reserver();

  // Remove PGs
#ifdef PG_DEBUG_REFS
  service.dump_live_pgids();
#endif
  // detach every PG from its shard slot; loop because kicking a PG can
  // release references and expose more work
  while (true) {
    vector<PGRef> pgs;
    _get_pgs(&pgs, true);
    if (pgs.empty()) {
      break;
    }
    for (auto& pg : pgs) {
      if (pg->is_deleted()) {
	continue;
      }
      dout(20) << " kicking pg " << pg << dendl;
      pg->lock();
      if (pg->get_num_ref() != 1) {
	// someone still holds a reference; that is a leak at this point
	derr << "pgid " << pg->get_pgid() << " has ref count of "
	     << pg->get_num_ref() << dendl;
#ifdef PG_DEBUG_REFS
	pg->dump_live_ids();
#endif
	if (cct->_conf->osd_shutdown_pgref_assert) {
	  ceph_abort();
	}
      }
      pg->ch.reset();
      pg->unlock();
    }
  }
#ifdef PG_DEBUG_REFS
  service.dump_live_pgids();
#endif

  // NOTE(review): osd_lock is dropped around remove_observer(), presumably
  // to avoid lock-ordering issues with the config observer lock -- confirm.
  osd_lock.Unlock();
  cct->_conf.remove_observer(this);
  osd_lock.Lock();

  service.meta_ch.reset();

  dout(10) << "syncing store" << dendl;
  enable_disable_fuse(true);

  if (cct->_conf->osd_journal_flush_on_shutdown) {
    dout(10) << "flushing journal" << dendl;
    store->flush_journal();
  }

  monc->shutdown();
  osd_lock.Unlock();

  // drop our reference to the current osdmap
  map_lock.get_write();
  osdmap = OSDMapRef();
  map_lock.put_write();

  // drop each shard's cached osdmap reference as well
  for (auto s : shards) {
    std::lock_guard l(s->osdmap_lock);
    s->shard_osdmap = OSDMapRef();
  }
  service.shutdown();

  // re-take osd_lock (scoped this time) for the final store teardown
  std::lock_guard lock(osd_lock);
  store->umount();
  delete store;
  store = nullptr;
  dout(10) << "Store synced" << dendl;

  op_tracker.on_shutdown();

  class_handler->shutdown();
  client_messenger->shutdown();
  cluster_messenger->shutdown();
  hb_front_client_messenger->shutdown();
  hb_back_client_messenger->shutdown();
  objecter_messenger->shutdown();
  hb_front_server_messenger->shutdown();
  hb_back_server_messenger->shutdown();

  // r is the superblock write result from above
  return r;
}
4169
4170 int OSD::mon_cmd_maybe_osd_create(string &cmd)
4171 {
4172 bool created = false;
4173 while (true) {
4174 dout(10) << __func__ << " cmd: " << cmd << dendl;
4175 vector<string> vcmd{cmd};
4176 bufferlist inbl;
4177 C_SaferCond w;
4178 string outs;
4179 monc->start_mon_command(vcmd, inbl, NULL, &outs, &w);
4180 int r = w.wait();
4181 if (r < 0) {
4182 if (r == -ENOENT && !created) {
4183 string newcmd = "{\"prefix\": \"osd create\", \"id\": " + stringify(whoami)
4184 + ", \"uuid\": \"" + stringify(superblock.osd_fsid) + "\"}";
4185 vector<string> vnewcmd{newcmd};
4186 bufferlist inbl;
4187 C_SaferCond w;
4188 string outs;
4189 monc->start_mon_command(vnewcmd, inbl, NULL, &outs, &w);
4190 int r = w.wait();
4191 if (r < 0) {
4192 derr << __func__ << " fail: osd does not exist and created failed: "
4193 << cpp_strerror(r) << dendl;
4194 return r;
4195 }
4196 created = true;
4197 continue;
4198 }
4199 derr << __func__ << " fail: '" << outs << "': " << cpp_strerror(r) << dendl;
4200 return r;
4201 }
4202 break;
4203 }
4204
4205 return 0;
4206 }
4207
4208 int OSD::update_crush_location()
4209 {
4210 if (!cct->_conf->osd_crush_update_on_start) {
4211 dout(10) << __func__ << " osd_crush_update_on_start = false" << dendl;
4212 return 0;
4213 }
4214
4215 char weight[32];
4216 if (cct->_conf->osd_crush_initial_weight >= 0) {
4217 snprintf(weight, sizeof(weight), "%.4lf", cct->_conf->osd_crush_initial_weight);
4218 } else {
4219 struct store_statfs_t st;
4220 osd_alert_list_t alerts;
4221 int r = store->statfs(&st, &alerts);
4222 if (r < 0) {
4223 derr << "statfs: " << cpp_strerror(r) << dendl;
4224 return r;
4225 }
4226 snprintf(weight, sizeof(weight), "%.4lf",
4227 std::max(.00001,
4228 double(st.total) /
4229 double(1ull << 40 /* TB */)));
4230 }
4231
4232 std::multimap<string,string> loc = cct->crush_location.get_location();
4233 dout(10) << __func__ << " crush location is " << loc << dendl;
4234
4235 string cmd =
4236 string("{\"prefix\": \"osd crush create-or-move\", ") +
4237 string("\"id\": ") + stringify(whoami) + string(", ") +
4238 string("\"weight\":") + weight + string(", ") +
4239 string("\"args\": [");
4240 for (multimap<string,string>::iterator p = loc.begin(); p != loc.end(); ++p) {
4241 if (p != loc.begin())
4242 cmd += ", ";
4243 cmd += "\"" + p->first + "=" + p->second + "\"";
4244 }
4245 cmd += "]}";
4246
4247 return mon_cmd_maybe_osd_create(cmd);
4248 }
4249
4250 int OSD::update_crush_device_class()
4251 {
4252 if (!cct->_conf->osd_class_update_on_start) {
4253 dout(10) << __func__ << " osd_class_update_on_start = false" << dendl;
4254 return 0;
4255 }
4256
4257 string device_class;
4258 int r = store->read_meta("crush_device_class", &device_class);
4259 if (r < 0 || device_class.empty()) {
4260 device_class = store->get_default_device_class();
4261 }
4262
4263 if (device_class.empty()) {
4264 dout(20) << __func__ << " no device class stored locally" << dendl;
4265 return 0;
4266 }
4267
4268 string cmd =
4269 string("{\"prefix\": \"osd crush set-device-class\", ") +
4270 string("\"class\": \"") + device_class + string("\", ") +
4271 string("\"ids\": [\"") + stringify(whoami) + string("\"]}");
4272
4273 r = mon_cmd_maybe_osd_create(cmd);
4274 if (r == -EBUSY) {
4275 // good, already bound to a device-class
4276 return 0;
4277 } else {
4278 return r;
4279 }
4280 }
4281
// Serialize the in-memory superblock and stage the write into transaction
// t (the caller queues the transaction).  Also ensures the baseline
// incompat feature bit is always present before encoding.
void OSD::write_superblock(ObjectStore::Transaction& t)
{
  dout(10) << "write_superblock " << superblock << dendl;

  //hack: at minimum it's using the baseline feature set
  if (!superblock.compat_features.incompat.contains(CEPH_OSD_FEATURE_INCOMPAT_BASE))
    superblock.compat_features.incompat.insert(CEPH_OSD_FEATURE_INCOMPAT_BASE);

  bufferlist bl;
  encode(superblock, bl);
  t.write(coll_t::meta(), OSD_SUPERBLOCK_GOBJECT, 0, bl.length(), bl);
}
4294
// Read and decode the OSD superblock object from the meta collection into
// the |superblock| member.  Returns 0 on success or the negative errno
// from the store read.
int OSD::read_superblock()
{
  bufferlist bl;
  // len==0 means "read the whole object"
  int r = store->read(service.meta_ch, OSD_SUPERBLOCK_GOBJECT, 0, 0, bl);
  if (r < 0)
    return r;

  auto p = bl.cbegin();
  decode(superblock, p);

  dout(10) << "read_superblock " << superblock << dendl;

  return 0;
}
4309
// Remove leftover temp objects from every PG collection.  Temp objects
// sort before regular objects in collection order, so listing from the
// start and stopping at the first non-temp object finds them all.
// Removals are batched into transactions bounded by
// osd_target_transaction_size.
void OSD::clear_temp_objects()
{
  dout(10) << __func__ << dendl;
  vector<coll_t> ls;
  store->list_collections(ls);
  for (vector<coll_t>::iterator p = ls.begin(); p != ls.end(); ++p) {
    spg_t pgid;
    if (!p->is_pg(&pgid))
      continue;

    // list temp objects
    dout(20) << " clearing temps in " << *p << " pgid " << pgid << dendl;

    vector<ghobject_t> temps;
    ghobject_t next;
    while (1) {
      vector<ghobject_t> objects;
      auto ch = store->open_collection(*p);
      ceph_assert(ch);
      store->collection_list(ch, next, ghobject_t::get_max(),
			     store->get_ideal_list_max(),
			     &objects, &next);
      if (objects.empty())
	break;
      vector<ghobject_t>::iterator q;
      for (q = objects.begin(); q != objects.end(); ++q) {
	// Hammer set pool for temps to -1, so check for clean-up
	if (q->hobj.is_temp() || (q->hobj.pool == -1)) {
	  temps.push_back(*q);
	} else {
	  // temps sort first; the first non-temp object ends the scan
	  break;
	}
      }
      // If we saw a non-temp object and hit the break above we can
      // break out of the while loop too.
      if (q != objects.end())
	break;
    }
    if (!temps.empty()) {
      ObjectStore::Transaction t;
      int removed = 0;
      for (vector<ghobject_t>::iterator q = temps.begin(); q != temps.end(); ++q) {
	dout(20) << "  removing " << *p << " object " << *q << dendl;
	t.remove(*p, *q);
	// flush the transaction once it grows past the configured batch size
	if (++removed > cct->_conf->osd_target_transaction_size) {
	  store->queue_transaction(service.meta_ch, std::move(t));
	  t = ObjectStore::Transaction();
	  removed = 0;
	}
      }
      if (removed) {
	store->queue_transaction(service.meta_ch, std::move(t));
      }
    }
  }
}
4366
// Delete every object in collection |tmp| (also clearing each object's
// snap-mapper entries) and finally remove the collection itself.  Work is
// batched into transactions of at most osd_target_transaction_size
// objects, and the function blocks until the last transaction commits.
void OSD::recursive_remove_collection(CephContext* cct,
				      ObjectStore *store, spg_t pgid,
				      coll_t tmp)
{
  OSDriver driver(
    store,
    coll_t(),
    make_snapmapper_oid());

  ObjectStore::CollectionHandle ch = store->open_collection(tmp);
  ObjectStore::Transaction t;
  SnapMapper mapper(cct, &driver, 0, 0, 0, pgid.shard);

  ghobject_t next;
  int max = cct->_conf->osd_target_transaction_size;
  vector<ghobject_t> objects;
  objects.reserve(max);
  while (true) {
    objects.clear();
    // list the next batch of up to |max| objects, resuming at |next|
    store->collection_list(ch, next, ghobject_t::get_max(),
			   max, &objects, &next);
    generic_dout(10) << __func__ << " " << objects << dendl;
    if (objects.empty())
      break;
    for (auto& p: objects) {
      OSDriver::OSTransaction _t(driver.get_transaction(&t));
      // drop any snap-mapper record for the object; -ENOENT just means
      // it had none
      int r = mapper.remove_oid(p.hobj, &_t);
      if (r != 0 && r != -ENOENT)
	ceph_abort();
      t.remove(tmp, p);
    }
    int r = store->queue_transaction(ch, std::move(t));
    ceph_assert(r == 0);
    t = ObjectStore::Transaction();
  }
  // the collection is now empty; remove it
  t.remove_collection(tmp);
  int r = store->queue_transaction(ch, std::move(t));
  ceph_assert(r == 0);

  // wait for the final transaction to commit before returning
  C_SaferCond waiter;
  if (!ch->flush_commit(&waiter)) {
    waiter.wait();
  }
}
4411
4412
4413 // ======================================================
4414 // PG's
4415
// Construct (but do not initialize/register) a PG object for |pgid| using
// pool metadata from |createmap|.  If the pool has since been deleted, the
// final pg_pool_t is recovered from the on-disk tombstone object.  Returns
// nullptr when the pool info cannot be obtained; aborts on an unknown pool
// type.
PG* OSD::_make_pg(
  OSDMapRef createmap,
  spg_t pgid)
{
  dout(10) << __func__ << " " << pgid << dendl;
  pg_pool_t pi;
  map<string,string> ec_profile;
  string name;
  if (createmap->have_pg_pool(pgid.pool())) {
    pi = *createmap->get_pg_pool(pgid.pool());
    name = createmap->get_pool_name(pgid.pool());
    if (pi.is_erasure()) {
      ec_profile = createmap->get_erasure_code_profile(pi.erasure_code_profile);
    }
  } else {
    // pool was deleted; grab final pg_pool_t off disk.
    ghobject_t oid = make_final_pool_info_oid(pgid.pool());
    bufferlist bl;
    int r = store->read(service.meta_ch, oid, 0, 0, bl);
    if (r < 0) {
      derr << __func__ << " missing pool " << pgid.pool() << " tombstone"
	   << dendl;
      return nullptr;
    }
    ceph_assert(r >= 0);
    auto p = bl.cbegin();
    decode(pi, p);
    decode(name, p);
    if (p.end()) { // dev release v13.0.2 did not include ec_profile
      derr << __func__ << " missing ec_profile from pool " << pgid.pool()
	   << " tombstone" << dendl;
      return nullptr;
    }
    decode(ec_profile, p);
  }
  PGPool pool(cct, createmap, pgid.pool(), pi, name);
  PG *pg;
  // both replicated and erasure pools are backed by PrimaryLogPG
  if (pi.type == pg_pool_t::TYPE_REPLICATED ||
      pi.type == pg_pool_t::TYPE_ERASURE)
    pg = new PrimaryLogPG(&service, createmap, pool, ec_profile, pgid);
  else
    ceph_abort();
  return pg;
}
4460
4461 void OSD::_get_pgs(vector<PGRef> *v, bool clear_too)
4462 {
4463 v->clear();
4464 v->reserve(get_num_pgs());
4465 for (auto& s : shards) {
4466 std::lock_guard l(s->shard_lock);
4467 for (auto& j : s->pg_slots) {
4468 if (j.second->pg &&
4469 !j.second->pg->is_deleted()) {
4470 v->push_back(j.second->pg);
4471 if (clear_too) {
4472 s->_detach_pg(j.second.get());
4473 }
4474 }
4475 }
4476 }
4477 }
4478
4479 void OSD::_get_pgids(vector<spg_t> *v)
4480 {
4481 v->clear();
4482 v->reserve(get_num_pgs());
4483 for (auto& s : shards) {
4484 std::lock_guard l(s->shard_lock);
4485 for (auto& j : s->pg_slots) {
4486 if (j.second->pg &&
4487 !j.second->pg->is_deleted()) {
4488 v->push_back(j.first);
4489 }
4490 }
4491 }
4492 }
4493
4494 void OSD::register_pg(PGRef pg)
4495 {
4496 spg_t pgid = pg->get_pgid();
4497 uint32_t shard_index = pgid.hash_to_shard(num_shards);
4498 auto sdata = shards[shard_index];
4499 std::lock_guard l(sdata->shard_lock);
4500 auto r = sdata->pg_slots.emplace(pgid, make_unique<OSDShardPGSlot>());
4501 ceph_assert(r.second);
4502 auto *slot = r.first->second.get();
4503 dout(20) << __func__ << " " << pgid << " " << pg << dendl;
4504 sdata->_attach_pg(slot, pg.get());
4505 }
4506
// Final step of PG deletion: detach the PG from its shard slot, drop any
// primed split children, and adjust the primary/replica/stray PG gauges.
// Returns false (deletion must be retried later) if the slot is already
// gone or the slot is waiting on a merge epoch.
bool OSD::try_finish_pg_delete(PG *pg, unsigned old_pg_num)
{
  auto sdata = pg->osd_shard;
  ceph_assert(sdata);
  {
    std::lock_guard l(sdata->shard_lock);
    auto p = sdata->pg_slots.find(pg->pg_id);
    if (p == sdata->pg_slots.end() ||
	!p->second->pg) {
      dout(20) << __func__ << " " << pg->pg_id << " not found" << dendl;
      return false;
    }
    if (p->second->waiting_for_merge_epoch) {
      // a pending merge still needs this slot; don't finish the delete yet
      dout(20) << __func__ << " " << pg->pg_id << " waiting for merge" << dendl;
      return false;
    }
    dout(20) << __func__ << " " << pg->pg_id << " " << pg << dendl;
    sdata->_detach_pg(p->second.get());
  }

  // drop any children primed for a split of this (now deleted) pg
  for (auto shard : shards) {
    shard->unprime_split_children(pg->pg_id, old_pg_num);
  }

  // update pg count now since we might not get an osdmap any time soon.
  if (pg->is_primary())
    service.logger->dec(l_osd_pg_primary);
  else if (pg->is_replica())
    service.logger->dec(l_osd_pg_replica);
  else
    service.logger->dec(l_osd_pg_stray);

  return true;
}
4541
4542 PGRef OSD::_lookup_pg(spg_t pgid)
4543 {
4544 uint32_t shard_index = pgid.hash_to_shard(num_shards);
4545 auto sdata = shards[shard_index];
4546 std::lock_guard l(sdata->shard_lock);
4547 auto p = sdata->pg_slots.find(pgid);
4548 if (p == sdata->pg_slots.end()) {
4549 return nullptr;
4550 }
4551 return p->second->pg;
4552 }
4553
4554 PGRef OSD::_lookup_lock_pg(spg_t pgid)
4555 {
4556 PGRef pg = _lookup_pg(pgid);
4557 if (!pg) {
4558 return nullptr;
4559 }
4560 pg->lock();
4561 if (!pg->is_deleted()) {
4562 return pg;
4563 }
4564 pg->unlock();
4565 return nullptr;
4566 }
4567
// Public wrapper around _lookup_lock_pg(): returns the PG locked, or
// nullptr if it does not exist or is deleted.  Caller must unlock.
PGRef OSD::lookup_lock_pg(spg_t pgid)
{
  return _lookup_lock_pg(pgid);
}
4572
// Startup path: scan the object store's collections, clean up removed or
// legacy PG collections, instantiate a PG object for each surviving PG
// collection, load its state from disk, and register it with its shard.
// Requires osd_lock to be held.
void OSD::load_pgs()
{
  ceph_assert(osd_lock.is_locked());
  dout(0) << "load_pgs" << dendl;

  // load the persisted pg_num history used for split/merge tracking
  {
    auto pghist = make_pg_num_history_oid();
    bufferlist bl;
    int r = store->read(service.meta_ch, pghist, 0, 0, bl, 0);
    if (r >= 0 && bl.length() > 0) {
      auto p = bl.cbegin();
      decode(pg_num_history, p);
    }
    dout(20) << __func__ << " pg_num_history " << pg_num_history << dendl;
  }

  vector<coll_t> ls;
  int r = store->list_collections(ls);
  if (r < 0) {
    derr << "failed to list pgs: " << cpp_strerror(-r) << dendl;
  }

  int num = 0;
  for (vector<coll_t>::iterator it = ls.begin();
       it != ls.end();
       ++it) {
    spg_t pgid;
    // temp collections and pgs flagged for removal are deleted outright
    if (it->is_temp(&pgid) ||
	(it->is_pg(&pgid) && PG::_has_removal_flag(store, pgid))) {
      dout(10) << "load_pgs " << *it
	       << " removing, legacy or flagged for removal pg" << dendl;
      recursive_remove_collection(cct, store, pgid, *it);
      continue;
    }

    if (!it->is_pg(&pgid)) {
      dout(10) << "load_pgs ignoring unrecognized " << *it << dendl;
      continue;
    }

    dout(10) << "pgid " << pgid << " coll " << coll_t(pgid) << dendl;
    epoch_t map_epoch = 0;
    int r = PG::peek_map_epoch(store, pgid, &map_epoch);
    if (r < 0) {
      derr << __func__ << " unable to peek at " << pgid << " metadata, skipping"
	   << dendl;
      continue;
    }

    // build the PG against the osdmap it was last written with, falling
    // back to the current map when no epoch was recorded
    PGRef pg;
    if (map_epoch > 0) {
      OSDMapRef pgosdmap = service.try_get_map(map_epoch);
      if (!pgosdmap) {
	if (!osdmap->have_pg_pool(pgid.pool())) {
	  derr << __func__ << ": could not find map for epoch " << map_epoch
	       << " on pg " << pgid << ", but the pool is not present in the "
	       << "current map, so this is probably a result of bug 10617.  "
	       << "Skipping the pg for now, you can use ceph-objectstore-tool "
	       << "to clean it up later." << dendl;
	  continue;
	} else {
	  derr << __func__ << ": have pgid " << pgid << " at epoch "
	       << map_epoch << ", but missing map.  Crashing."
	       << dendl;
	  ceph_abort_msg("Missing map in load_pgs");
	}
      }
      pg = _make_pg(pgosdmap, pgid);
    } else {
      pg = _make_pg(osdmap, pgid);
    }
    if (!pg) {
      // pool info could not be recovered (see _make_pg); drop the pg
      recursive_remove_collection(cct, store, pgid, *it);
      continue;
    }

    // there can be no waiters here, so we don't call _wake_pg_slot

    pg->lock();
    pg->ch = store->open_collection(pg->coll);

    // read pg state, log
    pg->read_state(store);

    if (pg->dne())  {
      dout(10) << "load_pgs " << *it << " deleting dne" << dendl;
      pg->ch = nullptr;
      pg->unlock();
      recursive_remove_collection(cct, store, pgid, *it);
      continue;
    }
    {
      uint32_t shard_index = pgid.hash_to_shard(shards.size());
      assert(NULL != shards[shard_index]);
      store->set_collection_commit_queue(pg->coll, &(shards[shard_index]->context_queue));
    }

    pg->reg_next_scrub();

    dout(10) << __func__ << " loaded " << *pg << dendl;
    pg->unlock();

    register_pg(pg);
    ++num;
  }
  dout(0) << __func__ << " opened " << num << " pgs" << dendl;
}
4680
4681
4682 PGRef OSD::handle_pg_create_info(const OSDMapRef& osdmap,
4683 const PGCreateInfo *info)
4684 {
4685 spg_t pgid = info->pgid;
4686
4687 if (maybe_wait_for_max_pg(osdmap, pgid, info->by_mon)) {
4688 dout(10) << __func__ << " hit max pg, dropping" << dendl;
4689 return nullptr;
4690 }
4691
4692 PG::RecoveryCtx rctx = create_context();
4693
4694 OSDMapRef startmap = get_map(info->epoch);
4695
4696 if (info->by_mon) {
4697 int64_t pool_id = pgid.pgid.pool();
4698 const pg_pool_t *pool = osdmap->get_pg_pool(pool_id);
4699 if (!pool) {
4700 dout(10) << __func__ << " ignoring " << pgid << ", pool dne" << dendl;
4701 return nullptr;
4702 }
4703 if (osdmap->require_osd_release >= CEPH_RELEASE_NAUTILUS &&
4704 !pool->has_flag(pg_pool_t::FLAG_CREATING)) {
4705 // this ensures we do not process old creating messages after the
4706 // pool's initial pgs have been created (and pg are subsequently
4707 // allowed to split or merge).
4708 dout(20) << __func__ << " dropping " << pgid
4709 << "create, pool does not have CREATING flag set" << dendl;
4710 return nullptr;
4711 }
4712 }
4713
4714 int up_primary, acting_primary;
4715 vector<int> up, acting;
4716 startmap->pg_to_up_acting_osds(
4717 pgid.pgid, &up, &up_primary, &acting, &acting_primary);
4718
4719 const pg_pool_t* pp = startmap->get_pg_pool(pgid.pool());
4720 if (pp->has_flag(pg_pool_t::FLAG_EC_OVERWRITES) &&
4721 store->get_type() != "bluestore") {
4722 clog->warn() << "pg " << pgid
4723 << " is at risk of silent data corruption: "
4724 << "the pool allows ec overwrites but is not stored in "
4725 << "bluestore, so deep scrubbing will not detect bitrot";
4726 }
4727 PG::_create(*rctx.transaction, pgid, pgid.get_split_bits(pp->get_pg_num()));
4728 PG::_init(*rctx.transaction, pgid, pp);
4729
4730 int role = startmap->calc_pg_role(whoami, acting, acting.size());
4731 if (!pp->is_replicated() && role != pgid.shard) {
4732 role = -1;
4733 }
4734
4735 PGRef pg = _make_pg(startmap, pgid);
4736 pg->ch = store->create_new_collection(pg->coll);
4737
4738 {
4739 uint32_t shard_index = pgid.hash_to_shard(shards.size());
4740 assert(NULL != shards[shard_index]);
4741 store->set_collection_commit_queue(pg->coll, &(shards[shard_index]->context_queue));
4742 }
4743
4744 pg->lock(true);
4745
4746 // we are holding the shard lock
4747 ceph_assert(!pg->is_deleted());
4748
4749 pg->init(
4750 role,
4751 up,
4752 up_primary,
4753 acting,
4754 acting_primary,
4755 info->history,
4756 info->past_intervals,
4757 false,
4758 rctx.transaction);
4759
4760 if (pg->is_primary()) {
4761 Mutex::Locker locker(m_perf_queries_lock);
4762 pg->set_dynamic_perf_stats_queries(m_perf_queries);
4763 }
4764
4765 pg->handle_initialize(&rctx);
4766 pg->handle_activate_map(&rctx);
4767
4768 dispatch_context(rctx, pg.get(), osdmap, nullptr);
4769
4770 dout(10) << __func__ << " new pg " << *pg << dendl;
4771 return pg;
4772 }
4773
4774 bool OSD::maybe_wait_for_max_pg(const OSDMapRef& osdmap,
4775 spg_t pgid,
4776 bool is_mon_create)
4777 {
4778 const auto max_pgs_per_osd =
4779 (cct->_conf.get_val<uint64_t>("mon_max_pg_per_osd") *
4780 cct->_conf.get_val<double>("osd_max_pg_per_osd_hard_ratio"));
4781
4782 if (num_pgs < max_pgs_per_osd) {
4783 return false;
4784 }
4785
4786 std::lock_guard l(pending_creates_lock);
4787 if (is_mon_create) {
4788 pending_creates_from_mon++;
4789 } else {
4790 bool is_primary = osdmap->get_pg_acting_rank(pgid.pgid, whoami) == 0;
4791 pending_creates_from_osd.emplace(pgid.pgid, is_primary);
4792 }
4793 dout(1) << __func__ << " withhold creation of pg " << pgid
4794 << ": " << num_pgs << " >= "<< max_pgs_per_osd << dendl;
4795 return true;
4796 }
4797
4798 // to re-trigger a peering, we have to twiddle the pg mapping a little bit,
4799 // see PG::should_restart_peering(). OSDMap::pg_to_up_acting_osds() will turn
4800 // to up set if pg_temp is empty. so an empty pg_temp won't work.
4801 static vector<int32_t> twiddle(const vector<int>& acting) {
4802 if (acting.size() > 1) {
4803 return {acting[0]};
4804 } else {
4805 vector<int32_t> twiddled(acting.begin(), acting.end());
4806 twiddled.push_back(-1);
4807 return twiddled;
4808 }
4809 }
4810
// Resume PG creations that were withheld by maybe_wait_for_max_pg().  Any
// spare capacity under the hard PG limit is first given to mon-initiated
// creates (by re-soliciting pg-create messages), then to osd-initiated
// creates (by twiddling pg_temp to force a re-peer).  Monitor
// subscriptions are renewed as needed so we keep receiving the maps and
// create messages required to make progress.
void OSD::resume_creating_pg()
{
  bool do_sub_pg_creates = false;
  bool have_pending_creates = false;
  {
    const auto max_pgs_per_osd =
      (cct->_conf.get_val<uint64_t>("mon_max_pg_per_osd") *
       cct->_conf.get_val<double>("osd_max_pg_per_osd_hard_ratio"));
    if (max_pgs_per_osd <= num_pgs) {
      // this could happen if admin decreases this setting before a PG is removed
      return;
    }
    unsigned spare_pgs = max_pgs_per_osd - num_pgs;
    std::lock_guard l(pending_creates_lock);
    if (pending_creates_from_mon > 0) {
      dout(20) << __func__ << " pending_creates_from_mon "
	       << pending_creates_from_mon << dendl;
      do_sub_pg_creates = true;
      // mon-driven creates consume the spare budget first
      if (pending_creates_from_mon >= spare_pgs) {
	spare_pgs = pending_creates_from_mon = 0;
      } else {
	spare_pgs -= pending_creates_from_mon;
	pending_creates_from_mon = 0;
      }
    }
    // spend any remaining budget on osd-driven creates by forcing the
    // affected pgs to re-peer (see twiddle() above)
    auto pg = pending_creates_from_osd.cbegin();
    while (spare_pgs > 0 && pg != pending_creates_from_osd.cend()) {
      dout(20) << __func__ << " pg " << pg->first << dendl;
      vector<int> acting;
      osdmap->pg_to_up_acting_osds(pg->first, nullptr, nullptr, &acting, nullptr);
      service.queue_want_pg_temp(pg->first, twiddle(acting), true);
      pg = pending_creates_from_osd.erase(pg);
      do_sub_pg_creates = true;
      spare_pgs--;
    }
    have_pending_creates = (pending_creates_from_mon > 0 ||
			    !pending_creates_from_osd.empty());
  }

  bool do_renew_subs = false;
  if (do_sub_pg_creates) {
    if (monc->sub_want("osd_pg_creates", last_pg_create_epoch, 0)) {
      dout(4) << __func__ << ": resolicit pg creates from mon since "
	      << last_pg_create_epoch << dendl;
      do_renew_subs = true;
    }
  }
  version_t start = osdmap->get_epoch() + 1;
  if (have_pending_creates) {
    // don't miss any new osdmap deleting PGs
    if (monc->sub_want("osdmap", start, 0)) {
      dout(4) << __func__ << ": resolicit osdmap from mon since "
	      << start << dendl;
      do_renew_subs = true;
    }
  } else if (do_sub_pg_creates) {
    // no need to subscribe the osdmap continuously anymore
    // once the pgtemp and/or mon_subscribe(pg_creates) is sent
    if (monc->sub_want_increment("osdmap", start, CEPH_SUBSCRIBE_ONETIME)) {
      dout(4) << __func__ << ": re-subscribe osdmap(onetime) since "
	      << start << dendl;
      do_renew_subs = true;
    }
  }

  if (do_renew_subs) {
    monc->renew_subs();
  }

  service.send_pg_temp();
}
4882
// Build the initial pg_history_t and PastIntervals for a PG created at
// epoch `created`: seed all "same_*_since" markers and scrub stamps from
// the creation point, then replay every osdmap from created+1 up to the
// current epoch, recording each new interval.
//
// @param pgid          PG being created
// @param created       epoch the PG (and pool) was created in
// @param created_stamp creation time; used to seed the scrub stamps
// @param h             [out] history to populate
// @param pi            [out] past intervals to populate
void OSD::build_initial_pg_history(
  spg_t pgid,
  epoch_t created,
  utime_t created_stamp,
  pg_history_t *h,
  PastIntervals *pi)
{
  dout(10) << __func__ << " " << pgid << " created " << created << dendl;
  h->epoch_created = created;
  h->epoch_pool_created = created;
  h->same_interval_since = created;
  h->same_up_since = created;
  h->same_primary_since = created;
  h->last_scrub_stamp = created_stamp;
  h->last_deep_scrub_stamp = created_stamp;
  h->last_clean_scrub_stamp = created_stamp;

  // mapping as of the creation epoch; updated as we walk forward
  OSDMapRef lastmap = service.get_map(created);
  int up_primary, acting_primary;
  vector<int> up, acting;
  lastmap->pg_to_up_acting_osds(
    pgid.pgid, &up, &up_primary, &acting, &acting_primary);

  ostringstream debug;
  for (epoch_t e = created + 1; e <= osdmap->get_epoch(); ++e) {
    // NOTE: this local deliberately shadows the OSD::osdmap member for the
    // rest of the loop body; it is the map at epoch e, not the current map.
    OSDMapRef osdmap = service.get_map(e);
    int new_up_primary, new_acting_primary;
    vector<int> new_up, new_acting;
    osdmap->pg_to_up_acting_osds(
      pgid.pgid, &new_up, &new_up_primary, &new_acting, &new_acting_primary);

    // this is a bit imprecise, but sufficient?
    struct min_size_predicate_t : public IsPGRecoverablePredicate {
      const pg_pool_t *pi;
      bool operator()(const set<pg_shard_t> &have) const {
	return have.size() >= pi->min_size;
      }
      explicit min_size_predicate_t(const pg_pool_t *i) : pi(i) {}
    } min_size_predicate(osdmap->get_pg_pool(pgid.pgid.pool()));

    bool new_interval = PastIntervals::check_new_interval(
      acting_primary,
      new_acting_primary,
      acting, new_acting,
      up_primary,
      new_up_primary,
      up, new_up,
      h->same_interval_since,
      h->last_epoch_clean,
      osdmap,
      lastmap,
      pgid.pgid,
      &min_size_predicate,
      pi,
      &debug);
    if (new_interval) {
      h->same_interval_since = e;
      if (up != new_up) {
	h->same_up_since = e;
      }
      if (acting_primary != new_acting_primary) {
	h->same_primary_since = e;
      }
      // record a split if pg_num changed across this interval boundary
      if (pgid.pgid.is_split(lastmap->get_pg_num(pgid.pgid.pool()),
			     osdmap->get_pg_num(pgid.pgid.pool()),
			     nullptr)) {
	h->last_epoch_split = e;
      }
      up = new_up;
      acting = new_acting;
      up_primary = new_up_primary;
      acting_primary = new_acting_primary;
    }
    lastmap = osdmap;
  }
  dout(20) << __func__ << " " << debug.str() << dendl;
  dout(10) << __func__ << " " << *h << " " << *pi
	   << " [" << (pi->empty() ? pair<epoch_t,epoch_t>(0,0) :
		       pi->get_bounds()) << ")"
	   << dendl;
}
4964
4965 void OSD::_add_heartbeat_peer(int p)
4966 {
4967 if (p == whoami)
4968 return;
4969 HeartbeatInfo *hi;
4970
4971 map<int,HeartbeatInfo>::iterator i = heartbeat_peers.find(p);
4972 if (i == heartbeat_peers.end()) {
4973 pair<ConnectionRef,ConnectionRef> cons = service.get_con_osd_hb(p, osdmap->get_epoch());
4974 if (!cons.first)
4975 return;
4976 hi = &heartbeat_peers[p];
4977 hi->peer = p;
4978 RefCountedPtr s{new HeartbeatSession{p}, false};
4979 hi->hb_interval_start = ceph_clock_now();
4980 hi->con_back = cons.first.get();
4981 hi->con_back->set_priv(s);
4982 if (cons.second) {
4983 hi->con_front = cons.second.get();
4984 hi->con_front->set_priv(s);
4985 dout(10) << "_add_heartbeat_peer: new peer osd." << p
4986 << " " << hi->con_back->get_peer_addr()
4987 << " " << hi->con_front->get_peer_addr()
4988 << dendl;
4989 } else {
4990 hi->con_front.reset(NULL);
4991 dout(10) << "_add_heartbeat_peer: new peer osd." << p
4992 << " " << hi->con_back->get_peer_addr()
4993 << dendl;
4994 }
4995 } else {
4996 hi = &i->second;
4997 }
4998 hi->epoch = osdmap->get_epoch();
4999 }
5000
5001 void OSD::_remove_heartbeat_peer(int n)
5002 {
5003 map<int,HeartbeatInfo>::iterator q = heartbeat_peers.find(n);
5004 ceph_assert(q != heartbeat_peers.end());
5005 dout(20) << " removing heartbeat peer osd." << n
5006 << " " << q->second.con_back->get_peer_addr()
5007 << " " << (q->second.con_front ? q->second.con_front->get_peer_addr() : entity_addr_t())
5008 << dendl;
5009 q->second.con_back->mark_down();
5010 if (q->second.con_front) {
5011 q->second.con_front->mark_down();
5012 }
5013 heartbeat_peers.erase(q);
5014 }
5015
5016 void OSD::need_heartbeat_peer_update()
5017 {
5018 if (is_stopping())
5019 return;
5020 dout(20) << "need_heartbeat_peer_update" << dendl;
5021 heartbeat_set_peers_need_update();
5022 }
5023
// Recompute the heartbeat peer set if it has been flagged as needing an
// update (or force a resample once per heartbeat-grace period).  Builds
// the set from PG peers plus ring neighbors, tops it up to
// osd_heartbeat_min_peers, and trims down/extra peers.
// Caller must hold osd_lock; heartbeat_lock is taken internally for the
// actual peer-map mutation.
void OSD::maybe_update_heartbeat_peers()
{
  ceph_assert(osd_lock.is_locked());

  if (is_waiting_for_healthy() || is_active()) {
    utime_t now = ceph_clock_now();
    if (last_heartbeat_resample == utime_t()) {
      // first pass: force an initial peer computation
      last_heartbeat_resample = now;
      heartbeat_set_peers_need_update();
    } else if (!heartbeat_peers_need_update()) {
      utime_t dur = now - last_heartbeat_resample;
      if (dur > cct->_conf->osd_heartbeat_grace) {
	dout(10) << "maybe_update_heartbeat_peers forcing update after " << dur << " seconds" << dendl;
	heartbeat_set_peers_need_update();
	last_heartbeat_resample = now;
	// automatically clean up any stale heartbeat peers
	// if we are unhealthy, then clean all
	// (reset_heartbeat_peers takes heartbeat_lock itself; it is not
	// held yet at this point)
	reset_heartbeat_peers(is_waiting_for_healthy());
      }
    }
  }

  if (!heartbeat_peers_need_update())
    return;
  heartbeat_clear_peers_need_update();

  std::lock_guard l(heartbeat_lock);

  dout(10) << "maybe_update_heartbeat_peers updating" << dendl;


  // build heartbeat from set
  if (is_active()) {
    vector<PGRef> pgs;
    _get_pgs(&pgs);
    for (auto& pg : pgs) {
      pg->with_heartbeat_peers([&](int peer) {
	  if (osdmap->is_up(peer)) {
	    _add_heartbeat_peer(peer);
	  }
	});
    }
  }

  // include next and previous up osds to ensure we have a fully-connected set
  set<int> want, extras;
  const int next = osdmap->get_next_up_osd_after(whoami);
  if (next >= 0)
    want.insert(next);
  int prev = osdmap->get_previous_up_osd_before(whoami);
  if (prev >= 0 && prev != next)
    want.insert(prev);

  // make sure we have at least **min_down** osds coming from different
  // subtree level (e.g., hosts) for fast failure detection.
  auto min_down = cct->_conf.get_val<uint64_t>("mon_osd_min_down_reporters");
  auto subtree = cct->_conf.get_val<string>("mon_osd_reporter_subtree_level");
  osdmap->get_random_up_osds_by_subtree(
    whoami, subtree, min_down, want, &want);

  // the "want" peers are mandatory; also remember them in extras so the
  // trimming passes below can see them
  for (set<int>::iterator p = want.begin(); p != want.end(); ++p) {
    dout(10) << " adding neighbor peer osd." << *p << dendl;
    extras.insert(*p);
    _add_heartbeat_peer(*p);
  }

  // remove down peers; enumerate extras
  map<int,HeartbeatInfo>::iterator p = heartbeat_peers.begin();
  while (p != heartbeat_peers.end()) {
    if (!osdmap->is_up(p->first)) {
      int o = p->first;
      ++p;   // advance before erasing to keep the iterator valid
      _remove_heartbeat_peer(o);
      continue;
    }
    if (p->second.epoch < osdmap->get_epoch()) {
      // not refreshed this round; candidate for removal
      extras.insert(p->first);
    }
    ++p;
  }

  // too few?
  // walk the ring of up osds starting at `next` until we hit the minimum
  for (int n = next; n >= 0; ) {
    if ((int)heartbeat_peers.size() >= cct->_conf->osd_heartbeat_min_peers)
      break;
    if (!extras.count(n) && !want.count(n) && n != whoami) {
      dout(10) << " adding random peer osd." << n << dendl;
      extras.insert(n);
      _add_heartbeat_peer(n);
    }
    n = osdmap->get_next_up_osd_after(n);
    if (n == next)
      break;  // came full circle; stop
  }

  // too many?
  for (set<int>::iterator p = extras.begin();
       (int)heartbeat_peers.size() > cct->_conf->osd_heartbeat_min_peers && p != extras.end();
       ++p) {
    if (want.count(*p))
      continue;
    _remove_heartbeat_peer(*p);
  }

  dout(10) << "maybe_update_heartbeat_peers " << heartbeat_peers.size() << " peers, extras " << extras << dendl;
}
5130
5131 void OSD::reset_heartbeat_peers(bool all)
5132 {
5133 ceph_assert(osd_lock.is_locked());
5134 dout(10) << "reset_heartbeat_peers" << dendl;
5135 utime_t stale = ceph_clock_now();
5136 stale -= cct->_conf.get_val<int64_t>("osd_heartbeat_stale");
5137 std::lock_guard l(heartbeat_lock);
5138 for (auto it = heartbeat_peers.begin(); it != heartbeat_peers.end();) {
5139 HeartbeatInfo& hi = it->second;
5140 if (all || hi.is_stale(stale)) {
5141 hi.con_back->mark_down();
5142 if (hi.con_front) {
5143 hi.con_front->mark_down();
5144 }
5145 // stop sending failure_report to mon too
5146 failure_queue.erase(it->first);
5147 heartbeat_peers.erase(it++);
5148 } else {
5149 it++;
5150 }
5151 }
5152 }
5153
// Handle a heartbeat message from a peer OSD: answer PINGs, account
// PING_REPLYs against our outstanding ping history (updating per-peer
// latency stats), and react to YOU_DIED by fetching a newer osdmap.
// Consumes (puts) the message on every path.  heartbeat_lock is taken
// manually here, so every early return must Unlock first.
void OSD::handle_osd_ping(MOSDPing *m)
{
  if (superblock.cluster_fsid != m->fsid) {
    // not from our cluster; ignore
    dout(20) << "handle_osd_ping from " << m->get_source_inst()
	     << " bad fsid " << m->fsid << " != " << superblock.cluster_fsid << dendl;
    m->put();
    return;
  }

  int from = m->get_source().num();

  heartbeat_lock.Lock();
  if (is_stopping()) {
    heartbeat_lock.Unlock();
    m->put();
    return;
  }

  OSDMapRef curmap = service.get_osdmap();
  if (!curmap) {
    heartbeat_lock.Unlock();
    m->put();
    return;
  }

  switch (m->op) {

  case MOSDPing::PING:
    {
      // debug hook: probabilistically drop a run of incoming pings from
      // this peer to exercise failure detection
      if (cct->_conf->osd_debug_drop_ping_probability > 0) {
	auto heartbeat_drop = debug_heartbeat_drops_remaining.find(from);
	if (heartbeat_drop != debug_heartbeat_drops_remaining.end()) {
	  if (heartbeat_drop->second == 0) {
	    debug_heartbeat_drops_remaining.erase(heartbeat_drop);
	  } else {
	    --heartbeat_drop->second;
	    dout(5) << "Dropping heartbeat from " << from
		    << ", " << heartbeat_drop->second
		    << " remaining to drop" << dendl;
	    break;
	  }
	} else if (cct->_conf->osd_debug_drop_ping_probability >
	           ((((double)(rand()%100))/100.0))) {
	  heartbeat_drop =
	    debug_heartbeat_drops_remaining.insert(std::make_pair(from,
			     cct->_conf->osd_debug_drop_ping_duration)).first;
	  dout(5) << "Dropping heartbeat from " << from
		  << ", " << heartbeat_drop->second
		  << " remaining to drop" << dendl;
	  break;
	}
      }

      // if our own internal threads are wedged, don't ack; let peers
      // report us as failed
      if (!cct->get_heartbeat_map()->is_healthy()) {
	dout(10) << "internal heartbeat not healthy, dropping ping request" << dendl;
	break;
      }

      // reply, echoing the sender's stamp so it can match reply to ping
      Message *r = new MOSDPing(monc->get_fsid(),
				curmap->get_epoch(),
				MOSDPing::PING_REPLY, m->stamp,
				cct->_conf->osd_heartbeat_min_size);
      m->get_connection()->send_message(r);

      if (curmap->is_up(from)) {
	service.note_peer_epoch(from, m->map_epoch);
	if (is_active()) {
	  ConnectionRef con = service.get_con_osd_cluster(from, curmap->get_epoch());
	  if (con) {
	    service.share_map_peer(from, con.get());
	  }
	}
      } else if (!curmap->exists(from) ||
		 curmap->get_down_at(from) > m->map_epoch) {
	// tell them they have died
	Message *r = new MOSDPing(monc->get_fsid(),
				  curmap->get_epoch(),
				  MOSDPing::YOU_DIED,
				  m->stamp,
				  cct->_conf->osd_heartbeat_min_size);
	m->get_connection()->send_message(r);
      }
    }
    break;

  case MOSDPing::PING_REPLY:
    {
      map<int,HeartbeatInfo>::iterator i = heartbeat_peers.find(from);
      if (i != heartbeat_peers.end()) {
	// look up the outstanding ping this reply corresponds to, keyed
	// by the stamp we sent (and the peer echoed back)
	auto acked = i->second.ping_history.find(m->stamp);
	if (acked != i->second.ping_history.end()) {
	  utime_t now = ceph_clock_now();
	  // replies still expected for this ping (one per connection)
	  int &unacknowledged = acked->second.second;
	  if (m->get_connection() == i->second.con_back) {
	    dout(25) << "handle_osd_ping got reply from osd." << from
		     << " first_tx " << i->second.first_tx
		     << " last_tx " << i->second.last_tx
		     << " last_rx_back " << i->second.last_rx_back << " -> " << now
		     << " last_rx_front " << i->second.last_rx_front
		     << dendl;
	    i->second.last_rx_back = now;
	    ceph_assert(unacknowledged > 0);
	    --unacknowledged;
	    // if there is no front con, set both stamps.
	    if (i->second.con_front == NULL) {
	      i->second.last_rx_front = now;
	      ceph_assert(unacknowledged > 0);
	      --unacknowledged;
	    }
	  } else if (m->get_connection() == i->second.con_front) {
	    dout(25) << "handle_osd_ping got reply from osd." << from
		     << " first_tx " << i->second.first_tx
		     << " last_tx " << i->second.last_tx
		     << " last_rx_back " << i->second.last_rx_back
		     << " last_rx_front " << i->second.last_rx_front << " -> " << now
		     << dendl;
	    i->second.last_rx_front = now;
	    ceph_assert(unacknowledged > 0);
	    --unacknowledged;
	  }

	  if (unacknowledged == 0) {
	    // succeeded in getting all replies
	    dout(25) << "handle_osd_ping got all replies from osd." << from
		     << " , erase pending ping(sent at " << m->stamp << ")"
		     << " and older pending ping(s)"
		     << dendl;

	    // fold this round-trip into the per-peer min/max/avg counters
#define ROUND_S_TO_USEC(sec) (uint32_t)((sec) * 1000 * 1000 + 0.5)
	    ++i->second.hb_average_count;
	    uint32_t back_pingtime = ROUND_S_TO_USEC(i->second.last_rx_back - m->stamp);
	    i->second.hb_total_back += back_pingtime;
	    if (back_pingtime < i->second.hb_min_back)
	      i->second.hb_min_back = back_pingtime;
	    if (back_pingtime > i->second.hb_max_back)
	      i->second.hb_max_back = back_pingtime;
	    uint32_t front_pingtime = ROUND_S_TO_USEC(i->second.last_rx_front - m->stamp);
	    i->second.hb_total_front += front_pingtime;
	    if (front_pingtime < i->second.hb_min_front)
	      i->second.hb_min_front = front_pingtime;
	    if (front_pingtime > i->second.hb_max_front)
	      i->second.hb_max_front = front_pingtime;

	    ceph_assert(i->second.hb_interval_start != utime_t());
	    // NOTE(review): the branch below is unreachable after the
	    // assert above (same condition, negated); it looks like a
	    // leftover fallback for non-assert builds
	    if (i->second.hb_interval_start == utime_t())
	      i->second.hb_interval_start = now;
	    // interval over which averages are accumulated; shortened
	    // only in testing builds
	    int64_t hb_avg_time_period = 60;
	    if (cct->_conf.get_val<int64_t>("debug_heartbeat_testing_span")) {
	      hb_avg_time_period = cct->_conf.get_val<int64_t>("debug_heartbeat_testing_span");
	    }
	    if (now - i->second.hb_interval_start >= utime_t(hb_avg_time_period, 0)) {
	      // interval elapsed: snapshot avg/min/max, then reset the
	      // accumulators for the next interval
	      uint32_t back_avg = i->second.hb_total_back / i->second.hb_average_count;
	      uint32_t back_min = i->second.hb_min_back;
	      uint32_t back_max = i->second.hb_max_back;
	      uint32_t front_avg = i->second.hb_total_front / i->second.hb_average_count;
	      uint32_t front_min = i->second.hb_min_front;
	      uint32_t front_max = i->second.hb_max_front;

	      // Reset for new interval
	      i->second.hb_average_count = 0;
	      i->second.hb_interval_start = now;
	      i->second.hb_total_back = i->second.hb_max_back = 0;
	      i->second.hb_min_back = UINT_MAX;
	      i->second.hb_total_front = i->second.hb_max_front = 0;
	      i->second.hb_min_front = UINT_MAX;

	      // Record per osd interace ping times
	      // Based on osd_heartbeat_interval ignoring that it is randomly short than this interval
	      if (i->second.hb_back_pingtime.size() == 0) {
		// first interval: fill the whole ring with this sample
		ceph_assert(i->second.hb_front_pingtime.size() == 0);
		for (unsigned k = 0 ; k < hb_vector_size; ++k) {
		  i->second.hb_back_pingtime.push_back(back_avg);
		  i->second.hb_back_min.push_back(back_min);
		  i->second.hb_back_max.push_back(back_max);
		  i->second.hb_front_pingtime.push_back(front_avg);
		  i->second.hb_front_min.push_back(front_min);
		  i->second.hb_front_max.push_back(front_max);
		  ++i->second.hb_index;
		}
	      } else {
		// ring buffer overwrite (hb_vector_size is presumably a
		// power of two, given the mask here — TODO confirm)
		int index = i->second.hb_index & (hb_vector_size - 1);
		i->second.hb_back_pingtime[index] = back_avg;
		i->second.hb_back_min[index] = back_min;
		i->second.hb_back_max[index] = back_max;
		i->second.hb_front_pingtime[index] = front_avg;
		i->second.hb_front_min[index] = front_min;
		i->second.hb_front_max[index] = front_max;
		++i->second.hb_index;
	      }

	      {
		// publish 1/5/15-interval rollups into osd_stat for mon/mgr
		std::lock_guard l(service.stat_lock);
		service.osd_stat.hb_pingtime[from].last_update = now.sec();
		service.osd_stat.hb_pingtime[from].back_last = back_pingtime;

		uint32_t total = 0;
		uint32_t min = UINT_MAX;
		uint32_t max = 0;
		uint32_t count = 0;
		uint32_t which = 0;
		uint32_t size = (uint32_t)i->second.hb_back_pingtime.size();
		// walk backwards from the newest ring entry
		for (int32_t k = size - 1 ; k >= 0; --k) {
		  ++count;
		  int index = (i->second.hb_index + k) % size;
		  total += i->second.hb_back_pingtime[index];
		  if (i->second.hb_back_min[index] < min)
		    min = i->second.hb_back_min[index];
		  if (i->second.hb_back_max[index] > max)
		    max = i->second.hb_back_max[index];
		  if (count == 1 || count == 5 || count == 15) {
		    service.osd_stat.hb_pingtime[from].back_pingtime[which] = total / count;
		    service.osd_stat.hb_pingtime[from].back_min[which] = min;
		    service.osd_stat.hb_pingtime[from].back_max[which] = max;
		    which++;
		    if (count == 15)
		      break;
		  }
		}

		if (i->second.con_front != NULL) {
		  // same rollup for the front network
		  service.osd_stat.hb_pingtime[from].front_last = front_pingtime;

		  total = 0;
		  min = UINT_MAX;
		  max = 0;
		  count = 0;
		  which = 0;
		  for (int32_t k = size - 1 ; k >= 0; --k) {
		    ++count;
		    int index = (i->second.hb_index + k) % size;
		    total += i->second.hb_front_pingtime[index];
		    if (i->second.hb_front_min[index] < min)
		      min = i->second.hb_front_min[index];
		    if (i->second.hb_front_max[index] > max)
		      max = i->second.hb_front_max[index];
		    if (count == 1 || count == 5 || count == 15) {
		      service.osd_stat.hb_pingtime[from].front_pingtime[which] = total / count;
		      service.osd_stat.hb_pingtime[from].front_min[which] = min;
		      service.osd_stat.hb_pingtime[from].front_max[which] = max;
		      which++;
		      if (count == 15)
			break;
		    }
		  }
		}
	      }
	    } else {
	      // interval not elapsed: just refresh the "last" samples
	      std::lock_guard l(service.stat_lock);
	      service.osd_stat.hb_pingtime[from].back_last = back_pingtime;
	      if (i->second.con_front != NULL)
		service.osd_stat.hb_pingtime[from].front_last = front_pingtime;
	    }
	    // this ping and everything older is now fully acknowledged
	    i->second.ping_history.erase(i->second.ping_history.begin(), ++acked);
	  }

	  if (i->second.is_healthy(now)) {
	    // Cancel false reports
	    auto failure_queue_entry = failure_queue.find(from);
	    if (failure_queue_entry != failure_queue.end()) {
	      dout(10) << "handle_osd_ping canceling queued "
		       << "failure report for osd." << from << dendl;
	      failure_queue.erase(failure_queue_entry);
	    }

	    auto failure_pending_entry = failure_pending.find(from);
	    if (failure_pending_entry != failure_pending.end()) {
	      dout(10) << "handle_osd_ping canceling in-flight "
		       << "failure report for osd." << from << dendl;
	      send_still_alive(curmap->get_epoch(),
			       from,
			       failure_pending_entry->second.second);
	      failure_pending.erase(failure_pending_entry);
	    }
	  }
	} else {
	  // old replies, deprecated by newly sent pings.
	  dout(10) << "handle_osd_ping no pending ping(sent at " << m->stamp
		   << ") is found, treat as covered by newly sent pings "
		   << "and ignore"
		   << dendl;
	}
      }

      if (m->map_epoch &&
	  curmap->is_up(from)) {
	service.note_peer_epoch(from, m->map_epoch);
	if (is_active()) {
	  ConnectionRef con = service.get_con_osd_cluster(from, curmap->get_epoch());
	  if (con) {
	    service.share_map_peer(from, con.get());
	  }
	}
      }
    }
    break;

  case MOSDPing::YOU_DIED:
    // a peer believes we are marked down; catch up on maps
    dout(10) << "handle_osd_ping " << m->get_source_inst()
	     << " says i am down in " << m->map_epoch << dendl;
    osdmap_subscribe(curmap->get_epoch()+1, false);
    break;
  }

  heartbeat_lock.Unlock();
  m->put();
}
5460
5461 void OSD::heartbeat_entry()
5462 {
5463 std::lock_guard l(heartbeat_lock);
5464 if (is_stopping())
5465 return;
5466 while (!heartbeat_stop) {
5467 heartbeat();
5468
5469 double wait;
5470 if (cct->_conf.get_val<bool>("debug_disable_randomized_ping")) {
5471 wait = (float)cct->_conf->osd_heartbeat_interval;
5472 } else {
5473 wait = .5 + ((float)(rand() % 10)/10.0) * (float)cct->_conf->osd_heartbeat_interval;
5474 }
5475 utime_t w;
5476 w.set_from_double(wait);
5477 dout(30) << "heartbeat_entry sleeping for " << wait << dendl;
5478 heartbeat_cond.WaitInterval(heartbeat_lock, w);
5479 if (is_stopping())
5480 return;
5481 dout(30) << "heartbeat_entry woke up" << dendl;
5482 }
5483 }
5484
5485 void OSD::heartbeat_check()
5486 {
5487 ceph_assert(heartbeat_lock.is_locked());
5488 utime_t now = ceph_clock_now();
5489
5490 // check for incoming heartbeats (move me elsewhere?)
5491 for (map<int,HeartbeatInfo>::iterator p = heartbeat_peers.begin();
5492 p != heartbeat_peers.end();
5493 ++p) {
5494
5495 if (p->second.first_tx == utime_t()) {
5496 dout(25) << "heartbeat_check we haven't sent ping to osd." << p->first
5497 << " yet, skipping" << dendl;
5498 continue;
5499 }
5500
5501 dout(25) << "heartbeat_check osd." << p->first
5502 << " first_tx " << p->second.first_tx
5503 << " last_tx " << p->second.last_tx
5504 << " last_rx_back " << p->second.last_rx_back
5505 << " last_rx_front " << p->second.last_rx_front
5506 << dendl;
5507 if (p->second.is_unhealthy(now)) {
5508 utime_t oldest_deadline = p->second.ping_history.begin()->second.first;
5509 if (p->second.last_rx_back == utime_t() ||
5510 p->second.last_rx_front == utime_t()) {
5511 derr << "heartbeat_check: no reply from "
5512 << p->second.con_front->get_peer_addr().get_sockaddr()
5513 << " osd." << p->first
5514 << " ever on either front or back, first ping sent "
5515 << p->second.first_tx
5516 << " (oldest deadline " << oldest_deadline << ")"
5517 << dendl;
5518 // fail
5519 failure_queue[p->first] = p->second.first_tx;
5520 } else {
5521 derr << "heartbeat_check: no reply from "
5522 << p->second.con_front->get_peer_addr().get_sockaddr()
5523 << " osd." << p->first << " since back " << p->second.last_rx_back
5524 << " front " << p->second.last_rx_front
5525 << " (oldest deadline " << oldest_deadline << ")"
5526 << dendl;
5527 // fail
5528 failure_queue[p->first] = std::min(p->second.last_rx_back, p->second.last_rx_front);
5529 }
5530 }
5531 }
5532 }
5533
5534 void OSD::heartbeat()
5535 {
5536 ceph_assert(heartbeat_lock.is_locked_by_me());
5537 dout(30) << "heartbeat" << dendl;
5538
5539 // get CPU load avg
5540 double loadavgs[1];
5541 int hb_interval = cct->_conf->osd_heartbeat_interval;
5542 int n_samples = 86400;
5543 if (hb_interval > 1) {
5544 n_samples /= hb_interval;
5545 if (n_samples < 1)
5546 n_samples = 1;
5547 }
5548
5549 if (getloadavg(loadavgs, 1) == 1) {
5550 logger->set(l_osd_loadavg, 100 * loadavgs[0]);
5551 daily_loadavg = (daily_loadavg * (n_samples - 1) + loadavgs[0]) / n_samples;
5552 dout(30) << "heartbeat: daily_loadavg " << daily_loadavg << dendl;
5553 }
5554
5555 dout(30) << "heartbeat checking stats" << dendl;
5556
5557 // refresh peer list and osd stats
5558 vector<int> hb_peers;
5559 for (map<int,HeartbeatInfo>::iterator p = heartbeat_peers.begin();
5560 p != heartbeat_peers.end();
5561 ++p)
5562 hb_peers.push_back(p->first);
5563
5564 auto new_stat = service.set_osd_stat(hb_peers, get_num_pgs());
5565 dout(5) << __func__ << " " << new_stat << dendl;
5566 ceph_assert(new_stat.statfs.total);
5567
5568 float pratio;
5569 float ratio = service.compute_adjusted_ratio(new_stat, &pratio);
5570
5571 service.check_full_status(ratio, pratio);
5572
5573 utime_t now = ceph_clock_now();
5574 utime_t deadline = now;
5575 deadline += cct->_conf->osd_heartbeat_grace;
5576
5577 // send heartbeats
5578 for (map<int,HeartbeatInfo>::iterator i = heartbeat_peers.begin();
5579 i != heartbeat_peers.end();
5580 ++i) {
5581 int peer = i->first;
5582 i->second.last_tx = now;
5583 if (i->second.first_tx == utime_t())
5584 i->second.first_tx = now;
5585 i->second.ping_history[now] = make_pair(deadline,
5586 HeartbeatInfo::HEARTBEAT_MAX_CONN);
5587 if (i->second.hb_interval_start == utime_t())
5588 i->second.hb_interval_start = now;
5589 dout(30) << "heartbeat sending ping to osd." << peer << dendl;
5590 i->second.con_back->send_message(new MOSDPing(monc->get_fsid(),
5591 service.get_osdmap_epoch(),
5592 MOSDPing::PING, now,
5593 cct->_conf->osd_heartbeat_min_size));
5594
5595 if (i->second.con_front)
5596 i->second.con_front->send_message(new MOSDPing(monc->get_fsid(),
5597 service.get_osdmap_epoch(),
5598 MOSDPing::PING, now,
5599 cct->_conf->osd_heartbeat_min_size));
5600 }
5601
5602 logger->set(l_osd_hb_to, heartbeat_peers.size());
5603
5604 // hmm.. am i all alone?
5605 dout(30) << "heartbeat lonely?" << dendl;
5606 if (heartbeat_peers.empty()) {
5607 if (now - last_mon_heartbeat > cct->_conf->osd_mon_heartbeat_interval && is_active()) {
5608 last_mon_heartbeat = now;
5609 dout(10) << "i have no heartbeat peers; checking mon for new map" << dendl;
5610 osdmap_subscribe(osdmap->get_epoch() + 1, false);
5611 }
5612 }
5613
5614 dout(30) << "heartbeat done" << dendl;
5615 }
5616
// Messenger callback: a heartbeat connection was reset.  If it belongs to
// a tracked peer, tear down both of that peer's connections and try to
// reopen them; if no address is available anymore (osdmap raced ahead),
// drop the peer entirely.  Always returns true (event handled).
bool OSD::heartbeat_reset(Connection *con)
{
  std::lock_guard l(heartbeat_lock);
  // detach the session from the dead connection; `s` keeps it alive
  auto s = con->get_priv();
  con->set_priv(nullptr);
  if (s) {
    if (is_stopping()) {
      return true;
    }
    auto heartbeat_session = static_cast<HeartbeatSession*>(s.get());
    auto p = heartbeat_peers.find(heartbeat_session->peer);
    if (p != heartbeat_peers.end() &&
	(p->second.con_back == con ||
	 p->second.con_front == con)) {
      dout(10) << "heartbeat_reset failed hb con " << con << " for osd." << p->second.peer
	       << ", reopening" << dendl;
      // drop BOTH connections (the sibling is marked down explicitly;
      // `con` itself is already reset)
      if (con != p->second.con_back) {
	p->second.con_back->mark_down();
      }
      p->second.con_back.reset(NULL);
      if (p->second.con_front && con != p->second.con_front) {
	p->second.con_front->mark_down();
      }
      p->second.con_front.reset(NULL);
      pair<ConnectionRef,ConnectionRef> newcon = service.get_con_osd_hb(p->second.peer, p->second.epoch);
      if (newcon.first) {
	// re-attach the same session to the replacement connections
	p->second.con_back = newcon.first.get();
	p->second.con_back->set_priv(s);
	if (newcon.second) {
	  p->second.con_front = newcon.second.get();
	  p->second.con_front->set_priv(s);
	}
	// outstanding pings were sent on the dead connections; forget them
	p->second.ping_history.clear();
      } else {
	dout(10) << "heartbeat_reset failed hb con " << con << " for osd." << p->second.peer
		 << ", raced with osdmap update, closing out peer" << dendl;
	heartbeat_peers.erase(p);
      }
    } else {
      // stale connection from a previously-removed or re-peered osd
      dout(10) << "heartbeat_reset closing (old) failed hb con " << con << dendl;
    }
  }
  return true;
}
5661
5662
5663
5664 // =========================================
5665
// Periodic tick run with osd_lock held: refresh heartbeat peers, retry
// boot while waiting to become healthy, poke the mon for maps when
// booting, drain waiting ops, and reschedule itself.
// NOTE: the state predicates are deliberately re-evaluated between steps
// — start_boot() may change the OSD state before the next check.
void OSD::tick()
{
  ceph_assert(osd_lock.is_locked());
  dout(10) << "tick" << dendl;

  if (is_active() || is_waiting_for_healthy()) {
    maybe_update_heartbeat_peers();
  }

  if (is_waiting_for_healthy()) {
    start_boot();
  }

  if (is_waiting_for_healthy() || is_booting()) {
    std::lock_guard l(heartbeat_lock);
    utime_t now = ceph_clock_now();
    if (now - last_mon_heartbeat > cct->_conf->osd_mon_heartbeat_interval) {
      last_mon_heartbeat = now;
      dout(1) << __func__ << " checking mon for new map" << dendl;
      osdmap_subscribe(osdmap->get_epoch() + 1, false);
    }
  }

  do_waiters();

  // reschedule the next tick
  tick_timer.add_event_after(get_tick_interval(), new C_Tick(this));
}
5693
// Periodic tick that runs WITHOUT osd_lock (only tick_timer_lock is
// held): refresh perf counters and store statfs, run heartbeat checks and
// mon reporting, schedule scrub/recovery housekeeping, send the beacon,
// and reschedule itself.
void OSD::tick_without_osd_lock()
{
  ceph_assert(tick_timer_lock.is_locked());
  dout(10) << "tick_without_osd_lock" << dendl;

  logger->set(l_osd_cached_crc, buffer::get_cached_crc());
  logger->set(l_osd_cached_crc_adjusted, buffer::get_cached_crc_adjusted());
  logger->set(l_osd_missed_crc, buffer::get_missed_crc());

  // refresh osd stats
  struct store_statfs_t stbuf;
  osd_alert_list_t alerts;
  int r = store->statfs(&stbuf, &alerts);
  ceph_assert(r == 0);
  service.set_statfs(stbuf, alerts);

  // osd_lock is not being held, which means the OSD state
  // might change when doing the monitor report
  if (is_active() || is_waiting_for_healthy()) {
    heartbeat_lock.Lock();
    heartbeat_check();
    heartbeat_lock.Unlock();

    map_lock.get_read();
    std::lock_guard l(mon_report_lock);

    // mon report?
    utime_t now = ceph_clock_now();
    if (service.need_fullness_update() ||
	now - last_mon_report > cct->_conf->osd_mon_report_interval) {
      last_mon_report = now;
      send_full_update();
      send_failures();
    }
    map_lock.put_read();

    // if any shard is waiting on a map newer than what we have, ask for it
    epoch_t max_waiting_epoch = 0;
    for (auto s : shards) {
      max_waiting_epoch = std::max(max_waiting_epoch,
				   s->get_max_waiting_epoch());
    }
    if (max_waiting_epoch > get_osdmap()->get_epoch()) {
      dout(20) << __func__ << " max_waiting_epoch " << max_waiting_epoch
	       << ", requesting new map" << dendl;
      osdmap_subscribe(superblock.newest_map + 1, false);
    }
  }

  if (is_active()) {
    if (!scrub_random_backoff()) {
      sched_scrub();
    }
    service.promote_throttle_recalibrate();
    resume_creating_pg();
    bool need_send_beacon = false;
    const auto now = ceph::coarse_mono_clock::now();
    {
      // borrow lec lock to protect last_sent_beacon from changing
      std::lock_guard l{min_last_epoch_clean_lock};
      const auto elapsed = now - last_sent_beacon;
      if (chrono::duration_cast<chrono::seconds>(elapsed).count() >
	cct->_conf->osd_beacon_report_interval) {
	need_send_beacon = true;
      }
    }
    if (need_send_beacon) {
      // sent outside min_last_epoch_clean_lock to avoid holding it while
      // messaging the mon
      send_beacon(now);
    }
  }

  mgrc.update_daemon_health(get_health_metrics());
  service.kick_recovery_queue();
  // reschedule the next tick
  tick_timer_without_osd_lock.add_event_after(get_tick_interval(),
					      new C_Tick_WithoutOSDLock(this));
}
5769
5770 // Usage:
5771 // setomapval <pool-id> [namespace/]<obj-name> <key> <val>
5772 // rmomapkey <pool-id> [namespace/]<obj-name> <key>
5773 // setomapheader <pool-id> [namespace/]<obj-name> <header>
5774 // getomap <pool> [namespace/]<obj-name>
5775 // truncobj <pool-id> [namespace/]<obj-name> <newlen>
5776 // injectmdataerr [namespace/]<obj-name> [shardid]
5777 // injectdataerr [namespace/]<obj-name> [shardid]
5778 //
5779 // set_recovery_delay [utime]
5780 void TestOpsSocketHook::test_ops(OSDService *service, ObjectStore *store,
5781 std::string_view command,
5782 const cmdmap_t& cmdmap, ostream &ss)
5783 {
5784 //Test support
5785 //Support changing the omap on a single osd by using the Admin Socket to
5786 //directly request the osd make a change.
5787 if (command == "setomapval" || command == "rmomapkey" ||
5788 command == "setomapheader" || command == "getomap" ||
5789 command == "truncobj" || command == "injectmdataerr" ||
5790 command == "injectdataerr"
5791 ) {
5792 pg_t rawpg;
5793 int64_t pool;
5794 OSDMapRef curmap = service->get_osdmap();
5795 int r = -1;
5796
5797 string poolstr;
5798
5799 cmd_getval(service->cct, cmdmap, "pool", poolstr);
5800 pool = curmap->lookup_pg_pool_name(poolstr);
5801 //If we can't find it by name then maybe id specified
5802 if (pool < 0 && isdigit(poolstr[0]))
5803 pool = atoll(poolstr.c_str());
5804 if (pool < 0) {
5805 ss << "Invalid pool '" << poolstr << "''";
5806 return;
5807 }
5808
5809 string objname, nspace;
5810 cmd_getval(service->cct, cmdmap, "objname", objname);
5811 std::size_t found = objname.find_first_of('/');
5812 if (found != string::npos) {
5813 nspace = objname.substr(0, found);
5814 objname = objname.substr(found+1);
5815 }
5816 object_locator_t oloc(pool, nspace);
5817 r = curmap->object_locator_to_pg(object_t(objname), oloc, rawpg);
5818
5819 if (r < 0) {
5820 ss << "Invalid namespace/objname";
5821 return;
5822 }
5823
5824 int64_t shardid;
5825 cmd_getval(service->cct, cmdmap, "shardid", shardid, int64_t(shard_id_t::NO_SHARD));
5826 hobject_t obj(object_t(objname), string(""), CEPH_NOSNAP, rawpg.ps(), pool, nspace);
5827 ghobject_t gobj(obj, ghobject_t::NO_GEN, shard_id_t(uint8_t(shardid)));
5828 spg_t pgid(curmap->raw_pg_to_pg(rawpg), shard_id_t(shardid));
5829 if (curmap->pg_is_ec(rawpg)) {
5830 if ((command != "injectdataerr") && (command != "injectmdataerr")) {
5831 ss << "Must not call on ec pool, except injectdataerr or injectmdataerr";
5832 return;
5833 }
5834 }
5835
5836 ObjectStore::Transaction t;
5837
5838 if (command == "setomapval") {
5839 map<string, bufferlist> newattrs;
5840 bufferlist val;
5841 string key, valstr;
5842 cmd_getval(service->cct, cmdmap, "key", key);
5843 cmd_getval(service->cct, cmdmap, "val", valstr);
5844
5845 val.append(valstr);
5846 newattrs[key] = val;
5847 t.omap_setkeys(coll_t(pgid), ghobject_t(obj), newattrs);
5848 r = store->queue_transaction(service->meta_ch, std::move(t));
5849 if (r < 0)
5850 ss << "error=" << r;
5851 else
5852 ss << "ok";
5853 } else if (command == "rmomapkey") {
5854 string key;
5855 set<string> keys;
5856 cmd_getval(service->cct, cmdmap, "key", key);
5857
5858 keys.insert(key);
5859 t.omap_rmkeys(coll_t(pgid), ghobject_t(obj), keys);
5860 r = store->queue_transaction(service->meta_ch, std::move(t));
5861 if (r < 0)
5862 ss << "error=" << r;
5863 else
5864 ss << "ok";
5865 } else if (command == "setomapheader") {
5866 bufferlist newheader;
5867 string headerstr;
5868
5869 cmd_getval(service->cct, cmdmap, "header", headerstr);
5870 newheader.append(headerstr);
5871 t.omap_setheader(coll_t(pgid), ghobject_t(obj), newheader);
5872 r = store->queue_transaction(service->meta_ch, std::move(t));
5873 if (r < 0)
5874 ss << "error=" << r;
5875 else
5876 ss << "ok";
5877 } else if (command == "getomap") {
5878 //Debug: Output entire omap
5879 bufferlist hdrbl;
5880 map<string, bufferlist> keyvals;
5881 auto ch = store->open_collection(coll_t(pgid));
5882 if (!ch) {
5883 ss << "unable to open collection for " << pgid;
5884 r = -ENOENT;
5885 } else {
5886 r = store->omap_get(ch, ghobject_t(obj), &hdrbl, &keyvals);
5887 if (r >= 0) {
5888 ss << "header=" << string(hdrbl.c_str(), hdrbl.length());
5889 for (map<string, bufferlist>::iterator it = keyvals.begin();
5890 it != keyvals.end(); ++it)
5891 ss << " key=" << (*it).first << " val="
5892 << string((*it).second.c_str(), (*it).second.length());
5893 } else {
5894 ss << "error=" << r;
5895 }
5896 }
5897 } else if (command == "truncobj") {
5898 int64_t trunclen;
5899 cmd_getval(service->cct, cmdmap, "len", trunclen);
5900 t.truncate(coll_t(pgid), ghobject_t(obj), trunclen);
5901 r = store->queue_transaction(service->meta_ch, std::move(t));
5902 if (r < 0)
5903 ss << "error=" << r;
5904 else
5905 ss << "ok";
5906 } else if (command == "injectdataerr") {
5907 store->inject_data_error(gobj);
5908 ss << "ok";
5909 } else if (command == "injectmdataerr") {
5910 store->inject_mdata_error(gobj);
5911 ss << "ok";
5912 }
5913 return;
5914 }
5915 if (command == "set_recovery_delay") {
5916 int64_t delay;
5917 cmd_getval(service->cct, cmdmap, "utime", delay, (int64_t)0);
5918 ostringstream oss;
5919 oss << delay;
5920 int r = service->cct->_conf.set_val("osd_recovery_delay_start",
5921 oss.str().c_str());
5922 if (r != 0) {
5923 ss << "set_recovery_delay: error setting "
5924 << "osd_recovery_delay_start to '" << delay << "': error "
5925 << r;
5926 return;
5927 }
5928 service->cct->_conf.apply_changes(nullptr);
5929 ss << "set_recovery_delay: set osd_recovery_delay_start "
5930 << "to " << service->cct->_conf->osd_recovery_delay_start;
5931 return;
5932 }
5933 if (command == "trigger_scrub" || command == "trigger_deep_scrub") {
5934 spg_t pgid;
5935 bool deep = (command == "trigger_deep_scrub");
5936 OSDMapRef curmap = service->get_osdmap();
5937
5938 string pgidstr;
5939
5940 cmd_getval(service->cct, cmdmap, "pgid", pgidstr);
5941 if (!pgid.parse(pgidstr.c_str())) {
5942 ss << "Invalid pgid specified";
5943 return;
5944 }
5945
5946 int64_t time;
5947 cmd_getval(service->cct, cmdmap, "time", time, (int64_t)0);
5948
5949 PGRef pg = service->osd->_lookup_lock_pg(pgid);
5950 if (pg == nullptr) {
5951 ss << "Can't find pg " << pgid;
5952 return;
5953 }
5954
5955 if (pg->is_primary()) {
5956 pg->unreg_next_scrub();
5957 const pg_pool_t *p = curmap->get_pg_pool(pgid.pool());
5958 double pool_scrub_max_interval = 0;
5959 double scrub_max_interval;
5960 if (deep) {
5961 p->opts.get(pool_opts_t::DEEP_SCRUB_INTERVAL, &pool_scrub_max_interval);
5962 scrub_max_interval = pool_scrub_max_interval > 0 ?
5963 pool_scrub_max_interval : g_conf()->osd_deep_scrub_interval;
5964 } else {
5965 p->opts.get(pool_opts_t::SCRUB_MAX_INTERVAL, &pool_scrub_max_interval);
5966 scrub_max_interval = pool_scrub_max_interval > 0 ?
5967 pool_scrub_max_interval : g_conf()->osd_scrub_max_interval;
5968 }
5969 // Instead of marking must_scrub force a schedule scrub
5970 utime_t stamp = ceph_clock_now();
5971 if (time == 0)
5972 stamp -= scrub_max_interval;
5973 else
5974 stamp -= (float)time;
5975 stamp -= 100.0; // push back last scrub more for good measure
5976 if (deep) {
5977 pg->set_last_deep_scrub_stamp(stamp);
5978 } else {
5979 pg->set_last_scrub_stamp(stamp);
5980 }
5981 pg->reg_next_scrub();
5982 pg->publish_stats_to_osd();
5983 ss << "ok - set" << (deep ? " deep" : "" ) << " stamp " << stamp;
5984 } else {
5985 ss << "Not primary";
5986 }
5987 pg->unlock();
5988 return;
5989 }
5990 if (command == "injectfull") {
5991 int64_t count;
5992 string type;
5993 OSDService::s_names state;
5994 cmd_getval(service->cct, cmdmap, "type", type, string("full"));
5995 cmd_getval(service->cct, cmdmap, "count", count, (int64_t)-1);
5996 if (type == "none" || count == 0) {
5997 type = "none";
5998 count = 0;
5999 }
6000 state = service->get_full_state(type);
6001 if (state == OSDService::s_names::INVALID) {
6002 ss << "Invalid type use (none, nearfull, backfillfull, full, failsafe)";
6003 return;
6004 }
6005 service->set_injectfull(state, count);
6006 return;
6007 }
6008 ss << "Internal error - command=" << command;
6009 }
6010
6011 // =========================================
6012
// Messenger callback: an outgoing connection has been established.
// We only act on the mon connection: a new mon session means the mon
// has forgotten everything we previously reported, so depending on our
// boot state we either (re)start booting or resend all of our
// mon-visible state (fullness, pg_temp, failures, beacon, ...).
void OSD::ms_handle_connect(Connection *con)
{
  dout(10) << __func__ << " con " << con << dendl;
  if (con->get_peer_type() == CEPH_ENTITY_TYPE_MON) {
    std::lock_guard l(osd_lock);
    if (is_stopping())
      return;
    dout(10) << __func__ << " on mon" << dendl;

    if (is_preboot()) {
      start_boot();
    } else if (is_booting()) {
      _send_boot(); // resend boot message
    } else {
      // active (or later): resend all periodic/mon-tracked state under
      // map_lock (read) + mon_report_lock
      map_lock.get_read();
      std::lock_guard l2(mon_report_lock);

      utime_t now = ceph_clock_now();
      last_mon_report = now;

      // resend everything, it's a new session
      send_full_update();
      send_alive();
      service.requeue_pg_temp();
      service.clear_sent_ready_to_merge();
      service.send_pg_temp();
      service.send_ready_to_merge();
      service.send_pg_created();
      requeue_failures();
      send_failures();

      map_lock.put_read();
      if (is_active()) {
        send_beacon(ceph::coarse_mono_clock::now());
      }
    }

    // full map requests may happen while active or pre-boot
    if (requested_full_first) {
      rerequest_full_maps();
    }
  }
}
6056
6057 void OSD::ms_handle_fast_connect(Connection *con)
6058 {
6059 if (con->get_peer_type() != CEPH_ENTITY_TYPE_MON &&
6060 con->get_peer_type() != CEPH_ENTITY_TYPE_MGR) {
6061 auto priv = con->get_priv();
6062 auto s = static_cast<Session*>(priv.get());
6063 if (!s) {
6064 s = new Session{cct, con};
6065 con->set_priv(RefCountedPtr{s, false});
6066 dout(10) << " new session (outgoing) " << s << " con=" << s->con
6067 << " addr=" << s->con->get_peer_addr() << dendl;
6068 // we don't connect to clients
6069 ceph_assert(con->get_peer_type() == CEPH_ENTITY_TYPE_OSD);
6070 s->entity_name.set_type(CEPH_ENTITY_TYPE_OSD);
6071 }
6072 }
6073 }
6074
// Fast-path messenger callback for incoming connections.  Mirrors
// ms_handle_fast_connect(): for non-mon/mgr peers, make sure a Session
// is attached to the Connection.
void OSD::ms_handle_fast_accept(Connection *con)
{
  if (con->get_peer_type() != CEPH_ENTITY_TYPE_MON &&
      con->get_peer_type() != CEPH_ENTITY_TYPE_MGR) {
    auto priv = con->get_priv();
    auto s = static_cast<Session*>(priv.get());
    if (!s) {
      s = new Session{cct, con};
      con->set_priv(RefCountedPtr{s, false});
      dout(10) << "new session (incoming)" << s << " con=" << con
          << " addr=" << con->get_peer_addr()
          << " must have raced with connect" << dendl;
      // only OSD peers are expected to reach this path
      ceph_assert(con->get_peer_type() == CEPH_ENTITY_TYPE_OSD);
      s->entity_name.set_type(CEPH_ENTITY_TYPE_OSD);
    }
  }
}
6092
// Messenger callback: the remote end reset the connection.  Tear down
// the attached Session (watch state, backoffs) so it does not linger
// on a dead connection.  Returns true if a session was cleaned up.
bool OSD::ms_handle_reset(Connection *con)
{
  auto s = con->get_priv();
  auto session = static_cast<Session*>(s.get());
  dout(2) << "ms_handle_reset con " << con << " session " << session << dendl;
  if (!session)
    return false;
  session->wstate.reset(con);
  session->con->set_priv(nullptr);
  session->con.reset();  // break con <-> session ref cycle
  // note that we break session->con *before* the session_handle_reset
  // cleanup below.  this avoids a race between us and
  // PG::add_backoff, Session::check_backoff, etc.
  session_handle_reset(SessionRef{session});
  return true;
}
6109
// Messenger callback: a connection attempt was actively refused.  When
// osd_fast_fail_on_connection_refused is enabled and the peer is an
// up OSD in our map, report it to the mon as failed immediately rather
// than waiting out the full heartbeat grace period.
bool OSD::ms_handle_refused(Connection *con)
{
  if (!cct->_conf->osd_fast_fail_on_connection_refused)
    return false;

  auto priv = con->get_priv();
  auto session = static_cast<Session*>(priv.get());
  dout(2) << "ms_handle_refused con " << con << " session " << session << dendl;
  if (!session)
    return false;
  int type = con->get_peer_type();
  // handle only OSD failures here
  if (monc && (type == CEPH_ENTITY_TYPE_OSD)) {
    OSDMapRef osdmap = get_osdmap();
    if (osdmap) {
      int id = osdmap->identify_osd_on_all_channels(con->get_peer_addr());
      if (id >= 0 && osdmap->is_up(id)) {
	// I'm cheating mon heartbeat grace logic, because we know it's not going
	// to respawn alone. +1 so we won't hit any boundary case.
	monc->send_mon_message(
	  new MOSDFailure(
	    monc->get_fsid(),
	    id,
	    osdmap->get_addrs(id),
	    cct->_conf->osd_heartbeat_grace + 1,
	    osdmap->get_epoch(),
	    MOSDFailure::FLAG_IMMEDIATE | MOSDFailure::FLAG_FAILED
	    ));
      }
    }
  }
  return true;
}
6143
6144 struct C_OSD_GetVersion : public Context {
6145 OSD *osd;
6146 uint64_t oldest, newest;
6147 explicit C_OSD_GetVersion(OSD *o) : osd(o), oldest(0), newest(0) {}
6148 void finish(int r) override {
6149 if (r >= 0)
6150 osd->_got_mon_epochs(oldest, newest);
6151 }
6152 };
6153
// Begin the boot sequence.  If we look healthy, enter PREBOOT and ask
// the mon for its osdmap epoch range; the C_OSD_GetVersion completion
// then drives _got_mon_epochs() -> _preboot().  Otherwise wait for
// health and retry later.
void OSD::start_boot()
{
  if (!_is_healthy()) {
    // if we are not healthy, do not mark ourselves up (yet)
    dout(1) << "not healthy; waiting to boot" << dendl;
    if (!is_waiting_for_healthy())
      start_waiting_for_healthy();
    // send pings sooner rather than later
    heartbeat_kick();
    return;
  }
  dout(1) << __func__ << dendl;
  set_state(STATE_PREBOOT);
  dout(10) << "start_boot - have maps " << superblock.oldest_map
	   << ".." << superblock.newest_map << dendl;
  // completion fills in c->oldest / c->newest before finish() runs
  C_OSD_GetVersion *c = new C_OSD_GetVersion(this);
  monc->get_version("osdmap", &c->newest, &c->oldest, c);
}
6172
6173 void OSD::_got_mon_epochs(epoch_t oldest, epoch_t newest)
6174 {
6175 std::lock_guard l(osd_lock);
6176 if (is_preboot()) {
6177 _preboot(oldest, newest);
6178 }
6179 }
6180
// Second stage of boot.  Given the mon's osdmap epoch range
// [oldest, newest], decide whether we can mark ourselves up: either
// queue the boot message (after PG work drains, via boot_finisher) or
// subscribe for newer maps and wait to be called again.
void OSD::_preboot(epoch_t oldest, epoch_t newest)
{
  ceph_assert(is_preboot());
  dout(10) << __func__ << " _preboot mon has osdmaps "
	   << oldest << ".." << newest << dendl;

  // ensure our local fullness awareness is accurate
  {
    std::lock_guard l(heartbeat_lock);
    heartbeat();
  }

  // if our map within recent history, try to add ourselves to the osdmap.
  if (osdmap->get_epoch() == 0) {
    derr << "waiting for initial osdmap" << dendl;
  } else if (osdmap->is_destroyed(whoami)) {
    derr << "osdmap says I am destroyed" << dendl;
    // provide a small margin so we don't livelock seeing if we
    // un-destroyed ourselves.
    if (osdmap->get_epoch() > newest - 1) {
      exit(0);
    }
  } else if (osdmap->is_noup(whoami)) {
    derr << "osdmap NOUP flag is set, waiting for it to clear" << dendl;
  } else if (!osdmap->test_flag(CEPH_OSDMAP_SORTBITWISE)) {
    derr << "osdmap SORTBITWISE OSDMap flag is NOT set; please set it"
	 << dendl;
  } else if (osdmap->require_osd_release < CEPH_RELEASE_LUMINOUS) {
    derr << "osdmap require_osd_release < luminous; please upgrade to luminous"
	 << dendl;
  } else if (service.need_fullness_update()) {
    derr << "osdmap fullness state needs update" << dendl;
    send_full_update();
  } else if (osdmap->get_epoch() >= oldest - 1 &&
	     osdmap->get_epoch() + cct->_conf->osd_map_message_max > newest) {

    // wait for pgs to fully catch up in a different thread, since
    // this thread might be required for splitting and merging PGs to
    // make progress.
    boot_finisher.queue(
      new FunctionContext(
	[this](int r) {
	  std::lock_guard l(osd_lock);
	  if (is_preboot()) {
	    dout(10) << __func__ << " waiting for peering work to drain"
		     << dendl;
	    // drop osd_lock while blocking on the shards
	    osd_lock.Unlock();
	    for (auto shard : shards) {
	      shard->wait_min_pg_epoch(osdmap->get_epoch());
	    }
	    osd_lock.Lock();
	  }
	  // re-check: state may have changed while the lock was dropped
	  if (is_preboot()) {
	    _send_boot();
	  }
	}));
    return;
  }

  // get all the latest maps
  if (osdmap->get_epoch() + 1 >= oldest)
    osdmap_subscribe(osdmap->get_epoch() + 1, false);
  else
    osdmap_subscribe(oldest - 1, true);
}
6246
// Report our current fullness state to the mon if it is out of date.
// The most severe applicable state wins: full > backfillfull >
// nearfull > none.
void OSD::send_full_update()
{
  if (!service.need_fullness_update())
    return;
  unsigned state = 0;
  if (service.is_full()) {
    state = CEPH_OSD_FULL;
  } else if (service.is_backfillfull()) {
    state = CEPH_OSD_BACKFILLFULL;
  } else if (service.is_nearfull()) {
    state = CEPH_OSD_NEARFULL;
  }
  set<string> s;
  OSDMap::calc_state_set(state, s);
  dout(10) << __func__ << " want state " << s << dendl;
  monc->send_mon_message(new MOSDFull(osdmap->get_epoch(), state));
}
6264
// Enter the WAITING_FOR_HEALTHY state: reset the heartbeat resample
// timestamp and keep following osdmap updates while we wait.
void OSD::start_waiting_for_healthy()
{
  dout(1) << "start_waiting_for_healthy" << dendl;
  set_state(STATE_WAITING_FOR_HEALTHY);
  last_heartbeat_resample = utime_t();

  // subscribe to osdmap updates, in case our peers really are known to be dead
  osdmap_subscribe(osdmap->get_epoch() + 1, false);
}
6274
// Decide whether we are healthy enough to boot.  Requires the internal
// heartbeat map to be healthy; additionally, if we were previously
// marked down more than once within osd_max_markdown_period, require
// at least osd_heartbeat_min_healthy_ratio of our heartbeat peers to
// be reachable.
bool OSD::_is_healthy()
{
  if (!cct->get_heartbeat_map()->is_healthy()) {
    dout(1) << "is_healthy false -- internal heartbeat failed" << dendl;
    return false;
  }

  if (is_waiting_for_healthy()) {
    utime_t now = ceph_clock_now();
    // drop markdown events older than the grace window
    utime_t grace = utime_t(cct->_conf->osd_max_markdown_period, 0);
    while (!osd_markdown_log.empty() &&
           osd_markdown_log.front() + grace < now)
      osd_markdown_log.pop_front();
    if (osd_markdown_log.size() <= 1) {
      dout(5) << __func__ << " first time marked as down,"
              << " try reboot unconditionally" << dendl;
      return true;
    }
    std::lock_guard l(heartbeat_lock);
    // count reachable heartbeat peers
    int num = 0, up = 0;
    for (map<int,HeartbeatInfo>::iterator p = heartbeat_peers.begin();
	 p != heartbeat_peers.end();
	 ++p) {
      if (p->second.is_healthy(now))
	++up;
      ++num;
    }
    if ((float)up < (float)num * cct->_conf->osd_heartbeat_min_healthy_ratio) {
      dout(1) << "is_healthy false -- only " << up << "/" << num << " up peers (less than "
	      << int(cct->_conf->osd_heartbeat_min_healthy_ratio * 100.0) << "%)" << dendl;
      return false;
    }
  }

  return true;
}
6311
// Send the MOSDBoot message to the mon.  First resolve any
// still-unknown bind addresses (cluster from client, heartbeat
// back/front from cluster/client respectively), attach sessions to the
// loopback connections, refresh NUMA affinity metadata, then build and
// send the boot message and transition to BOOTING.
void OSD::_send_boot()
{
  dout(10) << "_send_boot" << dendl;
  Connection *local_connection =
    cluster_messenger->get_loopback_connection().get();
  entity_addrvec_t client_addrs = client_messenger->get_myaddrs();
  entity_addrvec_t cluster_addrs = cluster_messenger->get_myaddrs();
  entity_addrvec_t hb_back_addrs = hb_back_server_messenger->get_myaddrs();
  entity_addrvec_t hb_front_addrs = hb_front_server_messenger->get_myaddrs();

  dout(20) << " initial client_addrs " << client_addrs
	   << ", cluster_addrs " << cluster_addrs
	   << ", hb_back_addrs " << hb_back_addrs
	   << ", hb_front_addrs " << hb_front_addrs
	   << dendl;
  // fill in unknown cluster addrs from the client addrs
  if (cluster_messenger->set_addr_unknowns(client_addrs)) {
    dout(10) << " assuming cluster_addrs match client_addrs "
	     << client_addrs << dendl;
    cluster_addrs = cluster_messenger->get_myaddrs();
  }
  // make sure the loopback connection has a session attached
  if (auto session = local_connection->get_priv(); !session) {
    cluster_messenger->ms_deliver_handle_fast_connect(local_connection);
  }

  local_connection = hb_back_server_messenger->get_loopback_connection().get();
  if (hb_back_server_messenger->set_addr_unknowns(cluster_addrs)) {
    dout(10) << " assuming hb_back_addrs match cluster_addrs "
	     << cluster_addrs << dendl;
    hb_back_addrs = hb_back_server_messenger->get_myaddrs();
  }
  if (auto session = local_connection->get_priv(); !session) {
    hb_back_server_messenger->ms_deliver_handle_fast_connect(local_connection);
  }

  local_connection = hb_front_server_messenger->get_loopback_connection().get();
  if (hb_front_server_messenger->set_addr_unknowns(client_addrs)) {
    dout(10) << " assuming hb_front_addrs match client_addrs "
	     << client_addrs << dendl;
    hb_front_addrs = hb_front_server_messenger->get_myaddrs();
  }
  if (auto session = local_connection->get_priv(); !session) {
    hb_front_server_messenger->ms_deliver_handle_fast_connect(local_connection);
  }

  // we now know what our front and back addrs will be, and we are
  // about to tell the mon what our metadata (including numa bindings)
  // are, so now is a good time!
  set_numa_affinity();

  MOSDBoot *mboot = new MOSDBoot(
    superblock, get_osdmap_epoch(), service.get_boot_epoch(),
    hb_back_addrs, hb_front_addrs, cluster_addrs,
    CEPH_FEATURES_ALL);
  dout(10) << " final client_addrs " << client_addrs
	   << ", cluster_addrs " << cluster_addrs
	   << ", hb_back_addrs " << hb_back_addrs
	   << ", hb_front_addrs " << hb_front_addrs
	   << dendl;
  _collect_metadata(&mboot->metadata);
  monc->send_mon_message(mboot);
  set_state(STATE_BOOTING);
}
6374
// Populate *pm with the OSD's metadata key/value pairs: paths,
// addresses, objectstore details, system info, network interfaces,
// NUMA topology, and backing device ids.  Sent to the mon with the
// boot message.
void OSD::_collect_metadata(map<string,string> *pm)
{
  // config info
  (*pm)["osd_data"] = dev_path;
  if (store->get_type() == "filestore") {
    // not applicable for bluestore
    (*pm)["osd_journal"] = journal_path;
  }
  (*pm)["front_addr"] = stringify(client_messenger->get_myaddrs());
  (*pm)["back_addr"] = stringify(cluster_messenger->get_myaddrs());
  (*pm)["hb_front_addr"] = stringify(hb_front_server_messenger->get_myaddrs());
  (*pm)["hb_back_addr"] = stringify(hb_back_server_messenger->get_myaddrs());

  // backend
  (*pm)["osd_objectstore"] = store->get_type();
  (*pm)["rotational"] = store_is_rotational ? "1" : "0";
  (*pm)["journal_rotational"] = journal_is_rotational ? "1" : "0";
  (*pm)["default_device_class"] = store->get_default_device_class();
  store->collect_metadata(pm);

  collect_sys_info(pm, cct);

  // network interfaces backing the front/back addresses
  (*pm)["front_iface"] = pick_iface(
    cct,
    client_messenger->get_myaddrs().front().get_sockaddr_storage());
  (*pm)["back_iface"] = pick_iface(
    cct,
    cluster_messenger->get_myaddrs().front().get_sockaddr_storage());

  // network numa
  {
    int node = -1;
    set<int> nodes;
    set<string> unknown;
    for (auto nm : { "front_iface", "back_iface" }) {
      if (!(*pm)[nm].size()) {
	unknown.insert(nm);
	continue;
      }
      int n = -1;
      int r = get_iface_numa_node((*pm)[nm], &n);
      if (r < 0) {
	unknown.insert((*pm)[nm]);
	continue;
      }
      nodes.insert(n);
      if (node < 0) {
	node = n;
      }
    }
    if (unknown.size()) {
      (*pm)["network_numa_unknown_ifaces"] = stringify(unknown);
    }
    if (!nodes.empty()) {
      (*pm)["network_numa_nodes"] = stringify(nodes);
    }
    // only report a single numa node when all ifaces resolve to one node
    if (node >= 0 && nodes.size() == 1 && unknown.empty()) {
      (*pm)["network_numa_node"] = stringify(node);
    }
  }

  if (numa_node >= 0) {
    (*pm)["numa_node"] = stringify(numa_node);
    (*pm)["numa_node_cpus"] = cpu_set_to_str_list(numa_cpu_set_size,
						  &numa_cpu_set);
  }

  // backing devices and their unique ids (dev=id,dev=id,...)
  set<string> devnames;
  store->get_devices(&devnames);
  (*pm)["devices"] = stringify(devnames);
  string devids;
  for (auto& dev : devnames) {
    string err;
    string id = get_device_id(dev, &err);
    if (id.size()) {
      if (!devids.empty()) {
	devids += ",";
      }
      devids += dev + "=" + id;
    } else {
      dout(10) << __func__ << " no unique device id for " << dev << ": "
	       << err << dendl;
    }
  }
  (*pm)["device_ids"] = devids;

  dout(10) << __func__ << " " << *pm << dendl;
}
6463
// Record that we want the mon to bump our up_thru to at least `want`;
// if this raises the wanted value, trigger send_alive() immediately.
void OSD::queue_want_up_thru(epoch_t want)
{
  map_lock.get_read();
  epoch_t cur = osdmap->get_up_thru(whoami);
  std::lock_guard l(mon_report_lock);
  if (want > up_thru_wanted) {
    dout(10) << "queue_want_up_thru now " << want << " (was " << up_thru_wanted << ")"
	     << ", currently " << cur
	     << dendl;
    up_thru_wanted = want;
    send_alive();
  } else {
    dout(10) << "queue_want_up_thru want " << want << " <= queued " << up_thru_wanted
	     << ", currently " << cur
	     << dendl;
  }
  map_lock.put_read();
}
6482
6483 void OSD::send_alive()
6484 {
6485 ceph_assert(mon_report_lock.is_locked());
6486 if (!osdmap->exists(whoami))
6487 return;
6488 epoch_t up_thru = osdmap->get_up_thru(whoami);
6489 dout(10) << "send_alive up_thru currently " << up_thru << " want " << up_thru_wanted << dendl;
6490 if (up_thru_wanted > up_thru) {
6491 dout(10) << "send_alive want " << up_thru_wanted << dendl;
6492 monc->send_mon_message(new MOSDAlive(osdmap->get_epoch(), up_thru_wanted));
6493 }
6494 }
6495
// Ask the mon for full (non-incremental) osdmaps in [first, last],
// merging with any outstanding request: duplicates are dropped and
// overlapping requests only fetch the missing tail.
void OSD::request_full_map(epoch_t first, epoch_t last)
{
  dout(10) << __func__ << " " << first << ".." << last
	   << ", previously requested "
	   << requested_full_first << ".." << requested_full_last << dendl;
  ceph_assert(osd_lock.is_locked());
  ceph_assert(first > 0 && last > 0);
  ceph_assert(first <= last);
  ceph_assert(first >= requested_full_first);  // we shouldn't ever ask for older maps
  if (requested_full_first == 0) {
    // first request
    requested_full_first = first;
    requested_full_last = last;
  } else if (last <= requested_full_last) {
    // dup
    return;
  } else {
    // additional request
    first = requested_full_last + 1;
    requested_full_last = last;
  }
  MMonGetOSDMap *req = new MMonGetOSDMap;
  req->request_full(first, last);
  monc->send_mon_message(req);
}
6521
// Note receipt of full map epoch e and advance the outstanding
// full-map request window [requested_full_first, requested_full_last];
// the window resets to 0..0 once fully satisfied.
void OSD::got_full_map(epoch_t e)
{
  ceph_assert(requested_full_first <= requested_full_last);
  ceph_assert(osd_lock.is_locked());
  if (requested_full_first == 0) {
    dout(20) << __func__ << " " << e << ", nothing requested" << dendl;
    return;
  }
  if (e < requested_full_first) {
    // stale map from before our request window
    dout(10) << __func__ << " " << e << ", requested " << requested_full_first
	     << ".." << requested_full_last
	     << ", ignoring" << dendl;
    return;
  }
  if (e >= requested_full_last) {
    dout(10) << __func__ << " " << e << ", requested " << requested_full_first
	     << ".." << requested_full_last << ", resetting" << dendl;
    requested_full_first = requested_full_last = 0;
    return;
  }

  // still waiting for (e, requested_full_last]
  requested_full_first = e + 1;

  dout(10) << __func__ << " " << e << ", requested " << requested_full_first
	   << ".." << requested_full_last
	   << ", still need more" << dendl;
}
6549
6550 void OSD::requeue_failures()
6551 {
6552 std::lock_guard l(heartbeat_lock);
6553 unsigned old_queue = failure_queue.size();
6554 unsigned old_pending = failure_pending.size();
6555 for (auto p = failure_pending.begin(); p != failure_pending.end(); ) {
6556 failure_queue[p->first] = p->second.first;
6557 failure_pending.erase(p++);
6558 }
6559 dout(10) << __func__ << " " << old_queue << " + " << old_pending << " -> "
6560 << failure_queue.size() << dendl;
6561 }
6562
// Drain failure_queue, reporting each failed peer OSD to the mon once
// and recording it in failure_pending so it can later be requeued or
// canceled (send_still_alive).  Caller must hold map_lock and
// mon_report_lock.
void OSD::send_failures()
{
  ceph_assert(map_lock.is_locked());
  ceph_assert(mon_report_lock.is_locked());
  std::lock_guard l(heartbeat_lock);
  utime_t now = ceph_clock_now();
  while (!failure_queue.empty()) {
    int osd = failure_queue.begin()->first;
    if (!failure_pending.count(osd)) {
      // seconds since we first noticed the failure
      int failed_for = (int)(double)(now - failure_queue.begin()->second);
      monc->send_mon_message(
	new MOSDFailure(
	  monc->get_fsid(),
	  osd,
	  osdmap->get_addrs(osd),
	  failed_for,
	  osdmap->get_epoch()));
      // remember failure time and addrs for requeue/cancel
      failure_pending[osd] = make_pair(failure_queue.begin()->second,
				       osdmap->get_addrs(osd));
    }
    failure_queue.erase(osd);
  }
}
6586
6587 void OSD::send_still_alive(epoch_t epoch, int osd, const entity_addrvec_t &addrs)
6588 {
6589 MOSDFailure *m = new MOSDFailure(monc->get_fsid(), osd, addrs, 0, epoch,
6590 MOSDFailure::FLAG_ALIVE);
6591 monc->send_mon_message(m);
6592 }
6593
6594 void OSD::cancel_pending_failures()
6595 {
6596 std::lock_guard l(heartbeat_lock);
6597 auto it = failure_pending.begin();
6598 while (it != failure_pending.end()) {
6599 dout(10) << __func__ << " canceling in-flight failure report for osd."
6600 << it->first << dendl;
6601 send_still_alive(osdmap->get_epoch(), it->first, it->second.second);
6602 failure_pending.erase(it++);
6603 }
6604 }
6605
// Send an MOSDBeacon (osdmap epoch, min_last_epoch_clean and its pgs)
// to the mon, provided the monmap is initialized and the mons support
// the LUMINOUS feature.
void OSD::send_beacon(const ceph::coarse_mono_clock::time_point& now)
{
  const auto& monmap = monc->monmap;
  // send beacon to mon even if we are just connected, and the monmap is not
  // initialized yet by then.
  if (monmap.epoch > 0 &&
      monmap.get_required_features().contains_all(
        ceph::features::mon::FEATURE_LUMINOUS)) {
    dout(20) << __func__ << " sending" << dendl;
    MOSDBeacon* beacon = nullptr;
    {
      // snapshot the clean-epoch data under its lock
      std::lock_guard l{min_last_epoch_clean_lock};
      beacon = new MOSDBeacon(osdmap->get_epoch(), min_last_epoch_clean);
      beacon->pgs = min_last_epoch_clean_pgs;
      last_sent_beacon = now;
    }
    monc->send_mon_message(beacon);
  } else {
    dout(20) << __func__ << " not sending" << dendl;
  }
}
6627
6628 void OSD::handle_command(MMonCommand *m)
6629 {
6630 if (!require_mon_peer(m)) {
6631 m->put();
6632 return;
6633 }
6634
6635 Command *c = new Command(m->cmd, m->get_tid(), m->get_data(), NULL);
6636 command_wq.queue(c);
6637 m->put();
6638 }
6639
// Handle a direct MCommand from a client connection.  Requires an
// attached Session whose caps allow everything; commands from mons are
// rejected here (mons use MMonCommand instead).  Accepted commands are
// queued on the command work queue; rejections get an -EPERM reply.
void OSD::handle_command(MCommand *m)
{
  ConnectionRef con = m->get_connection();
  auto priv = con->get_priv();
  auto session = static_cast<Session *>(priv.get());
  if (!session) {
    con->send_message(new MCommandReply(m, -EPERM));
    m->put();
    return;
  }

  OSDCap& caps = session->caps;
  priv.reset();

  if (!caps.allow_all() || m->get_source().is_mon()) {
    con->send_message(new MCommandReply(m, -EPERM));
    m->put();
    return;
  }

  Command *c = new Command(m->cmd, m->get_tid(), m->get_data(), con.get());
  command_wq.queue(c);

  m->put();
}
6665
// Descriptor table for the commands the OSD accepts (tell/pg/admin
// paths).  Each entry is (parse signature, help text, module,
// required permission); consumed by get_command_descriptions and
// command dispatch.
struct OSDCommand {
  string cmdstring;
  string helpstring;
  string module;
  string perm;
} osd_commands[] = {

#define COMMAND(parsesig, helptext, module, perm) \
  {parsesig, helptext, module, perm},

// yes, these are really pg commands, but there's a limit to how
// much work it's worth.  The OSD returns all of them.  Make this
// form (pg <pgid> <cmd>) valid only for the cli.
// Rest uses "tell <pgid> <cmd>"

COMMAND("pg " \
	"name=pgid,type=CephPgid " \
	"name=cmd,type=CephChoices,strings=query", \
	"show details of a specific pg", "osd", "r")
COMMAND("pg " \
	"name=pgid,type=CephPgid " \
	"name=cmd,type=CephChoices,strings=mark_unfound_lost " \
	"name=mulcmd,type=CephChoices,strings=revert|delete", \
	"mark all unfound objects in this pg as lost, either removing or reverting to a prior version if one is available",
	"osd", "rw")
COMMAND("pg " \
	"name=pgid,type=CephPgid " \
	"name=cmd,type=CephChoices,strings=list_unfound " \
	"name=offset,type=CephString,req=false",
	"list unfound objects on this pg, perhaps starting at an offset given in JSON",
	"osd", "r")

// new form: tell <pgid> <cmd> for both cli and rest

COMMAND("query",
	"show details of a specific pg", "osd", "r")
COMMAND("mark_unfound_lost " \
	"name=mulcmd,type=CephChoices,strings=revert|delete", \
	"mark all unfound objects in this pg as lost, either removing or reverting to a prior version if one is available",
	"osd", "rw")
COMMAND("list_unfound " \
	"name=offset,type=CephString,req=false",
	"list unfound objects on this pg, perhaps starting at an offset given in JSON",
	"osd", "r")
COMMAND("perf histogram dump "
	"name=logger,type=CephString,req=false "
	"name=counter,type=CephString,req=false",
	"Get histogram data",
	"osd", "r")

// tell <osd.n> commands.  Validation of osd.n must be special-cased in client
COMMAND("version", "report version of OSD", "osd", "r")
COMMAND("get_command_descriptions", "list commands descriptions", "osd", "r")
COMMAND("injectargs " \
	"name=injected_args,type=CephString,n=N",
	"inject configuration arguments into running OSD",
	"osd", "rw")
COMMAND("config set " \
	"name=key,type=CephString name=value,type=CephString",
	"Set a configuration option at runtime (not persistent)",
	"osd", "rw")
COMMAND("config get " \
	"name=key,type=CephString",
	"Get a configuration option at runtime",
	"osd", "r")
COMMAND("config unset " \
	"name=key,type=CephString",
	"Unset a configuration option at runtime (not persistent)",
	"osd", "rw")
COMMAND("cluster_log " \
	"name=level,type=CephChoices,strings=error,warning,info,debug " \
	"name=message,type=CephString,n=N",
	"log a message to the cluster log",
	"osd", "rw")
COMMAND("bench " \
	"name=count,type=CephInt,req=false " \
	"name=size,type=CephInt,req=false " \
	"name=object_size,type=CephInt,req=false " \
	"name=object_num,type=CephInt,req=false ", \
	"OSD benchmark: write <count> <size>-byte objects(with <obj_size> <obj_num>), " \
	"(default count=1G default size=4MB). Results in log.",
	"osd", "rw")
COMMAND("flush_pg_stats", "flush pg stats", "osd", "rw")
COMMAND("heap " \
	"name=heapcmd,type=CephChoices,strings="\
	"dump|start_profiler|stop_profiler|release|get_release_rate|set_release_rate|stats " \
	"name=value,type=CephString,req=false",
	"show heap usage info (available only if compiled with tcmalloc)",
	"osd", "rw")
COMMAND("debug dump_missing " \
	"name=filename,type=CephFilepath",
	"dump missing objects to a named file", "osd", "r")
COMMAND("debug kick_recovery_wq " \
	"name=delay,type=CephInt,range=0",
	"set osd_recovery_delay_start to <val>", "osd", "rw")
COMMAND("cpu_profiler " \
	"name=arg,type=CephChoices,strings=status|flush",
	"run cpu profiling on daemon", "osd", "rw")
COMMAND("dump_pg_recovery_stats", "dump pg recovery statistics",
	"osd", "r")
COMMAND("reset_pg_recovery_stats", "reset pg recovery statistics",
	"osd", "rw")
COMMAND("compact",
	"compact object store's omap. "
	"WARNING: Compaction probably slows your requests",
	"osd", "rw")
COMMAND("smart name=devid,type=CephString,req=False",
	"runs smartctl on this osd devices. ",
	"osd", "rw")
COMMAND("cache drop",
	"Drop all OSD caches",
	"osd", "rwx")
COMMAND("cache status",
	"Get OSD caches statistics",
	"osd", "r")
COMMAND("send_beacon",
	"Send OSD beacon to mon immediately",
	"osd", "r")
};
6785
// Parse the JSON command vector into a cmdmap and run it through
// _do_command(); unless the command returned -EAGAIN (in which case no
// reply is sent here), send an MCommandReply with the result string
// and output data back on `con`.
void OSD::do_command(
  Connection *con, ceph_tid_t tid, vector<string>& cmd, bufferlist& data)
{
  dout(20) << "do_command tid " << tid << " " << cmd << dendl;

  int r = 0;
  stringstream ss, ds;
  bufferlist odata;
  cmdmap_t cmdmap;
  if (cmd.empty()) {
    ss << "no command given";
    goto out;
  }
  if (!cmdmap_from_json(cmd, &cmdmap, ss)) {
    r = -EINVAL;
    goto out;
  }

  try {
    r = _do_command(con, cmdmap, tid, data, odata, ss, ds);
  } catch (const bad_cmd_get& e) {
    // malformed or missing command arguments
    r = -EINVAL;
    ss << e.what();
  }
  if (r == -EAGAIN) {
    return;
  }
 out:
  string rs = ss.str();
  odata.append(ds);
  dout(0) << "do_command r=" << r << " " << rs << dendl;
  clog->info() << rs;
  if (con) {
    MCommandReply *reply = new MCommandReply(r, rs);
    reply->set_tid(tid);
    reply->set_data(odata);
    con->send_message(reply);
  }
}
6825
6826 namespace {
6827 class unlock_guard {
6828 Mutex& m;
6829 public:
6830 explicit unlock_guard(Mutex& mutex)
6831 : m(mutex)
6832 {
6833 m.unlock();
6834 }
6835 unlock_guard(unlock_guard&) = delete;
6836 ~unlock_guard() {
6837 m.lock();
6838 }
6839 };
6840 }
6841
// Execute one parsed admin/tell command against this OSD.
//
// Human-readable status is written to 'ss', bulk output to 'ds' (appended
// to 'odata' by the caller).  Returns 0 on success, a negative errno on
// failure, or -EAGAIN when the reply will be sent asynchronously (e.g. by
// a PG).  Called with osd_lock held; branches that mutate configuration
// drop it temporarily via unlock_guard.
int OSD::_do_command(
  Connection *con, cmdmap_t& cmdmap, ceph_tid_t tid, bufferlist& data,
  bufferlist& odata, stringstream& ss, stringstream& ds)
{
  int r = 0;
  string prefix;
  string format;
  string pgidstr;
  boost::scoped_ptr<Formatter> f;

  cmd_getval(cct, cmdmap, "prefix", prefix);

  if (prefix == "get_command_descriptions") {
    // dump the static osd_commands table as JSON for the client
    int cmdnum = 0;
    JSONFormatter *f = new JSONFormatter();
    f->open_object_section("command_descriptions");
    for (OSDCommand *cp = osd_commands;
	 cp < &osd_commands[std::size(osd_commands)]; cp++) {

      ostringstream secname;
      secname << "cmd" << setfill('0') << std::setw(3) << cmdnum;
      dump_cmddesc_to_json(f, con->get_features(),
                           secname.str(), cp->cmdstring, cp->helpstring,
			   cp->module, cp->perm, 0);
      cmdnum++;
    }
    f->close_section();	// command_descriptions

    f->flush(ds);
    delete f;
    goto out;
  }

  cmd_getval(cct, cmdmap, "format", format);
  f.reset(Formatter::create(format));

  if (prefix == "version") {
    if (f) {
      f->open_object_section("version");
      f->dump_string("version", pretty_version_to_str());
      f->close_section();
      f->flush(ds);
    } else {
      ds << pretty_version_to_str();
    }
    goto out;
  }
  else if (prefix == "injectargs") {
    vector<string> argsvec;
    cmd_getval(cct, cmdmap, "injected_args", argsvec);

    if (argsvec.empty()) {
      r = -EINVAL;
      ss << "ignoring empty injectargs";
      goto out;
    }
    // re-join the argument vector into a single space-separated string
    string args = argsvec.front();
    for (vector<string>::iterator a = ++argsvec.begin(); a != argsvec.end(); ++a)
      args += " " + *a;
    // drop osd_lock while mutating config; observers may call back into us
    unlock_guard unlock{osd_lock};
    r = cct->_conf.injectargs(args, &ss);
  }
  else if (prefix == "config set") {
    std::string key;
    std::string val;
    cmd_getval(cct, cmdmap, "key", key);
    cmd_getval(cct, cmdmap, "value", val);
    unlock_guard unlock{osd_lock};
    r = cct->_conf.set_val(key, val, &ss);
    if (r == 0) {
      cct->_conf.apply_changes(nullptr);
    }
  }
  else if (prefix == "config get") {
    std::string key;
    cmd_getval(cct, cmdmap, "key", key);
    unlock_guard unlock{osd_lock};
    std::string val;
    r = cct->_conf.get_val(key, &val);
    if (r == 0) {
      ds << val;
    }
  }
  else if (prefix == "config unset") {
    std::string key;
    cmd_getval(cct, cmdmap, "key", key);
    unlock_guard unlock{osd_lock};
    r = cct->_conf.rm_val(key);
    if (r == 0) {
      cct->_conf.apply_changes(nullptr);
    }
    if (r == -ENOENT) {
      r = 0;  // make command idempotent
    }
  }
  else if (prefix == "cluster_log") {
    vector<string> msg;
    cmd_getval(cct, cmdmap, "message", msg);
    if (msg.empty()) {
      r = -EINVAL;
      ss << "ignoring empty log message";
      goto out;
    }
    // re-join the message words into one string
    string message = msg.front();
    for (vector<string>::iterator a = ++msg.begin(); a != msg.end(); ++a)
      message += " " + *a;
    string lvl;
    cmd_getval(cct, cmdmap, "level", lvl);
    clog_type level = string_to_clog_type(lvl);
    if (level < 0) {
      r = -EINVAL;
      ss << "unknown level '" << lvl << "'";
      goto out;
    }
    clog->do_log(level, message);
  }

  // either 'pg <pgid> <command>' or
  // 'tell <pgid>' (which comes in without any of that prefix)?

  else if (prefix == "pg" ||
	   prefix == "query" ||
	   prefix == "mark_unfound_lost" ||
	   prefix == "list_unfound"
    ) {
    pg_t pgid;

    if (!cmd_getval(cct, cmdmap, "pgid", pgidstr)) {
      ss << "no pgid specified";
      r = -EINVAL;
    } else if (!pgid.parse(pgidstr.c_str())) {
      ss << "couldn't parse pgid '" << pgidstr << "'";
      r = -EINVAL;
    } else {
      spg_t pcand;
      PGRef pg;
      if (osdmap->get_primary_shard(pgid, &pcand) &&
	  (pg = _lookup_lock_pg(pcand))) {
	if (pg->is_primary()) {
	  // simulate pg <pgid> cmd= for pg->do-command
	  if (prefix != "pg")
	    cmd_putval(cct, cmdmap, "cmd", prefix);
	  try {
	    r = pg->do_command(cmdmap, ss, data, odata, con, tid);
	  } catch (const bad_cmd_get& e) {
	    pg->unlock();
	    ss << e.what();
	    return -EINVAL;
	  }
	  if (r == -EAGAIN) {
	    pg->unlock();
	    // don't reply, pg will do so async
	    return -EAGAIN;
	  }
	} else {
	  ss << "not primary for pgid " << pgid;

	  // send them the latest diff to ensure they realize the mapping
	  // has changed.
	  service.send_incremental_map(osdmap->get_epoch() - 1, con, osdmap);

	  // do not reply; they will get newer maps and realize they
	  // need to resend.
	  pg->unlock();
	  return -EAGAIN;
	}
	pg->unlock();
      } else {
	ss << "i don't have pgid " << pgid;
	r = -ENOENT;
      }
    }
  }

  else if (prefix == "bench") {
    int64_t count;
    int64_t bsize;
    int64_t osize, onum;
    // default count 1G, size 4MB
    cmd_getval(cct, cmdmap, "count", count, (int64_t)1 << 30);
    cmd_getval(cct, cmdmap, "size", bsize, (int64_t)4 << 20);
    cmd_getval(cct, cmdmap, "object_size", osize, (int64_t)0);
    cmd_getval(cct, cmdmap, "object_num", onum, (int64_t)0);

    uint32_t duration = cct->_conf->osd_bench_duration;

    if (bsize > (int64_t) cct->_conf->osd_bench_max_block_size) {
      // let us limit the block size because the next checks rely on it
      // having a sane value. If we allow any block size to be set things
      // can still go sideways.
      ss << "block 'size' values are capped at "
         << byte_u_t(cct->_conf->osd_bench_max_block_size) << ". If you wish to use"
         << " a higher value, please adjust 'osd_bench_max_block_size'";
      r = -EINVAL;
      goto out;
    } else if (bsize < (int64_t) (1 << 20)) {
      // entering the realm of small block sizes.
      // limit the count to a sane value, assuming a configurable amount of
      // IOPS and duration, so that the OSD doesn't get hung up on this,
      // preventing timeouts from going off
      int64_t max_count =
	bsize * duration * cct->_conf->osd_bench_small_size_max_iops;
      if (count > max_count) {
	ss << "'count' values greater than " << max_count
	   << " for a block size of " << byte_u_t(bsize) << ", assuming "
	   << cct->_conf->osd_bench_small_size_max_iops << " IOPS,"
	   << " for " << duration << " seconds,"
	   << " can cause ill effects on osd. "
	   << " Please adjust 'osd_bench_small_size_max_iops' with a higher"
	   << " value if you wish to use a higher 'count'.";
	r = -EINVAL;
	goto out;
      }
    } else {
      // 1MB block sizes are big enough so that we get more stuff done.
      // However, to avoid the osd from getting hung on this and having
      // timers being triggered, we are going to limit the count assuming
      // a configurable throughput and duration.
      // NOTE: max_count is the total amount of bytes that we believe we
      //       will be able to write during 'duration' for the given
      //       throughput.  The block size hardly impacts this unless it's
      //       way too big.  Given we already check how big the block size
      //       is, it's safe to assume everything will check out.
      int64_t max_count =
	cct->_conf->osd_bench_large_size_max_throughput * duration;
      if (count > max_count) {
	ss << "'count' values greater than " << max_count
	   << " for a block size of " << byte_u_t(bsize) << ", assuming "
	   << byte_u_t(cct->_conf->osd_bench_large_size_max_throughput) << "/s,"
	   << " for " << duration << " seconds,"
	   << " can cause ill effects on osd. "
	   << " Please adjust 'osd_bench_large_size_max_throughput'"
	   << " with a higher value if you wish to use a higher 'count'.";
	r = -EINVAL;
	goto out;
      }
    }

    if (osize && bsize > osize)
      bsize = osize;

    dout(1) << " bench count " << count
            << " bsize " << byte_u_t(bsize) << dendl;

    ObjectStore::Transaction cleanupt;

    if (osize && onum) {
      // pre-create the fixed set of target objects so the timed writes
      // below land on existing extents
      bufferlist bl;
      bufferptr bp(osize);
      bp.zero();
      bl.push_back(std::move(bp));
      bl.rebuild_page_aligned();
      for (int i=0; i<onum; ++i) {
	char nm[30];
	snprintf(nm, sizeof(nm), "disk_bw_test_%d", i);
	object_t oid(nm);
	hobject_t soid(sobject_t(oid, 0));
	ObjectStore::Transaction t;
	t.write(coll_t(), ghobject_t(soid), 0, osize, bl);
	store->queue_transaction(service.meta_ch, std::move(t), NULL);
	cleanupt.remove(coll_t(), ghobject_t(soid));
      }
    }

    bufferlist bl;
    bufferptr bp(bsize);
    bp.zero();
    bl.push_back(std::move(bp));
    bl.rebuild_page_aligned();

    // wait for any previously queued transactions to commit before timing
    {
      C_SaferCond waiter;
      if (!service.meta_ch->flush_commit(&waiter)) {
	waiter.wait();
      }
    }

    utime_t start = ceph_clock_now();
    for (int64_t pos = 0; pos < count; pos += bsize) {
      char nm[30];
      unsigned offset = 0;
      if (onum && osize) {
	// random object / random aligned offset within it
	snprintf(nm, sizeof(nm), "disk_bw_test_%d", (int)(rand() % onum));
	offset = rand() % (osize / bsize) * bsize;
      } else {
	// one fresh object per block
	snprintf(nm, sizeof(nm), "disk_bw_test_%lld", (long long)pos);
      }
      object_t oid(nm);
      hobject_t soid(sobject_t(oid, 0));
      ObjectStore::Transaction t;
      t.write(coll_t::meta(), ghobject_t(soid), offset, bsize, bl);
      store->queue_transaction(service.meta_ch, std::move(t), NULL);
      if (!onum || !osize)
	cleanupt.remove(coll_t::meta(), ghobject_t(soid));
    }

    // wait for all timed writes to commit
    {
      C_SaferCond waiter;
      if (!service.meta_ch->flush_commit(&waiter)) {
	waiter.wait();
      }
    }
    utime_t end = ceph_clock_now();

    // clean up
    store->queue_transaction(service.meta_ch, std::move(cleanupt), NULL);
    {
      C_SaferCond waiter;
      if (!service.meta_ch->flush_commit(&waiter)) {
	waiter.wait();
      }
    }

    double elapsed = end - start;
    double rate = count / elapsed;
    double iops = rate / bsize;
    if (f) {
      f->open_object_section("osd_bench_results");
      f->dump_int("bytes_written", count);
      f->dump_int("blocksize", bsize);
      f->dump_float("elapsed_sec", elapsed);
      f->dump_float("bytes_per_sec", rate);
      f->dump_float("iops", iops);
      f->close_section();
      f->flush(ds);
    } else {
      ds << "bench: wrote " << byte_u_t(count)
	 << " in blocks of " << byte_u_t(bsize) << " in "
	 << elapsed << " sec at " << byte_u_t(rate) << "/sec "
	 << si_u_t(iops) << " IOPS";
    }
  }

  else if (prefix == "flush_pg_stats") {
    mgrc.send_pgstats();
    ds << service.get_osd_stat_seq() << "\n";
  }

  else if (prefix == "heap") {
    r = ceph::osd_cmds::heap(*cct, cmdmap, *f, ds);
  }

  else if (prefix == "debug dump_missing") {
    if (!f) {
      f.reset(new JSONFormatter(true));
    }
    f->open_array_section("pgs");
    vector<PGRef> pgs;
    _get_pgs(&pgs);
    for (auto& pg : pgs) {
      string s = stringify(pg->pg_id);
      f->open_array_section(s.c_str());
      pg->lock();
      pg->dump_missing(f.get());
      pg->unlock();
      f->close_section();
    }
    f->close_section();
    f->flush(ds);
  }
  else if (prefix == "debug kick_recovery_wq") {
    int64_t delay;
    cmd_getval(cct, cmdmap, "delay", delay);
    ostringstream oss;
    oss << delay;
    // config mutation: drop osd_lock for the duration
    unlock_guard unlock{osd_lock};
    r = cct->_conf.set_val("osd_recovery_delay_start", oss.str().c_str());
    if (r != 0) {
      ss << "kick_recovery_wq: error setting "
	 << "osd_recovery_delay_start to '" << delay << "': error "
	 << r;
      goto out;
    }
    cct->_conf.apply_changes(nullptr);
    ss << "kicking recovery queue. set osd_recovery_delay_start "
       << "to " << cct->_conf->osd_recovery_delay_start;
  }

  else if (prefix == "cpu_profiler") {
    string arg;
    cmd_getval(cct, cmdmap, "arg", arg);
    vector<string> argvec;
    get_str_vec(arg, argvec);
    cpu_profiler_handle_command(argvec, ds);
  }

  else if (prefix == "dump_pg_recovery_stats") {
    stringstream s;
    if (f) {
      pg_recovery_stats.dump_formatted(f.get());
      f->flush(ds);
    } else {
      pg_recovery_stats.dump(s);
      ds << "dump pg recovery stats: " << s.str();
    }
  }

  else if (prefix == "reset_pg_recovery_stats") {
    ss << "reset pg recovery stats";
    pg_recovery_stats.reset();
  }

  else if (prefix == "perf histogram dump") {
    std::string logger;
    std::string counter;
    cmd_getval(cct, cmdmap, "logger", logger);
    cmd_getval(cct, cmdmap, "counter", counter);
    // only produces output when a formatter was requested
    if (f) {
      cct->get_perfcounters_collection()->dump_formatted_histograms(
	f.get(), false, logger, counter);
      f->flush(ds);
    }
  }

  else if (prefix == "compact") {
    dout(1) << "triggering manual compaction" << dendl;
    auto start = ceph::coarse_mono_clock::now();
    store->compact();
    auto end = ceph::coarse_mono_clock::now();
    double duration = std::chrono::duration<double>(end-start).count();
    dout(1) << "finished manual compaction in "
	    << duration
	    << " seconds" << dendl;
    ss << "compacted omap in " << duration << " seconds";
  }

  else if (prefix == "smart") {
    string devid;
    cmd_getval(cct, cmdmap, "devid", devid);
    probe_smart(devid, ds);
  }

  else if (prefix == "cache drop") {
    dout(20) << "clearing all caches" << dendl;
    // Clear the objectstore's cache - onode and buffer for Bluestore,
    // system's pagecache for Filestore
    r = store->flush_cache(&ss);
    if (r < 0) {
      ds << "Error flushing objectstore cache: " << cpp_strerror(r);
      goto out;
    }
    // Clear the objectcontext cache (per PG)
    vector<PGRef> pgs;
    _get_pgs(&pgs);
    for (auto& pg: pgs) {
      pg->clear_cache();
    }
  }

  else if (prefix == "cache status") {
    int obj_ctx_count = 0;
    vector<PGRef> pgs;
    _get_pgs(&pgs);
    for (auto& pg: pgs) {
      obj_ctx_count += pg->get_cache_obj_count();
    }
    if (f) {
      f->open_object_section("cache_status");
      f->dump_int("object_ctx", obj_ctx_count);
      store->dump_cache_stats(f.get());
      f->close_section();
      f->flush(ds);
    } else {
      ds << "object_ctx: " << obj_ctx_count;
      store->dump_cache_stats(ds);
    }
  }
  else if (prefix == "send_beacon") {
    if (is_active()) {
      send_beacon(ceph::coarse_mono_clock::now());
    }
  } else {
    ss << "unrecognized command '" << prefix << "'";
    r = -EINVAL;
  }

 out:
  return r;
}
7321
7322 void OSD::probe_smart(const string& only_devid, ostream& ss)
7323 {
7324 set<string> devnames;
7325 store->get_devices(&devnames);
7326 uint64_t smart_timeout = cct->_conf.get_val<uint64_t>(
7327 "osd_smart_report_timeout");
7328
7329 // == typedef std::map<std::string, mValue> mObject;
7330 json_spirit::mObject json_map;
7331
7332 for (auto dev : devnames) {
7333 // smartctl works only on physical devices; filter out any logical device
7334 if (dev.find("dm-") == 0) {
7335 continue;
7336 }
7337
7338 string err;
7339 string devid = get_device_id(dev, &err);
7340 if (devid.size() == 0) {
7341 dout(10) << __func__ << " no unique id for dev " << dev << " ("
7342 << err << "), skipping" << dendl;
7343 continue;
7344 }
7345 if (only_devid.size() && devid != only_devid) {
7346 continue;
7347 }
7348
7349 json_spirit::mValue smart_json;
7350 if (block_device_get_metrics(dev, smart_timeout,
7351 &smart_json)) {
7352 dout(10) << "block_device_get_metrics failed for /dev/" << dev << dendl;
7353 continue;
7354 }
7355 json_map[devid] = smart_json;
7356 }
7357 json_spirit::write(json_map, ss, json_spirit::pretty_print);
7358 }
7359
7360 bool OSD::heartbeat_dispatch(Message *m)
7361 {
7362 dout(30) << "heartbeat_dispatch " << m << dendl;
7363 switch (m->get_type()) {
7364
7365 case CEPH_MSG_PING:
7366 dout(10) << "ping from " << m->get_source_inst() << dendl;
7367 m->put();
7368 break;
7369
7370 case MSG_OSD_PING:
7371 handle_osd_ping(static_cast<MOSDPing*>(m));
7372 break;
7373
7374 default:
7375 dout(0) << "dropping unexpected message " << *m << " from " << m->get_source_inst() << dendl;
7376 m->put();
7377 }
7378
7379 return true;
7380 }
7381
7382 bool OSD::ms_dispatch(Message *m)
7383 {
7384 dout(20) << "OSD::ms_dispatch: " << *m << dendl;
7385 if (m->get_type() == MSG_OSD_MARK_ME_DOWN) {
7386 service.got_stop_ack();
7387 m->put();
7388 return true;
7389 }
7390
7391 // lock!
7392
7393 osd_lock.Lock();
7394 if (is_stopping()) {
7395 osd_lock.Unlock();
7396 m->put();
7397 return true;
7398 }
7399
7400 do_waiters();
7401 _dispatch(m);
7402
7403 osd_lock.Unlock();
7404
7405 return true;
7406 }
7407
7408 void OSD::maybe_share_map(
7409 Session *session,
7410 OpRequestRef op,
7411 OSDMapRef osdmap)
7412 {
7413 if (!op->check_send_map) {
7414 return;
7415 }
7416 epoch_t last_sent_epoch = 0;
7417
7418 session->sent_epoch_lock.lock();
7419 last_sent_epoch = session->last_sent_epoch;
7420 session->sent_epoch_lock.unlock();
7421
7422 // assume the peer has the newer of the op's sent_epoch and what
7423 // we think we sent them.
7424 epoch_t from = std::max(last_sent_epoch, op->sent_epoch);
7425
7426 const Message *m = op->get_req();
7427 service.share_map(
7428 m->get_source(),
7429 m->get_connection().get(),
7430 from,
7431 osdmap,
7432 session ? &last_sent_epoch : NULL);
7433
7434 session->sent_epoch_lock.lock();
7435 if (session->last_sent_epoch < last_sent_epoch) {
7436 session->last_sent_epoch = last_sent_epoch;
7437 }
7438 session->sent_epoch_lock.unlock();
7439
7440 op->check_send_map = false;
7441 }
7442
// Drain ops a session queued while waiting for a newer osdmap.
// Caller must hold session->session_dispatch_lock.  Ops whose min epoch is
// still ahead of 'osdmap' (and everything after them) stay queued; the
// session is then (re)registered to be woken when a newer map arrives.
void OSD::dispatch_session_waiting(SessionRef session, OSDMapRef osdmap)
{
  ceph_assert(session->session_dispatch_lock.is_locked());

  auto i = session->waiting_on_map.begin();
  while (i != session->waiting_on_map.end()) {
    OpRequestRef op = &(*i);
    ceph_assert(ms_can_fast_dispatch(op->get_req()));
    const MOSDFastDispatchOp *m = static_cast<const MOSDFastDispatchOp*>(
      op->get_req());
    if (m->get_min_epoch() > osdmap->get_epoch()) {
      // stop at the first op that needs a newer map so per-session
      // delivery order is preserved
      break;
    }
    session->waiting_on_map.erase(i++);
    // release the ref taken (op->get()) when the op was queued in
    // ms_fast_dispatch; 'op' above still holds its own ref
    op->put();

    spg_t pgid;
    if (m->get_type() == CEPH_MSG_OSD_OP) {
      // legacy MOSDOp carries a raw pg; map it to the primary shard
      pg_t actual_pgid = osdmap->raw_pg_to_pg(
	static_cast<const MOSDOp*>(m)->get_pg());
      if (!osdmap->get_primary_shard(actual_pgid, &pgid)) {
	// we do not host a primary shard for this pg; drop the op
	continue;
      }
    } else {
      pgid = m->get_spg();
    }
    enqueue_op(pgid, std::move(op), m->get_map_epoch());
  }

  if (session->waiting_on_map.empty()) {
    clear_session_waiting_on_map(session);
  } else {
    register_session_waiting_on_map(session);
  }
}
7478
// Fast (no osd_lock) dispatch path for incoming messages.
// Per-type handlers below either consume the message themselves or it is
// wrapped in an OpRequest (which then owns the reference) and queued.
void OSD::ms_fast_dispatch(Message *m)
{
  FUNCTRACE(cct);
  if (service.is_stopping()) {
    m->put();
    return;
  }

  // peering event?
  switch (m->get_type()) {
  case CEPH_MSG_PING:
    dout(10) << "ping from " << m->get_source() << dendl;
    m->put();
    return;
  case MSG_MON_COMMAND:
    handle_command(static_cast<MMonCommand*>(m));
    return;
  case MSG_OSD_FORCE_RECOVERY:
    handle_fast_force_recovery(static_cast<MOSDForceRecovery*>(m));
    return;
  case MSG_OSD_SCRUB2:
    handle_fast_scrub(static_cast<MOSDScrub2*>(m));
    return;

  case MSG_OSD_PG_CREATE2:
    return handle_fast_pg_create(static_cast<MOSDPGCreate2*>(m));
  case MSG_OSD_PG_QUERY:
    return handle_fast_pg_query(static_cast<MOSDPGQuery*>(m));
  case MSG_OSD_PG_NOTIFY:
    return handle_fast_pg_notify(static_cast<MOSDPGNotify*>(m));
  case MSG_OSD_PG_INFO:
    return handle_fast_pg_info(static_cast<MOSDPGInfo*>(m));
  case MSG_OSD_PG_REMOVE:
    return handle_fast_pg_remove(static_cast<MOSDPGRemove*>(m));

  // these are single-pg messages that handle themselves
  case MSG_OSD_PG_LOG:
  case MSG_OSD_PG_TRIM:
  case MSG_OSD_BACKFILL_RESERVE:
  case MSG_OSD_RECOVERY_RESERVE:
    {
      MOSDPeeringOp *pm = static_cast<MOSDPeeringOp*>(m);
      if (require_osd_peer(pm)) {
	enqueue_peering_evt(
	  pm->get_spg(),
	  PGPeeringEventRef(pm->get_event()));
      }
      pm->put();
      return;
    }
  }

  // everything else becomes a tracked OpRequest
  OpRequestRef op = op_tracker.create_request<OpRequest, Message*>(m);
  {
#ifdef WITH_LTTNG
    osd_reqid_t reqid = op->get_reqid();
#endif
    tracepoint(osd, ms_fast_dispatch, reqid.name._type,
        reqid.name._num, reqid.tid, reqid.inc);
  }

  if (m->trace)
    op->osd_trace.init("osd op", &trace_endpoint, &m->trace);

  // note sender epoch, min req's epoch
  op->sent_epoch = static_cast<MOSDFastDispatchOp*>(m)->get_map_epoch();
  op->min_epoch = static_cast<MOSDFastDispatchOp*>(m)->get_min_epoch();
  ceph_assert(op->min_epoch <= op->sent_epoch); // sanity check!

  service.maybe_inject_dispatch_delay();

  if (m->get_connection()->has_features(CEPH_FEATUREMASK_RESEND_ON_SPLIT) ||
      m->get_type() != CEPH_MSG_OSD_OP) {
    // queue it directly
    enqueue_op(
      static_cast<MOSDFastDispatchOp*>(m)->get_spg(),
      std::move(op),
      static_cast<MOSDFastDispatchOp*>(m)->get_map_epoch());
  } else {
    // legacy client, and this is an MOSDOp (the *only* fast dispatch
    // message that didn't have an explicit spg_t); we need to map
    // them to an spg_t while preserving delivery order.
    auto priv = m->get_connection()->get_priv();
    if (auto session = static_cast<Session*>(priv.get()); session) {
      std::lock_guard l{session->session_dispatch_lock};
      // extra ref for the session's waiting_on_map list; released in
      // dispatch_session_waiting when the op is dequeued
      op->get();
      session->waiting_on_map.push_back(*op);
      OSDMapRef nextmap = service.get_nextmap_reserved();
      dispatch_session_waiting(session, nextmap);
      service.release_map(nextmap);
    }
  }
  OID_EVENT_TRACE_WITH_MSG(m, "MS_FAST_DISPATCH_END", false);
}
7573
7574 bool OSD::ms_get_authorizer(int dest_type, AuthAuthorizer **authorizer)
7575 {
7576 dout(10) << "OSD::ms_get_authorizer type=" << ceph_entity_type_name(dest_type) << dendl;
7577
7578 if (is_stopping()) {
7579 dout(10) << __func__ << " bailing, we are shutting down" << dendl;
7580 return false;
7581 }
7582
7583 if (dest_type == CEPH_ENTITY_TYPE_MON)
7584 return true;
7585
7586 *authorizer = monc->build_authorizer(dest_type);
7587 return *authorizer != NULL;
7588 }
7589
// Expose the monitor client's rotating service keys so incoming (auth v1)
// authorizers can be verified.
KeyStore *OSD::ms_get_auth1_authorizer_keystore()
{
  return monc->rotating_secrets.get();
}
7594
// Messenger callback invoked once a connection has authenticated.
// Ensures a Session is attached to the connection and parses the peer's
// caps into it.  Returns 1 when caps parsed successfully, 0 when the peer
// supplied no caps (or allow_all with no cap string), -EPERM when the caps
// failed to decode or parse.
int OSD::ms_handle_authentication(Connection *con)
{
  int ret = 0;
  auto priv = con->get_priv();
  Session *s = static_cast<Session*>(priv.get());
  if (!s) {
    // first time we see this connection: create and attach a session
    s = new Session(cct, con);
    con->set_priv(RefCountedPtr{s, false});
    s->entity_name = con->get_peer_entity_name();
    dout(10) << __func__ << " new session " << s << " con " << s->con
	     << " entity " << s->entity_name
	     << " addr " << con->get_peer_addrs() << dendl;
  } else {
    dout(10) << __func__ << " existing session " << s << " con " << s->con
	     << " entity " << s->entity_name
	     << " addr " << con->get_peer_addrs() << dendl;
  }

  AuthCapsInfo &caps_info = con->get_peer_caps_info();
  if (caps_info.allow_all)
    s->caps.set_allow_all();

  if (caps_info.caps.length() > 0) {
    // caps arrive as an encoded string blob; decode then parse
    bufferlist::const_iterator p = caps_info.caps.cbegin();
    string str;
    try {
      decode(str, p);
    }
    catch (buffer::error& e) {
      dout(10) << __func__ << " session " << s << " " << s->entity_name
	       << " failed to decode caps string" << dendl;
      ret = -EPERM;
    }
    if (!ret) {
      bool success = s->caps.parse(str);
      if (success) {
	dout(10) << __func__ << " session " << s
		 << " " << s->entity_name
		 << " has caps " << s->caps << " '" << str << "'" << dendl;
	ret = 1;
      } else {
	dout(10) << __func__ << " session " << s << " " << s->entity_name
		 << " failed to parse caps '" << str << "'" << dendl;
	ret = -EPERM;
      }
    }
  }
  return ret;
}
7644
7645 void OSD::do_waiters()
7646 {
7647 ceph_assert(osd_lock.is_locked());
7648
7649 dout(10) << "do_waiters -- start" << dendl;
7650 while (!finished.empty()) {
7651 OpRequestRef next = finished.front();
7652 finished.pop_front();
7653 dispatch_op(next);
7654 }
7655 dout(10) << "do_waiters -- finish" << dendl;
7656 }
7657
7658 void OSD::dispatch_op(OpRequestRef op)
7659 {
7660 switch (op->get_req()->get_type()) {
7661
7662 case MSG_OSD_PG_CREATE:
7663 handle_pg_create(op);
7664 break;
7665 }
7666 }
7667
// Slow-path message dispatch; caller (ms_dispatch) holds osd_lock.
// Each handled type consumes the message via its handler; unhandled types
// fall through and are dropped by the caller's reference accounting.
void OSD::_dispatch(Message *m)
{
  ceph_assert(osd_lock.is_locked());
  dout(20) << "_dispatch " << m << " " << *m << dendl;

  switch (m->get_type()) {
  // -- don't need OSDMap --

  // map and replication
  case CEPH_MSG_OSD_MAP:
    handle_osd_map(static_cast<MOSDMap*>(m));
    break;

  // osd
  case MSG_OSD_SCRUB:
    handle_scrub(static_cast<MOSDScrub*>(m));
    break;

  case MSG_COMMAND:
    handle_command(static_cast<MCommand*>(m));
    return;

  // -- need OSDMap --

  case MSG_OSD_PG_CREATE:
    {
      // wrap in an OpRequest, which takes over the message reference
      OpRequestRef op = op_tracker.create_request<OpRequest, Message*>(m);
      if (m->trace)
        op->osd_trace.init("osd op", &trace_endpoint, &m->trace);
      // no map?  starting up?
      if (!osdmap) {
        dout(7) << "no OSDMap, not booted" << dendl;
        logger->inc(l_osd_waiting_for_map);
        // park the op until the first map arrives
        waiting_for_osdmap.push_back(op);
        op->mark_delayed("no osdmap");
        break;
      }

      // need OSDMap
      dispatch_op(op);
    }
  }
}
7711
7712 // remove me post-nautilus
7713 void OSD::handle_scrub(MOSDScrub *m)
7714 {
7715 dout(10) << "handle_scrub " << *m << dendl;
7716 if (!require_mon_or_mgr_peer(m)) {
7717 m->put();
7718 return;
7719 }
7720 if (m->fsid != monc->get_fsid()) {
7721 dout(0) << "handle_scrub fsid " << m->fsid << " != " << monc->get_fsid()
7722 << dendl;
7723 m->put();
7724 return;
7725 }
7726
7727 vector<spg_t> spgs;
7728 _get_pgids(&spgs);
7729
7730 if (!m->scrub_pgs.empty()) {
7731 vector<spg_t> v;
7732 for (auto pgid : m->scrub_pgs) {
7733 spg_t pcand;
7734 if (osdmap->get_primary_shard(pgid, &pcand) &&
7735 std::find(spgs.begin(), spgs.end(), pcand) != spgs.end()) {
7736 v.push_back(pcand);
7737 }
7738 }
7739 spgs.swap(v);
7740 }
7741
7742 for (auto pgid : spgs) {
7743 enqueue_peering_evt(
7744 pgid,
7745 PGPeeringEventRef(
7746 std::make_shared<PGPeeringEvent>(
7747 get_osdmap_epoch(),
7748 get_osdmap_epoch(),
7749 PG::RequestScrub(m->deep, m->repair))));
7750 }
7751
7752 m->put();
7753 }
7754
7755 void OSD::handle_fast_scrub(MOSDScrub2 *m)
7756 {
7757 dout(10) << __func__ << " " << *m << dendl;
7758 if (!require_mon_or_mgr_peer(m)) {
7759 m->put();
7760 return;
7761 }
7762 if (m->fsid != monc->get_fsid()) {
7763 dout(0) << __func__ << " fsid " << m->fsid << " != " << monc->get_fsid()
7764 << dendl;
7765 m->put();
7766 return;
7767 }
7768 for (auto pgid : m->scrub_pgs) {
7769 enqueue_peering_evt(
7770 pgid,
7771 PGPeeringEventRef(
7772 std::make_shared<PGPeeringEvent>(
7773 m->epoch,
7774 m->epoch,
7775 PG::RequestScrub(m->deep, m->repair))));
7776 }
7777 m->put();
7778 }
7779
7780 bool OSD::scrub_random_backoff()
7781 {
7782 bool coin_flip = (rand() / (double)RAND_MAX >=
7783 cct->_conf->osd_scrub_backoff_ratio);
7784 if (!coin_flip) {
7785 dout(20) << "scrub_random_backoff lost coin flip, randomly backing off" << dendl;
7786 return true;
7787 }
7788 return false;
7789 }
7790
// Construct a scheduled scrub entry for pg 'pg'.
// 'timestamp' is the last scrub stamp; unless 'must' is set, the next
// scheduled time is pushed out by the (pool- or global-) min interval plus
// a random fraction of it, and the deadline by the max interval.
OSDService::ScrubJob::ScrubJob(CephContext* cct,
			       const spg_t& pg, const utime_t& timestamp,
			       double pool_scrub_min_interval,
			       double pool_scrub_max_interval, bool must)
  : cct(cct),
    pgid(pg),
    sched_time(timestamp),
    deadline(timestamp)
{
  // if not explicitly requested, postpone the scrub with a random delay
  if (!must) {
    // pool-level intervals override the global defaults when positive
    double scrub_min_interval = pool_scrub_min_interval > 0 ?
      pool_scrub_min_interval : cct->_conf->osd_scrub_min_interval;
    double scrub_max_interval = pool_scrub_max_interval > 0 ?
      pool_scrub_max_interval : cct->_conf->osd_scrub_max_interval;

    // earliest time: min interval plus a randomized fraction of it
    sched_time += scrub_min_interval;
    double r = rand() / (double)RAND_MAX;
    sched_time +=
      scrub_min_interval * cct->_conf->osd_scrub_interval_randomize_ratio * r;
    if (scrub_max_interval == 0) {
      // max interval of 0 means "no deadline"
      deadline = utime_t();
    } else {
      deadline += scrub_max_interval;
    }

  }
}
7819
7820 bool OSDService::ScrubJob::ScrubJob::operator<(const OSDService::ScrubJob& rhs) const {
7821 if (sched_time < rhs.sched_time)
7822 return true;
7823 if (sched_time > rhs.sched_time)
7824 return false;
7825 return pgid < rhs.pgid;
7826 }
7827
7828 bool OSD::scrub_time_permit(utime_t now)
7829 {
7830 struct tm bdt;
7831 time_t tt = now.sec();
7832 localtime_r(&tt, &bdt);
7833
7834 bool day_permit = false;
7835 if (cct->_conf->osd_scrub_begin_week_day < cct->_conf->osd_scrub_end_week_day) {
7836 if (bdt.tm_wday >= cct->_conf->osd_scrub_begin_week_day && bdt.tm_wday < cct->_conf->osd_scrub_end_week_day) {
7837 day_permit = true;
7838 }
7839 } else {
7840 if (bdt.tm_wday >= cct->_conf->osd_scrub_begin_week_day || bdt.tm_wday < cct->_conf->osd_scrub_end_week_day) {
7841 day_permit = true;
7842 }
7843 }
7844
7845 if (!day_permit) {
7846 dout(20) << __func__ << " should run between week day " << cct->_conf->osd_scrub_begin_week_day
7847 << " - " << cct->_conf->osd_scrub_end_week_day
7848 << " now " << bdt.tm_wday << " = no" << dendl;
7849 return false;
7850 }
7851
7852 bool time_permit = false;
7853 if (cct->_conf->osd_scrub_begin_hour < cct->_conf->osd_scrub_end_hour) {
7854 if (bdt.tm_hour >= cct->_conf->osd_scrub_begin_hour && bdt.tm_hour < cct->_conf->osd_scrub_end_hour) {
7855 time_permit = true;
7856 }
7857 } else {
7858 if (bdt.tm_hour >= cct->_conf->osd_scrub_begin_hour || bdt.tm_hour < cct->_conf->osd_scrub_end_hour) {
7859 time_permit = true;
7860 }
7861 }
7862 if (!time_permit) {
7863 dout(20) << __func__ << " should run between " << cct->_conf->osd_scrub_begin_hour
7864 << " - " << cct->_conf->osd_scrub_end_hour
7865 << " now " << bdt.tm_hour << " = no" << dendl;
7866 } else {
7867 dout(20) << __func__ << " should run between " << cct->_conf->osd_scrub_begin_hour
7868 << " - " << cct->_conf->osd_scrub_end_hour
7869 << " now " << bdt.tm_hour << " = yes" << dendl;
7870 }
7871 return time_permit;
7872 }
7873
7874 bool OSD::scrub_load_below_threshold()
7875 {
7876 double loadavgs[3];
7877 if (getloadavg(loadavgs, 3) != 3) {
7878 dout(10) << __func__ << " couldn't read loadavgs\n" << dendl;
7879 return false;
7880 }
7881
7882 // allow scrub if below configured threshold
7883 long cpus = sysconf(_SC_NPROCESSORS_ONLN);
7884 double loadavg_per_cpu = cpus > 0 ? loadavgs[0] / cpus : loadavgs[0];
7885 if (loadavg_per_cpu < cct->_conf->osd_scrub_load_threshold) {
7886 dout(20) << __func__ << " loadavg per cpu " << loadavg_per_cpu
7887 << " < max " << cct->_conf->osd_scrub_load_threshold
7888 << " = yes" << dendl;
7889 return true;
7890 }
7891
7892 // allow scrub if below daily avg and currently decreasing
7893 if (loadavgs[0] < daily_loadavg && loadavgs[0] < loadavgs[2]) {
7894 dout(20) << __func__ << " loadavg " << loadavgs[0]
7895 << " < daily_loadavg " << daily_loadavg
7896 << " and < 15m avg " << loadavgs[2]
7897 << " = yes" << dendl;
7898 return true;
7899 }
7900
7901 dout(20) << __func__ << " loadavg " << loadavgs[0]
7902 << " >= max " << cct->_conf->osd_scrub_load_threshold
7903 << " and ( >= daily_loadavg " << daily_loadavg
7904 << " or >= 15m avg " << loadavgs[2]
7905 << ") = no" << dendl;
7906 return false;
7907 }
7908
// Walk the scrub-job queue (ordered by scheduled time) and try to start the
// first eligible scrub.  Respects recovery state, the configured scrub time
// window and load threshold; jobs past their deadline bypass the time/load
// restriction.  Stops at the first job scheduled in the future, or when a
// scrub is successfully kicked off.
void OSD::sched_scrub()
{
  // if not permitted, fail fast
  if (!service.can_inc_scrubs()) {
    return;
  }
  bool allow_requested_repair_only = false;
  if (service.is_recovery_active()) {
    // during recovery we may still allow explicitly requested repairs,
    // depending on config; otherwise skip scheduling entirely
    if (!cct->_conf->osd_scrub_during_recovery && cct->_conf->osd_repair_during_recovery) {
      dout(10) << __func__
               << " will only schedule explicitly requested repair due to active recovery"
               << dendl;
      allow_requested_repair_only = true;
    } else if (!cct->_conf->osd_scrub_during_recovery && !cct->_conf->osd_repair_during_recovery) {
      dout(20) << __func__ << " not scheduling scrubs due to active recovery" << dendl;
      return;
    }
  }

  utime_t now = ceph_clock_now();
  bool time_permit = scrub_time_permit(now);
  bool load_is_low = scrub_load_below_threshold();
  dout(20) << "sched_scrub load_is_low=" << (int)load_is_low << dendl;

  OSDService::ScrubJob scrub;
  if (service.first_scrub_stamp(&scrub)) {
    do {
      dout(30) << "sched_scrub examine " << scrub.pgid << " at " << scrub.sched_time << dendl;

      if (scrub.sched_time > now) {
        // save ourselves some effort
        // (jobs are ordered by sched_time, so everything after this one is
        // also in the future -- stop scanning)
        dout(10) << "sched_scrub " << scrub.pgid << " scheduled at " << scrub.sched_time
                 << " > " << now << dendl;
        break;
      }

      // unless the job is already past its deadline, honor the configured
      // scrub time window and load threshold; skip just this job otherwise
      if ((scrub.deadline.is_zero() || scrub.deadline >= now) && !(time_permit && load_is_low)) {
        dout(10) << __func__ << " not scheduling scrub for " << scrub.pgid << " due to "
                 << (!time_permit ? "time not permit" : "high load") << dendl;
        continue;
      }

      PGRef pg = _lookup_lock_pg(scrub.pgid);
      if (!pg)
        continue;
      // This has already started, so go on to the next scrub job
      if (pg->scrubber.active) {
        pg->unlock();
        dout(30) << __func__ << ": already in progress pgid " << scrub.pgid << dendl;
        continue;
      }
      // Skip other kinds of scrubing if only explicitly requested repairing is allowed
      if (allow_requested_repair_only && !pg->scrubber.must_repair) {
        pg->unlock();
        dout(10) << __func__ << " skip " << scrub.pgid
                 << " because repairing is not explicitly requested on it"
                 << dendl;
        continue;
      }
      // If it is reserving, let it resolve before going to the next scrub job
      if (pg->scrubber.local_reserved && !pg->scrubber.active) {
        pg->unlock();
        dout(30) << __func__ << ": reserve in progress pgid " << scrub.pgid << dendl;
        break;
      }
      dout(10) << "sched_scrub scrubbing " << scrub.pgid << " at " << scrub.sched_time
               << (pg->get_must_scrub() ? ", explicitly requested" :
                   (load_is_low ? ", load_is_low" : " deadline < now"))
               << dendl;
      // sched_scrub() returning true means the scrub was started; we are
      // done for this pass
      if (pg->sched_scrub()) {
        pg->unlock();
        break;
      }
      pg->unlock();
    } while (service.next_scrub_stamp(scrub, &scrub));
  }
  dout(20) << "sched_scrub done" << dendl;
}
7987
7988 void OSD::resched_all_scrubs()
7989 {
7990 dout(10) << __func__ << ": start" << dendl;
7991 OSDService::ScrubJob scrub;
7992 if (service.first_scrub_stamp(&scrub)) {
7993 do {
7994 dout(20) << __func__ << ": examine " << scrub.pgid << dendl;
7995
7996 PGRef pg = _lookup_lock_pg(scrub.pgid);
7997 if (!pg)
7998 continue;
7999 if (!pg->scrubber.must_scrub && !pg->scrubber.need_auto) {
8000 dout(20) << __func__ << ": reschedule " << scrub.pgid << dendl;
8001 pg->on_info_history_change();
8002 }
8003 pg->unlock();
8004 } while (service.next_scrub_stamp(scrub, &scrub));
8005 }
8006 dout(10) << __func__ << ": done" << dendl;
8007 }
8008
// Build an MPGStats message containing this OSD's current stats plus the
// stats of every PG for which we are primary, and (when the objectstore
// supports it) per-pool statfs data.  Also recomputes the
// min_last_epoch_clean bookkeeping as a side effect.  Caller owns the
// returned message.
MPGStats* OSD::collect_pg_stats()
{
  // This implementation unconditionally sends every is_primary PG's
  // stats every time we're called. This has equivalent cost to the
  // previous implementation's worst case where all PGs are busy and
  // their stats are always enqueued for sending.
  RWLock::RLocker l(map_lock);

  utime_t had_for = ceph_clock_now() - had_map_since;
  osd_stat_t cur_stat = service.get_osd_stat();
  cur_stat.os_perf_stat = store->get_cur_stats();

  auto m = new MPGStats(monc->get_fsid(), osdmap->get_epoch(), had_for);
  m->osd_stat = cur_stat;

  // reset min_last_epoch_clean; it is re-derived below from the per-PG
  // values reported by get_pg_stats()
  std::lock_guard lec{min_last_epoch_clean_lock};
  min_last_epoch_clean = osdmap->get_epoch();
  min_last_epoch_clean_pgs.clear();

  std::set<int64_t> pool_set;
  vector<PGRef> pgs;
  _get_pgs(&pgs);
  for (auto& pg : pgs) {
    // collect the pool ids of *all* local PGs (primary or not) so we can
    // query per-pool statfs below
    auto pool = pg->pg_id.pgid.pool();
    pool_set.emplace((int64_t)pool);
    if (!pg->is_primary()) {
      continue;
    }
    pg->get_pg_stats([&](const pg_stat_t& s, epoch_t lec) {
      m->pg_stat[pg->pg_id.pgid] = s;
      min_last_epoch_clean = min(min_last_epoch_clean, lec);
      min_last_epoch_clean_pgs.push_back(pg->pg_id.pgid);
    });
  }
  store_statfs_t st;
  bool per_pool_stats = false;
  for (auto p : pool_set) {
    int r = store->pool_statfs(p, &st);
    if (r == -ENOTSUP) {
      // backend doesn't track per-pool stats; stop asking
      break;
    } else {
      assert(r >= 0);
      m->pool_stat[p] = st;
      per_pool_stats = true;
    }
  }

  // indicate whether we are reporting per-pool stats
  m->osd_stat.num_osds = 1;
  m->osd_stat.num_per_pool_osds = per_pool_stats ? 1 : 0;

  return m;
}
8062
// Assemble the health metrics reported to the manager: the number of
// in-flight ops slower than osd_op_complaint_time (with the age of the
// oldest), and the number of pending PG creations for which we would be
// primary.
vector<DaemonHealthMetric> OSD::get_health_metrics()
{
  vector<DaemonHealthMetric> metrics;
  {
    utime_t oldest_secs;
    const utime_t now = ceph_clock_now();
    auto too_old = now;
    too_old -= cct->_conf.get_val<double>("osd_op_complaint_time");
    int slow = 0;
    TrackedOpRef oldest_op;
    // visitor run over every in-flight op; counts those initiated before
    // the complaint cutoff and remembers the oldest one
    auto count_slow_ops = [&](TrackedOp& op) {
      if (op.get_initiated() < too_old) {
        lgeneric_subdout(cct,osd,20) << "slow op " << op.get_desc()
                                     << " initiated "
                                     << op.get_initiated() << dendl;
        slow++;
        if (!oldest_op || op.get_initiated() < oldest_op->get_initiated()) {
          oldest_op = &op;
        }
        return true;
      } else {
        return false;
      }
    };
    if (op_tracker.visit_ops_in_flight(&oldest_secs, count_slow_ops)) {
      if (slow) {
        derr << __func__ << " reporting " << slow << " slow ops, oldest is "
             << oldest_op->get_desc() << dendl;
      }
      metrics.emplace_back(daemon_metric::SLOW_OPS, slow, oldest_secs);
    } else {
      // no news is not good news.
      metrics.emplace_back(daemon_metric::SLOW_OPS, 0, 0);
    }
  }
  {
    std::lock_guard l(pending_creates_lock);
    // count mon-requested creates plus the osd-requested ones where we are
    // flagged (create.second) as primary
    auto n_primaries = pending_creates_from_mon;
    for (const auto& create : pending_creates_from_osd) {
      if (create.second) {
        n_primaries++;
      }
    }
    metrics.emplace_back(daemon_metric::PENDING_CREATING_PGS, n_primaries);
  }
  return metrics;
}
8110
8111 // =====================================================
8112 // MAP
8113
8114 void OSD::wait_for_new_map(OpRequestRef op)
8115 {
8116 // ask?
8117 if (waiting_for_osdmap.empty()) {
8118 osdmap_subscribe(osdmap->get_epoch() + 1, false);
8119 }
8120
8121 logger->inc(l_osd_waiting_for_map);
8122 waiting_for_osdmap.push_back(op);
8123 op->mark_delayed("wait for new map");
8124 }
8125
8126
8127 /** update_map
8128 * assimilate new OSDMap(s). scan pgs, etc.
8129 */
8130
8131 void OSD::note_down_osd(int peer)
8132 {
8133 ceph_assert(osd_lock.is_locked());
8134 cluster_messenger->mark_down_addrs(osdmap->get_cluster_addrs(peer));
8135
8136 heartbeat_lock.Lock();
8137 failure_queue.erase(peer);
8138 failure_pending.erase(peer);
8139 map<int,HeartbeatInfo>::iterator p = heartbeat_peers.find(peer);
8140 if (p != heartbeat_peers.end()) {
8141 p->second.con_back->mark_down();
8142 if (p->second.con_front) {
8143 p->second.con_front->mark_down();
8144 }
8145 heartbeat_peers.erase(p);
8146 }
8147 heartbeat_lock.Unlock();
8148 }
8149
// A peer OSD is marked up in the new map: drop the peer-epoch we had
// cached for it and flag the heartbeat peer set for a refresh.
void OSD::note_up_osd(int peer)
{
  service.forget_peer_epoch(peer, osdmap->get_epoch() - 1);
  heartbeat_set_peers_need_update();
}
8155
// Transaction-commit callback: once the newly received osdmaps in
// [first,last] are durable, hand them to OSD::_committed_osd_maps() and
// release the MOSDMap message reference that was kept alive across the
// commit.
struct C_OnMapCommit : public Context {
  OSD *osd;
  epoch_t first, last;  // inclusive epoch range that was persisted
  MOSDMap *msg;         // message that carried the maps; ref dropped in finish()
  C_OnMapCommit(OSD *o, epoch_t f, epoch_t l, MOSDMap *m)
    : osd(o), first(f), last(l), msg(m) {}
  void finish(int r) override {
    osd->_committed_osd_maps(first, last, msg);
    msg->put();
  }
};
8167
8168 void OSD::osdmap_subscribe(version_t epoch, bool force_request)
8169 {
8170 std::lock_guard l(osdmap_subscribe_lock);
8171 if (latest_subscribed_epoch >= epoch && !force_request)
8172 return;
8173
8174 latest_subscribed_epoch = std::max<uint64_t>(epoch, latest_subscribed_epoch);
8175
8176 if (monc->sub_want_increment("osdmap", epoch, CEPH_SUBSCRIBE_ONETIME) ||
8177 force_request) {
8178 monc->renew_subs();
8179 }
8180 }
8181
// Delete stored osdmaps older than both `oldest` (the oldest map our source
// still has) and whatever the map cache still pins, advancing
// superblock.oldest_map as we go.  Deletions are chunked into multiple
// transactions (bounded by osd_target_transaction_size), and unless
// skip_maps is set we stop after one chunk per call so trimming merely
// keeps pace with `nreceived` newly ingested maps.
void OSD::trim_maps(epoch_t oldest, int nreceived, bool skip_maps)
{
  // never trim maps the cache still holds references to
  epoch_t min = std::min(oldest, service.map_cache.cached_key_lower_bound());
  if (min <= superblock.oldest_map)
    return;

  int num = 0;
  ObjectStore::Transaction t;
  for (epoch_t e = superblock.oldest_map; e < min; ++e) {
    dout(20) << " removing old osdmap epoch " << e << dendl;
    t.remove(coll_t::meta(), get_osdmap_pobject_name(e));
    t.remove(coll_t::meta(), get_inc_osdmap_pobject_name(e));
    superblock.oldest_map = e + 1;
    num++;
    // flush a chunk once it is big enough and we have at least matched the
    // number of maps we just received
    if (num >= cct->_conf->osd_target_transaction_size && num >= nreceived) {
      service.publish_superblock(superblock);
      write_superblock(t);
      int tr = store->queue_transaction(service.meta_ch, std::move(t), nullptr);
      ceph_assert(tr == 0);
      num = 0;
      if (!skip_maps) {
        // skip_maps leaves us with a range of old maps if we fail to remove all
        // of them before moving superblock.oldest_map forward to the first map
        // in the incoming MOSDMap msg. so we should continue removing them in
        // this case, even we could do huge series of delete transactions all at
        // once.
        break;
      }
    }
  }
  // flush any remaining partial chunk
  if (num > 0) {
    service.publish_superblock(superblock);
    write_superblock(t);
    int tr = store->queue_transaction(service.meta_ch, std::move(t), nullptr);
    ceph_assert(tr == 0);
  }
  // we should not remove the cached maps
  ceph_assert(min <= service.map_cache.cached_key_lower_bound());
}
8221
// Ingest an MOSDMap message: validate the sender, decode full and
// incremental maps, persist them (plus pg_num history and the updated
// superblock) in one transaction, and schedule C_OnMapCommit to apply them
// once durable.  May block first if PGs have fallen too far behind in map
// consumption.  Consumes the message reference on every early-return path;
// on the success path the ref is handed to C_OnMapCommit.
void OSD::handle_osd_map(MOSDMap *m)
{
  // wait for pgs to catch up
  {
    // we extend the map cache pins to accomodate pgs slow to consume maps
    // for some period, until we hit the max_lag_factor bound, at which point
    // we block here to stop injesting more maps than they are able to keep
    // up with.
    epoch_t max_lag = cct->_conf->osd_map_cache_size *
      m_osd_pg_epoch_max_lag_factor;
    ceph_assert(max_lag > 0);
    epoch_t osd_min = 0;
    // find the slowest shard's minimum pg epoch
    for (auto shard : shards) {
      epoch_t min = shard->get_min_pg_epoch();
      if (osd_min == 0 || min < osd_min) {
        osd_min = min;
      }
    }
    if (osd_min > 0 &&
        osdmap->get_epoch() > max_lag &&
        osdmap->get_epoch() - max_lag > osd_min) {
      epoch_t need = osdmap->get_epoch() - max_lag;
      dout(10) << __func__ << " waiting for pgs to catch up (need " << need
               << " max_lag " << max_lag << ")" << dendl;
      for (auto shard : shards) {
        epoch_t min = shard->get_min_pg_epoch();
        if (need > min) {
          dout(10) << __func__ << " waiting for pgs to consume " << need
                   << " (shard " << shard->shard_id << " min " << min
                   << ", map cache is " << cct->_conf->osd_map_cache_size
                   << ", max_lag_factor " << m_osd_pg_epoch_max_lag_factor
                   << ")" << dendl;
          // drop osd_lock while blocking so we don't stall everything else
          unlock_guard unlock{osd_lock};
          shard->wait_min_pg_epoch(need);
        }
      }
    }
  }

  ceph_assert(osd_lock.is_locked());
  map<epoch_t,OSDMapRef> added_maps;
  map<epoch_t,bufferlist> added_maps_bl;
  if (m->fsid != monc->get_fsid()) {
    dout(0) << "handle_osd_map fsid " << m->fsid << " != "
            << monc->get_fsid() << dendl;
    m->put();
    return;
  }
  if (is_initializing()) {
    dout(0) << "ignoring osdmap until we have initialized" << dendl;
    m->put();
    return;
  }

  // only accept maps from monitors or other OSDs
  auto priv = m->get_connection()->get_priv();
  if (auto session = static_cast<Session *>(priv.get());
      session && !(session->entity_name.is_mon() ||
                   session->entity_name.is_osd())) {
    //not enough perms!
    dout(10) << "got osd map from Session " << session
             << " which we can't take maps from (not a mon or osd)" << dendl;
    m->put();
    return;
  }

  // share with the objecter
  if (!is_preboot())
    service.objecter->handle_osd_map(m);

  epoch_t first = m->get_first();
  epoch_t last = m->get_last();
  dout(3) << "handle_osd_map epochs [" << first << "," << last << "], i have "
          << superblock.newest_map
          << ", src has [" << m->oldest_map << "," << m->newest_map << "]"
          << dendl;

  logger->inc(l_osd_map);
  logger->inc(l_osd_mape, last - first + 1);
  if (first <= superblock.newest_map)
    logger->inc(l_osd_mape_dup, superblock.newest_map - first + 1);
  if (service.max_oldest_map < m->oldest_map) {
    service.max_oldest_map = m->oldest_map;
    ceph_assert(service.max_oldest_map >= superblock.oldest_map);
  }

  // make sure there is something new, here, before we bother flushing
  // the queues and such
  if (last <= superblock.newest_map) {
    dout(10) << " no new maps here, dropping" << dendl;
    m->put();
    return;
  }

  // missing some?
  bool skip_maps = false;
  if (first > superblock.newest_map + 1) {
    dout(10) << "handle_osd_map message skips epochs "
             << superblock.newest_map + 1 << ".." << (first-1) << dendl;
    // if the source still has the maps we're missing, re-subscribe for them
    if (m->oldest_map <= superblock.newest_map + 1) {
      osdmap_subscribe(superblock.newest_map + 1, false);
      m->put();
      return;
    }
    // always try to get the full range of maps--as many as we can. this
    // 1- is good to have
    // 2- is at present the only way to ensure that we get a *full* map as
    // the first map!
    if (m->oldest_map < first) {
      osdmap_subscribe(m->oldest_map - 1, true);
      m->put();
      return;
    }
    skip_maps = true;
  }

  ObjectStore::Transaction t;
  uint64_t txn_size = 0;

  // store new maps: queue for disk and put in the osdmap cache
  epoch_t start = std::max(superblock.newest_map + 1, first);
  for (epoch_t e = start; e <= last; e++) {
    // the encoded size must grow monotonically; a decrease means the
    // accumulated size wrapped around
    if (txn_size >= t.get_num_bytes()) {
      derr << __func__ << " transaction size overflowed" << dendl;
      ceph_assert(txn_size < t.get_num_bytes());
    }
    txn_size = t.get_num_bytes();
    map<epoch_t,bufferlist>::iterator p;
    p = m->maps.find(e);
    if (p != m->maps.end()) {
      // full map supplied: persist and cache it directly
      dout(10) << "handle_osd_map got full map for epoch " << e << dendl;
      OSDMap *o = new OSDMap;
      bufferlist& bl = p->second;

      o->decode(bl);

      ghobject_t fulloid = get_osdmap_pobject_name(e);
      t.write(coll_t::meta(), fulloid, 0, bl.length(), bl);
      added_maps[e] = add_map(o);
      added_maps_bl[e] = bl;
      got_full_map(e);
      continue;
    }

    p = m->incremental_maps.find(e);
    if (p != m->incremental_maps.end()) {
      // incremental: apply onto the previous epoch's full map, then persist
      // both the incremental and the regenerated full map
      dout(10) << "handle_osd_map got inc map for epoch " << e << dendl;
      bufferlist& bl = p->second;
      ghobject_t oid = get_inc_osdmap_pobject_name(e);
      t.write(coll_t::meta(), oid, 0, bl.length(), bl);

      OSDMap *o = new OSDMap;
      if (e > 1) {
        bufferlist obl;
        bool got = get_map_bl(e - 1, obl);
        if (!got) {
          // previous map was received in this same message batch
          auto p = added_maps_bl.find(e - 1);
          ceph_assert(p != added_maps_bl.end());
          obl = p->second;
        }
        o->decode(obl);
      }

      OSDMap::Incremental inc;
      auto p = bl.cbegin();
      inc.decode(p);

      if (o->apply_incremental(inc) < 0) {
        derr << "ERROR: bad fsid? i have " << osdmap->get_fsid() << " and inc has " << inc.fsid << dendl;
        ceph_abort_msg("bad fsid");
      }

      bufferlist fbl;
      o->encode(fbl, inc.encode_features | CEPH_FEATURE_RESERVED);

      // optional fault injection to exercise the crc-mismatch path
      bool injected_failure = false;
      if (cct->_conf->osd_inject_bad_map_crc_probability > 0 &&
          (rand() % 10000) < cct->_conf->osd_inject_bad_map_crc_probability*10000.0) {
        derr << __func__ << " injecting map crc failure" << dendl;
        injected_failure = true;
      }

      if ((inc.have_crc && o->get_crc() != inc.full_crc) || injected_failure) {
        // our regenerated full map doesn't match the expected crc; fall
        // back to requesting full maps for [e, last]
        dout(2) << "got incremental " << e
                << " but failed to encode full with correct crc; requesting"
                << dendl;
        clog->warn() << "failed to encode map e" << e << " with expected crc";
        dout(20) << "my encoded map was:\n";
        fbl.hexdump(*_dout);
        *_dout << dendl;
        delete o;
        request_full_map(e, last);
        last = e - 1;
        break;
      }
      got_full_map(e);

      ghobject_t fulloid = get_osdmap_pobject_name(e);
      t.write(coll_t::meta(), fulloid, 0, fbl.length(), fbl);
      added_maps[e] = add_map(o);
      added_maps_bl[e] = fbl;
      continue;
    }

    ceph_abort_msg("MOSDMap lied about what maps it had?");
  }

  // even if this map isn't from a mon, we may have satisfied our subscription
  monc->sub_got("osdmap", last);

  if (!m->maps.empty() && requested_full_first) {
    dout(10) << __func__ << " still missing full maps " << requested_full_first
             << ".." << requested_full_last << dendl;
    rerequest_full_maps();
  }

  if (superblock.oldest_map) {
    // make sure we at least keep pace with incoming maps
    trim_maps(m->oldest_map, last - first + 1, skip_maps);
    pg_num_history.prune(superblock.oldest_map);
  }

  if (!superblock.oldest_map || skip_maps)
    superblock.oldest_map = first;
  superblock.newest_map = last;
  superblock.current_epoch = last;

  // note in the superblock that we were clean thru the prior epoch
  epoch_t boot_epoch = service.get_boot_epoch();
  if (boot_epoch && boot_epoch >= superblock.mounted) {
    superblock.mounted = boot_epoch;
    superblock.clean_thru = last;
  }

  // check for pg_num changes and deleted pools
  OSDMapRef lastmap;
  for (auto& i : added_maps) {
    if (!lastmap) {
      if (!(lastmap = service.try_get_map(i.first - 1))) {
        dout(10) << __func__ << " can't get previous map " << i.first - 1
                 << " probably first start of this osd" << dendl;
        continue;
      }
    }
    ceph_assert(lastmap->get_epoch() + 1 == i.second->get_epoch());
    for (auto& j : lastmap->get_pools()) {
      if (!i.second->have_pg_pool(j.first)) {
        // pool deleted between lastmap and this map
        pg_num_history.log_pool_delete(i.first, j.first);
        dout(10) << __func__ << " recording final pg_pool_t for pool "
                 << j.first << dendl;
        // this information is needed by _make_pg() if have to restart before
        // the pool is deleted and need to instantiate a new (zombie) PG[Pool].
        ghobject_t obj = make_final_pool_info_oid(j.first);
        bufferlist bl;
        encode(j.second, bl, CEPH_FEATURES_ALL);
        string name = lastmap->get_pool_name(j.first);
        encode(name, bl);
        map<string,string> profile;
        if (lastmap->get_pg_pool(j.first)->is_erasure()) {
          profile = lastmap->get_erasure_code_profile(
            lastmap->get_pg_pool(j.first)->erasure_code_profile);
        }
        encode(profile, bl);
        t.write(coll_t::meta(), obj, 0, bl.length(), bl);
        service.store_deleted_pool_pg_num(j.first, j.second.get_pg_num());
      } else if (unsigned new_pg_num = i.second->get_pg_num(j.first);
                 new_pg_num != j.second.get_pg_num()) {
        // pg_num changed for an existing pool
        dout(10) << __func__ << " recording pool " << j.first << " pg_num "
                 << j.second.get_pg_num() << " -> " << new_pg_num << dendl;
        pg_num_history.log_pg_num_change(i.first, j.first, new_pg_num);
      }
    }
    for (auto& j : i.second->get_pools()) {
      if (!lastmap->have_pg_pool(j.first)) {
        // brand-new pool in this epoch
        dout(10) << __func__ << " recording new pool " << j.first << " pg_num "
                 << j.second.get_pg_num() << dendl;
        pg_num_history.log_pg_num_change(i.first, j.first,
                                         j.second.get_pg_num());
      }
    }
    lastmap = i.second;
  }
  pg_num_history.epoch = last;
  {
    // persist the updated pg_num history alongside the maps
    bufferlist bl;
    ::encode(pg_num_history, bl);
    t.write(coll_t::meta(), make_pg_num_history_oid(), 0, bl.length(), bl);
    dout(20) << __func__ << " pg_num_history " << pg_num_history << dendl;
  }

  // superblock and commit
  write_superblock(t);
  t.register_on_commit(new C_OnMapCommit(this, start, last, m));
  store->queue_transaction(
    service.meta_ch,
    std::move(t));
  service.publish_superblock(superblock);
}
8519
// Called (via C_OnMapCommit) after maps [first,last] are durable: advance
// the in-memory osdmap epoch by epoch, noting peers that went up/down,
// handle our own up/down/address transitions (booting -> active, or being
// wrongly marked down -> rebind and restart), then publish the map to the
// rest of the OSD.  May schedule an async shutdown or kick off a re-boot.
void OSD::_committed_osd_maps(epoch_t first, epoch_t last, MOSDMap *m)
{
  dout(10) << __func__ << " " << first << ".." << last << dendl;
  if (is_stopping()) {
    dout(10) << __func__ << " bailing, we are shutting down" << dendl;
    return;
  }
  std::lock_guard l(osd_lock);
  // re-check after acquiring osd_lock; we may have started stopping while
  // waiting for it
  if (is_stopping()) {
    dout(10) << __func__ << " bailing, we are shutting down" << dendl;
    return;
  }
  map_lock.get_write();

  bool do_shutdown = false;
  bool do_restart = false;
  bool network_error = false;

  // advance through the new maps
  for (epoch_t cur = first; cur <= last; cur++) {
    dout(10) << " advance to epoch " << cur
             << " (<= last " << last
             << " <= newest_map " << superblock.newest_map
             << ")" << dendl;

    OSDMapRef newmap = get_map(cur);
    ceph_assert(newmap); // we just cached it above!

    // start blacklisting messages sent to peers that go down.
    service.pre_publish_map(newmap);

    // kill connections to newly down osds
    bool waited_for_reservations = false;
    set<int> old;
    osdmap->get_all_osds(old);
    for (set<int>::iterator p = old.begin(); p != old.end(); ++p) {
      if (*p != whoami &&
          osdmap->is_up(*p) && // in old map
          newmap->is_down(*p)) { // but not the new one
        if (!waited_for_reservations) {
          // only once per epoch: let in-flight map reservations drain
          // before tearing peers down
          service.await_reserved_maps();
          waited_for_reservations = true;
        }
        note_down_osd(*p);
      } else if (*p != whoami &&
                 osdmap->is_down(*p) &&
                 newmap->is_up(*p)) {
        note_up_osd(*p);
      }
    }

    if (osdmap->is_noup(whoami) != newmap->is_noup(whoami)) {
      dout(10) << __func__ << " NOUP flag changed in " << newmap->get_epoch()
               << dendl;
      if (is_booting()) {
        // this captures the case where we sent the boot message while
        // NOUP was being set on the mon and our boot request was
        // dropped, and then later it is cleared. it imperfectly
        // handles the case where our original boot message was not
        // dropped and we restart even though we might have booted, but
        // that is harmless (boot will just take slightly longer).
        do_restart = true;
      }
    }

    osdmap = newmap;
    // record up/boot epochs the first time the map shows us up at our
    // current address
    epoch_t up_epoch;
    epoch_t boot_epoch;
    service.retrieve_epochs(&boot_epoch, &up_epoch, NULL);
    if (!up_epoch &&
        osdmap->is_up(whoami) &&
        osdmap->get_addrs(whoami) == client_messenger->get_myaddrs()) {
      up_epoch = osdmap->get_epoch();
      dout(10) << "up_epoch is " << up_epoch << dendl;
      if (!boot_epoch) {
        boot_epoch = osdmap->get_epoch();
        dout(10) << "boot_epoch is " << boot_epoch << dendl;
      }
      service.set_epochs(&boot_epoch, &up_epoch, NULL);
    }
  }

  had_map_since = ceph_clock_now();

  // booting -> active transition: the map now shows us up at the address we
  // bound after our last (re)bind
  epoch_t _bind_epoch = service.get_bind_epoch();
  if (osdmap->is_up(whoami) &&
      osdmap->get_addrs(whoami).legacy_equals(
        client_messenger->get_myaddrs()) &&
      _bind_epoch < osdmap->get_up_from(whoami)) {

    if (is_booting()) {
      dout(1) << "state: booting -> active" << dendl;
      set_state(STATE_ACTIVE);
      do_restart = false;

      // set incarnation so that osd_reqid_t's we generate for our
      // objecter requests are unique across restarts.
      service.objecter->set_client_incarnation(osdmap->get_epoch());
      cancel_pending_failures();
    }
  }

  if (osdmap->get_epoch() > 0 &&
      is_active()) {
    if (!osdmap->exists(whoami)) {
      dout(0) << "map says i do not exist. shutting down." << dendl;
      do_shutdown = true; // don't call shutdown() while we have
                          // everything paused
    } else if (!osdmap->is_up(whoami) ||
               !osdmap->get_addrs(whoami).legacy_equals(
                 client_messenger->get_myaddrs()) ||
               !osdmap->get_cluster_addrs(whoami).legacy_equals(
                 cluster_messenger->get_myaddrs()) ||
               !osdmap->get_hb_back_addrs(whoami).legacy_equals(
                 hb_back_server_messenger->get_myaddrs()) ||
               !osdmap->get_hb_front_addrs(whoami).legacy_equals(
                 hb_front_server_messenger->get_myaddrs())) {
      // we are active but the map disagrees about our state or addresses;
      // log why, then rebind and re-boot (or shut down if this keeps
      // happening)
      if (!osdmap->is_up(whoami)) {
        if (service.is_preparing_to_stop() || service.is_stopping()) {
          service.got_stop_ack();
        } else {
          clog->warn() << "Monitor daemon marked osd." << whoami << " down, "
                          "but it is still running";
          clog->debug() << "map e" << osdmap->get_epoch()
                        << " wrongly marked me down at e"
                        << osdmap->get_down_at(whoami);
        }
      } else if (!osdmap->get_addrs(whoami).legacy_equals(
                   client_messenger->get_myaddrs())) {
        clog->error() << "map e" << osdmap->get_epoch()
                      << " had wrong client addr (" << osdmap->get_addrs(whoami)
                      << " != my " << client_messenger->get_myaddrs() << ")";
      } else if (!osdmap->get_cluster_addrs(whoami).legacy_equals(
                   cluster_messenger->get_myaddrs())) {
        clog->error() << "map e" << osdmap->get_epoch()
                      << " had wrong cluster addr ("
                      << osdmap->get_cluster_addrs(whoami)
                      << " != my " << cluster_messenger->get_myaddrs() << ")";
      } else if (!osdmap->get_hb_back_addrs(whoami).legacy_equals(
                   hb_back_server_messenger->get_myaddrs())) {
        clog->error() << "map e" << osdmap->get_epoch()
                      << " had wrong heartbeat back addr ("
                      << osdmap->get_hb_back_addrs(whoami)
                      << " != my " << hb_back_server_messenger->get_myaddrs()
                      << ")";
      } else if (!osdmap->get_hb_front_addrs(whoami).legacy_equals(
                   hb_front_server_messenger->get_myaddrs())) {
        clog->error() << "map e" << osdmap->get_epoch()
                      << " had wrong heartbeat front addr ("
                      << osdmap->get_hb_front_addrs(whoami)
                      << " != my " << hb_front_server_messenger->get_myaddrs()
                      << ")";
      }

      if (!service.is_stopping()) {
        epoch_t up_epoch = 0;
        epoch_t bind_epoch = osdmap->get_epoch();
        service.set_epochs(NULL,&up_epoch, &bind_epoch);
        do_restart = true;

        //add markdown log
        utime_t now = ceph_clock_now();
        utime_t grace = utime_t(cct->_conf->osd_max_markdown_period, 0);
        osd_markdown_log.push_back(now);
        //clear all out-of-date log
        while (!osd_markdown_log.empty() &&
               osd_markdown_log.front() + grace < now)
          osd_markdown_log.pop_front();
        // too many markdowns within the grace period -> give up and shut down
        if ((int)osd_markdown_log.size() > cct->_conf->osd_max_markdown_count) {
          dout(0) << __func__ << " marked down "
                  << osd_markdown_log.size()
                  << " > osd_max_markdown_count "
                  << cct->_conf->osd_max_markdown_count
                  << " in last " << grace << " seconds, shutting down"
                  << dendl;
          do_restart = false;
          do_shutdown = true;
        }

        start_waiting_for_healthy();

        // rebind all server messengers to fresh ports, avoiding ports we
        // are already using
        set<int> avoid_ports;
#if defined(__FreeBSD__)
        // prevent FreeBSD from grabbing the client_messenger port during
        // rebinding. In which case a cluster_meesneger will connect also
        // to the same port
        client_messenger->get_myaddrs().get_ports(&avoid_ports);
#endif
        cluster_messenger->get_myaddrs().get_ports(&avoid_ports);
        hb_back_server_messenger->get_myaddrs().get_ports(&avoid_ports);
        hb_front_server_messenger->get_myaddrs().get_ports(&avoid_ports);

        int r = cluster_messenger->rebind(avoid_ports);
        if (r != 0) {
          do_shutdown = true; // FIXME: do_restart?
          network_error = true;
          dout(0) << __func__ << " marked down:"
                  << " rebind cluster_messenger failed" << dendl;
        }

        r = hb_back_server_messenger->rebind(avoid_ports);
        if (r != 0) {
          do_shutdown = true; // FIXME: do_restart?
          network_error = true;
          dout(0) << __func__ << " marked down:"
                  << " rebind hb_back_server_messenger failed" << dendl;
        }

        r = hb_front_server_messenger->rebind(avoid_ports);
        if (r != 0) {
          do_shutdown = true; // FIXME: do_restart?
          network_error = true;
          dout(0) << __func__ << " marked down:"
                  << " rebind hb_front_server_messenger failed" << dendl;
        }

        hb_front_client_messenger->mark_down_all();
        hb_back_client_messenger->mark_down_all();

        reset_heartbeat_peers(true);
      }
    }
  }

  map_lock.put_write();

  check_osdmap_features();

  // yay!
  consume_map();

  if (is_active() || is_waiting_for_healthy())
    maybe_update_heartbeat_peers();

  if (is_active()) {
    activate_map();
  }

  if (do_shutdown) {
    if (network_error) {
      cancel_pending_failures();
    }
    // trigger shutdown in a different thread
    dout(0) << __func__ << " shutdown OSD via async signal" << dendl;
    queue_async_signal(SIGINT);
  }
  else if (m->newest_map && m->newest_map > last) {
    dout(10) << " msg say newest map is " << m->newest_map
             << ", requesting more" << dendl;
    osdmap_subscribe(osdmap->get_epoch()+1, false);
  }
  else if (is_preboot()) {
    if (m->get_source().is_mon())
      _preboot(m->oldest_map, m->newest_map);
    else
      start_boot();
  }
  else if (do_restart)
    start_boot();

}
8781
// After adopting a new osdmap: sync messenger feature-bit requirements
// (client, mon, osd policies) with what the map advertises, enable the
// on-disk SHARDS compat feature if missing, and persist require_osd_release
// when it changes.
void OSD::check_osdmap_features()
{
  // adjust required feature bits?

  // we have to be a bit careful here, because we are accessing the
  // Policy structures without taking any lock. in particular, only
  // modify integer values that can safely be read by a racing CPU.
  // since we are only accessing existing Policy structures a their
  // current memory location, and setting or clearing bits in integer
  // fields, and we are the only writer, this is not a problem.

  {
    // default (client-facing) policy
    Messenger::Policy p = client_messenger->get_default_policy();
    uint64_t mask;
    uint64_t features = osdmap->get_features(entity_name_t::TYPE_CLIENT, &mask);
    if ((p.features_required & mask) != features) {
      dout(0) << "crush map has features " << features
              << ", adjusting msgr requires for clients" << dendl;
      // replace only the masked bits, preserving requirements outside mask
      p.features_required = (p.features_required & ~mask) | features;
      client_messenger->set_default_policy(p);
    }
  }
  {
    // monitor policy
    Messenger::Policy p = client_messenger->get_policy(entity_name_t::TYPE_MON);
    uint64_t mask;
    uint64_t features = osdmap->get_features(entity_name_t::TYPE_MON, &mask);
    if ((p.features_required & mask) != features) {
      dout(0) << "crush map has features " << features
              << " was " << p.features_required
              << ", adjusting msgr requires for mons" << dendl;
      p.features_required = (p.features_required & ~mask) | features;
      client_messenger->set_policy(entity_name_t::TYPE_MON, p);
    }
  }
  {
    // osd-to-osd (cluster) policy
    Messenger::Policy p = cluster_messenger->get_policy(entity_name_t::TYPE_OSD);
    uint64_t mask;
    uint64_t features = osdmap->get_features(entity_name_t::TYPE_OSD, &mask);

    if ((p.features_required & mask) != features) {
      dout(0) << "crush map has features " << features
              << ", adjusting msgr requires for osds" << dendl;
      p.features_required = (p.features_required & ~mask) | features;
      cluster_messenger->set_policy(entity_name_t::TYPE_OSD, p);
    }

    // one-time upgrade: record the SHARDS incompat feature in the
    // superblock and persist it immediately
    if (!superblock.compat_features.incompat.contains(CEPH_OSD_FEATURE_INCOMPAT_SHARDS)) {
      dout(0) << __func__ << " enabling on-disk ERASURE CODES compat feature" << dendl;
      superblock.compat_features.incompat.insert(CEPH_OSD_FEATURE_INCOMPAT_SHARDS);
      ObjectStore::Transaction t;
      write_superblock(t);
      int err = store->queue_transaction(service.meta_ch, std::move(t), NULL);
      ceph_assert(err == 0);
    }
  }

  if (osdmap->require_osd_release < CEPH_RELEASE_NAUTILUS) {
    heartbeat_dispatcher.ms_set_require_authorizer(false);
  }

  // persist require_osd_release in the store's meta so it survives restarts
  if (osdmap->require_osd_release != last_require_osd_release) {
    dout(1) << __func__ << " require_osd_release " << last_require_osd_release
            << " -> " << to_string(osdmap->require_osd_release) << dendl;
    store->write_meta("require_osd_release",
                      stringify((int)osdmap->require_osd_release));
    last_require_osd_release = osdmap->require_osd_release;
  }
}
8850
// Completion callback queued on the split transaction: once it
// commits, hand the newly created child PGs to OSD::_finish_splits()
// for initialization and registration.  Holds PGRefs so the children
// stay alive until the callback runs.
struct C_FinishSplits : public Context {
  OSD *osd;
  set<PGRef> pgs;
  C_FinishSplits(OSD *osd, const set<PGRef> &in)
    : osd(osd), pgs(in) {}
  void finish(int r) override {
    osd->_finish_splits(pgs);
  }
};
8860
8861 void OSD::_finish_splits(set<PGRef>& pgs)
8862 {
8863 dout(10) << __func__ << " " << pgs << dendl;
8864 if (is_stopping())
8865 return;
8866 PG::RecoveryCtx rctx = create_context();
8867 for (set<PGRef>::iterator i = pgs.begin();
8868 i != pgs.end();
8869 ++i) {
8870 PG *pg = i->get();
8871
8872 pg->lock();
8873 dout(10) << __func__ << " " << *pg << dendl;
8874 epoch_t e = pg->get_osdmap_epoch();
8875 pg->handle_initialize(&rctx);
8876 pg->queue_null(e, e);
8877 dispatch_context_transaction(rctx, pg);
8878 pg->unlock();
8879
8880 unsigned shard_index = pg->pg_id.hash_to_shard(num_shards);
8881 shards[shard_index]->register_and_wake_split_child(pg);
8882 }
8883
8884 dispatch_context(rctx, 0, service.get_osdmap());
8885 };
8886
8887 bool OSD::add_merge_waiter(OSDMapRef nextmap, spg_t target, PGRef src,
8888 unsigned need)
8889 {
8890 std::lock_guard l(merge_lock);
8891 auto& p = merge_waiters[nextmap->get_epoch()][target];
8892 p[src->pg_id] = src;
8893 dout(10) << __func__ << " added merge_waiter " << src->pg_id
8894 << " for " << target << ", have " << p.size() << "/" << need
8895 << dendl;
8896 return p.size() == need;
8897 }
8898
// Walk a PG forward through OSDMaps, one epoch at a time, up to
// osd_epoch, handling any pool pg_num changes (splits and merges)
// encountered along the way.  Returns true if the PG fully advanced;
// returns false if the PG was consumed as a merge source or is
// waiting for merge sources to arrive (in which case the caller must
// not touch the PG further -- it may already be unlocked/detached).
// Called with the PG locked.
bool OSD::advance_pg(
  epoch_t osd_epoch,
  PG *pg,
  ThreadPool::TPHandle &handle,
  PG::RecoveryCtx *rctx)
{
  if (osd_epoch <= pg->get_osdmap_epoch()) {
    return true;  // already caught up
  }
  ceph_assert(pg->is_locked());
  OSDMapRef lastmap = pg->get_osdmap();
  ceph_assert(lastmap->get_epoch() < osd_epoch);
  set<PGRef> new_pgs;  // any split children
  bool ret = true;

  // pg_num as of the PG's current map (0 if the pool is gone)
  unsigned old_pg_num = lastmap->have_pg_pool(pg->pg_id.pool()) ?
    lastmap->get_pg_num(pg->pg_id.pool()) : 0;
  for (epoch_t next_epoch = pg->get_osdmap_epoch() + 1;
       next_epoch <= osd_epoch;
       ++next_epoch) {
    OSDMapRef nextmap = service.try_get_map(next_epoch);
    if (!nextmap) {
      // map not cached/available; skip this epoch and keep advancing
      dout(20) << __func__ << " missing map " << next_epoch << dendl;
      continue;
    }

    unsigned new_pg_num =
      (old_pg_num && nextmap->have_pg_pool(pg->pg_id.pool())) ?
      nextmap->get_pg_num(pg->pg_id.pool()) : 0;
    if (old_pg_num && new_pg_num && old_pg_num != new_pg_num) {
      // check for merge
      if (nextmap->have_pg_pool(pg->pg_id.pool())) {
	spg_t parent;
	if (pg->pg_id.is_merge_source(
	      old_pg_num,
	      new_pg_num,
	      &parent)) {
	  // we are merge source: tear this PG down and park it on the
	  // merge_waiters list until all sources for the target arrive.
	  PGRef spg = pg; // carry a ref
	  dout(1) << __func__ << " " << pg->pg_id
		  << " is merge source, target is " << parent
		  << dendl;
	  pg->write_if_dirty(rctx);
	  dispatch_context_transaction(*rctx, pg, &handle);
	  pg->ch->flush();
	  // release backoffs explicitly, since the on_shutdown path
	  // aggressively tears down backoff state.
	  if (pg->is_primary()) {
	    pg->release_pg_backoffs();
	  }
	  pg->on_shutdown();
	  OSDShard *sdata = pg->osd_shard;
	  {
	    std::lock_guard l(sdata->shard_lock);
	    if (pg->pg_slot) {
	      sdata->_detach_pg(pg->pg_slot);
	      // update pg count now since we might not get an osdmap
	      // any time soon.
	      if (pg->is_primary())
		logger->dec(l_osd_pg_primary);
	      else if (pg->is_replica())
		logger->dec(l_osd_pg_replica);
	      else
		logger->dec(l_osd_pg_stray);
	    }
	  }
	  pg->unlock();

	  // compute the full set of sources for the target; if we are
	  // the last one to arrive, wake the target with a null event.
	  set<spg_t> children;
	  parent.is_split(new_pg_num, old_pg_num, &children);
	  if (add_merge_waiter(nextmap, parent, pg, children.size())) {
	    enqueue_peering_evt(
	      parent,
	      PGPeeringEventRef(
		std::make_shared<PGPeeringEvent>(
		  nextmap->get_epoch(),
		  nextmap->get_epoch(),
		  NullEvt())));
	  }
	  ret = false;
	  goto out;
	} else if (pg->pg_id.is_merge_target(old_pg_num, new_pg_num)) {
	  // we are merge target
	  set<spg_t> children;
	  pg->pg_id.is_split(new_pg_num, old_pg_num, &children);
	  dout(20) << __func__ << " " << pg->pg_id
		   << " is merge target, sources are " << children
		   << dendl;
	  map<spg_t,PGRef> sources;
	  {
	    // claim the waiters only if *all* sources have arrived
	    std::lock_guard l(merge_lock);
	    auto& s = merge_waiters[nextmap->get_epoch()][pg->pg_id];
	    unsigned need = children.size();
	    dout(20) << __func__ << " have " << s.size() << "/"
		     << need << dendl;
	    if (s.size() == need) {
	      sources.swap(s);
	      merge_waiters[nextmap->get_epoch()].erase(pg->pg_id);
	      if (merge_waiters[nextmap->get_epoch()].empty()) {
		merge_waiters.erase(nextmap->get_epoch());
	      }
	    }
	  }
	  if (!sources.empty()) {
	    // NOTE: shadows the outer new_pg_num with the same value
	    // recomputed from nextmap
	    unsigned new_pg_num = nextmap->get_pg_num(pg->pg_id.pool());
	    unsigned split_bits = pg->pg_id.get_split_bits(new_pg_num);
	    dout(1) << __func__ << " merging " << pg->pg_id << dendl;
	    pg->merge_from(
	      sources, rctx, split_bits,
	      nextmap->get_pg_pool(
		pg->pg_id.pool())->last_pg_merge_meta);
	    pg->pg_slot->waiting_for_merge_epoch = 0;
	  } else {
	    dout(20) << __func__ << " not ready to merge yet" << dendl;
	    pg->write_if_dirty(rctx);
	    pg->unlock();
	    // kick source(s) to get them ready
	    for (auto& i : children) {
	      dout(20) << __func__ << " kicking source " << i << dendl;
	      enqueue_peering_evt(
		i,
		PGPeeringEventRef(
		  std::make_shared<PGPeeringEvent>(
		    nextmap->get_epoch(),
		    nextmap->get_epoch(),
		    NullEvt())));
	    }
	    ret = false;
	    goto out;
	  }
	}
      }
    }

    // advance the PG's mapping to the next epoch
    vector<int> newup, newacting;
    int up_primary, acting_primary;
    nextmap->pg_to_up_acting_osds(
      pg->pg_id.pgid,
      &newup, &up_primary,
      &newacting, &acting_primary);
    pg->handle_advance_map(
      nextmap, lastmap, newup, up_primary,
      newacting, acting_primary, rctx);

    // re-schedule scrubs if the pool's scrub interval options changed
    auto oldpool = lastmap->get_pools().find(pg->pg_id.pool());
    auto newpool = nextmap->get_pools().find(pg->pg_id.pool());
    if (oldpool != lastmap->get_pools().end()
	&& newpool != nextmap->get_pools().end()) {
      dout(20) << __func__
	       << " new pool opts " << newpool->second.opts
	       << " old pool opts " << oldpool->second.opts
	       << dendl;

      double old_min_interval = 0, new_min_interval = 0;
      oldpool->second.opts.get(pool_opts_t::SCRUB_MIN_INTERVAL, &old_min_interval);
      newpool->second.opts.get(pool_opts_t::SCRUB_MIN_INTERVAL, &new_min_interval);

      double old_max_interval = 0, new_max_interval = 0;
      oldpool->second.opts.get(pool_opts_t::SCRUB_MAX_INTERVAL, &old_max_interval);
      newpool->second.opts.get(pool_opts_t::SCRUB_MAX_INTERVAL, &new_max_interval);

      // Assume if an interval is change from set to unset or vice versa the actual config
      // is different.  Keep it simple even if it is possible to call resched_all_scrub()
      // unnecessarily.
      if (old_min_interval != new_min_interval || old_max_interval != new_max_interval) {
	pg->on_info_history_change();
      }
    }

    if (new_pg_num && old_pg_num != new_pg_num) {
      // check for split
      set<spg_t> children;
      if (pg->pg_id.is_split(
	    old_pg_num,
	    new_pg_num,
	    &children)) {
	split_pgs(
	  pg, children, &new_pgs, lastmap, nextmap,
	  rctx);
      }
    }

    lastmap = nextmap;
    old_pg_num = new_pg_num;
    handle.reset_tp_timeout();
  }
  pg->handle_activate_map(rctx);

  ret = true;
 out:
  if (!new_pgs.empty()) {
    // finish child PG setup once the split transaction commits
    rctx->transaction->register_on_applied(new C_FinishSplits(this, new_pgs));
  }
  return ret;
}
9094
// Publish the newly-received OSDMap to the rest of the OSD: prime
// pending splits/merges on each shard, push the map to the shards,
// drop pg-create requests that no longer map here, wake sessions
// waiting on the map, and queue null peering events so every PG
// advances to the new epoch.  Called with osd_lock held.
void OSD::consume_map()
{
  ceph_assert(osd_lock.is_locked());
  dout(7) << "consume_map version " << osdmap->get_epoch() << dendl;

  /** make sure the cluster is speaking in SORTBITWISE, because we don't
   * speak the older sorting version any more. Be careful not to force
   * a shutdown if we are merely processing old maps, though.
   */
  if (!osdmap->test_flag(CEPH_OSDMAP_SORTBITWISE) && is_active()) {
    derr << __func__ << " SORTBITWISE flag is not set" << dendl;
    ceph_abort();
  }

  // make the map visible to the service before the shards consume it
  service.pre_publish_map(osdmap);
  service.await_reserved_maps();
  service.publish_map(osdmap);

  // prime splits and merges
  set<pair<spg_t,epoch_t>> newly_split;  // splits, and when
  set<pair<spg_t,epoch_t>> merge_pgs;    // merge participants, and when
  for (auto& shard : shards) {
    shard->identify_splits_and_merges(osdmap, &newly_split, &merge_pgs);
  }
  if (!newly_split.empty()) {
    for (auto& shard : shards) {
      // prime_splits removes the entries it claims; by the end every
      // split must have found its shard
      shard->prime_splits(osdmap, &newly_split);
    }
    ceph_assert(newly_split.empty());
  }

  // prune sent_ready_to_merge
  service.prune_sent_ready_to_merge(osdmap);

  // FIXME, maybe: We could race against an incoming peering message
  // that instantiates a merge PG after identify_merges() below and
  // never set up its peer to complete the merge.  An OSD restart
  // would clear it up.  This is a hard race to resolve,
  // extraordinarily rare (we only merge PGs that are stable and
  // clean, so it'd have to be an imported PG to an OSD with a
  // slightly stale OSDMap...), so I'm ignoring it for now.  We plan to
  // replace all of this with a seastar-based code soon anyway.
  if (!merge_pgs.empty()) {
    // mark the pgs we already have, or create new and empty merge
    // participants for those we are missing.  do this all under the
    // shard lock so we don't have to worry about racing pg creates
    // via _process.
    for (auto& shard : shards) {
      shard->prime_merges(osdmap, &merge_pgs);
    }
    ceph_assert(merge_pgs.empty());
  }

  service.prune_pg_created();

  unsigned pushes_to_free = 0;
  for (auto& shard : shards) {
    shard->consume_map(osdmap, &pushes_to_free);
  }

  vector<spg_t> pgids;
  _get_pgids(&pgids);

  // count (FIXME, probably during seastar rewrite)
  int num_pg_primary = 0, num_pg_replica = 0, num_pg_stray = 0;
  vector<PGRef> pgs;
  _get_pgs(&pgs);
  for (auto& pg : pgs) {
    // FIXME (probably during seastar rewrite): this is lockless and
    // racy, but we don't want to take pg lock here.
    if (pg->is_primary())
      num_pg_primary++;
    else if (pg->is_replica())
      num_pg_replica++;
    else
      num_pg_stray++;
  }

  {
    // FIXME (as part of seastar rewrite): move to OSDShard
    // drop osd-requested creates for PGs that no longer map to us
    std::lock_guard l(pending_creates_lock);
    for (auto pg = pending_creates_from_osd.begin();
	 pg != pending_creates_from_osd.end();) {
      if (osdmap->get_pg_acting_rank(pg->first, whoami) < 0) {
	dout(10) << __func__ << " pg " << pg->first << " doesn't map here, "
		 << "discarding pending_create_from_osd" << dendl;
	pg = pending_creates_from_osd.erase(pg);
      } else {
	++pg;
      }
    }
  }

  service.maybe_inject_dispatch_delay();

  dispatch_sessions_waiting_on_map();

  service.maybe_inject_dispatch_delay();

  service.release_reserved_pushes(pushes_to_free);

  // queue null events to push maps down to individual PGs
  for (auto pgid : pgids) {
    enqueue_peering_evt(
      pgid,
      PGPeeringEventRef(
	std::make_shared<PGPeeringEvent>(
	  osdmap->get_epoch(),
	  osdmap->get_epoch(),
	  NullEvt())));
  }
  logger->set(l_osd_pg, pgids.size());
  logger->set(l_osd_pg_primary, num_pg_primary);
  logger->set(l_osd_pg_replica, num_pg_replica);
  logger->set(l_osd_pg_stray, num_pg_stray);
}
9211
9212 void OSD::activate_map()
9213 {
9214 ceph_assert(osd_lock.is_locked());
9215
9216 dout(7) << "activate_map version " << osdmap->get_epoch() << dendl;
9217
9218 if (osdmap->test_flag(CEPH_OSDMAP_FULL)) {
9219 dout(10) << " osdmap flagged full, doing onetime osdmap subscribe" << dendl;
9220 osdmap_subscribe(osdmap->get_epoch() + 1, false);
9221 }
9222
9223 // norecover?
9224 if (osdmap->test_flag(CEPH_OSDMAP_NORECOVER)) {
9225 if (!service.recovery_is_paused()) {
9226 dout(1) << "pausing recovery (NORECOVER flag set)" << dendl;
9227 service.pause_recovery();
9228 }
9229 } else {
9230 if (service.recovery_is_paused()) {
9231 dout(1) << "unpausing recovery (NORECOVER flag unset)" << dendl;
9232 service.unpause_recovery();
9233 }
9234 }
9235
9236 service.activate_map();
9237
9238 // process waiters
9239 take_waiters(waiting_for_osdmap);
9240 }
9241
9242 bool OSD::require_mon_peer(const Message *m)
9243 {
9244 if (!m->get_connection()->peer_is_mon()) {
9245 dout(0) << "require_mon_peer received from non-mon "
9246 << m->get_connection()->get_peer_addr()
9247 << " " << *m << dendl;
9248 return false;
9249 }
9250 return true;
9251 }
9252
9253 bool OSD::require_mon_or_mgr_peer(const Message *m)
9254 {
9255 if (!m->get_connection()->peer_is_mon() &&
9256 !m->get_connection()->peer_is_mgr()) {
9257 dout(0) << "require_mon_or_mgr_peer received from non-mon, non-mgr "
9258 << m->get_connection()->get_peer_addr()
9259 << " " << *m << dendl;
9260 return false;
9261 }
9262 return true;
9263 }
9264
9265 bool OSD::require_osd_peer(const Message *m)
9266 {
9267 if (!m->get_connection()->peer_is_osd()) {
9268 dout(0) << "require_osd_peer received from non-osd "
9269 << m->get_connection()->get_peer_addr()
9270 << " " << *m << dendl;
9271 return false;
9272 }
9273 return true;
9274 }
9275
9276 bool OSD::require_self_aliveness(const Message *m, epoch_t epoch)
9277 {
9278 epoch_t up_epoch = service.get_up_epoch();
9279 if (epoch < up_epoch) {
9280 dout(7) << "from pre-up epoch " << epoch << " < " << up_epoch << dendl;
9281 return false;
9282 }
9283
9284 if (!is_active()) {
9285 dout(7) << "still in boot state, dropping message " << *m << dendl;
9286 return false;
9287 }
9288
9289 return true;
9290 }
9291
// Verify the sending OSD is still up in `map` at the address it used.
// If it is down or has restarted with new addrs, mark the connection
// down, clear the session's map-wait state, and break the
// connection<->session ref cycle.  Returns false if the peer is stale.
// When called from fast dispatch the session_dispatch_lock is already
// effectively unnecessary, so it is only taken on the slow path.
bool OSD::require_same_peer_instance(const Message *m, OSDMapRef& map,
				     bool is_fast_dispatch)
{
  int from = m->get_source().num();

  if (map->is_down(from) ||
      (map->get_cluster_addrs(from) != m->get_source_addrs())) {
    dout(5) << "from dead osd." << from << ", marking down, "
	    << " msg was " << m->get_source_inst().addr
	    << " expected "
	    << (map->is_up(from) ?
		map->get_cluster_addrs(from) : entity_addrvec_t())
	    << dendl;
    ConnectionRef con = m->get_connection();
    con->mark_down();
    auto priv = con->get_priv();
    if (auto s = static_cast<Session*>(priv.get()); s) {
      if (!is_fast_dispatch)
	s->session_dispatch_lock.Lock();
      clear_session_waiting_on_map(s);
      con->set_priv(nullptr);   // break ref <-> session cycle, if any
      s->con.reset();
      if (!is_fast_dispatch)
	s->session_dispatch_lock.Unlock();
    }
    return false;
  }
  return true;
}
9321
9322
9323 /*
9324 * require that we have same (or newer) map, and that
9325 * the source is the pg primary.
9326 */
9327 bool OSD::require_same_or_newer_map(OpRequestRef& op, epoch_t epoch,
9328 bool is_fast_dispatch)
9329 {
9330 const Message *m = op->get_req();
9331 dout(15) << "require_same_or_newer_map " << epoch
9332 << " (i am " << osdmap->get_epoch() << ") " << m << dendl;
9333
9334 ceph_assert(osd_lock.is_locked());
9335
9336 // do they have a newer map?
9337 if (epoch > osdmap->get_epoch()) {
9338 dout(7) << "waiting for newer map epoch " << epoch
9339 << " > my " << osdmap->get_epoch() << " with " << m << dendl;
9340 wait_for_new_map(op);
9341 return false;
9342 }
9343
9344 if (!require_self_aliveness(op->get_req(), epoch)) {
9345 return false;
9346 }
9347
9348 // ok, our map is same or newer.. do they still exist?
9349 if (m->get_connection()->get_messenger() == cluster_messenger &&
9350 !require_same_peer_instance(op->get_req(), osdmap, is_fast_dispatch)) {
9351 return false;
9352 }
9353
9354 return true;
9355 }
9356
9357
9358
9359
9360
9361 // ----------------------------------------
9362 // pg creation
9363
// Split `parent` into the given child PGs: create each child PG and
// its collection, split the parent's collections and in-memory state
// into it, and distribute the parent's stats.  `curmap` is the map the
// parent is at; `nextmap` is the map in which the split takes effect.
// Caller holds the parent's lock; children are locked while being set
// up and unlocked before return.
void OSD::split_pgs(
  PG *parent,
  const set<spg_t> &childpgids, set<PGRef> *out_pgs,
  OSDMapRef curmap,
  OSDMapRef nextmap,
  PG::RecoveryCtx *rctx)
{
  unsigned pg_num = nextmap->get_pg_num(parent->pg_id.pool());
  parent->update_snap_mapper_bits(parent->get_pgid().get_split_bits(pg_num));

  // one stats entry per child, plus a final entry for the parent;
  // stat_iter advances in lockstep with the child iterator below
  vector<object_stat_sum_t> updated_stats;
  parent->start_split_stats(childpgids, &updated_stats);

  vector<object_stat_sum_t>::iterator stat_iter = updated_stats.begin();
  for (set<spg_t>::const_iterator i = childpgids.begin();
       i != childpgids.end();
       ++i, ++stat_iter) {
    ceph_assert(stat_iter != updated_stats.end());
    dout(10) << __func__ << " splitting " << *parent << " into " << *i << dendl;
    PG* child = _make_pg(nextmap, *i);
    child->lock(true);
    out_pgs->insert(child);
    child->ch = store->create_new_collection(child->coll);

    {
      // route the child's commit callbacks through its shard's queue
      uint32_t shard_index = i->hash_to_shard(shards.size());
      assert(NULL != shards[shard_index]);
      store->set_collection_commit_queue(child->coll, &(shards[shard_index]->context_queue));
    }

    unsigned split_bits = i->get_split_bits(pg_num);
    dout(10) << " pg_num is " << pg_num
	     << ", m_seed " << i->ps()
	     << ", split_bits is " << split_bits << dendl;
    parent->split_colls(
      *i,
      split_bits,
      i->ps(),
      &child->get_pool().info,
      rctx->transaction);
    parent->split_into(
      i->pgid,
      child,
      split_bits);

    child->finish_split_stats(*stat_iter, rctx->transaction);
    child->unlock();
  }
  // the remaining stats entry belongs to the parent itself
  ceph_assert(stat_iter != updated_stats.end());
  parent->finish_split_stats(*stat_iter, rctx->transaction);
}
9415
9416 /*
9417 * holding osd_lock
9418 */
// Handle a (pre-nautilus style) MOSDPGCreate from the mon: for each
// requested PG that still maps to us as primary in the current map,
// build its initial history/past-intervals and queue a peering event
// that will instantiate it.  Called with osd_lock held.
void OSD::handle_pg_create(OpRequestRef op)
{
  const MOSDPGCreate *m = static_cast<const MOSDPGCreate*>(op->get_req());
  ceph_assert(m->get_type() == MSG_OSD_PG_CREATE);

  dout(10) << "handle_pg_create " << *m << dendl;

  if (!require_mon_peer(op->get_req())) {
    return;
  }

  if (!require_same_or_newer_map(op, m->epoch, false))
    return;

  op->mark_started();

  // mkpg and ctimes are parallel containers; walk them in lockstep
  map<pg_t,utime_t>::const_iterator ci = m->ctimes.begin();
  for (map<pg_t,pg_create_t>::const_iterator p = m->mkpg.begin();
       p != m->mkpg.end();
       ++p, ++ci) {
    ceph_assert(ci != m->ctimes.end() && ci->first == p->first);
    epoch_t created = p->second.created;
    if (p->second.split_bits) // Skip split pgs
      continue;
    pg_t on = p->first;

    if (!osdmap->have_pg_pool(on.pool())) {
      dout(20) << "ignoring pg on deleted pool " << on << dendl;
      continue;
    }

    dout(20) << "mkpg " << on << " e" << created << "@" << ci->second << dendl;

    // is it still ours?
    vector<int> up, acting;
    int up_primary = -1;
    int acting_primary = -1;
    osdmap->pg_to_up_acting_osds(on, &up, &up_primary, &acting, &acting_primary);
    int role = osdmap->calc_pg_role(whoami, acting, acting.size());

    if (acting_primary != whoami) {
      dout(10) << "mkpg " << on << " not acting_primary (" << acting_primary
	       << "), my role=" << role << ", skipping" << dendl;
      continue;
    }

    spg_t pgid;
    bool mapped = osdmap->get_primary_shard(on, &pgid);
    ceph_assert(mapped);

    PastIntervals pi;
    pg_history_t history;
    build_initial_pg_history(pgid, created, ci->second, &history, &pi);

    // The mon won't resend unless the primary changed, so we ignore
    // same_interval_since.  We'll pass this history with the current
    // epoch as the event.
    if (history.same_primary_since > m->epoch) {
      dout(10) << __func__ << ": got obsolete pg create on pgid "
	       << pgid << " from epoch " << m->epoch
	       << ", primary changed in " << history.same_primary_since
	       << dendl;
      continue;
    }
    enqueue_peering_evt(
      pgid,
      PGPeeringEventRef(
	std::make_shared<PGPeeringEvent>(
	  osdmap->get_epoch(),
	  osdmap->get_epoch(),
	  NullEvt(),
	  true,
	  new PGCreateInfo(
	    pgid,
	    osdmap->get_epoch(),
	    history,
	    pi,
	    true)
	  )));
  }

  {
    // record progress so the mon can stop resending creates
    std::lock_guard l(pending_creates_lock);
    if (pending_creates_from_mon == 0) {
      last_pg_create_epoch = m->epoch;
    }
  }

  maybe_update_heartbeat_peers();
}
9509
9510
9511 // ----------------------------------------
9512 // peering and recovery
9513
9514 PG::RecoveryCtx OSD::create_context()
9515 {
9516 ObjectStore::Transaction *t = new ObjectStore::Transaction;
9517 map<int, map<spg_t,pg_query_t> > *query_map =
9518 new map<int, map<spg_t, pg_query_t> >;
9519 map<int,vector<pair<pg_notify_t, PastIntervals> > > *notify_list =
9520 new map<int, vector<pair<pg_notify_t, PastIntervals> > >;
9521 map<int,vector<pair<pg_notify_t, PastIntervals> > > *info_map =
9522 new map<int,vector<pair<pg_notify_t, PastIntervals> > >;
9523 PG::RecoveryCtx rctx(query_map, info_map, notify_list, t);
9524 return rctx;
9525 }
9526
9527 void OSD::dispatch_context_transaction(PG::RecoveryCtx &ctx, PG *pg,
9528 ThreadPool::TPHandle *handle)
9529 {
9530 if (!ctx.transaction->empty() || ctx.transaction->has_contexts()) {
9531 int tr = store->queue_transaction(
9532 pg->ch,
9533 std::move(*ctx.transaction), TrackedOpRef(), handle);
9534 ceph_assert(tr == 0);
9535 delete (ctx.transaction);
9536 ctx.transaction = new ObjectStore::Transaction;
9537 }
9538 }
9539
9540 void OSD::dispatch_context(PG::RecoveryCtx &ctx, PG *pg, OSDMapRef curmap,
9541 ThreadPool::TPHandle *handle)
9542 {
9543 if (!service.get_osdmap()->is_up(whoami)) {
9544 dout(20) << __func__ << " not up in osdmap" << dendl;
9545 } else if (!is_active()) {
9546 dout(20) << __func__ << " not active" << dendl;
9547 } else {
9548 do_notifies(*ctx.notify_list, curmap);
9549 do_queries(*ctx.query_map, curmap);
9550 do_infos(*ctx.info_map, curmap);
9551 }
9552 if ((!ctx.transaction->empty() || ctx.transaction->has_contexts()) && pg) {
9553 int tr = store->queue_transaction(
9554 pg->ch,
9555 std::move(*ctx.transaction), TrackedOpRef(),
9556 handle);
9557 ceph_assert(tr == 0);
9558 }
9559 delete ctx.notify_list;
9560 delete ctx.query_map;
9561 delete ctx.info_map;
9562 delete ctx.transaction;
9563 }
9564
9565 void OSD::discard_context(PG::RecoveryCtx& ctx)
9566 {
9567 delete ctx.notify_list;
9568 delete ctx.query_map;
9569 delete ctx.info_map;
9570 delete ctx.transaction;
9571 }
9572
9573
9574 /** do_notifies
9575 * Send an MOSDPGNotify to a primary, with a list of PGs that I have
9576 * content for, and they are primary for.
9577 */
9578
9579 void OSD::do_notifies(
9580 map<int,vector<pair<pg_notify_t,PastIntervals> > >& notify_list,
9581 OSDMapRef curmap)
9582 {
9583 for (map<int,
9584 vector<pair<pg_notify_t,PastIntervals> > >::iterator it =
9585 notify_list.begin();
9586 it != notify_list.end();
9587 ++it) {
9588 if (!curmap->is_up(it->first)) {
9589 dout(20) << __func__ << " skipping down osd." << it->first << dendl;
9590 continue;
9591 }
9592 ConnectionRef con = service.get_con_osd_cluster(
9593 it->first, curmap->get_epoch());
9594 if (!con) {
9595 dout(20) << __func__ << " skipping osd." << it->first
9596 << " (NULL con)" << dendl;
9597 continue;
9598 }
9599 service.share_map_peer(it->first, con.get(), curmap);
9600 dout(7) << __func__ << " osd." << it->first
9601 << " on " << it->second.size() << " PGs" << dendl;
9602 MOSDPGNotify *m = new MOSDPGNotify(curmap->get_epoch(),
9603 it->second);
9604 con->send_message(m);
9605 }
9606 }
9607
9608
9609 /** do_queries
9610 * send out pending queries for info | summaries
9611 */
9612 void OSD::do_queries(map<int, map<spg_t,pg_query_t> >& query_map,
9613 OSDMapRef curmap)
9614 {
9615 for (map<int, map<spg_t,pg_query_t> >::iterator pit = query_map.begin();
9616 pit != query_map.end();
9617 ++pit) {
9618 if (!curmap->is_up(pit->first)) {
9619 dout(20) << __func__ << " skipping down osd." << pit->first << dendl;
9620 continue;
9621 }
9622 int who = pit->first;
9623 ConnectionRef con = service.get_con_osd_cluster(who, curmap->get_epoch());
9624 if (!con) {
9625 dout(20) << __func__ << " skipping osd." << who
9626 << " (NULL con)" << dendl;
9627 continue;
9628 }
9629 service.share_map_peer(who, con.get(), curmap);
9630 dout(7) << __func__ << " querying osd." << who
9631 << " on " << pit->second.size() << " PGs" << dendl;
9632 MOSDPGQuery *m = new MOSDPGQuery(curmap->get_epoch(), pit->second);
9633 con->send_message(m);
9634 }
9635 }
9636
9637
9638 void OSD::do_infos(map<int,
9639 vector<pair<pg_notify_t, PastIntervals> > >& info_map,
9640 OSDMapRef curmap)
9641 {
9642 for (map<int,
9643 vector<pair<pg_notify_t, PastIntervals> > >::iterator p =
9644 info_map.begin();
9645 p != info_map.end();
9646 ++p) {
9647 if (!curmap->is_up(p->first)) {
9648 dout(20) << __func__ << " skipping down osd." << p->first << dendl;
9649 continue;
9650 }
9651 for (vector<pair<pg_notify_t,PastIntervals> >::iterator i = p->second.begin();
9652 i != p->second.end();
9653 ++i) {
9654 dout(20) << __func__ << " sending info " << i->first.info
9655 << " to shard " << p->first << dendl;
9656 }
9657 ConnectionRef con = service.get_con_osd_cluster(
9658 p->first, curmap->get_epoch());
9659 if (!con) {
9660 dout(20) << __func__ << " skipping osd." << p->first
9661 << " (NULL con)" << dendl;
9662 continue;
9663 }
9664 service.share_map_peer(p->first, con.get(), curmap);
9665 MOSDPGInfo *m = new MOSDPGInfo(curmap->get_epoch());
9666 m->pg_list = p->second;
9667 con->send_message(m);
9668 }
9669 info_map.clear();
9670 }
9671
// Handle a nautilus-style MOSDPGCreate2 from the mon: synthesize a
// fresh pg_history_t for each requested PG and queue a peering event
// that will instantiate it.  Fast-dispatch path: consumes (puts) the
// message.
void OSD::handle_fast_pg_create(MOSDPGCreate2 *m)
{
  dout(7) << __func__ << " " << *m << " from " << m->get_source() << dendl;
  if (!require_mon_peer(m)) {
    m->put();
    return;
  }
  for (auto& p : m->pgs) {
    spg_t pgid = p.first;
    epoch_t created = p.second.first;
    utime_t created_stamp = p.second.second;
    dout(20) << __func__ << " " << pgid << " e" << created
	     << "@" << created_stamp << dendl;
    // brand-new PG: every history epoch starts at the create epoch,
    // and scrub stamps start at the create stamp
    pg_history_t h;
    h.epoch_created = created;
    h.epoch_pool_created = created;
    h.same_up_since = created;
    h.same_interval_since = created;
    h.same_primary_since = created;
    h.last_scrub_stamp = created_stamp;
    h.last_deep_scrub_stamp = created_stamp;
    h.last_clean_scrub_stamp = created_stamp;

    enqueue_peering_evt(
      pgid,
      PGPeeringEventRef(
	std::make_shared<PGPeeringEvent>(
	  m->epoch,
	  m->epoch,
	  NullEvt(),
	  true,
	  new PGCreateInfo(
	    pgid,
	    created,
	    h,
	    PastIntervals(),
	    true)
	  )));
  }

  {
    // record progress so the mon can stop resending creates
    std::lock_guard l(pending_creates_lock);
    if (pending_creates_from_mon == 0) {
      last_pg_create_epoch = m->epoch;
    }
  }

  m->put();
}
9721
9722 void OSD::handle_fast_pg_query(MOSDPGQuery *m)
9723 {
9724 dout(7) << __func__ << " " << *m << " from " << m->get_source() << dendl;
9725 if (!require_osd_peer(m)) {
9726 m->put();
9727 return;
9728 }
9729 int from = m->get_source().num();
9730 for (auto& p : m->pg_list) {
9731 enqueue_peering_evt(
9732 p.first,
9733 PGPeeringEventRef(
9734 std::make_shared<PGPeeringEvent>(
9735 p.second.epoch_sent, p.second.epoch_sent,
9736 MQuery(
9737 p.first,
9738 pg_shard_t(from, p.second.from),
9739 p.second,
9740 p.second.epoch_sent),
9741 false))
9742 );
9743 }
9744 m->put();
9745 }
9746
// Fast-dispatch handler for MOSDPGNotify: queue an MNotifyRec peering
// event for each notified PG.  The PGCreateInfo allows the event to
// instantiate the PG if it does not exist yet (create=false marks it
// as a peer-driven, not mon-driven, create).  Consumes (puts) the
// message.
void OSD::handle_fast_pg_notify(MOSDPGNotify* m)
{
  dout(7) << __func__ << " " << *m << " from " << m->get_source() << dendl;
  if (!require_osd_peer(m)) {
    m->put();
    return;
  }
  int from = m->get_source().num();
  for (auto& p : m->get_pg_list()) {
    // target shard comes from the notify's "to" field
    spg_t pgid(p.first.info.pgid.pgid, p.first.to);
    enqueue_peering_evt(
      pgid,
      PGPeeringEventRef(
	std::make_shared<PGPeeringEvent>(
	  p.first.epoch_sent,
	  p.first.query_epoch,
	  MNotifyRec(
	    pgid, pg_shard_t(from, p.first.from),
	    p.first,
	    m->get_connection()->get_features(),
	    p.second),
	  true,
	  new PGCreateInfo(
	    pgid,
	    p.first.query_epoch,
	    p.first.info.history,
	    p.second,
	    false)
	  )));
  }
  m->put();
}
9779
// Fast-dispatch handler for MOSDPGInfo: queue an MInfoRec peering
// event for each PG in the message.  Unlike notify, this never
// creates a PG (no PGCreateInfo).  Consumes (puts) the message.
void OSD::handle_fast_pg_info(MOSDPGInfo* m)
{
  dout(7) << __func__ << " " << *m << " from " << m->get_source() << dendl;
  if (!require_osd_peer(m)) {
    m->put();
    return;
  }
  int from = m->get_source().num();
  for (auto& p : m->pg_list) {
    enqueue_peering_evt(
      spg_t(p.first.info.pgid.pgid, p.first.to),
      PGPeeringEventRef(
	std::make_shared<PGPeeringEvent>(
	  p.first.epoch_sent, p.first.query_epoch,
	  MInfoRec(
	    pg_shard_t(from, p.first.from),
	    p.first.info,
	    p.first.epoch_sent)))
      );
  }
  m->put();
}
9802
9803 void OSD::handle_fast_pg_remove(MOSDPGRemove *m)
9804 {
9805 dout(7) << __func__ << " " << *m << " from " << m->get_source() << dendl;
9806 if (!require_osd_peer(m)) {
9807 m->put();
9808 return;
9809 }
9810 for (auto& pgid : m->pg_list) {
9811 enqueue_peering_evt(
9812 pgid,
9813 PGPeeringEventRef(
9814 std::make_shared<PGPeeringEvent>(
9815 m->get_epoch(), m->get_epoch(),
9816 PG::DeleteStart())));
9817 }
9818 m->put();
9819 }
9820
9821 void OSD::handle_fast_force_recovery(MOSDForceRecovery *m)
9822 {
9823 dout(10) << __func__ << " " << *m << dendl;
9824 if (!require_mon_or_mgr_peer(m)) {
9825 m->put();
9826 return;
9827 }
9828 epoch_t epoch = get_osdmap_epoch();
9829 for (auto pgid : m->forced_pgs) {
9830 if (m->options & OFR_BACKFILL) {
9831 if (m->options & OFR_CANCEL) {
9832 enqueue_peering_evt(
9833 pgid,
9834 PGPeeringEventRef(
9835 std::make_shared<PGPeeringEvent>(
9836 epoch, epoch,
9837 PG::UnsetForceBackfill())));
9838 } else {
9839 enqueue_peering_evt(
9840 pgid,
9841 PGPeeringEventRef(
9842 std::make_shared<PGPeeringEvent>(
9843 epoch, epoch,
9844 PG::SetForceBackfill())));
9845 }
9846 } else if (m->options & OFR_RECOVERY) {
9847 if (m->options & OFR_CANCEL) {
9848 enqueue_peering_evt(
9849 pgid,
9850 PGPeeringEventRef(
9851 std::make_shared<PGPeeringEvent>(
9852 epoch, epoch,
9853 PG::UnsetForceRecovery())));
9854 } else {
9855 enqueue_peering_evt(
9856 pgid,
9857 PGPeeringEventRef(
9858 std::make_shared<PGPeeringEvent>(
9859 epoch, epoch,
9860 PG::SetForceRecovery())));
9861 }
9862 }
9863 }
9864 m->put();
9865 }
9866
// Answer a pg_query for a PG we do not have: if the pool still
// exists, reply to the querying OSD with an empty info -- as an empty
// MOSDPGLog for LOG/FULLLOG queries, or an empty MOSDPGNotify
// otherwise -- so the peer can make progress.  If the pool is gone,
// the query is silently dropped.
void OSD::handle_pg_query_nopg(const MQuery& q)
{
  spg_t pgid = q.pgid;
  dout(10) << __func__ << " " << pgid << dendl;

  OSDMapRef osdmap = get_osdmap();
  if (!osdmap->have_pg_pool(pgid.pool()))
    return;

  dout(10) << " pg " << pgid << " dne" << dendl;
  // empty info for the shard the querier asked about
  pg_info_t empty(spg_t(pgid.pgid, q.query.to));
  ConnectionRef con = service.get_con_osd_cluster(q.from.osd, osdmap->get_epoch());
  if (con) {
    Message *m;
    if (q.query.type == pg_query_t::LOG ||
	q.query.type == pg_query_t::FULLLOG) {
      m = new MOSDPGLog(
	q.query.from, q.query.to,
	osdmap->get_epoch(), empty,
	q.query.epoch_sent);
    } else {
      vector<pair<pg_notify_t,PastIntervals>> ls;
      ls.push_back(
	make_pair(
	  pg_notify_t(
	    q.query.from, q.query.to,
	    q.query.epoch_sent,
	    osdmap->get_epoch(),
	    empty),
	  PastIntervals()));
      m = new MOSDPGNotify(osdmap->get_epoch(), ls);
    }
    // make sure the peer has a map at least as new as ours
    service.share_map_peer(q.from.osd, con.get(), osdmap);
    con->send_message(m);
  }
}
9903
9904
9905 // =========================================================
9906 // RECOVERY
9907
// Drain the awaiting_throttle queue into the recovery work queue while
// the recovery throttle (_recover_now) permits it.  Each queued PG is
// started with up to osd_recovery_max_single_start pushes, bounded by
// the pushes currently available under osd_recovery_max_active.
// Caller must hold recovery_lock (asserted).
void OSDService::_maybe_queue_recovery() {
  ceph_assert(recovery_lock.is_locked_by_me());
  uint64_t available_pushes;
  while (!awaiting_throttle.empty() &&
         _recover_now(&available_pushes)) {
    uint64_t to_start = std::min(
      available_pushes,
      cct->_conf->osd_recovery_max_single_start);
    _queue_for_recovery(awaiting_throttle.front(), to_start);
    awaiting_throttle.pop_front();
    dout(10) << __func__ << " starting " << to_start
             << ", recovery_ops_reserved " << recovery_ops_reserved
             << " -> " << (recovery_ops_reserved + to_start) << dendl;
    // reserve the pushes now; released via release_reserved_pushes()
    recovery_ops_reserved += to_start;
  }
}
9924
// Decide whether recovery work may be started right now.  Returns false
// while recovery is deferred (defer_recovery_until in the future),
// explicitly paused, or the active+reserved op count has reached
// osd_recovery_max_active.  On success, *available_pushes (if non-null)
// is set to the number of additional pushes that may be reserved.
bool OSDService::_recover_now(uint64_t *available_pushes)
{
  if (available_pushes)
    *available_pushes = 0;

  if (ceph_clock_now() < defer_recovery_until) {
    dout(15) << __func__ << " defer until " << defer_recovery_until << dendl;
    return false;
  }

  if (recovery_paused) {
    dout(15) << __func__ << " paused" << dendl;
    return false;
  }

  uint64_t max = cct->_conf->osd_recovery_max_active;
  if (max <= recovery_ops_active + recovery_ops_reserved) {
    dout(15) << __func__ << " active " << recovery_ops_active
             << " + reserved " << recovery_ops_reserved
             << " >= max " << max << dendl;
    return false;
  }

  if (available_pushes)
    *available_pushes = max - recovery_ops_active - recovery_ops_reserved;

  return true;
}
9953
// Run one round of recovery for a PG, using up to reserved_pushes push
// slots.  If osd_recovery_sleep is configured and a sleep is due, the
// work is instead deferred: a timer callback re-queues the recovery op
// after the sleep and we return immediately (keeping the reservation).
// Otherwise start recovery ops, optionally kick off unfound-object
// search, and always release the reserved pushes on exit.
// Called from a worker thread with the PG lock held.
void OSD::do_recovery(
  PG *pg, epoch_t queued, uint64_t reserved_pushes,
  ThreadPool::TPHandle &handle)
{
  uint64_t started = 0;

  /*
   * When the value of osd_recovery_sleep is set greater than zero, recovery
   * ops are scheduled after osd_recovery_sleep amount of time from the previous
   * recovery event's schedule time. This is done by adding a
   * recovery_requeue_callback event, which re-queues the recovery op using
   * queue_recovery_after_sleep.
   */
  float recovery_sleep = get_osd_recovery_sleep();
  {
    std::lock_guard l(service.sleep_lock);
    if (recovery_sleep > 0 && service.recovery_needs_sleep) {
      PGRef pgref(pg);
      // holds a ref to the PG so it survives until the timer fires
      auto recovery_requeue_callback = new FunctionContext([this, pgref, queued, reserved_pushes](int r) {
        dout(20) << "do_recovery wake up at "
                 << ceph_clock_now()
                 << ", re-queuing recovery" << dendl;
        std::lock_guard l(service.sleep_lock);
        service.recovery_needs_sleep = false;
        service.queue_recovery_after_sleep(pgref.get(), queued, reserved_pushes);
      });

      // This is true for the first recovery op and when the previous recovery op
      // has been scheduled in the past. The next recovery op is scheduled after
      // completing the sleep from now.
      if (service.recovery_schedule_time < ceph_clock_now()) {
        service.recovery_schedule_time = ceph_clock_now();
      }
      service.recovery_schedule_time += recovery_sleep;
      service.sleep_timer.add_event_at(service.recovery_schedule_time,
                                       recovery_requeue_callback);
      dout(20) << "Recovery event scheduled at "
               << service.recovery_schedule_time << dendl;
      // NOTE: reserved_pushes stay reserved; the requeued op will use them
      return;
    }
  }

  {
    {
      std::lock_guard l(service.sleep_lock);
      // we are doing work now; next round should sleep again
      service.recovery_needs_sleep = true;
    }

    if (pg->pg_has_reset_since(queued)) {
      // PG changed interval since this op was queued; stale, skip work
      goto out;
    }

    dout(10) << "do_recovery starting " << reserved_pushes << " " << *pg << dendl;
#ifdef DEBUG_RECOVERY_OIDS
    dout(20) << " active was " << service.recovery_oids[pg->pg_id] << dendl;
#endif

    bool do_unfound = pg->start_recovery_ops(reserved_pushes, handle, &started);
    dout(10) << "do_recovery started " << started << "/" << reserved_pushes
             << " on " << *pg << dendl;

    if (do_unfound) {
      // query peers for unfound objects
      PG::RecoveryCtx rctx = create_context();
      rctx.handle = &handle;
      pg->find_unfound(queued, &rctx);
      dispatch_context(rctx, pg, pg->get_osdmap());
    }
  }

 out:
  ceph_assert(started <= reserved_pushes);
  service.release_reserved_pushes(reserved_pushes);
}
10027
// Account for the start of a recovery op on object soid in pg:
// bump recovery_ops_active (and, in DEBUG_RECOVERY_OIDS builds, track
// the oid and assert it was not already being recovered).
void OSDService::start_recovery_op(PG *pg, const hobject_t& soid)
{
  std::lock_guard l(recovery_lock);
  dout(10) << "start_recovery_op " << *pg << " " << soid
           << " (" << recovery_ops_active << "/"
           << cct->_conf->osd_recovery_max_active << " rops)"
           << dendl;
  recovery_ops_active++;

#ifdef DEBUG_RECOVERY_OIDS
  dout(20) << " active was " << recovery_oids[pg->pg_id] << dendl;
  ceph_assert(recovery_oids[pg->pg_id].count(soid) == 0);
  recovery_oids[pg->pg_id].insert(soid);
#endif
}
10043
// Account for the completion of a recovery op on soid: decrement
// recovery_ops_active (asserting it was positive) and try to start more
// recovery work now that a slot has freed up.  The `dequeue` flag is
// only logged here; it is not otherwise used by this accounting.
void OSDService::finish_recovery_op(PG *pg, const hobject_t& soid, bool dequeue)
{
  std::lock_guard l(recovery_lock);
  dout(10) << "finish_recovery_op " << *pg << " " << soid
           << " dequeue=" << dequeue
           << " (" << recovery_ops_active << "/" << cct->_conf->osd_recovery_max_active << " rops)"
           << dendl;

  // adjust count
  ceph_assert(recovery_ops_active > 0);
  recovery_ops_active--;

#ifdef DEBUG_RECOVERY_OIDS
  dout(20) << " active oids was " << recovery_oids[pg->pg_id] << dendl;
  ceph_assert(recovery_oids[pg->pg_id].count(soid));
  recovery_oids[pg->pg_id].erase(soid);
#endif

  // a slot freed up; see if throttled recovery can proceed
  _maybe_queue_recovery();
}
10064
10065 bool OSDService::is_recovery_active()
10066 {
10067 if (cct->_conf->osd_debug_pretend_recovery_active) {
10068 return true;
10069 }
10070 return local_reserver.has_reservation() || remote_reserver.has_reservation();
10071 }
10072
// Return `pushes` previously-reserved push slots to the recovery
// throttle (asserting we don't release more than reserved) and then
// try to start more throttled recovery work.
void OSDService::release_reserved_pushes(uint64_t pushes)
{
  std::lock_guard l(recovery_lock);
  dout(10) << __func__ << "(" << pushes << "), recovery_ops_reserved "
           << recovery_ops_reserved << " -> " << (recovery_ops_reserved-pushes)
           << dendl;
  ceph_assert(recovery_ops_reserved >= pushes);
  recovery_ops_reserved -= pushes;
  _maybe_queue_recovery();
}
10083
10084 // =========================================================
10085 // OPS
10086
10087 bool OSD::op_is_discardable(const MOSDOp *op)
10088 {
10089 // drop client request if they are not connected and can't get the
10090 // reply anyway.
10091 if (!op->get_connection()->is_connected()) {
10092 return true;
10093 }
10094 return false;
10095 }
10096
// Queue a client/replica op for its PG on the sharded op work queue.
// Extracts priority/cost/owner from the message, records tracing and
// the pre-queue latency counter, then wraps the op in a PGOpItem.
void OSD::enqueue_op(spg_t pg, OpRequestRef&& op, epoch_t epoch)
{
  const utime_t stamp = op->get_req()->get_recv_stamp();
  // time spent between receive and enqueue
  const utime_t latency = ceph_clock_now() - stamp;
  const unsigned priority = op->get_req()->get_priority();
  const int cost = op->get_req()->get_cost();
  const uint64_t owner = op->get_req()->get_source().num();

  dout(15) << "enqueue_op " << op << " prio " << priority
           << " cost " << cost
           << " latency " << latency
           << " epoch " << epoch
           << " " << *(op->get_req()) << dendl;
  op->osd_trace.event("enqueue op");
  op->osd_trace.keyval("priority", priority);
  op->osd_trace.keyval("cost", cost);
  op->mark_queued_for_pg();
  logger->tinc(l_osd_op_before_queue_op_lat, latency);
  op_shardedwq.queue(
    OpQueueItem(
      unique_ptr<OpQueueItem::OpQueueable>(new PGOpItem(pg, std::move(op))),
      cost, priority, stamp, owner, epoch));
}
10120
10121 void OSD::enqueue_peering_evt(spg_t pgid, PGPeeringEventRef evt)
10122 {
10123 dout(15) << __func__ << " " << pgid << " " << evt->get_desc() << dendl;
10124 op_shardedwq.queue(
10125 OpQueueItem(
10126 unique_ptr<OpQueueItem::OpQueueable>(new PGPeeringItem(pgid, evt)),
10127 10,
10128 cct->_conf->osd_peering_op_priority,
10129 utime_t(),
10130 0,
10131 evt->get_epoch_sent()));
10132 }
10133
10134 void OSD::enqueue_peering_evt_front(spg_t pgid, PGPeeringEventRef evt)
10135 {
10136 dout(15) << __func__ << " " << pgid << " " << evt->get_desc() << dendl;
10137 op_shardedwq.queue_front(
10138 OpQueueItem(
10139 unique_ptr<OpQueueItem::OpQueueable>(new PGPeeringItem(pgid, evt)),
10140 10,
10141 cct->_conf->osd_peering_op_priority,
10142 utime_t(),
10143 0,
10144 evt->get_epoch_sent()));
10145 }
10146
10147 /*
10148 * NOTE: dequeue called in worker thread, with pg lock
10149 */
10150 void OSD::dequeue_op(
10151 PGRef pg, OpRequestRef op,
10152 ThreadPool::TPHandle &handle)
10153 {
10154 FUNCTRACE(cct);
10155 OID_EVENT_TRACE_WITH_MSG(op->get_req(), "DEQUEUE_OP_BEGIN", false);
10156
10157 utime_t now = ceph_clock_now();
10158 op->set_dequeued_time(now);
10159 utime_t latency = now - op->get_req()->get_recv_stamp();
10160 dout(10) << "dequeue_op " << op << " prio " << op->get_req()->get_priority()
10161 << " cost " << op->get_req()->get_cost()
10162 << " latency " << latency
10163 << " " << *(op->get_req())
10164 << " pg " << *pg << dendl;
10165
10166 logger->tinc(l_osd_op_before_dequeue_op_lat, latency);
10167
10168 auto priv = op->get_req()->get_connection()->get_priv();
10169 if (auto session = static_cast<Session *>(priv.get()); session) {
10170 maybe_share_map(session, op, pg->get_osdmap());
10171 }
10172
10173 if (pg->is_deleting())
10174 return;
10175
10176 op->mark_reached_pg();
10177 op->osd_trace.event("dequeue_op");
10178
10179 pg->do_request(op, handle);
10180
10181 // finish
10182 dout(10) << "dequeue_op " << op << " finish" << dendl;
10183 OID_EVENT_TRACE_WITH_MSG(op->get_req(), "DEQUEUE_OP_END", false);
10184 }
10185
10186
// Process one dequeued peering event.  If there is no PG, only an
// MQuery can be answered (handle_pg_query_nopg); anything else is a
// bug and aborts.  Otherwise advance the PG through map epochs, feed it
// the event, then dispatch the accumulated RecoveryCtx — unless the PG
// was deleted during the event, in which case the ctx is discarded
// (the final _delete_some already dispatched it).
// Called on a worker thread; pg (if non-null) is locked on entry and
// unlocked here.
void OSD::dequeue_peering_evt(
  OSDShard *sdata,
  PG *pg,
  PGPeeringEventRef evt,
  ThreadPool::TPHandle& handle)
{
  PG::RecoveryCtx rctx = create_context();
  auto curmap = sdata->get_osdmap();
  epoch_t need_up_thru = 0, same_interval_since = 0;
  if (!pg) {
    if (const MQuery *q = dynamic_cast<const MQuery*>(evt->evt.get())) {
      handle_pg_query_nopg(*q);
    } else {
      derr << __func__ << " unrecognized pg-less event " << evt->get_desc() << dendl;
      ceph_abort();
    }
  } else if (advance_pg(curmap->get_epoch(), pg, handle, &rctx)) {
    pg->do_peering_event(evt, &rctx);
    if (pg->is_deleted()) {
      // do not dispatch rctx; the final _delete_some already did it.
      discard_context(rctx);
      pg->unlock();
      return;
    }
    // capture these under the pg lock, used after unlock below
    dispatch_context_transaction(rctx, pg, &handle);
    need_up_thru = pg->get_need_up_thru();
    same_interval_since = pg->get_same_interval_since();
    pg->unlock();
  }

  if (need_up_thru) {
    queue_want_up_thru(same_interval_since);
  }
  dispatch_context(rctx, pg, curmap, &handle);

  service.send_pg_temp();
}
10224
10225 void OSD::dequeue_delete(
10226 OSDShard *sdata,
10227 PG *pg,
10228 epoch_t e,
10229 ThreadPool::TPHandle& handle)
10230 {
10231 dequeue_peering_evt(
10232 sdata,
10233 pg,
10234 PGPeeringEventRef(
10235 std::make_shared<PGPeeringEvent>(
10236 e, e,
10237 PG::DeleteSome())),
10238 handle);
10239 }
10240
10241
10242
10243 // --------------------------------
10244
// Config-observer interface: the NULL-terminated list of config keys
// this OSD wants change notifications for.  Each key listed here is
// acted on in handle_conf_change().
const char** OSD::get_tracked_conf_keys() const
{
  static const char* KEYS[] = {
    "osd_max_backfills",
    "osd_min_recovery_priority",
    "osd_max_trimming_pgs",
    "osd_op_complaint_time",
    "osd_op_log_threshold",
    "osd_op_history_size",
    "osd_op_history_duration",
    "osd_op_history_slow_op_size",
    "osd_op_history_slow_op_threshold",
    "osd_enable_op_tracker",
    "osd_map_cache_size",
    "osd_pg_epoch_max_lag_factor",
    "osd_pg_epoch_persisted_max_stale",
    // clog & admin clog
    "clog_to_monitors",
    "clog_to_syslog",
    "clog_to_syslog_facility",
    "clog_to_syslog_level",
    "osd_objectstore_fuse",
    "clog_to_graylog",
    "clog_to_graylog_host",
    "clog_to_graylog_port",
    "host",
    "fsid",
    "osd_recovery_delay_start",
    "osd_client_message_size_cap",
    "osd_client_message_cap",
    "osd_heartbeat_min_size",
    "osd_heartbeat_interval",
    "osd_scrub_min_interval",
    "osd_scrub_max_interval",
    NULL
  };
  return KEYS;
}
10283
// Config-observer callback: apply runtime changes for every key listed
// in get_tracked_conf_keys().  Runs under osd_lock; each branch pushes
// the new value into the relevant subsystem (reservers, op tracker,
// map caches, clog, messenger throttles, scrub scheduling, ...).
void OSD::handle_conf_change(const ConfigProxy& conf,
                             const std::set <std::string> &changed)
{
  Mutex::Locker l(osd_lock);
  if (changed.count("osd_max_backfills")) {
    service.local_reserver.set_max(cct->_conf->osd_max_backfills);
    service.remote_reserver.set_max(cct->_conf->osd_max_backfills);
  }
  if (changed.count("osd_min_recovery_priority")) {
    service.local_reserver.set_min_priority(cct->_conf->osd_min_recovery_priority);
    service.remote_reserver.set_min_priority(cct->_conf->osd_min_recovery_priority);
  }
  if (changed.count("osd_max_trimming_pgs")) {
    service.snap_reserver.set_max(cct->_conf->osd_max_trimming_pgs);
  }
  if (changed.count("osd_op_complaint_time") ||
      changed.count("osd_op_log_threshold")) {
    op_tracker.set_complaint_and_threshold(cct->_conf->osd_op_complaint_time,
                                           cct->_conf->osd_op_log_threshold);
  }
  if (changed.count("osd_op_history_size") ||
      changed.count("osd_op_history_duration")) {
    op_tracker.set_history_size_and_duration(cct->_conf->osd_op_history_size,
                                             cct->_conf->osd_op_history_duration);
  }
  if (changed.count("osd_op_history_slow_op_size") ||
      changed.count("osd_op_history_slow_op_threshold")) {
    op_tracker.set_history_slow_op_size_and_threshold(cct->_conf->osd_op_history_slow_op_size,
                                                      cct->_conf->osd_op_history_slow_op_threshold);
  }
  if (changed.count("osd_enable_op_tracker")) {
    op_tracker.set_tracking(cct->_conf->osd_enable_op_tracker);
  }
  if (changed.count("osd_map_cache_size")) {
    // all three map caches share one size knob
    service.map_cache.set_size(cct->_conf->osd_map_cache_size);
    service.map_bl_cache.set_size(cct->_conf->osd_map_cache_size);
    service.map_bl_inc_cache.set_size(cct->_conf->osd_map_cache_size);
  }
  if (changed.count("clog_to_monitors") ||
      changed.count("clog_to_syslog") ||
      changed.count("clog_to_syslog_level") ||
      changed.count("clog_to_syslog_facility") ||
      changed.count("clog_to_graylog") ||
      changed.count("clog_to_graylog_host") ||
      changed.count("clog_to_graylog_port") ||
      changed.count("host") ||
      changed.count("fsid")) {
    update_log_config();
  }
  if (changed.count("osd_pg_epoch_max_lag_factor")) {
    m_osd_pg_epoch_max_lag_factor = conf.get_val<double>(
      "osd_pg_epoch_max_lag_factor");
  }

#ifdef HAVE_LIBFUSE
  if (changed.count("osd_objectstore_fuse")) {
    if (store) {
      // re-evaluate whether the fuse mount should be up
      enable_disable_fuse(false);
    }
  }
#endif

  if (changed.count("osd_recovery_delay_start")) {
    service.defer_recovery(cct->_conf->osd_recovery_delay_start);
    service.kick_recovery_queue();
  }

  if (changed.count("osd_client_message_cap")) {
    uint64_t newval = cct->_conf->osd_client_message_cap;
    Messenger::Policy pol = client_messenger->get_policy(entity_name_t::TYPE_CLIENT);
    // 0 disables the throttle; only adjust an existing throttler
    if (pol.throttler_messages && newval > 0) {
      pol.throttler_messages->reset_max(newval);
    }
  }
  if (changed.count("osd_client_message_size_cap")) {
    uint64_t newval = cct->_conf->osd_client_message_size_cap;
    Messenger::Policy pol = client_messenger->get_policy(entity_name_t::TYPE_CLIENT);
    if (pol.throttler_bytes && newval > 0) {
      pol.throttler_bytes->reset_max(newval);
    }
  }

  if (changed.count("osd_scrub_min_interval") ||
      changed.count("osd_scrub_max_interval")) {
    resched_all_scrubs();
    dout(0) << __func__ << ": scrub interval change" << dendl;
  }
  // re-validate cross-option invariants after any change
  check_config();
}
10373
// Re-parse the clog-related config options and push them into the log
// client.  NOTE(review): the trailing derr is NOT inside the if — it
// always runs, even when parse_log_client_options() fails; presumably
// intentional as a diagnostic, but confirm before relying on it.
void OSD::update_log_config()
{
  map<string,string> log_to_monitors;
  map<string,string> log_to_syslog;
  map<string,string> log_channel;
  map<string,string> log_prio;
  map<string,string> log_to_graylog;
  map<string,string> log_to_graylog_host;
  map<string,string> log_to_graylog_port;
  uuid_d fsid;
  string host;

  if (parse_log_client_options(cct, log_to_monitors, log_to_syslog,
                               log_channel, log_prio, log_to_graylog,
                               log_to_graylog_host, log_to_graylog_port,
                               fsid, host) == 0)
    clog->update_config(log_to_monitors, log_to_syslog,
                        log_channel, log_prio, log_to_graylog,
                        log_to_graylog_host, log_to_graylog_port,
                        fsid, host);
  derr << "log_to_monitors " << log_to_monitors << dendl;
}
10396
// Sanity-check cross-option invariants, warning (via clog) when the map
// cache is too small relative to how stale persisted PG epochs may get.
// Note the condition includes a +2 slack beyond the stale limit even
// though the warning text only mentions the raw limit.
void OSD::check_config()
{
  // some sanity checks
  if (cct->_conf->osd_map_cache_size <= (int)cct->_conf->osd_pg_epoch_persisted_max_stale + 2) {
    clog->warn() << "osd_map_cache_size (" << cct->_conf->osd_map_cache_size << ")"
                 << " is not > osd_pg_epoch_persisted_max_stale ("
                 << cct->_conf->osd_pg_epoch_persisted_max_stale << ")";
  }
}
10406
10407 // --------------------------------
10408
// Block the calling thread until the objecter has fetched the latest
// osdmap from the monitors.
void OSD::get_latest_osdmap()
{
  dout(10) << __func__ << " -- start" << dendl;

  C_SaferCond cond;
  service.objecter->wait_for_latest_osdmap(&cond);
  cond.wait();

  dout(10) << __func__ << " -- finish" << dendl;
}
10419
10420 // --------------------------------
10421
// Classify an incoming MOSDOp by inspecting each sub-op: set the
// read/write/cache/pg-op/promote/etc. rmw_flags on the OpRequest so
// later dispatch can route and order it correctly.  Returns 0 on
// success, -EINVAL if no flag could be derived, or a class-handler
// error for CEPH_OSD_OP_CALL lookups (-EOPNOTSUPP/-EPERM/-EIO/...).
int OSD::init_op_flags(OpRequestRef& op)
{
  const MOSDOp *m = static_cast<const MOSDOp*>(op->get_req());
  vector<OSDOp>::const_iterator iter;

  // client flags have no bearing on whether an op is a read, write, etc.
  op->rmw_flags = 0;

  if (m->has_flag(CEPH_OSD_FLAG_RWORDERED)) {
    op->set_force_rwordered();
  }

  // set bits based on op codes, called methods.
  for (iter = m->ops.begin(); iter != m->ops.end(); ++iter) {
    if ((iter->op.op == CEPH_OSD_OP_WATCH &&
         iter->op.watch.op == CEPH_OSD_WATCH_OP_PING)) {
      /* This a bit odd.  PING isn't actually a write.  It can't
       * result in an update to the object_info.  PINGs also aren't
       * resent, so there's no reason to write out a log entry.
       *
       * However, we pipeline them behind writes, so let's force
       * the write_ordered flag.
       */
      op->set_force_rwordered();
    } else {
      if (ceph_osd_op_mode_modify(iter->op.op))
        op->set_write();
    }
    if (ceph_osd_op_mode_read(iter->op.op))
      op->set_read();

    // set READ flag if there are src_oids
    if (iter->soid.oid.name.length())
      op->set_read();

    // set PGOP flag if there are PG ops
    if (ceph_osd_op_type_pg(iter->op.op))
      op->set_pg_op();

    if (ceph_osd_op_mode_cache(iter->op.op))
      op->set_cache();

    // check for ec base pool
    int64_t poolid = m->get_pg().pool();
    const pg_pool_t *pool = osdmap->get_pg_pool(poolid);
    if (pool && pool->is_tier()) {
      const pg_pool_t *base_pool = osdmap->get_pg_pool(pool->tier_of);
      if (base_pool && base_pool->require_rollback()) {
        // ops outside this whitelist can't be proxied to an EC base
        // pool; force promotion into the cache tier instead
        if ((iter->op.op != CEPH_OSD_OP_READ) &&
            (iter->op.op != CEPH_OSD_OP_CHECKSUM) &&
            (iter->op.op != CEPH_OSD_OP_CMPEXT) &&
            (iter->op.op != CEPH_OSD_OP_STAT) &&
            (iter->op.op != CEPH_OSD_OP_ISDIRTY) &&
            (iter->op.op != CEPH_OSD_OP_UNDIRTY) &&
            (iter->op.op != CEPH_OSD_OP_GETXATTR) &&
            (iter->op.op != CEPH_OSD_OP_GETXATTRS) &&
            (iter->op.op != CEPH_OSD_OP_CMPXATTR) &&
            (iter->op.op != CEPH_OSD_OP_ASSERT_VER) &&
            (iter->op.op != CEPH_OSD_OP_LIST_WATCHERS) &&
            (iter->op.op != CEPH_OSD_OP_LIST_SNAPS) &&
            (iter->op.op != CEPH_OSD_OP_SETALLOCHINT) &&
            (iter->op.op != CEPH_OSD_OP_WRITEFULL) &&
            (iter->op.op != CEPH_OSD_OP_ROLLBACK) &&
            (iter->op.op != CEPH_OSD_OP_CREATE) &&
            (iter->op.op != CEPH_OSD_OP_DELETE) &&
            (iter->op.op != CEPH_OSD_OP_SETXATTR) &&
            (iter->op.op != CEPH_OSD_OP_RMXATTR) &&
            (iter->op.op != CEPH_OSD_OP_STARTSYNC) &&
            (iter->op.op != CEPH_OSD_OP_COPY_GET) &&
            (iter->op.op != CEPH_OSD_OP_COPY_FROM)) {
          op->set_promote();
        }
      }
    }

    switch (iter->op.op) {
    case CEPH_OSD_OP_CALL:
      {
        // decode class/method names and derive flags from the method's
        // registered CLS_METHOD_* flags
        bufferlist::iterator bp = const_cast<bufferlist&>(iter->indata).begin();
        int is_write, is_read;
        string cname, mname;
        bp.copy(iter->op.cls.class_len, cname);
        bp.copy(iter->op.cls.method_len, mname);

        ClassHandler::ClassData *cls;
        int r = class_handler->open_class(cname, &cls);
        if (r) {
          derr << "class " << cname << " open got " << cpp_strerror(r) << dendl;
          if (r == -ENOENT)
            r = -EOPNOTSUPP;
          else if (r != -EPERM) // propagate permission errors
            r = -EIO;
          return r;
        }
        int flags = cls->get_method_flags(mname.c_str());
        if (flags < 0) {
          if (flags == -ENOENT)
            r = -EOPNOTSUPP;
          else
            r = flags;
          return r;
        }
        is_read = flags & CLS_METHOD_RD;
        is_write = flags & CLS_METHOD_WR;
        bool is_promote = flags & CLS_METHOD_PROMOTE;

        dout(10) << "class " << cname << " method " << mname << " "
                 << "flags=" << (is_read ? "r" : "")
                 << (is_write ? "w" : "")
                 << (is_promote ? "p" : "")
                 << dendl;
        if (is_read)
          op->set_class_read();
        if (is_write)
          op->set_class_write();
        if (is_promote)
          op->set_promote();
        op->add_class(std::move(cname), std::move(mname), is_read, is_write,
                      cls->whitelisted);
        break;
      }

    case CEPH_OSD_OP_WATCH:
      // force the read bit for watch since it is depends on previous
      // watch state (and may return early if the watch exists) or, in
      // the case of ping, is simply a read op.
      op->set_read();
      // fall through
    case CEPH_OSD_OP_NOTIFY:
    case CEPH_OSD_OP_NOTIFY_ACK:
      {
        op->set_promote();
        break;
      }

    case CEPH_OSD_OP_DELETE:
      // if we get a delete with FAILOK we can skip handle cache. without
      // FAILOK we still need to promote (or do something smarter) to
      // determine whether to return ENOENT or 0.
      if (iter == m->ops.begin() &&
          iter->op.flags == CEPH_OSD_OP_FLAG_FAILOK) {
        op->set_skip_handle_cache();
      }
      // skip promotion when proxying a delete op
      if (m->ops.size() == 1) {
        op->set_skip_promote();
      }
      break;

    case CEPH_OSD_OP_CACHE_TRY_FLUSH:
    case CEPH_OSD_OP_CACHE_FLUSH:
    case CEPH_OSD_OP_CACHE_EVICT:
      // If try_flush/flush/evict is the only op, can skip handle cache.
      if (m->ops.size() == 1) {
        op->set_skip_handle_cache();
      }
      break;

    case CEPH_OSD_OP_READ:
    case CEPH_OSD_OP_SYNC_READ:
    case CEPH_OSD_OP_SPARSE_READ:
    case CEPH_OSD_OP_CHECKSUM:
    case CEPH_OSD_OP_WRITEFULL:
      // client hinted the data won't be reused; don't bother promoting
      if (m->ops.size() == 1 &&
          (iter->op.flags & CEPH_OSD_OP_FLAG_FADVISE_NOCACHE ||
           iter->op.flags & CEPH_OSD_OP_FLAG_FADVISE_DONTNEED)) {
        op->set_skip_promote();
      }
      break;

    // force promotion when pin an object in cache tier
    case CEPH_OSD_OP_CACHE_PIN:
      op->set_promote();
      break;

    default:
      break;
    }
  }

  // an op that sets no flags at all is malformed
  if (op->rmw_flags == 0)
    return -EINVAL;

  return 0;
}
10607
// Install the mgr-provided set of dynamic perf-metric queries: filter
// out unsupported ones (those with an empty key descriptor), store the
// supported set and its limits, and push the queries down to every PG.
void OSD::set_perf_queries(
    const std::map<OSDPerfMetricQuery, OSDPerfMetricLimits> &queries) {
  dout(10) << "setting " << queries.size() << " queries" << dendl;

  std::list<OSDPerfMetricQuery> supported_queries;
  for (auto &it : queries) {
    auto &query = it.first;
    if (!query.key_descriptor.empty()) {
      supported_queries.push_back(query);
    }
  }
  if (supported_queries.size() < queries.size()) {
    dout(1) << queries.size() - supported_queries.size()
            << " unsupported queries" << dendl;
  }

  {
    Mutex::Locker locker(m_perf_queries_lock);
    m_perf_queries = supported_queries;
    m_perf_limits = queries;
  }

  // propagate to every PG (each under its own lock)
  std::vector<PGRef> pgs;
  _get_pgs(&pgs);
  for (auto& pg : pgs) {
    pg->lock();
    pg->set_dynamic_perf_stats_queries(supported_queries);
    pg->unlock();
  }
}
10638
// Collect dynamic perf-metric stats from every PG, merge them, and
// materialize the per-query reports for the mgr.
void OSD::get_perf_reports(
    std::map<OSDPerfMetricQuery, OSDPerfMetricReport> *reports) {
  std::vector<PGRef> pgs;
  _get_pgs(&pgs);
  DynamicPerfStats dps;
  for (auto& pg : pgs) {
    // m_perf_queries can be modified only in set_perf_queries by mgr client
    // request, and it is protected by by mgr client's lock, which is held
    // when set_perf_queries/get_perf_reports are called, so we may not hold
    // m_perf_queries_lock here.
    DynamicPerfStats pg_dps(m_perf_queries);
    pg->lock();
    pg->get_dynamic_perf_stats(&pg_dps);
    pg->unlock();
    dps.merge(pg_dps);
  }
  dps.add_to_reports(m_perf_limits, reports);
  dout(20) << "reports for " << reports->size() << " queries" << dendl;
}
10658
10659 // =============================================================
10660
10661 #undef dout_context
10662 #define dout_context cct
10663 #undef dout_prefix
10664 #define dout_prefix *_dout << "osd." << osd->get_nodeid() << ":" << shard_id << "." << __func__ << " "
10665
// Bind a PG to its shard slot: wire up the back-pointers, bump the
// OSD-wide PG count, and index the slot by the PG's current map epoch
// in pg_slots_by_epoch (used to compute the shard's min PG epoch).
// Caller holds shard_lock.
void OSDShard::_attach_pg(OSDShardPGSlot *slot, PG *pg)
{
  dout(10) << pg->pg_id << " " << pg << dendl;
  slot->pg = pg;
  pg->osd_shard = this;
  pg->pg_slot = slot;
  osd->inc_num_pgs();

  slot->epoch = pg->get_osdmap_epoch();
  pg_slots_by_epoch.insert(*slot);
}
10677
// Undo _attach_pg: clear the PG/slot back-pointers, drop the OSD-wide
// PG count, remove the slot from the epoch index, and wake anyone in
// wait_min_pg_epoch() since the shard's min epoch may have advanced.
// Caller holds shard_lock.
void OSDShard::_detach_pg(OSDShardPGSlot *slot)
{
  dout(10) << slot->pg->pg_id << " " << slot->pg << dendl;
  slot->pg->osd_shard = nullptr;
  slot->pg->pg_slot = nullptr;
  slot->pg = nullptr;
  osd->dec_num_pgs();

  pg_slots_by_epoch.erase(pg_slots_by_epoch.iterator_to(*slot));
  slot->epoch = 0;
  if (waiting_for_min_pg_epoch) {
    min_pg_epoch_cond.notify_all();
  }
}
10692
// Record that a PG has advanced to map epoch e: re-key its slot in the
// epoch-sorted intrusive set (erase, update, re-insert) and wake any
// wait_min_pg_epoch() waiters, since the shard minimum may have moved.
// NOTE(review): the dout(30) lines dereference begin() unguarded —
// presumably the set is non-empty whenever a slot is being updated.
void OSDShard::update_pg_epoch(OSDShardPGSlot *slot, epoch_t e)
{
  std::lock_guard l(shard_lock);
  dout(30) << "min was " << pg_slots_by_epoch.begin()->epoch
           << " on " << pg_slots_by_epoch.begin()->pg->pg_id << dendl;
  pg_slots_by_epoch.erase(pg_slots_by_epoch.iterator_to(*slot));
  dout(20) << slot->pg->pg_id << " " << slot->epoch << " -> " << e << dendl;
  slot->epoch = e;
  pg_slots_by_epoch.insert(*slot);
  dout(30) << "min is now " << pg_slots_by_epoch.begin()->epoch
           << " on " << pg_slots_by_epoch.begin()->pg->pg_id << dendl;
  if (waiting_for_min_pg_epoch) {
    min_pg_epoch_cond.notify_all();
  }
}
10708
10709 epoch_t OSDShard::get_min_pg_epoch()
10710 {
10711 std::lock_guard l(shard_lock);
10712 auto p = pg_slots_by_epoch.begin();
10713 if (p == pg_slots_by_epoch.end()) {
10714 return 0;
10715 }
10716 return p->epoch;
10717 }
10718
// Block until every PG on this shard has caught up to map epoch `need`
// (or the shard has no PGs).  The waiting_for_min_pg_epoch counter
// tells _detach_pg/update_pg_epoch that someone needs a notify.
void OSDShard::wait_min_pg_epoch(epoch_t need)
{
  std::unique_lock l{shard_lock};
  ++waiting_for_min_pg_epoch;
  min_pg_epoch_cond.wait(l, [need, this] {
    if (pg_slots_by_epoch.empty()) {
      // no PGs — nothing can lag
      return true;
    } else if (pg_slots_by_epoch.begin()->epoch >= need) {
      // slowest PG has reached the needed epoch
      return true;
    } else {
      dout(10) << need << " waiting on "
               << pg_slots_by_epoch.begin()->epoch << dendl;
      return false;
    }
  });
  --waiting_for_min_pg_epoch;
}
10736
10737 epoch_t OSDShard::get_max_waiting_epoch()
10738 {
10739 std::lock_guard l(shard_lock);
10740 epoch_t r = 0;
10741 for (auto& i : pg_slots) {
10742 if (!i.second->waiting_peering.empty()) {
10743 r = std::max(r, i.second->waiting_peering.rbegin()->first);
10744 }
10745 }
10746 return r;
10747 }
10748
// Install a new osdmap on this shard and reconcile every PG slot with
// it: requeue peering events whose target epoch is now reachable, drop
// stale/misdirected client ops (crediting their reserved pushes back
// via *pushes_to_free), and prune slots that are completely idle.
// Slots blocked on a split or a pending merge are left untouched.
void OSDShard::consume_map(
  OSDMapRef& new_osdmap,
  unsigned *pushes_to_free)
{
  std::lock_guard l(shard_lock);
  OSDMapRef old_osdmap;
  {
    // shard_osdmap has its own finer-grained lock for readers
    std::lock_guard l(osdmap_lock);
    old_osdmap = std::move(shard_osdmap);
    shard_osdmap = new_osdmap;
  }
  dout(10) << new_osdmap->get_epoch()
           << " (was " << (old_osdmap ? old_osdmap->get_epoch() : 0) << ")"
           << dendl;
  bool queued = false;

  // check slots
  auto p = pg_slots.begin();
  while (p != pg_slots.end()) {
    OSDShardPGSlot *slot = p->second.get();
    const spg_t& pgid = p->first;
    dout(20) << __func__ << " " << pgid << dendl;
    if (!slot->waiting_for_split.empty()) {
      // still waiting for a parent PG to split; leave slot alone
      dout(20) << __func__ << " " << pgid
               << " waiting for split " << slot->waiting_for_split << dendl;
      ++p;
      continue;
    }
    if (slot->waiting_for_merge_epoch > new_osdmap->get_epoch()) {
      // merge target epoch not reached yet; leave slot alone
      dout(20) << __func__ << " " << pgid
               << " waiting for merge by epoch " << slot->waiting_for_merge_epoch
               << dendl;
      ++p;
      continue;
    }
    if (!slot->waiting_peering.empty()) {
      epoch_t first = slot->waiting_peering.begin()->first;
      if (first <= new_osdmap->get_epoch()) {
        // at least one queued peering event can now run; requeue them all
        dout(20) << __func__ << " " << pgid
                 << " pending_peering first epoch " << first
                 << " <= " << new_osdmap->get_epoch() << ", requeueing" << dendl;
        _wake_pg_slot(pgid, slot);
        queued = true;
      }
      ++p;
      continue;
    }
    if (!slot->waiting.empty()) {
      if (new_osdmap->is_up_acting_osd_shard(pgid, osd->get_nodeid())) {
        dout(20) << __func__ << " " << pgid << " maps to us, keeping"
                 << dendl;
        ++p;
        continue;
      }
      // PG no longer maps to us: discard waiting items from superseded
      // epochs, reclaiming any recovery pushes they had reserved
      while (!slot->waiting.empty() &&
             slot->waiting.front().get_map_epoch() <= new_osdmap->get_epoch()) {
        auto& qi = slot->waiting.front();
        dout(20) << __func__ << " " << pgid
                 << " waiting item " << qi
                 << " epoch " << qi.get_map_epoch()
                 << " <= " << new_osdmap->get_epoch()
                 << ", "
                 << (qi.get_map_epoch() < new_osdmap->get_epoch() ? "stale" :
                     "misdirected")
                 << ", dropping" << dendl;
        *pushes_to_free += qi.get_reserved_pushes();
        slot->waiting.pop_front();
      }
    }
    if (slot->waiting.empty() &&
        slot->num_running == 0 &&
        slot->waiting_for_split.empty() &&
        !slot->pg) {
      // completely idle slot with no PG attached; reclaim it
      dout(20) << __func__ << " " << pgid << " empty, pruning" << dendl;
      p = pg_slots.erase(p);
      continue;
    }

    ++p;
  }
  if (queued) {
    // wake a worker to pick up the requeued items
    std::lock_guard l{sdata_wait_lock};
    sdata_cond.notify_one();
  }
}
10834
// Requeue everything parked on a PG slot back onto the front of the
// shard's work queue: in-flight to_process items, waiting ops, and all
// waiting_peering events.  Each list is walked in REVERSE because
// _enqueue_front pushes to the front — this preserves original order.
// Bumps requeue_seq so racing dequeuers notice the slot was refilled.
// Caller holds shard_lock.
void OSDShard::_wake_pg_slot(
  spg_t pgid,
  OSDShardPGSlot *slot)
{
  dout(20) << __func__ << " " << pgid
           << " to_process " << slot->to_process
           << " waiting " << slot->waiting
           << " waiting_peering " << slot->waiting_peering << dendl;
  for (auto i = slot->to_process.rbegin();
       i != slot->to_process.rend();
       ++i) {
    _enqueue_front(std::move(*i), osd->op_prio_cutoff);
  }
  slot->to_process.clear();
  for (auto i = slot->waiting.rbegin();
       i != slot->waiting.rend();
       ++i) {
    _enqueue_front(std::move(*i), osd->op_prio_cutoff);
  }
  slot->waiting.clear();
  for (auto i = slot->waiting_peering.rbegin();
       i != slot->waiting_peering.rend();
       ++i) {
    // this is overkill; we requeue everything, even if some of these
    // items are waiting for maps we don't have yet.  FIXME, maybe,
    // someday, if we decide this inefficiency matters
    for (auto j = i->second.rbegin(); j != i->second.rend(); ++j) {
      _enqueue_front(std::move(*j), osd->op_prio_cutoff);
    }
  }
  slot->waiting_peering.clear();
  ++slot->requeue_seq;
}
10868
10869 void OSDShard::identify_splits_and_merges(
10870 const OSDMapRef& as_of_osdmap,
10871 set<pair<spg_t,epoch_t>> *split_pgs,
10872 set<pair<spg_t,epoch_t>> *merge_pgs)
10873 {
10874 std::lock_guard l(shard_lock);
10875 if (shard_osdmap) {
10876 for (auto& i : pg_slots) {
10877 const spg_t& pgid = i.first;
10878 auto *slot = i.second.get();
10879 if (slot->pg) {
10880 osd->service.identify_splits_and_merges(
10881 shard_osdmap, as_of_osdmap, pgid,
10882 split_pgs, merge_pgs);
10883 } else if (!slot->waiting_for_split.empty()) {
10884 osd->service.identify_splits_and_merges(
10885 shard_osdmap, as_of_osdmap, pgid,
10886 split_pgs, nullptr);
10887 } else {
10888 dout(20) << __func__ << " slot " << pgid
10889 << " has no pg and waiting_for_split "
10890 << slot->waiting_for_split << dendl;
10891 }
10892 }
10893 }
10894 }
10895
10896 void OSDShard::prime_splits(const OSDMapRef& as_of_osdmap,
10897 set<pair<spg_t,epoch_t>> *pgids)
10898 {
10899 std::lock_guard l(shard_lock);
10900 _prime_splits(pgids);
10901 if (shard_osdmap->get_epoch() > as_of_osdmap->get_epoch()) {
10902 set<pair<spg_t,epoch_t>> newer_children;
10903 for (auto i : *pgids) {
10904 osd->service.identify_splits_and_merges(
10905 as_of_osdmap, shard_osdmap, i.first,
10906 &newer_children, nullptr);
10907 }
10908 newer_children.insert(pgids->begin(), pgids->end());
10909 dout(10) << "as_of_osdmap " << as_of_osdmap->get_epoch() << " < shard "
10910 << shard_osdmap->get_epoch() << ", new children " << newer_children
10911 << dendl;
10912 _prime_splits(&newer_children);
10913 // note: we don't care what is left over here for other shards.
10914 // if this shard is ahead of us and one isn't, e.g., one thread is
10915 // calling into prime_splits via _process (due to a newly created
10916 // pg) and this shard has a newer map due to a racing consume_map,
10917 // then any grandchildren left here will be identified (or were
10918 // identified) when the slower shard's osdmap is advanced.
10919 // _prime_splits() will tolerate the case where the pgid is
10920 // already primed.
10921 }
10922 }
10923
10924 void OSDShard::_prime_splits(set<pair<spg_t,epoch_t>> *pgids)
10925 {
10926 dout(10) << *pgids << dendl;
10927 auto p = pgids->begin();
10928 while (p != pgids->end()) {
10929 unsigned shard_index = p->first.hash_to_shard(osd->num_shards);
10930 if (shard_index == shard_id) {
10931 auto r = pg_slots.emplace(p->first, nullptr);
10932 if (r.second) {
10933 dout(10) << "priming slot " << p->first << " e" << p->second << dendl;
10934 r.first->second = make_unique<OSDShardPGSlot>();
10935 r.first->second->waiting_for_split.insert(p->second);
10936 } else {
10937 auto q = r.first;
10938 ceph_assert(q != pg_slots.end());
10939 dout(10) << "priming (existing) slot " << p->first << " e" << p->second
10940 << dendl;
10941 q->second->waiting_for_split.insert(p->second);
10942 }
10943 p = pgids->erase(p);
10944 } else {
10945 ++p;
10946 }
10947 }
10948 }
10949
10950 void OSDShard::prime_merges(const OSDMapRef& as_of_osdmap,
10951 set<pair<spg_t,epoch_t>> *merge_pgs)
10952 {
10953 std::lock_guard l(shard_lock);
10954 dout(20) << __func__ << " checking shard " << shard_id
10955 << " for remaining merge pgs " << merge_pgs << dendl;
10956 auto p = merge_pgs->begin();
10957 while (p != merge_pgs->end()) {
10958 spg_t pgid = p->first;
10959 epoch_t epoch = p->second;
10960 unsigned shard_index = pgid.hash_to_shard(osd->num_shards);
10961 if (shard_index != shard_id) {
10962 ++p;
10963 continue;
10964 }
10965 OSDShardPGSlot *slot;
10966 auto r = pg_slots.emplace(pgid, nullptr);
10967 if (r.second) {
10968 r.first->second = make_unique<OSDShardPGSlot>();
10969 }
10970 slot = r.first->second.get();
10971 if (slot->pg) {
10972 // already have pg
10973 dout(20) << __func__ << " have merge participant pg " << pgid
10974 << " " << slot->pg << dendl;
10975 } else if (!slot->waiting_for_split.empty() &&
10976 *slot->waiting_for_split.begin() < epoch) {
10977 dout(20) << __func__ << " pending split on merge participant pg " << pgid
10978 << " " << slot->waiting_for_split << dendl;
10979 } else {
10980 dout(20) << __func__ << " creating empty merge participant " << pgid
10981 << " for merge in " << epoch << dendl;
10982 // leave history zeroed; PG::merge_from() will fill it in.
10983 pg_history_t history;
10984 PGCreateInfo cinfo(pgid, epoch - 1,
10985 history, PastIntervals(), false);
10986 PGRef pg = osd->handle_pg_create_info(shard_osdmap, &cinfo);
10987 _attach_pg(r.first->second.get(), pg.get());
10988 _wake_pg_slot(pgid, slot);
10989 pg->unlock();
10990 }
10991 // mark slot for merge
10992 dout(20) << __func__ << " marking merge participant " << pgid << dendl;
10993 slot->waiting_for_merge_epoch = epoch;
10994 p = merge_pgs->erase(p);
10995 }
10996 }
10997
// Called once a split child PG has been created: attach it to its
// primed slot, clear the matching split epoch, and — once no further
// splits are pending on the slot — requeue everything that was
// waiting.  Then (outside shard_lock) queue a null peering event so
// the child advances to the latest osdmap, and wake a worker.
void OSDShard::register_and_wake_split_child(PG *pg)
{
  epoch_t epoch;
  {
    std::lock_guard l(shard_lock);
    dout(10) << pg->pg_id << " " << pg << dendl;
    auto p = pg_slots.find(pg->pg_id);
    ceph_assert(p != pg_slots.end());
    auto *slot = p->second.get();
    dout(20) << pg->pg_id << " waiting_for_split " << slot->waiting_for_split
	     << dendl;
    // the slot must have been primed: no pg yet, split(s) pending
    ceph_assert(!slot->pg);
    ceph_assert(!slot->waiting_for_split.empty());
    _attach_pg(slot, pg);

    epoch = pg->get_osdmap_epoch();
    ceph_assert(slot->waiting_for_split.count(epoch));
    slot->waiting_for_split.erase(epoch);
    if (slot->waiting_for_split.empty()) {
      _wake_pg_slot(pg->pg_id, slot);
    } else {
      dout(10) << __func__ << " still waiting for split on "
	       << slot->waiting_for_split << dendl;
    }
  }

  // kick child to ensure it pulls up to the latest osdmap
  osd->enqueue_peering_evt(
    pg->pg_id,
    PGPeeringEventRef(
      std::make_shared<PGPeeringEvent>(
	epoch,
	epoch,
	NullEvt())));

  std::lock_guard l{sdata_wait_lock};
  sdata_cond.notify_one();
}
11036
11037 void OSDShard::unprime_split_children(spg_t parent, unsigned old_pg_num)
11038 {
11039 std::lock_guard l(shard_lock);
11040 vector<spg_t> to_delete;
11041 for (auto& i : pg_slots) {
11042 if (i.first != parent &&
11043 i.first.get_ancestor(old_pg_num) == parent) {
11044 dout(10) << __func__ << " parent " << parent << " clearing " << i.first
11045 << dendl;
11046 _wake_pg_slot(i.first, i.second.get());
11047 to_delete.push_back(i.first);
11048 }
11049 }
11050 for (auto pgid : to_delete) {
11051 pg_slots.erase(pgid);
11052 }
11053 }
11054
11055
11056 // =============================================================
11057
11058 #undef dout_context
11059 #define dout_context osd->cct
11060 #undef dout_prefix
11061 #define dout_prefix *_dout << "osd." << osd->whoami << " op_wq "
11062
11063 void OSD::ShardedOpWQ::_add_slot_waiter(
11064 spg_t pgid,
11065 OSDShardPGSlot *slot,
11066 OpQueueItem&& qi)
11067 {
11068 if (qi.is_peering()) {
11069 dout(20) << __func__ << " " << pgid
11070 << " peering, item epoch is "
11071 << qi.get_map_epoch()
11072 << ", will wait on " << qi << dendl;
11073 slot->waiting_peering[qi.get_map_epoch()].push_back(std::move(qi));
11074 } else {
11075 dout(20) << __func__ << " " << pgid
11076 << " item epoch is "
11077 << qi.get_map_epoch()
11078 << ", will wait on " << qi << dendl;
11079 slot->waiting.push_back(std::move(qi));
11080 }
11081 }
11082
11083 #undef dout_prefix
11084 #define dout_prefix *_dout << "osd." << osd->whoami << " op_wq(" << shard_index << ") "
11085
// Worker-thread entry point: dequeue one item from this thread's
// shard, resolve it to a pg slot, take the pg lock (if the pg exists),
// and run the item.  The shard_lock/pg-lock interleave below is
// delicate: we must drop shard_lock to take the pg lock, then
// re-validate the slot (it may have been removed, emptied, or
// requeued by _wake_pg_slot in the meantime) before proceeding.
void OSD::ShardedOpWQ::_process(uint32_t thread_index, heartbeat_handle_d *hb)
{
  uint32_t shard_index = thread_index % osd->num_shards;
  auto& sdata = osd->shards[shard_index];
  ceph_assert(sdata);

  // If all threads of shards do oncommits, there is a out-of-order
  // problem.  So we choose the thread which has the smallest
  // thread_index(thread_index < num_shards) of shard to do oncommit
  // callback.
  bool is_smallest_thread_index = thread_index < osd->num_shards;

  // peek at spg_t
  sdata->shard_lock.lock();
  if (sdata->pqueue->empty() &&
      (!is_smallest_thread_index || sdata->context_queue.empty())) {
    std::unique_lock wait_lock{sdata->sdata_wait_lock};
    if (is_smallest_thread_index && !sdata->context_queue.empty()) {
      // we raced with a context_queue addition, don't wait
      wait_lock.unlock();
    } else if (!sdata->stop_waiting) {
      dout(20) << __func__ << " empty q, waiting" << dendl;
      osd->cct->get_heartbeat_map()->clear_timeout(hb);
      // release shard_lock while blocked on the condvar; reacquire
      // and re-check emptiness afterwards
      sdata->shard_lock.unlock();
      sdata->sdata_cond.wait(wait_lock);
      wait_lock.unlock();
      sdata->shard_lock.lock();
      if (sdata->pqueue->empty() &&
	  !(is_smallest_thread_index && !sdata->context_queue.empty())) {
	sdata->shard_lock.unlock();
	return;
      }
      osd->cct->get_heartbeat_map()->reset_timeout(hb,
	  osd->cct->_conf->threadpool_default_timeout, 0);
    } else {
      // stop_waiting is set (shutdown/drain); bail out immediately
      dout(20) << __func__ << " need return immediately" << dendl;
      wait_lock.unlock();
      sdata->shard_lock.unlock();
      return;
    }
  }

  // only the designated (smallest-index) thread drains oncommit
  // contexts, to keep completions ordered
  list<Context *> oncommits;
  if (is_smallest_thread_index && !sdata->context_queue.empty()) {
    sdata->context_queue.swap(oncommits);
  }

  if (sdata->pqueue->empty()) {
    if (osd->is_stopping()) {
      sdata->shard_lock.unlock();
      for (auto c : oncommits) {
	dout(10) << __func__ << " discarding in-flight oncommit " << c << dendl;
	delete c;
      }
      return;    // OSD shutdown, discard.
    }
    sdata->shard_lock.unlock();
    handle_oncommits(oncommits);
    return;
  }

  OpQueueItem item = sdata->pqueue->dequeue();
  if (osd->is_stopping()) {
    sdata->shard_lock.unlock();
    for (auto c : oncommits) {
      dout(10) << __func__ << " discarding in-flight oncommit " << c << dendl;
      delete c;
    }
    return;    // OSD shutdown, discard.
  }

  // find or create the slot for this item's ordering token (pgid)
  const auto token = item.get_ordering_token();
  auto r = sdata->pg_slots.emplace(token, nullptr);
  if (r.second) {
    r.first->second = make_unique<OSDShardPGSlot>();
  }
  OSDShardPGSlot *slot = r.first->second.get();
  dout(20) << __func__ << " " << token
	   << (r.second ? " (new)" : "")
	   << " to_process " << slot->to_process
	   << " waiting " << slot->waiting
	   << " waiting_peering " << slot->waiting_peering
	   << dendl;
  slot->to_process.push_back(std::move(item));
  dout(20) << __func__ << " " << slot->to_process.back()
	   << " queued" << dendl;

 retry_pg:
  PGRef pg = slot->pg;

  // lock pg (if we have it)
  if (pg) {
    // note the requeue seq now...
    uint64_t requeue_seq = slot->requeue_seq;
    ++slot->num_running;

    // drop shard_lock to take the pg lock (lock ordering); everything
    // about the slot must be re-validated once we reacquire it
    sdata->shard_lock.unlock();
    osd->service.maybe_inject_dispatch_delay();
    pg->lock();
    osd->service.maybe_inject_dispatch_delay();
    sdata->shard_lock.lock();

    auto q = sdata->pg_slots.find(token);
    if (q == sdata->pg_slots.end()) {
      // this can happen if we race with pg removal.
      dout(20) << __func__ << " slot " << token << " no longer there" << dendl;
      pg->unlock();
      sdata->shard_lock.unlock();
      handle_oncommits(oncommits);
      return;
    }
    slot = q->second.get();
    --slot->num_running;

    if (slot->to_process.empty()) {
      // raced with _wake_pg_slot or consume_map
      dout(20) << __func__ << " " << token
	       << " nothing queued" << dendl;
      pg->unlock();
      sdata->shard_lock.unlock();
      handle_oncommits(oncommits);
      return;
    }
    if (requeue_seq != slot->requeue_seq) {
      // our item was requeued; another worker will pick it up
      dout(20) << __func__ << " " << token
	       << " requeue_seq " << slot->requeue_seq << " > our "
	       << requeue_seq << ", we raced with _wake_pg_slot"
	       << dendl;
      pg->unlock();
      sdata->shard_lock.unlock();
      handle_oncommits(oncommits);
      return;
    }
    if (slot->pg != pg) {
      // this can happen if we race with pg removal.
      dout(20) << __func__ << " slot " << token << " no longer attached to "
	       << pg << dendl;
      pg->unlock();
      goto retry_pg;
    }
  }

  dout(20) << __func__ << " " << token
	   << " to_process " << slot->to_process
	   << " waiting " << slot->waiting
	   << " waiting_peering " << slot->waiting_peering << dendl;

  ThreadPool::TPHandle tp_handle(osd->cct, hb, timeout_interval,
				 suicide_interval);

  // take next item
  auto qi = std::move(slot->to_process.front());
  slot->to_process.pop_front();
  dout(20) << __func__ << " " << qi << " pg " << pg << dendl;
  set<pair<spg_t,epoch_t>> new_children;
  OSDMapRef osdmap;

  // no pg attached: decide whether to wait, create, run pg-less, or
  // drop the item.  (loop form, but every path either breaks with a
  // newly created pg or returns)
  while (!pg) {
    // should this pg shard exist on this osd in this (or a later) epoch?
    osdmap = sdata->shard_osdmap;
    const PGCreateInfo *create_info = qi.creates_pg();
    if (!slot->waiting_for_split.empty()) {
      // a split is pending on this slot; park the item until it completes
      dout(20) << __func__ << " " << token
	       << " splitting " << slot->waiting_for_split << dendl;
      _add_slot_waiter(token, slot, std::move(qi));
    } else if (qi.get_map_epoch() > osdmap->get_epoch()) {
      // item is from the future relative to our shard map; wait
      dout(20) << __func__ << " " << token
	       << " map " << qi.get_map_epoch() << " > "
	       << osdmap->get_epoch() << dendl;
      _add_slot_waiter(token, slot, std::move(qi));
    } else if (qi.is_peering()) {
      if (!qi.peering_requires_pg()) {
	// for pg-less events, we run them under the ordering lock, since
	// we don't have the pg lock to keep them ordered.
	qi.run(osd, sdata, pg, tp_handle);
      } else if (osdmap->is_up_acting_osd_shard(token, osd->whoami)) {
	if (create_info) {
	  if (create_info->by_mon &&
	      osdmap->get_pg_acting_primary(token.pgid) != osd->whoami) {
	    dout(20) << __func__ << " " << token
		     << " no pg, no longer primary, ignoring mon create on "
		     << qi << dendl;
	  } else {
	    dout(20) << __func__ << " " << token
		     << " no pg, should create on " << qi << dendl;
	    pg = osd->handle_pg_create_info(osdmap, create_info);
	    if (pg) {
	      // we created the pg! drop out and continue "normally"!
	      sdata->_attach_pg(slot, pg.get());
	      sdata->_wake_pg_slot(token, slot);

	      // identify split children between create epoch and shard epoch.
	      osd->service.identify_splits_and_merges(
		pg->get_osdmap(), osdmap, pg->pg_id, &new_children, nullptr);
	      sdata->_prime_splits(&new_children);
	      // distribute remaining split children to other shards below!
	      break;
	    }
	    dout(20) << __func__ << " ignored create on " << qi << dendl;
	  }
	} else {
	  dout(20) << __func__ << " " << token
		   << " no pg, peering, !create, discarding " << qi << dendl;
	}
      } else {
	dout(20) << __func__ << " " << token
		 << " no pg, peering, doesn't map here e" << osdmap->get_epoch()
		 << ", discarding " << qi
		 << dendl;
      }
    } else if (osdmap->is_up_acting_osd_shard(token, osd->whoami)) {
      // pg should exist here but hasn't been instantiated yet; wait
      dout(20) << __func__ << " " << token
	       << " no pg, should exist e" << osdmap->get_epoch()
	       << ", will wait on " << qi << dendl;
      _add_slot_waiter(token, slot, std::move(qi));
    } else {
      dout(20) << __func__ << " " << token
	       << " no pg, shouldn't exist e" << osdmap->get_epoch()
	       << ", dropping " << qi << dendl;
      // share map with client?
      if (boost::optional<OpRequestRef> _op = qi.maybe_get_op()) {
	auto priv = (*_op)->get_req()->get_connection()->get_priv();
	if (auto session = static_cast<Session *>(priv.get()); session) {
	  osd->maybe_share_map(session, *_op, sdata->shard_osdmap);
	}
      }
      // give back any recovery-push reservations held by the dropped item
      unsigned pushes_to_free = qi.get_reserved_pushes();
      if (pushes_to_free > 0) {
	sdata->shard_lock.unlock();
	osd->service.release_reserved_pushes(pushes_to_free);
	handle_oncommits(oncommits);
	return;
      }
    }
    sdata->shard_lock.unlock();
    handle_oncommits(oncommits);
    return;
  }
  if (qi.is_peering()) {
    OSDMapRef osdmap = sdata->shard_osdmap;
    if (qi.get_map_epoch() > osdmap->get_epoch()) {
      // peering event from a future epoch; park it and release the pg
      _add_slot_waiter(token, slot, std::move(qi));
      sdata->shard_lock.unlock();
      pg->unlock();
      handle_oncommits(oncommits);
      return;
    }
  }
  sdata->shard_lock.unlock();

  // offer any split children we discovered during pg creation to the
  // other shards (prime_splits erases the entries it consumes)
  if (!new_children.empty()) {
    for (auto shard : osd->shards) {
      shard->prime_splits(osdmap, &new_children);
    }
    ceph_assert(new_children.empty());
  }

  // osd_opwq_process marks the point at which an operation has been dequeued
  // and will begin to be handled by a worker thread.
  {
#ifdef WITH_LTTNG
    osd_reqid_t reqid;
    if (boost::optional<OpRequestRef> _op = qi.maybe_get_op()) {
      reqid = (*_op)->get_reqid();
    }
#endif
    // note: reqid only exists WITH_LTTNG; tracepoint() presumably
    // expands to nothing otherwise, or this would not compile
    tracepoint(osd, opwq_process_start, reqid.name._type,
	       reqid.name._num, reqid.tid, reqid.inc);
  }

  lgeneric_subdout(osd->cct, osd, 30) << "dequeue status: ";
  Formatter *f = Formatter::create("json");
  f->open_object_section("q");
  dump(f);
  f->close_section();
  f->flush(*_dout);
  delete f;
  *_dout << dendl;

  // run the item with the pg lock held (pg may be null for pg-less work)
  qi.run(osd, sdata, pg, tp_handle);

  {
#ifdef WITH_LTTNG
    osd_reqid_t reqid;
    if (boost::optional<OpRequestRef> _op = qi.maybe_get_op()) {
      reqid = (*_op)->get_reqid();
    }
#endif
    tracepoint(osd, opwq_process_finish, reqid.name._type,
	       reqid.name._num, reqid.tid, reqid.inc);
  }

  handle_oncommits(oncommits);
}
11380
11381 void OSD::ShardedOpWQ::_enqueue(OpQueueItem&& item) {
11382 uint32_t shard_index =
11383 item.get_ordering_token().hash_to_shard(osd->shards.size());
11384
11385 OSDShard* sdata = osd->shards[shard_index];
11386 assert (NULL != sdata);
11387 unsigned priority = item.get_priority();
11388 unsigned cost = item.get_cost();
11389 sdata->shard_lock.lock();
11390
11391 dout(20) << __func__ << " " << item << dendl;
11392 if (priority >= osd->op_prio_cutoff)
11393 sdata->pqueue->enqueue_strict(
11394 item.get_owner(), priority, std::move(item));
11395 else
11396 sdata->pqueue->enqueue(
11397 item.get_owner(), priority, cost, std::move(item));
11398 sdata->shard_lock.unlock();
11399
11400 std::lock_guard l{sdata->sdata_wait_lock};
11401 sdata->sdata_cond.notify_one();
11402 }
11403
// Requeue an item at the FRONT of its shard's queue.  Used for
// requeues, which must be processed before newly arriving work for
// the same pg.
void OSD::ShardedOpWQ::_enqueue_front(OpQueueItem&& item)
{
  auto shard_index = item.get_ordering_token().hash_to_shard(osd->shards.size());
  auto& sdata = osd->shards[shard_index];
  ceph_assert(sdata);
  sdata->shard_lock.lock();
  auto p = sdata->pg_slots.find(item.get_ordering_token());
  if (p != sdata->pg_slots.end() &&
      !p->second->to_process.empty()) {
    // we may be racing with _process, which has dequeued a new item
    // from pqueue, put it on to_process, and is now busy taking the
    // pg lock.  ensure this old requeued item is ordered before any
    // such newer item in to_process.
    //
    // swap: park the requeued item at the front of to_process, and
    // instead push the newest to_process item back onto pqueue.
    p->second->to_process.push_front(std::move(item));
    item = std::move(p->second->to_process.back());
    p->second->to_process.pop_back();
    dout(20) << __func__
	     << " " << p->second->to_process.front()
	     << " shuffled w/ " << item << dendl;
  } else {
    dout(20) << __func__ << " " << item << dendl;
  }
  sdata->_enqueue_front(std::move(item), osd->op_prio_cutoff);
  sdata->shard_lock.unlock();
  // wake a worker outside shard_lock
  std::lock_guard l{sdata->sdata_wait_lock};
  sdata->sdata_cond.notify_one();
}
11431
11432 namespace ceph {
11433 namespace osd_cmds {
11434
11435 int heap(CephContext& cct, const cmdmap_t& cmdmap, Formatter& f,
11436 std::ostream& os)
11437 {
11438 if (!ceph_using_tcmalloc()) {
11439 os << "could not issue heap profiler command -- not using tcmalloc!";
11440 return -EOPNOTSUPP;
11441 }
11442
11443 string cmd;
11444 if (!cmd_getval(&cct, cmdmap, "heapcmd", cmd)) {
11445 os << "unable to get value for command \"" << cmd << "\"";
11446 return -EINVAL;
11447 }
11448
11449 std::vector<std::string> cmd_vec;
11450 get_str_vec(cmd, cmd_vec);
11451
11452 string val;
11453 if (cmd_getval(&cct, cmdmap, "value", val)) {
11454 cmd_vec.push_back(val);
11455 }
11456
11457 ceph_heap_profiler_handle_command(cmd_vec, os);
11458
11459 return 0;
11460 }
11461
11462 }} // namespace ceph::osd_cmds
11463
11464
11465 std::ostream& operator<<(std::ostream& out, const io_queue& q) {
11466 switch(q) {
11467 case io_queue::prioritized:
11468 out << "prioritized";
11469 break;
11470 case io_queue::weightedpriority:
11471 out << "weightedpriority";
11472 break;
11473 case io_queue::mclock_opclass:
11474 out << "mclock_opclass";
11475 break;
11476 case io_queue::mclock_client:
11477 out << "mclock_client";
11478 break;
11479 }
11480 return out;
11481 }