// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
// vim: ts=8 sw=2 smarttab
/*
 * Ceph - scalable distributed file system
 *
 * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
 * Copyright (C) 2017 OVH
 *
 * This is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License version 2.1, as published by the Free Software
 * Foundation. See file COPYING.
 *
 */
15 #include "acconfig.h"
16 #include <unistd.h>
17 #include <fstream>
18 #include <iostream>
19 #include <errno.h>
20 #include <sys/stat.h>
21 #include <signal.h>
22 #include <ctype.h>
23 #include <boost/scoped_ptr.hpp>
24 #include <random>
25
26 #ifdef HAVE_SYS_PARAM_H
27 #include <sys/param.h>
28 #endif
29
30 #ifdef HAVE_SYS_MOUNT_H
31 #include <sys/mount.h>
32 #endif
33
34 #include "osd/PG.h"
35
36 #include "include/types.h"
37 #include "include/compat.h"
38
39 #include "OSD.h"
40 #include "OSDMap.h"
41 #include "Watch.h"
42 #include "osdc/Objecter.h"
43
44 #include "common/errno.h"
45 #include "common/ceph_argparse.h"
46 #include "common/ceph_time.h"
47 #include "common/version.h"
48 #include "common/io_priority.h"
49 #include "common/pick_address.h"
50
51 #include "os/ObjectStore.h"
52 #ifdef HAVE_LIBFUSE
53 #include "os/FuseStore.h"
54 #endif
55
56 #include "PrimaryLogPG.h"
57
58
59 #include "msg/Messenger.h"
60 #include "msg/Message.h"
61
62 #include "mon/MonClient.h"
63
64 #include "messages/MLog.h"
65
66 #include "messages/MGenericMessage.h"
67 #include "messages/MOSDPing.h"
68 #include "messages/MOSDFailure.h"
69 #include "messages/MOSDMarkMeDown.h"
70 #include "messages/MOSDFull.h"
71 #include "messages/MOSDOp.h"
72 #include "messages/MOSDOpReply.h"
73 #include "messages/MOSDBackoff.h"
74 #include "messages/MOSDBeacon.h"
75 #include "messages/MOSDRepOp.h"
76 #include "messages/MOSDRepOpReply.h"
77 #include "messages/MOSDBoot.h"
78 #include "messages/MOSDPGTemp.h"
79
80 #include "messages/MOSDMap.h"
81 #include "messages/MMonGetOSDMap.h"
82 #include "messages/MOSDPGNotify.h"
83 #include "messages/MOSDPGQuery.h"
84 #include "messages/MOSDPGLog.h"
85 #include "messages/MOSDPGRemove.h"
86 #include "messages/MOSDPGInfo.h"
87 #include "messages/MOSDPGCreate.h"
88 #include "messages/MOSDPGTrim.h"
89 #include "messages/MOSDPGScan.h"
90 #include "messages/MOSDPGBackfill.h"
91 #include "messages/MBackfillReserve.h"
92 #include "messages/MRecoveryReserve.h"
93 #include "messages/MOSDForceRecovery.h"
94 #include "messages/MOSDECSubOpWrite.h"
95 #include "messages/MOSDECSubOpWriteReply.h"
96 #include "messages/MOSDECSubOpRead.h"
97 #include "messages/MOSDECSubOpReadReply.h"
98 #include "messages/MOSDPGCreated.h"
99 #include "messages/MOSDPGUpdateLogMissing.h"
100 #include "messages/MOSDPGUpdateLogMissingReply.h"
101
102 #include "messages/MOSDAlive.h"
103
104 #include "messages/MOSDScrub.h"
105 #include "messages/MOSDScrubReserve.h"
106 #include "messages/MOSDRepScrub.h"
107
108 #include "messages/MMonCommand.h"
109 #include "messages/MCommand.h"
110 #include "messages/MCommandReply.h"
111
112 #include "messages/MPGStats.h"
113 #include "messages/MPGStatsAck.h"
114
115 #include "messages/MWatchNotify.h"
116 #include "messages/MOSDPGPush.h"
117 #include "messages/MOSDPGPushReply.h"
118 #include "messages/MOSDPGPull.h"
119
120 #include "common/perf_counters.h"
121 #include "common/Timer.h"
122 #include "common/LogClient.h"
123 #include "common/AsyncReserver.h"
124 #include "common/HeartbeatMap.h"
125 #include "common/admin_socket.h"
126 #include "common/ceph_context.h"
127
128 #include "global/signal_handler.h"
129 #include "global/pidfile.h"
130
131 #include "include/color.h"
132 #include "perfglue/cpu_profiler.h"
133 #include "perfglue/heap_profiler.h"
134
135 #include "osd/OpRequest.h"
136
137 #include "auth/AuthAuthorizeHandler.h"
138 #include "auth/RotatingKeyRing.h"
139 #include "common/errno.h"
140
141 #include "objclass/objclass.h"
142
143 #include "common/cmdparse.h"
144 #include "include/str_list.h"
145 #include "include/util.h"
146
147 #include "include/assert.h"
148 #include "common/config.h"
149 #include "common/EventTrace.h"
150
151 #ifdef WITH_LTTNG
152 #define TRACEPOINT_DEFINE
153 #define TRACEPOINT_PROBE_DYNAMIC_LINKAGE
154 #include "tracing/osd.h"
155 #undef TRACEPOINT_PROBE_DYNAMIC_LINKAGE
156 #undef TRACEPOINT_DEFINE
157 #else
158 #define tracepoint(...)
159 #endif

#define dout_context cct
#define dout_subsys ceph_subsys_osd
#undef dout_prefix
#define dout_prefix _prefix(_dout, whoami, get_osdmap_epoch())


static ostream& _prefix(std::ostream* _dout, int whoami, epoch_t epoch) {
  return *_dout << "osd." << whoami << " " << epoch << " ";
}

// Initial features in a new superblock.
// Features here are also automatically upgraded.
CompatSet OSD::get_osd_initial_compat_set() {
  CompatSet::FeatureSet ceph_osd_feature_compat;
  CompatSet::FeatureSet ceph_osd_feature_ro_compat;
  CompatSet::FeatureSet ceph_osd_feature_incompat;
  ceph_osd_feature_incompat.insert(CEPH_OSD_FEATURE_INCOMPAT_BASE);
  ceph_osd_feature_incompat.insert(CEPH_OSD_FEATURE_INCOMPAT_PGINFO);
  ceph_osd_feature_incompat.insert(CEPH_OSD_FEATURE_INCOMPAT_OLOC);
  ceph_osd_feature_incompat.insert(CEPH_OSD_FEATURE_INCOMPAT_LEC);
  ceph_osd_feature_incompat.insert(CEPH_OSD_FEATURE_INCOMPAT_CATEGORIES);
  ceph_osd_feature_incompat.insert(CEPH_OSD_FEATURE_INCOMPAT_HOBJECTPOOL);
  ceph_osd_feature_incompat.insert(CEPH_OSD_FEATURE_INCOMPAT_BIGINFO);
  ceph_osd_feature_incompat.insert(CEPH_OSD_FEATURE_INCOMPAT_LEVELDBINFO);
  ceph_osd_feature_incompat.insert(CEPH_OSD_FEATURE_INCOMPAT_LEVELDBLOG);
  ceph_osd_feature_incompat.insert(CEPH_OSD_FEATURE_INCOMPAT_SNAPMAPPER);
  ceph_osd_feature_incompat.insert(CEPH_OSD_FEATURE_INCOMPAT_HINTS);
  ceph_osd_feature_incompat.insert(CEPH_OSD_FEATURE_INCOMPAT_PGMETA);
  ceph_osd_feature_incompat.insert(CEPH_OSD_FEATURE_INCOMPAT_MISSING);
  ceph_osd_feature_incompat.insert(CEPH_OSD_FEATURE_INCOMPAT_FASTINFO);
  ceph_osd_feature_incompat.insert(CEPH_OSD_FEATURE_INCOMPAT_RECOVERY_DELETES);
  return CompatSet(ceph_osd_feature_compat, ceph_osd_feature_ro_compat,
                   ceph_osd_feature_incompat);
}

// Features that this OSD supports are added here.
CompatSet OSD::get_osd_compat_set() {
  CompatSet compat = get_osd_initial_compat_set();
  // Any features here can be set in code, but not in the initial superblock.
  compat.incompat.insert(CEPH_OSD_FEATURE_INCOMPAT_SHARDS);
  return compat;
}

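// OSDService bundles the shared helper state (timers, reservers, caches,
// messengers, the objecter) that PGs use; nearly everything below is wired
// up from the owning OSD in the initializer list.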
OSDService::OSDService(OSD *osd) :
  osd(osd),
  cct(osd->cct),
  meta_osr(new ObjectStore::Sequencer("meta")),
  whoami(osd->whoami), store(osd->store),
  log_client(osd->log_client), clog(osd->clog),
  pg_recovery_stats(osd->pg_recovery_stats),
  cluster_messenger(osd->cluster_messenger),
  client_messenger(osd->client_messenger),
  logger(osd->logger),
  recoverystate_perf(osd->recoverystate_perf),
  monc(osd->monc),
  peering_wq(osd->peering_wq),
  recovery_gen_wq("recovery_gen_wq", cct->_conf->osd_recovery_thread_timeout,
                  &osd->disk_tp),
  class_handler(osd->class_handler),
  pg_epoch_lock("OSDService::pg_epoch_lock"),
  publish_lock("OSDService::publish_lock"),
  pre_publish_lock("OSDService::pre_publish_lock"),
  max_oldest_map(0),
  peer_map_epoch_lock("OSDService::peer_map_epoch_lock"),
  sched_scrub_lock("OSDService::sched_scrub_lock"), scrubs_pending(0),
  scrubs_active(0),
  agent_lock("OSDService::agent_lock"),
  agent_valid_iterator(false),
  agent_ops(0),
  flush_mode_high_count(0),
  agent_active(true),
  agent_thread(this),
  agent_stop_flag(false),
  agent_timer_lock("OSDService::agent_timer_lock"),
  agent_timer(osd->client_messenger->cct, agent_timer_lock),
  last_recalibrate(ceph_clock_now()),
  promote_max_objects(0),
  promote_max_bytes(0),
  objecter(new Objecter(osd->client_messenger->cct, osd->objecter_messenger, osd->monc, NULL, 0, 0)),
  objecter_finisher(osd->client_messenger->cct),
  watch_lock("OSDService::watch_lock"),
  watch_timer(osd->client_messenger->cct, watch_lock),
  next_notif_id(0),
  recovery_request_lock("OSDService::recovery_request_lock"),
  recovery_request_timer(cct, recovery_request_lock, false),
  recovery_sleep_lock("OSDService::recovery_sleep_lock"),
  recovery_sleep_timer(cct, recovery_sleep_lock, false),
  reserver_finisher(cct),
  local_reserver(cct, &reserver_finisher, cct->_conf->osd_max_backfills,
                 cct->_conf->osd_min_recovery_priority),
  remote_reserver(cct, &reserver_finisher, cct->_conf->osd_max_backfills,
                  cct->_conf->osd_min_recovery_priority),
  pg_temp_lock("OSDService::pg_temp_lock"),
  snap_sleep_lock("OSDService::snap_sleep_lock"),
  snap_sleep_timer(
    osd->client_messenger->cct, snap_sleep_lock, false /* relax locking */),
  scrub_sleep_lock("OSDService::scrub_sleep_lock"),
  scrub_sleep_timer(
    osd->client_messenger->cct, scrub_sleep_lock, false /* relax locking */),
  snap_reserver(cct, &reserver_finisher,
                cct->_conf->osd_max_trimming_pgs),
  recovery_lock("OSDService::recovery_lock"),
  recovery_ops_active(0),
  recovery_ops_reserved(0),
  recovery_paused(false),
  map_cache_lock("OSDService::map_cache_lock"),
  map_cache(cct, cct->_conf->osd_map_cache_size),
  map_bl_cache(cct->_conf->osd_map_cache_size),
  map_bl_inc_cache(cct->_conf->osd_map_cache_size),
  in_progress_split_lock("OSDService::in_progress_split_lock"),
  stat_lock("OSDService::stat_lock"),
  full_status_lock("OSDService::full_status_lock"),
  cur_state(NONE),
  cur_ratio(0),
  epoch_lock("OSDService::epoch_lock"),
  boot_epoch(0), up_epoch(0), bind_epoch(0),
  is_stopping_lock("OSDService::is_stopping_lock")
#ifdef PG_DEBUG_REFS
  , pgid_lock("OSDService::pgid_lock")
#endif
{
  objecter->init();
}

OSDService::~OSDService()
{
  delete objecter;
}


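// PG reference debugging (compiled only with PG_DEBUG_REFS): track the
// number of outstanding references per live PG so leaks can be dumped.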
#ifdef PG_DEBUG_REFS
void OSDService::add_pgid(spg_t pgid, PG *pg) {
  Mutex::Locker l(pgid_lock);
  if (!pgid_tracker.count(pgid)) {
    live_pgs[pgid] = pg;
  }
  pgid_tracker[pgid]++;
}
void OSDService::remove_pgid(spg_t pgid, PG *pg)
{
  Mutex::Locker l(pgid_lock);
  assert(pgid_tracker.count(pgid));
  assert(pgid_tracker[pgid] > 0);
  pgid_tracker[pgid]--;
  if (pgid_tracker[pgid] == 0) {
    pgid_tracker.erase(pgid);
    live_pgs.erase(pgid);
  }
}
void OSDService::dump_live_pgids()
{
  Mutex::Locker l(pgid_lock);
  derr << "live pgids:" << dendl;
  for (map<spg_t, int>::const_iterator i = pgid_tracker.cbegin();
       i != pgid_tracker.cend();
       ++i) {
    derr << "\t" << *i << dendl;
    live_pgs[i->first]->dump_live_ids();
  }
}
#endif


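// PG split tracking: when pg_num grows, child PGs are first recorded in
// pending_splits (child -> parent, plus the reverse map), promoted to
// in_progress_splits once the split actually starts, and dropped again in
// complete_split().  splitting() reports whether a pgid is in either state
// so op handling can hold off.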
void OSDService::_start_split(spg_t parent, const set<spg_t> &children)
{
  for (set<spg_t>::const_iterator i = children.begin();
       i != children.end();
       ++i) {
    dout(10) << __func__ << ": Starting split on pg " << *i
             << ", parent=" << parent << dendl;
    assert(!pending_splits.count(*i));
    assert(!in_progress_splits.count(*i));
    pending_splits.insert(make_pair(*i, parent));

    assert(!rev_pending_splits[parent].count(*i));
    rev_pending_splits[parent].insert(*i);
  }
}

void OSDService::mark_split_in_progress(spg_t parent, const set<spg_t> &children)
{
  Mutex::Locker l(in_progress_split_lock);
  map<spg_t, set<spg_t> >::iterator piter = rev_pending_splits.find(parent);
  assert(piter != rev_pending_splits.end());
  for (set<spg_t>::const_iterator i = children.begin();
       i != children.end();
       ++i) {
    assert(piter->second.count(*i));
    assert(pending_splits.count(*i));
    assert(!in_progress_splits.count(*i));
    assert(pending_splits[*i] == parent);

    pending_splits.erase(*i);
    piter->second.erase(*i);
    in_progress_splits.insert(*i);
  }
  if (piter->second.empty())
    rev_pending_splits.erase(piter);
}

void OSDService::cancel_pending_splits_for_parent(spg_t parent)
{
  Mutex::Locker l(in_progress_split_lock);
  _cancel_pending_splits_for_parent(parent);
}

void OSDService::_cancel_pending_splits_for_parent(spg_t parent)
{
  map<spg_t, set<spg_t> >::iterator piter = rev_pending_splits.find(parent);
  if (piter == rev_pending_splits.end())
    return;

  for (set<spg_t>::iterator i = piter->second.begin();
       i != piter->second.end();
       ++i) {
    assert(pending_splits.count(*i));
    assert(!in_progress_splits.count(*i));
    pending_splits.erase(*i);
    dout(10) << __func__ << ": Completing split on pg " << *i
             << " for parent: " << parent << dendl;
    _cancel_pending_splits_for_parent(*i);
  }
  rev_pending_splits.erase(piter);
}

void OSDService::_maybe_split_pgid(OSDMapRef old_map,
                                   OSDMapRef new_map,
                                   spg_t pgid)
{
  assert(old_map->have_pg_pool(pgid.pool()));
  int old_pgnum = old_map->get_pg_num(pgid.pool());
  if (pgid.ps() < static_cast<unsigned>(old_pgnum)) {
    set<spg_t> children;
    if (pgid.is_split(old_pgnum,
                      new_map->get_pg_num(pgid.pool()), &children)) {
      _start_split(pgid, children);
    }
  } else {
    assert(pgid.ps() < static_cast<unsigned>(new_map->get_pg_num(pgid.pool())));
  }
}

void OSDService::init_splits_between(spg_t pgid,
                                     OSDMapRef frommap,
                                     OSDMapRef tomap)
{
  // First, check whether we can avoid this potentially expensive check
  if (tomap->have_pg_pool(pgid.pool()) &&
      pgid.is_split(
        frommap->get_pg_num(pgid.pool()),
        tomap->get_pg_num(pgid.pool()),
        NULL)) {
    // Ok, a split happened, so we need to walk the osdmaps
    set<spg_t> new_pgs; // pgs to scan on each map
    new_pgs.insert(pgid);
    OSDMapRef curmap(get_map(frommap->get_epoch()));
    for (epoch_t e = frommap->get_epoch() + 1;
         e <= tomap->get_epoch();
         ++e) {
      OSDMapRef nextmap(try_get_map(e));
      if (!nextmap)
        continue;
      set<spg_t> even_newer_pgs; // pgs added in this loop
      for (set<spg_t>::iterator i = new_pgs.begin(); i != new_pgs.end(); ++i) {
        set<spg_t> split_pgs;
        if (i->is_split(curmap->get_pg_num(i->pool()),
                        nextmap->get_pg_num(i->pool()),
                        &split_pgs)) {
          start_split(*i, split_pgs);
          even_newer_pgs.insert(split_pgs.begin(), split_pgs.end());
        }
      }
      new_pgs.insert(even_newer_pgs.begin(), even_newer_pgs.end());
      curmap = nextmap;
    }
    assert(curmap == tomap); // we must have had both frommap and tomap
  }
}

void OSDService::expand_pg_num(OSDMapRef old_map,
                               OSDMapRef new_map)
{
  Mutex::Locker l(in_progress_split_lock);
  for (set<spg_t>::iterator i = in_progress_splits.begin();
       i != in_progress_splits.end();
       ) {
    if (!new_map->have_pg_pool(i->pool())) {
      in_progress_splits.erase(i++);
    } else {
      _maybe_split_pgid(old_map, new_map, *i);
      ++i;
    }
  }
  for (map<spg_t, spg_t>::iterator i = pending_splits.begin();
       i != pending_splits.end();
       ) {
    if (!new_map->have_pg_pool(i->first.pool())) {
      rev_pending_splits.erase(i->second);
      pending_splits.erase(i++);
    } else {
      _maybe_split_pgid(old_map, new_map, i->first);
      ++i;
    }
  }
}

bool OSDService::splitting(spg_t pgid)
{
  Mutex::Locker l(in_progress_split_lock);
  return in_progress_splits.count(pgid) ||
    pending_splits.count(pgid);
}

void OSDService::complete_split(const set<spg_t> &pgs)
{
  Mutex::Locker l(in_progress_split_lock);
  for (set<spg_t>::const_iterator i = pgs.begin();
       i != pgs.end();
       ++i) {
    dout(10) << __func__ << ": Completing split on pg " << *i << dendl;
    assert(!pending_splits.count(*i));
    assert(in_progress_splits.count(*i));
    in_progress_splits.erase(*i);
  }
}

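// Thin pass-throughs so PG code only needs an OSDService handle, not the
// OSD itself.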
void OSDService::need_heartbeat_peer_update()
{
  osd->need_heartbeat_peer_update();
}

void OSDService::pg_stat_queue_enqueue(PG *pg)
{
  osd->pg_stat_queue_enqueue(pg);
}

void OSDService::pg_stat_queue_dequeue(PG *pg)
{
  osd->pg_stat_queue_dequeue(pg);
}

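// Shutdown happens in stages: start_shutdown() stops the timers that could
// queue new background work, shutdown_reserver() drains the reserver
// finisher, and shutdown() tears down the remaining timers, the objecter,
// and drops the cached map references.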
void OSDService::start_shutdown()
{
  {
    Mutex::Locker l(agent_timer_lock);
    agent_timer.shutdown();
  }

  {
    Mutex::Locker l(recovery_sleep_lock);
    recovery_sleep_timer.shutdown();
  }
}

void OSDService::shutdown_reserver()
{
  reserver_finisher.wait_for_empty();
  reserver_finisher.stop();
}

void OSDService::shutdown()
{
  {
    Mutex::Locker l(watch_lock);
    watch_timer.shutdown();
  }

  objecter->shutdown();
  objecter_finisher.wait_for_empty();
  objecter_finisher.stop();

  {
    Mutex::Locker l(recovery_request_lock);
    recovery_request_timer.shutdown();
  }

  {
    Mutex::Locker l(snap_sleep_lock);
    snap_sleep_timer.shutdown();
  }

  {
    Mutex::Locker l(scrub_sleep_lock);
    scrub_sleep_timer.shutdown();
  }

  osdmap = OSDMapRef();
  next_osdmap = OSDMapRef();
}

void OSDService::init()
{
  reserver_finisher.start();
  objecter_finisher.start();
  objecter->set_client_incarnation(0);

  // deprioritize objecter in daemonperf output
  objecter->get_logger()->set_prio_adjust(-3);

  watch_timer.init();
  agent_timer.init();
  snap_sleep_timer.init();
  scrub_sleep_timer.init();

  agent_thread.create("osd_srv_agent");

  if (cct->_conf->osd_recovery_delay_start)
    defer_recovery(cct->_conf->osd_recovery_delay_start);
}

void OSDService::final_init()
{
  objecter->start(osdmap.get());
}

void OSDService::activate_map()
{
  // wake/unwake the tiering agent
  agent_lock.Lock();
  agent_active =
    !osdmap->test_flag(CEPH_OSDMAP_NOTIERAGENT) &&
    osd->is_active();
  agent_cond.Signal();
  agent_lock.Unlock();
}

void OSDService::request_osdmap_update(epoch_t e)
{
  osd->osdmap_subscribe(e, false);
}

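// Cache-tier agent: a dedicated thread walks agent_queue (PGs bucketed by
// evict effort, highest bucket first) and calls PG::agent_work() with a
// flush quota derived from osd_agent_max_ops / osd_agent_max_low_ops.
// When a PG reports no work, AgentTimeoutCB re-arms it after
// osd_agent_delay_time via agent_timer.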
class AgentTimeoutCB : public Context {
  PGRef pg;
public:
  explicit AgentTimeoutCB(PGRef _pg) : pg(_pg) {}
  void finish(int) override {
    pg->agent_choose_mode_restart();
  }
};

void OSDService::agent_entry()
{
  dout(10) << __func__ << " start" << dendl;
  agent_lock.Lock();

  while (!agent_stop_flag) {
    if (agent_queue.empty()) {
      dout(20) << __func__ << " empty queue" << dendl;
      agent_cond.Wait(agent_lock);
      continue;
    }
    uint64_t level = agent_queue.rbegin()->first;
    set<PGRef>& top = agent_queue.rbegin()->second;
    dout(10) << __func__
             << " tiers " << agent_queue.size()
             << ", top is " << level
             << " with pgs " << top.size()
             << ", ops " << agent_ops << "/"
             << cct->_conf->osd_agent_max_ops
             << (agent_active ? " active" : " NOT ACTIVE")
             << dendl;
    dout(20) << __func__ << " oids " << agent_oids << dendl;
    int max = cct->_conf->osd_agent_max_ops - agent_ops;
    int agent_flush_quota = max;
    if (!flush_mode_high_count)
      agent_flush_quota = cct->_conf->osd_agent_max_low_ops - agent_ops;
    if (agent_flush_quota <= 0 || top.empty() || !agent_active) {
      agent_cond.Wait(agent_lock);
      continue;
    }

    if (!agent_valid_iterator || agent_queue_pos == top.end()) {
      agent_queue_pos = top.begin();
      agent_valid_iterator = true;
    }
    PGRef pg = *agent_queue_pos;
    dout(10) << "high_count " << flush_mode_high_count
             << " agent_ops " << agent_ops
             << " flush_quota " << agent_flush_quota << dendl;
    agent_lock.Unlock();
    if (!pg->agent_work(max, agent_flush_quota)) {
      dout(10) << __func__ << " " << pg->get_pgid()
               << " no agent_work, delay for " << cct->_conf->osd_agent_delay_time
               << " seconds" << dendl;

      osd->logger->inc(l_osd_tier_delay);
      // Queue a timer to call agent_choose_mode for this pg after
      // osd_agent_delay_time seconds
      agent_timer_lock.Lock();
      Context *cb = new AgentTimeoutCB(pg);
      agent_timer.add_event_after(cct->_conf->osd_agent_delay_time, cb);
      agent_timer_lock.Unlock();
    }
    agent_lock.Lock();
  }
  agent_lock.Unlock();
  dout(10) << __func__ << " finish" << dendl;
}

void OSDService::agent_stop()
{
  {
    Mutex::Locker l(agent_lock);

    // By this time all ops should be cancelled
    assert(agent_ops == 0);
    // By this time all PGs are shutdown and dequeued
    if (!agent_queue.empty()) {
      set<PGRef>& top = agent_queue.rbegin()->second;
      derr << "agent queue not empty, for example " << (*top.begin())->info.pgid << dendl;
      assert(0 == "agent queue not empty");
    }

    agent_stop_flag = true;
    agent_cond.Signal();
  }
  agent_thread.join();
}

// -------------------------------------

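// promote_throttle_recalibrate() keeps the probability of promoting an
// object into the cache tier (promote_probability_millis, in parts per
// thousand) near the configured object/byte rate targets.  A worked
// example with made-up numbers: with target_obj_sec = 100, dur = 1.0s and
// 4000 promote attempts in the interval, po = 100 * 1.0 * 1000 / 4000 = 25,
// i.e. promote ~25/1000 (2.5%) of attempts to hit the object target; pb is
// the same calculation with the byte target scaled by the observed average
// object size, and the lower of the two wins when both targets are set.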
void OSDService::promote_throttle_recalibrate()
{
  utime_t now = ceph_clock_now();
  double dur = now - last_recalibrate;
  last_recalibrate = now;
  unsigned prob = promote_probability_millis;

  uint64_t target_obj_sec = cct->_conf->osd_tier_promote_max_objects_sec;
  uint64_t target_bytes_sec = cct->_conf->osd_tier_promote_max_bytes_sec;

  unsigned min_prob = 1;

  uint64_t attempts, obj, bytes;
  promote_counter.sample_and_attenuate(&attempts, &obj, &bytes);
  dout(10) << __func__ << " " << attempts << " attempts, promoted "
           << obj << " objects and " << byte_u_t(bytes) << "; target "
           << target_obj_sec << " obj/sec or "
           << byte_u_t(target_bytes_sec) << "/sec"
           << dendl;

  // calculate what the probability *should* be, given the targets
  unsigned new_prob;
  if (attempts && dur > 0) {
    uint64_t avg_size = 1;
    if (obj)
      avg_size = MAX(bytes / obj, 1);
    unsigned po = (double)target_obj_sec * dur * 1000.0 / (double)attempts;
    unsigned pb = (double)target_bytes_sec / (double)avg_size * dur * 1000.0
      / (double)attempts;
    dout(20) << __func__ << " po " << po << " pb " << pb << " avg_size "
             << avg_size << dendl;
    if (target_obj_sec && target_bytes_sec)
      new_prob = MIN(po, pb);
    else if (target_obj_sec)
      new_prob = po;
    else if (target_bytes_sec)
      new_prob = pb;
    else
      new_prob = 1000;
  } else {
    new_prob = 1000;
  }
  dout(20) << __func__ << " new_prob " << new_prob << dendl;

  // correct for persistent skew between target rate and actual rate, adjust
  double ratio = 1.0;
  unsigned actual = 0;
  if (attempts && obj) {
    actual = obj * 1000 / attempts;
    ratio = (double)actual / (double)prob;
    new_prob = (double)new_prob / ratio;
  }
  new_prob = MAX(new_prob, min_prob);
  new_prob = MIN(new_prob, 1000);

  // adjust
  prob = (prob + new_prob) / 2;
  prob = MAX(prob, min_prob);
  prob = MIN(prob, 1000);
  dout(10) << __func__ << " actual " << actual
           << ", actual/prob ratio " << ratio
           << ", adjusted new_prob " << new_prob
           << ", prob " << promote_probability_millis << " -> " << prob
           << dendl;
  promote_probability_millis = prob;

  // set hard limits for this interval to mitigate stampedes
  promote_max_objects = target_obj_sec * osd->OSD_TICK_INTERVAL * 2;
  promote_max_bytes = target_bytes_sec * osd->OSD_TICK_INTERVAL * 2;
}

// -------------------------------------

float OSDService::get_failsafe_full_ratio()
{
  float full_ratio = cct->_conf->osd_failsafe_full_ratio;
  if (full_ratio > 1.0) full_ratio /= 100.0;
  return full_ratio;
}

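// Fullness handling: check_full_status() classifies the current usage
// ratio into NONE < NEARFULL < BACKFILLFULL < FULL < FAILSAFE using
// thresholds from the OSDMap, each clamped to be at least the previous
// one; the is_*() / check_*() helpers below then answer queries against
// that cached state under full_status_lock.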
void OSDService::check_full_status(float ratio)
{
  Mutex::Locker l(full_status_lock);

  cur_ratio = ratio;

  // The OSDMap ratios take precedence.  So if the failsafe is .95 and
  // the admin sets the cluster full to .96, the failsafe moves up to .96
  // too.  (Not that having failsafe == full is ideal, but it's better than
  // dropping writes before the cluster appears full.)
  OSDMapRef osdmap = get_osdmap();
  if (!osdmap || osdmap->get_epoch() == 0) {
    cur_state = NONE;
    return;
  }
  float nearfull_ratio = osdmap->get_nearfull_ratio();
  float backfillfull_ratio = std::max(osdmap->get_backfillfull_ratio(), nearfull_ratio);
  float full_ratio = std::max(osdmap->get_full_ratio(), backfillfull_ratio);
  float failsafe_ratio = std::max(get_failsafe_full_ratio(), full_ratio);

  if (osdmap->require_osd_release < CEPH_RELEASE_LUMINOUS) {
    // use the failsafe for nearfull and full; the mon isn't using the
    // flags anyway because we're mid-upgrade.
    full_ratio = failsafe_ratio;
    backfillfull_ratio = failsafe_ratio;
    nearfull_ratio = failsafe_ratio;
  } else if (full_ratio <= 0 ||
             backfillfull_ratio <= 0 ||
             nearfull_ratio <= 0) {
    derr << __func__ << " full_ratio, backfillfull_ratio or nearfull_ratio is <= 0" << dendl;
    // use failsafe flag.  ick.  the monitor did something wrong or the user
    // did something stupid.
    full_ratio = failsafe_ratio;
    backfillfull_ratio = failsafe_ratio;
    nearfull_ratio = failsafe_ratio;
  }

  string inject;
  s_names new_state;
  if (injectfull_state > NONE && injectfull) {
    new_state = injectfull_state;
    inject = "(Injected)";
  } else if (ratio > failsafe_ratio) {
    new_state = FAILSAFE;
  } else if (ratio > full_ratio) {
    new_state = FULL;
  } else if (ratio > backfillfull_ratio) {
    new_state = BACKFILLFULL;
  } else if (ratio > nearfull_ratio) {
    new_state = NEARFULL;
  } else {
    new_state = NONE;
  }
  dout(20) << __func__ << " cur ratio " << ratio
           << ", nearfull_ratio " << nearfull_ratio
           << ", backfillfull_ratio " << backfillfull_ratio
           << ", full_ratio " << full_ratio
           << ", failsafe_ratio " << failsafe_ratio
           << ", new state " << get_full_state_name(new_state)
           << " " << inject
           << dendl;

  // warn
  if (cur_state != new_state) {
    dout(10) << __func__ << " " << get_full_state_name(cur_state)
             << " -> " << get_full_state_name(new_state) << dendl;
    if (new_state == FAILSAFE) {
      clog->error() << "full status failsafe engaged, dropping updates, now "
                    << (int)roundf(ratio * 100) << "% full";
    } else if (cur_state == FAILSAFE) {
      clog->error() << "full status failsafe disengaged, no longer dropping "
                    << "updates, now " << (int)roundf(ratio * 100) << "% full";
    }
    cur_state = new_state;
  }
}

bool OSDService::need_fullness_update()
{
  OSDMapRef osdmap = get_osdmap();
  s_names cur = NONE;
  if (osdmap->exists(whoami)) {
    if (osdmap->get_state(whoami) & CEPH_OSD_FULL) {
      cur = FULL;
    } else if (osdmap->get_state(whoami) & CEPH_OSD_BACKFILLFULL) {
      cur = BACKFILLFULL;
    } else if (osdmap->get_state(whoami) & CEPH_OSD_NEARFULL) {
      cur = NEARFULL;
    }
  }
  s_names want = NONE;
  if (is_full())
    want = FULL;
  else if (is_backfillfull())
    want = BACKFILLFULL;
  else if (is_nearfull())
    want = NEARFULL;
  return want != cur;
}

bool OSDService::_check_full(s_names type, ostream &ss) const
{
  Mutex::Locker l(full_status_lock);

  if (injectfull && injectfull_state >= type) {
    // injectfull is either a count of the number of times to return
    // failsafe full, or, if -1, a request to always return full
    if (injectfull > 0)
      --injectfull;
    ss << "Injected " << get_full_state_name(type) << " OSD ("
       << (injectfull < 0 ? "set" : std::to_string(injectfull)) << ")";
    return true;
  }

  ss << "current usage is " << cur_ratio;
  return cur_state >= type;
}

bool OSDService::check_failsafe_full(ostream &ss) const
{
  return _check_full(FAILSAFE, ss);
}

bool OSDService::check_full(ostream &ss) const
{
  return _check_full(FULL, ss);
}

bool OSDService::check_backfill_full(ostream &ss) const
{
  return _check_full(BACKFILLFULL, ss);
}

bool OSDService::check_nearfull(ostream &ss) const
{
  return _check_full(NEARFULL, ss);
}

bool OSDService::is_failsafe_full() const
{
  Mutex::Locker l(full_status_lock);
  return cur_state == FAILSAFE;
}

bool OSDService::is_full() const
{
  Mutex::Locker l(full_status_lock);
  return cur_state >= FULL;
}

bool OSDService::is_backfillfull() const
{
  Mutex::Locker l(full_status_lock);
  return cur_state >= BACKFILLFULL;
}

bool OSDService::is_nearfull() const
{
  Mutex::Locker l(full_status_lock);
  return cur_state >= NEARFULL;
}

void OSDService::set_injectfull(s_names type, int64_t count)
{
  Mutex::Locker l(full_status_lock);
  injectfull_state = type;
  injectfull = count;
}

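// Stats reporting: update_osd_stat() reads statfs from the object store,
// folds heartbeat peers and the PG count into osd_stat under stat_lock,
// publishes the perf counters, and feeds the resulting used/total ratio
// into check_full_status().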
osd_stat_t OSDService::set_osd_stat(const struct store_statfs_t &stbuf,
                                    vector<int>& hb_peers,
                                    int num_pgs)
{
  uint64_t bytes = stbuf.total;
  uint64_t used = bytes - stbuf.available;
  uint64_t avail = stbuf.available;

  osd->logger->set(l_osd_stat_bytes, bytes);
  osd->logger->set(l_osd_stat_bytes_used, used);
  osd->logger->set(l_osd_stat_bytes_avail, avail);

  {
    Mutex::Locker l(stat_lock);
    osd_stat.hb_peers.swap(hb_peers);
    osd->op_tracker.get_age_ms_histogram(&osd_stat.op_queue_age_hist);
    osd_stat.kb = bytes >> 10;
    osd_stat.kb_used = used >> 10;
    osd_stat.kb_avail = avail >> 10;
    osd_stat.num_pgs = num_pgs;
    return osd_stat;
  }
}

void OSDService::update_osd_stat(vector<int>& hb_peers)
{
  // load osd stats first
  struct store_statfs_t stbuf;
  int r = osd->store->statfs(&stbuf);
  if (r < 0) {
    derr << "statfs() failed: " << cpp_strerror(r) << dendl;
    return;
  }

  auto new_stat = set_osd_stat(stbuf, hb_peers, osd->get_num_pgs());
  dout(20) << "update_osd_stat " << new_stat << dendl;
  assert(new_stat.kb);
  float ratio = ((float)new_stat.kb_used) / ((float)new_stat.kb);
  check_full_status(ratio);
}

bool OSDService::check_osdmap_full(const set<pg_shard_t> &missing_on)
{
  OSDMapRef osdmap = get_osdmap();
  for (auto shard : missing_on) {
    if (osdmap->get_state(shard.osd) & CEPH_OSD_FULL)
      return true;
  }
  return false;
}

void OSDService::send_message_osd_cluster(int peer, Message *m, epoch_t from_epoch)
{
  OSDMapRef next_map = get_nextmap_reserved();
  // service map is always newer/newest
  assert(from_epoch <= next_map->get_epoch());

  if (next_map->is_down(peer) ||
      next_map->get_info(peer).up_from > from_epoch) {
    m->put();
    release_map(next_map);
    return;
  }
  const entity_inst_t& peer_inst = next_map->get_cluster_inst(peer);
  ConnectionRef peer_con = osd->cluster_messenger->get_connection(peer_inst);
  share_map_peer(peer, peer_con.get(), next_map);
  peer_con->send_message(m);
  release_map(next_map);
}

ConnectionRef OSDService::get_con_osd_cluster(int peer, epoch_t from_epoch)
{
  OSDMapRef next_map = get_nextmap_reserved();
  // service map is always newer/newest
  assert(from_epoch <= next_map->get_epoch());

  if (next_map->is_down(peer) ||
      next_map->get_info(peer).up_from > from_epoch) {
    release_map(next_map);
    return NULL;
  }
  ConnectionRef con = osd->cluster_messenger->get_connection(next_map->get_cluster_inst(peer));
  release_map(next_map);
  return con;
}

pair<ConnectionRef,ConnectionRef> OSDService::get_con_osd_hb(int peer, epoch_t from_epoch)
{
  OSDMapRef next_map = get_nextmap_reserved();
  // service map is always newer/newest
  assert(from_epoch <= next_map->get_epoch());

  pair<ConnectionRef,ConnectionRef> ret;
  if (next_map->is_down(peer) ||
      next_map->get_info(peer).up_from > from_epoch) {
    release_map(next_map);
    return ret;
  }
  ret.first = osd->hb_back_client_messenger->get_connection(next_map->get_hb_back_inst(peer));
  if (next_map->get_hb_front_addr(peer) != entity_addr_t())
    ret.second = osd->hb_front_client_messenger->get_connection(next_map->get_hb_front_inst(peer));
  release_map(next_map);
  return ret;
}


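// pg_temp handling: wanted mappings accumulate in pg_temp_wanted under
// pg_temp_lock; send_pg_temp() batches them into at most two MOSDPGTemp
// messages for the monitor (one forced, one not), _sent_pg_temp() moves
// them to pg_temp_pending, and requeue_pg_temp() folds pending back into
// wanted so they get resent.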
void OSDService::queue_want_pg_temp(pg_t pgid,
                                    const vector<int>& want,
                                    bool forced)
{
  Mutex::Locker l(pg_temp_lock);
  auto p = pg_temp_pending.find(pgid);
  if (p == pg_temp_pending.end() ||
      p->second.acting != want ||
      forced) {
    pg_temp_wanted[pgid] = pg_temp_t{want, forced};
  }
}

void OSDService::remove_want_pg_temp(pg_t pgid)
{
  Mutex::Locker l(pg_temp_lock);
  pg_temp_wanted.erase(pgid);
  pg_temp_pending.erase(pgid);
}

void OSDService::_sent_pg_temp()
{
  pg_temp_pending.insert(make_move_iterator(begin(pg_temp_wanted)),
                         make_move_iterator(end(pg_temp_wanted)));
  pg_temp_wanted.clear();
}

void OSDService::requeue_pg_temp()
{
  Mutex::Locker l(pg_temp_lock);
  // wanted overrides pending.  note that remove_want_pg_temp
  // clears the item out of both.
  unsigned old_wanted = pg_temp_wanted.size();
  unsigned old_pending = pg_temp_pending.size();
  _sent_pg_temp();
  pg_temp_wanted.swap(pg_temp_pending);
  dout(10) << __func__ << " " << old_wanted << " + " << old_pending << " -> "
           << pg_temp_wanted.size() << dendl;
}

std::ostream& operator<<(std::ostream& out,
                         const OSDService::pg_temp_t& pg_temp)
{
  out << pg_temp.acting;
  if (pg_temp.forced) {
    out << " (forced)";
  }
  return out;
}

void OSDService::send_pg_temp()
{
  Mutex::Locker l(pg_temp_lock);
  if (pg_temp_wanted.empty())
    return;
  dout(10) << "send_pg_temp " << pg_temp_wanted << dendl;
  MOSDPGTemp *ms[2] = {nullptr, nullptr};
  for (auto& pg_temp : pg_temp_wanted) {
    auto& m = ms[pg_temp.second.forced];
    if (!m) {
      m = new MOSDPGTemp(osdmap->get_epoch());
      m->forced = pg_temp.second.forced;
    }
    m->pg_temp.emplace(pg_temp.first,
                       pg_temp.second.acting);
  }
  for (auto m : ms) {
    if (m) {
      monc->send_mon_message(m);
    }
  }
  _sent_pg_temp();
}

void OSDService::send_pg_created(pg_t pgid)
{
  dout(20) << __func__ << dendl;
  if (osdmap->require_osd_release >= CEPH_RELEASE_LUMINOUS) {
    monc->send_mon_message(new MOSDPGCreated(pgid));
  }
}

// --------------------------------------
// dispatch

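// peer_map_epoch caches the newest OSDMap epoch each peer OSD is known to
// have, so we only push incremental maps to peers that are actually
// behind; forget_peer_epoch() drops an entry at or below a given epoch.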
epoch_t OSDService::get_peer_epoch(int peer)
{
  Mutex::Locker l(peer_map_epoch_lock);
  map<int,epoch_t>::iterator p = peer_map_epoch.find(peer);
  if (p == peer_map_epoch.end())
    return 0;
  return p->second;
}

epoch_t OSDService::note_peer_epoch(int peer, epoch_t e)
{
  Mutex::Locker l(peer_map_epoch_lock);
  map<int,epoch_t>::iterator p = peer_map_epoch.find(peer);
  if (p != peer_map_epoch.end()) {
    if (p->second < e) {
      dout(10) << "note_peer_epoch osd." << peer << " has " << e << dendl;
      p->second = e;
    } else {
      dout(30) << "note_peer_epoch osd." << peer << " has " << p->second << " >= " << e << dendl;
    }
    return p->second;
  } else {
    dout(10) << "note_peer_epoch osd." << peer << " now has " << e << dendl;
    peer_map_epoch[peer] = e;
    return e;
  }
}

void OSDService::forget_peer_epoch(int peer, epoch_t as_of)
{
  Mutex::Locker l(peer_map_epoch_lock);
  map<int,epoch_t>::iterator p = peer_map_epoch.find(peer);
  if (p != peer_map_epoch.end()) {
    if (p->second <= as_of) {
      dout(10) << "forget_peer_epoch osd." << peer << " as_of " << as_of
               << " had " << p->second << dendl;
      peer_map_epoch.erase(p);
    } else {
      dout(10) << "forget_peer_epoch osd." << peer << " as_of " << as_of
               << " has " << p->second << " - not forgetting" << dendl;
    }
  }
}

bool OSDService::should_share_map(entity_name_t name, Connection *con,
                                  epoch_t epoch, const OSDMapRef& osdmap,
                                  const epoch_t *sent_epoch_p)
{
  dout(20) << "should_share_map "
           << name << " " << con->get_peer_addr()
           << " " << epoch << dendl;

  // does client have old map?
  if (name.is_client()) {
    bool message_sendmap = epoch < osdmap->get_epoch();
    if (message_sendmap && sent_epoch_p) {
      dout(20) << "client session last_sent_epoch: "
               << *sent_epoch_p
               << " versus osdmap epoch " << osdmap->get_epoch() << dendl;
      if (*sent_epoch_p < osdmap->get_epoch()) {
        return true;
      } // else we don't need to send it out again
    }
  }

  if (con->get_messenger() == osd->cluster_messenger &&
      con != osd->cluster_messenger->get_loopback_connection() &&
      osdmap->is_up(name.num()) &&
      (osdmap->get_cluster_addr(name.num()) == con->get_peer_addr() ||
       osdmap->get_hb_back_addr(name.num()) == con->get_peer_addr())) {
    // remember
    epoch_t has = MAX(get_peer_epoch(name.num()), epoch);

    // share?
    if (has < osdmap->get_epoch()) {
      dout(10) << name << " " << con->get_peer_addr()
               << " has old map " << epoch << " < "
               << osdmap->get_epoch() << dendl;
      return true;
    }
  }

  return false;
}

void OSDService::share_map(
  entity_name_t name,
  Connection *con,
  epoch_t epoch,
  OSDMapRef& osdmap,
  epoch_t *sent_epoch_p)
{
  dout(20) << "share_map "
           << name << " " << con->get_peer_addr()
           << " " << epoch << dendl;

  if (!osd->is_active()) {
    /* It is safe not to proceed, as the OSD is not in a healthy state. */
    return;
  }

  bool want_shared = should_share_map(name, con, epoch,
                                      osdmap, sent_epoch_p);

  if (want_shared) {
    if (name.is_client()) {
      dout(10) << name << " has old map " << epoch
               << " < " << osdmap->get_epoch() << dendl;
      // we know the Session is valid or we wouldn't be sending
      if (sent_epoch_p) {
        *sent_epoch_p = osdmap->get_epoch();
      }
      send_incremental_map(epoch, con, osdmap);
    } else if (con->get_messenger() == osd->cluster_messenger &&
               osdmap->is_up(name.num()) &&
               (osdmap->get_cluster_addr(name.num()) == con->get_peer_addr() ||
                osdmap->get_hb_back_addr(name.num()) == con->get_peer_addr())) {
      dout(10) << name << " " << con->get_peer_addr()
               << " has old map " << epoch << " < "
               << osdmap->get_epoch() << dendl;
      note_peer_epoch(name.num(), osdmap->get_epoch());
      send_incremental_map(epoch, con, osdmap);
    }
  }
}

void OSDService::share_map_peer(int peer, Connection *con, OSDMapRef map)
{
  if (!map)
    map = get_osdmap();

  // send map?
  epoch_t pe = get_peer_epoch(peer);
  if (pe) {
    if (pe < map->get_epoch()) {
      send_incremental_map(pe, con, map);
      note_peer_epoch(peer, map->get_epoch());
    } else
      dout(20) << "share_map_peer " << con << " already has epoch " << pe << dendl;
  } else {
    dout(20) << "share_map_peer " << con << " don't know epoch, doing nothing" << dendl;
    // no idea about peer's epoch.
    // ??? send recent ???
    // do nothing.
  }
}

1267
1268 bool OSDService::can_inc_scrubs_pending()
1269 {
1270 bool can_inc = false;
1271 Mutex::Locker l(sched_scrub_lock);
1272
1273 if (scrubs_pending + scrubs_active < cct->_conf->osd_max_scrubs) {
1274 dout(20) << __func__ << " " << scrubs_pending << " -> " << (scrubs_pending+1)
1275 << " (max " << cct->_conf->osd_max_scrubs << ", active " << scrubs_active
1276 << ")" << dendl;
1277 can_inc = true;
1278 } else {
1279 dout(20) << __func__ << " " << scrubs_pending << " + " << scrubs_active
1280 << " active >= max " << cct->_conf->osd_max_scrubs << dendl;
1281 }
1282
1283 return can_inc;
1284 }
1285
1286 bool OSDService::inc_scrubs_pending()
1287 {
1288 bool result = false;
1289
1290 sched_scrub_lock.Lock();
1291 if (scrubs_pending + scrubs_active < cct->_conf->osd_max_scrubs) {
1292 dout(20) << "inc_scrubs_pending " << scrubs_pending << " -> " << (scrubs_pending+1)
1293 << " (max " << cct->_conf->osd_max_scrubs << ", active " << scrubs_active << ")" << dendl;
1294 result = true;
1295 ++scrubs_pending;
1296 } else {
1297 dout(20) << "inc_scrubs_pending " << scrubs_pending << " + " << scrubs_active << " active >= max " << cct->_conf->osd_max_scrubs << dendl;
1298 }
1299 sched_scrub_lock.Unlock();
1300
1301 return result;
1302 }
1303
1304 void OSDService::dec_scrubs_pending()
1305 {
1306 sched_scrub_lock.Lock();
1307 dout(20) << "dec_scrubs_pending " << scrubs_pending << " -> " << (scrubs_pending-1)
1308 << " (max " << cct->_conf->osd_max_scrubs << ", active " << scrubs_active << ")" << dendl;
1309 --scrubs_pending;
1310 assert(scrubs_pending >= 0);
1311 sched_scrub_lock.Unlock();
1312 }
1313
1314 void OSDService::inc_scrubs_active(bool reserved)
1315 {
1316 sched_scrub_lock.Lock();
1317 ++(scrubs_active);
1318 if (reserved) {
1319 --(scrubs_pending);
1320 dout(20) << "inc_scrubs_active " << (scrubs_active-1) << " -> " << scrubs_active
1321 << " (max " << cct->_conf->osd_max_scrubs
1322 << ", pending " << (scrubs_pending+1) << " -> " << scrubs_pending << ")" << dendl;
1323 assert(scrubs_pending >= 0);
1324 } else {
1325 dout(20) << "inc_scrubs_active " << (scrubs_active-1) << " -> " << scrubs_active
1326 << " (max " << cct->_conf->osd_max_scrubs
1327 << ", pending " << scrubs_pending << ")" << dendl;
1328 }
1329 sched_scrub_lock.Unlock();
1330 }
1331
1332 void OSDService::dec_scrubs_active()
1333 {
1334 sched_scrub_lock.Lock();
1335 dout(20) << "dec_scrubs_active " << scrubs_active << " -> " << (scrubs_active-1)
1336 << " (max " << cct->_conf->osd_max_scrubs << ", pending " << scrubs_pending << ")" << dendl;
1337 --scrubs_active;
1338 assert(scrubs_active >= 0);
1339 sched_scrub_lock.Unlock();
1340 }
1341
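// boot_epoch/up_epoch/bind_epoch are read and written together under
// epoch_lock; set_epochs() asserts that each value only moves forward
// (a value of 0 means "reset/unset").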
void OSDService::retrieve_epochs(epoch_t *_boot_epoch, epoch_t *_up_epoch,
                                 epoch_t *_bind_epoch) const
{
  Mutex::Locker l(epoch_lock);
  if (_boot_epoch)
    *_boot_epoch = boot_epoch;
  if (_up_epoch)
    *_up_epoch = up_epoch;
  if (_bind_epoch)
    *_bind_epoch = bind_epoch;
}

void OSDService::set_epochs(const epoch_t *_boot_epoch, const epoch_t *_up_epoch,
                            const epoch_t *_bind_epoch)
{
  Mutex::Locker l(epoch_lock);
  if (_boot_epoch) {
    assert(*_boot_epoch == 0 || *_boot_epoch >= boot_epoch);
    boot_epoch = *_boot_epoch;
  }
  if (_up_epoch) {
    assert(*_up_epoch == 0 || *_up_epoch >= up_epoch);
    up_epoch = *_up_epoch;
  }
  if (_bind_epoch) {
    assert(*_bind_epoch == 0 || *_bind_epoch >= bind_epoch);
    bind_epoch = *_bind_epoch;
  }
}

bool OSDService::prepare_to_stop()
{
  Mutex::Locker l(is_stopping_lock);
  if (get_state() != NOT_STOPPING)
    return false;

  OSDMapRef osdmap = get_osdmap();
  if (osdmap && osdmap->is_up(whoami)) {
    dout(0) << __func__ << " telling mon we are shutting down" << dendl;
    set_state(PREPARING_TO_STOP);
    monc->send_mon_message(new MOSDMarkMeDown(monc->get_fsid(),
                                              osdmap->get_inst(whoami),
                                              osdmap->get_epoch(),
                                              true  // request ack
                                              ));
    utime_t now = ceph_clock_now();
    utime_t timeout;
    timeout.set_from_double(now + cct->_conf->osd_mon_shutdown_timeout);
    while ((ceph_clock_now() < timeout) &&
           (get_state() != STOPPING)) {
      is_stopping_cond.WaitUntil(is_stopping_lock, timeout);
    }
  }
  dout(0) << __func__ << " starting shutdown" << dendl;
  set_state(STOPPING);
  return true;
}

void OSDService::got_stop_ack()
{
  Mutex::Locker l(is_stopping_lock);
  if (get_state() == PREPARING_TO_STOP) {
    dout(0) << __func__ << " starting shutdown" << dendl;
    set_state(STOPPING);
    is_stopping_cond.Signal();
  } else {
    dout(10) << __func__ << " ignoring msg" << dendl;
  }
}

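// Incremental map distribution: build_incremental_map_msg() walks epochs
// backwards from 'to' to 'since', packing incremental blobs where
// available and falling back to one full map otherwise;
// send_incremental_map() first clamps the span using
// osd_map_share_max_epochs and osd_map_message_max, or just sends the
// latest full map when 'since' predates our oldest stored map.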
MOSDMap *OSDService::build_incremental_map_msg(epoch_t since, epoch_t to,
                                               OSDSuperblock& sblock)
{
  MOSDMap *m = new MOSDMap(monc->get_fsid(),
                           osdmap->get_encoding_features());
  m->oldest_map = max_oldest_map;
  m->newest_map = sblock.newest_map;

  for (epoch_t e = to; e > since; e--) {
    bufferlist bl;
    if (e > m->oldest_map && get_inc_map_bl(e, bl)) {
      m->incremental_maps[e].claim(bl);
    } else if (get_map_bl(e, bl)) {
      m->maps[e].claim(bl);
      break;
    } else {
      derr << "since " << since << " to " << to
           << " oldest " << m->oldest_map << " newest " << m->newest_map
           << dendl;
      m->put();
      m = NULL;
      break;
    }
  }
  return m;
}

void OSDService::send_map(MOSDMap *m, Connection *con)
{
  con->send_message(m);
}

void OSDService::send_incremental_map(epoch_t since, Connection *con,
                                      OSDMapRef& osdmap)
{
  epoch_t to = osdmap->get_epoch();
  dout(10) << "send_incremental_map " << since << " -> " << to
           << " to " << con << " " << con->get_peer_addr() << dendl;

  MOSDMap *m = NULL;
  while (!m) {
    OSDSuperblock sblock(get_superblock());
    if (since < sblock.oldest_map) {
      // just send latest full map
      MOSDMap *m = new MOSDMap(monc->get_fsid(),
                               osdmap->get_encoding_features());
      m->oldest_map = max_oldest_map;
      m->newest_map = sblock.newest_map;
      get_map_bl(to, m->maps[to]);
      send_map(m, con);
      return;
    }

    if (to > since && (int64_t)(to - since) > cct->_conf->osd_map_share_max_epochs) {
      dout(10) << " " << (to - since) << " > max " << cct->_conf->osd_map_share_max_epochs
               << ", only sending most recent" << dendl;
      since = to - cct->_conf->osd_map_share_max_epochs;
    }

    if (to - since > (epoch_t)cct->_conf->osd_map_message_max)
      to = since + cct->_conf->osd_map_message_max;
    m = build_incremental_map_msg(since, to, sblock);
  }
  send_map(m, con);
}

bool OSDService::_get_map_bl(epoch_t e, bufferlist& bl)
{
  bool found = map_bl_cache.lookup(e, &bl);
  if (found) {
    if (logger)
      logger->inc(l_osd_map_bl_cache_hit);
    return true;
  }
  if (logger)
    logger->inc(l_osd_map_bl_cache_miss);
  found = store->read(coll_t::meta(),
                      OSD::get_osdmap_pobject_name(e), 0, 0, bl,
                      CEPH_OSD_OP_FLAG_FADVISE_WILLNEED) >= 0;
  if (found) {
    _add_map_bl(e, bl);
  }
  return found;
}

bool OSDService::get_inc_map_bl(epoch_t e, bufferlist& bl)
{
  Mutex::Locker l(map_cache_lock);
  bool found = map_bl_inc_cache.lookup(e, &bl);
  if (found) {
    if (logger)
      logger->inc(l_osd_map_bl_cache_hit);
    return true;
  }
  if (logger)
    logger->inc(l_osd_map_bl_cache_miss);
  found = store->read(coll_t::meta(),
                      OSD::get_inc_osdmap_pobject_name(e), 0, 0, bl,
                      CEPH_OSD_OP_FLAG_FADVISE_WILLNEED) >= 0;
  if (found) {
    _add_map_inc_bl(e, bl);
  }
  return found;
}

void OSDService::_add_map_bl(epoch_t e, bufferlist& bl)
{
  dout(10) << "add_map_bl " << e << " " << bl.length() << " bytes" << dendl;
  // cache a contiguous buffer
  if (bl.get_num_buffers() > 1) {
    bl.rebuild();
  }
  bl.try_assign_to_mempool(mempool::mempool_osd_mapbl);
  map_bl_cache.add(e, bl);
}

void OSDService::_add_map_inc_bl(epoch_t e, bufferlist& bl)
{
  dout(10) << "add_map_inc_bl " << e << " " << bl.length() << " bytes" << dendl;
  // cache a contiguous buffer
  if (bl.get_num_buffers() > 1) {
    bl.rebuild();
  }
  bl.try_assign_to_mempool(mempool::mempool_osd_mapbl);
  map_bl_inc_cache.add(e, bl);
}

void OSDService::pin_map_inc_bl(epoch_t e, bufferlist &bl)
{
  Mutex::Locker l(map_cache_lock);
  // cache a contiguous buffer
  if (bl.get_num_buffers() > 1) {
    bl.rebuild();
  }
  map_bl_inc_cache.pin(e, bl);
}

void OSDService::pin_map_bl(epoch_t e, bufferlist &bl)
{
  Mutex::Locker l(map_cache_lock);
  // cache a contiguous buffer
  if (bl.get_num_buffers() > 1) {
    bl.rebuild();
  }
  map_bl_cache.pin(e, bl);
}

void OSDService::clear_map_bl_cache_pins(epoch_t e)
{
  Mutex::Locker l(map_cache_lock);
  map_bl_inc_cache.clear_pinned(e);
  map_bl_cache.clear_pinned(e);
}

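// Decoded OSDMap cache: _add_map() optionally dedups shared structures
// against a cached map at a nearby epoch; try_get_map() checks the cache
// first and otherwise loads and decodes the stored map blob, returning an
// empty ref if the epoch cannot be loaded.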
OSDMapRef OSDService::_add_map(OSDMap *o)
{
  epoch_t e = o->get_epoch();

  if (cct->_conf->osd_map_dedup) {
    // Dedup against an existing map at a nearby epoch
    OSDMapRef for_dedup = map_cache.lower_bound(e);
    if (for_dedup) {
      OSDMap::dedup(for_dedup.get(), o);
    }
  }
  bool existed;
  OSDMapRef l = map_cache.add(e, o, &existed);
  if (existed) {
    delete o;
  }
  return l;
}

OSDMapRef OSDService::try_get_map(epoch_t epoch)
{
  Mutex::Locker l(map_cache_lock);
  OSDMapRef retval = map_cache.lookup(epoch);
  if (retval) {
    dout(30) << "get_map " << epoch << " - cached" << dendl;
    if (logger) {
      logger->inc(l_osd_map_cache_hit);
    }
    return retval;
  }
  if (logger) {
    logger->inc(l_osd_map_cache_miss);
    epoch_t lb = map_cache.cached_key_lower_bound();
    if (epoch < lb) {
      dout(30) << "get_map " << epoch << " - miss, below lower bound" << dendl;
      logger->inc(l_osd_map_cache_miss_low);
      logger->inc(l_osd_map_cache_miss_low_avg, lb - epoch);
    }
  }

  OSDMap *map = new OSDMap;
  if (epoch > 0) {
    dout(20) << "get_map " << epoch << " - loading and decoding " << map << dendl;
    bufferlist bl;
    if (!_get_map_bl(epoch, bl) || bl.length() == 0) {
      derr << "failed to load OSD map for epoch " << epoch << ", got " << bl.length() << " bytes" << dendl;
      delete map;
      return OSDMapRef();
    }
    map->decode(bl);
  } else {
    dout(20) << "get_map " << epoch << " - return initial " << map << dendl;
  }
  return _add_map(map);
}

// ops


void OSDService::reply_op_error(OpRequestRef op, int err)
{
  reply_op_error(op, err, eversion_t(), 0);
}

void OSDService::reply_op_error(OpRequestRef op, int err, eversion_t v,
                                version_t uv)
{
  const MOSDOp *m = static_cast<const MOSDOp*>(op->get_req());
  assert(m->get_type() == CEPH_MSG_OSD_OP);
  int flags;
  flags = m->get_flags() & (CEPH_OSD_FLAG_ACK|CEPH_OSD_FLAG_ONDISK);

  MOSDOpReply *reply = new MOSDOpReply(m, err, osdmap->get_epoch(), flags,
                                       true);
  reply->set_reply_versions(v, uv);
  m->get_connection()->send_message(reply);
}

void OSDService::handle_misdirected_op(PG *pg, OpRequestRef op)
{
  if (!cct->_conf->osd_debug_misdirected_ops) {
    return;
  }

  const MOSDOp *m = static_cast<const MOSDOp*>(op->get_req());
  assert(m->get_type() == CEPH_MSG_OSD_OP);

  assert(m->get_map_epoch() >= pg->info.history.same_primary_since);

  if (pg->is_ec_pg()) {
    /**
     * OSD recomputes op target based on current OSDMap. With an EC pg, we
     * can get this result:
     * 1) client at map 512 sends an op to osd 3, pg_t 3.9 based on mapping
     *    [CRUSH_ITEM_NONE, 2, 3]/3
     * 2) OSD 3 at map 513 remaps op to osd 3, spg_t 3.9s0 based on mapping
     *    [3, 2, 3]/3
     * 3) PG 3.9s0 dequeues the op at epoch 512 and notices that it isn't primary
     *    -- misdirected op
     * 4) client resends and this time PG 3.9s0 having caught up to 513 gets
     *    it and fulfils it
     *
     * We can't compute the op target based on the sending map epoch due to
     * splitting.  The simplest thing is to detect such cases here and drop
     * them without an error (the client will resend anyway).
     */
    assert(m->get_map_epoch() <= superblock.newest_map);
    OSDMapRef opmap = try_get_map(m->get_map_epoch());
    if (!opmap) {
      dout(7) << __func__ << ": " << *pg << " no longer have map for "
              << m->get_map_epoch() << ", dropping" << dendl;
      return;
    }
    pg_t _pgid = m->get_raw_pg();
    spg_t pgid;
    if ((m->get_flags() & CEPH_OSD_FLAG_PGOP) == 0)
      _pgid = opmap->raw_pg_to_pg(_pgid);
    if (opmap->get_primary_shard(_pgid, &pgid) &&
        pgid.shard != pg->info.pgid.shard) {
      dout(7) << __func__ << ": " << *pg << " primary changed since "
              << m->get_map_epoch() << ", dropping" << dendl;
      return;
    }
  }

  dout(7) << *pg << " misdirected op in " << m->get_map_epoch() << dendl;
  clog->warn() << m->get_source_inst() << " misdirected " << m->get_reqid()
               << " pg " << m->get_raw_pg()
               << " to osd." << whoami
               << " not " << pg->acting
               << " in e" << m->get_map_epoch() << "/" << osdmap->get_epoch();
}

void OSDService::enqueue_back(spg_t pgid, PGQueueable qi)
{
  osd->op_shardedwq.queue(make_pair(pgid, qi));
}

void OSDService::enqueue_front(spg_t pgid, PGQueueable qi)
{
  osd->op_shardedwq.queue_front(make_pair(pgid, qi));
}

void OSDService::queue_for_peering(PG *pg)
{
  peering_wq.queue(pg);
}

void OSDService::queue_for_snap_trim(PG *pg)
{
  dout(10) << "queueing " << *pg << " for snaptrim" << dendl;
  osd->op_shardedwq.queue(
    make_pair(
      pg->info.pgid,
      PGQueueable(
        PGSnapTrim(pg->get_osdmap()->get_epoch()),
        cct->_conf->osd_snap_trim_cost,
        cct->_conf->osd_snap_trim_priority,
        ceph_clock_now(),
        entity_inst_t(),
        pg->get_osdmap()->get_epoch())));
}


// ====================================================================
// OSD

#undef dout_prefix
#define dout_prefix *_dout

// Commands shared between OSD's console and admin console:
namespace ceph {
namespace osd_cmds {

int heap(CephContext& cct, cmdmap_t& cmdmap, Formatter& f, std::ostream& os);

}} // namespace ceph::osd_cmds

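// mkfs: format and mount the object store, then either validate an
// existing superblock (the provided osd id and cluster fsid must match)
// or write a fresh one plus the ancillary meta files; on every path the
// store is unmounted and freed before returning.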
int OSD::mkfs(CephContext *cct, ObjectStore *store, const string &dev,
              uuid_d fsid, int whoami)
{
  int ret;

  ceph::shared_ptr<ObjectStore::Sequencer> osr(
    new ObjectStore::Sequencer("mkfs"));
  OSDSuperblock sb;
  bufferlist sbbl;
  C_SaferCond waiter;

  // if we are fed a uuid for this osd, use it.
  store->set_fsid(cct->_conf->osd_uuid);

  ret = store->mkfs();
  if (ret) {
    derr << "OSD::mkfs: ObjectStore::mkfs failed with error "
         << cpp_strerror(ret) << dendl;
    goto free_store;
  }

  store->set_cache_shards(1);  // doesn't matter for mkfs!

  ret = store->mount();
  if (ret) {
    derr << "OSD::mkfs: couldn't mount ObjectStore: error "
         << cpp_strerror(ret) << dendl;
    goto free_store;
  }

  ret = store->read(coll_t::meta(), OSD_SUPERBLOCK_GOBJECT, 0, 0, sbbl);
  if (ret >= 0) {
    /* if we already have superblock, check content of superblock */
    dout(0) << " have superblock" << dendl;
    bufferlist::iterator p;
    p = sbbl.begin();
    ::decode(sb, p);
    if (whoami != sb.whoami) {
      derr << "provided osd id " << whoami << " != superblock's " << sb.whoami
           << dendl;
      ret = -EINVAL;
      goto umount_store;
    }
    if (fsid != sb.cluster_fsid) {
      derr << "provided cluster fsid " << fsid
           << " != superblock's " << sb.cluster_fsid << dendl;
      ret = -EINVAL;
      goto umount_store;
    }
  } else {
    // create superblock
    sb.cluster_fsid = fsid;
    sb.osd_fsid = store->get_fsid();
    sb.whoami = whoami;
    sb.compat_features = get_osd_initial_compat_set();

    bufferlist bl;
    ::encode(sb, bl);

    ObjectStore::Transaction t;
    t.create_collection(coll_t::meta(), 0);
    t.write(coll_t::meta(), OSD_SUPERBLOCK_GOBJECT, 0, bl.length(), bl);
    ret = store->apply_transaction(osr.get(), std::move(t));
    if (ret) {
      derr << "OSD::mkfs: error while writing OSD_SUPERBLOCK_GOBJECT: "
           << "apply_transaction returned " << cpp_strerror(ret) << dendl;
      goto umount_store;
    }
  }

  if (!osr->flush_commit(&waiter)) {
    waiter.wait();
  }

  ret = write_meta(cct, store, sb.cluster_fsid, sb.osd_fsid, whoami);
  if (ret) {
    derr << "OSD::mkfs: failed to write fsid file: error "
         << cpp_strerror(ret) << dendl;
    goto umount_store;
  }

umount_store:
  store->umount();
free_store:
  delete store;
  return ret;
}

1832 int OSD::write_meta(CephContext *cct, ObjectStore *store, uuid_d& cluster_fsid, uuid_d& osd_fsid, int whoami)
1833 {
1834 char val[80];
1835 int r;
1836
1837 snprintf(val, sizeof(val), "%s", CEPH_OSD_ONDISK_MAGIC);
1838 r = store->write_meta("magic", val);
1839 if (r < 0)
1840 return r;
1841
1842 snprintf(val, sizeof(val), "%d", whoami);
1843 r = store->write_meta("whoami", val);
1844 if (r < 0)
1845 return r;
1846
1847 cluster_fsid.print(val);
1848 r = store->write_meta("ceph_fsid", val);
1849 if (r < 0)
1850 return r;
1851
1852 string key = cct->_conf->get_val<string>("key");
1853 if (key.size()) {
1854 r = store->write_meta("osd_key", key);
1855 if (r < 0)
1856 return r;
1857 } else {
1858 string keyfile = cct->_conf->get_val<string>("keyfile");
1859 if (!keyfile.empty()) {
1860 bufferlist keybl;
1861 string err;
1862 if (keyfile == "-") {
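// base64 emits 4 output chars for every 3 input bytes, so an n-byte
// secret encodes to at most ceil(n/3)*4 chars; the assert below checks
// that 1MB comfortably bounds a base64-encoded CryptoKey read from stdin.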
1863 static_assert(1024 * 1024 >
1864 (sizeof(CryptoKey) - sizeof(bufferptr) +
1865 sizeof(__u16) + 16 /* AES_KEY_LEN */ + 3 - 1) / 3. * 4.,
1866 "1MB should be enough for a base64 encoded CryptoKey");
1867 r = keybl.read_fd(STDIN_FILENO, 1024 * 1024);
1868 } else {
1869 r = keybl.read_file(keyfile.c_str(), &err);
1870 }
1871 if (r < 0) {
1872 derr << __func__ << " failed to read keyfile " << keyfile << ": "
1873 << err << ": " << cpp_strerror(r) << dendl;
1874 return r;
1875 }
1876 r = store->write_meta("osd_key", keybl.to_str());
1877 if (r < 0)
1878 return r;
1879 }
1880 }
1881
1882 r = store->write_meta("ready", "ready");
1883 if (r < 0)
1884 return r;
1885
1886 return 0;
1887 }
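// After a successful write_meta() the store's meta area holds (at least):
//   magic     - CEPH_OSD_ONDISK_MAGIC
//   whoami    - this OSD's id
//   ceph_fsid - the cluster fsid
//   osd_key   - only when a key/keyfile was supplied
//   ready     - "ready" marker, written last
// peek_meta() below reads magic/whoami/ceph_fsid (plus "fsid") back out.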
1888
1889 int OSD::peek_meta(ObjectStore *store, std::string& magic,
1890 uuid_d& cluster_fsid, uuid_d& osd_fsid, int& whoami)
1891 {
1892 string val;
1893
1894 int r = store->read_meta("magic", &val);
1895 if (r < 0)
1896 return r;
1897 magic = val;
1898
1899 r = store->read_meta("whoami", &val);
1900 if (r < 0)
1901 return r;
1902 whoami = atoi(val.c_str());
1903
1904 r = store->read_meta("ceph_fsid", &val);
1905 if (r < 0)
1906 return r;
1907 r = cluster_fsid.parse(val.c_str());
1908 if (!r)
1909 return -EINVAL;
1910
1911 r = store->read_meta("fsid", &val);
1912 if (r < 0) {
1913 osd_fsid = uuid_d();
1914 } else {
1915 r = osd_fsid.parse(val.c_str());
1916 if (!r)
1917 return -EINVAL;
1918 }
1919
1920 return 0;
1921 }
1922
1923
1924 #undef dout_prefix
1925 #define dout_prefix _prefix(_dout, whoami, get_osdmap_epoch())
1926
1927 // cons/des
1928
1929 OSD::OSD(CephContext *cct_, ObjectStore *store_,
1930 int id,
1931 Messenger *internal_messenger,
1932 Messenger *external_messenger,
1933 Messenger *hb_client_front,
1934 Messenger *hb_client_back,
1935 Messenger *hb_front_serverm,
1936 Messenger *hb_back_serverm,
1937 Messenger *osdc_messenger,
1938 MonClient *mc,
1939 const std::string &dev, const std::string &jdev) :
1940 Dispatcher(cct_),
1941 osd_lock("OSD::osd_lock"),
1942 tick_timer(cct, osd_lock),
1943 tick_timer_lock("OSD::tick_timer_lock"),
1944 tick_timer_without_osd_lock(cct, tick_timer_lock),
1945 authorize_handler_cluster_registry(new AuthAuthorizeHandlerRegistry(cct,
1946 cct->_conf->auth_supported.empty() ?
1947 cct->_conf->auth_cluster_required :
1948 cct->_conf->auth_supported)),
1949 authorize_handler_service_registry(new AuthAuthorizeHandlerRegistry(cct,
1950 cct->_conf->auth_supported.empty() ?
1951 cct->_conf->auth_service_required :
1952 cct->_conf->auth_supported)),
1953 cluster_messenger(internal_messenger),
1954 client_messenger(external_messenger),
1955 objecter_messenger(osdc_messenger),
1956 monc(mc),
1957 mgrc(cct_, client_messenger),
1958 logger(NULL),
1959 recoverystate_perf(NULL),
1960 store(store_),
1961 log_client(cct, client_messenger, &mc->monmap, LogClient::NO_FLAGS),
1962 clog(log_client.create_channel()),
1963 whoami(id),
1964 dev_path(dev), journal_path(jdev),
1965 store_is_rotational(store->is_rotational()),
1966 trace_endpoint("0.0.0.0", 0, "osd"),
1967 asok_hook(NULL),
1968 osd_compat(get_osd_compat_set()),
1969 peering_tp(cct, "OSD::peering_tp", "tp_peering",
1970 cct->_conf->osd_peering_wq_threads,
1971 "osd_peering_tp_threads"),
1972 osd_op_tp(cct, "OSD::osd_op_tp", "tp_osd_tp",
1973 get_num_op_threads()),
1974 disk_tp(cct, "OSD::disk_tp", "tp_osd_disk", cct->_conf->osd_disk_threads, "osd_disk_threads"),
1975 command_tp(cct, "OSD::command_tp", "tp_osd_cmd", 1),
1976 session_waiting_lock("OSD::session_waiting_lock"),
1977 osdmap_subscribe_lock("OSD::osdmap_subscribe_lock"),
1978 heartbeat_lock("OSD::heartbeat_lock"),
1979 heartbeat_stop(false),
1980 heartbeat_need_update(true),
1981 hb_front_client_messenger(hb_client_front),
1982 hb_back_client_messenger(hb_client_back),
1983 hb_front_server_messenger(hb_front_serverm),
1984 hb_back_server_messenger(hb_back_serverm),
1985 daily_loadavg(0.0),
1986 heartbeat_thread(this),
1987 heartbeat_dispatcher(this),
1988 op_tracker(cct, cct->_conf->osd_enable_op_tracker,
1989 cct->_conf->osd_num_op_tracker_shard),
1990 test_ops_hook(NULL),
1991 op_queue(get_io_queue()),
1992 op_prio_cutoff(get_io_prio_cut()),
1993 op_shardedwq(
1994 get_num_op_shards(),
1995 this,
1996 cct->_conf->osd_op_thread_timeout,
1997 cct->_conf->osd_op_thread_suicide_timeout,
1998 &osd_op_tp),
1999 peering_wq(
2000 this,
2001 cct->_conf->osd_op_thread_timeout,
2002 cct->_conf->osd_op_thread_suicide_timeout,
2003 &peering_tp),
2004 map_lock("OSD::map_lock"),
2005 pg_map_lock("OSD::pg_map_lock"),
2006 last_pg_create_epoch(0),
2007 mon_report_lock("OSD::mon_report_lock"),
2008 stats_ack_timeout(cct->_conf->osd_mon_ack_timeout),
2009 up_thru_wanted(0),
2010 requested_full_first(0),
2011 requested_full_last(0),
2012 pg_stat_queue_lock("OSD::pg_stat_queue_lock"),
2013 osd_stat_updated(false),
2014 pg_stat_tid(0), pg_stat_tid_flushed(0),
2015 command_wq(
2016 this,
2017 cct->_conf->osd_command_thread_timeout,
2018 cct->_conf->osd_command_thread_suicide_timeout,
2019 &command_tp),
2020 remove_wq(
2021 cct,
2022 store,
2023 cct->_conf->osd_remove_thread_timeout,
2024 cct->_conf->osd_remove_thread_suicide_timeout,
2025 &disk_tp),
2026 service(this)
2027 {
2028 monc->set_messenger(client_messenger);
2029 op_tracker.set_complaint_and_threshold(cct->_conf->osd_op_complaint_time,
2030 cct->_conf->osd_op_log_threshold);
2031 op_tracker.set_history_size_and_duration(cct->_conf->osd_op_history_size,
2032 cct->_conf->osd_op_history_duration);
2033 op_tracker.set_history_slow_op_size_and_threshold(cct->_conf->osd_op_history_slow_op_size,
2034 cct->_conf->osd_op_history_slow_op_threshold);
2035 #ifdef WITH_BLKIN
2036 std::stringstream ss;
2037 ss << "osd." << whoami;
2038 trace_endpoint.copy_name(ss.str());
2039 #endif
2040 }
2041
2042 OSD::~OSD()
2043 {
2044 delete authorize_handler_cluster_registry;
2045 delete authorize_handler_service_registry;
2046 delete class_handler;
2047 cct->get_perfcounters_collection()->remove(recoverystate_perf);
2048 cct->get_perfcounters_collection()->remove(logger);
2049 delete recoverystate_perf;
2050 delete logger;
2051 delete store;
2052 }
2053
2054 double OSD::get_tick_interval() const
2055 {
2056 // vary +/- 5% to avoid scrub scheduling livelocks
2057 constexpr auto delta = 0.05;
2058 std::default_random_engine rng{static_cast<unsigned>(whoami)};
2059 return (OSD_TICK_INTERVAL *
2060 std::uniform_real_distribution<>{1.0 - delta, 1.0 + delta}(rng));
2061 }
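// Worked example (assuming OSD_TICK_INTERVAL is 1 second): the result is
// drawn uniformly from [0.95, 1.05]s. Because the engine is re-seeded
// with whoami on every call, each OSD gets its own stable offset, so
// ticks stay decorrelated across OSDs without drifting on a single OSD.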
2062
2063 void cls_initialize(ClassHandler *ch);
2064
2065 void OSD::handle_signal(int signum)
2066 {
2067 assert(signum == SIGINT || signum == SIGTERM);
2068 derr << "*** Got signal " << sig_str(signum) << " ***" << dendl;
2069 shutdown();
2070 }
2071
2072 int OSD::pre_init()
2073 {
2074 Mutex::Locker lock(osd_lock);
2075 if (is_stopping())
2076 return 0;
2077
2078 if (store->test_mount_in_use()) {
2079 derr << "OSD::pre_init: object store '" << dev_path << "' is "
2080 << "currently in use. (Is ceph-osd already running?)" << dendl;
2081 return -EBUSY;
2082 }
2083
2084 cct->_conf->add_observer(this);
2085 return 0;
2086 }
2087
2088 // asok
2089
2090 class OSDSocketHook : public AdminSocketHook {
2091 OSD *osd;
2092 public:
2093 explicit OSDSocketHook(OSD *o) : osd(o) {}
2094 bool call(std::string admin_command, cmdmap_t& cmdmap, std::string format,
2095 bufferlist& out) override {
2096 stringstream ss;
2097 bool r = osd->asok_command(admin_command, cmdmap, format, ss);
2098 out.append(ss);
2099 return r;
2100 }
2101 };
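// A hedged usage sketch: commands registered through this hook are
// typically driven via the admin socket, e.g. (default socket path
// assumed):
//   ceph daemon /var/run/ceph/ceph-osd.0.asok status
//   ceph daemon /var/run/ceph/ceph-osd.0.asok dump_ops_in_flight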
2102
2103 bool OSD::asok_command(string admin_command, cmdmap_t& cmdmap, string format,
2104 ostream& ss)
2105 {
2106 Formatter *f = Formatter::create(format, "json-pretty", "json-pretty");
2107 if (admin_command == "status") {
2108 f->open_object_section("status");
2109 f->dump_stream("cluster_fsid") << superblock.cluster_fsid;
2110 f->dump_stream("osd_fsid") << superblock.osd_fsid;
2111 f->dump_unsigned("whoami", superblock.whoami);
2112 f->dump_string("state", get_state_name(get_state()));
2113 f->dump_unsigned("oldest_map", superblock.oldest_map);
2114 f->dump_unsigned("newest_map", superblock.newest_map);
2115 {
2116 RWLock::RLocker l(pg_map_lock);
2117 f->dump_unsigned("num_pgs", pg_map.size());
2118 }
2119 f->close_section();
2120 } else if (admin_command == "flush_journal") {
2121 store->flush_journal();
2122 } else if (admin_command == "dump_ops_in_flight" ||
2123 admin_command == "ops" ||
2124 admin_command == "dump_blocked_ops" ||
2125 admin_command == "dump_historic_ops" ||
2126 admin_command == "dump_historic_ops_by_duration" ||
2127 admin_command == "dump_historic_slow_ops") {
2128
2129 const string error_str = "op_tracker tracking is not enabled now, so no ops are currently tracked, \
2130 not even those that get stuck. Please enable \"osd_enable_op_tracker\", and the tracker \
2131 will start to track new ops received afterwards.";
2132
2133 set<string> filters;
2134 vector<string> filter_str;
2135 if (cmd_getval(cct, cmdmap, "filterstr", filter_str)) {
2136 copy(filter_str.begin(), filter_str.end(),
2137 inserter(filters, filters.end()));
2138 }
2139
2140 if (admin_command == "dump_ops_in_flight" ||
2141 admin_command == "ops") {
2142 if (!op_tracker.dump_ops_in_flight(f, false, filters)) {
2143 ss << error_str;
2144 }
2145 }
2146 if (admin_command == "dump_blocked_ops") {
2147 if (!op_tracker.dump_ops_in_flight(f, true, filters)) {
2148 ss << error_str;
2149 }
2150 }
2151 if (admin_command == "dump_historic_ops") {
2152 if (!op_tracker.dump_historic_ops(f, false, filters)) {
2153 ss << error_str;
2154 }
2155 }
2156 if (admin_command == "dump_historic_ops_by_duration") {
2157 if (!op_tracker.dump_historic_ops(f, true, filters)) {
2158 ss << error_str;
2159 }
2160 }
2161 if (admin_command == "dump_historic_slow_ops") {
2162 if (!op_tracker.dump_historic_slow_ops(f, filters)) {
2163 ss << error_str;
2164 }
2165 }
2166 } else if (admin_command == "dump_op_pq_state") {
2167 f->open_object_section("pq");
2168 op_shardedwq.dump(f);
2169 f->close_section();
2170 } else if (admin_command == "dump_blacklist") {
2171 list<pair<entity_addr_t,utime_t> > bl;
2172 OSDMapRef curmap = service.get_osdmap();
2173
2174 f->open_array_section("blacklist");
2175 curmap->get_blacklist(&bl);
2176 for (list<pair<entity_addr_t,utime_t> >::iterator it = bl.begin();
2177 it != bl.end(); ++it) {
2178 f->open_object_section("entry");
2179 f->open_object_section("entity_addr_t");
2180 it->first.dump(f);
2181 f->close_section(); //entity_addr_t
2182 it->second.localtime(f->dump_stream("expire_time"));
2183 f->close_section(); //entry
2184 }
2185 f->close_section(); //blacklist
2186 } else if (admin_command == "dump_watchers") {
2187 list<obj_watch_item_t> watchers;
2188 // scan pg's
2189 {
2190 Mutex::Locker l(osd_lock);
2191 RWLock::RLocker l2(pg_map_lock);
2192 for (ceph::unordered_map<spg_t,PG*>::iterator it = pg_map.begin();
2193 it != pg_map.end();
2194 ++it) {
2195
2196 list<obj_watch_item_t> pg_watchers;
2197 PG *pg = it->second;
2198 pg->lock();
2199 pg->get_watchers(pg_watchers);
2200 pg->unlock();
2201 watchers.splice(watchers.end(), pg_watchers);
2202 }
2203 }
2204
2205 f->open_array_section("watchers");
2206 for (list<obj_watch_item_t>::iterator it = watchers.begin();
2207 it != watchers.end(); ++it) {
2208
2209 f->open_object_section("watch");
2210
2211 f->dump_string("namespace", it->obj.nspace);
2212 f->dump_string("object", it->obj.oid.name);
2213
2214 f->open_object_section("entity_name");
2215 it->wi.name.dump(f);
2216 f->close_section(); //entity_name_t
2217
2218 f->dump_unsigned("cookie", it->wi.cookie);
2219 f->dump_unsigned("timeout", it->wi.timeout_seconds);
2220
2221 f->open_object_section("entity_addr_t");
2222 it->wi.addr.dump(f);
2223 f->close_section(); //entity_addr_t
2224
2225 f->close_section(); //watch
2226 }
2227
2228 f->close_section(); //watchers
2229 } else if (admin_command == "dump_reservations") {
2230 f->open_object_section("reservations");
2231 f->open_object_section("local_reservations");
2232 service.local_reserver.dump(f);
2233 f->close_section();
2234 f->open_object_section("remote_reservations");
2235 service.remote_reserver.dump(f);
2236 f->close_section();
2237 f->close_section();
2238 } else if (admin_command == "get_latest_osdmap") {
2239 get_latest_osdmap();
2240 } else if (admin_command == "heap") {
2241 auto result = ceph::osd_cmds::heap(*cct, cmdmap, *f, ss);
2242
2243 // Note: Failed heap profile commands won't necessarily trigger an error:
2244 f->open_object_section("result");
2245 f->dump_string("error", cpp_strerror(result));
2246 f->dump_bool("success", result >= 0);
2247 f->close_section();
2248 } else if (admin_command == "set_heap_property") {
2249 string property;
2250 int64_t value = 0;
2251 string error;
2252 bool success = false;
2253 if (!cmd_getval(cct, cmdmap, "property", property)) {
2254 error = "unable to get property";
2255 success = false;
2256 } else if (!cmd_getval(cct, cmdmap, "value", value)) {
2257 error = "unable to get value";
2258 success = false;
2259 } else if (value < 0) {
2260 error = "negative value not allowed";
2261 success = false;
2262 } else if (!ceph_heap_set_numeric_property(property.c_str(), (size_t)value)) {
2263 error = "invalid property";
2264 success = false;
2265 } else {
2266 success = true;
2267 }
2268 f->open_object_section("result");
2269 f->dump_string("error", error);
2270 f->dump_bool("success", success);
2271 f->close_section();
2272 } else if (admin_command == "get_heap_property") {
2273 string property;
2274 size_t value = 0;
2275 string error;
2276 bool success = false;
2277 if (!cmd_getval(cct, cmdmap, "property", property)) {
2278 error = "unable to get property";
2279 success = false;
2280 } else if (!ceph_heap_get_numeric_property(property.c_str(), &value)) {
2281 error = "invalid property";
2282 success = false;
2283 } else {
2284 success = true;
2285 }
2286 f->open_object_section("result");
2287 f->dump_string("error", error);
2288 f->dump_bool("success", success);
2289 f->dump_int("value", value);
2290 f->close_section();
2291 } else if (admin_command == "dump_objectstore_kv_stats") {
2292 store->get_db_statistics(f);
2293 } else if (admin_command == "dump_scrubs") {
2294 service.dumps_scrub(f);
2295 } else if (admin_command == "calc_objectstore_db_histogram") {
2296 store->generate_db_histogram(f);
2297 } else if (admin_command == "flush_store_cache") {
2298 store->flush_cache();
2299 } else if (admin_command == "dump_pgstate_history") {
2300 f->open_object_section("pgstate_history");
2301 RWLock::RLocker l2(pg_map_lock);
2302 for (ceph::unordered_map<spg_t,PG*>::iterator it = pg_map.begin();
2303 it != pg_map.end();
2304 ++it) {
2305
2306 PG *pg = it->second;
2307 f->dump_stream("pg") << pg->get_pgid();
2308 pg->lock();
2309 pg->pgstate_history.dump(f);
2310 pg->unlock();
2311 }
2312 f->close_section();
2313 } else if (admin_command == "compact") {
2314 dout(1) << "triggering manual compaction" << dendl;
2315 auto start = ceph::coarse_mono_clock::now();
2316 store->compact();
2317 auto end = ceph::coarse_mono_clock::now();
2318 auto time_span = chrono::duration_cast<chrono::duration<double>>(end - start);
2319 dout(1) << "finished manual compaction in "
2320 << time_span.count()
2321 << " seconds" << dendl;
2322 f->open_object_section("compact_result");
2323 f->dump_float("elapsed_time", time_span.count());
2324 f->close_section();
2325 } else {
2326 assert(0 == "broken asok registration");
2327 }
2328 f->flush(ss);
2329 delete f;
2330 return true;
2331 }
2332
2333 class TestOpsSocketHook : public AdminSocketHook {
2334 OSDService *service;
2335 ObjectStore *store;
2336 public:
2337 TestOpsSocketHook(OSDService *s, ObjectStore *st) : service(s), store(st) {}
2338 bool call(std::string command, cmdmap_t& cmdmap, std::string format,
2339 bufferlist& out) override {
2340 stringstream ss;
2341 test_ops(service, store, command, cmdmap, ss);
2342 out.append(ss);
2343 return true;
2344 }
2345 void test_ops(OSDService *service, ObjectStore *store,
2346 const std::string &command, cmdmap_t& cmdmap, ostream &ss);
2347
2348 };
2349
2350 class OSD::C_Tick : public Context {
2351 OSD *osd;
2352 public:
2353 explicit C_Tick(OSD *o) : osd(o) {}
2354 void finish(int r) override {
2355 osd->tick();
2356 }
2357 };
2358
2359 class OSD::C_Tick_WithoutOSDLock : public Context {
2360 OSD *osd;
2361 public:
2362 explicit C_Tick_WithoutOSDLock(OSD *o) : osd(o) {}
2363 void finish(int r) override {
2364 osd->tick_without_osd_lock();
2365 }
2366 };
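// Two tick contexts exist because tick() runs under osd_lock while
// tick_without_osd_lock() is driven by a separate timer guarded only by
// tick_timer_lock; periodic work on the second path keeps running even
// when osd_lock is held for a long stretch.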
2367
2368 int OSD::enable_disable_fuse(bool stop)
2369 {
2370 #ifdef HAVE_LIBFUSE
2371 int r;
2372 string mntpath = cct->_conf->osd_data + "/fuse";
2373 if (fuse_store && (stop || !cct->_conf->osd_objectstore_fuse)) {
2374 dout(1) << __func__ << " disabling" << dendl;
2375 fuse_store->stop();
2376 delete fuse_store;
2377 fuse_store = NULL;
2378 r = ::rmdir(mntpath.c_str());
2379 if (r < 0) {
2380 r = -errno;
2381 derr << __func__ << " failed to rmdir " << mntpath << ": "
2382 << cpp_strerror(r) << dendl;
2383 return r;
2384 }
2385 return 0;
2386 }
2387 if (!fuse_store && cct->_conf->osd_objectstore_fuse) {
2388 dout(1) << __func__ << " enabling" << dendl;
2389 r = ::mkdir(mntpath.c_str(), 0700);
2390 if (r < 0)
2391 r = -errno;
2392 if (r < 0 && r != -EEXIST) {
2393 derr << __func__ << " unable to create " << mntpath << ": "
2394 << cpp_strerror(r) << dendl;
2395 return r;
2396 }
2397 fuse_store = new FuseStore(store, mntpath);
2398 r = fuse_store->start();
2399 if (r < 0) {
2400 derr << __func__ << " unable to start fuse: " << cpp_strerror(r) << dendl;
2401 delete fuse_store;
2402 fuse_store = NULL;
2403 return r;
2404 }
2405 }
2406 #endif // HAVE_LIBFUSE
2407 return 0;
2408 }
2409
2410 int OSD::get_num_op_shards()
2411 {
2412 if (cct->_conf->osd_op_num_shards)
2413 return cct->_conf->osd_op_num_shards;
2414 if (store_is_rotational)
2415 return cct->_conf->osd_op_num_shards_hdd;
2416 else
2417 return cct->_conf->osd_op_num_shards_ssd;
2418 }
2419
2420 int OSD::get_num_op_threads()
2421 {
2422 if (cct->_conf->osd_op_num_threads_per_shard)
2423 return get_num_op_shards() * cct->_conf->osd_op_num_threads_per_shard;
2424 if (store_is_rotational)
2425 return get_num_op_shards() * cct->_conf->osd_op_num_threads_per_shard_hdd;
2426 else
2427 return get_num_op_shards() * cct->_conf->osd_op_num_threads_per_shard_ssd;
2428 }
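// Worked example, assuming stock defaults (osd_op_num_shards_hdd = 5,
// osd_op_num_threads_per_shard_hdd = 1, osd_op_num_shards_ssd = 8,
// osd_op_num_threads_per_shard_ssd = 2; values assumed, check your
// build): a rotational store gets 5 shards x 1 thread = 5 op threads,
// a flash store 8 x 2 = 16.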
2429
2430 float OSD::get_osd_recovery_sleep()
2431 {
2432 if (cct->_conf->osd_recovery_sleep)
2433 return cct->_conf->osd_recovery_sleep;
2434 if (!store_is_rotational && !journal_is_rotational)
2435 return cct->_conf->osd_recovery_sleep_ssd;
2436 else if (store_is_rotational && !journal_is_rotational)
2437 return cct->_conf->get_val<double>("osd_recovery_sleep_hybrid");
2438 else
2439 return cct->_conf->osd_recovery_sleep_hdd;
2440 }
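// Decision table for the recovery sleep above:
//   osd_recovery_sleep set      -> that value wins
//   data=flash, journal=flash   -> osd_recovery_sleep_ssd
//   data=hdd,   journal=flash   -> osd_recovery_sleep_hybrid
//   otherwise                   -> osd_recovery_sleep_hdd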
2441
2442 int OSD::init()
2443 {
2444 CompatSet initial, diff;
2445 Mutex::Locker lock(osd_lock);
2446 if (is_stopping())
2447 return 0;
2448
2449 tick_timer.init();
2450 tick_timer_without_osd_lock.init();
2451 service.recovery_request_timer.init();
2452 service.recovery_sleep_timer.init();
2453
2454 // mount.
2455 dout(2) << "init " << dev_path
2456 << " (looks like " << (store_is_rotational ? "hdd" : "ssd") << ")"
2457 << dendl;
2458 dout(2) << "journal " << journal_path << dendl;
2459 assert(store); // call pre_init() first!
2460
2461 store->set_cache_shards(get_num_op_shards());
2462
2463 int r = store->mount();
2464 if (r < 0) {
2465 derr << "OSD:init: unable to mount object store" << dendl;
2466 return r;
2467 }
2468 journal_is_rotational = store->is_journal_rotational();
2469 dout(2) << "journal looks like " << (journal_is_rotational ? "hdd" : "ssd")
2470 << dendl;
2471
2472 enable_disable_fuse(false);
2473
2474 dout(2) << "boot" << dendl;
2475
2476 // initialize the daily loadavg with current 15min loadavg
2477 double loadavgs[3];
2478 if (getloadavg(loadavgs, 3) == 3) {
2479 daily_loadavg = loadavgs[2];
2480 } else {
2481 derr << "OSD::init() : couldn't read loadavgs\n" << dendl;
2482 daily_loadavg = 1.0;
2483 }
2484
2485 int rotating_auth_attempts = 0;
2486
2487 // sanity check long object name handling
2488 {
2489 hobject_t l;
2490 l.oid.name = string(cct->_conf->osd_max_object_name_len, 'n');
2491 l.set_key(string(cct->_conf->osd_max_object_name_len, 'k'));
2492 l.nspace = string(cct->_conf->osd_max_object_namespace_len, 's');
2493 r = store->validate_hobject_key(l);
2494 if (r < 0) {
2495 derr << "backend (" << store->get_type() << ") is unable to support max "
2496 << "object name[space] len" << dendl;
2497 derr << " osd max object name len = "
2498 << cct->_conf->osd_max_object_name_len << dendl;
2499 derr << " osd max object namespace len = "
2500 << cct->_conf->osd_max_object_namespace_len << dendl;
2501 derr << cpp_strerror(r) << dendl;
2502 if (cct->_conf->osd_check_max_object_name_len_on_startup) {
2503 goto out;
2504 }
2505 derr << "osd_check_max_object_name_len_on_startup = false, starting anyway"
2506 << dendl;
2507 } else {
2508 dout(20) << "configured osd_max_object_name[space]_len looks ok" << dendl;
2509 }
2510 }
2511
2512 // read superblock
2513 r = read_superblock();
2514 if (r < 0) {
2515 derr << "OSD::init(): unable to read osd superblock" << dendl;
2516 r = -EINVAL;
2517 goto out;
2518 }
2519
2520 if (osd_compat.compare(superblock.compat_features) < 0) {
2521 derr << "The disk uses features unsupported by the executable." << dendl;
2522 derr << " ondisk features " << superblock.compat_features << dendl;
2523 derr << " daemon features " << osd_compat << dendl;
2524
2525 if (osd_compat.writeable(superblock.compat_features)) {
2526 CompatSet diff = osd_compat.unsupported(superblock.compat_features);
2527 derr << "it is still writeable, though. Missing features: " << diff << dendl;
2528 r = -EOPNOTSUPP;
2529 goto out;
2530 }
2531 else {
2532 CompatSet diff = osd_compat.unsupported(superblock.compat_features);
2533 derr << "Cannot write to disk! Missing features: " << diff << dendl;
2534 r = -EOPNOTSUPP;
2535 goto out;
2536 }
2537 }
2538
2539 assert_warn(whoami == superblock.whoami);
2540 if (whoami != superblock.whoami) {
2541 derr << "OSD::init: superblock says osd"
2542 << superblock.whoami << " but I am osd." << whoami << dendl;
2543 r = -EINVAL;
2544 goto out;
2545 }
2546
2547 initial = get_osd_initial_compat_set();
2548 diff = superblock.compat_features.unsupported(initial);
2549 if (superblock.compat_features.merge(initial)) {
2550 // We need to persist the new compat_set before we
2551 // do anything else
2552 dout(5) << "Upgrading superblock adding: " << diff << dendl;
2553 ObjectStore::Transaction t;
2554 write_superblock(t);
2555 r = store->apply_transaction(service.meta_osr.get(), std::move(t));
2556 if (r < 0)
2557 goto out;
2558 }
2559
2560 // make sure snap mapper object exists
2561 if (!store->exists(coll_t::meta(), OSD::make_snapmapper_oid())) {
2562 dout(10) << "init creating/touching snapmapper object" << dendl;
2563 ObjectStore::Transaction t;
2564 t.touch(coll_t::meta(), OSD::make_snapmapper_oid());
2565 r = store->apply_transaction(service.meta_osr.get(), std::move(t));
2566 if (r < 0)
2567 goto out;
2568 }
2569
2570 class_handler = new ClassHandler(cct);
2571 cls_initialize(class_handler);
2572
2573 if (cct->_conf->osd_open_classes_on_start) {
2574 int r = class_handler->open_all_classes();
2575 if (r)
2576 dout(1) << "warning: got an error loading one or more classes: " << cpp_strerror(r) << dendl;
2577 }
2578
2579 // load up "current" osdmap
2580 assert_warn(!osdmap);
2581 if (osdmap) {
2582 derr << "OSD::init: unable to read current osdmap" << dendl;
2583 r = -EINVAL;
2584 goto out;
2585 }
2586 osdmap = get_map(superblock.current_epoch);
2587 check_osdmap_features(store);
2588
2589 create_recoverystate_perf();
2590
2591 {
2592 epoch_t bind_epoch = osdmap->get_epoch();
2593 service.set_epochs(NULL, NULL, &bind_epoch);
2594 }
2595
2596 clear_temp_objects();
2597
2598 // initialize osdmap references in sharded wq
2599 op_shardedwq.prune_pg_waiters(osdmap, whoami);
2600
2601 // load up pgs (as they previously existed)
2602 load_pgs();
2603
2604 dout(2) << "superblock: I am osd." << superblock.whoami << dendl;
2605 dout(0) << "using " << op_queue << " op queue with priority op cut off at " <<
2606 op_prio_cutoff << "." << dendl;
2607
2608 create_logger();
2609
2610 // i'm ready!
2611 client_messenger->add_dispatcher_head(this);
2612 cluster_messenger->add_dispatcher_head(this);
2613
2614 hb_front_client_messenger->add_dispatcher_head(&heartbeat_dispatcher);
2615 hb_back_client_messenger->add_dispatcher_head(&heartbeat_dispatcher);
2616 hb_front_server_messenger->add_dispatcher_head(&heartbeat_dispatcher);
2617 hb_back_server_messenger->add_dispatcher_head(&heartbeat_dispatcher);
2618
2619 objecter_messenger->add_dispatcher_head(service.objecter);
2620
2621 monc->set_want_keys(CEPH_ENTITY_TYPE_MON | CEPH_ENTITY_TYPE_OSD
2622 | CEPH_ENTITY_TYPE_MGR);
2623 r = monc->init();
2624 if (r < 0)
2625 goto out;
2626
2627 /**
2628 * FIXME: this is a placeholder implementation that unconditionally
2629 * sends every is_primary PG's stats every time we're called, unlike
2630 * the existing mon PGStats mechanism that uses pg_stat_queue and acks.
2631 * This has equivalent cost to the existing worst case where all
2632 * PGs are busy and their stats are always enqueued for sending.
2633 */
2634 mgrc.set_pgstats_cb([this](){
2635 RWLock::RLocker l(map_lock);
2636
2637 utime_t had_for = ceph_clock_now() - had_map_since;
2638 osd_stat_t cur_stat = service.get_osd_stat();
2639 cur_stat.os_perf_stat = store->get_cur_stats();
2640
2641 MPGStats *m = new MPGStats(monc->get_fsid(), osdmap->get_epoch(), had_for);
2642 m->osd_stat = cur_stat;
2643
2644 Mutex::Locker lec{min_last_epoch_clean_lock};
2645 min_last_epoch_clean = osdmap->get_epoch();
2646 min_last_epoch_clean_pgs.clear();
2647 RWLock::RLocker lpg(pg_map_lock);
2648 for (const auto &i : pg_map) {
2649 PG *pg = i.second;
2650 if (!pg->is_primary()) {
2651 continue;
2652 }
2653
2654 pg->pg_stats_publish_lock.Lock();
2655 if (pg->pg_stats_publish_valid) {
2656 m->pg_stat[pg->info.pgid.pgid] = pg->pg_stats_publish;
2657 const auto lec = pg->pg_stats_publish.get_effective_last_epoch_clean();
2658 min_last_epoch_clean = min(min_last_epoch_clean, lec);
2659 min_last_epoch_clean_pgs.push_back(pg->info.pgid.pgid);
2660 }
2661 pg->pg_stats_publish_lock.Unlock();
2662 }
2663
2664 return m;
2665 });
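// The callback above snapshots osd_stat, then walks pg_map and attaches
// the published stats of every PG this OSD is primary for, while folding
// each PG's effective last_epoch_clean into min_last_epoch_clean.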
2666
2667 mgrc.init();
2668 client_messenger->add_dispatcher_head(&mgrc);
2669
2670 // tell monc about log_client so it will know about mon session resets
2671 monc->set_log_client(&log_client);
2672 update_log_config();
2673
2674 peering_tp.start();
2675
2676 service.init();
2677 service.publish_map(osdmap);
2678 service.publish_superblock(superblock);
2679 service.max_oldest_map = superblock.oldest_map;
2680
2681 osd_op_tp.start();
2682 disk_tp.start();
2683 command_tp.start();
2684
2685 set_disk_tp_priority();
2686
2687 // start the heartbeat
2688 heartbeat_thread.create("osd_srv_heartbt");
2689
2690 // tick
2691 tick_timer.add_event_after(get_tick_interval(),
2692 new C_Tick(this));
2693 {
2694 Mutex::Locker l(tick_timer_lock);
2695 tick_timer_without_osd_lock.add_event_after(get_tick_interval(),
2696 new C_Tick_WithoutOSDLock(this));
2697 }
2698
2699 osd_lock.Unlock();
2700
2701 r = monc->authenticate();
2702 if (r < 0) {
2703 derr << __func__ << " authentication failed: " << cpp_strerror(r)
2704 << dendl;
2705 osd_lock.Lock(); // locker is going to unlock this on function exit
2706 if (is_stopping())
2707 r = 0;
2708 goto monout;
2709 }
2710
2711 while (monc->wait_auth_rotating(30.0) < 0) {
2712 derr << "unable to obtain rotating service keys; retrying" << dendl;
2713 ++rotating_auth_attempts;
2714 if (rotating_auth_attempts > g_conf->max_rotating_auth_attempts) {
2715 derr << __func__ << " wait_auth_rotating timed out" << dendl;
2716 osd_lock.Lock(); // make locker happy
2717 if (!is_stopping()) {
2718 r = -ETIMEDOUT;
2719 }
2720 goto monout;
2721 }
2722 }
2723
2724 r = update_crush_device_class();
2725 if (r < 0) {
2726 derr << __func__ << " unable to update_crush_device_class: "
2727 << cpp_strerror(r) << dendl;
2728 osd_lock.Lock();
2729 goto monout;
2730 }
2731
2732 r = update_crush_location();
2733 if (r < 0) {
2734 derr << __func__ << " unable to update_crush_location: "
2735 << cpp_strerror(r) << dendl;
2736 osd_lock.Lock();
2737 goto monout;
2738 }
2739
2740 osd_lock.Lock();
2741 if (is_stopping())
2742 return 0;
2743
2744 // start objecter *after* we have authenticated, so that we don't ignore
2745 // the OSDMaps it requests.
2746 service.final_init();
2747
2748 check_config();
2749
2750 dout(10) << "ensuring pgs have consumed prior maps" << dendl;
2751 consume_map();
2752 peering_wq.drain();
2753
2754 dout(0) << "done with init, starting boot process" << dendl;
2755
2756 // subscribe to any pg creations
2757 monc->sub_want("osd_pg_creates", last_pg_create_epoch, 0);
2758
2759 // MgrClient needs this (it doesn't have MonClient reference itself)
2760 monc->sub_want("mgrmap", 0, 0);
2761
2762 // we don't need to ask for an osdmap here; the objecter will do it
2763 //monc->sub_want("osdmap", osdmap->get_epoch(), CEPH_SUBSCRIBE_ONETIME);
2764
2765 monc->renew_subs();
2766
2767 start_boot();
2768
2769 return 0;
2770 monout:
2771 exit(1);
2772
2773 out:
2774 enable_disable_fuse(true);
2775 store->umount();
2776 delete store;
2777 store = NULL;
2778 return r;
2779 }
2780
2781 void OSD::final_init()
2782 {
2783 AdminSocket *admin_socket = cct->get_admin_socket();
2784 asok_hook = new OSDSocketHook(this);
2785 int r = admin_socket->register_command("status", "status", asok_hook,
2786 "high-level status of OSD");
2787 assert(r == 0);
2788 r = admin_socket->register_command("flush_journal", "flush_journal",
2789 asok_hook,
2790 "flush the journal to permanent store");
2791 assert(r == 0);
2792 r = admin_socket->register_command("dump_ops_in_flight",
2793 "dump_ops_in_flight " \
2794 "name=filterstr,type=CephString,n=N,req=false",
2795 asok_hook,
2796 "show the ops currently in flight");
2797 assert(r == 0);
2798 r = admin_socket->register_command("ops",
2799 "ops " \
2800 "name=filterstr,type=CephString,n=N,req=false",
2801 asok_hook,
2802 "show the ops currently in flight");
2803 assert(r == 0);
2804 r = admin_socket->register_command("dump_blocked_ops",
2805 "dump_blocked_ops " \
2806 "name=filterstr,type=CephString,n=N,req=false",
2807 asok_hook,
2808 "show the blocked ops currently in flight");
2809 assert(r == 0);
2810 r = admin_socket->register_command("dump_historic_ops",
2811 "dump_historic_ops " \
2812 "name=filterstr,type=CephString,n=N,req=false",
2813 asok_hook,
2814 "show recent ops");
2815 assert(r == 0);
2816 r = admin_socket->register_command("dump_historic_slow_ops",
2817 "dump_historic_slow_ops " \
2818 "name=filterstr,type=CephString,n=N,req=false",
2819 asok_hook,
2820 "show slowest recent ops");
2821 assert(r == 0);
2822 r = admin_socket->register_command("dump_historic_ops_by_duration",
2823 "dump_historic_ops_by_duration " \
2824 "name=filterstr,type=CephString,n=N,req=false",
2825 asok_hook,
2826 "show slowest recent ops, sorted by duration");
2827 assert(r == 0);
2828 r = admin_socket->register_command("dump_op_pq_state", "dump_op_pq_state",
2829 asok_hook,
2830 "dump op priority queue state");
2831 assert(r == 0);
2832 r = admin_socket->register_command("dump_blacklist", "dump_blacklist",
2833 asok_hook,
2834 "dump blacklisted clients and times");
2835 assert(r == 0);
2836 r = admin_socket->register_command("dump_watchers", "dump_watchers",
2837 asok_hook,
2838 "show clients which have active watches,"
2839 " and on which objects");
2840 assert(r == 0);
2841 r = admin_socket->register_command("dump_reservations", "dump_reservations",
2842 asok_hook,
2843 "show recovery reservations");
2844 assert(r == 0);
2845 r = admin_socket->register_command("get_latest_osdmap", "get_latest_osdmap",
2846 asok_hook,
2847 "force osd to update the latest map from "
2848 "the mon");
2849 assert(r == 0);
2850
2851 r = admin_socket->register_command( "heap",
2852 "heap " \
2853 "name=heapcmd,type=CephString",
2854 asok_hook,
2855 "show heap usage info (available only if "
2856 "compiled with tcmalloc)");
2857 assert(r == 0);
2858
2859 r = admin_socket->register_command("set_heap_property",
2860 "set_heap_property " \
2861 "name=property,type=CephString " \
2862 "name=value,type=CephInt",
2863 asok_hook,
2864 "update malloc extension heap property");
2865 assert(r == 0);
2866
2867 r = admin_socket->register_command("get_heap_property",
2868 "get_heap_property " \
2869 "name=property,type=CephString",
2870 asok_hook,
2871 "get malloc extension heap property");
2872 assert(r == 0);
2873
2874 r = admin_socket->register_command("dump_objectstore_kv_stats",
2875 "dump_objectstore_kv_stats",
2876 asok_hook,
2877 "print statistics of kvdb which used by bluestore");
2878 assert(r == 0);
2879
2880 r = admin_socket->register_command("dump_scrubs",
2881 "dump_scrubs",
2882 asok_hook,
2883 "print scheduled scrubs");
2884 assert(r == 0);
2885
2886 r = admin_socket->register_command("calc_objectstore_db_histogram",
2887 "calc_objectstore_db_histogram",
2888 asok_hook,
2889 "Generate key value histogram of kvdb(rocksdb) which used by bluestore");
2890 assert(r == 0);
2891
2892 r = admin_socket->register_command("flush_store_cache",
2893 "flush_store_cache",
2894 asok_hook,
2895 "Flush bluestore internal cache");
2896 assert(r == 0);
2897 r = admin_socket->register_command("dump_pgstate_history", "dump_pgstate_history",
2898 asok_hook,
2899 "show recent state history");
2900 assert(r == 0);
2901
2902 r = admin_socket->register_command("compact", "compact",
2903 asok_hook,
2904 "Commpact object store's omap."
2905 " WARNING: Compaction probably slows your requests");
2906 assert(r == 0);
2907
2908 test_ops_hook = new TestOpsSocketHook(&(this->service), this->store);
2909 // Note: pools are CephString instead of CephPoolname because
2910 // these commands traditionally support both pool names and numbers
2911 r = admin_socket->register_command(
2912 "setomapval",
2913 "setomapval " \
2914 "name=pool,type=CephString " \
2915 "name=objname,type=CephObjectname " \
2916 "name=key,type=CephString "\
2917 "name=val,type=CephString",
2918 test_ops_hook,
2919 "set omap key");
2920 assert(r == 0);
2921 r = admin_socket->register_command(
2922 "rmomapkey",
2923 "rmomapkey " \
2924 "name=pool,type=CephString " \
2925 "name=objname,type=CephObjectname " \
2926 "name=key,type=CephString",
2927 test_ops_hook,
2928 "remove omap key");
2929 assert(r == 0);
2930 r = admin_socket->register_command(
2931 "setomapheader",
2932 "setomapheader " \
2933 "name=pool,type=CephString " \
2934 "name=objname,type=CephObjectname " \
2935 "name=header,type=CephString",
2936 test_ops_hook,
2937 "set omap header");
2938 assert(r == 0);
2939
2940 r = admin_socket->register_command(
2941 "getomap",
2942 "getomap " \
2943 "name=pool,type=CephString " \
2944 "name=objname,type=CephObjectname",
2945 test_ops_hook,
2946 "output entire object map");
2947 assert(r == 0);
2948
2949 r = admin_socket->register_command(
2950 "truncobj",
2951 "truncobj " \
2952 "name=pool,type=CephString " \
2953 "name=objname,type=CephObjectname " \
2954 "name=len,type=CephInt",
2955 test_ops_hook,
2956 "truncate object to length");
2957 assert(r == 0);
2958
2959 r = admin_socket->register_command(
2960 "injectdataerr",
2961 "injectdataerr " \
2962 "name=pool,type=CephString " \
2963 "name=objname,type=CephObjectname " \
2964 "name=shardid,type=CephInt,req=false,range=0|255",
2965 test_ops_hook,
2966 "inject data error to an object");
2967 assert(r == 0);
2968
2969 r = admin_socket->register_command(
2970 "injectmdataerr",
2971 "injectmdataerr " \
2972 "name=pool,type=CephString " \
2973 "name=objname,type=CephObjectname " \
2974 "name=shardid,type=CephInt,req=false,range=0|255",
2975 test_ops_hook,
2976 "inject metadata error to an object");
2977 assert(r == 0);
2978 r = admin_socket->register_command(
2979 "set_recovery_delay",
2980 "set_recovery_delay " \
2981 "name=utime,type=CephInt,req=false",
2982 test_ops_hook,
2983 "Delay osd recovery by specified seconds");
2984 assert(r == 0);
2985 r = admin_socket->register_command(
2986 "trigger_scrub",
2987 "trigger_scrub " \
2988 "name=pgid,type=CephString ",
2989 test_ops_hook,
2990 "Trigger a scheduled scrub ");
2991 assert(r == 0);
2992 r = admin_socket->register_command(
2993 "injectfull",
2994 "injectfull " \
2995 "name=type,type=CephString,req=false " \
2996 "name=count,type=CephInt,req=false ",
2997 test_ops_hook,
2998 "Inject a full disk (optional count times)");
2999 assert(r == 0);
3000 }
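// Note the pairing: every command registered above must be unregistered
// in OSD::shutdown(), and each registration is asserted to succeed since
// a duplicate or malformed descriptor indicates a programming error.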
3001
3002 void OSD::create_logger()
3003 {
3004 dout(10) << "create_logger" << dendl;
3005
3006 PerfCountersBuilder osd_plb(cct, "osd", l_osd_first, l_osd_last);
3007
3008 // Latency axis configuration for op histograms, values are in nanoseconds
3009 PerfHistogramCommon::axis_config_d op_hist_x_axis_config{
3010 "Latency (usec)",
3011 PerfHistogramCommon::SCALE_LOG2, ///< Latency in logarithmic scale
3012 0, ///< Start at 0
3013 100000, ///< Quantization unit is 100usec
3014 32, ///< Enough to cover much longer than slow requests
3015 };
3016
3017 // Op size axis configuration for op histograms, values are in bytes
3018 PerfHistogramCommon::axis_config_d op_hist_y_axis_config{
3019 "Request size (bytes)",
3020 PerfHistogramCommon::SCALE_LOG2, ///< Request size in logarithmic scale
3021 0, ///< Start at 0
3022 512, ///< Quantization unit is 512 bytes
3023 32, ///< Enough to cover requests larger than GB
3024 };
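// A hedged reading of the axis configs above: with SCALE_LOG2 each
// successive bucket is assumed to double in width, so 32 buckets with a
// 100 usec quantum cover latencies far beyond the slow-request horizon,
// and 32 buckets with a 512-byte quantum cover multi-GB requests.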
3025
3026
3027 // All the basic OSD operation stats are to be considered useful
3028 osd_plb.set_prio_default(PerfCountersBuilder::PRIO_USEFUL);
3029
3030 osd_plb.add_u64(
3031 l_osd_op_wip, "op_wip",
3032 "Replication operations currently being processed (primary)");
3033 osd_plb.add_u64_counter(
3034 l_osd_op, "op",
3035 "Client operations",
3036 "ops", PerfCountersBuilder::PRIO_CRITICAL);
3037 osd_plb.add_u64_counter(
3038 l_osd_op_inb, "op_in_bytes",
3039 "Client operations total write size",
3040 "wr", PerfCountersBuilder::PRIO_INTERESTING, unit_t(BYTES));
3041 osd_plb.add_u64_counter(
3042 l_osd_op_outb, "op_out_bytes",
3043 "Client operations total read size",
3044 "rd", PerfCountersBuilder::PRIO_INTERESTING, unit_t(BYTES));
3045 osd_plb.add_time_avg(
3046 l_osd_op_lat, "op_latency",
3047 "Latency of client operations (including queue time)",
3048 "l", 9);
3049 osd_plb.add_time_avg(
3050 l_osd_op_process_lat, "op_process_latency",
3051 "Latency of client operations (excluding queue time)");
3052 osd_plb.add_time_avg(
3053 l_osd_op_prepare_lat, "op_prepare_latency",
3054 "Latency of client operations (excluding queue time and wait for finished)");
3055
3056 osd_plb.add_u64_counter(
3057 l_osd_op_r, "op_r", "Client read operations");
3058 osd_plb.add_u64_counter(
3059 l_osd_op_r_outb, "op_r_out_bytes", "Client data read", NULL, PerfCountersBuilder::PRIO_USEFUL, unit_t(BYTES));
3060 osd_plb.add_time_avg(
3061 l_osd_op_r_lat, "op_r_latency",
3062 "Latency of read operation (including queue time)");
3063 osd_plb.add_u64_counter_histogram(
3064 l_osd_op_r_lat_outb_hist, "op_r_latency_out_bytes_histogram",
3065 op_hist_x_axis_config, op_hist_y_axis_config,
3066 "Histogram of operation latency (including queue time) + data read");
3067 osd_plb.add_time_avg(
3068 l_osd_op_r_process_lat, "op_r_process_latency",
3069 "Latency of read operation (excluding queue time)");
3070 osd_plb.add_time_avg(
3071 l_osd_op_r_prepare_lat, "op_r_prepare_latency",
3072 "Latency of read operations (excluding queue time and wait for finished)");
3073 osd_plb.add_u64_counter(
3074 l_osd_op_w, "op_w", "Client write operations");
3075 osd_plb.add_u64_counter(
3076 l_osd_op_w_inb, "op_w_in_bytes", "Client data written");
3077 osd_plb.add_time_avg(
3078 l_osd_op_w_lat, "op_w_latency",
3079 "Latency of write operation (including queue time)");
3080 osd_plb.add_u64_counter_histogram(
3081 l_osd_op_w_lat_inb_hist, "op_w_latency_in_bytes_histogram",
3082 op_hist_x_axis_config, op_hist_y_axis_config,
3083 "Histogram of operation latency (including queue time) + data written");
3084 osd_plb.add_time_avg(
3085 l_osd_op_w_process_lat, "op_w_process_latency",
3086 "Latency of write operation (excluding queue time)");
3087 osd_plb.add_time_avg(
3088 l_osd_op_w_prepare_lat, "op_w_prepare_latency",
3089 "Latency of write operations (excluding queue time and wait for finished)");
3090 osd_plb.add_u64_counter(
3091 l_osd_op_rw, "op_rw",
3092 "Client read-modify-write operations");
3093 osd_plb.add_u64_counter(
3094 l_osd_op_rw_inb, "op_rw_in_bytes",
3095 "Client read-modify-write operations, data written", NULL, PerfCountersBuilder::PRIO_USEFUL, unit_t(BYTES));
3096 osd_plb.add_u64_counter(
3097 l_osd_op_rw_outb, "op_rw_out_bytes",
3098 "Client read-modify-write operations, data read", NULL, PerfCountersBuilder::PRIO_USEFUL, unit_t(BYTES));
3099 osd_plb.add_time_avg(
3100 l_osd_op_rw_lat, "op_rw_latency",
3101 "Latency of read-modify-write operation (including queue time)");
3102 osd_plb.add_u64_counter_histogram(
3103 l_osd_op_rw_lat_inb_hist, "op_rw_latency_in_bytes_histogram",
3104 op_hist_x_axis_config, op_hist_y_axis_config,
3105 "Histogram of rw operation latency (including queue time) + data written");
3106 osd_plb.add_u64_counter_histogram(
3107 l_osd_op_rw_lat_outb_hist, "op_rw_latency_out_bytes_histogram",
3108 op_hist_x_axis_config, op_hist_y_axis_config,
3109 "Histogram of rw operation latency (including queue time) + data read");
3110 osd_plb.add_time_avg(
3111 l_osd_op_rw_process_lat, "op_rw_process_latency",
3112 "Latency of read-modify-write operation (excluding queue time)");
3113 osd_plb.add_time_avg(
3114 l_osd_op_rw_prepare_lat, "op_rw_prepare_latency",
3115 "Latency of read-modify-write operations (excluding queue time and wait for finished)");
3116
3117 // Now we move on to some more obscure stats; revert to assuming things
3118 // are low priority unless otherwise specified.
3119 osd_plb.set_prio_default(PerfCountersBuilder::PRIO_DEBUGONLY);
3120
3121 osd_plb.add_time_avg(l_osd_op_before_queue_op_lat, "op_before_queue_op_lat",
3122 "Latency of IO before calling queue (before actually being queued into the ShardedOpWq)"); // client io latency before queueing into op_wq
3123 osd_plb.add_time_avg(l_osd_op_before_dequeue_op_lat, "op_before_dequeue_op_lat",
3124 "Latency of IO before calling dequeue_op (already dequeued and PG lock acquired)"); // client io latency before dequeue_op
3125
3126 osd_plb.add_u64_counter(
3127 l_osd_sop, "subop", "Suboperations");
3128 osd_plb.add_u64_counter(
3129 l_osd_sop_inb, "subop_in_bytes", "Suboperations total size", NULL, 0, unit_t(BYTES));
3130 osd_plb.add_time_avg(l_osd_sop_lat, "subop_latency", "Suboperations latency");
3131
3132 osd_plb.add_u64_counter(l_osd_sop_w, "subop_w", "Replicated writes");
3133 osd_plb.add_u64_counter(
3134 l_osd_sop_w_inb, "subop_w_in_bytes", "Replicated written data size", NULL, 0, unit_t(BYTES));
3135 osd_plb.add_time_avg(
3136 l_osd_sop_w_lat, "subop_w_latency", "Replicated writes latency");
3137 osd_plb.add_u64_counter(
3138 l_osd_sop_pull, "subop_pull", "Suboperations pull requests");
3139 osd_plb.add_time_avg(
3140 l_osd_sop_pull_lat, "subop_pull_latency", "Suboperations pull latency");
3141 osd_plb.add_u64_counter(
3142 l_osd_sop_push, "subop_push", "Suboperations push messages");
3143 osd_plb.add_u64_counter(
3144 l_osd_sop_push_inb, "subop_push_in_bytes", "Suboperations pushed size", NULL, 0, unit_t(BYTES));
3145 osd_plb.add_time_avg(
3146 l_osd_sop_push_lat, "subop_push_latency", "Suboperations push latency");
3147
3148 osd_plb.add_u64_counter(l_osd_pull, "pull", "Pull requests sent");
3149 osd_plb.add_u64_counter(l_osd_push, "push", "Push messages sent");
3150 osd_plb.add_u64_counter(l_osd_push_outb, "push_out_bytes", "Pushed size", NULL, 0, unit_t(BYTES));
3151
3152 osd_plb.add_u64_counter(
3153 l_osd_rop, "recovery_ops",
3154 "Started recovery operations",
3155 "rop", PerfCountersBuilder::PRIO_INTERESTING);
3156
3157 osd_plb.add_u64(l_osd_loadavg, "loadavg", "CPU load");
3158 osd_plb.add_u64(l_osd_buf, "buffer_bytes", "Total allocated buffer size", NULL, 0, unit_t(BYTES));
3159 osd_plb.add_u64(l_osd_history_alloc_bytes, "history_alloc_Mbytes", NULL, 0, unit_t(BYTES));
3160 osd_plb.add_u64(l_osd_history_alloc_num, "history_alloc_num");
3161 osd_plb.add_u64(
3162 l_osd_cached_crc, "cached_crc", "Total number of crcs obtained from crc_cache");
3163 osd_plb.add_u64(
3164 l_osd_cached_crc_adjusted, "cached_crc_adjusted",
3165 "Total number of crcs obtained from crc_cache that required adjustment");
3166 osd_plb.add_u64(l_osd_missed_crc, "missed_crc",
3167 "Total number of crc cache misses");
3168
3169 osd_plb.add_u64(l_osd_pg, "numpg", "Placement groups",
3170 "pgs", PerfCountersBuilder::PRIO_USEFUL);
3171 osd_plb.add_u64(
3172 l_osd_pg_primary, "numpg_primary",
3173 "Placement groups for which this osd is primary");
3174 osd_plb.add_u64(
3175 l_osd_pg_replica, "numpg_replica",
3176 "Placement groups for which this osd is replica");
3177 osd_plb.add_u64(
3178 l_osd_pg_stray, "numpg_stray",
3179 "Placement groups ready to be deleted from this osd");
3180 osd_plb.add_u64(
3181 l_osd_pg_removing, "numpg_removing",
3182 "Placement groups queued for local deletion", "pgsr",
3183 PerfCountersBuilder::PRIO_USEFUL);
3184 osd_plb.add_u64(
3185 l_osd_hb_to, "heartbeat_to_peers", "Heartbeat (ping) peers we send to");
3186 osd_plb.add_u64_counter(l_osd_map, "map_messages", "OSD map messages");
3187 osd_plb.add_u64_counter(l_osd_mape, "map_message_epochs", "OSD map epochs");
3188 osd_plb.add_u64_counter(
3189 l_osd_mape_dup, "map_message_epoch_dups", "OSD map duplicates");
3190 osd_plb.add_u64_counter(
3191 l_osd_waiting_for_map, "messages_delayed_for_map",
3192 "Operations waiting for OSD map");
3193
3194 osd_plb.add_u64_counter(
3195 l_osd_map_cache_hit, "osd_map_cache_hit", "osdmap cache hit");
3196 osd_plb.add_u64_counter(
3197 l_osd_map_cache_miss, "osd_map_cache_miss", "osdmap cache miss");
3198 osd_plb.add_u64_counter(
3199 l_osd_map_cache_miss_low, "osd_map_cache_miss_low",
3200 "osdmap cache miss below cache lower bound");
3201 osd_plb.add_u64_avg(
3202 l_osd_map_cache_miss_low_avg, "osd_map_cache_miss_low_avg",
3203 "osdmap cache miss, avg distance below cache lower bound");
3204 osd_plb.add_u64_counter(
3205 l_osd_map_bl_cache_hit, "osd_map_bl_cache_hit",
3206 "OSDMap buffer cache hits");
3207 osd_plb.add_u64_counter(
3208 l_osd_map_bl_cache_miss, "osd_map_bl_cache_miss",
3209 "OSDMap buffer cache misses");
3210
3211 osd_plb.add_u64(
3212 l_osd_stat_bytes, "stat_bytes", "OSD size", "size",
3213 PerfCountersBuilder::PRIO_USEFUL, unit_t(BYTES));
3214 osd_plb.add_u64(
3215 l_osd_stat_bytes_used, "stat_bytes_used", "Used space", "used",
3216 PerfCountersBuilder::PRIO_USEFUL, unit_t(BYTES));
3217 osd_plb.add_u64(l_osd_stat_bytes_avail, "stat_bytes_avail", "Available space", NULL, 0, unit_t(BYTES));
3218
3219 osd_plb.add_u64_counter(
3220 l_osd_copyfrom, "copyfrom", "Rados \"copy-from\" operations");
3221
3222 osd_plb.add_u64_counter(l_osd_tier_promote, "tier_promote", "Tier promotions");
3223 osd_plb.add_u64_counter(l_osd_tier_flush, "tier_flush", "Tier flushes");
3224 osd_plb.add_u64_counter(
3225 l_osd_tier_flush_fail, "tier_flush_fail", "Failed tier flushes");
3226 osd_plb.add_u64_counter(
3227 l_osd_tier_try_flush, "tier_try_flush", "Tier flush attempts");
3228 osd_plb.add_u64_counter(
3229 l_osd_tier_try_flush_fail, "tier_try_flush_fail",
3230 "Failed tier flush attempts");
3231 osd_plb.add_u64_counter(
3232 l_osd_tier_evict, "tier_evict", "Tier evictions");
3233 osd_plb.add_u64_counter(
3234 l_osd_tier_whiteout, "tier_whiteout", "Tier whiteouts");
3235 osd_plb.add_u64_counter(
3236 l_osd_tier_dirty, "tier_dirty", "Dirty tier flag set");
3237 osd_plb.add_u64_counter(
3238 l_osd_tier_clean, "tier_clean", "Dirty tier flag cleaned");
3239 osd_plb.add_u64_counter(
3240 l_osd_tier_delay, "tier_delay", "Tier delays (agent waiting)");
3241 osd_plb.add_u64_counter(
3242 l_osd_tier_proxy_read, "tier_proxy_read", "Tier proxy reads");
3243 osd_plb.add_u64_counter(
3244 l_osd_tier_proxy_write, "tier_proxy_write", "Tier proxy writes");
3245
3246 osd_plb.add_u64_counter(
3247 l_osd_agent_wake, "agent_wake", "Tiering agent wake up");
3248 osd_plb.add_u64_counter(
3249 l_osd_agent_skip, "agent_skip", "Objects skipped by agent");
3250 osd_plb.add_u64_counter(
3251 l_osd_agent_flush, "agent_flush", "Tiering agent flushes");
3252 osd_plb.add_u64_counter(
3253 l_osd_agent_evict, "agent_evict", "Tiering agent evictions");
3254
3255 osd_plb.add_u64_counter(
3256 l_osd_object_ctx_cache_hit, "object_ctx_cache_hit", "Object context cache hits");
3257 osd_plb.add_u64_counter(
3258 l_osd_object_ctx_cache_total, "object_ctx_cache_total", "Object context cache lookups");
3259
3260 osd_plb.add_u64_counter(l_osd_op_cache_hit, "op_cache_hit");
3261 osd_plb.add_time_avg(
3262 l_osd_tier_flush_lat, "osd_tier_flush_lat", "Object flush latency");
3263 osd_plb.add_time_avg(
3264 l_osd_tier_promote_lat, "osd_tier_promote_lat", "Object promote latency");
3265 osd_plb.add_time_avg(
3266 l_osd_tier_r_lat, "osd_tier_r_lat", "Object proxy read latency");
3267
3268 osd_plb.add_u64_counter(
3269 l_osd_pg_info, "osd_pg_info", "PG updated its info (using any method)");
3270 osd_plb.add_u64_counter(
3271 l_osd_pg_fastinfo, "osd_pg_fastinfo",
3272 "PG updated its info using fastinfo attr");
3273 osd_plb.add_u64_counter(
3274 l_osd_pg_biginfo, "osd_pg_biginfo", "PG updated its biginfo attr");
3275
3276 logger = osd_plb.create_perf_counters();
3277 cct->get_perfcounters_collection()->add(logger);
3278 }
3279
3280 void OSD::create_recoverystate_perf()
3281 {
3282 dout(10) << "create_recoverystate_perf" << dendl;
3283
3284 PerfCountersBuilder rs_perf(cct, "recoverystate_perf", rs_first, rs_last);
3285
3286 rs_perf.add_time_avg(rs_initial_latency, "initial_latency", "Initial recovery state latency");
3287 rs_perf.add_time_avg(rs_started_latency, "started_latency", "Started recovery state latency");
3288 rs_perf.add_time_avg(rs_reset_latency, "reset_latency", "Reset recovery state latency");
3289 rs_perf.add_time_avg(rs_start_latency, "start_latency", "Start recovery state latency");
3290 rs_perf.add_time_avg(rs_primary_latency, "primary_latency", "Primary recovery state latency");
3291 rs_perf.add_time_avg(rs_peering_latency, "peering_latency", "Peering recovery state latency");
3292 rs_perf.add_time_avg(rs_backfilling_latency, "backfilling_latency", "Backfilling recovery state latency");
3293 rs_perf.add_time_avg(rs_waitremotebackfillreserved_latency, "waitremotebackfillreserved_latency", "Wait remote backfill reserved recovery state latency");
3294 rs_perf.add_time_avg(rs_waitlocalbackfillreserved_latency, "waitlocalbackfillreserved_latency", "Wait local backfill reserved recovery state latency");
3295 rs_perf.add_time_avg(rs_notbackfilling_latency, "notbackfilling_latency", "Notbackfilling recovery state latency");
3296 rs_perf.add_time_avg(rs_repnotrecovering_latency, "repnotrecovering_latency", "Repnotrecovering recovery state latency");
3297 rs_perf.add_time_avg(rs_repwaitrecoveryreserved_latency, "repwaitrecoveryreserved_latency", "Rep wait recovery reserved recovery state latency");
3298 rs_perf.add_time_avg(rs_repwaitbackfillreserved_latency, "repwaitbackfillreserved_latency", "Rep wait backfill reserved recovery state latency");
3299 rs_perf.add_time_avg(rs_reprecovering_latency, "reprecovering_latency", "RepRecovering recovery state latency");
3300 rs_perf.add_time_avg(rs_activating_latency, "activating_latency", "Activating recovery state latency");
3301 rs_perf.add_time_avg(rs_waitlocalrecoveryreserved_latency, "waitlocalrecoveryreserved_latency", "Wait local recovery reserved recovery state latency");
3302 rs_perf.add_time_avg(rs_waitremoterecoveryreserved_latency, "waitremoterecoveryreserved_latency", "Wait remote recovery reserved recovery state latency");
3303 rs_perf.add_time_avg(rs_recovering_latency, "recovering_latency", "Recovering recovery state latency");
3304 rs_perf.add_time_avg(rs_recovered_latency, "recovered_latency", "Recovered recovery state latency");
3305 rs_perf.add_time_avg(rs_clean_latency, "clean_latency", "Clean recovery state latency");
3306 rs_perf.add_time_avg(rs_active_latency, "active_latency", "Active recovery state latency");
3307 rs_perf.add_time_avg(rs_replicaactive_latency, "replicaactive_latency", "Replicaactive recovery state latency");
3308 rs_perf.add_time_avg(rs_stray_latency, "stray_latency", "Stray recovery state latency");
3309 rs_perf.add_time_avg(rs_getinfo_latency, "getinfo_latency", "Getinfo recovery state latency");
3310 rs_perf.add_time_avg(rs_getlog_latency, "getlog_latency", "Getlog recovery state latency");
3311 rs_perf.add_time_avg(rs_waitactingchange_latency, "waitactingchange_latency", "Waitactingchange recovery state latency");
3312 rs_perf.add_time_avg(rs_incomplete_latency, "incomplete_latency", "Incomplete recovery state latency");
3313 rs_perf.add_time_avg(rs_down_latency, "down_latency", "Down recovery state latency");
3314 rs_perf.add_time_avg(rs_getmissing_latency, "getmissing_latency", "Getmissing recovery state latency");
3315 rs_perf.add_time_avg(rs_waitupthru_latency, "waitupthru_latency", "Waitupthru recovery state latency");
3316 rs_perf.add_time_avg(rs_notrecovering_latency, "notrecovering_latency", "Notrecovering recovery state latency");
3317
3318 recoverystate_perf = rs_perf.create_perf_counters();
3319 cct->get_perfcounters_collection()->add(recoverystate_perf);
3320 }
3321
3322 int OSD::shutdown()
3323 {
3324 if (!service.prepare_to_stop())
3325 return 0; // already shutting down
3326 osd_lock.Lock();
3327 if (is_stopping()) {
3328 osd_lock.Unlock();
3329 return 0;
3330 }
3331 derr << "shutdown" << dendl;
3332
3333 set_state(STATE_STOPPING);
3334
3335 // Debugging
3336 if (cct->_conf->get_val<bool>("osd_debug_shutdown")) {
3337 cct->_conf->set_val("debug_osd", "100");
3338 cct->_conf->set_val("debug_journal", "100");
3339 cct->_conf->set_val("debug_filestore", "100");
3340 cct->_conf->set_val("debug_bluestore", "100");
3341 cct->_conf->set_val("debug_ms", "100");
3342 cct->_conf->apply_changes(NULL);
3343 }
3344
3345 // stop MgrClient earlier as it's more like an internal consumer of OSD
3346 mgrc.shutdown();
3347
3348 service.start_shutdown();
3349
3350 // stop sending work to pgs. this just prevents any new work in _process
3351 // from racing with on_shutdown and potentially entering the pg after.
3352 op_shardedwq.drain();
3353
3354 // Shutdown PGs
3355 {
3356 RWLock::RLocker l(pg_map_lock);
3357 for (ceph::unordered_map<spg_t, PG*>::iterator p = pg_map.begin();
3358 p != pg_map.end();
3359 ++p) {
3360 dout(20) << " kicking pg " << p->first << dendl;
3361 p->second->lock();
3362 p->second->on_shutdown();
3363 p->second->unlock();
3364 p->second->osr->flush();
3365 }
3366 }
3367 clear_pg_stat_queue();
3368
3369 // drain op queue again (in case PGs requeued something)
3370 op_shardedwq.drain();
3371 {
3372 finished.clear(); // zap waiters (bleh, this is messy)
3373 }
3374
3375 op_shardedwq.clear_pg_slots();
3376
3377 // unregister commands
3378 cct->get_admin_socket()->unregister_command("status");
3379 cct->get_admin_socket()->unregister_command("flush_journal");
3380 cct->get_admin_socket()->unregister_command("dump_ops_in_flight");
3381 cct->get_admin_socket()->unregister_command("ops");
3382 cct->get_admin_socket()->unregister_command("dump_blocked_ops");
3383 cct->get_admin_socket()->unregister_command("dump_historic_ops");
3384 cct->get_admin_socket()->unregister_command("dump_historic_ops_by_duration");
3385 cct->get_admin_socket()->unregister_command("dump_historic_slow_ops");
3386 cct->get_admin_socket()->unregister_command("dump_op_pq_state");
3387 cct->get_admin_socket()->unregister_command("dump_blacklist");
3388 cct->get_admin_socket()->unregister_command("dump_watchers");
3389 cct->get_admin_socket()->unregister_command("dump_reservations");
3390 cct->get_admin_socket()->unregister_command("get_latest_osdmap");
3391 cct->get_admin_socket()->unregister_command("heap");
3392 cct->get_admin_socket()->unregister_command("set_heap_property");
3393 cct->get_admin_socket()->unregister_command("get_heap_property");
3394 cct->get_admin_socket()->unregister_command("dump_objectstore_kv_stats");
3395 cct->get_admin_socket()->unregister_command("dump_scrubs");
3396 cct->get_admin_socket()->unregister_command("calc_objectstore_db_histogram");
3397 cct->get_admin_socket()->unregister_command("flush_store_cache");
3398 cct->get_admin_socket()->unregister_command("dump_pgstate_history");
3399 cct->get_admin_socket()->unregister_command("compact");
3400 delete asok_hook;
3401 asok_hook = NULL;
3402
3403 cct->get_admin_socket()->unregister_command("setomapval");
3404 cct->get_admin_socket()->unregister_command("rmomapkey");
3405 cct->get_admin_socket()->unregister_command("setomapheader");
3406 cct->get_admin_socket()->unregister_command("getomap");
3407 cct->get_admin_socket()->unregister_command("truncobj");
3408 cct->get_admin_socket()->unregister_command("injectdataerr");
3409 cct->get_admin_socket()->unregister_command("injectmdataerr");
3410 cct->get_admin_socket()->unregister_command("set_recovery_delay");
3411 cct->get_admin_socket()->unregister_command("trigger_scrub");
3412 cct->get_admin_socket()->unregister_command("injectfull");
3413 delete test_ops_hook;
3414 test_ops_hook = NULL;
3415
3416 osd_lock.Unlock();
3417
3418 heartbeat_lock.Lock();
3419 heartbeat_stop = true;
3420 heartbeat_cond.Signal();
3421 heartbeat_lock.Unlock();
3422 heartbeat_thread.join();
3423
3424 peering_tp.drain();
3425 peering_wq.clear();
3426 peering_tp.stop();
3427 dout(10) << "peering tp stopped" << dendl;
3428
3429 osd_op_tp.drain();
3430 osd_op_tp.stop();
3431 dout(10) << "op sharded tp stopped" << dendl;
3432
3433 command_tp.drain();
3434 command_tp.stop();
3435 dout(10) << "command tp stopped" << dendl;
3436
3437 disk_tp.drain();
3438 disk_tp.stop();
3439 dout(10) << "disk tp stopped" << dendl;
3440
3441 dout(10) << "stopping agent" << dendl;
3442 service.agent_stop();
3443
3444 osd_lock.Lock();
3445
3446 reset_heartbeat_peers();
3447
3448 tick_timer.shutdown();
3449
3450 {
3451 Mutex::Locker l(tick_timer_lock);
3452 tick_timer_without_osd_lock.shutdown();
3453 }
3454
3455 // note unmount epoch
3456 dout(10) << "noting clean unmount in epoch " << osdmap->get_epoch() << dendl;
3457 superblock.mounted = service.get_boot_epoch();
3458 superblock.clean_thru = osdmap->get_epoch();
3459 ObjectStore::Transaction t;
3460 write_superblock(t);
3461 int r = store->apply_transaction(service.meta_osr.get(), std::move(t));
3462 if (r) {
3463 derr << "OSD::shutdown: error writing superblock: "
3464 << cpp_strerror(r) << dendl;
3465 }
3466
3467
3468 {
3469 Mutex::Locker l(pg_stat_queue_lock);
3470 assert(pg_stat_queue.empty());
3471 }
3472
3473 service.shutdown_reserver();
3474
3475 // Remove PGs
3476 #ifdef PG_DEBUG_REFS
3477 service.dump_live_pgids();
3478 #endif
3479 {
3480 RWLock::RLocker l(pg_map_lock);
3481 for (ceph::unordered_map<spg_t, PG*>::iterator p = pg_map.begin();
3482 p != pg_map.end();
3483 ++p) {
3484 dout(20) << " kicking pg " << p->first << dendl;
3485 p->second->lock();
3486 if (p->second->ref != 1) {
3487 derr << "pgid " << p->first << " has ref count of "
3488 << p->second->ref << dendl;
3489 #ifdef PG_DEBUG_REFS
3490 p->second->dump_live_ids();
3491 #endif
3492 if (cct->_conf->osd_shutdown_pgref_assert) {
3493 ceph_abort();
3494 }
3495 }
3496 p->second->unlock();
3497 p->second->put("PGMap");
3498 }
3499 pg_map.clear();
3500 }
3501 #ifdef PG_DEBUG_REFS
3502 service.dump_live_pgids();
3503 #endif
3504 cct->_conf->remove_observer(this);
3505
3506 dout(10) << "syncing store" << dendl;
3507 enable_disable_fuse(true);
3508
3509 if (cct->_conf->osd_journal_flush_on_shutdown) {
3510 dout(10) << "flushing journal" << dendl;
3511 store->flush_journal();
3512 }
3513
3514 store->umount();
3515 delete store;
3516 store = 0;
3517 dout(10) << "Store synced" << dendl;
3518
3519 monc->shutdown();
3520 osd_lock.Unlock();
3521
3522 osdmap = OSDMapRef();
3523 service.shutdown();
3524 op_tracker.on_shutdown();
3525
3526 class_handler->shutdown();
3527 client_messenger->shutdown();
3528 cluster_messenger->shutdown();
3529 hb_front_client_messenger->shutdown();
3530 hb_back_client_messenger->shutdown();
3531 objecter_messenger->shutdown();
3532 hb_front_server_messenger->shutdown();
3533 hb_back_server_messenger->shutdown();
3534
3535 peering_wq.clear();
3536
3537 return r;
3538 }
3539
3540 int OSD::mon_cmd_maybe_osd_create(string &cmd)
3541 {
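// Sketch of the flow below: send 'cmd' to the mon; if it fails with -ENOENT
// (the osd id is unknown), issue an 'osd create' for our id/fsid, then retry
// the original command once.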
3542 bool created = false;
3543 while (true) {
3544 dout(10) << __func__ << " cmd: " << cmd << dendl;
3545 vector<string> vcmd{cmd};
3546 bufferlist inbl;
3547 C_SaferCond w;
3548 string outs;
3549 monc->start_mon_command(vcmd, inbl, NULL, &outs, &w);
3550 int r = w.wait();
3551 if (r < 0) {
3552 if (r == -ENOENT && !created) {
3553 string newcmd = "{\"prefix\": \"osd create\", \"id\": " + stringify(whoami)
3554 + ", \"uuid\": \"" + stringify(superblock.osd_fsid) + "\"}";
3555 vector<string> vnewcmd{newcmd};
3556 bufferlist inbl;
3557 C_SaferCond w;
3558 string outs;
3559 monc->start_mon_command(vnewcmd, inbl, NULL, &outs, &w);
3560 int r = w.wait();
3561 if (r < 0) {
3562 derr << __func__ << " fail: osd does not exist and create failed: "
3563 << cpp_strerror(r) << dendl;
3564 return r;
3565 }
3566 created = true;
3567 continue;
3568 }
3569 derr << __func__ << " fail: '" << outs << "': " << cpp_strerror(r) << dendl;
3570 return r;
3571 }
3572 break;
3573 }
3574
3575 return 0;
3576 }
3577
3578 int OSD::update_crush_location()
3579 {
3580 if (!cct->_conf->osd_crush_update_on_start) {
3581 dout(10) << __func__ << " osd_crush_update_on_start = false" << dendl;
3582 return 0;
3583 }
3584
3585 char weight[32];
3586 if (cct->_conf->osd_crush_initial_weight >= 0) {
3587 snprintf(weight, sizeof(weight), "%.4lf", cct->_conf->osd_crush_initial_weight);
3588 } else {
3589 struct store_statfs_t st;
3590 int r = store->statfs(&st);
3591 if (r < 0) {
3592 derr << "statfs: " << cpp_strerror(r) << dendl;
3593 return r;
3594 }
3595 snprintf(weight, sizeof(weight), "%.4lf",
3596 MAX((double).00001,
3597 (double)(st.total) /
3598 (double)(1ull << 40 /* TB */)));
3599 }
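// Worked example (illustrative sizes): a 4 TB device reports st.total of
// roughly 4e12 bytes, so the initial weight is 4e12 / 2^40 ~= 3.6380,
// i.e. CRUSH weight here is raw capacity expressed in TiB.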
3600
3601 std::multimap<string,string> loc = cct->crush_location.get_location();
3602 dout(10) << __func__ << " crush location is " << loc << dendl;
3603
3604 string cmd =
3605 string("{\"prefix\": \"osd crush create-or-move\", ") +
3606 string("\"id\": ") + stringify(whoami) + string(", ") +
3607 string("\"weight\":") + weight + string(", ") +
3608 string("\"args\": [");
3609 for (multimap<string,string>::iterator p = loc.begin(); p != loc.end(); ++p) {
3610 if (p != loc.begin())
3611 cmd += ", ";
3612 cmd += "\"" + p->first + "=" + p->second + "\"";
3613 }
3614 cmd += "]}";
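// The assembled command looks like (values illustrative):
//   {"prefix": "osd crush create-or-move", "id": 3, "weight":3.6380,
//    "args": ["host=node1", "root=default"]}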
3615
3616 return mon_cmd_maybe_osd_create(cmd);
3617 }
3618
3619 int OSD::update_crush_device_class()
3620 {
3621 if (!cct->_conf->osd_class_update_on_start) {
3622 dout(10) << __func__ << " osd_class_update_on_start = false" << dendl;
3623 return 0;
3624 }
3625
3626 string device_class;
3627 int r = store->read_meta("crush_device_class", &device_class);
3628 if (r < 0 || device_class.empty()) {
3629 device_class = store->get_default_device_class();
3630 }
3631
3632 if (device_class.empty()) {
3633 dout(20) << __func__ << " no device class stored locally" << dendl;
3634 return 0;
3635 }
3636
3637 string cmd =
3638 string("{\"prefix\": \"osd crush set-device-class\", ") +
3639 string("\"class\": \"") + device_class + string("\", ") +
3640 string("\"ids\": [\"") + stringify(whoami) + string("\"]}");
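// e.g. (illustrative values):
//   {"prefix": "osd crush set-device-class", "class": "ssd", "ids": ["3"]}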
3641
3642 r = mon_cmd_maybe_osd_create(cmd);
3643 // the above cmd can fail for various reasons, e.g.:
3644 // (1) we are connecting to a pre-luminous monitor
3645 // (2) the user manually specified a device class other than the one
3646 // set via 'ceph-disk prepare --crush-device-class'
3647 // simply skip result-checking for now
3648 return 0;
3649 }
3650
3651 void OSD::write_superblock(ObjectStore::Transaction& t)
3652 {
3653 dout(10) << "write_superblock " << superblock << dendl;
3654
3655 //hack: at minimum it's using the baseline feature set
3656 if (!superblock.compat_features.incompat.contains(CEPH_OSD_FEATURE_INCOMPAT_BASE))
3657 superblock.compat_features.incompat.insert(CEPH_OSD_FEATURE_INCOMPAT_BASE);
3658
3659 bufferlist bl;
3660 ::encode(superblock, bl);
3661 t.write(coll_t::meta(), OSD_SUPERBLOCK_GOBJECT, 0, bl.length(), bl);
3662 }
3663
3664 int OSD::read_superblock()
3665 {
3666 bufferlist bl;
3667 int r = store->read(coll_t::meta(), OSD_SUPERBLOCK_GOBJECT, 0, 0, bl);
3668 if (r < 0)
3669 return r;
3670
3671 bufferlist::iterator p = bl.begin();
3672 ::decode(superblock, p);
3673
3674 dout(10) << "read_superblock " << superblock << dendl;
3675
3676 return 0;
3677 }
3678
3679 void OSD::clear_temp_objects()
3680 {
3681 dout(10) << __func__ << dendl;
3682 vector<coll_t> ls;
3683 store->list_collections(ls);
3684 for (vector<coll_t>::iterator p = ls.begin(); p != ls.end(); ++p) {
3685 spg_t pgid;
3686 if (!p->is_pg(&pgid))
3687 continue;
3688
3689 // list temp objects
3690 dout(20) << " clearing temps in " << *p << " pgid " << pgid << dendl;
3691
3692 vector<ghobject_t> temps;
3693 ghobject_t next;
3694 while (1) {
3695 vector<ghobject_t> objects;
3696 store->collection_list(*p, next, ghobject_t::get_max(),
3697 store->get_ideal_list_max(),
3698 &objects, &next);
3699 if (objects.empty())
3700 break;
3701 vector<ghobject_t>::iterator q;
3702 for (q = objects.begin(); q != objects.end(); ++q) {
3703 // Hammer-era OSDs set the pool of temp objects to -1, so clean those up too
3704 if (q->hobj.is_temp() || (q->hobj.pool == -1)) {
3705 temps.push_back(*q);
3706 } else {
3707 break;
3708 }
3709 }
3710 // If we saw a non-temp object and hit the break above we can
3711 // break out of the while loop too.
3712 if (q != objects.end())
3713 break;
3714 }
3715 if (!temps.empty()) {
3716 ObjectStore::Transaction t;
3717 int removed = 0;
3718 for (vector<ghobject_t>::iterator q = temps.begin(); q != temps.end(); ++q) {
3719 dout(20) << " removing " << *p << " object " << *q << dendl;
3720 t.remove(*p, *q);
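// apply in batches of osd_target_transaction_size (30 by default) so a
// pg full of temp objects doesn't turn into one enormous transaction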
3721 if (++removed > cct->_conf->osd_target_transaction_size) {
3722 store->apply_transaction(service.meta_osr.get(), std::move(t));
3723 t = ObjectStore::Transaction();
3724 removed = 0;
3725 }
3726 }
3727 if (removed) {
3728 store->apply_transaction(service.meta_osr.get(), std::move(t));
3729 }
3730 }
3731 }
3732 }
3733
3734 void OSD::recursive_remove_collection(CephContext* cct,
3735 ObjectStore *store, spg_t pgid,
3736 coll_t tmp)
3737 {
3738 OSDriver driver(
3739 store,
3740 coll_t(),
3741 make_snapmapper_oid());
3742
3743 ceph::shared_ptr<ObjectStore::Sequencer> osr (std::make_shared<
3744 ObjectStore::Sequencer>("rm"));
3745 ObjectStore::Transaction t;
3746 SnapMapper mapper(cct, &driver, 0, 0, 0, pgid.shard);
3747
3748 vector<ghobject_t> objects;
3749 store->collection_list(tmp, ghobject_t(), ghobject_t::get_max(),
3750 INT_MAX, &objects, 0);
3751 generic_dout(10) << __func__ << " " << objects << dendl;
3752 // delete them.
3753 int removed = 0;
3754 for (vector<ghobject_t>::iterator p = objects.begin();
3755 p != objects.end();
3756 ++p, removed++) {
3757 OSDriver::OSTransaction _t(driver.get_transaction(&t));
3758 int r = mapper.remove_oid(p->hobj, &_t);
3759 if (r != 0 && r != -ENOENT)
3760 ceph_abort();
3761 t.remove(tmp, *p);
3762 if (removed > cct->_conf->osd_target_transaction_size) {
3763 int r = store->apply_transaction(osr.get(), std::move(t));
3764 assert(r == 0);
3765 t = ObjectStore::Transaction();
3766 removed = 0;
3767 }
3768 }
3769 t.remove_collection(tmp);
3770 int r = store->apply_transaction(osr.get(), std::move(t));
3771 assert(r == 0);
3772
3773 C_SaferCond waiter;
3774 if (!osr->flush_commit(&waiter)) {
3775 waiter.wait();
3776 }
3777 }
3778
3779
3780 // ======================================================
3781 // PG's
3782
3783 PGPool OSD::_get_pool(int id, OSDMapRef createmap)
3784 {
3785 if (!createmap->have_pg_pool(id)) {
3786 dout(5) << __func__ << ": the OSDmap does not contain a PG pool with id = "
3787 << id << dendl;
3788 ceph_abort();
3789 }
3790
3791 PGPool p = PGPool(cct, createmap, id);
3792
3793 dout(10) << "_get_pool " << p.id << dendl;
3794 return p;
3795 }
3796
3797 PG *OSD::_open_lock_pg(
3798 OSDMapRef createmap,
3799 spg_t pgid, bool no_lockdep_check)
3800 {
3801 assert(osd_lock.is_locked());
3802
3803 PG* pg = _make_pg(createmap, pgid);
3804 {
3805 RWLock::WLocker l(pg_map_lock);
3806 pg->lock(no_lockdep_check);
3807 pg_map[pgid] = pg;
3808 pg->get("PGMap"); // because it's in pg_map
3809 service.pg_add_epoch(pg->info.pgid, createmap->get_epoch());
3810 }
3811 return pg;
3812 }
3813
3814 PG* OSD::_make_pg(
3815 OSDMapRef createmap,
3816 spg_t pgid)
3817 {
3818 dout(10) << "_make_pg " << pgid << dendl;
3819 PGPool pool = _get_pool(pgid.pool(), createmap);
3820
3821 // create
3822 PG *pg;
3823 if (createmap->get_pg_type(pgid.pgid) == pg_pool_t::TYPE_REPLICATED ||
3824 createmap->get_pg_type(pgid.pgid) == pg_pool_t::TYPE_ERASURE)
3825 pg = new PrimaryLogPG(&service, createmap, pool, pgid);
3826 else
3827 ceph_abort();
3828
3829 return pg;
3830 }
3831
3832
3833 void OSD::add_newly_split_pg(PG *pg, PG::RecoveryCtx *rctx)
3834 {
3835 epoch_t e(service.get_osdmap()->get_epoch());
3836 pg->get("PGMap"); // For pg_map
3837 pg_map[pg->info.pgid] = pg;
3838 service.pg_add_epoch(pg->info.pgid, pg->get_osdmap()->get_epoch());
3839
3840 dout(10) << "Adding newly split pg " << *pg << dendl;
3841 pg->handle_loaded(rctx);
3842 pg->write_if_dirty(*(rctx->transaction));
3843 pg->queue_null(e, e);
3844 map<spg_t, list<PG::CephPeeringEvtRef> >::iterator to_wake =
3845 peering_wait_for_split.find(pg->info.pgid);
3846 if (to_wake != peering_wait_for_split.end()) {
3847 for (list<PG::CephPeeringEvtRef>::iterator i =
3848 to_wake->second.begin();
3849 i != to_wake->second.end();
3850 ++i) {
3851 pg->queue_peering_event(*i);
3852 }
3853 peering_wait_for_split.erase(to_wake);
3854 }
3855 if (!service.get_osdmap()->have_pg_pool(pg->info.pgid.pool()))
3856 _remove_pg(pg);
3857 }
3858
3859 OSD::res_result OSD::_try_resurrect_pg(
3860 OSDMapRef curmap, spg_t pgid, spg_t *resurrected, PGRef *old_pg_state)
3861 {
3862 assert(resurrected);
3863 assert(old_pg_state);
3864 // find the nearest ancestor (walking pgid -> parent) that is being deleted
3865 DeletingStateRef df;
3866 spg_t cur(pgid);
3867 while (true) {
3868 df = service.deleting_pgs.lookup(cur);
3869 if (df)
3870 break;
3871 if (!cur.ps())
3872 break;
3873 cur = cur.get_parent();
3874 }
3875 if (!df)
3876 return RES_NONE; // good to go
3877
3878 df->old_pg_state->lock();
3879 OSDMapRef create_map = df->old_pg_state->get_osdmap();
3880 df->old_pg_state->unlock();
3881
3882 set<spg_t> children;
3883 if (cur == pgid) {
3884 if (df->try_stop_deletion()) {
3885 dout(10) << __func__ << ": halted deletion on pg " << pgid << dendl;
3886 *resurrected = cur;
3887 *old_pg_state = df->old_pg_state;
3888 service.deleting_pgs.remove(pgid); // PG is no longer being removed!
3889 return RES_SELF;
3890 } else {
3891 // raced, ensure we don't see DeletingStateRef when we try to
3892 // delete this pg
3893 service.deleting_pgs.remove(pgid);
3894 return RES_NONE;
3895 }
3896 } else if (cur.is_split(create_map->get_pg_num(cur.pool()),
3897 curmap->get_pg_num(cur.pool()),
3898 &children) &&
3899 children.count(pgid)) {
3900 if (df->try_stop_deletion()) {
3901 dout(10) << __func__ << ": halted deletion on ancestor pg " << pgid
3902 << dendl;
3903 *resurrected = cur;
3904 *old_pg_state = df->old_pg_state;
3905 service.deleting_pgs.remove(cur); // PG is no longer being removed!
3906 return RES_PARENT;
3907 } else {
3908 /* this is not a problem, failing to cancel proves that all objects
3909 * have been removed, so no hobject_t overlap is possible
3910 */
3911 return RES_NONE;
3912 }
3913 }
3914 return RES_NONE;
3915 }
3916
3917 PG *OSD::_create_lock_pg(
3918 OSDMapRef createmap,
3919 spg_t pgid,
3920 bool hold_map_lock,
3921 bool backfill,
3922 int role,
3923 vector<int>& up, int up_primary,
3924 vector<int>& acting, int acting_primary,
3925 pg_history_t history,
3926 const PastIntervals& pi,
3927 ObjectStore::Transaction& t)
3928 {
3929 assert(osd_lock.is_locked());
3930 dout(20) << "_create_lock_pg pgid " << pgid << dendl;
3931
3932 PG *pg = _open_lock_pg(createmap, pgid, true);
3933
3934 service.init_splits_between(pgid, pg->get_osdmap(), service.get_osdmap());
3935
3936 pg->init(
3937 role,
3938 up,
3939 up_primary,
3940 acting,
3941 acting_primary,
3942 history,
3943 pi,
3944 backfill,
3945 &t);
3946
3947 dout(7) << "_create_lock_pg " << *pg << dendl;
3948 return pg;
3949 }
3950
3951 PG *OSD::_lookup_lock_pg(spg_t pgid)
3952 {
3953 RWLock::RLocker l(pg_map_lock);
3954
3955 auto pg_map_entry = pg_map.find(pgid);
3956 if (pg_map_entry == pg_map.end())
3957 return nullptr;
3958 PG *pg = pg_map_entry->second;
3959 pg->lock();
3960 return pg;
3961 }
3962
3963 PG *OSD::lookup_lock_pg(spg_t pgid)
3964 {
3965 return _lookup_lock_pg(pgid);
3966 }
3967
3968 PG *OSD::_lookup_lock_pg_with_map_lock_held(spg_t pgid)
3969 {
3970 assert(pg_map.count(pgid));
3971 PG *pg = pg_map[pgid];
3972 pg->lock();
3973 return pg;
3974 }
3975
3976 void OSD::load_pgs()
3977 {
3978 assert(osd_lock.is_locked());
3979 dout(0) << "load_pgs" << dendl;
3980 {
3981 RWLock::RLocker l(pg_map_lock);
3982 assert(pg_map.empty());
3983 }
3984
3985 vector<coll_t> ls;
3986 int r = store->list_collections(ls);
3987 if (r < 0) {
3988 derr << "failed to list pgs: " << cpp_strerror(-r) << dendl;
3989 }
3990
3991 bool has_upgraded = false;
3992
3993 for (vector<coll_t>::iterator it = ls.begin();
3994 it != ls.end();
3995 ++it) {
3996 spg_t pgid;
3997 if (it->is_temp(&pgid) ||
3998 (it->is_pg(&pgid) && PG::_has_removal_flag(store, pgid))) {
3999 dout(10) << "load_pgs " << *it << " clearing temp" << dendl;
4000 recursive_remove_collection(cct, store, pgid, *it);
4001 continue;
4002 }
4003
4004 if (!it->is_pg(&pgid)) {
4005 dout(10) << "load_pgs ignoring unrecognized " << *it << dendl;
4006 continue;
4007 }
4008
4009 if (pgid.preferred() >= 0) {
4010 dout(10) << __func__ << ": skipping localized PG " << pgid << dendl;
4011 // FIXME: delete it too, eventually
4012 continue;
4013 }
4014
4015 dout(10) << "pgid " << pgid << " coll " << coll_t(pgid) << dendl;
4016 bufferlist bl;
4017 epoch_t map_epoch = 0;
4018 int r = PG::peek_map_epoch(store, pgid, &map_epoch, &bl);
4019 if (r < 0) {
4020 derr << __func__ << " unable to peek at " << pgid << " metadata, skipping"
4021 << dendl;
4022 continue;
4023 }
4024
4025 PG *pg = NULL;
4026 if (map_epoch > 0) {
4027 OSDMapRef pgosdmap = service.try_get_map(map_epoch);
4028 if (!pgosdmap) {
4029 if (!osdmap->have_pg_pool(pgid.pool())) {
4030 derr << __func__ << ": could not find map for epoch " << map_epoch
4031 << " on pg " << pgid << ", but the pool is not present in the "
4032 << "current map, so this is probably a result of bug 10617. "
4033 << "Skipping the pg for now, you can use ceph-objectstore-tool "
4034 << "to clean it up later." << dendl;
4035 continue;
4036 } else {
4037 derr << __func__ << ": have pgid " << pgid << " at epoch "
4038 << map_epoch << ", but missing map. Crashing."
4039 << dendl;
4040 assert(0 == "Missing map in load_pgs");
4041 }
4042 }
4043 pg = _open_lock_pg(pgosdmap, pgid);
4044 } else {
4045 pg = _open_lock_pg(osdmap, pgid);
4046 }
4047 // there can be no waiters here, so we don't call wake_pg_waiters
4048
4049 pg->ch = store->open_collection(pg->coll);
4050
4051 // read pg state, log
4052 pg->read_state(store, bl);
4053
4054 if (pg->must_upgrade()) {
4055 if (!pg->can_upgrade()) {
4056 derr << "PG needs upgrade, but on-disk data is too old; upgrade through"
4057 << " an intermediate release first." << dendl;
4058 assert(0 == "PG too old to upgrade");
4059 }
4060 if (!has_upgraded) {
4061 derr << "PGs are upgrading" << dendl;
4062 has_upgraded = true;
4063 }
4064 dout(10) << "PG " << pg->info.pgid
4065 << " must upgrade..." << dendl;
4066 pg->upgrade(store);
4067 }
4068
4069 if (pg->dne()) {
4070 dout(10) << "load_pgs " << *it << " deleting dne" << dendl;
4071 pg->ch = nullptr;
4072 service.pg_remove_epoch(pg->pg_id);
4073 pg->unlock();
4074 {
4075 // Delete pg
4076 RWLock::WLocker l(pg_map_lock);
4077 auto p = pg_map.find(pg->get_pgid());
4078 assert(p != pg_map.end() && p->second == pg);
4079 dout(20) << __func__ << " removed pg " << pg << " from pg_map" << dendl;
4080 pg_map.erase(p);
4081 pg->put("PGMap");
4082 }
4083 recursive_remove_collection(cct, store, pgid, *it);
4084 continue;
4085 }
4086
4087 service.init_splits_between(pg->info.pgid, pg->get_osdmap(), osdmap);
4088
4089 // generate state for PG's current mapping
4090 int primary, up_primary;
4091 vector<int> acting, up;
4092 pg->get_osdmap()->pg_to_up_acting_osds(
4093 pgid.pgid, &up, &up_primary, &acting, &primary);
4094 pg->init_primary_up_acting(
4095 up,
4096 acting,
4097 up_primary,
4098 primary);
4099 int role = OSDMap::calc_pg_role(whoami, pg->acting);
4100 if (pg->pool.info.is_replicated() || role == pg->pg_whoami.shard)
4101 pg->set_role(role);
4102 else
4103 pg->set_role(-1);
4104
4105 pg->reg_next_scrub();
4106
4107 PG::RecoveryCtx rctx(0, 0, 0, 0, 0, 0);
4108 pg->handle_loaded(&rctx);
4109
4110 dout(10) << "load_pgs loaded " << *pg << " " << pg->pg_log.get_log() << dendl;
4111 if (pg->pg_log.is_dirty()) {
4112 ObjectStore::Transaction t;
4113 pg->write_if_dirty(t);
4114 store->apply_transaction(pg->osr.get(), std::move(t));
4115 }
4116 pg->unlock();
4117 }
4118 {
4119 RWLock::RLocker l(pg_map_lock);
4120 dout(0) << "load_pgs opened " << pg_map.size() << " pgs" << dendl;
4121 }
4122
4123 // clean up old infos object?
4124 if (has_upgraded && store->exists(coll_t::meta(), OSD::make_infos_oid())) {
4125 dout(1) << __func__ << " removing legacy infos object" << dendl;
4126 ObjectStore::Transaction t;
4127 t.remove(coll_t::meta(), OSD::make_infos_oid());
4128 int r = store->apply_transaction(service.meta_osr.get(), std::move(t));
4129 if (r != 0) {
4130 derr << __func__ << ": apply_transaction returned "
4131 << cpp_strerror(r) << dendl;
4132 ceph_abort();
4133 }
4134 }
4135
4136 build_past_intervals_parallel();
4137 }
4138
4139
4140 /*
4141 * build past_intervals efficiently on old, degraded, and buried
4142 * clusters. this is important for efficiently catching up osds that
4143 * are way behind on maps to the current cluster state.
4144 *
4145 * this is a parallel version of PG::generate_past_intervals().
4146 * follow the same logic, but do all pgs at the same time so that we
4147 * can make a single pass across the osdmap history.
4148 */
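// In outline (a sketch of the pass below, not additional behavior):
//   for e in [oldest needed epoch .. newest needed epoch]:
//     for each pg whose past_intervals are incomplete:
//       compute up/acting for the pg under map e
//       if check_new_interval() reports a change, the interval
//       [same_interval_since, e) is recorded and a new one starts at e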
4149 void OSD::build_past_intervals_parallel()
4150 {
4151 struct pistate {
4152 epoch_t start, end;
4153 vector<int> old_acting, old_up;
4154 epoch_t same_interval_since;
4155 int primary;
4156 int up_primary;
4157 };
4158 map<PG*,pistate> pis;
4159
4160 // calculate junction of map range
4161 epoch_t end_epoch = superblock.oldest_map;
4162 epoch_t cur_epoch = superblock.newest_map;
4163 {
4164 RWLock::RLocker l(pg_map_lock);
4165 for (ceph::unordered_map<spg_t, PG*>::iterator i = pg_map.begin();
4166 i != pg_map.end();
4167 ++i) {
4168 PG *pg = i->second;
4169
4170 // Ignore PGs only partially created (DNE)
4171 if (pg->info.dne()) {
4172 continue;
4173 }
4174
4175 auto rpib = pg->get_required_past_interval_bounds(
4176 pg->info,
4177 superblock.oldest_map);
4178 if (rpib.first >= rpib.second && pg->past_intervals.empty()) {
4179 if (pg->info.history.same_interval_since == 0) {
4180 pg->info.history.same_interval_since = rpib.second;
4181 }
4182 continue;
4183 } else {
4184 auto apib = pg->past_intervals.get_bounds();
4185 if (apib.second >= rpib.second &&
4186 apib.first <= rpib.first) {
4187 if (pg->info.history.same_interval_since == 0) {
4188 pg->info.history.same_interval_since = rpib.second;
4189 }
4190 continue;
4191 }
4192 }
4193
4194 dout(10) << pg->info.pgid << " needs " << rpib.first << "-"
4195 << rpib.second << dendl;
4196 pistate& p = pis[pg];
4197 p.start = rpib.first;
4198 p.end = rpib.second;
4199 p.same_interval_since = 0;
4200
4201 if (rpib.first < cur_epoch)
4202 cur_epoch = rpib.first;
4203 if (rpib.second > end_epoch)
4204 end_epoch = rpib.second;
4205 }
4206 }
4207 if (pis.empty()) {
4208 dout(10) << __func__ << " nothing to build" << dendl;
4209 return;
4210 }
4211
4212 dout(1) << __func__ << " over " << cur_epoch << "-" << end_epoch << dendl;
4213 assert(cur_epoch <= end_epoch);
4214
4215 OSDMapRef cur_map, last_map;
4216 for ( ; cur_epoch <= end_epoch; cur_epoch++) {
4217 dout(10) << __func__ << " epoch " << cur_epoch << dendl;
4218 last_map = cur_map;
4219 cur_map = get_map(cur_epoch);
4220
4221 for (map<PG*,pistate>::iterator i = pis.begin(); i != pis.end(); ++i) {
4222 PG *pg = i->first;
4223 pistate& p = i->second;
4224
4225 if (cur_epoch < p.start || cur_epoch > p.end)
4226 continue;
4227
4228 vector<int> acting, up;
4229 int up_primary;
4230 int primary;
4231 pg_t pgid = pg->info.pgid.pgid;
4232 if (p.same_interval_since && last_map->get_pools().count(pgid.pool()))
4233 pgid = pgid.get_ancestor(last_map->get_pg_num(pgid.pool()));
4234 cur_map->pg_to_up_acting_osds(
4235 pgid, &up, &up_primary, &acting, &primary);
4236
4237 if (p.same_interval_since == 0) {
4238 dout(10) << __func__ << " epoch " << cur_epoch << " pg " << pg->info.pgid
4239 << " first map, acting " << acting
4240 << " up " << up << ", same_interval_since = " << cur_epoch << dendl;
4241 p.same_interval_since = cur_epoch;
4242 p.old_up = up;
4243 p.old_acting = acting;
4244 p.primary = primary;
4245 p.up_primary = up_primary;
4246 continue;
4247 }
4248 assert(last_map);
4249
4250 boost::scoped_ptr<IsPGRecoverablePredicate> recoverable(
4251 pg->get_is_recoverable_predicate());
4252 std::stringstream debug;
4253 bool new_interval = PastIntervals::check_new_interval(
4254 p.primary,
4255 primary,
4256 p.old_acting, acting,
4257 p.up_primary,
4258 up_primary,
4259 p.old_up, up,
4260 p.same_interval_since,
4261 pg->info.history.last_epoch_clean,
4262 cur_map, last_map,
4263 pgid,
4264 recoverable.get(),
4265 &pg->past_intervals,
4266 &debug);
4267 if (new_interval) {
4268 dout(10) << __func__ << " epoch " << cur_epoch << " pg " << pg->info.pgid
4269 << " " << debug.str() << dendl;
4270 p.old_up = up;
4271 p.old_acting = acting;
4272 p.primary = primary;
4273 p.up_primary = up_primary;
4274 p.same_interval_since = cur_epoch;
4275 }
4276 }
4277 }
4278
4279 // Now that past_intervals have been recomputed let's fix the same_interval_since
4280 // if it was cleared by import.
4281 for (map<PG*,pistate>::iterator i = pis.begin(); i != pis.end(); ++i) {
4282 PG *pg = i->first;
4283 pistate& p = i->second;
4284
4285 if (pg->info.history.same_interval_since == 0) {
4286 assert(p.same_interval_since);
4287 dout(10) << __func__ << " fix same_interval_since " << p.same_interval_since << " pg " << *pg << dendl;
4288 dout(10) << __func__ << " past_intervals " << pg->past_intervals << dendl;
4289 // Fix it
4290 pg->info.history.same_interval_since = p.same_interval_since;
4291 }
4292 }
4293
4294 // write info only at the end. this is necessary because we check
4295 // whether the past_intervals go far enough back or forward in time,
4296 // but we don't check for holes. we could avoid it by discarding
4297 // the previous past_intervals and rebuilding from scratch, or we
4298 // can just do this and commit all our work at the end.
4299 ObjectStore::Transaction t;
4300 int num = 0;
4301 for (map<PG*,pistate>::iterator i = pis.begin(); i != pis.end(); ++i) {
4302 PG *pg = i->first;
4303 pg->lock();
4304 pg->dirty_big_info = true;
4305 pg->dirty_info = true;
4306 pg->write_if_dirty(t);
4307 pg->unlock();
4308
4309 // don't let the transaction get too big
4310 if (++num >= cct->_conf->osd_target_transaction_size) {
4311 store->apply_transaction(service.meta_osr.get(), std::move(t));
4312 t = ObjectStore::Transaction();
4313 num = 0;
4314 }
4315 }
4316 if (!t.empty())
4317 store->apply_transaction(service.meta_osr.get(), std::move(t));
4318 }
4319
4320 /*
4321 * look up a pg. if we have it, great. if not, consider creating it IF the pg mapping
4322 * hasn't changed since the given epoch and we are the primary.
4323 */
4324 int OSD::handle_pg_peering_evt(
4325 spg_t pgid,
4326 const pg_history_t& orig_history,
4327 const PastIntervals& pi,
4328 epoch_t epoch,
4329 PG::CephPeeringEvtRef evt)
4330 {
4331 if (service.splitting(pgid)) {
4332 peering_wait_for_split[pgid].push_back(evt);
4333 return -EEXIST;
4334 }
4335
4336 PG *pg = _lookup_lock_pg(pgid);
4337 if (!pg) {
4338 // same primary?
4339 if (!osdmap->have_pg_pool(pgid.pool()))
4340 return -EINVAL;
4341 int up_primary, acting_primary;
4342 vector<int> up, acting;
4343 osdmap->pg_to_up_acting_osds(
4344 pgid.pgid, &up, &up_primary, &acting, &acting_primary);
4345
4346 pg_history_t history = orig_history;
4347 bool valid_history = project_pg_history(
4348 pgid, history, epoch, up, up_primary, acting, acting_primary);
4349
4350 if (!valid_history || epoch < history.same_interval_since) {
4351 dout(10) << __func__ << " " << pgid << " acting changed in "
4352 << history.same_interval_since << " (msg from " << epoch << ")"
4353 << dendl;
4354 return -EINVAL;
4355 }
4356
4357 if (service.splitting(pgid)) {
4358 ceph_abort();
4359 }
4360
4361 const bool is_mon_create =
4362 evt->get_event().dynamic_type() == PG::NullEvt::static_type();
4363 if (maybe_wait_for_max_pg(pgid, is_mon_create)) {
4364 return -EAGAIN;
4365 }
4366 // do we need to resurrect a deleting pg?
4367 spg_t resurrected;
4368 PGRef old_pg_state;
4369 res_result result = _try_resurrect_pg(
4370 service.get_osdmap(),
4371 pgid,
4372 &resurrected,
4373 &old_pg_state);
4374
4375 PG::RecoveryCtx rctx = create_context();
4376 switch (result) {
4377 case RES_NONE: {
4378 const pg_pool_t* pp = osdmap->get_pg_pool(pgid.pool());
4379 if (pp->has_flag(pg_pool_t::FLAG_EC_OVERWRITES) &&
4380 store->get_type() != "bluestore") {
4381 clog->warn() << "pg " << pgid
4382 << " is at risk of silent data corruption: "
4383 << "the pool allows ec overwrites but is not stored in "
4384 << "bluestore, so deep scrubbing will not detect bitrot";
4385 }
4386 PG::_create(*rctx.transaction, pgid, pgid.get_split_bits(pp->get_pg_num()));
4387 PG::_init(*rctx.transaction, pgid, pp);
4388
4389 int role = osdmap->calc_pg_role(whoami, acting, acting.size());
4390 if (!pp->is_replicated() && role != pgid.shard)
4391 role = -1;
4392
4393 pg = _create_lock_pg(
4394 get_map(epoch),
4395 pgid, false, false,
4396 role,
4397 up, up_primary,
4398 acting, acting_primary,
4399 history, pi,
4400 *rctx.transaction);
4401 pg->handle_create(&rctx);
4402 pg->write_if_dirty(*rctx.transaction);
4403 dispatch_context(rctx, pg, osdmap);
4404
4405 dout(10) << *pg << " is new" << dendl;
4406
4407 pg->queue_peering_event(evt);
4408 wake_pg_waiters(pg);
4409 pg->unlock();
4410 return 0;
4411 }
4412 case RES_SELF: {
4413 old_pg_state->lock();
4414 OSDMapRef old_osd_map = old_pg_state->get_osdmap();
4415 int old_role = old_pg_state->role;
4416 vector<int> old_up = old_pg_state->up;
4417 int old_up_primary = old_pg_state->up_primary.osd;
4418 vector<int> old_acting = old_pg_state->acting;
4419 int old_primary = old_pg_state->primary.osd;
4420 pg_history_t old_history = old_pg_state->info.history;
4421 PastIntervals old_past_intervals = old_pg_state->past_intervals;
4422 old_pg_state->unlock();
4423 pg = _create_lock_pg(
4424 old_osd_map,
4425 resurrected,
4426 false,
4427 true,
4428 old_role,
4429 old_up,
4430 old_up_primary,
4431 old_acting,
4432 old_primary,
4433 old_history,
4434 old_past_intervals,
4435 *rctx.transaction);
4436 pg->handle_create(&rctx);
4437 pg->write_if_dirty(*rctx.transaction);
4438 dispatch_context(rctx, pg, osdmap);
4439
4440 dout(10) << *pg << " is new (resurrected)" << dendl;
4441
4442 pg->queue_peering_event(evt);
4443 wake_pg_waiters(pg);
4444 pg->unlock();
4445 return 0;
4446 }
4447 case RES_PARENT: {
4448 assert(old_pg_state);
4449 old_pg_state->lock();
4450 OSDMapRef old_osd_map = old_pg_state->get_osdmap();
4451 int old_role = old_pg_state->role;
4452 vector<int> old_up = old_pg_state->up;
4453 int old_up_primary = old_pg_state->up_primary.osd;
4454 vector<int> old_acting = old_pg_state->acting;
4455 int old_primary = old_pg_state->primary.osd;
4456 pg_history_t old_history = old_pg_state->info.history;
4457 PastIntervals old_past_intervals = old_pg_state->past_intervals;
4458 old_pg_state->unlock();
4459 PG *parent = _create_lock_pg(
4460 old_osd_map,
4461 resurrected,
4462 false,
4463 true,
4464 old_role,
4465 old_up,
4466 old_up_primary,
4467 old_acting,
4468 old_primary,
4469 old_history,
4470 old_past_intervals,
4471 *rctx.transaction
4472 );
4473 parent->handle_create(&rctx);
4474 parent->write_if_dirty(*rctx.transaction);
4475 dispatch_context(rctx, parent, osdmap);
4476
4477 dout(10) << *parent << " is new" << dendl;
4478
4479 assert(service.splitting(pgid));
4480 peering_wait_for_split[pgid].push_back(evt);
4481
4482 //parent->queue_peering_event(evt);
4483 parent->queue_null(osdmap->get_epoch(), osdmap->get_epoch());
4484 wake_pg_waiters(parent);
4485 parent->unlock();
4486 return 0;
4487 }
4488 default:
4489 assert(0);
4490 return 0;
4491 }
4492 } else {
4493 // already had it. did the mapping change?
4494 if (epoch < pg->info.history.same_interval_since) {
4495 dout(10) << *pg << " " << __func__ << " acting changed in "
4496 << pg->info.history.same_interval_since
4497 << " (msg from " << epoch << ")" << dendl;
4498 } else {
4499 pg->queue_peering_event(evt);
4500 }
4501 pg->unlock();
4502 return -EEXIST;
4503 }
4504 }
4505
4506 bool OSD::maybe_wait_for_max_pg(spg_t pgid, bool is_mon_create)
4507 {
4508 const auto max_pgs_per_osd =
4509 (cct->_conf->get_val<uint64_t>("mon_max_pg_per_osd") *
4510 cct->_conf->get_val<double>("osd_max_pg_per_osd_hard_ratio"));
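// e.g. with mon_max_pg_per_osd = 200 and a hard ratio of 2.0 (typical
// luminous-era defaults), creation is withheld once this OSD holds 400 PGs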
4511
4512 RWLock::RLocker pg_map_locker{pg_map_lock};
4513 if (pg_map.size() < max_pgs_per_osd) {
4514 return false;
4515 }
4516 lock_guard<mutex> pending_creates_locker{pending_creates_lock};
4517 if (is_mon_create) {
4518 pending_creates_from_mon++;
4519 } else {
4520 bool is_primary = osdmap->get_pg_acting_rank(pgid.pgid, whoami) == 0;
4521 pending_creates_from_osd.emplace(pgid.pgid, is_primary);
4522 }
4523 dout(1) << __func__ << " withhold creation of pg " << pgid
4524 << ": " << pg_map.size() << " >= "<< max_pgs_per_osd << dendl;
4525 return true;
4526 }
4527
4528 // to re-trigger a peering, we have to twiddle the pg mapping a little bit,
4529 // see PG::should_restart_peering(). OSDMap::pg_to_up_acting_osds() falls back
4530 // to the up set if pg_temp is empty, so an empty pg_temp won't work.
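// e.g. (illustrative osd ids) twiddle({3,1,2}) -> {3}, while twiddle({3})
// -> {3,-1}; either way the mapping differs from the raw up set.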
4531 static vector<int32_t> twiddle(const vector<int>& acting) {
4532 if (acting.size() > 1) {
4533 return {acting[0]};
4534 } else {
4535 vector<int32_t> twiddled(acting.begin(), acting.end());
4536 twiddled.push_back(-1);
4537 return twiddled;
4538 }
4539 }
4540
4541 void OSD::resume_creating_pg()
4542 {
4543 bool do_sub_pg_creates = false;
4544 bool have_pending_creates = false;
4545 {
4546 const auto max_pgs_per_osd =
4547 (cct->_conf->get_val<uint64_t>("mon_max_pg_per_osd") *
4548 cct->_conf->get_val<double>("osd_max_pg_per_osd_hard_ratio"));
4549 RWLock::RLocker l(pg_map_lock);
4550 if (max_pgs_per_osd <= pg_map.size()) {
4551 // this could happen if the admin decreases this setting before a PG is removed
4552 return;
4553 }
4554 unsigned spare_pgs = max_pgs_per_osd - pg_map.size();
4555 lock_guard<mutex> pending_creates_locker{pending_creates_lock};
4556 if (pending_creates_from_mon > 0) {
4557 do_sub_pg_creates = true;
4558 if (pending_creates_from_mon >= spare_pgs) {
4559 spare_pgs = pending_creates_from_mon = 0;
4560 } else {
4561 spare_pgs -= pending_creates_from_mon;
4562 pending_creates_from_mon = 0;
4563 }
4564 }
4565 auto pg = pending_creates_from_osd.cbegin();
4566 while (spare_pgs > 0 && pg != pending_creates_from_osd.cend()) {
4567 dout(20) << __func__ << " pg " << pg->first << dendl;
4568 vector<int> acting;
4569 osdmap->pg_to_up_acting_osds(pg->first, nullptr, nullptr, &acting, nullptr);
4570 service.queue_want_pg_temp(pg->first, twiddle(acting), true);
4571 pg = pending_creates_from_osd.erase(pg);
4572 do_sub_pg_creates = true;
4573 spare_pgs--;
4574 }
4575 have_pending_creates = (pending_creates_from_mon > 0 ||
4576 !pending_creates_from_osd.empty());
4577 }
4578
4579 bool do_renew_subs = false;
4580 if (do_sub_pg_creates) {
4581 if (monc->sub_want("osd_pg_creates", last_pg_create_epoch, 0)) {
4582 dout(4) << __func__ << ": resolicit pg creates from mon since "
4583 << last_pg_create_epoch << dendl;
4584 do_renew_subs = true;
4585 }
4586 }
4587 version_t start = osdmap->get_epoch() + 1;
4588 if (have_pending_creates) {
4589 // don't miss any new osdmaps that might delete PGs
4590 if (monc->sub_want("osdmap", start, 0)) {
4591 dout(4) << __func__ << ": resolicit osdmap from mon since "
4592 << start << dendl;
4593 do_renew_subs = true;
4594 }
4595 } else if (do_sub_pg_creates) {
4596 // no need to keep subscribing to osdmap updates
4597 // once the pg_temp and/or mon_subscribe(pg_creates) has been sent
4598 if (monc->sub_want_increment("osdmap", start, CEPH_SUBSCRIBE_ONETIME)) {
4599 dout(4) << __func__ << ": re-subscribe osdmap(onetime) since "
4600 << start << dendl;
4601 do_renew_subs = true;
4602 }
4603 }
4604
4605 if (do_renew_subs) {
4606 monc->renew_subs();
4607 }
4608
4609 service.send_pg_temp();
4610 }
4611
4612 void OSD::build_initial_pg_history(
4613 spg_t pgid,
4614 epoch_t created,
4615 utime_t created_stamp,
4616 pg_history_t *h,
4617 PastIntervals *pi)
4618 {
4619 dout(10) << __func__ << " " << pgid << " created " << created << dendl;
4620 h->epoch_created = created;
4621 h->epoch_pool_created = created;
4622 h->same_interval_since = created;
4623 h->same_up_since = created;
4624 h->same_primary_since = created;
4625 h->last_scrub_stamp = created_stamp;
4626 h->last_deep_scrub_stamp = created_stamp;
4627 h->last_clean_scrub_stamp = created_stamp;
4628
4629 OSDMapRef lastmap = service.get_map(created);
4630 int up_primary, acting_primary;
4631 vector<int> up, acting;
4632 lastmap->pg_to_up_acting_osds(
4633 pgid.pgid, &up, &up_primary, &acting, &acting_primary);
4634
4635 ostringstream debug;
4636 for (epoch_t e = created + 1; e <= osdmap->get_epoch(); ++e) {
4637 OSDMapRef osdmap = service.get_map(e);
4638 int new_up_primary, new_acting_primary;
4639 vector<int> new_up, new_acting;
4640 osdmap->pg_to_up_acting_osds(
4641 pgid.pgid, &new_up, &new_up_primary, &new_acting, &new_acting_primary);
4642
4643 // this is a bit imprecise, but sufficient?
4644 struct min_size_predicate_t : public IsPGRecoverablePredicate {
4645 const pg_pool_t *pi;
4646 bool operator()(const set<pg_shard_t> &have) const {
4647 return have.size() >= pi->min_size;
4648 }
4649 min_size_predicate_t(const pg_pool_t *i) : pi(i) {}
4650 } min_size_predicate(osdmap->get_pg_pool(pgid.pgid.pool()));
4651
4652 bool new_interval = PastIntervals::check_new_interval(
4653 acting_primary,
4654 new_acting_primary,
4655 acting, new_acting,
4656 up_primary,
4657 new_up_primary,
4658 up, new_up,
4659 h->same_interval_since,
4660 h->last_epoch_clean,
4661 osdmap,
4662 lastmap,
4663 pgid.pgid,
4664 &min_size_predicate,
4665 pi,
4666 &debug);
4667 if (new_interval) {
4668 h->same_interval_since = e;
4669 if (up != new_up) {
4670 h->same_up_since = e;
4671 }
4672 if (acting_primary != new_acting_primary) {
4673 h->same_primary_since = e;
4674 }
4675 if (pgid.pgid.is_split(lastmap->get_pg_num(pgid.pgid.pool()),
4676 osdmap->get_pg_num(pgid.pgid.pool()),
4677 nullptr)) {
4678 h->last_epoch_split = e;
4679 }
4680 up = new_up;
4681 acting = new_acting;
4682 up_primary = new_up_primary;
4683 acting_primary = new_acting_primary;
4684 }
4685 lastmap = osdmap;
4686 }
4687 dout(20) << __func__ << " " << debug.str() << dendl;
4688 dout(10) << __func__ << " " << *h << " " << *pi
4689 << " [" << (pi->empty() ? pair<epoch_t,epoch_t>(0,0) :
4690 pi->get_bounds()) << ")"
4691 << dendl;
4692 }
4693
4694 /**
4695 * Fill in the passed history so you know same_interval_since, same_up_since,
4696 * and same_primary_since.
4697 */
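// Works backward from the newest map toward 'from', raising each floor when a
// change is found, and stops early once all three floors are at or above the
// epoch being examined.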
4698 bool OSD::project_pg_history(spg_t pgid, pg_history_t& h, epoch_t from,
4699 const vector<int>& currentup,
4700 int currentupprimary,
4701 const vector<int>& currentacting,
4702 int currentactingprimary)
4703 {
4704 dout(15) << "project_pg_history " << pgid
4705 << " from " << from << " to " << osdmap->get_epoch()
4706 << ", start " << h
4707 << dendl;
4708
4709 epoch_t e;
4710 for (e = osdmap->get_epoch();
4711 e > from;
4712 e--) {
4713 // verify during intermediate epoch (e-1)
4714 OSDMapRef oldmap = service.try_get_map(e-1);
4715 if (!oldmap) {
4716 dout(15) << __func__ << ": found map gap, returning false" << dendl;
4717 return false;
4718 }
4719 assert(oldmap->have_pg_pool(pgid.pool()));
4720
4721 int upprimary, actingprimary;
4722 vector<int> up, acting;
4723 oldmap->pg_to_up_acting_osds(
4724 pgid.pgid,
4725 &up,
4726 &upprimary,
4727 &acting,
4728 &actingprimary);
4729
4730 // acting set change?
4731 if ((actingprimary != currentactingprimary ||
4732 upprimary != currentupprimary ||
4733 acting != currentacting ||
4734 up != currentup) && e > h.same_interval_since) {
4735 dout(15) << "project_pg_history " << pgid << " acting|up changed in " << e
4736 << " from " << acting << "/" << up
4737 << " " << actingprimary << "/" << upprimary
4738 << " -> " << currentacting << "/" << currentup
4739 << " " << currentactingprimary << "/" << currentupprimary
4740 << dendl;
4741 h.same_interval_since = e;
4742 }
4743 // split?
4744 if (pgid.is_split(oldmap->get_pg_num(pgid.pool()),
4745 osdmap->get_pg_num(pgid.pool()),
4746 0) && e > h.same_interval_since) {
4747 h.same_interval_since = e;
4748 }
4749 // up set change?
4750 if ((up != currentup || upprimary != currentupprimary)
4751 && e > h.same_up_since) {
4752 dout(15) << "project_pg_history " << pgid << " up changed in " << e
4753 << " from " << up << " " << upprimary
4754 << " -> " << currentup << " " << currentupprimary << dendl;
4755 h.same_up_since = e;
4756 }
4757
4758 // primary change?
4759 if (OSDMap::primary_changed(
4760 actingprimary,
4761 acting,
4762 currentactingprimary,
4763 currentacting) &&
4764 e > h.same_primary_since) {
4765 dout(15) << "project_pg_history " << pgid << " primary changed in " << e << dendl;
4766 h.same_primary_since = e;
4767 }
4768
4769 if (h.same_interval_since >= e && h.same_up_since >= e && h.same_primary_since >= e)
4770 break;
4771 }
4772
4773 // base case: these floors should be the pg creation epoch if we didn't
4774 // find any changes.
4775 if (e == h.epoch_created) {
4776 if (!h.same_interval_since)
4777 h.same_interval_since = e;
4778 if (!h.same_up_since)
4779 h.same_up_since = e;
4780 if (!h.same_primary_since)
4781 h.same_primary_since = e;
4782 }
4783
4784 dout(15) << "project_pg_history end " << h << dendl;
4785 return true;
4786 }
4787
4788
4789
4790 void OSD::_add_heartbeat_peer(int p)
4791 {
4792 if (p == whoami)
4793 return;
4794 HeartbeatInfo *hi;
4795
4796 map<int,HeartbeatInfo>::iterator i = heartbeat_peers.find(p);
4797 if (i == heartbeat_peers.end()) {
4798 pair<ConnectionRef,ConnectionRef> cons = service.get_con_osd_hb(p, osdmap->get_epoch());
4799 if (!cons.first)
4800 return;
4801 hi = &heartbeat_peers[p];
4802 hi->peer = p;
4803 HeartbeatSession *s = new HeartbeatSession(p);
4804 hi->con_back = cons.first.get();
4805 hi->con_back->set_priv(s->get());
4806 if (cons.second) {
4807 hi->con_front = cons.second.get();
4808 hi->con_front->set_priv(s->get());
4809 dout(10) << "_add_heartbeat_peer: new peer osd." << p
4810 << " " << hi->con_back->get_peer_addr()
4811 << " " << hi->con_front->get_peer_addr()
4812 << dendl;
4813 } else {
4814 hi->con_front.reset(NULL);
4815 dout(10) << "_add_heartbeat_peer: new peer osd." << p
4816 << " " << hi->con_back->get_peer_addr()
4817 << dendl;
4818 }
4819 s->put();
4820 } else {
4821 hi = &i->second;
4822 }
4823 hi->epoch = osdmap->get_epoch();
4824 }
4825
4826 void OSD::_remove_heartbeat_peer(int n)
4827 {
4828 map<int,HeartbeatInfo>::iterator q = heartbeat_peers.find(n);
4829 assert(q != heartbeat_peers.end());
4830 dout(20) << " removing heartbeat peer osd." << n
4831 << " " << q->second.con_back->get_peer_addr()
4832 << " " << (q->second.con_front ? q->second.con_front->get_peer_addr() : entity_addr_t())
4833 << dendl;
4834 q->second.con_back->mark_down();
4835 if (q->second.con_front) {
4836 q->second.con_front->mark_down();
4837 }
4838 heartbeat_peers.erase(q);
4839 }
4840
4841 void OSD::need_heartbeat_peer_update()
4842 {
4843 if (is_stopping())
4844 return;
4845 dout(20) << "need_heartbeat_peer_update" << dendl;
4846 heartbeat_set_peers_need_update();
4847 }
4848
4849 void OSD::maybe_update_heartbeat_peers()
4850 {
4851 assert(osd_lock.is_locked());
4852
4853 if (is_waiting_for_healthy()) {
4854 utime_t now = ceph_clock_now();
4855 if (last_heartbeat_resample == utime_t()) {
4856 last_heartbeat_resample = now;
4857 heartbeat_set_peers_need_update();
4858 } else if (!heartbeat_peers_need_update()) {
4859 utime_t dur = now - last_heartbeat_resample;
4860 if (dur > cct->_conf->osd_heartbeat_grace) {
4861 dout(10) << "maybe_update_heartbeat_peers forcing update after " << dur << " seconds" << dendl;
4862 heartbeat_set_peers_need_update();
4863 last_heartbeat_resample = now;
4864 reset_heartbeat_peers(); // we want *new* peers!
4865 }
4866 }
4867 }
4868
4869 if (!heartbeat_peers_need_update())
4870 return;
4871 heartbeat_clear_peers_need_update();
4872
4873 Mutex::Locker l(heartbeat_lock);
4874
4875 dout(10) << "maybe_update_heartbeat_peers updating" << dendl;
4876
4877
4878 // build heartbeat from set
4879 if (is_active()) {
4880 RWLock::RLocker l(pg_map_lock);
4881 for (ceph::unordered_map<spg_t, PG*>::iterator i = pg_map.begin();
4882 i != pg_map.end();
4883 ++i) {
4884 PG *pg = i->second;
4885 pg->heartbeat_peer_lock.Lock();
4886 dout(20) << i->first << " heartbeat_peers " << pg->heartbeat_peers << dendl;
4887 for (set<int>::iterator p = pg->heartbeat_peers.begin();
4888 p != pg->heartbeat_peers.end();
4889 ++p)
4890 if (osdmap->is_up(*p))
4891 _add_heartbeat_peer(*p);
4892 for (set<int>::iterator p = pg->probe_targets.begin();
4893 p != pg->probe_targets.end();
4894 ++p)
4895 if (osdmap->is_up(*p))
4896 _add_heartbeat_peer(*p);
4897 pg->heartbeat_peer_lock.Unlock();
4898 }
4899 }
4900
4901 // include next and previous up osds to ensure we have a fully-connected set
4902 set<int> want, extras;
4903 int next = osdmap->get_next_up_osd_after(whoami);
4904 if (next >= 0)
4905 want.insert(next);
4906 int prev = osdmap->get_previous_up_osd_before(whoami);
4907 if (prev >= 0 && prev != next)
4908 want.insert(prev);
4909
4910 for (set<int>::iterator p = want.begin(); p != want.end(); ++p) {
4911 dout(10) << " adding neighbor peer osd." << *p << dendl;
4912 extras.insert(*p);
4913 _add_heartbeat_peer(*p);
4914 }
4915
4916 // remove down peers; enumerate extras
4917 map<int,HeartbeatInfo>::iterator p = heartbeat_peers.begin();
4918 while (p != heartbeat_peers.end()) {
4919 if (!osdmap->is_up(p->first)) {
4920 int o = p->first;
4921 ++p;
4922 _remove_heartbeat_peer(o);
4923 continue;
4924 }
4925 if (p->second.epoch < osdmap->get_epoch()) {
4926 extras.insert(p->first);
4927 }
4928 ++p;
4929 }
4930
4931 // too few?
4932 int start = osdmap->get_next_up_osd_after(whoami);
4933 for (int n = start; n >= 0; ) {
4934 if ((int)heartbeat_peers.size() >= cct->_conf->osd_heartbeat_min_peers)
4935 break;
4936 if (!extras.count(n) && !want.count(n) && n != whoami) {
4937 dout(10) << " adding random peer osd." << n << dendl;
4938 extras.insert(n);
4939 _add_heartbeat_peer(n);
4940 }
4941 n = osdmap->get_next_up_osd_after(n);
4942 if (n == start)
4943 break; // came full circle; stop
4944 }
4945
4946 // too many?
4947 for (set<int>::iterator p = extras.begin();
4948 (int)heartbeat_peers.size() > cct->_conf->osd_heartbeat_min_peers && p != extras.end();
4949 ++p) {
4950 if (want.count(*p))
4951 continue;
4952 _remove_heartbeat_peer(*p);
4953 }
4954
4955 dout(10) << "maybe_update_heartbeat_peers " << heartbeat_peers.size() << " peers, extras " << extras << dendl;
4956 }
4957
4958 void OSD::reset_heartbeat_peers()
4959 {
4960 assert(osd_lock.is_locked());
4961 dout(10) << "reset_heartbeat_peers" << dendl;
4962 Mutex::Locker l(heartbeat_lock);
4963 while (!heartbeat_peers.empty()) {
4964 HeartbeatInfo& hi = heartbeat_peers.begin()->second;
4965 hi.con_back->mark_down();
4966 if (hi.con_front) {
4967 hi.con_front->mark_down();
4968 }
4969 heartbeat_peers.erase(heartbeat_peers.begin());
4970 }
4971 failure_queue.clear();
4972 }
4973
4974 void OSD::handle_osd_ping(MOSDPing *m)
4975 {
4976 if (superblock.cluster_fsid != m->fsid) {
4977 dout(20) << "handle_osd_ping from " << m->get_source_inst()
4978 << " bad fsid " << m->fsid << " != " << superblock.cluster_fsid << dendl;
4979 m->put();
4980 return;
4981 }
4982
4983 int from = m->get_source().num();
4984
4985 heartbeat_lock.Lock();
4986 if (is_stopping()) {
4987 heartbeat_lock.Unlock();
4988 m->put();
4989 return;
4990 }
4991
4992 OSDMapRef curmap = service.get_osdmap();
4993 if (!curmap) {
4994 heartbeat_lock.Unlock();
4995 m->put();
4996 return;
4997 }
4998
4999 switch (m->op) {
5000
5001 case MOSDPing::PING:
5002 {
5003 if (cct->_conf->osd_debug_drop_ping_probability > 0) {
5004 auto heartbeat_drop = debug_heartbeat_drops_remaining.find(from);
5005 if (heartbeat_drop != debug_heartbeat_drops_remaining.end()) {
5006 if (heartbeat_drop->second == 0) {
5007 debug_heartbeat_drops_remaining.erase(heartbeat_drop);
5008 } else {
5009 --heartbeat_drop->second;
5010 dout(5) << "Dropping heartbeat from " << from
5011 << ", " << heartbeat_drop->second
5012 << " remaining to drop" << dendl;
5013 break;
5014 }
5015 } else if (cct->_conf->osd_debug_drop_ping_probability >
5016 ((((double)(rand()%100))/100.0))) {
5017 heartbeat_drop =
5018 debug_heartbeat_drops_remaining.insert(std::make_pair(from,
5019 cct->_conf->osd_debug_drop_ping_duration)).first;
5020 dout(5) << "Dropping heartbeat from " << from
5021 << ", " << heartbeat_drop->second
5022 << " remaining to drop" << dendl;
5023 break;
5024 }
5025 }
5026
5027 if (!cct->get_heartbeat_map()->is_healthy()) {
5028 dout(10) << "internal heartbeat not healthy, dropping ping request" << dendl;
5029 break;
5030 }
5031
5032 Message *r = new MOSDPing(monc->get_fsid(),
5033 curmap->get_epoch(),
5034 MOSDPing::PING_REPLY, m->stamp,
5035 cct->_conf->osd_heartbeat_min_size);
5036 m->get_connection()->send_message(r);
5037
5038 if (curmap->is_up(from)) {
5039 service.note_peer_epoch(from, m->map_epoch);
5040 if (is_active()) {
5041 ConnectionRef con = service.get_con_osd_cluster(from, curmap->get_epoch());
5042 if (con) {
5043 service.share_map_peer(from, con.get());
5044 }
5045 }
5046 } else if (!curmap->exists(from) ||
5047 curmap->get_down_at(from) > m->map_epoch) {
5048 // tell them they have died
5049 Message *r = new MOSDPing(monc->get_fsid(),
5050 curmap->get_epoch(),
5051 MOSDPing::YOU_DIED,
5052 m->stamp,
5053 cct->_conf->osd_heartbeat_min_size);
5054 m->get_connection()->send_message(r);
5055 }
5056 }
5057 break;
5058
5059 case MOSDPing::PING_REPLY:
5060 {
5061 map<int,HeartbeatInfo>::iterator i = heartbeat_peers.find(from);
5062 if (i != heartbeat_peers.end()) {
5063 if (m->get_connection() == i->second.con_back) {
5064 dout(25) << "handle_osd_ping got reply from osd." << from
5065 << " first_tx " << i->second.first_tx
5066 << " last_tx " << i->second.last_tx
5067 << " last_rx_back " << i->second.last_rx_back << " -> " << m->stamp
5068 << " last_rx_front " << i->second.last_rx_front
5069 << dendl;
5070 i->second.last_rx_back = m->stamp;
5071 // if there is no front con, set both stamps.
5072 if (i->second.con_front == NULL)
5073 i->second.last_rx_front = m->stamp;
5074 } else if (m->get_connection() == i->second.con_front) {
5075 dout(25) << "handle_osd_ping got reply from osd." << from
5076 << " first_tx " << i->second.first_tx
5077 << " last_tx " << i->second.last_tx
5078 << " last_rx_back " << i->second.last_rx_back
5079 << " last_rx_front " << i->second.last_rx_front << " -> " << m->stamp
5080 << dendl;
5081 i->second.last_rx_front = m->stamp;
5082 }
5083
5084 utime_t cutoff = ceph_clock_now();
5085 cutoff -= cct->_conf->osd_heartbeat_grace;
5086 if (i->second.is_healthy(cutoff)) {
5087 // Cancel false reports
5088 auto failure_queue_entry = failure_queue.find(from);
5089 if (failure_queue_entry != failure_queue.end()) {
5090 dout(10) << "handle_osd_ping canceling queued "
5091 << "failure report for osd." << from << dendl;
5092 failure_queue.erase(failure_queue_entry);
5093 }
5094
5095 auto failure_pending_entry = failure_pending.find(from);
5096 if (failure_pending_entry != failure_pending.end()) {
5097 dout(10) << "handle_osd_ping canceling in-flight "
5098 << "failure report for osd." << from << dendl;
5099 send_still_alive(curmap->get_epoch(),
5100 failure_pending_entry->second.second);
5101 failure_pending.erase(failure_pending_entry);
5102 }
5103 }
5104 }
5105
5106 if (m->map_epoch &&
5107 curmap->is_up(from)) {
5108 service.note_peer_epoch(from, m->map_epoch);
5109 if (is_active()) {
5110 ConnectionRef con = service.get_con_osd_cluster(from, curmap->get_epoch());
5111 if (con) {
5112 service.share_map_peer(from, con.get());
5113 }
5114 }
5115 }
5116 }
5117 break;
5118
5119 case MOSDPing::YOU_DIED:
5120 dout(10) << "handle_osd_ping " << m->get_source_inst()
5121 << " says i am down in " << m->map_epoch << dendl;
5122 osdmap_subscribe(curmap->get_epoch()+1, false);
5123 break;
5124 }
5125
5126 heartbeat_lock.Unlock();
5127 m->put();
5128 }
5129
5130 void OSD::heartbeat_entry()
5131 {
5132 Mutex::Locker l(heartbeat_lock);
5133 if (is_stopping())
5134 return;
5135 while (!heartbeat_stop) {
5136 heartbeat();
5137
5138 double wait = .5 + ((float)(rand() % 10)/10.0) * (float)cct->_conf->osd_heartbeat_interval;
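// e.g. with the default osd_heartbeat_interval of 6 this sleeps for a
// random 0.5 .. 5.9 seconds between rounds of pings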
5139 utime_t w;
5140 w.set_from_double(wait);
5141 dout(30) << "heartbeat_entry sleeping for " << wait << dendl;
5142 heartbeat_cond.WaitInterval(heartbeat_lock, w);
5143 if (is_stopping())
5144 return;
5145 dout(30) << "heartbeat_entry woke up" << dendl;
5146 }
5147 }
5148
5149 void OSD::heartbeat_check()
5150 {
5151 assert(heartbeat_lock.is_locked());
5152 utime_t now = ceph_clock_now();
5153
5154 // check for heartbeat replies (move me elsewhere?)
5155 utime_t cutoff = now;
5156 cutoff -= cct->_conf->osd_heartbeat_grace;
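// peers with no reply since 'cutoff' (now - osd_heartbeat_grace, 20 seconds
// by default) are queued below as failure reports for the monitor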
5157 for (map<int,HeartbeatInfo>::iterator p = heartbeat_peers.begin();
5158 p != heartbeat_peers.end();
5159 ++p) {
5160
5161 if (p->second.first_tx == utime_t()) {
5162 dout(25) << "heartbeat_check we haven't sent ping to osd." << p->first
5163 << "yet, skipping" << dendl;
5164 continue;
5165 }
5166
5167 dout(25) << "heartbeat_check osd." << p->first
5168 << " first_tx " << p->second.first_tx
5169 << " last_tx " << p->second.last_tx
5170 << " last_rx_back " << p->second.last_rx_back
5171 << " last_rx_front " << p->second.last_rx_front
5172 << dendl;
5173 if (p->second.is_unhealthy(cutoff)) {
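// note: the messages below assume con_front is set; a peer without a
// front heartbeat connection would arguably need con_back here instead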
5174 if (p->second.last_rx_back == utime_t() ||
5175 p->second.last_rx_front == utime_t()) {
5176 derr << "heartbeat_check: no reply from " << p->second.con_front->get_peer_addr().get_sockaddr()
5177 << " osd." << p->first << " ever on either front or back, first ping sent "
5178 << p->second.first_tx << " (cutoff " << cutoff << ")" << dendl;
5179 // fail
5180 failure_queue[p->first] = p->second.last_tx;
5181 } else {
5182 derr << "heartbeat_check: no reply from " << p->second.con_front->get_peer_addr().get_sockaddr()
5183 << " osd." << p->first << " since back " << p->second.last_rx_back
5184 << " front " << p->second.last_rx_front
5185 << " (cutoff " << cutoff << ")" << dendl;
5186 // fail
5187 failure_queue[p->first] = MIN(p->second.last_rx_back, p->second.last_rx_front);
5188 }
5189 }
5190 }
5191 }
5192
5193 void OSD::heartbeat()
5194 {
5195 dout(30) << "heartbeat" << dendl;
5196
5197 // get CPU load avg
5198 double loadavgs[1];
5199 int n_samples = 86400 / cct->_conf->osd_heartbeat_interval;
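// e.g. osd_heartbeat_interval = 6 gives 14400 samples/day, so each reading
// below carries a 1/14400 weight -- approximating a one-day moving average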
5200 if (getloadavg(loadavgs, 1) == 1) {
5201 logger->set(l_osd_loadavg, 100 * loadavgs[0]);
5202 daily_loadavg = (daily_loadavg * (n_samples - 1) + loadavgs[0]) / n_samples;
5203 dout(30) << "heartbeat: daily_loadavg " << daily_loadavg << dendl;
5204 }
5205
5206 dout(30) << "heartbeat checking stats" << dendl;
5207
5208 // refresh stats?
5209 vector<int> hb_peers;
5210 for (map<int,HeartbeatInfo>::iterator p = heartbeat_peers.begin();
5211 p != heartbeat_peers.end();
5212 ++p)
5213 hb_peers.push_back(p->first);
5214 service.update_osd_stat(hb_peers);
5215
5216 dout(5) << "heartbeat: " << service.get_osd_stat() << dendl;
5217
5218 utime_t now = ceph_clock_now();
5219
5220 // send heartbeats
5221 for (map<int,HeartbeatInfo>::iterator i = heartbeat_peers.begin();
5222 i != heartbeat_peers.end();
5223 ++i) {
5224 int peer = i->first;
5225 i->second.last_tx = now;
5226 if (i->second.first_tx == utime_t())
5227 i->second.first_tx = now;
5228 dout(30) << "heartbeat sending ping to osd." << peer << dendl;
5229 i->second.con_back->send_message(new MOSDPing(monc->get_fsid(),
5230 service.get_osdmap()->get_epoch(),
5231 MOSDPing::PING, now,
5232 cct->_conf->osd_heartbeat_min_size));
5233
5234 if (i->second.con_front)
5235 i->second.con_front->send_message(new MOSDPing(monc->get_fsid(),
5236 service.get_osdmap()->get_epoch(),
5237 MOSDPing::PING, now,
5238 cct->_conf->osd_heartbeat_min_size));
5239 }
5240
5241 logger->set(l_osd_hb_to, heartbeat_peers.size());
5242
5243 // hmm.. am i all alone?
5244 dout(30) << "heartbeat lonely?" << dendl;
5245 if (heartbeat_peers.empty()) {
5246 if (now - last_mon_heartbeat > cct->_conf->osd_mon_heartbeat_interval && is_active()) {
5247 last_mon_heartbeat = now;
5248 dout(10) << "i have no heartbeat peers; checking mon for new map" << dendl;
5249 osdmap_subscribe(osdmap->get_epoch() + 1, false);
5250 }
5251 }
5252
5253 dout(30) << "heartbeat done" << dendl;
5254 }
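// Worked example for the daily load average above: with
// osd_heartbeat_interval = 6, n_samples = 86400 / 6 = 14400, so each pass
// computes
//
//   daily_loadavg = (daily_loadavg * 14399 + loadavgs[0]) / 14400;
//   //            == daily_loadavg + (loadavgs[0] - daily_loadavg) / 14400
//
// i.e. an exponentially weighted mean whose effective window is one day
// of heartbeat samples.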
5255
5256 bool OSD::heartbeat_reset(Connection *con)
5257 {
5258 HeartbeatSession *s = static_cast<HeartbeatSession*>(con->get_priv());
5259 if (s) {
5260 heartbeat_lock.Lock();
5261 if (is_stopping()) {
5262 heartbeat_lock.Unlock();
5263 s->put();
5264 return true;
5265 }
5266 map<int,HeartbeatInfo>::iterator p = heartbeat_peers.find(s->peer);
5267 if (p != heartbeat_peers.end() &&
5268 (p->second.con_back == con ||
5269 p->second.con_front == con)) {
5270 dout(10) << "heartbeat_reset failed hb con " << con << " for osd." << p->second.peer
5271 << ", reopening" << dendl;
5272 if (con != p->second.con_back) {
5273 p->second.con_back->mark_down();
5274 }
5275 p->second.con_back.reset(NULL);
5276 if (p->second.con_front && con != p->second.con_front) {
5277 p->second.con_front->mark_down();
5278 }
5279 p->second.con_front.reset(NULL);
5280 pair<ConnectionRef,ConnectionRef> newcon = service.get_con_osd_hb(p->second.peer, p->second.epoch);
5281 if (newcon.first) {
5282 p->second.con_back = newcon.first.get();
5283 p->second.con_back->set_priv(s->get());
5284 if (newcon.second) {
5285 p->second.con_front = newcon.second.get();
5286 p->second.con_front->set_priv(s->get());
5287 }
5288 } else {
5289 dout(10) << "heartbeat_reset failed hb con " << con << " for osd." << p->second.peer
5290 << ", raced with osdmap update, closing out peer" << dendl;
5291 heartbeat_peers.erase(p);
5292 }
5293 } else {
5294 dout(10) << "heartbeat_reset closing (old) failed hb con " << con << dendl;
5295 }
5296 heartbeat_lock.Unlock();
5297 s->put();
5298 }
5299 return true;
5300 }
5301
5302
5303
5304 // =========================================
5305
5306 void OSD::tick()
5307 {
5308 assert(osd_lock.is_locked());
5309 dout(10) << "tick" << dendl;
5310
5311 if (is_active() || is_waiting_for_healthy()) {
5312 maybe_update_heartbeat_peers();
5313 }
5314
5315 if (is_waiting_for_healthy()) {
5316 start_boot();
5317 } else if (is_preboot() &&
5318 waiting_for_luminous_mons &&
5319 monc->monmap.get_required_features().contains_all(
5320 ceph::features::mon::FEATURE_LUMINOUS)) {
5321 // mon upgrade finished!
5322 start_boot();
5323 }
5324
5325 do_waiters();
5326
5327 tick_timer.add_event_after(get_tick_interval(), new C_Tick(this));
5328 }
5329
5330 void OSD::tick_without_osd_lock()
5331 {
5332 assert(tick_timer_lock.is_locked());
5333 dout(10) << "tick_without_osd_lock" << dendl;
5334
5335 logger->set(l_osd_buf, buffer::get_total_alloc());
5336 logger->set(l_osd_history_alloc_bytes, SHIFT_ROUND_UP(buffer::get_history_alloc_bytes(), 20));
5337 logger->set(l_osd_history_alloc_num, buffer::get_history_alloc_num());
5338 logger->set(l_osd_cached_crc, buffer::get_cached_crc());
5339 logger->set(l_osd_cached_crc_adjusted, buffer::get_cached_crc_adjusted());
5340 logger->set(l_osd_missed_crc, buffer::get_missed_crc());
5341 logger->set(l_osd_pg_removing, remove_wq.get_remove_queue_len());
5342
5343 // osd_lock is not being held, which means the OSD state
5344 // might change when doing the monitor report
5345 if (is_active() || is_waiting_for_healthy()) {
5346 heartbeat_lock.Lock();
5347 heartbeat_check();
5348 heartbeat_lock.Unlock();
5349
5350 map_lock.get_read();
5351 Mutex::Locker l(mon_report_lock);
5352
5353 // mon report?
5354 bool reset = false;
5355 bool report = false;
5356 utime_t now = ceph_clock_now();
5357 pg_stat_queue_lock.Lock();
5358 double backoff = stats_ack_timeout / cct->_conf->osd_mon_ack_timeout;
5359 double adjusted_min = cct->_conf->osd_mon_report_interval_min * backoff;
5360 // note: we shouldn't adjust max because it must remain < the
5361 // mon's mon_osd_report_timeout (which defaults to 1.5x our
5362 // value).
5363 double max = cct->_conf->osd_mon_report_interval_max;
5364 if (!outstanding_pg_stats.empty() &&
5365 (now - stats_ack_timeout) > last_pg_stats_ack) {
5366 dout(1) << __func__ << " mon hasn't acked PGStats in "
5367 << now - last_pg_stats_ack
5368 << " seconds, reconnecting elsewhere" << dendl;
5369 reset = true;
5370 last_pg_stats_ack = now; // reset clock
5371 last_pg_stats_sent = utime_t();
5372 stats_ack_timeout =
5373 MAX(cct->_conf->osd_mon_ack_timeout,
5374 stats_ack_timeout * cct->_conf->osd_stats_ack_timeout_factor);
5375 outstanding_pg_stats.clear();
5376 }
5377 if (now - last_pg_stats_sent > max) {
5378 osd_stat_updated = true;
5379 report = true;
5380 } else if (service.need_fullness_update()) {
5381 report = true;
5382 } else if ((int)outstanding_pg_stats.size() >=
5383 cct->_conf->osd_mon_report_max_in_flight) {
5384 dout(20) << __func__ << " have max " << outstanding_pg_stats
5385 << " stats updates in flight" << dendl;
5386 } else {
5387 if (now - last_mon_report > adjusted_min) {
5388 dout(20) << __func__ << " stats backoff " << backoff
5389 << " adjusted_min " << adjusted_min << " - sending report"
5390 << dendl;
5391 osd_stat_updated = true;
5392 report = true;
5393 }
5394 }
5395 pg_stat_queue_lock.Unlock();
5396
5397 if (reset) {
5398 monc->reopen_session();
5399 } else if (report) {
5400 last_mon_report = now;
5401
5402 // do any pending reports
5403 send_full_update();
5404 send_failures();
5405 if (osdmap->require_osd_release < CEPH_RELEASE_LUMINOUS) {
5406 send_pg_stats(now);
5407 }
5408 }
5409 map_lock.put_read();
5410 }
5411
5412 if (is_active()) {
5413 if (!scrub_random_backoff()) {
5414 sched_scrub();
5415 }
5416 service.promote_throttle_recalibrate();
5417 resume_creating_pg();
5418 bool need_send_beacon = false;
5419 const auto now = ceph::coarse_mono_clock::now();
5420 {
5421 // borrow lec lock to protect last_sent_beacon from changing
5422 Mutex::Locker l{min_last_epoch_clean_lock};
5423 const auto elapsed = now - last_sent_beacon;
5424 if (chrono::duration_cast<chrono::seconds>(elapsed).count() >
5425 cct->_conf->osd_beacon_report_interval) {
5426 need_send_beacon = true;
5427 }
5428 }
5429 if (need_send_beacon) {
5430 send_beacon(now);
5431 }
5432 }
5433
5434 mgrc.update_osd_health(get_health_metrics());
5435 service.kick_recovery_queue();
5436 tick_timer_without_osd_lock.add_event_after(get_tick_interval(),
5437 new C_Tick_WithoutOSDLock(this));
5438 }
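// Worked example of the report pacing above, using assumed defaults
// (osd_mon_ack_timeout = 30, osd_mon_report_interval_min = 5,
// osd_stats_ack_timeout_factor = 2):
//
//   // initially:  stats_ack_timeout = 30, backoff = 1, adjusted_min = 5s
//   // missed ack: stats_ack_timeout = MAX(30, 30 * 2) = 60, the mon
//   //             session is reopened, backoff = 2, adjusted_min = 10s
//
// A mon that stops acking therefore pushes us to another mon and halves
// our unforced reporting rate until acks resume; handle_pg_stats_ack()
// decays the timeout back toward the floor.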
5439
5440 void OSD::check_ops_in_flight()
5441 {
5442 vector<string> warnings;
5443 if (op_tracker.check_ops_in_flight(warnings)) {
5444 for (vector<string>::iterator i = warnings.begin();
5445 i != warnings.end();
5446 ++i) {
5447 clog->warn() << *i;
5448 }
5449 }
5450 }
5451
5452 // Usage:
5453 // setomapval <pool-id> [namespace/]<obj-name> <key> <val>
5454 // rmomapkey <pool-id> [namespace/]<obj-name> <key>
5455 // setomapheader <pool-id> [namespace/]<obj-name> <header>
5456 // getomap <pool-id> [namespace/]<obj-name>
5457 // truncobj <pool-id> [namespace/]<obj-name> <newlen>
5458 // injectmdataerr <pool-id> [namespace/]<obj-name> [shardid]
5459 // injectdataerr <pool-id> [namespace/]<obj-name> [shardid]
5460 //
5461 // set_recovery_delay [utime]
5462 void TestOpsSocketHook::test_ops(OSDService *service, ObjectStore *store,
5463 const std::string &command, cmdmap_t& cmdmap, ostream &ss)
5464 {
5465 // Test support
5466 // Support changing the omap on a single osd by using the Admin Socket to
5467 // directly request the osd make a change.
5468 if (command == "setomapval" || command == "rmomapkey" ||
5469 command == "setomapheader" || command == "getomap" ||
5470 command == "truncobj" || command == "injectmdataerr" ||
5471 command == "injectdataerr"
5472 ) {
5473 pg_t rawpg;
5474 int64_t pool;
5475 OSDMapRef curmap = service->get_osdmap();
5476 int r = -1;
5477
5478 string poolstr;
5479
5480 cmd_getval(service->cct, cmdmap, "pool", poolstr);
5481 pool = curmap->lookup_pg_pool_name(poolstr);
5482 // If we can't find it by name, then maybe an id was specified
5483 if (pool < 0 && isdigit(poolstr[0]))
5484 pool = atoll(poolstr.c_str());
5485 if (pool < 0) {
5486 ss << "Invalid pool '" << poolstr << "''";
5487 return;
5488 }
5489
5490 string objname, nspace;
5491 cmd_getval(service->cct, cmdmap, "objname", objname);
5492 std::size_t found = objname.find_first_of('/');
5493 if (found != string::npos) {
5494 nspace = objname.substr(0, found);
5495 objname = objname.substr(found+1);
5496 }
5497 object_locator_t oloc(pool, nspace);
5498 r = curmap->object_locator_to_pg(object_t(objname), oloc, rawpg);
5499
5500 if (r < 0) {
5501 ss << "Invalid namespace/objname";
5502 return;
5503 }
5504
5505 int64_t shardid;
5506 cmd_getval(service->cct, cmdmap, "shardid", shardid, int64_t(shard_id_t::NO_SHARD));
5507 hobject_t obj(object_t(objname), string(""), CEPH_NOSNAP, rawpg.ps(), pool, nspace);
5508 ghobject_t gobj(obj, ghobject_t::NO_GEN, shard_id_t(uint8_t(shardid)));
5509 spg_t pgid(curmap->raw_pg_to_pg(rawpg), shard_id_t(shardid));
5510 if (curmap->pg_is_ec(rawpg)) {
5511 if ((command != "injectdataerr") && (command != "injectmdataerr")) {
5512 ss << "Must not call on ec pool, except injectdataerr or injectmdataerr";
5513 return;
5514 }
5515 }
5516
5517 ObjectStore::Transaction t;
5518
5519 if (command == "setomapval") {
5520 map<string, bufferlist> newattrs;
5521 bufferlist val;
5522 string key, valstr;
5523 cmd_getval(service->cct, cmdmap, "key", key);
5524 cmd_getval(service->cct, cmdmap, "val", valstr);
5525
5526 val.append(valstr);
5527 newattrs[key] = val;
5528 t.omap_setkeys(coll_t(pgid), ghobject_t(obj), newattrs);
5529 r = store->apply_transaction(service->meta_osr.get(), std::move(t));
5530 if (r < 0)
5531 ss << "error=" << r;
5532 else
5533 ss << "ok";
5534 } else if (command == "rmomapkey") {
5535 string key;
5536 set<string> keys;
5537 cmd_getval(service->cct, cmdmap, "key", key);
5538
5539 keys.insert(key);
5540 t.omap_rmkeys(coll_t(pgid), ghobject_t(obj), keys);
5541 r = store->apply_transaction(service->meta_osr.get(), std::move(t));
5542 if (r < 0)
5543 ss << "error=" << r;
5544 else
5545 ss << "ok";
5546 } else if (command == "setomapheader") {
5547 bufferlist newheader;
5548 string headerstr;
5549
5550 cmd_getval(service->cct, cmdmap, "header", headerstr);
5551 newheader.append(headerstr);
5552 t.omap_setheader(coll_t(pgid), ghobject_t(obj), newheader);
5553 r = store->apply_transaction(service->meta_osr.get(), std::move(t));
5554 if (r < 0)
5555 ss << "error=" << r;
5556 else
5557 ss << "ok";
5558 } else if (command == "getomap") {
5559 //Debug: Output entire omap
5560 bufferlist hdrbl;
5561 map<string, bufferlist> keyvals;
5562 r = store->omap_get(coll_t(pgid), ghobject_t(obj), &hdrbl, &keyvals);
5563 if (r >= 0) {
5564 ss << "header=" << string(hdrbl.c_str(), hdrbl.length());
5565 for (map<string, bufferlist>::iterator it = keyvals.begin();
5566 it != keyvals.end(); ++it)
5567 ss << " key=" << (*it).first << " val="
5568 << string((*it).second.c_str(), (*it).second.length());
5569 } else {
5570 ss << "error=" << r;
5571 }
5572 } else if (command == "truncobj") {
5573 int64_t trunclen;
5574 cmd_getval(service->cct, cmdmap, "len", trunclen);
5575 t.truncate(coll_t(pgid), ghobject_t(obj), trunclen);
5576 r = store->apply_transaction(service->meta_osr.get(), std::move(t));
5577 if (r < 0)
5578 ss << "error=" << r;
5579 else
5580 ss << "ok";
5581 } else if (command == "injectdataerr") {
5582 store->inject_data_error(gobj);
5583 ss << "ok";
5584 } else if (command == "injectmdataerr") {
5585 store->inject_mdata_error(gobj);
5586 ss << "ok";
5587 }
5588 return;
5589 }
5590 if (command == "set_recovery_delay") {
5591 int64_t delay;
5592 cmd_getval(service->cct, cmdmap, "utime", delay, (int64_t)0);
5593 ostringstream oss;
5594 oss << delay;
5595 int r = service->cct->_conf->set_val("osd_recovery_delay_start",
5596 oss.str().c_str());
5597 if (r != 0) {
5598 ss << "set_recovery_delay: error setting "
5599 << "osd_recovery_delay_start to '" << delay << "': error "
5600 << r;
5601 return;
5602 }
5603 service->cct->_conf->apply_changes(NULL);
5604 ss << "set_recovery_delay: set osd_recovery_delay_start "
5605 << "to " << service->cct->_conf->osd_recovery_delay_start;
5606 return;
5607 }
5608 if (command == "trigger_scrub") {
5609 spg_t pgid;
5610 OSDMapRef curmap = service->get_osdmap();
5611
5612 string pgidstr;
5613
5614 cmd_getval(service->cct, cmdmap, "pgid", pgidstr);
5615 if (!pgid.parse(pgidstr.c_str())) {
5616 ss << "Invalid pgid specified";
5617 return;
5618 }
5619
5620 PG *pg = service->osd->_lookup_lock_pg(pgid);
5621 if (pg == nullptr) {
5622 ss << "Can't find pg " << pgid;
5623 return;
5624 }
5625
5626 if (pg->is_primary()) {
5627 pg->unreg_next_scrub();
5628 const pg_pool_t *p = curmap->get_pg_pool(pgid.pool());
5629 double pool_scrub_max_interval = 0;
5630 p->opts.get(pool_opts_t::SCRUB_MAX_INTERVAL, &pool_scrub_max_interval);
5631 double scrub_max_interval = pool_scrub_max_interval > 0 ?
5632 pool_scrub_max_interval : g_conf->osd_scrub_max_interval;
5633 // Instead of marking must_scrub, force a scheduled scrub
5634 utime_t stamp = ceph_clock_now();
5635 stamp -= scrub_max_interval;
5636 stamp -= 100.0; // push back last scrub more for good measure
5637 pg->info.history.last_scrub_stamp = stamp;
5638 pg->reg_next_scrub();
5639 ss << "ok";
5640 } else {
5641 ss << "Not primary";
5642 }
5643 pg->unlock();
5644 return;
5645 }
5646 if (command == "injectfull") {
5647 int64_t count;
5648 string type;
5649 OSDService::s_names state;
5650 cmd_getval(service->cct, cmdmap, "type", type, string("full"));
5651 cmd_getval(service->cct, cmdmap, "count", count, (int64_t)-1);
5652 if (type == "none" || count == 0) {
5653 type = "none";
5654 count = 0;
5655 }
5656 state = service->get_full_state(type);
5657 if (state == OSDService::s_names::INVALID) {
5658 ss << "Invalid type use (none, nearfull, backfillfull, full, failsafe)";
5659 return;
5660 }
5661 service->set_injectfull(state, count);
5662 return;
5663 }
5664 ss << "Internal error - command=" << command;
5665 }
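// Example invocations (hypothetical; in test environments these hooks are
// reached through the OSD's admin socket, and the exact argument syntax
// may differ):
//
//   ceph daemon osd.0 setomapval 1 myobject mykey myvalue
//   ceph daemon osd.0 getomap 1 myobject
//   ceph daemon osd.0 injectdataerr 1 myobject 0
//
// A namespace, when needed, is prefixed to the object name as
// "<namespace>/<obj-name>", matching the find_first_of('/') split above.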
5666
5667 // =========================================
5668 bool remove_dir(
5669 CephContext *cct,
5670 ObjectStore *store, SnapMapper *mapper,
5671 OSDriver *osdriver,
5672 ObjectStore::Sequencer *osr,
5673 coll_t coll, DeletingStateRef dstate,
5674 bool *finished,
5675 ThreadPool::TPHandle &handle)
5676 {
5677 vector<ghobject_t> olist;
5678 int64_t num = 0;
5679 ObjectStore::Transaction t;
5680 ghobject_t next;
5681 handle.reset_tp_timeout();
5682 store->collection_list(
5683 coll,
5684 next,
5685 ghobject_t::get_max(),
5686 store->get_ideal_list_max(),
5687 &olist,
5688 &next);
5689 generic_dout(10) << __func__ << " " << olist << dendl;
5690 // default cont to true; this is safe because the caller
5691 // (OSD::RemoveWQ::_process()) will recheck the answer before it goes on.
5692 bool cont = true;
5693 for (vector<ghobject_t>::iterator i = olist.begin();
5694 i != olist.end();
5695 ++i) {
5696 if (i->is_pgmeta())
5697 continue;
5698 OSDriver::OSTransaction _t(osdriver->get_transaction(&t));
5699 int r = mapper->remove_oid(i->hobj, &_t);
5700 if (r != 0 && r != -ENOENT) {
5701 ceph_abort();
5702 }
5703 t.remove(coll, *i);
5704 if (++num >= cct->_conf->osd_target_transaction_size) {
5705 C_SaferCond waiter;
5706 store->queue_transaction(osr, std::move(t), &waiter);
5707 cont = dstate->pause_clearing();
5708 handle.suspend_tp_timeout();
5709 waiter.wait();
5710 handle.reset_tp_timeout();
5711 if (cont)
5712 cont = dstate->resume_clearing();
5713 if (!cont)
5714 return false;
5715 t = ObjectStore::Transaction();
5716 num = 0;
5717 }
5718 }
5719 if (num) {
5720 C_SaferCond waiter;
5721 store->queue_transaction(osr, std::move(t), &waiter);
5722 cont = dstate->pause_clearing();
5723 handle.suspend_tp_timeout();
5724 waiter.wait();
5725 handle.reset_tp_timeout();
5726 if (cont)
5727 cont = dstate->resume_clearing();
5728 }
5729 // we are finished once the listing has reached the end of the collection
5730 *finished = next.is_max();
5731 return cont;
5732 }
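// The batching pattern above (accumulate removes until
// osd_target_transaction_size, queue the transaction, block on a
// C_SaferCond, start a fresh transaction) bounds the size of any single
// ObjectStore transaction. A standalone sketch of the same idea, with
// hypothetical Txn/flush stand-ins rather than the Ceph API:
//
//   struct Txn { std::vector<std::string> removes; };
//   void flush(Txn &t) { /* queue + wait for commit */ t = Txn(); }
//   void remove_all(const std::vector<std::string> &objs, size_t batch) {
//     Txn t;
//     for (const auto &o : objs) {
//       t.removes.push_back(o);
//       if (t.removes.size() >= batch)
//         flush(t);               // keep each transaction bounded
//     }
//     if (!t.removes.empty())
//       flush(t);                 // final partial batch
//   }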
5733
5734 void OSD::RemoveWQ::_process(
5735 pair<PGRef, DeletingStateRef> item,
5736 ThreadPool::TPHandle &handle)
5737 {
5738 FUNCTRACE();
5739 PGRef pg(item.first);
5740 SnapMapper &mapper = pg->snap_mapper;
5741 OSDriver &driver = pg->osdriver;
5742 coll_t coll = coll_t(pg->info.pgid);
5743 pg->osr->flush();
5744 bool finished = false;
5745
5746 if (!item.second->start_or_resume_clearing())
5747 return;
5748
5749 bool cont = remove_dir(
5750 pg->cct, store, &mapper, &driver, pg->osr.get(), coll, item.second,
5751 &finished, handle);
5752 if (!cont)
5753 return;
5754 if (!finished) {
5755 if (item.second->pause_clearing())
5756 queue_front(item);
5757 return;
5758 }
5759
5760 if (!item.second->start_deleting())
5761 return;
5762
5763 ObjectStore::Transaction t;
5764 PGLog::clear_info_log(pg->info.pgid, &t);
5765
5766 if (cct->_conf->osd_inject_failure_on_pg_removal) {
5767 generic_derr << "osd_inject_failure_on_pg_removal" << dendl;
5768 _exit(1);
5769 }
5770 t.remove_collection(coll);
5771
5772 // We need the sequencer to stick around until the op is complete
5773 store->queue_transaction(
5774 pg->osr.get(),
5775 std::move(t),
5776 0, // onapplied
5777 0, // oncommit
5778 0, // onreadable sync
5779 new ContainerContext<PGRef>(pg),
5780 TrackedOpRef());
5781
5782 item.second->finish_deleting();
5783 }
5784 // =========================================
5785
5786 void OSD::ms_handle_connect(Connection *con)
5787 {
5788 dout(10) << __func__ << " con " << con << dendl;
5789 if (con->get_peer_type() == CEPH_ENTITY_TYPE_MON) {
5790 Mutex::Locker l(osd_lock);
5791 if (is_stopping())
5792 return;
5793 dout(10) << __func__ << " on mon" << dendl;
5794
5795 if (is_preboot()) {
5796 start_boot();
5797 } else if (is_booting()) {
5798 _send_boot(); // resend boot message
5799 } else {
5800 map_lock.get_read();
5801 Mutex::Locker l2(mon_report_lock);
5802
5803 utime_t now = ceph_clock_now();
5804 last_mon_report = now;
5805
5806 // resend everything, it's a new session
5807 send_full_update();
5808 send_alive();
5809 service.requeue_pg_temp();
5810 service.send_pg_temp();
5811 requeue_failures();
5812 send_failures();
5813 if (osdmap->require_osd_release < CEPH_RELEASE_LUMINOUS) {
5814 send_pg_stats(now);
5815 }
5816
5817 map_lock.put_read();
5818 if (is_active()) {
5819 send_beacon(ceph::coarse_mono_clock::now());
5820 }
5821 }
5822
5823 // full map requests may happen while active or pre-boot
5824 if (requested_full_first) {
5825 rerequest_full_maps();
5826 }
5827 }
5828 }
5829
5830 void OSD::ms_handle_fast_connect(Connection *con)
5831 {
5832 if (con->get_peer_type() != CEPH_ENTITY_TYPE_MON &&
5833 con->get_peer_type() != CEPH_ENTITY_TYPE_MGR) {
5834 Session *s = static_cast<Session*>(con->get_priv());
5835 if (!s) {
5836 s = new Session(cct);
5837 con->set_priv(s->get());
5838 s->con = con;
5839 dout(10) << " new session (outgoing) " << s << " con=" << s->con
5840 << " addr=" << s->con->get_peer_addr() << dendl;
5841 // we don't connect to clients
5842 assert(con->get_peer_type() == CEPH_ENTITY_TYPE_OSD);
5843 s->entity_name.set_type(CEPH_ENTITY_TYPE_OSD);
5844 }
5845 s->put();
5846 }
5847 }
5848
5849 void OSD::ms_handle_fast_accept(Connection *con)
5850 {
5851 if (con->get_peer_type() != CEPH_ENTITY_TYPE_MON &&
5852 con->get_peer_type() != CEPH_ENTITY_TYPE_MGR) {
5853 Session *s = static_cast<Session*>(con->get_priv());
5854 if (!s) {
5855 s = new Session(cct);
5856 con->set_priv(s->get());
5857 s->con = con;
5858 dout(10) << "new session (incoming) " << s << " con=" << con
5859 << " addr=" << con->get_peer_addr()
5860 << " must have raced with connect" << dendl;
5861 assert(con->get_peer_type() == CEPH_ENTITY_TYPE_OSD);
5862 s->entity_name.set_type(CEPH_ENTITY_TYPE_OSD);
5863 }
5864 s->put();
5865 }
5866 }
5867
5868 bool OSD::ms_handle_reset(Connection *con)
5869 {
5870 Session *session = static_cast<Session*>(con->get_priv());
5871 dout(2) << "ms_handle_reset con " << con << " session " << session << dendl;
5872 if (!session)
5873 return false;
5874 session->wstate.reset(con);
5875 session->con.reset(NULL); // break con <-> session ref cycle
5876 // note that we break session->con *before* the session_handle_reset
5877 // cleanup below. this avoids a race between us and
5878 // PG::add_backoff, Session::check_backoff, etc.
5879 session_handle_reset(session);
5880 session->put();
5881 return true;
5882 }
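// The reference cycle broken above is
//
//   Connection::priv  --ref-->  Session
//   Session::con      --ref-->  Connection
//
// Dropping session->con before session_handle_reset() runs means the
// backoff paths noted in the comment (PG::add_backoff,
// Session::check_backoff) cannot race against a half-torn-down session.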
5883
5884 bool OSD::ms_handle_refused(Connection *con)
5885 {
5886 if (!cct->_conf->osd_fast_fail_on_connection_refused)
5887 return false;
5888
5889 Session *session = static_cast<Session*>(con->get_priv());
5890 dout(2) << "ms_handle_refused con " << con << " session " << session << dendl;
5891 if (!session)
5892 return false;
5893 int type = con->get_peer_type();
5894 // handle only OSD failures here
5895 if (monc && (type == CEPH_ENTITY_TYPE_OSD)) {
5896 OSDMapRef osdmap = get_osdmap();
5897 if (osdmap) {
5898 int id = osdmap->identify_osd_on_all_channels(con->get_peer_addr());
5899 if (id >= 0 && osdmap->is_up(id)) {
5900 // Shortcut the mon's heartbeat grace logic: we know this OSD is not
5901 // going to respawn on its own. +1 so we won't hit any boundary case.
5902 monc->send_mon_message(new MOSDFailure(monc->get_fsid(),
5903 osdmap->get_inst(id),
5904 cct->_conf->osd_heartbeat_grace + 1,
5905 osdmap->get_epoch(),
5906 MOSDFailure::FLAG_IMMEDIATE | MOSDFailure::FLAG_FAILED
5907 ));
5908 }
5909 }
5910 }
5911 session->put();
5912 return true;
5913 }
5914
5915 struct C_OSD_GetVersion : public Context {
5916 OSD *osd;
5917 uint64_t oldest, newest;
5918 explicit C_OSD_GetVersion(OSD *o) : osd(o), oldest(0), newest(0) {}
5919 void finish(int r) override {
5920 if (r >= 0)
5921 osd->_got_mon_epochs(oldest, newest);
5922 }
5923 };
5924
5925 void OSD::start_boot()
5926 {
5927 if (!_is_healthy()) {
5928 // if we are not healthy, do not mark ourselves up (yet)
5929 dout(1) << "not healthy; waiting to boot" << dendl;
5930 if (!is_waiting_for_healthy())
5931 start_waiting_for_healthy();
5932 // send pings sooner rather than later
5933 heartbeat_kick();
5934 return;
5935 }
5936 dout(1) << __func__ << dendl;
5937 set_state(STATE_PREBOOT);
5938 waiting_for_luminous_mons = false;
5939 dout(10) << "start_boot - have maps " << superblock.oldest_map
5940 << ".." << superblock.newest_map << dendl;
5941 C_OSD_GetVersion *c = new C_OSD_GetVersion(this);
5942 monc->get_version("osdmap", &c->newest, &c->oldest, c);
5943 }
5944
5945 void OSD::_got_mon_epochs(epoch_t oldest, epoch_t newest)
5946 {
5947 Mutex::Locker l(osd_lock);
5948 if (is_preboot()) {
5949 _preboot(oldest, newest);
5950 }
5951 }
5952
5953 void OSD::_preboot(epoch_t oldest, epoch_t newest)
5954 {
5955 assert(is_preboot());
5956 dout(10) << __func__ << " mon has osdmaps "
5957 << oldest << ".." << newest << dendl;
5958
5959 // ensure our local fullness awareness is accurate
5960 heartbeat();
5961
5962 // if our map is within recent history, try to add ourselves to the osdmap.
5963 if (osdmap->get_epoch() == 0) {
5964 derr << "waiting for initial osdmap" << dendl;
5965 } else if (osdmap->is_destroyed(whoami)) {
5966 derr << "osdmap says I am destroyed" << dendl;
5967 // provide a small margin so we don't livelock seeing if we
5968 // un-destroyed ourselves.
5969 if (osdmap->get_epoch() > newest - 1) {
5970 exit(0);
5971 }
5972 } else if (osdmap->test_flag(CEPH_OSDMAP_NOUP) || osdmap->is_noup(whoami)) {
5973 derr << "osdmap NOUP flag is set, waiting for it to clear" << dendl;
5974 } else if (!osdmap->test_flag(CEPH_OSDMAP_SORTBITWISE)) {
5975 derr << "osdmap SORTBITWISE OSDMap flag is NOT set; please set it"
5976 << dendl;
5977 } else if (osdmap->require_osd_release < CEPH_RELEASE_JEWEL) {
5978 derr << "osdmap REQUIRE_JEWEL OSDMap flag is NOT set; please set it"
5979 << dendl;
5980 } else if (!monc->monmap.get_required_features().contains_all(
5981 ceph::features::mon::FEATURE_LUMINOUS)) {
5982 derr << "monmap REQUIRE_LUMINOUS is NOT set; must upgrade all monitors to "
5983 << "Luminous or later before Luminous OSDs will boot" << dendl;
5984 waiting_for_luminous_mons = true;
5985 } else if (service.need_fullness_update()) {
5986 derr << "osdmap fullness state needs update" << dendl;
5987 send_full_update();
5988 } else if (osdmap->get_epoch() >= oldest - 1 &&
5989 osdmap->get_epoch() + cct->_conf->osd_map_message_max > newest) {
5990 _send_boot();
5991 return;
5992 }
5993
5994 // get all the latest maps
5995 if (osdmap->get_epoch() + 1 >= oldest)
5996 osdmap_subscribe(osdmap->get_epoch() + 1, false);
5997 else
5998 osdmap_subscribe(oldest - 1, true);
5999 }
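// Worked example of the catch-up choice above, assuming the mon reports
// osdmaps [oldest=50 .. newest=120]:
//
//   // our epoch 100: 100 + 1 >= 50, so subscribe from 101 (incrementals)
//   // our epoch  30:  30 + 1 <  50, so subscribe from 49 with onetime=true,
//   //                 which yields a full map we can jump forward from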
6000
6001 void OSD::send_full_update()
6002 {
6003 if (!service.need_fullness_update())
6004 return;
6005 unsigned state = 0;
6006 if (service.is_full()) {
6007 state = CEPH_OSD_FULL;
6008 } else if (service.is_backfillfull()) {
6009 state = CEPH_OSD_BACKFILLFULL;
6010 } else if (service.is_nearfull()) {
6011 state = CEPH_OSD_NEARFULL;
6012 }
6013 set<string> s;
6014 OSDMap::calc_state_set(state, s);
6015 dout(10) << __func__ << " want state " << s << dendl;
6016 monc->send_mon_message(new MOSDFull(osdmap->get_epoch(), state));
6017 }
6018
6019 void OSD::start_waiting_for_healthy()
6020 {
6021 dout(1) << "start_waiting_for_healthy" << dendl;
6022 set_state(STATE_WAITING_FOR_HEALTHY);
6023 last_heartbeat_resample = utime_t();
6024
6025 // subscribe to osdmap updates, in case our peers really are known to be dead
6026 osdmap_subscribe(osdmap->get_epoch() + 1, false);
6027 }
6028
6029 bool OSD::_is_healthy()
6030 {
6031 if (!cct->get_heartbeat_map()->is_healthy()) {
6032 dout(1) << "is_healthy false -- internal heartbeat failed" << dendl;
6033 return false;
6034 }
6035
6036 if (is_waiting_for_healthy()) {
6037 Mutex::Locker l(heartbeat_lock);
6038 utime_t cutoff = ceph_clock_now();
6039 cutoff -= cct->_conf->osd_heartbeat_grace;
6040 int num = 0, up = 0;
6041 for (map<int,HeartbeatInfo>::iterator p = heartbeat_peers.begin();
6042 p != heartbeat_peers.end();
6043 ++p) {
6044 if (p->second.is_healthy(cutoff))
6045 ++up;
6046 ++num;
6047 }
6048 if ((float)up < (float)num * cct->_conf->osd_heartbeat_min_healthy_ratio) {
6049 dout(1) << "is_healthy false -- only " << up << "/" << num << " up peers (less than "
6050 << int(cct->_conf->osd_heartbeat_min_healthy_ratio * 100.0) << "%)" << dendl;
6051 return false;
6052 }
6053 }
6054
6055 return true;
6056 }
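// Worked example: with osd_heartbeat_min_healthy_ratio = 0.33 (assumed
// default) and 9 heartbeat peers, at least ceil(9 * 0.33) = 3 peers must
// have replied within the grace window before the OSD will try to boot.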
6057
6058 void OSD::_send_boot()
6059 {
6060 dout(10) << "_send_boot" << dendl;
6061 entity_addr_t cluster_addr = cluster_messenger->get_myaddr();
6062 Connection *local_connection = cluster_messenger->get_loopback_connection().get();
6063 if (cluster_addr.is_blank_ip()) {
6064 int port = cluster_addr.get_port();
6065 cluster_addr = client_messenger->get_myaddr();
6066 cluster_addr.set_port(port);
6067 cluster_messenger->set_addr_unknowns(cluster_addr);
6068 dout(10) << " assuming cluster_addr ip matches client_addr" << dendl;
6069 } else {
6070 Session *s = static_cast<Session*>(local_connection->get_priv());
6071 if (s)
6072 s->put();
6073 else
6074 cluster_messenger->ms_deliver_handle_fast_connect(local_connection);
6075 }
6076
6077 entity_addr_t hb_back_addr = hb_back_server_messenger->get_myaddr();
6078 local_connection = hb_back_server_messenger->get_loopback_connection().get();
6079 if (hb_back_addr.is_blank_ip()) {
6080 int port = hb_back_addr.get_port();
6081 hb_back_addr = cluster_addr;
6082 hb_back_addr.set_port(port);
6083 hb_back_server_messenger->set_addr_unknowns(hb_back_addr);
6084 dout(10) << " assuming hb_back_addr ip matches cluster_addr" << dendl;
6085 } else {
6086 Session *s = static_cast<Session*>(local_connection->get_priv());
6087 if (s)
6088 s->put();
6089 else
6090 hb_back_server_messenger->ms_deliver_handle_fast_connect(local_connection);
6091 }
6092
6093 entity_addr_t hb_front_addr = hb_front_server_messenger->get_myaddr();
6094 local_connection = hb_front_server_messenger->get_loopback_connection().get();
6095 if (hb_front_addr.is_blank_ip()) {
6096 int port = hb_front_addr.get_port();
6097 hb_front_addr = client_messenger->get_myaddr();
6098 hb_front_addr.set_port(port);
6099 hb_front_server_messenger->set_addr_unknowns(hb_front_addr);
6100 dout(10) << " assuming hb_front_addr ip matches client_addr" << dendl;
6101 } else {
6102 Session *s = static_cast<Session*>(local_connection->get_priv());
6103 if (s)
6104 s->put();
6105 else
6106 hb_front_server_messenger->ms_deliver_handle_fast_connect(local_connection);
6107 }
6108
6109 MOSDBoot *mboot = new MOSDBoot(superblock, get_osdmap_epoch(), service.get_boot_epoch(),
6110 hb_back_addr, hb_front_addr, cluster_addr,
6111 CEPH_FEATURES_ALL);
6112 dout(10) << " client_addr " << client_messenger->get_myaddr()
6113 << ", cluster_addr " << cluster_addr
6114 << ", hb_back_addr " << hb_back_addr
6115 << ", hb_front_addr " << hb_front_addr
6116 << dendl;
6117 _collect_metadata(&mboot->metadata);
6118 monc->send_mon_message(mboot);
6119 set_state(STATE_BOOTING);
6120 }
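// Worked example of the blank-IP fixups above: if the cluster messenger
// bound to 0.0.0.0:6802 while the client messenger is at 10.0.0.5:6800,
// cluster_addr becomes 10.0.0.5:6802, i.e. the IP is borrowed from the
// public address and only the port is kept. hb_back_addr then borrows
// from cluster_addr, and hb_front_addr from the client address, the same
// way.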
6121
6122 void OSD::_collect_metadata(map<string,string> *pm)
6123 {
6124 // config info
6125 (*pm)["osd_data"] = dev_path;
6126 if (store->get_type() == "filestore") {
6127 // not applicable for bluestore
6128 (*pm)["osd_journal"] = journal_path;
6129 }
6130 (*pm)["front_addr"] = stringify(client_messenger->get_myaddr());
6131 (*pm)["back_addr"] = stringify(cluster_messenger->get_myaddr());
6132 (*pm)["hb_front_addr"] = stringify(hb_front_server_messenger->get_myaddr());
6133 (*pm)["hb_back_addr"] = stringify(hb_back_server_messenger->get_myaddr());
6134
6135 // backend
6136 (*pm)["osd_objectstore"] = store->get_type();
6137 (*pm)["rotational"] = store_is_rotational ? "1" : "0";
6138 (*pm)["journal_rotational"] = journal_is_rotational ? "1" : "0";
6139 (*pm)["default_device_class"] = store->get_default_device_class();
6140 store->collect_metadata(pm);
6141
6142 collect_sys_info(pm, cct);
6143
6144 std::string front_iface, back_iface;
6145 /*
6146 pick_iface(cct,
6147 CEPH_PICK_ADDRESS_PUBLIC | CEPH_PICK_ADDRESS_CLUSTER,
6148 &front_iface, &back_iface);
6149 */
6150 (*pm)["front_iface"] = pick_iface(cct,
6151 client_messenger->get_myaddr().get_sockaddr_storage());
6152 (*pm)["back_iface"] = pick_iface(cct,
6153 cluster_messenger->get_myaddr().get_sockaddr_storage());
6154
6155 dout(10) << __func__ << " " << *pm << dendl;
6156 }
6157
6158 void OSD::queue_want_up_thru(epoch_t want)
6159 {
6160 map_lock.get_read();
6161 epoch_t cur = osdmap->get_up_thru(whoami);
6162 Mutex::Locker l(mon_report_lock);
6163 if (want > up_thru_wanted) {
6164 dout(10) << "queue_want_up_thru now " << want << " (was " << up_thru_wanted << ")"
6165 << ", currently " << cur
6166 << dendl;
6167 up_thru_wanted = want;
6168 send_alive();
6169 } else {
6170 dout(10) << "queue_want_up_thru want " << want << " <= queued " << up_thru_wanted
6171 << ", currently " << cur
6172 << dendl;
6173 }
6174 map_lock.put_read();
6175 }
6176
6177 void OSD::send_alive()
6178 {
6179 assert(mon_report_lock.is_locked());
6180 if (!osdmap->exists(whoami))
6181 return;
6182 epoch_t up_thru = osdmap->get_up_thru(whoami);
6183 dout(10) << "send_alive up_thru currently " << up_thru << " want " << up_thru_wanted << dendl;
6184 if (up_thru_wanted > up_thru) {
6185 dout(10) << "send_alive want " << up_thru_wanted << dendl;
6186 monc->send_mon_message(new MOSDAlive(osdmap->get_epoch(), up_thru_wanted));
6187 }
6188 }
6189
6190 void OSD::request_full_map(epoch_t first, epoch_t last)
6191 {
6192 dout(10) << __func__ << " " << first << ".." << last
6193 << ", previously requested "
6194 << requested_full_first << ".." << requested_full_last << dendl;
6195 assert(osd_lock.is_locked());
6196 assert(first > 0 && last > 0);
6197 assert(first <= last);
6198 assert(first >= requested_full_first); // we shouldn't ever ask for older maps
6199 if (requested_full_first == 0) {
6200 // first request
6201 requested_full_first = first;
6202 requested_full_last = last;
6203 } else if (last <= requested_full_last) {
6204 // dup
6205 return;
6206 } else {
6207 // additional request
6208 first = requested_full_last + 1;
6209 requested_full_last = last;
6210 }
6211 MMonGetOSDMap *req = new MMonGetOSDMap;
6212 req->request_full(first, last);
6213 monc->send_mon_message(req);
6214 }
6215
6216 void OSD::got_full_map(epoch_t e)
6217 {
6218 assert(requested_full_first <= requested_full_last);
6219 assert(osd_lock.is_locked());
6220 if (requested_full_first == 0) {
6221 dout(20) << __func__ << " " << e << ", nothing requested" << dendl;
6222 return;
6223 }
6224 if (e < requested_full_first) {
6225 dout(10) << __func__ << " " << e << ", requested " << requested_full_first
6226 << ".." << requested_full_last
6227 << ", ignoring" << dendl;
6228 return;
6229 }
6230 if (e >= requested_full_last) {
6231 dout(10) << __func__ << " " << e << ", requested " << requested_full_first
6232 << ".." << requested_full_last << ", resetting" << dendl;
6233 requested_full_first = requested_full_last = 0;
6234 return;
6235 }
6236
6237 requested_full_first = e + 1;
6238
6239 dout(10) << __func__ << " " << e << ", requested " << requested_full_first
6240 << ".." << requested_full_last
6241 << ", still need more" << dendl;
6242 }
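// Worked example: after request_full_map(10, 20), got_full_map(14)
// advances requested_full_first to 15 (10..14 satisfied, 15..20 still
// outstanding); got_full_map(20) satisfies the whole window and resets it
// to 0..0, i.e. nothing outstanding.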
6243
6244 void OSD::requeue_failures()
6245 {
6246 Mutex::Locker l(heartbeat_lock);
6247 unsigned old_queue = failure_queue.size();
6248 unsigned old_pending = failure_pending.size();
6249 for (map<int,pair<utime_t,entity_inst_t> >::iterator p =
6250 failure_pending.begin();
6251 p != failure_pending.end(); ) {
6252 failure_queue[p->first] = p->second.first;
6253 failure_pending.erase(p++);
6254 }
6255 dout(10) << __func__ << " " << old_queue << " + " << old_pending << " -> "
6256 << failure_queue.size() << dendl;
6257 }
6258
6259 void OSD::send_failures()
6260 {
6261 assert(map_lock.is_locked());
6262 assert(mon_report_lock.is_locked());
6263 Mutex::Locker l(heartbeat_lock);
6264 utime_t now = ceph_clock_now();
6265 while (!failure_queue.empty()) {
6266 int osd = failure_queue.begin()->first;
6267 if (!failure_pending.count(osd)) {
6268 entity_inst_t i = osdmap->get_inst(osd);
6269 int failed_for = (int)(double)(now - failure_queue.begin()->second);
6270 monc->send_mon_message(new MOSDFailure(monc->get_fsid(), i, failed_for,
6271 osdmap->get_epoch()));
6272 failure_pending[osd] = make_pair(failure_queue.begin()->second, i);
6273 }
6274 failure_queue.erase(osd);
6275 }
6276 }
6277
6278 void OSD::send_still_alive(epoch_t epoch, const entity_inst_t &i)
6279 {
6280 MOSDFailure *m = new MOSDFailure(monc->get_fsid(), i, 0, epoch, MOSDFailure::FLAG_ALIVE);
6281 monc->send_mon_message(m);
6282 }
6283
6284 void OSD::send_pg_stats(const utime_t &now)
6285 {
6286 assert(map_lock.is_locked());
6287 assert(osdmap->require_osd_release < CEPH_RELEASE_LUMINOUS);
6288 dout(20) << "send_pg_stats" << dendl;
6289
6290 osd_stat_t cur_stat = service.get_osd_stat();
6291
6292 cur_stat.os_perf_stat = store->get_cur_stats();
6293
6294 pg_stat_queue_lock.Lock();
6295
6296 if (osd_stat_updated || !pg_stat_queue.empty()) {
6297 last_pg_stats_sent = now;
6298 osd_stat_updated = false;
6299
6300 dout(10) << "send_pg_stats - " << pg_stat_queue.size() << " pgs updated" << dendl;
6301
6302 utime_t had_for(now);
6303 had_for -= had_map_since;
6304
6305 MPGStats *m = new MPGStats(monc->get_fsid(), osdmap->get_epoch(), had_for);
6306
6307 uint64_t tid = ++pg_stat_tid;
6308 m->set_tid(tid);
6309 m->osd_stat = cur_stat;
6310
6311 xlist<PG*>::iterator p = pg_stat_queue.begin();
6312 while (!p.end()) {
6313 PG *pg = *p;
6314 ++p;
6315 if (!pg->is_primary()) { // we hold map_lock; role is stable.
6316 pg->stat_queue_item.remove_myself();
6317 pg->put("pg_stat_queue");
6318 continue;
6319 }
6320 pg->pg_stats_publish_lock.Lock();
6321 if (pg->pg_stats_publish_valid) {
6322 m->pg_stat[pg->info.pgid.pgid] = pg->pg_stats_publish;
6323 dout(25) << " sending " << pg->info.pgid << " " << pg->pg_stats_publish.reported_epoch << ":"
6324 << pg->pg_stats_publish.reported_seq << dendl;
6325 } else {
6326 dout(25) << " NOT sending " << pg->info.pgid << " " << pg->pg_stats_publish.reported_epoch << ":"
6327 << pg->pg_stats_publish.reported_seq << ", not valid" << dendl;
6328 }
6329 pg->pg_stats_publish_lock.Unlock();
6330 }
6331
6332 if (last_pg_stats_ack == utime_t() || !outstanding_pg_stats.empty()) {
6333 last_pg_stats_ack = ceph_clock_now();
6334 }
6335 outstanding_pg_stats.insert(tid);
6336 dout(20) << __func__ << " updates pending: " << outstanding_pg_stats << dendl;
6337
6338 monc->send_mon_message(m);
6339 }
6340
6341 pg_stat_queue_lock.Unlock();
6342 }
6343
6344 void OSD::handle_pg_stats_ack(MPGStatsAck *ack)
6345 {
6346 dout(10) << "handle_pg_stats_ack " << dendl;
6347
6348 if (!require_mon_peer(ack)) {
6349 ack->put();
6350 return;
6351 }
6352
6353 // NOTE: we may get replies from a previous mon even while
6354 // outstanding_pg_stats is empty if reconnecting races with replies
6355 // in flight.
6356
6357 pg_stat_queue_lock.Lock();
6358
6359 last_pg_stats_ack = ceph_clock_now();
6360
6361 // decay timeout slowly (analogous to TCP)
6362 stats_ack_timeout =
6363 MAX(cct->_conf->osd_mon_ack_timeout,
6364 stats_ack_timeout * cct->_conf->osd_stats_ack_timeout_decay);
6365 dout(20) << __func__ << " timeout now " << stats_ack_timeout << dendl;
6366
6367 if (ack->get_tid() > pg_stat_tid_flushed) {
6368 pg_stat_tid_flushed = ack->get_tid();
6369 pg_stat_queue_cond.Signal();
6370 }
6371
6372 xlist<PG*>::iterator p = pg_stat_queue.begin();
6373 while (!p.end()) {
6374 PG *pg = *p;
6375 PGRef _pg(pg);
6376 ++p;
6377
6378 auto acked = ack->pg_stat.find(pg->info.pgid.pgid);
6379 if (acked != ack->pg_stat.end()) {
6380 pg->pg_stats_publish_lock.Lock();
6381 if (acked->second.first == pg->pg_stats_publish.reported_seq &&
6382 acked->second.second == pg->pg_stats_publish.reported_epoch) {
6383 dout(25) << " ack on " << pg->info.pgid << " " << pg->pg_stats_publish.reported_epoch
6384 << ":" << pg->pg_stats_publish.reported_seq << dendl;
6385 pg->stat_queue_item.remove_myself();
6386 pg->put("pg_stat_queue");
6387 } else {
6388 dout(25) << " still pending " << pg->info.pgid << " " << pg->pg_stats_publish.reported_epoch
6389 << ":" << pg->pg_stats_publish.reported_seq << " > acked "
6390 << acked->second << dendl;
6391 }
6392 pg->pg_stats_publish_lock.Unlock();
6393 } else {
6394 dout(30) << " still pending " << pg->info.pgid << " " << pg->pg_stats_publish.reported_epoch
6395 << ":" << pg->pg_stats_publish.reported_seq << dendl;
6396 }
6397 }
6398
6399 outstanding_pg_stats.erase(ack->get_tid());
6400 dout(20) << __func__ << " still pending: " << outstanding_pg_stats << dendl;
6401
6402 pg_stat_queue_lock.Unlock();
6403
6404 ack->put();
6405 }
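// Worked example of the decay above, with assumed defaults
// osd_mon_ack_timeout = 30 and osd_stats_ack_timeout_decay = 0.9: a
// timeout previously backed off to 60s shrinks 60 -> 54 -> 48.6 -> ... on
// successive acks, bottoming out at the 30s floor. As the comment says,
// this mirrors TCP: back off quickly on loss, recover gradually on
// success.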
6406
6407 void OSD::flush_pg_stats()
6408 {
6409 dout(10) << "flush_pg_stats" << dendl;
6410 osd_lock.Unlock();
6411 utime_t now = ceph_clock_now();
6412 map_lock.get_read();
6413 mon_report_lock.Lock();
6414 send_pg_stats(now);
6415 mon_report_lock.Unlock();
6416 map_lock.put_read();
6417
6418
6419 pg_stat_queue_lock.Lock();
6420 uint64_t tid = pg_stat_tid;
6421 dout(10) << "flush_pg_stats waiting for stats tid " << tid << " to flush" << dendl;
6422 while (tid > pg_stat_tid_flushed)
6423 pg_stat_queue_cond.Wait(pg_stat_queue_lock);
6424 dout(10) << "flush_pg_stats finished waiting for stats tid " << tid << " to flush" << dendl;
6425 pg_stat_queue_lock.Unlock();
6426
6427 osd_lock.Lock();
6428 }
6429
6430 void OSD::send_beacon(const ceph::coarse_mono_clock::time_point& now)
6431 {
6432 const auto& monmap = monc->monmap;
6433 // we may have only just (re)connected, in which case the monmap is not
6434 // initialized yet; only send the beacon once we have a luminous monmap.
6435 if (monmap.epoch > 0 &&
6436 monmap.get_required_features().contains_all(
6437 ceph::features::mon::FEATURE_LUMINOUS)) {
6438 dout(20) << __func__ << " sending" << dendl;
6439 MOSDBeacon* beacon = nullptr;
6440 {
6441 Mutex::Locker l{min_last_epoch_clean_lock};
6442 beacon = new MOSDBeacon(osdmap->get_epoch(), min_last_epoch_clean);
6443 std::swap(beacon->pgs, min_last_epoch_clean_pgs);
6444 last_sent_beacon = now;
6445 }
6446 monc->send_mon_message(beacon);
6447 } else {
6448 dout(20) << __func__ << " not sending" << dendl;
6449 }
6450 }
6451
6452 void OSD::handle_command(MMonCommand *m)
6453 {
6454 if (!require_mon_peer(m)) {
6455 m->put();
6456 return;
6457 }
6458
6459 Command *c = new Command(m->cmd, m->get_tid(), m->get_data(), NULL);
6460 command_wq.queue(c);
6461 m->put();
6462 }
6463
6464 void OSD::handle_command(MCommand *m)
6465 {
6466 ConnectionRef con = m->get_connection();
6467 Session *session = static_cast<Session *>(con->get_priv());
6468 if (!session) {
6469 con->send_message(new MCommandReply(m, -EPERM));
6470 m->put();
6471 return;
6472 }
6473
6474 OSDCap& caps = session->caps;
6475 session->put();
6476
6477 if (!caps.allow_all() || m->get_source().is_mon()) {
6478 con->send_message(new MCommandReply(m, -EPERM));
6479 m->put();
6480 return;
6481 }
6482
6483 Command *c = new Command(m->cmd, m->get_tid(), m->get_data(), con.get());
6484 command_wq.queue(c);
6485
6486 m->put();
6487 }
6488
6489 struct OSDCommand {
6490 string cmdstring;
6491 string helpstring;
6492 string module;
6493 string perm;
6494 string availability;
6495 } osd_commands[] = {
6496
6497 #define COMMAND(parsesig, helptext, module, perm, availability) \
6498 {parsesig, helptext, module, perm, availability},
6499
6500 // yes, these are really pg commands, but there's a limit to how
6501 // much work it's worth. The OSD returns all of them. Make this
6502 // form (pg <pgid> <cmd>) valid only for the cli.
6503 // REST uses "tell <pgid> <cmd>"
6504
6505 COMMAND("pg " \
6506 "name=pgid,type=CephPgid " \
6507 "name=cmd,type=CephChoices,strings=query", \
6508 "show details of a specific pg", "osd", "r", "cli")
6509 COMMAND("pg " \
6510 "name=pgid,type=CephPgid " \
6511 "name=cmd,type=CephChoices,strings=mark_unfound_lost " \
6512 "name=mulcmd,type=CephChoices,strings=revert|delete", \
6513 "mark all unfound objects in this pg as lost, either removing or reverting to a prior version if one is available",
6514 "osd", "rw", "cli")
6515 COMMAND("pg " \
6516 "name=pgid,type=CephPgid " \
6517 "name=cmd,type=CephChoices,strings=list_missing " \
6518 "name=offset,type=CephString,req=false",
6519 "list missing objects on this pg, perhaps starting at an offset given in JSON",
6520 "osd", "r", "cli")
6521
6522 // new form: tell <pgid> <cmd> for both cli and rest
6523
6524 COMMAND("query",
6525 "show details of a specific pg", "osd", "r", "cli,rest")
6526 COMMAND("mark_unfound_lost " \
6527 "name=mulcmd,type=CephChoices,strings=revert|delete", \
6528 "mark all unfound objects in this pg as lost, either removing or reverting to a prior version if one is available",
6529 "osd", "rw", "cli,rest")
6530 COMMAND("list_missing " \
6531 "name=offset,type=CephString,req=false",
6532 "list missing objects on this pg, perhaps starting at an offset given in JSON",
6533 "osd", "r", "cli,rest")
6534 COMMAND("perf histogram dump "
6535 "name=logger,type=CephString,req=false "
6536 "name=counter,type=CephString,req=false",
6537 "Get histogram data",
6538 "osd", "r", "cli,rest")
6539
6540 // tell <osd.n> commands. Validation of osd.n must be special-cased in client
6541 COMMAND("version", "report version of OSD", "osd", "r", "cli,rest")
6542 COMMAND("get_command_descriptions", "list commands descriptions", "osd", "r", "cli,rest")
6543 COMMAND("injectargs " \
6544 "name=injected_args,type=CephString,n=N",
6545 "inject configuration arguments into running OSD",
6546 "osd", "rw", "cli,rest")
6547 COMMAND("config set " \
6548 "name=key,type=CephString name=value,type=CephString",
6549 "Set a configuration option at runtime (not persistent)",
6550 "osd", "rw", "cli,rest")
6551 COMMAND("cluster_log " \
6552 "name=level,type=CephChoices,strings=error,warning,info,debug " \
6553 "name=message,type=CephString,n=N",
6554 "log a message to the cluster log",
6555 "osd", "rw", "cli,rest")
6556 COMMAND("bench " \
6557 "name=count,type=CephInt,req=false " \
6558 "name=size,type=CephInt,req=false " \
6559 "name=object_size,type=CephInt,req=false " \
6560 "name=object_num,type=CephInt,req=false ", \
6561 "OSD benchmark: write <count> <size>-byte objects, " \
6562 "(default 1G size 4MB). Results in log.",
6563 "osd", "rw", "cli,rest")
6564 COMMAND("flush_pg_stats", "flush pg stats", "osd", "rw", "cli,rest")
6565 COMMAND("heap " \
6566 "name=heapcmd,type=CephChoices,strings=dump|start_profiler|stop_profiler|release|stats", \
6567 "show heap usage info (available only if compiled with tcmalloc)", \
6568 "osd", "rw", "cli,rest")
6569 COMMAND("debug dump_missing " \
6570 "name=filename,type=CephFilepath",
6571 "dump missing objects to a named file", "osd", "r", "cli,rest")
6572 COMMAND("debug kick_recovery_wq " \
6573 "name=delay,type=CephInt,range=0",
6574 "set osd_recovery_delay_start to <val>", "osd", "rw", "cli,rest")
6575 COMMAND("cpu_profiler " \
6576 "name=arg,type=CephChoices,strings=status|flush",
6577 "run cpu profiling on daemon", "osd", "rw", "cli,rest")
6578 COMMAND("dump_pg_recovery_stats", "dump pg recovery statistics",
6579 "osd", "r", "cli,rest")
6580 COMMAND("reset_pg_recovery_stats", "reset pg recovery statistics",
6581 "osd", "rw", "cli,rest")
6582 COMMAND("compact",
6583 "compact object store's omap. "
6584 "WARNING: Compaction probably slows your requests",
6585 "osd", "rw", "cli,rest")
6586 };
6587
6588 void OSD::do_command(Connection *con, ceph_tid_t tid, vector<string>& cmd, bufferlist& data)
6589 {
6590 int r = 0;
6591 stringstream ss, ds;
6592 string rs;
6593 bufferlist odata;
6594
6595 dout(20) << "do_command tid " << tid << " " << cmd << dendl;
6596
6597 map<string, cmd_vartype> cmdmap;
6598 string prefix;
6599 string format;
6600 string pgidstr;
6601 boost::scoped_ptr<Formatter> f;
6602
6603 if (cmd.empty()) {
6604 ss << "no command given";
6605 goto out;
6606 }
6607
6608 if (!cmdmap_from_json(cmd, &cmdmap, ss)) {
6609 r = -EINVAL;
6610 goto out;
6611 }
6612
6613 cmd_getval(cct, cmdmap, "prefix", prefix);
6614
6615 if (prefix == "get_command_descriptions") {
6616 int cmdnum = 0;
6617 JSONFormatter *f = new JSONFormatter();
6618 f->open_object_section("command_descriptions");
6619 for (OSDCommand *cp = osd_commands;
6620 cp < &osd_commands[ARRAY_SIZE(osd_commands)]; cp++) {
6621
6622 ostringstream secname;
6623 secname << "cmd" << setfill('0') << std::setw(3) << cmdnum;
6624 dump_cmddesc_to_json(f, secname.str(), cp->cmdstring, cp->helpstring,
6625 cp->module, cp->perm, cp->availability, 0);
6626 cmdnum++;
6627 }
6628 f->close_section(); // command_descriptions
6629
6630 f->flush(ds);
6631 delete f;
6632 goto out;
6633 }
6634
6635 cmd_getval(cct, cmdmap, "format", format);
6636 f.reset(Formatter::create(format));
6637
6638 if (prefix == "version") {
6639 if (f) {
6640 f->open_object_section("version");
6641 f->dump_string("version", pretty_version_to_str());
6642 f->close_section();
6643 f->flush(ds);
6644 } else {
6645 ds << pretty_version_to_str();
6646 }
6647 goto out;
6648 }
6649 else if (prefix == "injectargs") {
6650 vector<string> argsvec;
6651 cmd_getval(cct, cmdmap, "injected_args", argsvec);
6652
6653 if (argsvec.empty()) {
6654 r = -EINVAL;
6655 ss << "ignoring empty injectargs";
6656 goto out;
6657 }
6658 string args = argsvec.front();
6659 for (vector<string>::iterator a = ++argsvec.begin(); a != argsvec.end(); ++a)
6660 args += " " + *a;
6661 osd_lock.Unlock();
6662 r = cct->_conf->injectargs(args, &ss);
6663 osd_lock.Lock();
6664 }
6665 else if (prefix == "config set") {
6666 std::string key;
6667 std::string val;
6668 cmd_getval(cct, cmdmap, "key", key);
6669 cmd_getval(cct, cmdmap, "value", val);
6670 osd_lock.Unlock();
6671 r = cct->_conf->set_val(key, val, true, &ss);
6672 if (r == 0) {
6673 cct->_conf->apply_changes(nullptr);
6674 }
6675 osd_lock.Lock();
6676 }
6677 else if (prefix == "cluster_log") {
6678 vector<string> msg;
6679 cmd_getval(cct, cmdmap, "message", msg);
6680 if (msg.empty()) {
6681 r = -EINVAL;
6682 ss << "ignoring empty log message";
6683 goto out;
6684 }
6685 string message = msg.front();
6686 for (vector<string>::iterator a = ++msg.begin(); a != msg.end(); ++a)
6687 message += " " + *a;
6688 string lvl;
6689 cmd_getval(cct, cmdmap, "level", lvl);
6690 clog_type level = string_to_clog_type(lvl);
6691 if (level < 0) {
6692 r = -EINVAL;
6693 ss << "unknown level '" << lvl << "'";
6694 goto out;
6695 }
6696 clog->do_log(level, message);
6697 }
6698
6699 // either 'pg <pgid> <command>' or
6700 // 'tell <pgid>' (which comes in without any of that prefix)?
6701
6702 else if (prefix == "pg" ||
6703 prefix == "query" ||
6704 prefix == "mark_unfound_lost" ||
6705 prefix == "list_missing"
6706 ) {
6707 pg_t pgid;
6708
6709 if (!cmd_getval(cct, cmdmap, "pgid", pgidstr)) {
6710 ss << "no pgid specified";
6711 r = -EINVAL;
6712 } else if (!pgid.parse(pgidstr.c_str())) {
6713 ss << "couldn't parse pgid '" << pgidstr << "'";
6714 r = -EINVAL;
6715 } else {
6716 spg_t pcand;
6717 PG *pg = nullptr;
6718 if (osdmap->get_primary_shard(pgid, &pcand) &&
6719 (pg = _lookup_lock_pg(pcand))) {
6720 if (pg->is_primary()) {
6721 // simulate pg <pgid> cmd= for pg->do-command
6722 if (prefix != "pg")
6723 cmd_putval(cct, cmdmap, "cmd", prefix);
6724 r = pg->do_command(cmdmap, ss, data, odata, con, tid);
6725 if (r == -EAGAIN) {
6726 pg->unlock();
6727 // don't reply, pg will do so async
6728 return;
6729 }
6730 } else {
6731 ss << "not primary for pgid " << pgid;
6732
6733 // send them the latest diff to ensure they realize the mapping
6734 // has changed.
6735 service.send_incremental_map(osdmap->get_epoch() - 1, con, osdmap);
6736
6737 // do not reply; they will get newer maps and realize they
6738 // need to resend.
6739 pg->unlock();
6740 return;
6741 }
6742 pg->unlock();
6743 } else {
6744 ss << "i don't have pgid " << pgid;
6745 r = -ENOENT;
6746 }
6747 }
6748 }
6749
6750 else if (prefix == "bench") {
6751 int64_t count;
6752 int64_t bsize;
6753 int64_t osize, onum;
6754 // default count 1G, size 4MB
6755 cmd_getval(cct, cmdmap, "count", count, (int64_t)1 << 30);
6756 cmd_getval(cct, cmdmap, "size", bsize, (int64_t)4 << 20);
6757 cmd_getval(cct, cmdmap, "object_size", osize, (int64_t)0);
6758 cmd_getval(cct, cmdmap, "object_num", onum, (int64_t)0);
6759
6760 ceph::shared_ptr<ObjectStore::Sequencer> osr (std::make_shared<
6761 ObjectStore::Sequencer>("bench"));
6762
6763 uint32_t duration = cct->_conf->osd_bench_duration;
6764
6765 if (bsize > (int64_t) cct->_conf->osd_bench_max_block_size) {
6766 // limit the block size, because the next checks rely on it
6767 // having a sane value. If we allowed arbitrary block sizes, things
6768 // could still go sideways.
6769 ss << "block 'size' values are capped at "
6770 << byte_u_t(cct->_conf->osd_bench_max_block_size) << ". If you wish to use"
6771 << " a higher value, please adjust 'osd_bench_max_block_size'";
6772 r = -EINVAL;
6773 goto out;
6774 } else if (bsize < (int64_t) (1 << 20)) {
6775 // entering the realm of small block sizes.
6776 // limit the count to a sane value, assuming a configurable amount of
6777 // IOPS and duration, so that the OSD doesn't get hung up on this,
6778 // preventing timeouts from going off
6779 int64_t max_count =
6780 bsize * duration * cct->_conf->osd_bench_small_size_max_iops;
6781 if (count > max_count) {
6782 ss << "'count' values greater than " << max_count
6783 << " for a block size of " << byte_u_t(bsize) << ", assuming "
6784 << cct->_conf->osd_bench_small_size_max_iops << " IOPS,"
6785 << " for " << duration << " seconds,"
6786 << " can cause ill effects on osd. "
6787 << " Please adjust 'osd_bench_small_size_max_iops' with a higher"
6788 << " value if you wish to use a higher 'count'.";
6789 r = -EINVAL;
6790 goto out;
6791 }
6792 } else {
6793 // Block sizes of 1MB or more are big enough that we get more done per op.
6794 // However, to keep the osd from getting hung up on this and having
6795 // timers triggered, we limit the count assuming
6796 // a configurable throughput and duration.
6797 // NOTE: max_count is the total amount of bytes that we believe we
6798 // will be able to write during 'duration' for the given
6799 // throughput. The block size hardly impacts this unless it's
6800 // way too big. Given we already check how big the block size
6801 // is, it's safe to assume everything will check out.
6802 int64_t max_count =
6803 cct->_conf->osd_bench_large_size_max_throughput * duration;
6804 if (count > max_count) {
6805 ss << "'count' values greater than " << max_count
6806 << " for a block size of " << byte_u_t(bsize) << ", assuming "
6807 << byte_u_t(cct->_conf->osd_bench_large_size_max_throughput) << "/s,"
6808 << " for " << duration << " seconds,"
6809 << " can cause ill effects on osd. "
6810 << " Please adjust 'osd_bench_large_size_max_throughput'"
6811 << " with a higher value if you wish to use a higher 'count'.";
6812 r = -EINVAL;
6813 goto out;
6814 }
6815 }
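// Worked example of the caps above, using assumed defaults
// (osd_bench_duration = 30, osd_bench_small_size_max_iops = 100,
// osd_bench_large_size_max_throughput = 100 MB/s):
//
//   // small blocks: bsize = 4096 -> max_count = 4096 * 30 * 100
//   //               = 12,288,000 bytes (~11.7 MiB)
//   // large blocks: max_count = 100 MB/s * 30 s, roughly 3 GB, however
//   //               large the block size is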
6816
6817 if (osize && bsize > osize)
6818 bsize = osize;
6819
6820 dout(1) << " bench count " << count
6821 << " bsize " << byte_u_t(bsize) << dendl;
6822
6823 ObjectStore::Transaction cleanupt;
6824
6825 if (osize && onum) {
6826 bufferlist bl;
6827 bufferptr bp(osize);
6828 bp.zero();
6829 bl.push_back(std::move(bp));
6830 bl.rebuild_page_aligned();
6831 for (int i=0; i<onum; ++i) {
6832 char nm[30];
6833 snprintf(nm, sizeof(nm), "disk_bw_test_%d", i);
6834 object_t oid(nm);
6835 hobject_t soid(sobject_t(oid, 0));
6836 ObjectStore::Transaction t;
6837 t.write(coll_t(), ghobject_t(soid), 0, osize, bl);
6838 store->queue_transaction(osr.get(), std::move(t), NULL);
6839 cleanupt.remove(coll_t(), ghobject_t(soid));
6840 }
6841 }
6842
6843 bufferlist bl;
6844 bufferptr bp(bsize);
6845 bp.zero();
6846 bl.push_back(std::move(bp));
6847 bl.rebuild_page_aligned();
6848
6849 {
6850 C_SaferCond waiter;
6851 if (!osr->flush_commit(&waiter)) {
6852 waiter.wait();
6853 }
6854 }
6855
6856 utime_t start = ceph_clock_now();
6857 for (int64_t pos = 0; pos < count; pos += bsize) {
6858 char nm[30];
6859 unsigned offset = 0;
6860 if (onum && osize) {
6861 snprintf(nm, sizeof(nm), "disk_bw_test_%d", (int)(rand() % onum));
6862 offset = rand() % (osize / bsize) * bsize;
6863 } else {
6864 snprintf(nm, sizeof(nm), "disk_bw_test_%lld", (long long)pos);
6865 }
6866 object_t oid(nm);
6867 hobject_t soid(sobject_t(oid, 0));
6868 ObjectStore::Transaction t;
6869 t.write(coll_t::meta(), ghobject_t(soid), offset, bsize, bl);
6870 store->queue_transaction(osr.get(), std::move(t), NULL);
6871 if (!onum || !osize)
6872 cleanupt.remove(coll_t::meta(), ghobject_t(soid));
6873 }
6874
6875 {
6876 C_SaferCond waiter;
6877 if (!osr->flush_commit(&waiter)) {
6878 waiter.wait();
6879 }
6880 }
6881 utime_t end = ceph_clock_now();
6882
6883 // clean up
6884 store->queue_transaction(osr.get(), std::move(cleanupt), NULL);
6885 {
6886 C_SaferCond waiter;
6887 if (!osr->flush_commit(&waiter)) {
6888 waiter.wait();
6889 }
6890 }
6891
6892 double elapsed = end - start;
6893 double rate = count / elapsed;
6894 double iops = rate / bsize;
6895 if (f) {
6896 f->open_object_section("osd_bench_results");
6897 f->dump_int("bytes_written", count);
6898 f->dump_int("blocksize", bsize);
6899 f->dump_float("elapsed_sec", elapsed);
6900 f->dump_float("bytes_per_sec", rate);
6901 f->dump_float("iops", iops);
6902 f->close_section();
6903 f->flush(ds);
6904 } else {
6905 ds << "bench: wrote " << byte_u_t(count)
6906 << " in blocks of " << byte_u_t(bsize) << " in "
6907 << elapsed << " sec at " << byte_u_t(rate) << "/sec "
6908 << si_u_t(iops) << " IOPS";
6909 }
6910 }
6911
6912 else if (prefix == "flush_pg_stats") {
6913 if (osdmap->require_osd_release >= CEPH_RELEASE_LUMINOUS) {
6914 mgrc.send_pgstats();
6915 ds << service.get_osd_stat_seq() << "\n";
6916 } else {
6917 flush_pg_stats();
6918 }
6919 }
6920
6921 else if (prefix == "heap") {
6922 r = ceph::osd_cmds::heap(*cct, cmdmap, *f, ds);
6923 }
6924
6925 else if (prefix == "debug dump_missing") {
6926 string file_name;
6927 cmd_getval(cct, cmdmap, "filename", file_name);
6928 std::ofstream fout(file_name.c_str());
6929 if (!fout.is_open()) {
6930 ss << "failed to open file '" << file_name << "'";
6931 r = -EINVAL;
6932 goto out;
6933 }
6934
6935 fout << "*** osd " << whoami << ": dump_missing ***" << std::endl;
6936 RWLock::RLocker l(pg_map_lock);
6937 for (ceph::unordered_map<spg_t, PG*>::const_iterator pg_map_e = pg_map.begin();
6938 pg_map_e != pg_map.end(); ++pg_map_e) {
6939 PG *pg = pg_map_e->second;
6940 pg->lock();
6941
6942 fout << *pg << std::endl;
6943 std::map<hobject_t, pg_missing_item>::const_iterator mend =
6944 pg->pg_log.get_missing().get_items().end();
6945 std::map<hobject_t, pg_missing_item>::const_iterator mi =
6946 pg->pg_log.get_missing().get_items().begin();
6947 for (; mi != mend; ++mi) {
6948 fout << mi->first << " -> " << mi->second << std::endl;
6949 if (!pg->missing_loc.needs_recovery(mi->first))
6950 continue;
6951 if (pg->missing_loc.is_unfound(mi->first))
6952 fout << " unfound ";
6953 const set<pg_shard_t> &mls(pg->missing_loc.get_locations(mi->first));
6954 if (mls.empty())
6955 continue;
6956 fout << "missing_loc: " << mls << std::endl;
6957 }
6958 pg->unlock();
6959 fout << std::endl;
6960 }
6961
6962 fout.close();
6963 }
6964 else if (prefix == "debug kick_recovery_wq") {
6965 int64_t delay;
6966 cmd_getval(cct, cmdmap, "delay", delay);
6967 ostringstream oss;
6968 oss << delay;
6969 r = cct->_conf->set_val("osd_recovery_delay_start", oss.str().c_str());
6970 if (r != 0) {
6971 ss << "kick_recovery_wq: error setting "
6972 << "osd_recovery_delay_start to '" << delay << "': error "
6973 << r;
6974 goto out;
6975 }
6976 cct->_conf->apply_changes(NULL);
6977 ss << "kicking recovery queue. set osd_recovery_delay_start "
6978 << "to " << cct->_conf->osd_recovery_delay_start;
6979 }
6980
6981 else if (prefix == "cpu_profiler") {
6982 string arg;
6983 cmd_getval(cct, cmdmap, "arg", arg);
6984 vector<string> argvec;
6985 get_str_vec(arg, argvec);
6986 cpu_profiler_handle_command(argvec, ds);
6987 }
6988
6989 else if (prefix == "dump_pg_recovery_stats") {
6990 stringstream s;
6991 if (f) {
6992 pg_recovery_stats.dump_formatted(f.get());
6993 f->flush(ds);
6994 } else {
6995 pg_recovery_stats.dump(s);
6996 ds << "dump pg recovery stats: " << s.str();
6997 }
6998 }
6999
7000 else if (prefix == "reset_pg_recovery_stats") {
7001 ss << "reset pg recovery stats";
7002 pg_recovery_stats.reset();
7003 }
7004
7005 else if (prefix == "perf histogram dump") {
7006 std::string logger;
7007 std::string counter;
7008 cmd_getval(cct, cmdmap, "logger", logger);
7009 cmd_getval(cct, cmdmap, "counter", counter);
7010 if (f) {
7011 cct->get_perfcounters_collection()->dump_formatted_histograms(
7012 f.get(), false, logger, counter);
7013 f->flush(ds);
7014 }
7015 }
7016
7017 else if (prefix == "compact") {
7018 dout(1) << "triggering manual compaction" << dendl;
7019 auto start = ceph::coarse_mono_clock::now();
7020 store->compact();
7021 auto end = ceph::coarse_mono_clock::now();
7022 auto time_span = chrono::duration_cast<chrono::duration<double>>(end - start);
7023 dout(1) << "finished manual compaction in "
7024 << time_span.count()
7025 << " seconds" << dendl;
7026 ss << "compacted omap in " << time_span.count() << " seconds";
7027 }
7028
7029 else {
7030 ss << "unrecognized command! " << cmd;
7031 r = -EINVAL;
7032 }
7033
7034 out:
7035 rs = ss.str();
7036 odata.append(ds);
7037 dout(0) << "do_command r=" << r << " " << rs << dendl;
7038 clog->info() << rs;
7039 if (con) {
7040 MCommandReply *reply = new MCommandReply(r, rs);
7041 reply->set_tid(tid);
7042 reply->set_data(odata);
7043 con->send_message(reply);
7044 }
7045 }
7046
7047 bool OSD::heartbeat_dispatch(Message *m)
7048 {
7049 dout(30) << "heartbeat_dispatch " << m << dendl;
7050 switch (m->get_type()) {
7051
7052 case CEPH_MSG_PING:
7053 dout(10) << "ping from " << m->get_source_inst() << dendl;
7054 m->put();
7055 break;
7056
7057 case MSG_OSD_PING:
7058 handle_osd_ping(static_cast<MOSDPing*>(m));
7059 break;
7060
7061 default:
7062 dout(0) << "dropping unexpected message " << *m << " from " << m->get_source_inst() << dendl;
7063 m->put();
7064 }
7065
7066 return true;
7067 }
7068
7069 bool OSD::ms_dispatch(Message *m)
7070 {
7071 dout(20) << "OSD::ms_dispatch: " << *m << dendl;
7072 if (m->get_type() == MSG_OSD_MARK_ME_DOWN) {
7073 service.got_stop_ack();
7074 m->put();
7075 return true;
7076 }
7077
7078 // lock!
7079
7080 osd_lock.Lock();
7081 if (is_stopping()) {
7082 osd_lock.Unlock();
7083 m->put();
7084 return true;
7085 }
7086
7087 do_waiters();
7088 _dispatch(m);
7089
7090 osd_lock.Unlock();
7091
7092 return true;
7093 }
7094
7095 void OSD::maybe_share_map(
7096 Session *session,
7097 OpRequestRef op,
7098 OSDMapRef osdmap)
7099 {
7100 if (!op->check_send_map) {
7101 return;
7102 }
7103 epoch_t last_sent_epoch = 0;
7104
7105 session->sent_epoch_lock.lock();
7106 last_sent_epoch = session->last_sent_epoch;
7107 session->sent_epoch_lock.unlock();
7108
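// snapshot last_sent_epoch outside the lock, let share_map() advance
// it if it decides to send newer maps, then write it back only if it
// moved forward so a racing sharer cannot push it backwards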
7109 const Message *m = op->get_req();
7110 service.share_map(
7111 m->get_source(),
7112 m->get_connection().get(),
7113 op->sent_epoch,
7114 osdmap,
7115 session ? &last_sent_epoch : NULL);
7116
7117 session->sent_epoch_lock.lock();
7118 if (session->last_sent_epoch < last_sent_epoch) {
7119 session->last_sent_epoch = last_sent_epoch;
7120 }
7121 session->sent_epoch_lock.unlock();
7122
7123 op->check_send_map = false;
7124 }
7125
7126 void OSD::dispatch_session_waiting(Session *session, OSDMapRef osdmap)
7127 {
7128 assert(session->session_dispatch_lock.is_locked());
7129
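// requeue the session's deferred ops strictly in arrival order,
// stopping at the first op whose min_epoch is newer than the map we
// hold; everything behind it must keep waiting to preserve ordering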
7130 auto i = session->waiting_on_map.begin();
7131 while (i != session->waiting_on_map.end()) {
7132 OpRequestRef op = &(*i);
7133 assert(ms_can_fast_dispatch(op->get_req()));
7134 const MOSDFastDispatchOp *m = static_cast<const MOSDFastDispatchOp*>(
7135 op->get_req());
7136 if (m->get_min_epoch() > osdmap->get_epoch()) {
7137 break;
7138 }
7139 session->waiting_on_map.erase(i++);
7140 op->put();
7141
7142 spg_t pgid;
7143 if (m->get_type() == CEPH_MSG_OSD_OP) {
7144 pg_t actual_pgid = osdmap->raw_pg_to_pg(
7145 static_cast<const MOSDOp*>(m)->get_pg());
7146 if (!osdmap->get_primary_shard(actual_pgid, &pgid)) {
7147 continue;
7148 }
7149 } else {
7150 pgid = m->get_spg();
7151 }
7152 enqueue_op(pgid, op, m->get_map_epoch());
7153 }
7154
7155 if (session->waiting_on_map.empty()) {
7156 clear_session_waiting_on_map(session);
7157 } else {
7158 register_session_waiting_on_map(session);
7159 }
7160 }
7161
7162 void OSD::ms_fast_dispatch(Message *m)
7163 {
7164 FUNCTRACE();
7165 if (service.is_stopping()) {
7166 m->put();
7167 return;
7168 }
7169 OpRequestRef op = op_tracker.create_request<OpRequest, Message*>(m);
7170 {
7171 #ifdef WITH_LTTNG
7172 osd_reqid_t reqid = op->get_reqid();
7173 #endif
7174 tracepoint(osd, ms_fast_dispatch, reqid.name._type,
7175 reqid.name._num, reqid.tid, reqid.inc);
7176 }
7177
7178 if (m->trace)
7179 op->osd_trace.init("osd op", &trace_endpoint, &m->trace);
7180
7181 // note sender epoch, min req'd epoch
7182 op->sent_epoch = static_cast<MOSDFastDispatchOp*>(m)->get_map_epoch();
7183 op->min_epoch = static_cast<MOSDFastDispatchOp*>(m)->get_min_epoch();
7184 assert(op->min_epoch <= op->sent_epoch); // sanity check!
7185
7186 service.maybe_inject_dispatch_delay();
7187
7188 if (m->get_connection()->has_features(CEPH_FEATUREMASK_RESEND_ON_SPLIT) ||
7189 m->get_type() != CEPH_MSG_OSD_OP) {
7190 // queue it directly
7191 enqueue_op(
7192 static_cast<MOSDFastDispatchOp*>(m)->get_spg(),
7193 op,
7194 static_cast<MOSDFastDispatchOp*>(m)->get_map_epoch());
7195 } else {
7196 // legacy client, and this is an MOSDOp (the *only* fast dispatch
7197 // message that doesn't have an explicit spg_t); we need to map
7198 // it to an spg_t while preserving delivery order.
7199 Session *session = static_cast<Session*>(m->get_connection()->get_priv());
7200 if (session) {
7201 {
7202 Mutex::Locker l(session->session_dispatch_lock);
7203 op->get();
7204 session->waiting_on_map.push_back(*op);
7205 OSDMapRef nextmap = service.get_nextmap_reserved();
7206 dispatch_session_waiting(session, nextmap);
7207 service.release_map(nextmap);
7208 }
7209 session->put();
7210 }
7211 }
7212 OID_EVENT_TRACE_WITH_MSG(m, "MS_FAST_DISPATCH_END", false);
7213 }
7214
7215 void OSD::ms_fast_preprocess(Message *m)
7216 {
7217 if (m->get_connection()->get_peer_type() == CEPH_ENTITY_TYPE_OSD) {
7218 if (m->get_type() == CEPH_MSG_OSD_MAP) {
7219 MOSDMap *mm = static_cast<MOSDMap*>(m);
7220 Session *s = static_cast<Session*>(m->get_connection()->get_priv());
7221 if (s) {
7222 s->received_map_lock.lock();
7223 s->received_map_epoch = mm->get_last();
7224 s->received_map_lock.unlock();
7225 s->put();
7226 }
7227 }
7228 }
7229 }
7230
7231 bool OSD::ms_get_authorizer(int dest_type, AuthAuthorizer **authorizer, bool force_new)
7232 {
7233 dout(10) << "OSD::ms_get_authorizer type=" << ceph_entity_type_name(dest_type) << dendl;
7234
7235 if (is_stopping()) {
7236 dout(10) << __func__ << " bailing, we are shutting down" << dendl;
7237 return false;
7238 }
7239
7240 if (dest_type == CEPH_ENTITY_TYPE_MON)
7241 return true;
7242
7243 if (force_new) {
7244 /* the MonClient checks keys every tick(), so we should just wait for that cycle
7245 to get through */
7246 if (monc->wait_auth_rotating(10) < 0) {
7247 derr << "OSD::ms_get_authorizer wait_auth_rotating failed" << dendl;
7248 return false;
7249 }
7250 }
7251
7252 *authorizer = monc->build_authorizer(dest_type);
7253 return *authorizer != NULL;
7254 }
7255
7256
7257 bool OSD::ms_verify_authorizer(
7258 Connection *con, int peer_type,
7259 int protocol, bufferlist& authorizer_data, bufferlist& authorizer_reply,
7260 bool& isvalid, CryptoKey& session_key,
7261 std::unique_ptr<AuthAuthorizerChallenge> *challenge)
7262 {
7263 AuthAuthorizeHandler *authorize_handler = 0;
7264 switch (peer_type) {
7265 case CEPH_ENTITY_TYPE_MDS:
7266 /*
7267 * note: mds is technically a client from our perspective, but
7268 * this makes the 'cluster' consistent w/ monitor's usage.
7269 */
7270 case CEPH_ENTITY_TYPE_OSD:
7271 case CEPH_ENTITY_TYPE_MGR:
7272 authorize_handler = authorize_handler_cluster_registry->get_handler(protocol);
7273 break;
7274 default:
7275 authorize_handler = authorize_handler_service_registry->get_handler(protocol);
7276 }
7277 if (!authorize_handler) {
7278 dout(0) << "No AuthAuthorizeHandler found for protocol " << protocol << dendl;
7279 isvalid = false;
7280 return true;
7281 }
7282
7283 AuthCapsInfo caps_info;
7284 EntityName name;
7285 uint64_t global_id;
7286 uint64_t auid = CEPH_AUTH_UID_DEFAULT;
7287
7288 RotatingKeyRing *keys = monc->rotating_secrets.get();
7289 if (keys) {
7290 isvalid = authorize_handler->verify_authorizer(
7291 cct, keys,
7292 authorizer_data, authorizer_reply, name, global_id, caps_info, session_key,
7293 &auid, challenge);
7294 } else {
7295 dout(10) << __func__ << " no rotating_keys (yet), denied" << dendl;
7296 isvalid = false;
7297 }
7298
7299 if (isvalid) {
7300 Session *s = static_cast<Session *>(con->get_priv());
7301 if (!s) {
7302 s = new Session(cct);
7303 con->set_priv(s->get());
7304 s->con = con;
7305 dout(10) << " new session " << s << " con=" << s->con << " addr=" << s->con->get_peer_addr() << dendl;
7306 }
7307
7308 s->entity_name = name;
7309 if (caps_info.allow_all)
7310 s->caps.set_allow_all();
7311 s->auid = auid;
7312
7313 if (caps_info.caps.length() > 0) {
7314 bufferlist::iterator p = caps_info.caps.begin();
7315 string str;
7316 try {
7317 ::decode(str, p);
7318 }
7319 catch (buffer::error& e) {
7320 }
7321 bool success = s->caps.parse(str);
7322 if (success)
7323 dout(10) << " session " << s << " " << s->entity_name << " has caps " << s->caps << " '" << str << "'" << dendl;
7324 else
7325 dout(10) << " session " << s << " " << s->entity_name << " failed to parse caps '" << str << "'" << dendl;
7326 }
7327
7328 s->put();
7329 }
7330 return true;
7331 }
7332
7333 void OSD::do_waiters()
7334 {
7335 assert(osd_lock.is_locked());
7336
7337 dout(10) << "do_waiters -- start" << dendl;
7338 while (!finished.empty()) {
7339 OpRequestRef next = finished.front();
7340 finished.pop_front();
7341 dispatch_op(next);
7342 }
7343 dout(10) << "do_waiters -- finish" << dendl;
7344 }
7345
7346 void OSD::dispatch_op(OpRequestRef op)
7347 {
7348 switch (op->get_req()->get_type()) {
7349
7350 case MSG_OSD_PG_CREATE:
7351 handle_pg_create(op);
7352 break;
7353 case MSG_OSD_PG_NOTIFY:
7354 handle_pg_notify(op);
7355 break;
7356 case MSG_OSD_PG_QUERY:
7357 handle_pg_query(op);
7358 break;
7359 case MSG_OSD_PG_LOG:
7360 handle_pg_log(op);
7361 break;
7362 case MSG_OSD_PG_REMOVE:
7363 handle_pg_remove(op);
7364 break;
7365 case MSG_OSD_PG_INFO:
7366 handle_pg_info(op);
7367 break;
7368 case MSG_OSD_PG_TRIM:
7369 handle_pg_trim(op);
7370 break;
7371 case MSG_OSD_BACKFILL_RESERVE:
7372 handle_pg_backfill_reserve(op);
7373 break;
7374 case MSG_OSD_RECOVERY_RESERVE:
7375 handle_pg_recovery_reserve(op);
7376 break;
7377 }
7378 }
7379
7380 void OSD::_dispatch(Message *m)
7381 {
7382 assert(osd_lock.is_locked());
7383 dout(20) << "_dispatch " << m << " " << *m << dendl;
7384
7385 switch (m->get_type()) {
7386
7387 // -- don't need lock --
7388 case CEPH_MSG_PING:
7389 dout(10) << "ping from " << m->get_source() << dendl;
7390 m->put();
7391 break;
7392
7393 // -- don't need OSDMap --
7394
7395 // map and replication
7396 case CEPH_MSG_OSD_MAP:
7397 handle_osd_map(static_cast<MOSDMap*>(m));
7398 break;
7399
7400 // osd
7401 case MSG_PGSTATSACK:
7402 handle_pg_stats_ack(static_cast<MPGStatsAck*>(m));
7403 break;
7404
7405 case MSG_MON_COMMAND:
7406 handle_command(static_cast<MMonCommand*>(m));
7407 break;
7408 case MSG_COMMAND:
7409 handle_command(static_cast<MCommand*>(m));
7410 break;
7411
7412 case MSG_OSD_SCRUB:
7413 handle_scrub(static_cast<MOSDScrub*>(m));
7414 break;
7415
7416 case MSG_OSD_FORCE_RECOVERY:
7417 handle_force_recovery(m);
7418 break;
7419
7420 // -- need OSDMap --
7421
7422 case MSG_OSD_PG_CREATE:
7423 case MSG_OSD_PG_NOTIFY:
7424 case MSG_OSD_PG_QUERY:
7425 case MSG_OSD_PG_LOG:
7426 case MSG_OSD_PG_REMOVE:
7427 case MSG_OSD_PG_INFO:
7428 case MSG_OSD_PG_TRIM:
7429 case MSG_OSD_BACKFILL_RESERVE:
7430 case MSG_OSD_RECOVERY_RESERVE:
7431 {
7432 OpRequestRef op = op_tracker.create_request<OpRequest, Message*>(m);
7433 if (m->trace)
7434 op->osd_trace.init("osd op", &trace_endpoint, &m->trace);
7435 // no map? starting up?
7436 if (!osdmap) {
7437 dout(7) << "no OSDMap, not booted" << dendl;
7438 logger->inc(l_osd_waiting_for_map);
7439 waiting_for_osdmap.push_back(op);
7440 op->mark_delayed("no osdmap");
7441 break;
7442 }
7443
7444 // need OSDMap
7445 dispatch_op(op);
7446 }
7447 }
7448 }
7449
7450 void OSD::handle_pg_scrub(MOSDScrub *m, PG *pg)
7451 {
7452 pg->lock();
7453 if (pg->is_primary()) {
7454 pg->unreg_next_scrub();
7455 pg->scrubber.must_scrub = true;
7456 pg->scrubber.must_deep_scrub = m->deep || m->repair;
7457 pg->scrubber.must_repair = m->repair;
7458 pg->reg_next_scrub();
7459 dout(10) << "marking " << *pg << " for scrub" << dendl;
7460 }
7461 pg->unlock();
7462 }
7463
7464 void OSD::handle_scrub(MOSDScrub *m)
7465 {
7466 dout(10) << "handle_scrub " << *m << dendl;
7467 if (!require_mon_or_mgr_peer(m)) {
7468 m->put();
7469 return;
7470 }
7471 if (m->fsid != monc->get_fsid()) {
7472 dout(0) << "handle_scrub fsid " << m->fsid << " != " << monc->get_fsid() << dendl;
7473 m->put();
7474 return;
7475 }
7476
7477 RWLock::RLocker l(pg_map_lock);
7478 if (m->scrub_pgs.empty()) {
7479 for (ceph::unordered_map<spg_t, PG*>::iterator p = pg_map.begin();
7480 p != pg_map.end();
7481 ++p)
7482 handle_pg_scrub(m, p->second);
7483 } else {
7484 for (vector<pg_t>::iterator p = m->scrub_pgs.begin();
7485 p != m->scrub_pgs.end();
7486 ++p) {
7487 spg_t pcand;
7488 if (osdmap->get_primary_shard(*p, &pcand)) {
7489 auto pg_map_entry = pg_map.find(pcand);
7490 if (pg_map_entry != pg_map.end()) {
7491 handle_pg_scrub(m, pg_map_entry->second);
7492 }
7493 }
7494 }
7495 }
7496
7497 m->put();
7498 }
7499
7500 bool OSD::scrub_random_backoff()
7501 {
7502 bool coin_flip = (rand() / (double)RAND_MAX >=
7503 cct->_conf->osd_scrub_backoff_ratio);
7504 if (!coin_flip) {
7505 dout(20) << "scrub_random_backoff lost coin flip, randomly backing off" << dendl;
7506 return true;
7507 }
7508 return false;
7509 }
7510
7511 OSDService::ScrubJob::ScrubJob(CephContext* cct,
7512 const spg_t& pg, const utime_t& timestamp,
7513 double pool_scrub_min_interval,
7514 double pool_scrub_max_interval, bool must)
7515 : cct(cct),
7516 pgid(pg),
7517 sched_time(timestamp),
7518 deadline(timestamp)
7519 {
7520 // if not explicitly requested, postpone the scrub with a random delay
7521 if (!must) {
7522 double scrub_min_interval = pool_scrub_min_interval > 0 ?
7523 pool_scrub_min_interval : cct->_conf->osd_scrub_min_interval;
7524 double scrub_max_interval = pool_scrub_max_interval > 0 ?
7525 pool_scrub_max_interval : cct->_conf->osd_scrub_max_interval;
7526
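// sched_time = now + min_interval * (1 + r * randomize_ratio) with r
// uniform in [0,1): e.g. assuming min_interval=24h and
// osd_scrub_interval_randomize_ratio=0.5, the scrub lands uniformly
// in [now+24h, now+36h); the deadline is simply now + max_interval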
7527 sched_time += scrub_min_interval;
7528 double r = rand() / (double)RAND_MAX;
7529 sched_time +=
7530 scrub_min_interval * cct->_conf->osd_scrub_interval_randomize_ratio * r;
7531 deadline += scrub_max_interval;
7532 }
7533 }
7534
7535 bool OSDService::ScrubJob::ScrubJob::operator<(const OSDService::ScrubJob& rhs) const {
7536 if (sched_time < rhs.sched_time)
7537 return true;
7538 if (sched_time > rhs.sched_time)
7539 return false;
7540 return pgid < rhs.pgid;
7541 }
7542
7543 bool OSD::scrub_time_permit(utime_t now)
7544 {
7545 struct tm bdt;
7546 time_t tt = now.sec();
7547 localtime_r(&tt, &bdt);
7548
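// both windows below are half-open [begin, end) and may wrap around:
// e.g. begin_hour=20 with end_hour=4 (hypothetical settings) permits
// scrubbing from 20:00 through 03:59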
7549 bool day_permit = false;
7550 if (cct->_conf->osd_scrub_begin_week_day < cct->_conf->osd_scrub_end_week_day) {
7551 if (bdt.tm_wday >= cct->_conf->osd_scrub_begin_week_day && bdt.tm_wday < cct->_conf->osd_scrub_end_week_day) {
7552 day_permit = true;
7553 }
7554 } else {
7555 if (bdt.tm_wday >= cct->_conf->osd_scrub_begin_week_day || bdt.tm_wday < cct->_conf->osd_scrub_end_week_day) {
7556 day_permit = true;
7557 }
7558 }
7559
7560 if (!day_permit) {
7561 dout(20) << __func__ << " should run between week day " << cct->_conf->osd_scrub_begin_week_day
7562 << " - " << cct->_conf->osd_scrub_end_week_day
7563 << " now " << bdt.tm_wday << " = no" << dendl;
7564 return false;
7565 }
7566
7567 bool time_permit = false;
7568 if (cct->_conf->osd_scrub_begin_hour < cct->_conf->osd_scrub_end_hour) {
7569 if (bdt.tm_hour >= cct->_conf->osd_scrub_begin_hour && bdt.tm_hour < cct->_conf->osd_scrub_end_hour) {
7570 time_permit = true;
7571 }
7572 } else {
7573 if (bdt.tm_hour >= cct->_conf->osd_scrub_begin_hour || bdt.tm_hour < cct->_conf->osd_scrub_end_hour) {
7574 time_permit = true;
7575 }
7576 }
7577 if (!time_permit) {
7578 dout(20) << __func__ << " should run between " << cct->_conf->osd_scrub_begin_hour
7579 << " - " << cct->_conf->osd_scrub_end_hour
7580 << " now " << bdt.tm_hour << " = no" << dendl;
7581 } else {
7582 dout(20) << __func__ << " should run between " << cct->_conf->osd_scrub_begin_hour
7583 << " - " << cct->_conf->osd_scrub_end_hour
7584 << " now " << bdt.tm_hour << " = yes" << dendl;
7585 }
7586 return time_permit;
7587 }
7588
7589 bool OSD::scrub_load_below_threshold()
7590 {
7591 double loadavgs[3];
7592 if (getloadavg(loadavgs, 3) != 3) {
7593 dout(10) << __func__ << " couldn't read loadavgs" << dendl;
7594 return false;
7595 }
7596
7597 // allow scrub if below configured threshold
7598 long cpus = sysconf(_SC_NPROCESSORS_ONLN);
7599 double loadavg_per_cpu = cpus > 0 ? loadavgs[0] / cpus : loadavgs[0];
7600 if (loadavg_per_cpu < cct->_conf->osd_scrub_load_threshold) {
7601 dout(20) << __func__ << " loadavg per cpu " << loadavg_per_cpu
7602 << " < max " << cct->_conf->osd_scrub_load_threshold
7603 << " = yes" << dendl;
7604 return true;
7605 }
7606
7607 // allow scrub if below daily avg and currently decreasing
7608 if (loadavgs[0] < daily_loadavg && loadavgs[0] < loadavgs[2]) {
7609 dout(20) << __func__ << " loadavg " << loadavgs[0]
7610 << " < daily_loadavg " << daily_loadavg
7611 << " and < 15m avg " << loadavgs[2]
7612 << " = yes" << dendl;
7613 return true;
7614 }
7615
7616 dout(20) << __func__ << " loadavg " << loadavgs[0]
7617 << " >= max " << cct->_conf->osd_scrub_load_threshold
7618 << " and ( >= daily_loadavg " << daily_loadavg
7619 << " or >= 15m avg " << loadavgs[2]
7620 << ") = no" << dendl;
7621 return false;
7622 }
7623
7624 void OSD::sched_scrub()
7625 {
7626 // if not permitted, fail fast
7627 if (!service.can_inc_scrubs_pending()) {
7628 return;
7629 }
7630 if (!cct->_conf->osd_scrub_during_recovery && service.is_recovery_active()) {
7631 dout(20) << __func__ << " not scheduling scrubs due to active recovery" << dendl;
7632 return;
7633 }
7634
7635
7636 utime_t now = ceph_clock_now();
7637 bool time_permit = scrub_time_permit(now);
7638 bool load_is_low = scrub_load_below_threshold();
7639 dout(20) << "sched_scrub load_is_low=" << (int)load_is_low << dendl;
7640
7641 OSDService::ScrubJob scrub;
7642 if (service.first_scrub_stamp(&scrub)) {
7643 do {
7644 dout(30) << "sched_scrub examine " << scrub.pgid << " at " << scrub.sched_time << dendl;
7645
7646 if (scrub.sched_time > now) {
7647 // save ourselves some effort
7648 dout(10) << "sched_scrub " << scrub.pgid << " scheduled at " << scrub.sched_time
7649 << " > " << now << dendl;
7650 break;
7651 }
7652
7653 if ((scrub.deadline >= now) && !(time_permit && load_is_low)) {
7654 dout(10) << __func__ << " not scheduling scrub for " << scrub.pgid << " due to "
7655 << (!time_permit ? "time not permitted" : "high load") << dendl;
7656 continue;
7657 }
7658
7659 PG *pg = _lookup_lock_pg(scrub.pgid);
7660 if (!pg)
7661 continue;
7662 if (pg->get_pgbackend()->scrub_supported() && pg->is_active()) {
7663 dout(10) << "sched_scrub scrubbing " << scrub.pgid << " at " << scrub.sched_time
7664 << (pg->scrubber.must_scrub ? ", explicitly requested" :
7665 (load_is_low ? ", load_is_low" : " deadline < now"))
7666 << dendl;
7667 if (pg->sched_scrub()) {
7668 pg->unlock();
7669 break;
7670 }
7671 }
7672 pg->unlock();
7673 } while (service.next_scrub_stamp(scrub, &scrub));
7674 }
7675 dout(20) << "sched_scrub done" << dendl;
7676 }
7677
7678
7679
7680 vector<OSDHealthMetric> OSD::get_health_metrics()
7681 {
7682 vector<OSDHealthMetric> metrics;
7683 lock_guard<mutex> pending_creates_locker{pending_creates_lock};
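// count pg creations still pending with this OSD as primary: all of
// the mon-requested creates, plus the osd-requested ones whose flag
// marks us as primary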
7684 auto n_primaries = pending_creates_from_mon;
7685 for (const auto& create : pending_creates_from_osd) {
7686 if (create.second) {
7687 n_primaries++;
7688 }
7689 }
7690 metrics.emplace_back(osd_metric::PENDING_CREATING_PGS, n_primaries);
7691 return metrics;
7692 }
7693
7694 // =====================================================
7695 // MAP
7696
7697 void OSD::wait_for_new_map(OpRequestRef op)
7698 {
7699 // ask?
7700 if (waiting_for_osdmap.empty()) {
7701 osdmap_subscribe(osdmap->get_epoch() + 1, false);
7702 }
7703
7704 logger->inc(l_osd_waiting_for_map);
7705 waiting_for_osdmap.push_back(op);
7706 op->mark_delayed("wait for new map");
7707 }
7708
7709
7710 /** update_map
7711 * assimilate new OSDMap(s). scan pgs, etc.
7712 */
7713
7714 void OSD::note_down_osd(int peer)
7715 {
7716 assert(osd_lock.is_locked());
7717 cluster_messenger->mark_down(osdmap->get_cluster_addr(peer));
7718
7719 heartbeat_lock.Lock();
7720 failure_queue.erase(peer);
7721 failure_pending.erase(peer);
7722 map<int,HeartbeatInfo>::iterator p = heartbeat_peers.find(peer);
7723 if (p != heartbeat_peers.end()) {
7724 p->second.con_back->mark_down();
7725 if (p->second.con_front) {
7726 p->second.con_front->mark_down();
7727 }
7728 heartbeat_peers.erase(p);
7729 }
7730 heartbeat_lock.Unlock();
7731 }
7732
7733 void OSD::note_up_osd(int peer)
7734 {
7735 service.forget_peer_epoch(peer, osdmap->get_epoch() - 1);
7736 heartbeat_set_peers_need_update();
7737 }
7738
7739 struct C_OnMapCommit : public Context {
7740 OSD *osd;
7741 epoch_t first, last;
7742 MOSDMap *msg;
7743 C_OnMapCommit(OSD *o, epoch_t f, epoch_t l, MOSDMap *m)
7744 : osd(o), first(f), last(l), msg(m) {}
7745 void finish(int r) override {
7746 osd->_committed_osd_maps(first, last, msg);
7747 msg->put();
7748 }
7749 };
7750
7751 struct C_OnMapApply : public Context {
7752 OSDService *service;
7753 list<OSDMapRef> pinned_maps;
7754 epoch_t e;
7755 C_OnMapApply(OSDService *service,
7756 const list<OSDMapRef> &pinned_maps,
7757 epoch_t e)
7758 : service(service), pinned_maps(pinned_maps), e(e) {}
7759 void finish(int r) override {
7760 service->clear_map_bl_cache_pins(e);
7761 }
7762 };
7763
7764 void OSD::osdmap_subscribe(version_t epoch, bool force_request)
7765 {
7766 Mutex::Locker l(osdmap_subscribe_lock);
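// ask the mon for a one-shot feed of maps starting at 'epoch';
// duplicate requests for epochs we already subscribed to are dropped
// unless the caller forces a renewal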
7767 if (latest_subscribed_epoch >= epoch && !force_request)
7768 return;
7769
7770 latest_subscribed_epoch = MAX(epoch, latest_subscribed_epoch);
7771
7772 if (monc->sub_want_increment("osdmap", epoch, CEPH_SUBSCRIBE_ONETIME) ||
7773 force_request) {
7774 monc->renew_subs();
7775 }
7776 }
7777
7778 void OSD::trim_maps(epoch_t oldest, int nreceived, bool skip_maps)
7779 {
7780 epoch_t min = std::min(oldest, service.map_cache.cached_key_lower_bound());
7781 if (min <= superblock.oldest_map)
7782 return;
7783
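// remove old full and incremental map objects in batches, committing
// the superblock as oldest_map advances so a crash never leaves it
// pointing at already-deleted maps; a batch is flushed once it reaches
// both osd_target_transaction_size and the number of maps just received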
7784 int num = 0;
7785 ObjectStore::Transaction t;
7786 for (epoch_t e = superblock.oldest_map; e < min; ++e) {
7787 dout(20) << " removing old osdmap epoch " << e << dendl;
7788 t.remove(coll_t::meta(), get_osdmap_pobject_name(e));
7789 t.remove(coll_t::meta(), get_inc_osdmap_pobject_name(e));
7790 superblock.oldest_map = e + 1;
7791 num++;
7792 if (num >= cct->_conf->osd_target_transaction_size && num >= nreceived) {
7793 service.publish_superblock(superblock);
7794 write_superblock(t);
7795 int tr = store->queue_transaction(service.meta_osr.get(), std::move(t), nullptr);
7796 assert(tr == 0);
7797 num = 0;
7798 if (!skip_maps) {
7799 // skip_maps leaves us with a range of old maps if we fail to remove all
7800 // of them before moving superblock.oldest_map forward to the first map
7801 // in the incoming MOSDMap msg. so we should keep removing them in
7802 // that case, even though it may mean issuing a huge series of delete
7803 // transactions all at once.
7804 break;
7805 }
7806 }
7807 }
7808 if (num > 0) {
7809 service.publish_superblock(superblock);
7810 write_superblock(t);
7811 int tr = store->queue_transaction(service.meta_osr.get(), std::move(t), nullptr);
7812 assert(tr == 0);
7813 }
7814 // we should not remove the cached maps
7815 assert(min <= service.map_cache.cached_key_lower_bound());
7816 }
7817
7818 void OSD::handle_osd_map(MOSDMap *m)
7819 {
7820 assert(osd_lock.is_locked());
7821 // Keep a ref in the list until we get the newly received map written
7822 // onto disk. This is important because as long as the refs are alive,
7823 // the OSDMaps will be pinned in the cache and we won't try to read it
7824 // off of disk. Otherwise these maps will probably not stay in the cache,
7825 // and reading those OSDMaps before they are actually written can result
7826 // in a crash.
7827 list<OSDMapRef> pinned_maps;
7828 if (m->fsid != monc->get_fsid()) {
7829 dout(0) << "handle_osd_map fsid " << m->fsid << " != "
7830 << monc->get_fsid() << dendl;
7831 m->put();
7832 return;
7833 }
7834 if (is_initializing()) {
7835 dout(0) << "ignoring osdmap until we have initialized" << dendl;
7836 m->put();
7837 return;
7838 }
7839
7840 Session *session = static_cast<Session *>(m->get_connection()->get_priv());
7841 if (session && !(session->entity_name.is_mon() ||
7842 session->entity_name.is_osd())) {
7843 //not enough perms!
7844 dout(10) << "got osd map from Session " << session
7845 << " which we can't take maps from (not a mon or osd)" << dendl;
7846 m->put();
7847 session->put();
7848 return;
7849 }
7850 if (session)
7851 session->put();
7852
7853 // share with the objecter
7854 if (!is_preboot())
7855 service.objecter->handle_osd_map(m);
7856
7857 epoch_t first = m->get_first();
7858 epoch_t last = m->get_last();
7859 dout(3) << "handle_osd_map epochs [" << first << "," << last << "], i have "
7860 << superblock.newest_map
7861 << ", src has [" << m->oldest_map << "," << m->newest_map << "]"
7862 << dendl;
7863
7864 logger->inc(l_osd_map);
7865 logger->inc(l_osd_mape, last - first + 1);
7866 if (first <= superblock.newest_map)
7867 logger->inc(l_osd_mape_dup, superblock.newest_map - first + 1);
7868 if (service.max_oldest_map < m->oldest_map) {
7869 service.max_oldest_map = m->oldest_map;
7870 assert(service.max_oldest_map >= superblock.oldest_map);
7871 }
7872
7873 // make sure there is something new, here, before we bother flushing
7874 // the queues and such
7875 if (last <= superblock.newest_map) {
7876 dout(10) << " no new maps here, dropping" << dendl;
7877 m->put();
7878 return;
7879 }
7880
7881 // missing some?
7882 bool skip_maps = false;
7883 if (first > superblock.newest_map + 1) {
7884 dout(10) << "handle_osd_map message skips epochs "
7885 << superblock.newest_map + 1 << ".." << (first-1) << dendl;
7886 if (m->oldest_map <= superblock.newest_map + 1) {
7887 osdmap_subscribe(superblock.newest_map + 1, false);
7888 m->put();
7889 return;
7890 }
7891 // always try to get the full range of maps--as many as we can. this
7892 // 1- is good to have
7893 // 2- is at present the only way to ensure that we get a *full* map as
7894 // the first map!
7895 if (m->oldest_map < first) {
7896 osdmap_subscribe(m->oldest_map - 1, true);
7897 m->put();
7898 return;
7899 }
7900 skip_maps = true;
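// proceed despite the gap: superblock.oldest_map is reset to 'first'
// below, and trim_maps() will keep deleting the now-orphaned older maps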
7901 }
7902
7903 ObjectStore::Transaction t;
7904 uint64_t txn_size = 0;
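// txn_size remembers t's encoded size from the previous iteration;
// since get_num_bytes() only grows as map writes are appended, a
// non-increasing value would mean the byte counter wrapped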
7905
7906 // store new maps: queue for disk and put in the osdmap cache
7907 epoch_t start = MAX(superblock.newest_map + 1, first);
7908 for (epoch_t e = start; e <= last; e++) {
7909 if (txn_size >= t.get_num_bytes()) {
7910 derr << __func__ << " transaction size overflowed" << dendl;
7911 assert(txn_size < t.get_num_bytes());
7912 }
7913 txn_size = t.get_num_bytes();
7914 map<epoch_t,bufferlist>::iterator p;
7915 p = m->maps.find(e);
7916 if (p != m->maps.end()) {
7917 dout(10) << "handle_osd_map got full map for epoch " << e << dendl;
7918 OSDMap *o = new OSDMap;
7919 bufferlist& bl = p->second;
7920
7921 o->decode(bl);
7922
7923 ghobject_t fulloid = get_osdmap_pobject_name(e);
7924 t.write(coll_t::meta(), fulloid, 0, bl.length(), bl);
7925 pin_map_bl(e, bl);
7926 pinned_maps.push_back(add_map(o));
7927
7928 got_full_map(e);
7929 continue;
7930 }
7931
7932 p = m->incremental_maps.find(e);
7933 if (p != m->incremental_maps.end()) {
7934 dout(10) << "handle_osd_map got inc map for epoch " << e << dendl;
7935 bufferlist& bl = p->second;
7936 ghobject_t oid = get_inc_osdmap_pobject_name(e);
7937 t.write(coll_t::meta(), oid, 0, bl.length(), bl);
7938 pin_map_inc_bl(e, bl);
7939
7940 OSDMap *o = new OSDMap;
7941 if (e > 1) {
7942 bufferlist obl;
7943 bool got = get_map_bl(e - 1, obl);
7944 assert(got);
7945 o->decode(obl);
7946 }
7947
7948 OSDMap::Incremental inc;
7949 bufferlist::iterator p = bl.begin();
7950 inc.decode(p);
7951 if (o->apply_incremental(inc) < 0) {
7952 derr << "ERROR: bad fsid? i have " << osdmap->get_fsid() << " and inc has " << inc.fsid << dendl;
7953 assert(0 == "bad fsid");
7954 }
7955
7956 bufferlist fbl;
7957 o->encode(fbl, inc.encode_features | CEPH_FEATURE_RESERVED);
7958
7959 bool injected_failure = false;
7960 if (cct->_conf->osd_inject_bad_map_crc_probability > 0 &&
7961 (rand() % 10000) < cct->_conf->osd_inject_bad_map_crc_probability*10000.0) {
7962 derr << __func__ << " injecting map crc failure" << dendl;
7963 injected_failure = true;
7964 }
7965
7966 if ((inc.have_crc && o->get_crc() != inc.full_crc) || injected_failure) {
7967 dout(2) << "got incremental " << e
7968 << " but failed to encode full with correct crc; requesting"
7969 << dendl;
7970 clog->warn() << "failed to encode map e" << e << " with expected crc";
7971 dout(20) << "my encoded map was:\n";
7972 fbl.hexdump(*_dout);
7973 *_dout << dendl;
7974 delete o;
7975 request_full_map(e, last);
7976 last = e - 1;
7977 break;
7978 }
7979 got_full_map(e);
7980
7981 ghobject_t fulloid = get_osdmap_pobject_name(e);
7982 t.write(coll_t::meta(), fulloid, 0, fbl.length(), fbl);
7983 pin_map_bl(e, fbl);
7984 pinned_maps.push_back(add_map(o));
7985 continue;
7986 }
7987
7988 assert(0 == "MOSDMap lied about what maps it had?");
7989 }
7990
7991 // even if this map isn't from a mon, we may have satisfied our subscription
7992 monc->sub_got("osdmap", last);
7993
7994 if (!m->maps.empty() && requested_full_first) {
7995 dout(10) << __func__ << " still missing full maps " << requested_full_first
7996 << ".." << requested_full_last << dendl;
7997 rerequest_full_maps();
7998 }
7999
8000 if (superblock.oldest_map) {
8001 // make sure we at least keep pace with incoming maps
8002 trim_maps(m->oldest_map, last - first + 1, skip_maps);
8003 }
8004
8005 if (!superblock.oldest_map || skip_maps)
8006 superblock.oldest_map = first;
8007 superblock.newest_map = last;
8008 superblock.current_epoch = last;
8009
8010 // note in the superblock that we were clean thru the prior epoch
8011 epoch_t boot_epoch = service.get_boot_epoch();
8012 if (boot_epoch && boot_epoch >= superblock.mounted) {
8013 superblock.mounted = boot_epoch;
8014 superblock.clean_thru = last;
8015 }
8016
8017 // superblock and commit
8018 write_superblock(t);
8019 store->queue_transaction(
8020 service.meta_osr.get(),
8021 std::move(t),
8022 new C_OnMapApply(&service, pinned_maps, last),
8023 new C_OnMapCommit(this, start, last, m), 0);
8024 service.publish_superblock(superblock);
8025 }
8026
8027 void OSD::_committed_osd_maps(epoch_t first, epoch_t last, MOSDMap *m)
8028 {
8029 dout(10) << __func__ << " " << first << ".." << last << dendl;
8030 if (is_stopping()) {
8031 dout(10) << __func__ << " bailing, we are shutting down" << dendl;
8032 return;
8033 }
8034 Mutex::Locker l(osd_lock);
8035 if (is_stopping()) {
8036 dout(10) << __func__ << " bailing, we are shutting down" << dendl;
8037 return;
8038 }
8039 map_lock.get_write();
8040
8041 bool do_shutdown = false;
8042 bool do_restart = false;
8043 bool network_error = false;
8044
8045 // advance through the new maps
8046 for (epoch_t cur = first; cur <= last; cur++) {
8047 dout(10) << " advance to epoch " << cur
8048 << " (<= last " << last
8049 << " <= newest_map " << superblock.newest_map
8050 << ")" << dendl;
8051
8052 OSDMapRef newmap = get_map(cur);
8053 assert(newmap); // we just cached it above!
8054
8055 // start blacklisting messages sent to peers that go down.
8056 service.pre_publish_map(newmap);
8057
8058 // kill connections to newly down osds
8059 bool waited_for_reservations = false;
8060 set<int> old;
8061 osdmap->get_all_osds(old);
8062 for (set<int>::iterator p = old.begin(); p != old.end(); ++p) {
8063 if (*p != whoami &&
8064 osdmap->is_up(*p) && // in old map
8065 newmap->is_down(*p)) { // but not the new one
8066 if (!waited_for_reservations) {
8067 service.await_reserved_maps();
8068 waited_for_reservations = true;
8069 }
8070 note_down_osd(*p);
8071 } else if (*p != whoami &&
8072 osdmap->is_down(*p) &&
8073 newmap->is_up(*p)) {
8074 note_up_osd(*p);
8075 }
8076 }
8077
8078 if ((osdmap->test_flag(CEPH_OSDMAP_NOUP) !=
8079 newmap->test_flag(CEPH_OSDMAP_NOUP)) ||
8080 (osdmap->is_noup(whoami) != newmap->is_noup(whoami))) {
8081 dout(10) << __func__ << " NOUP flag changed in " << newmap->get_epoch()
8082 << dendl;
8083 if (is_booting()) {
8084 // this captures the case where we sent the boot message while
8085 // NOUP was being set on the mon and our boot request was
8086 // dropped, and then later it is cleared. it imperfectly
8087 // handles the case where our original boot message was not
8088 // dropped and we restart even though we might have booted, but
8089 // that is harmless (boot will just take slightly longer).
8090 do_restart = true;
8091 }
8092 }
8093 if (osdmap->require_osd_release < CEPH_RELEASE_LUMINOUS &&
8094 newmap->require_osd_release >= CEPH_RELEASE_LUMINOUS) {
8095 dout(10) << __func__ << " require_osd_release reached luminous in "
8096 << newmap->get_epoch() << dendl;
8097 clear_pg_stat_queue();
8098 clear_outstanding_pg_stats();
8099 }
8100
8101 osdmap = newmap;
8102 epoch_t up_epoch;
8103 epoch_t boot_epoch;
8104 service.retrieve_epochs(&boot_epoch, &up_epoch, NULL);
8105 if (!up_epoch &&
8106 osdmap->is_up(whoami) &&
8107 osdmap->get_inst(whoami) == client_messenger->get_myinst()) {
8108 up_epoch = osdmap->get_epoch();
8109 dout(10) << "up_epoch is " << up_epoch << dendl;
8110 if (!boot_epoch) {
8111 boot_epoch = osdmap->get_epoch();
8112 dout(10) << "boot_epoch is " << boot_epoch << dendl;
8113 }
8114 service.set_epochs(&boot_epoch, &up_epoch, NULL);
8115 }
8116 }
8117
8118 had_map_since = ceph_clock_now();
8119
8120 epoch_t _bind_epoch = service.get_bind_epoch();
8121 if (osdmap->is_up(whoami) &&
8122 osdmap->get_addr(whoami) == client_messenger->get_myaddr() &&
8123 _bind_epoch < osdmap->get_up_from(whoami)) {
8124
8125 if (is_booting()) {
8126 dout(1) << "state: booting -> active" << dendl;
8127 set_state(STATE_ACTIVE);
8128
8129 // set incarnation so that osd_reqid_t's we generate for our
8130 // objecter requests are unique across restarts.
8131 service.objecter->set_client_incarnation(osdmap->get_epoch());
8132 }
8133 }
8134
8135 if (osdmap->get_epoch() > 0 &&
8136 is_active()) {
8137 if (!osdmap->exists(whoami)) {
8138 dout(0) << "map says i do not exist. shutting down." << dendl;
8139 do_shutdown = true; // don't call shutdown() while we have
8140 // everything paused
8141 } else if (!osdmap->is_up(whoami) ||
8142 !osdmap->get_addr(whoami).probably_equals(
8143 client_messenger->get_myaddr()) ||
8144 !osdmap->get_cluster_addr(whoami).probably_equals(
8145 cluster_messenger->get_myaddr()) ||
8146 !osdmap->get_hb_back_addr(whoami).probably_equals(
8147 hb_back_server_messenger->get_myaddr()) ||
8148 (osdmap->get_hb_front_addr(whoami) != entity_addr_t() &&
8149 !osdmap->get_hb_front_addr(whoami).probably_equals(
8150 hb_front_server_messenger->get_myaddr()))) {
8151 if (!osdmap->is_up(whoami)) {
8152 if (service.is_preparing_to_stop() || service.is_stopping()) {
8153 service.got_stop_ack();
8154 } else {
8155 clog->warn() << "Monitor daemon marked osd." << whoami << " down, "
8156 "but it is still running";
8157 clog->debug() << "map e" << osdmap->get_epoch()
8158 << " wrongly marked me down at e"
8159 << osdmap->get_down_at(whoami);
8160 }
8161 } else if (!osdmap->get_addr(whoami).probably_equals(
8162 client_messenger->get_myaddr())) {
8163 clog->error() << "map e" << osdmap->get_epoch()
8164 << " had wrong client addr (" << osdmap->get_addr(whoami)
8165 << " != my " << client_messenger->get_myaddr() << ")";
8166 } else if (!osdmap->get_cluster_addr(whoami).probably_equals(
8167 cluster_messenger->get_myaddr())) {
8168 clog->error() << "map e" << osdmap->get_epoch()
8169 << " had wrong cluster addr ("
8170 << osdmap->get_cluster_addr(whoami)
8171 << " != my " << cluster_messenger->get_myaddr() << ")";
8172 } else if (!osdmap->get_hb_back_addr(whoami).probably_equals(
8173 hb_back_server_messenger->get_myaddr())) {
8174 clog->error() << "map e" << osdmap->get_epoch()
8175 << " had wrong heartbeat back addr ("
8176 << osdmap->get_hb_back_addr(whoami)
8177 << " != my " << hb_back_server_messenger->get_myaddr()
8178 << ")";
8179 } else if (osdmap->get_hb_front_addr(whoami) != entity_addr_t() &&
8180 !osdmap->get_hb_front_addr(whoami).probably_equals(
8181 hb_front_server_messenger->get_myaddr())) {
8182 clog->error() << "map e" << osdmap->get_epoch()
8183 << " had wrong heartbeat front addr ("
8184 << osdmap->get_hb_front_addr(whoami)
8185 << " != my " << hb_front_server_messenger->get_myaddr()
8186 << ")";
8187 }
8188
8189 if (!service.is_stopping()) {
8190 epoch_t up_epoch = 0;
8191 epoch_t bind_epoch = osdmap->get_epoch();
8192 service.set_epochs(NULL,&up_epoch, &bind_epoch);
8193 do_restart = true;
8194
8195 //add markdown log
8196 utime_t now = ceph_clock_now();
8197 utime_t grace = utime_t(cct->_conf->osd_max_markdown_period, 0);
8198 osd_markdown_log.push_back(now);
8199 // drop out-of-date entries
8200 while (!osd_markdown_log.empty() &&
8201 osd_markdown_log.front() + grace < now)
8202 osd_markdown_log.pop_front();
8203 if ((int)osd_markdown_log.size() > cct->_conf->osd_max_markdown_count) {
8204 dout(0) << __func__ << " marked down "
8205 << osd_markdown_log.size()
8206 << " > osd_max_markdown_count "
8207 << cct->_conf->osd_max_markdown_count
8208 << " in last " << grace << " seconds, shutting down"
8209 << dendl;
8210 do_restart = false;
8211 do_shutdown = true;
8212 }
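// e.g. assuming the shipped defaults osd_max_markdown_count=5 and
// osd_max_markdown_period=600, a sixth markdown within ten minutes
// turns the restart into a shutdown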
8213
8214 start_waiting_for_healthy();
8215
8216 set<int> avoid_ports;
8217 #if defined(__FreeBSD__)
8218 // prevent FreeBSD from grabbing the client_messenger port during
8219 // rebinding, in which case the cluster_messenger would end up
8220 // connecting to the same port
8221 avoid_ports.insert(client_messenger->get_myaddr().get_port());
8222 #endif
8223 avoid_ports.insert(cluster_messenger->get_myaddr().get_port());
8224 avoid_ports.insert(hb_back_server_messenger->get_myaddr().get_port());
8225 avoid_ports.insert(hb_front_server_messenger->get_myaddr().get_port());
8226
8227 int r = cluster_messenger->rebind(avoid_ports);
8228 if (r != 0) {
8229 do_shutdown = true; // FIXME: do_restart?
8230 network_error = true;
8231 dout(0) << __func__ << " marked down:"
8232 << " rebind cluster_messenger failed" << dendl;
8233 }
8234
8235 r = hb_back_server_messenger->rebind(avoid_ports);
8236 if (r != 0) {
8237 do_shutdown = true; // FIXME: do_restart?
8238 network_error = true;
8239 dout(0) << __func__ << " marked down:"
8240 << " rebind hb_back_server_messenger failed" << dendl;
8241 }
8242
8243 r = hb_front_server_messenger->rebind(avoid_ports);
8244 if (r != 0) {
8245 do_shutdown = true; // FIXME: do_restart?
8246 network_error = true;
8247 dout(0) << __func__ << " marked down:"
8248 << " rebind hb_front_server_messenger failed" << dendl;
8249 }
8250
8251 hb_front_client_messenger->mark_down_all();
8252 hb_back_client_messenger->mark_down_all();
8253
8254 reset_heartbeat_peers();
8255 }
8256 }
8257 }
8258
8259 map_lock.put_write();
8260
8261 check_osdmap_features(store);
8262
8263 // yay!
8264 consume_map();
8265
8266 if (is_active() || is_waiting_for_healthy())
8267 maybe_update_heartbeat_peers();
8268
8269 if (!is_active()) {
8270 dout(10) << " not yet active; waiting for peering wq to drain" << dendl;
8271 peering_wq.drain();
8272 } else {
8273 activate_map();
8274 }
8275
8276 if (do_shutdown) {
8277 if (network_error) {
8278 Mutex::Locker l(heartbeat_lock);
8279 map<int,pair<utime_t,entity_inst_t>>::iterator it =
8280 failure_pending.begin();
8281 while (it != failure_pending.end()) {
8282 dout(10) << "handle_osd_ping canceling in-flight failure report for osd."
8283 << it->first << dendl;
8284 send_still_alive(osdmap->get_epoch(), it->second.second);
8285 failure_pending.erase(it++);
8286 }
8287 }
8288 // trigger shutdown in a different thread
8289 dout(0) << __func__ << " shutdown OSD via async signal" << dendl;
8290 queue_async_signal(SIGINT);
8291 }
8292 else if (m->newest_map && m->newest_map > last) {
8293 dout(10) << " msg says newest map is " << m->newest_map
8294 << ", requesting more" << dendl;
8295 osdmap_subscribe(osdmap->get_epoch()+1, false);
8296 }
8297 else if (is_preboot()) {
8298 if (m->get_source().is_mon())
8299 _preboot(m->oldest_map, m->newest_map);
8300 else
8301 start_boot();
8302 }
8303 else if (do_restart)
8304 start_boot();
8305
8306 }
8307
8308 void OSD::check_osdmap_features(ObjectStore *fs)
8309 {
8310 // adjust required feature bits?
8311
8312 // we have to be a bit careful here, because we are accessing the
8313 // Policy structures without taking any lock. in particular, only
8314 // modify integer values that can safely be read by a racing CPU.
8315 // since we are only accessing existing Policy structures at their
8316 // current memory location, and setting or clearing bits in integer
8317 // fields, and we are the only writer, this is not a problem.
8318
8319 {
8320 Messenger::Policy p = client_messenger->get_default_policy();
8321 uint64_t mask;
8322 uint64_t features = osdmap->get_features(entity_name_t::TYPE_CLIENT, &mask);
8323 if ((p.features_required & mask) != features) {
8324 dout(0) << "crush map has features " << features
8325 << ", adjusting msgr requires for clients" << dendl;
8326 p.features_required = (p.features_required & ~mask) | features;
8327 client_messenger->set_default_policy(p);
8328 }
8329 }
8330 {
8331 Messenger::Policy p = client_messenger->get_policy(entity_name_t::TYPE_MON);
8332 uint64_t mask;
8333 uint64_t features = osdmap->get_features(entity_name_t::TYPE_MON, &mask);
8334 if ((p.features_required & mask) != features) {
8335 dout(0) << "crush map has features " << features
8336 << " was " << p.features_required
8337 << ", adjusting msgr requires for mons" << dendl;
8338 p.features_required = (p.features_required & ~mask) | features;
8339 client_messenger->set_policy(entity_name_t::TYPE_MON, p);
8340 }
8341 }
8342 {
8343 Messenger::Policy p = cluster_messenger->get_policy(entity_name_t::TYPE_OSD);
8344 uint64_t mask;
8345 uint64_t features = osdmap->get_features(entity_name_t::TYPE_OSD, &mask);
8346
8347 if ((p.features_required & mask) != features) {
8348 dout(0) << "crush map has features " << features
8349 << ", adjusting msgr requires for osds" << dendl;
8350 p.features_required = (p.features_required & ~mask) | features;
8351 cluster_messenger->set_policy(entity_name_t::TYPE_OSD, p);
8352 }
8353
8354 if ((features & CEPH_FEATURE_OSD_ERASURE_CODES) &&
8355 !superblock.compat_features.incompat.contains(CEPH_OSD_FEATURE_INCOMPAT_SHARDS)) {
8356 dout(0) << __func__ << " enabling on-disk ERASURE CODES compat feature" << dendl;
8357 superblock.compat_features.incompat.insert(CEPH_OSD_FEATURE_INCOMPAT_SHARDS);
8358 ObjectStore::Transaction t;
8359 write_superblock(t);
8360 int err = store->queue_transaction(service.meta_osr.get(), std::move(t), NULL);
8361 assert(err == 0);
8362 }
8363 }
8364 }
8365
8366 bool OSD::advance_pg(
8367 epoch_t osd_epoch, PG *pg,
8368 ThreadPool::TPHandle &handle,
8369 PG::RecoveryCtx *rctx,
8370 set<PGRef> *new_pgs)
8371 {
8372 assert(pg->is_locked());
8373 epoch_t next_epoch = pg->get_osdmap()->get_epoch() + 1;
8374 OSDMapRef lastmap = pg->get_osdmap();
8375
8376 if (lastmap->get_epoch() == osd_epoch)
8377 return true;
8378 assert(lastmap->get_epoch() < osd_epoch);
8379
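// bound how far a single call advances: osd_map_max_advance epochs
// past the oldest pg epoch on this OSD (or past our next epoch if
// none), so one pg can't monopolize the worker while catching up; if
// we stop short, the caller requeues us (see the return false below)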
8380 epoch_t min_epoch = service.get_min_pg_epoch();
8381 epoch_t max;
8382 if (min_epoch) {
8383 max = min_epoch + cct->_conf->osd_map_max_advance;
8384 } else {
8385 max = next_epoch + cct->_conf->osd_map_max_advance;
8386 }
8387
8388 for (;
8389 next_epoch <= osd_epoch && next_epoch <= max;
8390 ++next_epoch) {
8391 OSDMapRef nextmap = service.try_get_map(next_epoch);
8392 if (!nextmap) {
8393 dout(20) << __func__ << " missing map " << next_epoch << dendl;
8394 // make sure max is bumped up so that we can get past any
8395 // gap in maps
8396 max = MAX(max, next_epoch + cct->_conf->osd_map_max_advance);
8397 continue;
8398 }
8399
8400 vector<int> newup, newacting;
8401 int up_primary, acting_primary;
8402 nextmap->pg_to_up_acting_osds(
8403 pg->info.pgid.pgid,
8404 &newup, &up_primary,
8405 &newacting, &acting_primary);
8406 pg->handle_advance_map(
8407 nextmap, lastmap, newup, up_primary,
8408 newacting, acting_primary, rctx);
8409
8410 // Check for split!
8411 set<spg_t> children;
8412 spg_t parent(pg->info.pgid);
8413 if (parent.is_split(
8414 lastmap->get_pg_num(pg->pool.id),
8415 nextmap->get_pg_num(pg->pool.id),
8416 &children)) {
8417 service.mark_split_in_progress(pg->info.pgid, children);
8418 split_pgs(
8419 pg, children, new_pgs, lastmap, nextmap,
8420 rctx);
8421 }
8422
8423 lastmap = nextmap;
8424 handle.reset_tp_timeout();
8425 }
8426 service.pg_update_epoch(pg->info.pgid, lastmap->get_epoch());
8427 pg->handle_activate_map(rctx);
8428 if (next_epoch <= osd_epoch) {
8429 dout(10) << __func__ << " advanced to max " << max
8430 << " past min epoch " << min_epoch
8431 << " ... will requeue " << *pg << dendl;
8432 return false;
8433 }
8434 return true;
8435 }
8436
8437 void OSD::consume_map()
8438 {
8439 assert(osd_lock.is_locked());
8440 dout(7) << "consume_map version " << osdmap->get_epoch() << dendl;
8441
8442 /** make sure the cluster is speaking in SORTBITWISE, because we don't
8443 * speak the older sorting version any more. Be careful not to force
8444 * a shutdown if we are merely processing old maps, though.
8445 */
8446 if (!osdmap->test_flag(CEPH_OSDMAP_SORTBITWISE) && is_active()) {
8447 derr << __func__ << " SORTBITWISE flag is not set" << dendl;
8448 ceph_abort();
8449 }
8450
8451 int num_pg_primary = 0, num_pg_replica = 0, num_pg_stray = 0;
8452 list<PGRef> to_remove;
8453
8454 // scan pg's
8455 {
8456 RWLock::RLocker l(pg_map_lock);
8457 for (ceph::unordered_map<spg_t,PG*>::iterator it = pg_map.begin();
8458 it != pg_map.end();
8459 ++it) {
8460 PG *pg = it->second;
8461 pg->lock();
8462 if (pg->is_primary())
8463 num_pg_primary++;
8464 else if (pg->is_replica())
8465 num_pg_replica++;
8466 else
8467 num_pg_stray++;
8468
8469 if (!osdmap->have_pg_pool(pg->info.pgid.pool())) {
8470 //pool is deleted!
8471 to_remove.push_back(PGRef(pg));
8472 } else {
8473 service.init_splits_between(it->first, service.get_osdmap(), osdmap);
8474 }
8475
8476 pg->unlock();
8477 }
8478
8479 lock_guard<mutex> pending_creates_locker{pending_creates_lock};
8480 for (auto pg = pending_creates_from_osd.cbegin();
8481 pg != pending_creates_from_osd.cend();) {
8482 if (osdmap->get_pg_acting_rank(pg->first, whoami) < 0) {
8483 pg = pending_creates_from_osd.erase(pg);
8484 } else {
8485 ++pg;
8486 }
8487 }
8488 }
8489
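// tear down pgs whose pool was deleted; the erase() in the loop header
// advances the iterator after each removal, and each pg is removed
// with pg_map_lock held for write and the pg itself locked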
8490 for (list<PGRef>::iterator i = to_remove.begin();
8491 i != to_remove.end();
8492 to_remove.erase(i++)) {
8493 RWLock::WLocker locker(pg_map_lock);
8494 (*i)->lock();
8495 _remove_pg(&**i);
8496 (*i)->unlock();
8497 }
8498
8499 service.expand_pg_num(service.get_osdmap(), osdmap);
8500
8501 service.pre_publish_map(osdmap);
8502 service.await_reserved_maps();
8503 service.publish_map(osdmap);
8504
8505 service.maybe_inject_dispatch_delay();
8506
8507 dispatch_sessions_waiting_on_map();
8508
8509 service.maybe_inject_dispatch_delay();
8510
8511 // remove any PGs which we no longer host from the session waiting_for_pg lists
8512 dout(20) << __func__ << " checking waiting_for_pg" << dendl;
8513 op_shardedwq.prune_pg_waiters(osdmap, whoami);
8514
8515 service.maybe_inject_dispatch_delay();
8516
8517 // scan pg's
8518 {
8519 RWLock::RLocker l(pg_map_lock);
8520 for (ceph::unordered_map<spg_t,PG*>::iterator it = pg_map.begin();
8521 it != pg_map.end();
8522 ++it) {
8523 PG *pg = it->second;
8524 pg->lock();
8525 pg->queue_null(osdmap->get_epoch(), osdmap->get_epoch());
8526 pg->unlock();
8527 }
8528
8529 logger->set(l_osd_pg, pg_map.size());
8530 }
8531 logger->set(l_osd_pg_primary, num_pg_primary);
8532 logger->set(l_osd_pg_replica, num_pg_replica);
8533 logger->set(l_osd_pg_stray, num_pg_stray);
8534 logger->set(l_osd_pg_removing, remove_wq.get_remove_queue_len());
8535 }
8536
8537 void OSD::activate_map()
8538 {
8539 assert(osd_lock.is_locked());
8540
8541 dout(7) << "activate_map version " << osdmap->get_epoch() << dendl;
8542
8543 if (osdmap->test_flag(CEPH_OSDMAP_FULL)) {
8544 dout(10) << " osdmap flagged full, doing onetime osdmap subscribe" << dendl;
8545 osdmap_subscribe(osdmap->get_epoch() + 1, false);
8546 }
8547
8548 // norecover?
8549 if (osdmap->test_flag(CEPH_OSDMAP_NORECOVER)) {
8550 if (!service.recovery_is_paused()) {
8551 dout(1) << "pausing recovery (NORECOVER flag set)" << dendl;
8552 service.pause_recovery();
8553 }
8554 } else {
8555 if (service.recovery_is_paused()) {
8556 dout(1) << "unpausing recovery (NORECOVER flag unset)" << dendl;
8557 service.unpause_recovery();
8558 }
8559 }
8560
8561 service.activate_map();
8562
8563 // process waiters
8564 take_waiters(waiting_for_osdmap);
8565 }
8566
8567 bool OSD::require_mon_peer(const Message *m)
8568 {
8569 if (!m->get_connection()->peer_is_mon()) {
8570 dout(0) << "require_mon_peer received from non-mon "
8571 << m->get_connection()->get_peer_addr()
8572 << " " << *m << dendl;
8573 return false;
8574 }
8575 return true;
8576 }
8577
8578 bool OSD::require_mon_or_mgr_peer(const Message *m)
8579 {
8580 if (!m->get_connection()->peer_is_mon() &&
8581 !m->get_connection()->peer_is_mgr()) {
8582 dout(0) << "require_mon_or_mgr_peer received from non-mon, non-mgr "
8583 << m->get_connection()->get_peer_addr()
8584 << " " << *m << dendl;
8585 return false;
8586 }
8587 return true;
8588 }
8589
8590 bool OSD::require_osd_peer(const Message *m)
8591 {
8592 if (!m->get_connection()->peer_is_osd()) {
8593 dout(0) << "require_osd_peer received from non-osd "
8594 << m->get_connection()->get_peer_addr()
8595 << " " << *m << dendl;
8596 return false;
8597 }
8598 return true;
8599 }
8600
8601 bool OSD::require_self_aliveness(const Message *m, epoch_t epoch)
8602 {
8603 epoch_t up_epoch = service.get_up_epoch();
8604 if (epoch < up_epoch) {
8605 dout(7) << "from pre-up epoch " << epoch << " < " << up_epoch << dendl;
8606 return false;
8607 }
8608
8609 if (!is_active()) {
8610 dout(7) << "still in boot state, dropping message " << *m << dendl;
8611 return false;
8612 }
8613
8614 return true;
8615 }
8616
8617 bool OSD::require_same_peer_instance(const Message *m, OSDMapRef& map,
8618 bool is_fast_dispatch)
8619 {
8620 int from = m->get_source().num();
8621
8622 if (map->is_down(from) ||
8623 (map->get_cluster_addr(from) != m->get_source_inst().addr)) {
8624 dout(5) << "from dead osd." << from << ", marking down, "
8625 << " msg was " << m->get_source_inst().addr
8626 << " expected " << (map->is_up(from) ?
8627 map->get_cluster_addr(from) : entity_addr_t())
8628 << dendl;
8629 ConnectionRef con = m->get_connection();
8630 con->mark_down();
8631 Session *s = static_cast<Session*>(con->get_priv());
8632 if (s) {
8633 if (!is_fast_dispatch)
8634 s->session_dispatch_lock.Lock();
8635 clear_session_waiting_on_map(s);
8636 con->set_priv(NULL); // break ref <-> session cycle, if any
8637 if (!is_fast_dispatch)
8638 s->session_dispatch_lock.Unlock();
8639 s->put();
8640 }
8641 return false;
8642 }
8643 return true;
8644 }
8645
8646
8647 /*
8648 * require that we have the same (or a newer) map, and that
8649 * the sender is still a valid peer instance in that map.
8650 */
8651 bool OSD::require_same_or_newer_map(OpRequestRef& op, epoch_t epoch,
8652 bool is_fast_dispatch)
8653 {
8654 const Message *m = op->get_req();
8655 dout(15) << "require_same_or_newer_map " << epoch
8656 << " (i am " << osdmap->get_epoch() << ") " << m << dendl;
8657
8658 assert(osd_lock.is_locked());
8659
8660 // do they have a newer map?
8661 if (epoch > osdmap->get_epoch()) {
8662 dout(7) << "waiting for newer map epoch " << epoch
8663 << " > my " << osdmap->get_epoch() << " with " << m << dendl;
8664 wait_for_new_map(op);
8665 return false;
8666 }
8667
8668 if (!require_self_aliveness(op->get_req(), epoch)) {
8669 return false;
8670 }
8671
8672 // ok, our map is same or newer.. do they still exist?
8673 if (m->get_connection()->get_messenger() == cluster_messenger &&
8674 !require_same_peer_instance(op->get_req(), osdmap, is_fast_dispatch)) {
8675 return false;
8676 }
8677
8678 return true;
8679 }
8680
8681
8682
8683
8684
8685 // ----------------------------------------
8686 // pg creation
8687
8688 void OSD::split_pgs(
8689 PG *parent,
8690 const set<spg_t> &childpgids, set<PGRef> *out_pgs,
8691 OSDMapRef curmap,
8692 OSDMapRef nextmap,
8693 PG::RecoveryCtx *rctx)
8694 {
8695 unsigned pg_num = nextmap->get_pg_num(
8696 parent->pool.id);
8697 parent->update_snap_mapper_bits(
8698 parent->info.pgid.get_split_bits(pg_num)
8699 );
8700
8701 vector<object_stat_sum_t> updated_stats(childpgids.size() + 1);
8702 parent->info.stats.stats.sum.split(updated_stats);
8703
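// split() has distributed the parent's object stats across
// childpgids.size()+1 buckets: one per child (consumed in the loop
// below) and a final bucket that the parent keeps after the loop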
8704 vector<object_stat_sum_t>::iterator stat_iter = updated_stats.begin();
8705 for (set<spg_t>::const_iterator i = childpgids.begin();
8706 i != childpgids.end();
8707 ++i, ++stat_iter) {
8708 assert(stat_iter != updated_stats.end());
8709 dout(10) << "Splitting " << *parent << " into " << *i << dendl;
8710 assert(service.splitting(*i));
8711 PG* child = _make_pg(nextmap, *i);
8712 child->lock(true);
8713 out_pgs->insert(child);
8714 rctx->created_pgs.insert(child);
8715
8716 unsigned split_bits = i->get_split_bits(pg_num);
8717 dout(10) << "pg_num is " << pg_num << dendl;
8718 dout(10) << "m_seed " << i->ps() << dendl;
8719 dout(10) << "split_bits is " << split_bits << dendl;
8720
8721 parent->split_colls(
8722 *i,
8723 split_bits,
8724 i->ps(),
8725 &child->pool.info,
8726 rctx->transaction);
8727 parent->split_into(
8728 i->pgid,
8729 child,
8730 split_bits);
8731 child->info.stats.stats.sum = *stat_iter;
8732
8733 child->write_if_dirty(*(rctx->transaction));
8734 child->unlock();
8735 }
8736 assert(stat_iter != updated_stats.end());
8737 parent->info.stats.stats.sum = *stat_iter;
8738 parent->write_if_dirty(*(rctx->transaction));
8739 }
8740
8741 /*
8742 * holding osd_lock
8743 */
8744 void OSD::handle_pg_create(OpRequestRef op)
8745 {
8746 const MOSDPGCreate *m = static_cast<const MOSDPGCreate*>(op->get_req());
8747 assert(m->get_type() == MSG_OSD_PG_CREATE);
8748
8749 dout(10) << "handle_pg_create " << *m << dendl;
8750
8751 if (!require_mon_peer(op->get_req())) {
8752 return;
8753 }
8754
8755 if (!require_same_or_newer_map(op, m->epoch, false))
8756 return;
8757
8758 op->mark_started();
8759
8760 map<pg_t,utime_t>::const_iterator ci = m->ctimes.begin();
8761 for (map<pg_t,pg_create_t>::const_iterator p = m->mkpg.begin();
8762 p != m->mkpg.end();
8763 ++p, ++ci) {
8764 assert(ci != m->ctimes.end() && ci->first == p->first);
8765 epoch_t created = p->second.created;
8766 if (p->second.split_bits) // Skip split pgs
8767 continue;
8768 pg_t on = p->first;
8769
8770 if (on.preferred() >= 0) {
8771 dout(20) << "ignoring localized pg " << on << dendl;
8772 continue;
8773 }
8774
8775 if (!osdmap->have_pg_pool(on.pool())) {
8776 dout(20) << "ignoring pg on deleted pool " << on << dendl;
8777 continue;
8778 }
8779
8780 dout(20) << "mkpg " << on << " e" << created << "@" << ci->second << dendl;
8781
8782 // is it still ours?
8783 vector<int> up, acting;
8784 int up_primary = -1;
8785 int acting_primary = -1;
8786 osdmap->pg_to_up_acting_osds(on, &up, &up_primary, &acting, &acting_primary);
8787 int role = osdmap->calc_pg_role(whoami, acting, acting.size());
8788
8789 if (acting_primary != whoami) {
8790 dout(10) << "mkpg " << on << " not acting_primary (" << acting_primary
8791 << "), my role=" << role << ", skipping" << dendl;
8792 continue;
8793 }
8794
8795 spg_t pgid;
8796 bool mapped = osdmap->get_primary_shard(on, &pgid);
8797 assert(mapped);
8798
8799 PastIntervals pi(
8800 osdmap->get_pools().at(pgid.pool()).ec_pool(),
8801 *osdmap);
8802 pg_history_t history;
8803 build_initial_pg_history(pgid, created, ci->second, &history, &pi);
8804
8805 // The mon won't resend unless the primary changed, so
8806 // we ignore same_interval_since. We'll pass this history
8807 // to handle_pg_peering_evt with the current epoch as the
8808 // event -- the project_pg_history check in
8809 // handle_pg_peering_evt will be a noop.
8810 if (history.same_primary_since > m->epoch) {
8811 dout(10) << __func__ << ": got obsolete pg create on pgid "
8812 << pgid << " from epoch " << m->epoch
8813 << ", primary changed in " << history.same_primary_since
8814 << dendl;
8815 continue;
8816 }
8817 if (handle_pg_peering_evt(
8818 pgid,
8819 history,
8820 pi,
8821 osdmap->get_epoch(),
8822 PG::CephPeeringEvtRef(
8823 new PG::CephPeeringEvt(
8824 osdmap->get_epoch(),
8825 osdmap->get_epoch(),
8826 PG::NullEvt()))
8827 ) == -EEXIST) {
8828 service.send_pg_created(pgid.pgid);
8829 }
8830 }
8831
8832 {
8833 lock_guard<mutex> pending_creates_locker{pending_creates_lock};
8834 if (pending_creates_from_mon == 0) {
8835 last_pg_create_epoch = m->epoch;
8836 }
8837 }
8838 maybe_update_heartbeat_peers();
8839 }
8840
8841
8842 // ----------------------------------------
8843 // peering and recovery
8844
8845 PG::RecoveryCtx OSD::create_context()
8846 {
8847 ObjectStore::Transaction *t = new ObjectStore::Transaction;
8848 C_Contexts *on_applied = new C_Contexts(cct);
8849 C_Contexts *on_safe = new C_Contexts(cct);
8850 map<int, map<spg_t,pg_query_t> > *query_map =
8851 new map<int, map<spg_t, pg_query_t> >;
8852 map<int,vector<pair<pg_notify_t, PastIntervals> > > *notify_list =
8853 new map<int, vector<pair<pg_notify_t, PastIntervals> > >;
8854 map<int,vector<pair<pg_notify_t, PastIntervals> > > *info_map =
8855 new map<int,vector<pair<pg_notify_t, PastIntervals> > >;
8856 PG::RecoveryCtx rctx(query_map, info_map, notify_list,
8857 on_applied, on_safe, t);
8858 return rctx;
8859 }
8860
8861 struct C_OpenPGs : public Context {
8862 set<PGRef> pgs;
8863 ObjectStore *store;
8864 OSD *osd;
8865 C_OpenPGs(set<PGRef>& p, ObjectStore *s, OSD* o) : store(s), osd(o) {
8866 pgs.swap(p);
8867 }
8868 void finish(int r) override {
8869 RWLock::RLocker l(osd->pg_map_lock);
8870 for (auto p : pgs) {
8871 if (osd->pg_map.count(p->info.pgid)) {
8872 p->ch = store->open_collection(p->coll);
8873 assert(p->ch);
8874 }
8875 }
8876 }
8877 };
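
// C_OpenPGs plugs into the completion-list idiom used throughout this
// file: contexts accumulate on on_applied/on_safe and all fire once the
// transaction reaches the corresponding state.  A stripped-down sketch
// (ExampleCtx and ExampleCtxList are hypothetical stand-ins for
// Context and C_Contexts):
struct ExampleCtx {
  virtual void finish(int r) = 0;
  virtual ~ExampleCtx() {}
};
struct ExampleCtxList : public ExampleCtx {
  list<ExampleCtx*> subs;
  void add(ExampleCtx *c) { subs.push_back(c); }
  void finish(int r) override {
    for (auto c : subs) {  // fire, then free, everything queued so far
      c->finish(r);
      delete c;
    }
    subs.clear();
  }
};
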
8878
8879 void OSD::dispatch_context_transaction(PG::RecoveryCtx &ctx, PG *pg,
8880 ThreadPool::TPHandle *handle)
8881 {
8882 if (!ctx.transaction->empty()) {
8883 if (!ctx.created_pgs.empty()) {
8884 ctx.on_applied->add(new C_OpenPGs(ctx.created_pgs, store, this));
8885 }
8886 int tr = store->queue_transaction(
8887 pg->osr.get(),
8888 std::move(*ctx.transaction), ctx.on_applied, ctx.on_safe, NULL,
8889 TrackedOpRef(), handle);
8890 delete ctx.transaction;
8891 assert(tr == 0);
8892 ctx.transaction = new ObjectStore::Transaction;
8893 ctx.on_applied = new C_Contexts(cct);
8894 ctx.on_safe = new C_Contexts(cct);
8895 }
8896 }
8897
8898 void OSD::dispatch_context(PG::RecoveryCtx &ctx, PG *pg, OSDMapRef curmap,
8899 ThreadPool::TPHandle *handle)
8900 {
8901 if (service.get_osdmap()->is_up(whoami) &&
8902 is_active()) {
8903 do_notifies(*ctx.notify_list, curmap);
8904 do_queries(*ctx.query_map, curmap);
8905 do_infos(*ctx.info_map, curmap);
8906 }
8907 delete ctx.notify_list;
8908 delete ctx.query_map;
8909 delete ctx.info_map;
8910 if ((ctx.on_applied->empty() &&
8911 ctx.on_safe->empty() &&
8912 ctx.transaction->empty() &&
8913 ctx.created_pgs.empty()) || !pg) {
8914 delete ctx.transaction;
8915 delete ctx.on_applied;
8916 delete ctx.on_safe;
8917 assert(ctx.created_pgs.empty());
8918 } else {
8919 if (!ctx.created_pgs.empty()) {
8920 ctx.on_applied->add(new C_OpenPGs(ctx.created_pgs, store, this));
8921 }
8922 int tr = store->queue_transaction(
8923 pg->osr.get(),
8924 std::move(*ctx.transaction), ctx.on_applied, ctx.on_safe, NULL, TrackedOpRef(),
8925 handle);
8926 delete (ctx.transaction);
8927 assert(tr == 0);
8928 }
8929 }
8930
8931 /** do_notifies
8932 * Send an MOSDPGNotify to a peer, listing the PGs for which I have
8933 * content and for which that peer is the primary.
8934 */
8935
8936 void OSD::do_notifies(
8937 map<int,vector<pair<pg_notify_t,PastIntervals> > >& notify_list,
8938 OSDMapRef curmap)
8939 {
8940 for (map<int,
8941 vector<pair<pg_notify_t,PastIntervals> > >::iterator it =
8942 notify_list.begin();
8943 it != notify_list.end();
8944 ++it) {
8945 if (!curmap->is_up(it->first)) {
8946 dout(20) << __func__ << " skipping down osd." << it->first << dendl;
8947 continue;
8948 }
8949 ConnectionRef con = service.get_con_osd_cluster(
8950 it->first, curmap->get_epoch());
8951 if (!con) {
8952 dout(20) << __func__ << " skipping osd." << it->first
8953 << " (NULL con)" << dendl;
8954 continue;
8955 }
8956 service.share_map_peer(it->first, con.get(), curmap);
8957 dout(7) << __func__ << " osd." << it->first
8958 << " on " << it->second.size() << " PGs" << dendl;
8959 MOSDPGNotify *m = new MOSDPGNotify(curmap->get_epoch(),
8960 it->second);
8961 con->send_message(m);
8962 }
8963 }
8964
8965
8966 /** do_queries
8967 * send out pending queries for info | summaries
8968 */
8969 void OSD::do_queries(map<int, map<spg_t,pg_query_t> >& query_map,
8970 OSDMapRef curmap)
8971 {
8972 for (map<int, map<spg_t,pg_query_t> >::iterator pit = query_map.begin();
8973 pit != query_map.end();
8974 ++pit) {
8975 if (!curmap->is_up(pit->first)) {
8976 dout(20) << __func__ << " skipping down osd." << pit->first << dendl;
8977 continue;
8978 }
8979 int who = pit->first;
8980 ConnectionRef con = service.get_con_osd_cluster(who, curmap->get_epoch());
8981 if (!con) {
8982 dout(20) << __func__ << " skipping osd." << who
8983 << " (NULL con)" << dendl;
8984 continue;
8985 }
8986 service.share_map_peer(who, con.get(), curmap);
8987 dout(7) << __func__ << " querying osd." << who
8988 << " on " << pit->second.size() << " PGs" << dendl;
8989 MOSDPGQuery *m = new MOSDPGQuery(curmap->get_epoch(), pit->second);
8990 con->send_message(m);
8991 }
8992 }
8993
8994
8995 void OSD::do_infos(map<int,
8996 vector<pair<pg_notify_t, PastIntervals> > >& info_map,
8997 OSDMapRef curmap)
8998 {
8999 for (map<int,
9000 vector<pair<pg_notify_t, PastIntervals> > >::iterator p =
9001 info_map.begin();
9002 p != info_map.end();
9003 ++p) {
9004 if (!curmap->is_up(p->first)) {
9005 dout(20) << __func__ << " skipping down osd." << p->first << dendl;
9006 continue;
9007 }
9008 for (vector<pair<pg_notify_t,PastIntervals> >::iterator i = p->second.begin();
9009 i != p->second.end();
9010 ++i) {
9011 dout(20) << __func__ << " sending info " << i->first.info
9012 << " to shard " << p->first << dendl;
9013 }
9014 ConnectionRef con = service.get_con_osd_cluster(
9015 p->first, curmap->get_epoch());
9016 if (!con) {
9017 dout(20) << __func__ << " skipping osd." << p->first
9018 << " (NULL con)" << dendl;
9019 continue;
9020 }
9021 service.share_map_peer(p->first, con.get(), curmap);
9022 MOSDPGInfo *m = new MOSDPGInfo(curmap->get_epoch());
9023 m->pg_list = p->second;
9024 con->send_message(m);
9025 }
9026 info_map.clear();
9027 }
9028
9029
9030 /** PGNotify
9031 * from non-primary to primary
9032 * includes pg_info_t.
9033 * NOTE: called with opqueue active.
9034 */
9035 void OSD::handle_pg_notify(OpRequestRef op)
9036 {
9037 const MOSDPGNotify *m = static_cast<const MOSDPGNotify*>(op->get_req());
9038 assert(m->get_type() == MSG_OSD_PG_NOTIFY);
9039
9040 dout(7) << "handle_pg_notify from " << m->get_source() << dendl;
9041 int from = m->get_source().num();
9042
9043 if (!require_osd_peer(op->get_req()))
9044 return;
9045
9046 if (!require_same_or_newer_map(op, m->get_epoch(), false))
9047 return;
9048
9049 op->mark_started();
9050
9051 for (auto it = m->get_pg_list().begin();
9052 it != m->get_pg_list().end();
9053 ++it) {
9054 if (it->first.info.pgid.preferred() >= 0) {
9055 dout(20) << "ignoring localized pg " << it->first.info.pgid << dendl;
9056 continue;
9057 }
9058
9059 handle_pg_peering_evt(
9060 spg_t(it->first.info.pgid.pgid, it->first.to),
9061 it->first.info.history, it->second,
9062 it->first.query_epoch,
9063 PG::CephPeeringEvtRef(
9064 new PG::CephPeeringEvt(
9065 it->first.epoch_sent, it->first.query_epoch,
9066 PG::MNotifyRec(pg_shard_t(from, it->first.from), it->first,
9067 op->get_req()->get_connection()->get_features())))
9068 );
9069 }
9070 }
9071
9072 void OSD::handle_pg_log(OpRequestRef op)
9073 {
9074 MOSDPGLog *m = static_cast<MOSDPGLog*>(op->get_nonconst_req());
9075 assert(m->get_type() == MSG_OSD_PG_LOG);
9076 dout(7) << "handle_pg_log " << *m << " from " << m->get_source() << dendl;
9077
9078 if (!require_osd_peer(op->get_req()))
9079 return;
9080
9081 int from = m->get_source().num();
9082 if (!require_same_or_newer_map(op, m->get_epoch(), false))
9083 return;
9084
9085 if (m->info.pgid.preferred() >= 0) {
9086 dout(10) << "ignoring localized pg " << m->info.pgid << dendl;
9087 return;
9088 }
9089
9090 op->mark_started();
9091 handle_pg_peering_evt(
9092 spg_t(m->info.pgid.pgid, m->to),
9093 m->info.history, m->past_intervals, m->get_epoch(),
9094 PG::CephPeeringEvtRef(
9095 new PG::CephPeeringEvt(
9096 m->get_epoch(), m->get_query_epoch(),
9097 PG::MLogRec(pg_shard_t(from, m->from), m)))
9098 );
9099 }
9100
9101 void OSD::handle_pg_info(OpRequestRef op)
9102 {
9103 const MOSDPGInfo *m = static_cast<const MOSDPGInfo *>(op->get_req());
9104 assert(m->get_type() == MSG_OSD_PG_INFO);
9105 dout(7) << "handle_pg_info " << *m << " from " << m->get_source() << dendl;
9106
9107 if (!require_osd_peer(op->get_req()))
9108 return;
9109
9110 int from = m->get_source().num();
9111 if (!require_same_or_newer_map(op, m->get_epoch(), false))
9112 return;
9113
9114 op->mark_started();
9115
9116 for (auto p = m->pg_list.begin();
9117 p != m->pg_list.end();
9118 ++p) {
9119 if (p->first.info.pgid.preferred() >= 0) {
9120 dout(10) << "ignoring localized pg " << p->first.info.pgid << dendl;
9121 continue;
9122 }
9123
9124 handle_pg_peering_evt(
9125 spg_t(p->first.info.pgid.pgid, p->first.to),
9126 p->first.info.history, p->second, p->first.epoch_sent,
9127 PG::CephPeeringEvtRef(
9128 new PG::CephPeeringEvt(
9129 p->first.epoch_sent, p->first.query_epoch,
9130 PG::MInfoRec(
9131 pg_shard_t(
9132 from, p->first.from), p->first.info, p->first.epoch_sent)))
9133 );
9134 }
9135 }
9136
9137 void OSD::handle_pg_trim(OpRequestRef op)
9138 {
9139 const MOSDPGTrim *m = static_cast<const MOSDPGTrim*>(op->get_req());
9140 assert(m->get_type() == MSG_OSD_PG_TRIM);
9141
9142 dout(7) << "handle_pg_trim " << *m << " from " << m->get_source() << dendl;
9143
9144 if (!require_osd_peer(op->get_req()))
9145 return;
9146
9147 int from = m->get_source().num();
9148 if (!require_same_or_newer_map(op, m->epoch, false))
9149 return;
9150
9151 if (m->pgid.preferred() >= 0) {
9152 dout(10) << "ignoring localized pg " << m->pgid << dendl;
9153 return;
9154 }
9155
9156 op->mark_started();
9157
9158 PG *pg = _lookup_lock_pg(m->pgid);
9159 if (!pg) {
9160 dout(10) << " don't have pg " << m->pgid << dendl;
9161 return;
9162 }
9163
9164 if (m->epoch < pg->info.history.same_interval_since) {
9165 dout(10) << *pg << " got old trim to " << m->trim_to << ", ignoring" << dendl;
9166 pg->unlock();
9167 return;
9168 }
9169
9170 if (pg->is_primary()) {
9171 // peer is informing us of their last_complete_ondisk
9172 dout(10) << *pg << " replica osd." << from << " lcod " << m->trim_to << dendl;
9173 pg->peer_last_complete_ondisk[pg_shard_t(from, m->pgid.shard)] =
9174 m->trim_to;
9175 // trim log when the pg is recovered
9176 pg->calc_min_last_complete_ondisk();
9177 } else {
9178 // primary is instructing us to trim
9179 ObjectStore::Transaction t;
9180 pg->pg_log.trim(m->trim_to, pg->info);
9181 pg->dirty_info = true;
9182 pg->write_if_dirty(t);
9183 int tr = store->queue_transaction(pg->osr.get(), std::move(t), NULL);
9184 assert(tr == 0);
9185 }
9186 pg->unlock();
9187 }
9188
9189 void OSD::handle_pg_backfill_reserve(OpRequestRef op)
9190 {
9191 const MBackfillReserve *m = static_cast<const MBackfillReserve*>(op->get_req());
9192 assert(m->get_type() == MSG_OSD_BACKFILL_RESERVE);
9193
9194 if (!require_osd_peer(op->get_req()))
9195 return;
9196 if (!require_same_or_newer_map(op, m->query_epoch, false))
9197 return;
9198
9199 PG::CephPeeringEvtRef evt;
9200 if (m->type == MBackfillReserve::REQUEST) {
9201 evt = PG::CephPeeringEvtRef(
9202 new PG::CephPeeringEvt(
9203 m->query_epoch,
9204 m->query_epoch,
9205 PG::RequestBackfillPrio(m->priority)));
9206 } else if (m->type == MBackfillReserve::GRANT) {
9207 evt = PG::CephPeeringEvtRef(
9208 new PG::CephPeeringEvt(
9209 m->query_epoch,
9210 m->query_epoch,
9211 PG::RemoteBackfillReserved()));
9212 } else if (m->type == MBackfillReserve::REJECT) {
9213 // NOTE: this is replica -> primary "i reject your request"
9214 // and also primary -> replica "cancel my previously-granted request"
9215 evt = PG::CephPeeringEvtRef(
9216 new PG::CephPeeringEvt(
9217 m->query_epoch,
9218 m->query_epoch,
9219 PG::RemoteReservationRejected()));
9220 } else {
9221 ceph_abort();
9222 }
9223
9224 if (service.splitting(m->pgid)) {
9225 peering_wait_for_split[m->pgid].push_back(evt);
9226 return;
9227 }
9228
9229 PG *pg = _lookup_lock_pg(m->pgid);
9230 if (!pg) {
9231 dout(10) << " don't have pg " << m->pgid << dendl;
9232 return;
9233 }
9234
9235 pg->queue_peering_event(evt);
9236 pg->unlock();
9237 }
9238
9239 void OSD::handle_pg_recovery_reserve(OpRequestRef op)
9240 {
9241 const MRecoveryReserve *m = static_cast<const MRecoveryReserve*>(op->get_req());
9242 assert(m->get_type() == MSG_OSD_RECOVERY_RESERVE);
9243
9244 if (!require_osd_peer(op->get_req()))
9245 return;
9246 if (!require_same_or_newer_map(op, m->query_epoch, false))
9247 return;
9248
9249 PG::CephPeeringEvtRef evt;
9250 if (m->type == MRecoveryReserve::REQUEST) {
9251 evt = PG::CephPeeringEvtRef(
9252 new PG::CephPeeringEvt(
9253 m->query_epoch,
9254 m->query_epoch,
9255 PG::RequestRecovery()));
9256 } else if (m->type == MRecoveryReserve::GRANT) {
9257 evt = PG::CephPeeringEvtRef(
9258 new PG::CephPeeringEvt(
9259 m->query_epoch,
9260 m->query_epoch,
9261 PG::RemoteRecoveryReserved()));
9262 } else if (m->type == MRecoveryReserve::RELEASE) {
9263 evt = PG::CephPeeringEvtRef(
9264 new PG::CephPeeringEvt(
9265 m->query_epoch,
9266 m->query_epoch,
9267 PG::RecoveryDone()));
9268 } else {
9269 ceph_abort();
9270 }
9271
9272 if (service.splitting(m->pgid)) {
9273 peering_wait_for_split[m->pgid].push_back(evt);
9274 return;
9275 }
9276
9277 PG *pg = _lookup_lock_pg(m->pgid);
9278 if (!pg) {
9279 dout(10) << " don't have pg " << m->pgid << dendl;
9280 return;
9281 }
9282
9283 pg->queue_peering_event(evt);
9284 pg->unlock();
9285 }
9286
9287 void OSD::handle_force_recovery(Message *m)
9288 {
9289 MOSDForceRecovery *msg = static_cast<MOSDForceRecovery*>(m);
9290 assert(msg->get_type() == MSG_OSD_FORCE_RECOVERY);
9291
9292 vector<PGRef> local_pgs;
9293 local_pgs.reserve(msg->forced_pgs.size());
9294
9295 {
9296 RWLock::RLocker l(pg_map_lock);
9297 for (auto& i : msg->forced_pgs) {
9298 spg_t locpg;
9299 if (osdmap->get_primary_shard(i, &locpg)) {
9300 auto pg_map_entry = pg_map.find(locpg);
9301 if (pg_map_entry != pg_map.end()) {
9302 local_pgs.push_back(pg_map_entry->second);
9303 }
9304 }
9305 }
9306 }
9307
9308 if (!local_pgs.empty()) {
9309 service.adjust_pg_priorities(local_pgs, msg->options);
9310 }
9311
9312 msg->put();
9313 }
9314
9315 /** PGQuery
9316 * from primary to replica | stray
9317 * NOTE: called with opqueue active.
9318 */
9319 void OSD::handle_pg_query(OpRequestRef op)
9320 {
9321 assert(osd_lock.is_locked());
9322
9323 const MOSDPGQuery *m = static_cast<const MOSDPGQuery*>(op->get_req());
9324 assert(m->get_type() == MSG_OSD_PG_QUERY);
9325
9326 if (!require_osd_peer(op->get_req()))
9327 return;
9328
9329 dout(7) << "handle_pg_query from " << m->get_source() << " epoch " << m->get_epoch() << dendl;
9330 int from = m->get_source().num();
9331
9332 if (!require_same_or_newer_map(op, m->get_epoch(), false))
9333 return;
9334
9335 op->mark_started();
9336
9337 map< int, vector<pair<pg_notify_t, PastIntervals> > > notify_list;
9338
9339 for (auto it = m->pg_list.begin();
9340 it != m->pg_list.end();
9341 ++it) {
9342 spg_t pgid = it->first;
9343
9344 if (pgid.preferred() >= 0) {
9345 dout(10) << "ignoring localized pg " << pgid << dendl;
9346 continue;
9347 }
9348
9349 if (service.splitting(pgid)) {
9350 peering_wait_for_split[pgid].push_back(
9351 PG::CephPeeringEvtRef(
9352 new PG::CephPeeringEvt(
9353 it->second.epoch_sent, it->second.epoch_sent,
9354 PG::MQuery(pg_shard_t(from, it->second.from),
9355 it->second, it->second.epoch_sent))));
9356 continue;
9357 }
9358
9359 {
9360 RWLock::RLocker l(pg_map_lock);
9361 if (pg_map.count(pgid)) {
9362 PG *pg = 0;
9363 pg = _lookup_lock_pg_with_map_lock_held(pgid);
9364 pg->queue_query(
9365 it->second.epoch_sent, it->second.epoch_sent,
9366 pg_shard_t(from, it->second.from), it->second);
9367 pg->unlock();
9368 continue;
9369 }
9370 }
9371
9372 if (!osdmap->have_pg_pool(pgid.pool()))
9373 continue;
9374
9375 // get active crush mapping
9376 int up_primary, acting_primary;
9377 vector<int> up, acting;
9378 osdmap->pg_to_up_acting_osds(
9379 pgid.pgid, &up, &up_primary, &acting, &acting_primary);
9380
9381 // same primary?
9382 pg_history_t history = it->second.history;
9383 bool valid_history = project_pg_history(
9384 pgid, history, it->second.epoch_sent,
9385 up, up_primary, acting, acting_primary);
9386
9387 if (!valid_history ||
9388 it->second.epoch_sent < history.same_interval_since) {
9389 dout(10) << " pg " << pgid << " dne, and pg has changed in "
9390 << history.same_interval_since
9391 << " (msg from " << it->second.epoch_sent << ")" << dendl;
9392 continue;
9393 }
9394
9395 dout(10) << " pg " << pgid << " dne" << dendl;
9396 pg_info_t empty(spg_t(pgid.pgid, it->second.to));
9397 /* This is racy, but that should be ok: if we complete the deletion
9398 * before the pg is recreated, we'll just start it off backfilling
9399 * instead of just empty */
9400 if (service.deleting_pgs.lookup(pgid))
9401 empty.set_last_backfill(hobject_t());
9402 if (it->second.type == pg_query_t::LOG ||
9403 it->second.type == pg_query_t::FULLLOG) {
9404 ConnectionRef con = service.get_con_osd_cluster(from, osdmap->get_epoch());
9405 if (con) {
9406 MOSDPGLog *mlog = new MOSDPGLog(
9407 it->second.from, it->second.to,
9408 osdmap->get_epoch(), empty,
9409 it->second.epoch_sent);
9410 service.share_map_peer(from, con.get(), osdmap);
9411 con->send_message(mlog);
9412 }
9413 } else {
9414 notify_list[from].push_back(
9415 make_pair(
9416 pg_notify_t(
9417 it->second.from, it->second.to,
9418 it->second.epoch_sent,
9419 osdmap->get_epoch(),
9420 empty),
9421 PastIntervals(
9422 osdmap->get_pools().at(pgid.pool()).ec_pool(),
9423 *osdmap)));
9424 }
9425 }
9426 do_notifies(notify_list, osdmap);
9427 }
9428
9429
9430 void OSD::handle_pg_remove(OpRequestRef op)
9431 {
9432 const MOSDPGRemove *m = static_cast<const MOSDPGRemove *>(op->get_req());
9433 assert(m->get_type() == MSG_OSD_PG_REMOVE);
9434 assert(osd_lock.is_locked());
9435
9436 if (!require_osd_peer(op->get_req()))
9437 return;
9438
9439 dout(7) << "handle_pg_remove from " << m->get_source() << " on "
9440 << m->pg_list.size() << " pgs" << dendl;
9441
9442 if (!require_same_or_newer_map(op, m->get_epoch(), false))
9443 return;
9444
9445 op->mark_started();
9446
9447 for (auto it = m->pg_list.begin();
9448 it != m->pg_list.end();
9449 ++it) {
9450 spg_t pgid = *it;
9451 if (pgid.preferred() >= 0) {
9452 dout(10) << "ignoring localized pg " << pgid << dendl;
9453 continue;
9454 }
9455
9456 RWLock::WLocker l(pg_map_lock);
9457 if (pg_map.count(pgid) == 0) {
9458 dout(10) << " don't have pg " << pgid << dendl;
9459 continue;
9460 }
9461 dout(5) << "queue_pg_for_deletion: " << pgid << dendl;
9462 PG *pg = _lookup_lock_pg_with_map_lock_held(pgid);
9463 pg_history_t history = pg->info.history;
9464 int up_primary, acting_primary;
9465 vector<int> up, acting;
9466 osdmap->pg_to_up_acting_osds(
9467 pgid.pgid, &up, &up_primary, &acting, &acting_primary);
9468 bool valid_history = project_pg_history(
9469 pg->info.pgid, history, pg->get_osdmap()->get_epoch(),
9470 up, up_primary, acting, acting_primary);
9471 if (valid_history &&
9472 history.same_interval_since <= m->get_epoch()) {
9473 assert(pg->get_primary().osd == m->get_source().num());
9474 PGRef _pg(pg);
9475 _remove_pg(pg);
9476 pg->unlock();
9477 } else {
9478 dout(10) << *pg << " ignoring remove request, pg changed in epoch "
9479 << history.same_interval_since
9480 << " > " << m->get_epoch() << dendl;
9481 pg->unlock();
9482 }
9483 }
9484 }
9485
9486 void OSD::_remove_pg(PG *pg)
9487 {
9488 ObjectStore::Transaction rmt;
9489
9490 // on_removal, which calls remove_watchers_and_notifies, and the erasure from
9491 // the pg_map must be done together without unlocking the pg lock,
9492 // to avoid racing with watcher cleanup in ms_handle_reset
9493 // and handle_notify_timeout
9494 pg->on_removal(&rmt);
9495
9496 service.cancel_pending_splits_for_parent(pg->info.pgid);
9497 int tr = store->queue_transaction(
9498 pg->osr.get(), std::move(rmt), NULL,
9499 new ContainerContext<
9500 SequencerRef>(pg->osr));
9501 assert(tr == 0);
9502
9503 DeletingStateRef deleting = service.deleting_pgs.lookup_or_create(
9504 pg->info.pgid,
9505 make_pair(
9506 pg->info.pgid,
9507 PGRef(pg))
9508 );
9509 remove_wq.queue(make_pair(PGRef(pg), deleting));
9510
9511 service.pg_remove_epoch(pg->info.pgid);
9512
9513 // dereference from op_wq
9514 op_shardedwq.clear_pg_pointer(pg->info.pgid);
9515
9516 // remove from map
9517 pg_map.erase(pg->info.pgid);
9518 pg->put("PGMap"); // since we've taken it out of map
9519 }
9520
9521 // =========================================================
9522 // RECOVERY
9523
9524 void OSDService::_maybe_queue_recovery() {
9525 assert(recovery_lock.is_locked_by_me());
9526 uint64_t available_pushes;
9527 while (!awaiting_throttle.empty() &&
9528 _recover_now(&available_pushes)) {
9529 uint64_t to_start = MIN(
9530 available_pushes,
9531 cct->_conf->osd_recovery_max_single_start);
9532 _queue_for_recovery(awaiting_throttle.front(), to_start);
9533 awaiting_throttle.pop_front();
9534 recovery_ops_reserved += to_start;
9535 }
9536 }
9537
9538 bool OSDService::_recover_now(uint64_t *available_pushes)
9539 {
9540 if (available_pushes)
9541 *available_pushes = 0;
9542
9543 if (ceph_clock_now() < defer_recovery_until) {
9544 dout(15) << __func__ << " defer until " << defer_recovery_until << dendl;
9545 return false;
9546 }
9547
9548 if (recovery_paused) {
9549 dout(15) << __func__ << " paused" << dendl;
9550 return false;
9551 }
9552
9553 uint64_t max = cct->_conf->osd_recovery_max_active;
9554 if (max <= recovery_ops_active + recovery_ops_reserved) {
9555 dout(15) << __func__ << " active " << recovery_ops_active
9556 << " + reserved " << recovery_ops_reserved
9557 << " >= max " << max << dendl;
9558 return false;
9559 }
9560
9561 if (available_pushes)
9562 *available_pushes = max - recovery_ops_active - recovery_ops_reserved;
9563
9564 return true;
9565 }
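
// A worked example of the throttle implemented by the two functions
// above (the numbers are hypothetical): with osd_recovery_max_active = 3,
// recovery_ops_active = 1 and recovery_ops_reserved = 1, _recover_now()
// reports one available push, and _maybe_queue_recovery() then starts
// MIN(available_pushes, osd_recovery_max_single_start) ops for the next
// queued PG.  The capacity computation, in isolation:
static inline uint64_t example_available_pushes(uint64_t max_active,
                                                uint64_t active,
                                                uint64_t reserved)
{
  return (max_active <= active + reserved) ? 0
                                           : max_active - active - reserved;
}
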
9566
9567
9568 void OSDService::adjust_pg_priorities(const vector<PGRef>& pgs, int newflags)
9569 {
9570 if (pgs.empty() || !(newflags & (OFR_BACKFILL | OFR_RECOVERY)))
9571 return;
9572 int newstate = 0;
9573
9574 if (newflags & OFR_BACKFILL) {
9575 newstate = PG_STATE_FORCED_BACKFILL;
9576 } else if (newflags & OFR_RECOVERY) {
9577 newstate = PG_STATE_FORCED_RECOVERY;
9578 }
9579
9580 // the debug output here may get large; generate it only when the osd debug
9581 // level is at least 10, and then print just the abbreviated pg ids
9582 if ((cct)->_conf->subsys.should_gather(ceph_subsys_osd, 10)) {
9583 stringstream ss;
9584
9585 for (auto& i : pgs) {
9586 ss << i->get_pgid() << " ";
9587 }
9588
9589 dout(10) << __func__ << " working on " << ss.str() << dendl;
9590 }
9591
9592 if (newflags & OFR_CANCEL) {
9593 for (auto& i : pgs) {
9594 i->lock();
9595 i->_change_recovery_force_mode(newstate, true);
9596 i->unlock();
9597 }
9598 } else {
9599 for (auto& i : pgs) {
9600 // make sure the PG is in the correct state before forcing backfill or
9601 // recovery; otherwise the PG would keep its FORCE_* flag forever, and
9602 // clearing it would need an OSD restart or another forced recovery/backfill
9603 i->lock();
9604 int pgstate = i->get_state();
9605 if ( ((newstate == PG_STATE_FORCED_RECOVERY) && (pgstate & (PG_STATE_DEGRADED | PG_STATE_RECOVERY_WAIT | PG_STATE_RECOVERING))) ||
9606 ((newstate == PG_STATE_FORCED_BACKFILL) && (pgstate & (PG_STATE_DEGRADED | PG_STATE_BACKFILL_WAIT | PG_STATE_BACKFILLING))) )
9607 i->_change_recovery_force_mode(newstate, false);
9608 i->unlock();
9609 }
9610 }
9611 }
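
// An example of the gating above (the state word is hypothetical): a PG
// whose state is PG_STATE_ACTIVE | PG_STATE_CLEAN matches neither mask,
// so a force request leaves it untouched rather than planting a
// FORCED_* flag that nothing would ever clear.
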
9612
9613 void OSD::do_recovery(
9614 PG *pg, epoch_t queued, uint64_t reserved_pushes,
9615 ThreadPool::TPHandle &handle)
9616 {
9617 uint64_t started = 0;
9618
9619 /*
9620 * When the value of osd_recovery_sleep is set greater than zero, recovery
9621 * ops are scheduled after osd_recovery_sleep amount of time from the previous
9622 * recovery event's schedule time. This is done by adding a
9623 * recovery_requeue_callback event, which re-queues the recovery op using
9624 * queue_recovery_after_sleep.
9625 */
9626 float recovery_sleep = get_osd_recovery_sleep();
9627 {
9628 Mutex::Locker l(service.recovery_sleep_lock);
9629 if (recovery_sleep > 0 && service.recovery_needs_sleep) {
9630 PGRef pgref(pg);
9631 auto recovery_requeue_callback = new FunctionContext([this, pgref, queued, reserved_pushes](int r) {
9632 dout(20) << "do_recovery wake up at "
9633 << ceph_clock_now()
9634 << ", re-queuing recovery" << dendl;
9635 Mutex::Locker l(service.recovery_sleep_lock);
9636 service.recovery_needs_sleep = false;
9637 service.queue_recovery_after_sleep(pgref.get(), queued, reserved_pushes);
9638 });
9639
9640 // This is true for the first recovery op and when the previous recovery op
9641 // was scheduled in the past; in that case the next recovery op is
9642 // scheduled one sleep interval from now.
9643 if (service.recovery_schedule_time < ceph_clock_now()) {
9644 service.recovery_schedule_time = ceph_clock_now();
9645 }
9646 service.recovery_schedule_time += recovery_sleep;
9647 service.recovery_sleep_timer.add_event_at(service.recovery_schedule_time,
9648 recovery_requeue_callback);
9649 dout(20) << "Recovery event scheduled at "
9650 << service.recovery_schedule_time << dendl;
9651 return;
9652 }
9653 }
9654
9655 {
9656 {
9657 Mutex::Locker l(service.recovery_sleep_lock);
9658 service.recovery_needs_sleep = true;
9659 }
9660
9661 if (pg->pg_has_reset_since(queued)) {
9662 goto out;
9663 }
9664
9665 assert(!pg->deleting);
9666 assert(pg->is_peered() && pg->is_primary());
9667
9668 assert(pg->recovery_queued);
9669 pg->recovery_queued = false;
9670
9671 dout(10) << "do_recovery starting " << reserved_pushes << " " << *pg << dendl;
9672 #ifdef DEBUG_RECOVERY_OIDS
9673 dout(20) << " active was " << service.recovery_oids[pg->info.pgid] << dendl;
9674 #endif
9675
9676 bool more = pg->start_recovery_ops(reserved_pushes, handle, &started);
9677 dout(10) << "do_recovery started " << started << "/" << reserved_pushes
9678 << " on " << *pg << dendl;
9679
9680 // If no recovery op is started, don't bother to manipulate the RecoveryCtx
9681 if (!started && (more || !pg->have_unfound())) {
9682 goto out;
9683 }
9684
9685 PG::RecoveryCtx rctx = create_context();
9686 rctx.handle = &handle;
9687
9688 /*
9689 * if we couldn't start any recovery ops and things are still
9690 * unfound, see if we can discover more missing object locations.
9691 * It may be that our initial locations were bad and we errored
9692 * out while trying to pull.
9693 */
9694 if (!more && pg->have_unfound()) {
9695 pg->discover_all_missing(*rctx.query_map);
9696 if (rctx.query_map->empty()) {
9697 string action;
9698 if (pg->state_test(PG_STATE_BACKFILLING)) {
9699 auto evt = PG::CephPeeringEvtRef(new PG::CephPeeringEvt(
9700 queued,
9701 queued,
9702 PG::DeferBackfill(cct->_conf->osd_recovery_retry_interval)));
9703 pg->queue_peering_event(evt);
9704 action = "in backfill";
9705 } else if (pg->state_test(PG_STATE_RECOVERING)) {
9706 auto evt = PG::CephPeeringEvtRef(new PG::CephPeeringEvt(
9707 queued,
9708 queued,
9709 PG::DeferRecovery(cct->_conf->osd_recovery_retry_interval)));
9710 pg->queue_peering_event(evt);
9711 action = "in recovery";
9712 } else {
9713 action = "already out of recovery/backfill";
9714 }
9715 dout(10) << __func__ << ": no luck, giving up on this pg for now (" << action << ")" << dendl;
9716 } else {
9717 dout(10) << __func__ << ": no luck, giving up on this pg for now (queue_recovery)" << dendl;
9718 pg->queue_recovery();
9719 }
9720 }
9721
9722 pg->write_if_dirty(*rctx.transaction);
9723 OSDMapRef curmap = pg->get_osdmap();
9724 dispatch_context(rctx, pg, curmap);
9725 }
9726
9727 out:
9728 assert(started <= reserved_pushes);
9729 service.release_reserved_pushes(reserved_pushes);
9730 }
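
// The scheduling rule in do_recovery() above, in isolation, with plain
// doubles standing in for utime_t (example_next_recovery_time is a
// hypothetical helper): each op is scheduled osd_recovery_sleep after
// the previous *scheduled* time, clamped so we never schedule in the
// past.
static inline double example_next_recovery_time(double now,
                                                double last_scheduled,
                                                double sleep)
{
  double base = (last_scheduled < now) ? now : last_scheduled;
  return base + sleep;
}
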
9731
9732 void OSDService::start_recovery_op(PG *pg, const hobject_t& soid)
9733 {
9734 Mutex::Locker l(recovery_lock);
9735 dout(10) << "start_recovery_op " << *pg << " " << soid
9736 << " (" << recovery_ops_active << "/"
9737 << cct->_conf->osd_recovery_max_active << " rops)"
9738 << dendl;
9739 recovery_ops_active++;
9740
9741 #ifdef DEBUG_RECOVERY_OIDS
9742 dout(20) << " active was " << recovery_oids[pg->info.pgid] << dendl;
9743 assert(recovery_oids[pg->info.pgid].count(soid) == 0);
9744 recovery_oids[pg->info.pgid].insert(soid);
9745 #endif
9746 }
9747
9748 void OSDService::finish_recovery_op(PG *pg, const hobject_t& soid, bool dequeue)
9749 {
9750 Mutex::Locker l(recovery_lock);
9751 dout(10) << "finish_recovery_op " << *pg << " " << soid
9752 << " dequeue=" << dequeue
9753 << " (" << recovery_ops_active << "/" << cct->_conf->osd_recovery_max_active << " rops)"
9754 << dendl;
9755
9756 // adjust count
9757 assert(recovery_ops_active > 0);
9758 recovery_ops_active--;
9759
9760 #ifdef DEBUG_RECOVERY_OIDS
9761 dout(20) << " active oids was " << recovery_oids[pg->info.pgid] << dendl;
9762 assert(recovery_oids[pg->info.pgid].count(soid));
9763 recovery_oids[pg->info.pgid].erase(soid);
9764 #endif
9765
9766 _maybe_queue_recovery();
9767 }
9768
9769 bool OSDService::is_recovery_active()
9770 {
9771 return local_reserver.has_reservation() || remote_reserver.has_reservation();
9772 }
9773
9774 // =========================================================
9775 // OPS
9776
9777 bool OSD::op_is_discardable(const MOSDOp *op)
9778 {
9779 // drop client request if they are not connected and can't get the
9780 // reply anyway.
9781 if (!op->get_connection()->is_connected()) {
9782 return true;
9783 }
9784 return false;
9785 }
9786
9787 void OSD::enqueue_op(spg_t pg, OpRequestRef& op, epoch_t epoch)
9788 {
9789 utime_t latency = ceph_clock_now() - op->get_req()->get_recv_stamp();
9790 dout(15) << "enqueue_op " << op << " prio " << op->get_req()->get_priority()
9791 << " cost " << op->get_req()->get_cost()
9792 << " latency " << latency
9793 << " epoch " << epoch
9794 << " " << *(op->get_req()) << dendl;
9795 op->osd_trace.event("enqueue op");
9796 op->osd_trace.keyval("priority", op->get_req()->get_priority());
9797 op->osd_trace.keyval("cost", op->get_req()->get_cost());
9798 op->mark_queued_for_pg();
9799 logger->tinc(l_osd_op_before_queue_op_lat, latency);
9800 op_shardedwq.queue(make_pair(pg, PGQueueable(op, epoch)));
9801 }
9802
9803
9804
9805 /*
9806 * NOTE: dequeue called in worker thread, with pg lock
9807 */
9808 void OSD::dequeue_op(
9809 PGRef pg, OpRequestRef op,
9810 ThreadPool::TPHandle &handle)
9811 {
9812 FUNCTRACE();
9813 OID_EVENT_TRACE_WITH_MSG(op->get_req(), "DEQUEUE_OP_BEGIN", false);
9814
9815 utime_t now = ceph_clock_now();
9816 op->set_dequeued_time(now);
9817 utime_t latency = now - op->get_req()->get_recv_stamp();
9818 dout(10) << "dequeue_op " << op << " prio " << op->get_req()->get_priority()
9819 << " cost " << op->get_req()->get_cost()
9820 << " latency " << latency
9821 << " " << *(op->get_req())
9822 << " pg " << *pg << dendl;
9823
9824 logger->tinc(l_osd_op_before_dequeue_op_lat, latency);
9825
9826 Session *session = static_cast<Session *>(
9827 op->get_req()->get_connection()->get_priv());
9828 if (session) {
9829 maybe_share_map(session, op, pg->get_osdmap());
9830 session->put();
9831 }
9832
9833 if (pg->deleting)
9834 return;
9835
9836 op->mark_reached_pg();
9837 op->osd_trace.event("dequeue_op");
9838
9839 pg->do_request(op, handle);
9840
9841 // finish
9842 dout(10) << "dequeue_op " << op << " finish" << dendl;
9843 OID_EVENT_TRACE_WITH_MSG(op->get_req(), "DEQUEUE_OP_END", false);
9844 }
9845
9846
9847 struct C_CompleteSplits : public Context {
9848 OSD *osd;
9849 set<PGRef> pgs;
9850 C_CompleteSplits(OSD *osd, const set<PGRef> &in)
9851 : osd(osd), pgs(in) {}
9852 void finish(int r) override {
9853 Mutex::Locker l(osd->osd_lock);
9854 if (osd->is_stopping())
9855 return;
9856 PG::RecoveryCtx rctx = osd->create_context();
9857 for (set<PGRef>::iterator i = pgs.begin();
9858 i != pgs.end();
9859 ++i) {
9860 osd->pg_map_lock.get_write();
9861 (*i)->lock();
9862 PG *pg = i->get();
9863 osd->add_newly_split_pg(pg, &rctx);
9864 if (!((*i)->deleting)) {
9865 set<spg_t> to_complete;
9866 to_complete.insert((*i)->info.pgid);
9867 osd->service.complete_split(to_complete);
9868 }
9869 osd->pg_map_lock.put_write();
9870 osd->dispatch_context_transaction(rctx, pg);
9871 osd->wake_pg_waiters(*i);
9872 (*i)->unlock();
9873 }
9874
9875 osd->dispatch_context(rctx, 0, osd->service.get_osdmap());
9876 }
9877 };
9878
9879 void OSD::process_peering_events(
9880 const list<PG*> &pgs,
9881 ThreadPool::TPHandle &handle
9882 )
9883 {
9884 bool need_up_thru = false;
9885 epoch_t same_interval_since = 0;
9886 OSDMapRef curmap;
9887 PG::RecoveryCtx rctx = create_context();
9888 rctx.handle = &handle;
9889 for (list<PG*>::const_iterator i = pgs.begin();
9890 i != pgs.end();
9891 ++i) {
9892 set<PGRef> split_pgs;
9893 PG *pg = *i;
9894 pg->lock_suspend_timeout(handle);
9895 curmap = service.get_osdmap();
9896 if (pg->deleting) {
9897 pg->unlock();
9898 continue;
9899 }
9900 if (!advance_pg(curmap->get_epoch(), pg, handle, &rctx, &split_pgs)) {
9901 // we need to requeue the PG explicitly since we didn't actually
9902 // handle an event
9903 peering_wq.queue(pg);
9904 } else {
9905 assert(!pg->peering_queue.empty());
9906 PG::CephPeeringEvtRef evt = pg->peering_queue.front();
9907 pg->peering_queue.pop_front();
9908 pg->handle_peering_event(evt, &rctx);
9909 }
9910 need_up_thru = pg->need_up_thru || need_up_thru;
9911 same_interval_since = MAX(pg->info.history.same_interval_since,
9912 same_interval_since);
9913 pg->write_if_dirty(*rctx.transaction);
9914 if (!split_pgs.empty()) {
9915 rctx.on_applied->add(new C_CompleteSplits(this, split_pgs));
9916 split_pgs.clear();
9917 }
9918 dispatch_context_transaction(rctx, pg, &handle);
9919 pg->unlock();
9920 }
9921 if (need_up_thru)
9922 queue_want_up_thru(same_interval_since);
9923 dispatch_context(rctx, 0, curmap, &handle);
9924
9925 service.send_pg_temp();
9926 }
9927
9928 // --------------------------------
9929
9930 const char** OSD::get_tracked_conf_keys() const
9931 {
9932 static const char* KEYS[] = {
9933 "osd_max_backfills",
9934 "osd_min_recovery_priority",
9935 "osd_max_trimming_pgs",
9936 "osd_op_complaint_time",
9937 "osd_op_log_threshold",
9938 "osd_op_history_size",
9939 "osd_op_history_duration",
9940 "osd_op_history_slow_op_size",
9941 "osd_op_history_slow_op_threshold",
9942 "osd_enable_op_tracker",
9943 "osd_map_cache_size",
9944 "osd_map_max_advance",
9945 "osd_pg_epoch_persisted_max_stale",
9946 "osd_disk_thread_ioprio_class",
9947 "osd_disk_thread_ioprio_priority",
9948 // clog & admin clog
9949 "clog_to_monitors",
9950 "clog_to_syslog",
9951 "clog_to_syslog_facility",
9952 "clog_to_syslog_level",
9953 "osd_objectstore_fuse",
9954 "clog_to_graylog",
9955 "clog_to_graylog_host",
9956 "clog_to_graylog_port",
9957 "host",
9958 "fsid",
9959 "osd_recovery_delay_start",
9960 "osd_client_message_size_cap",
9961 "osd_client_message_cap",
9962 "osd_heartbeat_min_size",
9963 "osd_heartbeat_interval",
9964 NULL
9965 };
9966 return KEYS;
9967 }
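
// The array above relies on its NULL sentinel; consumers walk it until
// they hit it, e.g. (example_count_keys is a hypothetical helper, not
// part of the config framework):
static inline size_t example_count_keys(const char **keys)
{
  size_t n = 0;
  while (keys[n])
    ++n;
  return n;
}
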
9968
9969 void OSD::handle_conf_change(const struct md_config_t *conf,
9970 const std::set <std::string> &changed)
9971 {
9972 if (changed.count("osd_max_backfills")) {
9973 service.local_reserver.set_max(cct->_conf->osd_max_backfills);
9974 service.remote_reserver.set_max(cct->_conf->osd_max_backfills);
9975 }
9976 if (changed.count("osd_min_recovery_priority")) {
9977 service.local_reserver.set_min_priority(cct->_conf->osd_min_recovery_priority);
9978 service.remote_reserver.set_min_priority(cct->_conf->osd_min_recovery_priority);
9979 }
9980 if (changed.count("osd_max_trimming_pgs")) {
9981 service.snap_reserver.set_max(cct->_conf->osd_max_trimming_pgs);
9982 }
9983 if (changed.count("osd_op_complaint_time") ||
9984 changed.count("osd_op_log_threshold")) {
9985 op_tracker.set_complaint_and_threshold(cct->_conf->osd_op_complaint_time,
9986 cct->_conf->osd_op_log_threshold);
9987 }
9988 if (changed.count("osd_op_history_size") ||
9989 changed.count("osd_op_history_duration")) {
9990 op_tracker.set_history_size_and_duration(cct->_conf->osd_op_history_size,
9991 cct->_conf->osd_op_history_duration);
9992 }
9993 if (changed.count("osd_op_history_slow_op_size") ||
9994 changed.count("osd_op_history_slow_op_threshold")) {
9995 op_tracker.set_history_slow_op_size_and_threshold(cct->_conf->osd_op_history_slow_op_size,
9996 cct->_conf->osd_op_history_slow_op_threshold);
9997 }
9998 if (changed.count("osd_enable_op_tracker")) {
9999 op_tracker.set_tracking(cct->_conf->osd_enable_op_tracker);
10000 }
10001 if (changed.count("osd_disk_thread_ioprio_class") ||
10002 changed.count("osd_disk_thread_ioprio_priority")) {
10003 set_disk_tp_priority();
10004 }
10005 if (changed.count("osd_map_cache_size")) {
10006 service.map_cache.set_size(cct->_conf->osd_map_cache_size);
10007 service.map_bl_cache.set_size(cct->_conf->osd_map_cache_size);
10008 service.map_bl_inc_cache.set_size(cct->_conf->osd_map_cache_size);
10009 }
10010 if (changed.count("clog_to_monitors") ||
10011 changed.count("clog_to_syslog") ||
10012 changed.count("clog_to_syslog_level") ||
10013 changed.count("clog_to_syslog_facility") ||
10014 changed.count("clog_to_graylog") ||
10015 changed.count("clog_to_graylog_host") ||
10016 changed.count("clog_to_graylog_port") ||
10017 changed.count("host") ||
10018 changed.count("fsid")) {
10019 update_log_config();
10020 }
10021
10022 #ifdef HAVE_LIBFUSE
10023 if (changed.count("osd_objectstore_fuse")) {
10024 if (store) {
10025 enable_disable_fuse(false);
10026 }
10027 }
10028 #endif
10029
10030 if (changed.count("osd_recovery_delay_start")) {
10031 service.defer_recovery(cct->_conf->osd_recovery_delay_start);
10032 service.kick_recovery_queue();
10033 }
10034
10035 if (changed.count("osd_client_message_cap")) {
10036 uint64_t newval = cct->_conf->osd_client_message_cap;
10037 Messenger::Policy pol = client_messenger->get_policy(entity_name_t::TYPE_CLIENT);
10038 if (pol.throttler_messages && newval > 0) {
10039 pol.throttler_messages->reset_max(newval);
10040 }
10041 }
10042 if (changed.count("osd_client_message_size_cap")) {
10043 uint64_t newval = cct->_conf->osd_client_message_size_cap;
10044 Messenger::Policy pol = client_messenger->get_policy(entity_name_t::TYPE_CLIENT);
10045 if (pol.throttler_bytes && newval > 0) {
10046 pol.throttler_bytes->reset_max(newval);
10047 }
10048 }
10049
10050 check_config();
10051 }
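
// A minimal sketch of the observer contract OSD implements above
// (ExampleObserver is hypothetical and never registered anywhere):
// report the keys you track, then react only to those present in
// `changed`.
struct ExampleObserver {
  const char **get_tracked_conf_keys() const {
    static const char *KEYS[] = { "osd_max_backfills", NULL };
    return KEYS;
  }
  void handle_conf_change(const struct md_config_t *conf,
                          const std::set<std::string> &changed) {
    if (changed.count("osd_max_backfills")) {
      // read the new value off `conf` and resize the reservers, as above
    }
  }
};
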
10052
10053 void OSD::update_log_config()
10054 {
10055 map<string,string> log_to_monitors;
10056 map<string,string> log_to_syslog;
10057 map<string,string> log_channel;
10058 map<string,string> log_prio;
10059 map<string,string> log_to_graylog;
10060 map<string,string> log_to_graylog_host;
10061 map<string,string> log_to_graylog_port;
10062 uuid_d fsid;
10063 string host;
10064
10065 if (parse_log_client_options(cct, log_to_monitors, log_to_syslog,
10066 log_channel, log_prio, log_to_graylog,
10067 log_to_graylog_host, log_to_graylog_port,
10068 fsid, host) == 0)
10069 clog->update_config(log_to_monitors, log_to_syslog,
10070 log_channel, log_prio, log_to_graylog,
10071 log_to_graylog_host, log_to_graylog_port,
10072 fsid, host);
10073 derr << "log_to_monitors " << log_to_monitors << dendl;
10074 }
10075
10076 void OSD::check_config()
10077 {
10078 // some sanity checks
10079 if (cct->_conf->osd_map_cache_size <= cct->_conf->osd_map_max_advance + 2) {
10080 clog->warn() << "osd_map_cache_size (" << cct->_conf->osd_map_cache_size << ")"
10081 << " is not > osd_map_max_advance ("
10082 << cct->_conf->osd_map_max_advance << ")";
10083 }
10084 if (cct->_conf->osd_map_cache_size <= (int)cct->_conf->osd_pg_epoch_persisted_max_stale + 2) {
10085 clog->warn() << "osd_map_cache_size (" << cct->_conf->osd_map_cache_size << ")"
10086 << " is not > osd_pg_epoch_persisted_max_stale ("
10087 << cct->_conf->osd_pg_epoch_persisted_max_stale << ")";
10088 }
10089 }
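
// A worked example of the checks above (values are hypothetical):
// osd_map_cache_size = 200 with osd_map_max_advance = 150 passes the
// first check (200 > 152), but an osd_pg_epoch_persisted_max_stale of
// 199 trips the second warning, since 200 <= 199 + 2.
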
10090
10091 void OSD::set_disk_tp_priority()
10092 {
10093 dout(10) << __func__
10094 << " class " << cct->_conf->osd_disk_thread_ioprio_class
10095 << " priority " << cct->_conf->osd_disk_thread_ioprio_priority
10096 << dendl;
10097 if (cct->_conf->osd_disk_thread_ioprio_class.empty() ||
10098 cct->_conf->osd_disk_thread_ioprio_priority < 0)
10099 return;
10100 int cls =
10101 ceph_ioprio_string_to_class(cct->_conf->osd_disk_thread_ioprio_class);
10102 if (cls < 0)
10103 derr << __func__ << " " << cpp_strerror(cls) << ": "
10104 << "osd_disk_thread_ioprio_class is " << cct->_conf->osd_disk_thread_ioprio_class
10105 << " but only the following values are allowed: idle, be or rt" << dendl;
10106 else
10107 disk_tp.set_ioprio(cls, cct->_conf->osd_disk_thread_ioprio_priority);
10108 }
10109
10110 // --------------------------------
10111
10112 void OSD::get_latest_osdmap()
10113 {
10114 dout(10) << __func__ << " -- start" << dendl;
10115
10116 C_SaferCond cond;
10117 service.objecter->wait_for_latest_osdmap(&cond);
10118 cond.wait();
10119
10120 dout(10) << __func__ << " -- finish" << dendl;
10121 }
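
// The C_SaferCond pattern above, in isolation: hand the condition to an
// asynchronous call as its completion, then block on it.
//
//   C_SaferCond cond;
//   service.objecter->wait_for_latest_osdmap(&cond);  // fires later
//   cond.wait();  // blocks until it does
//
// wait() also returns the value the completion finished with, which the
// code above simply ignores.
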
10122
10123 // --------------------------------
10124
10125 int OSD::init_op_flags(OpRequestRef& op)
10126 {
10127 const MOSDOp *m = static_cast<const MOSDOp*>(op->get_req());
10128 vector<OSDOp>::const_iterator iter;
10129
10130 // client flags have no bearing on whether an op is a read, write, etc.
10131 op->rmw_flags = 0;
10132
10133 if (m->has_flag(CEPH_OSD_FLAG_RWORDERED)) {
10134 op->set_force_rwordered();
10135 }
10136
10137 // set bits based on op codes, called methods.
10138 for (iter = m->ops.begin(); iter != m->ops.end(); ++iter) {
10139 if ((iter->op.op == CEPH_OSD_OP_WATCH &&
10140 iter->op.watch.op == CEPH_OSD_WATCH_OP_PING)) {
10141 /* This is a bit odd.  PING isn't actually a write.  It can't
10142 * result in an update to the object_info.  PINGs also aren't
10143 * resent, so there's no reason to write out a log entry.
10144 *
10145 * However, we pipeline them behind writes, so let's force
10146 * the write_ordered flag.
10147 */
10148 op->set_force_rwordered();
10149 } else {
10150 if (ceph_osd_op_mode_modify(iter->op.op))
10151 op->set_write();
10152 }
10153 if (ceph_osd_op_mode_read(iter->op.op))
10154 op->set_read();
10155
10156 // set READ flag if there are src_oids
10157 if (iter->soid.oid.name.length())
10158 op->set_read();
10159
10160 // set PGOP flag if there are PG ops
10161 if (ceph_osd_op_type_pg(iter->op.op))
10162 op->set_pg_op();
10163
10164 if (ceph_osd_op_mode_cache(iter->op.op))
10165 op->set_cache();
10166
10167 // check for ec base pool
10168 int64_t poolid = m->get_pg().pool();
10169 const pg_pool_t *pool = osdmap->get_pg_pool(poolid);
10170 if (pool && pool->is_tier()) {
10171 const pg_pool_t *base_pool = osdmap->get_pg_pool(pool->tier_of);
10172 if (base_pool && base_pool->require_rollback()) {
10173 if ((iter->op.op != CEPH_OSD_OP_READ) &&
10174 (iter->op.op != CEPH_OSD_OP_CHECKSUM) &&
10175 (iter->op.op != CEPH_OSD_OP_CMPEXT) &&
10176 (iter->op.op != CEPH_OSD_OP_STAT) &&
10177 (iter->op.op != CEPH_OSD_OP_ISDIRTY) &&
10178 (iter->op.op != CEPH_OSD_OP_UNDIRTY) &&
10179 (iter->op.op != CEPH_OSD_OP_GETXATTR) &&
10180 (iter->op.op != CEPH_OSD_OP_GETXATTRS) &&
10181 (iter->op.op != CEPH_OSD_OP_CMPXATTR) &&
10182 (iter->op.op != CEPH_OSD_OP_ASSERT_VER) &&
10183 (iter->op.op != CEPH_OSD_OP_LIST_WATCHERS) &&
10184 (iter->op.op != CEPH_OSD_OP_LIST_SNAPS) &&
10185 (iter->op.op != CEPH_OSD_OP_SETALLOCHINT) &&
10186 (iter->op.op != CEPH_OSD_OP_WRITEFULL) &&
10187 (iter->op.op != CEPH_OSD_OP_ROLLBACK) &&
10188 (iter->op.op != CEPH_OSD_OP_CREATE) &&
10189 (iter->op.op != CEPH_OSD_OP_DELETE) &&
10190 (iter->op.op != CEPH_OSD_OP_SETXATTR) &&
10191 (iter->op.op != CEPH_OSD_OP_RMXATTR) &&
10192 (iter->op.op != CEPH_OSD_OP_STARTSYNC) &&
10193 (iter->op.op != CEPH_OSD_OP_COPY_GET) &&
10194 (iter->op.op != CEPH_OSD_OP_COPY_FROM)) {
10195 op->set_promote();
10196 }
10197 }
10198 }
10199
10200 switch (iter->op.op) {
10201 case CEPH_OSD_OP_CALL:
10202 {
10203 bufferlist::iterator bp = const_cast<bufferlist&>(iter->indata).begin();
10204 int is_write, is_read;
10205 string cname, mname;
10206 bp.copy(iter->op.cls.class_len, cname);
10207 bp.copy(iter->op.cls.method_len, mname);
10208
10209 ClassHandler::ClassData *cls;
10210 int r = class_handler->open_class(cname, &cls);
10211 if (r) {
10212 derr << "class " << cname << " open got " << cpp_strerror(r) << dendl;
10213 if (r == -ENOENT)
10214 r = -EOPNOTSUPP;
10215 else if (r != -EPERM) // propagate permission errors
10216 r = -EIO;
10217 return r;
10218 }
10219 int flags = cls->get_method_flags(mname.c_str());
10220 if (flags < 0) {
10221 if (flags == -ENOENT)
10222 r = -EOPNOTSUPP;
10223 else
10224 r = flags;
10225 return r;
10226 }
10227 is_read = flags & CLS_METHOD_RD;
10228 is_write = flags & CLS_METHOD_WR;
10229 bool is_promote = flags & CLS_METHOD_PROMOTE;
10230
10231 dout(10) << "class " << cname << " method " << mname << " "
10232 << "flags=" << (is_read ? "r" : "")
10233 << (is_write ? "w" : "")
10234 << (is_promote ? "p" : "")
10235 << dendl;
10236 if (is_read)
10237 op->set_class_read();
10238 if (is_write)
10239 op->set_class_write();
10240 if (is_promote)
10241 op->set_promote();
10242 op->add_class(cname, is_read, is_write, cls->whitelisted);
10243 break;
10244 }
10245
10246 case CEPH_OSD_OP_WATCH:
10247 // force the read bit for watch since it depends on previous
10248 // watch state (and may return early if the watch exists) or, in
10249 // the case of ping, is simply a read op.
10250 op->set_read();
10251 // fall through
10252 case CEPH_OSD_OP_NOTIFY:
10253 case CEPH_OSD_OP_NOTIFY_ACK:
10254 {
10255 op->set_promote();
10256 break;
10257 }
10258
10259 case CEPH_OSD_OP_DELETE:
10260 // if we get a delete with FAILOK we can skip the cache handling; without
10261 // FAILOK we still need to promote (or do something smarter) to
10262 // determine whether to return ENOENT or 0.
10263 if (iter == m->ops.begin() &&
10264 iter->op.flags == CEPH_OSD_OP_FLAG_FAILOK) {
10265 op->set_skip_handle_cache();
10266 }
10267 // skip promotion when proxying a delete op
10268 if (m->ops.size() == 1) {
10269 op->set_skip_promote();
10270 }
10271 break;
10272
10273 case CEPH_OSD_OP_CACHE_TRY_FLUSH:
10274 case CEPH_OSD_OP_CACHE_FLUSH:
10275 case CEPH_OSD_OP_CACHE_EVICT:
10276 // If try_flush/flush/evict is the only op, can skip handle cache.
10277 if (m->ops.size() == 1) {
10278 op->set_skip_handle_cache();
10279 }
10280 break;
10281
10282 case CEPH_OSD_OP_READ:
10283 case CEPH_OSD_OP_SYNC_READ:
10284 case CEPH_OSD_OP_SPARSE_READ:
10285 case CEPH_OSD_OP_CHECKSUM:
10286 case CEPH_OSD_OP_WRITEFULL:
10287 if (m->ops.size() == 1 &&
10288 (iter->op.flags & CEPH_OSD_OP_FLAG_FADVISE_NOCACHE ||
10289 iter->op.flags & CEPH_OSD_OP_FLAG_FADVISE_DONTNEED)) {
10290 op->set_skip_promote();
10291 }
10292 break;
10293
10294 // force promotion when pinning an object in the cache tier
10295 case CEPH_OSD_OP_CACHE_PIN:
10296 op->set_promote();
10297 break;
10298
10299 default:
10300 break;
10301 }
10302 }
10303
10304 if (op->rmw_flags == 0)
10305 return -EINVAL;
10306
10307 return 0;
10308 }
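
// Two concrete outcomes of the classification above: a request whose
// only op is CEPH_OSD_OP_DELETE with CEPH_OSD_OP_FLAG_FAILOK gets both
// skip_handle_cache and skip_promote; a CEPH_OSD_OP_CALL into a method
// flagged CLS_METHOD_RD | CLS_METHOD_WR marks the request class_read
// and class_write.  A request that ends up with no rmw bits at all is
// rejected with -EINVAL.
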
10309
10310 void OSD::PeeringWQ::_dequeue(list<PG*> *out) {
10311 for (list<PG*>::iterator i = peering_queue.begin();
10312 i != peering_queue.end() &&
10313 out->size() < osd->cct->_conf->osd_peering_wq_batch_size;
10314 ) {
10315 if (in_use.count(*i)) {
10316 ++i;
10317 } else {
10318 out->push_back(*i);
10319 peering_queue.erase(i++);
10320 }
10321 }
10322 in_use.insert(out->begin(), out->end());
10323 }
10324
10325
10326 // =============================================================
10327
10328 #undef dout_context
10329 #define dout_context osd->cct
10330 #undef dout_prefix
10331 #define dout_prefix *_dout << "osd." << osd->whoami << " op_wq "
10332
10333 void OSD::ShardedOpWQ::wake_pg_waiters(spg_t pgid)
10334 {
10335 uint32_t shard_index = pgid.hash_to_shard(shard_list.size());
10336 auto sdata = shard_list[shard_index];
10337 bool queued = false;
10338 {
10339 Mutex::Locker l(sdata->sdata_op_ordering_lock);
10340 auto p = sdata->pg_slots.find(pgid);
10341 if (p != sdata->pg_slots.end()) {
10342 dout(20) << __func__ << " " << pgid
10343 << " to_process " << p->second.to_process
10344 << " waiting_for_pg=" << (int)p->second.waiting_for_pg << dendl;
10345 for (auto i = p->second.to_process.rbegin();
10346 i != p->second.to_process.rend();
10347 ++i) {
10348 sdata->_enqueue_front(make_pair(pgid, *i), osd->op_prio_cutoff);
10349 }
10350 p->second.to_process.clear();
10351 p->second.waiting_for_pg = false;
10352 ++p->second.requeue_seq;
10353 queued = true;
10354 }
10355 }
10356 if (queued) {
10357 sdata->sdata_lock.Lock();
10358 sdata->sdata_cond.SignalOne();
10359 sdata->sdata_lock.Unlock();
10360 }
10361 }
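
// requeue_seq is a race detector shared with _process(): a worker
// snapshots the slot's sequence before releasing
// sdata_op_ordering_lock; if wake_pg_waiters() bumped it in between,
// the worker's view of to_process is stale and it bails out.  The shape
// of that check, stripped down (ExampleSlot is hypothetical):
struct ExampleSlot {
  uint64_t requeue_seq = 0;
};
static inline bool example_raced_with_requeue(const ExampleSlot &slot,
                                              uint64_t snapshot)
{
  return slot.requeue_seq != snapshot;  // cf. the test in _process()
}
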
10362
10363 void OSD::ShardedOpWQ::prune_pg_waiters(OSDMapRef osdmap, int whoami)
10364 {
10365 unsigned pushes_to_free = 0;
10366 for (auto sdata : shard_list) {
10367 Mutex::Locker l(sdata->sdata_op_ordering_lock);
10368 sdata->waiting_for_pg_osdmap = osdmap;
10369 auto p = sdata->pg_slots.begin();
10370 while (p != sdata->pg_slots.end()) {
10371 ShardData::pg_slot& slot = p->second;
10372 if (!slot.to_process.empty() && slot.num_running == 0) {
10373 if (osdmap->is_up_acting_osd_shard(p->first, whoami)) {
10374 dout(20) << __func__ << " " << p->first << " maps to us, keeping"
10375 << dendl;
10376 ++p;
10377 continue;
10378 }
10379 while (!slot.to_process.empty() &&
10380 slot.to_process.front().get_map_epoch() <= osdmap->get_epoch()) {
10381 auto& qi = slot.to_process.front();
10382 dout(20) << __func__ << " " << p->first
10383 << " item " << qi
10384 << " epoch " << qi.get_map_epoch()
10385 << " <= " << osdmap->get_epoch()
10386 << ", stale, dropping" << dendl;
10387 pushes_to_free += qi.get_reserved_pushes();
10388 slot.to_process.pop_front();
10389 }
10390 }
10391 if (slot.to_process.empty() &&
10392 slot.num_running == 0 &&
10393 !slot.pg) {
10394 dout(20) << __func__ << " " << p->first << " empty, pruning" << dendl;
10395 p = sdata->pg_slots.erase(p);
10396 } else {
10397 ++p;
10398 }
10399 }
10400 }
10401 if (pushes_to_free > 0) {
10402 osd->service.release_reserved_pushes(pushes_to_free);
10403 }
10404 }
10405
10406 void OSD::ShardedOpWQ::clear_pg_pointer(spg_t pgid)
10407 {
10408 uint32_t shard_index = pgid.hash_to_shard(shard_list.size());
10409 auto sdata = shard_list[shard_index];
10410 Mutex::Locker l(sdata->sdata_op_ordering_lock);
10411 auto p = sdata->pg_slots.find(pgid);
10412 if (p != sdata->pg_slots.end()) {
10413 auto& slot = p->second;
10414 dout(20) << __func__ << " " << pgid << " pg " << slot.pg << dendl;
10415 assert(!slot.pg || slot.pg->deleting);
10416 slot.pg = nullptr;
10417 }
10418 }
10419
10420 void OSD::ShardedOpWQ::clear_pg_slots()
10421 {
10422 for (auto sdata : shard_list) {
10423 Mutex::Locker l(sdata->sdata_op_ordering_lock);
10424 sdata->pg_slots.clear();
10425 sdata->waiting_for_pg_osdmap.reset();
10426 // don't bother with reserved pushes; we are shutting down
10427 }
10428 }
10429
10430 #undef dout_prefix
10431 #define dout_prefix *_dout << "osd." << osd->whoami << " op_wq(" << shard_index << ") "
10432
10433 void OSD::ShardedOpWQ::_process(uint32_t thread_index, heartbeat_handle_d *hb)
10434 {
10435 uint32_t shard_index = thread_index % num_shards;
10436 ShardData *sdata = shard_list[shard_index];
10437 assert(NULL != sdata);
10438
10439 // peek at spg_t
10440 sdata->sdata_op_ordering_lock.Lock();
10441 if (sdata->pqueue->empty()) {
10442 dout(20) << __func__ << " empty q, waiting" << dendl;
10443 // optimistically sleep a moment; maybe another work item will come along.
10444 osd->cct->get_heartbeat_map()->reset_timeout(hb,
10445 osd->cct->_conf->threadpool_default_timeout, 0);
10446 sdata->sdata_lock.Lock();
10447 sdata->sdata_op_ordering_lock.Unlock();
10448 sdata->sdata_cond.WaitInterval(sdata->sdata_lock,
10449 utime_t(osd->cct->_conf->threadpool_empty_queue_max_wait, 0));
10450 sdata->sdata_lock.Unlock();
10451 sdata->sdata_op_ordering_lock.Lock();
10452 if (sdata->pqueue->empty()) {
10453 sdata->sdata_op_ordering_lock.Unlock();
10454 return;
10455 }
10456 }
10457 pair<spg_t, PGQueueable> item = sdata->pqueue->dequeue();
10458 if (osd->is_stopping()) {
10459 sdata->sdata_op_ordering_lock.Unlock();
10460 return; // OSD shutdown, discard.
10461 }
10462 PGRef pg;
10463 uint64_t requeue_seq;
10464 {
10465 auto& slot = sdata->pg_slots[item.first];
10466 dout(30) << __func__ << " " << item.first
10467 << " to_process " << slot.to_process
10468 << " waiting_for_pg=" << (int)slot.waiting_for_pg << dendl;
10469 slot.to_process.push_back(item.second);
10470 // note the requeue seq now...
10471 requeue_seq = slot.requeue_seq;
10472 if (slot.waiting_for_pg) {
10473 // save ourselves a bit of effort
10474 dout(20) << __func__ << " " << item.first << " item " << item.second
10475 << " queued, waiting_for_pg" << dendl;
10476 sdata->sdata_op_ordering_lock.Unlock();
10477 return;
10478 }
10479 pg = slot.pg;
10480 dout(20) << __func__ << " " << item.first << " item " << item.second
10481 << " queued" << dendl;
10482 ++slot.num_running;
10483 }
10484 sdata->sdata_op_ordering_lock.Unlock();
10485
10486 osd->service.maybe_inject_dispatch_delay();
10487
10488 // [lookup +] lock pg (if we have it)
10489 if (!pg) {
10490 pg = osd->_lookup_lock_pg(item.first);
10491 } else {
10492 pg->lock();
10493 }
10494
10495 osd->service.maybe_inject_dispatch_delay();
10496
  boost::optional<PGQueueable> qi;

  // we don't use a Mutex::Locker here because of the
  // osd->service.release_reserved_pushes() call below
  sdata->sdata_op_ordering_lock.Lock();

  auto q = sdata->pg_slots.find(item.first);
  assert(q != sdata->pg_slots.end());
  auto& slot = q->second;
  --slot.num_running;

  if (slot.to_process.empty()) {
    // raced with wake_pg_waiters or prune_pg_waiters
    dout(20) << __func__ << " " << item.first << " nothing queued" << dendl;
    if (pg) {
      pg->unlock();
    }
    sdata->sdata_op_ordering_lock.Unlock();
    return;
  }
  if (requeue_seq != slot.requeue_seq) {
    dout(20) << __func__ << " " << item.first
             << " requeue_seq " << slot.requeue_seq << " > our "
             << requeue_seq << ", we raced with wake_pg_waiters"
             << dendl;
    if (pg) {
      pg->unlock();
    }
    sdata->sdata_op_ordering_lock.Unlock();
    return;
  }
  if (pg && !slot.pg && !pg->deleting) {
    dout(20) << __func__ << " " << item.first << " set pg to " << pg << dendl;
    slot.pg = pg;
  }
  dout(30) << __func__ << " " << item.first << " to_process " << slot.to_process
           << " waiting_for_pg=" << (int)slot.waiting_for_pg << dendl;

  // make sure we're not already waiting for this pg
  if (slot.waiting_for_pg) {
    dout(20) << __func__ << " " << item.first << " item " << item.second
             << " slot is waiting_for_pg" << dendl;
    if (pg) {
      pg->unlock();
    }
    sdata->sdata_op_ordering_lock.Unlock();
    return;
  }

  // take next item
  qi = slot.to_process.front();
  slot.to_process.pop_front();
  dout(20) << __func__ << " " << item.first << " item " << *qi
           << " pg " << pg << dendl;

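  // no pg was found (or it is being deleted); consult this shard's view
  // of the osdmap to decide among three cases: the pg maps to this osd
  // and should exist, so park the item until the pg is created; the
  // item is from a newer epoch than we have seen, so wait for the map
  // to catch up; otherwise the pg does not belong here, so drop the
  // item, share our map with the client if needed, and release any
  // recovery push reservations the item was holding.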
  if (!pg) {
    // should this pg shard exist on this osd in this (or a later) epoch?
    OSDMapRef osdmap = sdata->waiting_for_pg_osdmap;
    if (osdmap->is_up_acting_osd_shard(item.first, osd->whoami)) {
      dout(20) << __func__ << " " << item.first
               << " no pg, should exist, will wait on " << *qi << dendl;
      slot.to_process.push_front(*qi);
      slot.waiting_for_pg = true;
    } else if (qi->get_map_epoch() > osdmap->get_epoch()) {
      dout(20) << __func__ << " " << item.first << " no pg, item epoch is "
               << qi->get_map_epoch() << " > " << osdmap->get_epoch()
               << ", will wait on " << *qi << dendl;
      slot.to_process.push_front(*qi);
      slot.waiting_for_pg = true;
    } else {
      dout(20) << __func__ << " " << item.first << " no pg, shouldn't exist,"
               << " dropping " << *qi << dendl;
      // share map with client?
      if (boost::optional<OpRequestRef> _op = qi->maybe_get_op()) {
        Session *session = static_cast<Session *>(
          (*_op)->get_req()->get_connection()->get_priv());
        if (session) {
          osd->maybe_share_map(session, *_op, sdata->waiting_for_pg_osdmap);
          session->put();
        }
      }
      unsigned pushes_to_free = qi->get_reserved_pushes();
      if (pushes_to_free > 0) {
        sdata->sdata_op_ordering_lock.Unlock();
        osd->service.release_reserved_pushes(pushes_to_free);
        return;
      }
    }
    sdata->sdata_op_ordering_lock.Unlock();
    return;
  }
  sdata->sdata_op_ordering_lock.Unlock();

  // osd_opwq_process marks the point at which an operation has been dequeued
  // and will begin to be handled by a worker thread.
  {
#ifdef WITH_LTTNG
    osd_reqid_t reqid;
    if (boost::optional<OpRequestRef> _op = qi->maybe_get_op()) {
      reqid = (*_op)->get_reqid();
    }
#endif
    tracepoint(osd, opwq_process_start, reqid.name._type,
               reqid.name._num, reqid.tid, reqid.inc);
  }

  lgeneric_subdout(osd->cct, osd, 30) << "dequeue status: ";
  Formatter *f = Formatter::create("json");
  f->open_object_section("q");
  dump(f);
  f->close_section();
  f->flush(*_dout);
  delete f;
  *_dout << dendl;

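  // run the item under a TPHandle so that long-running work can ping
  // the heartbeat map: exceeding timeout_interval marks the worker
  // unhealthy, and exceeding suicide_interval aborts the osd rather
  // than leaving a wedged thread in place.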
  ThreadPool::TPHandle tp_handle(osd->cct, hb, timeout_interval,
                                 suicide_interval);
  qi->run(osd, pg, tp_handle);

  {
#ifdef WITH_LTTNG
    osd_reqid_t reqid;
    if (boost::optional<OpRequestRef> _op = qi->maybe_get_op()) {
      reqid = (*_op)->get_reqid();
    }
#endif
    tracepoint(osd, opwq_process_finish, reqid.name._type,
               reqid.name._num, reqid.tid, reqid.inc);
  }

  pg->unlock();
}

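// _enqueue() routes an item to its shard by hashing the spg_t, so all
// items for a given pg land on the same shard and per-pg ordering is
// preserved.  items at or above op_prio_cutoff bypass fair queueing
// via enqueue_strict(); everything else is enqueued with its cost so
// the underlying queue implementation can schedule proportionally.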
void OSD::ShardedOpWQ::_enqueue(pair<spg_t, PGQueueable> item) {
  uint32_t shard_index =
    item.first.hash_to_shard(shard_list.size());

  ShardData* sdata = shard_list[shard_index];
  assert(NULL != sdata);
  unsigned priority = item.second.get_priority();
  unsigned cost = item.second.get_cost();
  sdata->sdata_op_ordering_lock.Lock();

  dout(20) << __func__ << " " << item.first << " " << item.second << dendl;
  if (priority >= osd->op_prio_cutoff)
    sdata->pqueue->enqueue_strict(
      item.second.get_owner(), priority, item);
  else
    sdata->pqueue->enqueue(
      item.second.get_owner(),
      priority, cost, item);
  sdata->sdata_op_ordering_lock.Unlock();

  sdata->sdata_lock.Lock();
  sdata->sdata_cond.SignalOne();
  sdata->sdata_lock.Unlock();
}

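// _enqueue_front() requeues an item that was already dequeued once
// (e.g. an op that had to wait), so it must come back out before any
// newer item for the same pg.  the swap below preserves that ordering
// even if a racing _process has already staged a newer item in the
// slot's to_process list.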
void OSD::ShardedOpWQ::_enqueue_front(pair<spg_t, PGQueueable> item)
{
  uint32_t shard_index = item.first.hash_to_shard(shard_list.size());
  ShardData* sdata = shard_list[shard_index];
  assert(NULL != sdata);
  sdata->sdata_op_ordering_lock.Lock();
  auto p = sdata->pg_slots.find(item.first);
  if (p != sdata->pg_slots.end() && !p->second.to_process.empty()) {
    // we may be racing with _process, which has dequeued a new item
    // from pqueue, put it on to_process, and is now busy taking the
    // pg lock.  ensure this old requeued item is ordered before any
    // such newer item in to_process.
    p->second.to_process.push_front(item.second);
    item.second = p->second.to_process.back();
    p->second.to_process.pop_back();
    dout(20) << __func__ << " " << item.first
             << " " << p->second.to_process.front()
             << " shuffled w/ " << item.second << dendl;
  } else {
    dout(20) << __func__ << " " << item.first << " " << item.second << dendl;
  }
  sdata->_enqueue_front(item, osd->op_prio_cutoff);
  sdata->sdata_op_ordering_lock.Unlock();
  sdata->sdata_lock.Lock();
  sdata->sdata_cond.SignalOne();
  sdata->sdata_lock.Unlock();
}

namespace ceph {
namespace osd_cmds {

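// handler for the "heap" osd command, which drives the tcmalloc heap
// profiler.  "heapcmd" carries the subcommand (e.g. start_profiler,
// stop_profiler, dump, stats, release), which is split into a vector
// and forwarded to ceph_heap_profiler_handle_command(); a typical
// invocation from the CLI looks like:
//
//   ceph tell osd.0 heap stats
//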
int heap(CephContext& cct, cmdmap_t& cmdmap, Formatter& f, std::ostream& os)
{
  if (!ceph_using_tcmalloc()) {
    os << "could not issue heap profiler command -- not using tcmalloc!";
    return -EOPNOTSUPP;
  }

  string cmd;
  if (!cmd_getval(&cct, cmdmap, "heapcmd", cmd)) {
    // cmd is still empty when cmd_getval fails, so name the missing
    // key directly instead of printing the empty string
    os << "unable to get value for command \"heapcmd\"";
    return -EINVAL;
  }

  std::vector<std::string> cmd_vec;
  get_str_vec(cmd, cmd_vec);

  ceph_heap_profiler_handle_command(cmd_vec, os);

  return 0;
}

}} // namespace ceph::osd_cmds

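// pretty-printer for the op queue implementation the osd was
// configured with; these names mirror the values of the osd_op_queue
// option (the weighted-priority queue is selected there as "wpq").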
std::ostream& operator<<(std::ostream& out, const OSD::io_queue& q) {
  switch(q) {
  case OSD::io_queue::prioritized:
    out << "prioritized";
    break;
  case OSD::io_queue::weightedpriority:
    out << "weightedpriority";
    break;
  case OSD::io_queue::mclock_opclass:
    out << "mclock_opclass";
    break;
  case OSD::io_queue::mclock_client:
    out << "mclock_client";
    break;
  }
  return out;
}