]> git.proxmox.com Git - ceph.git/blob - ceph/src/osd/OSD.cc
update sources to v12.2.5
[ceph.git] / ceph / src / osd / OSD.cc
1 // -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
2 // vim: ts=8 sw=2 smarttab
3 /*
4 * Ceph - scalable distributed file system
5 *
6 * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
7 * Copyright (C) 2017 OVH
8 *
9 * This is free software; you can redistribute it and/or
10 * modify it under the terms of the GNU Lesser General Public
11 * License version 2.1, as published by the Free Software
12 * Foundation. See file COPYING.
13 *
14 */
15 #include "acconfig.h"
16
17 #include <fstream>
18 #include <iostream>
19 #include <errno.h>
20 #include <sys/stat.h>
21 #include <signal.h>
22 #include <ctype.h>
23 #include <boost/scoped_ptr.hpp>
24
25 #ifdef HAVE_SYS_PARAM_H
26 #include <sys/param.h>
27 #endif
28
29 #ifdef HAVE_SYS_MOUNT_H
30 #include <sys/mount.h>
31 #endif
32
33 #include "osd/PG.h"
34
35 #include "include/types.h"
36 #include "include/compat.h"
37
38 #include "OSD.h"
39 #include "OSDMap.h"
40 #include "Watch.h"
41 #include "osdc/Objecter.h"
42
43 #include "common/errno.h"
44 #include "common/ceph_argparse.h"
45 #include "common/ceph_time.h"
46 #include "common/version.h"
47 #include "common/io_priority.h"
48 #include "common/pick_address.h"
49
50 #include "os/ObjectStore.h"
51 #ifdef HAVE_LIBFUSE
52 #include "os/FuseStore.h"
53 #endif
54
55 #include "PrimaryLogPG.h"
56
57
58 #include "msg/Messenger.h"
59 #include "msg/Message.h"
60
61 #include "mon/MonClient.h"
62
63 #include "messages/MLog.h"
64
65 #include "messages/MGenericMessage.h"
66 #include "messages/MOSDPing.h"
67 #include "messages/MOSDFailure.h"
68 #include "messages/MOSDMarkMeDown.h"
69 #include "messages/MOSDFull.h"
70 #include "messages/MOSDOp.h"
71 #include "messages/MOSDOpReply.h"
72 #include "messages/MOSDBackoff.h"
73 #include "messages/MOSDBeacon.h"
74 #include "messages/MOSDRepOp.h"
75 #include "messages/MOSDRepOpReply.h"
76 #include "messages/MOSDBoot.h"
77 #include "messages/MOSDPGTemp.h"
78
79 #include "messages/MOSDMap.h"
80 #include "messages/MMonGetOSDMap.h"
81 #include "messages/MOSDPGNotify.h"
82 #include "messages/MOSDPGQuery.h"
83 #include "messages/MOSDPGLog.h"
84 #include "messages/MOSDPGRemove.h"
85 #include "messages/MOSDPGInfo.h"
86 #include "messages/MOSDPGCreate.h"
87 #include "messages/MOSDPGTrim.h"
88 #include "messages/MOSDPGScan.h"
89 #include "messages/MOSDPGBackfill.h"
90 #include "messages/MBackfillReserve.h"
91 #include "messages/MRecoveryReserve.h"
92 #include "messages/MOSDForceRecovery.h"
93 #include "messages/MOSDECSubOpWrite.h"
94 #include "messages/MOSDECSubOpWriteReply.h"
95 #include "messages/MOSDECSubOpRead.h"
96 #include "messages/MOSDECSubOpReadReply.h"
97 #include "messages/MOSDPGCreated.h"
98 #include "messages/MOSDPGUpdateLogMissing.h"
99 #include "messages/MOSDPGUpdateLogMissingReply.h"
100
101 #include "messages/MOSDAlive.h"
102
103 #include "messages/MOSDScrub.h"
104 #include "messages/MOSDScrubReserve.h"
105 #include "messages/MOSDRepScrub.h"
106
107 #include "messages/MMonCommand.h"
108 #include "messages/MCommand.h"
109 #include "messages/MCommandReply.h"
110
111 #include "messages/MPGStats.h"
112 #include "messages/MPGStatsAck.h"
113
114 #include "messages/MWatchNotify.h"
115 #include "messages/MOSDPGPush.h"
116 #include "messages/MOSDPGPushReply.h"
117 #include "messages/MOSDPGPull.h"
118
119 #include "common/perf_counters.h"
120 #include "common/Timer.h"
121 #include "common/LogClient.h"
122 #include "common/AsyncReserver.h"
123 #include "common/HeartbeatMap.h"
124 #include "common/admin_socket.h"
125 #include "common/ceph_context.h"
126
127 #include "global/signal_handler.h"
128 #include "global/pidfile.h"
129
130 #include "include/color.h"
131 #include "perfglue/cpu_profiler.h"
132 #include "perfglue/heap_profiler.h"
133
134 #include "osd/OpRequest.h"
135
136 #include "auth/AuthAuthorizeHandler.h"
137 #include "auth/RotatingKeyRing.h"
138 #include "common/errno.h"
139
140 #include "objclass/objclass.h"
141
142 #include "common/cmdparse.h"
143 #include "include/str_list.h"
144 #include "include/util.h"
145
146 #include "include/assert.h"
147 #include "common/config.h"
148 #include "common/EventTrace.h"
149
150 #ifdef WITH_LTTNG
151 #define TRACEPOINT_DEFINE
152 #define TRACEPOINT_PROBE_DYNAMIC_LINKAGE
153 #include "tracing/osd.h"
154 #undef TRACEPOINT_PROBE_DYNAMIC_LINKAGE
155 #undef TRACEPOINT_DEFINE
156 #else
157 #define tracepoint(...)
158 #endif
159
160 #define dout_context cct
161 #define dout_subsys ceph_subsys_osd
162 #undef dout_prefix
163 #define dout_prefix _prefix(_dout, whoami, get_osdmap_epoch())
164
165
// Tick interval used by the OSD; also used by
// OSDService::promote_throttle_recalibrate() to size the per-interval
// promotion caps.  NOTE(review): presumably expressed in seconds -- confirm
// against the timer that consumes it.
const double OSD::OSD_TICK_INTERVAL = 1.0;
167
168 static ostream& _prefix(std::ostream* _dout, int whoami, epoch_t epoch) {
169 return *_dout << "osd." << whoami << " " << epoch << " ";
170 }
171
172 //Initial features in new superblock.
173 //Features here are also automatically upgraded
174 CompatSet OSD::get_osd_initial_compat_set() {
175 CompatSet::FeatureSet ceph_osd_feature_compat;
176 CompatSet::FeatureSet ceph_osd_feature_ro_compat;
177 CompatSet::FeatureSet ceph_osd_feature_incompat;
178 ceph_osd_feature_incompat.insert(CEPH_OSD_FEATURE_INCOMPAT_BASE);
179 ceph_osd_feature_incompat.insert(CEPH_OSD_FEATURE_INCOMPAT_PGINFO);
180 ceph_osd_feature_incompat.insert(CEPH_OSD_FEATURE_INCOMPAT_OLOC);
181 ceph_osd_feature_incompat.insert(CEPH_OSD_FEATURE_INCOMPAT_LEC);
182 ceph_osd_feature_incompat.insert(CEPH_OSD_FEATURE_INCOMPAT_CATEGORIES);
183 ceph_osd_feature_incompat.insert(CEPH_OSD_FEATURE_INCOMPAT_HOBJECTPOOL);
184 ceph_osd_feature_incompat.insert(CEPH_OSD_FEATURE_INCOMPAT_BIGINFO);
185 ceph_osd_feature_incompat.insert(CEPH_OSD_FEATURE_INCOMPAT_LEVELDBINFO);
186 ceph_osd_feature_incompat.insert(CEPH_OSD_FEATURE_INCOMPAT_LEVELDBLOG);
187 ceph_osd_feature_incompat.insert(CEPH_OSD_FEATURE_INCOMPAT_SNAPMAPPER);
188 ceph_osd_feature_incompat.insert(CEPH_OSD_FEATURE_INCOMPAT_HINTS);
189 ceph_osd_feature_incompat.insert(CEPH_OSD_FEATURE_INCOMPAT_PGMETA);
190 ceph_osd_feature_incompat.insert(CEPH_OSD_FEATURE_INCOMPAT_MISSING);
191 ceph_osd_feature_incompat.insert(CEPH_OSD_FEATURE_INCOMPAT_FASTINFO);
192 ceph_osd_feature_incompat.insert(CEPH_OSD_FEATURE_INCOMPAT_RECOVERY_DELETES);
193 return CompatSet(ceph_osd_feature_compat, ceph_osd_feature_ro_compat,
194 ceph_osd_feature_incompat);
195 }
196
197 //Features are added here that this OSD supports.
198 CompatSet OSD::get_osd_compat_set() {
199 CompatSet compat = get_osd_initial_compat_set();
200 //Any features here can be set in code, but not in initial superblock
201 compat.incompat.insert(CEPH_OSD_FEATURE_INCOMPAT_SHARDS);
202 return compat;
203 }
204
// Constructs the service facade for the given OSD.
//
// Most members are simply bound to the corresponding member of *osd
// (store, messengers, loggers, monitor client, work queues, ...).  The
// remaining initializers name the service's mutexes, zero its counters,
// size the reservers and map caches from configuration, and allocate the
// Objecter, which is initialized in the constructor body and deleted in
// the destructor.
OSDService::OSDService(OSD *osd) :
  osd(osd),
  cct(osd->cct),
  meta_osr(new ObjectStore::Sequencer("meta")),
  whoami(osd->whoami), store(osd->store),
  log_client(osd->log_client), clog(osd->clog),
  pg_recovery_stats(osd->pg_recovery_stats),
  cluster_messenger(osd->cluster_messenger),
  client_messenger(osd->client_messenger),
  logger(osd->logger),
  recoverystate_perf(osd->recoverystate_perf),
  monc(osd->monc),
  peering_wq(osd->peering_wq),
  recovery_gen_wq("recovery_gen_wq", cct->_conf->osd_recovery_thread_timeout,
                  &osd->disk_tp),
  class_handler(osd->class_handler),
  pg_epoch_lock("OSDService::pg_epoch_lock"),
  publish_lock("OSDService::publish_lock"),
  pre_publish_lock("OSDService::pre_publish_lock"),
  max_oldest_map(0),
  peer_map_epoch_lock("OSDService::peer_map_epoch_lock"),
  sched_scrub_lock("OSDService::sched_scrub_lock"), scrubs_pending(0),
  scrubs_active(0),
  // tiering agent state
  agent_lock("OSDService::agent_lock"),
  agent_valid_iterator(false),
  agent_ops(0),
  flush_mode_high_count(0),
  agent_active(true),
  agent_thread(this),
  agent_stop_flag(false),
  agent_timer_lock("OSDService::agent_timer_lock"),
  agent_timer(osd->client_messenger->cct, agent_timer_lock),
  // promotion throttle state
  last_recalibrate(ceph_clock_now()),
  promote_max_objects(0),
  promote_max_bytes(0),
  // owned by this service; released in ~OSDService()
  objecter(new Objecter(osd->client_messenger->cct, osd->objecter_messenger, osd->monc, NULL, 0, 0)),
  objecter_finisher(osd->client_messenger->cct),
  watch_lock("OSDService::watch_lock"),
  watch_timer(osd->client_messenger->cct, watch_lock),
  next_notif_id(0),
  recovery_request_lock("OSDService::recovery_request_lock"),
  recovery_request_timer(cct, recovery_request_lock, false),
  recovery_sleep_lock("OSDService::recovery_sleep_lock"),
  recovery_sleep_timer(cct, recovery_sleep_lock, false),
  // the local, remote and snap reservers all share one finisher
  reserver_finisher(cct),
  local_reserver(cct, &reserver_finisher, cct->_conf->osd_max_backfills,
                 cct->_conf->osd_min_recovery_priority),
  remote_reserver(cct, &reserver_finisher, cct->_conf->osd_max_backfills,
                  cct->_conf->osd_min_recovery_priority),
  pg_temp_lock("OSDService::pg_temp_lock"),
  snap_sleep_lock("OSDService::snap_sleep_lock"),
  snap_sleep_timer(
    osd->client_messenger->cct, snap_sleep_lock, false /* relax locking */),
  scrub_sleep_lock("OSDService::scrub_sleep_lock"),
  scrub_sleep_timer(
    osd->client_messenger->cct, scrub_sleep_lock, false /* relax locking */),
  snap_reserver(cct, &reserver_finisher,
                cct->_conf->osd_max_trimming_pgs),
  recovery_lock("OSDService::recovery_lock"),
  recovery_ops_active(0),
  recovery_ops_reserved(0),
  recovery_paused(false),
  // all three map caches are sized by osd_map_cache_size
  map_cache_lock("OSDService::map_cache_lock"),
  map_cache(cct, cct->_conf->osd_map_cache_size),
  map_bl_cache(cct->_conf->osd_map_cache_size),
  map_bl_inc_cache(cct->_conf->osd_map_cache_size),
  in_progress_split_lock("OSDService::in_progress_split_lock"),
  stat_lock("OSDService::stat_lock"),
  // fullness tracking starts out as "not full"
  full_status_lock("OSDService::full_status_lock"),
  cur_state(NONE),
  cur_ratio(0),
  epoch_lock("OSDService::epoch_lock"),
  boot_epoch(0), up_epoch(0), bind_epoch(0),
  is_stopping_lock("OSDService::is_stopping_lock")
#ifdef PG_DEBUG_REFS
  , pgid_lock("OSDService::pgid_lock")
#endif
{
  objecter->init();
}
285
// Releases the Objecter that the constructor allocated with new.
OSDService::~OSDService()
{
  delete objecter;
}
290
291
292
293 #ifdef PG_DEBUG_REFS
294 void OSDService::add_pgid(spg_t pgid, PG *pg){
295 Mutex::Locker l(pgid_lock);
296 if (!pgid_tracker.count(pgid)) {
297 live_pgs[pgid] = pg;
298 }
299 pgid_tracker[pgid]++;
300 }
301 void OSDService::remove_pgid(spg_t pgid, PG *pg)
302 {
303 Mutex::Locker l(pgid_lock);
304 assert(pgid_tracker.count(pgid));
305 assert(pgid_tracker[pgid] > 0);
306 pgid_tracker[pgid]--;
307 if (pgid_tracker[pgid] == 0) {
308 pgid_tracker.erase(pgid);
309 live_pgs.erase(pgid);
310 }
311 }
312 void OSDService::dump_live_pgids()
313 {
314 Mutex::Locker l(pgid_lock);
315 derr << "live pgids:" << dendl;
316 for (map<spg_t, int>::const_iterator i = pgid_tracker.cbegin();
317 i != pgid_tracker.cend();
318 ++i) {
319 derr << "\t" << *i << dendl;
320 live_pgs[i->first]->dump_live_ids();
321 }
322 }
323 #endif
324
325
326 void OSDService::_start_split(spg_t parent, const set<spg_t> &children)
327 {
328 for (set<spg_t>::const_iterator i = children.begin();
329 i != children.end();
330 ++i) {
331 dout(10) << __func__ << ": Starting split on pg " << *i
332 << ", parent=" << parent << dendl;
333 assert(!pending_splits.count(*i));
334 assert(!in_progress_splits.count(*i));
335 pending_splits.insert(make_pair(*i, parent));
336
337 assert(!rev_pending_splits[parent].count(*i));
338 rev_pending_splits[parent].insert(*i);
339 }
340 }
341
342 void OSDService::mark_split_in_progress(spg_t parent, const set<spg_t> &children)
343 {
344 Mutex::Locker l(in_progress_split_lock);
345 map<spg_t, set<spg_t> >::iterator piter = rev_pending_splits.find(parent);
346 assert(piter != rev_pending_splits.end());
347 for (set<spg_t>::const_iterator i = children.begin();
348 i != children.end();
349 ++i) {
350 assert(piter->second.count(*i));
351 assert(pending_splits.count(*i));
352 assert(!in_progress_splits.count(*i));
353 assert(pending_splits[*i] == parent);
354
355 pending_splits.erase(*i);
356 piter->second.erase(*i);
357 in_progress_splits.insert(*i);
358 }
359 if (piter->second.empty())
360 rev_pending_splits.erase(piter);
361 }
362
// Locked entry point: takes in_progress_split_lock and then cancels every
// pending split rooted at parent via the unlocked helper.
void OSDService::cancel_pending_splits_for_parent(spg_t parent)
{
  Mutex::Locker l(in_progress_split_lock);
  _cancel_pending_splits_for_parent(parent);
}
368
369 void OSDService::_cancel_pending_splits_for_parent(spg_t parent)
370 {
371 map<spg_t, set<spg_t> >::iterator piter = rev_pending_splits.find(parent);
372 if (piter == rev_pending_splits.end())
373 return;
374
375 for (set<spg_t>::iterator i = piter->second.begin();
376 i != piter->second.end();
377 ++i) {
378 assert(pending_splits.count(*i));
379 assert(!in_progress_splits.count(*i));
380 pending_splits.erase(*i);
381 dout(10) << __func__ << ": Completing split on pg " << *i
382 << " for parent: " << parent << dendl;
383 _cancel_pending_splits_for_parent(*i);
384 }
385 rev_pending_splits.erase(piter);
386 }
387
388 void OSDService::_maybe_split_pgid(OSDMapRef old_map,
389 OSDMapRef new_map,
390 spg_t pgid)
391 {
392 assert(old_map->have_pg_pool(pgid.pool()));
393 int old_pgnum = old_map->get_pg_num(pgid.pool());
394 if (pgid.ps() < static_cast<unsigned>(old_pgnum)) {
395 set<spg_t> children;
396 if (pgid.is_split(old_pgnum,
397 new_map->get_pg_num(pgid.pool()), &children)) {
398 _start_split(pgid, children); }
399 } else {
400 assert(pgid.ps() < static_cast<unsigned>(new_map->get_pg_num(pgid.pool())));
401 }
402 }
403
404 void OSDService::init_splits_between(spg_t pgid,
405 OSDMapRef frommap,
406 OSDMapRef tomap)
407 {
408 // First, check whether we can avoid this potentially expensive check
409 if (tomap->have_pg_pool(pgid.pool()) &&
410 pgid.is_split(
411 frommap->get_pg_num(pgid.pool()),
412 tomap->get_pg_num(pgid.pool()),
413 NULL)) {
414 // Ok, a split happened, so we need to walk the osdmaps
415 set<spg_t> new_pgs; // pgs to scan on each map
416 new_pgs.insert(pgid);
417 OSDMapRef curmap(get_map(frommap->get_epoch()));
418 for (epoch_t e = frommap->get_epoch() + 1;
419 e <= tomap->get_epoch();
420 ++e) {
421 OSDMapRef nextmap(try_get_map(e));
422 if (!nextmap)
423 continue;
424 set<spg_t> even_newer_pgs; // pgs added in this loop
425 for (set<spg_t>::iterator i = new_pgs.begin(); i != new_pgs.end(); ++i) {
426 set<spg_t> split_pgs;
427 if (i->is_split(curmap->get_pg_num(i->pool()),
428 nextmap->get_pg_num(i->pool()),
429 &split_pgs)) {
430 start_split(*i, split_pgs);
431 even_newer_pgs.insert(split_pgs.begin(), split_pgs.end());
432 }
433 }
434 new_pgs.insert(even_newer_pgs.begin(), even_newer_pgs.end());
435 curmap = nextmap;
436 }
437 assert(curmap == tomap); // we must have had both frommap and tomap
438 }
439 }
440
441 void OSDService::expand_pg_num(OSDMapRef old_map,
442 OSDMapRef new_map)
443 {
444 Mutex::Locker l(in_progress_split_lock);
445 for (set<spg_t>::iterator i = in_progress_splits.begin();
446 i != in_progress_splits.end();
447 ) {
448 if (!new_map->have_pg_pool(i->pool())) {
449 in_progress_splits.erase(i++);
450 } else {
451 _maybe_split_pgid(old_map, new_map, *i);
452 ++i;
453 }
454 }
455 for (map<spg_t, spg_t>::iterator i = pending_splits.begin();
456 i != pending_splits.end();
457 ) {
458 if (!new_map->have_pg_pool(i->first.pool())) {
459 rev_pending_splits.erase(i->second);
460 pending_splits.erase(i++);
461 } else {
462 _maybe_split_pgid(old_map, new_map, i->first);
463 ++i;
464 }
465 }
466 }
467
468 bool OSDService::splitting(spg_t pgid)
469 {
470 Mutex::Locker l(in_progress_split_lock);
471 return in_progress_splits.count(pgid) ||
472 pending_splits.count(pgid);
473 }
474
475 void OSDService::complete_split(const set<spg_t> &pgs)
476 {
477 Mutex::Locker l(in_progress_split_lock);
478 for (set<spg_t>::const_iterator i = pgs.begin();
479 i != pgs.end();
480 ++i) {
481 dout(10) << __func__ << ": Completing split on pg " << *i << dendl;
482 assert(!pending_splits.count(*i));
483 assert(in_progress_splits.count(*i));
484 in_progress_splits.erase(*i);
485 }
486 }
487
// Forwards to the owning OSD's need_heartbeat_peer_update().
void OSDService::need_heartbeat_peer_update()
{
  osd->need_heartbeat_peer_update();
}
492
// Forwards to the owning OSD's pg_stat_queue_enqueue() for the given PG.
void OSDService::pg_stat_queue_enqueue(PG *pg)
{
  osd->pg_stat_queue_enqueue(pg);
}
497
// Forwards to the owning OSD's pg_stat_queue_dequeue() for the given PG.
void OSDService::pg_stat_queue_dequeue(PG *pg)
{
  osd->pg_stat_queue_dequeue(pg);
}
502
// First shutdown stage: stops the agent timer and the recovery-sleep
// timer, each while holding its own lock.
void OSDService::start_shutdown()
{
  {
    Mutex::Locker l(agent_timer_lock);
    agent_timer.shutdown();
  }

  {
    Mutex::Locker l(recovery_sleep_lock);
    recovery_sleep_timer.shutdown();
  }
}
515
// Waits for the reserver finisher's queue to empty and then stops it.
void OSDService::shutdown_reserver()
{
  reserver_finisher.wait_for_empty();
  reserver_finisher.stop();
}
521
// Final shutdown stage.  In order: stops the watch timer, shuts down the
// objecter and drains/stops its finisher, stops the recovery-request,
// snap-sleep and scrub-sleep timers (each under its own lock), and finally
// drops the references to the current and next OSDMap.
void OSDService::shutdown()
{
  {
    Mutex::Locker l(watch_lock);
    watch_timer.shutdown();
  }

  objecter->shutdown();
  objecter_finisher.wait_for_empty();
  objecter_finisher.stop();

  {
    Mutex::Locker l(recovery_request_lock);
    recovery_request_timer.shutdown();
  }

  {
    Mutex::Locker l(snap_sleep_lock);
    snap_sleep_timer.shutdown();
  }

  {
    Mutex::Locker l(scrub_sleep_lock);
    scrub_sleep_timer.shutdown();
  }

  // release our map references
  osdmap = OSDMapRef();
  next_osdmap = OSDMapRef();
}
551
// Starts the service's background machinery: the reserver and objecter
// finishers, the watch/agent/snap-sleep/scrub-sleep timers and the tiering
// agent thread.  If osd_recovery_delay_start is non-zero, recovery is
// deferred by that amount.
void OSDService::init()
{
  reserver_finisher.start();
  objecter_finisher.start();
  objecter->set_client_incarnation(0);

  // deprioritize objecter in daemonperf output
  objecter->get_logger()->set_prio_adjust(-3);

  watch_timer.init();
  agent_timer.init();
  snap_sleep_timer.init();
  scrub_sleep_timer.init();

  agent_thread.create("osd_srv_agent");

  if (cct->_conf->osd_recovery_delay_start)
    defer_recovery(cct->_conf->osd_recovery_delay_start);
}
571
// Starts the objecter, handing it the service's current OSDMap.
void OSDService::final_init()
{
  objecter->start(osdmap.get());
}
576
577 void OSDService::activate_map()
578 {
579 // wake/unwake the tiering agent
580 agent_lock.Lock();
581 agent_active =
582 !osdmap->test_flag(CEPH_OSDMAP_NOTIERAGENT) &&
583 osd->is_active();
584 agent_cond.Signal();
585 agent_lock.Unlock();
586 }
587
// Asks the owning OSD to subscribe to osdmaps for epoch e; the second
// argument of osdmap_subscribe() is passed as false.
void OSDService::request_osdmap_update(epoch_t e)
{
  osd->osdmap_subscribe(e, false);
}
592
// Timer callback that holds a reference to a PG and, when fired, calls
// agent_choose_mode_restart() on it.  Scheduled by
// OSDService::agent_entry() when a PG reports no agent work.
class AgentTimeoutCB : public Context {
  PGRef pg;  // keeps the PG referenced until the callback has run
public:
  explicit AgentTimeoutCB(PGRef _pg) : pg(_pg) {}
  // The completion code is ignored.
  void finish(int) override {
    pg->agent_choose_mode_restart();
  }
};
601
// Main loop of the tiering agent thread.
//
// Runs until agent_stop_flag is set.  While holding agent_lock it picks the
// highest-level entry of agent_queue, computes how many ops are still
// allowed, selects the next PG in round-robin order (agent_queue_pos) and
// then drops the lock to let that PG do agent work.  When a PG has nothing
// to do, a timer callback is scheduled to restart its mode selection
// later.  The thread sleeps on agent_cond whenever the queue is empty, the
// quota is exhausted, or the agent is inactive.
void OSDService::agent_entry()
{
  dout(10) << __func__ << " start" << dendl;
  agent_lock.Lock();

  while (!agent_stop_flag) {
    if (agent_queue.empty()) {
      dout(20) << __func__ << " empty queue" << dendl;
      agent_cond.Wait(agent_lock);
      continue;
    }
    // rbegin() is the entry with the largest key
    uint64_t level = agent_queue.rbegin()->first;
    set<PGRef>& top = agent_queue.rbegin()->second;
    dout(10) << __func__
             << " tiers " << agent_queue.size()
             << ", top is " << level
             << " with pgs " << top.size()
             << ", ops " << agent_ops << "/"
             << cct->_conf->osd_agent_max_ops
             << (agent_active ? " active" : " NOT ACTIVE")
             << dendl;
    dout(20) << __func__ << " oids " << agent_oids << dendl;
    // remaining op budget; the flush quota is tighter unless some PG is in
    // high-speed flush mode (flush_mode_high_count != 0)
    int max = cct->_conf->osd_agent_max_ops - agent_ops;
    int agent_flush_quota = max;
    if (!flush_mode_high_count)
      agent_flush_quota = cct->_conf->osd_agent_max_low_ops - agent_ops;
    if (agent_flush_quota <= 0 || top.empty() || !agent_active) {
      agent_cond.Wait(agent_lock);
      continue;
    }

    // (re)start the round-robin position if it is stale or at the end
    if (!agent_valid_iterator || agent_queue_pos == top.end()) {
      agent_queue_pos = top.begin();
      agent_valid_iterator = true;
    }
    PGRef pg = *agent_queue_pos;
    dout(10) << "high_count " << flush_mode_high_count
             << " agent_ops " << agent_ops
             << " flush_quota " << agent_flush_quota << dendl;
    // agent_work() is called without agent_lock held; pg keeps its own
    // reference
    agent_lock.Unlock();
    if (!pg->agent_work(max, agent_flush_quota)) {
      dout(10) << __func__ << " " << pg->get_pgid()
               << " no agent_work, delay for " << cct->_conf->osd_agent_delay_time
               << " seconds" << dendl;

      osd->logger->inc(l_osd_tier_delay);
      // Queue a timer to restart agent mode selection for this pg after
      // osd_agent_delay_time seconds
      agent_timer_lock.Lock();
      Context *cb = new AgentTimeoutCB(pg);
      agent_timer.add_event_after(cct->_conf->osd_agent_delay_time, cb);
      agent_timer_lock.Unlock();
    }
    agent_lock.Lock();
  }
  agent_lock.Unlock();
  dout(10) << __func__ << " finish" << dendl;
}
659
// Stops the tiering agent thread and waits for it to exit.
//
// Preconditions (asserted): no agent ops are outstanding and the agent
// queue is empty.  The stop flag is set and signalled under agent_lock;
// the join happens after the lock has been released.
void OSDService::agent_stop()
{
  {
    Mutex::Locker l(agent_lock);

    // By this time all ops should be cancelled
    assert(agent_ops == 0);
    // By this time all PGs are shutdown and dequeued
    if (!agent_queue.empty()) {
      set<PGRef>& top = agent_queue.rbegin()->second;
      derr << "agent queue not empty, for example " << (*top.begin())->info.pgid << dendl;
      assert(0 == "agent queue not empty");
    }

    agent_stop_flag = true;
    agent_cond.Signal();
  }
  agent_thread.join();
}
679
680 // -------------------------------------
681
682 void OSDService::promote_throttle_recalibrate()
683 {
684 utime_t now = ceph_clock_now();
685 double dur = now - last_recalibrate;
686 last_recalibrate = now;
687 unsigned prob = promote_probability_millis;
688
689 uint64_t target_obj_sec = cct->_conf->osd_tier_promote_max_objects_sec;
690 uint64_t target_bytes_sec = cct->_conf->osd_tier_promote_max_bytes_sec;
691
692 unsigned min_prob = 1;
693
694 uint64_t attempts, obj, bytes;
695 promote_counter.sample_and_attenuate(&attempts, &obj, &bytes);
696 dout(10) << __func__ << " " << attempts << " attempts, promoted "
697 << obj << " objects and " << pretty_si_t(bytes) << " bytes; target "
698 << target_obj_sec << " obj/sec or "
699 << pretty_si_t(target_bytes_sec) << " bytes/sec"
700 << dendl;
701
702 // calculate what the probability *should* be, given the targets
703 unsigned new_prob;
704 if (attempts && dur > 0) {
705 uint64_t avg_size = 1;
706 if (obj)
707 avg_size = MAX(bytes / obj, 1);
708 unsigned po = (double)target_obj_sec * dur * 1000.0 / (double)attempts;
709 unsigned pb = (double)target_bytes_sec / (double)avg_size * dur * 1000.0
710 / (double)attempts;
711 dout(20) << __func__ << " po " << po << " pb " << pb << " avg_size "
712 << avg_size << dendl;
713 if (target_obj_sec && target_bytes_sec)
714 new_prob = MIN(po, pb);
715 else if (target_obj_sec)
716 new_prob = po;
717 else if (target_bytes_sec)
718 new_prob = pb;
719 else
720 new_prob = 1000;
721 } else {
722 new_prob = 1000;
723 }
724 dout(20) << __func__ << " new_prob " << new_prob << dendl;
725
726 // correct for persistent skew between target rate and actual rate, adjust
727 double ratio = 1.0;
728 unsigned actual = 0;
729 if (attempts && obj) {
730 actual = obj * 1000 / attempts;
731 ratio = (double)actual / (double)prob;
732 new_prob = (double)new_prob / ratio;
733 }
734 new_prob = MAX(new_prob, min_prob);
735 new_prob = MIN(new_prob, 1000);
736
737 // adjust
738 prob = (prob + new_prob) / 2;
739 prob = MAX(prob, min_prob);
740 prob = MIN(prob, 1000);
741 dout(10) << __func__ << " actual " << actual
742 << ", actual/prob ratio " << ratio
743 << ", adjusted new_prob " << new_prob
744 << ", prob " << promote_probability_millis << " -> " << prob
745 << dendl;
746 promote_probability_millis = prob;
747
748 // set hard limits for this interval to mitigate stampedes
749 promote_max_objects = target_obj_sec * OSD::OSD_TICK_INTERVAL * 2;
750 promote_max_bytes = target_bytes_sec * OSD::OSD_TICK_INTERVAL * 2;
751 }
752
753 // -------------------------------------
754
755 float OSDService::get_failsafe_full_ratio()
756 {
757 float full_ratio = cct->_conf->osd_failsafe_full_ratio;
758 if (full_ratio > 1.0) full_ratio /= 100.0;
759 return full_ratio;
760 }
761
762 void OSDService::check_full_status(float ratio)
763 {
764 Mutex::Locker l(full_status_lock);
765
766 cur_ratio = ratio;
767
768 // The OSDMap ratios take precendence. So if the failsafe is .95 and
769 // the admin sets the cluster full to .96, the failsafe moves up to .96
770 // too. (Not that having failsafe == full is ideal, but it's better than
771 // dropping writes before the clusters appears full.)
772 OSDMapRef osdmap = get_osdmap();
773 if (!osdmap || osdmap->get_epoch() == 0) {
774 cur_state = NONE;
775 return;
776 }
777 float nearfull_ratio = osdmap->get_nearfull_ratio();
778 float backfillfull_ratio = std::max(osdmap->get_backfillfull_ratio(), nearfull_ratio);
779 float full_ratio = std::max(osdmap->get_full_ratio(), backfillfull_ratio);
780 float failsafe_ratio = std::max(get_failsafe_full_ratio(), full_ratio);
781
782 if (osdmap->require_osd_release < CEPH_RELEASE_LUMINOUS) {
783 // use the failsafe for nearfull and full; the mon isn't using the
784 // flags anyway because we're mid-upgrade.
785 full_ratio = failsafe_ratio;
786 backfillfull_ratio = failsafe_ratio;
787 nearfull_ratio = failsafe_ratio;
788 } else if (full_ratio <= 0 ||
789 backfillfull_ratio <= 0 ||
790 nearfull_ratio <= 0) {
791 derr << __func__ << " full_ratio, backfillfull_ratio or nearfull_ratio is <= 0" << dendl;
792 // use failsafe flag. ick. the monitor did something wrong or the user
793 // did something stupid.
794 full_ratio = failsafe_ratio;
795 backfillfull_ratio = failsafe_ratio;
796 nearfull_ratio = failsafe_ratio;
797 }
798
799 string inject;
800 s_names new_state;
801 if (injectfull_state > NONE && injectfull) {
802 new_state = injectfull_state;
803 inject = "(Injected)";
804 } else if (ratio > failsafe_ratio) {
805 new_state = FAILSAFE;
806 } else if (ratio > full_ratio) {
807 new_state = FULL;
808 } else if (ratio > backfillfull_ratio) {
809 new_state = BACKFILLFULL;
810 } else if (ratio > nearfull_ratio) {
811 new_state = NEARFULL;
812 } else {
813 new_state = NONE;
814 }
815 dout(20) << __func__ << " cur ratio " << ratio
816 << ". nearfull_ratio " << nearfull_ratio
817 << ". backfillfull_ratio " << backfillfull_ratio
818 << ", full_ratio " << full_ratio
819 << ", failsafe_ratio " << failsafe_ratio
820 << ", new state " << get_full_state_name(new_state)
821 << " " << inject
822 << dendl;
823
824 // warn
825 if (cur_state != new_state) {
826 dout(10) << __func__ << " " << get_full_state_name(cur_state)
827 << " -> " << get_full_state_name(new_state) << dendl;
828 if (new_state == FAILSAFE) {
829 clog->error() << "full status failsafe engaged, dropping updates, now "
830 << (int)roundf(ratio * 100) << "% full";
831 } else if (cur_state == FAILSAFE) {
832 clog->error() << "full status failsafe disengaged, no longer dropping "
833 << "updates, now " << (int)roundf(ratio * 100) << "% full";
834 }
835 cur_state = new_state;
836 }
837 }
838
839 bool OSDService::need_fullness_update()
840 {
841 OSDMapRef osdmap = get_osdmap();
842 s_names cur = NONE;
843 if (osdmap->exists(whoami)) {
844 if (osdmap->get_state(whoami) & CEPH_OSD_FULL) {
845 cur = FULL;
846 } else if (osdmap->get_state(whoami) & CEPH_OSD_BACKFILLFULL) {
847 cur = BACKFILLFULL;
848 } else if (osdmap->get_state(whoami) & CEPH_OSD_NEARFULL) {
849 cur = NEARFULL;
850 }
851 }
852 s_names want = NONE;
853 if (is_full())
854 want = FULL;
855 else if (is_backfillfull())
856 want = BACKFILLFULL;
857 else if (is_nearfull())
858 want = NEARFULL;
859 return want != cur;
860 }
861
862 bool OSDService::_check_full(s_names type, ostream &ss) const
863 {
864 Mutex::Locker l(full_status_lock);
865
866 if (injectfull && injectfull_state >= type) {
867 // injectfull is either a count of the number of times to return failsafe full
868 // or if -1 then always return full
869 if (injectfull > 0)
870 --injectfull;
871 ss << "Injected " << get_full_state_name(type) << " OSD ("
872 << (injectfull < 0 ? "set" : std::to_string(injectfull)) << ")";
873 return true;
874 }
875
876 ss << "current usage is " << cur_ratio;
877 return cur_state >= type;
878 }
879
// True if the OSD is in (or injected into) the FAILSAFE state; the reason
// is written to ss.
bool OSDService::check_failsafe_full(ostream &ss) const
{
  return _check_full(FAILSAFE, ss);
}
884
// True if the OSD is at least FULL (or injected so); the reason is written
// to ss.
bool OSDService::check_full(ostream &ss) const
{
  return _check_full(FULL, ss);
}
889
// True if the OSD is at least BACKFILLFULL (or injected so); the reason is
// written to ss.
bool OSDService::check_backfill_full(ostream &ss) const
{
  return _check_full(BACKFILLFULL, ss);
}
894
// True if the OSD is at least NEARFULL (or injected so); the reason is
// written to ss.
bool OSDService::check_nearfull(ostream &ss) const
{
  return _check_full(NEARFULL, ss);
}
899
// True only when the computed state is exactly FAILSAFE.  Unlike
// _check_full(), this does not consult the injectfull counter.
bool OSDService::is_failsafe_full() const
{
  Mutex::Locker l(full_status_lock);
  return cur_state == FAILSAFE;
}
905
// True when the computed state is FULL or more severe.
bool OSDService::is_full() const
{
  Mutex::Locker l(full_status_lock);
  return cur_state >= FULL;
}
911
// True when the computed state is BACKFILLFULL or more severe.
bool OSDService::is_backfillfull() const
{
  Mutex::Locker l(full_status_lock);
  return cur_state >= BACKFILLFULL;
}
917
// True when the computed state is NEARFULL or more severe.
bool OSDService::is_nearfull() const
{
  Mutex::Locker l(full_status_lock);
  return cur_state >= NEARFULL;
}
923
// Arms fullness injection: subsequent checks report `type` as reached.
// `count` is stored in injectfull; _check_full() decrements it while it is
// positive and treats a negative value as "always".
void OSDService::set_injectfull(s_names type, int64_t count)
{
  Mutex::Locker l(full_status_lock);
  injectfull_state = type;
  injectfull = count;
}
930
931 osd_stat_t OSDService::set_osd_stat(const struct store_statfs_t &stbuf,
932 vector<int>& hb_peers,
933 int num_pgs)
934 {
935 uint64_t bytes = stbuf.total;
936 uint64_t used = bytes - stbuf.available;
937 uint64_t avail = stbuf.available;
938
939 osd->logger->set(l_osd_stat_bytes, bytes);
940 osd->logger->set(l_osd_stat_bytes_used, used);
941 osd->logger->set(l_osd_stat_bytes_avail, avail);
942
943 {
944 Mutex::Locker l(stat_lock);
945 osd_stat.hb_peers.swap(hb_peers);
946 osd->op_tracker.get_age_ms_histogram(&osd_stat.op_queue_age_hist);
947 osd_stat.kb = bytes >> 10;
948 osd_stat.kb_used = used >> 10;
949 osd_stat.kb_avail = avail >> 10;
950 osd_stat.num_pgs = num_pgs;
951 return osd_stat;
952 }
953 }
954
955 void OSDService::update_osd_stat(vector<int>& hb_peers)
956 {
957 // load osd stats first
958 struct store_statfs_t stbuf;
959 int r = osd->store->statfs(&stbuf);
960 if (r < 0) {
961 derr << "statfs() failed: " << cpp_strerror(r) << dendl;
962 return;
963 }
964
965 auto new_stat = set_osd_stat(stbuf, hb_peers, osd->get_num_pgs());
966 dout(20) << "update_osd_stat " << new_stat << dendl;
967 assert(new_stat.kb);
968 float ratio = ((float)new_stat.kb_used) / ((float)new_stat.kb);
969 check_full_status(ratio);
970 }
971
972 bool OSDService::check_osdmap_full(const set<pg_shard_t> &missing_on)
973 {
974 OSDMapRef osdmap = get_osdmap();
975 for (auto shard : missing_on) {
976 if (osdmap->get_state(shard.osd) & CEPH_OSD_FULL)
977 return true;
978 }
979 return false;
980 }
981
982 void OSDService::send_message_osd_cluster(int peer, Message *m, epoch_t from_epoch)
983 {
984 OSDMapRef next_map = get_nextmap_reserved();
985 // service map is always newer/newest
986 assert(from_epoch <= next_map->get_epoch());
987
988 if (next_map->is_down(peer) ||
989 next_map->get_info(peer).up_from > from_epoch) {
990 m->put();
991 release_map(next_map);
992 return;
993 }
994 const entity_inst_t& peer_inst = next_map->get_cluster_inst(peer);
995 ConnectionRef peer_con = osd->cluster_messenger->get_connection(peer_inst);
996 share_map_peer(peer, peer_con.get(), next_map);
997 peer_con->send_message(m);
998 release_map(next_map);
999 }
1000
1001 ConnectionRef OSDService::get_con_osd_cluster(int peer, epoch_t from_epoch)
1002 {
1003 OSDMapRef next_map = get_nextmap_reserved();
1004 // service map is always newer/newest
1005 assert(from_epoch <= next_map->get_epoch());
1006
1007 if (next_map->is_down(peer) ||
1008 next_map->get_info(peer).up_from > from_epoch) {
1009 release_map(next_map);
1010 return NULL;
1011 }
1012 ConnectionRef con = osd->cluster_messenger->get_connection(next_map->get_cluster_inst(peer));
1013 release_map(next_map);
1014 return con;
1015 }
1016
1017 pair<ConnectionRef,ConnectionRef> OSDService::get_con_osd_hb(int peer, epoch_t from_epoch)
1018 {
1019 OSDMapRef next_map = get_nextmap_reserved();
1020 // service map is always newer/newest
1021 assert(from_epoch <= next_map->get_epoch());
1022
1023 pair<ConnectionRef,ConnectionRef> ret;
1024 if (next_map->is_down(peer) ||
1025 next_map->get_info(peer).up_from > from_epoch) {
1026 release_map(next_map);
1027 return ret;
1028 }
1029 ret.first = osd->hb_back_client_messenger->get_connection(next_map->get_hb_back_inst(peer));
1030 if (next_map->get_hb_front_addr(peer) != entity_addr_t())
1031 ret.second = osd->hb_front_client_messenger->get_connection(next_map->get_hb_front_inst(peer));
1032 release_map(next_map);
1033 return ret;
1034 }
1035
1036
1037 void OSDService::queue_want_pg_temp(pg_t pgid,
1038 const vector<int>& want,
1039 bool forced)
1040 {
1041 Mutex::Locker l(pg_temp_lock);
1042 auto p = pg_temp_pending.find(pgid);
1043 if (p == pg_temp_pending.end() ||
1044 p->second.acting != want ||
1045 forced) {
1046 pg_temp_wanted[pgid] = pg_temp_t{want, forced};
1047 }
1048 }
1049
1050 void OSDService::remove_want_pg_temp(pg_t pgid)
1051 {
1052 Mutex::Locker l(pg_temp_lock);
1053 pg_temp_wanted.erase(pgid);
1054 pg_temp_pending.erase(pgid);
1055 }
1056
1057 void OSDService::_sent_pg_temp()
1058 {
1059 pg_temp_pending.insert(make_move_iterator(begin(pg_temp_wanted)),
1060 make_move_iterator(end(pg_temp_wanted)));
1061 pg_temp_wanted.clear();
1062 }
1063
1064 void OSDService::requeue_pg_temp()
1065 {
1066 Mutex::Locker l(pg_temp_lock);
1067 // wanted overrides pending. note that remove_want_pg_temp
1068 // clears the item out of both.
1069 unsigned old_wanted = pg_temp_wanted.size();
1070 unsigned old_pending = pg_temp_pending.size();
1071 _sent_pg_temp();
1072 pg_temp_wanted.swap(pg_temp_pending);
1073 dout(10) << __func__ << " " << old_wanted << " + " << old_pending << " -> "
1074 << pg_temp_wanted.size() << dendl;
1075 }
1076
1077 std::ostream& operator<<(std::ostream& out,
1078 const OSDService::pg_temp_t& pg_temp)
1079 {
1080 out << pg_temp.acting;
1081 if (pg_temp.forced) {
1082 out << " (forced)";
1083 }
1084 return out;
1085 }
1086
1087 void OSDService::send_pg_temp()
1088 {
1089 Mutex::Locker l(pg_temp_lock);
1090 if (pg_temp_wanted.empty())
1091 return;
1092 dout(10) << "send_pg_temp " << pg_temp_wanted << dendl;
1093 MOSDPGTemp *ms[2] = {nullptr, nullptr};
1094 for (auto& pg_temp : pg_temp_wanted) {
1095 auto& m = ms[pg_temp.second.forced];
1096 if (!m) {
1097 m = new MOSDPGTemp(osdmap->get_epoch());
1098 m->forced = pg_temp.second.forced;
1099 }
1100 m->pg_temp.emplace(pg_temp.first,
1101 pg_temp.second.acting);
1102 }
1103 for (auto m : ms) {
1104 if (m) {
1105 monc->send_mon_message(m);
1106 }
1107 }
1108 _sent_pg_temp();
1109 }
1110
1111 void OSDService::send_pg_created(pg_t pgid)
1112 {
1113 dout(20) << __func__ << dendl;
1114 if (osdmap->require_osd_release >= CEPH_RELEASE_LUMINOUS) {
1115 monc->send_mon_message(new MOSDPGCreated(pgid));
1116 }
1117 }
1118
1119 // --------------------------------------
1120 // dispatch
1121
1122 epoch_t OSDService::get_peer_epoch(int peer)
1123 {
1124 Mutex::Locker l(peer_map_epoch_lock);
1125 map<int,epoch_t>::iterator p = peer_map_epoch.find(peer);
1126 if (p == peer_map_epoch.end())
1127 return 0;
1128 return p->second;
1129 }
1130
1131 epoch_t OSDService::note_peer_epoch(int peer, epoch_t e)
1132 {
1133 Mutex::Locker l(peer_map_epoch_lock);
1134 map<int,epoch_t>::iterator p = peer_map_epoch.find(peer);
1135 if (p != peer_map_epoch.end()) {
1136 if (p->second < e) {
1137 dout(10) << "note_peer_epoch osd." << peer << " has " << e << dendl;
1138 p->second = e;
1139 } else {
1140 dout(30) << "note_peer_epoch osd." << peer << " has " << p->second << " >= " << e << dendl;
1141 }
1142 return p->second;
1143 } else {
1144 dout(10) << "note_peer_epoch osd." << peer << " now has " << e << dendl;
1145 peer_map_epoch[peer] = e;
1146 return e;
1147 }
1148 }
1149
1150 void OSDService::forget_peer_epoch(int peer, epoch_t as_of)
1151 {
1152 Mutex::Locker l(peer_map_epoch_lock);
1153 map<int,epoch_t>::iterator p = peer_map_epoch.find(peer);
1154 if (p != peer_map_epoch.end()) {
1155 if (p->second <= as_of) {
1156 dout(10) << "forget_peer_epoch osd." << peer << " as_of " << as_of
1157 << " had " << p->second << dendl;
1158 peer_map_epoch.erase(p);
1159 } else {
1160 dout(10) << "forget_peer_epoch osd." << peer << " as_of " << as_of
1161 << " has " << p->second << " - not forgetting" << dendl;
1162 }
1163 }
1164 }
1165
1166 bool OSDService::should_share_map(entity_name_t name, Connection *con,
1167 epoch_t epoch, const OSDMapRef& osdmap,
1168 const epoch_t *sent_epoch_p)
1169 {
1170 dout(20) << "should_share_map "
1171 << name << " " << con->get_peer_addr()
1172 << " " << epoch << dendl;
1173
1174 // does client have old map?
1175 if (name.is_client()) {
1176 bool message_sendmap = epoch < osdmap->get_epoch();
1177 if (message_sendmap && sent_epoch_p) {
1178 dout(20) << "client session last_sent_epoch: "
1179 << *sent_epoch_p
1180 << " versus osdmap epoch " << osdmap->get_epoch() << dendl;
1181 if (*sent_epoch_p < osdmap->get_epoch()) {
1182 return true;
1183 } // else we don't need to send it out again
1184 }
1185 }
1186
1187 if (con->get_messenger() == osd->cluster_messenger &&
1188 con != osd->cluster_messenger->get_loopback_connection() &&
1189 osdmap->is_up(name.num()) &&
1190 (osdmap->get_cluster_addr(name.num()) == con->get_peer_addr() ||
1191 osdmap->get_hb_back_addr(name.num()) == con->get_peer_addr())) {
1192 // remember
1193 epoch_t has = MAX(get_peer_epoch(name.num()), epoch);
1194
1195 // share?
1196 if (has < osdmap->get_epoch()) {
1197 dout(10) << name << " " << con->get_peer_addr()
1198 << " has old map " << epoch << " < "
1199 << osdmap->get_epoch() << dendl;
1200 return true;
1201 }
1202 }
1203
1204 return false;
1205 }
1206
1207 void OSDService::share_map(
1208 entity_name_t name,
1209 Connection *con,
1210 epoch_t epoch,
1211 OSDMapRef& osdmap,
1212 epoch_t *sent_epoch_p)
1213 {
1214 dout(20) << "share_map "
1215 << name << " " << con->get_peer_addr()
1216 << " " << epoch << dendl;
1217
1218 if (!osd->is_active()) {
1219 /*It is safe not to proceed as OSD is not in healthy state*/
1220 return;
1221 }
1222
1223 bool want_shared = should_share_map(name, con, epoch,
1224 osdmap, sent_epoch_p);
1225
1226 if (want_shared){
1227 if (name.is_client()) {
1228 dout(10) << name << " has old map " << epoch
1229 << " < " << osdmap->get_epoch() << dendl;
1230 // we know the Session is valid or we wouldn't be sending
1231 if (sent_epoch_p) {
1232 *sent_epoch_p = osdmap->get_epoch();
1233 }
1234 send_incremental_map(epoch, con, osdmap);
1235 } else if (con->get_messenger() == osd->cluster_messenger &&
1236 osdmap->is_up(name.num()) &&
1237 (osdmap->get_cluster_addr(name.num()) == con->get_peer_addr() ||
1238 osdmap->get_hb_back_addr(name.num()) == con->get_peer_addr())) {
1239 dout(10) << name << " " << con->get_peer_addr()
1240 << " has old map " << epoch << " < "
1241 << osdmap->get_epoch() << dendl;
1242 note_peer_epoch(name.num(), osdmap->get_epoch());
1243 send_incremental_map(epoch, con, osdmap);
1244 }
1245 }
1246 }
1247
1248 void OSDService::share_map_peer(int peer, Connection *con, OSDMapRef map)
1249 {
1250 if (!map)
1251 map = get_osdmap();
1252
1253 // send map?
1254 epoch_t pe = get_peer_epoch(peer);
1255 if (pe) {
1256 if (pe < map->get_epoch()) {
1257 send_incremental_map(pe, con, map);
1258 note_peer_epoch(peer, map->get_epoch());
1259 } else
1260 dout(20) << "share_map_peer " << con << " already has epoch " << pe << dendl;
1261 } else {
1262 dout(20) << "share_map_peer " << con << " don't know epoch, doing nothing" << dendl;
1263 // no idea about peer's epoch.
1264 // ??? send recent ???
1265 // do nothing.
1266 }
1267 }
1268
1269 bool OSDService::can_inc_scrubs_pending()
1270 {
1271 bool can_inc = false;
1272 Mutex::Locker l(sched_scrub_lock);
1273
1274 if (scrubs_pending + scrubs_active < cct->_conf->osd_max_scrubs) {
1275 dout(20) << __func__ << " " << scrubs_pending << " -> " << (scrubs_pending+1)
1276 << " (max " << cct->_conf->osd_max_scrubs << ", active " << scrubs_active << ")" << dendl;
1277 can_inc = true;
1278 } else {
1279 dout(20) << __func__ << scrubs_pending << " + " << scrubs_active << " active >= max " << cct->_conf->osd_max_scrubs << dendl;
1280 }
1281
1282 return can_inc;
1283 }
1284
1285 bool OSDService::inc_scrubs_pending()
1286 {
1287 bool result = false;
1288
1289 sched_scrub_lock.Lock();
1290 if (scrubs_pending + scrubs_active < cct->_conf->osd_max_scrubs) {
1291 dout(20) << "inc_scrubs_pending " << scrubs_pending << " -> " << (scrubs_pending+1)
1292 << " (max " << cct->_conf->osd_max_scrubs << ", active " << scrubs_active << ")" << dendl;
1293 result = true;
1294 ++scrubs_pending;
1295 } else {
1296 dout(20) << "inc_scrubs_pending " << scrubs_pending << " + " << scrubs_active << " active >= max " << cct->_conf->osd_max_scrubs << dendl;
1297 }
1298 sched_scrub_lock.Unlock();
1299
1300 return result;
1301 }
1302
1303 void OSDService::dec_scrubs_pending()
1304 {
1305 sched_scrub_lock.Lock();
1306 dout(20) << "dec_scrubs_pending " << scrubs_pending << " -> " << (scrubs_pending-1)
1307 << " (max " << cct->_conf->osd_max_scrubs << ", active " << scrubs_active << ")" << dendl;
1308 --scrubs_pending;
1309 assert(scrubs_pending >= 0);
1310 sched_scrub_lock.Unlock();
1311 }
1312
1313 void OSDService::inc_scrubs_active(bool reserved)
1314 {
1315 sched_scrub_lock.Lock();
1316 ++(scrubs_active);
1317 if (reserved) {
1318 --(scrubs_pending);
1319 dout(20) << "inc_scrubs_active " << (scrubs_active-1) << " -> " << scrubs_active
1320 << " (max " << cct->_conf->osd_max_scrubs
1321 << ", pending " << (scrubs_pending+1) << " -> " << scrubs_pending << ")" << dendl;
1322 assert(scrubs_pending >= 0);
1323 } else {
1324 dout(20) << "inc_scrubs_active " << (scrubs_active-1) << " -> " << scrubs_active
1325 << " (max " << cct->_conf->osd_max_scrubs
1326 << ", pending " << scrubs_pending << ")" << dendl;
1327 }
1328 sched_scrub_lock.Unlock();
1329 }
1330
1331 void OSDService::dec_scrubs_active()
1332 {
1333 sched_scrub_lock.Lock();
1334 dout(20) << "dec_scrubs_active " << scrubs_active << " -> " << (scrubs_active-1)
1335 << " (max " << cct->_conf->osd_max_scrubs << ", pending " << scrubs_pending << ")" << dendl;
1336 --scrubs_active;
1337 assert(scrubs_active >= 0);
1338 sched_scrub_lock.Unlock();
1339 }
1340
1341 void OSDService::retrieve_epochs(epoch_t *_boot_epoch, epoch_t *_up_epoch,
1342 epoch_t *_bind_epoch) const
1343 {
1344 Mutex::Locker l(epoch_lock);
1345 if (_boot_epoch)
1346 *_boot_epoch = boot_epoch;
1347 if (_up_epoch)
1348 *_up_epoch = up_epoch;
1349 if (_bind_epoch)
1350 *_bind_epoch = bind_epoch;
1351 }
1352
1353 void OSDService::set_epochs(const epoch_t *_boot_epoch, const epoch_t *_up_epoch,
1354 const epoch_t *_bind_epoch)
1355 {
1356 Mutex::Locker l(epoch_lock);
1357 if (_boot_epoch) {
1358 assert(*_boot_epoch == 0 || *_boot_epoch >= boot_epoch);
1359 boot_epoch = *_boot_epoch;
1360 }
1361 if (_up_epoch) {
1362 assert(*_up_epoch == 0 || *_up_epoch >= up_epoch);
1363 up_epoch = *_up_epoch;
1364 }
1365 if (_bind_epoch) {
1366 assert(*_bind_epoch == 0 || *_bind_epoch >= bind_epoch);
1367 bind_epoch = *_bind_epoch;
1368 }
1369 }
1370
1371 bool OSDService::prepare_to_stop()
1372 {
1373 Mutex::Locker l(is_stopping_lock);
1374 if (get_state() != NOT_STOPPING)
1375 return false;
1376
1377 OSDMapRef osdmap = get_osdmap();
1378 if (osdmap && osdmap->is_up(whoami)) {
1379 dout(0) << __func__ << " telling mon we are shutting down" << dendl;
1380 set_state(PREPARING_TO_STOP);
1381 monc->send_mon_message(new MOSDMarkMeDown(monc->get_fsid(),
1382 osdmap->get_inst(whoami),
1383 osdmap->get_epoch(),
1384 true // request ack
1385 ));
1386 utime_t now = ceph_clock_now();
1387 utime_t timeout;
1388 timeout.set_from_double(now + cct->_conf->osd_mon_shutdown_timeout);
1389 while ((ceph_clock_now() < timeout) &&
1390 (get_state() != STOPPING)) {
1391 is_stopping_cond.WaitUntil(is_stopping_lock, timeout);
1392 }
1393 }
1394 dout(0) << __func__ << " starting shutdown" << dendl;
1395 set_state(STOPPING);
1396 return true;
1397 }
1398
1399 void OSDService::got_stop_ack()
1400 {
1401 Mutex::Locker l(is_stopping_lock);
1402 if (get_state() == PREPARING_TO_STOP) {
1403 dout(0) << __func__ << " starting shutdown" << dendl;
1404 set_state(STOPPING);
1405 is_stopping_cond.Signal();
1406 } else {
1407 dout(10) << __func__ << " ignoring msg" << dendl;
1408 }
1409 }
1410
1411 MOSDMap *OSDService::build_incremental_map_msg(epoch_t since, epoch_t to,
1412 OSDSuperblock& sblock)
1413 {
1414 MOSDMap *m = new MOSDMap(monc->get_fsid());
1415 m->oldest_map = max_oldest_map;
1416 m->newest_map = sblock.newest_map;
1417
1418 for (epoch_t e = to; e > since; e--) {
1419 bufferlist bl;
1420 if (e > m->oldest_map && get_inc_map_bl(e, bl)) {
1421 m->incremental_maps[e].claim(bl);
1422 } else if (get_map_bl(e, bl)) {
1423 m->maps[e].claim(bl);
1424 break;
1425 } else {
1426 derr << "since " << since << " to " << to
1427 << " oldest " << m->oldest_map << " newest " << m->newest_map
1428 << dendl;
1429 m->put();
1430 m = NULL;
1431 break;
1432 }
1433 }
1434 return m;
1435 }
1436
1437 void OSDService::send_map(MOSDMap *m, Connection *con)
1438 {
1439 con->send_message(m);
1440 }
1441
1442 void OSDService::send_incremental_map(epoch_t since, Connection *con,
1443 OSDMapRef& osdmap)
1444 {
1445 epoch_t to = osdmap->get_epoch();
1446 dout(10) << "send_incremental_map " << since << " -> " << to
1447 << " to " << con << " " << con->get_peer_addr() << dendl;
1448
1449 MOSDMap *m = NULL;
1450 while (!m) {
1451 OSDSuperblock sblock(get_superblock());
1452 if (since < sblock.oldest_map) {
1453 // just send latest full map
1454 MOSDMap *m = new MOSDMap(monc->get_fsid());
1455 m->oldest_map = max_oldest_map;
1456 m->newest_map = sblock.newest_map;
1457 get_map_bl(to, m->maps[to]);
1458 send_map(m, con);
1459 return;
1460 }
1461
1462 if (to > since && (int64_t)(to - since) > cct->_conf->osd_map_share_max_epochs) {
1463 dout(10) << " " << (to - since) << " > max " << cct->_conf->osd_map_share_max_epochs
1464 << ", only sending most recent" << dendl;
1465 since = to - cct->_conf->osd_map_share_max_epochs;
1466 }
1467
1468 if (to - since > (epoch_t)cct->_conf->osd_map_message_max)
1469 to = since + cct->_conf->osd_map_message_max;
1470 m = build_incremental_map_msg(since, to, sblock);
1471 }
1472 send_map(m, con);
1473 }
1474
1475 bool OSDService::_get_map_bl(epoch_t e, bufferlist& bl)
1476 {
1477 bool found = map_bl_cache.lookup(e, &bl);
1478 if (found) {
1479 if (logger)
1480 logger->inc(l_osd_map_bl_cache_hit);
1481 return true;
1482 }
1483 if (logger)
1484 logger->inc(l_osd_map_bl_cache_miss);
1485 found = store->read(coll_t::meta(),
1486 OSD::get_osdmap_pobject_name(e), 0, 0, bl,
1487 CEPH_OSD_OP_FLAG_FADVISE_WILLNEED) >= 0;
1488 if (found) {
1489 _add_map_bl(e, bl);
1490 }
1491 return found;
1492 }
1493
1494 bool OSDService::get_inc_map_bl(epoch_t e, bufferlist& bl)
1495 {
1496 Mutex::Locker l(map_cache_lock);
1497 bool found = map_bl_inc_cache.lookup(e, &bl);
1498 if (found) {
1499 if (logger)
1500 logger->inc(l_osd_map_bl_cache_hit);
1501 return true;
1502 }
1503 if (logger)
1504 logger->inc(l_osd_map_bl_cache_miss);
1505 found = store->read(coll_t::meta(),
1506 OSD::get_inc_osdmap_pobject_name(e), 0, 0, bl,
1507 CEPH_OSD_OP_FLAG_FADVISE_WILLNEED) >= 0;
1508 if (found) {
1509 _add_map_inc_bl(e, bl);
1510 }
1511 return found;
1512 }
1513
1514 void OSDService::_add_map_bl(epoch_t e, bufferlist& bl)
1515 {
1516 dout(10) << "add_map_bl " << e << " " << bl.length() << " bytes" << dendl;
1517 // cache a contiguous buffer
1518 if (bl.get_num_buffers() > 1) {
1519 bl.rebuild();
1520 }
1521 bl.try_assign_to_mempool(mempool::mempool_osd_mapbl);
1522 map_bl_cache.add(e, bl);
1523 }
1524
1525 void OSDService::_add_map_inc_bl(epoch_t e, bufferlist& bl)
1526 {
1527 dout(10) << "add_map_inc_bl " << e << " " << bl.length() << " bytes" << dendl;
1528 // cache a contiguous buffer
1529 if (bl.get_num_buffers() > 1) {
1530 bl.rebuild();
1531 }
1532 bl.try_assign_to_mempool(mempool::mempool_osd_mapbl);
1533 map_bl_inc_cache.add(e, bl);
1534 }
1535
1536 void OSDService::pin_map_inc_bl(epoch_t e, bufferlist &bl)
1537 {
1538 Mutex::Locker l(map_cache_lock);
1539 // cache a contiguous buffer
1540 if (bl.get_num_buffers() > 1) {
1541 bl.rebuild();
1542 }
1543 map_bl_inc_cache.pin(e, bl);
1544 }
1545
1546 void OSDService::pin_map_bl(epoch_t e, bufferlist &bl)
1547 {
1548 Mutex::Locker l(map_cache_lock);
1549 // cache a contiguous buffer
1550 if (bl.get_num_buffers() > 1) {
1551 bl.rebuild();
1552 }
1553 map_bl_cache.pin(e, bl);
1554 }
1555
1556 void OSDService::clear_map_bl_cache_pins(epoch_t e)
1557 {
1558 Mutex::Locker l(map_cache_lock);
1559 map_bl_inc_cache.clear_pinned(e);
1560 map_bl_cache.clear_pinned(e);
1561 }
1562
1563 OSDMapRef OSDService::_add_map(OSDMap *o)
1564 {
1565 epoch_t e = o->get_epoch();
1566
1567 if (cct->_conf->osd_map_dedup) {
1568 // Dedup against an existing map at a nearby epoch
1569 OSDMapRef for_dedup = map_cache.lower_bound(e);
1570 if (for_dedup) {
1571 OSDMap::dedup(for_dedup.get(), o);
1572 }
1573 }
1574 bool existed;
1575 OSDMapRef l = map_cache.add(e, o, &existed);
1576 if (existed) {
1577 delete o;
1578 }
1579 return l;
1580 }
1581
1582 OSDMapRef OSDService::try_get_map(epoch_t epoch)
1583 {
1584 Mutex::Locker l(map_cache_lock);
1585 OSDMapRef retval = map_cache.lookup(epoch);
1586 if (retval) {
1587 dout(30) << "get_map " << epoch << " -cached" << dendl;
1588 if (logger) {
1589 logger->inc(l_osd_map_cache_hit);
1590 }
1591 return retval;
1592 }
1593 if (logger) {
1594 logger->inc(l_osd_map_cache_miss);
1595 epoch_t lb = map_cache.cached_key_lower_bound();
1596 if (epoch < lb) {
1597 dout(30) << "get_map " << epoch << " - miss, below lower bound" << dendl;
1598 logger->inc(l_osd_map_cache_miss_low);
1599 logger->inc(l_osd_map_cache_miss_low_avg, lb - epoch);
1600 }
1601 }
1602
1603 OSDMap *map = new OSDMap;
1604 if (epoch > 0) {
1605 dout(20) << "get_map " << epoch << " - loading and decoding " << map << dendl;
1606 bufferlist bl;
1607 if (!_get_map_bl(epoch, bl) || bl.length() == 0) {
1608 derr << "failed to load OSD map for epoch " << epoch << ", got " << bl.length() << " bytes" << dendl;
1609 delete map;
1610 return OSDMapRef();
1611 }
1612 map->decode(bl);
1613 } else {
1614 dout(20) << "get_map " << epoch << " - return initial " << map << dendl;
1615 }
1616 return _add_map(map);
1617 }
1618
1619 // ops
1620
1621
1622 void OSDService::reply_op_error(OpRequestRef op, int err)
1623 {
1624 reply_op_error(op, err, eversion_t(), 0);
1625 }
1626
1627 void OSDService::reply_op_error(OpRequestRef op, int err, eversion_t v,
1628 version_t uv)
1629 {
1630 const MOSDOp *m = static_cast<const MOSDOp*>(op->get_req());
1631 assert(m->get_type() == CEPH_MSG_OSD_OP);
1632 int flags;
1633 flags = m->get_flags() & (CEPH_OSD_FLAG_ACK|CEPH_OSD_FLAG_ONDISK);
1634
1635 MOSDOpReply *reply = new MOSDOpReply(m, err, osdmap->get_epoch(), flags,
1636 true);
1637 reply->set_reply_versions(v, uv);
1638 m->get_connection()->send_message(reply);
1639 }
1640
1641 void OSDService::handle_misdirected_op(PG *pg, OpRequestRef op)
1642 {
1643 if (!cct->_conf->osd_debug_misdirected_ops) {
1644 return;
1645 }
1646
1647 const MOSDOp *m = static_cast<const MOSDOp*>(op->get_req());
1648 assert(m->get_type() == CEPH_MSG_OSD_OP);
1649
1650 assert(m->get_map_epoch() >= pg->info.history.same_primary_since);
1651
1652 if (pg->is_ec_pg()) {
1653 /**
1654 * OSD recomputes op target based on current OSDMap. With an EC pg, we
1655 * can get this result:
1656 * 1) client at map 512 sends an op to osd 3, pg_t 3.9 based on mapping
1657 * [CRUSH_ITEM_NONE, 2, 3]/3
1658 * 2) OSD 3 at map 513 remaps op to osd 3, spg_t 3.9s0 based on mapping
1659 * [3, 2, 3]/3
1660 * 3) PG 3.9s0 dequeues the op at epoch 512 and notices that it isn't primary
1661 * -- misdirected op
1662 * 4) client resends and this time PG 3.9s0 having caught up to 513 gets
1663 * it and fulfils it
1664 *
1665 * We can't compute the op target based on the sending map epoch due to
1666 * splitting. The simplest thing is to detect such cases here and drop
1667 * them without an error (the client will resend anyway).
1668 */
1669 assert(m->get_map_epoch() <= superblock.newest_map);
1670 OSDMapRef opmap = try_get_map(m->get_map_epoch());
1671 if (!opmap) {
1672 dout(7) << __func__ << ": " << *pg << " no longer have map for "
1673 << m->get_map_epoch() << ", dropping" << dendl;
1674 return;
1675 }
1676 pg_t _pgid = m->get_raw_pg();
1677 spg_t pgid;
1678 if ((m->get_flags() & CEPH_OSD_FLAG_PGOP) == 0)
1679 _pgid = opmap->raw_pg_to_pg(_pgid);
1680 if (opmap->get_primary_shard(_pgid, &pgid) &&
1681 pgid.shard != pg->info.pgid.shard) {
1682 dout(7) << __func__ << ": " << *pg << " primary changed since "
1683 << m->get_map_epoch() << ", dropping" << dendl;
1684 return;
1685 }
1686 }
1687
1688 dout(7) << *pg << " misdirected op in " << m->get_map_epoch() << dendl;
1689 clog->warn() << m->get_source_inst() << " misdirected " << m->get_reqid()
1690 << " pg " << m->get_raw_pg()
1691 << " to osd." << whoami
1692 << " not " << pg->acting
1693 << " in e" << m->get_map_epoch() << "/" << osdmap->get_epoch();
1694 }
1695
1696 void OSDService::enqueue_back(spg_t pgid, PGQueueable qi)
1697 {
1698 osd->op_shardedwq.queue(make_pair(pgid, qi));
1699 }
1700
1701 void OSDService::enqueue_front(spg_t pgid, PGQueueable qi)
1702 {
1703 osd->op_shardedwq.queue_front(make_pair(pgid, qi));
1704 }
1705
1706 void OSDService::queue_for_peering(PG *pg)
1707 {
1708 peering_wq.queue(pg);
1709 }
1710
1711 void OSDService::queue_for_snap_trim(PG *pg)
1712 {
1713 dout(10) << "queueing " << *pg << " for snaptrim" << dendl;
1714 osd->op_shardedwq.queue(
1715 make_pair(
1716 pg->info.pgid,
1717 PGQueueable(
1718 PGSnapTrim(pg->get_osdmap()->get_epoch()),
1719 cct->_conf->osd_snap_trim_cost,
1720 cct->_conf->osd_snap_trim_priority,
1721 ceph_clock_now(),
1722 entity_inst_t(),
1723 pg->get_osdmap()->get_epoch())));
1724 }
1725
1726
1727 // ====================================================================
1728 // OSD
1729
1730 #undef dout_prefix
1731 #define dout_prefix *_dout
1732
1733 // Commands shared between OSD's console and admin console:
1734 namespace ceph {
1735 namespace osd_cmds {
1736
1737 int heap(CephContext& cct, cmdmap_t& cmdmap, Formatter& f, std::ostream& os);
1738
1739 }} // namespace ceph::osd_cmds
1740
1741 int OSD::mkfs(CephContext *cct, ObjectStore *store, const string &dev,
1742 uuid_d fsid, int whoami)
1743 {
1744 int ret;
1745
1746 ceph::shared_ptr<ObjectStore::Sequencer> osr(
1747 new ObjectStore::Sequencer("mkfs"));
1748 OSDSuperblock sb;
1749 bufferlist sbbl;
1750 C_SaferCond waiter;
1751
1752 // if we are fed a uuid for this osd, use it.
1753 store->set_fsid(cct->_conf->osd_uuid);
1754
1755 ret = store->mkfs();
1756 if (ret) {
1757 derr << "OSD::mkfs: ObjectStore::mkfs failed with error "
1758 << cpp_strerror(ret) << dendl;
1759 goto free_store;
1760 }
1761
1762 store->set_cache_shards(1); // doesn't matter for mkfs!
1763
1764 ret = store->mount();
1765 if (ret) {
1766 derr << "OSD::mkfs: couldn't mount ObjectStore: error "
1767 << cpp_strerror(ret) << dendl;
1768 goto free_store;
1769 }
1770
1771 ret = store->read(coll_t::meta(), OSD_SUPERBLOCK_GOBJECT, 0, 0, sbbl);
1772 if (ret >= 0) {
1773 /* if we already have superblock, check content of superblock */
1774 dout(0) << " have superblock" << dendl;
1775 bufferlist::iterator p;
1776 p = sbbl.begin();
1777 ::decode(sb, p);
1778 if (whoami != sb.whoami) {
1779 derr << "provided osd id " << whoami << " != superblock's " << sb.whoami
1780 << dendl;
1781 ret = -EINVAL;
1782 goto umount_store;
1783 }
1784 if (fsid != sb.cluster_fsid) {
1785 derr << "provided cluster fsid " << fsid
1786 << " != superblock's " << sb.cluster_fsid << dendl;
1787 ret = -EINVAL;
1788 goto umount_store;
1789 }
1790 } else {
1791 // create superblock
1792 sb.cluster_fsid = fsid;
1793 sb.osd_fsid = store->get_fsid();
1794 sb.whoami = whoami;
1795 sb.compat_features = get_osd_initial_compat_set();
1796
1797 bufferlist bl;
1798 ::encode(sb, bl);
1799
1800 ObjectStore::Transaction t;
1801 t.create_collection(coll_t::meta(), 0);
1802 t.write(coll_t::meta(), OSD_SUPERBLOCK_GOBJECT, 0, bl.length(), bl);
1803 ret = store->apply_transaction(osr.get(), std::move(t));
1804 if (ret) {
1805 derr << "OSD::mkfs: error while writing OSD_SUPERBLOCK_GOBJECT: "
1806 << "apply_transaction returned " << cpp_strerror(ret) << dendl;
1807 goto umount_store;
1808 }
1809 }
1810
1811 if (!osr->flush_commit(&waiter)) {
1812 waiter.wait();
1813 }
1814
1815 ret = write_meta(cct, store, sb.cluster_fsid, sb.osd_fsid, whoami);
1816 if (ret) {
1817 derr << "OSD::mkfs: failed to write fsid file: error "
1818 << cpp_strerror(ret) << dendl;
1819 goto umount_store;
1820 }
1821
1822 umount_store:
1823 store->umount();
1824 free_store:
1825 delete store;
1826 return ret;
1827 }
1828
1829 int OSD::write_meta(CephContext *cct, ObjectStore *store, uuid_d& cluster_fsid, uuid_d& osd_fsid, int whoami)
1830 {
1831 char val[80];
1832 int r;
1833
1834 snprintf(val, sizeof(val), "%s", CEPH_OSD_ONDISK_MAGIC);
1835 r = store->write_meta("magic", val);
1836 if (r < 0)
1837 return r;
1838
1839 snprintf(val, sizeof(val), "%d", whoami);
1840 r = store->write_meta("whoami", val);
1841 if (r < 0)
1842 return r;
1843
1844 cluster_fsid.print(val);
1845 r = store->write_meta("ceph_fsid", val);
1846 if (r < 0)
1847 return r;
1848
1849 string key = cct->_conf->get_val<string>("key");
1850 if (key.size()) {
1851 r = store->write_meta("osd_key", key);
1852 if (r < 0)
1853 return r;
1854 } else {
1855 string keyfile = cct->_conf->get_val<string>("keyfile");
1856 if (!keyfile.empty()) {
1857 bufferlist keybl;
1858 string err;
1859 if (keyfile == "-") {
1860 static_assert(1024 * 1024 >
1861 (sizeof(CryptoKey) - sizeof(bufferptr) +
1862 sizeof(__u16) + 16 /* AES_KEY_LEN */ + 3 - 1) / 3. * 4.,
1863 "1MB should be enough for a base64 encoded CryptoKey");
1864 r = keybl.read_fd(STDIN_FILENO, 1024 * 1024);
1865 } else {
1866 r = keybl.read_file(keyfile.c_str(), &err);
1867 }
1868 if (r < 0) {
1869 derr << __func__ << " failed to read keyfile " << keyfile << ": "
1870 << err << ": " << cpp_strerror(r) << dendl;
1871 return r;
1872 }
1873 r = store->write_meta("osd_key", keybl.to_str());
1874 if (r < 0)
1875 return r;
1876 }
1877 }
1878
1879 r = store->write_meta("ready", "ready");
1880 if (r < 0)
1881 return r;
1882
1883 return 0;
1884 }
1885
1886 int OSD::peek_meta(ObjectStore *store, std::string& magic,
1887 uuid_d& cluster_fsid, uuid_d& osd_fsid, int& whoami)
1888 {
1889 string val;
1890
1891 int r = store->read_meta("magic", &val);
1892 if (r < 0)
1893 return r;
1894 magic = val;
1895
1896 r = store->read_meta("whoami", &val);
1897 if (r < 0)
1898 return r;
1899 whoami = atoi(val.c_str());
1900
1901 r = store->read_meta("ceph_fsid", &val);
1902 if (r < 0)
1903 return r;
1904 r = cluster_fsid.parse(val.c_str());
1905 if (!r)
1906 return -EINVAL;
1907
1908 r = store->read_meta("fsid", &val);
1909 if (r < 0) {
1910 osd_fsid = uuid_d();
1911 } else {
1912 r = osd_fsid.parse(val.c_str());
1913 if (!r)
1914 return -EINVAL;
1915 }
1916
1917 return 0;
1918 }
1919
1920
1921 #undef dout_prefix
1922 #define dout_prefix _prefix(_dout, whoami, get_osdmap_epoch())
1923
1924 // cons/des
1925
1926 OSD::OSD(CephContext *cct_, ObjectStore *store_,
1927 int id,
1928 Messenger *internal_messenger,
1929 Messenger *external_messenger,
1930 Messenger *hb_client_front,
1931 Messenger *hb_client_back,
1932 Messenger *hb_front_serverm,
1933 Messenger *hb_back_serverm,
1934 Messenger *osdc_messenger,
1935 MonClient *mc,
1936 const std::string &dev, const std::string &jdev) :
1937 Dispatcher(cct_),
1938 osd_lock("OSD::osd_lock"),
1939 tick_timer(cct, osd_lock),
1940 tick_timer_lock("OSD::tick_timer_lock"),
1941 tick_timer_without_osd_lock(cct, tick_timer_lock),
1942 authorize_handler_cluster_registry(new AuthAuthorizeHandlerRegistry(cct,
1943 cct->_conf->auth_supported.empty() ?
1944 cct->_conf->auth_cluster_required :
1945 cct->_conf->auth_supported)),
1946 authorize_handler_service_registry(new AuthAuthorizeHandlerRegistry(cct,
1947 cct->_conf->auth_supported.empty() ?
1948 cct->_conf->auth_service_required :
1949 cct->_conf->auth_supported)),
1950 cluster_messenger(internal_messenger),
1951 client_messenger(external_messenger),
1952 objecter_messenger(osdc_messenger),
1953 monc(mc),
1954 mgrc(cct_, client_messenger),
1955 logger(NULL),
1956 recoverystate_perf(NULL),
1957 store(store_),
1958 log_client(cct, client_messenger, &mc->monmap, LogClient::NO_FLAGS),
1959 clog(log_client.create_channel()),
1960 whoami(id),
1961 dev_path(dev), journal_path(jdev),
1962 store_is_rotational(store->is_rotational()),
1963 trace_endpoint("0.0.0.0", 0, "osd"),
1964 asok_hook(NULL),
1965 osd_compat(get_osd_compat_set()),
1966 peering_tp(cct, "OSD::peering_tp", "tp_peering",
1967 cct->_conf->osd_peering_wq_threads,
1968 "osd_peering_tp_threads"),
1969 osd_op_tp(cct, "OSD::osd_op_tp", "tp_osd_tp",
1970 get_num_op_threads()),
1971 disk_tp(cct, "OSD::disk_tp", "tp_osd_disk", cct->_conf->osd_disk_threads, "osd_disk_threads"),
1972 command_tp(cct, "OSD::command_tp", "tp_osd_cmd", 1),
1973 session_waiting_lock("OSD::session_waiting_lock"),
1974 osdmap_subscribe_lock("OSD::osdmap_subscribe_lock"),
1975 heartbeat_lock("OSD::heartbeat_lock"),
1976 heartbeat_stop(false),
1977 heartbeat_need_update(true),
1978 hb_front_client_messenger(hb_client_front),
1979 hb_back_client_messenger(hb_client_back),
1980 hb_front_server_messenger(hb_front_serverm),
1981 hb_back_server_messenger(hb_back_serverm),
1982 daily_loadavg(0.0),
1983 heartbeat_thread(this),
1984 heartbeat_dispatcher(this),
1985 op_tracker(cct, cct->_conf->osd_enable_op_tracker,
1986 cct->_conf->osd_num_op_tracker_shard),
1987 test_ops_hook(NULL),
1988 op_queue(get_io_queue()),
1989 op_prio_cutoff(get_io_prio_cut()),
1990 op_shardedwq(
1991 get_num_op_shards(),
1992 this,
1993 cct->_conf->osd_op_thread_timeout,
1994 cct->_conf->osd_op_thread_suicide_timeout,
1995 &osd_op_tp),
1996 peering_wq(
1997 this,
1998 cct->_conf->osd_op_thread_timeout,
1999 cct->_conf->osd_op_thread_suicide_timeout,
2000 &peering_tp),
2001 map_lock("OSD::map_lock"),
2002 pg_map_lock("OSD::pg_map_lock"),
2003 last_pg_create_epoch(0),
2004 mon_report_lock("OSD::mon_report_lock"),
2005 stats_ack_timeout(cct->_conf->osd_mon_ack_timeout),
2006 up_thru_wanted(0),
2007 requested_full_first(0),
2008 requested_full_last(0),
2009 pg_stat_queue_lock("OSD::pg_stat_queue_lock"),
2010 osd_stat_updated(false),
2011 pg_stat_tid(0), pg_stat_tid_flushed(0),
2012 command_wq(
2013 this,
2014 cct->_conf->osd_command_thread_timeout,
2015 cct->_conf->osd_command_thread_suicide_timeout,
2016 &command_tp),
2017 remove_wq(
2018 cct,
2019 store,
2020 cct->_conf->osd_remove_thread_timeout,
2021 cct->_conf->osd_remove_thread_suicide_timeout,
2022 &disk_tp),
2023 service(this)
2024 {
2025 monc->set_messenger(client_messenger);
2026 op_tracker.set_complaint_and_threshold(cct->_conf->osd_op_complaint_time,
2027 cct->_conf->osd_op_log_threshold);
2028 op_tracker.set_history_size_and_duration(cct->_conf->osd_op_history_size,
2029 cct->_conf->osd_op_history_duration);
2030 op_tracker.set_history_slow_op_size_and_threshold(cct->_conf->osd_op_history_slow_op_size,
2031 cct->_conf->osd_op_history_slow_op_threshold);
2032 #ifdef WITH_BLKIN
2033 std::stringstream ss;
2034 ss << "osd." << whoami;
2035 trace_endpoint.copy_name(ss.str());
2036 #endif
2037 }
2038
2039 OSD::~OSD()
2040 {
2041 delete authorize_handler_cluster_registry;
2042 delete authorize_handler_service_registry;
2043 delete class_handler;
2044 cct->get_perfcounters_collection()->remove(recoverystate_perf);
2045 cct->get_perfcounters_collection()->remove(logger);
2046 delete recoverystate_perf;
2047 delete logger;
2048 delete store;
2049 }
2050
2051 void cls_initialize(ClassHandler *ch);
2052
2053 void OSD::handle_signal(int signum)
2054 {
2055 assert(signum == SIGINT || signum == SIGTERM);
2056 derr << "*** Got signal " << sig_str(signum) << " ***" << dendl;
2057 shutdown();
2058 }
2059
2060 int OSD::pre_init()
2061 {
2062 Mutex::Locker lock(osd_lock);
2063 if (is_stopping())
2064 return 0;
2065
2066 if (store->test_mount_in_use()) {
2067 derr << "OSD::pre_init: object store '" << dev_path << "' is "
2068 << "currently in use. (Is ceph-osd already running?)" << dendl;
2069 return -EBUSY;
2070 }
2071
2072 cct->_conf->add_observer(this);
2073 return 0;
2074 }
2075
2076 // asok
2077
2078 class OSDSocketHook : public AdminSocketHook {
2079 OSD *osd;
2080 public:
2081 explicit OSDSocketHook(OSD *o) : osd(o) {}
2082 bool call(std::string admin_command, cmdmap_t& cmdmap, std::string format,
2083 bufferlist& out) override {
2084 stringstream ss;
2085 bool r = osd->asok_command(admin_command, cmdmap, format, ss);
2086 out.append(ss);
2087 return r;
2088 }
2089 };
2090
/**
 * Execute one of the admin-socket commands that OSD::final_init() registers
 * with asok_hook, and write the result to @p ss.
 *
 * @param admin_command  command name; it must match one of the branches
 *                       below, anything else trips the assert in the final
 *                       else
 * @param cmdmap         parsed command arguments, read with cmd_getval()
 * @param format         requested output format, handed to Formatter::create()
 * @param ss             receives the flushed formatter output and, for the
 *                       op-tracker dumps, the "tracking is not enabled" text
 * @return always true
 */
bool OSD::asok_command(string admin_command, cmdmap_t& cmdmap, string format,
                       ostream& ss)
{
  // The formatter is heap-allocated here and deleted just before returning.
  Formatter *f = Formatter::create(format, "json-pretty", "json-pretty");
  if (admin_command == "status") {
    f->open_object_section("status");
    f->dump_stream("cluster_fsid") << superblock.cluster_fsid;
    f->dump_stream("osd_fsid") << superblock.osd_fsid;
    f->dump_unsigned("whoami", superblock.whoami);
    f->dump_string("state", get_state_name(get_state()));
    f->dump_unsigned("oldest_map", superblock.oldest_map);
    f->dump_unsigned("newest_map", superblock.newest_map);
    {
      // pg_map is only read under its read lock.
      RWLock::RLocker l(pg_map_lock);
      f->dump_unsigned("num_pgs", pg_map.size());
    }
    f->close_section();
  } else if (admin_command == "flush_journal") {
    store->flush_journal();
  } else if (admin_command == "dump_ops_in_flight" ||
             admin_command == "ops" ||
             admin_command == "dump_blocked_ops" ||
             admin_command == "dump_historic_ops" ||
             admin_command == "dump_historic_ops_by_duration" ||
             admin_command == "dump_historic_slow_ops") {

    // Message written to ss whenever an op_tracker dump call returns false.
    const string error_str = "op_tracker tracking is not enabled now, so no ops are tracked currently, \
even those get stuck. Please enable \"osd_enable_op_tracker\", and the tracker \
will start to track new ops received afterwards.";

    // Optional "filterstr" arguments are collected into a set and passed to
    // every dump call below.
    set<string> filters;
    vector<string> filter_str;
    if (cmd_getval(cct, cmdmap, "filterstr", filter_str)) {
      copy(filter_str.begin(), filter_str.end(),
           inserter(filters, filters.end()));
    }

    // "ops" is handled exactly like "dump_ops_in_flight".
    if (admin_command == "dump_ops_in_flight" ||
        admin_command == "ops") {
      if (!op_tracker.dump_ops_in_flight(f, false, filters)) {
        ss << error_str;
      }
    }
    if (admin_command == "dump_blocked_ops") {
      if (!op_tracker.dump_ops_in_flight(f, true, filters)) {
        ss << error_str;
      }
    }
    if (admin_command == "dump_historic_ops") {
      if (!op_tracker.dump_historic_ops(f, false, filters)) {
        ss << error_str;
      }
    }
    if (admin_command == "dump_historic_ops_by_duration") {
      if (!op_tracker.dump_historic_ops(f, true, filters)) {
        ss << error_str;
      }
    }
    if (admin_command == "dump_historic_slow_ops") {
      if (!op_tracker.dump_historic_slow_ops(f, filters)) {
        ss << error_str;
      }
    }
  } else if (admin_command == "dump_op_pq_state") {
    f->open_object_section("pq");
    op_shardedwq.dump(f);
    f->close_section();
  } else if (admin_command == "dump_blacklist") {
    // Dump every blacklist entry of the OSDMap currently held by the service.
    list<pair<entity_addr_t,utime_t> > bl;
    OSDMapRef curmap = service.get_osdmap();

    f->open_array_section("blacklist");
    curmap->get_blacklist(&bl);
    for (list<pair<entity_addr_t,utime_t> >::iterator it = bl.begin();
         it != bl.end(); ++it) {
      f->open_object_section("entry");
      f->open_object_section("entity_addr_t");
      it->first.dump(f);
      f->close_section(); //entity_addr_t
      it->second.localtime(f->dump_stream("expire_time"));
      f->close_section(); //entry
    }
    f->close_section(); //blacklist
  } else if (admin_command == "dump_watchers") {
    list<obj_watch_item_t> watchers;
    // scan pg's
    {
      // osd_lock and the pg_map read lock are held for the whole scan; each
      // PG is locked only while its watchers are copied out.
      Mutex::Locker l(osd_lock);
      RWLock::RLocker l2(pg_map_lock);
      for (ceph::unordered_map<spg_t,PG*>::iterator it = pg_map.begin();
           it != pg_map.end();
           ++it) {

        list<obj_watch_item_t> pg_watchers;
        PG *pg = it->second;
        pg->lock();
        pg->get_watchers(pg_watchers);
        pg->unlock();
        watchers.splice(watchers.end(), pg_watchers);
      }
    }

    // Formatting happens after the locks above have been released.
    f->open_array_section("watchers");
    for (list<obj_watch_item_t>::iterator it = watchers.begin();
         it != watchers.end(); ++it) {

      f->open_object_section("watch");

      f->dump_string("namespace", it->obj.nspace);
      f->dump_string("object", it->obj.oid.name);

      f->open_object_section("entity_name");
      it->wi.name.dump(f);
      f->close_section(); //entity_name_t

      f->dump_unsigned("cookie", it->wi.cookie);
      f->dump_unsigned("timeout", it->wi.timeout_seconds);

      f->open_object_section("entity_addr_t");
      it->wi.addr.dump(f);
      f->close_section(); //entity_addr_t

      f->close_section(); //watch
    }

    f->close_section(); //watchers
  } else if (admin_command == "dump_reservations") {
    f->open_object_section("reservations");
    f->open_object_section("local_reservations");
    service.local_reserver.dump(f);
    f->close_section();
    f->open_object_section("remote_reservations");
    service.remote_reserver.dump(f);
    f->close_section();
    f->close_section();
  } else if (admin_command == "get_latest_osdmap") {
    get_latest_osdmap();
  } else if (admin_command == "heap") {
    auto result = ceph::osd_cmds::heap(*cct, cmdmap, *f, ss);

    // Note: Failed heap profile commands won't necessarily trigger an error:
    f->open_object_section("result");
    f->dump_string("error", cpp_strerror(result));
    f->dump_bool("success", result >= 0);
    f->close_section();
  } else if (admin_command == "set_heap_property") {
    // Validate "property" and "value" in turn; the first failing check
    // supplies the error text reported in the result section.
    string property;
    int64_t value = 0;
    string error;
    bool success = false;
    if (!cmd_getval(cct, cmdmap, "property", property)) {
      error = "unable to get property";
      success = false;
    } else if (!cmd_getval(cct, cmdmap, "value", value)) {
      error = "unable to get value";
      success = false;
    } else if (value < 0) {
      error = "negative value not allowed";
      success = false;
    } else if (!ceph_heap_set_numeric_property(property.c_str(), (size_t)value)) {
      error = "invalid property";
      success = false;
    } else {
      success = true;
    }
    f->open_object_section("result");
    f->dump_string("error", error);
    f->dump_bool("success", success);
    f->close_section();
  } else if (admin_command == "get_heap_property") {
    string property;
    size_t value = 0;
    string error;
    bool success = false;
    if (!cmd_getval(cct, cmdmap, "property", property)) {
      error = "unable to get property";
      success = false;
    } else if (!ceph_heap_get_numeric_property(property.c_str(), &value)) {
      error = "invalid property";
      success = false;
    } else {
      success = true;
    }
    f->open_object_section("result");
    f->dump_string("error", error);
    f->dump_bool("success", success);
    // "value" is emitted even on failure (it then still holds its initial 0
    // unless the getter modified it).
    f->dump_int("value", value);
    f->close_section();
  } else if (admin_command == "dump_objectstore_kv_stats") {
    store->get_db_statistics(f);
  } else if (admin_command == "dump_scrubs") {
    service.dumps_scrub(f);
  } else if (admin_command == "calc_objectstore_db_histogram") {
    store->generate_db_histogram(f);
  } else if (admin_command == "flush_store_cache") {
    store->flush_cache();
  } else if (admin_command == "dump_pgstate_history") {
    f->open_object_section("pgstate_history");
    // The pg_map read lock is held until the end of this branch; each PG is
    // locked only while its history is dumped.
    RWLock::RLocker l2(pg_map_lock);
    for (ceph::unordered_map<spg_t,PG*>::iterator it = pg_map.begin();
         it != pg_map.end();
         ++it) {

      PG *pg = it->second;
      f->dump_stream("pg") << pg->get_pgid();
      pg->lock();
      pg->pgstate_history.dump(f);
      pg->unlock();
    }
    f->close_section();
  } else if (admin_command == "compact") {
    // Time the compaction and report the elapsed seconds both in the log and
    // in the command output.
    dout(1) << "triggering manual compaction" << dendl;
    auto start = ceph::coarse_mono_clock::now();
    store->compact();
    auto end = ceph::coarse_mono_clock::now();
    auto time_span = chrono::duration_cast<chrono::duration<double>>(end - start);
    dout(1) << "finished manual compaction in "
            << time_span.count()
            << " seconds" << dendl;
    f->open_object_section("compact_result");
    f->dump_float("elapsed_time", time_span.count());
    f->close_section();
  } else {
    // Reaching this point means a command was registered with this hook
    // without a matching branch above.
    assert(0 == "broken asok registration");
  }
  f->flush(ss);
  delete f;
  return true;
}
2320
2321 class TestOpsSocketHook : public AdminSocketHook {
2322 OSDService *service;
2323 ObjectStore *store;
2324 public:
2325 TestOpsSocketHook(OSDService *s, ObjectStore *st) : service(s), store(st) {}
2326 bool call(std::string command, cmdmap_t& cmdmap, std::string format,
2327 bufferlist& out) override {
2328 stringstream ss;
2329 test_ops(service, store, command, cmdmap, ss);
2330 out.append(ss);
2331 return true;
2332 }
2333 void test_ops(OSDService *service, ObjectStore *store,
2334 const std::string &command, cmdmap_t& cmdmap, ostream &ss);
2335
2336 };
2337
2338 class OSD::C_Tick : public Context {
2339 OSD *osd;
2340 public:
2341 explicit C_Tick(OSD *o) : osd(o) {}
2342 void finish(int r) override {
2343 osd->tick();
2344 }
2345 };
2346
2347 class OSD::C_Tick_WithoutOSDLock : public Context {
2348 OSD *osd;
2349 public:
2350 explicit C_Tick_WithoutOSDLock(OSD *o) : osd(o) {}
2351 void finish(int r) override {
2352 osd->tick_without_osd_lock();
2353 }
2354 };
2355
2356 int OSD::enable_disable_fuse(bool stop)
2357 {
2358 #ifdef HAVE_LIBFUSE
2359 int r;
2360 string mntpath = cct->_conf->osd_data + "/fuse";
2361 if (fuse_store && (stop || !cct->_conf->osd_objectstore_fuse)) {
2362 dout(1) << __func__ << " disabling" << dendl;
2363 fuse_store->stop();
2364 delete fuse_store;
2365 fuse_store = NULL;
2366 r = ::rmdir(mntpath.c_str());
2367 if (r < 0) {
2368 r = -errno;
2369 derr << __func__ << " failed to rmdir " << mntpath << ": "
2370 << cpp_strerror(r) << dendl;
2371 return r;
2372 }
2373 return 0;
2374 }
2375 if (!fuse_store && cct->_conf->osd_objectstore_fuse) {
2376 dout(1) << __func__ << " enabling" << dendl;
2377 r = ::mkdir(mntpath.c_str(), 0700);
2378 if (r < 0)
2379 r = -errno;
2380 if (r < 0 && r != -EEXIST) {
2381 derr << __func__ << " unable to create " << mntpath << ": "
2382 << cpp_strerror(r) << dendl;
2383 return r;
2384 }
2385 fuse_store = new FuseStore(store, mntpath);
2386 r = fuse_store->start();
2387 if (r < 0) {
2388 derr << __func__ << " unable to start fuse: " << cpp_strerror(r) << dendl;
2389 delete fuse_store;
2390 fuse_store = NULL;
2391 return r;
2392 }
2393 }
2394 #endif // HAVE_LIBFUSE
2395 return 0;
2396 }
2397
2398 int OSD::get_num_op_shards()
2399 {
2400 if (cct->_conf->osd_op_num_shards)
2401 return cct->_conf->osd_op_num_shards;
2402 if (store_is_rotational)
2403 return cct->_conf->osd_op_num_shards_hdd;
2404 else
2405 return cct->_conf->osd_op_num_shards_ssd;
2406 }
2407
2408 int OSD::get_num_op_threads()
2409 {
2410 if (cct->_conf->osd_op_num_threads_per_shard)
2411 return get_num_op_shards() * cct->_conf->osd_op_num_threads_per_shard;
2412 if (store_is_rotational)
2413 return get_num_op_shards() * cct->_conf->osd_op_num_threads_per_shard_hdd;
2414 else
2415 return get_num_op_shards() * cct->_conf->osd_op_num_threads_per_shard_ssd;
2416 }
2417
2418 float OSD::get_osd_recovery_sleep()
2419 {
2420 if (cct->_conf->osd_recovery_sleep)
2421 return cct->_conf->osd_recovery_sleep;
2422 if (!store_is_rotational && !journal_is_rotational)
2423 return cct->_conf->osd_recovery_sleep_ssd;
2424 else if (store_is_rotational && !journal_is_rotational)
2425 return cct->_conf->get_val<double>("osd_recovery_sleep_hybrid");
2426 else
2427 return cct->_conf->osd_recovery_sleep_hdd;
2428 }
2429
/**
 * Bring the OSD up: mount the object store, validate and load the on-disk
 * state, hook up messengers, start worker threads and timers, authenticate
 * with the monitors and finally start the boot process.
 *
 * osd_lock is taken by the Mutex::Locker at the top.  It is dropped by hand
 * around the monitor authentication / crush update section and re-taken
 * before leaving, because the Locker unlocks on function exit.
 *
 * @return 0 on success or when the OSD is already stopping.  A negative
 *         errno is returned when mounting fails, or when a later step jumps
 *         to the "out" label (which also disables fuse, unmounts and deletes
 *         the store).  Failures in the monitor handshake jump to "monout",
 *         which ends the process with exit(1) instead of returning.
 */
int OSD::init()
{
  // Declared up front, ahead of the gotos below.
  CompatSet initial, diff;
  Mutex::Locker lock(osd_lock);
  if (is_stopping())
    return 0;

  tick_timer.init();
  tick_timer_without_osd_lock.init();
  service.recovery_request_timer.init();
  service.recovery_sleep_timer.init();

  // mount.
  dout(2) << "init " << dev_path
          << " (looks like " << (store_is_rotational ? "hdd" : "ssd") << ")"
          << dendl;
  dout(2) << "journal " << journal_path << dendl;
  assert(store);  // call pre_init() first!

  store->set_cache_shards(get_num_op_shards());

  int r = store->mount();
  if (r < 0) {
    // Nothing was mounted, so return directly instead of going through "out".
    derr << "OSD:init: unable to mount object store" << dendl;
    return r;
  }
  journal_is_rotational = store->is_journal_rotational();
  dout(2) << "journal looks like " << (journal_is_rotational ? "hdd" : "ssd")
          << dendl;

  // The result is ignored here; a failure to enable fuse does not stop init.
  enable_disable_fuse(false);

  dout(2) << "boot" << dendl;

  // initialize the daily loadavg with current 15min loadavg
  double loadavgs[3];
  if (getloadavg(loadavgs, 3) == 3) {
    daily_loadavg = loadavgs[2];
  } else {
    derr << "OSD::init() : couldn't read loadavgs\n" << dendl;
    daily_loadavg = 1.0;
  }

  // Counts failed wait_auth_rotating() rounds in the retry loop further down.
  int rotating_auth_attempts = 0;

  // sanity check long object name handling
  {
    // Build an object id with maximum-length name, key and namespace and ask
    // the store whether it can represent it.
    hobject_t l;
    l.oid.name = string(cct->_conf->osd_max_object_name_len, 'n');
    l.set_key(string(cct->_conf->osd_max_object_name_len, 'k'));
    l.nspace = string(cct->_conf->osd_max_object_namespace_len, 's');
    r = store->validate_hobject_key(l);
    if (r < 0) {
      derr << "backend (" << store->get_type() << ") is unable to support max "
           << "object name[space] len" << dendl;
      derr << "   osd max object name len = "
           << cct->_conf->osd_max_object_name_len << dendl;
      derr << "   osd max object namespace len = "
           << cct->_conf->osd_max_object_namespace_len << dendl;
      derr << cpp_strerror(r) << dendl;
      if (cct->_conf->osd_check_max_object_name_len_on_startup) {
        goto out;
      }
      derr << "osd_check_max_object_name_len_on_startup = false, starting anyway"
           << dendl;
    } else {
      dout(20) << "configured osd_max_object_name[space]_len looks ok" << dendl;
    }
  }

  // read superblock
  r = read_superblock();
  if (r < 0) {
    derr << "OSD::init() : unable to read osd superblock" << dendl;
    r = -EINVAL;
    goto out;
  }

  if (osd_compat.compare(superblock.compat_features) < 0) {
    derr << "The disk uses features unsupported by the executable." << dendl;
    derr << " ondisk features " << superblock.compat_features << dendl;
    derr << " daemon features " << osd_compat << dendl;

    // Both branches fail with -EOPNOTSUPP; only the logged message differs.
    if (osd_compat.writeable(superblock.compat_features)) {
      CompatSet diff = osd_compat.unsupported(superblock.compat_features);
      derr << "it is still writeable, though. Missing features: " << diff << dendl;
      r = -EOPNOTSUPP;
      goto out;
    }
    else {
      CompatSet diff = osd_compat.unsupported(superblock.compat_features);
      derr << "Cannot write to disk! Missing features: " << diff << dendl;
      r = -EOPNOTSUPP;
      goto out;
    }
  }

  assert_warn(whoami == superblock.whoami);
  if (whoami != superblock.whoami) {
    derr << "OSD::init: superblock says osd"
         << superblock.whoami << " but I am osd." << whoami << dendl;
    r = -EINVAL;
    goto out;
  }

  initial = get_osd_initial_compat_set();
  diff = superblock.compat_features.unsupported(initial);
  if (superblock.compat_features.merge(initial)) {
    // We need to persist the new compat_set before we
    // do anything else
    dout(5) << "Upgrading superblock adding: " << diff << dendl;
    ObjectStore::Transaction t;
    write_superblock(t);
    r = store->apply_transaction(service.meta_osr.get(), std::move(t));
    if (r < 0)
      goto out;
  }

  // make sure snap mapper object exists
  if (!store->exists(coll_t::meta(), OSD::make_snapmapper_oid())) {
    dout(10) << "init creating/touching snapmapper object" << dendl;
    ObjectStore::Transaction t;
    t.touch(coll_t::meta(), OSD::make_snapmapper_oid());
    r = store->apply_transaction(service.meta_osr.get(), std::move(t));
    if (r < 0)
      goto out;
  }

  class_handler = new ClassHandler(cct);
  cls_initialize(class_handler);

  if (cct->_conf->osd_open_classes_on_start) {
    // This r shadows the outer one; a class loading error is only logged.
    int r = class_handler->open_all_classes();
    if (r)
      dout(1) << "warning: got an error loading one or more classes: " << cpp_strerror(r) << dendl;
  }

  // load up "current" osdmap
  assert_warn(!osdmap);
  if (osdmap) {
    derr << "OSD::init: unable to read current osdmap" << dendl;
    r = -EINVAL;
    goto out;
  }
  osdmap = get_map(superblock.current_epoch);
  check_osdmap_features(store);

  create_recoverystate_perf();

  {
    epoch_t bind_epoch = osdmap->get_epoch();
    service.set_epochs(NULL, NULL, &bind_epoch);
  }

  clear_temp_objects();

  // initialize osdmap references in sharded wq
  op_shardedwq.prune_pg_waiters(osdmap, whoami);

  // load up pgs (as they previously existed)
  load_pgs();

  dout(2) << "superblock: I am osd." << superblock.whoami << dendl;
  dout(0) << "using " << op_queue << " op queue with priority op cut off at " <<
    op_prio_cutoff << "." << dendl;

  create_logger();

  // i'm ready!
  client_messenger->add_dispatcher_head(this);
  cluster_messenger->add_dispatcher_head(this);

  hb_front_client_messenger->add_dispatcher_head(&heartbeat_dispatcher);
  hb_back_client_messenger->add_dispatcher_head(&heartbeat_dispatcher);
  hb_front_server_messenger->add_dispatcher_head(&heartbeat_dispatcher);
  hb_back_server_messenger->add_dispatcher_head(&heartbeat_dispatcher);

  objecter_messenger->add_dispatcher_head(service.objecter);

  monc->set_want_keys(CEPH_ENTITY_TYPE_MON | CEPH_ENTITY_TYPE_OSD
                      | CEPH_ENTITY_TYPE_MGR);
  r = monc->init();
  if (r < 0)
    goto out;

  /**
   * FIXME: this is a placeholder implementation that unconditionally
   * sends every is_primary PG's stats every time we're called, unlike
   * the existing mon PGStats mechanism that uses pg_stat_queue and acks.
   * This has equivalent cost to the existing worst case where all
   * PGs are busy and their stats are always enqueued for sending.
   */
  mgrc.set_pgstats_cb([this](){
      // Locks taken in this callback, in order: map_lock (read),
      // min_last_epoch_clean_lock, pg_map_lock (read), then each primary
      // PG's pg_stats_publish_lock in turn.
      RWLock::RLocker l(map_lock);

      utime_t had_for = ceph_clock_now() - had_map_since;
      osd_stat_t cur_stat = service.get_osd_stat();
      cur_stat.os_perf_stat = store->get_cur_stats();

      MPGStats *m = new MPGStats(monc->get_fsid(), osdmap->get_epoch(), had_for);
      m->osd_stat = cur_stat;

      // Recompute min_last_epoch_clean from scratch: start at the current
      // map epoch and lower it with every published primary PG stat.
      Mutex::Locker lec{min_last_epoch_clean_lock};
      min_last_epoch_clean = osdmap->get_epoch();
      min_last_epoch_clean_pgs.clear();
      RWLock::RLocker lpg(pg_map_lock);
      for (const auto &i : pg_map) {
        PG *pg = i.second;
        if (!pg->is_primary()) {
          continue;
        }

        pg->pg_stats_publish_lock.Lock();
        if (pg->pg_stats_publish_valid) {
          m->pg_stat[pg->info.pgid.pgid] = pg->pg_stats_publish;
          const auto lec = pg->pg_stats_publish.get_effective_last_epoch_clean();
          min_last_epoch_clean = min(min_last_epoch_clean, lec);
          min_last_epoch_clean_pgs.push_back(pg->info.pgid.pgid);
        }
        pg->pg_stats_publish_lock.Unlock();
      }

      return m;
    });

  mgrc.init();
  client_messenger->add_dispatcher_head(&mgrc);

  // tell monc about log_client so it will know about mon session resets
  monc->set_log_client(&log_client);
  update_log_config();

  peering_tp.start();
  osd_op_tp.start();
  disk_tp.start();
  command_tp.start();

  set_disk_tp_priority();

  // start the heartbeat
  heartbeat_thread.create("osd_srv_heartbt");

  // tick
  tick_timer.add_event_after(cct->_conf->osd_heartbeat_interval, new C_Tick(this));
  {
    Mutex::Locker l(tick_timer_lock);
    tick_timer_without_osd_lock.add_event_after(cct->_conf->osd_heartbeat_interval, new C_Tick_WithoutOSDLock(this));
  }

  service.init();
  service.publish_map(osdmap);
  service.publish_superblock(superblock);
  service.max_oldest_map = superblock.oldest_map;

  // osd_lock is released by hand for the monitor handshake below.  Every
  // path that leaves this section must re-take it first, because the
  // Mutex::Locker declared at the top unlocks on function exit.
  osd_lock.Unlock();

  r = monc->authenticate();
  if (r < 0) {
    derr << __func__ << " authentication failed: " << cpp_strerror(r)
         << dendl;
    osd_lock.Lock(); // locker is going to unlock this on function exit
    if (is_stopping())
      r = 0;
    goto monout;
  }

  // Retry until rotating keys arrive, giving up once the attempt count
  // exceeds max_rotating_auth_attempts.
  while (monc->wait_auth_rotating(30.0) < 0) {
    derr << "unable to obtain rotating service keys; retrying" << dendl;
    ++rotating_auth_attempts;
    if (rotating_auth_attempts > g_conf->max_rotating_auth_attempts) {
      derr << __func__ << " wait_auth_rotating timed out" << dendl;
      osd_lock.Lock(); // make locker happy
      if (!is_stopping()) {
        r = -ETIMEDOUT;
      }
      goto monout;
    }
  }

  r = update_crush_device_class();
  if (r < 0) {
    derr << __func__ << " unable to update_crush_device_class: "
         << cpp_strerror(r) << dendl;
    osd_lock.Lock();
    goto monout;
  }

  r = update_crush_location();
  if (r < 0) {
    derr << __func__ << " unable to update_crush_location: "
         << cpp_strerror(r) << dendl;
    osd_lock.Lock();
    goto monout;
  }

  osd_lock.Lock();
  if (is_stopping())
    return 0;

  // start objecter *after* we have authenticated, so that we don't ignore
  // the OSDMaps it requests.
  service.final_init();

  check_config();

  dout(10) << "ensuring pgs have consumed prior maps" << dendl;
  consume_map();
  peering_wq.drain();

  dout(0) << "done with init, starting boot process" << dendl;

  // subscribe to any pg creations
  monc->sub_want("osd_pg_creates", last_pg_create_epoch, 0);

  // MgrClient needs this (it doesn't have MonClient reference itself)
  monc->sub_want("mgrmap", 0, 0);

  // we don't need to ask for an osdmap here; objecter will
  //monc->sub_want("osdmap", osdmap->get_epoch(), CEPH_SUBSCRIBE_ONETIME);

  monc->renew_subs();

  start_boot();

  return 0;
monout:
  // NOTE(review): the process exits with status 1 here whatever r holds,
  // including the paths above that set r = 0 because the OSD is stopping.
  exit(1);

out:
  // Error path used after a successful mount: stop fuse, unmount and free
  // the store, then report r to the caller.
  enable_disable_fuse(true);
  store->umount();
  delete store;
  store = NULL;
  return r;
}
2765
2766 void OSD::final_init()
2767 {
2768 AdminSocket *admin_socket = cct->get_admin_socket();
2769 asok_hook = new OSDSocketHook(this);
2770 int r = admin_socket->register_command("status", "status", asok_hook,
2771 "high-level status of OSD");
2772 assert(r == 0);
2773 r = admin_socket->register_command("flush_journal", "flush_journal",
2774 asok_hook,
2775 "flush the journal to permanent store");
2776 assert(r == 0);
2777 r = admin_socket->register_command("dump_ops_in_flight",
2778 "dump_ops_in_flight " \
2779 "name=filterstr,type=CephString,n=N,req=false",
2780 asok_hook,
2781 "show the ops currently in flight");
2782 assert(r == 0);
2783 r = admin_socket->register_command("ops",
2784 "ops " \
2785 "name=filterstr,type=CephString,n=N,req=false",
2786 asok_hook,
2787 "show the ops currently in flight");
2788 assert(r == 0);
2789 r = admin_socket->register_command("dump_blocked_ops",
2790 "dump_blocked_ops " \
2791 "name=filterstr,type=CephString,n=N,req=false",
2792 asok_hook,
2793 "show the blocked ops currently in flight");
2794 assert(r == 0);
2795 r = admin_socket->register_command("dump_historic_ops",
2796 "dump_historic_ops " \
2797 "name=filterstr,type=CephString,n=N,req=false",
2798 asok_hook,
2799 "show recent ops");
2800 assert(r == 0);
2801 r = admin_socket->register_command("dump_historic_slow_ops",
2802 "dump_historic_slow_ops " \
2803 "name=filterstr,type=CephString,n=N,req=false",
2804 asok_hook,
2805 "show slowest recent ops");
2806 assert(r == 0);
2807 r = admin_socket->register_command("dump_historic_ops_by_duration",
2808 "dump_historic_ops_by_duration " \
2809 "name=filterstr,type=CephString,n=N,req=false",
2810 asok_hook,
2811 "show slowest recent ops, sorted by duration");
2812 assert(r == 0);
2813 r = admin_socket->register_command("dump_op_pq_state", "dump_op_pq_state",
2814 asok_hook,
2815 "dump op priority queue state");
2816 assert(r == 0);
2817 r = admin_socket->register_command("dump_blacklist", "dump_blacklist",
2818 asok_hook,
2819 "dump blacklisted clients and times");
2820 assert(r == 0);
2821 r = admin_socket->register_command("dump_watchers", "dump_watchers",
2822 asok_hook,
2823 "show clients which have active watches,"
2824 " and on which objects");
2825 assert(r == 0);
2826 r = admin_socket->register_command("dump_reservations", "dump_reservations",
2827 asok_hook,
2828 "show recovery reservations");
2829 assert(r == 0);
2830 r = admin_socket->register_command("get_latest_osdmap", "get_latest_osdmap",
2831 asok_hook,
2832 "force osd to update the latest map from "
2833 "the mon");
2834 assert(r == 0);
2835
2836 r = admin_socket->register_command( "heap",
2837 "heap " \
2838 "name=heapcmd,type=CephString",
2839 asok_hook,
2840 "show heap usage info (available only if "
2841 "compiled with tcmalloc)");
2842 assert(r == 0);
2843
2844 r = admin_socket->register_command("set_heap_property",
2845 "set_heap_property " \
2846 "name=property,type=CephString " \
2847 "name=value,type=CephInt",
2848 asok_hook,
2849 "update malloc extension heap property");
2850 assert(r == 0);
2851
2852 r = admin_socket->register_command("get_heap_property",
2853 "get_heap_property " \
2854 "name=property,type=CephString",
2855 asok_hook,
2856 "get malloc extension heap property");
2857 assert(r == 0);
2858
2859 r = admin_socket->register_command("dump_objectstore_kv_stats",
2860 "dump_objectstore_kv_stats",
2861 asok_hook,
2862 "print statistics of kvdb which used by bluestore");
2863 assert(r == 0);
2864
2865 r = admin_socket->register_command("dump_scrubs",
2866 "dump_scrubs",
2867 asok_hook,
2868 "print scheduled scrubs");
2869 assert(r == 0);
2870
2871 r = admin_socket->register_command("calc_objectstore_db_histogram",
2872 "calc_objectstore_db_histogram",
2873 asok_hook,
2874 "Generate key value histogram of kvdb(rocksdb) which used by bluestore");
2875 assert(r == 0);
2876
2877 r = admin_socket->register_command("flush_store_cache",
2878 "flush_store_cache",
2879 asok_hook,
2880 "Flush bluestore internal cache");
2881 assert(r == 0);
2882 r = admin_socket->register_command("dump_pgstate_history", "dump_pgstate_history",
2883 asok_hook,
2884 "show recent state history");
2885 assert(r == 0);
2886
2887 r = admin_socket->register_command("compact", "compact",
2888 asok_hook,
2889 "Commpact object store's omap."
2890 " WARNING: Compaction probably slows your requests");
2891 assert(r == 0);
2892
2893 test_ops_hook = new TestOpsSocketHook(&(this->service), this->store);
2894 // Note: pools are CephString instead of CephPoolname because
2895 // these commands traditionally support both pool names and numbers
2896 r = admin_socket->register_command(
2897 "setomapval",
2898 "setomapval " \
2899 "name=pool,type=CephString " \
2900 "name=objname,type=CephObjectname " \
2901 "name=key,type=CephString "\
2902 "name=val,type=CephString",
2903 test_ops_hook,
2904 "set omap key");
2905 assert(r == 0);
2906 r = admin_socket->register_command(
2907 "rmomapkey",
2908 "rmomapkey " \
2909 "name=pool,type=CephString " \
2910 "name=objname,type=CephObjectname " \
2911 "name=key,type=CephString",
2912 test_ops_hook,
2913 "remove omap key");
2914 assert(r == 0);
2915 r = admin_socket->register_command(
2916 "setomapheader",
2917 "setomapheader " \
2918 "name=pool,type=CephString " \
2919 "name=objname,type=CephObjectname " \
2920 "name=header,type=CephString",
2921 test_ops_hook,
2922 "set omap header");
2923 assert(r == 0);
2924
2925 r = admin_socket->register_command(
2926 "getomap",
2927 "getomap " \
2928 "name=pool,type=CephString " \
2929 "name=objname,type=CephObjectname",
2930 test_ops_hook,
2931 "output entire object map");
2932 assert(r == 0);
2933
2934 r = admin_socket->register_command(
2935 "truncobj",
2936 "truncobj " \
2937 "name=pool,type=CephString " \
2938 "name=objname,type=CephObjectname " \
2939 "name=len,type=CephInt",
2940 test_ops_hook,
2941 "truncate object to length");
2942 assert(r == 0);
2943
2944 r = admin_socket->register_command(
2945 "injectdataerr",
2946 "injectdataerr " \
2947 "name=pool,type=CephString " \
2948 "name=objname,type=CephObjectname " \
2949 "name=shardid,type=CephInt,req=false,range=0|255",
2950 test_ops_hook,
2951 "inject data error to an object");
2952 assert(r == 0);
2953
2954 r = admin_socket->register_command(
2955 "injectmdataerr",
2956 "injectmdataerr " \
2957 "name=pool,type=CephString " \
2958 "name=objname,type=CephObjectname " \
2959 "name=shardid,type=CephInt,req=false,range=0|255",
2960 test_ops_hook,
2961 "inject metadata error to an object");
2962 assert(r == 0);
2963 r = admin_socket->register_command(
2964 "set_recovery_delay",
2965 "set_recovery_delay " \
2966 "name=utime,type=CephInt,req=false",
2967 test_ops_hook,
2968 "Delay osd recovery by specified seconds");
2969 assert(r == 0);
2970 r = admin_socket->register_command(
2971 "trigger_scrub",
2972 "trigger_scrub " \
2973 "name=pgid,type=CephString ",
2974 test_ops_hook,
2975 "Trigger a scheduled scrub ");
2976 assert(r == 0);
2977 r = admin_socket->register_command(
2978 "injectfull",
2979 "injectfull " \
2980 "name=type,type=CephString,req=false " \
2981 "name=count,type=CephInt,req=false ",
2982 test_ops_hook,
2983 "Inject a full disk (optional count times)");
2984 assert(r == 0);
2985 }
2986
2987 void OSD::create_logger()
2988 {
2989 dout(10) << "create_logger" << dendl;
2990
2991 PerfCountersBuilder osd_plb(cct, "osd", l_osd_first, l_osd_last);
2992
2993 // Latency axis configuration for op histograms, values are in nanoseconds
2994 PerfHistogramCommon::axis_config_d op_hist_x_axis_config{
2995 "Latency (usec)",
2996 PerfHistogramCommon::SCALE_LOG2, ///< Latency in logarithmic scale
2997 0, ///< Start at 0
2998 100000, ///< Quantization unit is 100usec
2999 32, ///< Enough to cover much longer than slow requests
3000 };
3001
3002 // Op size axis configuration for op histograms, values are in bytes
3003 PerfHistogramCommon::axis_config_d op_hist_y_axis_config{
3004 "Request size (bytes)",
3005 PerfHistogramCommon::SCALE_LOG2, ///< Request size in logarithmic scale
3006 0, ///< Start at 0
3007 512, ///< Quantization unit is 512 bytes
3008 32, ///< Enough to cover requests larger than GB
3009 };
3010
3011
3012 // All the basic OSD operation stats are to be considered useful
3013 osd_plb.set_prio_default(PerfCountersBuilder::PRIO_USEFUL);
3014
3015 osd_plb.add_u64(
3016 l_osd_op_wip, "op_wip",
3017 "Replication operations currently being processed (primary)");
3018 osd_plb.add_u64_counter(
3019 l_osd_op, "op",
3020 "Client operations",
3021 "ops", PerfCountersBuilder::PRIO_CRITICAL);
3022 osd_plb.add_u64_counter(
3023 l_osd_op_inb, "op_in_bytes",
3024 "Client operations total write size",
3025 "wr", PerfCountersBuilder::PRIO_INTERESTING);
3026 osd_plb.add_u64_counter(
3027 l_osd_op_outb, "op_out_bytes",
3028 "Client operations total read size",
3029 "rd", PerfCountersBuilder::PRIO_INTERESTING);
3030 osd_plb.add_time_avg(
3031 l_osd_op_lat, "op_latency",
3032 "Latency of client operations (including queue time)",
3033 "l", 9);
3034 osd_plb.add_time_avg(
3035 l_osd_op_process_lat, "op_process_latency",
3036 "Latency of client operations (excluding queue time)");
3037 osd_plb.add_time_avg(
3038 l_osd_op_prepare_lat, "op_prepare_latency",
3039 "Latency of client operations (excluding queue time and wait for finished)");
3040
3041 osd_plb.add_u64_counter(
3042 l_osd_op_r, "op_r", "Client read operations");
3043 osd_plb.add_u64_counter(
3044 l_osd_op_r_outb, "op_r_out_bytes", "Client data read");
3045 osd_plb.add_time_avg(
3046 l_osd_op_r_lat, "op_r_latency",
3047 "Latency of read operation (including queue time)");
3048 osd_plb.add_u64_counter_histogram(
3049 l_osd_op_r_lat_outb_hist, "op_r_latency_out_bytes_histogram",
3050 op_hist_x_axis_config, op_hist_y_axis_config,
3051 "Histogram of operation latency (including queue time) + data read");
3052 osd_plb.add_time_avg(
3053 l_osd_op_r_process_lat, "op_r_process_latency",
3054 "Latency of read operation (excluding queue time)");
3055 osd_plb.add_time_avg(
3056 l_osd_op_r_prepare_lat, "op_r_prepare_latency",
3057 "Latency of read operations (excluding queue time and wait for finished)");
3058 osd_plb.add_u64_counter(
3059 l_osd_op_w, "op_w", "Client write operations");
3060 osd_plb.add_u64_counter(
3061 l_osd_op_w_inb, "op_w_in_bytes", "Client data written");
3062 osd_plb.add_time_avg(
3063 l_osd_op_w_lat, "op_w_latency",
3064 "Latency of write operation (including queue time)");
3065 osd_plb.add_u64_counter_histogram(
3066 l_osd_op_w_lat_inb_hist, "op_w_latency_in_bytes_histogram",
3067 op_hist_x_axis_config, op_hist_y_axis_config,
3068 "Histogram of operation latency (including queue time) + data written");
3069 osd_plb.add_time_avg(
3070 l_osd_op_w_process_lat, "op_w_process_latency",
3071 "Latency of write operation (excluding queue time)");
3072 osd_plb.add_time_avg(
3073 l_osd_op_w_prepare_lat, "op_w_prepare_latency",
3074 "Latency of write operations (excluding queue time and wait for finished)");
3075 osd_plb.add_u64_counter(
3076 l_osd_op_rw, "op_rw",
3077 "Client read-modify-write operations");
3078 osd_plb.add_u64_counter(
3079 l_osd_op_rw_inb, "op_rw_in_bytes",
3080 "Client read-modify-write operations write in");
3081 osd_plb.add_u64_counter(
3082 l_osd_op_rw_outb,"op_rw_out_bytes",
3083 "Client read-modify-write operations read out ");
3084 osd_plb.add_time_avg(
3085 l_osd_op_rw_lat, "op_rw_latency",
3086 "Latency of read-modify-write operation (including queue time)");
3087 osd_plb.add_u64_counter_histogram(
3088 l_osd_op_rw_lat_inb_hist, "op_rw_latency_in_bytes_histogram",
3089 op_hist_x_axis_config, op_hist_y_axis_config,
3090 "Histogram of rw operation latency (including queue time) + data written");
3091 osd_plb.add_u64_counter_histogram(
3092 l_osd_op_rw_lat_outb_hist, "op_rw_latency_out_bytes_histogram",
3093 op_hist_x_axis_config, op_hist_y_axis_config,
3094 "Histogram of rw operation latency (including queue time) + data read");
3095 osd_plb.add_time_avg(
3096 l_osd_op_rw_process_lat, "op_rw_process_latency",
3097 "Latency of read-modify-write operation (excluding queue time)");
3098 osd_plb.add_time_avg(
3099 l_osd_op_rw_prepare_lat, "op_rw_prepare_latency",
3100 "Latency of read-modify-write operations (excluding queue time and wait for finished)");
3101
3102 // Now we move on to some more obscure stats, revert to assuming things
3103 // are low priority unless otherwise specified.
3104 osd_plb.set_prio_default(PerfCountersBuilder::PRIO_DEBUGONLY);
3105
3106 osd_plb.add_time_avg(l_osd_op_before_queue_op_lat, "op_before_queue_op_lat",
3107 "Latency of IO before calling queue(before really queue into ShardedOpWq)"); // client io before queue op_wq latency
3108 osd_plb.add_time_avg(l_osd_op_before_dequeue_op_lat, "op_before_dequeue_op_lat",
3109 "Latency of IO before calling dequeue_op(already dequeued and get PG lock)"); // client io before dequeue_op latency
3110
3111 osd_plb.add_u64_counter(
3112 l_osd_sop, "subop", "Suboperations");
3113 osd_plb.add_u64_counter(
3114 l_osd_sop_inb, "subop_in_bytes", "Suboperations total size");
3115 osd_plb.add_time_avg(l_osd_sop_lat, "subop_latency", "Suboperations latency");
3116
3117 osd_plb.add_u64_counter(l_osd_sop_w, "subop_w", "Replicated writes");
3118 osd_plb.add_u64_counter(
3119 l_osd_sop_w_inb, "subop_w_in_bytes", "Replicated written data size");
3120 osd_plb.add_time_avg(
3121 l_osd_sop_w_lat, "subop_w_latency", "Replicated writes latency");
3122 osd_plb.add_u64_counter(
3123 l_osd_sop_pull, "subop_pull", "Suboperations pull requests");
3124 osd_plb.add_time_avg(
3125 l_osd_sop_pull_lat, "subop_pull_latency", "Suboperations pull latency");
3126 osd_plb.add_u64_counter(
3127 l_osd_sop_push, "subop_push", "Suboperations push messages");
3128 osd_plb.add_u64_counter(
3129 l_osd_sop_push_inb, "subop_push_in_bytes", "Suboperations pushed size");
3130 osd_plb.add_time_avg(
3131 l_osd_sop_push_lat, "subop_push_latency", "Suboperations push latency");
3132
3133 osd_plb.add_u64_counter(l_osd_pull, "pull", "Pull requests sent");
3134 osd_plb.add_u64_counter(l_osd_push, "push", "Push messages sent");
3135 osd_plb.add_u64_counter(l_osd_push_outb, "push_out_bytes", "Pushed size");
3136
3137 osd_plb.add_u64_counter(
3138 l_osd_rop, "recovery_ops",
3139 "Started recovery operations",
3140 "rop", PerfCountersBuilder::PRIO_INTERESTING);
3141
3142 osd_plb.add_u64(l_osd_loadavg, "loadavg", "CPU load");
3143 osd_plb.add_u64(l_osd_buf, "buffer_bytes", "Total allocated buffer size");
3144 osd_plb.add_u64(l_osd_history_alloc_bytes, "history_alloc_Mbytes");
3145 osd_plb.add_u64(l_osd_history_alloc_num, "history_alloc_num");
3146 osd_plb.add_u64(
3147 l_osd_cached_crc, "cached_crc", "Total number getting crc from crc_cache");
3148 osd_plb.add_u64(
3149 l_osd_cached_crc_adjusted, "cached_crc_adjusted",
3150 "Total number getting crc from crc_cache with adjusting");
3151 osd_plb.add_u64(l_osd_missed_crc, "missed_crc",
3152 "Total number of crc cache misses");
3153
3154 osd_plb.add_u64(l_osd_pg, "numpg", "Placement groups",
3155 "pgs", PerfCountersBuilder::PRIO_USEFUL);
3156 osd_plb.add_u64(
3157 l_osd_pg_primary, "numpg_primary",
3158 "Placement groups for which this osd is primary");
3159 osd_plb.add_u64(
3160 l_osd_pg_replica, "numpg_replica",
3161 "Placement groups for which this osd is replica");
3162 osd_plb.add_u64(
3163 l_osd_pg_stray, "numpg_stray",
3164 "Placement groups ready to be deleted from this osd");
3165 osd_plb.add_u64(
3166 l_osd_pg_removing, "numpg_removing",
3167 "Placement groups queued for local deletion", "pgsr",
3168 PerfCountersBuilder::PRIO_USEFUL);
3169 osd_plb.add_u64(
3170 l_osd_hb_to, "heartbeat_to_peers", "Heartbeat (ping) peers we send to");
3171 osd_plb.add_u64_counter(l_osd_map, "map_messages", "OSD map messages");
3172 osd_plb.add_u64_counter(l_osd_mape, "map_message_epochs", "OSD map epochs");
3173 osd_plb.add_u64_counter(
3174 l_osd_mape_dup, "map_message_epoch_dups", "OSD map duplicates");
3175 osd_plb.add_u64_counter(
3176 l_osd_waiting_for_map, "messages_delayed_for_map",
3177 "Operations waiting for OSD map");
3178
3179 osd_plb.add_u64_counter(
3180 l_osd_map_cache_hit, "osd_map_cache_hit", "osdmap cache hit");
3181 osd_plb.add_u64_counter(
3182 l_osd_map_cache_miss, "osd_map_cache_miss", "osdmap cache miss");
3183 osd_plb.add_u64_counter(
3184 l_osd_map_cache_miss_low, "osd_map_cache_miss_low",
3185 "osdmap cache miss below cache lower bound");
3186 osd_plb.add_u64_avg(
3187 l_osd_map_cache_miss_low_avg, "osd_map_cache_miss_low_avg",
3188 "osdmap cache miss, avg distance below cache lower bound");
3189 osd_plb.add_u64_counter(
3190 l_osd_map_bl_cache_hit, "osd_map_bl_cache_hit",
3191 "OSDMap buffer cache hits");
3192 osd_plb.add_u64_counter(
3193 l_osd_map_bl_cache_miss, "osd_map_bl_cache_miss",
3194 "OSDMap buffer cache misses");
3195
3196 osd_plb.add_u64(
3197 l_osd_stat_bytes, "stat_bytes", "OSD size", "size",
3198 PerfCountersBuilder::PRIO_USEFUL);
3199 osd_plb.add_u64(
3200 l_osd_stat_bytes_used, "stat_bytes_used", "Used space", "used",
3201 PerfCountersBuilder::PRIO_USEFUL);
3202 osd_plb.add_u64(l_osd_stat_bytes_avail, "stat_bytes_avail", "Available space");
3203
3204 osd_plb.add_u64_counter(
3205 l_osd_copyfrom, "copyfrom", "Rados \"copy-from\" operations");
3206
3207 osd_plb.add_u64_counter(l_osd_tier_promote, "tier_promote", "Tier promotions");
3208 osd_plb.add_u64_counter(l_osd_tier_flush, "tier_flush", "Tier flushes");
3209 osd_plb.add_u64_counter(
3210 l_osd_tier_flush_fail, "tier_flush_fail", "Failed tier flushes");
3211 osd_plb.add_u64_counter(
3212 l_osd_tier_try_flush, "tier_try_flush", "Tier flush attempts");
3213 osd_plb.add_u64_counter(
3214 l_osd_tier_try_flush_fail, "tier_try_flush_fail",
3215 "Failed tier flush attempts");
3216 osd_plb.add_u64_counter(
3217 l_osd_tier_evict, "tier_evict", "Tier evictions");
3218 osd_plb.add_u64_counter(
3219 l_osd_tier_whiteout, "tier_whiteout", "Tier whiteouts");
3220 osd_plb.add_u64_counter(
3221 l_osd_tier_dirty, "tier_dirty", "Dirty tier flag set");
3222 osd_plb.add_u64_counter(
3223 l_osd_tier_clean, "tier_clean", "Dirty tier flag cleaned");
3224 osd_plb.add_u64_counter(
3225 l_osd_tier_delay, "tier_delay", "Tier delays (agent waiting)");
3226 osd_plb.add_u64_counter(
3227 l_osd_tier_proxy_read, "tier_proxy_read", "Tier proxy reads");
3228 osd_plb.add_u64_counter(
3229 l_osd_tier_proxy_write, "tier_proxy_write", "Tier proxy writes");
3230
3231 osd_plb.add_u64_counter(
3232 l_osd_agent_wake, "agent_wake", "Tiering agent wake up");
3233 osd_plb.add_u64_counter(
3234 l_osd_agent_skip, "agent_skip", "Objects skipped by agent");
3235 osd_plb.add_u64_counter(
3236 l_osd_agent_flush, "agent_flush", "Tiering agent flushes");
3237 osd_plb.add_u64_counter(
3238 l_osd_agent_evict, "agent_evict", "Tiering agent evictions");
3239
3240 osd_plb.add_u64_counter(
3241 l_osd_object_ctx_cache_hit, "object_ctx_cache_hit", "Object context cache hits");
3242 osd_plb.add_u64_counter(
3243 l_osd_object_ctx_cache_total, "object_ctx_cache_total", "Object context cache lookups");
3244
3245 osd_plb.add_u64_counter(l_osd_op_cache_hit, "op_cache_hit");
3246 osd_plb.add_time_avg(
3247 l_osd_tier_flush_lat, "osd_tier_flush_lat", "Object flush latency");
3248 osd_plb.add_time_avg(
3249 l_osd_tier_promote_lat, "osd_tier_promote_lat", "Object promote latency");
3250 osd_plb.add_time_avg(
3251 l_osd_tier_r_lat, "osd_tier_r_lat", "Object proxy read latency");
3252
3253 osd_plb.add_u64_counter(
3254 l_osd_pg_info, "osd_pg_info", "PG updated its info (using any method)");
3255 osd_plb.add_u64_counter(
3256 l_osd_pg_fastinfo, "osd_pg_fastinfo",
3257 "PG updated its info using fastinfo attr");
3258 osd_plb.add_u64_counter(
3259 l_osd_pg_biginfo, "osd_pg_biginfo", "PG updated its biginfo attr");
3260
3261 logger = osd_plb.create_perf_counters();
3262 cct->get_perfcounters_collection()->add(logger);
3263 }
3264
3265 void OSD::create_recoverystate_perf()
3266 {
3267 dout(10) << "create_recoverystate_perf" << dendl;
3268
3269 PerfCountersBuilder rs_perf(cct, "recoverystate_perf", rs_first, rs_last);
3270
3271 rs_perf.add_time_avg(rs_initial_latency, "initial_latency", "Initial recovery state latency");
3272 rs_perf.add_time_avg(rs_started_latency, "started_latency", "Started recovery state latency");
3273 rs_perf.add_time_avg(rs_reset_latency, "reset_latency", "Reset recovery state latency");
3274 rs_perf.add_time_avg(rs_start_latency, "start_latency", "Start recovery state latency");
3275 rs_perf.add_time_avg(rs_primary_latency, "primary_latency", "Primary recovery state latency");
3276 rs_perf.add_time_avg(rs_peering_latency, "peering_latency", "Peering recovery state latency");
3277 rs_perf.add_time_avg(rs_backfilling_latency, "backfilling_latency", "Backfilling recovery state latency");
3278 rs_perf.add_time_avg(rs_waitremotebackfillreserved_latency, "waitremotebackfillreserved_latency", "Wait remote backfill reserved recovery state latency");
3279 rs_perf.add_time_avg(rs_waitlocalbackfillreserved_latency, "waitlocalbackfillreserved_latency", "Wait local backfill reserved recovery state latency");
3280 rs_perf.add_time_avg(rs_notbackfilling_latency, "notbackfilling_latency", "Notbackfilling recovery state latency");
3281 rs_perf.add_time_avg(rs_repnotrecovering_latency, "repnotrecovering_latency", "Repnotrecovering recovery state latency");
3282 rs_perf.add_time_avg(rs_repwaitrecoveryreserved_latency, "repwaitrecoveryreserved_latency", "Rep wait recovery reserved recovery state latency");
3283 rs_perf.add_time_avg(rs_repwaitbackfillreserved_latency, "repwaitbackfillreserved_latency", "Rep wait backfill reserved recovery state latency");
3284 rs_perf.add_time_avg(rs_reprecovering_latency, "reprecovering_latency", "RepRecovering recovery state latency");
3285 rs_perf.add_time_avg(rs_activating_latency, "activating_latency", "Activating recovery state latency");
3286 rs_perf.add_time_avg(rs_waitlocalrecoveryreserved_latency, "waitlocalrecoveryreserved_latency", "Wait local recovery reserved recovery state latency");
3287 rs_perf.add_time_avg(rs_waitremoterecoveryreserved_latency, "waitremoterecoveryreserved_latency", "Wait remote recovery reserved recovery state latency");
3288 rs_perf.add_time_avg(rs_recovering_latency, "recovering_latency", "Recovering recovery state latency");
3289 rs_perf.add_time_avg(rs_recovered_latency, "recovered_latency", "Recovered recovery state latency");
3290 rs_perf.add_time_avg(rs_clean_latency, "clean_latency", "Clean recovery state latency");
3291 rs_perf.add_time_avg(rs_active_latency, "active_latency", "Active recovery state latency");
3292 rs_perf.add_time_avg(rs_replicaactive_latency, "replicaactive_latency", "Replicaactive recovery state latency");
3293 rs_perf.add_time_avg(rs_stray_latency, "stray_latency", "Stray recovery state latency");
3294 rs_perf.add_time_avg(rs_getinfo_latency, "getinfo_latency", "Getinfo recovery state latency");
3295 rs_perf.add_time_avg(rs_getlog_latency, "getlog_latency", "Getlog recovery state latency");
3296 rs_perf.add_time_avg(rs_waitactingchange_latency, "waitactingchange_latency", "Waitactingchange recovery state latency");
3297 rs_perf.add_time_avg(rs_incomplete_latency, "incomplete_latency", "Incomplete recovery state latency");
3298 rs_perf.add_time_avg(rs_down_latency, "down_latency", "Down recovery state latency");
3299 rs_perf.add_time_avg(rs_getmissing_latency, "getmissing_latency", "Getmissing recovery state latency");
3300 rs_perf.add_time_avg(rs_waitupthru_latency, "waitupthru_latency", "Waitupthru recovery state latency");
3301 rs_perf.add_time_avg(rs_notrecovering_latency, "notrecovering_latency", "Notrecovering recovery state latency");
3302
3303 recoverystate_perf = rs_perf.create_perf_counters();
3304 cct->get_perfcounters_collection()->add(recoverystate_perf);
3305 }
3306
3307 int OSD::shutdown()
3308 {
3309 if (!service.prepare_to_stop())
3310 return 0; // already shutting down
3311 osd_lock.Lock();
3312 if (is_stopping()) {
3313 osd_lock.Unlock();
3314 return 0;
3315 }
3316 derr << "shutdown" << dendl;
3317
3318 set_state(STATE_STOPPING);
3319
3320 // Debugging
3321 if (cct->_conf->get_val<bool>("osd_debug_shutdown")) {
3322 cct->_conf->set_val("debug_osd", "100");
3323 cct->_conf->set_val("debug_journal", "100");
3324 cct->_conf->set_val("debug_filestore", "100");
3325 cct->_conf->set_val("debug_bluestore", "100");
3326 cct->_conf->set_val("debug_ms", "100");
3327 cct->_conf->apply_changes(NULL);
3328 }
3329
3330 // stop MgrClient earlier as it's more like an internal consumer of OSD
3331 mgrc.shutdown();
3332
3333 service.start_shutdown();
3334
3335 // stop sending work to pgs. this just prevents any new work in _process
3336 // from racing with on_shutdown and potentially entering the pg after.
3337 op_shardedwq.drain();
3338
3339 // Shutdown PGs
3340 {
3341 RWLock::RLocker l(pg_map_lock);
3342 for (ceph::unordered_map<spg_t, PG*>::iterator p = pg_map.begin();
3343 p != pg_map.end();
3344 ++p) {
3345 dout(20) << " kicking pg " << p->first << dendl;
3346 p->second->lock();
3347 p->second->on_shutdown();
3348 p->second->unlock();
3349 p->second->osr->flush();
3350 }
3351 }
3352 clear_pg_stat_queue();
3353
3354 // drain op queue again (in case PGs requeued something)
3355 op_shardedwq.drain();
3356 {
3357 finished.clear(); // zap waiters (bleh, this is messy)
3358 }
3359
3360 op_shardedwq.clear_pg_slots();
3361
3362 // unregister commands
3363 cct->get_admin_socket()->unregister_command("status");
3364 cct->get_admin_socket()->unregister_command("flush_journal");
3365 cct->get_admin_socket()->unregister_command("dump_ops_in_flight");
3366 cct->get_admin_socket()->unregister_command("ops");
3367 cct->get_admin_socket()->unregister_command("dump_blocked_ops");
3368 cct->get_admin_socket()->unregister_command("dump_historic_ops");
3369 cct->get_admin_socket()->unregister_command("dump_historic_ops_by_duration");
3370 cct->get_admin_socket()->unregister_command("dump_historic_slow_ops");
3371 cct->get_admin_socket()->unregister_command("dump_op_pq_state");
3372 cct->get_admin_socket()->unregister_command("dump_blacklist");
3373 cct->get_admin_socket()->unregister_command("dump_watchers");
3374 cct->get_admin_socket()->unregister_command("dump_reservations");
3375 cct->get_admin_socket()->unregister_command("get_latest_osdmap");
3376 cct->get_admin_socket()->unregister_command("heap");
3377 cct->get_admin_socket()->unregister_command("set_heap_property");
3378 cct->get_admin_socket()->unregister_command("get_heap_property");
3379 cct->get_admin_socket()->unregister_command("dump_objectstore_kv_stats");
3380 cct->get_admin_socket()->unregister_command("dump_scrubs");
3381 cct->get_admin_socket()->unregister_command("calc_objectstore_db_histogram");
3382 cct->get_admin_socket()->unregister_command("flush_store_cache");
3383 cct->get_admin_socket()->unregister_command("dump_pgstate_history");
3384 cct->get_admin_socket()->unregister_command("compact");
3385 delete asok_hook;
3386 asok_hook = NULL;
3387
3388 cct->get_admin_socket()->unregister_command("setomapval");
3389 cct->get_admin_socket()->unregister_command("rmomapkey");
3390 cct->get_admin_socket()->unregister_command("setomapheader");
3391 cct->get_admin_socket()->unregister_command("getomap");
3392 cct->get_admin_socket()->unregister_command("truncobj");
3393 cct->get_admin_socket()->unregister_command("injectdataerr");
3394 cct->get_admin_socket()->unregister_command("injectmdataerr");
3395 cct->get_admin_socket()->unregister_command("set_recovery_delay");
3396 cct->get_admin_socket()->unregister_command("trigger_scrub");
3397 cct->get_admin_socket()->unregister_command("injectfull");
3398 delete test_ops_hook;
3399 test_ops_hook = NULL;
3400
3401 osd_lock.Unlock();
3402
3403 heartbeat_lock.Lock();
3404 heartbeat_stop = true;
3405 heartbeat_cond.Signal();
3406 heartbeat_lock.Unlock();
3407 heartbeat_thread.join();
3408
3409 peering_tp.drain();
3410 peering_wq.clear();
3411 peering_tp.stop();
3412 dout(10) << "osd tp stopped" << dendl;
3413
3414 osd_op_tp.drain();
3415 osd_op_tp.stop();
3416 dout(10) << "op sharded tp stopped" << dendl;
3417
3418 command_tp.drain();
3419 command_tp.stop();
3420 dout(10) << "command tp stopped" << dendl;
3421
3422 disk_tp.drain();
3423 disk_tp.stop();
3424 dout(10) << "disk tp paused (new)" << dendl;
3425
3426 dout(10) << "stopping agent" << dendl;
3427 service.agent_stop();
3428
3429 osd_lock.Lock();
3430
3431 reset_heartbeat_peers();
3432
3433 tick_timer.shutdown();
3434
3435 {
3436 Mutex::Locker l(tick_timer_lock);
3437 tick_timer_without_osd_lock.shutdown();
3438 }
3439
3440 // note unmount epoch
3441 dout(10) << "noting clean unmount in epoch " << osdmap->get_epoch() << dendl;
3442 superblock.mounted = service.get_boot_epoch();
3443 superblock.clean_thru = osdmap->get_epoch();
3444 ObjectStore::Transaction t;
3445 write_superblock(t);
3446 int r = store->apply_transaction(service.meta_osr.get(), std::move(t));
3447 if (r) {
3448 derr << "OSD::shutdown: error writing superblock: "
3449 << cpp_strerror(r) << dendl;
3450 }
3451
3452
3453 {
3454 Mutex::Locker l(pg_stat_queue_lock);
3455 assert(pg_stat_queue.empty());
3456 }
3457
3458 service.shutdown_reserver();
3459
3460 // Remove PGs
3461 #ifdef PG_DEBUG_REFS
3462 service.dump_live_pgids();
3463 #endif
3464 {
3465 RWLock::RLocker l(pg_map_lock);
3466 for (ceph::unordered_map<spg_t, PG*>::iterator p = pg_map.begin();
3467 p != pg_map.end();
3468 ++p) {
3469 dout(20) << " kicking pg " << p->first << dendl;
3470 p->second->lock();
3471 if (p->second->ref != 1) {
3472 derr << "pgid " << p->first << " has ref count of "
3473 << p->second->ref << dendl;
3474 #ifdef PG_DEBUG_REFS
3475 p->second->dump_live_ids();
3476 #endif
3477 if (cct->_conf->osd_shutdown_pgref_assert) {
3478 ceph_abort();
3479 }
3480 }
3481 p->second->unlock();
3482 p->second->put("PGMap");
3483 }
3484 pg_map.clear();
3485 }
3486 #ifdef PG_DEBUG_REFS
3487 service.dump_live_pgids();
3488 #endif
3489 cct->_conf->remove_observer(this);
3490
3491 dout(10) << "syncing store" << dendl;
3492 enable_disable_fuse(true);
3493
3494 if (cct->_conf->osd_journal_flush_on_shutdown) {
3495 dout(10) << "flushing journal" << dendl;
3496 store->flush_journal();
3497 }
3498
3499 store->umount();
3500 delete store;
3501 store = 0;
3502 dout(10) << "Store synced" << dendl;
3503
3504 monc->shutdown();
3505 osd_lock.Unlock();
3506
3507 osdmap = OSDMapRef();
3508 service.shutdown();
3509 op_tracker.on_shutdown();
3510
3511 class_handler->shutdown();
3512 client_messenger->shutdown();
3513 cluster_messenger->shutdown();
3514 hb_front_client_messenger->shutdown();
3515 hb_back_client_messenger->shutdown();
3516 objecter_messenger->shutdown();
3517 hb_front_server_messenger->shutdown();
3518 hb_back_server_messenger->shutdown();
3519
3520 peering_wq.clear();
3521
3522 return r;
3523 }
3524
3525 int OSD::mon_cmd_maybe_osd_create(string &cmd)
3526 {
3527 bool created = false;
3528 while (true) {
3529 dout(10) << __func__ << " cmd: " << cmd << dendl;
3530 vector<string> vcmd{cmd};
3531 bufferlist inbl;
3532 C_SaferCond w;
3533 string outs;
3534 monc->start_mon_command(vcmd, inbl, NULL, &outs, &w);
3535 int r = w.wait();
3536 if (r < 0) {
3537 if (r == -ENOENT && !created) {
3538 string newcmd = "{\"prefix\": \"osd create\", \"id\": " + stringify(whoami)
3539 + ", \"uuid\": \"" + stringify(superblock.osd_fsid) + "\"}";
3540 vector<string> vnewcmd{newcmd};
3541 bufferlist inbl;
3542 C_SaferCond w;
3543 string outs;
3544 monc->start_mon_command(vnewcmd, inbl, NULL, &outs, &w);
3545 int r = w.wait();
3546 if (r < 0) {
3547 derr << __func__ << " fail: osd does not exist and created failed: "
3548 << cpp_strerror(r) << dendl;
3549 return r;
3550 }
3551 created = true;
3552 continue;
3553 }
3554 derr << __func__ << " fail: '" << outs << "': " << cpp_strerror(r) << dendl;
3555 return r;
3556 }
3557 break;
3558 }
3559
3560 return 0;
3561 }
3562
3563 int OSD::update_crush_location()
3564 {
3565 if (!cct->_conf->osd_crush_update_on_start) {
3566 dout(10) << __func__ << " osd_crush_update_on_start = false" << dendl;
3567 return 0;
3568 }
3569
3570 char weight[32];
3571 if (cct->_conf->osd_crush_initial_weight >= 0) {
3572 snprintf(weight, sizeof(weight), "%.4lf", cct->_conf->osd_crush_initial_weight);
3573 } else {
3574 struct store_statfs_t st;
3575 int r = store->statfs(&st);
3576 if (r < 0) {
3577 derr << "statfs: " << cpp_strerror(r) << dendl;
3578 return r;
3579 }
3580 snprintf(weight, sizeof(weight), "%.4lf",
3581 MAX((double).00001,
3582 (double)(st.total) /
3583 (double)(1ull << 40 /* TB */)));
3584 }
3585
3586 std::multimap<string,string> loc = cct->crush_location.get_location();
3587 dout(10) << __func__ << " crush location is " << loc << dendl;
3588
3589 string cmd =
3590 string("{\"prefix\": \"osd crush create-or-move\", ") +
3591 string("\"id\": ") + stringify(whoami) + string(", ") +
3592 string("\"weight\":") + weight + string(", ") +
3593 string("\"args\": [");
3594 for (multimap<string,string>::iterator p = loc.begin(); p != loc.end(); ++p) {
3595 if (p != loc.begin())
3596 cmd += ", ";
3597 cmd += "\"" + p->first + "=" + p->second + "\"";
3598 }
3599 cmd += "]}";
3600
3601 return mon_cmd_maybe_osd_create(cmd);
3602 }
3603
3604 int OSD::update_crush_device_class()
3605 {
3606 if (!cct->_conf->osd_class_update_on_start) {
3607 dout(10) << __func__ << " osd_class_update_on_start = false" << dendl;
3608 return 0;
3609 }
3610
3611 string device_class;
3612 int r = store->read_meta("crush_device_class", &device_class);
3613 if (r < 0 || device_class.empty()) {
3614 device_class = store->get_default_device_class();
3615 }
3616
3617 if (device_class.empty()) {
3618 dout(20) << __func__ << " no device class stored locally" << dendl;
3619 return 0;
3620 }
3621
3622 string cmd =
3623 string("{\"prefix\": \"osd crush set-device-class\", ") +
3624 string("\"class\": \"") + device_class + string("\", ") +
3625 string("\"ids\": [\"") + stringify(whoami) + string("\"]}");
3626
3627 r = mon_cmd_maybe_osd_create(cmd);
3628 // the above cmd can fail for various reasons, e.g.:
3629 // (1) we are connecting to a pre-luminous monitor
3630 // (2) user manually specify a class other than
3631 // 'ceph-disk prepare --crush-device-class'
3632 // simply skip result-checking for now
3633 return 0;
3634 }
3635
3636 void OSD::write_superblock(ObjectStore::Transaction& t)
3637 {
3638 dout(10) << "write_superblock " << superblock << dendl;
3639
3640 //hack: at minimum it's using the baseline feature set
3641 if (!superblock.compat_features.incompat.contains(CEPH_OSD_FEATURE_INCOMPAT_BASE))
3642 superblock.compat_features.incompat.insert(CEPH_OSD_FEATURE_INCOMPAT_BASE);
3643
3644 bufferlist bl;
3645 ::encode(superblock, bl);
3646 t.write(coll_t::meta(), OSD_SUPERBLOCK_GOBJECT, 0, bl.length(), bl);
3647 }
3648
3649 int OSD::read_superblock()
3650 {
3651 bufferlist bl;
3652 int r = store->read(coll_t::meta(), OSD_SUPERBLOCK_GOBJECT, 0, 0, bl);
3653 if (r < 0)
3654 return r;
3655
3656 bufferlist::iterator p = bl.begin();
3657 ::decode(superblock, p);
3658
3659 dout(10) << "read_superblock " << superblock << dendl;
3660
3661 return 0;
3662 }
3663
3664 void OSD::clear_temp_objects()
3665 {
3666 dout(10) << __func__ << dendl;
3667 vector<coll_t> ls;
3668 store->list_collections(ls);
3669 for (vector<coll_t>::iterator p = ls.begin(); p != ls.end(); ++p) {
3670 spg_t pgid;
3671 if (!p->is_pg(&pgid))
3672 continue;
3673
3674 // list temp objects
3675 dout(20) << " clearing temps in " << *p << " pgid " << pgid << dendl;
3676
3677 vector<ghobject_t> temps;
3678 ghobject_t next;
3679 while (1) {
3680 vector<ghobject_t> objects;
3681 store->collection_list(*p, next, ghobject_t::get_max(),
3682 store->get_ideal_list_max(),
3683 &objects, &next);
3684 if (objects.empty())
3685 break;
3686 vector<ghobject_t>::iterator q;
3687 for (q = objects.begin(); q != objects.end(); ++q) {
3688 // Hammer set pool for temps to -1, so check for clean-up
3689 if (q->hobj.is_temp() || (q->hobj.pool == -1)) {
3690 temps.push_back(*q);
3691 } else {
3692 break;
3693 }
3694 }
3695 // If we saw a non-temp object and hit the break above we can
3696 // break out of the while loop too.
3697 if (q != objects.end())
3698 break;
3699 }
3700 if (!temps.empty()) {
3701 ObjectStore::Transaction t;
3702 int removed = 0;
3703 for (vector<ghobject_t>::iterator q = temps.begin(); q != temps.end(); ++q) {
3704 dout(20) << " removing " << *p << " object " << *q << dendl;
3705 t.remove(*p, *q);
3706 if (++removed > cct->_conf->osd_target_transaction_size) {
3707 store->apply_transaction(service.meta_osr.get(), std::move(t));
3708 t = ObjectStore::Transaction();
3709 removed = 0;
3710 }
3711 }
3712 if (removed) {
3713 store->apply_transaction(service.meta_osr.get(), std::move(t));
3714 }
3715 }
3716 }
3717 }
3718
3719 void OSD::recursive_remove_collection(CephContext* cct,
3720 ObjectStore *store, spg_t pgid,
3721 coll_t tmp)
3722 {
3723 OSDriver driver(
3724 store,
3725 coll_t(),
3726 make_snapmapper_oid());
3727
3728 ceph::shared_ptr<ObjectStore::Sequencer> osr (std::make_shared<
3729 ObjectStore::Sequencer>("rm"));
3730 ObjectStore::Transaction t;
3731 SnapMapper mapper(cct, &driver, 0, 0, 0, pgid.shard);
3732
3733 vector<ghobject_t> objects;
3734 store->collection_list(tmp, ghobject_t(), ghobject_t::get_max(),
3735 INT_MAX, &objects, 0);
3736 generic_dout(10) << __func__ << " " << objects << dendl;
3737 // delete them.
3738 int removed = 0;
3739 for (vector<ghobject_t>::iterator p = objects.begin();
3740 p != objects.end();
3741 ++p, removed++) {
3742 OSDriver::OSTransaction _t(driver.get_transaction(&t));
3743 int r = mapper.remove_oid(p->hobj, &_t);
3744 if (r != 0 && r != -ENOENT)
3745 ceph_abort();
3746 t.remove(tmp, *p);
3747 if (removed > cct->_conf->osd_target_transaction_size) {
3748 int r = store->apply_transaction(osr.get(), std::move(t));
3749 assert(r == 0);
3750 t = ObjectStore::Transaction();
3751 removed = 0;
3752 }
3753 }
3754 t.remove_collection(tmp);
3755 int r = store->apply_transaction(osr.get(), std::move(t));
3756 assert(r == 0);
3757
3758 C_SaferCond waiter;
3759 if (!osr->flush_commit(&waiter)) {
3760 waiter.wait();
3761 }
3762 }
3763
3764
3765 // ======================================================
3766 // PG's
3767
3768 PGPool OSD::_get_pool(int id, OSDMapRef createmap)
3769 {
3770 if (!createmap->have_pg_pool(id)) {
3771 dout(5) << __func__ << ": the OSDmap does not contain a PG pool with id = "
3772 << id << dendl;
3773 ceph_abort();
3774 }
3775
3776 PGPool p = PGPool(cct, createmap, id);
3777
3778 dout(10) << "_get_pool " << p.id << dendl;
3779 return p;
3780 }
3781
3782 PG *OSD::_open_lock_pg(
3783 OSDMapRef createmap,
3784 spg_t pgid, bool no_lockdep_check)
3785 {
3786 assert(osd_lock.is_locked());
3787
3788 PG* pg = _make_pg(createmap, pgid);
3789 {
3790 RWLock::WLocker l(pg_map_lock);
3791 pg->lock(no_lockdep_check);
3792 pg_map[pgid] = pg;
3793 pg->get("PGMap"); // because it's in pg_map
3794 service.pg_add_epoch(pg->info.pgid, createmap->get_epoch());
3795 }
3796 return pg;
3797 }
3798
3799 PG* OSD::_make_pg(
3800 OSDMapRef createmap,
3801 spg_t pgid)
3802 {
3803 dout(10) << "_open_lock_pg " << pgid << dendl;
3804 PGPool pool = _get_pool(pgid.pool(), createmap);
3805
3806 // create
3807 PG *pg;
3808 if (createmap->get_pg_type(pgid.pgid) == pg_pool_t::TYPE_REPLICATED ||
3809 createmap->get_pg_type(pgid.pgid) == pg_pool_t::TYPE_ERASURE)
3810 pg = new PrimaryLogPG(&service, createmap, pool, pgid);
3811 else
3812 ceph_abort();
3813
3814 return pg;
3815 }
3816
3817
3818 void OSD::add_newly_split_pg(PG *pg, PG::RecoveryCtx *rctx)
3819 {
3820 epoch_t e(service.get_osdmap()->get_epoch());
3821 pg->get("PGMap"); // For pg_map
3822 pg_map[pg->info.pgid] = pg;
3823 service.pg_add_epoch(pg->info.pgid, pg->get_osdmap()->get_epoch());
3824
3825 dout(10) << "Adding newly split pg " << *pg << dendl;
3826 pg->handle_loaded(rctx);
3827 pg->write_if_dirty(*(rctx->transaction));
3828 pg->queue_null(e, e);
3829 map<spg_t, list<PG::CephPeeringEvtRef> >::iterator to_wake =
3830 peering_wait_for_split.find(pg->info.pgid);
3831 if (to_wake != peering_wait_for_split.end()) {
3832 for (list<PG::CephPeeringEvtRef>::iterator i =
3833 to_wake->second.begin();
3834 i != to_wake->second.end();
3835 ++i) {
3836 pg->queue_peering_event(*i);
3837 }
3838 peering_wait_for_split.erase(to_wake);
3839 }
3840 if (!service.get_osdmap()->have_pg_pool(pg->info.pgid.pool()))
3841 _remove_pg(pg);
3842 }
3843
3844 OSD::res_result OSD::_try_resurrect_pg(
3845 OSDMapRef curmap, spg_t pgid, spg_t *resurrected, PGRef *old_pg_state)
3846 {
3847 assert(resurrected);
3848 assert(old_pg_state);
3849 // find nearest ancestor
3850 DeletingStateRef df;
3851 spg_t cur(pgid);
3852 while (true) {
3853 df = service.deleting_pgs.lookup(cur);
3854 if (df)
3855 break;
3856 if (!cur.ps())
3857 break;
3858 cur = cur.get_parent();
3859 }
3860 if (!df)
3861 return RES_NONE; // good to go
3862
3863 df->old_pg_state->lock();
3864 OSDMapRef create_map = df->old_pg_state->get_osdmap();
3865 df->old_pg_state->unlock();
3866
3867 set<spg_t> children;
3868 if (cur == pgid) {
3869 if (df->try_stop_deletion()) {
3870 dout(10) << __func__ << ": halted deletion on pg " << pgid << dendl;
3871 *resurrected = cur;
3872 *old_pg_state = df->old_pg_state;
3873 service.deleting_pgs.remove(pgid); // PG is no longer being removed!
3874 return RES_SELF;
3875 } else {
3876 // raced, ensure we don't see DeletingStateRef when we try to
3877 // delete this pg
3878 service.deleting_pgs.remove(pgid);
3879 return RES_NONE;
3880 }
3881 } else if (cur.is_split(create_map->get_pg_num(cur.pool()),
3882 curmap->get_pg_num(cur.pool()),
3883 &children) &&
3884 children.count(pgid)) {
3885 if (df->try_stop_deletion()) {
3886 dout(10) << __func__ << ": halted deletion on ancestor pg " << pgid
3887 << dendl;
3888 *resurrected = cur;
3889 *old_pg_state = df->old_pg_state;
3890 service.deleting_pgs.remove(cur); // PG is no longer being removed!
3891 return RES_PARENT;
3892 } else {
3893 /* this is not a problem, failing to cancel proves that all objects
3894 * have been removed, so no hobject_t overlap is possible
3895 */
3896 return RES_NONE;
3897 }
3898 }
3899 return RES_NONE;
3900 }
3901
3902 PG *OSD::_create_lock_pg(
3903 OSDMapRef createmap,
3904 spg_t pgid,
3905 bool hold_map_lock,
3906 bool backfill,
3907 int role,
3908 vector<int>& up, int up_primary,
3909 vector<int>& acting, int acting_primary,
3910 pg_history_t history,
3911 const PastIntervals& pi,
3912 ObjectStore::Transaction& t)
3913 {
3914 assert(osd_lock.is_locked());
3915 dout(20) << "_create_lock_pg pgid " << pgid << dendl;
3916
3917 PG *pg = _open_lock_pg(createmap, pgid, true);
3918
3919 service.init_splits_between(pgid, pg->get_osdmap(), service.get_osdmap());
3920
3921 pg->init(
3922 role,
3923 up,
3924 up_primary,
3925 acting,
3926 acting_primary,
3927 history,
3928 pi,
3929 backfill,
3930 &t);
3931
3932 dout(7) << "_create_lock_pg " << *pg << dendl;
3933 return pg;
3934 }
3935
3936 PG *OSD::_lookup_lock_pg(spg_t pgid)
3937 {
3938 RWLock::RLocker l(pg_map_lock);
3939
3940 auto pg_map_entry = pg_map.find(pgid);
3941 if (pg_map_entry == pg_map.end())
3942 return nullptr;
3943 PG *pg = pg_map_entry->second;
3944 pg->lock();
3945 return pg;
3946 }
3947
3948 PG *OSD::lookup_lock_pg(spg_t pgid)
3949 {
3950 return _lookup_lock_pg(pgid);
3951 }
3952
3953 PG *OSD::_lookup_lock_pg_with_map_lock_held(spg_t pgid)
3954 {
3955 assert(pg_map.count(pgid));
3956 PG *pg = pg_map[pgid];
3957 pg->lock();
3958 return pg;
3959 }
3960
3961 void OSD::load_pgs()
3962 {
3963 assert(osd_lock.is_locked());
3964 dout(0) << "load_pgs" << dendl;
3965 {
3966 RWLock::RLocker l(pg_map_lock);
3967 assert(pg_map.empty());
3968 }
3969
3970 vector<coll_t> ls;
3971 int r = store->list_collections(ls);
3972 if (r < 0) {
3973 derr << "failed to list pgs: " << cpp_strerror(-r) << dendl;
3974 }
3975
3976 bool has_upgraded = false;
3977
3978 for (vector<coll_t>::iterator it = ls.begin();
3979 it != ls.end();
3980 ++it) {
3981 spg_t pgid;
3982 if (it->is_temp(&pgid) ||
3983 (it->is_pg(&pgid) && PG::_has_removal_flag(store, pgid))) {
3984 dout(10) << "load_pgs " << *it << " clearing temp" << dendl;
3985 recursive_remove_collection(cct, store, pgid, *it);
3986 continue;
3987 }
3988
3989 if (!it->is_pg(&pgid)) {
3990 dout(10) << "load_pgs ignoring unrecognized " << *it << dendl;
3991 continue;
3992 }
3993
3994 if (pgid.preferred() >= 0) {
3995 dout(10) << __func__ << ": skipping localized PG " << pgid << dendl;
3996 // FIXME: delete it too, eventually
3997 continue;
3998 }
3999
4000 dout(10) << "pgid " << pgid << " coll " << coll_t(pgid) << dendl;
4001 bufferlist bl;
4002 epoch_t map_epoch = 0;
4003 int r = PG::peek_map_epoch(store, pgid, &map_epoch, &bl);
4004 if (r < 0) {
4005 derr << __func__ << " unable to peek at " << pgid << " metadata, skipping"
4006 << dendl;
4007 continue;
4008 }
4009
4010 PG *pg = NULL;
4011 if (map_epoch > 0) {
4012 OSDMapRef pgosdmap = service.try_get_map(map_epoch);
4013 if (!pgosdmap) {
4014 if (!osdmap->have_pg_pool(pgid.pool())) {
4015 derr << __func__ << ": could not find map for epoch " << map_epoch
4016 << " on pg " << pgid << ", but the pool is not present in the "
4017 << "current map, so this is probably a result of bug 10617. "
4018 << "Skipping the pg for now, you can use ceph-objectstore-tool "
4019 << "to clean it up later." << dendl;
4020 continue;
4021 } else {
4022 derr << __func__ << ": have pgid " << pgid << " at epoch "
4023 << map_epoch << ", but missing map. Crashing."
4024 << dendl;
4025 assert(0 == "Missing map in load_pgs");
4026 }
4027 }
4028 pg = _open_lock_pg(pgosdmap, pgid);
4029 } else {
4030 pg = _open_lock_pg(osdmap, pgid);
4031 }
4032 // there can be no waiters here, so we don't call wake_pg_waiters
4033
4034 pg->ch = store->open_collection(pg->coll);
4035
4036 // read pg state, log
4037 pg->read_state(store, bl);
4038
4039 if (pg->must_upgrade()) {
4040 if (!pg->can_upgrade()) {
4041 derr << "PG needs upgrade, but on-disk data is too old; upgrade to"
4042 << " an older version first." << dendl;
4043 assert(0 == "PG too old to upgrade");
4044 }
4045 if (!has_upgraded) {
4046 derr << "PGs are upgrading" << dendl;
4047 has_upgraded = true;
4048 }
4049 dout(10) << "PG " << pg->info.pgid
4050 << " must upgrade..." << dendl;
4051 pg->upgrade(store);
4052 }
4053
4054 if (pg->dne()) {
4055 dout(10) << "load_pgs " << *it << " deleting dne" << dendl;
4056 pg->ch = nullptr;
4057 service.pg_remove_epoch(pg->pg_id);
4058 pg->unlock();
4059 {
4060 // Delete pg
4061 RWLock::WLocker l(pg_map_lock);
4062 auto p = pg_map.find(pg->get_pgid());
4063 assert(p != pg_map.end() && p->second == pg);
4064 dout(20) << __func__ << " removed pg " << pg << " from pg_map" << dendl;
4065 pg_map.erase(p);
4066 pg->put("PGMap");
4067 }
4068 recursive_remove_collection(cct, store, pgid, *it);
4069 continue;
4070 }
4071
4072 service.init_splits_between(pg->info.pgid, pg->get_osdmap(), osdmap);
4073
4074 // generate state for PG's current mapping
4075 int primary, up_primary;
4076 vector<int> acting, up;
4077 pg->get_osdmap()->pg_to_up_acting_osds(
4078 pgid.pgid, &up, &up_primary, &acting, &primary);
4079 pg->init_primary_up_acting(
4080 up,
4081 acting,
4082 up_primary,
4083 primary);
4084 int role = OSDMap::calc_pg_role(whoami, pg->acting);
4085 if (pg->pool.info.is_replicated() || role == pg->pg_whoami.shard)
4086 pg->set_role(role);
4087 else
4088 pg->set_role(-1);
4089
4090 pg->reg_next_scrub();
4091
4092 PG::RecoveryCtx rctx(0, 0, 0, 0, 0, 0);
4093 pg->handle_loaded(&rctx);
4094
4095 dout(10) << "load_pgs loaded " << *pg << " " << pg->pg_log.get_log() << dendl;
4096 if (pg->pg_log.is_dirty()) {
4097 ObjectStore::Transaction t;
4098 pg->write_if_dirty(t);
4099 store->apply_transaction(pg->osr.get(), std::move(t));
4100 }
4101 pg->unlock();
4102 }
4103 {
4104 RWLock::RLocker l(pg_map_lock);
4105 dout(0) << "load_pgs opened " << pg_map.size() << " pgs" << dendl;
4106 }
4107
4108 // clean up old infos object?
4109 if (has_upgraded && store->exists(coll_t::meta(), OSD::make_infos_oid())) {
4110 dout(1) << __func__ << " removing legacy infos object" << dendl;
4111 ObjectStore::Transaction t;
4112 t.remove(coll_t::meta(), OSD::make_infos_oid());
4113 int r = store->apply_transaction(service.meta_osr.get(), std::move(t));
4114 if (r != 0) {
4115 derr << __func__ << ": apply_transaction returned "
4116 << cpp_strerror(r) << dendl;
4117 ceph_abort();
4118 }
4119 }
4120
4121 build_past_intervals_parallel();
4122 }
4123
4124
4125 /*
4126 * build past_intervals efficiently on old, degraded, and buried
4127 * clusters. this is important for efficiently catching up osds that
4128 * are way behind on maps to the current cluster state.
4129 *
4130 * this is a parallel version of PG::generate_past_intervals().
4131 * follow the same logic, but do all pgs at the same time so that we
4132 * can make a single pass across the osdmap history.
4133 */
4134 void OSD::build_past_intervals_parallel()
4135 {
4136 struct pistate {
4137 epoch_t start, end;
4138 vector<int> old_acting, old_up;
4139 epoch_t same_interval_since;
4140 int primary;
4141 int up_primary;
4142 };
4143 map<PG*,pistate> pis;
4144
4145 // calculate junction of map range
4146 epoch_t end_epoch = superblock.oldest_map;
4147 epoch_t cur_epoch = superblock.newest_map;
4148 {
4149 RWLock::RLocker l(pg_map_lock);
4150 for (ceph::unordered_map<spg_t, PG*>::iterator i = pg_map.begin();
4151 i != pg_map.end();
4152 ++i) {
4153 PG *pg = i->second;
4154
4155 // Ignore PGs only partially created (DNE)
4156 if (pg->info.dne()) {
4157 continue;
4158 }
4159
4160 auto rpib = pg->get_required_past_interval_bounds(
4161 pg->info,
4162 superblock.oldest_map);
4163 if (rpib.first >= rpib.second && pg->past_intervals.empty()) {
4164 if (pg->info.history.same_interval_since == 0) {
4165 pg->info.history.same_interval_since = rpib.second;
4166 }
4167 continue;
4168 } else {
4169 auto apib = pg->past_intervals.get_bounds();
4170 if (apib.second >= rpib.second &&
4171 apib.first <= rpib.first) {
4172 if (pg->info.history.same_interval_since == 0) {
4173 pg->info.history.same_interval_since = rpib.second;
4174 }
4175 continue;
4176 }
4177 }
4178
4179 dout(10) << pg->info.pgid << " needs " << rpib.first << "-"
4180 << rpib.second << dendl;
4181 pistate& p = pis[pg];
4182 p.start = rpib.first;
4183 p.end = rpib.second;
4184 p.same_interval_since = 0;
4185
4186 if (rpib.first < cur_epoch)
4187 cur_epoch = rpib.first;
4188 if (rpib.second > end_epoch)
4189 end_epoch = rpib.second;
4190 }
4191 }
4192 if (pis.empty()) {
4193 dout(10) << __func__ << " nothing to build" << dendl;
4194 return;
4195 }
4196
4197 dout(1) << __func__ << " over " << cur_epoch << "-" << end_epoch << dendl;
4198 assert(cur_epoch <= end_epoch);
4199
4200 OSDMapRef cur_map, last_map;
4201 for ( ; cur_epoch <= end_epoch; cur_epoch++) {
4202 dout(10) << __func__ << " epoch " << cur_epoch << dendl;
4203 last_map = cur_map;
4204 cur_map = get_map(cur_epoch);
4205
4206 for (map<PG*,pistate>::iterator i = pis.begin(); i != pis.end(); ++i) {
4207 PG *pg = i->first;
4208 pistate& p = i->second;
4209
4210 if (cur_epoch < p.start || cur_epoch > p.end)
4211 continue;
4212
4213 vector<int> acting, up;
4214 int up_primary;
4215 int primary;
4216 pg_t pgid = pg->info.pgid.pgid;
4217 if (p.same_interval_since && last_map->get_pools().count(pgid.pool()))
4218 pgid = pgid.get_ancestor(last_map->get_pg_num(pgid.pool()));
4219 cur_map->pg_to_up_acting_osds(
4220 pgid, &up, &up_primary, &acting, &primary);
4221
4222 if (p.same_interval_since == 0) {
4223 dout(10) << __func__ << " epoch " << cur_epoch << " pg " << pg->info.pgid
4224 << " first map, acting " << acting
4225 << " up " << up << ", same_interval_since = " << cur_epoch << dendl;
4226 p.same_interval_since = cur_epoch;
4227 p.old_up = up;
4228 p.old_acting = acting;
4229 p.primary = primary;
4230 p.up_primary = up_primary;
4231 continue;
4232 }
4233 assert(last_map);
4234
4235 boost::scoped_ptr<IsPGRecoverablePredicate> recoverable(
4236 pg->get_is_recoverable_predicate());
4237 std::stringstream debug;
4238 bool new_interval = PastIntervals::check_new_interval(
4239 p.primary,
4240 primary,
4241 p.old_acting, acting,
4242 p.up_primary,
4243 up_primary,
4244 p.old_up, up,
4245 p.same_interval_since,
4246 pg->info.history.last_epoch_clean,
4247 cur_map, last_map,
4248 pgid,
4249 recoverable.get(),
4250 &pg->past_intervals,
4251 &debug);
4252 if (new_interval) {
4253 dout(10) << __func__ << " epoch " << cur_epoch << " pg " << pg->info.pgid
4254 << " " << debug.str() << dendl;
4255 p.old_up = up;
4256 p.old_acting = acting;
4257 p.primary = primary;
4258 p.up_primary = up_primary;
4259 p.same_interval_since = cur_epoch;
4260 }
4261 }
4262 }
4263
4264 // Now that past_intervals have been recomputed let's fix the same_interval_since
4265 // if it was cleared by import.
4266 for (map<PG*,pistate>::iterator i = pis.begin(); i != pis.end(); ++i) {
4267 PG *pg = i->first;
4268 pistate& p = i->second;
4269
4270 if (pg->info.history.same_interval_since == 0) {
4271 assert(p.same_interval_since);
4272 dout(10) << __func__ << " fix same_interval_since " << p.same_interval_since << " pg " << *pg << dendl;
4273 dout(10) << __func__ << " past_intervals " << pg->past_intervals << dendl;
4274 // Fix it
4275 pg->info.history.same_interval_since = p.same_interval_since;
4276 }
4277 }
4278
4279 // write info only at the end. this is necessary because we check
4280 // whether the past_intervals go far enough back or forward in time,
4281 // but we don't check for holes. we could avoid it by discarding
4282 // the previous past_intervals and rebuilding from scratch, or we
4283 // can just do this and commit all our work at the end.
4284 ObjectStore::Transaction t;
4285 int num = 0;
4286 for (map<PG*,pistate>::iterator i = pis.begin(); i != pis.end(); ++i) {
4287 PG *pg = i->first;
4288 pg->lock();
4289 pg->dirty_big_info = true;
4290 pg->dirty_info = true;
4291 pg->write_if_dirty(t);
4292 pg->unlock();
4293
4294 // don't let the transaction get too big
4295 if (++num >= cct->_conf->osd_target_transaction_size) {
4296 store->apply_transaction(service.meta_osr.get(), std::move(t));
4297 t = ObjectStore::Transaction();
4298 num = 0;
4299 }
4300 }
4301 if (!t.empty())
4302 store->apply_transaction(service.meta_osr.get(), std::move(t));
4303 }
4304
4305 /*
4306 * look up a pg. if we have it, great. if not, consider creating it IF the pg mapping
4307 * hasn't changed since the given epoch and we are the primary.
4308 */
4309 int OSD::handle_pg_peering_evt(
4310 spg_t pgid,
4311 const pg_history_t& orig_history,
4312 const PastIntervals& pi,
4313 epoch_t epoch,
4314 PG::CephPeeringEvtRef evt)
4315 {
4316 if (service.splitting(pgid)) {
4317 peering_wait_for_split[pgid].push_back(evt);
4318 return -EEXIST;
4319 }
4320
4321 PG *pg = _lookup_lock_pg(pgid);
4322 if (!pg) {
4323 // same primary?
4324 if (!osdmap->have_pg_pool(pgid.pool()))
4325 return -EINVAL;
4326 int up_primary, acting_primary;
4327 vector<int> up, acting;
4328 osdmap->pg_to_up_acting_osds(
4329 pgid.pgid, &up, &up_primary, &acting, &acting_primary);
4330
4331 pg_history_t history = orig_history;
4332 bool valid_history = project_pg_history(
4333 pgid, history, epoch, up, up_primary, acting, acting_primary);
4334
4335 if (!valid_history || epoch < history.same_interval_since) {
4336 dout(10) << __func__ << pgid << " acting changed in "
4337 << history.same_interval_since << " (msg from " << epoch << ")"
4338 << dendl;
4339 return -EINVAL;
4340 }
4341
4342 if (service.splitting(pgid)) {
4343 ceph_abort();
4344 }
4345
4346 const bool is_mon_create =
4347 evt->get_event().dynamic_type() == PG::NullEvt::static_type();
4348 if (maybe_wait_for_max_pg(pgid, is_mon_create)) {
4349 return -EAGAIN;
4350 }
4351 // do we need to resurrect a deleting pg?
4352 spg_t resurrected;
4353 PGRef old_pg_state;
4354 res_result result = _try_resurrect_pg(
4355 service.get_osdmap(),
4356 pgid,
4357 &resurrected,
4358 &old_pg_state);
4359
4360 PG::RecoveryCtx rctx = create_context();
4361 switch (result) {
4362 case RES_NONE: {
4363 const pg_pool_t* pp = osdmap->get_pg_pool(pgid.pool());
4364 if (pp->has_flag(pg_pool_t::FLAG_EC_OVERWRITES) &&
4365 store->get_type() != "bluestore") {
4366 clog->warn() << "pg " << pgid
4367 << " is at risk of silent data corruption: "
4368 << "the pool allows ec overwrites but is not stored in "
4369 << "bluestore, so deep scrubbing will not detect bitrot";
4370 }
4371 PG::_create(*rctx.transaction, pgid, pgid.get_split_bits(pp->get_pg_num()));
4372 PG::_init(*rctx.transaction, pgid, pp);
4373
4374 int role = osdmap->calc_pg_role(whoami, acting, acting.size());
4375 if (!pp->is_replicated() && role != pgid.shard)
4376 role = -1;
4377
4378 pg = _create_lock_pg(
4379 get_map(epoch),
4380 pgid, false, false,
4381 role,
4382 up, up_primary,
4383 acting, acting_primary,
4384 history, pi,
4385 *rctx.transaction);
4386 pg->handle_create(&rctx);
4387 pg->write_if_dirty(*rctx.transaction);
4388 dispatch_context(rctx, pg, osdmap);
4389
4390 dout(10) << *pg << " is new" << dendl;
4391
4392 pg->queue_peering_event(evt);
4393 wake_pg_waiters(pg);
4394 pg->unlock();
4395 return 0;
4396 }
4397 case RES_SELF: {
4398 old_pg_state->lock();
4399 OSDMapRef old_osd_map = old_pg_state->get_osdmap();
4400 int old_role = old_pg_state->role;
4401 vector<int> old_up = old_pg_state->up;
4402 int old_up_primary = old_pg_state->up_primary.osd;
4403 vector<int> old_acting = old_pg_state->acting;
4404 int old_primary = old_pg_state->primary.osd;
4405 pg_history_t old_history = old_pg_state->info.history;
4406 PastIntervals old_past_intervals = old_pg_state->past_intervals;
4407 old_pg_state->unlock();
4408 pg = _create_lock_pg(
4409 old_osd_map,
4410 resurrected,
4411 false,
4412 true,
4413 old_role,
4414 old_up,
4415 old_up_primary,
4416 old_acting,
4417 old_primary,
4418 old_history,
4419 old_past_intervals,
4420 *rctx.transaction);
4421 pg->handle_create(&rctx);
4422 pg->write_if_dirty(*rctx.transaction);
4423 dispatch_context(rctx, pg, osdmap);
4424
4425 dout(10) << *pg << " is new (resurrected)" << dendl;
4426
4427 pg->queue_peering_event(evt);
4428 wake_pg_waiters(pg);
4429 pg->unlock();
4430 return 0;
4431 }
4432 case RES_PARENT: {
4433 assert(old_pg_state);
4434 old_pg_state->lock();
4435 OSDMapRef old_osd_map = old_pg_state->get_osdmap();
4436 int old_role = old_pg_state->role;
4437 vector<int> old_up = old_pg_state->up;
4438 int old_up_primary = old_pg_state->up_primary.osd;
4439 vector<int> old_acting = old_pg_state->acting;
4440 int old_primary = old_pg_state->primary.osd;
4441 pg_history_t old_history = old_pg_state->info.history;
4442 PastIntervals old_past_intervals = old_pg_state->past_intervals;
4443 old_pg_state->unlock();
4444 PG *parent = _create_lock_pg(
4445 old_osd_map,
4446 resurrected,
4447 false,
4448 true,
4449 old_role,
4450 old_up,
4451 old_up_primary,
4452 old_acting,
4453 old_primary,
4454 old_history,
4455 old_past_intervals,
4456 *rctx.transaction
4457 );
4458 parent->handle_create(&rctx);
4459 parent->write_if_dirty(*rctx.transaction);
4460 dispatch_context(rctx, parent, osdmap);
4461
4462 dout(10) << *parent << " is new" << dendl;
4463
4464 assert(service.splitting(pgid));
4465 peering_wait_for_split[pgid].push_back(evt);
4466
4467 //parent->queue_peering_event(evt);
4468 parent->queue_null(osdmap->get_epoch(), osdmap->get_epoch());
4469 wake_pg_waiters(parent);
4470 parent->unlock();
4471 return 0;
4472 }
4473 default:
4474 assert(0);
4475 return 0;
4476 }
4477 } else {
4478 // already had it. did the mapping change?
4479 if (epoch < pg->info.history.same_interval_since) {
4480 dout(10) << *pg << __func__ << " acting changed in "
4481 << pg->info.history.same_interval_since
4482 << " (msg from " << epoch << ")" << dendl;
4483 } else {
4484 pg->queue_peering_event(evt);
4485 }
4486 pg->unlock();
4487 return -EEXIST;
4488 }
4489 }
4490
4491 bool OSD::maybe_wait_for_max_pg(spg_t pgid, bool is_mon_create)
4492 {
4493 const auto max_pgs_per_osd =
4494 (cct->_conf->get_val<uint64_t>("mon_max_pg_per_osd") *
4495 cct->_conf->get_val<double>("osd_max_pg_per_osd_hard_ratio"));
4496
4497 RWLock::RLocker pg_map_locker{pg_map_lock};
4498 if (pg_map.size() < max_pgs_per_osd) {
4499 return false;
4500 }
4501 lock_guard<mutex> pending_creates_locker{pending_creates_lock};
4502 if (is_mon_create) {
4503 pending_creates_from_mon++;
4504 } else {
4505 bool is_primary = osdmap->get_pg_acting_rank(pgid.pgid, whoami) == 0;
4506 pending_creates_from_osd.emplace(pgid.pgid, is_primary);
4507 }
4508 dout(5) << __func__ << " withhold creation of pg " << pgid
4509 << ": " << pg_map.size() << " >= "<< max_pgs_per_osd << dendl;
4510 return true;
4511 }
4512
// to re-trigger a peering, we have to twiddle the pg mapping a little bit,
// see PG::should_restart_peering(). OSDMap::pg_to_up_acting_osds() will turn
// to up set if pg_temp is empty. so an empty pg_temp won't work.
//
// Returns a mapping that differs from `acting`: just the first entry when
// `acting` has more than one, otherwise `acting` with -1 appended.
static std::vector<int32_t> twiddle(const std::vector<int>& acting) {
  std::vector<int32_t> twiddled;
  if (acting.size() > 1) {
    twiddled.push_back(acting.front());
  } else {
    twiddled.assign(acting.begin(), acting.end());
    twiddled.push_back(-1);
  }
  return twiddled;
}
4525
4526 void OSD::resume_creating_pg()
4527 {
4528 bool do_sub_pg_creates = false;
4529 bool have_pending_creates = false;
4530 {
4531 const auto max_pgs_per_osd =
4532 (cct->_conf->get_val<uint64_t>("mon_max_pg_per_osd") *
4533 cct->_conf->get_val<double>("osd_max_pg_per_osd_hard_ratio"));
4534 RWLock::RLocker l(pg_map_lock);
4535 if (max_pgs_per_osd <= pg_map.size()) {
4536 // this could happen if admin decreases this setting before a PG is removed
4537 return;
4538 }
4539 unsigned spare_pgs = max_pgs_per_osd - pg_map.size();
4540 lock_guard<mutex> pending_creates_locker{pending_creates_lock};
4541 if (pending_creates_from_mon > 0) {
4542 do_sub_pg_creates = true;
4543 if (pending_creates_from_mon >= spare_pgs) {
4544 spare_pgs = pending_creates_from_mon = 0;
4545 } else {
4546 spare_pgs -= pending_creates_from_mon;
4547 pending_creates_from_mon = 0;
4548 }
4549 }
4550 auto pg = pending_creates_from_osd.cbegin();
4551 while (spare_pgs > 0 && pg != pending_creates_from_osd.cend()) {
4552 dout(20) << __func__ << " pg " << pg->first << dendl;
4553 vector<int> acting;
4554 osdmap->pg_to_up_acting_osds(pg->first, nullptr, nullptr, &acting, nullptr);
4555 service.queue_want_pg_temp(pg->first, twiddle(acting), true);
4556 pg = pending_creates_from_osd.erase(pg);
4557 do_sub_pg_creates = true;
4558 spare_pgs--;
4559 }
4560 have_pending_creates = (pending_creates_from_mon > 0 ||
4561 !pending_creates_from_osd.empty());
4562 }
4563
4564 bool do_renew_subs = false;
4565 if (do_sub_pg_creates) {
4566 if (monc->sub_want("osd_pg_creates", last_pg_create_epoch, 0)) {
4567 dout(4) << __func__ << ": resolicit pg creates from mon since "
4568 << last_pg_create_epoch << dendl;
4569 do_renew_subs = true;
4570 }
4571 }
4572 version_t start = osdmap->get_epoch() + 1;
4573 if (have_pending_creates) {
4574 // don't miss any new osdmap deleting PGs
4575 if (monc->sub_want("osdmap", start, 0)) {
4576 dout(4) << __func__ << ": resolicit osdmap from mon since "
4577 << start << dendl;
4578 do_renew_subs = true;
4579 }
4580 } else if (do_sub_pg_creates) {
4581 // no need to subscribe the osdmap continuously anymore
4582 // once the pgtemp and/or mon_subscribe(pg_creates) is sent
4583 if (monc->sub_want_increment("osdmap", start, CEPH_SUBSCRIBE_ONETIME)) {
4584 dout(4) << __func__ << ": re-subscribe osdmap(onetime) since"
4585 << start << dendl;
4586 do_renew_subs = true;
4587 }
4588 }
4589
4590 if (do_renew_subs) {
4591 monc->renew_subs();
4592 }
4593
4594 service.send_pg_temp();
4595 }
4596
4597 void OSD::build_initial_pg_history(
4598 spg_t pgid,
4599 epoch_t created,
4600 utime_t created_stamp,
4601 pg_history_t *h,
4602 PastIntervals *pi)
4603 {
4604 dout(10) << __func__ << " " << pgid << " created " << created << dendl;
4605 h->epoch_created = created;
4606 h->epoch_pool_created = created;
4607 h->same_interval_since = created;
4608 h->same_up_since = created;
4609 h->same_primary_since = created;
4610 h->last_scrub_stamp = created_stamp;
4611 h->last_deep_scrub_stamp = created_stamp;
4612 h->last_clean_scrub_stamp = created_stamp;
4613
4614 OSDMapRef lastmap = service.get_map(created);
4615 int up_primary, acting_primary;
4616 vector<int> up, acting;
4617 lastmap->pg_to_up_acting_osds(
4618 pgid.pgid, &up, &up_primary, &acting, &acting_primary);
4619
4620 ostringstream debug;
4621 for (epoch_t e = created + 1; e <= osdmap->get_epoch(); ++e) {
4622 OSDMapRef osdmap = service.get_map(e);
4623 int new_up_primary, new_acting_primary;
4624 vector<int> new_up, new_acting;
4625 osdmap->pg_to_up_acting_osds(
4626 pgid.pgid, &new_up, &new_up_primary, &new_acting, &new_acting_primary);
4627
4628 // this is a bit imprecise, but sufficient?
4629 struct min_size_predicate_t : public IsPGRecoverablePredicate {
4630 const pg_pool_t *pi;
4631 bool operator()(const set<pg_shard_t> &have) const {
4632 return have.size() >= pi->min_size;
4633 }
4634 min_size_predicate_t(const pg_pool_t *i) : pi(i) {}
4635 } min_size_predicate(osdmap->get_pg_pool(pgid.pgid.pool()));
4636
4637 bool new_interval = PastIntervals::check_new_interval(
4638 acting_primary,
4639 new_acting_primary,
4640 acting, new_acting,
4641 up_primary,
4642 new_up_primary,
4643 up, new_up,
4644 h->same_interval_since,
4645 h->last_epoch_clean,
4646 osdmap,
4647 lastmap,
4648 pgid.pgid,
4649 &min_size_predicate,
4650 pi,
4651 &debug);
4652 if (new_interval) {
4653 h->same_interval_since = e;
4654 if (up != new_up) {
4655 h->same_up_since = e;
4656 }
4657 if (acting_primary != new_acting_primary) {
4658 h->same_primary_since = e;
4659 }
4660 if (pgid.pgid.is_split(lastmap->get_pg_num(pgid.pgid.pool()),
4661 osdmap->get_pg_num(pgid.pgid.pool()),
4662 nullptr)) {
4663 h->last_epoch_split = e;
4664 }
4665 up = new_up;
4666 acting = new_acting;
4667 up_primary = new_up_primary;
4668 acting_primary = new_acting_primary;
4669 }
4670 lastmap = osdmap;
4671 }
4672 dout(20) << __func__ << " " << debug.str() << dendl;
4673 dout(10) << __func__ << " " << *h << " " << *pi
4674 << " [" << (pi->empty() ? pair<epoch_t,epoch_t>(0,0) :
4675 pi->get_bounds()) << ")"
4676 << dendl;
4677 }
4678
4679 /**
4680 * Fill in the passed history so you know same_interval_since, same_up_since,
4681 * and same_primary_since.
4682 */
4683 bool OSD::project_pg_history(spg_t pgid, pg_history_t& h, epoch_t from,
4684 const vector<int>& currentup,
4685 int currentupprimary,
4686 const vector<int>& currentacting,
4687 int currentactingprimary)
4688 {
4689 dout(15) << "project_pg_history " << pgid
4690 << " from " << from << " to " << osdmap->get_epoch()
4691 << ", start " << h
4692 << dendl;
4693
4694 epoch_t e;
4695 for (e = osdmap->get_epoch();
4696 e > from;
4697 e--) {
4698 // verify during intermediate epoch (e-1)
4699 OSDMapRef oldmap = service.try_get_map(e-1);
4700 if (!oldmap) {
4701 dout(15) << __func__ << ": found map gap, returning false" << dendl;
4702 return false;
4703 }
4704 assert(oldmap->have_pg_pool(pgid.pool()));
4705
4706 int upprimary, actingprimary;
4707 vector<int> up, acting;
4708 oldmap->pg_to_up_acting_osds(
4709 pgid.pgid,
4710 &up,
4711 &upprimary,
4712 &acting,
4713 &actingprimary);
4714
4715 // acting set change?
4716 if ((actingprimary != currentactingprimary ||
4717 upprimary != currentupprimary ||
4718 acting != currentacting ||
4719 up != currentup) && e > h.same_interval_since) {
4720 dout(15) << "project_pg_history " << pgid << " acting|up changed in " << e
4721 << " from " << acting << "/" << up
4722 << " " << actingprimary << "/" << upprimary
4723 << " -> " << currentacting << "/" << currentup
4724 << " " << currentactingprimary << "/" << currentupprimary
4725 << dendl;
4726 h.same_interval_since = e;
4727 }
4728 // split?
4729 if (pgid.is_split(oldmap->get_pg_num(pgid.pool()),
4730 osdmap->get_pg_num(pgid.pool()),
4731 0) && e > h.same_interval_since) {
4732 h.same_interval_since = e;
4733 }
4734 // up set change?
4735 if ((up != currentup || upprimary != currentupprimary)
4736 && e > h.same_up_since) {
4737 dout(15) << "project_pg_history " << pgid << " up changed in " << e
4738 << " from " << up << " " << upprimary
4739 << " -> " << currentup << " " << currentupprimary << dendl;
4740 h.same_up_since = e;
4741 }
4742
4743 // primary change?
4744 if (OSDMap::primary_changed(
4745 actingprimary,
4746 acting,
4747 currentactingprimary,
4748 currentacting) &&
4749 e > h.same_primary_since) {
4750 dout(15) << "project_pg_history " << pgid << " primary changed in " << e << dendl;
4751 h.same_primary_since = e;
4752 }
4753
4754 if (h.same_interval_since >= e && h.same_up_since >= e && h.same_primary_since >= e)
4755 break;
4756 }
4757
4758 // base case: these floors should be the pg creation epoch if we didn't
4759 // find any changes.
4760 if (e == h.epoch_created) {
4761 if (!h.same_interval_since)
4762 h.same_interval_since = e;
4763 if (!h.same_up_since)
4764 h.same_up_since = e;
4765 if (!h.same_primary_since)
4766 h.same_primary_since = e;
4767 }
4768
4769 dout(15) << "project_pg_history end " << h << dendl;
4770 return true;
4771 }
4772
4773
4774
4775 void OSD::_add_heartbeat_peer(int p)
4776 {
4777 if (p == whoami)
4778 return;
4779 HeartbeatInfo *hi;
4780
4781 map<int,HeartbeatInfo>::iterator i = heartbeat_peers.find(p);
4782 if (i == heartbeat_peers.end()) {
4783 pair<ConnectionRef,ConnectionRef> cons = service.get_con_osd_hb(p, osdmap->get_epoch());
4784 if (!cons.first)
4785 return;
4786 hi = &heartbeat_peers[p];
4787 hi->peer = p;
4788 HeartbeatSession *s = new HeartbeatSession(p);
4789 hi->con_back = cons.first.get();
4790 hi->con_back->set_priv(s->get());
4791 if (cons.second) {
4792 hi->con_front = cons.second.get();
4793 hi->con_front->set_priv(s->get());
4794 dout(10) << "_add_heartbeat_peer: new peer osd." << p
4795 << " " << hi->con_back->get_peer_addr()
4796 << " " << hi->con_front->get_peer_addr()
4797 << dendl;
4798 } else {
4799 hi->con_front.reset(NULL);
4800 dout(10) << "_add_heartbeat_peer: new peer osd." << p
4801 << " " << hi->con_back->get_peer_addr()
4802 << dendl;
4803 }
4804 s->put();
4805 } else {
4806 hi = &i->second;
4807 }
4808 hi->epoch = osdmap->get_epoch();
4809 }
4810
4811 void OSD::_remove_heartbeat_peer(int n)
4812 {
4813 map<int,HeartbeatInfo>::iterator q = heartbeat_peers.find(n);
4814 assert(q != heartbeat_peers.end());
4815 dout(20) << " removing heartbeat peer osd." << n
4816 << " " << q->second.con_back->get_peer_addr()
4817 << " " << (q->second.con_front ? q->second.con_front->get_peer_addr() : entity_addr_t())
4818 << dendl;
4819 q->second.con_back->mark_down();
4820 if (q->second.con_front) {
4821 q->second.con_front->mark_down();
4822 }
4823 heartbeat_peers.erase(q);
4824 }
4825
4826 void OSD::need_heartbeat_peer_update()
4827 {
4828 if (is_stopping())
4829 return;
4830 dout(20) << "need_heartbeat_peer_update" << dendl;
4831 heartbeat_set_peers_need_update();
4832 }
4833
4834 void OSD::maybe_update_heartbeat_peers()
4835 {
4836 assert(osd_lock.is_locked());
4837
4838 if (is_waiting_for_healthy()) {
4839 utime_t now = ceph_clock_now();
4840 if (last_heartbeat_resample == utime_t()) {
4841 last_heartbeat_resample = now;
4842 heartbeat_set_peers_need_update();
4843 } else if (!heartbeat_peers_need_update()) {
4844 utime_t dur = now - last_heartbeat_resample;
4845 if (dur > cct->_conf->osd_heartbeat_grace) {
4846 dout(10) << "maybe_update_heartbeat_peers forcing update after " << dur << " seconds" << dendl;
4847 heartbeat_set_peers_need_update();
4848 last_heartbeat_resample = now;
4849 reset_heartbeat_peers(); // we want *new* peers!
4850 }
4851 }
4852 }
4853
4854 if (!heartbeat_peers_need_update())
4855 return;
4856 heartbeat_clear_peers_need_update();
4857
4858 Mutex::Locker l(heartbeat_lock);
4859
4860 dout(10) << "maybe_update_heartbeat_peers updating" << dendl;
4861
4862
4863 // build heartbeat from set
4864 if (is_active()) {
4865 RWLock::RLocker l(pg_map_lock);
4866 for (ceph::unordered_map<spg_t, PG*>::iterator i = pg_map.begin();
4867 i != pg_map.end();
4868 ++i) {
4869 PG *pg = i->second;
4870 pg->heartbeat_peer_lock.Lock();
4871 dout(20) << i->first << " heartbeat_peers " << pg->heartbeat_peers << dendl;
4872 for (set<int>::iterator p = pg->heartbeat_peers.begin();
4873 p != pg->heartbeat_peers.end();
4874 ++p)
4875 if (osdmap->is_up(*p))
4876 _add_heartbeat_peer(*p);
4877 for (set<int>::iterator p = pg->probe_targets.begin();
4878 p != pg->probe_targets.end();
4879 ++p)
4880 if (osdmap->is_up(*p))
4881 _add_heartbeat_peer(*p);
4882 pg->heartbeat_peer_lock.Unlock();
4883 }
4884 }
4885
4886 // include next and previous up osds to ensure we have a fully-connected set
4887 set<int> want, extras;
4888 int next = osdmap->get_next_up_osd_after(whoami);
4889 if (next >= 0)
4890 want.insert(next);
4891 int prev = osdmap->get_previous_up_osd_before(whoami);
4892 if (prev >= 0 && prev != next)
4893 want.insert(prev);
4894
4895 for (set<int>::iterator p = want.begin(); p != want.end(); ++p) {
4896 dout(10) << " adding neighbor peer osd." << *p << dendl;
4897 extras.insert(*p);
4898 _add_heartbeat_peer(*p);
4899 }
4900
4901 // remove down peers; enumerate extras
4902 map<int,HeartbeatInfo>::iterator p = heartbeat_peers.begin();
4903 while (p != heartbeat_peers.end()) {
4904 if (!osdmap->is_up(p->first)) {
4905 int o = p->first;
4906 ++p;
4907 _remove_heartbeat_peer(o);
4908 continue;
4909 }
4910 if (p->second.epoch < osdmap->get_epoch()) {
4911 extras.insert(p->first);
4912 }
4913 ++p;
4914 }
4915
4916 // too few?
4917 int start = osdmap->get_next_up_osd_after(whoami);
4918 for (int n = start; n >= 0; ) {
4919 if ((int)heartbeat_peers.size() >= cct->_conf->osd_heartbeat_min_peers)
4920 break;
4921 if (!extras.count(n) && !want.count(n) && n != whoami) {
4922 dout(10) << " adding random peer osd." << n << dendl;
4923 extras.insert(n);
4924 _add_heartbeat_peer(n);
4925 }
4926 n = osdmap->get_next_up_osd_after(n);
4927 if (n == start)
4928 break; // came full circle; stop
4929 }
4930
4931 // too many?
4932 for (set<int>::iterator p = extras.begin();
4933 (int)heartbeat_peers.size() > cct->_conf->osd_heartbeat_min_peers && p != extras.end();
4934 ++p) {
4935 if (want.count(*p))
4936 continue;
4937 _remove_heartbeat_peer(*p);
4938 }
4939
4940 dout(10) << "maybe_update_heartbeat_peers " << heartbeat_peers.size() << " peers, extras " << extras << dendl;
4941 }
4942
4943 void OSD::reset_heartbeat_peers()
4944 {
4945 assert(osd_lock.is_locked());
4946 dout(10) << "reset_heartbeat_peers" << dendl;
4947 Mutex::Locker l(heartbeat_lock);
4948 while (!heartbeat_peers.empty()) {
4949 HeartbeatInfo& hi = heartbeat_peers.begin()->second;
4950 hi.con_back->mark_down();
4951 if (hi.con_front) {
4952 hi.con_front->mark_down();
4953 }
4954 heartbeat_peers.erase(heartbeat_peers.begin());
4955 }
4956 failure_queue.clear();
4957 }
4958
4959 void OSD::handle_osd_ping(MOSDPing *m)
4960 {
4961 if (superblock.cluster_fsid != m->fsid) {
4962 dout(20) << "handle_osd_ping from " << m->get_source_inst()
4963 << " bad fsid " << m->fsid << " != " << superblock.cluster_fsid << dendl;
4964 m->put();
4965 return;
4966 }
4967
4968 int from = m->get_source().num();
4969
4970 heartbeat_lock.Lock();
4971 if (is_stopping()) {
4972 heartbeat_lock.Unlock();
4973 m->put();
4974 return;
4975 }
4976
4977 OSDMapRef curmap = service.get_osdmap();
4978 if (!curmap) {
4979 heartbeat_lock.Unlock();
4980 m->put();
4981 return;
4982 }
4983
4984 switch (m->op) {
4985
4986 case MOSDPing::PING:
4987 {
4988 if (cct->_conf->osd_debug_drop_ping_probability > 0) {
4989 auto heartbeat_drop = debug_heartbeat_drops_remaining.find(from);
4990 if (heartbeat_drop != debug_heartbeat_drops_remaining.end()) {
4991 if (heartbeat_drop->second == 0) {
4992 debug_heartbeat_drops_remaining.erase(heartbeat_drop);
4993 } else {
4994 --heartbeat_drop->second;
4995 dout(5) << "Dropping heartbeat from " << from
4996 << ", " << heartbeat_drop->second
4997 << " remaining to drop" << dendl;
4998 break;
4999 }
5000 } else if (cct->_conf->osd_debug_drop_ping_probability >
5001 ((((double)(rand()%100))/100.0))) {
5002 heartbeat_drop =
5003 debug_heartbeat_drops_remaining.insert(std::make_pair(from,
5004 cct->_conf->osd_debug_drop_ping_duration)).first;
5005 dout(5) << "Dropping heartbeat from " << from
5006 << ", " << heartbeat_drop->second
5007 << " remaining to drop" << dendl;
5008 break;
5009 }
5010 }
5011
5012 if (!cct->get_heartbeat_map()->is_healthy()) {
5013 dout(10) << "internal heartbeat not healthy, dropping ping request" << dendl;
5014 break;
5015 }
5016
5017 Message *r = new MOSDPing(monc->get_fsid(),
5018 curmap->get_epoch(),
5019 MOSDPing::PING_REPLY, m->stamp,
5020 cct->_conf->osd_heartbeat_min_size);
5021 m->get_connection()->send_message(r);
5022
5023 if (curmap->is_up(from)) {
5024 service.note_peer_epoch(from, m->map_epoch);
5025 if (is_active()) {
5026 ConnectionRef con = service.get_con_osd_cluster(from, curmap->get_epoch());
5027 if (con) {
5028 service.share_map_peer(from, con.get());
5029 }
5030 }
5031 } else if (!curmap->exists(from) ||
5032 curmap->get_down_at(from) > m->map_epoch) {
5033 // tell them they have died
5034 Message *r = new MOSDPing(monc->get_fsid(),
5035 curmap->get_epoch(),
5036 MOSDPing::YOU_DIED,
5037 m->stamp,
5038 cct->_conf->osd_heartbeat_min_size);
5039 m->get_connection()->send_message(r);
5040 }
5041 }
5042 break;
5043
5044 case MOSDPing::PING_REPLY:
5045 {
5046 map<int,HeartbeatInfo>::iterator i = heartbeat_peers.find(from);
5047 if (i != heartbeat_peers.end()) {
5048 if (m->get_connection() == i->second.con_back) {
5049 dout(25) << "handle_osd_ping got reply from osd." << from
5050 << " first_tx " << i->second.first_tx
5051 << " last_tx " << i->second.last_tx
5052 << " last_rx_back " << i->second.last_rx_back << " -> " << m->stamp
5053 << " last_rx_front " << i->second.last_rx_front
5054 << dendl;
5055 i->second.last_rx_back = m->stamp;
5056 // if there is no front con, set both stamps.
5057 if (i->second.con_front == NULL)
5058 i->second.last_rx_front = m->stamp;
5059 } else if (m->get_connection() == i->second.con_front) {
5060 dout(25) << "handle_osd_ping got reply from osd." << from
5061 << " first_tx " << i->second.first_tx
5062 << " last_tx " << i->second.last_tx
5063 << " last_rx_back " << i->second.last_rx_back
5064 << " last_rx_front " << i->second.last_rx_front << " -> " << m->stamp
5065 << dendl;
5066 i->second.last_rx_front = m->stamp;
5067 }
5068
5069 utime_t cutoff = ceph_clock_now();
5070 cutoff -= cct->_conf->osd_heartbeat_grace;
5071 if (i->second.is_healthy(cutoff)) {
5072 // Cancel false reports
5073 auto failure_queue_entry = failure_queue.find(from);
5074 if (failure_queue_entry != failure_queue.end()) {
5075 dout(10) << "handle_osd_ping canceling queued "
5076 << "failure report for osd." << from << dendl;
5077 failure_queue.erase(failure_queue_entry);
5078 }
5079
5080 auto failure_pending_entry = failure_pending.find(from);
5081 if (failure_pending_entry != failure_pending.end()) {
5082 dout(10) << "handle_osd_ping canceling in-flight "
5083 << "failure report for osd." << from << dendl;
5084 send_still_alive(curmap->get_epoch(),
5085 failure_pending_entry->second.second);
5086 failure_pending.erase(failure_pending_entry);
5087 }
5088 }
5089 }
5090
5091 if (m->map_epoch &&
5092 curmap->is_up(from)) {
5093 service.note_peer_epoch(from, m->map_epoch);
5094 if (is_active()) {
5095 ConnectionRef con = service.get_con_osd_cluster(from, curmap->get_epoch());
5096 if (con) {
5097 service.share_map_peer(from, con.get());
5098 }
5099 }
5100 }
5101 }
5102 break;
5103
5104 case MOSDPing::YOU_DIED:
5105 dout(10) << "handle_osd_ping " << m->get_source_inst()
5106 << " says i am down in " << m->map_epoch << dendl;
5107 osdmap_subscribe(curmap->get_epoch()+1, false);
5108 break;
5109 }
5110
5111 heartbeat_lock.Unlock();
5112 m->put();
5113 }
5114
5115 void OSD::heartbeat_entry()
5116 {
5117 Mutex::Locker l(heartbeat_lock);
5118 if (is_stopping())
5119 return;
5120 while (!heartbeat_stop) {
5121 heartbeat();
5122
5123 double wait = .5 + ((float)(rand() % 10)/10.0) * (float)cct->_conf->osd_heartbeat_interval;
5124 utime_t w;
5125 w.set_from_double(wait);
5126 dout(30) << "heartbeat_entry sleeping for " << wait << dendl;
5127 heartbeat_cond.WaitInterval(heartbeat_lock, w);
5128 if (is_stopping())
5129 return;
5130 dout(30) << "heartbeat_entry woke up" << dendl;
5131 }
5132 }
5133
5134 void OSD::heartbeat_check()
5135 {
5136 assert(heartbeat_lock.is_locked());
5137 utime_t now = ceph_clock_now();
5138
5139 // check for heartbeat replies (move me elsewhere?)
5140 utime_t cutoff = now;
5141 cutoff -= cct->_conf->osd_heartbeat_grace;
5142 for (map<int,HeartbeatInfo>::iterator p = heartbeat_peers.begin();
5143 p != heartbeat_peers.end();
5144 ++p) {
5145
5146 if (p->second.first_tx == utime_t()) {
5147 dout(25) << "heartbeat_check we haven't sent ping to osd." << p->first
5148 << "yet, skipping" << dendl;
5149 continue;
5150 }
5151
5152 dout(25) << "heartbeat_check osd." << p->first
5153 << " first_tx " << p->second.first_tx
5154 << " last_tx " << p->second.last_tx
5155 << " last_rx_back " << p->second.last_rx_back
5156 << " last_rx_front " << p->second.last_rx_front
5157 << dendl;
5158 if (p->second.is_unhealthy(cutoff)) {
5159 if (p->second.last_rx_back == utime_t() ||
5160 p->second.last_rx_front == utime_t()) {
5161 derr << "heartbeat_check: no reply from " << p->second.con_front->get_peer_addr().get_sockaddr()
5162 << " osd." << p->first << " ever on either front or back, first ping sent "
5163 << p->second.first_tx << " (cutoff " << cutoff << ")" << dendl;
5164 // fail
5165 failure_queue[p->first] = p->second.last_tx;
5166 } else {
5167 derr << "heartbeat_check: no reply from " << p->second.con_front->get_peer_addr().get_sockaddr()
5168 << " osd." << p->first << " since back " << p->second.last_rx_back
5169 << " front " << p->second.last_rx_front
5170 << " (cutoff " << cutoff << ")" << dendl;
5171 // fail
5172 failure_queue[p->first] = MIN(p->second.last_rx_back, p->second.last_rx_front);
5173 }
5174 }
5175 }
5176 }
5177
5178 void OSD::heartbeat()
5179 {
5180 dout(30) << "heartbeat" << dendl;
5181
5182 // get CPU load avg
5183 double loadavgs[1];
5184 int n_samples = 86400 / cct->_conf->osd_heartbeat_interval;
5185 if (getloadavg(loadavgs, 1) == 1) {
5186 logger->set(l_osd_loadavg, 100 * loadavgs[0]);
5187 daily_loadavg = (daily_loadavg * (n_samples - 1) + loadavgs[0]) / n_samples;
5188 dout(30) << "heartbeat: daily_loadavg " << daily_loadavg << dendl;
5189 }
5190
5191 dout(30) << "heartbeat checking stats" << dendl;
5192
5193 // refresh stats?
5194 vector<int> hb_peers;
5195 for (map<int,HeartbeatInfo>::iterator p = heartbeat_peers.begin();
5196 p != heartbeat_peers.end();
5197 ++p)
5198 hb_peers.push_back(p->first);
5199 service.update_osd_stat(hb_peers);
5200
5201 dout(5) << "heartbeat: " << service.get_osd_stat() << dendl;
5202
5203 utime_t now = ceph_clock_now();
5204
5205 // send heartbeats
5206 for (map<int,HeartbeatInfo>::iterator i = heartbeat_peers.begin();
5207 i != heartbeat_peers.end();
5208 ++i) {
5209 int peer = i->first;
5210 i->second.last_tx = now;
5211 if (i->second.first_tx == utime_t())
5212 i->second.first_tx = now;
5213 dout(30) << "heartbeat sending ping to osd." << peer << dendl;
5214 i->second.con_back->send_message(new MOSDPing(monc->get_fsid(),
5215 service.get_osdmap()->get_epoch(),
5216 MOSDPing::PING, now,
5217 cct->_conf->osd_heartbeat_min_size));
5218
5219 if (i->second.con_front)
5220 i->second.con_front->send_message(new MOSDPing(monc->get_fsid(),
5221 service.get_osdmap()->get_epoch(),
5222 MOSDPing::PING, now,
5223 cct->_conf->osd_heartbeat_min_size));
5224 }
5225
5226 logger->set(l_osd_hb_to, heartbeat_peers.size());
5227
5228 // hmm.. am i all alone?
5229 dout(30) << "heartbeat lonely?" << dendl;
5230 if (heartbeat_peers.empty()) {
5231 if (now - last_mon_heartbeat > cct->_conf->osd_mon_heartbeat_interval && is_active()) {
5232 last_mon_heartbeat = now;
5233 dout(10) << "i have no heartbeat peers; checking mon for new map" << dendl;
5234 osdmap_subscribe(osdmap->get_epoch() + 1, false);
5235 }
5236 }
5237
5238 dout(30) << "heartbeat done" << dendl;
5239 }
5240
5241 bool OSD::heartbeat_reset(Connection *con)
5242 {
5243 HeartbeatSession *s = static_cast<HeartbeatSession*>(con->get_priv());
5244 if (s) {
5245 heartbeat_lock.Lock();
5246 if (is_stopping()) {
5247 heartbeat_lock.Unlock();
5248 s->put();
5249 return true;
5250 }
5251 map<int,HeartbeatInfo>::iterator p = heartbeat_peers.find(s->peer);
5252 if (p != heartbeat_peers.end() &&
5253 (p->second.con_back == con ||
5254 p->second.con_front == con)) {
5255 dout(10) << "heartbeat_reset failed hb con " << con << " for osd." << p->second.peer
5256 << ", reopening" << dendl;
5257 if (con != p->second.con_back) {
5258 p->second.con_back->mark_down();
5259 }
5260 p->second.con_back.reset(NULL);
5261 if (p->second.con_front && con != p->second.con_front) {
5262 p->second.con_front->mark_down();
5263 }
5264 p->second.con_front.reset(NULL);
5265 pair<ConnectionRef,ConnectionRef> newcon = service.get_con_osd_hb(p->second.peer, p->second.epoch);
5266 if (newcon.first) {
5267 p->second.con_back = newcon.first.get();
5268 p->second.con_back->set_priv(s->get());
5269 if (newcon.second) {
5270 p->second.con_front = newcon.second.get();
5271 p->second.con_front->set_priv(s->get());
5272 }
5273 } else {
5274 dout(10) << "heartbeat_reset failed hb con " << con << " for osd." << p->second.peer
5275 << ", raced with osdmap update, closing out peer" << dendl;
5276 heartbeat_peers.erase(p);
5277 }
5278 } else {
5279 dout(10) << "heartbeat_reset closing (old) failed hb con " << con << dendl;
5280 }
5281 heartbeat_lock.Unlock();
5282 s->put();
5283 }
5284 return true;
5285 }
5286
5287
5288
5289 // =========================================
5290
5291 void OSD::tick()
5292 {
5293 assert(osd_lock.is_locked());
5294 dout(10) << "tick" << dendl;
5295
5296 if (is_active() || is_waiting_for_healthy()) {
5297 maybe_update_heartbeat_peers();
5298 }
5299
5300 if (is_waiting_for_healthy()) {
5301 start_boot();
5302 } else if (is_preboot() &&
5303 waiting_for_luminous_mons &&
5304 monc->monmap.get_required_features().contains_all(
5305 ceph::features::mon::FEATURE_LUMINOUS)) {
5306 // mon upgrade finished!
5307 start_boot();
5308 }
5309
5310 do_waiters();
5311
5312 tick_timer.add_event_after(OSD_TICK_INTERVAL, new C_Tick(this));
5313 }
5314
5315 void OSD::tick_without_osd_lock()
5316 {
5317 assert(tick_timer_lock.is_locked());
5318 dout(10) << "tick_without_osd_lock" << dendl;
5319
5320 logger->set(l_osd_buf, buffer::get_total_alloc());
5321 logger->set(l_osd_history_alloc_bytes, SHIFT_ROUND_UP(buffer::get_history_alloc_bytes(), 20));
5322 logger->set(l_osd_history_alloc_num, buffer::get_history_alloc_num());
5323 logger->set(l_osd_cached_crc, buffer::get_cached_crc());
5324 logger->set(l_osd_cached_crc_adjusted, buffer::get_cached_crc_adjusted());
5325 logger->set(l_osd_missed_crc, buffer::get_missed_crc());
5326 logger->set(l_osd_pg_removing, remove_wq.get_remove_queue_len());
5327
5328 // osd_lock is not being held, which means the OSD state
5329 // might change when doing the monitor report
5330 if (is_active() || is_waiting_for_healthy()) {
5331 heartbeat_lock.Lock();
5332 heartbeat_check();
5333 heartbeat_lock.Unlock();
5334
5335 map_lock.get_read();
5336 Mutex::Locker l(mon_report_lock);
5337
5338 // mon report?
5339 bool reset = false;
5340 bool report = false;
5341 utime_t now = ceph_clock_now();
5342 pg_stat_queue_lock.Lock();
5343 double backoff = stats_ack_timeout / cct->_conf->osd_mon_ack_timeout;
5344 double adjusted_min = cct->_conf->osd_mon_report_interval_min * backoff;
5345 // note: we shouldn't adjust max because it must remain < the
5346 // mon's mon_osd_report_timeout (which defaults to 1.5x our
5347 // value).
5348 double max = cct->_conf->osd_mon_report_interval_max;
5349 if (!outstanding_pg_stats.empty() &&
5350 (now - stats_ack_timeout) > last_pg_stats_ack) {
5351 dout(1) << __func__ << " mon hasn't acked PGStats in "
5352 << now - last_pg_stats_ack
5353 << " seconds, reconnecting elsewhere" << dendl;
5354 reset = true;
5355 last_pg_stats_ack = now; // reset clock
5356 last_pg_stats_sent = utime_t();
5357 stats_ack_timeout =
5358 MAX(cct->_conf->osd_mon_ack_timeout,
5359 stats_ack_timeout * cct->_conf->osd_stats_ack_timeout_factor);
5360 outstanding_pg_stats.clear();
5361 }
5362 if (now - last_pg_stats_sent > max) {
5363 osd_stat_updated = true;
5364 report = true;
5365 } else if (service.need_fullness_update()) {
5366 report = true;
5367 } else if ((int)outstanding_pg_stats.size() >=
5368 cct->_conf->osd_mon_report_max_in_flight) {
5369 dout(20) << __func__ << " have max " << outstanding_pg_stats
5370 << " stats updates in flight" << dendl;
5371 } else {
5372 if (now - last_mon_report > adjusted_min) {
5373 dout(20) << __func__ << " stats backoff " << backoff
5374 << " adjusted_min " << adjusted_min << " - sending report"
5375 << dendl;
5376 osd_stat_updated = true;
5377 report = true;
5378 }
5379 }
5380 pg_stat_queue_lock.Unlock();
5381
5382 if (reset) {
5383 monc->reopen_session();
5384 } else if (report) {
5385 last_mon_report = now;
5386
5387 // do any pending reports
5388 send_full_update();
5389 send_failures();
5390 if (osdmap->require_osd_release < CEPH_RELEASE_LUMINOUS) {
5391 send_pg_stats(now);
5392 }
5393 }
5394 map_lock.put_read();
5395 }
5396
5397 if (is_active()) {
5398 if (!scrub_random_backoff()) {
5399 sched_scrub();
5400 }
5401 service.promote_throttle_recalibrate();
5402 resume_creating_pg();
5403 bool need_send_beacon = false;
5404 const auto now = ceph::coarse_mono_clock::now();
5405 {
5406 // borrow lec lock to pretect last_sent_beacon from changing
5407 Mutex::Locker l{min_last_epoch_clean_lock};
5408 const auto elapsed = now - last_sent_beacon;
5409 if (chrono::duration_cast<chrono::seconds>(elapsed).count() >
5410 cct->_conf->osd_beacon_report_interval) {
5411 need_send_beacon = true;
5412 }
5413 }
5414 if (need_send_beacon) {
5415 send_beacon(now);
5416 }
5417 }
5418
5419 mgrc.update_osd_health(get_health_metrics());
5420 service.kick_recovery_queue();
5421 tick_timer_without_osd_lock.add_event_after(OSD_TICK_INTERVAL, new C_Tick_WithoutOSDLock(this));
5422 }
5423
5424 void OSD::check_ops_in_flight()
5425 {
5426 vector<string> warnings;
5427 if (op_tracker.check_ops_in_flight(warnings)) {
5428 for (vector<string>::iterator i = warnings.begin();
5429 i != warnings.end();
5430 ++i) {
5431 clog->warn() << *i;
5432 }
5433 }
5434 }
5435
5436 // Usage:
5437 // setomapval <pool-id> [namespace/]<obj-name> <key> <val>
5438 // rmomapkey <pool-id> [namespace/]<obj-name> <key>
5439 // setomapheader <pool-id> [namespace/]<obj-name> <header>
5440 // getomap <pool> [namespace/]<obj-name>
5441 // truncobj <pool-id> [namespace/]<obj-name> <newlen>
5442 // injectmdataerr [namespace/]<obj-name> [shardid]
5443 // injectdataerr [namespace/]<obj-name> [shardid]
5444 //
5445 // set_recovery_delay [utime]
5446 void TestOpsSocketHook::test_ops(OSDService *service, ObjectStore *store,
5447 const std::string &command, cmdmap_t& cmdmap, ostream &ss)
5448 {
5449 //Test support
5450 //Support changing the omap on a single osd by using the Admin Socket to
5451 //directly request the osd make a change.
5452 if (command == "setomapval" || command == "rmomapkey" ||
5453 command == "setomapheader" || command == "getomap" ||
5454 command == "truncobj" || command == "injectmdataerr" ||
5455 command == "injectdataerr"
5456 ) {
5457 pg_t rawpg;
5458 int64_t pool;
5459 OSDMapRef curmap = service->get_osdmap();
5460 int r = -1;
5461
5462 string poolstr;
5463
5464 cmd_getval(service->cct, cmdmap, "pool", poolstr);
5465 pool = curmap->lookup_pg_pool_name(poolstr);
5466 //If we can't find it by name then maybe id specified
5467 if (pool < 0 && isdigit(poolstr[0]))
5468 pool = atoll(poolstr.c_str());
5469 if (pool < 0) {
5470 ss << "Invalid pool '" << poolstr << "''";
5471 return;
5472 }
5473
5474 string objname, nspace;
5475 cmd_getval(service->cct, cmdmap, "objname", objname);
5476 std::size_t found = objname.find_first_of('/');
5477 if (found != string::npos) {
5478 nspace = objname.substr(0, found);
5479 objname = objname.substr(found+1);
5480 }
5481 object_locator_t oloc(pool, nspace);
5482 r = curmap->object_locator_to_pg(object_t(objname), oloc, rawpg);
5483
5484 if (r < 0) {
5485 ss << "Invalid namespace/objname";
5486 return;
5487 }
5488
5489 int64_t shardid;
5490 cmd_getval(service->cct, cmdmap, "shardid", shardid, int64_t(shard_id_t::NO_SHARD));
5491 hobject_t obj(object_t(objname), string(""), CEPH_NOSNAP, rawpg.ps(), pool, nspace);
5492 ghobject_t gobj(obj, ghobject_t::NO_GEN, shard_id_t(uint8_t(shardid)));
5493 spg_t pgid(curmap->raw_pg_to_pg(rawpg), shard_id_t(shardid));
5494 if (curmap->pg_is_ec(rawpg)) {
5495 if ((command != "injectdataerr") && (command != "injectmdataerr")) {
5496 ss << "Must not call on ec pool, except injectdataerr or injectmdataerr";
5497 return;
5498 }
5499 }
5500
5501 ObjectStore::Transaction t;
5502
5503 if (command == "setomapval") {
5504 map<string, bufferlist> newattrs;
5505 bufferlist val;
5506 string key, valstr;
5507 cmd_getval(service->cct, cmdmap, "key", key);
5508 cmd_getval(service->cct, cmdmap, "val", valstr);
5509
5510 val.append(valstr);
5511 newattrs[key] = val;
5512 t.omap_setkeys(coll_t(pgid), ghobject_t(obj), newattrs);
5513 r = store->apply_transaction(service->meta_osr.get(), std::move(t));
5514 if (r < 0)
5515 ss << "error=" << r;
5516 else
5517 ss << "ok";
5518 } else if (command == "rmomapkey") {
5519 string key;
5520 set<string> keys;
5521 cmd_getval(service->cct, cmdmap, "key", key);
5522
5523 keys.insert(key);
5524 t.omap_rmkeys(coll_t(pgid), ghobject_t(obj), keys);
5525 r = store->apply_transaction(service->meta_osr.get(), std::move(t));
5526 if (r < 0)
5527 ss << "error=" << r;
5528 else
5529 ss << "ok";
5530 } else if (command == "setomapheader") {
5531 bufferlist newheader;
5532 string headerstr;
5533
5534 cmd_getval(service->cct, cmdmap, "header", headerstr);
5535 newheader.append(headerstr);
5536 t.omap_setheader(coll_t(pgid), ghobject_t(obj), newheader);
5537 r = store->apply_transaction(service->meta_osr.get(), std::move(t));
5538 if (r < 0)
5539 ss << "error=" << r;
5540 else
5541 ss << "ok";
5542 } else if (command == "getomap") {
5543 //Debug: Output entire omap
5544 bufferlist hdrbl;
5545 map<string, bufferlist> keyvals;
5546 r = store->omap_get(coll_t(pgid), ghobject_t(obj), &hdrbl, &keyvals);
5547 if (r >= 0) {
5548 ss << "header=" << string(hdrbl.c_str(), hdrbl.length());
5549 for (map<string, bufferlist>::iterator it = keyvals.begin();
5550 it != keyvals.end(); ++it)
5551 ss << " key=" << (*it).first << " val="
5552 << string((*it).second.c_str(), (*it).second.length());
5553 } else {
5554 ss << "error=" << r;
5555 }
5556 } else if (command == "truncobj") {
5557 int64_t trunclen;
5558 cmd_getval(service->cct, cmdmap, "len", trunclen);
5559 t.truncate(coll_t(pgid), ghobject_t(obj), trunclen);
5560 r = store->apply_transaction(service->meta_osr.get(), std::move(t));
5561 if (r < 0)
5562 ss << "error=" << r;
5563 else
5564 ss << "ok";
5565 } else if (command == "injectdataerr") {
5566 store->inject_data_error(gobj);
5567 ss << "ok";
5568 } else if (command == "injectmdataerr") {
5569 store->inject_mdata_error(gobj);
5570 ss << "ok";
5571 }
5572 return;
5573 }
5574 if (command == "set_recovery_delay") {
5575 int64_t delay;
5576 cmd_getval(service->cct, cmdmap, "utime", delay, (int64_t)0);
5577 ostringstream oss;
5578 oss << delay;
5579 int r = service->cct->_conf->set_val("osd_recovery_delay_start",
5580 oss.str().c_str());
5581 if (r != 0) {
5582 ss << "set_recovery_delay: error setting "
5583 << "osd_recovery_delay_start to '" << delay << "': error "
5584 << r;
5585 return;
5586 }
5587 service->cct->_conf->apply_changes(NULL);
5588 ss << "set_recovery_delay: set osd_recovery_delay_start "
5589 << "to " << service->cct->_conf->osd_recovery_delay_start;
5590 return;
5591 }
5592 if (command == "trigger_scrub") {
5593 spg_t pgid;
5594 OSDMapRef curmap = service->get_osdmap();
5595
5596 string pgidstr;
5597
5598 cmd_getval(service->cct, cmdmap, "pgid", pgidstr);
5599 if (!pgid.parse(pgidstr.c_str())) {
5600 ss << "Invalid pgid specified";
5601 return;
5602 }
5603
5604 PG *pg = service->osd->_lookup_lock_pg(pgid);
5605 if (pg == nullptr) {
5606 ss << "Can't find pg " << pgid;
5607 return;
5608 }
5609
5610 if (pg->is_primary()) {
5611 pg->unreg_next_scrub();
5612 const pg_pool_t *p = curmap->get_pg_pool(pgid.pool());
5613 double pool_scrub_max_interval = 0;
5614 p->opts.get(pool_opts_t::SCRUB_MAX_INTERVAL, &pool_scrub_max_interval);
5615 double scrub_max_interval = pool_scrub_max_interval > 0 ?
5616 pool_scrub_max_interval : g_conf->osd_scrub_max_interval;
5617 // Instead of marking must_scrub force a schedule scrub
5618 utime_t stamp = ceph_clock_now();
5619 stamp -= scrub_max_interval;
5620 stamp -= 100.0; // push back last scrub more for good measure
5621 pg->info.history.last_scrub_stamp = stamp;
5622 pg->reg_next_scrub();
5623 ss << "ok";
5624 } else {
5625 ss << "Not primary";
5626 }
5627 pg->unlock();
5628 return;
5629 }
5630 if (command == "injectfull") {
5631 int64_t count;
5632 string type;
5633 OSDService::s_names state;
5634 cmd_getval(service->cct, cmdmap, "type", type, string("full"));
5635 cmd_getval(service->cct, cmdmap, "count", count, (int64_t)-1);
5636 if (type == "none" || count == 0) {
5637 type = "none";
5638 count = 0;
5639 }
5640 state = service->get_full_state(type);
5641 if (state == OSDService::s_names::INVALID) {
5642 ss << "Invalid type use (none, nearfull, backfillfull, full, failsafe)";
5643 return;
5644 }
5645 service->set_injectfull(state, count);
5646 return;
5647 }
5648 ss << "Internal error - command=" << command;
5649 }
5650
5651 // =========================================
5652 bool remove_dir(
5653 CephContext *cct,
5654 ObjectStore *store, SnapMapper *mapper,
5655 OSDriver *osdriver,
5656 ObjectStore::Sequencer *osr,
5657 coll_t coll, DeletingStateRef dstate,
5658 bool *finished,
5659 ThreadPool::TPHandle &handle)
5660 {
5661 vector<ghobject_t> olist;
5662 int64_t num = 0;
5663 ObjectStore::Transaction t;
5664 ghobject_t next;
5665 handle.reset_tp_timeout();
5666 store->collection_list(
5667 coll,
5668 next,
5669 ghobject_t::get_max(),
5670 store->get_ideal_list_max(),
5671 &olist,
5672 &next);
5673 generic_dout(10) << __func__ << " " << olist << dendl;
5674 // default cont to true, this is safe because caller(OSD::RemoveWQ::_process())
5675 // will recheck the answer before it really goes on.
5676 bool cont = true;
5677 for (vector<ghobject_t>::iterator i = olist.begin();
5678 i != olist.end();
5679 ++i) {
5680 if (i->is_pgmeta())
5681 continue;
5682 OSDriver::OSTransaction _t(osdriver->get_transaction(&t));
5683 int r = mapper->remove_oid(i->hobj, &_t);
5684 if (r != 0 && r != -ENOENT) {
5685 ceph_abort();
5686 }
5687 t.remove(coll, *i);
5688 if (++num >= cct->_conf->osd_target_transaction_size) {
5689 C_SaferCond waiter;
5690 store->queue_transaction(osr, std::move(t), &waiter);
5691 cont = dstate->pause_clearing();
5692 handle.suspend_tp_timeout();
5693 waiter.wait();
5694 handle.reset_tp_timeout();
5695 if (cont)
5696 cont = dstate->resume_clearing();
5697 if (!cont)
5698 return false;
5699 t = ObjectStore::Transaction();
5700 num = 0;
5701 }
5702 }
5703 if (num) {
5704 C_SaferCond waiter;
5705 store->queue_transaction(osr, std::move(t), &waiter);
5706 cont = dstate->pause_clearing();
5707 handle.suspend_tp_timeout();
5708 waiter.wait();
5709 handle.reset_tp_timeout();
5710 if (cont)
5711 cont = dstate->resume_clearing();
5712 }
5713 // whether there are more objects to remove in the collection
5714 *finished = next.is_max();
5715 return cont;
5716 }
5717
5718 void OSD::RemoveWQ::_process(
5719 pair<PGRef, DeletingStateRef> item,
5720 ThreadPool::TPHandle &handle)
5721 {
5722 FUNCTRACE();
5723 PGRef pg(item.first);
5724 SnapMapper &mapper = pg->snap_mapper;
5725 OSDriver &driver = pg->osdriver;
5726 coll_t coll = coll_t(pg->info.pgid);
5727 pg->osr->flush();
5728 bool finished = false;
5729
5730 if (!item.second->start_or_resume_clearing())
5731 return;
5732
5733 bool cont = remove_dir(
5734 pg->cct, store, &mapper, &driver, pg->osr.get(), coll, item.second,
5735 &finished, handle);
5736 if (!cont)
5737 return;
5738 if (!finished) {
5739 if (item.second->pause_clearing())
5740 queue_front(item);
5741 return;
5742 }
5743
5744 if (!item.second->start_deleting())
5745 return;
5746
5747 ObjectStore::Transaction t;
5748 PGLog::clear_info_log(pg->info.pgid, &t);
5749
5750 if (cct->_conf->osd_inject_failure_on_pg_removal) {
5751 generic_derr << "osd_inject_failure_on_pg_removal" << dendl;
5752 _exit(1);
5753 }
5754 t.remove_collection(coll);
5755
5756 // We need the sequencer to stick around until the op is complete
5757 store->queue_transaction(
5758 pg->osr.get(),
5759 std::move(t),
5760 0, // onapplied
5761 0, // oncommit
5762 0, // onreadable sync
5763 new ContainerContext<PGRef>(pg),
5764 TrackedOpRef());
5765
5766 item.second->finish_deleting();
5767 }
5768 // =========================================
5769
5770 void OSD::ms_handle_connect(Connection *con)
5771 {
5772 dout(10) << __func__ << " con " << con << dendl;
5773 if (con->get_peer_type() == CEPH_ENTITY_TYPE_MON) {
5774 Mutex::Locker l(osd_lock);
5775 if (is_stopping())
5776 return;
5777 dout(10) << __func__ << " on mon" << dendl;
5778
5779 if (is_preboot()) {
5780 start_boot();
5781 } else if (is_booting()) {
5782 _send_boot(); // resend boot message
5783 } else {
5784 map_lock.get_read();
5785 Mutex::Locker l2(mon_report_lock);
5786
5787 utime_t now = ceph_clock_now();
5788 last_mon_report = now;
5789
5790 // resend everything, it's a new session
5791 send_full_update();
5792 send_alive();
5793 service.requeue_pg_temp();
5794 service.send_pg_temp();
5795 requeue_failures();
5796 send_failures();
5797 if (osdmap->require_osd_release < CEPH_RELEASE_LUMINOUS) {
5798 send_pg_stats(now);
5799 }
5800
5801 map_lock.put_read();
5802 if (is_active()) {
5803 send_beacon(ceph::coarse_mono_clock::now());
5804 }
5805 }
5806
5807 // full map requests may happen while active or pre-boot
5808 if (requested_full_first) {
5809 rerequest_full_maps();
5810 }
5811 }
5812 }
5813
5814 void OSD::ms_handle_fast_connect(Connection *con)
5815 {
5816 if (con->get_peer_type() != CEPH_ENTITY_TYPE_MON &&
5817 con->get_peer_type() != CEPH_ENTITY_TYPE_MGR) {
5818 Session *s = static_cast<Session*>(con->get_priv());
5819 if (!s) {
5820 s = new Session(cct);
5821 con->set_priv(s->get());
5822 s->con = con;
5823 dout(10) << " new session (outgoing) " << s << " con=" << s->con
5824 << " addr=" << s->con->get_peer_addr() << dendl;
5825 // we don't connect to clients
5826 assert(con->get_peer_type() == CEPH_ENTITY_TYPE_OSD);
5827 s->entity_name.set_type(CEPH_ENTITY_TYPE_OSD);
5828 }
5829 s->put();
5830 }
5831 }
5832
5833 void OSD::ms_handle_fast_accept(Connection *con)
5834 {
5835 if (con->get_peer_type() != CEPH_ENTITY_TYPE_MON &&
5836 con->get_peer_type() != CEPH_ENTITY_TYPE_MGR) {
5837 Session *s = static_cast<Session*>(con->get_priv());
5838 if (!s) {
5839 s = new Session(cct);
5840 con->set_priv(s->get());
5841 s->con = con;
5842 dout(10) << "new session (incoming)" << s << " con=" << con
5843 << " addr=" << con->get_peer_addr()
5844 << " must have raced with connect" << dendl;
5845 assert(con->get_peer_type() == CEPH_ENTITY_TYPE_OSD);
5846 s->entity_name.set_type(CEPH_ENTITY_TYPE_OSD);
5847 }
5848 s->put();
5849 }
5850 }
5851
5852 bool OSD::ms_handle_reset(Connection *con)
5853 {
5854 Session *session = static_cast<Session*>(con->get_priv());
5855 dout(2) << "ms_handle_reset con " << con << " session " << session << dendl;
5856 if (!session)
5857 return false;
5858 session->wstate.reset(con);
5859 session->con.reset(NULL); // break con <-> session ref cycle
5860 // note that we break session->con *before* the session_handle_reset
5861 // cleanup below. this avoids a race between us and
5862 // PG::add_backoff, Session::check_backoff, etc.
5863 session_handle_reset(session);
5864 session->put();
5865 return true;
5866 }
5867
5868 bool OSD::ms_handle_refused(Connection *con)
5869 {
5870 if (!cct->_conf->osd_fast_fail_on_connection_refused)
5871 return false;
5872
5873 Session *session = static_cast<Session*>(con->get_priv());
5874 dout(2) << "ms_handle_refused con " << con << " session " << session << dendl;
5875 if (!session)
5876 return false;
5877 int type = con->get_peer_type();
5878 // handle only OSD failures here
5879 if (monc && (type == CEPH_ENTITY_TYPE_OSD)) {
5880 OSDMapRef osdmap = get_osdmap();
5881 if (osdmap) {
5882 int id = osdmap->identify_osd_on_all_channels(con->get_peer_addr());
5883 if (id >= 0 && osdmap->is_up(id)) {
5884 // I'm cheating mon heartbeat grace logic, because we know it's not going
5885 // to respawn alone. +1 so we won't hit any boundary case.
5886 monc->send_mon_message(new MOSDFailure(monc->get_fsid(),
5887 osdmap->get_inst(id),
5888 cct->_conf->osd_heartbeat_grace + 1,
5889 osdmap->get_epoch(),
5890 MOSDFailure::FLAG_IMMEDIATE | MOSDFailure::FLAG_FAILED
5891 ));
5892 }
5893 }
5894 }
5895 session->put();
5896 return true;
5897 }
5898
5899 struct C_OSD_GetVersion : public Context {
5900 OSD *osd;
5901 uint64_t oldest, newest;
5902 explicit C_OSD_GetVersion(OSD *o) : osd(o), oldest(0), newest(0) {}
5903 void finish(int r) override {
5904 if (r >= 0)
5905 osd->_got_mon_epochs(oldest, newest);
5906 }
5907 };
5908
5909 void OSD::start_boot()
5910 {
5911 if (!_is_healthy()) {
5912 // if we are not healthy, do not mark ourselves up (yet)
5913 dout(1) << "not healthy; waiting to boot" << dendl;
5914 if (!is_waiting_for_healthy())
5915 start_waiting_for_healthy();
5916 // send pings sooner rather than later
5917 heartbeat_kick();
5918 return;
5919 }
5920 dout(1) << __func__ << dendl;
5921 set_state(STATE_PREBOOT);
5922 waiting_for_luminous_mons = false;
5923 dout(10) << "start_boot - have maps " << superblock.oldest_map
5924 << ".." << superblock.newest_map << dendl;
5925 C_OSD_GetVersion *c = new C_OSD_GetVersion(this);
5926 monc->get_version("osdmap", &c->newest, &c->oldest, c);
5927 }
5928
5929 void OSD::_got_mon_epochs(epoch_t oldest, epoch_t newest)
5930 {
5931 Mutex::Locker l(osd_lock);
5932 if (is_preboot()) {
5933 _preboot(oldest, newest);
5934 }
5935 }
5936
5937 void OSD::_preboot(epoch_t oldest, epoch_t newest)
5938 {
5939 assert(is_preboot());
5940 dout(10) << __func__ << " _preboot mon has osdmaps "
5941 << oldest << ".." << newest << dendl;
5942
5943 // ensure our local fullness awareness is accurate
5944 heartbeat();
5945
5946 // if our map within recent history, try to add ourselves to the osdmap.
5947 if (osdmap->get_epoch() == 0) {
5948 derr << "waiting for initial osdmap" << dendl;
5949 } else if (osdmap->is_destroyed(whoami)) {
5950 derr << "osdmap says I am destroyed" << dendl;
5951 // provide a small margin so we don't livelock seeing if we
5952 // un-destroyed ourselves.
5953 if (osdmap->get_epoch() > newest - 1) {
5954 exit(0);
5955 }
5956 } else if (osdmap->test_flag(CEPH_OSDMAP_NOUP) || osdmap->is_noup(whoami)) {
5957 derr << "osdmap NOUP flag is set, waiting for it to clear" << dendl;
5958 } else if (!osdmap->test_flag(CEPH_OSDMAP_SORTBITWISE)) {
5959 derr << "osdmap SORTBITWISE OSDMap flag is NOT set; please set it"
5960 << dendl;
5961 } else if (osdmap->require_osd_release < CEPH_RELEASE_JEWEL) {
5962 derr << "osdmap REQUIRE_JEWEL OSDMap flag is NOT set; please set it"
5963 << dendl;
5964 } else if (!monc->monmap.get_required_features().contains_all(
5965 ceph::features::mon::FEATURE_LUMINOUS)) {
5966 derr << "monmap REQUIRE_LUMINOUS is NOT set; must upgrade all monitors to "
5967 << "Luminous or later before Luminous OSDs will boot" << dendl;
5968 waiting_for_luminous_mons = true;
5969 } else if (service.need_fullness_update()) {
5970 derr << "osdmap fullness state needs update" << dendl;
5971 send_full_update();
5972 } else if (osdmap->get_epoch() >= oldest - 1 &&
5973 osdmap->get_epoch() + cct->_conf->osd_map_message_max > newest) {
5974 _send_boot();
5975 return;
5976 }
5977
5978 // get all the latest maps
5979 if (osdmap->get_epoch() + 1 >= oldest)
5980 osdmap_subscribe(osdmap->get_epoch() + 1, false);
5981 else
5982 osdmap_subscribe(oldest - 1, true);
5983 }
5984
5985 void OSD::send_full_update()
5986 {
5987 if (!service.need_fullness_update())
5988 return;
5989 unsigned state = 0;
5990 if (service.is_full()) {
5991 state = CEPH_OSD_FULL;
5992 } else if (service.is_backfillfull()) {
5993 state = CEPH_OSD_BACKFILLFULL;
5994 } else if (service.is_nearfull()) {
5995 state = CEPH_OSD_NEARFULL;
5996 }
5997 set<string> s;
5998 OSDMap::calc_state_set(state, s);
5999 dout(10) << __func__ << " want state " << s << dendl;
6000 monc->send_mon_message(new MOSDFull(osdmap->get_epoch(), state));
6001 }
6002
6003 void OSD::start_waiting_for_healthy()
6004 {
6005 dout(1) << "start_waiting_for_healthy" << dendl;
6006 set_state(STATE_WAITING_FOR_HEALTHY);
6007 last_heartbeat_resample = utime_t();
6008
6009 // subscribe to osdmap updates, in case our peers really are known to be dead
6010 osdmap_subscribe(osdmap->get_epoch() + 1, false);
6011 }
6012
6013 bool OSD::_is_healthy()
6014 {
6015 if (!cct->get_heartbeat_map()->is_healthy()) {
6016 dout(1) << "is_healthy false -- internal heartbeat failed" << dendl;
6017 return false;
6018 }
6019
6020 if (is_waiting_for_healthy()) {
6021 Mutex::Locker l(heartbeat_lock);
6022 utime_t cutoff = ceph_clock_now();
6023 cutoff -= cct->_conf->osd_heartbeat_grace;
6024 int num = 0, up = 0;
6025 for (map<int,HeartbeatInfo>::iterator p = heartbeat_peers.begin();
6026 p != heartbeat_peers.end();
6027 ++p) {
6028 if (p->second.is_healthy(cutoff))
6029 ++up;
6030 ++num;
6031 }
6032 if ((float)up < (float)num * cct->_conf->osd_heartbeat_min_healthy_ratio) {
6033 dout(1) << "is_healthy false -- only " << up << "/" << num << " up peers (less than "
6034 << int(cct->_conf->osd_heartbeat_min_healthy_ratio * 100.0) << "%)" << dendl;
6035 return false;
6036 }
6037 }
6038
6039 return true;
6040 }
6041
6042 void OSD::_send_boot()
6043 {
6044 dout(10) << "_send_boot" << dendl;
6045 entity_addr_t cluster_addr = cluster_messenger->get_myaddr();
6046 Connection *local_connection = cluster_messenger->get_loopback_connection().get();
6047 if (cluster_addr.is_blank_ip()) {
6048 int port = cluster_addr.get_port();
6049 cluster_addr = client_messenger->get_myaddr();
6050 cluster_addr.set_port(port);
6051 cluster_messenger->set_addr_unknowns(cluster_addr);
6052 dout(10) << " assuming cluster_addr ip matches client_addr" << dendl;
6053 } else {
6054 Session *s = static_cast<Session*>(local_connection->get_priv());
6055 if (s)
6056 s->put();
6057 else
6058 cluster_messenger->ms_deliver_handle_fast_connect(local_connection);
6059 }
6060
6061 entity_addr_t hb_back_addr = hb_back_server_messenger->get_myaddr();
6062 local_connection = hb_back_server_messenger->get_loopback_connection().get();
6063 if (hb_back_addr.is_blank_ip()) {
6064 int port = hb_back_addr.get_port();
6065 hb_back_addr = cluster_addr;
6066 hb_back_addr.set_port(port);
6067 hb_back_server_messenger->set_addr_unknowns(hb_back_addr);
6068 dout(10) << " assuming hb_back_addr ip matches cluster_addr" << dendl;
6069 } else {
6070 Session *s = static_cast<Session*>(local_connection->get_priv());
6071 if (s)
6072 s->put();
6073 else
6074 hb_back_server_messenger->ms_deliver_handle_fast_connect(local_connection);
6075 }
6076
6077 entity_addr_t hb_front_addr = hb_front_server_messenger->get_myaddr();
6078 local_connection = hb_front_server_messenger->get_loopback_connection().get();
6079 if (hb_front_addr.is_blank_ip()) {
6080 int port = hb_front_addr.get_port();
6081 hb_front_addr = client_messenger->get_myaddr();
6082 hb_front_addr.set_port(port);
6083 hb_front_server_messenger->set_addr_unknowns(hb_front_addr);
6084 dout(10) << " assuming hb_front_addr ip matches client_addr" << dendl;
6085 } else {
6086 Session *s = static_cast<Session*>(local_connection->get_priv());
6087 if (s)
6088 s->put();
6089 else
6090 hb_front_server_messenger->ms_deliver_handle_fast_connect(local_connection);
6091 }
6092
6093 MOSDBoot *mboot = new MOSDBoot(superblock, get_osdmap_epoch(), service.get_boot_epoch(),
6094 hb_back_addr, hb_front_addr, cluster_addr,
6095 CEPH_FEATURES_ALL);
6096 dout(10) << " client_addr " << client_messenger->get_myaddr()
6097 << ", cluster_addr " << cluster_addr
6098 << ", hb_back_addr " << hb_back_addr
6099 << ", hb_front_addr " << hb_front_addr
6100 << dendl;
6101 _collect_metadata(&mboot->metadata);
6102 monc->send_mon_message(mboot);
6103 set_state(STATE_BOOTING);
6104 }
6105
6106 void OSD::_collect_metadata(map<string,string> *pm)
6107 {
6108 // config info
6109 (*pm)["osd_data"] = dev_path;
6110 if (store->get_type() == "filestore") {
6111 // not applicable for bluestore
6112 (*pm)["osd_journal"] = journal_path;
6113 }
6114 (*pm)["front_addr"] = stringify(client_messenger->get_myaddr());
6115 (*pm)["back_addr"] = stringify(cluster_messenger->get_myaddr());
6116 (*pm)["hb_front_addr"] = stringify(hb_front_server_messenger->get_myaddr());
6117 (*pm)["hb_back_addr"] = stringify(hb_back_server_messenger->get_myaddr());
6118
6119 // backend
6120 (*pm)["osd_objectstore"] = store->get_type();
6121 (*pm)["rotational"] = store_is_rotational ? "1" : "0";
6122 (*pm)["journal_rotational"] = journal_is_rotational ? "1" : "0";
6123 (*pm)["default_device_class"] = store->get_default_device_class();
6124 store->collect_metadata(pm);
6125
6126 collect_sys_info(pm, cct);
6127
6128 std::string front_iface, back_iface;
6129 /*
6130 pick_iface(cct,
6131 CEPH_PICK_ADDRESS_PUBLIC | CEPH_PICK_ADDRESS_CLUSTER,
6132 &front_iface, &back_iface);
6133 */
6134 (*pm)["front_iface"] = pick_iface(cct,
6135 client_messenger->get_myaddr().get_sockaddr_storage());
6136 (*pm)["back_iface"] = pick_iface(cct,
6137 cluster_messenger->get_myaddr().get_sockaddr_storage());
6138
6139 dout(10) << __func__ << " " << *pm << dendl;
6140 }
6141
6142 void OSD::queue_want_up_thru(epoch_t want)
6143 {
6144 map_lock.get_read();
6145 epoch_t cur = osdmap->get_up_thru(whoami);
6146 Mutex::Locker l(mon_report_lock);
6147 if (want > up_thru_wanted) {
6148 dout(10) << "queue_want_up_thru now " << want << " (was " << up_thru_wanted << ")"
6149 << ", currently " << cur
6150 << dendl;
6151 up_thru_wanted = want;
6152 send_alive();
6153 } else {
6154 dout(10) << "queue_want_up_thru want " << want << " <= queued " << up_thru_wanted
6155 << ", currently " << cur
6156 << dendl;
6157 }
6158 map_lock.put_read();
6159 }
6160
6161 void OSD::send_alive()
6162 {
6163 assert(mon_report_lock.is_locked());
6164 if (!osdmap->exists(whoami))
6165 return;
6166 epoch_t up_thru = osdmap->get_up_thru(whoami);
6167 dout(10) << "send_alive up_thru currently " << up_thru << " want " << up_thru_wanted << dendl;
6168 if (up_thru_wanted > up_thru) {
6169 dout(10) << "send_alive want " << up_thru_wanted << dendl;
6170 monc->send_mon_message(new MOSDAlive(osdmap->get_epoch(), up_thru_wanted));
6171 }
6172 }
6173
6174 void OSD::request_full_map(epoch_t first, epoch_t last)
6175 {
6176 dout(10) << __func__ << " " << first << ".." << last
6177 << ", previously requested "
6178 << requested_full_first << ".." << requested_full_last << dendl;
6179 assert(osd_lock.is_locked());
6180 assert(first > 0 && last > 0);
6181 assert(first <= last);
6182 assert(first >= requested_full_first); // we shouldn't ever ask for older maps
6183 if (requested_full_first == 0) {
6184 // first request
6185 requested_full_first = first;
6186 requested_full_last = last;
6187 } else if (last <= requested_full_last) {
6188 // dup
6189 return;
6190 } else {
6191 // additional request
6192 first = requested_full_last + 1;
6193 requested_full_last = last;
6194 }
6195 MMonGetOSDMap *req = new MMonGetOSDMap;
6196 req->request_full(first, last);
6197 monc->send_mon_message(req);
6198 }
6199
6200 void OSD::got_full_map(epoch_t e)
6201 {
6202 assert(requested_full_first <= requested_full_last);
6203 assert(osd_lock.is_locked());
6204 if (requested_full_first == 0) {
6205 dout(20) << __func__ << " " << e << ", nothing requested" << dendl;
6206 return;
6207 }
6208 if (e < requested_full_first) {
6209 dout(10) << __func__ << " " << e << ", requested " << requested_full_first
6210 << ".." << requested_full_last
6211 << ", ignoring" << dendl;
6212 return;
6213 }
6214 if (e >= requested_full_last) {
6215 dout(10) << __func__ << " " << e << ", requested " << requested_full_first
6216 << ".." << requested_full_last << ", resetting" << dendl;
6217 requested_full_first = requested_full_last = 0;
6218 return;
6219 }
6220
6221 requested_full_first = e + 1;
6222
6223 dout(10) << __func__ << " " << e << ", requested " << requested_full_first
6224 << ".." << requested_full_last
6225 << ", still need more" << dendl;
6226 }
6227
6228 void OSD::requeue_failures()
6229 {
6230 Mutex::Locker l(heartbeat_lock);
6231 unsigned old_queue = failure_queue.size();
6232 unsigned old_pending = failure_pending.size();
6233 for (map<int,pair<utime_t,entity_inst_t> >::iterator p =
6234 failure_pending.begin();
6235 p != failure_pending.end(); ) {
6236 failure_queue[p->first] = p->second.first;
6237 failure_pending.erase(p++);
6238 }
6239 dout(10) << __func__ << " " << old_queue << " + " << old_pending << " -> "
6240 << failure_queue.size() << dendl;
6241 }
6242
6243 void OSD::send_failures()
6244 {
6245 assert(map_lock.is_locked());
6246 assert(mon_report_lock.is_locked());
6247 Mutex::Locker l(heartbeat_lock);
6248 utime_t now = ceph_clock_now();
6249 while (!failure_queue.empty()) {
6250 int osd = failure_queue.begin()->first;
6251 if (!failure_pending.count(osd)) {
6252 entity_inst_t i = osdmap->get_inst(osd);
6253 int failed_for = (int)(double)(now - failure_queue.begin()->second);
6254 monc->send_mon_message(new MOSDFailure(monc->get_fsid(), i, failed_for,
6255 osdmap->get_epoch()));
6256 failure_pending[osd] = make_pair(failure_queue.begin()->second, i);
6257 }
6258 failure_queue.erase(osd);
6259 }
6260 }
6261
6262 void OSD::send_still_alive(epoch_t epoch, const entity_inst_t &i)
6263 {
6264 MOSDFailure *m = new MOSDFailure(monc->get_fsid(), i, 0, epoch, MOSDFailure::FLAG_ALIVE);
6265 monc->send_mon_message(m);
6266 }
6267
6268 void OSD::send_pg_stats(const utime_t &now)
6269 {
6270 assert(map_lock.is_locked());
6271 assert(osdmap->require_osd_release < CEPH_RELEASE_LUMINOUS);
6272 dout(20) << "send_pg_stats" << dendl;
6273
6274 osd_stat_t cur_stat = service.get_osd_stat();
6275
6276 cur_stat.os_perf_stat = store->get_cur_stats();
6277
6278 pg_stat_queue_lock.Lock();
6279
6280 if (osd_stat_updated || !pg_stat_queue.empty()) {
6281 last_pg_stats_sent = now;
6282 osd_stat_updated = false;
6283
6284 dout(10) << "send_pg_stats - " << pg_stat_queue.size() << " pgs updated" << dendl;
6285
6286 utime_t had_for(now);
6287 had_for -= had_map_since;
6288
6289 MPGStats *m = new MPGStats(monc->get_fsid(), osdmap->get_epoch(), had_for);
6290
6291 uint64_t tid = ++pg_stat_tid;
6292 m->set_tid(tid);
6293 m->osd_stat = cur_stat;
6294
6295 xlist<PG*>::iterator p = pg_stat_queue.begin();
6296 while (!p.end()) {
6297 PG *pg = *p;
6298 ++p;
6299 if (!pg->is_primary()) { // we hold map_lock; role is stable.
6300 pg->stat_queue_item.remove_myself();
6301 pg->put("pg_stat_queue");
6302 continue;
6303 }
6304 pg->pg_stats_publish_lock.Lock();
6305 if (pg->pg_stats_publish_valid) {
6306 m->pg_stat[pg->info.pgid.pgid] = pg->pg_stats_publish;
6307 dout(25) << " sending " << pg->info.pgid << " " << pg->pg_stats_publish.reported_epoch << ":"
6308 << pg->pg_stats_publish.reported_seq << dendl;
6309 } else {
6310 dout(25) << " NOT sending " << pg->info.pgid << " " << pg->pg_stats_publish.reported_epoch << ":"
6311 << pg->pg_stats_publish.reported_seq << ", not valid" << dendl;
6312 }
6313 pg->pg_stats_publish_lock.Unlock();
6314 }
6315
6316 if (last_pg_stats_ack == utime_t() || !outstanding_pg_stats.empty()) {
6317 last_pg_stats_ack = ceph_clock_now();
6318 }
6319 outstanding_pg_stats.insert(tid);
6320 dout(20) << __func__ << " updates pending: " << outstanding_pg_stats << dendl;
6321
6322 monc->send_mon_message(m);
6323 }
6324
6325 pg_stat_queue_lock.Unlock();
6326 }
6327
6328 void OSD::handle_pg_stats_ack(MPGStatsAck *ack)
6329 {
6330 dout(10) << "handle_pg_stats_ack " << dendl;
6331
6332 if (!require_mon_peer(ack)) {
6333 ack->put();
6334 return;
6335 }
6336
6337 // NOTE: we may get replies from a previous mon even while
6338 // outstanding_pg_stats is empty if reconnecting races with replies
6339 // in flight.
6340
6341 pg_stat_queue_lock.Lock();
6342
6343 last_pg_stats_ack = ceph_clock_now();
6344
6345 // decay timeout slowly (analogous to TCP)
6346 stats_ack_timeout =
6347 MAX(cct->_conf->osd_mon_ack_timeout,
6348 stats_ack_timeout * cct->_conf->osd_stats_ack_timeout_decay);
6349 dout(20) << __func__ << " timeout now " << stats_ack_timeout << dendl;
6350
6351 if (ack->get_tid() > pg_stat_tid_flushed) {
6352 pg_stat_tid_flushed = ack->get_tid();
6353 pg_stat_queue_cond.Signal();
6354 }
6355
6356 xlist<PG*>::iterator p = pg_stat_queue.begin();
6357 while (!p.end()) {
6358 PG *pg = *p;
6359 PGRef _pg(pg);
6360 ++p;
6361
6362 auto acked = ack->pg_stat.find(pg->info.pgid.pgid);
6363 if (acked != ack->pg_stat.end()) {
6364 pg->pg_stats_publish_lock.Lock();
6365 if (acked->second.first == pg->pg_stats_publish.reported_seq &&
6366 acked->second.second == pg->pg_stats_publish.reported_epoch) {
6367 dout(25) << " ack on " << pg->info.pgid << " " << pg->pg_stats_publish.reported_epoch
6368 << ":" << pg->pg_stats_publish.reported_seq << dendl;
6369 pg->stat_queue_item.remove_myself();
6370 pg->put("pg_stat_queue");
6371 } else {
6372 dout(25) << " still pending " << pg->info.pgid << " " << pg->pg_stats_publish.reported_epoch
6373 << ":" << pg->pg_stats_publish.reported_seq << " > acked "
6374 << acked->second << dendl;
6375 }
6376 pg->pg_stats_publish_lock.Unlock();
6377 } else {
6378 dout(30) << " still pending " << pg->info.pgid << " " << pg->pg_stats_publish.reported_epoch
6379 << ":" << pg->pg_stats_publish.reported_seq << dendl;
6380 }
6381 }
6382
6383 outstanding_pg_stats.erase(ack->get_tid());
6384 dout(20) << __func__ << " still pending: " << outstanding_pg_stats << dendl;
6385
6386 pg_stat_queue_lock.Unlock();
6387
6388 ack->put();
6389 }
6390
6391 void OSD::flush_pg_stats()
6392 {
6393 dout(10) << "flush_pg_stats" << dendl;
6394 osd_lock.Unlock();
6395 utime_t now = ceph_clock_now();
6396 map_lock.get_read();
6397 mon_report_lock.Lock();
6398 send_pg_stats(now);
6399 mon_report_lock.Unlock();
6400 map_lock.put_read();
6401
6402
6403 pg_stat_queue_lock.Lock();
6404 uint64_t tid = pg_stat_tid;
6405 dout(10) << "flush_pg_stats waiting for stats tid " << tid << " to flush" << dendl;
6406 while (tid > pg_stat_tid_flushed)
6407 pg_stat_queue_cond.Wait(pg_stat_queue_lock);
6408 dout(10) << "flush_pg_stats finished waiting for stats tid " << tid << " to flush" << dendl;
6409 pg_stat_queue_lock.Unlock();
6410
6411 osd_lock.Lock();
6412 }
6413
6414 void OSD::send_beacon(const ceph::coarse_mono_clock::time_point& now)
6415 {
6416 const auto& monmap = monc->monmap;
6417 // send beacon to mon even if we are just connected, and the monmap is not
6418 // initialized yet by then.
6419 if (monmap.epoch > 0 &&
6420 monmap.get_required_features().contains_all(
6421 ceph::features::mon::FEATURE_LUMINOUS)) {
6422 dout(20) << __func__ << " sending" << dendl;
6423 MOSDBeacon* beacon = nullptr;
6424 {
6425 Mutex::Locker l{min_last_epoch_clean_lock};
6426 beacon = new MOSDBeacon(osdmap->get_epoch(), min_last_epoch_clean);
6427 std::swap(beacon->pgs, min_last_epoch_clean_pgs);
6428 last_sent_beacon = now;
6429 }
6430 monc->send_mon_message(beacon);
6431 } else {
6432 dout(20) << __func__ << " not sending" << dendl;
6433 }
6434 }
6435
6436 void OSD::handle_command(MMonCommand *m)
6437 {
6438 if (!require_mon_peer(m)) {
6439 m->put();
6440 return;
6441 }
6442
6443 Command *c = new Command(m->cmd, m->get_tid(), m->get_data(), NULL);
6444 command_wq.queue(c);
6445 m->put();
6446 }
6447
6448 void OSD::handle_command(MCommand *m)
6449 {
6450 ConnectionRef con = m->get_connection();
6451 Session *session = static_cast<Session *>(con->get_priv());
6452 if (!session) {
6453 con->send_message(new MCommandReply(m, -EPERM));
6454 m->put();
6455 return;
6456 }
6457
6458 OSDCap& caps = session->caps;
6459 session->put();
6460
6461 if (!caps.allow_all() || m->get_source().is_mon()) {
6462 con->send_message(new MCommandReply(m, -EPERM));
6463 m->put();
6464 return;
6465 }
6466
6467 Command *c = new Command(m->cmd, m->get_tid(), m->get_data(), con.get());
6468 command_wq.queue(c);
6469
6470 m->put();
6471 }
6472
// Description of one command understood by the OSD: its parse signature,
// help text, module, required permission and where it is available.
// osd_commands[] is the table of all such commands; it is what
// "get_command_descriptions" iterates over.
struct OSDCommand {
  std::string cmdstring;
  std::string helpstring;
  std::string module;
  std::string perm;
  std::string availability;
} osd_commands[] = {

// One table row; adjacent string literals in an argument are concatenated.
#define COMMAND(sig, help, mod, prm, avail) \
  {sig, help, mod, prm, avail},

  // yes, these are really pg commands, but there's a limit to how
  // much work it's worth.  The OSD returns all of them.  Make this
  // form (pg <pgid> <cmd>) valid only for the cli.
  // Rest uses "tell <pgid> <cmd>"

  COMMAND("pg "
          "name=pgid,type=CephPgid "
          "name=cmd,type=CephChoices,strings=query",
          "show details of a specific pg",
          "osd", "r", "cli")
  COMMAND("pg "
          "name=pgid,type=CephPgid "
          "name=cmd,type=CephChoices,strings=mark_unfound_lost "
          "name=mulcmd,type=CephChoices,strings=revert|delete",
          "mark all unfound objects in this pg as lost, either removing or reverting to a prior version if one is available",
          "osd", "rw", "cli")
  COMMAND("pg "
          "name=pgid,type=CephPgid "
          "name=cmd,type=CephChoices,strings=list_missing "
          "name=offset,type=CephString,req=false",
          "list missing objects on this pg, perhaps starting at an offset given in JSON",
          "osd", "r", "cli")

  // new form: tell <pgid> <cmd> for both cli and rest

  COMMAND("query",
          "show details of a specific pg",
          "osd", "r", "cli,rest")
  COMMAND("mark_unfound_lost "
          "name=mulcmd,type=CephChoices,strings=revert|delete",
          "mark all unfound objects in this pg as lost, either removing or reverting to a prior version if one is available",
          "osd", "rw", "cli,rest")
  COMMAND("list_missing "
          "name=offset,type=CephString,req=false",
          "list missing objects on this pg, perhaps starting at an offset given in JSON",
          "osd", "r", "cli,rest")
  COMMAND("perf histogram dump "
          "name=logger,type=CephString,req=false "
          "name=counter,type=CephString,req=false",
          "Get histogram data",
          "osd", "r", "cli,rest")

  // tell <osd.n> commands.  Validation of osd.n must be special-cased in client
  COMMAND("version",
          "report version of OSD",
          "osd", "r", "cli,rest")
  COMMAND("get_command_descriptions",
          "list commands descriptions",
          "osd", "r", "cli,rest")
  COMMAND("injectargs "
          "name=injected_args,type=CephString,n=N",
          "inject configuration arguments into running OSD",
          "osd", "rw", "cli,rest")
  COMMAND("config set "
          "name=key,type=CephString name=value,type=CephString",
          "Set a configuration option at runtime (not persistent)",
          "osd", "rw", "cli,rest")
  COMMAND("cluster_log "
          "name=level,type=CephChoices,strings=error,warning,info,debug "
          "name=message,type=CephString,n=N",
          "log a message to the cluster log",
          "osd", "rw", "cli,rest")
  COMMAND("bench "
          "name=count,type=CephInt,req=false "
          "name=size,type=CephInt,req=false "
          "name=object_size,type=CephInt,req=false "
          "name=object_num,type=CephInt,req=false ",
          "OSD benchmark: write <count> <size>-byte objects, "
          "(default 1G size 4MB). Results in log.",
          "osd", "rw", "cli,rest")
  COMMAND("flush_pg_stats",
          "flush pg stats",
          "osd", "rw", "cli,rest")
  COMMAND("heap "
          "name=heapcmd,type=CephChoices,strings=dump|start_profiler|stop_profiler|release|stats",
          "show heap usage info (available only if compiled with tcmalloc)",
          "osd", "rw", "cli,rest")
  COMMAND("debug dump_missing "
          "name=filename,type=CephFilepath",
          "dump missing objects to a named file",
          "osd", "r", "cli,rest")
  COMMAND("debug kick_recovery_wq "
          "name=delay,type=CephInt,range=0",
          "set osd_recovery_delay_start to <val>",
          "osd", "rw", "cli,rest")
  COMMAND("cpu_profiler "
          "name=arg,type=CephChoices,strings=status|flush",
          "run cpu profiling on daemon",
          "osd", "rw", "cli,rest")
  COMMAND("dump_pg_recovery_stats",
          "dump pg recovery statistics",
          "osd", "r", "cli,rest")
  COMMAND("reset_pg_recovery_stats",
          "reset pg recovery statistics",
          "osd", "rw", "cli,rest")
  COMMAND("compact",
          "compact object store's omap. "
          "WARNING: Compaction probably slows your requests",
          "osd", "rw", "cli,rest")
};
6571
6572 void OSD::do_command(Connection *con, ceph_tid_t tid, vector<string>& cmd, bufferlist& data)
6573 {
6574 int r = 0;
6575 stringstream ss, ds;
6576 string rs;
6577 bufferlist odata;
6578
6579 dout(20) << "do_command tid " << tid << " " << cmd << dendl;
6580
6581 map<string, cmd_vartype> cmdmap;
6582 string prefix;
6583 string format;
6584 string pgidstr;
6585 boost::scoped_ptr<Formatter> f;
6586
6587 if (cmd.empty()) {
6588 ss << "no command given";
6589 goto out;
6590 }
6591
6592 if (!cmdmap_from_json(cmd, &cmdmap, ss)) {
6593 r = -EINVAL;
6594 goto out;
6595 }
6596
6597 cmd_getval(cct, cmdmap, "prefix", prefix);
6598
6599 if (prefix == "get_command_descriptions") {
6600 int cmdnum = 0;
6601 JSONFormatter *f = new JSONFormatter();
6602 f->open_object_section("command_descriptions");
6603 for (OSDCommand *cp = osd_commands;
6604 cp < &osd_commands[ARRAY_SIZE(osd_commands)]; cp++) {
6605
6606 ostringstream secname;
6607 secname << "cmd" << setfill('0') << std::setw(3) << cmdnum;
6608 dump_cmddesc_to_json(f, secname.str(), cp->cmdstring, cp->helpstring,
6609 cp->module, cp->perm, cp->availability, 0);
6610 cmdnum++;
6611 }
6612 f->close_section(); // command_descriptions
6613
6614 f->flush(ds);
6615 delete f;
6616 goto out;
6617 }
6618
6619 cmd_getval(cct, cmdmap, "format", format);
6620 f.reset(Formatter::create(format));
6621
6622 if (prefix == "version") {
6623 if (f) {
6624 f->open_object_section("version");
6625 f->dump_string("version", pretty_version_to_str());
6626 f->close_section();
6627 f->flush(ds);
6628 } else {
6629 ds << pretty_version_to_str();
6630 }
6631 goto out;
6632 }
6633 else if (prefix == "injectargs") {
6634 vector<string> argsvec;
6635 cmd_getval(cct, cmdmap, "injected_args", argsvec);
6636
6637 if (argsvec.empty()) {
6638 r = -EINVAL;
6639 ss << "ignoring empty injectargs";
6640 goto out;
6641 }
6642 string args = argsvec.front();
6643 for (vector<string>::iterator a = ++argsvec.begin(); a != argsvec.end(); ++a)
6644 args += " " + *a;
6645 osd_lock.Unlock();
6646 r = cct->_conf->injectargs(args, &ss);
6647 osd_lock.Lock();
6648 }
6649 else if (prefix == "config set") {
6650 std::string key;
6651 std::string val;
6652 cmd_getval(cct, cmdmap, "key", key);
6653 cmd_getval(cct, cmdmap, "value", val);
6654 osd_lock.Unlock();
6655 r = cct->_conf->set_val(key, val, true, &ss);
6656 if (r == 0) {
6657 cct->_conf->apply_changes(nullptr);
6658 }
6659 osd_lock.Lock();
6660 }
6661 else if (prefix == "cluster_log") {
6662 vector<string> msg;
6663 cmd_getval(cct, cmdmap, "message", msg);
6664 if (msg.empty()) {
6665 r = -EINVAL;
6666 ss << "ignoring empty log message";
6667 goto out;
6668 }
6669 string message = msg.front();
6670 for (vector<string>::iterator a = ++msg.begin(); a != msg.end(); ++a)
6671 message += " " + *a;
6672 string lvl;
6673 cmd_getval(cct, cmdmap, "level", lvl);
6674 clog_type level = string_to_clog_type(lvl);
6675 if (level < 0) {
6676 r = -EINVAL;
6677 ss << "unknown level '" << lvl << "'";
6678 goto out;
6679 }
6680 clog->do_log(level, message);
6681 }
6682
6683 // either 'pg <pgid> <command>' or
6684 // 'tell <pgid>' (which comes in without any of that prefix)?
6685
6686 else if (prefix == "pg" ||
6687 prefix == "query" ||
6688 prefix == "mark_unfound_lost" ||
6689 prefix == "list_missing"
6690 ) {
6691 pg_t pgid;
6692
6693 if (!cmd_getval(cct, cmdmap, "pgid", pgidstr)) {
6694 ss << "no pgid specified";
6695 r = -EINVAL;
6696 } else if (!pgid.parse(pgidstr.c_str())) {
6697 ss << "couldn't parse pgid '" << pgidstr << "'";
6698 r = -EINVAL;
6699 } else {
6700 spg_t pcand;
6701 PG *pg = nullptr;
6702 if (osdmap->get_primary_shard(pgid, &pcand) &&
6703 (pg = _lookup_lock_pg(pcand))) {
6704 if (pg->is_primary()) {
6705 // simulate pg <pgid> cmd= for pg->do-command
6706 if (prefix != "pg")
6707 cmd_putval(cct, cmdmap, "cmd", prefix);
6708 r = pg->do_command(cmdmap, ss, data, odata, con, tid);
6709 if (r == -EAGAIN) {
6710 pg->unlock();
6711 // don't reply, pg will do so async
6712 return;
6713 }
6714 } else {
6715 ss << "not primary for pgid " << pgid;
6716
6717 // send them the latest diff to ensure they realize the mapping
6718 // has changed.
6719 service.send_incremental_map(osdmap->get_epoch() - 1, con, osdmap);
6720
6721 // do not reply; they will get newer maps and realize they
6722 // need to resend.
6723 pg->unlock();
6724 return;
6725 }
6726 pg->unlock();
6727 } else {
6728 ss << "i don't have pgid " << pgid;
6729 r = -ENOENT;
6730 }
6731 }
6732 }
6733
6734 else if (prefix == "bench") {
6735 int64_t count;
6736 int64_t bsize;
6737 int64_t osize, onum;
6738 // default count 1G, size 4MB
6739 cmd_getval(cct, cmdmap, "count", count, (int64_t)1 << 30);
6740 cmd_getval(cct, cmdmap, "size", bsize, (int64_t)4 << 20);
6741 cmd_getval(cct, cmdmap, "object_size", osize, (int64_t)0);
6742 cmd_getval(cct, cmdmap, "object_num", onum, (int64_t)0);
6743
6744 ceph::shared_ptr<ObjectStore::Sequencer> osr (std::make_shared<
6745 ObjectStore::Sequencer>("bench"));
6746
6747 uint32_t duration = cct->_conf->osd_bench_duration;
6748
6749 if (bsize > (int64_t) cct->_conf->osd_bench_max_block_size) {
6750 // let us limit the block size because the next checks rely on it
6751 // having a sane value. If we allow any block size to be set things
6752 // can still go sideways.
6753 ss << "block 'size' values are capped at "
6754 << prettybyte_t(cct->_conf->osd_bench_max_block_size) << ". If you wish to use"
6755 << " a higher value, please adjust 'osd_bench_max_block_size'";
6756 r = -EINVAL;
6757 goto out;
6758 } else if (bsize < (int64_t) (1 << 20)) {
6759 // entering the realm of small block sizes.
6760 // limit the count to a sane value, assuming a configurable amount of
6761 // IOPS and duration, so that the OSD doesn't get hung up on this,
6762 // preventing timeouts from going off
6763 int64_t max_count =
6764 bsize * duration * cct->_conf->osd_bench_small_size_max_iops;
6765 if (count > max_count) {
6766 ss << "'count' values greater than " << max_count
6767 << " for a block size of " << prettybyte_t(bsize) << ", assuming "
6768 << cct->_conf->osd_bench_small_size_max_iops << " IOPS,"
6769 << " for " << duration << " seconds,"
6770 << " can cause ill effects on osd. "
6771 << " Please adjust 'osd_bench_small_size_max_iops' with a higher"
6772 << " value if you wish to use a higher 'count'.";
6773 r = -EINVAL;
6774 goto out;
6775 }
6776 } else {
6777 // 1MB block sizes are big enough so that we get more stuff done.
6778 // However, to avoid the osd from getting hung on this and having
6779 // timers being triggered, we are going to limit the count assuming
6780 // a configurable throughput and duration.
6781 // NOTE: max_count is the total amount of bytes that we believe we
6782 // will be able to write during 'duration' for the given
6783 // throughput. The block size hardly impacts this unless it's
6784 // way too big. Given we already check how big the block size
6785 // is, it's safe to assume everything will check out.
6786 int64_t max_count =
6787 cct->_conf->osd_bench_large_size_max_throughput * duration;
6788 if (count > max_count) {
6789 ss << "'count' values greater than " << max_count
6790 << " for a block size of " << prettybyte_t(bsize) << ", assuming "
6791 << prettybyte_t(cct->_conf->osd_bench_large_size_max_throughput) << "/s,"
6792 << " for " << duration << " seconds,"
6793 << " can cause ill effects on osd. "
6794 << " Please adjust 'osd_bench_large_size_max_throughput'"
6795 << " with a higher value if you wish to use a higher 'count'.";
6796 r = -EINVAL;
6797 goto out;
6798 }
6799 }
6800
6801 if (osize && bsize > osize)
6802 bsize = osize;
6803
6804 dout(1) << " bench count " << count
6805 << " bsize " << prettybyte_t(bsize) << dendl;
6806
6807 ObjectStore::Transaction cleanupt;
6808
6809 if (osize && onum) {
6810 bufferlist bl;
6811 bufferptr bp(osize);
6812 bp.zero();
6813 bl.push_back(std::move(bp));
6814 bl.rebuild_page_aligned();
6815 for (int i=0; i<onum; ++i) {
6816 char nm[30];
6817 snprintf(nm, sizeof(nm), "disk_bw_test_%d", i);
6818 object_t oid(nm);
6819 hobject_t soid(sobject_t(oid, 0));
6820 ObjectStore::Transaction t;
6821 t.write(coll_t(), ghobject_t(soid), 0, osize, bl);
6822 store->queue_transaction(osr.get(), std::move(t), NULL);
6823 cleanupt.remove(coll_t(), ghobject_t(soid));
6824 }
6825 }
6826
6827 bufferlist bl;
6828 bufferptr bp(bsize);
6829 bp.zero();
6830 bl.push_back(std::move(bp));
6831 bl.rebuild_page_aligned();
6832
6833 {
6834 C_SaferCond waiter;
6835 if (!osr->flush_commit(&waiter)) {
6836 waiter.wait();
6837 }
6838 }
6839
6840 utime_t start = ceph_clock_now();
6841 for (int64_t pos = 0; pos < count; pos += bsize) {
6842 char nm[30];
6843 unsigned offset = 0;
6844 if (onum && osize) {
6845 snprintf(nm, sizeof(nm), "disk_bw_test_%d", (int)(rand() % onum));
6846 offset = rand() % (osize / bsize) * bsize;
6847 } else {
6848 snprintf(nm, sizeof(nm), "disk_bw_test_%lld", (long long)pos);
6849 }
6850 object_t oid(nm);
6851 hobject_t soid(sobject_t(oid, 0));
6852 ObjectStore::Transaction t;
6853 t.write(coll_t::meta(), ghobject_t(soid), offset, bsize, bl);
6854 store->queue_transaction(osr.get(), std::move(t), NULL);
6855 if (!onum || !osize)
6856 cleanupt.remove(coll_t::meta(), ghobject_t(soid));
6857 }
6858
6859 {
6860 C_SaferCond waiter;
6861 if (!osr->flush_commit(&waiter)) {
6862 waiter.wait();
6863 }
6864 }
6865 utime_t end = ceph_clock_now();
6866
6867 // clean up
6868 store->queue_transaction(osr.get(), std::move(cleanupt), NULL);
6869 {
6870 C_SaferCond waiter;
6871 if (!osr->flush_commit(&waiter)) {
6872 waiter.wait();
6873 }
6874 }
6875
6876 uint64_t rate = (double)count / (end - start);
6877 if (f) {
6878 f->open_object_section("osd_bench_results");
6879 f->dump_int("bytes_written", count);
6880 f->dump_int("blocksize", bsize);
6881 f->dump_unsigned("bytes_per_sec", rate);
6882 f->close_section();
6883 f->flush(ss);
6884 } else {
6885 ss << "bench: wrote " << prettybyte_t(count)
6886 << " in blocks of " << prettybyte_t(bsize) << " in "
6887 << (end-start) << " sec at " << prettybyte_t(rate) << "/sec";
6888 }
6889 }
6890
6891 else if (prefix == "flush_pg_stats") {
6892 if (osdmap->require_osd_release >= CEPH_RELEASE_LUMINOUS) {
6893 mgrc.send_pgstats();
6894 ds << service.get_osd_stat_seq() << "\n";
6895 } else {
6896 flush_pg_stats();
6897 }
6898 }
6899
6900 else if (prefix == "heap") {
6901 r = ceph::osd_cmds::heap(*cct, cmdmap, *f, ds);
6902 }
6903
6904 else if (prefix == "debug dump_missing") {
6905 string file_name;
6906 cmd_getval(cct, cmdmap, "filename", file_name);
6907 std::ofstream fout(file_name.c_str());
6908 if (!fout.is_open()) {
6909 ss << "failed to open file '" << file_name << "'";
6910 r = -EINVAL;
6911 goto out;
6912 }
6913
6914 fout << "*** osd " << whoami << ": dump_missing ***" << std::endl;
6915 RWLock::RLocker l(pg_map_lock);
6916 for (ceph::unordered_map<spg_t, PG*>::const_iterator pg_map_e = pg_map.begin();
6917 pg_map_e != pg_map.end(); ++pg_map_e) {
6918 PG *pg = pg_map_e->second;
6919 pg->lock();
6920
6921 fout << *pg << std::endl;
6922 std::map<hobject_t, pg_missing_item>::const_iterator mend =
6923 pg->pg_log.get_missing().get_items().end();
6924 std::map<hobject_t, pg_missing_item>::const_iterator mi =
6925 pg->pg_log.get_missing().get_items().begin();
6926 for (; mi != mend; ++mi) {
6927 fout << mi->first << " -> " << mi->second << std::endl;
6928 if (!pg->missing_loc.needs_recovery(mi->first))
6929 continue;
6930 if (pg->missing_loc.is_unfound(mi->first))
6931 fout << " unfound ";
6932 const set<pg_shard_t> &mls(pg->missing_loc.get_locations(mi->first));
6933 if (mls.empty())
6934 continue;
6935 fout << "missing_loc: " << mls << std::endl;
6936 }
6937 pg->unlock();
6938 fout << std::endl;
6939 }
6940
6941 fout.close();
6942 }
6943 else if (prefix == "debug kick_recovery_wq") {
6944 int64_t delay;
6945 cmd_getval(cct, cmdmap, "delay", delay);
6946 ostringstream oss;
6947 oss << delay;
6948 r = cct->_conf->set_val("osd_recovery_delay_start", oss.str().c_str());
6949 if (r != 0) {
6950 ss << "kick_recovery_wq: error setting "
6951 << "osd_recovery_delay_start to '" << delay << "': error "
6952 << r;
6953 goto out;
6954 }
6955 cct->_conf->apply_changes(NULL);
6956 ss << "kicking recovery queue. set osd_recovery_delay_start "
6957 << "to " << cct->_conf->osd_recovery_delay_start;
6958 }
6959
6960 else if (prefix == "cpu_profiler") {
6961 string arg;
6962 cmd_getval(cct, cmdmap, "arg", arg);
6963 vector<string> argvec;
6964 get_str_vec(arg, argvec);
6965 cpu_profiler_handle_command(argvec, ds);
6966 }
6967
6968 else if (prefix == "dump_pg_recovery_stats") {
6969 stringstream s;
6970 if (f) {
6971 pg_recovery_stats.dump_formatted(f.get());
6972 f->flush(ds);
6973 } else {
6974 pg_recovery_stats.dump(s);
6975 ds << "dump pg recovery stats: " << s.str();
6976 }
6977 }
6978
6979 else if (prefix == "reset_pg_recovery_stats") {
6980 ss << "reset pg recovery stats";
6981 pg_recovery_stats.reset();
6982 }
6983
6984 else if (prefix == "perf histogram dump") {
6985 std::string logger;
6986 std::string counter;
6987 cmd_getval(cct, cmdmap, "logger", logger);
6988 cmd_getval(cct, cmdmap, "counter", counter);
6989 if (f) {
6990 cct->get_perfcounters_collection()->dump_formatted_histograms(
6991 f.get(), false, logger, counter);
6992 f->flush(ds);
6993 }
6994 }
6995
6996 else if (prefix == "compact") {
6997 dout(1) << "triggering manual compaction" << dendl;
6998 auto start = ceph::coarse_mono_clock::now();
6999 store->compact();
7000 auto end = ceph::coarse_mono_clock::now();
7001 auto time_span = chrono::duration_cast<chrono::duration<double>>(end - start);
7002 dout(1) << "finished manual compaction in "
7003 << time_span.count()
7004 << " seconds" << dendl;
7005 ss << "compacted omap in " << time_span.count() << " seconds";
7006 }
7007
7008 else {
7009 ss << "unrecognized command! " << cmd;
7010 r = -EINVAL;
7011 }
7012
7013 out:
7014 rs = ss.str();
7015 odata.append(ds);
7016 dout(0) << "do_command r=" << r << " " << rs << dendl;
7017 clog->info() << rs;
7018 if (con) {
7019 MCommandReply *reply = new MCommandReply(r, rs);
7020 reply->set_tid(tid);
7021 reply->set_data(odata);
7022 con->send_message(reply);
7023 }
7024 }
7025
7026 bool OSD::heartbeat_dispatch(Message *m)
7027 {
7028 dout(30) << "heartbeat_dispatch " << m << dendl;
7029 switch (m->get_type()) {
7030
7031 case CEPH_MSG_PING:
7032 dout(10) << "ping from " << m->get_source_inst() << dendl;
7033 m->put();
7034 break;
7035
7036 case MSG_OSD_PING:
7037 handle_osd_ping(static_cast<MOSDPing*>(m));
7038 break;
7039
7040 default:
7041 dout(0) << "dropping unexpected message " << *m << " from " << m->get_source_inst() << dendl;
7042 m->put();
7043 }
7044
7045 return true;
7046 }
7047
7048 bool OSD::ms_dispatch(Message *m)
7049 {
7050 dout(20) << "OSD::ms_dispatch: " << *m << dendl;
7051 if (m->get_type() == MSG_OSD_MARK_ME_DOWN) {
7052 service.got_stop_ack();
7053 m->put();
7054 return true;
7055 }
7056
7057 // lock!
7058
7059 osd_lock.Lock();
7060 if (is_stopping()) {
7061 osd_lock.Unlock();
7062 m->put();
7063 return true;
7064 }
7065
7066 do_waiters();
7067 _dispatch(m);
7068
7069 osd_lock.Unlock();
7070
7071 return true;
7072 }
7073
7074 void OSD::maybe_share_map(
7075 Session *session,
7076 OpRequestRef op,
7077 OSDMapRef osdmap)
7078 {
7079 if (!op->check_send_map) {
7080 return;
7081 }
7082 epoch_t last_sent_epoch = 0;
7083
7084 session->sent_epoch_lock.lock();
7085 last_sent_epoch = session->last_sent_epoch;
7086 session->sent_epoch_lock.unlock();
7087
7088 const Message *m = op->get_req();
7089 service.share_map(
7090 m->get_source(),
7091 m->get_connection().get(),
7092 op->sent_epoch,
7093 osdmap,
7094 session ? &last_sent_epoch : NULL);
7095
7096 session->sent_epoch_lock.lock();
7097 if (session->last_sent_epoch < last_sent_epoch) {
7098 session->last_sent_epoch = last_sent_epoch;
7099 }
7100 session->sent_epoch_lock.unlock();
7101
7102 op->check_send_map = false;
7103 }
7104
7105 void OSD::dispatch_session_waiting(Session *session, OSDMapRef osdmap)
7106 {
7107 assert(session->session_dispatch_lock.is_locked());
7108
7109 auto i = session->waiting_on_map.begin();
7110 while (i != session->waiting_on_map.end()) {
7111 OpRequestRef op = &(*i);
7112 assert(ms_can_fast_dispatch(op->get_req()));
7113 const MOSDFastDispatchOp *m = static_cast<const MOSDFastDispatchOp*>(
7114 op->get_req());
7115 if (m->get_min_epoch() > osdmap->get_epoch()) {
7116 break;
7117 }
7118 session->waiting_on_map.erase(i++);
7119 op->put();
7120
7121 spg_t pgid;
7122 if (m->get_type() == CEPH_MSG_OSD_OP) {
7123 pg_t actual_pgid = osdmap->raw_pg_to_pg(
7124 static_cast<const MOSDOp*>(m)->get_pg());
7125 if (!osdmap->get_primary_shard(actual_pgid, &pgid)) {
7126 continue;
7127 }
7128 } else {
7129 pgid = m->get_spg();
7130 }
7131 enqueue_op(pgid, op, m->get_map_epoch());
7132 }
7133
7134 if (session->waiting_on_map.empty()) {
7135 clear_session_waiting_on_map(session);
7136 } else {
7137 register_session_waiting_on_map(session);
7138 }
7139 }
7140
7141 void OSD::ms_fast_dispatch(Message *m)
7142 {
7143 FUNCTRACE();
7144 if (service.is_stopping()) {
7145 m->put();
7146 return;
7147 }
7148 OpRequestRef op = op_tracker.create_request<OpRequest, Message*>(m);
7149 {
7150 #ifdef WITH_LTTNG
7151 osd_reqid_t reqid = op->get_reqid();
7152 #endif
7153 tracepoint(osd, ms_fast_dispatch, reqid.name._type,
7154 reqid.name._num, reqid.tid, reqid.inc);
7155 }
7156
7157 if (m->trace)
7158 op->osd_trace.init("osd op", &trace_endpoint, &m->trace);
7159
7160 // note sender epoch, min req'd epoch
7161 op->sent_epoch = static_cast<MOSDFastDispatchOp*>(m)->get_map_epoch();
7162 op->min_epoch = static_cast<MOSDFastDispatchOp*>(m)->get_min_epoch();
7163 assert(op->min_epoch <= op->sent_epoch); // sanity check!
7164
7165 service.maybe_inject_dispatch_delay();
7166
7167 if (m->get_connection()->has_features(CEPH_FEATUREMASK_RESEND_ON_SPLIT) ||
7168 m->get_type() != CEPH_MSG_OSD_OP) {
7169 // queue it directly
7170 enqueue_op(
7171 static_cast<MOSDFastDispatchOp*>(m)->get_spg(),
7172 op,
7173 static_cast<MOSDFastDispatchOp*>(m)->get_map_epoch());
7174 } else {
7175 // legacy client, and this is an MOSDOp (the *only* fast dispatch
7176 // message that didn't have an explicit spg_t); we need to map
7177 // them to an spg_t while preserving delivery order.
7178 Session *session = static_cast<Session*>(m->get_connection()->get_priv());
7179 if (session) {
7180 {
7181 Mutex::Locker l(session->session_dispatch_lock);
7182 op->get();
7183 session->waiting_on_map.push_back(*op);
7184 OSDMapRef nextmap = service.get_nextmap_reserved();
7185 dispatch_session_waiting(session, nextmap);
7186 service.release_map(nextmap);
7187 }
7188 session->put();
7189 }
7190 }
7191 OID_EVENT_TRACE_WITH_MSG(m, "MS_FAST_DISPATCH_END", false);
7192 }
7193
7194 void OSD::ms_fast_preprocess(Message *m)
7195 {
7196 if (m->get_connection()->get_peer_type() == CEPH_ENTITY_TYPE_OSD) {
7197 if (m->get_type() == CEPH_MSG_OSD_MAP) {
7198 MOSDMap *mm = static_cast<MOSDMap*>(m);
7199 Session *s = static_cast<Session*>(m->get_connection()->get_priv());
7200 if (s) {
7201 s->received_map_lock.lock();
7202 s->received_map_epoch = mm->get_last();
7203 s->received_map_lock.unlock();
7204 s->put();
7205 }
7206 }
7207 }
7208 }
7209
7210 bool OSD::ms_get_authorizer(int dest_type, AuthAuthorizer **authorizer, bool force_new)
7211 {
7212 dout(10) << "OSD::ms_get_authorizer type=" << ceph_entity_type_name(dest_type) << dendl;
7213
7214 if (is_stopping()) {
7215 dout(10) << __func__ << " bailing, we are shutting down" << dendl;
7216 return false;
7217 }
7218
7219 if (dest_type == CEPH_ENTITY_TYPE_MON)
7220 return true;
7221
7222 if (force_new) {
7223 /* the MonClient checks keys every tick(), so we should just wait for that cycle
7224 to get through */
7225 if (monc->wait_auth_rotating(10) < 0) {
7226 derr << "OSD::ms_get_authorizer wait_auth_rotating failed" << dendl;
7227 return false;
7228 }
7229 }
7230
7231 *authorizer = monc->build_authorizer(dest_type);
7232 return *authorizer != NULL;
7233 }
7234
7235
7236 bool OSD::ms_verify_authorizer(Connection *con, int peer_type,
7237 int protocol, bufferlist& authorizer_data, bufferlist& authorizer_reply,
7238 bool& isvalid, CryptoKey& session_key)
7239 {
7240 AuthAuthorizeHandler *authorize_handler = 0;
7241 switch (peer_type) {
7242 case CEPH_ENTITY_TYPE_MDS:
7243 /*
7244 * note: mds is technically a client from our perspective, but
7245 * this makes the 'cluster' consistent w/ monitor's usage.
7246 */
7247 case CEPH_ENTITY_TYPE_OSD:
7248 case CEPH_ENTITY_TYPE_MGR:
7249 authorize_handler = authorize_handler_cluster_registry->get_handler(protocol);
7250 break;
7251 default:
7252 authorize_handler = authorize_handler_service_registry->get_handler(protocol);
7253 }
7254 if (!authorize_handler) {
7255 dout(0) << "No AuthAuthorizeHandler found for protocol " << protocol << dendl;
7256 isvalid = false;
7257 return true;
7258 }
7259
7260 AuthCapsInfo caps_info;
7261 EntityName name;
7262 uint64_t global_id;
7263 uint64_t auid = CEPH_AUTH_UID_DEFAULT;
7264
7265 RotatingKeyRing *keys = monc->rotating_secrets.get();
7266 if (keys) {
7267 isvalid = authorize_handler->verify_authorizer(
7268 cct, keys,
7269 authorizer_data, authorizer_reply, name, global_id, caps_info, session_key,
7270 &auid);
7271 } else {
7272 dout(10) << __func__ << " no rotating_keys (yet), denied" << dendl;
7273 isvalid = false;
7274 }
7275
7276 if (isvalid) {
7277 Session *s = static_cast<Session *>(con->get_priv());
7278 if (!s) {
7279 s = new Session(cct);
7280 con->set_priv(s->get());
7281 s->con = con;
7282 dout(10) << " new session " << s << " con=" << s->con << " addr=" << s->con->get_peer_addr() << dendl;
7283 }
7284
7285 s->entity_name = name;
7286 if (caps_info.allow_all)
7287 s->caps.set_allow_all();
7288 s->auid = auid;
7289
7290 if (caps_info.caps.length() > 0) {
7291 bufferlist::iterator p = caps_info.caps.begin();
7292 string str;
7293 try {
7294 ::decode(str, p);
7295 }
7296 catch (buffer::error& e) {
7297 }
7298 bool success = s->caps.parse(str);
7299 if (success)
7300 dout(10) << " session " << s << " " << s->entity_name << " has caps " << s->caps << " '" << str << "'" << dendl;
7301 else
7302 dout(10) << " session " << s << " " << s->entity_name << " failed to parse caps '" << str << "'" << dendl;
7303 }
7304
7305 s->put();
7306 }
7307 return true;
7308 }
7309
7310 void OSD::do_waiters()
7311 {
7312 assert(osd_lock.is_locked());
7313
7314 dout(10) << "do_waiters -- start" << dendl;
7315 while (!finished.empty()) {
7316 OpRequestRef next = finished.front();
7317 finished.pop_front();
7318 dispatch_op(next);
7319 }
7320 dout(10) << "do_waiters -- finish" << dendl;
7321 }
7322
7323 void OSD::dispatch_op(OpRequestRef op)
7324 {
7325 switch (op->get_req()->get_type()) {
7326
7327 case MSG_OSD_PG_CREATE:
7328 handle_pg_create(op);
7329 break;
7330 case MSG_OSD_PG_NOTIFY:
7331 handle_pg_notify(op);
7332 break;
7333 case MSG_OSD_PG_QUERY:
7334 handle_pg_query(op);
7335 break;
7336 case MSG_OSD_PG_LOG:
7337 handle_pg_log(op);
7338 break;
7339 case MSG_OSD_PG_REMOVE:
7340 handle_pg_remove(op);
7341 break;
7342 case MSG_OSD_PG_INFO:
7343 handle_pg_info(op);
7344 break;
7345 case MSG_OSD_PG_TRIM:
7346 handle_pg_trim(op);
7347 break;
7348 case MSG_OSD_BACKFILL_RESERVE:
7349 handle_pg_backfill_reserve(op);
7350 break;
7351 case MSG_OSD_RECOVERY_RESERVE:
7352 handle_pg_recovery_reserve(op);
7353 break;
7354 }
7355 }
7356
7357 void OSD::_dispatch(Message *m)
7358 {
7359 assert(osd_lock.is_locked());
7360 dout(20) << "_dispatch " << m << " " << *m << dendl;
7361
7362 switch (m->get_type()) {
7363
7364 // -- don't need lock --
7365 case CEPH_MSG_PING:
7366 dout(10) << "ping from " << m->get_source() << dendl;
7367 m->put();
7368 break;
7369
7370 // -- don't need OSDMap --
7371
7372 // map and replication
7373 case CEPH_MSG_OSD_MAP:
7374 handle_osd_map(static_cast<MOSDMap*>(m));
7375 break;
7376
7377 // osd
7378 case MSG_PGSTATSACK:
7379 handle_pg_stats_ack(static_cast<MPGStatsAck*>(m));
7380 break;
7381
7382 case MSG_MON_COMMAND:
7383 handle_command(static_cast<MMonCommand*>(m));
7384 break;
7385 case MSG_COMMAND:
7386 handle_command(static_cast<MCommand*>(m));
7387 break;
7388
7389 case MSG_OSD_SCRUB:
7390 handle_scrub(static_cast<MOSDScrub*>(m));
7391 break;
7392
7393 case MSG_OSD_FORCE_RECOVERY:
7394 handle_force_recovery(m);
7395 break;
7396
7397 // -- need OSDMap --
7398
7399 case MSG_OSD_PG_CREATE:
7400 case MSG_OSD_PG_NOTIFY:
7401 case MSG_OSD_PG_QUERY:
7402 case MSG_OSD_PG_LOG:
7403 case MSG_OSD_PG_REMOVE:
7404 case MSG_OSD_PG_INFO:
7405 case MSG_OSD_PG_TRIM:
7406 case MSG_OSD_BACKFILL_RESERVE:
7407 case MSG_OSD_RECOVERY_RESERVE:
7408 {
7409 OpRequestRef op = op_tracker.create_request<OpRequest, Message*>(m);
7410 if (m->trace)
7411 op->osd_trace.init("osd op", &trace_endpoint, &m->trace);
7412 // no map? starting up?
7413 if (!osdmap) {
7414 dout(7) << "no OSDMap, not booted" << dendl;
7415 logger->inc(l_osd_waiting_for_map);
7416 waiting_for_osdmap.push_back(op);
7417 op->mark_delayed("no osdmap");
7418 break;
7419 }
7420
7421 // need OSDMap
7422 dispatch_op(op);
7423 }
7424 }
7425 }
7426
7427 void OSD::handle_pg_scrub(MOSDScrub *m, PG *pg)
7428 {
7429 pg->lock();
7430 if (pg->is_primary()) {
7431 pg->unreg_next_scrub();
7432 pg->scrubber.must_scrub = true;
7433 pg->scrubber.must_deep_scrub = m->deep || m->repair;
7434 pg->scrubber.must_repair = m->repair;
7435 pg->reg_next_scrub();
7436 dout(10) << "marking " << *pg << " for scrub" << dendl;
7437 }
7438 pg->unlock();
7439 }
7440
7441 void OSD::handle_scrub(MOSDScrub *m)
7442 {
7443 dout(10) << "handle_scrub " << *m << dendl;
7444 if (!require_mon_or_mgr_peer(m)) {
7445 m->put();
7446 return;
7447 }
7448 if (m->fsid != monc->get_fsid()) {
7449 dout(0) << "handle_scrub fsid " << m->fsid << " != " << monc->get_fsid() << dendl;
7450 m->put();
7451 return;
7452 }
7453
7454 RWLock::RLocker l(pg_map_lock);
7455 if (m->scrub_pgs.empty()) {
7456 for (ceph::unordered_map<spg_t, PG*>::iterator p = pg_map.begin();
7457 p != pg_map.end();
7458 ++p)
7459 handle_pg_scrub(m, p->second);
7460 } else {
7461 for (vector<pg_t>::iterator p = m->scrub_pgs.begin();
7462 p != m->scrub_pgs.end();
7463 ++p) {
7464 spg_t pcand;
7465 if (osdmap->get_primary_shard(*p, &pcand)) {
7466 auto pg_map_entry = pg_map.find(pcand);
7467 if (pg_map_entry != pg_map.end()) {
7468 handle_pg_scrub(m, pg_map_entry->second);
7469 }
7470 }
7471 }
7472 }
7473
7474 m->put();
7475 }
7476
7477 bool OSD::scrub_random_backoff()
7478 {
7479 bool coin_flip = (rand() / (double)RAND_MAX >=
7480 cct->_conf->osd_scrub_backoff_ratio);
7481 if (!coin_flip) {
7482 dout(20) << "scrub_random_backoff lost coin flip, randomly backing off" << dendl;
7483 return true;
7484 }
7485 return false;
7486 }
7487
7488 OSDService::ScrubJob::ScrubJob(CephContext* cct,
7489 const spg_t& pg, const utime_t& timestamp,
7490 double pool_scrub_min_interval,
7491 double pool_scrub_max_interval, bool must)
7492 : cct(cct),
7493 pgid(pg),
7494 sched_time(timestamp),
7495 deadline(timestamp)
7496 {
7497 // if not explicitly requested, postpone the scrub with a random delay
7498 if (!must) {
7499 double scrub_min_interval = pool_scrub_min_interval > 0 ?
7500 pool_scrub_min_interval : cct->_conf->osd_scrub_min_interval;
7501 double scrub_max_interval = pool_scrub_max_interval > 0 ?
7502 pool_scrub_max_interval : cct->_conf->osd_scrub_max_interval;
7503
7504 sched_time += scrub_min_interval;
7505 double r = rand() / (double)RAND_MAX;
7506 sched_time +=
7507 scrub_min_interval * cct->_conf->osd_scrub_interval_randomize_ratio * r;
7508 deadline += scrub_max_interval;
7509 }
7510 }
7511
7512 bool OSDService::ScrubJob::ScrubJob::operator<(const OSDService::ScrubJob& rhs) const {
7513 if (sched_time < rhs.sched_time)
7514 return true;
7515 if (sched_time > rhs.sched_time)
7516 return false;
7517 return pgid < rhs.pgid;
7518 }
7519
7520 bool OSD::scrub_time_permit(utime_t now)
7521 {
7522 struct tm bdt;
7523 time_t tt = now.sec();
7524 localtime_r(&tt, &bdt);
7525 bool time_permit = false;
7526 if (cct->_conf->osd_scrub_begin_hour < cct->_conf->osd_scrub_end_hour) {
7527 if (bdt.tm_hour >= cct->_conf->osd_scrub_begin_hour && bdt.tm_hour < cct->_conf->osd_scrub_end_hour) {
7528 time_permit = true;
7529 }
7530 } else {
7531 if (bdt.tm_hour >= cct->_conf->osd_scrub_begin_hour || bdt.tm_hour < cct->_conf->osd_scrub_end_hour) {
7532 time_permit = true;
7533 }
7534 }
7535 if (!time_permit) {
7536 dout(20) << __func__ << " should run between " << cct->_conf->osd_scrub_begin_hour
7537 << " - " << cct->_conf->osd_scrub_end_hour
7538 << " now " << bdt.tm_hour << " = no" << dendl;
7539 } else {
7540 dout(20) << __func__ << " should run between " << cct->_conf->osd_scrub_begin_hour
7541 << " - " << cct->_conf->osd_scrub_end_hour
7542 << " now " << bdt.tm_hour << " = yes" << dendl;
7543 }
7544 return time_permit;
7545 }
7546
7547 bool OSD::scrub_load_below_threshold()
7548 {
7549 double loadavgs[3];
7550 if (getloadavg(loadavgs, 3) != 3) {
7551 dout(10) << __func__ << " couldn't read loadavgs\n" << dendl;
7552 return false;
7553 }
7554
7555 // allow scrub if below configured threshold
7556 if (loadavgs[0] < cct->_conf->osd_scrub_load_threshold) {
7557 dout(20) << __func__ << " loadavg " << loadavgs[0]
7558 << " < max " << cct->_conf->osd_scrub_load_threshold
7559 << " = yes" << dendl;
7560 return true;
7561 }
7562
7563 // allow scrub if below daily avg and currently decreasing
7564 if (loadavgs[0] < daily_loadavg && loadavgs[0] < loadavgs[2]) {
7565 dout(20) << __func__ << " loadavg " << loadavgs[0]
7566 << " < daily_loadavg " << daily_loadavg
7567 << " and < 15m avg " << loadavgs[2]
7568 << " = yes" << dendl;
7569 return true;
7570 }
7571
7572 dout(20) << __func__ << " loadavg " << loadavgs[0]
7573 << " >= max " << cct->_conf->osd_scrub_load_threshold
7574 << " and ( >= daily_loadavg " << daily_loadavg
7575 << " or >= 15m avg " << loadavgs[2]
7576 << ") = no" << dendl;
7577 return false;
7578 }
7579
7580 void OSD::sched_scrub()
7581 {
7582 // if not permitted, fail fast
7583 if (!service.can_inc_scrubs_pending()) {
7584 return;
7585 }
7586 if (!cct->_conf->osd_scrub_during_recovery && service.is_recovery_active()) {
7587 dout(20) << __func__ << " not scheduling scrubs due to active recovery" << dendl;
7588 return;
7589 }
7590
7591
7592 utime_t now = ceph_clock_now();
7593 bool time_permit = scrub_time_permit(now);
7594 bool load_is_low = scrub_load_below_threshold();
7595 dout(20) << "sched_scrub load_is_low=" << (int)load_is_low << dendl;
7596
7597 OSDService::ScrubJob scrub;
7598 if (service.first_scrub_stamp(&scrub)) {
7599 do {
7600 dout(30) << "sched_scrub examine " << scrub.pgid << " at " << scrub.sched_time << dendl;
7601
7602 if (scrub.sched_time > now) {
7603 // save ourselves some effort
7604 dout(10) << "sched_scrub " << scrub.pgid << " scheduled at " << scrub.sched_time
7605 << " > " << now << dendl;
7606 break;
7607 }
7608
7609 if ((scrub.deadline >= now) && !(time_permit && load_is_low)) {
7610 dout(10) << __func__ << " not scheduling scrub for " << scrub.pgid << " due to "
7611 << (!time_permit ? "time not permit" : "high load") << dendl;
7612 continue;
7613 }
7614
7615 PG *pg = _lookup_lock_pg(scrub.pgid);
7616 if (!pg)
7617 continue;
7618 if (pg->get_pgbackend()->scrub_supported() && pg->is_active()) {
7619 dout(10) << "sched_scrub scrubbing " << scrub.pgid << " at " << scrub.sched_time
7620 << (pg->scrubber.must_scrub ? ", explicitly requested" :
7621 (load_is_low ? ", load_is_low" : " deadline < now"))
7622 << dendl;
7623 if (pg->sched_scrub()) {
7624 pg->unlock();
7625 break;
7626 }
7627 }
7628 pg->unlock();
7629 } while (service.next_scrub_stamp(scrub, &scrub));
7630 }
7631 dout(20) << "sched_scrub done" << dendl;
7632 }
7633
7634
7635
7636 vector<OSDHealthMetric> OSD::get_health_metrics()
7637 {
7638 vector<OSDHealthMetric> metrics;
7639 lock_guard<mutex> pending_creates_locker{pending_creates_lock};
7640 auto n_primaries = pending_creates_from_mon;
7641 for (const auto& create : pending_creates_from_osd) {
7642 if (create.second) {
7643 n_primaries++;
7644 }
7645 }
7646 metrics.emplace_back(osd_metric::PENDING_CREATING_PGS, n_primaries);
7647 return metrics;
7648 }
7649
7650 // =====================================================
7651 // MAP
7652
7653 void OSD::wait_for_new_map(OpRequestRef op)
7654 {
7655 // ask?
7656 if (waiting_for_osdmap.empty()) {
7657 osdmap_subscribe(osdmap->get_epoch() + 1, false);
7658 }
7659
7660 logger->inc(l_osd_waiting_for_map);
7661 waiting_for_osdmap.push_back(op);
7662 op->mark_delayed("wait for new map");
7663 }
7664
7665
7666 /** update_map
7667 * assimilate new OSDMap(s). scan pgs, etc.
7668 */
7669
7670 void OSD::note_down_osd(int peer)
7671 {
7672 assert(osd_lock.is_locked());
7673 cluster_messenger->mark_down(osdmap->get_cluster_addr(peer));
7674
7675 heartbeat_lock.Lock();
7676 failure_queue.erase(peer);
7677 failure_pending.erase(peer);
7678 map<int,HeartbeatInfo>::iterator p = heartbeat_peers.find(peer);
7679 if (p != heartbeat_peers.end()) {
7680 p->second.con_back->mark_down();
7681 if (p->second.con_front) {
7682 p->second.con_front->mark_down();
7683 }
7684 heartbeat_peers.erase(p);
7685 }
7686 heartbeat_lock.Unlock();
7687 }
7688
7689 void OSD::note_up_osd(int peer)
7690 {
7691 service.forget_peer_epoch(peer, osdmap->get_epoch() - 1);
7692 heartbeat_set_peers_need_update();
7693 }
7694
7695 struct C_OnMapCommit : public Context {
7696 OSD *osd;
7697 epoch_t first, last;
7698 MOSDMap *msg;
7699 C_OnMapCommit(OSD *o, epoch_t f, epoch_t l, MOSDMap *m)
7700 : osd(o), first(f), last(l), msg(m) {}
7701 void finish(int r) override {
7702 osd->_committed_osd_maps(first, last, msg);
7703 msg->put();
7704 }
7705 };
7706
7707 struct C_OnMapApply : public Context {
7708 OSDService *service;
7709 list<OSDMapRef> pinned_maps;
7710 epoch_t e;
7711 C_OnMapApply(OSDService *service,
7712 const list<OSDMapRef> &pinned_maps,
7713 epoch_t e)
7714 : service(service), pinned_maps(pinned_maps), e(e) {}
7715 void finish(int r) override {
7716 service->clear_map_bl_cache_pins(e);
7717 }
7718 };
7719
7720 void OSD::osdmap_subscribe(version_t epoch, bool force_request)
7721 {
7722 Mutex::Locker l(osdmap_subscribe_lock);
7723 if (latest_subscribed_epoch >= epoch && !force_request)
7724 return;
7725
7726 latest_subscribed_epoch = MAX(epoch, latest_subscribed_epoch);
7727
7728 if (monc->sub_want_increment("osdmap", epoch, CEPH_SUBSCRIBE_ONETIME) ||
7729 force_request) {
7730 monc->renew_subs();
7731 }
7732 }
7733
7734 void OSD::trim_maps(epoch_t oldest, int nreceived, bool skip_maps)
7735 {
7736 epoch_t min = std::min(oldest, service.map_cache.cached_key_lower_bound());
7737 if (min <= superblock.oldest_map)
7738 return;
7739
7740 int num = 0;
7741 ObjectStore::Transaction t;
7742 for (epoch_t e = superblock.oldest_map; e < min; ++e) {
7743 dout(20) << " removing old osdmap epoch " << e << dendl;
7744 t.remove(coll_t::meta(), get_osdmap_pobject_name(e));
7745 t.remove(coll_t::meta(), get_inc_osdmap_pobject_name(e));
7746 superblock.oldest_map = e + 1;
7747 num++;
7748 if (num >= cct->_conf->osd_target_transaction_size && num >= nreceived) {
7749 service.publish_superblock(superblock);
7750 write_superblock(t);
7751 int tr = store->queue_transaction(service.meta_osr.get(), std::move(t), nullptr);
7752 assert(tr == 0);
7753 num = 0;
7754 if (!skip_maps) {
7755 // skip_maps leaves us with a range of old maps if we fail to remove all
7756 // of them before moving superblock.oldest_map forward to the first map
7757 // in the incoming MOSDMap msg. so we should continue removing them in
7758 // this case, even we could do huge series of delete transactions all at
7759 // once.
7760 break;
7761 }
7762 }
7763 }
7764 if (num > 0) {
7765 service.publish_superblock(superblock);
7766 write_superblock(t);
7767 int tr = store->queue_transaction(service.meta_osr.get(), std::move(t), nullptr);
7768 assert(tr == 0);
7769 }
7770 // we should not remove the cached maps
7771 assert(min <= service.map_cache.cached_key_lower_bound());
7772 }
7773
7774 void OSD::handle_osd_map(MOSDMap *m)
7775 {
7776 assert(osd_lock.is_locked());
7777 // Keep a ref in the list until we get the newly received map written
7778 // onto disk. This is important because as long as the refs are alive,
7779 // the OSDMaps will be pinned in the cache and we won't try to read it
7780 // off of disk. Otherwise these maps will probably not stay in the cache,
7781 // and reading those OSDMaps before they are actually written can result
7782 // in a crash.
7783 list<OSDMapRef> pinned_maps;
7784 if (m->fsid != monc->get_fsid()) {
7785 dout(0) << "handle_osd_map fsid " << m->fsid << " != "
7786 << monc->get_fsid() << dendl;
7787 m->put();
7788 return;
7789 }
7790 if (is_initializing()) {
7791 dout(0) << "ignoring osdmap until we have initialized" << dendl;
7792 m->put();
7793 return;
7794 }
7795
7796 Session *session = static_cast<Session *>(m->get_connection()->get_priv());
7797 if (session && !(session->entity_name.is_mon() ||
7798 session->entity_name.is_osd())) {
7799 //not enough perms!
7800 dout(10) << "got osd map from Session " << session
7801 << " which we can't take maps from (not a mon or osd)" << dendl;
7802 m->put();
7803 session->put();
7804 return;
7805 }
7806 if (session)
7807 session->put();
7808
7809 // share with the objecter
7810 if (!is_preboot())
7811 service.objecter->handle_osd_map(m);
7812
7813 epoch_t first = m->get_first();
7814 epoch_t last = m->get_last();
7815 dout(3) << "handle_osd_map epochs [" << first << "," << last << "], i have "
7816 << superblock.newest_map
7817 << ", src has [" << m->oldest_map << "," << m->newest_map << "]"
7818 << dendl;
7819
7820 logger->inc(l_osd_map);
7821 logger->inc(l_osd_mape, last - first + 1);
7822 if (first <= superblock.newest_map)
7823 logger->inc(l_osd_mape_dup, superblock.newest_map - first + 1);
7824 if (service.max_oldest_map < m->oldest_map) {
7825 service.max_oldest_map = m->oldest_map;
7826 assert(service.max_oldest_map >= superblock.oldest_map);
7827 }
7828
7829 // make sure there is something new, here, before we bother flushing
7830 // the queues and such
7831 if (last <= superblock.newest_map) {
7832 dout(10) << " no new maps here, dropping" << dendl;
7833 m->put();
7834 return;
7835 }
7836
7837 // missing some?
7838 bool skip_maps = false;
7839 if (first > superblock.newest_map + 1) {
7840 dout(10) << "handle_osd_map message skips epochs "
7841 << superblock.newest_map + 1 << ".." << (first-1) << dendl;
7842 if (m->oldest_map <= superblock.newest_map + 1) {
7843 osdmap_subscribe(superblock.newest_map + 1, false);
7844 m->put();
7845 return;
7846 }
7847 // always try to get the full range of maps--as many as we can. this
7848 // 1- is good to have
7849 // 2- is at present the only way to ensure that we get a *full* map as
7850 // the first map!
7851 if (m->oldest_map < first) {
7852 osdmap_subscribe(m->oldest_map - 1, true);
7853 m->put();
7854 return;
7855 }
7856 skip_maps = true;
7857 }
7858
7859 ObjectStore::Transaction t;
7860 uint64_t txn_size = 0;
7861
7862 // store new maps: queue for disk and put in the osdmap cache
7863 epoch_t start = MAX(superblock.newest_map + 1, first);
7864 for (epoch_t e = start; e <= last; e++) {
7865 if (txn_size >= t.get_num_bytes()) {
7866 derr << __func__ << " transaction size overflowed" << dendl;
7867 assert(txn_size < t.get_num_bytes());
7868 }
7869 txn_size = t.get_num_bytes();
7870 map<epoch_t,bufferlist>::iterator p;
7871 p = m->maps.find(e);
7872 if (p != m->maps.end()) {
7873 dout(10) << "handle_osd_map got full map for epoch " << e << dendl;
7874 OSDMap *o = new OSDMap;
7875 bufferlist& bl = p->second;
7876
7877 o->decode(bl);
7878
7879 ghobject_t fulloid = get_osdmap_pobject_name(e);
7880 t.write(coll_t::meta(), fulloid, 0, bl.length(), bl);
7881 pin_map_bl(e, bl);
7882 pinned_maps.push_back(add_map(o));
7883
7884 got_full_map(e);
7885 continue;
7886 }
7887
7888 p = m->incremental_maps.find(e);
7889 if (p != m->incremental_maps.end()) {
7890 dout(10) << "handle_osd_map got inc map for epoch " << e << dendl;
7891 bufferlist& bl = p->second;
7892 ghobject_t oid = get_inc_osdmap_pobject_name(e);
7893 t.write(coll_t::meta(), oid, 0, bl.length(), bl);
7894 pin_map_inc_bl(e, bl);
7895
7896 OSDMap *o = new OSDMap;
7897 if (e > 1) {
7898 bufferlist obl;
7899 bool got = get_map_bl(e - 1, obl);
7900 assert(got);
7901 o->decode(obl);
7902 }
7903
7904 OSDMap::Incremental inc;
7905 bufferlist::iterator p = bl.begin();
7906 inc.decode(p);
7907 if (o->apply_incremental(inc) < 0) {
7908 derr << "ERROR: bad fsid? i have " << osdmap->get_fsid() << " and inc has " << inc.fsid << dendl;
7909 assert(0 == "bad fsid");
7910 }
7911
7912 bufferlist fbl;
7913 o->encode(fbl, inc.encode_features | CEPH_FEATURE_RESERVED);
7914
7915 bool injected_failure = false;
7916 if (cct->_conf->osd_inject_bad_map_crc_probability > 0 &&
7917 (rand() % 10000) < cct->_conf->osd_inject_bad_map_crc_probability*10000.0) {
7918 derr << __func__ << " injecting map crc failure" << dendl;
7919 injected_failure = true;
7920 }
7921
7922 if ((inc.have_crc && o->get_crc() != inc.full_crc) || injected_failure) {
7923 dout(2) << "got incremental " << e
7924 << " but failed to encode full with correct crc; requesting"
7925 << dendl;
7926 clog->warn() << "failed to encode map e" << e << " with expected crc";
7927 dout(20) << "my encoded map was:\n";
7928 fbl.hexdump(*_dout);
7929 *_dout << dendl;
7930 delete o;
7931 request_full_map(e, last);
7932 last = e - 1;
7933 break;
7934 }
7935 got_full_map(e);
7936
7937 ghobject_t fulloid = get_osdmap_pobject_name(e);
7938 t.write(coll_t::meta(), fulloid, 0, fbl.length(), fbl);
7939 pin_map_bl(e, fbl);
7940 pinned_maps.push_back(add_map(o));
7941 continue;
7942 }
7943
7944 assert(0 == "MOSDMap lied about what maps it had?");
7945 }
7946
7947 // even if this map isn't from a mon, we may have satisfied our subscription
7948 monc->sub_got("osdmap", last);
7949
7950 if (!m->maps.empty() && requested_full_first) {
7951 dout(10) << __func__ << " still missing full maps " << requested_full_first
7952 << ".." << requested_full_last << dendl;
7953 rerequest_full_maps();
7954 }
7955
7956 if (superblock.oldest_map) {
7957 // make sure we at least keep pace with incoming maps
7958 trim_maps(m->oldest_map, last - first + 1, skip_maps);
7959 }
7960
7961 if (!superblock.oldest_map || skip_maps)
7962 superblock.oldest_map = first;
7963 superblock.newest_map = last;
7964 superblock.current_epoch = last;
7965
7966 // note in the superblock that we were clean thru the prior epoch
7967 epoch_t boot_epoch = service.get_boot_epoch();
7968 if (boot_epoch && boot_epoch >= superblock.mounted) {
7969 superblock.mounted = boot_epoch;
7970 superblock.clean_thru = last;
7971 }
7972
7973 // superblock and commit
7974 write_superblock(t);
7975 store->queue_transaction(
7976 service.meta_osr.get(),
7977 std::move(t),
7978 new C_OnMapApply(&service, pinned_maps, last),
7979 new C_OnMapCommit(this, start, last, m), 0);
7980 service.publish_superblock(superblock);
7981 }
7982
/*
 * Walk the in-memory OSD state forward through map epochs [first, last].
 *
 * For each epoch the new map is fetched from the cache, pre-published to
 * the service, connections to peers that changed up/down state are noted,
 * and `osdmap` is replaced.  After the last epoch this function decides
 * whether the OSD becomes active, must rebind/restart because the map
 * disagrees with our own addresses or marks us down, or must shut down.
 * It then calls consume_map() and either activate_map() or drains the
 * peering work queue.
 *
 * Takes osd_lock for the whole call and map_lock (write) while the map is
 * being advanced.  Returns early, doing nothing, if the OSD is stopping.
 *
 * first, last  inclusive epoch range to advance through
 * m            the MOSDMap message; only read here (oldest_map, newest_map,
 *              get_source()).  NOTE(review): presumably invoked from
 *              C_OnMapCommit, which owns the message reference -- confirm.
 */
7983 void OSD::_committed_osd_maps(epoch_t first, epoch_t last, MOSDMap *m)
7984 {
7985 dout(10) << __func__ << " " << first << ".." << last << dendl;
// check once before blocking on osd_lock, and again once we hold it
7986 if (is_stopping()) {
7987 dout(10) << __func__ << " bailing, we are shutting down" << dendl;
7988 return;
7989 }
7990 Mutex::Locker l(osd_lock);
7991 if (is_stopping()) {
7992 dout(10) << __func__ << " bailing, we are shutting down" << dendl;
7993 return;
7994 }
7995 map_lock.get_write();
7996
// outcome flags, acted upon at the very end of this function
7997 bool do_shutdown = false;
7998 bool do_restart = false;
7999 bool network_error = false;
8000
8001 // advance through the new maps
8002 for (epoch_t cur = first; cur <= last; cur++) {
8003 dout(10) << " advance to epoch " << cur
8004 << " (<= last " << last
8005 << " <= newest_map " << superblock.newest_map
8006 << ")" << dendl;
8007
8008 OSDMapRef newmap = get_map(cur);
8009 assert(newmap); // we just cached it above!
8010
8011 // start blacklisting messages sent to peers that go down.
8012 service.pre_publish_map(newmap);
8013
8014 // kill connections to newly down osds
// await_reserved_maps() is called at most once per epoch, and only if
// at least one peer actually went down
8015 bool waited_for_reservations = false;
8016 set<int> old;
8017 osdmap->get_all_osds(old);
8018 for (set<int>::iterator p = old.begin(); p != old.end(); ++p) {
8019 if (*p != whoami &&
8020 osdmap->is_up(*p) && // in old map
8021 newmap->is_down(*p)) { // but not the new one
8022 if (!waited_for_reservations) {
8023 service.await_reserved_maps();
8024 waited_for_reservations = true;
8025 }
8026 note_down_osd(*p);
8027 } else if (*p != whoami &&
8028 osdmap->is_down(*p) &&
8029 newmap->is_up(*p)) {
8030 note_up_osd(*p);
8031 }
8032 }
8033
// the cluster-wide NOUP flag or our per-osd noup state changed
8034 if ((osdmap->test_flag(CEPH_OSDMAP_NOUP) !=
8035 newmap->test_flag(CEPH_OSDMAP_NOUP)) ||
8036 (osdmap->is_noup(whoami) != newmap->is_noup(whoami))) {
8037 dout(10) << __func__ << " NOUP flag changed in " << newmap->get_epoch()
8038 << dendl;
8039 if (is_booting()) {
8040 // this captures the case where we sent the boot message while
8041 // NOUP was being set on the mon and our boot request was
8042 // dropped, and then later it is cleared. it imperfectly
8043 // handles the case where our original boot message was not
8044 // dropped and we restart even though we might have booted, but
8045 // that is harmless (boot will just take slightly longer).
8046 do_restart = true;
8047 }
8048 }
// this epoch is the one in which require_osd_release crosses into luminous
8049 if (osdmap->require_osd_release < CEPH_RELEASE_LUMINOUS &&
8050 newmap->require_osd_release >= CEPH_RELEASE_LUMINOUS) {
8051 dout(10) << __func__ << " require_osd_release reached luminous in "
8052 << newmap->get_epoch() << dendl;
8053 clear_pg_stat_queue();
8054 clear_outstanding_pg_stats();
8055 }
8056
8057 osdmap = newmap;
// if no up_epoch is recorded yet and this map shows us up with our
// current client instance, record up_epoch (and boot_epoch if unset)
8058 epoch_t up_epoch;
8059 epoch_t boot_epoch;
8060 service.retrieve_epochs(&boot_epoch, &up_epoch, NULL);
8061 if (!up_epoch &&
8062 osdmap->is_up(whoami) &&
8063 osdmap->get_inst(whoami) == client_messenger->get_myinst()) {
8064 up_epoch = osdmap->get_epoch();
8065 dout(10) << "up_epoch is " << up_epoch << dendl;
8066 if (!boot_epoch) {
8067 boot_epoch = osdmap->get_epoch();
8068 dout(10) << "boot_epoch is " << boot_epoch << dendl;
8069 }
8070 service.set_epochs(&boot_epoch, &up_epoch, NULL);
8071 }
8072 }
8073
8074 had_map_since = ceph_clock_now();
8075
// booting -> active: the final map shows us up at our client address and
// our up_from epoch is newer than the epoch we last (re)bound at
8076 epoch_t _bind_epoch = service.get_bind_epoch();
8077 if (osdmap->is_up(whoami) &&
8078 osdmap->get_addr(whoami) == client_messenger->get_myaddr() &&
8079 _bind_epoch < osdmap->get_up_from(whoami)) {
8080
8081 if (is_booting()) {
8082 dout(1) << "state: booting -> active" << dendl;
8083 set_state(STATE_ACTIVE);
8084
8085 // set incarnation so that osd_reqid_t's we generate for our
8086 // objecter requests are unique across restarts.
8087 service.objecter->set_client_incarnation(osdmap->get_epoch());
8088 }
8089 }
8090
// while active, verify that the map still agrees with our own view
8091 if (osdmap->get_epoch() > 0 &&
8092 is_active()) {
8093 if (!osdmap->exists(whoami)) {
8094 dout(0) << "map says i do not exist. shutting down." << dendl;
8095 do_shutdown = true; // don't call shutdown() while we have
8096 // everything paused
// we are marked down, or one of our client / cluster / heartbeat
// addresses does not match what the map has recorded for us
8097 } else if (!osdmap->is_up(whoami) ||
8098 !osdmap->get_addr(whoami).probably_equals(
8099 client_messenger->get_myaddr()) ||
8100 !osdmap->get_cluster_addr(whoami).probably_equals(
8101 cluster_messenger->get_myaddr()) ||
8102 !osdmap->get_hb_back_addr(whoami).probably_equals(
8103 hb_back_server_messenger->get_myaddr()) ||
8104 (osdmap->get_hb_front_addr(whoami) != entity_addr_t() &&
8105 !osdmap->get_hb_front_addr(whoami).probably_equals(
8106 hb_front_server_messenger->get_myaddr()))) {
// report only the first mismatch found, in the same order as tested above
8107 if (!osdmap->is_up(whoami)) {
8108 if (service.is_preparing_to_stop() || service.is_stopping()) {
8109 service.got_stop_ack();
8110 } else {
8111 clog->warn() << "Monitor daemon marked osd." << whoami << " down, "
8112 "but it is still running";
8113 clog->debug() << "map e" << osdmap->get_epoch()
8114 << " wrongly marked me down at e"
8115 << osdmap->get_down_at(whoami);
8116 }
8117 } else if (!osdmap->get_addr(whoami).probably_equals(
8118 client_messenger->get_myaddr())) {
8119 clog->error() << "map e" << osdmap->get_epoch()
8120 << " had wrong client addr (" << osdmap->get_addr(whoami)
8121 << " != my " << client_messenger->get_myaddr() << ")";
8122 } else if (!osdmap->get_cluster_addr(whoami).probably_equals(
8123 cluster_messenger->get_myaddr())) {
8124 clog->error() << "map e" << osdmap->get_epoch()
8125 << " had wrong cluster addr ("
8126 << osdmap->get_cluster_addr(whoami)
8127 << " != my " << cluster_messenger->get_myaddr() << ")";
8128 } else if (!osdmap->get_hb_back_addr(whoami).probably_equals(
8129 hb_back_server_messenger->get_myaddr())) {
8130 clog->error() << "map e" << osdmap->get_epoch()
8131 << " had wrong heartbeat back addr ("
8132 << osdmap->get_hb_back_addr(whoami)
8133 << " != my " << hb_back_server_messenger->get_myaddr()
8134 << ")";
8135 } else if (osdmap->get_hb_front_addr(whoami) != entity_addr_t() &&
8136 !osdmap->get_hb_front_addr(whoami).probably_equals(
8137 hb_front_server_messenger->get_myaddr())) {
8138 clog->error() << "map e" << osdmap->get_epoch()
8139 << " had wrong heartbeat front addr ("
8140 << osdmap->get_hb_front_addr(whoami)
8141 << " != my " << hb_front_server_messenger->get_myaddr()
8142 << ")";
8143 }
8144
8145 if (!service.is_stopping()) {
// reset up_epoch to 0 and record the current epoch as bind_epoch
// before rebinding and restarting the boot sequence
8146 epoch_t up_epoch = 0;
8147 epoch_t bind_epoch = osdmap->get_epoch();
8148 service.set_epochs(NULL,&up_epoch, &bind_epoch);
8149 do_restart = true;
8150
8151 //add markdown log
8152 utime_t now = ceph_clock_now();
8153 utime_t grace = utime_t(cct->_conf->osd_max_markdown_period, 0);
8154 osd_markdown_log.push_back(now);
8155 //clear all out-of-date log
8156 while (!osd_markdown_log.empty() &&
8157 osd_markdown_log.front() + grace < now)
8158 osd_markdown_log.pop_front();
// more than osd_max_markdown_count markdowns remain within the
// period: shut down instead of restarting
8159 if ((int)osd_markdown_log.size() > cct->_conf->osd_max_markdown_count) {
8160 dout(0) << __func__ << " marked down "
8161 << osd_markdown_log.size()
8162 << " > osd_max_markdown_count "
8163 << cct->_conf->osd_max_markdown_count
8164 << " in last " << grace << " seconds, shutting down"
8165 << dendl;
8166 do_restart = false;
8167 do_shutdown = true;
8168 }
8169
8170 start_waiting_for_healthy();
8171
// ports currently in use by our messengers, passed to each rebind()
8172 set<int> avoid_ports;
8173 #if defined(__FreeBSD__)
8174 // prevent FreeBSD from grabbing the client_messenger port during
8175 // rebinding. In which case a cluster_messenger will connect also
8176 // to the same port
8177 avoid_ports.insert(client_messenger->get_myaddr().get_port());
8178 #endif
8179 avoid_ports.insert(cluster_messenger->get_myaddr().get_port());
8180 avoid_ports.insert(hb_back_server_messenger->get_myaddr().get_port());
8181 avoid_ports.insert(hb_front_server_messenger->get_myaddr().get_port());
8182
// any rebind failure is flagged as a network error and forces shutdown
8183 int r = cluster_messenger->rebind(avoid_ports);
8184 if (r != 0) {
8185 do_shutdown = true; // FIXME: do_restart?
8186 network_error = true;
8187 dout(0) << __func__ << " marked down:"
8188 << " rebind cluster_messenger failed" << dendl;
8189 }
8190
8191 r = hb_back_server_messenger->rebind(avoid_ports);
8192 if (r != 0) {
8193 do_shutdown = true; // FIXME: do_restart?
8194 network_error = true;
8195 dout(0) << __func__ << " marked down:"
8196 << " rebind hb_back_server_messenger failed" << dendl;
8197 }
8198
8199 r = hb_front_server_messenger->rebind(avoid_ports);
8200 if (r != 0) {
8201 do_shutdown = true; // FIXME: do_restart?
8202 network_error = true;
8203 dout(0) << __func__ << " marked down:"
8204 << " rebind hb_front_server_messenger failed" << dendl;
8205 }
8206
8207 hb_front_client_messenger->mark_down_all();
8208 hb_back_client_messenger->mark_down_all();
8209
8210 reset_heartbeat_peers();
8211 }
8212 }
8213 }
8214
// map advance is done; everything below runs without map_lock
// (osd_lock is still held via the Locker above)
8215 map_lock.put_write();
8216
8217 check_osdmap_features(store);
8218
8219 // yay!
8220 consume_map();
8221
8222 if (is_active() || is_waiting_for_healthy())
8223 maybe_update_heartbeat_peers();
8224
8225 if (!is_active()) {
8226 dout(10) << " not yet active; waiting for peering wq to drain" << dendl;
8227 peering_wq.drain();
8228 } else {
8229 activate_map();
8230 }
8231
// final dispatch: at most one of shutdown / fetch more maps / preboot /
// restart boot is taken, in that priority order
8232 if (do_shutdown) {
8233 if (network_error) {
// a rebind failed: send "still alive" for every pending failure
// report and drop it from failure_pending
8234 Mutex::Locker l(heartbeat_lock);
8235 map<int,pair<utime_t,entity_inst_t>>::iterator it =
8236 failure_pending.begin();
8237 while (it != failure_pending.end()) {
8238 dout(10) << "handle_osd_ping canceling in-flight failure report for osd."
8239 << it->first << dendl;
8240 send_still_alive(osdmap->get_epoch(), it->second.second);
8241 failure_pending.erase(it++);
8242 }
8243 }
8244 // trigger shutdown in a different thread
8245 dout(0) << __func__ << " shutdown OSD via async signal" << dendl;
8246 queue_async_signal(SIGINT);
8247 }
8248 else if (m->newest_map && m->newest_map > last) {
8249 dout(10) << " msg say newest map is " << m->newest_map
8250 << ", requesting more" << dendl;
8251 osdmap_subscribe(osdmap->get_epoch()+1, false);
8252 }
8253 else if (is_preboot()) {
8254 if (m->get_source().is_mon())
8255 _preboot(m->oldest_map, m->newest_map);
8256 else
8257 start_boot();
8258 }
8259 else if (do_restart)
8260 start_boot();
8261
8262 }
8263
8264 void OSD::check_osdmap_features(ObjectStore *fs)
8265 {
8266 // adjust required feature bits?
8267
8268 // we have to be a bit careful here, because we are accessing the
8269 // Policy structures without taking any lock. in particular, only
8270 // modify integer values that can safely be read by a racing CPU.
8271 // since we are only accessing existing Policy structures a their
8272 // current memory location, and setting or clearing bits in integer
8273 // fields, and we are the only writer, this is not a problem.
8274
8275 {
8276 Messenger::Policy p = client_messenger->get_default_policy();
8277 uint64_t mask;
8278 uint64_t features = osdmap->get_features(entity_name_t::TYPE_CLIENT, &mask);
8279 if ((p.features_required & mask) != features) {
8280 dout(0) << "crush map has features " << features
8281 << ", adjusting msgr requires for clients" << dendl;
8282 p.features_required = (p.features_required & ~mask) | features;
8283 client_messenger->set_default_policy(p);
8284 }
8285 }
8286 {
8287 Messenger::Policy p = client_messenger->get_policy(entity_name_t::TYPE_MON);
8288 uint64_t mask;
8289 uint64_t features = osdmap->get_features(entity_name_t::TYPE_MON, &mask);
8290 if ((p.features_required & mask) != features) {
8291 dout(0) << "crush map has features " << features
8292 << " was " << p.features_required
8293 << ", adjusting msgr requires for mons" << dendl;
8294 p.features_required = (p.features_required & ~mask) | features;
8295 client_messenger->set_policy(entity_name_t::TYPE_MON, p);
8296 }
8297 }
8298 {
8299 Messenger::Policy p = cluster_messenger->get_policy(entity_name_t::TYPE_OSD);
8300 uint64_t mask;
8301 uint64_t features = osdmap->get_features(entity_name_t::TYPE_OSD, &mask);
8302
8303 if ((p.features_required & mask) != features) {
8304 dout(0) << "crush map has features " << features
8305 << ", adjusting msgr requires for osds" << dendl;
8306 p.features_required = (p.features_required & ~mask) | features;
8307 cluster_messenger->set_policy(entity_name_t::TYPE_OSD, p);
8308 }
8309
8310 if ((features & CEPH_FEATURE_OSD_ERASURE_CODES) &&
8311 !superblock.compat_features.incompat.contains(CEPH_OSD_FEATURE_INCOMPAT_SHARDS)) {
8312 dout(0) << __func__ << " enabling on-disk ERASURE CODES compat feature" << dendl;
8313 superblock.compat_features.incompat.insert(CEPH_OSD_FEATURE_INCOMPAT_SHARDS);
8314 ObjectStore::Transaction t;
8315 write_superblock(t);
8316 int err = store->queue_transaction(service.meta_osr.get(), std::move(t), NULL);
8317 assert(err == 0);
8318 }
8319 }
8320 }
8321
8322 bool OSD::advance_pg(
8323 epoch_t osd_epoch, PG *pg,
8324 ThreadPool::TPHandle &handle,
8325 PG::RecoveryCtx *rctx,
8326 set<PGRef> *new_pgs)
8327 {
8328 assert(pg->is_locked());
8329 epoch_t next_epoch = pg->get_osdmap()->get_epoch() + 1;
8330 OSDMapRef lastmap = pg->get_osdmap();
8331
8332 if (lastmap->get_epoch() == osd_epoch)
8333 return true;
8334 assert(lastmap->get_epoch() < osd_epoch);
8335
8336 epoch_t min_epoch = service.get_min_pg_epoch();
8337 epoch_t max;
8338 if (min_epoch) {
8339 max = min_epoch + cct->_conf->osd_map_max_advance;
8340 } else {
8341 max = next_epoch + cct->_conf->osd_map_max_advance;
8342 }
8343
8344 for (;
8345 next_epoch <= osd_epoch && next_epoch <= max;
8346 ++next_epoch) {
8347 OSDMapRef nextmap = service.try_get_map(next_epoch);
8348 if (!nextmap) {
8349 dout(20) << __func__ << " missing map " << next_epoch << dendl;
8350 // make sure max is bumped up so that we can get past any
8351 // gap in maps
8352 max = MAX(max, next_epoch + cct->_conf->osd_map_max_advance);
8353 continue;
8354 }
8355
8356 vector<int> newup, newacting;
8357 int up_primary, acting_primary;
8358 nextmap->pg_to_up_acting_osds(
8359 pg->info.pgid.pgid,
8360 &newup, &up_primary,
8361 &newacting, &acting_primary);
8362 pg->handle_advance_map(
8363 nextmap, lastmap, newup, up_primary,
8364 newacting, acting_primary, rctx);
8365
8366 // Check for split!
8367 set<spg_t> children;
8368 spg_t parent(pg->info.pgid);
8369 if (parent.is_split(
8370 lastmap->get_pg_num(pg->pool.id),
8371 nextmap->get_pg_num(pg->pool.id),
8372 &children)) {
8373 service.mark_split_in_progress(pg->info.pgid, children);
8374 split_pgs(
8375 pg, children, new_pgs, lastmap, nextmap,
8376 rctx);
8377 }
8378
8379 lastmap = nextmap;
8380 handle.reset_tp_timeout();
8381 }
8382 service.pg_update_epoch(pg->info.pgid, lastmap->get_epoch());
8383 pg->handle_activate_map(rctx);
8384 if (next_epoch <= osd_epoch) {
8385 dout(10) << __func__ << " advanced to max " << max
8386 << " past min epoch " << min_epoch
8387 << " ... will requeue " << *pg << dendl;
8388 return false;
8389 }
8390 return true;
8391 }
8392
8393 void OSD::consume_map()
8394 {
8395 assert(osd_lock.is_locked());
8396 dout(7) << "consume_map version " << osdmap->get_epoch() << dendl;
8397
8398 /** make sure the cluster is speaking in SORTBITWISE, because we don't
8399 * speak the older sorting version any more. Be careful not to force
8400 * a shutdown if we are merely processing old maps, though.
8401 */
8402 if (!osdmap->test_flag(CEPH_OSDMAP_SORTBITWISE) && is_active()) {
8403 derr << __func__ << " SORTBITWISE flag is not set" << dendl;
8404 ceph_abort();
8405 }
8406
8407 int num_pg_primary = 0, num_pg_replica = 0, num_pg_stray = 0;
8408 list<PGRef> to_remove;
8409
8410 // scan pg's
8411 {
8412 RWLock::RLocker l(pg_map_lock);
8413 for (ceph::unordered_map<spg_t,PG*>::iterator it = pg_map.begin();
8414 it != pg_map.end();
8415 ++it) {
8416 PG *pg = it->second;
8417 pg->lock();
8418 if (pg->is_primary())
8419 num_pg_primary++;
8420 else if (pg->is_replica())
8421 num_pg_replica++;
8422 else
8423 num_pg_stray++;
8424
8425 if (!osdmap->have_pg_pool(pg->info.pgid.pool())) {
8426 //pool is deleted!
8427 to_remove.push_back(PGRef(pg));
8428 } else {
8429 service.init_splits_between(it->first, service.get_osdmap(), osdmap);
8430 }
8431
8432 pg->unlock();
8433 }
8434
8435 lock_guard<mutex> pending_creates_locker{pending_creates_lock};
8436 for (auto pg = pending_creates_from_osd.cbegin();
8437 pg != pending_creates_from_osd.cend();) {
8438 if (osdmap->get_pg_acting_rank(pg->first, whoami) < 0) {
8439 pg = pending_creates_from_osd.erase(pg);
8440 } else {
8441 ++pg;
8442 }
8443 }
8444 }
8445
8446 for (list<PGRef>::iterator i = to_remove.begin();
8447 i != to_remove.end();
8448 to_remove.erase(i++)) {
8449 RWLock::WLocker locker(pg_map_lock);
8450 (*i)->lock();
8451 _remove_pg(&**i);
8452 (*i)->unlock();
8453 }
8454
8455 service.expand_pg_num(service.get_osdmap(), osdmap);
8456
8457 service.pre_publish_map(osdmap);
8458 service.await_reserved_maps();
8459 service.publish_map(osdmap);
8460
8461 service.maybe_inject_dispatch_delay();
8462
8463 dispatch_sessions_waiting_on_map();
8464
8465 service.maybe_inject_dispatch_delay();
8466
8467 // remove any PGs which we no longer host from the session waiting_for_pg lists
8468 dout(20) << __func__ << " checking waiting_for_pg" << dendl;
8469 op_shardedwq.prune_pg_waiters(osdmap, whoami);
8470
8471 service.maybe_inject_dispatch_delay();
8472
8473 // scan pg's
8474 {
8475 RWLock::RLocker l(pg_map_lock);
8476 for (ceph::unordered_map<spg_t,PG*>::iterator it = pg_map.begin();
8477 it != pg_map.end();
8478 ++it) {
8479 PG *pg = it->second;
8480 pg->lock();
8481 pg->queue_null(osdmap->get_epoch(), osdmap->get_epoch());
8482 pg->unlock();
8483 }
8484
8485 logger->set(l_osd_pg, pg_map.size());
8486 }
8487 logger->set(l_osd_pg_primary, num_pg_primary);
8488 logger->set(l_osd_pg_replica, num_pg_replica);
8489 logger->set(l_osd_pg_stray, num_pg_stray);
8490 logger->set(l_osd_pg_removing, remove_wq.get_remove_queue_len());
8491 }
8492
8493 void OSD::activate_map()
8494 {
8495 assert(osd_lock.is_locked());
8496
8497 dout(7) << "activate_map version " << osdmap->get_epoch() << dendl;
8498
8499 if (osdmap->test_flag(CEPH_OSDMAP_FULL)) {
8500 dout(10) << " osdmap flagged full, doing onetime osdmap subscribe" << dendl;
8501 osdmap_subscribe(osdmap->get_epoch() + 1, false);
8502 }
8503
8504 // norecover?
8505 if (osdmap->test_flag(CEPH_OSDMAP_NORECOVER)) {
8506 if (!service.recovery_is_paused()) {
8507 dout(1) << "pausing recovery (NORECOVER flag set)" << dendl;
8508 service.pause_recovery();
8509 }
8510 } else {
8511 if (service.recovery_is_paused()) {
8512 dout(1) << "unpausing recovery (NORECOVER flag unset)" << dendl;
8513 service.unpause_recovery();
8514 }
8515 }
8516
8517 service.activate_map();
8518
8519 // process waiters
8520 take_waiters(waiting_for_osdmap);
8521 }
8522
8523 bool OSD::require_mon_peer(const Message *m)
8524 {
8525 if (!m->get_connection()->peer_is_mon()) {
8526 dout(0) << "require_mon_peer received from non-mon "
8527 << m->get_connection()->get_peer_addr()
8528 << " " << *m << dendl;
8529 return false;
8530 }
8531 return true;
8532 }
8533
8534 bool OSD::require_mon_or_mgr_peer(const Message *m)
8535 {
8536 if (!m->get_connection()->peer_is_mon() &&
8537 !m->get_connection()->peer_is_mgr()) {
8538 dout(0) << "require_mon_or_mgr_peer received from non-mon, non-mgr "
8539 << m->get_connection()->get_peer_addr()
8540 << " " << *m << dendl;
8541 return false;
8542 }
8543 return true;
8544 }
8545
8546 bool OSD::require_osd_peer(const Message *m)
8547 {
8548 if (!m->get_connection()->peer_is_osd()) {
8549 dout(0) << "require_osd_peer received from non-osd "
8550 << m->get_connection()->get_peer_addr()
8551 << " " << *m << dendl;
8552 return false;
8553 }
8554 return true;
8555 }
8556
8557 bool OSD::require_self_aliveness(const Message *m, epoch_t epoch)
8558 {
8559 epoch_t up_epoch = service.get_up_epoch();
8560 if (epoch < up_epoch) {
8561 dout(7) << "from pre-up epoch " << epoch << " < " << up_epoch << dendl;
8562 return false;
8563 }
8564
8565 if (!is_active()) {
8566 dout(7) << "still in boot state, dropping message " << *m << dendl;
8567 return false;
8568 }
8569
8570 return true;
8571 }
8572
8573 bool OSD::require_same_peer_instance(const Message *m, OSDMapRef& map,
8574 bool is_fast_dispatch)
8575 {
8576 int from = m->get_source().num();
8577
8578 if (map->is_down(from) ||
8579 (map->get_cluster_addr(from) != m->get_source_inst().addr)) {
8580 dout(5) << "from dead osd." << from << ", marking down, "
8581 << " msg was " << m->get_source_inst().addr
8582 << " expected " << (map->is_up(from) ?
8583 map->get_cluster_addr(from) : entity_addr_t())
8584 << dendl;
8585 ConnectionRef con = m->get_connection();
8586 con->mark_down();
8587 Session *s = static_cast<Session*>(con->get_priv());
8588 if (s) {
8589 if (!is_fast_dispatch)
8590 s->session_dispatch_lock.Lock();
8591 clear_session_waiting_on_map(s);
8592 con->set_priv(NULL); // break ref <-> session cycle, if any
8593 if (!is_fast_dispatch)
8594 s->session_dispatch_lock.Unlock();
8595 s->put();
8596 }
8597 return false;
8598 }
8599 return true;
8600 }
8601
8602
8603 /*
8604 * require that we have same (or newer) map, and that
8605 * the source is the pg primary.
8606 */
8607 bool OSD::require_same_or_newer_map(OpRequestRef& op, epoch_t epoch,
8608 bool is_fast_dispatch)
8609 {
8610 const Message *m = op->get_req();
8611 dout(15) << "require_same_or_newer_map " << epoch
8612 << " (i am " << osdmap->get_epoch() << ") " << m << dendl;
8613
8614 assert(osd_lock.is_locked());
8615
8616 // do they have a newer map?
8617 if (epoch > osdmap->get_epoch()) {
8618 dout(7) << "waiting for newer map epoch " << epoch
8619 << " > my " << osdmap->get_epoch() << " with " << m << dendl;
8620 wait_for_new_map(op);
8621 return false;
8622 }
8623
8624 if (!require_self_aliveness(op->get_req(), epoch)) {
8625 return false;
8626 }
8627
8628 // ok, our map is same or newer.. do they still exist?
8629 if (m->get_connection()->get_messenger() == cluster_messenger &&
8630 !require_same_peer_instance(op->get_req(), osdmap, is_fast_dispatch)) {
8631 return false;
8632 }
8633
8634 return true;
8635 }
8636
8637
8638
8639
8640
8641 // ----------------------------------------
8642 // pg creation
8643
8644 void OSD::split_pgs(
8645 PG *parent,
8646 const set<spg_t> &childpgids, set<PGRef> *out_pgs,
8647 OSDMapRef curmap,
8648 OSDMapRef nextmap,
8649 PG::RecoveryCtx *rctx)
8650 {
8651 unsigned pg_num = nextmap->get_pg_num(
8652 parent->pool.id);
8653 parent->update_snap_mapper_bits(
8654 parent->info.pgid.get_split_bits(pg_num)
8655 );
8656
8657 vector<object_stat_sum_t> updated_stats(childpgids.size() + 1);
8658 parent->info.stats.stats.sum.split(updated_stats);
8659
8660 vector<object_stat_sum_t>::iterator stat_iter = updated_stats.begin();
8661 for (set<spg_t>::const_iterator i = childpgids.begin();
8662 i != childpgids.end();
8663 ++i, ++stat_iter) {
8664 assert(stat_iter != updated_stats.end());
8665 dout(10) << "Splitting " << *parent << " into " << *i << dendl;
8666 assert(service.splitting(*i));
8667 PG* child = _make_pg(nextmap, *i);
8668 child->lock(true);
8669 out_pgs->insert(child);
8670 rctx->created_pgs.insert(child);
8671
8672 unsigned split_bits = i->get_split_bits(pg_num);
8673 dout(10) << "pg_num is " << pg_num << dendl;
8674 dout(10) << "m_seed " << i->ps() << dendl;
8675 dout(10) << "split_bits is " << split_bits << dendl;
8676
8677 parent->split_colls(
8678 *i,
8679 split_bits,
8680 i->ps(),
8681 &child->pool.info,
8682 rctx->transaction);
8683 parent->split_into(
8684 i->pgid,
8685 child,
8686 split_bits);
8687 child->info.stats.stats.sum = *stat_iter;
8688
8689 child->write_if_dirty(*(rctx->transaction));
8690 child->unlock();
8691 }
8692 assert(stat_iter != updated_stats.end());
8693 parent->info.stats.stats.sum = *stat_iter;
8694 parent->write_if_dirty(*(rctx->transaction));
8695 }
8696
8697 /*
8698 * holding osd_lock
8699 */
8700 void OSD::handle_pg_create(OpRequestRef op)
8701 {
8702 const MOSDPGCreate *m = static_cast<const MOSDPGCreate*>(op->get_req());
8703 assert(m->get_type() == MSG_OSD_PG_CREATE);
8704
8705 dout(10) << "handle_pg_create " << *m << dendl;
8706
8707 if (!require_mon_peer(op->get_req())) {
8708 return;
8709 }
8710
8711 if (!require_same_or_newer_map(op, m->epoch, false))
8712 return;
8713
8714 op->mark_started();
8715
8716 map<pg_t,utime_t>::const_iterator ci = m->ctimes.begin();
8717 for (map<pg_t,pg_create_t>::const_iterator p = m->mkpg.begin();
8718 p != m->mkpg.end();
8719 ++p, ++ci) {
8720 assert(ci != m->ctimes.end() && ci->first == p->first);
8721 epoch_t created = p->second.created;
8722 if (p->second.split_bits) // Skip split pgs
8723 continue;
8724 pg_t on = p->first;
8725
8726 if (on.preferred() >= 0) {
8727 dout(20) << "ignoring localized pg " << on << dendl;
8728 continue;
8729 }
8730
8731 if (!osdmap->have_pg_pool(on.pool())) {
8732 dout(20) << "ignoring pg on deleted pool " << on << dendl;
8733 continue;
8734 }
8735
8736 dout(20) << "mkpg " << on << " e" << created << "@" << ci->second << dendl;
8737
8738 // is it still ours?
8739 vector<int> up, acting;
8740 int up_primary = -1;
8741 int acting_primary = -1;
8742 osdmap->pg_to_up_acting_osds(on, &up, &up_primary, &acting, &acting_primary);
8743 int role = osdmap->calc_pg_role(whoami, acting, acting.size());
8744
8745 if (acting_primary != whoami) {
8746 dout(10) << "mkpg " << on << " not acting_primary (" << acting_primary
8747 << "), my role=" << role << ", skipping" << dendl;
8748 continue;
8749 }
8750
8751 spg_t pgid;
8752 bool mapped = osdmap->get_primary_shard(on, &pgid);
8753 assert(mapped);
8754
8755 PastIntervals pi(
8756 osdmap->get_pools().at(pgid.pool()).ec_pool(),
8757 *osdmap);
8758 pg_history_t history;
8759 build_initial_pg_history(pgid, created, ci->second, &history, &pi);
8760
8761 // The mon won't resend unless the primary changed, so
8762 // we ignore same_interval_since. We'll pass this history
8763 // to handle_pg_peering_evt with the current epoch as the
8764 // event -- the project_pg_history check in
8765 // handle_pg_peering_evt will be a noop.
8766 if (history.same_primary_since > m->epoch) {
8767 dout(10) << __func__ << ": got obsolete pg create on pgid "
8768 << pgid << " from epoch " << m->epoch
8769 << ", primary changed in " << history.same_primary_since
8770 << dendl;
8771 continue;
8772 }
8773 if (handle_pg_peering_evt(
8774 pgid,
8775 history,
8776 pi,
8777 osdmap->get_epoch(),
8778 PG::CephPeeringEvtRef(
8779 new PG::CephPeeringEvt(
8780 osdmap->get_epoch(),
8781 osdmap->get_epoch(),
8782 PG::NullEvt()))
8783 ) == -EEXIST) {
8784 service.send_pg_created(pgid.pgid);
8785 }
8786 }
8787
8788 {
8789 lock_guard<mutex> pending_creates_locker{pending_creates_lock};
8790 if (pending_creates_from_mon == 0) {
8791 last_pg_create_epoch = m->epoch;
8792 }
8793 }
8794 maybe_update_heartbeat_peers();
8795 }
8796
8797
8798 // ----------------------------------------
8799 // peering and recovery
8800
8801 PG::RecoveryCtx OSD::create_context()
8802 {
8803 ObjectStore::Transaction *t = new ObjectStore::Transaction;
8804 C_Contexts *on_applied = new C_Contexts(cct);
8805 C_Contexts *on_safe = new C_Contexts(cct);
8806 map<int, map<spg_t,pg_query_t> > *query_map =
8807 new map<int, map<spg_t, pg_query_t> >;
8808 map<int,vector<pair<pg_notify_t, PastIntervals> > > *notify_list =
8809 new map<int, vector<pair<pg_notify_t, PastIntervals> > >;
8810 map<int,vector<pair<pg_notify_t, PastIntervals> > > *info_map =
8811 new map<int,vector<pair<pg_notify_t, PastIntervals> > >;
8812 PG::RecoveryCtx rctx(query_map, info_map, notify_list,
8813 on_applied, on_safe, t);
8814 return rctx;
8815 }
8816
8817 struct C_OpenPGs : public Context {
8818 set<PGRef> pgs;
8819 ObjectStore *store;
8820 OSD *osd;
8821 C_OpenPGs(set<PGRef>& p, ObjectStore *s, OSD* o) : store(s), osd(o) {
8822 pgs.swap(p);
8823 }
8824 void finish(int r) override {
8825 RWLock::RLocker l(osd->pg_map_lock);
8826 for (auto p : pgs) {
8827 if (osd->pg_map.count(p->info.pgid)) {
8828 p->ch = store->open_collection(p->coll);
8829 assert(p->ch);
8830 }
8831 }
8832 }
8833 };
8834
8835 void OSD::dispatch_context_transaction(PG::RecoveryCtx &ctx, PG *pg,
8836 ThreadPool::TPHandle *handle)
8837 {
8838 if (!ctx.transaction->empty()) {
8839 if (!ctx.created_pgs.empty()) {
8840 ctx.on_applied->add(new C_OpenPGs(ctx.created_pgs, store, this));
8841 }
8842 int tr = store->queue_transaction(
8843 pg->osr.get(),
8844 std::move(*ctx.transaction), ctx.on_applied, ctx.on_safe, NULL,
8845 TrackedOpRef(), handle);
8846 delete (ctx.transaction);
8847 assert(tr == 0);
8848 ctx.transaction = new ObjectStore::Transaction;
8849 ctx.on_applied = new C_Contexts(cct);
8850 ctx.on_safe = new C_Contexts(cct);
8851 }
8852 }
8853
8854 void OSD::dispatch_context(PG::RecoveryCtx &ctx, PG *pg, OSDMapRef curmap,
8855 ThreadPool::TPHandle *handle)
8856 {
8857 if (service.get_osdmap()->is_up(whoami) &&
8858 is_active()) {
8859 do_notifies(*ctx.notify_list, curmap);
8860 do_queries(*ctx.query_map, curmap);
8861 do_infos(*ctx.info_map, curmap);
8862 }
8863 delete ctx.notify_list;
8864 delete ctx.query_map;
8865 delete ctx.info_map;
8866 if ((ctx.on_applied->empty() &&
8867 ctx.on_safe->empty() &&
8868 ctx.transaction->empty() &&
8869 ctx.created_pgs.empty()) || !pg) {
8870 delete ctx.transaction;
8871 delete ctx.on_applied;
8872 delete ctx.on_safe;
8873 assert(ctx.created_pgs.empty());
8874 } else {
8875 if (!ctx.created_pgs.empty()) {
8876 ctx.on_applied->add(new C_OpenPGs(ctx.created_pgs, store, this));
8877 }
8878 int tr = store->queue_transaction(
8879 pg->osr.get(),
8880 std::move(*ctx.transaction), ctx.on_applied, ctx.on_safe, NULL, TrackedOpRef(),
8881 handle);
8882 delete (ctx.transaction);
8883 assert(tr == 0);
8884 }
8885 }
8886
8887 /** do_notifies
8888 * Send an MOSDPGNotify to a primary, with a list of PGs that I have
8889 * content for, and they are primary for.
8890 */
8891
8892 void OSD::do_notifies(
8893 map<int,vector<pair<pg_notify_t,PastIntervals> > >& notify_list,
8894 OSDMapRef curmap)
8895 {
8896 for (map<int,
8897 vector<pair<pg_notify_t,PastIntervals> > >::iterator it =
8898 notify_list.begin();
8899 it != notify_list.end();
8900 ++it) {
8901 if (!curmap->is_up(it->first)) {
8902 dout(20) << __func__ << " skipping down osd." << it->first << dendl;
8903 continue;
8904 }
8905 ConnectionRef con = service.get_con_osd_cluster(
8906 it->first, curmap->get_epoch());
8907 if (!con) {
8908 dout(20) << __func__ << " skipping osd." << it->first
8909 << " (NULL con)" << dendl;
8910 continue;
8911 }
8912 service.share_map_peer(it->first, con.get(), curmap);
8913 dout(7) << __func__ << " osd." << it->first
8914 << " on " << it->second.size() << " PGs" << dendl;
8915 MOSDPGNotify *m = new MOSDPGNotify(curmap->get_epoch(),
8916 it->second);
8917 con->send_message(m);
8918 }
8919 }
8920
8921
8922 /** do_queries
8923 * send out pending queries for info | summaries
8924 */
8925 void OSD::do_queries(map<int, map<spg_t,pg_query_t> >& query_map,
8926 OSDMapRef curmap)
8927 {
8928 for (map<int, map<spg_t,pg_query_t> >::iterator pit = query_map.begin();
8929 pit != query_map.end();
8930 ++pit) {
8931 if (!curmap->is_up(pit->first)) {
8932 dout(20) << __func__ << " skipping down osd." << pit->first << dendl;
8933 continue;
8934 }
8935 int who = pit->first;
8936 ConnectionRef con = service.get_con_osd_cluster(who, curmap->get_epoch());
8937 if (!con) {
8938 dout(20) << __func__ << " skipping osd." << who
8939 << " (NULL con)" << dendl;
8940 continue;
8941 }
8942 service.share_map_peer(who, con.get(), curmap);
8943 dout(7) << __func__ << " querying osd." << who
8944 << " on " << pit->second.size() << " PGs" << dendl;
8945 MOSDPGQuery *m = new MOSDPGQuery(curmap->get_epoch(), pit->second);
8946 con->send_message(m);
8947 }
8948 }
8949
8950
8951 void OSD::do_infos(map<int,
8952 vector<pair<pg_notify_t, PastIntervals> > >& info_map,
8953 OSDMapRef curmap)
8954 {
8955 for (map<int,
8956 vector<pair<pg_notify_t, PastIntervals> > >::iterator p =
8957 info_map.begin();
8958 p != info_map.end();
8959 ++p) {
8960 if (!curmap->is_up(p->first)) {
8961 dout(20) << __func__ << " skipping down osd." << p->first << dendl;
8962 continue;
8963 }
8964 for (vector<pair<pg_notify_t,PastIntervals> >::iterator i = p->second.begin();
8965 i != p->second.end();
8966 ++i) {
8967 dout(20) << __func__ << " sending info " << i->first.info
8968 << " to shard " << p->first << dendl;
8969 }
8970 ConnectionRef con = service.get_con_osd_cluster(
8971 p->first, curmap->get_epoch());
8972 if (!con) {
8973 dout(20) << __func__ << " skipping osd." << p->first
8974 << " (NULL con)" << dendl;
8975 continue;
8976 }
8977 service.share_map_peer(p->first, con.get(), curmap);
8978 MOSDPGInfo *m = new MOSDPGInfo(curmap->get_epoch());
8979 m->pg_list = p->second;
8980 con->send_message(m);
8981 }
8982 info_map.clear();
8983 }
8984
8985
8986 /** PGNotify
8987 * from non-primary to primary
8988 * includes pg_info_t.
8989 * NOTE: called with opqueue active.
8990 */
8991 void OSD::handle_pg_notify(OpRequestRef op)
8992 {
8993 const MOSDPGNotify *m = static_cast<const MOSDPGNotify*>(op->get_req());
8994 assert(m->get_type() == MSG_OSD_PG_NOTIFY);
8995
8996 dout(7) << "handle_pg_notify from " << m->get_source() << dendl;
8997 int from = m->get_source().num();
8998
8999 if (!require_osd_peer(op->get_req()))
9000 return;
9001
9002 if (!require_same_or_newer_map(op, m->get_epoch(), false))
9003 return;
9004
9005 op->mark_started();
9006
9007 for (auto it = m->get_pg_list().begin();
9008 it != m->get_pg_list().end();
9009 ++it) {
9010 if (it->first.info.pgid.preferred() >= 0) {
9011 dout(20) << "ignoring localized pg " << it->first.info.pgid << dendl;
9012 continue;
9013 }
9014
9015 handle_pg_peering_evt(
9016 spg_t(it->first.info.pgid.pgid, it->first.to),
9017 it->first.info.history, it->second,
9018 it->first.query_epoch,
9019 PG::CephPeeringEvtRef(
9020 new PG::CephPeeringEvt(
9021 it->first.epoch_sent, it->first.query_epoch,
9022 PG::MNotifyRec(pg_shard_t(from, it->first.from), it->first,
9023 op->get_req()->get_connection()->get_features())))
9024 );
9025 }
9026 }
9027
9028 void OSD::handle_pg_log(OpRequestRef op)
9029 {
9030 MOSDPGLog *m = static_cast<MOSDPGLog*>(op->get_nonconst_req());
9031 assert(m->get_type() == MSG_OSD_PG_LOG);
9032 dout(7) << "handle_pg_log " << *m << " from " << m->get_source() << dendl;
9033
9034 if (!require_osd_peer(op->get_req()))
9035 return;
9036
9037 int from = m->get_source().num();
9038 if (!require_same_or_newer_map(op, m->get_epoch(), false))
9039 return;
9040
9041 if (m->info.pgid.preferred() >= 0) {
9042 dout(10) << "ignoring localized pg " << m->info.pgid << dendl;
9043 return;
9044 }
9045
9046 op->mark_started();
9047 handle_pg_peering_evt(
9048 spg_t(m->info.pgid.pgid, m->to),
9049 m->info.history, m->past_intervals, m->get_epoch(),
9050 PG::CephPeeringEvtRef(
9051 new PG::CephPeeringEvt(
9052 m->get_epoch(), m->get_query_epoch(),
9053 PG::MLogRec(pg_shard_t(from, m->from), m)))
9054 );
9055 }
9056
9057 void OSD::handle_pg_info(OpRequestRef op)
9058 {
9059 const MOSDPGInfo *m = static_cast<const MOSDPGInfo *>(op->get_req());
9060 assert(m->get_type() == MSG_OSD_PG_INFO);
9061 dout(7) << "handle_pg_info " << *m << " from " << m->get_source() << dendl;
9062
9063 if (!require_osd_peer(op->get_req()))
9064 return;
9065
9066 int from = m->get_source().num();
9067 if (!require_same_or_newer_map(op, m->get_epoch(), false))
9068 return;
9069
9070 op->mark_started();
9071
9072 for (auto p = m->pg_list.begin();
9073 p != m->pg_list.end();
9074 ++p) {
9075 if (p->first.info.pgid.preferred() >= 0) {
9076 dout(10) << "ignoring localized pg " << p->first.info.pgid << dendl;
9077 continue;
9078 }
9079
9080 handle_pg_peering_evt(
9081 spg_t(p->first.info.pgid.pgid, p->first.to),
9082 p->first.info.history, p->second, p->first.epoch_sent,
9083 PG::CephPeeringEvtRef(
9084 new PG::CephPeeringEvt(
9085 p->first.epoch_sent, p->first.query_epoch,
9086 PG::MInfoRec(
9087 pg_shard_t(
9088 from, p->first.from), p->first.info, p->first.epoch_sent)))
9089 );
9090 }
9091 }
9092
9093 void OSD::handle_pg_trim(OpRequestRef op)
9094 {
9095 const MOSDPGTrim *m = static_cast<const MOSDPGTrim*>(op->get_req());
9096 assert(m->get_type() == MSG_OSD_PG_TRIM);
9097
9098 dout(7) << "handle_pg_trim " << *m << " from " << m->get_source() << dendl;
9099
9100 if (!require_osd_peer(op->get_req()))
9101 return;
9102
9103 int from = m->get_source().num();
9104 if (!require_same_or_newer_map(op, m->epoch, false))
9105 return;
9106
9107 if (m->pgid.preferred() >= 0) {
9108 dout(10) << "ignoring localized pg " << m->pgid << dendl;
9109 return;
9110 }
9111
9112 op->mark_started();
9113
9114 PG *pg = _lookup_lock_pg(m->pgid);
9115 if(!pg) {
9116 dout(10) << " don't have pg " << m->pgid << dendl;
9117 return;
9118 }
9119
9120 if (m->epoch < pg->info.history.same_interval_since) {
9121 dout(10) << *pg << " got old trim to " << m->trim_to << ", ignoring" << dendl;
9122 pg->unlock();
9123 return;
9124 }
9125
9126 if (pg->is_primary()) {
9127 // peer is informing us of their last_complete_ondisk
9128 dout(10) << *pg << " replica osd." << from << " lcod " << m->trim_to << dendl;
9129 pg->peer_last_complete_ondisk[pg_shard_t(from, m->pgid.shard)] =
9130 m->trim_to;
9131 // trim log when the pg is recovered
9132 pg->calc_min_last_complete_ondisk();
9133 } else {
9134 // primary is instructing us to trim
9135 ObjectStore::Transaction t;
9136 pg->pg_log.trim(m->trim_to, pg->info);
9137 pg->dirty_info = true;
9138 pg->write_if_dirty(t);
9139 int tr = store->queue_transaction(pg->osr.get(), std::move(t), NULL);
9140 assert(tr == 0);
9141 }
9142 pg->unlock();
9143 }
9144
9145 void OSD::handle_pg_backfill_reserve(OpRequestRef op)
9146 {
9147 const MBackfillReserve *m = static_cast<const MBackfillReserve*>(op->get_req());
9148 assert(m->get_type() == MSG_OSD_BACKFILL_RESERVE);
9149
9150 if (!require_osd_peer(op->get_req()))
9151 return;
9152 if (!require_same_or_newer_map(op, m->query_epoch, false))
9153 return;
9154
9155 PG::CephPeeringEvtRef evt;
9156 if (m->type == MBackfillReserve::REQUEST) {
9157 evt = PG::CephPeeringEvtRef(
9158 new PG::CephPeeringEvt(
9159 m->query_epoch,
9160 m->query_epoch,
9161 PG::RequestBackfillPrio(m->priority)));
9162 } else if (m->type == MBackfillReserve::GRANT) {
9163 evt = PG::CephPeeringEvtRef(
9164 new PG::CephPeeringEvt(
9165 m->query_epoch,
9166 m->query_epoch,
9167 PG::RemoteBackfillReserved()));
9168 } else if (m->type == MBackfillReserve::REJECT) {
9169 // NOTE: this is replica -> primary "i reject your request"
9170 // and also primary -> replica "cancel my previously-granted request"
9171 evt = PG::CephPeeringEvtRef(
9172 new PG::CephPeeringEvt(
9173 m->query_epoch,
9174 m->query_epoch,
9175 PG::RemoteReservationRejected()));
9176 } else {
9177 ceph_abort();
9178 }
9179
9180 if (service.splitting(m->pgid)) {
9181 peering_wait_for_split[m->pgid].push_back(evt);
9182 return;
9183 }
9184
9185 PG *pg = _lookup_lock_pg(m->pgid);
9186 if (!pg) {
9187 dout(10) << " don't have pg " << m->pgid << dendl;
9188 return;
9189 }
9190
9191 pg->queue_peering_event(evt);
9192 pg->unlock();
9193 }
9194
9195 void OSD::handle_pg_recovery_reserve(OpRequestRef op)
9196 {
9197 const MRecoveryReserve *m = static_cast<const MRecoveryReserve*>(op->get_req());
9198 assert(m->get_type() == MSG_OSD_RECOVERY_RESERVE);
9199
9200 if (!require_osd_peer(op->get_req()))
9201 return;
9202 if (!require_same_or_newer_map(op, m->query_epoch, false))
9203 return;
9204
9205 PG::CephPeeringEvtRef evt;
9206 if (m->type == MRecoveryReserve::REQUEST) {
9207 evt = PG::CephPeeringEvtRef(
9208 new PG::CephPeeringEvt(
9209 m->query_epoch,
9210 m->query_epoch,
9211 PG::RequestRecovery()));
9212 } else if (m->type == MRecoveryReserve::GRANT) {
9213 evt = PG::CephPeeringEvtRef(
9214 new PG::CephPeeringEvt(
9215 m->query_epoch,
9216 m->query_epoch,
9217 PG::RemoteRecoveryReserved()));
9218 } else if (m->type == MRecoveryReserve::RELEASE) {
9219 evt = PG::CephPeeringEvtRef(
9220 new PG::CephPeeringEvt(
9221 m->query_epoch,
9222 m->query_epoch,
9223 PG::RecoveryDone()));
9224 } else {
9225 ceph_abort();
9226 }
9227
9228 if (service.splitting(m->pgid)) {
9229 peering_wait_for_split[m->pgid].push_back(evt);
9230 return;
9231 }
9232
9233 PG *pg = _lookup_lock_pg(m->pgid);
9234 if (!pg) {
9235 dout(10) << " don't have pg " << m->pgid << dendl;
9236 return;
9237 }
9238
9239 pg->queue_peering_event(evt);
9240 pg->unlock();
9241 }
9242
9243 void OSD::handle_force_recovery(Message *m)
9244 {
9245 MOSDForceRecovery *msg = static_cast<MOSDForceRecovery*>(m);
9246 assert(msg->get_type() == MSG_OSD_FORCE_RECOVERY);
9247
9248 vector<PGRef> local_pgs;
9249 local_pgs.reserve(msg->forced_pgs.size());
9250
9251 {
9252 RWLock::RLocker l(pg_map_lock);
9253 for (auto& i : msg->forced_pgs) {
9254 spg_t locpg;
9255 if (osdmap->get_primary_shard(i, &locpg)) {
9256 auto pg_map_entry = pg_map.find(locpg);
9257 if (pg_map_entry != pg_map.end()) {
9258 local_pgs.push_back(pg_map_entry->second);
9259 }
9260 }
9261 }
9262 }
9263
9264 if (local_pgs.size()) {
9265 service.adjust_pg_priorities(local_pgs, msg->options);
9266 }
9267
9268 msg->put();
9269 }
9270
9271 /** PGQuery
9272 * from primary to replica | stray
9273 * NOTE: called with opqueue active.
9274 */
9275 void OSD::handle_pg_query(OpRequestRef op)
9276 {
9277 assert(osd_lock.is_locked());
9278
9279 const MOSDPGQuery *m = static_cast<const MOSDPGQuery*>(op->get_req());
9280 assert(m->get_type() == MSG_OSD_PG_QUERY);
9281
9282 if (!require_osd_peer(op->get_req()))
9283 return;
9284
9285 dout(7) << "handle_pg_query from " << m->get_source() << " epoch " << m->get_epoch() << dendl;
9286 int from = m->get_source().num();
9287
9288 if (!require_same_or_newer_map(op, m->get_epoch(), false))
9289 return;
9290
9291 op->mark_started();
9292
9293 map< int, vector<pair<pg_notify_t, PastIntervals> > > notify_list;
9294
9295 for (auto it = m->pg_list.begin();
9296 it != m->pg_list.end();
9297 ++it) {
9298 spg_t pgid = it->first;
9299
9300 if (pgid.preferred() >= 0) {
9301 dout(10) << "ignoring localized pg " << pgid << dendl;
9302 continue;
9303 }
9304
9305 if (service.splitting(pgid)) {
9306 peering_wait_for_split[pgid].push_back(
9307 PG::CephPeeringEvtRef(
9308 new PG::CephPeeringEvt(
9309 it->second.epoch_sent, it->second.epoch_sent,
9310 PG::MQuery(pg_shard_t(from, it->second.from),
9311 it->second, it->second.epoch_sent))));
9312 continue;
9313 }
9314
9315 {
9316 RWLock::RLocker l(pg_map_lock);
9317 if (pg_map.count(pgid)) {
9318 PG *pg = 0;
9319 pg = _lookup_lock_pg_with_map_lock_held(pgid);
9320 pg->queue_query(
9321 it->second.epoch_sent, it->second.epoch_sent,
9322 pg_shard_t(from, it->second.from), it->second);
9323 pg->unlock();
9324 continue;
9325 }
9326 }
9327
9328 if (!osdmap->have_pg_pool(pgid.pool()))
9329 continue;
9330
9331 // get active crush mapping
9332 int up_primary, acting_primary;
9333 vector<int> up, acting;
9334 osdmap->pg_to_up_acting_osds(
9335 pgid.pgid, &up, &up_primary, &acting, &acting_primary);
9336
9337 // same primary?
9338 pg_history_t history = it->second.history;
9339 bool valid_history = project_pg_history(
9340 pgid, history, it->second.epoch_sent,
9341 up, up_primary, acting, acting_primary);
9342
9343 if (!valid_history ||
9344 it->second.epoch_sent < history.same_interval_since) {
9345 dout(10) << " pg " << pgid << " dne, and pg has changed in "
9346 << history.same_interval_since
9347 << " (msg from " << it->second.epoch_sent << ")" << dendl;
9348 continue;
9349 }
9350
9351 dout(10) << " pg " << pgid << " dne" << dendl;
9352 pg_info_t empty(spg_t(pgid.pgid, it->second.to));
9353 /* This is racy, but that should be ok: if we complete the deletion
9354 * before the pg is recreated, we'll just start it off backfilling
9355 * instead of just empty */
9356 if (service.deleting_pgs.lookup(pgid))
9357 empty.set_last_backfill(hobject_t());
9358 if (it->second.type == pg_query_t::LOG ||
9359 it->second.type == pg_query_t::FULLLOG) {
9360 ConnectionRef con = service.get_con_osd_cluster(from, osdmap->get_epoch());
9361 if (con) {
9362 MOSDPGLog *mlog = new MOSDPGLog(
9363 it->second.from, it->second.to,
9364 osdmap->get_epoch(), empty,
9365 it->second.epoch_sent);
9366 service.share_map_peer(from, con.get(), osdmap);
9367 con->send_message(mlog);
9368 }
9369 } else {
9370 notify_list[from].push_back(
9371 make_pair(
9372 pg_notify_t(
9373 it->second.from, it->second.to,
9374 it->second.epoch_sent,
9375 osdmap->get_epoch(),
9376 empty),
9377 PastIntervals(
9378 osdmap->get_pools().at(pgid.pool()).ec_pool(),
9379 *osdmap)));
9380 }
9381 }
9382 do_notifies(notify_list, osdmap);
9383 }
9384
9385
9386 void OSD::handle_pg_remove(OpRequestRef op)
9387 {
9388 const MOSDPGRemove *m = static_cast<const MOSDPGRemove *>(op->get_req());
9389 assert(m->get_type() == MSG_OSD_PG_REMOVE);
9390 assert(osd_lock.is_locked());
9391
9392 if (!require_osd_peer(op->get_req()))
9393 return;
9394
9395 dout(7) << "handle_pg_remove from " << m->get_source() << " on "
9396 << m->pg_list.size() << " pgs" << dendl;
9397
9398 if (!require_same_or_newer_map(op, m->get_epoch(), false))
9399 return;
9400
9401 op->mark_started();
9402
9403 for (auto it = m->pg_list.begin();
9404 it != m->pg_list.end();
9405 ++it) {
9406 spg_t pgid = *it;
9407 if (pgid.preferred() >= 0) {
9408 dout(10) << "ignoring localized pg " << pgid << dendl;
9409 continue;
9410 }
9411
9412 RWLock::WLocker l(pg_map_lock);
9413 if (pg_map.count(pgid) == 0) {
9414 dout(10) << " don't have pg " << pgid << dendl;
9415 continue;
9416 }
9417 dout(5) << "queue_pg_for_deletion: " << pgid << dendl;
9418 PG *pg = _lookup_lock_pg_with_map_lock_held(pgid);
9419 pg_history_t history = pg->info.history;
9420 int up_primary, acting_primary;
9421 vector<int> up, acting;
9422 osdmap->pg_to_up_acting_osds(
9423 pgid.pgid, &up, &up_primary, &acting, &acting_primary);
9424 bool valid_history = project_pg_history(
9425 pg->info.pgid, history, pg->get_osdmap()->get_epoch(),
9426 up, up_primary, acting, acting_primary);
9427 if (valid_history &&
9428 history.same_interval_since <= m->get_epoch()) {
9429 assert(pg->get_primary().osd == m->get_source().num());
9430 PGRef _pg(pg);
9431 _remove_pg(pg);
9432 pg->unlock();
9433 } else {
9434 dout(10) << *pg << " ignoring remove request, pg changed in epoch "
9435 << history.same_interval_since
9436 << " > " << m->get_epoch() << dendl;
9437 pg->unlock();
9438 }
9439 }
9440 }
9441
9442 void OSD::_remove_pg(PG *pg)
9443 {
9444 ObjectStore::Transaction rmt ;
9445
9446 // on_removal, which calls remove_watchers_and_notifies, and the erasure from
9447 // the pg_map must be done together without unlocking the pg lock,
9448 // to avoid racing with watcher cleanup in ms_handle_reset
9449 // and handle_notify_timeout
9450 pg->on_removal(&rmt);
9451
9452 service.cancel_pending_splits_for_parent(pg->info.pgid);
9453 int tr = store->queue_transaction(
9454 pg->osr.get(), std::move(rmt), NULL,
9455 new ContainerContext<
9456 SequencerRef>(pg->osr));
9457 assert(tr == 0);
9458
9459 DeletingStateRef deleting = service.deleting_pgs.lookup_or_create(
9460 pg->info.pgid,
9461 make_pair(
9462 pg->info.pgid,
9463 PGRef(pg))
9464 );
9465 remove_wq.queue(make_pair(PGRef(pg), deleting));
9466
9467 service.pg_remove_epoch(pg->info.pgid);
9468
9469 // dereference from op_wq
9470 op_shardedwq.clear_pg_pointer(pg->info.pgid);
9471
9472 // remove from map
9473 pg_map.erase(pg->info.pgid);
9474 pg->put("PGMap"); // since we've taken it out of map
9475 }
9476
9477 // =========================================================
9478 // RECOVERY
9479
9480 void OSDService::_maybe_queue_recovery() {
9481 assert(recovery_lock.is_locked_by_me());
9482 uint64_t available_pushes;
9483 while (!awaiting_throttle.empty() &&
9484 _recover_now(&available_pushes)) {
9485 uint64_t to_start = MIN(
9486 available_pushes,
9487 cct->_conf->osd_recovery_max_single_start);
9488 _queue_for_recovery(awaiting_throttle.front(), to_start);
9489 awaiting_throttle.pop_front();
9490 recovery_ops_reserved += to_start;
9491 }
9492 }
9493
9494 bool OSDService::_recover_now(uint64_t *available_pushes)
9495 {
9496 if (available_pushes)
9497 *available_pushes = 0;
9498
9499 if (ceph_clock_now() < defer_recovery_until) {
9500 dout(15) << __func__ << " defer until " << defer_recovery_until << dendl;
9501 return false;
9502 }
9503
9504 if (recovery_paused) {
9505 dout(15) << __func__ << " paused" << dendl;
9506 return false;
9507 }
9508
9509 uint64_t max = cct->_conf->osd_recovery_max_active;
9510 if (max <= recovery_ops_active + recovery_ops_reserved) {
9511 dout(15) << __func__ << " active " << recovery_ops_active
9512 << " + reserved " << recovery_ops_reserved
9513 << " >= max " << max << dendl;
9514 return false;
9515 }
9516
9517 if (available_pushes)
9518 *available_pushes = max - recovery_ops_active - recovery_ops_reserved;
9519
9520 return true;
9521 }
9522
9523
9524 void OSDService::adjust_pg_priorities(const vector<PGRef>& pgs, int newflags)
9525 {
9526 if (!pgs.size() || !(newflags & (OFR_BACKFILL | OFR_RECOVERY)))
9527 return;
9528 int newstate = 0;
9529
9530 if (newflags & OFR_BACKFILL) {
9531 newstate = PG_STATE_FORCED_BACKFILL;
9532 } else if (newflags & OFR_RECOVERY) {
9533 newstate = PG_STATE_FORCED_RECOVERY;
9534 }
9535
9536 // debug output here may get large, don't generate it if debug level is below
9537 // 10 and use abbreviated pg ids otherwise
9538 if ((cct)->_conf->subsys.should_gather(ceph_subsys_osd, 10)) {
9539 stringstream ss;
9540
9541 for (auto& i : pgs) {
9542 ss << i->get_pgid() << " ";
9543 }
9544
9545 dout(10) << __func__ << " working on " << ss.str() << dendl;
9546 }
9547
9548 if (newflags & OFR_CANCEL) {
9549 for (auto& i : pgs) {
9550 i->lock();
9551 i->_change_recovery_force_mode(newstate, true);
9552 i->unlock();
9553 }
9554 } else {
9555 for (auto& i : pgs) {
9556 // make sure the PG is in correct state before forcing backfill or recovery, or
9557 // else we'll make PG keeping FORCE_* flag forever, requiring osds restart
9558 // or forcing somehow recovery/backfill.
9559 i->lock();
9560 int pgstate = i->get_state();
9561 if ( ((newstate == PG_STATE_FORCED_RECOVERY) && (pgstate & (PG_STATE_DEGRADED | PG_STATE_RECOVERY_WAIT | PG_STATE_RECOVERING))) ||
9562 ((newstate == PG_STATE_FORCED_BACKFILL) && (pgstate & (PG_STATE_DEGRADED | PG_STATE_BACKFILL_WAIT | PG_STATE_BACKFILLING))) )
9563 i->_change_recovery_force_mode(newstate, false);
9564 i->unlock();
9565 }
9566 }
9567 }
9568
9569 void OSD::do_recovery(
9570 PG *pg, epoch_t queued, uint64_t reserved_pushes,
9571 ThreadPool::TPHandle &handle)
9572 {
9573 uint64_t started = 0;
9574
9575 /*
9576 * When the value of osd_recovery_sleep is set greater than zero, recovery
9577 * ops are scheduled after osd_recovery_sleep amount of time from the previous
9578 * recovery event's schedule time. This is done by adding a
9579 * recovery_requeue_callback event, which re-queues the recovery op using
9580 * queue_recovery_after_sleep.
9581 */
9582 float recovery_sleep = get_osd_recovery_sleep();
9583 {
9584 Mutex::Locker l(service.recovery_sleep_lock);
9585 if (recovery_sleep > 0 && service.recovery_needs_sleep) {
9586 PGRef pgref(pg);
9587 auto recovery_requeue_callback = new FunctionContext([this, pgref, queued, reserved_pushes](int r) {
9588 dout(20) << "do_recovery wake up at "
9589 << ceph_clock_now()
9590 << ", re-queuing recovery" << dendl;
9591 Mutex::Locker l(service.recovery_sleep_lock);
9592 service.recovery_needs_sleep = false;
9593 service.queue_recovery_after_sleep(pgref.get(), queued, reserved_pushes);
9594 });
9595
9596 // This is true for the first recovery op and when the previous recovery op
9597 // has been scheduled in the past. The next recovery op is scheduled after
9598 // completing the sleep from now.
9599 if (service.recovery_schedule_time < ceph_clock_now()) {
9600 service.recovery_schedule_time = ceph_clock_now();
9601 }
9602 service.recovery_schedule_time += recovery_sleep;
9603 service.recovery_sleep_timer.add_event_at(service.recovery_schedule_time,
9604 recovery_requeue_callback);
9605 dout(20) << "Recovery event scheduled at "
9606 << service.recovery_schedule_time << dendl;
9607 return;
9608 }
9609 }
9610
9611 {
9612 {
9613 Mutex::Locker l(service.recovery_sleep_lock);
9614 service.recovery_needs_sleep = true;
9615 }
9616
9617 if (pg->pg_has_reset_since(queued)) {
9618 goto out;
9619 }
9620
9621 assert(!pg->deleting);
9622 assert(pg->is_peered() && pg->is_primary());
9623
9624 assert(pg->recovery_queued);
9625 pg->recovery_queued = false;
9626
9627 dout(10) << "do_recovery starting " << reserved_pushes << " " << *pg << dendl;
9628 #ifdef DEBUG_RECOVERY_OIDS
9629 dout(20) << " active was " << service.recovery_oids[pg->info.pgid] << dendl;
9630 #endif
9631
9632 bool more = pg->start_recovery_ops(reserved_pushes, handle, &started);
9633 dout(10) << "do_recovery started " << started << "/" << reserved_pushes
9634 << " on " << *pg << dendl;
9635
9636 // If no recovery op is started, don't bother to manipulate the RecoveryCtx
9637 if (!started && (more || !pg->have_unfound())) {
9638 goto out;
9639 }
9640
9641 PG::RecoveryCtx rctx = create_context();
9642 rctx.handle = &handle;
9643
9644 /*
9645 * if we couldn't start any recovery ops and things are still
9646 * unfound, see if we can discover more missing object locations.
9647 * It may be that our initial locations were bad and we errored
9648 * out while trying to pull.
9649 */
9650 if (!more && pg->have_unfound()) {
9651 pg->discover_all_missing(*rctx.query_map);
9652 if (rctx.query_map->empty()) {
9653 string action;
9654 if (pg->state_test(PG_STATE_BACKFILLING)) {
9655 auto evt = PG::CephPeeringEvtRef(new PG::CephPeeringEvt(
9656 queued,
9657 queued,
9658 PG::DeferBackfill(cct->_conf->osd_recovery_retry_interval)));
9659 pg->queue_peering_event(evt);
9660 action = "in backfill";
9661 } else if (pg->state_test(PG_STATE_RECOVERING)) {
9662 auto evt = PG::CephPeeringEvtRef(new PG::CephPeeringEvt(
9663 queued,
9664 queued,
9665 PG::DeferRecovery(cct->_conf->osd_recovery_retry_interval)));
9666 pg->queue_peering_event(evt);
9667 action = "in recovery";
9668 } else {
9669 action = "already out of recovery/backfill";
9670 }
9671 dout(10) << __func__ << ": no luck, giving up on this pg for now (" << action << ")" << dendl;
9672 } else {
9673 dout(10) << __func__ << ": no luck, giving up on this pg for now (queue_recovery)" << dendl;
9674 pg->queue_recovery();
9675 }
9676 }
9677
9678 pg->write_if_dirty(*rctx.transaction);
9679 OSDMapRef curmap = pg->get_osdmap();
9680 dispatch_context(rctx, pg, curmap);
9681 }
9682
9683 out:
9684 assert(started <= reserved_pushes);
9685 service.release_reserved_pushes(reserved_pushes);
9686 }
9687
9688 void OSDService::start_recovery_op(PG *pg, const hobject_t& soid)
9689 {
9690 Mutex::Locker l(recovery_lock);
9691 dout(10) << "start_recovery_op " << *pg << " " << soid
9692 << " (" << recovery_ops_active << "/"
9693 << cct->_conf->osd_recovery_max_active << " rops)"
9694 << dendl;
9695 recovery_ops_active++;
9696
9697 #ifdef DEBUG_RECOVERY_OIDS
9698 dout(20) << " active was " << recovery_oids[pg->info.pgid] << dendl;
9699 assert(recovery_oids[pg->info.pgid].count(soid) == 0);
9700 recovery_oids[pg->info.pgid].insert(soid);
9701 #endif
9702 }
9703
9704 void OSDService::finish_recovery_op(PG *pg, const hobject_t& soid, bool dequeue)
9705 {
9706 Mutex::Locker l(recovery_lock);
9707 dout(10) << "finish_recovery_op " << *pg << " " << soid
9708 << " dequeue=" << dequeue
9709 << " (" << recovery_ops_active << "/" << cct->_conf->osd_recovery_max_active << " rops)"
9710 << dendl;
9711
9712 // adjust count
9713 assert(recovery_ops_active > 0);
9714 recovery_ops_active--;
9715
9716 #ifdef DEBUG_RECOVERY_OIDS
9717 dout(20) << " active oids was " << recovery_oids[pg->info.pgid] << dendl;
9718 assert(recovery_oids[pg->info.pgid].count(soid));
9719 recovery_oids[pg->info.pgid].erase(soid);
9720 #endif
9721
9722 _maybe_queue_recovery();
9723 }
9724
9725 bool OSDService::is_recovery_active()
9726 {
9727 return local_reserver.has_reservation() || remote_reserver.has_reservation();
9728 }
9729
9730 // =========================================================
9731 // OPS
9732
9733 bool OSD::op_is_discardable(const MOSDOp *op)
9734 {
9735 // drop client request if they are not connected and can't get the
9736 // reply anyway.
9737 if (!op->get_connection()->is_connected()) {
9738 return true;
9739 }
9740 return false;
9741 }
9742
9743 void OSD::enqueue_op(spg_t pg, OpRequestRef& op, epoch_t epoch)
9744 {
9745 utime_t latency = ceph_clock_now() - op->get_req()->get_recv_stamp();
9746 dout(15) << "enqueue_op " << op << " prio " << op->get_req()->get_priority()
9747 << " cost " << op->get_req()->get_cost()
9748 << " latency " << latency
9749 << " epoch " << epoch
9750 << " " << *(op->get_req()) << dendl;
9751 op->osd_trace.event("enqueue op");
9752 op->osd_trace.keyval("priority", op->get_req()->get_priority());
9753 op->osd_trace.keyval("cost", op->get_req()->get_cost());
9754 op->mark_queued_for_pg();
9755 logger->tinc(l_osd_op_before_queue_op_lat, latency);
9756 op_shardedwq.queue(make_pair(pg, PGQueueable(op, epoch)));
9757 }
9758
9759
9760
9761 /*
9762 * NOTE: dequeue called in worker thread, with pg lock
9763 */
9764 void OSD::dequeue_op(
9765 PGRef pg, OpRequestRef op,
9766 ThreadPool::TPHandle &handle)
9767 {
9768 FUNCTRACE();
9769 OID_EVENT_TRACE_WITH_MSG(op->get_req(), "DEQUEUE_OP_BEGIN", false);
9770
9771 utime_t now = ceph_clock_now();
9772 op->set_dequeued_time(now);
9773 utime_t latency = now - op->get_req()->get_recv_stamp();
9774 dout(10) << "dequeue_op " << op << " prio " << op->get_req()->get_priority()
9775 << " cost " << op->get_req()->get_cost()
9776 << " latency " << latency
9777 << " " << *(op->get_req())
9778 << " pg " << *pg << dendl;
9779
9780 logger->tinc(l_osd_op_before_dequeue_op_lat, latency);
9781
9782 Session *session = static_cast<Session *>(
9783 op->get_req()->get_connection()->get_priv());
9784 if (session) {
9785 maybe_share_map(session, op, pg->get_osdmap());
9786 session->put();
9787 }
9788
9789 if (pg->deleting)
9790 return;
9791
9792 op->mark_reached_pg();
9793 op->osd_trace.event("dequeue_op");
9794
9795 pg->do_request(op, handle);
9796
9797 // finish
9798 dout(10) << "dequeue_op " << op << " finish" << dendl;
9799 OID_EVENT_TRACE_WITH_MSG(op->get_req(), "DEQUEUE_OP_END", false);
9800 }
9801
9802
9803 struct C_CompleteSplits : public Context {
9804 OSD *osd;
9805 set<PGRef> pgs;
9806 C_CompleteSplits(OSD *osd, const set<PGRef> &in)
9807 : osd(osd), pgs(in) {}
9808 void finish(int r) override {
9809 Mutex::Locker l(osd->osd_lock);
9810 if (osd->is_stopping())
9811 return;
9812 PG::RecoveryCtx rctx = osd->create_context();
9813 for (set<PGRef>::iterator i = pgs.begin();
9814 i != pgs.end();
9815 ++i) {
9816 osd->pg_map_lock.get_write();
9817 (*i)->lock();
9818 PG *pg = i->get();
9819 osd->add_newly_split_pg(pg, &rctx);
9820 if (!((*i)->deleting)) {
9821 set<spg_t> to_complete;
9822 to_complete.insert((*i)->info.pgid);
9823 osd->service.complete_split(to_complete);
9824 }
9825 osd->pg_map_lock.put_write();
9826 osd->dispatch_context_transaction(rctx, pg);
9827 osd->wake_pg_waiters(*i);
9828 (*i)->unlock();
9829 }
9830
9831 osd->dispatch_context(rctx, 0, osd->service.get_osdmap());
9832 }
9833 };
9834
9835 void OSD::process_peering_events(
9836 const list<PG*> &pgs,
9837 ThreadPool::TPHandle &handle
9838 )
9839 {
9840 bool need_up_thru = false;
9841 epoch_t same_interval_since = 0;
9842 OSDMapRef curmap;
9843 PG::RecoveryCtx rctx = create_context();
9844 rctx.handle = &handle;
9845 for (list<PG*>::const_iterator i = pgs.begin();
9846 i != pgs.end();
9847 ++i) {
9848 set<PGRef> split_pgs;
9849 PG *pg = *i;
9850 pg->lock_suspend_timeout(handle);
9851 curmap = service.get_osdmap();
9852 if (pg->deleting) {
9853 pg->unlock();
9854 continue;
9855 }
9856 if (!advance_pg(curmap->get_epoch(), pg, handle, &rctx, &split_pgs)) {
9857 // we need to requeue the PG explicitly since we didn't actually
9858 // handle an event
9859 peering_wq.queue(pg);
9860 } else {
9861 assert(!pg->peering_queue.empty());
9862 PG::CephPeeringEvtRef evt = pg->peering_queue.front();
9863 pg->peering_queue.pop_front();
9864 pg->handle_peering_event(evt, &rctx);
9865 }
9866 need_up_thru = pg->need_up_thru || need_up_thru;
9867 same_interval_since = MAX(pg->info.history.same_interval_since,
9868 same_interval_since);
9869 pg->write_if_dirty(*rctx.transaction);
9870 if (!split_pgs.empty()) {
9871 rctx.on_applied->add(new C_CompleteSplits(this, split_pgs));
9872 split_pgs.clear();
9873 }
9874 dispatch_context_transaction(rctx, pg, &handle);
9875 pg->unlock();
9876 }
9877 if (need_up_thru)
9878 queue_want_up_thru(same_interval_since);
9879 dispatch_context(rctx, 0, curmap, &handle);
9880
9881 service.send_pg_temp();
9882 }
9883
9884 // --------------------------------
9885
9886 const char** OSD::get_tracked_conf_keys() const
9887 {
9888 static const char* KEYS[] = {
9889 "osd_max_backfills",
9890 "osd_min_recovery_priority",
9891 "osd_max_trimming_pgs",
9892 "osd_op_complaint_time",
9893 "osd_op_log_threshold",
9894 "osd_op_history_size",
9895 "osd_op_history_duration",
9896 "osd_op_history_slow_op_size",
9897 "osd_op_history_slow_op_threshold",
9898 "osd_enable_op_tracker",
9899 "osd_map_cache_size",
9900 "osd_map_max_advance",
9901 "osd_pg_epoch_persisted_max_stale",
9902 "osd_disk_thread_ioprio_class",
9903 "osd_disk_thread_ioprio_priority",
9904 // clog & admin clog
9905 "clog_to_monitors",
9906 "clog_to_syslog",
9907 "clog_to_syslog_facility",
9908 "clog_to_syslog_level",
9909 "osd_objectstore_fuse",
9910 "clog_to_graylog",
9911 "clog_to_graylog_host",
9912 "clog_to_graylog_port",
9913 "host",
9914 "fsid",
9915 "osd_recovery_delay_start",
9916 "osd_client_message_size_cap",
9917 "osd_client_message_cap",
9918 "osd_heartbeat_min_size",
9919 "osd_heartbeat_interval",
9920 NULL
9921 };
9922 return KEYS;
9923 }
9924
9925 void OSD::handle_conf_change(const struct md_config_t *conf,
9926 const std::set <std::string> &changed)
9927 {
9928 if (changed.count("osd_max_backfills")) {
9929 service.local_reserver.set_max(cct->_conf->osd_max_backfills);
9930 service.remote_reserver.set_max(cct->_conf->osd_max_backfills);
9931 }
9932 if (changed.count("osd_min_recovery_priority")) {
9933 service.local_reserver.set_min_priority(cct->_conf->osd_min_recovery_priority);
9934 service.remote_reserver.set_min_priority(cct->_conf->osd_min_recovery_priority);
9935 }
9936 if (changed.count("osd_max_trimming_pgs")) {
9937 service.snap_reserver.set_max(cct->_conf->osd_max_trimming_pgs);
9938 }
9939 if (changed.count("osd_op_complaint_time") ||
9940 changed.count("osd_op_log_threshold")) {
9941 op_tracker.set_complaint_and_threshold(cct->_conf->osd_op_complaint_time,
9942 cct->_conf->osd_op_log_threshold);
9943 }
9944 if (changed.count("osd_op_history_size") ||
9945 changed.count("osd_op_history_duration")) {
9946 op_tracker.set_history_size_and_duration(cct->_conf->osd_op_history_size,
9947 cct->_conf->osd_op_history_duration);
9948 }
9949 if (changed.count("osd_op_history_slow_op_size") ||
9950 changed.count("osd_op_history_slow_op_threshold")) {
9951 op_tracker.set_history_slow_op_size_and_threshold(cct->_conf->osd_op_history_slow_op_size,
9952 cct->_conf->osd_op_history_slow_op_threshold);
9953 }
9954 if (changed.count("osd_enable_op_tracker")) {
9955 op_tracker.set_tracking(cct->_conf->osd_enable_op_tracker);
9956 }
9957 if (changed.count("osd_disk_thread_ioprio_class") ||
9958 changed.count("osd_disk_thread_ioprio_priority")) {
9959 set_disk_tp_priority();
9960 }
9961 if (changed.count("osd_map_cache_size")) {
9962 service.map_cache.set_size(cct->_conf->osd_map_cache_size);
9963 service.map_bl_cache.set_size(cct->_conf->osd_map_cache_size);
9964 service.map_bl_inc_cache.set_size(cct->_conf->osd_map_cache_size);
9965 }
9966 if (changed.count("clog_to_monitors") ||
9967 changed.count("clog_to_syslog") ||
9968 changed.count("clog_to_syslog_level") ||
9969 changed.count("clog_to_syslog_facility") ||
9970 changed.count("clog_to_graylog") ||
9971 changed.count("clog_to_graylog_host") ||
9972 changed.count("clog_to_graylog_port") ||
9973 changed.count("host") ||
9974 changed.count("fsid")) {
9975 update_log_config();
9976 }
9977
9978 #ifdef HAVE_LIBFUSE
9979 if (changed.count("osd_objectstore_fuse")) {
9980 if (store) {
9981 enable_disable_fuse(false);
9982 }
9983 }
9984 #endif
9985
9986 if (changed.count("osd_recovery_delay_start")) {
9987 service.defer_recovery(cct->_conf->osd_recovery_delay_start);
9988 service.kick_recovery_queue();
9989 }
9990
9991 if (changed.count("osd_client_message_cap")) {
9992 uint64_t newval = cct->_conf->osd_client_message_cap;
9993 Messenger::Policy pol = client_messenger->get_policy(entity_name_t::TYPE_CLIENT);
9994 if (pol.throttler_messages && newval > 0) {
9995 pol.throttler_messages->reset_max(newval);
9996 }
9997 }
9998 if (changed.count("osd_client_message_size_cap")) {
9999 uint64_t newval = cct->_conf->osd_client_message_size_cap;
10000 Messenger::Policy pol = client_messenger->get_policy(entity_name_t::TYPE_CLIENT);
10001 if (pol.throttler_bytes && newval > 0) {
10002 pol.throttler_bytes->reset_max(newval);
10003 }
10004 }
10005
10006 check_config();
10007 }
10008
10009 void OSD::update_log_config()
10010 {
10011 map<string,string> log_to_monitors;
10012 map<string,string> log_to_syslog;
10013 map<string,string> log_channel;
10014 map<string,string> log_prio;
10015 map<string,string> log_to_graylog;
10016 map<string,string> log_to_graylog_host;
10017 map<string,string> log_to_graylog_port;
10018 uuid_d fsid;
10019 string host;
10020
10021 if (parse_log_client_options(cct, log_to_monitors, log_to_syslog,
10022 log_channel, log_prio, log_to_graylog,
10023 log_to_graylog_host, log_to_graylog_port,
10024 fsid, host) == 0)
10025 clog->update_config(log_to_monitors, log_to_syslog,
10026 log_channel, log_prio, log_to_graylog,
10027 log_to_graylog_host, log_to_graylog_port,
10028 fsid, host);
10029 derr << "log_to_monitors " << log_to_monitors << dendl;
10030 }
10031
10032 void OSD::check_config()
10033 {
10034 // some sanity checks
10035 if (cct->_conf->osd_map_cache_size <= cct->_conf->osd_map_max_advance + 2) {
10036 clog->warn() << "osd_map_cache_size (" << cct->_conf->osd_map_cache_size << ")"
10037 << " is not > osd_map_max_advance ("
10038 << cct->_conf->osd_map_max_advance << ")";
10039 }
10040 if (cct->_conf->osd_map_cache_size <= (int)cct->_conf->osd_pg_epoch_persisted_max_stale + 2) {
10041 clog->warn() << "osd_map_cache_size (" << cct->_conf->osd_map_cache_size << ")"
10042 << " is not > osd_pg_epoch_persisted_max_stale ("
10043 << cct->_conf->osd_pg_epoch_persisted_max_stale << ")";
10044 }
10045 }
10046
10047 void OSD::set_disk_tp_priority()
10048 {
10049 dout(10) << __func__
10050 << " class " << cct->_conf->osd_disk_thread_ioprio_class
10051 << " priority " << cct->_conf->osd_disk_thread_ioprio_priority
10052 << dendl;
10053 if (cct->_conf->osd_disk_thread_ioprio_class.empty() ||
10054 cct->_conf->osd_disk_thread_ioprio_priority < 0)
10055 return;
10056 int cls =
10057 ceph_ioprio_string_to_class(cct->_conf->osd_disk_thread_ioprio_class);
10058 if (cls < 0)
10059 derr << __func__ << cpp_strerror(cls) << ": "
10060 << "osd_disk_thread_ioprio_class is " << cct->_conf->osd_disk_thread_ioprio_class
10061 << " but only the following values are allowed: idle, be or rt" << dendl;
10062 else
10063 disk_tp.set_ioprio(cls, cct->_conf->osd_disk_thread_ioprio_priority);
10064 }
10065
10066 // --------------------------------
10067
10068 void OSD::get_latest_osdmap()
10069 {
10070 dout(10) << __func__ << " -- start" << dendl;
10071
10072 C_SaferCond cond;
10073 service.objecter->wait_for_latest_osdmap(&cond);
10074 cond.wait();
10075
10076 dout(10) << __func__ << " -- finish" << dendl;
10077 }
10078
10079 // --------------------------------
10080
10081 int OSD::init_op_flags(OpRequestRef& op)
10082 {
10083 const MOSDOp *m = static_cast<const MOSDOp*>(op->get_req());
10084 vector<OSDOp>::const_iterator iter;
10085
10086 // client flags have no bearing on whether an op is a read, write, etc.
10087 op->rmw_flags = 0;
10088
10089 if (m->has_flag(CEPH_OSD_FLAG_RWORDERED)) {
10090 op->set_force_rwordered();
10091 }
10092
10093 // set bits based on op codes, called methods.
10094 for (iter = m->ops.begin(); iter != m->ops.end(); ++iter) {
10095 if ((iter->op.op == CEPH_OSD_OP_WATCH &&
10096 iter->op.watch.op == CEPH_OSD_WATCH_OP_PING)) {
10097 /* This a bit odd. PING isn't actually a write. It can't
10098 * result in an update to the object_info. PINGs also aren'ty
10099 * resent, so there's no reason to write out a log entry
10100 *
10101 * However, we pipeline them behind writes, so let's force
10102 * the write_ordered flag.
10103 */
10104 op->set_force_rwordered();
10105 } else {
10106 if (ceph_osd_op_mode_modify(iter->op.op))
10107 op->set_write();
10108 }
10109 if (ceph_osd_op_mode_read(iter->op.op))
10110 op->set_read();
10111
10112 // set READ flag if there are src_oids
10113 if (iter->soid.oid.name.length())
10114 op->set_read();
10115
10116 // set PGOP flag if there are PG ops
10117 if (ceph_osd_op_type_pg(iter->op.op))
10118 op->set_pg_op();
10119
10120 if (ceph_osd_op_mode_cache(iter->op.op))
10121 op->set_cache();
10122
10123 // check for ec base pool
10124 int64_t poolid = m->get_pg().pool();
10125 const pg_pool_t *pool = osdmap->get_pg_pool(poolid);
10126 if (pool && pool->is_tier()) {
10127 const pg_pool_t *base_pool = osdmap->get_pg_pool(pool->tier_of);
10128 if (base_pool && base_pool->require_rollback()) {
10129 if ((iter->op.op != CEPH_OSD_OP_READ) &&
10130 (iter->op.op != CEPH_OSD_OP_CHECKSUM) &&
10131 (iter->op.op != CEPH_OSD_OP_CMPEXT) &&
10132 (iter->op.op != CEPH_OSD_OP_STAT) &&
10133 (iter->op.op != CEPH_OSD_OP_ISDIRTY) &&
10134 (iter->op.op != CEPH_OSD_OP_UNDIRTY) &&
10135 (iter->op.op != CEPH_OSD_OP_GETXATTR) &&
10136 (iter->op.op != CEPH_OSD_OP_GETXATTRS) &&
10137 (iter->op.op != CEPH_OSD_OP_CMPXATTR) &&
10138 (iter->op.op != CEPH_OSD_OP_ASSERT_VER) &&
10139 (iter->op.op != CEPH_OSD_OP_LIST_WATCHERS) &&
10140 (iter->op.op != CEPH_OSD_OP_LIST_SNAPS) &&
10141 (iter->op.op != CEPH_OSD_OP_SETALLOCHINT) &&
10142 (iter->op.op != CEPH_OSD_OP_WRITEFULL) &&
10143 (iter->op.op != CEPH_OSD_OP_ROLLBACK) &&
10144 (iter->op.op != CEPH_OSD_OP_CREATE) &&
10145 (iter->op.op != CEPH_OSD_OP_DELETE) &&
10146 (iter->op.op != CEPH_OSD_OP_SETXATTR) &&
10147 (iter->op.op != CEPH_OSD_OP_RMXATTR) &&
10148 (iter->op.op != CEPH_OSD_OP_STARTSYNC) &&
10149 (iter->op.op != CEPH_OSD_OP_COPY_GET) &&
10150 (iter->op.op != CEPH_OSD_OP_COPY_FROM)) {
10151 op->set_promote();
10152 }
10153 }
10154 }
10155
10156 switch (iter->op.op) {
10157 case CEPH_OSD_OP_CALL:
10158 {
10159 bufferlist::iterator bp = const_cast<bufferlist&>(iter->indata).begin();
10160 int is_write, is_read;
10161 string cname, mname;
10162 bp.copy(iter->op.cls.class_len, cname);
10163 bp.copy(iter->op.cls.method_len, mname);
10164
10165 ClassHandler::ClassData *cls;
10166 int r = class_handler->open_class(cname, &cls);
10167 if (r) {
10168 derr << "class " << cname << " open got " << cpp_strerror(r) << dendl;
10169 if (r == -ENOENT)
10170 r = -EOPNOTSUPP;
10171 else if (r != -EPERM) // propagate permission errors
10172 r = -EIO;
10173 return r;
10174 }
10175 int flags = cls->get_method_flags(mname.c_str());
10176 if (flags < 0) {
10177 if (flags == -ENOENT)
10178 r = -EOPNOTSUPP;
10179 else
10180 r = flags;
10181 return r;
10182 }
10183 is_read = flags & CLS_METHOD_RD;
10184 is_write = flags & CLS_METHOD_WR;
10185 bool is_promote = flags & CLS_METHOD_PROMOTE;
10186
10187 dout(10) << "class " << cname << " method " << mname << " "
10188 << "flags=" << (is_read ? "r" : "")
10189 << (is_write ? "w" : "")
10190 << (is_promote ? "p" : "")
10191 << dendl;
10192 if (is_read)
10193 op->set_class_read();
10194 if (is_write)
10195 op->set_class_write();
10196 if (is_promote)
10197 op->set_promote();
10198 op->add_class(cname, is_read, is_write, cls->whitelisted);
10199 break;
10200 }
10201
10202 case CEPH_OSD_OP_WATCH:
10203 // force the read bit for watch since it is depends on previous
10204 // watch state (and may return early if the watch exists) or, in
10205 // the case of ping, is simply a read op.
10206 op->set_read();
10207 // fall through
10208 case CEPH_OSD_OP_NOTIFY:
10209 case CEPH_OSD_OP_NOTIFY_ACK:
10210 {
10211 op->set_promote();
10212 break;
10213 }
10214
10215 case CEPH_OSD_OP_DELETE:
10216 // if we get a delete with FAILOK we can skip handle cache. without
10217 // FAILOK we still need to promote (or do something smarter) to
10218 // determine whether to return ENOENT or 0.
10219 if (iter == m->ops.begin() &&
10220 iter->op.flags == CEPH_OSD_OP_FLAG_FAILOK) {
10221 op->set_skip_handle_cache();
10222 }
10223 // skip promotion when proxying a delete op
10224 if (m->ops.size() == 1) {
10225 op->set_skip_promote();
10226 }
10227 break;
10228
10229 case CEPH_OSD_OP_CACHE_TRY_FLUSH:
10230 case CEPH_OSD_OP_CACHE_FLUSH:
10231 case CEPH_OSD_OP_CACHE_EVICT:
10232 // If try_flush/flush/evict is the only op, can skip handle cache.
10233 if (m->ops.size() == 1) {
10234 op->set_skip_handle_cache();
10235 }
10236 break;
10237
10238 case CEPH_OSD_OP_READ:
10239 case CEPH_OSD_OP_SYNC_READ:
10240 case CEPH_OSD_OP_SPARSE_READ:
10241 case CEPH_OSD_OP_CHECKSUM:
10242 case CEPH_OSD_OP_WRITEFULL:
10243 if (m->ops.size() == 1 &&
10244 (iter->op.flags & CEPH_OSD_OP_FLAG_FADVISE_NOCACHE ||
10245 iter->op.flags & CEPH_OSD_OP_FLAG_FADVISE_DONTNEED)) {
10246 op->set_skip_promote();
10247 }
10248 break;
10249
10250 // force promotion when pin an object in cache tier
10251 case CEPH_OSD_OP_CACHE_PIN:
10252 op->set_promote();
10253 break;
10254
10255 default:
10256 break;
10257 }
10258 }
10259
10260 if (op->rmw_flags == 0)
10261 return -EINVAL;
10262
10263 return 0;
10264 }
10265
10266 void OSD::PeeringWQ::_dequeue(list<PG*> *out) {
10267 for (list<PG*>::iterator i = peering_queue.begin();
10268 i != peering_queue.end() &&
10269 out->size() < osd->cct->_conf->osd_peering_wq_batch_size;
10270 ) {
10271 if (in_use.count(*i)) {
10272 ++i;
10273 } else {
10274 out->push_back(*i);
10275 peering_queue.erase(i++);
10276 }
10277 }
10278 in_use.insert(out->begin(), out->end());
10279 }
10280
10281
10282 // =============================================================
10283
10284 #undef dout_context
10285 #define dout_context osd->cct
10286 #undef dout_prefix
10287 #define dout_prefix *_dout << "osd." << osd->whoami << " op_wq "
10288
10289 void OSD::ShardedOpWQ::wake_pg_waiters(spg_t pgid)
10290 {
10291 uint32_t shard_index = pgid.hash_to_shard(shard_list.size());
10292 auto sdata = shard_list[shard_index];
10293 bool queued = false;
10294 {
10295 Mutex::Locker l(sdata->sdata_op_ordering_lock);
10296 auto p = sdata->pg_slots.find(pgid);
10297 if (p != sdata->pg_slots.end()) {
10298 dout(20) << __func__ << " " << pgid
10299 << " to_process " << p->second.to_process
10300 << " waiting_for_pg=" << (int)p->second.waiting_for_pg << dendl;
10301 for (auto i = p->second.to_process.rbegin();
10302 i != p->second.to_process.rend();
10303 ++i) {
10304 sdata->_enqueue_front(make_pair(pgid, *i), osd->op_prio_cutoff);
10305 }
10306 p->second.to_process.clear();
10307 p->second.waiting_for_pg = false;
10308 ++p->second.requeue_seq;
10309 queued = true;
10310 }
10311 }
10312 if (queued) {
10313 sdata->sdata_lock.Lock();
10314 sdata->sdata_cond.SignalOne();
10315 sdata->sdata_lock.Unlock();
10316 }
10317 }
10318
10319 void OSD::ShardedOpWQ::prune_pg_waiters(OSDMapRef osdmap, int whoami)
10320 {
10321 unsigned pushes_to_free = 0;
10322 for (auto sdata : shard_list) {
10323 Mutex::Locker l(sdata->sdata_op_ordering_lock);
10324 sdata->waiting_for_pg_osdmap = osdmap;
10325 auto p = sdata->pg_slots.begin();
10326 while (p != sdata->pg_slots.end()) {
10327 ShardData::pg_slot& slot = p->second;
10328 if (!slot.to_process.empty() && slot.num_running == 0) {
10329 if (osdmap->is_up_acting_osd_shard(p->first, whoami)) {
10330 dout(20) << __func__ << " " << p->first << " maps to us, keeping"
10331 << dendl;
10332 ++p;
10333 continue;
10334 }
10335 while (!slot.to_process.empty() &&
10336 slot.to_process.front().get_map_epoch() <= osdmap->get_epoch()) {
10337 auto& qi = slot.to_process.front();
10338 dout(20) << __func__ << " " << p->first
10339 << " item " << qi
10340 << " epoch " << qi.get_map_epoch()
10341 << " <= " << osdmap->get_epoch()
10342 << ", stale, dropping" << dendl;
10343 pushes_to_free += qi.get_reserved_pushes();
10344 slot.to_process.pop_front();
10345 }
10346 }
10347 if (slot.to_process.empty() &&
10348 slot.num_running == 0 &&
10349 !slot.pg) {
10350 dout(20) << __func__ << " " << p->first << " empty, pruning" << dendl;
10351 p = sdata->pg_slots.erase(p);
10352 } else {
10353 ++p;
10354 }
10355 }
10356 }
10357 if (pushes_to_free > 0) {
10358 osd->service.release_reserved_pushes(pushes_to_free);
10359 }
10360 }
10361
10362 void OSD::ShardedOpWQ::clear_pg_pointer(spg_t pgid)
10363 {
10364 uint32_t shard_index = pgid.hash_to_shard(shard_list.size());
10365 auto sdata = shard_list[shard_index];
10366 Mutex::Locker l(sdata->sdata_op_ordering_lock);
10367 auto p = sdata->pg_slots.find(pgid);
10368 if (p != sdata->pg_slots.end()) {
10369 auto& slot = p->second;
10370 dout(20) << __func__ << " " << pgid << " pg " << slot.pg << dendl;
10371 assert(!slot.pg || slot.pg->deleting);
10372 slot.pg = nullptr;
10373 }
10374 }
10375
10376 void OSD::ShardedOpWQ::clear_pg_slots()
10377 {
10378 for (auto sdata : shard_list) {
10379 Mutex::Locker l(sdata->sdata_op_ordering_lock);
10380 sdata->pg_slots.clear();
10381 sdata->waiting_for_pg_osdmap.reset();
10382 // don't bother with reserved pushes; we are shutting down
10383 }
10384 }
10385
10386 #undef dout_prefix
10387 #define dout_prefix *_dout << "osd." << osd->whoami << " op_wq(" << shard_index << ") "
10388
10389 void OSD::ShardedOpWQ::_process(uint32_t thread_index, heartbeat_handle_d *hb)
10390 {
10391 uint32_t shard_index = thread_index % num_shards;
10392 ShardData *sdata = shard_list[shard_index];
10393 assert(NULL != sdata);
10394
10395 // peek at spg_t
10396 sdata->sdata_op_ordering_lock.Lock();
10397 if (sdata->pqueue->empty()) {
10398 dout(20) << __func__ << " empty q, waiting" << dendl;
10399 // optimistically sleep a moment; maybe another work item will come along.
10400 osd->cct->get_heartbeat_map()->reset_timeout(hb,
10401 osd->cct->_conf->threadpool_default_timeout, 0);
10402 sdata->sdata_lock.Lock();
10403 sdata->sdata_op_ordering_lock.Unlock();
10404 sdata->sdata_cond.WaitInterval(sdata->sdata_lock,
10405 utime_t(osd->cct->_conf->threadpool_empty_queue_max_wait, 0));
10406 sdata->sdata_lock.Unlock();
10407 sdata->sdata_op_ordering_lock.Lock();
10408 if (sdata->pqueue->empty()) {
10409 sdata->sdata_op_ordering_lock.Unlock();
10410 return;
10411 }
10412 }
10413 pair<spg_t, PGQueueable> item = sdata->pqueue->dequeue();
10414 if (osd->is_stopping()) {
10415 sdata->sdata_op_ordering_lock.Unlock();
10416 return; // OSD shutdown, discard.
10417 }
10418 PGRef pg;
10419 uint64_t requeue_seq;
10420 {
10421 auto& slot = sdata->pg_slots[item.first];
10422 dout(30) << __func__ << " " << item.first
10423 << " to_process " << slot.to_process
10424 << " waiting_for_pg=" << (int)slot.waiting_for_pg << dendl;
10425 slot.to_process.push_back(item.second);
10426 // note the requeue seq now...
10427 requeue_seq = slot.requeue_seq;
10428 if (slot.waiting_for_pg) {
10429 // save ourselves a bit of effort
10430 dout(20) << __func__ << " " << item.first << " item " << item.second
10431 << " queued, waiting_for_pg" << dendl;
10432 sdata->sdata_op_ordering_lock.Unlock();
10433 return;
10434 }
10435 pg = slot.pg;
10436 dout(20) << __func__ << " " << item.first << " item " << item.second
10437 << " queued" << dendl;
10438 ++slot.num_running;
10439 }
10440 sdata->sdata_op_ordering_lock.Unlock();
10441
10442 osd->service.maybe_inject_dispatch_delay();
10443
10444 // [lookup +] lock pg (if we have it)
10445 if (!pg) {
10446 pg = osd->_lookup_lock_pg(item.first);
10447 } else {
10448 pg->lock();
10449 }
10450
10451 osd->service.maybe_inject_dispatch_delay();
10452
10453 boost::optional<PGQueueable> qi;
10454
10455 // we don't use a Mutex::Locker here because of the
10456 // osd->service.release_reserved_pushes() call below
10457 sdata->sdata_op_ordering_lock.Lock();
10458
10459 auto q = sdata->pg_slots.find(item.first);
10460 assert(q != sdata->pg_slots.end());
10461 auto& slot = q->second;
10462 --slot.num_running;
10463
10464 if (slot.to_process.empty()) {
10465 // raced with wake_pg_waiters or prune_pg_waiters
10466 dout(20) << __func__ << " " << item.first << " nothing queued" << dendl;
10467 if (pg) {
10468 pg->unlock();
10469 }
10470 sdata->sdata_op_ordering_lock.Unlock();
10471 return;
10472 }
10473 if (requeue_seq != slot.requeue_seq) {
10474 dout(20) << __func__ << " " << item.first
10475 << " requeue_seq " << slot.requeue_seq << " > our "
10476 << requeue_seq << ", we raced with wake_pg_waiters"
10477 << dendl;
10478 if (pg) {
10479 pg->unlock();
10480 }
10481 sdata->sdata_op_ordering_lock.Unlock();
10482 return;
10483 }
10484 if (pg && !slot.pg && !pg->deleting) {
10485 dout(20) << __func__ << " " << item.first << " set pg to " << pg << dendl;
10486 slot.pg = pg;
10487 }
10488 dout(30) << __func__ << " " << item.first << " to_process " << slot.to_process
10489 << " waiting_for_pg=" << (int)slot.waiting_for_pg << dendl;
10490
10491 // make sure we're not already waiting for this pg
10492 if (slot.waiting_for_pg) {
10493 dout(20) << __func__ << " " << item.first << " item " << item.second
10494 << " slot is waiting_for_pg" << dendl;
10495 if (pg) {
10496 pg->unlock();
10497 }
10498 sdata->sdata_op_ordering_lock.Unlock();
10499 return;
10500 }
10501
10502 // take next item
10503 qi = slot.to_process.front();
10504 slot.to_process.pop_front();
10505 dout(20) << __func__ << " " << item.first << " item " << *qi
10506 << " pg " << pg << dendl;
10507
10508 if (!pg) {
10509 // should this pg shard exist on this osd in this (or a later) epoch?
10510 OSDMapRef osdmap = sdata->waiting_for_pg_osdmap;
10511 if (osdmap->is_up_acting_osd_shard(item.first, osd->whoami)) {
10512 dout(20) << __func__ << " " << item.first
10513 << " no pg, should exist, will wait" << " on " << *qi << dendl;
10514 slot.to_process.push_front(*qi);
10515 slot.waiting_for_pg = true;
10516 } else if (qi->get_map_epoch() > osdmap->get_epoch()) {
10517 dout(20) << __func__ << " " << item.first << " no pg, item epoch is "
10518 << qi->get_map_epoch() << " > " << osdmap->get_epoch()
10519 << ", will wait on " << *qi << dendl;
10520 slot.to_process.push_front(*qi);
10521 slot.waiting_for_pg = true;
10522 } else {
10523 dout(20) << __func__ << " " << item.first << " no pg, shouldn't exist,"
10524 << " dropping " << *qi << dendl;
10525 // share map with client?
10526 if (boost::optional<OpRequestRef> _op = qi->maybe_get_op()) {
10527 Session *session = static_cast<Session *>(
10528 (*_op)->get_req()->get_connection()->get_priv());
10529 if (session) {
10530 osd->maybe_share_map(session, *_op, sdata->waiting_for_pg_osdmap);
10531 session->put();
10532 }
10533 }
10534 unsigned pushes_to_free = qi->get_reserved_pushes();
10535 if (pushes_to_free > 0) {
10536 sdata->sdata_op_ordering_lock.Unlock();
10537 osd->service.release_reserved_pushes(pushes_to_free);
10538 return;
10539 }
10540 }
10541 sdata->sdata_op_ordering_lock.Unlock();
10542 return;
10543 }
10544 sdata->sdata_op_ordering_lock.Unlock();
10545
10546
10547 // osd_opwq_process marks the point at which an operation has been dequeued
10548 // and will begin to be handled by a worker thread.
10549 {
10550 #ifdef WITH_LTTNG
10551 osd_reqid_t reqid;
10552 if (boost::optional<OpRequestRef> _op = qi->maybe_get_op()) {
10553 reqid = (*_op)->get_reqid();
10554 }
10555 #endif
10556 tracepoint(osd, opwq_process_start, reqid.name._type,
10557 reqid.name._num, reqid.tid, reqid.inc);
10558 }
10559
10560 lgeneric_subdout(osd->cct, osd, 30) << "dequeue status: ";
10561 Formatter *f = Formatter::create("json");
10562 f->open_object_section("q");
10563 dump(f);
10564 f->close_section();
10565 f->flush(*_dout);
10566 delete f;
10567 *_dout << dendl;
10568
10569 ThreadPool::TPHandle tp_handle(osd->cct, hb, timeout_interval,
10570 suicide_interval);
10571 qi->run(osd, pg, tp_handle);
10572
10573 {
10574 #ifdef WITH_LTTNG
10575 osd_reqid_t reqid;
10576 if (boost::optional<OpRequestRef> _op = qi->maybe_get_op()) {
10577 reqid = (*_op)->get_reqid();
10578 }
10579 #endif
10580 tracepoint(osd, opwq_process_finish, reqid.name._type,
10581 reqid.name._num, reqid.tid, reqid.inc);
10582 }
10583
10584 pg->unlock();
10585 }
10586
10587 void OSD::ShardedOpWQ::_enqueue(pair<spg_t, PGQueueable> item) {
10588 uint32_t shard_index =
10589 item.first.hash_to_shard(shard_list.size());
10590
10591 ShardData* sdata = shard_list[shard_index];
10592 assert (NULL != sdata);
10593 unsigned priority = item.second.get_priority();
10594 unsigned cost = item.second.get_cost();
10595 sdata->sdata_op_ordering_lock.Lock();
10596
10597 dout(20) << __func__ << " " << item.first << " " << item.second << dendl;
10598 if (priority >= osd->op_prio_cutoff)
10599 sdata->pqueue->enqueue_strict(
10600 item.second.get_owner(), priority, item);
10601 else
10602 sdata->pqueue->enqueue(
10603 item.second.get_owner(),
10604 priority, cost, item);
10605 sdata->sdata_op_ordering_lock.Unlock();
10606
10607 sdata->sdata_lock.Lock();
10608 sdata->sdata_cond.SignalOne();
10609 sdata->sdata_lock.Unlock();
10610
10611 }
10612
10613 void OSD::ShardedOpWQ::_enqueue_front(pair<spg_t, PGQueueable> item)
10614 {
10615 uint32_t shard_index = item.first.hash_to_shard(shard_list.size());
10616 ShardData* sdata = shard_list[shard_index];
10617 assert (NULL != sdata);
10618 sdata->sdata_op_ordering_lock.Lock();
10619 auto p = sdata->pg_slots.find(item.first);
10620 if (p != sdata->pg_slots.end() && !p->second.to_process.empty()) {
10621 // we may be racing with _process, which has dequeued a new item
10622 // from pqueue, put it on to_process, and is now busy taking the
10623 // pg lock. ensure this old requeued item is ordered before any
10624 // such newer item in to_process.
10625 p->second.to_process.push_front(item.second);
10626 item.second = p->second.to_process.back();
10627 p->second.to_process.pop_back();
10628 dout(20) << __func__ << " " << item.first
10629 << " " << p->second.to_process.front()
10630 << " shuffled w/ " << item.second << dendl;
10631 } else {
10632 dout(20) << __func__ << " " << item.first << " " << item.second << dendl;
10633 }
10634 sdata->_enqueue_front(item, osd->op_prio_cutoff);
10635 sdata->sdata_op_ordering_lock.Unlock();
10636 sdata->sdata_lock.Lock();
10637 sdata->sdata_cond.SignalOne();
10638 sdata->sdata_lock.Unlock();
10639 }
10640
10641 namespace ceph {
10642 namespace osd_cmds {
10643
10644 int heap(CephContext& cct, cmdmap_t& cmdmap, Formatter& f, std::ostream& os)
10645 {
10646 if (!ceph_using_tcmalloc()) {
10647 os << "could not issue heap profiler command -- not using tcmalloc!";
10648 return -EOPNOTSUPP;
10649 }
10650
10651 string cmd;
10652 if (!cmd_getval(&cct, cmdmap, "heapcmd", cmd)) {
10653 os << "unable to get value for command \"" << cmd << "\"";
10654 return -EINVAL;
10655 }
10656
10657 std::vector<std::string> cmd_vec;
10658 get_str_vec(cmd, cmd_vec);
10659
10660 ceph_heap_profiler_handle_command(cmd_vec, os);
10661
10662 return 0;
10663 }
10664
10665 }} // namespace ceph::osd_cmds
10666
10667
10668 std::ostream& operator<<(std::ostream& out, const OSD::io_queue& q) {
10669 switch(q) {
10670 case OSD::io_queue::prioritized:
10671 out << "prioritized";
10672 break;
10673 case OSD::io_queue::weightedpriority:
10674 out << "weightedpriority";
10675 break;
10676 case OSD::io_queue::mclock_opclass:
10677 out << "mclock_opclass";
10678 break;
10679 case OSD::io_queue::mclock_client:
10680 out << "mclock_client";
10681 break;
10682 }
10683 return out;
10684 }