// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
// vim: ts=8 sw=2 smarttab
/*
 * Ceph - scalable distributed file system
 *
 * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
 * Copyright (C) 2017 OVH
 *
 * This is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License version 2.1, as published by the Free Software
 * Foundation. See file COPYING.
 *
 */
#include "acconfig.h"

#include <fstream>
#include <iostream>
#include <errno.h>
#include <sys/stat.h>
#include <signal.h>
#include <ctype.h>
#include <boost/scoped_ptr.hpp>

#ifdef HAVE_SYS_PARAM_H
#include <sys/param.h>
#endif

#ifdef HAVE_SYS_MOUNT_H
#include <sys/mount.h>
#endif

#include "osd/PG.h"

#include "include/types.h"
#include "include/compat.h"

#include "OSD.h"
#include "OSDMap.h"
#include "Watch.h"
#include "osdc/Objecter.h"

#include "common/errno.h"
#include "common/ceph_argparse.h"
#include "common/ceph_time.h"
#include "common/version.h"
#include "common/io_priority.h"
#include "common/pick_address.h"

#include "os/ObjectStore.h"
#ifdef HAVE_LIBFUSE
#include "os/FuseStore.h"
#endif

#include "PrimaryLogPG.h"


#include "msg/Messenger.h"
#include "msg/Message.h"

#include "mon/MonClient.h"

#include "messages/MLog.h"

#include "messages/MGenericMessage.h"
#include "messages/MOSDPing.h"
#include "messages/MOSDFailure.h"
#include "messages/MOSDMarkMeDown.h"
#include "messages/MOSDFull.h"
#include "messages/MOSDOp.h"
#include "messages/MOSDOpReply.h"
#include "messages/MOSDBackoff.h"
#include "messages/MOSDBeacon.h"
#include "messages/MOSDRepOp.h"
#include "messages/MOSDRepOpReply.h"
#include "messages/MOSDBoot.h"
#include "messages/MOSDPGTemp.h"

#include "messages/MOSDMap.h"
#include "messages/MMonGetOSDMap.h"
#include "messages/MOSDPGNotify.h"
#include "messages/MOSDPGQuery.h"
#include "messages/MOSDPGLog.h"
#include "messages/MOSDPGRemove.h"
#include "messages/MOSDPGInfo.h"
#include "messages/MOSDPGCreate.h"
#include "messages/MOSDPGTrim.h"
#include "messages/MOSDPGScan.h"
#include "messages/MOSDPGBackfill.h"
#include "messages/MBackfillReserve.h"
#include "messages/MRecoveryReserve.h"
#include "messages/MOSDForceRecovery.h"
#include "messages/MOSDECSubOpWrite.h"
#include "messages/MOSDECSubOpWriteReply.h"
#include "messages/MOSDECSubOpRead.h"
#include "messages/MOSDECSubOpReadReply.h"
#include "messages/MOSDPGCreated.h"
#include "messages/MOSDPGUpdateLogMissing.h"
#include "messages/MOSDPGUpdateLogMissingReply.h"

#include "messages/MOSDAlive.h"

#include "messages/MOSDScrub.h"
#include "messages/MOSDScrubReserve.h"
#include "messages/MOSDRepScrub.h"

#include "messages/MMonCommand.h"
#include "messages/MCommand.h"
#include "messages/MCommandReply.h"

#include "messages/MPGStats.h"
#include "messages/MPGStatsAck.h"

#include "messages/MWatchNotify.h"
#include "messages/MOSDPGPush.h"
#include "messages/MOSDPGPushReply.h"
#include "messages/MOSDPGPull.h"

#include "common/perf_counters.h"
#include "common/Timer.h"
#include "common/LogClient.h"
#include "common/AsyncReserver.h"
#include "common/HeartbeatMap.h"
#include "common/admin_socket.h"
#include "common/ceph_context.h"

#include "global/signal_handler.h"
#include "global/pidfile.h"

#include "include/color.h"
#include "perfglue/cpu_profiler.h"
#include "perfglue/heap_profiler.h"

#include "osd/OpRequest.h"

#include "auth/AuthAuthorizeHandler.h"
#include "auth/RotatingKeyRing.h"
#include "common/errno.h"

#include "objclass/objclass.h"

#include "common/cmdparse.h"
#include "include/str_list.h"
#include "include/util.h"

#include "include/assert.h"
#include "common/config.h"
#include "common/EventTrace.h"

#ifdef WITH_LTTNG
#define TRACEPOINT_DEFINE
#define TRACEPOINT_PROBE_DYNAMIC_LINKAGE
#include "tracing/osd.h"
#undef TRACEPOINT_PROBE_DYNAMIC_LINKAGE
#undef TRACEPOINT_DEFINE
#else
#define tracepoint(...)
#endif

#define dout_context cct
#define dout_subsys ceph_subsys_osd
#undef dout_prefix
#define dout_prefix _prefix(_dout, whoami, get_osdmap_epoch())


const double OSD::OSD_TICK_INTERVAL = 1.0;

static ostream& _prefix(std::ostream* _dout, int whoami, epoch_t epoch) {
  return *_dout << "osd." << whoami << " " << epoch << " ";
}

// Initial features in a new superblock.
// Features here are also automatically upgraded.
CompatSet OSD::get_osd_initial_compat_set() {
  CompatSet::FeatureSet ceph_osd_feature_compat;
  CompatSet::FeatureSet ceph_osd_feature_ro_compat;
  CompatSet::FeatureSet ceph_osd_feature_incompat;
  ceph_osd_feature_incompat.insert(CEPH_OSD_FEATURE_INCOMPAT_BASE);
  ceph_osd_feature_incompat.insert(CEPH_OSD_FEATURE_INCOMPAT_PGINFO);
  ceph_osd_feature_incompat.insert(CEPH_OSD_FEATURE_INCOMPAT_OLOC);
  ceph_osd_feature_incompat.insert(CEPH_OSD_FEATURE_INCOMPAT_LEC);
  ceph_osd_feature_incompat.insert(CEPH_OSD_FEATURE_INCOMPAT_CATEGORIES);
  ceph_osd_feature_incompat.insert(CEPH_OSD_FEATURE_INCOMPAT_HOBJECTPOOL);
  ceph_osd_feature_incompat.insert(CEPH_OSD_FEATURE_INCOMPAT_BIGINFO);
  ceph_osd_feature_incompat.insert(CEPH_OSD_FEATURE_INCOMPAT_LEVELDBINFO);
  ceph_osd_feature_incompat.insert(CEPH_OSD_FEATURE_INCOMPAT_LEVELDBLOG);
  ceph_osd_feature_incompat.insert(CEPH_OSD_FEATURE_INCOMPAT_SNAPMAPPER);
  ceph_osd_feature_incompat.insert(CEPH_OSD_FEATURE_INCOMPAT_HINTS);
  ceph_osd_feature_incompat.insert(CEPH_OSD_FEATURE_INCOMPAT_PGMETA);
  ceph_osd_feature_incompat.insert(CEPH_OSD_FEATURE_INCOMPAT_MISSING);
  ceph_osd_feature_incompat.insert(CEPH_OSD_FEATURE_INCOMPAT_FASTINFO);
  ceph_osd_feature_incompat.insert(CEPH_OSD_FEATURE_INCOMPAT_RECOVERY_DELETES);
  return CompatSet(ceph_osd_feature_compat, ceph_osd_feature_ro_compat,
                   ceph_osd_feature_incompat);
}

// Features that this OSD supports are added here.
CompatSet OSD::get_osd_compat_set() {
  CompatSet compat = get_osd_initial_compat_set();
  // Any features here can be set in code, but not in the initial superblock
  compat.incompat.insert(CEPH_OSD_FEATURE_INCOMPAT_SHARDS);
  return compat;
}

OSDService::OSDService(OSD *osd) :
  osd(osd),
  cct(osd->cct),
  meta_osr(new ObjectStore::Sequencer("meta")),
  whoami(osd->whoami), store(osd->store),
  log_client(osd->log_client), clog(osd->clog),
  pg_recovery_stats(osd->pg_recovery_stats),
  cluster_messenger(osd->cluster_messenger),
  client_messenger(osd->client_messenger),
  logger(osd->logger),
  recoverystate_perf(osd->recoverystate_perf),
  monc(osd->monc),
  peering_wq(osd->peering_wq),
  recovery_gen_wq("recovery_gen_wq", cct->_conf->osd_recovery_thread_timeout,
                  &osd->disk_tp),
  class_handler(osd->class_handler),
  pg_epoch_lock("OSDService::pg_epoch_lock"),
  publish_lock("OSDService::publish_lock"),
  pre_publish_lock("OSDService::pre_publish_lock"),
  max_oldest_map(0),
  peer_map_epoch_lock("OSDService::peer_map_epoch_lock"),
  sched_scrub_lock("OSDService::sched_scrub_lock"), scrubs_pending(0),
  scrubs_active(0),
  agent_lock("OSDService::agent_lock"),
  agent_valid_iterator(false),
  agent_ops(0),
  flush_mode_high_count(0),
  agent_active(true),
  agent_thread(this),
  agent_stop_flag(false),
  agent_timer_lock("OSDService::agent_timer_lock"),
  agent_timer(osd->client_messenger->cct, agent_timer_lock),
  last_recalibrate(ceph_clock_now()),
  promote_max_objects(0),
  promote_max_bytes(0),
  objecter(new Objecter(osd->client_messenger->cct, osd->objecter_messenger, osd->monc, NULL, 0, 0)),
  objecter_finisher(osd->client_messenger->cct),
  watch_lock("OSDService::watch_lock"),
  watch_timer(osd->client_messenger->cct, watch_lock),
  next_notif_id(0),
  recovery_request_lock("OSDService::recovery_request_lock"),
  recovery_request_timer(cct, recovery_request_lock, false),
  recovery_sleep_lock("OSDService::recovery_sleep_lock"),
  recovery_sleep_timer(cct, recovery_sleep_lock, false),
  reserver_finisher(cct),
  local_reserver(cct, &reserver_finisher, cct->_conf->osd_max_backfills,
                 cct->_conf->osd_min_recovery_priority),
  remote_reserver(cct, &reserver_finisher, cct->_conf->osd_max_backfills,
                  cct->_conf->osd_min_recovery_priority),
  pg_temp_lock("OSDService::pg_temp_lock"),
  snap_sleep_lock("OSDService::snap_sleep_lock"),
  snap_sleep_timer(
    osd->client_messenger->cct, snap_sleep_lock, false /* relax locking */),
  scrub_sleep_lock("OSDService::scrub_sleep_lock"),
  scrub_sleep_timer(
    osd->client_messenger->cct, scrub_sleep_lock, false /* relax locking */),
  snap_reserver(cct, &reserver_finisher,
                cct->_conf->osd_max_trimming_pgs),
  recovery_lock("OSDService::recovery_lock"),
  recovery_ops_active(0),
  recovery_ops_reserved(0),
  recovery_paused(false),
  map_cache_lock("OSDService::map_cache_lock"),
  map_cache(cct, cct->_conf->osd_map_cache_size),
  map_bl_cache(cct->_conf->osd_map_cache_size),
  map_bl_inc_cache(cct->_conf->osd_map_cache_size),
  in_progress_split_lock("OSDService::in_progress_split_lock"),
  stat_lock("OSDService::stat_lock"),
  full_status_lock("OSDService::full_status_lock"),
  cur_state(NONE),
  cur_ratio(0),
  epoch_lock("OSDService::epoch_lock"),
  boot_epoch(0), up_epoch(0), bind_epoch(0),
  is_stopping_lock("OSDService::is_stopping_lock")
#ifdef PG_DEBUG_REFS
  , pgid_lock("OSDService::pgid_lock")
#endif
{
  objecter->init();
}

OSDService::~OSDService()
{
  delete objecter;
}



#ifdef PG_DEBUG_REFS
void OSDService::add_pgid(spg_t pgid, PG *pg){
  Mutex::Locker l(pgid_lock);
  if (!pgid_tracker.count(pgid)) {
    live_pgs[pgid] = pg;
  }
  pgid_tracker[pgid]++;
}
void OSDService::remove_pgid(spg_t pgid, PG *pg)
{
  Mutex::Locker l(pgid_lock);
  assert(pgid_tracker.count(pgid));
  assert(pgid_tracker[pgid] > 0);
  pgid_tracker[pgid]--;
  if (pgid_tracker[pgid] == 0) {
    pgid_tracker.erase(pgid);
    live_pgs.erase(pgid);
  }
}
void OSDService::dump_live_pgids()
{
  Mutex::Locker l(pgid_lock);
  derr << "live pgids:" << dendl;
  for (map<spg_t, int>::const_iterator i = pgid_tracker.cbegin();
       i != pgid_tracker.cend();
       ++i) {
    derr << "\t" << *i << dendl;
    live_pgs[i->first]->dump_live_ids();
  }
}
#endif

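// Split tracking. A child PG passes through two states before it fully
// exists: pending_splits maps child -> parent for splits we know are
// coming, rev_pending_splits is the reverse index (parent -> children)
// so a parent's pending children can be found cheaply, and
// in_progress_splits holds children whose split has actually begun.
// Following the usual underscore convention here, _start_split and the
// other underscore-prefixed helpers appear to assume the caller already
// holds in_progress_split_lock; the public wrappers below take it.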
void OSDService::_start_split(spg_t parent, const set<spg_t> &children)
{
  for (set<spg_t>::const_iterator i = children.begin();
       i != children.end();
       ++i) {
    dout(10) << __func__ << ": Starting split on pg " << *i
             << ", parent=" << parent << dendl;
    assert(!pending_splits.count(*i));
    assert(!in_progress_splits.count(*i));
    pending_splits.insert(make_pair(*i, parent));

    assert(!rev_pending_splits[parent].count(*i));
    rev_pending_splits[parent].insert(*i);
  }
}

void OSDService::mark_split_in_progress(spg_t parent, const set<spg_t> &children)
{
  Mutex::Locker l(in_progress_split_lock);
  map<spg_t, set<spg_t> >::iterator piter = rev_pending_splits.find(parent);
  assert(piter != rev_pending_splits.end());
  for (set<spg_t>::const_iterator i = children.begin();
       i != children.end();
       ++i) {
    assert(piter->second.count(*i));
    assert(pending_splits.count(*i));
    assert(!in_progress_splits.count(*i));
    assert(pending_splits[*i] == parent);

    pending_splits.erase(*i);
    piter->second.erase(*i);
    in_progress_splits.insert(*i);
  }
  if (piter->second.empty())
    rev_pending_splits.erase(piter);
}

void OSDService::cancel_pending_splits_for_parent(spg_t parent)
{
  Mutex::Locker l(in_progress_split_lock);
  _cancel_pending_splits_for_parent(parent);
}

void OSDService::_cancel_pending_splits_for_parent(spg_t parent)
{
  map<spg_t, set<spg_t> >::iterator piter = rev_pending_splits.find(parent);
  if (piter == rev_pending_splits.end())
    return;

  for (set<spg_t>::iterator i = piter->second.begin();
       i != piter->second.end();
       ++i) {
    assert(pending_splits.count(*i));
    assert(!in_progress_splits.count(*i));
    pending_splits.erase(*i);
    dout(10) << __func__ << ": Completing split on pg " << *i
             << " for parent: " << parent << dendl;
    _cancel_pending_splits_for_parent(*i);
  }
  rev_pending_splits.erase(piter);
}

void OSDService::_maybe_split_pgid(OSDMapRef old_map,
                                   OSDMapRef new_map,
                                   spg_t pgid)
{
  assert(old_map->have_pg_pool(pgid.pool()));
  int old_pgnum = old_map->get_pg_num(pgid.pool());
  if (pgid.ps() < static_cast<unsigned>(old_pgnum)) {
    set<spg_t> children;
    if (pgid.is_split(old_pgnum,
                      new_map->get_pg_num(pgid.pool()), &children)) {
      _start_split(pgid, children);
    }
  } else {
    assert(pgid.ps() < static_cast<unsigned>(new_map->get_pg_num(pgid.pool())));
  }
}

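// Record any splits of pgid (and of pgs split out from it along the
// way) across every osdmap epoch in (frommap, tomap]. As a worked
// example: if pg_num for the pool doubles from 8 to 16 somewhere in
// that range, pg 3.1 gains child 3.9 (ps 1 + 8), and a later doubling
// to 32 would add 3.11 and 3.19 (ps 1 + 16 and 9 + 16, pg ids shown in
// hex). Epochs whose maps are no longer available are simply skipped;
// is_split() is then evaluated across the gap.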
void OSDService::init_splits_between(spg_t pgid,
                                     OSDMapRef frommap,
                                     OSDMapRef tomap)
{
  // First, check whether we can avoid this potentially expensive check
  if (tomap->have_pg_pool(pgid.pool()) &&
      pgid.is_split(
        frommap->get_pg_num(pgid.pool()),
        tomap->get_pg_num(pgid.pool()),
        NULL)) {
    // Ok, a split happened, so we need to walk the osdmaps
    set<spg_t> new_pgs; // pgs to scan on each map
    new_pgs.insert(pgid);
    OSDMapRef curmap(get_map(frommap->get_epoch()));
    for (epoch_t e = frommap->get_epoch() + 1;
         e <= tomap->get_epoch();
         ++e) {
      OSDMapRef nextmap(try_get_map(e));
      if (!nextmap)
        continue;
      set<spg_t> even_newer_pgs; // pgs added in this loop
      for (set<spg_t>::iterator i = new_pgs.begin(); i != new_pgs.end(); ++i) {
        set<spg_t> split_pgs;
        if (i->is_split(curmap->get_pg_num(i->pool()),
                        nextmap->get_pg_num(i->pool()),
                        &split_pgs)) {
          start_split(*i, split_pgs);
          even_newer_pgs.insert(split_pgs.begin(), split_pgs.end());
        }
      }
      new_pgs.insert(even_newer_pgs.begin(), even_newer_pgs.end());
      curmap = nextmap;
    }
    assert(curmap == tomap); // we must have had both frommap and tomap
  }
}

void OSDService::expand_pg_num(OSDMapRef old_map,
                               OSDMapRef new_map)
{
  Mutex::Locker l(in_progress_split_lock);
  for (set<spg_t>::iterator i = in_progress_splits.begin();
       i != in_progress_splits.end();
       ) {
    if (!new_map->have_pg_pool(i->pool())) {
      in_progress_splits.erase(i++);
    } else {
      _maybe_split_pgid(old_map, new_map, *i);
      ++i;
    }
  }
  for (map<spg_t, spg_t>::iterator i = pending_splits.begin();
       i != pending_splits.end();
       ) {
    if (!new_map->have_pg_pool(i->first.pool())) {
      rev_pending_splits.erase(i->second);
      pending_splits.erase(i++);
    } else {
      _maybe_split_pgid(old_map, new_map, i->first);
      ++i;
    }
  }
}

bool OSDService::splitting(spg_t pgid)
{
  Mutex::Locker l(in_progress_split_lock);
  return in_progress_splits.count(pgid) ||
         pending_splits.count(pgid);
}

void OSDService::complete_split(const set<spg_t> &pgs)
{
  Mutex::Locker l(in_progress_split_lock);
  for (set<spg_t>::const_iterator i = pgs.begin();
       i != pgs.end();
       ++i) {
    dout(10) << __func__ << ": Completing split on pg " << *i << dendl;
    assert(!pending_splits.count(*i));
    assert(in_progress_splits.count(*i));
    in_progress_splits.erase(*i);
  }
}

void OSDService::need_heartbeat_peer_update()
{
  osd->need_heartbeat_peer_update();
}

void OSDService::pg_stat_queue_enqueue(PG *pg)
{
  osd->pg_stat_queue_enqueue(pg);
}

void OSDService::pg_stat_queue_dequeue(PG *pg)
{
  osd->pg_stat_queue_dequeue(pg);
}

void OSDService::start_shutdown()
{
  {
    Mutex::Locker l(agent_timer_lock);
    agent_timer.shutdown();
  }

  {
    Mutex::Locker l(recovery_sleep_lock);
    recovery_sleep_timer.shutdown();
  }
}

void OSDService::shutdown_reserver()
{
  reserver_finisher.wait_for_empty();
  reserver_finisher.stop();
}

void OSDService::shutdown()
{
  {
    Mutex::Locker l(watch_lock);
    watch_timer.shutdown();
  }

  objecter->shutdown();
  objecter_finisher.wait_for_empty();
  objecter_finisher.stop();

  {
    Mutex::Locker l(recovery_request_lock);
    recovery_request_timer.shutdown();
  }

  {
    Mutex::Locker l(snap_sleep_lock);
    snap_sleep_timer.shutdown();
  }

  {
    Mutex::Locker l(scrub_sleep_lock);
    scrub_sleep_timer.shutdown();
  }

  osdmap = OSDMapRef();
  next_osdmap = OSDMapRef();
}

void OSDService::init()
{
  reserver_finisher.start();
  objecter_finisher.start();
  objecter->set_client_incarnation(0);

  // deprioritize objecter in daemonperf output
  objecter->get_logger()->set_prio_adjust(-3);

  watch_timer.init();
  agent_timer.init();
  snap_sleep_timer.init();
  scrub_sleep_timer.init();

  agent_thread.create("osd_srv_agent");

  if (cct->_conf->osd_recovery_delay_start)
    defer_recovery(cct->_conf->osd_recovery_delay_start);
}

void OSDService::final_init()
{
  objecter->start(osdmap.get());
}

void OSDService::activate_map()
{
  // wake/unwake the tiering agent
  agent_lock.Lock();
  agent_active =
    !osdmap->test_flag(CEPH_OSDMAP_NOTIERAGENT) &&
    osd->is_active();
  agent_cond.Signal();
  agent_lock.Unlock();
}

void OSDService::request_osdmap_update(epoch_t e)
{
  osd->osdmap_subscribe(e, false);
}

class AgentTimeoutCB : public Context {
  PGRef pg;
public:
  explicit AgentTimeoutCB(PGRef _pg) : pg(_pg) {}
  void finish(int) override {
    pg->agent_choose_mode_restart();
  }
};

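// Main loop of the cache-tiering agent thread (agent_thread, started in
// init()). It repeatedly takes the highest-priority level of
// agent_queue, asks one of its PGs to do flush/evict work via
// agent_work() within the current op budget, and sleeps on agent_cond
// whenever the queue is empty, the budget is exhausted, or the agent
// has been deactivated (e.g. via the NOTIERAGENT osdmap flag).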
void OSDService::agent_entry()
{
  dout(10) << __func__ << " start" << dendl;
  agent_lock.Lock();

  while (!agent_stop_flag) {
    if (agent_queue.empty()) {
      dout(20) << __func__ << " empty queue" << dendl;
      agent_cond.Wait(agent_lock);
      continue;
    }
    uint64_t level = agent_queue.rbegin()->first;
    set<PGRef>& top = agent_queue.rbegin()->second;
    dout(10) << __func__
             << " tiers " << agent_queue.size()
             << ", top is " << level
             << " with pgs " << top.size()
             << ", ops " << agent_ops << "/"
             << cct->_conf->osd_agent_max_ops
             << (agent_active ? " active" : " NOT ACTIVE")
             << dendl;
    dout(20) << __func__ << " oids " << agent_oids << dendl;
    int max = cct->_conf->osd_agent_max_ops - agent_ops;
    int agent_flush_quota = max;
    if (!flush_mode_high_count)
      agent_flush_quota = cct->_conf->osd_agent_max_low_ops - agent_ops;
    if (agent_flush_quota <= 0 || top.empty() || !agent_active) {
      agent_cond.Wait(agent_lock);
      continue;
    }

    if (!agent_valid_iterator || agent_queue_pos == top.end()) {
      agent_queue_pos = top.begin();
      agent_valid_iterator = true;
    }
    PGRef pg = *agent_queue_pos;
    dout(10) << "high_count " << flush_mode_high_count
             << " agent_ops " << agent_ops
             << " flush_quota " << agent_flush_quota << dendl;
    agent_lock.Unlock();
    if (!pg->agent_work(max, agent_flush_quota)) {
      dout(10) << __func__ << " " << pg->get_pgid()
               << " no agent_work, delay for " << cct->_conf->osd_agent_delay_time
               << " seconds" << dendl;

      osd->logger->inc(l_osd_tier_delay);
      // Queue a timer to call agent_choose_mode for this pg after
      // osd_agent_delay_time seconds
      agent_timer_lock.Lock();
      Context *cb = new AgentTimeoutCB(pg);
      agent_timer.add_event_after(cct->_conf->osd_agent_delay_time, cb);
      agent_timer_lock.Unlock();
    }
    agent_lock.Lock();
  }
  agent_lock.Unlock();
  dout(10) << __func__ << " finish" << dendl;
}

void OSDService::agent_stop()
{
  {
    Mutex::Locker l(agent_lock);

    // By this time all ops should be cancelled
    assert(agent_ops == 0);
    // By this time all PGs are shutdown and dequeued
    if (!agent_queue.empty()) {
      set<PGRef>& top = agent_queue.rbegin()->second;
      derr << "agent queue not empty, for example " << (*top.begin())->info.pgid << dendl;
      assert(0 == "agent queue not empty");
    }

    agent_stop_flag = true;
    agent_cond.Signal();
  }
  agent_thread.join();
}

// -------------------------------------

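// Recalibrate promote_probability_millis, the probability (in parts per
// thousand) that a read promotes an object into the cache tier, so that
// the observed promotion rate tracks the configured targets. A sketch
// of the math: with a target of 100 obj/sec, a 1.0 s interval and 1000
// promotion attempts observed, po = 100 * 1.0 * 1000 / 1000 = 100,
// i.e. promote ~10% of attempts. pb is the analogous value derived
// from the byte target and the average promoted object size; the
// smaller of the two wins, is corrected for observed skew, then
// smoothed 50/50 with the previous value and clamped to
// [min_prob, 1000].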
void OSDService::promote_throttle_recalibrate()
{
  utime_t now = ceph_clock_now();
  double dur = now - last_recalibrate;
  last_recalibrate = now;
  unsigned prob = promote_probability_millis;

  uint64_t target_obj_sec = cct->_conf->osd_tier_promote_max_objects_sec;
  uint64_t target_bytes_sec = cct->_conf->osd_tier_promote_max_bytes_sec;

  unsigned min_prob = 1;

  uint64_t attempts, obj, bytes;
  promote_counter.sample_and_attenuate(&attempts, &obj, &bytes);
  dout(10) << __func__ << " " << attempts << " attempts, promoted "
           << obj << " objects and " << pretty_si_t(bytes) << " bytes; target "
           << target_obj_sec << " obj/sec or "
           << pretty_si_t(target_bytes_sec) << " bytes/sec"
           << dendl;

  // calculate what the probability *should* be, given the targets
  unsigned new_prob;
  if (attempts && dur > 0) {
    uint64_t avg_size = 1;
    if (obj)
      avg_size = MAX(bytes / obj, 1);
    unsigned po = (double)target_obj_sec * dur * 1000.0 / (double)attempts;
    unsigned pb = (double)target_bytes_sec / (double)avg_size * dur * 1000.0
      / (double)attempts;
    dout(20) << __func__ << " po " << po << " pb " << pb << " avg_size "
             << avg_size << dendl;
    if (target_obj_sec && target_bytes_sec)
      new_prob = MIN(po, pb);
    else if (target_obj_sec)
      new_prob = po;
    else if (target_bytes_sec)
      new_prob = pb;
    else
      new_prob = 1000;
  } else {
    new_prob = 1000;
  }
  dout(20) << __func__ << " new_prob " << new_prob << dendl;

  // correct for persistent skew between target rate and actual rate, adjust
  double ratio = 1.0;
  unsigned actual = 0;
  if (attempts && obj) {
    actual = obj * 1000 / attempts;
    ratio = (double)actual / (double)prob;
    new_prob = (double)new_prob / ratio;
  }
  new_prob = MAX(new_prob, min_prob);
  new_prob = MIN(new_prob, 1000);

  // adjust
  prob = (prob + new_prob) / 2;
  prob = MAX(prob, min_prob);
  prob = MIN(prob, 1000);
  dout(10) << __func__ << " actual " << actual
           << ", actual/prob ratio " << ratio
           << ", adjusted new_prob " << new_prob
           << ", prob " << promote_probability_millis << " -> " << prob
           << dendl;
  promote_probability_millis = prob;

  // set hard limits for this interval to mitigate stampedes
  promote_max_objects = target_obj_sec * OSD::OSD_TICK_INTERVAL * 2;
  promote_max_bytes = target_bytes_sec * OSD::OSD_TICK_INTERVAL * 2;
}

// -------------------------------------

float OSDService::get_failsafe_full_ratio()
{
  float full_ratio = cct->_conf->osd_failsafe_full_ratio;
  if (full_ratio > 1.0) full_ratio /= 100.0;
  return full_ratio;
}

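// Map a utilization ratio onto this OSD's fullness state. The four
// thresholds are forced into a sane order -- nearfull <= backfillfull
// <= full <= failsafe -- by taking a running max, so a misconfigured
// osdmap (e.g. full set below backfillfull) degrades gracefully instead
// of yielding contradictory states.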
void OSDService::check_full_status(float ratio)
{
  Mutex::Locker l(full_status_lock);

  cur_ratio = ratio;

  // The OSDMap ratios take precedence. So if the failsafe is .95 and
  // the admin sets the cluster full to .96, the failsafe moves up to .96
  // too. (Not that having failsafe == full is ideal, but it's better than
  // dropping writes before the cluster appears full.)
  OSDMapRef osdmap = get_osdmap();
  if (!osdmap || osdmap->get_epoch() == 0) {
    cur_state = NONE;
    return;
  }
  float nearfull_ratio = osdmap->get_nearfull_ratio();
  float backfillfull_ratio = std::max(osdmap->get_backfillfull_ratio(), nearfull_ratio);
  float full_ratio = std::max(osdmap->get_full_ratio(), backfillfull_ratio);
  float failsafe_ratio = std::max(get_failsafe_full_ratio(), full_ratio);

  if (osdmap->require_osd_release < CEPH_RELEASE_LUMINOUS) {
    // use the failsafe for nearfull and full; the mon isn't using the
    // flags anyway because we're mid-upgrade.
    full_ratio = failsafe_ratio;
    backfillfull_ratio = failsafe_ratio;
    nearfull_ratio = failsafe_ratio;
  } else if (full_ratio <= 0 ||
             backfillfull_ratio <= 0 ||
             nearfull_ratio <= 0) {
    derr << __func__ << " full_ratio, backfillfull_ratio or nearfull_ratio is <= 0" << dendl;
    // use failsafe flag. ick. the monitor did something wrong or the user
    // did something stupid.
    full_ratio = failsafe_ratio;
    backfillfull_ratio = failsafe_ratio;
    nearfull_ratio = failsafe_ratio;
  }

  string inject;
  s_names new_state;
  if (injectfull_state > NONE && injectfull) {
    new_state = injectfull_state;
    inject = "(Injected)";
  } else if (ratio > failsafe_ratio) {
    new_state = FAILSAFE;
  } else if (ratio > full_ratio) {
    new_state = FULL;
  } else if (ratio > backfillfull_ratio) {
    new_state = BACKFILLFULL;
  } else if (ratio > nearfull_ratio) {
    new_state = NEARFULL;
  } else {
    new_state = NONE;
  }
  dout(20) << __func__ << " cur ratio " << ratio
           << ", nearfull_ratio " << nearfull_ratio
           << ", backfillfull_ratio " << backfillfull_ratio
           << ", full_ratio " << full_ratio
           << ", failsafe_ratio " << failsafe_ratio
           << ", new state " << get_full_state_name(new_state)
           << " " << inject
           << dendl;

  // warn
  if (cur_state != new_state) {
    dout(10) << __func__ << " " << get_full_state_name(cur_state)
             << " -> " << get_full_state_name(new_state) << dendl;
    if (new_state == FAILSAFE) {
      clog->error() << "full status failsafe engaged, dropping updates, now "
                    << (int)roundf(ratio * 100) << "% full";
    } else if (cur_state == FAILSAFE) {
      clog->error() << "full status failsafe disengaged, no longer dropping "
                    << "updates, now " << (int)roundf(ratio * 100) << "% full";
    }
    cur_state = new_state;
  }
}

bool OSDService::need_fullness_update()
{
  OSDMapRef osdmap = get_osdmap();
  s_names cur = NONE;
  if (osdmap->exists(whoami)) {
    if (osdmap->get_state(whoami) & CEPH_OSD_FULL) {
      cur = FULL;
    } else if (osdmap->get_state(whoami) & CEPH_OSD_BACKFILLFULL) {
      cur = BACKFILLFULL;
    } else if (osdmap->get_state(whoami) & CEPH_OSD_NEARFULL) {
      cur = NEARFULL;
    }
  }
  s_names want = NONE;
  if (is_full())
    want = FULL;
  else if (is_backfillfull())
    want = BACKFILLFULL;
  else if (is_nearfull())
    want = NEARFULL;
  return want != cur;
}

bool OSDService::_check_full(s_names type, ostream &ss) const
{
  Mutex::Locker l(full_status_lock);

  if (injectfull && injectfull_state >= type) {
    // injectfull is either a count of the number of times to return
    // failsafe full, or, if -1, a flag to always return full
    if (injectfull > 0)
      --injectfull;
    ss << "Injected " << get_full_state_name(type) << " OSD ("
       << (injectfull < 0 ? "set" : std::to_string(injectfull)) << ")";
    return true;
  }

  ss << "current usage is " << cur_ratio;
  return cur_state >= type;
}

bool OSDService::check_failsafe_full(ostream &ss) const
{
  return _check_full(FAILSAFE, ss);
}

bool OSDService::check_full(ostream &ss) const
{
  return _check_full(FULL, ss);
}

bool OSDService::check_backfill_full(ostream &ss) const
{
  return _check_full(BACKFILLFULL, ss);
}

bool OSDService::check_nearfull(ostream &ss) const
{
  return _check_full(NEARFULL, ss);
}

bool OSDService::is_failsafe_full() const
{
  Mutex::Locker l(full_status_lock);
  return cur_state == FAILSAFE;
}

bool OSDService::is_full() const
{
  Mutex::Locker l(full_status_lock);
  return cur_state >= FULL;
}

bool OSDService::is_backfillfull() const
{
  Mutex::Locker l(full_status_lock);
  return cur_state >= BACKFILLFULL;
}

bool OSDService::is_nearfull() const
{
  Mutex::Locker l(full_status_lock);
  return cur_state >= NEARFULL;
}

void OSDService::set_injectfull(s_names type, int64_t count)
{
  Mutex::Locker l(full_status_lock);
  injectfull_state = type;
  injectfull = count;
}

osd_stat_t OSDService::set_osd_stat(const struct store_statfs_t &stbuf,
                                    vector<int>& hb_peers,
                                    int num_pgs)
{
  uint64_t bytes = stbuf.total;
  uint64_t used = bytes - stbuf.available;
  uint64_t avail = stbuf.available;

  osd->logger->set(l_osd_stat_bytes, bytes);
  osd->logger->set(l_osd_stat_bytes_used, used);
  osd->logger->set(l_osd_stat_bytes_avail, avail);

  {
    Mutex::Locker l(stat_lock);
    osd_stat.hb_peers.swap(hb_peers);
    osd->op_tracker.get_age_ms_histogram(&osd_stat.op_queue_age_hist);
    osd_stat.kb = bytes >> 10;
    osd_stat.kb_used = used >> 10;
    osd_stat.kb_avail = avail >> 10;
    osd_stat.num_pgs = num_pgs;
    return osd_stat;
  }
}

void OSDService::update_osd_stat(vector<int>& hb_peers)
{
  // load osd stats first
  struct store_statfs_t stbuf;
  int r = osd->store->statfs(&stbuf);
  if (r < 0) {
    derr << "statfs() failed: " << cpp_strerror(r) << dendl;
    return;
  }

  auto new_stat = set_osd_stat(stbuf, hb_peers, osd->get_num_pgs());
  dout(20) << "update_osd_stat " << new_stat << dendl;
  assert(new_stat.kb);
  float ratio = ((float)new_stat.kb_used) / ((float)new_stat.kb);
  check_full_status(ratio);
}

bool OSDService::check_osdmap_full(const set<pg_shard_t> &missing_on)
{
  OSDMapRef osdmap = get_osdmap();
  for (auto shard : missing_on) {
    if (osdmap->get_state(shard.osd) & CEPH_OSD_FULL)
      return true;
  }
  return false;
}

void OSDService::send_message_osd_cluster(int peer, Message *m, epoch_t from_epoch)
{
  OSDMapRef next_map = get_nextmap_reserved();
  // service map is always newer/newest
  assert(from_epoch <= next_map->get_epoch());

  if (next_map->is_down(peer) ||
      next_map->get_info(peer).up_from > from_epoch) {
    m->put();
    release_map(next_map);
    return;
  }
  const entity_inst_t& peer_inst = next_map->get_cluster_inst(peer);
  ConnectionRef peer_con = osd->cluster_messenger->get_connection(peer_inst);
  share_map_peer(peer, peer_con.get(), next_map);
  peer_con->send_message(m);
  release_map(next_map);
}

ConnectionRef OSDService::get_con_osd_cluster(int peer, epoch_t from_epoch)
{
  OSDMapRef next_map = get_nextmap_reserved();
  // service map is always newer/newest
  assert(from_epoch <= next_map->get_epoch());

  if (next_map->is_down(peer) ||
      next_map->get_info(peer).up_from > from_epoch) {
    release_map(next_map);
    return NULL;
  }
  ConnectionRef con = osd->cluster_messenger->get_connection(next_map->get_cluster_inst(peer));
  release_map(next_map);
  return con;
}

pair<ConnectionRef,ConnectionRef> OSDService::get_con_osd_hb(int peer, epoch_t from_epoch)
{
  OSDMapRef next_map = get_nextmap_reserved();
  // service map is always newer/newest
  assert(from_epoch <= next_map->get_epoch());

  pair<ConnectionRef,ConnectionRef> ret;
  if (next_map->is_down(peer) ||
      next_map->get_info(peer).up_from > from_epoch) {
    release_map(next_map);
    return ret;
  }
  ret.first = osd->hb_back_client_messenger->get_connection(next_map->get_hb_back_inst(peer));
  if (next_map->get_hb_front_addr(peer) != entity_addr_t())
    ret.second = osd->hb_front_client_messenger->get_connection(next_map->get_hb_front_inst(peer));
  release_map(next_map);
  return ret;
}

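// pg_temp requests flow through two maps: pg_temp_wanted holds mappings
// not yet sent to the mon, pg_temp_pending holds mappings sent but not
// yet reflected in an osdmap. send_pg_temp() batches everything wanted
// into a single MOSDPGTemp; requeue_pg_temp() folds pending back into
// wanted (presumably after a mon session reset) so the request gets
// resent.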
void OSDService::queue_want_pg_temp(pg_t pgid, vector<int>& want)
{
  Mutex::Locker l(pg_temp_lock);
  map<pg_t,vector<int> >::iterator p = pg_temp_pending.find(pgid);
  if (p == pg_temp_pending.end() ||
      p->second != want) {
    pg_temp_wanted[pgid] = want;
  }
}

void OSDService::remove_want_pg_temp(pg_t pgid)
{
  Mutex::Locker l(pg_temp_lock);
  pg_temp_wanted.erase(pgid);
  pg_temp_pending.erase(pgid);
}

void OSDService::_sent_pg_temp()
{
  for (map<pg_t,vector<int> >::iterator p = pg_temp_wanted.begin();
       p != pg_temp_wanted.end();
       ++p)
    pg_temp_pending[p->first] = p->second;
  pg_temp_wanted.clear();
}

void OSDService::requeue_pg_temp()
{
  Mutex::Locker l(pg_temp_lock);
  // wanted overrides pending. note that remove_want_pg_temp
  // clears the item out of both.
  unsigned old_wanted = pg_temp_wanted.size();
  unsigned old_pending = pg_temp_pending.size();
  _sent_pg_temp();
  pg_temp_wanted.swap(pg_temp_pending);
  dout(10) << __func__ << " " << old_wanted << " + " << old_pending << " -> "
           << pg_temp_wanted.size() << dendl;
}

void OSDService::send_pg_temp()
{
  Mutex::Locker l(pg_temp_lock);
  if (pg_temp_wanted.empty())
    return;
  dout(10) << "send_pg_temp " << pg_temp_wanted << dendl;
  MOSDPGTemp *m = new MOSDPGTemp(osdmap->get_epoch());
  m->pg_temp = pg_temp_wanted;
  monc->send_mon_message(m);
  _sent_pg_temp();
}

void OSDService::send_pg_created(pg_t pgid)
{
  dout(20) << __func__ << dendl;
  if (osdmap->require_osd_release >= CEPH_RELEASE_LUMINOUS) {
    monc->send_mon_message(new MOSDPGCreated(pgid));
  }
}

// --------------------------------------
// dispatch

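// peer_map_epoch caches the newest osdmap epoch each peer OSD is known
// to have, letting us skip resending maps a peer already has. Entries
// only move forward via note_peer_epoch() and are dropped in
// forget_peer_epoch() once a peer's connection resets.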
epoch_t OSDService::get_peer_epoch(int peer)
{
  Mutex::Locker l(peer_map_epoch_lock);
  map<int,epoch_t>::iterator p = peer_map_epoch.find(peer);
  if (p == peer_map_epoch.end())
    return 0;
  return p->second;
}

epoch_t OSDService::note_peer_epoch(int peer, epoch_t e)
{
  Mutex::Locker l(peer_map_epoch_lock);
  map<int,epoch_t>::iterator p = peer_map_epoch.find(peer);
  if (p != peer_map_epoch.end()) {
    if (p->second < e) {
      dout(10) << "note_peer_epoch osd." << peer << " has " << e << dendl;
      p->second = e;
    } else {
      dout(30) << "note_peer_epoch osd." << peer << " has " << p->second << " >= " << e << dendl;
    }
    return p->second;
  } else {
    dout(10) << "note_peer_epoch osd." << peer << " now has " << e << dendl;
    peer_map_epoch[peer] = e;
    return e;
  }
}

void OSDService::forget_peer_epoch(int peer, epoch_t as_of)
{
  Mutex::Locker l(peer_map_epoch_lock);
  map<int,epoch_t>::iterator p = peer_map_epoch.find(peer);
  if (p != peer_map_epoch.end()) {
    if (p->second <= as_of) {
      dout(10) << "forget_peer_epoch osd." << peer << " as_of " << as_of
               << " had " << p->second << dendl;
      peer_map_epoch.erase(p);
    } else {
      dout(10) << "forget_peer_epoch osd." << peer << " as_of " << as_of
               << " has " << p->second << " - not forgetting" << dendl;
    }
  }
}

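// Decide whether to push our newer osdmap to the sender of a message:
// clients are judged against the per-session last_sent_epoch, while
// peer OSDs (recognized by their cluster or heartbeat address) are
// judged against the cached peer_map_epoch above.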
bool OSDService::should_share_map(entity_name_t name, Connection *con,
                                  epoch_t epoch, const OSDMapRef& osdmap,
                                  const epoch_t *sent_epoch_p)
{
  dout(20) << "should_share_map "
           << name << " " << con->get_peer_addr()
           << " " << epoch << dendl;

  // does client have old map?
  if (name.is_client()) {
    bool message_sendmap = epoch < osdmap->get_epoch();
    if (message_sendmap && sent_epoch_p) {
      dout(20) << "client session last_sent_epoch: "
               << *sent_epoch_p
               << " versus osdmap epoch " << osdmap->get_epoch() << dendl;
      if (*sent_epoch_p < osdmap->get_epoch()) {
        return true;
      } // else we don't need to send it out again
    }
  }

  if (con->get_messenger() == osd->cluster_messenger &&
      con != osd->cluster_messenger->get_loopback_connection() &&
      osdmap->is_up(name.num()) &&
      (osdmap->get_cluster_addr(name.num()) == con->get_peer_addr() ||
       osdmap->get_hb_back_addr(name.num()) == con->get_peer_addr())) {
    // remember
    epoch_t has = MAX(get_peer_epoch(name.num()), epoch);

    // share?
    if (has < osdmap->get_epoch()) {
      dout(10) << name << " " << con->get_peer_addr()
               << " has old map " << epoch << " < "
               << osdmap->get_epoch() << dendl;
      return true;
    }
  }

  return false;
}

void OSDService::share_map(
    entity_name_t name,
    Connection *con,
    epoch_t epoch,
    OSDMapRef& osdmap,
    epoch_t *sent_epoch_p)
{
  dout(20) << "share_map "
           << name << " " << con->get_peer_addr()
           << " " << epoch << dendl;

  if (!osd->is_active()) {
    /* It is safe not to proceed, as the OSD is not in a healthy state. */
    return;
  }

  bool want_shared = should_share_map(name, con, epoch,
                                      osdmap, sent_epoch_p);

  if (want_shared) {
    if (name.is_client()) {
      dout(10) << name << " has old map " << epoch
               << " < " << osdmap->get_epoch() << dendl;
      // we know the Session is valid or we wouldn't be sending
      if (sent_epoch_p) {
        *sent_epoch_p = osdmap->get_epoch();
      }
      send_incremental_map(epoch, con, osdmap);
    } else if (con->get_messenger() == osd->cluster_messenger &&
               osdmap->is_up(name.num()) &&
               (osdmap->get_cluster_addr(name.num()) == con->get_peer_addr() ||
                osdmap->get_hb_back_addr(name.num()) == con->get_peer_addr())) {
      dout(10) << name << " " << con->get_peer_addr()
               << " has old map " << epoch << " < "
               << osdmap->get_epoch() << dendl;
      note_peer_epoch(name.num(), osdmap->get_epoch());
      send_incremental_map(epoch, con, osdmap);
    }
  }
}

void OSDService::share_map_peer(int peer, Connection *con, OSDMapRef map)
{
  if (!map)
    map = get_osdmap();

  // send map?
  epoch_t pe = get_peer_epoch(peer);
  if (pe) {
    if (pe < map->get_epoch()) {
      send_incremental_map(pe, con, map);
      note_peer_epoch(peer, map->get_epoch());
    } else
      dout(20) << "share_map_peer " << con << " already has epoch " << pe << dendl;
  } else {
    dout(20) << "share_map_peer " << con << " don't know epoch, doing nothing" << dendl;
    // no idea about peer's epoch.
    // ??? send recent ???
    // do nothing.
  }
}

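// Scrub slot accounting: scrubs_pending counts reservations granted but
// not yet started, scrubs_active counts running scrubs, and their sum
// is capped at osd_max_scrubs. When a reserved scrub starts,
// inc_scrubs_active(true) moves one unit from pending to active.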
bool OSDService::can_inc_scrubs_pending()
{
  bool can_inc = false;
  Mutex::Locker l(sched_scrub_lock);

  if (scrubs_pending + scrubs_active < cct->_conf->osd_max_scrubs) {
    dout(20) << __func__ << " " << scrubs_pending << " -> " << (scrubs_pending+1)
             << " (max " << cct->_conf->osd_max_scrubs << ", active " << scrubs_active << ")" << dendl;
    can_inc = true;
  } else {
    dout(20) << __func__ << " " << scrubs_pending << " + " << scrubs_active << " active >= max " << cct->_conf->osd_max_scrubs << dendl;
  }

  return can_inc;
}

bool OSDService::inc_scrubs_pending()
{
  bool result = false;

  sched_scrub_lock.Lock();
  if (scrubs_pending + scrubs_active < cct->_conf->osd_max_scrubs) {
    dout(20) << "inc_scrubs_pending " << scrubs_pending << " -> " << (scrubs_pending+1)
             << " (max " << cct->_conf->osd_max_scrubs << ", active " << scrubs_active << ")" << dendl;
    result = true;
    ++scrubs_pending;
  } else {
    dout(20) << "inc_scrubs_pending " << scrubs_pending << " + " << scrubs_active << " active >= max " << cct->_conf->osd_max_scrubs << dendl;
  }
  sched_scrub_lock.Unlock();

  return result;
}

void OSDService::dec_scrubs_pending()
{
  sched_scrub_lock.Lock();
  dout(20) << "dec_scrubs_pending " << scrubs_pending << " -> " << (scrubs_pending-1)
           << " (max " << cct->_conf->osd_max_scrubs << ", active " << scrubs_active << ")" << dendl;
  --scrubs_pending;
  assert(scrubs_pending >= 0);
  sched_scrub_lock.Unlock();
}

void OSDService::inc_scrubs_active(bool reserved)
{
  sched_scrub_lock.Lock();
  ++(scrubs_active);
  if (reserved) {
    --(scrubs_pending);
    dout(20) << "inc_scrubs_active " << (scrubs_active-1) << " -> " << scrubs_active
             << " (max " << cct->_conf->osd_max_scrubs
             << ", pending " << (scrubs_pending+1) << " -> " << scrubs_pending << ")" << dendl;
    assert(scrubs_pending >= 0);
  } else {
    dout(20) << "inc_scrubs_active " << (scrubs_active-1) << " -> " << scrubs_active
             << " (max " << cct->_conf->osd_max_scrubs
             << ", pending " << scrubs_pending << ")" << dendl;
  }
  sched_scrub_lock.Unlock();
}

void OSDService::dec_scrubs_active()
{
  sched_scrub_lock.Lock();
  dout(20) << "dec_scrubs_active " << scrubs_active << " -> " << (scrubs_active-1)
           << " (max " << cct->_conf->osd_max_scrubs << ", pending " << scrubs_pending << ")" << dendl;
  --scrubs_active;
  assert(scrubs_active >= 0);
  sched_scrub_lock.Unlock();
}

void OSDService::retrieve_epochs(epoch_t *_boot_epoch, epoch_t *_up_epoch,
                                 epoch_t *_bind_epoch) const
{
  Mutex::Locker l(epoch_lock);
  if (_boot_epoch)
    *_boot_epoch = boot_epoch;
  if (_up_epoch)
    *_up_epoch = up_epoch;
  if (_bind_epoch)
    *_bind_epoch = bind_epoch;
}

void OSDService::set_epochs(const epoch_t *_boot_epoch, const epoch_t *_up_epoch,
                            const epoch_t *_bind_epoch)
{
  Mutex::Locker l(epoch_lock);
  if (_boot_epoch) {
    assert(*_boot_epoch == 0 || *_boot_epoch >= boot_epoch);
    boot_epoch = *_boot_epoch;
  }
  if (_up_epoch) {
    assert(*_up_epoch == 0 || *_up_epoch >= up_epoch);
    up_epoch = *_up_epoch;
  }
  if (_bind_epoch) {
    assert(*_bind_epoch == 0 || *_bind_epoch >= bind_epoch);
    bind_epoch = *_bind_epoch;
  }
}

bool OSDService::prepare_to_stop()
{
  Mutex::Locker l(is_stopping_lock);
  if (get_state() != NOT_STOPPING)
    return false;

  OSDMapRef osdmap = get_osdmap();
  if (osdmap && osdmap->is_up(whoami)) {
    dout(0) << __func__ << " telling mon we are shutting down" << dendl;
    set_state(PREPARING_TO_STOP);
    monc->send_mon_message(new MOSDMarkMeDown(monc->get_fsid(),
                                              osdmap->get_inst(whoami),
                                              osdmap->get_epoch(),
                                              true  // request ack
                                              ));
    utime_t now = ceph_clock_now();
    utime_t timeout;
    timeout.set_from_double(now + cct->_conf->osd_mon_shutdown_timeout);
    while ((ceph_clock_now() < timeout) &&
           (get_state() != STOPPING)) {
      is_stopping_cond.WaitUntil(is_stopping_lock, timeout);
    }
  }
  dout(0) << __func__ << " starting shutdown" << dendl;
  set_state(STOPPING);
  return true;
}

void OSDService::got_stop_ack()
{
  Mutex::Locker l(is_stopping_lock);
  if (get_state() == PREPARING_TO_STOP) {
    dout(0) << __func__ << " starting shutdown" << dendl;
    set_state(STOPPING);
    is_stopping_cond.Signal();
  } else {
    dout(10) << __func__ << " ignoring msg" << dendl;
  }
}

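// Build an MOSDMap covering (since, to]. We walk backwards from `to`,
// preferring incremental maps; when an incremental is unavailable we
// fall back to a full map for that epoch and stop, since the recipient
// can apply the already-collected incrementals on top of it. Returns
// NULL (after logging the range) if neither form of some needed map
// can be read.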
MOSDMap *OSDService::build_incremental_map_msg(epoch_t since, epoch_t to,
                                               OSDSuperblock& sblock)
{
  MOSDMap *m = new MOSDMap(monc->get_fsid());
  m->oldest_map = max_oldest_map;
  m->newest_map = sblock.newest_map;

  for (epoch_t e = to; e > since; e--) {
    bufferlist bl;
    if (e > m->oldest_map && get_inc_map_bl(e, bl)) {
      m->incremental_maps[e].claim(bl);
    } else if (get_map_bl(e, bl)) {
      m->maps[e].claim(bl);
      break;
    } else {
      derr << "since " << since << " to " << to
           << " oldest " << m->oldest_map << " newest " << m->newest_map
           << dendl;
      m->put();
      m = NULL;
      break;
    }
  }
  return m;
}

void OSDService::send_map(MOSDMap *m, Connection *con)
{
  con->send_message(m);
}

void OSDService::send_incremental_map(epoch_t since, Connection *con,
                                      OSDMapRef& osdmap)
{
  epoch_t to = osdmap->get_epoch();
  dout(10) << "send_incremental_map " << since << " -> " << to
           << " to " << con << " " << con->get_peer_addr() << dendl;

  MOSDMap *m = NULL;
  while (!m) {
    OSDSuperblock sblock(get_superblock());
    if (since < sblock.oldest_map) {
      // just send latest full map
      MOSDMap *m = new MOSDMap(monc->get_fsid());
      m->oldest_map = max_oldest_map;
      m->newest_map = sblock.newest_map;
      get_map_bl(to, m->maps[to]);
      send_map(m, con);
      return;
    }

    if (to > since && (int64_t)(to - since) > cct->_conf->osd_map_share_max_epochs) {
      dout(10) << " " << (to - since) << " > max " << cct->_conf->osd_map_share_max_epochs
               << ", only sending most recent" << dendl;
      since = to - cct->_conf->osd_map_share_max_epochs;
    }

    if (to - since > (epoch_t)cct->_conf->osd_map_message_max)
      to = since + cct->_conf->osd_map_message_max;
    m = build_incremental_map_msg(since, to, sblock);
  }
  send_map(m, con);
}

bool OSDService::_get_map_bl(epoch_t e, bufferlist& bl)
{
  bool found = map_bl_cache.lookup(e, &bl);
  if (found) {
    if (logger)
      logger->inc(l_osd_map_bl_cache_hit);
    return true;
  }
  if (logger)
    logger->inc(l_osd_map_bl_cache_miss);
  found = store->read(coll_t::meta(),
                      OSD::get_osdmap_pobject_name(e), 0, 0, bl,
                      CEPH_OSD_OP_FLAG_FADVISE_WILLNEED) >= 0;
  if (found) {
    _add_map_bl(e, bl);
  }
  return found;
}

bool OSDService::get_inc_map_bl(epoch_t e, bufferlist& bl)
{
  Mutex::Locker l(map_cache_lock);
  bool found = map_bl_inc_cache.lookup(e, &bl);
  if (found) {
    if (logger)
      logger->inc(l_osd_map_bl_cache_hit);
    return true;
  }
  if (logger)
    logger->inc(l_osd_map_bl_cache_miss);
  found = store->read(coll_t::meta(),
                      OSD::get_inc_osdmap_pobject_name(e), 0, 0, bl,
                      CEPH_OSD_OP_FLAG_FADVISE_WILLNEED) >= 0;
  if (found) {
    _add_map_inc_bl(e, bl);
  }
  return found;
}

void OSDService::_add_map_bl(epoch_t e, bufferlist& bl)
{
  dout(10) << "add_map_bl " << e << " " << bl.length() << " bytes" << dendl;
  // cache a contiguous buffer
  if (bl.get_num_buffers() > 1) {
    bl.rebuild();
  }
  bl.try_assign_to_mempool(mempool::mempool_osd_mapbl);
  map_bl_cache.add(e, bl);
}

void OSDService::_add_map_inc_bl(epoch_t e, bufferlist& bl)
{
  dout(10) << "add_map_inc_bl " << e << " " << bl.length() << " bytes" << dendl;
  // cache a contiguous buffer
  if (bl.get_num_buffers() > 1) {
    bl.rebuild();
  }
  bl.try_assign_to_mempool(mempool::mempool_osd_mapbl);
  map_bl_inc_cache.add(e, bl);
}

void OSDService::pin_map_inc_bl(epoch_t e, bufferlist &bl)
{
  Mutex::Locker l(map_cache_lock);
  // cache a contiguous buffer
  if (bl.get_num_buffers() > 1) {
    bl.rebuild();
  }
  map_bl_inc_cache.pin(e, bl);
}

void OSDService::pin_map_bl(epoch_t e, bufferlist &bl)
{
  Mutex::Locker l(map_cache_lock);
  // cache a contiguous buffer
  if (bl.get_num_buffers() > 1) {
    bl.rebuild();
  }
  map_bl_cache.pin(e, bl);
}

void OSDService::clear_map_bl_cache_pins(epoch_t e)
{
  Mutex::Locker l(map_cache_lock);
  map_bl_inc_cache.clear_pinned(e);
  map_bl_cache.clear_pinned(e);
}

OSDMapRef OSDService::_add_map(OSDMap *o)
{
  epoch_t e = o->get_epoch();

  if (cct->_conf->osd_map_dedup) {
    // Dedup against an existing map at a nearby epoch
    OSDMapRef for_dedup = map_cache.lower_bound(e);
    if (for_dedup) {
      OSDMap::dedup(for_dedup.get(), o);
    }
  }
  bool existed;
  OSDMapRef l = map_cache.add(e, o, &existed);
  if (existed) {
    delete o;
  }
  return l;
}

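// Look up an OSDMap by epoch, preferring the in-memory cache
// (map_cache). On a miss the map is decoded from the store and
// inserted via _add_map(), which may also dedup shared structures
// against a cached map at a nearby epoch to save memory. Returns an
// empty ref if the epoch cannot be loaded.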
OSDMapRef OSDService::try_get_map(epoch_t epoch)
{
  Mutex::Locker l(map_cache_lock);
  OSDMapRef retval = map_cache.lookup(epoch);
  if (retval) {
    dout(30) << "get_map " << epoch << " - cached" << dendl;
    if (logger) {
      logger->inc(l_osd_map_cache_hit);
    }
    return retval;
  }
  if (logger) {
    logger->inc(l_osd_map_cache_miss);
    epoch_t lb = map_cache.cached_key_lower_bound();
    if (epoch < lb) {
      dout(30) << "get_map " << epoch << " - miss, below lower bound" << dendl;
      logger->inc(l_osd_map_cache_miss_low);
      logger->inc(l_osd_map_cache_miss_low_avg, lb - epoch);
    }
  }

  OSDMap *map = new OSDMap;
  if (epoch > 0) {
    dout(20) << "get_map " << epoch << " - loading and decoding " << map << dendl;
    bufferlist bl;
    if (!_get_map_bl(epoch, bl) || bl.length() == 0) {
      derr << "failed to load OSD map for epoch " << epoch << ", got " << bl.length() << " bytes" << dendl;
      delete map;
      return OSDMapRef();
    }
    map->decode(bl);
  } else {
    dout(20) << "get_map " << epoch << " - return initial " << map << dendl;
  }
  return _add_map(map);
}

// ops


void OSDService::reply_op_error(OpRequestRef op, int err)
{
  reply_op_error(op, err, eversion_t(), 0);
}

void OSDService::reply_op_error(OpRequestRef op, int err, eversion_t v,
                                version_t uv)
{
  const MOSDOp *m = static_cast<const MOSDOp*>(op->get_req());
  assert(m->get_type() == CEPH_MSG_OSD_OP);
  int flags;
  flags = m->get_flags() & (CEPH_OSD_FLAG_ACK|CEPH_OSD_FLAG_ONDISK);

  MOSDOpReply *reply = new MOSDOpReply(m, err, osdmap->get_epoch(), flags,
                                       true);
  reply->set_reply_versions(v, uv);
  m->get_connection()->send_message(reply);
}

void OSDService::handle_misdirected_op(PG *pg, OpRequestRef op)
{
  if (!cct->_conf->osd_debug_misdirected_ops) {
    return;
  }

  const MOSDOp *m = static_cast<const MOSDOp*>(op->get_req());
  assert(m->get_type() == CEPH_MSG_OSD_OP);

  assert(m->get_map_epoch() >= pg->info.history.same_primary_since);

  if (pg->is_ec_pg()) {
    /**
     * OSD recomputes op target based on current OSDMap. With an EC pg, we
     * can get this result:
     * 1) client at map 512 sends an op to osd 3, pg_t 3.9 based on mapping
     *    [CRUSH_ITEM_NONE, 2, 3]/3
     * 2) OSD 3 at map 513 remaps op to osd 3, spg_t 3.9s0 based on mapping
     *    [3, 2, 3]/3
     * 3) PG 3.9s0 dequeues the op at epoch 512 and notices that it isn't primary
     *    -- misdirected op
     * 4) client resends and this time PG 3.9s0 having caught up to 513 gets
     *    it and fulfils it
     *
     * We can't compute the op target based on the sending map epoch due to
     * splitting. The simplest thing is to detect such cases here and drop
     * them without an error (the client will resend anyway).
     */
    assert(m->get_map_epoch() <= superblock.newest_map);
    OSDMapRef opmap = try_get_map(m->get_map_epoch());
    if (!opmap) {
      dout(7) << __func__ << ": " << *pg << " no longer have map for "
              << m->get_map_epoch() << ", dropping" << dendl;
      return;
    }
    pg_t _pgid = m->get_raw_pg();
    spg_t pgid;
    if ((m->get_flags() & CEPH_OSD_FLAG_PGOP) == 0)
      _pgid = opmap->raw_pg_to_pg(_pgid);
    if (opmap->get_primary_shard(_pgid, &pgid) &&
        pgid.shard != pg->info.pgid.shard) {
      dout(7) << __func__ << ": " << *pg << " primary changed since "
              << m->get_map_epoch() << ", dropping" << dendl;
      return;
    }
  }

  dout(7) << *pg << " misdirected op in " << m->get_map_epoch() << dendl;
  clog->warn() << m->get_source_inst() << " misdirected " << m->get_reqid()
               << " pg " << m->get_raw_pg()
               << " to osd." << whoami
               << " not " << pg->acting
               << " in e" << m->get_map_epoch() << "/" << osdmap->get_epoch();
}

void OSDService::enqueue_back(spg_t pgid, PGQueueable qi)
{
  osd->op_shardedwq.queue(make_pair(pgid, qi));
}

void OSDService::enqueue_front(spg_t pgid, PGQueueable qi)
{
  osd->op_shardedwq.queue_front(make_pair(pgid, qi));
}

void OSDService::queue_for_peering(PG *pg)
{
  peering_wq.queue(pg);
}

void OSDService::queue_for_snap_trim(PG *pg)
{
  dout(10) << "queueing " << *pg << " for snaptrim" << dendl;
  osd->op_shardedwq.queue(
    make_pair(
      pg->info.pgid,
      PGQueueable(
        PGSnapTrim(pg->get_osdmap()->get_epoch()),
        cct->_conf->osd_snap_trim_cost,
        cct->_conf->osd_snap_trim_priority,
        ceph_clock_now(),
        entity_inst_t(),
        pg->get_osdmap()->get_epoch())));
}


// ====================================================================
// OSD

#undef dout_prefix
#define dout_prefix *_dout

// Commands shared between the OSD's console and the admin console:
namespace ceph {
namespace osd_cmds {

int heap(CephContext& cct, cmdmap_t& cmdmap, Formatter& f, std::ostream& os);

}} // namespace ceph::osd_cmds

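// One-time initialization of an OSD's backing store (invoked from
// ceph-osd --mkfs): mkfs and mount the ObjectStore, then either
// validate an existing OSD superblock (the given id and cluster fsid
// must match) or write a fresh one, and finally record the metadata
// keys (magic, whoami, ceph_fsid, optional osd_key, ready) via
// write_meta().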
1718 int OSD::mkfs(CephContext *cct, ObjectStore *store, const string &dev,
1719 uuid_d fsid, int whoami)
1720 {
1721 int ret;
1722
1723 ceph::shared_ptr<ObjectStore::Sequencer> osr(
1724 new ObjectStore::Sequencer("mkfs"));
1725 OSDSuperblock sb;
1726 bufferlist sbbl;
1727 C_SaferCond waiter;
1728
1729 // if we are fed a uuid for this osd, use it.
1730 store->set_fsid(cct->_conf->osd_uuid);
1731
1732 ret = store->mkfs();
1733 if (ret) {
1734 derr << "OSD::mkfs: ObjectStore::mkfs failed with error "
1735 << cpp_strerror(ret) << dendl;
1736 goto free_store;
1737 }
1738
1739 store->set_cache_shards(1); // doesn't matter for mkfs!
1740
1741 ret = store->mount();
1742 if (ret) {
1743 derr << "OSD::mkfs: couldn't mount ObjectStore: error "
1744 << cpp_strerror(ret) << dendl;
1745 goto free_store;
1746 }
1747
1748 ret = store->read(coll_t::meta(), OSD_SUPERBLOCK_GOBJECT, 0, 0, sbbl);
1749 if (ret >= 0) {
1750 /* if we already have superblock, check content of superblock */
1751 dout(0) << " have superblock" << dendl;
1752 bufferlist::iterator p;
1753 p = sbbl.begin();
1754 ::decode(sb, p);
1755 if (whoami != sb.whoami) {
1756 derr << "provided osd id " << whoami << " != superblock's " << sb.whoami
1757 << dendl;
1758 ret = -EINVAL;
1759 goto umount_store;
1760 }
1761 if (fsid != sb.cluster_fsid) {
1762 derr << "provided cluster fsid " << fsid
1763 << " != superblock's " << sb.cluster_fsid << dendl;
1764 ret = -EINVAL;
1765 goto umount_store;
1766 }
1767 } else {
1768 // create superblock
1769 sb.cluster_fsid = fsid;
1770 sb.osd_fsid = store->get_fsid();
1771 sb.whoami = whoami;
1772 sb.compat_features = get_osd_initial_compat_set();
1773
1774 bufferlist bl;
1775 ::encode(sb, bl);
1776
1777 ObjectStore::Transaction t;
1778 t.create_collection(coll_t::meta(), 0);
1779 t.write(coll_t::meta(), OSD_SUPERBLOCK_GOBJECT, 0, bl.length(), bl);
1780 ret = store->apply_transaction(osr.get(), std::move(t));
1781 if (ret) {
1782 derr << "OSD::mkfs: error while writing OSD_SUPERBLOCK_GOBJECT: "
1783 << "apply_transaction returned " << cpp_strerror(ret) << dendl;
1784 goto umount_store;
1785 }
1786 }
1787
1788 if (!osr->flush_commit(&waiter)) {
1789 waiter.wait();
1790 }
1791
1792 ret = write_meta(cct, store, sb.cluster_fsid, sb.osd_fsid, whoami);
1793 if (ret) {
1794 derr << "OSD::mkfs: failed to write fsid file: error "
1795 << cpp_strerror(ret) << dendl;
1796 goto umount_store;
1797 }
1798
1799 umount_store:
1800 store->umount();
1801 free_store:
1802 delete store;
1803 return ret;
1804 }
1805
1806 int OSD::write_meta(CephContext *cct, ObjectStore *store, uuid_d& cluster_fsid, uuid_d& osd_fsid, int whoami)
1807 {
1808 char val[80];
1809 int r;
1810
1811 snprintf(val, sizeof(val), "%s", CEPH_OSD_ONDISK_MAGIC);
1812 r = store->write_meta("magic", val);
1813 if (r < 0)
1814 return r;
1815
1816 snprintf(val, sizeof(val), "%d", whoami);
1817 r = store->write_meta("whoami", val);
1818 if (r < 0)
1819 return r;
1820
1821 cluster_fsid.print(val);
1822 r = store->write_meta("ceph_fsid", val);
1823 if (r < 0)
1824 return r;
1825
1826 string key = cct->_conf->get_val<string>("key");
1827 if (key.size()) {
1828 r = store->write_meta("osd_key", key);
1829 if (r < 0)
1830 return r;
1831 } else {
1832 string keyfile = cct->_conf->get_val<string>("keyfile");
1833 if (!keyfile.empty()) {
1834 bufferlist keybl;
1835 string err;
1836 if (keyfile == "-") {
1837 static_assert(1024 * 1024 >
1838 (sizeof(CryptoKey) - sizeof(bufferptr) +
1839 sizeof(__u16) + 16 /* AES_KEY_LEN */ + 3 - 1) / 3. * 4.,
1840 "1MB should be enough for a base64 encoded CryptoKey");
1841 r = keybl.read_fd(STDIN_FILENO, 1024 * 1024);
1842 } else {
1843 r = keybl.read_file(keyfile.c_str(), &err);
1844 }
1845 if (r < 0) {
1846 derr << __func__ << " failed to read keyfile " << keyfile << ": "
1847 << err << ": " << cpp_strerror(r) << dendl;
1848 return r;
1849 }
1850 r = store->write_meta("osd_key", keybl.to_str());
1851 if (r < 0)
1852 return r;
1853 }
1854 }
1855
1856 r = store->write_meta("ready", "ready");
1857 if (r < 0)
1858 return r;
1859
1860 return 0;
1861 }
1862
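// Read back the identity written by write_meta().  A missing "fsid"
// entry is tolerated and leaves osd_fsid zeroed; an unparseable uuid
// yields -EINVAL.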
1863 int OSD::peek_meta(ObjectStore *store, std::string& magic,
1864 uuid_d& cluster_fsid, uuid_d& osd_fsid, int& whoami)
1865 {
1866 string val;
1867
1868 int r = store->read_meta("magic", &val);
1869 if (r < 0)
1870 return r;
1871 magic = val;
1872
1873 r = store->read_meta("whoami", &val);
1874 if (r < 0)
1875 return r;
1876 whoami = atoi(val.c_str());
1877
1878 r = store->read_meta("ceph_fsid", &val);
1879 if (r < 0)
1880 return r;
1881 r = cluster_fsid.parse(val.c_str());
1882 if (!r)
1883 return -EINVAL;
1884
1885 r = store->read_meta("fsid", &val);
1886 if (r < 0) {
1887 osd_fsid = uuid_d();
1888 } else {
1889 r = osd_fsid.parse(val.c_str());
1890 if (!r)
1891 return -EINVAL;
1892 }
1893
1894 return 0;
1895 }
1896
1897
1898 #undef dout_prefix
1899 #define dout_prefix _prefix(_dout, whoami, get_osdmap_epoch())
1900
1901 // cons/des
1902
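// The OSD is wired up with seven messengers: cluster (inter-OSD),
// client, objecter, and four heartbeat endpoints (front/back x
// client/server), plus the MonClient.  The heavy lifting is delegated
// to the thread pools and work queues initialized below.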
1903 OSD::OSD(CephContext *cct_, ObjectStore *store_,
1904 int id,
1905 Messenger *internal_messenger,
1906 Messenger *external_messenger,
1907 Messenger *hb_client_front,
1908 Messenger *hb_client_back,
1909 Messenger *hb_front_serverm,
1910 Messenger *hb_back_serverm,
1911 Messenger *osdc_messenger,
1912 MonClient *mc,
1913 const std::string &dev, const std::string &jdev) :
1914 Dispatcher(cct_),
1915 osd_lock("OSD::osd_lock"),
1916 tick_timer(cct, osd_lock),
1917 tick_timer_lock("OSD::tick_timer_lock"),
1918 tick_timer_without_osd_lock(cct, tick_timer_lock),
1919 authorize_handler_cluster_registry(new AuthAuthorizeHandlerRegistry(cct,
1920 cct->_conf->auth_supported.empty() ?
1921 cct->_conf->auth_cluster_required :
1922 cct->_conf->auth_supported)),
1923 authorize_handler_service_registry(new AuthAuthorizeHandlerRegistry(cct,
1924 cct->_conf->auth_supported.empty() ?
1925 cct->_conf->auth_service_required :
1926 cct->_conf->auth_supported)),
1927 cluster_messenger(internal_messenger),
1928 client_messenger(external_messenger),
1929 objecter_messenger(osdc_messenger),
1930 monc(mc),
1931 mgrc(cct_, client_messenger),
1932 logger(NULL),
1933 recoverystate_perf(NULL),
1934 store(store_),
1935 log_client(cct, client_messenger, &mc->monmap, LogClient::NO_FLAGS),
1936 clog(log_client.create_channel()),
1937 whoami(id),
1938 dev_path(dev), journal_path(jdev),
1939 store_is_rotational(store->is_rotational()),
1940 trace_endpoint("0.0.0.0", 0, "osd"),
1941 asok_hook(NULL),
1942 osd_compat(get_osd_compat_set()),
1943 peering_tp(cct, "OSD::peering_tp", "tp_peering",
1944 cct->_conf->osd_peering_wq_threads,
1945 "osd_peering_tp_threads"),
1946 osd_op_tp(cct, "OSD::osd_op_tp", "tp_osd_tp",
1947 get_num_op_threads()),
1948 disk_tp(cct, "OSD::disk_tp", "tp_osd_disk", cct->_conf->osd_disk_threads, "osd_disk_threads"),
1949 command_tp(cct, "OSD::command_tp", "tp_osd_cmd", 1),
1950 session_waiting_lock("OSD::session_waiting_lock"),
1951 osdmap_subscribe_lock("OSD::osdmap_subscribe_lock"),
1952 heartbeat_lock("OSD::heartbeat_lock"),
1953 heartbeat_stop(false),
1954 heartbeat_need_update(true),
1955 hb_front_client_messenger(hb_client_front),
1956 hb_back_client_messenger(hb_client_back),
1957 hb_front_server_messenger(hb_front_serverm),
1958 hb_back_server_messenger(hb_back_serverm),
1959 daily_loadavg(0.0),
1960 heartbeat_thread(this),
1961 heartbeat_dispatcher(this),
1962 op_tracker(cct, cct->_conf->osd_enable_op_tracker,
1963 cct->_conf->osd_num_op_tracker_shard),
1964 test_ops_hook(NULL),
1965 op_queue(get_io_queue()),
1966 op_prio_cutoff(get_io_prio_cut()),
1967 op_shardedwq(
1968 get_num_op_shards(),
1969 this,
1970 cct->_conf->osd_op_thread_timeout,
1971 cct->_conf->osd_op_thread_suicide_timeout,
1972 &osd_op_tp),
1973 peering_wq(
1974 this,
1975 cct->_conf->osd_op_thread_timeout,
1976 cct->_conf->osd_op_thread_suicide_timeout,
1977 &peering_tp),
1978 map_lock("OSD::map_lock"),
1979 pg_map_lock("OSD::pg_map_lock"),
1980 last_pg_create_epoch(0),
1981 mon_report_lock("OSD::mon_report_lock"),
1982 stats_ack_timeout(cct->_conf->osd_mon_ack_timeout),
1983 up_thru_wanted(0),
1984 requested_full_first(0),
1985 requested_full_last(0),
1986 pg_stat_queue_lock("OSD::pg_stat_queue_lock"),
1987 osd_stat_updated(false),
1988 pg_stat_tid(0), pg_stat_tid_flushed(0),
1989 command_wq(
1990 this,
1991 cct->_conf->osd_command_thread_timeout,
1992 cct->_conf->osd_command_thread_suicide_timeout,
1993 &command_tp),
1994 remove_wq(
1995 cct,
1996 store,
1997 cct->_conf->osd_remove_thread_timeout,
1998 cct->_conf->osd_remove_thread_suicide_timeout,
1999 &disk_tp),
2000 service(this)
2001 {
2002 monc->set_messenger(client_messenger);
2003 op_tracker.set_complaint_and_threshold(cct->_conf->osd_op_complaint_time,
2004 cct->_conf->osd_op_log_threshold);
2005 op_tracker.set_history_size_and_duration(cct->_conf->osd_op_history_size,
2006 cct->_conf->osd_op_history_duration);
2007 op_tracker.set_history_slow_op_size_and_threshold(cct->_conf->osd_op_history_slow_op_size,
2008 cct->_conf->osd_op_history_slow_op_threshold);
2009 #ifdef WITH_BLKIN
2010 std::stringstream ss;
2011 ss << "osd." << whoami;
2012 trace_endpoint.copy_name(ss.str());
2013 #endif
2014 }
2015
2016 OSD::~OSD()
2017 {
2018 delete authorize_handler_cluster_registry;
2019 delete authorize_handler_service_registry;
2020 delete class_handler;
2021 cct->get_perfcounters_collection()->remove(recoverystate_perf);
2022 cct->get_perfcounters_collection()->remove(logger);
2023 delete recoverystate_perf;
2024 delete logger;
2025 delete store;
2026 }
2027
2028 void cls_initialize(ClassHandler *ch);
2029
2030 void OSD::handle_signal(int signum)
2031 {
2032 assert(signum == SIGINT || signum == SIGTERM);
2033 derr << "*** Got signal " << sig_str(signum) << " ***" << dendl;
2034 shutdown();
2035 }
2036
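// Runs before init(): refuse to start if another process already has
// the object store mounted, then register as a config observer.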
2037 int OSD::pre_init()
2038 {
2039 Mutex::Locker lock(osd_lock);
2040 if (is_stopping())
2041 return 0;
2042
2043 if (store->test_mount_in_use()) {
2044 derr << "OSD::pre_init: object store '" << dev_path << "' is "
2045 << "currently in use. (Is ceph-osd already running?)" << dendl;
2046 return -EBUSY;
2047 }
2048
2049 cct->_conf->add_observer(this);
2050 return 0;
2051 }
2052
2053 // asok
2054
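// Admin socket ("asok") plumbing: the hook forwards every registered
// command to OSD::asok_command() and appends its stream output to the
// reply buffer.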
2055 class OSDSocketHook : public AdminSocketHook {
2056 OSD *osd;
2057 public:
2058 explicit OSDSocketHook(OSD *o) : osd(o) {}
2059 bool call(std::string admin_command, cmdmap_t& cmdmap, std::string format,
2060 bufferlist& out) override {
2061 stringstream ss;
2062 bool r = osd->asok_command(admin_command, cmdmap, format, ss);
2063 out.append(ss);
2064 return r;
2065 }
2066 };
2067
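// Dispatch a single admin socket command.  The command set below must
// stay in sync with the registrations in OSD::final_init().  These are
// typically driven from the CLI, e.g.:
//
//   ceph daemon osd.0 status
//   ceph daemon osd.0 dump_ops_in_flight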
2068 bool OSD::asok_command(string admin_command, cmdmap_t& cmdmap, string format,
2069 ostream& ss)
2070 {
2071 Formatter *f = Formatter::create(format, "json-pretty", "json-pretty");
2072 if (admin_command == "status") {
2073 f->open_object_section("status");
2074 f->dump_stream("cluster_fsid") << superblock.cluster_fsid;
2075 f->dump_stream("osd_fsid") << superblock.osd_fsid;
2076 f->dump_unsigned("whoami", superblock.whoami);
2077 f->dump_string("state", get_state_name(get_state()));
2078 f->dump_unsigned("oldest_map", superblock.oldest_map);
2079 f->dump_unsigned("newest_map", superblock.newest_map);
2080 {
2081 RWLock::RLocker l(pg_map_lock);
2082 f->dump_unsigned("num_pgs", pg_map.size());
2083 }
2084 f->close_section();
2085 } else if (admin_command == "flush_journal") {
2086 store->flush_journal();
2087 } else if (admin_command == "dump_ops_in_flight" ||
2088 admin_command == "ops" ||
2089 admin_command == "dump_blocked_ops" ||
2090 admin_command == "dump_historic_ops" ||
2091 admin_command == "dump_historic_ops_by_duration" ||
2092 admin_command == "dump_historic_slow_ops") {
2093
2094 const string error_str = "op_tracker tracking is not enabled now, so no ops are currently tracked, \
2095 not even those that are stuck. Please enable \"osd_enable_op_tracker\", and the tracker \
2096 will start tracking new ops received afterwards.";
2097
2098 set<string> filters;
2099 vector<string> filter_str;
2100 if (cmd_getval(cct, cmdmap, "filterstr", filter_str)) {
2101 copy(filter_str.begin(), filter_str.end(),
2102 inserter(filters, filters.end()));
2103 }
2104
2105 if (admin_command == "dump_ops_in_flight" ||
2106 admin_command == "ops") {
2107 if (!op_tracker.dump_ops_in_flight(f, false, filters)) {
2108 ss << error_str;
2109 }
2110 }
2111 if (admin_command == "dump_blocked_ops") {
2112 if (!op_tracker.dump_ops_in_flight(f, true, filters)) {
2113 ss << error_str;
2114 }
2115 }
2116 if (admin_command == "dump_historic_ops") {
2117 if (!op_tracker.dump_historic_ops(f, false, filters)) {
2118 ss << error_str;
2119 }
2120 }
2121 if (admin_command == "dump_historic_ops_by_duration") {
2122 if (!op_tracker.dump_historic_ops(f, true, filters)) {
2123 ss << error_str;
2124 }
2125 }
2126 if (admin_command == "dump_historic_slow_ops") {
2127 if (!op_tracker.dump_historic_slow_ops(f, filters)) {
2128 ss << error_str;
2129 }
2130 }
2131 } else if (admin_command == "dump_op_pq_state") {
2132 f->open_object_section("pq");
2133 op_shardedwq.dump(f);
2134 f->close_section();
2135 } else if (admin_command == "dump_blacklist") {
2136 list<pair<entity_addr_t,utime_t> > bl;
2137 OSDMapRef curmap = service.get_osdmap();
2138
2139 f->open_array_section("blacklist");
2140 curmap->get_blacklist(&bl);
2141 for (list<pair<entity_addr_t,utime_t> >::iterator it = bl.begin();
2142 it != bl.end(); ++it) {
2143 f->open_object_section("entry");
2144 f->open_object_section("entity_addr_t");
2145 it->first.dump(f);
2146 f->close_section(); //entity_addr_t
2147 it->second.localtime(f->dump_stream("expire_time"));
2148 f->close_section(); //entry
2149 }
2150 f->close_section(); //blacklist
2151 } else if (admin_command == "dump_watchers") {
2152 list<obj_watch_item_t> watchers;
2153 // scan pg's
2154 {
2155 Mutex::Locker l(osd_lock);
2156 RWLock::RLocker l2(pg_map_lock);
2157 for (ceph::unordered_map<spg_t,PG*>::iterator it = pg_map.begin();
2158 it != pg_map.end();
2159 ++it) {
2160
2161 list<obj_watch_item_t> pg_watchers;
2162 PG *pg = it->second;
2163 pg->lock();
2164 pg->get_watchers(pg_watchers);
2165 pg->unlock();
2166 watchers.splice(watchers.end(), pg_watchers);
2167 }
2168 }
2169
2170 f->open_array_section("watchers");
2171 for (list<obj_watch_item_t>::iterator it = watchers.begin();
2172 it != watchers.end(); ++it) {
2173
2174 f->open_object_section("watch");
2175
2176 f->dump_string("namespace", it->obj.nspace);
2177 f->dump_string("object", it->obj.oid.name);
2178
2179 f->open_object_section("entity_name");
2180 it->wi.name.dump(f);
2181 f->close_section(); //entity_name_t
2182
2183 f->dump_unsigned("cookie", it->wi.cookie);
2184 f->dump_unsigned("timeout", it->wi.timeout_seconds);
2185
2186 f->open_object_section("entity_addr_t");
2187 it->wi.addr.dump(f);
2188 f->close_section(); //entity_addr_t
2189
2190 f->close_section(); //watch
2191 }
2192
2193 f->close_section(); //watchers
2194 } else if (admin_command == "dump_reservations") {
2195 f->open_object_section("reservations");
2196 f->open_object_section("local_reservations");
2197 service.local_reserver.dump(f);
2198 f->close_section();
2199 f->open_object_section("remote_reservations");
2200 service.remote_reserver.dump(f);
2201 f->close_section();
2202 f->close_section();
2203 } else if (admin_command == "get_latest_osdmap") {
2204 get_latest_osdmap();
2205 } else if (admin_command == "heap") {
2206 auto result = ceph::osd_cmds::heap(*cct, cmdmap, *f, ss);
2207
2208 // Note: Failed heap profile commands won't necessarily trigger an error:
2209 f->open_object_section("result");
2210 f->dump_string("error", cpp_strerror(result));
2211 f->dump_bool("success", result >= 0);
2212 f->close_section();
2213 } else if (admin_command == "set_heap_property") {
2214 string property;
2215 int64_t value = 0;
2216 string error;
2217 bool success = false;
2218 if (!cmd_getval(cct, cmdmap, "property", property)) {
2219 error = "unable to get property";
2220 success = false;
2221 } else if (!cmd_getval(cct, cmdmap, "value", value)) {
2222 error = "unable to get value";
2223 success = false;
2224 } else if (value < 0) {
2225 error = "negative value not allowed";
2226 success = false;
2227 } else if (!ceph_heap_set_numeric_property(property.c_str(), (size_t)value)) {
2228 error = "invalid property";
2229 success = false;
2230 } else {
2231 success = true;
2232 }
2233 f->open_object_section("result");
2234 f->dump_string("error", error);
2235 f->dump_bool("success", success);
2236 f->close_section();
2237 } else if (admin_command == "get_heap_property") {
2238 string property;
2239 size_t value = 0;
2240 string error;
2241 bool success = false;
2242 if (!cmd_getval(cct, cmdmap, "property", property)) {
2243 error = "unable to get property";
2244 success = false;
2245 } else if (!ceph_heap_get_numeric_property(property.c_str(), &value)) {
2246 error = "invalid property";
2247 success = false;
2248 } else {
2249 success = true;
2250 }
2251 f->open_object_section("result");
2252 f->dump_string("error", error);
2253 f->dump_bool("success", success);
2254 f->dump_int("value", value);
2255 f->close_section();
2256 } else if (admin_command == "dump_objectstore_kv_stats") {
2257 store->get_db_statistics(f);
2258 } else if (admin_command == "dump_scrubs") {
2259 service.dumps_scrub(f);
2260 } else if (admin_command == "calc_objectstore_db_histogram") {
2261 store->generate_db_histogram(f);
2262 } else if (admin_command == "flush_store_cache") {
2263 store->flush_cache();
2264 } else if (admin_command == "dump_pgstate_history") {
2265 f->open_object_section("pgstate_history");
2266 RWLock::RLocker l2(pg_map_lock);
2267 for (ceph::unordered_map<spg_t,PG*>::iterator it = pg_map.begin();
2268 it != pg_map.end();
2269 ++it) {
2270
2271 PG *pg = it->second;
2272 f->dump_stream("pg") << pg->get_pgid();
2273 pg->lock();
2274 pg->pgstate_history.dump(f);
2275 pg->unlock();
2276 }
2277 f->close_section();
2278 } else if (admin_command == "compact") {
2279 dout(1) << "triggering manual compaction" << dendl;
2280 auto start = ceph::coarse_mono_clock::now();
2281 store->compact();
2282 auto end = ceph::coarse_mono_clock::now();
2283 auto time_span = chrono::duration_cast<chrono::duration<double>>(end - start);
2284 dout(1) << "finished manual compaction in "
2285 << time_span.count()
2286 << " seconds" << dendl;
2287 f->open_object_section("compact_result");
2288 f->dump_float("elapsed_time", time_span.count());
2289 f->close_section();
2290 } else {
2291 assert(0 == "broken asok registration");
2292 }
2293 f->flush(ss);
2294 delete f;
2295 return true;
2296 }
2297
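// Debug/test admin socket hook (setomapval, injectdataerr, injectfull,
// ...); the commands are registered alongside the regular asok hooks
// in final_init() and are intended for testing only.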
2298 class TestOpsSocketHook : public AdminSocketHook {
2299 OSDService *service;
2300 ObjectStore *store;
2301 public:
2302 TestOpsSocketHook(OSDService *s, ObjectStore *st) : service(s), store(st) {}
2303 bool call(std::string command, cmdmap_t& cmdmap, std::string format,
2304 bufferlist& out) override {
2305 stringstream ss;
2306 test_ops(service, store, command, cmdmap, ss);
2307 out.append(ss);
2308 return true;
2309 }
2310 void test_ops(OSDService *service, ObjectStore *store,
2311 const std::string &command, cmdmap_t& cmdmap, ostream &ss);
2312
2313 };
2314
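// Timer callbacks for the two periodic ticks: one runs under osd_lock
// (tick_timer), the other only under tick_timer_lock
// (tick_timer_without_osd_lock).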
2315 class OSD::C_Tick : public Context {
2316 OSD *osd;
2317 public:
2318 explicit C_Tick(OSD *o) : osd(o) {}
2319 void finish(int r) override {
2320 osd->tick();
2321 }
2322 };
2323
2324 class OSD::C_Tick_WithoutOSDLock : public Context {
2325 OSD *osd;
2326 public:
2327 explicit C_Tick_WithoutOSDLock(OSD *o) : osd(o) {}
2328 void finish(int r) override {
2329 osd->tick_without_osd_lock();
2330 }
2331 };
2332
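// Mount or tear down a FUSE view of the object store at
// <osd_data>/fuse, depending on the osd_objectstore_fuse option (and
// unconditionally tear it down when stop == true).  Compiled out
// entirely without HAVE_LIBFUSE.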
2333 int OSD::enable_disable_fuse(bool stop)
2334 {
2335 #ifdef HAVE_LIBFUSE
2336 int r;
2337 string mntpath = cct->_conf->osd_data + "/fuse";
2338 if (fuse_store && (stop || !cct->_conf->osd_objectstore_fuse)) {
2339 dout(1) << __func__ << " disabling" << dendl;
2340 fuse_store->stop();
2341 delete fuse_store;
2342 fuse_store = NULL;
2343 r = ::rmdir(mntpath.c_str());
2344 if (r < 0) {
2345 r = -errno;
2346 derr << __func__ << " failed to rmdir " << mntpath << ": "
2347 << cpp_strerror(r) << dendl;
2348 return r;
2349 }
2350 return 0;
2351 }
2352 if (!fuse_store && cct->_conf->osd_objectstore_fuse) {
2353 dout(1) << __func__ << " enabling" << dendl;
2354 r = ::mkdir(mntpath.c_str(), 0700);
2355 if (r < 0)
2356 r = -errno;
2357 if (r < 0 && r != -EEXIST) {
2358 derr << __func__ << " unable to create " << mntpath << ": "
2359 << cpp_strerror(r) << dendl;
2360 return r;
2361 }
2362 fuse_store = new FuseStore(store, mntpath);
2363 r = fuse_store->start();
2364 if (r < 0) {
2365 derr << __func__ << " unable to start fuse: " << cpp_strerror(r) << dendl;
2366 delete fuse_store;
2367 fuse_store = NULL;
2368 return r;
2369 }
2370 }
2371 #endif // HAVE_LIBFUSE
2372 return 0;
2373 }
2374
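// Sizing for the sharded op queue: explicit osd_op_num_shards /
// osd_op_num_threads_per_shard values win; otherwise the hdd or ssd
// default is chosen based on whether the backing store is rotational.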
2375 int OSD::get_num_op_shards()
2376 {
2377 if (cct->_conf->osd_op_num_shards)
2378 return cct->_conf->osd_op_num_shards;
2379 if (store_is_rotational)
2380 return cct->_conf->osd_op_num_shards_hdd;
2381 else
2382 return cct->_conf->osd_op_num_shards_ssd;
2383 }
2384
2385 int OSD::get_num_op_threads()
2386 {
2387 if (cct->_conf->osd_op_num_threads_per_shard)
2388 return get_num_op_shards() * cct->_conf->osd_op_num_threads_per_shard;
2389 if (store_is_rotational)
2390 return get_num_op_shards() * cct->_conf->osd_op_num_threads_per_shard_hdd;
2391 else
2392 return get_num_op_shards() * cct->_conf->osd_op_num_threads_per_shard_ssd;
2393 }
2394
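// Recovery throttling sleep: an explicit osd_recovery_sleep wins;
// otherwise choose the ssd, hybrid (hdd store + ssd journal), or hdd
// default.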
2395 float OSD::get_osd_recovery_sleep()
2396 {
2397 if (cct->_conf->osd_recovery_sleep)
2398 return cct->_conf->osd_recovery_sleep;
2399 if (!store_is_rotational && !journal_is_rotational)
2400 return cct->_conf->osd_recovery_sleep_ssd;
2401 else if (store_is_rotational && !journal_is_rotational)
2402 return cct->_conf->get_val<double>("osd_recovery_sleep_hybrid");
2403 else
2404 return cct->_conf->osd_recovery_sleep_hdd;
2405 }
2406
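// Bring the OSD up: mount the store, validate the superblock and
// compat features, load the current OSDMap and all PGs, start the
// thread pools, heartbeat thread and tick timers, authenticate with
// the monitors (and mgr), and finally kick off start_boot().  On the
// error path the store is unmounted and deleted (authentication
// failures exit outright).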
2407 int OSD::init()
2408 {
2409 CompatSet initial, diff;
2410 Mutex::Locker lock(osd_lock);
2411 if (is_stopping())
2412 return 0;
2413
2414 tick_timer.init();
2415 tick_timer_without_osd_lock.init();
2416 service.recovery_request_timer.init();
2417 service.recovery_sleep_timer.init();
2418
2419 // mount.
2420 dout(2) << "init " << dev_path
2421 << " (looks like " << (store_is_rotational ? "hdd" : "ssd") << ")"
2422 << dendl;
2423 dout(2) << "journal " << journal_path << dendl;
2424 assert(store); // call pre_init() first!
2425
2426 store->set_cache_shards(get_num_op_shards());
2427
2428 int r = store->mount();
2429 if (r < 0) {
2430 derr << "OSD:init: unable to mount object store" << dendl;
2431 return r;
2432 }
2433 journal_is_rotational = store->is_journal_rotational();
2434 dout(2) << "journal looks like " << (journal_is_rotational ? "hdd" : "ssd")
2435 << dendl;
2436
2437 enable_disable_fuse(false);
2438
2439 dout(2) << "boot" << dendl;
2440
2441 // initialize the daily loadavg with current 15min loadavg
2442 double loadavgs[3];
2443 if (getloadavg(loadavgs, 3) == 3) {
2444 daily_loadavg = loadavgs[2];
2445 } else {
2446 derr << "OSD::init(): couldn't read loadavgs" << dendl;
2447 daily_loadavg = 1.0;
2448 }
2449
2450 int rotating_auth_attempts = 0;
2451
2452 // sanity check long object name handling
2453 {
2454 hobject_t l;
2455 l.oid.name = string(cct->_conf->osd_max_object_name_len, 'n');
2456 l.set_key(string(cct->_conf->osd_max_object_name_len, 'k'));
2457 l.nspace = string(cct->_conf->osd_max_object_namespace_len, 's');
2458 r = store->validate_hobject_key(l);
2459 if (r < 0) {
2460 derr << "backend (" << store->get_type() << ") is unable to support max "
2461 << "object name[space] len" << dendl;
2462 derr << " osd max object name len = "
2463 << cct->_conf->osd_max_object_name_len << dendl;
2464 derr << " osd max object namespace len = "
2465 << cct->_conf->osd_max_object_namespace_len << dendl;
2466 derr << cpp_strerror(r) << dendl;
2467 if (cct->_conf->osd_check_max_object_name_len_on_startup) {
2468 goto out;
2469 }
2470 derr << "osd_check_max_object_name_len_on_startup = false, starting anyway"
2471 << dendl;
2472 } else {
2473 dout(20) << "configured osd_max_object_name[space]_len looks ok" << dendl;
2474 }
2475 }
2476
2477 // read superblock
2478 r = read_superblock();
2479 if (r < 0) {
2480 derr << "OSD::init() : unable to read osd superblock" << dendl;
2481 r = -EINVAL;
2482 goto out;
2483 }
2484
2485 if (osd_compat.compare(superblock.compat_features) < 0) {
2486 derr << "The disk uses features unsupported by the executable." << dendl;
2487 derr << " ondisk features " << superblock.compat_features << dendl;
2488 derr << " daemon features " << osd_compat << dendl;
2489
2490 if (osd_compat.writeable(superblock.compat_features)) {
2491 CompatSet diff = osd_compat.unsupported(superblock.compat_features);
2492 derr << "it is still writeable, though. Missing features: " << diff << dendl;
2493 r = -EOPNOTSUPP;
2494 goto out;
2495 }
2496 else {
2497 CompatSet diff = osd_compat.unsupported(superblock.compat_features);
2498 derr << "Cannot write to disk! Missing features: " << diff << dendl;
2499 r = -EOPNOTSUPP;
2500 goto out;
2501 }
2502 }
2503
2504 assert_warn(whoami == superblock.whoami);
2505 if (whoami != superblock.whoami) {
2506 derr << "OSD::init: superblock says osd."
2507 << superblock.whoami << " but I am osd." << whoami << dendl;
2508 r = -EINVAL;
2509 goto out;
2510 }
2511
2512 initial = get_osd_initial_compat_set();
2513 diff = superblock.compat_features.unsupported(initial);
2514 if (superblock.compat_features.merge(initial)) {
2515 // We need to persist the new compat_set before we
2516 // do anything else
2517 dout(5) << "Upgrading superblock adding: " << diff << dendl;
2518 ObjectStore::Transaction t;
2519 write_superblock(t);
2520 r = store->apply_transaction(service.meta_osr.get(), std::move(t));
2521 if (r < 0)
2522 goto out;
2523 }
2524
2525 // make sure snap mapper object exists
2526 if (!store->exists(coll_t::meta(), OSD::make_snapmapper_oid())) {
2527 dout(10) << "init creating/touching snapmapper object" << dendl;
2528 ObjectStore::Transaction t;
2529 t.touch(coll_t::meta(), OSD::make_snapmapper_oid());
2530 r = store->apply_transaction(service.meta_osr.get(), std::move(t));
2531 if (r < 0)
2532 goto out;
2533 }
2534
2535 class_handler = new ClassHandler(cct);
2536 cls_initialize(class_handler);
2537
2538 if (cct->_conf->osd_open_classes_on_start) {
2539 int r = class_handler->open_all_classes();
2540 if (r)
2541 dout(1) << "warning: got an error loading one or more classes: " << cpp_strerror(r) << dendl;
2542 }
2543
2544 // load up "current" osdmap
2545 assert_warn(!osdmap);
2546 if (osdmap) {
2547 derr << "OSD::init: unable to read current osdmap" << dendl;
2548 r = -EINVAL;
2549 goto out;
2550 }
2551 osdmap = get_map(superblock.current_epoch);
2552 check_osdmap_features(store);
2553
2554 create_recoverystate_perf();
2555
2556 {
2557 epoch_t bind_epoch = osdmap->get_epoch();
2558 service.set_epochs(NULL, NULL, &bind_epoch);
2559 }
2560
2561 clear_temp_objects();
2562
2563 // initialize osdmap references in sharded wq
2564 op_shardedwq.prune_pg_waiters(osdmap, whoami);
2565
2566 // load up pgs (as they previously existed)
2567 load_pgs();
2568
2569 dout(2) << "superblock: I am osd." << superblock.whoami << dendl;
2570 dout(0) << "using " << op_queue << " op queue with priority op cut off at " <<
2571 op_prio_cutoff << "." << dendl;
2572
2573 create_logger();
2574
2575 // i'm ready!
2576 client_messenger->add_dispatcher_head(this);
2577 cluster_messenger->add_dispatcher_head(this);
2578
2579 hb_front_client_messenger->add_dispatcher_head(&heartbeat_dispatcher);
2580 hb_back_client_messenger->add_dispatcher_head(&heartbeat_dispatcher);
2581 hb_front_server_messenger->add_dispatcher_head(&heartbeat_dispatcher);
2582 hb_back_server_messenger->add_dispatcher_head(&heartbeat_dispatcher);
2583
2584 objecter_messenger->add_dispatcher_head(service.objecter);
2585
2586 monc->set_want_keys(CEPH_ENTITY_TYPE_MON | CEPH_ENTITY_TYPE_OSD
2587 | CEPH_ENTITY_TYPE_MGR);
2588 r = monc->init();
2589 if (r < 0)
2590 goto out;
2591
2592 /**
2593 * FIXME: this is a placeholder implementation that unconditionally
2594 * sends every is_primary PG's stats every time we're called, unlike
2595 * the existing mon PGStats mechanism that uses pg_stat_queue and acks.
2596 * This has equivalent cost to the existing worst case where all
2597 * PGs are busy and their stats are always enqueued for sending.
2598 */
2599 mgrc.set_pgstats_cb([this](){
2600 RWLock::RLocker l(map_lock);
2601
2602 utime_t had_for = ceph_clock_now() - had_map_since;
2603 osd_stat_t cur_stat = service.get_osd_stat();
2604 cur_stat.os_perf_stat = store->get_cur_stats();
2605
2606 MPGStats *m = new MPGStats(monc->get_fsid(), osdmap->get_epoch(), had_for);
2607 m->osd_stat = cur_stat;
2608
2609 Mutex::Locker lec{min_last_epoch_clean_lock};
2610 min_last_epoch_clean = osdmap->get_epoch();
2611 min_last_epoch_clean_pgs.clear();
2612 RWLock::RLocker lpg(pg_map_lock);
2613 for (const auto &i : pg_map) {
2614 PG *pg = i.second;
2615 if (!pg->is_primary()) {
2616 continue;
2617 }
2618
2619 pg->pg_stats_publish_lock.Lock();
2620 if (pg->pg_stats_publish_valid) {
2621 m->pg_stat[pg->info.pgid.pgid] = pg->pg_stats_publish;
2622 const auto lec = pg->pg_stats_publish.get_effective_last_epoch_clean();
2623 min_last_epoch_clean = min(min_last_epoch_clean, lec);
2624 min_last_epoch_clean_pgs.push_back(pg->info.pgid.pgid);
2625 }
2626 pg->pg_stats_publish_lock.Unlock();
2627 }
2628
2629 return m;
2630 });
2631
2632 mgrc.init();
2633 client_messenger->add_dispatcher_head(&mgrc);
2634
2635 // tell monc about log_client so it will know about mon session resets
2636 monc->set_log_client(&log_client);
2637 update_log_config();
2638
2639 peering_tp.start();
2640 osd_op_tp.start();
2641 disk_tp.start();
2642 command_tp.start();
2643
2644 set_disk_tp_priority();
2645
2646 // start the heartbeat
2647 heartbeat_thread.create("osd_srv_heartbt");
2648
2649 // tick
2650 tick_timer.add_event_after(cct->_conf->osd_heartbeat_interval, new C_Tick(this));
2651 {
2652 Mutex::Locker l(tick_timer_lock);
2653 tick_timer_without_osd_lock.add_event_after(cct->_conf->osd_heartbeat_interval, new C_Tick_WithoutOSDLock(this));
2654 }
2655
2656 service.init();
2657 service.publish_map(osdmap);
2658 service.publish_superblock(superblock);
2659 service.max_oldest_map = superblock.oldest_map;
2660
2661 osd_lock.Unlock();
2662
2663 r = monc->authenticate();
2664 if (r < 0) {
2665 derr << __func__ << " authentication failed: " << cpp_strerror(r)
2666 << dendl;
2667 osd_lock.Lock(); // locker is going to unlock this on function exit
2668 if (is_stopping())
2669 r = 0;
2670 goto monout;
2671 }
2672
2673 while (monc->wait_auth_rotating(30.0) < 0) {
2674 derr << "unable to obtain rotating service keys; retrying" << dendl;
2675 ++rotating_auth_attempts;
2676 if (rotating_auth_attempts > g_conf->max_rotating_auth_attempts) {
2677 derr << __func__ << " wait_auth_rotating timed out" << dendl;
2678 osd_lock.Lock(); // make locker happy
2679 if (!is_stopping()) {
2680 r = -ETIMEDOUT;
2681 }
2682 goto monout;
2683 }
2684 }
2685
2686 r = update_crush_device_class();
2687 if (r < 0) {
2688 derr << __func__ << " unable to update_crush_device_class: "
2689 << cpp_strerror(r) << dendl;
2690 osd_lock.Lock();
2691 goto monout;
2692 }
2693
2694 r = update_crush_location();
2695 if (r < 0) {
2696 derr << __func__ << " unable to update_crush_location: "
2697 << cpp_strerror(r) << dendl;
2698 osd_lock.Lock();
2699 goto monout;
2700 }
2701
2702 osd_lock.Lock();
2703 if (is_stopping())
2704 return 0;
2705
2706 // start objecter *after* we have authenticated, so that we don't ignore
2707 // the OSDMaps it requests.
2708 service.final_init();
2709
2710 check_config();
2711
2712 dout(10) << "ensuring pgs have consumed prior maps" << dendl;
2713 consume_map();
2714 peering_wq.drain();
2715
2716 dout(0) << "done with init, starting boot process" << dendl;
2717
2718 // subscribe to any pg creations
2719 monc->sub_want("osd_pg_creates", last_pg_create_epoch, 0);
2720
2721 // MgrClient needs this (it doesn't have MonClient reference itself)
2722 monc->sub_want("mgrmap", 0, 0);
2723
2724 // we don't need to ask for an osdmap here; the objecter will do that for us
2725 //monc->sub_want("osdmap", osdmap->get_epoch(), CEPH_SUBSCRIBE_ONETIME);
2726
2727 monc->renew_subs();
2728
2729 start_boot();
2730
2731 return 0;
2732 monout:
2733 exit(1);
2734
2735 out:
2736 enable_disable_fuse(true);
2737 store->umount();
2738 delete store;
2739 store = NULL;
2740 return r;
2741 }
2742
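// Register the admin socket command table (dispatched by
// OSD::asok_command()) and the TestOps debug hooks; every registration
// is expected to succeed.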
2743 void OSD::final_init()
2744 {
2745 AdminSocket *admin_socket = cct->get_admin_socket();
2746 asok_hook = new OSDSocketHook(this);
2747 int r = admin_socket->register_command("status", "status", asok_hook,
2748 "high-level status of OSD");
2749 assert(r == 0);
2750 r = admin_socket->register_command("flush_journal", "flush_journal",
2751 asok_hook,
2752 "flush the journal to permanent store");
2753 assert(r == 0);
2754 r = admin_socket->register_command("dump_ops_in_flight",
2755 "dump_ops_in_flight " \
2756 "name=filterstr,type=CephString,n=N,req=false",
2757 asok_hook,
2758 "show the ops currently in flight");
2759 assert(r == 0);
2760 r = admin_socket->register_command("ops",
2761 "ops " \
2762 "name=filterstr,type=CephString,n=N,req=false",
2763 asok_hook,
2764 "show the ops currently in flight");
2765 assert(r == 0);
2766 r = admin_socket->register_command("dump_blocked_ops",
2767 "dump_blocked_ops " \
2768 "name=filterstr,type=CephString,n=N,req=false",
2769 asok_hook,
2770 "show the blocked ops currently in flight");
2771 assert(r == 0);
2772 r = admin_socket->register_command("dump_historic_ops",
2773 "dump_historic_ops " \
2774 "name=filterstr,type=CephString,n=N,req=false",
2775 asok_hook,
2776 "show recent ops");
2777 assert(r == 0);
2778 r = admin_socket->register_command("dump_historic_slow_ops",
2779 "dump_historic_slow_ops " \
2780 "name=filterstr,type=CephString,n=N,req=false",
2781 asok_hook,
2782 "show slowest recent ops");
2783 assert(r == 0);
2784 r = admin_socket->register_command("dump_historic_ops_by_duration",
2785 "dump_historic_ops_by_duration " \
2786 "name=filterstr,type=CephString,n=N,req=false",
2787 asok_hook,
2788 "show slowest recent ops, sorted by duration");
2789 assert(r == 0);
2790 r = admin_socket->register_command("dump_op_pq_state", "dump_op_pq_state",
2791 asok_hook,
2792 "dump op priority queue state");
2793 assert(r == 0);
2794 r = admin_socket->register_command("dump_blacklist", "dump_blacklist",
2795 asok_hook,
2796 "dump blacklisted clients and times");
2797 assert(r == 0);
2798 r = admin_socket->register_command("dump_watchers", "dump_watchers",
2799 asok_hook,
2800 "show clients which have active watches,"
2801 " and on which objects");
2802 assert(r == 0);
2803 r = admin_socket->register_command("dump_reservations", "dump_reservations",
2804 asok_hook,
2805 "show recovery reservations");
2806 assert(r == 0);
2807 r = admin_socket->register_command("get_latest_osdmap", "get_latest_osdmap",
2808 asok_hook,
2809 "force osd to update the latest map from "
2810 "the mon");
2811 assert(r == 0);
2812
2813 r = admin_socket->register_command( "heap",
2814 "heap " \
2815 "name=heapcmd,type=CephString",
2816 asok_hook,
2817 "show heap usage info (available only if "
2818 "compiled with tcmalloc)");
2819 assert(r == 0);
2820
2821 r = admin_socket->register_command("set_heap_property",
2822 "set_heap_property " \
2823 "name=property,type=CephString " \
2824 "name=value,type=CephInt",
2825 asok_hook,
2826 "update malloc extension heap property");
2827 assert(r == 0);
2828
2829 r = admin_socket->register_command("get_heap_property",
2830 "get_heap_property " \
2831 "name=property,type=CephString",
2832 asok_hook,
2833 "get malloc extension heap property");
2834 assert(r == 0);
2835
2836 r = admin_socket->register_command("dump_objectstore_kv_stats",
2837 "dump_objectstore_kv_stats",
2838 asok_hook,
2839 "print statistics of the kvdb used by bluestore");
2840 assert(r == 0);
2841
2842 r = admin_socket->register_command("dump_scrubs",
2843 "dump_scrubs",
2844 asok_hook,
2845 "print scheduled scrubs");
2846 assert(r == 0);
2847
2848 r = admin_socket->register_command("calc_objectstore_db_histogram",
2849 "calc_objectstore_db_histogram",
2850 asok_hook,
2851 "Generate key value histogram of the kvdb (rocksdb) used by bluestore");
2852 assert(r == 0);
2853
2854 r = admin_socket->register_command("flush_store_cache",
2855 "flush_store_cache",
2856 asok_hook,
2857 "Flush bluestore internal cache");
2858 assert(r == 0);
2859 r = admin_socket->register_command("dump_pgstate_history", "dump_pgstate_history",
2860 asok_hook,
2861 "show recent state history");
2862 assert(r == 0);
2863
2864 r = admin_socket->register_command("compact", "compact",
2865 asok_hook,
2866 "Compact object store's omap."
2867 " WARNING: Compaction probably slows your requests");
2868 assert(r == 0);
2869
2870 test_ops_hook = new TestOpsSocketHook(&(this->service), this->store);
2871 // Note: pools are CephString instead of CephPoolname because
2872 // these commands traditionally support both pool names and numbers
2873 r = admin_socket->register_command(
2874 "setomapval",
2875 "setomapval " \
2876 "name=pool,type=CephString " \
2877 "name=objname,type=CephObjectname " \
2878 "name=key,type=CephString "\
2879 "name=val,type=CephString",
2880 test_ops_hook,
2881 "set omap key");
2882 assert(r == 0);
2883 r = admin_socket->register_command(
2884 "rmomapkey",
2885 "rmomapkey " \
2886 "name=pool,type=CephString " \
2887 "name=objname,type=CephObjectname " \
2888 "name=key,type=CephString",
2889 test_ops_hook,
2890 "remove omap key");
2891 assert(r == 0);
2892 r = admin_socket->register_command(
2893 "setomapheader",
2894 "setomapheader " \
2895 "name=pool,type=CephString " \
2896 "name=objname,type=CephObjectname " \
2897 "name=header,type=CephString",
2898 test_ops_hook,
2899 "set omap header");
2900 assert(r == 0);
2901
2902 r = admin_socket->register_command(
2903 "getomap",
2904 "getomap " \
2905 "name=pool,type=CephString " \
2906 "name=objname,type=CephObjectname",
2907 test_ops_hook,
2908 "output entire object map");
2909 assert(r == 0);
2910
2911 r = admin_socket->register_command(
2912 "truncobj",
2913 "truncobj " \
2914 "name=pool,type=CephString " \
2915 "name=objname,type=CephObjectname " \
2916 "name=len,type=CephInt",
2917 test_ops_hook,
2918 "truncate object to length");
2919 assert(r == 0);
2920
2921 r = admin_socket->register_command(
2922 "injectdataerr",
2923 "injectdataerr " \
2924 "name=pool,type=CephString " \
2925 "name=objname,type=CephObjectname " \
2926 "name=shardid,type=CephInt,req=false,range=0|255",
2927 test_ops_hook,
2928 "inject data error to an object");
2929 assert(r == 0);
2930
2931 r = admin_socket->register_command(
2932 "injectmdataerr",
2933 "injectmdataerr " \
2934 "name=pool,type=CephString " \
2935 "name=objname,type=CephObjectname " \
2936 "name=shardid,type=CephInt,req=false,range=0|255",
2937 test_ops_hook,
2938 "inject metadata error to an object");
2939 assert(r == 0);
2940 r = admin_socket->register_command(
2941 "set_recovery_delay",
2942 "set_recovery_delay " \
2943 "name=utime,type=CephInt,req=false",
2944 test_ops_hook,
2945 "Delay osd recovery by specified seconds");
2946 assert(r == 0);
2947 r = admin_socket->register_command(
2948 "trigger_scrub",
2949 "trigger_scrub " \
2950 "name=pgid,type=CephString",
2951 test_ops_hook,
2952 "Trigger a scheduled scrub");
2953 assert(r == 0);
2954 r = admin_socket->register_command(
2955 "injectfull",
2956 "injectfull " \
2957 "name=type,type=CephString,req=false " \
2958 "name=count,type=CephInt,req=false ",
2959 test_ops_hook,
2960 "Inject a full disk (optional count times)");
2961 assert(r == 0);
2962 }
2963
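// Build the "osd" perf counters (client op counts, latency averages
// and histograms, recovery, tiering, cache and OSDMap stats) and
// register them with the context's collection.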
2964 void OSD::create_logger()
2965 {
2966 dout(10) << "create_logger" << dendl;
2967
2968 PerfCountersBuilder osd_plb(cct, "osd", l_osd_first, l_osd_last);
2969
2970 // Latency axis configuration for op histograms, values are in nanoseconds
2971 PerfHistogramCommon::axis_config_d op_hist_x_axis_config{
2972 "Latency (usec)",
2973 PerfHistogramCommon::SCALE_LOG2, ///< Latency in logarithmic scale
2974 0, ///< Start at 0
2975 100000, ///< Quantization unit is 100usec
2976 32, ///< Enough to cover much longer than slow requests
2977 };
2978
2979 // Op size axis configuration for op histograms, values are in bytes
2980 PerfHistogramCommon::axis_config_d op_hist_y_axis_config{
2981 "Request size (bytes)",
2982 PerfHistogramCommon::SCALE_LOG2, ///< Request size in logarithmic scale
2983 0, ///< Start at 0
2984 512, ///< Quantization unit is 512 bytes
2985 32, ///< Enough to cover requests larger than GB
2986 };
2987
2988
2989 // All the basic OSD operation stats are to be considered useful
2990 osd_plb.set_prio_default(PerfCountersBuilder::PRIO_USEFUL);
2991
2992 osd_plb.add_u64(
2993 l_osd_op_wip, "op_wip",
2994 "Replication operations currently being processed (primary)");
2995 osd_plb.add_u64_counter(
2996 l_osd_op, "op",
2997 "Client operations",
2998 "ops", PerfCountersBuilder::PRIO_CRITICAL);
2999 osd_plb.add_u64_counter(
3000 l_osd_op_inb, "op_in_bytes",
3001 "Client operations total write size",
3002 "wr", PerfCountersBuilder::PRIO_INTERESTING);
3003 osd_plb.add_u64_counter(
3004 l_osd_op_outb, "op_out_bytes",
3005 "Client operations total read size",
3006 "rd", PerfCountersBuilder::PRIO_INTERESTING);
3007 osd_plb.add_time_avg(
3008 l_osd_op_lat, "op_latency",
3009 "Latency of client operations (including queue time)",
3010 "l", 9);
3011 osd_plb.add_time_avg(
3012 l_osd_op_process_lat, "op_process_latency",
3013 "Latency of client operations (excluding queue time)");
3014 osd_plb.add_time_avg(
3015 l_osd_op_prepare_lat, "op_prepare_latency",
3016 "Latency of client operations (excluding queue time and wait for finished)");
3017
3018 osd_plb.add_u64_counter(
3019 l_osd_op_r, "op_r", "Client read operations");
3020 osd_plb.add_u64_counter(
3021 l_osd_op_r_outb, "op_r_out_bytes", "Client data read");
3022 osd_plb.add_time_avg(
3023 l_osd_op_r_lat, "op_r_latency",
3024 "Latency of read operation (including queue time)");
3025 osd_plb.add_u64_counter_histogram(
3026 l_osd_op_r_lat_outb_hist, "op_r_latency_out_bytes_histogram",
3027 op_hist_x_axis_config, op_hist_y_axis_config,
3028 "Histogram of operation latency (including queue time) + data read");
3029 osd_plb.add_time_avg(
3030 l_osd_op_r_process_lat, "op_r_process_latency",
3031 "Latency of read operation (excluding queue time)");
3032 osd_plb.add_time_avg(
3033 l_osd_op_r_prepare_lat, "op_r_prepare_latency",
3034 "Latency of read operations (excluding queue time and wait for finished)");
3035 osd_plb.add_u64_counter(
3036 l_osd_op_w, "op_w", "Client write operations");
3037 osd_plb.add_u64_counter(
3038 l_osd_op_w_inb, "op_w_in_bytes", "Client data written");
3039 osd_plb.add_time_avg(
3040 l_osd_op_w_lat, "op_w_latency",
3041 "Latency of write operation (including queue time)");
3042 osd_plb.add_u64_counter_histogram(
3043 l_osd_op_w_lat_inb_hist, "op_w_latency_in_bytes_histogram",
3044 op_hist_x_axis_config, op_hist_y_axis_config,
3045 "Histogram of operation latency (including queue time) + data written");
3046 osd_plb.add_time_avg(
3047 l_osd_op_w_process_lat, "op_w_process_latency",
3048 "Latency of write operation (excluding queue time)");
3049 osd_plb.add_time_avg(
3050 l_osd_op_w_prepare_lat, "op_w_prepare_latency",
3051 "Latency of write operations (excluding queue time and wait for finished)");
3052 osd_plb.add_u64_counter(
3053 l_osd_op_rw, "op_rw",
3054 "Client read-modify-write operations");
3055 osd_plb.add_u64_counter(
3056 l_osd_op_rw_inb, "op_rw_in_bytes",
3057 "Client read-modify-write operations write in");
3058 osd_plb.add_u64_counter(
3059 l_osd_op_rw_outb, "op_rw_out_bytes",
3060 "Client read-modify-write operations read out");
3061 osd_plb.add_time_avg(
3062 l_osd_op_rw_lat, "op_rw_latency",
3063 "Latency of read-modify-write operation (including queue time)");
3064 osd_plb.add_u64_counter_histogram(
3065 l_osd_op_rw_lat_inb_hist, "op_rw_latency_in_bytes_histogram",
3066 op_hist_x_axis_config, op_hist_y_axis_config,
3067 "Histogram of rw operation latency (including queue time) + data written");
3068 osd_plb.add_u64_counter_histogram(
3069 l_osd_op_rw_lat_outb_hist, "op_rw_latency_out_bytes_histogram",
3070 op_hist_x_axis_config, op_hist_y_axis_config,
3071 "Histogram of rw operation latency (including queue time) + data read");
3072 osd_plb.add_time_avg(
3073 l_osd_op_rw_process_lat, "op_rw_process_latency",
3074 "Latency of read-modify-write operation (excluding queue time)");
3075 osd_plb.add_time_avg(
3076 l_osd_op_rw_prepare_lat, "op_rw_prepare_latency",
3077 "Latency of read-modify-write operations (excluding queue time and wait for finished)");
3078
3079 // Now we move on to some more obscure stats, revert to assuming things
3080 // are low priority unless otherwise specified.
3081 osd_plb.set_prio_default(PerfCountersBuilder::PRIO_DEBUGONLY);
3082
3083 osd_plb.add_time_avg(l_osd_op_before_queue_op_lat, "op_before_queue_op_lat",
3084 "Latency of IO before calling queue (before it is actually queued into the ShardedOpWq)"); // client io before queue op_wq latency
3085 osd_plb.add_time_avg(l_osd_op_before_dequeue_op_lat, "op_before_dequeue_op_lat",
3086 "Latency of IO before calling dequeue_op (already dequeued, PG lock acquired)"); // client io before dequeue_op latency
3087
3088 osd_plb.add_u64_counter(
3089 l_osd_sop, "subop", "Suboperations");
3090 osd_plb.add_u64_counter(
3091 l_osd_sop_inb, "subop_in_bytes", "Suboperations total size");
3092 osd_plb.add_time_avg(l_osd_sop_lat, "subop_latency", "Suboperations latency");
3093
3094 osd_plb.add_u64_counter(l_osd_sop_w, "subop_w", "Replicated writes");
3095 osd_plb.add_u64_counter(
3096 l_osd_sop_w_inb, "subop_w_in_bytes", "Replicated written data size");
3097 osd_plb.add_time_avg(
3098 l_osd_sop_w_lat, "subop_w_latency", "Replicated writes latency");
3099 osd_plb.add_u64_counter(
3100 l_osd_sop_pull, "subop_pull", "Suboperations pull requests");
3101 osd_plb.add_time_avg(
3102 l_osd_sop_pull_lat, "subop_pull_latency", "Suboperations pull latency");
3103 osd_plb.add_u64_counter(
3104 l_osd_sop_push, "subop_push", "Suboperations push messages");
3105 osd_plb.add_u64_counter(
3106 l_osd_sop_push_inb, "subop_push_in_bytes", "Suboperations pushed size");
3107 osd_plb.add_time_avg(
3108 l_osd_sop_push_lat, "subop_push_latency", "Suboperations push latency");
3109
3110 osd_plb.add_u64_counter(l_osd_pull, "pull", "Pull requests sent");
3111 osd_plb.add_u64_counter(l_osd_push, "push", "Push messages sent");
3112 osd_plb.add_u64_counter(l_osd_push_outb, "push_out_bytes", "Pushed size");
3113
3114 osd_plb.add_u64_counter(
3115 l_osd_rop, "recovery_ops",
3116 "Started recovery operations",
3117 "rop", PerfCountersBuilder::PRIO_INTERESTING);
3118
3119 osd_plb.add_u64(l_osd_loadavg, "loadavg", "CPU load");
3120 osd_plb.add_u64(l_osd_buf, "buffer_bytes", "Total allocated buffer size");
3121 osd_plb.add_u64(l_osd_history_alloc_bytes, "history_alloc_Mbytes");
3122 osd_plb.add_u64(l_osd_history_alloc_num, "history_alloc_num");
3123 osd_plb.add_u64(
3124 l_osd_cached_crc, "cached_crc", "Total number getting crc from crc_cache");
3125 osd_plb.add_u64(
3126 l_osd_cached_crc_adjusted, "cached_crc_adjusted",
3127 "Total number getting crc from crc_cache with adjusting");
3128 osd_plb.add_u64(l_osd_missed_crc, "missed_crc",
3129 "Total number of crc cache misses");
3130
3131 osd_plb.add_u64(l_osd_pg, "numpg", "Placement groups",
3132 "pgs", PerfCountersBuilder::PRIO_USEFUL);
3133 osd_plb.add_u64(
3134 l_osd_pg_primary, "numpg_primary",
3135 "Placement groups for which this osd is primary");
3136 osd_plb.add_u64(
3137 l_osd_pg_replica, "numpg_replica",
3138 "Placement groups for which this osd is replica");
3139 osd_plb.add_u64(
3140 l_osd_pg_stray, "numpg_stray",
3141 "Placement groups ready to be deleted from this osd");
3142 osd_plb.add_u64(
3143 l_osd_hb_to, "heartbeat_to_peers", "Heartbeat (ping) peers we send to");
3144 osd_plb.add_u64_counter(l_osd_map, "map_messages", "OSD map messages");
3145 osd_plb.add_u64_counter(l_osd_mape, "map_message_epochs", "OSD map epochs");
3146 osd_plb.add_u64_counter(
3147 l_osd_mape_dup, "map_message_epoch_dups", "OSD map duplicates");
3148 osd_plb.add_u64_counter(
3149 l_osd_waiting_for_map, "messages_delayed_for_map",
3150 "Operations waiting for OSD map");
3151
3152 osd_plb.add_u64_counter(
3153 l_osd_map_cache_hit, "osd_map_cache_hit", "osdmap cache hit");
3154 osd_plb.add_u64_counter(
3155 l_osd_map_cache_miss, "osd_map_cache_miss", "osdmap cache miss");
3156 osd_plb.add_u64_counter(
3157 l_osd_map_cache_miss_low, "osd_map_cache_miss_low",
3158 "osdmap cache miss below cache lower bound");
3159 osd_plb.add_u64_avg(
3160 l_osd_map_cache_miss_low_avg, "osd_map_cache_miss_low_avg",
3161 "osdmap cache miss, avg distance below cache lower bound");
3162 osd_plb.add_u64_counter(
3163 l_osd_map_bl_cache_hit, "osd_map_bl_cache_hit",
3164 "OSDMap buffer cache hits");
3165 osd_plb.add_u64_counter(
3166 l_osd_map_bl_cache_miss, "osd_map_bl_cache_miss",
3167 "OSDMap buffer cache misses");
3168
3169 osd_plb.add_u64(
3170 l_osd_stat_bytes, "stat_bytes", "OSD size", "size",
3171 PerfCountersBuilder::PRIO_USEFUL);
3172 osd_plb.add_u64(
3173 l_osd_stat_bytes_used, "stat_bytes_used", "Used space", "used",
3174 PerfCountersBuilder::PRIO_USEFUL);
3175 osd_plb.add_u64(l_osd_stat_bytes_avail, "stat_bytes_avail", "Available space");
3176
3177 osd_plb.add_u64_counter(
3178 l_osd_copyfrom, "copyfrom", "Rados \"copy-from\" operations");
3179
3180 osd_plb.add_u64_counter(l_osd_tier_promote, "tier_promote", "Tier promotions");
3181 osd_plb.add_u64_counter(l_osd_tier_flush, "tier_flush", "Tier flushes");
3182 osd_plb.add_u64_counter(
3183 l_osd_tier_flush_fail, "tier_flush_fail", "Failed tier flushes");
3184 osd_plb.add_u64_counter(
3185 l_osd_tier_try_flush, "tier_try_flush", "Tier flush attempts");
3186 osd_plb.add_u64_counter(
3187 l_osd_tier_try_flush_fail, "tier_try_flush_fail",
3188 "Failed tier flush attempts");
3189 osd_plb.add_u64_counter(
3190 l_osd_tier_evict, "tier_evict", "Tier evictions");
3191 osd_plb.add_u64_counter(
3192 l_osd_tier_whiteout, "tier_whiteout", "Tier whiteouts");
3193 osd_plb.add_u64_counter(
3194 l_osd_tier_dirty, "tier_dirty", "Dirty tier flag set");
3195 osd_plb.add_u64_counter(
3196 l_osd_tier_clean, "tier_clean", "Dirty tier flag cleaned");
3197 osd_plb.add_u64_counter(
3198 l_osd_tier_delay, "tier_delay", "Tier delays (agent waiting)");
3199 osd_plb.add_u64_counter(
3200 l_osd_tier_proxy_read, "tier_proxy_read", "Tier proxy reads");
3201 osd_plb.add_u64_counter(
3202 l_osd_tier_proxy_write, "tier_proxy_write", "Tier proxy writes");
3203
3204 osd_plb.add_u64_counter(
3205 l_osd_agent_wake, "agent_wake", "Tiering agent wake up");
3206 osd_plb.add_u64_counter(
3207 l_osd_agent_skip, "agent_skip", "Objects skipped by agent");
3208 osd_plb.add_u64_counter(
3209 l_osd_agent_flush, "agent_flush", "Tiering agent flushes");
3210 osd_plb.add_u64_counter(
3211 l_osd_agent_evict, "agent_evict", "Tiering agent evictions");
3212
3213 osd_plb.add_u64_counter(
3214 l_osd_object_ctx_cache_hit, "object_ctx_cache_hit", "Object context cache hits");
3215 osd_plb.add_u64_counter(
3216 l_osd_object_ctx_cache_total, "object_ctx_cache_total", "Object context cache lookups");
3217
3218 osd_plb.add_u64_counter(l_osd_op_cache_hit, "op_cache_hit");
3219 osd_plb.add_time_avg(
3220 l_osd_tier_flush_lat, "osd_tier_flush_lat", "Object flush latency");
3221 osd_plb.add_time_avg(
3222 l_osd_tier_promote_lat, "osd_tier_promote_lat", "Object promote latency");
3223 osd_plb.add_time_avg(
3224 l_osd_tier_r_lat, "osd_tier_r_lat", "Object proxy read latency");
3225
3226 osd_plb.add_u64_counter(
3227 l_osd_pg_info, "osd_pg_info", "PG updated its info (using any method)");
3228 osd_plb.add_u64_counter(
3229 l_osd_pg_fastinfo, "osd_pg_fastinfo",
3230 "PG updated its info using fastinfo attr");
3231 osd_plb.add_u64_counter(
3232 l_osd_pg_biginfo, "osd_pg_biginfo", "PG updated its biginfo attr");
3233
3234 logger = osd_plb.create_perf_counters();
3235 cct->get_perfcounters_collection()->add(logger);
3236 }
3237
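// Per-state latency counters for the PG recovery state machine; one
// time-averaged counter per peering/recovery state.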
3238 void OSD::create_recoverystate_perf()
3239 {
3240 dout(10) << "create_recoverystate_perf" << dendl;
3241
3242 PerfCountersBuilder rs_perf(cct, "recoverystate_perf", rs_first, rs_last);
3243
3244 rs_perf.add_time_avg(rs_initial_latency, "initial_latency", "Initial recovery state latency");
3245 rs_perf.add_time_avg(rs_started_latency, "started_latency", "Started recovery state latency");
3246 rs_perf.add_time_avg(rs_reset_latency, "reset_latency", "Reset recovery state latency");
3247 rs_perf.add_time_avg(rs_start_latency, "start_latency", "Start recovery state latency");
3248 rs_perf.add_time_avg(rs_primary_latency, "primary_latency", "Primary recovery state latency");
3249 rs_perf.add_time_avg(rs_peering_latency, "peering_latency", "Peering recovery state latency");
3250 rs_perf.add_time_avg(rs_backfilling_latency, "backfilling_latency", "Backfilling recovery state latency");
3251 rs_perf.add_time_avg(rs_waitremotebackfillreserved_latency, "waitremotebackfillreserved_latency", "Wait remote backfill reserved recovery state latency");
3252 rs_perf.add_time_avg(rs_waitlocalbackfillreserved_latency, "waitlocalbackfillreserved_latency", "Wait local backfill reserved recovery state latency");
3253 rs_perf.add_time_avg(rs_notbackfilling_latency, "notbackfilling_latency", "Notbackfilling recovery state latency");
3254 rs_perf.add_time_avg(rs_repnotrecovering_latency, "repnotrecovering_latency", "Repnotrecovering recovery state latency");
3255 rs_perf.add_time_avg(rs_repwaitrecoveryreserved_latency, "repwaitrecoveryreserved_latency", "Rep wait recovery reserved recovery state latency");
3256 rs_perf.add_time_avg(rs_repwaitbackfillreserved_latency, "repwaitbackfillreserved_latency", "Rep wait backfill reserved recovery state latency");
3257 rs_perf.add_time_avg(rs_reprecovering_latency, "reprecovering_latency", "RepRecovering recovery state latency");
3258 rs_perf.add_time_avg(rs_activating_latency, "activating_latency", "Activating recovery state latency");
3259 rs_perf.add_time_avg(rs_waitlocalrecoveryreserved_latency, "waitlocalrecoveryreserved_latency", "Wait local recovery reserved recovery state latency");
3260 rs_perf.add_time_avg(rs_waitremoterecoveryreserved_latency, "waitremoterecoveryreserved_latency", "Wait remote recovery reserved recovery state latency");
3261 rs_perf.add_time_avg(rs_recovering_latency, "recovering_latency", "Recovering recovery state latency");
3262 rs_perf.add_time_avg(rs_recovered_latency, "recovered_latency", "Recovered recovery state latency");
3263 rs_perf.add_time_avg(rs_clean_latency, "clean_latency", "Clean recovery state latency");
3264 rs_perf.add_time_avg(rs_active_latency, "active_latency", "Active recovery state latency");
3265 rs_perf.add_time_avg(rs_replicaactive_latency, "replicaactive_latency", "Replicaactive recovery state latency");
3266 rs_perf.add_time_avg(rs_stray_latency, "stray_latency", "Stray recovery state latency");
3267 rs_perf.add_time_avg(rs_getinfo_latency, "getinfo_latency", "Getinfo recovery state latency");
3268 rs_perf.add_time_avg(rs_getlog_latency, "getlog_latency", "Getlog recovery state latency");
3269 rs_perf.add_time_avg(rs_waitactingchange_latency, "waitactingchange_latency", "Waitactingchange recovery state latency");
3270 rs_perf.add_time_avg(rs_incomplete_latency, "incomplete_latency", "Incomplete recovery state latency");
3271 rs_perf.add_time_avg(rs_down_latency, "down_latency", "Down recovery state latency");
3272 rs_perf.add_time_avg(rs_getmissing_latency, "getmissing_latency", "Getmissing recovery state latency");
3273 rs_perf.add_time_avg(rs_waitupthru_latency, "waitupthru_latency", "Waitupthru recovery state latency");
3274 rs_perf.add_time_avg(rs_notrecovering_latency, "notrecovering_latency", "Notrecovering recovery state latency");
3275
3276 recoverystate_perf = rs_perf.create_perf_counters();
3277 cct->get_perfcounters_collection()->add(recoverystate_perf);
3278 }
3279
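// Orderly shutdown: mark STATE_STOPPING, stop the mgr client, drain
// the op queues, run on_shutdown() on every PG, unregister the admin
// socket commands, stop the heartbeat thread and thread pools, record
// a clean unmount epoch in the superblock, drop all PG references,
// unmount the store, and shut down the messengers.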
3280 int OSD::shutdown()
3281 {
3282 if (!service.prepare_to_stop())
3283 return 0; // already shutting down
3284 osd_lock.Lock();
3285 if (is_stopping()) {
3286 osd_lock.Unlock();
3287 return 0;
3288 }
3289 derr << "shutdown" << dendl;
3290
3291 set_state(STATE_STOPPING);
3292
3293 // Debugging
3294 if (cct->_conf->get_val<bool>("osd_debug_shutdown")) {
3295 cct->_conf->set_val("debug_osd", "100");
3296 cct->_conf->set_val("debug_journal", "100");
3297 cct->_conf->set_val("debug_filestore", "100");
3298 cct->_conf->set_val("debug_bluestore", "100");
3299 cct->_conf->set_val("debug_ms", "100");
3300 cct->_conf->apply_changes(NULL);
3301 }
3302
3303 // stop MgrClient earlier as it's more like an internal consumer of OSD
3304 mgrc.shutdown();
3305
3306 service.start_shutdown();
3307
3308 // stop sending work to pgs. this just prevents any new work in _process
3309 // from racing with on_shutdown and potentially entering the pg after.
3310 op_shardedwq.drain();
3311
3312 // Shutdown PGs
3313 {
3314 RWLock::RLocker l(pg_map_lock);
3315 for (ceph::unordered_map<spg_t, PG*>::iterator p = pg_map.begin();
3316 p != pg_map.end();
3317 ++p) {
3318 dout(20) << " kicking pg " << p->first << dendl;
3319 p->second->lock();
3320 p->second->on_shutdown();
3321 p->second->unlock();
3322 p->second->osr->flush();
3323 }
3324 }
3325 clear_pg_stat_queue();
3326
3327 // drain op queue again (in case PGs requeued something)
3328 op_shardedwq.drain();
3329 {
3330 finished.clear(); // zap waiters (bleh, this is messy)
3331 }
3332
3333 op_shardedwq.clear_pg_slots();
3334
3335 // unregister commands
3336 cct->get_admin_socket()->unregister_command("status");
3337 cct->get_admin_socket()->unregister_command("flush_journal");
3338 cct->get_admin_socket()->unregister_command("dump_ops_in_flight");
3339 cct->get_admin_socket()->unregister_command("ops");
3340 cct->get_admin_socket()->unregister_command("dump_blocked_ops");
3341 cct->get_admin_socket()->unregister_command("dump_historic_ops");
3342 cct->get_admin_socket()->unregister_command("dump_historic_ops_by_duration");
3343 cct->get_admin_socket()->unregister_command("dump_historic_slow_ops");
3344 cct->get_admin_socket()->unregister_command("dump_op_pq_state");
3345 cct->get_admin_socket()->unregister_command("dump_blacklist");
3346 cct->get_admin_socket()->unregister_command("dump_watchers");
3347 cct->get_admin_socket()->unregister_command("dump_reservations");
3348 cct->get_admin_socket()->unregister_command("get_latest_osdmap");
3349 cct->get_admin_socket()->unregister_command("heap");
3350 cct->get_admin_socket()->unregister_command("set_heap_property");
3351 cct->get_admin_socket()->unregister_command("get_heap_property");
3352 cct->get_admin_socket()->unregister_command("dump_objectstore_kv_stats");
3353 cct->get_admin_socket()->unregister_command("dump_scrubs");
3354 cct->get_admin_socket()->unregister_command("calc_objectstore_db_histogram");
3355 cct->get_admin_socket()->unregister_command("flush_store_cache");
3356 cct->get_admin_socket()->unregister_command("dump_pgstate_history");
3357 cct->get_admin_socket()->unregister_command("compact");
3358 delete asok_hook;
3359 asok_hook = NULL;
3360
3361 cct->get_admin_socket()->unregister_command("setomapval");
3362 cct->get_admin_socket()->unregister_command("rmomapkey");
3363 cct->get_admin_socket()->unregister_command("setomapheader");
3364 cct->get_admin_socket()->unregister_command("getomap");
3365 cct->get_admin_socket()->unregister_command("truncobj");
3366 cct->get_admin_socket()->unregister_command("injectdataerr");
3367 cct->get_admin_socket()->unregister_command("injectmdataerr");
3368 cct->get_admin_socket()->unregister_command("set_recovery_delay");
3369 cct->get_admin_socket()->unregister_command("trigger_scrub");
3370 cct->get_admin_socket()->unregister_command("injectfull");
3371 delete test_ops_hook;
3372 test_ops_hook = NULL;
3373
3374 osd_lock.Unlock();
3375
3376 heartbeat_lock.Lock();
3377 heartbeat_stop = true;
3378 heartbeat_cond.Signal();
3379 heartbeat_lock.Unlock();
3380 heartbeat_thread.join();
3381
3382 peering_tp.drain();
3383 peering_wq.clear();
3384 peering_tp.stop();
3385 dout(10) << "osd tp stopped" << dendl;
3386
3387 osd_op_tp.drain();
3388 osd_op_tp.stop();
3389 dout(10) << "op sharded tp stopped" << dendl;
3390
3391 command_tp.drain();
3392 command_tp.stop();
3393 dout(10) << "command tp stopped" << dendl;
3394
3395 disk_tp.drain();
3396 disk_tp.stop();
3397 dout(10) << "disk tp paused (new)" << dendl;
3398
3399 dout(10) << "stopping agent" << dendl;
3400 service.agent_stop();
3401
3402 osd_lock.Lock();
3403
3404 reset_heartbeat_peers();
3405
3406 tick_timer.shutdown();
3407
3408 {
3409 Mutex::Locker l(tick_timer_lock);
3410 tick_timer_without_osd_lock.shutdown();
3411 }
3412
3413 // note unmount epoch
3414 dout(10) << "noting clean unmount in epoch " << osdmap->get_epoch() << dendl;
3415 superblock.mounted = service.get_boot_epoch();
3416 superblock.clean_thru = osdmap->get_epoch();
3417 ObjectStore::Transaction t;
3418 write_superblock(t);
3419 int r = store->apply_transaction(service.meta_osr.get(), std::move(t));
3420 if (r) {
3421 derr << "OSD::shutdown: error writing superblock: "
3422 << cpp_strerror(r) << dendl;
3423 }
3424
3425
3426 {
3427 Mutex::Locker l(pg_stat_queue_lock);
3428 assert(pg_stat_queue.empty());
3429 }
3430
3431 service.shutdown_reserver();
3432
3433 // Remove PGs
3434 #ifdef PG_DEBUG_REFS
3435 service.dump_live_pgids();
3436 #endif
3437 {
3438 RWLock::RLocker l(pg_map_lock);
3439 for (ceph::unordered_map<spg_t, PG*>::iterator p = pg_map.begin();
3440 p != pg_map.end();
3441 ++p) {
3442 dout(20) << " kicking pg " << p->first << dendl;
3443 p->second->lock();
3444 if (p->second->ref != 1) {
3445 derr << "pgid " << p->first << " has ref count of "
3446 << p->second->ref << dendl;
3447 #ifdef PG_DEBUG_REFS
3448 p->second->dump_live_ids();
3449 #endif
3450 if (cct->_conf->osd_shutdown_pgref_assert) {
3451 ceph_abort();
3452 }
3453 }
3454 p->second->unlock();
3455 p->second->put("PGMap");
3456 }
3457 pg_map.clear();
3458 }
3459 #ifdef PG_DEBUG_REFS
3460 service.dump_live_pgids();
3461 #endif
3462 cct->_conf->remove_observer(this);
3463
3464 dout(10) << "syncing store" << dendl;
3465 enable_disable_fuse(true);
3466
3467 if (cct->_conf->osd_journal_flush_on_shutdown) {
3468 dout(10) << "flushing journal" << dendl;
3469 store->flush_journal();
3470 }
3471
3472 store->umount();
3473 delete store;
3474 store = 0;
3475 dout(10) << "Store synced" << dendl;
3476
3477 monc->shutdown();
3478 osd_lock.Unlock();
3479
3480 osdmap = OSDMapRef();
3481 service.shutdown();
3482 op_tracker.on_shutdown();
3483
3484 class_handler->shutdown();
3485 client_messenger->shutdown();
3486 cluster_messenger->shutdown();
3487 hb_front_client_messenger->shutdown();
3488 hb_back_client_messenger->shutdown();
3489 objecter_messenger->shutdown();
3490 hb_front_server_messenger->shutdown();
3491 hb_back_server_messenger->shutdown();
3492
3493 peering_wq.clear();
3494
3495 return r;
3496 }
3497
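// A sketch of the retry contract: mon_cmd_maybe_osd_create() sends the
// given mon command and, if the mon replies -ENOENT, first registers this
// OSD with the cluster and retries once. The create command it builds is a
// JSON blob like the following (id/uuid values are illustrative only):
//
//   {"prefix": "osd create", "id": 3,
//    "uuid": "f81d4fae-7dec-11d0-a765-00a0c91e6bf6"}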
3498 int OSD::mon_cmd_maybe_osd_create(string &cmd)
3499 {
3500 bool created = false;
3501 while (true) {
3502 dout(10) << __func__ << " cmd: " << cmd << dendl;
3503 vector<string> vcmd{cmd};
3504 bufferlist inbl;
3505 C_SaferCond w;
3506 string outs;
3507 monc->start_mon_command(vcmd, inbl, NULL, &outs, &w);
3508 int r = w.wait();
3509 if (r < 0) {
3510 if (r == -ENOENT && !created) {
3511 string newcmd = "{\"prefix\": \"osd create\", \"id\": " + stringify(whoami)
3512 + ", \"uuid\": \"" + stringify(superblock.osd_fsid) + "\"}";
3513 vector<string> vnewcmd{newcmd};
3514 bufferlist inbl;
3515 C_SaferCond w;
3516 string outs;
3517 monc->start_mon_command(vnewcmd, inbl, NULL, &outs, &w);
3518 int r = w.wait();
3519 if (r < 0) {
3520 derr << __func__ << " fail: osd does not exist and created failed: "
3521 << cpp_strerror(r) << dendl;
3522 return r;
3523 }
3524 created = true;
3525 continue;
3526 }
3527 derr << __func__ << " fail: '" << outs << "': " << cpp_strerror(r) << dendl;
3528 return r;
3529 }
3530 break;
3531 }
3532
3533 return 0;
3534 }
3535
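// update_crush_location() computes a CRUSH weight (the configured
// osd_crush_initial_weight if set, otherwise total store capacity in TiB,
// floored at 0.00001) and issues an "osd crush create-or-move" command.
// A sketch of the resulting JSON, with illustrative values:
//
//   {"prefix": "osd crush create-or-move", "id": 3, "weight": 1.8190,
//    "args": ["host=node1", "root=default"]}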
3536 int OSD::update_crush_location()
3537 {
3538 if (!cct->_conf->osd_crush_update_on_start) {
3539 dout(10) << __func__ << " osd_crush_update_on_start = false" << dendl;
3540 return 0;
3541 }
3542
3543 char weight[32];
3544 if (cct->_conf->osd_crush_initial_weight >= 0) {
3545 snprintf(weight, sizeof(weight), "%.4lf", cct->_conf->osd_crush_initial_weight);
3546 } else {
3547 struct store_statfs_t st;
3548 int r = store->statfs(&st);
3549 if (r < 0) {
3550 derr << "statfs: " << cpp_strerror(r) << dendl;
3551 return r;
3552 }
3553 snprintf(weight, sizeof(weight), "%.4lf",
3554 MAX((double).00001,
3555 (double)(st.total) /
3556 (double)(1ull << 40 /* TB */)));
3557 }
3558
3559 std::multimap<string,string> loc = cct->crush_location.get_location();
3560 dout(10) << __func__ << " crush location is " << loc << dendl;
3561
3562 string cmd =
3563 string("{\"prefix\": \"osd crush create-or-move\", ") +
3564 string("\"id\": ") + stringify(whoami) + string(", ") +
3565 string("\"weight\":") + weight + string(", ") +
3566 string("\"args\": [");
3567 for (multimap<string,string>::iterator p = loc.begin(); p != loc.end(); ++p) {
3568 if (p != loc.begin())
3569 cmd += ", ";
3570 cmd += "\"" + p->first + "=" + p->second + "\"";
3571 }
3572 cmd += "]}";
3573
3574 return mon_cmd_maybe_osd_create(cmd);
3575 }
3576
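// update_crush_device_class() reports the locally stored (or default)
// device class to the monitors. A sketch of the generated command, with an
// illustrative class value:
//
//   {"prefix": "osd crush set-device-class", "class": "ssd", "ids": ["3"]}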
3577 int OSD::update_crush_device_class()
3578 {
3579 if (!cct->_conf->osd_class_update_on_start) {
3580 dout(10) << __func__ << " osd_class_update_on_start = false" << dendl;
3581 return 0;
3582 }
3583
3584 string device_class;
3585 int r = store->read_meta("crush_device_class", &device_class);
3586 if (r < 0 || device_class.empty()) {
3587 device_class = store->get_default_device_class();
3588 }
3589
3590 if (device_class.empty()) {
3591 dout(20) << __func__ << " no device class stored locally" << dendl;
3592 return 0;
3593 }
3594
3595 string cmd =
3596 string("{\"prefix\": \"osd crush set-device-class\", ") +
3597 string("\"class\": \"") + device_class + string("\", ") +
3598 string("\"ids\": [\"") + stringify(whoami) + string("\"]}");
3599
3600 r = mon_cmd_maybe_osd_create(cmd);
3601 // the above cmd can fail for various reasons, e.g.:
3602 // (1) we are connecting to a pre-luminous monitor
3603 // (2) the user manually specified a class other than the one
3604 // set via 'ceph-disk prepare --crush-device-class'
3605 // simply skip result-checking for now
3606 return 0;
3607 }
3608
3609 void OSD::write_superblock(ObjectStore::Transaction& t)
3610 {
3611 dout(10) << "write_superblock " << superblock << dendl;
3612
3613 // hack: at minimum it's using the baseline feature set
3614 if (!superblock.compat_features.incompat.contains(CEPH_OSD_FEATURE_INCOMPAT_BASE))
3615 superblock.compat_features.incompat.insert(CEPH_OSD_FEATURE_INCOMPAT_BASE);
3616
3617 bufferlist bl;
3618 ::encode(superblock, bl);
3619 t.write(coll_t::meta(), OSD_SUPERBLOCK_GOBJECT, 0, bl.length(), bl);
3620 }
3621
3622 int OSD::read_superblock()
3623 {
3624 bufferlist bl;
3625 int r = store->read(coll_t::meta(), OSD_SUPERBLOCK_GOBJECT, 0, 0, bl);
3626 if (r < 0)
3627 return r;
3628
3629 bufferlist::iterator p = bl.begin();
3630 ::decode(superblock, p);
3631
3632 dout(10) << "read_superblock " << superblock << dendl;
3633
3634 return 0;
3635 }
3636
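// clear_temp_objects() scans every PG collection for leftover temp objects
// (typically partially written recovery/backfill data that never got
// renamed into place) and removes them in batches capped by
// osd_target_transaction_size, so startup cleanup after a crash cannot
// produce an unbounded transaction.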
3637 void OSD::clear_temp_objects()
3638 {
3639 dout(10) << __func__ << dendl;
3640 vector<coll_t> ls;
3641 store->list_collections(ls);
3642 for (vector<coll_t>::iterator p = ls.begin(); p != ls.end(); ++p) {
3643 spg_t pgid;
3644 if (!p->is_pg(&pgid))
3645 continue;
3646
3647 // list temp objects
3648 dout(20) << " clearing temps in " << *p << " pgid " << pgid << dendl;
3649
3650 vector<ghobject_t> temps;
3651 ghobject_t next;
3652 while (1) {
3653 vector<ghobject_t> objects;
3654 store->collection_list(*p, next, ghobject_t::get_max(),
3655 store->get_ideal_list_max(),
3656 &objects, &next);
3657 if (objects.empty())
3658 break;
3659 vector<ghobject_t>::iterator q;
3660 for (q = objects.begin(); q != objects.end(); ++q) {
3661 // Hammer set the pool for temp objects to -1, so check for those to clean up
3662 if (q->hobj.is_temp() || (q->hobj.pool == -1)) {
3663 temps.push_back(*q);
3664 } else {
3665 break;
3666 }
3667 }
3668 // If we saw a non-temp object and hit the break above we can
3669 // break out of the while loop too.
3670 if (q != objects.end())
3671 break;
3672 }
3673 if (!temps.empty()) {
3674 ObjectStore::Transaction t;
3675 int removed = 0;
3676 for (vector<ghobject_t>::iterator q = temps.begin(); q != temps.end(); ++q) {
3677 dout(20) << " removing " << *p << " object " << *q << dendl;
3678 t.remove(*p, *q);
3679 if (++removed > cct->_conf->osd_target_transaction_size) {
3680 store->apply_transaction(service.meta_osr.get(), std::move(t));
3681 t = ObjectStore::Transaction();
3682 removed = 0;
3683 }
3684 }
3685 if (removed) {
3686 store->apply_transaction(service.meta_osr.get(), std::move(t));
3687 }
3688 }
3689 }
3690 }
3691
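// recursive_remove_collection() deletes every object in a doomed PG
// collection, unregistering each one from the SnapMapper first, again
// batching removals by osd_target_transaction_size, then drops the
// collection itself and flushes the sequencer before returning.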
3692 void OSD::recursive_remove_collection(CephContext* cct,
3693 ObjectStore *store, spg_t pgid,
3694 coll_t tmp)
3695 {
3696 OSDriver driver(
3697 store,
3698 coll_t(),
3699 make_snapmapper_oid());
3700
3701 ceph::shared_ptr<ObjectStore::Sequencer> osr (std::make_shared<
3702 ObjectStore::Sequencer>("rm"));
3703 ObjectStore::Transaction t;
3704 SnapMapper mapper(cct, &driver, 0, 0, 0, pgid.shard);
3705
3706 vector<ghobject_t> objects;
3707 store->collection_list(tmp, ghobject_t(), ghobject_t::get_max(),
3708 INT_MAX, &objects, 0);
3709 generic_dout(10) << __func__ << " " << objects << dendl;
3710 // delete them.
3711 int removed = 0;
3712 for (vector<ghobject_t>::iterator p = objects.begin();
3713 p != objects.end();
3714 ++p, removed++) {
3715 OSDriver::OSTransaction _t(driver.get_transaction(&t));
3716 int r = mapper.remove_oid(p->hobj, &_t);
3717 if (r != 0 && r != -ENOENT)
3718 ceph_abort();
3719 t.remove(tmp, *p);
3720 if (removed > cct->_conf->osd_target_transaction_size) {
3721 int r = store->apply_transaction(osr.get(), std::move(t));
3722 assert(r == 0);
3723 t = ObjectStore::Transaction();
3724 removed = 0;
3725 }
3726 }
3727 t.remove_collection(tmp);
3728 int r = store->apply_transaction(osr.get(), std::move(t));
3729 assert(r == 0);
3730
3731 C_SaferCond waiter;
3732 if (!osr->flush_commit(&waiter)) {
3733 waiter.wait();
3734 }
3735 }
3736
3737
3738 // ======================================================
3739 // PG's
3740
3741 PGPool OSD::_get_pool(int id, OSDMapRef createmap)
3742 {
3743 if (!createmap->have_pg_pool(id)) {
3744 dout(5) << __func__ << ": the OSDmap does not contain a PG pool with id = "
3745 << id << dendl;
3746 ceph_abort();
3747 }
3748
3749 PGPool p = PGPool(cct, createmap, id);
3750
3751 dout(10) << "_get_pool " << p.id << dendl;
3752 return p;
3753 }
3754
3755 PG *OSD::_open_lock_pg(
3756 OSDMapRef createmap,
3757 spg_t pgid, bool no_lockdep_check)
3758 {
3759 assert(osd_lock.is_locked());
3760
3761 PG* pg = _make_pg(createmap, pgid);
3762 {
3763 RWLock::WLocker l(pg_map_lock);
3764 pg->lock(no_lockdep_check);
3765 pg_map[pgid] = pg;
3766 pg->get("PGMap"); // because it's in pg_map
3767 service.pg_add_epoch(pg->info.pgid, createmap->get_epoch());
3768 }
3769 return pg;
3770 }
3771
3772 PG* OSD::_make_pg(
3773 OSDMapRef createmap,
3774 spg_t pgid)
3775 {
3776 dout(10) << "_open_lock_pg " << pgid << dendl;
3777 PGPool pool = _get_pool(pgid.pool(), createmap);
3778
3779 // create
3780 PG *pg;
3781 if (createmap->get_pg_type(pgid.pgid) == pg_pool_t::TYPE_REPLICATED ||
3782 createmap->get_pg_type(pgid.pgid) == pg_pool_t::TYPE_ERASURE)
3783 pg = new PrimaryLogPG(&service, createmap, pool, pgid);
3784 else
3785 ceph_abort();
3786
3787 return pg;
3788 }
3789
3790
3791 void OSD::add_newly_split_pg(PG *pg, PG::RecoveryCtx *rctx)
3792 {
3793 epoch_t e(service.get_osdmap()->get_epoch());
3794 pg->get("PGMap"); // For pg_map
3795 pg_map[pg->info.pgid] = pg;
3796 service.pg_add_epoch(pg->info.pgid, pg->get_osdmap()->get_epoch());
3797
3798 dout(10) << "Adding newly split pg " << *pg << dendl;
3799 pg->handle_loaded(rctx);
3800 pg->write_if_dirty(*(rctx->transaction));
3801 pg->queue_null(e, e);
3802 map<spg_t, list<PG::CephPeeringEvtRef> >::iterator to_wake =
3803 peering_wait_for_split.find(pg->info.pgid);
3804 if (to_wake != peering_wait_for_split.end()) {
3805 for (list<PG::CephPeeringEvtRef>::iterator i =
3806 to_wake->second.begin();
3807 i != to_wake->second.end();
3808 ++i) {
3809 pg->queue_peering_event(*i);
3810 }
3811 peering_wait_for_split.erase(to_wake);
3812 }
3813 if (!service.get_osdmap()->have_pg_pool(pg->info.pgid.pool()))
3814 _remove_pg(pg);
3815 }
3816
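// _try_resurrect_pg() walks from pgid up through its ancestors looking for
// an in-progress deletion that can be cancelled:
//   RES_NONE   - nothing to resurrect (or we raced and deletion finished)
//   RES_SELF   - this exact pg's deletion was halted; reuse its old state
//   RES_PARENT - an ancestor that would split into pgid was halted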
3817 OSD::res_result OSD::_try_resurrect_pg(
3818 OSDMapRef curmap, spg_t pgid, spg_t *resurrected, PGRef *old_pg_state)
3819 {
3820 assert(resurrected);
3821 assert(old_pg_state);
3822 // find nearest ancestor
3823 DeletingStateRef df;
3824 spg_t cur(pgid);
3825 while (true) {
3826 df = service.deleting_pgs.lookup(cur);
3827 if (df)
3828 break;
3829 if (!cur.ps())
3830 break;
3831 cur = cur.get_parent();
3832 }
3833 if (!df)
3834 return RES_NONE; // good to go
3835
3836 df->old_pg_state->lock();
3837 OSDMapRef create_map = df->old_pg_state->get_osdmap();
3838 df->old_pg_state->unlock();
3839
3840 set<spg_t> children;
3841 if (cur == pgid) {
3842 if (df->try_stop_deletion()) {
3843 dout(10) << __func__ << ": halted deletion on pg " << pgid << dendl;
3844 *resurrected = cur;
3845 *old_pg_state = df->old_pg_state;
3846 service.deleting_pgs.remove(pgid); // PG is no longer being removed!
3847 return RES_SELF;
3848 } else {
3849 // raced; ensure we don't see the stale DeletingStateRef when we try to
3850 // delete this pg
3851 service.deleting_pgs.remove(pgid);
3852 return RES_NONE;
3853 }
3854 } else if (cur.is_split(create_map->get_pg_num(cur.pool()),
3855 curmap->get_pg_num(cur.pool()),
3856 &children) &&
3857 children.count(pgid)) {
3858 if (df->try_stop_deletion()) {
3859 dout(10) << __func__ << ": halted deletion on ancestor pg " << pgid
3860 << dendl;
3861 *resurrected = cur;
3862 *old_pg_state = df->old_pg_state;
3863 service.deleting_pgs.remove(cur); // PG is no longer being removed!
3864 return RES_PARENT;
3865 } else {
3866 /* this is not a problem, failing to cancel proves that all objects
3867 * have been removed, so no hobject_t overlap is possible
3868 */
3869 return RES_NONE;
3870 }
3871 }
3872 return RES_NONE;
3873 }
3874
3875 PG *OSD::_create_lock_pg(
3876 OSDMapRef createmap,
3877 spg_t pgid,
3878 bool hold_map_lock,
3879 bool backfill,
3880 int role,
3881 vector<int>& up, int up_primary,
3882 vector<int>& acting, int acting_primary,
3883 pg_history_t history,
3884 const PastIntervals& pi,
3885 ObjectStore::Transaction& t)
3886 {
3887 assert(osd_lock.is_locked());
3888 dout(20) << "_create_lock_pg pgid " << pgid << dendl;
3889
3890 PG *pg = _open_lock_pg(createmap, pgid, true);
3891
3892 service.init_splits_between(pgid, pg->get_osdmap(), service.get_osdmap());
3893
3894 pg->init(
3895 role,
3896 up,
3897 up_primary,
3898 acting,
3899 acting_primary,
3900 history,
3901 pi,
3902 backfill,
3903 &t);
3904
3905 dout(7) << "_create_lock_pg " << *pg << dendl;
3906 return pg;
3907 }
3908
3909 PG *OSD::_lookup_lock_pg(spg_t pgid)
3910 {
3911 RWLock::RLocker l(pg_map_lock);
3912
3913 auto pg_map_entry = pg_map.find(pgid);
3914 if (pg_map_entry == pg_map.end())
3915 return nullptr;
3916 PG *pg = pg_map_entry->second;
3917 pg->lock();
3918 return pg;
3919 }
3920
3921 PG *OSD::lookup_lock_pg(spg_t pgid)
3922 {
3923 return _lookup_lock_pg(pgid);
3924 }
3925
3926 PG *OSD::_lookup_lock_pg_with_map_lock_held(spg_t pgid)
3927 {
3928 assert(pg_map.count(pgid));
3929 PG *pg = pg_map[pgid];
3930 pg->lock();
3931 return pg;
3932 }
3933
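// load_pgs() walks every collection in the store at startup: temp and
// removal leftovers are deleted, each surviving pg collection is opened,
// its state and log are read (upgrading the on-disk format if needed), and
// the pg's role under the current osdmap is computed before it is handed
// to handle_loaded().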
3934 void OSD::load_pgs()
3935 {
3936 assert(osd_lock.is_locked());
3937 dout(0) << "load_pgs" << dendl;
3938 {
3939 RWLock::RLocker l(pg_map_lock);
3940 assert(pg_map.empty());
3941 }
3942
3943 vector<coll_t> ls;
3944 int r = store->list_collections(ls);
3945 if (r < 0) {
3946 derr << "failed to list pgs: " << cpp_strerror(-r) << dendl;
3947 }
3948
3949 bool has_upgraded = false;
3950
3951 for (vector<coll_t>::iterator it = ls.begin();
3952 it != ls.end();
3953 ++it) {
3954 spg_t pgid;
3955 if (it->is_temp(&pgid) ||
3956 (it->is_pg(&pgid) && PG::_has_removal_flag(store, pgid))) {
3957 dout(10) << "load_pgs " << *it << " clearing temp" << dendl;
3958 recursive_remove_collection(cct, store, pgid, *it);
3959 continue;
3960 }
3961
3962 if (!it->is_pg(&pgid)) {
3963 dout(10) << "load_pgs ignoring unrecognized " << *it << dendl;
3964 continue;
3965 }
3966
3967 if (pgid.preferred() >= 0) {
3968 dout(10) << __func__ << ": skipping localized PG " << pgid << dendl;
3969 // FIXME: delete it too, eventually
3970 continue;
3971 }
3972
3973 dout(10) << "pgid " << pgid << " coll " << coll_t(pgid) << dendl;
3974 bufferlist bl;
3975 epoch_t map_epoch = 0;
3976 int r = PG::peek_map_epoch(store, pgid, &map_epoch, &bl);
3977 if (r < 0) {
3978 derr << __func__ << " unable to peek at " << pgid << " metadata, skipping"
3979 << dendl;
3980 continue;
3981 }
3982
3983 PG *pg = NULL;
3984 if (map_epoch > 0) {
3985 OSDMapRef pgosdmap = service.try_get_map(map_epoch);
3986 if (!pgosdmap) {
3987 if (!osdmap->have_pg_pool(pgid.pool())) {
3988 derr << __func__ << ": could not find map for epoch " << map_epoch
3989 << " on pg " << pgid << ", but the pool is not present in the "
3990 << "current map, so this is probably a result of bug 10617. "
3991 << "Skipping the pg for now, you can use ceph-objectstore-tool "
3992 << "to clean it up later." << dendl;
3993 continue;
3994 } else {
3995 derr << __func__ << ": have pgid " << pgid << " at epoch "
3996 << map_epoch << ", but missing map. Crashing."
3997 << dendl;
3998 assert(0 == "Missing map in load_pgs");
3999 }
4000 }
4001 pg = _open_lock_pg(pgosdmap, pgid);
4002 } else {
4003 pg = _open_lock_pg(osdmap, pgid);
4004 }
4005 // there can be no waiters here, so we don't call wake_pg_waiters
4006
4007 pg->ch = store->open_collection(pg->coll);
4008
4009 // read pg state, log
4010 pg->read_state(store, bl);
4011
4012 if (pg->must_upgrade()) {
4013 if (!pg->can_upgrade()) {
4014 derr << "PG needs upgrade, but on-disk data is too old; upgrade to"
4015 << " an older version first." << dendl;
4016 assert(0 == "PG too old to upgrade");
4017 }
4018 if (!has_upgraded) {
4019 derr << "PGs are upgrading" << dendl;
4020 has_upgraded = true;
4021 }
4022 dout(10) << "PG " << pg->info.pgid
4023 << " must upgrade..." << dendl;
4024 pg->upgrade(store);
4025 }
4026
4027 service.init_splits_between(pg->info.pgid, pg->get_osdmap(), osdmap);
4028
4029 // generate state for PG's current mapping
4030 int primary, up_primary;
4031 vector<int> acting, up;
4032 pg->get_osdmap()->pg_to_up_acting_osds(
4033 pgid.pgid, &up, &up_primary, &acting, &primary);
4034 pg->init_primary_up_acting(
4035 up,
4036 acting,
4037 up_primary,
4038 primary);
4039 int role = OSDMap::calc_pg_role(whoami, pg->acting);
4040 if (pg->pool.info.is_replicated() || role == pg->pg_whoami.shard)
4041 pg->set_role(role);
4042 else
4043 pg->set_role(-1);
4044
4045 pg->reg_next_scrub();
4046
4047 PG::RecoveryCtx rctx(0, 0, 0, 0, 0, 0);
4048 pg->handle_loaded(&rctx);
4049
4050 dout(10) << "load_pgs loaded " << *pg << " " << pg->pg_log.get_log() << dendl;
4051 if (pg->pg_log.is_dirty()) {
4052 ObjectStore::Transaction t;
4053 pg->write_if_dirty(t);
4054 store->apply_transaction(pg->osr.get(), std::move(t));
4055 }
4056 pg->unlock();
4057 }
4058 {
4059 RWLock::RLocker l(pg_map_lock);
4060 dout(0) << "load_pgs opened " << pg_map.size() << " pgs" << dendl;
4061 }
4062
4063 // clean up old infos object?
4064 if (has_upgraded && store->exists(coll_t::meta(), OSD::make_infos_oid())) {
4065 dout(1) << __func__ << " removing legacy infos object" << dendl;
4066 ObjectStore::Transaction t;
4067 t.remove(coll_t::meta(), OSD::make_infos_oid());
4068 int r = store->apply_transaction(service.meta_osr.get(), std::move(t));
4069 if (r != 0) {
4070 derr << __func__ << ": apply_transaction returned "
4071 << cpp_strerror(r) << dendl;
4072 ceph_abort();
4073 }
4074 }
4075
4076 build_past_intervals_parallel();
4077 }
4078
4079
4080 /*
4081 * build past_intervals efficiently on old, degraded, and buried
4082 * clusters. this is important for efficiently catching up osds that
4083 * are way behind on maps to the current cluster state.
4084 *
4085 * this is a parallel version of PG::generate_past_intervals().
4086 * follow the same logic, but do all pgs at the same time so that we
4087 * can make a single pass across the osdmap history.
4088 */
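// Worked example (illustrative epochs/osds, not from a real map history):
// suppose a pg mapped to acting [0,1] from epoch 100, and at epoch 150
// osd.1 was replaced so acting became [0,2]. check_new_interval() then
// closes [100,149] into past_intervals and the pistate's
// same_interval_since advances to 150.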
4089 void OSD::build_past_intervals_parallel()
4090 {
4091 struct pistate {
4092 epoch_t start, end;
4093 vector<int> old_acting, old_up;
4094 epoch_t same_interval_since;
4095 int primary;
4096 int up_primary;
4097 };
4098 map<PG*,pistate> pis;
4099
4100 // calculate the union of the map ranges we need to scan
4101 epoch_t end_epoch = superblock.oldest_map;
4102 epoch_t cur_epoch = superblock.newest_map;
4103 {
4104 RWLock::RLocker l(pg_map_lock);
4105 for (ceph::unordered_map<spg_t, PG*>::iterator i = pg_map.begin();
4106 i != pg_map.end();
4107 ++i) {
4108 PG *pg = i->second;
4109
4110 // Ignore PGs only partially created (DNE)
4111 if (pg->info.dne()) {
4112 continue;
4113 }
4114
4115 auto rpib = pg->get_required_past_interval_bounds(
4116 pg->info,
4117 superblock.oldest_map);
4118 if (rpib.first >= rpib.second && pg->past_intervals.empty()) {
4119 if (pg->info.history.same_interval_since == 0) {
4120 pg->info.history.same_interval_since = rpib.second;
4121 }
4122 continue;
4123 } else {
4124 auto apib = pg->past_intervals.get_bounds();
4125 if (apib.second >= rpib.second &&
4126 apib.first <= rpib.first) {
4127 if (pg->info.history.same_interval_since == 0) {
4128 pg->info.history.same_interval_since = rpib.second;
4129 }
4130 continue;
4131 }
4132 }
4133
4134 dout(10) << pg->info.pgid << " needs " << rpib.first << "-"
4135 << rpib.second << dendl;
4136 pistate& p = pis[pg];
4137 p.start = rpib.first;
4138 p.end = rpib.second;
4139 p.same_interval_since = 0;
4140
4141 if (rpib.first < cur_epoch)
4142 cur_epoch = rpib.first;
4143 if (rpib.second > end_epoch)
4144 end_epoch = rpib.second;
4145 }
4146 }
4147 if (pis.empty()) {
4148 dout(10) << __func__ << " nothing to build" << dendl;
4149 return;
4150 }
4151
4152 dout(1) << __func__ << " over " << cur_epoch << "-" << end_epoch << dendl;
4153 assert(cur_epoch <= end_epoch);
4154
4155 OSDMapRef cur_map, last_map;
4156 for ( ; cur_epoch <= end_epoch; cur_epoch++) {
4157 dout(10) << __func__ << " epoch " << cur_epoch << dendl;
4158 last_map = cur_map;
4159 cur_map = get_map(cur_epoch);
4160
4161 for (map<PG*,pistate>::iterator i = pis.begin(); i != pis.end(); ++i) {
4162 PG *pg = i->first;
4163 pistate& p = i->second;
4164
4165 if (cur_epoch < p.start || cur_epoch > p.end)
4166 continue;
4167
4168 vector<int> acting, up;
4169 int up_primary;
4170 int primary;
4171 pg_t pgid = pg->info.pgid.pgid;
4172 if (p.same_interval_since && last_map->get_pools().count(pgid.pool()))
4173 pgid = pgid.get_ancestor(last_map->get_pg_num(pgid.pool()));
4174 cur_map->pg_to_up_acting_osds(
4175 pgid, &up, &up_primary, &acting, &primary);
4176
4177 if (p.same_interval_since == 0) {
4178 dout(10) << __func__ << " epoch " << cur_epoch << " pg " << pg->info.pgid
4179 << " first map, acting " << acting
4180 << " up " << up << ", same_interval_since = " << cur_epoch << dendl;
4181 p.same_interval_since = cur_epoch;
4182 p.old_up = up;
4183 p.old_acting = acting;
4184 p.primary = primary;
4185 p.up_primary = up_primary;
4186 continue;
4187 }
4188 assert(last_map);
4189
4190 boost::scoped_ptr<IsPGRecoverablePredicate> recoverable(
4191 pg->get_is_recoverable_predicate());
4192 std::stringstream debug;
4193 bool new_interval = PastIntervals::check_new_interval(
4194 p.primary,
4195 primary,
4196 p.old_acting, acting,
4197 p.up_primary,
4198 up_primary,
4199 p.old_up, up,
4200 p.same_interval_since,
4201 pg->info.history.last_epoch_clean,
4202 cur_map, last_map,
4203 pgid,
4204 recoverable.get(),
4205 &pg->past_intervals,
4206 &debug);
4207 if (new_interval) {
4208 dout(10) << __func__ << " epoch " << cur_epoch << " pg " << pg->info.pgid
4209 << " " << debug.str() << dendl;
4210 p.old_up = up;
4211 p.old_acting = acting;
4212 p.primary = primary;
4213 p.up_primary = up_primary;
4214 p.same_interval_since = cur_epoch;
4215 }
4216 }
4217 }
4218
4219 // Now that past_intervals have been recomputed, fix same_interval_since
4220 // if it was cleared by an import.
4221 for (map<PG*,pistate>::iterator i = pis.begin(); i != pis.end(); ++i) {
4222 PG *pg = i->first;
4223 pistate& p = i->second;
4224
4225 if (pg->info.history.same_interval_since == 0) {
4226 assert(p.same_interval_since);
4227 dout(10) << __func__ << " fix same_interval_since " << p.same_interval_since << " pg " << *pg << dendl;
4228 dout(10) << __func__ << " past_intervals " << pg->past_intervals << dendl;
4229 // Fix it
4230 pg->info.history.same_interval_since = p.same_interval_since;
4231 }
4232 }
4233
4234 // write info only at the end. this is necessary because we check
4235 // whether the past_intervals go far enough back or forward in time,
4236 // but we don't check for holes. we could avoid it by discarding
4237 // the previous past_intervals and rebuilding from scratch, or we
4238 // can just do this and commit all our work at the end.
4239 ObjectStore::Transaction t;
4240 int num = 0;
4241 for (map<PG*,pistate>::iterator i = pis.begin(); i != pis.end(); ++i) {
4242 PG *pg = i->first;
4243 pg->lock();
4244 pg->dirty_big_info = true;
4245 pg->dirty_info = true;
4246 pg->write_if_dirty(t);
4247 pg->unlock();
4248
4249 // don't let the transaction get too big
4250 if (++num >= cct->_conf->osd_target_transaction_size) {
4251 store->apply_transaction(service.meta_osr.get(), std::move(t));
4252 t = ObjectStore::Transaction();
4253 num = 0;
4254 }
4255 }
4256 if (!t.empty())
4257 store->apply_transaction(service.meta_osr.get(), std::move(t));
4258 }
4259
4260 /*
4261 * look up a pg. if we have it, great. if not, consider creating it IF the pg mapping
4262 * hasn't changed since the given epoch and we are the primary.
4263 */
4264 int OSD::handle_pg_peering_evt(
4265 spg_t pgid,
4266 const pg_history_t& orig_history,
4267 const PastIntervals& pi,
4268 epoch_t epoch,
4269 PG::CephPeeringEvtRef evt)
4270 {
4271 if (service.splitting(pgid)) {
4272 peering_wait_for_split[pgid].push_back(evt);
4273 return -EEXIST;
4274 }
4275
4276 PG *pg = _lookup_lock_pg(pgid);
4277 if (!pg) {
4278 // same primary?
4279 if (!osdmap->have_pg_pool(pgid.pool()))
4280 return -EINVAL;
4281 int up_primary, acting_primary;
4282 vector<int> up, acting;
4283 osdmap->pg_to_up_acting_osds(
4284 pgid.pgid, &up, &up_primary, &acting, &acting_primary);
4285
4286 pg_history_t history = orig_history;
4287 bool valid_history = project_pg_history(
4288 pgid, history, epoch, up, up_primary, acting, acting_primary);
4289
4290 if (!valid_history || epoch < history.same_interval_since) {
4291 dout(10) << __func__ << pgid << " acting changed in "
4292 << history.same_interval_since << " (msg from " << epoch << ")"
4293 << dendl;
4294 return -EINVAL;
4295 }
4296
4297 if (service.splitting(pgid)) {
4298 ceph_abort();
4299 }
4300
4301 const bool is_mon_create =
4302 evt->get_event().dynamic_type() == PG::NullEvt::static_type();
4303 if (maybe_wait_for_max_pg(pgid, is_mon_create)) {
4304 return -EAGAIN;
4305 }
4306 // do we need to resurrect a deleting pg?
4307 spg_t resurrected;
4308 PGRef old_pg_state;
4309 res_result result = _try_resurrect_pg(
4310 service.get_osdmap(),
4311 pgid,
4312 &resurrected,
4313 &old_pg_state);
4314
4315 PG::RecoveryCtx rctx = create_context();
4316 switch (result) {
4317 case RES_NONE: {
4318 const pg_pool_t* pp = osdmap->get_pg_pool(pgid.pool());
4319 if (pp->has_flag(pg_pool_t::FLAG_EC_OVERWRITES) &&
4320 store->get_type() != "bluestore") {
4321 clog->warn() << "pg " << pgid
4322 << " is at risk of silent data corruption: "
4323 << "the pool allows ec overwrites but is not stored in "
4324 << "bluestore, so deep scrubbing will not detect bitrot";
4325 }
4326 PG::_create(*rctx.transaction, pgid, pgid.get_split_bits(pp->get_pg_num()));
4327 PG::_init(*rctx.transaction, pgid, pp);
4328
4329 int role = osdmap->calc_pg_role(whoami, acting, acting.size());
4330 if (!pp->is_replicated() && role != pgid.shard)
4331 role = -1;
4332
4333 pg = _create_lock_pg(
4334 get_map(epoch),
4335 pgid, false, false,
4336 role,
4337 up, up_primary,
4338 acting, acting_primary,
4339 history, pi,
4340 *rctx.transaction);
4341 pg->handle_create(&rctx);
4342 pg->write_if_dirty(*rctx.transaction);
4343 dispatch_context(rctx, pg, osdmap);
4344
4345 dout(10) << *pg << " is new" << dendl;
4346
4347 pg->queue_peering_event(evt);
4348 wake_pg_waiters(pg);
4349 pg->unlock();
4350 return 0;
4351 }
4352 case RES_SELF: {
4353 old_pg_state->lock();
4354 OSDMapRef old_osd_map = old_pg_state->get_osdmap();
4355 int old_role = old_pg_state->role;
4356 vector<int> old_up = old_pg_state->up;
4357 int old_up_primary = old_pg_state->up_primary.osd;
4358 vector<int> old_acting = old_pg_state->acting;
4359 int old_primary = old_pg_state->primary.osd;
4360 pg_history_t old_history = old_pg_state->info.history;
4361 PastIntervals old_past_intervals = old_pg_state->past_intervals;
4362 old_pg_state->unlock();
4363 pg = _create_lock_pg(
4364 old_osd_map,
4365 resurrected,
4366 false,
4367 true,
4368 old_role,
4369 old_up,
4370 old_up_primary,
4371 old_acting,
4372 old_primary,
4373 old_history,
4374 old_past_intervals,
4375 *rctx.transaction);
4376 pg->handle_create(&rctx);
4377 pg->write_if_dirty(*rctx.transaction);
4378 dispatch_context(rctx, pg, osdmap);
4379
4380 dout(10) << *pg << " is new (resurrected)" << dendl;
4381
4382 pg->queue_peering_event(evt);
4383 wake_pg_waiters(pg);
4384 pg->unlock();
4385 return 0;
4386 }
4387 case RES_PARENT: {
4388 assert(old_pg_state);
4389 old_pg_state->lock();
4390 OSDMapRef old_osd_map = old_pg_state->get_osdmap();
4391 int old_role = old_pg_state->role;
4392 vector<int> old_up = old_pg_state->up;
4393 int old_up_primary = old_pg_state->up_primary.osd;
4394 vector<int> old_acting = old_pg_state->acting;
4395 int old_primary = old_pg_state->primary.osd;
4396 pg_history_t old_history = old_pg_state->info.history;
4397 PastIntervals old_past_intervals = old_pg_state->past_intervals;
4398 old_pg_state->unlock();
4399 PG *parent = _create_lock_pg(
4400 old_osd_map,
4401 resurrected,
4402 false,
4403 true,
4404 old_role,
4405 old_up,
4406 old_up_primary,
4407 old_acting,
4408 old_primary,
4409 old_history,
4410 old_past_intervals,
4411 *rctx.transaction
4412 );
4413 parent->handle_create(&rctx);
4414 parent->write_if_dirty(*rctx.transaction);
4415 dispatch_context(rctx, parent, osdmap);
4416
4417 dout(10) << *parent << " is new" << dendl;
4418
4419 assert(service.splitting(pgid));
4420 peering_wait_for_split[pgid].push_back(evt);
4421
4422 //parent->queue_peering_event(evt);
4423 parent->queue_null(osdmap->get_epoch(), osdmap->get_epoch());
4424 wake_pg_waiters(parent);
4425 parent->unlock();
4426 return 0;
4427 }
4428 default:
4429 assert(0);
4430 return 0;
4431 }
4432 } else {
4433 // already had it. did the mapping change?
4434 if (epoch < pg->info.history.same_interval_since) {
4435 dout(10) << *pg << __func__ << " acting changed in "
4436 << pg->info.history.same_interval_since
4437 << " (msg from " << epoch << ")" << dendl;
4438 } else {
4439 pg->queue_peering_event(evt);
4440 }
4441 pg->unlock();
4442 return -EEXIST;
4443 }
4444 }
4445
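// maybe_wait_for_max_pg() enforces a hard cap on instantiated PGs:
//   cap = mon_max_pg_per_osd * osd_max_pg_per_osd_hard_ratio
// e.g. with mon_max_pg_per_osd = 200 and a hard ratio of 2.0 (illustrative
// values), creation is withheld once this OSD already holds 400 PGs; the
// request is parked in pending_creates_* until resume_creating_pg() frees
// capacity.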
4446 bool OSD::maybe_wait_for_max_pg(spg_t pgid, bool is_mon_create)
4447 {
4448 const auto max_pgs_per_osd =
4449 (cct->_conf->get_val<uint64_t>("mon_max_pg_per_osd") *
4450 cct->_conf->get_val<double>("osd_max_pg_per_osd_hard_ratio"));
4451
4452 RWLock::RLocker pg_map_locker{pg_map_lock};
4453 if (pg_map.size() < max_pgs_per_osd) {
4454 return false;
4455 }
4456 lock_guard<mutex> pending_creates_locker{pending_creates_lock};
4457 if (is_mon_create) {
4458 pending_creates_from_mon++;
4459 } else {
4460 bool is_primary = osdmap->get_pg_acting_rank(pgid.pgid, whoami) == 0;
4461 pending_creates_from_osd.emplace(pgid.pgid, is_primary);
4462 }
4463 dout(5) << __func__ << " withhold creation of pg " << pgid
4464 << ": " << pg_map.size() << " >= "<< max_pgs_per_osd << dendl;
4465 return true;
4466 }
4467
4468 // to re-trigger a peering, we have to twiddle the pg mapping a little bit,
4469 // see PG::should_restart_peering(). OSDMap::pg_to_up_acting_osds() falls
4470 // back to the up set if pg_temp is empty, so an empty pg_temp won't work.
4471 static vector<int32_t> twiddle(const vector<int>& acting) {
4472 if (acting.size() > 1) {
4473 return {acting[0]};
4474 } else {
4475 vector<int32_t> twiddled(acting.begin(), acting.end());
4476 twiddled.push_back(-1);
4477 return twiddled;
4478 }
4479 }
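// e.g. twiddle({3,5,7}) -> {3} (shrink the set), while twiddle({3}) ->
// {3,-1} (pad with -1), so the resulting pg_temp always differs from the
// current mapping and forces should_restart_peering() to fire.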
4480
4481 void OSD::resume_creating_pg()
4482 {
4483 bool do_sub_pg_creates = false;
4484 bool have_pending_creates = false;
4485 MOSDPGTemp *pgtemp = nullptr;
4486 {
4487 const auto max_pgs_per_osd =
4488 (cct->_conf->get_val<uint64_t>("mon_max_pg_per_osd") *
4489 cct->_conf->get_val<double>("osd_max_pg_per_osd_hard_ratio"));
4490 RWLock::RLocker l(pg_map_lock);
4491 if (max_pgs_per_osd <= pg_map.size()) {
4492 // this can happen if the admin decreases this setting before a PG is removed
4493 return;
4494 }
4495 unsigned spare_pgs = max_pgs_per_osd - pg_map.size();
4496 lock_guard<mutex> pending_creates_locker{pending_creates_lock};
4497 if (pending_creates_from_mon > 0) {
4498 do_sub_pg_creates = true;
4499 if (pending_creates_from_mon >= spare_pgs) {
4500 spare_pgs = pending_creates_from_mon = 0;
4501 } else {
4502 spare_pgs -= pending_creates_from_mon;
4503 pending_creates_from_mon = 0;
4504 }
4505 }
4506 auto pg = pending_creates_from_osd.cbegin();
4507 while (spare_pgs > 0 && pg != pending_creates_from_osd.cend()) {
4508 if (!pgtemp) {
4509 pgtemp = new MOSDPGTemp{osdmap->get_epoch()};
4510 }
4511 vector<int> acting;
4512 osdmap->pg_to_up_acting_osds(pg->first, nullptr, nullptr, &acting, nullptr);
4513 pgtemp->pg_temp[pg->first] = twiddle(acting);
4514 pg = pending_creates_from_osd.erase(pg);
4515 spare_pgs--;
4516 }
4517 have_pending_creates = (pending_creates_from_mon > 0 ||
4518 !pending_creates_from_osd.empty());
4519 }
4520
4521 bool do_renew_subs = false;
4522 if (do_sub_pg_creates) {
4523 if (monc->sub_want("osd_pg_creates", last_pg_create_epoch, 0)) {
4524 dout(4) << __func__ << ": resolicit pg creates from mon since "
4525 << last_pg_create_epoch << dendl;
4526 do_renew_subs = true;
4527 }
4528 }
4529 version_t start = osdmap->get_epoch() + 1;
4530 if (have_pending_creates) {
4531 // don't miss any new osdmaps that delete PGs
4532 if (monc->sub_want("osdmap", start, 0)) {
4533 dout(4) << __func__ << ": resolicit osdmap from mon since "
4534 << start << dendl;
4535 do_renew_subs = true;
4536 }
4537 } else if (pgtemp || do_sub_pg_creates) {
4538 // no need to subscribe the osdmap continuously anymore
4539 // once the pgtemp and/or mon_subscribe(pg_creates) is sent
4540 if (monc->sub_want_increment("osdmap", start, CEPH_SUBSCRIBE_ONETIME)) {
4541 dout(4) << __func__ << ": re-subscribe osdmap(onetime) since"
4542 << start << dendl;
4543 do_renew_subs = true;
4544 }
4545 }
4546
4547 if (do_renew_subs) {
4548 monc->renew_subs();
4549 }
4550
4551 if (pgtemp) {
4552 pgtemp->forced = true;
4553 monc->send_mon_message(pgtemp);
4554 }
4555 }
4556
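// build_initial_pg_history() replays every osdmap from the pg's creation
// epoch to the current one, applying check_new_interval() with a
// min_size-based recoverability predicate, so a freshly created pg starts
// out with the same history and past_intervals it would have accumulated
// had it existed all along.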
4557 void OSD::build_initial_pg_history(
4558 spg_t pgid,
4559 epoch_t created,
4560 utime_t created_stamp,
4561 pg_history_t *h,
4562 PastIntervals *pi)
4563 {
4564 dout(10) << __func__ << " " << pgid << " created " << created << dendl;
4565 h->epoch_created = created;
4566 h->epoch_pool_created = created;
4567 h->same_interval_since = created;
4568 h->same_up_since = created;
4569 h->same_primary_since = created;
4570 h->last_scrub_stamp = created_stamp;
4571 h->last_deep_scrub_stamp = created_stamp;
4572 h->last_clean_scrub_stamp = created_stamp;
4573
4574 OSDMapRef lastmap = service.get_map(created);
4575 int up_primary, acting_primary;
4576 vector<int> up, acting;
4577 lastmap->pg_to_up_acting_osds(
4578 pgid.pgid, &up, &up_primary, &acting, &acting_primary);
4579
4580 ostringstream debug;
4581 for (epoch_t e = created + 1; e <= osdmap->get_epoch(); ++e) {
4582 OSDMapRef osdmap = service.get_map(e);
4583 int new_up_primary, new_acting_primary;
4584 vector<int> new_up, new_acting;
4585 osdmap->pg_to_up_acting_osds(
4586 pgid.pgid, &new_up, &new_up_primary, &new_acting, &new_acting_primary);
4587
4588 // this is a bit imprecise, but sufficient?
4589 struct min_size_predicate_t : public IsPGRecoverablePredicate {
4590 const pg_pool_t *pi;
4591 bool operator()(const set<pg_shard_t> &have) const {
4592 return have.size() >= pi->min_size;
4593 }
4594 min_size_predicate_t(const pg_pool_t *i) : pi(i) {}
4595 } min_size_predicate(osdmap->get_pg_pool(pgid.pgid.pool()));
4596
4597 bool new_interval = PastIntervals::check_new_interval(
4598 acting_primary,
4599 new_acting_primary,
4600 acting, new_acting,
4601 up_primary,
4602 new_up_primary,
4603 up, new_up,
4604 h->same_interval_since,
4605 h->last_epoch_clean,
4606 osdmap,
4607 lastmap,
4608 pgid.pgid,
4609 &min_size_predicate,
4610 pi,
4611 &debug);
4612 if (new_interval) {
4613 h->same_interval_since = e;
4614 if (up != new_up) {
4615 h->same_up_since = e;
4616 }
4617 if (acting_primary != new_acting_primary) {
4618 h->same_primary_since = e;
4619 }
4620 if (pgid.pgid.is_split(lastmap->get_pg_num(pgid.pgid.pool()),
4621 osdmap->get_pg_num(pgid.pgid.pool()),
4622 nullptr)) {
4623 h->last_epoch_split = e;
4624 }
4625 up = new_up;
4626 acting = new_acting;
4627 up_primary = new_up_primary;
4628 acting_primary = new_acting_primary;
4629 }
4630 lastmap = osdmap;
4631 }
4632 dout(20) << __func__ << " " << debug.str() << dendl;
4633 dout(10) << __func__ << " " << *h << " " << *pi
4634 << " [" << (pi->empty() ? pair<epoch_t,epoch_t>(0,0) :
4635 pi->get_bounds()) << ")"
4636 << dendl;
4637 }
4638
4639 /**
4640 * Fill in the passed history so you know same_interval_since, same_up_since,
4641 * and same_primary_since.
4642 */
4643 bool OSD::project_pg_history(spg_t pgid, pg_history_t& h, epoch_t from,
4644 const vector<int>& currentup,
4645 int currentupprimary,
4646 const vector<int>& currentacting,
4647 int currentactingprimary)
4648 {
4649 dout(15) << "project_pg_history " << pgid
4650 << " from " << from << " to " << osdmap->get_epoch()
4651 << ", start " << h
4652 << dendl;
4653
4654 epoch_t e;
4655 for (e = osdmap->get_epoch();
4656 e > from;
4657 e--) {
4658 // verify during intermediate epoch (e-1)
4659 OSDMapRef oldmap = service.try_get_map(e-1);
4660 if (!oldmap) {
4661 dout(15) << __func__ << ": found map gap, returning false" << dendl;
4662 return false;
4663 }
4664 assert(oldmap->have_pg_pool(pgid.pool()));
4665
4666 int upprimary, actingprimary;
4667 vector<int> up, acting;
4668 oldmap->pg_to_up_acting_osds(
4669 pgid.pgid,
4670 &up,
4671 &upprimary,
4672 &acting,
4673 &actingprimary);
4674
4675 // acting set change?
4676 if ((actingprimary != currentactingprimary ||
4677 upprimary != currentupprimary ||
4678 acting != currentacting ||
4679 up != currentup) && e > h.same_interval_since) {
4680 dout(15) << "project_pg_history " << pgid << " acting|up changed in " << e
4681 << " from " << acting << "/" << up
4682 << " " << actingprimary << "/" << upprimary
4683 << " -> " << currentacting << "/" << currentup
4684 << " " << currentactingprimary << "/" << currentupprimary
4685 << dendl;
4686 h.same_interval_since = e;
4687 }
4688 // split?
4689 if (pgid.is_split(oldmap->get_pg_num(pgid.pool()),
4690 osdmap->get_pg_num(pgid.pool()),
4691 0) && e > h.same_interval_since) {
4692 h.same_interval_since = e;
4693 }
4694 // up set change?
4695 if ((up != currentup || upprimary != currentupprimary)
4696 && e > h.same_up_since) {
4697 dout(15) << "project_pg_history " << pgid << " up changed in " << e
4698 << " from " << up << " " << upprimary
4699 << " -> " << currentup << " " << currentupprimary << dendl;
4700 h.same_up_since = e;
4701 }
4702
4703 // primary change?
4704 if (OSDMap::primary_changed(
4705 actingprimary,
4706 acting,
4707 currentactingprimary,
4708 currentacting) &&
4709 e > h.same_primary_since) {
4710 dout(15) << "project_pg_history " << pgid << " primary changed in " << e << dendl;
4711 h.same_primary_since = e;
4712 }
4713
4714 if (h.same_interval_since >= e && h.same_up_since >= e && h.same_primary_since >= e)
4715 break;
4716 }
4717
4718 // base case: these floors should be the pg creation epoch if we didn't
4719 // find any changes.
4720 if (e == h.epoch_created) {
4721 if (!h.same_interval_since)
4722 h.same_interval_since = e;
4723 if (!h.same_up_since)
4724 h.same_up_since = e;
4725 if (!h.same_primary_since)
4726 h.same_primary_since = e;
4727 }
4728
4729 dout(15) << "project_pg_history end " << h << dendl;
4730 return true;
4731 }
4732
4733
4734
4735 void OSD::_add_heartbeat_peer(int p)
4736 {
4737 if (p == whoami)
4738 return;
4739 HeartbeatInfo *hi;
4740
4741 map<int,HeartbeatInfo>::iterator i = heartbeat_peers.find(p);
4742 if (i == heartbeat_peers.end()) {
4743 pair<ConnectionRef,ConnectionRef> cons = service.get_con_osd_hb(p, osdmap->get_epoch());
4744 if (!cons.first)
4745 return;
4746 hi = &heartbeat_peers[p];
4747 hi->peer = p;
4748 HeartbeatSession *s = new HeartbeatSession(p);
4749 hi->con_back = cons.first.get();
4750 hi->con_back->set_priv(s->get());
4751 if (cons.second) {
4752 hi->con_front = cons.second.get();
4753 hi->con_front->set_priv(s->get());
4754 dout(10) << "_add_heartbeat_peer: new peer osd." << p
4755 << " " << hi->con_back->get_peer_addr()
4756 << " " << hi->con_front->get_peer_addr()
4757 << dendl;
4758 } else {
4759 hi->con_front.reset(NULL);
4760 dout(10) << "_add_heartbeat_peer: new peer osd." << p
4761 << " " << hi->con_back->get_peer_addr()
4762 << dendl;
4763 }
4764 s->put();
4765 } else {
4766 hi = &i->second;
4767 }
4768 hi->epoch = osdmap->get_epoch();
4769 }
4770
4771 void OSD::_remove_heartbeat_peer(int n)
4772 {
4773 map<int,HeartbeatInfo>::iterator q = heartbeat_peers.find(n);
4774 assert(q != heartbeat_peers.end());
4775 dout(20) << " removing heartbeat peer osd." << n
4776 << " " << q->second.con_back->get_peer_addr()
4777 << " " << (q->second.con_front ? q->second.con_front->get_peer_addr() : entity_addr_t())
4778 << dendl;
4779 q->second.con_back->mark_down();
4780 if (q->second.con_front) {
4781 q->second.con_front->mark_down();
4782 }
4783 heartbeat_peers.erase(q);
4784 }
4785
4786 void OSD::need_heartbeat_peer_update()
4787 {
4788 if (is_stopping())
4789 return;
4790 dout(20) << "need_heartbeat_peer_update" << dendl;
4791 heartbeat_set_peers_need_update();
4792 }
4793
4794 void OSD::maybe_update_heartbeat_peers()
4795 {
4796 assert(osd_lock.is_locked());
4797
4798 if (is_waiting_for_healthy()) {
4799 utime_t now = ceph_clock_now();
4800 if (last_heartbeat_resample == utime_t()) {
4801 last_heartbeat_resample = now;
4802 heartbeat_set_peers_need_update();
4803 } else if (!heartbeat_peers_need_update()) {
4804 utime_t dur = now - last_heartbeat_resample;
4805 if (dur > cct->_conf->osd_heartbeat_grace) {
4806 dout(10) << "maybe_update_heartbeat_peers forcing update after " << dur << " seconds" << dendl;
4807 heartbeat_set_peers_need_update();
4808 last_heartbeat_resample = now;
4809 reset_heartbeat_peers(); // we want *new* peers!
4810 }
4811 }
4812 }
4813
4814 if (!heartbeat_peers_need_update())
4815 return;
4816 heartbeat_clear_peers_need_update();
4817
4818 Mutex::Locker l(heartbeat_lock);
4819
4820 dout(10) << "maybe_update_heartbeat_peers updating" << dendl;
4821
4822
4823 // build heartbeat from set
4824 if (is_active()) {
4825 RWLock::RLocker l(pg_map_lock);
4826 for (ceph::unordered_map<spg_t, PG*>::iterator i = pg_map.begin();
4827 i != pg_map.end();
4828 ++i) {
4829 PG *pg = i->second;
4830 pg->heartbeat_peer_lock.Lock();
4831 dout(20) << i->first << " heartbeat_peers " << pg->heartbeat_peers << dendl;
4832 for (set<int>::iterator p = pg->heartbeat_peers.begin();
4833 p != pg->heartbeat_peers.end();
4834 ++p)
4835 if (osdmap->is_up(*p))
4836 _add_heartbeat_peer(*p);
4837 for (set<int>::iterator p = pg->probe_targets.begin();
4838 p != pg->probe_targets.end();
4839 ++p)
4840 if (osdmap->is_up(*p))
4841 _add_heartbeat_peer(*p);
4842 pg->heartbeat_peer_lock.Unlock();
4843 }
4844 }
4845
4846 // include next and previous up osds to ensure we have a fully-connected set
4847 set<int> want, extras;
4848 int next = osdmap->get_next_up_osd_after(whoami);
4849 if (next >= 0)
4850 want.insert(next);
4851 int prev = osdmap->get_previous_up_osd_before(whoami);
4852 if (prev >= 0 && prev != next)
4853 want.insert(prev);
4854
4855 for (set<int>::iterator p = want.begin(); p != want.end(); ++p) {
4856 dout(10) << " adding neighbor peer osd." << *p << dendl;
4857 extras.insert(*p);
4858 _add_heartbeat_peer(*p);
4859 }
4860
4861 // remove down peers; enumerate extras
4862 map<int,HeartbeatInfo>::iterator p = heartbeat_peers.begin();
4863 while (p != heartbeat_peers.end()) {
4864 if (!osdmap->is_up(p->first)) {
4865 int o = p->first;
4866 ++p;
4867 _remove_heartbeat_peer(o);
4868 continue;
4869 }
4870 if (p->second.epoch < osdmap->get_epoch()) {
4871 extras.insert(p->first);
4872 }
4873 ++p;
4874 }
4875
4876 // too few?
4877 int start = osdmap->get_next_up_osd_after(whoami);
4878 for (int n = start; n >= 0; ) {
4879 if ((int)heartbeat_peers.size() >= cct->_conf->osd_heartbeat_min_peers)
4880 break;
4881 if (!extras.count(n) && !want.count(n) && n != whoami) {
4882 dout(10) << " adding random peer osd." << n << dendl;
4883 extras.insert(n);
4884 _add_heartbeat_peer(n);
4885 }
4886 n = osdmap->get_next_up_osd_after(n);
4887 if (n == start)
4888 break; // came full circle; stop
4889 }
4890
4891 // too many?
4892 for (set<int>::iterator p = extras.begin();
4893 (int)heartbeat_peers.size() > cct->_conf->osd_heartbeat_min_peers && p != extras.end();
4894 ++p) {
4895 if (want.count(*p))
4896 continue;
4897 _remove_heartbeat_peer(*p);
4898 }
4899
4900 dout(10) << "maybe_update_heartbeat_peers " << heartbeat_peers.size() << " peers, extras " << extras << dendl;
4901 }
4902
4903 void OSD::reset_heartbeat_peers()
4904 {
4905 assert(osd_lock.is_locked());
4906 dout(10) << "reset_heartbeat_peers" << dendl;
4907 Mutex::Locker l(heartbeat_lock);
4908 while (!heartbeat_peers.empty()) {
4909 HeartbeatInfo& hi = heartbeat_peers.begin()->second;
4910 hi.con_back->mark_down();
4911 if (hi.con_front) {
4912 hi.con_front->mark_down();
4913 }
4914 heartbeat_peers.erase(heartbeat_peers.begin());
4915 }
4916 failure_queue.clear();
4917 }
4918
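// handle_osd_ping() implements the three-message heartbeat protocol:
//   PING       - a peer probes us; reply with PING_REPLY echoing m->stamp
//   PING_REPLY - record last_rx_back/last_rx_front and cancel any failure
//                report we had queued or in flight for that peer
//   YOU_DIED   - a peer saw us marked down; subscribe to a newer osdmap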
4919 void OSD::handle_osd_ping(MOSDPing *m)
4920 {
4921 if (superblock.cluster_fsid != m->fsid) {
4922 dout(20) << "handle_osd_ping from " << m->get_source_inst()
4923 << " bad fsid " << m->fsid << " != " << superblock.cluster_fsid << dendl;
4924 m->put();
4925 return;
4926 }
4927
4928 int from = m->get_source().num();
4929
4930 heartbeat_lock.Lock();
4931 if (is_stopping()) {
4932 heartbeat_lock.Unlock();
4933 m->put();
4934 return;
4935 }
4936
4937 OSDMapRef curmap = service.get_osdmap();
4938 if (!curmap) {
4939 heartbeat_lock.Unlock();
4940 m->put();
4941 return;
4942 }
4943
4944 switch (m->op) {
4945
4946 case MOSDPing::PING:
4947 {
4948 if (cct->_conf->osd_debug_drop_ping_probability > 0) {
4949 auto heartbeat_drop = debug_heartbeat_drops_remaining.find(from);
4950 if (heartbeat_drop != debug_heartbeat_drops_remaining.end()) {
4951 if (heartbeat_drop->second == 0) {
4952 debug_heartbeat_drops_remaining.erase(heartbeat_drop);
4953 } else {
4954 --heartbeat_drop->second;
4955 dout(5) << "Dropping heartbeat from " << from
4956 << ", " << heartbeat_drop->second
4957 << " remaining to drop" << dendl;
4958 break;
4959 }
4960 } else if (cct->_conf->osd_debug_drop_ping_probability >
4961 ((((double)(rand()%100))/100.0))) {
4962 heartbeat_drop =
4963 debug_heartbeat_drops_remaining.insert(std::make_pair(from,
4964 cct->_conf->osd_debug_drop_ping_duration)).first;
4965 dout(5) << "Dropping heartbeat from " << from
4966 << ", " << heartbeat_drop->second
4967 << " remaining to drop" << dendl;
4968 break;
4969 }
4970 }
4971
4972 if (!cct->get_heartbeat_map()->is_healthy()) {
4973 dout(10) << "internal heartbeat not healthy, dropping ping request" << dendl;
4974 break;
4975 }
4976
4977 Message *r = new MOSDPing(monc->get_fsid(),
4978 curmap->get_epoch(),
4979 MOSDPing::PING_REPLY, m->stamp,
4980 cct->_conf->osd_heartbeat_min_size);
4981 m->get_connection()->send_message(r);
4982
4983 if (curmap->is_up(from)) {
4984 service.note_peer_epoch(from, m->map_epoch);
4985 if (is_active()) {
4986 ConnectionRef con = service.get_con_osd_cluster(from, curmap->get_epoch());
4987 if (con) {
4988 service.share_map_peer(from, con.get());
4989 }
4990 }
4991 } else if (!curmap->exists(from) ||
4992 curmap->get_down_at(from) > m->map_epoch) {
4993 // tell them they have died
4994 Message *r = new MOSDPing(monc->get_fsid(),
4995 curmap->get_epoch(),
4996 MOSDPing::YOU_DIED,
4997 m->stamp,
4998 cct->_conf->osd_heartbeat_min_size);
4999 m->get_connection()->send_message(r);
5000 }
5001 }
5002 break;
5003
5004 case MOSDPing::PING_REPLY:
5005 {
5006 map<int,HeartbeatInfo>::iterator i = heartbeat_peers.find(from);
5007 if (i != heartbeat_peers.end()) {
5008 if (m->get_connection() == i->second.con_back) {
5009 dout(25) << "handle_osd_ping got reply from osd." << from
5010 << " first_tx " << i->second.first_tx
5011 << " last_tx " << i->second.last_tx
5012 << " last_rx_back " << i->second.last_rx_back << " -> " << m->stamp
5013 << " last_rx_front " << i->second.last_rx_front
5014 << dendl;
5015 i->second.last_rx_back = m->stamp;
5016 // if there is no front con, set both stamps.
5017 if (i->second.con_front == NULL)
5018 i->second.last_rx_front = m->stamp;
5019 } else if (m->get_connection() == i->second.con_front) {
5020 dout(25) << "handle_osd_ping got reply from osd." << from
5021 << " first_tx " << i->second.first_tx
5022 << " last_tx " << i->second.last_tx
5023 << " last_rx_back " << i->second.last_rx_back
5024 << " last_rx_front " << i->second.last_rx_front << " -> " << m->stamp
5025 << dendl;
5026 i->second.last_rx_front = m->stamp;
5027 }
5028
5029 utime_t cutoff = ceph_clock_now();
5030 cutoff -= cct->_conf->osd_heartbeat_grace;
5031 if (i->second.is_healthy(cutoff)) {
5032 // Cancel false reports
5033 auto failure_queue_entry = failure_queue.find(from);
5034 if (failure_queue_entry != failure_queue.end()) {
5035 dout(10) << "handle_osd_ping canceling queued "
5036 << "failure report for osd." << from << dendl;
5037 failure_queue.erase(failure_queue_entry);
5038 }
5039
5040 auto failure_pending_entry = failure_pending.find(from);
5041 if (failure_pending_entry != failure_pending.end()) {
5042 dout(10) << "handle_osd_ping canceling in-flight "
5043 << "failure report for osd." << from << dendl;
5044 send_still_alive(curmap->get_epoch(),
5045 failure_pending_entry->second.second);
5046 failure_pending.erase(failure_pending_entry);
5047 }
5048 }
5049 }
5050
5051 if (m->map_epoch &&
5052 curmap->is_up(from)) {
5053 service.note_peer_epoch(from, m->map_epoch);
5054 if (is_active()) {
5055 ConnectionRef con = service.get_con_osd_cluster(from, curmap->get_epoch());
5056 if (con) {
5057 service.share_map_peer(from, con.get());
5058 }
5059 }
5060 }
5061 }
5062 break;
5063
5064 case MOSDPing::YOU_DIED:
5065 dout(10) << "handle_osd_ping " << m->get_source_inst()
5066 << " says i am down in " << m->map_epoch << dendl;
5067 osdmap_subscribe(curmap->get_epoch()+1, false);
5068 break;
5069 }
5070
5071 heartbeat_lock.Unlock();
5072 m->put();
5073 }
5074
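// heartbeat_entry() loops with a jittered sleep: 0.5s plus a random 0-90%
// of osd_heartbeat_interval. With the (assumed) default interval of 6s,
// each cycle sleeps somewhere in roughly [0.5, 5.9] seconds, which keeps
// ping bursts from synchronizing across OSDs.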
5075 void OSD::heartbeat_entry()
5076 {
5077 Mutex::Locker l(heartbeat_lock);
5078 if (is_stopping())
5079 return;
5080 while (!heartbeat_stop) {
5081 heartbeat();
5082
5083 double wait = .5 + ((float)(rand() % 10)/10.0) * (float)cct->_conf->osd_heartbeat_interval;
5084 utime_t w;
5085 w.set_from_double(wait);
5086 dout(30) << "heartbeat_entry sleeping for " << wait << dendl;
5087 heartbeat_cond.WaitInterval(heartbeat_lock, w);
5088 if (is_stopping())
5089 return;
5090 dout(30) << "heartbeat_entry woke up" << dendl;
5091 }
5092 }
5093
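// heartbeat_check() marks a peer failed if neither its back nor front
// connection has produced a reply within osd_heartbeat_grace seconds
// (cutoff = now - grace). The failure timestamp queued for the mon is the
// last time we actually heard from the peer, or last_tx if we never did.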
5094 void OSD::heartbeat_check()
5095 {
5096 assert(heartbeat_lock.is_locked());
5097 utime_t now = ceph_clock_now();
5098
5099 // check for heartbeat replies (move me elsewhere?)
5100 utime_t cutoff = now;
5101 cutoff -= cct->_conf->osd_heartbeat_grace;
5102 for (map<int,HeartbeatInfo>::iterator p = heartbeat_peers.begin();
5103 p != heartbeat_peers.end();
5104 ++p) {
5105
5106 if (p->second.first_tx == utime_t()) {
5107 dout(25) << "heartbeat_check we haven't sent ping to osd." << p->first
5108 << "yet, skipping" << dendl;
5109 continue;
5110 }
5111
5112 dout(25) << "heartbeat_check osd." << p->first
5113 << " first_tx " << p->second.first_tx
5114 << " last_tx " << p->second.last_tx
5115 << " last_rx_back " << p->second.last_rx_back
5116 << " last_rx_front " << p->second.last_rx_front
5117 << dendl;
5118 if (p->second.is_unhealthy(cutoff)) {
5119 if (p->second.last_rx_back == utime_t() ||
5120 p->second.last_rx_front == utime_t()) {
5121 derr << "heartbeat_check: no reply from " << p->second.con_front->get_peer_addr().get_sockaddr()
5122 << " osd." << p->first << " ever on either front or back, first ping sent "
5123 << p->second.first_tx << " (cutoff " << cutoff << ")" << dendl;
5124 // fail
5125 failure_queue[p->first] = p->second.last_tx;
5126 } else {
5127 derr << "heartbeat_check: no reply from " << p->second.con_front->get_peer_addr().get_sockaddr()
5128 << " osd." << p->first << " since back " << p->second.last_rx_back
5129 << " front " << p->second.last_rx_front
5130 << " (cutoff " << cutoff << ")" << dendl;
5131 // fail
5132 failure_queue[p->first] = MIN(p->second.last_rx_back, p->second.last_rx_front);
5133 }
5134 }
5135 }
5136 }
5137
5138 void OSD::heartbeat()
5139 {
5140 dout(30) << "heartbeat" << dendl;
5141
5142 // get CPU load avg
5143 double loadavgs[1];
5144 int n_samples = 86400 / cct->_conf->osd_heartbeat_interval;
5145 if (getloadavg(loadavgs, 1) == 1) {
5146 logger->set(l_osd_loadavg, 100 * loadavgs[0]);
5147 daily_loadavg = (daily_loadavg * (n_samples - 1) + loadavgs[0]) / n_samples;
5148 dout(30) << "heartbeat: daily_loadavg " << daily_loadavg << dendl;
5149 }
5150
5151 dout(30) << "heartbeat checking stats" << dendl;
5152
5153 // refresh stats?
5154 vector<int> hb_peers;
5155 for (map<int,HeartbeatInfo>::iterator p = heartbeat_peers.begin();
5156 p != heartbeat_peers.end();
5157 ++p)
5158 hb_peers.push_back(p->first);
5159 service.update_osd_stat(hb_peers);
5160
5161 dout(5) << "heartbeat: " << service.get_osd_stat() << dendl;
5162
5163 utime_t now = ceph_clock_now();
5164
5165 // send heartbeats
5166 for (map<int,HeartbeatInfo>::iterator i = heartbeat_peers.begin();
5167 i != heartbeat_peers.end();
5168 ++i) {
5169 int peer = i->first;
5170 i->second.last_tx = now;
5171 if (i->second.first_tx == utime_t())
5172 i->second.first_tx = now;
5173 dout(30) << "heartbeat sending ping to osd." << peer << dendl;
5174 i->second.con_back->send_message(new MOSDPing(monc->get_fsid(),
5175 service.get_osdmap()->get_epoch(),
5176 MOSDPing::PING, now,
5177 cct->_conf->osd_heartbeat_min_size));
5178
5179 if (i->second.con_front)
5180 i->second.con_front->send_message(new MOSDPing(monc->get_fsid(),
5181 service.get_osdmap()->get_epoch(),
5182 MOSDPing::PING, now,
5183 cct->_conf->osd_heartbeat_min_size));
5184 }
5185
5186 logger->set(l_osd_hb_to, heartbeat_peers.size());
5187
5188 // hmm.. am i all alone?
5189 dout(30) << "heartbeat lonely?" << dendl;
5190 if (heartbeat_peers.empty()) {
5191 if (now - last_mon_heartbeat > cct->_conf->osd_mon_heartbeat_interval && is_active()) {
5192 last_mon_heartbeat = now;
5193 dout(10) << "i have no heartbeat peers; checking mon for new map" << dendl;
5194 osdmap_subscribe(osdmap->get_epoch() + 1, false);
5195 }
5196 }
5197
5198 dout(30) << "heartbeat done" << dendl;
5199 }
5200
5201 bool OSD::heartbeat_reset(Connection *con)
5202 {
5203 HeartbeatSession *s = static_cast<HeartbeatSession*>(con->get_priv());
5204 if (s) {
5205 heartbeat_lock.Lock();
5206 if (is_stopping()) {
5207 heartbeat_lock.Unlock();
5208 s->put();
5209 return true;
5210 }
5211 map<int,HeartbeatInfo>::iterator p = heartbeat_peers.find(s->peer);
5212 if (p != heartbeat_peers.end() &&
5213 (p->second.con_back == con ||
5214 p->second.con_front == con)) {
5215 dout(10) << "heartbeat_reset failed hb con " << con << " for osd." << p->second.peer
5216 << ", reopening" << dendl;
5217 if (con != p->second.con_back) {
5218 p->second.con_back->mark_down();
5219 }
5220 p->second.con_back.reset(NULL);
5221 if (p->second.con_front && con != p->second.con_front) {
5222 p->second.con_front->mark_down();
5223 }
5224 p->second.con_front.reset(NULL);
5225 pair<ConnectionRef,ConnectionRef> newcon = service.get_con_osd_hb(p->second.peer, p->second.epoch);
5226 if (newcon.first) {
5227 p->second.con_back = newcon.first.get();
5228 p->second.con_back->set_priv(s->get());
5229 if (newcon.second) {
5230 p->second.con_front = newcon.second.get();
5231 p->second.con_front->set_priv(s->get());
5232 }
5233 } else {
5234 dout(10) << "heartbeat_reset failed hb con " << con << " for osd." << p->second.peer
5235 << ", raced with osdmap update, closing out peer" << dendl;
5236 heartbeat_peers.erase(p);
5237 }
5238 } else {
5239 dout(10) << "heartbeat_reset closing (old) failed hb con " << con << dendl;
5240 }
5241 heartbeat_lock.Unlock();
5242 s->put();
5243 }
5244 return true;
5245 }
5246
5247
5248
5249 // =========================================
5250
5251 void OSD::tick()
5252 {
5253 assert(osd_lock.is_locked());
5254 dout(10) << "tick" << dendl;
5255
5256 if (is_active() || is_waiting_for_healthy()) {
5257 maybe_update_heartbeat_peers();
5258 }
5259
5260 if (is_waiting_for_healthy()) {
5261 start_boot();
5262 } else if (is_preboot() &&
5263 waiting_for_luminous_mons &&
5264 monc->monmap.get_required_features().contains_all(
5265 ceph::features::mon::FEATURE_LUMINOUS)) {
5266 // mon upgrade finished!
5267 start_boot();
5268 }
5269
5270 do_waiters();
5271
5272 tick_timer.add_event_after(OSD_TICK_INTERVAL, new C_Tick(this));
5273 }
5274
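// tick_without_osd_lock: periodic work that must not block on osd_lock --
// buffer/crc perf counters, heartbeat checks, mon stat reports, scrub
// scheduling, pg-creation retries, and beacon sending.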
5275 void OSD::tick_without_osd_lock()
5276 {
5277 assert(tick_timer_lock.is_locked());
5278 dout(10) << "tick_without_osd_lock" << dendl;
5279
5280 logger->set(l_osd_buf, buffer::get_total_alloc());
5281 logger->set(l_osd_history_alloc_bytes, SHIFT_ROUND_UP(buffer::get_history_alloc_bytes(), 20));
5282 logger->set(l_osd_history_alloc_num, buffer::get_history_alloc_num());
5283 logger->set(l_osd_cached_crc, buffer::get_cached_crc());
5284 logger->set(l_osd_cached_crc_adjusted, buffer::get_cached_crc_adjusted());
5285 logger->set(l_osd_missed_crc, buffer::get_missed_crc());
5286
5287 // osd_lock is not being held, which means the OSD state
5288 // might change when doing the monitor report
5289 if (is_active() || is_waiting_for_healthy()) {
5290 heartbeat_lock.Lock();
5291 heartbeat_check();
5292 heartbeat_lock.Unlock();
5293
5294 map_lock.get_read();
5295 Mutex::Locker l(mon_report_lock);
5296
5297 // mon report?
5298 bool reset = false;
5299 bool report = false;
5300 utime_t now = ceph_clock_now();
5301 pg_stat_queue_lock.Lock();
5302 double backoff = stats_ack_timeout / cct->_conf->osd_mon_ack_timeout;
5303 double adjusted_min = cct->_conf->osd_mon_report_interval_min * backoff;
5304 // note: we shouldn't adjust max because it must remain < the
5305 // mon's mon_osd_report_timeout (which defaults to 1.5x our
5306 // value).
5307 double max = cct->_conf->osd_mon_report_interval_max;
5308 if (!outstanding_pg_stats.empty() &&
5309 (now - stats_ack_timeout) > last_pg_stats_ack) {
5310 dout(1) << __func__ << " mon hasn't acked PGStats in "
5311 << now - last_pg_stats_ack
5312 << " seconds, reconnecting elsewhere" << dendl;
5313 reset = true;
5314 last_pg_stats_ack = now; // reset clock
5315 last_pg_stats_sent = utime_t();
5316 stats_ack_timeout =
5317 MAX(cct->_conf->osd_mon_ack_timeout,
5318 stats_ack_timeout * cct->_conf->osd_stats_ack_timeout_factor);
5319 outstanding_pg_stats.clear();
5320 }
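// the ack timeout thus backs off multiplicatively (TCP-style) each
// time the mon misses an ack window, and adjusted_min above stretches
// the report interval by the same factor; the matching decay happens
// in handle_pg_stats_ack().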
5321 if (now - last_pg_stats_sent > max) {
5322 osd_stat_updated = true;
5323 report = true;
5324 } else if (service.need_fullness_update()) {
5325 report = true;
5326 } else if ((int)outstanding_pg_stats.size() >=
5327 cct->_conf->osd_mon_report_max_in_flight) {
5328 dout(20) << __func__ << " have max " << outstanding_pg_stats
5329 << " stats updates in flight" << dendl;
5330 } else {
5331 if (now - last_mon_report > adjusted_min) {
5332 dout(20) << __func__ << " stats backoff " << backoff
5333 << " adjusted_min " << adjusted_min << " - sending report"
5334 << dendl;
5335 osd_stat_updated = true;
5336 report = true;
5337 }
5338 }
5339 pg_stat_queue_lock.Unlock();
5340
5341 if (reset) {
5342 monc->reopen_session();
5343 } else if (report) {
5344 last_mon_report = now;
5345
5346 // do any pending reports
5347 send_full_update();
5348 send_failures();
5349 if (osdmap->require_osd_release < CEPH_RELEASE_LUMINOUS) {
5350 send_pg_stats(now);
5351 }
5352 }
5353 map_lock.put_read();
5354 }
5355
5356 if (is_active()) {
5357 if (!scrub_random_backoff()) {
5358 sched_scrub();
5359 }
5360 service.promote_throttle_recalibrate();
5361 resume_creating_pg();
5362 bool need_send_beacon = false;
5363 const auto now = ceph::coarse_mono_clock::now();
5364 {
5365 // borrow the lec (min_last_epoch_clean) lock to protect last_sent_beacon from changing
5366 Mutex::Locker l{min_last_epoch_clean_lock};
5367 const auto elapsed = now - last_sent_beacon;
5368 if (chrono::duration_cast<chrono::seconds>(elapsed).count() >
5369 cct->_conf->osd_beacon_report_interval) {
5370 need_send_beacon = true;
5371 }
5372 }
5373 if (need_send_beacon) {
5374 send_beacon(now);
5375 }
5376 }
5377
5378 mgrc.update_osd_health(get_health_metrics());
5379 service.kick_recovery_queue();
5380 tick_timer_without_osd_lock.add_event_after(OSD_TICK_INTERVAL, new C_Tick_WithoutOSDLock(this));
5381 }
5382
5383 void OSD::check_ops_in_flight()
5384 {
5385 vector<string> warnings;
5386 if (op_tracker.check_ops_in_flight(warnings)) {
5387 for (vector<string>::iterator i = warnings.begin();
5388 i != warnings.end();
5389 ++i) {
5390 clog->warn() << *i;
5391 }
5392 }
5393 }
5394
5395 // Usage:
5396 // setomapval <pool-id> [namespace/]<obj-name> <key> <val>
5397 // rmomapkey <pool-id> [namespace/]<obj-name> <key>
5398 // setomapheader <pool-id> [namespace/]<obj-name> <header>
5399 // getomap <pool-id> [namespace/]<obj-name>
5400 // truncobj <pool-id> [namespace/]<obj-name> <newlen>
5401 // injectmdataerr [namespace/]<obj-name> [shardid]
5402 // injectdataerr [namespace/]<obj-name> [shardid]
5403 //
5404 // set_recovery_delay [utime]
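// For illustration (hypothetical pool id, object name, and values), these
// test ops are reached through the OSD's admin socket, e.g.:
//   ceph daemon osd.0 setomapval 1 myobject somekey somevalue
//   ceph daemon osd.0 getomap 1 myobject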
5405 void TestOpsSocketHook::test_ops(OSDService *service, ObjectStore *store,
5406 const std::string &command, cmdmap_t& cmdmap, ostream &ss)
5407 {
5408 // Test support:
5409 // support changing the omap on a single OSD by using the admin socket to
5410 // directly request that the OSD make a change.
5411 if (command == "setomapval" || command == "rmomapkey" ||
5412 command == "setomapheader" || command == "getomap" ||
5413 command == "truncobj" || command == "injectmdataerr" ||
5414 command == "injectdataerr"
5415 ) {
5416 pg_t rawpg;
5417 int64_t pool;
5418 OSDMapRef curmap = service->get_osdmap();
5419 int r = -1;
5420
5421 string poolstr;
5422
5423 cmd_getval(service->cct, cmdmap, "pool", poolstr);
5424 pool = curmap->lookup_pg_pool_name(poolstr);
5425 // If we can't find it by name, then maybe an id was specified
5426 if (pool < 0 && isdigit(poolstr[0]))
5427 pool = atoll(poolstr.c_str());
5428 if (pool < 0) {
5429 ss << "Invalid pool '" << poolstr << "''";
5430 return;
5431 }
5432
5433 string objname, nspace;
5434 cmd_getval(service->cct, cmdmap, "objname", objname);
5435 std::size_t found = objname.find_first_of('/');
5436 if (found != string::npos) {
5437 nspace = objname.substr(0, found);
5438 objname = objname.substr(found+1);
5439 }
5440 object_locator_t oloc(pool, nspace);
5441 r = curmap->object_locator_to_pg(object_t(objname), oloc, rawpg);
5442
5443 if (r < 0) {
5444 ss << "Invalid namespace/objname";
5445 return;
5446 }
5447
5448 int64_t shardid;
5449 cmd_getval(service->cct, cmdmap, "shardid", shardid, int64_t(shard_id_t::NO_SHARD));
5450 hobject_t obj(object_t(objname), string(""), CEPH_NOSNAP, rawpg.ps(), pool, nspace);
5451 ghobject_t gobj(obj, ghobject_t::NO_GEN, shard_id_t(uint8_t(shardid)));
5452 spg_t pgid(curmap->raw_pg_to_pg(rawpg), shard_id_t(shardid));
5453 if (curmap->pg_is_ec(rawpg)) {
5454 if ((command != "injectdataerr") && (command != "injectmdataerr")) {
5455 ss << "Must not call on ec pool, except injectdataerr or injectmdataerr";
5456 return;
5457 }
5458 }
5459
5460 ObjectStore::Transaction t;
5461
5462 if (command == "setomapval") {
5463 map<string, bufferlist> newattrs;
5464 bufferlist val;
5465 string key, valstr;
5466 cmd_getval(service->cct, cmdmap, "key", key);
5467 cmd_getval(service->cct, cmdmap, "val", valstr);
5468
5469 val.append(valstr);
5470 newattrs[key] = val;
5471 t.omap_setkeys(coll_t(pgid), ghobject_t(obj), newattrs);
5472 r = store->apply_transaction(service->meta_osr.get(), std::move(t));
5473 if (r < 0)
5474 ss << "error=" << r;
5475 else
5476 ss << "ok";
5477 } else if (command == "rmomapkey") {
5478 string key;
5479 set<string> keys;
5480 cmd_getval(service->cct, cmdmap, "key", key);
5481
5482 keys.insert(key);
5483 t.omap_rmkeys(coll_t(pgid), ghobject_t(obj), keys);
5484 r = store->apply_transaction(service->meta_osr.get(), std::move(t));
5485 if (r < 0)
5486 ss << "error=" << r;
5487 else
5488 ss << "ok";
5489 } else if (command == "setomapheader") {
5490 bufferlist newheader;
5491 string headerstr;
5492
5493 cmd_getval(service->cct, cmdmap, "header", headerstr);
5494 newheader.append(headerstr);
5495 t.omap_setheader(coll_t(pgid), ghobject_t(obj), newheader);
5496 r = store->apply_transaction(service->meta_osr.get(), std::move(t));
5497 if (r < 0)
5498 ss << "error=" << r;
5499 else
5500 ss << "ok";
5501 } else if (command == "getomap") {
5502 //Debug: Output entire omap
5503 bufferlist hdrbl;
5504 map<string, bufferlist> keyvals;
5505 r = store->omap_get(coll_t(pgid), ghobject_t(obj), &hdrbl, &keyvals);
5506 if (r >= 0) {
5507 ss << "header=" << string(hdrbl.c_str(), hdrbl.length());
5508 for (map<string, bufferlist>::iterator it = keyvals.begin();
5509 it != keyvals.end(); ++it)
5510 ss << " key=" << (*it).first << " val="
5511 << string((*it).second.c_str(), (*it).second.length());
5512 } else {
5513 ss << "error=" << r;
5514 }
5515 } else if (command == "truncobj") {
5516 int64_t trunclen;
5517 cmd_getval(service->cct, cmdmap, "len", trunclen);
5518 t.truncate(coll_t(pgid), ghobject_t(obj), trunclen);
5519 r = store->apply_transaction(service->meta_osr.get(), std::move(t));
5520 if (r < 0)
5521 ss << "error=" << r;
5522 else
5523 ss << "ok";
5524 } else if (command == "injectdataerr") {
5525 store->inject_data_error(gobj);
5526 ss << "ok";
5527 } else if (command == "injectmdataerr") {
5528 store->inject_mdata_error(gobj);
5529 ss << "ok";
5530 }
5531 return;
5532 }
5533 if (command == "set_recovery_delay") {
5534 int64_t delay;
5535 cmd_getval(service->cct, cmdmap, "utime", delay, (int64_t)0);
5536 ostringstream oss;
5537 oss << delay;
5538 int r = service->cct->_conf->set_val("osd_recovery_delay_start",
5539 oss.str().c_str());
5540 if (r != 0) {
5541 ss << "set_recovery_delay: error setting "
5542 << "osd_recovery_delay_start to '" << delay << "': error "
5543 << r;
5544 return;
5545 }
5546 service->cct->_conf->apply_changes(NULL);
5547 ss << "set_recovery_delay: set osd_recovery_delay_start "
5548 << "to " << service->cct->_conf->osd_recovery_delay_start;
5549 return;
5550 }
5551 if (command == "trigger_scrub") {
5552 spg_t pgid;
5553 OSDMapRef curmap = service->get_osdmap();
5554
5555 string pgidstr;
5556
5557 cmd_getval(service->cct, cmdmap, "pgid", pgidstr);
5558 if (!pgid.parse(pgidstr.c_str())) {
5559 ss << "Invalid pgid specified";
5560 return;
5561 }
5562
5563 PG *pg = service->osd->_lookup_lock_pg(pgid);
5564 if (pg == nullptr) {
5565 ss << "Can't find pg " << pgid;
5566 return;
5567 }
5568
5569 if (pg->is_primary()) {
5570 pg->unreg_next_scrub();
5571 const pg_pool_t *p = curmap->get_pg_pool(pgid.pool());
5572 double pool_scrub_max_interval = 0;
5573 p->opts.get(pool_opts_t::SCRUB_MAX_INTERVAL, &pool_scrub_max_interval);
5574 double scrub_max_interval = pool_scrub_max_interval > 0 ?
5575 pool_scrub_max_interval : g_conf->osd_scrub_max_interval;
5576 // Instead of marking must_scrub, force a scheduled scrub by backdating the last scrub stamp
5577 utime_t stamp = ceph_clock_now();
5578 stamp -= scrub_max_interval;
5579 stamp -= 100.0; // push back last scrub more for good measure
5580 pg->info.history.last_scrub_stamp = stamp;
5581 pg->reg_next_scrub();
5582 ss << "ok";
5583 } else {
5584 ss << "Not primary";
5585 }
5586 pg->unlock();
5587 return;
5588 }
5589 if (command == "injectfull") {
5590 int64_t count;
5591 string type;
5592 OSDService::s_names state;
5593 cmd_getval(service->cct, cmdmap, "type", type, string("full"));
5594 cmd_getval(service->cct, cmdmap, "count", count, (int64_t)-1);
5595 if (type == "none" || count == 0) {
5596 type = "none";
5597 count = 0;
5598 }
5599 state = service->get_full_state(type);
5600 if (state == OSDService::s_names::INVALID) {
5601 ss << "Invalid type use (none, nearfull, backfillfull, full, failsafe)";
5602 return;
5603 }
5604 service->set_injectfull(state, count);
5605 return;
5606 }
5607 ss << "Internal error - command=" << command;
5608 }
5609
5610 // =========================================
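// remove_dir: delete the objects of one collection in batches of
// osd_target_transaction_size, removing each object's snap mapping
// first. Between batches the deleting state may pause/resume so PG
// removal can yield; returns false if clearing was cancelled, and sets
// *finished once the listing has reached the end of the collection.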
5611 bool remove_dir(
5612 CephContext *cct,
5613 ObjectStore *store, SnapMapper *mapper,
5614 OSDriver *osdriver,
5615 ObjectStore::Sequencer *osr,
5616 coll_t coll, DeletingStateRef dstate,
5617 bool *finished,
5618 ThreadPool::TPHandle &handle)
5619 {
5620 vector<ghobject_t> olist;
5621 int64_t num = 0;
5622 ObjectStore::Transaction t;
5623 ghobject_t next;
5624 handle.reset_tp_timeout();
5625 store->collection_list(
5626 coll,
5627 next,
5628 ghobject_t::get_max(),
5629 store->get_ideal_list_max(),
5630 &olist,
5631 &next);
5632 generic_dout(10) << __func__ << " " << olist << dendl;
5633 // default cont to true; this is safe because the caller (OSD::RemoveWQ::_process())
5634 // will recheck the answer before it really proceeds.
5635 bool cont = true;
5636 for (vector<ghobject_t>::iterator i = olist.begin();
5637 i != olist.end();
5638 ++i) {
5639 if (i->is_pgmeta())
5640 continue;
5641 OSDriver::OSTransaction _t(osdriver->get_transaction(&t));
5642 int r = mapper->remove_oid(i->hobj, &_t);
5643 if (r != 0 && r != -ENOENT) {
5644 ceph_abort();
5645 }
5646 t.remove(coll, *i);
5647 if (++num >= cct->_conf->osd_target_transaction_size) {
5648 C_SaferCond waiter;
5649 store->queue_transaction(osr, std::move(t), &waiter);
5650 cont = dstate->pause_clearing();
5651 handle.suspend_tp_timeout();
5652 waiter.wait();
5653 handle.reset_tp_timeout();
5654 if (cont)
5655 cont = dstate->resume_clearing();
5656 if (!cont)
5657 return false;
5658 t = ObjectStore::Transaction();
5659 num = 0;
5660 }
5661 }
5662 if (num) {
5663 C_SaferCond waiter;
5664 store->queue_transaction(osr, std::move(t), &waiter);
5665 cont = dstate->pause_clearing();
5666 handle.suspend_tp_timeout();
5667 waiter.wait();
5668 handle.reset_tp_timeout();
5669 if (cont)
5670 cont = dstate->resume_clearing();
5671 }
5672 // we are finished only once the listing has reached the end of the collection
5673 *finished = next.is_max();
5674 return cont;
5675 }
5676
5677 void OSD::RemoveWQ::_process(
5678 pair<PGRef, DeletingStateRef> item,
5679 ThreadPool::TPHandle &handle)
5680 {
5681 FUNCTRACE();
5682 PGRef pg(item.first);
5683 SnapMapper &mapper = pg->snap_mapper;
5684 OSDriver &driver = pg->osdriver;
5685 coll_t coll = coll_t(pg->info.pgid);
5686 pg->osr->flush();
5687 bool finished = false;
5688
5689 if (!item.second->start_or_resume_clearing())
5690 return;
5691
5692 bool cont = remove_dir(
5693 pg->cct, store, &mapper, &driver, pg->osr.get(), coll, item.second,
5694 &finished, handle);
5695 if (!cont)
5696 return;
5697 if (!finished) {
5698 if (item.second->pause_clearing())
5699 queue_front(item);
5700 return;
5701 }
5702
5703 if (!item.second->start_deleting())
5704 return;
5705
5706 ObjectStore::Transaction t;
5707 PGLog::clear_info_log(pg->info.pgid, &t);
5708
5709 if (cct->_conf->osd_inject_failure_on_pg_removal) {
5710 generic_derr << "osd_inject_failure_on_pg_removal" << dendl;
5711 _exit(1);
5712 }
5713 t.remove_collection(coll);
5714
5715 // We need the sequencer to stick around until the op is complete
5716 store->queue_transaction(
5717 pg->osr.get(),
5718 std::move(t),
5719 0, // onapplied
5720 0, // oncommit
5721 0, // onreadable sync
5722 new ContainerContext<PGRef>(pg),
5723 TrackedOpRef());
5724
5725 item.second->finish_deleting();
5726 }
5727 // =========================================
5728
5729 void OSD::ms_handle_connect(Connection *con)
5730 {
5731 dout(10) << __func__ << " con " << con << dendl;
5732 if (con->get_peer_type() == CEPH_ENTITY_TYPE_MON) {
5733 Mutex::Locker l(osd_lock);
5734 if (is_stopping())
5735 return;
5736 dout(10) << __func__ << " on mon" << dendl;
5737
5738 if (is_preboot()) {
5739 start_boot();
5740 } else if (is_booting()) {
5741 _send_boot(); // resend boot message
5742 } else {
5743 map_lock.get_read();
5744 Mutex::Locker l2(mon_report_lock);
5745
5746 utime_t now = ceph_clock_now();
5747 last_mon_report = now;
5748
5749 // resend everything, it's a new session
5750 send_full_update();
5751 send_alive();
5752 service.requeue_pg_temp();
5753 service.send_pg_temp();
5754 requeue_failures();
5755 send_failures();
5756 if (osdmap->require_osd_release < CEPH_RELEASE_LUMINOUS) {
5757 send_pg_stats(now);
5758 }
5759
5760 map_lock.put_read();
5761 if (is_active()) {
5762 send_beacon(ceph::coarse_mono_clock::now());
5763 }
5764 }
5765
5766 // full map requests may happen while active or pre-boot
5767 if (requested_full_first) {
5768 rerequest_full_maps();
5769 }
5770 }
5771 }
5772
5773 void OSD::ms_handle_fast_connect(Connection *con)
5774 {
5775 if (con->get_peer_type() != CEPH_ENTITY_TYPE_MON &&
5776 con->get_peer_type() != CEPH_ENTITY_TYPE_MGR) {
5777 Session *s = static_cast<Session*>(con->get_priv());
5778 if (!s) {
5779 s = new Session(cct);
5780 con->set_priv(s->get());
5781 s->con = con;
5782 dout(10) << " new session (outgoing) " << s << " con=" << s->con
5783 << " addr=" << s->con->get_peer_addr() << dendl;
5784 // we don't connect to clients
5785 assert(con->get_peer_type() == CEPH_ENTITY_TYPE_OSD);
5786 s->entity_name.set_type(CEPH_ENTITY_TYPE_OSD);
5787 }
5788 s->put();
5789 }
5790 }
5791
5792 void OSD::ms_handle_fast_accept(Connection *con)
5793 {
5794 if (con->get_peer_type() != CEPH_ENTITY_TYPE_MON &&
5795 con->get_peer_type() != CEPH_ENTITY_TYPE_MGR) {
5796 Session *s = static_cast<Session*>(con->get_priv());
5797 if (!s) {
5798 s = new Session(cct);
5799 con->set_priv(s->get());
5800 s->con = con;
5801 dout(10) << "new session (incoming)" << s << " con=" << con
5802 << " addr=" << con->get_peer_addr()
5803 << " must have raced with connect" << dendl;
5804 assert(con->get_peer_type() == CEPH_ENTITY_TYPE_OSD);
5805 s->entity_name.set_type(CEPH_ENTITY_TYPE_OSD);
5806 }
5807 s->put();
5808 }
5809 }
5810
5811 bool OSD::ms_handle_reset(Connection *con)
5812 {
5813 Session *session = static_cast<Session*>(con->get_priv());
5814 dout(2) << "ms_handle_reset con " << con << " session " << session << dendl;
5815 if (!session)
5816 return false;
5817 session->wstate.reset(con);
5818 session->con.reset(NULL); // break con <-> session ref cycle
5819 // note that we break session->con *before* the session_handle_reset
5820 // cleanup below. this avoids a race between us and
5821 // PG::add_backoff, Session::check_backoff, etc.
5822 session_handle_reset(session);
5823 session->put();
5824 return true;
5825 }
5826
5827 bool OSD::ms_handle_refused(Connection *con)
5828 {
5829 if (!cct->_conf->osd_fast_fail_on_connection_refused)
5830 return false;
5831
5832 Session *session = static_cast<Session*>(con->get_priv());
5833 dout(2) << "ms_handle_refused con " << con << " session " << session << dendl;
5834 if (!session)
5835 return false;
5836 int type = con->get_peer_type();
5837 // handle only OSD failures here
5838 if (monc && (type == CEPH_ENTITY_TYPE_OSD)) {
5839 OSDMapRef osdmap = get_osdmap();
5840 if (osdmap) {
5841 int id = osdmap->identify_osd_on_all_channels(con->get_peer_addr());
5842 if (id >= 0 && osdmap->is_up(id)) {
5843 // We bypass the mon's heartbeat grace logic here, because we know the
5844 // peer is not going to respawn on its own. +1 so we won't hit any boundary case.
5845 monc->send_mon_message(new MOSDFailure(monc->get_fsid(),
5846 osdmap->get_inst(id),
5847 cct->_conf->osd_heartbeat_grace + 1,
5848 osdmap->get_epoch(),
5849 MOSDFailure::FLAG_IMMEDIATE | MOSDFailure::FLAG_FAILED
5850 ));
5851 }
5852 }
5853 }
5854 session->put();
5855 return true;
5856 }
5857
5858 struct C_OSD_GetVersion : public Context {
5859 OSD *osd;
5860 uint64_t oldest, newest;
5861 explicit C_OSD_GetVersion(OSD *o) : osd(o), oldest(0), newest(0) {}
5862 void finish(int r) override {
5863 if (r >= 0)
5864 osd->_got_mon_epochs(oldest, newest);
5865 }
5866 };
5867
5868 void OSD::start_boot()
5869 {
5870 if (!_is_healthy()) {
5871 // if we are not healthy, do not mark ourselves up (yet)
5872 dout(1) << "not healthy; waiting to boot" << dendl;
5873 if (!is_waiting_for_healthy())
5874 start_waiting_for_healthy();
5875 // send pings sooner rather than later
5876 heartbeat_kick();
5877 return;
5878 }
5879 dout(1) << __func__ << dendl;
5880 set_state(STATE_PREBOOT);
5881 waiting_for_luminous_mons = false;
5882 dout(10) << "start_boot - have maps " << superblock.oldest_map
5883 << ".." << superblock.newest_map << dendl;
5884 C_OSD_GetVersion *c = new C_OSD_GetVersion(this);
5885 monc->get_version("osdmap", &c->newest, &c->oldest, c);
5886 }
5887
5888 void OSD::_got_mon_epochs(epoch_t oldest, epoch_t newest)
5889 {
5890 Mutex::Locker l(osd_lock);
5891 if (is_preboot()) {
5892 _preboot(oldest, newest);
5893 }
5894 }
5895
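// _preboot: decide whether we can ask to be marked up. We send
// MOSDBoot only once our map is recent enough (within
// osd_map_message_max of the mon's newest) and none of the blocking
// conditions below apply (destroyed, NOUP, missing SORTBITWISE or
// REQUIRE_JEWEL flags, pre-luminous mons, stale fullness state);
// otherwise we subscribe for newer maps and try again.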
5896 void OSD::_preboot(epoch_t oldest, epoch_t newest)
5897 {
5898 assert(is_preboot());
5899 dout(10) << __func__ << " mon has osdmaps "
5900 << oldest << ".." << newest << dendl;
5901
5902 // ensure our local fullness awareness is accurate
5903 heartbeat();
5904
5905 // if our map is within recent history, try to add ourselves to the osdmap.
5906 if (osdmap->get_epoch() == 0) {
5907 derr << "waiting for initial osdmap" << dendl;
5908 } else if (osdmap->is_destroyed(whoami)) {
5909 derr << "osdmap says I am destroyed" << dendl;
5910 // leave a small margin so we don't livelock waiting to see if we
5911 // were un-destroyed.
5912 if (osdmap->get_epoch() > newest - 1) {
5913 exit(0);
5914 }
5915 } else if (osdmap->test_flag(CEPH_OSDMAP_NOUP) || osdmap->is_noup(whoami)) {
5916 derr << "osdmap NOUP flag is set, waiting for it to clear" << dendl;
5917 } else if (!osdmap->test_flag(CEPH_OSDMAP_SORTBITWISE)) {
5918 derr << "osdmap SORTBITWISE OSDMap flag is NOT set; please set it"
5919 << dendl;
5920 } else if (osdmap->require_osd_release < CEPH_RELEASE_JEWEL) {
5921 derr << "osdmap REQUIRE_JEWEL OSDMap flag is NOT set; please set it"
5922 << dendl;
5923 } else if (!monc->monmap.get_required_features().contains_all(
5924 ceph::features::mon::FEATURE_LUMINOUS)) {
5925 derr << "monmap REQUIRE_LUMINOUS is NOT set; must upgrade all monitors to "
5926 << "Luminous or later before Luminous OSDs will boot" << dendl;
5927 waiting_for_luminous_mons = true;
5928 } else if (service.need_fullness_update()) {
5929 derr << "osdmap fullness state needs update" << dendl;
5930 send_full_update();
5931 } else if (osdmap->get_epoch() >= oldest - 1 &&
5932 osdmap->get_epoch() + cct->_conf->osd_map_message_max > newest) {
5933 _send_boot();
5934 return;
5935 }
5936
5937 // get all the latest maps
5938 if (osdmap->get_epoch() + 1 >= oldest)
5939 osdmap_subscribe(osdmap->get_epoch() + 1, false);
5940 else
5941 osdmap_subscribe(oldest - 1, true);
5942 }
5943
5944 void OSD::send_full_update()
5945 {
5946 if (!service.need_fullness_update())
5947 return;
5948 unsigned state = 0;
5949 if (service.is_full()) {
5950 state = CEPH_OSD_FULL;
5951 } else if (service.is_backfillfull()) {
5952 state = CEPH_OSD_BACKFILLFULL;
5953 } else if (service.is_nearfull()) {
5954 state = CEPH_OSD_NEARFULL;
5955 }
5956 set<string> s;
5957 OSDMap::calc_state_set(state, s);
5958 dout(10) << __func__ << " want state " << s << dendl;
5959 monc->send_mon_message(new MOSDFull(osdmap->get_epoch(), state));
5960 }
5961
5962 void OSD::start_waiting_for_healthy()
5963 {
5964 dout(1) << "start_waiting_for_healthy" << dendl;
5965 set_state(STATE_WAITING_FOR_HEALTHY);
5966 last_heartbeat_resample = utime_t();
5967
5968 // subscribe to osdmap updates, in case our peers really are known to be dead
5969 osdmap_subscribe(osdmap->get_epoch() + 1, false);
5970 }
5971
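// _is_healthy: we are healthy iff the internal heartbeat map is healthy
// and, while in STATE_WAITING_FOR_HEALTHY, at least
// osd_heartbeat_min_healthy_ratio of our heartbeat peers have replied
// within the grace period.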
5972 bool OSD::_is_healthy()
5973 {
5974 if (!cct->get_heartbeat_map()->is_healthy()) {
5975 dout(1) << "is_healthy false -- internal heartbeat failed" << dendl;
5976 return false;
5977 }
5978
5979 if (is_waiting_for_healthy()) {
5980 Mutex::Locker l(heartbeat_lock);
5981 utime_t cutoff = ceph_clock_now();
5982 cutoff -= cct->_conf->osd_heartbeat_grace;
5983 int num = 0, up = 0;
5984 for (map<int,HeartbeatInfo>::iterator p = heartbeat_peers.begin();
5985 p != heartbeat_peers.end();
5986 ++p) {
5987 if (p->second.is_healthy(cutoff))
5988 ++up;
5989 ++num;
5990 }
5991 if ((float)up < (float)num * cct->_conf->osd_heartbeat_min_healthy_ratio) {
5992 dout(1) << "is_healthy false -- only " << up << "/" << num << " up peers (less than "
5993 << int(cct->_conf->osd_heartbeat_min_healthy_ratio * 100.0) << "%)" << dendl;
5994 return false;
5995 }
5996 }
5997
5998 return true;
5999 }
6000
6001 void OSD::_send_boot()
6002 {
6003 dout(10) << "_send_boot" << dendl;
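  // for each messenger bound to a blank (wildcard) IP, derive the IP from
  // an address we already know (client addr for cluster, cluster addr for
  // hb_back, client addr for hb_front) while keeping the bound port;
  // otherwise just make sure the loopback connection has a session primed.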
6004 entity_addr_t cluster_addr = cluster_messenger->get_myaddr();
6005 Connection *local_connection = cluster_messenger->get_loopback_connection().get();
6006 if (cluster_addr.is_blank_ip()) {
6007 int port = cluster_addr.get_port();
6008 cluster_addr = client_messenger->get_myaddr();
6009 cluster_addr.set_port(port);
6010 cluster_messenger->set_addr_unknowns(cluster_addr);
6011 dout(10) << " assuming cluster_addr ip matches client_addr" << dendl;
6012 } else {
6013 Session *s = static_cast<Session*>(local_connection->get_priv());
6014 if (s)
6015 s->put();
6016 else
6017 cluster_messenger->ms_deliver_handle_fast_connect(local_connection);
6018 }
6019
6020 entity_addr_t hb_back_addr = hb_back_server_messenger->get_myaddr();
6021 local_connection = hb_back_server_messenger->get_loopback_connection().get();
6022 if (hb_back_addr.is_blank_ip()) {
6023 int port = hb_back_addr.get_port();
6024 hb_back_addr = cluster_addr;
6025 hb_back_addr.set_port(port);
6026 hb_back_server_messenger->set_addr_unknowns(hb_back_addr);
6027 dout(10) << " assuming hb_back_addr ip matches cluster_addr" << dendl;
6028 } else {
6029 Session *s = static_cast<Session*>(local_connection->get_priv());
6030 if (s)
6031 s->put();
6032 else
6033 hb_back_server_messenger->ms_deliver_handle_fast_connect(local_connection);
6034 }
6035
6036 entity_addr_t hb_front_addr = hb_front_server_messenger->get_myaddr();
6037 local_connection = hb_front_server_messenger->get_loopback_connection().get();
6038 if (hb_front_addr.is_blank_ip()) {
6039 int port = hb_front_addr.get_port();
6040 hb_front_addr = client_messenger->get_myaddr();
6041 hb_front_addr.set_port(port);
6042 hb_front_server_messenger->set_addr_unknowns(hb_front_addr);
6043 dout(10) << " assuming hb_front_addr ip matches client_addr" << dendl;
6044 } else {
6045 Session *s = static_cast<Session*>(local_connection->get_priv());
6046 if (s)
6047 s->put();
6048 else
6049 hb_front_server_messenger->ms_deliver_handle_fast_connect(local_connection);
6050 }
6051
6052 MOSDBoot *mboot = new MOSDBoot(superblock, get_osdmap_epoch(), service.get_boot_epoch(),
6053 hb_back_addr, hb_front_addr, cluster_addr,
6054 CEPH_FEATURES_ALL);
6055 dout(10) << " client_addr " << client_messenger->get_myaddr()
6056 << ", cluster_addr " << cluster_addr
6057 << ", hb_back_addr " << hb_back_addr
6058 << ", hb_front_addr " << hb_front_addr
6059 << dendl;
6060 _collect_metadata(&mboot->metadata);
6061 monc->send_mon_message(mboot);
6062 set_state(STATE_BOOTING);
6063 }
6064
6065 void OSD::_collect_metadata(map<string,string> *pm)
6066 {
6067 // config info
6068 (*pm)["osd_data"] = dev_path;
6069 if (store->get_type() == "filestore") {
6070 // not applicable for bluestore
6071 (*pm)["osd_journal"] = journal_path;
6072 }
6073 (*pm)["front_addr"] = stringify(client_messenger->get_myaddr());
6074 (*pm)["back_addr"] = stringify(cluster_messenger->get_myaddr());
6075 (*pm)["hb_front_addr"] = stringify(hb_front_server_messenger->get_myaddr());
6076 (*pm)["hb_back_addr"] = stringify(hb_back_server_messenger->get_myaddr());
6077
6078 // backend
6079 (*pm)["osd_objectstore"] = store->get_type();
6080 (*pm)["rotational"] = store_is_rotational ? "1" : "0";
6081 (*pm)["journal_rotational"] = journal_is_rotational ? "1" : "0";
6082 (*pm)["default_device_class"] = store->get_default_device_class();
6083 store->collect_metadata(pm);
6084
6085 collect_sys_info(pm, cct);
6086
6087 std::string front_iface, back_iface;
6088 /*
6089 pick_iface(cct,
6090 CEPH_PICK_ADDRESS_PUBLIC | CEPH_PICK_ADDRESS_CLUSTER,
6091 &front_iface, &back_iface);
6092 */
6093 (*pm)["front_iface"] = pick_iface(cct,
6094 client_messenger->get_myaddr().get_sockaddr_storage());
6095 (*pm)["back_iface"] = pick_iface(cct,
6096 cluster_messenger->get_myaddr().get_sockaddr_storage());
6097
6098 dout(10) << __func__ << " " << *pm << dendl;
6099 }
6100
6101 void OSD::queue_want_up_thru(epoch_t want)
6102 {
6103 map_lock.get_read();
6104 epoch_t cur = osdmap->get_up_thru(whoami);
6105 Mutex::Locker l(mon_report_lock);
6106 if (want > up_thru_wanted) {
6107 dout(10) << "queue_want_up_thru now " << want << " (was " << up_thru_wanted << ")"
6108 << ", currently " << cur
6109 << dendl;
6110 up_thru_wanted = want;
6111 send_alive();
6112 } else {
6113 dout(10) << "queue_want_up_thru want " << want << " <= queued " << up_thru_wanted
6114 << ", currently " << cur
6115 << dendl;
6116 }
6117 map_lock.put_read();
6118 }
6119
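// send_alive: ask the mon to bump our up_thru field. up_thru records
// the most recent map epoch through which this OSD is confirmed to
// have been alive; peering relies on it to reason about which past
// intervals may have served writes.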
6120 void OSD::send_alive()
6121 {
6122 assert(mon_report_lock.is_locked());
6123 if (!osdmap->exists(whoami))
6124 return;
6125 epoch_t up_thru = osdmap->get_up_thru(whoami);
6126 dout(10) << "send_alive up_thru currently " << up_thru << " want " << up_thru_wanted << dendl;
6127 if (up_thru_wanted > up_thru) {
6128 dout(10) << "send_alive want " << up_thru_wanted << dendl;
6129 monc->send_mon_message(new MOSDAlive(osdmap->get_epoch(), up_thru_wanted));
6130 }
6131 }
6132
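// request_full_map/got_full_map track one contiguous window of
// requested full maps, [requested_full_first, requested_full_last]:
// new requests only extend the upper end, and each full map received
// advances the lower end until the window drains to empty.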
6133 void OSD::request_full_map(epoch_t first, epoch_t last)
6134 {
6135 dout(10) << __func__ << " " << first << ".." << last
6136 << ", previously requested "
6137 << requested_full_first << ".." << requested_full_last << dendl;
6138 assert(osd_lock.is_locked());
6139 assert(first > 0 && last > 0);
6140 assert(first <= last);
6141 assert(first >= requested_full_first); // we shouldn't ever ask for older maps
6142 if (requested_full_first == 0) {
6143 // first request
6144 requested_full_first = first;
6145 requested_full_last = last;
6146 } else if (last <= requested_full_last) {
6147 // dup
6148 return;
6149 } else {
6150 // additional request
6151 first = requested_full_last + 1;
6152 requested_full_last = last;
6153 }
6154 MMonGetOSDMap *req = new MMonGetOSDMap;
6155 req->request_full(first, last);
6156 monc->send_mon_message(req);
6157 }
6158
6159 void OSD::got_full_map(epoch_t e)
6160 {
6161 assert(requested_full_first <= requested_full_last);
6162 assert(osd_lock.is_locked());
6163 if (requested_full_first == 0) {
6164 dout(20) << __func__ << " " << e << ", nothing requested" << dendl;
6165 return;
6166 }
6167 if (e < requested_full_first) {
6168 dout(10) << __func__ << " " << e << ", requested " << requested_full_first
6169 << ".." << requested_full_last
6170 << ", ignoring" << dendl;
6171 return;
6172 }
6173 if (e >= requested_full_last) {
6174 dout(10) << __func__ << " " << e << ", requested " << requested_full_first
6175 << ".." << requested_full_last << ", resetting" << dendl;
6176 requested_full_first = requested_full_last = 0;
6177 return;
6178 }
6179
6180 requested_full_first = e + 1;
6181
6182 dout(10) << __func__ << " " << e << ", requested " << requested_full_first
6183 << ".." << requested_full_last
6184 << ", still need more" << dendl;
6185 }
6186
6187 void OSD::requeue_failures()
6188 {
6189 Mutex::Locker l(heartbeat_lock);
6190 unsigned old_queue = failure_queue.size();
6191 unsigned old_pending = failure_pending.size();
6192 for (map<int,pair<utime_t,entity_inst_t> >::iterator p =
6193 failure_pending.begin();
6194 p != failure_pending.end(); ) {
6195 failure_queue[p->first] = p->second.first;
6196 failure_pending.erase(p++);
6197 }
6198 dout(10) << __func__ << " " << old_queue << " + " << old_pending << " -> "
6199 << failure_queue.size() << dendl;
6200 }
6201
6202 void OSD::send_failures()
6203 {
6204 assert(map_lock.is_locked());
6205 assert(mon_report_lock.is_locked());
6206 Mutex::Locker l(heartbeat_lock);
6207 utime_t now = ceph_clock_now();
6208 while (!failure_queue.empty()) {
6209 int osd = failure_queue.begin()->first;
6210 if (!failure_pending.count(osd)) {
6211 entity_inst_t i = osdmap->get_inst(osd);
6212 int failed_for = (int)(double)(now - failure_queue.begin()->second);
6213 monc->send_mon_message(new MOSDFailure(monc->get_fsid(), i, failed_for,
6214 osdmap->get_epoch()));
6215 failure_pending[osd] = make_pair(failure_queue.begin()->second, i);
6216 }
6217 failure_queue.erase(osd);
6218 }
6219 }
6220
6221 void OSD::send_still_alive(epoch_t epoch, const entity_inst_t &i)
6222 {
6223 MOSDFailure *m = new MOSDFailure(monc->get_fsid(), i, 0, epoch, MOSDFailure::FLAG_ALIVE);
6224 monc->send_mon_message(m);
6225 }
6226
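// send_pg_stats: the pre-luminous reporting path (luminous+ clusters
// report through the mgr instead; see the flush_pg_stats handling in
// do_command()). Builds one MPGStats message carrying every queued PG
// whose published stats are valid, tagged with a tid that stays in
// outstanding_pg_stats until the mon acks it.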
6227 void OSD::send_pg_stats(const utime_t &now)
6228 {
6229 assert(map_lock.is_locked());
6230 assert(osdmap->require_osd_release < CEPH_RELEASE_LUMINOUS);
6231 dout(20) << "send_pg_stats" << dendl;
6232
6233 osd_stat_t cur_stat = service.get_osd_stat();
6234
6235 cur_stat.os_perf_stat = store->get_cur_stats();
6236
6237 pg_stat_queue_lock.Lock();
6238
6239 if (osd_stat_updated || !pg_stat_queue.empty()) {
6240 last_pg_stats_sent = now;
6241 osd_stat_updated = false;
6242
6243 dout(10) << "send_pg_stats - " << pg_stat_queue.size() << " pgs updated" << dendl;
6244
6245 utime_t had_for(now);
6246 had_for -= had_map_since;
6247
6248 MPGStats *m = new MPGStats(monc->get_fsid(), osdmap->get_epoch(), had_for);
6249
6250 uint64_t tid = ++pg_stat_tid;
6251 m->set_tid(tid);
6252 m->osd_stat = cur_stat;
6253
6254 xlist<PG*>::iterator p = pg_stat_queue.begin();
6255 while (!p.end()) {
6256 PG *pg = *p;
6257 ++p;
6258 if (!pg->is_primary()) { // we hold map_lock; role is stable.
6259 pg->stat_queue_item.remove_myself();
6260 pg->put("pg_stat_queue");
6261 continue;
6262 }
6263 pg->pg_stats_publish_lock.Lock();
6264 if (pg->pg_stats_publish_valid) {
6265 m->pg_stat[pg->info.pgid.pgid] = pg->pg_stats_publish;
6266 dout(25) << " sending " << pg->info.pgid << " " << pg->pg_stats_publish.reported_epoch << ":"
6267 << pg->pg_stats_publish.reported_seq << dendl;
6268 } else {
6269 dout(25) << " NOT sending " << pg->info.pgid << " " << pg->pg_stats_publish.reported_epoch << ":"
6270 << pg->pg_stats_publish.reported_seq << ", not valid" << dendl;
6271 }
6272 pg->pg_stats_publish_lock.Unlock();
6273 }
6274
6275 if (last_pg_stats_ack == utime_t() || !outstanding_pg_stats.empty()) {
6276 last_pg_stats_ack = ceph_clock_now();
6277 }
6278 outstanding_pg_stats.insert(tid);
6279 dout(20) << __func__ << " updates pending: " << outstanding_pg_stats << dendl;
6280
6281 monc->send_mon_message(m);
6282 }
6283
6284 pg_stat_queue_lock.Unlock();
6285 }
6286
6287 void OSD::handle_pg_stats_ack(MPGStatsAck *ack)
6288 {
6289 dout(10) << "handle_pg_stats_ack " << dendl;
6290
6291 if (!require_mon_peer(ack)) {
6292 ack->put();
6293 return;
6294 }
6295
6296 // NOTE: we may get replies from a previous mon even while
6297 // outstanding_pg_stats is empty if reconnecting races with replies
6298 // in flight.
6299
6300 pg_stat_queue_lock.Lock();
6301
6302 last_pg_stats_ack = ceph_clock_now();
6303
6304 // decay timeout slowly (analogous to TCP)
6305 stats_ack_timeout =
6306 MAX(cct->_conf->osd_mon_ack_timeout,
6307 stats_ack_timeout * cct->_conf->osd_stats_ack_timeout_decay);
6308 dout(20) << __func__ << " timeout now " << stats_ack_timeout << dendl;
6309
6310 if (ack->get_tid() > pg_stat_tid_flushed) {
6311 pg_stat_tid_flushed = ack->get_tid();
6312 pg_stat_queue_cond.Signal();
6313 }
6314
6315 xlist<PG*>::iterator p = pg_stat_queue.begin();
6316 while (!p.end()) {
6317 PG *pg = *p;
6318 PGRef _pg(pg);
6319 ++p;
6320
6321 auto acked = ack->pg_stat.find(pg->info.pgid.pgid);
6322 if (acked != ack->pg_stat.end()) {
6323 pg->pg_stats_publish_lock.Lock();
6324 if (acked->second.first == pg->pg_stats_publish.reported_seq &&
6325 acked->second.second == pg->pg_stats_publish.reported_epoch) {
6326 dout(25) << " ack on " << pg->info.pgid << " " << pg->pg_stats_publish.reported_epoch
6327 << ":" << pg->pg_stats_publish.reported_seq << dendl;
6328 pg->stat_queue_item.remove_myself();
6329 pg->put("pg_stat_queue");
6330 } else {
6331 dout(25) << " still pending " << pg->info.pgid << " " << pg->pg_stats_publish.reported_epoch
6332 << ":" << pg->pg_stats_publish.reported_seq << " > acked "
6333 << acked->second << dendl;
6334 }
6335 pg->pg_stats_publish_lock.Unlock();
6336 } else {
6337 dout(30) << " still pending " << pg->info.pgid << " " << pg->pg_stats_publish.reported_epoch
6338 << ":" << pg->pg_stats_publish.reported_seq << dendl;
6339 }
6340 }
6341
6342 outstanding_pg_stats.erase(ack->get_tid());
6343 dout(20) << __func__ << " still pending: " << outstanding_pg_stats << dendl;
6344
6345 pg_stat_queue_lock.Unlock();
6346
6347 ack->put();
6348 }
6349
6350 void OSD::flush_pg_stats()
6351 {
6352 dout(10) << "flush_pg_stats" << dendl;
6353 osd_lock.Unlock();
6354 utime_t now = ceph_clock_now();
6355 map_lock.get_read();
6356 mon_report_lock.Lock();
6357 send_pg_stats(now);
6358 mon_report_lock.Unlock();
6359 map_lock.put_read();
6360
6361
6362 pg_stat_queue_lock.Lock();
6363 uint64_t tid = pg_stat_tid;
6364 dout(10) << "flush_pg_stats waiting for stats tid " << tid << " to flush" << dendl;
6365 while (tid > pg_stat_tid_flushed)
6366 pg_stat_queue_cond.Wait(pg_stat_queue_lock);
6367 dout(10) << "flush_pg_stats finished waiting for stats tid " << tid << " to flush" << dendl;
6368 pg_stat_queue_lock.Unlock();
6369
6370 osd_lock.Lock();
6371 }
6372
6373 void OSD::send_beacon(const ceph::coarse_mono_clock::time_point& now)
6374 {
6375 const auto& monmap = monc->monmap;
6376 // we may be called just after (re)connecting, before the monmap has been
6377 // initialized; only send the beacon once the mons are known to support it.
6378 if (monmap.epoch > 0 &&
6379 monmap.get_required_features().contains_all(
6380 ceph::features::mon::FEATURE_LUMINOUS)) {
6381 dout(20) << __func__ << " sending" << dendl;
6382 MOSDBeacon* beacon = nullptr;
6383 {
6384 Mutex::Locker l{min_last_epoch_clean_lock};
6385 beacon = new MOSDBeacon(osdmap->get_epoch(), min_last_epoch_clean);
6386 std::swap(beacon->pgs, min_last_epoch_clean_pgs);
6387 last_sent_beacon = now;
6388 }
6389 monc->send_mon_message(beacon);
6390 } else {
6391 dout(20) << __func__ << " not sending" << dendl;
6392 }
6393 }
6394
6395 void OSD::handle_command(MMonCommand *m)
6396 {
6397 if (!require_mon_peer(m)) {
6398 m->put();
6399 return;
6400 }
6401
6402 Command *c = new Command(m->cmd, m->get_tid(), m->get_data(), NULL);
6403 command_wq.queue(c);
6404 m->put();
6405 }
6406
6407 void OSD::handle_command(MCommand *m)
6408 {
6409 ConnectionRef con = m->get_connection();
6410 Session *session = static_cast<Session *>(con->get_priv());
6411 if (!session) {
6412 con->send_message(new MCommandReply(m, -EPERM));
6413 m->put();
6414 return;
6415 }
6416
6417 OSDCap& caps = session->caps;
6418 session->put();
6419
6420 if (!caps.allow_all() || m->get_source().is_mon()) {
6421 con->send_message(new MCommandReply(m, -EPERM));
6422 m->put();
6423 return;
6424 }
6425
6426 Command *c = new Command(m->cmd, m->get_tid(), m->get_data(), con.get());
6427 command_wq.queue(c);
6428
6429 m->put();
6430 }
6431
6432 struct OSDCommand {
6433 string cmdstring;
6434 string helpstring;
6435 string module;
6436 string perm;
6437 string availability;
6438 } osd_commands[] = {
6439
6440 #define COMMAND(parsesig, helptext, module, perm, availability) \
6441 {parsesig, helptext, module, perm, availability},
6442
6443 // yes, these are really pg commands, but there's a limit to how
6444 // much work it's worth. The OSD returns all of them. Make this
6445 // form (pg <pgid> <cmd>) valid only for the cli.
6446 // REST uses "tell <pgid> <cmd>"
6447
6448 COMMAND("pg " \
6449 "name=pgid,type=CephPgid " \
6450 "name=cmd,type=CephChoices,strings=query", \
6451 "show details of a specific pg", "osd", "r", "cli")
6452 COMMAND("pg " \
6453 "name=pgid,type=CephPgid " \
6454 "name=cmd,type=CephChoices,strings=mark_unfound_lost " \
6455 "name=mulcmd,type=CephChoices,strings=revert|delete", \
6456 "mark all unfound objects in this pg as lost, either removing or reverting to a prior version if one is available",
6457 "osd", "rw", "cli")
6458 COMMAND("pg " \
6459 "name=pgid,type=CephPgid " \
6460 "name=cmd,type=CephChoices,strings=list_missing " \
6461 "name=offset,type=CephString,req=false",
6462 "list missing objects on this pg, perhaps starting at an offset given in JSON",
6463 "osd", "r", "cli")
6464
6465 // new form: tell <pgid> <cmd> for both cli and rest
6466
6467 COMMAND("query",
6468 "show details of a specific pg", "osd", "r", "cli,rest")
6469 COMMAND("mark_unfound_lost " \
6470 "name=mulcmd,type=CephChoices,strings=revert|delete", \
6471 "mark all unfound objects in this pg as lost, either removing or reverting to a prior version if one is available",
6472 "osd", "rw", "cli,rest")
6473 COMMAND("list_missing " \
6474 "name=offset,type=CephString,req=false",
6475 "list missing objects on this pg, perhaps starting at an offset given in JSON",
6476 "osd", "r", "cli,rest")
6477 COMMAND("perf histogram dump "
6478 "name=logger,type=CephString,req=false "
6479 "name=counter,type=CephString,req=false",
6480 "Get histogram data",
6481 "osd", "r", "cli,rest")
6482
6483 // tell <osd.n> commands. Validation of osd.n must be special-cased in client
6484 COMMAND("version", "report version of OSD", "osd", "r", "cli,rest")
6485 COMMAND("get_command_descriptions", "list commands descriptions", "osd", "r", "cli,rest")
6486 COMMAND("injectargs " \
6487 "name=injected_args,type=CephString,n=N",
6488 "inject configuration arguments into running OSD",
6489 "osd", "rw", "cli,rest")
6490 COMMAND("config set " \
6491 "name=key,type=CephString name=value,type=CephString",
6492 "Set a configuration option at runtime (not persistent)",
6493 "osd", "rw", "cli,rest")
6494 COMMAND("cluster_log " \
6495 "name=level,type=CephChoices,strings=error,warning,info,debug " \
6496 "name=message,type=CephString,n=N",
6497 "log a message to the cluster log",
6498 "osd", "rw", "cli,rest")
6499 COMMAND("bench " \
6500 "name=count,type=CephInt,req=false " \
6501 "name=size,type=CephInt,req=false " \
6502 "name=object_size,type=CephInt,req=false " \
6503 "name=object_num,type=CephInt,req=false ", \
6504 "OSD benchmark: write <count> <size>-byte objects, " \
6505 "(default 1G size 4MB). Results in log.",
6506 "osd", "rw", "cli,rest")
6507 COMMAND("flush_pg_stats", "flush pg stats", "osd", "rw", "cli,rest")
6508 COMMAND("heap " \
6509 "name=heapcmd,type=CephChoices,strings=dump|start_profiler|stop_profiler|release|stats", \
6510 "show heap usage info (available only if compiled with tcmalloc)", \
6511 "osd", "rw", "cli,rest")
6512 COMMAND("debug dump_missing " \
6513 "name=filename,type=CephFilepath",
6514 "dump missing objects to a named file", "osd", "r", "cli,rest")
6515 COMMAND("debug kick_recovery_wq " \
6516 "name=delay,type=CephInt,range=0",
6517 "set osd_recovery_delay_start to <val>", "osd", "rw", "cli,rest")
6518 COMMAND("cpu_profiler " \
6519 "name=arg,type=CephChoices,strings=status|flush",
6520 "run cpu profiling on daemon", "osd", "rw", "cli,rest")
6521 COMMAND("dump_pg_recovery_stats", "dump pg recovery statistics",
6522 "osd", "r", "cli,rest")
6523 COMMAND("reset_pg_recovery_stats", "reset pg recovery statistics",
6524 "osd", "rw", "cli,rest")
6525 COMMAND("compact",
6526 "compact object store's omap. "
6527 "WARNING: Compaction probably slows your requests",
6528 "osd", "rw", "cli,rest")
6529 };
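// For illustration (hypothetical OSD id and values): the table above backs
// invocations such as
//   ceph tell osd.0 bench 1073741824 4194304
// (write 1 GiB in 4 MiB blocks; results land in the OSD log) or
//   ceph tell osd.0 dump_pg_recovery_stats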
6530
6531 void OSD::do_command(Connection *con, ceph_tid_t tid, vector<string>& cmd, bufferlist& data)
6532 {
6533 int r = 0;
6534 stringstream ss, ds;
6535 string rs;
6536 bufferlist odata;
6537
6538 dout(20) << "do_command tid " << tid << " " << cmd << dendl;
6539
6540 map<string, cmd_vartype> cmdmap;
6541 string prefix;
6542 string format;
6543 string pgidstr;
6544 boost::scoped_ptr<Formatter> f;
6545
6546 if (cmd.empty()) {
6547 ss << "no command given";
6548 goto out;
6549 }
6550
6551 if (!cmdmap_from_json(cmd, &cmdmap, ss)) {
6552 r = -EINVAL;
6553 goto out;
6554 }
6555
6556 cmd_getval(cct, cmdmap, "prefix", prefix);
6557
6558 if (prefix == "get_command_descriptions") {
6559 int cmdnum = 0;
6560 JSONFormatter *f = new JSONFormatter();
6561 f->open_object_section("command_descriptions");
6562 for (OSDCommand *cp = osd_commands;
6563 cp < &osd_commands[ARRAY_SIZE(osd_commands)]; cp++) {
6564
6565 ostringstream secname;
6566 secname << "cmd" << setfill('0') << std::setw(3) << cmdnum;
6567 dump_cmddesc_to_json(f, secname.str(), cp->cmdstring, cp->helpstring,
6568 cp->module, cp->perm, cp->availability, 0);
6569 cmdnum++;
6570 }
6571 f->close_section(); // command_descriptions
6572
6573 f->flush(ds);
6574 delete f;
6575 goto out;
6576 }
6577
6578 cmd_getval(cct, cmdmap, "format", format);
6579 f.reset(Formatter::create(format));
6580
6581 if (prefix == "version") {
6582 if (f) {
6583 f->open_object_section("version");
6584 f->dump_string("version", pretty_version_to_str());
6585 f->close_section();
6586 f->flush(ds);
6587 } else {
6588 ds << pretty_version_to_str();
6589 }
6590 goto out;
6591 }
6592 else if (prefix == "injectargs") {
6593 vector<string> argsvec;
6594 cmd_getval(cct, cmdmap, "injected_args", argsvec);
6595
6596 if (argsvec.empty()) {
6597 r = -EINVAL;
6598 ss << "ignoring empty injectargs";
6599 goto out;
6600 }
6601 string args = argsvec.front();
6602 for (vector<string>::iterator a = ++argsvec.begin(); a != argsvec.end(); ++a)
6603 args += " " + *a;
6604 osd_lock.Unlock();
6605 r = cct->_conf->injectargs(args, &ss);
6606 osd_lock.Lock();
6607 }
6608 else if (prefix == "config set") {
6609 std::string key;
6610 std::string val;
6611 cmd_getval(cct, cmdmap, "key", key);
6612 cmd_getval(cct, cmdmap, "value", val);
6613 osd_lock.Unlock();
6614 r = cct->_conf->set_val(key, val, true, &ss);
6615 if (r == 0) {
6616 cct->_conf->apply_changes(nullptr);
6617 }
6618 osd_lock.Lock();
6619 }
6620 else if (prefix == "cluster_log") {
6621 vector<string> msg;
6622 cmd_getval(cct, cmdmap, "message", msg);
6623 if (msg.empty()) {
6624 r = -EINVAL;
6625 ss << "ignoring empty log message";
6626 goto out;
6627 }
6628 string message = msg.front();
6629 for (vector<string>::iterator a = ++msg.begin(); a != msg.end(); ++a)
6630 message += " " + *a;
6631 string lvl;
6632 cmd_getval(cct, cmdmap, "level", lvl);
6633 clog_type level = string_to_clog_type(lvl);
6634 if (level < 0) {
6635 r = -EINVAL;
6636 ss << "unknown level '" << lvl << "'";
6637 goto out;
6638 }
6639 clog->do_log(level, message);
6640 }
6641
6642 // either 'pg <pgid> <command>' or
6643 // 'tell <pgid>' (which comes in without any of that prefix)?
6644
6645 else if (prefix == "pg" ||
6646 prefix == "query" ||
6647 prefix == "mark_unfound_lost" ||
6648 prefix == "list_missing"
6649 ) {
6650 pg_t pgid;
6651
6652 if (!cmd_getval(cct, cmdmap, "pgid", pgidstr)) {
6653 ss << "no pgid specified";
6654 r = -EINVAL;
6655 } else if (!pgid.parse(pgidstr.c_str())) {
6656 ss << "couldn't parse pgid '" << pgidstr << "'";
6657 r = -EINVAL;
6658 } else {
6659 spg_t pcand;
6660 PG *pg = nullptr;
6661 if (osdmap->get_primary_shard(pgid, &pcand) &&
6662 (pg = _lookup_lock_pg(pcand))) {
6663 if (pg->is_primary()) {
6664 // simulate pg <pgid> cmd= for pg->do-command
6665 if (prefix != "pg")
6666 cmd_putval(cct, cmdmap, "cmd", prefix);
6667 r = pg->do_command(cmdmap, ss, data, odata, con, tid);
6668 if (r == -EAGAIN) {
6669 pg->unlock();
6670 // don't reply, pg will do so async
6671 return;
6672 }
6673 } else {
6674 ss << "not primary for pgid " << pgid;
6675
6676 // send them the latest diff to ensure they realize the mapping
6677 // has changed.
6678 service.send_incremental_map(osdmap->get_epoch() - 1, con, osdmap);
6679
6680 // do not reply; they will get newer maps and realize they
6681 // need to resend.
6682 pg->unlock();
6683 return;
6684 }
6685 pg->unlock();
6686 } else {
6687 ss << "i don't have pgid " << pgid;
6688 r = -ENOENT;
6689 }
6690 }
6691 }
6692
6693 else if (prefix == "bench") {
6694 int64_t count;
6695 int64_t bsize;
6696 int64_t osize, onum;
6697 // default count 1G, size 4MB
6698 cmd_getval(cct, cmdmap, "count", count, (int64_t)1 << 30);
6699 cmd_getval(cct, cmdmap, "size", bsize, (int64_t)4 << 20);
6700 cmd_getval(cct, cmdmap, "object_size", osize, (int64_t)0);
6701 cmd_getval(cct, cmdmap, "object_num", onum, (int64_t)0);
6702
6703 ceph::shared_ptr<ObjectStore::Sequencer> osr (std::make_shared<
6704 ObjectStore::Sequencer>("bench"));
6705
6706 uint32_t duration = cct->_conf->osd_bench_duration;
6707
6708 if (bsize > (int64_t) cct->_conf->osd_bench_max_block_size) {
6709 // let us limit the block size because the next checks rely on it
6710 // having a sane value. If we allow any block size to be set things
6711 // can still go sideways.
6712 ss << "block 'size' values are capped at "
6713 << prettybyte_t(cct->_conf->osd_bench_max_block_size) << ". If you wish to use"
6714 << " a higher value, please adjust 'osd_bench_max_block_size'";
6715 r = -EINVAL;
6716 goto out;
6717 } else if (bsize < (int64_t) (1 << 20)) {
6718 // entering the realm of small block sizes.
6719 // limit the count to a sane value, assuming a configurable amount of
6720 // IOPS and duration, so that the OSD doesn't get hung up on this,
6721 // preventing timeouts from going off
6722 int64_t max_count =
6723 bsize * duration * cct->_conf->osd_bench_small_size_max_iops;
6724 if (count > max_count) {
6725 ss << "'count' values greater than " << max_count
6726 << " for a block size of " << prettybyte_t(bsize) << ", assuming "
6727 << cct->_conf->osd_bench_small_size_max_iops << " IOPS,"
6728 << " for " << duration << " seconds,"
6729 << " can cause ill effects on osd. "
6730 << " Please adjust 'osd_bench_small_size_max_iops' with a higher"
6731 << " value if you wish to use a higher 'count'.";
6732 r = -EINVAL;
6733 goto out;
6734 }
6735 } else {
6736 // block sizes of 1MB and up are big enough that throughput dominates.
6737 // However, to keep the OSD from getting hung up on this and having
6738 // timers triggered, we are going to limit the count assuming
6739 // a configurable throughput and duration.
6740 // NOTE: max_count is the total amount of bytes that we believe we
6741 // will be able to write during 'duration' for the given
6742 // throughput. The block size hardly impacts this unless it's
6743 // way too big. Given we already check how big the block size
6744 // is, it's safe to assume everything will check out.
6745 int64_t max_count =
6746 cct->_conf->osd_bench_large_size_max_throughput * duration;
6747 if (count > max_count) {
6748 ss << "'count' values greater than " << max_count
6749 << " for a block size of " << prettybyte_t(bsize) << ", assuming "
6750 << prettybyte_t(cct->_conf->osd_bench_large_size_max_throughput) << "/s,"
6751 << " for " << duration << " seconds,"
6752 << " can cause ill effects on osd. "
6753 << " Please adjust 'osd_bench_large_size_max_throughput'"
6754 << " with a higher value if you wish to use a higher 'count'.";
6755 r = -EINVAL;
6756 goto out;
6757 }
6758 }
6759
6760 if (osize && bsize > osize)
6761 bsize = osize;
6762
6763 dout(1) << " bench count " << count
6764 << " bsize " << prettybyte_t(bsize) << dendl;
6765
6766 ObjectStore::Transaction cleanupt;
6767
6768 if (osize && onum) {
6769 bufferlist bl;
6770 bufferptr bp(osize);
6771 bp.zero();
6772 bl.push_back(std::move(bp));
6773 bl.rebuild_page_aligned();
6774 for (int i=0; i<onum; ++i) {
6775 char nm[30];
6776 snprintf(nm, sizeof(nm), "disk_bw_test_%d", i);
6777 object_t oid(nm);
6778 hobject_t soid(sobject_t(oid, 0));
6779 ObjectStore::Transaction t;
6780 t.write(coll_t(), ghobject_t(soid), 0, osize, bl);
6781 store->queue_transaction(osr.get(), std::move(t), NULL);
6782 cleanupt.remove(coll_t(), ghobject_t(soid));
6783 }
6784 }
6785
6786 bufferlist bl;
6787 bufferptr bp(bsize);
6788 bp.zero();
6789 bl.push_back(std::move(bp));
6790 bl.rebuild_page_aligned();
6791
6792 {
6793 C_SaferCond waiter;
6794 if (!osr->flush_commit(&waiter)) {
6795 waiter.wait();
6796 }
6797 }
6798
6799 utime_t start = ceph_clock_now();
6800 for (int64_t pos = 0; pos < count; pos += bsize) {
6801 char nm[30];
6802 unsigned offset = 0;
6803 if (onum && osize) {
6804 snprintf(nm, sizeof(nm), "disk_bw_test_%d", (int)(rand() % onum));
6805 offset = rand() % (osize / bsize) * bsize;
6806 } else {
6807 snprintf(nm, sizeof(nm), "disk_bw_test_%lld", (long long)pos);
6808 }
6809 object_t oid(nm);
6810 hobject_t soid(sobject_t(oid, 0));
6811 ObjectStore::Transaction t;
6812 t.write(coll_t::meta(), ghobject_t(soid), offset, bsize, bl);
6813 store->queue_transaction(osr.get(), std::move(t), NULL);
6814 if (!onum || !osize)
6815 cleanupt.remove(coll_t::meta(), ghobject_t(soid));
6816 }
6817
6818 {
6819 C_SaferCond waiter;
6820 if (!osr->flush_commit(&waiter)) {
6821 waiter.wait();
6822 }
6823 }
6824 utime_t end = ceph_clock_now();
6825
6826 // clean up
6827 store->queue_transaction(osr.get(), std::move(cleanupt), NULL);
6828 {
6829 C_SaferCond waiter;
6830 if (!osr->flush_commit(&waiter)) {
6831 waiter.wait();
6832 }
6833 }
6834
6835 uint64_t rate = (double)count / (end - start);
6836 if (f) {
6837 f->open_object_section("osd_bench_results");
6838 f->dump_int("bytes_written", count);
6839 f->dump_int("blocksize", bsize);
6840 f->dump_unsigned("bytes_per_sec", rate);
6841 f->close_section();
6842 f->flush(ss);
6843 } else {
6844 ss << "bench: wrote " << prettybyte_t(count)
6845 << " in blocks of " << prettybyte_t(bsize) << " in "
6846 << (end-start) << " sec at " << prettybyte_t(rate) << "/sec";
6847 }
6848 }
6849
6850 else if (prefix == "flush_pg_stats") {
6851 if (osdmap->require_osd_release >= CEPH_RELEASE_LUMINOUS) {
6852 mgrc.send_pgstats();
6853 ds << service.get_osd_stat_seq() << "\n";
6854 } else {
6855 flush_pg_stats();
6856 }
6857 }
6858
6859 else if (prefix == "heap") {
6860 r = ceph::osd_cmds::heap(*cct, cmdmap, *f, ds);
6861 }
6862
6863 else if (prefix == "debug dump_missing") {
6864 string file_name;
6865 cmd_getval(cct, cmdmap, "filename", file_name);
6866 std::ofstream fout(file_name.c_str());
6867 if (!fout.is_open()) {
6868 ss << "failed to open file '" << file_name << "'";
6869 r = -EINVAL;
6870 goto out;
6871 }
6872
6873 fout << "*** osd " << whoami << ": dump_missing ***" << std::endl;
6874 RWLock::RLocker l(pg_map_lock);
6875 for (ceph::unordered_map<spg_t, PG*>::const_iterator pg_map_e = pg_map.begin();
6876 pg_map_e != pg_map.end(); ++pg_map_e) {
6877 PG *pg = pg_map_e->second;
6878 pg->lock();
6879
6880 fout << *pg << std::endl;
6881 std::map<hobject_t, pg_missing_item>::const_iterator mend =
6882 pg->pg_log.get_missing().get_items().end();
6883 std::map<hobject_t, pg_missing_item>::const_iterator mi =
6884 pg->pg_log.get_missing().get_items().begin();
6885 for (; mi != mend; ++mi) {
6886 fout << mi->first << " -> " << mi->second << std::endl;
6887 if (!pg->missing_loc.needs_recovery(mi->first))
6888 continue;
6889 if (pg->missing_loc.is_unfound(mi->first))
6890 fout << " unfound ";
6891 const set<pg_shard_t> &mls(pg->missing_loc.get_locations(mi->first));
6892 if (mls.empty())
6893 continue;
6894 fout << "missing_loc: " << mls << std::endl;
6895 }
6896 pg->unlock();
6897 fout << std::endl;
6898 }
6899
6900 fout.close();
6901 }
6902 else if (prefix == "debug kick_recovery_wq") {
6903 int64_t delay;
6904 cmd_getval(cct, cmdmap, "delay", delay);
6905 ostringstream oss;
6906 oss << delay;
6907 r = cct->_conf->set_val("osd_recovery_delay_start", oss.str().c_str());
6908 if (r != 0) {
6909 ss << "kick_recovery_wq: error setting "
6910 << "osd_recovery_delay_start to '" << delay << "': error "
6911 << r;
6912 goto out;
6913 }
6914 cct->_conf->apply_changes(NULL);
6915 ss << "kicking recovery queue. set osd_recovery_delay_start "
6916 << "to " << cct->_conf->osd_recovery_delay_start;
6917 }
6918
6919 else if (prefix == "cpu_profiler") {
6920 string arg;
6921 cmd_getval(cct, cmdmap, "arg", arg);
6922 vector<string> argvec;
6923 get_str_vec(arg, argvec);
6924 cpu_profiler_handle_command(argvec, ds);
6925 }
6926
6927 else if (prefix == "dump_pg_recovery_stats") {
6928 stringstream s;
6929 if (f) {
6930 pg_recovery_stats.dump_formatted(f.get());
6931 f->flush(ds);
6932 } else {
6933 pg_recovery_stats.dump(s);
6934 ds << "dump pg recovery stats: " << s.str();
6935 }
6936 }
6937
6938 else if (prefix == "reset_pg_recovery_stats") {
6939 ss << "reset pg recovery stats";
6940 pg_recovery_stats.reset();
6941 }
6942
6943 else if (prefix == "perf histogram dump") {
6944 std::string logger;
6945 std::string counter;
6946 cmd_getval(cct, cmdmap, "logger", logger);
6947 cmd_getval(cct, cmdmap, "counter", counter);
6948 if (f) {
6949 cct->get_perfcounters_collection()->dump_formatted_histograms(
6950 f.get(), false, logger, counter);
6951 f->flush(ds);
6952 }
6953 }
6954
6955 else if (prefix == "compact") {
6956 dout(1) << "triggering manual compaction" << dendl;
6957 auto start = ceph::coarse_mono_clock::now();
6958 store->compact();
6959 auto end = ceph::coarse_mono_clock::now();
6960 auto time_span = chrono::duration_cast<chrono::duration<double>>(end - start);
6961 dout(1) << "finished manual compaction in "
6962 << time_span.count()
6963 << " seconds" << dendl;
6964 ss << "compacted omap in " << time_span.count() << " seconds";
6965 }
6966
6967 else {
6968 ss << "unrecognized command! " << cmd;
6969 r = -EINVAL;
6970 }
6971
6972 out:
6973 rs = ss.str();
6974 odata.append(ds);
6975 dout(0) << "do_command r=" << r << " " << rs << dendl;
6976 clog->info() << rs;
6977 if (con) {
6978 MCommandReply *reply = new MCommandReply(r, rs);
6979 reply->set_tid(tid);
6980 reply->set_data(odata);
6981 con->send_message(reply);
6982 }
6983 }
6984
6985 bool OSD::heartbeat_dispatch(Message *m)
6986 {
6987 dout(30) << "heartbeat_dispatch " << m << dendl;
6988 switch (m->get_type()) {
6989
6990 case CEPH_MSG_PING:
6991 dout(10) << "ping from " << m->get_source_inst() << dendl;
6992 m->put();
6993 break;
6994
6995 case MSG_OSD_PING:
6996 handle_osd_ping(static_cast<MOSDPing*>(m));
6997 break;
6998
6999 default:
7000 dout(0) << "dropping unexpected message " << *m << " from " << m->get_source_inst() << dendl;
7001 m->put();
7002 }
7003
7004 return true;
7005 }
7006
7007 bool OSD::ms_dispatch(Message *m)
7008 {
7009 dout(20) << "OSD::ms_dispatch: " << *m << dendl;
7010 if (m->get_type() == MSG_OSD_MARK_ME_DOWN) {
7011 service.got_stop_ack();
7012 m->put();
7013 return true;
7014 }
7015
7016 // lock!
7017
7018 osd_lock.Lock();
7019 if (is_stopping()) {
7020 osd_lock.Unlock();
7021 m->put();
7022 return true;
7023 }
7024
7025 do_waiters();
7026 _dispatch(m);
7027
7028 osd_lock.Unlock();
7029
7030 return true;
7031 }
7032
7033 void OSD::maybe_share_map(
7034 Session *session,
7035 OpRequestRef op,
7036 OSDMapRef osdmap)
7037 {
7038 if (!op->check_send_map) {
7039 return;
7040 }
7041 epoch_t last_sent_epoch = 0;
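// snapshot last_sent_epoch under the lock; share_map() may advance our
// copy, and we write it back below only if it still moves forward, since
// another dispatch thread may have raced us and sent a newer epoch.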
7042
7043 session->sent_epoch_lock.lock();
7044 last_sent_epoch = session->last_sent_epoch;
7045 session->sent_epoch_lock.unlock();
7046
7047 const Message *m = op->get_req();
7048 service.share_map(
7049 m->get_source(),
7050 m->get_connection().get(),
7051 op->sent_epoch,
7052 osdmap,
7053 session ? &last_sent_epoch : NULL);
7054
7055 session->sent_epoch_lock.lock();
7056 if (session->last_sent_epoch < last_sent_epoch) {
7057 session->last_sent_epoch = last_sent_epoch;
7058 }
7059 session->sent_epoch_lock.unlock();
7060
7061 op->check_send_map = false;
7062 }
7063
7064 void OSD::dispatch_session_waiting(Session *session, OSDMapRef osdmap)
7065 {
7066 assert(session->session_dispatch_lock.is_locked());
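// drain queued ops in arrival order, stopping at the first op whose
// min_epoch is newer than the map we hold; this preserves per-session
// delivery order while we wait for the missing maps.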
7067
7068 auto i = session->waiting_on_map.begin();
7069 while (i != session->waiting_on_map.end()) {
7070 OpRequestRef op = &(*i);
7071 assert(ms_can_fast_dispatch(op->get_req()));
7072 const MOSDFastDispatchOp *m = static_cast<const MOSDFastDispatchOp*>(
7073 op->get_req());
7074 if (m->get_min_epoch() > osdmap->get_epoch()) {
7075 break;
7076 }
7077 session->waiting_on_map.erase(i++);
7078 op->put();
7079
7080 spg_t pgid;
7081 if (m->get_type() == CEPH_MSG_OSD_OP) {
7082 pg_t actual_pgid = osdmap->raw_pg_to_pg(
7083 static_cast<const MOSDOp*>(m)->get_pg());
7084 if (!osdmap->get_primary_shard(actual_pgid, &pgid)) {
7085 continue;
7086 }
7087 } else {
7088 pgid = m->get_spg();
7089 }
7090 enqueue_op(pgid, op, m->get_map_epoch());
7091 }
7092
7093 if (session->waiting_on_map.empty()) {
7094 clear_session_waiting_on_map(session);
7095 } else {
7096 register_session_waiting_on_map(session);
7097 }
7098 }
7099
7100 void OSD::ms_fast_dispatch(Message *m)
7101 {
7102 FUNCTRACE();
7103 if (service.is_stopping()) {
7104 m->put();
7105 return;
7106 }
7107 OpRequestRef op = op_tracker.create_request<OpRequest, Message*>(m);
7108 {
7109 #ifdef WITH_LTTNG
7110 osd_reqid_t reqid = op->get_reqid();
7111 #endif
7112 tracepoint(osd, ms_fast_dispatch, reqid.name._type,
7113 reqid.name._num, reqid.tid, reqid.inc);
7114 }
7115
7116 if (m->trace)
7117 op->osd_trace.init("osd op", &trace_endpoint, &m->trace);
7118
7119 // note sender epoch, min req'd epoch
7120 op->sent_epoch = static_cast<MOSDFastDispatchOp*>(m)->get_map_epoch();
7121 op->min_epoch = static_cast<MOSDFastDispatchOp*>(m)->get_min_epoch();
7122 assert(op->min_epoch <= op->sent_epoch); // sanity check!
7123
7124 service.maybe_inject_dispatch_delay();
7125
7126 if (m->get_connection()->has_features(CEPH_FEATUREMASK_RESEND_ON_SPLIT) ||
7127 m->get_type() != CEPH_MSG_OSD_OP) {
7128 // queue it directly
7129 enqueue_op(
7130 static_cast<MOSDFastDispatchOp*>(m)->get_spg(),
7131 op,
7132 static_cast<MOSDFastDispatchOp*>(m)->get_map_epoch());
7133 } else {
7134 // legacy client, and this is an MOSDOp (the *only* fast dispatch
7135 // message that lacks an explicit spg_t); we need to map it to an
7136 // spg_t while preserving delivery order.
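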
7137 Session *session = static_cast<Session*>(m->get_connection()->get_priv());
7138 if (session) {
7139 {
7140 Mutex::Locker l(session->session_dispatch_lock);
7141 op->get();
7142 session->waiting_on_map.push_back(*op);
7143 OSDMapRef nextmap = service.get_nextmap_reserved();
7144 dispatch_session_waiting(session, nextmap);
7145 service.release_map(nextmap);
7146 }
7147 session->put();
7148 }
7149 }
7150 OID_EVENT_TRACE_WITH_MSG(m, "MS_FAST_DISPATCH_END", false);
7151 }
7152
7153 void OSD::ms_fast_preprocess(Message *m)
7154 {
7155 if (m->get_connection()->get_peer_type() == CEPH_ENTITY_TYPE_OSD) {
7156 if (m->get_type() == CEPH_MSG_OSD_MAP) {
7157 MOSDMap *mm = static_cast<MOSDMap*>(m);
7158 Session *s = static_cast<Session*>(m->get_connection()->get_priv());
7159 if (s) {
7160 s->received_map_lock.lock();
7161 s->received_map_epoch = mm->get_last();
7162 s->received_map_lock.unlock();
7163 s->put();
7164 }
7165 }
7166 }
7167 }
7168
7169 bool OSD::ms_get_authorizer(int dest_type, AuthAuthorizer **authorizer, bool force_new)
7170 {
7171 dout(10) << "OSD::ms_get_authorizer type=" << ceph_entity_type_name(dest_type) << dendl;
7172
7173 if (is_stopping()) {
7174 dout(10) << __func__ << " bailing, we are shutting down" << dendl;
7175 return false;
7176 }
7177
7178 if (dest_type == CEPH_ENTITY_TYPE_MON)
7179 return true;
7180
7181 if (force_new) {
7182 /* the MonClient checks keys every tick(), so we should just wait for that cycle
7183 to get through */
7184 if (monc->wait_auth_rotating(10) < 0) {
7185 derr << "OSD::ms_get_authorizer wait_auth_rotating failed" << dendl;
7186 return false;
7187 }
7188 }
7189
7190 *authorizer = monc->build_authorizer(dest_type);
7191 return *authorizer != NULL;
7192 }
7193
7194
7195 bool OSD::ms_verify_authorizer(Connection *con, int peer_type,
7196 int protocol, bufferlist& authorizer_data, bufferlist& authorizer_reply,
7197 bool& isvalid, CryptoKey& session_key)
7198 {
7199 AuthAuthorizeHandler *authorize_handler = 0;
7200 switch (peer_type) {
7201 case CEPH_ENTITY_TYPE_MDS:
7202 /*
7203 * note: mds is technically a client from our perspective, but
7204 * this makes the 'cluster' consistent w/ monitor's usage.
7205 */
7206 case CEPH_ENTITY_TYPE_OSD:
7207 case CEPH_ENTITY_TYPE_MGR:
7208 authorize_handler = authorize_handler_cluster_registry->get_handler(protocol);
7209 break;
7210 default:
7211 authorize_handler = authorize_handler_service_registry->get_handler(protocol);
7212 }
7213 if (!authorize_handler) {
7214 dout(0) << "No AuthAuthorizeHandler found for protocol " << protocol << dendl;
7215 isvalid = false;
7216 return true;
7217 }
7218
7219 AuthCapsInfo caps_info;
7220 EntityName name;
7221 uint64_t global_id;
7222 uint64_t auid = CEPH_AUTH_UID_DEFAULT;
7223
7224 RotatingKeyRing *keys = monc->rotating_secrets.get();
7225 if (keys) {
7226 isvalid = authorize_handler->verify_authorizer(
7227 cct, keys,
7228 authorizer_data, authorizer_reply, name, global_id, caps_info, session_key,
7229 &auid);
7230 } else {
7231 dout(10) << __func__ << " no rotating_keys (yet), denied" << dendl;
7232 isvalid = false;
7233 }
7234
7235 if (isvalid) {
7236 Session *s = static_cast<Session *>(con->get_priv());
7237 if (!s) {
7238 s = new Session(cct);
7239 con->set_priv(s->get());
7240 s->con = con;
7241 dout(10) << " new session " << s << " con=" << s->con << " addr=" << s->con->get_peer_addr() << dendl;
7242 }
7243
7244 s->entity_name = name;
7245 if (caps_info.allow_all)
7246 s->caps.set_allow_all();
7247 s->auid = auid;
7248
7249 if (caps_info.caps.length() > 0) {
7250 bufferlist::iterator p = caps_info.caps.begin();
7251 string str;
7252 try {
7253 ::decode(str, p);
7254 }
7255 catch (buffer::error& e) {
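// ignore decode errors; str stays empty and caps.parse() below fails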
7256 }
7257 bool success = s->caps.parse(str);
7258 if (success)
7259 dout(10) << " session " << s << " " << s->entity_name << " has caps " << s->caps << " '" << str << "'" << dendl;
7260 else
7261 dout(10) << " session " << s << " " << s->entity_name << " failed to parse caps '" << str << "'" << dendl;
7262 }
7263
7264 s->put();
7265 }
7266 return true;
7267 }
7268
7269 void OSD::do_waiters()
7270 {
7271 assert(osd_lock.is_locked());
7272
7273 dout(10) << "do_waiters -- start" << dendl;
7274 while (!finished.empty()) {
7275 OpRequestRef next = finished.front();
7276 finished.pop_front();
7277 dispatch_op(next);
7278 }
7279 dout(10) << "do_waiters -- finish" << dendl;
7280 }
7281
7282 void OSD::dispatch_op(OpRequestRef op)
7283 {
7284 switch (op->get_req()->get_type()) {
7285
7286 case MSG_OSD_PG_CREATE:
7287 handle_pg_create(op);
7288 break;
7289 case MSG_OSD_PG_NOTIFY:
7290 handle_pg_notify(op);
7291 break;
7292 case MSG_OSD_PG_QUERY:
7293 handle_pg_query(op);
7294 break;
7295 case MSG_OSD_PG_LOG:
7296 handle_pg_log(op);
7297 break;
7298 case MSG_OSD_PG_REMOVE:
7299 handle_pg_remove(op);
7300 break;
7301 case MSG_OSD_PG_INFO:
7302 handle_pg_info(op);
7303 break;
7304 case MSG_OSD_PG_TRIM:
7305 handle_pg_trim(op);
7306 break;
7307 case MSG_OSD_BACKFILL_RESERVE:
7308 handle_pg_backfill_reserve(op);
7309 break;
7310 case MSG_OSD_RECOVERY_RESERVE:
7311 handle_pg_recovery_reserve(op);
7312 break;
7313 }
7314 }
7315
7316 void OSD::_dispatch(Message *m)
7317 {
7318 assert(osd_lock.is_locked());
7319 dout(20) << "_dispatch " << m << " " << *m << dendl;
7320
7321 switch (m->get_type()) {
7322
7323 // -- don't need lock --
7324 case CEPH_MSG_PING:
7325 dout(10) << "ping from " << m->get_source() << dendl;
7326 m->put();
7327 break;
7328
7329 // -- don't need OSDMap --
7330
7331 // map and replication
7332 case CEPH_MSG_OSD_MAP:
7333 handle_osd_map(static_cast<MOSDMap*>(m));
7334 break;
7335
7336 // osd
7337 case MSG_PGSTATSACK:
7338 handle_pg_stats_ack(static_cast<MPGStatsAck*>(m));
7339 break;
7340
7341 case MSG_MON_COMMAND:
7342 handle_command(static_cast<MMonCommand*>(m));
7343 break;
7344 case MSG_COMMAND:
7345 handle_command(static_cast<MCommand*>(m));
7346 break;
7347
7348 case MSG_OSD_SCRUB:
7349 handle_scrub(static_cast<MOSDScrub*>(m));
7350 break;
7351
7352 case MSG_OSD_FORCE_RECOVERY:
7353 handle_force_recovery(m);
7354 break;
7355
7356 // -- need OSDMap --
7357
7358 case MSG_OSD_PG_CREATE:
7359 case MSG_OSD_PG_NOTIFY:
7360 case MSG_OSD_PG_QUERY:
7361 case MSG_OSD_PG_LOG:
7362 case MSG_OSD_PG_REMOVE:
7363 case MSG_OSD_PG_INFO:
7364 case MSG_OSD_PG_TRIM:
7365 case MSG_OSD_BACKFILL_RESERVE:
7366 case MSG_OSD_RECOVERY_RESERVE:
7367 {
7368 OpRequestRef op = op_tracker.create_request<OpRequest, Message*>(m);
7369 if (m->trace)
7370 op->osd_trace.init("osd op", &trace_endpoint, &m->trace);
7371 // no map? starting up?
7372 if (!osdmap) {
7373 dout(7) << "no OSDMap, not booted" << dendl;
7374 logger->inc(l_osd_waiting_for_map);
7375 waiting_for_osdmap.push_back(op);
7376 op->mark_delayed("no osdmap");
7377 break;
7378 }
7379
7380 // need OSDMap
7381 dispatch_op(op);
7382 }
7383 }
7384 }
7385
7386 void OSD::handle_pg_scrub(MOSDScrub *m, PG *pg)
7387 {
7388 pg->lock();
7389 if (pg->is_primary()) {
7390 pg->unreg_next_scrub();
7391 pg->scrubber.must_scrub = true;
7392 pg->scrubber.must_deep_scrub = m->deep || m->repair;
7393 pg->scrubber.must_repair = m->repair;
7394 pg->reg_next_scrub();
7395 dout(10) << "marking " << *pg << " for scrub" << dendl;
7396 }
7397 pg->unlock();
7398 }
7399
7400 void OSD::handle_scrub(MOSDScrub *m)
7401 {
7402 dout(10) << "handle_scrub " << *m << dendl;
7403 if (!require_mon_or_mgr_peer(m)) {
7404 m->put();
7405 return;
7406 }
7407 if (m->fsid != monc->get_fsid()) {
7408 dout(0) << "handle_scrub fsid " << m->fsid << " != " << monc->get_fsid() << dendl;
7409 m->put();
7410 return;
7411 }
7412
7413 RWLock::RLocker l(pg_map_lock);
7414 if (m->scrub_pgs.empty()) {
7415 for (ceph::unordered_map<spg_t, PG*>::iterator p = pg_map.begin();
7416 p != pg_map.end();
7417 ++p)
7418 handle_pg_scrub(m, p->second);
7419 } else {
7420 for (vector<pg_t>::iterator p = m->scrub_pgs.begin();
7421 p != m->scrub_pgs.end();
7422 ++p) {
7423 spg_t pcand;
7424 if (osdmap->get_primary_shard(*p, &pcand)) {
7425 auto pg_map_entry = pg_map.find(pcand);
7426 if (pg_map_entry != pg_map.end()) {
7427 handle_pg_scrub(m, pg_map_entry->second);
7428 }
7429 }
7430 }
7431 }
7432
7433 m->put();
7434 }
7435
7436 bool OSD::scrub_random_backoff()
7437 {
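// return true (skip this scrub tick) with probability
// osd_scrub_backoff_ratio, spreading scrub starts across ticks.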
7438 bool coin_flip = (rand() / (double)RAND_MAX >=
7439 cct->_conf->osd_scrub_backoff_ratio);
7440 if (!coin_flip) {
7441 dout(20) << "scrub_random_backoff lost coin flip, randomly backing off" << dendl;
7442 return true;
7443 }
7444 return false;
7445 }
7446
7447 OSDService::ScrubJob::ScrubJob(CephContext* cct,
7448 const spg_t& pg, const utime_t& timestamp,
7449 double pool_scrub_min_interval,
7450 double pool_scrub_max_interval, bool must)
7451 : cct(cct),
7452 pgid(pg),
7453 sched_time(timestamp),
7454 deadline(timestamp)
7455 {
7456 // if not explicitly requested, postpone the scrub with a random delay
7457 if (!must) {
7458 double scrub_min_interval = pool_scrub_min_interval > 0 ?
7459 pool_scrub_min_interval : cct->_conf->osd_scrub_min_interval;
7460 double scrub_max_interval = pool_scrub_max_interval > 0 ?
7461 pool_scrub_max_interval : cct->_conf->osd_scrub_max_interval;
7462
7463 sched_time += scrub_min_interval;
7464 double r = rand() / (double)RAND_MAX;
7465 sched_time +=
7466 scrub_min_interval * cct->_conf->osd_scrub_interval_randomize_ratio * r;
7467 deadline += scrub_max_interval;
7468 }
7469 }
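// Editor's sketch of the resulting schedule, with illustrative (not
// authoritative) values osd_scrub_min_interval = 86400 s,
// osd_scrub_max_interval = 604800 s and
// osd_scrub_interval_randomize_ratio = 0.5: a non-"must" job stamped at
// time T gets
//
//   sched_time = T + 86400 + 86400 * 0.5 * r   // r uniform in [0, 1]
//   deadline   = T + 604800
//
// i.e. it becomes eligible somewhere in [T+1d, T+1.5d] and must run by
// T+7d, which spreads scrub starts instead of firing them all at T+1d.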
7470
7471 bool OSDService::ScrubJob::operator<(const OSDService::ScrubJob& rhs) const {
7472 if (sched_time < rhs.sched_time)
7473 return true;
7474 if (sched_time > rhs.sched_time)
7475 return false;
7476 return pgid < rhs.pgid;
7477 }
7478
7479 bool OSD::scrub_time_permit(utime_t now)
7480 {
7481 struct tm bdt;
7482 time_t tt = now.sec();
7483 localtime_r(&tt, &bdt);
7484 bool time_permit = false;
7485 if (cct->_conf->osd_scrub_begin_hour < cct->_conf->osd_scrub_end_hour) {
7486 if (bdt.tm_hour >= cct->_conf->osd_scrub_begin_hour && bdt.tm_hour < cct->_conf->osd_scrub_end_hour) {
7487 time_permit = true;
7488 }
7489 } else {
7490 if (bdt.tm_hour >= cct->_conf->osd_scrub_begin_hour || bdt.tm_hour < cct->_conf->osd_scrub_end_hour) {
7491 time_permit = true;
7492 }
7493 }
7494 if (!time_permit) {
7495 dout(20) << __func__ << " should run between " << cct->_conf->osd_scrub_begin_hour
7496 << " - " << cct->_conf->osd_scrub_end_hour
7497 << " now " << bdt.tm_hour << " = no" << dendl;
7498 } else {
7499 dout(20) << __func__ << " should run between " << cct->_conf->osd_scrub_begin_hour
7500 << " - " << cct->_conf->osd_scrub_end_hour
7501 << " now " << bdt.tm_hour << " = yes" << dendl;
7502 }
7503 return time_permit;
7504 }
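// Example (editor's note): osd_scrub_begin_hour = 22 with
// osd_scrub_end_hour = 6 takes the else branch above and describes a
// window that wraps midnight, permitting local hours 22-23 and 0-5; a
// 0..24 window permits scrubbing all day.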
7505
7506 bool OSD::scrub_load_below_threshold()
7507 {
7508 double loadavgs[3];
7509 if (getloadavg(loadavgs, 3) != 3) {
7510 dout(10) << __func__ << " couldn't read loadavgs" << dendl;
7511 return false;
7512 }
7513
7514 // allow scrub if below configured threshold
7515 if (loadavgs[0] < cct->_conf->osd_scrub_load_threshold) {
7516 dout(20) << __func__ << " loadavg " << loadavgs[0]
7517 << " < max " << cct->_conf->osd_scrub_load_threshold
7518 << " = yes" << dendl;
7519 return true;
7520 }
7521
7522 // allow scrub if below daily avg and currently decreasing
7523 if (loadavgs[0] < daily_loadavg && loadavgs[0] < loadavgs[2]) {
7524 dout(20) << __func__ << " loadavg " << loadavgs[0]
7525 << " < daily_loadavg " << daily_loadavg
7526 << " and < 15m avg " << loadavgs[2]
7527 << " = yes" << dendl;
7528 return true;
7529 }
7530
7531 dout(20) << __func__ << " loadavg " << loadavgs[0]
7532 << " >= max " << cct->_conf->osd_scrub_load_threshold
7533 << " and ( >= daily_loadavg " << daily_loadavg
7534 << " or >= 15m avg " << loadavgs[2]
7535 << ") = no" << dendl;
7536 return false;
7537 }
7538
7539 void OSD::sched_scrub()
7540 {
7541 // if not permitted, fail fast
7542 if (!service.can_inc_scrubs_pending()) {
7543 return;
7544 }
7545 if (!cct->_conf->osd_scrub_during_recovery && service.is_recovery_active()) {
7546 dout(20) << __func__ << " not scheduling scrubs due to active recovery" << dendl;
7547 return;
7548 }
7549
7550
7551 utime_t now = ceph_clock_now();
7552 bool time_permit = scrub_time_permit(now);
7553 bool load_is_low = scrub_load_below_threshold();
7554 dout(20) << "sched_scrub load_is_low=" << (int)load_is_low << dendl;
7555
7556 OSDService::ScrubJob scrub;
7557 if (service.first_scrub_stamp(&scrub)) {
7558 do {
7559 dout(30) << "sched_scrub examine " << scrub.pgid << " at " << scrub.sched_time << dendl;
7560
7561 if (scrub.sched_time > now) {
7562 // save ourselves some effort
7563 dout(10) << "sched_scrub " << scrub.pgid << " scheduled at " << scrub.sched_time
7564 << " > " << now << dendl;
7565 break;
7566 }
7567
7568 if ((scrub.deadline >= now) && !(time_permit && load_is_low)) {
7569 dout(10) << __func__ << " not scheduling scrub for " << scrub.pgid << " due to "
7570 << (!time_permit ? "time not permitted" : "high load") << dendl;
7571 continue;
7572 }
7573
7574 PG *pg = _lookup_lock_pg(scrub.pgid);
7575 if (!pg)
7576 continue;
7577 if (pg->get_pgbackend()->scrub_supported() && pg->is_active()) {
7578 dout(10) << "sched_scrub scrubbing " << scrub.pgid << " at " << scrub.sched_time
7579 << (pg->scrubber.must_scrub ? ", explicitly requested" :
7580 (load_is_low ? ", load_is_low" : ", deadline < now"))
7581 << dendl;
7582 if (pg->sched_scrub()) {
7583 pg->unlock();
7584 break;
7585 }
7586 }
7587 pg->unlock();
7588 } while (service.next_scrub_stamp(scrub, &scrub));
7589 }
7590 dout(20) << "sched_scrub done" << dendl;
7591 }
7592
7593
7594
7595 vector<OSDHealthMetric> OSD::get_health_metrics()
7596 {
7597 vector<OSDHealthMetric> metrics;
7598 lock_guard<mutex> pending_creates_locker{pending_creates_lock};
7599 auto n_primaries = pending_creates_from_mon;
7600 for (const auto& create : pending_creates_from_osd) {
7601 if (create.second) {
7602 n_primaries++;
7603 }
7604 }
7605 metrics.emplace_back(osd_metric::PENDING_CREATING_PGS, n_primaries);
7606 return metrics;
7607 }
7608
7609 // =====================================================
7610 // MAP
7611
7612 void OSD::wait_for_new_map(OpRequestRef op)
7613 {
7614 // ask?
7615 if (waiting_for_osdmap.empty()) {
7616 osdmap_subscribe(osdmap->get_epoch() + 1, false);
7617 }
7618
7619 logger->inc(l_osd_waiting_for_map);
7620 waiting_for_osdmap.push_back(op);
7621 op->mark_delayed("wait for new map");
7622 }
7623
7624
7625 /** update_map
7626 * assimilate new OSDMap(s). scan pgs, etc.
7627 */
7628
7629 void OSD::note_down_osd(int peer)
7630 {
7631 assert(osd_lock.is_locked());
7632 cluster_messenger->mark_down(osdmap->get_cluster_addr(peer));
7633
7634 heartbeat_lock.Lock();
7635 failure_queue.erase(peer);
7636 failure_pending.erase(peer);
7637 map<int,HeartbeatInfo>::iterator p = heartbeat_peers.find(peer);
7638 if (p != heartbeat_peers.end()) {
7639 p->second.con_back->mark_down();
7640 if (p->second.con_front) {
7641 p->second.con_front->mark_down();
7642 }
7643 heartbeat_peers.erase(p);
7644 }
7645 heartbeat_lock.Unlock();
7646 }
7647
7648 void OSD::note_up_osd(int peer)
7649 {
7650 service.forget_peer_epoch(peer, osdmap->get_epoch() - 1);
7651 heartbeat_set_peers_need_update();
7652 }
7653
7654 struct C_OnMapCommit : public Context {
7655 OSD *osd;
7656 epoch_t first, last;
7657 MOSDMap *msg;
7658 C_OnMapCommit(OSD *o, epoch_t f, epoch_t l, MOSDMap *m)
7659 : osd(o), first(f), last(l), msg(m) {}
7660 void finish(int r) override {
7661 osd->_committed_osd_maps(first, last, msg);
7662 msg->put();
7663 }
7664 };
7665
7666 struct C_OnMapApply : public Context {
7667 OSDService *service;
7668 list<OSDMapRef> pinned_maps;
7669 epoch_t e;
7670 C_OnMapApply(OSDService *service,
7671 const list<OSDMapRef> &pinned_maps,
7672 epoch_t e)
7673 : service(service), pinned_maps(pinned_maps), e(e) {}
7674 void finish(int r) override {
7675 service->clear_map_bl_cache_pins(e);
7676 }
7677 };
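// These two contexts back the queue_transaction() call at the end of
// handle_osd_map(): C_OnMapApply fires on apply and unpins the cached map
// buffers (the maps are readable from the store by then), while
// C_OnMapCommit fires on commit and runs _committed_osd_maps() to actually
// act on the new epochs.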
7678
7679 void OSD::osdmap_subscribe(version_t epoch, bool force_request)
7680 {
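// one-shot subscription: ask the mon for osdmaps starting at `epoch`.
// latest_subscribed_epoch dedups repeated requests for epochs we have
// already asked for, unless the caller forces a renewal.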
7681 Mutex::Locker l(osdmap_subscribe_lock);
7682 if (latest_subscribed_epoch >= epoch && !force_request)
7683 return;
7684
7685 latest_subscribed_epoch = MAX(epoch, latest_subscribed_epoch);
7686
7687 if (monc->sub_want_increment("osdmap", epoch, CEPH_SUBSCRIBE_ONETIME) ||
7688 force_request) {
7689 monc->renew_subs();
7690 }
7691 }
7692
7693 void OSD::trim_maps(epoch_t oldest, int nreceived, bool skip_maps)
7694 {
7695 epoch_t min = std::min(oldest, service.map_cache.cached_key_lower_bound());
7696 if (min <= superblock.oldest_map)
7697 return;
7698
7699 int num = 0;
7700 ObjectStore::Transaction t;
7701 for (epoch_t e = superblock.oldest_map; e < min; ++e) {
7702 dout(20) << " removing old osdmap epoch " << e << dendl;
7703 t.remove(coll_t::meta(), get_osdmap_pobject_name(e));
7704 t.remove(coll_t::meta(), get_inc_osdmap_pobject_name(e));
7705 superblock.oldest_map = e + 1;
7706 num++;
7707 if (num >= cct->_conf->osd_target_transaction_size && num >= nreceived) {
7708 service.publish_superblock(superblock);
7709 write_superblock(t);
7710 int tr = store->queue_transaction(service.meta_osr.get(), std::move(t), nullptr);
7711 assert(tr == 0);
7712 num = 0;
7713 if (!skip_maps) {
7714 // skip_maps leaves us with a range of old maps if we fail to remove all
7715 // of them before moving superblock.oldest_map forward to the first map
7716 // in the incoming MOSDMap msg. so in that case we keep removing them,
7717 // even though it may mean a huge series of delete transactions all at
7718 // once.
7719 break;
7720 }
7721 }
7722 }
7723 if (num > 0) {
7724 service.publish_superblock(superblock);
7725 write_superblock(t);
7726 int tr = store->queue_transaction(service.meta_osr.get(), std::move(t), nullptr);
7727 assert(tr == 0);
7728 }
7729 // we should not remove the cached maps
7730 assert(min <= service.map_cache.cached_key_lower_bound());
7731 }
7732
7733 void OSD::handle_osd_map(MOSDMap *m)
7734 {
7735 assert(osd_lock.is_locked());
7736 // Keep a ref in the list until we get the newly received map written
7737 // onto disk. This is important because as long as the refs are alive,
7738 // the OSDMaps will be pinned in the cache and we won't try to read them
7739 // off of disk. Otherwise these maps will probably not stay in the cache,
7740 // and reading those OSDMaps before they are actually written can result
7741 // in a crash.
7742 list<OSDMapRef> pinned_maps;
7743 if (m->fsid != monc->get_fsid()) {
7744 dout(0) << "handle_osd_map fsid " << m->fsid << " != "
7745 << monc->get_fsid() << dendl;
7746 m->put();
7747 return;
7748 }
7749 if (is_initializing()) {
7750 dout(0) << "ignoring osdmap until we have initialized" << dendl;
7751 m->put();
7752 return;
7753 }
7754
7755 Session *session = static_cast<Session *>(m->get_connection()->get_priv());
7756 if (session && !(session->entity_name.is_mon() ||
7757 session->entity_name.is_osd())) {
7758 //not enough perms!
7759 dout(10) << "got osd map from Session " << session
7760 << " which we can't take maps from (not a mon or osd)" << dendl;
7761 m->put();
7762 session->put();
7763 return;
7764 }
7765 if (session)
7766 session->put();
7767
7768 // share with the objecter
7769 if (!is_preboot())
7770 service.objecter->handle_osd_map(m);
7771
7772 epoch_t first = m->get_first();
7773 epoch_t last = m->get_last();
7774 dout(3) << "handle_osd_map epochs [" << first << "," << last << "], i have "
7775 << superblock.newest_map
7776 << ", src has [" << m->oldest_map << "," << m->newest_map << "]"
7777 << dendl;
7778
7779 logger->inc(l_osd_map);
7780 logger->inc(l_osd_mape, last - first + 1);
7781 if (first <= superblock.newest_map)
7782 logger->inc(l_osd_mape_dup, superblock.newest_map - first + 1);
7783 if (service.max_oldest_map < m->oldest_map) {
7784 service.max_oldest_map = m->oldest_map;
7785 assert(service.max_oldest_map >= superblock.oldest_map);
7786 }
7787
7788 // make sure there is something new, here, before we bother flushing
7789 // the queues and such
7790 if (last <= superblock.newest_map) {
7791 dout(10) << " no new maps here, dropping" << dendl;
7792 m->put();
7793 return;
7794 }
7795
7796 // missing some?
7797 bool skip_maps = false;
7798 if (first > superblock.newest_map + 1) {
7799 dout(10) << "handle_osd_map message skips epochs "
7800 << superblock.newest_map + 1 << ".." << (first-1) << dendl;
7801 if (m->oldest_map <= superblock.newest_map + 1) {
7802 osdmap_subscribe(superblock.newest_map + 1, false);
7803 m->put();
7804 return;
7805 }
7806 // always try to get the full range of maps--as many as we can. this
7807 // 1- is good to have
7808 // 2- is at present the only way to ensure that we get a *full* map as
7809 // the first map!
7810 if (m->oldest_map < first) {
7811 osdmap_subscribe(m->oldest_map - 1, true);
7812 m->put();
7813 return;
7814 }
7815 skip_maps = true;
7816 }
7817
7818 ObjectStore::Transaction t;
7819 uint64_t txn_size = 0;
7820
7821 // store new maps: queue for disk and put in the osdmap cache
7822 epoch_t start = MAX(superblock.newest_map + 1, first);
7823 for (epoch_t e = start; e <= last; e++) {
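// each map appended below must strictly grow the transaction; if the
// previous byte count is not smaller, the uint64_t accounting wrapped.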
7824 if (txn_size >= t.get_num_bytes()) {
7825 derr << __func__ << " transaction size overflowed" << dendl;
7826 assert(txn_size < t.get_num_bytes());
7827 }
7828 txn_size = t.get_num_bytes();
7829 map<epoch_t,bufferlist>::iterator p;
7830 p = m->maps.find(e);
7831 if (p != m->maps.end()) {
7832 dout(10) << "handle_osd_map got full map for epoch " << e << dendl;
7833 OSDMap *o = new OSDMap;
7834 bufferlist& bl = p->second;
7835
7836 o->decode(bl);
7837
7838 ghobject_t fulloid = get_osdmap_pobject_name(e);
7839 t.write(coll_t::meta(), fulloid, 0, bl.length(), bl);
7840 pin_map_bl(e, bl);
7841 pinned_maps.push_back(add_map(o));
7842
7843 got_full_map(e);
7844 continue;
7845 }
7846
7847 p = m->incremental_maps.find(e);
7848 if (p != m->incremental_maps.end()) {
7849 dout(10) << "handle_osd_map got inc map for epoch " << e << dendl;
7850 bufferlist& bl = p->second;
7851 ghobject_t oid = get_inc_osdmap_pobject_name(e);
7852 t.write(coll_t::meta(), oid, 0, bl.length(), bl);
7853 pin_map_inc_bl(e, bl);
7854
7855 OSDMap *o = new OSDMap;
7856 if (e > 1) {
7857 bufferlist obl;
7858 bool got = get_map_bl(e - 1, obl);
7859 assert(got);
7860 o->decode(obl);
7861 }
7862
7863 OSDMap::Incremental inc;
7864 bufferlist::iterator p = bl.begin();
7865 inc.decode(p);
7866 if (o->apply_incremental(inc) < 0) {
7867 derr << "ERROR: bad fsid? i have " << osdmap->get_fsid() << " and inc has " << inc.fsid << dendl;
7868 assert(0 == "bad fsid");
7869 }
7870
7871 bufferlist fbl;
7872 o->encode(fbl, inc.encode_features | CEPH_FEATURE_RESERVED);
7873
7874 bool injected_failure = false;
7875 if (cct->_conf->osd_inject_bad_map_crc_probability > 0 &&
7876 (rand() % 10000) < cct->_conf->osd_inject_bad_map_crc_probability*10000.0) {
7877 derr << __func__ << " injecting map crc failure" << dendl;
7878 injected_failure = true;
7879 }
7880
7881 if ((inc.have_crc && o->get_crc() != inc.full_crc) || injected_failure) {
7882 dout(2) << "got incremental " << e
7883 << " but failed to encode full with correct crc; requesting"
7884 << dendl;
7885 clog->warn() << "failed to encode map e" << e << " with expected crc";
7886 dout(20) << "my encoded map was:\n";
7887 fbl.hexdump(*_dout);
7888 *_dout << dendl;
7889 delete o;
7890 request_full_map(e, last);
7891 last = e - 1;
7892 break;
7893 }
7894 got_full_map(e);
7895
7896 ghobject_t fulloid = get_osdmap_pobject_name(e);
7897 t.write(coll_t::meta(), fulloid, 0, fbl.length(), fbl);
7898 pin_map_bl(e, fbl);
7899 pinned_maps.push_back(add_map(o));
7900 continue;
7901 }
7902
7903 assert(0 == "MOSDMap lied about what maps it had?");
7904 }
7905
7906 // even if this map isn't from a mon, we may have satisfied our subscription
7907 monc->sub_got("osdmap", last);
7908
7909 if (!m->maps.empty() && requested_full_first) {
7910 dout(10) << __func__ << " still missing full maps " << requested_full_first
7911 << ".." << requested_full_last << dendl;
7912 rerequest_full_maps();
7913 }
7914
7915 if (superblock.oldest_map) {
7916 // make sure we at least keep pace with incoming maps
7917 trim_maps(m->oldest_map, last - first + 1, skip_maps);
7918 }
7919
7920 if (!superblock.oldest_map || skip_maps)
7921 superblock.oldest_map = first;
7922 superblock.newest_map = last;
7923 superblock.current_epoch = last;
7924
7925 // note in the superblock that we were clean thru the prior epoch
7926 epoch_t boot_epoch = service.get_boot_epoch();
7927 if (boot_epoch && boot_epoch >= superblock.mounted) {
7928 superblock.mounted = boot_epoch;
7929 superblock.clean_thru = last;
7930 }
7931
7932 // superblock and commit
7933 write_superblock(t);
7934 store->queue_transaction(
7935 service.meta_osr.get(),
7936 std::move(t),
7937 new C_OnMapApply(&service, pinned_maps, last),
7938 new C_OnMapCommit(this, start, last, m), 0);
7939 service.publish_superblock(superblock);
7940 }
7941
7942 void OSD::_committed_osd_maps(epoch_t first, epoch_t last, MOSDMap *m)
7943 {
7944 dout(10) << __func__ << " " << first << ".." << last << dendl;
7945 if (is_stopping()) {
7946 dout(10) << __func__ << " bailing, we are shutting down" << dendl;
7947 return;
7948 }
7949 Mutex::Locker l(osd_lock);
7950 if (is_stopping()) {
7951 dout(10) << __func__ << " bailing, we are shutting down" << dendl;
7952 return;
7953 }
7954 map_lock.get_write();
7955
7956 bool do_shutdown = false;
7957 bool do_restart = false;
7958 bool network_error = false;
7959
7960 // advance through the new maps
7961 for (epoch_t cur = first; cur <= last; cur++) {
7962 dout(10) << " advance to epoch " << cur
7963 << " (<= last " << last
7964 << " <= newest_map " << superblock.newest_map
7965 << ")" << dendl;
7966
7967 OSDMapRef newmap = get_map(cur);
7968 assert(newmap); // we just cached it above!
7969
7970 // start blacklisting messages sent to peers that go down.
7971 service.pre_publish_map(newmap);
7972
7973 // kill connections to newly down osds
7974 bool waited_for_reservations = false;
7975 set<int> old;
7976 osdmap->get_all_osds(old);
7977 for (set<int>::iterator p = old.begin(); p != old.end(); ++p) {
7978 if (*p != whoami &&
7979 osdmap->is_up(*p) && // in old map
7980 newmap->is_down(*p)) { // but not the new one
7981 if (!waited_for_reservations) {
7982 service.await_reserved_maps();
7983 waited_for_reservations = true;
7984 }
7985 note_down_osd(*p);
7986 } else if (*p != whoami &&
7987 osdmap->is_down(*p) &&
7988 newmap->is_up(*p)) {
7989 note_up_osd(*p);
7990 }
7991 }
7992
7993 if ((osdmap->test_flag(CEPH_OSDMAP_NOUP) !=
7994 newmap->test_flag(CEPH_OSDMAP_NOUP)) ||
7995 (osdmap->is_noup(whoami) != newmap->is_noup(whoami))) {
7996 dout(10) << __func__ << " NOUP flag changed in " << newmap->get_epoch()
7997 << dendl;
7998 if (is_booting()) {
7999 // this captures the case where we sent the boot message while
8000 // NOUP was being set on the mon and our boot request was
8001 // dropped, and then later it is cleared. it imperfectly
8002 // handles the case where our original boot message was not
8003 // dropped and we restart even though we might have booted, but
8004 // that is harmless (boot will just take slightly longer).
8005 do_restart = true;
8006 }
8007 }
8008 if (osdmap->require_osd_release < CEPH_RELEASE_LUMINOUS &&
8009 newmap->require_osd_release >= CEPH_RELEASE_LUMINOUS) {
8010 dout(10) << __func__ << " require_osd_release reached luminous in "
8011 << newmap->get_epoch() << dendl;
8012 clear_pg_stat_queue();
8013 clear_outstanding_pg_stats();
8014 }
8015
8016 osdmap = newmap;
8017 epoch_t up_epoch;
8018 epoch_t boot_epoch;
8019 service.retrieve_epochs(&boot_epoch, &up_epoch, NULL);
8020 if (!up_epoch &&
8021 osdmap->is_up(whoami) &&
8022 osdmap->get_inst(whoami) == client_messenger->get_myinst()) {
8023 up_epoch = osdmap->get_epoch();
8024 dout(10) << "up_epoch is " << up_epoch << dendl;
8025 if (!boot_epoch) {
8026 boot_epoch = osdmap->get_epoch();
8027 dout(10) << "boot_epoch is " << boot_epoch << dendl;
8028 }
8029 service.set_epochs(&boot_epoch, &up_epoch, NULL);
8030 }
8031 }
8032
8033 had_map_since = ceph_clock_now();
8034
8035 epoch_t _bind_epoch = service.get_bind_epoch();
8036 if (osdmap->is_up(whoami) &&
8037 osdmap->get_addr(whoami) == client_messenger->get_myaddr() &&
8038 _bind_epoch < osdmap->get_up_from(whoami)) {
8039
8040 if (is_booting()) {
8041 dout(1) << "state: booting -> active" << dendl;
8042 set_state(STATE_ACTIVE);
8043
8044 // set incarnation so that osd_reqid_t's we generate for our
8045 // objecter requests are unique across restarts.
8046 service.objecter->set_client_incarnation(osdmap->get_epoch());
8047 }
8048 }
8049
8050 if (osdmap->get_epoch() > 0 &&
8051 is_active()) {
8052 if (!osdmap->exists(whoami)) {
8053 dout(0) << "map says i do not exist. shutting down." << dendl;
8054 do_shutdown = true; // don't call shutdown() while we have
8055 // everything paused
8056 } else if (!osdmap->is_up(whoami) ||
8057 !osdmap->get_addr(whoami).probably_equals(
8058 client_messenger->get_myaddr()) ||
8059 !osdmap->get_cluster_addr(whoami).probably_equals(
8060 cluster_messenger->get_myaddr()) ||
8061 !osdmap->get_hb_back_addr(whoami).probably_equals(
8062 hb_back_server_messenger->get_myaddr()) ||
8063 (osdmap->get_hb_front_addr(whoami) != entity_addr_t() &&
8064 !osdmap->get_hb_front_addr(whoami).probably_equals(
8065 hb_front_server_messenger->get_myaddr()))) {
8066 if (!osdmap->is_up(whoami)) {
8067 if (service.is_preparing_to_stop() || service.is_stopping()) {
8068 service.got_stop_ack();
8069 } else {
8070 clog->warn() << "Monitor daemon marked osd." << whoami << " down, "
8071 "but it is still running";
8072 clog->debug() << "map e" << osdmap->get_epoch()
8073 << " wrongly marked me down at e"
8074 << osdmap->get_down_at(whoami);
8075 }
8076 } else if (!osdmap->get_addr(whoami).probably_equals(
8077 client_messenger->get_myaddr())) {
8078 clog->error() << "map e" << osdmap->get_epoch()
8079 << " had wrong client addr (" << osdmap->get_addr(whoami)
8080 << " != my " << client_messenger->get_myaddr() << ")";
8081 } else if (!osdmap->get_cluster_addr(whoami).probably_equals(
8082 cluster_messenger->get_myaddr())) {
8083 clog->error() << "map e" << osdmap->get_epoch()
8084 << " had wrong cluster addr ("
8085 << osdmap->get_cluster_addr(whoami)
8086 << " != my " << cluster_messenger->get_myaddr() << ")";
8087 } else if (!osdmap->get_hb_back_addr(whoami).probably_equals(
8088 hb_back_server_messenger->get_myaddr())) {
8089 clog->error() << "map e" << osdmap->get_epoch()
8090 << " had wrong heartbeat back addr ("
8091 << osdmap->get_hb_back_addr(whoami)
8092 << " != my " << hb_back_server_messenger->get_myaddr()
8093 << ")";
8094 } else if (osdmap->get_hb_front_addr(whoami) != entity_addr_t() &&
8095 !osdmap->get_hb_front_addr(whoami).probably_equals(
8096 hb_front_server_messenger->get_myaddr())) {
8097 clog->error() << "map e" << osdmap->get_epoch()
8098 << " had wrong heartbeat front addr ("
8099 << osdmap->get_hb_front_addr(whoami)
8100 << " != my " << hb_front_server_messenger->get_myaddr()
8101 << ")";
8102 }
8103
8104 if (!service.is_stopping()) {
8105 epoch_t up_epoch = 0;
8106 epoch_t bind_epoch = osdmap->get_epoch();
8107 service.set_epochs(NULL,&up_epoch, &bind_epoch);
8108 do_restart = true;
8109
8110 // record this markdown event
8111 utime_t now = ceph_clock_now();
8112 utime_t grace = utime_t(cct->_conf->osd_max_markdown_period, 0);
8113 osd_markdown_log.push_back(now);
8114 // drop markdown entries older than the grace period
8115 while (!osd_markdown_log.empty() &&
8116 osd_markdown_log.front() + grace < now)
8117 osd_markdown_log.pop_front();
8118 if ((int)osd_markdown_log.size() > cct->_conf->osd_max_markdown_count) {
8119 dout(0) << __func__ << " marked down "
8120 << osd_markdown_log.size()
8121 << " > osd_max_markdown_count "
8122 << cct->_conf->osd_max_markdown_count
8123 << " in last " << grace << " seconds, shutting down"
8124 << dendl;
8125 do_restart = false;
8126 do_shutdown = true;
8127 }
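// e.g. with osd_max_markdown_period = 600 and
// osd_max_markdown_count = 5 (illustrative values), a sixth markdown
// inside ten minutes turns the restart into a clean shutdown.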
8128
8129 start_waiting_for_healthy();
8130
8131 set<int> avoid_ports;
8132 #if defined(__FreeBSD__)
8133 // prevent FreeBSD from grabbing the client_messenger port during
8134 // rebinding; otherwise the cluster_messenger could end up bound
8135 // to the same port
8136 avoid_ports.insert(client_messenger->get_myaddr().get_port());
8137 #endif
8138 avoid_ports.insert(cluster_messenger->get_myaddr().get_port());
8139 avoid_ports.insert(hb_back_server_messenger->get_myaddr().get_port());
8140 avoid_ports.insert(hb_front_server_messenger->get_myaddr().get_port());
8141
8142 int r = cluster_messenger->rebind(avoid_ports);
8143 if (r != 0) {
8144 do_shutdown = true; // FIXME: do_restart?
8145 network_error = true;
8146 dout(0) << __func__ << " marked down:"
8147 << " rebind cluster_messenger failed" << dendl;
8148 }
8149
8150 r = hb_back_server_messenger->rebind(avoid_ports);
8151 if (r != 0) {
8152 do_shutdown = true; // FIXME: do_restart?
8153 network_error = true;
8154 dout(0) << __func__ << " marked down:"
8155 << " rebind hb_back_server_messenger failed" << dendl;
8156 }
8157
8158 r = hb_front_server_messenger->rebind(avoid_ports);
8159 if (r != 0) {
8160 do_shutdown = true; // FIXME: do_restart?
8161 network_error = true;
8162 dout(0) << __func__ << " marked down:"
8163 << " rebind hb_front_server_messenger failed" << dendl;
8164 }
8165
8166 hb_front_client_messenger->mark_down_all();
8167 hb_back_client_messenger->mark_down_all();
8168
8169 reset_heartbeat_peers();
8170 }
8171 }
8172 }
8173
8174 map_lock.put_write();
8175
8176 check_osdmap_features(store);
8177
8178 // yay!
8179 consume_map();
8180
8181 if (is_active() || is_waiting_for_healthy())
8182 maybe_update_heartbeat_peers();
8183
8184 if (!is_active()) {
8185 dout(10) << " not yet active; waiting for peering wq to drain" << dendl;
8186 peering_wq.drain();
8187 } else {
8188 activate_map();
8189 }
8190
8191 if (do_shutdown) {
8192 if (network_error) {
8193 Mutex::Locker l(heartbeat_lock);
8194 map<int,pair<utime_t,entity_inst_t>>::iterator it =
8195 failure_pending.begin();
8196 while (it != failure_pending.end()) {
8197 dout(10) << "handle_osd_ping canceling in-flight failure report for osd."
8198 << it->first << dendl;
8199 send_still_alive(osdmap->get_epoch(), it->second.second);
8200 failure_pending.erase(it++);
8201 }
8202 }
8203 // trigger shutdown in a different thread
8204 dout(0) << __func__ << " shutdown OSD via async signal" << dendl;
8205 queue_async_signal(SIGINT);
8206 }
8207 else if (m->newest_map && m->newest_map > last) {
8208 dout(10) << " msg says newest map is " << m->newest_map
8209 << ", requesting more" << dendl;
8210 osdmap_subscribe(osdmap->get_epoch()+1, false);
8211 }
8212 else if (is_preboot()) {
8213 if (m->get_source().is_mon())
8214 _preboot(m->oldest_map, m->newest_map);
8215 else
8216 start_boot();
8217 }
8218 else if (do_restart)
8219 start_boot();
8220
8221 }
8222
8223 void OSD::check_osdmap_features(ObjectStore *fs)
8224 {
8225 // adjust required feature bits?
8226
8227 // we have to be a bit careful here, because we are accessing the
8228 // Policy structures without taking any lock. in particular, only
8229 // modify integer values that can safely be read by a racing CPU.
8230 // since we are only accessing existing Policy structures at their
8231 // current memory location, and setting or clearing bits in integer
8232 // fields, and we are the only writer, this is not a problem.
8233
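// Worked example (editor's illustration): with mask = 0b1100 and map
// features = 0b0100, a policy requiring 0b1011 becomes
// (0b1011 & ~0b1100) | 0b0100 = 0b0111 -- bits outside the mask are
// preserved and masked bits are replaced wholesale.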
8234 {
8235 Messenger::Policy p = client_messenger->get_default_policy();
8236 uint64_t mask;
8237 uint64_t features = osdmap->get_features(entity_name_t::TYPE_CLIENT, &mask);
8238 if ((p.features_required & mask) != features) {
8239 dout(0) << "crush map has features " << features
8240 << ", adjusting msgr requires for clients" << dendl;
8241 p.features_required = (p.features_required & ~mask) | features;
8242 client_messenger->set_default_policy(p);
8243 }
8244 }
8245 {
8246 Messenger::Policy p = client_messenger->get_policy(entity_name_t::TYPE_MON);
8247 uint64_t mask;
8248 uint64_t features = osdmap->get_features(entity_name_t::TYPE_MON, &mask);
8249 if ((p.features_required & mask) != features) {
8250 dout(0) << "crush map has features " << features
8251 << " was " << p.features_required
8252 << ", adjusting msgr requires for mons" << dendl;
8253 p.features_required = (p.features_required & ~mask) | features;
8254 client_messenger->set_policy(entity_name_t::TYPE_MON, p);
8255 }
8256 }
8257 {
8258 Messenger::Policy p = cluster_messenger->get_policy(entity_name_t::TYPE_OSD);
8259 uint64_t mask;
8260 uint64_t features = osdmap->get_features(entity_name_t::TYPE_OSD, &mask);
8261
8262 if ((p.features_required & mask) != features) {
8263 dout(0) << "crush map has features " << features
8264 << ", adjusting msgr requires for osds" << dendl;
8265 p.features_required = (p.features_required & ~mask) | features;
8266 cluster_messenger->set_policy(entity_name_t::TYPE_OSD, p);
8267 }
8268
8269 if ((features & CEPH_FEATURE_OSD_ERASURE_CODES) &&
8270 !superblock.compat_features.incompat.contains(CEPH_OSD_FEATURE_INCOMPAT_SHARDS)) {
8271 dout(0) << __func__ << " enabling on-disk ERASURE CODES compat feature" << dendl;
8272 superblock.compat_features.incompat.insert(CEPH_OSD_FEATURE_INCOMPAT_SHARDS);
8273 ObjectStore::Transaction t;
8274 write_superblock(t);
8275 int err = store->queue_transaction(service.meta_osr.get(), std::move(t), NULL);
8276 assert(err == 0);
8277 }
8278 }
8279 }
8280
8281 bool OSD::advance_pg(
8282 epoch_t osd_epoch, PG *pg,
8283 ThreadPool::TPHandle &handle,
8284 PG::RecoveryCtx *rctx,
8285 set<PGRef> *new_pgs)
8286 {
8287 assert(pg->is_locked());
8288 epoch_t next_epoch = pg->get_osdmap()->get_epoch() + 1;
8289 OSDMapRef lastmap = pg->get_osdmap();
8290
8291 if (lastmap->get_epoch() == osd_epoch)
8292 return true;
8293 assert(lastmap->get_epoch() < osd_epoch);
8294
8295 epoch_t min_epoch = service.get_min_pg_epoch();
8296 epoch_t max;
8297 if (min_epoch) {
8298 max = min_epoch + cct->_conf->osd_map_max_advance;
8299 } else {
8300 max = next_epoch + cct->_conf->osd_map_max_advance;
8301 }
8302
8303 for (;
8304 next_epoch <= osd_epoch && next_epoch <= max;
8305 ++next_epoch) {
8306 OSDMapRef nextmap = service.try_get_map(next_epoch);
8307 if (!nextmap) {
8308 dout(20) << __func__ << " missing map " << next_epoch << dendl;
8309 // make sure max is bumped up so that we can get past any
8310 // gap in maps
8311 max = MAX(max, next_epoch + cct->_conf->osd_map_max_advance);
8312 continue;
8313 }
8314
8315 vector<int> newup, newacting;
8316 int up_primary, acting_primary;
8317 nextmap->pg_to_up_acting_osds(
8318 pg->info.pgid.pgid,
8319 &newup, &up_primary,
8320 &newacting, &acting_primary);
8321 pg->handle_advance_map(
8322 nextmap, lastmap, newup, up_primary,
8323 newacting, acting_primary, rctx);
8324
8325 // Check for split!
8326 set<spg_t> children;
8327 spg_t parent(pg->info.pgid);
8328 if (parent.is_split(
8329 lastmap->get_pg_num(pg->pool.id),
8330 nextmap->get_pg_num(pg->pool.id),
8331 &children)) {
8332 service.mark_split_in_progress(pg->info.pgid, children);
8333 split_pgs(
8334 pg, children, new_pgs, lastmap, nextmap,
8335 rctx);
8336 }
8337
8338 lastmap = nextmap;
8339 handle.reset_tp_timeout();
8340 }
8341 service.pg_update_epoch(pg->info.pgid, lastmap->get_epoch());
8342 pg->handle_activate_map(rctx);
8343 if (next_epoch <= osd_epoch) {
8344 dout(10) << __func__ << " advanced to max " << max
8345 << " past min epoch " << min_epoch
8346 << " ... will requeue " << *pg << dendl;
8347 return false;
8348 }
8349 return true;
8350 }
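// Editor's note with illustrative numbers: with osd_map_max_advance = 150
// and a service-wide min_pg_epoch of 1000, a PG at epoch 1100 advances
// through epochs 1101..1150 in one pass; if osd_epoch is still newer, it
// returns false above and is requeued until slower PGs catch up.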
8351
8352 void OSD::consume_map()
8353 {
8354 assert(osd_lock.is_locked());
8355 dout(7) << "consume_map version " << osdmap->get_epoch() << dendl;
8356
8357 /** make sure the cluster is speaking in SORTBITWISE, because we don't
8358 * speak the older sorting version any more. Be careful not to force
8359 * a shutdown if we are merely processing old maps, though.
8360 */
8361 if (!osdmap->test_flag(CEPH_OSDMAP_SORTBITWISE) && is_active()) {
8362 derr << __func__ << " SORTBITWISE flag is not set" << dendl;
8363 ceph_abort();
8364 }
8365
8366 int num_pg_primary = 0, num_pg_replica = 0, num_pg_stray = 0;
8367 list<PGRef> to_remove;
8368
8369 // scan pg's
8370 {
8371 RWLock::RLocker l(pg_map_lock);
8372 for (ceph::unordered_map<spg_t,PG*>::iterator it = pg_map.begin();
8373 it != pg_map.end();
8374 ++it) {
8375 PG *pg = it->second;
8376 pg->lock();
8377 if (pg->is_primary())
8378 num_pg_primary++;
8379 else if (pg->is_replica())
8380 num_pg_replica++;
8381 else
8382 num_pg_stray++;
8383
8384 if (!osdmap->have_pg_pool(pg->info.pgid.pool())) {
8385 //pool is deleted!
8386 to_remove.push_back(PGRef(pg));
8387 } else {
8388 service.init_splits_between(it->first, service.get_osdmap(), osdmap);
8389 }
8390
8391 pg->unlock();
8392 }
8393
8394 lock_guard<mutex> pending_creates_locker{pending_creates_lock};
8395 for (auto pg = pending_creates_from_osd.cbegin();
8396 pg != pending_creates_from_osd.cend();) {
8397 if (osdmap->get_pg_acting_rank(pg->first, whoami) < 0) {
8398 pg = pending_creates_from_osd.erase(pg);
8399 } else {
8400 ++pg;
8401 }
8402 }
8403 }
8404
8405 for (list<PGRef>::iterator i = to_remove.begin();
8406 i != to_remove.end();
8407 to_remove.erase(i++)) {
8408 RWLock::WLocker locker(pg_map_lock);
8409 (*i)->lock();
8410 _remove_pg(&**i);
8411 (*i)->unlock();
8412 }
8413
8414 service.expand_pg_num(service.get_osdmap(), osdmap);
8415
8416 service.pre_publish_map(osdmap);
8417 service.await_reserved_maps();
8418 service.publish_map(osdmap);
8419
8420 service.maybe_inject_dispatch_delay();
8421
8422 dispatch_sessions_waiting_on_map();
8423
8424 service.maybe_inject_dispatch_delay();
8425
8426 // remove any PGs which we no longer host from the session waiting_for_pg lists
8427 dout(20) << __func__ << " checking waiting_for_pg" << dendl;
8428 op_shardedwq.prune_pg_waiters(osdmap, whoami);
8429
8430 service.maybe_inject_dispatch_delay();
8431
8432 // scan pg's
8433 {
8434 RWLock::RLocker l(pg_map_lock);
8435 for (ceph::unordered_map<spg_t,PG*>::iterator it = pg_map.begin();
8436 it != pg_map.end();
8437 ++it) {
8438 PG *pg = it->second;
8439 pg->lock();
8440 pg->queue_null(osdmap->get_epoch(), osdmap->get_epoch());
8441 pg->unlock();
8442 }
8443
8444 logger->set(l_osd_pg, pg_map.size());
8445 }
8446 logger->set(l_osd_pg_primary, num_pg_primary);
8447 logger->set(l_osd_pg_replica, num_pg_replica);
8448 logger->set(l_osd_pg_stray, num_pg_stray);
8449 }
8450
8451 void OSD::activate_map()
8452 {
8453 assert(osd_lock.is_locked());
8454
8455 dout(7) << "activate_map version " << osdmap->get_epoch() << dendl;
8456
8457 if (osdmap->test_flag(CEPH_OSDMAP_FULL)) {
8458 dout(10) << " osdmap flagged full, doing onetime osdmap subscribe" << dendl;
8459 osdmap_subscribe(osdmap->get_epoch() + 1, false);
8460 }
8461
8462 // norecover?
8463 if (osdmap->test_flag(CEPH_OSDMAP_NORECOVER)) {
8464 if (!service.recovery_is_paused()) {
8465 dout(1) << "pausing recovery (NORECOVER flag set)" << dendl;
8466 service.pause_recovery();
8467 }
8468 } else {
8469 if (service.recovery_is_paused()) {
8470 dout(1) << "unpausing recovery (NORECOVER flag unset)" << dendl;
8471 service.unpause_recovery();
8472 }
8473 }
8474
8475 service.activate_map();
8476
8477 // process waiters
8478 take_waiters(waiting_for_osdmap);
8479 }
8480
8481 bool OSD::require_mon_peer(const Message *m)
8482 {
8483 if (!m->get_connection()->peer_is_mon()) {
8484 dout(0) << "require_mon_peer received from non-mon "
8485 << m->get_connection()->get_peer_addr()
8486 << " " << *m << dendl;
8487 return false;
8488 }
8489 return true;
8490 }
8491
8492 bool OSD::require_mon_or_mgr_peer(const Message *m)
8493 {
8494 if (!m->get_connection()->peer_is_mon() &&
8495 !m->get_connection()->peer_is_mgr()) {
8496 dout(0) << "require_mon_or_mgr_peer received from non-mon, non-mgr "
8497 << m->get_connection()->get_peer_addr()
8498 << " " << *m << dendl;
8499 return false;
8500 }
8501 return true;
8502 }
8503
8504 bool OSD::require_osd_peer(const Message *m)
8505 {
8506 if (!m->get_connection()->peer_is_osd()) {
8507 dout(0) << "require_osd_peer received from non-osd "
8508 << m->get_connection()->get_peer_addr()
8509 << " " << *m << dendl;
8510 return false;
8511 }
8512 return true;
8513 }
8514
8515 bool OSD::require_self_aliveness(const Message *m, epoch_t epoch)
8516 {
8517 epoch_t up_epoch = service.get_up_epoch();
8518 if (epoch < up_epoch) {
8519 dout(7) << "from pre-up epoch " << epoch << " < " << up_epoch << dendl;
8520 return false;
8521 }
8522
8523 if (!is_active()) {
8524 dout(7) << "still in boot state, dropping message " << *m << dendl;
8525 return false;
8526 }
8527
8528 return true;
8529 }
8530
8531 bool OSD::require_same_peer_instance(const Message *m, OSDMapRef& map,
8532 bool is_fast_dispatch)
8533 {
8534 int from = m->get_source().num();
8535
8536 if (map->is_down(from) ||
8537 (map->get_cluster_addr(from) != m->get_source_inst().addr)) {
8538 dout(5) << "from dead osd." << from << ", marking down, "
8539 << " msg was " << m->get_source_inst().addr
8540 << " expected " << (map->is_up(from) ?
8541 map->get_cluster_addr(from) : entity_addr_t())
8542 << dendl;
8543 ConnectionRef con = m->get_connection();
8544 con->mark_down();
8545 Session *s = static_cast<Session*>(con->get_priv());
8546 if (s) {
8547 if (!is_fast_dispatch)
8548 s->session_dispatch_lock.Lock();
8549 clear_session_waiting_on_map(s);
8550 con->set_priv(NULL); // break ref <-> session cycle, if any
8551 if (!is_fast_dispatch)
8552 s->session_dispatch_lock.Unlock();
8553 s->put();
8554 }
8555 return false;
8556 }
8557 return true;
8558 }
8559
8560
8561 /*
8562 * require that we have same (or newer) map, and that
8563 * the source is the pg primary.
8564 */
8565 bool OSD::require_same_or_newer_map(OpRequestRef& op, epoch_t epoch,
8566 bool is_fast_dispatch)
8567 {
8568 const Message *m = op->get_req();
8569 dout(15) << "require_same_or_newer_map " << epoch
8570 << " (i am " << osdmap->get_epoch() << ") " << m << dendl;
8571
8572 assert(osd_lock.is_locked());
8573
8574 // do they have a newer map?
8575 if (epoch > osdmap->get_epoch()) {
8576 dout(7) << "waiting for newer map epoch " << epoch
8577 << " > my " << osdmap->get_epoch() << " with " << m << dendl;
8578 wait_for_new_map(op);
8579 return false;
8580 }
8581
8582 if (!require_self_aliveness(op->get_req(), epoch)) {
8583 return false;
8584 }
8585
8586 // ok, our map is same or newer.. do they still exist?
8587 if (m->get_connection()->get_messenger() == cluster_messenger &&
8588 !require_same_peer_instance(op->get_req(), osdmap, is_fast_dispatch)) {
8589 return false;
8590 }
8591
8592 return true;
8593 }
8594
8595
8596
8597
8598
8599 // ----------------------------------------
8600 // pg creation
8601
8602 void OSD::split_pgs(
8603 PG *parent,
8604 const set<spg_t> &childpgids, set<PGRef> *out_pgs,
8605 OSDMapRef curmap,
8606 OSDMapRef nextmap,
8607 PG::RecoveryCtx *rctx)
8608 {
8609 unsigned pg_num = nextmap->get_pg_num(
8610 parent->pool.id);
8611 parent->update_snap_mapper_bits(
8612 parent->info.pgid.get_split_bits(pg_num)
8613 );
8614
8615 vector<object_stat_sum_t> updated_stats(childpgids.size() + 1);
8616 parent->info.stats.stats.sum.split(updated_stats);
8617
8618 vector<object_stat_sum_t>::iterator stat_iter = updated_stats.begin();
8619 for (set<spg_t>::const_iterator i = childpgids.begin();
8620 i != childpgids.end();
8621 ++i, ++stat_iter) {
8622 assert(stat_iter != updated_stats.end());
8623 dout(10) << "Splitting " << *parent << " into " << *i << dendl;
8624 assert(service.splitting(*i));
8625 PG* child = _make_pg(nextmap, *i);
8626 child->lock(true);
8627 out_pgs->insert(child);
8628 rctx->created_pgs.insert(child);
8629
8630 unsigned split_bits = i->get_split_bits(pg_num);
8631 dout(10) << "pg_num is " << pg_num << dendl;
8632 dout(10) << "m_seed " << i->ps() << dendl;
8633 dout(10) << "split_bits is " << split_bits << dendl;
8634
8635 parent->split_colls(
8636 *i,
8637 split_bits,
8638 i->ps(),
8639 &child->pool.info,
8640 rctx->transaction);
8641 parent->split_into(
8642 i->pgid,
8643 child,
8644 split_bits);
8645 child->info.stats.stats.sum = *stat_iter;
8646
8647 child->write_if_dirty(*(rctx->transaction));
8648 child->unlock();
8649 }
8650 assert(stat_iter != updated_stats.end());
8651 parent->info.stats.stats.sum = *stat_iter;
8652 parent->write_if_dirty(*(rctx->transaction));
8653 }
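// Worked example (editor's sketch): doubling a pool from pg_num 8 to 16
// splits seed 0x3, so pg 1.3 keeps part of its objects and the new child
// 1.b (0x3 + 8) takes the rest; get_split_bits(16) yields 4 because 16 PGs
// consume four bits of the placement seed, and the object stats are
// divided across parent and children via split() above.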
8654
8655 /*
8656 * holding osd_lock
8657 */
8658 void OSD::handle_pg_create(OpRequestRef op)
8659 {
8660 const MOSDPGCreate *m = static_cast<const MOSDPGCreate*>(op->get_req());
8661 assert(m->get_type() == MSG_OSD_PG_CREATE);
8662
8663 dout(10) << "handle_pg_create " << *m << dendl;
8664
8665 if (!require_mon_peer(op->get_req())) {
8666 return;
8667 }
8668
8669 if (!require_same_or_newer_map(op, m->epoch, false))
8670 return;
8671
8672 op->mark_started();
8673
8674 map<pg_t,utime_t>::const_iterator ci = m->ctimes.begin();
8675 for (map<pg_t,pg_create_t>::const_iterator p = m->mkpg.begin();
8676 p != m->mkpg.end();
8677 ++p, ++ci) {
8678 assert(ci != m->ctimes.end() && ci->first == p->first);
8679 epoch_t created = p->second.created;
8680 if (p->second.split_bits) // Skip split pgs
8681 continue;
8682 pg_t on = p->first;
8683
8684 if (on.preferred() >= 0) {
8685 dout(20) << "ignoring localized pg " << on << dendl;
8686 continue;
8687 }
8688
8689 if (!osdmap->have_pg_pool(on.pool())) {
8690 dout(20) << "ignoring pg on deleted pool " << on << dendl;
8691 continue;
8692 }
8693
8694 dout(20) << "mkpg " << on << " e" << created << "@" << ci->second << dendl;
8695
8696 // is it still ours?
8697 vector<int> up, acting;
8698 int up_primary = -1;
8699 int acting_primary = -1;
8700 osdmap->pg_to_up_acting_osds(on, &up, &up_primary, &acting, &acting_primary);
8701 int role = osdmap->calc_pg_role(whoami, acting, acting.size());
8702
8703 if (acting_primary != whoami) {
8704 dout(10) << "mkpg " << on << " not acting_primary (" << acting_primary
8705 << "), my role=" << role << ", skipping" << dendl;
8706 continue;
8707 }
8708
8709 spg_t pgid;
8710 bool mapped = osdmap->get_primary_shard(on, &pgid);
8711 assert(mapped);
8712
8713 PastIntervals pi(
8714 osdmap->get_pools().at(pgid.pool()).ec_pool(),
8715 *osdmap);
8716 pg_history_t history;
8717 build_initial_pg_history(pgid, created, ci->second, &history, &pi);
8718
8719 // The mon won't resend unless the primary changed, so
8720 // we ignore same_interval_since. We'll pass this history
8721 // to handle_pg_peering_evt with the current epoch as the
8722 // event -- the project_pg_history check in
8723 // handle_pg_peering_evt will be a noop.
8724 if (history.same_primary_since > m->epoch) {
8725 dout(10) << __func__ << ": got obsolete pg create on pgid "
8726 << pgid << " from epoch " << m->epoch
8727 << ", primary changed in " << history.same_primary_since
8728 << dendl;
8729 continue;
8730 }
8731 if (handle_pg_peering_evt(
8732 pgid,
8733 history,
8734 pi,
8735 osdmap->get_epoch(),
8736 PG::CephPeeringEvtRef(
8737 new PG::CephPeeringEvt(
8738 osdmap->get_epoch(),
8739 osdmap->get_epoch(),
8740 PG::NullEvt()))
8741 ) == -EEXIST) {
8742 service.send_pg_created(pgid.pgid);
8743 }
8744 }
8745
8746 {
8747 lock_guard<mutex> pending_creates_locker{pending_creates_lock};
8748 if (pending_creates_from_mon == 0) {
8749 last_pg_create_epoch = m->epoch;
8750 }
8751 }
8752 maybe_update_heartbeat_peers();
8753 }
8754
8755
8756 // ----------------------------------------
8757 // peering and recovery
8758
8759 PG::RecoveryCtx OSD::create_context()
8760 {
8761 ObjectStore::Transaction *t = new ObjectStore::Transaction;
8762 C_Contexts *on_applied = new C_Contexts(cct);
8763 C_Contexts *on_safe = new C_Contexts(cct);
8764 map<int, map<spg_t,pg_query_t> > *query_map =
8765 new map<int, map<spg_t, pg_query_t> >;
8766 map<int,vector<pair<pg_notify_t, PastIntervals> > > *notify_list =
8767 new map<int, vector<pair<pg_notify_t, PastIntervals> > >;
8768 map<int,vector<pair<pg_notify_t, PastIntervals> > > *info_map =
8769 new map<int,vector<pair<pg_notify_t, PastIntervals> > >;
8770 PG::RecoveryCtx rctx(query_map, info_map, notify_list,
8771 on_applied, on_safe, t);
8772 return rctx;
8773 }
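// Note: the RecoveryCtx owns everything allocated above. dispatch_context()
// below always deletes the three maps after (optionally) sending their
// contents, and either hands the transaction and contexts to
// queue_transaction() or deletes them.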
8774
8775 struct C_OpenPGs : public Context {
8776 set<PGRef> pgs;
8777 ObjectStore *store;
8778 OSD *osd;
8779 C_OpenPGs(set<PGRef>& p, ObjectStore *s, OSD* o) : store(s), osd(o) {
8780 pgs.swap(p);
8781 }
8782 void finish(int r) override {
8783 RWLock::RLocker l(osd->pg_map_lock);
8784 for (auto p : pgs) {
8785 if (osd->pg_map.count(p->info.pgid)) {
8786 p->ch = store->open_collection(p->coll);
8787 assert(p->ch);
8788 }
8789 }
8790 }
8791 };
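// C_OpenPGs runs from a transaction's on_applied list: once the transaction
// that created the new pg collections has been applied, it opens a
// collection handle for each pg that is still registered in pg_map.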
8792
8793 void OSD::dispatch_context_transaction(PG::RecoveryCtx &ctx, PG *pg,
8794 ThreadPool::TPHandle *handle)
8795 {
8796 if (!ctx.transaction->empty()) {
8797 if (!ctx.created_pgs.empty()) {
8798 ctx.on_applied->add(new C_OpenPGs(ctx.created_pgs, store, this));
8799 }
8800 int tr = store->queue_transaction(
8801 pg->osr.get(),
8802 std::move(*ctx.transaction), ctx.on_applied, ctx.on_safe, NULL,
8803 TrackedOpRef(), handle);
8804 delete (ctx.transaction);
8805 assert(tr == 0);
8806 ctx.transaction = new ObjectStore::Transaction;
8807 ctx.on_applied = new C_Contexts(cct);
8808 ctx.on_safe = new C_Contexts(cct);
8809 }
8810 }
8811
8812 void OSD::dispatch_context(PG::RecoveryCtx &ctx, PG *pg, OSDMapRef curmap,
8813 ThreadPool::TPHandle *handle)
8814 {
8815 if (service.get_osdmap()->is_up(whoami) &&
8816 is_active()) {
8817 do_notifies(*ctx.notify_list, curmap);
8818 do_queries(*ctx.query_map, curmap);
8819 do_infos(*ctx.info_map, curmap);
8820 }
8821 delete ctx.notify_list;
8822 delete ctx.query_map;
8823 delete ctx.info_map;
8824 if ((ctx.on_applied->empty() &&
8825 ctx.on_safe->empty() &&
8826 ctx.transaction->empty() &&
8827 ctx.created_pgs.empty()) || !pg) {
8828 delete ctx.transaction;
8829 delete ctx.on_applied;
8830 delete ctx.on_safe;
8831 assert(ctx.created_pgs.empty());
8832 } else {
8833 if (!ctx.created_pgs.empty()) {
8834 ctx.on_applied->add(new C_OpenPGs(ctx.created_pgs, store, this));
8835 }
8836 int tr = store->queue_transaction(
8837 pg->osr.get(),
8838 std::move(*ctx.transaction), ctx.on_applied, ctx.on_safe, NULL, TrackedOpRef(),
8839 handle);
8840 delete (ctx.transaction);
8841 assert(tr == 0);
8842 }
8843 }
8844
8845 /** do_notifies
8846 * Send an MOSDPGNotify to a primary, with a list of PGs that I have
8847 * content for and that it is primary for.
8848 */
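// Illustrative shape of the input (hypothetical values): notify_list maps a
// peer osd id to the notifies destined for it, e.g.
//   map<int, vector<pair<pg_notify_t, PastIntervals>>> notify_list;
//   notify_list[3].push_back(make_pair(notify, past_intervals));
//   do_notifies(notify_list, curmap);  // one MOSDPGNotify per up peer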
8849
8850 void OSD::do_notifies(
8851 map<int,vector<pair<pg_notify_t,PastIntervals> > >& notify_list,
8852 OSDMapRef curmap)
8853 {
8854 for (map<int,
8855 vector<pair<pg_notify_t,PastIntervals> > >::iterator it =
8856 notify_list.begin();
8857 it != notify_list.end();
8858 ++it) {
8859 if (!curmap->is_up(it->first)) {
8860 dout(20) << __func__ << " skipping down osd." << it->first << dendl;
8861 continue;
8862 }
8863 ConnectionRef con = service.get_con_osd_cluster(
8864 it->first, curmap->get_epoch());
8865 if (!con) {
8866 dout(20) << __func__ << " skipping osd." << it->first
8867 << " (NULL con)" << dendl;
8868 continue;
8869 }
8870 service.share_map_peer(it->first, con.get(), curmap);
8871 dout(7) << __func__ << " osd." << it->first
8872 << " on " << it->second.size() << " PGs" << dendl;
8873 MOSDPGNotify *m = new MOSDPGNotify(curmap->get_epoch(),
8874 it->second);
8875 con->send_message(m);
8876 }
8877 }
8878
8879
8880 /** do_queries
8881 * send out pending queries for info | summaries
8882 */
8883 void OSD::do_queries(map<int, map<spg_t,pg_query_t> >& query_map,
8884 OSDMapRef curmap)
8885 {
8886 for (map<int, map<spg_t,pg_query_t> >::iterator pit = query_map.begin();
8887 pit != query_map.end();
8888 ++pit) {
8889 if (!curmap->is_up(pit->first)) {
8890 dout(20) << __func__ << " skipping down osd." << pit->first << dendl;
8891 continue;
8892 }
8893 int who = pit->first;
8894 ConnectionRef con = service.get_con_osd_cluster(who, curmap->get_epoch());
8895 if (!con) {
8896 dout(20) << __func__ << " skipping osd." << who
8897 << " (NULL con)" << dendl;
8898 continue;
8899 }
8900 service.share_map_peer(who, con.get(), curmap);
8901 dout(7) << __func__ << " querying osd." << who
8902 << " on " << pit->second.size() << " PGs" << dendl;
8903 MOSDPGQuery *m = new MOSDPGQuery(curmap->get_epoch(), pit->second);
8904 con->send_message(m);
8905 }
8906 }
8907
8908
8909 void OSD::do_infos(map<int,
8910 vector<pair<pg_notify_t, PastIntervals> > >& info_map,
8911 OSDMapRef curmap)
8912 {
8913 for (map<int,
8914 vector<pair<pg_notify_t, PastIntervals> > >::iterator p =
8915 info_map.begin();
8916 p != info_map.end();
8917 ++p) {
8918 if (!curmap->is_up(p->first)) {
8919 dout(20) << __func__ << " skipping down osd." << p->first << dendl;
8920 continue;
8921 }
8922 for (vector<pair<pg_notify_t,PastIntervals> >::iterator i = p->second.begin();
8923 i != p->second.end();
8924 ++i) {
8925 dout(20) << __func__ << " sending info " << i->first.info
8926 << " to shard " << p->first << dendl;
8927 }
8928 ConnectionRef con = service.get_con_osd_cluster(
8929 p->first, curmap->get_epoch());
8930 if (!con) {
8931 dout(20) << __func__ << " skipping osd." << p->first
8932 << " (NULL con)" << dendl;
8933 continue;
8934 }
8935 service.share_map_peer(p->first, con.get(), curmap);
8936 MOSDPGInfo *m = new MOSDPGInfo(curmap->get_epoch());
8937 m->pg_list = p->second;
8938 con->send_message(m);
8939 }
8940 info_map.clear();
8941 }
8942
8943
8944 /** PGNotify
8945 * from non-primary to primary
8946 * includes pg_info_t.
8947 * NOTE: called with opqueue active.
8948 */
8949 void OSD::handle_pg_notify(OpRequestRef op)
8950 {
8951 const MOSDPGNotify *m = static_cast<const MOSDPGNotify*>(op->get_req());
8952 assert(m->get_type() == MSG_OSD_PG_NOTIFY);
8953
8954 dout(7) << "handle_pg_notify from " << m->get_source() << dendl;
8955 int from = m->get_source().num();
8956
8957 if (!require_osd_peer(op->get_req()))
8958 return;
8959
8960 if (!require_same_or_newer_map(op, m->get_epoch(), false))
8961 return;
8962
8963 op->mark_started();
8964
8965 for (auto it = m->get_pg_list().begin();
8966 it != m->get_pg_list().end();
8967 ++it) {
8968 if (it->first.info.pgid.preferred() >= 0) {
8969 dout(20) << "ignoring localized pg " << it->first.info.pgid << dendl;
8970 continue;
8971 }
8972
8973 handle_pg_peering_evt(
8974 spg_t(it->first.info.pgid.pgid, it->first.to),
8975 it->first.info.history, it->second,
8976 it->first.query_epoch,
8977 PG::CephPeeringEvtRef(
8978 new PG::CephPeeringEvt(
8979 it->first.epoch_sent, it->first.query_epoch,
8980 PG::MNotifyRec(pg_shard_t(from, it->first.from), it->first,
8981 op->get_req()->get_connection()->get_features())))
8982 );
8983 }
8984 }
8985
8986 void OSD::handle_pg_log(OpRequestRef op)
8987 {
8988 MOSDPGLog *m = static_cast<MOSDPGLog*>(op->get_nonconst_req());
8989 assert(m->get_type() == MSG_OSD_PG_LOG);
8990 dout(7) << "handle_pg_log " << *m << " from " << m->get_source() << dendl;
8991
8992 if (!require_osd_peer(op->get_req()))
8993 return;
8994
8995 int from = m->get_source().num();
8996 if (!require_same_or_newer_map(op, m->get_epoch(), false))
8997 return;
8998
8999 if (m->info.pgid.preferred() >= 0) {
9000 dout(10) << "ignoring localized pg " << m->info.pgid << dendl;
9001 return;
9002 }
9003
9004 op->mark_started();
9005 handle_pg_peering_evt(
9006 spg_t(m->info.pgid.pgid, m->to),
9007 m->info.history, m->past_intervals, m->get_epoch(),
9008 PG::CephPeeringEvtRef(
9009 new PG::CephPeeringEvt(
9010 m->get_epoch(), m->get_query_epoch(),
9011 PG::MLogRec(pg_shard_t(from, m->from), m)))
9012 );
9013 }
9014
9015 void OSD::handle_pg_info(OpRequestRef op)
9016 {
9017 const MOSDPGInfo *m = static_cast<const MOSDPGInfo *>(op->get_req());
9018 assert(m->get_type() == MSG_OSD_PG_INFO);
9019 dout(7) << "handle_pg_info " << *m << " from " << m->get_source() << dendl;
9020
9021 if (!require_osd_peer(op->get_req()))
9022 return;
9023
9024 int from = m->get_source().num();
9025 if (!require_same_or_newer_map(op, m->get_epoch(), false))
9026 return;
9027
9028 op->mark_started();
9029
9030 for (auto p = m->pg_list.begin();
9031 p != m->pg_list.end();
9032 ++p) {
9033 if (p->first.info.pgid.preferred() >= 0) {
9034 dout(10) << "ignoring localized pg " << p->first.info.pgid << dendl;
9035 continue;
9036 }
9037
9038 handle_pg_peering_evt(
9039 spg_t(p->first.info.pgid.pgid, p->first.to),
9040 p->first.info.history, p->second, p->first.epoch_sent,
9041 PG::CephPeeringEvtRef(
9042 new PG::CephPeeringEvt(
9043 p->first.epoch_sent, p->first.query_epoch,
9044 PG::MInfoRec(
9045 pg_shard_t(
9046 from, p->first.from), p->first.info, p->first.epoch_sent)))
9047 );
9048 }
9049 }
9050
9051 void OSD::handle_pg_trim(OpRequestRef op)
9052 {
9053 const MOSDPGTrim *m = static_cast<const MOSDPGTrim*>(op->get_req());
9054 assert(m->get_type() == MSG_OSD_PG_TRIM);
9055
9056 dout(7) << "handle_pg_trim " << *m << " from " << m->get_source() << dendl;
9057
9058 if (!require_osd_peer(op->get_req()))
9059 return;
9060
9061 int from = m->get_source().num();
9062 if (!require_same_or_newer_map(op, m->epoch, false))
9063 return;
9064
9065 if (m->pgid.preferred() >= 0) {
9066 dout(10) << "ignoring localized pg " << m->pgid << dendl;
9067 return;
9068 }
9069
9070 op->mark_started();
9071
9072 PG *pg = _lookup_lock_pg(m->pgid);
9073 if (!pg) {
9074 dout(10) << " don't have pg " << m->pgid << dendl;
9075 return;
9076 }
9077
9078 if (m->epoch < pg->info.history.same_interval_since) {
9079 dout(10) << *pg << " got old trim to " << m->trim_to << ", ignoring" << dendl;
9080 pg->unlock();
9081 return;
9082 }
9083
9084 if (pg->is_primary()) {
9085 // peer is informing us of their last_complete_ondisk
9086 dout(10) << *pg << " replica osd." << from << " lcod " << m->trim_to << dendl;
9087 pg->peer_last_complete_ondisk[pg_shard_t(from, m->pgid.shard)] =
9088 m->trim_to;
9089 // trim log when the pg is recovered
9090 pg->calc_min_last_complete_ondisk();
9091 } else {
9092 // primary is instructing us to trim
9093 ObjectStore::Transaction t;
9094 pg->pg_log.trim(m->trim_to, pg->info);
9095 pg->dirty_info = true;
9096 pg->write_if_dirty(t);
9097 int tr = store->queue_transaction(pg->osr.get(), std::move(t), NULL);
9098 assert(tr == 0);
9099 }
9100 pg->unlock();
9101 }
9102
9103 void OSD::handle_pg_backfill_reserve(OpRequestRef op)
9104 {
9105 const MBackfillReserve *m = static_cast<const MBackfillReserve*>(op->get_req());
9106 assert(m->get_type() == MSG_OSD_BACKFILL_RESERVE);
9107
9108 if (!require_osd_peer(op->get_req()))
9109 return;
9110 if (!require_same_or_newer_map(op, m->query_epoch, false))
9111 return;
9112
9113 PG::CephPeeringEvtRef evt;
9114 if (m->type == MBackfillReserve::REQUEST) {
9115 evt = PG::CephPeeringEvtRef(
9116 new PG::CephPeeringEvt(
9117 m->query_epoch,
9118 m->query_epoch,
9119 PG::RequestBackfillPrio(m->priority)));
9120 } else if (m->type == MBackfillReserve::GRANT) {
9121 evt = PG::CephPeeringEvtRef(
9122 new PG::CephPeeringEvt(
9123 m->query_epoch,
9124 m->query_epoch,
9125 PG::RemoteBackfillReserved()));
9126 } else if (m->type == MBackfillReserve::REJECT) {
9127 // NOTE: this is replica -> primary "i reject your request"
9128 // and also primary -> replica "cancel my previously-granted request"
9129 evt = PG::CephPeeringEvtRef(
9130 new PG::CephPeeringEvt(
9131 m->query_epoch,
9132 m->query_epoch,
9133 PG::RemoteReservationRejected()));
9134 } else {
9135 ceph_abort();
9136 }
9137
9138 if (service.splitting(m->pgid)) {
9139 peering_wait_for_split[m->pgid].push_back(evt);
9140 return;
9141 }
9142
9143 PG *pg = _lookup_lock_pg(m->pgid);
9144 if (!pg) {
9145 dout(10) << " don't have pg " << m->pgid << dendl;
9146 return;
9147 }
9148
9149 pg->queue_peering_event(evt);
9150 pg->unlock();
9151 }
9152
9153 void OSD::handle_pg_recovery_reserve(OpRequestRef op)
9154 {
9155 const MRecoveryReserve *m = static_cast<const MRecoveryReserve*>(op->get_req());
9156 assert(m->get_type() == MSG_OSD_RECOVERY_RESERVE);
9157
9158 if (!require_osd_peer(op->get_req()))
9159 return;
9160 if (!require_same_or_newer_map(op, m->query_epoch, false))
9161 return;
9162
9163 PG::CephPeeringEvtRef evt;
9164 if (m->type == MRecoveryReserve::REQUEST) {
9165 evt = PG::CephPeeringEvtRef(
9166 new PG::CephPeeringEvt(
9167 m->query_epoch,
9168 m->query_epoch,
9169 PG::RequestRecovery()));
9170 } else if (m->type == MRecoveryReserve::GRANT) {
9171 evt = PG::CephPeeringEvtRef(
9172 new PG::CephPeeringEvt(
9173 m->query_epoch,
9174 m->query_epoch,
9175 PG::RemoteRecoveryReserved()));
9176 } else if (m->type == MRecoveryReserve::RELEASE) {
9177 evt = PG::CephPeeringEvtRef(
9178 new PG::CephPeeringEvt(
9179 m->query_epoch,
9180 m->query_epoch,
9181 PG::RecoveryDone()));
9182 } else {
9183 ceph_abort();
9184 }
9185
9186 if (service.splitting(m->pgid)) {
9187 peering_wait_for_split[m->pgid].push_back(evt);
9188 return;
9189 }
9190
9191 PG *pg = _lookup_lock_pg(m->pgid);
9192 if (!pg) {
9193 dout(10) << " don't have pg " << m->pgid << dendl;
9194 return;
9195 }
9196
9197 pg->queue_peering_event(evt);
9198 pg->unlock();
9199 }
9200
9201 void OSD::handle_force_recovery(Message *m)
9202 {
9203 MOSDForceRecovery *msg = static_cast<MOSDForceRecovery*>(m);
9204 assert(msg->get_type() == MSG_OSD_FORCE_RECOVERY);
9205
9206 vector<PGRef> local_pgs;
9207 local_pgs.reserve(msg->forced_pgs.size());
9208
9209 {
9210 RWLock::RLocker l(pg_map_lock);
9211 for (auto& i : msg->forced_pgs) {
9212 spg_t locpg;
9213 if (osdmap->get_primary_shard(i, &locpg)) {
9214 auto pg_map_entry = pg_map.find(locpg);
9215 if (pg_map_entry != pg_map.end()) {
9216 local_pgs.push_back(pg_map_entry->second);
9217 }
9218 }
9219 }
9220 }
9221
9222 if (local_pgs.size()) {
9223 service.adjust_pg_priorities(local_pgs, msg->options);
9224 }
9225
9226 msg->put();
9227 }
9228
9229 /** PGQuery
9230 * from primary to replica | stray
9231 * NOTE: called with opqueue active.
9232 */
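/*
 * Sketch of the dispatch below (reading aid only): a query for a pg that
 * is splitting is parked in peering_wait_for_split; a query for a pg we
 * host is queued on that pg; a query for a pg we do not host is answered
 * directly -- with an empty MOSDPGLog for LOG/FULLLOG queries, or with an
 * empty pg_notify_t batched into notify_list otherwise -- provided the
 * projected history shows the mapping has not changed since the query
 * was sent.
 */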
9233 void OSD::handle_pg_query(OpRequestRef op)
9234 {
9235 assert(osd_lock.is_locked());
9236
9237 const MOSDPGQuery *m = static_cast<const MOSDPGQuery*>(op->get_req());
9238 assert(m->get_type() == MSG_OSD_PG_QUERY);
9239
9240 if (!require_osd_peer(op->get_req()))
9241 return;
9242
9243 dout(7) << "handle_pg_query from " << m->get_source() << " epoch " << m->get_epoch() << dendl;
9244 int from = m->get_source().num();
9245
9246 if (!require_same_or_newer_map(op, m->get_epoch(), false))
9247 return;
9248
9249 op->mark_started();
9250
9251 map< int, vector<pair<pg_notify_t, PastIntervals> > > notify_list;
9252
9253 for (auto it = m->pg_list.begin();
9254 it != m->pg_list.end();
9255 ++it) {
9256 spg_t pgid = it->first;
9257
9258 if (pgid.preferred() >= 0) {
9259 dout(10) << "ignoring localized pg " << pgid << dendl;
9260 continue;
9261 }
9262
9263 if (service.splitting(pgid)) {
9264 peering_wait_for_split[pgid].push_back(
9265 PG::CephPeeringEvtRef(
9266 new PG::CephPeeringEvt(
9267 it->second.epoch_sent, it->second.epoch_sent,
9268 PG::MQuery(pg_shard_t(from, it->second.from),
9269 it->second, it->second.epoch_sent))));
9270 continue;
9271 }
9272
9273 {
9274 RWLock::RLocker l(pg_map_lock);
9275 if (pg_map.count(pgid)) {
9276 PG *pg = 0;
9277 pg = _lookup_lock_pg_with_map_lock_held(pgid);
9278 pg->queue_query(
9279 it->second.epoch_sent, it->second.epoch_sent,
9280 pg_shard_t(from, it->second.from), it->second);
9281 pg->unlock();
9282 continue;
9283 }
9284 }
9285
9286 if (!osdmap->have_pg_pool(pgid.pool()))
9287 continue;
9288
9289 // get active crush mapping
9290 int up_primary, acting_primary;
9291 vector<int> up, acting;
9292 osdmap->pg_to_up_acting_osds(
9293 pgid.pgid, &up, &up_primary, &acting, &acting_primary);
9294
9295 // same primary?
9296 pg_history_t history = it->second.history;
9297 bool valid_history = project_pg_history(
9298 pgid, history, it->second.epoch_sent,
9299 up, up_primary, acting, acting_primary);
9300
9301 if (!valid_history ||
9302 it->second.epoch_sent < history.same_interval_since) {
9303 dout(10) << " pg " << pgid << " dne, and pg has changed in "
9304 << history.same_interval_since
9305 << " (msg from " << it->second.epoch_sent << ")" << dendl;
9306 continue;
9307 }
9308
9309 dout(10) << " pg " << pgid << " dne" << dendl;
9310 pg_info_t empty(spg_t(pgid.pgid, it->second.to));
9311 /* This is racy, but that should be ok: if we complete the deletion
9312 * before the pg is recreated, we'll just start it off backfilling
9313 * instead of starting it off empty */
9314 if (service.deleting_pgs.lookup(pgid))
9315 empty.set_last_backfill(hobject_t());
9316 if (it->second.type == pg_query_t::LOG ||
9317 it->second.type == pg_query_t::FULLLOG) {
9318 ConnectionRef con = service.get_con_osd_cluster(from, osdmap->get_epoch());
9319 if (con) {
9320 MOSDPGLog *mlog = new MOSDPGLog(
9321 it->second.from, it->second.to,
9322 osdmap->get_epoch(), empty,
9323 it->second.epoch_sent);
9324 service.share_map_peer(from, con.get(), osdmap);
9325 con->send_message(mlog);
9326 }
9327 } else {
9328 notify_list[from].push_back(
9329 make_pair(
9330 pg_notify_t(
9331 it->second.from, it->second.to,
9332 it->second.epoch_sent,
9333 osdmap->get_epoch(),
9334 empty),
9335 PastIntervals(
9336 osdmap->get_pools().at(pgid.pool()).ec_pool(),
9337 *osdmap)));
9338 }
9339 }
9340 do_notifies(notify_list, osdmap);
9341 }
9342
9343
9344 void OSD::handle_pg_remove(OpRequestRef op)
9345 {
9346 const MOSDPGRemove *m = static_cast<const MOSDPGRemove *>(op->get_req());
9347 assert(m->get_type() == MSG_OSD_PG_REMOVE);
9348 assert(osd_lock.is_locked());
9349
9350 if (!require_osd_peer(op->get_req()))
9351 return;
9352
9353 dout(7) << "handle_pg_remove from " << m->get_source() << " on "
9354 << m->pg_list.size() << " pgs" << dendl;
9355
9356 if (!require_same_or_newer_map(op, m->get_epoch(), false))
9357 return;
9358
9359 op->mark_started();
9360
9361 for (auto it = m->pg_list.begin();
9362 it != m->pg_list.end();
9363 ++it) {
9364 spg_t pgid = *it;
9365 if (pgid.preferred() >= 0) {
9366 dout(10) << "ignoring localized pg " << pgid << dendl;
9367 continue;
9368 }
9369
9370 RWLock::WLocker l(pg_map_lock);
9371 if (pg_map.count(pgid) == 0) {
9372 dout(10) << " don't have pg " << pgid << dendl;
9373 continue;
9374 }
9375 dout(5) << "queue_pg_for_deletion: " << pgid << dendl;
9376 PG *pg = _lookup_lock_pg_with_map_lock_held(pgid);
9377 pg_history_t history = pg->info.history;
9378 int up_primary, acting_primary;
9379 vector<int> up, acting;
9380 osdmap->pg_to_up_acting_osds(
9381 pgid.pgid, &up, &up_primary, &acting, &acting_primary);
9382 bool valid_history = project_pg_history(
9383 pg->info.pgid, history, pg->get_osdmap()->get_epoch(),
9384 up, up_primary, acting, acting_primary);
9385 if (valid_history &&
9386 history.same_interval_since <= m->get_epoch()) {
9387 assert(pg->get_primary().osd == m->get_source().num());
9388 PGRef _pg(pg);
9389 _remove_pg(pg);
9390 pg->unlock();
9391 } else {
9392 dout(10) << *pg << " ignoring remove request, pg changed in epoch "
9393 << history.same_interval_since
9394 << " > " << m->get_epoch() << dendl;
9395 pg->unlock();
9396 }
9397 }
9398 }
9399
9400 void OSD::_remove_pg(PG *pg)
9401 {
9402 ObjectStore::Transaction rmt;
9403
9404 // on_removal, which calls remove_watchers_and_notifies, and the erasure from
9405 // the pg_map must be done together without unlocking the pg lock,
9406 // to avoid racing with watcher cleanup in ms_handle_reset
9407 // and handle_notify_timeout
9408 pg->on_removal(&rmt);
9409
9410 service.cancel_pending_splits_for_parent(pg->info.pgid);
9411 int tr = store->queue_transaction(
9412 pg->osr.get(), std::move(rmt), NULL,
9413 new ContainerContext<
9414 SequencerRef>(pg->osr));
9415 assert(tr == 0);
9416
9417 DeletingStateRef deleting = service.deleting_pgs.lookup_or_create(
9418 pg->info.pgid,
9419 make_pair(
9420 pg->info.pgid,
9421 PGRef(pg))
9422 );
9423 remove_wq.queue(make_pair(PGRef(pg), deleting));
9424
9425 service.pg_remove_epoch(pg->info.pgid);
9426
9427 // dereference from op_wq
9428 op_shardedwq.clear_pg_pointer(pg->info.pgid);
9429
9430 // remove from map
9431 pg_map.erase(pg->info.pgid);
9432 pg->put("PGMap"); // since we've taken it out of map
9433 }
9434
9435 // =========================================================
9436 // RECOVERY
9437
9438 void OSDService::_maybe_queue_recovery() {
9439 assert(recovery_lock.is_locked_by_me());
9440 uint64_t available_pushes;
9441 while (!awaiting_throttle.empty() &&
9442 _recover_now(&available_pushes)) {
9443 uint64_t to_start = MIN(
9444 available_pushes,
9445 cct->_conf->osd_recovery_max_single_start);
9446 _queue_for_recovery(awaiting_throttle.front(), to_start);
9447 awaiting_throttle.pop_front();
9448 recovery_ops_reserved += to_start;
9449 }
9450 }
9451
9452 bool OSDService::_recover_now(uint64_t *available_pushes)
9453 {
9454 if (available_pushes)
9455 *available_pushes = 0;
9456
9457 if (ceph_clock_now() < defer_recovery_until) {
9458 dout(15) << __func__ << " defer until " << defer_recovery_until << dendl;
9459 return false;
9460 }
9461
9462 if (recovery_paused) {
9463 dout(15) << __func__ << " paused" << dendl;
9464 return false;
9465 }
9466
9467 uint64_t max = cct->_conf->osd_recovery_max_active;
9468 if (max <= recovery_ops_active + recovery_ops_reserved) {
9469 dout(15) << __func__ << " active " << recovery_ops_active
9470 << " + reserved " << recovery_ops_reserved
9471 << " >= max " << max << dendl;
9472 return false;
9473 }
9474
9475 if (available_pushes)
9476 *available_pushes = max - recovery_ops_active - recovery_ops_reserved;
9477
9478 return true;
9479 }
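// Worked example (hypothetical values): with osd_recovery_max_active = 3,
// recovery_ops_active = 1 and recovery_ops_reserved = 1, _recover_now()
// returns true with *available_pushes = 3 - 1 - 1 = 1, so
// _maybe_queue_recovery() can start at most
// min(1, osd_recovery_max_single_start) pushes for the pg at the head of
// awaiting_throttle.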
9480
9481
9482 void OSDService::adjust_pg_priorities(const vector<PGRef>& pgs, int newflags)
9483 {
9484 if (!pgs.size() || !(newflags & (OFR_BACKFILL | OFR_RECOVERY)))
9485 return;
9486 int newstate = 0;
9487
9488 if (newflags & OFR_BACKFILL) {
9489 newstate = PG_STATE_FORCED_BACKFILL;
9490 } else if (newflags & OFR_RECOVERY) {
9491 newstate = PG_STATE_FORCED_RECOVERY;
9492 }
9493
9494 // the debug output here may get large; don't generate it if the debug
9495 // level is below 10, and use abbreviated pg ids when we do generate it
9496 if ((cct)->_conf->subsys.should_gather(ceph_subsys_osd, 10)) {
9497 stringstream ss;
9498
9499 for (auto& i : pgs) {
9500 ss << i->get_pgid() << " ";
9501 }
9502
9503 dout(10) << __func__ << " working on " << ss.str() << dendl;
9504 }
9505
9506 if (newflags & OFR_CANCEL) {
9507 for (auto& i : pgs) {
9508 i->lock();
9509 i->_change_recovery_force_mode(newstate, true);
9510 i->unlock();
9511 }
9512 } else {
9513 for (auto& i : pgs) {
9514 // make sure the PG is in the correct state before forcing backfill or
9515 // recovery; otherwise the PG would keep the FORCE_* flag forever,
9516 // requiring an OSD restart or another forced recovery/backfill to clear it.
9517 i->lock();
9518 int pgstate = i->get_state();
9519 if ( ((newstate == PG_STATE_FORCED_RECOVERY) && (pgstate & (PG_STATE_DEGRADED | PG_STATE_RECOVERY_WAIT | PG_STATE_RECOVERING))) ||
9520 ((newstate == PG_STATE_FORCED_BACKFILL) && (pgstate & (PG_STATE_DEGRADED | PG_STATE_BACKFILL_WAIT | PG_STATE_BACKFILLING))) )
9521 i->_change_recovery_force_mode(newstate, false);
9522 i->unlock();
9523 }
9524 }
9525 }
9526
9527 void OSD::do_recovery(
9528 PG *pg, epoch_t queued, uint64_t reserved_pushes,
9529 ThreadPool::TPHandle &handle)
9530 {
9531 uint64_t started = 0;
9532
9533 /*
9534 * When osd_recovery_sleep is set to a value greater than zero, recovery
9535 * ops are scheduled osd_recovery_sleep seconds after the previous
9536 * recovery event's schedule time. This is done by adding a
9537 * recovery_requeue_callback event, which re-queues the recovery op using
9538 * queue_recovery_after_sleep.
9539 */
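/*
 * Illustrative timeline (hypothetical values): with
 * osd_recovery_sleep = 0.1s, if recovery_schedule_time is 5.0 and we run
 * at t = 4.98, the next wakeup is scheduled at 5.0 + 0.1 = 5.1; if we
 * instead run at t = 5.3 (past the previous schedule time),
 * recovery_schedule_time is first bumped to 5.3 and the wakeup lands at
 * 5.3 + 0.1 = 5.4.
 */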
9540 float recovery_sleep = get_osd_recovery_sleep();
9541 {
9542 Mutex::Locker l(service.recovery_sleep_lock);
9543 if (recovery_sleep > 0 && service.recovery_needs_sleep) {
9544 PGRef pgref(pg);
9545 auto recovery_requeue_callback = new FunctionContext([this, pgref, queued, reserved_pushes](int r) {
9546 dout(20) << "do_recovery wake up at "
9547 << ceph_clock_now()
9548 << ", re-queuing recovery" << dendl;
9549 Mutex::Locker l(service.recovery_sleep_lock);
9550 service.recovery_needs_sleep = false;
9551 service.queue_recovery_after_sleep(pgref.get(), queued, reserved_pushes);
9552 });
9553
9554 // This is true for the first recovery op and when the previous recovery op
9555 // was scheduled in the past. In that case the next recovery op is
9556 // scheduled to run after sleeping osd_recovery_sleep seconds from now.
9557 if (service.recovery_schedule_time < ceph_clock_now()) {
9558 service.recovery_schedule_time = ceph_clock_now();
9559 }
9560 service.recovery_schedule_time += recovery_sleep;
9561 service.recovery_sleep_timer.add_event_at(service.recovery_schedule_time,
9562 recovery_requeue_callback);
9563 dout(20) << "Recovery event scheduled at "
9564 << service.recovery_schedule_time << dendl;
9565 return;
9566 }
9567 }
9568
9569 {
9570 {
9571 Mutex::Locker l(service.recovery_sleep_lock);
9572 service.recovery_needs_sleep = true;
9573 }
9574
9575 if (pg->pg_has_reset_since(queued)) {
9576 goto out;
9577 }
9578
9579 assert(!pg->deleting);
9580 assert(pg->is_peered() && pg->is_primary());
9581
9582 assert(pg->recovery_queued);
9583 pg->recovery_queued = false;
9584
9585 dout(10) << "do_recovery starting " << reserved_pushes << " " << *pg << dendl;
9586 #ifdef DEBUG_RECOVERY_OIDS
9587 dout(20) << " active was " << service.recovery_oids[pg->info.pgid] << dendl;
9588 #endif
9589
9590 bool more = pg->start_recovery_ops(reserved_pushes, handle, &started);
9591 dout(10) << "do_recovery started " << started << "/" << reserved_pushes
9592 << " on " << *pg << dendl;
9593
9594 // If no recovery op is started, don't bother to manipulate the RecoveryCtx
9595 if (!started && (more || !pg->have_unfound())) {
9596 goto out;
9597 }
9598
9599 PG::RecoveryCtx rctx = create_context();
9600 rctx.handle = &handle;
9601
9602 /*
9603 * if we couldn't start any recovery ops and things are still
9604 * unfound, see if we can discover more missing object locations.
9605 * It may be that our initial locations were bad and we errored
9606 * out while trying to pull.
9607 */
9608 if (!more && pg->have_unfound()) {
9609 pg->discover_all_missing(*rctx.query_map);
9610 if (rctx.query_map->empty()) {
9611 string action;
9612 if (pg->state_test(PG_STATE_BACKFILLING)) {
9613 auto evt = PG::CephPeeringEvtRef(new PG::CephPeeringEvt(
9614 queued,
9615 queued,
9616 PG::DeferBackfill(cct->_conf->osd_recovery_retry_interval)));
9617 pg->queue_peering_event(evt);
9618 action = "in backfill";
9619 } else if (pg->state_test(PG_STATE_RECOVERING)) {
9620 auto evt = PG::CephPeeringEvtRef(new PG::CephPeeringEvt(
9621 queued,
9622 queued,
9623 PG::DeferRecovery(cct->_conf->osd_recovery_retry_interval)));
9624 pg->queue_peering_event(evt);
9625 action = "in recovery";
9626 } else {
9627 action = "already out of recovery/backfill";
9628 }
9629 dout(10) << __func__ << ": no luck, giving up on this pg for now (" << action << ")" << dendl;
9630 } else {
9631 dout(10) << __func__ << ": no luck, giving up on this pg for now (queue_recovery)" << dendl;
9632 pg->queue_recovery();
9633 }
9634 }
9635
9636 pg->write_if_dirty(*rctx.transaction);
9637 OSDMapRef curmap = pg->get_osdmap();
9638 dispatch_context(rctx, pg, curmap);
9639 }
9640
9641 out:
9642 assert(started <= reserved_pushes);
9643 service.release_reserved_pushes(reserved_pushes);
9644 }
9645
9646 void OSDService::start_recovery_op(PG *pg, const hobject_t& soid)
9647 {
9648 Mutex::Locker l(recovery_lock);
9649 dout(10) << "start_recovery_op " << *pg << " " << soid
9650 << " (" << recovery_ops_active << "/"
9651 << cct->_conf->osd_recovery_max_active << " rops)"
9652 << dendl;
9653 recovery_ops_active++;
9654
9655 #ifdef DEBUG_RECOVERY_OIDS
9656 dout(20) << " active was " << recovery_oids[pg->info.pgid] << dendl;
9657 assert(recovery_oids[pg->info.pgid].count(soid) == 0);
9658 recovery_oids[pg->info.pgid].insert(soid);
9659 #endif
9660 }
9661
9662 void OSDService::finish_recovery_op(PG *pg, const hobject_t& soid, bool dequeue)
9663 {
9664 Mutex::Locker l(recovery_lock);
9665 dout(10) << "finish_recovery_op " << *pg << " " << soid
9666 << " dequeue=" << dequeue
9667 << " (" << recovery_ops_active << "/" << cct->_conf->osd_recovery_max_active << " rops)"
9668 << dendl;
9669
9670 // adjust count
9671 assert(recovery_ops_active > 0);
9672 recovery_ops_active--;
9673
9674 #ifdef DEBUG_RECOVERY_OIDS
9675 dout(20) << " active oids was " << recovery_oids[pg->info.pgid] << dendl;
9676 assert(recovery_oids[pg->info.pgid].count(soid));
9677 recovery_oids[pg->info.pgid].erase(soid);
9678 #endif
9679
9680 _maybe_queue_recovery();
9681 }
9682
9683 bool OSDService::is_recovery_active()
9684 {
9685 return local_reserver.has_reservation() || remote_reserver.has_reservation();
9686 }
9687
9688 // =========================================================
9689 // OPS
9690
9691 bool OSD::op_is_discardable(const MOSDOp *op)
9692 {
9693 // drop the client request if the client is no longer connected and
9694 // can't receive the reply anyway.
9695 if (!op->get_connection()->is_connected()) {
9696 return true;
9697 }
9698 return false;
9699 }
9700
9701 void OSD::enqueue_op(spg_t pg, OpRequestRef& op, epoch_t epoch)
9702 {
9703 utime_t latency = ceph_clock_now() - op->get_req()->get_recv_stamp();
9704 dout(15) << "enqueue_op " << op << " prio " << op->get_req()->get_priority()
9705 << " cost " << op->get_req()->get_cost()
9706 << " latency " << latency
9707 << " epoch " << epoch
9708 << " " << *(op->get_req()) << dendl;
9709 op->osd_trace.event("enqueue op");
9710 op->osd_trace.keyval("priority", op->get_req()->get_priority());
9711 op->osd_trace.keyval("cost", op->get_req()->get_cost());
9712 op->mark_queued_for_pg();
9713 logger->tinc(l_osd_op_before_queue_op_lat, latency);
9714 op_shardedwq.queue(make_pair(pg, PGQueueable(op, epoch)));
9715 }
9716
9717
9718
9719 /*
9720 * NOTE: dequeue called in worker thread, with pg lock
9721 */
9722 void OSD::dequeue_op(
9723 PGRef pg, OpRequestRef op,
9724 ThreadPool::TPHandle &handle)
9725 {
9726 FUNCTRACE();
9727 OID_EVENT_TRACE_WITH_MSG(op->get_req(), "DEQUEUE_OP_BEGIN", false);
9728
9729 utime_t now = ceph_clock_now();
9730 op->set_dequeued_time(now);
9731 utime_t latency = now - op->get_req()->get_recv_stamp();
9732 dout(10) << "dequeue_op " << op << " prio " << op->get_req()->get_priority()
9733 << " cost " << op->get_req()->get_cost()
9734 << " latency " << latency
9735 << " " << *(op->get_req())
9736 << " pg " << *pg << dendl;
9737
9738 logger->tinc(l_osd_op_before_dequeue_op_lat, latency);
9739
9740 Session *session = static_cast<Session *>(
9741 op->get_req()->get_connection()->get_priv());
9742 if (session) {
9743 maybe_share_map(session, op, pg->get_osdmap());
9744 session->put();
9745 }
9746
9747 if (pg->deleting)
9748 return;
9749
9750 op->mark_reached_pg();
9751 op->osd_trace.event("dequeue_op");
9752
9753 pg->do_request(op, handle);
9754
9755 // finish
9756 dout(10) << "dequeue_op " << op << " finish" << dendl;
9757 OID_EVENT_TRACE_WITH_MSG(op->get_req(), "DEQUEUE_OP_END", false);
9758 }
9759
9760
9761 struct C_CompleteSplits : public Context {
9762 OSD *osd;
9763 set<PGRef> pgs;
9764 C_CompleteSplits(OSD *osd, const set<PGRef> &in)
9765 : osd(osd), pgs(in) {}
9766 void finish(int r) override {
9767 Mutex::Locker l(osd->osd_lock);
9768 if (osd->is_stopping())
9769 return;
9770 PG::RecoveryCtx rctx = osd->create_context();
9771 for (set<PGRef>::iterator i = pgs.begin();
9772 i != pgs.end();
9773 ++i) {
9774 osd->pg_map_lock.get_write();
9775 (*i)->lock();
9776 PG *pg = i->get();
9777 osd->add_newly_split_pg(pg, &rctx);
9778 if (!((*i)->deleting)) {
9779 set<spg_t> to_complete;
9780 to_complete.insert((*i)->info.pgid);
9781 osd->service.complete_split(to_complete);
9782 }
9783 osd->pg_map_lock.put_write();
9784 osd->dispatch_context_transaction(rctx, pg);
9785 osd->wake_pg_waiters(*i);
9786 (*i)->unlock();
9787 }
9788
9789 osd->dispatch_context(rctx, 0, osd->service.get_osdmap());
9790 }
9791 };
9792
9793 void OSD::process_peering_events(
9794 const list<PG*> &pgs,
9795 ThreadPool::TPHandle &handle
9796 )
9797 {
9798 bool need_up_thru = false;
9799 epoch_t same_interval_since = 0;
9800 OSDMapRef curmap;
9801 PG::RecoveryCtx rctx = create_context();
9802 rctx.handle = &handle;
9803 for (list<PG*>::const_iterator i = pgs.begin();
9804 i != pgs.end();
9805 ++i) {
9806 set<PGRef> split_pgs;
9807 PG *pg = *i;
9808 pg->lock_suspend_timeout(handle);
9809 curmap = service.get_osdmap();
9810 if (pg->deleting) {
9811 pg->unlock();
9812 continue;
9813 }
9814 if (!advance_pg(curmap->get_epoch(), pg, handle, &rctx, &split_pgs)) {
9815 // we need to requeue the PG explicitly since we didn't actually
9816 // handle an event
9817 peering_wq.queue(pg);
9818 } else {
9819 assert(!pg->peering_queue.empty());
9820 PG::CephPeeringEvtRef evt = pg->peering_queue.front();
9821 pg->peering_queue.pop_front();
9822 pg->handle_peering_event(evt, &rctx);
9823 }
9824 need_up_thru = pg->need_up_thru || need_up_thru;
9825 same_interval_since = MAX(pg->info.history.same_interval_since,
9826 same_interval_since);
9827 pg->write_if_dirty(*rctx.transaction);
9828 if (!split_pgs.empty()) {
9829 rctx.on_applied->add(new C_CompleteSplits(this, split_pgs));
9830 split_pgs.clear();
9831 }
9832 dispatch_context_transaction(rctx, pg, &handle);
9833 pg->unlock();
9834 }
9835 if (need_up_thru)
9836 queue_want_up_thru(same_interval_since);
9837 dispatch_context(rctx, 0, curmap, &handle);
9838
9839 service.send_pg_temp();
9840 }
9841
9842 // --------------------------------
9843
9844 const char** OSD::get_tracked_conf_keys() const
9845 {
9846 static const char* KEYS[] = {
9847 "osd_max_backfills",
9848 "osd_min_recovery_priority",
9849 "osd_max_trimming_pgs",
9850 "osd_op_complaint_time",
9851 "osd_op_log_threshold",
9852 "osd_op_history_size",
9853 "osd_op_history_duration",
9854 "osd_op_history_slow_op_size",
9855 "osd_op_history_slow_op_threshold",
9856 "osd_enable_op_tracker",
9857 "osd_map_cache_size",
9858 "osd_map_max_advance",
9859 "osd_pg_epoch_persisted_max_stale",
9860 "osd_disk_thread_ioprio_class",
9861 "osd_disk_thread_ioprio_priority",
9862 // clog & admin clog
9863 "clog_to_monitors",
9864 "clog_to_syslog",
9865 "clog_to_syslog_facility",
9866 "clog_to_syslog_level",
9867 "osd_objectstore_fuse",
9868 "clog_to_graylog",
9869 "clog_to_graylog_host",
9870 "clog_to_graylog_port",
9871 "host",
9872 "fsid",
9873 "osd_recovery_delay_start",
9874 "osd_client_message_size_cap",
9875 "osd_client_message_cap",
9876 "osd_heartbeat_min_size",
9877 "osd_heartbeat_interval",
9878 NULL
9879 };
9880 return KEYS;
9881 }
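// Only changes to the keys listed above are delivered to
// handle_conf_change() below; tracking a new tunable means extending both
// this list and the handler.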
9882
9883 void OSD::handle_conf_change(const struct md_config_t *conf,
9884 const std::set <std::string> &changed)
9885 {
9886 if (changed.count("osd_max_backfills")) {
9887 service.local_reserver.set_max(cct->_conf->osd_max_backfills);
9888 service.remote_reserver.set_max(cct->_conf->osd_max_backfills);
9889 }
9890 if (changed.count("osd_min_recovery_priority")) {
9891 service.local_reserver.set_min_priority(cct->_conf->osd_min_recovery_priority);
9892 service.remote_reserver.set_min_priority(cct->_conf->osd_min_recovery_priority);
9893 }
9894 if (changed.count("osd_max_trimming_pgs")) {
9895 service.snap_reserver.set_max(cct->_conf->osd_max_trimming_pgs);
9896 }
9897 if (changed.count("osd_op_complaint_time") ||
9898 changed.count("osd_op_log_threshold")) {
9899 op_tracker.set_complaint_and_threshold(cct->_conf->osd_op_complaint_time,
9900 cct->_conf->osd_op_log_threshold);
9901 }
9902 if (changed.count("osd_op_history_size") ||
9903 changed.count("osd_op_history_duration")) {
9904 op_tracker.set_history_size_and_duration(cct->_conf->osd_op_history_size,
9905 cct->_conf->osd_op_history_duration);
9906 }
9907 if (changed.count("osd_op_history_slow_op_size") ||
9908 changed.count("osd_op_history_slow_op_threshold")) {
9909 op_tracker.set_history_slow_op_size_and_threshold(cct->_conf->osd_op_history_slow_op_size,
9910 cct->_conf->osd_op_history_slow_op_threshold);
9911 }
9912 if (changed.count("osd_enable_op_tracker")) {
9913 op_tracker.set_tracking(cct->_conf->osd_enable_op_tracker);
9914 }
9915 if (changed.count("osd_disk_thread_ioprio_class") ||
9916 changed.count("osd_disk_thread_ioprio_priority")) {
9917 set_disk_tp_priority();
9918 }
9919 if (changed.count("osd_map_cache_size")) {
9920 service.map_cache.set_size(cct->_conf->osd_map_cache_size);
9921 service.map_bl_cache.set_size(cct->_conf->osd_map_cache_size);
9922 service.map_bl_inc_cache.set_size(cct->_conf->osd_map_cache_size);
9923 }
9924 if (changed.count("clog_to_monitors") ||
9925 changed.count("clog_to_syslog") ||
9926 changed.count("clog_to_syslog_level") ||
9927 changed.count("clog_to_syslog_facility") ||
9928 changed.count("clog_to_graylog") ||
9929 changed.count("clog_to_graylog_host") ||
9930 changed.count("clog_to_graylog_port") ||
9931 changed.count("host") ||
9932 changed.count("fsid")) {
9933 update_log_config();
9934 }
9935
9936 #ifdef HAVE_LIBFUSE
9937 if (changed.count("osd_objectstore_fuse")) {
9938 if (store) {
9939 enable_disable_fuse(false);
9940 }
9941 }
9942 #endif
9943
9944 if (changed.count("osd_recovery_delay_start")) {
9945 service.defer_recovery(cct->_conf->osd_recovery_delay_start);
9946 service.kick_recovery_queue();
9947 }
9948
9949 if (changed.count("osd_client_message_cap")) {
9950 uint64_t newval = cct->_conf->osd_client_message_cap;
9951 Messenger::Policy pol = client_messenger->get_policy(entity_name_t::TYPE_CLIENT);
9952 if (pol.throttler_messages && newval > 0) {
9953 pol.throttler_messages->reset_max(newval);
9954 }
9955 }
9956 if (changed.count("osd_client_message_size_cap")) {
9957 uint64_t newval = cct->_conf->osd_client_message_size_cap;
9958 Messenger::Policy pol = client_messenger->get_policy(entity_name_t::TYPE_CLIENT);
9959 if (pol.throttler_bytes && newval > 0) {
9960 pol.throttler_bytes->reset_max(newval);
9961 }
9962 }
9963
9964 check_config();
9965 }
9966
9967 void OSD::update_log_config()
9968 {
9969 map<string,string> log_to_monitors;
9970 map<string,string> log_to_syslog;
9971 map<string,string> log_channel;
9972 map<string,string> log_prio;
9973 map<string,string> log_to_graylog;
9974 map<string,string> log_to_graylog_host;
9975 map<string,string> log_to_graylog_port;
9976 uuid_d fsid;
9977 string host;
9978
9979 if (parse_log_client_options(cct, log_to_monitors, log_to_syslog,
9980 log_channel, log_prio, log_to_graylog,
9981 log_to_graylog_host, log_to_graylog_port,
9982 fsid, host) == 0)
9983 clog->update_config(log_to_monitors, log_to_syslog,
9984 log_channel, log_prio, log_to_graylog,
9985 log_to_graylog_host, log_to_graylog_port,
9986 fsid, host);
9987 derr << "log_to_monitors " << log_to_monitors << dendl;
9988 }
9989
9990 void OSD::check_config()
9991 {
9992 // some sanity checks
9993 if (cct->_conf->osd_map_cache_size <= cct->_conf->osd_map_max_advance + 2) {
9994 clog->warn() << "osd_map_cache_size (" << cct->_conf->osd_map_cache_size << ")"
9995 << " is not > osd_map_max_advance ("
9996 << cct->_conf->osd_map_max_advance << ")";
9997 }
9998 if (cct->_conf->osd_map_cache_size <= (int)cct->_conf->osd_pg_epoch_persisted_max_stale + 2) {
9999 clog->warn() << "osd_map_cache_size (" << cct->_conf->osd_map_cache_size << ")"
10000 << " is not > osd_pg_epoch_persisted_max_stale ("
10001 << cct->_conf->osd_pg_epoch_persisted_max_stale << ")";
10002 }
10003 }
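// Example (hypothetical values): osd_map_cache_size = 500 with
// osd_map_max_advance = 150 passes the first check (500 > 150 + 2);
// raising osd_map_max_advance to 498 or more would trigger the warning.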
10004
10005 void OSD::set_disk_tp_priority()
10006 {
10007 dout(10) << __func__
10008 << " class " << cct->_conf->osd_disk_thread_ioprio_class
10009 << " priority " << cct->_conf->osd_disk_thread_ioprio_priority
10010 << dendl;
10011 if (cct->_conf->osd_disk_thread_ioprio_class.empty() ||
10012 cct->_conf->osd_disk_thread_ioprio_priority < 0)
10013 return;
10014 int cls =
10015 ceph_ioprio_string_to_class(cct->_conf->osd_disk_thread_ioprio_class);
10016 if (cls < 0)
10017 derr << __func__ << " " << cpp_strerror(cls) << ": "
10018 << "osd_disk_thread_ioprio_class is " << cct->_conf->osd_disk_thread_ioprio_class
10019 << " but only the following values are allowed: idle, be or rt" << dendl;
10020 else
10021 disk_tp.set_ioprio(cls, cct->_conf->osd_disk_thread_ioprio_priority);
10022 }
10023
10024 // --------------------------------
10025
10026 void OSD::get_latest_osdmap()
10027 {
10028 dout(10) << __func__ << " -- start" << dendl;
10029
10030 C_SaferCond cond;
10031 service.objecter->wait_for_latest_osdmap(&cond);
10032 cond.wait();
10033
10034 dout(10) << __func__ << " -- finish" << dendl;
10035 }
10036
10037 // --------------------------------
10038
10039 int OSD::init_op_flags(OpRequestRef& op)
10040 {
10041 const MOSDOp *m = static_cast<const MOSDOp*>(op->get_req());
10042 vector<OSDOp>::const_iterator iter;
10043
10044 // client flags have no bearing on whether an op is a read, write, etc.
10045 op->rmw_flags = 0;
10046
10047 if (m->has_flag(CEPH_OSD_FLAG_RWORDERED)) {
10048 op->set_force_rwordered();
10049 }
10050
10051 // set bits based on op codes, called methods.
10052 for (iter = m->ops.begin(); iter != m->ops.end(); ++iter) {
10053 if ((iter->op.op == CEPH_OSD_OP_WATCH &&
10054 iter->op.watch.op == CEPH_OSD_WATCH_OP_PING)) {
10055 /* This is a bit odd. PING isn't actually a write. It can't
10056 * result in an update to the object_info. PINGs also aren't
10057 * resent, so there's no reason to write out a log entry.
10058 *
10059 * However, we pipeline them behind writes, so let's force
10060 * the write_ordered flag.
10061 */
10062 op->set_force_rwordered();
10063 } else {
10064 if (ceph_osd_op_mode_modify(iter->op.op))
10065 op->set_write();
10066 }
10067 if (ceph_osd_op_mode_read(iter->op.op))
10068 op->set_read();
10069
10070 // set READ flag if there are src_oids
10071 if (iter->soid.oid.name.length())
10072 op->set_read();
10073
10074 // set PGOP flag if there are PG ops
10075 if (ceph_osd_op_type_pg(iter->op.op))
10076 op->set_pg_op();
10077
10078 if (ceph_osd_op_mode_cache(iter->op.op))
10079 op->set_cache();
10080
10081 // check for ec base pool
10082 int64_t poolid = m->get_pg().pool();
10083 const pg_pool_t *pool = osdmap->get_pg_pool(poolid);
10084 if (pool && pool->is_tier()) {
10085 const pg_pool_t *base_pool = osdmap->get_pg_pool(pool->tier_of);
10086 if (base_pool && base_pool->require_rollback()) {
10087 if ((iter->op.op != CEPH_OSD_OP_READ) &&
10088 (iter->op.op != CEPH_OSD_OP_CHECKSUM) &&
10089 (iter->op.op != CEPH_OSD_OP_CMPEXT) &&
10090 (iter->op.op != CEPH_OSD_OP_STAT) &&
10091 (iter->op.op != CEPH_OSD_OP_ISDIRTY) &&
10092 (iter->op.op != CEPH_OSD_OP_UNDIRTY) &&
10093 (iter->op.op != CEPH_OSD_OP_GETXATTR) &&
10094 (iter->op.op != CEPH_OSD_OP_GETXATTRS) &&
10095 (iter->op.op != CEPH_OSD_OP_CMPXATTR) &&
10096 (iter->op.op != CEPH_OSD_OP_ASSERT_VER) &&
10097 (iter->op.op != CEPH_OSD_OP_LIST_WATCHERS) &&
10098 (iter->op.op != CEPH_OSD_OP_LIST_SNAPS) &&
10099 (iter->op.op != CEPH_OSD_OP_SETALLOCHINT) &&
10100 (iter->op.op != CEPH_OSD_OP_WRITEFULL) &&
10101 (iter->op.op != CEPH_OSD_OP_ROLLBACK) &&
10102 (iter->op.op != CEPH_OSD_OP_CREATE) &&
10103 (iter->op.op != CEPH_OSD_OP_DELETE) &&
10104 (iter->op.op != CEPH_OSD_OP_SETXATTR) &&
10105 (iter->op.op != CEPH_OSD_OP_RMXATTR) &&
10106 (iter->op.op != CEPH_OSD_OP_STARTSYNC) &&
10107 (iter->op.op != CEPH_OSD_OP_COPY_GET) &&
10108 (iter->op.op != CEPH_OSD_OP_COPY_FROM)) {
10109 op->set_promote();
10110 }
10111 }
10112 }
10113
10114 switch (iter->op.op) {
10115 case CEPH_OSD_OP_CALL:
10116 {
10117 bufferlist::iterator bp = const_cast<bufferlist&>(iter->indata).begin();
10118 int is_write, is_read;
10119 string cname, mname;
10120 bp.copy(iter->op.cls.class_len, cname);
10121 bp.copy(iter->op.cls.method_len, mname);
10122
10123 ClassHandler::ClassData *cls;
10124 int r = class_handler->open_class(cname, &cls);
10125 if (r) {
10126 derr << "class " << cname << " open got " << cpp_strerror(r) << dendl;
10127 if (r == -ENOENT)
10128 r = -EOPNOTSUPP;
10129 else if (r != -EPERM) // propagate permission errors
10130 r = -EIO;
10131 return r;
10132 }
10133 int flags = cls->get_method_flags(mname.c_str());
10134 if (flags < 0) {
10135 if (flags == -ENOENT)
10136 r = -EOPNOTSUPP;
10137 else
10138 r = flags;
10139 return r;
10140 }
10141 is_read = flags & CLS_METHOD_RD;
10142 is_write = flags & CLS_METHOD_WR;
10143 bool is_promote = flags & CLS_METHOD_PROMOTE;
10144
10145 dout(10) << "class " << cname << " method " << mname << " "
10146 << "flags=" << (is_read ? "r" : "")
10147 << (is_write ? "w" : "")
10148 << (is_promote ? "p" : "")
10149 << dendl;
10150 if (is_read)
10151 op->set_class_read();
10152 if (is_write)
10153 op->set_class_write();
10154 if (is_promote)
10155 op->set_promote();
10156 op->add_class(cname, is_read, is_write, cls->whitelisted);
10157 break;
10158 }
10159
10160 case CEPH_OSD_OP_WATCH:
10161 // force the read bit for watch since it depends on previous
10162 // watch state (and may return early if the watch exists) or, in
10163 // the case of ping, is simply a read op.
10164 op->set_read();
10165 // fall through
10166 case CEPH_OSD_OP_NOTIFY:
10167 case CEPH_OSD_OP_NOTIFY_ACK:
10168 {
10169 op->set_promote();
10170 break;
10171 }
10172
10173 case CEPH_OSD_OP_DELETE:
10174 // if we get a delete with FAILOK we can skip handle cache. without
10175 // FAILOK we still need to promote (or do something smarter) to
10176 // determine whether to return ENOENT or 0.
10177 if (iter == m->ops.begin() &&
10178 iter->op.flags == CEPH_OSD_OP_FLAG_FAILOK) {
10179 op->set_skip_handle_cache();
10180 }
10181 // skip promotion when proxying a delete op
10182 if (m->ops.size() == 1) {
10183 op->set_skip_promote();
10184 }
10185 break;
10186
10187 case CEPH_OSD_OP_CACHE_TRY_FLUSH:
10188 case CEPH_OSD_OP_CACHE_FLUSH:
10189 case CEPH_OSD_OP_CACHE_EVICT:
10190 // If try_flush/flush/evict is the only op, can skip handle cache.
10191 if (m->ops.size() == 1) {
10192 op->set_skip_handle_cache();
10193 }
10194 break;
10195
10196 case CEPH_OSD_OP_READ:
10197 case CEPH_OSD_OP_SYNC_READ:
10198 case CEPH_OSD_OP_SPARSE_READ:
10199 case CEPH_OSD_OP_CHECKSUM:
10200 case CEPH_OSD_OP_WRITEFULL:
10201 if (m->ops.size() == 1 &&
10202 (iter->op.flags & CEPH_OSD_OP_FLAG_FADVISE_NOCACHE ||
10203 iter->op.flags & CEPH_OSD_OP_FLAG_FADVISE_DONTNEED)) {
10204 op->set_skip_promote();
10205 }
10206 break;
10207
10208 // force promotion when pinning an object in the cache tier
10209 case CEPH_OSD_OP_CACHE_PIN:
10210 op->set_promote();
10211 break;
10212
10213 default:
10214 break;
10215 }
10216 }
10217
10218 if (op->rmw_flags == 0)
10219 return -EINVAL;
10220
10221 return 0;
10222 }
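// Example (illustrative): a bare CEPH_OSD_OP_READ only sets the read flag
// via ceph_osd_op_mode_read(); a CEPH_OSD_OP_CALL additionally opens the
// named class, so a method registered with CLS_METHOD_RD | CLS_METHOD_WR
// marks the op both class_read and class_write.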
10223
10224 void OSD::PeeringWQ::_dequeue(list<PG*> *out) {
10225 for (list<PG*>::iterator i = peering_queue.begin();
10226 i != peering_queue.end() &&
10227 out->size() < osd->cct->_conf->osd_peering_wq_batch_size;
10228 ) {
10229 if (in_use.count(*i)) {
10230 ++i;
10231 } else {
10232 out->push_back(*i);
10233 peering_queue.erase(i++);
10234 }
10235 }
10236 in_use.insert(out->begin(), out->end());
10237 }
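// Batches up to osd_peering_wq_batch_size PGs per dequeue, skipping PGs
// already marked in_use so that peering events for any given PG are only
// processed by one worker thread at a time.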
10238
10239
10240 // =============================================================
10241
10242 #undef dout_context
10243 #define dout_context osd->cct
10244 #undef dout_prefix
10245 #define dout_prefix *_dout << "osd." << osd->whoami << " op_wq "
10246
10247 void OSD::ShardedOpWQ::wake_pg_waiters(spg_t pgid)
10248 {
10249 uint32_t shard_index = pgid.hash_to_shard(shard_list.size());
10250 auto sdata = shard_list[shard_index];
10251 bool queued = false;
10252 unsigned pushes_to_free = 0;
10253 {
10254 Mutex::Locker l(sdata->sdata_op_ordering_lock);
10255 auto p = sdata->pg_slots.find(pgid);
10256 if (p != sdata->pg_slots.end()) {
10257 dout(20) << __func__ << " " << pgid
10258 << " to_process " << p->second.to_process
10259 << " waiting_for_pg=" << (int)p->second.waiting_for_pg << dendl;
10260 for (auto i = p->second.to_process.rbegin();
10261 i != p->second.to_process.rend();
10262 ++i) {
10263 sdata->_enqueue_front(make_pair(pgid, *i), osd->op_prio_cutoff);
10264 }
10265 for (auto& q : p->second.to_process) {
10266 pushes_to_free += q.get_reserved_pushes();
10267 }
10268 p->second.to_process.clear();
10269 p->second.waiting_for_pg = false;
10270 ++p->second.requeue_seq;
10271 queued = true;
10272 }
10273 }
10274 if (pushes_to_free > 0) {
10275 osd->service.release_reserved_pushes(pushes_to_free);
10276 }
10277 if (queued) {
10278 sdata->sdata_lock.Lock();
10279 sdata->sdata_cond.SignalOne();
10280 sdata->sdata_lock.Unlock();
10281 }
10282 }
10283
10284 void OSD::ShardedOpWQ::prune_pg_waiters(OSDMapRef osdmap, int whoami)
10285 {
10286 unsigned pushes_to_free = 0;
10287 for (auto sdata : shard_list) {
10288 Mutex::Locker l(sdata->sdata_op_ordering_lock);
10289 sdata->waiting_for_pg_osdmap = osdmap;
10290 auto p = sdata->pg_slots.begin();
10291 while (p != sdata->pg_slots.end()) {
10292 ShardData::pg_slot& slot = p->second;
10293 if (!slot.to_process.empty() && slot.num_running == 0) {
10294 if (osdmap->is_up_acting_osd_shard(p->first, whoami)) {
10295 dout(20) << __func__ << " " << p->first << " maps to us, keeping"
10296 << dendl;
10297 ++p;
10298 continue;
10299 }
10300 while (!slot.to_process.empty() &&
10301 slot.to_process.front().get_map_epoch() <= osdmap->get_epoch()) {
10302 auto& qi = slot.to_process.front();
10303 dout(20) << __func__ << " " << p->first
10304 << " item " << qi
10305 << " epoch " << qi.get_map_epoch()
10306 << " <= " << osdmap->get_epoch()
10307 << ", stale, dropping" << dendl;
10308 pushes_to_free += qi.get_reserved_pushes();
10309 slot.to_process.pop_front();
10310 }
10311 }
10312 if (slot.to_process.empty() &&
10313 slot.num_running == 0 &&
10314 !slot.pg) {
10315 dout(20) << __func__ << " " << p->first << " empty, pruning" << dendl;
10316 p = sdata->pg_slots.erase(p);
10317 } else {
10318 ++p;
10319 }
10320 }
10321 }
10322 if (pushes_to_free > 0) {
10323 osd->service.release_reserved_pushes(pushes_to_free);
10324 }
10325 }
10326
10327 void OSD::ShardedOpWQ::clear_pg_pointer(spg_t pgid)
10328 {
10329 uint32_t shard_index = pgid.hash_to_shard(shard_list.size());
10330 auto sdata = shard_list[shard_index];
10331 Mutex::Locker l(sdata->sdata_op_ordering_lock);
10332 auto p = sdata->pg_slots.find(pgid);
10333 if (p != sdata->pg_slots.end()) {
10334 auto& slot = p->second;
10335 dout(20) << __func__ << " " << pgid << " pg " << slot.pg << dendl;
10336 assert(!slot.pg || slot.pg->deleting);
10337 slot.pg = nullptr;
10338 }
10339 }
10340
10341 void OSD::ShardedOpWQ::clear_pg_slots()
10342 {
10343 for (auto sdata : shard_list) {
10344 Mutex::Locker l(sdata->sdata_op_ordering_lock);
10345 sdata->pg_slots.clear();
10346 sdata->waiting_for_pg_osdmap.reset();
10347 // don't bother with reserved pushes; we are shutting down
10348 }
10349 }
10350
10351 #undef dout_prefix
10352 #define dout_prefix *_dout << "osd." << osd->whoami << " op_wq(" << shard_index << ") "
10353
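/*
 * _process() in brief (reading aid; the code is the source of truth):
 * pick this thread's shard, wait for a queue item, append it to the pg
 * slot's to_process list, then drop the ordering lock to take the pg
 * lock, and on re-taking the ordering lock re-check requeue_seq and
 * waiting_for_pg to detect races with wake_pg_waiters() and
 * prune_pg_waiters() before running the item.
 */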
10354 void OSD::ShardedOpWQ::_process(uint32_t thread_index, heartbeat_handle_d *hb)
10355 {
10356 uint32_t shard_index = thread_index % num_shards;
10357 ShardData *sdata = shard_list[shard_index];
10358 assert(NULL != sdata);
10359
10360 // peek at spg_t
10361 sdata->sdata_op_ordering_lock.Lock();
10362 if (sdata->pqueue->empty()) {
10363 dout(20) << __func__ << " empty q, waiting" << dendl;
10364 // optimistically sleep a moment; maybe another work item will come along.
10365 osd->cct->get_heartbeat_map()->reset_timeout(hb,
10366 osd->cct->_conf->threadpool_default_timeout, 0);
10367 sdata->sdata_lock.Lock();
10368 sdata->sdata_op_ordering_lock.Unlock();
10369 sdata->sdata_cond.WaitInterval(sdata->sdata_lock,
10370 utime_t(osd->cct->_conf->threadpool_empty_queue_max_wait, 0));
10371 sdata->sdata_lock.Unlock();
10372 sdata->sdata_op_ordering_lock.Lock();
10373 if (sdata->pqueue->empty()) {
10374 sdata->sdata_op_ordering_lock.Unlock();
10375 return;
10376 }
10377 }
10378 pair<spg_t, PGQueueable> item = sdata->pqueue->dequeue();
10379 if (osd->is_stopping()) {
10380 sdata->sdata_op_ordering_lock.Unlock();
10381 return; // OSD shutdown, discard.
10382 }
10383 PGRef pg;
10384 uint64_t requeue_seq;
10385 {
10386 auto& slot = sdata->pg_slots[item.first];
10387 dout(30) << __func__ << " " << item.first
10388 << " to_process " << slot.to_process
10389 << " waiting_for_pg=" << (int)slot.waiting_for_pg << dendl;
10390 slot.to_process.push_back(item.second);
10391 // note the requeue seq now...
10392 requeue_seq = slot.requeue_seq;
10393 if (slot.waiting_for_pg) {
10394 // save ourselves a bit of effort
10395 dout(20) << __func__ << " " << item.first << " item " << item.second
10396 << " queued, waiting_for_pg" << dendl;
10397 sdata->sdata_op_ordering_lock.Unlock();
10398 return;
10399 }
10400 pg = slot.pg;
10401 dout(20) << __func__ << " " << item.first << " item " << item.second
10402 << " queued" << dendl;
10403 ++slot.num_running;
10404 }
10405 sdata->sdata_op_ordering_lock.Unlock();
10406
10407 osd->service.maybe_inject_dispatch_delay();
10408
10409 // [lookup +] lock pg (if we have it)
10410 if (!pg) {
10411 pg = osd->_lookup_lock_pg(item.first);
10412 } else {
10413 pg->lock();
10414 }
10415
10416 osd->service.maybe_inject_dispatch_delay();
10417
10418 boost::optional<PGQueueable> qi;
10419
10420 // we don't use a Mutex::Locker here because of the
10421 // osd->service.release_reserved_pushes() call below
10422 sdata->sdata_op_ordering_lock.Lock();
10423
10424 auto q = sdata->pg_slots.find(item.first);
10425 assert(q != sdata->pg_slots.end());
10426 auto& slot = q->second;
10427 --slot.num_running;
10428
10429 if (slot.to_process.empty()) {
10430 // raced with wake_pg_waiters or prune_pg_waiters
10431 dout(20) << __func__ << " " << item.first << " nothing queued" << dendl;
10432 if (pg) {
10433 pg->unlock();
10434 }
10435 sdata->sdata_op_ordering_lock.Unlock();
10436 return;
10437 }
10438 if (requeue_seq != slot.requeue_seq) {
10439 dout(20) << __func__ << " " << item.first
10440 << " requeue_seq " << slot.requeue_seq << " > our "
10441 << requeue_seq << ", we raced with wake_pg_waiters"
10442 << dendl;
10443 if (pg) {
10444 pg->unlock();
10445 }
10446 sdata->sdata_op_ordering_lock.Unlock();
10447 return;
10448 }
10449 if (pg && !slot.pg && !pg->deleting) {
10450 dout(20) << __func__ << " " << item.first << " set pg to " << pg << dendl;
10451 slot.pg = pg;
10452 }
10453 dout(30) << __func__ << " " << item.first << " to_process " << slot.to_process
10454 << " waiting_for_pg=" << (int)slot.waiting_for_pg << dendl;
10455
10456 // make sure we're not already waiting for this pg
10457 if (slot.waiting_for_pg) {
10458 dout(20) << __func__ << " " << item.first << " item " << item.second
10459 << " slot is waiting_for_pg" << dendl;
10460 if (pg) {
10461 pg->unlock();
10462 }
10463 sdata->sdata_op_ordering_lock.Unlock();
10464 return;
10465 }
10466
10467 // take next item
10468 qi = slot.to_process.front();
10469 slot.to_process.pop_front();
10470 dout(20) << __func__ << " " << item.first << " item " << *qi
10471 << " pg " << pg << dendl;
10472
10473 if (!pg) {
10474 // should this pg shard exist on this osd in this (or a later) epoch?
10475 OSDMapRef osdmap = sdata->waiting_for_pg_osdmap;
10476 if (osdmap->is_up_acting_osd_shard(item.first, osd->whoami)) {
10477 dout(20) << __func__ << " " << item.first
10478 << " no pg, should exist, will wait" << " on " << *qi << dendl;
10479 slot.to_process.push_front(*qi);
10480 slot.waiting_for_pg = true;
10481 } else if (qi->get_map_epoch() > osdmap->get_epoch()) {
10482 dout(20) << __func__ << " " << item.first << " no pg, item epoch is "
10483 << qi->get_map_epoch() << " > " << osdmap->get_epoch()
10484 << ", will wait on " << *qi << dendl;
10485 slot.to_process.push_front(*qi);
10486 slot.waiting_for_pg = true;
10487 } else {
10488 dout(20) << __func__ << " " << item.first << " no pg, shouldn't exist,"
10489 << " dropping " << *qi << dendl;
10490 // share map with client?
10491 if (boost::optional<OpRequestRef> _op = qi->maybe_get_op()) {
10492 Session *session = static_cast<Session *>(
10493 (*_op)->get_req()->get_connection()->get_priv());
10494 if (session) {
10495 osd->maybe_share_map(session, *_op, sdata->waiting_for_pg_osdmap);
10496 session->put();
10497 }
10498 }
10499 unsigned pushes_to_free = qi->get_reserved_pushes();
10500 if (pushes_to_free > 0) {
10501 sdata->sdata_op_ordering_lock.Unlock();
10502 osd->service.release_reserved_pushes(pushes_to_free);
10503 return;
10504 }
10505 }
10506 sdata->sdata_op_ordering_lock.Unlock();
10507 return;
10508 }
10509 sdata->sdata_op_ordering_lock.Unlock();
10510
10511
10512 // osd_opwq_process marks the point at which an operation has been dequeued
10513 // and will begin to be handled by a worker thread.
10514 {
10515 #ifdef WITH_LTTNG
10516 osd_reqid_t reqid;
10517 if (boost::optional<OpRequestRef> _op = qi->maybe_get_op()) {
10518 reqid = (*_op)->get_reqid();
10519 }
10520 #endif
10521 tracepoint(osd, opwq_process_start, reqid.name._type,
10522 reqid.name._num, reqid.tid, reqid.inc);
10523 }
10524
10525 lgeneric_subdout(osd->cct, osd, 30) << "dequeue status: ";
10526 Formatter *f = Formatter::create("json");
10527 f->open_object_section("q");
10528 dump(f);
10529 f->close_section();
10530 f->flush(*_dout);
10531 delete f;
10532 *_dout << dendl;
10533
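// the TPHandle lets qi->run() periodically reset this worker's
// heartbeat so that long-running items do not trip the thread pool's
// timeout warning or, worse, its suicide timeout.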
10534 ThreadPool::TPHandle tp_handle(osd->cct, hb, timeout_interval,
10535 suicide_interval);
10536 qi->run(osd, pg, tp_handle);
10537
10538 {
10539 #ifdef WITH_LTTNG
10540 osd_reqid_t reqid;
10541 if (boost::optional<OpRequestRef> _op = qi->maybe_get_op()) {
10542 reqid = (*_op)->get_reqid();
10543 }
10544 #endif
10545 tracepoint(osd, opwq_process_finish, reqid.name._type,
10546 reqid.name._num, reqid.tid, reqid.inc);
10547 }
10548
10549 pg->unlock();
10550 }
10551
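// Enqueue lands every item for a given pg on the same shard: the shard
// index is derived from the spg_t alone, which is what preserves per-pg
// ordering across worker threads.  A minimal sketch of the mapping
// (assuming hash_to_shard reduces the placement seed modulo the shard
// count):
//
//   spg_t pgid = item.first;
//   uint32_t shard = pgid.hash_to_shard(shard_list.size());
//   ShardData* sdata = shard_list[shard];  // always the same sdata for pgid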
10552 void OSD::ShardedOpWQ::_enqueue(pair<spg_t, PGQueueable> item) {
10553 uint32_t shard_index =
10554 item.first.hash_to_shard(shard_list.size());
10555
10556 ShardData* sdata = shard_list[shard_index];
10557 assert(NULL != sdata);
10558 unsigned priority = item.second.get_priority();
10559 unsigned cost = item.second.get_cost();
10560 sdata->sdata_op_ordering_lock.Lock();
10561
10562 dout(20) << __func__ << " " << item.first << " " << item.second << dendl;
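// items at or above op_prio_cutoff go to the strict sub-queue and are
// always served before lower-priority work; everything else is
// scheduled by priority weighted against its cost.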
10563 if (priority >= osd->op_prio_cutoff)
10564 sdata->pqueue->enqueue_strict(
10565 item.second.get_owner(), priority, item);
10566 else
10567 sdata->pqueue->enqueue(
10568 item.second.get_owner(),
10569 priority, cost, item);
10570 sdata->sdata_op_ordering_lock.Unlock();
10571
10572 sdata->sdata_lock.Lock();
10573 sdata->sdata_cond.SignalOne();
10574 sdata->sdata_lock.Unlock();
10575
10576 }
10577
10578 void OSD::ShardedOpWQ::_enqueue_front(pair<spg_t, PGQueueable> item)
10579 {
10580 uint32_t shard_index = item.first.hash_to_shard(shard_list.size());
10581 ShardData* sdata = shard_list[shard_index];
10582 assert(NULL != sdata);
10583 sdata->sdata_op_ordering_lock.Lock();
10584 auto p = sdata->pg_slots.find(item.first);
10585 if (p != sdata->pg_slots.end() && !p->second.to_process.empty()) {
10586 // we may be racing with _process, which has dequeued a new item
10587 // from pqueue, put it on to_process, and is now busy taking the
10588 // pg lock. ensure this old requeued item is ordered before any
10589 // such newer item in to_process.
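// e.g. (hypothetical): _process has just moved new item B onto
// to_process when old item A is requeued here.  push_front yields
// [A, B]; we then pop B off the back and push it to the front of the
// pqueue instead, so A is consumed before B and ordering is preserved.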
10590 p->second.to_process.push_front(item.second);
10591 item.second = p->second.to_process.back();
10592 p->second.to_process.pop_back();
10593 dout(20) << __func__ << " " << item.first
10594 << " " << p->second.to_process.front()
10595 << " shuffled w/ " << item.second << dendl;
10596 } else {
10597 dout(20) << __func__ << " " << item.first << " " << item.second << dendl;
10598 }
10599 sdata->_enqueue_front(item, osd->op_prio_cutoff);
10600 sdata->sdata_op_ordering_lock.Unlock();
10601 sdata->sdata_lock.Lock();
10602 sdata->sdata_cond.SignalOne();
10603 sdata->sdata_lock.Unlock();
10604 }
10605
10606 namespace ceph {
10607 namespace osd_cmds {
10608
10609 int heap(CephContext& cct, cmdmap_t& cmdmap, Formatter& f, std::ostream& os)
10610 {
10611 if (!ceph_using_tcmalloc()) {
10612 os << "could not issue heap profiler command -- not using tcmalloc!";
10613 return -EOPNOTSUPP;
10614 }
10615
10616 string cmd;
10617 if (!cmd_getval(&cct, cmdmap, "heapcmd", cmd)) {
10618 os << "unable to get value for command \"" << cmd << "\"";
10619 return -EINVAL;
10620 }
10621
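// split the command string into tokens for the profiler, e.g. the
// usual tcmalloc subcommands reached via "ceph tell osd.<id> heap
// <cmd>" (stats, start_profiler, stop_profiler, dump, release).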
10622 std::vector<std::string> cmd_vec;
10623 get_str_vec(cmd, cmd_vec);
10624
10625 ceph_heap_profiler_handle_command(cmd_vec, os);
10626
10627 return 0;
10628 }
10629
10630 }} // namespace ceph::osd_cmds
10631
10632
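// Stream the human-readable name of the configured op queue
// implementation (the osd_op_queue option) for logging and status
// output.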
10633 std::ostream& operator<<(std::ostream& out, const OSD::io_queue& q) {
10634 switch (q) {
10635 case OSD::io_queue::prioritized:
10636 out << "prioritized";
10637 break;
10638 case OSD::io_queue::weightedpriority:
10639 out << "weightedpriority";
10640 break;
10641 case OSD::io_queue::mclock_opclass:
10642 out << "mclock_opclass";
10643 break;
10644 case OSD::io_queue::mclock_client:
10645 out << "mclock_client";
10646 break;
10647 }
10648 return out;
10649 }