]> git.proxmox.com Git - ceph.git/blame - ceph/src/osd/OSD.cc
update sources to 12.2.7
[ceph.git] / ceph / src / osd / OSD.cc
CommitLineData
7c673cae
FG
1// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
2// vim: ts=8 sw=2 smarttab
3/*
4 * Ceph - scalable distributed file system
5 *
6 * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
7 * Copyright (C) 2017 OVH
8 *
9 * This is free software; you can redistribute it and/or
10 * modify it under the terms of the GNU Lesser General Public
11 * License version 2.1, as published by the Free Software
12 * Foundation. See file COPYING.
13 *
14 */
15#include "acconfig.h"
16
17#include <fstream>
18#include <iostream>
19#include <errno.h>
20#include <sys/stat.h>
21#include <signal.h>
22#include <ctype.h>
23#include <boost/scoped_ptr.hpp>
24
25#ifdef HAVE_SYS_PARAM_H
26#include <sys/param.h>
27#endif
28
29#ifdef HAVE_SYS_MOUNT_H
30#include <sys/mount.h>
31#endif
32
33#include "osd/PG.h"
34
35#include "include/types.h"
36#include "include/compat.h"
37
38#include "OSD.h"
39#include "OSDMap.h"
40#include "Watch.h"
41#include "osdc/Objecter.h"
42
43#include "common/errno.h"
44#include "common/ceph_argparse.h"
224ce89b 45#include "common/ceph_time.h"
7c673cae
FG
46#include "common/version.h"
47#include "common/io_priority.h"
b5b8bbf5 48#include "common/pick_address.h"
7c673cae
FG
49
50#include "os/ObjectStore.h"
51#ifdef HAVE_LIBFUSE
52#include "os/FuseStore.h"
53#endif
54
55#include "PrimaryLogPG.h"
56
57
58#include "msg/Messenger.h"
59#include "msg/Message.h"
60
61#include "mon/MonClient.h"
62
63#include "messages/MLog.h"
64
65#include "messages/MGenericMessage.h"
7c673cae
FG
66#include "messages/MOSDPing.h"
67#include "messages/MOSDFailure.h"
68#include "messages/MOSDMarkMeDown.h"
69#include "messages/MOSDFull.h"
70#include "messages/MOSDOp.h"
71#include "messages/MOSDOpReply.h"
72#include "messages/MOSDBackoff.h"
73#include "messages/MOSDBeacon.h"
74#include "messages/MOSDRepOp.h"
75#include "messages/MOSDRepOpReply.h"
76#include "messages/MOSDBoot.h"
77#include "messages/MOSDPGTemp.h"
78
79#include "messages/MOSDMap.h"
80#include "messages/MMonGetOSDMap.h"
81#include "messages/MOSDPGNotify.h"
82#include "messages/MOSDPGQuery.h"
83#include "messages/MOSDPGLog.h"
84#include "messages/MOSDPGRemove.h"
85#include "messages/MOSDPGInfo.h"
86#include "messages/MOSDPGCreate.h"
87#include "messages/MOSDPGTrim.h"
88#include "messages/MOSDPGScan.h"
89#include "messages/MOSDPGBackfill.h"
90#include "messages/MBackfillReserve.h"
91#include "messages/MRecoveryReserve.h"
c07f9fc5 92#include "messages/MOSDForceRecovery.h"
7c673cae
FG
93#include "messages/MOSDECSubOpWrite.h"
94#include "messages/MOSDECSubOpWriteReply.h"
95#include "messages/MOSDECSubOpRead.h"
96#include "messages/MOSDECSubOpReadReply.h"
97#include "messages/MOSDPGCreated.h"
98#include "messages/MOSDPGUpdateLogMissing.h"
99#include "messages/MOSDPGUpdateLogMissingReply.h"
100
101#include "messages/MOSDAlive.h"
102
103#include "messages/MOSDScrub.h"
104#include "messages/MOSDScrubReserve.h"
105#include "messages/MOSDRepScrub.h"
106
107#include "messages/MMonCommand.h"
108#include "messages/MCommand.h"
109#include "messages/MCommandReply.h"
110
111#include "messages/MPGStats.h"
112#include "messages/MPGStatsAck.h"
113
114#include "messages/MWatchNotify.h"
115#include "messages/MOSDPGPush.h"
116#include "messages/MOSDPGPushReply.h"
117#include "messages/MOSDPGPull.h"
118
119#include "common/perf_counters.h"
120#include "common/Timer.h"
121#include "common/LogClient.h"
122#include "common/AsyncReserver.h"
123#include "common/HeartbeatMap.h"
124#include "common/admin_socket.h"
125#include "common/ceph_context.h"
126
127#include "global/signal_handler.h"
128#include "global/pidfile.h"
129
130#include "include/color.h"
131#include "perfglue/cpu_profiler.h"
132#include "perfglue/heap_profiler.h"
133
134#include "osd/OpRequest.h"
135
136#include "auth/AuthAuthorizeHandler.h"
137#include "auth/RotatingKeyRing.h"
138#include "common/errno.h"
139
140#include "objclass/objclass.h"
141
142#include "common/cmdparse.h"
143#include "include/str_list.h"
144#include "include/util.h"
145
146#include "include/assert.h"
147#include "common/config.h"
148#include "common/EventTrace.h"
149
150#ifdef WITH_LTTNG
151#define TRACEPOINT_DEFINE
152#define TRACEPOINT_PROBE_DYNAMIC_LINKAGE
153#include "tracing/osd.h"
154#undef TRACEPOINT_PROBE_DYNAMIC_LINKAGE
155#undef TRACEPOINT_DEFINE
156#else
157#define tracepoint(...)
158#endif
159
160#define dout_context cct
161#define dout_subsys ceph_subsys_osd
162#undef dout_prefix
163#define dout_prefix _prefix(_dout, whoami, get_osdmap_epoch())
164
224ce89b 165
7c673cae
FG
166const double OSD::OSD_TICK_INTERVAL = 1.0;
167
168static ostream& _prefix(std::ostream* _dout, int whoami, epoch_t epoch) {
169 return *_dout << "osd." << whoami << " " << epoch << " ";
170}
171
7c673cae
FG
172//Initial features in new superblock.
173//Features here are also automatically upgraded
174CompatSet OSD::get_osd_initial_compat_set() {
175 CompatSet::FeatureSet ceph_osd_feature_compat;
176 CompatSet::FeatureSet ceph_osd_feature_ro_compat;
177 CompatSet::FeatureSet ceph_osd_feature_incompat;
178 ceph_osd_feature_incompat.insert(CEPH_OSD_FEATURE_INCOMPAT_BASE);
179 ceph_osd_feature_incompat.insert(CEPH_OSD_FEATURE_INCOMPAT_PGINFO);
180 ceph_osd_feature_incompat.insert(CEPH_OSD_FEATURE_INCOMPAT_OLOC);
181 ceph_osd_feature_incompat.insert(CEPH_OSD_FEATURE_INCOMPAT_LEC);
182 ceph_osd_feature_incompat.insert(CEPH_OSD_FEATURE_INCOMPAT_CATEGORIES);
183 ceph_osd_feature_incompat.insert(CEPH_OSD_FEATURE_INCOMPAT_HOBJECTPOOL);
184 ceph_osd_feature_incompat.insert(CEPH_OSD_FEATURE_INCOMPAT_BIGINFO);
185 ceph_osd_feature_incompat.insert(CEPH_OSD_FEATURE_INCOMPAT_LEVELDBINFO);
186 ceph_osd_feature_incompat.insert(CEPH_OSD_FEATURE_INCOMPAT_LEVELDBLOG);
187 ceph_osd_feature_incompat.insert(CEPH_OSD_FEATURE_INCOMPAT_SNAPMAPPER);
188 ceph_osd_feature_incompat.insert(CEPH_OSD_FEATURE_INCOMPAT_HINTS);
189 ceph_osd_feature_incompat.insert(CEPH_OSD_FEATURE_INCOMPAT_PGMETA);
190 ceph_osd_feature_incompat.insert(CEPH_OSD_FEATURE_INCOMPAT_MISSING);
191 ceph_osd_feature_incompat.insert(CEPH_OSD_FEATURE_INCOMPAT_FASTINFO);
c07f9fc5 192 ceph_osd_feature_incompat.insert(CEPH_OSD_FEATURE_INCOMPAT_RECOVERY_DELETES);
7c673cae
FG
193 return CompatSet(ceph_osd_feature_compat, ceph_osd_feature_ro_compat,
194 ceph_osd_feature_incompat);
195}
196
197//Features are added here that this OSD supports.
198CompatSet OSD::get_osd_compat_set() {
199 CompatSet compat = get_osd_initial_compat_set();
200 //Any features here can be set in code, but not in initial superblock
201 compat.incompat.insert(CEPH_OSD_FEATURE_INCOMPAT_SHARDS);
202 return compat;
203}
204
205OSDService::OSDService(OSD *osd) :
206 osd(osd),
207 cct(osd->cct),
208 meta_osr(new ObjectStore::Sequencer("meta")),
209 whoami(osd->whoami), store(osd->store),
210 log_client(osd->log_client), clog(osd->clog),
211 pg_recovery_stats(osd->pg_recovery_stats),
212 cluster_messenger(osd->cluster_messenger),
213 client_messenger(osd->client_messenger),
214 logger(osd->logger),
215 recoverystate_perf(osd->recoverystate_perf),
216 monc(osd->monc),
217 peering_wq(osd->peering_wq),
218 recovery_gen_wq("recovery_gen_wq", cct->_conf->osd_recovery_thread_timeout,
219 &osd->disk_tp),
220 class_handler(osd->class_handler),
221 pg_epoch_lock("OSDService::pg_epoch_lock"),
222 publish_lock("OSDService::publish_lock"),
223 pre_publish_lock("OSDService::pre_publish_lock"),
224 max_oldest_map(0),
225 peer_map_epoch_lock("OSDService::peer_map_epoch_lock"),
226 sched_scrub_lock("OSDService::sched_scrub_lock"), scrubs_pending(0),
227 scrubs_active(0),
228 agent_lock("OSDService::agent_lock"),
229 agent_valid_iterator(false),
230 agent_ops(0),
231 flush_mode_high_count(0),
232 agent_active(true),
233 agent_thread(this),
234 agent_stop_flag(false),
235 agent_timer_lock("OSDService::agent_timer_lock"),
236 agent_timer(osd->client_messenger->cct, agent_timer_lock),
237 last_recalibrate(ceph_clock_now()),
238 promote_max_objects(0),
239 promote_max_bytes(0),
240 objecter(new Objecter(osd->client_messenger->cct, osd->objecter_messenger, osd->monc, NULL, 0, 0)),
241 objecter_finisher(osd->client_messenger->cct),
242 watch_lock("OSDService::watch_lock"),
243 watch_timer(osd->client_messenger->cct, watch_lock),
244 next_notif_id(0),
245 recovery_request_lock("OSDService::recovery_request_lock"),
246 recovery_request_timer(cct, recovery_request_lock, false),
31f18b77
FG
247 recovery_sleep_lock("OSDService::recovery_sleep_lock"),
248 recovery_sleep_timer(cct, recovery_sleep_lock, false),
7c673cae 249 reserver_finisher(cct),
3efd9988 250 local_reserver(cct, &reserver_finisher, cct->_conf->osd_max_backfills,
7c673cae 251 cct->_conf->osd_min_recovery_priority),
3efd9988 252 remote_reserver(cct, &reserver_finisher, cct->_conf->osd_max_backfills,
7c673cae
FG
253 cct->_conf->osd_min_recovery_priority),
254 pg_temp_lock("OSDService::pg_temp_lock"),
255 snap_sleep_lock("OSDService::snap_sleep_lock"),
256 snap_sleep_timer(
257 osd->client_messenger->cct, snap_sleep_lock, false /* relax locking */),
31f18b77
FG
258 scrub_sleep_lock("OSDService::scrub_sleep_lock"),
259 scrub_sleep_timer(
260 osd->client_messenger->cct, scrub_sleep_lock, false /* relax locking */),
3efd9988 261 snap_reserver(cct, &reserver_finisher,
7c673cae
FG
262 cct->_conf->osd_max_trimming_pgs),
263 recovery_lock("OSDService::recovery_lock"),
264 recovery_ops_active(0),
265 recovery_ops_reserved(0),
266 recovery_paused(false),
267 map_cache_lock("OSDService::map_cache_lock"),
268 map_cache(cct, cct->_conf->osd_map_cache_size),
269 map_bl_cache(cct->_conf->osd_map_cache_size),
270 map_bl_inc_cache(cct->_conf->osd_map_cache_size),
271 in_progress_split_lock("OSDService::in_progress_split_lock"),
272 stat_lock("OSDService::stat_lock"),
273 full_status_lock("OSDService::full_status_lock"),
274 cur_state(NONE),
275 cur_ratio(0),
276 epoch_lock("OSDService::epoch_lock"),
277 boot_epoch(0), up_epoch(0), bind_epoch(0),
278 is_stopping_lock("OSDService::is_stopping_lock")
279#ifdef PG_DEBUG_REFS
280 , pgid_lock("OSDService::pgid_lock")
281#endif
282{
283 objecter->init();
284}
285
286OSDService::~OSDService()
287{
288 delete objecter;
289}
290
31f18b77
FG
291
292
293#ifdef PG_DEBUG_REFS
294void OSDService::add_pgid(spg_t pgid, PG *pg){
295 Mutex::Locker l(pgid_lock);
296 if (!pgid_tracker.count(pgid)) {
297 live_pgs[pgid] = pg;
298 }
299 pgid_tracker[pgid]++;
300}
301void OSDService::remove_pgid(spg_t pgid, PG *pg)
302{
303 Mutex::Locker l(pgid_lock);
304 assert(pgid_tracker.count(pgid));
305 assert(pgid_tracker[pgid] > 0);
306 pgid_tracker[pgid]--;
307 if (pgid_tracker[pgid] == 0) {
308 pgid_tracker.erase(pgid);
309 live_pgs.erase(pgid);
310 }
311}
312void OSDService::dump_live_pgids()
313{
314 Mutex::Locker l(pgid_lock);
315 derr << "live pgids:" << dendl;
316 for (map<spg_t, int>::const_iterator i = pgid_tracker.cbegin();
317 i != pgid_tracker.cend();
318 ++i) {
319 derr << "\t" << *i << dendl;
320 live_pgs[i->first]->dump_live_ids();
321 }
322}
323#endif
324
325
7c673cae
FG
326void OSDService::_start_split(spg_t parent, const set<spg_t> &children)
327{
328 for (set<spg_t>::const_iterator i = children.begin();
329 i != children.end();
330 ++i) {
331 dout(10) << __func__ << ": Starting split on pg " << *i
332 << ", parent=" << parent << dendl;
333 assert(!pending_splits.count(*i));
334 assert(!in_progress_splits.count(*i));
335 pending_splits.insert(make_pair(*i, parent));
336
337 assert(!rev_pending_splits[parent].count(*i));
338 rev_pending_splits[parent].insert(*i);
339 }
340}
341
342void OSDService::mark_split_in_progress(spg_t parent, const set<spg_t> &children)
343{
344 Mutex::Locker l(in_progress_split_lock);
345 map<spg_t, set<spg_t> >::iterator piter = rev_pending_splits.find(parent);
346 assert(piter != rev_pending_splits.end());
347 for (set<spg_t>::const_iterator i = children.begin();
348 i != children.end();
349 ++i) {
350 assert(piter->second.count(*i));
351 assert(pending_splits.count(*i));
352 assert(!in_progress_splits.count(*i));
353 assert(pending_splits[*i] == parent);
354
355 pending_splits.erase(*i);
356 piter->second.erase(*i);
357 in_progress_splits.insert(*i);
358 }
359 if (piter->second.empty())
360 rev_pending_splits.erase(piter);
361}
362
363void OSDService::cancel_pending_splits_for_parent(spg_t parent)
364{
365 Mutex::Locker l(in_progress_split_lock);
366 _cancel_pending_splits_for_parent(parent);
367}
368
369void OSDService::_cancel_pending_splits_for_parent(spg_t parent)
370{
371 map<spg_t, set<spg_t> >::iterator piter = rev_pending_splits.find(parent);
372 if (piter == rev_pending_splits.end())
373 return;
374
375 for (set<spg_t>::iterator i = piter->second.begin();
376 i != piter->second.end();
377 ++i) {
378 assert(pending_splits.count(*i));
379 assert(!in_progress_splits.count(*i));
380 pending_splits.erase(*i);
381 dout(10) << __func__ << ": Completing split on pg " << *i
382 << " for parent: " << parent << dendl;
383 _cancel_pending_splits_for_parent(*i);
384 }
385 rev_pending_splits.erase(piter);
386}
387
388void OSDService::_maybe_split_pgid(OSDMapRef old_map,
389 OSDMapRef new_map,
390 spg_t pgid)
391{
392 assert(old_map->have_pg_pool(pgid.pool()));
393 int old_pgnum = old_map->get_pg_num(pgid.pool());
394 if (pgid.ps() < static_cast<unsigned>(old_pgnum)) {
395 set<spg_t> children;
396 if (pgid.is_split(old_pgnum,
397 new_map->get_pg_num(pgid.pool()), &children)) {
398 _start_split(pgid, children); }
399 } else {
400 assert(pgid.ps() < static_cast<unsigned>(new_map->get_pg_num(pgid.pool())));
401 }
402}
403
404void OSDService::init_splits_between(spg_t pgid,
405 OSDMapRef frommap,
406 OSDMapRef tomap)
407{
408 // First, check whether we can avoid this potentially expensive check
409 if (tomap->have_pg_pool(pgid.pool()) &&
410 pgid.is_split(
411 frommap->get_pg_num(pgid.pool()),
412 tomap->get_pg_num(pgid.pool()),
413 NULL)) {
414 // Ok, a split happened, so we need to walk the osdmaps
415 set<spg_t> new_pgs; // pgs to scan on each map
416 new_pgs.insert(pgid);
417 OSDMapRef curmap(get_map(frommap->get_epoch()));
418 for (epoch_t e = frommap->get_epoch() + 1;
419 e <= tomap->get_epoch();
420 ++e) {
421 OSDMapRef nextmap(try_get_map(e));
422 if (!nextmap)
423 continue;
424 set<spg_t> even_newer_pgs; // pgs added in this loop
425 for (set<spg_t>::iterator i = new_pgs.begin(); i != new_pgs.end(); ++i) {
426 set<spg_t> split_pgs;
427 if (i->is_split(curmap->get_pg_num(i->pool()),
428 nextmap->get_pg_num(i->pool()),
429 &split_pgs)) {
430 start_split(*i, split_pgs);
431 even_newer_pgs.insert(split_pgs.begin(), split_pgs.end());
432 }
433 }
434 new_pgs.insert(even_newer_pgs.begin(), even_newer_pgs.end());
435 curmap = nextmap;
436 }
437 assert(curmap == tomap); // we must have had both frommap and tomap
438 }
439}
440
441void OSDService::expand_pg_num(OSDMapRef old_map,
442 OSDMapRef new_map)
443{
444 Mutex::Locker l(in_progress_split_lock);
445 for (set<spg_t>::iterator i = in_progress_splits.begin();
446 i != in_progress_splits.end();
447 ) {
448 if (!new_map->have_pg_pool(i->pool())) {
449 in_progress_splits.erase(i++);
450 } else {
451 _maybe_split_pgid(old_map, new_map, *i);
452 ++i;
453 }
454 }
455 for (map<spg_t, spg_t>::iterator i = pending_splits.begin();
456 i != pending_splits.end();
457 ) {
458 if (!new_map->have_pg_pool(i->first.pool())) {
459 rev_pending_splits.erase(i->second);
460 pending_splits.erase(i++);
461 } else {
462 _maybe_split_pgid(old_map, new_map, i->first);
463 ++i;
464 }
465 }
466}
467
468bool OSDService::splitting(spg_t pgid)
469{
470 Mutex::Locker l(in_progress_split_lock);
471 return in_progress_splits.count(pgid) ||
472 pending_splits.count(pgid);
473}
474
475void OSDService::complete_split(const set<spg_t> &pgs)
476{
477 Mutex::Locker l(in_progress_split_lock);
478 for (set<spg_t>::const_iterator i = pgs.begin();
479 i != pgs.end();
480 ++i) {
481 dout(10) << __func__ << ": Completing split on pg " << *i << dendl;
482 assert(!pending_splits.count(*i));
483 assert(in_progress_splits.count(*i));
484 in_progress_splits.erase(*i);
485 }
486}
487
488void OSDService::need_heartbeat_peer_update()
489{
490 osd->need_heartbeat_peer_update();
491}
492
493void OSDService::pg_stat_queue_enqueue(PG *pg)
494{
495 osd->pg_stat_queue_enqueue(pg);
496}
497
498void OSDService::pg_stat_queue_dequeue(PG *pg)
499{
500 osd->pg_stat_queue_dequeue(pg);
501}
502
503void OSDService::start_shutdown()
504{
505 {
506 Mutex::Locker l(agent_timer_lock);
507 agent_timer.shutdown();
508 }
31f18b77
FG
509
510 {
511 Mutex::Locker l(recovery_sleep_lock);
512 recovery_sleep_timer.shutdown();
513 }
7c673cae
FG
514}
515
31f18b77 516void OSDService::shutdown_reserver()
7c673cae
FG
517{
518 reserver_finisher.wait_for_empty();
519 reserver_finisher.stop();
31f18b77
FG
520}
521
522void OSDService::shutdown()
523{
7c673cae
FG
524 {
525 Mutex::Locker l(watch_lock);
526 watch_timer.shutdown();
527 }
528
529 objecter->shutdown();
530 objecter_finisher.wait_for_empty();
531 objecter_finisher.stop();
532
533 {
534 Mutex::Locker l(recovery_request_lock);
535 recovery_request_timer.shutdown();
536 }
537
538 {
539 Mutex::Locker l(snap_sleep_lock);
540 snap_sleep_timer.shutdown();
541 }
542
31f18b77
FG
543 {
544 Mutex::Locker l(scrub_sleep_lock);
545 scrub_sleep_timer.shutdown();
546 }
547
7c673cae
FG
548 osdmap = OSDMapRef();
549 next_osdmap = OSDMapRef();
550}
551
552void OSDService::init()
553{
554 reserver_finisher.start();
555 objecter_finisher.start();
556 objecter->set_client_incarnation(0);
557
558 // deprioritize objecter in daemonperf output
559 objecter->get_logger()->set_prio_adjust(-3);
560
561 watch_timer.init();
562 agent_timer.init();
563 snap_sleep_timer.init();
31f18b77 564 scrub_sleep_timer.init();
7c673cae
FG
565
566 agent_thread.create("osd_srv_agent");
567
568 if (cct->_conf->osd_recovery_delay_start)
569 defer_recovery(cct->_conf->osd_recovery_delay_start);
570}
571
572void OSDService::final_init()
573{
574 objecter->start(osdmap.get());
575}
576
577void OSDService::activate_map()
578{
579 // wake/unwake the tiering agent
580 agent_lock.Lock();
581 agent_active =
582 !osdmap->test_flag(CEPH_OSDMAP_NOTIERAGENT) &&
583 osd->is_active();
584 agent_cond.Signal();
585 agent_lock.Unlock();
586}
587
181888fb
FG
588void OSDService::request_osdmap_update(epoch_t e)
589{
590 osd->osdmap_subscribe(e, false);
591}
592
7c673cae
FG
593class AgentTimeoutCB : public Context {
594 PGRef pg;
595public:
596 explicit AgentTimeoutCB(PGRef _pg) : pg(_pg) {}
597 void finish(int) override {
598 pg->agent_choose_mode_restart();
599 }
600};
601
602void OSDService::agent_entry()
603{
604 dout(10) << __func__ << " start" << dendl;
605 agent_lock.Lock();
606
607 while (!agent_stop_flag) {
608 if (agent_queue.empty()) {
609 dout(20) << __func__ << " empty queue" << dendl;
610 agent_cond.Wait(agent_lock);
611 continue;
612 }
613 uint64_t level = agent_queue.rbegin()->first;
614 set<PGRef>& top = agent_queue.rbegin()->second;
615 dout(10) << __func__
616 << " tiers " << agent_queue.size()
617 << ", top is " << level
618 << " with pgs " << top.size()
619 << ", ops " << agent_ops << "/"
620 << cct->_conf->osd_agent_max_ops
621 << (agent_active ? " active" : " NOT ACTIVE")
622 << dendl;
623 dout(20) << __func__ << " oids " << agent_oids << dendl;
624 int max = cct->_conf->osd_agent_max_ops - agent_ops;
625 int agent_flush_quota = max;
626 if (!flush_mode_high_count)
627 agent_flush_quota = cct->_conf->osd_agent_max_low_ops - agent_ops;
628 if (agent_flush_quota <= 0 || top.empty() || !agent_active) {
629 agent_cond.Wait(agent_lock);
630 continue;
631 }
632
633 if (!agent_valid_iterator || agent_queue_pos == top.end()) {
634 agent_queue_pos = top.begin();
635 agent_valid_iterator = true;
636 }
637 PGRef pg = *agent_queue_pos;
638 dout(10) << "high_count " << flush_mode_high_count
639 << " agent_ops " << agent_ops
640 << " flush_quota " << agent_flush_quota << dendl;
641 agent_lock.Unlock();
642 if (!pg->agent_work(max, agent_flush_quota)) {
643 dout(10) << __func__ << " " << pg->get_pgid()
644 << " no agent_work, delay for " << cct->_conf->osd_agent_delay_time
645 << " seconds" << dendl;
646
647 osd->logger->inc(l_osd_tier_delay);
648 // Queue a timer to call agent_choose_mode for this pg in 5 seconds
649 agent_timer_lock.Lock();
650 Context *cb = new AgentTimeoutCB(pg);
651 agent_timer.add_event_after(cct->_conf->osd_agent_delay_time, cb);
652 agent_timer_lock.Unlock();
653 }
654 agent_lock.Lock();
655 }
656 agent_lock.Unlock();
657 dout(10) << __func__ << " finish" << dendl;
658}
659
660void OSDService::agent_stop()
661{
662 {
663 Mutex::Locker l(agent_lock);
664
665 // By this time all ops should be cancelled
666 assert(agent_ops == 0);
667 // By this time all PGs are shutdown and dequeued
668 if (!agent_queue.empty()) {
669 set<PGRef>& top = agent_queue.rbegin()->second;
670 derr << "agent queue not empty, for example " << (*top.begin())->info.pgid << dendl;
671 assert(0 == "agent queue not empty");
672 }
673
674 agent_stop_flag = true;
675 agent_cond.Signal();
676 }
677 agent_thread.join();
678}
679
680// -------------------------------------
681
682void OSDService::promote_throttle_recalibrate()
683{
684 utime_t now = ceph_clock_now();
685 double dur = now - last_recalibrate;
686 last_recalibrate = now;
687 unsigned prob = promote_probability_millis;
688
689 uint64_t target_obj_sec = cct->_conf->osd_tier_promote_max_objects_sec;
690 uint64_t target_bytes_sec = cct->_conf->osd_tier_promote_max_bytes_sec;
691
692 unsigned min_prob = 1;
693
694 uint64_t attempts, obj, bytes;
695 promote_counter.sample_and_attenuate(&attempts, &obj, &bytes);
696 dout(10) << __func__ << " " << attempts << " attempts, promoted "
697 << obj << " objects and " << pretty_si_t(bytes) << " bytes; target "
698 << target_obj_sec << " obj/sec or "
699 << pretty_si_t(target_bytes_sec) << " bytes/sec"
700 << dendl;
701
702 // calculate what the probability *should* be, given the targets
703 unsigned new_prob;
704 if (attempts && dur > 0) {
705 uint64_t avg_size = 1;
706 if (obj)
707 avg_size = MAX(bytes / obj, 1);
708 unsigned po = (double)target_obj_sec * dur * 1000.0 / (double)attempts;
709 unsigned pb = (double)target_bytes_sec / (double)avg_size * dur * 1000.0
710 / (double)attempts;
711 dout(20) << __func__ << " po " << po << " pb " << pb << " avg_size "
712 << avg_size << dendl;
713 if (target_obj_sec && target_bytes_sec)
714 new_prob = MIN(po, pb);
715 else if (target_obj_sec)
716 new_prob = po;
717 else if (target_bytes_sec)
718 new_prob = pb;
719 else
720 new_prob = 1000;
721 } else {
722 new_prob = 1000;
723 }
724 dout(20) << __func__ << " new_prob " << new_prob << dendl;
725
726 // correct for persistent skew between target rate and actual rate, adjust
727 double ratio = 1.0;
728 unsigned actual = 0;
729 if (attempts && obj) {
730 actual = obj * 1000 / attempts;
731 ratio = (double)actual / (double)prob;
732 new_prob = (double)new_prob / ratio;
733 }
734 new_prob = MAX(new_prob, min_prob);
735 new_prob = MIN(new_prob, 1000);
736
737 // adjust
738 prob = (prob + new_prob) / 2;
739 prob = MAX(prob, min_prob);
740 prob = MIN(prob, 1000);
741 dout(10) << __func__ << " actual " << actual
742 << ", actual/prob ratio " << ratio
743 << ", adjusted new_prob " << new_prob
744 << ", prob " << promote_probability_millis << " -> " << prob
745 << dendl;
746 promote_probability_millis = prob;
747
748 // set hard limits for this interval to mitigate stampedes
749 promote_max_objects = target_obj_sec * OSD::OSD_TICK_INTERVAL * 2;
750 promote_max_bytes = target_bytes_sec * OSD::OSD_TICK_INTERVAL * 2;
751}
752
753// -------------------------------------
754
755float OSDService::get_failsafe_full_ratio()
756{
757 float full_ratio = cct->_conf->osd_failsafe_full_ratio;
758 if (full_ratio > 1.0) full_ratio /= 100.0;
759 return full_ratio;
760}
761
224ce89b 762void OSDService::check_full_status(float ratio)
7c673cae
FG
763{
764 Mutex::Locker l(full_status_lock);
765
7c673cae
FG
766 cur_ratio = ratio;
767
768 // The OSDMap ratios take precendence. So if the failsafe is .95 and
769 // the admin sets the cluster full to .96, the failsafe moves up to .96
770 // too. (Not that having failsafe == full is ideal, but it's better than
771 // dropping writes before the clusters appears full.)
772 OSDMapRef osdmap = get_osdmap();
773 if (!osdmap || osdmap->get_epoch() == 0) {
774 cur_state = NONE;
775 return;
776 }
777 float nearfull_ratio = osdmap->get_nearfull_ratio();
778 float backfillfull_ratio = std::max(osdmap->get_backfillfull_ratio(), nearfull_ratio);
779 float full_ratio = std::max(osdmap->get_full_ratio(), backfillfull_ratio);
780 float failsafe_ratio = std::max(get_failsafe_full_ratio(), full_ratio);
781
31f18b77 782 if (osdmap->require_osd_release < CEPH_RELEASE_LUMINOUS) {
7c673cae
FG
783 // use the failsafe for nearfull and full; the mon isn't using the
784 // flags anyway because we're mid-upgrade.
785 full_ratio = failsafe_ratio;
786 backfillfull_ratio = failsafe_ratio;
787 nearfull_ratio = failsafe_ratio;
788 } else if (full_ratio <= 0 ||
789 backfillfull_ratio <= 0 ||
790 nearfull_ratio <= 0) {
791 derr << __func__ << " full_ratio, backfillfull_ratio or nearfull_ratio is <= 0" << dendl;
792 // use failsafe flag. ick. the monitor did something wrong or the user
793 // did something stupid.
794 full_ratio = failsafe_ratio;
795 backfillfull_ratio = failsafe_ratio;
796 nearfull_ratio = failsafe_ratio;
797 }
798
799 string inject;
800 s_names new_state;
801 if (injectfull_state > NONE && injectfull) {
802 new_state = injectfull_state;
803 inject = "(Injected)";
804 } else if (ratio > failsafe_ratio) {
805 new_state = FAILSAFE;
806 } else if (ratio > full_ratio) {
807 new_state = FULL;
808 } else if (ratio > backfillfull_ratio) {
809 new_state = BACKFILLFULL;
810 } else if (ratio > nearfull_ratio) {
811 new_state = NEARFULL;
812 } else {
813 new_state = NONE;
814 }
815 dout(20) << __func__ << " cur ratio " << ratio
816 << ". nearfull_ratio " << nearfull_ratio
817 << ". backfillfull_ratio " << backfillfull_ratio
818 << ", full_ratio " << full_ratio
819 << ", failsafe_ratio " << failsafe_ratio
820 << ", new state " << get_full_state_name(new_state)
821 << " " << inject
822 << dendl;
823
824 // warn
825 if (cur_state != new_state) {
826 dout(10) << __func__ << " " << get_full_state_name(cur_state)
827 << " -> " << get_full_state_name(new_state) << dendl;
828 if (new_state == FAILSAFE) {
c07f9fc5 829 clog->error() << "full status failsafe engaged, dropping updates, now "
7c673cae
FG
830 << (int)roundf(ratio * 100) << "% full";
831 } else if (cur_state == FAILSAFE) {
c07f9fc5
FG
832 clog->error() << "full status failsafe disengaged, no longer dropping "
833 << "updates, now " << (int)roundf(ratio * 100) << "% full";
7c673cae
FG
834 }
835 cur_state = new_state;
836 }
837}
838
839bool OSDService::need_fullness_update()
840{
841 OSDMapRef osdmap = get_osdmap();
842 s_names cur = NONE;
843 if (osdmap->exists(whoami)) {
844 if (osdmap->get_state(whoami) & CEPH_OSD_FULL) {
845 cur = FULL;
846 } else if (osdmap->get_state(whoami) & CEPH_OSD_BACKFILLFULL) {
847 cur = BACKFILLFULL;
848 } else if (osdmap->get_state(whoami) & CEPH_OSD_NEARFULL) {
849 cur = NEARFULL;
850 }
851 }
852 s_names want = NONE;
853 if (is_full())
854 want = FULL;
855 else if (is_backfillfull())
856 want = BACKFILLFULL;
857 else if (is_nearfull())
858 want = NEARFULL;
859 return want != cur;
860}
861
862bool OSDService::_check_full(s_names type, ostream &ss) const
863{
864 Mutex::Locker l(full_status_lock);
865
866 if (injectfull && injectfull_state >= type) {
867 // injectfull is either a count of the number of times to return failsafe full
868 // or if -1 then always return full
869 if (injectfull > 0)
870 --injectfull;
871 ss << "Injected " << get_full_state_name(type) << " OSD ("
872 << (injectfull < 0 ? "set" : std::to_string(injectfull)) << ")";
873 return true;
874 }
875
876 ss << "current usage is " << cur_ratio;
877 return cur_state >= type;
878}
879
880bool OSDService::check_failsafe_full(ostream &ss) const
881{
882 return _check_full(FAILSAFE, ss);
883}
884
885bool OSDService::check_full(ostream &ss) const
886{
887 return _check_full(FULL, ss);
888}
889
890bool OSDService::check_backfill_full(ostream &ss) const
891{
892 return _check_full(BACKFILLFULL, ss);
893}
894
895bool OSDService::check_nearfull(ostream &ss) const
896{
897 return _check_full(NEARFULL, ss);
898}
899
900bool OSDService::is_failsafe_full() const
901{
902 Mutex::Locker l(full_status_lock);
903 return cur_state == FAILSAFE;
904}
905
906bool OSDService::is_full() const
907{
908 Mutex::Locker l(full_status_lock);
909 return cur_state >= FULL;
910}
911
912bool OSDService::is_backfillfull() const
913{
914 Mutex::Locker l(full_status_lock);
915 return cur_state >= BACKFILLFULL;
916}
917
918bool OSDService::is_nearfull() const
919{
920 Mutex::Locker l(full_status_lock);
921 return cur_state >= NEARFULL;
922}
923
924void OSDService::set_injectfull(s_names type, int64_t count)
925{
926 Mutex::Locker l(full_status_lock);
927 injectfull_state = type;
928 injectfull = count;
929}
930
224ce89b 931osd_stat_t OSDService::set_osd_stat(const struct store_statfs_t &stbuf,
35e4c445
FG
932 vector<int>& hb_peers,
933 int num_pgs)
7c673cae 934{
224ce89b
WB
935 uint64_t bytes = stbuf.total;
936 uint64_t used = bytes - stbuf.available;
937 uint64_t avail = stbuf.available;
7c673cae 938
224ce89b
WB
939 osd->logger->set(l_osd_stat_bytes, bytes);
940 osd->logger->set(l_osd_stat_bytes_used, used);
941 osd->logger->set(l_osd_stat_bytes_avail, avail);
7c673cae 942
224ce89b
WB
943 {
944 Mutex::Locker l(stat_lock);
945 osd_stat.hb_peers.swap(hb_peers);
946 osd->op_tracker.get_age_ms_histogram(&osd_stat.op_queue_age_hist);
947 osd_stat.kb = bytes >> 10;
948 osd_stat.kb_used = used >> 10;
949 osd_stat.kb_avail = avail >> 10;
35e4c445 950 osd_stat.num_pgs = num_pgs;
224ce89b
WB
951 return osd_stat;
952 }
953}
7c673cae 954
224ce89b
WB
955void OSDService::update_osd_stat(vector<int>& hb_peers)
956{
957 // load osd stats first
7c673cae
FG
958 struct store_statfs_t stbuf;
959 int r = osd->store->statfs(&stbuf);
960 if (r < 0) {
961 derr << "statfs() failed: " << cpp_strerror(r) << dendl;
962 return;
963 }
964
35e4c445 965 auto new_stat = set_osd_stat(stbuf, hb_peers, osd->get_num_pgs());
224ce89b
WB
966 dout(20) << "update_osd_stat " << new_stat << dendl;
967 assert(new_stat.kb);
968 float ratio = ((float)new_stat.kb_used) / ((float)new_stat.kb);
969 check_full_status(ratio);
7c673cae
FG
970}
971
972bool OSDService::check_osdmap_full(const set<pg_shard_t> &missing_on)
973{
974 OSDMapRef osdmap = get_osdmap();
975 for (auto shard : missing_on) {
976 if (osdmap->get_state(shard.osd) & CEPH_OSD_FULL)
977 return true;
978 }
979 return false;
980}
981
982void OSDService::send_message_osd_cluster(int peer, Message *m, epoch_t from_epoch)
983{
984 OSDMapRef next_map = get_nextmap_reserved();
985 // service map is always newer/newest
986 assert(from_epoch <= next_map->get_epoch());
987
988 if (next_map->is_down(peer) ||
989 next_map->get_info(peer).up_from > from_epoch) {
990 m->put();
991 release_map(next_map);
992 return;
993 }
994 const entity_inst_t& peer_inst = next_map->get_cluster_inst(peer);
995 ConnectionRef peer_con = osd->cluster_messenger->get_connection(peer_inst);
996 share_map_peer(peer, peer_con.get(), next_map);
997 peer_con->send_message(m);
998 release_map(next_map);
999}
1000
1001ConnectionRef OSDService::get_con_osd_cluster(int peer, epoch_t from_epoch)
1002{
1003 OSDMapRef next_map = get_nextmap_reserved();
1004 // service map is always newer/newest
1005 assert(from_epoch <= next_map->get_epoch());
1006
1007 if (next_map->is_down(peer) ||
1008 next_map->get_info(peer).up_from > from_epoch) {
1009 release_map(next_map);
1010 return NULL;
1011 }
1012 ConnectionRef con = osd->cluster_messenger->get_connection(next_map->get_cluster_inst(peer));
1013 release_map(next_map);
1014 return con;
1015}
1016
1017pair<ConnectionRef,ConnectionRef> OSDService::get_con_osd_hb(int peer, epoch_t from_epoch)
1018{
1019 OSDMapRef next_map = get_nextmap_reserved();
1020 // service map is always newer/newest
1021 assert(from_epoch <= next_map->get_epoch());
1022
1023 pair<ConnectionRef,ConnectionRef> ret;
1024 if (next_map->is_down(peer) ||
1025 next_map->get_info(peer).up_from > from_epoch) {
1026 release_map(next_map);
1027 return ret;
1028 }
1029 ret.first = osd->hb_back_client_messenger->get_connection(next_map->get_hb_back_inst(peer));
1030 if (next_map->get_hb_front_addr(peer) != entity_addr_t())
1031 ret.second = osd->hb_front_client_messenger->get_connection(next_map->get_hb_front_inst(peer));
1032 release_map(next_map);
1033 return ret;
1034}
1035
1036
94b18763
FG
1037void OSDService::queue_want_pg_temp(pg_t pgid,
1038 const vector<int>& want,
1039 bool forced)
7c673cae
FG
1040{
1041 Mutex::Locker l(pg_temp_lock);
94b18763 1042 auto p = pg_temp_pending.find(pgid);
7c673cae 1043 if (p == pg_temp_pending.end() ||
94b18763
FG
1044 p->second.acting != want ||
1045 forced) {
1046 pg_temp_wanted[pgid] = pg_temp_t{want, forced};
7c673cae
FG
1047 }
1048}
1049
1050void OSDService::remove_want_pg_temp(pg_t pgid)
1051{
1052 Mutex::Locker l(pg_temp_lock);
1053 pg_temp_wanted.erase(pgid);
1054 pg_temp_pending.erase(pgid);
1055}
1056
1057void OSDService::_sent_pg_temp()
1058{
94b18763
FG
1059 pg_temp_pending.insert(make_move_iterator(begin(pg_temp_wanted)),
1060 make_move_iterator(end(pg_temp_wanted)));
7c673cae
FG
1061 pg_temp_wanted.clear();
1062}
1063
1064void OSDService::requeue_pg_temp()
1065{
1066 Mutex::Locker l(pg_temp_lock);
1067 // wanted overrides pending. note that remove_want_pg_temp
1068 // clears the item out of both.
1069 unsigned old_wanted = pg_temp_wanted.size();
1070 unsigned old_pending = pg_temp_pending.size();
1071 _sent_pg_temp();
1072 pg_temp_wanted.swap(pg_temp_pending);
1073 dout(10) << __func__ << " " << old_wanted << " + " << old_pending << " -> "
1074 << pg_temp_wanted.size() << dendl;
1075}
1076
94b18763
FG
1077std::ostream& operator<<(std::ostream& out,
1078 const OSDService::pg_temp_t& pg_temp)
1079{
1080 out << pg_temp.acting;
1081 if (pg_temp.forced) {
1082 out << " (forced)";
1083 }
1084 return out;
1085}
1086
7c673cae
FG
1087void OSDService::send_pg_temp()
1088{
1089 Mutex::Locker l(pg_temp_lock);
1090 if (pg_temp_wanted.empty())
1091 return;
1092 dout(10) << "send_pg_temp " << pg_temp_wanted << dendl;
94b18763
FG
1093 MOSDPGTemp *ms[2] = {nullptr, nullptr};
1094 for (auto& pg_temp : pg_temp_wanted) {
1095 auto& m = ms[pg_temp.second.forced];
1096 if (!m) {
1097 m = new MOSDPGTemp(osdmap->get_epoch());
1098 m->forced = pg_temp.second.forced;
1099 }
1100 m->pg_temp.emplace(pg_temp.first,
1101 pg_temp.second.acting);
1102 }
1103 for (auto m : ms) {
1104 if (m) {
1105 monc->send_mon_message(m);
1106 }
1107 }
7c673cae
FG
1108 _sent_pg_temp();
1109}
1110
1111void OSDService::send_pg_created(pg_t pgid)
1112{
1113 dout(20) << __func__ << dendl;
c07f9fc5
FG
1114 if (osdmap->require_osd_release >= CEPH_RELEASE_LUMINOUS) {
1115 monc->send_mon_message(new MOSDPGCreated(pgid));
1116 }
7c673cae
FG
1117}
1118
1119// --------------------------------------
1120// dispatch
1121
1122epoch_t OSDService::get_peer_epoch(int peer)
1123{
1124 Mutex::Locker l(peer_map_epoch_lock);
1125 map<int,epoch_t>::iterator p = peer_map_epoch.find(peer);
1126 if (p == peer_map_epoch.end())
1127 return 0;
1128 return p->second;
1129}
1130
1131epoch_t OSDService::note_peer_epoch(int peer, epoch_t e)
1132{
1133 Mutex::Locker l(peer_map_epoch_lock);
1134 map<int,epoch_t>::iterator p = peer_map_epoch.find(peer);
1135 if (p != peer_map_epoch.end()) {
1136 if (p->second < e) {
1137 dout(10) << "note_peer_epoch osd." << peer << " has " << e << dendl;
1138 p->second = e;
1139 } else {
1140 dout(30) << "note_peer_epoch osd." << peer << " has " << p->second << " >= " << e << dendl;
1141 }
1142 return p->second;
1143 } else {
1144 dout(10) << "note_peer_epoch osd." << peer << " now has " << e << dendl;
1145 peer_map_epoch[peer] = e;
1146 return e;
1147 }
1148}
1149
1150void OSDService::forget_peer_epoch(int peer, epoch_t as_of)
1151{
1152 Mutex::Locker l(peer_map_epoch_lock);
1153 map<int,epoch_t>::iterator p = peer_map_epoch.find(peer);
1154 if (p != peer_map_epoch.end()) {
1155 if (p->second <= as_of) {
1156 dout(10) << "forget_peer_epoch osd." << peer << " as_of " << as_of
1157 << " had " << p->second << dendl;
1158 peer_map_epoch.erase(p);
1159 } else {
1160 dout(10) << "forget_peer_epoch osd." << peer << " as_of " << as_of
1161 << " has " << p->second << " - not forgetting" << dendl;
1162 }
1163 }
1164}
1165
1166bool OSDService::should_share_map(entity_name_t name, Connection *con,
1167 epoch_t epoch, const OSDMapRef& osdmap,
1168 const epoch_t *sent_epoch_p)
1169{
1170 dout(20) << "should_share_map "
1171 << name << " " << con->get_peer_addr()
1172 << " " << epoch << dendl;
1173
1174 // does client have old map?
1175 if (name.is_client()) {
1176 bool message_sendmap = epoch < osdmap->get_epoch();
1177 if (message_sendmap && sent_epoch_p) {
1178 dout(20) << "client session last_sent_epoch: "
1179 << *sent_epoch_p
1180 << " versus osdmap epoch " << osdmap->get_epoch() << dendl;
1181 if (*sent_epoch_p < osdmap->get_epoch()) {
1182 return true;
1183 } // else we don't need to send it out again
1184 }
1185 }
1186
1187 if (con->get_messenger() == osd->cluster_messenger &&
1188 con != osd->cluster_messenger->get_loopback_connection() &&
1189 osdmap->is_up(name.num()) &&
1190 (osdmap->get_cluster_addr(name.num()) == con->get_peer_addr() ||
1191 osdmap->get_hb_back_addr(name.num()) == con->get_peer_addr())) {
1192 // remember
1193 epoch_t has = MAX(get_peer_epoch(name.num()), epoch);
1194
1195 // share?
1196 if (has < osdmap->get_epoch()) {
1197 dout(10) << name << " " << con->get_peer_addr()
1198 << " has old map " << epoch << " < "
1199 << osdmap->get_epoch() << dendl;
1200 return true;
1201 }
1202 }
1203
1204 return false;
1205}
1206
1207void OSDService::share_map(
1208 entity_name_t name,
1209 Connection *con,
1210 epoch_t epoch,
1211 OSDMapRef& osdmap,
1212 epoch_t *sent_epoch_p)
1213{
1214 dout(20) << "share_map "
1215 << name << " " << con->get_peer_addr()
1216 << " " << epoch << dendl;
1217
1218 if (!osd->is_active()) {
1219 /*It is safe not to proceed as OSD is not in healthy state*/
1220 return;
1221 }
1222
1223 bool want_shared = should_share_map(name, con, epoch,
1224 osdmap, sent_epoch_p);
1225
1226 if (want_shared){
1227 if (name.is_client()) {
1228 dout(10) << name << " has old map " << epoch
1229 << " < " << osdmap->get_epoch() << dendl;
1230 // we know the Session is valid or we wouldn't be sending
1231 if (sent_epoch_p) {
1232 *sent_epoch_p = osdmap->get_epoch();
1233 }
1234 send_incremental_map(epoch, con, osdmap);
1235 } else if (con->get_messenger() == osd->cluster_messenger &&
1236 osdmap->is_up(name.num()) &&
1237 (osdmap->get_cluster_addr(name.num()) == con->get_peer_addr() ||
1238 osdmap->get_hb_back_addr(name.num()) == con->get_peer_addr())) {
1239 dout(10) << name << " " << con->get_peer_addr()
1240 << " has old map " << epoch << " < "
1241 << osdmap->get_epoch() << dendl;
1242 note_peer_epoch(name.num(), osdmap->get_epoch());
1243 send_incremental_map(epoch, con, osdmap);
1244 }
1245 }
1246}
1247
1248void OSDService::share_map_peer(int peer, Connection *con, OSDMapRef map)
1249{
1250 if (!map)
1251 map = get_osdmap();
1252
1253 // send map?
1254 epoch_t pe = get_peer_epoch(peer);
1255 if (pe) {
1256 if (pe < map->get_epoch()) {
1257 send_incremental_map(pe, con, map);
1258 note_peer_epoch(peer, map->get_epoch());
1259 } else
1260 dout(20) << "share_map_peer " << con << " already has epoch " << pe << dendl;
1261 } else {
1262 dout(20) << "share_map_peer " << con << " don't know epoch, doing nothing" << dendl;
1263 // no idea about peer's epoch.
1264 // ??? send recent ???
1265 // do nothing.
1266 }
1267}
1268
1269bool OSDService::can_inc_scrubs_pending()
1270{
1271 bool can_inc = false;
1272 Mutex::Locker l(sched_scrub_lock);
1273
1274 if (scrubs_pending + scrubs_active < cct->_conf->osd_max_scrubs) {
1275 dout(20) << __func__ << " " << scrubs_pending << " -> " << (scrubs_pending+1)
28e407b8
AA
1276 << " (max " << cct->_conf->osd_max_scrubs << ", active " << scrubs_active
1277 << ")" << dendl;
7c673cae
FG
1278 can_inc = true;
1279 } else {
28e407b8
AA
1280 dout(20) << __func__ << " " << scrubs_pending << " + " << scrubs_active
1281 << " active >= max " << cct->_conf->osd_max_scrubs << dendl;
7c673cae
FG
1282 }
1283
1284 return can_inc;
1285}
1286
1287bool OSDService::inc_scrubs_pending()
1288{
1289 bool result = false;
1290
1291 sched_scrub_lock.Lock();
1292 if (scrubs_pending + scrubs_active < cct->_conf->osd_max_scrubs) {
1293 dout(20) << "inc_scrubs_pending " << scrubs_pending << " -> " << (scrubs_pending+1)
1294 << " (max " << cct->_conf->osd_max_scrubs << ", active " << scrubs_active << ")" << dendl;
1295 result = true;
1296 ++scrubs_pending;
1297 } else {
1298 dout(20) << "inc_scrubs_pending " << scrubs_pending << " + " << scrubs_active << " active >= max " << cct->_conf->osd_max_scrubs << dendl;
1299 }
1300 sched_scrub_lock.Unlock();
1301
1302 return result;
1303}
1304
1305void OSDService::dec_scrubs_pending()
1306{
1307 sched_scrub_lock.Lock();
1308 dout(20) << "dec_scrubs_pending " << scrubs_pending << " -> " << (scrubs_pending-1)
1309 << " (max " << cct->_conf->osd_max_scrubs << ", active " << scrubs_active << ")" << dendl;
1310 --scrubs_pending;
1311 assert(scrubs_pending >= 0);
1312 sched_scrub_lock.Unlock();
1313}
1314
1315void OSDService::inc_scrubs_active(bool reserved)
1316{
1317 sched_scrub_lock.Lock();
1318 ++(scrubs_active);
1319 if (reserved) {
1320 --(scrubs_pending);
1321 dout(20) << "inc_scrubs_active " << (scrubs_active-1) << " -> " << scrubs_active
1322 << " (max " << cct->_conf->osd_max_scrubs
1323 << ", pending " << (scrubs_pending+1) << " -> " << scrubs_pending << ")" << dendl;
1324 assert(scrubs_pending >= 0);
1325 } else {
1326 dout(20) << "inc_scrubs_active " << (scrubs_active-1) << " -> " << scrubs_active
1327 << " (max " << cct->_conf->osd_max_scrubs
1328 << ", pending " << scrubs_pending << ")" << dendl;
1329 }
1330 sched_scrub_lock.Unlock();
1331}
1332
1333void OSDService::dec_scrubs_active()
1334{
1335 sched_scrub_lock.Lock();
1336 dout(20) << "dec_scrubs_active " << scrubs_active << " -> " << (scrubs_active-1)
1337 << " (max " << cct->_conf->osd_max_scrubs << ", pending " << scrubs_pending << ")" << dendl;
1338 --scrubs_active;
1339 assert(scrubs_active >= 0);
1340 sched_scrub_lock.Unlock();
1341}
1342
1343void OSDService::retrieve_epochs(epoch_t *_boot_epoch, epoch_t *_up_epoch,
1344 epoch_t *_bind_epoch) const
1345{
1346 Mutex::Locker l(epoch_lock);
1347 if (_boot_epoch)
1348 *_boot_epoch = boot_epoch;
1349 if (_up_epoch)
1350 *_up_epoch = up_epoch;
1351 if (_bind_epoch)
1352 *_bind_epoch = bind_epoch;
1353}
1354
1355void OSDService::set_epochs(const epoch_t *_boot_epoch, const epoch_t *_up_epoch,
1356 const epoch_t *_bind_epoch)
1357{
1358 Mutex::Locker l(epoch_lock);
1359 if (_boot_epoch) {
1360 assert(*_boot_epoch == 0 || *_boot_epoch >= boot_epoch);
1361 boot_epoch = *_boot_epoch;
1362 }
1363 if (_up_epoch) {
1364 assert(*_up_epoch == 0 || *_up_epoch >= up_epoch);
1365 up_epoch = *_up_epoch;
1366 }
1367 if (_bind_epoch) {
1368 assert(*_bind_epoch == 0 || *_bind_epoch >= bind_epoch);
1369 bind_epoch = *_bind_epoch;
1370 }
1371}
1372
1373bool OSDService::prepare_to_stop()
1374{
1375 Mutex::Locker l(is_stopping_lock);
1376 if (get_state() != NOT_STOPPING)
1377 return false;
1378
1379 OSDMapRef osdmap = get_osdmap();
1380 if (osdmap && osdmap->is_up(whoami)) {
1381 dout(0) << __func__ << " telling mon we are shutting down" << dendl;
1382 set_state(PREPARING_TO_STOP);
1383 monc->send_mon_message(new MOSDMarkMeDown(monc->get_fsid(),
1384 osdmap->get_inst(whoami),
1385 osdmap->get_epoch(),
1386 true // request ack
1387 ));
1388 utime_t now = ceph_clock_now();
1389 utime_t timeout;
1390 timeout.set_from_double(now + cct->_conf->osd_mon_shutdown_timeout);
1391 while ((ceph_clock_now() < timeout) &&
1392 (get_state() != STOPPING)) {
1393 is_stopping_cond.WaitUntil(is_stopping_lock, timeout);
1394 }
1395 }
1396 dout(0) << __func__ << " starting shutdown" << dendl;
1397 set_state(STOPPING);
1398 return true;
1399}
1400
1401void OSDService::got_stop_ack()
1402{
1403 Mutex::Locker l(is_stopping_lock);
1404 if (get_state() == PREPARING_TO_STOP) {
1405 dout(0) << __func__ << " starting shutdown" << dendl;
1406 set_state(STOPPING);
1407 is_stopping_cond.Signal();
1408 } else {
1409 dout(10) << __func__ << " ignoring msg" << dendl;
1410 }
1411}
1412
1413MOSDMap *OSDService::build_incremental_map_msg(epoch_t since, epoch_t to,
1414 OSDSuperblock& sblock)
1415{
28e407b8
AA
1416 MOSDMap *m = new MOSDMap(monc->get_fsid(),
1417 osdmap->get_encoding_features());
7c673cae
FG
1418 m->oldest_map = max_oldest_map;
1419 m->newest_map = sblock.newest_map;
1420
1421 for (epoch_t e = to; e > since; e--) {
1422 bufferlist bl;
1423 if (e > m->oldest_map && get_inc_map_bl(e, bl)) {
1424 m->incremental_maps[e].claim(bl);
1425 } else if (get_map_bl(e, bl)) {
1426 m->maps[e].claim(bl);
1427 break;
1428 } else {
1429 derr << "since " << since << " to " << to
1430 << " oldest " << m->oldest_map << " newest " << m->newest_map
1431 << dendl;
1432 m->put();
1433 m = NULL;
1434 break;
1435 }
1436 }
1437 return m;
1438}
1439
1440void OSDService::send_map(MOSDMap *m, Connection *con)
1441{
1442 con->send_message(m);
1443}
1444
1445void OSDService::send_incremental_map(epoch_t since, Connection *con,
1446 OSDMapRef& osdmap)
1447{
1448 epoch_t to = osdmap->get_epoch();
1449 dout(10) << "send_incremental_map " << since << " -> " << to
1450 << " to " << con << " " << con->get_peer_addr() << dendl;
1451
1452 MOSDMap *m = NULL;
1453 while (!m) {
1454 OSDSuperblock sblock(get_superblock());
1455 if (since < sblock.oldest_map) {
1456 // just send latest full map
28e407b8
AA
1457 MOSDMap *m = new MOSDMap(monc->get_fsid(),
1458 osdmap->get_encoding_features());
7c673cae
FG
1459 m->oldest_map = max_oldest_map;
1460 m->newest_map = sblock.newest_map;
1461 get_map_bl(to, m->maps[to]);
1462 send_map(m, con);
1463 return;
1464 }
1465
1466 if (to > since && (int64_t)(to - since) > cct->_conf->osd_map_share_max_epochs) {
1467 dout(10) << " " << (to - since) << " > max " << cct->_conf->osd_map_share_max_epochs
1468 << ", only sending most recent" << dendl;
1469 since = to - cct->_conf->osd_map_share_max_epochs;
1470 }
1471
1472 if (to - since > (epoch_t)cct->_conf->osd_map_message_max)
1473 to = since + cct->_conf->osd_map_message_max;
1474 m = build_incremental_map_msg(since, to, sblock);
1475 }
1476 send_map(m, con);
1477}
1478
1479bool OSDService::_get_map_bl(epoch_t e, bufferlist& bl)
1480{
1481 bool found = map_bl_cache.lookup(e, &bl);
31f18b77
FG
1482 if (found) {
1483 if (logger)
1484 logger->inc(l_osd_map_bl_cache_hit);
7c673cae 1485 return true;
31f18b77
FG
1486 }
1487 if (logger)
1488 logger->inc(l_osd_map_bl_cache_miss);
7c673cae 1489 found = store->read(coll_t::meta(),
31f18b77
FG
1490 OSD::get_osdmap_pobject_name(e), 0, 0, bl,
1491 CEPH_OSD_OP_FLAG_FADVISE_WILLNEED) >= 0;
1492 if (found) {
7c673cae 1493 _add_map_bl(e, bl);
31f18b77 1494 }
7c673cae
FG
1495 return found;
1496}
1497
1498bool OSDService::get_inc_map_bl(epoch_t e, bufferlist& bl)
1499{
1500 Mutex::Locker l(map_cache_lock);
1501 bool found = map_bl_inc_cache.lookup(e, &bl);
31f18b77
FG
1502 if (found) {
1503 if (logger)
1504 logger->inc(l_osd_map_bl_cache_hit);
7c673cae 1505 return true;
31f18b77
FG
1506 }
1507 if (logger)
1508 logger->inc(l_osd_map_bl_cache_miss);
7c673cae 1509 found = store->read(coll_t::meta(),
31f18b77
FG
1510 OSD::get_inc_osdmap_pobject_name(e), 0, 0, bl,
1511 CEPH_OSD_OP_FLAG_FADVISE_WILLNEED) >= 0;
1512 if (found) {
7c673cae 1513 _add_map_inc_bl(e, bl);
31f18b77 1514 }
7c673cae
FG
1515 return found;
1516}
1517
1518void OSDService::_add_map_bl(epoch_t e, bufferlist& bl)
1519{
1520 dout(10) << "add_map_bl " << e << " " << bl.length() << " bytes" << dendl;
31f18b77
FG
1521 // cache a contiguous buffer
1522 if (bl.get_num_buffers() > 1) {
1523 bl.rebuild();
1524 }
1525 bl.try_assign_to_mempool(mempool::mempool_osd_mapbl);
7c673cae
FG
1526 map_bl_cache.add(e, bl);
1527}
1528
1529void OSDService::_add_map_inc_bl(epoch_t e, bufferlist& bl)
1530{
1531 dout(10) << "add_map_inc_bl " << e << " " << bl.length() << " bytes" << dendl;
31f18b77
FG
1532 // cache a contiguous buffer
1533 if (bl.get_num_buffers() > 1) {
1534 bl.rebuild();
1535 }
1536 bl.try_assign_to_mempool(mempool::mempool_osd_mapbl);
7c673cae
FG
1537 map_bl_inc_cache.add(e, bl);
1538}
1539
1540void OSDService::pin_map_inc_bl(epoch_t e, bufferlist &bl)
1541{
1542 Mutex::Locker l(map_cache_lock);
31f18b77
FG
1543 // cache a contiguous buffer
1544 if (bl.get_num_buffers() > 1) {
1545 bl.rebuild();
1546 }
7c673cae
FG
1547 map_bl_inc_cache.pin(e, bl);
1548}
1549
1550void OSDService::pin_map_bl(epoch_t e, bufferlist &bl)
1551{
1552 Mutex::Locker l(map_cache_lock);
31f18b77
FG
1553 // cache a contiguous buffer
1554 if (bl.get_num_buffers() > 1) {
1555 bl.rebuild();
1556 }
7c673cae
FG
1557 map_bl_cache.pin(e, bl);
1558}
1559
1560void OSDService::clear_map_bl_cache_pins(epoch_t e)
1561{
1562 Mutex::Locker l(map_cache_lock);
1563 map_bl_inc_cache.clear_pinned(e);
1564 map_bl_cache.clear_pinned(e);
1565}
1566
1567OSDMapRef OSDService::_add_map(OSDMap *o)
1568{
1569 epoch_t e = o->get_epoch();
1570
1571 if (cct->_conf->osd_map_dedup) {
1572 // Dedup against an existing map at a nearby epoch
1573 OSDMapRef for_dedup = map_cache.lower_bound(e);
1574 if (for_dedup) {
1575 OSDMap::dedup(for_dedup.get(), o);
1576 }
1577 }
1578 bool existed;
1579 OSDMapRef l = map_cache.add(e, o, &existed);
1580 if (existed) {
1581 delete o;
1582 }
1583 return l;
1584}
1585
1586OSDMapRef OSDService::try_get_map(epoch_t epoch)
1587{
1588 Mutex::Locker l(map_cache_lock);
1589 OSDMapRef retval = map_cache.lookup(epoch);
1590 if (retval) {
1591 dout(30) << "get_map " << epoch << " -cached" << dendl;
1592 if (logger) {
1593 logger->inc(l_osd_map_cache_hit);
1594 }
1595 return retval;
1596 }
1597 if (logger) {
1598 logger->inc(l_osd_map_cache_miss);
1599 epoch_t lb = map_cache.cached_key_lower_bound();
1600 if (epoch < lb) {
1601 dout(30) << "get_map " << epoch << " - miss, below lower bound" << dendl;
1602 logger->inc(l_osd_map_cache_miss_low);
1603 logger->inc(l_osd_map_cache_miss_low_avg, lb - epoch);
1604 }
1605 }
1606
1607 OSDMap *map = new OSDMap;
1608 if (epoch > 0) {
1609 dout(20) << "get_map " << epoch << " - loading and decoding " << map << dendl;
1610 bufferlist bl;
1611 if (!_get_map_bl(epoch, bl) || bl.length() == 0) {
1612 derr << "failed to load OSD map for epoch " << epoch << ", got " << bl.length() << " bytes" << dendl;
1613 delete map;
1614 return OSDMapRef();
1615 }
1616 map->decode(bl);
1617 } else {
1618 dout(20) << "get_map " << epoch << " - return initial " << map << dendl;
1619 }
1620 return _add_map(map);
1621}
1622
1623// ops
1624
1625
1626void OSDService::reply_op_error(OpRequestRef op, int err)
1627{
1628 reply_op_error(op, err, eversion_t(), 0);
1629}
1630
1631void OSDService::reply_op_error(OpRequestRef op, int err, eversion_t v,
1632 version_t uv)
1633{
1634 const MOSDOp *m = static_cast<const MOSDOp*>(op->get_req());
1635 assert(m->get_type() == CEPH_MSG_OSD_OP);
1636 int flags;
1637 flags = m->get_flags() & (CEPH_OSD_FLAG_ACK|CEPH_OSD_FLAG_ONDISK);
1638
1639 MOSDOpReply *reply = new MOSDOpReply(m, err, osdmap->get_epoch(), flags,
1640 true);
1641 reply->set_reply_versions(v, uv);
1642 m->get_connection()->send_message(reply);
1643}
1644
1645void OSDService::handle_misdirected_op(PG *pg, OpRequestRef op)
1646{
31f18b77
FG
1647 if (!cct->_conf->osd_debug_misdirected_ops) {
1648 return;
1649 }
1650
7c673cae
FG
1651 const MOSDOp *m = static_cast<const MOSDOp*>(op->get_req());
1652 assert(m->get_type() == CEPH_MSG_OSD_OP);
1653
1654 assert(m->get_map_epoch() >= pg->info.history.same_primary_since);
1655
1656 if (pg->is_ec_pg()) {
1657 /**
1658 * OSD recomputes op target based on current OSDMap. With an EC pg, we
1659 * can get this result:
1660 * 1) client at map 512 sends an op to osd 3, pg_t 3.9 based on mapping
1661 * [CRUSH_ITEM_NONE, 2, 3]/3
1662 * 2) OSD 3 at map 513 remaps op to osd 3, spg_t 3.9s0 based on mapping
1663 * [3, 2, 3]/3
1664 * 3) PG 3.9s0 dequeues the op at epoch 512 and notices that it isn't primary
1665 * -- misdirected op
1666 * 4) client resends and this time PG 3.9s0 having caught up to 513 gets
1667 * it and fulfils it
1668 *
1669 * We can't compute the op target based on the sending map epoch due to
1670 * splitting. The simplest thing is to detect such cases here and drop
1671 * them without an error (the client will resend anyway).
1672 */
1673 assert(m->get_map_epoch() <= superblock.newest_map);
1674 OSDMapRef opmap = try_get_map(m->get_map_epoch());
1675 if (!opmap) {
1676 dout(7) << __func__ << ": " << *pg << " no longer have map for "
1677 << m->get_map_epoch() << ", dropping" << dendl;
1678 return;
1679 }
1680 pg_t _pgid = m->get_raw_pg();
1681 spg_t pgid;
1682 if ((m->get_flags() & CEPH_OSD_FLAG_PGOP) == 0)
1683 _pgid = opmap->raw_pg_to_pg(_pgid);
1684 if (opmap->get_primary_shard(_pgid, &pgid) &&
1685 pgid.shard != pg->info.pgid.shard) {
1686 dout(7) << __func__ << ": " << *pg << " primary changed since "
1687 << m->get_map_epoch() << ", dropping" << dendl;
1688 return;
1689 }
1690 }
1691
1692 dout(7) << *pg << " misdirected op in " << m->get_map_epoch() << dendl;
1693 clog->warn() << m->get_source_inst() << " misdirected " << m->get_reqid()
1694 << " pg " << m->get_raw_pg()
1695 << " to osd." << whoami
1696 << " not " << pg->acting
1697 << " in e" << m->get_map_epoch() << "/" << osdmap->get_epoch();
7c673cae
FG
1698}
1699
1700void OSDService::enqueue_back(spg_t pgid, PGQueueable qi)
1701{
1702 osd->op_shardedwq.queue(make_pair(pgid, qi));
1703}
1704
1705void OSDService::enqueue_front(spg_t pgid, PGQueueable qi)
1706{
1707 osd->op_shardedwq.queue_front(make_pair(pgid, qi));
1708}
1709
1710void OSDService::queue_for_peering(PG *pg)
1711{
1712 peering_wq.queue(pg);
1713}
1714
1715void OSDService::queue_for_snap_trim(PG *pg)
1716{
1717 dout(10) << "queueing " << *pg << " for snaptrim" << dendl;
1718 osd->op_shardedwq.queue(
1719 make_pair(
1720 pg->info.pgid,
1721 PGQueueable(
1722 PGSnapTrim(pg->get_osdmap()->get_epoch()),
1723 cct->_conf->osd_snap_trim_cost,
1724 cct->_conf->osd_snap_trim_priority,
1725 ceph_clock_now(),
1726 entity_inst_t(),
1727 pg->get_osdmap()->get_epoch())));
1728}
1729
1730
1731// ====================================================================
1732// OSD
1733
1734#undef dout_prefix
1735#define dout_prefix *_dout
1736
1737// Commands shared between OSD's console and admin console:
1738namespace ceph {
1739namespace osd_cmds {
1740
1741int heap(CephContext& cct, cmdmap_t& cmdmap, Formatter& f, std::ostream& os);
1742
1743}} // namespace ceph::osd_cmds
1744
1745int OSD::mkfs(CephContext *cct, ObjectStore *store, const string &dev,
1746 uuid_d fsid, int whoami)
1747{
1748 int ret;
1749
1750 ceph::shared_ptr<ObjectStore::Sequencer> osr(
1751 new ObjectStore::Sequencer("mkfs"));
1752 OSDSuperblock sb;
1753 bufferlist sbbl;
1754 C_SaferCond waiter;
1755
1756 // if we are fed a uuid for this osd, use it.
1757 store->set_fsid(cct->_conf->osd_uuid);
1758
1759 ret = store->mkfs();
1760 if (ret) {
224ce89b
WB
1761 derr << "OSD::mkfs: ObjectStore::mkfs failed with error "
1762 << cpp_strerror(ret) << dendl;
7c673cae
FG
1763 goto free_store;
1764 }
1765
31f18b77 1766 store->set_cache_shards(1); // doesn't matter for mkfs!
7c673cae
FG
1767
1768 ret = store->mount();
1769 if (ret) {
224ce89b
WB
1770 derr << "OSD::mkfs: couldn't mount ObjectStore: error "
1771 << cpp_strerror(ret) << dendl;
7c673cae
FG
1772 goto free_store;
1773 }
1774
1775 ret = store->read(coll_t::meta(), OSD_SUPERBLOCK_GOBJECT, 0, 0, sbbl);
1776 if (ret >= 0) {
1777 /* if we already have superblock, check content of superblock */
1778 dout(0) << " have superblock" << dendl;
1779 bufferlist::iterator p;
1780 p = sbbl.begin();
1781 ::decode(sb, p);
1782 if (whoami != sb.whoami) {
1783 derr << "provided osd id " << whoami << " != superblock's " << sb.whoami
1784 << dendl;
1785 ret = -EINVAL;
1786 goto umount_store;
1787 }
1788 if (fsid != sb.cluster_fsid) {
1789 derr << "provided cluster fsid " << fsid
1790 << " != superblock's " << sb.cluster_fsid << dendl;
1791 ret = -EINVAL;
1792 goto umount_store;
1793 }
1794 } else {
1795 // create superblock
1796 sb.cluster_fsid = fsid;
1797 sb.osd_fsid = store->get_fsid();
1798 sb.whoami = whoami;
1799 sb.compat_features = get_osd_initial_compat_set();
1800
1801 bufferlist bl;
1802 ::encode(sb, bl);
1803
1804 ObjectStore::Transaction t;
1805 t.create_collection(coll_t::meta(), 0);
1806 t.write(coll_t::meta(), OSD_SUPERBLOCK_GOBJECT, 0, bl.length(), bl);
1807 ret = store->apply_transaction(osr.get(), std::move(t));
1808 if (ret) {
1809 derr << "OSD::mkfs: error while writing OSD_SUPERBLOCK_GOBJECT: "
224ce89b 1810 << "apply_transaction returned " << cpp_strerror(ret) << dendl;
7c673cae
FG
1811 goto umount_store;
1812 }
1813 }
1814
1815 if (!osr->flush_commit(&waiter)) {
1816 waiter.wait();
1817 }
1818
3efd9988 1819 ret = write_meta(cct, store, sb.cluster_fsid, sb.osd_fsid, whoami);
7c673cae 1820 if (ret) {
224ce89b
WB
1821 derr << "OSD::mkfs: failed to write fsid file: error "
1822 << cpp_strerror(ret) << dendl;
7c673cae
FG
1823 goto umount_store;
1824 }
1825
1826umount_store:
1827 store->umount();
1828free_store:
1829 delete store;
1830 return ret;
1831}
1832
3efd9988 1833int OSD::write_meta(CephContext *cct, ObjectStore *store, uuid_d& cluster_fsid, uuid_d& osd_fsid, int whoami)
7c673cae
FG
1834{
1835 char val[80];
1836 int r;
1837
1838 snprintf(val, sizeof(val), "%s", CEPH_OSD_ONDISK_MAGIC);
1839 r = store->write_meta("magic", val);
1840 if (r < 0)
1841 return r;
1842
1843 snprintf(val, sizeof(val), "%d", whoami);
1844 r = store->write_meta("whoami", val);
1845 if (r < 0)
1846 return r;
1847
1848 cluster_fsid.print(val);
1849 r = store->write_meta("ceph_fsid", val);
1850 if (r < 0)
1851 return r;
1852
3efd9988 1853 string key = cct->_conf->get_val<string>("key");
3efd9988
FG
1854 if (key.size()) {
1855 r = store->write_meta("osd_key", key);
1856 if (r < 0)
1857 return r;
b32b8144
FG
1858 } else {
1859 string keyfile = cct->_conf->get_val<string>("keyfile");
1860 if (!keyfile.empty()) {
1861 bufferlist keybl;
1862 string err;
1863 if (keyfile == "-") {
1864 static_assert(1024 * 1024 >
1865 (sizeof(CryptoKey) - sizeof(bufferptr) +
1866 sizeof(__u16) + 16 /* AES_KEY_LEN */ + 3 - 1) / 3. * 4.,
1867 "1MB should be enough for a base64 encoded CryptoKey");
1868 r = keybl.read_fd(STDIN_FILENO, 1024 * 1024);
1869 } else {
1870 r = keybl.read_file(keyfile.c_str(), &err);
1871 }
1872 if (r < 0) {
1873 derr << __func__ << " failed to read keyfile " << keyfile << ": "
1874 << err << ": " << cpp_strerror(r) << dendl;
1875 return r;
1876 }
1877 r = store->write_meta("osd_key", keybl.to_str());
1878 if (r < 0)
1879 return r;
1880 }
3efd9988
FG
1881 }
1882
7c673cae
FG
1883 r = store->write_meta("ready", "ready");
1884 if (r < 0)
1885 return r;
1886
1887 return 0;
1888}
1889
1890int OSD::peek_meta(ObjectStore *store, std::string& magic,
1891 uuid_d& cluster_fsid, uuid_d& osd_fsid, int& whoami)
1892{
1893 string val;
1894
1895 int r = store->read_meta("magic", &val);
1896 if (r < 0)
1897 return r;
1898 magic = val;
1899
1900 r = store->read_meta("whoami", &val);
1901 if (r < 0)
1902 return r;
1903 whoami = atoi(val.c_str());
1904
1905 r = store->read_meta("ceph_fsid", &val);
1906 if (r < 0)
1907 return r;
1908 r = cluster_fsid.parse(val.c_str());
1909 if (!r)
1910 return -EINVAL;
1911
1912 r = store->read_meta("fsid", &val);
1913 if (r < 0) {
1914 osd_fsid = uuid_d();
1915 } else {
1916 r = osd_fsid.parse(val.c_str());
1917 if (!r)
1918 return -EINVAL;
1919 }
1920
1921 return 0;
1922}
1923
1924
1925#undef dout_prefix
1926#define dout_prefix _prefix(_dout, whoami, get_osdmap_epoch())
1927
1928// cons/des
1929
1930OSD::OSD(CephContext *cct_, ObjectStore *store_,
1931 int id,
1932 Messenger *internal_messenger,
1933 Messenger *external_messenger,
1934 Messenger *hb_client_front,
1935 Messenger *hb_client_back,
1936 Messenger *hb_front_serverm,
1937 Messenger *hb_back_serverm,
1938 Messenger *osdc_messenger,
1939 MonClient *mc,
1940 const std::string &dev, const std::string &jdev) :
1941 Dispatcher(cct_),
1942 osd_lock("OSD::osd_lock"),
1943 tick_timer(cct, osd_lock),
1944 tick_timer_lock("OSD::tick_timer_lock"),
1945 tick_timer_without_osd_lock(cct, tick_timer_lock),
1946 authorize_handler_cluster_registry(new AuthAuthorizeHandlerRegistry(cct,
1947 cct->_conf->auth_supported.empty() ?
1948 cct->_conf->auth_cluster_required :
1949 cct->_conf->auth_supported)),
1950 authorize_handler_service_registry(new AuthAuthorizeHandlerRegistry(cct,
1951 cct->_conf->auth_supported.empty() ?
1952 cct->_conf->auth_service_required :
1953 cct->_conf->auth_supported)),
1954 cluster_messenger(internal_messenger),
1955 client_messenger(external_messenger),
1956 objecter_messenger(osdc_messenger),
1957 monc(mc),
1958 mgrc(cct_, client_messenger),
1959 logger(NULL),
1960 recoverystate_perf(NULL),
1961 store(store_),
1962 log_client(cct, client_messenger, &mc->monmap, LogClient::NO_FLAGS),
1963 clog(log_client.create_channel()),
1964 whoami(id),
1965 dev_path(dev), journal_path(jdev),
31f18b77 1966 store_is_rotational(store->is_rotational()),
7c673cae
FG
1967 trace_endpoint("0.0.0.0", 0, "osd"),
1968 asok_hook(NULL),
1969 osd_compat(get_osd_compat_set()),
31f18b77
FG
1970 peering_tp(cct, "OSD::peering_tp", "tp_peering",
1971 cct->_conf->osd_peering_wq_threads,
1972 "osd_peering_tp_threads"),
7c673cae 1973 osd_op_tp(cct, "OSD::osd_op_tp", "tp_osd_tp",
31f18b77 1974 get_num_op_threads()),
7c673cae
FG
1975 disk_tp(cct, "OSD::disk_tp", "tp_osd_disk", cct->_conf->osd_disk_threads, "osd_disk_threads"),
1976 command_tp(cct, "OSD::command_tp", "tp_osd_cmd", 1),
1977 session_waiting_lock("OSD::session_waiting_lock"),
181888fb 1978 osdmap_subscribe_lock("OSD::osdmap_subscribe_lock"),
7c673cae
FG
1979 heartbeat_lock("OSD::heartbeat_lock"),
1980 heartbeat_stop(false),
1981 heartbeat_need_update(true),
1982 hb_front_client_messenger(hb_client_front),
1983 hb_back_client_messenger(hb_client_back),
1984 hb_front_server_messenger(hb_front_serverm),
1985 hb_back_server_messenger(hb_back_serverm),
1986 daily_loadavg(0.0),
1987 heartbeat_thread(this),
1988 heartbeat_dispatcher(this),
1989 op_tracker(cct, cct->_conf->osd_enable_op_tracker,
1990 cct->_conf->osd_num_op_tracker_shard),
1991 test_ops_hook(NULL),
1992 op_queue(get_io_queue()),
1993 op_prio_cutoff(get_io_prio_cut()),
1994 op_shardedwq(
31f18b77 1995 get_num_op_shards(),
7c673cae
FG
1996 this,
1997 cct->_conf->osd_op_thread_timeout,
1998 cct->_conf->osd_op_thread_suicide_timeout,
1999 &osd_op_tp),
2000 peering_wq(
2001 this,
2002 cct->_conf->osd_op_thread_timeout,
2003 cct->_conf->osd_op_thread_suicide_timeout,
31f18b77 2004 &peering_tp),
7c673cae
FG
2005 map_lock("OSD::map_lock"),
2006 pg_map_lock("OSD::pg_map_lock"),
2007 last_pg_create_epoch(0),
2008 mon_report_lock("OSD::mon_report_lock"),
2009 stats_ack_timeout(cct->_conf->osd_mon_ack_timeout),
2010 up_thru_wanted(0),
2011 requested_full_first(0),
2012 requested_full_last(0),
2013 pg_stat_queue_lock("OSD::pg_stat_queue_lock"),
2014 osd_stat_updated(false),
2015 pg_stat_tid(0), pg_stat_tid_flushed(0),
2016 command_wq(
2017 this,
2018 cct->_conf->osd_command_thread_timeout,
2019 cct->_conf->osd_command_thread_suicide_timeout,
2020 &command_tp),
2021 remove_wq(
2022 cct,
2023 store,
2024 cct->_conf->osd_remove_thread_timeout,
2025 cct->_conf->osd_remove_thread_suicide_timeout,
2026 &disk_tp),
2027 service(this)
2028{
2029 monc->set_messenger(client_messenger);
2030 op_tracker.set_complaint_and_threshold(cct->_conf->osd_op_complaint_time,
2031 cct->_conf->osd_op_log_threshold);
2032 op_tracker.set_history_size_and_duration(cct->_conf->osd_op_history_size,
2033 cct->_conf->osd_op_history_duration);
2034 op_tracker.set_history_slow_op_size_and_threshold(cct->_conf->osd_op_history_slow_op_size,
2035 cct->_conf->osd_op_history_slow_op_threshold);
2036#ifdef WITH_BLKIN
2037 std::stringstream ss;
2038 ss << "osd." << whoami;
2039 trace_endpoint.copy_name(ss.str());
2040#endif
2041}
2042
2043OSD::~OSD()
2044{
2045 delete authorize_handler_cluster_registry;
2046 delete authorize_handler_service_registry;
2047 delete class_handler;
2048 cct->get_perfcounters_collection()->remove(recoverystate_perf);
2049 cct->get_perfcounters_collection()->remove(logger);
2050 delete recoverystate_perf;
2051 delete logger;
2052 delete store;
2053}
2054
2055void cls_initialize(ClassHandler *ch);
2056
2057void OSD::handle_signal(int signum)
2058{
2059 assert(signum == SIGINT || signum == SIGTERM);
2060 derr << "*** Got signal " << sig_str(signum) << " ***" << dendl;
2061 shutdown();
2062}
2063
2064int OSD::pre_init()
2065{
2066 Mutex::Locker lock(osd_lock);
2067 if (is_stopping())
2068 return 0;
2069
2070 if (store->test_mount_in_use()) {
2071 derr << "OSD::pre_init: object store '" << dev_path << "' is "
2072 << "currently in use. (Is ceph-osd already running?)" << dendl;
2073 return -EBUSY;
2074 }
2075
2076 cct->_conf->add_observer(this);
2077 return 0;
2078}
2079
2080// asok
2081
2082class OSDSocketHook : public AdminSocketHook {
2083 OSD *osd;
2084public:
2085 explicit OSDSocketHook(OSD *o) : osd(o) {}
2086 bool call(std::string admin_command, cmdmap_t& cmdmap, std::string format,
2087 bufferlist& out) override {
2088 stringstream ss;
2089 bool r = osd->asok_command(admin_command, cmdmap, format, ss);
2090 out.append(ss);
2091 return r;
2092 }
2093};
2094
2095bool OSD::asok_command(string admin_command, cmdmap_t& cmdmap, string format,
2096 ostream& ss)
2097{
2098 Formatter *f = Formatter::create(format, "json-pretty", "json-pretty");
2099 if (admin_command == "status") {
2100 f->open_object_section("status");
2101 f->dump_stream("cluster_fsid") << superblock.cluster_fsid;
2102 f->dump_stream("osd_fsid") << superblock.osd_fsid;
2103 f->dump_unsigned("whoami", superblock.whoami);
2104 f->dump_string("state", get_state_name(get_state()));
2105 f->dump_unsigned("oldest_map", superblock.oldest_map);
2106 f->dump_unsigned("newest_map", superblock.newest_map);
2107 {
2108 RWLock::RLocker l(pg_map_lock);
2109 f->dump_unsigned("num_pgs", pg_map.size());
2110 }
2111 f->close_section();
2112 } else if (admin_command == "flush_journal") {
2113 store->flush_journal();
2114 } else if (admin_command == "dump_ops_in_flight" ||
c07f9fc5
FG
2115 admin_command == "ops" ||
2116 admin_command == "dump_blocked_ops" ||
2117 admin_command == "dump_historic_ops" ||
2118 admin_command == "dump_historic_ops_by_duration" ||
2119 admin_command == "dump_historic_slow_ops") {
2120
2121 const string error_str = "op_tracker tracking is not enabled now, so no ops are tracked currently, \
2122even those get stuck. Please enable \"osd_enable_op_tracker\", and the tracker \
2123will start to track new ops received afterwards.";
2124
2125 set<string> filters;
2126 vector<string> filter_str;
2127 if (cmd_getval(cct, cmdmap, "filterstr", filter_str)) {
2128 copy(filter_str.begin(), filter_str.end(),
2129 inserter(filters, filters.end()));
2130 }
2131
2132 if (admin_command == "dump_ops_in_flight" ||
2133 admin_command == "ops") {
2134 if (!op_tracker.dump_ops_in_flight(f, false, filters)) {
2135 ss << error_str;
2136 }
2137 }
2138 if (admin_command == "dump_blocked_ops") {
2139 if (!op_tracker.dump_ops_in_flight(f, true, filters)) {
2140 ss << error_str;
2141 }
2142 }
2143 if (admin_command == "dump_historic_ops") {
2144 if (!op_tracker.dump_historic_ops(f, false, filters)) {
2145 ss << error_str;
2146 }
2147 }
2148 if (admin_command == "dump_historic_ops_by_duration") {
2149 if (!op_tracker.dump_historic_ops(f, true, filters)) {
2150 ss << error_str;
2151 }
2152 }
2153 if (admin_command == "dump_historic_slow_ops") {
2154 if (!op_tracker.dump_historic_slow_ops(f, filters)) {
2155 ss << error_str;
2156 }
7c673cae
FG
2157 }
2158 } else if (admin_command == "dump_op_pq_state") {
2159 f->open_object_section("pq");
2160 op_shardedwq.dump(f);
2161 f->close_section();
2162 } else if (admin_command == "dump_blacklist") {
2163 list<pair<entity_addr_t,utime_t> > bl;
2164 OSDMapRef curmap = service.get_osdmap();
2165
2166 f->open_array_section("blacklist");
2167 curmap->get_blacklist(&bl);
2168 for (list<pair<entity_addr_t,utime_t> >::iterator it = bl.begin();
2169 it != bl.end(); ++it) {
224ce89b 2170 f->open_object_section("entry");
7c673cae
FG
2171 f->open_object_section("entity_addr_t");
2172 it->first.dump(f);
2173 f->close_section(); //entity_addr_t
2174 it->second.localtime(f->dump_stream("expire_time"));
2175 f->close_section(); //entry
2176 }
2177 f->close_section(); //blacklist
2178 } else if (admin_command == "dump_watchers") {
2179 list<obj_watch_item_t> watchers;
2180 // scan pg's
2181 {
2182 Mutex::Locker l(osd_lock);
2183 RWLock::RLocker l2(pg_map_lock);
2184 for (ceph::unordered_map<spg_t,PG*>::iterator it = pg_map.begin();
2185 it != pg_map.end();
2186 ++it) {
2187
2188 list<obj_watch_item_t> pg_watchers;
2189 PG *pg = it->second;
2190 pg->lock();
2191 pg->get_watchers(pg_watchers);
2192 pg->unlock();
2193 watchers.splice(watchers.end(), pg_watchers);
2194 }
2195 }
2196
2197 f->open_array_section("watchers");
2198 for (list<obj_watch_item_t>::iterator it = watchers.begin();
2199 it != watchers.end(); ++it) {
2200
224ce89b 2201 f->open_object_section("watch");
7c673cae
FG
2202
2203 f->dump_string("namespace", it->obj.nspace);
2204 f->dump_string("object", it->obj.oid.name);
2205
2206 f->open_object_section("entity_name");
2207 it->wi.name.dump(f);
2208 f->close_section(); //entity_name_t
2209
224ce89b
WB
2210 f->dump_unsigned("cookie", it->wi.cookie);
2211 f->dump_unsigned("timeout", it->wi.timeout_seconds);
7c673cae
FG
2212
2213 f->open_object_section("entity_addr_t");
2214 it->wi.addr.dump(f);
2215 f->close_section(); //entity_addr_t
2216
2217 f->close_section(); //watch
2218 }
2219
2220 f->close_section(); //watchers
2221 } else if (admin_command == "dump_reservations") {
2222 f->open_object_section("reservations");
2223 f->open_object_section("local_reservations");
2224 service.local_reserver.dump(f);
2225 f->close_section();
2226 f->open_object_section("remote_reservations");
2227 service.remote_reserver.dump(f);
2228 f->close_section();
2229 f->close_section();
2230 } else if (admin_command == "get_latest_osdmap") {
2231 get_latest_osdmap();
2232 } else if (admin_command == "heap") {
2233 auto result = ceph::osd_cmds::heap(*cct, cmdmap, *f, ss);
2234
2235 // Note: Failed heap profile commands won't necessarily trigger an error:
2236 f->open_object_section("result");
2237 f->dump_string("error", cpp_strerror(result));
2238 f->dump_bool("success", result >= 0);
2239 f->close_section();
2240 } else if (admin_command == "set_heap_property") {
2241 string property;
2242 int64_t value = 0;
2243 string error;
2244 bool success = false;
2245 if (!cmd_getval(cct, cmdmap, "property", property)) {
2246 error = "unable to get property";
2247 success = false;
2248 } else if (!cmd_getval(cct, cmdmap, "value", value)) {
2249 error = "unable to get value";
2250 success = false;
2251 } else if (value < 0) {
2252 error = "negative value not allowed";
2253 success = false;
2254 } else if (!ceph_heap_set_numeric_property(property.c_str(), (size_t)value)) {
2255 error = "invalid property";
2256 success = false;
2257 } else {
2258 success = true;
2259 }
2260 f->open_object_section("result");
2261 f->dump_string("error", error);
2262 f->dump_bool("success", success);
2263 f->close_section();
2264 } else if (admin_command == "get_heap_property") {
2265 string property;
2266 size_t value = 0;
2267 string error;
2268 bool success = false;
2269 if (!cmd_getval(cct, cmdmap, "property", property)) {
2270 error = "unable to get property";
2271 success = false;
2272 } else if (!ceph_heap_get_numeric_property(property.c_str(), &value)) {
2273 error = "invalid property";
2274 success = false;
2275 } else {
2276 success = true;
2277 }
2278 f->open_object_section("result");
2279 f->dump_string("error", error);
2280 f->dump_bool("success", success);
2281 f->dump_int("value", value);
2282 f->close_section();
2283 } else if (admin_command == "dump_objectstore_kv_stats") {
2284 store->get_db_statistics(f);
2285 } else if (admin_command == "dump_scrubs") {
2286 service.dumps_scrub(f);
2287 } else if (admin_command == "calc_objectstore_db_histogram") {
2288 store->generate_db_histogram(f);
2289 } else if (admin_command == "flush_store_cache") {
2290 store->flush_cache();
2291 } else if (admin_command == "dump_pgstate_history") {
2292 f->open_object_section("pgstate_history");
2293 RWLock::RLocker l2(pg_map_lock);
2294 for (ceph::unordered_map<spg_t,PG*>::iterator it = pg_map.begin();
2295 it != pg_map.end();
2296 ++it) {
2297
2298 PG *pg = it->second;
2299 f->dump_stream("pg") << pg->get_pgid();
2300 pg->lock();
2301 pg->pgstate_history.dump(f);
2302 pg->unlock();
2303 }
2304 f->close_section();
224ce89b
WB
2305 } else if (admin_command == "compact") {
2306 dout(1) << "triggering manual compaction" << dendl;
2307 auto start = ceph::coarse_mono_clock::now();
2308 store->compact();
2309 auto end = ceph::coarse_mono_clock::now();
2310 auto time_span = chrono::duration_cast<chrono::duration<double>>(end - start);
2311 dout(1) << "finished manual compaction in "
2312 << time_span.count()
2313 << " seconds" << dendl;
2314 f->open_object_section("compact_result");
2315 f->dump_float("elapsed_time", time_span.count());
2316 f->close_section();
7c673cae
FG
2317 } else {
2318 assert(0 == "broken asok registration");
2319 }
2320 f->flush(ss);
2321 delete f;
2322 return true;
2323}
2324
2325class TestOpsSocketHook : public AdminSocketHook {
2326 OSDService *service;
2327 ObjectStore *store;
2328public:
2329 TestOpsSocketHook(OSDService *s, ObjectStore *st) : service(s), store(st) {}
2330 bool call(std::string command, cmdmap_t& cmdmap, std::string format,
2331 bufferlist& out) override {
2332 stringstream ss;
2333 test_ops(service, store, command, cmdmap, ss);
2334 out.append(ss);
2335 return true;
2336 }
2337 void test_ops(OSDService *service, ObjectStore *store,
2338 const std::string &command, cmdmap_t& cmdmap, ostream &ss);
2339
2340};
2341
2342class OSD::C_Tick : public Context {
2343 OSD *osd;
2344 public:
2345 explicit C_Tick(OSD *o) : osd(o) {}
2346 void finish(int r) override {
2347 osd->tick();
2348 }
2349};
2350
2351class OSD::C_Tick_WithoutOSDLock : public Context {
2352 OSD *osd;
2353 public:
2354 explicit C_Tick_WithoutOSDLock(OSD *o) : osd(o) {}
2355 void finish(int r) override {
2356 osd->tick_without_osd_lock();
2357 }
2358};
2359
2360int OSD::enable_disable_fuse(bool stop)
2361{
2362#ifdef HAVE_LIBFUSE
2363 int r;
2364 string mntpath = cct->_conf->osd_data + "/fuse";
2365 if (fuse_store && (stop || !cct->_conf->osd_objectstore_fuse)) {
2366 dout(1) << __func__ << " disabling" << dendl;
2367 fuse_store->stop();
2368 delete fuse_store;
2369 fuse_store = NULL;
2370 r = ::rmdir(mntpath.c_str());
7c673cae 2371 if (r < 0) {
c07f9fc5
FG
2372 r = -errno;
2373 derr << __func__ << " failed to rmdir " << mntpath << ": "
2374 << cpp_strerror(r) << dendl;
7c673cae
FG
2375 return r;
2376 }
2377 return 0;
2378 }
2379 if (!fuse_store && cct->_conf->osd_objectstore_fuse) {
2380 dout(1) << __func__ << " enabling" << dendl;
2381 r = ::mkdir(mntpath.c_str(), 0700);
2382 if (r < 0)
2383 r = -errno;
2384 if (r < 0 && r != -EEXIST) {
2385 derr << __func__ << " unable to create " << mntpath << ": "
2386 << cpp_strerror(r) << dendl;
2387 return r;
2388 }
2389 fuse_store = new FuseStore(store, mntpath);
2390 r = fuse_store->start();
2391 if (r < 0) {
2392 derr << __func__ << " unable to start fuse: " << cpp_strerror(r) << dendl;
2393 delete fuse_store;
2394 fuse_store = NULL;
2395 return r;
2396 }
2397 }
2398#endif // HAVE_LIBFUSE
2399 return 0;
2400}
2401
31f18b77
FG
2402int OSD::get_num_op_shards()
2403{
2404 if (cct->_conf->osd_op_num_shards)
2405 return cct->_conf->osd_op_num_shards;
2406 if (store_is_rotational)
2407 return cct->_conf->osd_op_num_shards_hdd;
2408 else
2409 return cct->_conf->osd_op_num_shards_ssd;
2410}
2411
2412int OSD::get_num_op_threads()
2413{
2414 if (cct->_conf->osd_op_num_threads_per_shard)
2415 return get_num_op_shards() * cct->_conf->osd_op_num_threads_per_shard;
2416 if (store_is_rotational)
2417 return get_num_op_shards() * cct->_conf->osd_op_num_threads_per_shard_hdd;
2418 else
2419 return get_num_op_shards() * cct->_conf->osd_op_num_threads_per_shard_ssd;
2420}
2421
c07f9fc5
FG
2422float OSD::get_osd_recovery_sleep()
2423{
2424 if (cct->_conf->osd_recovery_sleep)
2425 return cct->_conf->osd_recovery_sleep;
d2e6a577 2426 if (!store_is_rotational && !journal_is_rotational)
c07f9fc5 2427 return cct->_conf->osd_recovery_sleep_ssd;
d2e6a577
FG
2428 else if (store_is_rotational && !journal_is_rotational)
2429 return cct->_conf->get_val<double>("osd_recovery_sleep_hybrid");
2430 else
2431 return cct->_conf->osd_recovery_sleep_hdd;
c07f9fc5
FG
2432}
2433
7c673cae
FG
2434int OSD::init()
2435{
2436 CompatSet initial, diff;
2437 Mutex::Locker lock(osd_lock);
2438 if (is_stopping())
2439 return 0;
2440
2441 tick_timer.init();
2442 tick_timer_without_osd_lock.init();
2443 service.recovery_request_timer.init();
31f18b77 2444 service.recovery_sleep_timer.init();
7c673cae
FG
2445
2446 // mount.
31f18b77
FG
2447 dout(2) << "init " << dev_path
2448 << " (looks like " << (store_is_rotational ? "hdd" : "ssd") << ")"
2449 << dendl;
d2e6a577 2450 dout(2) << "journal " << journal_path << dendl;
7c673cae
FG
2451 assert(store); // call pre_init() first!
2452
31f18b77 2453 store->set_cache_shards(get_num_op_shards());
7c673cae
FG
2454
2455 int r = store->mount();
2456 if (r < 0) {
2457 derr << "OSD:init: unable to mount object store" << dendl;
2458 return r;
2459 }
d2e6a577
FG
2460 journal_is_rotational = store->is_journal_rotational();
2461 dout(2) << "journal looks like " << (journal_is_rotational ? "hdd" : "ssd")
2462 << dendl;
7c673cae
FG
2463
2464 enable_disable_fuse(false);
2465
2466 dout(2) << "boot" << dendl;
2467
2468 // initialize the daily loadavg with current 15min loadavg
2469 double loadavgs[3];
2470 if (getloadavg(loadavgs, 3) == 3) {
2471 daily_loadavg = loadavgs[2];
2472 } else {
2473 derr << "OSD::init() : couldn't read loadavgs\n" << dendl;
2474 daily_loadavg = 1.0;
2475 }
2476
2477 int rotating_auth_attempts = 0;
2478
2479 // sanity check long object name handling
2480 {
2481 hobject_t l;
2482 l.oid.name = string(cct->_conf->osd_max_object_name_len, 'n');
2483 l.set_key(string(cct->_conf->osd_max_object_name_len, 'k'));
2484 l.nspace = string(cct->_conf->osd_max_object_namespace_len, 's');
2485 r = store->validate_hobject_key(l);
2486 if (r < 0) {
2487 derr << "backend (" << store->get_type() << ") is unable to support max "
2488 << "object name[space] len" << dendl;
2489 derr << " osd max object name len = "
2490 << cct->_conf->osd_max_object_name_len << dendl;
2491 derr << " osd max object namespace len = "
2492 << cct->_conf->osd_max_object_namespace_len << dendl;
2493 derr << cpp_strerror(r) << dendl;
2494 if (cct->_conf->osd_check_max_object_name_len_on_startup) {
2495 goto out;
2496 }
2497 derr << "osd_check_max_object_name_len_on_startup = false, starting anyway"
2498 << dendl;
2499 } else {
2500 dout(20) << "configured osd_max_object_name[space]_len looks ok" << dendl;
2501 }
2502 }
2503
2504 // read superblock
2505 r = read_superblock();
2506 if (r < 0) {
2507 derr << "OSD::init() : unable to read osd superblock" << dendl;
2508 r = -EINVAL;
2509 goto out;
2510 }
2511
2512 if (osd_compat.compare(superblock.compat_features) < 0) {
2513 derr << "The disk uses features unsupported by the executable." << dendl;
2514 derr << " ondisk features " << superblock.compat_features << dendl;
2515 derr << " daemon features " << osd_compat << dendl;
2516
2517 if (osd_compat.writeable(superblock.compat_features)) {
2518 CompatSet diff = osd_compat.unsupported(superblock.compat_features);
2519 derr << "it is still writeable, though. Missing features: " << diff << dendl;
2520 r = -EOPNOTSUPP;
2521 goto out;
2522 }
2523 else {
2524 CompatSet diff = osd_compat.unsupported(superblock.compat_features);
2525 derr << "Cannot write to disk! Missing features: " << diff << dendl;
2526 r = -EOPNOTSUPP;
2527 goto out;
2528 }
2529 }
2530
2531 assert_warn(whoami == superblock.whoami);
2532 if (whoami != superblock.whoami) {
2533 derr << "OSD::init: superblock says osd"
2534 << superblock.whoami << " but I am osd." << whoami << dendl;
2535 r = -EINVAL;
2536 goto out;
2537 }
2538
2539 initial = get_osd_initial_compat_set();
2540 diff = superblock.compat_features.unsupported(initial);
2541 if (superblock.compat_features.merge(initial)) {
2542 // We need to persist the new compat_set before we
2543 // do anything else
2544 dout(5) << "Upgrading superblock adding: " << diff << dendl;
2545 ObjectStore::Transaction t;
2546 write_superblock(t);
2547 r = store->apply_transaction(service.meta_osr.get(), std::move(t));
2548 if (r < 0)
2549 goto out;
2550 }
2551
2552 // make sure snap mapper object exists
2553 if (!store->exists(coll_t::meta(), OSD::make_snapmapper_oid())) {
2554 dout(10) << "init creating/touching snapmapper object" << dendl;
2555 ObjectStore::Transaction t;
2556 t.touch(coll_t::meta(), OSD::make_snapmapper_oid());
2557 r = store->apply_transaction(service.meta_osr.get(), std::move(t));
2558 if (r < 0)
2559 goto out;
2560 }
2561
2562 class_handler = new ClassHandler(cct);
2563 cls_initialize(class_handler);
2564
2565 if (cct->_conf->osd_open_classes_on_start) {
2566 int r = class_handler->open_all_classes();
2567 if (r)
2568 dout(1) << "warning: got an error loading one or more classes: " << cpp_strerror(r) << dendl;
2569 }
2570
2571 // load up "current" osdmap
2572 assert_warn(!osdmap);
2573 if (osdmap) {
2574 derr << "OSD::init: unable to read current osdmap" << dendl;
2575 r = -EINVAL;
2576 goto out;
2577 }
2578 osdmap = get_map(superblock.current_epoch);
2579 check_osdmap_features(store);
2580
2581 create_recoverystate_perf();
2582
2583 {
2584 epoch_t bind_epoch = osdmap->get_epoch();
2585 service.set_epochs(NULL, NULL, &bind_epoch);
2586 }
2587
2588 clear_temp_objects();
2589
d2e6a577
FG
2590 // initialize osdmap references in sharded wq
2591 op_shardedwq.prune_pg_waiters(osdmap, whoami);
2592
7c673cae
FG
2593 // load up pgs (as they previously existed)
2594 load_pgs();
2595
2596 dout(2) << "superblock: I am osd." << superblock.whoami << dendl;
2597 dout(0) << "using " << op_queue << " op queue with priority op cut off at " <<
2598 op_prio_cutoff << "." << dendl;
2599
2600 create_logger();
2601
2602 // i'm ready!
2603 client_messenger->add_dispatcher_head(this);
2604 cluster_messenger->add_dispatcher_head(this);
2605
2606 hb_front_client_messenger->add_dispatcher_head(&heartbeat_dispatcher);
2607 hb_back_client_messenger->add_dispatcher_head(&heartbeat_dispatcher);
2608 hb_front_server_messenger->add_dispatcher_head(&heartbeat_dispatcher);
2609 hb_back_server_messenger->add_dispatcher_head(&heartbeat_dispatcher);
2610
2611 objecter_messenger->add_dispatcher_head(service.objecter);
2612
2613 monc->set_want_keys(CEPH_ENTITY_TYPE_MON | CEPH_ENTITY_TYPE_OSD
2614 | CEPH_ENTITY_TYPE_MGR);
2615 r = monc->init();
2616 if (r < 0)
2617 goto out;
2618
2619 /**
2620 * FIXME: this is a placeholder implementation that unconditionally
2621 * sends every is_primary PG's stats every time we're called, unlike
2622 * the existing mon PGStats mechanism that uses pg_stat_queue and acks.
2623 * This has equivalent cost to the existing worst case where all
2624 * PGs are busy and their stats are always enqueued for sending.
2625 */
2626 mgrc.set_pgstats_cb([this](){
2627 RWLock::RLocker l(map_lock);
2628
2629 utime_t had_for = ceph_clock_now() - had_map_since;
2630 osd_stat_t cur_stat = service.get_osd_stat();
2631 cur_stat.os_perf_stat = store->get_cur_stats();
2632
2633 MPGStats *m = new MPGStats(monc->get_fsid(), osdmap->get_epoch(), had_for);
2634 m->osd_stat = cur_stat;
2635
2636 Mutex::Locker lec{min_last_epoch_clean_lock};
2637 min_last_epoch_clean = osdmap->get_epoch();
2638 min_last_epoch_clean_pgs.clear();
2639 RWLock::RLocker lpg(pg_map_lock);
2640 for (const auto &i : pg_map) {
2641 PG *pg = i.second;
2642 if (!pg->is_primary()) {
2643 continue;
2644 }
2645
2646 pg->pg_stats_publish_lock.Lock();
2647 if (pg->pg_stats_publish_valid) {
2648 m->pg_stat[pg->info.pgid.pgid] = pg->pg_stats_publish;
2649 const auto lec = pg->pg_stats_publish.get_effective_last_epoch_clean();
2650 min_last_epoch_clean = min(min_last_epoch_clean, lec);
2651 min_last_epoch_clean_pgs.push_back(pg->info.pgid.pgid);
2652 }
2653 pg->pg_stats_publish_lock.Unlock();
2654 }
2655
2656 return m;
2657 });
2658
2659 mgrc.init();
2660 client_messenger->add_dispatcher_head(&mgrc);
2661
2662 // tell monc about log_client so it will know about mon session resets
2663 monc->set_log_client(&log_client);
2664 update_log_config();
2665
31f18b77 2666 peering_tp.start();
28e407b8
AA
2667
2668 service.init();
2669 service.publish_map(osdmap);
2670 service.publish_superblock(superblock);
2671 service.max_oldest_map = superblock.oldest_map;
2672
7c673cae
FG
2673 osd_op_tp.start();
2674 disk_tp.start();
2675 command_tp.start();
2676
2677 set_disk_tp_priority();
2678
2679 // start the heartbeat
2680 heartbeat_thread.create("osd_srv_heartbt");
2681
2682 // tick
2683 tick_timer.add_event_after(cct->_conf->osd_heartbeat_interval, new C_Tick(this));
2684 {
2685 Mutex::Locker l(tick_timer_lock);
2686 tick_timer_without_osd_lock.add_event_after(cct->_conf->osd_heartbeat_interval, new C_Tick_WithoutOSDLock(this));
2687 }
2688
7c673cae
FG
2689 osd_lock.Unlock();
2690
2691 r = monc->authenticate();
2692 if (r < 0) {
c07f9fc5
FG
2693 derr << __func__ << " authentication failed: " << cpp_strerror(r)
2694 << dendl;
7c673cae
FG
2695 osd_lock.Lock(); // locker is going to unlock this on function exit
2696 if (is_stopping())
c07f9fc5 2697 r = 0;
7c673cae
FG
2698 goto monout;
2699 }
2700
2701 while (monc->wait_auth_rotating(30.0) < 0) {
2702 derr << "unable to obtain rotating service keys; retrying" << dendl;
2703 ++rotating_auth_attempts;
2704 if (rotating_auth_attempts > g_conf->max_rotating_auth_attempts) {
c07f9fc5 2705 derr << __func__ << " wait_auth_rotating timed out" << dendl;
7c673cae
FG
2706 osd_lock.Lock(); // make locker happy
2707 if (!is_stopping()) {
c07f9fc5 2708 r = -ETIMEDOUT;
7c673cae
FG
2709 }
2710 goto monout;
2711 }
2712 }
2713
2714 r = update_crush_device_class();
2715 if (r < 0) {
d2e6a577
FG
2716 derr << __func__ << " unable to update_crush_device_class: "
2717 << cpp_strerror(r) << dendl;
7c673cae
FG
2718 osd_lock.Lock();
2719 goto monout;
2720 }
2721
2722 r = update_crush_location();
2723 if (r < 0) {
d2e6a577 2724 derr << __func__ << " unable to update_crush_location: "
c07f9fc5 2725 << cpp_strerror(r) << dendl;
7c673cae
FG
2726 osd_lock.Lock();
2727 goto monout;
2728 }
2729
2730 osd_lock.Lock();
2731 if (is_stopping())
2732 return 0;
2733
2734 // start objecter *after* we have authenticated, so that we don't ignore
2735 // the OSDMaps it requests.
2736 service.final_init();
2737
2738 check_config();
2739
2740 dout(10) << "ensuring pgs have consumed prior maps" << dendl;
2741 consume_map();
2742 peering_wq.drain();
2743
2744 dout(0) << "done with init, starting boot process" << dendl;
2745
2746 // subscribe to any pg creations
2747 monc->sub_want("osd_pg_creates", last_pg_create_epoch, 0);
2748
2749 // MgrClient needs this (it doesn't have MonClient reference itself)
2750 monc->sub_want("mgrmap", 0, 0);
2751
2752 // we don't need to ask for an osdmap here; objecter will
2753 //monc->sub_want("osdmap", osdmap->get_epoch(), CEPH_SUBSCRIBE_ONETIME);
2754
2755 monc->renew_subs();
2756
2757 start_boot();
2758
2759 return 0;
2760monout:
c07f9fc5 2761 exit(1);
7c673cae
FG
2762
2763out:
2764 enable_disable_fuse(true);
2765 store->umount();
2766 delete store;
2767 store = NULL;
2768 return r;
2769}
2770
2771void OSD::final_init()
2772{
2773 AdminSocket *admin_socket = cct->get_admin_socket();
2774 asok_hook = new OSDSocketHook(this);
2775 int r = admin_socket->register_command("status", "status", asok_hook,
2776 "high-level status of OSD");
2777 assert(r == 0);
2778 r = admin_socket->register_command("flush_journal", "flush_journal",
2779 asok_hook,
2780 "flush the journal to permanent store");
2781 assert(r == 0);
2782 r = admin_socket->register_command("dump_ops_in_flight",
c07f9fc5
FG
2783 "dump_ops_in_flight " \
2784 "name=filterstr,type=CephString,n=N,req=false",
2785 asok_hook,
7c673cae
FG
2786 "show the ops currently in flight");
2787 assert(r == 0);
2788 r = admin_socket->register_command("ops",
c07f9fc5
FG
2789 "ops " \
2790 "name=filterstr,type=CephString,n=N,req=false",
2791 asok_hook,
7c673cae
FG
2792 "show the ops currently in flight");
2793 assert(r == 0);
2794 r = admin_socket->register_command("dump_blocked_ops",
c07f9fc5
FG
2795 "dump_blocked_ops " \
2796 "name=filterstr,type=CephString,n=N,req=false",
2797 asok_hook,
7c673cae
FG
2798 "show the blocked ops currently in flight");
2799 assert(r == 0);
c07f9fc5
FG
2800 r = admin_socket->register_command("dump_historic_ops",
2801 "dump_historic_ops " \
2802 "name=filterstr,type=CephString,n=N,req=false",
7c673cae
FG
2803 asok_hook,
2804 "show recent ops");
2805 assert(r == 0);
c07f9fc5
FG
2806 r = admin_socket->register_command("dump_historic_slow_ops",
2807 "dump_historic_slow_ops " \
2808 "name=filterstr,type=CephString,n=N,req=false",
7c673cae
FG
2809 asok_hook,
2810 "show slowest recent ops");
2811 assert(r == 0);
c07f9fc5
FG
2812 r = admin_socket->register_command("dump_historic_ops_by_duration",
2813 "dump_historic_ops_by_duration " \
2814 "name=filterstr,type=CephString,n=N,req=false",
7c673cae
FG
2815 asok_hook,
2816 "show slowest recent ops, sorted by duration");
2817 assert(r == 0);
2818 r = admin_socket->register_command("dump_op_pq_state", "dump_op_pq_state",
2819 asok_hook,
2820 "dump op priority queue state");
2821 assert(r == 0);
2822 r = admin_socket->register_command("dump_blacklist", "dump_blacklist",
2823 asok_hook,
2824 "dump blacklisted clients and times");
2825 assert(r == 0);
2826 r = admin_socket->register_command("dump_watchers", "dump_watchers",
2827 asok_hook,
2828 "show clients which have active watches,"
2829 " and on which objects");
2830 assert(r == 0);
2831 r = admin_socket->register_command("dump_reservations", "dump_reservations",
2832 asok_hook,
2833 "show recovery reservations");
2834 assert(r == 0);
2835 r = admin_socket->register_command("get_latest_osdmap", "get_latest_osdmap",
2836 asok_hook,
2837 "force osd to update the latest map from "
2838 "the mon");
2839 assert(r == 0);
2840
2841 r = admin_socket->register_command( "heap",
2842 "heap " \
2843 "name=heapcmd,type=CephString",
2844 asok_hook,
2845 "show heap usage info (available only if "
2846 "compiled with tcmalloc)");
2847 assert(r == 0);
2848
2849 r = admin_socket->register_command("set_heap_property",
2850 "set_heap_property " \
2851 "name=property,type=CephString " \
2852 "name=value,type=CephInt",
2853 asok_hook,
2854 "update malloc extension heap property");
2855 assert(r == 0);
2856
2857 r = admin_socket->register_command("get_heap_property",
2858 "get_heap_property " \
2859 "name=property,type=CephString",
2860 asok_hook,
2861 "get malloc extension heap property");
2862 assert(r == 0);
2863
2864 r = admin_socket->register_command("dump_objectstore_kv_stats",
2865 "dump_objectstore_kv_stats",
2866 asok_hook,
2867 "print statistics of kvdb which used by bluestore");
2868 assert(r == 0);
2869
2870 r = admin_socket->register_command("dump_scrubs",
2871 "dump_scrubs",
2872 asok_hook,
2873 "print scheduled scrubs");
2874 assert(r == 0);
2875
2876 r = admin_socket->register_command("calc_objectstore_db_histogram",
2877 "calc_objectstore_db_histogram",
2878 asok_hook,
2879 "Generate key value histogram of kvdb(rocksdb) which used by bluestore");
2880 assert(r == 0);
2881
2882 r = admin_socket->register_command("flush_store_cache",
2883 "flush_store_cache",
2884 asok_hook,
2885 "Flush bluestore internal cache");
2886 assert(r == 0);
2887 r = admin_socket->register_command("dump_pgstate_history", "dump_pgstate_history",
2888 asok_hook,
2889 "show recent state history");
2890 assert(r == 0);
2891
224ce89b
WB
2892 r = admin_socket->register_command("compact", "compact",
2893 asok_hook,
2894 "Commpact object store's omap."
2895 " WARNING: Compaction probably slows your requests");
2896 assert(r == 0);
2897
7c673cae
FG
2898 test_ops_hook = new TestOpsSocketHook(&(this->service), this->store);
2899 // Note: pools are CephString instead of CephPoolname because
2900 // these commands traditionally support both pool names and numbers
2901 r = admin_socket->register_command(
2902 "setomapval",
2903 "setomapval " \
2904 "name=pool,type=CephString " \
2905 "name=objname,type=CephObjectname " \
2906 "name=key,type=CephString "\
2907 "name=val,type=CephString",
2908 test_ops_hook,
2909 "set omap key");
2910 assert(r == 0);
2911 r = admin_socket->register_command(
2912 "rmomapkey",
2913 "rmomapkey " \
2914 "name=pool,type=CephString " \
2915 "name=objname,type=CephObjectname " \
2916 "name=key,type=CephString",
2917 test_ops_hook,
2918 "remove omap key");
2919 assert(r == 0);
2920 r = admin_socket->register_command(
2921 "setomapheader",
2922 "setomapheader " \
2923 "name=pool,type=CephString " \
2924 "name=objname,type=CephObjectname " \
2925 "name=header,type=CephString",
2926 test_ops_hook,
2927 "set omap header");
2928 assert(r == 0);
2929
2930 r = admin_socket->register_command(
2931 "getomap",
2932 "getomap " \
2933 "name=pool,type=CephString " \
2934 "name=objname,type=CephObjectname",
2935 test_ops_hook,
2936 "output entire object map");
2937 assert(r == 0);
2938
2939 r = admin_socket->register_command(
2940 "truncobj",
2941 "truncobj " \
2942 "name=pool,type=CephString " \
2943 "name=objname,type=CephObjectname " \
2944 "name=len,type=CephInt",
2945 test_ops_hook,
2946 "truncate object to length");
2947 assert(r == 0);
2948
2949 r = admin_socket->register_command(
2950 "injectdataerr",
2951 "injectdataerr " \
2952 "name=pool,type=CephString " \
2953 "name=objname,type=CephObjectname " \
2954 "name=shardid,type=CephInt,req=false,range=0|255",
2955 test_ops_hook,
2956 "inject data error to an object");
2957 assert(r == 0);
2958
2959 r = admin_socket->register_command(
2960 "injectmdataerr",
2961 "injectmdataerr " \
2962 "name=pool,type=CephString " \
2963 "name=objname,type=CephObjectname " \
2964 "name=shardid,type=CephInt,req=false,range=0|255",
2965 test_ops_hook,
2966 "inject metadata error to an object");
2967 assert(r == 0);
2968 r = admin_socket->register_command(
2969 "set_recovery_delay",
2970 "set_recovery_delay " \
2971 "name=utime,type=CephInt,req=false",
2972 test_ops_hook,
2973 "Delay osd recovery by specified seconds");
2974 assert(r == 0);
2975 r = admin_socket->register_command(
2976 "trigger_scrub",
2977 "trigger_scrub " \
2978 "name=pgid,type=CephString ",
2979 test_ops_hook,
2980 "Trigger a scheduled scrub ");
2981 assert(r == 0);
2982 r = admin_socket->register_command(
2983 "injectfull",
2984 "injectfull " \
2985 "name=type,type=CephString,req=false " \
2986 "name=count,type=CephInt,req=false ",
2987 test_ops_hook,
2988 "Inject a full disk (optional count times)");
2989 assert(r == 0);
2990}
2991
2992void OSD::create_logger()
2993{
2994 dout(10) << "create_logger" << dendl;
2995
2996 PerfCountersBuilder osd_plb(cct, "osd", l_osd_first, l_osd_last);
2997
2998 // Latency axis configuration for op histograms, values are in nanoseconds
2999 PerfHistogramCommon::axis_config_d op_hist_x_axis_config{
3000 "Latency (usec)",
3001 PerfHistogramCommon::SCALE_LOG2, ///< Latency in logarithmic scale
3002 0, ///< Start at 0
3003 100000, ///< Quantization unit is 100usec
3004 32, ///< Enough to cover much longer than slow requests
3005 };
3006
3007 // Op size axis configuration for op histograms, values are in bytes
3008 PerfHistogramCommon::axis_config_d op_hist_y_axis_config{
3009 "Request size (bytes)",
3010 PerfHistogramCommon::SCALE_LOG2, ///< Request size in logarithmic scale
3011 0, ///< Start at 0
3012 512, ///< Quantization unit is 512 bytes
3013 32, ///< Enough to cover requests larger than GB
3014 };
3015
3016
3efd9988
FG
3017 // All the basic OSD operation stats are to be considered useful
3018 osd_plb.set_prio_default(PerfCountersBuilder::PRIO_USEFUL);
3019
7c673cae
FG
3020 osd_plb.add_u64(
3021 l_osd_op_wip, "op_wip",
3022 "Replication operations currently being processed (primary)");
3023 osd_plb.add_u64_counter(
3024 l_osd_op, "op",
3025 "Client operations",
3026 "ops", PerfCountersBuilder::PRIO_CRITICAL);
3027 osd_plb.add_u64_counter(
3028 l_osd_op_inb, "op_in_bytes",
3029 "Client operations total write size",
3030 "wr", PerfCountersBuilder::PRIO_INTERESTING);
3031 osd_plb.add_u64_counter(
3032 l_osd_op_outb, "op_out_bytes",
3033 "Client operations total read size",
3034 "rd", PerfCountersBuilder::PRIO_INTERESTING);
3035 osd_plb.add_time_avg(
3036 l_osd_op_lat, "op_latency",
3037 "Latency of client operations (including queue time)",
3038 "l", 9);
3039 osd_plb.add_time_avg(
3040 l_osd_op_process_lat, "op_process_latency",
3041 "Latency of client operations (excluding queue time)");
3042 osd_plb.add_time_avg(
3043 l_osd_op_prepare_lat, "op_prepare_latency",
3044 "Latency of client operations (excluding queue time and wait for finished)");
3045
3046 osd_plb.add_u64_counter(
3047 l_osd_op_r, "op_r", "Client read operations");
3048 osd_plb.add_u64_counter(
3049 l_osd_op_r_outb, "op_r_out_bytes", "Client data read");
3050 osd_plb.add_time_avg(
3051 l_osd_op_r_lat, "op_r_latency",
3052 "Latency of read operation (including queue time)");
31f18b77 3053 osd_plb.add_u64_counter_histogram(
7c673cae
FG
3054 l_osd_op_r_lat_outb_hist, "op_r_latency_out_bytes_histogram",
3055 op_hist_x_axis_config, op_hist_y_axis_config,
3056 "Histogram of operation latency (including queue time) + data read");
3057 osd_plb.add_time_avg(
3058 l_osd_op_r_process_lat, "op_r_process_latency",
3059 "Latency of read operation (excluding queue time)");
3060 osd_plb.add_time_avg(
3061 l_osd_op_r_prepare_lat, "op_r_prepare_latency",
3062 "Latency of read operations (excluding queue time and wait for finished)");
3063 osd_plb.add_u64_counter(
3064 l_osd_op_w, "op_w", "Client write operations");
3065 osd_plb.add_u64_counter(
3066 l_osd_op_w_inb, "op_w_in_bytes", "Client data written");
3067 osd_plb.add_time_avg(
3068 l_osd_op_w_lat, "op_w_latency",
3069 "Latency of write operation (including queue time)");
31f18b77 3070 osd_plb.add_u64_counter_histogram(
7c673cae
FG
3071 l_osd_op_w_lat_inb_hist, "op_w_latency_in_bytes_histogram",
3072 op_hist_x_axis_config, op_hist_y_axis_config,
3073 "Histogram of operation latency (including queue time) + data written");
3074 osd_plb.add_time_avg(
3075 l_osd_op_w_process_lat, "op_w_process_latency",
3076 "Latency of write operation (excluding queue time)");
3077 osd_plb.add_time_avg(
3078 l_osd_op_w_prepare_lat, "op_w_prepare_latency",
3079 "Latency of write operations (excluding queue time and wait for finished)");
3080 osd_plb.add_u64_counter(
3081 l_osd_op_rw, "op_rw",
3082 "Client read-modify-write operations");
3083 osd_plb.add_u64_counter(
3084 l_osd_op_rw_inb, "op_rw_in_bytes",
3085 "Client read-modify-write operations write in");
3086 osd_plb.add_u64_counter(
3087 l_osd_op_rw_outb,"op_rw_out_bytes",
3088 "Client read-modify-write operations read out ");
3089 osd_plb.add_time_avg(
3090 l_osd_op_rw_lat, "op_rw_latency",
3091 "Latency of read-modify-write operation (including queue time)");
31f18b77 3092 osd_plb.add_u64_counter_histogram(
7c673cae
FG
3093 l_osd_op_rw_lat_inb_hist, "op_rw_latency_in_bytes_histogram",
3094 op_hist_x_axis_config, op_hist_y_axis_config,
3095 "Histogram of rw operation latency (including queue time) + data written");
31f18b77 3096 osd_plb.add_u64_counter_histogram(
7c673cae
FG
3097 l_osd_op_rw_lat_outb_hist, "op_rw_latency_out_bytes_histogram",
3098 op_hist_x_axis_config, op_hist_y_axis_config,
3099 "Histogram of rw operation latency (including queue time) + data read");
3100 osd_plb.add_time_avg(
3101 l_osd_op_rw_process_lat, "op_rw_process_latency",
3102 "Latency of read-modify-write operation (excluding queue time)");
3103 osd_plb.add_time_avg(
3104 l_osd_op_rw_prepare_lat, "op_rw_prepare_latency",
3105 "Latency of read-modify-write operations (excluding queue time and wait for finished)");
3106
3efd9988
FG
3107 // Now we move on to some more obscure stats, revert to assuming things
3108 // are low priority unless otherwise specified.
3109 osd_plb.set_prio_default(PerfCountersBuilder::PRIO_DEBUGONLY);
3110
224ce89b
WB
3111 osd_plb.add_time_avg(l_osd_op_before_queue_op_lat, "op_before_queue_op_lat",
3112 "Latency of IO before calling queue(before really queue into ShardedOpWq)"); // client io before queue op_wq latency
3113 osd_plb.add_time_avg(l_osd_op_before_dequeue_op_lat, "op_before_dequeue_op_lat",
3114 "Latency of IO before calling dequeue_op(already dequeued and get PG lock)"); // client io before dequeue_op latency
3115
7c673cae
FG
3116 osd_plb.add_u64_counter(
3117 l_osd_sop, "subop", "Suboperations");
3118 osd_plb.add_u64_counter(
3119 l_osd_sop_inb, "subop_in_bytes", "Suboperations total size");
3120 osd_plb.add_time_avg(l_osd_sop_lat, "subop_latency", "Suboperations latency");
3121
3122 osd_plb.add_u64_counter(l_osd_sop_w, "subop_w", "Replicated writes");
3123 osd_plb.add_u64_counter(
3124 l_osd_sop_w_inb, "subop_w_in_bytes", "Replicated written data size");
3125 osd_plb.add_time_avg(
3126 l_osd_sop_w_lat, "subop_w_latency", "Replicated writes latency");
3127 osd_plb.add_u64_counter(
3128 l_osd_sop_pull, "subop_pull", "Suboperations pull requests");
3129 osd_plb.add_time_avg(
3130 l_osd_sop_pull_lat, "subop_pull_latency", "Suboperations pull latency");
3131 osd_plb.add_u64_counter(
3132 l_osd_sop_push, "subop_push", "Suboperations push messages");
3133 osd_plb.add_u64_counter(
3134 l_osd_sop_push_inb, "subop_push_in_bytes", "Suboperations pushed size");
3135 osd_plb.add_time_avg(
3136 l_osd_sop_push_lat, "subop_push_latency", "Suboperations push latency");
3137
3138 osd_plb.add_u64_counter(l_osd_pull, "pull", "Pull requests sent");
3139 osd_plb.add_u64_counter(l_osd_push, "push", "Push messages sent");
3140 osd_plb.add_u64_counter(l_osd_push_outb, "push_out_bytes", "Pushed size");
3141
3142 osd_plb.add_u64_counter(
3143 l_osd_rop, "recovery_ops",
3144 "Started recovery operations",
3145 "rop", PerfCountersBuilder::PRIO_INTERESTING);
3146
3147 osd_plb.add_u64(l_osd_loadavg, "loadavg", "CPU load");
3148 osd_plb.add_u64(l_osd_buf, "buffer_bytes", "Total allocated buffer size");
3149 osd_plb.add_u64(l_osd_history_alloc_bytes, "history_alloc_Mbytes");
3150 osd_plb.add_u64(l_osd_history_alloc_num, "history_alloc_num");
3151 osd_plb.add_u64(
3152 l_osd_cached_crc, "cached_crc", "Total number getting crc from crc_cache");
3153 osd_plb.add_u64(
3154 l_osd_cached_crc_adjusted, "cached_crc_adjusted",
3155 "Total number getting crc from crc_cache with adjusting");
3156 osd_plb.add_u64(l_osd_missed_crc, "missed_crc",
3157 "Total number of crc cache misses");
3158
3159 osd_plb.add_u64(l_osd_pg, "numpg", "Placement groups",
3160 "pgs", PerfCountersBuilder::PRIO_USEFUL);
3161 osd_plb.add_u64(
3162 l_osd_pg_primary, "numpg_primary",
3163 "Placement groups for which this osd is primary");
3164 osd_plb.add_u64(
3165 l_osd_pg_replica, "numpg_replica",
3166 "Placement groups for which this osd is replica");
3167 osd_plb.add_u64(
3168 l_osd_pg_stray, "numpg_stray",
3169 "Placement groups ready to be deleted from this osd");
94b18763
FG
3170 osd_plb.add_u64(
3171 l_osd_pg_removing, "numpg_removing",
3172 "Placement groups queued for local deletion", "pgsr",
3173 PerfCountersBuilder::PRIO_USEFUL);
7c673cae
FG
3174 osd_plb.add_u64(
3175 l_osd_hb_to, "heartbeat_to_peers", "Heartbeat (ping) peers we send to");
3176 osd_plb.add_u64_counter(l_osd_map, "map_messages", "OSD map messages");
3177 osd_plb.add_u64_counter(l_osd_mape, "map_message_epochs", "OSD map epochs");
3178 osd_plb.add_u64_counter(
3179 l_osd_mape_dup, "map_message_epoch_dups", "OSD map duplicates");
3180 osd_plb.add_u64_counter(
3181 l_osd_waiting_for_map, "messages_delayed_for_map",
3182 "Operations waiting for OSD map");
31f18b77 3183
7c673cae
FG
3184 osd_plb.add_u64_counter(
3185 l_osd_map_cache_hit, "osd_map_cache_hit", "osdmap cache hit");
3186 osd_plb.add_u64_counter(
3187 l_osd_map_cache_miss, "osd_map_cache_miss", "osdmap cache miss");
3188 osd_plb.add_u64_counter(
3189 l_osd_map_cache_miss_low, "osd_map_cache_miss_low",
3190 "osdmap cache miss below cache lower bound");
3191 osd_plb.add_u64_avg(
3192 l_osd_map_cache_miss_low_avg, "osd_map_cache_miss_low_avg",
3193 "osdmap cache miss, avg distance below cache lower bound");
31f18b77
FG
3194 osd_plb.add_u64_counter(
3195 l_osd_map_bl_cache_hit, "osd_map_bl_cache_hit",
3196 "OSDMap buffer cache hits");
3197 osd_plb.add_u64_counter(
3198 l_osd_map_bl_cache_miss, "osd_map_bl_cache_miss",
3199 "OSDMap buffer cache misses");
7c673cae 3200
3efd9988
FG
3201 osd_plb.add_u64(
3202 l_osd_stat_bytes, "stat_bytes", "OSD size", "size",
3203 PerfCountersBuilder::PRIO_USEFUL);
3204 osd_plb.add_u64(
3205 l_osd_stat_bytes_used, "stat_bytes_used", "Used space", "used",
3206 PerfCountersBuilder::PRIO_USEFUL);
7c673cae
FG
3207 osd_plb.add_u64(l_osd_stat_bytes_avail, "stat_bytes_avail", "Available space");
3208
3209 osd_plb.add_u64_counter(
3210 l_osd_copyfrom, "copyfrom", "Rados \"copy-from\" operations");
3211
3212 osd_plb.add_u64_counter(l_osd_tier_promote, "tier_promote", "Tier promotions");
3213 osd_plb.add_u64_counter(l_osd_tier_flush, "tier_flush", "Tier flushes");
3214 osd_plb.add_u64_counter(
3215 l_osd_tier_flush_fail, "tier_flush_fail", "Failed tier flushes");
3216 osd_plb.add_u64_counter(
3217 l_osd_tier_try_flush, "tier_try_flush", "Tier flush attempts");
3218 osd_plb.add_u64_counter(
3219 l_osd_tier_try_flush_fail, "tier_try_flush_fail",
3220 "Failed tier flush attempts");
3221 osd_plb.add_u64_counter(
3222 l_osd_tier_evict, "tier_evict", "Tier evictions");
3223 osd_plb.add_u64_counter(
3224 l_osd_tier_whiteout, "tier_whiteout", "Tier whiteouts");
3225 osd_plb.add_u64_counter(
3226 l_osd_tier_dirty, "tier_dirty", "Dirty tier flag set");
3227 osd_plb.add_u64_counter(
3228 l_osd_tier_clean, "tier_clean", "Dirty tier flag cleaned");
3229 osd_plb.add_u64_counter(
3230 l_osd_tier_delay, "tier_delay", "Tier delays (agent waiting)");
3231 osd_plb.add_u64_counter(
3232 l_osd_tier_proxy_read, "tier_proxy_read", "Tier proxy reads");
3233 osd_plb.add_u64_counter(
3234 l_osd_tier_proxy_write, "tier_proxy_write", "Tier proxy writes");
3235
3236 osd_plb.add_u64_counter(
3237 l_osd_agent_wake, "agent_wake", "Tiering agent wake up");
3238 osd_plb.add_u64_counter(
3239 l_osd_agent_skip, "agent_skip", "Objects skipped by agent");
3240 osd_plb.add_u64_counter(
3241 l_osd_agent_flush, "agent_flush", "Tiering agent flushes");
3242 osd_plb.add_u64_counter(
3243 l_osd_agent_evict, "agent_evict", "Tiering agent evictions");
3244
3245 osd_plb.add_u64_counter(
3246 l_osd_object_ctx_cache_hit, "object_ctx_cache_hit", "Object context cache hits");
3247 osd_plb.add_u64_counter(
3248 l_osd_object_ctx_cache_total, "object_ctx_cache_total", "Object context cache lookups");
3249
3250 osd_plb.add_u64_counter(l_osd_op_cache_hit, "op_cache_hit");
3251 osd_plb.add_time_avg(
3252 l_osd_tier_flush_lat, "osd_tier_flush_lat", "Object flush latency");
3253 osd_plb.add_time_avg(
3254 l_osd_tier_promote_lat, "osd_tier_promote_lat", "Object promote latency");
3255 osd_plb.add_time_avg(
3256 l_osd_tier_r_lat, "osd_tier_r_lat", "Object proxy read latency");
3257
3258 osd_plb.add_u64_counter(
3259 l_osd_pg_info, "osd_pg_info", "PG updated its info (using any method)");
3260 osd_plb.add_u64_counter(
3261 l_osd_pg_fastinfo, "osd_pg_fastinfo",
3262 "PG updated its info using fastinfo attr");
3263 osd_plb.add_u64_counter(
3264 l_osd_pg_biginfo, "osd_pg_biginfo", "PG updated its biginfo attr");
3265
3266 logger = osd_plb.create_perf_counters();
3267 cct->get_perfcounters_collection()->add(logger);
3268}
3269
3270void OSD::create_recoverystate_perf()
3271{
3272 dout(10) << "create_recoverystate_perf" << dendl;
3273
3274 PerfCountersBuilder rs_perf(cct, "recoverystate_perf", rs_first, rs_last);
3275
3276 rs_perf.add_time_avg(rs_initial_latency, "initial_latency", "Initial recovery state latency");
3277 rs_perf.add_time_avg(rs_started_latency, "started_latency", "Started recovery state latency");
3278 rs_perf.add_time_avg(rs_reset_latency, "reset_latency", "Reset recovery state latency");
3279 rs_perf.add_time_avg(rs_start_latency, "start_latency", "Start recovery state latency");
3280 rs_perf.add_time_avg(rs_primary_latency, "primary_latency", "Primary recovery state latency");
3281 rs_perf.add_time_avg(rs_peering_latency, "peering_latency", "Peering recovery state latency");
3282 rs_perf.add_time_avg(rs_backfilling_latency, "backfilling_latency", "Backfilling recovery state latency");
3283 rs_perf.add_time_avg(rs_waitremotebackfillreserved_latency, "waitremotebackfillreserved_latency", "Wait remote backfill reserved recovery state latency");
3284 rs_perf.add_time_avg(rs_waitlocalbackfillreserved_latency, "waitlocalbackfillreserved_latency", "Wait local backfill reserved recovery state latency");
3285 rs_perf.add_time_avg(rs_notbackfilling_latency, "notbackfilling_latency", "Notbackfilling recovery state latency");
3286 rs_perf.add_time_avg(rs_repnotrecovering_latency, "repnotrecovering_latency", "Repnotrecovering recovery state latency");
3287 rs_perf.add_time_avg(rs_repwaitrecoveryreserved_latency, "repwaitrecoveryreserved_latency", "Rep wait recovery reserved recovery state latency");
3288 rs_perf.add_time_avg(rs_repwaitbackfillreserved_latency, "repwaitbackfillreserved_latency", "Rep wait backfill reserved recovery state latency");
3289 rs_perf.add_time_avg(rs_reprecovering_latency, "reprecovering_latency", "RepRecovering recovery state latency");
3290 rs_perf.add_time_avg(rs_activating_latency, "activating_latency", "Activating recovery state latency");
3291 rs_perf.add_time_avg(rs_waitlocalrecoveryreserved_latency, "waitlocalrecoveryreserved_latency", "Wait local recovery reserved recovery state latency");
3292 rs_perf.add_time_avg(rs_waitremoterecoveryreserved_latency, "waitremoterecoveryreserved_latency", "Wait remote recovery reserved recovery state latency");
3293 rs_perf.add_time_avg(rs_recovering_latency, "recovering_latency", "Recovering recovery state latency");
3294 rs_perf.add_time_avg(rs_recovered_latency, "recovered_latency", "Recovered recovery state latency");
3295 rs_perf.add_time_avg(rs_clean_latency, "clean_latency", "Clean recovery state latency");
3296 rs_perf.add_time_avg(rs_active_latency, "active_latency", "Active recovery state latency");
3297 rs_perf.add_time_avg(rs_replicaactive_latency, "replicaactive_latency", "Replicaactive recovery state latency");
3298 rs_perf.add_time_avg(rs_stray_latency, "stray_latency", "Stray recovery state latency");
3299 rs_perf.add_time_avg(rs_getinfo_latency, "getinfo_latency", "Getinfo recovery state latency");
3300 rs_perf.add_time_avg(rs_getlog_latency, "getlog_latency", "Getlog recovery state latency");
3301 rs_perf.add_time_avg(rs_waitactingchange_latency, "waitactingchange_latency", "Waitactingchange recovery state latency");
3302 rs_perf.add_time_avg(rs_incomplete_latency, "incomplete_latency", "Incomplete recovery state latency");
3303 rs_perf.add_time_avg(rs_down_latency, "down_latency", "Down recovery state latency");
3304 rs_perf.add_time_avg(rs_getmissing_latency, "getmissing_latency", "Getmissing recovery state latency");
3305 rs_perf.add_time_avg(rs_waitupthru_latency, "waitupthru_latency", "Waitupthru recovery state latency");
3306 rs_perf.add_time_avg(rs_notrecovering_latency, "notrecovering_latency", "Notrecovering recovery state latency");
3307
3308 recoverystate_perf = rs_perf.create_perf_counters();
3309 cct->get_perfcounters_collection()->add(recoverystate_perf);
3310}
3311
3312int OSD::shutdown()
3313{
3314 if (!service.prepare_to_stop())
3315 return 0; // already shutting down
3316 osd_lock.Lock();
3317 if (is_stopping()) {
3318 osd_lock.Unlock();
3319 return 0;
3320 }
3321 derr << "shutdown" << dendl;
3322
3323 set_state(STATE_STOPPING);
3324
3325 // Debugging
3efd9988
FG
3326 if (cct->_conf->get_val<bool>("osd_debug_shutdown")) {
3327 cct->_conf->set_val("debug_osd", "100");
3328 cct->_conf->set_val("debug_journal", "100");
3329 cct->_conf->set_val("debug_filestore", "100");
3330 cct->_conf->set_val("debug_bluestore", "100");
3331 cct->_conf->set_val("debug_ms", "100");
3332 cct->_conf->apply_changes(NULL);
3333 }
7c673cae
FG
3334
3335 // stop MgrClient earlier as it's more like an internal consumer of OSD
3336 mgrc.shutdown();
3337
3338 service.start_shutdown();
3339
3340 // stop sending work to pgs. this just prevents any new work in _process
3341 // from racing with on_shutdown and potentially entering the pg after.
3342 op_shardedwq.drain();
3343
3344 // Shutdown PGs
3345 {
3346 RWLock::RLocker l(pg_map_lock);
3347 for (ceph::unordered_map<spg_t, PG*>::iterator p = pg_map.begin();
3348 p != pg_map.end();
3349 ++p) {
3350 dout(20) << " kicking pg " << p->first << dendl;
3351 p->second->lock();
3352 p->second->on_shutdown();
3353 p->second->unlock();
3354 p->second->osr->flush();
3355 }
3356 }
3357 clear_pg_stat_queue();
3358
3359 // drain op queue again (in case PGs requeued something)
3360 op_shardedwq.drain();
3361 {
3362 finished.clear(); // zap waiters (bleh, this is messy)
3363 }
3364
3365 op_shardedwq.clear_pg_slots();
3366
3367 // unregister commands
3368 cct->get_admin_socket()->unregister_command("status");
3369 cct->get_admin_socket()->unregister_command("flush_journal");
3370 cct->get_admin_socket()->unregister_command("dump_ops_in_flight");
3371 cct->get_admin_socket()->unregister_command("ops");
3372 cct->get_admin_socket()->unregister_command("dump_blocked_ops");
3373 cct->get_admin_socket()->unregister_command("dump_historic_ops");
3374 cct->get_admin_socket()->unregister_command("dump_historic_ops_by_duration");
3375 cct->get_admin_socket()->unregister_command("dump_historic_slow_ops");
3376 cct->get_admin_socket()->unregister_command("dump_op_pq_state");
3377 cct->get_admin_socket()->unregister_command("dump_blacklist");
3378 cct->get_admin_socket()->unregister_command("dump_watchers");
3379 cct->get_admin_socket()->unregister_command("dump_reservations");
3380 cct->get_admin_socket()->unregister_command("get_latest_osdmap");
224ce89b 3381 cct->get_admin_socket()->unregister_command("heap");
7c673cae
FG
3382 cct->get_admin_socket()->unregister_command("set_heap_property");
3383 cct->get_admin_socket()->unregister_command("get_heap_property");
3384 cct->get_admin_socket()->unregister_command("dump_objectstore_kv_stats");
224ce89b 3385 cct->get_admin_socket()->unregister_command("dump_scrubs");
7c673cae
FG
3386 cct->get_admin_socket()->unregister_command("calc_objectstore_db_histogram");
3387 cct->get_admin_socket()->unregister_command("flush_store_cache");
3388 cct->get_admin_socket()->unregister_command("dump_pgstate_history");
224ce89b 3389 cct->get_admin_socket()->unregister_command("compact");
7c673cae
FG
3390 delete asok_hook;
3391 asok_hook = NULL;
3392
3393 cct->get_admin_socket()->unregister_command("setomapval");
3394 cct->get_admin_socket()->unregister_command("rmomapkey");
3395 cct->get_admin_socket()->unregister_command("setomapheader");
3396 cct->get_admin_socket()->unregister_command("getomap");
3397 cct->get_admin_socket()->unregister_command("truncobj");
3398 cct->get_admin_socket()->unregister_command("injectdataerr");
3399 cct->get_admin_socket()->unregister_command("injectmdataerr");
3400 cct->get_admin_socket()->unregister_command("set_recovery_delay");
224ce89b
WB
3401 cct->get_admin_socket()->unregister_command("trigger_scrub");
3402 cct->get_admin_socket()->unregister_command("injectfull");
7c673cae
FG
3403 delete test_ops_hook;
3404 test_ops_hook = NULL;
3405
3406 osd_lock.Unlock();
3407
3408 heartbeat_lock.Lock();
3409 heartbeat_stop = true;
3410 heartbeat_cond.Signal();
3411 heartbeat_lock.Unlock();
3412 heartbeat_thread.join();
3413
31f18b77 3414 peering_tp.drain();
7c673cae 3415 peering_wq.clear();
31f18b77 3416 peering_tp.stop();
7c673cae
FG
3417 dout(10) << "osd tp stopped" << dendl;
3418
3419 osd_op_tp.drain();
3420 osd_op_tp.stop();
3421 dout(10) << "op sharded tp stopped" << dendl;
3422
3423 command_tp.drain();
3424 command_tp.stop();
3425 dout(10) << "command tp stopped" << dendl;
3426
3427 disk_tp.drain();
3428 disk_tp.stop();
3429 dout(10) << "disk tp paused (new)" << dendl;
3430
3431 dout(10) << "stopping agent" << dendl;
3432 service.agent_stop();
3433
3434 osd_lock.Lock();
3435
3436 reset_heartbeat_peers();
3437
3438 tick_timer.shutdown();
3439
3440 {
3441 Mutex::Locker l(tick_timer_lock);
3442 tick_timer_without_osd_lock.shutdown();
3443 }
3444
3445 // note unmount epoch
3446 dout(10) << "noting clean unmount in epoch " << osdmap->get_epoch() << dendl;
3447 superblock.mounted = service.get_boot_epoch();
3448 superblock.clean_thru = osdmap->get_epoch();
3449 ObjectStore::Transaction t;
3450 write_superblock(t);
3451 int r = store->apply_transaction(service.meta_osr.get(), std::move(t));
3452 if (r) {
3453 derr << "OSD::shutdown: error writing superblock: "
3454 << cpp_strerror(r) << dendl;
3455 }
3456
3457
3458 {
3459 Mutex::Locker l(pg_stat_queue_lock);
3460 assert(pg_stat_queue.empty());
3461 }
3462
31f18b77
FG
3463 service.shutdown_reserver();
3464
7c673cae
FG
3465 // Remove PGs
3466#ifdef PG_DEBUG_REFS
3467 service.dump_live_pgids();
3468#endif
3469 {
3470 RWLock::RLocker l(pg_map_lock);
3471 for (ceph::unordered_map<spg_t, PG*>::iterator p = pg_map.begin();
3472 p != pg_map.end();
3473 ++p) {
3474 dout(20) << " kicking pg " << p->first << dendl;
3475 p->second->lock();
3476 if (p->second->ref != 1) {
3477 derr << "pgid " << p->first << " has ref count of "
3478 << p->second->ref << dendl;
3479#ifdef PG_DEBUG_REFS
3480 p->second->dump_live_ids();
3481#endif
31f18b77
FG
3482 if (cct->_conf->osd_shutdown_pgref_assert) {
3483 ceph_abort();
3484 }
7c673cae
FG
3485 }
3486 p->second->unlock();
3487 p->second->put("PGMap");
3488 }
3489 pg_map.clear();
3490 }
3491#ifdef PG_DEBUG_REFS
3492 service.dump_live_pgids();
3493#endif
3494 cct->_conf->remove_observer(this);
3495
3496 dout(10) << "syncing store" << dendl;
3497 enable_disable_fuse(true);
3498
3499 if (cct->_conf->osd_journal_flush_on_shutdown) {
3500 dout(10) << "flushing journal" << dendl;
3501 store->flush_journal();
3502 }
3503
3504 store->umount();
3505 delete store;
3506 store = 0;
3507 dout(10) << "Store synced" << dendl;
3508
3509 monc->shutdown();
3510 osd_lock.Unlock();
3511
3512 osdmap = OSDMapRef();
3513 service.shutdown();
3514 op_tracker.on_shutdown();
3515
3516 class_handler->shutdown();
3517 client_messenger->shutdown();
3518 cluster_messenger->shutdown();
3519 hb_front_client_messenger->shutdown();
3520 hb_back_client_messenger->shutdown();
3521 objecter_messenger->shutdown();
3522 hb_front_server_messenger->shutdown();
3523 hb_back_server_messenger->shutdown();
3524
3525 peering_wq.clear();
3526
3527 return r;
3528}
3529
3530int OSD::mon_cmd_maybe_osd_create(string &cmd)
3531{
3532 bool created = false;
3533 while (true) {
3534 dout(10) << __func__ << " cmd: " << cmd << dendl;
3535 vector<string> vcmd{cmd};
3536 bufferlist inbl;
3537 C_SaferCond w;
3538 string outs;
3539 monc->start_mon_command(vcmd, inbl, NULL, &outs, &w);
3540 int r = w.wait();
3541 if (r < 0) {
3542 if (r == -ENOENT && !created) {
3543 string newcmd = "{\"prefix\": \"osd create\", \"id\": " + stringify(whoami)
3544 + ", \"uuid\": \"" + stringify(superblock.osd_fsid) + "\"}";
3545 vector<string> vnewcmd{newcmd};
3546 bufferlist inbl;
3547 C_SaferCond w;
3548 string outs;
3549 monc->start_mon_command(vnewcmd, inbl, NULL, &outs, &w);
3550 int r = w.wait();
3551 if (r < 0) {
3552 derr << __func__ << " fail: osd does not exist and created failed: "
3553 << cpp_strerror(r) << dendl;
3554 return r;
3555 }
3556 created = true;
3557 continue;
3558 }
3559 derr << __func__ << " fail: '" << outs << "': " << cpp_strerror(r) << dendl;
3560 return r;
3561 }
3562 break;
3563 }
3564
3565 return 0;
3566}
3567
3568int OSD::update_crush_location()
3569{
3570 if (!cct->_conf->osd_crush_update_on_start) {
3571 dout(10) << __func__ << " osd_crush_update_on_start = false" << dendl;
3572 return 0;
3573 }
3574
3575 char weight[32];
3576 if (cct->_conf->osd_crush_initial_weight >= 0) {
3577 snprintf(weight, sizeof(weight), "%.4lf", cct->_conf->osd_crush_initial_weight);
3578 } else {
3579 struct store_statfs_t st;
3580 int r = store->statfs(&st);
3581 if (r < 0) {
3582 derr << "statfs: " << cpp_strerror(r) << dendl;
3583 return r;
3584 }
3585 snprintf(weight, sizeof(weight), "%.4lf",
3586 MAX((double).00001,
3587 (double)(st.total) /
3588 (double)(1ull << 40 /* TB */)));
3589 }
3590
3591 std::multimap<string,string> loc = cct->crush_location.get_location();
3592 dout(10) << __func__ << " crush location is " << loc << dendl;
3593
3594 string cmd =
3595 string("{\"prefix\": \"osd crush create-or-move\", ") +
3596 string("\"id\": ") + stringify(whoami) + string(", ") +
3597 string("\"weight\":") + weight + string(", ") +
3598 string("\"args\": [");
3599 for (multimap<string,string>::iterator p = loc.begin(); p != loc.end(); ++p) {
3600 if (p != loc.begin())
3601 cmd += ", ";
3602 cmd += "\"" + p->first + "=" + p->second + "\"";
3603 }
3604 cmd += "]}";
3605
3606 return mon_cmd_maybe_osd_create(cmd);
3607}
3608
3609int OSD::update_crush_device_class()
3610{
224ce89b
WB
3611 if (!cct->_conf->osd_class_update_on_start) {
3612 dout(10) << __func__ << " osd_class_update_on_start = false" << dendl;
3613 return 0;
3614 }
3615
7c673cae
FG
3616 string device_class;
3617 int r = store->read_meta("crush_device_class", &device_class);
224ce89b
WB
3618 if (r < 0 || device_class.empty()) {
3619 device_class = store->get_default_device_class();
3620 }
3621
3622 if (device_class.empty()) {
d2e6a577 3623 dout(20) << __func__ << " no device class stored locally" << dendl;
7c673cae 3624 return 0;
224ce89b 3625 }
7c673cae
FG
3626
3627 string cmd =
3628 string("{\"prefix\": \"osd crush set-device-class\", ") +
224ce89b
WB
3629 string("\"class\": \"") + device_class + string("\", ") +
3630 string("\"ids\": [\"") + stringify(whoami) + string("\"]}");
7c673cae 3631
224ce89b 3632 r = mon_cmd_maybe_osd_create(cmd);
d2e6a577
FG
3633 // the above cmd can fail for various reasons, e.g.:
3634 // (1) we are connecting to a pre-luminous monitor
3635 // (2) user manually specify a class other than
3636 // 'ceph-disk prepare --crush-device-class'
3637 // simply skip result-checking for now
3638 return 0;
7c673cae
FG
3639}
3640
3641void OSD::write_superblock(ObjectStore::Transaction& t)
3642{
3643 dout(10) << "write_superblock " << superblock << dendl;
3644
3645 //hack: at minimum it's using the baseline feature set
3646 if (!superblock.compat_features.incompat.contains(CEPH_OSD_FEATURE_INCOMPAT_BASE))
3647 superblock.compat_features.incompat.insert(CEPH_OSD_FEATURE_INCOMPAT_BASE);
3648
3649 bufferlist bl;
3650 ::encode(superblock, bl);
3651 t.write(coll_t::meta(), OSD_SUPERBLOCK_GOBJECT, 0, bl.length(), bl);
3652}
3653
3654int OSD::read_superblock()
3655{
3656 bufferlist bl;
3657 int r = store->read(coll_t::meta(), OSD_SUPERBLOCK_GOBJECT, 0, 0, bl);
3658 if (r < 0)
3659 return r;
3660
3661 bufferlist::iterator p = bl.begin();
3662 ::decode(superblock, p);
3663
3664 dout(10) << "read_superblock " << superblock << dendl;
3665
3666 return 0;
3667}
3668
3669void OSD::clear_temp_objects()
3670{
3671 dout(10) << __func__ << dendl;
3672 vector<coll_t> ls;
3673 store->list_collections(ls);
3674 for (vector<coll_t>::iterator p = ls.begin(); p != ls.end(); ++p) {
3675 spg_t pgid;
3676 if (!p->is_pg(&pgid))
3677 continue;
3678
3679 // list temp objects
3680 dout(20) << " clearing temps in " << *p << " pgid " << pgid << dendl;
3681
3682 vector<ghobject_t> temps;
3683 ghobject_t next;
3684 while (1) {
3685 vector<ghobject_t> objects;
3686 store->collection_list(*p, next, ghobject_t::get_max(),
3687 store->get_ideal_list_max(),
3688 &objects, &next);
3689 if (objects.empty())
3690 break;
3691 vector<ghobject_t>::iterator q;
3692 for (q = objects.begin(); q != objects.end(); ++q) {
3693 // Hammer set pool for temps to -1, so check for clean-up
3694 if (q->hobj.is_temp() || (q->hobj.pool == -1)) {
3695 temps.push_back(*q);
3696 } else {
3697 break;
3698 }
3699 }
3700 // If we saw a non-temp object and hit the break above we can
3701 // break out of the while loop too.
3702 if (q != objects.end())
3703 break;
3704 }
3705 if (!temps.empty()) {
3706 ObjectStore::Transaction t;
3707 int removed = 0;
3708 for (vector<ghobject_t>::iterator q = temps.begin(); q != temps.end(); ++q) {
3709 dout(20) << " removing " << *p << " object " << *q << dendl;
3710 t.remove(*p, *q);
3711 if (++removed > cct->_conf->osd_target_transaction_size) {
3712 store->apply_transaction(service.meta_osr.get(), std::move(t));
3713 t = ObjectStore::Transaction();
3714 removed = 0;
3715 }
3716 }
3717 if (removed) {
3718 store->apply_transaction(service.meta_osr.get(), std::move(t));
3719 }
3720 }
3721 }
3722}
3723
3724void OSD::recursive_remove_collection(CephContext* cct,
3725 ObjectStore *store, spg_t pgid,
3726 coll_t tmp)
3727{
3728 OSDriver driver(
3729 store,
3730 coll_t(),
3731 make_snapmapper_oid());
3732
3733 ceph::shared_ptr<ObjectStore::Sequencer> osr (std::make_shared<
3734 ObjectStore::Sequencer>("rm"));
3735 ObjectStore::Transaction t;
3736 SnapMapper mapper(cct, &driver, 0, 0, 0, pgid.shard);
3737
3738 vector<ghobject_t> objects;
3739 store->collection_list(tmp, ghobject_t(), ghobject_t::get_max(),
3740 INT_MAX, &objects, 0);
3741 generic_dout(10) << __func__ << " " << objects << dendl;
3742 // delete them.
3743 int removed = 0;
3744 for (vector<ghobject_t>::iterator p = objects.begin();
3745 p != objects.end();
3746 ++p, removed++) {
3747 OSDriver::OSTransaction _t(driver.get_transaction(&t));
3748 int r = mapper.remove_oid(p->hobj, &_t);
3749 if (r != 0 && r != -ENOENT)
3750 ceph_abort();
3751 t.remove(tmp, *p);
3752 if (removed > cct->_conf->osd_target_transaction_size) {
3753 int r = store->apply_transaction(osr.get(), std::move(t));
3754 assert(r == 0);
3755 t = ObjectStore::Transaction();
3756 removed = 0;
3757 }
3758 }
3759 t.remove_collection(tmp);
3760 int r = store->apply_transaction(osr.get(), std::move(t));
3761 assert(r == 0);
3762
3763 C_SaferCond waiter;
3764 if (!osr->flush_commit(&waiter)) {
3765 waiter.wait();
3766 }
3767}
3768
3769
3770// ======================================================
3771// PG's
3772
3773PGPool OSD::_get_pool(int id, OSDMapRef createmap)
3774{
3775 if (!createmap->have_pg_pool(id)) {
3776 dout(5) << __func__ << ": the OSDmap does not contain a PG pool with id = "
3777 << id << dendl;
3778 ceph_abort();
3779 }
3780
3781 PGPool p = PGPool(cct, createmap, id);
3782
3783 dout(10) << "_get_pool " << p.id << dendl;
3784 return p;
3785}
3786
3787PG *OSD::_open_lock_pg(
3788 OSDMapRef createmap,
3789 spg_t pgid, bool no_lockdep_check)
3790{
3791 assert(osd_lock.is_locked());
3792
3793 PG* pg = _make_pg(createmap, pgid);
3794 {
3795 RWLock::WLocker l(pg_map_lock);
3796 pg->lock(no_lockdep_check);
3797 pg_map[pgid] = pg;
3798 pg->get("PGMap"); // because it's in pg_map
3799 service.pg_add_epoch(pg->info.pgid, createmap->get_epoch());
3800 }
3801 return pg;
3802}
3803
3804PG* OSD::_make_pg(
3805 OSDMapRef createmap,
3806 spg_t pgid)
3807{
3808 dout(10) << "_open_lock_pg " << pgid << dendl;
3809 PGPool pool = _get_pool(pgid.pool(), createmap);
3810
3811 // create
3812 PG *pg;
3813 if (createmap->get_pg_type(pgid.pgid) == pg_pool_t::TYPE_REPLICATED ||
3814 createmap->get_pg_type(pgid.pgid) == pg_pool_t::TYPE_ERASURE)
3815 pg = new PrimaryLogPG(&service, createmap, pool, pgid);
3816 else
3817 ceph_abort();
3818
3819 return pg;
3820}
3821
3822
3823void OSD::add_newly_split_pg(PG *pg, PG::RecoveryCtx *rctx)
3824{
3825 epoch_t e(service.get_osdmap()->get_epoch());
3826 pg->get("PGMap"); // For pg_map
3827 pg_map[pg->info.pgid] = pg;
3828 service.pg_add_epoch(pg->info.pgid, pg->get_osdmap()->get_epoch());
3829
3830 dout(10) << "Adding newly split pg " << *pg << dendl;
3831 pg->handle_loaded(rctx);
3832 pg->write_if_dirty(*(rctx->transaction));
3833 pg->queue_null(e, e);
3834 map<spg_t, list<PG::CephPeeringEvtRef> >::iterator to_wake =
3835 peering_wait_for_split.find(pg->info.pgid);
3836 if (to_wake != peering_wait_for_split.end()) {
3837 for (list<PG::CephPeeringEvtRef>::iterator i =
3838 to_wake->second.begin();
3839 i != to_wake->second.end();
3840 ++i) {
3841 pg->queue_peering_event(*i);
3842 }
3843 peering_wait_for_split.erase(to_wake);
3844 }
3845 if (!service.get_osdmap()->have_pg_pool(pg->info.pgid.pool()))
3846 _remove_pg(pg);
3847}
3848
3849OSD::res_result OSD::_try_resurrect_pg(
3850 OSDMapRef curmap, spg_t pgid, spg_t *resurrected, PGRef *old_pg_state)
3851{
3852 assert(resurrected);
3853 assert(old_pg_state);
3854 // find nearest ancestor
3855 DeletingStateRef df;
3856 spg_t cur(pgid);
3857 while (true) {
3858 df = service.deleting_pgs.lookup(cur);
3859 if (df)
3860 break;
3861 if (!cur.ps())
3862 break;
3863 cur = cur.get_parent();
3864 }
3865 if (!df)
3866 return RES_NONE; // good to go
3867
3868 df->old_pg_state->lock();
3869 OSDMapRef create_map = df->old_pg_state->get_osdmap();
3870 df->old_pg_state->unlock();
3871
3872 set<spg_t> children;
3873 if (cur == pgid) {
3874 if (df->try_stop_deletion()) {
3875 dout(10) << __func__ << ": halted deletion on pg " << pgid << dendl;
3876 *resurrected = cur;
3877 *old_pg_state = df->old_pg_state;
3878 service.deleting_pgs.remove(pgid); // PG is no longer being removed!
3879 return RES_SELF;
3880 } else {
3881 // raced, ensure we don't see DeletingStateRef when we try to
3882 // delete this pg
3883 service.deleting_pgs.remove(pgid);
3884 return RES_NONE;
3885 }
3886 } else if (cur.is_split(create_map->get_pg_num(cur.pool()),
3887 curmap->get_pg_num(cur.pool()),
3888 &children) &&
3889 children.count(pgid)) {
3890 if (df->try_stop_deletion()) {
3891 dout(10) << __func__ << ": halted deletion on ancestor pg " << pgid
3892 << dendl;
3893 *resurrected = cur;
3894 *old_pg_state = df->old_pg_state;
3895 service.deleting_pgs.remove(cur); // PG is no longer being removed!
3896 return RES_PARENT;
3897 } else {
3898 /* this is not a problem, failing to cancel proves that all objects
3899 * have been removed, so no hobject_t overlap is possible
3900 */
3901 return RES_NONE;
3902 }
3903 }
3904 return RES_NONE;
3905}
3906
3907PG *OSD::_create_lock_pg(
3908 OSDMapRef createmap,
3909 spg_t pgid,
3910 bool hold_map_lock,
3911 bool backfill,
3912 int role,
3913 vector<int>& up, int up_primary,
3914 vector<int>& acting, int acting_primary,
3915 pg_history_t history,
3916 const PastIntervals& pi,
3917 ObjectStore::Transaction& t)
3918{
3919 assert(osd_lock.is_locked());
3920 dout(20) << "_create_lock_pg pgid " << pgid << dendl;
3921
3922 PG *pg = _open_lock_pg(createmap, pgid, true);
3923
3924 service.init_splits_between(pgid, pg->get_osdmap(), service.get_osdmap());
3925
3926 pg->init(
3927 role,
3928 up,
3929 up_primary,
3930 acting,
3931 acting_primary,
3932 history,
3933 pi,
3934 backfill,
3935 &t);
3936
3937 dout(7) << "_create_lock_pg " << *pg << dendl;
3938 return pg;
3939}
3940
3941PG *OSD::_lookup_lock_pg(spg_t pgid)
3942{
3943 RWLock::RLocker l(pg_map_lock);
3944
3945 auto pg_map_entry = pg_map.find(pgid);
3946 if (pg_map_entry == pg_map.end())
3947 return nullptr;
3948 PG *pg = pg_map_entry->second;
3949 pg->lock();
3950 return pg;
3951}
3952
31f18b77
FG
3953PG *OSD::lookup_lock_pg(spg_t pgid)
3954{
3955 return _lookup_lock_pg(pgid);
3956}
3957
7c673cae
FG
3958PG *OSD::_lookup_lock_pg_with_map_lock_held(spg_t pgid)
3959{
3960 assert(pg_map.count(pgid));
3961 PG *pg = pg_map[pgid];
3962 pg->lock();
3963 return pg;
3964}
3965
// Scan the object store's collections and open every PG found there.
//
// Requires osd_lock to be held and pg_map to be empty (both asserted).
// For each collection:
//   - temp collections, and PG collections carrying the removal flag, are
//     deleted with recursive_remove_collection();
//   - non-PG collections and localized ("preferred") PGs are skipped;
//   - otherwise the PG is opened against the osdmap of the epoch recorded on
//     disk (or the current osdmap when that epoch is 0), its state is read,
//     it is upgraded if needed, and its role/mapping is initialised.
// PGs whose on-disk state reports dne() are unregistered and deleted.
// Finishes by removing the legacy infos object (if any PG was upgraded) and
// calling build_past_intervals_parallel().
3966 void OSD::load_pgs()
3967 {
3968 assert(osd_lock.is_locked());
3969 dout(0) << "load_pgs" << dendl;
3970 {
3971 RWLock::RLocker l(pg_map_lock);
3972 assert(pg_map.empty());
3973 }
3974
3975 vector<coll_t> ls;
3976 int r = store->list_collections(ls);
// NOTE(review): a listing failure is only logged; the loop below still runs
// over whatever ended up in `ls`.
3977 if (r < 0) {
3978 derr << "failed to list pgs: " << cpp_strerror(-r) << dendl;
3979 }
3980
// Set once any PG goes through pg->upgrade(); gates the legacy infos cleanup.
3981 bool has_upgraded = false;
3982
3983 for (vector<coll_t>::iterator it = ls.begin();
3984 it != ls.end();
3985 ++it) {
3986 spg_t pgid;
// Temp collections and PGs flagged for removal are deleted outright.
3987 if (it->is_temp(&pgid) ||
3988 (it->is_pg(&pgid) && PG::_has_removal_flag(store, pgid))) {
3989 dout(10) << "load_pgs " << *it << " clearing temp" << dendl;
3990 recursive_remove_collection(cct, store, pgid, *it);
3991 continue;
3992 }
3993
3994 if (!it->is_pg(&pgid)) {
3995 dout(10) << "load_pgs ignoring unrecognized " << *it << dendl;
3996 continue;
3997 }
3998
3999 if (pgid.preferred() >= 0) {
4000 dout(10) << __func__ << ": skipping localized PG " << pgid << dendl;
4001 // FIXME: delete it too, eventually
4002 continue;
4003 }
4004
4005 dout(10) << "pgid " << pgid << " coll " << coll_t(pgid) << dendl;
4006 bufferlist bl;
4007 epoch_t map_epoch = 0;
// This `r` shadows the outer one declared above.
4008 int r = PG::peek_map_epoch(store, pgid, &map_epoch, &bl);
4009 if (r < 0) {
4010 derr << __func__ << " unable to peek at " << pgid << " metadata, skipping"
4011 << dendl;
4012 continue;
4013 }
4014
4015 PG *pg = NULL;
// map_epoch > 0: open the PG against the osdmap of that epoch.
// map_epoch == 0: fall back to the current osdmap.
4016 if (map_epoch > 0) {
4017 OSDMapRef pgosdmap = service.try_get_map(map_epoch);
4018 if (!pgosdmap) {
4019 if (!osdmap->have_pg_pool(pgid.pool())) {
4020 derr << __func__ << ": could not find map for epoch " << map_epoch
4021 << " on pg " << pgid << ", but the pool is not present in the "
4022 << "current map, so this is probably a result of bug 10617. "
4023 << "Skipping the pg for now, you can use ceph-objectstore-tool "
4024 << "to clean it up later." << dendl;
4025 continue;
4026 } else {
4027 derr << __func__ << ": have pgid " << pgid << " at epoch "
4028 << map_epoch << ", but missing map. Crashing."
4029 << dendl;
4030 assert(0 == "Missing map in load_pgs");
4031 }
4032 }
4033 pg = _open_lock_pg(pgosdmap, pgid);
4034 } else {
4035 pg = _open_lock_pg(osdmap, pgid);
4036 }
4037 // there can be no waiters here, so we don't call wake_pg_waiters
4038
4039 pg->ch = store->open_collection(pg->coll);
4040
4041 // read pg state, log
4042 pg->read_state(store, bl);
4043
4044 if (pg->must_upgrade()) {
4045 if (!pg->can_upgrade()) {
4046 derr << "PG needs upgrade, but on-disk data is too old; upgrade to"
4047 << " an older version first." << dendl;
4048 assert(0 == "PG too old to upgrade");
4049 }
4050 if (!has_upgraded) {
4051 derr << "PGs are upgrading" << dendl;
4052 has_upgraded = true;
4053 }
4054 dout(10) << "PG " << pg->info.pgid
4055 << " must upgrade..." << dendl;
4056 pg->upgrade(store);
4057 }
4058
94b18763
FG
// A PG whose loaded state is dne() is undone: drop its collection handle and
// epoch registration, unlock it, erase it from pg_map (releasing the "PGMap"
// reference), then delete its collection.
4059 if (pg->dne()) {
4060 dout(10) << "load_pgs " << *it << " deleting dne" << dendl;
4061 pg->ch = nullptr;
4062 service.pg_remove_epoch(pg->pg_id);
4063 pg->unlock();
4064 {
4065 // Delete pg
4066 RWLock::WLocker l(pg_map_lock);
4067 auto p = pg_map.find(pg->get_pgid());
4068 assert(p != pg_map.end() && p->second == pg);
4069 dout(20) << __func__ << " removed pg " << pg << " from pg_map" << dendl;
4070 pg_map.erase(p);
4071 pg->put("PGMap");
4072 }
4073 recursive_remove_collection(cct, store, pgid, *it);
4074 continue;
4075 }
4076
7c673cae
FG
4077 service.init_splits_between(pg->info.pgid, pg->get_osdmap(), osdmap);
4078
4079 // generate state for PG's current mapping
4080 int primary, up_primary;
4081 vector<int> acting, up;
4082 pg->get_osdmap()->pg_to_up_acting_osds(
4083 pgid.pgid, &up, &up_primary, &acting, &primary);
4084 pg->init_primary_up_acting(
4085 up,
4086 acting,
4087 up_primary,
4088 primary);
// For non-replicated pools the computed role is kept only when it equals this
// PG's shard; otherwise the role is set to -1.
4089 int role = OSDMap::calc_pg_role(whoami, pg->acting);
4090 if (pg->pool.info.is_replicated() || role == pg->pg_whoami.shard)
4091 pg->set_role(role);
4092 else
4093 pg->set_role(-1);
4094
4095 pg->reg_next_scrub();
4096
4097 PG::RecoveryCtx rctx(0, 0, 0, 0, 0, 0);
4098 pg->handle_loaded(&rctx);
4099
4100 dout(10) << "load_pgs loaded " << *pg << " " << pg->pg_log.get_log() << dendl;
4101 if (pg->pg_log.is_dirty()) {
4102 ObjectStore::Transaction t;
4103 pg->write_if_dirty(t);
// NOTE(review): the apply_transaction() result is not checked here.
4104 store->apply_transaction(pg->osr.get(), std::move(t));
4105 }
4106 pg->unlock();
4107 }
4108 {
4109 RWLock::RLocker l(pg_map_lock);
4110 dout(0) << "load_pgs opened " << pg_map.size() << " pgs" << dendl;
4111 }
4112
4113 // clean up old infos object?
4114 if (has_upgraded && store->exists(coll_t::meta(), OSD::make_infos_oid())) {
4115 dout(1) << __func__ << " removing legacy infos object" << dendl;
4116 ObjectStore::Transaction t;
4117 t.remove(coll_t::meta(), OSD::make_infos_oid());
4118 int r = store->apply_transaction(service.meta_osr.get(), std::move(t));
4119 if (r != 0) {
4120 derr << __func__ << ": apply_transaction returned "
4121 << cpp_strerror(r) << dendl;
4122 ceph_abort();
4123 }
4124 }
4125
4126 build_past_intervals_parallel();
4127 }
4128
4129
4130/*
4131 * build past_intervals efficiently on old, degraded, and buried
4132 * clusters. this is important for efficiently catching up osds that
4133 * are way behind on maps to the current cluster state.
4134 *
4135 * this is a parallel version of PG::generate_past_intervals().
4136 * follow the same logic, but do all pgs at the same time so that we
4137 * can make a single pass across the osdmap history.
4138 */
// Rebuild past_intervals for every PG in pg_map whose stored intervals do not
// cover the bounds returned by get_required_past_interval_bounds().
//
// Phase 1: collect the PGs that need work and the union [cur_epoch, end_epoch]
//          of their required ranges.
// Phase 2: walk that epoch range once, feeding each PG's mapping changes to
//          PastIntervals::check_new_interval().
// Phase 3: repair history.same_interval_since where it is 0.
// Phase 4: mark each processed PG dirty and write it out in batches of
//          osd_target_transaction_size.
4139 void OSD::build_past_intervals_parallel()
4140 {
// Per-PG scan state: required epoch range plus the mapping seen at the start
// of the interval currently being tracked.
4141 struct pistate {
4142 epoch_t start, end;
4143 vector<int> old_acting, old_up;
4144 epoch_t same_interval_since;
4145 int primary;
4146 int up_primary;
4147 };
4148 map<PG*,pistate> pis;
4149
4150 // calculate junction of map range
// Deliberately start inverted (end = oldest, cur = newest); the loop below
// lowers cur_epoch / raises end_epoch to cover every PG that needs work.
4151 epoch_t end_epoch = superblock.oldest_map;
4152 epoch_t cur_epoch = superblock.newest_map;
4153 {
4154 RWLock::RLocker l(pg_map_lock);
4155 for (ceph::unordered_map<spg_t, PG*>::iterator i = pg_map.begin();
4156 i != pg_map.end();
4157 ++i) {
4158 PG *pg = i->second;
4159
3efd9988
FG
4160 // Ignore PGs only partially created (DNE)
4161 if (pg->info.dne()) {
4162 continue;
4163 }
4164
7c673cae
FG
// rpib = required bounds; apib (below) = bounds actually stored.
4165 auto rpib = pg->get_required_past_interval_bounds(
4166 pg->info,
4167 superblock.oldest_map);
// Empty required range and nothing stored: nothing to build for this PG.
4168 if (rpib.first >= rpib.second && pg->past_intervals.empty()) {
4169 if (pg->info.history.same_interval_since == 0) {
4170 pg->info.history.same_interval_since = rpib.second;
4171 }
4172 continue;
4173 } else {
// Stored intervals already span the required range: skip this PG.
4174 auto apib = pg->past_intervals.get_bounds();
4175 if (apib.second >= rpib.second &&
4176 apib.first <= rpib.first) {
4177 if (pg->info.history.same_interval_since == 0) {
4178 pg->info.history.same_interval_since = rpib.second;
4179 }
4180 continue;
4181 }
4182 }
4183
4184 dout(10) << pg->info.pgid << " needs " << rpib.first << "-"
4185 << rpib.second << dendl;
4186 pistate& p = pis[pg];
4187 p.start = rpib.first;
4188 p.end = rpib.second;
// 0 marks "no map processed yet" for this PG in the epoch walk below.
4189 p.same_interval_since = 0;
4190
4191 if (rpib.first < cur_epoch)
4192 cur_epoch = rpib.first;
4193 if (rpib.second > end_epoch)
4194 end_epoch = rpib.second;
4195 }
4196 }
4197 if (pis.empty()) {
4198 dout(10) << __func__ << " nothing to build" << dendl;
4199 return;
4200 }
4201
4202 dout(1) << __func__ << " over " << cur_epoch << "-" << end_epoch << dendl;
4203 assert(cur_epoch <= end_epoch);
4204
// Single pass over the osdmap history; last_map is the map of the previous
// iteration (null on the first one).
4205 OSDMapRef cur_map, last_map;
4206 for ( ; cur_epoch <= end_epoch; cur_epoch++) {
4207 dout(10) << __func__ << " epoch " << cur_epoch << dendl;
4208 last_map = cur_map;
4209 cur_map = get_map(cur_epoch);
4210
4211 for (map<PG*,pistate>::iterator i = pis.begin(); i != pis.end(); ++i) {
4212 PG *pg = i->first;
4213 pistate& p = i->second;
4214
// Only epochs inside this PG's required range are relevant.
4215 if (cur_epoch < p.start || cur_epoch > p.end)
4216 continue;
4217
4218 vector<int> acting, up;
4219 int up_primary;
4220 int primary;
4221 pg_t pgid = pg->info.pgid.pgid;
// Once a first map has been seen, map the pgid to its ancestor under the
// previous map's pg_num (if the pool existed in that map).
4222 if (p.same_interval_since && last_map->get_pools().count(pgid.pool()))
4223 pgid = pgid.get_ancestor(last_map->get_pg_num(pgid.pool()));
4224 cur_map->pg_to_up_acting_osds(
4225 pgid, &up, &up_primary, &acting, &primary);
4226
// First map for this PG: just record the baseline mapping.
4227 if (p.same_interval_since == 0) {
4228 dout(10) << __func__ << " epoch " << cur_epoch << " pg " << pg->info.pgid
4229 << " first map, acting " << acting
4230 << " up " << up << ", same_interval_since = " << cur_epoch << dendl;
4231 p.same_interval_since = cur_epoch;
4232 p.old_up = up;
4233 p.old_acting = acting;
4234 p.primary = primary;
4235 p.up_primary = up_primary;
4236 continue;
4237 }
4238 assert(last_map);
4239
4240 boost::scoped_ptr<IsPGRecoverablePredicate> recoverable(
4241 pg->get_is_recoverable_predicate());
4242 std::stringstream debug;
// Compares the recorded baseline with this epoch's mapping; pg->past_intervals
// is passed in as the output argument.
4243 bool new_interval = PastIntervals::check_new_interval(
4244 p.primary,
4245 primary,
4246 p.old_acting, acting,
4247 p.up_primary,
4248 up_primary,
4249 p.old_up, up,
4250 p.same_interval_since,
4251 pg->info.history.last_epoch_clean,
4252 cur_map, last_map,
4253 pgid,
4254 recoverable.get(),
4255 &pg->past_intervals,
4256 &debug);
// A new interval started at cur_epoch: it becomes the new baseline.
4257 if (new_interval) {
4258 dout(10) << __func__ << " epoch " << cur_epoch << " pg " << pg->info.pgid
4259 << " " << debug.str() << dendl;
4260 p.old_up = up;
4261 p.old_acting = acting;
4262 p.primary = primary;
4263 p.up_primary = up_primary;
4264 p.same_interval_since = cur_epoch;
4265 }
4266 }
4267 }
4268
4269 // Now that past_intervals have been recomputed let's fix the same_interval_since
4270 // if it was cleared by import.
4271 for (map<PG*,pistate>::iterator i = pis.begin(); i != pis.end(); ++i) {
4272 PG *pg = i->first;
4273 pistate& p = i->second;
4274
4275 if (pg->info.history.same_interval_since == 0) {
4276 assert(p.same_interval_since);
4277 dout(10) << __func__ << " fix same_interval_since " << p.same_interval_since << " pg " << *pg << dendl;
4278 dout(10) << __func__ << " past_intervals " << pg->past_intervals << dendl;
4279 // Fix it
4280 pg->info.history.same_interval_since = p.same_interval_since;
4281 }
4282 }
4283
4284 // write info only at the end. this is necessary because we check
4285 // whether the past_intervals go far enough back or forward in time,
4286 // but we don't check for holes. we could avoid it by discarding
4287 // the previous past_intervals and rebuilding from scratch, or we
4288 // can just do this and commit all our work at the end.
4289 ObjectStore::Transaction t;
4290 int num = 0;
4291 for (map<PG*,pistate>::iterator i = pis.begin(); i != pis.end(); ++i) {
4292 PG *pg = i->first;
4293 pg->lock();
// Force both info flags dirty so write_if_dirty() has something to persist.
4294 pg->dirty_big_info = true;
4295 pg->dirty_info = true;
4296 pg->write_if_dirty(t);
4297 pg->unlock();
4298
4299 // don't let the transaction get too big
4300 if (++num >= cct->_conf->osd_target_transaction_size) {
// NOTE(review): apply_transaction() results are not checked in this function.
4301 store->apply_transaction(service.meta_osr.get(), std::move(t));
4302 t = ObjectStore::Transaction();
4303 num = 0;
4304 }
4305 }
4306 if (!t.empty())
4307 store->apply_transaction(service.meta_osr.get(), std::move(t));
4308 }
4309
4310/*
4311 * look up a pg. if we have it, great. if not, consider creating it IF the pg mapping
4312 * hasn't changed since the given epoch and we are the primary.
4313 */
// Deliver peering event `evt` to PG `pgid`, creating or resurrecting the PG
// first if this OSD does not have it.
//
// Return values visible in this body:
//   -EEXIST  the PG is splitting (event parked in peering_wait_for_split) or
//            the PG already exists (event queued unless `epoch` predates the
//            PG's same_interval_since)
//   -EINVAL  the pool is not in the current osdmap, or the projected history
//            is invalid / the mapping changed after `epoch`
//   -EAGAIN  creation was withheld by maybe_wait_for_max_pg()
//   0        the PG (or its parent) was created / resurrected
4314 int OSD::handle_pg_peering_evt(
4315 spg_t pgid,
4316 const pg_history_t& orig_history,
4317 const PastIntervals& pi,
4318 epoch_t epoch,
4319 PG::CephPeeringEvtRef evt)
4320 {
// A PG that is mid-split cannot take events yet; park this one.
4321 if (service.splitting(pgid)) {
4322 peering_wait_for_split[pgid].push_back(evt);
4323 return -EEXIST;
4324 }
4325
// On success the PG comes back locked; every path below unlocks it.
4326 PG *pg = _lookup_lock_pg(pgid);
4327 if (!pg) {
4328 // same primary?
4329 if (!osdmap->have_pg_pool(pgid.pool()))
4330 return -EINVAL;
4331 int up_primary, acting_primary;
4332 vector<int> up, acting;
4333 osdmap->pg_to_up_acting_osds(
4334 pgid.pgid, &up, &up_primary, &acting, &acting_primary);
4335
// Work on a copy so the caller's history is left untouched.
4336 pg_history_t history = orig_history;
4337 bool valid_history = project_pg_history(
4338 pgid, history, epoch, up, up_primary, acting, acting_primary);
4339
4340 if (!valid_history || epoch < history.same_interval_since) {
4341 dout(10) << __func__ << pgid << " acting changed in "
4342 << history.same_interval_since << " (msg from " << epoch << ")"
4343 << dendl;
4344 return -EINVAL;
4345 }
4346
// splitting() was already checked at the top of the function; seeing it true
// here is treated as fatal.
4347 if (service.splitting(pgid)) {
4348 ceph_abort();
4349 }
4350
3efd9988
FG
// The event counts as a mon-initiated create when its dynamic type is
// PG::NullEvt.
4351 const bool is_mon_create =
4352 evt->get_event().dynamic_type() == PG::NullEvt::static_type();
4353 if (maybe_wait_for_max_pg(pgid, is_mon_create)) {
4354 return -EAGAIN;
4355 }
7c673cae
FG
4356 // do we need to resurrect a deleting pg?
4357 spg_t resurrected;
4358 PGRef old_pg_state;
4359 res_result result = _try_resurrect_pg(
4360 service.get_osdmap(),
4361 pgid,
4362 &resurrected,
4363 &old_pg_state);
4364
4365 PG::RecoveryCtx rctx = create_context();
4366 switch (result) {
// RES_NONE: nothing to resurrect, create the PG from scratch.
4367 case RES_NONE: {
4368 const pg_pool_t* pp = osdmap->get_pg_pool(pgid.pool());
4369 if (pp->has_flag(pg_pool_t::FLAG_EC_OVERWRITES) &&
4370 store->get_type() != "bluestore") {
4371 clog->warn() << "pg " << pgid
4372 << " is at risk of silent data corruption: "
4373 << "the pool allows ec overwrites but is not stored in "
4374 << "bluestore, so deep scrubbing will not detect bitrot";
4375 }
4376 PG::_create(*rctx.transaction, pgid, pgid.get_split_bits(pp->get_pg_num()));
4377 PG::_init(*rctx.transaction, pgid, pp);
4378
// For non-replicated pools the role is kept only if it matches this shard.
4379 int role = osdmap->calc_pg_role(whoami, acting, acting.size());
4380 if (!pp->is_replicated() && role != pgid.shard)
4381 role = -1;
4382
4383 pg = _create_lock_pg(
4384 get_map(epoch),
4385 pgid, false, false,
4386 role,
4387 up, up_primary,
4388 acting, acting_primary,
4389 history, pi,
4390 *rctx.transaction);
4391 pg->handle_create(&rctx);
4392 pg->write_if_dirty(*rctx.transaction);
4393 dispatch_context(rctx, pg, osdmap);
4394
4395 dout(10) << *pg << " is new" << dendl;
4396
4397 pg->queue_peering_event(evt);
4398 wake_pg_waiters(pg);
4399 pg->unlock();
4400 return 0;
4401 }
// RES_SELF: deletion of this very PG was halted; recreate it from the state
// captured in old_pg_state (copied out while holding that PG's lock).
4402 case RES_SELF: {
4403 old_pg_state->lock();
4404 OSDMapRef old_osd_map = old_pg_state->get_osdmap();
4405 int old_role = old_pg_state->role;
4406 vector<int> old_up = old_pg_state->up;
4407 int old_up_primary = old_pg_state->up_primary.osd;
4408 vector<int> old_acting = old_pg_state->acting;
4409 int old_primary = old_pg_state->primary.osd;
4410 pg_history_t old_history = old_pg_state->info.history;
4411 PastIntervals old_past_intervals = old_pg_state->past_intervals;
4412 old_pg_state->unlock();
4413 pg = _create_lock_pg(
4414 old_osd_map,
4415 resurrected,
4416 false,
4417 true,
4418 old_role,
4419 old_up,
4420 old_up_primary,
4421 old_acting,
4422 old_primary,
4423 old_history,
4424 old_past_intervals,
4425 *rctx.transaction);
4426 pg->handle_create(&rctx);
4427 pg->write_if_dirty(*rctx.transaction);
4428 dispatch_context(rctx, pg, osdmap);
4429
4430 dout(10) << *pg << " is new (resurrected)" << dendl;
4431
4432 pg->queue_peering_event(evt);
4433 wake_pg_waiters(pg);
4434 pg->unlock();
4435 return 0;
4436 }
// RES_PARENT: deletion of an ancestor was halted; recreate the ancestor and
// park the event until the split that yields `pgid` completes.
4437 case RES_PARENT: {
4438 assert(old_pg_state);
4439 old_pg_state->lock();
4440 OSDMapRef old_osd_map = old_pg_state->get_osdmap();
4441 int old_role = old_pg_state->role;
4442 vector<int> old_up = old_pg_state->up;
4443 int old_up_primary = old_pg_state->up_primary.osd;
4444 vector<int> old_acting = old_pg_state->acting;
4445 int old_primary = old_pg_state->primary.osd;
4446 pg_history_t old_history = old_pg_state->info.history;
4447 PastIntervals old_past_intervals = old_pg_state->past_intervals;
4448 old_pg_state->unlock();
4449 PG *parent = _create_lock_pg(
4450 old_osd_map,
4451 resurrected,
4452 false,
4453 true,
4454 old_role,
4455 old_up,
4456 old_up_primary,
4457 old_acting,
4458 old_primary,
4459 old_history,
4460 old_past_intervals,
4461 *rctx.transaction
4462 );
4463 parent->handle_create(&rctx);
4464 parent->write_if_dirty(*rctx.transaction);
4465 dispatch_context(rctx, parent, osdmap);
4466
4467 dout(10) << *parent << " is new" << dendl;
4468
4469 assert(service.splitting(pgid));
4470 peering_wait_for_split[pgid].push_back(evt);
4471
4472 //parent->queue_peering_event(evt);
4473 parent->queue_null(osdmap->get_epoch(), osdmap->get_epoch());
4474 wake_pg_waiters(parent);
4475 parent->unlock();
4476 return 0;
4477 }
4478 default:
4479 assert(0);
4480 return 0;
4481 }
4482 } else {
4483 // already had it. did the mapping change?
// An event older than the PG's current interval is dropped (only logged).
4484 if (epoch < pg->info.history.same_interval_since) {
4485 dout(10) << *pg << __func__ << " acting changed in "
4486 << pg->info.history.same_interval_since
4487 << " (msg from " << epoch << ")" << dendl;
4488 } else {
4489 pg->queue_peering_event(evt);
4490 }
4491 pg->unlock();
4492 return -EEXIST;
4493 }
4494 }
4495
3efd9988
FG
4496bool OSD::maybe_wait_for_max_pg(spg_t pgid, bool is_mon_create)
4497{
4498 const auto max_pgs_per_osd =
4499 (cct->_conf->get_val<uint64_t>("mon_max_pg_per_osd") *
4500 cct->_conf->get_val<double>("osd_max_pg_per_osd_hard_ratio"));
4501
4502 RWLock::RLocker pg_map_locker{pg_map_lock};
4503 if (pg_map.size() < max_pgs_per_osd) {
4504 return false;
4505 }
4506 lock_guard<mutex> pending_creates_locker{pending_creates_lock};
4507 if (is_mon_create) {
4508 pending_creates_from_mon++;
4509 } else {
b32b8144
FG
4510 bool is_primary = osdmap->get_pg_acting_rank(pgid.pgid, whoami) == 0;
4511 pending_creates_from_osd.emplace(pgid.pgid, is_primary);
3efd9988
FG
4512 }
4513 dout(5) << __func__ << " withhold creation of pg " << pgid
4514 << ": " << pg_map.size() << " >= "<< max_pgs_per_osd << dendl;
4515 return true;
4516}
4517
// To re-trigger peering the pg mapping has to be perturbed a little, see
// PG::should_restart_peering(). OSDMap::pg_to_up_acting_osds() falls back to
// the up set when pg_temp is empty, so an empty pg_temp would not work.
//
// Returns a mapping that differs from `acting`: just the first OSD when
// `acting` has more than one entry, otherwise `acting` with -1 appended.
static std::vector<int32_t> twiddle(const std::vector<int>& acting) {
  if (acting.size() <= 1) {
    std::vector<int32_t> padded(acting.begin(), acting.end());
    padded.push_back(-1);
    return padded;
  }
  return std::vector<int32_t>(1, acting[0]);
}
4530
4531void OSD::resume_creating_pg()
4532{
4533 bool do_sub_pg_creates = false;
b32b8144 4534 bool have_pending_creates = false;
3efd9988
FG
4535 {
4536 const auto max_pgs_per_osd =
4537 (cct->_conf->get_val<uint64_t>("mon_max_pg_per_osd") *
4538 cct->_conf->get_val<double>("osd_max_pg_per_osd_hard_ratio"));
4539 RWLock::RLocker l(pg_map_lock);
4540 if (max_pgs_per_osd <= pg_map.size()) {
4541 // this could happen if admin decreases this setting before a PG is removed
4542 return;
4543 }
4544 unsigned spare_pgs = max_pgs_per_osd - pg_map.size();
4545 lock_guard<mutex> pending_creates_locker{pending_creates_lock};
4546 if (pending_creates_from_mon > 0) {
4547 do_sub_pg_creates = true;
4548 if (pending_creates_from_mon >= spare_pgs) {
4549 spare_pgs = pending_creates_from_mon = 0;
4550 } else {
4551 spare_pgs -= pending_creates_from_mon;
4552 pending_creates_from_mon = 0;
4553 }
4554 }
4555 auto pg = pending_creates_from_osd.cbegin();
4556 while (spare_pgs > 0 && pg != pending_creates_from_osd.cend()) {
94b18763 4557 dout(20) << __func__ << " pg " << pg->first << dendl;
3efd9988 4558 vector<int> acting;
b32b8144 4559 osdmap->pg_to_up_acting_osds(pg->first, nullptr, nullptr, &acting, nullptr);
94b18763 4560 service.queue_want_pg_temp(pg->first, twiddle(acting), true);
3efd9988 4561 pg = pending_creates_from_osd.erase(pg);
94b18763 4562 do_sub_pg_creates = true;
3efd9988
FG
4563 spare_pgs--;
4564 }
b32b8144
FG
4565 have_pending_creates = (pending_creates_from_mon > 0 ||
4566 !pending_creates_from_osd.empty());
3efd9988 4567 }
b32b8144
FG
4568
4569 bool do_renew_subs = false;
3efd9988
FG
4570 if (do_sub_pg_creates) {
4571 if (monc->sub_want("osd_pg_creates", last_pg_create_epoch, 0)) {
4572 dout(4) << __func__ << ": resolicit pg creates from mon since "
4573 << last_pg_create_epoch << dendl;
b32b8144 4574 do_renew_subs = true;
3efd9988
FG
4575 }
4576 }
b32b8144
FG
4577 version_t start = osdmap->get_epoch() + 1;
4578 if (have_pending_creates) {
4579 // don't miss any new osdmap deleting PGs
4580 if (monc->sub_want("osdmap", start, 0)) {
4581 dout(4) << __func__ << ": resolicit osdmap from mon since "
4582 << start << dendl;
4583 do_renew_subs = true;
4584 }
94b18763 4585 } else if (do_sub_pg_creates) {
b32b8144
FG
4586 // no need to subscribe the osdmap continuously anymore
4587 // once the pgtemp and/or mon_subscribe(pg_creates) is sent
4588 if (monc->sub_want_increment("osdmap", start, CEPH_SUBSCRIBE_ONETIME)) {
4589 dout(4) << __func__ << ": re-subscribe osdmap(onetime) since"
4590 << start << dendl;
4591 do_renew_subs = true;
4592 }
4593 }
4594
4595 if (do_renew_subs) {
4596 monc->renew_subs();
4597 }
4598
94b18763 4599 service.send_pg_temp();
3efd9988 4600}
7c673cae
FG
4601
4602void OSD::build_initial_pg_history(
4603 spg_t pgid,
4604 epoch_t created,
4605 utime_t created_stamp,
4606 pg_history_t *h,
4607 PastIntervals *pi)
4608{
4609 dout(10) << __func__ << " " << pgid << " created " << created << dendl;
4610 h->epoch_created = created;
31f18b77 4611 h->epoch_pool_created = created;
7c673cae
FG
4612 h->same_interval_since = created;
4613 h->same_up_since = created;
4614 h->same_primary_since = created;
4615 h->last_scrub_stamp = created_stamp;
4616 h->last_deep_scrub_stamp = created_stamp;
4617 h->last_clean_scrub_stamp = created_stamp;
4618
4619 OSDMapRef lastmap = service.get_map(created);
4620 int up_primary, acting_primary;
4621 vector<int> up, acting;
4622 lastmap->pg_to_up_acting_osds(
4623 pgid.pgid, &up, &up_primary, &acting, &acting_primary);
4624
4625 ostringstream debug;
4626 for (epoch_t e = created + 1; e <= osdmap->get_epoch(); ++e) {
4627 OSDMapRef osdmap = service.get_map(e);
4628 int new_up_primary, new_acting_primary;
4629 vector<int> new_up, new_acting;
4630 osdmap->pg_to_up_acting_osds(
4631 pgid.pgid, &new_up, &new_up_primary, &new_acting, &new_acting_primary);
4632
4633 // this is a bit imprecise, but sufficient?
4634 struct min_size_predicate_t : public IsPGRecoverablePredicate {
4635 const pg_pool_t *pi;
4636 bool operator()(const set<pg_shard_t> &have) const {
4637 return have.size() >= pi->min_size;
4638 }
4639 min_size_predicate_t(const pg_pool_t *i) : pi(i) {}
4640 } min_size_predicate(osdmap->get_pg_pool(pgid.pgid.pool()));
4641
4642 bool new_interval = PastIntervals::check_new_interval(
4643 acting_primary,
4644 new_acting_primary,
4645 acting, new_acting,
4646 up_primary,
4647 new_up_primary,
4648 up, new_up,
4649 h->same_interval_since,
4650 h->last_epoch_clean,
4651 osdmap,
4652 lastmap,
4653 pgid.pgid,
4654 &min_size_predicate,
4655 pi,
4656 &debug);
4657 if (new_interval) {
4658 h->same_interval_since = e;
181888fb
FG
4659 if (up != new_up) {
4660 h->same_up_since = e;
4661 }
4662 if (acting_primary != new_acting_primary) {
4663 h->same_primary_since = e;
4664 }
4665 if (pgid.pgid.is_split(lastmap->get_pg_num(pgid.pgid.pool()),
4666 osdmap->get_pg_num(pgid.pgid.pool()),
4667 nullptr)) {
4668 h->last_epoch_split = e;
4669 }
4670 up = new_up;
4671 acting = new_acting;
4672 up_primary = new_up_primary;
4673 acting_primary = new_acting_primary;
c07f9fc5 4674 }
7c673cae
FG
4675 lastmap = osdmap;
4676 }
4677 dout(20) << __func__ << " " << debug.str() << dendl;
4678 dout(10) << __func__ << " " << *h << " " << *pi
4679 << " [" << (pi->empty() ? pair<epoch_t,epoch_t>(0,0) :
4680 pi->get_bounds()) << ")"
4681 << dendl;
4682}
4683
4684/**
4685 * Fill in the passed history so you know same_interval_since, same_up_since,
4686 * and same_primary_since.
4687 */
4688bool OSD::project_pg_history(spg_t pgid, pg_history_t& h, epoch_t from,
4689 const vector<int>& currentup,
4690 int currentupprimary,
4691 const vector<int>& currentacting,
4692 int currentactingprimary)
4693{
4694 dout(15) << "project_pg_history " << pgid
4695 << " from " << from << " to " << osdmap->get_epoch()
4696 << ", start " << h
4697 << dendl;
4698
4699 epoch_t e;
4700 for (e = osdmap->get_epoch();
4701 e > from;
4702 e--) {
4703 // verify during intermediate epoch (e-1)
4704 OSDMapRef oldmap = service.try_get_map(e-1);
4705 if (!oldmap) {
4706 dout(15) << __func__ << ": found map gap, returning false" << dendl;
4707 return false;
4708 }
4709 assert(oldmap->have_pg_pool(pgid.pool()));
4710
4711 int upprimary, actingprimary;
4712 vector<int> up, acting;
4713 oldmap->pg_to_up_acting_osds(
4714 pgid.pgid,
4715 &up,
4716 &upprimary,
4717 &acting,
4718 &actingprimary);
4719
4720 // acting set change?
4721 if ((actingprimary != currentactingprimary ||
4722 upprimary != currentupprimary ||
4723 acting != currentacting ||
4724 up != currentup) && e > h.same_interval_since) {
4725 dout(15) << "project_pg_history " << pgid << " acting|up changed in " << e
4726 << " from " << acting << "/" << up
4727 << " " << actingprimary << "/" << upprimary
4728 << " -> " << currentacting << "/" << currentup
4729 << " " << currentactingprimary << "/" << currentupprimary
4730 << dendl;
4731 h.same_interval_since = e;
4732 }
4733 // split?
4734 if (pgid.is_split(oldmap->get_pg_num(pgid.pool()),
4735 osdmap->get_pg_num(pgid.pool()),
4736 0) && e > h.same_interval_since) {
4737 h.same_interval_since = e;
4738 }
4739 // up set change?
4740 if ((up != currentup || upprimary != currentupprimary)
4741 && e > h.same_up_since) {
4742 dout(15) << "project_pg_history " << pgid << " up changed in " << e
4743 << " from " << up << " " << upprimary
4744 << " -> " << currentup << " " << currentupprimary << dendl;
4745 h.same_up_since = e;
4746 }
4747
4748 // primary change?
4749 if (OSDMap::primary_changed(
4750 actingprimary,
4751 acting,
4752 currentactingprimary,
4753 currentacting) &&
4754 e > h.same_primary_since) {
4755 dout(15) << "project_pg_history " << pgid << " primary changed in " << e << dendl;
4756 h.same_primary_since = e;
4757 }
4758
4759 if (h.same_interval_since >= e && h.same_up_since >= e && h.same_primary_since >= e)
4760 break;
4761 }
4762
31f18b77 4763 // base case: these floors should be the pg creation epoch if we didn't
7c673cae
FG
4764 // find any changes.
4765 if (e == h.epoch_created) {
4766 if (!h.same_interval_since)
4767 h.same_interval_since = e;
4768 if (!h.same_up_since)
4769 h.same_up_since = e;
4770 if (!h.same_primary_since)
4771 h.same_primary_since = e;
4772 }
4773
4774 dout(15) << "project_pg_history end " << h << dendl;
4775 return true;
4776}
4777
4778
4779
4780void OSD::_add_heartbeat_peer(int p)
4781{
4782 if (p == whoami)
4783 return;
4784 HeartbeatInfo *hi;
4785
4786 map<int,HeartbeatInfo>::iterator i = heartbeat_peers.find(p);
4787 if (i == heartbeat_peers.end()) {
4788 pair<ConnectionRef,ConnectionRef> cons = service.get_con_osd_hb(p, osdmap->get_epoch());
4789 if (!cons.first)
4790 return;
4791 hi = &heartbeat_peers[p];
4792 hi->peer = p;
4793 HeartbeatSession *s = new HeartbeatSession(p);
4794 hi->con_back = cons.first.get();
4795 hi->con_back->set_priv(s->get());
4796 if (cons.second) {
4797 hi->con_front = cons.second.get();
4798 hi->con_front->set_priv(s->get());
4799 dout(10) << "_add_heartbeat_peer: new peer osd." << p
4800 << " " << hi->con_back->get_peer_addr()
4801 << " " << hi->con_front->get_peer_addr()
4802 << dendl;
4803 } else {
4804 hi->con_front.reset(NULL);
4805 dout(10) << "_add_heartbeat_peer: new peer osd." << p
4806 << " " << hi->con_back->get_peer_addr()
4807 << dendl;
4808 }
4809 s->put();
4810 } else {
4811 hi = &i->second;
4812 }
4813 hi->epoch = osdmap->get_epoch();
4814}
4815
4816void OSD::_remove_heartbeat_peer(int n)
4817{
4818 map<int,HeartbeatInfo>::iterator q = heartbeat_peers.find(n);
4819 assert(q != heartbeat_peers.end());
4820 dout(20) << " removing heartbeat peer osd." << n
4821 << " " << q->second.con_back->get_peer_addr()
4822 << " " << (q->second.con_front ? q->second.con_front->get_peer_addr() : entity_addr_t())
4823 << dendl;
4824 q->second.con_back->mark_down();
4825 if (q->second.con_front) {
4826 q->second.con_front->mark_down();
4827 }
4828 heartbeat_peers.erase(q);
4829}
4830
4831void OSD::need_heartbeat_peer_update()
4832{
4833 if (is_stopping())
4834 return;
4835 dout(20) << "need_heartbeat_peer_update" << dendl;
4836 heartbeat_set_peers_need_update();
4837}
4838
4839void OSD::maybe_update_heartbeat_peers()
4840{
4841 assert(osd_lock.is_locked());
4842
4843 if (is_waiting_for_healthy()) {
4844 utime_t now = ceph_clock_now();
4845 if (last_heartbeat_resample == utime_t()) {
4846 last_heartbeat_resample = now;
4847 heartbeat_set_peers_need_update();
4848 } else if (!heartbeat_peers_need_update()) {
4849 utime_t dur = now - last_heartbeat_resample;
4850 if (dur > cct->_conf->osd_heartbeat_grace) {
4851 dout(10) << "maybe_update_heartbeat_peers forcing update after " << dur << " seconds" << dendl;
4852 heartbeat_set_peers_need_update();
4853 last_heartbeat_resample = now;
4854 reset_heartbeat_peers(); // we want *new* peers!
4855 }
4856 }
4857 }
4858
4859 if (!heartbeat_peers_need_update())
4860 return;
4861 heartbeat_clear_peers_need_update();
4862
4863 Mutex::Locker l(heartbeat_lock);
4864
4865 dout(10) << "maybe_update_heartbeat_peers updating" << dendl;
4866
4867
4868 // build heartbeat from set
4869 if (is_active()) {
4870 RWLock::RLocker l(pg_map_lock);
4871 for (ceph::unordered_map<spg_t, PG*>::iterator i = pg_map.begin();
4872 i != pg_map.end();
4873 ++i) {
4874 PG *pg = i->second;
4875 pg->heartbeat_peer_lock.Lock();
4876 dout(20) << i->first << " heartbeat_peers " << pg->heartbeat_peers << dendl;
4877 for (set<int>::iterator p = pg->heartbeat_peers.begin();
4878 p != pg->heartbeat_peers.end();
4879 ++p)
4880 if (osdmap->is_up(*p))
4881 _add_heartbeat_peer(*p);
4882 for (set<int>::iterator p = pg->probe_targets.begin();
4883 p != pg->probe_targets.end();
4884 ++p)
4885 if (osdmap->is_up(*p))
4886 _add_heartbeat_peer(*p);
4887 pg->heartbeat_peer_lock.Unlock();
4888 }
4889 }
4890
4891 // include next and previous up osds to ensure we have a fully-connected set
4892 set<int> want, extras;
4893 int next = osdmap->get_next_up_osd_after(whoami);
4894 if (next >= 0)
4895 want.insert(next);
4896 int prev = osdmap->get_previous_up_osd_before(whoami);
4897 if (prev >= 0 && prev != next)
4898 want.insert(prev);
4899
4900 for (set<int>::iterator p = want.begin(); p != want.end(); ++p) {
4901 dout(10) << " adding neighbor peer osd." << *p << dendl;
4902 extras.insert(*p);
4903 _add_heartbeat_peer(*p);
4904 }
4905
4906 // remove down peers; enumerate extras
4907 map<int,HeartbeatInfo>::iterator p = heartbeat_peers.begin();
4908 while (p != heartbeat_peers.end()) {
4909 if (!osdmap->is_up(p->first)) {
4910 int o = p->first;
4911 ++p;
4912 _remove_heartbeat_peer(o);
4913 continue;
4914 }
4915 if (p->second.epoch < osdmap->get_epoch()) {
4916 extras.insert(p->first);
4917 }
4918 ++p;
4919 }
4920
4921 // too few?
4922 int start = osdmap->get_next_up_osd_after(whoami);
4923 for (int n = start; n >= 0; ) {
4924 if ((int)heartbeat_peers.size() >= cct->_conf->osd_heartbeat_min_peers)
4925 break;
4926 if (!extras.count(n) && !want.count(n) && n != whoami) {
4927 dout(10) << " adding random peer osd." << n << dendl;
4928 extras.insert(n);
4929 _add_heartbeat_peer(n);
4930 }
4931 n = osdmap->get_next_up_osd_after(n);
4932 if (n == start)
4933 break; // came full circle; stop
4934 }
4935
4936 // too many?
4937 for (set<int>::iterator p = extras.begin();
4938 (int)heartbeat_peers.size() > cct->_conf->osd_heartbeat_min_peers && p != extras.end();
4939 ++p) {
4940 if (want.count(*p))
4941 continue;
4942 _remove_heartbeat_peer(*p);
4943 }
4944
4945 dout(10) << "maybe_update_heartbeat_peers " << heartbeat_peers.size() << " peers, extras " << extras << dendl;
4946}
4947
4948void OSD::reset_heartbeat_peers()
4949{
4950 assert(osd_lock.is_locked());
4951 dout(10) << "reset_heartbeat_peers" << dendl;
4952 Mutex::Locker l(heartbeat_lock);
4953 while (!heartbeat_peers.empty()) {
4954 HeartbeatInfo& hi = heartbeat_peers.begin()->second;
4955 hi.con_back->mark_down();
4956 if (hi.con_front) {
4957 hi.con_front->mark_down();
4958 }
4959 heartbeat_peers.erase(heartbeat_peers.begin());
4960 }
4961 failure_queue.clear();
4962}
4963
4964void OSD::handle_osd_ping(MOSDPing *m)
4965{
4966 if (superblock.cluster_fsid != m->fsid) {
4967 dout(20) << "handle_osd_ping from " << m->get_source_inst()
4968 << " bad fsid " << m->fsid << " != " << superblock.cluster_fsid << dendl;
4969 m->put();
4970 return;
4971 }
4972
4973 int from = m->get_source().num();
4974
4975 heartbeat_lock.Lock();
4976 if (is_stopping()) {
4977 heartbeat_lock.Unlock();
4978 m->put();
4979 return;
4980 }
4981
4982 OSDMapRef curmap = service.get_osdmap();
c07f9fc5
FG
4983 if (!curmap) {
4984 heartbeat_lock.Unlock();
4985 m->put();
4986 return;
4987 }
7c673cae
FG
4988
4989 switch (m->op) {
4990
4991 case MOSDPing::PING:
4992 {
4993 if (cct->_conf->osd_debug_drop_ping_probability > 0) {
4994 auto heartbeat_drop = debug_heartbeat_drops_remaining.find(from);
4995 if (heartbeat_drop != debug_heartbeat_drops_remaining.end()) {
4996 if (heartbeat_drop->second == 0) {
4997 debug_heartbeat_drops_remaining.erase(heartbeat_drop);
4998 } else {
4999 --heartbeat_drop->second;
5000 dout(5) << "Dropping heartbeat from " << from
5001 << ", " << heartbeat_drop->second
5002 << " remaining to drop" << dendl;
5003 break;
5004 }
5005 } else if (cct->_conf->osd_debug_drop_ping_probability >
5006 ((((double)(rand()%100))/100.0))) {
5007 heartbeat_drop =
5008 debug_heartbeat_drops_remaining.insert(std::make_pair(from,
5009 cct->_conf->osd_debug_drop_ping_duration)).first;
5010 dout(5) << "Dropping heartbeat from " << from
5011 << ", " << heartbeat_drop->second
5012 << " remaining to drop" << dendl;
5013 break;
5014 }
5015 }
5016
5017 if (!cct->get_heartbeat_map()->is_healthy()) {
5018 dout(10) << "internal heartbeat not healthy, dropping ping request" << dendl;
5019 break;
5020 }
5021
5022 Message *r = new MOSDPing(monc->get_fsid(),
5023 curmap->get_epoch(),
31f18b77
FG
5024 MOSDPing::PING_REPLY, m->stamp,
5025 cct->_conf->osd_heartbeat_min_size);
7c673cae
FG
5026 m->get_connection()->send_message(r);
5027
5028 if (curmap->is_up(from)) {
5029 service.note_peer_epoch(from, m->map_epoch);
5030 if (is_active()) {
5031 ConnectionRef con = service.get_con_osd_cluster(from, curmap->get_epoch());
5032 if (con) {
5033 service.share_map_peer(from, con.get());
5034 }
5035 }
5036 } else if (!curmap->exists(from) ||
5037 curmap->get_down_at(from) > m->map_epoch) {
5038 // tell them they have died
5039 Message *r = new MOSDPing(monc->get_fsid(),
5040 curmap->get_epoch(),
5041 MOSDPing::YOU_DIED,
31f18b77
FG
5042 m->stamp,
5043 cct->_conf->osd_heartbeat_min_size);
7c673cae
FG
5044 m->get_connection()->send_message(r);
5045 }
5046 }
5047 break;
5048
5049 case MOSDPing::PING_REPLY:
5050 {
5051 map<int,HeartbeatInfo>::iterator i = heartbeat_peers.find(from);
5052 if (i != heartbeat_peers.end()) {
5053 if (m->get_connection() == i->second.con_back) {
5054 dout(25) << "handle_osd_ping got reply from osd." << from
5055 << " first_tx " << i->second.first_tx
5056 << " last_tx " << i->second.last_tx
5057 << " last_rx_back " << i->second.last_rx_back << " -> " << m->stamp
5058 << " last_rx_front " << i->second.last_rx_front
5059 << dendl;
5060 i->second.last_rx_back = m->stamp;
5061 // if there is no front con, set both stamps.
5062 if (i->second.con_front == NULL)
5063 i->second.last_rx_front = m->stamp;
5064 } else if (m->get_connection() == i->second.con_front) {
5065 dout(25) << "handle_osd_ping got reply from osd." << from
5066 << " first_tx " << i->second.first_tx
5067 << " last_tx " << i->second.last_tx
5068 << " last_rx_back " << i->second.last_rx_back
5069 << " last_rx_front " << i->second.last_rx_front << " -> " << m->stamp
5070 << dendl;
5071 i->second.last_rx_front = m->stamp;
5072 }
5073
5074 utime_t cutoff = ceph_clock_now();
5075 cutoff -= cct->_conf->osd_heartbeat_grace;
5076 if (i->second.is_healthy(cutoff)) {
5077 // Cancel false reports
5078 auto failure_queue_entry = failure_queue.find(from);
5079 if (failure_queue_entry != failure_queue.end()) {
5080 dout(10) << "handle_osd_ping canceling queued "
5081 << "failure report for osd." << from << dendl;
5082 failure_queue.erase(failure_queue_entry);
5083 }
5084
5085 auto failure_pending_entry = failure_pending.find(from);
5086 if (failure_pending_entry != failure_pending.end()) {
5087 dout(10) << "handle_osd_ping canceling in-flight "
5088 << "failure report for osd." << from << dendl;
5089 send_still_alive(curmap->get_epoch(),
5090 failure_pending_entry->second.second);
5091 failure_pending.erase(failure_pending_entry);
5092 }
5093 }
5094 }
5095
5096 if (m->map_epoch &&
5097 curmap->is_up(from)) {
5098 service.note_peer_epoch(from, m->map_epoch);
5099 if (is_active()) {
5100 ConnectionRef con = service.get_con_osd_cluster(from, curmap->get_epoch());
5101 if (con) {
5102 service.share_map_peer(from, con.get());
5103 }
5104 }
5105 }
5106 }
5107 break;
5108
5109 case MOSDPing::YOU_DIED:
5110 dout(10) << "handle_osd_ping " << m->get_source_inst()
5111 << " says i am down in " << m->map_epoch << dendl;
5112 osdmap_subscribe(curmap->get_epoch()+1, false);
5113 break;
5114 }
5115
5116 heartbeat_lock.Unlock();
5117 m->put();
5118}
5119
5120void OSD::heartbeat_entry()
5121{
5122 Mutex::Locker l(heartbeat_lock);
5123 if (is_stopping())
5124 return;
5125 while (!heartbeat_stop) {
5126 heartbeat();
5127
5128 double wait = .5 + ((float)(rand() % 10)/10.0) * (float)cct->_conf->osd_heartbeat_interval;
5129 utime_t w;
5130 w.set_from_double(wait);
5131 dout(30) << "heartbeat_entry sleeping for " << wait << dendl;
5132 heartbeat_cond.WaitInterval(heartbeat_lock, w);
5133 if (is_stopping())
5134 return;
5135 dout(30) << "heartbeat_entry woke up" << dendl;
5136 }
5137}
5138
5139void OSD::heartbeat_check()
5140{
5141 assert(heartbeat_lock.is_locked());
5142 utime_t now = ceph_clock_now();
5143
5144 // check for heartbeat replies (move me elsewhere?)
5145 utime_t cutoff = now;
5146 cutoff -= cct->_conf->osd_heartbeat_grace;
5147 for (map<int,HeartbeatInfo>::iterator p = heartbeat_peers.begin();
5148 p != heartbeat_peers.end();
5149 ++p) {
5150
5151 if (p->second.first_tx == utime_t()) {
5152 dout(25) << "heartbeat_check we haven't sent ping to osd." << p->first
5153 << "yet, skipping" << dendl;
5154 continue;
5155 }
5156
5157 dout(25) << "heartbeat_check osd." << p->first
5158 << " first_tx " << p->second.first_tx
5159 << " last_tx " << p->second.last_tx
5160 << " last_rx_back " << p->second.last_rx_back
5161 << " last_rx_front " << p->second.last_rx_front
5162 << dendl;
5163 if (p->second.is_unhealthy(cutoff)) {
5164 if (p->second.last_rx_back == utime_t() ||
5165 p->second.last_rx_front == utime_t()) {
5166 derr << "heartbeat_check: no reply from " << p->second.con_front->get_peer_addr().get_sockaddr()
5167 << " osd." << p->first << " ever on either front or back, first ping sent "
5168 << p->second.first_tx << " (cutoff " << cutoff << ")" << dendl;
5169 // fail
5170 failure_queue[p->first] = p->second.last_tx;
5171 } else {
5172 derr << "heartbeat_check: no reply from " << p->second.con_front->get_peer_addr().get_sockaddr()
5173 << " osd." << p->first << " since back " << p->second.last_rx_back
5174 << " front " << p->second.last_rx_front
5175 << " (cutoff " << cutoff << ")" << dendl;
5176 // fail
5177 failure_queue[p->first] = MIN(p->second.last_rx_back, p->second.last_rx_front);
5178 }
5179 }
5180 }
5181}
5182
5183void OSD::heartbeat()
5184{
5185 dout(30) << "heartbeat" << dendl;
5186
5187 // get CPU load avg
5188 double loadavgs[1];
5189 int n_samples = 86400 / cct->_conf->osd_heartbeat_interval;
5190 if (getloadavg(loadavgs, 1) == 1) {
5191 logger->set(l_osd_loadavg, 100 * loadavgs[0]);
5192 daily_loadavg = (daily_loadavg * (n_samples - 1) + loadavgs[0]) / n_samples;
5193 dout(30) << "heartbeat: daily_loadavg " << daily_loadavg << dendl;
5194 }
5195
5196 dout(30) << "heartbeat checking stats" << dendl;
5197
5198 // refresh stats?
5199 vector<int> hb_peers;
5200 for (map<int,HeartbeatInfo>::iterator p = heartbeat_peers.begin();
5201 p != heartbeat_peers.end();
5202 ++p)
5203 hb_peers.push_back(p->first);
5204 service.update_osd_stat(hb_peers);
5205
5206 dout(5) << "heartbeat: " << service.get_osd_stat() << dendl;
5207
5208 utime_t now = ceph_clock_now();
5209
5210 // send heartbeats
5211 for (map<int,HeartbeatInfo>::iterator i = heartbeat_peers.begin();
5212 i != heartbeat_peers.end();
5213 ++i) {
5214 int peer = i->first;
5215 i->second.last_tx = now;
5216 if (i->second.first_tx == utime_t())
5217 i->second.first_tx = now;
5218 dout(30) << "heartbeat sending ping to osd." << peer << dendl;
5219 i->second.con_back->send_message(new MOSDPing(monc->get_fsid(),
5220 service.get_osdmap()->get_epoch(),
31f18b77
FG
5221 MOSDPing::PING, now,
5222 cct->_conf->osd_heartbeat_min_size));
7c673cae
FG
5223
5224 if (i->second.con_front)
5225 i->second.con_front->send_message(new MOSDPing(monc->get_fsid(),
5226 service.get_osdmap()->get_epoch(),
31f18b77
FG
5227 MOSDPing::PING, now,
5228 cct->_conf->osd_heartbeat_min_size));
7c673cae
FG
5229 }
5230
5231 logger->set(l_osd_hb_to, heartbeat_peers.size());
5232
5233 // hmm.. am i all alone?
5234 dout(30) << "heartbeat lonely?" << dendl;
5235 if (heartbeat_peers.empty()) {
5236 if (now - last_mon_heartbeat > cct->_conf->osd_mon_heartbeat_interval && is_active()) {
5237 last_mon_heartbeat = now;
5238 dout(10) << "i have no heartbeat peers; checking mon for new map" << dendl;
5239 osdmap_subscribe(osdmap->get_epoch() + 1, false);
5240 }
5241 }
5242
5243 dout(30) << "heartbeat done" << dendl;
5244}
5245
5246bool OSD::heartbeat_reset(Connection *con)
5247{
5248 HeartbeatSession *s = static_cast<HeartbeatSession*>(con->get_priv());
5249 if (s) {
5250 heartbeat_lock.Lock();
5251 if (is_stopping()) {
5252 heartbeat_lock.Unlock();
5253 s->put();
5254 return true;
5255 }
5256 map<int,HeartbeatInfo>::iterator p = heartbeat_peers.find(s->peer);
5257 if (p != heartbeat_peers.end() &&
5258 (p->second.con_back == con ||
5259 p->second.con_front == con)) {
5260 dout(10) << "heartbeat_reset failed hb con " << con << " for osd." << p->second.peer
5261 << ", reopening" << dendl;
5262 if (con != p->second.con_back) {
5263 p->second.con_back->mark_down();
5264 }
5265 p->second.con_back.reset(NULL);
5266 if (p->second.con_front && con != p->second.con_front) {
5267 p->second.con_front->mark_down();
5268 }
5269 p->second.con_front.reset(NULL);
5270 pair<ConnectionRef,ConnectionRef> newcon = service.get_con_osd_hb(p->second.peer, p->second.epoch);
5271 if (newcon.first) {
5272 p->second.con_back = newcon.first.get();
5273 p->second.con_back->set_priv(s->get());
5274 if (newcon.second) {
5275 p->second.con_front = newcon.second.get();
5276 p->second.con_front->set_priv(s->get());
5277 }
5278 } else {
5279 dout(10) << "heartbeat_reset failed hb con " << con << " for osd." << p->second.peer
5280 << ", raced with osdmap update, closing out peer" << dendl;
5281 heartbeat_peers.erase(p);
5282 }
5283 } else {
5284 dout(10) << "heartbeat_reset closing (old) failed hb con " << con << dendl;
5285 }
5286 heartbeat_lock.Unlock();
5287 s->put();
5288 }
5289 return true;
5290}
5291
5292
5293
5294// =========================================
5295
5296void OSD::tick()
5297{
5298 assert(osd_lock.is_locked());
5299 dout(10) << "tick" << dendl;
5300
5301 if (is_active() || is_waiting_for_healthy()) {
5302 maybe_update_heartbeat_peers();
5303 }
5304
5305 if (is_waiting_for_healthy()) {
5306 start_boot();
224ce89b
WB
5307 } else if (is_preboot() &&
5308 waiting_for_luminous_mons &&
5309 monc->monmap.get_required_features().contains_all(
5310 ceph::features::mon::FEATURE_LUMINOUS)) {
5311 // mon upgrade finished!
5312 start_boot();
7c673cae
FG
5313 }
5314
5315 do_waiters();
5316
5317 tick_timer.add_event_after(OSD_TICK_INTERVAL, new C_Tick(this));
7c673cae
FG
5318}
5319
5320void OSD::tick_without_osd_lock()
5321{
5322 assert(tick_timer_lock.is_locked());
5323 dout(10) << "tick_without_osd_lock" << dendl;
5324
5325 logger->set(l_osd_buf, buffer::get_total_alloc());
5326 logger->set(l_osd_history_alloc_bytes, SHIFT_ROUND_UP(buffer::get_history_alloc_bytes(), 20));
5327 logger->set(l_osd_history_alloc_num, buffer::get_history_alloc_num());
5328 logger->set(l_osd_cached_crc, buffer::get_cached_crc());
5329 logger->set(l_osd_cached_crc_adjusted, buffer::get_cached_crc_adjusted());
5330 logger->set(l_osd_missed_crc, buffer::get_missed_crc());
94b18763 5331 logger->set(l_osd_pg_removing, remove_wq.get_remove_queue_len());
7c673cae
FG
5332
5333 // osd_lock is not being held, which means the OSD state
5334 // might change when doing the monitor report
5335 if (is_active() || is_waiting_for_healthy()) {
5336 heartbeat_lock.Lock();
5337 heartbeat_check();
5338 heartbeat_lock.Unlock();
5339
5340 map_lock.get_read();
5341 Mutex::Locker l(mon_report_lock);
5342
5343 // mon report?
5344 bool reset = false;
5345 bool report = false;
5346 utime_t now = ceph_clock_now();
5347 pg_stat_queue_lock.Lock();
5348 double backoff = stats_ack_timeout / cct->_conf->osd_mon_ack_timeout;
5349 double adjusted_min = cct->_conf->osd_mon_report_interval_min * backoff;
5350 // note: we shouldn't adjust max because it must remain < the
5351 // mon's mon_osd_report_timeout (which defaults to 1.5x our
5352 // value).
5353 double max = cct->_conf->osd_mon_report_interval_max;
5354 if (!outstanding_pg_stats.empty() &&
5355 (now - stats_ack_timeout) > last_pg_stats_ack) {
5356 dout(1) << __func__ << " mon hasn't acked PGStats in "
5357 << now - last_pg_stats_ack
5358 << " seconds, reconnecting elsewhere" << dendl;
5359 reset = true;
5360 last_pg_stats_ack = now; // reset clock
5361 last_pg_stats_sent = utime_t();
5362 stats_ack_timeout =
5363 MAX(cct->_conf->osd_mon_ack_timeout,
5364 stats_ack_timeout * cct->_conf->osd_stats_ack_timeout_factor);
5365 outstanding_pg_stats.clear();
5366 }
5367 if (now - last_pg_stats_sent > max) {
5368 osd_stat_updated = true;
5369 report = true;
5370 } else if (service.need_fullness_update()) {
5371 report = true;
5372 } else if ((int)outstanding_pg_stats.size() >=
5373 cct->_conf->osd_mon_report_max_in_flight) {
5374 dout(20) << __func__ << " have max " << outstanding_pg_stats
5375 << " stats updates in flight" << dendl;
5376 } else {
5377 if (now - last_mon_report > adjusted_min) {
5378 dout(20) << __func__ << " stats backoff " << backoff
5379 << " adjusted_min " << adjusted_min << " - sending report"
5380 << dendl;
5381 osd_stat_updated = true;
5382 report = true;
5383 }
5384 }
5385 pg_stat_queue_lock.Unlock();
5386
5387 if (reset) {
5388 monc->reopen_session();
5389 } else if (report) {
5390 last_mon_report = now;
5391
5392 // do any pending reports
5393 send_full_update();
5394 send_failures();
31f18b77
FG
5395 if (osdmap->require_osd_release < CEPH_RELEASE_LUMINOUS) {
5396 send_pg_stats(now);
5397 }
7c673cae
FG
5398 }
5399 map_lock.put_read();
5400 }
5401
5402 if (is_active()) {
5403 if (!scrub_random_backoff()) {
5404 sched_scrub();
5405 }
5406 service.promote_throttle_recalibrate();
3efd9988 5407 resume_creating_pg();
224ce89b
WB
5408 bool need_send_beacon = false;
5409 const auto now = ceph::coarse_mono_clock::now();
5410 {
5411 // borrow lec lock to pretect last_sent_beacon from changing
5412 Mutex::Locker l{min_last_epoch_clean_lock};
5413 const auto elapsed = now - last_sent_beacon;
5414 if (chrono::duration_cast<chrono::seconds>(elapsed).count() >
5415 cct->_conf->osd_beacon_report_interval) {
5416 need_send_beacon = true;
5417 }
5418 }
5419 if (need_send_beacon) {
5420 send_beacon(now);
5421 }
7c673cae
FG
5422 }
5423
b32b8144 5424 mgrc.update_osd_health(get_health_metrics());
7c673cae
FG
5425 service.kick_recovery_queue();
5426 tick_timer_without_osd_lock.add_event_after(OSD_TICK_INTERVAL, new C_Tick_WithoutOSDLock(this));
5427}
5428
5429void OSD::check_ops_in_flight()
5430{
5431 vector<string> warnings;
5432 if (op_tracker.check_ops_in_flight(warnings)) {
5433 for (vector<string>::iterator i = warnings.begin();
5434 i != warnings.end();
5435 ++i) {
5436 clog->warn() << *i;
5437 }
5438 }
5439}
5440
5441// Usage:
5442// setomapval <pool-id> [namespace/]<obj-name> <key> <val>
5443// rmomapkey <pool-id> [namespace/]<obj-name> <key>
5444// setomapheader <pool-id> [namespace/]<obj-name> <header>
5445// getomap <pool> [namespace/]<obj-name>
5446// truncobj <pool-id> [namespace/]<obj-name> <newlen>
5447// injectmdataerr [namespace/]<obj-name> [shardid]
5448// injectdataerr [namespace/]<obj-name> [shardid]
5449//
5450// set_recovery_delay [utime]
5451void TestOpsSocketHook::test_ops(OSDService *service, ObjectStore *store,
5452 const std::string &command, cmdmap_t& cmdmap, ostream &ss)
5453{
5454 //Test support
5455 //Support changing the omap on a single osd by using the Admin Socket to
5456 //directly request the osd make a change.
5457 if (command == "setomapval" || command == "rmomapkey" ||
5458 command == "setomapheader" || command == "getomap" ||
5459 command == "truncobj" || command == "injectmdataerr" ||
5460 command == "injectdataerr"
5461 ) {
5462 pg_t rawpg;
5463 int64_t pool;
5464 OSDMapRef curmap = service->get_osdmap();
5465 int r = -1;
5466
5467 string poolstr;
5468
5469 cmd_getval(service->cct, cmdmap, "pool", poolstr);
5470 pool = curmap->lookup_pg_pool_name(poolstr);
5471 //If we can't find it by name then maybe id specified
5472 if (pool < 0 && isdigit(poolstr[0]))
5473 pool = atoll(poolstr.c_str());
5474 if (pool < 0) {
b5b8bbf5 5475 ss << "Invalid pool '" << poolstr << "''";
7c673cae
FG
5476 return;
5477 }
5478
5479 string objname, nspace;
5480 cmd_getval(service->cct, cmdmap, "objname", objname);
5481 std::size_t found = objname.find_first_of('/');
5482 if (found != string::npos) {
5483 nspace = objname.substr(0, found);
5484 objname = objname.substr(found+1);
5485 }
5486 object_locator_t oloc(pool, nspace);
5487 r = curmap->object_locator_to_pg(object_t(objname), oloc, rawpg);
5488
5489 if (r < 0) {
5490 ss << "Invalid namespace/objname";
5491 return;
5492 }
5493
5494 int64_t shardid;
5495 cmd_getval(service->cct, cmdmap, "shardid", shardid, int64_t(shard_id_t::NO_SHARD));
5496 hobject_t obj(object_t(objname), string(""), CEPH_NOSNAP, rawpg.ps(), pool, nspace);
5497 ghobject_t gobj(obj, ghobject_t::NO_GEN, shard_id_t(uint8_t(shardid)));
5498 spg_t pgid(curmap->raw_pg_to_pg(rawpg), shard_id_t(shardid));
5499 if (curmap->pg_is_ec(rawpg)) {
5500 if ((command != "injectdataerr") && (command != "injectmdataerr")) {
5501 ss << "Must not call on ec pool, except injectdataerr or injectmdataerr";
5502 return;
5503 }
5504 }
5505
5506 ObjectStore::Transaction t;
5507
5508 if (command == "setomapval") {
5509 map<string, bufferlist> newattrs;
5510 bufferlist val;
5511 string key, valstr;
5512 cmd_getval(service->cct, cmdmap, "key", key);
5513 cmd_getval(service->cct, cmdmap, "val", valstr);
5514
5515 val.append(valstr);
5516 newattrs[key] = val;
5517 t.omap_setkeys(coll_t(pgid), ghobject_t(obj), newattrs);
5518 r = store->apply_transaction(service->meta_osr.get(), std::move(t));
5519 if (r < 0)
5520 ss << "error=" << r;
5521 else
5522 ss << "ok";
5523 } else if (command == "rmomapkey") {
5524 string key;
5525 set<string> keys;
5526 cmd_getval(service->cct, cmdmap, "key", key);
5527
5528 keys.insert(key);
5529 t.omap_rmkeys(coll_t(pgid), ghobject_t(obj), keys);
5530 r = store->apply_transaction(service->meta_osr.get(), std::move(t));
5531 if (r < 0)
5532 ss << "error=" << r;
5533 else
5534 ss << "ok";
5535 } else if (command == "setomapheader") {
5536 bufferlist newheader;
5537 string headerstr;
5538
5539 cmd_getval(service->cct, cmdmap, "header", headerstr);
5540 newheader.append(headerstr);
5541 t.omap_setheader(coll_t(pgid), ghobject_t(obj), newheader);
5542 r = store->apply_transaction(service->meta_osr.get(), std::move(t));
5543 if (r < 0)
5544 ss << "error=" << r;
5545 else
5546 ss << "ok";
5547 } else if (command == "getomap") {
5548 //Debug: Output entire omap
5549 bufferlist hdrbl;
5550 map<string, bufferlist> keyvals;
5551 r = store->omap_get(coll_t(pgid), ghobject_t(obj), &hdrbl, &keyvals);
5552 if (r >= 0) {
5553 ss << "header=" << string(hdrbl.c_str(), hdrbl.length());
5554 for (map<string, bufferlist>::iterator it = keyvals.begin();
5555 it != keyvals.end(); ++it)
5556 ss << " key=" << (*it).first << " val="
5557 << string((*it).second.c_str(), (*it).second.length());
5558 } else {
5559 ss << "error=" << r;
5560 }
5561 } else if (command == "truncobj") {
5562 int64_t trunclen;
5563 cmd_getval(service->cct, cmdmap, "len", trunclen);
5564 t.truncate(coll_t(pgid), ghobject_t(obj), trunclen);
5565 r = store->apply_transaction(service->meta_osr.get(), std::move(t));
5566 if (r < 0)
5567 ss << "error=" << r;
5568 else
5569 ss << "ok";
5570 } else if (command == "injectdataerr") {
5571 store->inject_data_error(gobj);
5572 ss << "ok";
5573 } else if (command == "injectmdataerr") {
5574 store->inject_mdata_error(gobj);
5575 ss << "ok";
5576 }
5577 return;
5578 }
5579 if (command == "set_recovery_delay") {
5580 int64_t delay;
5581 cmd_getval(service->cct, cmdmap, "utime", delay, (int64_t)0);
5582 ostringstream oss;
5583 oss << delay;
5584 int r = service->cct->_conf->set_val("osd_recovery_delay_start",
5585 oss.str().c_str());
5586 if (r != 0) {
5587 ss << "set_recovery_delay: error setting "
5588 << "osd_recovery_delay_start to '" << delay << "': error "
5589 << r;
5590 return;
5591 }
5592 service->cct->_conf->apply_changes(NULL);
5593 ss << "set_recovery_delay: set osd_recovery_delay_start "
5594 << "to " << service->cct->_conf->osd_recovery_delay_start;
5595 return;
5596 }
5597 if (command == "trigger_scrub") {
5598 spg_t pgid;
5599 OSDMapRef curmap = service->get_osdmap();
5600
5601 string pgidstr;
5602
5603 cmd_getval(service->cct, cmdmap, "pgid", pgidstr);
5604 if (!pgid.parse(pgidstr.c_str())) {
5605 ss << "Invalid pgid specified";
5606 return;
5607 }
5608
5609 PG *pg = service->osd->_lookup_lock_pg(pgid);
5610 if (pg == nullptr) {
5611 ss << "Can't find pg " << pgid;
5612 return;
5613 }
5614
5615 if (pg->is_primary()) {
5616 pg->unreg_next_scrub();
5617 const pg_pool_t *p = curmap->get_pg_pool(pgid.pool());
5618 double pool_scrub_max_interval = 0;
5619 p->opts.get(pool_opts_t::SCRUB_MAX_INTERVAL, &pool_scrub_max_interval);
5620 double scrub_max_interval = pool_scrub_max_interval > 0 ?
5621 pool_scrub_max_interval : g_conf->osd_scrub_max_interval;
5622 // Instead of marking must_scrub force a schedule scrub
5623 utime_t stamp = ceph_clock_now();
5624 stamp -= scrub_max_interval;
5625 stamp -= 100.0; // push back last scrub more for good measure
5626 pg->info.history.last_scrub_stamp = stamp;
5627 pg->reg_next_scrub();
5628 ss << "ok";
5629 } else {
5630 ss << "Not primary";
5631 }
5632 pg->unlock();
5633 return;
5634 }
5635 if (command == "injectfull") {
5636 int64_t count;
5637 string type;
5638 OSDService::s_names state;
5639 cmd_getval(service->cct, cmdmap, "type", type, string("full"));
5640 cmd_getval(service->cct, cmdmap, "count", count, (int64_t)-1);
5641 if (type == "none" || count == 0) {
5642 type = "none";
5643 count = 0;
5644 }
5645 state = service->get_full_state(type);
5646 if (state == OSDService::s_names::INVALID) {
5647 ss << "Invalid type use (none, nearfull, backfillfull, full, failsafe)";
5648 return;
5649 }
5650 service->set_injectfull(state, count);
5651 return;
5652 }
5653 ss << "Internal error - command=" << command;
5654}
5655
5656// =========================================
5657bool remove_dir(
5658 CephContext *cct,
5659 ObjectStore *store, SnapMapper *mapper,
5660 OSDriver *osdriver,
5661 ObjectStore::Sequencer *osr,
5662 coll_t coll, DeletingStateRef dstate,
5663 bool *finished,
5664 ThreadPool::TPHandle &handle)
5665{
5666 vector<ghobject_t> olist;
5667 int64_t num = 0;
5668 ObjectStore::Transaction t;
5669 ghobject_t next;
5670 handle.reset_tp_timeout();
5671 store->collection_list(
5672 coll,
5673 next,
5674 ghobject_t::get_max(),
5675 store->get_ideal_list_max(),
5676 &olist,
5677 &next);
5678 generic_dout(10) << __func__ << " " << olist << dendl;
5679 // default cont to true, this is safe because caller(OSD::RemoveWQ::_process())
5680 // will recheck the answer before it really goes on.
5681 bool cont = true;
5682 for (vector<ghobject_t>::iterator i = olist.begin();
5683 i != olist.end();
5684 ++i) {
5685 if (i->is_pgmeta())
5686 continue;
5687 OSDriver::OSTransaction _t(osdriver->get_transaction(&t));
5688 int r = mapper->remove_oid(i->hobj, &_t);
5689 if (r != 0 && r != -ENOENT) {
5690 ceph_abort();
5691 }
5692 t.remove(coll, *i);
5693 if (++num >= cct->_conf->osd_target_transaction_size) {
5694 C_SaferCond waiter;
5695 store->queue_transaction(osr, std::move(t), &waiter);
5696 cont = dstate->pause_clearing();
5697 handle.suspend_tp_timeout();
5698 waiter.wait();
5699 handle.reset_tp_timeout();
5700 if (cont)
5701 cont = dstate->resume_clearing();
5702 if (!cont)
5703 return false;
5704 t = ObjectStore::Transaction();
5705 num = 0;
5706 }
5707 }
5708 if (num) {
5709 C_SaferCond waiter;
5710 store->queue_transaction(osr, std::move(t), &waiter);
5711 cont = dstate->pause_clearing();
5712 handle.suspend_tp_timeout();
5713 waiter.wait();
5714 handle.reset_tp_timeout();
5715 if (cont)
5716 cont = dstate->resume_clearing();
5717 }
5718 // whether there are more objects to remove in the collection
5719 *finished = next.is_max();
5720 return cont;
5721}
5722
5723void OSD::RemoveWQ::_process(
5724 pair<PGRef, DeletingStateRef> item,
5725 ThreadPool::TPHandle &handle)
5726{
5727 FUNCTRACE();
5728 PGRef pg(item.first);
5729 SnapMapper &mapper = pg->snap_mapper;
5730 OSDriver &driver = pg->osdriver;
5731 coll_t coll = coll_t(pg->info.pgid);
5732 pg->osr->flush();
5733 bool finished = false;
5734
5735 if (!item.second->start_or_resume_clearing())
5736 return;
5737
5738 bool cont = remove_dir(
5739 pg->cct, store, &mapper, &driver, pg->osr.get(), coll, item.second,
5740 &finished, handle);
5741 if (!cont)
5742 return;
5743 if (!finished) {
5744 if (item.second->pause_clearing())
5745 queue_front(item);
5746 return;
5747 }
5748
5749 if (!item.second->start_deleting())
5750 return;
5751
5752 ObjectStore::Transaction t;
5753 PGLog::clear_info_log(pg->info.pgid, &t);
5754
5755 if (cct->_conf->osd_inject_failure_on_pg_removal) {
5756 generic_derr << "osd_inject_failure_on_pg_removal" << dendl;
5757 _exit(1);
5758 }
5759 t.remove_collection(coll);
5760
5761 // We need the sequencer to stick around until the op is complete
5762 store->queue_transaction(
5763 pg->osr.get(),
5764 std::move(t),
5765 0, // onapplied
5766 0, // oncommit
5767 0, // onreadable sync
5768 new ContainerContext<PGRef>(pg),
5769 TrackedOpRef());
5770
5771 item.second->finish_deleting();
5772}
5773// =========================================
5774
5775void OSD::ms_handle_connect(Connection *con)
5776{
5777 dout(10) << __func__ << " con " << con << dendl;
5778 if (con->get_peer_type() == CEPH_ENTITY_TYPE_MON) {
5779 Mutex::Locker l(osd_lock);
5780 if (is_stopping())
5781 return;
5782 dout(10) << __func__ << " on mon" << dendl;
5783
5784 if (is_preboot()) {
5785 start_boot();
5786 } else if (is_booting()) {
5787 _send_boot(); // resend boot message
5788 } else {
5789 map_lock.get_read();
5790 Mutex::Locker l2(mon_report_lock);
5791
5792 utime_t now = ceph_clock_now();
5793 last_mon_report = now;
5794
5795 // resend everything, it's a new session
5796 send_full_update();
5797 send_alive();
5798 service.requeue_pg_temp();
5799 service.send_pg_temp();
5800 requeue_failures();
5801 send_failures();
31f18b77
FG
5802 if (osdmap->require_osd_release < CEPH_RELEASE_LUMINOUS) {
5803 send_pg_stats(now);
5804 }
7c673cae
FG
5805
5806 map_lock.put_read();
5807 if (is_active()) {
5808 send_beacon(ceph::coarse_mono_clock::now());
5809 }
5810 }
5811
5812 // full map requests may happen while active or pre-boot
5813 if (requested_full_first) {
5814 rerequest_full_maps();
5815 }
5816 }
5817}
5818
5819void OSD::ms_handle_fast_connect(Connection *con)
5820{
5821 if (con->get_peer_type() != CEPH_ENTITY_TYPE_MON &&
5822 con->get_peer_type() != CEPH_ENTITY_TYPE_MGR) {
5823 Session *s = static_cast<Session*>(con->get_priv());
5824 if (!s) {
5825 s = new Session(cct);
5826 con->set_priv(s->get());
5827 s->con = con;
5828 dout(10) << " new session (outgoing) " << s << " con=" << s->con
5829 << " addr=" << s->con->get_peer_addr() << dendl;
5830 // we don't connect to clients
5831 assert(con->get_peer_type() == CEPH_ENTITY_TYPE_OSD);
5832 s->entity_name.set_type(CEPH_ENTITY_TYPE_OSD);
5833 }
5834 s->put();
5835 }
5836}
5837
5838void OSD::ms_handle_fast_accept(Connection *con)
5839{
5840 if (con->get_peer_type() != CEPH_ENTITY_TYPE_MON &&
5841 con->get_peer_type() != CEPH_ENTITY_TYPE_MGR) {
5842 Session *s = static_cast<Session*>(con->get_priv());
5843 if (!s) {
5844 s = new Session(cct);
5845 con->set_priv(s->get());
5846 s->con = con;
5847 dout(10) << "new session (incoming)" << s << " con=" << con
5848 << " addr=" << con->get_peer_addr()
5849 << " must have raced with connect" << dendl;
5850 assert(con->get_peer_type() == CEPH_ENTITY_TYPE_OSD);
5851 s->entity_name.set_type(CEPH_ENTITY_TYPE_OSD);
5852 }
5853 s->put();
5854 }
5855}
5856
5857bool OSD::ms_handle_reset(Connection *con)
5858{
5859 Session *session = static_cast<Session*>(con->get_priv());
5860 dout(2) << "ms_handle_reset con " << con << " session " << session << dendl;
5861 if (!session)
5862 return false;
5863 session->wstate.reset(con);
5864 session->con.reset(NULL); // break con <-> session ref cycle
5865 // note that we break session->con *before* the session_handle_reset
5866 // cleanup below. this avoids a race between us and
5867 // PG::add_backoff, Session::check_backoff, etc.
5868 session_handle_reset(session);
5869 session->put();
5870 return true;
5871}
5872
5873bool OSD::ms_handle_refused(Connection *con)
5874{
5875 if (!cct->_conf->osd_fast_fail_on_connection_refused)
5876 return false;
5877
5878 Session *session = static_cast<Session*>(con->get_priv());
5879 dout(2) << "ms_handle_refused con " << con << " session " << session << dendl;
5880 if (!session)
5881 return false;
5882 int type = con->get_peer_type();
5883 // handle only OSD failures here
5884 if (monc && (type == CEPH_ENTITY_TYPE_OSD)) {
5885 OSDMapRef osdmap = get_osdmap();
5886 if (osdmap) {
5887 int id = osdmap->identify_osd_on_all_channels(con->get_peer_addr());
5888 if (id >= 0 && osdmap->is_up(id)) {
5889 // I'm cheating mon heartbeat grace logic, because we know it's not going
5890 // to respawn alone. +1 so we won't hit any boundary case.
5891 monc->send_mon_message(new MOSDFailure(monc->get_fsid(),
5892 osdmap->get_inst(id),
5893 cct->_conf->osd_heartbeat_grace + 1,
5894 osdmap->get_epoch(),
5895 MOSDFailure::FLAG_IMMEDIATE | MOSDFailure::FLAG_FAILED
5896 ));
5897 }
5898 }
5899 }
5900 session->put();
5901 return true;
5902}
5903
5904struct C_OSD_GetVersion : public Context {
5905 OSD *osd;
5906 uint64_t oldest, newest;
5907 explicit C_OSD_GetVersion(OSD *o) : osd(o), oldest(0), newest(0) {}
5908 void finish(int r) override {
5909 if (r >= 0)
5910 osd->_got_mon_epochs(oldest, newest);
5911 }
5912};
5913
5914void OSD::start_boot()
5915{
5916 if (!_is_healthy()) {
5917 // if we are not healthy, do not mark ourselves up (yet)
5918 dout(1) << "not healthy; waiting to boot" << dendl;
5919 if (!is_waiting_for_healthy())
5920 start_waiting_for_healthy();
5921 // send pings sooner rather than later
5922 heartbeat_kick();
5923 return;
5924 }
5925 dout(1) << __func__ << dendl;
5926 set_state(STATE_PREBOOT);
224ce89b 5927 waiting_for_luminous_mons = false;
7c673cae
FG
5928 dout(10) << "start_boot - have maps " << superblock.oldest_map
5929 << ".." << superblock.newest_map << dendl;
5930 C_OSD_GetVersion *c = new C_OSD_GetVersion(this);
5931 monc->get_version("osdmap", &c->newest, &c->oldest, c);
5932}
5933
5934void OSD::_got_mon_epochs(epoch_t oldest, epoch_t newest)
5935{
5936 Mutex::Locker l(osd_lock);
5937 if (is_preboot()) {
5938 _preboot(oldest, newest);
5939 }
5940}
5941
5942void OSD::_preboot(epoch_t oldest, epoch_t newest)
5943{
5944 assert(is_preboot());
5945 dout(10) << __func__ << " _preboot mon has osdmaps "
5946 << oldest << ".." << newest << dendl;
5947
5948 // ensure our local fullness awareness is accurate
5949 heartbeat();
5950
5951 // if our map within recent history, try to add ourselves to the osdmap.
31f18b77
FG
5952 if (osdmap->get_epoch() == 0) {
5953 derr << "waiting for initial osdmap" << dendl;
c07f9fc5 5954 } else if (osdmap->is_destroyed(whoami)) {
b32b8144
FG
5955 derr << "osdmap says I am destroyed" << dendl;
5956 // provide a small margin so we don't livelock seeing if we
5957 // un-destroyed ourselves.
5958 if (osdmap->get_epoch() > newest - 1) {
5959 exit(0);
5960 }
31f18b77 5961 } else if (osdmap->test_flag(CEPH_OSDMAP_NOUP) || osdmap->is_noup(whoami)) {
7c673cae
FG
5962 derr << "osdmap NOUP flag is set, waiting for it to clear" << dendl;
5963 } else if (!osdmap->test_flag(CEPH_OSDMAP_SORTBITWISE)) {
5964 derr << "osdmap SORTBITWISE OSDMap flag is NOT set; please set it"
5965 << dendl;
31f18b77 5966 } else if (osdmap->require_osd_release < CEPH_RELEASE_JEWEL) {
7c673cae
FG
5967 derr << "osdmap REQUIRE_JEWEL OSDMap flag is NOT set; please set it"
5968 << dendl;
5969 } else if (!monc->monmap.get_required_features().contains_all(
5970 ceph::features::mon::FEATURE_LUMINOUS)) {
5971 derr << "monmap REQUIRE_LUMINOUS is NOT set; must upgrade all monitors to "
5972 << "Luminous or later before Luminous OSDs will boot" << dendl;
224ce89b 5973 waiting_for_luminous_mons = true;
7c673cae
FG
5974 } else if (service.need_fullness_update()) {
5975 derr << "osdmap fullness state needs update" << dendl;
5976 send_full_update();
5977 } else if (osdmap->get_epoch() >= oldest - 1 &&
5978 osdmap->get_epoch() + cct->_conf->osd_map_message_max > newest) {
5979 _send_boot();
5980 return;
5981 }
5982
5983 // get all the latest maps
5984 if (osdmap->get_epoch() + 1 >= oldest)
5985 osdmap_subscribe(osdmap->get_epoch() + 1, false);
5986 else
5987 osdmap_subscribe(oldest - 1, true);
5988}
5989
5990void OSD::send_full_update()
5991{
5992 if (!service.need_fullness_update())
5993 return;
5994 unsigned state = 0;
5995 if (service.is_full()) {
5996 state = CEPH_OSD_FULL;
5997 } else if (service.is_backfillfull()) {
5998 state = CEPH_OSD_BACKFILLFULL;
5999 } else if (service.is_nearfull()) {
6000 state = CEPH_OSD_NEARFULL;
6001 }
6002 set<string> s;
6003 OSDMap::calc_state_set(state, s);
6004 dout(10) << __func__ << " want state " << s << dendl;
6005 monc->send_mon_message(new MOSDFull(osdmap->get_epoch(), state));
6006}
6007
6008void OSD::start_waiting_for_healthy()
6009{
6010 dout(1) << "start_waiting_for_healthy" << dendl;
6011 set_state(STATE_WAITING_FOR_HEALTHY);
6012 last_heartbeat_resample = utime_t();
181888fb
FG
6013
6014 // subscribe to osdmap updates, in case our peers really are known to be dead
6015 osdmap_subscribe(osdmap->get_epoch() + 1, false);
7c673cae
FG
6016}
6017
6018bool OSD::_is_healthy()
6019{
6020 if (!cct->get_heartbeat_map()->is_healthy()) {
6021 dout(1) << "is_healthy false -- internal heartbeat failed" << dendl;
6022 return false;
6023 }
6024
6025 if (is_waiting_for_healthy()) {
6026 Mutex::Locker l(heartbeat_lock);
6027 utime_t cutoff = ceph_clock_now();
6028 cutoff -= cct->_conf->osd_heartbeat_grace;
6029 int num = 0, up = 0;
6030 for (map<int,HeartbeatInfo>::iterator p = heartbeat_peers.begin();
6031 p != heartbeat_peers.end();
6032 ++p) {
6033 if (p->second.is_healthy(cutoff))
6034 ++up;
6035 ++num;
6036 }
6037 if ((float)up < (float)num * cct->_conf->osd_heartbeat_min_healthy_ratio) {
6038 dout(1) << "is_healthy false -- only " << up << "/" << num << " up peers (less than "
6039 << int(cct->_conf->osd_heartbeat_min_healthy_ratio * 100.0) << "%)" << dendl;
6040 return false;
6041 }
6042 }
6043
6044 return true;
6045}
6046
6047void OSD::_send_boot()
6048{
6049 dout(10) << "_send_boot" << dendl;
6050 entity_addr_t cluster_addr = cluster_messenger->get_myaddr();
6051 Connection *local_connection = cluster_messenger->get_loopback_connection().get();
6052 if (cluster_addr.is_blank_ip()) {
6053 int port = cluster_addr.get_port();
6054 cluster_addr = client_messenger->get_myaddr();
6055 cluster_addr.set_port(port);
6056 cluster_messenger->set_addr_unknowns(cluster_addr);
6057 dout(10) << " assuming cluster_addr ip matches client_addr" << dendl;
6058 } else {
6059 Session *s = static_cast<Session*>(local_connection->get_priv());
6060 if (s)
6061 s->put();
6062 else
6063 cluster_messenger->ms_deliver_handle_fast_connect(local_connection);
6064 }
6065
6066 entity_addr_t hb_back_addr = hb_back_server_messenger->get_myaddr();
6067 local_connection = hb_back_server_messenger->get_loopback_connection().get();
6068 if (hb_back_addr.is_blank_ip()) {
6069 int port = hb_back_addr.get_port();
6070 hb_back_addr = cluster_addr;
6071 hb_back_addr.set_port(port);
6072 hb_back_server_messenger->set_addr_unknowns(hb_back_addr);
6073 dout(10) << " assuming hb_back_addr ip matches cluster_addr" << dendl;
6074 } else {
6075 Session *s = static_cast<Session*>(local_connection->get_priv());
6076 if (s)
6077 s->put();
6078 else
6079 hb_back_server_messenger->ms_deliver_handle_fast_connect(local_connection);
6080 }
6081
6082 entity_addr_t hb_front_addr = hb_front_server_messenger->get_myaddr();
6083 local_connection = hb_front_server_messenger->get_loopback_connection().get();
6084 if (hb_front_addr.is_blank_ip()) {
6085 int port = hb_front_addr.get_port();
6086 hb_front_addr = client_messenger->get_myaddr();
6087 hb_front_addr.set_port(port);
6088 hb_front_server_messenger->set_addr_unknowns(hb_front_addr);
6089 dout(10) << " assuming hb_front_addr ip matches client_addr" << dendl;
6090 } else {
6091 Session *s = static_cast<Session*>(local_connection->get_priv());
6092 if (s)
6093 s->put();
6094 else
6095 hb_front_server_messenger->ms_deliver_handle_fast_connect(local_connection);
6096 }
6097
6098 MOSDBoot *mboot = new MOSDBoot(superblock, get_osdmap_epoch(), service.get_boot_epoch(),
6099 hb_back_addr, hb_front_addr, cluster_addr,
6100 CEPH_FEATURES_ALL);
6101 dout(10) << " client_addr " << client_messenger->get_myaddr()
6102 << ", cluster_addr " << cluster_addr
6103 << ", hb_back_addr " << hb_back_addr
6104 << ", hb_front_addr " << hb_front_addr
6105 << dendl;
6106 _collect_metadata(&mboot->metadata);
6107 monc->send_mon_message(mboot);
6108 set_state(STATE_BOOTING);
6109}
6110
6111void OSD::_collect_metadata(map<string,string> *pm)
6112{
6113 // config info
6114 (*pm)["osd_data"] = dev_path;
c07f9fc5
FG
6115 if (store->get_type() == "filestore") {
6116 // not applicable for bluestore
6117 (*pm)["osd_journal"] = journal_path;
6118 }
7c673cae
FG
6119 (*pm)["front_addr"] = stringify(client_messenger->get_myaddr());
6120 (*pm)["back_addr"] = stringify(cluster_messenger->get_myaddr());
6121 (*pm)["hb_front_addr"] = stringify(hb_front_server_messenger->get_myaddr());
6122 (*pm)["hb_back_addr"] = stringify(hb_back_server_messenger->get_myaddr());
6123
6124 // backend
6125 (*pm)["osd_objectstore"] = store->get_type();
31f18b77 6126 (*pm)["rotational"] = store_is_rotational ? "1" : "0";
d2e6a577 6127 (*pm)["journal_rotational"] = journal_is_rotational ? "1" : "0";
c07f9fc5 6128 (*pm)["default_device_class"] = store->get_default_device_class();
7c673cae
FG
6129 store->collect_metadata(pm);
6130
6131 collect_sys_info(pm, cct);
6132
b5b8bbf5
FG
6133 std::string front_iface, back_iface;
6134 /*
6135 pick_iface(cct,
6136 CEPH_PICK_ADDRESS_PUBLIC | CEPH_PICK_ADDRESS_CLUSTER,
6137 &front_iface, &back_iface);
6138 */
6139 (*pm)["front_iface"] = pick_iface(cct,
6140 client_messenger->get_myaddr().get_sockaddr_storage());
6141 (*pm)["back_iface"] = pick_iface(cct,
6142 cluster_messenger->get_myaddr().get_sockaddr_storage());
6143
7c673cae
FG
6144 dout(10) << __func__ << " " << *pm << dendl;
6145}
6146
6147void OSD::queue_want_up_thru(epoch_t want)
6148{
6149 map_lock.get_read();
6150 epoch_t cur = osdmap->get_up_thru(whoami);
6151 Mutex::Locker l(mon_report_lock);
6152 if (want > up_thru_wanted) {
6153 dout(10) << "queue_want_up_thru now " << want << " (was " << up_thru_wanted << ")"
6154 << ", currently " << cur
6155 << dendl;
6156 up_thru_wanted = want;
6157 send_alive();
6158 } else {
6159 dout(10) << "queue_want_up_thru want " << want << " <= queued " << up_thru_wanted
6160 << ", currently " << cur
6161 << dendl;
6162 }
6163 map_lock.put_read();
6164}
6165
6166void OSD::send_alive()
6167{
6168 assert(mon_report_lock.is_locked());
6169 if (!osdmap->exists(whoami))
6170 return;
6171 epoch_t up_thru = osdmap->get_up_thru(whoami);
6172 dout(10) << "send_alive up_thru currently " << up_thru << " want " << up_thru_wanted << dendl;
6173 if (up_thru_wanted > up_thru) {
6174 dout(10) << "send_alive want " << up_thru_wanted << dendl;
6175 monc->send_mon_message(new MOSDAlive(osdmap->get_epoch(), up_thru_wanted));
6176 }
6177}
6178
6179void OSD::request_full_map(epoch_t first, epoch_t last)
6180{
6181 dout(10) << __func__ << " " << first << ".." << last
6182 << ", previously requested "
6183 << requested_full_first << ".." << requested_full_last << dendl;
6184 assert(osd_lock.is_locked());
6185 assert(first > 0 && last > 0);
6186 assert(first <= last);
6187 assert(first >= requested_full_first); // we shouldn't ever ask for older maps
6188 if (requested_full_first == 0) {
6189 // first request
6190 requested_full_first = first;
6191 requested_full_last = last;
6192 } else if (last <= requested_full_last) {
6193 // dup
6194 return;
6195 } else {
6196 // additional request
6197 first = requested_full_last + 1;
6198 requested_full_last = last;
6199 }
6200 MMonGetOSDMap *req = new MMonGetOSDMap;
6201 req->request_full(first, last);
6202 monc->send_mon_message(req);
6203}
6204
6205void OSD::got_full_map(epoch_t e)
6206{
6207 assert(requested_full_first <= requested_full_last);
6208 assert(osd_lock.is_locked());
6209 if (requested_full_first == 0) {
6210 dout(20) << __func__ << " " << e << ", nothing requested" << dendl;
6211 return;
6212 }
6213 if (e < requested_full_first) {
6214 dout(10) << __func__ << " " << e << ", requested " << requested_full_first
6215 << ".." << requested_full_last
6216 << ", ignoring" << dendl;
6217 return;
6218 }
6219 if (e >= requested_full_last) {
6220 dout(10) << __func__ << " " << e << ", requested " << requested_full_first
6221 << ".." << requested_full_last << ", resetting" << dendl;
6222 requested_full_first = requested_full_last = 0;
6223 return;
6224 }
6225
6226 requested_full_first = e + 1;
6227
6228 dout(10) << __func__ << " " << e << ", requested " << requested_full_first
6229 << ".." << requested_full_last
6230 << ", still need more" << dendl;
6231}
6232
6233void OSD::requeue_failures()
6234{
6235 Mutex::Locker l(heartbeat_lock);
6236 unsigned old_queue = failure_queue.size();
6237 unsigned old_pending = failure_pending.size();
6238 for (map<int,pair<utime_t,entity_inst_t> >::iterator p =
6239 failure_pending.begin();
6240 p != failure_pending.end(); ) {
6241 failure_queue[p->first] = p->second.first;
6242 failure_pending.erase(p++);
6243 }
6244 dout(10) << __func__ << " " << old_queue << " + " << old_pending << " -> "
6245 << failure_queue.size() << dendl;
6246}
6247
6248void OSD::send_failures()
6249{
6250 assert(map_lock.is_locked());
6251 assert(mon_report_lock.is_locked());
6252 Mutex::Locker l(heartbeat_lock);
6253 utime_t now = ceph_clock_now();
6254 while (!failure_queue.empty()) {
6255 int osd = failure_queue.begin()->first;
7c673cae 6256 if (!failure_pending.count(osd)) {
31f18b77 6257 entity_inst_t i = osdmap->get_inst(osd);
7c673cae
FG
6258 int failed_for = (int)(double)(now - failure_queue.begin()->second);
6259 monc->send_mon_message(new MOSDFailure(monc->get_fsid(), i, failed_for,
6260 osdmap->get_epoch()));
6261 failure_pending[osd] = make_pair(failure_queue.begin()->second, i);
6262 }
6263 failure_queue.erase(osd);
6264 }
6265}
6266
6267void OSD::send_still_alive(epoch_t epoch, const entity_inst_t &i)
6268{
6269 MOSDFailure *m = new MOSDFailure(monc->get_fsid(), i, 0, epoch, MOSDFailure::FLAG_ALIVE);
6270 monc->send_mon_message(m);
6271}
6272
6273void OSD::send_pg_stats(const utime_t &now)
6274{
6275 assert(map_lock.is_locked());
31f18b77 6276 assert(osdmap->require_osd_release < CEPH_RELEASE_LUMINOUS);
7c673cae
FG
6277 dout(20) << "send_pg_stats" << dendl;
6278
6279 osd_stat_t cur_stat = service.get_osd_stat();
6280
6281 cur_stat.os_perf_stat = store->get_cur_stats();
6282
6283 pg_stat_queue_lock.Lock();
6284
6285 if (osd_stat_updated || !pg_stat_queue.empty()) {
6286 last_pg_stats_sent = now;
6287 osd_stat_updated = false;
6288
6289 dout(10) << "send_pg_stats - " << pg_stat_queue.size() << " pgs updated" << dendl;
6290
6291 utime_t had_for(now);
6292 had_for -= had_map_since;
6293
6294 MPGStats *m = new MPGStats(monc->get_fsid(), osdmap->get_epoch(), had_for);
6295
6296 uint64_t tid = ++pg_stat_tid;
6297 m->set_tid(tid);
6298 m->osd_stat = cur_stat;
6299
6300 xlist<PG*>::iterator p = pg_stat_queue.begin();
6301 while (!p.end()) {
6302 PG *pg = *p;
6303 ++p;
6304 if (!pg->is_primary()) { // we hold map_lock; role is stable.
6305 pg->stat_queue_item.remove_myself();
6306 pg->put("pg_stat_queue");
6307 continue;
6308 }
6309 pg->pg_stats_publish_lock.Lock();
6310 if (pg->pg_stats_publish_valid) {
6311 m->pg_stat[pg->info.pgid.pgid] = pg->pg_stats_publish;
6312 dout(25) << " sending " << pg->info.pgid << " " << pg->pg_stats_publish.reported_epoch << ":"
6313 << pg->pg_stats_publish.reported_seq << dendl;
6314 } else {
6315 dout(25) << " NOT sending " << pg->info.pgid << " " << pg->pg_stats_publish.reported_epoch << ":"
6316 << pg->pg_stats_publish.reported_seq << ", not valid" << dendl;
6317 }
6318 pg->pg_stats_publish_lock.Unlock();
6319 }
6320
6321 if (last_pg_stats_ack == utime_t() || !outstanding_pg_stats.empty()) {
6322 last_pg_stats_ack = ceph_clock_now();
6323 }
6324 outstanding_pg_stats.insert(tid);
6325 dout(20) << __func__ << " updates pending: " << outstanding_pg_stats << dendl;
6326
6327 monc->send_mon_message(m);
6328 }
6329
6330 pg_stat_queue_lock.Unlock();
6331}
6332
6333void OSD::handle_pg_stats_ack(MPGStatsAck *ack)
6334{
6335 dout(10) << "handle_pg_stats_ack " << dendl;
6336
6337 if (!require_mon_peer(ack)) {
6338 ack->put();
6339 return;
6340 }
6341
6342 // NOTE: we may get replies from a previous mon even while
6343 // outstanding_pg_stats is empty if reconnecting races with replies
6344 // in flight.
6345
6346 pg_stat_queue_lock.Lock();
6347
6348 last_pg_stats_ack = ceph_clock_now();
6349
6350 // decay timeout slowly (analogous to TCP)
6351 stats_ack_timeout =
6352 MAX(cct->_conf->osd_mon_ack_timeout,
6353 stats_ack_timeout * cct->_conf->osd_stats_ack_timeout_decay);
6354 dout(20) << __func__ << " timeout now " << stats_ack_timeout << dendl;
6355
6356 if (ack->get_tid() > pg_stat_tid_flushed) {
6357 pg_stat_tid_flushed = ack->get_tid();
6358 pg_stat_queue_cond.Signal();
6359 }
6360
6361 xlist<PG*>::iterator p = pg_stat_queue.begin();
6362 while (!p.end()) {
6363 PG *pg = *p;
6364 PGRef _pg(pg);
6365 ++p;
6366
6367 auto acked = ack->pg_stat.find(pg->info.pgid.pgid);
6368 if (acked != ack->pg_stat.end()) {
6369 pg->pg_stats_publish_lock.Lock();
6370 if (acked->second.first == pg->pg_stats_publish.reported_seq &&
6371 acked->second.second == pg->pg_stats_publish.reported_epoch) {
6372 dout(25) << " ack on " << pg->info.pgid << " " << pg->pg_stats_publish.reported_epoch
6373 << ":" << pg->pg_stats_publish.reported_seq << dendl;
6374 pg->stat_queue_item.remove_myself();
6375 pg->put("pg_stat_queue");
6376 } else {
6377 dout(25) << " still pending " << pg->info.pgid << " " << pg->pg_stats_publish.reported_epoch
6378 << ":" << pg->pg_stats_publish.reported_seq << " > acked "
6379 << acked->second << dendl;
6380 }
6381 pg->pg_stats_publish_lock.Unlock();
6382 } else {
6383 dout(30) << " still pending " << pg->info.pgid << " " << pg->pg_stats_publish.reported_epoch
6384 << ":" << pg->pg_stats_publish.reported_seq << dendl;
6385 }
6386 }
6387
6388 outstanding_pg_stats.erase(ack->get_tid());
6389 dout(20) << __func__ << " still pending: " << outstanding_pg_stats << dendl;
6390
6391 pg_stat_queue_lock.Unlock();
6392
6393 ack->put();
6394}
6395
6396void OSD::flush_pg_stats()
6397{
6398 dout(10) << "flush_pg_stats" << dendl;
6399 osd_lock.Unlock();
6400 utime_t now = ceph_clock_now();
6401 map_lock.get_read();
6402 mon_report_lock.Lock();
6403 send_pg_stats(now);
6404 mon_report_lock.Unlock();
6405 map_lock.put_read();
6406
6407
6408 pg_stat_queue_lock.Lock();
6409 uint64_t tid = pg_stat_tid;
6410 dout(10) << "flush_pg_stats waiting for stats tid " << tid << " to flush" << dendl;
6411 while (tid > pg_stat_tid_flushed)
6412 pg_stat_queue_cond.Wait(pg_stat_queue_lock);
6413 dout(10) << "flush_pg_stats finished waiting for stats tid " << tid << " to flush" << dendl;
6414 pg_stat_queue_lock.Unlock();
6415
6416 osd_lock.Lock();
6417}
6418
6419void OSD::send_beacon(const ceph::coarse_mono_clock::time_point& now)
6420{
6421 const auto& monmap = monc->monmap;
6422 // send beacon to mon even if we are just connected, and the monmap is not
6423 // initialized yet by then.
6424 if (monmap.epoch > 0 &&
6425 monmap.get_required_features().contains_all(
6426 ceph::features::mon::FEATURE_LUMINOUS)) {
6427 dout(20) << __func__ << " sending" << dendl;
7c673cae
FG
6428 MOSDBeacon* beacon = nullptr;
6429 {
6430 Mutex::Locker l{min_last_epoch_clean_lock};
6431 beacon = new MOSDBeacon(osdmap->get_epoch(), min_last_epoch_clean);
6432 std::swap(beacon->pgs, min_last_epoch_clean_pgs);
224ce89b 6433 last_sent_beacon = now;
7c673cae
FG
6434 }
6435 monc->send_mon_message(beacon);
6436 } else {
6437 dout(20) << __func__ << " not sending" << dendl;
6438 }
6439}
6440
6441void OSD::handle_command(MMonCommand *m)
6442{
6443 if (!require_mon_peer(m)) {
6444 m->put();
6445 return;
6446 }
6447
6448 Command *c = new Command(m->cmd, m->get_tid(), m->get_data(), NULL);
6449 command_wq.queue(c);
6450 m->put();
6451}
6452
6453void OSD::handle_command(MCommand *m)
6454{
6455 ConnectionRef con = m->get_connection();
6456 Session *session = static_cast<Session *>(con->get_priv());
6457 if (!session) {
6458 con->send_message(new MCommandReply(m, -EPERM));
6459 m->put();
6460 return;
6461 }
6462
6463 OSDCap& caps = session->caps;
6464 session->put();
6465
6466 if (!caps.allow_all() || m->get_source().is_mon()) {
6467 con->send_message(new MCommandReply(m, -EPERM));
6468 m->put();
6469 return;
6470 }
6471
6472 Command *c = new Command(m->cmd, m->get_tid(), m->get_data(), con.get());
6473 command_wq.queue(c);
6474
6475 m->put();
6476}
6477
6478struct OSDCommand {
6479 string cmdstring;
6480 string helpstring;
6481 string module;
6482 string perm;
6483 string availability;
6484} osd_commands[] = {
6485
6486#define COMMAND(parsesig, helptext, module, perm, availability) \
6487 {parsesig, helptext, module, perm, availability},
6488
6489// yes, these are really pg commands, but there's a limit to how
6490// much work it's worth. The OSD returns all of them. Make this
6491// form (pg <pgid> <cmd>) valid only for the cli.
6492// Rest uses "tell <pgid> <cmd>"
6493
6494COMMAND("pg " \
6495 "name=pgid,type=CephPgid " \
6496 "name=cmd,type=CephChoices,strings=query", \
6497 "show details of a specific pg", "osd", "r", "cli")
6498COMMAND("pg " \
6499 "name=pgid,type=CephPgid " \
6500 "name=cmd,type=CephChoices,strings=mark_unfound_lost " \
6501 "name=mulcmd,type=CephChoices,strings=revert|delete", \
6502 "mark all unfound objects in this pg as lost, either removing or reverting to a prior version if one is available",
6503 "osd", "rw", "cli")
6504COMMAND("pg " \
6505 "name=pgid,type=CephPgid " \
6506 "name=cmd,type=CephChoices,strings=list_missing " \
6507 "name=offset,type=CephString,req=false",
6508 "list missing objects on this pg, perhaps starting at an offset given in JSON",
6509 "osd", "r", "cli")
6510
6511// new form: tell <pgid> <cmd> for both cli and rest
6512
6513COMMAND("query",
6514 "show details of a specific pg", "osd", "r", "cli,rest")
6515COMMAND("mark_unfound_lost " \
6516 "name=mulcmd,type=CephChoices,strings=revert|delete", \
6517 "mark all unfound objects in this pg as lost, either removing or reverting to a prior version if one is available",
6518 "osd", "rw", "cli,rest")
6519COMMAND("list_missing " \
6520 "name=offset,type=CephString,req=false",
6521 "list missing objects on this pg, perhaps starting at an offset given in JSON",
6522 "osd", "r", "cli,rest")
31f18b77
FG
6523COMMAND("perf histogram dump "
6524 "name=logger,type=CephString,req=false "
6525 "name=counter,type=CephString,req=false",
6526 "Get histogram data",
6527 "osd", "r", "cli,rest")
7c673cae
FG
6528
6529// tell <osd.n> commands. Validation of osd.n must be special-cased in client
6530COMMAND("version", "report version of OSD", "osd", "r", "cli,rest")
6531COMMAND("get_command_descriptions", "list commands descriptions", "osd", "r", "cli,rest")
6532COMMAND("injectargs " \
6533 "name=injected_args,type=CephString,n=N",
6534 "inject configuration arguments into running OSD",
6535 "osd", "rw", "cli,rest")
c07f9fc5
FG
6536COMMAND("config set " \
6537 "name=key,type=CephString name=value,type=CephString",
6538 "Set a configuration option at runtime (not persistent)",
6539 "osd", "rw", "cli,rest")
7c673cae
FG
6540COMMAND("cluster_log " \
6541 "name=level,type=CephChoices,strings=error,warning,info,debug " \
6542 "name=message,type=CephString,n=N",
6543 "log a message to the cluster log",
6544 "osd", "rw", "cli,rest")
6545COMMAND("bench " \
6546 "name=count,type=CephInt,req=false " \
6547 "name=size,type=CephInt,req=false " \
6548 "name=object_size,type=CephInt,req=false " \
6549 "name=object_num,type=CephInt,req=false ", \
6550 "OSD benchmark: write <count> <size>-byte objects, " \
6551 "(default 1G size 4MB). Results in log.",
6552 "osd", "rw", "cli,rest")
6553COMMAND("flush_pg_stats", "flush pg stats", "osd", "rw", "cli,rest")
6554COMMAND("heap " \
6555 "name=heapcmd,type=CephChoices,strings=dump|start_profiler|stop_profiler|release|stats", \
6556 "show heap usage info (available only if compiled with tcmalloc)", \
6557 "osd", "rw", "cli,rest")
6558COMMAND("debug dump_missing " \
6559 "name=filename,type=CephFilepath",
6560 "dump missing objects to a named file", "osd", "r", "cli,rest")
6561COMMAND("debug kick_recovery_wq " \
6562 "name=delay,type=CephInt,range=0",
6563 "set osd_recovery_delay_start to <val>", "osd", "rw", "cli,rest")
6564COMMAND("cpu_profiler " \
6565 "name=arg,type=CephChoices,strings=status|flush",
6566 "run cpu profiling on daemon", "osd", "rw", "cli,rest")
6567COMMAND("dump_pg_recovery_stats", "dump pg recovery statistics",
6568 "osd", "r", "cli,rest")
6569COMMAND("reset_pg_recovery_stats", "reset pg recovery statistics",
6570 "osd", "rw", "cli,rest")
224ce89b
WB
6571COMMAND("compact",
6572 "compact object store's omap. "
6573 "WARNING: Compaction probably slows your requests",
6574 "osd", "rw", "cli,rest")
7c673cae
FG
6575};
6576
6577void OSD::do_command(Connection *con, ceph_tid_t tid, vector<string>& cmd, bufferlist& data)
6578{
6579 int r = 0;
6580 stringstream ss, ds;
6581 string rs;
6582 bufferlist odata;
6583
6584 dout(20) << "do_command tid " << tid << " " << cmd << dendl;
6585
6586 map<string, cmd_vartype> cmdmap;
6587 string prefix;
6588 string format;
6589 string pgidstr;
6590 boost::scoped_ptr<Formatter> f;
6591
6592 if (cmd.empty()) {
6593 ss << "no command given";
6594 goto out;
6595 }
6596
6597 if (!cmdmap_from_json(cmd, &cmdmap, ss)) {
6598 r = -EINVAL;
6599 goto out;
6600 }
6601
6602 cmd_getval(cct, cmdmap, "prefix", prefix);
6603
6604 if (prefix == "get_command_descriptions") {
6605 int cmdnum = 0;
6606 JSONFormatter *f = new JSONFormatter();
6607 f->open_object_section("command_descriptions");
6608 for (OSDCommand *cp = osd_commands;
6609 cp < &osd_commands[ARRAY_SIZE(osd_commands)]; cp++) {
6610
6611 ostringstream secname;
6612 secname << "cmd" << setfill('0') << std::setw(3) << cmdnum;
6613 dump_cmddesc_to_json(f, secname.str(), cp->cmdstring, cp->helpstring,
6614 cp->module, cp->perm, cp->availability, 0);
6615 cmdnum++;
6616 }
6617 f->close_section(); // command_descriptions
6618
6619 f->flush(ds);
6620 delete f;
6621 goto out;
6622 }
6623
6624 cmd_getval(cct, cmdmap, "format", format);
6625 f.reset(Formatter::create(format));
6626
6627 if (prefix == "version") {
6628 if (f) {
6629 f->open_object_section("version");
6630 f->dump_string("version", pretty_version_to_str());
6631 f->close_section();
6632 f->flush(ds);
6633 } else {
6634 ds << pretty_version_to_str();
6635 }
6636 goto out;
6637 }
6638 else if (prefix == "injectargs") {
6639 vector<string> argsvec;
6640 cmd_getval(cct, cmdmap, "injected_args", argsvec);
6641
6642 if (argsvec.empty()) {
6643 r = -EINVAL;
6644 ss << "ignoring empty injectargs";
6645 goto out;
6646 }
6647 string args = argsvec.front();
6648 for (vector<string>::iterator a = ++argsvec.begin(); a != argsvec.end(); ++a)
6649 args += " " + *a;
6650 osd_lock.Unlock();
6651 r = cct->_conf->injectargs(args, &ss);
6652 osd_lock.Lock();
6653 }
c07f9fc5
FG
6654 else if (prefix == "config set") {
6655 std::string key;
6656 std::string val;
6657 cmd_getval(cct, cmdmap, "key", key);
6658 cmd_getval(cct, cmdmap, "value", val);
6659 osd_lock.Unlock();
6660 r = cct->_conf->set_val(key, val, true, &ss);
d2e6a577
FG
6661 if (r == 0) {
6662 cct->_conf->apply_changes(nullptr);
6663 }
c07f9fc5
FG
6664 osd_lock.Lock();
6665 }
7c673cae
FG
6666 else if (prefix == "cluster_log") {
6667 vector<string> msg;
6668 cmd_getval(cct, cmdmap, "message", msg);
6669 if (msg.empty()) {
6670 r = -EINVAL;
6671 ss << "ignoring empty log message";
6672 goto out;
6673 }
6674 string message = msg.front();
6675 for (vector<string>::iterator a = ++msg.begin(); a != msg.end(); ++a)
6676 message += " " + *a;
6677 string lvl;
6678 cmd_getval(cct, cmdmap, "level", lvl);
6679 clog_type level = string_to_clog_type(lvl);
6680 if (level < 0) {
6681 r = -EINVAL;
6682 ss << "unknown level '" << lvl << "'";
6683 goto out;
6684 }
6685 clog->do_log(level, message);
6686 }
6687
6688 // either 'pg <pgid> <command>' or
6689 // 'tell <pgid>' (which comes in without any of that prefix)?
6690
6691 else if (prefix == "pg" ||
6692 prefix == "query" ||
6693 prefix == "mark_unfound_lost" ||
6694 prefix == "list_missing"
6695 ) {
6696 pg_t pgid;
6697
6698 if (!cmd_getval(cct, cmdmap, "pgid", pgidstr)) {
6699 ss << "no pgid specified";
6700 r = -EINVAL;
6701 } else if (!pgid.parse(pgidstr.c_str())) {
6702 ss << "couldn't parse pgid '" << pgidstr << "'";
6703 r = -EINVAL;
6704 } else {
6705 spg_t pcand;
6706 PG *pg = nullptr;
6707 if (osdmap->get_primary_shard(pgid, &pcand) &&
6708 (pg = _lookup_lock_pg(pcand))) {
6709 if (pg->is_primary()) {
6710 // simulate pg <pgid> cmd= for pg->do-command
6711 if (prefix != "pg")
6712 cmd_putval(cct, cmdmap, "cmd", prefix);
6713 r = pg->do_command(cmdmap, ss, data, odata, con, tid);
6714 if (r == -EAGAIN) {
6715 pg->unlock();
6716 // don't reply, pg will do so async
6717 return;
6718 }
6719 } else {
6720 ss << "not primary for pgid " << pgid;
6721
6722 // send them the latest diff to ensure they realize the mapping
6723 // has changed.
6724 service.send_incremental_map(osdmap->get_epoch() - 1, con, osdmap);
6725
6726 // do not reply; they will get newer maps and realize they
6727 // need to resend.
6728 pg->unlock();
6729 return;
6730 }
6731 pg->unlock();
6732 } else {
6733 ss << "i don't have pgid " << pgid;
6734 r = -ENOENT;
6735 }
6736 }
6737 }
6738
6739 else if (prefix == "bench") {
6740 int64_t count;
6741 int64_t bsize;
6742 int64_t osize, onum;
6743 // default count 1G, size 4MB
6744 cmd_getval(cct, cmdmap, "count", count, (int64_t)1 << 30);
6745 cmd_getval(cct, cmdmap, "size", bsize, (int64_t)4 << 20);
6746 cmd_getval(cct, cmdmap, "object_size", osize, (int64_t)0);
6747 cmd_getval(cct, cmdmap, "object_num", onum, (int64_t)0);
6748
6749 ceph::shared_ptr<ObjectStore::Sequencer> osr (std::make_shared<
6750 ObjectStore::Sequencer>("bench"));
6751
6752 uint32_t duration = cct->_conf->osd_bench_duration;
6753
6754 if (bsize > (int64_t) cct->_conf->osd_bench_max_block_size) {
6755 // let us limit the block size because the next checks rely on it
6756 // having a sane value. If we allow any block size to be set things
6757 // can still go sideways.
6758 ss << "block 'size' values are capped at "
6759 << prettybyte_t(cct->_conf->osd_bench_max_block_size) << ". If you wish to use"
6760 << " a higher value, please adjust 'osd_bench_max_block_size'";
6761 r = -EINVAL;
6762 goto out;
6763 } else if (bsize < (int64_t) (1 << 20)) {
6764 // entering the realm of small block sizes.
6765 // limit the count to a sane value, assuming a configurable amount of
6766 // IOPS and duration, so that the OSD doesn't get hung up on this,
6767 // preventing timeouts from going off
6768 int64_t max_count =
6769 bsize * duration * cct->_conf->osd_bench_small_size_max_iops;
6770 if (count > max_count) {
6771 ss << "'count' values greater than " << max_count
6772 << " for a block size of " << prettybyte_t(bsize) << ", assuming "
6773 << cct->_conf->osd_bench_small_size_max_iops << " IOPS,"
6774 << " for " << duration << " seconds,"
6775 << " can cause ill effects on osd. "
6776 << " Please adjust 'osd_bench_small_size_max_iops' with a higher"
6777 << " value if you wish to use a higher 'count'.";
6778 r = -EINVAL;
6779 goto out;
6780 }
6781 } else {
6782 // 1MB block sizes are big enough so that we get more stuff done.
6783 // However, to avoid the osd from getting hung on this and having
6784 // timers being triggered, we are going to limit the count assuming
6785 // a configurable throughput and duration.
6786 // NOTE: max_count is the total amount of bytes that we believe we
6787 // will be able to write during 'duration' for the given
6788 // throughput. The block size hardly impacts this unless it's
6789 // way too big. Given we already check how big the block size
6790 // is, it's safe to assume everything will check out.
6791 int64_t max_count =
6792 cct->_conf->osd_bench_large_size_max_throughput * duration;
6793 if (count > max_count) {
6794 ss << "'count' values greater than " << max_count
6795 << " for a block size of " << prettybyte_t(bsize) << ", assuming "
6796 << prettybyte_t(cct->_conf->osd_bench_large_size_max_throughput) << "/s,"
6797 << " for " << duration << " seconds,"
6798 << " can cause ill effects on osd. "
6799 << " Please adjust 'osd_bench_large_size_max_throughput'"
6800 << " with a higher value if you wish to use a higher 'count'.";
6801 r = -EINVAL;
6802 goto out;
6803 }
6804 }
6805
6806 if (osize && bsize > osize)
6807 bsize = osize;
6808
6809 dout(1) << " bench count " << count
6810 << " bsize " << prettybyte_t(bsize) << dendl;
6811
6812 ObjectStore::Transaction cleanupt;
6813
6814 if (osize && onum) {
6815 bufferlist bl;
6816 bufferptr bp(osize);
6817 bp.zero();
6818 bl.push_back(std::move(bp));
6819 bl.rebuild_page_aligned();
6820 for (int i=0; i<onum; ++i) {
6821 char nm[30];
6822 snprintf(nm, sizeof(nm), "disk_bw_test_%d", i);
6823 object_t oid(nm);
6824 hobject_t soid(sobject_t(oid, 0));
6825 ObjectStore::Transaction t;
6826 t.write(coll_t(), ghobject_t(soid), 0, osize, bl);
6827 store->queue_transaction(osr.get(), std::move(t), NULL);
6828 cleanupt.remove(coll_t(), ghobject_t(soid));
6829 }
6830 }
6831
6832 bufferlist bl;
6833 bufferptr bp(bsize);
6834 bp.zero();
6835 bl.push_back(std::move(bp));
6836 bl.rebuild_page_aligned();
6837
6838 {
6839 C_SaferCond waiter;
6840 if (!osr->flush_commit(&waiter)) {
6841 waiter.wait();
6842 }
6843 }
6844
6845 utime_t start = ceph_clock_now();
6846 for (int64_t pos = 0; pos < count; pos += bsize) {
6847 char nm[30];
6848 unsigned offset = 0;
6849 if (onum && osize) {
6850 snprintf(nm, sizeof(nm), "disk_bw_test_%d", (int)(rand() % onum));
6851 offset = rand() % (osize / bsize) * bsize;
6852 } else {
6853 snprintf(nm, sizeof(nm), "disk_bw_test_%lld", (long long)pos);
6854 }
6855 object_t oid(nm);
6856 hobject_t soid(sobject_t(oid, 0));
6857 ObjectStore::Transaction t;
6858 t.write(coll_t::meta(), ghobject_t(soid), offset, bsize, bl);
6859 store->queue_transaction(osr.get(), std::move(t), NULL);
6860 if (!onum || !osize)
6861 cleanupt.remove(coll_t::meta(), ghobject_t(soid));
6862 }
6863
6864 {
6865 C_SaferCond waiter;
6866 if (!osr->flush_commit(&waiter)) {
6867 waiter.wait();
6868 }
6869 }
6870 utime_t end = ceph_clock_now();
6871
6872 // clean up
6873 store->queue_transaction(osr.get(), std::move(cleanupt), NULL);
6874 {
6875 C_SaferCond waiter;
6876 if (!osr->flush_commit(&waiter)) {
6877 waiter.wait();
6878 }
6879 }
6880
6881 uint64_t rate = (double)count / (end - start);
6882 if (f) {
6883 f->open_object_section("osd_bench_results");
6884 f->dump_int("bytes_written", count);
6885 f->dump_int("blocksize", bsize);
6886 f->dump_unsigned("bytes_per_sec", rate);
6887 f->close_section();
6888 f->flush(ss);
6889 } else {
6890 ss << "bench: wrote " << prettybyte_t(count)
6891 << " in blocks of " << prettybyte_t(bsize) << " in "
6892 << (end-start) << " sec at " << prettybyte_t(rate) << "/sec";
6893 }
6894 }
6895
6896 else if (prefix == "flush_pg_stats") {
31f18b77
FG
6897 if (osdmap->require_osd_release >= CEPH_RELEASE_LUMINOUS) {
6898 mgrc.send_pgstats();
6899 ds << service.get_osd_stat_seq() << "\n";
6900 } else {
6901 flush_pg_stats();
6902 }
7c673cae
FG
6903 }
6904
6905 else if (prefix == "heap") {
6906 r = ceph::osd_cmds::heap(*cct, cmdmap, *f, ds);
6907 }
6908
6909 else if (prefix == "debug dump_missing") {
6910 string file_name;
6911 cmd_getval(cct, cmdmap, "filename", file_name);
6912 std::ofstream fout(file_name.c_str());
6913 if (!fout.is_open()) {
6914 ss << "failed to open file '" << file_name << "'";
6915 r = -EINVAL;
6916 goto out;
6917 }
6918
6919 fout << "*** osd " << whoami << ": dump_missing ***" << std::endl;
6920 RWLock::RLocker l(pg_map_lock);
6921 for (ceph::unordered_map<spg_t, PG*>::const_iterator pg_map_e = pg_map.begin();
6922 pg_map_e != pg_map.end(); ++pg_map_e) {
6923 PG *pg = pg_map_e->second;
6924 pg->lock();
6925
6926 fout << *pg << std::endl;
6927 std::map<hobject_t, pg_missing_item>::const_iterator mend =
6928 pg->pg_log.get_missing().get_items().end();
6929 std::map<hobject_t, pg_missing_item>::const_iterator mi =
6930 pg->pg_log.get_missing().get_items().begin();
6931 for (; mi != mend; ++mi) {
6932 fout << mi->first << " -> " << mi->second << std::endl;
6933 if (!pg->missing_loc.needs_recovery(mi->first))
6934 continue;
6935 if (pg->missing_loc.is_unfound(mi->first))
6936 fout << " unfound ";
6937 const set<pg_shard_t> &mls(pg->missing_loc.get_locations(mi->first));
6938 if (mls.empty())
6939 continue;
6940 fout << "missing_loc: " << mls << std::endl;
6941 }
6942 pg->unlock();
6943 fout << std::endl;
6944 }
6945
6946 fout.close();
6947 }
6948 else if (prefix == "debug kick_recovery_wq") {
6949 int64_t delay;
6950 cmd_getval(cct, cmdmap, "delay", delay);
6951 ostringstream oss;
6952 oss << delay;
6953 r = cct->_conf->set_val("osd_recovery_delay_start", oss.str().c_str());
6954 if (r != 0) {
6955 ss << "kick_recovery_wq: error setting "
6956 << "osd_recovery_delay_start to '" << delay << "': error "
6957 << r;
6958 goto out;
6959 }
6960 cct->_conf->apply_changes(NULL);
6961 ss << "kicking recovery queue. set osd_recovery_delay_start "
6962 << "to " << cct->_conf->osd_recovery_delay_start;
6963 }
6964
6965 else if (prefix == "cpu_profiler") {
6966 string arg;
6967 cmd_getval(cct, cmdmap, "arg", arg);
6968 vector<string> argvec;
6969 get_str_vec(arg, argvec);
6970 cpu_profiler_handle_command(argvec, ds);
6971 }
6972
6973 else if (prefix == "dump_pg_recovery_stats") {
6974 stringstream s;
6975 if (f) {
6976 pg_recovery_stats.dump_formatted(f.get());
6977 f->flush(ds);
6978 } else {
6979 pg_recovery_stats.dump(s);
6980 ds << "dump pg recovery stats: " << s.str();
6981 }
6982 }
6983
6984 else if (prefix == "reset_pg_recovery_stats") {
6985 ss << "reset pg recovery stats";
6986 pg_recovery_stats.reset();
6987 }
6988
31f18b77
FG
6989 else if (prefix == "perf histogram dump") {
6990 std::string logger;
6991 std::string counter;
6992 cmd_getval(cct, cmdmap, "logger", logger);
6993 cmd_getval(cct, cmdmap, "counter", counter);
6994 if (f) {
6995 cct->get_perfcounters_collection()->dump_formatted_histograms(
6996 f.get(), false, logger, counter);
6997 f->flush(ds);
6998 }
6999 }
7000
224ce89b
WB
7001 else if (prefix == "compact") {
7002 dout(1) << "triggering manual compaction" << dendl;
7003 auto start = ceph::coarse_mono_clock::now();
7004 store->compact();
7005 auto end = ceph::coarse_mono_clock::now();
7006 auto time_span = chrono::duration_cast<chrono::duration<double>>(end - start);
7007 dout(1) << "finished manual compaction in "
7008 << time_span.count()
7009 << " seconds" << dendl;
7010 ss << "compacted omap in " << time_span.count() << " seconds";
7011 }
7012
7c673cae
FG
7013 else {
7014 ss << "unrecognized command! " << cmd;
7015 r = -EINVAL;
7016 }
7017
7018 out:
7019 rs = ss.str();
7020 odata.append(ds);
7021 dout(0) << "do_command r=" << r << " " << rs << dendl;
7022 clog->info() << rs;
7023 if (con) {
7024 MCommandReply *reply = new MCommandReply(r, rs);
7025 reply->set_tid(tid);
7026 reply->set_data(odata);
7027 con->send_message(reply);
7028 }
7029}
7030
7031bool OSD::heartbeat_dispatch(Message *m)
7032{
7033 dout(30) << "heartbeat_dispatch " << m << dendl;
7034 switch (m->get_type()) {
7035
7036 case CEPH_MSG_PING:
7037 dout(10) << "ping from " << m->get_source_inst() << dendl;
7038 m->put();
7039 break;
7040
7041 case MSG_OSD_PING:
7042 handle_osd_ping(static_cast<MOSDPing*>(m));
7043 break;
7044
7045 default:
7046 dout(0) << "dropping unexpected message " << *m << " from " << m->get_source_inst() << dendl;
7047 m->put();
7048 }
7049
7050 return true;
7051}
7052
7053bool OSD::ms_dispatch(Message *m)
7054{
7055 dout(20) << "OSD::ms_dispatch: " << *m << dendl;
7056 if (m->get_type() == MSG_OSD_MARK_ME_DOWN) {
7057 service.got_stop_ack();
7058 m->put();
7059 return true;
7060 }
7061
7062 // lock!
7063
7064 osd_lock.Lock();
7065 if (is_stopping()) {
7066 osd_lock.Unlock();
7067 m->put();
7068 return true;
7069 }
7070
7071 do_waiters();
7072 _dispatch(m);
7073
7074 osd_lock.Unlock();
7075
7076 return true;
7077}
7078
7079void OSD::maybe_share_map(
7080 Session *session,
7081 OpRequestRef op,
7082 OSDMapRef osdmap)
7083{
7084 if (!op->check_send_map) {
7085 return;
7086 }
7087 epoch_t last_sent_epoch = 0;
7088
7089 session->sent_epoch_lock.lock();
7090 last_sent_epoch = session->last_sent_epoch;
7091 session->sent_epoch_lock.unlock();
7092
7093 const Message *m = op->get_req();
7094 service.share_map(
7095 m->get_source(),
7096 m->get_connection().get(),
7097 op->sent_epoch,
7098 osdmap,
7099 session ? &last_sent_epoch : NULL);
7100
7101 session->sent_epoch_lock.lock();
7102 if (session->last_sent_epoch < last_sent_epoch) {
7103 session->last_sent_epoch = last_sent_epoch;
7104 }
7105 session->sent_epoch_lock.unlock();
7106
7107 op->check_send_map = false;
7108}
7109
7110void OSD::dispatch_session_waiting(Session *session, OSDMapRef osdmap)
7111{
7112 assert(session->session_dispatch_lock.is_locked());
7113
7114 auto i = session->waiting_on_map.begin();
7115 while (i != session->waiting_on_map.end()) {
7116 OpRequestRef op = &(*i);
7117 assert(ms_can_fast_dispatch(op->get_req()));
7118 const MOSDFastDispatchOp *m = static_cast<const MOSDFastDispatchOp*>(
7119 op->get_req());
7120 if (m->get_min_epoch() > osdmap->get_epoch()) {
7121 break;
7122 }
7123 session->waiting_on_map.erase(i++);
7124 op->put();
7125
7126 spg_t pgid;
7127 if (m->get_type() == CEPH_MSG_OSD_OP) {
7128 pg_t actual_pgid = osdmap->raw_pg_to_pg(
7129 static_cast<const MOSDOp*>(m)->get_pg());
7130 if (!osdmap->get_primary_shard(actual_pgid, &pgid)) {
7131 continue;
7132 }
7133 } else {
7134 pgid = m->get_spg();
7135 }
7136 enqueue_op(pgid, op, m->get_map_epoch());
7137 }
7138
7139 if (session->waiting_on_map.empty()) {
7140 clear_session_waiting_on_map(session);
7141 } else {
7142 register_session_waiting_on_map(session);
7143 }
7144}
7145
7146void OSD::ms_fast_dispatch(Message *m)
7147{
7148 FUNCTRACE();
7149 if (service.is_stopping()) {
7150 m->put();
7151 return;
7152 }
7153 OpRequestRef op = op_tracker.create_request<OpRequest, Message*>(m);
7154 {
7155#ifdef WITH_LTTNG
7156 osd_reqid_t reqid = op->get_reqid();
7157#endif
7158 tracepoint(osd, ms_fast_dispatch, reqid.name._type,
7159 reqid.name._num, reqid.tid, reqid.inc);
7160 }
7161
7162 if (m->trace)
7163 op->osd_trace.init("osd op", &trace_endpoint, &m->trace);
7164
7165 // note sender epoch, min req'd epoch
7166 op->sent_epoch = static_cast<MOSDFastDispatchOp*>(m)->get_map_epoch();
7167 op->min_epoch = static_cast<MOSDFastDispatchOp*>(m)->get_min_epoch();
7168 assert(op->min_epoch <= op->sent_epoch); // sanity check!
7169
7170 service.maybe_inject_dispatch_delay();
7171
7172 if (m->get_connection()->has_features(CEPH_FEATUREMASK_RESEND_ON_SPLIT) ||
7173 m->get_type() != CEPH_MSG_OSD_OP) {
7174 // queue it directly
7175 enqueue_op(
7176 static_cast<MOSDFastDispatchOp*>(m)->get_spg(),
7177 op,
7178 static_cast<MOSDFastDispatchOp*>(m)->get_map_epoch());
7179 } else {
7180 // legacy client, and this is an MOSDOp (the *only* fast dispatch
7181 // message that didn't have an explicit spg_t); we need to map
7182 // them to an spg_t while preserving delivery order.
7183 Session *session = static_cast<Session*>(m->get_connection()->get_priv());
7184 if (session) {
7185 {
7186 Mutex::Locker l(session->session_dispatch_lock);
7187 op->get();
7188 session->waiting_on_map.push_back(*op);
7189 OSDMapRef nextmap = service.get_nextmap_reserved();
7190 dispatch_session_waiting(session, nextmap);
7191 service.release_map(nextmap);
7192 }
7193 session->put();
7194 }
7195 }
7196 OID_EVENT_TRACE_WITH_MSG(m, "MS_FAST_DISPATCH_END", false);
7197}
7198
7199void OSD::ms_fast_preprocess(Message *m)
7200{
7201 if (m->get_connection()->get_peer_type() == CEPH_ENTITY_TYPE_OSD) {
7202 if (m->get_type() == CEPH_MSG_OSD_MAP) {
7203 MOSDMap *mm = static_cast<MOSDMap*>(m);
7204 Session *s = static_cast<Session*>(m->get_connection()->get_priv());
7205 if (s) {
7206 s->received_map_lock.lock();
7207 s->received_map_epoch = mm->get_last();
7208 s->received_map_lock.unlock();
7209 s->put();
7210 }
7211 }
7212 }
7213}
7214
7215bool OSD::ms_get_authorizer(int dest_type, AuthAuthorizer **authorizer, bool force_new)
7216{
7217 dout(10) << "OSD::ms_get_authorizer type=" << ceph_entity_type_name(dest_type) << dendl;
7218
31f18b77
FG
7219 if (is_stopping()) {
7220 dout(10) << __func__ << " bailing, we are shutting down" << dendl;
7221 return false;
7222 }
7223
7c673cae
FG
7224 if (dest_type == CEPH_ENTITY_TYPE_MON)
7225 return true;
7226
7227 if (force_new) {
7228 /* the MonClient checks keys every tick(), so we should just wait for that cycle
7229 to get through */
7230 if (monc->wait_auth_rotating(10) < 0) {
7231 derr << "OSD::ms_get_authorizer wait_auth_rotating failed" << dendl;
7232 return false;
7233 }
7234 }
7235
7236 *authorizer = monc->build_authorizer(dest_type);
7237 return *authorizer != NULL;
7238}
7239
7240
28e407b8
AA
7241bool OSD::ms_verify_authorizer(
7242 Connection *con, int peer_type,
7243 int protocol, bufferlist& authorizer_data, bufferlist& authorizer_reply,
7244 bool& isvalid, CryptoKey& session_key,
7245 std::unique_ptr<AuthAuthorizerChallenge> *challenge)
7c673cae
FG
7246{
7247 AuthAuthorizeHandler *authorize_handler = 0;
7248 switch (peer_type) {
7249 case CEPH_ENTITY_TYPE_MDS:
7250 /*
7251 * note: mds is technically a client from our perspective, but
7252 * this makes the 'cluster' consistent w/ monitor's usage.
7253 */
7254 case CEPH_ENTITY_TYPE_OSD:
7255 case CEPH_ENTITY_TYPE_MGR:
7256 authorize_handler = authorize_handler_cluster_registry->get_handler(protocol);
7257 break;
7258 default:
7259 authorize_handler = authorize_handler_service_registry->get_handler(protocol);
7260 }
7261 if (!authorize_handler) {
7262 dout(0) << "No AuthAuthorizeHandler found for protocol " << protocol << dendl;
7263 isvalid = false;
7264 return true;
7265 }
7266
7267 AuthCapsInfo caps_info;
7268 EntityName name;
7269 uint64_t global_id;
7270 uint64_t auid = CEPH_AUTH_UID_DEFAULT;
7271
c07f9fc5
FG
7272 RotatingKeyRing *keys = monc->rotating_secrets.get();
7273 if (keys) {
7274 isvalid = authorize_handler->verify_authorizer(
7275 cct, keys,
7276 authorizer_data, authorizer_reply, name, global_id, caps_info, session_key,
28e407b8 7277 &auid, challenge);
c07f9fc5
FG
7278 } else {
7279 dout(10) << __func__ << " no rotating_keys (yet), denied" << dendl;
7280 isvalid = false;
7281 }
7c673cae
FG
7282
7283 if (isvalid) {
7284 Session *s = static_cast<Session *>(con->get_priv());
7285 if (!s) {
7286 s = new Session(cct);
7287 con->set_priv(s->get());
7288 s->con = con;
7289 dout(10) << " new session " << s << " con=" << s->con << " addr=" << s->con->get_peer_addr() << dendl;
7290 }
7291
7292 s->entity_name = name;
7293 if (caps_info.allow_all)
7294 s->caps.set_allow_all();
7295 s->auid = auid;
7296
7297 if (caps_info.caps.length() > 0) {
7298 bufferlist::iterator p = caps_info.caps.begin();
7299 string str;
7300 try {
7301 ::decode(str, p);
7302 }
7303 catch (buffer::error& e) {
7304 }
7305 bool success = s->caps.parse(str);
7306 if (success)
7307 dout(10) << " session " << s << " " << s->entity_name << " has caps " << s->caps << " '" << str << "'" << dendl;
7308 else
7309 dout(10) << " session " << s << " " << s->entity_name << " failed to parse caps '" << str << "'" << dendl;
7310 }
7311
7312 s->put();
7313 }
7314 return true;
7315}
7316
7317void OSD::do_waiters()
7318{
7319 assert(osd_lock.is_locked());
7320
7321 dout(10) << "do_waiters -- start" << dendl;
7322 while (!finished.empty()) {
7323 OpRequestRef next = finished.front();
7324 finished.pop_front();
7325 dispatch_op(next);
7326 }
7327 dout(10) << "do_waiters -- finish" << dendl;
7328}
7329
7330void OSD::dispatch_op(OpRequestRef op)
7331{
7332 switch (op->get_req()->get_type()) {
7333
7334 case MSG_OSD_PG_CREATE:
7335 handle_pg_create(op);
7336 break;
7337 case MSG_OSD_PG_NOTIFY:
7338 handle_pg_notify(op);
7339 break;
7340 case MSG_OSD_PG_QUERY:
7341 handle_pg_query(op);
7342 break;
7343 case MSG_OSD_PG_LOG:
7344 handle_pg_log(op);
7345 break;
7346 case MSG_OSD_PG_REMOVE:
7347 handle_pg_remove(op);
7348 break;
7349 case MSG_OSD_PG_INFO:
7350 handle_pg_info(op);
7351 break;
7352 case MSG_OSD_PG_TRIM:
7353 handle_pg_trim(op);
7354 break;
7355 case MSG_OSD_BACKFILL_RESERVE:
7356 handle_pg_backfill_reserve(op);
7357 break;
7358 case MSG_OSD_RECOVERY_RESERVE:
7359 handle_pg_recovery_reserve(op);
7360 break;
7361 }
7362}
7363
7364void OSD::_dispatch(Message *m)
7365{
7366 assert(osd_lock.is_locked());
7367 dout(20) << "_dispatch " << m << " " << *m << dendl;
7368
7369 switch (m->get_type()) {
7370
7371 // -- don't need lock --
7372 case CEPH_MSG_PING:
7373 dout(10) << "ping from " << m->get_source() << dendl;
7374 m->put();
7375 break;
7376
7377 // -- don't need OSDMap --
7378
7379 // map and replication
7380 case CEPH_MSG_OSD_MAP:
7381 handle_osd_map(static_cast<MOSDMap*>(m));
7382 break;
7383
7384 // osd
7385 case MSG_PGSTATSACK:
7386 handle_pg_stats_ack(static_cast<MPGStatsAck*>(m));
7387 break;
7388
7389 case MSG_MON_COMMAND:
7390 handle_command(static_cast<MMonCommand*>(m));
7391 break;
7392 case MSG_COMMAND:
7393 handle_command(static_cast<MCommand*>(m));
7394 break;
7395
7396 case MSG_OSD_SCRUB:
7397 handle_scrub(static_cast<MOSDScrub*>(m));
7398 break;
7399
c07f9fc5
FG
7400 case MSG_OSD_FORCE_RECOVERY:
7401 handle_force_recovery(m);
7402 break;
7403
7c673cae
FG
7404 // -- need OSDMap --
7405
7406 case MSG_OSD_PG_CREATE:
7407 case MSG_OSD_PG_NOTIFY:
7408 case MSG_OSD_PG_QUERY:
7409 case MSG_OSD_PG_LOG:
7410 case MSG_OSD_PG_REMOVE:
7411 case MSG_OSD_PG_INFO:
7412 case MSG_OSD_PG_TRIM:
7413 case MSG_OSD_BACKFILL_RESERVE:
7414 case MSG_OSD_RECOVERY_RESERVE:
7415 {
7416 OpRequestRef op = op_tracker.create_request<OpRequest, Message*>(m);
7417 if (m->trace)
7418 op->osd_trace.init("osd op", &trace_endpoint, &m->trace);
7419 // no map? starting up?
7420 if (!osdmap) {
7421 dout(7) << "no OSDMap, not booted" << dendl;
7422 logger->inc(l_osd_waiting_for_map);
7423 waiting_for_osdmap.push_back(op);
7424 op->mark_delayed("no osdmap");
7425 break;
7426 }
7427
7428 // need OSDMap
7429 dispatch_op(op);
7430 }
7431 }
7432}
7433
7434void OSD::handle_pg_scrub(MOSDScrub *m, PG *pg)
7435{
7436 pg->lock();
7437 if (pg->is_primary()) {
7438 pg->unreg_next_scrub();
7439 pg->scrubber.must_scrub = true;
7440 pg->scrubber.must_deep_scrub = m->deep || m->repair;
7441 pg->scrubber.must_repair = m->repair;
7442 pg->reg_next_scrub();
7443 dout(10) << "marking " << *pg << " for scrub" << dendl;
7444 }
7445 pg->unlock();
7446}
7447
7448void OSD::handle_scrub(MOSDScrub *m)
7449{
7450 dout(10) << "handle_scrub " << *m << dendl;
7451 if (!require_mon_or_mgr_peer(m)) {
7452 m->put();
7453 return;
7454 }
7455 if (m->fsid != monc->get_fsid()) {
7456 dout(0) << "handle_scrub fsid " << m->fsid << " != " << monc->get_fsid() << dendl;
7457 m->put();
7458 return;
7459 }
7460
7461 RWLock::RLocker l(pg_map_lock);
7462 if (m->scrub_pgs.empty()) {
7463 for (ceph::unordered_map<spg_t, PG*>::iterator p = pg_map.begin();
7464 p != pg_map.end();
7465 ++p)
7466 handle_pg_scrub(m, p->second);
7467 } else {
7468 for (vector<pg_t>::iterator p = m->scrub_pgs.begin();
7469 p != m->scrub_pgs.end();
7470 ++p) {
7471 spg_t pcand;
7472 if (osdmap->get_primary_shard(*p, &pcand)) {
7473 auto pg_map_entry = pg_map.find(pcand);
7474 if (pg_map_entry != pg_map.end()) {
7475 handle_pg_scrub(m, pg_map_entry->second);
7476 }
7477 }
7478 }
7479 }
7480
7481 m->put();
7482}
7483
7484bool OSD::scrub_random_backoff()
7485{
7486 bool coin_flip = (rand() / (double)RAND_MAX >=
7487 cct->_conf->osd_scrub_backoff_ratio);
7488 if (!coin_flip) {
7489 dout(20) << "scrub_random_backoff lost coin flip, randomly backing off" << dendl;
7490 return true;
7491 }
7492 return false;
7493}
7494
7495OSDService::ScrubJob::ScrubJob(CephContext* cct,
7496 const spg_t& pg, const utime_t& timestamp,
7497 double pool_scrub_min_interval,
7498 double pool_scrub_max_interval, bool must)
7499 : cct(cct),
7500 pgid(pg),
7501 sched_time(timestamp),
7502 deadline(timestamp)
7503{
7504 // if not explicitly requested, postpone the scrub with a random delay
7505 if (!must) {
7506 double scrub_min_interval = pool_scrub_min_interval > 0 ?
7507 pool_scrub_min_interval : cct->_conf->osd_scrub_min_interval;
7508 double scrub_max_interval = pool_scrub_max_interval > 0 ?
7509 pool_scrub_max_interval : cct->_conf->osd_scrub_max_interval;
7510
7511 sched_time += scrub_min_interval;
7512 double r = rand() / (double)RAND_MAX;
7513 sched_time +=
7514 scrub_min_interval * cct->_conf->osd_scrub_interval_randomize_ratio * r;
7515 deadline += scrub_max_interval;
7516 }
7517}
7518
7519bool OSDService::ScrubJob::ScrubJob::operator<(const OSDService::ScrubJob& rhs) const {
7520 if (sched_time < rhs.sched_time)
7521 return true;
7522 if (sched_time > rhs.sched_time)
7523 return false;
7524 return pgid < rhs.pgid;
7525}
7526
7527bool OSD::scrub_time_permit(utime_t now)
7528{
7529 struct tm bdt;
7530 time_t tt = now.sec();
7531 localtime_r(&tt, &bdt);
28e407b8
AA
7532
7533 bool day_permit = false;
7534 if (cct->_conf->osd_scrub_begin_week_day < cct->_conf->osd_scrub_end_week_day) {
7535 if (bdt.tm_wday >= cct->_conf->osd_scrub_begin_week_day && bdt.tm_wday < cct->_conf->osd_scrub_end_week_day) {
7536 day_permit = true;
7537 }
7538 } else {
7539 if (bdt.tm_wday >= cct->_conf->osd_scrub_begin_week_day || bdt.tm_wday < cct->_conf->osd_scrub_end_week_day) {
7540 day_permit = true;
7541 }
7542 }
7543
7544 if (!day_permit) {
7545 dout(20) << __func__ << " should run between week day " << cct->_conf->osd_scrub_begin_week_day
7546 << " - " << cct->_conf->osd_scrub_end_week_day
7547 << " now " << bdt.tm_wday << " = no" << dendl;
7548 return false;
7549 }
7550
7c673cae
FG
7551 bool time_permit = false;
7552 if (cct->_conf->osd_scrub_begin_hour < cct->_conf->osd_scrub_end_hour) {
7553 if (bdt.tm_hour >= cct->_conf->osd_scrub_begin_hour && bdt.tm_hour < cct->_conf->osd_scrub_end_hour) {
7554 time_permit = true;
7555 }
7556 } else {
7557 if (bdt.tm_hour >= cct->_conf->osd_scrub_begin_hour || bdt.tm_hour < cct->_conf->osd_scrub_end_hour) {
7558 time_permit = true;
7559 }
7560 }
7561 if (!time_permit) {
7562 dout(20) << __func__ << " should run between " << cct->_conf->osd_scrub_begin_hour
7563 << " - " << cct->_conf->osd_scrub_end_hour
7564 << " now " << bdt.tm_hour << " = no" << dendl;
7565 } else {
7566 dout(20) << __func__ << " should run between " << cct->_conf->osd_scrub_begin_hour
7567 << " - " << cct->_conf->osd_scrub_end_hour
7568 << " now " << bdt.tm_hour << " = yes" << dendl;
7569 }
7570 return time_permit;
7571}
7572
7573bool OSD::scrub_load_below_threshold()
7574{
7575 double loadavgs[3];
7576 if (getloadavg(loadavgs, 3) != 3) {
7577 dout(10) << __func__ << " couldn't read loadavgs\n" << dendl;
7578 return false;
7579 }
7580
7581 // allow scrub if below configured threshold
7582 if (loadavgs[0] < cct->_conf->osd_scrub_load_threshold) {
7583 dout(20) << __func__ << " loadavg " << loadavgs[0]
7584 << " < max " << cct->_conf->osd_scrub_load_threshold
7585 << " = yes" << dendl;
7586 return true;
7587 }
7588
7589 // allow scrub if below daily avg and currently decreasing
7590 if (loadavgs[0] < daily_loadavg && loadavgs[0] < loadavgs[2]) {
7591 dout(20) << __func__ << " loadavg " << loadavgs[0]
7592 << " < daily_loadavg " << daily_loadavg
7593 << " and < 15m avg " << loadavgs[2]
7594 << " = yes" << dendl;
7595 return true;
7596 }
7597
7598 dout(20) << __func__ << " loadavg " << loadavgs[0]
7599 << " >= max " << cct->_conf->osd_scrub_load_threshold
7600 << " and ( >= daily_loadavg " << daily_loadavg
7601 << " or >= 15m avg " << loadavgs[2]
7602 << ") = no" << dendl;
7603 return false;
7604}
7605
7606void OSD::sched_scrub()
7607{
7608 // if not permitted, fail fast
7609 if (!service.can_inc_scrubs_pending()) {
7610 return;
7611 }
b5b8bbf5
FG
7612 if (!cct->_conf->osd_scrub_during_recovery && service.is_recovery_active()) {
7613 dout(20) << __func__ << " not scheduling scrubs due to active recovery" << dendl;
7614 return;
7615 }
7616
7c673cae
FG
7617
7618 utime_t now = ceph_clock_now();
7619 bool time_permit = scrub_time_permit(now);
7620 bool load_is_low = scrub_load_below_threshold();
7621 dout(20) << "sched_scrub load_is_low=" << (int)load_is_low << dendl;
7622
7623 OSDService::ScrubJob scrub;
7624 if (service.first_scrub_stamp(&scrub)) {
7625 do {
7626 dout(30) << "sched_scrub examine " << scrub.pgid << " at " << scrub.sched_time << dendl;
7627
7628 if (scrub.sched_time > now) {
7629 // save ourselves some effort
7630 dout(10) << "sched_scrub " << scrub.pgid << " scheduled at " << scrub.sched_time
7631 << " > " << now << dendl;
7632 break;
7633 }
7634
7c673cae
FG
7635 if ((scrub.deadline >= now) && !(time_permit && load_is_low)) {
7636 dout(10) << __func__ << " not scheduling scrub for " << scrub.pgid << " due to "
7637 << (!time_permit ? "time not permit" : "high load") << dendl;
7638 continue;
7639 }
7640
7641 PG *pg = _lookup_lock_pg(scrub.pgid);
7642 if (!pg)
7643 continue;
7644 if (pg->get_pgbackend()->scrub_supported() && pg->is_active()) {
7645 dout(10) << "sched_scrub scrubbing " << scrub.pgid << " at " << scrub.sched_time
7646 << (pg->scrubber.must_scrub ? ", explicitly requested" :
7647 (load_is_low ? ", load_is_low" : " deadline < now"))
7648 << dendl;
7649 if (pg->sched_scrub()) {
7650 pg->unlock();
7651 break;
7652 }
7653 }
7654 pg->unlock();
7655 } while (service.next_scrub_stamp(scrub, &scrub));
7656 }
7657 dout(20) << "sched_scrub done" << dendl;
7658}
7659
7660
7661
b32b8144
FG
7662vector<OSDHealthMetric> OSD::get_health_metrics()
7663{
7664 vector<OSDHealthMetric> metrics;
7665 lock_guard<mutex> pending_creates_locker{pending_creates_lock};
7666 auto n_primaries = pending_creates_from_mon;
7667 for (const auto& create : pending_creates_from_osd) {
7668 if (create.second) {
7669 n_primaries++;
7670 }
7671 }
7672 metrics.emplace_back(osd_metric::PENDING_CREATING_PGS, n_primaries);
7673 return metrics;
7674}
7675
7c673cae
FG
7676// =====================================================
7677// MAP
7678
7679void OSD::wait_for_new_map(OpRequestRef op)
7680{
7681 // ask?
7682 if (waiting_for_osdmap.empty()) {
7683 osdmap_subscribe(osdmap->get_epoch() + 1, false);
7684 }
7685
7686 logger->inc(l_osd_waiting_for_map);
7687 waiting_for_osdmap.push_back(op);
7688 op->mark_delayed("wait for new map");
7689}
7690
7691
7692/** update_map
7693 * assimilate new OSDMap(s). scan pgs, etc.
7694 */
7695
7696void OSD::note_down_osd(int peer)
7697{
7698 assert(osd_lock.is_locked());
7699 cluster_messenger->mark_down(osdmap->get_cluster_addr(peer));
7700
7701 heartbeat_lock.Lock();
7702 failure_queue.erase(peer);
7703 failure_pending.erase(peer);
7704 map<int,HeartbeatInfo>::iterator p = heartbeat_peers.find(peer);
7705 if (p != heartbeat_peers.end()) {
7706 p->second.con_back->mark_down();
7707 if (p->second.con_front) {
7708 p->second.con_front->mark_down();
7709 }
7710 heartbeat_peers.erase(p);
7711 }
7712 heartbeat_lock.Unlock();
7713}
7714
7715void OSD::note_up_osd(int peer)
7716{
7717 service.forget_peer_epoch(peer, osdmap->get_epoch() - 1);
7718 heartbeat_set_peers_need_update();
7719}
7720
7721struct C_OnMapCommit : public Context {
7722 OSD *osd;
7723 epoch_t first, last;
7724 MOSDMap *msg;
7725 C_OnMapCommit(OSD *o, epoch_t f, epoch_t l, MOSDMap *m)
7726 : osd(o), first(f), last(l), msg(m) {}
7727 void finish(int r) override {
7728 osd->_committed_osd_maps(first, last, msg);
7729 msg->put();
7730 }
7731};
7732
7733struct C_OnMapApply : public Context {
7734 OSDService *service;
7735 list<OSDMapRef> pinned_maps;
7736 epoch_t e;
7737 C_OnMapApply(OSDService *service,
7738 const list<OSDMapRef> &pinned_maps,
7739 epoch_t e)
7740 : service(service), pinned_maps(pinned_maps), e(e) {}
7741 void finish(int r) override {
7742 service->clear_map_bl_cache_pins(e);
7743 }
7744};
7745
7746void OSD::osdmap_subscribe(version_t epoch, bool force_request)
7747{
181888fb
FG
7748 Mutex::Locker l(osdmap_subscribe_lock);
7749 if (latest_subscribed_epoch >= epoch && !force_request)
7c673cae
FG
7750 return;
7751
181888fb
FG
7752 latest_subscribed_epoch = MAX(epoch, latest_subscribed_epoch);
7753
7c673cae
FG
7754 if (monc->sub_want_increment("osdmap", epoch, CEPH_SUBSCRIBE_ONETIME) ||
7755 force_request) {
7756 monc->renew_subs();
7757 }
7758}
7759
7760void OSD::trim_maps(epoch_t oldest, int nreceived, bool skip_maps)
7761{
7762 epoch_t min = std::min(oldest, service.map_cache.cached_key_lower_bound());
7763 if (min <= superblock.oldest_map)
7764 return;
7765
7766 int num = 0;
7767 ObjectStore::Transaction t;
7768 for (epoch_t e = superblock.oldest_map; e < min; ++e) {
7769 dout(20) << " removing old osdmap epoch " << e << dendl;
7770 t.remove(coll_t::meta(), get_osdmap_pobject_name(e));
7771 t.remove(coll_t::meta(), get_inc_osdmap_pobject_name(e));
7772 superblock.oldest_map = e + 1;
7773 num++;
7774 if (num >= cct->_conf->osd_target_transaction_size && num >= nreceived) {
7775 service.publish_superblock(superblock);
7776 write_superblock(t);
7777 int tr = store->queue_transaction(service.meta_osr.get(), std::move(t), nullptr);
7778 assert(tr == 0);
7779 num = 0;
7780 if (!skip_maps) {
7781 // skip_maps leaves us with a range of old maps if we fail to remove all
7782 // of them before moving superblock.oldest_map forward to the first map
7783 // in the incoming MOSDMap msg. so we should continue removing them in
7784 // this case, even we could do huge series of delete transactions all at
7785 // once.
7786 break;
7787 }
7788 }
7789 }
7790 if (num > 0) {
7791 service.publish_superblock(superblock);
7792 write_superblock(t);
224ce89b
WB
7793 int tr = store->queue_transaction(service.meta_osr.get(), std::move(t), nullptr);
7794 assert(tr == 0);
7c673cae
FG
7795 }
7796 // we should not remove the cached maps
7797 assert(min <= service.map_cache.cached_key_lower_bound());
7798}
7799
7800void OSD::handle_osd_map(MOSDMap *m)
7801{
7802 assert(osd_lock.is_locked());
7803 // Keep a ref in the list until we get the newly received map written
7804 // onto disk. This is important because as long as the refs are alive,
7805 // the OSDMaps will be pinned in the cache and we won't try to read it
7806 // off of disk. Otherwise these maps will probably not stay in the cache,
7807 // and reading those OSDMaps before they are actually written can result
7808 // in a crash.
7809 list<OSDMapRef> pinned_maps;
7810 if (m->fsid != monc->get_fsid()) {
7811 dout(0) << "handle_osd_map fsid " << m->fsid << " != "
7812 << monc->get_fsid() << dendl;
7813 m->put();
7814 return;
7815 }
7816 if (is_initializing()) {
7817 dout(0) << "ignoring osdmap until we have initialized" << dendl;
7818 m->put();
7819 return;
7820 }
7821
7822 Session *session = static_cast<Session *>(m->get_connection()->get_priv());
7823 if (session && !(session->entity_name.is_mon() ||
7824 session->entity_name.is_osd())) {
7825 //not enough perms!
7826 dout(10) << "got osd map from Session " << session
7827 << " which we can't take maps from (not a mon or osd)" << dendl;
7828 m->put();
7829 session->put();
7830 return;
7831 }
7832 if (session)
7833 session->put();
7834
7835 // share with the objecter
7836 if (!is_preboot())
7837 service.objecter->handle_osd_map(m);
7838
7839 epoch_t first = m->get_first();
7840 epoch_t last = m->get_last();
7841 dout(3) << "handle_osd_map epochs [" << first << "," << last << "], i have "
7842 << superblock.newest_map
7843 << ", src has [" << m->oldest_map << "," << m->newest_map << "]"
7844 << dendl;
7845
7846 logger->inc(l_osd_map);
7847 logger->inc(l_osd_mape, last - first + 1);
7848 if (first <= superblock.newest_map)
7849 logger->inc(l_osd_mape_dup, superblock.newest_map - first + 1);
7850 if (service.max_oldest_map < m->oldest_map) {
7851 service.max_oldest_map = m->oldest_map;
7852 assert(service.max_oldest_map >= superblock.oldest_map);
7853 }
7854
7855 // make sure there is something new, here, before we bother flushing
7856 // the queues and such
7857 if (last <= superblock.newest_map) {
7858 dout(10) << " no new maps here, dropping" << dendl;
7859 m->put();
7860 return;
7861 }
7862
7863 // missing some?
7864 bool skip_maps = false;
7865 if (first > superblock.newest_map + 1) {
7866 dout(10) << "handle_osd_map message skips epochs "
7867 << superblock.newest_map + 1 << ".." << (first-1) << dendl;
7868 if (m->oldest_map <= superblock.newest_map + 1) {
7869 osdmap_subscribe(superblock.newest_map + 1, false);
7870 m->put();
7871 return;
7872 }
7873 // always try to get the full range of maps--as many as we can. this
7874 // 1- is good to have
7875 // 2- is at present the only way to ensure that we get a *full* map as
7876 // the first map!
7877 if (m->oldest_map < first) {
7878 osdmap_subscribe(m->oldest_map - 1, true);
7879 m->put();
7880 return;
7881 }
7882 skip_maps = true;
7883 }
7884
7885 ObjectStore::Transaction t;
7886 uint64_t txn_size = 0;
7887
7888 // store new maps: queue for disk and put in the osdmap cache
7889 epoch_t start = MAX(superblock.newest_map + 1, first);
7890 for (epoch_t e = start; e <= last; e++) {
7891 if (txn_size >= t.get_num_bytes()) {
7892 derr << __func__ << " transaction size overflowed" << dendl;
7893 assert(txn_size < t.get_num_bytes());
7894 }
7895 txn_size = t.get_num_bytes();
7896 map<epoch_t,bufferlist>::iterator p;
7897 p = m->maps.find(e);
7898 if (p != m->maps.end()) {
7899 dout(10) << "handle_osd_map got full map for epoch " << e << dendl;
7900 OSDMap *o = new OSDMap;
7901 bufferlist& bl = p->second;
7902
7903 o->decode(bl);
7904
7905 ghobject_t fulloid = get_osdmap_pobject_name(e);
7906 t.write(coll_t::meta(), fulloid, 0, bl.length(), bl);
7907 pin_map_bl(e, bl);
7908 pinned_maps.push_back(add_map(o));
7909
7910 got_full_map(e);
7911 continue;
7912 }
7913
7914 p = m->incremental_maps.find(e);
7915 if (p != m->incremental_maps.end()) {
7916 dout(10) << "handle_osd_map got inc map for epoch " << e << dendl;
7917 bufferlist& bl = p->second;
7918 ghobject_t oid = get_inc_osdmap_pobject_name(e);
7919 t.write(coll_t::meta(), oid, 0, bl.length(), bl);
7920 pin_map_inc_bl(e, bl);
7921
7922 OSDMap *o = new OSDMap;
7923 if (e > 1) {
7924 bufferlist obl;
7925 bool got = get_map_bl(e - 1, obl);
7926 assert(got);
7927 o->decode(obl);
7928 }
7929
7930 OSDMap::Incremental inc;
7931 bufferlist::iterator p = bl.begin();
7932 inc.decode(p);
7933 if (o->apply_incremental(inc) < 0) {
7934 derr << "ERROR: bad fsid? i have " << osdmap->get_fsid() << " and inc has " << inc.fsid << dendl;
7935 assert(0 == "bad fsid");
7936 }
7937
7938 bufferlist fbl;
7939 o->encode(fbl, inc.encode_features | CEPH_FEATURE_RESERVED);
7940
7941 bool injected_failure = false;
7942 if (cct->_conf->osd_inject_bad_map_crc_probability > 0 &&
7943 (rand() % 10000) < cct->_conf->osd_inject_bad_map_crc_probability*10000.0) {
7944 derr << __func__ << " injecting map crc failure" << dendl;
7945 injected_failure = true;
7946 }
7947
7948 if ((inc.have_crc && o->get_crc() != inc.full_crc) || injected_failure) {
7949 dout(2) << "got incremental " << e
7950 << " but failed to encode full with correct crc; requesting"
7951 << dendl;
7952 clog->warn() << "failed to encode map e" << e << " with expected crc";
7953 dout(20) << "my encoded map was:\n";
7954 fbl.hexdump(*_dout);
7955 *_dout << dendl;
7956 delete o;
7957 request_full_map(e, last);
7958 last = e - 1;
7959 break;
7960 }
7961 got_full_map(e);
7962
7963 ghobject_t fulloid = get_osdmap_pobject_name(e);
7964 t.write(coll_t::meta(), fulloid, 0, fbl.length(), fbl);
7965 pin_map_bl(e, fbl);
7966 pinned_maps.push_back(add_map(o));
7967 continue;
7968 }
7969
7970 assert(0 == "MOSDMap lied about what maps it had?");
7971 }
7972
7973 // even if this map isn't from a mon, we may have satisfied our subscription
7974 monc->sub_got("osdmap", last);
7975
7976 if (!m->maps.empty() && requested_full_first) {
7977 dout(10) << __func__ << " still missing full maps " << requested_full_first
7978 << ".." << requested_full_last << dendl;
7979 rerequest_full_maps();
7980 }
7981
7c673cae
FG
7982 if (superblock.oldest_map) {
7983 // make sure we at least keep pace with incoming maps
7984 trim_maps(m->oldest_map, last - first + 1, skip_maps);
7985 }
7986
7987 if (!superblock.oldest_map || skip_maps)
7988 superblock.oldest_map = first;
7989 superblock.newest_map = last;
7990 superblock.current_epoch = last;
7991
7992 // note in the superblock that we were clean thru the prior epoch
7993 epoch_t boot_epoch = service.get_boot_epoch();
7994 if (boot_epoch && boot_epoch >= superblock.mounted) {
7995 superblock.mounted = boot_epoch;
7996 superblock.clean_thru = last;
7997 }
7998
7999 // superblock and commit
8000 write_superblock(t);
8001 store->queue_transaction(
8002 service.meta_osr.get(),
8003 std::move(t),
8004 new C_OnMapApply(&service, pinned_maps, last),
8005 new C_OnMapCommit(this, start, last, m), 0);
8006 service.publish_superblock(superblock);
8007}
8008
8009void OSD::_committed_osd_maps(epoch_t first, epoch_t last, MOSDMap *m)
8010{
8011 dout(10) << __func__ << " " << first << ".." << last << dendl;
8012 if (is_stopping()) {
8013 dout(10) << __func__ << " bailing, we are shutting down" << dendl;
8014 return;
8015 }
8016 Mutex::Locker l(osd_lock);
31f18b77
FG
8017 if (is_stopping()) {
8018 dout(10) << __func__ << " bailing, we are shutting down" << dendl;
8019 return;
8020 }
7c673cae
FG
8021 map_lock.get_write();
8022
8023 bool do_shutdown = false;
8024 bool do_restart = false;
8025 bool network_error = false;
8026
8027 // advance through the new maps
8028 for (epoch_t cur = first; cur <= last; cur++) {
8029 dout(10) << " advance to epoch " << cur
8030 << " (<= last " << last
8031 << " <= newest_map " << superblock.newest_map
8032 << ")" << dendl;
8033
8034 OSDMapRef newmap = get_map(cur);
8035 assert(newmap); // we just cached it above!
8036
8037 // start blacklisting messages sent to peers that go down.
8038 service.pre_publish_map(newmap);
8039
8040 // kill connections to newly down osds
8041 bool waited_for_reservations = false;
8042 set<int> old;
8043 osdmap->get_all_osds(old);
8044 for (set<int>::iterator p = old.begin(); p != old.end(); ++p) {
8045 if (*p != whoami &&
8046 osdmap->is_up(*p) && // in old map
8047 newmap->is_down(*p)) { // but not the new one
8048 if (!waited_for_reservations) {
8049 service.await_reserved_maps();
8050 waited_for_reservations = true;
8051 }
8052 note_down_osd(*p);
8053 } else if (*p != whoami &&
8054 osdmap->is_down(*p) &&
8055 newmap->is_up(*p)) {
8056 note_up_osd(*p);
8057 }
8058 }
8059
31f18b77
FG
8060 if ((osdmap->test_flag(CEPH_OSDMAP_NOUP) !=
8061 newmap->test_flag(CEPH_OSDMAP_NOUP)) ||
8062 (osdmap->is_noup(whoami) != newmap->is_noup(whoami))) {
7c673cae
FG
8063 dout(10) << __func__ << " NOUP flag changed in " << newmap->get_epoch()
8064 << dendl;
8065 if (is_booting()) {
8066 // this captures the case where we sent the boot message while
8067 // NOUP was being set on the mon and our boot request was
8068 // dropped, and then later it is cleared. it imperfectly
8069 // handles the case where our original boot message was not
8070 // dropped and we restart even though we might have booted, but
8071 // that is harmless (boot will just take slightly longer).
8072 do_restart = true;
8073 }
8074 }
31f18b77
FG
8075 if (osdmap->require_osd_release < CEPH_RELEASE_LUMINOUS &&
8076 newmap->require_osd_release >= CEPH_RELEASE_LUMINOUS) {
8077 dout(10) << __func__ << " require_osd_release reached luminous in "
8078 << newmap->get_epoch() << dendl;
8079 clear_pg_stat_queue();
224ce89b 8080 clear_outstanding_pg_stats();
31f18b77 8081 }
7c673cae
FG
8082
8083 osdmap = newmap;
8084 epoch_t up_epoch;
8085 epoch_t boot_epoch;
8086 service.retrieve_epochs(&boot_epoch, &up_epoch, NULL);
8087 if (!up_epoch &&
8088 osdmap->is_up(whoami) &&
8089 osdmap->get_inst(whoami) == client_messenger->get_myinst()) {
8090 up_epoch = osdmap->get_epoch();
8091 dout(10) << "up_epoch is " << up_epoch << dendl;
8092 if (!boot_epoch) {
8093 boot_epoch = osdmap->get_epoch();
8094 dout(10) << "boot_epoch is " << boot_epoch << dendl;
8095 }
8096 service.set_epochs(&boot_epoch, &up_epoch, NULL);
8097 }
8098 }
8099
8100 had_map_since = ceph_clock_now();
8101
8102 epoch_t _bind_epoch = service.get_bind_epoch();
8103 if (osdmap->is_up(whoami) &&
8104 osdmap->get_addr(whoami) == client_messenger->get_myaddr() &&
8105 _bind_epoch < osdmap->get_up_from(whoami)) {
8106
8107 if (is_booting()) {
8108 dout(1) << "state: booting -> active" << dendl;
8109 set_state(STATE_ACTIVE);
8110
8111 // set incarnation so that osd_reqid_t's we generate for our
8112 // objecter requests are unique across restarts.
8113 service.objecter->set_client_incarnation(osdmap->get_epoch());
8114 }
8115 }
8116
8117 if (osdmap->get_epoch() > 0 &&
8118 is_active()) {
8119 if (!osdmap->exists(whoami)) {
8120 dout(0) << "map says i do not exist. shutting down." << dendl;
8121 do_shutdown = true; // don't call shutdown() while we have
8122 // everything paused
8123 } else if (!osdmap->is_up(whoami) ||
8124 !osdmap->get_addr(whoami).probably_equals(
8125 client_messenger->get_myaddr()) ||
8126 !osdmap->get_cluster_addr(whoami).probably_equals(
8127 cluster_messenger->get_myaddr()) ||
8128 !osdmap->get_hb_back_addr(whoami).probably_equals(
8129 hb_back_server_messenger->get_myaddr()) ||
8130 (osdmap->get_hb_front_addr(whoami) != entity_addr_t() &&
8131 !osdmap->get_hb_front_addr(whoami).probably_equals(
8132 hb_front_server_messenger->get_myaddr()))) {
8133 if (!osdmap->is_up(whoami)) {
8134 if (service.is_preparing_to_stop() || service.is_stopping()) {
8135 service.got_stop_ack();
8136 } else {
c07f9fc5
FG
8137 clog->warn() << "Monitor daemon marked osd." << whoami << " down, "
8138 "but it is still running";
8139 clog->debug() << "map e" << osdmap->get_epoch()
8140 << " wrongly marked me down at e"
8141 << osdmap->get_down_at(whoami);
7c673cae
FG
8142 }
8143 } else if (!osdmap->get_addr(whoami).probably_equals(
8144 client_messenger->get_myaddr())) {
8145 clog->error() << "map e" << osdmap->get_epoch()
8146 << " had wrong client addr (" << osdmap->get_addr(whoami)
8147 << " != my " << client_messenger->get_myaddr() << ")";
8148 } else if (!osdmap->get_cluster_addr(whoami).probably_equals(
8149 cluster_messenger->get_myaddr())) {
8150 clog->error() << "map e" << osdmap->get_epoch()
8151 << " had wrong cluster addr ("
8152 << osdmap->get_cluster_addr(whoami)
8153 << " != my " << cluster_messenger->get_myaddr() << ")";
8154 } else if (!osdmap->get_hb_back_addr(whoami).probably_equals(
8155 hb_back_server_messenger->get_myaddr())) {
8156 clog->error() << "map e" << osdmap->get_epoch()
c07f9fc5 8157 << " had wrong heartbeat back addr ("
7c673cae
FG
8158 << osdmap->get_hb_back_addr(whoami)
8159 << " != my " << hb_back_server_messenger->get_myaddr()
8160 << ")";
8161 } else if (osdmap->get_hb_front_addr(whoami) != entity_addr_t() &&
8162 !osdmap->get_hb_front_addr(whoami).probably_equals(
8163 hb_front_server_messenger->get_myaddr())) {
8164 clog->error() << "map e" << osdmap->get_epoch()
c07f9fc5 8165 << " had wrong heartbeat front addr ("
7c673cae
FG
8166 << osdmap->get_hb_front_addr(whoami)
8167 << " != my " << hb_front_server_messenger->get_myaddr()
8168 << ")";
8169 }
8170
8171 if (!service.is_stopping()) {
8172 epoch_t up_epoch = 0;
8173 epoch_t bind_epoch = osdmap->get_epoch();
8174 service.set_epochs(NULL,&up_epoch, &bind_epoch);
8175 do_restart = true;
8176
8177 //add markdown log
8178 utime_t now = ceph_clock_now();
8179 utime_t grace = utime_t(cct->_conf->osd_max_markdown_period, 0);
8180 osd_markdown_log.push_back(now);
8181 //clear all out-of-date log
8182 while (!osd_markdown_log.empty() &&
8183 osd_markdown_log.front() + grace < now)
8184 osd_markdown_log.pop_front();
8185 if ((int)osd_markdown_log.size() > cct->_conf->osd_max_markdown_count) {
8186 dout(0) << __func__ << " marked down "
8187 << osd_markdown_log.size()
8188 << " > osd_max_markdown_count "
8189 << cct->_conf->osd_max_markdown_count
8190 << " in last " << grace << " seconds, shutting down"
8191 << dendl;
8192 do_restart = false;
8193 do_shutdown = true;
8194 }
8195
8196 start_waiting_for_healthy();
8197
8198 set<int> avoid_ports;
8199#if defined(__FreeBSD__)
8200 // prevent FreeBSD from grabbing the client_messenger port during
8201 // rebinding. In which case a cluster_meesneger will connect also
8202 // to the same port
8203 avoid_ports.insert(client_messenger->get_myaddr().get_port());
8204#endif
8205 avoid_ports.insert(cluster_messenger->get_myaddr().get_port());
8206 avoid_ports.insert(hb_back_server_messenger->get_myaddr().get_port());
8207 avoid_ports.insert(hb_front_server_messenger->get_myaddr().get_port());
8208
8209 int r = cluster_messenger->rebind(avoid_ports);
8210 if (r != 0) {
8211 do_shutdown = true; // FIXME: do_restart?
8212 network_error = true;
8213 dout(0) << __func__ << " marked down:"
8214 << " rebind cluster_messenger failed" << dendl;
8215 }
8216
8217 r = hb_back_server_messenger->rebind(avoid_ports);
8218 if (r != 0) {
8219 do_shutdown = true; // FIXME: do_restart?
8220 network_error = true;
8221 dout(0) << __func__ << " marked down:"
8222 << " rebind hb_back_server_messenger failed" << dendl;
8223 }
8224
8225 r = hb_front_server_messenger->rebind(avoid_ports);
8226 if (r != 0) {
8227 do_shutdown = true; // FIXME: do_restart?
8228 network_error = true;
8229 dout(0) << __func__ << " marked down:"
8230 << " rebind hb_front_server_messenger failed" << dendl;
8231 }
8232
8233 hb_front_client_messenger->mark_down_all();
8234 hb_back_client_messenger->mark_down_all();
8235
8236 reset_heartbeat_peers();
8237 }
8238 }
8239 }
8240
8241 map_lock.put_write();
8242
8243 check_osdmap_features(store);
8244
8245 // yay!
8246 consume_map();
8247
8248 if (is_active() || is_waiting_for_healthy())
8249 maybe_update_heartbeat_peers();
8250
8251 if (!is_active()) {
8252 dout(10) << " not yet active; waiting for peering wq to drain" << dendl;
8253 peering_wq.drain();
8254 } else {
8255 activate_map();
8256 }
8257
31f18b77 8258 if (do_shutdown) {
7c673cae
FG
8259 if (network_error) {
8260 Mutex::Locker l(heartbeat_lock);
8261 map<int,pair<utime_t,entity_inst_t>>::iterator it =
8262 failure_pending.begin();
8263 while (it != failure_pending.end()) {
8264 dout(10) << "handle_osd_ping canceling in-flight failure report for osd."
8265 << it->first << dendl;
8266 send_still_alive(osdmap->get_epoch(), it->second.second);
8267 failure_pending.erase(it++);
8268 }
8269 }
8270 // trigger shutdown in a different thread
8271 dout(0) << __func__ << " shutdown OSD via async signal" << dendl;
8272 queue_async_signal(SIGINT);
8273 }
31f18b77
FG
8274 else if (m->newest_map && m->newest_map > last) {
8275 dout(10) << " msg say newest map is " << m->newest_map
8276 << ", requesting more" << dendl;
8277 osdmap_subscribe(osdmap->get_epoch()+1, false);
8278 }
7c673cae
FG
8279 else if (is_preboot()) {
8280 if (m->get_source().is_mon())
8281 _preboot(m->oldest_map, m->newest_map);
8282 else
8283 start_boot();
8284 }
8285 else if (do_restart)
8286 start_boot();
8287
8288}
8289
8290void OSD::check_osdmap_features(ObjectStore *fs)
8291{
8292 // adjust required feature bits?
8293
8294 // we have to be a bit careful here, because we are accessing the
8295 // Policy structures without taking any lock. in particular, only
8296 // modify integer values that can safely be read by a racing CPU.
8297 // since we are only accessing existing Policy structures a their
8298 // current memory location, and setting or clearing bits in integer
8299 // fields, and we are the only writer, this is not a problem.
8300
8301 {
8302 Messenger::Policy p = client_messenger->get_default_policy();
8303 uint64_t mask;
8304 uint64_t features = osdmap->get_features(entity_name_t::TYPE_CLIENT, &mask);
8305 if ((p.features_required & mask) != features) {
8306 dout(0) << "crush map has features " << features
8307 << ", adjusting msgr requires for clients" << dendl;
8308 p.features_required = (p.features_required & ~mask) | features;
8309 client_messenger->set_default_policy(p);
8310 }
8311 }
8312 {
8313 Messenger::Policy p = client_messenger->get_policy(entity_name_t::TYPE_MON);
8314 uint64_t mask;
8315 uint64_t features = osdmap->get_features(entity_name_t::TYPE_MON, &mask);
8316 if ((p.features_required & mask) != features) {
8317 dout(0) << "crush map has features " << features
8318 << " was " << p.features_required
8319 << ", adjusting msgr requires for mons" << dendl;
8320 p.features_required = (p.features_required & ~mask) | features;
8321 client_messenger->set_policy(entity_name_t::TYPE_MON, p);
8322 }
8323 }
8324 {
8325 Messenger::Policy p = cluster_messenger->get_policy(entity_name_t::TYPE_OSD);
8326 uint64_t mask;
8327 uint64_t features = osdmap->get_features(entity_name_t::TYPE_OSD, &mask);
8328
8329 if ((p.features_required & mask) != features) {
8330 dout(0) << "crush map has features " << features
8331 << ", adjusting msgr requires for osds" << dendl;
8332 p.features_required = (p.features_required & ~mask) | features;
8333 cluster_messenger->set_policy(entity_name_t::TYPE_OSD, p);
8334 }
8335
8336 if ((features & CEPH_FEATURE_OSD_ERASURE_CODES) &&
8337 !superblock.compat_features.incompat.contains(CEPH_OSD_FEATURE_INCOMPAT_SHARDS)) {
8338 dout(0) << __func__ << " enabling on-disk ERASURE CODES compat feature" << dendl;
8339 superblock.compat_features.incompat.insert(CEPH_OSD_FEATURE_INCOMPAT_SHARDS);
8340 ObjectStore::Transaction t;
8341 write_superblock(t);
8342 int err = store->queue_transaction(service.meta_osr.get(), std::move(t), NULL);
8343 assert(err == 0);
8344 }
8345 }
8346}
8347
8348bool OSD::advance_pg(
8349 epoch_t osd_epoch, PG *pg,
8350 ThreadPool::TPHandle &handle,
8351 PG::RecoveryCtx *rctx,
31f18b77 8352 set<PGRef> *new_pgs)
7c673cae
FG
8353{
8354 assert(pg->is_locked());
8355 epoch_t next_epoch = pg->get_osdmap()->get_epoch() + 1;
8356 OSDMapRef lastmap = pg->get_osdmap();
8357
8358 if (lastmap->get_epoch() == osd_epoch)
8359 return true;
8360 assert(lastmap->get_epoch() < osd_epoch);
8361
8362 epoch_t min_epoch = service.get_min_pg_epoch();
8363 epoch_t max;
8364 if (min_epoch) {
8365 max = min_epoch + cct->_conf->osd_map_max_advance;
8366 } else {
8367 max = next_epoch + cct->_conf->osd_map_max_advance;
8368 }
8369
8370 for (;
8371 next_epoch <= osd_epoch && next_epoch <= max;
8372 ++next_epoch) {
8373 OSDMapRef nextmap = service.try_get_map(next_epoch);
8374 if (!nextmap) {
8375 dout(20) << __func__ << " missing map " << next_epoch << dendl;
8376 // make sure max is bumped up so that we can get past any
8377 // gap in maps
8378 max = MAX(max, next_epoch + cct->_conf->osd_map_max_advance);
8379 continue;
8380 }
8381
8382 vector<int> newup, newacting;
8383 int up_primary, acting_primary;
8384 nextmap->pg_to_up_acting_osds(
8385 pg->info.pgid.pgid,
8386 &newup, &up_primary,
8387 &newacting, &acting_primary);
8388 pg->handle_advance_map(
8389 nextmap, lastmap, newup, up_primary,
8390 newacting, acting_primary, rctx);
8391
8392 // Check for split!
8393 set<spg_t> children;
8394 spg_t parent(pg->info.pgid);
8395 if (parent.is_split(
8396 lastmap->get_pg_num(pg->pool.id),
8397 nextmap->get_pg_num(pg->pool.id),
8398 &children)) {
8399 service.mark_split_in_progress(pg->info.pgid, children);
8400 split_pgs(
8401 pg, children, new_pgs, lastmap, nextmap,
8402 rctx);
8403 }
8404
8405 lastmap = nextmap;
8406 handle.reset_tp_timeout();
8407 }
8408 service.pg_update_epoch(pg->info.pgid, lastmap->get_epoch());
8409 pg->handle_activate_map(rctx);
8410 if (next_epoch <= osd_epoch) {
8411 dout(10) << __func__ << " advanced to max " << max
8412 << " past min epoch " << min_epoch
8413 << " ... will requeue " << *pg << dendl;
8414 return false;
8415 }
8416 return true;
8417}
8418
8419void OSD::consume_map()
8420{
8421 assert(osd_lock.is_locked());
8422 dout(7) << "consume_map version " << osdmap->get_epoch() << dendl;
8423
3efd9988
FG
8424 /** make sure the cluster is speaking in SORTBITWISE, because we don't
8425 * speak the older sorting version any more. Be careful not to force
8426 * a shutdown if we are merely processing old maps, though.
8427 */
8428 if (!osdmap->test_flag(CEPH_OSDMAP_SORTBITWISE) && is_active()) {
8429 derr << __func__ << " SORTBITWISE flag is not set" << dendl;
8430 ceph_abort();
8431 }
8432
7c673cae
FG
8433 int num_pg_primary = 0, num_pg_replica = 0, num_pg_stray = 0;
8434 list<PGRef> to_remove;
8435
8436 // scan pg's
8437 {
8438 RWLock::RLocker l(pg_map_lock);
8439 for (ceph::unordered_map<spg_t,PG*>::iterator it = pg_map.begin();
8440 it != pg_map.end();
8441 ++it) {
8442 PG *pg = it->second;
8443 pg->lock();
8444 if (pg->is_primary())
8445 num_pg_primary++;
8446 else if (pg->is_replica())
8447 num_pg_replica++;
8448 else
8449 num_pg_stray++;
8450
8451 if (!osdmap->have_pg_pool(pg->info.pgid.pool())) {
8452 //pool is deleted!
8453 to_remove.push_back(PGRef(pg));
8454 } else {
8455 service.init_splits_between(it->first, service.get_osdmap(), osdmap);
8456 }
8457
8458 pg->unlock();
8459 }
3efd9988
FG
8460
8461 lock_guard<mutex> pending_creates_locker{pending_creates_lock};
8462 for (auto pg = pending_creates_from_osd.cbegin();
8463 pg != pending_creates_from_osd.cend();) {
b32b8144 8464 if (osdmap->get_pg_acting_rank(pg->first, whoami) < 0) {
3efd9988
FG
8465 pg = pending_creates_from_osd.erase(pg);
8466 } else {
8467 ++pg;
8468 }
8469 }
7c673cae
FG
8470 }
8471
8472 for (list<PGRef>::iterator i = to_remove.begin();
8473 i != to_remove.end();
8474 to_remove.erase(i++)) {
8475 RWLock::WLocker locker(pg_map_lock);
8476 (*i)->lock();
8477 _remove_pg(&**i);
8478 (*i)->unlock();
8479 }
8480
8481 service.expand_pg_num(service.get_osdmap(), osdmap);
8482
8483 service.pre_publish_map(osdmap);
8484 service.await_reserved_maps();
8485 service.publish_map(osdmap);
8486
8487 service.maybe_inject_dispatch_delay();
8488
8489 dispatch_sessions_waiting_on_map();
8490
8491 service.maybe_inject_dispatch_delay();
8492
8493 // remove any PGs which we no longer host from the session waiting_for_pg lists
8494 dout(20) << __func__ << " checking waiting_for_pg" << dendl;
8495 op_shardedwq.prune_pg_waiters(osdmap, whoami);
8496
8497 service.maybe_inject_dispatch_delay();
8498
8499 // scan pg's
8500 {
8501 RWLock::RLocker l(pg_map_lock);
8502 for (ceph::unordered_map<spg_t,PG*>::iterator it = pg_map.begin();
8503 it != pg_map.end();
8504 ++it) {
8505 PG *pg = it->second;
8506 pg->lock();
8507 pg->queue_null(osdmap->get_epoch(), osdmap->get_epoch());
8508 pg->unlock();
8509 }
8510
8511 logger->set(l_osd_pg, pg_map.size());
8512 }
8513 logger->set(l_osd_pg_primary, num_pg_primary);
8514 logger->set(l_osd_pg_replica, num_pg_replica);
8515 logger->set(l_osd_pg_stray, num_pg_stray);
94b18763 8516 logger->set(l_osd_pg_removing, remove_wq.get_remove_queue_len());
7c673cae
FG
8517}
8518
8519void OSD::activate_map()
8520{
8521 assert(osd_lock.is_locked());
8522
8523 dout(7) << "activate_map version " << osdmap->get_epoch() << dendl;
8524
7c673cae
FG
8525 if (osdmap->test_flag(CEPH_OSDMAP_FULL)) {
8526 dout(10) << " osdmap flagged full, doing onetime osdmap subscribe" << dendl;
8527 osdmap_subscribe(osdmap->get_epoch() + 1, false);
8528 }
8529
8530 // norecover?
8531 if (osdmap->test_flag(CEPH_OSDMAP_NORECOVER)) {
8532 if (!service.recovery_is_paused()) {
8533 dout(1) << "pausing recovery (NORECOVER flag set)" << dendl;
8534 service.pause_recovery();
8535 }
8536 } else {
8537 if (service.recovery_is_paused()) {
8538 dout(1) << "unpausing recovery (NORECOVER flag unset)" << dendl;
8539 service.unpause_recovery();
8540 }
8541 }
8542
8543 service.activate_map();
8544
8545 // process waiters
8546 take_waiters(waiting_for_osdmap);
8547}
8548
8549bool OSD::require_mon_peer(const Message *m)
8550{
8551 if (!m->get_connection()->peer_is_mon()) {
8552 dout(0) << "require_mon_peer received from non-mon "
8553 << m->get_connection()->get_peer_addr()
8554 << " " << *m << dendl;
8555 return false;
8556 }
8557 return true;
8558}
8559
8560bool OSD::require_mon_or_mgr_peer(const Message *m)
8561{
8562 if (!m->get_connection()->peer_is_mon() &&
8563 !m->get_connection()->peer_is_mgr()) {
8564 dout(0) << "require_mon_or_mgr_peer received from non-mon, non-mgr "
8565 << m->get_connection()->get_peer_addr()
8566 << " " << *m << dendl;
8567 return false;
8568 }
8569 return true;
8570}
8571
8572bool OSD::require_osd_peer(const Message *m)
8573{
8574 if (!m->get_connection()->peer_is_osd()) {
8575 dout(0) << "require_osd_peer received from non-osd "
8576 << m->get_connection()->get_peer_addr()
8577 << " " << *m << dendl;
8578 return false;
8579 }
8580 return true;
8581}
8582
8583bool OSD::require_self_aliveness(const Message *m, epoch_t epoch)
8584{
8585 epoch_t up_epoch = service.get_up_epoch();
8586 if (epoch < up_epoch) {
8587 dout(7) << "from pre-up epoch " << epoch << " < " << up_epoch << dendl;
8588 return false;
8589 }
8590
8591 if (!is_active()) {
8592 dout(7) << "still in boot state, dropping message " << *m << dendl;
8593 return false;
8594 }
8595
8596 return true;
8597}
8598
8599bool OSD::require_same_peer_instance(const Message *m, OSDMapRef& map,
8600 bool is_fast_dispatch)
8601{
8602 int from = m->get_source().num();
8603
8604 if (map->is_down(from) ||
8605 (map->get_cluster_addr(from) != m->get_source_inst().addr)) {
8606 dout(5) << "from dead osd." << from << ", marking down, "
8607 << " msg was " << m->get_source_inst().addr
8608 << " expected " << (map->is_up(from) ?
8609 map->get_cluster_addr(from) : entity_addr_t())
8610 << dendl;
8611 ConnectionRef con = m->get_connection();
8612 con->mark_down();
8613 Session *s = static_cast<Session*>(con->get_priv());
8614 if (s) {
8615 if (!is_fast_dispatch)
8616 s->session_dispatch_lock.Lock();
8617 clear_session_waiting_on_map(s);
8618 con->set_priv(NULL); // break ref <-> session cycle, if any
8619 if (!is_fast_dispatch)
8620 s->session_dispatch_lock.Unlock();
8621 s->put();
8622 }
8623 return false;
8624 }
8625 return true;
8626}
8627
8628
8629/*
8630 * require that we have same (or newer) map, and that
8631 * the source is the pg primary.
8632 */
8633bool OSD::require_same_or_newer_map(OpRequestRef& op, epoch_t epoch,
8634 bool is_fast_dispatch)
8635{
8636 const Message *m = op->get_req();
8637 dout(15) << "require_same_or_newer_map " << epoch
8638 << " (i am " << osdmap->get_epoch() << ") " << m << dendl;
8639
8640 assert(osd_lock.is_locked());
8641
8642 // do they have a newer map?
8643 if (epoch > osdmap->get_epoch()) {
8644 dout(7) << "waiting for newer map epoch " << epoch
8645 << " > my " << osdmap->get_epoch() << " with " << m << dendl;
8646 wait_for_new_map(op);
8647 return false;
8648 }
8649
8650 if (!require_self_aliveness(op->get_req(), epoch)) {
8651 return false;
8652 }
8653
8654 // ok, our map is same or newer.. do they still exist?
8655 if (m->get_connection()->get_messenger() == cluster_messenger &&
8656 !require_same_peer_instance(op->get_req(), osdmap, is_fast_dispatch)) {
8657 return false;
8658 }
8659
8660 return true;
8661}
8662
8663
8664
8665
8666
8667// ----------------------------------------
8668// pg creation
8669
8670void OSD::split_pgs(
8671 PG *parent,
31f18b77 8672 const set<spg_t> &childpgids, set<PGRef> *out_pgs,
7c673cae
FG
8673 OSDMapRef curmap,
8674 OSDMapRef nextmap,
8675 PG::RecoveryCtx *rctx)
8676{
8677 unsigned pg_num = nextmap->get_pg_num(
8678 parent->pool.id);
8679 parent->update_snap_mapper_bits(
8680 parent->info.pgid.get_split_bits(pg_num)
8681 );
8682
8683 vector<object_stat_sum_t> updated_stats(childpgids.size() + 1);
8684 parent->info.stats.stats.sum.split(updated_stats);
8685
8686 vector<object_stat_sum_t>::iterator stat_iter = updated_stats.begin();
8687 for (set<spg_t>::const_iterator i = childpgids.begin();
8688 i != childpgids.end();
8689 ++i, ++stat_iter) {
8690 assert(stat_iter != updated_stats.end());
8691 dout(10) << "Splitting " << *parent << " into " << *i << dendl;
8692 assert(service.splitting(*i));
8693 PG* child = _make_pg(nextmap, *i);
8694 child->lock(true);
8695 out_pgs->insert(child);
8696 rctx->created_pgs.insert(child);
8697
8698 unsigned split_bits = i->get_split_bits(pg_num);
8699 dout(10) << "pg_num is " << pg_num << dendl;
8700 dout(10) << "m_seed " << i->ps() << dendl;
8701 dout(10) << "split_bits is " << split_bits << dendl;
8702
8703 parent->split_colls(
8704 *i,
8705 split_bits,
8706 i->ps(),
8707 &child->pool.info,
8708 rctx->transaction);
8709 parent->split_into(
8710 i->pgid,
8711 child,
8712 split_bits);
8713 child->info.stats.stats.sum = *stat_iter;
8714
8715 child->write_if_dirty(*(rctx->transaction));
8716 child->unlock();
8717 }
8718 assert(stat_iter != updated_stats.end());
8719 parent->info.stats.stats.sum = *stat_iter;
8720 parent->write_if_dirty(*(rctx->transaction));
8721}
8722
8723/*
8724 * holding osd_lock
8725 */
8726void OSD::handle_pg_create(OpRequestRef op)
8727{
8728 const MOSDPGCreate *m = static_cast<const MOSDPGCreate*>(op->get_req());
8729 assert(m->get_type() == MSG_OSD_PG_CREATE);
8730
8731 dout(10) << "handle_pg_create " << *m << dendl;
8732
8733 if (!require_mon_peer(op->get_req())) {
8734 return;
8735 }
8736
8737 if (!require_same_or_newer_map(op, m->epoch, false))
8738 return;
8739
8740 op->mark_started();
8741
8742 map<pg_t,utime_t>::const_iterator ci = m->ctimes.begin();
8743 for (map<pg_t,pg_create_t>::const_iterator p = m->mkpg.begin();
8744 p != m->mkpg.end();
8745 ++p, ++ci) {
8746 assert(ci != m->ctimes.end() && ci->first == p->first);
8747 epoch_t created = p->second.created;
8748 if (p->second.split_bits) // Skip split pgs
8749 continue;
8750 pg_t on = p->first;
8751
8752 if (on.preferred() >= 0) {
8753 dout(20) << "ignoring localized pg " << on << dendl;
8754 continue;
8755 }
8756
8757 if (!osdmap->have_pg_pool(on.pool())) {
8758 dout(20) << "ignoring pg on deleted pool " << on << dendl;
8759 continue;
8760 }
8761
8762 dout(20) << "mkpg " << on << " e" << created << "@" << ci->second << dendl;
8763
8764 // is it still ours?
8765 vector<int> up, acting;
8766 int up_primary = -1;
8767 int acting_primary = -1;
8768 osdmap->pg_to_up_acting_osds(on, &up, &up_primary, &acting, &acting_primary);
8769 int role = osdmap->calc_pg_role(whoami, acting, acting.size());
8770
8771 if (acting_primary != whoami) {
8772 dout(10) << "mkpg " << on << " not acting_primary (" << acting_primary
8773 << "), my role=" << role << ", skipping" << dendl;
8774 continue;
8775 }
8776
8777 spg_t pgid;
8778 bool mapped = osdmap->get_primary_shard(on, &pgid);
8779 assert(mapped);
8780
8781 PastIntervals pi(
8782 osdmap->get_pools().at(pgid.pool()).ec_pool(),
8783 *osdmap);
8784 pg_history_t history;
8785 build_initial_pg_history(pgid, created, ci->second, &history, &pi);
8786
8787 // The mon won't resend unless the primary changed, so
8788 // we ignore same_interval_since. We'll pass this history
8789 // to handle_pg_peering_evt with the current epoch as the
8790 // event -- the project_pg_history check in
8791 // handle_pg_peering_evt will be a noop.
8792 if (history.same_primary_since > m->epoch) {
8793 dout(10) << __func__ << ": got obsolete pg create on pgid "
8794 << pgid << " from epoch " << m->epoch
8795 << ", primary changed in " << history.same_primary_since
8796 << dendl;
8797 continue;
8798 }
7c673cae
FG
8799 if (handle_pg_peering_evt(
8800 pgid,
8801 history,
8802 pi,
8803 osdmap->get_epoch(),
8804 PG::CephPeeringEvtRef(
8805 new PG::CephPeeringEvt(
8806 osdmap->get_epoch(),
8807 osdmap->get_epoch(),
8808 PG::NullEvt()))
8809 ) == -EEXIST) {
8810 service.send_pg_created(pgid.pgid);
8811 }
8812 }
7c673cae 8813
3efd9988
FG
8814 {
8815 lock_guard<mutex> pending_creates_locker{pending_creates_lock};
8816 if (pending_creates_from_mon == 0) {
8817 last_pg_create_epoch = m->epoch;
8818 }
8819 }
7c673cae
FG
8820 maybe_update_heartbeat_peers();
8821}
8822
8823
8824// ----------------------------------------
8825// peering and recovery
8826
8827PG::RecoveryCtx OSD::create_context()
8828{
8829 ObjectStore::Transaction *t = new ObjectStore::Transaction;
8830 C_Contexts *on_applied = new C_Contexts(cct);
8831 C_Contexts *on_safe = new C_Contexts(cct);
8832 map<int, map<spg_t,pg_query_t> > *query_map =
8833 new map<int, map<spg_t, pg_query_t> >;
8834 map<int,vector<pair<pg_notify_t, PastIntervals> > > *notify_list =
8835 new map<int, vector<pair<pg_notify_t, PastIntervals> > >;
8836 map<int,vector<pair<pg_notify_t, PastIntervals> > > *info_map =
8837 new map<int,vector<pair<pg_notify_t, PastIntervals> > >;
8838 PG::RecoveryCtx rctx(query_map, info_map, notify_list,
8839 on_applied, on_safe, t);
8840 return rctx;
8841}
8842
8843struct C_OpenPGs : public Context {
8844 set<PGRef> pgs;
8845 ObjectStore *store;
8846 OSD *osd;
8847 C_OpenPGs(set<PGRef>& p, ObjectStore *s, OSD* o) : store(s), osd(o) {
8848 pgs.swap(p);
8849 }
8850 void finish(int r) override {
8851 RWLock::RLocker l(osd->pg_map_lock);
8852 for (auto p : pgs) {
8853 if (osd->pg_map.count(p->info.pgid)) {
8854 p->ch = store->open_collection(p->coll);
8855 assert(p->ch);
8856 }
8857 }
8858 }
8859};
8860
8861void OSD::dispatch_context_transaction(PG::RecoveryCtx &ctx, PG *pg,
8862 ThreadPool::TPHandle *handle)
8863{
8864 if (!ctx.transaction->empty()) {
8865 if (!ctx.created_pgs.empty()) {
8866 ctx.on_applied->add(new C_OpenPGs(ctx.created_pgs, store, this));
8867 }
8868 int tr = store->queue_transaction(
8869 pg->osr.get(),
8870 std::move(*ctx.transaction), ctx.on_applied, ctx.on_safe, NULL,
8871 TrackedOpRef(), handle);
8872 delete (ctx.transaction);
8873 assert(tr == 0);
8874 ctx.transaction = new ObjectStore::Transaction;
8875 ctx.on_applied = new C_Contexts(cct);
8876 ctx.on_safe = new C_Contexts(cct);
8877 }
8878}
8879
8880void OSD::dispatch_context(PG::RecoveryCtx &ctx, PG *pg, OSDMapRef curmap,
8881 ThreadPool::TPHandle *handle)
8882{
8883 if (service.get_osdmap()->is_up(whoami) &&
8884 is_active()) {
8885 do_notifies(*ctx.notify_list, curmap);
8886 do_queries(*ctx.query_map, curmap);
8887 do_infos(*ctx.info_map, curmap);
8888 }
8889 delete ctx.notify_list;
8890 delete ctx.query_map;
8891 delete ctx.info_map;
8892 if ((ctx.on_applied->empty() &&
8893 ctx.on_safe->empty() &&
8894 ctx.transaction->empty() &&
8895 ctx.created_pgs.empty()) || !pg) {
8896 delete ctx.transaction;
8897 delete ctx.on_applied;
8898 delete ctx.on_safe;
8899 assert(ctx.created_pgs.empty());
8900 } else {
8901 if (!ctx.created_pgs.empty()) {
8902 ctx.on_applied->add(new C_OpenPGs(ctx.created_pgs, store, this));
8903 }
8904 int tr = store->queue_transaction(
8905 pg->osr.get(),
8906 std::move(*ctx.transaction), ctx.on_applied, ctx.on_safe, NULL, TrackedOpRef(),
8907 handle);
8908 delete (ctx.transaction);
8909 assert(tr == 0);
8910 }
8911}
8912
8913/** do_notifies
8914 * Send an MOSDPGNotify to a primary, with a list of PGs that I have
8915 * content for, and they are primary for.
8916 */
8917
8918void OSD::do_notifies(
8919 map<int,vector<pair<pg_notify_t,PastIntervals> > >& notify_list,
8920 OSDMapRef curmap)
8921{
8922 for (map<int,
8923 vector<pair<pg_notify_t,PastIntervals> > >::iterator it =
8924 notify_list.begin();
8925 it != notify_list.end();
8926 ++it) {
8927 if (!curmap->is_up(it->first)) {
8928 dout(20) << __func__ << " skipping down osd." << it->first << dendl;
8929 continue;
8930 }
8931 ConnectionRef con = service.get_con_osd_cluster(
8932 it->first, curmap->get_epoch());
8933 if (!con) {
8934 dout(20) << __func__ << " skipping osd." << it->first
8935 << " (NULL con)" << dendl;
8936 continue;
8937 }
8938 service.share_map_peer(it->first, con.get(), curmap);
3efd9988 8939 dout(7) << __func__ << " osd." << it->first
7c673cae
FG
8940 << " on " << it->second.size() << " PGs" << dendl;
8941 MOSDPGNotify *m = new MOSDPGNotify(curmap->get_epoch(),
8942 it->second);
8943 con->send_message(m);
8944 }
8945}
8946
8947
8948/** do_queries
8949 * send out pending queries for info | summaries
8950 */
8951void OSD::do_queries(map<int, map<spg_t,pg_query_t> >& query_map,
8952 OSDMapRef curmap)
8953{
8954 for (map<int, map<spg_t,pg_query_t> >::iterator pit = query_map.begin();
8955 pit != query_map.end();
8956 ++pit) {
8957 if (!curmap->is_up(pit->first)) {
8958 dout(20) << __func__ << " skipping down osd." << pit->first << dendl;
8959 continue;
8960 }
8961 int who = pit->first;
8962 ConnectionRef con = service.get_con_osd_cluster(who, curmap->get_epoch());
8963 if (!con) {
8964 dout(20) << __func__ << " skipping osd." << who
8965 << " (NULL con)" << dendl;
8966 continue;
8967 }
8968 service.share_map_peer(who, con.get(), curmap);
8969 dout(7) << __func__ << " querying osd." << who
8970 << " on " << pit->second.size() << " PGs" << dendl;
8971 MOSDPGQuery *m = new MOSDPGQuery(curmap->get_epoch(), pit->second);
8972 con->send_message(m);
8973 }
8974}
8975
8976
8977void OSD::do_infos(map<int,
8978 vector<pair<pg_notify_t, PastIntervals> > >& info_map,
8979 OSDMapRef curmap)
8980{
8981 for (map<int,
8982 vector<pair<pg_notify_t, PastIntervals> > >::iterator p =
8983 info_map.begin();
8984 p != info_map.end();
8985 ++p) {
8986 if (!curmap->is_up(p->first)) {
8987 dout(20) << __func__ << " skipping down osd." << p->first << dendl;
8988 continue;
8989 }
8990 for (vector<pair<pg_notify_t,PastIntervals> >::iterator i = p->second.begin();
8991 i != p->second.end();
8992 ++i) {
8993 dout(20) << __func__ << " sending info " << i->first.info
8994 << " to shard " << p->first << dendl;
8995 }
8996 ConnectionRef con = service.get_con_osd_cluster(
8997 p->first, curmap->get_epoch());
8998 if (!con) {
8999 dout(20) << __func__ << " skipping osd." << p->first
9000 << " (NULL con)" << dendl;
9001 continue;
9002 }
9003 service.share_map_peer(p->first, con.get(), curmap);
9004 MOSDPGInfo *m = new MOSDPGInfo(curmap->get_epoch());
9005 m->pg_list = p->second;
9006 con->send_message(m);
9007 }
9008 info_map.clear();
9009}
9010
9011
9012/** PGNotify
9013 * from non-primary to primary
9014 * includes pg_info_t.
9015 * NOTE: called with opqueue active.
9016 */
9017void OSD::handle_pg_notify(OpRequestRef op)
9018{
9019 const MOSDPGNotify *m = static_cast<const MOSDPGNotify*>(op->get_req());
9020 assert(m->get_type() == MSG_OSD_PG_NOTIFY);
9021
9022 dout(7) << "handle_pg_notify from " << m->get_source() << dendl;
9023 int from = m->get_source().num();
9024
9025 if (!require_osd_peer(op->get_req()))
9026 return;
9027
9028 if (!require_same_or_newer_map(op, m->get_epoch(), false))
9029 return;
9030
9031 op->mark_started();
9032
9033 for (auto it = m->get_pg_list().begin();
9034 it != m->get_pg_list().end();
9035 ++it) {
9036 if (it->first.info.pgid.preferred() >= 0) {
9037 dout(20) << "ignoring localized pg " << it->first.info.pgid << dendl;
9038 continue;
9039 }
9040
9041 handle_pg_peering_evt(
9042 spg_t(it->first.info.pgid.pgid, it->first.to),
9043 it->first.info.history, it->second,
9044 it->first.query_epoch,
9045 PG::CephPeeringEvtRef(
9046 new PG::CephPeeringEvt(
9047 it->first.epoch_sent, it->first.query_epoch,
9048 PG::MNotifyRec(pg_shard_t(from, it->first.from), it->first,
9049 op->get_req()->get_connection()->get_features())))
9050 );
9051 }
9052}
9053
9054void OSD::handle_pg_log(OpRequestRef op)
9055{
9056 MOSDPGLog *m = static_cast<MOSDPGLog*>(op->get_nonconst_req());
9057 assert(m->get_type() == MSG_OSD_PG_LOG);
9058 dout(7) << "handle_pg_log " << *m << " from " << m->get_source() << dendl;
9059
9060 if (!require_osd_peer(op->get_req()))
9061 return;
9062
9063 int from = m->get_source().num();
9064 if (!require_same_or_newer_map(op, m->get_epoch(), false))
9065 return;
9066
9067 if (m->info.pgid.preferred() >= 0) {
9068 dout(10) << "ignoring localized pg " << m->info.pgid << dendl;
9069 return;
9070 }
9071
9072 op->mark_started();
9073 handle_pg_peering_evt(
9074 spg_t(m->info.pgid.pgid, m->to),
9075 m->info.history, m->past_intervals, m->get_epoch(),
9076 PG::CephPeeringEvtRef(
9077 new PG::CephPeeringEvt(
9078 m->get_epoch(), m->get_query_epoch(),
9079 PG::MLogRec(pg_shard_t(from, m->from), m)))
9080 );
9081}
9082
9083void OSD::handle_pg_info(OpRequestRef op)
9084{
9085 const MOSDPGInfo *m = static_cast<const MOSDPGInfo *>(op->get_req());
9086 assert(m->get_type() == MSG_OSD_PG_INFO);
9087 dout(7) << "handle_pg_info " << *m << " from " << m->get_source() << dendl;
9088
9089 if (!require_osd_peer(op->get_req()))
9090 return;
9091
9092 int from = m->get_source().num();
9093 if (!require_same_or_newer_map(op, m->get_epoch(), false))
9094 return;
9095
9096 op->mark_started();
9097
9098 for (auto p = m->pg_list.begin();
9099 p != m->pg_list.end();
9100 ++p) {
9101 if (p->first.info.pgid.preferred() >= 0) {
9102 dout(10) << "ignoring localized pg " << p->first.info.pgid << dendl;
9103 continue;
9104 }
9105
9106 handle_pg_peering_evt(
9107 spg_t(p->first.info.pgid.pgid, p->first.to),
9108 p->first.info.history, p->second, p->first.epoch_sent,
9109 PG::CephPeeringEvtRef(
9110 new PG::CephPeeringEvt(
9111 p->first.epoch_sent, p->first.query_epoch,
9112 PG::MInfoRec(
9113 pg_shard_t(
9114 from, p->first.from), p->first.info, p->first.epoch_sent)))
9115 );
9116 }
9117}
9118
9119void OSD::handle_pg_trim(OpRequestRef op)
9120{
9121 const MOSDPGTrim *m = static_cast<const MOSDPGTrim*>(op->get_req());
9122 assert(m->get_type() == MSG_OSD_PG_TRIM);
9123
9124 dout(7) << "handle_pg_trim " << *m << " from " << m->get_source() << dendl;
9125
9126 if (!require_osd_peer(op->get_req()))
9127 return;
9128
9129 int from = m->get_source().num();
9130 if (!require_same_or_newer_map(op, m->epoch, false))
9131 return;
9132
9133 if (m->pgid.preferred() >= 0) {
9134 dout(10) << "ignoring localized pg " << m->pgid << dendl;
9135 return;
9136 }
9137
9138 op->mark_started();
9139
9140 PG *pg = _lookup_lock_pg(m->pgid);
9141 if(!pg) {
9142 dout(10) << " don't have pg " << m->pgid << dendl;
9143 return;
9144 }
9145
9146 if (m->epoch < pg->info.history.same_interval_since) {
9147 dout(10) << *pg << " got old trim to " << m->trim_to << ", ignoring" << dendl;
9148 pg->unlock();
9149 return;
9150 }
9151
9152 if (pg->is_primary()) {
9153 // peer is informing us of their last_complete_ondisk
9154 dout(10) << *pg << " replica osd." << from << " lcod " << m->trim_to << dendl;
9155 pg->peer_last_complete_ondisk[pg_shard_t(from, m->pgid.shard)] =
9156 m->trim_to;
9157 // trim log when the pg is recovered
9158 pg->calc_min_last_complete_ondisk();
9159 } else {
9160 // primary is instructing us to trim
9161 ObjectStore::Transaction t;
9162 pg->pg_log.trim(m->trim_to, pg->info);
9163 pg->dirty_info = true;
9164 pg->write_if_dirty(t);
9165 int tr = store->queue_transaction(pg->osr.get(), std::move(t), NULL);
9166 assert(tr == 0);
9167 }
9168 pg->unlock();
9169}
9170
9171void OSD::handle_pg_backfill_reserve(OpRequestRef op)
9172{
9173 const MBackfillReserve *m = static_cast<const MBackfillReserve*>(op->get_req());
9174 assert(m->get_type() == MSG_OSD_BACKFILL_RESERVE);
9175
9176 if (!require_osd_peer(op->get_req()))
9177 return;
9178 if (!require_same_or_newer_map(op, m->query_epoch, false))
9179 return;
9180
9181 PG::CephPeeringEvtRef evt;
9182 if (m->type == MBackfillReserve::REQUEST) {
9183 evt = PG::CephPeeringEvtRef(
9184 new PG::CephPeeringEvt(
9185 m->query_epoch,
9186 m->query_epoch,
9187 PG::RequestBackfillPrio(m->priority)));
9188 } else if (m->type == MBackfillReserve::GRANT) {
9189 evt = PG::CephPeeringEvtRef(
9190 new PG::CephPeeringEvt(
9191 m->query_epoch,
9192 m->query_epoch,
9193 PG::RemoteBackfillReserved()));
9194 } else if (m->type == MBackfillReserve::REJECT) {
3efd9988
FG
9195 // NOTE: this is replica -> primary "i reject your request"
9196 // and also primary -> replica "cancel my previously-granted request"
7c673cae
FG
9197 evt = PG::CephPeeringEvtRef(
9198 new PG::CephPeeringEvt(
9199 m->query_epoch,
9200 m->query_epoch,
9201 PG::RemoteReservationRejected()));
9202 } else {
9203 ceph_abort();
9204 }
9205
9206 if (service.splitting(m->pgid)) {
9207 peering_wait_for_split[m->pgid].push_back(evt);
9208 return;
9209 }
9210
9211 PG *pg = _lookup_lock_pg(m->pgid);
9212 if (!pg) {
9213 dout(10) << " don't have pg " << m->pgid << dendl;
9214 return;
9215 }
9216
9217 pg->queue_peering_event(evt);
9218 pg->unlock();
9219}
9220
9221void OSD::handle_pg_recovery_reserve(OpRequestRef op)
9222{
9223 const MRecoveryReserve *m = static_cast<const MRecoveryReserve*>(op->get_req());
9224 assert(m->get_type() == MSG_OSD_RECOVERY_RESERVE);
9225
9226 if (!require_osd_peer(op->get_req()))
9227 return;
9228 if (!require_same_or_newer_map(op, m->query_epoch, false))
9229 return;
9230
9231 PG::CephPeeringEvtRef evt;
9232 if (m->type == MRecoveryReserve::REQUEST) {
9233 evt = PG::CephPeeringEvtRef(
9234 new PG::CephPeeringEvt(
9235 m->query_epoch,
9236 m->query_epoch,
9237 PG::RequestRecovery()));
9238 } else if (m->type == MRecoveryReserve::GRANT) {
9239 evt = PG::CephPeeringEvtRef(
9240 new PG::CephPeeringEvt(
9241 m->query_epoch,
9242 m->query_epoch,
9243 PG::RemoteRecoveryReserved()));
9244 } else if (m->type == MRecoveryReserve::RELEASE) {
9245 evt = PG::CephPeeringEvtRef(
9246 new PG::CephPeeringEvt(
9247 m->query_epoch,
9248 m->query_epoch,
9249 PG::RecoveryDone()));
9250 } else {
9251 ceph_abort();
9252 }
9253
9254 if (service.splitting(m->pgid)) {
9255 peering_wait_for_split[m->pgid].push_back(evt);
9256 return;
9257 }
9258
9259 PG *pg = _lookup_lock_pg(m->pgid);
9260 if (!pg) {
9261 dout(10) << " don't have pg " << m->pgid << dendl;
9262 return;
9263 }
9264
9265 pg->queue_peering_event(evt);
9266 pg->unlock();
9267}
9268
c07f9fc5
FG
9269void OSD::handle_force_recovery(Message *m)
9270{
9271 MOSDForceRecovery *msg = static_cast<MOSDForceRecovery*>(m);
9272 assert(msg->get_type() == MSG_OSD_FORCE_RECOVERY);
c07f9fc5 9273
d2e6a577 9274 vector<PGRef> local_pgs;
c07f9fc5
FG
9275 local_pgs.reserve(msg->forced_pgs.size());
9276
d2e6a577
FG
9277 {
9278 RWLock::RLocker l(pg_map_lock);
9279 for (auto& i : msg->forced_pgs) {
9280 spg_t locpg;
9281 if (osdmap->get_primary_shard(i, &locpg)) {
9282 auto pg_map_entry = pg_map.find(locpg);
9283 if (pg_map_entry != pg_map.end()) {
9284 local_pgs.push_back(pg_map_entry->second);
9285 }
c07f9fc5
FG
9286 }
9287 }
9288 }
9289
9290 if (local_pgs.size()) {
9291 service.adjust_pg_priorities(local_pgs, msg->options);
9292 }
9293
9294 msg->put();
9295}
7c673cae
FG
9296
9297/** PGQuery
9298 * from primary to replica | stray
9299 * NOTE: called with opqueue active.
9300 */
9301void OSD::handle_pg_query(OpRequestRef op)
9302{
9303 assert(osd_lock.is_locked());
9304
9305 const MOSDPGQuery *m = static_cast<const MOSDPGQuery*>(op->get_req());
9306 assert(m->get_type() == MSG_OSD_PG_QUERY);
9307
9308 if (!require_osd_peer(op->get_req()))
9309 return;
9310
9311 dout(7) << "handle_pg_query from " << m->get_source() << " epoch " << m->get_epoch() << dendl;
9312 int from = m->get_source().num();
9313
9314 if (!require_same_or_newer_map(op, m->get_epoch(), false))
9315 return;
9316
9317 op->mark_started();
9318
9319 map< int, vector<pair<pg_notify_t, PastIntervals> > > notify_list;
9320
9321 for (auto it = m->pg_list.begin();
9322 it != m->pg_list.end();
9323 ++it) {
9324 spg_t pgid = it->first;
9325
9326 if (pgid.preferred() >= 0) {
9327 dout(10) << "ignoring localized pg " << pgid << dendl;
9328 continue;
9329 }
9330
9331 if (service.splitting(pgid)) {
9332 peering_wait_for_split[pgid].push_back(
9333 PG::CephPeeringEvtRef(
9334 new PG::CephPeeringEvt(
9335 it->second.epoch_sent, it->second.epoch_sent,
9336 PG::MQuery(pg_shard_t(from, it->second.from),
9337 it->second, it->second.epoch_sent))));
9338 continue;
9339 }
9340
9341 {
9342 RWLock::RLocker l(pg_map_lock);
9343 if (pg_map.count(pgid)) {
9344 PG *pg = 0;
9345 pg = _lookup_lock_pg_with_map_lock_held(pgid);
9346 pg->queue_query(
9347 it->second.epoch_sent, it->second.epoch_sent,
9348 pg_shard_t(from, it->second.from), it->second);
9349 pg->unlock();
9350 continue;
9351 }
9352 }
9353
9354 if (!osdmap->have_pg_pool(pgid.pool()))
9355 continue;
9356
9357 // get active crush mapping
9358 int up_primary, acting_primary;
9359 vector<int> up, acting;
9360 osdmap->pg_to_up_acting_osds(
9361 pgid.pgid, &up, &up_primary, &acting, &acting_primary);
9362
9363 // same primary?
9364 pg_history_t history = it->second.history;
9365 bool valid_history = project_pg_history(
9366 pgid, history, it->second.epoch_sent,
9367 up, up_primary, acting, acting_primary);
9368
9369 if (!valid_history ||
9370 it->second.epoch_sent < history.same_interval_since) {
9371 dout(10) << " pg " << pgid << " dne, and pg has changed in "
9372 << history.same_interval_since
9373 << " (msg from " << it->second.epoch_sent << ")" << dendl;
9374 continue;
9375 }
9376
9377 dout(10) << " pg " << pgid << " dne" << dendl;
9378 pg_info_t empty(spg_t(pgid.pgid, it->second.to));
9379 /* This is racy, but that should be ok: if we complete the deletion
9380 * before the pg is recreated, we'll just start it off backfilling
9381 * instead of just empty */
9382 if (service.deleting_pgs.lookup(pgid))
9383 empty.set_last_backfill(hobject_t());
9384 if (it->second.type == pg_query_t::LOG ||
9385 it->second.type == pg_query_t::FULLLOG) {
9386 ConnectionRef con = service.get_con_osd_cluster(from, osdmap->get_epoch());
9387 if (con) {
9388 MOSDPGLog *mlog = new MOSDPGLog(
9389 it->second.from, it->second.to,
9390 osdmap->get_epoch(), empty,
9391 it->second.epoch_sent);
9392 service.share_map_peer(from, con.get(), osdmap);
9393 con->send_message(mlog);
9394 }
9395 } else {
9396 notify_list[from].push_back(
9397 make_pair(
9398 pg_notify_t(
9399 it->second.from, it->second.to,
9400 it->second.epoch_sent,
9401 osdmap->get_epoch(),
9402 empty),
9403 PastIntervals(
9404 osdmap->get_pools().at(pgid.pool()).ec_pool(),
9405 *osdmap)));
9406 }
9407 }
9408 do_notifies(notify_list, osdmap);
9409}
9410
9411
9412void OSD::handle_pg_remove(OpRequestRef op)
9413{
9414 const MOSDPGRemove *m = static_cast<const MOSDPGRemove *>(op->get_req());
9415 assert(m->get_type() == MSG_OSD_PG_REMOVE);
9416 assert(osd_lock.is_locked());
9417
9418 if (!require_osd_peer(op->get_req()))
9419 return;
9420
9421 dout(7) << "handle_pg_remove from " << m->get_source() << " on "
9422 << m->pg_list.size() << " pgs" << dendl;
9423
9424 if (!require_same_or_newer_map(op, m->get_epoch(), false))
9425 return;
9426
9427 op->mark_started();
9428
9429 for (auto it = m->pg_list.begin();
9430 it != m->pg_list.end();
9431 ++it) {
9432 spg_t pgid = *it;
9433 if (pgid.preferred() >= 0) {
9434 dout(10) << "ignoring localized pg " << pgid << dendl;
9435 continue;
9436 }
9437
9438 RWLock::WLocker l(pg_map_lock);
9439 if (pg_map.count(pgid) == 0) {
9440 dout(10) << " don't have pg " << pgid << dendl;
9441 continue;
9442 }
9443 dout(5) << "queue_pg_for_deletion: " << pgid << dendl;
9444 PG *pg = _lookup_lock_pg_with_map_lock_held(pgid);
9445 pg_history_t history = pg->info.history;
9446 int up_primary, acting_primary;
9447 vector<int> up, acting;
9448 osdmap->pg_to_up_acting_osds(
9449 pgid.pgid, &up, &up_primary, &acting, &acting_primary);
9450 bool valid_history = project_pg_history(
9451 pg->info.pgid, history, pg->get_osdmap()->get_epoch(),
9452 up, up_primary, acting, acting_primary);
9453 if (valid_history &&
9454 history.same_interval_since <= m->get_epoch()) {
9455 assert(pg->get_primary().osd == m->get_source().num());
9456 PGRef _pg(pg);
9457 _remove_pg(pg);
9458 pg->unlock();
9459 } else {
9460 dout(10) << *pg << " ignoring remove request, pg changed in epoch "
9461 << history.same_interval_since
9462 << " > " << m->get_epoch() << dendl;
9463 pg->unlock();
9464 }
9465 }
9466}
9467
9468void OSD::_remove_pg(PG *pg)
9469{
9470 ObjectStore::Transaction rmt ;
9471
9472 // on_removal, which calls remove_watchers_and_notifies, and the erasure from
9473 // the pg_map must be done together without unlocking the pg lock,
9474 // to avoid racing with watcher cleanup in ms_handle_reset
9475 // and handle_notify_timeout
9476 pg->on_removal(&rmt);
9477
9478 service.cancel_pending_splits_for_parent(pg->info.pgid);
9479 int tr = store->queue_transaction(
9480 pg->osr.get(), std::move(rmt), NULL,
9481 new ContainerContext<
9482 SequencerRef>(pg->osr));
9483 assert(tr == 0);
9484
9485 DeletingStateRef deleting = service.deleting_pgs.lookup_or_create(
9486 pg->info.pgid,
9487 make_pair(
9488 pg->info.pgid,
9489 PGRef(pg))
9490 );
9491 remove_wq.queue(make_pair(PGRef(pg), deleting));
9492
9493 service.pg_remove_epoch(pg->info.pgid);
9494
9495 // dereference from op_wq
9496 op_shardedwq.clear_pg_pointer(pg->info.pgid);
9497
9498 // remove from map
9499 pg_map.erase(pg->info.pgid);
9500 pg->put("PGMap"); // since we've taken it out of map
9501}
9502
7c673cae
FG
9503// =========================================================
9504// RECOVERY
9505
9506void OSDService::_maybe_queue_recovery() {
9507 assert(recovery_lock.is_locked_by_me());
9508 uint64_t available_pushes;
9509 while (!awaiting_throttle.empty() &&
9510 _recover_now(&available_pushes)) {
9511 uint64_t to_start = MIN(
9512 available_pushes,
9513 cct->_conf->osd_recovery_max_single_start);
9514 _queue_for_recovery(awaiting_throttle.front(), to_start);
9515 awaiting_throttle.pop_front();
9516 recovery_ops_reserved += to_start;
9517 }
9518}
9519
9520bool OSDService::_recover_now(uint64_t *available_pushes)
9521{
9522 if (available_pushes)
9523 *available_pushes = 0;
9524
9525 if (ceph_clock_now() < defer_recovery_until) {
9526 dout(15) << __func__ << " defer until " << defer_recovery_until << dendl;
9527 return false;
9528 }
9529
9530 if (recovery_paused) {
9531 dout(15) << __func__ << " paused" << dendl;
9532 return false;
9533 }
9534
9535 uint64_t max = cct->_conf->osd_recovery_max_active;
9536 if (max <= recovery_ops_active + recovery_ops_reserved) {
9537 dout(15) << __func__ << " active " << recovery_ops_active
9538 << " + reserved " << recovery_ops_reserved
9539 << " >= max " << max << dendl;
9540 return false;
9541 }
9542
9543 if (available_pushes)
9544 *available_pushes = max - recovery_ops_active - recovery_ops_reserved;
9545
9546 return true;
9547}
9548
c07f9fc5 9549
d2e6a577 9550void OSDService::adjust_pg_priorities(const vector<PGRef>& pgs, int newflags)
c07f9fc5
FG
9551{
9552 if (!pgs.size() || !(newflags & (OFR_BACKFILL | OFR_RECOVERY)))
9553 return;
9554 int newstate = 0;
9555
c07f9fc5
FG
9556 if (newflags & OFR_BACKFILL) {
9557 newstate = PG_STATE_FORCED_BACKFILL;
9558 } else if (newflags & OFR_RECOVERY) {
9559 newstate = PG_STATE_FORCED_RECOVERY;
9560 }
9561
9562 // debug output here may get large, don't generate it if debug level is below
9563 // 10 and use abbreviated pg ids otherwise
9564 if ((cct)->_conf->subsys.should_gather(ceph_subsys_osd, 10)) {
9565 stringstream ss;
9566
9567 for (auto& i : pgs) {
9568 ss << i->get_pgid() << " ";
9569 }
9570
9571 dout(10) << __func__ << " working on " << ss.str() << dendl;
9572 }
9573
9574 if (newflags & OFR_CANCEL) {
9575 for (auto& i : pgs) {
d2e6a577
FG
9576 i->lock();
9577 i->_change_recovery_force_mode(newstate, true);
9578 i->unlock();
c07f9fc5
FG
9579 }
9580 } else {
9581 for (auto& i : pgs) {
9582 // make sure the PG is in correct state before forcing backfill or recovery, or
9583 // else we'll make PG keeping FORCE_* flag forever, requiring osds restart
9584 // or forcing somehow recovery/backfill.
d2e6a577 9585 i->lock();
c07f9fc5
FG
9586 int pgstate = i->get_state();
9587 if ( ((newstate == PG_STATE_FORCED_RECOVERY) && (pgstate & (PG_STATE_DEGRADED | PG_STATE_RECOVERY_WAIT | PG_STATE_RECOVERING))) ||
3efd9988 9588 ((newstate == PG_STATE_FORCED_BACKFILL) && (pgstate & (PG_STATE_DEGRADED | PG_STATE_BACKFILL_WAIT | PG_STATE_BACKFILLING))) )
d2e6a577
FG
9589 i->_change_recovery_force_mode(newstate, false);
9590 i->unlock();
c07f9fc5
FG
9591 }
9592 }
9593}
9594
7c673cae
FG
// Run one round of recovery for a pg.
//
//  pg              - the pg to recover
//  queued          - epoch the recovery op was queued at; used for the
//                    pg_has_reset_since() check and as the epoch of any
//                    Defer* peering event generated below
//  reserved_pushes - pushes reserved for this op; passed to
//                    start_recovery_ops() and released at "out"
//  handle          - thread pool handle passed to start_recovery_ops()
//                    and stored in the RecoveryCtx
//
// If a recovery sleep is configured and due, the op is not run now:
// a timer event re-queues it (with the same reserved_pushes) and the
// function returns early WITHOUT releasing the reservation.
9595void OSD::do_recovery(
9596 PG *pg, epoch_t queued, uint64_t reserved_pushes,
9597 ThreadPool::TPHandle &handle)
9598{
9599 uint64_t started = 0;
31f18b77
FG
9600
9601 /*
9602 * When the value of osd_recovery_sleep is set greater than zero, recovery
9603 * ops are scheduled after osd_recovery_sleep amount of time from the previous
9604 * recovery event's schedule time. This is done by adding a
9605 * recovery_requeue_callback event, which re-queues the recovery op using
9606 * queue_recovery_after_sleep.
9607 */
c07f9fc5 9608 float recovery_sleep = get_osd_recovery_sleep();
b32b8144 9609 {
31f18b77 9610 Mutex::Locker l(service.recovery_sleep_lock);
b32b8144
FG
9611 if (recovery_sleep > 0 && service.recovery_needs_sleep) {
// pgref is captured by value, so the callback holds its own PGRef
9612 PGRef pgref(pg);
9613 auto recovery_requeue_callback = new FunctionContext([this, pgref, queued, reserved_pushes](int r) {
9614 dout(20) << "do_recovery wake up at "
9615 << ceph_clock_now()
9616 << ", re-queuing recovery" << dendl;
9617 Mutex::Locker l(service.recovery_sleep_lock);
9618 service.recovery_needs_sleep = false;
9619 service.queue_recovery_after_sleep(pgref.get(), queued, reserved_pushes);
9620 });
9621
9622 // This is true for the first recovery op and when the previous recovery op
9623 // has been scheduled in the past. The next recovery op is scheduled after
9624 // completing the sleep from now.
9625 if (service.recovery_schedule_time < ceph_clock_now()) {
9626 service.recovery_schedule_time = ceph_clock_now();
9627 }
9628 service.recovery_schedule_time += recovery_sleep;
9629 service.recovery_sleep_timer.add_event_at(service.recovery_schedule_time,
9630 recovery_requeue_callback);
9631 dout(20) << "Recovery event scheduled at "
9632 << service.recovery_schedule_time << dendl;
// early return: bypasses "out", so reserved_pushes stay reserved and are
// handed to the re-queued op by the callback above
9633 return;
9634 }
7c673cae
FG
9635 }
9636
9637 {
b32b8144
FG
9638 {
// re-arm the sleep so that the next recovery op is delayed again
9639 Mutex::Locker l(service.recovery_sleep_lock);
9640 service.recovery_needs_sleep = true;
9641 }
9642
7c673cae
FG
// NOTE(review): presumably true when the pg was reset after this op was
// queued; the op is dropped and only the reservation is released
9643 if (pg->pg_has_reset_since(queued)) {
9644 goto out;
9645 }
9646
9647 assert(!pg->deleting);
9648 assert(pg->is_peered() && pg->is_primary());
9649
9650 assert(pg->recovery_queued);
9651 pg->recovery_queued = false;
9652
9653 dout(10) << "do_recovery starting " << reserved_pushes << " " << *pg << dendl;
9654#ifdef DEBUG_RECOVERY_OIDS
9655 dout(20) << " active was " << service.recovery_oids[pg->info.pgid] << dendl;
9656#endif
9657
9658 bool more = pg->start_recovery_ops(reserved_pushes, handle, &started);
9659 dout(10) << "do_recovery started " << started << "/" << reserved_pushes
9660 << " on " << *pg << dendl;
9661
9662 // If no recovery op is started, don't bother to manipulate the RecoveryCtx
9663 if (!started && (more || !pg->have_unfound())) {
9664 goto out;
9665 }
9666
9667 PG::RecoveryCtx rctx = create_context();
9668 rctx.handle = &handle;
9669
9670 /*
9671 * if we couldn't start any recovery ops and things are still
9672 * unfound, see if we can discover more missing object locations.
9673 * It may be that our initial locations were bad and we errored
9674 * out while trying to pull.
9675 */
9676 if (!more && pg->have_unfound()) {
9677 pg->discover_all_missing(*rctx.query_map);
// no queries generated: nobody left to ask, so defer backfill/recovery
// (retry after osd_recovery_retry_interval) instead of re-queueing
9678 if (rctx.query_map->empty()) {
224ce89b 9679 string action;
3efd9988 9680 if (pg->state_test(PG_STATE_BACKFILLING)) {
224ce89b
WB
9681 auto evt = PG::CephPeeringEvtRef(new PG::CephPeeringEvt(
9682 queued,
9683 queued,
3efd9988 9684 PG::DeferBackfill(cct->_conf->osd_recovery_retry_interval)));
224ce89b
WB
9685 pg->queue_peering_event(evt);
9686 action = "in backfill";
9687 } else if (pg->state_test(PG_STATE_RECOVERING)) {
9688 auto evt = PG::CephPeeringEvtRef(new PG::CephPeeringEvt(
9689 queued,
9690 queued,
3efd9988 9691 PG::DeferRecovery(cct->_conf->osd_recovery_retry_interval)));
224ce89b
WB
9692 pg->queue_peering_event(evt);
9693 action = "in recovery";
9694 } else {
9695 action = "already out of recovery/backfill";
9696 }
9697 dout(10) << __func__ << ": no luck, giving up on this pg for now (" << action << ")" << dendl;
7c673cae 9698 } else {
224ce89b 9699 dout(10) << __func__ << ": no luck, giving up on this pg for now (queue_recovery)" << dendl;
7c673cae
FG
9700 pg->queue_recovery();
9701 }
9702 }
9703
// persist any dirty pg state and send the queries gathered above
9704 pg->write_if_dirty(*rctx.transaction);
9705 OSDMapRef curmap = pg->get_osdmap();
9706 dispatch_context(rctx, pg, curmap);
9707 }
9708
// reached on every path except the sleep deferral's early return
9709 out:
9710 assert(started <= reserved_pushes);
9711 service.release_reserved_pushes(reserved_pushes);
9712}
9713
9714void OSDService::start_recovery_op(PG *pg, const hobject_t& soid)
9715{
9716 Mutex::Locker l(recovery_lock);
9717 dout(10) << "start_recovery_op " << *pg << " " << soid
9718 << " (" << recovery_ops_active << "/"
9719 << cct->_conf->osd_recovery_max_active << " rops)"
9720 << dendl;
9721 recovery_ops_active++;
9722
9723#ifdef DEBUG_RECOVERY_OIDS
9724 dout(20) << " active was " << recovery_oids[pg->info.pgid] << dendl;
9725 assert(recovery_oids[pg->info.pgid].count(soid) == 0);
9726 recovery_oids[pg->info.pgid].insert(soid);
9727#endif
9728}
9729
9730void OSDService::finish_recovery_op(PG *pg, const hobject_t& soid, bool dequeue)
9731{
9732 Mutex::Locker l(recovery_lock);
9733 dout(10) << "finish_recovery_op " << *pg << " " << soid
9734 << " dequeue=" << dequeue
9735 << " (" << recovery_ops_active << "/" << cct->_conf->osd_recovery_max_active << " rops)"
9736 << dendl;
9737
9738 // adjust count
9739 assert(recovery_ops_active > 0);
9740 recovery_ops_active--;
9741
9742#ifdef DEBUG_RECOVERY_OIDS
9743 dout(20) << " active oids was " << recovery_oids[pg->info.pgid] << dendl;
9744 assert(recovery_oids[pg->info.pgid].count(soid));
9745 recovery_oids[pg->info.pgid].erase(soid);
9746#endif
9747
9748 _maybe_queue_recovery();
9749}
9750
9751bool OSDService::is_recovery_active()
9752{
b5b8bbf5 9753 return local_reserver.has_reservation() || remote_reserver.has_reservation();
7c673cae
FG
9754}
9755
9756// =========================================================
9757// OPS
9758
9759bool OSD::op_is_discardable(const MOSDOp *op)
9760{
9761 // drop client request if they are not connected and can't get the
9762 // reply anyway.
9763 if (!op->get_connection()->is_connected()) {
9764 return true;
9765 }
9766 return false;
9767}
9768
9769void OSD::enqueue_op(spg_t pg, OpRequestRef& op, epoch_t epoch)
9770{
9771 utime_t latency = ceph_clock_now() - op->get_req()->get_recv_stamp();
9772 dout(15) << "enqueue_op " << op << " prio " << op->get_req()->get_priority()
9773 << " cost " << op->get_req()->get_cost()
9774 << " latency " << latency
9775 << " epoch " << epoch
9776 << " " << *(op->get_req()) << dendl;
9777 op->osd_trace.event("enqueue op");
9778 op->osd_trace.keyval("priority", op->get_req()->get_priority());
9779 op->osd_trace.keyval("cost", op->get_req()->get_cost());
9780 op->mark_queued_for_pg();
224ce89b 9781 logger->tinc(l_osd_op_before_queue_op_lat, latency);
7c673cae
FG
9782 op_shardedwq.queue(make_pair(pg, PGQueueable(op, epoch)));
9783}
9784
9785
9786
9787/*
9788 * NOTE: dequeue called in worker thread, with pg lock
9789 */
9790void OSD::dequeue_op(
9791 PGRef pg, OpRequestRef op,
9792 ThreadPool::TPHandle &handle)
9793{
9794 FUNCTRACE();
9795 OID_EVENT_TRACE_WITH_MSG(op->get_req(), "DEQUEUE_OP_BEGIN", false);
9796
9797 utime_t now = ceph_clock_now();
9798 op->set_dequeued_time(now);
9799 utime_t latency = now - op->get_req()->get_recv_stamp();
9800 dout(10) << "dequeue_op " << op << " prio " << op->get_req()->get_priority()
9801 << " cost " << op->get_req()->get_cost()
9802 << " latency " << latency
9803 << " " << *(op->get_req())
9804 << " pg " << *pg << dendl;
9805
224ce89b
WB
9806 logger->tinc(l_osd_op_before_dequeue_op_lat, latency);
9807
7c673cae
FG
9808 Session *session = static_cast<Session *>(
9809 op->get_req()->get_connection()->get_priv());
9810 if (session) {
9811 maybe_share_map(session, op, pg->get_osdmap());
9812 session->put();
9813 }
9814
9815 if (pg->deleting)
9816 return;
9817
9818 op->mark_reached_pg();
9819 op->osd_trace.event("dequeue_op");
9820
9821 pg->do_request(op, handle);
9822
9823 // finish
9824 dout(10) << "dequeue_op " << op << " finish" << dendl;
9825 OID_EVENT_TRACE_WITH_MSG(op->get_req(), "DEQUEUE_OP_END", false);
9826}
9827
9828
9829struct C_CompleteSplits : public Context {
9830 OSD *osd;
31f18b77
FG
9831 set<PGRef> pgs;
9832 C_CompleteSplits(OSD *osd, const set<PGRef> &in)
7c673cae
FG
9833 : osd(osd), pgs(in) {}
9834 void finish(int r) override {
9835 Mutex::Locker l(osd->osd_lock);
9836 if (osd->is_stopping())
9837 return;
9838 PG::RecoveryCtx rctx = osd->create_context();
31f18b77 9839 for (set<PGRef>::iterator i = pgs.begin();
7c673cae
FG
9840 i != pgs.end();
9841 ++i) {
9842 osd->pg_map_lock.get_write();
9843 (*i)->lock();
31f18b77
FG
9844 PG *pg = i->get();
9845 osd->add_newly_split_pg(pg, &rctx);
7c673cae
FG
9846 if (!((*i)->deleting)) {
9847 set<spg_t> to_complete;
9848 to_complete.insert((*i)->info.pgid);
9849 osd->service.complete_split(to_complete);
9850 }
9851 osd->pg_map_lock.put_write();
31f18b77 9852 osd->dispatch_context_transaction(rctx, pg);
7c673cae
FG
9853 osd->wake_pg_waiters(*i);
9854 (*i)->unlock();
9855 }
9856
9857 osd->dispatch_context(rctx, 0, osd->service.get_osdmap());
9858 }
9859};
9860
9861void OSD::process_peering_events(
9862 const list<PG*> &pgs,
9863 ThreadPool::TPHandle &handle
9864 )
9865{
9866 bool need_up_thru = false;
9867 epoch_t same_interval_since = 0;
9868 OSDMapRef curmap;
9869 PG::RecoveryCtx rctx = create_context();
9870 rctx.handle = &handle;
9871 for (list<PG*>::const_iterator i = pgs.begin();
9872 i != pgs.end();
9873 ++i) {
31f18b77 9874 set<PGRef> split_pgs;
7c673cae
FG
9875 PG *pg = *i;
9876 pg->lock_suspend_timeout(handle);
9877 curmap = service.get_osdmap();
9878 if (pg->deleting) {
9879 pg->unlock();
9880 continue;
9881 }
9882 if (!advance_pg(curmap->get_epoch(), pg, handle, &rctx, &split_pgs)) {
9883 // we need to requeue the PG explicitly since we didn't actually
9884 // handle an event
9885 peering_wq.queue(pg);
9886 } else {
9887 assert(!pg->peering_queue.empty());
9888 PG::CephPeeringEvtRef evt = pg->peering_queue.front();
9889 pg->peering_queue.pop_front();
9890 pg->handle_peering_event(evt, &rctx);
9891 }
9892 need_up_thru = pg->need_up_thru || need_up_thru;
9893 same_interval_since = MAX(pg->info.history.same_interval_since,
9894 same_interval_since);
9895 pg->write_if_dirty(*rctx.transaction);
9896 if (!split_pgs.empty()) {
9897 rctx.on_applied->add(new C_CompleteSplits(this, split_pgs));
9898 split_pgs.clear();
9899 }
9900 dispatch_context_transaction(rctx, pg, &handle);
9901 pg->unlock();
9902 }
9903 if (need_up_thru)
9904 queue_want_up_thru(same_interval_since);
9905 dispatch_context(rctx, 0, curmap, &handle);
9906
9907 service.send_pg_temp();
9908}
9909
9910// --------------------------------
9911
9912const char** OSD::get_tracked_conf_keys() const
9913{
9914 static const char* KEYS[] = {
9915 "osd_max_backfills",
9916 "osd_min_recovery_priority",
224ce89b
WB
9917 "osd_max_trimming_pgs",
9918 "osd_op_complaint_time",
9919 "osd_op_log_threshold",
9920 "osd_op_history_size",
9921 "osd_op_history_duration",
9922 "osd_op_history_slow_op_size",
9923 "osd_op_history_slow_op_threshold",
7c673cae
FG
9924 "osd_enable_op_tracker",
9925 "osd_map_cache_size",
9926 "osd_map_max_advance",
9927 "osd_pg_epoch_persisted_max_stale",
9928 "osd_disk_thread_ioprio_class",
9929 "osd_disk_thread_ioprio_priority",
9930 // clog & admin clog
9931 "clog_to_monitors",
9932 "clog_to_syslog",
9933 "clog_to_syslog_facility",
9934 "clog_to_syslog_level",
9935 "osd_objectstore_fuse",
9936 "clog_to_graylog",
9937 "clog_to_graylog_host",
9938 "clog_to_graylog_port",
9939 "host",
9940 "fsid",
9941 "osd_recovery_delay_start",
9942 "osd_client_message_size_cap",
9943 "osd_client_message_cap",
31f18b77
FG
9944 "osd_heartbeat_min_size",
9945 "osd_heartbeat_interval",
7c673cae
FG
9946 NULL
9947 };
9948 return KEYS;
9949}
9950
9951void OSD::handle_conf_change(const struct md_config_t *conf,
9952 const std::set <std::string> &changed)
9953{
9954 if (changed.count("osd_max_backfills")) {
9955 service.local_reserver.set_max(cct->_conf->osd_max_backfills);
9956 service.remote_reserver.set_max(cct->_conf->osd_max_backfills);
9957 }
9958 if (changed.count("osd_min_recovery_priority")) {
9959 service.local_reserver.set_min_priority(cct->_conf->osd_min_recovery_priority);
9960 service.remote_reserver.set_min_priority(cct->_conf->osd_min_recovery_priority);
9961 }
9962 if (changed.count("osd_max_trimming_pgs")) {
9963 service.snap_reserver.set_max(cct->_conf->osd_max_trimming_pgs);
9964 }
9965 if (changed.count("osd_op_complaint_time") ||
9966 changed.count("osd_op_log_threshold")) {
9967 op_tracker.set_complaint_and_threshold(cct->_conf->osd_op_complaint_time,
9968 cct->_conf->osd_op_log_threshold);
9969 }
9970 if (changed.count("osd_op_history_size") ||
9971 changed.count("osd_op_history_duration")) {
9972 op_tracker.set_history_size_and_duration(cct->_conf->osd_op_history_size,
9973 cct->_conf->osd_op_history_duration);
9974 }
9975 if (changed.count("osd_op_history_slow_op_size") ||
9976 changed.count("osd_op_history_slow_op_threshold")) {
9977 op_tracker.set_history_slow_op_size_and_threshold(cct->_conf->osd_op_history_slow_op_size,
9978 cct->_conf->osd_op_history_slow_op_threshold);
9979 }
9980 if (changed.count("osd_enable_op_tracker")) {
9981 op_tracker.set_tracking(cct->_conf->osd_enable_op_tracker);
9982 }
9983 if (changed.count("osd_disk_thread_ioprio_class") ||
9984 changed.count("osd_disk_thread_ioprio_priority")) {
9985 set_disk_tp_priority();
9986 }
9987 if (changed.count("osd_map_cache_size")) {
9988 service.map_cache.set_size(cct->_conf->osd_map_cache_size);
9989 service.map_bl_cache.set_size(cct->_conf->osd_map_cache_size);
9990 service.map_bl_inc_cache.set_size(cct->_conf->osd_map_cache_size);
9991 }
9992 if (changed.count("clog_to_monitors") ||
9993 changed.count("clog_to_syslog") ||
9994 changed.count("clog_to_syslog_level") ||
9995 changed.count("clog_to_syslog_facility") ||
9996 changed.count("clog_to_graylog") ||
9997 changed.count("clog_to_graylog_host") ||
9998 changed.count("clog_to_graylog_port") ||
9999 changed.count("host") ||
10000 changed.count("fsid")) {
10001 update_log_config();
10002 }
10003
10004#ifdef HAVE_LIBFUSE
10005 if (changed.count("osd_objectstore_fuse")) {
10006 if (store) {
10007 enable_disable_fuse(false);
10008 }
10009 }
10010#endif
10011
10012 if (changed.count("osd_recovery_delay_start")) {
10013 service.defer_recovery(cct->_conf->osd_recovery_delay_start);
10014 service.kick_recovery_queue();
10015 }
10016
10017 if (changed.count("osd_client_message_cap")) {
10018 uint64_t newval = cct->_conf->osd_client_message_cap;
10019 Messenger::Policy pol = client_messenger->get_policy(entity_name_t::TYPE_CLIENT);
10020 if (pol.throttler_messages && newval > 0) {
10021 pol.throttler_messages->reset_max(newval);
10022 }
10023 }
10024 if (changed.count("osd_client_message_size_cap")) {
10025 uint64_t newval = cct->_conf->osd_client_message_size_cap;
10026 Messenger::Policy pol = client_messenger->get_policy(entity_name_t::TYPE_CLIENT);
10027 if (pol.throttler_bytes && newval > 0) {
10028 pol.throttler_bytes->reset_max(newval);
10029 }
10030 }
10031
10032 check_config();
10033}
10034
10035void OSD::update_log_config()
10036{
10037 map<string,string> log_to_monitors;
10038 map<string,string> log_to_syslog;
10039 map<string,string> log_channel;
10040 map<string,string> log_prio;
10041 map<string,string> log_to_graylog;
10042 map<string,string> log_to_graylog_host;
10043 map<string,string> log_to_graylog_port;
10044 uuid_d fsid;
10045 string host;
10046
10047 if (parse_log_client_options(cct, log_to_monitors, log_to_syslog,
10048 log_channel, log_prio, log_to_graylog,
10049 log_to_graylog_host, log_to_graylog_port,
10050 fsid, host) == 0)
10051 clog->update_config(log_to_monitors, log_to_syslog,
10052 log_channel, log_prio, log_to_graylog,
10053 log_to_graylog_host, log_to_graylog_port,
10054 fsid, host);
10055 derr << "log_to_monitors " << log_to_monitors << dendl;
10056}
10057
10058void OSD::check_config()
10059{
10060 // some sanity checks
10061 if (cct->_conf->osd_map_cache_size <= cct->_conf->osd_map_max_advance + 2) {
10062 clog->warn() << "osd_map_cache_size (" << cct->_conf->osd_map_cache_size << ")"
10063 << " is not > osd_map_max_advance ("
10064 << cct->_conf->osd_map_max_advance << ")";
10065 }
10066 if (cct->_conf->osd_map_cache_size <= (int)cct->_conf->osd_pg_epoch_persisted_max_stale + 2) {
10067 clog->warn() << "osd_map_cache_size (" << cct->_conf->osd_map_cache_size << ")"
10068 << " is not > osd_pg_epoch_persisted_max_stale ("
10069 << cct->_conf->osd_pg_epoch_persisted_max_stale << ")";
10070 }
10071}
10072
10073void OSD::set_disk_tp_priority()
10074{
10075 dout(10) << __func__
10076 << " class " << cct->_conf->osd_disk_thread_ioprio_class
10077 << " priority " << cct->_conf->osd_disk_thread_ioprio_priority
10078 << dendl;
10079 if (cct->_conf->osd_disk_thread_ioprio_class.empty() ||
10080 cct->_conf->osd_disk_thread_ioprio_priority < 0)
10081 return;
10082 int cls =
10083 ceph_ioprio_string_to_class(cct->_conf->osd_disk_thread_ioprio_class);
10084 if (cls < 0)
10085 derr << __func__ << cpp_strerror(cls) << ": "
10086 << "osd_disk_thread_ioprio_class is " << cct->_conf->osd_disk_thread_ioprio_class
10087 << " but only the following values are allowed: idle, be or rt" << dendl;
10088 else
10089 disk_tp.set_ioprio(cls, cct->_conf->osd_disk_thread_ioprio_priority);
10090}
10091
10092// --------------------------------
10093
10094void OSD::get_latest_osdmap()
10095{
10096 dout(10) << __func__ << " -- start" << dendl;
10097
10098 C_SaferCond cond;
10099 service.objecter->wait_for_latest_osdmap(&cond);
10100 cond.wait();
10101
10102 dout(10) << __func__ << " -- finish" << dendl;
10103}
10104
10105// --------------------------------
10106
// Classify the ops carried by the MOSDOp behind `op` and record the result
// on the OpRequest (read / write / pg-op / cache / promote / class flags,
// plus the skip_handle_cache and skip_promote hints).
//
// op->rmw_flags is reset to 0 first and then rebuilt from the op codes.
//
// Returns 0 on success, or a negative errno:
//   -EINVAL      no flag ended up set for the request
//   -EOPNOTSUPP  a CEPH_OSD_OP_CALL names a class or method for which the
//                lookup reported -ENOENT
//   -EPERM       open_class() reported a permission error (passed through)
//   -EIO         any other open_class() failure
//   other        any other negative value from get_method_flags() is
//                returned unchanged
10107 int OSD::init_op_flags(OpRequestRef& op)
10108 {
10109 const MOSDOp *m = static_cast<const MOSDOp*>(op->get_req());
10110 vector<OSDOp>::const_iterator iter;
10111
10112 // client flags have no bearing on whether an op is a read, write, etc.
10113 op->rmw_flags = 0;
10114
10115 if (m->has_flag(CEPH_OSD_FLAG_RWORDERED)) {
10116 op->set_force_rwordered();
10117 }
10118
10119 // set bits based on op codes, called methods.
10120 for (iter = m->ops.begin(); iter != m->ops.end(); ++iter) {
10121 if ((iter->op.op == CEPH_OSD_OP_WATCH &&
10122 iter->op.watch.op == CEPH_OSD_WATCH_OP_PING)) {
10123 /* This is a bit odd. PING isn't actually a write. It can't
10124 * result in an update to the object_info. PINGs also aren't
10125 * resent, so there's no reason to write out a log entry
10126 *
10127 * However, we pipeline them behind writes, so let's force
10128 * the write_ordered flag.
10129 */
10130 op->set_force_rwordered();
10131 } else {
10132 if (ceph_osd_op_mode_modify(iter->op.op))
10133 op->set_write();
10134 }
10135 if (ceph_osd_op_mode_read(iter->op.op))
10136 op->set_read();
10137
10138 // set READ flag if there are src_oids
10139 if (iter->soid.oid.name.length())
10140 op->set_read();
10141
10142 // set PGOP flag if there are PG ops
10143 if (ceph_osd_op_type_pg(iter->op.op))
10144 op->set_pg_op();
10145
10146 if (ceph_osd_op_mode_cache(iter->op.op))
10147 op->set_cache();
10148
10149 // check for ec base pool
// When the target pool is a tier whose base pool requires rollback, any op
// code that is not in the list below marks the request for promotion.
10150 int64_t poolid = m->get_pg().pool();
10151 const pg_pool_t *pool = osdmap->get_pg_pool(poolid);
10152 if (pool && pool->is_tier()) {
10153 const pg_pool_t *base_pool = osdmap->get_pg_pool(pool->tier_of);
10154 if (base_pool && base_pool->require_rollback()) {
10155 if ((iter->op.op != CEPH_OSD_OP_READ) &&
10156 (iter->op.op != CEPH_OSD_OP_CHECKSUM) &&
c07f9fc5 10157 (iter->op.op != CEPH_OSD_OP_CMPEXT) &&
7c673cae
FG
10158 (iter->op.op != CEPH_OSD_OP_STAT) &&
10159 (iter->op.op != CEPH_OSD_OP_ISDIRTY) &&
10160 (iter->op.op != CEPH_OSD_OP_UNDIRTY) &&
10161 (iter->op.op != CEPH_OSD_OP_GETXATTR) &&
10162 (iter->op.op != CEPH_OSD_OP_GETXATTRS) &&
10163 (iter->op.op != CEPH_OSD_OP_CMPXATTR) &&
10164 (iter->op.op != CEPH_OSD_OP_ASSERT_VER) &&
10165 (iter->op.op != CEPH_OSD_OP_LIST_WATCHERS) &&
10166 (iter->op.op != CEPH_OSD_OP_LIST_SNAPS) &&
10167 (iter->op.op != CEPH_OSD_OP_SETALLOCHINT) &&
10168 (iter->op.op != CEPH_OSD_OP_WRITEFULL) &&
10169 (iter->op.op != CEPH_OSD_OP_ROLLBACK) &&
10170 (iter->op.op != CEPH_OSD_OP_CREATE) &&
10171 (iter->op.op != CEPH_OSD_OP_DELETE) &&
10172 (iter->op.op != CEPH_OSD_OP_SETXATTR) &&
10173 (iter->op.op != CEPH_OSD_OP_RMXATTR) &&
10174 (iter->op.op != CEPH_OSD_OP_STARTSYNC) &&
10175 (iter->op.op != CEPH_OSD_OP_COPY_GET) &&
10176 (iter->op.op != CEPH_OSD_OP_COPY_FROM)) {
10177 op->set_promote();
10178 }
10179 }
10180 }
10181
10182 switch (iter->op.op) {
10183 case CEPH_OSD_OP_CALL:
10184 {
// The class and method names are read back-to-back from the start of the
// op's input data, using the lengths carried in the op header.
// NOTE(review): those lengths come from the request; confirm how a short
// indata buffer is handled here or by the caller.
10185 bufferlist::iterator bp = const_cast<bufferlist&>(iter->indata).begin();
10186 int is_write, is_read;
10187 string cname, mname;
10188 bp.copy(iter->op.cls.class_len, cname);
10189 bp.copy(iter->op.cls.method_len, mname);
10190
10191 ClassHandler::ClassData *cls;
10192 int r = class_handler->open_class(cname, &cls);
10193 if (r) {
10194 derr << "class " << cname << " open got " << cpp_strerror(r) << dendl;
10195 if (r == -ENOENT)
10196 r = -EOPNOTSUPP;
10197 else if (r != -EPERM) // propagate permission errors
10198 r = -EIO;
10199 return r;
10200 }
10201 int flags = cls->get_method_flags(mname.c_str());
10202 if (flags < 0) {
10203 if (flags == -ENOENT)
10204 r = -EOPNOTSUPP;
10205 else
10206 r = flags;
10207 return r;
10208 }
10209 is_read = flags & CLS_METHOD_RD;
10210 is_write = flags & CLS_METHOD_WR;
10211 bool is_promote = flags & CLS_METHOD_PROMOTE;
10212
10213 dout(10) << "class " << cname << " method " << mname << " "
10214 << "flags=" << (is_read ? "r" : "")
10215 << (is_write ? "w" : "")
10216 << (is_promote ? "p" : "")
10217 << dendl;
10218 if (is_read)
10219 op->set_class_read();
10220 if (is_write)
10221 op->set_class_write();
10222 if (is_promote)
10223 op->set_promote();
10224 op->add_class(cname, is_read, is_write, cls->whitelisted);
10225 break;
10226 }
10227
10228 case CEPH_OSD_OP_WATCH:
10229 // force the read bit for watch since it depends on previous
10230 // watch state (and may return early if the watch exists) or, in
10231 // the case of ping, is simply a read op.
10232 op->set_read();
10233 // fall through
10234 case CEPH_OSD_OP_NOTIFY:
10235 case CEPH_OSD_OP_NOTIFY_ACK:
10236 {
10237 op->set_promote();
10238 break;
10239 }
10240
10241 case CEPH_OSD_OP_DELETE:
10242 // if we get a delete with FAILOK we can skip handle cache. without
10243 // FAILOK we still need to promote (or do something smarter) to
10244 // determine whether to return ENOENT or 0.
10245 if (iter == m->ops.begin() &&
10246 iter->op.flags == CEPH_OSD_OP_FLAG_FAILOK) {
10247 op->set_skip_handle_cache();
10248 }
10249 // skip promotion when proxying a delete op
10250 if (m->ops.size() == 1) {
10251 op->set_skip_promote();
10252 }
10253 break;
10254
10255 case CEPH_OSD_OP_CACHE_TRY_FLUSH:
10256 case CEPH_OSD_OP_CACHE_FLUSH:
10257 case CEPH_OSD_OP_CACHE_EVICT:
10258 // If try_flush/flush/evict is the only op, can skip handle cache.
10259 if (m->ops.size() == 1) {
10260 op->set_skip_handle_cache();
10261 }
10262 break;
10263
10264 case CEPH_OSD_OP_READ:
10265 case CEPH_OSD_OP_SYNC_READ:
10266 case CEPH_OSD_OP_SPARSE_READ:
10267 case CEPH_OSD_OP_CHECKSUM:
10268 case CEPH_OSD_OP_WRITEFULL:
// A lone op carrying a NOCACHE or DONTNEED fadvise flag skips promotion.
10269 if (m->ops.size() == 1 &&
10270 (iter->op.flags & CEPH_OSD_OP_FLAG_FADVISE_NOCACHE ||
10271 iter->op.flags & CEPH_OSD_OP_FLAG_FADVISE_DONTNEED)) {
10272 op->set_skip_promote();
10273 }
10274 break;
10275
10276 // force promotion when pinning an object in cache tier
10277 case CEPH_OSD_OP_CACHE_PIN:
10278 op->set_promote();
10279 break;
10280
10281 default:
10282 break;
10283 }
10284 }
10285
// Nothing above set any flag: reject the request.
10286 if (op->rmw_flags == 0)
10287 return -EINVAL;
10288
10289 return 0;
10290 }
10291
10292void OSD::PeeringWQ::_dequeue(list<PG*> *out) {
10293 for (list<PG*>::iterator i = peering_queue.begin();
10294 i != peering_queue.end() &&
10295 out->size() < osd->cct->_conf->osd_peering_wq_batch_size;
10296 ) {
10297 if (in_use.count(*i)) {
10298 ++i;
10299 } else {
10300 out->push_back(*i);
10301 peering_queue.erase(i++);
10302 }
10303 }
10304 in_use.insert(out->begin(), out->end());
10305}
10306
224ce89b 10307
7c673cae
FG
10308// =============================================================
10309
// From here on, log statements take their context and "osd.N" prefix from an
// `osd` pointer that must be in scope at each dout site (as it is in the
// ShardedOpWQ methods that follow).
10310 #undef dout_context
10311 #define dout_context osd->cct
10312 #undef dout_prefix
10313 #define dout_prefix *_dout << "osd." << osd->whoami << " op_wq "
10314
10315void OSD::ShardedOpWQ::wake_pg_waiters(spg_t pgid)
10316{
10317 uint32_t shard_index = pgid.hash_to_shard(shard_list.size());
10318 auto sdata = shard_list[shard_index];
10319 bool queued = false;
7c673cae
FG
10320 {
10321 Mutex::Locker l(sdata->sdata_op_ordering_lock);
10322 auto p = sdata->pg_slots.find(pgid);
10323 if (p != sdata->pg_slots.end()) {
10324 dout(20) << __func__ << " " << pgid
10325 << " to_process " << p->second.to_process
10326 << " waiting_for_pg=" << (int)p->second.waiting_for_pg << dendl;
10327 for (auto i = p->second.to_process.rbegin();
10328 i != p->second.to_process.rend();
10329 ++i) {
10330 sdata->_enqueue_front(make_pair(pgid, *i), osd->op_prio_cutoff);
10331 }
7c673cae
FG
10332 p->second.to_process.clear();
10333 p->second.waiting_for_pg = false;
10334 ++p->second.requeue_seq;
10335 queued = true;
10336 }
10337 }
7c673cae
FG
10338 if (queued) {
10339 sdata->sdata_lock.Lock();
10340 sdata->sdata_cond.SignalOne();
10341 sdata->sdata_lock.Unlock();
10342 }
10343}
10344
10345void OSD::ShardedOpWQ::prune_pg_waiters(OSDMapRef osdmap, int whoami)
10346{
10347 unsigned pushes_to_free = 0;
10348 for (auto sdata : shard_list) {
10349 Mutex::Locker l(sdata->sdata_op_ordering_lock);
10350 sdata->waiting_for_pg_osdmap = osdmap;
10351 auto p = sdata->pg_slots.begin();
10352 while (p != sdata->pg_slots.end()) {
10353 ShardData::pg_slot& slot = p->second;
10354 if (!slot.to_process.empty() && slot.num_running == 0) {
10355 if (osdmap->is_up_acting_osd_shard(p->first, whoami)) {
10356 dout(20) << __func__ << " " << p->first << " maps to us, keeping"
10357 << dendl;
10358 ++p;
10359 continue;
10360 }
10361 while (!slot.to_process.empty() &&
10362 slot.to_process.front().get_map_epoch() <= osdmap->get_epoch()) {
10363 auto& qi = slot.to_process.front();
10364 dout(20) << __func__ << " " << p->first
10365 << " item " << qi
10366 << " epoch " << qi.get_map_epoch()
10367 << " <= " << osdmap->get_epoch()
10368 << ", stale, dropping" << dendl;
10369 pushes_to_free += qi.get_reserved_pushes();
10370 slot.to_process.pop_front();
10371 }
10372 }
10373 if (slot.to_process.empty() &&
10374 slot.num_running == 0 &&
10375 !slot.pg) {
10376 dout(20) << __func__ << " " << p->first << " empty, pruning" << dendl;
10377 p = sdata->pg_slots.erase(p);
10378 } else {
10379 ++p;
10380 }
10381 }
10382 }
10383 if (pushes_to_free > 0) {
10384 osd->service.release_reserved_pushes(pushes_to_free);
10385 }
10386}
10387
10388void OSD::ShardedOpWQ::clear_pg_pointer(spg_t pgid)
10389{
10390 uint32_t shard_index = pgid.hash_to_shard(shard_list.size());
10391 auto sdata = shard_list[shard_index];
10392 Mutex::Locker l(sdata->sdata_op_ordering_lock);
10393 auto p = sdata->pg_slots.find(pgid);
10394 if (p != sdata->pg_slots.end()) {
10395 auto& slot = p->second;
10396 dout(20) << __func__ << " " << pgid << " pg " << slot.pg << dendl;
10397 assert(!slot.pg || slot.pg->deleting);
10398 slot.pg = nullptr;
10399 }
10400}
10401
10402void OSD::ShardedOpWQ::clear_pg_slots()
10403{
10404 for (auto sdata : shard_list) {
10405 Mutex::Locker l(sdata->sdata_op_ordering_lock);
10406 sdata->pg_slots.clear();
10407 sdata->waiting_for_pg_osdmap.reset();
10408 // don't bother with reserved pushes; we are shutting down
10409 }
10410}
10411
// The prefix below also prints the shard number, so every function that logs
// after this point must have a local named `shard_index` in scope.
10412 #undef dout_prefix
10413 #define dout_prefix *_dout << "osd." << osd->whoami << " op_wq(" << shard_index << ") "
10414
// Worker-thread body: take one item off this thread's shard queue and run it
// against its PG.
//
// Outline, as implemented below:
//   1. Pick the shard (thread_index % num_shards).  If its queue is empty,
//      wait on the shard condition variable for up to
//      threadpool_empty_queue_max_wait seconds and return if still empty.
//   2. Dequeue an item, append it to the PG slot's to_process list, remember
//      the slot's requeue_seq and bump num_running -- unless the slot is
//      already waiting_for_pg, in which case the item stays parked.
//   3. Drop the ordering lock, lock the PG (looking it up if the slot has no
//      cached pointer), then retake the ordering lock and re-validate the
//      slot, bailing out if to_process was emptied, requeue_seq changed, or
//      the slot started waiting_for_pg in the meantime.
//   4. Take the front item of to_process.  With no PG available the item is
//      either put back (slot marked waiting_for_pg) or dropped, releasing
//      any recovery pushes it had reserved.
//   5. Otherwise run the item with the PG locked and unlock the PG afterwards.
//
// Every early-return path releases sdata_op_ordering_lock and, where a PG was
// locked, the PG lock.
10415 void OSD::ShardedOpWQ::_process(uint32_t thread_index, heartbeat_handle_d *hb)
10416 {
10417 uint32_t shard_index = thread_index % num_shards;
10418 ShardData *sdata = shard_list[shard_index];
10419 assert(NULL != sdata);
10420
10421 // peek at spg_t
10422 sdata->sdata_op_ordering_lock.Lock();
10423 if (sdata->pqueue->empty()) {
10424 dout(20) << __func__ << " empty q, waiting" << dendl;
10425 // optimistically sleep a moment; maybe another work item will come along.
7c673cae
FG
10426 osd->cct->get_heartbeat_map()->reset_timeout(hb,
10427 osd->cct->_conf->threadpool_default_timeout, 0);
// sdata_lock is taken before the ordering lock is released.
// NOTE(review): presumably so that a SignalOne() issued between the two
// cannot be missed -- confirm.
10428 sdata->sdata_lock.Lock();
224ce89b 10429 sdata->sdata_op_ordering_lock.Unlock();
7c673cae
FG
10430 sdata->sdata_cond.WaitInterval(sdata->sdata_lock,
10431 utime_t(osd->cct->_conf->threadpool_empty_queue_max_wait, 0));
10432 sdata->sdata_lock.Unlock();
10433 sdata->sdata_op_ordering_lock.Lock();
10434 if (sdata->pqueue->empty()) {
10435 sdata->sdata_op_ordering_lock.Unlock();
10436 return;
10437 }
10438 }
10439 pair<spg_t, PGQueueable> item = sdata->pqueue->dequeue();
10440 if (osd->is_stopping()) {
10441 sdata->sdata_op_ordering_lock.Unlock();
10442 return; // OSD shutdown, discard.
10443 }
10444 PGRef pg;
10445 uint64_t requeue_seq;
10446 {
// operator[] creates the slot if this is the first item seen for the PG.
10447 auto& slot = sdata->pg_slots[item.first];
10448 dout(30) << __func__ << " " << item.first
10449 << " to_process " << slot.to_process
10450 << " waiting_for_pg=" << (int)slot.waiting_for_pg << dendl;
10451 slot.to_process.push_back(item.second);
10452 // note the requeue seq now...
10453 requeue_seq = slot.requeue_seq;
10454 if (slot.waiting_for_pg) {
10455 // save ourselves a bit of effort
10456 dout(20) << __func__ << " " << item.first << " item " << item.second
10457 << " queued, waiting_for_pg" << dendl;
10458 sdata->sdata_op_ordering_lock.Unlock();
10459 return;
10460 }
10461 pg = slot.pg;
10462 dout(20) << __func__ << " " << item.first << " item " << item.second
10463 << " queued" << dendl;
10464 ++slot.num_running;
10465 }
10466 sdata->sdata_op_ordering_lock.Unlock();
10467
10468 osd->service.maybe_inject_dispatch_delay();
10469
10470 // [lookup +] lock pg (if we have it)
10471 if (!pg) {
10472 pg = osd->_lookup_lock_pg(item.first);
10473 } else {
10474 pg->lock();
10475 }
10476
10477 osd->service.maybe_inject_dispatch_delay();
10478
10479 boost::optional<PGQueueable> qi;
10480
10481 // we don't use a Mutex::Locker here because of the
10482 // osd->service.release_reserved_pushes() call below
10483 sdata->sdata_op_ordering_lock.Lock();
10484
10485 auto q = sdata->pg_slots.find(item.first);
10486 assert(q != sdata->pg_slots.end());
10487 auto& slot = q->second;
10488 --slot.num_running;
10489
10490 if (slot.to_process.empty()) {
10491 // raced with wake_pg_waiters or prune_pg_waiters
10492 dout(20) << __func__ << " " << item.first << " nothing queued" << dendl;
10493 if (pg) {
10494 pg->unlock();
10495 }
10496 sdata->sdata_op_ordering_lock.Unlock();
10497 return;
10498 }
10499 if (requeue_seq != slot.requeue_seq) {
10500 dout(20) << __func__ << " " << item.first
10501 << " requeue_seq " << slot.requeue_seq << " > our "
10502 << requeue_seq << ", we raced with wake_pg_waiters"
10503 << dendl;
10504 if (pg) {
10505 pg->unlock();
10506 }
10507 sdata->sdata_op_ordering_lock.Unlock();
10508 return;
10509 }
// Cache the PG pointer in the slot so the next item can skip the lookup.
10510 if (pg && !slot.pg && !pg->deleting) {
10511 dout(20) << __func__ << " " << item.first << " set pg to " << pg << dendl;
10512 slot.pg = pg;
10513 }
10514 dout(30) << __func__ << " " << item.first << " to_process " << slot.to_process
10515 << " waiting_for_pg=" << (int)slot.waiting_for_pg << dendl;
10516
10517 // make sure we're not already waiting for this pg
10518 if (slot.waiting_for_pg) {
10519 dout(20) << __func__ << " " << item.first << " item " << item.second
10520 << " slot is waiting_for_pg" << dendl;
10521 if (pg) {
10522 pg->unlock();
10523 }
10524 sdata->sdata_op_ordering_lock.Unlock();
10525 return;
10526 }
10527
10528 // take next item
// (the front of to_process, which is not necessarily the item this call
// dequeued from pqueue)
10529 qi = slot.to_process.front();
10530 slot.to_process.pop_front();
10531 dout(20) << __func__ << " " << item.first << " item " << *qi
10532 << " pg " << pg << dendl;
10533
10534 if (!pg) {
10535 // should this pg shard exist on this osd in this (or a later) epoch?
10536 OSDMapRef osdmap = sdata->waiting_for_pg_osdmap;
10537 if (osdmap->is_up_acting_osd_shard(item.first, osd->whoami)) {
10538 dout(20) << __func__ << " " << item.first
10539 << " no pg, should exist, will wait" << " on " << *qi << dendl;
10540 slot.to_process.push_front(*qi);
10541 slot.waiting_for_pg = true;
10542 } else if (qi->get_map_epoch() > osdmap->get_epoch()) {
10543 dout(20) << __func__ << " " << item.first << " no pg, item epoch is "
10544 << qi->get_map_epoch() << " > " << osdmap->get_epoch()
10545 << ", will wait on " << *qi << dendl;
10546 slot.to_process.push_front(*qi);
10547 slot.waiting_for_pg = true;
10548 } else {
10549 dout(20) << __func__ << " " << item.first << " no pg, shouldn't exist,"
10550 << " dropping " << *qi << dendl;
10551 // share map with client?
10552 if (boost::optional<OpRequestRef> _op = qi->maybe_get_op()) {
10553 Session *session = static_cast<Session *>(
10554 (*_op)->get_req()->get_connection()->get_priv());
10555 if (session) {
10556 osd->maybe_share_map(session, *_op, sdata->waiting_for_pg_osdmap);
10557 session->put();
10558 }
10559 }
// The ordering lock is released before release_reserved_pushes() is called
// (this is why no scoped locker is used above).
10560 unsigned pushes_to_free = qi->get_reserved_pushes();
10561 if (pushes_to_free > 0) {
10562 sdata->sdata_op_ordering_lock.Unlock();
10563 osd->service.release_reserved_pushes(pushes_to_free);
10564 return;
10565 }
10566 }
10567 sdata->sdata_op_ordering_lock.Unlock();
10568 return;
10569 }
10570 sdata->sdata_op_ordering_lock.Unlock();
10571
10572
10573 // osd_opwq_process marks the point at which an operation has been dequeued
10574 // and will begin to be handled by a worker thread.
10575 {
10576 #ifdef WITH_LTTNG
10577 osd_reqid_t reqid;
10578 if (boost::optional<OpRequestRef> _op = qi->maybe_get_op()) {
10579 reqid = (*_op)->get_reqid();
10580 }
10581 #endif
// NOTE(review): `reqid` only exists under WITH_LTTNG, so tracepoint()
// presumably expands to nothing otherwise -- confirm.
10582 tracepoint(osd, opwq_process_start, reqid.name._type,
10583 reqid.name._num, reqid.tid, reqid.inc);
10584 }
10585
// Dump the queue state as JSON into a level-30 log entry; the entry opened by
// lgeneric_subdout is only closed by the `*_dout << dendl` below.
10586 lgeneric_subdout(osd->cct, osd, 30) << "dequeue status: ";
10587 Formatter *f = Formatter::create("json");
10588 f->open_object_section("q");
10589 dump(f);
10590 f->close_section();
10591 f->flush(*_dout);
10592 delete f;
10593 *_dout << dendl;
10594
10595 ThreadPool::TPHandle tp_handle(osd->cct, hb, timeout_interval,
10596 suicide_interval);
10597 qi->run(osd, pg, tp_handle);
10598
10599 {
10600 #ifdef WITH_LTTNG
10601 osd_reqid_t reqid;
10602 if (boost::optional<OpRequestRef> _op = qi->maybe_get_op()) {
10603 reqid = (*_op)->get_reqid();
10604 }
10605 #endif
10606 tracepoint(osd, opwq_process_finish, reqid.name._type,
10607 reqid.name._num, reqid.tid, reqid.inc);
10608 }
10609
10610 pg->unlock();
10611 }
10612
10613void OSD::ShardedOpWQ::_enqueue(pair<spg_t, PGQueueable> item) {
10614 uint32_t shard_index =
10615 item.first.hash_to_shard(shard_list.size());
10616
10617 ShardData* sdata = shard_list[shard_index];
10618 assert (NULL != sdata);
10619 unsigned priority = item.second.get_priority();
10620 unsigned cost = item.second.get_cost();
10621 sdata->sdata_op_ordering_lock.Lock();
10622
10623 dout(20) << __func__ << " " << item.first << " " << item.second << dendl;
10624 if (priority >= osd->op_prio_cutoff)
10625 sdata->pqueue->enqueue_strict(
10626 item.second.get_owner(), priority, item);
10627 else
10628 sdata->pqueue->enqueue(
10629 item.second.get_owner(),
10630 priority, cost, item);
10631 sdata->sdata_op_ordering_lock.Unlock();
10632
10633 sdata->sdata_lock.Lock();
10634 sdata->sdata_cond.SignalOne();
10635 sdata->sdata_lock.Unlock();
10636
10637}
10638
10639void OSD::ShardedOpWQ::_enqueue_front(pair<spg_t, PGQueueable> item)
10640{
10641 uint32_t shard_index = item.first.hash_to_shard(shard_list.size());
10642 ShardData* sdata = shard_list[shard_index];
10643 assert (NULL != sdata);
10644 sdata->sdata_op_ordering_lock.Lock();
10645 auto p = sdata->pg_slots.find(item.first);
10646 if (p != sdata->pg_slots.end() && !p->second.to_process.empty()) {
10647 // we may be racing with _process, which has dequeued a new item
10648 // from pqueue, put it on to_process, and is now busy taking the
10649 // pg lock. ensure this old requeued item is ordered before any
10650 // such newer item in to_process.
10651 p->second.to_process.push_front(item.second);
10652 item.second = p->second.to_process.back();
10653 p->second.to_process.pop_back();
10654 dout(20) << __func__ << " " << item.first
10655 << " " << p->second.to_process.front()
10656 << " shuffled w/ " << item.second << dendl;
10657 } else {
10658 dout(20) << __func__ << " " << item.first << " " << item.second << dendl;
10659 }
10660 sdata->_enqueue_front(item, osd->op_prio_cutoff);
10661 sdata->sdata_op_ordering_lock.Unlock();
10662 sdata->sdata_lock.Lock();
10663 sdata->sdata_cond.SignalOne();
10664 sdata->sdata_lock.Unlock();
10665}
10666
10667namespace ceph {
10668namespace osd_cmds {
10669
10670int heap(CephContext& cct, cmdmap_t& cmdmap, Formatter& f, std::ostream& os)
10671{
10672 if (!ceph_using_tcmalloc()) {
10673 os << "could not issue heap profiler command -- not using tcmalloc!";
10674 return -EOPNOTSUPP;
10675 }
10676
10677 string cmd;
10678 if (!cmd_getval(&cct, cmdmap, "heapcmd", cmd)) {
10679 os << "unable to get value for command \"" << cmd << "\"";
10680 return -EINVAL;
10681 }
10682
10683 std::vector<std::string> cmd_vec;
10684 get_str_vec(cmd, cmd_vec);
10685
10686 ceph_heap_profiler_handle_command(cmd_vec, os);
10687
10688 return 0;
10689}
10690
10691}} // namespace ceph::osd_cmds
10692
224ce89b
WB
10693
10694std::ostream& operator<<(std::ostream& out, const OSD::io_queue& q) {
10695 switch(q) {
10696 case OSD::io_queue::prioritized:
10697 out << "prioritized";
10698 break;
10699 case OSD::io_queue::weightedpriority:
10700 out << "weightedpriority";
10701 break;
10702 case OSD::io_queue::mclock_opclass:
10703 out << "mclock_opclass";
10704 break;
10705 case OSD::io_queue::mclock_client:
10706 out << "mclock_client";
10707 break;
10708 }
10709 return out;
10710}