]> git.proxmox.com Git - ceph.git/blame - ceph/src/mds/MDSMap.cc
import ceph 16.2.6
[ceph.git] / ceph / src / mds / MDSMap.cc
CommitLineData
7c673cae
FG
1// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
2// vim: ts=8 sw=2 smarttab
3/*
4 * Ceph - scalable distributed file system
5 *
6 * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
7 *
8 * This is free software; you can redistribute it and/or
9 * modify it under the terms of the GNU Lesser General Public
10 * License version 2.1, as published by the Free Software
11 * Foundation. See file COPYING.
12 *
13 */
14
f67539c2
TL
15#include <ostream>
16
3efd9988
FG
17#include "common/debug.h"
18#include "mon/health_check.h"
7c673cae
FG
19
20#include "MDSMap.h"
21
f67539c2
TL
22using std::dec;
23using std::hex;
24using std::list;
25using std::make_pair;
26using std::map;
27using std::multimap;
28using std::ostream;
29using std::pair;
30using std::string;
31using std::set;
32
33using ceph::bufferlist;
34using ceph::Formatter;
7c673cae 35
3efd9988
FG
36#define dout_context g_ceph_context
37#define dout_subsys ceph_subsys_
7c673cae
FG
38
39// features
1adf2230 40CompatSet MDSMap::get_compat_set_all() {
7c673cae
FG
41 CompatSet::FeatureSet feature_compat;
42 CompatSet::FeatureSet feature_ro_compat;
43 CompatSet::FeatureSet feature_incompat;
44 feature_incompat.insert(MDS_FEATURE_INCOMPAT_BASE);
45 feature_incompat.insert(MDS_FEATURE_INCOMPAT_CLIENTRANGES);
46 feature_incompat.insert(MDS_FEATURE_INCOMPAT_FILELAYOUT);
47 feature_incompat.insert(MDS_FEATURE_INCOMPAT_DIRINODE);
48 feature_incompat.insert(MDS_FEATURE_INCOMPAT_ENCODING);
49 feature_incompat.insert(MDS_FEATURE_INCOMPAT_OMAPDIRFRAG);
50 feature_incompat.insert(MDS_FEATURE_INCOMPAT_INLINE);
51 feature_incompat.insert(MDS_FEATURE_INCOMPAT_NOANCHOR);
52 feature_incompat.insert(MDS_FEATURE_INCOMPAT_FILE_LAYOUT_V2);
11fdf7f2 53 feature_incompat.insert(MDS_FEATURE_INCOMPAT_SNAPREALM_V2);
7c673cae
FG
54
55 return CompatSet(feature_compat, feature_ro_compat, feature_incompat);
56}
57
1adf2230 58CompatSet MDSMap::get_compat_set_default() {
7c673cae
FG
59 CompatSet::FeatureSet feature_compat;
60 CompatSet::FeatureSet feature_ro_compat;
61 CompatSet::FeatureSet feature_incompat;
62 feature_incompat.insert(MDS_FEATURE_INCOMPAT_BASE);
63 feature_incompat.insert(MDS_FEATURE_INCOMPAT_CLIENTRANGES);
64 feature_incompat.insert(MDS_FEATURE_INCOMPAT_FILELAYOUT);
65 feature_incompat.insert(MDS_FEATURE_INCOMPAT_DIRINODE);
66 feature_incompat.insert(MDS_FEATURE_INCOMPAT_ENCODING);
67 feature_incompat.insert(MDS_FEATURE_INCOMPAT_OMAPDIRFRAG);
68 feature_incompat.insert(MDS_FEATURE_INCOMPAT_NOANCHOR);
69 feature_incompat.insert(MDS_FEATURE_INCOMPAT_FILE_LAYOUT_V2);
11fdf7f2 70 feature_incompat.insert(MDS_FEATURE_INCOMPAT_SNAPREALM_V2);
7c673cae
FG
71
72 return CompatSet(feature_compat, feature_ro_compat, feature_incompat);
73}
74
75// base (pre v0.20)
1adf2230 76CompatSet MDSMap::get_compat_set_base() {
7c673cae
FG
77 CompatSet::FeatureSet feature_compat_base;
78 CompatSet::FeatureSet feature_incompat_base;
79 feature_incompat_base.insert(MDS_FEATURE_INCOMPAT_BASE);
80 CompatSet::FeatureSet feature_ro_compat_base;
81
82 return CompatSet(feature_compat_base, feature_ro_compat_base, feature_incompat_base);
83}
84
85void MDSMap::mds_info_t::dump(Formatter *f) const
86{
87 f->dump_unsigned("gid", global_id);
88 f->dump_string("name", name);
89 f->dump_int("rank", rank);
90 f->dump_int("incarnation", inc);
91 f->dump_stream("state") << ceph_mds_state_name(state);
92 f->dump_int("state_seq", state_seq);
11fdf7f2
TL
93 f->dump_stream("addr") << addrs.get_legacy_str();
94 f->dump_object("addrs", addrs);
9f95a23c 95 f->dump_int("join_fscid", join_fscid);
7c673cae
FG
96 if (laggy_since != utime_t())
97 f->dump_stream("laggy_since") << laggy_since;
98
7c673cae
FG
99 f->open_array_section("export_targets");
100 for (set<mds_rank_t>::iterator p = export_targets.begin();
101 p != export_targets.end(); ++p) {
102 f->dump_int("mds", *p);
103 }
104 f->close_section();
105 f->dump_unsigned("features", mds_features);
11fdf7f2 106 f->dump_unsigned("flags", flags);
522d829b 107 f->dump_object("compat", compat);
7c673cae
FG
108}
109
9f95a23c 110void MDSMap::mds_info_t::dump(std::ostream& o) const
7c673cae 111{
9f95a23c
TL
112 o << "[mds." << name << "{" << rank << ":" << global_id << "}"
113 << " state " << ceph_mds_state_name(state)
114 << " seq " << state_seq;
7c673cae 115 if (laggy()) {
9f95a23c 116 o << " laggy since " << laggy_since;
7c673cae 117 }
7c673cae 118 if (!export_targets.empty()) {
9f95a23c 119 o << " export targets " << export_targets;
7c673cae 120 }
11fdf7f2 121 if (is_frozen()) {
9f95a23c 122 o << " frozen";
11fdf7f2 123 }
9f95a23c
TL
124 if (join_fscid != FS_CLUSTER_ID_NONE) {
125 o << " join_fscid=" << join_fscid;
126 }
522d829b
TL
127 o << " addr " << addrs;
128 o << " compat ";
129 compat.printlite(o);
130 o << "]";
7c673cae
FG
131}
132
9f95a23c 133void MDSMap::mds_info_t::generate_test_instances(std::list<mds_info_t*>& ls)
7c673cae
FG
134{
135 mds_info_t *sample = new mds_info_t();
136 ls.push_back(sample);
137 sample = new mds_info_t();
138 sample->global_id = 1;
139 sample->name = "test_instance";
140 sample->rank = 0;
141 ls.push_back(sample);
142}
143
144void MDSMap::dump(Formatter *f) const
145{
146 f->dump_int("epoch", epoch);
147 f->dump_unsigned("flags", flags);
148 f->dump_unsigned("ever_allowed_features", ever_allowed_features);
149 f->dump_unsigned("explicitly_allowed_features", explicitly_allowed_features);
150 f->dump_stream("created") << created;
151 f->dump_stream("modified") << modified;
152 f->dump_int("tableserver", tableserver);
153 f->dump_int("root", root);
154 f->dump_int("session_timeout", session_timeout);
155 f->dump_int("session_autoclose", session_autoclose);
f67539c2
TL
156 f->open_object_section("required_client_features");
157 cephfs_dump_features(f, required_client_features);
158 f->close_section();
7c673cae
FG
159 f->dump_int("max_file_size", max_file_size);
160 f->dump_int("last_failure", last_failure);
161 f->dump_int("last_failure_osd_epoch", last_failure_osd_epoch);
162 f->open_object_section("compat");
163 compat.dump(f);
164 f->close_section();
165 f->dump_int("max_mds", max_mds);
166 f->open_array_section("in");
167 for (set<mds_rank_t>::const_iterator p = in.begin(); p != in.end(); ++p)
168 f->dump_int("mds", *p);
169 f->close_section();
170 f->open_object_section("up");
171 for (map<mds_rank_t,mds_gid_t>::const_iterator p = up.begin(); p != up.end(); ++p) {
172 char s[14];
173 sprintf(s, "mds_%d", int(p->first));
174 f->dump_int(s, p->second);
175 }
176 f->close_section();
177 f->open_array_section("failed");
178 for (set<mds_rank_t>::const_iterator p = failed.begin(); p != failed.end(); ++p)
179 f->dump_int("mds", *p);
180 f->close_section();
181 f->open_array_section("damaged");
182 for (set<mds_rank_t>::const_iterator p = damaged.begin(); p != damaged.end(); ++p)
183 f->dump_int("mds", *p);
184 f->close_section();
185 f->open_array_section("stopped");
186 for (set<mds_rank_t>::const_iterator p = stopped.begin(); p != stopped.end(); ++p)
187 f->dump_int("mds", *p);
188 f->close_section();
189 f->open_object_section("info");
9f95a23c 190 for (const auto& [gid, info] : mds_info) {
7c673cae 191 char s[25]; // 'gid_' + len(str(ULLONG_MAX)) + '\0'
9f95a23c 192 sprintf(s, "gid_%llu", (long long unsigned)gid);
7c673cae 193 f->open_object_section(s);
9f95a23c 194 info.dump(f);
7c673cae
FG
195 f->close_section();
196 }
197 f->close_section();
198 f->open_array_section("data_pools");
9f95a23c 199 for (const auto& p: data_pools)
31f18b77 200 f->dump_int("pool", p);
7c673cae
FG
201 f->close_section();
202 f->dump_int("metadata_pool", metadata_pool);
203 f->dump_bool("enabled", enabled);
204 f->dump_string("fs_name", fs_name);
205 f->dump_string("balancer", balancer);
206 f->dump_int("standby_count_wanted", std::max(0, standby_count_wanted));
207}
208
9f95a23c 209void MDSMap::generate_test_instances(std::list<MDSMap*>& ls)
7c673cae
FG
210{
211 MDSMap *m = new MDSMap();
212 m->max_mds = 1;
31f18b77 213 m->data_pools.push_back(0);
7c673cae
FG
214 m->metadata_pool = 1;
215 m->cas_pool = 2;
1adf2230 216 m->compat = get_compat_set_all();
7c673cae
FG
217
218 // these aren't the defaults, just in case anybody gets confused
219 m->session_timeout = 61;
220 m->session_autoclose = 301;
221 m->max_file_size = 1<<24;
222 ls.push_back(m);
223}
224
225void MDSMap::print(ostream& out) const
226{
227 out << "fs_name\t" << fs_name << "\n";
228 out << "epoch\t" << epoch << "\n";
229 out << "flags\t" << hex << flags << dec << "\n";
230 out << "created\t" << created << "\n";
231 out << "modified\t" << modified << "\n";
232 out << "tableserver\t" << tableserver << "\n";
233 out << "root\t" << root << "\n";
234 out << "session_timeout\t" << session_timeout << "\n"
235 << "session_autoclose\t" << session_autoclose << "\n";
236 out << "max_file_size\t" << max_file_size << "\n";
f67539c2 237 out << "required_client_features\t" << cephfs_stringify_features(required_client_features) << "\n";
7c673cae
FG
238 out << "last_failure\t" << last_failure << "\n"
239 << "last_failure_osd_epoch\t" << last_failure_osd_epoch << "\n";
240 out << "compat\t" << compat << "\n";
241 out << "max_mds\t" << max_mds << "\n";
242 out << "in\t" << in << "\n"
243 << "up\t" << up << "\n"
244 << "failed\t" << failed << "\n"
245 << "damaged\t" << damaged << "\n"
246 << "stopped\t" << stopped << "\n";
247 out << "data_pools\t" << data_pools << "\n";
248 out << "metadata_pool\t" << metadata_pool << "\n";
249 out << "inline_data\t" << (inline_data_enabled ? "enabled" : "disabled") << "\n";
250 out << "balancer\t" << balancer << "\n";
251 out << "standby_count_wanted\t" << std::max(0, standby_count_wanted) << "\n";
252
253 multimap< pair<mds_rank_t, unsigned>, mds_gid_t > foo;
254 for (const auto &p : mds_info) {
255 foo.insert(std::make_pair(
256 std::make_pair(p.second.rank, p.second.inc-1), p.first));
257 }
258
259 for (const auto &p : foo) {
9f95a23c 260 out << mds_info.at(p.second) << "\n";
7c673cae
FG
261 }
262}
263
7c673cae
FG
264void MDSMap::print_summary(Formatter *f, ostream *out) const
265{
266 map<mds_rank_t,string> by_rank;
267 map<string,int> by_state;
268
269 if (f) {
270 f->dump_unsigned("epoch", get_epoch());
271 f->dump_unsigned("up", up.size());
272 f->dump_unsigned("in", in.size());
273 f->dump_unsigned("max", max_mds);
274 } else {
275 *out << "e" << get_epoch() << ": " << up.size() << "/" << in.size() << "/" << max_mds << " up";
276 }
277
278 if (f)
279 f->open_array_section("by_rank");
280 for (const auto &p : mds_info) {
281 string s = ceph_mds_state_name(p.second.state);
282 if (p.second.laggy())
283 s += "(laggy or crashed)";
284
285 if (p.second.rank >= 0 && p.second.state != MDSMap::STATE_STANDBY_REPLAY) {
286 if (f) {
287 f->open_object_section("mds");
288 f->dump_unsigned("rank", p.second.rank);
289 f->dump_string("name", p.second.name);
290 f->dump_string("status", s);
291 f->close_section();
292 } else {
293 by_rank[p.second.rank] = p.second.name + "=" + s;
294 }
295 } else {
296 by_state[s]++;
297 }
298 }
299 if (f) {
300 f->close_section();
301 } else {
302 if (!by_rank.empty())
303 *out << " " << by_rank;
304 }
305
306 for (map<string,int>::reverse_iterator p = by_state.rbegin(); p != by_state.rend(); ++p) {
307 if (f) {
308 f->dump_unsigned(p->first.c_str(), p->second);
309 } else {
310 *out << ", " << p->second << " " << p->first;
311 }
312 }
313
314 if (!failed.empty()) {
315 if (f) {
316 f->dump_unsigned("failed", failed.size());
317 } else {
318 *out << ", " << failed.size() << " failed";
319 }
320 }
321
322 if (!damaged.empty()) {
323 if (f) {
324 f->dump_unsigned("damaged", damaged.size());
325 } else {
326 *out << ", " << damaged.size() << " damaged";
327 }
328 }
329 //if (stopped.size())
330 //out << ", " << stopped.size() << " stopped";
331}
332
333void MDSMap::get_health(list<pair<health_status_t,string> >& summary,
334 list<pair<health_status_t,string> > *detail) const
335{
336 if (!failed.empty()) {
f67539c2
TL
337 CachedStackStringStream css;
338 *css << "mds rank"
7c673cae
FG
339 << ((failed.size() > 1) ? "s ":" ")
340 << failed
341 << ((failed.size() > 1) ? " have":" has")
342 << " failed";
f67539c2 343 summary.push_back(make_pair(HEALTH_ERR, css->str()));
7c673cae 344 if (detail) {
f67539c2
TL
345 for (const auto& r : failed) {
346 CachedStackStringStream css;
347 *css << "mds." << r << " has failed";
348 detail->push_back(make_pair(HEALTH_ERR, css->str()));
7c673cae
FG
349 }
350 }
351 }
352
353 if (!damaged.empty()) {
f67539c2
TL
354 CachedStackStringStream css;
355 *css << "mds rank"
356 << ((damaged.size() > 1) ? "s ":" ")
357 << damaged
358 << ((damaged.size() > 1) ? " are":" is")
359 << " damaged";
360 summary.push_back(make_pair(HEALTH_ERR, css->str()));
7c673cae 361 if (detail) {
f67539c2
TL
362 for (const auto& r : damaged) {
363 CachedStackStringStream css;
364 *css << "mds." << r << " is damaged";
365 detail->push_back(make_pair(HEALTH_ERR, css->str()));
7c673cae
FG
366 }
367 }
368 }
369
370 if (is_degraded()) {
371 summary.push_back(make_pair(HEALTH_WARN, "mds cluster is degraded"));
372 if (detail) {
373 detail->push_back(make_pair(HEALTH_WARN, "mds cluster is degraded"));
374 for (mds_rank_t i = mds_rank_t(0); i< get_max_mds(); i++) {
375 if (!is_up(i))
376 continue;
377 mds_gid_t gid = up.find(i)->second;
9f95a23c 378 const auto& info = mds_info.at(gid);
f67539c2 379 CachedStackStringStream css;
7c673cae 380 if (is_resolve(i))
f67539c2 381 *css << "mds." << info.name << " at " << info.addrs
11fdf7f2 382 << " rank " << i << " is resolving";
7c673cae 383 if (is_replay(i))
f67539c2 384 *css << "mds." << info.name << " at " << info.addrs
11fdf7f2 385 << " rank " << i << " is replaying journal";
7c673cae 386 if (is_rejoin(i))
f67539c2 387 *css << "mds." << info.name << " at " << info.addrs
11fdf7f2 388 << " rank " << i << " is rejoining";
7c673cae 389 if (is_reconnect(i))
f67539c2 390 *css << "mds." << info.name << " at " << info.addrs
11fdf7f2 391 << " rank " << i << " is reconnecting to clients";
f67539c2
TL
392 if (css->strv().length())
393 detail->push_back(make_pair(HEALTH_WARN, css->str()));
7c673cae
FG
394 }
395 }
396 }
397
11fdf7f2 398 {
f67539c2
TL
399 CachedStackStringStream css;
400 *css << fs_name << " max_mds " << max_mds;
401 summary.push_back(make_pair(HEALTH_WARN, css->str()));
11fdf7f2
TL
402 }
403
404 if ((mds_rank_t)up.size() < max_mds) {
f67539c2
TL
405 CachedStackStringStream css;
406 *css << fs_name << " has " << up.size()
407 << " active MDS(s), but has max_mds of " << max_mds;
408 summary.push_back(make_pair(HEALTH_WARN, css->str()));
11fdf7f2
TL
409 }
410
7c673cae
FG
411 set<string> laggy;
412 for (const auto &u : up) {
9f95a23c
TL
413 const auto& info = mds_info.at(u.second);
414 if (info.laggy()) {
415 laggy.insert(info.name);
7c673cae 416 if (detail) {
f67539c2
TL
417 CachedStackStringStream css;
418 *css << "mds." << info.name << " at " << info.addrs
11fdf7f2 419 << " is laggy/unresponsive";
f67539c2 420 detail->push_back(make_pair(HEALTH_WARN, css->str()));
7c673cae
FG
421 }
422 }
423 }
424
425 if (!laggy.empty()) {
f67539c2
TL
426 CachedStackStringStream css;
427 *css << "mds " << laggy
428 << ((laggy.size() > 1) ? " are":" is")
429 << " laggy";
430 summary.push_back(make_pair(HEALTH_WARN, css->str()));
7c673cae 431 }
11fdf7f2
TL
432
433 if (get_max_mds() > 1 &&
434 was_snaps_ever_allowed() && !allows_multimds_snaps()) {
f67539c2
TL
435 CachedStackStringStream css;
436 *css << "multi-active mds while there are snapshots possibly created by pre-mimic MDS";
437 summary.push_back(make_pair(HEALTH_WARN, css->str()));
11fdf7f2 438 }
7c673cae
FG
439}
440
224ce89b
WB
441void MDSMap::get_health_checks(health_check_map_t *checks) const
442{
d2e6a577 443 // MDS_DAMAGE
224ce89b 444 if (!damaged.empty()) {
d2e6a577 445 health_check_t& check = checks->get_or_add("MDS_DAMAGE", HEALTH_ERR,
9f95a23c
TL
446 "%num% mds daemon%plurals% damaged",
447 damaged.size());
f67539c2
TL
448 for (const auto& p : damaged) {
449 CachedStackStringStream css;
450 *css << "fs " << fs_name << " mds." << p << " is damaged";
451 check.detail.push_back(css->str());
224ce89b
WB
452 }
453 }
454
455 // FS_DEGRADED
224ce89b 456 if (is_degraded()) {
d2e6a577 457 health_check_t& fscheck = checks->get_or_add(
224ce89b 458 "FS_DEGRADED", HEALTH_WARN,
9f95a23c 459 "%num% filesystem%plurals% %isorare% degraded", 1);
f67539c2
TL
460 CachedStackStringStream css;
461 *css << "fs " << fs_name << " is degraded";
462 fscheck.detail.push_back(css->str());
224ce89b
WB
463
464 list<string> detail;
465 for (mds_rank_t i = mds_rank_t(0); i< get_max_mds(); i++) {
466 if (!is_up(i))
467 continue;
468 mds_gid_t gid = up.find(i)->second;
9f95a23c 469 const auto& info = mds_info.at(gid);
f67539c2
TL
470 CachedStackStringStream css;
471 *css << "fs " << fs_name << " mds." << info.name << " at "
9f95a23c 472 << info.addrs << " rank " << i;
224ce89b 473 if (is_resolve(i))
f67539c2 474 *css << " is resolving";
224ce89b 475 if (is_replay(i))
f67539c2 476 *css << " is replaying journal";
224ce89b 477 if (is_rejoin(i))
f67539c2 478 *css << " is rejoining";
224ce89b 479 if (is_reconnect(i))
f67539c2
TL
480 *css << " is reconnecting to clients";
481 if (css->strv().length())
482 detail.push_back(css->str());
224ce89b 483 }
224ce89b 484 }
11fdf7f2
TL
485
486 // MDS_UP_LESS_THAN_MAX
487 if ((mds_rank_t)get_num_in_mds() < get_max_mds()) {
488 health_check_t& check = checks->add(
489 "MDS_UP_LESS_THAN_MAX", HEALTH_WARN,
9f95a23c 490 "%num% filesystem%plurals% %isorare% online with fewer MDS than max_mds", 1);
f67539c2
TL
491 CachedStackStringStream css;
492 *css << "fs " << fs_name << " has " << get_num_in_mds()
493 << " MDS online, but wants " << get_max_mds();
494 check.detail.push_back(css->str());
11fdf7f2
TL
495 }
496
497 // MDS_ALL_DOWN
498 if ((mds_rank_t)get_num_up_mds() == 0 && get_max_mds() > 0) {
499 health_check_t &check = checks->add(
500 "MDS_ALL_DOWN", HEALTH_ERR,
9f95a23c 501 "%num% filesystem%plurals% %isorare% offline", 1);
f67539c2
TL
502 CachedStackStringStream css;
503 *css << "fs " << fs_name << " is offline because no MDS is active for it.";
504 check.detail.push_back(css->str());
11fdf7f2
TL
505 }
506
507 if (get_max_mds() > 1 &&
508 was_snaps_ever_allowed() && !allows_multimds_snaps()) {
509 health_check_t &check = checks->add(
510 "MULTIMDS_WITH_OLDSNAPS", HEALTH_ERR,
9f95a23c 511 "%num% filesystem%plurals% %isorare% multi-active mds with old snapshots", 1);
f67539c2
TL
512 CachedStackStringStream css;
513 *css << "multi-active mds while there are snapshots possibly created by pre-mimic MDS";
514 check.detail.push_back(css->str());
11fdf7f2 515 }
9f95a23c
TL
516
517 if (get_inline_data_enabled()) {
518 health_check_t &check = checks->add(
519 "FS_INLINE_DATA_DEPRECATED", HEALTH_WARN,
520 "%num% filesystem%plurals% with deprecated feature inline_data", 1);
f67539c2
TL
521 CachedStackStringStream css;
522 *css << "fs " << fs_name << " has deprecated feature inline_data enabled.";
523 check.detail.push_back(css->str());
9f95a23c 524 }
224ce89b
WB
525}
526
7c673cae
FG
527void MDSMap::mds_info_t::encode_versioned(bufferlist& bl, uint64_t features) const
528{
522d829b 529 __u8 v = 10;
11fdf7f2
TL
530 if (!HAVE_FEATURE(features, SERVER_NAUTILUS)) {
531 v = 7;
532 }
533 ENCODE_START(v, 4, bl);
534 encode(global_id, bl);
535 encode(name, bl);
536 encode(rank, bl);
537 encode(inc, bl);
538 encode((int32_t)state, bl);
539 encode(state_seq, bl);
540 if (v < 8) {
541 encode(addrs.legacy_addr(), bl, features);
542 } else {
543 encode(addrs, bl, features);
544 }
545 encode(laggy_since, bl);
546 encode(MDS_RANK_NONE, bl); /* standby_for_rank */
547 encode(std::string(), bl); /* standby_for_name */
548 encode(export_targets, bl);
549 encode(mds_features, bl);
9f95a23c 550 encode(join_fscid, bl); /* formerly: standby_for_fscid */
11fdf7f2
TL
551 encode(false, bl);
552 if (v >= 9) {
553 encode(flags, bl);
554 }
522d829b
TL
555 if (v >= 10) {
556 encode(compat, bl);
557 }
7c673cae
FG
558 ENCODE_FINISH(bl);
559}
560
561void MDSMap::mds_info_t::encode_unversioned(bufferlist& bl) const
562{
563 __u8 struct_v = 3;
11fdf7f2
TL
564 using ceph::encode;
565 encode(struct_v, bl);
566 encode(global_id, bl);
567 encode(name, bl);
568 encode(rank, bl);
569 encode(inc, bl);
570 encode((int32_t)state, bl);
571 encode(state_seq, bl);
572 encode(addrs.legacy_addr(), bl, 0);
573 encode(laggy_since, bl);
574 encode(MDS_RANK_NONE, bl);
575 encode(std::string(), bl);
576 encode(export_targets, bl);
7c673cae
FG
577}
578
11fdf7f2 579void MDSMap::mds_info_t::decode(bufferlist::const_iterator& bl)
7c673cae 580{
522d829b 581 DECODE_START_LEGACY_COMPAT_LEN(10, 4, 4, bl);
11fdf7f2
TL
582 decode(global_id, bl);
583 decode(name, bl);
584 decode(rank, bl);
585 decode(inc, bl);
f91f0fd5
TL
586 int32_t raw_state;
587 decode(raw_state, bl);
588 state = (MDSMap::DaemonState)raw_state;
11fdf7f2
TL
589 decode(state_seq, bl);
590 decode(addrs, bl);
591 decode(laggy_since, bl);
592 {
593 mds_rank_t standby_for_rank;
594 decode(standby_for_rank, bl);
595 }
596 {
597 std::string standby_for_name;
598 decode(standby_for_name, bl);
599 }
7c673cae 600 if (struct_v >= 2)
11fdf7f2 601 decode(export_targets, bl);
7c673cae 602 if (struct_v >= 5)
11fdf7f2 603 decode(mds_features, bl);
7c673cae 604 if (struct_v >= 6) {
9f95a23c 605 decode(join_fscid, bl);
7c673cae
FG
606 }
607 if (struct_v >= 7) {
11fdf7f2
TL
608 bool standby_replay;
609 decode(standby_replay, bl);
610 }
611 if (struct_v >= 9) {
612 decode(flags, bl);
7c673cae 613 }
522d829b
TL
614 if (struct_v >= 10) {
615 decode(compat, bl);
616 }
7c673cae
FG
617 DECODE_FINISH(bl);
618}
619
d2e6a577
FG
620std::string MDSMap::mds_info_t::human_name() const
621{
622 // Like "daemon mds.myhost restarted", "Activating daemon mds.myhost"
f67539c2
TL
623 CachedStackStringStream css;
624 *css << "daemon mds." << name;
625 return css->str();
d2e6a577 626}
7c673cae
FG
627
628void MDSMap::encode(bufferlist& bl, uint64_t features) const
629{
630 std::map<mds_rank_t,int32_t> inc; // Legacy field, fake it so that
631 // old-mon peers have something sane
632 // during upgrade
633 for (const auto rank : in) {
634 inc.insert(std::make_pair(rank, epoch));
635 }
636
11fdf7f2 637 using ceph::encode;
7c673cae
FG
638 if ((features & CEPH_FEATURE_PGID64) == 0) {
639 __u16 v = 2;
11fdf7f2
TL
640 encode(v, bl);
641 encode(epoch, bl);
642 encode(flags, bl);
643 encode(last_failure, bl);
644 encode(root, bl);
645 encode(session_timeout, bl);
646 encode(session_autoclose, bl);
647 encode(max_file_size, bl);
648 encode(max_mds, bl);
7c673cae 649 __u32 n = mds_info.size();
11fdf7f2 650 encode(n, bl);
7c673cae
FG
651 for (map<mds_gid_t, mds_info_t>::const_iterator i = mds_info.begin();
652 i != mds_info.end(); ++i) {
11fdf7f2
TL
653 encode(i->first, bl);
654 encode(i->second, bl, features);
7c673cae
FG
655 }
656 n = data_pools.size();
11fdf7f2 657 encode(n, bl);
31f18b77
FG
658 for (const auto p: data_pools) {
659 n = p;
11fdf7f2 660 encode(n, bl);
7c673cae
FG
661 }
662
663 int32_t m = cas_pool;
11fdf7f2 664 encode(m, bl);
7c673cae
FG
665 return;
666 } else if ((features & CEPH_FEATURE_MDSENC) == 0) {
667 __u16 v = 3;
11fdf7f2
TL
668 encode(v, bl);
669 encode(epoch, bl);
670 encode(flags, bl);
671 encode(last_failure, bl);
672 encode(root, bl);
673 encode(session_timeout, bl);
674 encode(session_autoclose, bl);
675 encode(max_file_size, bl);
676 encode(max_mds, bl);
7c673cae 677 __u32 n = mds_info.size();
11fdf7f2 678 encode(n, bl);
7c673cae
FG
679 for (map<mds_gid_t, mds_info_t>::const_iterator i = mds_info.begin();
680 i != mds_info.end(); ++i) {
11fdf7f2
TL
681 encode(i->first, bl);
682 encode(i->second, bl, features);
7c673cae 683 }
11fdf7f2
TL
684 encode(data_pools, bl);
685 encode(cas_pool, bl);
7c673cae 686
7c673cae 687 __u16 ev = 5;
11fdf7f2
TL
688 encode(ev, bl);
689 encode(compat, bl);
690 encode(metadata_pool, bl);
691 encode(created, bl);
692 encode(modified, bl);
693 encode(tableserver, bl);
694 encode(in, bl);
695 encode(inc, bl);
696 encode(up, bl);
697 encode(failed, bl);
698 encode(stopped, bl);
699 encode(last_failure_osd_epoch, bl);
7c673cae
FG
700 return;
701 }
702
703 ENCODE_START(5, 4, bl);
11fdf7f2
TL
704 encode(epoch, bl);
705 encode(flags, bl);
706 encode(last_failure, bl);
707 encode(root, bl);
708 encode(session_timeout, bl);
709 encode(session_autoclose, bl);
710 encode(max_file_size, bl);
711 encode(max_mds, bl);
712 encode(mds_info, bl, features);
713 encode(data_pools, bl);
714 encode(cas_pool, bl);
7c673cae 715
f67539c2 716 __u16 ev = 16;
11fdf7f2
TL
717 encode(ev, bl);
718 encode(compat, bl);
719 encode(metadata_pool, bl);
720 encode(created, bl);
721 encode(modified, bl);
722 encode(tableserver, bl);
723 encode(in, bl);
724 encode(inc, bl);
725 encode(up, bl);
726 encode(failed, bl);
727 encode(stopped, bl);
728 encode(last_failure_osd_epoch, bl);
729 encode(ever_allowed_features, bl);
730 encode(explicitly_allowed_features, bl);
731 encode(inline_data_enabled, bl);
732 encode(enabled, bl);
733 encode(fs_name, bl);
734 encode(damaged, bl);
735 encode(balancer, bl);
736 encode(standby_count_wanted, bl);
737 encode(old_max_mds, bl);
f67539c2
TL
738 {
739 ceph_release_t min_compat_client = ceph_release_t::unknown;
740 encode(min_compat_client, bl);
741 }
742 encode(required_client_features, bl);
7c673cae
FG
743 ENCODE_FINISH(bl);
744}
745
11fdf7f2 746void MDSMap::sanitize(const std::function<bool(int64_t pool)>& pool_exists)
3efd9988
FG
747{
748 /* Before we did stricter checking, it was possible to remove a data pool
749 * without also deleting it from the MDSMap. Check for that here after
750 * decoding the data pools.
751 */
752
753 for (auto it = data_pools.begin(); it != data_pools.end();) {
754 if (!pool_exists(*it)) {
755 dout(0) << "removed non-existant data pool " << *it << " from MDSMap" << dendl;
756 it = data_pools.erase(it);
757 } else {
758 it++;
759 }
760 }
761}
762
11fdf7f2 763void MDSMap::decode(bufferlist::const_iterator& p)
7c673cae
FG
764{
765 std::map<mds_rank_t,int32_t> inc; // Legacy field, parse and drop
766
767 cached_up_features = 0;
768 DECODE_START_LEGACY_COMPAT_LEN_16(5, 4, 4, p);
11fdf7f2
TL
769 decode(epoch, p);
770 decode(flags, p);
771 decode(last_failure, p);
772 decode(root, p);
773 decode(session_timeout, p);
774 decode(session_autoclose, p);
775 decode(max_file_size, p);
776 decode(max_mds, p);
777 decode(mds_info, p);
7c673cae
FG
778 if (struct_v < 3) {
779 __u32 n;
11fdf7f2 780 decode(n, p);
7c673cae
FG
781 while (n--) {
782 __u32 m;
11fdf7f2 783 decode(m, p);
31f18b77 784 data_pools.push_back(m);
7c673cae
FG
785 }
786 __s32 s;
11fdf7f2 787 decode(s, p);
7c673cae
FG
788 cas_pool = s;
789 } else {
11fdf7f2
TL
790 decode(data_pools, p);
791 decode(cas_pool, p);
7c673cae
FG
792 }
793
794 // kclient ignores everything from here
795 __u16 ev = 1;
796 if (struct_v >= 2)
11fdf7f2 797 decode(ev, p);
7c673cae 798 if (ev >= 3)
11fdf7f2 799 decode(compat, p);
7c673cae 800 else
1adf2230 801 compat = get_compat_set_base();
7c673cae
FG
802 if (ev < 5) {
803 __u32 n;
11fdf7f2 804 decode(n, p);
7c673cae
FG
805 metadata_pool = n;
806 } else {
11fdf7f2
TL
807 decode(metadata_pool, p);
808 }
809 decode(created, p);
810 decode(modified, p);
811 decode(tableserver, p);
812 decode(in, p);
813 decode(inc, p);
814 decode(up, p);
815 decode(failed, p);
816 decode(stopped, p);
7c673cae 817 if (ev >= 4)
11fdf7f2 818 decode(last_failure_osd_epoch, p);
7c673cae
FG
819 if (ev >= 6) {
820 if (ev < 10) {
821 // previously this was a bool about snaps, not a flag map
822 bool flag;
11fdf7f2 823 decode(flag, p);
7c673cae 824 ever_allowed_features = flag ? CEPH_MDSMAP_ALLOW_SNAPS : 0;
11fdf7f2 825 decode(flag, p);
7c673cae 826 explicitly_allowed_features = flag ? CEPH_MDSMAP_ALLOW_SNAPS : 0;
7c673cae 827 } else {
11fdf7f2
TL
828 decode(ever_allowed_features, p);
829 decode(explicitly_allowed_features, p);
7c673cae
FG
830 }
831 } else {
11fdf7f2 832 ever_allowed_features = 0;
7c673cae 833 explicitly_allowed_features = 0;
7c673cae
FG
834 }
835 if (ev >= 7)
11fdf7f2 836 decode(inline_data_enabled, p);
7c673cae
FG
837
838 if (ev >= 8) {
11fdf7f2
TL
839 ceph_assert(struct_v >= 5);
840 decode(enabled, p);
841 decode(fs_name, p);
7c673cae
FG
842 } else {
843 if (epoch > 1) {
844 // If an MDS has ever been started, epoch will be greater than 1,
845 // assume filesystem is enabled.
846 enabled = true;
847 } else {
848 // Upgrading from a cluster that never used an MDS, switch off
849 // filesystem until it's explicitly enabled.
850 enabled = false;
851 }
852 }
853
854 if (ev >= 9) {
11fdf7f2 855 decode(damaged, p);
7c673cae
FG
856 }
857
858 if (ev >= 11) {
11fdf7f2 859 decode(balancer, p);
7c673cae
FG
860 }
861
862 if (ev >= 12) {
11fdf7f2
TL
863 decode(standby_count_wanted, p);
864 }
865
866 if (ev >= 13) {
867 decode(old_max_mds, p);
868 }
869
f67539c2
TL
870 if (ev >= 14) {
871 ceph_release_t min_compat_client;
872 if (ev == 14) {
873 int8_t r;
874 decode(r, p);
875 if (r < 0) {
876 min_compat_client = ceph_release_t::unknown;
877 } else {
878 min_compat_client = ceph_release_t{static_cast<uint8_t>(r)};
879 }
880 } else if (ev >= 15) {
881 decode(min_compat_client, p);
882 }
883 if (ev >= 16) {
884 decode(required_client_features, p);
9f95a23c 885 } else {
f67539c2 886 set_min_compat_client(min_compat_client);
9f95a23c 887 }
7c673cae
FG
888 }
889
522d829b
TL
890 for (auto& p: mds_info) {
891 static const CompatSet empty;
892 auto& info = p.second;
893 if (empty.compare(info.compat) == 0) {
894 /* bootstrap old compat; mds_info_t::decode does not have access to MDSMap */
895 info.compat = compat;
896 }
897 }
898
7c673cae
FG
899 DECODE_FINISH(p);
900}
901
902MDSMap::availability_t MDSMap::is_cluster_available() const
903{
904 if (epoch == 0) {
905 // If I'm a client, this means I'm looking at an MDSMap instance
906 // that was never actually initialized from the mons. Client should
907 // wait.
908 return TRANSIENT_UNAVAILABLE;
909 }
910
911 // If a rank is marked damage (unavailable until operator intervenes)
912 if (damaged.size()) {
913 return STUCK_UNAVAILABLE;
914 }
915
916 // If no ranks are created (filesystem not initialized)
917 if (in.empty()) {
918 return STUCK_UNAVAILABLE;
919 }
920
921 for (const auto rank : in) {
922 if (up.count(rank) && mds_info.at(up.at(rank)).laggy()) {
923 // This might only be transient, but because we can't see
924 // standbys, we have no way of knowing whether there is a
925 // standby available to replace the laggy guy.
926 return STUCK_UNAVAILABLE;
927 }
928 }
929
930 if (get_num_mds(CEPH_MDS_STATE_ACTIVE) > 0) {
931 // Nobody looks stuck, so indicate to client they should go ahead
932 // and try mounting if anybody is active. This may include e.g.
933 // one MDS failing over and another active: the client should
934 // proceed to start talking to the active one and let the
935 // transiently-unavailable guy catch up later.
936 return AVAILABLE;
937 } else {
938 // Nothing indicating we were stuck, but nobody active (yet)
939 //return TRANSIENT_UNAVAILABLE;
940
941 // Because we don't have standbys in the MDSMap any more, we can't
942 // reliably indicate transient vs. stuck, so always say stuck so
943 // that the client doesn't block.
944 return STUCK_UNAVAILABLE;
945 }
946}
947
948bool MDSMap::state_transition_valid(DaemonState prev, DaemonState next)
949{
950 bool state_valid = true;
951 if (next != prev) {
952 if (prev == MDSMap::STATE_REPLAY) {
953 if (next != MDSMap::STATE_RESOLVE && next != MDSMap::STATE_RECONNECT) {
954 state_valid = false;
955 }
956 } else if (prev == MDSMap::STATE_REJOIN) {
1adf2230
AA
957 if (next != MDSMap::STATE_ACTIVE &&
958 next != MDSMap::STATE_CLIENTREPLAY &&
959 next != MDSMap::STATE_STOPPED) {
7c673cae
FG
960 state_valid = false;
961 }
1adf2230 962 } else if (prev >= MDSMap::STATE_RESOLVE && prev < MDSMap::STATE_ACTIVE) {
7c673cae
FG
963 // Once I have entered replay, the only allowable transitions are to
964 // the next next along in the sequence.
965 if (next != prev + 1) {
966 state_valid = false;
967 }
968 }
969 }
970
971 return state_valid;
972}
973
974bool MDSMap::check_health(mds_rank_t standby_daemon_count)
975{
976 std::set<mds_rank_t> standbys;
977 get_standby_replay_mds_set(standbys);
978 std::set<mds_rank_t> actives;
979 get_active_mds_set(actives);
980 mds_rank_t standbys_avail = (mds_rank_t)standbys.size()+standby_daemon_count;
981
982 /* If there are standby daemons available/replaying and
983 * standby_count_wanted is unset (default), then we set it to 1. This will
984 * happen during health checks by the mons. Also, during initial creation
985 * of the FS we will have no actives so we don't want to change the default
986 * yet.
987 */
988 if (standby_count_wanted == -1 && actives.size() > 0 && standbys_avail > 0) {
989 set_standby_count_wanted(1);
990 return true;
991 }
992 return false;
993}
9f95a23c
TL
994
995mds_gid_t MDSMap::find_mds_gid_by_name(std::string_view s) const {
996 for (const auto& [gid, info] : mds_info) {
997 if (info.name == s) {
998 return gid;
999 }
1000 }
1001 return MDS_GID_NONE;
1002}
1003
1004unsigned MDSMap::get_num_mds(int state) const {
1005 unsigned n = 0;
1006 for (std::map<mds_gid_t,mds_info_t>::const_iterator p = mds_info.begin();
1007 p != mds_info.end();
1008 ++p)
1009 if (p->second.state == state) ++n;
1010 return n;
1011}
1012
1013void MDSMap::get_up_mds_set(std::set<mds_rank_t>& s) const {
1014 for (std::map<mds_rank_t, mds_gid_t>::const_iterator p = up.begin();
1015 p != up.end();
1016 ++p)
1017 s.insert(p->first);
1018}
1019
1020uint64_t MDSMap::get_up_features() {
1021 if (!cached_up_features) {
1022 bool first = true;
1023 for (std::map<mds_rank_t, mds_gid_t>::const_iterator p = up.begin();
1024 p != up.end();
1025 ++p) {
1026 std::map<mds_gid_t, mds_info_t>::const_iterator q =
1027 mds_info.find(p->second);
1028 ceph_assert(q != mds_info.end());
1029 if (first) {
1030 cached_up_features = q->second.mds_features;
1031 first = false;
1032 } else {
1033 cached_up_features &= q->second.mds_features;
1034 }
1035 }
1036 }
1037 return cached_up_features;
1038}
1039
1040void MDSMap::get_recovery_mds_set(std::set<mds_rank_t>& s) const {
1041 s = failed;
1042 for (const auto& p : damaged)
1043 s.insert(p);
1044 for (const auto& p : mds_info)
1045 if (p.second.state >= STATE_REPLAY && p.second.state <= STATE_STOPPING)
1046 s.insert(p.second.rank);
1047}
1048
1049void MDSMap::get_mds_set_lower_bound(std::set<mds_rank_t>& s, DaemonState first) const {
1050 for (std::map<mds_gid_t, mds_info_t>::const_iterator p = mds_info.begin();
1051 p != mds_info.end();
1052 ++p)
1053 if (p->second.state >= first && p->second.state <= STATE_STOPPING)
1054 s.insert(p->second.rank);
1055}
1056
1057void MDSMap::get_mds_set(std::set<mds_rank_t>& s, DaemonState state) const {
1058 for (std::map<mds_gid_t, mds_info_t>::const_iterator p = mds_info.begin();
1059 p != mds_info.end();
1060 ++p)
1061 if (p->second.state == state)
1062 s.insert(p->second.rank);
1063}
1064
1065mds_gid_t MDSMap::get_standby_replay(mds_rank_t r) const {
1066 for (auto& [gid,info] : mds_info) {
1067 if (info.rank == r && info.state == STATE_STANDBY_REPLAY) {
1068 return gid;
1069 }
1070 }
1071 return MDS_GID_NONE;
1072}
1073
1074bool MDSMap::is_degraded() const {
1075 if (!failed.empty() || !damaged.empty())
1076 return true;
1077 for (const auto& p : mds_info) {
1078 if (p.second.is_degraded())
1079 return true;
1080 }
1081 return false;
1082}
f67539c2
TL
1083
1084void MDSMap::set_min_compat_client(ceph_release_t version)
1085{
1086 vector<size_t> bits = CEPHFS_FEATURES_MDS_REQUIRED;
1087
1088 if (version >= ceph_release_t::octopus)
1089 bits.push_back(CEPHFS_FEATURE_OCTOPUS);
1090 else if (version >= ceph_release_t::nautilus)
1091 bits.push_back(CEPHFS_FEATURE_NAUTILUS);
1092 else if (version >= ceph_release_t::mimic)
1093 bits.push_back(CEPHFS_FEATURE_MIMIC);
1094 else if (version >= ceph_release_t::luminous)
1095 bits.push_back(CEPHFS_FEATURE_LUMINOUS);
1096 else if (version >= ceph_release_t::kraken)
1097 bits.push_back(CEPHFS_FEATURE_KRAKEN);
1098 else if (version >= ceph_release_t::jewel)
1099 bits.push_back(CEPHFS_FEATURE_JEWEL);
1100
1101 std::sort(bits.begin(), bits.end());
1102 required_client_features = feature_bitset_t(bits);
1103}