]> git.proxmox.com Git - ceph.git/blame - ceph/src/mds/FSMap.cc
import ceph 15.2.14
[ceph.git] / ceph / src / mds / FSMap.cc
CommitLineData
7c673cae
FG
1// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
2// vim: ts=8 sw=2 smarttab
3/*
4 * Ceph - scalable distributed file system
5 *
6 * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
7 *
8 * This is free software; you can redistribute it and/or
9 * modify it under the terms of the GNU Lesser General Public
10 * License version 2.1, as published by the Free Software
11 * Foundation. See file COPYING.
12 *
13 */
14
15
16#include "FSMap.h"
17
11fdf7f2 18#include "common/StackStringStream.h"
7c673cae 19
11fdf7f2
TL
20#include <sstream>
21#ifdef WITH_SEASTAR
22#include "crimson/common/config_proxy.h"
23#else
24#include "common/config_proxy.h"
25#endif
26#include "global/global_context.h"
224ce89b
WB
27#include "mon/health_check.h"
28
11fdf7f2 29using std::stringstream;
7c673cae
FG
30
31void Filesystem::dump(Formatter *f) const
32{
33 f->open_object_section("mdsmap");
34 mds_map.dump(f);
35 f->close_section();
36 f->dump_int("id", fscid);
37}
38
39void FSMap::dump(Formatter *f) const
40{
41 f->dump_int("epoch", epoch);
11fdf7f2
TL
42 // Use 'default' naming to match 'set-default' CLI
43 f->dump_int("default_fscid", legacy_client_fscid);
7c673cae
FG
44
45 f->open_object_section("compat");
46 compat.dump(f);
47 f->close_section();
48
49 f->open_object_section("feature_flags");
50 f->dump_bool("enable_multiple", enable_multiple);
51 f->dump_bool("ever_enabled_multiple", ever_enabled_multiple);
52 f->close_section();
53
54 f->open_array_section("standbys");
9f95a23c 55 for (const auto& [gid, info] : standby_daemons) {
7c673cae 56 f->open_object_section("info");
9f95a23c
TL
57 info.dump(f);
58 f->dump_int("epoch", standby_epochs.at(gid));
7c673cae
FG
59 f->close_section();
60 }
61 f->close_section();
62
63 f->open_array_section("filesystems");
64 for (const auto &fs : filesystems) {
65 f->open_object_section("filesystem");
66 fs.second->dump(f);
67 f->close_section();
68 }
69 f->close_section();
70}
71
9f95a23c
TL
72FSMap &FSMap::operator=(const FSMap &rhs)
73{
74 epoch = rhs.epoch;
75 next_filesystem_id = rhs.next_filesystem_id;
76 legacy_client_fscid = rhs.legacy_client_fscid;
77 compat = rhs.compat;
78 enable_multiple = rhs.enable_multiple;
79 mds_roles = rhs.mds_roles;
80 standby_daemons = rhs.standby_daemons;
81 standby_epochs = rhs.standby_epochs;
82
83 filesystems.clear();
84 for (const auto &i : rhs.filesystems) {
85 const auto &fs = i.second;
86 filesystems[fs->fscid] = std::make_shared<Filesystem>(*fs);
87 }
88
89 return *this;
90}
91
92void FSMap::generate_test_instances(std::list<FSMap*>& ls)
7c673cae
FG
93{
94 FSMap *m = new FSMap();
95
96 std::list<MDSMap*> mds_map_instances;
97 MDSMap::generate_test_instances(mds_map_instances);
98
99 int k = 20;
100 for (auto i : mds_map_instances) {
11fdf7f2 101 auto fs = Filesystem::create();
7c673cae
FG
102 fs->fscid = k++;
103 fs->mds_map = *i;
104 delete i;
105 m->filesystems[fs->fscid] = fs;
106 }
107 mds_map_instances.clear();
108
109 ls.push_back(m);
110}
111
112void FSMap::print(ostream& out) const
113{
114 out << "e" << epoch << std::endl;
115 out << "enable_multiple, ever_enabled_multiple: " << enable_multiple << ","
116 << ever_enabled_multiple << std::endl;
117 out << "compat: " << compat << std::endl;
118 out << "legacy client fscid: " << legacy_client_fscid << std::endl;
119 out << " " << std::endl;
120
121 if (filesystems.empty()) {
122 out << "No filesystems configured" << std::endl;
7c673cae
FG
123 }
124
11fdf7f2
TL
125 for (const auto& p : filesystems) {
126 p.second->print(out);
7c673cae
FG
127 out << " " << std::endl << " " << std::endl; // Space out a bit
128 }
129
130 if (!standby_daemons.empty()) {
131 out << "Standby daemons:" << std::endl << " " << std::endl;
132 }
133
9f95a23c
TL
134 for (const auto& p : standby_daemons) {
135 out << p.second << std::endl;
7c673cae
FG
136 }
137}
138
7c673cae
FG
139void FSMap::print_summary(Formatter *f, ostream *out) const
140{
7c673cae
FG
141 if (f) {
142 f->dump_unsigned("epoch", get_epoch());
11fdf7f2
TL
143 for (const auto &p : filesystems) {
144 auto& fs = p.second;
7c673cae
FG
145 f->dump_unsigned("id", fs->fscid);
146 f->dump_unsigned("up", fs->mds_map.up.size());
147 f->dump_unsigned("in", fs->mds_map.in.size());
148 f->dump_unsigned("max", fs->mds_map.max_mds);
149 }
150 } else {
11fdf7f2
TL
151 auto count = filesystems.size();
152 if (count <= 3) {
153 bool first = true;
154 for (const auto& p : filesystems) {
155 const auto& fs = p.second;
156 if (!first) {
157 *out << " ";
158 }
159 if (fs->mds_map.is_degraded()) {
160 *out << fs->mds_map.fs_name << ":" << fs->mds_map.up.size() << "/" << fs->mds_map.in.size();
161 } else {
162 *out << fs->mds_map.fs_name << ":" << fs->mds_map.in.size();
163 }
164 first = false;
165 }
166 } else {
167 *out << count << " fs";
168 unsigned degraded = 0;
169 CachedStackStringStream css;
170 *css << " (degraded: ";
171 for (const auto& p : filesystems) {
172 const auto& fs = p.second;
173 if (fs->mds_map.is_degraded()) {
174 degraded++;
175 if (degraded <= 3) {
176 *css << fs->mds_map.fs_name << ":" << fs->mds_map.up.size() << "/" << fs->mds_map.in.size();
177 }
178 }
179 }
180 if (degraded > 0) {
181 if (degraded <= 3) {
182 *css << ")";
183 *out << css->strv();
184 } else {
185 *out << " (degraded: " << degraded << " fs)";
186 }
187 }
7c673cae
FG
188 }
189 }
190
191 if (f) {
192 f->open_array_section("by_rank");
193 }
194
11fdf7f2
TL
195 std::map<MDSMap::DaemonState,unsigned> by_state;
196 std::map<mds_role_t, std::pair<MDSMap::DaemonState, std::string>> by_rank;
197 by_state[MDSMap::DaemonState::STATE_STANDBY] = standby_daemons.size();
198 for (const auto& [gid, fscid] : mds_roles) {
199 if (fscid == FS_CLUSTER_ID_NONE)
200 continue;
201
202 const auto& info = filesystems.at(fscid)->mds_map.get_info_gid(gid);
203 auto s = std::string(ceph_mds_state_name(info.state));
7c673cae
FG
204 if (info.laggy()) {
205 s += "(laggy or crashed)";
206 }
207
11fdf7f2
TL
208 if (f) {
209 f->open_object_section("mds");
210 f->dump_unsigned("filesystem_id", fscid);
211 f->dump_unsigned("rank", info.rank);
212 f->dump_string("name", info.name);
213 f->dump_string("status", s);
214 f->dump_unsigned("gid", gid);
215 f->close_section();
216 } else if (info.state != MDSMap::DaemonState::STATE_STANDBY_REPLAY) {
217 by_rank[mds_role_t(fscid, info.rank)] = std::make_pair(info.state, info.name + "=" + s);
7c673cae 218 }
11fdf7f2 219 by_state[info.state]++;
7c673cae
FG
220 }
221
222 if (f) {
223 f->close_section();
224 } else {
11fdf7f2 225 if (0 < by_rank.size() && by_rank.size() < 5) {
7c673cae
FG
226 if (filesystems.size() > 1) {
227 // Disambiguate filesystems
228 std::map<std::string, std::string> pretty;
11fdf7f2
TL
229 for (const auto& [role,status] : by_rank) {
230 const auto &fs_name = filesystems.at(role.fscid)->mds_map.fs_name;
231 CachedStackStringStream css;
232 *css << fs_name << ":" << role.rank;
233 pretty.emplace(std::piecewise_construct, std::forward_as_tuple(css->strv()), std::forward_as_tuple(status.second));
234 --by_state[status.first]; /* already printed! */
7c673cae
FG
235 }
236 *out << " " << pretty;
237 } else {
238 // Omit FSCID in output when only one filesystem exists
239 std::map<mds_rank_t, std::string> shortened;
11fdf7f2
TL
240 for (const auto& [role,status] : by_rank) {
241 shortened[role.rank] = status.second;
242 --by_state[status.first]; /* already printed! */
7c673cae
FG
243 }
244 *out << " " << shortened;
245 }
246 }
11fdf7f2
TL
247 for (const auto& [state, count] : by_state) {
248 if (count > 0) {
249 auto s = std::string_view(ceph_mds_state_name(state));
250 *out << " " << count << " " << s;
251 }
252 }
7c673cae
FG
253 }
254
11fdf7f2
TL
255 if (f) {
256 const auto state = MDSMap::DaemonState::STATE_STANDBY;
257 auto&& name = ceph_mds_state_name(state);
258 auto count = standby_daemons.size();
259 f->dump_unsigned(name, count);
7c673cae
FG
260 }
261
262 size_t failed = 0;
263 size_t damaged = 0;
11fdf7f2
TL
264 for (const auto& p : filesystems) {
265 auto& fs = p.second;
7c673cae
FG
266 failed += fs->mds_map.failed.size();
267 damaged += fs->mds_map.damaged.size();
268 }
269
270 if (failed > 0) {
271 if (f) {
272 f->dump_unsigned("failed", failed);
273 } else {
274 *out << ", " << failed << " failed";
275 }
276 }
277
278 if (damaged > 0) {
279 if (f) {
280 f->dump_unsigned("damaged", damaged);
281 } else {
282 *out << ", " << damaged << " damaged";
283 }
284 }
285 //if (stopped.size())
286 //out << ", " << stopped.size() << " stopped";
287}
288
9f95a23c
TL
289mds_gid_t Filesystem::get_standby_replay(mds_gid_t who) const
290{
291 for (const auto &i : mds_map.mds_info) {
292 const auto &info = i.second;
293 if (info.state == MDSMap::STATE_STANDBY_REPLAY
294 && info.rank == mds_map.mds_info.at(who).rank) {
295 return info.global_id;
296 }
297 }
298 return MDS_GID_NONE;
299}
7c673cae 300
11fdf7f2
TL
301Filesystem::ref FSMap::create_filesystem(std::string_view name,
302 int64_t metadata_pool, int64_t data_pool, uint64_t features)
7c673cae 303{
11fdf7f2 304 auto fs = Filesystem::create();
28e407b8 305 fs->mds_map.epoch = epoch;
11fdf7f2 306 fs->mds_map.fs_name = name;
31f18b77 307 fs->mds_map.data_pools.push_back(data_pool);
7c673cae
FG
308 fs->mds_map.metadata_pool = metadata_pool;
309 fs->mds_map.cas_pool = -1;
7c673cae
FG
310 fs->mds_map.compat = compat;
311 fs->mds_map.created = ceph_clock_now();
312 fs->mds_map.modified = ceph_clock_now();
7c673cae 313 fs->mds_map.enabled = true;
9f95a23c
TL
314 fs->fscid = next_filesystem_id++;
315 // ANONYMOUS is only for upgrades from legacy mdsmaps, we should
316 // have initialized next_filesystem_id such that it's never used here.
317 ceph_assert(fs->fscid != FS_CLUSTER_ID_ANONYMOUS);
7c673cae
FG
318 filesystems[fs->fscid] = fs;
319
320 // Created first filesystem? Set it as the one
321 // for legacy clients to use
322 if (filesystems.size() == 1) {
323 legacy_client_fscid = fs->fscid;
324 }
11fdf7f2
TL
325
326 return fs;
7c673cae
FG
327}
328
9f95a23c
TL
329Filesystem::const_ref FSMap::get_filesystem(std::string_view name) const
330{
331 for (const auto& p : filesystems) {
332 if (p.second->mds_map.fs_name == name) {
333 return p.second;
334 }
335 }
336 return nullptr;
337}
338
339std::vector<Filesystem::const_ref> FSMap::get_filesystems(void) const
340{
341 std::vector<Filesystem::const_ref> ret;
342 for (const auto& p : filesystems) {
343 ret.push_back(p.second);
344 }
345 return ret;
346}
347
7c673cae
FG
348void FSMap::reset_filesystem(fs_cluster_id_t fscid)
349{
350 auto fs = get_filesystem(fscid);
11fdf7f2 351 auto new_fs = Filesystem::create();
7c673cae
FG
352
353 // Populate rank 0 as existing (so don't go into CREATING)
354 // but failed (so that next available MDS is assigned the rank)
355 new_fs->mds_map.in.insert(mds_rank_t(0));
356 new_fs->mds_map.failed.insert(mds_rank_t(0));
357
358 // Carry forward what makes sense
359 new_fs->fscid = fs->fscid;
360 new_fs->mds_map.inline_data_enabled = fs->mds_map.inline_data_enabled;
7c673cae
FG
361 new_fs->mds_map.data_pools = fs->mds_map.data_pools;
362 new_fs->mds_map.metadata_pool = fs->mds_map.metadata_pool;
363 new_fs->mds_map.cas_pool = fs->mds_map.cas_pool;
364 new_fs->mds_map.fs_name = fs->mds_map.fs_name;
7c673cae
FG
365 new_fs->mds_map.compat = compat;
366 new_fs->mds_map.created = ceph_clock_now();
367 new_fs->mds_map.modified = ceph_clock_now();
7c673cae
FG
368 new_fs->mds_map.standby_count_wanted = fs->mds_map.standby_count_wanted;
369 new_fs->mds_map.enabled = true;
370
c07f9fc5
FG
371 // Remember mds ranks that have ever started. (They should load old inotable
372 // instead of creating new one if they start again.)
373 new_fs->mds_map.stopped.insert(fs->mds_map.in.begin(), fs->mds_map.in.end());
374 new_fs->mds_map.stopped.insert(fs->mds_map.stopped.begin(), fs->mds_map.stopped.end());
375 new_fs->mds_map.stopped.erase(mds_rank_t(0));
376
7c673cae
FG
377 // Persist the new FSMap
378 filesystems[new_fs->fscid] = new_fs;
379}
380
381void FSMap::get_health(list<pair<health_status_t,string> >& summary,
382 list<pair<health_status_t,string> > *detail) const
383{
384 mds_rank_t standby_count_wanted = 0;
385 for (const auto &i : filesystems) {
386 const auto &fs = i.second;
387
388 // TODO: move get_health up into here so that we can qualify
389 // all the messages with what filesystem they're talking about
390 fs->mds_map.get_health(summary, detail);
391
392 standby_count_wanted = std::max(standby_count_wanted, fs->mds_map.get_standby_count_wanted((mds_rank_t)standby_daemons.size()));
393 }
394
395 if (standby_count_wanted) {
396 std::ostringstream oss;
397 oss << "insufficient standby daemons available: have " << standby_daemons.size() << "; want " << standby_count_wanted << " more";
398 summary.push_back(make_pair(HEALTH_WARN, oss.str()));
399 }
400}
401
402bool FSMap::check_health(void)
403{
404 bool changed = false;
405 for (auto &i : filesystems) {
406 changed |= i.second->mds_map.check_health((mds_rank_t)standby_daemons.size());
407 }
408 return changed;
409}
410
224ce89b
WB
411void FSMap::get_health_checks(health_check_map_t *checks) const
412{
413 mds_rank_t standby_count_wanted = 0;
414 for (const auto &i : filesystems) {
415 const auto &fs = i.second;
416 health_check_map_t fschecks;
d2e6a577 417
224ce89b 418 fs->mds_map.get_health_checks(&fschecks);
d2e6a577
FG
419
420 // Some of the failed ranks might be transient (i.e. there are standbys
421 // ready to replace them). We will report only on "stuck" failed, i.e.
422 // ranks which are failed and have no standby replacement available.
423 std::set<mds_rank_t> stuck_failed;
424
425 for (const auto &rank : fs->mds_map.failed) {
9f95a23c
TL
426 auto rep_info = find_replacement_for({fs->fscid, rank});
427 if (!rep_info) {
d2e6a577
FG
428 stuck_failed.insert(rank);
429 }
430 }
431
432 // FS_WITH_FAILED_MDS
433 if (!stuck_failed.empty()) {
434 health_check_t& fscheck = checks->get_or_add(
435 "FS_WITH_FAILED_MDS", HEALTH_WARN,
9f95a23c 436 "%num% filesystem%plurals% %hasorhave% a failed mds daemon", 1);
d2e6a577
FG
437 ostringstream ss;
438 ss << "fs " << fs->mds_map.fs_name << " has " << stuck_failed.size()
439 << " failed mds" << (stuck_failed.size() > 1 ? "s" : "");
440 fscheck.detail.push_back(ss.str()); }
441
224ce89b
WB
442 checks->merge(fschecks);
443 standby_count_wanted = std::max(
444 standby_count_wanted,
445 fs->mds_map.get_standby_count_wanted((mds_rank_t)standby_daemons.size()));
446 }
447
448 // MDS_INSUFFICIENT_STANDBY
449 if (standby_count_wanted) {
450 std::ostringstream oss, dss;
d2e6a577 451 oss << "insufficient standby MDS daemons available";
9f95a23c 452 auto& d = checks->get_or_add("MDS_INSUFFICIENT_STANDBY", HEALTH_WARN, oss.str(), 1);
224ce89b
WB
453 dss << "have " << standby_daemons.size() << "; want " << standby_count_wanted
454 << " more";
455 d.detail.push_back(dss.str());
456 }
457}
458
9f95a23c 459void FSMap::update_compat(const CompatSet &c)
7c673cae 460{
9f95a23c
TL
461 // We could do something more complicated here to enable
462 // different filesystems to be served by different MDS versions,
463 // but this is a lot simpler because it doesn't require us to
464 // track the compat versions for standby daemons.
465 compat = c;
466 for (const auto &i : filesystems) {
467 MDSMap &mds_map = i.second->mds_map;
468 mds_map.compat = c;
469 mds_map.epoch = epoch;
470 }
471}
7c673cae 472
9f95a23c
TL
473void FSMap::encode(bufferlist& bl, uint64_t features) const
474{
ec96510d 475 ENCODE_START(STRUCT_VERSION, 6, bl);
9f95a23c
TL
476 encode(epoch, bl);
477 encode(next_filesystem_id, bl);
478 encode(legacy_client_fscid, bl);
479 encode(compat, bl);
480 encode(enable_multiple, bl);
481 {
482 std::vector<Filesystem::ref> v;
483 v.reserve(filesystems.size());
484 for (auto& p : filesystems) v.emplace_back(p.second);
485 encode(v, bl, features);
7c673cae 486 }
9f95a23c
TL
487 encode(mds_roles, bl);
488 encode(standby_daemons, bl, features);
489 encode(standby_epochs, bl);
490 encode(ever_enabled_multiple, bl);
491 ENCODE_FINISH(bl);
7c673cae
FG
492}
493
11fdf7f2 494void FSMap::decode(bufferlist::const_iterator& p)
7c673cae 495{
7c673cae
FG
496 // The highest MDSMap encoding version before we changed the
497 // MDSMonitor to store an FSMap instead of an MDSMap was
498 // 5, so anything older than 6 is decoded as an MDSMap,
499 // and anything newer is decoded as an FSMap.
ec96510d
FG
500 DECODE_START_LEGACY_COMPAT_LEN_16(STRUCT_VERSION, 4, 4, p);
501 struct_version = struct_v;
7c673cae 502 if (struct_v < 6) {
3efd9988
FG
503 // Because the mon used to store an MDSMap where we now
504 // store an FSMap, FSMap knows how to decode the legacy
505 // MDSMap format (it never needs to encode it though).
506 MDSMap legacy_mds_map;
507
7c673cae 508 // Decoding an MDSMap (upgrade)
11fdf7f2
TL
509 decode(epoch, p);
510 decode(legacy_mds_map.flags, p);
511 decode(legacy_mds_map.last_failure, p);
512 decode(legacy_mds_map.root, p);
513 decode(legacy_mds_map.session_timeout, p);
514 decode(legacy_mds_map.session_autoclose, p);
515 decode(legacy_mds_map.max_file_size, p);
516 decode(legacy_mds_map.max_mds, p);
517 decode(legacy_mds_map.mds_info, p);
7c673cae
FG
518 if (struct_v < 3) {
519 __u32 n;
11fdf7f2 520 decode(n, p);
7c673cae
FG
521 while (n--) {
522 __u32 m;
11fdf7f2 523 decode(m, p);
31f18b77 524 legacy_mds_map.data_pools.push_back(m);
7c673cae
FG
525 }
526 __s32 s;
11fdf7f2 527 decode(s, p);
7c673cae
FG
528 legacy_mds_map.cas_pool = s;
529 } else {
11fdf7f2
TL
530 decode(legacy_mds_map.data_pools, p);
531 decode(legacy_mds_map.cas_pool, p);
7c673cae
FG
532 }
533
534 // kclient ignores everything from here
535 __u16 ev = 1;
536 if (struct_v >= 2)
11fdf7f2 537 decode(ev, p);
7c673cae 538 if (ev >= 3)
11fdf7f2 539 decode(legacy_mds_map.compat, p);
7c673cae 540 else
1adf2230 541 legacy_mds_map.compat = MDSMap::get_compat_set_base();
7c673cae
FG
542 if (ev < 5) {
543 __u32 n;
11fdf7f2 544 decode(n, p);
7c673cae
FG
545 legacy_mds_map.metadata_pool = n;
546 } else {
11fdf7f2 547 decode(legacy_mds_map.metadata_pool, p);
7c673cae 548 }
11fdf7f2
TL
549 decode(legacy_mds_map.created, p);
550 decode(legacy_mds_map.modified, p);
551 decode(legacy_mds_map.tableserver, p);
552 decode(legacy_mds_map.in, p);
7c673cae 553 std::map<mds_rank_t,int32_t> inc; // Legacy field, parse and drop
11fdf7f2
TL
554 decode(inc, p);
555 decode(legacy_mds_map.up, p);
556 decode(legacy_mds_map.failed, p);
557 decode(legacy_mds_map.stopped, p);
7c673cae 558 if (ev >= 4)
11fdf7f2 559 decode(legacy_mds_map.last_failure_osd_epoch, p);
7c673cae
FG
560 if (ev >= 6) {
561 if (ev < 10) {
562 // previously this was a bool about snaps, not a flag map
563 bool flag;
11fdf7f2 564 decode(flag, p);
7c673cae
FG
565 legacy_mds_map.ever_allowed_features = flag ?
566 CEPH_MDSMAP_ALLOW_SNAPS : 0;
11fdf7f2 567 decode(flag, p);
7c673cae
FG
568 legacy_mds_map.explicitly_allowed_features = flag ?
569 CEPH_MDSMAP_ALLOW_SNAPS : 0;
7c673cae 570 } else {
11fdf7f2
TL
571 decode(legacy_mds_map.ever_allowed_features, p);
572 decode(legacy_mds_map.explicitly_allowed_features, p);
7c673cae
FG
573 }
574 } else {
11fdf7f2 575 legacy_mds_map.ever_allowed_features = 0;
7c673cae 576 legacy_mds_map.explicitly_allowed_features = 0;
7c673cae
FG
577 }
578 if (ev >= 7)
11fdf7f2 579 decode(legacy_mds_map.inline_data_enabled, p);
7c673cae
FG
580
581 if (ev >= 8) {
11fdf7f2
TL
582 ceph_assert(struct_v >= 5);
583 decode(legacy_mds_map.enabled, p);
584 decode(legacy_mds_map.fs_name, p);
7c673cae
FG
585 } else {
586 legacy_mds_map.fs_name = "default";
587 if (epoch > 1) {
588 // If an MDS has ever been started, epoch will be greater than 1,
589 // assume filesystem is enabled.
590 legacy_mds_map.enabled = true;
591 } else {
592 // Upgrading from a cluster that never used an MDS, switch off
593 // filesystem until it's explicitly enabled.
594 legacy_mds_map.enabled = false;
595 }
596 }
597
598 if (ev >= 9) {
11fdf7f2 599 decode(legacy_mds_map.damaged, p);
7c673cae
FG
600 }
601
602 // We're upgrading, populate filesystems from the legacy fields
603 filesystems.clear();
604 standby_daemons.clear();
605 standby_epochs.clear();
606 mds_roles.clear();
607 compat = legacy_mds_map.compat;
608 enable_multiple = false;
609
610 // Synthesise a Filesystem from legacy_mds_map, if enabled
611 if (legacy_mds_map.enabled) {
612 // Construct a Filesystem from the legacy MDSMap
11fdf7f2 613 auto migrate_fs = Filesystem::create();
7c673cae
FG
614 migrate_fs->fscid = FS_CLUSTER_ID_ANONYMOUS;
615 migrate_fs->mds_map = legacy_mds_map;
616 migrate_fs->mds_map.epoch = epoch;
617 filesystems[migrate_fs->fscid] = migrate_fs;
618
619 // List of GIDs that had invalid states
620 std::set<mds_gid_t> drop_gids;
621
622 // Construct mds_roles, standby_daemons, and remove
623 // standbys from the MDSMap in the Filesystem.
11fdf7f2
TL
624 for (const auto& [gid, info] : migrate_fs->mds_map.mds_info) {
625 if (info.state == MDSMap::STATE_STANDBY_REPLAY) {
626 /* drop any legacy standby-replay daemons */
627 drop_gids.insert(gid);
628 } else if (info.rank == MDS_RANK_NONE) {
629 if (info.state != MDSMap::STATE_STANDBY) {
7c673cae
FG
630 // Old MDSMaps can have down:dne here, which
631 // is invalid in an FSMap (#17837)
11fdf7f2 632 drop_gids.insert(gid);
7c673cae 633 } else {
11fdf7f2 634 insert(info); // into standby_daemons
7c673cae
FG
635 }
636 } else {
11fdf7f2 637 mds_roles[gid] = migrate_fs->fscid;
7c673cae
FG
638 }
639 }
640 for (const auto &p : standby_daemons) {
641 // Erase from this Filesystem's MDSMap, because it has
642 // been copied into FSMap::Standby_daemons above
643 migrate_fs->mds_map.mds_info.erase(p.first);
644 }
645 for (const auto &gid : drop_gids) {
646 // Throw away all info for this MDS because it was identified
647 // as having invalid state above.
648 migrate_fs->mds_map.mds_info.erase(gid);
649 }
650
651 legacy_client_fscid = migrate_fs->fscid;
652 } else {
653 legacy_client_fscid = FS_CLUSTER_ID_NONE;
654 }
655 } else {
11fdf7f2
TL
656 decode(epoch, p);
657 decode(next_filesystem_id, p);
658 decode(legacy_client_fscid, p);
659 decode(compat, p);
660 decode(enable_multiple, p);
661 {
662 std::vector<Filesystem::ref> v;
663 decode(v, p);
664 filesystems.clear();
665 for (auto& ref : v) {
666 auto em = filesystems.emplace(std::piecewise_construct, std::forward_as_tuple(ref->fscid), std::forward_as_tuple(std::move(ref)));
667 ceph_assert(em.second);
668 }
7c673cae 669 }
11fdf7f2
TL
670 decode(mds_roles, p);
671 decode(standby_daemons, p);
672 decode(standby_epochs, p);
7c673cae 673 if (struct_v >= 7) {
11fdf7f2 674 decode(ever_enabled_multiple, p);
7c673cae
FG
675 }
676 }
677
678 DECODE_FINISH(p);
679}
680
11fdf7f2 681void FSMap::sanitize(const std::function<bool(int64_t pool)>& pool_exists)
3efd9988
FG
682{
683 for (auto &fs : filesystems) {
684 fs.second->mds_map.sanitize(pool_exists);
685 }
686}
7c673cae
FG
687
688void Filesystem::encode(bufferlist& bl, uint64_t features) const
689{
690 ENCODE_START(1, 1, bl);
11fdf7f2 691 encode(fscid, bl);
7c673cae
FG
692 bufferlist mdsmap_bl;
693 mds_map.encode(mdsmap_bl, features);
11fdf7f2 694 encode(mdsmap_bl, bl);
7c673cae
FG
695 ENCODE_FINISH(bl);
696}
697
11fdf7f2 698void Filesystem::decode(bufferlist::const_iterator& p)
7c673cae
FG
699{
700 DECODE_START(1, p);
11fdf7f2 701 decode(fscid, p);
7c673cae 702 bufferlist mdsmap_bl;
11fdf7f2
TL
703 decode(mdsmap_bl, p);
704 auto mdsmap_bl_iter = mdsmap_bl.cbegin();
7c673cae
FG
705 mds_map.decode(mdsmap_bl_iter);
706 DECODE_FINISH(p);
707}
708
709int FSMap::parse_filesystem(
11fdf7f2
TL
710 std::string_view ns_str,
711 Filesystem::const_ref* result
7c673cae
FG
712 ) const
713{
714 std::string ns_err;
94b18763
FG
715 std::string s(ns_str);
716 fs_cluster_id_t fscid = strict_strtol(s.c_str(), 10, &ns_err);
7c673cae
FG
717 if (!ns_err.empty() || filesystems.count(fscid) == 0) {
718 for (auto &fs : filesystems) {
94b18763 719 if (fs.second->mds_map.fs_name == s) {
7c673cae
FG
720 *result = std::const_pointer_cast<const Filesystem>(fs.second);
721 return 0;
722 }
723 }
724 return -ENOENT;
725 } else {
726 *result = get_filesystem(fscid);
727 return 0;
728 }
729}
730
731void Filesystem::print(std::ostream &out) const
732{
733 out << "Filesystem '" << mds_map.fs_name
734 << "' (" << fscid << ")" << std::endl;
735 mds_map.print(out);
736}
737
9f95a23c 738bool FSMap::is_any_degraded() const
7c673cae 739{
9f95a23c
TL
740 for (auto& i : filesystems) {
741 if (i.second->mds_map.is_degraded()) {
742 return true;
743 }
744 }
745 return false;
746}
747
748std::map<mds_gid_t, MDSMap::mds_info_t> FSMap::get_mds_info() const
749{
750 std::map<mds_gid_t, mds_info_t> result;
751 for (const auto &i : standby_daemons) {
752 result[i.first] = i.second;
753 }
754
755 for (const auto &i : filesystems) {
756 const auto &fs_info = i.second->mds_map.get_mds_info();
757 for (const auto &j : fs_info) {
758 result[j.first] = j.second;
759 }
760 }
761
762 return result;
763}
764
765const MDSMap::mds_info_t* FSMap::get_available_standby(fs_cluster_id_t fscid) const
766{
767 const mds_info_t* who = nullptr;
11fdf7f2
TL
768 for (const auto& [gid, info] : standby_daemons) {
769 ceph_assert(info.rank == MDS_RANK_NONE);
770 ceph_assert(info.state == MDSMap::STATE_STANDBY);
7c673cae 771
11fdf7f2 772 if (info.laggy() || info.is_frozen()) {
7c673cae
FG
773 continue;
774 }
775
9f95a23c
TL
776 if (info.join_fscid == fscid) {
777 who = &info;
778 break;
779 } else if (info.join_fscid == FS_CLUSTER_ID_NONE) {
780 who = &info; /* vanilla standby */
781 } else if (who == nullptr) {
782 who = &info; /* standby for another fs, last resort */
783 }
784 }
785 return who;
786}
787
788mds_gid_t FSMap::find_mds_gid_by_name(std::string_view s) const
789{
790 const auto info = get_mds_info();
791 for (const auto &p : info) {
792 if (p.second.name == s) {
793 return p.first;
794 }
7c673cae 795 }
11fdf7f2 796 return MDS_GID_NONE;
7c673cae
FG
797}
798
9f95a23c
TL
799const MDSMap::mds_info_t* FSMap::find_by_name(std::string_view name) const
800{
801 std::map<mds_gid_t, mds_info_t> result;
802 for (const auto &i : standby_daemons) {
803 if (i.second.name == name) {
804 return &(i.second);
805 }
806 }
807
808 for (const auto &i : filesystems) {
809 const auto &fs_info = i.second->mds_map.get_mds_info();
810 for (const auto &j : fs_info) {
811 if (j.second.name == name) {
812 return &(j.second);
813 }
814 }
815 }
816
817 return nullptr;
818}
819
820const MDSMap::mds_info_t* FSMap::find_replacement_for(mds_role_t role) const
11fdf7f2
TL
821{
822 auto&& fs = get_filesystem(role.fscid);
7c673cae 823
11fdf7f2
TL
824 // First see if we have a STANDBY_REPLAY
825 for (const auto& [gid, info] : fs->mds_map.mds_info) {
826 if (info.rank == role.rank && info.state == MDSMap::STATE_STANDBY_REPLAY) {
827 if (info.is_frozen()) {
828 /* the standby-replay is frozen, do nothing! */
9f95a23c 829 return nullptr;
11fdf7f2 830 } else {
9f95a23c 831 return &info;
11fdf7f2 832 }
7c673cae
FG
833 }
834 }
7c673cae 835
9f95a23c 836 return get_available_standby(role.fscid);
7c673cae
FG
837}
838
839void FSMap::sanity() const
840{
841 if (legacy_client_fscid != FS_CLUSTER_ID_NONE) {
11fdf7f2 842 ceph_assert(filesystems.count(legacy_client_fscid) == 1);
7c673cae
FG
843 }
844
845 for (const auto &i : filesystems) {
846 auto fs = i.second;
11fdf7f2
TL
847 ceph_assert(fs->mds_map.compat.compare(compat) == 0);
848 ceph_assert(fs->fscid == i.first);
7c673cae 849 for (const auto &j : fs->mds_map.mds_info) {
11fdf7f2
TL
850 ceph_assert(j.second.rank != MDS_RANK_NONE);
851 ceph_assert(mds_roles.count(j.first) == 1);
852 ceph_assert(standby_daemons.count(j.first) == 0);
853 ceph_assert(standby_epochs.count(j.first) == 0);
854 ceph_assert(mds_roles.at(j.first) == i.first);
7c673cae 855 if (j.second.state != MDSMap::STATE_STANDBY_REPLAY) {
11fdf7f2
TL
856 ceph_assert(fs->mds_map.up.at(j.second.rank) == j.first);
857 ceph_assert(fs->mds_map.failed.count(j.second.rank) == 0);
858 ceph_assert(fs->mds_map.damaged.count(j.second.rank) == 0);
7c673cae
FG
859 }
860 }
861
862 for (const auto &j : fs->mds_map.up) {
863 mds_rank_t rank = j.first;
11fdf7f2 864 ceph_assert(fs->mds_map.in.count(rank) == 1);
7c673cae 865 mds_gid_t gid = j.second;
11fdf7f2 866 ceph_assert(fs->mds_map.mds_info.count(gid) == 1);
7c673cae
FG
867 }
868 }
869
870 for (const auto &i : standby_daemons) {
11fdf7f2
TL
871 ceph_assert(i.second.state == MDSMap::STATE_STANDBY);
872 ceph_assert(i.second.rank == MDS_RANK_NONE);
873 ceph_assert(i.second.global_id == i.first);
874 ceph_assert(standby_epochs.count(i.first) == 1);
875 ceph_assert(mds_roles.count(i.first) == 1);
876 ceph_assert(mds_roles.at(i.first) == FS_CLUSTER_ID_NONE);
7c673cae
FG
877 }
878
879 for (const auto &i : standby_epochs) {
11fdf7f2 880 ceph_assert(standby_daemons.count(i.first) == 1);
7c673cae
FG
881 }
882
883 for (const auto &i : mds_roles) {
884 if (i.second == FS_CLUSTER_ID_NONE) {
11fdf7f2 885 ceph_assert(standby_daemons.count(i.first) == 1);
7c673cae 886 } else {
11fdf7f2
TL
887 ceph_assert(filesystems.count(i.second) == 1);
888 ceph_assert(filesystems.at(i.second)->mds_map.mds_info.count(i.first) == 1);
7c673cae
FG
889 }
890 }
891}
892
893void FSMap::promote(
894 mds_gid_t standby_gid,
11fdf7f2 895 Filesystem& filesystem,
7c673cae
FG
896 mds_rank_t assigned_rank)
897{
11fdf7f2 898 ceph_assert(gid_exists(standby_gid));
7c673cae
FG
899 bool is_standby_replay = mds_roles.at(standby_gid) != FS_CLUSTER_ID_NONE;
900 if (!is_standby_replay) {
11fdf7f2
TL
901 ceph_assert(standby_daemons.count(standby_gid));
902 ceph_assert(standby_daemons.at(standby_gid).state == MDSMap::STATE_STANDBY);
7c673cae
FG
903 }
904
11fdf7f2 905 MDSMap &mds_map = filesystem.mds_map;
7c673cae
FG
906
907 // Insert daemon state to Filesystem
908 if (!is_standby_replay) {
909 mds_map.mds_info[standby_gid] = standby_daemons.at(standby_gid);
910 } else {
11fdf7f2
TL
911 ceph_assert(mds_map.mds_info.count(standby_gid));
912 ceph_assert(mds_map.mds_info.at(standby_gid).state == MDSMap::STATE_STANDBY_REPLAY);
913 ceph_assert(mds_map.mds_info.at(standby_gid).rank == assigned_rank);
7c673cae 914 }
9f95a23c 915 auto& info = mds_map.mds_info[standby_gid];
7c673cae
FG
916
917 if (mds_map.stopped.erase(assigned_rank)) {
918 // The cluster is being expanded with a stopped rank
919 info.state = MDSMap::STATE_STARTING;
920 } else if (!mds_map.is_in(assigned_rank)) {
921 // The cluster is being expanded with a new rank
922 info.state = MDSMap::STATE_CREATING;
923 } else {
924 // An existing rank is being assigned to a replacement
925 info.state = MDSMap::STATE_REPLAY;
926 mds_map.failed.erase(assigned_rank);
927 }
928 info.rank = assigned_rank;
929 info.inc = epoch;
11fdf7f2 930 mds_roles[standby_gid] = filesystem.fscid;
7c673cae
FG
931
932 // Update the rank state in Filesystem
933 mds_map.in.insert(assigned_rank);
934 mds_map.up[assigned_rank] = standby_gid;
935
936 // Remove from the list of standbys
937 if (!is_standby_replay) {
938 standby_daemons.erase(standby_gid);
939 standby_epochs.erase(standby_gid);
940 }
941
942 // Indicate that Filesystem has been modified
943 mds_map.epoch = epoch;
944}
945
946void FSMap::assign_standby_replay(
947 const mds_gid_t standby_gid,
948 const fs_cluster_id_t leader_ns,
949 const mds_rank_t leader_rank)
950{
11fdf7f2
TL
951 ceph_assert(mds_roles.at(standby_gid) == FS_CLUSTER_ID_NONE);
952 ceph_assert(gid_exists(standby_gid));
953 ceph_assert(!gid_has_rank(standby_gid));
954 ceph_assert(standby_daemons.count(standby_gid));
7c673cae
FG
955
956 // Insert to the filesystem
957 auto fs = filesystems.at(leader_ns);
958 fs->mds_map.mds_info[standby_gid] = standby_daemons.at(standby_gid);
959 fs->mds_map.mds_info[standby_gid].rank = leader_rank;
960 fs->mds_map.mds_info[standby_gid].state = MDSMap::STATE_STANDBY_REPLAY;
961 mds_roles[standby_gid] = leader_ns;
962
963 // Remove from the list of standbys
964 standby_daemons.erase(standby_gid);
965 standby_epochs.erase(standby_gid);
966
967 // Indicate that Filesystem has been modified
968 fs->mds_map.epoch = epoch;
969}
970
971void FSMap::erase(mds_gid_t who, epoch_t blacklist_epoch)
972{
973 if (mds_roles.at(who) == FS_CLUSTER_ID_NONE) {
974 standby_daemons.erase(who);
975 standby_epochs.erase(who);
976 } else {
977 auto &fs = filesystems.at(mds_roles.at(who));
978 const auto &info = fs->mds_map.mds_info.at(who);
979 if (info.state != MDSMap::STATE_STANDBY_REPLAY) {
980 if (info.state == MDSMap::STATE_CREATING) {
981 // If this gid didn't make it past CREATING, then forget
982 // the rank ever existed so that next time it's handed out
983 // to a gid it'll go back into CREATING.
984 fs->mds_map.in.erase(info.rank);
985 } else {
986 // Put this rank into the failed list so that the next available
987 // STANDBY will pick it up.
988 fs->mds_map.failed.insert(info.rank);
989 }
11fdf7f2 990 ceph_assert(fs->mds_map.up.at(info.rank) == info.global_id);
7c673cae
FG
991 fs->mds_map.up.erase(info.rank);
992 }
993 fs->mds_map.mds_info.erase(who);
994 fs->mds_map.last_failure_osd_epoch = blacklist_epoch;
995 fs->mds_map.epoch = epoch;
996 }
997
998 mds_roles.erase(who);
999}
1000
1001void FSMap::damaged(mds_gid_t who, epoch_t blacklist_epoch)
1002{
11fdf7f2 1003 ceph_assert(mds_roles.at(who) != FS_CLUSTER_ID_NONE);
7c673cae
FG
1004 auto fs = filesystems.at(mds_roles.at(who));
1005 mds_rank_t rank = fs->mds_map.mds_info[who].rank;
1006
1007 erase(who, blacklist_epoch);
1008 fs->mds_map.failed.erase(rank);
1009 fs->mds_map.damaged.insert(rank);
1010
11fdf7f2 1011 ceph_assert(fs->mds_map.epoch == epoch);
7c673cae
FG
1012}
1013
1014/**
1015 * Update to indicate that the rank `rank` is to be removed
1016 * from the damaged list of the filesystem `fscid`
1017 */
1018bool FSMap::undamaged(const fs_cluster_id_t fscid, const mds_rank_t rank)
1019{
1020 auto fs = filesystems.at(fscid);
1021
1022 if (fs->mds_map.damaged.erase(rank)) {
1023 fs->mds_map.failed.insert(rank);
1024 fs->mds_map.epoch = epoch;
1025 return true;
1026 } else {
1027 return false;
1028 }
1029}
1030
1031void FSMap::insert(const MDSMap::mds_info_t &new_info)
1032{
11fdf7f2
TL
1033 ceph_assert(new_info.state == MDSMap::STATE_STANDBY);
1034 ceph_assert(new_info.rank == MDS_RANK_NONE);
7c673cae
FG
1035 mds_roles[new_info.global_id] = FS_CLUSTER_ID_NONE;
1036 standby_daemons[new_info.global_id] = new_info;
1037 standby_epochs[new_info.global_id] = epoch;
1038}
1039
9f95a23c 1040std::vector<mds_gid_t> FSMap::stop(mds_gid_t who)
7c673cae 1041{
11fdf7f2 1042 ceph_assert(mds_roles.at(who) != FS_CLUSTER_ID_NONE);
7c673cae
FG
1043 auto fs = filesystems.at(mds_roles.at(who));
1044 const auto &info = fs->mds_map.mds_info.at(who);
1045 fs->mds_map.up.erase(info.rank);
1046 fs->mds_map.in.erase(info.rank);
1047 fs->mds_map.stopped.insert(info.rank);
1048
1049 // Also drop any standby replays that were following this rank
9f95a23c 1050 std::vector<mds_gid_t> standbys;
7c673cae
FG
1051 for (const auto &i : fs->mds_map.mds_info) {
1052 const auto &other_gid = i.first;
1053 const auto &other_info = i.second;
1054 if (other_info.rank == info.rank
1055 && other_info.state == MDSMap::STATE_STANDBY_REPLAY) {
1056 standbys.push_back(other_gid);
1057 erase(other_gid, 0);
1058 }
1059 }
1060
1061 fs->mds_map.mds_info.erase(who);
1062 mds_roles.erase(who);
1063
1064 fs->mds_map.epoch = epoch;
1065
1066 return standbys;
1067}
1068
1069
1070/**
1071 * Given one of the following forms:
1072 * <fs name>:<rank>
1073 * <fs id>:<rank>
1074 * <rank>
1075 *
1076 * Parse into a mds_role_t. The rank-only form is only valid
1077 * if legacy_client_ns is set.
1078 */
1079int FSMap::parse_role(
11fdf7f2 1080 std::string_view role_str,
7c673cae
FG
1081 mds_role_t *role,
1082 std::ostream &ss) const
1083{
1084 size_t colon_pos = role_str.find(":");
1085 size_t rank_pos;
11fdf7f2 1086 Filesystem::const_ref fs;
7c673cae
FG
1087 if (colon_pos == std::string::npos) {
1088 if (legacy_client_fscid == FS_CLUSTER_ID_NONE) {
1089 ss << "No filesystem selected";
1090 return -ENOENT;
1091 }
1092 fs = get_filesystem(legacy_client_fscid);
1093 rank_pos = 0;
1094 } else {
1095 if (parse_filesystem(role_str.substr(0, colon_pos), &fs) < 0) {
1096 ss << "Invalid filesystem";
1097 return -ENOENT;
1098 }
1099 rank_pos = colon_pos+1;
1100 }
1101
1102 mds_rank_t rank;
1103 std::string err;
94b18763 1104 std::string rank_str(role_str.substr(rank_pos));
7c673cae
FG
1105 long rank_i = strict_strtol(rank_str.c_str(), 10, &err);
1106 if (rank_i < 0 || !err.empty()) {
1107 ss << "Invalid rank '" << rank_str << "'";
1108 return -EINVAL;
1109 } else {
1110 rank = rank_i;
1111 }
1112
1113 if (fs->mds_map.in.count(rank) == 0) {
1114 ss << "Rank '" << rank << "' not found";
1115 return -ENOENT;
1116 }
1117
1118 *role = {fs->fscid, rank};
1119
1120 return 0;
1121}
9f95a23c
TL
1122
1123bool FSMap::pool_in_use(int64_t poolid) const
1124{
1125 for (auto const &i : filesystems) {
1126 if (i.second->mds_map.is_data_pool(poolid)
1127 || i.second->mds_map.metadata_pool == poolid) {
1128 return true;
1129 }
1130 }
1131 return false;
1132}
1133
1134void FSMap::erase_filesystem(fs_cluster_id_t fscid)
1135{
1136 filesystems.erase(fscid);
1137 for (auto& [gid, info] : standby_daemons) {
1138 if (info.join_fscid == fscid) {
1139 modify_daemon(gid, [](auto& info) {
1140 info.join_fscid = FS_CLUSTER_ID_NONE;
1141 });
1142 }
1143 }
1144 for (auto& p : filesystems) {
1145 for (auto& [gid, info] : p.second->mds_map.get_mds_info()) {
1146 if (info.join_fscid == fscid) {
1147 modify_daemon(gid, [](auto& info) {
1148 info.join_fscid = FS_CLUSTER_ID_NONE;
1149 });
1150 }
1151 }
1152 }
1153}