]> git.proxmox.com Git - ceph.git/blame - ceph/src/mds/FSMap.cc
Import ceph 15.2.8
[ceph.git] / ceph / src / mds / FSMap.cc
CommitLineData
7c673cae
FG
1// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
2// vim: ts=8 sw=2 smarttab
3/*
4 * Ceph - scalable distributed file system
5 *
6 * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
7 *
8 * This is free software; you can redistribute it and/or
9 * modify it under the terms of the GNU Lesser General Public
10 * License version 2.1, as published by the Free Software
11 * Foundation. See file COPYING.
12 *
13 */
14
15
16#include "FSMap.h"
17
11fdf7f2 18#include "common/StackStringStream.h"
7c673cae 19
11fdf7f2
TL
20#include <sstream>
21#ifdef WITH_SEASTAR
22#include "crimson/common/config_proxy.h"
23#else
24#include "common/config_proxy.h"
25#endif
26#include "global/global_context.h"
224ce89b
WB
27#include "mon/health_check.h"
28
11fdf7f2 29using std::stringstream;
7c673cae
FG
30
31void Filesystem::dump(Formatter *f) const
32{
33 f->open_object_section("mdsmap");
34 mds_map.dump(f);
35 f->close_section();
36 f->dump_int("id", fscid);
37}
38
39void FSMap::dump(Formatter *f) const
40{
41 f->dump_int("epoch", epoch);
11fdf7f2
TL
42 // Use 'default' naming to match 'set-default' CLI
43 f->dump_int("default_fscid", legacy_client_fscid);
7c673cae
FG
44
45 f->open_object_section("compat");
46 compat.dump(f);
47 f->close_section();
48
49 f->open_object_section("feature_flags");
50 f->dump_bool("enable_multiple", enable_multiple);
51 f->dump_bool("ever_enabled_multiple", ever_enabled_multiple);
52 f->close_section();
53
54 f->open_array_section("standbys");
9f95a23c 55 for (const auto& [gid, info] : standby_daemons) {
7c673cae 56 f->open_object_section("info");
9f95a23c
TL
57 info.dump(f);
58 f->dump_int("epoch", standby_epochs.at(gid));
7c673cae
FG
59 f->close_section();
60 }
61 f->close_section();
62
63 f->open_array_section("filesystems");
64 for (const auto &fs : filesystems) {
65 f->open_object_section("filesystem");
66 fs.second->dump(f);
67 f->close_section();
68 }
69 f->close_section();
70}
71
9f95a23c
TL
72FSMap &FSMap::operator=(const FSMap &rhs)
73{
74 epoch = rhs.epoch;
75 next_filesystem_id = rhs.next_filesystem_id;
76 legacy_client_fscid = rhs.legacy_client_fscid;
77 compat = rhs.compat;
78 enable_multiple = rhs.enable_multiple;
79 mds_roles = rhs.mds_roles;
80 standby_daemons = rhs.standby_daemons;
81 standby_epochs = rhs.standby_epochs;
82
83 filesystems.clear();
84 for (const auto &i : rhs.filesystems) {
85 const auto &fs = i.second;
86 filesystems[fs->fscid] = std::make_shared<Filesystem>(*fs);
87 }
88
89 return *this;
90}
91
92void FSMap::generate_test_instances(std::list<FSMap*>& ls)
7c673cae
FG
93{
94 FSMap *m = new FSMap();
95
96 std::list<MDSMap*> mds_map_instances;
97 MDSMap::generate_test_instances(mds_map_instances);
98
99 int k = 20;
100 for (auto i : mds_map_instances) {
11fdf7f2 101 auto fs = Filesystem::create();
7c673cae
FG
102 fs->fscid = k++;
103 fs->mds_map = *i;
104 delete i;
105 m->filesystems[fs->fscid] = fs;
106 }
107 mds_map_instances.clear();
108
109 ls.push_back(m);
110}
111
112void FSMap::print(ostream& out) const
113{
114 out << "e" << epoch << std::endl;
115 out << "enable_multiple, ever_enabled_multiple: " << enable_multiple << ","
116 << ever_enabled_multiple << std::endl;
117 out << "compat: " << compat << std::endl;
118 out << "legacy client fscid: " << legacy_client_fscid << std::endl;
119 out << " " << std::endl;
120
121 if (filesystems.empty()) {
122 out << "No filesystems configured" << std::endl;
7c673cae
FG
123 }
124
11fdf7f2
TL
125 for (const auto& p : filesystems) {
126 p.second->print(out);
7c673cae
FG
127 out << " " << std::endl << " " << std::endl; // Space out a bit
128 }
129
130 if (!standby_daemons.empty()) {
131 out << "Standby daemons:" << std::endl << " " << std::endl;
132 }
133
9f95a23c
TL
134 for (const auto& p : standby_daemons) {
135 out << p.second << std::endl;
7c673cae
FG
136 }
137}
138
7c673cae
FG
139void FSMap::print_summary(Formatter *f, ostream *out) const
140{
7c673cae
FG
141 if (f) {
142 f->dump_unsigned("epoch", get_epoch());
11fdf7f2
TL
143 for (const auto &p : filesystems) {
144 auto& fs = p.second;
7c673cae
FG
145 f->dump_unsigned("id", fs->fscid);
146 f->dump_unsigned("up", fs->mds_map.up.size());
147 f->dump_unsigned("in", fs->mds_map.in.size());
148 f->dump_unsigned("max", fs->mds_map.max_mds);
149 }
150 } else {
11fdf7f2
TL
151 auto count = filesystems.size();
152 if (count <= 3) {
153 bool first = true;
154 for (const auto& p : filesystems) {
155 const auto& fs = p.second;
156 if (!first) {
157 *out << " ";
158 }
159 if (fs->mds_map.is_degraded()) {
160 *out << fs->mds_map.fs_name << ":" << fs->mds_map.up.size() << "/" << fs->mds_map.in.size();
161 } else {
162 *out << fs->mds_map.fs_name << ":" << fs->mds_map.in.size();
163 }
164 first = false;
165 }
166 } else {
167 *out << count << " fs";
168 unsigned degraded = 0;
169 CachedStackStringStream css;
170 *css << " (degraded: ";
171 for (const auto& p : filesystems) {
172 const auto& fs = p.second;
173 if (fs->mds_map.is_degraded()) {
174 degraded++;
175 if (degraded <= 3) {
176 *css << fs->mds_map.fs_name << ":" << fs->mds_map.up.size() << "/" << fs->mds_map.in.size();
177 }
178 }
179 }
180 if (degraded > 0) {
181 if (degraded <= 3) {
182 *css << ")";
183 *out << css->strv();
184 } else {
185 *out << " (degraded: " << degraded << " fs)";
186 }
187 }
7c673cae
FG
188 }
189 }
190
191 if (f) {
192 f->open_array_section("by_rank");
193 }
194
11fdf7f2
TL
195 std::map<MDSMap::DaemonState,unsigned> by_state;
196 std::map<mds_role_t, std::pair<MDSMap::DaemonState, std::string>> by_rank;
197 by_state[MDSMap::DaemonState::STATE_STANDBY] = standby_daemons.size();
198 for (const auto& [gid, fscid] : mds_roles) {
199 if (fscid == FS_CLUSTER_ID_NONE)
200 continue;
201
202 const auto& info = filesystems.at(fscid)->mds_map.get_info_gid(gid);
203 auto s = std::string(ceph_mds_state_name(info.state));
7c673cae
FG
204 if (info.laggy()) {
205 s += "(laggy or crashed)";
206 }
207
11fdf7f2
TL
208 if (f) {
209 f->open_object_section("mds");
210 f->dump_unsigned("filesystem_id", fscid);
211 f->dump_unsigned("rank", info.rank);
212 f->dump_string("name", info.name);
213 f->dump_string("status", s);
214 f->dump_unsigned("gid", gid);
215 f->close_section();
216 } else if (info.state != MDSMap::DaemonState::STATE_STANDBY_REPLAY) {
217 by_rank[mds_role_t(fscid, info.rank)] = std::make_pair(info.state, info.name + "=" + s);
7c673cae 218 }
11fdf7f2 219 by_state[info.state]++;
7c673cae
FG
220 }
221
222 if (f) {
223 f->close_section();
224 } else {
11fdf7f2 225 if (0 < by_rank.size() && by_rank.size() < 5) {
7c673cae
FG
226 if (filesystems.size() > 1) {
227 // Disambiguate filesystems
228 std::map<std::string, std::string> pretty;
11fdf7f2
TL
229 for (const auto& [role,status] : by_rank) {
230 const auto &fs_name = filesystems.at(role.fscid)->mds_map.fs_name;
231 CachedStackStringStream css;
232 *css << fs_name << ":" << role.rank;
233 pretty.emplace(std::piecewise_construct, std::forward_as_tuple(css->strv()), std::forward_as_tuple(status.second));
234 --by_state[status.first]; /* already printed! */
7c673cae
FG
235 }
236 *out << " " << pretty;
237 } else {
238 // Omit FSCID in output when only one filesystem exists
239 std::map<mds_rank_t, std::string> shortened;
11fdf7f2
TL
240 for (const auto& [role,status] : by_rank) {
241 shortened[role.rank] = status.second;
242 --by_state[status.first]; /* already printed! */
7c673cae
FG
243 }
244 *out << " " << shortened;
245 }
246 }
11fdf7f2
TL
247 for (const auto& [state, count] : by_state) {
248 if (count > 0) {
249 auto s = std::string_view(ceph_mds_state_name(state));
250 *out << " " << count << " " << s;
251 }
252 }
7c673cae
FG
253 }
254
11fdf7f2
TL
255 if (f) {
256 const auto state = MDSMap::DaemonState::STATE_STANDBY;
257 auto&& name = ceph_mds_state_name(state);
258 auto count = standby_daemons.size();
259 f->dump_unsigned(name, count);
7c673cae
FG
260 }
261
262 size_t failed = 0;
263 size_t damaged = 0;
11fdf7f2
TL
264 for (const auto& p : filesystems) {
265 auto& fs = p.second;
7c673cae
FG
266 failed += fs->mds_map.failed.size();
267 damaged += fs->mds_map.damaged.size();
268 }
269
270 if (failed > 0) {
271 if (f) {
272 f->dump_unsigned("failed", failed);
273 } else {
274 *out << ", " << failed << " failed";
275 }
276 }
277
278 if (damaged > 0) {
279 if (f) {
280 f->dump_unsigned("damaged", damaged);
281 } else {
282 *out << ", " << damaged << " damaged";
283 }
284 }
285 //if (stopped.size())
286 //out << ", " << stopped.size() << " stopped";
287}
288
9f95a23c
TL
289mds_gid_t Filesystem::get_standby_replay(mds_gid_t who) const
290{
291 for (const auto &i : mds_map.mds_info) {
292 const auto &info = i.second;
293 if (info.state == MDSMap::STATE_STANDBY_REPLAY
294 && info.rank == mds_map.mds_info.at(who).rank) {
295 return info.global_id;
296 }
297 }
298 return MDS_GID_NONE;
299}
7c673cae 300
11fdf7f2
TL
301Filesystem::ref FSMap::create_filesystem(std::string_view name,
302 int64_t metadata_pool, int64_t data_pool, uint64_t features)
7c673cae 303{
11fdf7f2 304 auto fs = Filesystem::create();
28e407b8 305 fs->mds_map.epoch = epoch;
11fdf7f2 306 fs->mds_map.fs_name = name;
31f18b77 307 fs->mds_map.data_pools.push_back(data_pool);
7c673cae
FG
308 fs->mds_map.metadata_pool = metadata_pool;
309 fs->mds_map.cas_pool = -1;
7c673cae
FG
310 fs->mds_map.compat = compat;
311 fs->mds_map.created = ceph_clock_now();
312 fs->mds_map.modified = ceph_clock_now();
7c673cae 313 fs->mds_map.enabled = true;
9f95a23c
TL
314 fs->fscid = next_filesystem_id++;
315 // ANONYMOUS is only for upgrades from legacy mdsmaps, we should
316 // have initialized next_filesystem_id such that it's never used here.
317 ceph_assert(fs->fscid != FS_CLUSTER_ID_ANONYMOUS);
7c673cae
FG
318 filesystems[fs->fscid] = fs;
319
320 // Created first filesystem? Set it as the one
321 // for legacy clients to use
322 if (filesystems.size() == 1) {
323 legacy_client_fscid = fs->fscid;
324 }
11fdf7f2
TL
325
326 return fs;
7c673cae
FG
327}
328
9f95a23c
TL
329Filesystem::const_ref FSMap::get_filesystem(std::string_view name) const
330{
331 for (const auto& p : filesystems) {
332 if (p.second->mds_map.fs_name == name) {
333 return p.second;
334 }
335 }
336 return nullptr;
337}
338
339std::vector<Filesystem::const_ref> FSMap::get_filesystems(void) const
340{
341 std::vector<Filesystem::const_ref> ret;
342 for (const auto& p : filesystems) {
343 ret.push_back(p.second);
344 }
345 return ret;
346}
347
7c673cae
FG
348void FSMap::reset_filesystem(fs_cluster_id_t fscid)
349{
350 auto fs = get_filesystem(fscid);
11fdf7f2 351 auto new_fs = Filesystem::create();
7c673cae
FG
352
353 // Populate rank 0 as existing (so don't go into CREATING)
354 // but failed (so that next available MDS is assigned the rank)
355 new_fs->mds_map.in.insert(mds_rank_t(0));
356 new_fs->mds_map.failed.insert(mds_rank_t(0));
357
358 // Carry forward what makes sense
359 new_fs->fscid = fs->fscid;
360 new_fs->mds_map.inline_data_enabled = fs->mds_map.inline_data_enabled;
7c673cae
FG
361 new_fs->mds_map.data_pools = fs->mds_map.data_pools;
362 new_fs->mds_map.metadata_pool = fs->mds_map.metadata_pool;
363 new_fs->mds_map.cas_pool = fs->mds_map.cas_pool;
364 new_fs->mds_map.fs_name = fs->mds_map.fs_name;
7c673cae
FG
365 new_fs->mds_map.compat = compat;
366 new_fs->mds_map.created = ceph_clock_now();
367 new_fs->mds_map.modified = ceph_clock_now();
7c673cae
FG
368 new_fs->mds_map.standby_count_wanted = fs->mds_map.standby_count_wanted;
369 new_fs->mds_map.enabled = true;
370
c07f9fc5
FG
371 // Remember mds ranks that have ever started. (They should load old inotable
372 // instead of creating new one if they start again.)
373 new_fs->mds_map.stopped.insert(fs->mds_map.in.begin(), fs->mds_map.in.end());
374 new_fs->mds_map.stopped.insert(fs->mds_map.stopped.begin(), fs->mds_map.stopped.end());
375 new_fs->mds_map.stopped.erase(mds_rank_t(0));
376
7c673cae
FG
377 // Persist the new FSMap
378 filesystems[new_fs->fscid] = new_fs;
379}
380
381void FSMap::get_health(list<pair<health_status_t,string> >& summary,
382 list<pair<health_status_t,string> > *detail) const
383{
384 mds_rank_t standby_count_wanted = 0;
385 for (const auto &i : filesystems) {
386 const auto &fs = i.second;
387
388 // TODO: move get_health up into here so that we can qualify
389 // all the messages with what filesystem they're talking about
390 fs->mds_map.get_health(summary, detail);
391
392 standby_count_wanted = std::max(standby_count_wanted, fs->mds_map.get_standby_count_wanted((mds_rank_t)standby_daemons.size()));
393 }
394
395 if (standby_count_wanted) {
396 std::ostringstream oss;
397 oss << "insufficient standby daemons available: have " << standby_daemons.size() << "; want " << standby_count_wanted << " more";
398 summary.push_back(make_pair(HEALTH_WARN, oss.str()));
399 }
400}
401
402bool FSMap::check_health(void)
403{
404 bool changed = false;
405 for (auto &i : filesystems) {
406 changed |= i.second->mds_map.check_health((mds_rank_t)standby_daemons.size());
407 }
408 return changed;
409}
410
224ce89b
WB
411void FSMap::get_health_checks(health_check_map_t *checks) const
412{
413 mds_rank_t standby_count_wanted = 0;
414 for (const auto &i : filesystems) {
415 const auto &fs = i.second;
416 health_check_map_t fschecks;
d2e6a577 417
224ce89b 418 fs->mds_map.get_health_checks(&fschecks);
d2e6a577
FG
419
420 // Some of the failed ranks might be transient (i.e. there are standbys
421 // ready to replace them). We will report only on "stuck" failed, i.e.
422 // ranks which are failed and have no standby replacement available.
423 std::set<mds_rank_t> stuck_failed;
424
425 for (const auto &rank : fs->mds_map.failed) {
9f95a23c
TL
426 auto rep_info = find_replacement_for({fs->fscid, rank});
427 if (!rep_info) {
d2e6a577
FG
428 stuck_failed.insert(rank);
429 }
430 }
431
432 // FS_WITH_FAILED_MDS
433 if (!stuck_failed.empty()) {
434 health_check_t& fscheck = checks->get_or_add(
435 "FS_WITH_FAILED_MDS", HEALTH_WARN,
9f95a23c 436 "%num% filesystem%plurals% %hasorhave% a failed mds daemon", 1);
d2e6a577
FG
437 ostringstream ss;
438 ss << "fs " << fs->mds_map.fs_name << " has " << stuck_failed.size()
439 << " failed mds" << (stuck_failed.size() > 1 ? "s" : "");
440 fscheck.detail.push_back(ss.str()); }
441
224ce89b
WB
442 checks->merge(fschecks);
443 standby_count_wanted = std::max(
444 standby_count_wanted,
445 fs->mds_map.get_standby_count_wanted((mds_rank_t)standby_daemons.size()));
446 }
447
448 // MDS_INSUFFICIENT_STANDBY
449 if (standby_count_wanted) {
450 std::ostringstream oss, dss;
d2e6a577 451 oss << "insufficient standby MDS daemons available";
9f95a23c 452 auto& d = checks->get_or_add("MDS_INSUFFICIENT_STANDBY", HEALTH_WARN, oss.str(), 1);
224ce89b
WB
453 dss << "have " << standby_daemons.size() << "; want " << standby_count_wanted
454 << " more";
455 d.detail.push_back(dss.str());
456 }
457}
458
9f95a23c 459void FSMap::update_compat(const CompatSet &c)
7c673cae 460{
9f95a23c
TL
461 // We could do something more complicated here to enable
462 // different filesystems to be served by different MDS versions,
463 // but this is a lot simpler because it doesn't require us to
464 // track the compat versions for standby daemons.
465 compat = c;
466 for (const auto &i : filesystems) {
467 MDSMap &mds_map = i.second->mds_map;
468 mds_map.compat = c;
469 mds_map.epoch = epoch;
470 }
471}
7c673cae 472
9f95a23c
TL
473void FSMap::encode(bufferlist& bl, uint64_t features) const
474{
475 ENCODE_START(7, 6, bl);
476 encode(epoch, bl);
477 encode(next_filesystem_id, bl);
478 encode(legacy_client_fscid, bl);
479 encode(compat, bl);
480 encode(enable_multiple, bl);
481 {
482 std::vector<Filesystem::ref> v;
483 v.reserve(filesystems.size());
484 for (auto& p : filesystems) v.emplace_back(p.second);
485 encode(v, bl, features);
7c673cae 486 }
9f95a23c
TL
487 encode(mds_roles, bl);
488 encode(standby_daemons, bl, features);
489 encode(standby_epochs, bl);
490 encode(ever_enabled_multiple, bl);
491 ENCODE_FINISH(bl);
7c673cae
FG
492}
493
11fdf7f2 494void FSMap::decode(bufferlist::const_iterator& p)
7c673cae 495{
7c673cae
FG
496 // The highest MDSMap encoding version before we changed the
497 // MDSMonitor to store an FSMap instead of an MDSMap was
498 // 5, so anything older than 6 is decoded as an MDSMap,
499 // and anything newer is decoded as an FSMap.
f91f0fd5 500 DECODE_START_LEGACY_COMPAT_LEN_16(7, 4, 4, p);
7c673cae 501 if (struct_v < 6) {
3efd9988
FG
502 // Because the mon used to store an MDSMap where we now
503 // store an FSMap, FSMap knows how to decode the legacy
504 // MDSMap format (it never needs to encode it though).
505 MDSMap legacy_mds_map;
506
7c673cae 507 // Decoding an MDSMap (upgrade)
11fdf7f2
TL
508 decode(epoch, p);
509 decode(legacy_mds_map.flags, p);
510 decode(legacy_mds_map.last_failure, p);
511 decode(legacy_mds_map.root, p);
512 decode(legacy_mds_map.session_timeout, p);
513 decode(legacy_mds_map.session_autoclose, p);
514 decode(legacy_mds_map.max_file_size, p);
515 decode(legacy_mds_map.max_mds, p);
516 decode(legacy_mds_map.mds_info, p);
7c673cae
FG
517 if (struct_v < 3) {
518 __u32 n;
11fdf7f2 519 decode(n, p);
7c673cae
FG
520 while (n--) {
521 __u32 m;
11fdf7f2 522 decode(m, p);
31f18b77 523 legacy_mds_map.data_pools.push_back(m);
7c673cae
FG
524 }
525 __s32 s;
11fdf7f2 526 decode(s, p);
7c673cae
FG
527 legacy_mds_map.cas_pool = s;
528 } else {
11fdf7f2
TL
529 decode(legacy_mds_map.data_pools, p);
530 decode(legacy_mds_map.cas_pool, p);
7c673cae
FG
531 }
532
533 // kclient ignores everything from here
534 __u16 ev = 1;
535 if (struct_v >= 2)
11fdf7f2 536 decode(ev, p);
7c673cae 537 if (ev >= 3)
11fdf7f2 538 decode(legacy_mds_map.compat, p);
7c673cae 539 else
1adf2230 540 legacy_mds_map.compat = MDSMap::get_compat_set_base();
7c673cae
FG
541 if (ev < 5) {
542 __u32 n;
11fdf7f2 543 decode(n, p);
7c673cae
FG
544 legacy_mds_map.metadata_pool = n;
545 } else {
11fdf7f2 546 decode(legacy_mds_map.metadata_pool, p);
7c673cae 547 }
11fdf7f2
TL
548 decode(legacy_mds_map.created, p);
549 decode(legacy_mds_map.modified, p);
550 decode(legacy_mds_map.tableserver, p);
551 decode(legacy_mds_map.in, p);
7c673cae 552 std::map<mds_rank_t,int32_t> inc; // Legacy field, parse and drop
11fdf7f2
TL
553 decode(inc, p);
554 decode(legacy_mds_map.up, p);
555 decode(legacy_mds_map.failed, p);
556 decode(legacy_mds_map.stopped, p);
7c673cae 557 if (ev >= 4)
11fdf7f2 558 decode(legacy_mds_map.last_failure_osd_epoch, p);
7c673cae
FG
559 if (ev >= 6) {
560 if (ev < 10) {
561 // previously this was a bool about snaps, not a flag map
562 bool flag;
11fdf7f2 563 decode(flag, p);
7c673cae
FG
564 legacy_mds_map.ever_allowed_features = flag ?
565 CEPH_MDSMAP_ALLOW_SNAPS : 0;
11fdf7f2 566 decode(flag, p);
7c673cae
FG
567 legacy_mds_map.explicitly_allowed_features = flag ?
568 CEPH_MDSMAP_ALLOW_SNAPS : 0;
7c673cae 569 } else {
11fdf7f2
TL
570 decode(legacy_mds_map.ever_allowed_features, p);
571 decode(legacy_mds_map.explicitly_allowed_features, p);
7c673cae
FG
572 }
573 } else {
11fdf7f2 574 legacy_mds_map.ever_allowed_features = 0;
7c673cae 575 legacy_mds_map.explicitly_allowed_features = 0;
7c673cae
FG
576 }
577 if (ev >= 7)
11fdf7f2 578 decode(legacy_mds_map.inline_data_enabled, p);
7c673cae
FG
579
580 if (ev >= 8) {
11fdf7f2
TL
581 ceph_assert(struct_v >= 5);
582 decode(legacy_mds_map.enabled, p);
583 decode(legacy_mds_map.fs_name, p);
7c673cae
FG
584 } else {
585 legacy_mds_map.fs_name = "default";
586 if (epoch > 1) {
587 // If an MDS has ever been started, epoch will be greater than 1,
588 // assume filesystem is enabled.
589 legacy_mds_map.enabled = true;
590 } else {
591 // Upgrading from a cluster that never used an MDS, switch off
592 // filesystem until it's explicitly enabled.
593 legacy_mds_map.enabled = false;
594 }
595 }
596
597 if (ev >= 9) {
11fdf7f2 598 decode(legacy_mds_map.damaged, p);
7c673cae
FG
599 }
600
601 // We're upgrading, populate filesystems from the legacy fields
602 filesystems.clear();
603 standby_daemons.clear();
604 standby_epochs.clear();
605 mds_roles.clear();
606 compat = legacy_mds_map.compat;
607 enable_multiple = false;
608
609 // Synthesise a Filesystem from legacy_mds_map, if enabled
610 if (legacy_mds_map.enabled) {
611 // Construct a Filesystem from the legacy MDSMap
11fdf7f2 612 auto migrate_fs = Filesystem::create();
7c673cae
FG
613 migrate_fs->fscid = FS_CLUSTER_ID_ANONYMOUS;
614 migrate_fs->mds_map = legacy_mds_map;
615 migrate_fs->mds_map.epoch = epoch;
616 filesystems[migrate_fs->fscid] = migrate_fs;
617
618 // List of GIDs that had invalid states
619 std::set<mds_gid_t> drop_gids;
620
621 // Construct mds_roles, standby_daemons, and remove
622 // standbys from the MDSMap in the Filesystem.
11fdf7f2
TL
623 for (const auto& [gid, info] : migrate_fs->mds_map.mds_info) {
624 if (info.state == MDSMap::STATE_STANDBY_REPLAY) {
625 /* drop any legacy standby-replay daemons */
626 drop_gids.insert(gid);
627 } else if (info.rank == MDS_RANK_NONE) {
628 if (info.state != MDSMap::STATE_STANDBY) {
7c673cae
FG
629 // Old MDSMaps can have down:dne here, which
630 // is invalid in an FSMap (#17837)
11fdf7f2 631 drop_gids.insert(gid);
7c673cae 632 } else {
11fdf7f2 633 insert(info); // into standby_daemons
7c673cae
FG
634 }
635 } else {
11fdf7f2 636 mds_roles[gid] = migrate_fs->fscid;
7c673cae
FG
637 }
638 }
639 for (const auto &p : standby_daemons) {
640 // Erase from this Filesystem's MDSMap, because it has
641 // been copied into FSMap::Standby_daemons above
642 migrate_fs->mds_map.mds_info.erase(p.first);
643 }
644 for (const auto &gid : drop_gids) {
645 // Throw away all info for this MDS because it was identified
646 // as having invalid state above.
647 migrate_fs->mds_map.mds_info.erase(gid);
648 }
649
650 legacy_client_fscid = migrate_fs->fscid;
651 } else {
652 legacy_client_fscid = FS_CLUSTER_ID_NONE;
653 }
654 } else {
11fdf7f2
TL
655 decode(epoch, p);
656 decode(next_filesystem_id, p);
657 decode(legacy_client_fscid, p);
658 decode(compat, p);
659 decode(enable_multiple, p);
660 {
661 std::vector<Filesystem::ref> v;
662 decode(v, p);
663 filesystems.clear();
664 for (auto& ref : v) {
665 auto em = filesystems.emplace(std::piecewise_construct, std::forward_as_tuple(ref->fscid), std::forward_as_tuple(std::move(ref)));
666 ceph_assert(em.second);
667 }
7c673cae 668 }
11fdf7f2
TL
669 decode(mds_roles, p);
670 decode(standby_daemons, p);
671 decode(standby_epochs, p);
7c673cae 672 if (struct_v >= 7) {
11fdf7f2 673 decode(ever_enabled_multiple, p);
7c673cae
FG
674 }
675 }
676
677 DECODE_FINISH(p);
678}
679
11fdf7f2 680void FSMap::sanitize(const std::function<bool(int64_t pool)>& pool_exists)
3efd9988
FG
681{
682 for (auto &fs : filesystems) {
683 fs.second->mds_map.sanitize(pool_exists);
684 }
685}
7c673cae
FG
686
687void Filesystem::encode(bufferlist& bl, uint64_t features) const
688{
689 ENCODE_START(1, 1, bl);
11fdf7f2 690 encode(fscid, bl);
7c673cae
FG
691 bufferlist mdsmap_bl;
692 mds_map.encode(mdsmap_bl, features);
11fdf7f2 693 encode(mdsmap_bl, bl);
7c673cae
FG
694 ENCODE_FINISH(bl);
695}
696
11fdf7f2 697void Filesystem::decode(bufferlist::const_iterator& p)
7c673cae
FG
698{
699 DECODE_START(1, p);
11fdf7f2 700 decode(fscid, p);
7c673cae 701 bufferlist mdsmap_bl;
11fdf7f2
TL
702 decode(mdsmap_bl, p);
703 auto mdsmap_bl_iter = mdsmap_bl.cbegin();
7c673cae
FG
704 mds_map.decode(mdsmap_bl_iter);
705 DECODE_FINISH(p);
706}
707
708int FSMap::parse_filesystem(
11fdf7f2
TL
709 std::string_view ns_str,
710 Filesystem::const_ref* result
7c673cae
FG
711 ) const
712{
713 std::string ns_err;
94b18763
FG
714 std::string s(ns_str);
715 fs_cluster_id_t fscid = strict_strtol(s.c_str(), 10, &ns_err);
7c673cae
FG
716 if (!ns_err.empty() || filesystems.count(fscid) == 0) {
717 for (auto &fs : filesystems) {
94b18763 718 if (fs.second->mds_map.fs_name == s) {
7c673cae
FG
719 *result = std::const_pointer_cast<const Filesystem>(fs.second);
720 return 0;
721 }
722 }
723 return -ENOENT;
724 } else {
725 *result = get_filesystem(fscid);
726 return 0;
727 }
728}
729
730void Filesystem::print(std::ostream &out) const
731{
732 out << "Filesystem '" << mds_map.fs_name
733 << "' (" << fscid << ")" << std::endl;
734 mds_map.print(out);
735}
736
9f95a23c 737bool FSMap::is_any_degraded() const
7c673cae 738{
9f95a23c
TL
739 for (auto& i : filesystems) {
740 if (i.second->mds_map.is_degraded()) {
741 return true;
742 }
743 }
744 return false;
745}
746
747std::map<mds_gid_t, MDSMap::mds_info_t> FSMap::get_mds_info() const
748{
749 std::map<mds_gid_t, mds_info_t> result;
750 for (const auto &i : standby_daemons) {
751 result[i.first] = i.second;
752 }
753
754 for (const auto &i : filesystems) {
755 const auto &fs_info = i.second->mds_map.get_mds_info();
756 for (const auto &j : fs_info) {
757 result[j.first] = j.second;
758 }
759 }
760
761 return result;
762}
763
764const MDSMap::mds_info_t* FSMap::get_available_standby(fs_cluster_id_t fscid) const
765{
766 const mds_info_t* who = nullptr;
11fdf7f2
TL
767 for (const auto& [gid, info] : standby_daemons) {
768 ceph_assert(info.rank == MDS_RANK_NONE);
769 ceph_assert(info.state == MDSMap::STATE_STANDBY);
7c673cae 770
11fdf7f2 771 if (info.laggy() || info.is_frozen()) {
7c673cae
FG
772 continue;
773 }
774
9f95a23c
TL
775 if (info.join_fscid == fscid) {
776 who = &info;
777 break;
778 } else if (info.join_fscid == FS_CLUSTER_ID_NONE) {
779 who = &info; /* vanilla standby */
780 } else if (who == nullptr) {
781 who = &info; /* standby for another fs, last resort */
782 }
783 }
784 return who;
785}
786
787mds_gid_t FSMap::find_mds_gid_by_name(std::string_view s) const
788{
789 const auto info = get_mds_info();
790 for (const auto &p : info) {
791 if (p.second.name == s) {
792 return p.first;
793 }
7c673cae 794 }
11fdf7f2 795 return MDS_GID_NONE;
7c673cae
FG
796}
797
9f95a23c
TL
798const MDSMap::mds_info_t* FSMap::find_by_name(std::string_view name) const
799{
800 std::map<mds_gid_t, mds_info_t> result;
801 for (const auto &i : standby_daemons) {
802 if (i.second.name == name) {
803 return &(i.second);
804 }
805 }
806
807 for (const auto &i : filesystems) {
808 const auto &fs_info = i.second->mds_map.get_mds_info();
809 for (const auto &j : fs_info) {
810 if (j.second.name == name) {
811 return &(j.second);
812 }
813 }
814 }
815
816 return nullptr;
817}
818
819const MDSMap::mds_info_t* FSMap::find_replacement_for(mds_role_t role) const
11fdf7f2
TL
820{
821 auto&& fs = get_filesystem(role.fscid);
7c673cae 822
11fdf7f2
TL
823 // First see if we have a STANDBY_REPLAY
824 for (const auto& [gid, info] : fs->mds_map.mds_info) {
825 if (info.rank == role.rank && info.state == MDSMap::STATE_STANDBY_REPLAY) {
826 if (info.is_frozen()) {
827 /* the standby-replay is frozen, do nothing! */
9f95a23c 828 return nullptr;
11fdf7f2 829 } else {
9f95a23c 830 return &info;
11fdf7f2 831 }
7c673cae
FG
832 }
833 }
7c673cae 834
9f95a23c 835 return get_available_standby(role.fscid);
7c673cae
FG
836}
837
838void FSMap::sanity() const
839{
840 if (legacy_client_fscid != FS_CLUSTER_ID_NONE) {
11fdf7f2 841 ceph_assert(filesystems.count(legacy_client_fscid) == 1);
7c673cae
FG
842 }
843
844 for (const auto &i : filesystems) {
845 auto fs = i.second;
11fdf7f2
TL
846 ceph_assert(fs->mds_map.compat.compare(compat) == 0);
847 ceph_assert(fs->fscid == i.first);
7c673cae 848 for (const auto &j : fs->mds_map.mds_info) {
11fdf7f2
TL
849 ceph_assert(j.second.rank != MDS_RANK_NONE);
850 ceph_assert(mds_roles.count(j.first) == 1);
851 ceph_assert(standby_daemons.count(j.first) == 0);
852 ceph_assert(standby_epochs.count(j.first) == 0);
853 ceph_assert(mds_roles.at(j.first) == i.first);
7c673cae 854 if (j.second.state != MDSMap::STATE_STANDBY_REPLAY) {
11fdf7f2
TL
855 ceph_assert(fs->mds_map.up.at(j.second.rank) == j.first);
856 ceph_assert(fs->mds_map.failed.count(j.second.rank) == 0);
857 ceph_assert(fs->mds_map.damaged.count(j.second.rank) == 0);
7c673cae
FG
858 }
859 }
860
861 for (const auto &j : fs->mds_map.up) {
862 mds_rank_t rank = j.first;
11fdf7f2 863 ceph_assert(fs->mds_map.in.count(rank) == 1);
7c673cae 864 mds_gid_t gid = j.second;
11fdf7f2 865 ceph_assert(fs->mds_map.mds_info.count(gid) == 1);
7c673cae
FG
866 }
867 }
868
869 for (const auto &i : standby_daemons) {
11fdf7f2
TL
870 ceph_assert(i.second.state == MDSMap::STATE_STANDBY);
871 ceph_assert(i.second.rank == MDS_RANK_NONE);
872 ceph_assert(i.second.global_id == i.first);
873 ceph_assert(standby_epochs.count(i.first) == 1);
874 ceph_assert(mds_roles.count(i.first) == 1);
875 ceph_assert(mds_roles.at(i.first) == FS_CLUSTER_ID_NONE);
7c673cae
FG
876 }
877
878 for (const auto &i : standby_epochs) {
11fdf7f2 879 ceph_assert(standby_daemons.count(i.first) == 1);
7c673cae
FG
880 }
881
882 for (const auto &i : mds_roles) {
883 if (i.second == FS_CLUSTER_ID_NONE) {
11fdf7f2 884 ceph_assert(standby_daemons.count(i.first) == 1);
7c673cae 885 } else {
11fdf7f2
TL
886 ceph_assert(filesystems.count(i.second) == 1);
887 ceph_assert(filesystems.at(i.second)->mds_map.mds_info.count(i.first) == 1);
7c673cae
FG
888 }
889 }
890}
891
892void FSMap::promote(
893 mds_gid_t standby_gid,
11fdf7f2 894 Filesystem& filesystem,
7c673cae
FG
895 mds_rank_t assigned_rank)
896{
11fdf7f2 897 ceph_assert(gid_exists(standby_gid));
7c673cae
FG
898 bool is_standby_replay = mds_roles.at(standby_gid) != FS_CLUSTER_ID_NONE;
899 if (!is_standby_replay) {
11fdf7f2
TL
900 ceph_assert(standby_daemons.count(standby_gid));
901 ceph_assert(standby_daemons.at(standby_gid).state == MDSMap::STATE_STANDBY);
7c673cae
FG
902 }
903
11fdf7f2 904 MDSMap &mds_map = filesystem.mds_map;
7c673cae
FG
905
906 // Insert daemon state to Filesystem
907 if (!is_standby_replay) {
908 mds_map.mds_info[standby_gid] = standby_daemons.at(standby_gid);
909 } else {
11fdf7f2
TL
910 ceph_assert(mds_map.mds_info.count(standby_gid));
911 ceph_assert(mds_map.mds_info.at(standby_gid).state == MDSMap::STATE_STANDBY_REPLAY);
912 ceph_assert(mds_map.mds_info.at(standby_gid).rank == assigned_rank);
7c673cae 913 }
9f95a23c 914 auto& info = mds_map.mds_info[standby_gid];
7c673cae
FG
915
916 if (mds_map.stopped.erase(assigned_rank)) {
917 // The cluster is being expanded with a stopped rank
918 info.state = MDSMap::STATE_STARTING;
919 } else if (!mds_map.is_in(assigned_rank)) {
920 // The cluster is being expanded with a new rank
921 info.state = MDSMap::STATE_CREATING;
922 } else {
923 // An existing rank is being assigned to a replacement
924 info.state = MDSMap::STATE_REPLAY;
925 mds_map.failed.erase(assigned_rank);
926 }
927 info.rank = assigned_rank;
928 info.inc = epoch;
11fdf7f2 929 mds_roles[standby_gid] = filesystem.fscid;
7c673cae
FG
930
931 // Update the rank state in Filesystem
932 mds_map.in.insert(assigned_rank);
933 mds_map.up[assigned_rank] = standby_gid;
934
935 // Remove from the list of standbys
936 if (!is_standby_replay) {
937 standby_daemons.erase(standby_gid);
938 standby_epochs.erase(standby_gid);
939 }
940
941 // Indicate that Filesystem has been modified
942 mds_map.epoch = epoch;
943}
944
945void FSMap::assign_standby_replay(
946 const mds_gid_t standby_gid,
947 const fs_cluster_id_t leader_ns,
948 const mds_rank_t leader_rank)
949{
11fdf7f2
TL
950 ceph_assert(mds_roles.at(standby_gid) == FS_CLUSTER_ID_NONE);
951 ceph_assert(gid_exists(standby_gid));
952 ceph_assert(!gid_has_rank(standby_gid));
953 ceph_assert(standby_daemons.count(standby_gid));
7c673cae
FG
954
955 // Insert to the filesystem
956 auto fs = filesystems.at(leader_ns);
957 fs->mds_map.mds_info[standby_gid] = standby_daemons.at(standby_gid);
958 fs->mds_map.mds_info[standby_gid].rank = leader_rank;
959 fs->mds_map.mds_info[standby_gid].state = MDSMap::STATE_STANDBY_REPLAY;
960 mds_roles[standby_gid] = leader_ns;
961
962 // Remove from the list of standbys
963 standby_daemons.erase(standby_gid);
964 standby_epochs.erase(standby_gid);
965
966 // Indicate that Filesystem has been modified
967 fs->mds_map.epoch = epoch;
968}
969
970void FSMap::erase(mds_gid_t who, epoch_t blacklist_epoch)
971{
972 if (mds_roles.at(who) == FS_CLUSTER_ID_NONE) {
973 standby_daemons.erase(who);
974 standby_epochs.erase(who);
975 } else {
976 auto &fs = filesystems.at(mds_roles.at(who));
977 const auto &info = fs->mds_map.mds_info.at(who);
978 if (info.state != MDSMap::STATE_STANDBY_REPLAY) {
979 if (info.state == MDSMap::STATE_CREATING) {
980 // If this gid didn't make it past CREATING, then forget
981 // the rank ever existed so that next time it's handed out
982 // to a gid it'll go back into CREATING.
983 fs->mds_map.in.erase(info.rank);
984 } else {
985 // Put this rank into the failed list so that the next available
986 // STANDBY will pick it up.
987 fs->mds_map.failed.insert(info.rank);
988 }
11fdf7f2 989 ceph_assert(fs->mds_map.up.at(info.rank) == info.global_id);
7c673cae
FG
990 fs->mds_map.up.erase(info.rank);
991 }
992 fs->mds_map.mds_info.erase(who);
993 fs->mds_map.last_failure_osd_epoch = blacklist_epoch;
994 fs->mds_map.epoch = epoch;
995 }
996
997 mds_roles.erase(who);
998}
999
1000void FSMap::damaged(mds_gid_t who, epoch_t blacklist_epoch)
1001{
11fdf7f2 1002 ceph_assert(mds_roles.at(who) != FS_CLUSTER_ID_NONE);
7c673cae
FG
1003 auto fs = filesystems.at(mds_roles.at(who));
1004 mds_rank_t rank = fs->mds_map.mds_info[who].rank;
1005
1006 erase(who, blacklist_epoch);
1007 fs->mds_map.failed.erase(rank);
1008 fs->mds_map.damaged.insert(rank);
1009
11fdf7f2 1010 ceph_assert(fs->mds_map.epoch == epoch);
7c673cae
FG
1011}
1012
1013/**
1014 * Update to indicate that the rank `rank` is to be removed
1015 * from the damaged list of the filesystem `fscid`
1016 */
1017bool FSMap::undamaged(const fs_cluster_id_t fscid, const mds_rank_t rank)
1018{
1019 auto fs = filesystems.at(fscid);
1020
1021 if (fs->mds_map.damaged.erase(rank)) {
1022 fs->mds_map.failed.insert(rank);
1023 fs->mds_map.epoch = epoch;
1024 return true;
1025 } else {
1026 return false;
1027 }
1028}
1029
1030void FSMap::insert(const MDSMap::mds_info_t &new_info)
1031{
11fdf7f2
TL
1032 ceph_assert(new_info.state == MDSMap::STATE_STANDBY);
1033 ceph_assert(new_info.rank == MDS_RANK_NONE);
7c673cae
FG
1034 mds_roles[new_info.global_id] = FS_CLUSTER_ID_NONE;
1035 standby_daemons[new_info.global_id] = new_info;
1036 standby_epochs[new_info.global_id] = epoch;
1037}
1038
9f95a23c 1039std::vector<mds_gid_t> FSMap::stop(mds_gid_t who)
7c673cae 1040{
11fdf7f2 1041 ceph_assert(mds_roles.at(who) != FS_CLUSTER_ID_NONE);
7c673cae
FG
1042 auto fs = filesystems.at(mds_roles.at(who));
1043 const auto &info = fs->mds_map.mds_info.at(who);
1044 fs->mds_map.up.erase(info.rank);
1045 fs->mds_map.in.erase(info.rank);
1046 fs->mds_map.stopped.insert(info.rank);
1047
1048 // Also drop any standby replays that were following this rank
9f95a23c 1049 std::vector<mds_gid_t> standbys;
7c673cae
FG
1050 for (const auto &i : fs->mds_map.mds_info) {
1051 const auto &other_gid = i.first;
1052 const auto &other_info = i.second;
1053 if (other_info.rank == info.rank
1054 && other_info.state == MDSMap::STATE_STANDBY_REPLAY) {
1055 standbys.push_back(other_gid);
1056 erase(other_gid, 0);
1057 }
1058 }
1059
1060 fs->mds_map.mds_info.erase(who);
1061 mds_roles.erase(who);
1062
1063 fs->mds_map.epoch = epoch;
1064
1065 return standbys;
1066}
1067
1068
1069/**
1070 * Given one of the following forms:
1071 * <fs name>:<rank>
1072 * <fs id>:<rank>
1073 * <rank>
1074 *
1075 * Parse into a mds_role_t. The rank-only form is only valid
1076 * if legacy_client_ns is set.
1077 */
1078int FSMap::parse_role(
11fdf7f2 1079 std::string_view role_str,
7c673cae
FG
1080 mds_role_t *role,
1081 std::ostream &ss) const
1082{
1083 size_t colon_pos = role_str.find(":");
1084 size_t rank_pos;
11fdf7f2 1085 Filesystem::const_ref fs;
7c673cae
FG
1086 if (colon_pos == std::string::npos) {
1087 if (legacy_client_fscid == FS_CLUSTER_ID_NONE) {
1088 ss << "No filesystem selected";
1089 return -ENOENT;
1090 }
1091 fs = get_filesystem(legacy_client_fscid);
1092 rank_pos = 0;
1093 } else {
1094 if (parse_filesystem(role_str.substr(0, colon_pos), &fs) < 0) {
1095 ss << "Invalid filesystem";
1096 return -ENOENT;
1097 }
1098 rank_pos = colon_pos+1;
1099 }
1100
1101 mds_rank_t rank;
1102 std::string err;
94b18763 1103 std::string rank_str(role_str.substr(rank_pos));
7c673cae
FG
1104 long rank_i = strict_strtol(rank_str.c_str(), 10, &err);
1105 if (rank_i < 0 || !err.empty()) {
1106 ss << "Invalid rank '" << rank_str << "'";
1107 return -EINVAL;
1108 } else {
1109 rank = rank_i;
1110 }
1111
1112 if (fs->mds_map.in.count(rank) == 0) {
1113 ss << "Rank '" << rank << "' not found";
1114 return -ENOENT;
1115 }
1116
1117 *role = {fs->fscid, rank};
1118
1119 return 0;
1120}
9f95a23c
TL
1121
1122bool FSMap::pool_in_use(int64_t poolid) const
1123{
1124 for (auto const &i : filesystems) {
1125 if (i.second->mds_map.is_data_pool(poolid)
1126 || i.second->mds_map.metadata_pool == poolid) {
1127 return true;
1128 }
1129 }
1130 return false;
1131}
1132
1133void FSMap::erase_filesystem(fs_cluster_id_t fscid)
1134{
1135 filesystems.erase(fscid);
1136 for (auto& [gid, info] : standby_daemons) {
1137 if (info.join_fscid == fscid) {
1138 modify_daemon(gid, [](auto& info) {
1139 info.join_fscid = FS_CLUSTER_ID_NONE;
1140 });
1141 }
1142 }
1143 for (auto& p : filesystems) {
1144 for (auto& [gid, info] : p.second->mds_map.get_mds_info()) {
1145 if (info.join_fscid == fscid) {
1146 modify_daemon(gid, [](auto& info) {
1147 info.join_fscid = FS_CLUSTER_ID_NONE;
1148 });
1149 }
1150 }
1151 }
1152}