]> git.proxmox.com Git - ceph.git/blame - ceph/src/mds/FSMap.cc
update download target update for octopus release
[ceph.git] / ceph / src / mds / FSMap.cc
CommitLineData
7c673cae
FG
1// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
2// vim: ts=8 sw=2 smarttab
3/*
4 * Ceph - scalable distributed file system
5 *
6 * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
7 *
8 * This is free software; you can redistribute it and/or
9 * modify it under the terms of the GNU Lesser General Public
10 * License version 2.1, as published by the Free Software
11 * Foundation. See file COPYING.
12 *
13 */
14
15
16#include "FSMap.h"
17
11fdf7f2 18#include "common/StackStringStream.h"
7c673cae 19
11fdf7f2
TL
20#include <sstream>
21#ifdef WITH_SEASTAR
22#include "crimson/common/config_proxy.h"
23#else
24#include "common/config_proxy.h"
25#endif
26#include "global/global_context.h"
224ce89b
WB
27#include "mon/health_check.h"
28
11fdf7f2 29using std::stringstream;
7c673cae
FG
30
31void Filesystem::dump(Formatter *f) const
32{
33 f->open_object_section("mdsmap");
34 mds_map.dump(f);
35 f->close_section();
36 f->dump_int("id", fscid);
37}
38
39void FSMap::dump(Formatter *f) const
40{
41 f->dump_int("epoch", epoch);
11fdf7f2
TL
42 // Use 'default' naming to match 'set-default' CLI
43 f->dump_int("default_fscid", legacy_client_fscid);
7c673cae
FG
44
45 f->open_object_section("compat");
46 compat.dump(f);
47 f->close_section();
48
49 f->open_object_section("feature_flags");
50 f->dump_bool("enable_multiple", enable_multiple);
51 f->dump_bool("ever_enabled_multiple", ever_enabled_multiple);
52 f->close_section();
53
54 f->open_array_section("standbys");
55 for (const auto &i : standby_daemons) {
56 f->open_object_section("info");
57 i.second.dump(f);
58 f->dump_int("epoch", standby_epochs.at(i.first));
59 f->close_section();
60 }
61 f->close_section();
62
63 f->open_array_section("filesystems");
64 for (const auto &fs : filesystems) {
65 f->open_object_section("filesystem");
66 fs.second->dump(f);
67 f->close_section();
68 }
69 f->close_section();
70}
71
72void FSMap::generate_test_instances(list<FSMap*>& ls)
73{
74 FSMap *m = new FSMap();
75
76 std::list<MDSMap*> mds_map_instances;
77 MDSMap::generate_test_instances(mds_map_instances);
78
79 int k = 20;
80 for (auto i : mds_map_instances) {
11fdf7f2 81 auto fs = Filesystem::create();
7c673cae
FG
82 fs->fscid = k++;
83 fs->mds_map = *i;
84 delete i;
85 m->filesystems[fs->fscid] = fs;
86 }
87 mds_map_instances.clear();
88
89 ls.push_back(m);
90}
91
92void FSMap::print(ostream& out) const
93{
94 out << "e" << epoch << std::endl;
95 out << "enable_multiple, ever_enabled_multiple: " << enable_multiple << ","
96 << ever_enabled_multiple << std::endl;
97 out << "compat: " << compat << std::endl;
98 out << "legacy client fscid: " << legacy_client_fscid << std::endl;
99 out << " " << std::endl;
100
101 if (filesystems.empty()) {
102 out << "No filesystems configured" << std::endl;
7c673cae
FG
103 }
104
11fdf7f2
TL
105 for (const auto& p : filesystems) {
106 p.second->print(out);
7c673cae
FG
107 out << " " << std::endl << " " << std::endl; // Space out a bit
108 }
109
110 if (!standby_daemons.empty()) {
111 out << "Standby daemons:" << std::endl << " " << std::endl;
112 }
113
114 for (const auto &p : standby_daemons) {
115 p.second.print_summary(out);
116 out << std::endl;
117 }
118}
119
120
121
122void FSMap::print_summary(Formatter *f, ostream *out) const
123{
7c673cae
FG
124 if (f) {
125 f->dump_unsigned("epoch", get_epoch());
11fdf7f2
TL
126 for (const auto &p : filesystems) {
127 auto& fs = p.second;
7c673cae
FG
128 f->dump_unsigned("id", fs->fscid);
129 f->dump_unsigned("up", fs->mds_map.up.size());
130 f->dump_unsigned("in", fs->mds_map.in.size());
131 f->dump_unsigned("max", fs->mds_map.max_mds);
132 }
133 } else {
11fdf7f2
TL
134 auto count = filesystems.size();
135 if (count <= 3) {
136 bool first = true;
137 for (const auto& p : filesystems) {
138 const auto& fs = p.second;
139 if (!first) {
140 *out << " ";
141 }
142 if (fs->mds_map.is_degraded()) {
143 *out << fs->mds_map.fs_name << ":" << fs->mds_map.up.size() << "/" << fs->mds_map.in.size();
144 } else {
145 *out << fs->mds_map.fs_name << ":" << fs->mds_map.in.size();
146 }
147 first = false;
148 }
149 } else {
150 *out << count << " fs";
151 unsigned degraded = 0;
152 CachedStackStringStream css;
153 *css << " (degraded: ";
154 for (const auto& p : filesystems) {
155 const auto& fs = p.second;
156 if (fs->mds_map.is_degraded()) {
157 degraded++;
158 if (degraded <= 3) {
159 *css << fs->mds_map.fs_name << ":" << fs->mds_map.up.size() << "/" << fs->mds_map.in.size();
160 }
161 }
162 }
163 if (degraded > 0) {
164 if (degraded <= 3) {
165 *css << ")";
166 *out << css->strv();
167 } else {
168 *out << " (degraded: " << degraded << " fs)";
169 }
170 }
7c673cae
FG
171 }
172 }
173
174 if (f) {
175 f->open_array_section("by_rank");
176 }
177
11fdf7f2
TL
178 std::map<MDSMap::DaemonState,unsigned> by_state;
179 std::map<mds_role_t, std::pair<MDSMap::DaemonState, std::string>> by_rank;
180 by_state[MDSMap::DaemonState::STATE_STANDBY] = standby_daemons.size();
181 for (const auto& [gid, fscid] : mds_roles) {
182 if (fscid == FS_CLUSTER_ID_NONE)
183 continue;
184
185 const auto& info = filesystems.at(fscid)->mds_map.get_info_gid(gid);
186 auto s = std::string(ceph_mds_state_name(info.state));
7c673cae
FG
187 if (info.laggy()) {
188 s += "(laggy or crashed)";
189 }
190
11fdf7f2
TL
191 if (f) {
192 f->open_object_section("mds");
193 f->dump_unsigned("filesystem_id", fscid);
194 f->dump_unsigned("rank", info.rank);
195 f->dump_string("name", info.name);
196 f->dump_string("status", s);
197 f->dump_unsigned("gid", gid);
198 f->close_section();
199 } else if (info.state != MDSMap::DaemonState::STATE_STANDBY_REPLAY) {
200 by_rank[mds_role_t(fscid, info.rank)] = std::make_pair(info.state, info.name + "=" + s);
7c673cae 201 }
11fdf7f2 202 by_state[info.state]++;
7c673cae
FG
203 }
204
205 if (f) {
206 f->close_section();
207 } else {
11fdf7f2 208 if (0 < by_rank.size() && by_rank.size() < 5) {
7c673cae
FG
209 if (filesystems.size() > 1) {
210 // Disambiguate filesystems
211 std::map<std::string, std::string> pretty;
11fdf7f2
TL
212 for (const auto& [role,status] : by_rank) {
213 const auto &fs_name = filesystems.at(role.fscid)->mds_map.fs_name;
214 CachedStackStringStream css;
215 *css << fs_name << ":" << role.rank;
216 pretty.emplace(std::piecewise_construct, std::forward_as_tuple(css->strv()), std::forward_as_tuple(status.second));
217 --by_state[status.first]; /* already printed! */
7c673cae
FG
218 }
219 *out << " " << pretty;
220 } else {
221 // Omit FSCID in output when only one filesystem exists
222 std::map<mds_rank_t, std::string> shortened;
11fdf7f2
TL
223 for (const auto& [role,status] : by_rank) {
224 shortened[role.rank] = status.second;
225 --by_state[status.first]; /* already printed! */
7c673cae
FG
226 }
227 *out << " " << shortened;
228 }
229 }
11fdf7f2
TL
230 for (const auto& [state, count] : by_state) {
231 if (count > 0) {
232 auto s = std::string_view(ceph_mds_state_name(state));
233 *out << " " << count << " " << s;
234 }
235 }
7c673cae
FG
236 }
237
11fdf7f2
TL
238 if (f) {
239 const auto state = MDSMap::DaemonState::STATE_STANDBY;
240 auto&& name = ceph_mds_state_name(state);
241 auto count = standby_daemons.size();
242 f->dump_unsigned(name, count);
7c673cae
FG
243 }
244
245 size_t failed = 0;
246 size_t damaged = 0;
11fdf7f2
TL
247 for (const auto& p : filesystems) {
248 auto& fs = p.second;
7c673cae
FG
249 failed += fs->mds_map.failed.size();
250 damaged += fs->mds_map.damaged.size();
251 }
252
253 if (failed > 0) {
254 if (f) {
255 f->dump_unsigned("failed", failed);
256 } else {
257 *out << ", " << failed << " failed";
258 }
259 }
260
261 if (damaged > 0) {
262 if (f) {
263 f->dump_unsigned("damaged", damaged);
264 } else {
265 *out << ", " << damaged << " damaged";
266 }
267 }
268 //if (stopped.size())
269 //out << ", " << stopped.size() << " stopped";
270}
271
272
11fdf7f2
TL
273Filesystem::ref FSMap::create_filesystem(std::string_view name,
274 int64_t metadata_pool, int64_t data_pool, uint64_t features)
7c673cae 275{
11fdf7f2 276 auto fs = Filesystem::create();
28e407b8 277 fs->mds_map.epoch = epoch;
11fdf7f2 278 fs->mds_map.fs_name = name;
31f18b77 279 fs->mds_map.data_pools.push_back(data_pool);
7c673cae
FG
280 fs->mds_map.metadata_pool = metadata_pool;
281 fs->mds_map.cas_pool = -1;
7c673cae
FG
282 fs->mds_map.compat = compat;
283 fs->mds_map.created = ceph_clock_now();
284 fs->mds_map.modified = ceph_clock_now();
7c673cae
FG
285 fs->mds_map.enabled = true;
286 if (features & CEPH_FEATURE_SERVER_JEWEL) {
287 fs->fscid = next_filesystem_id++;
288 // ANONYMOUS is only for upgrades from legacy mdsmaps, we should
289 // have initialized next_filesystem_id such that it's never used here.
11fdf7f2 290 ceph_assert(fs->fscid != FS_CLUSTER_ID_ANONYMOUS);
7c673cae
FG
291 } else {
292 // Use anon fscid because this will get thrown away when encoding
293 // as legacy MDSMap for legacy mons.
11fdf7f2 294 ceph_assert(filesystems.empty());
7c673cae
FG
295 fs->fscid = FS_CLUSTER_ID_ANONYMOUS;
296 }
297 filesystems[fs->fscid] = fs;
298
299 // Created first filesystem? Set it as the one
300 // for legacy clients to use
301 if (filesystems.size() == 1) {
302 legacy_client_fscid = fs->fscid;
303 }
11fdf7f2
TL
304
305 return fs;
7c673cae
FG
306}
307
308void FSMap::reset_filesystem(fs_cluster_id_t fscid)
309{
310 auto fs = get_filesystem(fscid);
11fdf7f2 311 auto new_fs = Filesystem::create();
7c673cae
FG
312
313 // Populate rank 0 as existing (so don't go into CREATING)
314 // but failed (so that next available MDS is assigned the rank)
315 new_fs->mds_map.in.insert(mds_rank_t(0));
316 new_fs->mds_map.failed.insert(mds_rank_t(0));
317
318 // Carry forward what makes sense
319 new_fs->fscid = fs->fscid;
320 new_fs->mds_map.inline_data_enabled = fs->mds_map.inline_data_enabled;
7c673cae
FG
321 new_fs->mds_map.data_pools = fs->mds_map.data_pools;
322 new_fs->mds_map.metadata_pool = fs->mds_map.metadata_pool;
323 new_fs->mds_map.cas_pool = fs->mds_map.cas_pool;
324 new_fs->mds_map.fs_name = fs->mds_map.fs_name;
7c673cae
FG
325 new_fs->mds_map.compat = compat;
326 new_fs->mds_map.created = ceph_clock_now();
327 new_fs->mds_map.modified = ceph_clock_now();
7c673cae
FG
328 new_fs->mds_map.standby_count_wanted = fs->mds_map.standby_count_wanted;
329 new_fs->mds_map.enabled = true;
330
c07f9fc5
FG
331 // Remember mds ranks that have ever started. (They should load old inotable
332 // instead of creating new one if they start again.)
333 new_fs->mds_map.stopped.insert(fs->mds_map.in.begin(), fs->mds_map.in.end());
334 new_fs->mds_map.stopped.insert(fs->mds_map.stopped.begin(), fs->mds_map.stopped.end());
335 new_fs->mds_map.stopped.erase(mds_rank_t(0));
336
7c673cae
FG
337 // Persist the new FSMap
338 filesystems[new_fs->fscid] = new_fs;
339}
340
341void FSMap::get_health(list<pair<health_status_t,string> >& summary,
342 list<pair<health_status_t,string> > *detail) const
343{
344 mds_rank_t standby_count_wanted = 0;
345 for (const auto &i : filesystems) {
346 const auto &fs = i.second;
347
348 // TODO: move get_health up into here so that we can qualify
349 // all the messages with what filesystem they're talking about
350 fs->mds_map.get_health(summary, detail);
351
352 standby_count_wanted = std::max(standby_count_wanted, fs->mds_map.get_standby_count_wanted((mds_rank_t)standby_daemons.size()));
353 }
354
355 if (standby_count_wanted) {
356 std::ostringstream oss;
357 oss << "insufficient standby daemons available: have " << standby_daemons.size() << "; want " << standby_count_wanted << " more";
358 summary.push_back(make_pair(HEALTH_WARN, oss.str()));
359 }
360}
361
362bool FSMap::check_health(void)
363{
364 bool changed = false;
365 for (auto &i : filesystems) {
366 changed |= i.second->mds_map.check_health((mds_rank_t)standby_daemons.size());
367 }
368 return changed;
369}
370
224ce89b
WB
371void FSMap::get_health_checks(health_check_map_t *checks) const
372{
373 mds_rank_t standby_count_wanted = 0;
374 for (const auto &i : filesystems) {
375 const auto &fs = i.second;
376 health_check_map_t fschecks;
d2e6a577 377
224ce89b 378 fs->mds_map.get_health_checks(&fschecks);
d2e6a577
FG
379
380 // Some of the failed ranks might be transient (i.e. there are standbys
381 // ready to replace them). We will report only on "stuck" failed, i.e.
382 // ranks which are failed and have no standby replacement available.
383 std::set<mds_rank_t> stuck_failed;
384
385 for (const auto &rank : fs->mds_map.failed) {
11fdf7f2 386 auto&& replacement = find_replacement_for({fs->fscid, rank}, {});
d2e6a577
FG
387 if (replacement == MDS_GID_NONE) {
388 stuck_failed.insert(rank);
389 }
390 }
391
392 // FS_WITH_FAILED_MDS
393 if (!stuck_failed.empty()) {
394 health_check_t& fscheck = checks->get_or_add(
395 "FS_WITH_FAILED_MDS", HEALTH_WARN,
181888fb 396 "%num% filesystem%plurals% %hasorhave% a failed mds daemon");
d2e6a577
FG
397 ostringstream ss;
398 ss << "fs " << fs->mds_map.fs_name << " has " << stuck_failed.size()
399 << " failed mds" << (stuck_failed.size() > 1 ? "s" : "");
400 fscheck.detail.push_back(ss.str()); }
401
224ce89b
WB
402 checks->merge(fschecks);
403 standby_count_wanted = std::max(
404 standby_count_wanted,
405 fs->mds_map.get_standby_count_wanted((mds_rank_t)standby_daemons.size()));
406 }
407
408 // MDS_INSUFFICIENT_STANDBY
409 if (standby_count_wanted) {
410 std::ostringstream oss, dss;
d2e6a577
FG
411 oss << "insufficient standby MDS daemons available";
412 auto& d = checks->get_or_add("MDS_INSUFFICIENT_STANDBY", HEALTH_WARN, oss.str());
224ce89b
WB
413 dss << "have " << standby_daemons.size() << "; want " << standby_count_wanted
414 << " more";
415 d.detail.push_back(dss.str());
416 }
417}
418
7c673cae
FG
419void FSMap::encode(bufferlist& bl, uint64_t features) const
420{
421 if (features & CEPH_FEATURE_SERVER_JEWEL) {
422 ENCODE_START(7, 6, bl);
11fdf7f2
TL
423 encode(epoch, bl);
424 encode(next_filesystem_id, bl);
425 encode(legacy_client_fscid, bl);
426 encode(compat, bl);
427 encode(enable_multiple, bl);
428 {
429 std::vector<Filesystem::ref> v;
430 v.reserve(filesystems.size());
431 for (auto& p : filesystems) v.emplace_back(p.second);
432 encode(v, bl, features);
7c673cae 433 }
11fdf7f2
TL
434 encode(mds_roles, bl);
435 encode(standby_daemons, bl, features);
436 encode(standby_epochs, bl);
437 encode(ever_enabled_multiple, bl);
7c673cae
FG
438 ENCODE_FINISH(bl);
439 } else {
440 if (filesystems.empty()) {
441 MDSMap disabled_map;
442 disabled_map.epoch = epoch;
443 disabled_map.encode(bl, features);
444 } else {
445 // MDSMonitor should never have created multiple filesystems
446 // until the quorum features indicated Jewel
11fdf7f2 447 ceph_assert(filesystems.size() == 1);
7c673cae
FG
448 auto fs = filesystems.begin()->second;
449
450 // Take the MDSMap for the enabled filesystem, and populated its
451 // mds_info with the standbys to get a pre-jewel-style mon MDSMap.
452 MDSMap full_mdsmap = fs->mds_map;
453 full_mdsmap.epoch = epoch;
454 for (const auto &p : standby_daemons) {
455 full_mdsmap.mds_info[p.first] = p.second;
456 }
457
458 // Old MDSMaps don't set rank on standby replay daemons
459 for (auto &i : full_mdsmap.mds_info) {
460 auto &info = i.second;
461 if (info.state == MDSMap::STATE_STANDBY_REPLAY) {
462 info.rank = MDS_RANK_NONE;
463 }
464 }
465
466 full_mdsmap.encode(bl, features);
467 }
468 }
469}
470
11fdf7f2 471void FSMap::decode(bufferlist::const_iterator& p)
7c673cae 472{
7c673cae
FG
473 // The highest MDSMap encoding version before we changed the
474 // MDSMonitor to store an FSMap instead of an MDSMap was
475 // 5, so anything older than 6 is decoded as an MDSMap,
476 // and anything newer is decoded as an FSMap.
477 DECODE_START_LEGACY_COMPAT_LEN_16(7, 4, 4, p);
478 if (struct_v < 6) {
3efd9988
FG
479 // Because the mon used to store an MDSMap where we now
480 // store an FSMap, FSMap knows how to decode the legacy
481 // MDSMap format (it never needs to encode it though).
482 MDSMap legacy_mds_map;
483
7c673cae 484 // Decoding an MDSMap (upgrade)
11fdf7f2
TL
485 decode(epoch, p);
486 decode(legacy_mds_map.flags, p);
487 decode(legacy_mds_map.last_failure, p);
488 decode(legacy_mds_map.root, p);
489 decode(legacy_mds_map.session_timeout, p);
490 decode(legacy_mds_map.session_autoclose, p);
491 decode(legacy_mds_map.max_file_size, p);
492 decode(legacy_mds_map.max_mds, p);
493 decode(legacy_mds_map.mds_info, p);
7c673cae
FG
494 if (struct_v < 3) {
495 __u32 n;
11fdf7f2 496 decode(n, p);
7c673cae
FG
497 while (n--) {
498 __u32 m;
11fdf7f2 499 decode(m, p);
31f18b77 500 legacy_mds_map.data_pools.push_back(m);
7c673cae
FG
501 }
502 __s32 s;
11fdf7f2 503 decode(s, p);
7c673cae
FG
504 legacy_mds_map.cas_pool = s;
505 } else {
11fdf7f2
TL
506 decode(legacy_mds_map.data_pools, p);
507 decode(legacy_mds_map.cas_pool, p);
7c673cae
FG
508 }
509
510 // kclient ignores everything from here
511 __u16 ev = 1;
512 if (struct_v >= 2)
11fdf7f2 513 decode(ev, p);
7c673cae 514 if (ev >= 3)
11fdf7f2 515 decode(legacy_mds_map.compat, p);
7c673cae 516 else
1adf2230 517 legacy_mds_map.compat = MDSMap::get_compat_set_base();
7c673cae
FG
518 if (ev < 5) {
519 __u32 n;
11fdf7f2 520 decode(n, p);
7c673cae
FG
521 legacy_mds_map.metadata_pool = n;
522 } else {
11fdf7f2 523 decode(legacy_mds_map.metadata_pool, p);
7c673cae 524 }
11fdf7f2
TL
525 decode(legacy_mds_map.created, p);
526 decode(legacy_mds_map.modified, p);
527 decode(legacy_mds_map.tableserver, p);
528 decode(legacy_mds_map.in, p);
7c673cae 529 std::map<mds_rank_t,int32_t> inc; // Legacy field, parse and drop
11fdf7f2
TL
530 decode(inc, p);
531 decode(legacy_mds_map.up, p);
532 decode(legacy_mds_map.failed, p);
533 decode(legacy_mds_map.stopped, p);
7c673cae 534 if (ev >= 4)
11fdf7f2 535 decode(legacy_mds_map.last_failure_osd_epoch, p);
7c673cae
FG
536 if (ev >= 6) {
537 if (ev < 10) {
538 // previously this was a bool about snaps, not a flag map
539 bool flag;
11fdf7f2 540 decode(flag, p);
7c673cae
FG
541 legacy_mds_map.ever_allowed_features = flag ?
542 CEPH_MDSMAP_ALLOW_SNAPS : 0;
11fdf7f2 543 decode(flag, p);
7c673cae
FG
544 legacy_mds_map.explicitly_allowed_features = flag ?
545 CEPH_MDSMAP_ALLOW_SNAPS : 0;
7c673cae 546 } else {
11fdf7f2
TL
547 decode(legacy_mds_map.ever_allowed_features, p);
548 decode(legacy_mds_map.explicitly_allowed_features, p);
7c673cae
FG
549 }
550 } else {
11fdf7f2 551 legacy_mds_map.ever_allowed_features = 0;
7c673cae 552 legacy_mds_map.explicitly_allowed_features = 0;
7c673cae
FG
553 }
554 if (ev >= 7)
11fdf7f2 555 decode(legacy_mds_map.inline_data_enabled, p);
7c673cae
FG
556
557 if (ev >= 8) {
11fdf7f2
TL
558 ceph_assert(struct_v >= 5);
559 decode(legacy_mds_map.enabled, p);
560 decode(legacy_mds_map.fs_name, p);
7c673cae
FG
561 } else {
562 legacy_mds_map.fs_name = "default";
563 if (epoch > 1) {
564 // If an MDS has ever been started, epoch will be greater than 1,
565 // assume filesystem is enabled.
566 legacy_mds_map.enabled = true;
567 } else {
568 // Upgrading from a cluster that never used an MDS, switch off
569 // filesystem until it's explicitly enabled.
570 legacy_mds_map.enabled = false;
571 }
572 }
573
574 if (ev >= 9) {
11fdf7f2 575 decode(legacy_mds_map.damaged, p);
7c673cae
FG
576 }
577
578 // We're upgrading, populate filesystems from the legacy fields
579 filesystems.clear();
580 standby_daemons.clear();
581 standby_epochs.clear();
582 mds_roles.clear();
583 compat = legacy_mds_map.compat;
584 enable_multiple = false;
585
586 // Synthesise a Filesystem from legacy_mds_map, if enabled
587 if (legacy_mds_map.enabled) {
588 // Construct a Filesystem from the legacy MDSMap
11fdf7f2 589 auto migrate_fs = Filesystem::create();
7c673cae
FG
590 migrate_fs->fscid = FS_CLUSTER_ID_ANONYMOUS;
591 migrate_fs->mds_map = legacy_mds_map;
592 migrate_fs->mds_map.epoch = epoch;
593 filesystems[migrate_fs->fscid] = migrate_fs;
594
595 // List of GIDs that had invalid states
596 std::set<mds_gid_t> drop_gids;
597
598 // Construct mds_roles, standby_daemons, and remove
599 // standbys from the MDSMap in the Filesystem.
11fdf7f2
TL
600 for (const auto& [gid, info] : migrate_fs->mds_map.mds_info) {
601 if (info.state == MDSMap::STATE_STANDBY_REPLAY) {
602 /* drop any legacy standby-replay daemons */
603 drop_gids.insert(gid);
604 } else if (info.rank == MDS_RANK_NONE) {
605 if (info.state != MDSMap::STATE_STANDBY) {
7c673cae
FG
606 // Old MDSMaps can have down:dne here, which
607 // is invalid in an FSMap (#17837)
11fdf7f2 608 drop_gids.insert(gid);
7c673cae 609 } else {
11fdf7f2 610 insert(info); // into standby_daemons
7c673cae
FG
611 }
612 } else {
11fdf7f2 613 mds_roles[gid] = migrate_fs->fscid;
7c673cae
FG
614 }
615 }
616 for (const auto &p : standby_daemons) {
617 // Erase from this Filesystem's MDSMap, because it has
618 // been copied into FSMap::Standby_daemons above
619 migrate_fs->mds_map.mds_info.erase(p.first);
620 }
621 for (const auto &gid : drop_gids) {
622 // Throw away all info for this MDS because it was identified
623 // as having invalid state above.
624 migrate_fs->mds_map.mds_info.erase(gid);
625 }
626
627 legacy_client_fscid = migrate_fs->fscid;
628 } else {
629 legacy_client_fscid = FS_CLUSTER_ID_NONE;
630 }
631 } else {
11fdf7f2
TL
632 decode(epoch, p);
633 decode(next_filesystem_id, p);
634 decode(legacy_client_fscid, p);
635 decode(compat, p);
636 decode(enable_multiple, p);
637 {
638 std::vector<Filesystem::ref> v;
639 decode(v, p);
640 filesystems.clear();
641 for (auto& ref : v) {
642 auto em = filesystems.emplace(std::piecewise_construct, std::forward_as_tuple(ref->fscid), std::forward_as_tuple(std::move(ref)));
643 ceph_assert(em.second);
644 }
7c673cae 645 }
11fdf7f2
TL
646 decode(mds_roles, p);
647 decode(standby_daemons, p);
648 decode(standby_epochs, p);
7c673cae 649 if (struct_v >= 7) {
11fdf7f2 650 decode(ever_enabled_multiple, p);
7c673cae
FG
651 }
652 }
653
654 DECODE_FINISH(p);
655}
656
11fdf7f2 657void FSMap::sanitize(const std::function<bool(int64_t pool)>& pool_exists)
3efd9988
FG
658{
659 for (auto &fs : filesystems) {
660 fs.second->mds_map.sanitize(pool_exists);
661 }
662}
7c673cae
FG
663
664void Filesystem::encode(bufferlist& bl, uint64_t features) const
665{
666 ENCODE_START(1, 1, bl);
11fdf7f2 667 encode(fscid, bl);
7c673cae
FG
668 bufferlist mdsmap_bl;
669 mds_map.encode(mdsmap_bl, features);
11fdf7f2 670 encode(mdsmap_bl, bl);
7c673cae
FG
671 ENCODE_FINISH(bl);
672}
673
11fdf7f2 674void Filesystem::decode(bufferlist::const_iterator& p)
7c673cae
FG
675{
676 DECODE_START(1, p);
11fdf7f2 677 decode(fscid, p);
7c673cae 678 bufferlist mdsmap_bl;
11fdf7f2
TL
679 decode(mdsmap_bl, p);
680 auto mdsmap_bl_iter = mdsmap_bl.cbegin();
7c673cae
FG
681 mds_map.decode(mdsmap_bl_iter);
682 DECODE_FINISH(p);
683}
684
685int FSMap::parse_filesystem(
11fdf7f2
TL
686 std::string_view ns_str,
687 Filesystem::const_ref* result
7c673cae
FG
688 ) const
689{
690 std::string ns_err;
94b18763
FG
691 std::string s(ns_str);
692 fs_cluster_id_t fscid = strict_strtol(s.c_str(), 10, &ns_err);
7c673cae
FG
693 if (!ns_err.empty() || filesystems.count(fscid) == 0) {
694 for (auto &fs : filesystems) {
94b18763 695 if (fs.second->mds_map.fs_name == s) {
7c673cae
FG
696 *result = std::const_pointer_cast<const Filesystem>(fs.second);
697 return 0;
698 }
699 }
700 return -ENOENT;
701 } else {
702 *result = get_filesystem(fscid);
703 return 0;
704 }
705}
706
707void Filesystem::print(std::ostream &out) const
708{
709 out << "Filesystem '" << mds_map.fs_name
710 << "' (" << fscid << ")" << std::endl;
711 mds_map.print(out);
712}
713
11fdf7f2 714mds_gid_t FSMap::get_available_standby() const
7c673cae 715{
11fdf7f2
TL
716 for (const auto& [gid, info] : standby_daemons) {
717 ceph_assert(info.rank == MDS_RANK_NONE);
718 ceph_assert(info.state == MDSMap::STATE_STANDBY);
7c673cae 719
11fdf7f2 720 if (info.laggy() || info.is_frozen()) {
7c673cae
FG
721 continue;
722 }
723
11fdf7f2 724 return gid;
7c673cae 725 }
11fdf7f2 726 return MDS_GID_NONE;
7c673cae
FG
727}
728
11fdf7f2
TL
729mds_gid_t FSMap::find_replacement_for(mds_role_t role, std::string_view name) const
730{
731 auto&& fs = get_filesystem(role.fscid);
7c673cae 732
11fdf7f2
TL
733 // First see if we have a STANDBY_REPLAY
734 for (const auto& [gid, info] : fs->mds_map.mds_info) {
735 if (info.rank == role.rank && info.state == MDSMap::STATE_STANDBY_REPLAY) {
736 if (info.is_frozen()) {
737 /* the standby-replay is frozen, do nothing! */
738 return MDS_GID_NONE;
739 } else {
740 return gid;
741 }
7c673cae
FG
742 }
743 }
7c673cae 744
11fdf7f2 745 return get_available_standby();
7c673cae
FG
746}
747
748void FSMap::sanity() const
749{
750 if (legacy_client_fscid != FS_CLUSTER_ID_NONE) {
11fdf7f2 751 ceph_assert(filesystems.count(legacy_client_fscid) == 1);
7c673cae
FG
752 }
753
754 for (const auto &i : filesystems) {
755 auto fs = i.second;
11fdf7f2
TL
756 ceph_assert(fs->mds_map.compat.compare(compat) == 0);
757 ceph_assert(fs->fscid == i.first);
7c673cae 758 for (const auto &j : fs->mds_map.mds_info) {
11fdf7f2
TL
759 ceph_assert(j.second.rank != MDS_RANK_NONE);
760 ceph_assert(mds_roles.count(j.first) == 1);
761 ceph_assert(standby_daemons.count(j.first) == 0);
762 ceph_assert(standby_epochs.count(j.first) == 0);
763 ceph_assert(mds_roles.at(j.first) == i.first);
7c673cae 764 if (j.second.state != MDSMap::STATE_STANDBY_REPLAY) {
11fdf7f2
TL
765 ceph_assert(fs->mds_map.up.at(j.second.rank) == j.first);
766 ceph_assert(fs->mds_map.failed.count(j.second.rank) == 0);
767 ceph_assert(fs->mds_map.damaged.count(j.second.rank) == 0);
7c673cae
FG
768 }
769 }
770
771 for (const auto &j : fs->mds_map.up) {
772 mds_rank_t rank = j.first;
11fdf7f2 773 ceph_assert(fs->mds_map.in.count(rank) == 1);
7c673cae 774 mds_gid_t gid = j.second;
11fdf7f2 775 ceph_assert(fs->mds_map.mds_info.count(gid) == 1);
7c673cae
FG
776 }
777 }
778
779 for (const auto &i : standby_daemons) {
11fdf7f2
TL
780 ceph_assert(i.second.state == MDSMap::STATE_STANDBY);
781 ceph_assert(i.second.rank == MDS_RANK_NONE);
782 ceph_assert(i.second.global_id == i.first);
783 ceph_assert(standby_epochs.count(i.first) == 1);
784 ceph_assert(mds_roles.count(i.first) == 1);
785 ceph_assert(mds_roles.at(i.first) == FS_CLUSTER_ID_NONE);
7c673cae
FG
786 }
787
788 for (const auto &i : standby_epochs) {
11fdf7f2 789 ceph_assert(standby_daemons.count(i.first) == 1);
7c673cae
FG
790 }
791
792 for (const auto &i : mds_roles) {
793 if (i.second == FS_CLUSTER_ID_NONE) {
11fdf7f2 794 ceph_assert(standby_daemons.count(i.first) == 1);
7c673cae 795 } else {
11fdf7f2
TL
796 ceph_assert(filesystems.count(i.second) == 1);
797 ceph_assert(filesystems.at(i.second)->mds_map.mds_info.count(i.first) == 1);
7c673cae
FG
798 }
799 }
800}
801
802void FSMap::promote(
803 mds_gid_t standby_gid,
11fdf7f2 804 Filesystem& filesystem,
7c673cae
FG
805 mds_rank_t assigned_rank)
806{
11fdf7f2 807 ceph_assert(gid_exists(standby_gid));
7c673cae
FG
808 bool is_standby_replay = mds_roles.at(standby_gid) != FS_CLUSTER_ID_NONE;
809 if (!is_standby_replay) {
11fdf7f2
TL
810 ceph_assert(standby_daemons.count(standby_gid));
811 ceph_assert(standby_daemons.at(standby_gid).state == MDSMap::STATE_STANDBY);
7c673cae
FG
812 }
813
11fdf7f2 814 MDSMap &mds_map = filesystem.mds_map;
7c673cae
FG
815
816 // Insert daemon state to Filesystem
817 if (!is_standby_replay) {
818 mds_map.mds_info[standby_gid] = standby_daemons.at(standby_gid);
819 } else {
11fdf7f2
TL
820 ceph_assert(mds_map.mds_info.count(standby_gid));
821 ceph_assert(mds_map.mds_info.at(standby_gid).state == MDSMap::STATE_STANDBY_REPLAY);
822 ceph_assert(mds_map.mds_info.at(standby_gid).rank == assigned_rank);
7c673cae
FG
823 }
824 MDSMap::mds_info_t &info = mds_map.mds_info[standby_gid];
825
826 if (mds_map.stopped.erase(assigned_rank)) {
827 // The cluster is being expanded with a stopped rank
828 info.state = MDSMap::STATE_STARTING;
829 } else if (!mds_map.is_in(assigned_rank)) {
830 // The cluster is being expanded with a new rank
831 info.state = MDSMap::STATE_CREATING;
832 } else {
833 // An existing rank is being assigned to a replacement
834 info.state = MDSMap::STATE_REPLAY;
835 mds_map.failed.erase(assigned_rank);
836 }
837 info.rank = assigned_rank;
838 info.inc = epoch;
11fdf7f2 839 mds_roles[standby_gid] = filesystem.fscid;
7c673cae
FG
840
841 // Update the rank state in Filesystem
842 mds_map.in.insert(assigned_rank);
843 mds_map.up[assigned_rank] = standby_gid;
844
845 // Remove from the list of standbys
846 if (!is_standby_replay) {
847 standby_daemons.erase(standby_gid);
848 standby_epochs.erase(standby_gid);
849 }
850
851 // Indicate that Filesystem has been modified
852 mds_map.epoch = epoch;
853}
854
855void FSMap::assign_standby_replay(
856 const mds_gid_t standby_gid,
857 const fs_cluster_id_t leader_ns,
858 const mds_rank_t leader_rank)
859{
11fdf7f2
TL
860 ceph_assert(mds_roles.at(standby_gid) == FS_CLUSTER_ID_NONE);
861 ceph_assert(gid_exists(standby_gid));
862 ceph_assert(!gid_has_rank(standby_gid));
863 ceph_assert(standby_daemons.count(standby_gid));
7c673cae
FG
864
865 // Insert to the filesystem
866 auto fs = filesystems.at(leader_ns);
867 fs->mds_map.mds_info[standby_gid] = standby_daemons.at(standby_gid);
868 fs->mds_map.mds_info[standby_gid].rank = leader_rank;
869 fs->mds_map.mds_info[standby_gid].state = MDSMap::STATE_STANDBY_REPLAY;
870 mds_roles[standby_gid] = leader_ns;
871
872 // Remove from the list of standbys
873 standby_daemons.erase(standby_gid);
874 standby_epochs.erase(standby_gid);
875
876 // Indicate that Filesystem has been modified
877 fs->mds_map.epoch = epoch;
878}
879
880void FSMap::erase(mds_gid_t who, epoch_t blacklist_epoch)
881{
882 if (mds_roles.at(who) == FS_CLUSTER_ID_NONE) {
883 standby_daemons.erase(who);
884 standby_epochs.erase(who);
885 } else {
886 auto &fs = filesystems.at(mds_roles.at(who));
887 const auto &info = fs->mds_map.mds_info.at(who);
888 if (info.state != MDSMap::STATE_STANDBY_REPLAY) {
889 if (info.state == MDSMap::STATE_CREATING) {
890 // If this gid didn't make it past CREATING, then forget
891 // the rank ever existed so that next time it's handed out
892 // to a gid it'll go back into CREATING.
893 fs->mds_map.in.erase(info.rank);
894 } else {
895 // Put this rank into the failed list so that the next available
896 // STANDBY will pick it up.
897 fs->mds_map.failed.insert(info.rank);
898 }
11fdf7f2 899 ceph_assert(fs->mds_map.up.at(info.rank) == info.global_id);
7c673cae
FG
900 fs->mds_map.up.erase(info.rank);
901 }
902 fs->mds_map.mds_info.erase(who);
903 fs->mds_map.last_failure_osd_epoch = blacklist_epoch;
904 fs->mds_map.epoch = epoch;
905 }
906
907 mds_roles.erase(who);
908}
909
910void FSMap::damaged(mds_gid_t who, epoch_t blacklist_epoch)
911{
11fdf7f2 912 ceph_assert(mds_roles.at(who) != FS_CLUSTER_ID_NONE);
7c673cae
FG
913 auto fs = filesystems.at(mds_roles.at(who));
914 mds_rank_t rank = fs->mds_map.mds_info[who].rank;
915
916 erase(who, blacklist_epoch);
917 fs->mds_map.failed.erase(rank);
918 fs->mds_map.damaged.insert(rank);
919
11fdf7f2 920 ceph_assert(fs->mds_map.epoch == epoch);
7c673cae
FG
921}
922
923/**
924 * Update to indicate that the rank `rank` is to be removed
925 * from the damaged list of the filesystem `fscid`
926 */
927bool FSMap::undamaged(const fs_cluster_id_t fscid, const mds_rank_t rank)
928{
929 auto fs = filesystems.at(fscid);
930
931 if (fs->mds_map.damaged.erase(rank)) {
932 fs->mds_map.failed.insert(rank);
933 fs->mds_map.epoch = epoch;
934 return true;
935 } else {
936 return false;
937 }
938}
939
940void FSMap::insert(const MDSMap::mds_info_t &new_info)
941{
11fdf7f2
TL
942 ceph_assert(new_info.state == MDSMap::STATE_STANDBY);
943 ceph_assert(new_info.rank == MDS_RANK_NONE);
7c673cae
FG
944 mds_roles[new_info.global_id] = FS_CLUSTER_ID_NONE;
945 standby_daemons[new_info.global_id] = new_info;
946 standby_epochs[new_info.global_id] = epoch;
947}
948
949std::list<mds_gid_t> FSMap::stop(mds_gid_t who)
950{
11fdf7f2 951 ceph_assert(mds_roles.at(who) != FS_CLUSTER_ID_NONE);
7c673cae
FG
952 auto fs = filesystems.at(mds_roles.at(who));
953 const auto &info = fs->mds_map.mds_info.at(who);
954 fs->mds_map.up.erase(info.rank);
955 fs->mds_map.in.erase(info.rank);
956 fs->mds_map.stopped.insert(info.rank);
957
958 // Also drop any standby replays that were following this rank
959 std::list<mds_gid_t> standbys;
960 for (const auto &i : fs->mds_map.mds_info) {
961 const auto &other_gid = i.first;
962 const auto &other_info = i.second;
963 if (other_info.rank == info.rank
964 && other_info.state == MDSMap::STATE_STANDBY_REPLAY) {
965 standbys.push_back(other_gid);
966 erase(other_gid, 0);
967 }
968 }
969
970 fs->mds_map.mds_info.erase(who);
971 mds_roles.erase(who);
972
973 fs->mds_map.epoch = epoch;
974
975 return standbys;
976}
977
978
979/**
980 * Given one of the following forms:
981 * <fs name>:<rank>
982 * <fs id>:<rank>
983 * <rank>
984 *
985 * Parse into a mds_role_t. The rank-only form is only valid
986 * if legacy_client_ns is set.
987 */
988int FSMap::parse_role(
11fdf7f2 989 std::string_view role_str,
7c673cae
FG
990 mds_role_t *role,
991 std::ostream &ss) const
992{
993 size_t colon_pos = role_str.find(":");
994 size_t rank_pos;
11fdf7f2 995 Filesystem::const_ref fs;
7c673cae
FG
996 if (colon_pos == std::string::npos) {
997 if (legacy_client_fscid == FS_CLUSTER_ID_NONE) {
998 ss << "No filesystem selected";
999 return -ENOENT;
1000 }
1001 fs = get_filesystem(legacy_client_fscid);
1002 rank_pos = 0;
1003 } else {
1004 if (parse_filesystem(role_str.substr(0, colon_pos), &fs) < 0) {
1005 ss << "Invalid filesystem";
1006 return -ENOENT;
1007 }
1008 rank_pos = colon_pos+1;
1009 }
1010
1011 mds_rank_t rank;
1012 std::string err;
94b18763 1013 std::string rank_str(role_str.substr(rank_pos));
7c673cae
FG
1014 long rank_i = strict_strtol(rank_str.c_str(), 10, &err);
1015 if (rank_i < 0 || !err.empty()) {
1016 ss << "Invalid rank '" << rank_str << "'";
1017 return -EINVAL;
1018 } else {
1019 rank = rank_i;
1020 }
1021
1022 if (fs->mds_map.in.count(rank) == 0) {
1023 ss << "Rank '" << rank << "' not found";
1024 return -ENOENT;
1025 }
1026
1027 *role = {fs->fscid, rank};
1028
1029 return 0;
1030}