git.proxmox.com Git - ceph.git/blame - ceph/src/mds/FSMap.cc
import ceph 16.2.7
[ceph.git] / ceph / src / mds / FSMap.cc
CommitLineData
7c673cae
FG
1// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
2// vim: ts=8 sw=2 smarttab
3/*
4 * Ceph - scalable distributed file system
5 *
6 * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
7 *
8 * This is free software; you can redistribute it and/or
9 * modify it under the terms of the GNU Lesser General Public
10 * License version 2.1, as published by the Free Software
11 * Foundation. See file COPYING.
12 *
13 */
14
f67539c2 15#include <ostream>
7c673cae
FG
16
17#include "FSMap.h"
18
11fdf7f2 19#include "common/StackStringStream.h"
7c673cae 20
11fdf7f2
TL
21#ifdef WITH_SEASTAR
22#include "crimson/common/config_proxy.h"
23#else
24#include "common/config_proxy.h"
25#endif
26#include "global/global_context.h"
224ce89b
WB
27#include "mon/health_check.h"
28
f67539c2
TL
29using std::list;
30using std::pair;
31using std::ostream;
32using std::string;
33
34using ceph::bufferlist;
35using ceph::Formatter;
36
37void ClusterInfo::encode(ceph::buffer::list &bl) const {
38 ENCODE_START(1, 1, bl);
39 encode(client_name, bl);
40 encode(cluster_name, bl);
41 encode(fs_name, bl);
42 ENCODE_FINISH(bl);
43}
44
45void ClusterInfo::decode(ceph::buffer::list::const_iterator &iter) {
46 DECODE_START(1, iter);
47 decode(client_name, iter);
48 decode(cluster_name, iter);
49 decode(fs_name, iter);
50 DECODE_FINISH(iter);
51}
52
53void ClusterInfo::dump(ceph::Formatter *f) const {
54 f->dump_string("client_name", client_name);
55 f->dump_string("cluster_name", cluster_name);
56 f->dump_string("fs_name", fs_name);
57}
58
59void ClusterInfo::print(std::ostream& out) const {
60 out << "[client_name=" << client_name << ", cluster_name=" << cluster_name
61 << ", fs_name=" << fs_name << "]" << std::endl;
62}
63
64void Peer::encode(ceph::buffer::list &bl) const {
65 ENCODE_START(1, 1, bl);
66 encode(uuid, bl);
67 encode(remote, bl);
68 ENCODE_FINISH(bl);
69}
70
71void Peer::decode(ceph::buffer::list::const_iterator &iter) {
72 DECODE_START(1, iter);
73 decode(uuid, iter);
74 decode(remote, iter);
75 DECODE_FINISH(iter);
76}
77
78void Peer::dump(ceph::Formatter *f) const {
79 f->open_object_section(uuid);
80 f->dump_object("remote", remote);
81 f->close_section();
82}
83
84void Peer::print(std::ostream& out) const {
85 out << "[uuid=" << uuid << ", remote=" << remote << "]" << std::endl;
86}
87
88void MirrorInfo::encode(ceph::buffer::list &bl) const {
89 ENCODE_START(1, 1, bl);
90 encode(mirrored, bl);
91 encode(peers, bl);
92 ENCODE_FINISH(bl);
93}
94
95void MirrorInfo::decode(ceph::buffer::list::const_iterator &iter) {
96 DECODE_START(1, iter);
97 decode(mirrored, iter);
98 decode(peers, iter);
99 DECODE_FINISH(iter);
100}
101
102void MirrorInfo::dump(ceph::Formatter *f) const {
103 f->open_object_section("peers");
104 for (auto &peer : peers) {
105 peer.dump(f);
106 }
107 f->close_section(); // peers
108}
109
110void MirrorInfo::print(std::ostream& out) const {
111 out << "[peers=" << peers << "]" << std::endl;
112}
7c673cae
FG
113
114void Filesystem::dump(Formatter *f) const
115{
116 f->open_object_section("mdsmap");
117 mds_map.dump(f);
118 f->close_section();
119 f->dump_int("id", fscid);
f67539c2
TL
120 if (mirror_info.is_mirrored()) {
121 f->open_object_section("mirror_info");
122 mirror_info.dump(f);
123 f->close_section(); // mirror_info
124 }
7c673cae
FG
125}
126
127void FSMap::dump(Formatter *f) const
128{
129 f->dump_int("epoch", epoch);
11fdf7f2
TL
130 // Use 'default' naming to match 'set-default' CLI
131 f->dump_int("default_fscid", legacy_client_fscid);
7c673cae
FG
132
133 f->open_object_section("compat");
522d829b 134 default_compat.dump(f);
7c673cae
FG
135 f->close_section();
136
137 f->open_object_section("feature_flags");
138 f->dump_bool("enable_multiple", enable_multiple);
139 f->dump_bool("ever_enabled_multiple", ever_enabled_multiple);
140 f->close_section();
141
142 f->open_array_section("standbys");
9f95a23c 143 for (const auto& [gid, info] : standby_daemons) {
7c673cae 144 f->open_object_section("info");
9f95a23c
TL
145 info.dump(f);
146 f->dump_int("epoch", standby_epochs.at(gid));
7c673cae
FG
147 f->close_section();
148 }
149 f->close_section();
150
151 f->open_array_section("filesystems");
152 for (const auto &fs : filesystems) {
153 f->open_object_section("filesystem");
154 fs.second->dump(f);
155 f->close_section();
156 }
157 f->close_section();
158}
159
9f95a23c
TL
160FSMap &FSMap::operator=(const FSMap &rhs)
161{
162 epoch = rhs.epoch;
163 next_filesystem_id = rhs.next_filesystem_id;
164 legacy_client_fscid = rhs.legacy_client_fscid;
522d829b 165 default_compat = rhs.default_compat;
9f95a23c
TL
166 enable_multiple = rhs.enable_multiple;
167 mds_roles = rhs.mds_roles;
168 standby_daemons = rhs.standby_daemons;
169 standby_epochs = rhs.standby_epochs;
170
171 filesystems.clear();
172 for (const auto &i : rhs.filesystems) {
173 const auto &fs = i.second;
174 filesystems[fs->fscid] = std::make_shared<Filesystem>(*fs);
175 }
176
177 return *this;
178}
179
180void FSMap::generate_test_instances(std::list<FSMap*>& ls)
7c673cae
FG
181{
182 FSMap *m = new FSMap();
183
184 std::list<MDSMap*> mds_map_instances;
185 MDSMap::generate_test_instances(mds_map_instances);
186
187 int k = 20;
188 for (auto i : mds_map_instances) {
11fdf7f2 189 auto fs = Filesystem::create();
7c673cae
FG
190 fs->fscid = k++;
191 fs->mds_map = *i;
192 delete i;
193 m->filesystems[fs->fscid] = fs;
194 }
195 mds_map_instances.clear();
196
197 ls.push_back(m);
198}
199
200void FSMap::print(ostream& out) const
201{
202 out << "e" << epoch << std::endl;
203 out << "enable_multiple, ever_enabled_multiple: " << enable_multiple << ","
204 << ever_enabled_multiple << std::endl;
522d829b 205 out << "default compat: " << default_compat << std::endl;
7c673cae
FG
206 out << "legacy client fscid: " << legacy_client_fscid << std::endl;
207 out << " " << std::endl;
208
209 if (filesystems.empty()) {
210 out << "No filesystems configured" << std::endl;
7c673cae
FG
211 }
212
11fdf7f2
TL
213 for (const auto& p : filesystems) {
214 p.second->print(out);
7c673cae
FG
215 out << " " << std::endl << " " << std::endl; // Space out a bit
216 }
217
218 if (!standby_daemons.empty()) {
219 out << "Standby daemons:" << std::endl << " " << std::endl;
220 }
221
9f95a23c
TL
222 for (const auto& p : standby_daemons) {
223 out << p.second << std::endl;
7c673cae
FG
224 }
225}
226
f67539c2
TL
227void FSMap::print_daemon_summary(ostream& out) const
228{
229 // this appears in the "services:" section of "ceph status"
230 int num_up = 0, num_in = 0, num_failed = 0;
231 int num_standby_replay = 0;
232 for (auto& [fscid, fs] : filesystems) {
233 num_up += fs->mds_map.get_num_up_mds();
234 num_in += fs->mds_map.get_num_in_mds();
235 num_failed += fs->mds_map.get_num_failed_mds();
236 num_standby_replay += fs->mds_map.get_num_standby_replay_mds();
237 }
238 int num_standby = standby_daemons.size();
239 out << num_up << "/" << num_in << " daemons up";
240 if (num_failed) {
241 out << " (" << num_failed << " failed)";
242 }
243 if (num_standby) {
244 out << ", " << num_standby << " standby";
245 }
246 if (num_standby_replay) {
247 out << ", " << num_standby_replay << " hot standby";
248 }
249}
250
251void FSMap::print_fs_summary(ostream& out) const
252{
253 // this appears in the "data:" section of "ceph status"
254 if (!filesystems.empty()) {
255 int num_failed = 0, num_recovering = 0, num_stopped = 0, num_healthy = 0;
256 int num_damaged = 0;
257 for (auto& [fscid, fs] : filesystems) {
258 if (fs->mds_map.is_any_damaged()) {
259 ++num_damaged;
260 }
261 if (fs->mds_map.is_any_failed()) {
262 ++num_failed;
263 } else if (fs->mds_map.is_degraded()) {
264 ++num_recovering;
265 } else if (fs->mds_map.get_max_mds() == 0) {
266 ++num_stopped;
267 } else {
268 ++num_healthy;
269 }
270 }
271 out << " volumes: "
272 << num_healthy << "/" << filesystems.size() << " healthy";
273 if (num_recovering) {
274 out << ", " << num_recovering << " recovering";
275 }
276 if (num_failed) {
277 out << ", " << num_failed << " failed";
278 }
279 if (num_stopped) {
280 out << ", " << num_stopped << " stopped";
281 }
282 if (num_damaged) {
283 out << "; " << num_damaged << " damaged";
284 }
285 out << "\n";
286 }
287}
288
7c673cae
FG
289void FSMap::print_summary(Formatter *f, ostream *out) const
290{
7c673cae
FG
291 if (f) {
292 f->dump_unsigned("epoch", get_epoch());
11fdf7f2
TL
293 for (const auto &p : filesystems) {
294 auto& fs = p.second;
7c673cae
FG
295 f->dump_unsigned("id", fs->fscid);
296 f->dump_unsigned("up", fs->mds_map.up.size());
297 f->dump_unsigned("in", fs->mds_map.in.size());
298 f->dump_unsigned("max", fs->mds_map.max_mds);
299 }
300 } else {
11fdf7f2
TL
301 auto count = filesystems.size();
302 if (count <= 3) {
303 bool first = true;
304 for (const auto& p : filesystems) {
305 const auto& fs = p.second;
306 if (!first) {
307 *out << " ";
308 }
309 if (fs->mds_map.is_degraded()) {
310 *out << fs->mds_map.fs_name << ":" << fs->mds_map.up.size() << "/" << fs->mds_map.in.size();
311 } else {
312 *out << fs->mds_map.fs_name << ":" << fs->mds_map.in.size();
313 }
314 first = false;
315 }
316 } else {
317 *out << count << " fs";
318 unsigned degraded = 0;
319 CachedStackStringStream css;
320 *css << " (degraded: ";
321 for (const auto& p : filesystems) {
322 const auto& fs = p.second;
323 if (fs->mds_map.is_degraded()) {
324 degraded++;
325 if (degraded <= 3) {
326 *css << fs->mds_map.fs_name << ":" << fs->mds_map.up.size() << "/" << fs->mds_map.in.size();
327 }
328 }
329 }
330 if (degraded > 0) {
331 if (degraded <= 3) {
332 *css << ")";
333 *out << css->strv();
334 } else {
335 *out << " (degraded: " << degraded << " fs)";
336 }
337 }
7c673cae
FG
338 }
339 }
340
341 if (f) {
342 f->open_array_section("by_rank");
343 }
344
11fdf7f2
TL
345 std::map<MDSMap::DaemonState,unsigned> by_state;
346 std::map<mds_role_t, std::pair<MDSMap::DaemonState, std::string>> by_rank;
347 by_state[MDSMap::DaemonState::STATE_STANDBY] = standby_daemons.size();
348 for (const auto& [gid, fscid] : mds_roles) {
349 if (fscid == FS_CLUSTER_ID_NONE)
350 continue;
351
352 const auto& info = filesystems.at(fscid)->mds_map.get_info_gid(gid);
353 auto s = std::string(ceph_mds_state_name(info.state));
7c673cae
FG
354 if (info.laggy()) {
355 s += "(laggy or crashed)";
356 }
357
11fdf7f2
TL
358 if (f) {
359 f->open_object_section("mds");
360 f->dump_unsigned("filesystem_id", fscid);
361 f->dump_unsigned("rank", info.rank);
362 f->dump_string("name", info.name);
363 f->dump_string("status", s);
364 f->dump_unsigned("gid", gid);
365 f->close_section();
366 } else if (info.state != MDSMap::DaemonState::STATE_STANDBY_REPLAY) {
367 by_rank[mds_role_t(fscid, info.rank)] = std::make_pair(info.state, info.name + "=" + s);
7c673cae 368 }
11fdf7f2 369 by_state[info.state]++;
7c673cae
FG
370 }
371
372 if (f) {
373 f->close_section();
374 } else {
11fdf7f2 375 if (0 < by_rank.size() && by_rank.size() < 5) {
7c673cae
FG
376 if (filesystems.size() > 1) {
377 // Disambiguate filesystems
378 std::map<std::string, std::string> pretty;
11fdf7f2
TL
379 for (const auto& [role,status] : by_rank) {
380 const auto &fs_name = filesystems.at(role.fscid)->mds_map.fs_name;
381 CachedStackStringStream css;
382 *css << fs_name << ":" << role.rank;
383 pretty.emplace(std::piecewise_construct, std::forward_as_tuple(css->strv()), std::forward_as_tuple(status.second));
384 --by_state[status.first]; /* already printed! */
7c673cae
FG
385 }
386 *out << " " << pretty;
387 } else {
388 // Omit FSCID in output when only one filesystem exists
389 std::map<mds_rank_t, std::string> shortened;
11fdf7f2
TL
390 for (const auto& [role,status] : by_rank) {
391 shortened[role.rank] = status.second;
392 --by_state[status.first]; /* already printed! */
7c673cae
FG
393 }
394 *out << " " << shortened;
395 }
396 }
11fdf7f2
TL
397 for (const auto& [state, count] : by_state) {
398 if (count > 0) {
399 auto s = std::string_view(ceph_mds_state_name(state));
400 *out << " " << count << " " << s;
401 }
402 }
7c673cae
FG
403 }
404
11fdf7f2
TL
405 if (f) {
406 const auto state = MDSMap::DaemonState::STATE_STANDBY;
407 auto&& name = ceph_mds_state_name(state);
408 auto count = standby_daemons.size();
409 f->dump_unsigned(name, count);
7c673cae
FG
410 }
411
412 size_t failed = 0;
413 size_t damaged = 0;
11fdf7f2
TL
414 for (const auto& p : filesystems) {
415 auto& fs = p.second;
7c673cae
FG
416 failed += fs->mds_map.failed.size();
417 damaged += fs->mds_map.damaged.size();
418 }
419
420 if (failed > 0) {
421 if (f) {
422 f->dump_unsigned("failed", failed);
423 } else {
424 *out << ", " << failed << " failed";
425 }
426 }
427
428 if (damaged > 0) {
429 if (f) {
430 f->dump_unsigned("damaged", damaged);
431 } else {
432 *out << ", " << damaged << " damaged";
433 }
434 }
435 //if (stopped.size())
436 //out << ", " << stopped.size() << " stopped";
437}
438
9f95a23c
TL
439mds_gid_t Filesystem::get_standby_replay(mds_gid_t who) const
440{
441 for (const auto &i : mds_map.mds_info) {
442 const auto &info = i.second;
443 if (info.state == MDSMap::STATE_STANDBY_REPLAY
444 && info.rank == mds_map.mds_info.at(who).rank) {
445 return info.global_id;
446 }
447 }
448 return MDS_GID_NONE;
449}
7c673cae 450
11fdf7f2 451Filesystem::ref FSMap::create_filesystem(std::string_view name,
522d829b
TL
452 int64_t metadata_pool, int64_t data_pool, uint64_t features,
453 fs_cluster_id_t fscid)
7c673cae 454{
11fdf7f2 455 auto fs = Filesystem::create();
28e407b8 456 fs->mds_map.epoch = epoch;
11fdf7f2 457 fs->mds_map.fs_name = name;
31f18b77 458 fs->mds_map.data_pools.push_back(data_pool);
7c673cae
FG
459 fs->mds_map.metadata_pool = metadata_pool;
460 fs->mds_map.cas_pool = -1;
522d829b 461 fs->mds_map.compat = default_compat;
7c673cae
FG
462 fs->mds_map.created = ceph_clock_now();
463 fs->mds_map.modified = ceph_clock_now();
7c673cae 464 fs->mds_map.enabled = true;
522d829b
TL
465 if (fscid == FS_CLUSTER_ID_NONE) {
466 fs->fscid = next_filesystem_id++;
467 } else {
468 fs->fscid = fscid;
469 next_filesystem_id = std::max(fscid, (fs_cluster_id_t)next_filesystem_id) + 1;
470 }
471
472 // File system's ID can be FS_CLUSTER_ID_ANONYMOUS if we're recovering
473 // a legacy file system by passing FS_CLUSTER_ID_ANONYMOUS as the desired
474 // file system ID
475 if (fscid != FS_CLUSTER_ID_ANONYMOUS) {
476 // ANONYMOUS is only for upgrades from legacy mdsmaps, we should
477 // have initialized next_filesystem_id such that it's never used here.
478 ceph_assert(fs->fscid != FS_CLUSTER_ID_ANONYMOUS);
479 }
7c673cae
FG
480 filesystems[fs->fscid] = fs;
481
482 // Created first filesystem? Set it as the one
483 // for legacy clients to use
484 if (filesystems.size() == 1) {
485 legacy_client_fscid = fs->fscid;
486 }
11fdf7f2
TL
487
488 return fs;
7c673cae
FG
489}
490
9f95a23c
TL
491Filesystem::const_ref FSMap::get_filesystem(std::string_view name) const
492{
493 for (const auto& p : filesystems) {
494 if (p.second->mds_map.fs_name == name) {
495 return p.second;
496 }
497 }
498 return nullptr;
499}
500
501std::vector<Filesystem::const_ref> FSMap::get_filesystems(void) const
502{
503 std::vector<Filesystem::const_ref> ret;
504 for (const auto& p : filesystems) {
505 ret.push_back(p.second);
506 }
507 return ret;
508}
509
7c673cae
FG
510void FSMap::reset_filesystem(fs_cluster_id_t fscid)
511{
512 auto fs = get_filesystem(fscid);
11fdf7f2 513 auto new_fs = Filesystem::create();
7c673cae
FG
514
515 // Populate rank 0 as existing (so don't go into CREATING)
516 // but failed (so that next available MDS is assigned the rank)
517 new_fs->mds_map.in.insert(mds_rank_t(0));
518 new_fs->mds_map.failed.insert(mds_rank_t(0));
519
520 // Carry forward what makes sense
521 new_fs->fscid = fs->fscid;
522 new_fs->mds_map.inline_data_enabled = fs->mds_map.inline_data_enabled;
7c673cae
FG
523 new_fs->mds_map.data_pools = fs->mds_map.data_pools;
524 new_fs->mds_map.metadata_pool = fs->mds_map.metadata_pool;
525 new_fs->mds_map.cas_pool = fs->mds_map.cas_pool;
526 new_fs->mds_map.fs_name = fs->mds_map.fs_name;
522d829b 527 new_fs->mds_map.compat = default_compat;
7c673cae
FG
528 new_fs->mds_map.created = ceph_clock_now();
529 new_fs->mds_map.modified = ceph_clock_now();
7c673cae
FG
530 new_fs->mds_map.standby_count_wanted = fs->mds_map.standby_count_wanted;
531 new_fs->mds_map.enabled = true;
532
c07f9fc5
FG
533 // Remember mds ranks that have ever started. (They should load old inotable
534 // instead of creating new one if they start again.)
535 new_fs->mds_map.stopped.insert(fs->mds_map.in.begin(), fs->mds_map.in.end());
536 new_fs->mds_map.stopped.insert(fs->mds_map.stopped.begin(), fs->mds_map.stopped.end());
537 new_fs->mds_map.stopped.erase(mds_rank_t(0));
538
7c673cae
FG
539 // Persist the new FSMap
540 filesystems[new_fs->fscid] = new_fs;
541}
542
543void FSMap::get_health(list<pair<health_status_t,string> >& summary,
544 list<pair<health_status_t,string> > *detail) const
545{
546 mds_rank_t standby_count_wanted = 0;
547 for (const auto &i : filesystems) {
548 const auto &fs = i.second;
549
550 // TODO: move get_health up into here so that we can qualify
551 // all the messages with what filesystem they're talking about
552 fs->mds_map.get_health(summary, detail);
553
554 standby_count_wanted = std::max(standby_count_wanted, fs->mds_map.get_standby_count_wanted((mds_rank_t)standby_daemons.size()));
555 }
556
557 if (standby_count_wanted) {
f67539c2
TL
558 CachedStackStringStream css;
559 *css << "insufficient standby daemons available: have " << standby_daemons.size() << "; want " << standby_count_wanted << " more";
560 summary.push_back(make_pair(HEALTH_WARN, css->str()));
7c673cae
FG
561 }
562}
563
564bool FSMap::check_health(void)
565{
566 bool changed = false;
567 for (auto &i : filesystems) {
568 changed |= i.second->mds_map.check_health((mds_rank_t)standby_daemons.size());
569 }
570 return changed;
571}
572
224ce89b
WB
573void FSMap::get_health_checks(health_check_map_t *checks) const
574{
575 mds_rank_t standby_count_wanted = 0;
576 for (const auto &i : filesystems) {
577 const auto &fs = i.second;
578 health_check_map_t fschecks;
d2e6a577 579
224ce89b 580 fs->mds_map.get_health_checks(&fschecks);
d2e6a577
FG
581
582 // Some of the failed ranks might be transient (i.e. there are standbys
583 // ready to replace them). We will report only on "stuck" failed, i.e.
584 // ranks which are failed and have no standby replacement available.
585 std::set<mds_rank_t> stuck_failed;
586
587 for (const auto &rank : fs->mds_map.failed) {
9f95a23c
TL
588 auto rep_info = find_replacement_for({fs->fscid, rank});
589 if (!rep_info) {
d2e6a577
FG
590 stuck_failed.insert(rank);
591 }
592 }
593
594 // FS_WITH_FAILED_MDS
595 if (!stuck_failed.empty()) {
596 health_check_t& fscheck = checks->get_or_add(
597 "FS_WITH_FAILED_MDS", HEALTH_WARN,
9f95a23c 598 "%num% filesystem%plurals% %hasorhave% a failed mds daemon", 1);
f67539c2
TL
599 CachedStackStringStream css;
600 *css << "fs " << fs->mds_map.fs_name << " has " << stuck_failed.size()
d2e6a577 601 << " failed mds" << (stuck_failed.size() > 1 ? "s" : "");
f67539c2 602 fscheck.detail.push_back(css->str()); }
d2e6a577 603
224ce89b
WB
604 checks->merge(fschecks);
605 standby_count_wanted = std::max(
606 standby_count_wanted,
607 fs->mds_map.get_standby_count_wanted((mds_rank_t)standby_daemons.size()));
608 }
609
610 // MDS_INSUFFICIENT_STANDBY
611 if (standby_count_wanted) {
f67539c2
TL
612 CachedStackStringStream css1, css2;
613 *css1 << "insufficient standby MDS daemons available";
614 auto& d = checks->get_or_add("MDS_INSUFFICIENT_STANDBY", HEALTH_WARN, css1->str(), 1);
615 *css2 << "have " << standby_daemons.size() << "; want " << standby_count_wanted
616 << " more";
617 d.detail.push_back(css2->str());
224ce89b
WB
618 }
619}
620
9f95a23c
TL
621void FSMap::encode(bufferlist& bl, uint64_t features) const
622{
522d829b 623 ENCODE_START(STRUCT_VERSION, 6, bl);
9f95a23c
TL
624 encode(epoch, bl);
625 encode(next_filesystem_id, bl);
626 encode(legacy_client_fscid, bl);
522d829b 627 encode(default_compat, bl);
9f95a23c
TL
628 encode(enable_multiple, bl);
629 {
630 std::vector<Filesystem::ref> v;
631 v.reserve(filesystems.size());
632 for (auto& p : filesystems) v.emplace_back(p.second);
633 encode(v, bl, features);
7c673cae 634 }
9f95a23c
TL
635 encode(mds_roles, bl);
636 encode(standby_daemons, bl, features);
637 encode(standby_epochs, bl);
638 encode(ever_enabled_multiple, bl);
639 ENCODE_FINISH(bl);
7c673cae
FG
640}
641
11fdf7f2 642void FSMap::decode(bufferlist::const_iterator& p)
7c673cae 643{
a4b75251 644 struct_version = 0;
522d829b
TL
645 DECODE_START(STRUCT_VERSION, p);
646 DECODE_OLDEST(7);
647 struct_version = struct_v;
f67539c2
TL
648 decode(epoch, p);
649 decode(next_filesystem_id, p);
650 decode(legacy_client_fscid, p);
522d829b 651 decode(default_compat, p);
f67539c2
TL
652 decode(enable_multiple, p);
653 {
654 std::vector<Filesystem::ref> v;
655 decode(v, p);
7c673cae 656 filesystems.clear();
f67539c2
TL
657 for (auto& ref : v) {
658 auto em = filesystems.emplace(std::piecewise_construct, std::forward_as_tuple(ref->fscid), std::forward_as_tuple(std::move(ref)));
659 ceph_assert(em.second);
7c673cae
FG
660 }
661 }
f67539c2
TL
662 decode(mds_roles, p);
663 decode(standby_daemons, p);
664 decode(standby_epochs, p);
665 if (struct_v >= 7) {
666 decode(ever_enabled_multiple, p);
667 }
7c673cae
FG
668 DECODE_FINISH(p);
669}
670
11fdf7f2 671void FSMap::sanitize(const std::function<bool(int64_t pool)>& pool_exists)
3efd9988
FG
672{
673 for (auto &fs : filesystems) {
674 fs.second->mds_map.sanitize(pool_exists);
675 }
676}
7c673cae
FG
677
678void Filesystem::encode(bufferlist& bl, uint64_t features) const
679{
f67539c2 680 ENCODE_START(2, 1, bl);
11fdf7f2 681 encode(fscid, bl);
7c673cae
FG
682 bufferlist mdsmap_bl;
683 mds_map.encode(mdsmap_bl, features);
11fdf7f2 684 encode(mdsmap_bl, bl);
f67539c2 685 encode(mirror_info, bl);
7c673cae
FG
686 ENCODE_FINISH(bl);
687}
688
11fdf7f2 689void Filesystem::decode(bufferlist::const_iterator& p)
7c673cae 690{
f67539c2 691 DECODE_START(2, p);
11fdf7f2 692 decode(fscid, p);
7c673cae 693 bufferlist mdsmap_bl;
11fdf7f2
TL
694 decode(mdsmap_bl, p);
695 auto mdsmap_bl_iter = mdsmap_bl.cbegin();
7c673cae 696 mds_map.decode(mdsmap_bl_iter);
f67539c2
TL
697 if (struct_v >= 2) {
698 decode(mirror_info, p);
699 }
7c673cae
FG
700 DECODE_FINISH(p);
701}
702
703int FSMap::parse_filesystem(
11fdf7f2
TL
704 std::string_view ns_str,
705 Filesystem::const_ref* result
7c673cae
FG
706 ) const
707{
708 std::string ns_err;
94b18763
FG
709 std::string s(ns_str);
710 fs_cluster_id_t fscid = strict_strtol(s.c_str(), 10, &ns_err);
7c673cae
FG
711 if (!ns_err.empty() || filesystems.count(fscid) == 0) {
712 for (auto &fs : filesystems) {
94b18763 713 if (fs.second->mds_map.fs_name == s) {
7c673cae
FG
714 *result = std::const_pointer_cast<const Filesystem>(fs.second);
715 return 0;
716 }
717 }
f67539c2 718 return -CEPHFS_ENOENT;
7c673cae
FG
719 } else {
720 *result = get_filesystem(fscid);
721 return 0;
722 }
723}
724
725void Filesystem::print(std::ostream &out) const
726{
727 out << "Filesystem '" << mds_map.fs_name
728 << "' (" << fscid << ")" << std::endl;
729 mds_map.print(out);
f67539c2
TL
730 if (mirror_info.is_mirrored()) {
731 mirror_info.print(out);
732 }
7c673cae
FG
733}
734
9f95a23c 735bool FSMap::is_any_degraded() const
7c673cae 736{
9f95a23c
TL
737 for (auto& i : filesystems) {
738 if (i.second->mds_map.is_degraded()) {
739 return true;
740 }
741 }
742 return false;
743}
744
745std::map<mds_gid_t, MDSMap::mds_info_t> FSMap::get_mds_info() const
746{
747 std::map<mds_gid_t, mds_info_t> result;
748 for (const auto &i : standby_daemons) {
749 result[i.first] = i.second;
750 }
751
752 for (const auto &i : filesystems) {
753 const auto &fs_info = i.second->mds_map.get_mds_info();
754 for (const auto &j : fs_info) {
755 result[j.first] = j.second;
756 }
757 }
758
759 return result;
760}
761
522d829b 762const MDSMap::mds_info_t* FSMap::get_available_standby(const Filesystem& fs) const
9f95a23c 763{
522d829b 764 const bool upgradeable = fs.is_upgradeable();
9f95a23c 765 const mds_info_t* who = nullptr;
11fdf7f2
TL
766 for (const auto& [gid, info] : standby_daemons) {
767 ceph_assert(info.rank == MDS_RANK_NONE);
768 ceph_assert(info.state == MDSMap::STATE_STANDBY);
7c673cae 769
11fdf7f2 770 if (info.laggy() || info.is_frozen()) {
7c673cae 771 continue;
522d829b
TL
772 } else if (!info.compat.writeable(fs.mds_map.compat)) {
773 /* standby is not compatible with this fs */
774 continue;
775 } else if (!upgradeable && !fs.mds_map.compat.writeable(info.compat)) {
776 /* promotion would change fs.mds_map.compat and we're not upgradeable */
777 continue;
7c673cae
FG
778 }
779
522d829b 780 if (info.join_fscid == fs.fscid) {
9f95a23c
TL
781 who = &info;
782 break;
783 } else if (info.join_fscid == FS_CLUSTER_ID_NONE) {
784 who = &info; /* vanilla standby */
785 } else if (who == nullptr) {
786 who = &info; /* standby for another fs, last resort */
787 }
788 }
789 return who;
790}
791
792mds_gid_t FSMap::find_mds_gid_by_name(std::string_view s) const
793{
794 const auto info = get_mds_info();
795 for (const auto &p : info) {
796 if (p.second.name == s) {
797 return p.first;
798 }
7c673cae 799 }
11fdf7f2 800 return MDS_GID_NONE;
7c673cae
FG
801}
802
9f95a23c
TL
803const MDSMap::mds_info_t* FSMap::find_by_name(std::string_view name) const
804{
805 std::map<mds_gid_t, mds_info_t> result;
806 for (const auto &i : standby_daemons) {
807 if (i.second.name == name) {
808 return &(i.second);
809 }
810 }
811
812 for (const auto &i : filesystems) {
813 const auto &fs_info = i.second->mds_map.get_mds_info();
814 for (const auto &j : fs_info) {
815 if (j.second.name == name) {
816 return &(j.second);
817 }
818 }
819 }
820
821 return nullptr;
822}
823
824const MDSMap::mds_info_t* FSMap::find_replacement_for(mds_role_t role) const
11fdf7f2
TL
825{
826 auto&& fs = get_filesystem(role.fscid);
7c673cae 827
11fdf7f2
TL
828 // First see if we have a STANDBY_REPLAY
829 for (const auto& [gid, info] : fs->mds_map.mds_info) {
830 if (info.rank == role.rank && info.state == MDSMap::STATE_STANDBY_REPLAY) {
831 if (info.is_frozen()) {
832 /* the standby-replay is frozen, do nothing! */
9f95a23c 833 return nullptr;
11fdf7f2 834 } else {
522d829b 835 ceph_assert(info.compat.writeable(fs->mds_map.compat));
9f95a23c 836 return &info;
11fdf7f2 837 }
7c673cae
FG
838 }
839 }
7c673cae 840
522d829b 841 return get_available_standby(*fs);
7c673cae
FG
842}
843
a4b75251 844void FSMap::sanity(bool pending) const
7c673cae 845{
a4b75251
TL
846 /* Only do some sanity checks on **new** FSMaps. Older versions may not be
847 * compliant.
848 */
849
7c673cae 850 if (legacy_client_fscid != FS_CLUSTER_ID_NONE) {
11fdf7f2 851 ceph_assert(filesystems.count(legacy_client_fscid) == 1);
7c673cae
FG
852 }
853
522d829b
TL
854 for (const auto& [fscid, fs] : filesystems) {
855 ceph_assert(fscid == fs->fscid);
856 for (const auto& [gid, info] : fs->mds_map.mds_info) {
857 ceph_assert(info.rank != MDS_RANK_NONE);
858 ceph_assert(mds_roles.at(gid) == fscid);
859 ceph_assert(standby_daemons.count(gid) == 0);
860 ceph_assert(standby_epochs.count(gid) == 0);
861 if (info.state != MDSMap::STATE_STANDBY_REPLAY) {
862 ceph_assert(fs->mds_map.up.at(info.rank) == gid);
863 ceph_assert(fs->mds_map.failed.count(info.rank) == 0);
864 ceph_assert(fs->mds_map.damaged.count(info.rank) == 0);
865 } else {
a4b75251 866 ceph_assert(!pending || fs->mds_map.allows_standby_replay());
7c673cae 867 }
522d829b 868 ceph_assert(info.compat.writeable(fs->mds_map.compat));
7c673cae
FG
869 }
870
871 for (const auto &j : fs->mds_map.up) {
872 mds_rank_t rank = j.first;
11fdf7f2 873 ceph_assert(fs->mds_map.in.count(rank) == 1);
7c673cae 874 mds_gid_t gid = j.second;
11fdf7f2 875 ceph_assert(fs->mds_map.mds_info.count(gid) == 1);
7c673cae
FG
876 }
877 }
878
879 for (const auto &i : standby_daemons) {
11fdf7f2
TL
880 ceph_assert(i.second.state == MDSMap::STATE_STANDBY);
881 ceph_assert(i.second.rank == MDS_RANK_NONE);
882 ceph_assert(i.second.global_id == i.first);
883 ceph_assert(standby_epochs.count(i.first) == 1);
884 ceph_assert(mds_roles.count(i.first) == 1);
885 ceph_assert(mds_roles.at(i.first) == FS_CLUSTER_ID_NONE);
7c673cae
FG
886 }
887
888 for (const auto &i : standby_epochs) {
11fdf7f2 889 ceph_assert(standby_daemons.count(i.first) == 1);
7c673cae
FG
890 }
891
892 for (const auto &i : mds_roles) {
893 if (i.second == FS_CLUSTER_ID_NONE) {
11fdf7f2 894 ceph_assert(standby_daemons.count(i.first) == 1);
7c673cae 895 } else {
11fdf7f2
TL
896 ceph_assert(filesystems.count(i.second) == 1);
897 ceph_assert(filesystems.at(i.second)->mds_map.mds_info.count(i.first) == 1);
7c673cae
FG
898 }
899 }
900}
901
902void FSMap::promote(
903 mds_gid_t standby_gid,
11fdf7f2 904 Filesystem& filesystem,
7c673cae
FG
905 mds_rank_t assigned_rank)
906{
11fdf7f2 907 ceph_assert(gid_exists(standby_gid));
7c673cae
FG
908 bool is_standby_replay = mds_roles.at(standby_gid) != FS_CLUSTER_ID_NONE;
909 if (!is_standby_replay) {
11fdf7f2
TL
910 ceph_assert(standby_daemons.count(standby_gid));
911 ceph_assert(standby_daemons.at(standby_gid).state == MDSMap::STATE_STANDBY);
7c673cae
FG
912 }
913
11fdf7f2 914 MDSMap &mds_map = filesystem.mds_map;
7c673cae
FG
915
916 // Insert daemon state to Filesystem
917 if (!is_standby_replay) {
918 mds_map.mds_info[standby_gid] = standby_daemons.at(standby_gid);
919 } else {
11fdf7f2
TL
920 ceph_assert(mds_map.mds_info.count(standby_gid));
921 ceph_assert(mds_map.mds_info.at(standby_gid).state == MDSMap::STATE_STANDBY_REPLAY);
922 ceph_assert(mds_map.mds_info.at(standby_gid).rank == assigned_rank);
7c673cae 923 }
522d829b 924 auto& info = mds_map.mds_info.at(standby_gid);
7c673cae 925
a4b75251
TL
926 if (!filesystem.mds_map.compat.writeable(info.compat)) {
927 ceph_assert(filesystem.is_upgradeable());
928 filesystem.mds_map.compat.merge(info.compat);
929 }
930
7c673cae
FG
931 if (mds_map.stopped.erase(assigned_rank)) {
932 // The cluster is being expanded with a stopped rank
933 info.state = MDSMap::STATE_STARTING;
934 } else if (!mds_map.is_in(assigned_rank)) {
935 // The cluster is being expanded with a new rank
936 info.state = MDSMap::STATE_CREATING;
937 } else {
938 // An existing rank is being assigned to a replacement
939 info.state = MDSMap::STATE_REPLAY;
940 mds_map.failed.erase(assigned_rank);
941 }
942 info.rank = assigned_rank;
943 info.inc = epoch;
522d829b 944 mds_roles.at(standby_gid) = filesystem.fscid;
7c673cae
FG
945
946 // Update the rank state in Filesystem
947 mds_map.in.insert(assigned_rank);
948 mds_map.up[assigned_rank] = standby_gid;
949
950 // Remove from the list of standbys
951 if (!is_standby_replay) {
952 standby_daemons.erase(standby_gid);
953 standby_epochs.erase(standby_gid);
954 }
955
956 // Indicate that Filesystem has been modified
957 mds_map.epoch = epoch;
958}
959
960void FSMap::assign_standby_replay(
961 const mds_gid_t standby_gid,
962 const fs_cluster_id_t leader_ns,
963 const mds_rank_t leader_rank)
964{
11fdf7f2
TL
965 ceph_assert(mds_roles.at(standby_gid) == FS_CLUSTER_ID_NONE);
966 ceph_assert(gid_exists(standby_gid));
967 ceph_assert(!gid_has_rank(standby_gid));
968 ceph_assert(standby_daemons.count(standby_gid));
7c673cae
FG
969
970 // Insert to the filesystem
971 auto fs = filesystems.at(leader_ns);
972 fs->mds_map.mds_info[standby_gid] = standby_daemons.at(standby_gid);
973 fs->mds_map.mds_info[standby_gid].rank = leader_rank;
974 fs->mds_map.mds_info[standby_gid].state = MDSMap::STATE_STANDBY_REPLAY;
975 mds_roles[standby_gid] = leader_ns;
976
977 // Remove from the list of standbys
978 standby_daemons.erase(standby_gid);
979 standby_epochs.erase(standby_gid);
980
981 // Indicate that Filesystem has been modified
982 fs->mds_map.epoch = epoch;
983}
984
f67539c2 985void FSMap::erase(mds_gid_t who, epoch_t blocklist_epoch)
7c673cae
FG
986{
987 if (mds_roles.at(who) == FS_CLUSTER_ID_NONE) {
988 standby_daemons.erase(who);
989 standby_epochs.erase(who);
990 } else {
991 auto &fs = filesystems.at(mds_roles.at(who));
992 const auto &info = fs->mds_map.mds_info.at(who);
993 if (info.state != MDSMap::STATE_STANDBY_REPLAY) {
994 if (info.state == MDSMap::STATE_CREATING) {
995 // If this gid didn't make it past CREATING, then forget
996 // the rank ever existed so that next time it's handed out
997 // to a gid it'll go back into CREATING.
998 fs->mds_map.in.erase(info.rank);
999 } else {
1000 // Put this rank into the failed list so that the next available
1001 // STANDBY will pick it up.
1002 fs->mds_map.failed.insert(info.rank);
1003 }
11fdf7f2 1004 ceph_assert(fs->mds_map.up.at(info.rank) == info.global_id);
7c673cae
FG
1005 fs->mds_map.up.erase(info.rank);
1006 }
1007 fs->mds_map.mds_info.erase(who);
f67539c2 1008 fs->mds_map.last_failure_osd_epoch = blocklist_epoch;
7c673cae
FG
1009 fs->mds_map.epoch = epoch;
1010 }
1011
1012 mds_roles.erase(who);
1013}
1014
f67539c2 1015void FSMap::damaged(mds_gid_t who, epoch_t blocklist_epoch)
7c673cae 1016{
11fdf7f2 1017 ceph_assert(mds_roles.at(who) != FS_CLUSTER_ID_NONE);
7c673cae 1018 auto fs = filesystems.at(mds_roles.at(who));
a4b75251 1019 mds_rank_t rank = fs->mds_map.mds_info.at(who).rank;
7c673cae 1020
f67539c2 1021 erase(who, blocklist_epoch);
7c673cae
FG
1022 fs->mds_map.failed.erase(rank);
1023 fs->mds_map.damaged.insert(rank);
1024
11fdf7f2 1025 ceph_assert(fs->mds_map.epoch == epoch);
7c673cae
FG
1026}
1027
1028/**
1029 * Update to indicate that the rank `rank` is to be removed
1030 * from the damaged list of the filesystem `fscid`
1031 */
1032bool FSMap::undamaged(const fs_cluster_id_t fscid, const mds_rank_t rank)
1033{
1034 auto fs = filesystems.at(fscid);
1035
1036 if (fs->mds_map.damaged.erase(rank)) {
1037 fs->mds_map.failed.insert(rank);
1038 fs->mds_map.epoch = epoch;
1039 return true;
1040 } else {
1041 return false;
1042 }
1043}
1044
1045void FSMap::insert(const MDSMap::mds_info_t &new_info)
1046{
a4b75251
TL
1047 static const CompatSet empty;
1048
11fdf7f2
TL
1049 ceph_assert(new_info.state == MDSMap::STATE_STANDBY);
1050 ceph_assert(new_info.rank == MDS_RANK_NONE);
7c673cae 1051 mds_roles[new_info.global_id] = FS_CLUSTER_ID_NONE;
a4b75251
TL
1052 auto& info = standby_daemons[new_info.global_id];
1053 info = new_info;
1054 if (empty.compare(info.compat) == 0) {
1055 // bootstrap old compat: boot beacon contains empty compat on old (v16.2.4
1056 // or older) MDS.
1057 info.compat = MDSMap::get_compat_set_v16_2_4();
1058 }
7c673cae
FG
1059 standby_epochs[new_info.global_id] = epoch;
1060}
1061
9f95a23c 1062std::vector<mds_gid_t> FSMap::stop(mds_gid_t who)
7c673cae 1063{
11fdf7f2 1064 ceph_assert(mds_roles.at(who) != FS_CLUSTER_ID_NONE);
7c673cae
FG
1065 auto fs = filesystems.at(mds_roles.at(who));
1066 const auto &info = fs->mds_map.mds_info.at(who);
1067 fs->mds_map.up.erase(info.rank);
1068 fs->mds_map.in.erase(info.rank);
1069 fs->mds_map.stopped.insert(info.rank);
1070
1071 // Also drop any standby replays that were following this rank
9f95a23c 1072 std::vector<mds_gid_t> standbys;
7c673cae
FG
1073 for (const auto &i : fs->mds_map.mds_info) {
1074 const auto &other_gid = i.first;
1075 const auto &other_info = i.second;
1076 if (other_info.rank == info.rank
1077 && other_info.state == MDSMap::STATE_STANDBY_REPLAY) {
1078 standbys.push_back(other_gid);
1079 erase(other_gid, 0);
1080 }
1081 }
1082
1083 fs->mds_map.mds_info.erase(who);
1084 mds_roles.erase(who);
1085
1086 fs->mds_map.epoch = epoch;
1087
1088 return standbys;
1089}
1090
1091
1092/**
1093 * Given one of the following forms:
1094 * <fs name>:<rank>
1095 * <fs id>:<rank>
1096 * <rank>
1097 *
1098 * Parse into a mds_role_t. The rank-only form is only valid
1099 * if legacy_client_fscid is set (i.e. is not FS_CLUSTER_ID_NONE).
1100 */
f67539c2
TL
1101
1102int FSMap::parse_role(
1103 std::string_view role_str,
1104 mds_role_t *role,
1105 std::ostream &ss,
1106 const std::vector<string> &filter) const
1107{
1108 int r = parse_role(role_str, role, ss);
1109 if (r < 0) return r;
1110
1111 string_view fs_name = get_filesystem(role->fscid)->mds_map.get_fs_name();
1112
1113 if (!filter.empty() &&
1114 std::find(filter.begin(), filter.end(), fs_name) == filter.end()) {
1115 if (r >= 0) {
1116 ss << "Invalid file system";
1117 }
1118 return -CEPHFS_ENOENT;
1119 }
1120
1121 return r;
1122}
1123
7c673cae 1124int FSMap::parse_role(
11fdf7f2 1125 std::string_view role_str,
7c673cae
FG
1126 mds_role_t *role,
1127 std::ostream &ss) const
1128{
1129 size_t colon_pos = role_str.find(":");
1130 size_t rank_pos;
11fdf7f2 1131 Filesystem::const_ref fs;
7c673cae
FG
1132 if (colon_pos == std::string::npos) {
1133 if (legacy_client_fscid == FS_CLUSTER_ID_NONE) {
1134 ss << "No filesystem selected";
f67539c2 1135 return -CEPHFS_ENOENT;
7c673cae
FG
1136 }
1137 fs = get_filesystem(legacy_client_fscid);
1138 rank_pos = 0;
1139 } else {
1140 if (parse_filesystem(role_str.substr(0, colon_pos), &fs) < 0) {
1141 ss << "Invalid filesystem";
f67539c2 1142 return -CEPHFS_ENOENT;
7c673cae
FG
1143 }
1144 rank_pos = colon_pos+1;
1145 }
1146
1147 mds_rank_t rank;
1148 std::string err;
94b18763 1149 std::string rank_str(role_str.substr(rank_pos));
7c673cae
FG
1150 long rank_i = strict_strtol(rank_str.c_str(), 10, &err);
1151 if (rank_i < 0 || !err.empty()) {
1152 ss << "Invalid rank '" << rank_str << "'";
f67539c2 1153 return -CEPHFS_EINVAL;
7c673cae
FG
1154 } else {
1155 rank = rank_i;
1156 }
1157
1158 if (fs->mds_map.in.count(rank) == 0) {
1159 ss << "Rank '" << rank << "' not found";
f67539c2 1160 return -CEPHFS_ENOENT;
7c673cae
FG
1161 }
1162
1163 *role = {fs->fscid, rank};
1164
1165 return 0;
1166}
9f95a23c
TL
1167
1168bool FSMap::pool_in_use(int64_t poolid) const
1169{
1170 for (auto const &i : filesystems) {
1171 if (i.second->mds_map.is_data_pool(poolid)
1172 || i.second->mds_map.metadata_pool == poolid) {
1173 return true;
1174 }
1175 }
1176 return false;
1177}
1178
1179void FSMap::erase_filesystem(fs_cluster_id_t fscid)
1180{
1181 filesystems.erase(fscid);
1182 for (auto& [gid, info] : standby_daemons) {
1183 if (info.join_fscid == fscid) {
1184 modify_daemon(gid, [](auto& info) {
1185 info.join_fscid = FS_CLUSTER_ID_NONE;
1186 });
1187 }
1188 }
1189 for (auto& p : filesystems) {
1190 for (auto& [gid, info] : p.second->mds_map.get_mds_info()) {
1191 if (info.join_fscid == fscid) {
1192 modify_daemon(gid, [](auto& info) {
1193 info.join_fscid = FS_CLUSTER_ID_NONE;
1194 });
1195 }
1196 }
1197 }
1198}