]> git.proxmox.com Git - ceph.git/blame - ceph/src/mds/FSMap.cc
import quincy beta 17.1.0
[ceph.git] / ceph / src / mds / FSMap.cc
CommitLineData
7c673cae
FG
1// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
2// vim: ts=8 sw=2 smarttab
3/*
4 * Ceph - scalable distributed file system
5 *
6 * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
7 *
8 * This is free software; you can redistribute it and/or
9 * modify it under the terms of the GNU Lesser General Public
10 * License version 2.1, as published by the Free Software
11 * Foundation. See file COPYING.
12 *
13 */
14
f67539c2 15#include <ostream>
7c673cae
FG
16
17#include "FSMap.h"
18
11fdf7f2 19#include "common/StackStringStream.h"
7c673cae 20
11fdf7f2
TL
21#ifdef WITH_SEASTAR
22#include "crimson/common/config_proxy.h"
23#else
24#include "common/config_proxy.h"
25#endif
26#include "global/global_context.h"
224ce89b
WB
27#include "mon/health_check.h"
28
f67539c2
TL
29using std::list;
30using std::pair;
31using std::ostream;
32using std::string;
20effc67 33using std::string_view;
f67539c2
TL
34
35using ceph::bufferlist;
36using ceph::Formatter;
37
38void ClusterInfo::encode(ceph::buffer::list &bl) const {
39 ENCODE_START(1, 1, bl);
40 encode(client_name, bl);
41 encode(cluster_name, bl);
42 encode(fs_name, bl);
43 ENCODE_FINISH(bl);
44}
45
46void ClusterInfo::decode(ceph::buffer::list::const_iterator &iter) {
47 DECODE_START(1, iter);
48 decode(client_name, iter);
49 decode(cluster_name, iter);
50 decode(fs_name, iter);
51 DECODE_FINISH(iter);
52}
53
54void ClusterInfo::dump(ceph::Formatter *f) const {
55 f->dump_string("client_name", client_name);
56 f->dump_string("cluster_name", cluster_name);
57 f->dump_string("fs_name", fs_name);
58}
59
60void ClusterInfo::print(std::ostream& out) const {
61 out << "[client_name=" << client_name << ", cluster_name=" << cluster_name
62 << ", fs_name=" << fs_name << "]" << std::endl;
63}
64
65void Peer::encode(ceph::buffer::list &bl) const {
66 ENCODE_START(1, 1, bl);
67 encode(uuid, bl);
68 encode(remote, bl);
69 ENCODE_FINISH(bl);
70}
71
72void Peer::decode(ceph::buffer::list::const_iterator &iter) {
73 DECODE_START(1, iter);
74 decode(uuid, iter);
75 decode(remote, iter);
76 DECODE_FINISH(iter);
77}
78
79void Peer::dump(ceph::Formatter *f) const {
80 f->open_object_section(uuid);
81 f->dump_object("remote", remote);
82 f->close_section();
83}
84
85void Peer::print(std::ostream& out) const {
86 out << "[uuid=" << uuid << ", remote=" << remote << "]" << std::endl;
87}
88
89void MirrorInfo::encode(ceph::buffer::list &bl) const {
90 ENCODE_START(1, 1, bl);
91 encode(mirrored, bl);
92 encode(peers, bl);
93 ENCODE_FINISH(bl);
94}
95
96void MirrorInfo::decode(ceph::buffer::list::const_iterator &iter) {
97 DECODE_START(1, iter);
98 decode(mirrored, iter);
99 decode(peers, iter);
100 DECODE_FINISH(iter);
101}
102
103void MirrorInfo::dump(ceph::Formatter *f) const {
104 f->open_object_section("peers");
105 for (auto &peer : peers) {
106 peer.dump(f);
107 }
108 f->close_section(); // peers
109}
110
111void MirrorInfo::print(std::ostream& out) const {
112 out << "[peers=" << peers << "]" << std::endl;
113}
7c673cae
FG
114
115void Filesystem::dump(Formatter *f) const
116{
117 f->open_object_section("mdsmap");
118 mds_map.dump(f);
119 f->close_section();
120 f->dump_int("id", fscid);
f67539c2
TL
121 if (mirror_info.is_mirrored()) {
122 f->open_object_section("mirror_info");
123 mirror_info.dump(f);
124 f->close_section(); // mirror_info
125 }
7c673cae
FG
126}
127
128void FSMap::dump(Formatter *f) const
129{
130 f->dump_int("epoch", epoch);
11fdf7f2
TL
131 // Use 'default' naming to match 'set-default' CLI
132 f->dump_int("default_fscid", legacy_client_fscid);
7c673cae
FG
133
134 f->open_object_section("compat");
522d829b 135 default_compat.dump(f);
7c673cae
FG
136 f->close_section();
137
138 f->open_object_section("feature_flags");
139 f->dump_bool("enable_multiple", enable_multiple);
140 f->dump_bool("ever_enabled_multiple", ever_enabled_multiple);
141 f->close_section();
142
143 f->open_array_section("standbys");
9f95a23c 144 for (const auto& [gid, info] : standby_daemons) {
7c673cae 145 f->open_object_section("info");
9f95a23c
TL
146 info.dump(f);
147 f->dump_int("epoch", standby_epochs.at(gid));
7c673cae
FG
148 f->close_section();
149 }
150 f->close_section();
151
152 f->open_array_section("filesystems");
153 for (const auto &fs : filesystems) {
154 f->open_object_section("filesystem");
155 fs.second->dump(f);
156 f->close_section();
157 }
158 f->close_section();
159}
160
9f95a23c
TL
161FSMap &FSMap::operator=(const FSMap &rhs)
162{
163 epoch = rhs.epoch;
164 next_filesystem_id = rhs.next_filesystem_id;
165 legacy_client_fscid = rhs.legacy_client_fscid;
522d829b 166 default_compat = rhs.default_compat;
9f95a23c
TL
167 enable_multiple = rhs.enable_multiple;
168 mds_roles = rhs.mds_roles;
169 standby_daemons = rhs.standby_daemons;
170 standby_epochs = rhs.standby_epochs;
171
172 filesystems.clear();
173 for (const auto &i : rhs.filesystems) {
174 const auto &fs = i.second;
175 filesystems[fs->fscid] = std::make_shared<Filesystem>(*fs);
176 }
177
178 return *this;
179}
180
181void FSMap::generate_test_instances(std::list<FSMap*>& ls)
7c673cae
FG
182{
183 FSMap *m = new FSMap();
184
185 std::list<MDSMap*> mds_map_instances;
186 MDSMap::generate_test_instances(mds_map_instances);
187
188 int k = 20;
189 for (auto i : mds_map_instances) {
11fdf7f2 190 auto fs = Filesystem::create();
7c673cae
FG
191 fs->fscid = k++;
192 fs->mds_map = *i;
193 delete i;
194 m->filesystems[fs->fscid] = fs;
195 }
196 mds_map_instances.clear();
197
198 ls.push_back(m);
199}
200
201void FSMap::print(ostream& out) const
202{
203 out << "e" << epoch << std::endl;
204 out << "enable_multiple, ever_enabled_multiple: " << enable_multiple << ","
205 << ever_enabled_multiple << std::endl;
522d829b 206 out << "default compat: " << default_compat << std::endl;
7c673cae
FG
207 out << "legacy client fscid: " << legacy_client_fscid << std::endl;
208 out << " " << std::endl;
209
210 if (filesystems.empty()) {
211 out << "No filesystems configured" << std::endl;
7c673cae
FG
212 }
213
11fdf7f2
TL
214 for (const auto& p : filesystems) {
215 p.second->print(out);
7c673cae
FG
216 out << " " << std::endl << " " << std::endl; // Space out a bit
217 }
218
219 if (!standby_daemons.empty()) {
220 out << "Standby daemons:" << std::endl << " " << std::endl;
221 }
222
9f95a23c
TL
223 for (const auto& p : standby_daemons) {
224 out << p.second << std::endl;
7c673cae
FG
225 }
226}
227
f67539c2
TL
228void FSMap::print_daemon_summary(ostream& out) const
229{
230 // this appears in the "services:" section of "ceph status"
231 int num_up = 0, num_in = 0, num_failed = 0;
232 int num_standby_replay = 0;
233 for (auto& [fscid, fs] : filesystems) {
234 num_up += fs->mds_map.get_num_up_mds();
235 num_in += fs->mds_map.get_num_in_mds();
236 num_failed += fs->mds_map.get_num_failed_mds();
237 num_standby_replay += fs->mds_map.get_num_standby_replay_mds();
238 }
239 int num_standby = standby_daemons.size();
240 out << num_up << "/" << num_in << " daemons up";
241 if (num_failed) {
242 out << " (" << num_failed << " failed)";
243 }
244 if (num_standby) {
245 out << ", " << num_standby << " standby";
246 }
247 if (num_standby_replay) {
248 out << ", " << num_standby_replay << " hot standby";
249 }
250}
251
252void FSMap::print_fs_summary(ostream& out) const
253{
254 // this appears in the "data:" section of "ceph status"
255 if (!filesystems.empty()) {
256 int num_failed = 0, num_recovering = 0, num_stopped = 0, num_healthy = 0;
257 int num_damaged = 0;
258 for (auto& [fscid, fs] : filesystems) {
259 if (fs->mds_map.is_any_damaged()) {
260 ++num_damaged;
261 }
262 if (fs->mds_map.is_any_failed()) {
263 ++num_failed;
264 } else if (fs->mds_map.is_degraded()) {
265 ++num_recovering;
266 } else if (fs->mds_map.get_max_mds() == 0) {
267 ++num_stopped;
268 } else {
269 ++num_healthy;
270 }
271 }
272 out << " volumes: "
273 << num_healthy << "/" << filesystems.size() << " healthy";
274 if (num_recovering) {
275 out << ", " << num_recovering << " recovering";
276 }
277 if (num_failed) {
278 out << ", " << num_failed << " failed";
279 }
280 if (num_stopped) {
281 out << ", " << num_stopped << " stopped";
282 }
283 if (num_damaged) {
284 out << "; " << num_damaged << " damaged";
285 }
286 out << "\n";
287 }
288}
289
7c673cae
FG
290void FSMap::print_summary(Formatter *f, ostream *out) const
291{
7c673cae
FG
292 if (f) {
293 f->dump_unsigned("epoch", get_epoch());
11fdf7f2
TL
294 for (const auto &p : filesystems) {
295 auto& fs = p.second;
7c673cae
FG
296 f->dump_unsigned("id", fs->fscid);
297 f->dump_unsigned("up", fs->mds_map.up.size());
298 f->dump_unsigned("in", fs->mds_map.in.size());
299 f->dump_unsigned("max", fs->mds_map.max_mds);
300 }
301 } else {
11fdf7f2
TL
302 auto count = filesystems.size();
303 if (count <= 3) {
304 bool first = true;
305 for (const auto& p : filesystems) {
306 const auto& fs = p.second;
307 if (!first) {
308 *out << " ";
309 }
310 if (fs->mds_map.is_degraded()) {
311 *out << fs->mds_map.fs_name << ":" << fs->mds_map.up.size() << "/" << fs->mds_map.in.size();
312 } else {
313 *out << fs->mds_map.fs_name << ":" << fs->mds_map.in.size();
314 }
315 first = false;
316 }
317 } else {
318 *out << count << " fs";
319 unsigned degraded = 0;
320 CachedStackStringStream css;
321 *css << " (degraded: ";
322 for (const auto& p : filesystems) {
323 const auto& fs = p.second;
324 if (fs->mds_map.is_degraded()) {
325 degraded++;
326 if (degraded <= 3) {
327 *css << fs->mds_map.fs_name << ":" << fs->mds_map.up.size() << "/" << fs->mds_map.in.size();
328 }
329 }
330 }
331 if (degraded > 0) {
332 if (degraded <= 3) {
333 *css << ")";
334 *out << css->strv();
335 } else {
336 *out << " (degraded: " << degraded << " fs)";
337 }
338 }
7c673cae
FG
339 }
340 }
341
342 if (f) {
343 f->open_array_section("by_rank");
344 }
345
11fdf7f2
TL
346 std::map<MDSMap::DaemonState,unsigned> by_state;
347 std::map<mds_role_t, std::pair<MDSMap::DaemonState, std::string>> by_rank;
348 by_state[MDSMap::DaemonState::STATE_STANDBY] = standby_daemons.size();
349 for (const auto& [gid, fscid] : mds_roles) {
350 if (fscid == FS_CLUSTER_ID_NONE)
351 continue;
352
353 const auto& info = filesystems.at(fscid)->mds_map.get_info_gid(gid);
354 auto s = std::string(ceph_mds_state_name(info.state));
7c673cae
FG
355 if (info.laggy()) {
356 s += "(laggy or crashed)";
357 }
358
11fdf7f2
TL
359 if (f) {
360 f->open_object_section("mds");
361 f->dump_unsigned("filesystem_id", fscid);
362 f->dump_unsigned("rank", info.rank);
363 f->dump_string("name", info.name);
364 f->dump_string("status", s);
365 f->dump_unsigned("gid", gid);
366 f->close_section();
367 } else if (info.state != MDSMap::DaemonState::STATE_STANDBY_REPLAY) {
368 by_rank[mds_role_t(fscid, info.rank)] = std::make_pair(info.state, info.name + "=" + s);
7c673cae 369 }
11fdf7f2 370 by_state[info.state]++;
7c673cae
FG
371 }
372
373 if (f) {
374 f->close_section();
375 } else {
11fdf7f2 376 if (0 < by_rank.size() && by_rank.size() < 5) {
7c673cae
FG
377 if (filesystems.size() > 1) {
378 // Disambiguate filesystems
379 std::map<std::string, std::string> pretty;
11fdf7f2
TL
380 for (const auto& [role,status] : by_rank) {
381 const auto &fs_name = filesystems.at(role.fscid)->mds_map.fs_name;
382 CachedStackStringStream css;
383 *css << fs_name << ":" << role.rank;
384 pretty.emplace(std::piecewise_construct, std::forward_as_tuple(css->strv()), std::forward_as_tuple(status.second));
385 --by_state[status.first]; /* already printed! */
7c673cae
FG
386 }
387 *out << " " << pretty;
388 } else {
389 // Omit FSCID in output when only one filesystem exists
390 std::map<mds_rank_t, std::string> shortened;
11fdf7f2
TL
391 for (const auto& [role,status] : by_rank) {
392 shortened[role.rank] = status.second;
393 --by_state[status.first]; /* already printed! */
7c673cae
FG
394 }
395 *out << " " << shortened;
396 }
397 }
11fdf7f2
TL
398 for (const auto& [state, count] : by_state) {
399 if (count > 0) {
400 auto s = std::string_view(ceph_mds_state_name(state));
401 *out << " " << count << " " << s;
402 }
403 }
7c673cae
FG
404 }
405
11fdf7f2
TL
406 if (f) {
407 const auto state = MDSMap::DaemonState::STATE_STANDBY;
408 auto&& name = ceph_mds_state_name(state);
409 auto count = standby_daemons.size();
410 f->dump_unsigned(name, count);
7c673cae
FG
411 }
412
413 size_t failed = 0;
414 size_t damaged = 0;
11fdf7f2
TL
415 for (const auto& p : filesystems) {
416 auto& fs = p.second;
7c673cae
FG
417 failed += fs->mds_map.failed.size();
418 damaged += fs->mds_map.damaged.size();
419 }
420
421 if (failed > 0) {
422 if (f) {
423 f->dump_unsigned("failed", failed);
424 } else {
425 *out << ", " << failed << " failed";
426 }
427 }
428
429 if (damaged > 0) {
430 if (f) {
431 f->dump_unsigned("damaged", damaged);
432 } else {
433 *out << ", " << damaged << " damaged";
434 }
435 }
436 //if (stopped.size())
437 //out << ", " << stopped.size() << " stopped";
438}
439
9f95a23c
TL
440mds_gid_t Filesystem::get_standby_replay(mds_gid_t who) const
441{
442 for (const auto &i : mds_map.mds_info) {
443 const auto &info = i.second;
444 if (info.state == MDSMap::STATE_STANDBY_REPLAY
445 && info.rank == mds_map.mds_info.at(who).rank) {
446 return info.global_id;
447 }
448 }
449 return MDS_GID_NONE;
450}
7c673cae 451
11fdf7f2 452Filesystem::ref FSMap::create_filesystem(std::string_view name,
522d829b 453 int64_t metadata_pool, int64_t data_pool, uint64_t features,
20effc67 454 fs_cluster_id_t fscid, bool recover)
7c673cae 455{
11fdf7f2 456 auto fs = Filesystem::create();
28e407b8 457 fs->mds_map.epoch = epoch;
11fdf7f2 458 fs->mds_map.fs_name = name;
31f18b77 459 fs->mds_map.data_pools.push_back(data_pool);
7c673cae
FG
460 fs->mds_map.metadata_pool = metadata_pool;
461 fs->mds_map.cas_pool = -1;
522d829b 462 fs->mds_map.compat = default_compat;
7c673cae
FG
463 fs->mds_map.created = ceph_clock_now();
464 fs->mds_map.modified = ceph_clock_now();
7c673cae 465 fs->mds_map.enabled = true;
522d829b
TL
466 if (fscid == FS_CLUSTER_ID_NONE) {
467 fs->fscid = next_filesystem_id++;
468 } else {
469 fs->fscid = fscid;
470 next_filesystem_id = std::max(fscid, (fs_cluster_id_t)next_filesystem_id) + 1;
471 }
472
20effc67
TL
473 if (recover) {
474 // Populate rank 0 as existing (so don't go into CREATING)
475 // but failed (so that next available MDS is assigned the rank)
476 fs->mds_map.in.insert(mds_rank_t(0));
477 fs->mds_map.failed.insert(mds_rank_t(0));
478
479 fs->mds_map.set_flag(CEPH_MDSMAP_NOT_JOINABLE);
480 }
481
522d829b
TL
482 // File system's ID can be FS_CLUSTER_ID_ANONYMOUS if we're recovering
483 // a legacy file system by passing FS_CLUSTER_ID_ANONYMOUS as the desired
484 // file system ID
485 if (fscid != FS_CLUSTER_ID_ANONYMOUS) {
486 // ANONYMOUS is only for upgrades from legacy mdsmaps, we should
487 // have initialized next_filesystem_id such that it's never used here.
488 ceph_assert(fs->fscid != FS_CLUSTER_ID_ANONYMOUS);
489 }
7c673cae
FG
490 filesystems[fs->fscid] = fs;
491
492 // Created first filesystem? Set it as the one
493 // for legacy clients to use
494 if (filesystems.size() == 1) {
495 legacy_client_fscid = fs->fscid;
496 }
11fdf7f2
TL
497
498 return fs;
7c673cae
FG
499}
500
9f95a23c
TL
501Filesystem::const_ref FSMap::get_filesystem(std::string_view name) const
502{
503 for (const auto& p : filesystems) {
504 if (p.second->mds_map.fs_name == name) {
505 return p.second;
506 }
507 }
508 return nullptr;
509}
510
511std::vector<Filesystem::const_ref> FSMap::get_filesystems(void) const
512{
513 std::vector<Filesystem::const_ref> ret;
514 for (const auto& p : filesystems) {
515 ret.push_back(p.second);
516 }
517 return ret;
518}
519
7c673cae
FG
520void FSMap::reset_filesystem(fs_cluster_id_t fscid)
521{
522 auto fs = get_filesystem(fscid);
11fdf7f2 523 auto new_fs = Filesystem::create();
7c673cae
FG
524
525 // Populate rank 0 as existing (so don't go into CREATING)
526 // but failed (so that next available MDS is assigned the rank)
527 new_fs->mds_map.in.insert(mds_rank_t(0));
528 new_fs->mds_map.failed.insert(mds_rank_t(0));
529
530 // Carry forward what makes sense
531 new_fs->fscid = fs->fscid;
532 new_fs->mds_map.inline_data_enabled = fs->mds_map.inline_data_enabled;
7c673cae
FG
533 new_fs->mds_map.data_pools = fs->mds_map.data_pools;
534 new_fs->mds_map.metadata_pool = fs->mds_map.metadata_pool;
535 new_fs->mds_map.cas_pool = fs->mds_map.cas_pool;
536 new_fs->mds_map.fs_name = fs->mds_map.fs_name;
522d829b 537 new_fs->mds_map.compat = default_compat;
7c673cae
FG
538 new_fs->mds_map.created = ceph_clock_now();
539 new_fs->mds_map.modified = ceph_clock_now();
7c673cae
FG
540 new_fs->mds_map.standby_count_wanted = fs->mds_map.standby_count_wanted;
541 new_fs->mds_map.enabled = true;
542
c07f9fc5
FG
543 // Remember mds ranks that have ever started. (They should load old inotable
544 // instead of creating new one if they start again.)
545 new_fs->mds_map.stopped.insert(fs->mds_map.in.begin(), fs->mds_map.in.end());
546 new_fs->mds_map.stopped.insert(fs->mds_map.stopped.begin(), fs->mds_map.stopped.end());
547 new_fs->mds_map.stopped.erase(mds_rank_t(0));
548
7c673cae
FG
549 // Persist the new FSMap
550 filesystems[new_fs->fscid] = new_fs;
551}
552
553void FSMap::get_health(list<pair<health_status_t,string> >& summary,
554 list<pair<health_status_t,string> > *detail) const
555{
556 mds_rank_t standby_count_wanted = 0;
557 for (const auto &i : filesystems) {
558 const auto &fs = i.second;
559
560 // TODO: move get_health up into here so that we can qualify
561 // all the messages with what filesystem they're talking about
562 fs->mds_map.get_health(summary, detail);
563
564 standby_count_wanted = std::max(standby_count_wanted, fs->mds_map.get_standby_count_wanted((mds_rank_t)standby_daemons.size()));
565 }
566
567 if (standby_count_wanted) {
f67539c2
TL
568 CachedStackStringStream css;
569 *css << "insufficient standby daemons available: have " << standby_daemons.size() << "; want " << standby_count_wanted << " more";
570 summary.push_back(make_pair(HEALTH_WARN, css->str()));
7c673cae
FG
571 }
572}
573
574bool FSMap::check_health(void)
575{
576 bool changed = false;
577 for (auto &i : filesystems) {
578 changed |= i.second->mds_map.check_health((mds_rank_t)standby_daemons.size());
579 }
580 return changed;
581}
582
224ce89b
WB
583void FSMap::get_health_checks(health_check_map_t *checks) const
584{
585 mds_rank_t standby_count_wanted = 0;
586 for (const auto &i : filesystems) {
587 const auto &fs = i.second;
588 health_check_map_t fschecks;
d2e6a577 589
224ce89b 590 fs->mds_map.get_health_checks(&fschecks);
d2e6a577
FG
591
592 // Some of the failed ranks might be transient (i.e. there are standbys
593 // ready to replace them). We will report only on "stuck" failed, i.e.
594 // ranks which are failed and have no standby replacement available.
595 std::set<mds_rank_t> stuck_failed;
596
597 for (const auto &rank : fs->mds_map.failed) {
9f95a23c
TL
598 auto rep_info = find_replacement_for({fs->fscid, rank});
599 if (!rep_info) {
d2e6a577
FG
600 stuck_failed.insert(rank);
601 }
602 }
603
604 // FS_WITH_FAILED_MDS
605 if (!stuck_failed.empty()) {
606 health_check_t& fscheck = checks->get_or_add(
607 "FS_WITH_FAILED_MDS", HEALTH_WARN,
9f95a23c 608 "%num% filesystem%plurals% %hasorhave% a failed mds daemon", 1);
f67539c2
TL
609 CachedStackStringStream css;
610 *css << "fs " << fs->mds_map.fs_name << " has " << stuck_failed.size()
d2e6a577 611 << " failed mds" << (stuck_failed.size() > 1 ? "s" : "");
f67539c2 612 fscheck.detail.push_back(css->str()); }
d2e6a577 613
224ce89b
WB
614 checks->merge(fschecks);
615 standby_count_wanted = std::max(
616 standby_count_wanted,
617 fs->mds_map.get_standby_count_wanted((mds_rank_t)standby_daemons.size()));
618 }
619
620 // MDS_INSUFFICIENT_STANDBY
621 if (standby_count_wanted) {
f67539c2
TL
622 CachedStackStringStream css1, css2;
623 *css1 << "insufficient standby MDS daemons available";
624 auto& d = checks->get_or_add("MDS_INSUFFICIENT_STANDBY", HEALTH_WARN, css1->str(), 1);
625 *css2 << "have " << standby_daemons.size() << "; want " << standby_count_wanted
626 << " more";
627 d.detail.push_back(css2->str());
224ce89b
WB
628 }
629}
630
9f95a23c
TL
631void FSMap::encode(bufferlist& bl, uint64_t features) const
632{
522d829b 633 ENCODE_START(STRUCT_VERSION, 6, bl);
9f95a23c
TL
634 encode(epoch, bl);
635 encode(next_filesystem_id, bl);
636 encode(legacy_client_fscid, bl);
522d829b 637 encode(default_compat, bl);
9f95a23c
TL
638 encode(enable_multiple, bl);
639 {
640 std::vector<Filesystem::ref> v;
641 v.reserve(filesystems.size());
642 for (auto& p : filesystems) v.emplace_back(p.second);
643 encode(v, bl, features);
7c673cae 644 }
9f95a23c
TL
645 encode(mds_roles, bl);
646 encode(standby_daemons, bl, features);
647 encode(standby_epochs, bl);
648 encode(ever_enabled_multiple, bl);
649 ENCODE_FINISH(bl);
7c673cae
FG
650}
651
11fdf7f2 652void FSMap::decode(bufferlist::const_iterator& p)
7c673cae 653{
a4b75251 654 struct_version = 0;
522d829b
TL
655 DECODE_START(STRUCT_VERSION, p);
656 DECODE_OLDEST(7);
657 struct_version = struct_v;
f67539c2
TL
658 decode(epoch, p);
659 decode(next_filesystem_id, p);
660 decode(legacy_client_fscid, p);
522d829b 661 decode(default_compat, p);
f67539c2
TL
662 decode(enable_multiple, p);
663 {
664 std::vector<Filesystem::ref> v;
665 decode(v, p);
7c673cae 666 filesystems.clear();
f67539c2
TL
667 for (auto& ref : v) {
668 auto em = filesystems.emplace(std::piecewise_construct, std::forward_as_tuple(ref->fscid), std::forward_as_tuple(std::move(ref)));
669 ceph_assert(em.second);
7c673cae
FG
670 }
671 }
f67539c2
TL
672 decode(mds_roles, p);
673 decode(standby_daemons, p);
674 decode(standby_epochs, p);
675 if (struct_v >= 7) {
676 decode(ever_enabled_multiple, p);
677 }
7c673cae
FG
678 DECODE_FINISH(p);
679}
680
11fdf7f2 681void FSMap::sanitize(const std::function<bool(int64_t pool)>& pool_exists)
3efd9988
FG
682{
683 for (auto &fs : filesystems) {
684 fs.second->mds_map.sanitize(pool_exists);
685 }
686}
7c673cae
FG
687
688void Filesystem::encode(bufferlist& bl, uint64_t features) const
689{
f67539c2 690 ENCODE_START(2, 1, bl);
11fdf7f2 691 encode(fscid, bl);
7c673cae
FG
692 bufferlist mdsmap_bl;
693 mds_map.encode(mdsmap_bl, features);
11fdf7f2 694 encode(mdsmap_bl, bl);
f67539c2 695 encode(mirror_info, bl);
7c673cae
FG
696 ENCODE_FINISH(bl);
697}
698
11fdf7f2 699void Filesystem::decode(bufferlist::const_iterator& p)
7c673cae 700{
f67539c2 701 DECODE_START(2, p);
11fdf7f2 702 decode(fscid, p);
7c673cae 703 bufferlist mdsmap_bl;
11fdf7f2
TL
704 decode(mdsmap_bl, p);
705 auto mdsmap_bl_iter = mdsmap_bl.cbegin();
7c673cae 706 mds_map.decode(mdsmap_bl_iter);
f67539c2
TL
707 if (struct_v >= 2) {
708 decode(mirror_info, p);
709 }
7c673cae
FG
710 DECODE_FINISH(p);
711}
712
713int FSMap::parse_filesystem(
11fdf7f2
TL
714 std::string_view ns_str,
715 Filesystem::const_ref* result
7c673cae
FG
716 ) const
717{
718 std::string ns_err;
94b18763
FG
719 std::string s(ns_str);
720 fs_cluster_id_t fscid = strict_strtol(s.c_str(), 10, &ns_err);
7c673cae
FG
721 if (!ns_err.empty() || filesystems.count(fscid) == 0) {
722 for (auto &fs : filesystems) {
94b18763 723 if (fs.second->mds_map.fs_name == s) {
7c673cae
FG
724 *result = std::const_pointer_cast<const Filesystem>(fs.second);
725 return 0;
726 }
727 }
f67539c2 728 return -CEPHFS_ENOENT;
7c673cae
FG
729 } else {
730 *result = get_filesystem(fscid);
731 return 0;
732 }
733}
734
735void Filesystem::print(std::ostream &out) const
736{
737 out << "Filesystem '" << mds_map.fs_name
738 << "' (" << fscid << ")" << std::endl;
739 mds_map.print(out);
f67539c2
TL
740 if (mirror_info.is_mirrored()) {
741 mirror_info.print(out);
742 }
7c673cae
FG
743}
744
9f95a23c 745bool FSMap::is_any_degraded() const
7c673cae 746{
9f95a23c
TL
747 for (auto& i : filesystems) {
748 if (i.second->mds_map.is_degraded()) {
749 return true;
750 }
751 }
752 return false;
753}
754
755std::map<mds_gid_t, MDSMap::mds_info_t> FSMap::get_mds_info() const
756{
757 std::map<mds_gid_t, mds_info_t> result;
758 for (const auto &i : standby_daemons) {
759 result[i.first] = i.second;
760 }
761
762 for (const auto &i : filesystems) {
763 const auto &fs_info = i.second->mds_map.get_mds_info();
764 for (const auto &j : fs_info) {
765 result[j.first] = j.second;
766 }
767 }
768
769 return result;
770}
771
522d829b 772const MDSMap::mds_info_t* FSMap::get_available_standby(const Filesystem& fs) const
9f95a23c 773{
522d829b 774 const bool upgradeable = fs.is_upgradeable();
9f95a23c 775 const mds_info_t* who = nullptr;
11fdf7f2
TL
776 for (const auto& [gid, info] : standby_daemons) {
777 ceph_assert(info.rank == MDS_RANK_NONE);
778 ceph_assert(info.state == MDSMap::STATE_STANDBY);
7c673cae 779
11fdf7f2 780 if (info.laggy() || info.is_frozen()) {
7c673cae 781 continue;
522d829b
TL
782 } else if (!info.compat.writeable(fs.mds_map.compat)) {
783 /* standby is not compatible with this fs */
784 continue;
785 } else if (!upgradeable && !fs.mds_map.compat.writeable(info.compat)) {
786 /* promotion would change fs.mds_map.compat and we're not upgradeable */
787 continue;
7c673cae
FG
788 }
789
522d829b 790 if (info.join_fscid == fs.fscid) {
9f95a23c
TL
791 who = &info;
792 break;
793 } else if (info.join_fscid == FS_CLUSTER_ID_NONE) {
794 who = &info; /* vanilla standby */
795 } else if (who == nullptr) {
796 who = &info; /* standby for another fs, last resort */
797 }
798 }
799 return who;
800}
801
802mds_gid_t FSMap::find_mds_gid_by_name(std::string_view s) const
803{
804 const auto info = get_mds_info();
805 for (const auto &p : info) {
806 if (p.second.name == s) {
807 return p.first;
808 }
7c673cae 809 }
11fdf7f2 810 return MDS_GID_NONE;
7c673cae
FG
811}
812
9f95a23c
TL
813const MDSMap::mds_info_t* FSMap::find_by_name(std::string_view name) const
814{
815 std::map<mds_gid_t, mds_info_t> result;
816 for (const auto &i : standby_daemons) {
817 if (i.second.name == name) {
818 return &(i.second);
819 }
820 }
821
822 for (const auto &i : filesystems) {
823 const auto &fs_info = i.second->mds_map.get_mds_info();
824 for (const auto &j : fs_info) {
825 if (j.second.name == name) {
826 return &(j.second);
827 }
828 }
829 }
830
831 return nullptr;
832}
833
834const MDSMap::mds_info_t* FSMap::find_replacement_for(mds_role_t role) const
11fdf7f2
TL
835{
836 auto&& fs = get_filesystem(role.fscid);
7c673cae 837
11fdf7f2
TL
838 // First see if we have a STANDBY_REPLAY
839 for (const auto& [gid, info] : fs->mds_map.mds_info) {
840 if (info.rank == role.rank && info.state == MDSMap::STATE_STANDBY_REPLAY) {
841 if (info.is_frozen()) {
842 /* the standby-replay is frozen, do nothing! */
9f95a23c 843 return nullptr;
11fdf7f2 844 } else {
522d829b 845 ceph_assert(info.compat.writeable(fs->mds_map.compat));
9f95a23c 846 return &info;
11fdf7f2 847 }
7c673cae
FG
848 }
849 }
7c673cae 850
522d829b 851 return get_available_standby(*fs);
7c673cae
FG
852}
853
a4b75251 854void FSMap::sanity(bool pending) const
7c673cae 855{
a4b75251
TL
856 /* Only do some sanity checks on **new** FSMaps. Older versions may not be
857 * compliant.
858 */
859
7c673cae 860 if (legacy_client_fscid != FS_CLUSTER_ID_NONE) {
11fdf7f2 861 ceph_assert(filesystems.count(legacy_client_fscid) == 1);
7c673cae
FG
862 }
863
522d829b
TL
864 for (const auto& [fscid, fs] : filesystems) {
865 ceph_assert(fscid == fs->fscid);
866 for (const auto& [gid, info] : fs->mds_map.mds_info) {
867 ceph_assert(info.rank != MDS_RANK_NONE);
868 ceph_assert(mds_roles.at(gid) == fscid);
869 ceph_assert(standby_daemons.count(gid) == 0);
870 ceph_assert(standby_epochs.count(gid) == 0);
871 if (info.state != MDSMap::STATE_STANDBY_REPLAY) {
872 ceph_assert(fs->mds_map.up.at(info.rank) == gid);
873 ceph_assert(fs->mds_map.failed.count(info.rank) == 0);
874 ceph_assert(fs->mds_map.damaged.count(info.rank) == 0);
875 } else {
a4b75251 876 ceph_assert(!pending || fs->mds_map.allows_standby_replay());
7c673cae 877 }
522d829b 878 ceph_assert(info.compat.writeable(fs->mds_map.compat));
7c673cae
FG
879 }
880
881 for (const auto &j : fs->mds_map.up) {
882 mds_rank_t rank = j.first;
11fdf7f2 883 ceph_assert(fs->mds_map.in.count(rank) == 1);
7c673cae 884 mds_gid_t gid = j.second;
11fdf7f2 885 ceph_assert(fs->mds_map.mds_info.count(gid) == 1);
7c673cae
FG
886 }
887 }
888
889 for (const auto &i : standby_daemons) {
11fdf7f2
TL
890 ceph_assert(i.second.state == MDSMap::STATE_STANDBY);
891 ceph_assert(i.second.rank == MDS_RANK_NONE);
892 ceph_assert(i.second.global_id == i.first);
893 ceph_assert(standby_epochs.count(i.first) == 1);
894 ceph_assert(mds_roles.count(i.first) == 1);
895 ceph_assert(mds_roles.at(i.first) == FS_CLUSTER_ID_NONE);
7c673cae
FG
896 }
897
898 for (const auto &i : standby_epochs) {
11fdf7f2 899 ceph_assert(standby_daemons.count(i.first) == 1);
7c673cae
FG
900 }
901
902 for (const auto &i : mds_roles) {
903 if (i.second == FS_CLUSTER_ID_NONE) {
11fdf7f2 904 ceph_assert(standby_daemons.count(i.first) == 1);
7c673cae 905 } else {
11fdf7f2
TL
906 ceph_assert(filesystems.count(i.second) == 1);
907 ceph_assert(filesystems.at(i.second)->mds_map.mds_info.count(i.first) == 1);
7c673cae
FG
908 }
909 }
910}
911
912void FSMap::promote(
913 mds_gid_t standby_gid,
11fdf7f2 914 Filesystem& filesystem,
7c673cae
FG
915 mds_rank_t assigned_rank)
916{
11fdf7f2 917 ceph_assert(gid_exists(standby_gid));
7c673cae
FG
918 bool is_standby_replay = mds_roles.at(standby_gid) != FS_CLUSTER_ID_NONE;
919 if (!is_standby_replay) {
11fdf7f2
TL
920 ceph_assert(standby_daemons.count(standby_gid));
921 ceph_assert(standby_daemons.at(standby_gid).state == MDSMap::STATE_STANDBY);
7c673cae
FG
922 }
923
11fdf7f2 924 MDSMap &mds_map = filesystem.mds_map;
7c673cae
FG
925
926 // Insert daemon state to Filesystem
927 if (!is_standby_replay) {
928 mds_map.mds_info[standby_gid] = standby_daemons.at(standby_gid);
929 } else {
11fdf7f2
TL
930 ceph_assert(mds_map.mds_info.count(standby_gid));
931 ceph_assert(mds_map.mds_info.at(standby_gid).state == MDSMap::STATE_STANDBY_REPLAY);
932 ceph_assert(mds_map.mds_info.at(standby_gid).rank == assigned_rank);
7c673cae 933 }
522d829b 934 auto& info = mds_map.mds_info.at(standby_gid);
7c673cae 935
a4b75251
TL
936 if (!filesystem.mds_map.compat.writeable(info.compat)) {
937 ceph_assert(filesystem.is_upgradeable());
938 filesystem.mds_map.compat.merge(info.compat);
939 }
940
7c673cae
FG
941 if (mds_map.stopped.erase(assigned_rank)) {
942 // The cluster is being expanded with a stopped rank
943 info.state = MDSMap::STATE_STARTING;
944 } else if (!mds_map.is_in(assigned_rank)) {
945 // The cluster is being expanded with a new rank
946 info.state = MDSMap::STATE_CREATING;
947 } else {
948 // An existing rank is being assigned to a replacement
949 info.state = MDSMap::STATE_REPLAY;
950 mds_map.failed.erase(assigned_rank);
951 }
952 info.rank = assigned_rank;
953 info.inc = epoch;
522d829b 954 mds_roles.at(standby_gid) = filesystem.fscid;
7c673cae
FG
955
956 // Update the rank state in Filesystem
957 mds_map.in.insert(assigned_rank);
958 mds_map.up[assigned_rank] = standby_gid;
959
960 // Remove from the list of standbys
961 if (!is_standby_replay) {
962 standby_daemons.erase(standby_gid);
963 standby_epochs.erase(standby_gid);
964 }
965
966 // Indicate that Filesystem has been modified
967 mds_map.epoch = epoch;
968}
969
970void FSMap::assign_standby_replay(
971 const mds_gid_t standby_gid,
972 const fs_cluster_id_t leader_ns,
973 const mds_rank_t leader_rank)
974{
11fdf7f2
TL
975 ceph_assert(mds_roles.at(standby_gid) == FS_CLUSTER_ID_NONE);
976 ceph_assert(gid_exists(standby_gid));
977 ceph_assert(!gid_has_rank(standby_gid));
978 ceph_assert(standby_daemons.count(standby_gid));
7c673cae
FG
979
980 // Insert to the filesystem
981 auto fs = filesystems.at(leader_ns);
982 fs->mds_map.mds_info[standby_gid] = standby_daemons.at(standby_gid);
983 fs->mds_map.mds_info[standby_gid].rank = leader_rank;
984 fs->mds_map.mds_info[standby_gid].state = MDSMap::STATE_STANDBY_REPLAY;
985 mds_roles[standby_gid] = leader_ns;
986
987 // Remove from the list of standbys
988 standby_daemons.erase(standby_gid);
989 standby_epochs.erase(standby_gid);
990
991 // Indicate that Filesystem has been modified
992 fs->mds_map.epoch = epoch;
993}
994
f67539c2 995void FSMap::erase(mds_gid_t who, epoch_t blocklist_epoch)
7c673cae
FG
996{
997 if (mds_roles.at(who) == FS_CLUSTER_ID_NONE) {
998 standby_daemons.erase(who);
999 standby_epochs.erase(who);
1000 } else {
1001 auto &fs = filesystems.at(mds_roles.at(who));
1002 const auto &info = fs->mds_map.mds_info.at(who);
1003 if (info.state != MDSMap::STATE_STANDBY_REPLAY) {
1004 if (info.state == MDSMap::STATE_CREATING) {
1005 // If this gid didn't make it past CREATING, then forget
1006 // the rank ever existed so that next time it's handed out
1007 // to a gid it'll go back into CREATING.
1008 fs->mds_map.in.erase(info.rank);
1009 } else {
1010 // Put this rank into the failed list so that the next available
1011 // STANDBY will pick it up.
1012 fs->mds_map.failed.insert(info.rank);
1013 }
11fdf7f2 1014 ceph_assert(fs->mds_map.up.at(info.rank) == info.global_id);
7c673cae
FG
1015 fs->mds_map.up.erase(info.rank);
1016 }
1017 fs->mds_map.mds_info.erase(who);
f67539c2 1018 fs->mds_map.last_failure_osd_epoch = blocklist_epoch;
7c673cae
FG
1019 fs->mds_map.epoch = epoch;
1020 }
1021
1022 mds_roles.erase(who);
1023}
1024
f67539c2 1025void FSMap::damaged(mds_gid_t who, epoch_t blocklist_epoch)
7c673cae 1026{
11fdf7f2 1027 ceph_assert(mds_roles.at(who) != FS_CLUSTER_ID_NONE);
7c673cae 1028 auto fs = filesystems.at(mds_roles.at(who));
a4b75251 1029 mds_rank_t rank = fs->mds_map.mds_info.at(who).rank;
7c673cae 1030
f67539c2 1031 erase(who, blocklist_epoch);
7c673cae
FG
1032 fs->mds_map.failed.erase(rank);
1033 fs->mds_map.damaged.insert(rank);
1034
11fdf7f2 1035 ceph_assert(fs->mds_map.epoch == epoch);
7c673cae
FG
1036}
1037
1038/**
1039 * Update to indicate that the rank `rank` is to be removed
1040 * from the damaged list of the filesystem `fscid`
1041 */
1042bool FSMap::undamaged(const fs_cluster_id_t fscid, const mds_rank_t rank)
1043{
1044 auto fs = filesystems.at(fscid);
1045
1046 if (fs->mds_map.damaged.erase(rank)) {
1047 fs->mds_map.failed.insert(rank);
1048 fs->mds_map.epoch = epoch;
1049 return true;
1050 } else {
1051 return false;
1052 }
1053}
1054
1055void FSMap::insert(const MDSMap::mds_info_t &new_info)
1056{
a4b75251
TL
1057 static const CompatSet empty;
1058
11fdf7f2
TL
1059 ceph_assert(new_info.state == MDSMap::STATE_STANDBY);
1060 ceph_assert(new_info.rank == MDS_RANK_NONE);
7c673cae 1061 mds_roles[new_info.global_id] = FS_CLUSTER_ID_NONE;
a4b75251
TL
1062 auto& info = standby_daemons[new_info.global_id];
1063 info = new_info;
1064 if (empty.compare(info.compat) == 0) {
1065 // bootstrap old compat: boot beacon contains empty compat on old (v16.2.4
1066 // or older) MDS.
1067 info.compat = MDSMap::get_compat_set_v16_2_4();
1068 }
20effc67
TL
1069 /* TODO remove after R is released
1070 * Insert INLINE; see comment in MDSMap::decode.
1071 */
1072 info.compat.incompat.insert(MDS_FEATURE_INCOMPAT_INLINE);
7c673cae
FG
1073 standby_epochs[new_info.global_id] = epoch;
1074}
1075
9f95a23c 1076std::vector<mds_gid_t> FSMap::stop(mds_gid_t who)
7c673cae 1077{
11fdf7f2 1078 ceph_assert(mds_roles.at(who) != FS_CLUSTER_ID_NONE);
7c673cae
FG
1079 auto fs = filesystems.at(mds_roles.at(who));
1080 const auto &info = fs->mds_map.mds_info.at(who);
1081 fs->mds_map.up.erase(info.rank);
1082 fs->mds_map.in.erase(info.rank);
1083 fs->mds_map.stopped.insert(info.rank);
1084
1085 // Also drop any standby replays that were following this rank
9f95a23c 1086 std::vector<mds_gid_t> standbys;
7c673cae
FG
1087 for (const auto &i : fs->mds_map.mds_info) {
1088 const auto &other_gid = i.first;
1089 const auto &other_info = i.second;
1090 if (other_info.rank == info.rank
1091 && other_info.state == MDSMap::STATE_STANDBY_REPLAY) {
1092 standbys.push_back(other_gid);
1093 erase(other_gid, 0);
1094 }
1095 }
1096
1097 fs->mds_map.mds_info.erase(who);
1098 mds_roles.erase(who);
1099
1100 fs->mds_map.epoch = epoch;
1101
1102 return standbys;
1103}
1104
1105
1106/**
1107 * Given one of the following forms:
1108 * <fs name>:<rank>
1109 * <fs id>:<rank>
1110 * <rank>
1111 *
1112 * Parse into a mds_role_t. The rank-only form is only valid
1113 * if legacy_client_ns is set.
1114 */
f67539c2
TL
1115
1116int FSMap::parse_role(
1117 std::string_view role_str,
1118 mds_role_t *role,
1119 std::ostream &ss,
1120 const std::vector<string> &filter) const
1121{
1122 int r = parse_role(role_str, role, ss);
1123 if (r < 0) return r;
1124
1125 string_view fs_name = get_filesystem(role->fscid)->mds_map.get_fs_name();
1126
1127 if (!filter.empty() &&
1128 std::find(filter.begin(), filter.end(), fs_name) == filter.end()) {
1129 if (r >= 0) {
1130 ss << "Invalid file system";
1131 }
1132 return -CEPHFS_ENOENT;
1133 }
1134
1135 return r;
1136}
1137
7c673cae 1138int FSMap::parse_role(
11fdf7f2 1139 std::string_view role_str,
7c673cae
FG
1140 mds_role_t *role,
1141 std::ostream &ss) const
1142{
1143 size_t colon_pos = role_str.find(":");
1144 size_t rank_pos;
11fdf7f2 1145 Filesystem::const_ref fs;
7c673cae
FG
1146 if (colon_pos == std::string::npos) {
1147 if (legacy_client_fscid == FS_CLUSTER_ID_NONE) {
1148 ss << "No filesystem selected";
f67539c2 1149 return -CEPHFS_ENOENT;
7c673cae
FG
1150 }
1151 fs = get_filesystem(legacy_client_fscid);
1152 rank_pos = 0;
1153 } else {
1154 if (parse_filesystem(role_str.substr(0, colon_pos), &fs) < 0) {
1155 ss << "Invalid filesystem";
f67539c2 1156 return -CEPHFS_ENOENT;
7c673cae
FG
1157 }
1158 rank_pos = colon_pos+1;
1159 }
1160
1161 mds_rank_t rank;
1162 std::string err;
94b18763 1163 std::string rank_str(role_str.substr(rank_pos));
7c673cae
FG
1164 long rank_i = strict_strtol(rank_str.c_str(), 10, &err);
1165 if (rank_i < 0 || !err.empty()) {
1166 ss << "Invalid rank '" << rank_str << "'";
f67539c2 1167 return -CEPHFS_EINVAL;
7c673cae
FG
1168 } else {
1169 rank = rank_i;
1170 }
1171
1172 if (fs->mds_map.in.count(rank) == 0) {
1173 ss << "Rank '" << rank << "' not found";
f67539c2 1174 return -CEPHFS_ENOENT;
7c673cae
FG
1175 }
1176
1177 *role = {fs->fscid, rank};
1178
1179 return 0;
1180}
9f95a23c
TL
1181
1182bool FSMap::pool_in_use(int64_t poolid) const
1183{
1184 for (auto const &i : filesystems) {
1185 if (i.second->mds_map.is_data_pool(poolid)
1186 || i.second->mds_map.metadata_pool == poolid) {
1187 return true;
1188 }
1189 }
1190 return false;
1191}
1192
1193void FSMap::erase_filesystem(fs_cluster_id_t fscid)
1194{
1195 filesystems.erase(fscid);
1196 for (auto& [gid, info] : standby_daemons) {
1197 if (info.join_fscid == fscid) {
1198 modify_daemon(gid, [](auto& info) {
1199 info.join_fscid = FS_CLUSTER_ID_NONE;
1200 });
1201 }
1202 }
1203 for (auto& p : filesystems) {
1204 for (auto& [gid, info] : p.second->mds_map.get_mds_info()) {
1205 if (info.join_fscid == fscid) {
1206 modify_daemon(gid, [](auto& info) {
1207 info.join_fscid = FS_CLUSTER_ID_NONE;
1208 });
1209 }
1210 }
1211 }
1212}