]> git.proxmox.com Git - ceph.git/blob - ceph/src/mds/FSMap.cc
update source to 12.2.11
[ceph.git] / ceph / src / mds / FSMap.cc
1 // -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
2 // vim: ts=8 sw=2 smarttab
3 /*
4 * Ceph - scalable distributed file system
5 *
6 * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
7 *
8 * This is free software; you can redistribute it and/or
9 * modify it under the terms of the GNU Lesser General Public
10 * License version 2.1, as published by the Free Software
11 * Foundation. See file COPYING.
12 *
13 */
14
15
16 #include "FSMap.h"
17
18 #include <sstream>
19 using std::stringstream;
20
21 #include "mon/health_check.h"
22
23
24 void Filesystem::dump(Formatter *f) const
25 {
26 f->open_object_section("mdsmap");
27 mds_map.dump(f);
28 f->close_section();
29 f->dump_int("id", fscid);
30 }
31
32 void FSMap::dump(Formatter *f) const
33 {
34 f->dump_int("epoch", epoch);
35
36 f->open_object_section("compat");
37 compat.dump(f);
38 f->close_section();
39
40 f->open_object_section("feature_flags");
41 f->dump_bool("enable_multiple", enable_multiple);
42 f->dump_bool("ever_enabled_multiple", ever_enabled_multiple);
43 f->close_section();
44
45 f->open_array_section("standbys");
46 for (const auto &i : standby_daemons) {
47 f->open_object_section("info");
48 i.second.dump(f);
49 f->dump_int("epoch", standby_epochs.at(i.first));
50 f->close_section();
51 }
52 f->close_section();
53
54 f->open_array_section("filesystems");
55 for (const auto &fs : filesystems) {
56 f->open_object_section("filesystem");
57 fs.second->dump(f);
58 f->close_section();
59 }
60 f->close_section();
61 }
62
63 void FSMap::generate_test_instances(list<FSMap*>& ls)
64 {
65 FSMap *m = new FSMap();
66
67 std::list<MDSMap*> mds_map_instances;
68 MDSMap::generate_test_instances(mds_map_instances);
69
70 int k = 20;
71 for (auto i : mds_map_instances) {
72 auto fs = std::make_shared<Filesystem>();
73 fs->fscid = k++;
74 fs->mds_map = *i;
75 delete i;
76 m->filesystems[fs->fscid] = fs;
77 }
78 mds_map_instances.clear();
79
80 ls.push_back(m);
81 }
82
83 void FSMap::print(ostream& out) const
84 {
85 out << "e" << epoch << std::endl;
86 out << "enable_multiple, ever_enabled_multiple: " << enable_multiple << ","
87 << ever_enabled_multiple << std::endl;
88 out << "compat: " << compat << std::endl;
89 out << "legacy client fscid: " << legacy_client_fscid << std::endl;
90 out << " " << std::endl;
91
92 if (filesystems.empty()) {
93 out << "No filesystems configured" << std::endl;
94 return;
95 }
96
97 for (const auto &fs : filesystems) {
98 fs.second->print(out);
99 out << " " << std::endl << " " << std::endl; // Space out a bit
100 }
101
102 if (!standby_daemons.empty()) {
103 out << "Standby daemons:" << std::endl << " " << std::endl;
104 }
105
106 for (const auto &p : standby_daemons) {
107 p.second.print_summary(out);
108 out << std::endl;
109 }
110 }
111
112
113
114 void FSMap::print_summary(Formatter *f, ostream *out) const
115 {
116 map<mds_role_t,string> by_rank;
117 map<string,int> by_state;
118
119 if (f) {
120 f->dump_unsigned("epoch", get_epoch());
121 for (auto i : filesystems) {
122 auto fs = i.second;
123 f->dump_unsigned("id", fs->fscid);
124 f->dump_unsigned("up", fs->mds_map.up.size());
125 f->dump_unsigned("in", fs->mds_map.in.size());
126 f->dump_unsigned("max", fs->mds_map.max_mds);
127 }
128 } else {
129 for (auto i : filesystems) {
130 auto fs = i.second;
131 *out << fs->mds_map.fs_name << "-" << fs->mds_map.up.size() << "/"
132 << fs->mds_map.in.size() << "/" << fs->mds_map.max_mds << " up ";
133 }
134 }
135
136 if (f) {
137 f->open_array_section("by_rank");
138 }
139
140 const auto all_info = get_mds_info();
141 for (const auto &p : all_info) {
142 const auto &info = p.second;
143 string s = ceph_mds_state_name(info.state);
144 if (info.laggy()) {
145 s += "(laggy or crashed)";
146 }
147
148 const fs_cluster_id_t fscid = mds_roles.at(info.global_id);
149
150 if (info.rank != MDS_RANK_NONE &&
151 info.state != MDSMap::STATE_STANDBY_REPLAY) {
152 if (f) {
153 f->open_object_section("mds");
154 f->dump_unsigned("filesystem_id", fscid);
155 f->dump_unsigned("rank", info.rank);
156 f->dump_string("name", info.name);
157 f->dump_string("status", s);
158 f->close_section();
159 } else {
160 by_rank[mds_role_t(fscid, info.rank)] = info.name + "=" + s;
161 }
162 } else {
163 by_state[s]++;
164 }
165 }
166
167 if (f) {
168 f->close_section();
169 } else {
170 if (!by_rank.empty()) {
171 if (filesystems.size() > 1) {
172 // Disambiguate filesystems
173 std::map<std::string, std::string> pretty;
174 for (auto i : by_rank) {
175 const auto &fs_name = filesystems.at(i.first.fscid)->mds_map.fs_name;
176 std::ostringstream o;
177 o << "[" << fs_name << ":" << i.first.rank << "]";
178 pretty[o.str()] = i.second;
179 }
180 *out << " " << pretty;
181 } else {
182 // Omit FSCID in output when only one filesystem exists
183 std::map<mds_rank_t, std::string> shortened;
184 for (auto i : by_rank) {
185 shortened[i.first.rank] = i.second;
186 }
187 *out << " " << shortened;
188 }
189 }
190 }
191
192 for (map<string,int>::reverse_iterator p = by_state.rbegin(); p != by_state.rend(); ++p) {
193 if (f) {
194 f->dump_unsigned(p->first.c_str(), p->second);
195 } else {
196 *out << ", " << p->second << " " << p->first;
197 }
198 }
199
200 size_t failed = 0;
201 size_t damaged = 0;
202 for (auto i : filesystems) {
203 auto fs = i.second;
204 failed += fs->mds_map.failed.size();
205 damaged += fs->mds_map.damaged.size();
206 }
207
208 if (failed > 0) {
209 if (f) {
210 f->dump_unsigned("failed", failed);
211 } else {
212 *out << ", " << failed << " failed";
213 }
214 }
215
216 if (damaged > 0) {
217 if (f) {
218 f->dump_unsigned("damaged", damaged);
219 } else {
220 *out << ", " << damaged << " damaged";
221 }
222 }
223 //if (stopped.size())
224 //out << ", " << stopped.size() << " stopped";
225 }
226
227
228 void FSMap::create_filesystem(boost::string_view name,
229 int64_t metadata_pool, int64_t data_pool,
230 uint64_t features)
231 {
232 auto fs = std::make_shared<Filesystem>();
233 fs->mds_map.epoch = epoch;
234 fs->mds_map.fs_name = std::string(name);
235 fs->mds_map.data_pools.push_back(data_pool);
236 fs->mds_map.metadata_pool = metadata_pool;
237 fs->mds_map.cas_pool = -1;
238 fs->mds_map.compat = compat;
239 fs->mds_map.created = ceph_clock_now();
240 fs->mds_map.modified = ceph_clock_now();
241 fs->mds_map.enabled = true;
242 if (features & CEPH_FEATURE_SERVER_JEWEL) {
243 fs->fscid = next_filesystem_id++;
244 // ANONYMOUS is only for upgrades from legacy mdsmaps, we should
245 // have initialized next_filesystem_id such that it's never used here.
246 assert(fs->fscid != FS_CLUSTER_ID_ANONYMOUS);
247 } else {
248 // Use anon fscid because this will get thrown away when encoding
249 // as legacy MDSMap for legacy mons.
250 assert(filesystems.empty());
251 fs->fscid = FS_CLUSTER_ID_ANONYMOUS;
252 }
253 filesystems[fs->fscid] = fs;
254
255 // Created first filesystem? Set it as the one
256 // for legacy clients to use
257 if (filesystems.size() == 1) {
258 legacy_client_fscid = fs->fscid;
259 }
260 }
261
262 void FSMap::reset_filesystem(fs_cluster_id_t fscid)
263 {
264 auto fs = get_filesystem(fscid);
265 auto new_fs = std::make_shared<Filesystem>();
266
267 // Populate rank 0 as existing (so don't go into CREATING)
268 // but failed (so that next available MDS is assigned the rank)
269 new_fs->mds_map.in.insert(mds_rank_t(0));
270 new_fs->mds_map.failed.insert(mds_rank_t(0));
271
272 // Carry forward what makes sense
273 new_fs->fscid = fs->fscid;
274 new_fs->mds_map.inline_data_enabled = fs->mds_map.inline_data_enabled;
275 new_fs->mds_map.data_pools = fs->mds_map.data_pools;
276 new_fs->mds_map.metadata_pool = fs->mds_map.metadata_pool;
277 new_fs->mds_map.cas_pool = fs->mds_map.cas_pool;
278 new_fs->mds_map.fs_name = fs->mds_map.fs_name;
279 new_fs->mds_map.compat = compat;
280 new_fs->mds_map.created = ceph_clock_now();
281 new_fs->mds_map.modified = ceph_clock_now();
282 new_fs->mds_map.standby_count_wanted = fs->mds_map.standby_count_wanted;
283 new_fs->mds_map.enabled = true;
284
285 // Remember mds ranks that have ever started. (They should load old inotable
286 // instead of creating new one if they start again.)
287 new_fs->mds_map.stopped.insert(fs->mds_map.in.begin(), fs->mds_map.in.end());
288 new_fs->mds_map.stopped.insert(fs->mds_map.stopped.begin(), fs->mds_map.stopped.end());
289 new_fs->mds_map.stopped.erase(mds_rank_t(0));
290
291 // Persist the new FSMap
292 filesystems[new_fs->fscid] = new_fs;
293 }
294
295 void FSMap::get_health(list<pair<health_status_t,string> >& summary,
296 list<pair<health_status_t,string> > *detail) const
297 {
298 mds_rank_t standby_count_wanted = 0;
299 for (const auto &i : filesystems) {
300 const auto &fs = i.second;
301
302 // TODO: move get_health up into here so that we can qualify
303 // all the messages with what filesystem they're talking about
304 fs->mds_map.get_health(summary, detail);
305
306 standby_count_wanted = std::max(standby_count_wanted, fs->mds_map.get_standby_count_wanted((mds_rank_t)standby_daemons.size()));
307 }
308
309 if (standby_count_wanted) {
310 std::ostringstream oss;
311 oss << "insufficient standby daemons available: have " << standby_daemons.size() << "; want " << standby_count_wanted << " more";
312 summary.push_back(make_pair(HEALTH_WARN, oss.str()));
313 }
314 }
315
316 bool FSMap::check_health(void)
317 {
318 bool changed = false;
319 for (auto &i : filesystems) {
320 changed |= i.second->mds_map.check_health((mds_rank_t)standby_daemons.size());
321 }
322 return changed;
323 }
324
325 void FSMap::get_health_checks(health_check_map_t *checks) const
326 {
327 mds_rank_t standby_count_wanted = 0;
328 for (const auto &i : filesystems) {
329 const auto &fs = i.second;
330 health_check_map_t fschecks;
331
332 fs->mds_map.get_health_checks(&fschecks);
333
334 // Some of the failed ranks might be transient (i.e. there are standbys
335 // ready to replace them). We will report only on "stuck" failed, i.e.
336 // ranks which are failed and have no standby replacement available.
337 std::set<mds_rank_t> stuck_failed;
338
339 for (const auto &rank : fs->mds_map.failed) {
340 const mds_gid_t replacement = find_replacement_for(
341 {fs->fscid, rank}, {}, g_conf->mon_force_standby_active);
342 if (replacement == MDS_GID_NONE) {
343 stuck_failed.insert(rank);
344 }
345 }
346
347 // FS_WITH_FAILED_MDS
348 if (!stuck_failed.empty()) {
349 health_check_t& fscheck = checks->get_or_add(
350 "FS_WITH_FAILED_MDS", HEALTH_WARN,
351 "%num% filesystem%plurals% %hasorhave% a failed mds daemon");
352 ostringstream ss;
353 ss << "fs " << fs->mds_map.fs_name << " has " << stuck_failed.size()
354 << " failed mds" << (stuck_failed.size() > 1 ? "s" : "");
355 fscheck.detail.push_back(ss.str()); }
356
357 checks->merge(fschecks);
358 standby_count_wanted = std::max(
359 standby_count_wanted,
360 fs->mds_map.get_standby_count_wanted((mds_rank_t)standby_daemons.size()));
361 }
362
363 // MDS_INSUFFICIENT_STANDBY
364 if (standby_count_wanted) {
365 std::ostringstream oss, dss;
366 oss << "insufficient standby MDS daemons available";
367 auto& d = checks->get_or_add("MDS_INSUFFICIENT_STANDBY", HEALTH_WARN, oss.str());
368 dss << "have " << standby_daemons.size() << "; want " << standby_count_wanted
369 << " more";
370 d.detail.push_back(dss.str());
371 }
372 }
373
374 void FSMap::encode(bufferlist& bl, uint64_t features) const
375 {
376 if (features & CEPH_FEATURE_SERVER_JEWEL) {
377 ENCODE_START(7, 6, bl);
378 ::encode(epoch, bl);
379 ::encode(next_filesystem_id, bl);
380 ::encode(legacy_client_fscid, bl);
381 ::encode(compat, bl);
382 ::encode(enable_multiple, bl);
383 std::vector<Filesystem> fs_list;
384 for (auto i : filesystems) {
385 fs_list.push_back(*(i.second));
386 }
387 ::encode(fs_list, bl, features);
388 ::encode(mds_roles, bl);
389 ::encode(standby_daemons, bl, features);
390 ::encode(standby_epochs, bl);
391 ::encode(ever_enabled_multiple, bl);
392 ENCODE_FINISH(bl);
393 } else {
394 if (filesystems.empty()) {
395 MDSMap disabled_map;
396 disabled_map.epoch = epoch;
397 disabled_map.encode(bl, features);
398 } else {
399 // MDSMonitor should never have created multiple filesystems
400 // until the quorum features indicated Jewel
401 assert(filesystems.size() == 1);
402 auto fs = filesystems.begin()->second;
403
404 // Take the MDSMap for the enabled filesystem, and populated its
405 // mds_info with the standbys to get a pre-jewel-style mon MDSMap.
406 MDSMap full_mdsmap = fs->mds_map;
407 full_mdsmap.epoch = epoch;
408 for (const auto &p : standby_daemons) {
409 full_mdsmap.mds_info[p.first] = p.second;
410 }
411
412 // Old MDSMaps don't set rank on standby replay daemons
413 for (auto &i : full_mdsmap.mds_info) {
414 auto &info = i.second;
415 if (info.state == MDSMap::STATE_STANDBY_REPLAY) {
416 info.rank = MDS_RANK_NONE;
417 }
418 }
419
420 full_mdsmap.encode(bl, features);
421 }
422 }
423 }
424
/**
 * Decode an FSMap from `p`.
 *
 * Two layouts are accepted, selected by the decoded struct_v:
 *  - struct_v < 6: a legacy MDSMap.  Its fields are read into a
 *    temporary MDSMap and, if that map is enabled, converted into a
 *    single Filesystem with FS_CLUSTER_ID_ANONYMOUS.  Rankless daemons
 *    in STATE_STANDBY are moved to standby_daemons; rankless daemons in
 *    any other state are dropped.
 *  - struct_v >= 6: the native FSMap encoding.
 *
 * NOTE(review): the legacy path does not assign next_filesystem_id or
 * ever_enabled_multiple; they keep whatever value this object already
 * had -- confirm that is intended.
 */
425 void FSMap::decode(bufferlist::iterator& p)
426 {
427 // The highest MDSMap encoding version before we changed the
428 // MDSMonitor to store an FSMap instead of an MDSMap was
429 // 5, so anything older than 6 is decoded as an MDSMap,
430 // and anything newer is decoded as an FSMap.
431 DECODE_START_LEGACY_COMPAT_LEN_16(7, 4, 4, p);
432 if (struct_v < 6) {
433 // Because the mon used to store an MDSMap where we now
434 // store an FSMap, FSMap knows how to decode the legacy
435 // MDSMap format (it never needs to encode it though).
436 MDSMap legacy_mds_map;
437
438 // Decoding an MDSMap (upgrade)
439 ::decode(epoch, p);
440 ::decode(legacy_mds_map.flags, p);
441 ::decode(legacy_mds_map.last_failure, p);
442 ::decode(legacy_mds_map.root, p);
443 ::decode(legacy_mds_map.session_timeout, p);
444 ::decode(legacy_mds_map.session_autoclose, p);
445 ::decode(legacy_mds_map.max_file_size, p);
446 ::decode(legacy_mds_map.max_mds, p);
447 ::decode(legacy_mds_map.mds_info, p);
// Before struct_v 3 the data pools were encoded as a count followed by
// 32-bit IDs, and cas_pool as a signed 32-bit value.
448 if (struct_v < 3) {
449 __u32 n;
450 ::decode(n, p);
451 while (n--) {
452 __u32 m;
453 ::decode(m, p);
454 legacy_mds_map.data_pools.push_back(m);
455 }
456 __s32 s;
457 ::decode(s, p);
458 legacy_mds_map.cas_pool = s;
459 } else {
460 ::decode(legacy_mds_map.data_pools, p);
461 ::decode(legacy_mds_map.cas_pool, p);
462 }
463
464 // kclient ignores everything from here
// `ev` is a second version number, present from struct_v 2 onwards,
// that gates the remaining fields.
465 __u16 ev = 1;
466 if (struct_v >= 2)
467 ::decode(ev, p);
468 if (ev >= 3)
469 ::decode(legacy_mds_map.compat, p);
470 else
471 legacy_mds_map.compat = MDSMap::get_compat_set_base();
// metadata_pool was encoded as an unsigned 32-bit value before ev 5
472 if (ev < 5) {
473 __u32 n;
474 ::decode(n, p);
475 legacy_mds_map.metadata_pool = n;
476 } else {
477 ::decode(legacy_mds_map.metadata_pool, p);
478 }
479 ::decode(legacy_mds_map.created, p);
480 ::decode(legacy_mds_map.modified, p);
481 ::decode(legacy_mds_map.tableserver, p);
482 ::decode(legacy_mds_map.in, p);
483 std::map<mds_rank_t,int32_t> inc; // Legacy field, parse and drop
484 ::decode(inc, p);
485 ::decode(legacy_mds_map.up, p);
486 ::decode(legacy_mds_map.failed, p);
487 ::decode(legacy_mds_map.stopped, p);
488 if (ev >= 4)
489 ::decode(legacy_mds_map.last_failure_osd_epoch, p);
490 if (ev >= 6) {
491 if (ev < 10) {
492 // previously this was a bool about snaps, not a flag map
493 bool flag;
494 ::decode(flag, p);
495 legacy_mds_map.ever_allowed_features = flag ?
496 CEPH_MDSMAP_ALLOW_SNAPS : 0;
497 ::decode(flag, p);
498 legacy_mds_map.explicitly_allowed_features = flag ?
499 CEPH_MDSMAP_ALLOW_SNAPS : 0;
500 if (legacy_mds_map.max_mds > 1) {
501 legacy_mds_map.set_multimds_allowed();
502 }
503 } else {
504 ::decode(legacy_mds_map.ever_allowed_features, p);
505 ::decode(legacy_mds_map.explicitly_allowed_features, p);
506 }
507 } else {
508 legacy_mds_map.ever_allowed_features = CEPH_MDSMAP_ALLOW_CLASSICS;
509 legacy_mds_map.explicitly_allowed_features = 0;
510 if (legacy_mds_map.max_mds > 1) {
511 legacy_mds_map.set_multimds_allowed();
512 }
513 }
514 if (ev >= 7)
515 ::decode(legacy_mds_map.inline_data_enabled, p);
516
517 if (ev >= 8) {
518 assert(struct_v >= 5);
519 ::decode(legacy_mds_map.enabled, p);
520 ::decode(legacy_mds_map.fs_name, p);
521 } else {
522 legacy_mds_map.fs_name = "default";
523 if (epoch > 1) {
524 // If an MDS has ever been started, epoch will be greater than 1,
525 // assume filesystem is enabled.
526 legacy_mds_map.enabled = true;
527 } else {
528 // Upgrading from a cluster that never used an MDS, switch off
529 // filesystem until it's explicitly enabled.
530 legacy_mds_map.enabled = false;
531 }
532 }
533
534 if (ev >= 9) {
535 ::decode(legacy_mds_map.damaged, p);
536 }
537
538 // We're upgrading, populate filesystems from the legacy fields
539 filesystems.clear();
540 standby_daemons.clear();
541 standby_epochs.clear();
542 mds_roles.clear();
543 compat = legacy_mds_map.compat;
544 enable_multiple = false;
545
546 // Synthesise a Filesystem from legacy_mds_map, if enabled
547 if (legacy_mds_map.enabled) {
548 // Construct a Filesystem from the legacy MDSMap
549 auto migrate_fs = std::make_shared<Filesystem>();
550 migrate_fs->fscid = FS_CLUSTER_ID_ANONYMOUS;
551 migrate_fs->mds_map = legacy_mds_map;
552 migrate_fs->mds_map.epoch = epoch;
553 filesystems[migrate_fs->fscid] = migrate_fs;
554
555 // List of GIDs that had invalid states
556 std::set<mds_gid_t> drop_gids;
557
558 // Construct mds_roles, standby_daemons, and remove
559 // standbys from the MDSMap in the Filesystem.
// Nothing is erased from mds_info inside this loop; the erasures are
// deferred to the two loops that follow it.
560 for (auto &p : migrate_fs->mds_map.mds_info) {
561 if (p.second.state == MDSMap::STATE_STANDBY_REPLAY) {
562 // In legacy MDSMap, standby replay daemons don't have
563 // rank set, but since FSMap they do.
564 p.second.rank = p.second.standby_for_rank;
565 }
566 if (p.second.rank == MDS_RANK_NONE) {
567 if (p.second.state != MDSMap::STATE_STANDBY) {
568 // Old MDSMaps can have down:dne here, which
569 // is invalid in an FSMap (#17837)
570 drop_gids.insert(p.first);
571 } else {
572 insert(p.second); // into standby_daemons
573 }
574 } else {
575 mds_roles[p.first] = migrate_fs->fscid;
576 }
577 }
578 for (const auto &p : standby_daemons) {
579 // Erase from this Filesystem's MDSMap, because it has
580 // been copied into FSMap::standby_daemons above
581 migrate_fs->mds_map.mds_info.erase(p.first);
582 }
583 for (const auto &gid : drop_gids) {
584 // Throw away all info for this MDS because it was identified
585 // as having invalid state above.
586 migrate_fs->mds_map.mds_info.erase(gid);
587 }
588
589 legacy_client_fscid = migrate_fs->fscid;
590 } else {
591 legacy_client_fscid = FS_CLUSTER_ID_NONE;
592 }
593 } else {
// Native FSMap encoding; the field order matches FSMap::encode()
594 ::decode(epoch, p);
595 ::decode(next_filesystem_id, p);
596 ::decode(legacy_client_fscid, p);
597 ::decode(compat, p);
598 ::decode(enable_multiple, p);
599 std::vector<Filesystem> fs_list;
600 ::decode(fs_list, p);
601 filesystems.clear();
602 for (std::vector<Filesystem>::const_iterator fs = fs_list.begin(); fs != fs_list.end(); ++fs) {
603 filesystems[fs->fscid] = std::make_shared<Filesystem>(*fs);
604 }
605
606 ::decode(mds_roles, p);
607 ::decode(standby_daemons, p);
608 ::decode(standby_epochs, p);
// ever_enabled_multiple is only present from struct_v 7 onwards
609 if (struct_v >= 7) {
610 ::decode(ever_enabled_multiple, p);
611 }
612 }
613
614 DECODE_FINISH(p);
615 }
616
617 void FSMap::sanitize(std::function<bool(int64_t pool)> pool_exists)
618 {
619 for (auto &fs : filesystems) {
620 fs.second->mds_map.sanitize(pool_exists);
621 }
622 }
623
624 void Filesystem::encode(bufferlist& bl, uint64_t features) const
625 {
626 ENCODE_START(1, 1, bl);
627 ::encode(fscid, bl);
628 bufferlist mdsmap_bl;
629 mds_map.encode(mdsmap_bl, features);
630 ::encode(mdsmap_bl, bl);
631 ENCODE_FINISH(bl);
632 }
633
634 void Filesystem::decode(bufferlist::iterator& p)
635 {
636 DECODE_START(1, p);
637 ::decode(fscid, p);
638 bufferlist mdsmap_bl;
639 ::decode(mdsmap_bl, p);
640 bufferlist::iterator mdsmap_bl_iter = mdsmap_bl.begin();
641 mds_map.decode(mdsmap_bl_iter);
642 DECODE_FINISH(p);
643 }
644
645 int FSMap::parse_filesystem(
646 boost::string_view ns_str,
647 std::shared_ptr<const Filesystem> *result
648 ) const
649 {
650 std::string ns_err;
651 std::string s(ns_str);
652 fs_cluster_id_t fscid = strict_strtol(s.c_str(), 10, &ns_err);
653 if (!ns_err.empty() || filesystems.count(fscid) == 0) {
654 for (auto &fs : filesystems) {
655 if (fs.second->mds_map.fs_name == s) {
656 *result = std::const_pointer_cast<const Filesystem>(fs.second);
657 return 0;
658 }
659 }
660 return -ENOENT;
661 } else {
662 *result = get_filesystem(fscid);
663 return 0;
664 }
665 }
666
667 void Filesystem::print(std::ostream &out) const
668 {
669 out << "Filesystem '" << mds_map.fs_name
670 << "' (" << fscid << ")" << std::endl;
671 mds_map.print(out);
672 }
673
674 mds_gid_t FSMap::find_standby_for(mds_role_t role, boost::string_view name) const
675 {
676 mds_gid_t result = MDS_GID_NONE;
677
678 // First see if we have a STANDBY_REPLAY
679 auto fs = get_filesystem(role.fscid);
680 for (const auto &i : fs->mds_map.mds_info) {
681 const auto &info = i.second;
682 if (info.rank == role.rank && info.state == MDSMap::STATE_STANDBY_REPLAY) {
683 return info.global_id;
684 }
685 }
686
687 // See if there are any STANDBY daemons available
688 for (const auto &i : standby_daemons) {
689 const auto &gid = i.first;
690 const auto &info = i.second;
691 assert(info.state == MDSMap::STATE_STANDBY);
692 assert(info.rank == MDS_RANK_NONE);
693
694 if (info.laggy()) {
695 continue;
696 }
697
698 // The mds_info_t may or may not tell us exactly which filesystem
699 // the standby_for_rank refers to: lookup via legacy_client_fscid
700 mds_role_t target_role = {
701 info.standby_for_fscid == FS_CLUSTER_ID_NONE ?
702 legacy_client_fscid : info.standby_for_fscid,
703 info.standby_for_rank};
704
705 if ((target_role.rank == role.rank && target_role.fscid == role.fscid)
706 || (name.length() && info.standby_for_name == name)) {
707 // It's a named standby for *me*, use it.
708 return gid;
709 } else if (
710 info.standby_for_rank < 0 && info.standby_for_name.length() == 0 &&
711 (info.standby_for_fscid == FS_CLUSTER_ID_NONE ||
712 info.standby_for_fscid == role.fscid)) {
713 // It's not a named standby for anyone, use it if we don't find
714 // a named standby for me later, unless it targets another FSCID.
715 result = gid;
716 }
717 }
718
719 return result;
720 }
721
722 mds_gid_t FSMap::find_unused_for(mds_role_t role,
723 bool force_standby_active) const {
724 for (const auto &i : standby_daemons) {
725 const auto &gid = i.first;
726 const auto &info = i.second;
727 assert(info.state == MDSMap::STATE_STANDBY);
728
729 if (info.laggy() || info.rank >= 0)
730 continue;
731
732 if (info.standby_for_fscid != FS_CLUSTER_ID_NONE &&
733 info.standby_for_fscid != role.fscid)
734 continue;
735 if (info.standby_for_rank != MDS_RANK_NONE &&
736 info.standby_for_rank != role.rank)
737 continue;
738
739 // To be considered 'unused' a daemon must either not
740 // be selected for standby-replay or the force_standby_active
741 // setting must be enabled to use replay daemons anyway.
742 if (!info.standby_replay || force_standby_active) {
743 return gid;
744 }
745 }
746 return MDS_GID_NONE;
747 }
748
749 mds_gid_t FSMap::find_replacement_for(mds_role_t role, boost::string_view name,
750 bool force_standby_active) const {
751 const mds_gid_t standby = find_standby_for(role, name);
752 if (standby)
753 return standby;
754 else
755 return find_unused_for(role, force_standby_active);
756 }
757
758 void FSMap::sanity() const
759 {
760 if (legacy_client_fscid != FS_CLUSTER_ID_NONE) {
761 assert(filesystems.count(legacy_client_fscid) == 1);
762 }
763
764 for (const auto &i : filesystems) {
765 auto fs = i.second;
766 assert(fs->mds_map.compat.compare(compat) == 0);
767 assert(fs->fscid == i.first);
768 for (const auto &j : fs->mds_map.mds_info) {
769 assert(j.second.rank != MDS_RANK_NONE);
770 assert(mds_roles.count(j.first) == 1);
771 assert(standby_daemons.count(j.first) == 0);
772 assert(standby_epochs.count(j.first) == 0);
773 assert(mds_roles.at(j.first) == i.first);
774 if (j.second.state != MDSMap::STATE_STANDBY_REPLAY) {
775 assert(fs->mds_map.up.at(j.second.rank) == j.first);
776 assert(fs->mds_map.failed.count(j.second.rank) == 0);
777 assert(fs->mds_map.damaged.count(j.second.rank) == 0);
778 }
779 }
780
781 for (const auto &j : fs->mds_map.up) {
782 mds_rank_t rank = j.first;
783 assert(fs->mds_map.in.count(rank) == 1);
784 mds_gid_t gid = j.second;
785 assert(fs->mds_map.mds_info.count(gid) == 1);
786 }
787 }
788
789 for (const auto &i : standby_daemons) {
790 assert(i.second.state == MDSMap::STATE_STANDBY);
791 assert(i.second.rank == MDS_RANK_NONE);
792 assert(i.second.global_id == i.first);
793 assert(standby_epochs.count(i.first) == 1);
794 assert(mds_roles.count(i.first) == 1);
795 assert(mds_roles.at(i.first) == FS_CLUSTER_ID_NONE);
796 }
797
798 for (const auto &i : standby_epochs) {
799 assert(standby_daemons.count(i.first) == 1);
800 }
801
802 for (const auto &i : mds_roles) {
803 if (i.second == FS_CLUSTER_ID_NONE) {
804 assert(standby_daemons.count(i.first) == 1);
805 } else {
806 assert(filesystems.count(i.second) == 1);
807 assert(filesystems.at(i.second)->mds_map.mds_info.count(i.first) == 1);
808 }
809 }
810 }
811
812 void FSMap::promote(
813 mds_gid_t standby_gid,
814 const std::shared_ptr<Filesystem> &filesystem,
815 mds_rank_t assigned_rank)
816 {
817 assert(gid_exists(standby_gid));
818 bool is_standby_replay = mds_roles.at(standby_gid) != FS_CLUSTER_ID_NONE;
819 if (!is_standby_replay) {
820 assert(standby_daemons.count(standby_gid));
821 assert(standby_daemons.at(standby_gid).state == MDSMap::STATE_STANDBY);
822 }
823
824 MDSMap &mds_map = filesystem->mds_map;
825
826 // Insert daemon state to Filesystem
827 if (!is_standby_replay) {
828 mds_map.mds_info[standby_gid] = standby_daemons.at(standby_gid);
829 } else {
830 assert(mds_map.mds_info.count(standby_gid));
831 assert(mds_map.mds_info.at(standby_gid).state == MDSMap::STATE_STANDBY_REPLAY);
832 assert(mds_map.mds_info.at(standby_gid).rank == assigned_rank);
833 }
834 MDSMap::mds_info_t &info = mds_map.mds_info[standby_gid];
835
836 if (mds_map.stopped.erase(assigned_rank)) {
837 // The cluster is being expanded with a stopped rank
838 info.state = MDSMap::STATE_STARTING;
839 } else if (!mds_map.is_in(assigned_rank)) {
840 // The cluster is being expanded with a new rank
841 info.state = MDSMap::STATE_CREATING;
842 } else {
843 // An existing rank is being assigned to a replacement
844 info.state = MDSMap::STATE_REPLAY;
845 mds_map.failed.erase(assigned_rank);
846 }
847 info.rank = assigned_rank;
848 info.inc = epoch;
849 mds_roles[standby_gid] = filesystem->fscid;
850
851 // Update the rank state in Filesystem
852 mds_map.in.insert(assigned_rank);
853 mds_map.up[assigned_rank] = standby_gid;
854
855 // Remove from the list of standbys
856 if (!is_standby_replay) {
857 standby_daemons.erase(standby_gid);
858 standby_epochs.erase(standby_gid);
859 }
860
861 // Indicate that Filesystem has been modified
862 mds_map.epoch = epoch;
863 }
864
865 void FSMap::assign_standby_replay(
866 const mds_gid_t standby_gid,
867 const fs_cluster_id_t leader_ns,
868 const mds_rank_t leader_rank)
869 {
870 assert(mds_roles.at(standby_gid) == FS_CLUSTER_ID_NONE);
871 assert(gid_exists(standby_gid));
872 assert(!gid_has_rank(standby_gid));
873 assert(standby_daemons.count(standby_gid));
874
875 // Insert to the filesystem
876 auto fs = filesystems.at(leader_ns);
877 fs->mds_map.mds_info[standby_gid] = standby_daemons.at(standby_gid);
878 fs->mds_map.mds_info[standby_gid].rank = leader_rank;
879 fs->mds_map.mds_info[standby_gid].state = MDSMap::STATE_STANDBY_REPLAY;
880 mds_roles[standby_gid] = leader_ns;
881
882 // Remove from the list of standbys
883 standby_daemons.erase(standby_gid);
884 standby_epochs.erase(standby_gid);
885
886 // Indicate that Filesystem has been modified
887 fs->mds_map.epoch = epoch;
888 }
889
890 void FSMap::erase(mds_gid_t who, epoch_t blacklist_epoch)
891 {
892 if (mds_roles.at(who) == FS_CLUSTER_ID_NONE) {
893 standby_daemons.erase(who);
894 standby_epochs.erase(who);
895 } else {
896 auto &fs = filesystems.at(mds_roles.at(who));
897 const auto &info = fs->mds_map.mds_info.at(who);
898 if (info.state != MDSMap::STATE_STANDBY_REPLAY) {
899 if (info.state == MDSMap::STATE_CREATING) {
900 // If this gid didn't make it past CREATING, then forget
901 // the rank ever existed so that next time it's handed out
902 // to a gid it'll go back into CREATING.
903 fs->mds_map.in.erase(info.rank);
904 } else {
905 // Put this rank into the failed list so that the next available
906 // STANDBY will pick it up.
907 fs->mds_map.failed.insert(info.rank);
908 }
909 assert(fs->mds_map.up.at(info.rank) == info.global_id);
910 fs->mds_map.up.erase(info.rank);
911 }
912 fs->mds_map.mds_info.erase(who);
913 fs->mds_map.last_failure_osd_epoch = blacklist_epoch;
914 fs->mds_map.epoch = epoch;
915 }
916
917 mds_roles.erase(who);
918 }
919
920 void FSMap::damaged(mds_gid_t who, epoch_t blacklist_epoch)
921 {
922 assert(mds_roles.at(who) != FS_CLUSTER_ID_NONE);
923 auto fs = filesystems.at(mds_roles.at(who));
924 mds_rank_t rank = fs->mds_map.mds_info[who].rank;
925
926 erase(who, blacklist_epoch);
927 fs->mds_map.failed.erase(rank);
928 fs->mds_map.damaged.insert(rank);
929
930 assert(fs->mds_map.epoch == epoch);
931 }
932
933 /**
934 * Update to indicate that the rank `rank` is to be removed
935 * from the damaged list of the filesystem `fscid`
936 */
937 bool FSMap::undamaged(const fs_cluster_id_t fscid, const mds_rank_t rank)
938 {
939 auto fs = filesystems.at(fscid);
940
941 if (fs->mds_map.damaged.erase(rank)) {
942 fs->mds_map.failed.insert(rank);
943 fs->mds_map.epoch = epoch;
944 return true;
945 } else {
946 return false;
947 }
948 }
949
950 void FSMap::insert(const MDSMap::mds_info_t &new_info)
951 {
952 assert(new_info.state == MDSMap::STATE_STANDBY);
953 assert(new_info.rank == MDS_RANK_NONE);
954 mds_roles[new_info.global_id] = FS_CLUSTER_ID_NONE;
955 standby_daemons[new_info.global_id] = new_info;
956 standby_epochs[new_info.global_id] = epoch;
957 }
958
959 std::list<mds_gid_t> FSMap::stop(mds_gid_t who)
960 {
961 assert(mds_roles.at(who) != FS_CLUSTER_ID_NONE);
962 auto fs = filesystems.at(mds_roles.at(who));
963 const auto &info = fs->mds_map.mds_info.at(who);
964 fs->mds_map.up.erase(info.rank);
965 fs->mds_map.in.erase(info.rank);
966 fs->mds_map.stopped.insert(info.rank);
967
968 // Also drop any standby replays that were following this rank
969 std::list<mds_gid_t> standbys;
970 for (const auto &i : fs->mds_map.mds_info) {
971 const auto &other_gid = i.first;
972 const auto &other_info = i.second;
973 if (other_info.rank == info.rank
974 && other_info.state == MDSMap::STATE_STANDBY_REPLAY) {
975 standbys.push_back(other_gid);
976 erase(other_gid, 0);
977 }
978 }
979
980 fs->mds_map.mds_info.erase(who);
981 mds_roles.erase(who);
982
983 fs->mds_map.epoch = epoch;
984
985 return standbys;
986 }
987
988
989 /**
990 * Given one of the following forms:
991 * <fs name>:<rank>
992 * <fs id>:<rank>
993 * <rank>
994 *
995 * Parse into a mds_role_t. The rank-only form is only valid
996 * if legacy_client_ns is set.
997 */
998 int FSMap::parse_role(
999 boost::string_view role_str,
1000 mds_role_t *role,
1001 std::ostream &ss) const
1002 {
1003 size_t colon_pos = role_str.find(":");
1004 size_t rank_pos;
1005 std::shared_ptr<const Filesystem> fs;
1006 if (colon_pos == std::string::npos) {
1007 if (legacy_client_fscid == FS_CLUSTER_ID_NONE) {
1008 ss << "No filesystem selected";
1009 return -ENOENT;
1010 }
1011 fs = get_filesystem(legacy_client_fscid);
1012 rank_pos = 0;
1013 } else {
1014 if (parse_filesystem(role_str.substr(0, colon_pos), &fs) < 0) {
1015 ss << "Invalid filesystem";
1016 return -ENOENT;
1017 }
1018 rank_pos = colon_pos+1;
1019 }
1020
1021 mds_rank_t rank;
1022 std::string err;
1023 std::string rank_str(role_str.substr(rank_pos));
1024 long rank_i = strict_strtol(rank_str.c_str(), 10, &err);
1025 if (rank_i < 0 || !err.empty()) {
1026 ss << "Invalid rank '" << rank_str << "'";
1027 return -EINVAL;
1028 } else {
1029 rank = rank_i;
1030 }
1031
1032 if (fs->mds_map.in.count(rank) == 0) {
1033 ss << "Rank '" << rank << "' not found";
1034 return -ENOENT;
1035 }
1036
1037 *role = {fs->fscid, rank};
1038
1039 return 0;
1040 }