]> git.proxmox.com Git - ceph.git/blob - ceph/src/mds/FSMap.cc
eb08d02b7dcb007362210a1d6cb9298b79204f00
[ceph.git] / ceph / src / mds / FSMap.cc
1 // -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
2 // vim: ts=8 sw=2 smarttab
3 /*
4 * Ceph - scalable distributed file system
5 *
6 * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
7 *
8 * This is free software; you can redistribute it and/or
9 * modify it under the terms of the GNU Lesser General Public
10 * License version 2.1, as published by the Free Software
11 * Foundation. See file COPYING.
12 *
13 */
14
15
16 #include "FSMap.h"
17
18 #include <sstream>
19 using std::stringstream;
20
21
22 void Filesystem::dump(Formatter *f) const
23 {
24 f->open_object_section("mdsmap");
25 mds_map.dump(f);
26 f->close_section();
27 f->dump_int("id", fscid);
28 }
29
30 void FSMap::dump(Formatter *f) const
31 {
32 f->dump_int("epoch", epoch);
33
34 f->open_object_section("compat");
35 compat.dump(f);
36 f->close_section();
37
38 f->open_object_section("feature_flags");
39 f->dump_bool("enable_multiple", enable_multiple);
40 f->dump_bool("ever_enabled_multiple", ever_enabled_multiple);
41 f->close_section();
42
43 f->open_array_section("standbys");
44 for (const auto &i : standby_daemons) {
45 f->open_object_section("info");
46 i.second.dump(f);
47 f->dump_int("epoch", standby_epochs.at(i.first));
48 f->close_section();
49 }
50 f->close_section();
51
52 f->open_array_section("filesystems");
53 for (const auto &fs : filesystems) {
54 f->open_object_section("filesystem");
55 fs.second->dump(f);
56 f->close_section();
57 }
58 f->close_section();
59 }
60
61 void FSMap::generate_test_instances(list<FSMap*>& ls)
62 {
63 FSMap *m = new FSMap();
64
65 std::list<MDSMap*> mds_map_instances;
66 MDSMap::generate_test_instances(mds_map_instances);
67
68 int k = 20;
69 for (auto i : mds_map_instances) {
70 auto fs = std::make_shared<Filesystem>();
71 fs->fscid = k++;
72 fs->mds_map = *i;
73 delete i;
74 m->filesystems[fs->fscid] = fs;
75 }
76 mds_map_instances.clear();
77
78 ls.push_back(m);
79 }
80
81 void FSMap::print(ostream& out) const
82 {
83 out << "e" << epoch << std::endl;
84 out << "enable_multiple, ever_enabled_multiple: " << enable_multiple << ","
85 << ever_enabled_multiple << std::endl;
86 out << "compat: " << compat << std::endl;
87 out << "legacy client fscid: " << legacy_client_fscid << std::endl;
88 out << " " << std::endl;
89
90 if (filesystems.empty()) {
91 out << "No filesystems configured" << std::endl;
92 return;
93 }
94
95 for (const auto &fs : filesystems) {
96 fs.second->print(out);
97 out << " " << std::endl << " " << std::endl; // Space out a bit
98 }
99
100 if (!standby_daemons.empty()) {
101 out << "Standby daemons:" << std::endl << " " << std::endl;
102 }
103
104 for (const auto &p : standby_daemons) {
105 p.second.print_summary(out);
106 out << std::endl;
107 }
108 }
109
110
111
112 void FSMap::print_summary(Formatter *f, ostream *out) const
113 {
114 map<mds_role_t,string> by_rank;
115 map<string,int> by_state;
116
117 if (f) {
118 f->dump_unsigned("epoch", get_epoch());
119 for (auto i : filesystems) {
120 auto fs = i.second;
121 f->dump_unsigned("id", fs->fscid);
122 f->dump_unsigned("up", fs->mds_map.up.size());
123 f->dump_unsigned("in", fs->mds_map.in.size());
124 f->dump_unsigned("max", fs->mds_map.max_mds);
125 }
126 } else {
127 if (filesystems.size() == 1) {
128 auto fs = filesystems.begin()->second;
129 *out << fs->mds_map.up.size() << "/" << fs->mds_map.in.size() << "/"
130 << fs->mds_map.max_mds << " up";
131 } else {
132 for (auto i : filesystems) {
133 auto fs = i.second;
134 *out << fs->mds_map.fs_name << "-" << fs->mds_map.up.size() << "/"
135 << fs->mds_map.in.size() << "/" << fs->mds_map.max_mds << " up ";
136 }
137 }
138 }
139
140 if (f) {
141 f->open_array_section("by_rank");
142 }
143
144 const auto all_info = get_mds_info();
145 for (const auto &p : all_info) {
146 const auto &info = p.second;
147 string s = ceph_mds_state_name(info.state);
148 if (info.laggy()) {
149 s += "(laggy or crashed)";
150 }
151
152 const fs_cluster_id_t fscid = mds_roles.at(info.global_id);
153
154 if (info.rank != MDS_RANK_NONE &&
155 info.state != MDSMap::STATE_STANDBY_REPLAY) {
156 if (f) {
157 f->open_object_section("mds");
158 f->dump_unsigned("filesystem_id", fscid);
159 f->dump_unsigned("rank", info.rank);
160 f->dump_string("name", info.name);
161 f->dump_string("status", s);
162 f->close_section();
163 } else {
164 by_rank[mds_role_t(fscid, info.rank)] = info.name + "=" + s;
165 }
166 } else {
167 by_state[s]++;
168 }
169 }
170
171 if (f) {
172 f->close_section();
173 } else {
174 if (!by_rank.empty()) {
175 if (filesystems.size() > 1) {
176 // Disambiguate filesystems
177 std::map<std::string, std::string> pretty;
178 for (auto i : by_rank) {
179 const auto &fs_name = filesystems.at(i.first.fscid)->mds_map.fs_name;
180 std::ostringstream o;
181 o << "[" << fs_name << ":" << i.first.rank << "]";
182 pretty[o.str()] = i.second;
183 }
184 *out << " " << pretty;
185 } else {
186 // Omit FSCID in output when only one filesystem exists
187 std::map<mds_rank_t, std::string> shortened;
188 for (auto i : by_rank) {
189 shortened[i.first.rank] = i.second;
190 }
191 *out << " " << shortened;
192 }
193 }
194 }
195
196 for (map<string,int>::reverse_iterator p = by_state.rbegin(); p != by_state.rend(); ++p) {
197 if (f) {
198 f->dump_unsigned(p->first.c_str(), p->second);
199 } else {
200 *out << ", " << p->second << " " << p->first;
201 }
202 }
203
204 size_t failed = 0;
205 size_t damaged = 0;
206 for (auto i : filesystems) {
207 auto fs = i.second;
208 failed += fs->mds_map.failed.size();
209 damaged += fs->mds_map.damaged.size();
210 }
211
212 if (failed > 0) {
213 if (f) {
214 f->dump_unsigned("failed", failed);
215 } else {
216 *out << ", " << failed << " failed";
217 }
218 }
219
220 if (damaged > 0) {
221 if (f) {
222 f->dump_unsigned("damaged", damaged);
223 } else {
224 *out << ", " << damaged << " damaged";
225 }
226 }
227 //if (stopped.size())
228 //out << ", " << stopped.size() << " stopped";
229 }
230
231
232 void FSMap::create_filesystem(const std::string &name,
233 int64_t metadata_pool, int64_t data_pool,
234 uint64_t features)
235 {
236 auto fs = std::make_shared<Filesystem>();
237 fs->mds_map.fs_name = name;
238 fs->mds_map.max_mds = 1;
239 fs->mds_map.data_pools.push_back(data_pool);
240 fs->mds_map.metadata_pool = metadata_pool;
241 fs->mds_map.cas_pool = -1;
242 fs->mds_map.max_file_size = g_conf->mds_max_file_size;
243 fs->mds_map.compat = compat;
244 fs->mds_map.created = ceph_clock_now();
245 fs->mds_map.modified = ceph_clock_now();
246 fs->mds_map.session_timeout = g_conf->mds_session_timeout;
247 fs->mds_map.session_autoclose = g_conf->mds_session_autoclose;
248 fs->mds_map.enabled = true;
249 if (features & CEPH_FEATURE_SERVER_JEWEL) {
250 fs->fscid = next_filesystem_id++;
251 // ANONYMOUS is only for upgrades from legacy mdsmaps, we should
252 // have initialized next_filesystem_id such that it's never used here.
253 assert(fs->fscid != FS_CLUSTER_ID_ANONYMOUS);
254 } else {
255 // Use anon fscid because this will get thrown away when encoding
256 // as legacy MDSMap for legacy mons.
257 assert(filesystems.empty());
258 fs->fscid = FS_CLUSTER_ID_ANONYMOUS;
259 }
260 filesystems[fs->fscid] = fs;
261
262 // Created first filesystem? Set it as the one
263 // for legacy clients to use
264 if (filesystems.size() == 1) {
265 legacy_client_fscid = fs->fscid;
266 }
267 }
268
269 void FSMap::reset_filesystem(fs_cluster_id_t fscid)
270 {
271 auto fs = get_filesystem(fscid);
272 auto new_fs = std::make_shared<Filesystem>();
273
274 // Populate rank 0 as existing (so don't go into CREATING)
275 // but failed (so that next available MDS is assigned the rank)
276 new_fs->mds_map.in.insert(mds_rank_t(0));
277 new_fs->mds_map.failed.insert(mds_rank_t(0));
278
279 // Carry forward what makes sense
280 new_fs->fscid = fs->fscid;
281 new_fs->mds_map.inline_data_enabled = fs->mds_map.inline_data_enabled;
282 new_fs->mds_map.max_mds = 1;
283 new_fs->mds_map.data_pools = fs->mds_map.data_pools;
284 new_fs->mds_map.metadata_pool = fs->mds_map.metadata_pool;
285 new_fs->mds_map.cas_pool = fs->mds_map.cas_pool;
286 new_fs->mds_map.fs_name = fs->mds_map.fs_name;
287 new_fs->mds_map.max_file_size = g_conf->mds_max_file_size;
288 new_fs->mds_map.compat = compat;
289 new_fs->mds_map.created = ceph_clock_now();
290 new_fs->mds_map.modified = ceph_clock_now();
291 new_fs->mds_map.session_timeout = g_conf->mds_session_timeout;
292 new_fs->mds_map.session_autoclose = g_conf->mds_session_autoclose;
293 new_fs->mds_map.standby_count_wanted = fs->mds_map.standby_count_wanted;
294 new_fs->mds_map.enabled = true;
295
296 // Persist the new FSMap
297 filesystems[new_fs->fscid] = new_fs;
298 }
299
300 void FSMap::get_health(list<pair<health_status_t,string> >& summary,
301 list<pair<health_status_t,string> > *detail) const
302 {
303 mds_rank_t standby_count_wanted = 0;
304 for (const auto &i : filesystems) {
305 const auto &fs = i.second;
306
307 // TODO: move get_health up into here so that we can qualify
308 // all the messages with what filesystem they're talking about
309 fs->mds_map.get_health(summary, detail);
310
311 standby_count_wanted = std::max(standby_count_wanted, fs->mds_map.get_standby_count_wanted((mds_rank_t)standby_daemons.size()));
312 }
313
314 if (standby_count_wanted) {
315 std::ostringstream oss;
316 oss << "insufficient standby daemons available: have " << standby_daemons.size() << "; want " << standby_count_wanted << " more";
317 summary.push_back(make_pair(HEALTH_WARN, oss.str()));
318 }
319 }
320
321 bool FSMap::check_health(void)
322 {
323 bool changed = false;
324 for (auto &i : filesystems) {
325 changed |= i.second->mds_map.check_health((mds_rank_t)standby_daemons.size());
326 }
327 return changed;
328 }
329
330 void FSMap::encode(bufferlist& bl, uint64_t features) const
331 {
332 if (features & CEPH_FEATURE_SERVER_JEWEL) {
333 ENCODE_START(7, 6, bl);
334 ::encode(epoch, bl);
335 ::encode(next_filesystem_id, bl);
336 ::encode(legacy_client_fscid, bl);
337 ::encode(compat, bl);
338 ::encode(enable_multiple, bl);
339 std::vector<Filesystem> fs_list;
340 for (auto i : filesystems) {
341 fs_list.push_back(*(i.second));
342 }
343 ::encode(fs_list, bl, features);
344 ::encode(mds_roles, bl);
345 ::encode(standby_daemons, bl, features);
346 ::encode(standby_epochs, bl);
347 ::encode(ever_enabled_multiple, bl);
348 ENCODE_FINISH(bl);
349 } else {
350 if (filesystems.empty()) {
351 MDSMap disabled_map;
352 disabled_map.epoch = epoch;
353 disabled_map.encode(bl, features);
354 } else {
355 // MDSMonitor should never have created multiple filesystems
356 // until the quorum features indicated Jewel
357 assert(filesystems.size() == 1);
358 auto fs = filesystems.begin()->second;
359
360 // Take the MDSMap for the enabled filesystem, and populated its
361 // mds_info with the standbys to get a pre-jewel-style mon MDSMap.
362 MDSMap full_mdsmap = fs->mds_map;
363 full_mdsmap.epoch = epoch;
364 for (const auto &p : standby_daemons) {
365 full_mdsmap.mds_info[p.first] = p.second;
366 }
367
368 // Old MDSMaps don't set rank on standby replay daemons
369 for (auto &i : full_mdsmap.mds_info) {
370 auto &info = i.second;
371 if (info.state == MDSMap::STATE_STANDBY_REPLAY) {
372 info.rank = MDS_RANK_NONE;
373 }
374 }
375
376 full_mdsmap.encode(bl, features);
377 }
378 }
379 }
380
381 void FSMap::decode(bufferlist::iterator& p)
382 {
383 // Because the mon used to store an MDSMap where we now
384 // store an FSMap, FSMap knows how to decode the legacy
385 // MDSMap format (it never needs to encode it though).
386 MDSMap legacy_mds_map;
387
388 // The highest MDSMap encoding version before we changed the
389 // MDSMonitor to store an FSMap instead of an MDSMap was
390 // 5, so anything older than 6 is decoded as an MDSMap,
391 // and anything newer is decoded as an FSMap.
392 DECODE_START_LEGACY_COMPAT_LEN_16(7, 4, 4, p);
393 if (struct_v < 6) {
394 // Decoding an MDSMap (upgrade)
395 ::decode(epoch, p);
396 ::decode(legacy_mds_map.flags, p);
397 ::decode(legacy_mds_map.last_failure, p);
398 ::decode(legacy_mds_map.root, p);
399 ::decode(legacy_mds_map.session_timeout, p);
400 ::decode(legacy_mds_map.session_autoclose, p);
401 ::decode(legacy_mds_map.max_file_size, p);
402 ::decode(legacy_mds_map.max_mds, p);
403 ::decode(legacy_mds_map.mds_info, p);
404 if (struct_v < 3) {
405 __u32 n;
406 ::decode(n, p);
407 while (n--) {
408 __u32 m;
409 ::decode(m, p);
410 legacy_mds_map.data_pools.push_back(m);
411 }
412 __s32 s;
413 ::decode(s, p);
414 legacy_mds_map.cas_pool = s;
415 } else {
416 ::decode(legacy_mds_map.data_pools, p);
417 ::decode(legacy_mds_map.cas_pool, p);
418 }
419
420 // kclient ignores everything from here
421 __u16 ev = 1;
422 if (struct_v >= 2)
423 ::decode(ev, p);
424 if (ev >= 3)
425 ::decode(legacy_mds_map.compat, p);
426 else
427 legacy_mds_map.compat = get_mdsmap_compat_set_base();
428 if (ev < 5) {
429 __u32 n;
430 ::decode(n, p);
431 legacy_mds_map.metadata_pool = n;
432 } else {
433 ::decode(legacy_mds_map.metadata_pool, p);
434 }
435 ::decode(legacy_mds_map.created, p);
436 ::decode(legacy_mds_map.modified, p);
437 ::decode(legacy_mds_map.tableserver, p);
438 ::decode(legacy_mds_map.in, p);
439 std::map<mds_rank_t,int32_t> inc; // Legacy field, parse and drop
440 ::decode(inc, p);
441 ::decode(legacy_mds_map.up, p);
442 ::decode(legacy_mds_map.failed, p);
443 ::decode(legacy_mds_map.stopped, p);
444 if (ev >= 4)
445 ::decode(legacy_mds_map.last_failure_osd_epoch, p);
446 if (ev >= 6) {
447 if (ev < 10) {
448 // previously this was a bool about snaps, not a flag map
449 bool flag;
450 ::decode(flag, p);
451 legacy_mds_map.ever_allowed_features = flag ?
452 CEPH_MDSMAP_ALLOW_SNAPS : 0;
453 ::decode(flag, p);
454 legacy_mds_map.explicitly_allowed_features = flag ?
455 CEPH_MDSMAP_ALLOW_SNAPS : 0;
456 if (legacy_mds_map.max_mds > 1) {
457 legacy_mds_map.set_multimds_allowed();
458 }
459 } else {
460 ::decode(legacy_mds_map.ever_allowed_features, p);
461 ::decode(legacy_mds_map.explicitly_allowed_features, p);
462 }
463 } else {
464 legacy_mds_map.ever_allowed_features = CEPH_MDSMAP_ALLOW_CLASSICS;
465 legacy_mds_map.explicitly_allowed_features = 0;
466 if (legacy_mds_map.max_mds > 1) {
467 legacy_mds_map.set_multimds_allowed();
468 }
469 }
470 if (ev >= 7)
471 ::decode(legacy_mds_map.inline_data_enabled, p);
472
473 if (ev >= 8) {
474 assert(struct_v >= 5);
475 ::decode(legacy_mds_map.enabled, p);
476 ::decode(legacy_mds_map.fs_name, p);
477 } else {
478 legacy_mds_map.fs_name = "default";
479 if (epoch > 1) {
480 // If an MDS has ever been started, epoch will be greater than 1,
481 // assume filesystem is enabled.
482 legacy_mds_map.enabled = true;
483 } else {
484 // Upgrading from a cluster that never used an MDS, switch off
485 // filesystem until it's explicitly enabled.
486 legacy_mds_map.enabled = false;
487 }
488 }
489
490 if (ev >= 9) {
491 ::decode(legacy_mds_map.damaged, p);
492 }
493
494 // We're upgrading, populate filesystems from the legacy fields
495 filesystems.clear();
496 standby_daemons.clear();
497 standby_epochs.clear();
498 mds_roles.clear();
499 compat = legacy_mds_map.compat;
500 enable_multiple = false;
501
502 // Synthesise a Filesystem from legacy_mds_map, if enabled
503 if (legacy_mds_map.enabled) {
504 // Construct a Filesystem from the legacy MDSMap
505 auto migrate_fs = std::make_shared<Filesystem>();
506 migrate_fs->fscid = FS_CLUSTER_ID_ANONYMOUS;
507 migrate_fs->mds_map = legacy_mds_map;
508 migrate_fs->mds_map.epoch = epoch;
509 filesystems[migrate_fs->fscid] = migrate_fs;
510
511 // List of GIDs that had invalid states
512 std::set<mds_gid_t> drop_gids;
513
514 // Construct mds_roles, standby_daemons, and remove
515 // standbys from the MDSMap in the Filesystem.
516 for (auto &p : migrate_fs->mds_map.mds_info) {
517 if (p.second.state == MDSMap::STATE_STANDBY_REPLAY) {
518 // In legacy MDSMap, standby replay daemons don't have
519 // rank set, but since FSMap they do.
520 p.second.rank = p.second.standby_for_rank;
521 }
522 if (p.second.rank == MDS_RANK_NONE) {
523 if (p.second.state != MDSMap::STATE_STANDBY) {
524 // Old MDSMaps can have down:dne here, which
525 // is invalid in an FSMap (#17837)
526 drop_gids.insert(p.first);
527 } else {
528 insert(p.second); // into standby_daemons
529 }
530 } else {
531 mds_roles[p.first] = migrate_fs->fscid;
532 }
533 }
534 for (const auto &p : standby_daemons) {
535 // Erase from this Filesystem's MDSMap, because it has
536 // been copied into FSMap::Standby_daemons above
537 migrate_fs->mds_map.mds_info.erase(p.first);
538 }
539 for (const auto &gid : drop_gids) {
540 // Throw away all info for this MDS because it was identified
541 // as having invalid state above.
542 migrate_fs->mds_map.mds_info.erase(gid);
543 }
544
545 legacy_client_fscid = migrate_fs->fscid;
546 } else {
547 legacy_client_fscid = FS_CLUSTER_ID_NONE;
548 }
549 } else {
550 ::decode(epoch, p);
551 ::decode(next_filesystem_id, p);
552 ::decode(legacy_client_fscid, p);
553 ::decode(compat, p);
554 ::decode(enable_multiple, p);
555 std::vector<Filesystem> fs_list;
556 ::decode(fs_list, p);
557 filesystems.clear();
558 for (std::vector<Filesystem>::const_iterator fs = fs_list.begin(); fs != fs_list.end(); ++fs) {
559 filesystems[fs->fscid] = std::make_shared<Filesystem>(*fs);
560 }
561
562 ::decode(mds_roles, p);
563 ::decode(standby_daemons, p);
564 ::decode(standby_epochs, p);
565 if (struct_v >= 7) {
566 ::decode(ever_enabled_multiple, p);
567 }
568 }
569
570 DECODE_FINISH(p);
571 }
572
573
574 void Filesystem::encode(bufferlist& bl, uint64_t features) const
575 {
576 ENCODE_START(1, 1, bl);
577 ::encode(fscid, bl);
578 bufferlist mdsmap_bl;
579 mds_map.encode(mdsmap_bl, features);
580 ::encode(mdsmap_bl, bl);
581 ENCODE_FINISH(bl);
582 }
583
584 void Filesystem::decode(bufferlist::iterator& p)
585 {
586 DECODE_START(1, p);
587 ::decode(fscid, p);
588 bufferlist mdsmap_bl;
589 ::decode(mdsmap_bl, p);
590 bufferlist::iterator mdsmap_bl_iter = mdsmap_bl.begin();
591 mds_map.decode(mdsmap_bl_iter);
592 DECODE_FINISH(p);
593 }
594
595 int FSMap::parse_filesystem(
596 std::string const &ns_str,
597 std::shared_ptr<const Filesystem> *result
598 ) const
599 {
600 std::string ns_err;
601 fs_cluster_id_t fscid = strict_strtol(ns_str.c_str(), 10, &ns_err);
602 if (!ns_err.empty() || filesystems.count(fscid) == 0) {
603 for (auto &fs : filesystems) {
604 if (fs.second->mds_map.fs_name == ns_str) {
605 *result = std::const_pointer_cast<const Filesystem>(fs.second);
606 return 0;
607 }
608 }
609 return -ENOENT;
610 } else {
611 *result = get_filesystem(fscid);
612 return 0;
613 }
614 }
615
616 void Filesystem::print(std::ostream &out) const
617 {
618 out << "Filesystem '" << mds_map.fs_name
619 << "' (" << fscid << ")" << std::endl;
620 mds_map.print(out);
621 }
622
623 mds_gid_t FSMap::find_standby_for(mds_role_t role, const std::string& name) const
624 {
625 mds_gid_t result = MDS_GID_NONE;
626
627 // First see if we have a STANDBY_REPLAY
628 auto fs = get_filesystem(role.fscid);
629 for (const auto &i : fs->mds_map.mds_info) {
630 const auto &info = i.second;
631 if (info.rank == role.rank && info.state == MDSMap::STATE_STANDBY_REPLAY) {
632 return info.global_id;
633 }
634 }
635
636 // See if there are any STANDBY daemons available
637 for (const auto &i : standby_daemons) {
638 const auto &gid = i.first;
639 const auto &info = i.second;
640 assert(info.state == MDSMap::STATE_STANDBY);
641 assert(info.rank == MDS_RANK_NONE);
642
643 if (info.laggy()) {
644 continue;
645 }
646
647 // The mds_info_t may or may not tell us exactly which filesystem
648 // the standby_for_rank refers to: lookup via legacy_client_fscid
649 mds_role_t target_role = {
650 info.standby_for_fscid == FS_CLUSTER_ID_NONE ?
651 legacy_client_fscid : info.standby_for_fscid,
652 info.standby_for_rank};
653
654 if ((target_role.rank == role.rank && target_role.fscid == role.fscid)
655 || (name.length() && info.standby_for_name == name)) {
656 // It's a named standby for *me*, use it.
657 return gid;
658 } else if (
659 info.standby_for_rank < 0 && info.standby_for_name.length() == 0 &&
660 (info.standby_for_fscid == FS_CLUSTER_ID_NONE ||
661 info.standby_for_fscid == role.fscid)) {
662 // It's not a named standby for anyone, use it if we don't find
663 // a named standby for me later, unless it targets another FSCID.
664 result = gid;
665 }
666 }
667
668 return result;
669 }
670
671 mds_gid_t FSMap::find_unused_for(mds_role_t role,
672 bool force_standby_active) const {
673 for (const auto &i : standby_daemons) {
674 const auto &gid = i.first;
675 const auto &info = i.second;
676 assert(info.state == MDSMap::STATE_STANDBY);
677
678 if (info.laggy() || info.rank >= 0)
679 continue;
680
681 if (info.standby_for_fscid != FS_CLUSTER_ID_NONE &&
682 info.standby_for_fscid != role.fscid)
683 continue;
684 if (info.standby_for_rank != MDS_RANK_NONE &&
685 info.standby_for_rank != role.rank)
686 continue;
687
688 // To be considered 'unused' a daemon must either not
689 // be selected for standby-replay or the force_standby_active
690 // setting must be enabled to use replay daemons anyway.
691 if (!info.standby_replay || force_standby_active) {
692 return gid;
693 }
694 }
695 return MDS_GID_NONE;
696 }
697
698 mds_gid_t FSMap::find_replacement_for(mds_role_t role, const std::string& name,
699 bool force_standby_active) const {
700 const mds_gid_t standby = find_standby_for(role, name);
701 if (standby)
702 return standby;
703 else
704 return find_unused_for(role, force_standby_active);
705 }
706
707 void FSMap::sanity() const
708 {
709 if (legacy_client_fscid != FS_CLUSTER_ID_NONE) {
710 assert(filesystems.count(legacy_client_fscid) == 1);
711 }
712
713 for (const auto &i : filesystems) {
714 auto fs = i.second;
715 assert(fs->mds_map.compat.compare(compat) == 0);
716 assert(fs->fscid == i.first);
717 for (const auto &j : fs->mds_map.mds_info) {
718 assert(j.second.rank != MDS_RANK_NONE);
719 assert(mds_roles.count(j.first) == 1);
720 assert(standby_daemons.count(j.first) == 0);
721 assert(standby_epochs.count(j.first) == 0);
722 assert(mds_roles.at(j.first) == i.first);
723 if (j.second.state != MDSMap::STATE_STANDBY_REPLAY) {
724 assert(fs->mds_map.up.at(j.second.rank) == j.first);
725 assert(fs->mds_map.failed.count(j.second.rank) == 0);
726 assert(fs->mds_map.damaged.count(j.second.rank) == 0);
727 }
728 }
729
730 for (const auto &j : fs->mds_map.up) {
731 mds_rank_t rank = j.first;
732 assert(fs->mds_map.in.count(rank) == 1);
733 mds_gid_t gid = j.second;
734 assert(fs->mds_map.mds_info.count(gid) == 1);
735 }
736 }
737
738 for (const auto &i : standby_daemons) {
739 assert(i.second.state == MDSMap::STATE_STANDBY);
740 assert(i.second.rank == MDS_RANK_NONE);
741 assert(i.second.global_id == i.first);
742 assert(standby_epochs.count(i.first) == 1);
743 assert(mds_roles.count(i.first) == 1);
744 assert(mds_roles.at(i.first) == FS_CLUSTER_ID_NONE);
745 }
746
747 for (const auto &i : standby_epochs) {
748 assert(standby_daemons.count(i.first) == 1);
749 }
750
751 for (const auto &i : mds_roles) {
752 if (i.second == FS_CLUSTER_ID_NONE) {
753 assert(standby_daemons.count(i.first) == 1);
754 } else {
755 assert(filesystems.count(i.second) == 1);
756 assert(filesystems.at(i.second)->mds_map.mds_info.count(i.first) == 1);
757 }
758 }
759 }
760
761 void FSMap::promote(
762 mds_gid_t standby_gid,
763 const std::shared_ptr<Filesystem> &filesystem,
764 mds_rank_t assigned_rank)
765 {
766 assert(gid_exists(standby_gid));
767 bool is_standby_replay = mds_roles.at(standby_gid) != FS_CLUSTER_ID_NONE;
768 if (!is_standby_replay) {
769 assert(standby_daemons.count(standby_gid));
770 assert(standby_daemons.at(standby_gid).state == MDSMap::STATE_STANDBY);
771 }
772
773 MDSMap &mds_map = filesystem->mds_map;
774
775 // Insert daemon state to Filesystem
776 if (!is_standby_replay) {
777 mds_map.mds_info[standby_gid] = standby_daemons.at(standby_gid);
778 } else {
779 assert(mds_map.mds_info.count(standby_gid));
780 assert(mds_map.mds_info.at(standby_gid).state == MDSMap::STATE_STANDBY_REPLAY);
781 assert(mds_map.mds_info.at(standby_gid).rank == assigned_rank);
782 }
783 MDSMap::mds_info_t &info = mds_map.mds_info[standby_gid];
784
785 if (mds_map.stopped.erase(assigned_rank)) {
786 // The cluster is being expanded with a stopped rank
787 info.state = MDSMap::STATE_STARTING;
788 } else if (!mds_map.is_in(assigned_rank)) {
789 // The cluster is being expanded with a new rank
790 info.state = MDSMap::STATE_CREATING;
791 } else {
792 // An existing rank is being assigned to a replacement
793 info.state = MDSMap::STATE_REPLAY;
794 mds_map.failed.erase(assigned_rank);
795 }
796 info.rank = assigned_rank;
797 info.inc = epoch;
798 mds_roles[standby_gid] = filesystem->fscid;
799
800 // Update the rank state in Filesystem
801 mds_map.in.insert(assigned_rank);
802 mds_map.up[assigned_rank] = standby_gid;
803
804 // Remove from the list of standbys
805 if (!is_standby_replay) {
806 standby_daemons.erase(standby_gid);
807 standby_epochs.erase(standby_gid);
808 }
809
810 // Indicate that Filesystem has been modified
811 mds_map.epoch = epoch;
812 }
813
814 void FSMap::assign_standby_replay(
815 const mds_gid_t standby_gid,
816 const fs_cluster_id_t leader_ns,
817 const mds_rank_t leader_rank)
818 {
819 assert(mds_roles.at(standby_gid) == FS_CLUSTER_ID_NONE);
820 assert(gid_exists(standby_gid));
821 assert(!gid_has_rank(standby_gid));
822 assert(standby_daemons.count(standby_gid));
823
824 // Insert to the filesystem
825 auto fs = filesystems.at(leader_ns);
826 fs->mds_map.mds_info[standby_gid] = standby_daemons.at(standby_gid);
827 fs->mds_map.mds_info[standby_gid].rank = leader_rank;
828 fs->mds_map.mds_info[standby_gid].state = MDSMap::STATE_STANDBY_REPLAY;
829 mds_roles[standby_gid] = leader_ns;
830
831 // Remove from the list of standbys
832 standby_daemons.erase(standby_gid);
833 standby_epochs.erase(standby_gid);
834
835 // Indicate that Filesystem has been modified
836 fs->mds_map.epoch = epoch;
837 }
838
839 void FSMap::erase(mds_gid_t who, epoch_t blacklist_epoch)
840 {
841 if (mds_roles.at(who) == FS_CLUSTER_ID_NONE) {
842 standby_daemons.erase(who);
843 standby_epochs.erase(who);
844 } else {
845 auto &fs = filesystems.at(mds_roles.at(who));
846 const auto &info = fs->mds_map.mds_info.at(who);
847 if (info.state != MDSMap::STATE_STANDBY_REPLAY) {
848 if (info.state == MDSMap::STATE_CREATING) {
849 // If this gid didn't make it past CREATING, then forget
850 // the rank ever existed so that next time it's handed out
851 // to a gid it'll go back into CREATING.
852 fs->mds_map.in.erase(info.rank);
853 } else {
854 // Put this rank into the failed list so that the next available
855 // STANDBY will pick it up.
856 fs->mds_map.failed.insert(info.rank);
857 }
858 assert(fs->mds_map.up.at(info.rank) == info.global_id);
859 fs->mds_map.up.erase(info.rank);
860 }
861 fs->mds_map.mds_info.erase(who);
862 fs->mds_map.last_failure_osd_epoch = blacklist_epoch;
863 fs->mds_map.epoch = epoch;
864 }
865
866 mds_roles.erase(who);
867 }
868
869 void FSMap::damaged(mds_gid_t who, epoch_t blacklist_epoch)
870 {
871 assert(mds_roles.at(who) != FS_CLUSTER_ID_NONE);
872 auto fs = filesystems.at(mds_roles.at(who));
873 mds_rank_t rank = fs->mds_map.mds_info[who].rank;
874
875 erase(who, blacklist_epoch);
876 fs->mds_map.failed.erase(rank);
877 fs->mds_map.damaged.insert(rank);
878
879 assert(fs->mds_map.epoch == epoch);
880 }
881
882 /**
883 * Update to indicate that the rank `rank` is to be removed
884 * from the damaged list of the filesystem `fscid`
885 */
886 bool FSMap::undamaged(const fs_cluster_id_t fscid, const mds_rank_t rank)
887 {
888 auto fs = filesystems.at(fscid);
889
890 if (fs->mds_map.damaged.erase(rank)) {
891 fs->mds_map.failed.insert(rank);
892 fs->mds_map.epoch = epoch;
893 return true;
894 } else {
895 return false;
896 }
897 }
898
899 void FSMap::insert(const MDSMap::mds_info_t &new_info)
900 {
901 assert(new_info.state == MDSMap::STATE_STANDBY);
902 assert(new_info.rank == MDS_RANK_NONE);
903 mds_roles[new_info.global_id] = FS_CLUSTER_ID_NONE;
904 standby_daemons[new_info.global_id] = new_info;
905 standby_epochs[new_info.global_id] = epoch;
906 }
907
908 std::list<mds_gid_t> FSMap::stop(mds_gid_t who)
909 {
910 assert(mds_roles.at(who) != FS_CLUSTER_ID_NONE);
911 auto fs = filesystems.at(mds_roles.at(who));
912 const auto &info = fs->mds_map.mds_info.at(who);
913 fs->mds_map.up.erase(info.rank);
914 fs->mds_map.in.erase(info.rank);
915 fs->mds_map.stopped.insert(info.rank);
916
917 // Also drop any standby replays that were following this rank
918 std::list<mds_gid_t> standbys;
919 for (const auto &i : fs->mds_map.mds_info) {
920 const auto &other_gid = i.first;
921 const auto &other_info = i.second;
922 if (other_info.rank == info.rank
923 && other_info.state == MDSMap::STATE_STANDBY_REPLAY) {
924 standbys.push_back(other_gid);
925 erase(other_gid, 0);
926 }
927 }
928
929 fs->mds_map.mds_info.erase(who);
930 mds_roles.erase(who);
931
932 fs->mds_map.epoch = epoch;
933
934 return standbys;
935 }
936
937
938 /**
939 * Given one of the following forms:
940 * <fs name>:<rank>
941 * <fs id>:<rank>
942 * <rank>
943 *
944 * Parse into a mds_role_t. The rank-only form is only valid
945 * if legacy_client_ns is set.
946 */
947 int FSMap::parse_role(
948 const std::string &role_str,
949 mds_role_t *role,
950 std::ostream &ss) const
951 {
952 size_t colon_pos = role_str.find(":");
953 size_t rank_pos;
954 std::shared_ptr<const Filesystem> fs;
955 if (colon_pos == std::string::npos) {
956 if (legacy_client_fscid == FS_CLUSTER_ID_NONE) {
957 ss << "No filesystem selected";
958 return -ENOENT;
959 }
960 fs = get_filesystem(legacy_client_fscid);
961 rank_pos = 0;
962 } else {
963 if (parse_filesystem(role_str.substr(0, colon_pos), &fs) < 0) {
964 ss << "Invalid filesystem";
965 return -ENOENT;
966 }
967 rank_pos = colon_pos+1;
968 }
969
970 mds_rank_t rank;
971 std::string err;
972 std::string rank_str = role_str.substr(rank_pos);
973 long rank_i = strict_strtol(rank_str.c_str(), 10, &err);
974 if (rank_i < 0 || !err.empty()) {
975 ss << "Invalid rank '" << rank_str << "'";
976 return -EINVAL;
977 } else {
978 rank = rank_i;
979 }
980
981 if (fs->mds_map.in.count(rank) == 0) {
982 ss << "Rank '" << rank << "' not found";
983 return -ENOENT;
984 }
985
986 *role = {fs->fscid, rank};
987
988 return 0;
989 }