]> git.proxmox.com Git - ceph.git/blob - ceph/src/mds/FSMap.cc
10abd5d2c0d8f46450d5ea9a052d07fc8bdac4f4
[ceph.git] / ceph / src / mds / FSMap.cc
1 // -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
2 // vim: ts=8 sw=2 smarttab
3 /*
4 * Ceph - scalable distributed file system
5 *
6 * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
7 *
8 * This is free software; you can redistribute it and/or
9 * modify it under the terms of the GNU Lesser General Public
10 * License version 2.1, as published by the Free Software
11 * Foundation. See file COPYING.
12 *
13 */
14
15
16 #include "FSMap.h"
17
18 #include <sstream>
19 using std::stringstream;
20
21 #include "mon/health_check.h"
22
23
24 void Filesystem::dump(Formatter *f) const
25 {
26 f->open_object_section("mdsmap");
27 mds_map.dump(f);
28 f->close_section();
29 f->dump_int("id", fscid);
30 }
31
32 void FSMap::dump(Formatter *f) const
33 {
34 f->dump_int("epoch", epoch);
35
36 f->open_object_section("compat");
37 compat.dump(f);
38 f->close_section();
39
40 f->open_object_section("feature_flags");
41 f->dump_bool("enable_multiple", enable_multiple);
42 f->dump_bool("ever_enabled_multiple", ever_enabled_multiple);
43 f->close_section();
44
45 f->open_array_section("standbys");
46 for (const auto &i : standby_daemons) {
47 f->open_object_section("info");
48 i.second.dump(f);
49 f->dump_int("epoch", standby_epochs.at(i.first));
50 f->close_section();
51 }
52 f->close_section();
53
54 f->open_array_section("filesystems");
55 for (const auto &fs : filesystems) {
56 f->open_object_section("filesystem");
57 fs.second->dump(f);
58 f->close_section();
59 }
60 f->close_section();
61 }
62
63 void FSMap::generate_test_instances(list<FSMap*>& ls)
64 {
65 FSMap *m = new FSMap();
66
67 std::list<MDSMap*> mds_map_instances;
68 MDSMap::generate_test_instances(mds_map_instances);
69
70 int k = 20;
71 for (auto i : mds_map_instances) {
72 auto fs = std::make_shared<Filesystem>();
73 fs->fscid = k++;
74 fs->mds_map = *i;
75 delete i;
76 m->filesystems[fs->fscid] = fs;
77 }
78 mds_map_instances.clear();
79
80 ls.push_back(m);
81 }
82
83 void FSMap::print(ostream& out) const
84 {
85 out << "e" << epoch << std::endl;
86 out << "enable_multiple, ever_enabled_multiple: " << enable_multiple << ","
87 << ever_enabled_multiple << std::endl;
88 out << "compat: " << compat << std::endl;
89 out << "legacy client fscid: " << legacy_client_fscid << std::endl;
90 out << " " << std::endl;
91
92 if (filesystems.empty()) {
93 out << "No filesystems configured" << std::endl;
94 return;
95 }
96
97 for (const auto &fs : filesystems) {
98 fs.second->print(out);
99 out << " " << std::endl << " " << std::endl; // Space out a bit
100 }
101
102 if (!standby_daemons.empty()) {
103 out << "Standby daemons:" << std::endl << " " << std::endl;
104 }
105
106 for (const auto &p : standby_daemons) {
107 p.second.print_summary(out);
108 out << std::endl;
109 }
110 }
111
112
113
114 void FSMap::print_summary(Formatter *f, ostream *out) const
115 {
116 map<mds_role_t,string> by_rank;
117 map<string,int> by_state;
118
119 if (f) {
120 f->dump_unsigned("epoch", get_epoch());
121 for (auto i : filesystems) {
122 auto fs = i.second;
123 f->dump_unsigned("id", fs->fscid);
124 f->dump_unsigned("up", fs->mds_map.up.size());
125 f->dump_unsigned("in", fs->mds_map.in.size());
126 f->dump_unsigned("max", fs->mds_map.max_mds);
127 }
128 } else {
129 if (filesystems.size() == 1) {
130 auto fs = filesystems.begin()->second;
131 *out << fs->mds_map.up.size() << "/" << fs->mds_map.in.size() << "/"
132 << fs->mds_map.max_mds << " up";
133 } else {
134 for (auto i : filesystems) {
135 auto fs = i.second;
136 *out << fs->mds_map.fs_name << "-" << fs->mds_map.up.size() << "/"
137 << fs->mds_map.in.size() << "/" << fs->mds_map.max_mds << " up ";
138 }
139 }
140 }
141
142 if (f) {
143 f->open_array_section("by_rank");
144 }
145
146 const auto all_info = get_mds_info();
147 for (const auto &p : all_info) {
148 const auto &info = p.second;
149 string s = ceph_mds_state_name(info.state);
150 if (info.laggy()) {
151 s += "(laggy or crashed)";
152 }
153
154 const fs_cluster_id_t fscid = mds_roles.at(info.global_id);
155
156 if (info.rank != MDS_RANK_NONE &&
157 info.state != MDSMap::STATE_STANDBY_REPLAY) {
158 if (f) {
159 f->open_object_section("mds");
160 f->dump_unsigned("filesystem_id", fscid);
161 f->dump_unsigned("rank", info.rank);
162 f->dump_string("name", info.name);
163 f->dump_string("status", s);
164 f->close_section();
165 } else {
166 by_rank[mds_role_t(fscid, info.rank)] = info.name + "=" + s;
167 }
168 } else {
169 by_state[s]++;
170 }
171 }
172
173 if (f) {
174 f->close_section();
175 } else {
176 if (!by_rank.empty()) {
177 if (filesystems.size() > 1) {
178 // Disambiguate filesystems
179 std::map<std::string, std::string> pretty;
180 for (auto i : by_rank) {
181 const auto &fs_name = filesystems.at(i.first.fscid)->mds_map.fs_name;
182 std::ostringstream o;
183 o << "[" << fs_name << ":" << i.first.rank << "]";
184 pretty[o.str()] = i.second;
185 }
186 *out << " " << pretty;
187 } else {
188 // Omit FSCID in output when only one filesystem exists
189 std::map<mds_rank_t, std::string> shortened;
190 for (auto i : by_rank) {
191 shortened[i.first.rank] = i.second;
192 }
193 *out << " " << shortened;
194 }
195 }
196 }
197
198 for (map<string,int>::reverse_iterator p = by_state.rbegin(); p != by_state.rend(); ++p) {
199 if (f) {
200 f->dump_unsigned(p->first.c_str(), p->second);
201 } else {
202 *out << ", " << p->second << " " << p->first;
203 }
204 }
205
206 size_t failed = 0;
207 size_t damaged = 0;
208 for (auto i : filesystems) {
209 auto fs = i.second;
210 failed += fs->mds_map.failed.size();
211 damaged += fs->mds_map.damaged.size();
212 }
213
214 if (failed > 0) {
215 if (f) {
216 f->dump_unsigned("failed", failed);
217 } else {
218 *out << ", " << failed << " failed";
219 }
220 }
221
222 if (damaged > 0) {
223 if (f) {
224 f->dump_unsigned("damaged", damaged);
225 } else {
226 *out << ", " << damaged << " damaged";
227 }
228 }
229 //if (stopped.size())
230 //out << ", " << stopped.size() << " stopped";
231 }
232
233
234 void FSMap::create_filesystem(const std::string &name,
235 int64_t metadata_pool, int64_t data_pool,
236 uint64_t features)
237 {
238 auto fs = std::make_shared<Filesystem>();
239 fs->mds_map.fs_name = name;
240 fs->mds_map.max_mds = 1;
241 fs->mds_map.data_pools.push_back(data_pool);
242 fs->mds_map.metadata_pool = metadata_pool;
243 fs->mds_map.cas_pool = -1;
244 fs->mds_map.max_file_size = g_conf->mds_max_file_size;
245 fs->mds_map.compat = compat;
246 fs->mds_map.created = ceph_clock_now();
247 fs->mds_map.modified = ceph_clock_now();
248 fs->mds_map.session_timeout = g_conf->mds_session_timeout;
249 fs->mds_map.session_autoclose = g_conf->mds_session_autoclose;
250 fs->mds_map.enabled = true;
251 if (features & CEPH_FEATURE_SERVER_JEWEL) {
252 fs->fscid = next_filesystem_id++;
253 // ANONYMOUS is only for upgrades from legacy mdsmaps, we should
254 // have initialized next_filesystem_id such that it's never used here.
255 assert(fs->fscid != FS_CLUSTER_ID_ANONYMOUS);
256 } else {
257 // Use anon fscid because this will get thrown away when encoding
258 // as legacy MDSMap for legacy mons.
259 assert(filesystems.empty());
260 fs->fscid = FS_CLUSTER_ID_ANONYMOUS;
261 }
262 filesystems[fs->fscid] = fs;
263
264 // Created first filesystem? Set it as the one
265 // for legacy clients to use
266 if (filesystems.size() == 1) {
267 legacy_client_fscid = fs->fscid;
268 }
269 }
270
271 void FSMap::reset_filesystem(fs_cluster_id_t fscid)
272 {
273 auto fs = get_filesystem(fscid);
274 auto new_fs = std::make_shared<Filesystem>();
275
276 // Populate rank 0 as existing (so don't go into CREATING)
277 // but failed (so that next available MDS is assigned the rank)
278 new_fs->mds_map.in.insert(mds_rank_t(0));
279 new_fs->mds_map.failed.insert(mds_rank_t(0));
280
281 // Carry forward what makes sense
282 new_fs->fscid = fs->fscid;
283 new_fs->mds_map.inline_data_enabled = fs->mds_map.inline_data_enabled;
284 new_fs->mds_map.max_mds = 1;
285 new_fs->mds_map.data_pools = fs->mds_map.data_pools;
286 new_fs->mds_map.metadata_pool = fs->mds_map.metadata_pool;
287 new_fs->mds_map.cas_pool = fs->mds_map.cas_pool;
288 new_fs->mds_map.fs_name = fs->mds_map.fs_name;
289 new_fs->mds_map.max_file_size = g_conf->mds_max_file_size;
290 new_fs->mds_map.compat = compat;
291 new_fs->mds_map.created = ceph_clock_now();
292 new_fs->mds_map.modified = ceph_clock_now();
293 new_fs->mds_map.session_timeout = g_conf->mds_session_timeout;
294 new_fs->mds_map.session_autoclose = g_conf->mds_session_autoclose;
295 new_fs->mds_map.standby_count_wanted = fs->mds_map.standby_count_wanted;
296 new_fs->mds_map.enabled = true;
297
298 // Remember mds ranks that have ever started. (They should load old inotable
299 // instead of creating new one if they start again.)
300 new_fs->mds_map.stopped.insert(fs->mds_map.in.begin(), fs->mds_map.in.end());
301 new_fs->mds_map.stopped.insert(fs->mds_map.stopped.begin(), fs->mds_map.stopped.end());
302 new_fs->mds_map.stopped.erase(mds_rank_t(0));
303
304 // Persist the new FSMap
305 filesystems[new_fs->fscid] = new_fs;
306 }
307
308 void FSMap::get_health(list<pair<health_status_t,string> >& summary,
309 list<pair<health_status_t,string> > *detail) const
310 {
311 mds_rank_t standby_count_wanted = 0;
312 for (const auto &i : filesystems) {
313 const auto &fs = i.second;
314
315 // TODO: move get_health up into here so that we can qualify
316 // all the messages with what filesystem they're talking about
317 fs->mds_map.get_health(summary, detail);
318
319 standby_count_wanted = std::max(standby_count_wanted, fs->mds_map.get_standby_count_wanted((mds_rank_t)standby_daemons.size()));
320 }
321
322 if (standby_count_wanted) {
323 std::ostringstream oss;
324 oss << "insufficient standby daemons available: have " << standby_daemons.size() << "; want " << standby_count_wanted << " more";
325 summary.push_back(make_pair(HEALTH_WARN, oss.str()));
326 }
327 }
328
329 bool FSMap::check_health(void)
330 {
331 bool changed = false;
332 for (auto &i : filesystems) {
333 changed |= i.second->mds_map.check_health((mds_rank_t)standby_daemons.size());
334 }
335 return changed;
336 }
337
338 void FSMap::get_health_checks(health_check_map_t *checks) const
339 {
340 mds_rank_t standby_count_wanted = 0;
341 for (const auto &i : filesystems) {
342 const auto &fs = i.second;
343 health_check_map_t fschecks;
344 fs->mds_map.get_health_checks(&fschecks);
345 checks->merge(fschecks);
346 standby_count_wanted = std::max(
347 standby_count_wanted,
348 fs->mds_map.get_standby_count_wanted((mds_rank_t)standby_daemons.size()));
349 }
350
351 // MDS_INSUFFICIENT_STANDBY
352 if (standby_count_wanted) {
353 std::ostringstream oss, dss;
354 oss << "insufficient standby daemons available";
355 auto& d = checks->add("MDS_INSUFFICIENT_STANDBY", HEALTH_WARN, oss.str());
356 dss << "have " << standby_daemons.size() << "; want " << standby_count_wanted
357 << " more";
358 d.detail.push_back(dss.str());
359 }
360 }
361
362 void FSMap::encode(bufferlist& bl, uint64_t features) const
363 {
364 if (features & CEPH_FEATURE_SERVER_JEWEL) {
365 ENCODE_START(7, 6, bl);
366 ::encode(epoch, bl);
367 ::encode(next_filesystem_id, bl);
368 ::encode(legacy_client_fscid, bl);
369 ::encode(compat, bl);
370 ::encode(enable_multiple, bl);
371 std::vector<Filesystem> fs_list;
372 for (auto i : filesystems) {
373 fs_list.push_back(*(i.second));
374 }
375 ::encode(fs_list, bl, features);
376 ::encode(mds_roles, bl);
377 ::encode(standby_daemons, bl, features);
378 ::encode(standby_epochs, bl);
379 ::encode(ever_enabled_multiple, bl);
380 ENCODE_FINISH(bl);
381 } else {
382 if (filesystems.empty()) {
383 MDSMap disabled_map;
384 disabled_map.epoch = epoch;
385 disabled_map.encode(bl, features);
386 } else {
387 // MDSMonitor should never have created multiple filesystems
388 // until the quorum features indicated Jewel
389 assert(filesystems.size() == 1);
390 auto fs = filesystems.begin()->second;
391
392 // Take the MDSMap for the enabled filesystem, and populated its
393 // mds_info with the standbys to get a pre-jewel-style mon MDSMap.
394 MDSMap full_mdsmap = fs->mds_map;
395 full_mdsmap.epoch = epoch;
396 for (const auto &p : standby_daemons) {
397 full_mdsmap.mds_info[p.first] = p.second;
398 }
399
400 // Old MDSMaps don't set rank on standby replay daemons
401 for (auto &i : full_mdsmap.mds_info) {
402 auto &info = i.second;
403 if (info.state == MDSMap::STATE_STANDBY_REPLAY) {
404 info.rank = MDS_RANK_NONE;
405 }
406 }
407
408 full_mdsmap.encode(bl, features);
409 }
410 }
411 }
412
413 void FSMap::decode(bufferlist::iterator& p)
414 {
415 // Because the mon used to store an MDSMap where we now
416 // store an FSMap, FSMap knows how to decode the legacy
417 // MDSMap format (it never needs to encode it though).
418 MDSMap legacy_mds_map;
419
420 // The highest MDSMap encoding version before we changed the
421 // MDSMonitor to store an FSMap instead of an MDSMap was
422 // 5, so anything older than 6 is decoded as an MDSMap,
423 // and anything newer is decoded as an FSMap.
424 DECODE_START_LEGACY_COMPAT_LEN_16(7, 4, 4, p);
425 if (struct_v < 6) {
426 // Decoding an MDSMap (upgrade)
427 ::decode(epoch, p);
428 ::decode(legacy_mds_map.flags, p);
429 ::decode(legacy_mds_map.last_failure, p);
430 ::decode(legacy_mds_map.root, p);
431 ::decode(legacy_mds_map.session_timeout, p);
432 ::decode(legacy_mds_map.session_autoclose, p);
433 ::decode(legacy_mds_map.max_file_size, p);
434 ::decode(legacy_mds_map.max_mds, p);
435 ::decode(legacy_mds_map.mds_info, p);
436 if (struct_v < 3) {
437 __u32 n;
438 ::decode(n, p);
439 while (n--) {
440 __u32 m;
441 ::decode(m, p);
442 legacy_mds_map.data_pools.push_back(m);
443 }
444 __s32 s;
445 ::decode(s, p);
446 legacy_mds_map.cas_pool = s;
447 } else {
448 ::decode(legacy_mds_map.data_pools, p);
449 ::decode(legacy_mds_map.cas_pool, p);
450 }
451
452 // kclient ignores everything from here
453 __u16 ev = 1;
454 if (struct_v >= 2)
455 ::decode(ev, p);
456 if (ev >= 3)
457 ::decode(legacy_mds_map.compat, p);
458 else
459 legacy_mds_map.compat = get_mdsmap_compat_set_base();
460 if (ev < 5) {
461 __u32 n;
462 ::decode(n, p);
463 legacy_mds_map.metadata_pool = n;
464 } else {
465 ::decode(legacy_mds_map.metadata_pool, p);
466 }
467 ::decode(legacy_mds_map.created, p);
468 ::decode(legacy_mds_map.modified, p);
469 ::decode(legacy_mds_map.tableserver, p);
470 ::decode(legacy_mds_map.in, p);
471 std::map<mds_rank_t,int32_t> inc; // Legacy field, parse and drop
472 ::decode(inc, p);
473 ::decode(legacy_mds_map.up, p);
474 ::decode(legacy_mds_map.failed, p);
475 ::decode(legacy_mds_map.stopped, p);
476 if (ev >= 4)
477 ::decode(legacy_mds_map.last_failure_osd_epoch, p);
478 if (ev >= 6) {
479 if (ev < 10) {
480 // previously this was a bool about snaps, not a flag map
481 bool flag;
482 ::decode(flag, p);
483 legacy_mds_map.ever_allowed_features = flag ?
484 CEPH_MDSMAP_ALLOW_SNAPS : 0;
485 ::decode(flag, p);
486 legacy_mds_map.explicitly_allowed_features = flag ?
487 CEPH_MDSMAP_ALLOW_SNAPS : 0;
488 if (legacy_mds_map.max_mds > 1) {
489 legacy_mds_map.set_multimds_allowed();
490 }
491 } else {
492 ::decode(legacy_mds_map.ever_allowed_features, p);
493 ::decode(legacy_mds_map.explicitly_allowed_features, p);
494 }
495 } else {
496 legacy_mds_map.ever_allowed_features = CEPH_MDSMAP_ALLOW_CLASSICS;
497 legacy_mds_map.explicitly_allowed_features = 0;
498 if (legacy_mds_map.max_mds > 1) {
499 legacy_mds_map.set_multimds_allowed();
500 }
501 }
502 if (ev >= 7)
503 ::decode(legacy_mds_map.inline_data_enabled, p);
504
505 if (ev >= 8) {
506 assert(struct_v >= 5);
507 ::decode(legacy_mds_map.enabled, p);
508 ::decode(legacy_mds_map.fs_name, p);
509 } else {
510 legacy_mds_map.fs_name = "default";
511 if (epoch > 1) {
512 // If an MDS has ever been started, epoch will be greater than 1,
513 // assume filesystem is enabled.
514 legacy_mds_map.enabled = true;
515 } else {
516 // Upgrading from a cluster that never used an MDS, switch off
517 // filesystem until it's explicitly enabled.
518 legacy_mds_map.enabled = false;
519 }
520 }
521
522 if (ev >= 9) {
523 ::decode(legacy_mds_map.damaged, p);
524 }
525
526 // We're upgrading, populate filesystems from the legacy fields
527 filesystems.clear();
528 standby_daemons.clear();
529 standby_epochs.clear();
530 mds_roles.clear();
531 compat = legacy_mds_map.compat;
532 enable_multiple = false;
533
534 // Synthesise a Filesystem from legacy_mds_map, if enabled
535 if (legacy_mds_map.enabled) {
536 // Construct a Filesystem from the legacy MDSMap
537 auto migrate_fs = std::make_shared<Filesystem>();
538 migrate_fs->fscid = FS_CLUSTER_ID_ANONYMOUS;
539 migrate_fs->mds_map = legacy_mds_map;
540 migrate_fs->mds_map.epoch = epoch;
541 filesystems[migrate_fs->fscid] = migrate_fs;
542
543 // List of GIDs that had invalid states
544 std::set<mds_gid_t> drop_gids;
545
546 // Construct mds_roles, standby_daemons, and remove
547 // standbys from the MDSMap in the Filesystem.
548 for (auto &p : migrate_fs->mds_map.mds_info) {
549 if (p.second.state == MDSMap::STATE_STANDBY_REPLAY) {
550 // In legacy MDSMap, standby replay daemons don't have
551 // rank set, but since FSMap they do.
552 p.second.rank = p.second.standby_for_rank;
553 }
554 if (p.second.rank == MDS_RANK_NONE) {
555 if (p.second.state != MDSMap::STATE_STANDBY) {
556 // Old MDSMaps can have down:dne here, which
557 // is invalid in an FSMap (#17837)
558 drop_gids.insert(p.first);
559 } else {
560 insert(p.second); // into standby_daemons
561 }
562 } else {
563 mds_roles[p.first] = migrate_fs->fscid;
564 }
565 }
566 for (const auto &p : standby_daemons) {
567 // Erase from this Filesystem's MDSMap, because it has
568 // been copied into FSMap::Standby_daemons above
569 migrate_fs->mds_map.mds_info.erase(p.first);
570 }
571 for (const auto &gid : drop_gids) {
572 // Throw away all info for this MDS because it was identified
573 // as having invalid state above.
574 migrate_fs->mds_map.mds_info.erase(gid);
575 }
576
577 legacy_client_fscid = migrate_fs->fscid;
578 } else {
579 legacy_client_fscid = FS_CLUSTER_ID_NONE;
580 }
581 } else {
582 ::decode(epoch, p);
583 ::decode(next_filesystem_id, p);
584 ::decode(legacy_client_fscid, p);
585 ::decode(compat, p);
586 ::decode(enable_multiple, p);
587 std::vector<Filesystem> fs_list;
588 ::decode(fs_list, p);
589 filesystems.clear();
590 for (std::vector<Filesystem>::const_iterator fs = fs_list.begin(); fs != fs_list.end(); ++fs) {
591 filesystems[fs->fscid] = std::make_shared<Filesystem>(*fs);
592 }
593
594 ::decode(mds_roles, p);
595 ::decode(standby_daemons, p);
596 ::decode(standby_epochs, p);
597 if (struct_v >= 7) {
598 ::decode(ever_enabled_multiple, p);
599 }
600 }
601
602 DECODE_FINISH(p);
603 }
604
605
606 void Filesystem::encode(bufferlist& bl, uint64_t features) const
607 {
608 ENCODE_START(1, 1, bl);
609 ::encode(fscid, bl);
610 bufferlist mdsmap_bl;
611 mds_map.encode(mdsmap_bl, features);
612 ::encode(mdsmap_bl, bl);
613 ENCODE_FINISH(bl);
614 }
615
616 void Filesystem::decode(bufferlist::iterator& p)
617 {
618 DECODE_START(1, p);
619 ::decode(fscid, p);
620 bufferlist mdsmap_bl;
621 ::decode(mdsmap_bl, p);
622 bufferlist::iterator mdsmap_bl_iter = mdsmap_bl.begin();
623 mds_map.decode(mdsmap_bl_iter);
624 DECODE_FINISH(p);
625 }
626
627 int FSMap::parse_filesystem(
628 std::string const &ns_str,
629 std::shared_ptr<const Filesystem> *result
630 ) const
631 {
632 std::string ns_err;
633 fs_cluster_id_t fscid = strict_strtol(ns_str.c_str(), 10, &ns_err);
634 if (!ns_err.empty() || filesystems.count(fscid) == 0) {
635 for (auto &fs : filesystems) {
636 if (fs.second->mds_map.fs_name == ns_str) {
637 *result = std::const_pointer_cast<const Filesystem>(fs.second);
638 return 0;
639 }
640 }
641 return -ENOENT;
642 } else {
643 *result = get_filesystem(fscid);
644 return 0;
645 }
646 }
647
648 void Filesystem::print(std::ostream &out) const
649 {
650 out << "Filesystem '" << mds_map.fs_name
651 << "' (" << fscid << ")" << std::endl;
652 mds_map.print(out);
653 }
654
655 mds_gid_t FSMap::find_standby_for(mds_role_t role, const std::string& name) const
656 {
657 mds_gid_t result = MDS_GID_NONE;
658
659 // First see if we have a STANDBY_REPLAY
660 auto fs = get_filesystem(role.fscid);
661 for (const auto &i : fs->mds_map.mds_info) {
662 const auto &info = i.second;
663 if (info.rank == role.rank && info.state == MDSMap::STATE_STANDBY_REPLAY) {
664 return info.global_id;
665 }
666 }
667
668 // See if there are any STANDBY daemons available
669 for (const auto &i : standby_daemons) {
670 const auto &gid = i.first;
671 const auto &info = i.second;
672 assert(info.state == MDSMap::STATE_STANDBY);
673 assert(info.rank == MDS_RANK_NONE);
674
675 if (info.laggy()) {
676 continue;
677 }
678
679 // The mds_info_t may or may not tell us exactly which filesystem
680 // the standby_for_rank refers to: lookup via legacy_client_fscid
681 mds_role_t target_role = {
682 info.standby_for_fscid == FS_CLUSTER_ID_NONE ?
683 legacy_client_fscid : info.standby_for_fscid,
684 info.standby_for_rank};
685
686 if ((target_role.rank == role.rank && target_role.fscid == role.fscid)
687 || (name.length() && info.standby_for_name == name)) {
688 // It's a named standby for *me*, use it.
689 return gid;
690 } else if (
691 info.standby_for_rank < 0 && info.standby_for_name.length() == 0 &&
692 (info.standby_for_fscid == FS_CLUSTER_ID_NONE ||
693 info.standby_for_fscid == role.fscid)) {
694 // It's not a named standby for anyone, use it if we don't find
695 // a named standby for me later, unless it targets another FSCID.
696 result = gid;
697 }
698 }
699
700 return result;
701 }
702
703 mds_gid_t FSMap::find_unused_for(mds_role_t role,
704 bool force_standby_active) const {
705 for (const auto &i : standby_daemons) {
706 const auto &gid = i.first;
707 const auto &info = i.second;
708 assert(info.state == MDSMap::STATE_STANDBY);
709
710 if (info.laggy() || info.rank >= 0)
711 continue;
712
713 if (info.standby_for_fscid != FS_CLUSTER_ID_NONE &&
714 info.standby_for_fscid != role.fscid)
715 continue;
716 if (info.standby_for_rank != MDS_RANK_NONE &&
717 info.standby_for_rank != role.rank)
718 continue;
719
720 // To be considered 'unused' a daemon must either not
721 // be selected for standby-replay or the force_standby_active
722 // setting must be enabled to use replay daemons anyway.
723 if (!info.standby_replay || force_standby_active) {
724 return gid;
725 }
726 }
727 return MDS_GID_NONE;
728 }
729
730 mds_gid_t FSMap::find_replacement_for(mds_role_t role, const std::string& name,
731 bool force_standby_active) const {
732 const mds_gid_t standby = find_standby_for(role, name);
733 if (standby)
734 return standby;
735 else
736 return find_unused_for(role, force_standby_active);
737 }
738
739 void FSMap::sanity() const
740 {
741 if (legacy_client_fscid != FS_CLUSTER_ID_NONE) {
742 assert(filesystems.count(legacy_client_fscid) == 1);
743 }
744
745 for (const auto &i : filesystems) {
746 auto fs = i.second;
747 assert(fs->mds_map.compat.compare(compat) == 0);
748 assert(fs->fscid == i.first);
749 for (const auto &j : fs->mds_map.mds_info) {
750 assert(j.second.rank != MDS_RANK_NONE);
751 assert(mds_roles.count(j.first) == 1);
752 assert(standby_daemons.count(j.first) == 0);
753 assert(standby_epochs.count(j.first) == 0);
754 assert(mds_roles.at(j.first) == i.first);
755 if (j.second.state != MDSMap::STATE_STANDBY_REPLAY) {
756 assert(fs->mds_map.up.at(j.second.rank) == j.first);
757 assert(fs->mds_map.failed.count(j.second.rank) == 0);
758 assert(fs->mds_map.damaged.count(j.second.rank) == 0);
759 }
760 }
761
762 for (const auto &j : fs->mds_map.up) {
763 mds_rank_t rank = j.first;
764 assert(fs->mds_map.in.count(rank) == 1);
765 mds_gid_t gid = j.second;
766 assert(fs->mds_map.mds_info.count(gid) == 1);
767 }
768 }
769
770 for (const auto &i : standby_daemons) {
771 assert(i.second.state == MDSMap::STATE_STANDBY);
772 assert(i.second.rank == MDS_RANK_NONE);
773 assert(i.second.global_id == i.first);
774 assert(standby_epochs.count(i.first) == 1);
775 assert(mds_roles.count(i.first) == 1);
776 assert(mds_roles.at(i.first) == FS_CLUSTER_ID_NONE);
777 }
778
779 for (const auto &i : standby_epochs) {
780 assert(standby_daemons.count(i.first) == 1);
781 }
782
783 for (const auto &i : mds_roles) {
784 if (i.second == FS_CLUSTER_ID_NONE) {
785 assert(standby_daemons.count(i.first) == 1);
786 } else {
787 assert(filesystems.count(i.second) == 1);
788 assert(filesystems.at(i.second)->mds_map.mds_info.count(i.first) == 1);
789 }
790 }
791 }
792
793 void FSMap::promote(
794 mds_gid_t standby_gid,
795 const std::shared_ptr<Filesystem> &filesystem,
796 mds_rank_t assigned_rank)
797 {
798 assert(gid_exists(standby_gid));
799 bool is_standby_replay = mds_roles.at(standby_gid) != FS_CLUSTER_ID_NONE;
800 if (!is_standby_replay) {
801 assert(standby_daemons.count(standby_gid));
802 assert(standby_daemons.at(standby_gid).state == MDSMap::STATE_STANDBY);
803 }
804
805 MDSMap &mds_map = filesystem->mds_map;
806
807 // Insert daemon state to Filesystem
808 if (!is_standby_replay) {
809 mds_map.mds_info[standby_gid] = standby_daemons.at(standby_gid);
810 } else {
811 assert(mds_map.mds_info.count(standby_gid));
812 assert(mds_map.mds_info.at(standby_gid).state == MDSMap::STATE_STANDBY_REPLAY);
813 assert(mds_map.mds_info.at(standby_gid).rank == assigned_rank);
814 }
815 MDSMap::mds_info_t &info = mds_map.mds_info[standby_gid];
816
817 if (mds_map.stopped.erase(assigned_rank)) {
818 // The cluster is being expanded with a stopped rank
819 info.state = MDSMap::STATE_STARTING;
820 } else if (!mds_map.is_in(assigned_rank)) {
821 // The cluster is being expanded with a new rank
822 info.state = MDSMap::STATE_CREATING;
823 } else {
824 // An existing rank is being assigned to a replacement
825 info.state = MDSMap::STATE_REPLAY;
826 mds_map.failed.erase(assigned_rank);
827 }
828 info.rank = assigned_rank;
829 info.inc = epoch;
830 mds_roles[standby_gid] = filesystem->fscid;
831
832 // Update the rank state in Filesystem
833 mds_map.in.insert(assigned_rank);
834 mds_map.up[assigned_rank] = standby_gid;
835
836 // Remove from the list of standbys
837 if (!is_standby_replay) {
838 standby_daemons.erase(standby_gid);
839 standby_epochs.erase(standby_gid);
840 }
841
842 // Indicate that Filesystem has been modified
843 mds_map.epoch = epoch;
844 }
845
846 void FSMap::assign_standby_replay(
847 const mds_gid_t standby_gid,
848 const fs_cluster_id_t leader_ns,
849 const mds_rank_t leader_rank)
850 {
851 assert(mds_roles.at(standby_gid) == FS_CLUSTER_ID_NONE);
852 assert(gid_exists(standby_gid));
853 assert(!gid_has_rank(standby_gid));
854 assert(standby_daemons.count(standby_gid));
855
856 // Insert to the filesystem
857 auto fs = filesystems.at(leader_ns);
858 fs->mds_map.mds_info[standby_gid] = standby_daemons.at(standby_gid);
859 fs->mds_map.mds_info[standby_gid].rank = leader_rank;
860 fs->mds_map.mds_info[standby_gid].state = MDSMap::STATE_STANDBY_REPLAY;
861 mds_roles[standby_gid] = leader_ns;
862
863 // Remove from the list of standbys
864 standby_daemons.erase(standby_gid);
865 standby_epochs.erase(standby_gid);
866
867 // Indicate that Filesystem has been modified
868 fs->mds_map.epoch = epoch;
869 }
870
871 void FSMap::erase(mds_gid_t who, epoch_t blacklist_epoch)
872 {
873 if (mds_roles.at(who) == FS_CLUSTER_ID_NONE) {
874 standby_daemons.erase(who);
875 standby_epochs.erase(who);
876 } else {
877 auto &fs = filesystems.at(mds_roles.at(who));
878 const auto &info = fs->mds_map.mds_info.at(who);
879 if (info.state != MDSMap::STATE_STANDBY_REPLAY) {
880 if (info.state == MDSMap::STATE_CREATING) {
881 // If this gid didn't make it past CREATING, then forget
882 // the rank ever existed so that next time it's handed out
883 // to a gid it'll go back into CREATING.
884 fs->mds_map.in.erase(info.rank);
885 } else {
886 // Put this rank into the failed list so that the next available
887 // STANDBY will pick it up.
888 fs->mds_map.failed.insert(info.rank);
889 }
890 assert(fs->mds_map.up.at(info.rank) == info.global_id);
891 fs->mds_map.up.erase(info.rank);
892 }
893 fs->mds_map.mds_info.erase(who);
894 fs->mds_map.last_failure_osd_epoch = blacklist_epoch;
895 fs->mds_map.epoch = epoch;
896 }
897
898 mds_roles.erase(who);
899 }
900
901 void FSMap::damaged(mds_gid_t who, epoch_t blacklist_epoch)
902 {
903 assert(mds_roles.at(who) != FS_CLUSTER_ID_NONE);
904 auto fs = filesystems.at(mds_roles.at(who));
905 mds_rank_t rank = fs->mds_map.mds_info[who].rank;
906
907 erase(who, blacklist_epoch);
908 fs->mds_map.failed.erase(rank);
909 fs->mds_map.damaged.insert(rank);
910
911 assert(fs->mds_map.epoch == epoch);
912 }
913
914 /**
915 * Update to indicate that the rank `rank` is to be removed
916 * from the damaged list of the filesystem `fscid`
917 */
918 bool FSMap::undamaged(const fs_cluster_id_t fscid, const mds_rank_t rank)
919 {
920 auto fs = filesystems.at(fscid);
921
922 if (fs->mds_map.damaged.erase(rank)) {
923 fs->mds_map.failed.insert(rank);
924 fs->mds_map.epoch = epoch;
925 return true;
926 } else {
927 return false;
928 }
929 }
930
931 void FSMap::insert(const MDSMap::mds_info_t &new_info)
932 {
933 assert(new_info.state == MDSMap::STATE_STANDBY);
934 assert(new_info.rank == MDS_RANK_NONE);
935 mds_roles[new_info.global_id] = FS_CLUSTER_ID_NONE;
936 standby_daemons[new_info.global_id] = new_info;
937 standby_epochs[new_info.global_id] = epoch;
938 }
939
940 std::list<mds_gid_t> FSMap::stop(mds_gid_t who)
941 {
942 assert(mds_roles.at(who) != FS_CLUSTER_ID_NONE);
943 auto fs = filesystems.at(mds_roles.at(who));
944 const auto &info = fs->mds_map.mds_info.at(who);
945 fs->mds_map.up.erase(info.rank);
946 fs->mds_map.in.erase(info.rank);
947 fs->mds_map.stopped.insert(info.rank);
948
949 // Also drop any standby replays that were following this rank
950 std::list<mds_gid_t> standbys;
951 for (const auto &i : fs->mds_map.mds_info) {
952 const auto &other_gid = i.first;
953 const auto &other_info = i.second;
954 if (other_info.rank == info.rank
955 && other_info.state == MDSMap::STATE_STANDBY_REPLAY) {
956 standbys.push_back(other_gid);
957 erase(other_gid, 0);
958 }
959 }
960
961 fs->mds_map.mds_info.erase(who);
962 mds_roles.erase(who);
963
964 fs->mds_map.epoch = epoch;
965
966 return standbys;
967 }
968
969
970 /**
971 * Given one of the following forms:
972 * <fs name>:<rank>
973 * <fs id>:<rank>
974 * <rank>
975 *
976 * Parse into a mds_role_t. The rank-only form is only valid
977 * if legacy_client_ns is set.
978 */
979 int FSMap::parse_role(
980 const std::string &role_str,
981 mds_role_t *role,
982 std::ostream &ss) const
983 {
984 size_t colon_pos = role_str.find(":");
985 size_t rank_pos;
986 std::shared_ptr<const Filesystem> fs;
987 if (colon_pos == std::string::npos) {
988 if (legacy_client_fscid == FS_CLUSTER_ID_NONE) {
989 ss << "No filesystem selected";
990 return -ENOENT;
991 }
992 fs = get_filesystem(legacy_client_fscid);
993 rank_pos = 0;
994 } else {
995 if (parse_filesystem(role_str.substr(0, colon_pos), &fs) < 0) {
996 ss << "Invalid filesystem";
997 return -ENOENT;
998 }
999 rank_pos = colon_pos+1;
1000 }
1001
1002 mds_rank_t rank;
1003 std::string err;
1004 std::string rank_str = role_str.substr(rank_pos);
1005 long rank_i = strict_strtol(rank_str.c_str(), 10, &err);
1006 if (rank_i < 0 || !err.empty()) {
1007 ss << "Invalid rank '" << rank_str << "'";
1008 return -EINVAL;
1009 } else {
1010 rank = rank_i;
1011 }
1012
1013 if (fs->mds_map.in.count(rank) == 0) {
1014 ss << "Rank '" << rank << "' not found";
1015 return -ENOENT;
1016 }
1017
1018 *role = {fs->fscid, rank};
1019
1020 return 0;
1021 }