]>
Commit | Line | Data |
---|---|---|
7c673cae FG |
1 | // -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- |
2 | // vim: ts=8 sw=2 smarttab | |
3 | /* | |
4 | * Ceph - scalable distributed file system | |
5 | * | |
6 | * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net> | |
7 | * | |
8 | * This is free software; you can redistribute it and/or | |
9 | * modify it under the terms of the GNU Lesser General Public | |
10 | * License version 2.1, as published by the Free Software | |
11 | * Foundation. See file COPYING. | |
12 | * | |
13 | */ | |
14 | ||
15 | ||
16 | #include "FSMap.h" | |
17 | ||
18 | #include <sstream> | |
19 | using std::stringstream; | |
20 | ||
224ce89b WB |
21 | #include "mon/health_check.h" |
22 | ||
7c673cae FG |
23 | |
24 | void Filesystem::dump(Formatter *f) const | |
25 | { | |
26 | f->open_object_section("mdsmap"); | |
27 | mds_map.dump(f); | |
28 | f->close_section(); | |
29 | f->dump_int("id", fscid); | |
30 | } | |
31 | ||
32 | void FSMap::dump(Formatter *f) const | |
33 | { | |
34 | f->dump_int("epoch", epoch); | |
35 | ||
36 | f->open_object_section("compat"); | |
37 | compat.dump(f); | |
38 | f->close_section(); | |
39 | ||
40 | f->open_object_section("feature_flags"); | |
41 | f->dump_bool("enable_multiple", enable_multiple); | |
42 | f->dump_bool("ever_enabled_multiple", ever_enabled_multiple); | |
43 | f->close_section(); | |
44 | ||
45 | f->open_array_section("standbys"); | |
46 | for (const auto &i : standby_daemons) { | |
47 | f->open_object_section("info"); | |
48 | i.second.dump(f); | |
49 | f->dump_int("epoch", standby_epochs.at(i.first)); | |
50 | f->close_section(); | |
51 | } | |
52 | f->close_section(); | |
53 | ||
54 | f->open_array_section("filesystems"); | |
55 | for (const auto &fs : filesystems) { | |
56 | f->open_object_section("filesystem"); | |
57 | fs.second->dump(f); | |
58 | f->close_section(); | |
59 | } | |
60 | f->close_section(); | |
61 | } | |
62 | ||
63 | void FSMap::generate_test_instances(list<FSMap*>& ls) | |
64 | { | |
65 | FSMap *m = new FSMap(); | |
66 | ||
67 | std::list<MDSMap*> mds_map_instances; | |
68 | MDSMap::generate_test_instances(mds_map_instances); | |
69 | ||
70 | int k = 20; | |
71 | for (auto i : mds_map_instances) { | |
72 | auto fs = std::make_shared<Filesystem>(); | |
73 | fs->fscid = k++; | |
74 | fs->mds_map = *i; | |
75 | delete i; | |
76 | m->filesystems[fs->fscid] = fs; | |
77 | } | |
78 | mds_map_instances.clear(); | |
79 | ||
80 | ls.push_back(m); | |
81 | } | |
82 | ||
83 | void FSMap::print(ostream& out) const | |
84 | { | |
85 | out << "e" << epoch << std::endl; | |
86 | out << "enable_multiple, ever_enabled_multiple: " << enable_multiple << "," | |
87 | << ever_enabled_multiple << std::endl; | |
88 | out << "compat: " << compat << std::endl; | |
89 | out << "legacy client fscid: " << legacy_client_fscid << std::endl; | |
90 | out << " " << std::endl; | |
91 | ||
92 | if (filesystems.empty()) { | |
93 | out << "No filesystems configured" << std::endl; | |
94 | return; | |
95 | } | |
96 | ||
97 | for (const auto &fs : filesystems) { | |
98 | fs.second->print(out); | |
99 | out << " " << std::endl << " " << std::endl; // Space out a bit | |
100 | } | |
101 | ||
102 | if (!standby_daemons.empty()) { | |
103 | out << "Standby daemons:" << std::endl << " " << std::endl; | |
104 | } | |
105 | ||
106 | for (const auto &p : standby_daemons) { | |
107 | p.second.print_summary(out); | |
108 | out << std::endl; | |
109 | } | |
110 | } | |
111 | ||
112 | ||
113 | ||
114 | void FSMap::print_summary(Formatter *f, ostream *out) const | |
115 | { | |
116 | map<mds_role_t,string> by_rank; | |
117 | map<string,int> by_state; | |
118 | ||
119 | if (f) { | |
120 | f->dump_unsigned("epoch", get_epoch()); | |
121 | for (auto i : filesystems) { | |
122 | auto fs = i.second; | |
123 | f->dump_unsigned("id", fs->fscid); | |
124 | f->dump_unsigned("up", fs->mds_map.up.size()); | |
125 | f->dump_unsigned("in", fs->mds_map.in.size()); | |
126 | f->dump_unsigned("max", fs->mds_map.max_mds); | |
127 | } | |
128 | } else { | |
d2e6a577 FG |
129 | for (auto i : filesystems) { |
130 | auto fs = i.second; | |
131 | *out << fs->mds_map.fs_name << "-" << fs->mds_map.up.size() << "/" | |
132 | << fs->mds_map.in.size() << "/" << fs->mds_map.max_mds << " up "; | |
7c673cae FG |
133 | } |
134 | } | |
135 | ||
136 | if (f) { | |
137 | f->open_array_section("by_rank"); | |
138 | } | |
139 | ||
140 | const auto all_info = get_mds_info(); | |
141 | for (const auto &p : all_info) { | |
142 | const auto &info = p.second; | |
143 | string s = ceph_mds_state_name(info.state); | |
144 | if (info.laggy()) { | |
145 | s += "(laggy or crashed)"; | |
146 | } | |
147 | ||
148 | const fs_cluster_id_t fscid = mds_roles.at(info.global_id); | |
149 | ||
150 | if (info.rank != MDS_RANK_NONE && | |
151 | info.state != MDSMap::STATE_STANDBY_REPLAY) { | |
152 | if (f) { | |
153 | f->open_object_section("mds"); | |
154 | f->dump_unsigned("filesystem_id", fscid); | |
155 | f->dump_unsigned("rank", info.rank); | |
156 | f->dump_string("name", info.name); | |
157 | f->dump_string("status", s); | |
158 | f->close_section(); | |
159 | } else { | |
160 | by_rank[mds_role_t(fscid, info.rank)] = info.name + "=" + s; | |
161 | } | |
162 | } else { | |
163 | by_state[s]++; | |
164 | } | |
165 | } | |
166 | ||
167 | if (f) { | |
168 | f->close_section(); | |
169 | } else { | |
170 | if (!by_rank.empty()) { | |
171 | if (filesystems.size() > 1) { | |
172 | // Disambiguate filesystems | |
173 | std::map<std::string, std::string> pretty; | |
174 | for (auto i : by_rank) { | |
175 | const auto &fs_name = filesystems.at(i.first.fscid)->mds_map.fs_name; | |
176 | std::ostringstream o; | |
177 | o << "[" << fs_name << ":" << i.first.rank << "]"; | |
178 | pretty[o.str()] = i.second; | |
179 | } | |
180 | *out << " " << pretty; | |
181 | } else { | |
182 | // Omit FSCID in output when only one filesystem exists | |
183 | std::map<mds_rank_t, std::string> shortened; | |
184 | for (auto i : by_rank) { | |
185 | shortened[i.first.rank] = i.second; | |
186 | } | |
187 | *out << " " << shortened; | |
188 | } | |
189 | } | |
190 | } | |
191 | ||
192 | for (map<string,int>::reverse_iterator p = by_state.rbegin(); p != by_state.rend(); ++p) { | |
193 | if (f) { | |
194 | f->dump_unsigned(p->first.c_str(), p->second); | |
195 | } else { | |
196 | *out << ", " << p->second << " " << p->first; | |
197 | } | |
198 | } | |
199 | ||
200 | size_t failed = 0; | |
201 | size_t damaged = 0; | |
202 | for (auto i : filesystems) { | |
203 | auto fs = i.second; | |
204 | failed += fs->mds_map.failed.size(); | |
205 | damaged += fs->mds_map.damaged.size(); | |
206 | } | |
207 | ||
208 | if (failed > 0) { | |
209 | if (f) { | |
210 | f->dump_unsigned("failed", failed); | |
211 | } else { | |
212 | *out << ", " << failed << " failed"; | |
213 | } | |
214 | } | |
215 | ||
216 | if (damaged > 0) { | |
217 | if (f) { | |
218 | f->dump_unsigned("damaged", damaged); | |
219 | } else { | |
220 | *out << ", " << damaged << " damaged"; | |
221 | } | |
222 | } | |
223 | //if (stopped.size()) | |
224 | //out << ", " << stopped.size() << " stopped"; | |
225 | } | |
226 | ||
227 | ||
94b18763 | 228 | void FSMap::create_filesystem(boost::string_view name, |
7c673cae FG |
229 | int64_t metadata_pool, int64_t data_pool, |
230 | uint64_t features) | |
231 | { | |
232 | auto fs = std::make_shared<Filesystem>(); | |
28e407b8 | 233 | fs->mds_map.epoch = epoch; |
94b18763 | 234 | fs->mds_map.fs_name = std::string(name); |
7c673cae | 235 | fs->mds_map.max_mds = 1; |
31f18b77 | 236 | fs->mds_map.data_pools.push_back(data_pool); |
7c673cae FG |
237 | fs->mds_map.metadata_pool = metadata_pool; |
238 | fs->mds_map.cas_pool = -1; | |
239 | fs->mds_map.max_file_size = g_conf->mds_max_file_size; | |
240 | fs->mds_map.compat = compat; | |
241 | fs->mds_map.created = ceph_clock_now(); | |
242 | fs->mds_map.modified = ceph_clock_now(); | |
243 | fs->mds_map.session_timeout = g_conf->mds_session_timeout; | |
244 | fs->mds_map.session_autoclose = g_conf->mds_session_autoclose; | |
245 | fs->mds_map.enabled = true; | |
246 | if (features & CEPH_FEATURE_SERVER_JEWEL) { | |
247 | fs->fscid = next_filesystem_id++; | |
248 | // ANONYMOUS is only for upgrades from legacy mdsmaps, we should | |
249 | // have initialized next_filesystem_id such that it's never used here. | |
250 | assert(fs->fscid != FS_CLUSTER_ID_ANONYMOUS); | |
251 | } else { | |
252 | // Use anon fscid because this will get thrown away when encoding | |
253 | // as legacy MDSMap for legacy mons. | |
254 | assert(filesystems.empty()); | |
255 | fs->fscid = FS_CLUSTER_ID_ANONYMOUS; | |
256 | } | |
257 | filesystems[fs->fscid] = fs; | |
258 | ||
259 | // Created first filesystem? Set it as the one | |
260 | // for legacy clients to use | |
261 | if (filesystems.size() == 1) { | |
262 | legacy_client_fscid = fs->fscid; | |
263 | } | |
264 | } | |
265 | ||
266 | void FSMap::reset_filesystem(fs_cluster_id_t fscid) | |
267 | { | |
268 | auto fs = get_filesystem(fscid); | |
269 | auto new_fs = std::make_shared<Filesystem>(); | |
270 | ||
271 | // Populate rank 0 as existing (so don't go into CREATING) | |
272 | // but failed (so that next available MDS is assigned the rank) | |
273 | new_fs->mds_map.in.insert(mds_rank_t(0)); | |
274 | new_fs->mds_map.failed.insert(mds_rank_t(0)); | |
275 | ||
276 | // Carry forward what makes sense | |
277 | new_fs->fscid = fs->fscid; | |
278 | new_fs->mds_map.inline_data_enabled = fs->mds_map.inline_data_enabled; | |
279 | new_fs->mds_map.max_mds = 1; | |
280 | new_fs->mds_map.data_pools = fs->mds_map.data_pools; | |
281 | new_fs->mds_map.metadata_pool = fs->mds_map.metadata_pool; | |
282 | new_fs->mds_map.cas_pool = fs->mds_map.cas_pool; | |
283 | new_fs->mds_map.fs_name = fs->mds_map.fs_name; | |
284 | new_fs->mds_map.max_file_size = g_conf->mds_max_file_size; | |
285 | new_fs->mds_map.compat = compat; | |
286 | new_fs->mds_map.created = ceph_clock_now(); | |
287 | new_fs->mds_map.modified = ceph_clock_now(); | |
288 | new_fs->mds_map.session_timeout = g_conf->mds_session_timeout; | |
289 | new_fs->mds_map.session_autoclose = g_conf->mds_session_autoclose; | |
290 | new_fs->mds_map.standby_count_wanted = fs->mds_map.standby_count_wanted; | |
291 | new_fs->mds_map.enabled = true; | |
292 | ||
c07f9fc5 FG |
293 | // Remember mds ranks that have ever started. (They should load old inotable |
294 | // instead of creating new one if they start again.) | |
295 | new_fs->mds_map.stopped.insert(fs->mds_map.in.begin(), fs->mds_map.in.end()); | |
296 | new_fs->mds_map.stopped.insert(fs->mds_map.stopped.begin(), fs->mds_map.stopped.end()); | |
297 | new_fs->mds_map.stopped.erase(mds_rank_t(0)); | |
298 | ||
7c673cae FG |
299 | // Persist the new FSMap |
300 | filesystems[new_fs->fscid] = new_fs; | |
301 | } | |
302 | ||
303 | void FSMap::get_health(list<pair<health_status_t,string> >& summary, | |
304 | list<pair<health_status_t,string> > *detail) const | |
305 | { | |
306 | mds_rank_t standby_count_wanted = 0; | |
307 | for (const auto &i : filesystems) { | |
308 | const auto &fs = i.second; | |
309 | ||
310 | // TODO: move get_health up into here so that we can qualify | |
311 | // all the messages with what filesystem they're talking about | |
312 | fs->mds_map.get_health(summary, detail); | |
313 | ||
314 | standby_count_wanted = std::max(standby_count_wanted, fs->mds_map.get_standby_count_wanted((mds_rank_t)standby_daemons.size())); | |
315 | } | |
316 | ||
317 | if (standby_count_wanted) { | |
318 | std::ostringstream oss; | |
319 | oss << "insufficient standby daemons available: have " << standby_daemons.size() << "; want " << standby_count_wanted << " more"; | |
320 | summary.push_back(make_pair(HEALTH_WARN, oss.str())); | |
321 | } | |
322 | } | |
323 | ||
324 | bool FSMap::check_health(void) | |
325 | { | |
326 | bool changed = false; | |
327 | for (auto &i : filesystems) { | |
328 | changed |= i.second->mds_map.check_health((mds_rank_t)standby_daemons.size()); | |
329 | } | |
330 | return changed; | |
331 | } | |
332 | ||
224ce89b WB |
333 | void FSMap::get_health_checks(health_check_map_t *checks) const |
334 | { | |
335 | mds_rank_t standby_count_wanted = 0; | |
336 | for (const auto &i : filesystems) { | |
337 | const auto &fs = i.second; | |
338 | health_check_map_t fschecks; | |
d2e6a577 | 339 | |
224ce89b | 340 | fs->mds_map.get_health_checks(&fschecks); |
d2e6a577 FG |
341 | |
342 | // Some of the failed ranks might be transient (i.e. there are standbys | |
343 | // ready to replace them). We will report only on "stuck" failed, i.e. | |
344 | // ranks which are failed and have no standby replacement available. | |
345 | std::set<mds_rank_t> stuck_failed; | |
346 | ||
347 | for (const auto &rank : fs->mds_map.failed) { | |
348 | const mds_gid_t replacement = find_replacement_for( | |
349 | {fs->fscid, rank}, {}, g_conf->mon_force_standby_active); | |
350 | if (replacement == MDS_GID_NONE) { | |
351 | stuck_failed.insert(rank); | |
352 | } | |
353 | } | |
354 | ||
355 | // FS_WITH_FAILED_MDS | |
356 | if (!stuck_failed.empty()) { | |
357 | health_check_t& fscheck = checks->get_or_add( | |
358 | "FS_WITH_FAILED_MDS", HEALTH_WARN, | |
181888fb | 359 | "%num% filesystem%plurals% %hasorhave% a failed mds daemon"); |
d2e6a577 FG |
360 | ostringstream ss; |
361 | ss << "fs " << fs->mds_map.fs_name << " has " << stuck_failed.size() | |
362 | << " failed mds" << (stuck_failed.size() > 1 ? "s" : ""); | |
363 | fscheck.detail.push_back(ss.str()); } | |
364 | ||
224ce89b WB |
365 | checks->merge(fschecks); |
366 | standby_count_wanted = std::max( | |
367 | standby_count_wanted, | |
368 | fs->mds_map.get_standby_count_wanted((mds_rank_t)standby_daemons.size())); | |
369 | } | |
370 | ||
371 | // MDS_INSUFFICIENT_STANDBY | |
372 | if (standby_count_wanted) { | |
373 | std::ostringstream oss, dss; | |
d2e6a577 FG |
374 | oss << "insufficient standby MDS daemons available"; |
375 | auto& d = checks->get_or_add("MDS_INSUFFICIENT_STANDBY", HEALTH_WARN, oss.str()); | |
224ce89b WB |
376 | dss << "have " << standby_daemons.size() << "; want " << standby_count_wanted |
377 | << " more"; | |
378 | d.detail.push_back(dss.str()); | |
379 | } | |
380 | } | |
381 | ||
7c673cae FG |
382 | void FSMap::encode(bufferlist& bl, uint64_t features) const |
383 | { | |
384 | if (features & CEPH_FEATURE_SERVER_JEWEL) { | |
385 | ENCODE_START(7, 6, bl); | |
386 | ::encode(epoch, bl); | |
387 | ::encode(next_filesystem_id, bl); | |
388 | ::encode(legacy_client_fscid, bl); | |
389 | ::encode(compat, bl); | |
390 | ::encode(enable_multiple, bl); | |
391 | std::vector<Filesystem> fs_list; | |
392 | for (auto i : filesystems) { | |
393 | fs_list.push_back(*(i.second)); | |
394 | } | |
395 | ::encode(fs_list, bl, features); | |
396 | ::encode(mds_roles, bl); | |
397 | ::encode(standby_daemons, bl, features); | |
398 | ::encode(standby_epochs, bl); | |
399 | ::encode(ever_enabled_multiple, bl); | |
400 | ENCODE_FINISH(bl); | |
401 | } else { | |
402 | if (filesystems.empty()) { | |
403 | MDSMap disabled_map; | |
404 | disabled_map.epoch = epoch; | |
405 | disabled_map.encode(bl, features); | |
406 | } else { | |
407 | // MDSMonitor should never have created multiple filesystems | |
408 | // until the quorum features indicated Jewel | |
409 | assert(filesystems.size() == 1); | |
410 | auto fs = filesystems.begin()->second; | |
411 | ||
412 | // Take the MDSMap for the enabled filesystem, and populated its | |
413 | // mds_info with the standbys to get a pre-jewel-style mon MDSMap. | |
414 | MDSMap full_mdsmap = fs->mds_map; | |
415 | full_mdsmap.epoch = epoch; | |
416 | for (const auto &p : standby_daemons) { | |
417 | full_mdsmap.mds_info[p.first] = p.second; | |
418 | } | |
419 | ||
420 | // Old MDSMaps don't set rank on standby replay daemons | |
421 | for (auto &i : full_mdsmap.mds_info) { | |
422 | auto &info = i.second; | |
423 | if (info.state == MDSMap::STATE_STANDBY_REPLAY) { | |
424 | info.rank = MDS_RANK_NONE; | |
425 | } | |
426 | } | |
427 | ||
428 | full_mdsmap.encode(bl, features); | |
429 | } | |
430 | } | |
431 | } | |
432 | ||
433 | void FSMap::decode(bufferlist::iterator& p) | |
434 | { | |
7c673cae FG |
435 | // The highest MDSMap encoding version before we changed the |
436 | // MDSMonitor to store an FSMap instead of an MDSMap was | |
437 | // 5, so anything older than 6 is decoded as an MDSMap, | |
438 | // and anything newer is decoded as an FSMap. | |
439 | DECODE_START_LEGACY_COMPAT_LEN_16(7, 4, 4, p); | |
440 | if (struct_v < 6) { | |
3efd9988 FG |
441 | // Because the mon used to store an MDSMap where we now |
442 | // store an FSMap, FSMap knows how to decode the legacy | |
443 | // MDSMap format (it never needs to encode it though). | |
444 | MDSMap legacy_mds_map; | |
445 | ||
7c673cae FG |
446 | // Decoding an MDSMap (upgrade) |
447 | ::decode(epoch, p); | |
448 | ::decode(legacy_mds_map.flags, p); | |
449 | ::decode(legacy_mds_map.last_failure, p); | |
450 | ::decode(legacy_mds_map.root, p); | |
451 | ::decode(legacy_mds_map.session_timeout, p); | |
452 | ::decode(legacy_mds_map.session_autoclose, p); | |
453 | ::decode(legacy_mds_map.max_file_size, p); | |
454 | ::decode(legacy_mds_map.max_mds, p); | |
455 | ::decode(legacy_mds_map.mds_info, p); | |
456 | if (struct_v < 3) { | |
457 | __u32 n; | |
458 | ::decode(n, p); | |
459 | while (n--) { | |
460 | __u32 m; | |
461 | ::decode(m, p); | |
31f18b77 | 462 | legacy_mds_map.data_pools.push_back(m); |
7c673cae FG |
463 | } |
464 | __s32 s; | |
465 | ::decode(s, p); | |
466 | legacy_mds_map.cas_pool = s; | |
467 | } else { | |
468 | ::decode(legacy_mds_map.data_pools, p); | |
469 | ::decode(legacy_mds_map.cas_pool, p); | |
470 | } | |
471 | ||
472 | // kclient ignores everything from here | |
473 | __u16 ev = 1; | |
474 | if (struct_v >= 2) | |
475 | ::decode(ev, p); | |
476 | if (ev >= 3) | |
477 | ::decode(legacy_mds_map.compat, p); | |
478 | else | |
479 | legacy_mds_map.compat = get_mdsmap_compat_set_base(); | |
480 | if (ev < 5) { | |
481 | __u32 n; | |
482 | ::decode(n, p); | |
483 | legacy_mds_map.metadata_pool = n; | |
484 | } else { | |
485 | ::decode(legacy_mds_map.metadata_pool, p); | |
486 | } | |
487 | ::decode(legacy_mds_map.created, p); | |
488 | ::decode(legacy_mds_map.modified, p); | |
489 | ::decode(legacy_mds_map.tableserver, p); | |
490 | ::decode(legacy_mds_map.in, p); | |
491 | std::map<mds_rank_t,int32_t> inc; // Legacy field, parse and drop | |
492 | ::decode(inc, p); | |
493 | ::decode(legacy_mds_map.up, p); | |
494 | ::decode(legacy_mds_map.failed, p); | |
495 | ::decode(legacy_mds_map.stopped, p); | |
496 | if (ev >= 4) | |
497 | ::decode(legacy_mds_map.last_failure_osd_epoch, p); | |
498 | if (ev >= 6) { | |
499 | if (ev < 10) { | |
500 | // previously this was a bool about snaps, not a flag map | |
501 | bool flag; | |
502 | ::decode(flag, p); | |
503 | legacy_mds_map.ever_allowed_features = flag ? | |
504 | CEPH_MDSMAP_ALLOW_SNAPS : 0; | |
505 | ::decode(flag, p); | |
506 | legacy_mds_map.explicitly_allowed_features = flag ? | |
507 | CEPH_MDSMAP_ALLOW_SNAPS : 0; | |
508 | if (legacy_mds_map.max_mds > 1) { | |
509 | legacy_mds_map.set_multimds_allowed(); | |
510 | } | |
511 | } else { | |
512 | ::decode(legacy_mds_map.ever_allowed_features, p); | |
513 | ::decode(legacy_mds_map.explicitly_allowed_features, p); | |
514 | } | |
515 | } else { | |
516 | legacy_mds_map.ever_allowed_features = CEPH_MDSMAP_ALLOW_CLASSICS; | |
517 | legacy_mds_map.explicitly_allowed_features = 0; | |
518 | if (legacy_mds_map.max_mds > 1) { | |
519 | legacy_mds_map.set_multimds_allowed(); | |
520 | } | |
521 | } | |
522 | if (ev >= 7) | |
523 | ::decode(legacy_mds_map.inline_data_enabled, p); | |
524 | ||
525 | if (ev >= 8) { | |
526 | assert(struct_v >= 5); | |
527 | ::decode(legacy_mds_map.enabled, p); | |
528 | ::decode(legacy_mds_map.fs_name, p); | |
529 | } else { | |
530 | legacy_mds_map.fs_name = "default"; | |
531 | if (epoch > 1) { | |
532 | // If an MDS has ever been started, epoch will be greater than 1, | |
533 | // assume filesystem is enabled. | |
534 | legacy_mds_map.enabled = true; | |
535 | } else { | |
536 | // Upgrading from a cluster that never used an MDS, switch off | |
537 | // filesystem until it's explicitly enabled. | |
538 | legacy_mds_map.enabled = false; | |
539 | } | |
540 | } | |
541 | ||
542 | if (ev >= 9) { | |
543 | ::decode(legacy_mds_map.damaged, p); | |
544 | } | |
545 | ||
546 | // We're upgrading, populate filesystems from the legacy fields | |
547 | filesystems.clear(); | |
548 | standby_daemons.clear(); | |
549 | standby_epochs.clear(); | |
550 | mds_roles.clear(); | |
551 | compat = legacy_mds_map.compat; | |
552 | enable_multiple = false; | |
553 | ||
554 | // Synthesise a Filesystem from legacy_mds_map, if enabled | |
555 | if (legacy_mds_map.enabled) { | |
556 | // Construct a Filesystem from the legacy MDSMap | |
557 | auto migrate_fs = std::make_shared<Filesystem>(); | |
558 | migrate_fs->fscid = FS_CLUSTER_ID_ANONYMOUS; | |
559 | migrate_fs->mds_map = legacy_mds_map; | |
560 | migrate_fs->mds_map.epoch = epoch; | |
561 | filesystems[migrate_fs->fscid] = migrate_fs; | |
562 | ||
563 | // List of GIDs that had invalid states | |
564 | std::set<mds_gid_t> drop_gids; | |
565 | ||
566 | // Construct mds_roles, standby_daemons, and remove | |
567 | // standbys from the MDSMap in the Filesystem. | |
568 | for (auto &p : migrate_fs->mds_map.mds_info) { | |
569 | if (p.second.state == MDSMap::STATE_STANDBY_REPLAY) { | |
570 | // In legacy MDSMap, standby replay daemons don't have | |
571 | // rank set, but since FSMap they do. | |
572 | p.second.rank = p.second.standby_for_rank; | |
573 | } | |
574 | if (p.second.rank == MDS_RANK_NONE) { | |
575 | if (p.second.state != MDSMap::STATE_STANDBY) { | |
576 | // Old MDSMaps can have down:dne here, which | |
577 | // is invalid in an FSMap (#17837) | |
578 | drop_gids.insert(p.first); | |
579 | } else { | |
580 | insert(p.second); // into standby_daemons | |
581 | } | |
582 | } else { | |
583 | mds_roles[p.first] = migrate_fs->fscid; | |
584 | } | |
585 | } | |
586 | for (const auto &p : standby_daemons) { | |
587 | // Erase from this Filesystem's MDSMap, because it has | |
588 | // been copied into FSMap::Standby_daemons above | |
589 | migrate_fs->mds_map.mds_info.erase(p.first); | |
590 | } | |
591 | for (const auto &gid : drop_gids) { | |
592 | // Throw away all info for this MDS because it was identified | |
593 | // as having invalid state above. | |
594 | migrate_fs->mds_map.mds_info.erase(gid); | |
595 | } | |
596 | ||
597 | legacy_client_fscid = migrate_fs->fscid; | |
598 | } else { | |
599 | legacy_client_fscid = FS_CLUSTER_ID_NONE; | |
600 | } | |
601 | } else { | |
602 | ::decode(epoch, p); | |
603 | ::decode(next_filesystem_id, p); | |
604 | ::decode(legacy_client_fscid, p); | |
605 | ::decode(compat, p); | |
606 | ::decode(enable_multiple, p); | |
607 | std::vector<Filesystem> fs_list; | |
608 | ::decode(fs_list, p); | |
609 | filesystems.clear(); | |
610 | for (std::vector<Filesystem>::const_iterator fs = fs_list.begin(); fs != fs_list.end(); ++fs) { | |
611 | filesystems[fs->fscid] = std::make_shared<Filesystem>(*fs); | |
612 | } | |
613 | ||
614 | ::decode(mds_roles, p); | |
615 | ::decode(standby_daemons, p); | |
616 | ::decode(standby_epochs, p); | |
617 | if (struct_v >= 7) { | |
618 | ::decode(ever_enabled_multiple, p); | |
619 | } | |
620 | } | |
621 | ||
622 | DECODE_FINISH(p); | |
623 | } | |
624 | ||
3efd9988 FG |
625 | void FSMap::sanitize(std::function<bool(int64_t pool)> pool_exists) |
626 | { | |
627 | for (auto &fs : filesystems) { | |
628 | fs.second->mds_map.sanitize(pool_exists); | |
629 | } | |
630 | } | |
7c673cae FG |
631 | |
632 | void Filesystem::encode(bufferlist& bl, uint64_t features) const | |
633 | { | |
634 | ENCODE_START(1, 1, bl); | |
635 | ::encode(fscid, bl); | |
636 | bufferlist mdsmap_bl; | |
637 | mds_map.encode(mdsmap_bl, features); | |
638 | ::encode(mdsmap_bl, bl); | |
639 | ENCODE_FINISH(bl); | |
640 | } | |
641 | ||
642 | void Filesystem::decode(bufferlist::iterator& p) | |
643 | { | |
644 | DECODE_START(1, p); | |
645 | ::decode(fscid, p); | |
646 | bufferlist mdsmap_bl; | |
647 | ::decode(mdsmap_bl, p); | |
648 | bufferlist::iterator mdsmap_bl_iter = mdsmap_bl.begin(); | |
649 | mds_map.decode(mdsmap_bl_iter); | |
650 | DECODE_FINISH(p); | |
651 | } | |
652 | ||
653 | int FSMap::parse_filesystem( | |
94b18763 | 654 | boost::string_view ns_str, |
7c673cae FG |
655 | std::shared_ptr<const Filesystem> *result |
656 | ) const | |
657 | { | |
658 | std::string ns_err; | |
94b18763 FG |
659 | std::string s(ns_str); |
660 | fs_cluster_id_t fscid = strict_strtol(s.c_str(), 10, &ns_err); | |
7c673cae FG |
661 | if (!ns_err.empty() || filesystems.count(fscid) == 0) { |
662 | for (auto &fs : filesystems) { | |
94b18763 | 663 | if (fs.second->mds_map.fs_name == s) { |
7c673cae FG |
664 | *result = std::const_pointer_cast<const Filesystem>(fs.second); |
665 | return 0; | |
666 | } | |
667 | } | |
668 | return -ENOENT; | |
669 | } else { | |
670 | *result = get_filesystem(fscid); | |
671 | return 0; | |
672 | } | |
673 | } | |
674 | ||
675 | void Filesystem::print(std::ostream &out) const | |
676 | { | |
677 | out << "Filesystem '" << mds_map.fs_name | |
678 | << "' (" << fscid << ")" << std::endl; | |
679 | mds_map.print(out); | |
680 | } | |
681 | ||
94b18763 | 682 | mds_gid_t FSMap::find_standby_for(mds_role_t role, boost::string_view name) const |
7c673cae FG |
683 | { |
684 | mds_gid_t result = MDS_GID_NONE; | |
685 | ||
686 | // First see if we have a STANDBY_REPLAY | |
687 | auto fs = get_filesystem(role.fscid); | |
688 | for (const auto &i : fs->mds_map.mds_info) { | |
689 | const auto &info = i.second; | |
690 | if (info.rank == role.rank && info.state == MDSMap::STATE_STANDBY_REPLAY) { | |
691 | return info.global_id; | |
692 | } | |
693 | } | |
694 | ||
695 | // See if there are any STANDBY daemons available | |
696 | for (const auto &i : standby_daemons) { | |
697 | const auto &gid = i.first; | |
698 | const auto &info = i.second; | |
699 | assert(info.state == MDSMap::STATE_STANDBY); | |
700 | assert(info.rank == MDS_RANK_NONE); | |
701 | ||
702 | if (info.laggy()) { | |
703 | continue; | |
704 | } | |
705 | ||
706 | // The mds_info_t may or may not tell us exactly which filesystem | |
707 | // the standby_for_rank refers to: lookup via legacy_client_fscid | |
708 | mds_role_t target_role = { | |
709 | info.standby_for_fscid == FS_CLUSTER_ID_NONE ? | |
710 | legacy_client_fscid : info.standby_for_fscid, | |
711 | info.standby_for_rank}; | |
712 | ||
713 | if ((target_role.rank == role.rank && target_role.fscid == role.fscid) | |
714 | || (name.length() && info.standby_for_name == name)) { | |
715 | // It's a named standby for *me*, use it. | |
716 | return gid; | |
717 | } else if ( | |
718 | info.standby_for_rank < 0 && info.standby_for_name.length() == 0 && | |
719 | (info.standby_for_fscid == FS_CLUSTER_ID_NONE || | |
720 | info.standby_for_fscid == role.fscid)) { | |
721 | // It's not a named standby for anyone, use it if we don't find | |
722 | // a named standby for me later, unless it targets another FSCID. | |
723 | result = gid; | |
724 | } | |
725 | } | |
726 | ||
727 | return result; | |
728 | } | |
729 | ||
31f18b77 FG |
730 | mds_gid_t FSMap::find_unused_for(mds_role_t role, |
731 | bool force_standby_active) const { | |
7c673cae FG |
732 | for (const auto &i : standby_daemons) { |
733 | const auto &gid = i.first; | |
734 | const auto &info = i.second; | |
735 | assert(info.state == MDSMap::STATE_STANDBY); | |
736 | ||
737 | if (info.laggy() || info.rank >= 0) | |
738 | continue; | |
739 | ||
740 | if (info.standby_for_fscid != FS_CLUSTER_ID_NONE && | |
31f18b77 FG |
741 | info.standby_for_fscid != role.fscid) |
742 | continue; | |
743 | if (info.standby_for_rank != MDS_RANK_NONE && | |
744 | info.standby_for_rank != role.rank) | |
7c673cae FG |
745 | continue; |
746 | ||
747 | // To be considered 'unused' a daemon must either not | |
748 | // be selected for standby-replay or the force_standby_active | |
749 | // setting must be enabled to use replay daemons anyway. | |
750 | if (!info.standby_replay || force_standby_active) { | |
751 | return gid; | |
752 | } | |
753 | } | |
754 | return MDS_GID_NONE; | |
755 | } | |
756 | ||
94b18763 | 757 | mds_gid_t FSMap::find_replacement_for(mds_role_t role, boost::string_view name, |
7c673cae FG |
758 | bool force_standby_active) const { |
759 | const mds_gid_t standby = find_standby_for(role, name); | |
760 | if (standby) | |
761 | return standby; | |
762 | else | |
31f18b77 | 763 | return find_unused_for(role, force_standby_active); |
7c673cae FG |
764 | } |
765 | ||
766 | void FSMap::sanity() const | |
767 | { | |
768 | if (legacy_client_fscid != FS_CLUSTER_ID_NONE) { | |
769 | assert(filesystems.count(legacy_client_fscid) == 1); | |
770 | } | |
771 | ||
772 | for (const auto &i : filesystems) { | |
773 | auto fs = i.second; | |
774 | assert(fs->mds_map.compat.compare(compat) == 0); | |
775 | assert(fs->fscid == i.first); | |
776 | for (const auto &j : fs->mds_map.mds_info) { | |
777 | assert(j.second.rank != MDS_RANK_NONE); | |
778 | assert(mds_roles.count(j.first) == 1); | |
779 | assert(standby_daemons.count(j.first) == 0); | |
780 | assert(standby_epochs.count(j.first) == 0); | |
781 | assert(mds_roles.at(j.first) == i.first); | |
782 | if (j.second.state != MDSMap::STATE_STANDBY_REPLAY) { | |
783 | assert(fs->mds_map.up.at(j.second.rank) == j.first); | |
784 | assert(fs->mds_map.failed.count(j.second.rank) == 0); | |
785 | assert(fs->mds_map.damaged.count(j.second.rank) == 0); | |
786 | } | |
787 | } | |
788 | ||
789 | for (const auto &j : fs->mds_map.up) { | |
790 | mds_rank_t rank = j.first; | |
791 | assert(fs->mds_map.in.count(rank) == 1); | |
792 | mds_gid_t gid = j.second; | |
793 | assert(fs->mds_map.mds_info.count(gid) == 1); | |
794 | } | |
795 | } | |
796 | ||
797 | for (const auto &i : standby_daemons) { | |
798 | assert(i.second.state == MDSMap::STATE_STANDBY); | |
799 | assert(i.second.rank == MDS_RANK_NONE); | |
800 | assert(i.second.global_id == i.first); | |
801 | assert(standby_epochs.count(i.first) == 1); | |
802 | assert(mds_roles.count(i.first) == 1); | |
803 | assert(mds_roles.at(i.first) == FS_CLUSTER_ID_NONE); | |
804 | } | |
805 | ||
806 | for (const auto &i : standby_epochs) { | |
807 | assert(standby_daemons.count(i.first) == 1); | |
808 | } | |
809 | ||
810 | for (const auto &i : mds_roles) { | |
811 | if (i.second == FS_CLUSTER_ID_NONE) { | |
812 | assert(standby_daemons.count(i.first) == 1); | |
813 | } else { | |
814 | assert(filesystems.count(i.second) == 1); | |
815 | assert(filesystems.at(i.second)->mds_map.mds_info.count(i.first) == 1); | |
816 | } | |
817 | } | |
818 | } | |
819 | ||
820 | void FSMap::promote( | |
821 | mds_gid_t standby_gid, | |
822 | const std::shared_ptr<Filesystem> &filesystem, | |
823 | mds_rank_t assigned_rank) | |
824 | { | |
825 | assert(gid_exists(standby_gid)); | |
826 | bool is_standby_replay = mds_roles.at(standby_gid) != FS_CLUSTER_ID_NONE; | |
827 | if (!is_standby_replay) { | |
828 | assert(standby_daemons.count(standby_gid)); | |
829 | assert(standby_daemons.at(standby_gid).state == MDSMap::STATE_STANDBY); | |
830 | } | |
831 | ||
832 | MDSMap &mds_map = filesystem->mds_map; | |
833 | ||
834 | // Insert daemon state to Filesystem | |
835 | if (!is_standby_replay) { | |
836 | mds_map.mds_info[standby_gid] = standby_daemons.at(standby_gid); | |
837 | } else { | |
838 | assert(mds_map.mds_info.count(standby_gid)); | |
839 | assert(mds_map.mds_info.at(standby_gid).state == MDSMap::STATE_STANDBY_REPLAY); | |
840 | assert(mds_map.mds_info.at(standby_gid).rank == assigned_rank); | |
841 | } | |
842 | MDSMap::mds_info_t &info = mds_map.mds_info[standby_gid]; | |
843 | ||
844 | if (mds_map.stopped.erase(assigned_rank)) { | |
845 | // The cluster is being expanded with a stopped rank | |
846 | info.state = MDSMap::STATE_STARTING; | |
847 | } else if (!mds_map.is_in(assigned_rank)) { | |
848 | // The cluster is being expanded with a new rank | |
849 | info.state = MDSMap::STATE_CREATING; | |
850 | } else { | |
851 | // An existing rank is being assigned to a replacement | |
852 | info.state = MDSMap::STATE_REPLAY; | |
853 | mds_map.failed.erase(assigned_rank); | |
854 | } | |
855 | info.rank = assigned_rank; | |
856 | info.inc = epoch; | |
857 | mds_roles[standby_gid] = filesystem->fscid; | |
858 | ||
859 | // Update the rank state in Filesystem | |
860 | mds_map.in.insert(assigned_rank); | |
861 | mds_map.up[assigned_rank] = standby_gid; | |
862 | ||
863 | // Remove from the list of standbys | |
864 | if (!is_standby_replay) { | |
865 | standby_daemons.erase(standby_gid); | |
866 | standby_epochs.erase(standby_gid); | |
867 | } | |
868 | ||
869 | // Indicate that Filesystem has been modified | |
870 | mds_map.epoch = epoch; | |
871 | } | |
872 | ||
873 | void FSMap::assign_standby_replay( | |
874 | const mds_gid_t standby_gid, | |
875 | const fs_cluster_id_t leader_ns, | |
876 | const mds_rank_t leader_rank) | |
877 | { | |
878 | assert(mds_roles.at(standby_gid) == FS_CLUSTER_ID_NONE); | |
879 | assert(gid_exists(standby_gid)); | |
880 | assert(!gid_has_rank(standby_gid)); | |
881 | assert(standby_daemons.count(standby_gid)); | |
882 | ||
883 | // Insert to the filesystem | |
884 | auto fs = filesystems.at(leader_ns); | |
885 | fs->mds_map.mds_info[standby_gid] = standby_daemons.at(standby_gid); | |
886 | fs->mds_map.mds_info[standby_gid].rank = leader_rank; | |
887 | fs->mds_map.mds_info[standby_gid].state = MDSMap::STATE_STANDBY_REPLAY; | |
888 | mds_roles[standby_gid] = leader_ns; | |
889 | ||
890 | // Remove from the list of standbys | |
891 | standby_daemons.erase(standby_gid); | |
892 | standby_epochs.erase(standby_gid); | |
893 | ||
894 | // Indicate that Filesystem has been modified | |
895 | fs->mds_map.epoch = epoch; | |
896 | } | |
897 | ||
898 | void FSMap::erase(mds_gid_t who, epoch_t blacklist_epoch) | |
899 | { | |
900 | if (mds_roles.at(who) == FS_CLUSTER_ID_NONE) { | |
901 | standby_daemons.erase(who); | |
902 | standby_epochs.erase(who); | |
903 | } else { | |
904 | auto &fs = filesystems.at(mds_roles.at(who)); | |
905 | const auto &info = fs->mds_map.mds_info.at(who); | |
906 | if (info.state != MDSMap::STATE_STANDBY_REPLAY) { | |
907 | if (info.state == MDSMap::STATE_CREATING) { | |
908 | // If this gid didn't make it past CREATING, then forget | |
909 | // the rank ever existed so that next time it's handed out | |
910 | // to a gid it'll go back into CREATING. | |
911 | fs->mds_map.in.erase(info.rank); | |
912 | } else { | |
913 | // Put this rank into the failed list so that the next available | |
914 | // STANDBY will pick it up. | |
915 | fs->mds_map.failed.insert(info.rank); | |
916 | } | |
917 | assert(fs->mds_map.up.at(info.rank) == info.global_id); | |
918 | fs->mds_map.up.erase(info.rank); | |
919 | } | |
920 | fs->mds_map.mds_info.erase(who); | |
921 | fs->mds_map.last_failure_osd_epoch = blacklist_epoch; | |
922 | fs->mds_map.epoch = epoch; | |
923 | } | |
924 | ||
925 | mds_roles.erase(who); | |
926 | } | |
927 | ||
928 | void FSMap::damaged(mds_gid_t who, epoch_t blacklist_epoch) | |
929 | { | |
930 | assert(mds_roles.at(who) != FS_CLUSTER_ID_NONE); | |
931 | auto fs = filesystems.at(mds_roles.at(who)); | |
932 | mds_rank_t rank = fs->mds_map.mds_info[who].rank; | |
933 | ||
934 | erase(who, blacklist_epoch); | |
935 | fs->mds_map.failed.erase(rank); | |
936 | fs->mds_map.damaged.insert(rank); | |
937 | ||
938 | assert(fs->mds_map.epoch == epoch); | |
939 | } | |
940 | ||
941 | /** | |
942 | * Update to indicate that the rank `rank` is to be removed | |
943 | * from the damaged list of the filesystem `fscid` | |
944 | */ | |
945 | bool FSMap::undamaged(const fs_cluster_id_t fscid, const mds_rank_t rank) | |
946 | { | |
947 | auto fs = filesystems.at(fscid); | |
948 | ||
949 | if (fs->mds_map.damaged.erase(rank)) { | |
950 | fs->mds_map.failed.insert(rank); | |
951 | fs->mds_map.epoch = epoch; | |
952 | return true; | |
953 | } else { | |
954 | return false; | |
955 | } | |
956 | } | |
957 | ||
958 | void FSMap::insert(const MDSMap::mds_info_t &new_info) | |
959 | { | |
960 | assert(new_info.state == MDSMap::STATE_STANDBY); | |
961 | assert(new_info.rank == MDS_RANK_NONE); | |
962 | mds_roles[new_info.global_id] = FS_CLUSTER_ID_NONE; | |
963 | standby_daemons[new_info.global_id] = new_info; | |
964 | standby_epochs[new_info.global_id] = epoch; | |
965 | } | |
966 | ||
967 | std::list<mds_gid_t> FSMap::stop(mds_gid_t who) | |
968 | { | |
969 | assert(mds_roles.at(who) != FS_CLUSTER_ID_NONE); | |
970 | auto fs = filesystems.at(mds_roles.at(who)); | |
971 | const auto &info = fs->mds_map.mds_info.at(who); | |
972 | fs->mds_map.up.erase(info.rank); | |
973 | fs->mds_map.in.erase(info.rank); | |
974 | fs->mds_map.stopped.insert(info.rank); | |
975 | ||
976 | // Also drop any standby replays that were following this rank | |
977 | std::list<mds_gid_t> standbys; | |
978 | for (const auto &i : fs->mds_map.mds_info) { | |
979 | const auto &other_gid = i.first; | |
980 | const auto &other_info = i.second; | |
981 | if (other_info.rank == info.rank | |
982 | && other_info.state == MDSMap::STATE_STANDBY_REPLAY) { | |
983 | standbys.push_back(other_gid); | |
984 | erase(other_gid, 0); | |
985 | } | |
986 | } | |
987 | ||
988 | fs->mds_map.mds_info.erase(who); | |
989 | mds_roles.erase(who); | |
990 | ||
991 | fs->mds_map.epoch = epoch; | |
992 | ||
993 | return standbys; | |
994 | } | |
995 | ||
996 | ||
997 | /** | |
998 | * Given one of the following forms: | |
999 | * <fs name>:<rank> | |
1000 | * <fs id>:<rank> | |
1001 | * <rank> | |
1002 | * | |
1003 | * Parse into a mds_role_t. The rank-only form is only valid | |
1004 | * if legacy_client_ns is set. | |
1005 | */ | |
1006 | int FSMap::parse_role( | |
94b18763 | 1007 | boost::string_view role_str, |
7c673cae FG |
1008 | mds_role_t *role, |
1009 | std::ostream &ss) const | |
1010 | { | |
1011 | size_t colon_pos = role_str.find(":"); | |
1012 | size_t rank_pos; | |
1013 | std::shared_ptr<const Filesystem> fs; | |
1014 | if (colon_pos == std::string::npos) { | |
1015 | if (legacy_client_fscid == FS_CLUSTER_ID_NONE) { | |
1016 | ss << "No filesystem selected"; | |
1017 | return -ENOENT; | |
1018 | } | |
1019 | fs = get_filesystem(legacy_client_fscid); | |
1020 | rank_pos = 0; | |
1021 | } else { | |
1022 | if (parse_filesystem(role_str.substr(0, colon_pos), &fs) < 0) { | |
1023 | ss << "Invalid filesystem"; | |
1024 | return -ENOENT; | |
1025 | } | |
1026 | rank_pos = colon_pos+1; | |
1027 | } | |
1028 | ||
1029 | mds_rank_t rank; | |
1030 | std::string err; | |
94b18763 | 1031 | std::string rank_str(role_str.substr(rank_pos)); |
7c673cae FG |
1032 | long rank_i = strict_strtol(rank_str.c_str(), 10, &err); |
1033 | if (rank_i < 0 || !err.empty()) { | |
1034 | ss << "Invalid rank '" << rank_str << "'"; | |
1035 | return -EINVAL; | |
1036 | } else { | |
1037 | rank = rank_i; | |
1038 | } | |
1039 | ||
1040 | if (fs->mds_map.in.count(rank) == 0) { | |
1041 | ss << "Rank '" << rank << "' not found"; | |
1042 | return -ENOENT; | |
1043 | } | |
1044 | ||
1045 | *role = {fs->fscid, rank}; | |
1046 | ||
1047 | return 0; | |
1048 | } |