]>
Commit | Line | Data |
---|---|---|
7c673cae FG |
1 | // -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- |
2 | // vim: ts=8 sw=2 smarttab | |
3 | /* | |
4 | * Ceph - scalable distributed file system | |
5 | * | |
6 | * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net> | |
7 | * | |
8 | * This is free software; you can redistribute it and/or | |
9 | * modify it under the terms of the GNU Lesser General Public | |
10 | * License version 2.1, as published by the Free Software | |
11 | * Foundation. See file COPYING. | |
12 | * | |
13 | */ | |
14 | ||
15 | ||
16 | #include "MDSMap.h" | |
17 | ||
18 | #include <sstream> | |
19 | using std::stringstream; | |
20 | ||
224ce89b WB |
21 | #include "mon/health_check.h" |
22 | ||
7c673cae FG |
23 | |
24 | // features | |
25 | CompatSet get_mdsmap_compat_set_all() { | |
26 | CompatSet::FeatureSet feature_compat; | |
27 | CompatSet::FeatureSet feature_ro_compat; | |
28 | CompatSet::FeatureSet feature_incompat; | |
29 | feature_incompat.insert(MDS_FEATURE_INCOMPAT_BASE); | |
30 | feature_incompat.insert(MDS_FEATURE_INCOMPAT_CLIENTRANGES); | |
31 | feature_incompat.insert(MDS_FEATURE_INCOMPAT_FILELAYOUT); | |
32 | feature_incompat.insert(MDS_FEATURE_INCOMPAT_DIRINODE); | |
33 | feature_incompat.insert(MDS_FEATURE_INCOMPAT_ENCODING); | |
34 | feature_incompat.insert(MDS_FEATURE_INCOMPAT_OMAPDIRFRAG); | |
35 | feature_incompat.insert(MDS_FEATURE_INCOMPAT_INLINE); | |
36 | feature_incompat.insert(MDS_FEATURE_INCOMPAT_NOANCHOR); | |
37 | feature_incompat.insert(MDS_FEATURE_INCOMPAT_FILE_LAYOUT_V2); | |
38 | ||
39 | return CompatSet(feature_compat, feature_ro_compat, feature_incompat); | |
40 | } | |
41 | ||
42 | CompatSet get_mdsmap_compat_set_default() { | |
43 | CompatSet::FeatureSet feature_compat; | |
44 | CompatSet::FeatureSet feature_ro_compat; | |
45 | CompatSet::FeatureSet feature_incompat; | |
46 | feature_incompat.insert(MDS_FEATURE_INCOMPAT_BASE); | |
47 | feature_incompat.insert(MDS_FEATURE_INCOMPAT_CLIENTRANGES); | |
48 | feature_incompat.insert(MDS_FEATURE_INCOMPAT_FILELAYOUT); | |
49 | feature_incompat.insert(MDS_FEATURE_INCOMPAT_DIRINODE); | |
50 | feature_incompat.insert(MDS_FEATURE_INCOMPAT_ENCODING); | |
51 | feature_incompat.insert(MDS_FEATURE_INCOMPAT_OMAPDIRFRAG); | |
52 | feature_incompat.insert(MDS_FEATURE_INCOMPAT_NOANCHOR); | |
53 | feature_incompat.insert(MDS_FEATURE_INCOMPAT_FILE_LAYOUT_V2); | |
54 | ||
55 | return CompatSet(feature_compat, feature_ro_compat, feature_incompat); | |
56 | } | |
57 | ||
58 | // base (pre v0.20) | |
59 | CompatSet get_mdsmap_compat_set_base() { | |
60 | CompatSet::FeatureSet feature_compat_base; | |
61 | CompatSet::FeatureSet feature_incompat_base; | |
62 | feature_incompat_base.insert(MDS_FEATURE_INCOMPAT_BASE); | |
63 | CompatSet::FeatureSet feature_ro_compat_base; | |
64 | ||
65 | return CompatSet(feature_compat_base, feature_ro_compat_base, feature_incompat_base); | |
66 | } | |
67 | ||
68 | void MDSMap::mds_info_t::dump(Formatter *f) const | |
69 | { | |
70 | f->dump_unsigned("gid", global_id); | |
71 | f->dump_string("name", name); | |
72 | f->dump_int("rank", rank); | |
73 | f->dump_int("incarnation", inc); | |
74 | f->dump_stream("state") << ceph_mds_state_name(state); | |
75 | f->dump_int("state_seq", state_seq); | |
76 | f->dump_stream("addr") << addr; | |
77 | if (laggy_since != utime_t()) | |
78 | f->dump_stream("laggy_since") << laggy_since; | |
79 | ||
80 | f->dump_int("standby_for_rank", standby_for_rank); | |
81 | f->dump_int("standby_for_fscid", standby_for_fscid); | |
82 | f->dump_string("standby_for_name", standby_for_name); | |
83 | f->dump_bool("standby_replay", standby_replay); | |
84 | f->open_array_section("export_targets"); | |
85 | for (set<mds_rank_t>::iterator p = export_targets.begin(); | |
86 | p != export_targets.end(); ++p) { | |
87 | f->dump_int("mds", *p); | |
88 | } | |
89 | f->close_section(); | |
90 | f->dump_unsigned("features", mds_features); | |
91 | } | |
92 | ||
93 | void MDSMap::mds_info_t::print_summary(ostream &out) const | |
94 | { | |
95 | out << global_id << ":\t" | |
96 | << addr | |
97 | << " '" << name << "'" | |
98 | << " mds." << rank | |
99 | << "." << inc | |
100 | << " " << ceph_mds_state_name(state) | |
101 | << " seq " << state_seq; | |
102 | if (laggy()) { | |
103 | out << " laggy since " << laggy_since; | |
104 | } | |
105 | if (standby_for_rank != -1 || | |
106 | !standby_for_name.empty()) { | |
107 | out << " (standby for"; | |
108 | //if (standby_for_rank >= 0) | |
109 | out << " rank " << standby_for_rank; | |
110 | if (!standby_for_name.empty()) { | |
111 | out << " '" << standby_for_name << "'"; | |
112 | } | |
113 | out << ")"; | |
114 | } | |
115 | if (!export_targets.empty()) { | |
116 | out << " export_targets=" << export_targets; | |
117 | } | |
118 | } | |
119 | ||
120 | void MDSMap::mds_info_t::generate_test_instances(list<mds_info_t*>& ls) | |
121 | { | |
122 | mds_info_t *sample = new mds_info_t(); | |
123 | ls.push_back(sample); | |
124 | sample = new mds_info_t(); | |
125 | sample->global_id = 1; | |
126 | sample->name = "test_instance"; | |
127 | sample->rank = 0; | |
128 | ls.push_back(sample); | |
129 | } | |
130 | ||
131 | void MDSMap::dump(Formatter *f) const | |
132 | { | |
133 | f->dump_int("epoch", epoch); | |
134 | f->dump_unsigned("flags", flags); | |
135 | f->dump_unsigned("ever_allowed_features", ever_allowed_features); | |
136 | f->dump_unsigned("explicitly_allowed_features", explicitly_allowed_features); | |
137 | f->dump_stream("created") << created; | |
138 | f->dump_stream("modified") << modified; | |
139 | f->dump_int("tableserver", tableserver); | |
140 | f->dump_int("root", root); | |
141 | f->dump_int("session_timeout", session_timeout); | |
142 | f->dump_int("session_autoclose", session_autoclose); | |
143 | f->dump_int("max_file_size", max_file_size); | |
144 | f->dump_int("last_failure", last_failure); | |
145 | f->dump_int("last_failure_osd_epoch", last_failure_osd_epoch); | |
146 | f->open_object_section("compat"); | |
147 | compat.dump(f); | |
148 | f->close_section(); | |
149 | f->dump_int("max_mds", max_mds); | |
150 | f->open_array_section("in"); | |
151 | for (set<mds_rank_t>::const_iterator p = in.begin(); p != in.end(); ++p) | |
152 | f->dump_int("mds", *p); | |
153 | f->close_section(); | |
154 | f->open_object_section("up"); | |
155 | for (map<mds_rank_t,mds_gid_t>::const_iterator p = up.begin(); p != up.end(); ++p) { | |
156 | char s[14]; | |
157 | sprintf(s, "mds_%d", int(p->first)); | |
158 | f->dump_int(s, p->second); | |
159 | } | |
160 | f->close_section(); | |
161 | f->open_array_section("failed"); | |
162 | for (set<mds_rank_t>::const_iterator p = failed.begin(); p != failed.end(); ++p) | |
163 | f->dump_int("mds", *p); | |
164 | f->close_section(); | |
165 | f->open_array_section("damaged"); | |
166 | for (set<mds_rank_t>::const_iterator p = damaged.begin(); p != damaged.end(); ++p) | |
167 | f->dump_int("mds", *p); | |
168 | f->close_section(); | |
169 | f->open_array_section("stopped"); | |
170 | for (set<mds_rank_t>::const_iterator p = stopped.begin(); p != stopped.end(); ++p) | |
171 | f->dump_int("mds", *p); | |
172 | f->close_section(); | |
173 | f->open_object_section("info"); | |
174 | for (map<mds_gid_t,mds_info_t>::const_iterator p = mds_info.begin(); p != mds_info.end(); ++p) { | |
175 | char s[25]; // 'gid_' + len(str(ULLONG_MAX)) + '\0' | |
176 | sprintf(s, "gid_%llu", (long long unsigned)p->first); | |
177 | f->open_object_section(s); | |
178 | p->second.dump(f); | |
179 | f->close_section(); | |
180 | } | |
181 | f->close_section(); | |
182 | f->open_array_section("data_pools"); | |
31f18b77 FG |
183 | for (const auto p: data_pools) |
184 | f->dump_int("pool", p); | |
7c673cae FG |
185 | f->close_section(); |
186 | f->dump_int("metadata_pool", metadata_pool); | |
187 | f->dump_bool("enabled", enabled); | |
188 | f->dump_string("fs_name", fs_name); | |
189 | f->dump_string("balancer", balancer); | |
190 | f->dump_int("standby_count_wanted", std::max(0, standby_count_wanted)); | |
191 | } | |
192 | ||
193 | void MDSMap::generate_test_instances(list<MDSMap*>& ls) | |
194 | { | |
195 | MDSMap *m = new MDSMap(); | |
196 | m->max_mds = 1; | |
31f18b77 | 197 | m->data_pools.push_back(0); |
7c673cae FG |
198 | m->metadata_pool = 1; |
199 | m->cas_pool = 2; | |
200 | m->compat = get_mdsmap_compat_set_all(); | |
201 | ||
202 | // these aren't the defaults, just in case anybody gets confused | |
203 | m->session_timeout = 61; | |
204 | m->session_autoclose = 301; | |
205 | m->max_file_size = 1<<24; | |
206 | ls.push_back(m); | |
207 | } | |
208 | ||
209 | void MDSMap::print(ostream& out) const | |
210 | { | |
211 | out << "fs_name\t" << fs_name << "\n"; | |
212 | out << "epoch\t" << epoch << "\n"; | |
213 | out << "flags\t" << hex << flags << dec << "\n"; | |
214 | out << "created\t" << created << "\n"; | |
215 | out << "modified\t" << modified << "\n"; | |
216 | out << "tableserver\t" << tableserver << "\n"; | |
217 | out << "root\t" << root << "\n"; | |
218 | out << "session_timeout\t" << session_timeout << "\n" | |
219 | << "session_autoclose\t" << session_autoclose << "\n"; | |
220 | out << "max_file_size\t" << max_file_size << "\n"; | |
221 | out << "last_failure\t" << last_failure << "\n" | |
222 | << "last_failure_osd_epoch\t" << last_failure_osd_epoch << "\n"; | |
223 | out << "compat\t" << compat << "\n"; | |
224 | out << "max_mds\t" << max_mds << "\n"; | |
225 | out << "in\t" << in << "\n" | |
226 | << "up\t" << up << "\n" | |
227 | << "failed\t" << failed << "\n" | |
228 | << "damaged\t" << damaged << "\n" | |
229 | << "stopped\t" << stopped << "\n"; | |
230 | out << "data_pools\t" << data_pools << "\n"; | |
231 | out << "metadata_pool\t" << metadata_pool << "\n"; | |
232 | out << "inline_data\t" << (inline_data_enabled ? "enabled" : "disabled") << "\n"; | |
233 | out << "balancer\t" << balancer << "\n"; | |
234 | out << "standby_count_wanted\t" << std::max(0, standby_count_wanted) << "\n"; | |
235 | ||
236 | multimap< pair<mds_rank_t, unsigned>, mds_gid_t > foo; | |
237 | for (const auto &p : mds_info) { | |
238 | foo.insert(std::make_pair( | |
239 | std::make_pair(p.second.rank, p.second.inc-1), p.first)); | |
240 | } | |
241 | ||
242 | for (const auto &p : foo) { | |
243 | const mds_info_t& info = mds_info.at(p.second); | |
244 | info.print_summary(out); | |
245 | out << "\n"; | |
246 | } | |
247 | } | |
248 | ||
249 | ||
250 | ||
251 | void MDSMap::print_summary(Formatter *f, ostream *out) const | |
252 | { | |
253 | map<mds_rank_t,string> by_rank; | |
254 | map<string,int> by_state; | |
255 | ||
256 | if (f) { | |
257 | f->dump_unsigned("epoch", get_epoch()); | |
258 | f->dump_unsigned("up", up.size()); | |
259 | f->dump_unsigned("in", in.size()); | |
260 | f->dump_unsigned("max", max_mds); | |
261 | } else { | |
262 | *out << "e" << get_epoch() << ": " << up.size() << "/" << in.size() << "/" << max_mds << " up"; | |
263 | } | |
264 | ||
265 | if (f) | |
266 | f->open_array_section("by_rank"); | |
267 | for (const auto &p : mds_info) { | |
268 | string s = ceph_mds_state_name(p.second.state); | |
269 | if (p.second.laggy()) | |
270 | s += "(laggy or crashed)"; | |
271 | ||
272 | if (p.second.rank >= 0 && p.second.state != MDSMap::STATE_STANDBY_REPLAY) { | |
273 | if (f) { | |
274 | f->open_object_section("mds"); | |
275 | f->dump_unsigned("rank", p.second.rank); | |
276 | f->dump_string("name", p.second.name); | |
277 | f->dump_string("status", s); | |
278 | f->close_section(); | |
279 | } else { | |
280 | by_rank[p.second.rank] = p.second.name + "=" + s; | |
281 | } | |
282 | } else { | |
283 | by_state[s]++; | |
284 | } | |
285 | } | |
286 | if (f) { | |
287 | f->close_section(); | |
288 | } else { | |
289 | if (!by_rank.empty()) | |
290 | *out << " " << by_rank; | |
291 | } | |
292 | ||
293 | for (map<string,int>::reverse_iterator p = by_state.rbegin(); p != by_state.rend(); ++p) { | |
294 | if (f) { | |
295 | f->dump_unsigned(p->first.c_str(), p->second); | |
296 | } else { | |
297 | *out << ", " << p->second << " " << p->first; | |
298 | } | |
299 | } | |
300 | ||
301 | if (!failed.empty()) { | |
302 | if (f) { | |
303 | f->dump_unsigned("failed", failed.size()); | |
304 | } else { | |
305 | *out << ", " << failed.size() << " failed"; | |
306 | } | |
307 | } | |
308 | ||
309 | if (!damaged.empty()) { | |
310 | if (f) { | |
311 | f->dump_unsigned("damaged", damaged.size()); | |
312 | } else { | |
313 | *out << ", " << damaged.size() << " damaged"; | |
314 | } | |
315 | } | |
316 | //if (stopped.size()) | |
317 | //out << ", " << stopped.size() << " stopped"; | |
318 | } | |
319 | ||
320 | void MDSMap::get_health(list<pair<health_status_t,string> >& summary, | |
321 | list<pair<health_status_t,string> > *detail) const | |
322 | { | |
323 | if (!failed.empty()) { | |
324 | std::ostringstream oss; | |
325 | oss << "mds rank" | |
326 | << ((failed.size() > 1) ? "s ":" ") | |
327 | << failed | |
328 | << ((failed.size() > 1) ? " have":" has") | |
329 | << " failed"; | |
330 | summary.push_back(make_pair(HEALTH_ERR, oss.str())); | |
331 | if (detail) { | |
332 | for (set<mds_rank_t>::const_iterator p = failed.begin(); p != failed.end(); ++p) { | |
333 | std::ostringstream oss; | |
334 | oss << "mds." << *p << " has failed"; | |
335 | detail->push_back(make_pair(HEALTH_ERR, oss.str())); | |
336 | } | |
337 | } | |
338 | } | |
339 | ||
340 | if (!damaged.empty()) { | |
341 | std::ostringstream oss; | |
342 | oss << "mds rank" | |
343 | << ((damaged.size() > 1) ? "s ":" ") | |
344 | << damaged | |
345 | << ((damaged.size() > 1) ? " are":" is") | |
346 | << " damaged"; | |
347 | summary.push_back(make_pair(HEALTH_ERR, oss.str())); | |
348 | if (detail) { | |
349 | for (set<mds_rank_t>::const_iterator p = damaged.begin(); p != damaged.end(); ++p) { | |
350 | std::ostringstream oss; | |
351 | oss << "mds." << *p << " is damaged"; | |
352 | detail->push_back(make_pair(HEALTH_ERR, oss.str())); | |
353 | } | |
354 | } | |
355 | } | |
356 | ||
357 | if (is_degraded()) { | |
358 | summary.push_back(make_pair(HEALTH_WARN, "mds cluster is degraded")); | |
359 | if (detail) { | |
360 | detail->push_back(make_pair(HEALTH_WARN, "mds cluster is degraded")); | |
361 | for (mds_rank_t i = mds_rank_t(0); i< get_max_mds(); i++) { | |
362 | if (!is_up(i)) | |
363 | continue; | |
364 | mds_gid_t gid = up.find(i)->second; | |
365 | map<mds_gid_t,mds_info_t>::const_iterator info = mds_info.find(gid); | |
366 | stringstream ss; | |
367 | if (is_resolve(i)) | |
368 | ss << "mds." << info->second.name << " at " << info->second.addr << " rank " << i << " is resolving"; | |
369 | if (is_replay(i)) | |
370 | ss << "mds." << info->second.name << " at " << info->second.addr << " rank " << i << " is replaying journal"; | |
371 | if (is_rejoin(i)) | |
372 | ss << "mds." << info->second.name << " at " << info->second.addr << " rank " << i << " is rejoining"; | |
373 | if (is_reconnect(i)) | |
374 | ss << "mds." << info->second.name << " at " << info->second.addr << " rank " << i << " is reconnecting to clients"; | |
375 | if (ss.str().length()) | |
376 | detail->push_back(make_pair(HEALTH_WARN, ss.str())); | |
377 | } | |
378 | } | |
379 | } | |
380 | ||
381 | map<mds_gid_t, mds_info_t>::const_iterator m_end = mds_info.end(); | |
382 | set<string> laggy; | |
383 | for (const auto &u : up) { | |
384 | map<mds_gid_t, mds_info_t>::const_iterator m = mds_info.find(u.second); | |
385 | if (m == m_end) { | |
386 | std::cerr << "Up rank " << u.first << " GID " << u.second << " not found!" << std::endl; | |
387 | } | |
388 | assert(m != m_end); | |
389 | const mds_info_t &mds_info(m->second); | |
390 | if (mds_info.laggy()) { | |
391 | laggy.insert(mds_info.name); | |
392 | if (detail) { | |
393 | std::ostringstream oss; | |
394 | oss << "mds." << mds_info.name << " at " << mds_info.addr << " is laggy/unresponsive"; | |
395 | detail->push_back(make_pair(HEALTH_WARN, oss.str())); | |
396 | } | |
397 | } | |
398 | } | |
399 | ||
400 | if (!laggy.empty()) { | |
401 | std::ostringstream oss; | |
402 | oss << "mds " << laggy | |
403 | << ((laggy.size() > 1) ? " are":" is") | |
404 | << " laggy"; | |
405 | summary.push_back(make_pair(HEALTH_WARN, oss.str())); | |
406 | } | |
407 | } | |
408 | ||
224ce89b WB |
409 | void MDSMap::get_health_checks(health_check_map_t *checks) const |
410 | { | |
411 | // FS_WITH_FAILED_MDS | |
412 | // MDS_FAILED | |
413 | if (!failed.empty()) { | |
414 | health_check_t& fscheck = checks->add( | |
415 | "FS_WITH_FAILED_MDS", HEALTH_WARN, | |
416 | "%num% filesystem%plurals% %isorare% have a failed mds daemon"); | |
417 | ostringstream ss; | |
418 | ss << "fs " << fs_name << " has " << failed.size() << " failed mds" | |
419 | << (failed.size() > 1 ? "s" : ""); | |
420 | fscheck.detail.push_back(ss.str()); | |
421 | ||
422 | health_check_t& check = checks->add("MDS_FAILED", HEALTH_ERR, | |
423 | "%num% mds daemon%plurals% down"); | |
424 | for (auto p : failed) { | |
425 | std::ostringstream oss; | |
426 | oss << "fs " << fs_name << " mds." << p << " has failed"; | |
427 | check.detail.push_back(oss.str()); | |
428 | } | |
429 | } | |
430 | ||
431 | // MDS_DAMAGED | |
432 | if (!damaged.empty()) { | |
433 | health_check_t& check = checks->add("MDS_DAMAGED", HEALTH_ERR, | |
434 | "%num% mds daemon%plurals% damaged"); | |
435 | for (auto p : damaged) { | |
436 | std::ostringstream oss; | |
437 | oss << "fs " << fs_name << " mds." << p << " is damaged"; | |
438 | check.detail.push_back(oss.str()); | |
439 | } | |
440 | } | |
441 | ||
442 | // FS_DEGRADED | |
443 | // MDS_DEGRADED | |
444 | if (is_degraded()) { | |
445 | health_check_t& fscheck = checks->add( | |
446 | "FS_DEGRADED", HEALTH_WARN, | |
447 | "%num% filesystem%plurals% %isorare% degraded"); | |
448 | ostringstream ss; | |
449 | ss << "fs " << fs_name << " is degraded"; | |
450 | fscheck.detail.push_back(ss.str()); | |
451 | ||
452 | list<string> detail; | |
453 | for (mds_rank_t i = mds_rank_t(0); i< get_max_mds(); i++) { | |
454 | if (!is_up(i)) | |
455 | continue; | |
456 | mds_gid_t gid = up.find(i)->second; | |
457 | map<mds_gid_t,mds_info_t>::const_iterator info = mds_info.find(gid); | |
458 | stringstream ss; | |
459 | ss << "fs " << fs_name << " mds." << info->second.name << " at " | |
460 | << info->second.addr << " rank " << i; | |
461 | if (is_resolve(i)) | |
462 | ss << " is resolving"; | |
463 | if (is_replay(i)) | |
464 | ss << " is replaying journal"; | |
465 | if (is_rejoin(i)) | |
466 | ss << " is rejoining"; | |
467 | if (is_reconnect(i)) | |
468 | ss << " is reconnecting to clients"; | |
469 | if (ss.str().length()) | |
470 | detail.push_back(ss.str()); | |
471 | } | |
472 | if (!detail.empty()) { | |
473 | health_check_t& check = checks->add( | |
474 | "MDS_DEGRADED", HEALTH_WARN, | |
475 | "%num% mds daemon%plurals% %isorare% degraded"); | |
476 | check.detail.insert(check.detail.end(), detail.begin(), detail.end()); | |
477 | } | |
478 | } | |
479 | } | |
480 | ||
7c673cae FG |
/**
 * Encode this daemon record into `bl` inside an ENCODE_START/ENCODE_FINISH
 * envelope (arguments 7 and 4).
 *
 * The field order is the wire format and must stay in step with
 * mds_info_t::decode(), which reads mds_features for struct_v >= 5,
 * standby_for_fscid for struct_v >= 6 and standby_replay for struct_v >= 7.
 *
 * @param bl       buffer to append to
 * @param features feature bits, forwarded to the address encoder
 */
void MDSMap::mds_info_t::encode_versioned(bufferlist& bl, uint64_t features) const
{
  ENCODE_START(7, 4, bl);
  ::encode(global_id, bl);
  ::encode(name, bl);
  ::encode(rank, bl);
  ::encode(inc, bl);
  // state is written as a 32-bit integer
  ::encode((int32_t)state, bl);
  ::encode(state_seq, bl);
  ::encode(addr, bl, features);
  ::encode(laggy_since, bl);
  ::encode(standby_for_rank, bl);
  ::encode(standby_for_name, bl);
  ::encode(export_targets, bl);
  ::encode(mds_features, bl);
  ::encode(standby_for_fscid, bl);
  ::encode(standby_replay, bl);
  ENCODE_FINISH(bl);
}
500 | ||
/**
 * Encode this daemon record into `bl` in the older layout: a single
 * struct_v byte (3) followed by the fields, with no ENCODE_START envelope.
 *
 * Only the fields up to export_targets are written; mds_features,
 * standby_for_fscid and standby_replay are not part of this layout.
 * The address is encoded with a feature mask of 0.
 *
 * @param bl buffer to append to
 */
void MDSMap::mds_info_t::encode_unversioned(bufferlist& bl) const
{
  __u8 struct_v = 3;
  ::encode(struct_v, bl);
  ::encode(global_id, bl);
  ::encode(name, bl);
  ::encode(rank, bl);
  ::encode(inc, bl);
  // state is written as a 32-bit integer
  ::encode((int32_t)state, bl);
  ::encode(state_seq, bl);
  ::encode(addr, bl, 0);
  ::encode(laggy_since, bl);
  ::encode(standby_for_rank, bl);
  ::encode(standby_for_name, bl);
  ::encode(export_targets, bl);
}
517 | ||
/**
 * Decode a daemon record from `bl`.
 *
 * Fields added in later struct versions are read only when struct_v is new
 * enough (export_targets >= 2, mds_features >= 5, standby_for_fscid >= 6,
 * standby_replay >= 7); otherwise the member keeps whatever value it had
 * before the call.
 *
 * @param bl iterator positioned at the start of an encoded record
 */
void MDSMap::mds_info_t::decode(bufferlist::iterator& bl)
{
  DECODE_START_LEGACY_COMPAT_LEN(7, 4, 4, bl);
  ::decode(global_id, bl);
  ::decode(name, bl);
  ::decode(rank, bl);
  ::decode(inc, bl);
  // state travels as a 32-bit integer; decode straight into the member
  ::decode((int32_t&)(state), bl);
  ::decode(state_seq, bl);
  ::decode(addr, bl);
  ::decode(laggy_since, bl);
  ::decode(standby_for_rank, bl);
  ::decode(standby_for_name, bl);
  if (struct_v >= 2)
    ::decode(export_targets, bl);
  if (struct_v >= 5)
    ::decode(mds_features, bl);
  if (struct_v >= 6) {
    ::decode(standby_for_fscid, bl);
  }
  if (struct_v >= 7) {
    ::decode(standby_replay, bl);
  }
  DECODE_FINISH(bl);
}
543 | ||
544 | ||
545 | ||
/**
 * Encode the map into `bl`, choosing the wire layout from `features`:
 *
 *  - CEPH_FEATURE_PGID64 absent: layout v2 (16-bit version prefix), pool ids
 *    narrowed to 32 bits, nothing after cas_pool.
 *  - CEPH_FEATURE_MDSENC absent: layout v3 (16-bit version prefix) with an
 *    extended section at ev = 5.
 *  - otherwise: ENCODE_START(5, 4) envelope with an extended section at
 *    ev = 12.
 *
 * The order of the ::encode calls is the wire format and must stay in step
 * with MDSMap::decode().
 *
 * @param bl       buffer to append to
 * @param features feature bits selecting the layout; also forwarded to the
 *                 per-daemon encoder
 */
void MDSMap::encode(bufferlist& bl, uint64_t features) const
{
  std::map<mds_rank_t,int32_t> inc;  // Legacy field, fake it so that
                                     // old-mon peers have something sane
                                     // during upgrade
  // every rank in `in` is given the current epoch as its value
  for (const auto rank : in) {
    inc.insert(std::make_pair(rank, epoch));
  }

  if ((features & CEPH_FEATURE_PGID64) == 0) {
    __u16 v = 2;
    ::encode(v, bl);
    ::encode(epoch, bl);
    ::encode(flags, bl);
    ::encode(last_failure, bl);
    ::encode(root, bl);
    ::encode(session_timeout, bl);
    ::encode(session_autoclose, bl);
    ::encode(max_file_size, bl);
    ::encode(max_mds, bl);
    // mds_info is written by hand: a 32-bit count, then gid/info pairs
    __u32 n = mds_info.size();
    ::encode(n, bl);
    for (map<mds_gid_t, mds_info_t>::const_iterator i = mds_info.begin();
	 i != mds_info.end(); ++i) {
      ::encode(i->first, bl);
      ::encode(i->second, bl, features);
    }
    // data pools: a 32-bit count, then each pool id narrowed to __u32
    n = data_pools.size();
    ::encode(n, bl);
    for (const auto p: data_pools) {
      n = p;
      ::encode(n, bl);
    }

    // cas_pool is narrowed to a signed 32-bit value in this layout
    int32_t m = cas_pool;
    ::encode(m, bl);
    return;
  } else if ((features & CEPH_FEATURE_MDSENC) == 0) {
    __u16 v = 3;
    ::encode(v, bl);
    ::encode(epoch, bl);
    ::encode(flags, bl);
    ::encode(last_failure, bl);
    ::encode(root, bl);
    ::encode(session_timeout, bl);
    ::encode(session_autoclose, bl);
    ::encode(max_file_size, bl);
    ::encode(max_mds, bl);
    // mds_info is written by hand: a 32-bit count, then gid/info pairs
    __u32 n = mds_info.size();
    ::encode(n, bl);
    for (map<mds_gid_t, mds_info_t>::const_iterator i = mds_info.begin();
	 i != mds_info.end(); ++i) {
      ::encode(i->first, bl);
      ::encode(i->second, bl, features);
    }
    ::encode(data_pools, bl);
    ::encode(cas_pool, bl);

    // kclient ignores everything from here
    __u16 ev = 5;
    ::encode(ev, bl);
    ::encode(compat, bl);
    ::encode(metadata_pool, bl);
    ::encode(created, bl);
    ::encode(modified, bl);
    ::encode(tableserver, bl);
    ::encode(in, bl);
    ::encode(inc, bl);
    ::encode(up, bl);
    ::encode(failed, bl);
    ::encode(stopped, bl);
    ::encode(last_failure_osd_epoch, bl);
    return;
  }

  // current layout
  ENCODE_START(5, 4, bl);
  ::encode(epoch, bl);
  ::encode(flags, bl);
  ::encode(last_failure, bl);
  ::encode(root, bl);
  ::encode(session_timeout, bl);
  ::encode(session_autoclose, bl);
  ::encode(max_file_size, bl);
  ::encode(max_mds, bl);
  ::encode(mds_info, bl, features);
  ::encode(data_pools, bl);
  ::encode(cas_pool, bl);

  // kclient ignores everything from here
  // ev gates the optional fields read back in MDSMap::decode()
  __u16 ev = 12;
  ::encode(ev, bl);
  ::encode(compat, bl);
  ::encode(metadata_pool, bl);
  ::encode(created, bl);
  ::encode(modified, bl);
  ::encode(tableserver, bl);
  ::encode(in, bl);
  ::encode(inc, bl);
  ::encode(up, bl);
  ::encode(failed, bl);
  ::encode(stopped, bl);
  ::encode(last_failure_osd_epoch, bl);
  ::encode(ever_allowed_features, bl);
  ::encode(explicitly_allowed_features, bl);
  ::encode(inline_data_enabled, bl);
  ::encode(enabled, bl);
  ::encode(fs_name, bl);
  ::encode(damaged, bl);
  ::encode(balancer, bl);
  ::encode(standby_count_wanted, bl);
  ENCODE_FINISH(bl);
}
658 | ||
/**
 * Decode a map from `p`, accepting every layout MDSMap::encode() can emit.
 *
 * struct_v selects how the pool fields are read; the extended version `ev`
 * (defaulting to 1 when struct_v < 2) gates each later field.  Fields that
 * are absent from an older encoding are either given a fallback value here
 * (compat, ever/explicitly_allowed_features, enabled) or left untouched.
 *
 * cached_up_features is reset to 0 before decoding.
 *
 * @param p iterator positioned at the start of an encoded map
 */
void MDSMap::decode(bufferlist::iterator& p)
{
  std::map<mds_rank_t,int32_t> inc;  // Legacy field, parse and drop

  cached_up_features = 0;
  DECODE_START_LEGACY_COMPAT_LEN_16(5, 4, 4, p);
  ::decode(epoch, p);
  ::decode(flags, p);
  ::decode(last_failure, p);
  ::decode(root, p);
  ::decode(session_timeout, p);
  ::decode(session_autoclose, p);
  ::decode(max_file_size, p);
  ::decode(max_mds, p);
  ::decode(mds_info, p);
  if (struct_v < 3) {
    // old layout: 32-bit count, 32-bit pool ids, signed 32-bit cas_pool;
    // note the ids are appended to whatever data_pools already holds
    __u32 n;
    ::decode(n, p);
    while (n--) {
      __u32 m;
      ::decode(m, p);
      data_pools.push_back(m);
    }
    __s32 s;
    ::decode(s, p);
    cas_pool = s;
  } else {
    ::decode(data_pools, p);
    ::decode(cas_pool, p);
  }

  // kclient ignores everything from here
  __u16 ev = 1;
  if (struct_v >= 2)
    ::decode(ev, p);
  if (ev >= 3)
    ::decode(compat, p);
  else
    compat = get_mdsmap_compat_set_base();
  if (ev < 5) {
    // metadata_pool was encoded as an unsigned 32-bit value before ev 5
    __u32 n;
    ::decode(n, p);
    metadata_pool = n;
  } else {
    ::decode(metadata_pool, p);
  }
  ::decode(created, p);
  ::decode(modified, p);
  ::decode(tableserver, p);
  ::decode(in, p);
  ::decode(inc, p);
  ::decode(up, p);
  ::decode(failed, p);
  ::decode(stopped, p);
  if (ev >= 4)
    ::decode(last_failure_osd_epoch, p);
  if (ev >= 6) {
    if (ev < 10) {
      // previously this was a bool about snaps, not a flag map
      bool flag;
      ::decode(flag, p);
      ever_allowed_features = flag ? CEPH_MDSMAP_ALLOW_SNAPS : 0;
      ever_allowed_features |= CEPH_MDSMAP_ALLOW_MULTIMDS|CEPH_MDSMAP_ALLOW_DIRFRAGS;
      ::decode(flag, p);
      explicitly_allowed_features = flag ? CEPH_MDSMAP_ALLOW_SNAPS : 0;
      if (max_mds > 1) {
	set_multimds_allowed();
      }
    } else {
      ::decode(ever_allowed_features, p);
      ::decode(explicitly_allowed_features, p);
    }
  } else {
    // encodings older than ev 6 carry no feature flags at all
    ever_allowed_features = CEPH_MDSMAP_ALLOW_CLASSICS;
    explicitly_allowed_features = 0;
    if (max_mds > 1) {
      set_multimds_allowed();
    }
  }
  if (ev >= 7)
    ::decode(inline_data_enabled, p);

  if (ev >= 8) {
    assert(struct_v >= 5);
    ::decode(enabled, p);
    ::decode(fs_name, p);
  } else {
    if (epoch > 1) {
      // If an MDS has ever been started, epoch will be greater than 1,
      // assume filesystem is enabled.
      enabled = true;
    } else {
      // Upgrading from a cluster that never used an MDS, switch off
      // filesystem until it's explicitly enabled.
      enabled = false;
    }
  }

  if (ev >= 9) {
    ::decode(damaged, p);
  }

  if (ev >= 11) {
    ::decode(balancer, p);
  }

  if (ev >= 12) {
    ::decode(standby_count_wanted, p);
  }

  DECODE_FINISH(p);
}
771 | ||
772 | MDSMap::availability_t MDSMap::is_cluster_available() const | |
773 | { | |
774 | if (epoch == 0) { | |
775 | // If I'm a client, this means I'm looking at an MDSMap instance | |
776 | // that was never actually initialized from the mons. Client should | |
777 | // wait. | |
778 | return TRANSIENT_UNAVAILABLE; | |
779 | } | |
780 | ||
781 | // If a rank is marked damage (unavailable until operator intervenes) | |
782 | if (damaged.size()) { | |
783 | return STUCK_UNAVAILABLE; | |
784 | } | |
785 | ||
786 | // If no ranks are created (filesystem not initialized) | |
787 | if (in.empty()) { | |
788 | return STUCK_UNAVAILABLE; | |
789 | } | |
790 | ||
791 | for (const auto rank : in) { | |
792 | if (up.count(rank) && mds_info.at(up.at(rank)).laggy()) { | |
793 | // This might only be transient, but because we can't see | |
794 | // standbys, we have no way of knowing whether there is a | |
795 | // standby available to replace the laggy guy. | |
796 | return STUCK_UNAVAILABLE; | |
797 | } | |
798 | } | |
799 | ||
800 | if (get_num_mds(CEPH_MDS_STATE_ACTIVE) > 0) { | |
801 | // Nobody looks stuck, so indicate to client they should go ahead | |
802 | // and try mounting if anybody is active. This may include e.g. | |
803 | // one MDS failing over and another active: the client should | |
804 | // proceed to start talking to the active one and let the | |
805 | // transiently-unavailable guy catch up later. | |
806 | return AVAILABLE; | |
807 | } else { | |
808 | // Nothing indicating we were stuck, but nobody active (yet) | |
809 | //return TRANSIENT_UNAVAILABLE; | |
810 | ||
811 | // Because we don't have standbys in the MDSMap any more, we can't | |
812 | // reliably indicate transient vs. stuck, so always say stuck so | |
813 | // that the client doesn't block. | |
814 | return STUCK_UNAVAILABLE; | |
815 | } | |
816 | } | |
817 | ||
818 | bool MDSMap::state_transition_valid(DaemonState prev, DaemonState next) | |
819 | { | |
820 | bool state_valid = true; | |
821 | if (next != prev) { | |
822 | if (prev == MDSMap::STATE_REPLAY) { | |
823 | if (next != MDSMap::STATE_RESOLVE && next != MDSMap::STATE_RECONNECT) { | |
824 | state_valid = false; | |
825 | } | |
826 | } else if (prev == MDSMap::STATE_REJOIN) { | |
827 | if (next != MDSMap::STATE_ACTIVE | |
828 | && next != MDSMap::STATE_CLIENTREPLAY | |
829 | && next != MDSMap::STATE_STOPPED) { | |
830 | state_valid = false; | |
831 | } | |
832 | } else if (prev >= MDSMap::STATE_RECONNECT && prev < MDSMap::STATE_ACTIVE) { | |
833 | // Once I have entered replay, the only allowable transitions are to | |
834 | // the next next along in the sequence. | |
835 | if (next != prev + 1) { | |
836 | state_valid = false; | |
837 | } | |
838 | } | |
839 | } | |
840 | ||
841 | return state_valid; | |
842 | } | |
843 | ||
844 | bool MDSMap::check_health(mds_rank_t standby_daemon_count) | |
845 | { | |
846 | std::set<mds_rank_t> standbys; | |
847 | get_standby_replay_mds_set(standbys); | |
848 | std::set<mds_rank_t> actives; | |
849 | get_active_mds_set(actives); | |
850 | mds_rank_t standbys_avail = (mds_rank_t)standbys.size()+standby_daemon_count; | |
851 | ||
852 | /* If there are standby daemons available/replaying and | |
853 | * standby_count_wanted is unset (default), then we set it to 1. This will | |
854 | * happen during health checks by the mons. Also, during initial creation | |
855 | * of the FS we will have no actives so we don't want to change the default | |
856 | * yet. | |
857 | */ | |
858 | if (standby_count_wanted == -1 && actives.size() > 0 && standbys_avail > 0) { | |
859 | set_standby_count_wanted(1); | |
860 | return true; | |
861 | } | |
862 | return false; | |
863 | } |