]> git.proxmox.com Git - ceph.git/blob - ceph/src/tools/ceph_monstore_tool.cc
import quincy beta 17.1.0
[ceph.git] / ceph / src / tools / ceph_monstore_tool.cc
1 // -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
2 // vim: ts=8 sw=2 smarttab
3 /*
4 * Ceph - scalable distributed file system
5 *
6 * Copyright (C) 2012 Inktank, Inc.
7 *
8 * This is free software; you can redistribute it and/or
9 * modify it under the terms of the GNU Lesser General Public
10 * License version 2.1, as published by the Free Software
11 * Foundation. See file COPYING.
12 */
13 #include <boost/program_options/variables_map.hpp>
14 #include <boost/program_options/parsers.hpp>
15 #include <boost/scope_exit.hpp>
16
17 #include <stdlib.h>
18 #include <string>
19
20 #include "common/Formatter.h"
21 #include "common/errno.h"
22
23 #include "auth/KeyRing.h"
24 #include "auth/cephx/CephxKeyServer.h"
25 #include "global/global_init.h"
26 #include "include/scope_guard.h"
27 #include "include/stringify.h"
28 #include "mgr/mgr_commands.h"
29 #include "mon/AuthMonitor.h"
30 #include "mon/MonitorDBStore.h"
31 #include "mon/Paxos.h"
32 #include "mon/MonMap.h"
33 #include "mds/FSMap.h"
34 #include "mon/MgrMap.h"
35 #include "osd/OSDMap.h"
36 #include "crush/CrushCompiler.h"
37 #include "mon/CreatingPGs.h"
38
39 namespace po = boost::program_options;
40
41 using namespace std;
42
43 class TraceIter {
44 int fd;
45 unsigned idx;
46 MonitorDBStore::TransactionRef t;
47 public:
48 explicit TraceIter(string fname) : fd(-1), idx(-1) {
49 fd = ::open(fname.c_str(), O_RDONLY|O_BINARY);
50 t.reset(new MonitorDBStore::Transaction);
51 }
52 bool valid() {
53 return fd != -1;
54 }
55 MonitorDBStore::TransactionRef cur() {
56 ceph_assert(valid());
57 return t;
58 }
59 unsigned num() { return idx; }
60 void next() {
61 ++idx;
62 bufferlist bl;
63 int r = bl.read_fd(fd, 6);
64 if (r < 0) {
65 std::cerr << "Got error: " << cpp_strerror(r) << " on read_fd"
66 << std::endl;
67 ::close(fd);
68 fd = -1;
69 return;
70 } else if ((unsigned)r < 6) {
71 std::cerr << "short read" << std::endl;
72 ::close(fd);
73 fd = -1;
74 return;
75 }
76 auto bliter = bl.cbegin();
77 uint8_t ver, ver2;
78 decode(ver, bliter);
79 decode(ver2, bliter);
80 uint32_t len;
81 decode(len, bliter);
82 r = bl.read_fd(fd, len);
83 if (r < 0) {
84 std::cerr << "Got error: " << cpp_strerror(r) << " on read_fd"
85 << std::endl;
86 ::close(fd);
87 fd = -1;
88 return;
89 } else if ((unsigned)r < len) {
90 std::cerr << "short read" << std::endl;
91 ::close(fd);
92 fd = -1;
93 return;
94 }
95 bliter = bl.cbegin();
96 t.reset(new MonitorDBStore::Transaction);
97 t->decode(bliter);
98 }
99 void init() {
100 next();
101 }
102 ~TraceIter() {
103 if (fd != -1) {
104 ::close(fd);
105 fd = -1;
106 }
107 }
108 };
109
110
111 int parse_cmd_args(
112 po::options_description *desc, /// < visible options description
113 po::options_description *hidden_desc, /// < hidden options description
114 po::positional_options_description *positional, /// < positional args
115 vector<string> &cmd_args, /// < arguments to be parsed
116 po::variables_map *vm /// > post-parsing variable map
117 )
118 {
119 // desc_all will aggregate all visible and hidden options for parsing.
120 //
121 // From boost's program_options point of view, there is absolutely no
122 // distinction between 'desc' and 'hidden_desc'. This is a distinction
123 // that is only useful to us: 'desc' is whatever we are willing to show
124 // on 'usage()', whereas 'hidden_desc' refers to parameters we wish to
125 // take advantage of but do not wish to show on 'usage()'.
126 //
127 // For example, consider that program_options matches positional arguments
128 // (specified via 'positional') against the paramenters defined on a
129 // given 'po::options_description' class. This is performed below,
130 // supplying both the description and the positional arguments to the
131 // parser. However, we do not want the parameters that are mapped to
132 // positional arguments to be shown on usage, as that makes for ugly and
133 // confusing usage messages. Therefore we dissociate the options'
134 // description that is to be used as an aid to the user from those options
135 // that are nothing but useful for internal purposes (i.e., mapping options
136 // to positional arguments). We still need to aggregate them before parsing
137 // and that's what 'desc_all' is all about.
138 //
139
140 ceph_assert(desc != NULL);
141
142 po::options_description desc_all;
143 desc_all.add(*desc);
144 if (hidden_desc != NULL)
145 desc_all.add(*hidden_desc);
146
147 try {
148 po::command_line_parser parser = po::command_line_parser(cmd_args).
149 options(desc_all);
150
151 if (positional) {
152 parser = parser.positional(*positional);
153 }
154
155 po::parsed_options parsed = parser.run();
156 po::store(parsed, *vm);
157 po::notify(*vm);
158 } catch (po::error &e) {
159 std::cerr << "error: " << e.what() << std::endl;
160 return -EINVAL;
161 }
162 return 0;
163 }
164
165
166 /**
167 * usage: ceph-monstore-tool <store-path> <command> [options]
168 *
169 * commands:
170 *
171 * store-copy < --out arg >
172 * dump-keys
173 * compact
174 * getmonmap < --out arg [ --version arg ] >
175 * getosdmap < --out arg [ --version arg ] >
176 * dump-paxos <--dump-start VER> <--dump-end VER>
177 * dump-trace < --trace-file arg >
178 * replay-trace
179 * random-gen
180 * rewrite-crush
181 *
182 * wanted syntax:
183 *
184 * ceph-monstore-tool PATH CMD [options]
185 *
186 * ceph-monstore-tool PATH store-copy <PATH2 | -o PATH2>
187 * ceph-monstore-tool PATH dump-keys
188 * ceph-monstore-tool PATH compact
189 * ceph-monstore-tool PATH get monmap [VER]
190 * ceph-monstore-tool PATH get osdmap [VER]
191 * ceph-monstore-tool PATH dump-paxos STARTVER ENDVER
192 *
193 *
194 */
195 void usage(const char *n, po::options_description &d)
196 {
197 std::cerr <<
198 "usage: " << n << " <store-path> <cmd> [args|options]\n"
199 << "\n"
200 << "Commands:\n"
201 << " store-copy PATH copies store to PATH\n"
202 << " compact compacts the store\n"
203 << " get monmap [-- options] get monmap (version VER if specified)\n"
204 << " (default: last committed)\n"
205 << " get osdmap [-- options] get osdmap (version VER if specified)\n"
206 << " (default: last committed)\n"
207 << " get mdsmap [-- options] get mdsmap (version VER if specified)\n"
208 << " (default: last committed)\n"
209 << " get mgr [-- options] get mgr map (version VER if specified)\n"
210 << " (default: last committed)\n"
211 << " get crushmap [-- options] get crushmap (version VER if specified)\n"
212 << " (default: last committed)\n"
213 << " show-versions [-- options] show the first&last committed version of map\n"
214 << " (show-versions -- --help for more info)\n"
215 << " dump-keys dumps store keys to FILE\n"
216 << " (default: stdout)\n"
217 << " dump-paxos [-- options] dump paxos transactions\n"
218 << " (dump-paxos -- --help for more info)\n"
219 << " dump-trace FILE [-- options] dump contents of trace file FILE\n"
220 << " (dump-trace -- --help for more info)\n"
221 << " replay-trace FILE [-- options] replay trace from FILE\n"
222 << " (replay-trace -- --help for more info)\n"
223 << " random-gen [-- options] add randomly generated ops to the store\n"
224 << " (random-gen -- --help for more info)\n"
225 << " rewrite-crush [-- options] add a rewrite commit to the store\n"
226 << " (rewrite-crush -- --help for more info)\n"
227 << " rebuild rebuild store\n"
228 << " (rebuild -- --help for more info)\n"
229 << std::endl;
230 std::cerr << d << std::endl;
231 std::cerr
232 << "\nPlease Note:\n"
233 << "* Ceph-specific options should be in the format --option-name=VAL\n"
234 << " (specifically, do not forget the '='!!)\n"
235 << "* Command-specific options need to be passed after a '--'\n"
236 << " e.g., 'get monmap -- --version 10 --out /tmp/foo'"
237 << std::endl;
238 }
239
240 int update_osdmap(MonitorDBStore& store, version_t ver, bool copy,
241 std::shared_ptr<CrushWrapper> crush,
242 MonitorDBStore::Transaction* t) {
243 const string prefix("osdmap");
244
245 // full
246 bufferlist bl;
247 int r = 0;
248 r = store.get(prefix, store.combine_strings("full", ver), bl);
249 if (r) {
250 std::cerr << "Error getting full map: " << cpp_strerror(r) << std::endl;
251 return r;
252 }
253 OSDMap osdmap;
254 osdmap.decode(bl);
255 osdmap.crush = crush;
256 if (copy) {
257 osdmap.inc_epoch();
258 }
259 bl.clear();
260 // be consistent with OSDMonitor::update_from_paxos()
261 osdmap.encode(bl, CEPH_FEATURES_ALL|CEPH_FEATURE_RESERVED);
262 t->put(prefix, store.combine_strings("full", osdmap.get_epoch()), bl);
263
264 // incremental
265 OSDMap::Incremental inc;
266 if (copy) {
267 inc.epoch = osdmap.get_epoch();
268 inc.fsid = osdmap.get_fsid();
269 } else {
270 bl.clear();
271 r = store.get(prefix, ver, bl);
272 if (r) {
273 std::cerr << "Error getting inc map: " << cpp_strerror(r) << std::endl;
274 return r;
275 }
276 OSDMap::Incremental inc(bl);
277 if (inc.crush.length()) {
278 inc.crush.clear();
279 crush->encode(inc.crush, CEPH_FEATURES_SUPPORTED_DEFAULT);
280 }
281 if (inc.fullmap.length()) {
282 OSDMap fullmap;
283 fullmap.decode(inc.fullmap);
284 fullmap.crush = crush;
285 inc.fullmap.clear();
286 fullmap.encode(inc.fullmap);
287 }
288 }
289 ceph_assert(osdmap.have_crc());
290 inc.full_crc = osdmap.get_crc();
291 bl.clear();
292 // be consistent with OSDMonitor::update_from_paxos()
293 inc.encode(bl, CEPH_FEATURES_ALL|CEPH_FEATURE_RESERVED);
294 t->put(prefix, inc.epoch, bl);
295 return 0;
296 }
297
298 int rewrite_transaction(MonitorDBStore& store, int version,
299 const string& crush_file,
300 MonitorDBStore::Transaction* t) {
301 const string prefix("osdmap");
302
303 // calc the known-good epoch
304 version_t last_committed = store.get(prefix, "last_committed");
305 version_t good_version = 0;
306 if (version <= 0) {
307 if (last_committed >= (unsigned)-version) {
308 good_version = last_committed + version;
309 } else {
310 std::cerr << "osdmap-version is less than: -" << last_committed << std::endl;
311 return EINVAL;
312 }
313 } else {
314 good_version = version;
315 }
316 if (good_version >= last_committed) {
317 std::cout << "good epoch is greater or equal to the last committed one: "
318 << good_version << " >= " << last_committed << std::endl;
319 return 0;
320 }
321
322 // load/extract the crush map
323 int r = 0;
324 std::shared_ptr<CrushWrapper> crush(new CrushWrapper);
325 if (crush_file.empty()) {
326 bufferlist bl;
327 r = store.get(prefix, store.combine_strings("full", good_version), bl);
328 if (r) {
329 std::cerr << "Error getting map: " << cpp_strerror(r) << std::endl;
330 return r;
331 }
332 OSDMap osdmap;
333 osdmap.decode(bl);
334 crush = osdmap.crush;
335 } else {
336 string err;
337 bufferlist bl;
338 r = bl.read_file(crush_file.c_str(), &err);
339 if (r) {
340 std::cerr << err << ": " << cpp_strerror(r) << std::endl;
341 return r;
342 }
343 auto p = bl.cbegin();
344 crush->decode(p);
345 }
346
347 // prepare a transaction to rewrite the epochs
348 // (good_version, last_committed]
349 // with the good crush map.
350 // XXX: may need to break this into several paxos versions?
351 ceph_assert(good_version < last_committed);
352 for (version_t v = good_version + 1; v <= last_committed; v++) {
353 cout << "rewriting epoch #" << v << "/" << last_committed << std::endl;
354 r = update_osdmap(store, v, false, crush, t);
355 if (r)
356 return r;
357 }
358
359 // add a new osdmap epoch to store, so monitors will update their current osdmap
360 // in addition to the ones stored in epochs.
361 //
362 // This is needed due to the way the monitor updates from paxos and the
363 // facilities we are leveraging to push this update to the rest of the
364 // quorum.
365 //
366 // In a nutshell, we are generating a good version of the osdmap, with a
367 // proper crush, and building a transaction that will replace the bad
368 // osdmaps with good osdmaps. But this transaction needs to be applied on
369 // all nodes, so that the monitors will have good osdmaps to share with
370 // clients. We thus leverage Paxos, specifically the recovery mechanism, by
371 // creating a pending value that will be committed once the monitors form an
372 // initial quorum after being brought back to life.
373 //
374 // However, the way the monitor works has the paxos services, including the
375 // OSDMonitor, updating their state from disk *prior* to the recovery phase
376 // begins (so they have an up to date state in memory). This means the
377 // OSDMonitor will see the old, broken map, before the new paxos version is
378 // applied to disk, and the old version is cached. Even though we have the
379 // good map now, and we share the good map with clients, we will still be
380 // working on the old broken map. Instead of mucking around the monitor to
381 // make this work, we instead opt for adding the same osdmap but with a
382 // newer version, so that the OSDMonitor picks up on it when it updates from
383 // paxos after the proposal has been committed. This is not elegant, but
384 // avoids further unpleasantness that would arise from kludging around the
385 // current behavior. Also, has the added benefit of making sure the clients
386 // get an updated version of the map (because last_committed+1 >
387 // last_committed) :)
388 //
389 cout << "adding a new epoch #" << last_committed+1 << std::endl;
390 r = update_osdmap(store, last_committed++, true, crush, t);
391 if (r)
392 return r;
393 t->put(prefix, store.combine_strings("full", "latest"), last_committed);
394 t->put(prefix, "last_committed", last_committed);
395 return 0;
396 }
397
398 /**
399 * create a new paxos version which carries a proposal to rewrite all epochs
400 * of incremental and full map of "osdmap" after a faulty crush map is injected.
401 * so the leader will trigger a recovery and propagate this fix to its peons,
402 * after the proposal is accepted, and the transaction in it is applied. all
403 * monitors will rewrite the bad crush map with the good one, and have a new
404 * osdmap epoch with the good crush map in it.
405 */
406 int rewrite_crush(const char* progname,
407 vector<string>& subcmds,
408 MonitorDBStore& store) {
409 po::options_description op_desc("Allowed 'rewrite-crush' options");
410 int version = -1;
411 string crush_file;
412 op_desc.add_options()
413 ("help,h", "produce this help message")
414 ("crush", po::value<string>(&crush_file),
415 ("path to the crush map file "
416 "(default: will instead extract it from the known-good osdmap)"))
417 ("good-epoch", po::value<int>(&version),
418 "known-good epoch of osdmap, if a negative number '-N' is given, the "
419 "$last_committed-N is used instead (default: -1). "
420 "Please note, -1 is not necessarily a good epoch, because there are "
421 "good chance that we have more epochs slipped into the monstore after "
422 "the one where the crushmap is firstly injected.")
423 ;
424 po::variables_map op_vm;
425 int r = parse_cmd_args(&op_desc, NULL, NULL, subcmds, &op_vm);
426 if (r) {
427 return -r;
428 }
429 if (op_vm.count("help")) {
430 usage(progname, op_desc);
431 return 0;
432 }
433
434 MonitorDBStore::Transaction rewrite_txn;
435 r = rewrite_transaction(store, version, crush_file, &rewrite_txn);
436 if (r) {
437 return r;
438 }
439
440 // store the transaction into store as a proposal
441 const string prefix("paxos");
442 version_t pending_v = store.get(prefix, "last_committed") + 1;
443 auto t(std::make_shared<MonitorDBStore::Transaction>());
444 bufferlist bl;
445 rewrite_txn.encode(bl);
446 cout << "adding pending commit " << pending_v
447 << " " << bl.length() << " bytes" << std::endl;
448 t->put(prefix, pending_v, bl);
449 t->put(prefix, "pending_v", pending_v);
450 // a large enough yet unique proposal number will probably do the trick
451 version_t pending_pn = (store.get(prefix, "accepted_pn") / 100 + 4) * 100 + 1;
452 t->put(prefix, "pending_pn", pending_pn);
453 store.apply_transaction(t);
454 return 0;
455 }
456
457 static int update_auth(MonitorDBStore& st, const string& keyring_path)
458 {
459 // import all keyrings stored in the keyring file
460 KeyRing keyring;
461 int r = keyring.load(g_ceph_context, keyring_path);
462 if (r < 0) {
463 cerr << "unable to load admin keyring: " << keyring_path << std::endl;
464 return r;
465 }
466
467 bufferlist bl;
468 __u8 v = 1;
469 encode(v, bl);
470
471 for (const auto& k : keyring.get_keys()) {
472 KeyServerData::Incremental auth_inc;
473 auth_inc.name = k.first;
474 auth_inc.auth = k.second;
475 if (auth_inc.auth.caps.empty()) {
476 cerr << "no caps granted to: " << auth_inc.name << std::endl;
477 return -EINVAL;
478 }
479 map<string,string> caps;
480 std::transform(begin(auth_inc.auth.caps), end(auth_inc.auth.caps),
481 inserter(caps, end(caps)),
482 [](auto& cap) {
483 string c;
484 auto p = cap.second.cbegin();
485 decode(c, p);
486 return make_pair(cap.first, c);
487 });
488 cout << "adding auth for '"
489 << auth_inc.name << "': " << auth_inc.auth
490 << " with caps(" << caps << ")" << std::endl;
491 auth_inc.op = KeyServerData::AUTH_INC_ADD;
492
493 AuthMonitor::Incremental inc;
494 inc.inc_type = AuthMonitor::AUTH_DATA;
495 encode(auth_inc, inc.auth_data);
496 inc.auth_type = CEPH_AUTH_CEPHX;
497 inc.encode(bl, CEPH_FEATURES_ALL);
498 }
499
500 // prime rotating secrets
501 {
502 KeyServer ks(g_ceph_context, nullptr);
503 KeyServerData::Incremental auth_inc;
504 auth_inc.op = KeyServerData::AUTH_INC_SET_ROTATING;
505 bool r = ks.prepare_rotating_update(auth_inc.rotating_bl);
506 ceph_assert(r);
507 AuthMonitor::Incremental inc;
508 inc.inc_type = AuthMonitor::AUTH_DATA;
509 encode(auth_inc, inc.auth_data);
510 inc.auth_type = CEPH_AUTH_CEPHX;
511 inc.encode(bl, CEPH_FEATURES_ALL);
512 }
513
514 const string prefix("auth");
515 auto last_committed = st.get(prefix, "last_committed") + 1;
516 auto t = make_shared<MonitorDBStore::Transaction>();
517 t->put(prefix, last_committed, bl);
518 t->put(prefix, "last_committed", last_committed);
519 auto first_committed = st.get(prefix, "first_committed");
520 if (!first_committed) {
521 t->put(prefix, "first_committed", last_committed);
522 }
523 st.apply_transaction(t);
524 return 0;
525 }
526
527 static int update_mkfs(MonitorDBStore& st,
528 const string& monmap_path,
529 const vector<string>& mon_ids)
530 {
531 MonMap monmap;
532 if (!monmap_path.empty()) {
533 cout << __func__ << " pulling initial monmap from " << monmap_path << std::endl;
534 bufferlist bl;
535 string err;
536 int r = bl.read_file(monmap_path.c_str(), &err);
537 if (r < 0) {
538 cerr << "failed to read monmap from " << monmap_path << ": "
539 << cpp_strerror(r) << std::endl;
540 return r;
541 }
542 monmap.decode(bl);
543 } else {
544 cout << __func__ << " generating seed initial monmap" << std::endl;
545 int r = monmap.build_initial(g_ceph_context, true, cerr);
546 if (r) {
547 cerr << "no initial monitors" << std::endl;
548 return -EINVAL;
549 }
550 vector<string> new_names;
551 if (!mon_ids.empty()) {
552 if (mon_ids.size() != monmap.size()) {
553 cerr << "Please pass the same number of <mon-ids> to name the hosts "
554 << "listed in 'mon_host'. "
555 << mon_ids.size() << " mon-id(s) specified, "
556 << "while you have " << monmap.size() << " mon hosts." << std::endl;
557 return -EINVAL;
558 }
559 new_names = mon_ids;
560 } else {
561 for (unsigned rank = 0; rank < monmap.size(); rank++) {
562 string new_name{"a"};
563 new_name[0] += rank;
564 new_names.push_back(std::move(new_name));
565 }
566 }
567 for (unsigned rank = 0; rank < monmap.size(); rank++) {
568 auto name = monmap.get_name(rank);
569 if (name.compare(0, 7, "noname-") == 0) {
570 monmap.rename(name, new_names[rank]);
571 }
572 }
573 }
574 monmap.print(cout);
575 bufferlist bl;
576 monmap.encode(bl, CEPH_FEATURES_ALL);
577 monmap.set_epoch(0);
578 auto t = make_shared<MonitorDBStore::Transaction>();
579 t->put("mkfs", "monmap", bl);
580 st.apply_transaction(t);
581 return 0;
582 }
583
584 static int update_monitor(MonitorDBStore& st)
585 {
586 const string prefix("monitor");
587 // a stripped-down Monitor::mkfs()
588 bufferlist bl;
589 bl.append(CEPH_MON_ONDISK_MAGIC "\n");
590 auto t = make_shared<MonitorDBStore::Transaction>();
591 t->put(prefix, "magic", bl);
592 st.apply_transaction(t);
593 return 0;
594 }
595
596 // rebuild
597 // - creating_pgs
598 static int update_creating_pgs(MonitorDBStore& st)
599 {
600 bufferlist bl;
601 auto last_osdmap_epoch = st.get("osdmap", "last_committed");
602 int r = st.get("osdmap", st.combine_strings("full", last_osdmap_epoch), bl);
603 if (r < 0) {
604 cerr << "unable to load osdmap e" << last_osdmap_epoch << std::endl;
605 return r;
606 }
607
608 OSDMap osdmap;
609 osdmap.decode(bl);
610 creating_pgs_t creating;
611 for (auto& i : osdmap.get_pools()) {
612 creating.created_pools.insert(i.first);
613 }
614 creating.last_scan_epoch = last_osdmap_epoch;
615
616 bufferlist newbl;
617 encode(creating, newbl, CEPH_FEATURES_ALL);
618
619 auto t = make_shared<MonitorDBStore::Transaction>();
620 t->put("osd_pg_creating", "creating", newbl);
621 st.apply_transaction(t);
622 return 0;
623 }
624
625 // rebuild
626 // - mgr
627 // - mgr_command_desc
628 static int update_mgrmap(MonitorDBStore& st)
629 {
630 auto t = make_shared<MonitorDBStore::Transaction>();
631
632 {
633 MgrMap map;
634 // mgr expects epoch > 1
635 map.epoch++;
636 auto initial_modules =
637 get_str_vec(g_ceph_context->_conf.get_val<string>("mgr_initial_modules"));
638 copy(begin(initial_modules),
639 end(initial_modules),
640 inserter(map.modules, end(map.modules)));
641 bufferlist bl;
642 map.encode(bl, CEPH_FEATURES_ALL);
643 t->put("mgr", map.epoch, bl);
644 t->put("mgr", "last_committed", map.epoch);
645 }
646 {
647 auto mgr_command_descs = mgr_commands;
648 for (auto& c : mgr_command_descs) {
649 c.set_flag(MonCommand::FLAG_MGR);
650 }
651 bufferlist bl;
652 encode(mgr_command_descs, bl);
653 t->put("mgr_command_descs", "", bl);
654 }
655 return st.apply_transaction(t);
656 }
657
658 static int update_paxos(MonitorDBStore& st)
659 {
660 const string prefix("paxos");
661 // a large enough version greater than the maximum possible `last_committed`
662 // that could be replied by the peons when the leader is collecting paxos
663 // transactions during recovery
664 constexpr version_t first_committed = 0x42;
665 constexpr version_t last_committed = first_committed;
666 for (version_t v = first_committed; v < last_committed + 1; v++) {
667 auto t = make_shared<MonitorDBStore::Transaction>();
668 if (v == first_committed) {
669 t->put(prefix, "first_committed", v);
670 }
671 bufferlist proposal;
672 MonitorDBStore::Transaction empty_txn;
673 empty_txn.encode(proposal);
674 t->put(prefix, v, proposal);
675 t->put(prefix, "last_committed", v);
676 st.apply_transaction(t);
677 }
678 // build a pending paxos proposal from all non-permanent k/v pairs. once the
679 // proposal is committed, it will gets applied. on the sync provider side, it
680 // will be a no-op, but on its peers, the paxos commit will help to build up
681 // the necessary epochs.
682 bufferlist pending_proposal;
683 {
684 MonitorDBStore::Transaction t;
685 vector<string> prefixes = {"auth", "osdmap",
686 "mgr", "mgr_command_desc"};
687 for (const auto& prefix : prefixes) {
688 for (auto i = st.get_iterator(prefix); i->valid(); i->next()) {
689 auto key = i->raw_key();
690 auto val = i->value();
691 t.put(key.first, key.second, val);
692 }
693 }
694 t.encode(pending_proposal);
695 }
696 auto pending_v = last_committed + 1;
697 auto t = make_shared<MonitorDBStore::Transaction>();
698 t->put(prefix, pending_v, pending_proposal);
699 t->put(prefix, "pending_v", pending_v);
700 t->put(prefix, "pending_pn", 400);
701 st.apply_transaction(t);
702 return 0;
703 }
704
705 int rebuild_monstore(const char* progname,
706 vector<string>& subcmds,
707 MonitorDBStore& st)
708 {
709 po::options_description op_desc("Allowed 'rebuild' options");
710 string keyring_path;
711 string monmap_path;
712 vector<string> mon_ids;
713 op_desc.add_options()
714 ("keyring", po::value<string>(&keyring_path),
715 "path to the client.admin key")
716 ("monmap", po::value<string>(&monmap_path),
717 "path to the initial monmap")
718 ("mon-ids", po::value<vector<string>>(&mon_ids)->multitoken(),
719 "mon ids, use 'a', 'b', ... if not specified");
720 po::positional_options_description pos_desc;
721 pos_desc.add("mon-ids", -1);
722 po::variables_map op_vm;
723 int r = parse_cmd_args(&op_desc, nullptr, &pos_desc, subcmds, &op_vm);
724 if (r) {
725 return -r;
726 }
727 if (op_vm.count("help")) {
728 usage(progname, op_desc);
729 return 0;
730 }
731 if (!keyring_path.empty())
732 update_auth(st, keyring_path);
733 if ((r = update_creating_pgs(st))) {
734 return r;
735 }
736 if ((r = update_mgrmap(st))) {
737 return r;
738 }
739 if ((r = update_paxos(st))) {
740 return r;
741 }
742 if ((r = update_mkfs(st, monmap_path, mon_ids))) {
743 return r;
744 }
745 if ((r = update_monitor(st))) {
746 return r;
747 }
748 return 0;
749 }
750
751 int main(int argc, char **argv) {
752 int err = 0;
753 po::options_description desc("Allowed options");
754 string store_path, cmd;
755 vector<string> subcmds;
756 desc.add_options()
757 ("help,h", "produce help message")
758 ;
759
760 /* Dear Future Developer:
761 *
762 * for further improvement, should you need to pass specific options to
763 * a command (e.g., get osdmap VER --hex), you can expand the current
764 * format by creating additional 'po::option_description' and passing
765 * 'subcmds' to 'po::command_line_parser', much like what is currently
766 * done by default. However, beware: in order to differentiate a
767 * command-specific option from the generic/global options, you will need
768 * to pass '--' in the command line (so that the first parser, the one
769 * below, assumes it has reached the end of all options); e.g.,
770 * 'get osdmap VER -- --hex'. Not pretty; far from intuitive; it was as
771 * far as I got with this library. Improvements on this format will be
772 * left as an exercise for the reader. -Joao
773 */
774 po::options_description positional_desc("Positional argument options");
775 positional_desc.add_options()
776 ("store-path", po::value<string>(&store_path),
777 "path to monitor's store")
778 ("command", po::value<string>(&cmd),
779 "Command")
780 ("subcmd", po::value<vector<string> >(&subcmds),
781 "Command arguments/Sub-Commands")
782 ;
783 po::positional_options_description positional;
784 positional.add("store-path", 1);
785 positional.add("command", 1);
786 positional.add("subcmd", -1);
787
788 po::options_description all_desc("All options");
789 all_desc.add(desc).add(positional_desc);
790
791 vector<string> ceph_option_strings;
792 po::variables_map vm;
793 try {
794 po::parsed_options parsed =
795 po::command_line_parser(argc, argv).
796 options(all_desc).
797 positional(positional).
798 allow_unregistered().run();
799
800 po::store(
801 parsed,
802 vm);
803 po::notify(vm);
804
805 // Specifying po::include_positional would have our positional arguments
806 // being collected (thus being part of ceph_option_strings and eventually
807 // passed on to global_init() below).
808 // Instead we specify po::exclude_positional, which has the upside of
809 // completely avoid this, but the downside of having to specify ceph
810 // options as --VAR=VAL (note the '='); otherwise we will capture the
811 // positional 'VAL' as belonging to us, never being collected.
812 ceph_option_strings = po::collect_unrecognized(parsed.options,
813 po::exclude_positional);
814
815 } catch(po::error &e) {
816 std::cerr << "error: " << e.what() << std::endl;
817 return 1;
818 }
819
820 // parse command structure before calling global_init() and friends.
821
822 if (vm.empty() || vm.count("help") ||
823 store_path.empty() || cmd.empty() ||
824 *cmd.begin() == '-') {
825 usage(argv[0], desc);
826 return 1;
827 }
828
829 vector<const char *> ceph_options;
830 ceph_options.reserve(ceph_option_strings.size());
831 for (vector<string>::iterator i = ceph_option_strings.begin();
832 i != ceph_option_strings.end();
833 ++i) {
834 ceph_options.push_back(i->c_str());
835 }
836
837 auto cct = global_init(
838 NULL, ceph_options, CEPH_ENTITY_TYPE_MON,
839 CODE_ENVIRONMENT_UTILITY,
840 CINIT_FLAG_NO_MON_CONFIG);
841 common_init_finish(g_ceph_context);
842 cct->_conf.apply_changes(nullptr);
843
844 // this is where we'll write *whatever*, on a per-command basis.
845 // not all commands require some place to write their things.
846 MonitorDBStore st(store_path);
847 if (store_path.size()) {
848 stringstream ss;
849 int r = st.open(ss);
850 if (r < 0) {
851 std::cerr << ss.str() << std::endl;
852 return EINVAL;
853 }
854 }
855
856 auto close_store = make_scope_guard([&] {
857 st.close();
858 });
859
860 if (cmd == "dump-keys") {
861 KeyValueDB::WholeSpaceIterator iter = st.get_iterator();
862 while (iter->valid()) {
863 pair<string,string> key(iter->raw_key());
864 cout << key.first << " / " << key.second << std::endl;
865 iter->next();
866 }
867 } else if (cmd == "compact") {
868 st.compact();
869 } else if (cmd == "get") {
870 unsigned v = 0;
871 string outpath;
872 string map_type;
873 // visible options for this command
874 po::options_description op_desc("Allowed 'get' options");
875 op_desc.add_options()
876 ("help,h", "produce this help message")
877 ("out,o", po::value<string>(&outpath),
878 "output file (default: stdout)")
879 ("version,v", po::value<unsigned>(&v),
880 "map version to obtain")
881 ("readable,r", "print the map information in human readable format")
882 ;
883 // this is going to be a positional argument; we don't want to show
884 // it as an option during --help, but we do want to have it captured
885 // when parsing.
886 po::options_description hidden_op_desc("Hidden 'get' options");
887 hidden_op_desc.add_options()
888 ("map-type", po::value<string>(&map_type),
889 "map-type")
890 ;
891 po::positional_options_description op_positional;
892 op_positional.add("map-type", 1);
893
894 po::variables_map op_vm;
895 int r = parse_cmd_args(&op_desc, &hidden_op_desc, &op_positional,
896 subcmds, &op_vm);
897 if (r < 0) {
898 return -r;
899 }
900
901 if (op_vm.count("help") || map_type.empty()) {
902 usage(argv[0], op_desc);
903 return 0;
904 }
905
906 if (v == 0) {
907 if (map_type == "crushmap") {
908 v = st.get("osdmap", "last_committed");
909 } else {
910 v = st.get(map_type, "last_committed");
911 }
912 }
913
914 int fd = STDOUT_FILENO;
915 if (!outpath.empty()){
916 fd = ::open(outpath.c_str(), O_WRONLY|O_CREAT|O_TRUNC|O_BINARY, 0666);
917 if (fd < 0) {
918 std::cerr << "error opening output file: "
919 << cpp_strerror(errno) << std::endl;
920 return EINVAL;
921 }
922 }
923
924 auto close_fd = make_scope_guard([&] {
925 ::close(fd);
926 if (r < 0 && fd != STDOUT_FILENO) {
927 ::remove(outpath.c_str());
928 }
929 });
930
931 bufferlist bl;
932 r = 0;
933 if (map_type == "osdmap") {
934 r = st.get(map_type, st.combine_strings("full", v), bl);
935 } else if (map_type == "crushmap") {
936 bufferlist tmp;
937 r = st.get("osdmap", st.combine_strings("full", v), tmp);
938 if (r >= 0) {
939 OSDMap osdmap;
940 osdmap.decode(tmp);
941 osdmap.crush->encode(bl, CEPH_FEATURES_SUPPORTED_DEFAULT);
942 }
943 } else {
944 r = st.get(map_type, v, bl);
945 }
946 if (r < 0) {
947 std::cerr << "Error getting map: " << cpp_strerror(r) << std::endl;
948 return EINVAL;
949 }
950
951 if (op_vm.count("readable")) {
952 stringstream ss;
953 bufferlist out;
954 try {
955 if (map_type == "monmap") {
956 MonMap monmap;
957 monmap.decode(bl);
958 monmap.print(ss);
959 } else if (map_type == "osdmap") {
960 OSDMap osdmap;
961 osdmap.decode(bl);
962 osdmap.print(ss);
963 } else if (map_type == "mdsmap") {
964 FSMap fs_map;
965 fs_map.decode(bl);
966 fs_map.print(ss);
967 } else if (map_type == "mgr") {
968 MgrMap mgr_map;
969 auto p = bl.cbegin();
970 mgr_map.decode(p);
971 JSONFormatter f;
972 f.dump_object("mgrmap", mgr_map);
973 f.flush(ss);
974 } else if (map_type == "crushmap") {
975 CrushWrapper cw;
976 auto it = bl.cbegin();
977 cw.decode(it);
978 CrushCompiler cc(cw, std::cerr, 0);
979 cc.decompile(ss);
980 } else {
981 std::cerr << "This type of readable map does not exist: " << map_type
982 << std::endl << "You can only specify[osdmap|monmap|mdsmap"
983 "|crushmap|mgr]" << std::endl;
984 }
985 } catch (const buffer::error &err) {
986 std::cerr << "Could not decode for human readable output (you may still"
987 " use non-readable mode). Detail: " << err.what() << std::endl;
988 }
989
990 out.append(ss);
991 out.write_fd(fd);
992 } else {
993 bl.write_fd(fd);
994 }
995
996 if (!outpath.empty()) {
997 std::cout << "wrote " << map_type
998 << " version " << v << " to " << outpath
999 << std::endl;
1000 }
1001 } else if (cmd == "show-versions") {
1002 string map_type; //map type:osdmap,monmap...
1003 // visible options for this command
1004 po::options_description op_desc("Allowed 'show-versions' options");
1005 op_desc.add_options()
1006 ("help,h", "produce this help message")
1007 ("map-type", po::value<string>(&map_type), "map_type");
1008
1009 po::positional_options_description op_positional;
1010 op_positional.add("map-type", 1);
1011
1012 po::variables_map op_vm;
1013 int r = parse_cmd_args(&op_desc, NULL, &op_positional,
1014 subcmds, &op_vm);
1015 if (r < 0) {
1016 return -r;
1017 }
1018
1019 if (op_vm.count("help") || map_type.empty()) {
1020 usage(argv[0], op_desc);
1021 return 0;
1022 }
1023
1024 unsigned int v_first = 0;
1025 unsigned int v_last = 0;
1026 v_first = st.get(map_type, "first_committed");
1027 v_last = st.get(map_type, "last_committed");
1028
1029 std::cout << "first committed:\t" << v_first << "\n"
1030 << "last committed:\t" << v_last << std::endl;
1031 } else if (cmd == "dump-paxos") {
1032 unsigned dstart = 0;
1033 unsigned dstop = ~0;
1034 po::options_description op_desc("Allowed 'dump-paxos' options");
1035 op_desc.add_options()
1036 ("help,h", "produce this help message")
1037 ("start,s", po::value<unsigned>(&dstart),
1038 "starting version (default: 0)")
1039 ("end,e", po::value<unsigned>(&dstop),
1040 "finish version (default: ~0)")
1041 ;
1042
1043 po::variables_map op_vm;
1044 int r = parse_cmd_args(&op_desc, NULL, NULL,
1045 subcmds, &op_vm);
1046 if (r < 0) {
1047 return -r;
1048 }
1049
1050 if (op_vm.count("help")) {
1051 usage(argv[0], op_desc);
1052 return 0;
1053 }
1054
1055 if (dstart > dstop) {
1056 std::cerr << "error: 'start' version (value: " << dstart << ") "
1057 << " is greater than 'end' version (value: " << dstop << ")"
1058 << std::endl;
1059 return EINVAL;
1060 }
1061
1062 version_t v = dstart;
1063 for (; v <= dstop; ++v) {
1064 bufferlist bl;
1065 st.get("paxos", v, bl);
1066 if (bl.length() == 0)
1067 break;
1068 cout << "\n--- " << v << " ---" << std::endl;
1069 auto tx(std::make_shared<MonitorDBStore::Transaction>());
1070 Paxos::decode_append_transaction(tx, bl);
1071 JSONFormatter f(true);
1072 tx->dump(&f);
1073 f.flush(cout);
1074 }
1075
1076 std::cout << "dumped " << v << " paxos versions" << std::endl;
1077
1078 } else if (cmd == "dump-trace") {
1079 unsigned dstart = 0;
1080 unsigned dstop = ~0;
1081 string outpath;
1082
1083 // visible options for this command
1084 po::options_description op_desc("Allowed 'dump-trace' options");
1085 op_desc.add_options()
1086 ("help,h", "produce this help message")
1087 ("start,s", po::value<unsigned>(&dstart),
1088 "starting version (default: 0)")
1089 ("end,e", po::value<unsigned>(&dstop),
1090 "finish version (default: ~0)")
1091 ;
1092 // this is going to be a positional argument; we don't want to show
1093 // it as an option during --help, but we do want to have it captured
1094 // when parsing.
1095 po::options_description hidden_op_desc("Hidden 'dump-trace' options");
1096 hidden_op_desc.add_options()
1097 ("out,o", po::value<string>(&outpath),
1098 "file to write the dump to")
1099 ;
1100 po::positional_options_description op_positional;
1101 op_positional.add("out", 1);
1102
1103 po::variables_map op_vm;
1104 int r = parse_cmd_args(&op_desc, &hidden_op_desc, &op_positional,
1105 subcmds, &op_vm);
1106 if (r < 0) {
1107 return -r;
1108 }
1109
1110 if (op_vm.count("help")) {
1111 usage(argv[0], op_desc);
1112 return 0;
1113 }
1114
1115 if (outpath.empty()) {
1116 usage(argv[0], op_desc);
1117 return EINVAL;
1118 }
1119
1120 if (dstart > dstop) {
1121 std::cerr << "error: 'start' version (value: " << dstart << ") "
1122 << " is greater than 'stop' version (value: " << dstop << ")"
1123 << std::endl;
1124 return EINVAL;
1125 }
1126
1127 TraceIter iter(outpath.c_str());
1128 iter.init();
1129 while (true) {
1130 if (!iter.valid())
1131 break;
1132 if (iter.num() >= dstop) {
1133 break;
1134 }
1135 if (iter.num() >= dstart) {
1136 JSONFormatter f(true);
1137 iter.cur()->dump(&f, false);
1138 f.flush(std::cout);
1139 std::cout << std::endl;
1140 }
1141 iter.next();
1142 }
1143 std::cerr << "Read up to transaction " << iter.num() << std::endl;
1144 } else if (cmd == "replay-trace") {
1145 string inpath;
1146 unsigned num_replays = 1;
1147 // visible options for this command
1148 po::options_description op_desc("Allowed 'replay-trace' options");
1149 op_desc.add_options()
1150 ("help,h", "produce this help message")
1151 ("num-replays,n", po::value<unsigned>(&num_replays),
1152 "finish version (default: 1)")
1153 ;
1154 // this is going to be a positional argument; we don't want to show
1155 // it as an option during --help, but we do want to have it captured
1156 // when parsing.
1157 po::options_description hidden_op_desc("Hidden 'replay-trace' options");
1158 hidden_op_desc.add_options()
1159 ("in,i", po::value<string>(&inpath),
1160 "file to write the dump to")
1161 ;
1162 po::positional_options_description op_positional;
1163 op_positional.add("in", 1);
1164
1165 // op_desc_all will aggregate all visible and hidden options for parsing.
1166 // when we call 'usage()' we just pass 'op_desc', as that's the description
1167 // holding the visible options.
1168 po::options_description op_desc_all;
1169 op_desc_all.add(op_desc).add(hidden_op_desc);
1170
1171 po::variables_map op_vm;
1172 try {
1173 po::parsed_options op_parsed = po::command_line_parser(subcmds).
1174 options(op_desc_all).positional(op_positional).run();
1175 po::store(op_parsed, op_vm);
1176 po::notify(op_vm);
1177 } catch (po::error &e) {
1178 std::cerr << "error: " << e.what() << std::endl;
1179 return EINVAL;
1180 }
1181
1182 if (op_vm.count("help")) {
1183 usage(argv[0], op_desc);
1184 return 0;
1185 }
1186
1187 if (inpath.empty()) {
1188 usage(argv[0], op_desc);
1189 return EINVAL;
1190 }
1191
1192 unsigned num = 0;
1193 for (unsigned i = 0; i < num_replays; ++i) {
1194 TraceIter iter(inpath.c_str());
1195 iter.init();
1196 while (true) {
1197 if (!iter.valid())
1198 break;
1199 std::cerr << "Replaying trans num " << num << std::endl;
1200 st.apply_transaction(iter.cur());
1201 iter.next();
1202 ++num;
1203 }
1204 std::cerr << "Read up to transaction " << iter.num() << std::endl;
1205 }
1206 } else if (cmd == "random-gen") {
1207 unsigned tsize = 200;
1208 unsigned tvalsize = 1024;
1209 unsigned ntrans = 100;
1210 po::options_description op_desc("Allowed 'random-gen' options");
1211 op_desc.add_options()
1212 ("help,h", "produce this help message")
1213 ("num-keys,k", po::value<unsigned>(&tsize),
1214 "keys to write in each transaction (default: 200)")
1215 ("size,s", po::value<unsigned>(&tvalsize),
1216 "size (in bytes) of the value to write in each key (default: 1024)")
1217 ("ntrans,n", po::value<unsigned>(&ntrans),
1218 "number of transactions to run (default: 100)")
1219 ;
1220
1221 po::variables_map op_vm;
1222 try {
1223 po::parsed_options op_parsed = po::command_line_parser(subcmds).
1224 options(op_desc).run();
1225 po::store(op_parsed, op_vm);
1226 po::notify(op_vm);
1227 } catch (po::error &e) {
1228 std::cerr << "error: " << e.what() << std::endl;
1229 return EINVAL;
1230 }
1231
1232 if (op_vm.count("help")) {
1233 usage(argv[0], op_desc);
1234 return 0;
1235 }
1236
1237 unsigned num = 0;
1238 for (unsigned i = 0; i < ntrans; ++i) {
1239 std::cerr << "Applying trans " << i << std::endl;
1240 auto t(std::make_shared<MonitorDBStore::Transaction>());
1241 string prefix;
1242 prefix.push_back((i%26)+'a');
1243 for (unsigned j = 0; j < tsize; ++j) {
1244 stringstream os;
1245 os << num;
1246 bufferlist bl;
1247 for (unsigned k = 0; k < tvalsize; ++k) bl.append(rand());
1248 t->put(prefix, os.str(), bl);
1249 ++num;
1250 }
1251 t->compact_prefix(prefix);
1252 st.apply_transaction(t);
1253 }
1254 } else if (cmd == "store-copy") {
1255 if (subcmds.size() < 1 || subcmds[0].empty()) {
1256 usage(argv[0], desc);
1257 return EINVAL;
1258 }
1259
1260 string out_path = subcmds[0];
1261
1262 MonitorDBStore out_store(out_path);
1263 {
1264 stringstream ss;
1265 int r = out_store.create_and_open(ss);
1266 if (r < 0) {
1267 std::cerr << ss.str() << std::endl;
1268 return err;
1269 }
1270 }
1271
1272
1273 KeyValueDB::WholeSpaceIterator it = st.get_iterator();
1274 uint64_t total_keys = 0;
1275 uint64_t total_size = 0;
1276 uint64_t total_tx = 0;
1277
1278 do {
1279 uint64_t num_keys = 0;
1280
1281 auto tx(std::make_shared<MonitorDBStore::Transaction>());
1282
1283 while (it->valid() && num_keys < 128) {
1284 pair<string,string> k = it->raw_key();
1285 bufferlist v = it->value();
1286 tx->put(k.first, k.second, v);
1287
1288 num_keys ++;
1289 total_tx ++;
1290 total_size += v.length();
1291
1292 it->next();
1293 }
1294
1295 total_keys += num_keys;
1296
1297 if (!tx->empty())
1298 out_store.apply_transaction(tx);
1299
1300 std::cout << "copied " << total_keys << " keys so far ("
1301 << stringify(byte_u_t(total_size)) << ")" << std::endl;
1302
1303 } while (it->valid());
1304 out_store.close();
1305 std::cout << "summary: copied " << total_keys << " keys, using "
1306 << total_tx << " transactions, totalling "
1307 << stringify(byte_u_t(total_size)) << std::endl;
1308 std::cout << "from '" << store_path << "' to '" << out_path << "'"
1309 << std::endl;
1310 } else if (cmd == "rewrite-crush") {
1311 err = rewrite_crush(argv[0], subcmds, st);
1312 } else if (cmd == "rebuild") {
1313 err = rebuild_monstore(argv[0], subcmds, st);
1314 } else {
1315 std::cerr << "Unrecognized command: " << cmd << std::endl;
1316 usage(argv[0], desc);
1317 return err;
1318 }
1319 }