]> git.proxmox.com Git - ceph.git/blame - ceph/src/tools/ceph_monstore_tool.cc
bump version to 12.2.2-pve1
[ceph.git] / ceph / src / tools / ceph_monstore_tool.cc
CommitLineData
7c673cae
FG
1// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
2// vim: ts=8 sw=2 smarttab
3/*
4* Ceph - scalable distributed file system
5*
6* Copyright (C) 2012 Inktank, Inc.
7*
8* This is free software; you can redistribute it and/or
9* modify it under the terms of the GNU Lesser General Public
10* License version 2.1, as published by the Free Software
11* Foundation. See file COPYING.
12*/
13#include <boost/program_options/variables_map.hpp>
14#include <boost/program_options/parsers.hpp>
15#include <boost/scope_exit.hpp>
16
17#include <stdlib.h>
18#include <string>
19
20#include "common/Formatter.h"
21#include "common/errno.h"
22
23#include "auth/KeyRing.h"
24#include "auth/cephx/CephxKeyServer.h"
25#include "global/global_init.h"
26#include "include/stringify.h"
3efd9988 27#include "mgr/mgr_commands.h"
7c673cae
FG
28#include "mon/AuthMonitor.h"
29#include "mon/MonitorDBStore.h"
30#include "mon/Paxos.h"
31#include "mon/MonMap.h"
32#include "mds/MDSMap.h"
33#include "osd/OSDMap.h"
34#include "crush/CrushCompiler.h"
35
36namespace po = boost::program_options;
37using namespace std;
38
39class TraceIter {
40 int fd;
41 unsigned idx;
42 MonitorDBStore::TransactionRef t;
43public:
44 explicit TraceIter(string fname) : fd(-1), idx(-1) {
45 fd = ::open(fname.c_str(), O_RDONLY);
46 t.reset(new MonitorDBStore::Transaction);
47 }
48 bool valid() {
49 return fd != -1;
50 }
51 MonitorDBStore::TransactionRef cur() {
52 assert(valid());
53 return t;
54 }
55 unsigned num() { return idx; }
56 void next() {
57 ++idx;
58 bufferlist bl;
59 int r = bl.read_fd(fd, 6);
60 if (r < 0) {
61 std::cerr << "Got error: " << cpp_strerror(r) << " on read_fd"
62 << std::endl;
63 ::close(fd);
64 fd = -1;
65 return;
66 } else if ((unsigned)r < 6) {
67 std::cerr << "short read" << std::endl;
68 ::close(fd);
69 fd = -1;
70 return;
71 }
72 bufferlist::iterator bliter = bl.begin();
73 uint8_t ver, ver2;
74 ::decode(ver, bliter);
75 ::decode(ver2, bliter);
76 uint32_t len;
77 ::decode(len, bliter);
78 r = bl.read_fd(fd, len);
79 if (r < 0) {
80 std::cerr << "Got error: " << cpp_strerror(r) << " on read_fd"
81 << std::endl;
82 ::close(fd);
83 fd = -1;
84 return;
85 } else if ((unsigned)r < len) {
86 std::cerr << "short read" << std::endl;
87 ::close(fd);
88 fd = -1;
89 return;
90 }
91 bliter = bl.begin();
92 t.reset(new MonitorDBStore::Transaction);
93 t->decode(bliter);
94 }
95 void init() {
96 next();
97 }
98 ~TraceIter() {
99 if (fd != -1) {
100 ::close(fd);
101 fd = -1;
102 }
103 }
104};
105
106
107int parse_cmd_args(
108 po::options_description *desc, /// < visible options description
109 po::options_description *hidden_desc, /// < hidden options description
110 po::positional_options_description *positional, /// < positional args
111 vector<string> &cmd_args, /// < arguments to be parsed
112 po::variables_map *vm /// > post-parsing variable map
113 )
114{
115 // desc_all will aggregate all visible and hidden options for parsing.
116 //
117 // From boost's program_options point of view, there is absolutely no
118 // distinction between 'desc' and 'hidden_desc'. This is a distinction
119 // that is only useful to us: 'desc' is whatever we are willing to show
120 // on 'usage()', whereas 'hidden_desc' refers to parameters we wish to
121 // take advantage of but do not wish to show on 'usage()'.
122 //
123 // For example, consider that program_options matches positional arguments
124 // (specified via 'positional') against the paramenters defined on a
125 // given 'po::options_description' class. This is performed below,
126 // supplying both the description and the positional arguments to the
127 // parser. However, we do not want the parameters that are mapped to
128 // positional arguments to be shown on usage, as that makes for ugly and
129 // confusing usage messages. Therefore we dissociate the options'
130 // description that is to be used as an aid to the user from those options
131 // that are nothing but useful for internal purposes (i.e., mapping options
132 // to positional arguments). We still need to aggregate them before parsing
133 // and that's what 'desc_all' is all about.
134 //
135
136 assert(desc != NULL);
137
138 po::options_description desc_all;
139 desc_all.add(*desc);
140 if (hidden_desc != NULL)
141 desc_all.add(*hidden_desc);
142
143 try {
144 po::command_line_parser parser = po::command_line_parser(cmd_args).
145 options(desc_all);
146
147 if (positional) {
148 parser = parser.positional(*positional);
149 }
150
151 po::parsed_options parsed = parser.run();
152 po::store(parsed, *vm);
153 po::notify(*vm);
154 } catch (po::error &e) {
155 std::cerr << "error: " << e.what() << std::endl;
156 return -EINVAL;
157 }
158 return 0;
159}
160
161
162/**
163 * usage: ceph-monstore-tool <store-path> <command> [options]
164 *
165 * commands:
166 *
167 * store-copy < --out arg >
168 * dump-keys
169 * compact
170 * getmonmap < --out arg [ --version arg ] >
171 * getosdmap < --out arg [ --version arg ] >
172 * dump-paxos <--dump-start VER> <--dump-end VER>
173 * dump-trace < --trace-file arg >
174 * replay-trace
175 * random-gen
176 * rewrite-crush
177 * inflate-pgmap
178 *
179 * wanted syntax:
180 *
181 * ceph-monstore-tool PATH CMD [options]
182 *
183 * ceph-monstore-tool PATH store-copy <PATH2 | -o PATH2>
184 * ceph-monstore-tool PATH dump-keys
185 * ceph-monstore-tool PATH compact
186 * ceph-monstore-tool PATH get monmap [VER]
187 * ceph-monstore-tool PATH get osdmap [VER]
188 * ceph-monstore-tool PATH dump-paxos STARTVER ENDVER
189 *
190 *
191 */
192void usage(const char *n, po::options_description &d)
193{
194 std::cerr <<
195 "usage: " << n << " <store-path> <cmd> [args|options]\n"
196 << "\n"
197 << "Commands:\n"
198 << " store-copy PATH copies store to PATH\n"
199 << " compact compacts the store\n"
200 << " get monmap [-- options] get monmap (version VER if specified)\n"
201 << " (default: last committed)\n"
202 << " get osdmap [-- options] get osdmap (version VER if specified)\n"
203 << " (default: last committed)\n"
204 << " get mdsmap [-- options] get mdsmap (version VER if specified)\n"
205 << " (default: last committed)\n"
206 << " get crushmap [-- options] get crushmap (version VER if specified)\n"
207 << " (default: last committed)\n"
208 << " show-versions [-- options] show the first&last committed version of map\n"
209 << " (show-versions -- --help for more info)\n"
210 << " dump-keys dumps store keys to FILE\n"
211 << " (default: stdout)\n"
212 << " dump-paxos [-- options] dump paxos transactions\n"
213 << " (dump-paxos -- --help for more info)\n"
214 << " dump-trace FILE [-- options] dump contents of trace file FILE\n"
215 << " (dump-trace -- --help for more info)\n"
216 << " replay-trace FILE [-- options] replay trace from FILE\n"
217 << " (replay-trace -- --help for more info)\n"
218 << " random-gen [-- options] add randomly generated ops to the store\n"
219 << " (random-gen -- --help for more info)\n"
220 << " rewrite-crush [-- options] add a rewrite commit to the store\n"
221 << " (rewrite-crush -- --help for more info)\n"
222 << " inflate-pgmap [-- options] add given number of pgmaps to store\n"
223 << " (inflate-pgmap -- --help for more info)\n"
224 << " rebuild rebuild store\n"
225 << " (rebuild -- --help for more info)\n"
226 << std::endl;
227 std::cerr << d << std::endl;
228 std::cerr
229 << "\nPlease Note:\n"
230 << "* Ceph-specific options should be in the format --option-name=VAL\n"
231 << " (specifically, do not forget the '='!!)\n"
232 << "* Command-specific options need to be passed after a '--'\n"
233 << " e.g., 'get monmap -- --version 10 --out /tmp/foo'"
234 << std::endl;
235}
236
237int update_osdmap(MonitorDBStore& store, version_t ver, bool copy,
238 ceph::shared_ptr<CrushWrapper> crush,
239 MonitorDBStore::Transaction* t) {
240 const string prefix("osdmap");
241
242 // full
243 bufferlist bl;
244 int r = 0;
245 r = store.get(prefix, store.combine_strings("full", ver), bl);
246 if (r) {
247 std::cerr << "Error getting full map: " << cpp_strerror(r) << std::endl;
248 return r;
249 }
250 OSDMap osdmap;
251 osdmap.decode(bl);
252 osdmap.crush = crush;
253 if (copy) {
254 osdmap.inc_epoch();
255 }
256 bl.clear();
257 // be consistent with OSDMonitor::update_from_paxos()
258 osdmap.encode(bl, CEPH_FEATURES_ALL|CEPH_FEATURE_RESERVED);
259 t->put(prefix, store.combine_strings("full", osdmap.get_epoch()), bl);
260
261 // incremental
262 OSDMap::Incremental inc;
263 if (copy) {
264 inc.epoch = osdmap.get_epoch();
265 inc.fsid = osdmap.get_fsid();
266 } else {
267 bl.clear();
268 r = store.get(prefix, ver, bl);
269 if (r) {
270 std::cerr << "Error getting inc map: " << cpp_strerror(r) << std::endl;
271 return r;
272 }
273 OSDMap::Incremental inc(bl);
274 if (inc.crush.length()) {
275 inc.crush.clear();
276 crush->encode(inc.crush, CEPH_FEATURES_SUPPORTED_DEFAULT);
277 }
278 if (inc.fullmap.length()) {
279 OSDMap fullmap;
280 fullmap.decode(inc.fullmap);
281 fullmap.crush = crush;
282 inc.fullmap.clear();
283 fullmap.encode(inc.fullmap);
284 }
285 }
286 assert(osdmap.have_crc());
287 inc.full_crc = osdmap.get_crc();
288 bl.clear();
289 // be consistent with OSDMonitor::update_from_paxos()
290 inc.encode(bl, CEPH_FEATURES_ALL|CEPH_FEATURE_RESERVED);
291 t->put(prefix, inc.epoch, bl);
292 return 0;
293}
294
295int rewrite_transaction(MonitorDBStore& store, int version,
296 const string& crush_file,
297 MonitorDBStore::Transaction* t) {
298 const string prefix("osdmap");
299
300 // calc the known-good epoch
301 version_t last_committed = store.get(prefix, "last_committed");
302 version_t good_version = 0;
303 if (version <= 0) {
304 if (last_committed >= (unsigned)-version) {
305 good_version = last_committed + version;
306 } else {
307 std::cerr << "osdmap-version is less than: -" << last_committed << std::endl;
308 return EINVAL;
309 }
310 } else {
311 good_version = version;
312 }
313 if (good_version >= last_committed) {
314 std::cout << "good epoch is greater or equal to the last committed one: "
315 << good_version << " >= " << last_committed << std::endl;
316 return 0;
317 }
318
319 // load/extract the crush map
320 int r = 0;
321 ceph::shared_ptr<CrushWrapper> crush(new CrushWrapper);
322 if (crush_file.empty()) {
323 bufferlist bl;
324 r = store.get(prefix, store.combine_strings("full", good_version), bl);
325 if (r) {
326 std::cerr << "Error getting map: " << cpp_strerror(r) << std::endl;
327 return r;
328 }
329 OSDMap osdmap;
330 osdmap.decode(bl);
331 crush = osdmap.crush;
332 } else {
333 string err;
334 bufferlist bl;
335 r = bl.read_file(crush_file.c_str(), &err);
336 if (r) {
337 std::cerr << err << ": " << cpp_strerror(r) << std::endl;
338 return r;
339 }
340 bufferlist::iterator p = bl.begin();
341 crush->decode(p);
342 }
343
344 // prepare a transaction to rewrite the epochs
345 // (good_version, last_committed]
346 // with the good crush map.
347 // XXX: may need to break this into several paxos versions?
348 assert(good_version < last_committed);
349 for (version_t v = good_version + 1; v <= last_committed; v++) {
350 cout << "rewriting epoch #" << v << "/" << last_committed << std::endl;
351 r = update_osdmap(store, v, false, crush, t);
352 if (r)
353 return r;
354 }
355
356 // add a new osdmap epoch to store, so monitors will update their current osdmap
357 // in addition to the ones stored in epochs.
358 //
359 // This is needed due to the way the monitor updates from paxos and the
360 // facilities we are leveraging to push this update to the rest of the
361 // quorum.
362 //
363 // In a nutshell, we are generating a good version of the osdmap, with a
364 // proper crush, and building a transaction that will replace the bad
365 // osdmaps with good osdmaps. But this transaction needs to be applied on
366 // all nodes, so that the monitors will have good osdmaps to share with
367 // clients. We thus leverage Paxos, specifically the recovery mechanism, by
368 // creating a pending value that will be committed once the monitors form an
369 // initial quorum after being brought back to life.
370 //
371 // However, the way the monitor works has the paxos services, including the
372 // OSDMonitor, updating their state from disk *prior* to the recovery phase
373 // begins (so they have an up to date state in memory). This means the
374 // OSDMonitor will see the old, broken map, before the new paxos version is
375 // applied to disk, and the old version is cached. Even though we have the
376 // good map now, and we share the good map with clients, we will still be
377 // working on the old broken map. Instead of mucking around the monitor to
378 // make this work, we instead opt for adding the same osdmap but with a
379 // newer version, so that the OSDMonitor picks up on it when it updates from
380 // paxos after the proposal has been committed. This is not elegant, but
381 // avoids further unpleasantness that would arise from kludging around the
382 // current behavior. Also, has the added benefit of making sure the clients
383 // get an updated version of the map (because last_committed+1 >
384 // last_committed) :)
385 //
386 cout << "adding a new epoch #" << last_committed+1 << std::endl;
387 r = update_osdmap(store, last_committed++, true, crush, t);
388 if (r)
389 return r;
390 t->put(prefix, store.combine_strings("full", "latest"), last_committed);
391 t->put(prefix, "last_committed", last_committed);
392 return 0;
393}
394
395/**
396 * create a new paxos version which carries a proposal to rewrite all epochs
397 * of incremental and full map of "osdmap" after a faulty crush map is injected.
398 * so the leader will trigger a recovery and propagate this fix to its peons,
399 * after the proposal is accepted, and the transaction in it is applied. all
400 * monitors will rewrite the bad crush map with the good one, and have a new
401 * osdmap epoch with the good crush map in it.
402 */
403int rewrite_crush(const char* progname,
404 vector<string>& subcmds,
405 MonitorDBStore& store) {
406 po::options_description op_desc("Allowed 'rewrite-crush' options");
407 int version = -1;
408 string crush_file;
409 op_desc.add_options()
410 ("help,h", "produce this help message")
411 ("crush", po::value<string>(&crush_file),
412 ("path to the crush map file "
413 "(default: will instead extract it from the known-good osdmap)"))
414 ("good-epoch", po::value<int>(&version),
415 "known-good epoch of osdmap, if a negative number '-N' is given, the "
416 "$last_committed-N is used instead (default: -1). "
417 "Please note, -1 is not necessarily a good epoch, because there are "
418 "good chance that we have more epochs slipped into the monstore after "
419 "the one where the crushmap is firstly injected.")
420 ;
421 po::variables_map op_vm;
422 int r = parse_cmd_args(&op_desc, NULL, NULL, subcmds, &op_vm);
423 if (r) {
424 return -r;
425 }
426 if (op_vm.count("help")) {
427 usage(progname, op_desc);
428 return 0;
429 }
430
431 MonitorDBStore::Transaction rewrite_txn;
432 r = rewrite_transaction(store, version, crush_file, &rewrite_txn);
433 if (r) {
434 return r;
435 }
436
437 // store the transaction into store as a proposal
438 const string prefix("paxos");
439 version_t pending_v = store.get(prefix, "last_committed") + 1;
440 auto t(std::make_shared<MonitorDBStore::Transaction>());
441 bufferlist bl;
442 rewrite_txn.encode(bl);
443 cout << "adding pending commit " << pending_v
444 << " " << bl.length() << " bytes" << std::endl;
445 t->put(prefix, pending_v, bl);
446 t->put(prefix, "pending_v", pending_v);
447 // a large enough yet unique proposal number will probably do the trick
448 version_t pending_pn = (store.get(prefix, "accepted_pn") / 100 + 4) * 100 + 1;
449 t->put(prefix, "pending_pn", pending_pn);
450 store.apply_transaction(t);
451 return 0;
452}
453
454int inflate_pgmap(MonitorDBStore& st, unsigned n, bool can_be_trimmed) {
455 // put latest pg map into monstore to bloat it up
456 // only format version == 1 is supported
457 version_t last = st.get("pgmap", "last_committed");
458 bufferlist bl;
459
460 // get the latest delta
461 int r = st.get("pgmap", last, bl);
462 if (r) {
463 std::cerr << "Error getting pgmap: " << cpp_strerror(r) << std::endl;
464 return r;
465 }
466
467 // try to pull together an idempotent "delta"
468 ceph::unordered_map<pg_t, pg_stat_t> pg_stat;
469 for (KeyValueDB::Iterator i = st.get_iterator("pgmap_pg");
470 i->valid(); i->next()) {
471 pg_t pgid;
472 if (!pgid.parse(i->key().c_str())) {
473 std::cerr << "unable to parse key " << i->key() << std::endl;
474 continue;
475 }
476 bufferlist pg_bl = i->value();
477 pg_stat_t ps;
478 bufferlist::iterator p = pg_bl.begin();
479 ::decode(ps, p);
480 // will update the last_epoch_clean of all the pgs.
481 pg_stat[pgid] = ps;
482 }
483
484 version_t first = st.get("pgmap", "first_committed");
485 version_t ver = last;
486 auto txn(std::make_shared<MonitorDBStore::Transaction>());
487 for (unsigned i = 0; i < n; i++) {
488 bufferlist trans_bl;
489 bufferlist dirty_pgs;
490 for (ceph::unordered_map<pg_t, pg_stat_t>::iterator ps = pg_stat.begin();
491 ps != pg_stat.end(); ++ps) {
492 ::encode(ps->first, dirty_pgs);
493 if (!can_be_trimmed) {
494 ps->second.last_epoch_clean = first;
495 }
496 ::encode(ps->second, dirty_pgs);
497 }
498 utime_t inc_stamp = ceph_clock_now();
499 ::encode(inc_stamp, trans_bl);
500 ::encode_destructively(dirty_pgs, trans_bl);
501 bufferlist dirty_osds;
502 ::encode(dirty_osds, trans_bl);
503 txn->put("pgmap", ++ver, trans_bl);
504 // update the db in batch
505 if (txn->size() > 1024) {
506 st.apply_transaction(txn);
507 // reset the transaction
508 txn.reset(new MonitorDBStore::Transaction);
509 }
510 }
511 txn->put("pgmap", "last_committed", ver);
512 txn->put("pgmap_meta", "version", ver);
513 // this will also piggy back the leftover pgmap added in the loop above
514 st.apply_transaction(txn);
515 return 0;
516}
517
518static int update_auth(MonitorDBStore& st, const string& keyring_path)
519{
520 // import all keyrings stored in the keyring file
521 KeyRing keyring;
522 int r = keyring.load(g_ceph_context, keyring_path);
523 if (r < 0) {
524 cerr << "unable to load admin keyring: " << keyring_path << std::endl;
525 return r;
526 }
527
528 bufferlist bl;
529 __u8 v = 1;
530 ::encode(v, bl);
531
532 for (const auto& k : keyring.get_keys()) {
533 KeyServerData::Incremental auth_inc;
534 auth_inc.name = k.first;
535 auth_inc.auth = k.second;
536 if (auth_inc.auth.caps.empty()) {
537 cerr << "no caps granted to: " << auth_inc.name << std::endl;
538 return -EINVAL;
539 }
540 auth_inc.op = KeyServerData::AUTH_INC_ADD;
541
542 AuthMonitor::Incremental inc;
543 inc.inc_type = AuthMonitor::AUTH_DATA;
544 ::encode(auth_inc, inc.auth_data);
545 inc.auth_type = CEPH_AUTH_CEPHX;
546
547 inc.encode(bl, CEPH_FEATURES_ALL);
548 }
549
550 const string prefix("auth");
551 auto last_committed = st.get(prefix, "last_committed") + 1;
552 auto t = make_shared<MonitorDBStore::Transaction>();
553 t->put(prefix, last_committed, bl);
554 t->put(prefix, "last_committed", last_committed);
555 auto first_committed = st.get(prefix, "first_committed");
556 if (!first_committed) {
557 t->put(prefix, "first_committed", last_committed);
558 }
559 st.apply_transaction(t);
560 return 0;
561}
562
563static int update_mkfs(MonitorDBStore& st)
564{
565 MonMap monmap;
566 int r = monmap.build_initial(g_ceph_context, cerr);
567 if (r) {
568 cerr << "no initial monitors" << std::endl;
569 return -EINVAL;
570 }
571 bufferlist bl;
572 monmap.encode(bl, CEPH_FEATURES_ALL);
573 monmap.set_epoch(0);
574 auto t = make_shared<MonitorDBStore::Transaction>();
575 t->put("mkfs", "monmap", bl);
576 st.apply_transaction(t);
577 return 0;
578}
579
580static int update_monitor(MonitorDBStore& st)
581{
582 const string prefix("monitor");
583 // a stripped-down Monitor::mkfs()
584 bufferlist bl;
585 bl.append(CEPH_MON_ONDISK_MAGIC "\n");
586 auto t = make_shared<MonitorDBStore::Transaction>();
587 t->put(prefix, "magic", bl);
588 st.apply_transaction(t);
589 return 0;
590}
591
3efd9988
FG
592static int update_mgrmap(MonitorDBStore& st)
593{
594 auto t = make_shared<MonitorDBStore::Transaction>();
595
596 {
597 MgrMap map;
598 // mgr expects epoch > 1
599 map.epoch++;
600 auto initial_modules =
601 get_str_vec(g_ceph_context->_conf->get_val<string>("mgr_initial_modules"));
602 copy(begin(initial_modules),
603 end(initial_modules),
604 inserter(map.modules, end(map.modules)));
605 bufferlist bl;
606 map.encode(bl, CEPH_FEATURES_ALL);
607 t->put("mgr", map.epoch, bl);
608 t->put("mgr", "last_committed", map.epoch);
609 }
610 {
611 auto mgr_command_descs = mgr_commands;
612 for (auto& c : mgr_command_descs) {
613 c.set_flag(MonCommand::FLAG_MGR);
614 }
615 bufferlist bl;
616 ::encode(mgr_command_descs, bl);
617 t->put("mgr_command_desc", "", bl);
618 }
619 return st.apply_transaction(t);
620}
621
7c673cae
FG
622static int update_paxos(MonitorDBStore& st)
623{
624 // build a pending paxos proposal from all non-permanent k/v pairs. once the
625 // proposal is committed, it will gets applied. on the sync provider side, it
626 // will be a no-op, but on its peers, the paxos commit will help to build up
627 // the necessary epochs.
628 bufferlist pending_proposal;
629 {
630 MonitorDBStore::Transaction t;
631 vector<string> prefixes = {"auth", "osdmap",
3efd9988 632 "mgr", "mgr_command_desc",
7c673cae
FG
633 "pgmap", "pgmap_pg", "pgmap_meta"};
634 for (const auto& prefix : prefixes) {
635 for (auto i = st.get_iterator(prefix); i->valid(); i->next()) {
636 auto key = i->raw_key();
637 auto val = i->value();
638 t.put(key.first, key.second, val);
639 }
640 }
641 t.encode(pending_proposal);
642 }
643 const string prefix("paxos");
644 auto t = make_shared<MonitorDBStore::Transaction>();
645 t->put(prefix, "first_committed", 0);
646 t->put(prefix, "last_committed", 0);
647 auto pending_v = 1;
648 t->put(prefix, pending_v, pending_proposal);
649 t->put(prefix, "pending_v", pending_v);
650 t->put(prefix, "pending_pn", 400);
651 st.apply_transaction(t);
652 return 0;
653}
654
655// rebuild
656// - pgmap_meta/version
657// - pgmap_meta/last_osdmap_epoch
658// - pgmap_meta/last_pg_scan
659// - pgmap_meta/full_ratio
660// - pgmap_meta/nearfull_ratio
661// - pgmap_meta/stamp
662static int update_pgmap_meta(MonitorDBStore& st)
663{
664 const string prefix("pgmap_meta");
665 auto t = make_shared<MonitorDBStore::Transaction>();
666 // stolen from PGMonitor::create_pending()
667 // the first pgmap_meta
668 t->put(prefix, "version", 1);
669 {
670 auto stamp = ceph_clock_now();
671 bufferlist bl;
672 ::encode(stamp, bl);
673 t->put(prefix, "stamp", bl);
674 }
675 {
676 auto last_osdmap_epoch = st.get("osdmap", "last_committed");
677 t->put(prefix, "last_osdmap_epoch", last_osdmap_epoch);
678 }
679 // be conservative, so PGMonitor will scan the all pools for pg changes
680 t->put(prefix, "last_pg_scan", 1);
681 {
682 auto full_ratio = g_ceph_context->_conf->mon_osd_full_ratio;
683 if (full_ratio > 1.0)
684 full_ratio /= 100.0;
685 bufferlist bl;
686 ::encode(full_ratio, bl);
687 t->put(prefix, "full_ratio", bl);
688 }
689 {
690 auto backfillfull_ratio = g_ceph_context->_conf->mon_osd_backfillfull_ratio;
691 if (backfillfull_ratio > 1.0)
692 backfillfull_ratio /= 100.0;
693 bufferlist bl;
694 ::encode(backfillfull_ratio, bl);
695 t->put(prefix, "backfillfull_ratio", bl);
696 }
697 {
698 auto nearfull_ratio = g_ceph_context->_conf->mon_osd_nearfull_ratio;
699 if (nearfull_ratio > 1.0)
700 nearfull_ratio /= 100.0;
701 bufferlist bl;
702 ::encode(nearfull_ratio, bl);
703 t->put(prefix, "nearfull_ratio", bl);
704 }
705 st.apply_transaction(t);
706 return 0;
707}
708
709int rebuild_monstore(const char* progname,
710 vector<string>& subcmds,
711 MonitorDBStore& st)
712{
713 po::options_description op_desc("Allowed 'rebuild' options");
714 string keyring_path;
715 op_desc.add_options()
716 ("keyring", po::value<string>(&keyring_path),
717 "path to the client.admin key");
718 po::variables_map op_vm;
719 int r = parse_cmd_args(&op_desc, nullptr, nullptr, subcmds, &op_vm);
720 if (r) {
721 return -r;
722 }
723 if (op_vm.count("help")) {
724 usage(progname, op_desc);
725 return 0;
726 }
727 if (!keyring_path.empty())
728 update_auth(st, keyring_path);
729 if ((r = update_pgmap_meta(st))) {
730 return r;
731 }
732 if ((r = update_paxos(st))) {
733 return r;
734 }
735 if ((r = update_mkfs(st))) {
736 return r;
737 }
738 if ((r = update_monitor(st))) {
739 return r;
740 }
3efd9988
FG
741 if ((r = update_mgrmap(st))) {
742 return r;
743 }
7c673cae
FG
744 return 0;
745}
746
747int main(int argc, char **argv) {
748 int err = 0;
749 po::options_description desc("Allowed options");
750 string store_path, cmd;
751 vector<string> subcmds;
752 desc.add_options()
753 ("help,h", "produce help message")
754 ;
755
756 /* Dear Future Developer:
757 *
758 * for further improvement, should you need to pass specific options to
759 * a command (e.g., get osdmap VER --hex), you can expand the current
760 * format by creating additional 'po::option_description' and passing
761 * 'subcmds' to 'po::command_line_parser', much like what is currently
762 * done by default. However, beware: in order to differentiate a
763 * command-specific option from the generic/global options, you will need
764 * to pass '--' in the command line (so that the first parser, the one
765 * below, assumes it has reached the end of all options); e.g.,
766 * 'get osdmap VER -- --hex'. Not pretty; far from intuitive; it was as
767 * far as I got with this library. Improvements on this format will be
768 * left as an exercise for the reader. -Joao
769 */
770 po::options_description positional_desc("Positional argument options");
771 positional_desc.add_options()
772 ("store-path", po::value<string>(&store_path),
773 "path to monitor's store")
774 ("command", po::value<string>(&cmd),
775 "Command")
776 ("subcmd", po::value<vector<string> >(&subcmds),
777 "Command arguments/Sub-Commands")
778 ;
779 po::positional_options_description positional;
780 positional.add("store-path", 1);
781 positional.add("command", 1);
782 positional.add("subcmd", -1);
783
784 po::options_description all_desc("All options");
785 all_desc.add(desc).add(positional_desc);
786
787 vector<string> ceph_option_strings;
788 po::variables_map vm;
789 try {
790 po::parsed_options parsed =
791 po::command_line_parser(argc, argv).
792 options(all_desc).
793 positional(positional).
794 allow_unregistered().run();
795
796 po::store(
797 parsed,
798 vm);
799 po::notify(vm);
800
801 // Specifying po::include_positional would have our positional arguments
802 // being collected (thus being part of ceph_option_strings and eventually
803 // passed on to global_init() below).
804 // Instead we specify po::exclude_positional, which has the upside of
805 // completely avoid this, but the downside of having to specify ceph
806 // options as --VAR=VAL (note the '='); otherwise we will capture the
807 // positional 'VAL' as belonging to us, never being collected.
808 ceph_option_strings = po::collect_unrecognized(parsed.options,
809 po::exclude_positional);
810
811 } catch(po::error &e) {
812 std::cerr << "error: " << e.what() << std::endl;
813 return 1;
814 }
815
816 // parse command structure before calling global_init() and friends.
817
818 if (vm.empty() || vm.count("help") ||
819 store_path.empty() || cmd.empty() ||
820 *cmd.begin() == '-') {
821 usage(argv[0], desc);
822 return 1;
823 }
824
825 vector<const char *> ceph_options, def_args;
826 ceph_options.reserve(ceph_option_strings.size());
827 for (vector<string>::iterator i = ceph_option_strings.begin();
828 i != ceph_option_strings.end();
829 ++i) {
830 ceph_options.push_back(i->c_str());
831 }
832
833 auto cct = global_init(
834 &def_args, ceph_options, CEPH_ENTITY_TYPE_MON,
835 CODE_ENVIRONMENT_UTILITY, 0);
836 common_init_finish(g_ceph_context);
837 g_ceph_context->_conf->apply_changes(NULL);
838 g_conf = g_ceph_context->_conf;
839
840 // this is where we'll write *whatever*, on a per-command basis.
841 // not all commands require some place to write their things.
842 MonitorDBStore st(store_path);
843 if (store_path.size()) {
844 stringstream ss;
845 int r = st.open(ss);
846 if (r < 0) {
847 std::cerr << ss.str() << std::endl;
848 return EINVAL;
849 }
850 }
851
852 if (cmd == "dump-keys") {
853 KeyValueDB::WholeSpaceIterator iter = st.get_iterator();
854 while (iter->valid()) {
855 pair<string,string> key(iter->raw_key());
856 cout << key.first << " / " << key.second << std::endl;
857 iter->next();
858 }
859 } else if (cmd == "compact") {
860 st.compact();
861 } else if (cmd == "get") {
862 unsigned v = 0;
863 string outpath;
864 bool readable = false;
865 string map_type;
866 // visible options for this command
867 po::options_description op_desc("Allowed 'get' options");
868 op_desc.add_options()
869 ("help,h", "produce this help message")
870 ("out,o", po::value<string>(&outpath),
871 "output file (default: stdout)")
872 ("version,v", po::value<unsigned>(&v),
873 "map version to obtain")
874 ("readable,r", po::value<bool>(&readable)->default_value(false),
875 "print the map infomation in human readable format")
876 ;
877 // this is going to be a positional argument; we don't want to show
878 // it as an option during --help, but we do want to have it captured
879 // when parsing.
880 po::options_description hidden_op_desc("Hidden 'get' options");
881 hidden_op_desc.add_options()
882 ("map-type", po::value<string>(&map_type),
883 "map-type")
884 ;
885 po::positional_options_description op_positional;
886 op_positional.add("map-type", 1);
887
888 po::variables_map op_vm;
889 int r = parse_cmd_args(&op_desc, &hidden_op_desc, &op_positional,
890 subcmds, &op_vm);
891 if (r < 0) {
892 err = -r;
893 goto done;
894 }
895
896 if (op_vm.count("help") || map_type.empty()) {
897 usage(argv[0], op_desc);
898 err = 0;
899 goto done;
900 }
901
902 if (v == 0) {
903 if (map_type == "crushmap") {
904 v = st.get("osdmap", "last_committed");
905 } else {
906 v = st.get(map_type, "last_committed");
907 }
908 }
909
910 int fd = STDOUT_FILENO;
911 if (!outpath.empty()){
912 fd = ::open(outpath.c_str(), O_WRONLY|O_CREAT|O_TRUNC, 0666);
913 if (fd < 0) {
914 std::cerr << "error opening output file: "
915 << cpp_strerror(errno) << std::endl;
916 err = EINVAL;
917 goto done;
918 }
919 }
920
921 BOOST_SCOPE_EXIT((&r) (&fd) (&outpath)) {
922 ::close(fd);
923 if (r < 0 && fd != STDOUT_FILENO) {
924 ::remove(outpath.c_str());
925 }
926 } BOOST_SCOPE_EXIT_END
927
928 bufferlist bl;
929 r = 0;
930 if (map_type == "osdmap") {
931 r = st.get(map_type, st.combine_strings("full", v), bl);
932 } else if (map_type == "crushmap") {
933 bufferlist tmp;
934 r = st.get("osdmap", st.combine_strings("full", v), tmp);
935 if (r >= 0) {
936 OSDMap osdmap;
937 osdmap.decode(tmp);
938 osdmap.crush->encode(bl, CEPH_FEATURES_SUPPORTED_DEFAULT);
939 }
940 } else {
941 r = st.get(map_type, v, bl);
942 }
943 if (r < 0) {
944 std::cerr << "Error getting map: " << cpp_strerror(r) << std::endl;
945 err = EINVAL;
946 goto done;
947 }
948
949 if (readable) {
950 stringstream ss;
951 bufferlist out;
952 if (map_type == "monmap") {
953 MonMap monmap;
954 monmap.decode(bl);
955 monmap.print(ss);
956 } else if (map_type == "osdmap") {
957 OSDMap osdmap;
958 osdmap.decode(bl);
959 osdmap.print(ss);
960 } else if (map_type == "mdsmap") {
961 MDSMap mdsmap;
962 mdsmap.decode(bl);
963 mdsmap.print(ss);
964 } else if (map_type == "crushmap") {
965 CrushWrapper cw;
966 bufferlist::iterator it = bl.begin();
967 cw.decode(it);
968 CrushCompiler cc(cw, std::cerr, 0);
969 cc.decompile(ss);
970 } else {
971 std::cerr << "This type of readable map does not exist: " << map_type << std::endl
972 << "You can only specify[osdmap|monmap|mdsmap|crushmap]" << std::endl;
973 }
974 out.append(ss);
975 out.write_fd(fd);
976 } else {
977 bl.write_fd(fd);
978 }
979
980 if (!outpath.empty()) {
981 std::cout << "wrote " << map_type
982 << " version " << v << " to " << outpath
983 << std::endl;
984 }
985 } else if (cmd == "show-versions") {
986 string map_type; //map type:osdmap,monmap...
987 // visible options for this command
988 po::options_description op_desc("Allowed 'show-versions' options");
989 op_desc.add_options()
990 ("help,h", "produce this help message")
991 ("map-type", po::value<string>(&map_type), "map_type");
992
993 po::positional_options_description op_positional;
994 op_positional.add("map-type", 1);
995
996 po::variables_map op_vm;
997 int r = parse_cmd_args(&op_desc, NULL, &op_positional,
998 subcmds, &op_vm);
999 if (r < 0) {
1000 err = -r;
1001 goto done;
1002 }
1003
1004 if (op_vm.count("help") || map_type.empty()) {
1005 usage(argv[0], op_desc);
1006 err = 0;
1007 goto done;
1008 }
1009
1010 unsigned int v_first = 0;
1011 unsigned int v_last = 0;
1012 v_first = st.get(map_type, "first_committed");
1013 v_last = st.get(map_type, "last_committed");
1014
1015 std::cout << "first committed:\t" << v_first << "\n"
1016 << "last committed:\t" << v_last << std::endl;
1017 } else if (cmd == "dump-paxos") {
1018 unsigned dstart = 0;
1019 unsigned dstop = ~0;
1020 po::options_description op_desc("Allowed 'dump-paxos' options");
1021 op_desc.add_options()
1022 ("help,h", "produce this help message")
1023 ("start,s", po::value<unsigned>(&dstart),
1024 "starting version (default: 0)")
1025 ("end,e", po::value<unsigned>(&dstop),
1026 "finish version (default: ~0)")
1027 ;
1028
1029 po::variables_map op_vm;
1030 int r = parse_cmd_args(&op_desc, NULL, NULL,
1031 subcmds, &op_vm);
1032 if (r < 0) {
1033 err = -r;
1034 goto done;
1035 }
1036
1037 if (op_vm.count("help")) {
1038 usage(argv[0], op_desc);
1039 err = 0;
1040 goto done;
1041 }
1042
1043 if (dstart > dstop) {
1044 std::cerr << "error: 'start' version (value: " << dstart << ") "
1045 << " is greater than 'end' version (value: " << dstop << ")"
1046 << std::endl;
1047 err = EINVAL;
1048 goto done;
1049 }
1050
1051 version_t v = dstart;
1052 for (; v <= dstop; ++v) {
1053 bufferlist bl;
1054 st.get("paxos", v, bl);
1055 if (bl.length() == 0)
1056 break;
1057 cout << "\n--- " << v << " ---" << std::endl;
1058 auto tx(std::make_shared<MonitorDBStore::Transaction>());
1059 Paxos::decode_append_transaction(tx, bl);
1060 JSONFormatter f(true);
1061 tx->dump(&f);
1062 f.flush(cout);
1063 }
1064
1065 std::cout << "dumped " << v << " paxos versions" << std::endl;
1066
1067 } else if (cmd == "dump-trace") {
1068 unsigned dstart = 0;
1069 unsigned dstop = ~0;
1070 string outpath;
1071
1072 // visible options for this command
1073 po::options_description op_desc("Allowed 'dump-trace' options");
1074 op_desc.add_options()
1075 ("help,h", "produce this help message")
1076 ("start,s", po::value<unsigned>(&dstart),
1077 "starting version (default: 0)")
1078 ("end,e", po::value<unsigned>(&dstop),
1079 "finish version (default: ~0)")
1080 ;
1081 // this is going to be a positional argument; we don't want to show
1082 // it as an option during --help, but we do want to have it captured
1083 // when parsing.
1084 po::options_description hidden_op_desc("Hidden 'dump-trace' options");
1085 hidden_op_desc.add_options()
1086 ("out,o", po::value<string>(&outpath),
1087 "file to write the dump to")
1088 ;
1089 po::positional_options_description op_positional;
1090 op_positional.add("out", 1);
1091
1092 po::variables_map op_vm;
1093 int r = parse_cmd_args(&op_desc, &hidden_op_desc, &op_positional,
1094 subcmds, &op_vm);
1095 if (r < 0) {
1096 err = -r;
1097 goto done;
1098 }
1099
1100 if (op_vm.count("help")) {
1101 usage(argv[0], op_desc);
1102 err = 0;
1103 goto done;
1104 }
1105
1106 if (outpath.empty()) {
1107 usage(argv[0], op_desc);
1108 err = EINVAL;
1109 goto done;
1110 }
1111
1112 if (dstart > dstop) {
1113 std::cerr << "error: 'start' version (value: " << dstart << ") "
1114 << " is greater than 'stop' version (value: " << dstop << ")"
1115 << std::endl;
1116 err = EINVAL;
1117 goto done;
1118 }
1119
1120 TraceIter iter(outpath.c_str());
1121 iter.init();
1122 while (true) {
1123 if (!iter.valid())
1124 break;
1125 if (iter.num() >= dstop) {
1126 break;
1127 }
1128 if (iter.num() >= dstart) {
1129 JSONFormatter f(true);
1130 iter.cur()->dump(&f, false);
1131 f.flush(std::cout);
1132 std::cout << std::endl;
1133 }
1134 iter.next();
1135 }
1136 std::cerr << "Read up to transaction " << iter.num() << std::endl;
1137 } else if (cmd == "replay-trace") {
1138 string inpath;
1139 unsigned num_replays = 1;
1140 // visible options for this command
1141 po::options_description op_desc("Allowed 'replay-trace' options");
1142 op_desc.add_options()
1143 ("help,h", "produce this help message")
1144 ("num-replays,n", po::value<unsigned>(&num_replays),
1145 "finish version (default: 1)")
1146 ;
1147 // this is going to be a positional argument; we don't want to show
1148 // it as an option during --help, but we do want to have it captured
1149 // when parsing.
1150 po::options_description hidden_op_desc("Hidden 'replay-trace' options");
1151 hidden_op_desc.add_options()
1152 ("in,i", po::value<string>(&inpath),
1153 "file to write the dump to")
1154 ;
1155 po::positional_options_description op_positional;
1156 op_positional.add("in", 1);
1157
1158 // op_desc_all will aggregate all visible and hidden options for parsing.
1159 // when we call 'usage()' we just pass 'op_desc', as that's the description
1160 // holding the visible options.
1161 po::options_description op_desc_all;
1162 op_desc_all.add(op_desc).add(hidden_op_desc);
1163
1164 po::variables_map op_vm;
1165 try {
1166 po::parsed_options op_parsed = po::command_line_parser(subcmds).
1167 options(op_desc_all).positional(op_positional).run();
1168 po::store(op_parsed, op_vm);
1169 po::notify(op_vm);
1170 } catch (po::error &e) {
1171 std::cerr << "error: " << e.what() << std::endl;
1172 err = EINVAL;
1173 goto done;
1174 }
1175
1176 if (op_vm.count("help")) {
1177 usage(argv[0], op_desc);
1178 err = 0;
1179 goto done;
1180 }
1181
1182 if (inpath.empty()) {
1183 usage(argv[0], op_desc);
1184 err = EINVAL;
1185 goto done;
1186 }
1187
1188 unsigned num = 0;
1189 for (unsigned i = 0; i < num_replays; ++i) {
1190 TraceIter iter(inpath.c_str());
1191 iter.init();
1192 while (true) {
1193 if (!iter.valid())
1194 break;
1195 std::cerr << "Replaying trans num " << num << std::endl;
1196 st.apply_transaction(iter.cur());
1197 iter.next();
1198 ++num;
1199 }
1200 std::cerr << "Read up to transaction " << iter.num() << std::endl;
1201 }
1202 } else if (cmd == "random-gen") {
1203 unsigned tsize = 200;
1204 unsigned tvalsize = 1024;
1205 unsigned ntrans = 100;
1206 po::options_description op_desc("Allowed 'random-gen' options");
1207 op_desc.add_options()
1208 ("help,h", "produce this help message")
1209 ("num-keys,k", po::value<unsigned>(&tsize),
1210 "keys to write in each transaction (default: 200)")
1211 ("size,s", po::value<unsigned>(&tvalsize),
1212 "size (in bytes) of the value to write in each key (default: 1024)")
1213 ("ntrans,n", po::value<unsigned>(&ntrans),
1214 "number of transactions to run (default: 100)")
1215 ;
1216
1217 po::variables_map op_vm;
1218 try {
1219 po::parsed_options op_parsed = po::command_line_parser(subcmds).
1220 options(op_desc).run();
1221 po::store(op_parsed, op_vm);
1222 po::notify(op_vm);
1223 } catch (po::error &e) {
1224 std::cerr << "error: " << e.what() << std::endl;
1225 err = EINVAL;
1226 goto done;
1227 }
1228
1229 if (op_vm.count("help")) {
1230 usage(argv[0], op_desc);
1231 err = 0;
1232 goto done;
1233 }
1234
1235 unsigned num = 0;
1236 for (unsigned i = 0; i < ntrans; ++i) {
1237 std::cerr << "Applying trans " << i << std::endl;
1238 auto t(std::make_shared<MonitorDBStore::Transaction>());
1239 string prefix;
1240 prefix.push_back((i%26)+'a');
1241 for (unsigned j = 0; j < tsize; ++j) {
1242 stringstream os;
1243 os << num;
1244 bufferlist bl;
1245 for (unsigned k = 0; k < tvalsize; ++k) bl.append(rand());
1246 t->put(prefix, os.str(), bl);
1247 ++num;
1248 }
1249 t->compact_prefix(prefix);
1250 st.apply_transaction(t);
1251 }
1252 } else if (cmd == "store-copy") {
1253 if (subcmds.size() < 1 || subcmds[0].empty()) {
1254 usage(argv[0], desc);
1255 err = EINVAL;
1256 goto done;
1257 }
1258
1259 string out_path = subcmds[0];
1260
1261 MonitorDBStore out_store(out_path);
1262 {
1263 stringstream ss;
1264 int r = out_store.create_and_open(ss);
1265 if (r < 0) {
1266 std::cerr << ss.str() << std::endl;
1267 goto done;
1268 }
1269 }
1270
1271
1272 KeyValueDB::WholeSpaceIterator it = st.get_iterator();
1273 uint64_t total_keys = 0;
1274 uint64_t total_size = 0;
1275 uint64_t total_tx = 0;
1276
1277 do {
1278 uint64_t num_keys = 0;
1279
1280 auto tx(std::make_shared<MonitorDBStore::Transaction>());
1281
1282 while (it->valid() && num_keys < 128) {
1283 pair<string,string> k = it->raw_key();
1284 bufferlist v = it->value();
1285 tx->put(k.first, k.second, v);
1286
1287 num_keys ++;
1288 total_tx ++;
1289 total_size += v.length();
1290
1291 it->next();
1292 }
1293
1294 total_keys += num_keys;
1295
1296 if (!tx->empty())
1297 out_store.apply_transaction(tx);
1298
1299 std::cout << "copied " << total_keys << " keys so far ("
1300 << stringify(si_t(total_size)) << ")" << std::endl;
1301
1302 } while (it->valid());
1303 out_store.close();
1304 std::cout << "summary: copied " << total_keys << " keys, using "
1305 << total_tx << " transactions, totalling "
1306 << stringify(si_t(total_size)) << std::endl;
1307 std::cout << "from '" << store_path << "' to '" << out_path << "'"
1308 << std::endl;
1309 } else if (cmd == "rewrite-crush") {
1310 err = rewrite_crush(argv[0], subcmds, st);
1311 } else if (cmd == "inflate-pgmap") {
1312 unsigned n = 2000;
1313 bool can_be_trimmed = false;
1314 po::options_description op_desc("Allowed 'inflate-pgmap' options");
1315 op_desc.add_options()
1316 ("num-maps,n", po::value<unsigned>(&n),
1317 "number of maps to add (default: 2000)")
1318 ("can-be-trimmed", po::value<bool>(&can_be_trimmed),
1319 "can be trimmed (default: false)")
1320 ;
1321
1322 po::variables_map op_vm;
1323 try {
1324 po::parsed_options op_parsed = po::command_line_parser(subcmds).
1325 options(op_desc).run();
1326 po::store(op_parsed, op_vm);
1327 po::notify(op_vm);
1328 } catch (po::error &e) {
1329 std::cerr << "error: " << e.what() << std::endl;
1330 err = EINVAL;
1331 goto done;
1332 }
1333 err = inflate_pgmap(st, n, can_be_trimmed);
1334 } else if (cmd == "rebuild") {
1335 err = rebuild_monstore(argv[0], subcmds, st);
1336 } else {
1337 std::cerr << "Unrecognized command: " << cmd << std::endl;
1338 usage(argv[0], desc);
1339 goto done;
1340 }
1341
1342 done:
1343 st.close();
1344 return err;
1345}