]> git.proxmox.com Git - ceph.git/blob - ceph/src/mds/MDBalancer.cc
update sources to ceph Nautilus 14.2.1
[ceph.git] / ceph / src / mds / MDBalancer.cc
1 // -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
2 // vim: ts=8 sw=2 smarttab
3 /*
4 * Ceph - scalable distributed file system
5 *
6 * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
7 *
8 * This is free software; you can redistribute it and/or
9 * modify it under the terms of the GNU Lesser General Public
10 * License version 2.1, as published by the Free Software
11 * Foundation. See file COPYING.
12 *
13 */
14
15 #include "include/compat.h"
16 #include "mdstypes.h"
17
18 #include "mon/MonClient.h"
19 #include "MDBalancer.h"
20 #include "MDSRank.h"
21 #include "MDSMap.h"
22 #include "CInode.h"
23 #include "CDir.h"
24 #include "MDCache.h"
25 #include "Migrator.h"
26 #include "Mantle.h"
27
28 #include "include/Context.h"
29 #include "msg/Messenger.h"
30
31 #include <fstream>
32 #include <iostream>
33 #include <vector>
34 #include <map>
35 using std::map;
36 using std::vector;
37 using std::chrono::duration_cast;
38
39 #include "common/config.h"
40 #include "common/errno.h"
41
42 #define dout_context g_ceph_context
43 #undef dout_prefix
44 #define dout_prefix *_dout << "mds." << mds->get_nodeid() << ".bal "
45 #undef dout
46 #define dout(lvl) \
47 do {\
48 auto subsys = ceph_subsys_mds;\
49 if ((dout_context)->_conf->subsys.should_gather(ceph_subsys_mds_balancer, lvl)) {\
50 subsys = ceph_subsys_mds_balancer;\
51 }\
52 dout_impl(dout_context, ceph::dout::need_dynamic(subsys), lvl) dout_prefix
53 #undef dendl
54 #define dendl dendl_impl; } while (0)
55
56
57 #define MIN_LOAD 50 // ??
58 #define MIN_REEXPORT 5 // will automatically reexport
59 #define MIN_OFFLOAD 10 // point at which i stop trying, close enough
60
61
/**
 * Dispatch an incoming balancer message.
 *
 * Only MSG_MDS_HEARTBEAT is expected; any other message type is a
 * programming error and aborts the daemon.
 *
 * @param m incoming message reference
 * @return 0 always (an unknown type aborts instead of returning)
 */
int MDBalancer::proc_message(const Message::const_ref &m)
{
  switch (m->get_type()) {

  case MSG_MDS_HEARTBEAT:
    handle_heartbeat(MHeartbeat::msgref_cast(m));
    break;

  default:
    derr << " balancer unknown message " << m->get_type() << dendl_impl;
    ceph_abort_msg("balancer unknown message");
  }

  return 0;
}
77
// Construct the balancer for one MDS rank. The fragmentation-related
// settings are cached here and kept current by handle_conf_change().
MDBalancer::MDBalancer(MDSRank *m, Messenger *msgr, MonClient *monc) :
    mds(m), messenger(msgr), mon_client(monc)
{
  bal_fragment_dirs = g_conf().get_val<bool>("mds_bal_fragment_dirs");
  bal_fragment_interval = g_conf().get_val<int64_t>("mds_bal_fragment_interval");
}
84
85 void MDBalancer::handle_conf_change(const ConfigProxy& conf,
86 const std::set <std::string> &changed,
87 const MDSMap &mds_map)
88 {
89 if (changed.count("mds_bal_fragment_dirs"))
90 bal_fragment_dirs = g_conf().get_val<bool>("mds_bal_fragment_dirs");
91 if (changed.count("mds_bal_fragment_interval"))
92 bal_fragment_interval = g_conf().get_val<int64_t>("mds_bal_fragment_interval");
93 }
94
/**
 * Process the export-pin queue.
 *
 * For each queued inode, inspect its auth dirfrags and reconcile their
 * subtree state with the inode's current export pin:
 *  - pin cleared (MDS_RANK_NONE): drop the aux-subtree flag and try to
 *    merge the subtree back into its parent;
 *  - pinned to this rank: mark (or keep) the dirfrag as an aux subtree
 *    rooted here;
 *  - pinned elsewhere: kick off an export to the target rank.
 * An inode is removed from the queue only when every dirfrag was fully
 * handled; frozen/freezing frags (and in-flight exports) keep it queued
 * for a later pass.
 *
 * Finally, re-export any auth subtree whose pin points at another rank.
 */
void MDBalancer::handle_export_pins(void)
{
  auto &q = mds->mdcache->export_pin_queue;
  auto it = q.begin();
  dout(20) << "export_pin_queue size=" << q.size() << dendl;
  while (it != q.end()) {
    // advance before possible erase of *cur below
    auto cur = it++;
    CInode *in = *cur;
    ceph_assert(in->is_dir());
    mds_rank_t export_pin = in->get_export_pin(false);

    // remove stays true only if all dirfrags were dealt with
    bool remove = true;
    list<CDir*> dfls;
    in->get_dirfrags(dfls);
    for (auto dir : dfls) {
      if (!dir->is_auth())
	continue;

      if (export_pin == MDS_RANK_NONE) {
	// pin removed: undo any aux subtree we created for it
	if (dir->state_test(CDir::STATE_AUXSUBTREE)) {
	  if (dir->is_frozen() || dir->is_freezing()) {
	    // try again later
	    remove = false;
	    continue;
	  }
	  dout(10) << " clear auxsubtree on " << *dir << dendl;
	  dir->state_clear(CDir::STATE_AUXSUBTREE);
	  mds->mdcache->try_subtree_merge(dir);
	}
      } else if (export_pin == mds->get_nodeid()) {
	// pinned to me: make sure the frag is an aux subtree root here
	if (dir->state_test(CDir::STATE_CREATING) ||
	    dir->is_frozen() || dir->is_freezing()) {
	  // try again later
	  remove = false;
	  continue;
	}
	if (!dir->is_subtree_root()) {
	  dir->state_set(CDir::STATE_AUXSUBTREE);
	  mds->mdcache->adjust_subtree_auth(dir, mds->get_nodeid());
	  dout(10) << " create aux subtree on " << *dir << dendl;
	} else if (!dir->state_test(CDir::STATE_AUXSUBTREE)) {
	  dout(10) << " set auxsubtree bit on " << *dir << dendl;
	  dir->state_set(CDir::STATE_AUXSUBTREE);
	}
      } else {
	// pinned to another rank: export; keep queued until it completes
	mds->mdcache->migrator->export_dir(dir, export_pin);
	remove = false;
      }
    }

    if (remove) {
      in->state_clear(CInode::STATE_QUEUEDEXPORTPIN);
      q.erase(cur);
    }
  }

  // also catch auth subtrees whose pin points away from this rank
  for (auto &cd : mds->mdcache->get_auth_subtrees()) {
    mds_rank_t export_pin = cd->inode->get_export_pin();
    dout(10) << "auth tree " << *cd << " export_pin=" << export_pin << dendl;
    if (export_pin >= 0 && export_pin != mds->get_nodeid()) {
      dout(10) << "exporting auth subtree " << *cd->inode << " to " << export_pin << dendl;
      mds->mdcache->migrator->export_dir(cd, export_pin);
    }
  }
}
160
/**
 * Periodic balancer driver, invoked from the MDS tick.
 *
 * - processes the export-pin queue when mds_bal_export_pin is enabled;
 * - refreshes last_sample once mds_bal_sample_interval has elapsed;
 * - on rank 0 only, while active: starts a heartbeat/balance round every
 *   mds_bal_interval seconds, limited by mds_bal_max rounds and/or
 *   resumed after mds_bal_max_until seconds of uptime.
 */
void MDBalancer::tick()
{
  // remaining allowed balance rounds (mds_bal_max); static so the count
  // persists and decrements across ticks
  static int num_bal_times = g_conf()->mds_bal_max;
  auto bal_interval = g_conf().get_val<int64_t>("mds_bal_interval");
  auto bal_max_until = g_conf().get_val<int64_t>("mds_bal_max_until");
  time now = clock::now();

  if (g_conf()->mds_bal_export_pin) {
    handle_export_pins();
  }

  // sample?
  if (chrono::duration<double>(now-last_sample).count() >
    g_conf()->mds_bal_sample_interval) {
    dout(15) << "tick last_sample now " << now << dendl;
    last_sample = now;
  }

  // We can use duration_cast below, although the result is an int,
  // because the values from g_conf are also integers.
  // balance?
  if (mds->get_nodeid() == 0
      && mds->is_active()
      && bal_interval > 0
      && duration_cast<chrono::seconds>(now - last_heartbeat).count() >= bal_interval
      && (num_bal_times || (bal_max_until >= 0 && mds->get_uptime().count() > bal_max_until))) {
    last_heartbeat = now;
    send_heartbeat();
    num_bal_times--;
  }
}
192
193
194
195
/**
 * Completion context that (re)sends a balancer heartbeat; queued when
 * the MDCache is not yet open so the heartbeat is retried once it is.
 */
class C_Bal_SendHeartbeat : public MDSInternalContext {
public:
  explicit C_Bal_SendHeartbeat(MDSRank *mds_) : MDSInternalContext(mds_) { }
  void finish(int f) override {
    mds->balancer->send_heartbeat();
  }
};
203
204
205 double mds_load_t::mds_load() const
206 {
207 switch(g_conf()->mds_bal_mode) {
208 case 0:
209 return
210 .8 * auth.meta_load() +
211 .2 * all.meta_load() +
212 req_rate +
213 10.0 * queue_len;
214
215 case 1:
216 return req_rate + 10.0*queue_len;
217
218 case 2:
219 return cpu_load_avg;
220
221 }
222 ceph_abort();
223 return 0;
224 }
225
/**
 * Build the current load sample for this MDS.
 *
 * Aggregates popularity from the root dirfrags, samples CPU time from
 * /proc/self/stat (utime+stime, fields 14/15), and derives request and
 * CPU rates from the deltas since the previous call. If less than one
 * second has elapsed, the previously reported rates are reused instead
 * of computing noisy short-interval rates.
 */
mds_load_t MDBalancer::get_load()
{
  auto now = clock::now();

  mds_load_t load{DecayRate()}; /* zero DecayRate! */

  if (mds->mdcache->get_root()) {
    list<CDir*> ls;
    mds->mdcache->get_root()->get_dirfrags(ls);
    for (auto &d : ls) {
      load.auth.add(d->pop_auth_subtree_nested);
      load.all.add(d->pop_nested);
    }
  } else {
    dout(20) << "get_load no root, no load" << dendl;
  }

  uint64_t num_requests = mds->get_num_requests();

  // sample cumulative CPU time; defaults to 1 if the stat file is unusable
  uint64_t cpu_time = 1;
  {
    string stat_path = PROCPREFIX "/proc/self/stat";
    ifstream stat_file(stat_path);
    if (stat_file.is_open()) {
      vector<string> stat_vec(std::istream_iterator<string>{stat_file},
			      std::istream_iterator<string>());
      if (stat_vec.size() >= 15) {
	// utime + stime
	cpu_time = strtoll(stat_vec[13].c_str(), nullptr, 10) +
		   strtoll(stat_vec[14].c_str(), nullptr, 10);
      } else {
	derr << "input file '" << stat_path << "' not resolvable" << dendl_impl;
      }
    } else {
      derr << "input file '" << stat_path << "' not found" << dendl_impl;
    }
  }

  load.queue_len = messenger->get_dispatch_queue_len();

  bool update_last = true;
  if (last_get_load != clock::zero() &&
      now > last_get_load) {
    double el = std::chrono::duration<double>(now-last_get_load).count();
    if (el >= 1.0) {
      // long enough interval: compute fresh rates from the deltas
      if (num_requests > last_num_requests)
	load.req_rate = (num_requests - last_num_requests) / el;
      if (cpu_time > last_cpu_time)
	load.cpu_load_avg = (cpu_time - last_cpu_time) / el;
    } else {
      // interval too short: carry over the rates we last reported
      auto p = mds_load.find(mds->get_nodeid());
      if (p != mds_load.end()) {
	load.req_rate = p->second.req_rate;
	load.cpu_load_avg = p->second.cpu_load_avg;
      }
      // keep the old baseline so the next call spans a full interval
      // (unless a counter went backwards, in which case resync below)
      if (num_requests >= last_num_requests && cpu_time >= last_cpu_time)
	update_last = false;
    }
  }

  if (update_last) {
    last_num_requests = num_requests;
    last_cpu_time = cpu_time;
    last_get_load = now;
  }

  dout(15) << "get_load " << load << dendl;
  return load;
}
295
/*
 * Read synchronously from RADOS using a timeout. We cannot do daemon-local
 * fallbacks (i.e. kick off async read when we are processing the map and
 * check status when we get here) with the way the mds is structured.
 */
int MDBalancer::localize_balancer()
{
  /* reset everything */
  bool ack = false;
  int r = 0;
  bufferlist lua_src;
  Mutex lock("lock");
  Cond cond;

  /* we assume that balancer is in the metadata pool */
  object_t oid = object_t(mds->mdsmap->get_balancer());
  object_locator_t oloc(mds->mdsmap->get_metadata_pool());
  // C_SafeCond signals `cond` and fills `ack`/`r` when the read completes
  ceph_tid_t tid = mds->objecter->read(oid, oloc, 0, 0, CEPH_NOSNAP, &lua_src, 0,
                                       new C_SafeCond(&lock, &cond, &ack, &r));
  dout(15) << "launched non-blocking read tid=" << tid
           << " oid=" << oid << " oloc=" << oloc << dendl;

  /* timeout: if we waste half our time waiting for RADOS, then abort! */
  auto bal_interval = g_conf().get_val<int64_t>("mds_bal_interval");
  lock.Lock();
  int ret_t = cond.WaitInterval(lock, utime_t(bal_interval / 2, 0));
  lock.Unlock();

  /* success: store the balancer in memory and set the version. */
  if (!r) {
    // the wait may have timed out even though no error was reported;
    // cancel the in-flight read and report the timeout to the caller
    if (ret_t == ETIMEDOUT) {
      mds->objecter->op_cancel(tid, -ECANCELED);
      return -ETIMEDOUT;
    }
    bal_code.assign(lua_src.to_str());
    bal_version.assign(oid.name);
    dout(10) << "localized balancer, bal_code=" << bal_code << dendl;
  }
  return r;
}
336
/**
 * Broadcast this rank's load to every other up MDS.
 *
 * Skipped while the cluster is degraded; deferred (via
 * C_Bal_SendHeartbeat) until the MDCache is open. Rank 0 starts a new
 * beat epoch and clears the collected per-rank loads. The heartbeat
 * carries this rank's mds_load_t plus an import map describing how much
 * metadata load it imports from each other rank.
 */
void MDBalancer::send_heartbeat()
{
  if (mds->is_cluster_degraded()) {
    dout(10) << "send_heartbeat degraded" << dendl;
    return;
  }

  if (!mds->mdcache->is_open()) {
    dout(5) << "not open" << dendl;
    mds->mdcache->wait_for_open(new C_Bal_SendHeartbeat(mds));
    return;
  }

  // rank 0 drives the epoch; everyone else adopts it from rank 0's beats
  if (mds->get_nodeid() == 0) {
    beat_epoch++;
    mds_load.clear();
  }

  // my load
  mds_load_t load = get_load();
  mds->logger->set(l_mds_load_cent, 100 * load.mds_load());
  mds->logger->set(l_mds_dispatch_queue_len, load.queue_len);

  // record/overwrite my own entry in the per-rank load table
  auto em = mds_load.emplace(std::piecewise_construct, std::forward_as_tuple(mds->get_nodeid()), std::forward_as_tuple(load));
  if (!em.second) {
    em.first->second = load;
  }

  // import_map -- how much do i import from whom
  map<mds_rank_t, float> import_map;
  for (auto& im : mds->mdcache->get_auth_subtrees()) {
    mds_rank_t from = im->inode->authority().first;
    if (from == mds->get_nodeid()) continue;
    if (im->get_inode()->is_stray()) continue;
    import_map[from] += im->pop_auth_subtree.meta_load();
  }
  mds_import_map[ mds->get_nodeid() ] = import_map;


  dout(5) << "mds." << mds->get_nodeid() << " epoch " << beat_epoch << " load " << load << dendl;
  for (map<mds_rank_t, float>::iterator it = import_map.begin();
       it != import_map.end();
       ++it) {
    dout(5) << "  import_map from " << it->first << " -> " << it->second << dendl;
  }


  // send to every other up rank
  set<mds_rank_t> up;
  mds->get_mds_map()->get_up_mds_set(up);
  for (const auto& r : up) {
    if (r == mds->get_nodeid())
      continue;
    auto hb = MHeartbeat::create(load, beat_epoch);
    hb->get_import_map() = import_map;
    mds->send_message_mds(hb, r);
  }
}
394
/**
 * Handle a heartbeat from another rank.
 *
 * Reconciles the beat epoch (rank 0 is authoritative; non-zero ranks
 * may also adopt a newer epoch seen from a peer before rank 0's beat
 * arrives), records the sender's load and import map, and — once loads
 * from all in-cluster ranks for this epoch have been collected — starts
 * a rebalance, preferring the Mantle balancer when one is configured.
 */
void MDBalancer::handle_heartbeat(const MHeartbeat::const_ref &m)
{
  mds_rank_t who = mds_rank_t(m->get_source().num());
  dout(25) << "=== got heartbeat " << m->get_beat() << " from " << m->get_source().num() << " " << m->get_load() << dendl;

  if (!mds->is_active())
    return;

  if (!mds->mdcache->is_open()) {
    dout(10) << "opening root on handle_heartbeat" << dendl;
    mds->mdcache->wait_for_open(new C_MDS_RetryMessage(mds, m));
    return;
  }

  if (mds->is_cluster_degraded()) {
    dout(10) << " degraded, ignoring" << dendl;
    return;
  }

  // a peer saw a newer epoch before rank 0's beat reached us: adopt it
  if (mds->get_nodeid() != 0 && m->get_beat() > beat_epoch) {
    dout(10) << "receive next epoch " << m->get_beat() << " from mds." << who << " before mds0" << dendl;

    beat_epoch = m->get_beat();
    // clear the mds load info whose epoch is less than beat_epoch
    mds_load.clear();
  }

  if (who == 0) {
    dout(20) << " from mds0, new epoch " << m->get_beat() << dendl;
    if (beat_epoch != m->get_beat()) {
      beat_epoch = m->get_beat();
      mds_load.clear();
    }

    // answer rank 0 with our own load for this epoch
    send_heartbeat();

    mds->mdcache->show_subtrees();
  } else if (mds->get_nodeid() == 0) {
    // rank 0 ignores stragglers from a previous epoch
    if (beat_epoch != m->get_beat()) {
      dout(10) << " old heartbeat epoch, ignoring" << dendl;
      return;
    }
  }

  // record (or overwrite) the sender's load for this epoch
  {
    auto em = mds_load.emplace(std::piecewise_construct, std::forward_as_tuple(who), std::forward_as_tuple(m->get_load()));
    if (!em.second) {
      em.first->second = m->get_load();
    }
  }
  mds_import_map[who] = m->get_import_map();

  {
    unsigned cluster_size = mds->get_mds_map()->get_num_in_mds();
    if (mds_load.size() == cluster_size) {
      // let's go!
      //export_empties();  // no!

      /* avoid spamming ceph -w if user does not turn mantle on */
      if (mds->mdsmap->get_balancer() != "") {
        int r = mantle_prep_rebalance();
        if (!r) return;
	mds->clog->warn() << "using old balancer; mantle failed for "
                          << "balancer=" << mds->mdsmap->get_balancer()
                          << " : " << cpp_strerror(r);
      }
      prep_rebalance(m->get_beat());
    }
  }
}
465
466 double MDBalancer::try_match(balance_state_t& state, mds_rank_t ex, double& maxex,
467 mds_rank_t im, double& maxim)
468 {
469 if (maxex <= 0 || maxim <= 0) return 0.0;
470
471 double howmuch = std::min(maxex, maxim);
472 if (howmuch <= 0) return 0.0;
473
474 dout(5) << " - mds." << ex << " exports " << howmuch << " to mds." << im << dendl;
475
476 if (ex == mds->get_nodeid())
477 state.targets[im] += howmuch;
478
479 state.exported[ex] += howmuch;
480 state.imported[im] += howmuch;
481
482 maxex -= howmuch;
483 maxim -= howmuch;
484
485 return howmuch;
486 }
487
/**
 * Schedule a split of `dir` into g_conf mds_bal_split_bits fragments.
 *
 * The actual split runs from a callback that re-resolves the dirfrag
 * (it may have been trimmed or lost auth in the meantime). With
 * fast=true the callback is queued on the MDSRank waiter list and runs
 * right after the current dispatch; otherwise it is delayed by
 * bal_fragment_interval so op bursts can drain before the freeze.
 * split_pending de-duplicates outstanding requests per dirfrag.
 */
void MDBalancer::queue_split(const CDir *dir, bool fast)
{
  dout(10) << __func__ << " enqueuing " << *dir
                       << " (fast=" << fast << ")" << dendl;

  const dirfrag_t frag = dir->dirfrag();

  auto callback = [this, frag](int r) {
    if (split_pending.erase(frag) == 0) {
      // Someone beat me to it.  This can happen in the fast splitting
      // path, because we spawn two contexts, one with mds->timer and
      // one with mds->queue_waiter.  The loser can safely just drop
      // out.
      return;
    }

    CDir *split_dir = mds->mdcache->get_dirfrag(frag);
    if (!split_dir) {
      dout(10) << "drop split on " << frag << " because not in cache" << dendl;
      return;
    }
    if (!split_dir->is_auth()) {
      dout(10) << "drop split on " << frag << " because non-auth" << dendl;
      return;
    }

    // Pass on to MDCache: note that the split might still not
    // happen if the checks in MDCache::can_fragment fail.
    dout(10) << __func__ << " splitting " << *split_dir << dendl;
    mds->mdcache->split_dir(split_dir, g_conf()->mds_bal_split_bits);
  };

  bool is_new = false;
  if (split_pending.count(frag) == 0) {
    split_pending.insert(frag);
    is_new = true;
  }

  if (fast) {
    // Do the split ASAP: enqueue it in the MDSRank waiters which are
    // run at the end of dispatching the current request
    mds->queue_waiter(new MDSInternalContextWrapper(mds,
          new FunctionContext(callback)));
  } else if (is_new) {
    // Set a timer to really do the split: we don't do it immediately
    // so that bursts of ops on a directory have a chance to go through
    // before we freeze it.
    mds->timer.add_event_after(bal_fragment_interval,
                               new FunctionContext(callback));
  }
}
539
/**
 * Schedule a merge attempt for `dir` after bal_fragment_interval.
 *
 * The deferred callback re-resolves the dirfrag, then walks up the
 * fragtree: at each level it checks that all sibling frags are cached,
 * auth, and willing to merge (should_merge); it stops at the widest
 * frag for which that holds and asks MDCache to merge down to it.
 * merge_pending ensures at most one callback per dirfrag is in flight.
 */
void MDBalancer::queue_merge(CDir *dir)
{
  const auto frag = dir->dirfrag();
  auto callback = [this, frag](int r) {
    ceph_assert(frag.frag != frag_t());

    // frag must be in this set because only one context is in flight
    // for a given frag at a time (because merge_pending is checked before
    // starting one), and this context is the only one that erases it.
    merge_pending.erase(frag);

    CDir *dir = mds->mdcache->get_dirfrag(frag);
    if (!dir) {
      dout(10) << "drop merge on " << frag << " because not in cache" << dendl;
      return;
    }
    ceph_assert(dir->dirfrag() == frag);

    if(!dir->is_auth()) {
      dout(10) << "drop merge on " << *dir << " because lost auth" << dendl;
      return;
    }

    dout(10) << "merging " << *dir << dendl;

    CInode *diri = dir->get_inode();

    // climb the fragtree while every sibling subtree agrees to merge
    frag_t fg = dir->get_frag();
    while (fg != frag_t()) {
      frag_t sibfg = fg.get_sibling();
      list<CDir*> sibs;
      bool complete = diri->get_dirfrags_under(sibfg, sibs);
      if (!complete) {
        dout(10) << "  not all sibs under " << sibfg << " in cache (have " << sibs << ")" << dendl;
        break;
      }
      bool all = true;
      for (list<CDir*>::iterator p = sibs.begin(); p != sibs.end(); ++p) {
        CDir *sib = *p;
        if (!sib->is_auth() || !sib->should_merge()) {
          all = false;
          break;
        }
      }
      if (!all) {
        dout(10) << "  not all sibs under " << sibfg << " " << sibs << " should_merge" << dendl;
        break;
      }
      dout(10) << "  all sibs under " << sibfg << " " << sibs << " should merge" << dendl;
      fg = fg.parent();
    }

    // only merge if the walk actually widened past the current frag
    if (fg != dir->get_frag())
      mds->mdcache->merge_dir(diri, fg);
  };

  if (merge_pending.count(frag) == 0) {
    dout(20) << __func__ << " enqueued dir " << *dir << dendl;
    merge_pending.insert(frag);
    mds->timer.add_event_after(bal_fragment_interval,
        new FunctionContext(callback));
  } else {
    dout(20) << __func__ << " dir already in queue " << *dir << dendl;
  }
}
605
/**
 * Default (non-Mantle) rebalance planner.
 *
 * Converts the collected per-rank mds_load_t values into comparable
 * meta-load units, computes the cluster's average (target) load, and —
 * if this rank has been sufficiently overloaded for at least two
 * epochs — matches exporters to importers to build a balance_state_t,
 * which is then handed to try_rebalance() for execution.
 *
 * With mds_thrash_exports set, every up rank simply becomes a zero
 * target so exports are thrown around randomly for testing.
 *
 * @param beat current beat number; its parity selects between the
 *             big-to-big and small-to-big matching strategies
 */
void MDBalancer::prep_rebalance(int beat)
{
  balance_state_t state;

  if (g_conf()->mds_thrash_exports) {
    //we're going to randomly export to all the mds in the cluster
    set<mds_rank_t> up_mds;
    mds->get_mds_map()->get_up_mds_set(up_mds);
    for (const auto &rank : up_mds) {
      state.targets[rank] = 0.0;
    }
  } else {
    int cluster_size = mds->get_mds_map()->get_num_in_mds();
    mds_rank_t whoami = mds->get_nodeid();
    rebalance_time = clock::now();

    dout(5) << " prep_rebalance: cluster loads are" << dendl;

    mds->mdcache->migrator->clear_export_queue();

    // rescale!  turn my mds_load back into meta_load units
    double load_fac = 1.0;
    map<mds_rank_t, mds_load_t>::iterator m = mds_load.find(whoami);
    if ((m != mds_load.end()) && (m->second.mds_load() > 0)) {
      double metald = m->second.auth.meta_load();
      double mdsld = m->second.mds_load();
      load_fac = metald / mdsld;
      dout(7) << " load_fac is " << load_fac
	      << " <- " << m->second.auth << " " << metald
	      << " / " << mdsld
	      << dendl;
    }

    mds_meta_load.clear();

    // per-rank meta loads, total, and a load-sorted rank map
    double total_load = 0.0;
    multimap<double,mds_rank_t> load_map;
    for (mds_rank_t i=mds_rank_t(0); i < mds_rank_t(cluster_size); i++) {
      mds_load_t& load = mds_load.at(i);

      double l = load.mds_load() * load_fac;
      mds_meta_load[i] = l;

      if (whoami == 0)
	dout(5) << "  mds." << i
		<< " " << load
		<< " = " << load.mds_load()
		<< " ~ " << l << dendl;

      if (whoami == i) my_load = l;
      total_load += l;

      load_map.insert(pair<double,mds_rank_t>( l, i ));
    }

    // target load
    target_load = total_load / (double)cluster_size;
    dout(5) << "prep_rebalance:  my load " << my_load
	    << "   target " << target_load
	    << "   total " << total_load
	    << dendl;

    // under or over?
    // remember the epoch in which each rank was last at/near target load
    for (auto p : load_map) {
      if (p.first < target_load * (1.0 + g_conf()->mds_bal_min_rebalance)) {
	dout(5) << "  mds." << p.second << " is underloaded or barely overloaded." << dendl;
	mds_last_epoch_under_map[p.second] = beat_epoch;
      }
    }

    int last_epoch_under = mds_last_epoch_under_map[whoami];
    if (last_epoch_under == beat_epoch) {
      dout(5) << "  i am underloaded or barely overloaded, doing nothing." << dendl;
      return;
    }
    // am i over long enough?
    if (last_epoch_under && beat_epoch - last_epoch_under < 2) {
      dout(5) << "  i am overloaded, but only for " << (beat_epoch - last_epoch_under) << " epochs" << dendl;
      return;
    }

    dout(5) << "  i am sufficiently overloaded" << dendl;


    // first separate exporters and importers
    multimap<double,mds_rank_t> importers;
    multimap<double,mds_rank_t> exporters;
    set<mds_rank_t>             importer_set;
    set<mds_rank_t>             exporter_set;

    for (multimap<double,mds_rank_t>::iterator it = load_map.begin();
	 it != load_map.end();
	 ++it) {
      if (it->first < target_load) {
	dout(15) << "   mds." << it->second << " is importer" << dendl;
	importers.insert(pair<double,mds_rank_t>(it->first,it->second));
	importer_set.insert(it->second);
      } else {
	// ranks only recently above target are not stable exporters yet
	int mds_last_epoch_under = mds_last_epoch_under_map[it->second];
	if (!(mds_last_epoch_under && beat_epoch - mds_last_epoch_under < 2)) {
	  dout(15) << "   mds." << it->second << " is exporter" << dendl;
	  exporters.insert(pair<double,mds_rank_t>(it->first,it->second));
	  exporter_set.insert(it->second);
	}
      }
    }


    // determine load transfer mapping

    if (true) {
      // analyze import_map; do any matches i can

      dout(15) << "  matching exporters to import sources" << dendl;

      // big -> small exporters
      for (multimap<double,mds_rank_t>::reverse_iterator ex = exporters.rbegin();
	   ex != exporters.rend();
	   ++ex) {
	double maxex = get_maxex(state, ex->second);
	if (maxex <= .001) continue;

	// check importers. for now, just in arbitrary order (no intelligent matching).
	for (map<mds_rank_t, float>::iterator im = mds_import_map[ex->second].begin();
	     im != mds_import_map[ex->second].end();
	     ++im) {
	  double maxim = get_maxim(state, im->first);
	  if (maxim <= .001) continue;
	  try_match(state, ex->second, maxex, im->first, maxim);
	  if (maxex <= .001) break;
	}
      }
    }

    // old way
    if (beat % 2 == 1) {
      dout(15) << "  matching big exporters to big importers" << dendl;
      // big exporters to big importers
      multimap<double,mds_rank_t>::reverse_iterator ex = exporters.rbegin();
      multimap<double,mds_rank_t>::iterator im = importers.begin();
      while (ex != exporters.rend() &&
	     im != importers.end()) {
	double maxex = get_maxex(state, ex->second);
	double maxim = get_maxim(state, im->second);
	if (maxex < .001 || maxim < .001) break;
	try_match(state, ex->second, maxex, im->second, maxim);
	if (maxex <= .001) ++ex;
	if (maxim <= .001) ++im;
      }
    } else { // new way
      dout(15) << "  matching small exporters to big importers" << dendl;
      // small exporters to big importers
      multimap<double,mds_rank_t>::iterator ex = exporters.begin();
      multimap<double,mds_rank_t>::iterator im = importers.begin();
      while (ex != exporters.end() &&
	     im != importers.end()) {
	double maxex = get_maxex(state, ex->second);
	double maxim = get_maxim(state, im->second);
	if (maxex < .001 || maxim < .001) break;
	try_match(state, ex->second, maxex, im->second, maxim);
	if (maxex <= .001) ++ex;
	if (maxim <= .001) ++im;
      }
    }
  }
  try_rebalance(state);
}
773
/**
 * Mantle (Lua) rebalance planner.
 *
 * Re-fetches the balancer script from RADOS if its version changed,
 * packages the per-rank load metrics into name->value maps, and runs
 * the script via Mantle to produce the export targets, which are then
 * executed by try_rebalance().
 *
 * @return 0 on success; a localize/Mantle error code, or -EINVAL if
 *         the script produced a target vector of the wrong size
 *         (the caller falls back to the default balancer on nonzero)
 */
int MDBalancer::mantle_prep_rebalance()
{
  balance_state_t state;

  /* refresh balancer if it has changed */
  if (bal_version != mds->mdsmap->get_balancer()) {
    bal_version.assign("");
    int r = localize_balancer();
    if (r) return r;

    /* only spam the cluster log from 1 mds on version changes */
    if (mds->get_nodeid() == 0)
      mds->clog->info() << "mantle balancer version changed: " << bal_version;
  }

  /* prepare for balancing */
  int cluster_size = mds->get_mds_map()->get_num_in_mds();
  rebalance_time = clock::now();
  mds->mdcache->migrator->clear_export_queue();

  /* fill in the metrics for each mds by grabbing load struct */
  vector < map<string, double> > metrics (cluster_size);
  for (mds_rank_t i=mds_rank_t(0); i < mds_rank_t(cluster_size); i++) {
    mds_load_t& load = mds_load.at(i);

    metrics[i] = {{"auth.meta_load", load.auth.meta_load()},
                  {"all.meta_load", load.all.meta_load()},
                  {"req_rate", load.req_rate},
                  {"queue_len", load.queue_len},
                  {"cpu_load_avg", load.cpu_load_avg}};
  }

  /* execute the balancer */
  Mantle mantle;
  int ret = mantle.balance(bal_code, mds->get_nodeid(), metrics, state.targets);
  dout(5) << " mantle decided that new targets=" << state.targets << dendl;

  /* mantle doesn't know about cluster size, so check target len here */
  if ((int) state.targets.size() != cluster_size)
    return -EINVAL;
  else if (ret)
    return ret;

  try_rebalance(state);
  return 0;
}
820
821
822
/**
 * Execute a balance plan: export subtrees to the target ranks in
 * state.targets, attempting up to the planned amount per target.
 *
 * Exports are chosen in three passes of decreasing preference:
 *  1. give back subtrees originally imported from that same target;
 *  2. hand off any other sufficiently large imported subtree;
 *  3. carve fragments out of the local workload via find_exports().
 * Idle imports (below mds_bal_idle_threshold) are returned to their
 * original owner up front, and targets needing less than MIN_OFFLOAD
 * are skipped. No-op when mds_thrash_exports is set (the thrasher
 * drives exports itself).
 */
void MDBalancer::try_rebalance(balance_state_t& state)
{
  if (g_conf()->mds_thrash_exports) {
    dout(5) << "mds_thrash is on; not performing standard rebalance operation!"
	    << dendl;
    return;
  }

  // make a sorted list of my imports
  multimap<double, CDir*> import_pop_map;
  multimap<mds_rank_t, pair<CDir*, double> > import_from_map;

  for (auto& dir : mds->mdcache->get_fullauth_subtrees()) {
    CInode *diri = dir->get_inode();
    if (diri->is_mdsdir())
      continue;
    if (diri->get_export_pin(false) != MDS_RANK_NONE)
      continue;
    if (dir->is_freezing() || dir->is_frozen())
      continue;  // export pbly already in progress

    mds_rank_t from = diri->authority().first;
    double pop = dir->pop_auth_subtree.meta_load();
    // idle imports go straight back to their original owner
    if (g_conf()->mds_bal_idle_threshold > 0 &&
	pop < g_conf()->mds_bal_idle_threshold &&
	diri != mds->mdcache->get_root() &&
	from != mds->get_nodeid()) {
      dout(5) << " exporting idle (" << pop << ") import " << *dir
	      << " back to mds." << from << dendl;
      mds->mdcache->migrator->export_dir_nicely(dir, from);
      continue;
    }

    dout(15) << "  map: i imported " << *dir << " from " << from << dendl;
    import_pop_map.insert(make_pair(pop, dir));
    import_from_map.insert(make_pair(from, make_pair(dir, pop)));
  }

  // do my exports!
  // per-target amount exported so far
  map<mds_rank_t, double> export_pop_map;

  for (auto &it : state.targets) {
    mds_rank_t target = it.first;
    double amount = it.second;

    if (amount < MIN_OFFLOAD)
      continue;
    if (amount * 10 * state.targets.size() < target_load)
      continue;

    dout(5) << "want to send " << amount << " to mds." << target
      //<< " .. " << (*it).second << " * " << load_fac
	    << " -> " << amount
	    << dendl;//" .. fudge is " << fudge << dendl;

    double& have = export_pop_map[target];

    mds->mdcache->show_subtrees();

    // pass 1: search imports from target
    if (import_from_map.count(target)) {
      dout(5) << " aha, looking through imports from target mds." << target << dendl;
      for (auto p = import_from_map.equal_range(target);
	   p.first != p.second; ) {
	CDir *dir = p.first->second.first;
	double pop = p.first->second.second;
	dout(5) << "considering " << *dir << " from " << (*p.first).first << dendl;
	// keep an iterator to the current entry; erased after export below
	auto plast = p.first++;

	if (dir->inode->is_base())
	  continue;
	ceph_assert(dir->inode->authority().first == target);  // cuz that's how i put it in the map, dummy

	if (pop <= amount-have) {
	  dout(5) << "reexporting " << *dir << "  pop " << pop
		  << " back to mds." << target << dendl;
	  mds->mdcache->migrator->export_dir_nicely(dir, target);
	  have += pop;
	  import_from_map.erase(plast);
	  // keep import_pop_map consistent with import_from_map
	  for (auto q = import_pop_map.equal_range(pop);
	       q.first != q.second; ) {
	    if (q.first->second == dir) {
	      import_pop_map.erase(q.first);
	      break;
	    }
	    q.first++;
	  }
	} else {
	  dout(5) << "can't reexport " << *dir << ", too big " << pop << dendl;
	}
	if (amount-have < MIN_OFFLOAD)
	  break;
      }
    }
  }

  // pass 2: any other imports
  for (auto &it : state.targets) {
    mds_rank_t target = it.first;
    double amount = it.second;

    if (!export_pop_map.count(target))
      continue;
    double& have = export_pop_map[target];
    if (amount-have < MIN_OFFLOAD)
      continue;

    for (auto p = import_pop_map.begin();
	 p != import_pop_map.end(); ) {
      CDir *dir = p->second;
      if (dir->inode->is_base()) {
	++p;
	continue;
      }

      double pop = p->first;
      if (pop <= amount-have && pop > MIN_REEXPORT) {
	dout(0) << "reexporting " << *dir << "  pop " << pop
		<< " to mds." << target << dendl;
	have += pop;
	mds->mdcache->migrator->export_dir_nicely(dir, target);
	import_pop_map.erase(p++);
      } else {
	++p;
      }
      if (amount-have < MIN_OFFLOAD)
	break;
    }
  }

  // pass 3: carve up my own workload
  set<CDir*> already_exporting;

  for (auto &it : state.targets) {
    mds_rank_t target = it.first;
    double amount = it.second;

    if (!export_pop_map.count(target))
      continue;
    double& have = export_pop_map[target];
    if (amount-have < MIN_OFFLOAD)
      continue;

    // okay, search for fragments of my workload
    list<CDir*> exports;

    // walk imports from most to least popular
    for (auto p = import_pop_map.rbegin();
	 p != import_pop_map.rend();
	 ++p) {
      CDir *dir = p->second;
      find_exports(dir, amount, exports, have, already_exporting);
      if (amount-have < MIN_OFFLOAD)
	break;
    }
    //fudge = amount - have;

    for (auto dir : exports) {
      dout(5) << "   - exporting " << dir->pop_auth_subtree
	      << " " << dir->pop_auth_subtree.meta_load()
	      << " to mds." << target << " " << *dir << dendl;
      mds->mdcache->migrator->export_dir_nicely(dir, target);
    }
  }

  dout(5) << "rebalance done" << dendl;
  mds->mdcache->show_subtrees();
}
989
// Recursively pick dirfrags under 'dir' to export until 'have' approaches
// 'amount' of metadata load.
//   dir               - auth dirfrag to search under
//   amount            - total load we want to offload to the target
//   exports           - out: dirfrags chosen for export
//   have              - in/out: load gathered so far (across recursive calls)
//   already_exporting - dirfrags already chosen, so we never pick one twice
//
// Selection strategy: scan immediate child dirfrags, take a single frag in
// the (needmin, needmax) sweet spot if one exists; otherwise collect
// mid-sized frags, then descend into over-sized unreplicated frags, then
// take even smaller frags, and only as a last resort descend into
// replicated frags.
void MDBalancer::find_exports(CDir *dir,
                              double amount,
                              list<CDir*>& exports,
                              double& have,
                              set<CDir*>& already_exporting)
{
  // Hard wall-clock budget for the whole rebalance pass: if we have been
  // running for more than 100ms, pretend we found enough load so every
  // caller up the recursion stops descending.
  auto now = clock::now();
  auto duration = std::chrono::duration<double>(now-rebalance_time).count();
  if (duration > 0.1) {
    derr << " balancer runs too long" << dendl_impl;
    have = amount;
    return;
  }

  ceph_assert(dir->is_auth());

  double need = amount - have;
  if (need < amount * g_conf()->mds_bal_min_start)
    return; // good enough!

  // Thresholds derived from the remaining need:
  //  (needmin, needmax) - acceptance window for a single "lucky" frag
  //  midchunk           - minimum size worth taking in the first sweep
  //  minchunk           - below this a frag is considered idle/ignored
  double needmax = need * g_conf()->mds_bal_need_max;
  double needmin = need * g_conf()->mds_bal_need_min;
  double midchunk = need * g_conf()->mds_bal_midchunk;
  double minchunk = need * g_conf()->mds_bal_minchunk;

  list<CDir*> bigger_rep, bigger_unrep;
  multimap<double, CDir*> smaller;  // frags with pop <= need, keyed by pop

  double dir_pop = dir->pop_auth_subtree.meta_load();
  dout(7) << " find_exports in " << dir_pop << " " << *dir << " need " << need << " (" << needmin << " - " << needmax << ")" << dendl;

  // Bucket each non-idle child dirfrag by popularity.
  double subdir_sum = 0;
  for (elist<CInode*>::iterator it = dir->pop_lru_subdirs.begin_use_current();
       !it.end(); ) {
    CInode *in = *it;
    ++it;  // advance before possibly unlinking 'in' from the lru below

    ceph_assert(in->is_dir());
    ceph_assert(in->get_parent_dir() == dir);

    list<CDir*> dfls;
    in->get_nested_dirfrags(dfls);

    size_t num_idle_frags = 0;
    for (list<CDir*>::iterator p = dfls.begin();
         p != dfls.end();
         ++p) {
      CDir *subdir = *p;
      if (already_exporting.count(subdir))
        continue;

      // we know all ancestor dirfrags up to subtree root are not freezing or frozen.
      // It's more efficient to use CDir::is_{freezing,frozen}_tree_root()
      if (subdir->is_frozen_dir() || subdir->is_frozen_tree_root() ||
          subdir->is_freezing_dir() || subdir->is_freezing_tree_root())
        continue; // can't export this right now!

      // how popular?
      double pop = subdir->pop_auth_subtree.meta_load();
      subdir_sum += pop;
      dout(15) << " subdir pop " << pop << " " << *subdir << dendl;

      if (pop < minchunk) {
        num_idle_frags++;
        continue;
      }

      // lucky find?
      if (pop > needmin && pop < needmax) {
        exports.push_back(subdir);
        already_exporting.insert(subdir);
        have += pop;
        return;
      }

      if (pop > need) {
        if (subdir->is_rep())
          bigger_rep.push_back(subdir);
        else
          bigger_unrep.push_back(subdir);
      } else
        smaller.insert(pair<double,CDir*>(pop, subdir));
    }
    // If every frag of this inode was idle, drop it from the popularity lru
    // so future passes skip it.
    if (dfls.size() == num_idle_frags)
      in->item_pop_lru.remove_myself();
  }
  dout(15) << " sum " << subdir_sum << " / " << dir_pop << dendl;

  // grab some sufficiently big small items
  // NOTE: 'it' is intentionally declared outside this loop; the "use smaller
  // bits" loop below resumes from wherever this one breaks.
  multimap<double,CDir*>::reverse_iterator it;
  for (it = smaller.rbegin();
       it != smaller.rend();
       ++it) {

    if ((*it).first < midchunk)
      break; // try later

    dout(7) << " taking smaller " << *(*it).second << dendl;

    exports.push_back((*it).second);
    already_exporting.insert((*it).second);
    have += (*it).first;
    if (have > needmin)
      return;
  }

  // apparently not enough; drill deeper into the hierarchy (if non-replicated)
  for (list<CDir*>::iterator it = bigger_unrep.begin();
       it != bigger_unrep.end();
       ++it) {
    dout(15) << " descending into " << **it << dendl;
    find_exports(*it, amount, exports, have, already_exporting);
    if (have > needmin)
      return;
  }

  // ok fine, use smaller bits
  // (continues from the outer 'it' left by the first smaller-sweep above)
  for (;
       it != smaller.rend();
       ++it) {
    dout(7) << " taking (much) smaller " << it->first << " " << *(*it).second << dendl;

    exports.push_back((*it).second);
    already_exporting.insert((*it).second);
    have += (*it).first;
    if (have > needmin)
      return;
  }

  // ok fine, drill into replicated dirs
  for (list<CDir*>::iterator it = bigger_rep.begin();
       it != bigger_rep.end();
       ++it) {
    dout(7) << " descending into replicated " << **it << dendl;
    find_exports(*it, amount, exports, have, already_exporting);
    if (have > needmin)
      return;
  }
}
1129
1130 void MDBalancer::hit_inode(CInode *in, int type, int who)
1131 {
1132 // hit inode
1133 in->pop.get(type).hit();
1134
1135 if (in->get_parent_dn())
1136 hit_dir(in->get_parent_dn()->get_dir(), type, who);
1137 }
1138
1139 void MDBalancer::maybe_fragment(CDir *dir, bool hot)
1140 {
1141 // split/merge
1142 if (bal_fragment_dirs && bal_fragment_interval > 0 &&
1143 dir->is_auth() &&
1144 !dir->inode->is_base() && // not root/mdsdir (for now at least)
1145 !dir->inode->is_stray()) { // not straydir
1146
1147 // split
1148 if (g_conf()->mds_bal_split_size > 0 && (dir->should_split() || hot)) {
1149 if (split_pending.count(dir->dirfrag()) == 0) {
1150 queue_split(dir, false);
1151 } else {
1152 if (dir->should_split_fast()) {
1153 queue_split(dir, true);
1154 } else {
1155 dout(10) << __func__ << ": fragment already enqueued to split: "
1156 << *dir << dendl;
1157 }
1158 }
1159 }
1160
1161 // merge?
1162 if (dir->get_frag() != frag_t() && dir->should_merge() &&
1163 merge_pending.count(dir->dirfrag()) == 0) {
1164 queue_merge(dir);
1165 }
1166 }
1167 }
1168
// Record 'amount' popularity of 'type' on dirfrag 'dir', possibly trigger
// split/merge and replication decisions, then propagate the hit up through
// all ancestor dirfrags' nested/auth-subtree counters.
//   who - client/mds identity used for read-spread tracking (>=0), or -1.
void MDBalancer::hit_dir(CDir *dir, int type, int who, double amount)
{
  // hit me
  double v = dir->pop_me.get(type).hit(amount);

  // frag is "hot" if its own read or write rate crosses the split threshold
  const bool hot = (v > g_conf()->mds_bal_split_rd && type == META_POP_IRD) ||
  (v > g_conf()->mds_bal_split_wr && type == META_POP_IWR);

  dout(20) << "hit_dir " << type << " pop is " << v << ", frag " << dir->get_frag()
  << " size " << dir->get_frag_size() << " " << dir->pop_me << dendl;

  maybe_fragment(dir, hot);

  // replicate?
  if (type == META_POP_IRD && who >= 0) {
    dir->pop_spread.hit(who);
  }

  // rd_adj is the read-popularity correction applied below if we decide to
  // replicate this frag (its reads will now be spread over all ranks).
  double rd_adj = 0.0;
  if (type == META_POP_IRD &&
      dir->last_popularity_sample < last_sample) {
    double dir_pop = dir->pop_auth_subtree.get(type).get(); // hmm??
    dir->last_popularity_sample = last_sample;
    double pop_sp = dir->pop_spread.get();
    // weight spread heavily: many distinct readers make replication attractive
    dir_pop += pop_sp * 10;

    //if (dir->ino() == inodeno_t(0x10000000002))
    if (pop_sp > 0) {
      dout(20) << "hit_dir " << type << " pop " << dir_pop << " spread " << pop_sp
      << " " << dir->pop_spread.last[0]
      << " " << dir->pop_spread.last[1]
      << " " << dir->pop_spread.last[2]
      << " " << dir->pop_spread.last[3]
      << " in " << *dir << dendl;
    }

    if (dir->is_auth() && !dir->is_ambiguous_auth()) {
      if (!dir->is_rep() &&
	  dir_pop >= g_conf()->mds_bal_replicate_threshold) {
	// replicate
	// estimate the local read load after replicas absorb their share
	double rdp = dir->pop_me.get(META_POP_IRD).get();
	rd_adj = rdp / mds->get_mds_map()->get_num_in_mds() - rdp;
	rd_adj /= 2.0;  // temper somewhat

	dout(5) << "replicating dir " << *dir << " pop " << dir_pop << " .. rdp " << rdp << " adj " << rd_adj << dendl;

	dir->dir_rep = CDir::REP_ALL;
	mds->mdcache->send_dir_updates(dir, true);

	// fixme this should adjust the whole pop hierarchy
	dir->pop_me.get(META_POP_IRD).adjust(rd_adj);
	dir->pop_auth_subtree.get(META_POP_IRD).adjust(rd_adj);
      }

      if (dir->ino() != 1 &&
	  dir->is_rep() &&
	  dir_pop < g_conf()->mds_bal_unreplicate_threshold) {
	// unreplicate
	dout(5) << "unreplicating dir " << *dir << " pop " << dir_pop << dendl;

	dir->dir_rep = CDir::REP_NONE;
	mds->mdcache->send_dir_updates(dir);
      }
    }
  }

  // adjust ancestors
  bool hit_subtree = dir->is_auth();         // current auth subtree (if any)
  bool hit_subtree_nested = dir->is_auth();  // all nested auth subtrees

  // Walk from 'dir' to the root, updating the nested counters everywhere,
  // the auth-subtree counters only up to the enclosing subtree root, and
  // the nested-auth-subtree counters as long as we started on an auth frag.
  while (true) {
    CDir *pdir = dir->inode->get_parent_dir();
    dir->pop_nested.get(type).hit(amount);
    if (rd_adj != 0.0)
      dir->pop_nested.get(META_POP_IRD).adjust(rd_adj);

    if (hit_subtree) {
      dir->pop_auth_subtree.get(type).hit(amount);

      if (rd_adj != 0.0)
	dir->pop_auth_subtree.get(META_POP_IRD).adjust(rd_adj);

      if (dir->is_subtree_root())
	hit_subtree = false; // end of auth domain, stop hitting auth counters.
      else if (pdir)
	pdir->pop_lru_subdirs.push_front(&dir->get_inode()->item_pop_lru);
    }

    if (hit_subtree_nested) {
      dir->pop_auth_subtree_nested.get(type).hit(amount);
      if (rd_adj != 0.0)
	dir->pop_auth_subtree_nested.get(META_POP_IRD).adjust(rd_adj);
    }
    if (!pdir) break;
    dir = pdir;
  }
}
1266
1267
/*
 * subtract off an exported chunk.
 * this excludes *dir itself (encode_export_dir should have taken care of that)
 * we _just_ do the parents' nested counters.
 *
 * NOTE: call me _after_ forcing *dir into a subtree root,
 * but _before_ doing the encode_export_dirs.
 */
1276 void MDBalancer::subtract_export(CDir *dir)
1277 {
1278 dirfrag_load_vec_t subload = dir->pop_auth_subtree;
1279
1280 while (true) {
1281 dir = dir->inode->get_parent_dir();
1282 if (!dir) break;
1283
1284 dir->pop_nested.sub(subload);
1285 dir->pop_auth_subtree_nested.sub(subload);
1286 }
1287 }
1288
1289
1290 void MDBalancer::add_import(CDir *dir)
1291 {
1292 dirfrag_load_vec_t subload = dir->pop_auth_subtree;
1293
1294 while (true) {
1295 dir = dir->inode->get_parent_dir();
1296 if (!dir) break;
1297
1298 dir->pop_nested.add(subload);
1299 dir->pop_auth_subtree_nested.add(subload);
1300 }
1301 }
1302
// Fold (inc=true) or remove (inc=false) the popularity of renamed dirfrag
// 'dir' into/out of 'pdir' and all of pdir's ancestors.
// NOTE(review): presumably pdir is the new (inc) or old (!inc) parent
// dirfrag of the renamed directory — confirm against callers.
void MDBalancer::adjust_pop_for_rename(CDir *pdir, CDir *dir, bool inc)
{
  // nested-auth counters are touched only if 'dir' is auth here;
  // plain auth-subtree counters additionally stop at the subtree root.
  bool adjust_subtree_nest = dir->is_auth();
  bool adjust_subtree = adjust_subtree_nest && !dir->is_subtree_root();
  CDir *cur = dir;  // child whose inode gets (re)linked into the pop lru
  while (true) {
    if (inc) {
      pdir->pop_nested.add(dir->pop_nested);
      if (adjust_subtree) {
	pdir->pop_auth_subtree.add(dir->pop_auth_subtree);
	pdir->pop_lru_subdirs.push_front(&cur->get_inode()->item_pop_lru);
      }

      if (adjust_subtree_nest)
	pdir->pop_auth_subtree_nested.add(dir->pop_auth_subtree_nested);
    } else {
      pdir->pop_nested.sub(dir->pop_nested);
      if (adjust_subtree)
	pdir->pop_auth_subtree.sub(dir->pop_auth_subtree);

      if (adjust_subtree_nest)
	pdir->pop_auth_subtree_nested.sub(dir->pop_auth_subtree_nested);
    }

    // crossing a subtree root ends the auth-subtree adjustment; note the
    // flag flips AFTER the root itself has been adjusted.
    if (pdir->is_subtree_root())
      adjust_subtree = false;
    cur = pdir;
    pdir = pdir->inode->get_parent_dir();
    if (!pdir) break;
  }
}
1334
1335 void MDBalancer::handle_mds_failure(mds_rank_t who)
1336 {
1337 if (0 == who) {
1338 mds_last_epoch_under_map.clear();
1339 }
1340 }
1341
1342 int MDBalancer::dump_loads(Formatter *f)
1343 {
1344 list<CDir*> dfs;
1345 if (mds->mdcache->get_root()) {
1346 mds->mdcache->get_root()->get_dirfrags(dfs);
1347 } else {
1348 dout(5) << "dump_load no root" << dendl;
1349 }
1350
1351 f->open_object_section("loads");
1352
1353 f->open_array_section("dirfrags");
1354 while (!dfs.empty()) {
1355 CDir *dir = dfs.front();
1356 dfs.pop_front();
1357
1358 f->open_object_section("dir");
1359 dir->dump_load(f);
1360 f->close_section();
1361
1362 for (auto it = dir->begin(); it != dir->end(); ++it) {
1363 CInode *in = it->second->get_linkage()->get_inode();
1364 if (!in || !in->is_dir())
1365 continue;
1366
1367 list<CDir*> ls;
1368 in->get_dirfrags(ls);
1369 for (auto subdir : ls) {
1370 if (subdir->pop_nested.meta_load() < .001)
1371 continue;
1372 dfs.push_back(subdir);
1373 }
1374 }
1375 }
1376 f->close_section(); // dirfrags array
1377
1378 f->open_object_section("mds_load");
1379 {
1380
1381 auto dump_mds_load = [f](mds_load_t& load) {
1382 f->dump_float("request_rate", load.req_rate);
1383 f->dump_float("cache_hit_rate", load.cache_hit_rate);
1384 f->dump_float("queue_length", load.queue_len);
1385 f->dump_float("cpu_load", load.cpu_load_avg);
1386 f->dump_float("mds_load", load.mds_load());
1387
1388 f->open_object_section("auth_dirfrags");
1389 load.auth.dump(f);
1390 f->close_section();
1391 f->open_object_section("all_dirfrags");
1392 load.all.dump(f);
1393 f->close_section();
1394 };
1395
1396 for (auto p : mds_load) {
1397 stringstream name;
1398 name << "mds." << p.first;
1399 f->open_object_section(name.str().c_str());
1400 dump_mds_load(p.second);
1401 f->close_section();
1402 }
1403 }
1404 f->close_section(); // mds_load
1405
1406 f->open_object_section("mds_meta_load");
1407 for (auto p : mds_meta_load) {
1408 stringstream name;
1409 name << "mds." << p.first;
1410 f->dump_float(name.str().c_str(), p.second);
1411 }
1412 f->close_section(); // mds_meta_load
1413
1414 f->open_object_section("mds_import_map");
1415 for (auto p : mds_import_map) {
1416 stringstream name1;
1417 name1 << "mds." << p.first;
1418 f->open_array_section(name1.str().c_str());
1419 for (auto q : p.second) {
1420 f->open_object_section("from");
1421 stringstream name2;
1422 name2 << "mds." << q.first;
1423 f->dump_float(name2.str().c_str(), q.second);
1424 f->close_section();
1425 }
1426 f->close_section(); // mds.? array
1427 }
1428 f->close_section(); // mds_import_map
1429
1430 f->close_section(); // loads
1431 return 0;
1432 }