// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
// vim: ts=8 sw=2 smarttab
/*
 * Ceph - scalable distributed file system
 *
 * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
 *
 * This is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License version 2.1, as published by the Free Software
 * Foundation. See file COPYING.
 *
 */

#include "include/compat.h"
#include "mdstypes.h"

#include "MDBalancer.h"
#include "MDSRank.h"
#include "mon/MonClient.h"
#include "MDSMap.h"
#include "CInode.h"
#include "CDir.h"
#include "MDCache.h"
#include "Migrator.h"
#include "Mantle.h"

#include "include/Context.h"
#include "msg/Messenger.h"
#include "messages/MHeartbeat.h"

#include <fstream>
#include <iostream>
#include <vector>
#include <map>
using std::map;
using std::vector;

#include "common/config.h"
#include "common/errno.h"

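// Note on logging: the dout() macro is redefined below so that balancer
// messages are gathered under the mds_balancer debug subsystem when its
// level admits them, and fall back to the plain mds subsystem otherwise.
// In practice this lets "debug mds balancer" be raised independently of
// "debug mds".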
#define dout_context g_ceph_context
#define dout_subsys ceph_subsys_mds
#undef dout_prefix
#define dout_prefix *_dout << "mds." << mds->get_nodeid() << ".bal "
#undef dout
#define dout(lvl) \
  do {\
    auto subsys = ceph_subsys_mds;\
    if ((dout_context)->_conf->subsys.should_gather(ceph_subsys_mds_balancer, lvl)) {\
      subsys = ceph_subsys_mds_balancer;\
    }\
    dout_impl(dout_context, subsys, lvl) dout_prefix
#undef dendl
#define dendl dendl_impl; } while (0)


#define MIN_LOAD 50 // ??
#define MIN_REEXPORT 5 // will automatically reexport
#define MIN_OFFLOAD 10 // point at which i stop trying, close enough


/* This function DOES put the passed message before returning */
int MDBalancer::proc_message(Message *m)
{
  switch (m->get_type()) {

  case MSG_MDS_HEARTBEAT:
    handle_heartbeat(static_cast<MHeartbeat*>(m));
    break;

  default:
    derr << " balancer unknown message " << m->get_type() << dendl_impl;
    assert(0 == "balancer unknown message");
  }

  return 0;
}

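// Export pins let an administrator pin a directory tree to a particular
// rank (the ceph.dir.pin vxattr). handle_export_pins() drains the queue of
// inodes whose pin changed: dirfrags pinned elsewhere are exported to the
// target rank, dirfrags pinned to this rank become auxiliary subtrees, and
// a cleared pin unwinds the auxiliary-subtree state. Frozen or freezing
// dirfrags stay queued and are retried on a later tick.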
void MDBalancer::handle_export_pins(void)
{
  auto &q = mds->mdcache->export_pin_queue;
  auto it = q.begin();
  dout(20) << "export_pin_queue size=" << q.size() << dendl;
  while (it != q.end()) {
    auto cur = it++;
    CInode *in = *cur;
    assert(in->is_dir());
    mds_rank_t export_pin = in->get_export_pin(false);

    bool remove = true;
    list<CDir*> dfls;
    in->get_dirfrags(dfls);
    for (auto dir : dfls) {
      if (!dir->is_auth())
        continue;

      if (export_pin == MDS_RANK_NONE) {
        if (dir->state_test(CDir::STATE_AUXSUBTREE)) {
          if (dir->is_frozen() || dir->is_freezing()) {
            // try again later
            remove = false;
            continue;
          }
          dout(10) << " clear auxsubtree on " << *dir << dendl;
          dir->state_clear(CDir::STATE_AUXSUBTREE);
          mds->mdcache->try_subtree_merge(dir);
        }
      } else if (export_pin == mds->get_nodeid()) {
        if (dir->state_test(CDir::STATE_CREATING) ||
            dir->is_frozen() || dir->is_freezing()) {
          // try again later
          remove = false;
          continue;
        }
        if (!dir->is_subtree_root()) {
          dir->state_set(CDir::STATE_AUXSUBTREE);
          mds->mdcache->adjust_subtree_auth(dir, mds->get_nodeid());
          dout(10) << " create aux subtree on " << *dir << dendl;
        } else if (!dir->state_test(CDir::STATE_AUXSUBTREE)) {
          dout(10) << " set auxsubtree bit on " << *dir << dendl;
          dir->state_set(CDir::STATE_AUXSUBTREE);
        }
      } else {
        mds->mdcache->migrator->export_dir(dir, export_pin);
        remove = false;
      }
    }

    if (remove) {
      in->state_clear(CInode::STATE_QUEUEDEXPORTPIN);
      q.erase(cur);
    }
  }

  set<CDir *> authsubs;
  mds->mdcache->get_auth_subtrees(authsubs);
  for (auto &cd : authsubs) {
    mds_rank_t export_pin = cd->inode->get_export_pin();
    dout(10) << "auth tree " << *cd << " export_pin=" << export_pin << dendl;
    if (export_pin >= 0 && export_pin != mds->get_nodeid()) {
      dout(10) << "exporting auth subtree " << *cd->inode << " to " << export_pin << dendl;
      mds->mdcache->migrator->export_dir(cd, export_pin);
    }
  }
}

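// tick() drives the periodic work: export-pin processing on every tick,
// and, from rank 0 only, a cluster heartbeat every mds_bal_interval
// seconds. mds_bal_max / mds_bal_max_until can cap balancing to a number
// of rounds or a time window.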
void MDBalancer::tick()
{
  static int num_bal_times = g_conf->mds_bal_max;
  static utime_t first = ceph_clock_now();
  utime_t now = ceph_clock_now();
  utime_t elapsed = now;
  elapsed -= first;

  if (g_conf->mds_bal_export_pin) {
    handle_export_pins();
  }

  // sample?
  if ((double)now - (double)last_sample > g_conf->mds_bal_sample_interval) {
    dout(15) << "tick last_sample now " << now << dendl;
    last_sample = now;
  }

  // balance?
  if (mds->get_nodeid() == 0 &&
      g_conf->mds_bal_interval > 0 &&
      (num_bal_times ||
       (g_conf->mds_bal_max_until >= 0 &&
        elapsed.sec() > g_conf->mds_bal_max_until)) &&
      mds->is_active() &&
      now.sec() - last_heartbeat.sec() >= g_conf->mds_bal_interval) {
    last_heartbeat = now;
    send_heartbeat();
    num_bal_times--;
  }
}



class C_Bal_SendHeartbeat : public MDSInternalContext {
public:
  explicit C_Bal_SendHeartbeat(MDSRank *mds_) : MDSInternalContext(mds_) { }
  void finish(int f) override {
    mds->balancer->send_heartbeat();
  }
};

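// mds_load() collapses a rank's load vector into a single comparable
// scalar, selected by mds_bal_mode:
//   mode 0 (default): 0.8 * auth.meta_load() + 0.2 * all.meta_load()
//                     + req_rate + 10 * queue_len
//   mode 1:           req_rate + 10 * queue_len
//   mode 2:           cpu_load_avg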
double mds_load_t::mds_load()
{
  switch(g_conf->mds_bal_mode) {
  case 0:
    return
      .8 * auth.meta_load() +
      .2 * all.meta_load() +
      req_rate +
      10.0 * queue_len;

  case 1:
    return req_rate + 10.0*queue_len;

  case 2:
    return cpu_load_avg;

  }
  ceph_abort();
  return 0;
}

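// get_load() samples this rank's current load: metadata popularity summed
// over the root dirfrags, a request rate computed from the request-counter
// delta over wall-clock time (the previous rate is reused if less than a
// second has elapsed), the messenger dispatch queue length, and the
// host's 1-minute load average from /proc/loadavg.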
mds_load_t MDBalancer::get_load(utime_t now)
{
  mds_load_t load(now);

  if (mds->mdcache->get_root()) {
    list<CDir*> ls;
    mds->mdcache->get_root()->get_dirfrags(ls);
    for (list<CDir*>::iterator p = ls.begin();
         p != ls.end();
         ++p) {
      load.auth.add(now, mds->mdcache->decayrate, (*p)->pop_auth_subtree_nested);
      load.all.add(now, mds->mdcache->decayrate, (*p)->pop_nested);
    }
  } else {
    dout(20) << "get_load no root, no load" << dendl;
  }

  uint64_t num_requests = mds->get_num_requests();
  bool new_req_rate = false;
  if (last_get_load != utime_t() &&
      now > last_get_load &&
      num_requests >= last_num_requests) {
    utime_t el = now;
    el -= last_get_load;
    if (el.sec() >= 1) {
      load.req_rate = (num_requests - last_num_requests) / (double)el;
      new_req_rate = true;
    }
  }
  if (!new_req_rate) {
    auto p = mds_load.find(mds->get_nodeid());
    if (p != mds_load.end())
      load.req_rate = p->second.req_rate;
  }
  last_get_load = now;
  last_num_requests = num_requests;

  load.queue_len = messenger->get_dispatch_queue_len();

  ifstream cpu(PROCPREFIX "/proc/loadavg");
  if (cpu.is_open())
    cpu >> load.cpu_load_avg;
  else
    derr << "input file '" PROCPREFIX "/proc/loadavg' not found" << dendl_impl;

  dout(15) << "get_load " << load << dendl;
  return load;
}

/*
 * Read synchronously from RADOS using a timeout. We cannot do daemon-local
 * fallbacks (i.e. kick off async read when we are processing the map and
 * check status when we get here) with the way the mds is structured.
 */
int MDBalancer::localize_balancer()
{
  /* reset everything */
  bool ack = false;
  int r = 0;
  bufferlist lua_src;
  Mutex lock("lock");
  Cond cond;

  /* we assume that balancer is in the metadata pool */
  object_t oid = object_t(mds->mdsmap->get_balancer());
  object_locator_t oloc(mds->mdsmap->get_metadata_pool());
  ceph_tid_t tid = mds->objecter->read(oid, oloc, 0, 0, CEPH_NOSNAP, &lua_src, 0,
                                       new C_SafeCond(&lock, &cond, &ack, &r));
  dout(15) << "launched non-blocking read tid=" << tid
           << " oid=" << oid << " oloc=" << oloc << dendl;

  /* timeout: if we waste half our time waiting for RADOS, then abort! */
  double t = ceph_clock_now() + g_conf->mds_bal_interval/2;
  utime_t timeout;
  timeout.set_from_double(t);
  lock.Lock();
  int ret_t = cond.WaitUntil(lock, timeout);
  lock.Unlock();

  /* success: store the balancer in memory and set the version. */
  if (!r) {
    if (ret_t == ETIMEDOUT) {
      mds->objecter->op_cancel(tid, -ECANCELED);
      return -ETIMEDOUT;
    }
    bal_code.assign(lua_src.to_str());
    bal_version.assign(oid.name);
    dout(10) << "localized balancer, bal_code=" << bal_code << dendl;
  }
  return r;
}

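// Heartbeat protocol, in short: rank 0 opens a new epoch and broadcasts
// its load; every other rank answers rank 0's beat with its own load plus
// an import map describing how much metadata it serves on each other
// rank's behalf. Once a rank has load entries for the whole cluster it
// can run a rebalance round (see handle_heartbeat below).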
void MDBalancer::send_heartbeat()
{
  utime_t now = ceph_clock_now();

  if (mds->is_cluster_degraded()) {
    dout(10) << "send_heartbeat degraded" << dendl;
    return;
  }

  if (!mds->mdcache->is_open()) {
    dout(5) << "not open" << dendl;
    mds->mdcache->wait_for_open(new C_Bal_SendHeartbeat(mds));
    return;
  }

  if (mds->get_nodeid() == 0) {
    beat_epoch++;
    mds_load.clear();
  }

  // my load
  mds_load_t load = get_load(now);
  mds->logger->set(l_mds_load_cent, 100 * load.mds_load());
  mds->logger->set(l_mds_dispatch_queue_len, load.queue_len);

  mds_load[mds->get_nodeid()] = load;

  // import_map -- how much do i import from whom
  map<mds_rank_t, float> import_map;
  set<CDir*> authsubs;
  mds->mdcache->get_auth_subtrees(authsubs);
  for (set<CDir*>::iterator it = authsubs.begin();
       it != authsubs.end();
       ++it) {
    CDir *im = *it;
    mds_rank_t from = im->inode->authority().first;
    if (from == mds->get_nodeid()) continue;
    if (im->get_inode()->is_stray()) continue;
    import_map[from] += im->pop_auth_subtree.meta_load(now, mds->mdcache->decayrate);
  }
  mds_import_map[ mds->get_nodeid() ] = import_map;


  dout(5) << "mds." << mds->get_nodeid() << " epoch " << beat_epoch << " load " << load << dendl;
  for (map<mds_rank_t, float>::iterator it = import_map.begin();
       it != import_map.end();
       ++it) {
    dout(5) << " import_map from " << it->first << " -> " << it->second << dendl;
  }


  set<mds_rank_t> up;
  mds->get_mds_map()->get_up_mds_set(up);
  for (set<mds_rank_t>::iterator p = up.begin(); p != up.end(); ++p) {
    if (*p == mds->get_nodeid())
      continue;
    MHeartbeat *hb = new MHeartbeat(load, beat_epoch);
    hb->get_import_map() = import_map;
    messenger->send_message(hb,
                            mds->mdsmap->get_inst(*p));
  }
}

/* This function DOES put the passed message before returning */
void MDBalancer::handle_heartbeat(MHeartbeat *m)
{
  mds_rank_t who = mds_rank_t(m->get_source().num());
  dout(25) << "=== got heartbeat " << m->get_beat() << " from " << m->get_source().num() << " " << m->get_load() << dendl;

  if (!mds->is_active())
    goto out;

  if (!mds->mdcache->is_open()) {
    dout(10) << "opening root on handle_heartbeat" << dendl;
    mds->mdcache->wait_for_open(new C_MDS_RetryMessage(mds, m));
    return;
  }

  if (mds->is_cluster_degraded()) {
    dout(10) << " degraded, ignoring" << dendl;
    goto out;
  }

  if (mds->get_nodeid() != 0 && m->get_beat() > beat_epoch) {
    dout(10) << "receive next epoch " << m->get_beat() << " from mds." << who << " before mds0" << dendl;

    beat_epoch = m->get_beat();
    // clear the mds load info whose epoch is less than beat_epoch
    mds_load.clear();
  }

  if (who == 0) {
    dout(20) << " from mds0, new epoch " << m->get_beat() << dendl;
    if (beat_epoch != m->get_beat()) {
      beat_epoch = m->get_beat();
      mds_load.clear();
    }

    send_heartbeat();

    mds->mdcache->show_subtrees();
  } else if (mds->get_nodeid() == 0) {
    if (beat_epoch != m->get_beat()) {
      dout(10) << " old heartbeat epoch, ignoring" << dendl;
      goto out;
    }
  }

  mds_load[who] = m->get_load();
  mds_import_map[who] = m->get_import_map();

  {
    unsigned cluster_size = mds->get_mds_map()->get_num_in_mds();
    if (mds_load.size() == cluster_size) {
      // let's go!
      //export_empties(); // no!

      /* avoid spamming ceph -w if user does not turn mantle on */
      if (mds->mdsmap->get_balancer() != "") {
        int r = mantle_prep_rebalance();
        if (!r) goto out;
        mds->clog->warn() << "using old balancer; mantle failed for "
                          << "balancer=" << mds->mdsmap->get_balancer()
                          << " : " << cpp_strerror(r);
      }
      prep_rebalance(m->get_beat());
    }
  }

  // done
 out:
  m->put();
}

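// try_match() moves up to min(maxex, maxim) load units from exporter `ex`
// to importer `im`, decrementing both budgets in place. Only when the
// local rank is the exporter does the transfer land in state.targets,
// which try_rebalance() later turns into actual subtree exports.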
double MDBalancer::try_match(balance_state_t& state, mds_rank_t ex, double& maxex,
                             mds_rank_t im, double& maxim)
{
  if (maxex <= 0 || maxim <= 0) return 0.0;

  double howmuch = MIN(maxex, maxim);
  if (howmuch <= 0) return 0.0;

  dout(5) << " - mds." << ex << " exports " << howmuch << " to mds." << im << dendl;

  if (ex == mds->get_nodeid())
    state.targets[im] += howmuch;

  state.exported[ex] += howmuch;
  state.imported[im] += howmuch;

  maxex -= howmuch;
  maxim -= howmuch;

  return howmuch;
}

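// Dirfrag splits are asynchronous. queue_split() registers a callback
// keyed on the dirfrag: the normal path arms a timer for
// mds_bal_fragment_interval seconds so a burst of ops can drain before the
// frag freezes, while fast=true additionally queues the callback on the
// MDSRank waiter list so it runs as soon as the current dispatch finishes.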
void MDBalancer::queue_split(const CDir *dir, bool fast)
{
  dout(10) << __func__ << " enqueuing " << *dir
           << " (fast=" << fast << ")" << dendl;

  assert(mds->mdsmap->allows_dirfrags());
  const dirfrag_t frag = dir->dirfrag();

  auto callback = [this, frag](int r) {
    if (split_pending.erase(frag) == 0) {
      // Someone beat me to it. This can happen in the fast splitting
      // path, because we spawn two contexts, one with mds->timer and
      // one with mds->queue_waiter. The loser can safely just drop
      // out.
      return;
    }

    CDir *split_dir = mds->mdcache->get_dirfrag(frag);
    if (!split_dir) {
      dout(10) << "drop split on " << frag << " because not in cache" << dendl;
      return;
    }
    if (!split_dir->is_auth()) {
      dout(10) << "drop split on " << frag << " because non-auth" << dendl;
      return;
    }

    // Pass on to MDCache: note that the split might still not
    // happen if the checks in MDCache::can_fragment fail.
    dout(10) << __func__ << " splitting " << *split_dir << dendl;
    mds->mdcache->split_dir(split_dir, g_conf->mds_bal_split_bits);
  };

  bool is_new = false;
  if (split_pending.count(frag) == 0) {
    split_pending.insert(frag);
    is_new = true;
  }

  if (fast) {
    // Do the split ASAP: enqueue it in the MDSRank waiters which are
    // run at the end of dispatching the current request
    mds->queue_waiter(new MDSInternalContextWrapper(mds,
                      new FunctionContext(callback)));
  } else if (is_new) {
    // Set a timer to really do the split: we don't do it immediately
    // so that bursts of ops on a directory have a chance to go through
    // before we freeze it.
    mds->timer.add_event_after(g_conf->mds_bal_fragment_interval,
                               new FunctionContext(callback));
  }
}

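// queue_merge() is the inverse of queue_split(): after the same fragment
// interval its callback walks up from the fragment toward the root frag,
// stopping as soon as any sibling under the candidate ancestor is missing
// from cache, non-auth, or not small enough to merge, then merges down to
// the deepest ancestor on which all siblings agreed.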
void MDBalancer::queue_merge(CDir *dir)
{
  const auto frag = dir->dirfrag();
  auto callback = [this, frag](int r) {
    assert(frag.frag != frag_t());

    // frag must be in this set because only one context is in flight
    // for a given frag at a time (because merge_pending is checked before
    // starting one), and this context is the only one that erases it.
    merge_pending.erase(frag);

    CDir *dir = mds->mdcache->get_dirfrag(frag);
    if (!dir) {
      dout(10) << "drop merge on " << frag << " because not in cache" << dendl;
      return;
    }
    assert(dir->dirfrag() == frag);

    if (!dir->is_auth()) {
      dout(10) << "drop merge on " << *dir << " because lost auth" << dendl;
      return;
    }

    dout(10) << "merging " << *dir << dendl;

    CInode *diri = dir->get_inode();

    frag_t fg = dir->get_frag();
    while (fg != frag_t()) {
      frag_t sibfg = fg.get_sibling();
      list<CDir*> sibs;
      bool complete = diri->get_dirfrags_under(sibfg, sibs);
      if (!complete) {
        dout(10) << " not all sibs under " << sibfg << " in cache (have " << sibs << ")" << dendl;
        break;
      }
      bool all = true;
      for (list<CDir*>::iterator p = sibs.begin(); p != sibs.end(); ++p) {
        CDir *sib = *p;
        if (!sib->is_auth() || !sib->should_merge()) {
          all = false;
          break;
        }
      }
      if (!all) {
        dout(10) << " not all sibs under " << sibfg << " " << sibs << " should_merge" << dendl;
        break;
      }
      dout(10) << " all sibs under " << sibfg << " " << sibs << " should merge" << dendl;
      fg = fg.parent();
    }

    if (fg != dir->get_frag())
      mds->mdcache->merge_dir(diri, fg);
  };

  if (merge_pending.count(frag) == 0) {
    dout(20) << __func__ << " enqueued dir " << *dir << dendl;
    merge_pending.insert(frag);
    mds->timer.add_event_after(g_conf->mds_bal_fragment_interval,
                               new FunctionContext(callback));
  } else {
    dout(20) << __func__ << " dir already in queue " << *dir << dendl;
  }
}

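// prep_rebalance() decides who should ship load to whom. A rough sketch of
// the math, using the names below:
//   load_fac    = auth.meta_load() / mds_load()   (rescale into meta_load units)
//   target_load = sum_i(load_i * load_fac) / cluster_size
// A rank only exports if it has been above
// target_load * (1 + mds_bal_min_rebalance) for at least two consecutive
// epochs; ranks below that line count as importers. Exporters are then
// matched against importers, preferring ranks they already import from so
// load can simply be handed back.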
void MDBalancer::prep_rebalance(int beat)
{
  balance_state_t state;

  if (g_conf->mds_thrash_exports) {
    //we're going to randomly export to all the mds in the cluster
    set<mds_rank_t> up_mds;
    mds->get_mds_map()->get_up_mds_set(up_mds);
    for (const auto &rank : up_mds) {
      state.targets[rank] = 0.0;
    }
  } else {
    int cluster_size = mds->get_mds_map()->get_num_in_mds();
    mds_rank_t whoami = mds->get_nodeid();
    rebalance_time = ceph_clock_now();

    dout(5) << " prep_rebalance: cluster loads are" << dendl;

    mds->mdcache->migrator->clear_export_queue();

    // rescale! turn my mds_load back into meta_load units
    double load_fac = 1.0;
    map<mds_rank_t, mds_load_t>::iterator m = mds_load.find(whoami);
    if ((m != mds_load.end()) && (m->second.mds_load() > 0)) {
      double metald = m->second.auth.meta_load(rebalance_time, mds->mdcache->decayrate);
      double mdsld = m->second.mds_load();
      load_fac = metald / mdsld;
      dout(7) << " load_fac is " << load_fac
              << " <- " << m->second.auth << " " << metald
              << " / " << mdsld
              << dendl;
    }

    mds_meta_load.clear();

    double total_load = 0.0;
    multimap<double,mds_rank_t> load_map;
    for (mds_rank_t i=mds_rank_t(0); i < mds_rank_t(cluster_size); i++) {
      mds_load_t& load = mds_load.at(i);

      double l = load.mds_load() * load_fac;
      mds_meta_load[i] = l;

      if (whoami == 0)
        dout(5) << " mds." << i
                << " " << load
                << " = " << load.mds_load()
                << " ~ " << l << dendl;

      if (whoami == i) my_load = l;
      total_load += l;

      load_map.insert(pair<double,mds_rank_t>( l, i ));
    }

    // target load
    target_load = total_load / (double)cluster_size;
    dout(5) << "prep_rebalance: my load " << my_load
            << " target " << target_load
            << " total " << total_load
            << dendl;

    // under or over?
    for (auto p : load_map) {
      if (p.first < target_load * (1.0 + g_conf->mds_bal_min_rebalance)) {
        dout(5) << " mds." << p.second << " is underloaded or barely overloaded." << dendl;
        mds_last_epoch_under_map[p.second] = beat_epoch;
      }
    }

    int last_epoch_under = mds_last_epoch_under_map[whoami];
    if (last_epoch_under == beat_epoch) {
      dout(5) << " i am underloaded or barely overloaded, doing nothing." << dendl;
      return;
    }
    // am i over long enough?
    if (last_epoch_under && beat_epoch - last_epoch_under < 2) {
      dout(5) << " i am overloaded, but only for " << (beat_epoch - last_epoch_under) << " epochs" << dendl;
      return;
    }

    dout(5) << " i am sufficiently overloaded" << dendl;


    // first separate exporters and importers
    multimap<double,mds_rank_t> importers;
    multimap<double,mds_rank_t> exporters;
    set<mds_rank_t> importer_set;
    set<mds_rank_t> exporter_set;

    for (multimap<double,mds_rank_t>::iterator it = load_map.begin();
         it != load_map.end();
         ++it) {
      if (it->first < target_load) {
        dout(15) << " mds." << it->second << " is importer" << dendl;
        importers.insert(pair<double,mds_rank_t>(it->first,it->second));
        importer_set.insert(it->second);
      } else {
        int mds_last_epoch_under = mds_last_epoch_under_map[it->second];
        if (!(mds_last_epoch_under && beat_epoch - mds_last_epoch_under < 2)) {
          dout(15) << " mds." << it->second << " is exporter" << dendl;
          exporters.insert(pair<double,mds_rank_t>(it->first,it->second));
          exporter_set.insert(it->second);
        }
      }
    }


    // determine load transfer mapping

    if (true) {
      // analyze import_map; do any matches i can

      dout(15) << " matching exporters to import sources" << dendl;

      // big -> small exporters
      for (multimap<double,mds_rank_t>::reverse_iterator ex = exporters.rbegin();
           ex != exporters.rend();
           ++ex) {
        double maxex = get_maxex(state, ex->second);
        if (maxex <= .001) continue;

        // check importers. for now, just in arbitrary order (no intelligent matching).
        for (map<mds_rank_t, float>::iterator im = mds_import_map[ex->second].begin();
             im != mds_import_map[ex->second].end();
             ++im) {
          double maxim = get_maxim(state, im->first);
          if (maxim <= .001) continue;
          try_match(state, ex->second, maxex, im->first, maxim);
          if (maxex <= .001) break;
        }
      }
    }

    // old way
    if (beat % 2 == 1) {
      dout(15) << " matching big exporters to big importers" << dendl;
      // big exporters to big importers
      multimap<double,mds_rank_t>::reverse_iterator ex = exporters.rbegin();
      multimap<double,mds_rank_t>::iterator im = importers.begin();
      while (ex != exporters.rend() &&
             im != importers.end()) {
        double maxex = get_maxex(state, ex->second);
        double maxim = get_maxim(state, im->second);
        if (maxex < .001 || maxim < .001) break;
        try_match(state, ex->second, maxex, im->second, maxim);
        if (maxex <= .001) ++ex;
        if (maxim <= .001) ++im;
      }
    } else { // new way
      dout(15) << " matching small exporters to big importers" << dendl;
      // small exporters to big importers
      multimap<double,mds_rank_t>::iterator ex = exporters.begin();
      multimap<double,mds_rank_t>::iterator im = importers.begin();
      while (ex != exporters.end() &&
             im != importers.end()) {
        double maxex = get_maxex(state, ex->second);
        double maxim = get_maxim(state, im->second);
        if (maxex < .001 || maxim < .001) break;
        try_match(state, ex->second, maxex, im->second, maxim);
        if (maxex <= .001) ++ex;
        if (maxim <= .001) ++im;
      }
    }
  }
  try_rebalance(state);
}

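// Mantle is the programmable balancer: the Lua policy named in the MDSMap
// is fetched from the metadata pool (localize_balancer above) and handed a
// per-rank table of raw metrics; the script fills in the per-rank export
// targets. On any failure the caller (handle_heartbeat) falls back to the
// built-in prep_rebalance().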
int MDBalancer::mantle_prep_rebalance()
{
  balance_state_t state;

  /* refresh balancer if it has changed */
  if (bal_version != mds->mdsmap->get_balancer()) {
    bal_version.assign("");
    int r = localize_balancer();
    if (r) return r;

    /* only spam the cluster log from 1 mds on version changes */
    if (mds->get_nodeid() == 0)
      mds->clog->info() << "mantle balancer version changed: " << bal_version;
  }

  /* prepare for balancing */
  int cluster_size = mds->get_mds_map()->get_num_in_mds();
  rebalance_time = ceph_clock_now();
  mds->mdcache->migrator->clear_export_queue();

  /* fill in the metrics for each mds by grabbing load struct */
  vector < map<string, double> > metrics (cluster_size);
  for (mds_rank_t i=mds_rank_t(0); i < mds_rank_t(cluster_size); i++) {
    mds_load_t& load = mds_load.at(i);

    metrics[i] = {{"auth.meta_load", load.auth.meta_load()},
                  {"all.meta_load", load.all.meta_load()},
                  {"req_rate", load.req_rate},
                  {"queue_len", load.queue_len},
                  {"cpu_load_avg", load.cpu_load_avg}};
  }

  /* execute the balancer */
  Mantle mantle;
  int ret = mantle.balance(bal_code, mds->get_nodeid(), metrics, state.targets);
  dout(2) << " mantle decided that new targets=" << state.targets << dendl;

  /* mantle doesn't know about cluster size, so check target len here */
  if ((int) state.targets.size() != cluster_size)
    return -EINVAL;
  else if (ret)
    return ret;

  try_rebalance(state);
  return 0;
}



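// try_rebalance() converts the per-target budgets into concrete subtree
// exports, in three passes of decreasing preference: (1) hand whole
// subtrees back to the rank they were imported from, (2) re-export other
// imported subtrees that fit the remaining budget, and (3) call
// find_exports() to carve suitable fragments out of the local workload.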
void MDBalancer::try_rebalance(balance_state_t& state)
{
  if (g_conf->mds_thrash_exports) {
    dout(5) << "mds_thrash is on; not performing standard rebalance operation!"
            << dendl;
    return;
  }

  // make a sorted list of my imports
  multimap<double, CDir*> import_pop_map;
  multimap<mds_rank_t, pair<CDir*, double> > import_from_map;
  set<CDir*> fullauthsubs;

  mds->mdcache->get_fullauth_subtrees(fullauthsubs);
  for (auto dir : fullauthsubs) {
    CInode *diri = dir->get_inode();
    if (diri->is_mdsdir())
      continue;
    if (diri->get_export_pin(false) != MDS_RANK_NONE)
      continue;
    if (dir->is_freezing() || dir->is_frozen())
      continue; // export probably already in progress

    mds_rank_t from = diri->authority().first;
    double pop = dir->pop_auth_subtree.meta_load(rebalance_time, mds->mdcache->decayrate);
    if (g_conf->mds_bal_idle_threshold > 0 &&
        pop < g_conf->mds_bal_idle_threshold &&
        diri != mds->mdcache->get_root() &&
        from != mds->get_nodeid()) {
      dout(5) << " exporting idle (" << pop << ") import " << *dir
              << " back to mds." << from << dendl;
      mds->mdcache->migrator->export_dir_nicely(dir, from);
      continue;
    }

    dout(15) << " map: i imported " << *dir << " from " << from << dendl;
    import_pop_map.insert(make_pair(pop, dir));
    import_from_map.insert(make_pair(from, make_pair(dir, pop)));
  }

  // do my exports!
  map<mds_rank_t, double> export_pop_map;

  for (auto &it : state.targets) {
    mds_rank_t target = it.first;
    double amount = it.second;

    if (amount / target_load < .2)
      continue;
    if (amount < MIN_OFFLOAD)
      continue;

    dout(5) << "want to send " << amount << " to mds." << target
      //<< " .. " << (*it).second << " * " << load_fac
            << " -> " << amount
            << dendl;//" .. fudge is " << fudge << dendl;

    double& have = export_pop_map[target];

    mds->mdcache->show_subtrees();

    // search imports from target
    if (import_from_map.count(target)) {
      dout(5) << " aha, looking through imports from target mds." << target << dendl;
      for (auto p = import_from_map.equal_range(target);
           p.first != p.second; ) {
        CDir *dir = p.first->second.first;
        double pop = p.first->second.second;
        dout(5) << "considering " << *dir << " from " << (*p.first).first << dendl;
        auto plast = p.first++;

        if (dir->inode->is_base())
          continue;
        assert(dir->inode->authority().first == target); // cuz that's how i put it in the map, dummy

        if (pop <= amount-have) {
          dout(5) << "reexporting " << *dir << " pop " << pop
                  << " back to mds." << target << dendl;
          mds->mdcache->migrator->export_dir_nicely(dir, target);
          have += pop;
          import_from_map.erase(plast);
          for (auto q = import_pop_map.equal_range(pop);
               q.first != q.second; ) {
            if (q.first->second == dir) {
              import_pop_map.erase(q.first);
              break;
            }
            q.first++;
          }
        } else {
          dout(5) << "can't reexport " << *dir << ", too big " << pop << dendl;
        }
        if (amount-have < MIN_OFFLOAD)
          break;
      }
    }
  }

  // any other imports
  for (auto &it : state.targets) {
    mds_rank_t target = it.first;
    double amount = it.second;

    if (!export_pop_map.count(target))
      continue;
    double& have = export_pop_map[target];
    if (amount-have < MIN_OFFLOAD)
      continue;

    for (auto p = import_pop_map.begin();
         p != import_pop_map.end(); ) {
      CDir *dir = p->second;
      if (dir->inode->is_base()) {
        ++p;
        continue;
      }

      double pop = p->first;
      if (pop <= amount-have && pop > MIN_REEXPORT) {
        dout(0) << "reexporting " << *dir << " pop " << pop
                << " to mds." << target << dendl;
        have += pop;
        mds->mdcache->migrator->export_dir_nicely(dir, target);
        import_pop_map.erase(p++);
      } else {
        ++p;
      }
      if (amount-have < MIN_OFFLOAD)
        break;
    }
  }

  set<CDir*> already_exporting;

  for (auto &it : state.targets) {
    mds_rank_t target = it.first;
    double amount = it.second;

    if (!export_pop_map.count(target))
      continue;
    double& have = export_pop_map[target];
    if (amount-have < MIN_OFFLOAD)
      continue;

    // okay, search for fragments of my workload
    list<CDir*> exports;

    for (auto p = import_pop_map.rbegin();
         p != import_pop_map.rend();
         ++p) {
      CDir *dir = p->second;
      find_exports(dir, amount, exports, have, already_exporting);
      if (amount-have < MIN_OFFLOAD)
        break;
    }
    //fudge = amount - have;

    for (auto dir : exports) {
      dout(5) << " - exporting " << dir->pop_auth_subtree
              << " " << dir->pop_auth_subtree.meta_load(rebalance_time, mds->mdcache->decayrate)
              << " to mds." << target << " " << *dir << dendl;
      mds->mdcache->migrator->export_dir_nicely(dir, target);
    }
  }

  dout(5) << "rebalance done" << dendl;
  mds->mdcache->show_subtrees();
}

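// find_exports() recursively carves exportable dirfrags out of a subtree.
// With need = amount - have, candidate children are bucketed by
// popularity:
//   needmax  = need * mds_bal_need_max     upper bound for a "lucky find"
//   needmin  = need * mds_bal_need_min     lower bound for a lucky find
//   midchunk = need * mds_bal_midchunk     smallest "big enough" piece
//   minchunk = need * mds_bal_minchunk     ignore anything smaller
// A single frag inside (needmin, needmax) is taken and the search stops;
// otherwise we greedily take pieces above midchunk, descend into bigger
// unreplicated frags, and only then settle for smaller pieces and
// replicated frags.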
void MDBalancer::find_exports(CDir *dir,
                              double amount,
                              list<CDir*>& exports,
                              double& have,
                              set<CDir*>& already_exporting)
{
  utime_t now = ceph_clock_now();
  if ((double)(now - rebalance_time) > 0.1) {
    derr << " balancer runs too long" << dendl_impl;
    have = amount;
    return;
  }

  assert(dir->is_auth());

  double need = amount - have;
  if (need < amount * g_conf->mds_bal_min_start)
    return; // good enough!

  double needmax = need * g_conf->mds_bal_need_max;
  double needmin = need * g_conf->mds_bal_need_min;
  double midchunk = need * g_conf->mds_bal_midchunk;
  double minchunk = need * g_conf->mds_bal_minchunk;

  list<CDir*> bigger_rep, bigger_unrep;
  multimap<double, CDir*> smaller;

  double dir_pop = dir->pop_auth_subtree.meta_load(rebalance_time, mds->mdcache->decayrate);
  dout(7) << " find_exports in " << dir_pop << " " << *dir << " need " << need << " (" << needmin << " - " << needmax << ")" << dendl;

  double subdir_sum = 0;
  for (elist<CInode*>::iterator it = dir->pop_lru_subdirs.begin_use_current();
       !it.end(); ) {
    CInode *in = *it;
    ++it;

    assert(in->is_dir());
    assert(in->get_parent_dir() == dir);

    list<CDir*> dfls;
    in->get_nested_dirfrags(dfls);

    size_t num_idle_frags = 0;
    for (list<CDir*>::iterator p = dfls.begin();
         p != dfls.end();
         ++p) {
      CDir *subdir = *p;
      if (already_exporting.count(subdir))
        continue;

      // we know all ancestor dirfrags up to subtree root are not freezing or frozen.
      // It's more efficient to use CDir::is_{freezing,frozen}_tree_root()
      if (subdir->is_frozen_dir() || subdir->is_frozen_tree_root() ||
          subdir->is_freezing_dir() || subdir->is_freezing_tree_root())
        continue; // can't export this right now!

      // how popular?
      double pop = subdir->pop_auth_subtree.meta_load(rebalance_time, mds->mdcache->decayrate);
      subdir_sum += pop;
      dout(15) << " subdir pop " << pop << " " << *subdir << dendl;

      if (pop < minchunk) {
        num_idle_frags++;
        continue;
      }

      // lucky find?
      if (pop > needmin && pop < needmax) {
        exports.push_back(subdir);
        already_exporting.insert(subdir);
        have += pop;
        return;
      }

      if (pop > need) {
        if (subdir->is_rep())
          bigger_rep.push_back(subdir);
        else
          bigger_unrep.push_back(subdir);
      } else
        smaller.insert(pair<double,CDir*>(pop, subdir));
    }
    if (dfls.size() == num_idle_frags)
      in->item_pop_lru.remove_myself();
  }
  dout(15) << " sum " << subdir_sum << " / " << dir_pop << dendl;

  // grab some sufficiently big small items
  multimap<double,CDir*>::reverse_iterator it;
  for (it = smaller.rbegin();
       it != smaller.rend();
       ++it) {

    if ((*it).first < midchunk)
      break; // try later

    dout(7) << " taking smaller " << *(*it).second << dendl;

    exports.push_back((*it).second);
    already_exporting.insert((*it).second);
    have += (*it).first;
    if (have > needmin)
      return;
  }

  // apparently not enough; drill deeper into the hierarchy (if non-replicated)
  for (list<CDir*>::iterator it = bigger_unrep.begin();
       it != bigger_unrep.end();
       ++it) {
    dout(15) << " descending into " << **it << dendl;
    find_exports(*it, amount, exports, have, already_exporting);
    if (have > needmin)
      return;
  }

  // ok fine, use smaller bits
  for (;
       it != smaller.rend();
       ++it) {
    dout(7) << " taking (much) smaller " << it->first << " " << *(*it).second << dendl;

    exports.push_back((*it).second);
    already_exporting.insert((*it).second);
    have += (*it).first;
    if (have > needmin)
      return;
  }

  // ok fine, drill into replicated dirs
  for (list<CDir*>::iterator it = bigger_rep.begin();
       it != bigger_rep.end();
       ++it) {
    dout(7) << " descending into replicated " << **it << dendl;
    find_exports(*it, amount, exports, have, already_exporting);
    if (have > needmin)
      return;
  }
}

void MDBalancer::hit_inode(const utime_t& now, CInode *in, int type, int who)
{
  // hit inode
  in->pop.get(type).hit(now, mds->mdcache->decayrate);

  if (in->get_parent_dn())
    hit_dir(now, in->get_parent_dn()->get_dir(), type, who);
}

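// maybe_fragment() decides whether a dirfrag should split or merge: a
// frag queues a split when should_split() says it has outgrown
// mds_bal_split_size (or when the caller flags it hot), and queues a merge
// once should_merge() says it has shrunk enough; both paths go through the
// pending sets and queue_split()/queue_merge() above.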
void MDBalancer::maybe_fragment(CDir *dir, bool hot)
{
  // split/merge
  if (g_conf->mds_bal_frag && g_conf->mds_bal_fragment_interval > 0 &&
      !dir->inode->is_base() && // not root/base (for now at least)
      dir->is_auth()) {

    // split
    if (g_conf->mds_bal_split_size > 0 &&
        mds->mdsmap->allows_dirfrags() &&
        (dir->should_split() || hot))
    {
      if (split_pending.count(dir->dirfrag()) == 0) {
        queue_split(dir, false);
      } else {
        if (dir->should_split_fast()) {
          queue_split(dir, true);
        } else {
          dout(10) << __func__ << ": fragment already enqueued to split: "
                   << *dir << dendl;
        }
      }
    }

    // merge?
    if (dir->get_frag() != frag_t() && dir->should_merge() &&
        merge_pending.count(dir->dirfrag()) == 0) {
      queue_merge(dir);
    }
  }
}

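// hit_dir() records one access on a dirfrag and propagates it up the
// ancestor chain (pop_nested everywhere, plus the auth-subtree counters
// while we remain inside our own auth domain). Hot, widely-read
// directories are replicated to other ranks once their read popularity
// crosses mds_bal_replicate_threshold, and unreplicated again when they
// cool below mds_bal_unreplicate_threshold; rd_adj bleeds read popularity
// off the counters to account for the reads the replicas will absorb.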
void MDBalancer::hit_dir(const utime_t& now, CDir *dir, int type, int who, double amount)
{
  // hit me
  double v = dir->pop_me.get(type).hit(now, mds->mdcache->decayrate, amount);

  const bool hot = (v > g_conf->mds_bal_split_rd && type == META_POP_IRD) ||
                   (v > g_conf->mds_bal_split_wr && type == META_POP_IWR);

  dout(20) << "hit_dir " << type << " pop is " << v << ", frag " << dir->get_frag()
           << " size " << dir->get_frag_size() << dendl;

  maybe_fragment(dir, hot);

  // replicate?
  if (type == META_POP_IRD && who >= 0) {
    dir->pop_spread.hit(now, mds->mdcache->decayrate, who);
  }

  double rd_adj = 0.0;
  if (type == META_POP_IRD &&
      dir->last_popularity_sample < last_sample) {
    double dir_pop = dir->pop_auth_subtree.get(type).get(now, mds->mdcache->decayrate); // hmm??
    dir->last_popularity_sample = last_sample;
    double pop_sp = dir->pop_spread.get(now, mds->mdcache->decayrate);
    dir_pop += pop_sp * 10;

    //if (dir->ino() == inodeno_t(0x10000000002))
    if (pop_sp > 0) {
      dout(20) << "hit_dir " << type << " pop " << dir_pop << " spread " << pop_sp
               << " " << dir->pop_spread.last[0]
               << " " << dir->pop_spread.last[1]
               << " " << dir->pop_spread.last[2]
               << " " << dir->pop_spread.last[3]
               << " in " << *dir << dendl;
    }

    if (dir->is_auth() && !dir->is_ambiguous_auth()) {
      if (!dir->is_rep() &&
          dir_pop >= g_conf->mds_bal_replicate_threshold) {
        // replicate
        double rdp = dir->pop_me.get(META_POP_IRD).get(now, mds->mdcache->decayrate);
        rd_adj = rdp / mds->get_mds_map()->get_num_in_mds() - rdp;
        rd_adj /= 2.0; // temper somewhat

        dout(5) << "replicating dir " << *dir << " pop " << dir_pop << " .. rdp " << rdp << " adj " << rd_adj << dendl;

        dir->dir_rep = CDir::REP_ALL;
        mds->mdcache->send_dir_updates(dir, true);

        // fixme this should adjust the whole pop hierarchy
        dir->pop_me.get(META_POP_IRD).adjust(rd_adj);
        dir->pop_auth_subtree.get(META_POP_IRD).adjust(rd_adj);
      }

      if (dir->ino() != 1 &&
          dir->is_rep() &&
          dir_pop < g_conf->mds_bal_unreplicate_threshold) {
        // unreplicate
        dout(5) << "unreplicating dir " << *dir << " pop " << dir_pop << dendl;

        dir->dir_rep = CDir::REP_NONE;
        mds->mdcache->send_dir_updates(dir);
      }
    }
  }

  // adjust ancestors
  bool hit_subtree = dir->is_auth();        // current auth subtree (if any)
  bool hit_subtree_nested = dir->is_auth(); // all nested auth subtrees

  while (true) {
    CDir *pdir = dir->inode->get_parent_dir();
    dir->pop_nested.get(type).hit(now, mds->mdcache->decayrate, amount);
    if (rd_adj != 0.0)
      dir->pop_nested.get(META_POP_IRD).adjust(now, mds->mdcache->decayrate, rd_adj);

    if (hit_subtree) {
      dir->pop_auth_subtree.get(type).hit(now, mds->mdcache->decayrate, amount);

      if (rd_adj != 0.0)
        dir->pop_auth_subtree.get(META_POP_IRD).adjust(now, mds->mdcache->decayrate, rd_adj);

      if (dir->is_subtree_root())
        hit_subtree = false; // end of auth domain, stop hitting auth counters.
      else if (pdir)
        pdir->pop_lru_subdirs.push_front(&dir->get_inode()->item_pop_lru);
    }

    if (hit_subtree_nested) {
      dir->pop_auth_subtree_nested.get(type).hit(now, mds->mdcache->decayrate, amount);
      if (rd_adj != 0.0)
        dir->pop_auth_subtree_nested.get(META_POP_IRD).adjust(now, mds->mdcache->decayrate, rd_adj);
    }
    if (!pdir) break;
    dir = pdir;
  }
}


/*
 * subtract off an exported chunk.
 * this excludes *dir itself (encode_export_dir should have taken care of that)
 * we _just_ do the parents' nested counters.
 *
 * NOTE: call me _after_ forcing *dir into a subtree root,
 * but _before_ doing the encode_export_dirs.
 */
void MDBalancer::subtract_export(CDir *dir, utime_t now)
{
  dirfrag_load_vec_t subload = dir->pop_auth_subtree;

  while (true) {
    dir = dir->inode->get_parent_dir();
    if (!dir) break;

    dir->pop_nested.sub(now, mds->mdcache->decayrate, subload);
    dir->pop_auth_subtree_nested.sub(now, mds->mdcache->decayrate, subload);
  }
}


void MDBalancer::add_import(CDir *dir, utime_t now)
{
  dirfrag_load_vec_t subload = dir->pop_auth_subtree;

  while (true) {
    dir = dir->inode->get_parent_dir();
    if (!dir) break;

    dir->pop_nested.add(now, mds->mdcache->decayrate, subload);
    dir->pop_auth_subtree_nested.add(now, mds->mdcache->decayrate, subload);
  }
}

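// When a directory moves in the hierarchy its popularity must move with
// it: adjust_pop_for_rename() walks from the parent dirfrag up to the
// root, adding (inc=true) or subtracting (inc=false) the renamed dir's
// nested and auth-subtree counters at every ancestor.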
void MDBalancer::adjust_pop_for_rename(CDir *pdir, CDir *dir, utime_t now, bool inc)
{
  DecayRate& rate = mds->mdcache->decayrate;

  bool adjust_subtree_nest = dir->is_auth();
  bool adjust_subtree = adjust_subtree_nest && !dir->is_subtree_root();
  CDir *cur = dir;
  while (true) {
    if (inc) {
      pdir->pop_nested.add(now, rate, dir->pop_nested);
      if (adjust_subtree) {
        pdir->pop_auth_subtree.add(now, rate, dir->pop_auth_subtree);
        pdir->pop_lru_subdirs.push_front(&cur->get_inode()->item_pop_lru);
      }

      if (adjust_subtree_nest)
        pdir->pop_auth_subtree_nested.add(now, rate, dir->pop_auth_subtree_nested);
    } else {
      pdir->pop_nested.sub(now, rate, dir->pop_nested);
      if (adjust_subtree)
        pdir->pop_auth_subtree.sub(now, rate, dir->pop_auth_subtree);

      if (adjust_subtree_nest)
        pdir->pop_auth_subtree_nested.sub(now, rate, dir->pop_auth_subtree_nested);
    }

    if (pdir->is_subtree_root())
      adjust_subtree = false;
    cur = pdir;
    pdir = pdir->inode->get_parent_dir();
    if (!pdir) break;
  }
}

void MDBalancer::handle_mds_failure(mds_rank_t who)
{
  if (0 == who) {
    mds_last_epoch_under_map.clear();
  }
}

int MDBalancer::dump_loads(Formatter *f)
{
  utime_t now = ceph_clock_now();
  DecayRate& decayrate = mds->mdcache->decayrate;

  list<CDir*> dfs;
  if (mds->mdcache->get_root()) {
    mds->mdcache->get_root()->get_dirfrags(dfs);
  } else {
    dout(5) << "dump_load no root" << dendl;
  }

  f->open_object_section("loads");

  f->open_array_section("dirfrags");
  while (!dfs.empty()) {
    CDir *dir = dfs.front();
    dfs.pop_front();

    if (f) {
      f->open_object_section("dir");
      dir->dump_load(f, now, decayrate);
      f->close_section();
    }

    for (auto it = dir->begin(); it != dir->end(); ++it) {
      CInode *in = it->second->get_linkage()->get_inode();
      if (!in || !in->is_dir())
        continue;

      list<CDir*> ls;
      in->get_dirfrags(ls);
      for (auto subdir : ls) {
        if (subdir->pop_nested.meta_load() < .001)
          continue;
        dfs.push_back(subdir);
      }
    }
  }
  f->close_section();  // dirfrags array

  f->open_object_section("mds_load");
  {

    auto dump_mds_load = [f, now](mds_load_t& load) {
      f->dump_float("request_rate", load.req_rate);
      f->dump_float("cache_hit_rate", load.cache_hit_rate);
      f->dump_float("queue_length", load.queue_len);
      f->dump_float("cpu_load", load.cpu_load_avg);
      f->dump_float("mds_load", load.mds_load());

      DecayRate rate; // no decay
      f->open_object_section("auth_dirfrags");
      load.auth.dump(f, now, rate);
      f->close_section();
      f->open_object_section("all_dirfrags");
      load.all.dump(f, now, rate);
      f->close_section();
    };

    for (auto p : mds_load) {
      stringstream name;
      name << "mds." << p.first;
      f->open_object_section(name.str().c_str());
      dump_mds_load(p.second);
      f->close_section();
    }
  }
  f->close_section();  // mds_load

  f->open_object_section("mds_meta_load");
  for (auto p : mds_meta_load) {
    stringstream name;
    name << "mds." << p.first;
    f->dump_float(name.str().c_str(), p.second);
  }
  f->close_section();  // mds_meta_load

  f->open_object_section("mds_import_map");
  for (auto p : mds_import_map) {
    stringstream name1;
    name1 << "mds." << p.first;
    f->open_array_section(name1.str().c_str());
    for (auto q : p.second) {
      f->open_object_section("from");
      stringstream name2;
      name2 << "mds." << q.first;
      f->dump_float(name2.str().c_str(), q.second);
      f->close_section();
    }
    f->close_section();  // mds.? array
  }
  f->close_section();  // mds_import_map

  f->close_section();  // loads
  return 0;
}