// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
// vim: ts=8 sw=2 smarttab
/*
 * Ceph - scalable distributed file system
 *
 * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
 *
 * This is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License version 2.1, as published by the Free Software
 * Foundation.  See file COPYING.
 *
 */

#include "include/compat.h"
#include "mdstypes.h"

#include "mon/MonClient.h"
#include "MDBalancer.h"
#include "MDSRank.h"
#include "MDSMap.h"
#include "CInode.h"
#include "CDir.h"
#include "MDCache.h"
#include "Migrator.h"
#include "Mantle.h"

#include "include/Context.h"
#include "msg/Messenger.h"
#include "messages/MHeartbeat.h"

#include <fstream>
#include <iostream>
#include <iterator>  // std::istream_iterator, used when parsing /proc/self/stat
#include <vector>
#include <map>
using std::map;
using std::vector;

#include "common/config.h"
#include "common/errno.h"

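/*
 * The custom dout() below routes balancer messages through
 * ceph_subsys_mds_balancer whenever that subsystem is configured to gather
 * at the requested level, and falls back to ceph_subsys_mds otherwise.
 * This lets "debug mds balancer" be raised independently of "debug mds".
 */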
#define dout_context g_ceph_context
#define dout_subsys ceph_subsys_mds
#undef dout_prefix
#define dout_prefix *_dout << "mds." << mds->get_nodeid() << ".bal "
#undef dout
#define dout(lvl) \
  do {\
    auto subsys = ceph_subsys_mds;\
    if ((dout_context)->_conf->subsys.should_gather(ceph_subsys_mds_balancer, lvl)) {\
      subsys = ceph_subsys_mds_balancer;\
    }\
    dout_impl(dout_context, subsys, lvl) dout_prefix
#undef dendl
#define dendl dendl_impl; } while (0)


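/*
 * Balancer thresholds, in the same (decayed) meta_load units produced by
 * dirfrag_load_vec_t::meta_load(). MIN_REEXPORT bounds how large an idle
 * import may be and still be handed straight back to its auth rank;
 * MIN_OFFLOAD is the point at which a remaining transfer amount is
 * considered close enough to done. MIN_LOAD is not referenced below
 * (hence the "??").
 */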
#define MIN_LOAD     50   //  ??
#define MIN_REEXPORT  5   // will automatically reexport
#define MIN_OFFLOAD  10   // point at which i stop trying, close enough


/* This function DOES put the passed message before returning */
int MDBalancer::proc_message(Message *m)
{
  switch (m->get_type()) {

  case MSG_MDS_HEARTBEAT:
    handle_heartbeat(static_cast<MHeartbeat*>(m));
    break;

  default:
    derr << " balancer unknown message " << m->get_type() << dendl_impl;
    assert(0 == "balancer unknown message");
  }

  return 0;
}

MDBalancer::MDBalancer(MDSRank *m, Messenger *msgr, MonClient *monc) :
    mds(m), messenger(msgr), mon_client(monc)
{
  bal_fragment_interval = g_conf->get_val<int64_t>("mds_bal_fragment_interval");
}

void MDBalancer::handle_conf_change(const struct md_config_t *conf,
                                    const std::set<std::string> &changed,
                                    const MDSMap &mds_map)
{
  if (changed.count("mds_bal_fragment_interval"))
    bal_fragment_interval = g_conf->get_val<int64_t>("mds_bal_fragment_interval");
}

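/*
 * Walk the export pin queue and reconcile each queued inode's dirfrags with
 * its (possibly changed) export pin: clear aux subtree state when the pin is
 * gone, establish an aux subtree when we are the pin target, or kick off a
 * migration when another rank is. Entries whose dirfrags are frozen or
 * freezing stay queued and are retried on a later tick.
 */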
void MDBalancer::handle_export_pins(void)
{
  auto &q = mds->mdcache->export_pin_queue;
  auto it = q.begin();
  dout(20) << "export_pin_queue size=" << q.size() << dendl;
  while (it != q.end()) {
    auto cur = it++;
    CInode *in = *cur;
    assert(in->is_dir());
    mds_rank_t export_pin = in->get_export_pin(false);

    bool remove = true;
    list<CDir*> dfls;
    in->get_dirfrags(dfls);
    for (auto dir : dfls) {
      if (!dir->is_auth())
        continue;

      if (export_pin == MDS_RANK_NONE) {
        if (dir->state_test(CDir::STATE_AUXSUBTREE)) {
          if (dir->is_frozen() || dir->is_freezing()) {
            // try again later
            remove = false;
            continue;
          }
          dout(10) << " clear auxsubtree on " << *dir << dendl;
          dir->state_clear(CDir::STATE_AUXSUBTREE);
          mds->mdcache->try_subtree_merge(dir);
        }
      } else if (export_pin == mds->get_nodeid()) {
        if (dir->state_test(CDir::STATE_CREATING) ||
            dir->is_frozen() || dir->is_freezing()) {
          // try again later
          remove = false;
          continue;
        }
        if (!dir->is_subtree_root()) {
          dir->state_set(CDir::STATE_AUXSUBTREE);
          mds->mdcache->adjust_subtree_auth(dir, mds->get_nodeid());
          dout(10) << " create aux subtree on " << *dir << dendl;
        } else if (!dir->state_test(CDir::STATE_AUXSUBTREE)) {
          dout(10) << " set auxsubtree bit on " << *dir << dendl;
          dir->state_set(CDir::STATE_AUXSUBTREE);
        }
      } else {
        mds->mdcache->migrator->export_dir(dir, export_pin);
        remove = false;
      }
    }

    if (remove) {
      in->state_clear(CInode::STATE_QUEUEDEXPORTPIN);
      q.erase(cur);
    }
  }

  set<CDir *> authsubs;
  mds->mdcache->get_auth_subtrees(authsubs);
  for (auto &cd : authsubs) {
    mds_rank_t export_pin = cd->inode->get_export_pin();
    dout(10) << "auth tree " << *cd << " export_pin=" << export_pin << dendl;
    if (export_pin >= 0 && export_pin != mds->get_nodeid()) {
      dout(10) << "exporting auth subtree " << *cd->inode << " to " << export_pin << dendl;
      mds->mdcache->migrator->export_dir(cd, export_pin);
    }
  }
}

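/*
 * Called periodically from the MDS. Only rank 0 initiates a balancing round,
 * and only while active, at most once per mds_bal_interval seconds, and only
 * while the mds_bal_max budget (or the mds_bal_max_until window) allows it.
 */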
void MDBalancer::tick()
{
  static int num_bal_times = g_conf->mds_bal_max;
  static utime_t first = ceph_clock_now();
  utime_t now = ceph_clock_now();
  auto bal_interval = g_conf->get_val<int64_t>("mds_bal_interval");
  auto bal_max_until = g_conf->get_val<int64_t>("mds_bal_max_until");

  if (g_conf->mds_bal_export_pin) {
    handle_export_pins();
  }

  // sample?
  if ((double)now - (double)last_sample > g_conf->mds_bal_sample_interval) {
    dout(15) << "tick last_sample now " << now << dendl;
    last_sample = now;
  }

  // balance?
  if (mds->get_nodeid() == 0 &&
      mds->is_active() &&
      bal_interval > 0 &&
      (now - last_heartbeat).sec() >= bal_interval &&
      (num_bal_times ||
       (bal_max_until >= 0 && (now - first).sec() > bal_max_until))) {
    last_heartbeat = now;
    send_heartbeat();
    num_bal_times--;
  }
}


class C_Bal_SendHeartbeat : public MDSInternalContext {
public:
  explicit C_Bal_SendHeartbeat(MDSRank *mds_) : MDSInternalContext(mds_) { }
  void finish(int f) override {
    mds->balancer->send_heartbeat();
  }
};

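/*
 * Scalar load for one MDS, selected by mds_bal_mode:
 *   0 (default): weighted metadata load plus request rate plus queue length
 *   1: request rate and queue length only
 *   2: CPU load average only
 * e.g. with mode 0, auth.meta_load()=100, all.meta_load()=50, req_rate=20
 * and queue_len=3 gives 0.8*100 + 0.2*50 + 20 + 10*3 = 140.
 */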
double mds_load_t::mds_load()
{
  switch(g_conf->mds_bal_mode) {
  case 0:
    return
      .8 * auth.meta_load() +
      .2 * all.meta_load() +
      req_rate +
      10.0 * queue_len;

  case 1:
    return req_rate + 10.0*queue_len;

  case 2:
    return cpu_load_avg;

  }
  ceph_abort();
  return 0;
}

mds_load_t MDBalancer::get_load(utime_t now)
{
  mds_load_t load(now);

  if (mds->mdcache->get_root()) {
    list<CDir*> ls;
    mds->mdcache->get_root()->get_dirfrags(ls);
    for (list<CDir*>::iterator p = ls.begin();
         p != ls.end();
         ++p) {
      load.auth.add(now, mds->mdcache->decayrate, (*p)->pop_auth_subtree_nested);
      load.all.add(now, mds->mdcache->decayrate, (*p)->pop_nested);
    }
  } else {
    dout(20) << "get_load no root, no load" << dendl;
  }

  uint64_t num_requests = mds->get_num_requests();

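  /*
   * Sample this process's CPU time from /proc/self/stat: fields 14 (utime)
   * and 15 (stime) are cumulative user and system jiffies, i.e. indices 13
   * and 14 once the line is split on whitespace.
   */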
  uint64_t cpu_time = 1;
  {
    string stat_path = PROCPREFIX "/proc/self/stat";
    ifstream stat_file(stat_path);
    if (stat_file.is_open()) {
      vector<string> stat_vec(std::istream_iterator<string>{stat_file},
                              std::istream_iterator<string>());
      if (stat_vec.size() >= 15) {
        // utime + stime
        cpu_time = strtoll(stat_vec[13].c_str(), nullptr, 10) +
                   strtoll(stat_vec[14].c_str(), nullptr, 10);
      } else {
        derr << "input file '" << stat_path << "' not resolvable" << dendl_impl;
      }
    } else {
      derr << "input file '" << stat_path << "' not found" << dendl_impl;
    }
  }

  load.queue_len = messenger->get_dispatch_queue_len();

  bool update_last = true;
  if (last_get_load != utime_t() &&
      now > last_get_load) {
    utime_t el = now;
    el -= last_get_load;
    if (el.sec() >= 1) {
      if (num_requests > last_num_requests)
        load.req_rate = (num_requests - last_num_requests) / (double)el;
      if (cpu_time > last_cpu_time)
        load.cpu_load_avg = (cpu_time - last_cpu_time) / (double)el;
    } else {
      auto p = mds_load.find(mds->get_nodeid());
      if (p != mds_load.end()) {
        load.req_rate = p->second.req_rate;
        load.cpu_load_avg = p->second.cpu_load_avg;
      }
      if (num_requests >= last_num_requests && cpu_time >= last_cpu_time)
        update_last = false;
    }
  }

  if (update_last) {
    last_num_requests = num_requests;
    last_cpu_time = cpu_time;
    last_get_load = now;
  }

  dout(15) << "get_load " << load << dendl;
  return load;
}

/*
 * Read synchronously from RADOS using a timeout. We cannot do daemon-local
 * fallbacks (i.e. kick off async read when we are processing the map and
 * check status when we get here) with the way the mds is structured.
 */
int MDBalancer::localize_balancer()
{
  /* reset everything */
  bool ack = false;
  int r = 0;
  bufferlist lua_src;
  Mutex lock("lock");
  Cond cond;

  /* we assume that balancer is in the metadata pool */
  object_t oid = object_t(mds->mdsmap->get_balancer());
  object_locator_t oloc(mds->mdsmap->get_metadata_pool());
  ceph_tid_t tid = mds->objecter->read(oid, oloc, 0, 0, CEPH_NOSNAP, &lua_src, 0,
                                       new C_SafeCond(&lock, &cond, &ack, &r));
  dout(15) << "launched non-blocking read tid=" << tid
           << " oid=" << oid << " oloc=" << oloc << dendl;

  /* timeout: if we waste half our time waiting for RADOS, then abort! */
  auto bal_interval = g_conf->get_val<int64_t>("mds_bal_interval");
  lock.Lock();
  int ret_t = cond.WaitInterval(lock, utime_t(bal_interval / 2, 0));
  lock.Unlock();

  /* success: store the balancer in memory and set the version. */
  if (!r) {
    if (ret_t == ETIMEDOUT) {
      mds->objecter->op_cancel(tid, -ECANCELED);
      return -ETIMEDOUT;
    }
    bal_code.assign(lua_src.to_str());
    bal_version.assign(oid.name);
    dout(10) << "localized balancer, bal_code=" << bal_code << dendl;
  }
  return r;
}

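/*
 * Broadcast this rank's load to every other up MDS. Rank 0 opens a new beat
 * epoch and clears the previous round's load map; other ranks respond to
 * rank 0's beat from handle_heartbeat(). The attached import_map records how
 * much load this rank currently imports from each other rank, which
 * prep_rebalance() later uses to prefer sending load back where it came from.
 */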
void MDBalancer::send_heartbeat()
{
  utime_t now = ceph_clock_now();

  if (mds->is_cluster_degraded()) {
    dout(10) << "send_heartbeat degraded" << dendl;
    return;
  }

  if (!mds->mdcache->is_open()) {
    dout(5) << "not open" << dendl;
    mds->mdcache->wait_for_open(new C_Bal_SendHeartbeat(mds));
    return;
  }

  if (mds->get_nodeid() == 0) {
    beat_epoch++;
    mds_load.clear();
  }

  // my load
  mds_load_t load = get_load(now);
  mds->logger->set(l_mds_load_cent, 100 * load.mds_load());
  mds->logger->set(l_mds_dispatch_queue_len, load.queue_len);

  mds_load[mds->get_nodeid()] = load;

  // import_map -- how much do i import from whom
  map<mds_rank_t, float> import_map;
  set<CDir*> authsubs;
  mds->mdcache->get_auth_subtrees(authsubs);
  for (set<CDir*>::iterator it = authsubs.begin();
       it != authsubs.end();
       ++it) {
    CDir *im = *it;
    mds_rank_t from = im->inode->authority().first;
    if (from == mds->get_nodeid()) continue;
    if (im->get_inode()->is_stray()) continue;
    import_map[from] += im->pop_auth_subtree.meta_load(now, mds->mdcache->decayrate);
  }
  mds_import_map[ mds->get_nodeid() ] = import_map;


  dout(5) << "mds." << mds->get_nodeid() << " epoch " << beat_epoch << " load " << load << dendl;
  for (map<mds_rank_t, float>::iterator it = import_map.begin();
       it != import_map.end();
       ++it) {
    dout(5) << " import_map from " << it->first << " -> " << it->second << dendl;
  }


  set<mds_rank_t> up;
  mds->get_mds_map()->get_up_mds_set(up);
  for (set<mds_rank_t>::iterator p = up.begin(); p != up.end(); ++p) {
    if (*p == mds->get_nodeid())
      continue;
    MHeartbeat *hb = new MHeartbeat(load, beat_epoch);
    hb->get_import_map() = import_map;
    messenger->send_message(hb,
                            mds->mdsmap->get_inst(*p));
  }
}

/* This function DOES put the passed message before returning */
void MDBalancer::handle_heartbeat(MHeartbeat *m)
{
  mds_rank_t who = mds_rank_t(m->get_source().num());
  dout(25) << "=== got heartbeat " << m->get_beat() << " from " << m->get_source().num() << " " << m->get_load() << dendl;

  if (!mds->is_active())
    goto out;

  if (!mds->mdcache->is_open()) {
    dout(10) << "opening root on handle_heartbeat" << dendl;
    mds->mdcache->wait_for_open(new C_MDS_RetryMessage(mds, m));
    return;
  }

  if (mds->is_cluster_degraded()) {
    dout(10) << " degraded, ignoring" << dendl;
    goto out;
  }

  if (mds->get_nodeid() != 0 && m->get_beat() > beat_epoch) {
    dout(10) << "receive next epoch " << m->get_beat() << " from mds." << who << " before mds0" << dendl;

    beat_epoch = m->get_beat();
    // clear the mds load info whose epoch is less than beat_epoch
    mds_load.clear();
  }

  if (who == 0) {
    dout(20) << " from mds0, new epoch " << m->get_beat() << dendl;
    if (beat_epoch != m->get_beat()) {
      beat_epoch = m->get_beat();
      mds_load.clear();
    }

    send_heartbeat();

    mds->mdcache->show_subtrees();
  } else if (mds->get_nodeid() == 0) {
    if (beat_epoch != m->get_beat()) {
      dout(10) << " old heartbeat epoch, ignoring" << dendl;
      goto out;
    }
  }

  mds_load[who] = m->get_load();
  mds_import_map[who] = m->get_import_map();

  {
    unsigned cluster_size = mds->get_mds_map()->get_num_in_mds();
    if (mds_load.size() == cluster_size) {
      // let's go!
      //export_empties();  // no!

      /* avoid spamming ceph -w if user does not turn mantle on */
      if (mds->mdsmap->get_balancer() != "") {
        int r = mantle_prep_rebalance();
        if (!r) goto out;
        mds->clog->warn() << "using old balancer; mantle failed for "
                          << "balancer=" << mds->mdsmap->get_balancer()
                          << " : " << cpp_strerror(r);
      }
      prep_rebalance(m->get_beat());
    }
  }

  // done
 out:
  m->put();
}

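/*
 * Core matching primitive for prep_rebalance(): transfer min(maxex, maxim)
 * load units from exporter `ex` to importer `im`, decrementing both
 * remaining capacities in place. For example, if mds.1 can still export 30
 * units and mds.2 can still absorb 20, the match moves 20 and leaves
 * maxex=10, maxim=0. Only matches where we are the exporter land in
 * state.targets, since each rank executes only its own exports.
 */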
double MDBalancer::try_match(balance_state_t& state, mds_rank_t ex, double& maxex,
                             mds_rank_t im, double& maxim)
{
  if (maxex <= 0 || maxim <= 0) return 0.0;

  double howmuch = MIN(maxex, maxim);
  if (howmuch <= 0) return 0.0;

  dout(5) << " - mds." << ex << " exports " << howmuch << " to mds." << im << dendl;

  if (ex == mds->get_nodeid())
    state.targets[im] += howmuch;

  state.exported[ex] += howmuch;
  state.imported[im] += howmuch;

  maxex -= howmuch;
  maxim -= howmuch;

  return howmuch;
}

void MDBalancer::queue_split(const CDir *dir, bool fast)
{
  dout(10) << __func__ << " enqueuing " << *dir
           << " (fast=" << fast << ")" << dendl;

  assert(mds->mdsmap->allows_dirfrags());
  const dirfrag_t frag = dir->dirfrag();

  auto callback = [this, frag](int r) {
    if (split_pending.erase(frag) == 0) {
      // Someone beat me to it.  This can happen in the fast splitting
      // path, because we spawn two contexts, one with mds->timer and
      // one with mds->queue_waiter.  The loser can safely just drop
      // out.
      return;
    }

    CDir *split_dir = mds->mdcache->get_dirfrag(frag);
    if (!split_dir) {
      dout(10) << "drop split on " << frag << " because not in cache" << dendl;
      return;
    }
    if (!split_dir->is_auth()) {
      dout(10) << "drop split on " << frag << " because non-auth" << dendl;
      return;
    }

    // Pass on to MDCache: note that the split might still not
    // happen if the checks in MDCache::can_fragment fail.
    dout(10) << __func__ << " splitting " << *split_dir << dendl;
    mds->mdcache->split_dir(split_dir, g_conf->mds_bal_split_bits);
  };

  bool is_new = false;
  if (split_pending.count(frag) == 0) {
    split_pending.insert(frag);
    is_new = true;
  }

  if (fast) {
    // Do the split ASAP: enqueue it in the MDSRank waiters which are
    // run at the end of dispatching the current request
    mds->queue_waiter(new MDSInternalContextWrapper(mds,
          new FunctionContext(callback)));
  } else if (is_new) {
    // Set a timer to really do the split: we don't do it immediately
    // so that bursts of ops on a directory have a chance to go through
    // before we freeze it.
    mds->timer.add_event_after(bal_fragment_interval,
                               new FunctionContext(callback));
  }
}

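/*
 * Schedule a merge of `dir` with its siblings after bal_fragment_interval.
 * The deferred callback walks upward from the current fragment: at each
 * level it checks that the sibling fragment's dirfrags are all in cache,
 * auth, and willing to merge, coarsening the target frag until a level
 * fails, then asks MDCache to merge to the coarsest frag that passed.
 */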
void MDBalancer::queue_merge(CDir *dir)
{
  const auto frag = dir->dirfrag();
  auto callback = [this, frag](int r) {
    assert(frag.frag != frag_t());

    // frag must be in this set because only one context is in flight
    // for a given frag at a time (because merge_pending is checked before
    // starting one), and this context is the only one that erases it.
    merge_pending.erase(frag);

    CDir *dir = mds->mdcache->get_dirfrag(frag);
    if (!dir) {
      dout(10) << "drop merge on " << frag << " because not in cache" << dendl;
      return;
    }
    assert(dir->dirfrag() == frag);

    if(!dir->is_auth()) {
      dout(10) << "drop merge on " << *dir << " because lost auth" << dendl;
      return;
    }

    dout(10) << "merging " << *dir << dendl;

    CInode *diri = dir->get_inode();

    frag_t fg = dir->get_frag();
    while (fg != frag_t()) {
      frag_t sibfg = fg.get_sibling();
      list<CDir*> sibs;
      bool complete = diri->get_dirfrags_under(sibfg, sibs);
      if (!complete) {
        dout(10) << " not all sibs under " << sibfg << " in cache (have " << sibs << ")" << dendl;
        break;
      }
      bool all = true;
      for (list<CDir*>::iterator p = sibs.begin(); p != sibs.end(); ++p) {
        CDir *sib = *p;
        if (!sib->is_auth() || !sib->should_merge()) {
          all = false;
          break;
        }
      }
      if (!all) {
        dout(10) << " not all sibs under " << sibfg << " " << sibs << " should_merge" << dendl;
        break;
      }
      dout(10) << " all sibs under " << sibfg << " " << sibs << " should merge" << dendl;
      fg = fg.parent();
    }

    if (fg != dir->get_frag())
      mds->mdcache->merge_dir(diri, fg);
  };

  if (merge_pending.count(frag) == 0) {
    dout(20) << __func__ << " enqueued dir " << *dir << dendl;
    merge_pending.insert(frag);
    mds->timer.add_event_after(bal_fragment_interval,
                               new FunctionContext(callback));
  } else {
    dout(20) << __func__ << " dir already in queue " << *dir << dendl;
  }
}

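/*
 * Default (non-mantle) rebalance: rescale every rank's mds_load back into
 * meta_load units, compute the cluster mean as the target, then split ranks
 * into importers (below target) and exporters (above it, and overloaded for
 * at least two beats). Matching happens in three passes: first pay load back
 * to the ranks we imported it from, then pair exporters with importers (big
 * with big on odd beats, small with big on even beats), and finally hand the
 * resulting per-target amounts to try_rebalance().
 */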
void MDBalancer::prep_rebalance(int beat)
{
  balance_state_t state;

  if (g_conf->mds_thrash_exports) {
    //we're going to randomly export to all the mds in the cluster
    set<mds_rank_t> up_mds;
    mds->get_mds_map()->get_up_mds_set(up_mds);
    for (const auto &rank : up_mds) {
      state.targets[rank] = 0.0;
    }
  } else {
    int cluster_size = mds->get_mds_map()->get_num_in_mds();
    mds_rank_t whoami = mds->get_nodeid();
    rebalance_time = ceph_clock_now();

    dout(5) << " prep_rebalance: cluster loads are" << dendl;

    mds->mdcache->migrator->clear_export_queue();

    // rescale!  turn my mds_load back into meta_load units
    double load_fac = 1.0;
    map<mds_rank_t, mds_load_t>::iterator m = mds_load.find(whoami);
    if ((m != mds_load.end()) && (m->second.mds_load() > 0)) {
      double metald = m->second.auth.meta_load(rebalance_time, mds->mdcache->decayrate);
      double mdsld = m->second.mds_load();
      load_fac = metald / mdsld;
      dout(7) << " load_fac is " << load_fac
              << " <- " << m->second.auth << " " << metald
              << " / " << mdsld
              << dendl;
    }

    mds_meta_load.clear();

    double total_load = 0.0;
    multimap<double,mds_rank_t> load_map;
    for (mds_rank_t i=mds_rank_t(0); i < mds_rank_t(cluster_size); i++) {
      mds_load_t& load = mds_load.at(i);

      double l = load.mds_load() * load_fac;
      mds_meta_load[i] = l;

      if (whoami == 0)
        dout(5) << " mds." << i
                << " " << load
                << " = " << load.mds_load()
                << " ~ " << l << dendl;

      if (whoami == i) my_load = l;
      total_load += l;

      load_map.insert(pair<double,mds_rank_t>( l, i ));
    }

    // target load
    target_load = total_load / (double)cluster_size;
    dout(5) << "prep_rebalance: my load " << my_load
            << " target " << target_load
            << " total " << total_load
            << dendl;

    // under or over?
    for (auto p : load_map) {
      if (p.first < target_load * (1.0 + g_conf->mds_bal_min_rebalance)) {
        dout(5) << " mds." << p.second << " is underloaded or barely overloaded." << dendl;
        mds_last_epoch_under_map[p.second] = beat_epoch;
      }
    }

    int last_epoch_under = mds_last_epoch_under_map[whoami];
    if (last_epoch_under == beat_epoch) {
      dout(5) << " i am underloaded or barely overloaded, doing nothing." << dendl;
      return;
    }
    // am i over long enough?
    if (last_epoch_under && beat_epoch - last_epoch_under < 2) {
      dout(5) << " i am overloaded, but only for " << (beat_epoch - last_epoch_under) << " epochs" << dendl;
      return;
    }

    dout(5) << " i am sufficiently overloaded" << dendl;


    // first separate exporters and importers
    multimap<double,mds_rank_t> importers;
    multimap<double,mds_rank_t> exporters;
    set<mds_rank_t>             importer_set;
    set<mds_rank_t>             exporter_set;

    for (multimap<double,mds_rank_t>::iterator it = load_map.begin();
         it != load_map.end();
         ++it) {
      if (it->first < target_load) {
        dout(15) << " mds." << it->second << " is importer" << dendl;
        importers.insert(pair<double,mds_rank_t>(it->first,it->second));
        importer_set.insert(it->second);
      } else {
        int mds_last_epoch_under = mds_last_epoch_under_map[it->second];
        if (!(mds_last_epoch_under && beat_epoch - mds_last_epoch_under < 2)) {
          dout(15) << " mds." << it->second << " is exporter" << dendl;
          exporters.insert(pair<double,mds_rank_t>(it->first,it->second));
          exporter_set.insert(it->second);
        }
      }
    }


    // determine load transfer mapping

    if (true) {
      // analyze import_map; do any matches i can

      dout(15) << " matching exporters to import sources" << dendl;

      // big -> small exporters
      for (multimap<double,mds_rank_t>::reverse_iterator ex = exporters.rbegin();
           ex != exporters.rend();
           ++ex) {
        double maxex = get_maxex(state, ex->second);
        if (maxex <= .001) continue;

        // check importers. for now, just in arbitrary order (no intelligent matching).
        for (map<mds_rank_t, float>::iterator im = mds_import_map[ex->second].begin();
             im != mds_import_map[ex->second].end();
             ++im) {
          double maxim = get_maxim(state, im->first);
          if (maxim <= .001) continue;
          try_match(state, ex->second, maxex, im->first, maxim);
          if (maxex <= .001) break;
        }
      }
    }

    // old way
    if (beat % 2 == 1) {
      dout(15) << " matching big exporters to big importers" << dendl;
      // big exporters to big importers
      multimap<double,mds_rank_t>::reverse_iterator ex = exporters.rbegin();
      multimap<double,mds_rank_t>::iterator im = importers.begin();
      while (ex != exporters.rend() &&
             im != importers.end()) {
        double maxex = get_maxex(state, ex->second);
        double maxim = get_maxim(state, im->second);
        if (maxex < .001 || maxim < .001) break;
        try_match(state, ex->second, maxex, im->second, maxim);
        if (maxex <= .001) ++ex;
        if (maxim <= .001) ++im;
      }
    } else { // new way
      dout(15) << " matching small exporters to big importers" << dendl;
      // small exporters to big importers
      multimap<double,mds_rank_t>::iterator ex = exporters.begin();
      multimap<double,mds_rank_t>::iterator im = importers.begin();
      while (ex != exporters.end() &&
             im != importers.end()) {
        double maxex = get_maxex(state, ex->second);
        double maxim = get_maxim(state, im->second);
        if (maxex < .001 || maxim < .001) break;
        try_match(state, ex->second, maxex, im->second, maxim);
        if (maxex <= .001) ++ex;
        if (maxim <= .001) ++im;
      }
    }
  }
  try_rebalance(state);
}

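/*
 * Mantle rebalance: fetch the Lua balancer named in the MDSMap (if the
 * version changed), flatten each rank's load struct into a string->double
 * metrics map, and let the embedded Lua policy fill in state.targets. Any
 * error here makes handle_heartbeat() fall back to the classic
 * prep_rebalance().
 */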
int MDBalancer::mantle_prep_rebalance()
{
  balance_state_t state;

  /* refresh balancer if it has changed */
  if (bal_version != mds->mdsmap->get_balancer()) {
    bal_version.assign("");
    int r = localize_balancer();
    if (r) return r;

    /* only spam the cluster log from 1 mds on version changes */
    if (mds->get_nodeid() == 0)
      mds->clog->info() << "mantle balancer version changed: " << bal_version;
  }

  /* prepare for balancing */
  int cluster_size = mds->get_mds_map()->get_num_in_mds();
  rebalance_time = ceph_clock_now();
  mds->mdcache->migrator->clear_export_queue();

  /* fill in the metrics for each mds by grabbing load struct */
  vector < map<string, double> > metrics (cluster_size);
  for (mds_rank_t i=mds_rank_t(0); i < mds_rank_t(cluster_size); i++) {
    mds_load_t& load = mds_load.at(i);

    metrics[i] = {{"auth.meta_load", load.auth.meta_load()},
                  {"all.meta_load", load.all.meta_load()},
                  {"req_rate", load.req_rate},
                  {"queue_len", load.queue_len},
                  {"cpu_load_avg", load.cpu_load_avg}};
  }

  /* execute the balancer */
  Mantle mantle;
  int ret = mantle.balance(bal_code, mds->get_nodeid(), metrics, state.targets);
  dout(2) << " mantle decided that new targets=" << state.targets << dendl;

  /* mantle doesn't know about cluster size, so check target len here */
  if ((int) state.targets.size() != cluster_size)
    return -EINVAL;
  else if (ret)
    return ret;

  try_rebalance(state);
  return 0;
}


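/*
 * Execute the transfer plan in state.targets. For each target rank, in
 * order of preference: (1) hand back whole subtrees we originally imported
 * from that rank, (2) reexport other sufficiently small imported subtrees,
 * and (3) call find_exports() to carve export-sized fragments out of our
 * own busiest subtrees. Idle imports are returned to their auth rank up
 * front, regardless of targets.
 */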
void MDBalancer::try_rebalance(balance_state_t& state)
{
  if (g_conf->mds_thrash_exports) {
    dout(5) << "mds_thrash is on; not performing standard rebalance operation!"
            << dendl;
    return;
  }

  // make a sorted list of my imports
  multimap<double, CDir*> import_pop_map;
  multimap<mds_rank_t, pair<CDir*, double> > import_from_map;
  set<CDir*> fullauthsubs;

  mds->mdcache->get_fullauth_subtrees(fullauthsubs);
  for (auto dir : fullauthsubs) {
    CInode *diri = dir->get_inode();
    if (diri->is_mdsdir())
      continue;
    if (diri->get_export_pin(false) != MDS_RANK_NONE)
      continue;
    if (dir->is_freezing() || dir->is_frozen())
      continue;  // export probably already in progress

    mds_rank_t from = diri->authority().first;
    double pop = dir->pop_auth_subtree.meta_load(rebalance_time, mds->mdcache->decayrate);
    if (g_conf->mds_bal_idle_threshold > 0 &&
        pop < g_conf->mds_bal_idle_threshold &&
        diri != mds->mdcache->get_root() &&
        from != mds->get_nodeid()) {
      dout(5) << " exporting idle (" << pop << ") import " << *dir
              << " back to mds." << from << dendl;
      mds->mdcache->migrator->export_dir_nicely(dir, from);
      continue;
    }

    dout(15) << " map: i imported " << *dir << " from " << from << dendl;
    import_pop_map.insert(make_pair(pop, dir));
    import_from_map.insert(make_pair(from, make_pair(dir, pop)));
  }

  // do my exports!
  map<mds_rank_t, double> export_pop_map;

  for (auto &it : state.targets) {
    mds_rank_t target = it.first;
    double amount = it.second;

    if (amount < MIN_OFFLOAD)
      continue;
    if (amount * 10 * state.targets.size() < target_load)
      continue;

    dout(5) << "want to send " << amount << " to mds." << target
      //<< " .. " << (*it).second << " * " << load_fac
            << " -> " << amount
            << dendl;//" .. fudge is " << fudge << dendl;

    double& have = export_pop_map[target];

    mds->mdcache->show_subtrees();

    // search imports from target
    if (import_from_map.count(target)) {
      dout(5) << " aha, looking through imports from target mds." << target << dendl;
      for (auto p = import_from_map.equal_range(target);
           p.first != p.second; ) {
        CDir *dir = p.first->second.first;
        double pop = p.first->second.second;
        dout(5) << "considering " << *dir << " from " << (*p.first).first << dendl;
        auto plast = p.first++;

        if (dir->inode->is_base())
          continue;
        assert(dir->inode->authority().first == target);  // cuz that's how i put it in the map, dummy

        if (pop <= amount-have) {
          dout(5) << "reexporting " << *dir << " pop " << pop
                  << " back to mds." << target << dendl;
          mds->mdcache->migrator->export_dir_nicely(dir, target);
          have += pop;
          import_from_map.erase(plast);
          for (auto q = import_pop_map.equal_range(pop);
               q.first != q.second; ) {
            if (q.first->second == dir) {
              import_pop_map.erase(q.first);
              break;
            }
            q.first++;
          }
        } else {
          dout(5) << "can't reexport " << *dir << ", too big " << pop << dendl;
        }
        if (amount-have < MIN_OFFLOAD)
          break;
      }
    }
  }

  // any other imports
  for (auto &it : state.targets) {
    mds_rank_t target = it.first;
    double amount = it.second;

    if (!export_pop_map.count(target))
      continue;
    double& have = export_pop_map[target];
    if (amount-have < MIN_OFFLOAD)
      continue;

    for (auto p = import_pop_map.begin();
         p != import_pop_map.end(); ) {
      CDir *dir = p->second;
      if (dir->inode->is_base()) {
        ++p;
        continue;
      }

      double pop = p->first;
      if (pop <= amount-have && pop > MIN_REEXPORT) {
        dout(0) << "reexporting " << *dir << " pop " << pop
                << " to mds." << target << dendl;
        have += pop;
        mds->mdcache->migrator->export_dir_nicely(dir, target);
        import_pop_map.erase(p++);
      } else {
        ++p;
      }
      if (amount-have < MIN_OFFLOAD)
        break;
    }
  }

  set<CDir*> already_exporting;

  for (auto &it : state.targets) {
    mds_rank_t target = it.first;
    double amount = it.second;

    if (!export_pop_map.count(target))
      continue;
    double& have = export_pop_map[target];
    if (amount-have < MIN_OFFLOAD)
      continue;

    // okay, search for fragments of my workload
    list<CDir*> exports;

    for (auto p = import_pop_map.rbegin();
         p != import_pop_map.rend();
         ++p) {
      CDir *dir = p->second;
      find_exports(dir, amount, exports, have, already_exporting);
      if (amount-have < MIN_OFFLOAD)
        break;
    }
    //fudge = amount - have;

    for (auto dir : exports) {
      dout(5) << " - exporting " << dir->pop_auth_subtree
              << " " << dir->pop_auth_subtree.meta_load(rebalance_time, mds->mdcache->decayrate)
              << " to mds." << target << " " << *dir << dendl;
      mds->mdcache->migrator->export_dir_nicely(dir, target);
    }
  }

  dout(5) << "rebalance done" << dendl;
  mds->mdcache->show_subtrees();
}

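/*
 * Recursively collect subdirectories of `dir` whose popularity adds up to
 * roughly the remaining need (amount - have). The chunk thresholds are all
 * fractions of need; with the shipped defaults (mds_bal_need_min 0.8,
 * mds_bal_need_max 1.2, mds_bal_midchunk 0.3, mds_bal_minchunk 0.001) and
 * need = 100, a single subdir with pop in (80, 120) is a "lucky find" taken
 * alone, subdirs above 30 are taken largest-first while they fit, anything
 * under 0.1 is ignored as idle, and bigger subdirs are descended into.
 * Those defaults may of course be overridden per deployment.
 */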
void MDBalancer::find_exports(CDir *dir,
                              double amount,
                              list<CDir*>& exports,
                              double& have,
                              set<CDir*>& already_exporting)
{
  utime_t now = ceph_clock_now();
  if ((double)(now - rebalance_time) > 0.1) {
    derr << " balancer runs too long" << dendl_impl;
    have = amount;
    return;
  }

  assert(dir->is_auth());

  double need = amount - have;
  if (need < amount * g_conf->mds_bal_min_start)
    return;  // good enough!

  double needmax = need * g_conf->mds_bal_need_max;
  double needmin = need * g_conf->mds_bal_need_min;
  double midchunk = need * g_conf->mds_bal_midchunk;
  double minchunk = need * g_conf->mds_bal_minchunk;

  list<CDir*> bigger_rep, bigger_unrep;
  multimap<double, CDir*> smaller;

  double dir_pop = dir->pop_auth_subtree.meta_load(rebalance_time, mds->mdcache->decayrate);
  dout(7) << " find_exports in " << dir_pop << " " << *dir << " need " << need << " (" << needmin << " - " << needmax << ")" << dendl;

  double subdir_sum = 0;
  for (elist<CInode*>::iterator it = dir->pop_lru_subdirs.begin_use_current();
       !it.end(); ) {
    CInode *in = *it;
    ++it;

    assert(in->is_dir());
    assert(in->get_parent_dir() == dir);

    list<CDir*> dfls;
    in->get_nested_dirfrags(dfls);

    size_t num_idle_frags = 0;
    for (list<CDir*>::iterator p = dfls.begin();
         p != dfls.end();
         ++p) {
      CDir *subdir = *p;
      if (already_exporting.count(subdir))
        continue;

      // we know all ancestor dirfrags up to subtree root are not freezing or frozen.
      // It's more efficient to use CDir::is_{freezing,frozen}_tree_root()
      if (subdir->is_frozen_dir() || subdir->is_frozen_tree_root() ||
          subdir->is_freezing_dir() || subdir->is_freezing_tree_root())
        continue;  // can't export this right now!

      // how popular?
      double pop = subdir->pop_auth_subtree.meta_load(rebalance_time, mds->mdcache->decayrate);
      subdir_sum += pop;
      dout(15) << " subdir pop " << pop << " " << *subdir << dendl;

      if (pop < minchunk) {
        num_idle_frags++;
        continue;
      }

      // lucky find?
      if (pop > needmin && pop < needmax) {
        exports.push_back(subdir);
        already_exporting.insert(subdir);
        have += pop;
        return;
      }

      if (pop > need) {
        if (subdir->is_rep())
          bigger_rep.push_back(subdir);
        else
          bigger_unrep.push_back(subdir);
      } else
        smaller.insert(pair<double,CDir*>(pop, subdir));
    }
    if (dfls.size() == num_idle_frags)
      in->item_pop_lru.remove_myself();
  }
  dout(15) << " sum " << subdir_sum << " / " << dir_pop << dendl;

  // grab some sufficiently big small items
  multimap<double,CDir*>::reverse_iterator it;
  for (it = smaller.rbegin();
       it != smaller.rend();
       ++it) {

    if ((*it).first < midchunk)
      break;  // try later

    dout(7) << " taking smaller " << *(*it).second << dendl;

    exports.push_back((*it).second);
    already_exporting.insert((*it).second);
    have += (*it).first;
    if (have > needmin)
      return;
  }

  // apparently not enough; drill deeper into the hierarchy (if non-replicated)
  for (list<CDir*>::iterator it = bigger_unrep.begin();
       it != bigger_unrep.end();
       ++it) {
    dout(15) << " descending into " << **it << dendl;
    find_exports(*it, amount, exports, have, already_exporting);
    if (have > needmin)
      return;
  }

  // ok fine, use smaller bits
  for (;
       it != smaller.rend();
       ++it) {
    dout(7) << " taking (much) smaller " << it->first << " " << *(*it).second << dendl;

    exports.push_back((*it).second);
    already_exporting.insert((*it).second);
    have += (*it).first;
    if (have > needmin)
      return;
  }

  // ok fine, drill into replicated dirs
  for (list<CDir*>::iterator it = bigger_rep.begin();
       it != bigger_rep.end();
       ++it) {
    dout(7) << " descending into replicated " << **it << dendl;
    find_exports(*it, amount, exports, have, already_exporting);
    if (have > needmin)
      return;
  }
}

void MDBalancer::hit_inode(const utime_t& now, CInode *in, int type, int who)
{
  // hit inode
  in->pop.get(type).hit(now, mds->mdcache->decayrate);

  if (in->get_parent_dn())
    hit_dir(now, in->get_parent_dn()->get_dir(), type, who);
}

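/*
 * Decide whether a just-hit dirfrag should be queued for a split or merge.
 * Splits require dirfrags to be enabled cluster-wide and the frag to be
 * auth, non-base and non-stray; "hot" forces a split attempt even below the
 * size threshold, and an already-queued frag can be upgraded to the fast
 * path via should_split_fast().
 */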
void MDBalancer::maybe_fragment(CDir *dir, bool hot)
{
  // split/merge
  if (g_conf->mds_bal_frag && bal_fragment_interval > 0 &&
      dir->is_auth() &&
      !dir->inode->is_base() &&  // not root/base (for now at least)
      !dir->inode->is_stray()) { // not straydir

    // split
    if (g_conf->mds_bal_split_size > 0 &&
        mds->mdsmap->allows_dirfrags() &&
        (dir->should_split() || hot))
    {
      if (split_pending.count(dir->dirfrag()) == 0) {
        queue_split(dir, false);
      } else {
        if (dir->should_split_fast()) {
          queue_split(dir, true);
        } else {
          dout(10) << __func__ << ": fragment already enqueued to split: "
                   << *dir << dendl;
        }
      }
    }

    // merge?
    if (dir->get_frag() != frag_t() && dir->should_merge() &&
        merge_pending.count(dir->dirfrag()) == 0) {
      queue_merge(dir);
    }
  }
}

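/*
 * Account one access of `type` against a dirfrag and propagate it up the
 * hierarchy: pop_me on the frag itself, pop_nested on every ancestor, and
 * the auth-subtree counters until we cross a subtree boundary. Hot
 * read-mostly directories are also replicated (REP_ALL) once dir_pop
 * crosses mds_bal_replicate_threshold, with the local read counter adjusted
 * downward to model the reads the replicas will absorb.
 */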
void MDBalancer::hit_dir(const utime_t& now, CDir *dir, int type, int who, double amount)
{
  // hit me
  double v = dir->pop_me.get(type).hit(now, mds->mdcache->decayrate, amount);

  const bool hot = (v > g_conf->mds_bal_split_rd && type == META_POP_IRD) ||
                   (v > g_conf->mds_bal_split_wr && type == META_POP_IWR);

  dout(20) << "hit_dir " << type << " pop is " << v << ", frag " << dir->get_frag()
           << " size " << dir->get_frag_size() << dendl;

  maybe_fragment(dir, hot);

  // replicate?
  if (type == META_POP_IRD && who >= 0) {
    dir->pop_spread.hit(now, mds->mdcache->decayrate, who);
  }

  double rd_adj = 0.0;
  if (type == META_POP_IRD &&
      dir->last_popularity_sample < last_sample) {
    double dir_pop = dir->pop_auth_subtree.get(type).get(now, mds->mdcache->decayrate);  // hmm??
    dir->last_popularity_sample = last_sample;
    double pop_sp = dir->pop_spread.get(now, mds->mdcache->decayrate);
    dir_pop += pop_sp * 10;

    //if (dir->ino() == inodeno_t(0x10000000002))
    if (pop_sp > 0) {
      dout(20) << "hit_dir " << type << " pop " << dir_pop << " spread " << pop_sp
               << " " << dir->pop_spread.last[0]
               << " " << dir->pop_spread.last[1]
               << " " << dir->pop_spread.last[2]
               << " " << dir->pop_spread.last[3]
               << " in " << *dir << dendl;
    }

    if (dir->is_auth() && !dir->is_ambiguous_auth()) {
      if (!dir->is_rep() &&
          dir_pop >= g_conf->mds_bal_replicate_threshold) {
        // replicate
        double rdp = dir->pop_me.get(META_POP_IRD).get(now, mds->mdcache->decayrate);
        rd_adj = rdp / mds->get_mds_map()->get_num_in_mds() - rdp;
        rd_adj /= 2.0;  // temper somewhat

        dout(5) << "replicating dir " << *dir << " pop " << dir_pop << " .. rdp " << rdp << " adj " << rd_adj << dendl;

        dir->dir_rep = CDir::REP_ALL;
        mds->mdcache->send_dir_updates(dir, true);

        // fixme this should adjust the whole pop hierarchy
        dir->pop_me.get(META_POP_IRD).adjust(rd_adj);
        dir->pop_auth_subtree.get(META_POP_IRD).adjust(rd_adj);
      }

      if (dir->ino() != 1 &&
          dir->is_rep() &&
          dir_pop < g_conf->mds_bal_unreplicate_threshold) {
        // unreplicate
        dout(5) << "unreplicating dir " << *dir << " pop " << dir_pop << dendl;

        dir->dir_rep = CDir::REP_NONE;
        mds->mdcache->send_dir_updates(dir);
      }
    }
  }

  // adjust ancestors
  bool hit_subtree = dir->is_auth();         // current auth subtree (if any)
  bool hit_subtree_nested = dir->is_auth();  // all nested auth subtrees

  while (true) {
    CDir *pdir = dir->inode->get_parent_dir();
    dir->pop_nested.get(type).hit(now, mds->mdcache->decayrate, amount);
    if (rd_adj != 0.0)
      dir->pop_nested.get(META_POP_IRD).adjust(now, mds->mdcache->decayrate, rd_adj);

    if (hit_subtree) {
      dir->pop_auth_subtree.get(type).hit(now, mds->mdcache->decayrate, amount);

      if (rd_adj != 0.0)
        dir->pop_auth_subtree.get(META_POP_IRD).adjust(now, mds->mdcache->decayrate, rd_adj);

      if (dir->is_subtree_root())
        hit_subtree = false;  // end of auth domain, stop hitting auth counters.
      else if (pdir)
        pdir->pop_lru_subdirs.push_front(&dir->get_inode()->item_pop_lru);
    }

    if (hit_subtree_nested) {
      dir->pop_auth_subtree_nested.get(type).hit(now, mds->mdcache->decayrate, amount);
      if (rd_adj != 0.0)
        dir->pop_auth_subtree_nested.get(META_POP_IRD).adjust(now, mds->mdcache->decayrate, rd_adj);
    }
    if (!pdir) break;
    dir = pdir;
  }
}


/*
 * subtract off an exported chunk.
 *  this excludes *dir itself (encode_export_dir should have taken care of that)
 *  we _just_ do the parents' nested counters.
 *
 * NOTE: call me _after_ forcing *dir into a subtree root,
 *       but _before_ doing the encode_export_dirs.
 */
void MDBalancer::subtract_export(CDir *dir, utime_t now)
{
  dirfrag_load_vec_t subload = dir->pop_auth_subtree;

  while (true) {
    dir = dir->inode->get_parent_dir();
    if (!dir) break;

    dir->pop_nested.sub(now, mds->mdcache->decayrate, subload);
    dir->pop_auth_subtree_nested.sub(now, mds->mdcache->decayrate, subload);
  }
}

void MDBalancer::add_import(CDir *dir, utime_t now)
{
  dirfrag_load_vec_t subload = dir->pop_auth_subtree;

  while (true) {
    dir = dir->inode->get_parent_dir();
    if (!dir) break;

    dir->pop_nested.add(now, mds->mdcache->decayrate, subload);
    dir->pop_auth_subtree_nested.add(now, mds->mdcache->decayrate, subload);
  }
}

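/*
 * Shift a renamed directory's accumulated popularity between parent chains:
 * with inc=false the counters are subtracted up the old parent's ancestry,
 * with inc=true they are added up the new parent's, so nested and
 * auth-subtree counters stay consistent with the new hierarchy. (The
 * expected call pattern, one sub on the old parent and one add on the new,
 * is inferred from the inc flag rather than stated here.)
 */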
void MDBalancer::adjust_pop_for_rename(CDir *pdir, CDir *dir, utime_t now, bool inc)
{
  DecayRate& rate = mds->mdcache->decayrate;

  bool adjust_subtree_nest = dir->is_auth();
  bool adjust_subtree = adjust_subtree_nest && !dir->is_subtree_root();
  CDir *cur = dir;
  while (true) {
    if (inc) {
      pdir->pop_nested.add(now, rate, dir->pop_nested);
      if (adjust_subtree) {
        pdir->pop_auth_subtree.add(now, rate, dir->pop_auth_subtree);
        pdir->pop_lru_subdirs.push_front(&cur->get_inode()->item_pop_lru);
      }

      if (adjust_subtree_nest)
        pdir->pop_auth_subtree_nested.add(now, rate, dir->pop_auth_subtree_nested);
    } else {
      pdir->pop_nested.sub(now, rate, dir->pop_nested);
      if (adjust_subtree)
        pdir->pop_auth_subtree.sub(now, rate, dir->pop_auth_subtree);

      if (adjust_subtree_nest)
        pdir->pop_auth_subtree_nested.sub(now, rate, dir->pop_auth_subtree_nested);
    }

    if (pdir->is_subtree_root())
      adjust_subtree = false;
    cur = pdir;
    pdir = pdir->inode->get_parent_dir();
    if (!pdir) break;
  }
}

void MDBalancer::handle_mds_failure(mds_rank_t who)
{
  if (0 == who) {
    mds_last_epoch_under_map.clear();
  }
}

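/*
 * Admin-socket helper: emit a JSON object with per-dirfrag loads (walking
 * down from the root, skipping frags whose nested meta_load is negligible),
 * the per-rank mds_load structs from the last beat, the rescaled
 * mds_meta_load values, and the import maps.
 */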
int MDBalancer::dump_loads(Formatter *f)
{
  utime_t now = ceph_clock_now();
  DecayRate& decayrate = mds->mdcache->decayrate;

  list<CDir*> dfs;
  if (mds->mdcache->get_root()) {
    mds->mdcache->get_root()->get_dirfrags(dfs);
  } else {
    dout(5) << "dump_load no root" << dendl;
  }

  f->open_object_section("loads");

  f->open_array_section("dirfrags");
  while (!dfs.empty()) {
    CDir *dir = dfs.front();
    dfs.pop_front();

    if (f) {
      f->open_object_section("dir");
      dir->dump_load(f, now, decayrate);
      f->close_section();
    }

    for (auto it = dir->begin(); it != dir->end(); ++it) {
      CInode *in = it->second->get_linkage()->get_inode();
      if (!in || !in->is_dir())
        continue;

      list<CDir*> ls;
      in->get_dirfrags(ls);
      for (auto subdir : ls) {
        if (subdir->pop_nested.meta_load() < .001)
          continue;
        dfs.push_back(subdir);
      }
    }
  }
  f->close_section();  // dirfrags array

  f->open_object_section("mds_load");
  {

    auto dump_mds_load = [f, now](mds_load_t& load) {
      f->dump_float("request_rate", load.req_rate);
      f->dump_float("cache_hit_rate", load.cache_hit_rate);
      f->dump_float("queue_length", load.queue_len);
      f->dump_float("cpu_load", load.cpu_load_avg);
      f->dump_float("mds_load", load.mds_load());

      DecayRate rate;  // no decay
      f->open_object_section("auth_dirfrags");
      load.auth.dump(f, now, rate);
      f->close_section();
      f->open_object_section("all_dirfrags");
      load.all.dump(f, now, rate);
      f->close_section();
    };

    for (auto p : mds_load) {
      stringstream name;
      name << "mds." << p.first;
      f->open_object_section(name.str().c_str());
      dump_mds_load(p.second);
      f->close_section();
    }
  }
  f->close_section();  // mds_load

  f->open_object_section("mds_meta_load");
  for (auto p : mds_meta_load) {
    stringstream name;
    name << "mds." << p.first;
    f->dump_float(name.str().c_str(), p.second);
  }
  f->close_section();  // mds_meta_load

  f->open_object_section("mds_import_map");
  for (auto p : mds_import_map) {
    stringstream name1;
    name1 << "mds." << p.first;
    f->open_array_section(name1.str().c_str());
    for (auto q : p.second) {
      f->open_object_section("from");
      stringstream name2;
      name2 << "mds." << q.first;
      f->dump_float(name2.str().c_str(), q.second);
      f->close_section();
    }
    f->close_section();  // mds.? array
  }
  f->close_section();  // mds_import_map

  f->close_section();  // loads
  return 0;
}