// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
// vim: ts=8 sw=2 smarttab
/*
 * Ceph - scalable distributed file system
 *
 * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
 *
 * This is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License version 2.1, as published by the Free Software
 * Foundation. See file COPYING.
 *
 */

#include "include/compat.h"
#include "mdstypes.h"

#include "mon/MonClient.h"
#include "MDBalancer.h"
#include "MDSRank.h"
#include "MDSMap.h"
#include "CInode.h"
#include "CDir.h"
#include "MDCache.h"
#include "Migrator.h"
#include "Mantle.h"

#include "include/Context.h"
#include "msg/Messenger.h"

#include <fstream>
#include <vector>
#include <map>

using std::map;
using std::vector;
using std::chrono::duration_cast;

#include "common/config.h"
#include "common/errno.h"

#define dout_context g_ceph_context
#undef dout_prefix
#define dout_prefix *_dout << "mds." << mds->get_nodeid() << ".bal " << __func__ << " "
#undef dout
#define dout(lvl) \
  do {\
    auto subsys = ceph_subsys_mds;\
    if ((dout_context)->_conf->subsys.should_gather(ceph_subsys_mds_balancer, lvl)) {\
      subsys = ceph_subsys_mds_balancer;\
    }\
    dout_impl(dout_context, ceph::dout::need_dynamic(subsys), lvl) dout_prefix
#undef dendl
#define dendl dendl_impl; } while (0)


#define MIN_LOAD    50   // ??
#define MIN_REEXPORT 5   // will automatically reexport
#define MIN_OFFLOAD 10   // point at which i stop trying, close enough


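// Dispatch balancer-bound messages; currently only MDS heartbeats are expected.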
int MDBalancer::proc_message(const cref_t<Message> &m)
{
  switch (m->get_type()) {

  case MSG_MDS_HEARTBEAT:
    handle_heartbeat(ref_cast<MHeartbeat>(m));
    break;

  default:
    derr << " balancer unknown message " << m->get_type() << dendl_impl;
    ceph_abort_msg("balancer unknown message");
  }

  return 0;
}

MDBalancer::MDBalancer(MDSRank *m, Messenger *msgr, MonClient *monc) :
    mds(m), messenger(msgr), mon_client(monc)
{
  bal_fragment_dirs = g_conf().get_val<bool>("mds_bal_fragment_dirs");
  bal_fragment_interval = g_conf().get_val<int64_t>("mds_bal_fragment_interval");
}

void MDBalancer::handle_conf_change(const std::set<std::string>& changed, const MDSMap& mds_map)
{
  if (changed.count("mds_bal_fragment_dirs"))
    bal_fragment_dirs = g_conf().get_val<bool>("mds_bal_fragment_dirs");
  if (changed.count("mds_bal_fragment_interval"))
    bal_fragment_interval = g_conf().get_val<int64_t>("mds_bal_fragment_interval");
}

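// Walk the export pin queue and act on each pinned directory inode:
// defer pins that name a rank beyond max_mds, split ephemeral-dist
// fragments that are not deep enough yet, and export (or adopt) dirfrags
// so that subtree authority matches the pin target.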
void MDBalancer::handle_export_pins(void)
{
  const mds_rank_t max_mds = mds->mdsmap->get_max_mds();
  auto mdcache = mds->mdcache;

  auto &q = mdcache->export_pin_queue;
  auto it = q.begin();
  dout(20) << "export_pin_queue size=" << q.size() << dendl;
  while (it != q.end()) {
    auto cur = it++;
    CInode *in = *cur;
    ceph_assert(in->is_dir());

    mds_rank_t export_pin = in->get_export_pin(false);
    in->check_pin_policy(export_pin);

    if (export_pin >= max_mds) {
      dout(20) << " delay export_pin=" << export_pin << " on " << *in << dendl;
      in->state_clear(CInode::STATE_QUEUEDEXPORTPIN);
      q.erase(cur);

      in->state_set(CInode::STATE_DELAYEDEXPORTPIN);
      mdcache->export_pin_delayed_queue.insert(in);
      continue;
    }

    dout(20) << " executing export_pin=" << export_pin << " on " << *in << dendl;
    unsigned min_frag_bits = 0;
    mds_rank_t target = MDS_RANK_NONE;
    if (export_pin >= 0)
      target = export_pin;
    else if (export_pin == MDS_RANK_EPHEMERAL_RAND)
      target = mdcache->hash_into_rank_bucket(in->ino());
    else if (export_pin == MDS_RANK_EPHEMERAL_DIST)
      min_frag_bits = mdcache->get_ephemeral_dist_frag_bits();

    bool remove = true;
    for (auto&& dir : in->get_dirfrags()) {
      if (!dir->is_auth())
        continue;

      if (export_pin == MDS_RANK_EPHEMERAL_DIST) {
        if (dir->get_frag().bits() < min_frag_bits) {
          if (!dir->state_test(CDir::STATE_CREATING) &&
              !dir->is_frozen() && !dir->is_freezing()) {
            queue_split(dir, true);
          }
          remove = false;
          continue;
        }
        target = mdcache->hash_into_rank_bucket(in->ino(), dir->get_frag());
      }

      if (target == MDS_RANK_NONE) {
        if (dir->state_test(CDir::STATE_AUXSUBTREE)) {
          if (dir->is_frozen() || dir->is_freezing()) {
            // try again later
            remove = false;
            continue;
          }
          dout(10) << " clear auxsubtree on " << *dir << dendl;
          dir->state_clear(CDir::STATE_AUXSUBTREE);
          mds->mdcache->try_subtree_merge(dir);
        }
      } else if (target == mds->get_nodeid()) {
        if (dir->state_test(CDir::STATE_AUXSUBTREE)) {
          ceph_assert(dir->is_subtree_root());
        } else if (dir->state_test(CDir::STATE_CREATING) ||
                   dir->is_frozen() || dir->is_freezing()) {
          // try again later
          remove = false;
          continue;
        } else if (!dir->is_subtree_root()) {
          dir->state_set(CDir::STATE_AUXSUBTREE);
          mds->mdcache->adjust_subtree_auth(dir, mds->get_nodeid());
          dout(10) << " create aux subtree on " << *dir << dendl;
        } else {
          dout(10) << " set auxsubtree bit on " << *dir << dendl;
          dir->state_set(CDir::STATE_AUXSUBTREE);
        }
      } else {
        /* Only export a directory if it's non-empty. An empty directory will
         * be sent back by the importer.
         */
        if (dir->get_num_head_items() > 0) {
          mds->mdcache->migrator->export_dir(dir, target);
        }
        remove = false;
      }
    }

    if (remove) {
      in->state_clear(CInode::STATE_QUEUEDEXPORTPIN);
      q.erase(cur);
    }
  }

  std::vector<CDir*> authsubs = mdcache->get_auth_subtrees();
  bool print_auth_subtrees = true;

  if (authsubs.size() > AUTH_TREES_THRESHOLD &&
      !g_conf()->subsys.should_gather<ceph_subsys_mds, 25>()) {
    dout(15) << "number of auth trees = " << authsubs.size() << "; not "
                "printing auth trees" << dendl;
    print_auth_subtrees = false;
  }

  for (auto &cd : authsubs) {
    mds_rank_t export_pin = cd->inode->get_export_pin();
    cd->inode->check_pin_policy(export_pin);

    if (export_pin == MDS_RANK_EPHEMERAL_DIST) {
      export_pin = mdcache->hash_into_rank_bucket(cd->ino(), cd->get_frag());
    } else if (export_pin == MDS_RANK_EPHEMERAL_RAND) {
      export_pin = mdcache->hash_into_rank_bucket(cd->ino());
    }

    if (print_auth_subtrees)
      dout(25) << "auth tree " << *cd << " export_pin=" << export_pin << dendl;

    if (export_pin >= 0 && export_pin != mds->get_nodeid() &&
        export_pin < mds->mdsmap->get_max_mds()) {
      mdcache->migrator->export_dir(cd, export_pin);
    }
  }
}

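// Periodic driver, called from the MDS tick: applies export pins,
// refreshes the popularity sample, and (on rank 0) triggers a
// cluster-wide heartbeat/rebalance round once per mds_bal_interval.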
void MDBalancer::tick()
{
  static int num_bal_times = g_conf()->mds_bal_max;
  auto bal_interval = g_conf().get_val<int64_t>("mds_bal_interval");
  auto bal_max_until = g_conf().get_val<int64_t>("mds_bal_max_until");
  time now = clock::now();

  if (g_conf()->mds_bal_export_pin) {
    handle_export_pins();
  }

  // sample?
  if (chrono::duration<double>(now-last_sample).count() >
      g_conf()->mds_bal_sample_interval) {
    dout(15) << "tick last_sample now " << now << dendl;
    last_sample = now;
  }

  // We can use duration_cast below, although the result is an int,
  // because the values from g_conf are also integers.
  // balance?
  if (mds->get_nodeid() == 0
      && mds->is_active()
      && bal_interval > 0
      && duration_cast<chrono::seconds>(now - last_heartbeat).count() >= bal_interval
      && (num_bal_times || (bal_max_until >= 0 && mds->get_uptime().count() > bal_max_until))) {
    last_heartbeat = now;
    send_heartbeat();
    num_bal_times--;
  }

  mds->mdcache->show_subtrees(10, true);
}




class C_Bal_SendHeartbeat : public MDSInternalContext {
public:
  explicit C_Bal_SendHeartbeat(MDSRank *mds_) : MDSInternalContext(mds_) { }
  void finish(int f) override {
    mds->balancer->send_heartbeat();
  }
};


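// Collapse the per-metric load vector into a single scalar, weighted
// according to the configured mds_bal_mode (0 = mixed metadata/request
// load, 1 = request rate and queue length only, 2 = CPU load only).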
double mds_load_t::mds_load() const
{
  switch(g_conf()->mds_bal_mode) {
  case 0:
    return
      .8 * auth.meta_load() +
      .2 * all.meta_load() +
      req_rate +
      10.0 * queue_len;

  case 1:
    return req_rate + 10.0*queue_len;

  case 2:
    return cpu_load_avg;

  }
  ceph_abort();
  return 0;
}

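// Build this rank's mds_load_t sample: subtree popularity from the root
// dirfrags, request-rate and CPU-time deltas since the previous sample,
// and the current messenger dispatch queue length.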
mds_load_t MDBalancer::get_load()
{
  auto now = clock::now();

  mds_load_t load{DecayRate()}; /* zero DecayRate! */

  if (mds->mdcache->get_root()) {
    auto&& ls = mds->mdcache->get_root()->get_dirfrags();
    for (auto &d : ls) {
      load.auth.add(d->pop_auth_subtree_nested);
      load.all.add(d->pop_nested);
    }
  } else {
    dout(20) << "no root, no load" << dendl;
  }

  uint64_t num_requests = mds->get_num_requests();

  uint64_t cpu_time = 1;
  {
    string stat_path = PROCPREFIX "/proc/self/stat";
    ifstream stat_file(stat_path);
    if (stat_file.is_open()) {
      vector<string> stat_vec(std::istream_iterator<string>{stat_file},
                              std::istream_iterator<string>());
      if (stat_vec.size() >= 15) {
        // utime + stime
        cpu_time = strtoll(stat_vec[13].c_str(), nullptr, 10) +
                   strtoll(stat_vec[14].c_str(), nullptr, 10);
      } else {
        derr << "input file '" << stat_path << "' not resolvable" << dendl_impl;
      }
    } else {
      derr << "input file '" << stat_path << "' not found" << dendl_impl;
    }
  }

  load.queue_len = messenger->get_dispatch_queue_len();

  bool update_last = true;
  if (last_get_load != clock::zero() &&
      now > last_get_load) {
    double el = std::chrono::duration<double>(now-last_get_load).count();
    if (el >= 1.0) {
      if (num_requests > last_num_requests)
        load.req_rate = (num_requests - last_num_requests) / el;
      if (cpu_time > last_cpu_time)
        load.cpu_load_avg = (cpu_time - last_cpu_time) / el;
    } else {
      auto p = mds_load.find(mds->get_nodeid());
      if (p != mds_load.end()) {
        load.req_rate = p->second.req_rate;
        load.cpu_load_avg = p->second.cpu_load_avg;
      }
      if (num_requests >= last_num_requests && cpu_time >= last_cpu_time)
        update_last = false;
    }
  }

  if (update_last) {
    last_num_requests = num_requests;
    last_cpu_time = cpu_time;
    last_get_load = now;
  }

  dout(15) << load << dendl;
  return load;
}

/*
 * Read synchronously from RADOS using a timeout. We cannot do daemon-local
 * fallbacks (i.e. kick off async read when we are processing the map and
 * check status when we get here) with the way the mds is structured.
 */
int MDBalancer::localize_balancer()
{
  /* reset everything */
  bool ack = false;
  int r = 0;
  bufferlist lua_src;
  ceph::mutex lock = ceph::make_mutex("lock");
  ceph::condition_variable cond;

  /* we assume that balancer is in the metadata pool */
  object_t oid = object_t(mds->mdsmap->get_balancer());
  object_locator_t oloc(mds->get_metadata_pool());
  ceph_tid_t tid = mds->objecter->read(oid, oloc, 0, 0, CEPH_NOSNAP, &lua_src, 0,
                                       new C_SafeCond(lock, cond, &ack, &r));
  dout(15) << "launched non-blocking read tid=" << tid
           << " oid=" << oid << " oloc=" << oloc << dendl;

  /* timeout: if we waste half our time waiting for RADOS, then abort! */
  std::cv_status ret_t = [&] {
    auto bal_interval = g_conf().get_val<int64_t>("mds_bal_interval");
    std::unique_lock locker{lock};
    return cond.wait_for(locker, std::chrono::seconds(bal_interval / 2));
  }();
  /* success: store the balancer in memory and set the version. */
  if (!r) {
    if (ret_t == std::cv_status::timeout) {
      mds->objecter->op_cancel(tid, -CEPHFS_ECANCELED);
      return -CEPHFS_ETIMEDOUT;
    }
    bal_code.assign(lua_src.to_str());
    bal_version.assign(oid.name);
    dout(10) << "bal_code=" << bal_code << dendl;
  }
  return r;
}

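// Broadcast this rank's load (and import map) to every other up MDS.
// Rank 0 also bumps the beat epoch and resets the load table so each
// round aggregates a consistent set of samples.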
void MDBalancer::send_heartbeat()
{
  if (mds->is_cluster_degraded()) {
    dout(10) << "degraded" << dendl;
    return;
  }

  if (!mds->mdcache->is_open()) {
    dout(10) << "not open" << dendl;
    mds->mdcache->wait_for_open(new C_Bal_SendHeartbeat(mds));
    return;
  }

  if (mds->get_nodeid() == 0) {
    beat_epoch++;
    mds_load.clear();
  }

  // my load
  mds_load_t load = get_load();
  mds->logger->set(l_mds_load_cent, 100 * load.mds_load());
  mds->logger->set(l_mds_dispatch_queue_len, load.queue_len);

  auto em = mds_load.emplace(std::piecewise_construct, std::forward_as_tuple(mds->get_nodeid()), std::forward_as_tuple(load));
  if (!em.second) {
    em.first->second = load;
  }

  // import_map -- how much do i import from whom
  map<mds_rank_t, float> import_map;
  for (auto& im : mds->mdcache->get_auth_subtrees()) {
    mds_rank_t from = im->inode->authority().first;
    if (from == mds->get_nodeid()) continue;
    if (im->get_inode()->is_stray()) continue;
    import_map[from] += im->pop_auth_subtree.meta_load();
  }
  mds_import_map[ mds->get_nodeid() ] = import_map;


  dout(3) << " epoch " << beat_epoch << " load " << load << dendl;
  for (const auto& [rank, load] : import_map) {
    dout(5) << "  import_map from " << rank << " -> " << load << dendl;
  }


  set<mds_rank_t> up;
  mds->get_mds_map()->get_up_mds_set(up);
  for (const auto& r : up) {
    if (r == mds->get_nodeid())
      continue;
    auto hb = make_message<MHeartbeat>(load, beat_epoch);
    hb->get_import_map() = import_map;
    mds->send_message_mds(hb, r);
  }
}

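// Record a peer's heartbeat. Non-zero ranks echo a heartbeat back when a
// new epoch arrives from rank 0; once loads from every in-cluster rank
// have been collected, kick off a rebalance (Mantle first if configured,
// falling back to the built-in balancer).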
void MDBalancer::handle_heartbeat(const cref_t<MHeartbeat> &m)
{
  mds_rank_t who = mds_rank_t(m->get_source().num());
  dout(25) << "=== got heartbeat " << m->get_beat() << " from " << m->get_source().num() << " " << m->get_load() << dendl;

  if (!mds->is_active())
    return;

  if (!mds->mdcache->is_open()) {
    dout(10) << "opening root on handle_heartbeat" << dendl;
    mds->mdcache->wait_for_open(new C_MDS_RetryMessage(mds, m));
    return;
  }

  if (mds->is_cluster_degraded()) {
    dout(10) << " degraded, ignoring" << dendl;
    return;
  }

  if (mds->get_nodeid() != 0 && m->get_beat() > beat_epoch) {
    dout(10) << "receive next epoch " << m->get_beat() << " from mds." << who << " before mds0" << dendl;

    beat_epoch = m->get_beat();
    // clear the mds load info whose epoch is less than beat_epoch
    mds_load.clear();
  }

  if (who == 0) {
    dout(20) << " from mds0, new epoch " << m->get_beat() << dendl;
    if (beat_epoch != m->get_beat()) {
      beat_epoch = m->get_beat();
      mds_load.clear();
    }

    send_heartbeat();

    mds->mdcache->show_subtrees();
  } else if (mds->get_nodeid() == 0) {
    if (beat_epoch != m->get_beat()) {
      dout(10) << " old heartbeat epoch, ignoring" << dendl;
      return;
    }
  }

  {
    auto em = mds_load.emplace(std::piecewise_construct, std::forward_as_tuple(who), std::forward_as_tuple(m->get_load()));
    if (!em.second) {
      em.first->second = m->get_load();
    }
  }
  mds_import_map[who] = m->get_import_map();

  {
    unsigned cluster_size = mds->get_mds_map()->get_num_in_mds();
    if (mds_load.size() == cluster_size) {
      // let's go!
      //export_empties();  // no!

      /* avoid spamming ceph -w if user does not turn mantle on */
      if (mds->mdsmap->get_balancer() != "") {
        int r = mantle_prep_rebalance();
        if (!r) return;
        mds->clog->warn() << "using old balancer; mantle failed for "
                          << "balancer=" << mds->mdsmap->get_balancer()
                          << " : " << cpp_strerror(r);
      }
      prep_rebalance(m->get_beat());
    }
  }
}

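// Match one exporter/importer pair: transfer min(maxex, maxim) load units
// between them in the balance state, recording it in targets if we are
// the exporter. Returns the amount matched.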
double MDBalancer::try_match(balance_state_t& state, mds_rank_t ex, double& maxex,
                             mds_rank_t im, double& maxim)
{
  if (maxex <= 0 || maxim <= 0) return 0.0;

  double howmuch = std::min(maxex, maxim);

  dout(5) << "   - mds." << ex << " exports " << howmuch << " to mds." << im << dendl;

  if (ex == mds->get_nodeid())
    state.targets[im] += howmuch;

  state.exported[ex] += howmuch;
  state.imported[im] += howmuch;

  maxex -= howmuch;
  maxim -= howmuch;

  return howmuch;
}

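// Schedule a split of the given dirfrag. With fast=true the split runs
// as soon as the current dispatch finishes; otherwise it is delayed by
// bal_fragment_interval so bursts of ops can drain before the frag freezes.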
void MDBalancer::queue_split(const CDir *dir, bool fast)
{
  dout(10) << __func__ << " enqueuing " << *dir
           << " (fast=" << fast << ")" << dendl;

  const dirfrag_t df = dir->dirfrag();

  auto callback = [this, df](int r) {
    if (split_pending.erase(df) == 0) {
      // Someone beat me to it. This can happen in the fast splitting
      // path, because we spawn two contexts, one with mds->timer and
      // one with mds->queue_waiter. The loser can safely just drop
      // out.
      return;
    }

    auto mdcache = mds->mdcache;

    CDir *dir = mdcache->get_dirfrag(df);
    if (!dir) {
      dout(10) << "drop split on " << df << " because not in cache" << dendl;
      return;
    }
    if (!dir->is_auth()) {
      dout(10) << "drop split on " << df << " because non-auth" << dendl;
      return;
    }

    // Pass on to MDCache: note that the split might still not
    // happen if the checks in MDCache::can_fragment fail.
    dout(10) << __func__ << " splitting " << *dir << dendl;
    int bits = g_conf()->mds_bal_split_bits;
    if (dir->inode->is_ephemeral_dist()) {
      unsigned min_frag_bits = mdcache->get_ephemeral_dist_frag_bits();
      if (df.frag.bits() + bits < min_frag_bits)
        bits = min_frag_bits - df.frag.bits();
    }
    mdcache->split_dir(dir, bits);
  };

  auto ret = split_pending.insert(df);
  bool is_new = ret.second;

  if (fast) {
    // Do the split ASAP: enqueue it in the MDSRank waiters which are
    // run at the end of dispatching the current request
    mds->queue_waiter(new MDSInternalContextWrapper(mds,
          new LambdaContext(std::move(callback))));
  } else if (is_new) {
    // Set a timer to really do the split: we don't do it immediately
    // so that bursts of ops on a directory have a chance to go through
    // before we freeze it.
    mds->timer.add_event_after(bal_fragment_interval,
                               new LambdaContext(std::move(callback)));
  }
}

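// Schedule a merge of the given dirfrag after bal_fragment_interval.
// The callback walks up the frag tree while every sibling frag is auth,
// cached, and willing to merge, then asks MDCache to merge down to the
// widest mergeable ancestor frag.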
void MDBalancer::queue_merge(CDir *dir)
{
  const auto frag = dir->dirfrag();
  auto callback = [this, frag](int r) {
    ceph_assert(frag.frag != frag_t());

    // frag must be in this set because only one context is in flight
    // for a given frag at a time (because merge_pending is checked before
    // starting one), and this context is the only one that erases it.
    merge_pending.erase(frag);

    auto mdcache = mds->mdcache;
    CDir *dir = mdcache->get_dirfrag(frag);
    if (!dir) {
      dout(10) << "drop merge on " << frag << " because not in cache" << dendl;
      return;
    }
    ceph_assert(dir->dirfrag() == frag);

    if (!dir->is_auth()) {
      dout(10) << "drop merge on " << *dir << " because lost auth" << dendl;
      return;
    }

    dout(10) << "merging " << *dir << dendl;

    CInode *diri = dir->get_inode();

    unsigned min_frag_bits = 0;
    if (diri->is_ephemeral_dist())
      min_frag_bits = mdcache->get_ephemeral_dist_frag_bits();

    frag_t fg = dir->get_frag();
    while (fg.bits() > min_frag_bits) {
      frag_t sibfg = fg.get_sibling();
      auto&& [complete, sibs] = diri->get_dirfrags_under(sibfg);
      if (!complete) {
        dout(10) << "  not all sibs under " << sibfg << " in cache (have " << sibs << ")" << dendl;
        break;
      }
      bool all = true;
      for (auto& sib : sibs) {
        if (!sib->is_auth() || !sib->should_merge()) {
          all = false;
          break;
        }
      }
      if (!all) {
        dout(10) << "  not all sibs under " << sibfg << " " << sibs << " should_merge" << dendl;
        break;
      }
      dout(10) << "  all sibs under " << sibfg << " " << sibs << " should merge" << dendl;
      fg = fg.parent();
    }

    if (fg != dir->get_frag())
      mdcache->merge_dir(diri, fg);
  };

  if (merge_pending.count(frag) == 0) {
    dout(20) << " enqueued dir " << *dir << dendl;
    merge_pending.insert(frag);
    mds->timer.add_event_after(bal_fragment_interval,
                               new LambdaContext(std::move(callback)));
  } else {
    dout(20) << " dir already in queue " << *dir << dendl;
  }
}

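// Built-in balancer: convert each rank's mds_load into meta_load units,
// classify ranks as importers or exporters relative to the mean load,
// greedily match exporters to importers, and hand the resulting target
// map to try_rebalance(). Only acts if this rank has been overloaded for
// at least two consecutive epochs.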
void MDBalancer::prep_rebalance(int beat)
{
  balance_state_t state;

  if (g_conf()->mds_thrash_exports) {
    //we're going to randomly export to all the mds in the cluster
    set<mds_rank_t> up_mds;
    mds->get_mds_map()->get_up_mds_set(up_mds);
    for (const auto &rank : up_mds) {
      state.targets[rank] = 0.0;
    }
  } else {
    int cluster_size = mds->get_mds_map()->get_num_in_mds();
    mds_rank_t whoami = mds->get_nodeid();
    rebalance_time = clock::now();

    dout(7) << "cluster loads are" << dendl;

    mds->mdcache->migrator->clear_export_queue();

    // rescale! turn my mds_load back into meta_load units
    double load_fac = 1.0;
    map<mds_rank_t, mds_load_t>::iterator m = mds_load.find(whoami);
    if ((m != mds_load.end()) && (m->second.mds_load() > 0)) {
      double metald = m->second.auth.meta_load();
      double mdsld = m->second.mds_load();
      load_fac = metald / mdsld;
      dout(7) << " load_fac is " << load_fac
              << " <- " << m->second.auth << " " << metald
              << " / " << mdsld
              << dendl;
    }

    mds_meta_load.clear();

    double total_load = 0.0;
    multimap<double,mds_rank_t> load_map;
    for (mds_rank_t i=mds_rank_t(0); i < mds_rank_t(cluster_size); i++) {
      mds_load_t& load = mds_load.at(i);

      double l = load.mds_load() * load_fac;
      mds_meta_load[i] = l;

      if (whoami == 0)
        dout(7) << "  mds." << i
                << " " << load
                << " = " << load.mds_load()
                << " ~ " << l << dendl;

      if (whoami == i) my_load = l;
      total_load += l;

      load_map.insert(pair<double,mds_rank_t>( l, i ));
    }

    // target load
    target_load = total_load / (double)cluster_size;
    dout(7) << "my load " << my_load
            << " target " << target_load
            << " total " << total_load
            << dendl;

    // under or over?
    for (const auto& [load, rank] : load_map) {
      if (load < target_load * (1.0 + g_conf()->mds_bal_min_rebalance)) {
        dout(7) << " mds." << rank << " is underloaded or barely overloaded." << dendl;
        mds_last_epoch_under_map[rank] = beat_epoch;
      }
    }

    int last_epoch_under = mds_last_epoch_under_map[whoami];
    if (last_epoch_under == beat_epoch) {
      dout(7) << " i am underloaded or barely overloaded, doing nothing." << dendl;
      return;
    }
    // am i over long enough?
    if (last_epoch_under && beat_epoch - last_epoch_under < 2) {
      dout(7) << " i am overloaded, but only for " << (beat_epoch - last_epoch_under) << " epochs" << dendl;
      return;
    }

    dout(7) << " i am sufficiently overloaded" << dendl;


    // first separate exporters and importers
    multimap<double,mds_rank_t> importers;
    multimap<double,mds_rank_t> exporters;
    set<mds_rank_t> importer_set;
    set<mds_rank_t> exporter_set;

    for (multimap<double,mds_rank_t>::iterator it = load_map.begin();
         it != load_map.end();
         ++it) {
      if (it->first < target_load) {
        dout(15) << "   mds." << it->second << " is importer" << dendl;
        importers.insert(pair<double,mds_rank_t>(it->first,it->second));
        importer_set.insert(it->second);
      } else {
        int mds_last_epoch_under = mds_last_epoch_under_map[it->second];
        if (!(mds_last_epoch_under && beat_epoch - mds_last_epoch_under < 2)) {
          dout(15) << "   mds." << it->second << " is exporter" << dendl;
          exporters.insert(pair<double,mds_rank_t>(it->first,it->second));
          exporter_set.insert(it->second);
        }
      }
    }


    // determine load transfer mapping

    if (true) {
      // analyze import_map; do any matches i can

      dout(15) << "  matching exporters to import sources" << dendl;

      // big -> small exporters
      for (multimap<double,mds_rank_t>::reverse_iterator ex = exporters.rbegin();
           ex != exporters.rend();
           ++ex) {
        double maxex = get_maxex(state, ex->second);
        if (maxex <= .001) continue;

        // check importers. for now, just in arbitrary order (no intelligent matching).
        for (map<mds_rank_t, float>::iterator im = mds_import_map[ex->second].begin();
             im != mds_import_map[ex->second].end();
             ++im) {
          double maxim = get_maxim(state, im->first);
          if (maxim <= .001) continue;
          try_match(state, ex->second, maxex, im->first, maxim);
          if (maxex <= .001) break;
        }
      }
    }

    // old way
    if (beat % 2 == 1) {
      dout(15) << "  matching big exporters to big importers" << dendl;
      // big exporters to big importers
      multimap<double,mds_rank_t>::reverse_iterator ex = exporters.rbegin();
      multimap<double,mds_rank_t>::iterator im = importers.begin();
      while (ex != exporters.rend() &&
             im != importers.end()) {
        double maxex = get_maxex(state, ex->second);
        double maxim = get_maxim(state, im->second);
        if (maxex < .001 || maxim < .001) break;
        try_match(state, ex->second, maxex, im->second, maxim);
        if (maxex <= .001) ++ex;
        if (maxim <= .001) ++im;
      }
    } else { // new way
      dout(15) << "  matching small exporters to big importers" << dendl;
      // small exporters to big importers
      multimap<double,mds_rank_t>::iterator ex = exporters.begin();
      multimap<double,mds_rank_t>::iterator im = importers.begin();
      while (ex != exporters.end() &&
             im != importers.end()) {
        double maxex = get_maxex(state, ex->second);
        double maxim = get_maxim(state, im->second);
        if (maxex < .001 || maxim < .001) break;
        try_match(state, ex->second, maxex, im->second, maxim);
        if (maxex <= .001) ++ex;
        if (maxim <= .001) ++im;
      }
    }
  }
  try_rebalance(state);
}

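// Mantle balancer: (re)load the Lua balancer object from RADOS if its
// version changed, feed per-rank metrics into Mantle, and apply the
// targets it returns. Returns non-zero on failure so the caller can fall
// back to prep_rebalance().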
int MDBalancer::mantle_prep_rebalance()
{
  balance_state_t state;

  /* refresh balancer if it has changed */
  if (bal_version != mds->mdsmap->get_balancer()) {
    bal_version.assign("");
    int r = localize_balancer();
    if (r) return r;

    /* only spam the cluster log from 1 mds on version changes */
    if (mds->get_nodeid() == 0)
      mds->clog->info() << "mantle balancer version changed: " << bal_version;
  }

  /* prepare for balancing */
  int cluster_size = mds->get_mds_map()->get_num_in_mds();
  rebalance_time = clock::now();
  mds->mdcache->migrator->clear_export_queue();

  /* fill in the metrics for each mds by grabbing load struct */
  vector < map<string, double> > metrics (cluster_size);
  for (mds_rank_t i=mds_rank_t(0); i < mds_rank_t(cluster_size); i++) {
    mds_load_t& load = mds_load.at(i);

    metrics[i] = {{"auth.meta_load", load.auth.meta_load()},
                  {"all.meta_load", load.all.meta_load()},
                  {"req_rate", load.req_rate},
                  {"queue_len", load.queue_len},
                  {"cpu_load_avg", load.cpu_load_avg}};
  }

  /* execute the balancer */
  Mantle mantle;
  int ret = mantle.balance(bal_code, mds->get_nodeid(), metrics, state.targets);
  dout(7) << " mantle decided that new targets=" << state.targets << dendl;

  /* mantle doesn't know about cluster size, so check target len here */
  if ((int) state.targets.size() != cluster_size)
    return -CEPHFS_EINVAL;
  else if (ret)
    return ret;

  try_rebalance(state);
  return 0;
}



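// Execute the target map: first hand idle imports back to their owners,
// then prefer re-exporting subtrees we previously imported from each
// target, and finally call find_exports() to carve additional chunks of
// our own workload until each target's quota is met.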
void MDBalancer::try_rebalance(balance_state_t& state)
{
  if (g_conf()->mds_thrash_exports) {
    dout(5) << "mds_thrash is on; not performing standard rebalance operation!"
            << dendl;
    return;
  }

  // make a sorted list of my imports
  multimap<double, CDir*> import_pop_map;
  multimap<mds_rank_t, pair<CDir*, double> > import_from_map;

  for (auto& dir : mds->mdcache->get_fullauth_subtrees()) {
    CInode *diri = dir->get_inode();
    if (diri->is_mdsdir())
      continue;
    if (diri->get_export_pin(false) != MDS_RANK_NONE)
      continue;
    if (dir->is_freezing() || dir->is_frozen())
      continue;  // export probably already in progress

    mds_rank_t from = diri->authority().first;
    double pop = dir->pop_auth_subtree.meta_load();
    if (g_conf()->mds_bal_idle_threshold > 0 &&
        pop < g_conf()->mds_bal_idle_threshold &&
        diri != mds->mdcache->get_root() &&
        from != mds->get_nodeid()) {
      dout(5) << " exporting idle (" << pop << ") import " << *dir
              << " back to mds." << from << dendl;
      mds->mdcache->migrator->export_dir_nicely(dir, from);
      continue;
    }

    dout(15) << "  map: i imported " << *dir << " from " << from << dendl;
    import_pop_map.insert(make_pair(pop, dir));
    import_from_map.insert(make_pair(from, make_pair(dir, pop)));
  }

  // do my exports!
  map<mds_rank_t, double> export_pop_map;

  for (auto &it : state.targets) {
    mds_rank_t target = it.first;
    double amount = it.second;

    if (amount < MIN_OFFLOAD)
      continue;
    if (amount * 10 * state.targets.size() < target_load)
      continue;

    dout(5) << "want to send " << amount << " to mds." << target
      //<< " .. " << (*it).second << " * " << load_fac
            << " -> " << amount
            << dendl;//" .. fudge is " << fudge << dendl;

    double& have = export_pop_map[target];

    mds->mdcache->show_subtrees();

    // search imports from target
    if (import_from_map.count(target)) {
      dout(7) << " aha, looking through imports from target mds." << target << dendl;
      for (auto p = import_from_map.equal_range(target);
           p.first != p.second; ) {
        CDir *dir = p.first->second.first;
        double pop = p.first->second.second;
        dout(7) << "considering " << *dir << " from " << (*p.first).first << dendl;
        auto plast = p.first++;

        if (dir->inode->is_base())
          continue;
        ceph_assert(dir->inode->authority().first == target);  // cuz that's how i put it in the map, dummy

        if (pop <= amount-have) {
          dout(7) << "reexporting " << *dir << " pop " << pop
                  << " back to mds." << target << dendl;
          mds->mdcache->migrator->export_dir_nicely(dir, target);
          have += pop;
          import_from_map.erase(plast);
          for (auto q = import_pop_map.equal_range(pop);
               q.first != q.second; ) {
            if (q.first->second == dir) {
              import_pop_map.erase(q.first);
              break;
            }
            q.first++;
          }
        } else {
          dout(7) << "can't reexport " << *dir << ", too big " << pop << dendl;
        }
        if (amount-have < MIN_OFFLOAD)
          break;
      }
    }
  }

  // any other imports
  for (auto &it : state.targets) {
    mds_rank_t target = it.first;
    double amount = it.second;

    if (!export_pop_map.count(target))
      continue;
    double& have = export_pop_map[target];
    if (amount-have < MIN_OFFLOAD)
      continue;

    for (auto p = import_pop_map.begin();
         p != import_pop_map.end(); ) {
      CDir *dir = p->second;
      if (dir->inode->is_base()) {
        ++p;
        continue;
      }

      double pop = p->first;
      if (pop <= amount-have && pop > MIN_REEXPORT) {
        dout(5) << "reexporting " << *dir << " pop " << pop
                << " to mds." << target << dendl;
        have += pop;
        mds->mdcache->migrator->export_dir_nicely(dir, target);
        import_pop_map.erase(p++);
      } else {
        ++p;
      }
      if (amount-have < MIN_OFFLOAD)
        break;
    }
  }

  set<CDir*> already_exporting;

  for (auto &it : state.targets) {
    mds_rank_t target = it.first;
    double amount = it.second;

    if (!export_pop_map.count(target))
      continue;
    double& have = export_pop_map[target];
    if (amount-have < MIN_OFFLOAD)
      continue;

    // okay, search for fragments of my workload
    std::vector<CDir*> exports;

    for (auto p = import_pop_map.rbegin();
         p != import_pop_map.rend();
         ++p) {
      CDir *dir = p->second;
      find_exports(dir, amount, &exports, have, already_exporting);
      if (amount-have < MIN_OFFLOAD)
        break;
    }
    //fudge = amount - have;

    for (const auto& dir : exports) {
      dout(5) << "   - exporting " << dir->pop_auth_subtree
              << " " << dir->pop_auth_subtree.meta_load()
              << " to mds." << target << " " << *dir << dendl;
      mds->mdcache->migrator->export_dir_nicely(dir, target);
    }
  }

  dout(7) << "done" << dendl;
  mds->mdcache->show_subtrees();
}

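// Recursively pick dirfrags under 'dir' whose popularity fills the gap
// between 'have' and 'amount': take a single frag in the sweet spot
// (needmin..needmax) if one exists, otherwise accumulate mid-sized frags
// and descend into the bigger ones. Bails out if the walk exceeds 0.1s.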
void MDBalancer::find_exports(CDir *dir,
                              double amount,
                              std::vector<CDir*>* exports,
                              double& have,
                              set<CDir*>& already_exporting)
{
  auto now = clock::now();
  auto duration = std::chrono::duration<double>(now-rebalance_time).count();
  if (duration > 0.1) {
    derr << " balancer runs too long" << dendl_impl;
    have = amount;
    return;
  }

  ceph_assert(dir->is_auth());

  double need = amount - have;
  if (need < amount * g_conf()->mds_bal_min_start)
    return;  // good enough!

  double needmax = need * g_conf()->mds_bal_need_max;
  double needmin = need * g_conf()->mds_bal_need_min;
  double midchunk = need * g_conf()->mds_bal_midchunk;
  double minchunk = need * g_conf()->mds_bal_minchunk;

  std::vector<CDir*> bigger_rep, bigger_unrep;
  multimap<double, CDir*> smaller;

  double dir_pop = dir->pop_auth_subtree.meta_load();
  dout(7) << "in " << dir_pop << " " << *dir << " need " << need << " (" << needmin << " - " << needmax << ")" << dendl;

  double subdir_sum = 0;
  for (elist<CInode*>::iterator it = dir->pop_lru_subdirs.begin_use_current();
       !it.end(); ) {
    CInode *in = *it;
    ++it;

    ceph_assert(in->is_dir());
    ceph_assert(in->get_parent_dir() == dir);

    auto&& dfls = in->get_nested_dirfrags();

    size_t num_idle_frags = 0;
    for (const auto& subdir : dfls) {
      if (already_exporting.count(subdir))
        continue;

      // we know all ancestor dirfrags up to subtree root are not freezing or frozen.
      // It's more efficient to use CDir::is_{freezing,frozen}_tree_root()
      if (subdir->is_frozen_dir() || subdir->is_frozen_tree_root() ||
          subdir->is_freezing_dir() || subdir->is_freezing_tree_root())
        continue;  // can't export this right now!

      // how popular?
      double pop = subdir->pop_auth_subtree.meta_load();
      subdir_sum += pop;
      dout(15) << "   subdir pop " << pop << " " << *subdir << dendl;

      if (pop < minchunk) {
        num_idle_frags++;
        continue;
      }

      // lucky find?
      if (pop > needmin && pop < needmax) {
        exports->push_back(subdir);
        already_exporting.insert(subdir);
        have += pop;
        return;
      }

      if (pop > need) {
        if (subdir->is_rep())
          bigger_rep.push_back(subdir);
        else
          bigger_unrep.push_back(subdir);
      } else
        smaller.insert(pair<double,CDir*>(pop, subdir));
    }
    if (dfls.size() == num_idle_frags)
      in->item_pop_lru.remove_myself();
  }
  dout(15) << "   sum " << subdir_sum << " / " << dir_pop << dendl;

  // grab some sufficiently big small items
  multimap<double,CDir*>::reverse_iterator it;
  for (it = smaller.rbegin();
       it != smaller.rend();
       ++it) {

    if ((*it).first < midchunk)
      break;  // try later

    dout(7) << "   taking smaller " << *(*it).second << dendl;

    exports->push_back((*it).second);
    already_exporting.insert((*it).second);
    have += (*it).first;
    if (have > needmin)
      return;
  }

  // apparently not enough; drill deeper into the hierarchy (if non-replicated)
  for (const auto& dir : bigger_unrep) {
    dout(15) << "   descending into " << *dir << dendl;
    find_exports(dir, amount, exports, have, already_exporting);
    if (have > needmin)
      return;
  }

  // ok fine, use smaller bits
  for (;
       it != smaller.rend();
       ++it) {
    dout(7) << "   taking (much) smaller " << it->first << " " << *(*it).second << dendl;

    exports->push_back((*it).second);
    already_exporting.insert((*it).second);
    have += (*it).first;
    if (have > needmin)
      return;
  }

  // ok fine, drill into replicated dirs
  for (const auto& dir : bigger_rep) {
    dout(7) << "   descending into replicated " << *dir << dendl;
    find_exports(dir, amount, exports, have, already_exporting);
    if (have > needmin)
      return;
  }
}

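// Record a popularity hit on an inode and propagate it to its parent dirfrag.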
void MDBalancer::hit_inode(CInode *in, int type, int who)
{
  // hit inode
  in->pop.get(type).hit();

  if (in->get_parent_dn())
    hit_dir(in->get_parent_dn()->get_dir(), type, who);
}

void MDBalancer::maybe_fragment(CDir *dir, bool hot)
{
  // split/merge
  if (bal_fragment_dirs && bal_fragment_interval > 0 &&
      dir->is_auth() &&
      !dir->inode->is_base() &&  // not root/mdsdir (for now at least)
      !dir->inode->is_stray()) { // not straydir

    // split
    if (dir->should_split() || hot) {
      if (split_pending.count(dir->dirfrag()) == 0) {
        queue_split(dir, false);
      } else {
        if (dir->should_split_fast()) {
          queue_split(dir, true);
        } else {
          dout(10) << ": fragment already enqueued to split: "
                   << *dir << dendl;
        }
      }
    }

    // merge?
    if (dir->get_frag() != frag_t() && dir->should_merge() &&
        merge_pending.count(dir->dirfrag()) == 0) {
      queue_merge(dir);
    }
  }
}

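// Record a popularity hit on a dirfrag: update its own counters, decide
// whether it is hot enough to split (or idle enough to merge), replicate
// or unreplicate it based on read spread, and propagate the hit up the
// nested/auth-subtree counters of all ancestors.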
void MDBalancer::hit_dir(CDir *dir, int type, int who, double amount)
{
  if (dir->inode->is_stray())
    return;
  // hit me
  double v = dir->pop_me.get(type).hit(amount);

  const bool hot = (v > g_conf()->mds_bal_split_rd && type == META_POP_IRD) ||
                   (v > g_conf()->mds_bal_split_wr && type == META_POP_IWR);

  dout(20) << type << " pop is " << v << ", frag " << dir->get_frag()
           << " size " << dir->get_frag_size() << " " << dir->pop_me << dendl;

  maybe_fragment(dir, hot);

  // replicate?
  if (type == META_POP_IRD && who >= 0) {
    dir->pop_spread.hit(who);
  }

  double rd_adj = 0.0;
  if (type == META_POP_IRD &&
      dir->last_popularity_sample < last_sample) {
    double dir_pop = dir->pop_auth_subtree.get(type).get();  // hmm??
    dir->last_popularity_sample = last_sample;
    double pop_sp = dir->pop_spread.get();
    dir_pop += pop_sp * 10;

    //if (dir->ino() == inodeno_t(0x10000000002))
    if (pop_sp > 0) {
      dout(20) << type << " pop " << dir_pop << " spread " << pop_sp
               << " " << dir->pop_spread.last[0]
               << " " << dir->pop_spread.last[1]
               << " " << dir->pop_spread.last[2]
               << " " << dir->pop_spread.last[3]
               << " in " << *dir << dendl;
    }

    if (dir->is_auth() && !dir->is_ambiguous_auth()) {
      if (dir->can_rep() &&
          dir_pop >= g_conf()->mds_bal_replicate_threshold) {
        // replicate
        double rdp = dir->pop_me.get(META_POP_IRD).get();
        rd_adj = rdp / mds->get_mds_map()->get_num_in_mds() - rdp;
        rd_adj /= 2.0;  // temper somewhat

        dout(5) << "replicating dir " << *dir << " pop " << dir_pop << " .. rdp " << rdp << " adj " << rd_adj << dendl;

        dir->dir_rep = CDir::REP_ALL;
        mds->mdcache->send_dir_updates(dir, true);

        // fixme this should adjust the whole pop hierarchy
        dir->pop_me.get(META_POP_IRD).adjust(rd_adj);
        dir->pop_auth_subtree.get(META_POP_IRD).adjust(rd_adj);
      }

      if (dir->ino() != 1 &&
          dir->is_rep() &&
          dir_pop < g_conf()->mds_bal_unreplicate_threshold) {
        // unreplicate
        dout(5) << "unreplicating dir " << *dir << " pop " << dir_pop << dendl;

        dir->dir_rep = CDir::REP_NONE;
        mds->mdcache->send_dir_updates(dir);
      }
    }
  }

  // adjust ancestors
  bool hit_subtree = dir->is_auth();         // current auth subtree (if any)
  bool hit_subtree_nested = dir->is_auth();  // all nested auth subtrees

  while (true) {
    CDir *pdir = dir->inode->get_parent_dir();
    dir->pop_nested.get(type).hit(amount);
    if (rd_adj != 0.0)
      dir->pop_nested.get(META_POP_IRD).adjust(rd_adj);

    if (hit_subtree) {
      dir->pop_auth_subtree.get(type).hit(amount);

      if (rd_adj != 0.0)
        dir->pop_auth_subtree.get(META_POP_IRD).adjust(rd_adj);

      if (dir->is_subtree_root())
        hit_subtree = false;  // end of auth domain, stop hitting auth counters.
      else if (pdir)
        pdir->pop_lru_subdirs.push_front(&dir->get_inode()->item_pop_lru);
    }

    if (hit_subtree_nested) {
      dir->pop_auth_subtree_nested.get(type).hit(amount);
      if (rd_adj != 0.0)
        dir->pop_auth_subtree_nested.get(META_POP_IRD).adjust(rd_adj);
    }
    if (!pdir) break;
    dir = pdir;
  }
}


/*
 * subtract off an exported chunk.
 *  this excludes *dir itself (encode_export_dir should have taken care of that)
 *  we _just_ do the parents' nested counters.
 *
 * NOTE: call me _after_ forcing *dir into a subtree root,
 *       but _before_ doing the encode_export_dirs.
 */
void MDBalancer::subtract_export(CDir *dir)
{
  dirfrag_load_vec_t subload = dir->pop_auth_subtree;

  while (true) {
    dir = dir->inode->get_parent_dir();
    if (!dir) break;

    dir->pop_nested.sub(subload);
    dir->pop_auth_subtree_nested.sub(subload);
  }
}


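// mirror of subtract_export: fold an imported subtree's load into the
// parents' nested counters.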
void MDBalancer::add_import(CDir *dir)
{
  dirfrag_load_vec_t subload = dir->pop_auth_subtree;

  while (true) {
    dir = dir->inode->get_parent_dir();
    if (!dir) break;

    dir->pop_nested.add(subload);
    dir->pop_auth_subtree_nested.add(subload);
  }
}

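// propagate a renamed directory's popularity into (inc=true) or out of
// (inc=false) the ancestor chain rooted at its new/old parent dirfrag.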
void MDBalancer::adjust_pop_for_rename(CDir *pdir, CDir *dir, bool inc)
{
  bool adjust_subtree_nest = dir->is_auth();
  bool adjust_subtree = adjust_subtree_nest && !dir->is_subtree_root();
  CDir *cur = dir;
  while (true) {
    if (inc) {
      pdir->pop_nested.add(dir->pop_nested);
      if (adjust_subtree) {
        pdir->pop_auth_subtree.add(dir->pop_auth_subtree);
        pdir->pop_lru_subdirs.push_front(&cur->get_inode()->item_pop_lru);
      }

      if (adjust_subtree_nest)
        pdir->pop_auth_subtree_nested.add(dir->pop_auth_subtree_nested);
    } else {
      pdir->pop_nested.sub(dir->pop_nested);
      if (adjust_subtree)
        pdir->pop_auth_subtree.sub(dir->pop_auth_subtree);

      if (adjust_subtree_nest)
        pdir->pop_auth_subtree_nested.sub(dir->pop_auth_subtree_nested);
    }

    if (pdir->is_subtree_root())
      adjust_subtree = false;
    cur = pdir;
    pdir = pdir->inode->get_parent_dir();
    if (!pdir) break;
  }
}

void MDBalancer::handle_mds_failure(mds_rank_t who)
{
  if (0 == who) {
    mds_last_epoch_under_map.clear();
  }
}

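// Dump dirfrag and per-rank load state as formatted output (e.g. for an
// admin-socket query), skipping dirfrags with negligible load.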
int MDBalancer::dump_loads(Formatter *f) const
{
  std::deque<CDir*> dfs;
  if (mds->mdcache->get_root()) {
    mds->mdcache->get_root()->get_dirfrags(dfs);
  } else {
    dout(10) << "no root" << dendl;
  }

  f->open_object_section("loads");

  f->open_array_section("dirfrags");
  while (!dfs.empty()) {
    CDir *dir = dfs.front();
    dfs.pop_front();

    f->open_object_section("dir");
    dir->dump_load(f);
    f->close_section();

    for (auto it = dir->begin(); it != dir->end(); ++it) {
      CInode *in = it->second->get_linkage()->get_inode();
      if (!in || !in->is_dir())
        continue;

      auto&& ls = in->get_dirfrags();
      for (const auto& subdir : ls) {
        if (subdir->pop_nested.meta_load() < .001)
          continue;
        dfs.push_back(subdir);
      }
    }
  }
  f->close_section();  // dirfrags array

  f->open_object_section("mds_load");
  {

    auto dump_mds_load = [f](const mds_load_t& load) {
      f->dump_float("request_rate", load.req_rate);
      f->dump_float("cache_hit_rate", load.cache_hit_rate);
      f->dump_float("queue_length", load.queue_len);
      f->dump_float("cpu_load", load.cpu_load_avg);
      f->dump_float("mds_load", load.mds_load());

      f->open_object_section("auth_dirfrags");
      load.auth.dump(f);
      f->close_section();
      f->open_object_section("all_dirfrags");
      load.all.dump(f);
      f->close_section();
    };

    for (const auto& [rank, load] : mds_load) {
      CachedStackStringStream css;
      *css << "mds." << rank;
      f->open_object_section(css->strv());
      dump_mds_load(load);
      f->close_section();
    }
  }
  f->close_section();  // mds_load

  f->open_object_section("mds_meta_load");
  for (auto& [rank, mload] : mds_meta_load) {
    CachedStackStringStream css;
    *css << "mds." << rank;
    f->dump_float(css->strv(), mload);
  }
  f->close_section();  // mds_meta_load

  f->open_object_section("mds_import_map");
  for (auto& [rank, imports] : mds_import_map) {
    {
      CachedStackStringStream css;
      *css << "mds." << rank;
      f->open_array_section(css->strv());
    }
    for (auto& [rank_from, mload] : imports) {
      f->open_object_section("from");
      CachedStackStringStream css;
      *css << "mds." << rank_from;
      f->dump_float(css->strv(), mload);
      f->close_section();
    }
    f->close_section();  // mds.? array
  }
  f->close_section();  // mds_import_map

  f->close_section();  // loads
  return 0;
}