// ceph/src/mds/MDBalancer.cc (quincy 17.1.0 beta)
// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
// vim: ts=8 sw=2 smarttab
/*
 * Ceph - scalable distributed file system
 *
 * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
 *
 * This is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License version 2.1, as published by the Free Software
 * Foundation.  See file COPYING.
 *
 */

#include "include/compat.h"
#include "mdstypes.h"

#include "mon/MonClient.h"
#include "MDBalancer.h"
#include "MDSRank.h"
#include "MDSMap.h"
#include "CInode.h"
#include "CDir.h"
#include "MDCache.h"
#include "Migrator.h"
#include "Mantle.h"

#include "include/Context.h"
#include "msg/Messenger.h"

#include <fstream>
#include <vector>
#include <map>

using namespace std;

#include "common/config.h"
#include "common/errno.h"

#define dout_context g_ceph_context
#undef dout_prefix
#define dout_prefix *_dout << "mds." << mds->get_nodeid() << ".bal " << __func__ << " "
#undef dout
#define dout(lvl) \
  do {\
    auto subsys = ceph_subsys_mds;\
    if ((dout_context)->_conf->subsys.should_gather(ceph_subsys_mds_balancer, lvl)) {\
      subsys = ceph_subsys_mds_balancer;\
    }\
    dout_impl(dout_context, ceph::dout::need_dynamic(subsys), lvl) dout_prefix
#undef dendl
#define dendl dendl_impl; } while (0)


#define MIN_LOAD     50  // ??
#define MIN_REEXPORT  5  // will automatically reexport
#define MIN_OFFLOAD  10  // point at which i stop trying, close enough


int MDBalancer::proc_message(const cref_t<Message> &m)
{
  switch (m->get_type()) {

  case MSG_MDS_HEARTBEAT:
    handle_heartbeat(ref_cast<MHeartbeat>(m));
    break;

  default:
    derr << " balancer unknown message " << m->get_type() << dendl_impl;
    ceph_abort_msg("balancer unknown message");
  }

  return 0;
}

MDBalancer::MDBalancer(MDSRank *m, Messenger *msgr, MonClient *monc) :
    mds(m), messenger(msgr), mon_client(monc)
{
  bal_fragment_dirs = g_conf().get_val<bool>("mds_bal_fragment_dirs");
  bal_fragment_interval = g_conf().get_val<int64_t>("mds_bal_fragment_interval");
}

void MDBalancer::handle_conf_change(const std::set<std::string>& changed, const MDSMap& mds_map)
{
  if (changed.count("mds_bal_fragment_dirs"))
    bal_fragment_dirs = g_conf().get_val<bool>("mds_bal_fragment_dirs");
  if (changed.count("mds_bal_fragment_interval"))
    bal_fragment_interval = g_conf().get_val<int64_t>("mds_bal_fragment_interval");
}

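/*
 * Walk mdcache->export_pin_queue and act on each pinned inode: pins that
 * reference a rank >= max_mds are parked on the delayed queue; otherwise the
 * pin is resolved to a target rank (explicit, ephemeral-random, or
 * ephemeral-distributed) and each auth dirfrag is split, marked as an aux
 * subtree, or exported to the target as appropriate. Finally, re-check all
 * auth subtrees and export any whose pin points at another active rank.
 */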
void MDBalancer::handle_export_pins(void)
{
  const mds_rank_t max_mds = mds->mdsmap->get_max_mds();
  auto mdcache = mds->mdcache;

  auto &q = mdcache->export_pin_queue;
  auto it = q.begin();
  dout(20) << "export_pin_queue size=" << q.size() << dendl;
  while (it != q.end()) {
    auto cur = it++;
    CInode *in = *cur;
    ceph_assert(in->is_dir());

    mds_rank_t export_pin = in->get_export_pin(false);
    in->check_pin_policy(export_pin);

    if (export_pin >= max_mds) {
      dout(20) << " delay export_pin=" << export_pin << " on " << *in << dendl;
      in->state_clear(CInode::STATE_QUEUEDEXPORTPIN);
      q.erase(cur);

      in->state_set(CInode::STATE_DELAYEDEXPORTPIN);
      mdcache->export_pin_delayed_queue.insert(in);
      continue;
    }

    dout(20) << " executing export_pin=" << export_pin << " on " << *in << dendl;
    unsigned min_frag_bits = 0;
    mds_rank_t target = MDS_RANK_NONE;
    if (export_pin >= 0)
      target = export_pin;
    else if (export_pin == MDS_RANK_EPHEMERAL_RAND)
      target = mdcache->hash_into_rank_bucket(in->ino());
    else if (export_pin == MDS_RANK_EPHEMERAL_DIST)
      min_frag_bits = mdcache->get_ephemeral_dist_frag_bits();

    bool remove = true;
    for (auto&& dir : in->get_dirfrags()) {
      if (!dir->is_auth())
        continue;

      if (export_pin == MDS_RANK_EPHEMERAL_DIST) {
        if (dir->get_frag().bits() < min_frag_bits) {
          if (!dir->state_test(CDir::STATE_CREATING) &&
              !dir->is_frozen() && !dir->is_freezing()) {
            queue_split(dir, true);
          }
          remove = false;
          continue;
        }
        target = mdcache->hash_into_rank_bucket(in->ino(), dir->get_frag());
      }

      if (target == MDS_RANK_NONE) {
        if (dir->state_test(CDir::STATE_AUXSUBTREE)) {
          if (dir->is_frozen() || dir->is_freezing()) {
            // try again later
            remove = false;
            continue;
          }
          dout(10) << " clear auxsubtree on " << *dir << dendl;
          dir->state_clear(CDir::STATE_AUXSUBTREE);
          mds->mdcache->try_subtree_merge(dir);
        }
      } else if (target == mds->get_nodeid()) {
        if (dir->state_test(CDir::STATE_AUXSUBTREE)) {
          ceph_assert(dir->is_subtree_root());
        } else if (dir->state_test(CDir::STATE_CREATING) ||
                   dir->is_frozen() || dir->is_freezing()) {
          // try again later
          remove = false;
          continue;
        } else if (!dir->is_subtree_root()) {
          dir->state_set(CDir::STATE_AUXSUBTREE);
          mds->mdcache->adjust_subtree_auth(dir, mds->get_nodeid());
          dout(10) << " create aux subtree on " << *dir << dendl;
        } else {
          dout(10) << " set auxsubtree bit on " << *dir << dendl;
          dir->state_set(CDir::STATE_AUXSUBTREE);
        }
      } else {
        /* Only export a directory if it's non-empty. An empty directory will
         * be sent back by the importer.
         */
        if (dir->get_num_head_items() > 0) {
          mds->mdcache->migrator->export_dir(dir, target);
        }
        remove = false;
      }
    }

    if (remove) {
      in->state_clear(CInode::STATE_QUEUEDEXPORTPIN);
      q.erase(cur);
    }
  }

  std::vector<CDir*> authsubs = mdcache->get_auth_subtrees();
  bool print_auth_subtrees = true;

  if (authsubs.size() > AUTH_TREES_THRESHOLD &&
      !g_conf()->subsys.should_gather<ceph_subsys_mds, 25>()) {
    dout(15) << "number of auth trees = " << authsubs.size() << "; not "
                "printing auth trees" << dendl;
    print_auth_subtrees = false;
  }

  for (auto &cd : authsubs) {
    mds_rank_t export_pin = cd->inode->get_export_pin();
    cd->inode->check_pin_policy(export_pin);

    if (export_pin == MDS_RANK_EPHEMERAL_DIST) {
      export_pin = mdcache->hash_into_rank_bucket(cd->ino(), cd->get_frag());
    } else if (export_pin == MDS_RANK_EPHEMERAL_RAND) {
      export_pin = mdcache->hash_into_rank_bucket(cd->ino());
    }

    if (print_auth_subtrees)
      dout(25) << "auth tree " << *cd << " export_pin=" << export_pin << dendl;

    if (export_pin >= 0 && export_pin != mds->get_nodeid() &&
        export_pin < mds->mdsmap->get_max_mds()) {
      mdcache->migrator->export_dir(cd, export_pin);
    }
  }
}

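/*
 * Periodic driver, called on the MDS tick: applies export pins, refreshes
 * the popularity sample timestamp, and (on rank 0 only, while active) sends
 * a balancing heartbeat every mds_bal_interval seconds, subject to the
 * mds_bal_max / mds_bal_max_until limits.
 */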
void MDBalancer::tick()
{
  static int num_bal_times = g_conf()->mds_bal_max;
  auto bal_interval = g_conf().get_val<int64_t>("mds_bal_interval");
  auto bal_max_until = g_conf().get_val<int64_t>("mds_bal_max_until");
  time now = clock::now();

  if (g_conf()->mds_bal_export_pin) {
    handle_export_pins();
  }

  // sample?
  if (chrono::duration<double>(now-last_sample).count() >
      g_conf()->mds_bal_sample_interval) {
    dout(15) << "tick last_sample now " << now << dendl;
    last_sample = now;
  }

  // We can use duration_cast below, although the result is an int,
  // because the values from g_conf are also integers.
  // balance?
  if (mds->get_nodeid() == 0
      && mds->is_active()
      && bal_interval > 0
      && chrono::duration_cast<chrono::seconds>(now - last_heartbeat).count() >= bal_interval
      && (num_bal_times || (bal_max_until >= 0 && mds->get_uptime().count() > bal_max_until))) {
    last_heartbeat = now;
    send_heartbeat();
    num_bal_times--;
  }

  mds->mdcache->show_subtrees(10, true);
}


class C_Bal_SendHeartbeat : public MDSInternalContext {
public:
  explicit C_Bal_SendHeartbeat(MDSRank *mds_) : MDSInternalContext(mds_) { }
  void finish(int f) override {
    mds->balancer->send_heartbeat();
  }
};

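/*
 * Collapse a per-rank load vector into a single scalar, according to
 * mds_bal_mode: 0 weighs auth/all metadata load plus request rate and queue
 * length, 1 uses request rate and queue length only, 2 uses CPU load alone.
 */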
double mds_load_t::mds_load() const
{
  switch(g_conf()->mds_bal_mode) {
  case 0:
    return
      .8 * auth.meta_load() +
      .2 * all.meta_load() +
      req_rate +
      10.0 * queue_len;

  case 1:
    return req_rate + 10.0*queue_len;

  case 2:
    return cpu_load_avg;

  }
  ceph_abort();
  return 0;
}

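/*
 * Build this rank's mds_load_t sample: root dirfrag popularity, dispatch
 * queue length, and rates (requests/sec, CPU jiffies/sec, cache hit ratio)
 * derived from the counters accumulated since the previous call.
 */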
mds_load_t MDBalancer::get_load()
{
  auto now = clock::now();

  mds_load_t load{DecayRate()}; /* zero DecayRate! */

  if (mds->mdcache->get_root()) {
    auto&& ls = mds->mdcache->get_root()->get_dirfrags();
    for (auto &d : ls) {
      load.auth.add(d->pop_auth_subtree_nested);
      load.all.add(d->pop_nested);
    }
  } else {
    dout(20) << "no root, no load" << dendl;
  }

  uint64_t num_requests = mds->get_num_requests();
  uint64_t num_traverse = mds->logger->get(l_mds_traverse);
  uint64_t num_traverse_hit = mds->logger->get(l_mds_traverse_hit);

  uint64_t cpu_time = 1;
  {
    string stat_path = PROCPREFIX "/proc/self/stat";
    ifstream stat_file(stat_path);
    if (stat_file.is_open()) {
      vector<string> stat_vec(std::istream_iterator<string>{stat_file},
                              std::istream_iterator<string>());
      if (stat_vec.size() >= 15) {
        // utime + stime
        cpu_time = strtoll(stat_vec[13].c_str(), nullptr, 10) +
                   strtoll(stat_vec[14].c_str(), nullptr, 10);
      } else {
        derr << "input file '" << stat_path << "' not resolvable" << dendl_impl;
      }
    } else {
      derr << "input file '" << stat_path << "' not found" << dendl_impl;
    }
  }

  load.queue_len = messenger->get_dispatch_queue_len();

  bool update_last = true;
  if (last_get_load != clock::zero() &&
      now > last_get_load) {
    double el = std::chrono::duration<double>(now-last_get_load).count();
    if (el >= 1.0) {
      if (num_requests > last_num_requests)
        load.req_rate = (num_requests - last_num_requests) / el;
      if (cpu_time > last_cpu_time)
        load.cpu_load_avg = (cpu_time - last_cpu_time) / el;
      if (num_traverse > last_num_traverse && num_traverse_hit > last_num_traverse_hit)
        load.cache_hit_rate = (double)(num_traverse_hit - last_num_traverse_hit) / (num_traverse - last_num_traverse);
    } else {
      auto p = mds_load.find(mds->get_nodeid());
      if (p != mds_load.end()) {
        load.req_rate = p->second.req_rate;
        load.cpu_load_avg = p->second.cpu_load_avg;
        load.cache_hit_rate = p->second.cache_hit_rate;
      }
      if (num_requests >= last_num_requests && cpu_time >= last_cpu_time &&
          num_traverse >= last_num_traverse && num_traverse_hit >= last_num_traverse_hit)
        update_last = false;
    }
  }

  if (update_last) {
    last_num_requests = num_requests;
    last_cpu_time = cpu_time;
    last_get_load = now;
    last_num_traverse = num_traverse;
    last_num_traverse_hit = num_traverse_hit;
  }

  dout(15) << load << dendl;
  return load;
}

/*
 * Read synchronously from RADOS using a timeout. We cannot do daemon-local
 * fallbacks (i.e. kick off async read when we are processing the map and
 * check status when we get here) with the way the mds is structured.
 */
int MDBalancer::localize_balancer()
{
  /* reset everything */
  bool ack = false;
  int r = 0;
  bufferlist lua_src;
  ceph::mutex lock = ceph::make_mutex("lock");
  ceph::condition_variable cond;

  /* we assume that balancer is in the metadata pool */
  object_t oid = object_t(mds->mdsmap->get_balancer());
  object_locator_t oloc(mds->get_metadata_pool());
  ceph_tid_t tid = mds->objecter->read(oid, oloc, 0, 0, CEPH_NOSNAP, &lua_src, 0,
                                       new C_SafeCond(lock, cond, &ack, &r));
  dout(15) << "launched non-blocking read tid=" << tid
           << " oid=" << oid << " oloc=" << oloc << dendl;

  /* timeout: if we waste half our time waiting for RADOS, then abort! */
  std::cv_status ret_t = [&] {
    auto bal_interval = g_conf().get_val<int64_t>("mds_bal_interval");
    std::unique_lock locker{lock};
    return cond.wait_for(locker, std::chrono::seconds(bal_interval / 2));
  }();
  /* success: store the balancer in memory and set the version. */
  if (!r) {
    if (ret_t == std::cv_status::timeout) {
      mds->objecter->op_cancel(tid, -CEPHFS_ECANCELED);
      return -CEPHFS_ETIMEDOUT;
    }
    bal_code.assign(lua_src.to_str());
    bal_version.assign(oid.name);
    dout(10) << "bal_code=" << bal_code << dendl;
  }
  return r;
}

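/*
 * Broadcast this rank's current load and import map to every other up MDS.
 * Rank 0 starts a new beat epoch here; other ranks send in response to
 * rank 0's heartbeat (see handle_heartbeat).
 */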
void MDBalancer::send_heartbeat()
{
  if (mds->is_cluster_degraded()) {
    dout(10) << "degraded" << dendl;
    return;
  }

  if (!mds->mdcache->is_open()) {
    dout(10) << "not open" << dendl;
    mds->mdcache->wait_for_open(new C_Bal_SendHeartbeat(mds));
    return;
  }

  if (mds->get_nodeid() == 0) {
    beat_epoch++;
    mds_load.clear();
  }

  // my load
  mds_load_t load = get_load();
  mds->logger->set(l_mds_load_cent, 100 * load.mds_load());
  mds->logger->set(l_mds_dispatch_queue_len, load.queue_len);

  auto em = mds_load.emplace(std::piecewise_construct, std::forward_as_tuple(mds->get_nodeid()), std::forward_as_tuple(load));
  if (!em.second) {
    em.first->second = load;
  }

  // import_map -- how much do i import from whom
  map<mds_rank_t, float> import_map;
  for (auto& im : mds->mdcache->get_auth_subtrees()) {
    mds_rank_t from = im->inode->authority().first;
    if (from == mds->get_nodeid()) continue;
    if (im->get_inode()->is_stray()) continue;
    import_map[from] += im->pop_auth_subtree.meta_load();
  }
  mds_import_map[ mds->get_nodeid() ] = import_map;

  dout(3) << " epoch " << beat_epoch << " load " << load << dendl;
  for (const auto& [rank, load] : import_map) {
    dout(5) << "  import_map from " << rank << " -> " << load << dendl;
  }

  set<mds_rank_t> up;
  mds->get_mds_map()->get_up_mds_set(up);
  for (const auto& r : up) {
    if (r == mds->get_nodeid())
      continue;
    auto hb = make_message<MHeartbeat>(load, beat_epoch);
    hb->get_import_map() = import_map;
    mds->send_message_mds(hb, r);
  }
}

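/*
 * Process a heartbeat from another rank: record the sender's load and import
 * map, keep beat_epoch in sync with rank 0, and once loads from every in-MDS
 * rank have arrived kick off rebalancing (Mantle if a balancer is set,
 * otherwise the built-in prep_rebalance).
 */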
void MDBalancer::handle_heartbeat(const cref_t<MHeartbeat> &m)
{
  mds_rank_t who = mds_rank_t(m->get_source().num());
  dout(25) << "=== got heartbeat " << m->get_beat() << " from " << m->get_source().num() << " " << m->get_load() << dendl;

  if (!mds->is_active())
    return;

  if (!mds->mdcache->is_open()) {
    dout(10) << "opening root on handle_heartbeat" << dendl;
    mds->mdcache->wait_for_open(new C_MDS_RetryMessage(mds, m));
    return;
  }

  if (mds->is_cluster_degraded()) {
    dout(10) << " degraded, ignoring" << dendl;
    return;
  }

  if (mds->get_nodeid() != 0 && m->get_beat() > beat_epoch) {
    dout(10) << "receive next epoch " << m->get_beat() << " from mds." << who << " before mds0" << dendl;

    beat_epoch = m->get_beat();
    // clear the mds load info whose epoch is less than beat_epoch
    mds_load.clear();
  }

  if (who == 0) {
    dout(20) << " from mds0, new epoch " << m->get_beat() << dendl;
    if (beat_epoch != m->get_beat()) {
      beat_epoch = m->get_beat();
      mds_load.clear();
    }

    send_heartbeat();

    mds->mdcache->show_subtrees();
  } else if (mds->get_nodeid() == 0) {
    if (beat_epoch != m->get_beat()) {
      dout(10) << " old heartbeat epoch, ignoring" << dendl;
      return;
    }
  }

  {
    auto em = mds_load.emplace(std::piecewise_construct, std::forward_as_tuple(who), std::forward_as_tuple(m->get_load()));
    if (!em.second) {
      em.first->second = m->get_load();
    }
  }
  mds_import_map[who] = m->get_import_map();

  {
    unsigned cluster_size = mds->get_mds_map()->get_num_in_mds();
    if (mds_load.size() == cluster_size) {
      // let's go!
      //export_empties();  // no!

      /* avoid spamming ceph -w if user does not turn mantle on */
      if (mds->mdsmap->get_balancer() != "") {
        int r = mantle_prep_rebalance();
        if (!r) return;
        mds->clog->warn() << "using old balancer; mantle failed for "
                          << "balancer=" << mds->mdsmap->get_balancer()
                          << " : " << cpp_strerror(r);
      }
      prep_rebalance(m->get_beat());
    }
  }
}

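/*
 * Match an exporter with spare load against an importer with spare capacity:
 * transfer min(maxex, maxim), record it in the balance state, and decrement
 * both remaining amounts in place. Returns the amount matched.
 */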
double MDBalancer::try_match(balance_state_t& state, mds_rank_t ex, double& maxex,
                             mds_rank_t im, double& maxim)
{
  if (maxex <= 0 || maxim <= 0) return 0.0;

  double howmuch = std::min(maxex, maxim);

  dout(5) << "   - mds." << ex << " exports " << howmuch << " to mds." << im << dendl;

  if (ex == mds->get_nodeid())
    state.targets[im] += howmuch;

  state.exported[ex] += howmuch;
  state.imported[im] += howmuch;

  maxex -= howmuch;
  maxim -= howmuch;

  return howmuch;
}

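/*
 * Schedule a split of the given dirfrag. The actual split runs from a
 * callback: immediately via the MDSRank waiter queue when fast=true (with
 * the split_pending entry deduplicating the two paths), otherwise after
 * mds_bal_fragment_interval so a burst of ops can drain before the freeze.
 */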
void MDBalancer::queue_split(const CDir *dir, bool fast)
{
  dout(10) << __func__ << " enqueuing " << *dir
           << " (fast=" << fast << ")" << dendl;

  const dirfrag_t df = dir->dirfrag();

  auto callback = [this, df](int r) {
    if (split_pending.erase(df) == 0) {
      // Someone beat me to it. This can happen in the fast splitting
      // path, because we spawn two contexts, one with mds->timer and
      // one with mds->queue_waiter. The loser can safely just drop
      // out.
      return;
    }

    auto mdcache = mds->mdcache;

    CDir *dir = mdcache->get_dirfrag(df);
    if (!dir) {
      dout(10) << "drop split on " << df << " because not in cache" << dendl;
      return;
    }
    if (!dir->is_auth()) {
      dout(10) << "drop split on " << df << " because non-auth" << dendl;
      return;
    }

    // Pass on to MDCache: note that the split might still not
    // happen if the checks in MDCache::can_fragment fail.
    dout(10) << __func__ << " splitting " << *dir << dendl;
    int bits = g_conf()->mds_bal_split_bits;
    if (dir->inode->is_ephemeral_dist()) {
      unsigned min_frag_bits = mdcache->get_ephemeral_dist_frag_bits();
      if (df.frag.bits() + bits < min_frag_bits)
        bits = min_frag_bits - df.frag.bits();
    }
    mdcache->split_dir(dir, bits);
  };

  auto ret = split_pending.insert(df);
  bool is_new = ret.second;

  if (fast) {
    // Do the split ASAP: enqueue it in the MDSRank waiters which are
    // run at the end of dispatching the current request
    mds->queue_waiter(new MDSInternalContextWrapper(mds,
          new LambdaContext(std::move(callback))));
  } else if (is_new) {
    // Set a timer to really do the split: we don't do it immediately
    // so that bursts of ops on a directory have a chance to go through
    // before we freeze it.
    mds->timer.add_event_after(bal_fragment_interval,
                               new LambdaContext(std::move(callback)));
  }
}

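/*
 * Schedule a merge for the given dirfrag after mds_bal_fragment_interval,
 * with merge_pending ensuring one context per frag. The callback walks up
 * the frag tree as long as all sibling frags are cached, auth, and willing
 * to merge, then asks MDCache to merge down to the resulting frag.
 */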
void MDBalancer::queue_merge(CDir *dir)
{
  const auto frag = dir->dirfrag();
  auto callback = [this, frag](int r) {
    ceph_assert(frag.frag != frag_t());

    // frag must be in this set because only one context is in flight
    // for a given frag at a time (because merge_pending is checked before
    // starting one), and this context is the only one that erases it.
    merge_pending.erase(frag);

    auto mdcache = mds->mdcache;
    CDir *dir = mdcache->get_dirfrag(frag);
    if (!dir) {
      dout(10) << "drop merge on " << frag << " because not in cache" << dendl;
      return;
    }
    ceph_assert(dir->dirfrag() == frag);

    if(!dir->is_auth()) {
      dout(10) << "drop merge on " << *dir << " because lost auth" << dendl;
      return;
    }

    dout(10) << "merging " << *dir << dendl;

    CInode *diri = dir->get_inode();

    unsigned min_frag_bits = 0;
    if (diri->is_ephemeral_dist())
      min_frag_bits = mdcache->get_ephemeral_dist_frag_bits();

    frag_t fg = dir->get_frag();
    while (fg.bits() > min_frag_bits) {
      frag_t sibfg = fg.get_sibling();
      auto&& [complete, sibs] = diri->get_dirfrags_under(sibfg);
      if (!complete) {
        dout(10) << "  not all sibs under " << sibfg << " in cache (have " << sibs << ")" << dendl;
        break;
      }
      bool all = true;
      for (auto& sib : sibs) {
        if (!sib->is_auth() || !sib->should_merge()) {
          all = false;
          break;
        }
      }
      if (!all) {
        dout(10) << "  not all sibs under " << sibfg << " " << sibs << " should_merge" << dendl;
        break;
      }
      dout(10) << "  all sibs under " << sibfg << " " << sibs << " should merge" << dendl;
      fg = fg.parent();
    }

    if (fg != dir->get_frag())
      mdcache->merge_dir(diri, fg);
  };

  if (merge_pending.count(frag) == 0) {
    dout(20) << " enqueued dir " << *dir << dendl;
    merge_pending.insert(frag);
    mds->timer.add_event_after(bal_fragment_interval,
        new LambdaContext(std::move(callback)));
  } else {
    dout(20) << " dir already in queue " << *dir << dendl;
  }
}

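/*
 * The built-in balancing policy. Rescale every rank's mds_load into
 * meta_load units, compute the cluster-average target load, and bail out
 * unless this rank has been overloaded for at least two epochs. Otherwise
 * partition ranks into importers and exporters and match them up: first
 * against each exporter's known import sources, then alternating
 * big-to-big and small-to-big pairings by beat parity, before handing the
 * resulting targets to try_rebalance.
 */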
void MDBalancer::prep_rebalance(int beat)
{
  balance_state_t state;

  if (g_conf()->mds_thrash_exports) {
    //we're going to randomly export to all the mds in the cluster
    set<mds_rank_t> up_mds;
    mds->get_mds_map()->get_up_mds_set(up_mds);
    for (const auto &rank : up_mds) {
      state.targets[rank] = 0.0;
    }
  } else {
    int cluster_size = mds->get_mds_map()->get_num_in_mds();
    mds_rank_t whoami = mds->get_nodeid();
    rebalance_time = clock::now();

    dout(7) << "cluster loads are" << dendl;

    mds->mdcache->migrator->clear_export_queue();

    // rescale! turn my mds_load back into meta_load units
    double load_fac = 1.0;
    map<mds_rank_t, mds_load_t>::iterator m = mds_load.find(whoami);
    if ((m != mds_load.end()) && (m->second.mds_load() > 0)) {
      double metald = m->second.auth.meta_load();
      double mdsld = m->second.mds_load();
      load_fac = metald / mdsld;
      dout(7) << " load_fac is " << load_fac
              << " <- " << m->second.auth << " " << metald
              << " / " << mdsld
              << dendl;
    }

    mds_meta_load.clear();

    double total_load = 0.0;
    multimap<double,mds_rank_t> load_map;
    for (mds_rank_t i=mds_rank_t(0); i < mds_rank_t(cluster_size); i++) {
      mds_load_t& load = mds_load.at(i);

      double l = load.mds_load() * load_fac;
      mds_meta_load[i] = l;

      if (whoami == 0)
        dout(7) << "  mds." << i
                << " " << load
                << " = " << load.mds_load()
                << " ~ " << l << dendl;

      if (whoami == i) my_load = l;
      total_load += l;

      load_map.insert(pair<double,mds_rank_t>( l, i ));
    }

    // target load
    target_load = total_load / (double)cluster_size;
    dout(7) << "my load " << my_load
            << "   target " << target_load
            << "   total " << total_load
            << dendl;

    // under or over?
    for (const auto& [load, rank] : load_map) {
      if (load < target_load * (1.0 + g_conf()->mds_bal_min_rebalance)) {
        dout(7) << " mds." << rank << " is underloaded or barely overloaded." << dendl;
        mds_last_epoch_under_map[rank] = beat_epoch;
      }
    }

    int last_epoch_under = mds_last_epoch_under_map[whoami];
    if (last_epoch_under == beat_epoch) {
      dout(7) << "  i am underloaded or barely overloaded, doing nothing." << dendl;
      return;
    }
    // am i over long enough?
    if (last_epoch_under && beat_epoch - last_epoch_under < 2) {
      dout(7) << "  i am overloaded, but only for " << (beat_epoch - last_epoch_under) << " epochs" << dendl;
      return;
    }

    dout(7) << "  i am sufficiently overloaded" << dendl;

    // first separate exporters and importers
    multimap<double,mds_rank_t> importers;
    multimap<double,mds_rank_t> exporters;
    set<mds_rank_t>             importer_set;
    set<mds_rank_t>             exporter_set;

    for (multimap<double,mds_rank_t>::iterator it = load_map.begin();
         it != load_map.end();
         ++it) {
      if (it->first < target_load) {
        dout(15) << "   mds." << it->second << " is importer" << dendl;
        importers.insert(pair<double,mds_rank_t>(it->first,it->second));
        importer_set.insert(it->second);
      } else {
        int mds_last_epoch_under = mds_last_epoch_under_map[it->second];
        if (!(mds_last_epoch_under && beat_epoch - mds_last_epoch_under < 2)) {
          dout(15) << "   mds." << it->second << " is exporter" << dendl;
          exporters.insert(pair<double,mds_rank_t>(it->first,it->second));
          exporter_set.insert(it->second);
        }
      }
    }

    // determine load transfer mapping

    if (true) {
      // analyze import_map; do any matches i can

      dout(15) << "  matching exporters to import sources" << dendl;

      // big -> small exporters
      for (multimap<double,mds_rank_t>::reverse_iterator ex = exporters.rbegin();
           ex != exporters.rend();
           ++ex) {
        double maxex = get_maxex(state, ex->second);
        if (maxex <= .001) continue;

        // check importers. for now, just in arbitrary order (no intelligent matching).
        for (map<mds_rank_t, float>::iterator im = mds_import_map[ex->second].begin();
             im != mds_import_map[ex->second].end();
             ++im) {
          double maxim = get_maxim(state, im->first);
          if (maxim <= .001) continue;
          try_match(state, ex->second, maxex, im->first, maxim);
          if (maxex <= .001) break;
        }
      }
    }

    // old way
    if (beat % 2 == 1) {
      dout(15) << "  matching big exporters to big importers" << dendl;
      // big exporters to big importers
      multimap<double,mds_rank_t>::reverse_iterator ex = exporters.rbegin();
      multimap<double,mds_rank_t>::iterator im = importers.begin();
      while (ex != exporters.rend() &&
             im != importers.end()) {
        double maxex = get_maxex(state, ex->second);
        double maxim = get_maxim(state, im->second);
        if (maxex < .001 || maxim < .001) break;
        try_match(state, ex->second, maxex, im->second, maxim);
        if (maxex <= .001) ++ex;
        if (maxim <= .001) ++im;
      }
    } else { // new way
      dout(15) << "  matching small exporters to big importers" << dendl;
      // small exporters to big importers
      multimap<double,mds_rank_t>::iterator ex = exporters.begin();
      multimap<double,mds_rank_t>::iterator im = importers.begin();
      while (ex != exporters.end() &&
             im != importers.end()) {
        double maxex = get_maxex(state, ex->second);
        double maxim = get_maxim(state, im->second);
        if (maxex < .001 || maxim < .001) break;
        try_match(state, ex->second, maxex, im->second, maxim);
        if (maxex <= .001) ++ex;
        if (maxim <= .001) ++im;
      }
    }
  }
  try_rebalance(state);
}

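/*
 * Mantle (Lua) balancing policy: refresh the balancer script from RADOS if
 * its version changed, feed per-rank metrics into Mantle, and use the
 * resulting targets for try_rebalance. Returns nonzero so the caller can
 * fall back to the built-in balancer on failure.
 */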
int MDBalancer::mantle_prep_rebalance()
{
  balance_state_t state;

  /* refresh balancer if it has changed */
  if (bal_version != mds->mdsmap->get_balancer()) {
    bal_version.assign("");
    int r = localize_balancer();
    if (r) return r;

    /* only spam the cluster log from 1 mds on version changes */
    if (mds->get_nodeid() == 0)
      mds->clog->info() << "mantle balancer version changed: " << bal_version;
  }

  /* prepare for balancing */
  int cluster_size = mds->get_mds_map()->get_num_in_mds();
  rebalance_time = clock::now();
  mds->mdcache->migrator->clear_export_queue();

  /* fill in the metrics for each mds by grabbing load struct */
  vector < map<string, double> > metrics (cluster_size);
  for (mds_rank_t i=mds_rank_t(0); i < mds_rank_t(cluster_size); i++) {
    mds_load_t& load = mds_load.at(i);

    metrics[i] = {{"auth.meta_load", load.auth.meta_load()},
                  {"all.meta_load", load.all.meta_load()},
                  {"req_rate", load.req_rate},
                  {"queue_len", load.queue_len},
                  {"cpu_load_avg", load.cpu_load_avg}};
  }

  /* execute the balancer */
  Mantle mantle;
  int ret = mantle.balance(bal_code, mds->get_nodeid(), metrics, state.targets);
  dout(7) << " mantle decided that new targets=" << state.targets << dendl;

  /* mantle doesn't know about cluster size, so check target len here */
  if ((int) state.targets.size() != cluster_size)
    return -CEPHFS_EINVAL;
  else if (ret)
    return ret;

  try_rebalance(state);
  return 0;
}

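/*
 * Execute the targets computed by prep_rebalance or Mantle. Idle imports are
 * handed back to their previous owners; then, for each target rank, load is
 * satisfied first by re-exporting subtrees originally imported from that
 * rank, then by re-exporting other imports, and finally by carving fragments
 * out of the local workload via find_exports.
 */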
void MDBalancer::try_rebalance(balance_state_t& state)
{
  if (g_conf()->mds_thrash_exports) {
    dout(5) << "mds_thrash is on; not performing standard rebalance operation!"
            << dendl;
    return;
  }

  // make a sorted list of my imports
  multimap<double, CDir*> import_pop_map;
  multimap<mds_rank_t, pair<CDir*, double> > import_from_map;

  for (auto& dir : mds->mdcache->get_fullauth_subtrees()) {
    CInode *diri = dir->get_inode();
    if (diri->is_mdsdir())
      continue;
    if (diri->get_export_pin(false) != MDS_RANK_NONE)
      continue;
    if (dir->is_freezing() || dir->is_frozen())
      continue;  // export probably already in progress

    mds_rank_t from = diri->authority().first;
    double pop = dir->pop_auth_subtree.meta_load();
    if (g_conf()->mds_bal_idle_threshold > 0 &&
        pop < g_conf()->mds_bal_idle_threshold &&
        diri != mds->mdcache->get_root() &&
        from != mds->get_nodeid()) {
      dout(5) << " exporting idle (" << pop << ") import " << *dir
              << " back to mds." << from << dendl;
      mds->mdcache->migrator->export_dir_nicely(dir, from);
      continue;
    }

    dout(15) << "  map: i imported " << *dir << " from " << from << dendl;
    import_pop_map.insert(make_pair(pop, dir));
    import_from_map.insert(make_pair(from, make_pair(dir, pop)));
  }

  // do my exports!
  map<mds_rank_t, double> export_pop_map;

  for (auto &it : state.targets) {
    mds_rank_t target = it.first;
    double amount = it.second;

    if (amount < MIN_OFFLOAD)
      continue;
    if (amount * 10 * state.targets.size() < target_load)
      continue;

    dout(5) << "want to send " << amount << " to mds." << target
      //<< " .. " << (*it).second << " * " << load_fac
            << " -> " << amount
            << dendl;//" .. fudge is " << fudge << dendl;

    double& have = export_pop_map[target];

    mds->mdcache->show_subtrees();

    // search imports from target
    if (import_from_map.count(target)) {
      dout(7) << " aha, looking through imports from target mds." << target << dendl;
      for (auto p = import_from_map.equal_range(target);
           p.first != p.second; ) {
        CDir *dir = p.first->second.first;
        double pop = p.first->second.second;
        dout(7) << "considering " << *dir << " from " << (*p.first).first << dendl;
        auto plast = p.first++;

        if (dir->inode->is_base())
          continue;
        ceph_assert(dir->inode->authority().first == target);  // cuz that's how i put it in the map, dummy

        if (pop <= amount-have) {
          dout(7) << "reexporting " << *dir << " pop " << pop
                  << " back to mds." << target << dendl;
          mds->mdcache->migrator->export_dir_nicely(dir, target);
          have += pop;
          import_from_map.erase(plast);
          for (auto q = import_pop_map.equal_range(pop);
               q.first != q.second; ) {
            if (q.first->second == dir) {
              import_pop_map.erase(q.first);
              break;
            }
            q.first++;
          }
        } else {
          dout(7) << "can't reexport " << *dir << ", too big " << pop << dendl;
        }
        if (amount-have < MIN_OFFLOAD)
          break;
      }
    }
  }

  // any other imports
  for (auto &it : state.targets) {
    mds_rank_t target = it.first;
    double amount = it.second;

    if (!export_pop_map.count(target))
      continue;
    double& have = export_pop_map[target];
    if (amount-have < MIN_OFFLOAD)
      continue;

    for (auto p = import_pop_map.begin();
         p != import_pop_map.end(); ) {
      CDir *dir = p->second;
      if (dir->inode->is_base()) {
        ++p;
        continue;
      }

      double pop = p->first;
      if (pop <= amount-have && pop > MIN_REEXPORT) {
        dout(5) << "reexporting " << *dir << " pop " << pop
                << " to mds." << target << dendl;
        have += pop;
        mds->mdcache->migrator->export_dir_nicely(dir, target);
        import_pop_map.erase(p++);
      } else {
        ++p;
      }
      if (amount-have < MIN_OFFLOAD)
        break;
    }
  }

  set<CDir*> already_exporting;

  for (auto &it : state.targets) {
    mds_rank_t target = it.first;
    double amount = it.second;

    if (!export_pop_map.count(target))
      continue;
    double& have = export_pop_map[target];
    if (amount-have < MIN_OFFLOAD)
      continue;

    // okay, search for fragments of my workload
    std::vector<CDir*> exports;

    for (auto p = import_pop_map.rbegin();
         p != import_pop_map.rend();
         ++p) {
      CDir *dir = p->second;
      find_exports(dir, amount, &exports, have, already_exporting);
      if (amount-have < MIN_OFFLOAD)
        break;
    }
    //fudge = amount - have;

    for (const auto& dir : exports) {
      dout(5) << "   - exporting " << dir->pop_auth_subtree
              << " " << dir->pop_auth_subtree.meta_load()
              << " to mds." << target << " " << *dir << dendl;
      mds->mdcache->migrator->export_dir_nicely(dir, target);
    }
  }

  dout(7) << "done" << dendl;
  mds->mdcache->show_subtrees();
}

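/*
 * Recursively collect subdirectory frags of *dir to export until 'have'
 * reaches the needed share of 'amount'. Prefers a single frag landing in the
 * (needmin, needmax) window, then sufficiently big smaller frags, then
 * descends into bigger unreplicated (and finally replicated) frags. Gives up
 * and pretends to be done if the scan exceeds 0.1s of the rebalance.
 */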
void MDBalancer::find_exports(CDir *dir,
                              double amount,
                              std::vector<CDir*>* exports,
                              double& have,
                              set<CDir*>& already_exporting)
{
  auto now = clock::now();
  auto duration = std::chrono::duration<double>(now-rebalance_time).count();
  if (duration > 0.1) {
    derr << " balancer runs too long" << dendl_impl;
    have = amount;
    return;
  }

  ceph_assert(dir->is_auth());

  double need = amount - have;
  if (need < amount * g_conf()->mds_bal_min_start)
    return;   // good enough!

  double needmax = need * g_conf()->mds_bal_need_max;
  double needmin = need * g_conf()->mds_bal_need_min;
  double midchunk = need * g_conf()->mds_bal_midchunk;
  double minchunk = need * g_conf()->mds_bal_minchunk;

  std::vector<CDir*> bigger_rep, bigger_unrep;
  multimap<double, CDir*> smaller;

  double dir_pop = dir->pop_auth_subtree.meta_load();
  dout(7) << "in " << dir_pop << " " << *dir << " need " << need << " (" << needmin << " - " << needmax << ")" << dendl;

  double subdir_sum = 0;
  for (elist<CInode*>::iterator it = dir->pop_lru_subdirs.begin_use_current();
       !it.end(); ) {
    CInode *in = *it;
    ++it;

    ceph_assert(in->is_dir());
    ceph_assert(in->get_parent_dir() == dir);

    auto&& dfls = in->get_nested_dirfrags();

    size_t num_idle_frags = 0;
    for (const auto& subdir : dfls) {
      if (already_exporting.count(subdir))
        continue;

      // we know all ancestor dirfrags up to subtree root are not freezing or frozen.
      // It's more efficient to use CDir::is_{freezing,frozen}_tree_root()
      if (subdir->is_frozen_dir() || subdir->is_frozen_tree_root() ||
          subdir->is_freezing_dir() || subdir->is_freezing_tree_root())
        continue;  // can't export this right now!

      // how popular?
      double pop = subdir->pop_auth_subtree.meta_load();
      subdir_sum += pop;
      dout(15) << "   subdir pop " << pop << " " << *subdir << dendl;

      if (pop < minchunk) {
        num_idle_frags++;
        continue;
      }

      // lucky find?
      if (pop > needmin && pop < needmax) {
        exports->push_back(subdir);
        already_exporting.insert(subdir);
        have += pop;
        return;
      }

      if (pop > need) {
        if (subdir->is_rep())
          bigger_rep.push_back(subdir);
        else
          bigger_unrep.push_back(subdir);
      } else
        smaller.insert(pair<double,CDir*>(pop, subdir));
    }
    if (dfls.size() == num_idle_frags)
      in->item_pop_lru.remove_myself();
  }
  dout(15) << "   sum " << subdir_sum << " / " << dir_pop << dendl;

  // grab some sufficiently big small items
  multimap<double,CDir*>::reverse_iterator it;
  for (it = smaller.rbegin();
       it != smaller.rend();
       ++it) {

    if ((*it).first < midchunk)
      break;  // try later

    dout(7) << "   taking smaller " << *(*it).second << dendl;

    exports->push_back((*it).second);
    already_exporting.insert((*it).second);
    have += (*it).first;
    if (have > needmin)
      return;
  }

  // apparently not enough; drill deeper into the hierarchy (if non-replicated)
  for (const auto& dir : bigger_unrep) {
    dout(15) << "   descending into " << *dir << dendl;
    find_exports(dir, amount, exports, have, already_exporting);
    if (have > needmin)
      return;
  }

  // ok fine, use smaller bits
  for (;
       it != smaller.rend();
       ++it) {
    dout(7) << "   taking (much) smaller " << it->first << " " << *(*it).second << dendl;

    exports->push_back((*it).second);
    already_exporting.insert((*it).second);
    have += (*it).first;
    if (have > needmin)
      return;
  }

  // ok fine, drill into replicated dirs
  for (const auto& dir : bigger_rep) {
    dout(7) << "   descending into replicated " << *dir << dendl;
    find_exports(dir, amount, exports, have, already_exporting);
    if (have > needmin)
      return;
  }
}

void MDBalancer::hit_inode(CInode *in, int type, int who)
{
  // hit inode
  in->pop.get(type).hit();

  if (in->get_parent_dn())
    hit_dir(in->get_parent_dn()->get_dir(), type, who);
}

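/*
 * Check whether this dirfrag should be split or merged, given the current
 * fragmentation settings and whether a recent hit made it "hot": queue a
 * split (fast if warranted) or a merge, deduplicated against the pending
 * sets.
 */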
void MDBalancer::maybe_fragment(CDir *dir, bool hot)
{
  // split/merge
  if (bal_fragment_dirs && bal_fragment_interval > 0 &&
      dir->is_auth() &&
      !dir->inode->is_base() &&  // not root/mdsdir (for now at least)
      !dir->inode->is_stray()) { // not straydir

    // split
    if (dir->should_split() || hot) {
      if (split_pending.count(dir->dirfrag()) == 0) {
        queue_split(dir, false);
      } else {
        if (dir->should_split_fast()) {
          queue_split(dir, true);
        } else {
          dout(10) << ": fragment already enqueued to split: "
                   << *dir << dendl;
        }
      }
    }

    // merge?
    if (dir->get_frag() != frag_t() && dir->should_merge() &&
        merge_pending.count(dir->dirfrag()) == 0) {
      queue_merge(dir);
    }
  }
}

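/*
 * Record a popularity hit on a dirfrag: bump its own counters, consider
 * splitting or merging, replicate (or unreplicate) hot read-mostly dirs, and
 * propagate the hit (and any replication read adjustment) up through the
 * nested and auth-subtree counters of all ancestors.
 */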
void MDBalancer::hit_dir(CDir *dir, int type, int who, double amount)
{
  if (dir->inode->is_stray())
    return;
  // hit me
  double v = dir->pop_me.get(type).hit(amount);

  const bool hot = (v > g_conf()->mds_bal_split_rd && type == META_POP_IRD) ||
                   (v > g_conf()->mds_bal_split_wr && type == META_POP_IWR);

  dout(20) << type << " pop is " << v << ", frag " << dir->get_frag()
           << " size " << dir->get_frag_size() << " " << dir->pop_me << dendl;

  maybe_fragment(dir, hot);

  // replicate?
  if (type == META_POP_IRD && who >= 0) {
    dir->pop_spread.hit(who);
  }

  double rd_adj = 0.0;
  if (type == META_POP_IRD &&
      dir->last_popularity_sample < last_sample) {
    double dir_pop = dir->pop_auth_subtree.get(type).get();    // hmm??
    dir->last_popularity_sample = last_sample;
    double pop_sp = dir->pop_spread.get();
    dir_pop += pop_sp * 10;

    //if (dir->ino() == inodeno_t(0x10000000002))
    if (pop_sp > 0) {
      dout(20) << type << " pop " << dir_pop << " spread " << pop_sp
               << " " << dir->pop_spread.last[0]
               << " " << dir->pop_spread.last[1]
               << " " << dir->pop_spread.last[2]
               << " " << dir->pop_spread.last[3]
               << " in " << *dir << dendl;
    }

    if (dir->is_auth() && !dir->is_ambiguous_auth()) {
      if (dir->can_rep() &&
          dir_pop >= g_conf()->mds_bal_replicate_threshold) {
        // replicate
        double rdp = dir->pop_me.get(META_POP_IRD).get();
        rd_adj = rdp / mds->get_mds_map()->get_num_in_mds() - rdp;
        rd_adj /= 2.0;  // temper somewhat

        dout(5) << "replicating dir " << *dir << " pop " << dir_pop << " .. rdp " << rdp << " adj " << rd_adj << dendl;

        dir->dir_rep = CDir::REP_ALL;
        mds->mdcache->send_dir_updates(dir, true);

        // fixme this should adjust the whole pop hierarchy
        dir->pop_me.get(META_POP_IRD).adjust(rd_adj);
        dir->pop_auth_subtree.get(META_POP_IRD).adjust(rd_adj);
      }

      if (dir->ino() != 1 &&
          dir->is_rep() &&
          dir_pop < g_conf()->mds_bal_unreplicate_threshold) {
        // unreplicate
        dout(5) << "unreplicating dir " << *dir << " pop " << dir_pop << dendl;

        dir->dir_rep = CDir::REP_NONE;
        mds->mdcache->send_dir_updates(dir);
      }
    }
  }

  // adjust ancestors
  bool hit_subtree = dir->is_auth();         // current auth subtree (if any)
  bool hit_subtree_nested = dir->is_auth();  // all nested auth subtrees

  while (true) {
    CDir *pdir = dir->inode->get_parent_dir();
    dir->pop_nested.get(type).hit(amount);
    if (rd_adj != 0.0)
      dir->pop_nested.get(META_POP_IRD).adjust(rd_adj);

    if (hit_subtree) {
      dir->pop_auth_subtree.get(type).hit(amount);

      if (rd_adj != 0.0)
        dir->pop_auth_subtree.get(META_POP_IRD).adjust(rd_adj);

      if (dir->is_subtree_root())
        hit_subtree = false;  // end of auth domain, stop hitting auth counters.
      else if (pdir)
        pdir->pop_lru_subdirs.push_front(&dir->get_inode()->item_pop_lru);
    }

    if (hit_subtree_nested) {
      dir->pop_auth_subtree_nested.get(type).hit(amount);
      if (rd_adj != 0.0)
        dir->pop_auth_subtree_nested.get(META_POP_IRD).adjust(rd_adj);
    }
    if (!pdir) break;
    dir = pdir;
  }
}

/*
 * subtract off an exported chunk.
 *  this excludes *dir itself (encode_export_dir should have taken care of that)
 *  we _just_ do the parents' nested counters.
 *
 * NOTE: call me _after_ forcing *dir into a subtree root,
 *       but _before_ doing the encode_export_dirs.
 */
void MDBalancer::subtract_export(CDir *dir)
{
  dirfrag_load_vec_t subload = dir->pop_auth_subtree;

  while (true) {
    dir = dir->inode->get_parent_dir();
    if (!dir) break;

    dir->pop_nested.sub(subload);
    dir->pop_auth_subtree_nested.sub(subload);
  }
}

void MDBalancer::add_import(CDir *dir)
{
  dirfrag_load_vec_t subload = dir->pop_auth_subtree;

  while (true) {
    dir = dir->inode->get_parent_dir();
    if (!dir) break;

    dir->pop_nested.add(subload);
    dir->pop_auth_subtree_nested.add(subload);
  }
}

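/*
 * When a dirfrag moves under a new parent (rename), add (inc=true) or
 * subtract (inc=false) its popularity from every ancestor's nested and
 * auth-subtree counters, mirroring what hit_dir maintains incrementally.
 */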
void MDBalancer::adjust_pop_for_rename(CDir *pdir, CDir *dir, bool inc)
{
  bool adjust_subtree_nest = dir->is_auth();
  bool adjust_subtree = adjust_subtree_nest && !dir->is_subtree_root();
  CDir *cur = dir;
  while (true) {
    if (inc) {
      pdir->pop_nested.add(dir->pop_nested);
      if (adjust_subtree) {
        pdir->pop_auth_subtree.add(dir->pop_auth_subtree);
        pdir->pop_lru_subdirs.push_front(&cur->get_inode()->item_pop_lru);
      }

      if (adjust_subtree_nest)
        pdir->pop_auth_subtree_nested.add(dir->pop_auth_subtree_nested);
    } else {
      pdir->pop_nested.sub(dir->pop_nested);
      if (adjust_subtree)
        pdir->pop_auth_subtree.sub(dir->pop_auth_subtree);

      if (adjust_subtree_nest)
        pdir->pop_auth_subtree_nested.sub(dir->pop_auth_subtree_nested);
    }

    if (pdir->is_subtree_root())
      adjust_subtree = false;
    cur = pdir;
    pdir = pdir->inode->get_parent_dir();
    if (!pdir) break;
  }
}

void MDBalancer::handle_mds_failure(mds_rank_t who)
{
  if (0 == who) {
    mds_last_epoch_under_map.clear();
  }
}

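/*
 * Dump dirfrag and per-rank load state as JSON via the given Formatter: a
 * breadth-first walk of cached dirfrags (skipping ones with negligible
 * nested load), followed by the mds_load, mds_meta_load, and mds_import_map
 * tables.
 */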
int MDBalancer::dump_loads(Formatter *f) const
{
  std::deque<CDir*> dfs;
  if (mds->mdcache->get_root()) {
    mds->mdcache->get_root()->get_dirfrags(dfs);
  } else {
    dout(10) << "no root" << dendl;
  }

  f->open_object_section("loads");

  f->open_array_section("dirfrags");
  while (!dfs.empty()) {
    CDir *dir = dfs.front();
    dfs.pop_front();

    f->open_object_section("dir");
    dir->dump_load(f);
    f->close_section();

    for (auto it = dir->begin(); it != dir->end(); ++it) {
      CInode *in = it->second->get_linkage()->get_inode();
      if (!in || !in->is_dir())
        continue;

      auto&& ls = in->get_dirfrags();
      for (const auto& subdir : ls) {
        if (subdir->pop_nested.meta_load() < .001)
          continue;
        dfs.push_back(subdir);
      }
    }
  }
  f->close_section();  // dirfrags array

  f->open_object_section("mds_load");
  {
    auto dump_mds_load = [f](const mds_load_t& load) {
      f->dump_float("request_rate", load.req_rate);
      f->dump_float("cache_hit_rate", load.cache_hit_rate);
      f->dump_float("queue_length", load.queue_len);
      f->dump_float("cpu_load", load.cpu_load_avg);
      f->dump_float("mds_load", load.mds_load());

      f->open_object_section("auth_dirfrags");
      load.auth.dump(f);
      f->close_section();
      f->open_object_section("all_dirfrags");
      load.all.dump(f);
      f->close_section();
    };

    for (const auto& [rank, load] : mds_load) {
      CachedStackStringStream css;
      *css << "mds." << rank;
      f->open_object_section(css->strv());
      dump_mds_load(load);
      f->close_section();
    }
  }
  f->close_section();  // mds_load

  f->open_object_section("mds_meta_load");
  for (auto& [rank, mload] : mds_meta_load) {
    CachedStackStringStream css;
    *css << "mds." << rank;
    f->dump_float(css->strv(), mload);
  }
  f->close_section();  // mds_meta_load

  f->open_object_section("mds_import_map");
  for (auto& [rank, imports] : mds_import_map) {
    {
      CachedStackStringStream css;
      *css << "mds." << rank;
      f->open_array_section(css->strv());
    }
    for (auto& [rank_from, mload] : imports) {
      f->open_object_section("from");
      CachedStackStringStream css;
      *css << "mds." << rank_from;
      f->dump_float(css->strv(), mload);
      f->close_section();
    }
    f->close_section();  // mds.? array
  }
  f->close_section();  // mds_import_map

  f->close_section();  // loads
  return 0;
}