]> git.proxmox.com Git - ceph.git/blame - ceph/src/mds/MDBalancer.cc
import new upstream nautilus stable release 14.2.8
[ceph.git] / ceph / src / mds / MDBalancer.cc
CommitLineData
7c673cae
FG
1// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
2// vim: ts=8 sw=2 smarttab
3/*
4 * Ceph - scalable distributed file system
5 *
6 * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
7 *
8 * This is free software; you can redistribute it and/or
9 * modify it under the terms of the GNU Lesser General Public
10 * License version 2.1, as published by the Free Software
11 * Foundation. See file COPYING.
12 *
13 */
14
15#include "include/compat.h"
16#include "mdstypes.h"
17
91327a77 18#include "mon/MonClient.h"
7c673cae
FG
19#include "MDBalancer.h"
20#include "MDSRank.h"
7c673cae
FG
21#include "MDSMap.h"
22#include "CInode.h"
23#include "CDir.h"
24#include "MDCache.h"
25#include "Migrator.h"
26#include "Mantle.h"
27
28#include "include/Context.h"
29#include "msg/Messenger.h"
7c673cae
FG
30
31#include <fstream>
32#include <iostream>
33#include <vector>
34#include <map>
35using std::map;
36using std::vector;
11fdf7f2 37using std::chrono::duration_cast;
7c673cae
FG
38
39#include "common/config.h"
40#include "common/errno.h"
41
42#define dout_context g_ceph_context
7c673cae
FG
43#undef dout_prefix
44#define dout_prefix *_dout << "mds." << mds->get_nodeid() << ".bal "
45#undef dout
46#define dout(lvl) \
47 do {\
48 auto subsys = ceph_subsys_mds;\
49 if ((dout_context)->_conf->subsys.should_gather(ceph_subsys_mds_balancer, lvl)) {\
50 subsys = ceph_subsys_mds_balancer;\
51 }\
11fdf7f2 52 dout_impl(dout_context, ceph::dout::need_dynamic(subsys), lvl) dout_prefix
7c673cae
FG
53#undef dendl
54#define dendl dendl_impl; } while (0)
55
56
57#define MIN_LOAD 50 // ??
58#define MIN_REEXPORT 5 // will automatically reexport
59#define MIN_OFFLOAD 10 // point at which i stop trying, close enough
60
61
11fdf7f2 62int MDBalancer::proc_message(const Message::const_ref &m)
7c673cae
FG
63{
64 switch (m->get_type()) {
65
66 case MSG_MDS_HEARTBEAT:
11fdf7f2 67 handle_heartbeat(MHeartbeat::msgref_cast(m));
7c673cae
FG
68 break;
69
70 default:
b32b8144 71 derr << " balancer unknown message " << m->get_type() << dendl_impl;
11fdf7f2 72 ceph_abort_msg("balancer unknown message");
7c673cae
FG
73 }
74
75 return 0;
76}
77
91327a77
AA
78MDBalancer::MDBalancer(MDSRank *m, Messenger *msgr, MonClient *monc) :
79 mds(m), messenger(msgr), mon_client(monc)
80{
11fdf7f2
TL
81 bal_fragment_dirs = g_conf().get_val<bool>("mds_bal_fragment_dirs");
82 bal_fragment_interval = g_conf().get_val<int64_t>("mds_bal_fragment_interval");
91327a77
AA
83}
84
92f5a8d4 85void MDBalancer::handle_conf_change(const std::set<std::string>& changed, const MDSMap& mds_map)
91327a77 86{
11fdf7f2
TL
87 if (changed.count("mds_bal_fragment_dirs"))
88 bal_fragment_dirs = g_conf().get_val<bool>("mds_bal_fragment_dirs");
91327a77 89 if (changed.count("mds_bal_fragment_interval"))
11fdf7f2 90 bal_fragment_interval = g_conf().get_val<int64_t>("mds_bal_fragment_interval");
91327a77
AA
91}
92
7c673cae
FG
93void MDBalancer::handle_export_pins(void)
94{
95 auto &q = mds->mdcache->export_pin_queue;
96 auto it = q.begin();
97 dout(20) << "export_pin_queue size=" << q.size() << dendl;
98 while (it != q.end()) {
31f18b77
FG
99 auto cur = it++;
100 CInode *in = *cur;
11fdf7f2 101 ceph_assert(in->is_dir());
31f18b77 102 mds_rank_t export_pin = in->get_export_pin(false);
eafe8130
TL
103 if (export_pin >= mds->mdsmap->get_max_mds()) {
104 dout(20) << " delay export pin on " << *in << dendl;
105 in->state_clear(CInode::STATE_QUEUEDEXPORTPIN);
106 q.erase(cur);
107
108 in->state_set(CInode::STATE_DELAYEDEXPORTPIN);
109 mds->mdcache->export_pin_delayed_queue.insert(in);
110 continue;
111 }
31f18b77
FG
112
113 bool remove = true;
114 list<CDir*> dfls;
115 in->get_dirfrags(dfls);
116 for (auto dir : dfls) {
117 if (!dir->is_auth())
118 continue;
119
120 if (export_pin == MDS_RANK_NONE) {
121 if (dir->state_test(CDir::STATE_AUXSUBTREE)) {
122 if (dir->is_frozen() || dir->is_freezing()) {
123 // try again later
124 remove = false;
125 continue;
126 }
127 dout(10) << " clear auxsubtree on " << *dir << dendl;
128 dir->state_clear(CDir::STATE_AUXSUBTREE);
129 mds->mdcache->try_subtree_merge(dir);
130 }
131 } else if (export_pin == mds->get_nodeid()) {
132 if (dir->state_test(CDir::STATE_CREATING) ||
133 dir->is_frozen() || dir->is_freezing()) {
134 // try again later
135 remove = false;
136 continue;
137 }
138 if (!dir->is_subtree_root()) {
139 dir->state_set(CDir::STATE_AUXSUBTREE);
140 mds->mdcache->adjust_subtree_auth(dir, mds->get_nodeid());
141 dout(10) << " create aux subtree on " << *dir << dendl;
142 } else if (!dir->state_test(CDir::STATE_AUXSUBTREE)) {
143 dout(10) << " set auxsubtree bit on " << *dir << dendl;
144 dir->state_set(CDir::STATE_AUXSUBTREE);
145 }
146 } else {
147 mds->mdcache->migrator->export_dir(dir, export_pin);
148 remove = false;
7c673cae
FG
149 }
150 }
31f18b77
FG
151
152 if (remove) {
153 in->state_clear(CInode::STATE_QUEUEDEXPORTPIN);
154 q.erase(cur);
7c673cae
FG
155 }
156 }
157
81eedcae
TL
158 std::vector<CDir *> authsubs = mds->mdcache->get_auth_subtrees();
159 bool print_auth_subtrees = true;
160
161 if (authsubs.size() > AUTH_TREES_THRESHOLD &&
162 !g_conf()->subsys.should_gather<ceph_subsys_mds, 25>()) {
163 dout(15) << "number of auth trees = " << authsubs.size() << "; not "
164 "printing auth trees" << dendl;
165 print_auth_subtrees = false;
166 }
167
168 for (auto &cd : authsubs) {
7c673cae 169 mds_rank_t export_pin = cd->inode->get_export_pin();
81eedcae
TL
170
171 if (print_auth_subtrees) {
172 dout(25) << "auth tree " << *cd << " export_pin=" << export_pin <<
173 dendl;
174 }
175
eafe8130
TL
176 if (export_pin >= 0 && export_pin < mds->mdsmap->get_max_mds()
177 && export_pin != mds->get_nodeid()) {
7c673cae
FG
178 mds->mdcache->migrator->export_dir(cd, export_pin);
179 }
180 }
181}
182
183void MDBalancer::tick()
184{
11fdf7f2
TL
185 static int num_bal_times = g_conf()->mds_bal_max;
186 auto bal_interval = g_conf().get_val<int64_t>("mds_bal_interval");
187 auto bal_max_until = g_conf().get_val<int64_t>("mds_bal_max_until");
188 time now = clock::now();
7c673cae 189
11fdf7f2 190 if (g_conf()->mds_bal_export_pin) {
7c673cae
FG
191 handle_export_pins();
192 }
193
194 // sample?
11fdf7f2
TL
195 if (chrono::duration<double>(now-last_sample).count() >
196 g_conf()->mds_bal_sample_interval) {
7c673cae
FG
197 dout(15) << "tick last_sample now " << now << dendl;
198 last_sample = now;
199 }
200
11fdf7f2
TL
201 // We can use duration_cast below, although the result is an int,
202 // because the values from g_conf are also integers.
7c673cae 203 // balance?
11fdf7f2
TL
204 if (mds->get_nodeid() == 0
205 && mds->is_active()
206 && bal_interval > 0
207 && duration_cast<chrono::seconds>(now - last_heartbeat).count() >= bal_interval
208 && (num_bal_times || (bal_max_until >= 0 && mds->get_uptime().count() > bal_max_until))) {
7c673cae
FG
209 last_heartbeat = now;
210 send_heartbeat();
211 num_bal_times--;
212 }
81eedcae
TL
213
214 mds->mdcache->show_subtrees(10, true);
7c673cae
FG
215}
216
217
218
219
220class C_Bal_SendHeartbeat : public MDSInternalContext {
221public:
222 explicit C_Bal_SendHeartbeat(MDSRank *mds_) : MDSInternalContext(mds_) { }
223 void finish(int f) override {
224 mds->balancer->send_heartbeat();
225 }
226};
227
228
11fdf7f2 229double mds_load_t::mds_load() const
7c673cae 230{
11fdf7f2 231 switch(g_conf()->mds_bal_mode) {
7c673cae
FG
232 case 0:
233 return
234 .8 * auth.meta_load() +
235 .2 * all.meta_load() +
236 req_rate +
237 10.0 * queue_len;
238
239 case 1:
240 return req_rate + 10.0*queue_len;
241
242 case 2:
243 return cpu_load_avg;
244
245 }
246 ceph_abort();
247 return 0;
248}
249
11fdf7f2 250mds_load_t MDBalancer::get_load()
7c673cae 251{
11fdf7f2
TL
252 auto now = clock::now();
253
254 mds_load_t load{DecayRate()}; /* zero DecayRate! */
7c673cae
FG
255
256 if (mds->mdcache->get_root()) {
257 list<CDir*> ls;
258 mds->mdcache->get_root()->get_dirfrags(ls);
11fdf7f2
TL
259 for (auto &d : ls) {
260 load.auth.add(d->pop_auth_subtree_nested);
261 load.all.add(d->pop_nested);
7c673cae
FG
262 }
263 } else {
264 dout(20) << "get_load no root, no load" << dendl;
265 }
266
28e407b8 267 uint64_t num_requests = mds->get_num_requests();
91327a77
AA
268
269 uint64_t cpu_time = 1;
270 {
271 string stat_path = PROCPREFIX "/proc/self/stat";
272 ifstream stat_file(stat_path);
273 if (stat_file.is_open()) {
274 vector<string> stat_vec(std::istream_iterator<string>{stat_file},
275 std::istream_iterator<string>());
276 if (stat_vec.size() >= 15) {
277 // utime + stime
278 cpu_time = strtoll(stat_vec[13].c_str(), nullptr, 10) +
279 strtoll(stat_vec[14].c_str(), nullptr, 10);
280 } else {
281 derr << "input file '" << stat_path << "' not resolvable" << dendl_impl;
282 }
283 } else {
284 derr << "input file '" << stat_path << "' not found" << dendl_impl;
285 }
286 }
287
288 load.queue_len = messenger->get_dispatch_queue_len();
289
290 bool update_last = true;
11fdf7f2 291 if (last_get_load != clock::zero() &&
91327a77 292 now > last_get_load) {
11fdf7f2
TL
293 double el = std::chrono::duration<double>(now-last_get_load).count();
294 if (el >= 1.0) {
91327a77 295 if (num_requests > last_num_requests)
11fdf7f2 296 load.req_rate = (num_requests - last_num_requests) / el;
91327a77 297 if (cpu_time > last_cpu_time)
11fdf7f2 298 load.cpu_load_avg = (cpu_time - last_cpu_time) / el;
91327a77
AA
299 } else {
300 auto p = mds_load.find(mds->get_nodeid());
301 if (p != mds_load.end()) {
302 load.req_rate = p->second.req_rate;
303 load.cpu_load_avg = p->second.cpu_load_avg;
304 }
305 if (num_requests >= last_num_requests && cpu_time >= last_cpu_time)
306 update_last = false;
28e407b8
AA
307 }
308 }
28e407b8 309
91327a77
AA
310 if (update_last) {
311 last_num_requests = num_requests;
312 last_cpu_time = cpu_time;
313 last_get_load = now;
314 }
7c673cae 315
7c673cae
FG
316 dout(15) << "get_load " << load << dendl;
317 return load;
318}
319
320/*
321 * Read synchronously from RADOS using a timeout. We cannot do daemon-local
322 * fallbacks (i.e. kick off async read when we are processing the map and
323 * check status when we get here) with the way the mds is structured.
324 */
325int MDBalancer::localize_balancer()
326{
327 /* reset everything */
328 bool ack = false;
329 int r = 0;
330 bufferlist lua_src;
331 Mutex lock("lock");
332 Cond cond;
333
334 /* we assume that balancer is in the metadata pool */
335 object_t oid = object_t(mds->mdsmap->get_balancer());
336 object_locator_t oloc(mds->mdsmap->get_metadata_pool());
337 ceph_tid_t tid = mds->objecter->read(oid, oloc, 0, 0, CEPH_NOSNAP, &lua_src, 0,
338 new C_SafeCond(&lock, &cond, &ack, &r));
339 dout(15) << "launched non-blocking read tid=" << tid
340 << " oid=" << oid << " oloc=" << oloc << dendl;
341
342 /* timeout: if we waste half our time waiting for RADOS, then abort! */
11fdf7f2 343 auto bal_interval = g_conf().get_val<int64_t>("mds_bal_interval");
7c673cae 344 lock.Lock();
91327a77 345 int ret_t = cond.WaitInterval(lock, utime_t(bal_interval / 2, 0));
7c673cae
FG
346 lock.Unlock();
347
348 /* success: store the balancer in memory and set the version. */
349 if (!r) {
350 if (ret_t == ETIMEDOUT) {
351 mds->objecter->op_cancel(tid, -ECANCELED);
352 return -ETIMEDOUT;
353 }
354 bal_code.assign(lua_src.to_str());
355 bal_version.assign(oid.name);
b32b8144 356 dout(10) << "localized balancer, bal_code=" << bal_code << dendl;
7c673cae
FG
357 }
358 return r;
359}
360
361void MDBalancer::send_heartbeat()
362{
7c673cae
FG
363 if (mds->is_cluster_degraded()) {
364 dout(10) << "send_heartbeat degraded" << dendl;
365 return;
366 }
367
368 if (!mds->mdcache->is_open()) {
369 dout(5) << "not open" << dendl;
370 mds->mdcache->wait_for_open(new C_Bal_SendHeartbeat(mds));
371 return;
372 }
373
94b18763 374 if (mds->get_nodeid() == 0) {
7c673cae 375 beat_epoch++;
94b18763
FG
376 mds_load.clear();
377 }
7c673cae
FG
378
379 // my load
11fdf7f2 380 mds_load_t load = get_load();
28e407b8
AA
381 mds->logger->set(l_mds_load_cent, 100 * load.mds_load());
382 mds->logger->set(l_mds_dispatch_queue_len, load.queue_len);
383
11fdf7f2
TL
384 auto em = mds_load.emplace(std::piecewise_construct, std::forward_as_tuple(mds->get_nodeid()), std::forward_as_tuple(load));
385 if (!em.second) {
386 em.first->second = load;
387 }
7c673cae
FG
388
389 // import_map -- how much do i import from whom
390 map<mds_rank_t, float> import_map;
11fdf7f2 391 for (auto& im : mds->mdcache->get_auth_subtrees()) {
7c673cae
FG
392 mds_rank_t from = im->inode->authority().first;
393 if (from == mds->get_nodeid()) continue;
394 if (im->get_inode()->is_stray()) continue;
11fdf7f2 395 import_map[from] += im->pop_auth_subtree.meta_load();
7c673cae
FG
396 }
397 mds_import_map[ mds->get_nodeid() ] = import_map;
398
399
400 dout(5) << "mds." << mds->get_nodeid() << " epoch " << beat_epoch << " load " << load << dendl;
401 for (map<mds_rank_t, float>::iterator it = import_map.begin();
402 it != import_map.end();
403 ++it) {
404 dout(5) << " import_map from " << it->first << " -> " << it->second << dendl;
405 }
406
407
408 set<mds_rank_t> up;
409 mds->get_mds_map()->get_up_mds_set(up);
11fdf7f2
TL
410 for (const auto& r : up) {
411 if (r == mds->get_nodeid())
7c673cae 412 continue;
11fdf7f2 413 auto hb = MHeartbeat::create(load, beat_epoch);
7c673cae 414 hb->get_import_map() = import_map;
11fdf7f2 415 mds->send_message_mds(hb, r);
7c673cae
FG
416 }
417}
418
11fdf7f2 419void MDBalancer::handle_heartbeat(const MHeartbeat::const_ref &m)
7c673cae 420{
7c673cae
FG
421 mds_rank_t who = mds_rank_t(m->get_source().num());
422 dout(25) << "=== got heartbeat " << m->get_beat() << " from " << m->get_source().num() << " " << m->get_load() << dendl;
423
424 if (!mds->is_active())
11fdf7f2 425 return;
7c673cae
FG
426
427 if (!mds->mdcache->is_open()) {
428 dout(10) << "opening root on handle_heartbeat" << dendl;
429 mds->mdcache->wait_for_open(new C_MDS_RetryMessage(mds, m));
430 return;
431 }
432
433 if (mds->is_cluster_degraded()) {
434 dout(10) << " degraded, ignoring" << dendl;
11fdf7f2 435 return;
7c673cae
FG
436 }
437
94b18763
FG
438 if (mds->get_nodeid() != 0 && m->get_beat() > beat_epoch) {
439 dout(10) << "receive next epoch " << m->get_beat() << " from mds." << who << " before mds0" << dendl;
440
441 beat_epoch = m->get_beat();
442 // clear the mds load info whose epoch is less than beat_epoch
443 mds_load.clear();
444 }
445
7c673cae 446 if (who == 0) {
94b18763
FG
447 dout(20) << " from mds0, new epoch " << m->get_beat() << dendl;
448 if (beat_epoch != m->get_beat()) {
28e407b8 449 beat_epoch = m->get_beat();
94b18763
FG
450 mds_load.clear();
451 }
28e407b8 452
7c673cae
FG
453 send_heartbeat();
454
455 mds->mdcache->show_subtrees();
28e407b8
AA
456 } else if (mds->get_nodeid() == 0) {
457 if (beat_epoch != m->get_beat()) {
458 dout(10) << " old heartbeat epoch, ignoring" << dendl;
11fdf7f2 459 return;
7c673cae
FG
460 }
461 }
28e407b8 462
11fdf7f2
TL
463 {
464 auto em = mds_load.emplace(std::piecewise_construct, std::forward_as_tuple(who), std::forward_as_tuple(m->get_load()));
465 if (!em.second) {
466 em.first->second = m->get_load();
467 }
468 }
28e407b8 469 mds_import_map[who] = m->get_import_map();
7c673cae 470
7c673cae
FG
471 {
472 unsigned cluster_size = mds->get_mds_map()->get_num_in_mds();
473 if (mds_load.size() == cluster_size) {
474 // let's go!
475 //export_empties(); // no!
476
477 /* avoid spamming ceph -w if user does not turn mantle on */
478 if (mds->mdsmap->get_balancer() != "") {
479 int r = mantle_prep_rebalance();
11fdf7f2 480 if (!r) return;
7c673cae
FG
481 mds->clog->warn() << "using old balancer; mantle failed for "
482 << "balancer=" << mds->mdsmap->get_balancer()
483 << " : " << cpp_strerror(r);
484 }
485 prep_rebalance(m->get_beat());
486 }
487 }
7c673cae
FG
488}
489
7c673cae
FG
490double MDBalancer::try_match(balance_state_t& state, mds_rank_t ex, double& maxex,
491 mds_rank_t im, double& maxim)
492{
493 if (maxex <= 0 || maxim <= 0) return 0.0;
494
11fdf7f2 495 double howmuch = std::min(maxex, maxim);
7c673cae
FG
496 if (howmuch <= 0) return 0.0;
497
498 dout(5) << " - mds." << ex << " exports " << howmuch << " to mds." << im << dendl;
499
500 if (ex == mds->get_nodeid())
501 state.targets[im] += howmuch;
502
503 state.exported[ex] += howmuch;
504 state.imported[im] += howmuch;
505
506 maxex -= howmuch;
507 maxim -= howmuch;
508
509 return howmuch;
510}
511
512void MDBalancer::queue_split(const CDir *dir, bool fast)
513{
514 dout(10) << __func__ << " enqueuing " << *dir
515 << " (fast=" << fast << ")" << dendl;
516
7c673cae
FG
517 const dirfrag_t frag = dir->dirfrag();
518
519 auto callback = [this, frag](int r) {
520 if (split_pending.erase(frag) == 0) {
521 // Someone beat me to it. This can happen in the fast splitting
522 // path, because we spawn two contexts, one with mds->timer and
523 // one with mds->queue_waiter. The loser can safely just drop
524 // out.
525 return;
526 }
527
528 CDir *split_dir = mds->mdcache->get_dirfrag(frag);
529 if (!split_dir) {
530 dout(10) << "drop split on " << frag << " because not in cache" << dendl;
531 return;
532 }
533 if (!split_dir->is_auth()) {
534 dout(10) << "drop split on " << frag << " because non-auth" << dendl;
535 return;
536 }
537
538 // Pass on to MDCache: note that the split might still not
539 // happen if the checks in MDCache::can_fragment fail.
540 dout(10) << __func__ << " splitting " << *split_dir << dendl;
11fdf7f2 541 mds->mdcache->split_dir(split_dir, g_conf()->mds_bal_split_bits);
7c673cae
FG
542 };
543
544 bool is_new = false;
545 if (split_pending.count(frag) == 0) {
546 split_pending.insert(frag);
547 is_new = true;
548 }
549
550 if (fast) {
551 // Do the split ASAP: enqueue it in the MDSRank waiters which are
552 // run at the end of dispatching the current request
553 mds->queue_waiter(new MDSInternalContextWrapper(mds,
554 new FunctionContext(callback)));
555 } else if (is_new) {
556 // Set a timer to really do the split: we don't do it immediately
557 // so that bursts of ops on a directory have a chance to go through
558 // before we freeze it.
91327a77 559 mds->timer.add_event_after(bal_fragment_interval,
7c673cae
FG
560 new FunctionContext(callback));
561 }
562}
563
564void MDBalancer::queue_merge(CDir *dir)
565{
566 const auto frag = dir->dirfrag();
567 auto callback = [this, frag](int r) {
11fdf7f2 568 ceph_assert(frag.frag != frag_t());
7c673cae
FG
569
570 // frag must be in this set because only one context is in flight
571 // for a given frag at a time (because merge_pending is checked before
572 // starting one), and this context is the only one that erases it.
573 merge_pending.erase(frag);
574
575 CDir *dir = mds->mdcache->get_dirfrag(frag);
576 if (!dir) {
577 dout(10) << "drop merge on " << frag << " because not in cache" << dendl;
578 return;
579 }
11fdf7f2 580 ceph_assert(dir->dirfrag() == frag);
7c673cae
FG
581
582 if(!dir->is_auth()) {
583 dout(10) << "drop merge on " << *dir << " because lost auth" << dendl;
584 return;
585 }
586
587 dout(10) << "merging " << *dir << dendl;
588
589 CInode *diri = dir->get_inode();
590
591 frag_t fg = dir->get_frag();
592 while (fg != frag_t()) {
593 frag_t sibfg = fg.get_sibling();
594 list<CDir*> sibs;
595 bool complete = diri->get_dirfrags_under(sibfg, sibs);
596 if (!complete) {
597 dout(10) << " not all sibs under " << sibfg << " in cache (have " << sibs << ")" << dendl;
598 break;
599 }
600 bool all = true;
601 for (list<CDir*>::iterator p = sibs.begin(); p != sibs.end(); ++p) {
602 CDir *sib = *p;
603 if (!sib->is_auth() || !sib->should_merge()) {
604 all = false;
605 break;
606 }
607 }
608 if (!all) {
609 dout(10) << " not all sibs under " << sibfg << " " << sibs << " should_merge" << dendl;
610 break;
611 }
612 dout(10) << " all sibs under " << sibfg << " " << sibs << " should merge" << dendl;
613 fg = fg.parent();
614 }
615
616 if (fg != dir->get_frag())
617 mds->mdcache->merge_dir(diri, fg);
618 };
619
620 if (merge_pending.count(frag) == 0) {
621 dout(20) << __func__ << " enqueued dir " << *dir << dendl;
622 merge_pending.insert(frag);
91327a77 623 mds->timer.add_event_after(bal_fragment_interval,
7c673cae
FG
624 new FunctionContext(callback));
625 } else {
626 dout(20) << __func__ << " dir already in queue " << *dir << dendl;
627 }
628}
629
630void MDBalancer::prep_rebalance(int beat)
631{
632 balance_state_t state;
633
11fdf7f2 634 if (g_conf()->mds_thrash_exports) {
7c673cae
FG
635 //we're going to randomly export to all the mds in the cluster
636 set<mds_rank_t> up_mds;
637 mds->get_mds_map()->get_up_mds_set(up_mds);
638 for (const auto &rank : up_mds) {
639 state.targets[rank] = 0.0;
640 }
641 } else {
642 int cluster_size = mds->get_mds_map()->get_num_in_mds();
643 mds_rank_t whoami = mds->get_nodeid();
11fdf7f2 644 rebalance_time = clock::now();
7c673cae
FG
645
646 dout(5) << " prep_rebalance: cluster loads are" << dendl;
647
648 mds->mdcache->migrator->clear_export_queue();
649
650 // rescale! turn my mds_load back into meta_load units
651 double load_fac = 1.0;
652 map<mds_rank_t, mds_load_t>::iterator m = mds_load.find(whoami);
653 if ((m != mds_load.end()) && (m->second.mds_load() > 0)) {
11fdf7f2 654 double metald = m->second.auth.meta_load();
7c673cae
FG
655 double mdsld = m->second.mds_load();
656 load_fac = metald / mdsld;
657 dout(7) << " load_fac is " << load_fac
658 << " <- " << m->second.auth << " " << metald
659 << " / " << mdsld
660 << dendl;
661 }
662
28e407b8
AA
663 mds_meta_load.clear();
664
7c673cae
FG
665 double total_load = 0.0;
666 multimap<double,mds_rank_t> load_map;
667 for (mds_rank_t i=mds_rank_t(0); i < mds_rank_t(cluster_size); i++) {
28e407b8 668 mds_load_t& load = mds_load.at(i);
7c673cae
FG
669
670 double l = load.mds_load() * load_fac;
671 mds_meta_load[i] = l;
672
673 if (whoami == 0)
b32b8144 674 dout(5) << " mds." << i
7c673cae
FG
675 << " " << load
676 << " = " << load.mds_load()
677 << " ~ " << l << dendl;
678
679 if (whoami == i) my_load = l;
680 total_load += l;
681
682 load_map.insert(pair<double,mds_rank_t>( l, i ));
683 }
684
685 // target load
686 target_load = total_load / (double)cluster_size;
687 dout(5) << "prep_rebalance: my load " << my_load
688 << " target " << target_load
689 << " total " << total_load
690 << dendl;
691
692 // under or over?
28e407b8 693 for (auto p : load_map) {
11fdf7f2 694 if (p.first < target_load * (1.0 + g_conf()->mds_bal_min_rebalance)) {
28e407b8
AA
695 dout(5) << " mds." << p.second << " is underloaded or barely overloaded." << dendl;
696 mds_last_epoch_under_map[p.second] = beat_epoch;
697 }
698 }
699
700 int last_epoch_under = mds_last_epoch_under_map[whoami];
701 if (last_epoch_under == beat_epoch) {
7c673cae 702 dout(5) << " i am underloaded or barely overloaded, doing nothing." << dendl;
7c673cae
FG
703 return;
704 }
7c673cae
FG
705 // am i over long enough?
706 if (last_epoch_under && beat_epoch - last_epoch_under < 2) {
707 dout(5) << " i am overloaded, but only for " << (beat_epoch - last_epoch_under) << " epochs" << dendl;
708 return;
709 }
710
711 dout(5) << " i am sufficiently overloaded" << dendl;
712
713
714 // first separate exporters and importers
715 multimap<double,mds_rank_t> importers;
716 multimap<double,mds_rank_t> exporters;
717 set<mds_rank_t> importer_set;
718 set<mds_rank_t> exporter_set;
719
720 for (multimap<double,mds_rank_t>::iterator it = load_map.begin();
721 it != load_map.end();
722 ++it) {
723 if (it->first < target_load) {
724 dout(15) << " mds." << it->second << " is importer" << dendl;
725 importers.insert(pair<double,mds_rank_t>(it->first,it->second));
726 importer_set.insert(it->second);
727 } else {
28e407b8
AA
728 int mds_last_epoch_under = mds_last_epoch_under_map[it->second];
729 if (!(mds_last_epoch_under && beat_epoch - mds_last_epoch_under < 2)) {
730 dout(15) << " mds." << it->second << " is exporter" << dendl;
731 exporters.insert(pair<double,mds_rank_t>(it->first,it->second));
732 exporter_set.insert(it->second);
733 }
7c673cae
FG
734 }
735 }
736
737
738 // determine load transfer mapping
739
740 if (true) {
741 // analyze import_map; do any matches i can
742
743 dout(15) << " matching exporters to import sources" << dendl;
744
745 // big -> small exporters
746 for (multimap<double,mds_rank_t>::reverse_iterator ex = exporters.rbegin();
747 ex != exporters.rend();
748 ++ex) {
749 double maxex = get_maxex(state, ex->second);
750 if (maxex <= .001) continue;
751
752 // check importers. for now, just in arbitrary order (no intelligent matching).
753 for (map<mds_rank_t, float>::iterator im = mds_import_map[ex->second].begin();
754 im != mds_import_map[ex->second].end();
755 ++im) {
756 double maxim = get_maxim(state, im->first);
757 if (maxim <= .001) continue;
758 try_match(state, ex->second, maxex, im->first, maxim);
759 if (maxex <= .001) break;
760 }
761 }
762 }
763
764 // old way
765 if (beat % 2 == 1) {
766 dout(15) << " matching big exporters to big importers" << dendl;
767 // big exporters to big importers
768 multimap<double,mds_rank_t>::reverse_iterator ex = exporters.rbegin();
769 multimap<double,mds_rank_t>::iterator im = importers.begin();
770 while (ex != exporters.rend() &&
771 im != importers.end()) {
772 double maxex = get_maxex(state, ex->second);
773 double maxim = get_maxim(state, im->second);
774 if (maxex < .001 || maxim < .001) break;
775 try_match(state, ex->second, maxex, im->second, maxim);
776 if (maxex <= .001) ++ex;
777 if (maxim <= .001) ++im;
778 }
779 } else { // new way
780 dout(15) << " matching small exporters to big importers" << dendl;
781 // small exporters to big importers
782 multimap<double,mds_rank_t>::iterator ex = exporters.begin();
783 multimap<double,mds_rank_t>::iterator im = importers.begin();
784 while (ex != exporters.end() &&
785 im != importers.end()) {
786 double maxex = get_maxex(state, ex->second);
787 double maxim = get_maxim(state, im->second);
788 if (maxex < .001 || maxim < .001) break;
789 try_match(state, ex->second, maxex, im->second, maxim);
790 if (maxex <= .001) ++ex;
791 if (maxim <= .001) ++im;
792 }
793 }
794 }
795 try_rebalance(state);
796}
797
7c673cae
FG
798int MDBalancer::mantle_prep_rebalance()
799{
800 balance_state_t state;
801
802 /* refresh balancer if it has changed */
803 if (bal_version != mds->mdsmap->get_balancer()) {
804 bal_version.assign("");
805 int r = localize_balancer();
806 if (r) return r;
807
808 /* only spam the cluster log from 1 mds on version changes */
809 if (mds->get_nodeid() == 0)
810 mds->clog->info() << "mantle balancer version changed: " << bal_version;
811 }
812
813 /* prepare for balancing */
814 int cluster_size = mds->get_mds_map()->get_num_in_mds();
11fdf7f2 815 rebalance_time = clock::now();
7c673cae
FG
816 mds->mdcache->migrator->clear_export_queue();
817
818 /* fill in the metrics for each mds by grabbing load struct */
819 vector < map<string, double> > metrics (cluster_size);
28e407b8
AA
820 for (mds_rank_t i=mds_rank_t(0); i < mds_rank_t(cluster_size); i++) {
821 mds_load_t& load = mds_load.at(i);
7c673cae
FG
822
823 metrics[i] = {{"auth.meta_load", load.auth.meta_load()},
824 {"all.meta_load", load.all.meta_load()},
825 {"req_rate", load.req_rate},
826 {"queue_len", load.queue_len},
827 {"cpu_load_avg", load.cpu_load_avg}};
828 }
829
830 /* execute the balancer */
831 Mantle mantle;
832 int ret = mantle.balance(bal_code, mds->get_nodeid(), metrics, state.targets);
a8e16298 833 dout(5) << " mantle decided that new targets=" << state.targets << dendl;
7c673cae
FG
834
835 /* mantle doesn't know about cluster size, so check target len here */
836 if ((int) state.targets.size() != cluster_size)
837 return -EINVAL;
838 else if (ret)
839 return ret;
840
841 try_rebalance(state);
842 return 0;
843}
844
845
846
847void MDBalancer::try_rebalance(balance_state_t& state)
848{
11fdf7f2 849 if (g_conf()->mds_thrash_exports) {
7c673cae
FG
850 dout(5) << "mds_thrash is on; not performing standard rebalance operation!"
851 << dendl;
852 return;
853 }
854
855 // make a sorted list of my imports
28e407b8
AA
856 multimap<double, CDir*> import_pop_map;
857 multimap<mds_rank_t, pair<CDir*, double> > import_from_map;
7c673cae 858
11fdf7f2 859 for (auto& dir : mds->mdcache->get_fullauth_subtrees()) {
28e407b8
AA
860 CInode *diri = dir->get_inode();
861 if (diri->is_mdsdir())
862 continue;
863 if (diri->get_export_pin(false) != MDS_RANK_NONE)
864 continue;
865 if (dir->is_freezing() || dir->is_frozen())
866 continue; // export pbly already in progress
7c673cae 867
28e407b8 868 mds_rank_t from = diri->authority().first;
11fdf7f2
TL
869 double pop = dir->pop_auth_subtree.meta_load();
870 if (g_conf()->mds_bal_idle_threshold > 0 &&
871 pop < g_conf()->mds_bal_idle_threshold &&
28e407b8
AA
872 diri != mds->mdcache->get_root() &&
873 from != mds->get_nodeid()) {
874 dout(5) << " exporting idle (" << pop << ") import " << *dir
875 << " back to mds." << from << dendl;
876 mds->mdcache->migrator->export_dir_nicely(dir, from);
7c673cae
FG
877 continue;
878 }
879
28e407b8
AA
880 dout(15) << " map: i imported " << *dir << " from " << from << dendl;
881 import_pop_map.insert(make_pair(pop, dir));
882 import_from_map.insert(make_pair(from, make_pair(dir, pop)));
7c673cae
FG
883 }
884
7c673cae 885 // do my exports!
28e407b8 886 map<mds_rank_t, double> export_pop_map;
7c673cae
FG
887
888 for (auto &it : state.targets) {
889 mds_rank_t target = it.first;
890 double amount = it.second;
891
28e407b8
AA
892 if (amount < MIN_OFFLOAD)
893 continue;
91327a77
AA
894 if (amount * 10 * state.targets.size() < target_load)
895 continue;
7c673cae
FG
896
897 dout(5) << "want to send " << amount << " to mds." << target
898 //<< " .. " << (*it).second << " * " << load_fac
899 << " -> " << amount
900 << dendl;//" .. fudge is " << fudge << dendl;
7c673cae 901
28e407b8 902 double& have = export_pop_map[target];
7c673cae
FG
903
904 mds->mdcache->show_subtrees();
905
906 // search imports from target
907 if (import_from_map.count(target)) {
908 dout(5) << " aha, looking through imports from target mds." << target << dendl;
28e407b8
AA
909 for (auto p = import_from_map.equal_range(target);
910 p.first != p.second; ) {
911 CDir *dir = p.first->second.first;
912 double pop = p.first->second.second;
7c673cae 913 dout(5) << "considering " << *dir << " from " << (*p.first).first << dendl;
28e407b8 914 auto plast = p.first++;
7c673cae 915
28e407b8 916 if (dir->inode->is_base())
7c673cae 917 continue;
11fdf7f2 918 ceph_assert(dir->inode->authority().first == target); // cuz that's how i put it in the map, dummy
7c673cae
FG
919
920 if (pop <= amount-have) {
28e407b8 921 dout(5) << "reexporting " << *dir << " pop " << pop
7c673cae
FG
922 << " back to mds." << target << dendl;
923 mds->mdcache->migrator->export_dir_nicely(dir, target);
924 have += pop;
925 import_from_map.erase(plast);
28e407b8
AA
926 for (auto q = import_pop_map.equal_range(pop);
927 q.first != q.second; ) {
928 if (q.first->second == dir) {
929 import_pop_map.erase(q.first);
930 break;
931 }
932 q.first++;
933 }
7c673cae
FG
934 } else {
935 dout(5) << "can't reexport " << *dir << ", too big " << pop << dendl;
936 }
28e407b8
AA
937 if (amount-have < MIN_OFFLOAD)
938 break;
7c673cae
FG
939 }
940 }
28e407b8
AA
941 }
942
943 // any other imports
944 for (auto &it : state.targets) {
945 mds_rank_t target = it.first;
946 double amount = it.second;
947
948 if (!export_pop_map.count(target))
949 continue;
950 double& have = export_pop_map[target];
951 if (amount-have < MIN_OFFLOAD)
7c673cae 952 continue;
7c673cae 953
28e407b8
AA
954 for (auto p = import_pop_map.begin();
955 p != import_pop_map.end(); ) {
956 CDir *dir = p->second;
957 if (dir->inode->is_base()) {
958 ++p;
959 continue;
960 }
7c673cae 961
28e407b8
AA
962 double pop = p->first;
963 if (pop <= amount-have && pop > MIN_REEXPORT) {
964 dout(0) << "reexporting " << *dir << " pop " << pop
965 << " to mds." << target << dendl;
966 have += pop;
967 mds->mdcache->migrator->export_dir_nicely(dir, target);
968 import_pop_map.erase(p++);
969 } else {
970 ++p;
7c673cae 971 }
28e407b8
AA
972 if (amount-have < MIN_OFFLOAD)
973 break;
7c673cae 974 }
28e407b8 975 }
7c673cae 976
28e407b8
AA
977 set<CDir*> already_exporting;
978
979 for (auto &it : state.targets) {
980 mds_rank_t target = it.first;
981 double amount = it.second;
7c673cae 982
28e407b8
AA
983 if (!export_pop_map.count(target))
984 continue;
985 double& have = export_pop_map[target];
986 if (amount-have < MIN_OFFLOAD)
987 continue;
988
989 // okay, search for fragments of my workload
7c673cae
FG
990 list<CDir*> exports;
991
28e407b8
AA
992 for (auto p = import_pop_map.rbegin();
993 p != import_pop_map.rend();
994 ++p) {
995 CDir *dir = p->second;
996 find_exports(dir, amount, exports, have, already_exporting);
997 if (amount-have < MIN_OFFLOAD)
7c673cae
FG
998 break;
999 }
1000 //fudge = amount - have;
1001
28e407b8
AA
1002 for (auto dir : exports) {
1003 dout(5) << " - exporting " << dir->pop_auth_subtree
11fdf7f2 1004 << " " << dir->pop_auth_subtree.meta_load()
28e407b8
AA
1005 << " to mds." << target << " " << *dir << dendl;
1006 mds->mdcache->migrator->export_dir_nicely(dir, target);
7c673cae
FG
1007 }
1008 }
1009
1010 dout(5) << "rebalance done" << dendl;
1011 mds->mdcache->show_subtrees();
1012}
1013
7c673cae
FG
/*
 * Depth-first search of dir's auth subtree for dirfrags whose popularity,
 * added to 'have', moves us toward exporting 'amount' of load to a target.
 *
 * Chosen dirfrags are appended to 'exports' and recorded in
 * 'already_exporting' so subsequent passes skip them; 'have' accumulates
 * the popularity taken so far.  Recurses into over-big children when the
 * current level cannot satisfy the need.
 */
void MDBalancer::find_exports(CDir *dir,
                              double amount,
                              list<CDir*>& exports,
                              double& have,
                              set<CDir*>& already_exporting)
{
  // safety valve: if the rebalance pass has already run for >0.1s,
  // pretend we found enough so the whole recursion unwinds quickly.
  auto now = clock::now();
  auto duration = std::chrono::duration<double>(now-rebalance_time).count();
  if (duration > 0.1) {
    derr << " balancer runs too long"  << dendl_impl;
    have = amount;
    return;
  }

  ceph_assert(dir->is_auth());

  double need = amount - have;
  if (need < amount * g_conf()->mds_bal_min_start)
    return;   // good enough!

  // classification thresholds, all scaled from the remaining 'need'
  double needmax = need * g_conf()->mds_bal_need_max;
  double needmin = need * g_conf()->mds_bal_need_min;
  double midchunk = need * g_conf()->mds_bal_midchunk;
  double minchunk = need * g_conf()->mds_bal_minchunk;

  list<CDir*> bigger_rep, bigger_unrep;
  multimap<double, CDir*> smaller;

  double dir_pop = dir->pop_auth_subtree.meta_load();
  dout(7) << " find_exports in " << dir_pop << " " << *dir << " need " << need << " (" << needmin << " - " << needmax << ")" << dendl;

  double subdir_sum = 0;
  // walk child dir inodes in popularity (lru) order and classify each
  // nested dirfrag by its subtree load
  for (elist<CInode*>::iterator it = dir->pop_lru_subdirs.begin_use_current();
       !it.end(); ) {
    CInode *in = *it;
    ++it;   // advance before possibly unlinking 'in' from the lru below

    ceph_assert(in->is_dir());
    ceph_assert(in->get_parent_dir() == dir);

    list<CDir*> dfls;
    in->get_nested_dirfrags(dfls);

    size_t num_idle_frags = 0;
    for (list<CDir*>::iterator p = dfls.begin();
         p != dfls.end();
         ++p) {
      CDir *subdir = *p;
      if (already_exporting.count(subdir))
        continue;

      // we know all ancestor dirfrags up to subtree root are not freezing or frozen.
      // It's more efficient to use CDir::is_{freezing,frozen}_tree_root()
      if (subdir->is_frozen_dir() || subdir->is_frozen_tree_root() ||
          subdir->is_freezing_dir() || subdir->is_freezing_tree_root())
        continue;  // can't export this right now!

      // how popular?
      double pop = subdir->pop_auth_subtree.meta_load();
      subdir_sum += pop;
      dout(15) << " subdir pop " << pop << " " << *subdir << dendl;

      if (pop < minchunk) {
        num_idle_frags++;
        continue;
      }

      // lucky find?  a frag inside the (needmin, needmax) sweet spot is
      // taken immediately and we are done.
      if (pop > needmin && pop < needmax) {
        exports.push_back(subdir);
        already_exporting.insert(subdir);
        have += pop;
        return;
      }

      // too big to take whole: remember for possible descent, keeping
      // replicated and unreplicated frags apart (unreplicated preferred)
      if (pop > need) {
        if (subdir->is_rep())
          bigger_rep.push_back(subdir);
        else
          bigger_unrep.push_back(subdir);
      } else
        smaller.insert(pair<double,CDir*>(pop, subdir));
    }
    // every frag of this inode was idle -> drop it from the hot-dir lru
    if (dfls.size() == num_idle_frags)
      in->item_pop_lru.remove_myself();
  }
  dout(15) << " sum " << subdir_sum << " / " << dir_pop << dendl;

  // grab some sufficiently big small items
  multimap<double,CDir*>::reverse_iterator it;
  for (it = smaller.rbegin();
       it != smaller.rend();
       ++it) {

    if ((*it).first < midchunk)
      break;  // try later

    dout(7) << " taking smaller " << *(*it).second << dendl;

    exports.push_back((*it).second);
    already_exporting.insert((*it).second);
    have += (*it).first;
    if (have > needmin)
      return;
  }

  // apparently not enough; drill deeper into the hierarchy (if non-replicated)
  // NOTE: this loop's 'it' shadows the multimap iterator above, which the
  // next loop deliberately resumes from.
  for (list<CDir*>::iterator it = bigger_unrep.begin();
       it != bigger_unrep.end();
       ++it) {
    dout(15) << " descending into " << **it << dendl;
    find_exports(*it, amount, exports, have, already_exporting);
    if (have > needmin)
      return;
  }

  // ok fine, use smaller bits (continue where the midchunk loop broke off)
  for (;
       it != smaller.rend();
       ++it) {
    dout(7) << " taking (much) smaller " << it->first << " " << *(*it).second << dendl;

    exports.push_back((*it).second);
    already_exporting.insert((*it).second);
    have += (*it).first;
    if (have > needmin)
      return;
  }

  // ok fine, drill into replicated dirs
  for (list<CDir*>::iterator it = bigger_rep.begin();
       it != bigger_rep.end();
       ++it) {
    dout(7) << " descending into replicated " << **it << dendl;
    find_exports(*it, amount, exports, have, already_exporting);
    if (have > needmin)
      return;
  }
}
1153
11fdf7f2 1154void MDBalancer::hit_inode(CInode *in, int type, int who)
7c673cae
FG
1155{
1156 // hit inode
11fdf7f2 1157 in->pop.get(type).hit();
7c673cae
FG
1158
1159 if (in->get_parent_dn())
11fdf7f2 1160 hit_dir(in->get_parent_dn()->get_dir(), type, who);
7c673cae
FG
1161}
1162
1163void MDBalancer::maybe_fragment(CDir *dir, bool hot)
1164{
1165 // split/merge
11fdf7f2 1166 if (bal_fragment_dirs && bal_fragment_interval > 0 &&
91327a77 1167 dir->is_auth() &&
11fdf7f2 1168 !dir->inode->is_base() && // not root/mdsdir (for now at least)
91327a77 1169 !dir->inode->is_stray()) { // not straydir
7c673cae
FG
1170
1171 // split
11fdf7f2 1172 if (g_conf()->mds_bal_split_size > 0 && (dir->should_split() || hot)) {
7c673cae
FG
1173 if (split_pending.count(dir->dirfrag()) == 0) {
1174 queue_split(dir, false);
1175 } else {
1176 if (dir->should_split_fast()) {
1177 queue_split(dir, true);
1178 } else {
1179 dout(10) << __func__ << ": fragment already enqueued to split: "
1180 << *dir << dendl;
1181 }
1182 }
1183 }
1184
1185 // merge?
1186 if (dir->get_frag() != frag_t() && dir->should_merge() &&
1187 merge_pending.count(dir->dirfrag()) == 0) {
1188 queue_merge(dir);
1189 }
1190 }
1191}
1192
11fdf7f2 1193void MDBalancer::hit_dir(CDir *dir, int type, int who, double amount)
7c673cae
FG
1194{
1195 // hit me
11fdf7f2 1196 double v = dir->pop_me.get(type).hit(amount);
7c673cae 1197
11fdf7f2
TL
1198 const bool hot = (v > g_conf()->mds_bal_split_rd && type == META_POP_IRD) ||
1199 (v > g_conf()->mds_bal_split_wr && type == META_POP_IWR);
7c673cae
FG
1200
1201 dout(20) << "hit_dir " << type << " pop is " << v << ", frag " << dir->get_frag()
11fdf7f2 1202 << " size " << dir->get_frag_size() << " " << dir->pop_me << dendl;
7c673cae
FG
1203
1204 maybe_fragment(dir, hot);
1205
1206 // replicate?
1207 if (type == META_POP_IRD && who >= 0) {
11fdf7f2 1208 dir->pop_spread.hit(who);
7c673cae
FG
1209 }
1210
1211 double rd_adj = 0.0;
1212 if (type == META_POP_IRD &&
1213 dir->last_popularity_sample < last_sample) {
11fdf7f2 1214 double dir_pop = dir->pop_auth_subtree.get(type).get(); // hmm??
7c673cae 1215 dir->last_popularity_sample = last_sample;
11fdf7f2 1216 double pop_sp = dir->pop_spread.get();
7c673cae
FG
1217 dir_pop += pop_sp * 10;
1218
1219 //if (dir->ino() == inodeno_t(0x10000000002))
1220 if (pop_sp > 0) {
1221 dout(20) << "hit_dir " << type << " pop " << dir_pop << " spread " << pop_sp
1222 << " " << dir->pop_spread.last[0]
1223 << " " << dir->pop_spread.last[1]
1224 << " " << dir->pop_spread.last[2]
1225 << " " << dir->pop_spread.last[3]
1226 << " in " << *dir << dendl;
1227 }
1228
1229 if (dir->is_auth() && !dir->is_ambiguous_auth()) {
1230 if (!dir->is_rep() &&
11fdf7f2 1231 dir_pop >= g_conf()->mds_bal_replicate_threshold) {
7c673cae 1232 // replicate
11fdf7f2 1233 double rdp = dir->pop_me.get(META_POP_IRD).get();
7c673cae
FG
1234 rd_adj = rdp / mds->get_mds_map()->get_num_in_mds() - rdp;
1235 rd_adj /= 2.0; // temper somewhat
1236
b32b8144 1237 dout(5) << "replicating dir " << *dir << " pop " << dir_pop << " .. rdp " << rdp << " adj " << rd_adj << dendl;
7c673cae
FG
1238
1239 dir->dir_rep = CDir::REP_ALL;
1240 mds->mdcache->send_dir_updates(dir, true);
1241
1242 // fixme this should adjust the whole pop hierarchy
1243 dir->pop_me.get(META_POP_IRD).adjust(rd_adj);
1244 dir->pop_auth_subtree.get(META_POP_IRD).adjust(rd_adj);
1245 }
1246
1247 if (dir->ino() != 1 &&
1248 dir->is_rep() &&
11fdf7f2 1249 dir_pop < g_conf()->mds_bal_unreplicate_threshold) {
7c673cae 1250 // unreplicate
b32b8144 1251 dout(5) << "unreplicating dir " << *dir << " pop " << dir_pop << dendl;
7c673cae
FG
1252
1253 dir->dir_rep = CDir::REP_NONE;
1254 mds->mdcache->send_dir_updates(dir);
1255 }
1256 }
1257 }
1258
1259 // adjust ancestors
1260 bool hit_subtree = dir->is_auth(); // current auth subtree (if any)
1261 bool hit_subtree_nested = dir->is_auth(); // all nested auth subtrees
1262
1263 while (true) {
28e407b8 1264 CDir *pdir = dir->inode->get_parent_dir();
11fdf7f2 1265 dir->pop_nested.get(type).hit(amount);
7c673cae 1266 if (rd_adj != 0.0)
11fdf7f2 1267 dir->pop_nested.get(META_POP_IRD).adjust(rd_adj);
7c673cae
FG
1268
1269 if (hit_subtree) {
11fdf7f2 1270 dir->pop_auth_subtree.get(type).hit(amount);
28e407b8 1271
7c673cae 1272 if (rd_adj != 0.0)
11fdf7f2 1273 dir->pop_auth_subtree.get(META_POP_IRD).adjust(rd_adj);
28e407b8
AA
1274
1275 if (dir->is_subtree_root())
1276 hit_subtree = false; // end of auth domain, stop hitting auth counters.
1277 else if (pdir)
1278 pdir->pop_lru_subdirs.push_front(&dir->get_inode()->item_pop_lru);
7c673cae
FG
1279 }
1280
1281 if (hit_subtree_nested) {
11fdf7f2 1282 dir->pop_auth_subtree_nested.get(type).hit(amount);
7c673cae 1283 if (rd_adj != 0.0)
11fdf7f2 1284 dir->pop_auth_subtree_nested.get(META_POP_IRD).adjust(rd_adj);
7c673cae 1285 }
28e407b8
AA
1286 if (!pdir) break;
1287 dir = pdir;
7c673cae
FG
1288 }
1289}
1290
1291
1292/*
1293 * subtract off an exported chunk.
1294 * this excludes *dir itself (encode_export_dir should have take care of that)
1295 * we _just_ do the parents' nested counters.
1296 *
1297 * NOTE: call me _after_ forcing *dir into a subtree root,
1298 * but _before_ doing the encode_export_dirs.
1299 */
11fdf7f2 1300void MDBalancer::subtract_export(CDir *dir)
7c673cae
FG
1301{
1302 dirfrag_load_vec_t subload = dir->pop_auth_subtree;
1303
1304 while (true) {
1305 dir = dir->inode->get_parent_dir();
1306 if (!dir) break;
1307
11fdf7f2
TL
1308 dir->pop_nested.sub(subload);
1309 dir->pop_auth_subtree_nested.sub(subload);
7c673cae
FG
1310 }
1311}
1312
1313
11fdf7f2 1314void MDBalancer::add_import(CDir *dir)
7c673cae
FG
1315{
1316 dirfrag_load_vec_t subload = dir->pop_auth_subtree;
1317
1318 while (true) {
1319 dir = dir->inode->get_parent_dir();
1320 if (!dir) break;
1321
11fdf7f2
TL
1322 dir->pop_nested.add(subload);
1323 dir->pop_auth_subtree_nested.add(subload);
7c673cae
FG
1324 }
1325}
1326
/*
 * On rename, add (inc=true) or subtract (inc=false) dirfrag 'dir''s
 * popularity from 'pdir' and every ancestor of 'pdir'.  Auth-subtree
 * counters are only touched while the walk remains inside dir's auth
 * subtree.
 */
void MDBalancer::adjust_pop_for_rename(CDir *pdir, CDir *dir, bool inc)
{
  // nested-auth counters are adjusted whenever we are auth for 'dir';
  // plain auth-subtree counters only if 'dir' is not itself a subtree root
  bool adjust_subtree_nest = dir->is_auth();
  bool adjust_subtree = adjust_subtree_nest && !dir->is_subtree_root();
  CDir *cur = dir;
  while (true) {
    if (inc) {
      pdir->pop_nested.add(dir->pop_nested);
      if (adjust_subtree) {
        pdir->pop_auth_subtree.add(dir->pop_auth_subtree);
        // keep the new parent's hot-subdir lru aware of this child
        pdir->pop_lru_subdirs.push_front(&cur->get_inode()->item_pop_lru);
      }

      if (adjust_subtree_nest)
        pdir->pop_auth_subtree_nested.add(dir->pop_auth_subtree_nested);
    } else {
      pdir->pop_nested.sub(dir->pop_nested);
      if (adjust_subtree)
        pdir->pop_auth_subtree.sub(dir->pop_auth_subtree);

      if (adjust_subtree_nest)
        pdir->pop_auth_subtree_nested.sub(dir->pop_auth_subtree_nested);
    }

    // crossing a subtree root ends the auth domain for plain counters
    if (pdir->is_subtree_root())
      adjust_subtree = false;
    cur = pdir;
    pdir = pdir->inode->get_parent_dir();
    if (!pdir) break;
  }
}
1358
224ce89b
WB
1359void MDBalancer::handle_mds_failure(mds_rank_t who)
1360{
1361 if (0 == who) {
28e407b8 1362 mds_last_epoch_under_map.clear();
224ce89b
WB
1363 }
1364}
28e407b8
AA
1365
1366int MDBalancer::dump_loads(Formatter *f)
1367{
28e407b8
AA
1368 list<CDir*> dfs;
1369 if (mds->mdcache->get_root()) {
1370 mds->mdcache->get_root()->get_dirfrags(dfs);
1371 } else {
1372 dout(5) << "dump_load no root" << dendl;
1373 }
1374
1375 f->open_object_section("loads");
1376
1377 f->open_array_section("dirfrags");
1378 while (!dfs.empty()) {
1379 CDir *dir = dfs.front();
1380 dfs.pop_front();
1381
11fdf7f2
TL
1382 f->open_object_section("dir");
1383 dir->dump_load(f);
1384 f->close_section();
28e407b8
AA
1385
1386 for (auto it = dir->begin(); it != dir->end(); ++it) {
1387 CInode *in = it->second->get_linkage()->get_inode();
1388 if (!in || !in->is_dir())
1389 continue;
1390
1391 list<CDir*> ls;
1392 in->get_dirfrags(ls);
1393 for (auto subdir : ls) {
1394 if (subdir->pop_nested.meta_load() < .001)
1395 continue;
1396 dfs.push_back(subdir);
1397 }
1398 }
1399 }
1400 f->close_section(); // dirfrags array
1401
1402 f->open_object_section("mds_load");
1403 {
1404
11fdf7f2 1405 auto dump_mds_load = [f](mds_load_t& load) {
28e407b8
AA
1406 f->dump_float("request_rate", load.req_rate);
1407 f->dump_float("cache_hit_rate", load.cache_hit_rate);
1408 f->dump_float("queue_length", load.queue_len);
1409 f->dump_float("cpu_load", load.cpu_load_avg);
1410 f->dump_float("mds_load", load.mds_load());
1411
28e407b8 1412 f->open_object_section("auth_dirfrags");
11fdf7f2 1413 load.auth.dump(f);
28e407b8
AA
1414 f->close_section();
1415 f->open_object_section("all_dirfrags");
11fdf7f2 1416 load.all.dump(f);
28e407b8
AA
1417 f->close_section();
1418 };
1419
1420 for (auto p : mds_load) {
1421 stringstream name;
1422 name << "mds." << p.first;
1423 f->open_object_section(name.str().c_str());
1424 dump_mds_load(p.second);
1425 f->close_section();
1426 }
1427 }
1428 f->close_section(); // mds_load
1429
1430 f->open_object_section("mds_meta_load");
1431 for (auto p : mds_meta_load) {
1432 stringstream name;
1433 name << "mds." << p.first;
1434 f->dump_float(name.str().c_str(), p.second);
1435 }
1436 f->close_section(); // mds_meta_load
1437
1438 f->open_object_section("mds_import_map");
1439 for (auto p : mds_import_map) {
1440 stringstream name1;
1441 name1 << "mds." << p.first;
1442 f->open_array_section(name1.str().c_str());
1443 for (auto q : p.second) {
1444 f->open_object_section("from");
1445 stringstream name2;
1446 name2 << "mds." << q.first;
1447 f->dump_float(name2.str().c_str(), q.second);
1448 f->close_section();
1449 }
1450 f->close_section(); // mds.? array
1451 }
1452 f->close_section(); // mds_import_map
1453
1454 f->close_section(); // loads
1455 return 0;
1456}