]> git.proxmox.com Git - ceph.git/blame - ceph/src/mds/MDBalancer.cc
Import ceph 15.2.8
[ceph.git] / ceph / src / mds / MDBalancer.cc
CommitLineData
7c673cae
FG
1// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
2// vim: ts=8 sw=2 smarttab
3/*
4 * Ceph - scalable distributed file system
5 *
6 * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
7 *
8 * This is free software; you can redistribute it and/or
9 * modify it under the terms of the GNU Lesser General Public
10 * License version 2.1, as published by the Free Software
11 * Foundation. See file COPYING.
12 *
13 */
14
15#include "include/compat.h"
16#include "mdstypes.h"
17
91327a77 18#include "mon/MonClient.h"
7c673cae
FG
19#include "MDBalancer.h"
20#include "MDSRank.h"
7c673cae
FG
21#include "MDSMap.h"
22#include "CInode.h"
23#include "CDir.h"
24#include "MDCache.h"
25#include "Migrator.h"
26#include "Mantle.h"
27
28#include "include/Context.h"
29#include "msg/Messenger.h"
7c673cae
FG
30
31#include <fstream>
32#include <iostream>
33#include <vector>
34#include <map>
35using std::map;
36using std::vector;
11fdf7f2 37using std::chrono::duration_cast;
7c673cae
FG
38
39#include "common/config.h"
40#include "common/errno.h"
41
// All log statements in this file go through the global Ceph context.
#define dout_context g_ceph_context

// Prefix every message with the local rank and the emitting function.
#undef dout_prefix
#define dout_prefix *_dout << "mds." << mds->get_nodeid() << ".bal " << __func__ << " "

// Use the mds_balancer subsystem when it would gather at this level,
// otherwise stay on the generic mds subsystem.  The `do {` opened here is
// closed by the dendl override below.
#undef dout
#define dout(lvl) \
  do {\
    auto subsys = ceph_subsys_mds;\
    if ((dout_context)->_conf->subsys.should_gather(ceph_subsys_mds_balancer, lvl)) {\
      subsys = ceph_subsys_mds_balancer;\
    }\
    dout_impl(dout_context, ceph::dout::need_dynamic(subsys), lvl) dout_prefix

// Terminates a dout() statement and closes its do/while wrapper.
#undef dendl
#define dendl dendl_impl; } while (0)


#define MIN_LOAD 50 // ??
#define MIN_REEXPORT 5 // will automatically reexport
#define MIN_OFFLOAD 10 // point at which i stop trying, close enough
60
61
9f95a23c 62int MDBalancer::proc_message(const cref_t<Message> &m)
7c673cae
FG
63{
64 switch (m->get_type()) {
65
66 case MSG_MDS_HEARTBEAT:
9f95a23c 67 handle_heartbeat(ref_cast<MHeartbeat>(m));
7c673cae
FG
68 break;
69
70 default:
b32b8144 71 derr << " balancer unknown message " << m->get_type() << dendl_impl;
11fdf7f2 72 ceph_abort_msg("balancer unknown message");
7c673cae
FG
73 }
74
75 return 0;
76}
77
91327a77
AA
78MDBalancer::MDBalancer(MDSRank *m, Messenger *msgr, MonClient *monc) :
79 mds(m), messenger(msgr), mon_client(monc)
80{
11fdf7f2
TL
81 bal_fragment_dirs = g_conf().get_val<bool>("mds_bal_fragment_dirs");
82 bal_fragment_interval = g_conf().get_val<int64_t>("mds_bal_fragment_interval");
91327a77
AA
83}
84
92f5a8d4 85void MDBalancer::handle_conf_change(const std::set<std::string>& changed, const MDSMap& mds_map)
91327a77 86{
11fdf7f2
TL
87 if (changed.count("mds_bal_fragment_dirs"))
88 bal_fragment_dirs = g_conf().get_val<bool>("mds_bal_fragment_dirs");
91327a77 89 if (changed.count("mds_bal_fragment_interval"))
11fdf7f2 90 bal_fragment_interval = g_conf().get_val<int64_t>("mds_bal_fragment_interval");
91327a77
AA
91}
92
7c673cae
FG
93void MDBalancer::handle_export_pins(void)
94{
95 auto &q = mds->mdcache->export_pin_queue;
96 auto it = q.begin();
97 dout(20) << "export_pin_queue size=" << q.size() << dendl;
98 while (it != q.end()) {
31f18b77
FG
99 auto cur = it++;
100 CInode *in = *cur;
11fdf7f2 101 ceph_assert(in->is_dir());
f6b5b4d7
TL
102
103 in->check_pin_policy();
31f18b77 104 mds_rank_t export_pin = in->get_export_pin(false);
eafe8130 105 if (export_pin >= mds->mdsmap->get_max_mds()) {
f6b5b4d7 106 dout(20) << " delay export_pin=" << export_pin << " on " << *in << dendl;
eafe8130
TL
107 in->state_clear(CInode::STATE_QUEUEDEXPORTPIN);
108 q.erase(cur);
109
110 in->state_set(CInode::STATE_DELAYEDEXPORTPIN);
111 mds->mdcache->export_pin_delayed_queue.insert(in);
112 continue;
f6b5b4d7
TL
113 } else {
114 dout(20) << " executing export_pin=" << export_pin << " on " << *in << dendl;
eafe8130 115 }
31f18b77
FG
116
117 bool remove = true;
f6b5b4d7 118 for (auto&& dir : in->get_dirfrags()) {
31f18b77
FG
119 if (!dir->is_auth())
120 continue;
121
122 if (export_pin == MDS_RANK_NONE) {
123 if (dir->state_test(CDir::STATE_AUXSUBTREE)) {
124 if (dir->is_frozen() || dir->is_freezing()) {
125 // try again later
126 remove = false;
127 continue;
128 }
129 dout(10) << " clear auxsubtree on " << *dir << dendl;
130 dir->state_clear(CDir::STATE_AUXSUBTREE);
131 mds->mdcache->try_subtree_merge(dir);
132 }
133 } else if (export_pin == mds->get_nodeid()) {
f6b5b4d7
TL
134 if (dir->state_test(CDir::STATE_AUXSUBTREE)) {
135 ceph_assert(dir->is_subtree_root());
136 } else if (dir->state_test(CDir::STATE_CREATING) ||
137 dir->is_frozen() || dir->is_freezing()) {
31f18b77
FG
138 // try again later
139 remove = false;
140 continue;
f6b5b4d7 141 } else if (!dir->is_subtree_root()) {
31f18b77
FG
142 dir->state_set(CDir::STATE_AUXSUBTREE);
143 mds->mdcache->adjust_subtree_auth(dir, mds->get_nodeid());
144 dout(10) << " create aux subtree on " << *dir << dendl;
f6b5b4d7 145 } else {
31f18b77
FG
146 dout(10) << " set auxsubtree bit on " << *dir << dendl;
147 dir->state_set(CDir::STATE_AUXSUBTREE);
148 }
149 } else {
f6b5b4d7
TL
150 /* Only export a directory if it's non-empty. An empty directory will
151 * be sent back by the importer.
152 */
153 if (dir->get_num_head_items() > 0) {
154 mds->mdcache->migrator->export_dir(dir, export_pin);
155 }
31f18b77 156 remove = false;
7c673cae
FG
157 }
158 }
31f18b77
FG
159
160 if (remove) {
161 in->state_clear(CInode::STATE_QUEUEDEXPORTPIN);
162 q.erase(cur);
7c673cae
FG
163 }
164 }
165
81eedcae
TL
166 std::vector<CDir *> authsubs = mds->mdcache->get_auth_subtrees();
167 bool print_auth_subtrees = true;
168
169 if (authsubs.size() > AUTH_TREES_THRESHOLD &&
170 !g_conf()->subsys.should_gather<ceph_subsys_mds, 25>()) {
171 dout(15) << "number of auth trees = " << authsubs.size() << "; not "
172 "printing auth trees" << dendl;
173 print_auth_subtrees = false;
174 }
175
176 for (auto &cd : authsubs) {
7c673cae 177 mds_rank_t export_pin = cd->inode->get_export_pin();
81eedcae
TL
178
179 if (print_auth_subtrees) {
180 dout(25) << "auth tree " << *cd << " export_pin=" << export_pin <<
181 dendl;
182 }
183
f6b5b4d7
TL
184 if (export_pin >= 0 && export_pin < mds->mdsmap->get_max_mds()) {
185 if (export_pin == mds->get_nodeid()) {
186 cd->get_inode()->check_pin_policy();
187 } else {
188 mds->mdcache->migrator->export_dir(cd, export_pin);
189 }
7c673cae
FG
190 }
191 }
192}
193
194void MDBalancer::tick()
195{
11fdf7f2
TL
196 static int num_bal_times = g_conf()->mds_bal_max;
197 auto bal_interval = g_conf().get_val<int64_t>("mds_bal_interval");
198 auto bal_max_until = g_conf().get_val<int64_t>("mds_bal_max_until");
199 time now = clock::now();
7c673cae 200
11fdf7f2 201 if (g_conf()->mds_bal_export_pin) {
7c673cae
FG
202 handle_export_pins();
203 }
204
205 // sample?
11fdf7f2
TL
206 if (chrono::duration<double>(now-last_sample).count() >
207 g_conf()->mds_bal_sample_interval) {
7c673cae
FG
208 dout(15) << "tick last_sample now " << now << dendl;
209 last_sample = now;
210 }
211
11fdf7f2
TL
212 // We can use duration_cast below, although the result is an int,
213 // because the values from g_conf are also integers.
7c673cae 214 // balance?
11fdf7f2
TL
215 if (mds->get_nodeid() == 0
216 && mds->is_active()
217 && bal_interval > 0
218 && duration_cast<chrono::seconds>(now - last_heartbeat).count() >= bal_interval
219 && (num_bal_times || (bal_max_until >= 0 && mds->get_uptime().count() > bal_max_until))) {
7c673cae
FG
220 last_heartbeat = now;
221 send_heartbeat();
222 num_bal_times--;
223 }
81eedcae
TL
224
225 mds->mdcache->show_subtrees(10, true);
7c673cae
FG
226}
227
228
229
230
231class C_Bal_SendHeartbeat : public MDSInternalContext {
232public:
233 explicit C_Bal_SendHeartbeat(MDSRank *mds_) : MDSInternalContext(mds_) { }
234 void finish(int f) override {
235 mds->balancer->send_heartbeat();
236 }
237};
238
239
11fdf7f2 240double mds_load_t::mds_load() const
7c673cae 241{
11fdf7f2 242 switch(g_conf()->mds_bal_mode) {
7c673cae
FG
243 case 0:
244 return
245 .8 * auth.meta_load() +
246 .2 * all.meta_load() +
247 req_rate +
248 10.0 * queue_len;
249
250 case 1:
251 return req_rate + 10.0*queue_len;
252
253 case 2:
254 return cpu_load_avg;
255
256 }
257 ceph_abort();
258 return 0;
259}
260
11fdf7f2 261mds_load_t MDBalancer::get_load()
7c673cae 262{
11fdf7f2
TL
263 auto now = clock::now();
264
265 mds_load_t load{DecayRate()}; /* zero DecayRate! */
7c673cae
FG
266
267 if (mds->mdcache->get_root()) {
9f95a23c 268 auto&& ls = mds->mdcache->get_root()->get_dirfrags();
11fdf7f2
TL
269 for (auto &d : ls) {
270 load.auth.add(d->pop_auth_subtree_nested);
271 load.all.add(d->pop_nested);
7c673cae
FG
272 }
273 } else {
9f95a23c 274 dout(20) << "no root, no load" << dendl;
7c673cae
FG
275 }
276
28e407b8 277 uint64_t num_requests = mds->get_num_requests();
91327a77
AA
278
279 uint64_t cpu_time = 1;
280 {
281 string stat_path = PROCPREFIX "/proc/self/stat";
282 ifstream stat_file(stat_path);
283 if (stat_file.is_open()) {
284 vector<string> stat_vec(std::istream_iterator<string>{stat_file},
285 std::istream_iterator<string>());
286 if (stat_vec.size() >= 15) {
287 // utime + stime
288 cpu_time = strtoll(stat_vec[13].c_str(), nullptr, 10) +
289 strtoll(stat_vec[14].c_str(), nullptr, 10);
290 } else {
291 derr << "input file '" << stat_path << "' not resolvable" << dendl_impl;
292 }
293 } else {
294 derr << "input file '" << stat_path << "' not found" << dendl_impl;
295 }
296 }
297
298 load.queue_len = messenger->get_dispatch_queue_len();
299
300 bool update_last = true;
11fdf7f2 301 if (last_get_load != clock::zero() &&
91327a77 302 now > last_get_load) {
11fdf7f2
TL
303 double el = std::chrono::duration<double>(now-last_get_load).count();
304 if (el >= 1.0) {
91327a77 305 if (num_requests > last_num_requests)
11fdf7f2 306 load.req_rate = (num_requests - last_num_requests) / el;
91327a77 307 if (cpu_time > last_cpu_time)
11fdf7f2 308 load.cpu_load_avg = (cpu_time - last_cpu_time) / el;
91327a77
AA
309 } else {
310 auto p = mds_load.find(mds->get_nodeid());
311 if (p != mds_load.end()) {
312 load.req_rate = p->second.req_rate;
313 load.cpu_load_avg = p->second.cpu_load_avg;
314 }
315 if (num_requests >= last_num_requests && cpu_time >= last_cpu_time)
316 update_last = false;
28e407b8
AA
317 }
318 }
28e407b8 319
91327a77
AA
320 if (update_last) {
321 last_num_requests = num_requests;
322 last_cpu_time = cpu_time;
323 last_get_load = now;
324 }
7c673cae 325
9f95a23c 326 dout(15) << load << dendl;
7c673cae
FG
327 return load;
328}
329
330/*
331 * Read synchronously from RADOS using a timeout. We cannot do daemon-local
332 * fallbacks (i.e. kick off async read when we are processing the map and
333 * check status when we get here) with the way the mds is structured.
334 */
335int MDBalancer::localize_balancer()
336{
337 /* reset everything */
338 bool ack = false;
339 int r = 0;
340 bufferlist lua_src;
9f95a23c
TL
341 ceph::mutex lock = ceph::make_mutex("lock");
342 ceph::condition_variable cond;
7c673cae
FG
343
344 /* we assume that balancer is in the metadata pool */
345 object_t oid = object_t(mds->mdsmap->get_balancer());
346 object_locator_t oloc(mds->mdsmap->get_metadata_pool());
347 ceph_tid_t tid = mds->objecter->read(oid, oloc, 0, 0, CEPH_NOSNAP, &lua_src, 0,
9f95a23c 348 new C_SafeCond(lock, cond, &ack, &r));
7c673cae
FG
349 dout(15) << "launched non-blocking read tid=" << tid
350 << " oid=" << oid << " oloc=" << oloc << dendl;
351
352 /* timeout: if we waste half our time waiting for RADOS, then abort! */
9f95a23c
TL
353 std::cv_status ret_t = [&] {
354 auto bal_interval = g_conf().get_val<int64_t>("mds_bal_interval");
355 std::unique_lock locker{lock};
356 return cond.wait_for(locker, std::chrono::seconds(bal_interval / 2));
357 }();
7c673cae
FG
358 /* success: store the balancer in memory and set the version. */
359 if (!r) {
9f95a23c 360 if (ret_t == std::cv_status::timeout) {
7c673cae
FG
361 mds->objecter->op_cancel(tid, -ECANCELED);
362 return -ETIMEDOUT;
363 }
364 bal_code.assign(lua_src.to_str());
365 bal_version.assign(oid.name);
9f95a23c 366 dout(10) "bal_code=" << bal_code << dendl;
7c673cae
FG
367 }
368 return r;
369}
370
371void MDBalancer::send_heartbeat()
372{
7c673cae 373 if (mds->is_cluster_degraded()) {
9f95a23c 374 dout(10) << "degraded" << dendl;
7c673cae
FG
375 return;
376 }
377
378 if (!mds->mdcache->is_open()) {
9f95a23c 379 dout(10) << "not open" << dendl;
7c673cae
FG
380 mds->mdcache->wait_for_open(new C_Bal_SendHeartbeat(mds));
381 return;
382 }
383
94b18763 384 if (mds->get_nodeid() == 0) {
7c673cae 385 beat_epoch++;
94b18763
FG
386 mds_load.clear();
387 }
7c673cae
FG
388
389 // my load
11fdf7f2 390 mds_load_t load = get_load();
28e407b8
AA
391 mds->logger->set(l_mds_load_cent, 100 * load.mds_load());
392 mds->logger->set(l_mds_dispatch_queue_len, load.queue_len);
393
11fdf7f2
TL
394 auto em = mds_load.emplace(std::piecewise_construct, std::forward_as_tuple(mds->get_nodeid()), std::forward_as_tuple(load));
395 if (!em.second) {
396 em.first->second = load;
397 }
7c673cae
FG
398
399 // import_map -- how much do i import from whom
400 map<mds_rank_t, float> import_map;
11fdf7f2 401 for (auto& im : mds->mdcache->get_auth_subtrees()) {
7c673cae
FG
402 mds_rank_t from = im->inode->authority().first;
403 if (from == mds->get_nodeid()) continue;
404 if (im->get_inode()->is_stray()) continue;
11fdf7f2 405 import_map[from] += im->pop_auth_subtree.meta_load();
7c673cae
FG
406 }
407 mds_import_map[ mds->get_nodeid() ] = import_map;
408
409
9f95a23c
TL
410 dout(3) << " epoch " << beat_epoch << " load " << load << dendl;
411 for (const auto& [rank, load] : import_map) {
412 dout(5) << " import_map from " << rank << " -> " << load << dendl;
7c673cae
FG
413 }
414
415
416 set<mds_rank_t> up;
417 mds->get_mds_map()->get_up_mds_set(up);
11fdf7f2
TL
418 for (const auto& r : up) {
419 if (r == mds->get_nodeid())
7c673cae 420 continue;
9f95a23c 421 auto hb = make_message<MHeartbeat>(load, beat_epoch);
7c673cae 422 hb->get_import_map() = import_map;
11fdf7f2 423 mds->send_message_mds(hb, r);
7c673cae
FG
424 }
425}
426
9f95a23c 427void MDBalancer::handle_heartbeat(const cref_t<MHeartbeat> &m)
7c673cae 428{
7c673cae
FG
429 mds_rank_t who = mds_rank_t(m->get_source().num());
430 dout(25) << "=== got heartbeat " << m->get_beat() << " from " << m->get_source().num() << " " << m->get_load() << dendl;
431
432 if (!mds->is_active())
11fdf7f2 433 return;
7c673cae
FG
434
435 if (!mds->mdcache->is_open()) {
436 dout(10) << "opening root on handle_heartbeat" << dendl;
437 mds->mdcache->wait_for_open(new C_MDS_RetryMessage(mds, m));
438 return;
439 }
440
441 if (mds->is_cluster_degraded()) {
442 dout(10) << " degraded, ignoring" << dendl;
11fdf7f2 443 return;
7c673cae
FG
444 }
445
94b18763
FG
446 if (mds->get_nodeid() != 0 && m->get_beat() > beat_epoch) {
447 dout(10) << "receive next epoch " << m->get_beat() << " from mds." << who << " before mds0" << dendl;
448
449 beat_epoch = m->get_beat();
450 // clear the mds load info whose epoch is less than beat_epoch
451 mds_load.clear();
452 }
453
7c673cae 454 if (who == 0) {
94b18763
FG
455 dout(20) << " from mds0, new epoch " << m->get_beat() << dendl;
456 if (beat_epoch != m->get_beat()) {
28e407b8 457 beat_epoch = m->get_beat();
94b18763
FG
458 mds_load.clear();
459 }
28e407b8 460
7c673cae
FG
461 send_heartbeat();
462
463 mds->mdcache->show_subtrees();
28e407b8
AA
464 } else if (mds->get_nodeid() == 0) {
465 if (beat_epoch != m->get_beat()) {
466 dout(10) << " old heartbeat epoch, ignoring" << dendl;
11fdf7f2 467 return;
7c673cae
FG
468 }
469 }
28e407b8 470
11fdf7f2
TL
471 {
472 auto em = mds_load.emplace(std::piecewise_construct, std::forward_as_tuple(who), std::forward_as_tuple(m->get_load()));
473 if (!em.second) {
474 em.first->second = m->get_load();
475 }
476 }
28e407b8 477 mds_import_map[who] = m->get_import_map();
7c673cae 478
7c673cae
FG
479 {
480 unsigned cluster_size = mds->get_mds_map()->get_num_in_mds();
481 if (mds_load.size() == cluster_size) {
482 // let's go!
483 //export_empties(); // no!
484
485 /* avoid spamming ceph -w if user does not turn mantle on */
486 if (mds->mdsmap->get_balancer() != "") {
487 int r = mantle_prep_rebalance();
11fdf7f2 488 if (!r) return;
7c673cae
FG
489 mds->clog->warn() << "using old balancer; mantle failed for "
490 << "balancer=" << mds->mdsmap->get_balancer()
491 << " : " << cpp_strerror(r);
492 }
493 prep_rebalance(m->get_beat());
494 }
495 }
7c673cae
FG
496}
497
7c673cae
FG
498double MDBalancer::try_match(balance_state_t& state, mds_rank_t ex, double& maxex,
499 mds_rank_t im, double& maxim)
500{
501 if (maxex <= 0 || maxim <= 0) return 0.0;
502
11fdf7f2 503 double howmuch = std::min(maxex, maxim);
7c673cae
FG
504
505 dout(5) << " - mds." << ex << " exports " << howmuch << " to mds." << im << dendl;
506
507 if (ex == mds->get_nodeid())
508 state.targets[im] += howmuch;
509
510 state.exported[ex] += howmuch;
511 state.imported[im] += howmuch;
512
513 maxex -= howmuch;
514 maxim -= howmuch;
515
516 return howmuch;
517}
518
519void MDBalancer::queue_split(const CDir *dir, bool fast)
520{
521 dout(10) << __func__ << " enqueuing " << *dir
522 << " (fast=" << fast << ")" << dendl;
523
7c673cae
FG
524 const dirfrag_t frag = dir->dirfrag();
525
526 auto callback = [this, frag](int r) {
527 if (split_pending.erase(frag) == 0) {
528 // Someone beat me to it. This can happen in the fast splitting
529 // path, because we spawn two contexts, one with mds->timer and
530 // one with mds->queue_waiter. The loser can safely just drop
531 // out.
532 return;
533 }
534
535 CDir *split_dir = mds->mdcache->get_dirfrag(frag);
536 if (!split_dir) {
537 dout(10) << "drop split on " << frag << " because not in cache" << dendl;
538 return;
539 }
540 if (!split_dir->is_auth()) {
541 dout(10) << "drop split on " << frag << " because non-auth" << dendl;
542 return;
543 }
544
545 // Pass on to MDCache: note that the split might still not
546 // happen if the checks in MDCache::can_fragment fail.
547 dout(10) << __func__ << " splitting " << *split_dir << dendl;
11fdf7f2 548 mds->mdcache->split_dir(split_dir, g_conf()->mds_bal_split_bits);
7c673cae
FG
549 };
550
551 bool is_new = false;
552 if (split_pending.count(frag) == 0) {
553 split_pending.insert(frag);
554 is_new = true;
555 }
556
557 if (fast) {
558 // Do the split ASAP: enqueue it in the MDSRank waiters which are
559 // run at the end of dispatching the current request
560 mds->queue_waiter(new MDSInternalContextWrapper(mds,
9f95a23c 561 new LambdaContext(std::move(callback))));
7c673cae
FG
562 } else if (is_new) {
563 // Set a timer to really do the split: we don't do it immediately
564 // so that bursts of ops on a directory have a chance to go through
565 // before we freeze it.
91327a77 566 mds->timer.add_event_after(bal_fragment_interval,
9f95a23c 567 new LambdaContext(std::move(callback)));
7c673cae
FG
568 }
569}
570
571void MDBalancer::queue_merge(CDir *dir)
572{
573 const auto frag = dir->dirfrag();
574 auto callback = [this, frag](int r) {
11fdf7f2 575 ceph_assert(frag.frag != frag_t());
7c673cae
FG
576
577 // frag must be in this set because only one context is in flight
578 // for a given frag at a time (because merge_pending is checked before
579 // starting one), and this context is the only one that erases it.
580 merge_pending.erase(frag);
581
582 CDir *dir = mds->mdcache->get_dirfrag(frag);
583 if (!dir) {
584 dout(10) << "drop merge on " << frag << " because not in cache" << dendl;
585 return;
586 }
11fdf7f2 587 ceph_assert(dir->dirfrag() == frag);
7c673cae
FG
588
589 if(!dir->is_auth()) {
590 dout(10) << "drop merge on " << *dir << " because lost auth" << dendl;
591 return;
592 }
593
594 dout(10) << "merging " << *dir << dendl;
595
596 CInode *diri = dir->get_inode();
597
598 frag_t fg = dir->get_frag();
599 while (fg != frag_t()) {
600 frag_t sibfg = fg.get_sibling();
9f95a23c 601 auto&& [complete, sibs] = diri->get_dirfrags_under(sibfg);
7c673cae
FG
602 if (!complete) {
603 dout(10) << " not all sibs under " << sibfg << " in cache (have " << sibs << ")" << dendl;
604 break;
605 }
606 bool all = true;
9f95a23c 607 for (auto& sib : sibs) {
7c673cae
FG
608 if (!sib->is_auth() || !sib->should_merge()) {
609 all = false;
610 break;
611 }
612 }
613 if (!all) {
614 dout(10) << " not all sibs under " << sibfg << " " << sibs << " should_merge" << dendl;
615 break;
616 }
617 dout(10) << " all sibs under " << sibfg << " " << sibs << " should merge" << dendl;
618 fg = fg.parent();
619 }
620
621 if (fg != dir->get_frag())
622 mds->mdcache->merge_dir(diri, fg);
623 };
624
625 if (merge_pending.count(frag) == 0) {
9f95a23c 626 dout(20) << " enqueued dir " << *dir << dendl;
7c673cae 627 merge_pending.insert(frag);
91327a77 628 mds->timer.add_event_after(bal_fragment_interval,
9f95a23c 629 new LambdaContext(std::move(callback)));
7c673cae 630 } else {
9f95a23c 631 dout(20) << " dir already in queue " << *dir << dendl;
7c673cae
FG
632 }
633}
634
635void MDBalancer::prep_rebalance(int beat)
636{
637 balance_state_t state;
638
11fdf7f2 639 if (g_conf()->mds_thrash_exports) {
7c673cae
FG
640 //we're going to randomly export to all the mds in the cluster
641 set<mds_rank_t> up_mds;
642 mds->get_mds_map()->get_up_mds_set(up_mds);
643 for (const auto &rank : up_mds) {
644 state.targets[rank] = 0.0;
645 }
646 } else {
647 int cluster_size = mds->get_mds_map()->get_num_in_mds();
648 mds_rank_t whoami = mds->get_nodeid();
11fdf7f2 649 rebalance_time = clock::now();
7c673cae 650
9f95a23c 651 dout(7) << "cluster loads are" << dendl;
7c673cae
FG
652
653 mds->mdcache->migrator->clear_export_queue();
654
655 // rescale! turn my mds_load back into meta_load units
656 double load_fac = 1.0;
657 map<mds_rank_t, mds_load_t>::iterator m = mds_load.find(whoami);
658 if ((m != mds_load.end()) && (m->second.mds_load() > 0)) {
11fdf7f2 659 double metald = m->second.auth.meta_load();
7c673cae
FG
660 double mdsld = m->second.mds_load();
661 load_fac = metald / mdsld;
662 dout(7) << " load_fac is " << load_fac
663 << " <- " << m->second.auth << " " << metald
664 << " / " << mdsld
665 << dendl;
666 }
667
28e407b8
AA
668 mds_meta_load.clear();
669
7c673cae
FG
670 double total_load = 0.0;
671 multimap<double,mds_rank_t> load_map;
672 for (mds_rank_t i=mds_rank_t(0); i < mds_rank_t(cluster_size); i++) {
28e407b8 673 mds_load_t& load = mds_load.at(i);
7c673cae
FG
674
675 double l = load.mds_load() * load_fac;
676 mds_meta_load[i] = l;
677
678 if (whoami == 0)
9f95a23c 679 dout(7) << " mds." << i
7c673cae
FG
680 << " " << load
681 << " = " << load.mds_load()
682 << " ~ " << l << dendl;
683
684 if (whoami == i) my_load = l;
685 total_load += l;
686
687 load_map.insert(pair<double,mds_rank_t>( l, i ));
688 }
689
690 // target load
691 target_load = total_load / (double)cluster_size;
9f95a23c 692 dout(7) << "my load " << my_load
7c673cae
FG
693 << " target " << target_load
694 << " total " << total_load
695 << dendl;
696
697 // under or over?
9f95a23c
TL
698 for (const auto& [load, rank] : load_map) {
699 if (load < target_load * (1.0 + g_conf()->mds_bal_min_rebalance)) {
700 dout(7) << " mds." << rank << " is underloaded or barely overloaded." << dendl;
701 mds_last_epoch_under_map[rank] = beat_epoch;
28e407b8
AA
702 }
703 }
704
705 int last_epoch_under = mds_last_epoch_under_map[whoami];
706 if (last_epoch_under == beat_epoch) {
9f95a23c 707 dout(7) << " i am underloaded or barely overloaded, doing nothing." << dendl;
7c673cae
FG
708 return;
709 }
7c673cae
FG
710 // am i over long enough?
711 if (last_epoch_under && beat_epoch - last_epoch_under < 2) {
9f95a23c 712 dout(7) << " i am overloaded, but only for " << (beat_epoch - last_epoch_under) << " epochs" << dendl;
7c673cae
FG
713 return;
714 }
715
9f95a23c 716 dout(7) << " i am sufficiently overloaded" << dendl;
7c673cae
FG
717
718
719 // first separate exporters and importers
720 multimap<double,mds_rank_t> importers;
721 multimap<double,mds_rank_t> exporters;
722 set<mds_rank_t> importer_set;
723 set<mds_rank_t> exporter_set;
724
725 for (multimap<double,mds_rank_t>::iterator it = load_map.begin();
726 it != load_map.end();
727 ++it) {
728 if (it->first < target_load) {
729 dout(15) << " mds." << it->second << " is importer" << dendl;
730 importers.insert(pair<double,mds_rank_t>(it->first,it->second));
731 importer_set.insert(it->second);
732 } else {
28e407b8
AA
733 int mds_last_epoch_under = mds_last_epoch_under_map[it->second];
734 if (!(mds_last_epoch_under && beat_epoch - mds_last_epoch_under < 2)) {
735 dout(15) << " mds." << it->second << " is exporter" << dendl;
736 exporters.insert(pair<double,mds_rank_t>(it->first,it->second));
737 exporter_set.insert(it->second);
738 }
7c673cae
FG
739 }
740 }
741
742
743 // determine load transfer mapping
744
745 if (true) {
746 // analyze import_map; do any matches i can
747
748 dout(15) << " matching exporters to import sources" << dendl;
749
750 // big -> small exporters
751 for (multimap<double,mds_rank_t>::reverse_iterator ex = exporters.rbegin();
752 ex != exporters.rend();
753 ++ex) {
754 double maxex = get_maxex(state, ex->second);
755 if (maxex <= .001) continue;
756
757 // check importers. for now, just in arbitrary order (no intelligent matching).
758 for (map<mds_rank_t, float>::iterator im = mds_import_map[ex->second].begin();
759 im != mds_import_map[ex->second].end();
760 ++im) {
761 double maxim = get_maxim(state, im->first);
762 if (maxim <= .001) continue;
763 try_match(state, ex->second, maxex, im->first, maxim);
764 if (maxex <= .001) break;
765 }
766 }
767 }
768
769 // old way
770 if (beat % 2 == 1) {
771 dout(15) << " matching big exporters to big importers" << dendl;
772 // big exporters to big importers
773 multimap<double,mds_rank_t>::reverse_iterator ex = exporters.rbegin();
774 multimap<double,mds_rank_t>::iterator im = importers.begin();
775 while (ex != exporters.rend() &&
776 im != importers.end()) {
777 double maxex = get_maxex(state, ex->second);
778 double maxim = get_maxim(state, im->second);
779 if (maxex < .001 || maxim < .001) break;
780 try_match(state, ex->second, maxex, im->second, maxim);
781 if (maxex <= .001) ++ex;
782 if (maxim <= .001) ++im;
783 }
784 } else { // new way
785 dout(15) << " matching small exporters to big importers" << dendl;
786 // small exporters to big importers
787 multimap<double,mds_rank_t>::iterator ex = exporters.begin();
788 multimap<double,mds_rank_t>::iterator im = importers.begin();
789 while (ex != exporters.end() &&
790 im != importers.end()) {
791 double maxex = get_maxex(state, ex->second);
792 double maxim = get_maxim(state, im->second);
793 if (maxex < .001 || maxim < .001) break;
794 try_match(state, ex->second, maxex, im->second, maxim);
795 if (maxex <= .001) ++ex;
796 if (maxim <= .001) ++im;
797 }
798 }
799 }
800 try_rebalance(state);
801}
802
7c673cae
FG
803int MDBalancer::mantle_prep_rebalance()
804{
805 balance_state_t state;
806
807 /* refresh balancer if it has changed */
808 if (bal_version != mds->mdsmap->get_balancer()) {
809 bal_version.assign("");
810 int r = localize_balancer();
811 if (r) return r;
812
813 /* only spam the cluster log from 1 mds on version changes */
814 if (mds->get_nodeid() == 0)
815 mds->clog->info() << "mantle balancer version changed: " << bal_version;
816 }
817
818 /* prepare for balancing */
819 int cluster_size = mds->get_mds_map()->get_num_in_mds();
11fdf7f2 820 rebalance_time = clock::now();
7c673cae
FG
821 mds->mdcache->migrator->clear_export_queue();
822
823 /* fill in the metrics for each mds by grabbing load struct */
824 vector < map<string, double> > metrics (cluster_size);
28e407b8
AA
825 for (mds_rank_t i=mds_rank_t(0); i < mds_rank_t(cluster_size); i++) {
826 mds_load_t& load = mds_load.at(i);
7c673cae
FG
827
828 metrics[i] = {{"auth.meta_load", load.auth.meta_load()},
829 {"all.meta_load", load.all.meta_load()},
830 {"req_rate", load.req_rate},
831 {"queue_len", load.queue_len},
832 {"cpu_load_avg", load.cpu_load_avg}};
833 }
834
835 /* execute the balancer */
836 Mantle mantle;
837 int ret = mantle.balance(bal_code, mds->get_nodeid(), metrics, state.targets);
9f95a23c 838 dout(7) << " mantle decided that new targets=" << state.targets << dendl;
7c673cae
FG
839
840 /* mantle doesn't know about cluster size, so check target len here */
841 if ((int) state.targets.size() != cluster_size)
842 return -EINVAL;
843 else if (ret)
844 return ret;
845
846 try_rebalance(state);
847 return 0;
848}
849
850
851
852void MDBalancer::try_rebalance(balance_state_t& state)
853{
11fdf7f2 854 if (g_conf()->mds_thrash_exports) {
7c673cae
FG
855 dout(5) << "mds_thrash is on; not performing standard rebalance operation!"
856 << dendl;
857 return;
858 }
859
860 // make a sorted list of my imports
28e407b8
AA
861 multimap<double, CDir*> import_pop_map;
862 multimap<mds_rank_t, pair<CDir*, double> > import_from_map;
7c673cae 863
11fdf7f2 864 for (auto& dir : mds->mdcache->get_fullauth_subtrees()) {
28e407b8
AA
865 CInode *diri = dir->get_inode();
866 if (diri->is_mdsdir())
867 continue;
868 if (diri->get_export_pin(false) != MDS_RANK_NONE)
869 continue;
870 if (dir->is_freezing() || dir->is_frozen())
871 continue; // export pbly already in progress
7c673cae 872
28e407b8 873 mds_rank_t from = diri->authority().first;
11fdf7f2
TL
874 double pop = dir->pop_auth_subtree.meta_load();
875 if (g_conf()->mds_bal_idle_threshold > 0 &&
876 pop < g_conf()->mds_bal_idle_threshold &&
28e407b8
AA
877 diri != mds->mdcache->get_root() &&
878 from != mds->get_nodeid()) {
879 dout(5) << " exporting idle (" << pop << ") import " << *dir
880 << " back to mds." << from << dendl;
881 mds->mdcache->migrator->export_dir_nicely(dir, from);
7c673cae
FG
882 continue;
883 }
884
28e407b8
AA
885 dout(15) << " map: i imported " << *dir << " from " << from << dendl;
886 import_pop_map.insert(make_pair(pop, dir));
887 import_from_map.insert(make_pair(from, make_pair(dir, pop)));
7c673cae
FG
888 }
889
7c673cae 890 // do my exports!
28e407b8 891 map<mds_rank_t, double> export_pop_map;
7c673cae
FG
892
893 for (auto &it : state.targets) {
894 mds_rank_t target = it.first;
895 double amount = it.second;
896
28e407b8
AA
897 if (amount < MIN_OFFLOAD)
898 continue;
91327a77
AA
899 if (amount * 10 * state.targets.size() < target_load)
900 continue;
7c673cae
FG
901
902 dout(5) << "want to send " << amount << " to mds." << target
903 //<< " .. " << (*it).second << " * " << load_fac
904 << " -> " << amount
905 << dendl;//" .. fudge is " << fudge << dendl;
7c673cae 906
28e407b8 907 double& have = export_pop_map[target];
7c673cae
FG
908
909 mds->mdcache->show_subtrees();
910
911 // search imports from target
912 if (import_from_map.count(target)) {
9f95a23c 913 dout(7) << " aha, looking through imports from target mds." << target << dendl;
28e407b8
AA
914 for (auto p = import_from_map.equal_range(target);
915 p.first != p.second; ) {
916 CDir *dir = p.first->second.first;
917 double pop = p.first->second.second;
9f95a23c 918 dout(7) << "considering " << *dir << " from " << (*p.first).first << dendl;
28e407b8 919 auto plast = p.first++;
7c673cae 920
28e407b8 921 if (dir->inode->is_base())
7c673cae 922 continue;
11fdf7f2 923 ceph_assert(dir->inode->authority().first == target); // cuz that's how i put it in the map, dummy
7c673cae
FG
924
925 if (pop <= amount-have) {
9f95a23c 926 dout(7) << "reexporting " << *dir << " pop " << pop
7c673cae
FG
927 << " back to mds." << target << dendl;
928 mds->mdcache->migrator->export_dir_nicely(dir, target);
929 have += pop;
930 import_from_map.erase(plast);
28e407b8
AA
931 for (auto q = import_pop_map.equal_range(pop);
932 q.first != q.second; ) {
933 if (q.first->second == dir) {
934 import_pop_map.erase(q.first);
935 break;
936 }
937 q.first++;
938 }
7c673cae 939 } else {
9f95a23c 940 dout(7) << "can't reexport " << *dir << ", too big " << pop << dendl;
7c673cae 941 }
28e407b8
AA
942 if (amount-have < MIN_OFFLOAD)
943 break;
7c673cae
FG
944 }
945 }
28e407b8
AA
946 }
947
948 // any other imports
949 for (auto &it : state.targets) {
950 mds_rank_t target = it.first;
951 double amount = it.second;
952
953 if (!export_pop_map.count(target))
954 continue;
955 double& have = export_pop_map[target];
956 if (amount-have < MIN_OFFLOAD)
7c673cae 957 continue;
7c673cae 958
28e407b8
AA
959 for (auto p = import_pop_map.begin();
960 p != import_pop_map.end(); ) {
961 CDir *dir = p->second;
962 if (dir->inode->is_base()) {
963 ++p;
964 continue;
965 }
7c673cae 966
28e407b8
AA
967 double pop = p->first;
968 if (pop <= amount-have && pop > MIN_REEXPORT) {
9f95a23c 969 dout(5) << "reexporting " << *dir << " pop " << pop
28e407b8
AA
970 << " to mds." << target << dendl;
971 have += pop;
972 mds->mdcache->migrator->export_dir_nicely(dir, target);
973 import_pop_map.erase(p++);
974 } else {
975 ++p;
7c673cae 976 }
28e407b8
AA
977 if (amount-have < MIN_OFFLOAD)
978 break;
7c673cae 979 }
28e407b8 980 }
7c673cae 981
28e407b8
AA
982 set<CDir*> already_exporting;
983
984 for (auto &it : state.targets) {
985 mds_rank_t target = it.first;
986 double amount = it.second;
7c673cae 987
28e407b8
AA
988 if (!export_pop_map.count(target))
989 continue;
990 double& have = export_pop_map[target];
991 if (amount-have < MIN_OFFLOAD)
992 continue;
993
994 // okay, search for fragments of my workload
9f95a23c 995 std::vector<CDir*> exports;
7c673cae 996
28e407b8
AA
997 for (auto p = import_pop_map.rbegin();
998 p != import_pop_map.rend();
999 ++p) {
1000 CDir *dir = p->second;
9f95a23c 1001 find_exports(dir, amount, &exports, have, already_exporting);
28e407b8 1002 if (amount-have < MIN_OFFLOAD)
7c673cae
FG
1003 break;
1004 }
1005 //fudge = amount - have;
1006
9f95a23c 1007 for (const auto& dir : exports) {
28e407b8 1008 dout(5) << " - exporting " << dir->pop_auth_subtree
11fdf7f2 1009 << " " << dir->pop_auth_subtree.meta_load()
28e407b8
AA
1010 << " to mds." << target << " " << *dir << dendl;
1011 mds->mdcache->migrator->export_dir_nicely(dir, target);
7c673cae
FG
1012 }
1013 }
1014
9f95a23c 1015 dout(7) << "done" << dendl;
7c673cae
FG
1016 mds->mdcache->show_subtrees();
1017}
1018
7c673cae
FG
1019void MDBalancer::find_exports(CDir *dir,
1020 double amount,
9f95a23c 1021 std::vector<CDir*>* exports,
7c673cae
FG
1022 double& have,
1023 set<CDir*>& already_exporting)
1024{
11fdf7f2
TL
1025 auto now = clock::now();
1026 auto duration = std::chrono::duration<double>(now-rebalance_time).count();
1027 if (duration > 0.1) {
28e407b8
AA
1028 derr << " balancer runs too long" << dendl_impl;
1029 have = amount;
1030 return;
1031 }
1032
11fdf7f2 1033 ceph_assert(dir->is_auth());
28e407b8 1034
7c673cae 1035 double need = amount - have;
11fdf7f2 1036 if (need < amount * g_conf()->mds_bal_min_start)
7c673cae 1037 return; // good enough!
28e407b8 1038
11fdf7f2
TL
1039 double needmax = need * g_conf()->mds_bal_need_max;
1040 double needmin = need * g_conf()->mds_bal_need_min;
1041 double midchunk = need * g_conf()->mds_bal_midchunk;
1042 double minchunk = need * g_conf()->mds_bal_minchunk;
7c673cae 1043
9f95a23c 1044 std::vector<CDir*> bigger_rep, bigger_unrep;
7c673cae
FG
1045 multimap<double, CDir*> smaller;
1046
11fdf7f2 1047 double dir_pop = dir->pop_auth_subtree.meta_load();
9f95a23c 1048 dout(7) << "in " << dir_pop << " " << *dir << " need " << need << " (" << needmin << " - " << needmax << ")" << dendl;
7c673cae
FG
1049
1050 double subdir_sum = 0;
28e407b8
AA
1051 for (elist<CInode*>::iterator it = dir->pop_lru_subdirs.begin_use_current();
1052 !it.end(); ) {
1053 CInode *in = *it;
1054 ++it;
1055
11fdf7f2
TL
1056 ceph_assert(in->is_dir());
1057 ceph_assert(in->get_parent_dir() == dir);
7c673cae 1058
9f95a23c 1059 auto&& dfls = in->get_nested_dirfrags();
28e407b8
AA
1060
1061 size_t num_idle_frags = 0;
9f95a23c 1062 for (const auto& subdir : dfls) {
28e407b8
AA
1063 if (already_exporting.count(subdir))
1064 continue;
7c673cae 1065
28e407b8
AA
1066 // we know all ancestor dirfrags up to subtree root are not freezing or frozen.
1067 // It's more efficient to use CDir::is_{freezing,frozen}_tree_root()
1068 if (subdir->is_frozen_dir() || subdir->is_frozen_tree_root() ||
1069 subdir->is_freezing_dir() || subdir->is_freezing_tree_root())
1070 continue; // can't export this right now!
7c673cae
FG
1071
1072 // how popular?
11fdf7f2 1073 double pop = subdir->pop_auth_subtree.meta_load();
7c673cae
FG
1074 subdir_sum += pop;
1075 dout(15) << " subdir pop " << pop << " " << *subdir << dendl;
1076
28e407b8
AA
1077 if (pop < minchunk) {
1078 num_idle_frags++;
1079 continue;
1080 }
7c673cae
FG
1081
1082 // lucky find?
1083 if (pop > needmin && pop < needmax) {
9f95a23c 1084 exports->push_back(subdir);
7c673cae
FG
1085 already_exporting.insert(subdir);
1086 have += pop;
1087 return;
1088 }
1089
1090 if (pop > need) {
1091 if (subdir->is_rep())
1092 bigger_rep.push_back(subdir);
1093 else
1094 bigger_unrep.push_back(subdir);
1095 } else
1096 smaller.insert(pair<double,CDir*>(pop, subdir));
1097 }
28e407b8
AA
1098 if (dfls.size() == num_idle_frags)
1099 in->item_pop_lru.remove_myself();
7c673cae
FG
1100 }
1101 dout(15) << " sum " << subdir_sum << " / " << dir_pop << dendl;
1102
1103 // grab some sufficiently big small items
1104 multimap<double,CDir*>::reverse_iterator it;
1105 for (it = smaller.rbegin();
1106 it != smaller.rend();
1107 ++it) {
1108
1109 if ((*it).first < midchunk)
1110 break; // try later
1111
1112 dout(7) << " taking smaller " << *(*it).second << dendl;
1113
9f95a23c 1114 exports->push_back((*it).second);
7c673cae
FG
1115 already_exporting.insert((*it).second);
1116 have += (*it).first;
1117 if (have > needmin)
1118 return;
1119 }
1120
1121 // apprently not enough; drill deeper into the hierarchy (if non-replicated)
9f95a23c
TL
1122 for (const auto& dir : bigger_unrep) {
1123 dout(15) << " descending into " << *dir << dendl;
1124 find_exports(dir, amount, exports, have, already_exporting);
7c673cae
FG
1125 if (have > needmin)
1126 return;
1127 }
1128
1129 // ok fine, use smaller bits
1130 for (;
1131 it != smaller.rend();
1132 ++it) {
1133 dout(7) << " taking (much) smaller " << it->first << " " << *(*it).second << dendl;
1134
9f95a23c 1135 exports->push_back((*it).second);
7c673cae
FG
1136 already_exporting.insert((*it).second);
1137 have += (*it).first;
1138 if (have > needmin)
1139 return;
1140 }
1141
1142 // ok fine, drill into replicated dirs
9f95a23c
TL
1143 for (const auto& dir : bigger_rep) {
1144 dout(7) << " descending into replicated " << *dir << dendl;
1145 find_exports(dir, amount, exports, have, already_exporting);
7c673cae
FG
1146 if (have > needmin)
1147 return;
1148 }
7c673cae
FG
1149}
1150
11fdf7f2 1151void MDBalancer::hit_inode(CInode *in, int type, int who)
7c673cae
FG
1152{
1153 // hit inode
11fdf7f2 1154 in->pop.get(type).hit();
7c673cae
FG
1155
1156 if (in->get_parent_dn())
11fdf7f2 1157 hit_dir(in->get_parent_dn()->get_dir(), type, who);
7c673cae
FG
1158}
1159
1160void MDBalancer::maybe_fragment(CDir *dir, bool hot)
1161{
1162 // split/merge
11fdf7f2 1163 if (bal_fragment_dirs && bal_fragment_interval > 0 &&
91327a77 1164 dir->is_auth() &&
11fdf7f2 1165 !dir->inode->is_base() && // not root/mdsdir (for now at least)
91327a77 1166 !dir->inode->is_stray()) { // not straydir
7c673cae
FG
1167
1168 // split
11fdf7f2 1169 if (g_conf()->mds_bal_split_size > 0 && (dir->should_split() || hot)) {
7c673cae
FG
1170 if (split_pending.count(dir->dirfrag()) == 0) {
1171 queue_split(dir, false);
1172 } else {
1173 if (dir->should_split_fast()) {
1174 queue_split(dir, true);
1175 } else {
9f95a23c 1176 dout(10) << ": fragment already enqueued to split: "
7c673cae
FG
1177 << *dir << dendl;
1178 }
1179 }
1180 }
1181
1182 // merge?
1183 if (dir->get_frag() != frag_t() && dir->should_merge() &&
1184 merge_pending.count(dir->dirfrag()) == 0) {
1185 queue_merge(dir);
1186 }
1187 }
1188}
1189
11fdf7f2 1190void MDBalancer::hit_dir(CDir *dir, int type, int who, double amount)
7c673cae
FG
1191{
1192 // hit me
11fdf7f2 1193 double v = dir->pop_me.get(type).hit(amount);
7c673cae 1194
11fdf7f2
TL
1195 const bool hot = (v > g_conf()->mds_bal_split_rd && type == META_POP_IRD) ||
1196 (v > g_conf()->mds_bal_split_wr && type == META_POP_IWR);
7c673cae 1197
9f95a23c 1198 dout(20) << type << " pop is " << v << ", frag " << dir->get_frag()
11fdf7f2 1199 << " size " << dir->get_frag_size() << " " << dir->pop_me << dendl;
7c673cae
FG
1200
1201 maybe_fragment(dir, hot);
1202
1203 // replicate?
1204 if (type == META_POP_IRD && who >= 0) {
11fdf7f2 1205 dir->pop_spread.hit(who);
7c673cae
FG
1206 }
1207
1208 double rd_adj = 0.0;
1209 if (type == META_POP_IRD &&
1210 dir->last_popularity_sample < last_sample) {
11fdf7f2 1211 double dir_pop = dir->pop_auth_subtree.get(type).get(); // hmm??
7c673cae 1212 dir->last_popularity_sample = last_sample;
11fdf7f2 1213 double pop_sp = dir->pop_spread.get();
7c673cae
FG
1214 dir_pop += pop_sp * 10;
1215
1216 //if (dir->ino() == inodeno_t(0x10000000002))
1217 if (pop_sp > 0) {
9f95a23c 1218 dout(20) << type << " pop " << dir_pop << " spread " << pop_sp
7c673cae
FG
1219 << " " << dir->pop_spread.last[0]
1220 << " " << dir->pop_spread.last[1]
1221 << " " << dir->pop_spread.last[2]
1222 << " " << dir->pop_spread.last[3]
1223 << " in " << *dir << dendl;
1224 }
1225
1226 if (dir->is_auth() && !dir->is_ambiguous_auth()) {
1227 if (!dir->is_rep() &&
11fdf7f2 1228 dir_pop >= g_conf()->mds_bal_replicate_threshold) {
7c673cae 1229 // replicate
11fdf7f2 1230 double rdp = dir->pop_me.get(META_POP_IRD).get();
7c673cae
FG
1231 rd_adj = rdp / mds->get_mds_map()->get_num_in_mds() - rdp;
1232 rd_adj /= 2.0; // temper somewhat
1233
b32b8144 1234 dout(5) << "replicating dir " << *dir << " pop " << dir_pop << " .. rdp " << rdp << " adj " << rd_adj << dendl;
7c673cae
FG
1235
1236 dir->dir_rep = CDir::REP_ALL;
1237 mds->mdcache->send_dir_updates(dir, true);
1238
1239 // fixme this should adjust the whole pop hierarchy
1240 dir->pop_me.get(META_POP_IRD).adjust(rd_adj);
1241 dir->pop_auth_subtree.get(META_POP_IRD).adjust(rd_adj);
1242 }
1243
1244 if (dir->ino() != 1 &&
1245 dir->is_rep() &&
11fdf7f2 1246 dir_pop < g_conf()->mds_bal_unreplicate_threshold) {
7c673cae 1247 // unreplicate
b32b8144 1248 dout(5) << "unreplicating dir " << *dir << " pop " << dir_pop << dendl;
7c673cae
FG
1249
1250 dir->dir_rep = CDir::REP_NONE;
1251 mds->mdcache->send_dir_updates(dir);
1252 }
1253 }
1254 }
1255
1256 // adjust ancestors
1257 bool hit_subtree = dir->is_auth(); // current auth subtree (if any)
1258 bool hit_subtree_nested = dir->is_auth(); // all nested auth subtrees
1259
1260 while (true) {
28e407b8 1261 CDir *pdir = dir->inode->get_parent_dir();
11fdf7f2 1262 dir->pop_nested.get(type).hit(amount);
7c673cae 1263 if (rd_adj != 0.0)
11fdf7f2 1264 dir->pop_nested.get(META_POP_IRD).adjust(rd_adj);
7c673cae
FG
1265
1266 if (hit_subtree) {
11fdf7f2 1267 dir->pop_auth_subtree.get(type).hit(amount);
28e407b8 1268
7c673cae 1269 if (rd_adj != 0.0)
11fdf7f2 1270 dir->pop_auth_subtree.get(META_POP_IRD).adjust(rd_adj);
28e407b8
AA
1271
1272 if (dir->is_subtree_root())
1273 hit_subtree = false; // end of auth domain, stop hitting auth counters.
1274 else if (pdir)
1275 pdir->pop_lru_subdirs.push_front(&dir->get_inode()->item_pop_lru);
7c673cae
FG
1276 }
1277
1278 if (hit_subtree_nested) {
11fdf7f2 1279 dir->pop_auth_subtree_nested.get(type).hit(amount);
7c673cae 1280 if (rd_adj != 0.0)
11fdf7f2 1281 dir->pop_auth_subtree_nested.get(META_POP_IRD).adjust(rd_adj);
7c673cae 1282 }
28e407b8
AA
1283 if (!pdir) break;
1284 dir = pdir;
7c673cae
FG
1285 }
1286}
1287
1288
1289/*
1290 * subtract off an exported chunk.
1291 * this excludes *dir itself (encode_export_dir should have take care of that)
1292 * we _just_ do the parents' nested counters.
1293 *
1294 * NOTE: call me _after_ forcing *dir into a subtree root,
1295 * but _before_ doing the encode_export_dirs.
1296 */
11fdf7f2 1297void MDBalancer::subtract_export(CDir *dir)
7c673cae
FG
1298{
1299 dirfrag_load_vec_t subload = dir->pop_auth_subtree;
1300
1301 while (true) {
1302 dir = dir->inode->get_parent_dir();
1303 if (!dir) break;
1304
11fdf7f2
TL
1305 dir->pop_nested.sub(subload);
1306 dir->pop_auth_subtree_nested.sub(subload);
7c673cae
FG
1307 }
1308}
1309
1310
11fdf7f2 1311void MDBalancer::add_import(CDir *dir)
7c673cae
FG
1312{
1313 dirfrag_load_vec_t subload = dir->pop_auth_subtree;
1314
1315 while (true) {
1316 dir = dir->inode->get_parent_dir();
1317 if (!dir) break;
1318
11fdf7f2
TL
1319 dir->pop_nested.add(subload);
1320 dir->pop_auth_subtree_nested.add(subload);
7c673cae
FG
1321 }
1322}
1323
11fdf7f2 1324void MDBalancer::adjust_pop_for_rename(CDir *pdir, CDir *dir, bool inc)
28e407b8 1325{
28e407b8
AA
1326 bool adjust_subtree_nest = dir->is_auth();
1327 bool adjust_subtree = adjust_subtree_nest && !dir->is_subtree_root();
1328 CDir *cur = dir;
1329 while (true) {
1330 if (inc) {
11fdf7f2 1331 pdir->pop_nested.add(dir->pop_nested);
28e407b8 1332 if (adjust_subtree) {
11fdf7f2 1333 pdir->pop_auth_subtree.add(dir->pop_auth_subtree);
28e407b8
AA
1334 pdir->pop_lru_subdirs.push_front(&cur->get_inode()->item_pop_lru);
1335 }
1336
1337 if (adjust_subtree_nest)
11fdf7f2 1338 pdir->pop_auth_subtree_nested.add(dir->pop_auth_subtree_nested);
28e407b8 1339 } else {
11fdf7f2 1340 pdir->pop_nested.sub(dir->pop_nested);
28e407b8 1341 if (adjust_subtree)
11fdf7f2 1342 pdir->pop_auth_subtree.sub(dir->pop_auth_subtree);
28e407b8
AA
1343
1344 if (adjust_subtree_nest)
11fdf7f2 1345 pdir->pop_auth_subtree_nested.sub(dir->pop_auth_subtree_nested);
28e407b8
AA
1346 }
1347
1348 if (pdir->is_subtree_root())
1349 adjust_subtree = false;
1350 cur = pdir;
1351 pdir = pdir->inode->get_parent_dir();
1352 if (!pdir) break;
1353 }
1354}
1355
224ce89b
WB
1356void MDBalancer::handle_mds_failure(mds_rank_t who)
1357{
1358 if (0 == who) {
28e407b8 1359 mds_last_epoch_under_map.clear();
224ce89b
WB
1360 }
1361}
28e407b8 1362
9f95a23c 1363int MDBalancer::dump_loads(Formatter *f) const
28e407b8 1364{
9f95a23c 1365 std::deque<CDir*> dfs;
28e407b8
AA
1366 if (mds->mdcache->get_root()) {
1367 mds->mdcache->get_root()->get_dirfrags(dfs);
1368 } else {
9f95a23c 1369 dout(10) << "no root" << dendl;
28e407b8
AA
1370 }
1371
1372 f->open_object_section("loads");
1373
1374 f->open_array_section("dirfrags");
1375 while (!dfs.empty()) {
1376 CDir *dir = dfs.front();
1377 dfs.pop_front();
1378
11fdf7f2
TL
1379 f->open_object_section("dir");
1380 dir->dump_load(f);
1381 f->close_section();
28e407b8
AA
1382
1383 for (auto it = dir->begin(); it != dir->end(); ++it) {
1384 CInode *in = it->second->get_linkage()->get_inode();
1385 if (!in || !in->is_dir())
1386 continue;
1387
9f95a23c
TL
1388 auto&& ls = in->get_dirfrags();
1389 for (const auto& subdir : ls) {
28e407b8
AA
1390 if (subdir->pop_nested.meta_load() < .001)
1391 continue;
1392 dfs.push_back(subdir);
1393 }
1394 }
1395 }
1396 f->close_section(); // dirfrags array
1397
1398 f->open_object_section("mds_load");
1399 {
1400
11fdf7f2 1401 auto dump_mds_load = [f](mds_load_t& load) {
28e407b8
AA
1402 f->dump_float("request_rate", load.req_rate);
1403 f->dump_float("cache_hit_rate", load.cache_hit_rate);
1404 f->dump_float("queue_length", load.queue_len);
1405 f->dump_float("cpu_load", load.cpu_load_avg);
1406 f->dump_float("mds_load", load.mds_load());
1407
28e407b8 1408 f->open_object_section("auth_dirfrags");
11fdf7f2 1409 load.auth.dump(f);
28e407b8
AA
1410 f->close_section();
1411 f->open_object_section("all_dirfrags");
11fdf7f2 1412 load.all.dump(f);
28e407b8
AA
1413 f->close_section();
1414 };
1415
1416 for (auto p : mds_load) {
1417 stringstream name;
1418 name << "mds." << p.first;
1419 f->open_object_section(name.str().c_str());
1420 dump_mds_load(p.second);
1421 f->close_section();
1422 }
1423 }
1424 f->close_section(); // mds_load
1425
1426 f->open_object_section("mds_meta_load");
1427 for (auto p : mds_meta_load) {
1428 stringstream name;
1429 name << "mds." << p.first;
1430 f->dump_float(name.str().c_str(), p.second);
1431 }
1432 f->close_section(); // mds_meta_load
1433
1434 f->open_object_section("mds_import_map");
1435 for (auto p : mds_import_map) {
1436 stringstream name1;
1437 name1 << "mds." << p.first;
1438 f->open_array_section(name1.str().c_str());
1439 for (auto q : p.second) {
1440 f->open_object_section("from");
1441 stringstream name2;
1442 name2 << "mds." << q.first;
1443 f->dump_float(name2.str().c_str(), q.second);
1444 f->close_section();
1445 }
1446 f->close_section(); // mds.? array
1447 }
1448 f->close_section(); // mds_import_map
1449
1450 f->close_section(); // loads
1451 return 0;
1452}