]> git.proxmox.com Git - ceph.git/blame - ceph/src/mds/MDBalancer.cc
update ceph source to reef 18.1.2
[ceph.git] / ceph / src / mds / MDBalancer.cc
CommitLineData
7c673cae
FG
1// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
2// vim: ts=8 sw=2 smarttab
3/*
4 * Ceph - scalable distributed file system
5 *
6 * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
7 *
8 * This is free software; you can redistribute it and/or
9 * modify it under the terms of the GNU Lesser General Public
10 * License version 2.1, as published by the Free Software
11 * Foundation. See file COPYING.
12 *
13 */
14
15#include "include/compat.h"
16#include "mdstypes.h"
17
91327a77 18#include "mon/MonClient.h"
7c673cae
FG
19#include "MDBalancer.h"
20#include "MDSRank.h"
7c673cae
FG
21#include "MDSMap.h"
22#include "CInode.h"
23#include "CDir.h"
24#include "MDCache.h"
25#include "Migrator.h"
26#include "Mantle.h"
27
28#include "include/Context.h"
29#include "msg/Messenger.h"
7c673cae
FG
30
31#include <fstream>
7c673cae
FG
32#include <vector>
33#include <map>
20effc67
TL
34
35using namespace std;
7c673cae
FG
36
37#include "common/config.h"
38#include "common/errno.h"
39
1e59de90
TL
40/* Note, by default debug_mds_balancer is 1/5. For debug messages 1<lvl<=5,
41 * should_gather (below) will be true; so, debug_mds will be ignored even if
42 * set to 20/20. For this reason, some messages may not appear in the log.
43 * Increase both debug levels to get expected output!
44 */
7c673cae 45#define dout_context g_ceph_context
7c673cae 46#undef dout_prefix
9f95a23c 47#define dout_prefix *_dout << "mds." << mds->get_nodeid() << ".bal " << __func__ << " "
7c673cae
FG
48#undef dout
49#define dout(lvl) \
50 do {\
51 auto subsys = ceph_subsys_mds;\
52 if ((dout_context)->_conf->subsys.should_gather(ceph_subsys_mds_balancer, lvl)) {\
53 subsys = ceph_subsys_mds_balancer;\
54 }\
11fdf7f2 55 dout_impl(dout_context, ceph::dout::need_dynamic(subsys), lvl) dout_prefix
7c673cae
FG
56#undef dendl
57#define dendl dendl_impl; } while (0)
58
59
60#define MIN_LOAD 50 // ??
61#define MIN_REEXPORT 5 // will automatically reexport
62#define MIN_OFFLOAD 10 // point at which i stop trying, close enough
63
64
9f95a23c 65int MDBalancer::proc_message(const cref_t<Message> &m)
7c673cae
FG
66{
67 switch (m->get_type()) {
68
69 case MSG_MDS_HEARTBEAT:
9f95a23c 70 handle_heartbeat(ref_cast<MHeartbeat>(m));
7c673cae
FG
71 break;
72
73 default:
b32b8144 74 derr << " balancer unknown message " << m->get_type() << dendl_impl;
11fdf7f2 75 ceph_abort_msg("balancer unknown message");
7c673cae
FG
76 }
77
78 return 0;
79}
80
91327a77
AA
81MDBalancer::MDBalancer(MDSRank *m, Messenger *msgr, MonClient *monc) :
82 mds(m), messenger(msgr), mon_client(monc)
83{
11fdf7f2
TL
84 bal_fragment_dirs = g_conf().get_val<bool>("mds_bal_fragment_dirs");
85 bal_fragment_interval = g_conf().get_val<int64_t>("mds_bal_fragment_interval");
91327a77
AA
86}
87
92f5a8d4 88void MDBalancer::handle_conf_change(const std::set<std::string>& changed, const MDSMap& mds_map)
91327a77 89{
1e59de90 90 if (changed.count("mds_bal_fragment_dirs")) {
11fdf7f2 91 bal_fragment_dirs = g_conf().get_val<bool>("mds_bal_fragment_dirs");
1e59de90
TL
92 }
93 if (changed.count("mds_bal_fragment_interval")) {
11fdf7f2 94 bal_fragment_interval = g_conf().get_val<int64_t>("mds_bal_fragment_interval");
1e59de90
TL
95 }
96}
97
98bool MDBalancer::test_rank_mask(mds_rank_t rank)
99{
100 return mds->mdsmap->get_bal_rank_mask_bitset().test(rank);
91327a77
AA
101}
102
7c673cae
FG
103void MDBalancer::handle_export_pins(void)
104{
f67539c2
TL
105 const mds_rank_t max_mds = mds->mdsmap->get_max_mds();
106 auto mdcache = mds->mdcache;
107
108 auto &q = mdcache->export_pin_queue;
7c673cae
FG
109 auto it = q.begin();
110 dout(20) << "export_pin_queue size=" << q.size() << dendl;
111 while (it != q.end()) {
31f18b77
FG
112 auto cur = it++;
113 CInode *in = *cur;
11fdf7f2 114 ceph_assert(in->is_dir());
f6b5b4d7 115
31f18b77 116 mds_rank_t export_pin = in->get_export_pin(false);
f67539c2
TL
117 in->check_pin_policy(export_pin);
118
119 if (export_pin >= max_mds) {
f6b5b4d7 120 dout(20) << " delay export_pin=" << export_pin << " on " << *in << dendl;
eafe8130
TL
121 in->state_clear(CInode::STATE_QUEUEDEXPORTPIN);
122 q.erase(cur);
123
124 in->state_set(CInode::STATE_DELAYEDEXPORTPIN);
f67539c2 125 mdcache->export_pin_delayed_queue.insert(in);
eafe8130
TL
126 continue;
127 }
31f18b77 128
f67539c2
TL
129 dout(20) << " executing export_pin=" << export_pin << " on " << *in << dendl;
130 unsigned min_frag_bits = 0;
131 mds_rank_t target = MDS_RANK_NONE;
132 if (export_pin >= 0)
133 target = export_pin;
134 else if (export_pin == MDS_RANK_EPHEMERAL_RAND)
135 target = mdcache->hash_into_rank_bucket(in->ino());
136 else if (export_pin == MDS_RANK_EPHEMERAL_DIST)
137 min_frag_bits = mdcache->get_ephemeral_dist_frag_bits();
138
31f18b77 139 bool remove = true;
f6b5b4d7 140 for (auto&& dir : in->get_dirfrags()) {
31f18b77
FG
141 if (!dir->is_auth())
142 continue;
143
f67539c2
TL
144 if (export_pin == MDS_RANK_EPHEMERAL_DIST) {
145 if (dir->get_frag().bits() < min_frag_bits) {
146 if (!dir->state_test(CDir::STATE_CREATING) &&
147 !dir->is_frozen() && !dir->is_freezing()) {
148 queue_split(dir, true);
149 }
150 remove = false;
151 continue;
152 }
153 target = mdcache->hash_into_rank_bucket(in->ino(), dir->get_frag());
154 }
155
156 if (target == MDS_RANK_NONE) {
31f18b77
FG
157 if (dir->state_test(CDir::STATE_AUXSUBTREE)) {
158 if (dir->is_frozen() || dir->is_freezing()) {
159 // try again later
160 remove = false;
161 continue;
162 }
163 dout(10) << " clear auxsubtree on " << *dir << dendl;
164 dir->state_clear(CDir::STATE_AUXSUBTREE);
165 mds->mdcache->try_subtree_merge(dir);
166 }
f67539c2 167 } else if (target == mds->get_nodeid()) {
f6b5b4d7
TL
168 if (dir->state_test(CDir::STATE_AUXSUBTREE)) {
169 ceph_assert(dir->is_subtree_root());
170 } else if (dir->state_test(CDir::STATE_CREATING) ||
171 dir->is_frozen() || dir->is_freezing()) {
31f18b77
FG
172 // try again later
173 remove = false;
174 continue;
f6b5b4d7 175 } else if (!dir->is_subtree_root()) {
31f18b77
FG
176 dir->state_set(CDir::STATE_AUXSUBTREE);
177 mds->mdcache->adjust_subtree_auth(dir, mds->get_nodeid());
178 dout(10) << " create aux subtree on " << *dir << dendl;
f6b5b4d7 179 } else {
31f18b77
FG
180 dout(10) << " set auxsubtree bit on " << *dir << dendl;
181 dir->state_set(CDir::STATE_AUXSUBTREE);
182 }
183 } else {
f6b5b4d7
TL
184 /* Only export a directory if it's non-empty. An empty directory will
185 * be sent back by the importer.
186 */
187 if (dir->get_num_head_items() > 0) {
f67539c2 188 mds->mdcache->migrator->export_dir(dir, target);
f6b5b4d7 189 }
31f18b77 190 remove = false;
7c673cae
FG
191 }
192 }
31f18b77
FG
193
194 if (remove) {
195 in->state_clear(CInode::STATE_QUEUEDEXPORTPIN);
196 q.erase(cur);
7c673cae
FG
197 }
198 }
199
f67539c2 200 std::vector<CDir*> authsubs = mdcache->get_auth_subtrees();
81eedcae
TL
201 bool print_auth_subtrees = true;
202
203 if (authsubs.size() > AUTH_TREES_THRESHOLD &&
204 !g_conf()->subsys.should_gather<ceph_subsys_mds, 25>()) {
205 dout(15) << "number of auth trees = " << authsubs.size() << "; not "
206 "printing auth trees" << dendl;
207 print_auth_subtrees = false;
208 }
209
210 for (auto &cd : authsubs) {
7c673cae 211 mds_rank_t export_pin = cd->inode->get_export_pin();
f67539c2 212 cd->inode->check_pin_policy(export_pin);
81eedcae 213
f67539c2
TL
214 if (export_pin == MDS_RANK_EPHEMERAL_DIST) {
215 export_pin = mdcache->hash_into_rank_bucket(cd->ino(), cd->get_frag());
216 } else if (export_pin == MDS_RANK_EPHEMERAL_RAND) {
217 export_pin = mdcache->hash_into_rank_bucket(cd->ino());
81eedcae
TL
218 }
219
f67539c2
TL
220 if (print_auth_subtrees)
221 dout(25) << "auth tree " << *cd << " export_pin=" << export_pin << dendl;
222
223 if (export_pin >= 0 && export_pin != mds->get_nodeid() &&
224 export_pin < mds->mdsmap->get_max_mds()) {
225 mdcache->migrator->export_dir(cd, export_pin);
7c673cae
FG
226 }
227 }
228}
229
230void MDBalancer::tick()
231{
11fdf7f2
TL
232 static int num_bal_times = g_conf()->mds_bal_max;
233 auto bal_interval = g_conf().get_val<int64_t>("mds_bal_interval");
234 auto bal_max_until = g_conf().get_val<int64_t>("mds_bal_max_until");
235 time now = clock::now();
7c673cae 236
11fdf7f2 237 if (g_conf()->mds_bal_export_pin) {
7c673cae
FG
238 handle_export_pins();
239 }
240
241 // sample?
11fdf7f2
TL
242 if (chrono::duration<double>(now-last_sample).count() >
243 g_conf()->mds_bal_sample_interval) {
7c673cae
FG
244 dout(15) << "tick last_sample now " << now << dendl;
245 last_sample = now;
246 }
247
11fdf7f2
TL
248 // We can use duration_cast below, although the result is an int,
249 // because the values from g_conf are also integers.
7c673cae 250 // balance?
11fdf7f2
TL
251 if (mds->get_nodeid() == 0
252 && mds->is_active()
253 && bal_interval > 0
20effc67 254 && chrono::duration_cast<chrono::seconds>(now - last_heartbeat).count() >= bal_interval
11fdf7f2 255 && (num_bal_times || (bal_max_until >= 0 && mds->get_uptime().count() > bal_max_until))) {
7c673cae
FG
256 last_heartbeat = now;
257 send_heartbeat();
258 num_bal_times--;
259 }
81eedcae
TL
260
261 mds->mdcache->show_subtrees(10, true);
7c673cae
FG
262}
263
264
265
266
267class C_Bal_SendHeartbeat : public MDSInternalContext {
268public:
269 explicit C_Bal_SendHeartbeat(MDSRank *mds_) : MDSInternalContext(mds_) { }
270 void finish(int f) override {
271 mds->balancer->send_heartbeat();
272 }
273};
274
275
11fdf7f2 276double mds_load_t::mds_load() const
7c673cae 277{
11fdf7f2 278 switch(g_conf()->mds_bal_mode) {
7c673cae
FG
279 case 0:
280 return
281 .8 * auth.meta_load() +
282 .2 * all.meta_load() +
283 req_rate +
284 10.0 * queue_len;
285
286 case 1:
287 return req_rate + 10.0*queue_len;
288
289 case 2:
290 return cpu_load_avg;
291
292 }
293 ceph_abort();
294 return 0;
295}
296
11fdf7f2 297mds_load_t MDBalancer::get_load()
7c673cae 298{
11fdf7f2
TL
299 auto now = clock::now();
300
301 mds_load_t load{DecayRate()}; /* zero DecayRate! */
7c673cae
FG
302
303 if (mds->mdcache->get_root()) {
9f95a23c 304 auto&& ls = mds->mdcache->get_root()->get_dirfrags();
11fdf7f2
TL
305 for (auto &d : ls) {
306 load.auth.add(d->pop_auth_subtree_nested);
307 load.all.add(d->pop_nested);
7c673cae
FG
308 }
309 } else {
9f95a23c 310 dout(20) << "no root, no load" << dendl;
7c673cae
FG
311 }
312
28e407b8 313 uint64_t num_requests = mds->get_num_requests();
522d829b
TL
314 uint64_t num_traverse = mds->logger->get(l_mds_traverse);
315 uint64_t num_traverse_hit = mds->logger->get(l_mds_traverse_hit);
91327a77
AA
316
317 uint64_t cpu_time = 1;
318 {
319 string stat_path = PROCPREFIX "/proc/self/stat";
320 ifstream stat_file(stat_path);
321 if (stat_file.is_open()) {
322 vector<string> stat_vec(std::istream_iterator<string>{stat_file},
323 std::istream_iterator<string>());
324 if (stat_vec.size() >= 15) {
325 // utime + stime
326 cpu_time = strtoll(stat_vec[13].c_str(), nullptr, 10) +
327 strtoll(stat_vec[14].c_str(), nullptr, 10);
328 } else {
329 derr << "input file '" << stat_path << "' not resolvable" << dendl_impl;
330 }
331 } else {
332 derr << "input file '" << stat_path << "' not found" << dendl_impl;
333 }
334 }
335
336 load.queue_len = messenger->get_dispatch_queue_len();
337
338 bool update_last = true;
11fdf7f2 339 if (last_get_load != clock::zero() &&
91327a77 340 now > last_get_load) {
11fdf7f2
TL
341 double el = std::chrono::duration<double>(now-last_get_load).count();
342 if (el >= 1.0) {
91327a77 343 if (num_requests > last_num_requests)
11fdf7f2 344 load.req_rate = (num_requests - last_num_requests) / el;
91327a77 345 if (cpu_time > last_cpu_time)
11fdf7f2 346 load.cpu_load_avg = (cpu_time - last_cpu_time) / el;
522d829b
TL
347 if (num_traverse > last_num_traverse && num_traverse_hit > last_num_traverse_hit)
348 load.cache_hit_rate = (double)(num_traverse_hit - last_num_traverse_hit) / (num_traverse - last_num_traverse);
91327a77
AA
349 } else {
350 auto p = mds_load.find(mds->get_nodeid());
351 if (p != mds_load.end()) {
352 load.req_rate = p->second.req_rate;
353 load.cpu_load_avg = p->second.cpu_load_avg;
522d829b 354 load.cache_hit_rate = p->second.cache_hit_rate;
91327a77 355 }
522d829b
TL
356 if (num_requests >= last_num_requests && cpu_time >= last_cpu_time &&
357 num_traverse >= last_num_traverse && num_traverse_hit >= last_num_traverse_hit)
91327a77 358 update_last = false;
28e407b8
AA
359 }
360 }
28e407b8 361
91327a77
AA
362 if (update_last) {
363 last_num_requests = num_requests;
364 last_cpu_time = cpu_time;
365 last_get_load = now;
522d829b
TL
366 last_num_traverse = num_traverse;
367 last_num_traverse_hit = num_traverse_hit;
91327a77 368 }
7c673cae 369
9f95a23c 370 dout(15) << load << dendl;
7c673cae
FG
371 return load;
372}
373
374/*
375 * Read synchronously from RADOS using a timeout. We cannot do daemon-local
376 * fallbacks (i.e. kick off async read when we are processing the map and
377 * check status when we get here) with the way the mds is structured.
378 */
379int MDBalancer::localize_balancer()
380{
381 /* reset everything */
382 bool ack = false;
383 int r = 0;
384 bufferlist lua_src;
9f95a23c
TL
385 ceph::mutex lock = ceph::make_mutex("lock");
386 ceph::condition_variable cond;
7c673cae
FG
387
388 /* we assume that balancer is in the metadata pool */
389 object_t oid = object_t(mds->mdsmap->get_balancer());
b3b6e05e 390 object_locator_t oloc(mds->get_metadata_pool());
7c673cae 391 ceph_tid_t tid = mds->objecter->read(oid, oloc, 0, 0, CEPH_NOSNAP, &lua_src, 0,
9f95a23c 392 new C_SafeCond(lock, cond, &ack, &r));
7c673cae
FG
393 dout(15) << "launched non-blocking read tid=" << tid
394 << " oid=" << oid << " oloc=" << oloc << dendl;
395
396 /* timeout: if we waste half our time waiting for RADOS, then abort! */
9f95a23c
TL
397 std::cv_status ret_t = [&] {
398 auto bal_interval = g_conf().get_val<int64_t>("mds_bal_interval");
399 std::unique_lock locker{lock};
400 return cond.wait_for(locker, std::chrono::seconds(bal_interval / 2));
401 }();
7c673cae
FG
402 /* success: store the balancer in memory and set the version. */
403 if (!r) {
9f95a23c 404 if (ret_t == std::cv_status::timeout) {
f67539c2
TL
405 mds->objecter->op_cancel(tid, -CEPHFS_ECANCELED);
406 return -CEPHFS_ETIMEDOUT;
7c673cae
FG
407 }
408 bal_code.assign(lua_src.to_str());
409 bal_version.assign(oid.name);
9f95a23c 410 dout(10) "bal_code=" << bal_code << dendl;
7c673cae
FG
411 }
412 return r;
413}
414
415void MDBalancer::send_heartbeat()
416{
7c673cae 417 if (mds->is_cluster_degraded()) {
9f95a23c 418 dout(10) << "degraded" << dendl;
7c673cae
FG
419 return;
420 }
421
422 if (!mds->mdcache->is_open()) {
9f95a23c 423 dout(10) << "not open" << dendl;
7c673cae
FG
424 mds->mdcache->wait_for_open(new C_Bal_SendHeartbeat(mds));
425 return;
426 }
427
94b18763 428 if (mds->get_nodeid() == 0) {
7c673cae 429 beat_epoch++;
94b18763
FG
430 mds_load.clear();
431 }
7c673cae
FG
432
433 // my load
11fdf7f2 434 mds_load_t load = get_load();
28e407b8
AA
435 mds->logger->set(l_mds_load_cent, 100 * load.mds_load());
436 mds->logger->set(l_mds_dispatch_queue_len, load.queue_len);
437
11fdf7f2
TL
438 auto em = mds_load.emplace(std::piecewise_construct, std::forward_as_tuple(mds->get_nodeid()), std::forward_as_tuple(load));
439 if (!em.second) {
440 em.first->second = load;
441 }
7c673cae
FG
442
443 // import_map -- how much do i import from whom
444 map<mds_rank_t, float> import_map;
11fdf7f2 445 for (auto& im : mds->mdcache->get_auth_subtrees()) {
7c673cae
FG
446 mds_rank_t from = im->inode->authority().first;
447 if (from == mds->get_nodeid()) continue;
448 if (im->get_inode()->is_stray()) continue;
11fdf7f2 449 import_map[from] += im->pop_auth_subtree.meta_load();
7c673cae
FG
450 }
451 mds_import_map[ mds->get_nodeid() ] = import_map;
452
453
9f95a23c
TL
454 dout(3) << " epoch " << beat_epoch << " load " << load << dendl;
455 for (const auto& [rank, load] : import_map) {
456 dout(5) << " import_map from " << rank << " -> " << load << dendl;
7c673cae
FG
457 }
458
459
460 set<mds_rank_t> up;
461 mds->get_mds_map()->get_up_mds_set(up);
11fdf7f2
TL
462 for (const auto& r : up) {
463 if (r == mds->get_nodeid())
7c673cae 464 continue;
9f95a23c 465 auto hb = make_message<MHeartbeat>(load, beat_epoch);
7c673cae 466 hb->get_import_map() = import_map;
11fdf7f2 467 mds->send_message_mds(hb, r);
7c673cae
FG
468 }
469}
470
9f95a23c 471void MDBalancer::handle_heartbeat(const cref_t<MHeartbeat> &m)
7c673cae 472{
7c673cae
FG
473 mds_rank_t who = mds_rank_t(m->get_source().num());
474 dout(25) << "=== got heartbeat " << m->get_beat() << " from " << m->get_source().num() << " " << m->get_load() << dendl;
475
476 if (!mds->is_active())
11fdf7f2 477 return;
7c673cae
FG
478
479 if (!mds->mdcache->is_open()) {
480 dout(10) << "opening root on handle_heartbeat" << dendl;
481 mds->mdcache->wait_for_open(new C_MDS_RetryMessage(mds, m));
482 return;
483 }
484
485 if (mds->is_cluster_degraded()) {
486 dout(10) << " degraded, ignoring" << dendl;
11fdf7f2 487 return;
7c673cae
FG
488 }
489
94b18763
FG
490 if (mds->get_nodeid() != 0 && m->get_beat() > beat_epoch) {
491 dout(10) << "receive next epoch " << m->get_beat() << " from mds." << who << " before mds0" << dendl;
492
493 beat_epoch = m->get_beat();
494 // clear the mds load info whose epoch is less than beat_epoch
495 mds_load.clear();
496 }
497
7c673cae 498 if (who == 0) {
94b18763
FG
499 dout(20) << " from mds0, new epoch " << m->get_beat() << dendl;
500 if (beat_epoch != m->get_beat()) {
28e407b8 501 beat_epoch = m->get_beat();
94b18763
FG
502 mds_load.clear();
503 }
28e407b8 504
7c673cae
FG
505 send_heartbeat();
506
507 mds->mdcache->show_subtrees();
28e407b8
AA
508 } else if (mds->get_nodeid() == 0) {
509 if (beat_epoch != m->get_beat()) {
510 dout(10) << " old heartbeat epoch, ignoring" << dendl;
11fdf7f2 511 return;
7c673cae
FG
512 }
513 }
28e407b8 514
11fdf7f2
TL
515 {
516 auto em = mds_load.emplace(std::piecewise_construct, std::forward_as_tuple(who), std::forward_as_tuple(m->get_load()));
517 if (!em.second) {
518 em.first->second = m->get_load();
519 }
520 }
28e407b8 521 mds_import_map[who] = m->get_import_map();
7c673cae 522
1e59de90
TL
523 mds->mdsmap->update_num_mdss_in_rank_mask_bitset();
524
525 if (mds->mdsmap->get_num_mdss_in_rank_mask_bitset() > 0)
7c673cae
FG
526 {
527 unsigned cluster_size = mds->get_mds_map()->get_num_in_mds();
528 if (mds_load.size() == cluster_size) {
529 // let's go!
530 //export_empties(); // no!
531
532 /* avoid spamming ceph -w if user does not turn mantle on */
533 if (mds->mdsmap->get_balancer() != "") {
534 int r = mantle_prep_rebalance();
11fdf7f2 535 if (!r) return;
7c673cae
FG
536 mds->clog->warn() << "using old balancer; mantle failed for "
537 << "balancer=" << mds->mdsmap->get_balancer()
538 << " : " << cpp_strerror(r);
539 }
540 prep_rebalance(m->get_beat());
541 }
542 }
7c673cae
FG
543}
544
7c673cae
FG
545double MDBalancer::try_match(balance_state_t& state, mds_rank_t ex, double& maxex,
546 mds_rank_t im, double& maxim)
547{
548 if (maxex <= 0 || maxim <= 0) return 0.0;
549
11fdf7f2 550 double howmuch = std::min(maxex, maxim);
7c673cae
FG
551
552 dout(5) << " - mds." << ex << " exports " << howmuch << " to mds." << im << dendl;
553
554 if (ex == mds->get_nodeid())
555 state.targets[im] += howmuch;
556
557 state.exported[ex] += howmuch;
558 state.imported[im] += howmuch;
559
560 maxex -= howmuch;
561 maxim -= howmuch;
562
563 return howmuch;
564}
565
566void MDBalancer::queue_split(const CDir *dir, bool fast)
567{
568 dout(10) << __func__ << " enqueuing " << *dir
569 << " (fast=" << fast << ")" << dendl;
570
f67539c2 571 const dirfrag_t df = dir->dirfrag();
7c673cae 572
f67539c2
TL
573 auto callback = [this, df](int r) {
574 if (split_pending.erase(df) == 0) {
7c673cae
FG
575 // Someone beat me to it. This can happen in the fast splitting
576 // path, because we spawn two contexts, one with mds->timer and
577 // one with mds->queue_waiter. The loser can safely just drop
578 // out.
579 return;
580 }
581
f67539c2
TL
582 auto mdcache = mds->mdcache;
583
584 CDir *dir = mdcache->get_dirfrag(df);
585 if (!dir) {
586 dout(10) << "drop split on " << df << " because not in cache" << dendl;
7c673cae
FG
587 return;
588 }
f67539c2
TL
589 if (!dir->is_auth()) {
590 dout(10) << "drop split on " << df << " because non-auth" << dendl;
7c673cae
FG
591 return;
592 }
593
594 // Pass on to MDCache: note that the split might still not
595 // happen if the checks in MDCache::can_fragment fail.
f67539c2
TL
596 dout(10) << __func__ << " splitting " << *dir << dendl;
597 int bits = g_conf()->mds_bal_split_bits;
598 if (dir->inode->is_ephemeral_dist()) {
599 unsigned min_frag_bits = mdcache->get_ephemeral_dist_frag_bits();
600 if (df.frag.bits() + bits < min_frag_bits)
601 bits = min_frag_bits - df.frag.bits();
602 }
603 mdcache->split_dir(dir, bits);
7c673cae
FG
604 };
605
f67539c2
TL
606 auto ret = split_pending.insert(df);
607 bool is_new = ret.second;
7c673cae
FG
608
609 if (fast) {
610 // Do the split ASAP: enqueue it in the MDSRank waiters which are
611 // run at the end of dispatching the current request
612 mds->queue_waiter(new MDSInternalContextWrapper(mds,
9f95a23c 613 new LambdaContext(std::move(callback))));
7c673cae
FG
614 } else if (is_new) {
615 // Set a timer to really do the split: we don't do it immediately
616 // so that bursts of ops on a directory have a chance to go through
617 // before we freeze it.
91327a77 618 mds->timer.add_event_after(bal_fragment_interval,
9f95a23c 619 new LambdaContext(std::move(callback)));
7c673cae
FG
620 }
621}
622
623void MDBalancer::queue_merge(CDir *dir)
624{
625 const auto frag = dir->dirfrag();
626 auto callback = [this, frag](int r) {
11fdf7f2 627 ceph_assert(frag.frag != frag_t());
7c673cae
FG
628
629 // frag must be in this set because only one context is in flight
630 // for a given frag at a time (because merge_pending is checked before
631 // starting one), and this context is the only one that erases it.
632 merge_pending.erase(frag);
633
f67539c2
TL
634 auto mdcache = mds->mdcache;
635 CDir *dir = mdcache->get_dirfrag(frag);
7c673cae
FG
636 if (!dir) {
637 dout(10) << "drop merge on " << frag << " because not in cache" << dendl;
638 return;
639 }
11fdf7f2 640 ceph_assert(dir->dirfrag() == frag);
7c673cae
FG
641
642 if(!dir->is_auth()) {
643 dout(10) << "drop merge on " << *dir << " because lost auth" << dendl;
644 return;
645 }
646
647 dout(10) << "merging " << *dir << dendl;
648
649 CInode *diri = dir->get_inode();
650
f67539c2
TL
651 unsigned min_frag_bits = 0;
652 if (diri->is_ephemeral_dist())
653 min_frag_bits = mdcache->get_ephemeral_dist_frag_bits();
654
7c673cae 655 frag_t fg = dir->get_frag();
f67539c2 656 while (fg.bits() > min_frag_bits) {
7c673cae 657 frag_t sibfg = fg.get_sibling();
9f95a23c 658 auto&& [complete, sibs] = diri->get_dirfrags_under(sibfg);
7c673cae
FG
659 if (!complete) {
660 dout(10) << " not all sibs under " << sibfg << " in cache (have " << sibs << ")" << dendl;
661 break;
662 }
663 bool all = true;
9f95a23c 664 for (auto& sib : sibs) {
7c673cae
FG
665 if (!sib->is_auth() || !sib->should_merge()) {
666 all = false;
667 break;
668 }
669 }
670 if (!all) {
671 dout(10) << " not all sibs under " << sibfg << " " << sibs << " should_merge" << dendl;
672 break;
673 }
674 dout(10) << " all sibs under " << sibfg << " " << sibs << " should merge" << dendl;
675 fg = fg.parent();
676 }
677
678 if (fg != dir->get_frag())
f67539c2 679 mdcache->merge_dir(diri, fg);
7c673cae
FG
680 };
681
682 if (merge_pending.count(frag) == 0) {
9f95a23c 683 dout(20) << " enqueued dir " << *dir << dendl;
7c673cae 684 merge_pending.insert(frag);
91327a77 685 mds->timer.add_event_after(bal_fragment_interval,
9f95a23c 686 new LambdaContext(std::move(callback)));
7c673cae 687 } else {
9f95a23c 688 dout(20) << " dir already in queue " << *dir << dendl;
7c673cae
FG
689 }
690}
691
692void MDBalancer::prep_rebalance(int beat)
693{
694 balance_state_t state;
695
11fdf7f2 696 if (g_conf()->mds_thrash_exports) {
7c673cae
FG
697 //we're going to randomly export to all the mds in the cluster
698 set<mds_rank_t> up_mds;
699 mds->get_mds_map()->get_up_mds_set(up_mds);
700 for (const auto &rank : up_mds) {
701 state.targets[rank] = 0.0;
702 }
703 } else {
704 int cluster_size = mds->get_mds_map()->get_num_in_mds();
705 mds_rank_t whoami = mds->get_nodeid();
11fdf7f2 706 rebalance_time = clock::now();
7c673cae 707
9f95a23c 708 dout(7) << "cluster loads are" << dendl;
7c673cae
FG
709
710 mds->mdcache->migrator->clear_export_queue();
711
712 // rescale! turn my mds_load back into meta_load units
713 double load_fac = 1.0;
714 map<mds_rank_t, mds_load_t>::iterator m = mds_load.find(whoami);
715 if ((m != mds_load.end()) && (m->second.mds_load() > 0)) {
11fdf7f2 716 double metald = m->second.auth.meta_load();
7c673cae
FG
717 double mdsld = m->second.mds_load();
718 load_fac = metald / mdsld;
719 dout(7) << " load_fac is " << load_fac
720 << " <- " << m->second.auth << " " << metald
721 << " / " << mdsld
722 << dendl;
723 }
724
28e407b8
AA
725 mds_meta_load.clear();
726
7c673cae
FG
727 double total_load = 0.0;
728 multimap<double,mds_rank_t> load_map;
729 for (mds_rank_t i=mds_rank_t(0); i < mds_rank_t(cluster_size); i++) {
28e407b8 730 mds_load_t& load = mds_load.at(i);
7c673cae
FG
731
732 double l = load.mds_load() * load_fac;
733 mds_meta_load[i] = l;
734
735 if (whoami == 0)
9f95a23c 736 dout(7) << " mds." << i
7c673cae
FG
737 << " " << load
738 << " = " << load.mds_load()
739 << " ~ " << l << dendl;
740
741 if (whoami == i) my_load = l;
742 total_load += l;
743
744 load_map.insert(pair<double,mds_rank_t>( l, i ));
745 }
746
747 // target load
1e59de90 748 target_load = total_load / (double)mds->mdsmap->get_num_mdss_in_rank_mask_bitset();
9f95a23c 749 dout(7) << "my load " << my_load
7c673cae
FG
750 << " target " << target_load
751 << " total " << total_load
752 << dendl;
753
754 // under or over?
9f95a23c 755 for (const auto& [load, rank] : load_map) {
1e59de90
TL
756 if (test_rank_mask(rank) &&
757 load < target_load * (1.0 + g_conf()->mds_bal_min_rebalance)) {
9f95a23c
TL
758 dout(7) << " mds." << rank << " is underloaded or barely overloaded." << dendl;
759 mds_last_epoch_under_map[rank] = beat_epoch;
28e407b8
AA
760 }
761 }
762
763 int last_epoch_under = mds_last_epoch_under_map[whoami];
764 if (last_epoch_under == beat_epoch) {
9f95a23c 765 dout(7) << " i am underloaded or barely overloaded, doing nothing." << dendl;
7c673cae
FG
766 return;
767 }
7c673cae
FG
768 // am i over long enough?
769 if (last_epoch_under && beat_epoch - last_epoch_under < 2) {
9f95a23c 770 dout(7) << " i am overloaded, but only for " << (beat_epoch - last_epoch_under) << " epochs" << dendl;
7c673cae
FG
771 return;
772 }
773
9f95a23c 774 dout(7) << " i am sufficiently overloaded" << dendl;
7c673cae
FG
775
776
777 // first separate exporters and importers
778 multimap<double,mds_rank_t> importers;
779 multimap<double,mds_rank_t> exporters;
780 set<mds_rank_t> importer_set;
781 set<mds_rank_t> exporter_set;
782
783 for (multimap<double,mds_rank_t>::iterator it = load_map.begin();
784 it != load_map.end();
785 ++it) {
1e59de90 786 if (it->first < target_load && test_rank_mask(it->second)) {
7c673cae
FG
787 dout(15) << " mds." << it->second << " is importer" << dendl;
788 importers.insert(pair<double,mds_rank_t>(it->first,it->second));
789 importer_set.insert(it->second);
790 } else {
28e407b8
AA
791 int mds_last_epoch_under = mds_last_epoch_under_map[it->second];
792 if (!(mds_last_epoch_under && beat_epoch - mds_last_epoch_under < 2)) {
793 dout(15) << " mds." << it->second << " is exporter" << dendl;
794 exporters.insert(pair<double,mds_rank_t>(it->first,it->second));
795 exporter_set.insert(it->second);
796 }
7c673cae
FG
797 }
798 }
799
800
801 // determine load transfer mapping
802
803 if (true) {
804 // analyze import_map; do any matches i can
805
806 dout(15) << " matching exporters to import sources" << dendl;
807
808 // big -> small exporters
809 for (multimap<double,mds_rank_t>::reverse_iterator ex = exporters.rbegin();
810 ex != exporters.rend();
811 ++ex) {
1e59de90
TL
812 double ex_target_load = test_rank_mask(ex->second) ? target_load : 0.0;
813 double maxex = get_maxex(state, ex->second, ex_target_load);
7c673cae
FG
814 if (maxex <= .001) continue;
815
816 // check importers. for now, just in arbitrary order (no intelligent matching).
817 for (map<mds_rank_t, float>::iterator im = mds_import_map[ex->second].begin();
818 im != mds_import_map[ex->second].end();
819 ++im) {
1e59de90 820 double maxim = get_maxim(state, im->first, target_load);
7c673cae
FG
821 if (maxim <= .001) continue;
822 try_match(state, ex->second, maxex, im->first, maxim);
823 if (maxex <= .001) break;
824 }
825 }
826 }
827
828 // old way
829 if (beat % 2 == 1) {
830 dout(15) << " matching big exporters to big importers" << dendl;
831 // big exporters to big importers
832 multimap<double,mds_rank_t>::reverse_iterator ex = exporters.rbegin();
833 multimap<double,mds_rank_t>::iterator im = importers.begin();
834 while (ex != exporters.rend() &&
835 im != importers.end()) {
1e59de90
TL
836 double ex_target_load = test_rank_mask(ex->second) ? target_load : 0.0;
837 double maxex = get_maxex(state, ex->second, ex_target_load);
838 double maxim = get_maxim(state, im->second, target_load);
7c673cae
FG
839 if (maxex < .001 || maxim < .001) break;
840 try_match(state, ex->second, maxex, im->second, maxim);
841 if (maxex <= .001) ++ex;
842 if (maxim <= .001) ++im;
843 }
844 } else { // new way
845 dout(15) << " matching small exporters to big importers" << dendl;
846 // small exporters to big importers
847 multimap<double,mds_rank_t>::iterator ex = exporters.begin();
848 multimap<double,mds_rank_t>::iterator im = importers.begin();
849 while (ex != exporters.end() &&
850 im != importers.end()) {
1e59de90
TL
851 double ex_target_load = test_rank_mask(ex->second) ? target_load : 0.0;
852 double maxex = get_maxex(state, ex->second, ex_target_load);
853 double maxim = get_maxim(state, im->second, target_load);
7c673cae
FG
854 if (maxex < .001 || maxim < .001) break;
855 try_match(state, ex->second, maxex, im->second, maxim);
856 if (maxex <= .001) ++ex;
857 if (maxim <= .001) ++im;
858 }
859 }
860 }
861 try_rebalance(state);
862}
863
7c673cae
FG
/**
 * Run the Mantle (Lua) balancer: refresh the balancer script if the
 * mdsmap's balancer version changed, feed per-MDS load metrics to the
 * script, and hand the resulting target vector to try_rebalance().
 *
 * @return 0 on success; the localize/balance error code on failure, or
 *         -CEPHFS_EINVAL if the script returned a target vector whose
 *         size does not match the cluster size.
 */
int MDBalancer::mantle_prep_rebalance()
{
  balance_state_t state;

  /* refresh balancer if it has changed */
  if (bal_version != mds->mdsmap->get_balancer()) {
    bal_version.assign("");
    int r = localize_balancer();
    if (r) return r;

    /* only spam the cluster log from 1 mds on version changes */
    if (mds->get_nodeid() == 0)
      mds->clog->info() << "mantle balancer version changed: " << bal_version;
  }

  /* prepare for balancing */
  int cluster_size = mds->get_mds_map()->get_num_in_mds();
  rebalance_time = clock::now();
  // drop any exports queued by a previous balancing round
  mds->mdcache->migrator->clear_export_queue();

  /* fill in the metrics for each mds by grabbing load struct */
  // NOTE(review): mds_load.at(i) assumes a load entry exists for every
  // in-rank; presumably guaranteed by the caller -- confirm.
  vector < map<string, double> > metrics (cluster_size);
  for (mds_rank_t i=mds_rank_t(0); i < mds_rank_t(cluster_size); i++) {
    mds_load_t& load = mds_load.at(i);

    metrics[i] = {{"auth.meta_load", load.auth.meta_load()},
                  {"all.meta_load", load.all.meta_load()},
                  {"req_rate", load.req_rate},
                  {"queue_len", load.queue_len},
                  {"cpu_load_avg", load.cpu_load_avg}};
  }

  /* execute the balancer */
  Mantle mantle;
  int ret = mantle.balance(bal_code, mds->get_nodeid(), metrics, state.targets);
  dout(7) << " mantle decided that new targets=" << state.targets << dendl;

  /* mantle doesn't know about cluster size, so check target len here */
  if ((int) state.targets.size() != cluster_size)
    return -CEPHFS_EINVAL;
  else if (ret)
    return ret;

  try_rebalance(state);
  return 0;
}
910
911
912
/**
 * Turn the per-target export amounts in 'state' into actual export_dir
 * requests, in three passes per target:
 *  1. hand imports we hold back to the rank we imported them from
 *  2. re-export other suitable imports (pop > MIN_REEXPORT)
 *  3. greedily carve fragments out of our own workload (find_exports)
 * No-op when mds_thrash_exports is set (thrasher drives exports itself).
 */
void MDBalancer::try_rebalance(balance_state_t& state)
{
  if (g_conf()->mds_thrash_exports) {
    dout(5) << "mds_thrash is on; not performing standard rebalance operation!"
            << dendl;
    return;
  }

  // make a sorted list of my imports
  multimap<double, CDir*> import_pop_map;                    // pop -> subtree
  multimap<mds_rank_t, pair<CDir*, double> > import_from_map; // origin rank -> (subtree, pop)

  for (auto& dir : mds->mdcache->get_fullauth_subtrees()) {
    CInode *diri = dir->get_inode();
    if (diri->is_mdsdir())
      continue;
    if (diri->get_export_pin(false) != MDS_RANK_NONE)
      continue;  // pinned subtrees are not the balancer's to move
    if (dir->is_freezing() || dir->is_frozen())
      continue;  // export pbly already in progress

    mds_rank_t from = diri->authority().first;
    double pop = dir->pop_auth_subtree.meta_load();
    // imports that have gone idle are simply returned to their origin
    if (g_conf()->mds_bal_idle_threshold > 0 &&
        pop < g_conf()->mds_bal_idle_threshold &&
        diri != mds->mdcache->get_root() &&
        from != mds->get_nodeid()) {
      dout(5) << " exporting idle (" << pop << ") import " << *dir
              << " back to mds." << from << dendl;
      mds->mdcache->migrator->export_dir_nicely(dir, from);
      continue;
    }

    dout(15) << " map: i imported " << *dir << " from " << from << dendl;
    import_pop_map.insert(make_pair(pop, dir));
    import_from_map.insert(make_pair(from, make_pair(dir, pop)));
  }

  // do my exports!
  map<mds_rank_t, double> export_pop_map;  // pop already queued per target

  // pass 1: prefer returning imports to the target that gave them to us
  for (auto &it : state.targets) {
    mds_rank_t target = it.first;
    double amount = it.second;

    if (amount < MIN_OFFLOAD) {
      continue;
    }
    // skip targets whose share is negligible relative to the overall target load
    if (amount * 10 * state.targets.size() < target_load) {
      continue;
    }

    dout(5) << "want to send " << amount << " to mds." << target
      //<< " .. " << (*it).second << " * " << load_fac
            << " -> " << amount
            << dendl;//" .. fudge is " << fudge << dendl;

    double& have = export_pop_map[target];

    mds->mdcache->show_subtrees();

    // search imports from target
    if (import_from_map.count(target)) {
      dout(7) << " aha, looking through imports from target mds." << target << dendl;
      for (auto p = import_from_map.equal_range(target);
           p.first != p.second; ) {
        CDir *dir = p.first->second.first;
        double pop = p.first->second.second;
        dout(7) << "considering " << *dir << " from " << (*p.first).first << dendl;
        // advance before any erase so the iterator stays valid
        auto plast = p.first++;

        if (dir->inode->is_base())
          continue;
        ceph_assert(dir->inode->authority().first == target); // cuz that's how i put it in the map, dummy

        if (pop <= amount-have) {
          dout(7) << "reexporting " << *dir << " pop " << pop
                  << " back to mds." << target << dendl;
          mds->mdcache->migrator->export_dir_nicely(dir, target);
          have += pop;
          import_from_map.erase(plast);
          // also drop it from the pop-sorted map so pass 2/3 skip it
          for (auto q = import_pop_map.equal_range(pop);
               q.first != q.second; ) {
            if (q.first->second == dir) {
              import_pop_map.erase(q.first);
              break;
            }
            q.first++;
          }
        } else {
          dout(7) << "can't reexport " << *dir << ", too big " << pop << dendl;
        }

        if (amount-have < MIN_OFFLOAD)
          break;
      }
    }
  }

  // pass 2: any other imports (smallest first)
  for (auto &it : state.targets) {
    mds_rank_t target = it.first;
    double amount = it.second;

    if (!export_pop_map.count(target))
      continue;  // target was filtered out in pass 1
    double& have = export_pop_map[target];
    if (amount-have < MIN_OFFLOAD)
      continue;

    for (auto p = import_pop_map.begin();
         p != import_pop_map.end(); ) {
      CDir *dir = p->second;
      if (dir->inode->is_base()) {
        ++p;
        continue;
      }

      double pop = p->first;
      if (pop <= amount-have && pop > MIN_REEXPORT) {
        dout(5) << "reexporting " << *dir << " pop " << pop
                << " to mds." << target << dendl;
        have += pop;
        mds->mdcache->migrator->export_dir_nicely(dir, target);
        import_pop_map.erase(p++);
      } else {
        ++p;
      }

      if (amount-have < MIN_OFFLOAD)
        break;
    }
  }

  // pass 3: carve fragments out of our own workload
  set<CDir*> already_exporting;

  for (auto &it : state.targets) {
    mds_rank_t target = it.first;
    double amount = it.second;

    if (!export_pop_map.count(target))
      continue;
    double& have = export_pop_map[target];
    if (amount-have < MIN_OFFLOAD)
      continue;

    // okay, search for fragments of my workload
    std::vector<CDir*> exports;

    // search the most popular subtrees first
    for (auto p = import_pop_map.rbegin();
         p != import_pop_map.rend();
         ++p) {
      CDir *dir = p->second;
      find_exports(dir, amount, &exports, have, already_exporting);
      if (amount-have < MIN_OFFLOAD)
        break;
    }
    //fudge = amount - have;

    for (const auto& dir : exports) {
      dout(5) << " - exporting " << dir->pop_auth_subtree
              << " " << dir->pop_auth_subtree.meta_load()
              << " to mds." << target << " " << *dir << dendl;
      mds->mdcache->migrator->export_dir_nicely(dir, target);
    }
  }

  dout(7) << "done" << dendl;
  mds->mdcache->show_subtrees();
}
1081
7c673cae
FG
/**
 * Recursively search a subtree for dirfrags to export until 'have'
 * approaches 'amount'.
 *
 * Strategy per level: take a single frag in the (needmin, needmax) sweet
 * spot if one exists; otherwise grab big-enough "smaller" frags, then
 * recurse into bigger non-replicated frags, then the remaining smaller
 * frags, and finally bigger replicated frags.
 *
 * @param dir               subtree root to search (must be auth)
 * @param amount            total load we want to ship to the target
 * @param exports           out: chosen dirfrags are appended here
 * @param have              in/out: load gathered so far
 * @param already_exporting frags already chosen in this rebalance round
 *
 * Bails out (pretending success by setting have = amount) when the
 * rebalance pass has been running for more than 0.1s.
 */
void MDBalancer::find_exports(CDir *dir,
                              double amount,
                              std::vector<CDir*>* exports,
                              double& have,
                              set<CDir*>& already_exporting)
{
  auto now = clock::now();
  auto duration = std::chrono::duration<double>(now-rebalance_time).count();
  if (duration > 0.1) {
    derr << " balancer runs too long"  << dendl_impl;
    have = amount;  // pretend we're done so every caller unwinds
    return;
  }

  ceph_assert(dir->is_auth());

  double need = amount - have;
  if (need < amount * g_conf()->mds_bal_min_start)
    return;   // good enough!

  // thresholds that classify each child frag relative to what we still need
  double needmax = need * g_conf()->mds_bal_need_max;
  double needmin = need * g_conf()->mds_bal_need_min;
  double midchunk = need * g_conf()->mds_bal_midchunk;
  double minchunk = need * g_conf()->mds_bal_minchunk;

  std::vector<CDir*> bigger_rep, bigger_unrep;
  multimap<double, CDir*> smaller;

  double dir_pop = dir->pop_auth_subtree.meta_load();
  dout(7) << "in " << dir_pop << " " << *dir << " need " << need << " (" << needmin << " - " << needmax << ")" << dendl;

  double subdir_sum = 0;
  // iterate subdirs in popularity-LRU order; iterator is advanced up
  // front because the loop may remove the current inode from the list
  for (elist<CInode*>::iterator it = dir->pop_lru_subdirs.begin_use_current();
       !it.end(); ) {
    CInode *in = *it;
    ++it;

    ceph_assert(in->is_dir());
    ceph_assert(in->get_parent_dir() == dir);

    auto&& dfls = in->get_nested_dirfrags();

    size_t num_idle_frags = 0;
    for (const auto& subdir : dfls) {
      if (already_exporting.count(subdir))
        continue;

      // we know all ancestor dirfrags up to subtree root are not freezing or frozen.
      // It's more efficient to use CDir::is_{freezing,frozen}_tree_root()
      if (subdir->is_frozen_dir() || subdir->is_frozen_tree_root() ||
          subdir->is_freezing_dir() || subdir->is_freezing_tree_root())
        continue;  // can't export this right now!

      // how popular?
      double pop = subdir->pop_auth_subtree.meta_load();
      subdir_sum += pop;
      dout(15) << " subdir pop " << pop << " " << *subdir << dendl;

      if (pop < minchunk) {
        num_idle_frags++;
        continue;
      }

      // lucky find?
      if (pop > needmin && pop < needmax) {
        exports->push_back(subdir);
        already_exporting.insert(subdir);
        have += pop;
        return;
      }

      if (pop > need) {
        if (subdir->is_rep())
          bigger_rep.push_back(subdir);
        else
          bigger_unrep.push_back(subdir);
      } else
        smaller.insert(pair<double,CDir*>(pop, subdir));
    }
    // if every frag of this inode was idle, drop it from the LRU entirely
    if (dfls.size() == num_idle_frags)
      in->item_pop_lru.remove_myself();
  }
  dout(15) << " sum " << subdir_sum << " / " << dir_pop << dendl;

  // grab some sufficiently big small items
  multimap<double,CDir*>::reverse_iterator it;
  for (it = smaller.rbegin();
       it != smaller.rend();
       ++it) {

    if ((*it).first < midchunk)
      break;  // try later

    dout(7) << " taking smaller " << *(*it).second << dendl;

    exports->push_back((*it).second);
    already_exporting.insert((*it).second);
    have += (*it).first;
    if (have > needmin)
      return;
  }

  // apparently not enough; drill deeper into the hierarchy (if non-replicated)
  for (const auto& dir : bigger_unrep) {
    dout(15) << " descending into " << *dir << dendl;
    find_exports(dir, amount, exports, have, already_exporting);
    if (have > needmin)
      return;
  }

  // ok fine, use smaller bits
  // (continues 'it' from where the first smaller-loop stopped)
  for (;
       it != smaller.rend();
       ++it) {
    dout(7) << " taking (much) smaller " << it->first << " " << *(*it).second << dendl;

    exports->push_back((*it).second);
    already_exporting.insert((*it).second);
    have += (*it).first;
    if (have > needmin)
      return;
  }

  // ok fine, drill into replicated dirs
  for (const auto& dir : bigger_rep) {
    dout(7) << " descending into replicated " << *dir << dendl;
    find_exports(dir, amount, exports, have, already_exporting);
    if (have > needmin)
      return;
  }
}
1213
1e59de90 1214void MDBalancer::hit_inode(CInode *in, int type)
7c673cae
FG
1215{
1216 // hit inode
11fdf7f2 1217 in->pop.get(type).hit();
7c673cae
FG
1218
1219 if (in->get_parent_dn())
1e59de90 1220 hit_dir(in->get_parent_dn()->get_dir(), type);
7c673cae
FG
1221}
1222
/**
 * Consider queueing a split or merge for this dirfrag.
 *
 * Only auth, non-base, non-stray frags are eligible, and only when
 * fragmentation is enabled (bal_fragment_dirs) with a positive interval.
 * A frag already pending a split may be re-queued for a fast split if it
 * keeps growing (should_split_fast).
 *
 * @param dir the dirfrag that was just hit
 * @param hot true if the caller observed read/write pop above the split
 *            thresholds; forces split consideration even if size alone
 *            would not trigger it
 */
void MDBalancer::maybe_fragment(CDir *dir, bool hot)
{
  // split/merge
  if (bal_fragment_dirs && bal_fragment_interval > 0 &&
      dir->is_auth() &&
      !dir->inode->is_base() &&  // not root/mdsdir (for now at least)
      !dir->inode->is_stray()) { // not straydir

    // split
    if (dir->should_split() || hot) {
      if (split_pending.count(dir->dirfrag()) == 0) {
        queue_split(dir, false);
      } else {
        if (dir->should_split_fast()) {
          queue_split(dir, true);
        } else {
          dout(10) << ": fragment already enqueued to split: "
                   << *dir << dendl;
        }
      }
    }

    // merge?
    if (dir->should_merge() && merge_pending.count(dir->dirfrag()) == 0) {
      queue_merge(dir);
    }
  }
}
1251
/**
 * Account a popularity hit of 'type' against a dirfrag and propagate it
 * up the ancestor chain. Stray dirs are ignored.
 *
 * Side effects beyond the counters:
 *  - may queue a split/merge via maybe_fragment() when the frag is hot
 *  - on read ops, may flip dir_rep between REP_ALL and REP_NONE and
 *    broadcast dir updates when read popularity crosses the configured
 *    replicate/unreplicate thresholds
 *
 * @param dir    dirfrag that was touched
 * @param type   META_POP_* counter to hit
 * @param amount weight of the hit
 */
void MDBalancer::hit_dir(CDir *dir, int type, double amount)
{
  if (dir->inode->is_stray())
    return;
  // hit me
  double v = dir->pop_me.get(type).hit(amount);

  const bool hot = (v > g_conf()->mds_bal_split_rd && type == META_POP_IRD) ||
                   (v > g_conf()->mds_bal_split_wr && type == META_POP_IWR);

  dout(20) << type << " pop is " << v << ", frag " << dir->get_frag()
           << " size " << dir->get_frag_size() << " " << dir->pop_me << dendl;

  maybe_fragment(dir, hot);

  // replicate?
  const bool readop = (type == META_POP_IRD || type == META_POP_READDIR);
  double rd_adj = 0.0;
  // at most one replication decision per popularity sample period
  if (readop && dir->last_popularity_sample < last_sample) {
    double dir_pop = dir->pop_auth_subtree.get(type).get(); // hmm??
    dir_pop += v * 10;
    dir->last_popularity_sample = last_sample;

    dout(20) << type << " pop " << dir_pop << " spread in " << *dir << dendl;
    if (dir->is_auth() && !dir->is_ambiguous_auth() && dir->can_rep()) {
      if (dir_pop >= g_conf()->mds_bal_replicate_threshold) {
        // replicate
        // rd_adj spreads the read pop across the cluster: what remains
        // locally is rdp/num_mds, tempered by half
        double rdp = dir->pop_me.get(META_POP_IRD).get();
        rd_adj = rdp / mds->get_mds_map()->get_num_in_mds() - rdp;
        rd_adj /= 2.0;  // temper somewhat

        dout(5) << "replicating dir " << *dir << " pop " << dir_pop << " .. rdp " << rdp << " adj " << rd_adj << dendl;

        dir->dir_rep = CDir::REP_ALL;
        mds->mdcache->send_dir_updates(dir, true);

        // fixme this should adjust the whole pop hierarchy
        dir->pop_me.get(META_POP_IRD).adjust(rd_adj);
        dir->pop_auth_subtree.get(META_POP_IRD).adjust(rd_adj);
      }

      if (dir->ino() != 1 &&
          dir->is_rep() &&
          dir_pop < g_conf()->mds_bal_unreplicate_threshold) {
        // unreplicate
        dout(5) << "unreplicating dir " << *dir << " pop " << dir_pop << dendl;

        dir->dir_rep = CDir::REP_NONE;
        mds->mdcache->send_dir_updates(dir);
      }
    }
  }

  // adjust ancestors
  bool hit_subtree = dir->is_auth();         // current auth subtree (if any)
  bool hit_subtree_nested = dir->is_auth();  // all nested auth subtrees

  while (true) {
    CDir *pdir = dir->inode->get_parent_dir();
    dir->pop_nested.get(type).hit(amount);
    if (rd_adj != 0.0)
      dir->pop_nested.get(META_POP_IRD).adjust(rd_adj);

    if (hit_subtree) {
      dir->pop_auth_subtree.get(type).hit(amount);

      if (rd_adj != 0.0)
        dir->pop_auth_subtree.get(META_POP_IRD).adjust(rd_adj);

      if (dir->is_subtree_root())
        hit_subtree = false; // end of auth domain, stop hitting auth counters.
      else if (pdir)
        pdir->pop_lru_subdirs.push_front(&dir->get_inode()->item_pop_lru);
    }

    if (hit_subtree_nested) {
      dir->pop_auth_subtree_nested.get(type).hit(amount);
      if (rd_adj != 0.0)
        dir->pop_auth_subtree_nested.get(META_POP_IRD).adjust(rd_adj);
    }
    if (!pdir) break;
    dir = pdir;
  }
}
1336
1337
/*
 * subtract off an exported chunk.
 *  this excludes *dir itself (encode_export_dir should have taken care of that)
 *  we _just_ do the parents' nested counters.
 *
 * NOTE: call me _after_ forcing *dir into a subtree root,
 *  but _before_ doing the encode_export_dirs.
 */
11fdf7f2 1346void MDBalancer::subtract_export(CDir *dir)
7c673cae
FG
1347{
1348 dirfrag_load_vec_t subload = dir->pop_auth_subtree;
1349
1350 while (true) {
1351 dir = dir->inode->get_parent_dir();
1352 if (!dir) break;
1353
11fdf7f2
TL
1354 dir->pop_nested.sub(subload);
1355 dir->pop_auth_subtree_nested.sub(subload);
7c673cae
FG
1356 }
1357}
1358
1359
11fdf7f2 1360void MDBalancer::add_import(CDir *dir)
7c673cae
FG
1361{
1362 dirfrag_load_vec_t subload = dir->pop_auth_subtree;
1363
1364 while (true) {
1365 dir = dir->inode->get_parent_dir();
1366 if (!dir) break;
1367
11fdf7f2
TL
1368 dir->pop_nested.add(subload);
1369 dir->pop_auth_subtree_nested.add(subload);
7c673cae
FG
1370 }
1371}
1372
/**
 * Propagate (inc=true) or retract (inc=false) dir's popularity counters
 * along pdir's ancestor chain, e.g. when a rename moves dir under pdir.
 *
 * pop_auth_subtree is adjusted only while we remain inside the current
 * auth subtree (stops at the first subtree root); the *_nested counters
 * are adjusted all the way to the top as long as dir is auth.
 */
void MDBalancer::adjust_pop_for_rename(CDir *pdir, CDir *dir, bool inc)
{
  bool adjust_subtree_nest = dir->is_auth();
  bool adjust_subtree = adjust_subtree_nest && !dir->is_subtree_root();
  CDir *cur = dir;
  while (true) {
    if (inc) {
      pdir->pop_nested.add(dir->pop_nested);
      if (adjust_subtree) {
        pdir->pop_auth_subtree.add(dir->pop_auth_subtree);
        // keep the pop-LRU ordering in sync with the new hierarchy
        pdir->pop_lru_subdirs.push_front(&cur->get_inode()->item_pop_lru);
      }

      if (adjust_subtree_nest)
        pdir->pop_auth_subtree_nested.add(dir->pop_auth_subtree_nested);
    } else {
      pdir->pop_nested.sub(dir->pop_nested);
      if (adjust_subtree)
        pdir->pop_auth_subtree.sub(dir->pop_auth_subtree);

      if (adjust_subtree_nest)
        pdir->pop_auth_subtree_nested.sub(dir->pop_auth_subtree_nested);
    }

    // crossing a subtree root ends the auth-subtree accounting
    if (pdir->is_subtree_root())
      adjust_subtree = false;
    cur = pdir;
    pdir = pdir->inode->get_parent_dir();
    if (!pdir) break;
  }
}
1404
224ce89b
WB
1405void MDBalancer::handle_mds_failure(mds_rank_t who)
1406{
1407 if (0 == who) {
28e407b8 1408 mds_last_epoch_under_map.clear();
224ce89b
WB
1409 }
1410}
28e407b8 1411
/**
 * Dump balancer state (per-dirfrag loads, per-MDS loads, meta loads and
 * the import map) to the given Formatter as one "loads" object.
 *
 * @param f     output formatter; all sections opened here are closed here
 * @param depth limit on dirfrag recursion depth; negative means unlimited
 * @return 0 always
 */
int MDBalancer::dump_loads(Formatter *f, int64_t depth) const
{
  // BFS queue of (dirfrag, depth) starting from all root dirfrags
  std::deque<pair<CDir*, int>> dfs;
  std::deque<CDir*> dfs_root;
  if (mds->mdcache->get_root()) {
    mds->mdcache->get_root()->get_dirfrags(dfs_root);
    while (!dfs_root.empty()) {
      CDir *dir = dfs_root.front();
      dfs_root.pop_front();
      dfs.push_back(make_pair(dir, 0));
    }
  } else {
    dout(10) << "no root" << dendl;
  }

  f->open_object_section("loads");

  f->open_array_section("dirfrags");
  while (!dfs.empty()) {
    auto [dir, cur_depth] = dfs.front();
    dfs.pop_front();

    f->open_object_section("dir");
    dir->dump_load(f);
    f->close_section();

    //limit output dirfrags depth
    if (depth >= 0 && (cur_depth + 1) > depth) {
      continue;
    }

    // enqueue child dirfrags that carry non-trivial load
    for (auto it = dir->begin(); it != dir->end(); ++it) {
      CInode *in = it->second->get_linkage()->get_inode();
      if (!in || !in->is_dir())
        continue;

      auto&& ls = in->get_dirfrags();
      for (const auto& subdir : ls) {

        if (subdir->pop_nested.meta_load() < .001)
          continue;
        dfs.push_back(make_pair(subdir, cur_depth+1));
      }
    }
  }
  f->close_section();  // dirfrags array

  f->open_object_section("mds_load");
  {

    // helper: emit one mds_load_t as formatter fields + dirfrag sections
    auto dump_mds_load = [f](const mds_load_t& load) {
      f->dump_float("request_rate", load.req_rate);
      f->dump_float("cache_hit_rate", load.cache_hit_rate);
      f->dump_float("queue_length", load.queue_len);
      f->dump_float("cpu_load", load.cpu_load_avg);
      f->dump_float("mds_load", load.mds_load());

      f->open_object_section("auth_dirfrags");
      load.auth.dump(f);
      f->close_section();
      f->open_object_section("all_dirfrags");
      load.all.dump(f);
      f->close_section();
    };

    for (const auto& [rank, load] : mds_load) {
      CachedStackStringStream css;
      *css << "mds." << rank;
      f->open_object_section(css->strv());
      dump_mds_load(load);
      f->close_section();
    }
  }
  f->close_section();  // mds_load

  f->open_object_section("mds_meta_load");
  for (auto& [rank, mload] : mds_meta_load) {
    CachedStackStringStream css;
    *css << "mds." << rank;
    f->dump_float(css->strv(), mload);
  }
  f->close_section();  // mds_meta_load

  f->open_object_section("mds_import_map");
  for (auto& [rank, imports] : mds_import_map) {
    {
      // scoped so the stream is destroyed before the inner loop reuses the name
      CachedStackStringStream css;
      *css << "mds." << rank;
      f->open_array_section(css->strv());
    }
    for (auto& [rank_from, mload] : imports) {
      f->open_object_section("from");
      CachedStackStringStream css;
      *css << "mds." << rank_from;
      f->dump_float(css->strv(), mload);
      f->close_section();
    }
    f->close_section(); // mds.? array
  }
  f->close_section(); // mds_import_map

  f->close_section(); // loads
  return 0;
}