1 // -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
2 // vim: ts=8 sw=2 smarttab
4 * Ceph - scalable distributed file system
6 * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
8 * This is free software; you can redistribute it and/or
9 * modify it under the terms of the GNU Lesser General Public
10 * License version 2.1, as published by the Free Software
11 * Foundation. See file COPYING.
15 #include "include/compat.h"
18 #include "mon/MonClient.h"
19 #include "MDBalancer.h"
28 #include "include/Context.h"
29 #include "msg/Messenger.h"
37 #include "common/config.h"
38 #include "common/errno.h"
40 #define dout_context g_ceph_context
42 #define dout_prefix *_dout << "mds." << mds->get_nodeid() << ".bal " << __func__ << " "
46 auto subsys = ceph_subsys_mds;\
47 if ((dout_context)->_conf->subsys.should_gather(ceph_subsys_mds_balancer, lvl)) {\
48 subsys = ceph_subsys_mds_balancer;\
50 dout_impl(dout_context, ceph::dout::need_dynamic(subsys), lvl) dout_prefix
52 #define dendl dendl_impl; } while (0)
55 #define MIN_LOAD 50 // ??
56 #define MIN_REEXPORT 5 // will automatically reexport
57 #define MIN_OFFLOAD 10 // point at which i stop trying, close enough
60 int MDBalancer::proc_message(const cref_t
<Message
> &m
)
62 switch (m
->get_type()) {
64 case MSG_MDS_HEARTBEAT
:
65 handle_heartbeat(ref_cast
<MHeartbeat
>(m
));
69 derr
<< " balancer unknown message " << m
->get_type() << dendl_impl
;
70 ceph_abort_msg("balancer unknown message");
76 MDBalancer::MDBalancer(MDSRank
*m
, Messenger
*msgr
, MonClient
*monc
) :
77 mds(m
), messenger(msgr
), mon_client(monc
)
79 bal_fragment_dirs
= g_conf().get_val
<bool>("mds_bal_fragment_dirs");
80 bal_fragment_interval
= g_conf().get_val
<int64_t>("mds_bal_fragment_interval");
83 void MDBalancer::handle_conf_change(const std::set
<std::string
>& changed
, const MDSMap
& mds_map
)
85 if (changed
.count("mds_bal_fragment_dirs"))
86 bal_fragment_dirs
= g_conf().get_val
<bool>("mds_bal_fragment_dirs");
87 if (changed
.count("mds_bal_fragment_interval"))
88 bal_fragment_interval
= g_conf().get_val
<int64_t>("mds_bal_fragment_interval");
91 void MDBalancer::handle_export_pins(void)
93 const mds_rank_t max_mds
= mds
->mdsmap
->get_max_mds();
94 auto mdcache
= mds
->mdcache
;
96 auto &q
= mdcache
->export_pin_queue
;
98 dout(20) << "export_pin_queue size=" << q
.size() << dendl
;
99 while (it
!= q
.end()) {
102 ceph_assert(in
->is_dir());
104 mds_rank_t export_pin
= in
->get_export_pin(false);
105 in
->check_pin_policy(export_pin
);
107 if (export_pin
>= max_mds
) {
108 dout(20) << " delay export_pin=" << export_pin
<< " on " << *in
<< dendl
;
109 in
->state_clear(CInode::STATE_QUEUEDEXPORTPIN
);
112 in
->state_set(CInode::STATE_DELAYEDEXPORTPIN
);
113 mdcache
->export_pin_delayed_queue
.insert(in
);
117 dout(20) << " executing export_pin=" << export_pin
<< " on " << *in
<< dendl
;
118 unsigned min_frag_bits
= 0;
119 mds_rank_t target
= MDS_RANK_NONE
;
122 else if (export_pin
== MDS_RANK_EPHEMERAL_RAND
)
123 target
= mdcache
->hash_into_rank_bucket(in
->ino());
124 else if (export_pin
== MDS_RANK_EPHEMERAL_DIST
)
125 min_frag_bits
= mdcache
->get_ephemeral_dist_frag_bits();
128 for (auto&& dir
: in
->get_dirfrags()) {
132 if (export_pin
== MDS_RANK_EPHEMERAL_DIST
) {
133 if (dir
->get_frag().bits() < min_frag_bits
) {
134 if (!dir
->state_test(CDir::STATE_CREATING
) &&
135 !dir
->is_frozen() && !dir
->is_freezing()) {
136 queue_split(dir
, true);
141 target
= mdcache
->hash_into_rank_bucket(in
->ino(), dir
->get_frag());
144 if (target
== MDS_RANK_NONE
) {
145 if (dir
->state_test(CDir::STATE_AUXSUBTREE
)) {
146 if (dir
->is_frozen() || dir
->is_freezing()) {
151 dout(10) << " clear auxsubtree on " << *dir
<< dendl
;
152 dir
->state_clear(CDir::STATE_AUXSUBTREE
);
153 mds
->mdcache
->try_subtree_merge(dir
);
155 } else if (target
== mds
->get_nodeid()) {
156 if (dir
->state_test(CDir::STATE_AUXSUBTREE
)) {
157 ceph_assert(dir
->is_subtree_root());
158 } else if (dir
->state_test(CDir::STATE_CREATING
) ||
159 dir
->is_frozen() || dir
->is_freezing()) {
163 } else if (!dir
->is_subtree_root()) {
164 dir
->state_set(CDir::STATE_AUXSUBTREE
);
165 mds
->mdcache
->adjust_subtree_auth(dir
, mds
->get_nodeid());
166 dout(10) << " create aux subtree on " << *dir
<< dendl
;
168 dout(10) << " set auxsubtree bit on " << *dir
<< dendl
;
169 dir
->state_set(CDir::STATE_AUXSUBTREE
);
172 /* Only export a directory if it's non-empty. An empty directory will
173 * be sent back by the importer.
175 if (dir
->get_num_head_items() > 0) {
176 mds
->mdcache
->migrator
->export_dir(dir
, target
);
183 in
->state_clear(CInode::STATE_QUEUEDEXPORTPIN
);
188 std::vector
<CDir
*> authsubs
= mdcache
->get_auth_subtrees();
189 bool print_auth_subtrees
= true;
191 if (authsubs
.size() > AUTH_TREES_THRESHOLD
&&
192 !g_conf()->subsys
.should_gather
<ceph_subsys_mds
, 25>()) {
193 dout(15) << "number of auth trees = " << authsubs
.size() << "; not "
194 "printing auth trees" << dendl
;
195 print_auth_subtrees
= false;
198 for (auto &cd
: authsubs
) {
199 mds_rank_t export_pin
= cd
->inode
->get_export_pin();
200 cd
->inode
->check_pin_policy(export_pin
);
202 if (export_pin
== MDS_RANK_EPHEMERAL_DIST
) {
203 export_pin
= mdcache
->hash_into_rank_bucket(cd
->ino(), cd
->get_frag());
204 } else if (export_pin
== MDS_RANK_EPHEMERAL_RAND
) {
205 export_pin
= mdcache
->hash_into_rank_bucket(cd
->ino());
208 if (print_auth_subtrees
)
209 dout(25) << "auth tree " << *cd
<< " export_pin=" << export_pin
<< dendl
;
211 if (export_pin
>= 0 && export_pin
!= mds
->get_nodeid() &&
212 export_pin
< mds
->mdsmap
->get_max_mds()) {
213 mdcache
->migrator
->export_dir(cd
, export_pin
);
218 void MDBalancer::tick()
220 static int num_bal_times
= g_conf()->mds_bal_max
;
221 auto bal_interval
= g_conf().get_val
<int64_t>("mds_bal_interval");
222 auto bal_max_until
= g_conf().get_val
<int64_t>("mds_bal_max_until");
223 time now
= clock::now();
225 if (g_conf()->mds_bal_export_pin
) {
226 handle_export_pins();
230 if (chrono::duration
<double>(now
-last_sample
).count() >
231 g_conf()->mds_bal_sample_interval
) {
232 dout(15) << "tick last_sample now " << now
<< dendl
;
236 // We can use duration_cast below, although the result is an int,
237 // because the values from g_conf are also integers.
239 if (mds
->get_nodeid() == 0
242 && chrono::duration_cast
<chrono::seconds
>(now
- last_heartbeat
).count() >= bal_interval
243 && (num_bal_times
|| (bal_max_until
>= 0 && mds
->get_uptime().count() > bal_max_until
))) {
244 last_heartbeat
= now
;
249 mds
->mdcache
->show_subtrees(10, true);
255 class C_Bal_SendHeartbeat
: public MDSInternalContext
{
257 explicit C_Bal_SendHeartbeat(MDSRank
*mds_
) : MDSInternalContext(mds_
) { }
258 void finish(int f
) override
{
259 mds
->balancer
->send_heartbeat();
264 double mds_load_t::mds_load() const
266 switch(g_conf()->mds_bal_mode
) {
269 .8 * auth
.meta_load() +
270 .2 * all
.meta_load() +
275 return req_rate
+ 10.0*queue_len
;
285 mds_load_t
MDBalancer::get_load()
287 auto now
= clock::now();
289 mds_load_t load
{DecayRate()}; /* zero DecayRate! */
291 if (mds
->mdcache
->get_root()) {
292 auto&& ls
= mds
->mdcache
->get_root()->get_dirfrags();
294 load
.auth
.add(d
->pop_auth_subtree_nested
);
295 load
.all
.add(d
->pop_nested
);
298 dout(20) << "no root, no load" << dendl
;
301 uint64_t num_requests
= mds
->get_num_requests();
302 uint64_t num_traverse
= mds
->logger
->get(l_mds_traverse
);
303 uint64_t num_traverse_hit
= mds
->logger
->get(l_mds_traverse_hit
);
305 uint64_t cpu_time
= 1;
307 string stat_path
= PROCPREFIX
"/proc/self/stat";
308 ifstream
stat_file(stat_path
);
309 if (stat_file
.is_open()) {
310 vector
<string
> stat_vec(std::istream_iterator
<string
>{stat_file
},
311 std::istream_iterator
<string
>());
312 if (stat_vec
.size() >= 15) {
314 cpu_time
= strtoll(stat_vec
[13].c_str(), nullptr, 10) +
315 strtoll(stat_vec
[14].c_str(), nullptr, 10);
317 derr
<< "input file '" << stat_path
<< "' not resolvable" << dendl_impl
;
320 derr
<< "input file '" << stat_path
<< "' not found" << dendl_impl
;
324 load
.queue_len
= messenger
->get_dispatch_queue_len();
326 bool update_last
= true;
327 if (last_get_load
!= clock::zero() &&
328 now
> last_get_load
) {
329 double el
= std::chrono::duration
<double>(now
-last_get_load
).count();
331 if (num_requests
> last_num_requests
)
332 load
.req_rate
= (num_requests
- last_num_requests
) / el
;
333 if (cpu_time
> last_cpu_time
)
334 load
.cpu_load_avg
= (cpu_time
- last_cpu_time
) / el
;
335 if (num_traverse
> last_num_traverse
&& num_traverse_hit
> last_num_traverse_hit
)
336 load
.cache_hit_rate
= (double)(num_traverse_hit
- last_num_traverse_hit
) / (num_traverse
- last_num_traverse
);
338 auto p
= mds_load
.find(mds
->get_nodeid());
339 if (p
!= mds_load
.end()) {
340 load
.req_rate
= p
->second
.req_rate
;
341 load
.cpu_load_avg
= p
->second
.cpu_load_avg
;
342 load
.cache_hit_rate
= p
->second
.cache_hit_rate
;
344 if (num_requests
>= last_num_requests
&& cpu_time
>= last_cpu_time
&&
345 num_traverse
>= last_num_traverse
&& num_traverse_hit
>= last_num_traverse_hit
)
351 last_num_requests
= num_requests
;
352 last_cpu_time
= cpu_time
;
354 last_num_traverse
= num_traverse
;
355 last_num_traverse_hit
= num_traverse_hit
;
358 dout(15) << load
<< dendl
;
363 * Read synchronously from RADOS using a timeout. We cannot do daemon-local
364 * fallbacks (i.e. kick off async read when we are processing the map and
365 * check status when we get here) with the way the mds is structured.
367 int MDBalancer::localize_balancer()
369 /* reset everything */
373 ceph::mutex lock
= ceph::make_mutex("lock");
374 ceph::condition_variable cond
;
376 /* we assume that balancer is in the metadata pool */
377 object_t oid
= object_t(mds
->mdsmap
->get_balancer());
378 object_locator_t
oloc(mds
->get_metadata_pool());
379 ceph_tid_t tid
= mds
->objecter
->read(oid
, oloc
, 0, 0, CEPH_NOSNAP
, &lua_src
, 0,
380 new C_SafeCond(lock
, cond
, &ack
, &r
));
381 dout(15) << "launched non-blocking read tid=" << tid
382 << " oid=" << oid
<< " oloc=" << oloc
<< dendl
;
384 /* timeout: if we waste half our time waiting for RADOS, then abort! */
385 std::cv_status ret_t
= [&] {
386 auto bal_interval
= g_conf().get_val
<int64_t>("mds_bal_interval");
387 std::unique_lock locker
{lock
};
388 return cond
.wait_for(locker
, std::chrono::seconds(bal_interval
/ 2));
390 /* success: store the balancer in memory and set the version. */
392 if (ret_t
== std::cv_status::timeout
) {
393 mds
->objecter
->op_cancel(tid
, -CEPHFS_ECANCELED
);
394 return -CEPHFS_ETIMEDOUT
;
396 bal_code
.assign(lua_src
.to_str());
397 bal_version
.assign(oid
.name
);
398 dout(10) "bal_code=" << bal_code
<< dendl
;
403 void MDBalancer::send_heartbeat()
405 if (mds
->is_cluster_degraded()) {
406 dout(10) << "degraded" << dendl
;
410 if (!mds
->mdcache
->is_open()) {
411 dout(10) << "not open" << dendl
;
412 mds
->mdcache
->wait_for_open(new C_Bal_SendHeartbeat(mds
));
416 if (mds
->get_nodeid() == 0) {
422 mds_load_t load
= get_load();
423 mds
->logger
->set(l_mds_load_cent
, 100 * load
.mds_load());
424 mds
->logger
->set(l_mds_dispatch_queue_len
, load
.queue_len
);
426 auto em
= mds_load
.emplace(std::piecewise_construct
, std::forward_as_tuple(mds
->get_nodeid()), std::forward_as_tuple(load
));
428 em
.first
->second
= load
;
431 // import_map -- how much do i import from whom
432 map
<mds_rank_t
, float> import_map
;
433 for (auto& im
: mds
->mdcache
->get_auth_subtrees()) {
434 mds_rank_t from
= im
->inode
->authority().first
;
435 if (from
== mds
->get_nodeid()) continue;
436 if (im
->get_inode()->is_stray()) continue;
437 import_map
[from
] += im
->pop_auth_subtree
.meta_load();
439 mds_import_map
[ mds
->get_nodeid() ] = import_map
;
442 dout(3) << " epoch " << beat_epoch
<< " load " << load
<< dendl
;
443 for (const auto& [rank
, load
] : import_map
) {
444 dout(5) << " import_map from " << rank
<< " -> " << load
<< dendl
;
449 mds
->get_mds_map()->get_up_mds_set(up
);
450 for (const auto& r
: up
) {
451 if (r
== mds
->get_nodeid())
453 auto hb
= make_message
<MHeartbeat
>(load
, beat_epoch
);
454 hb
->get_import_map() = import_map
;
455 mds
->send_message_mds(hb
, r
);
459 void MDBalancer::handle_heartbeat(const cref_t
<MHeartbeat
> &m
)
461 mds_rank_t who
= mds_rank_t(m
->get_source().num());
462 dout(25) << "=== got heartbeat " << m
->get_beat() << " from " << m
->get_source().num() << " " << m
->get_load() << dendl
;
464 if (!mds
->is_active())
467 if (!mds
->mdcache
->is_open()) {
468 dout(10) << "opening root on handle_heartbeat" << dendl
;
469 mds
->mdcache
->wait_for_open(new C_MDS_RetryMessage(mds
, m
));
473 if (mds
->is_cluster_degraded()) {
474 dout(10) << " degraded, ignoring" << dendl
;
478 if (mds
->get_nodeid() != 0 && m
->get_beat() > beat_epoch
) {
479 dout(10) << "receive next epoch " << m
->get_beat() << " from mds." << who
<< " before mds0" << dendl
;
481 beat_epoch
= m
->get_beat();
482 // clear the mds load info whose epoch is less than beat_epoch
487 dout(20) << " from mds0, new epoch " << m
->get_beat() << dendl
;
488 if (beat_epoch
!= m
->get_beat()) {
489 beat_epoch
= m
->get_beat();
495 mds
->mdcache
->show_subtrees();
496 } else if (mds
->get_nodeid() == 0) {
497 if (beat_epoch
!= m
->get_beat()) {
498 dout(10) << " old heartbeat epoch, ignoring" << dendl
;
504 auto em
= mds_load
.emplace(std::piecewise_construct
, std::forward_as_tuple(who
), std::forward_as_tuple(m
->get_load()));
506 em
.first
->second
= m
->get_load();
509 mds_import_map
[who
] = m
->get_import_map();
512 unsigned cluster_size
= mds
->get_mds_map()->get_num_in_mds();
513 if (mds_load
.size() == cluster_size
) {
515 //export_empties(); // no!
517 /* avoid spamming ceph -w if user does not turn mantle on */
518 if (mds
->mdsmap
->get_balancer() != "") {
519 int r
= mantle_prep_rebalance();
521 mds
->clog
->warn() << "using old balancer; mantle failed for "
522 << "balancer=" << mds
->mdsmap
->get_balancer()
523 << " : " << cpp_strerror(r
);
525 prep_rebalance(m
->get_beat());
530 double MDBalancer::try_match(balance_state_t
& state
, mds_rank_t ex
, double& maxex
,
531 mds_rank_t im
, double& maxim
)
533 if (maxex
<= 0 || maxim
<= 0) return 0.0;
535 double howmuch
= std::min(maxex
, maxim
);
537 dout(5) << " - mds." << ex
<< " exports " << howmuch
<< " to mds." << im
<< dendl
;
539 if (ex
== mds
->get_nodeid())
540 state
.targets
[im
] += howmuch
;
542 state
.exported
[ex
] += howmuch
;
543 state
.imported
[im
] += howmuch
;
551 void MDBalancer::queue_split(const CDir
*dir
, bool fast
)
553 dout(10) << __func__
<< " enqueuing " << *dir
554 << " (fast=" << fast
<< ")" << dendl
;
556 const dirfrag_t df
= dir
->dirfrag();
558 auto callback
= [this, df
](int r
) {
559 if (split_pending
.erase(df
) == 0) {
560 // Someone beat me to it. This can happen in the fast splitting
561 // path, because we spawn two contexts, one with mds->timer and
562 // one with mds->queue_waiter. The loser can safely just drop
567 auto mdcache
= mds
->mdcache
;
569 CDir
*dir
= mdcache
->get_dirfrag(df
);
571 dout(10) << "drop split on " << df
<< " because not in cache" << dendl
;
574 if (!dir
->is_auth()) {
575 dout(10) << "drop split on " << df
<< " because non-auth" << dendl
;
579 // Pass on to MDCache: note that the split might still not
580 // happen if the checks in MDCache::can_fragment fail.
581 dout(10) << __func__
<< " splitting " << *dir
<< dendl
;
582 int bits
= g_conf()->mds_bal_split_bits
;
583 if (dir
->inode
->is_ephemeral_dist()) {
584 unsigned min_frag_bits
= mdcache
->get_ephemeral_dist_frag_bits();
585 if (df
.frag
.bits() + bits
< min_frag_bits
)
586 bits
= min_frag_bits
- df
.frag
.bits();
588 mdcache
->split_dir(dir
, bits
);
591 auto ret
= split_pending
.insert(df
);
592 bool is_new
= ret
.second
;
595 // Do the split ASAP: enqueue it in the MDSRank waiters which are
596 // run at the end of dispatching the current request
597 mds
->queue_waiter(new MDSInternalContextWrapper(mds
,
598 new LambdaContext(std::move(callback
))));
600 // Set a timer to really do the split: we don't do it immediately
601 // so that bursts of ops on a directory have a chance to go through
602 // before we freeze it.
603 mds
->timer
.add_event_after(bal_fragment_interval
,
604 new LambdaContext(std::move(callback
)));
608 void MDBalancer::queue_merge(CDir
*dir
)
610 const auto frag
= dir
->dirfrag();
611 auto callback
= [this, frag
](int r
) {
612 ceph_assert(frag
.frag
!= frag_t());
614 // frag must be in this set because only one context is in flight
615 // for a given frag at a time (because merge_pending is checked before
616 // starting one), and this context is the only one that erases it.
617 merge_pending
.erase(frag
);
619 auto mdcache
= mds
->mdcache
;
620 CDir
*dir
= mdcache
->get_dirfrag(frag
);
622 dout(10) << "drop merge on " << frag
<< " because not in cache" << dendl
;
625 ceph_assert(dir
->dirfrag() == frag
);
627 if(!dir
->is_auth()) {
628 dout(10) << "drop merge on " << *dir
<< " because lost auth" << dendl
;
632 dout(10) << "merging " << *dir
<< dendl
;
634 CInode
*diri
= dir
->get_inode();
636 unsigned min_frag_bits
= 0;
637 if (diri
->is_ephemeral_dist())
638 min_frag_bits
= mdcache
->get_ephemeral_dist_frag_bits();
640 frag_t fg
= dir
->get_frag();
641 while (fg
.bits() > min_frag_bits
) {
642 frag_t sibfg
= fg
.get_sibling();
643 auto&& [complete
, sibs
] = diri
->get_dirfrags_under(sibfg
);
645 dout(10) << " not all sibs under " << sibfg
<< " in cache (have " << sibs
<< ")" << dendl
;
649 for (auto& sib
: sibs
) {
650 if (!sib
->is_auth() || !sib
->should_merge()) {
656 dout(10) << " not all sibs under " << sibfg
<< " " << sibs
<< " should_merge" << dendl
;
659 dout(10) << " all sibs under " << sibfg
<< " " << sibs
<< " should merge" << dendl
;
663 if (fg
!= dir
->get_frag())
664 mdcache
->merge_dir(diri
, fg
);
667 if (merge_pending
.count(frag
) == 0) {
668 dout(20) << " enqueued dir " << *dir
<< dendl
;
669 merge_pending
.insert(frag
);
670 mds
->timer
.add_event_after(bal_fragment_interval
,
671 new LambdaContext(std::move(callback
)));
673 dout(20) << " dir already in queue " << *dir
<< dendl
;
677 void MDBalancer::prep_rebalance(int beat
)
679 balance_state_t state
;
681 if (g_conf()->mds_thrash_exports
) {
682 //we're going to randomly export to all the mds in the cluster
683 set
<mds_rank_t
> up_mds
;
684 mds
->get_mds_map()->get_up_mds_set(up_mds
);
685 for (const auto &rank
: up_mds
) {
686 state
.targets
[rank
] = 0.0;
689 int cluster_size
= mds
->get_mds_map()->get_num_in_mds();
690 mds_rank_t whoami
= mds
->get_nodeid();
691 rebalance_time
= clock::now();
693 dout(7) << "cluster loads are" << dendl
;
695 mds
->mdcache
->migrator
->clear_export_queue();
697 // rescale! turn my mds_load back into meta_load units
698 double load_fac
= 1.0;
699 map
<mds_rank_t
, mds_load_t
>::iterator m
= mds_load
.find(whoami
);
700 if ((m
!= mds_load
.end()) && (m
->second
.mds_load() > 0)) {
701 double metald
= m
->second
.auth
.meta_load();
702 double mdsld
= m
->second
.mds_load();
703 load_fac
= metald
/ mdsld
;
704 dout(7) << " load_fac is " << load_fac
705 << " <- " << m
->second
.auth
<< " " << metald
710 mds_meta_load
.clear();
712 double total_load
= 0.0;
713 multimap
<double,mds_rank_t
> load_map
;
714 for (mds_rank_t i
=mds_rank_t(0); i
< mds_rank_t(cluster_size
); i
++) {
715 mds_load_t
& load
= mds_load
.at(i
);
717 double l
= load
.mds_load() * load_fac
;
718 mds_meta_load
[i
] = l
;
721 dout(7) << " mds." << i
723 << " = " << load
.mds_load()
724 << " ~ " << l
<< dendl
;
726 if (whoami
== i
) my_load
= l
;
729 load_map
.insert(pair
<double,mds_rank_t
>( l
, i
));
733 target_load
= total_load
/ (double)cluster_size
;
734 dout(7) << "my load " << my_load
735 << " target " << target_load
736 << " total " << total_load
740 for (const auto& [load
, rank
] : load_map
) {
741 if (load
< target_load
* (1.0 + g_conf()->mds_bal_min_rebalance
)) {
742 dout(7) << " mds." << rank
<< " is underloaded or barely overloaded." << dendl
;
743 mds_last_epoch_under_map
[rank
] = beat_epoch
;
747 int last_epoch_under
= mds_last_epoch_under_map
[whoami
];
748 if (last_epoch_under
== beat_epoch
) {
749 dout(7) << " i am underloaded or barely overloaded, doing nothing." << dendl
;
752 // am i over long enough?
753 if (last_epoch_under
&& beat_epoch
- last_epoch_under
< 2) {
754 dout(7) << " i am overloaded, but only for " << (beat_epoch
- last_epoch_under
) << " epochs" << dendl
;
758 dout(7) << " i am sufficiently overloaded" << dendl
;
761 // first separate exporters and importers
762 multimap
<double,mds_rank_t
> importers
;
763 multimap
<double,mds_rank_t
> exporters
;
764 set
<mds_rank_t
> importer_set
;
765 set
<mds_rank_t
> exporter_set
;
767 for (multimap
<double,mds_rank_t
>::iterator it
= load_map
.begin();
768 it
!= load_map
.end();
770 if (it
->first
< target_load
) {
771 dout(15) << " mds." << it
->second
<< " is importer" << dendl
;
772 importers
.insert(pair
<double,mds_rank_t
>(it
->first
,it
->second
));
773 importer_set
.insert(it
->second
);
775 int mds_last_epoch_under
= mds_last_epoch_under_map
[it
->second
];
776 if (!(mds_last_epoch_under
&& beat_epoch
- mds_last_epoch_under
< 2)) {
777 dout(15) << " mds." << it
->second
<< " is exporter" << dendl
;
778 exporters
.insert(pair
<double,mds_rank_t
>(it
->first
,it
->second
));
779 exporter_set
.insert(it
->second
);
785 // determine load transfer mapping
788 // analyze import_map; do any matches i can
790 dout(15) << " matching exporters to import sources" << dendl
;
792 // big -> small exporters
793 for (multimap
<double,mds_rank_t
>::reverse_iterator ex
= exporters
.rbegin();
794 ex
!= exporters
.rend();
796 double maxex
= get_maxex(state
, ex
->second
);
797 if (maxex
<= .001) continue;
799 // check importers. for now, just in arbitrary order (no intelligent matching).
800 for (map
<mds_rank_t
, float>::iterator im
= mds_import_map
[ex
->second
].begin();
801 im
!= mds_import_map
[ex
->second
].end();
803 double maxim
= get_maxim(state
, im
->first
);
804 if (maxim
<= .001) continue;
805 try_match(state
, ex
->second
, maxex
, im
->first
, maxim
);
806 if (maxex
<= .001) break;
813 dout(15) << " matching big exporters to big importers" << dendl
;
814 // big exporters to big importers
815 multimap
<double,mds_rank_t
>::reverse_iterator ex
= exporters
.rbegin();
816 multimap
<double,mds_rank_t
>::iterator im
= importers
.begin();
817 while (ex
!= exporters
.rend() &&
818 im
!= importers
.end()) {
819 double maxex
= get_maxex(state
, ex
->second
);
820 double maxim
= get_maxim(state
, im
->second
);
821 if (maxex
< .001 || maxim
< .001) break;
822 try_match(state
, ex
->second
, maxex
, im
->second
, maxim
);
823 if (maxex
<= .001) ++ex
;
824 if (maxim
<= .001) ++im
;
827 dout(15) << " matching small exporters to big importers" << dendl
;
828 // small exporters to big importers
829 multimap
<double,mds_rank_t
>::iterator ex
= exporters
.begin();
830 multimap
<double,mds_rank_t
>::iterator im
= importers
.begin();
831 while (ex
!= exporters
.end() &&
832 im
!= importers
.end()) {
833 double maxex
= get_maxex(state
, ex
->second
);
834 double maxim
= get_maxim(state
, im
->second
);
835 if (maxex
< .001 || maxim
< .001) break;
836 try_match(state
, ex
->second
, maxex
, im
->second
, maxim
);
837 if (maxex
<= .001) ++ex
;
838 if (maxim
<= .001) ++im
;
842 try_rebalance(state
);
845 int MDBalancer::mantle_prep_rebalance()
847 balance_state_t state
;
849 /* refresh balancer if it has changed */
850 if (bal_version
!= mds
->mdsmap
->get_balancer()) {
851 bal_version
.assign("");
852 int r
= localize_balancer();
855 /* only spam the cluster log from 1 mds on version changes */
856 if (mds
->get_nodeid() == 0)
857 mds
->clog
->info() << "mantle balancer version changed: " << bal_version
;
860 /* prepare for balancing */
861 int cluster_size
= mds
->get_mds_map()->get_num_in_mds();
862 rebalance_time
= clock::now();
863 mds
->mdcache
->migrator
->clear_export_queue();
865 /* fill in the metrics for each mds by grabbing load struct */
866 vector
< map
<string
, double> > metrics (cluster_size
);
867 for (mds_rank_t i
=mds_rank_t(0); i
< mds_rank_t(cluster_size
); i
++) {
868 mds_load_t
& load
= mds_load
.at(i
);
870 metrics
[i
] = {{"auth.meta_load", load
.auth
.meta_load()},
871 {"all.meta_load", load
.all
.meta_load()},
872 {"req_rate", load
.req_rate
},
873 {"queue_len", load
.queue_len
},
874 {"cpu_load_avg", load
.cpu_load_avg
}};
877 /* execute the balancer */
879 int ret
= mantle
.balance(bal_code
, mds
->get_nodeid(), metrics
, state
.targets
);
880 dout(7) << " mantle decided that new targets=" << state
.targets
<< dendl
;
882 /* mantle doesn't know about cluster size, so check target len here */
883 if ((int) state
.targets
.size() != cluster_size
)
884 return -CEPHFS_EINVAL
;
888 try_rebalance(state
);
894 void MDBalancer::try_rebalance(balance_state_t
& state
)
896 if (g_conf()->mds_thrash_exports
) {
897 dout(5) << "mds_thrash is on; not performing standard rebalance operation!"
902 // make a sorted list of my imports
903 multimap
<double, CDir
*> import_pop_map
;
904 multimap
<mds_rank_t
, pair
<CDir
*, double> > import_from_map
;
906 for (auto& dir
: mds
->mdcache
->get_fullauth_subtrees()) {
907 CInode
*diri
= dir
->get_inode();
908 if (diri
->is_mdsdir())
910 if (diri
->get_export_pin(false) != MDS_RANK_NONE
)
912 if (dir
->is_freezing() || dir
->is_frozen())
913 continue; // export pbly already in progress
915 mds_rank_t from
= diri
->authority().first
;
916 double pop
= dir
->pop_auth_subtree
.meta_load();
917 if (g_conf()->mds_bal_idle_threshold
> 0 &&
918 pop
< g_conf()->mds_bal_idle_threshold
&&
919 diri
!= mds
->mdcache
->get_root() &&
920 from
!= mds
->get_nodeid()) {
921 dout(5) << " exporting idle (" << pop
<< ") import " << *dir
922 << " back to mds." << from
<< dendl
;
923 mds
->mdcache
->migrator
->export_dir_nicely(dir
, from
);
927 dout(15) << " map: i imported " << *dir
<< " from " << from
<< dendl
;
928 import_pop_map
.insert(make_pair(pop
, dir
));
929 import_from_map
.insert(make_pair(from
, make_pair(dir
, pop
)));
933 map
<mds_rank_t
, double> export_pop_map
;
935 for (auto &it
: state
.targets
) {
936 mds_rank_t target
= it
.first
;
937 double amount
= it
.second
;
939 if (amount
< MIN_OFFLOAD
)
941 if (amount
* 10 * state
.targets
.size() < target_load
)
944 dout(5) << "want to send " << amount
<< " to mds." << target
945 //<< " .. " << (*it).second << " * " << load_fac
947 << dendl
;//" .. fudge is " << fudge << dendl;
949 double& have
= export_pop_map
[target
];
951 mds
->mdcache
->show_subtrees();
953 // search imports from target
954 if (import_from_map
.count(target
)) {
955 dout(7) << " aha, looking through imports from target mds." << target
<< dendl
;
956 for (auto p
= import_from_map
.equal_range(target
);
957 p
.first
!= p
.second
; ) {
958 CDir
*dir
= p
.first
->second
.first
;
959 double pop
= p
.first
->second
.second
;
960 dout(7) << "considering " << *dir
<< " from " << (*p
.first
).first
<< dendl
;
961 auto plast
= p
.first
++;
963 if (dir
->inode
->is_base())
965 ceph_assert(dir
->inode
->authority().first
== target
); // cuz that's how i put it in the map, dummy
967 if (pop
<= amount
-have
) {
968 dout(7) << "reexporting " << *dir
<< " pop " << pop
969 << " back to mds." << target
<< dendl
;
970 mds
->mdcache
->migrator
->export_dir_nicely(dir
, target
);
972 import_from_map
.erase(plast
);
973 for (auto q
= import_pop_map
.equal_range(pop
);
974 q
.first
!= q
.second
; ) {
975 if (q
.first
->second
== dir
) {
976 import_pop_map
.erase(q
.first
);
982 dout(7) << "can't reexport " << *dir
<< ", too big " << pop
<< dendl
;
984 if (amount
-have
< MIN_OFFLOAD
)
991 for (auto &it
: state
.targets
) {
992 mds_rank_t target
= it
.first
;
993 double amount
= it
.second
;
995 if (!export_pop_map
.count(target
))
997 double& have
= export_pop_map
[target
];
998 if (amount
-have
< MIN_OFFLOAD
)
1001 for (auto p
= import_pop_map
.begin();
1002 p
!= import_pop_map
.end(); ) {
1003 CDir
*dir
= p
->second
;
1004 if (dir
->inode
->is_base()) {
1009 double pop
= p
->first
;
1010 if (pop
<= amount
-have
&& pop
> MIN_REEXPORT
) {
1011 dout(5) << "reexporting " << *dir
<< " pop " << pop
1012 << " to mds." << target
<< dendl
;
1014 mds
->mdcache
->migrator
->export_dir_nicely(dir
, target
);
1015 import_pop_map
.erase(p
++);
1019 if (amount
-have
< MIN_OFFLOAD
)
1024 set
<CDir
*> already_exporting
;
1026 for (auto &it
: state
.targets
) {
1027 mds_rank_t target
= it
.first
;
1028 double amount
= it
.second
;
1030 if (!export_pop_map
.count(target
))
1032 double& have
= export_pop_map
[target
];
1033 if (amount
-have
< MIN_OFFLOAD
)
1036 // okay, search for fragments of my workload
1037 std::vector
<CDir
*> exports
;
1039 for (auto p
= import_pop_map
.rbegin();
1040 p
!= import_pop_map
.rend();
1042 CDir
*dir
= p
->second
;
1043 find_exports(dir
, amount
, &exports
, have
, already_exporting
);
1044 if (amount
-have
< MIN_OFFLOAD
)
1047 //fudge = amount - have;
1049 for (const auto& dir
: exports
) {
1050 dout(5) << " - exporting " << dir
->pop_auth_subtree
1051 << " " << dir
->pop_auth_subtree
.meta_load()
1052 << " to mds." << target
<< " " << *dir
<< dendl
;
1053 mds
->mdcache
->migrator
->export_dir_nicely(dir
, target
);
1057 dout(7) << "done" << dendl
;
1058 mds
->mdcache
->show_subtrees();
1061 void MDBalancer::find_exports(CDir
*dir
,
1063 std::vector
<CDir
*>* exports
,
1065 set
<CDir
*>& already_exporting
)
1067 auto now
= clock::now();
1068 auto duration
= std::chrono::duration
<double>(now
-rebalance_time
).count();
1069 if (duration
> 0.1) {
1070 derr
<< " balancer runs too long" << dendl_impl
;
1075 ceph_assert(dir
->is_auth());
1077 double need
= amount
- have
;
1078 if (need
< amount
* g_conf()->mds_bal_min_start
)
1079 return; // good enough!
1081 double needmax
= need
* g_conf()->mds_bal_need_max
;
1082 double needmin
= need
* g_conf()->mds_bal_need_min
;
1083 double midchunk
= need
* g_conf()->mds_bal_midchunk
;
1084 double minchunk
= need
* g_conf()->mds_bal_minchunk
;
1086 std::vector
<CDir
*> bigger_rep
, bigger_unrep
;
1087 multimap
<double, CDir
*> smaller
;
1089 double dir_pop
= dir
->pop_auth_subtree
.meta_load();
1090 dout(7) << "in " << dir_pop
<< " " << *dir
<< " need " << need
<< " (" << needmin
<< " - " << needmax
<< ")" << dendl
;
1092 double subdir_sum
= 0;
1093 for (elist
<CInode
*>::iterator it
= dir
->pop_lru_subdirs
.begin_use_current();
1098 ceph_assert(in
->is_dir());
1099 ceph_assert(in
->get_parent_dir() == dir
);
1101 auto&& dfls
= in
->get_nested_dirfrags();
1103 size_t num_idle_frags
= 0;
1104 for (const auto& subdir
: dfls
) {
1105 if (already_exporting
.count(subdir
))
1108 // we know all ancestor dirfrags up to subtree root are not freezing or frozen.
1109 // It's more efficient to use CDir::is_{freezing,frozen}_tree_root()
1110 if (subdir
->is_frozen_dir() || subdir
->is_frozen_tree_root() ||
1111 subdir
->is_freezing_dir() || subdir
->is_freezing_tree_root())
1112 continue; // can't export this right now!
1115 double pop
= subdir
->pop_auth_subtree
.meta_load();
1117 dout(15) << " subdir pop " << pop
<< " " << *subdir
<< dendl
;
1119 if (pop
< minchunk
) {
1125 if (pop
> needmin
&& pop
< needmax
) {
1126 exports
->push_back(subdir
);
1127 already_exporting
.insert(subdir
);
1133 if (subdir
->is_rep())
1134 bigger_rep
.push_back(subdir
);
1136 bigger_unrep
.push_back(subdir
);
1138 smaller
.insert(pair
<double,CDir
*>(pop
, subdir
));
1140 if (dfls
.size() == num_idle_frags
)
1141 in
->item_pop_lru
.remove_myself();
1143 dout(15) << " sum " << subdir_sum
<< " / " << dir_pop
<< dendl
;
1145 // grab some sufficiently big small items
1146 multimap
<double,CDir
*>::reverse_iterator it
;
1147 for (it
= smaller
.rbegin();
1148 it
!= smaller
.rend();
1151 if ((*it
).first
< midchunk
)
1154 dout(7) << " taking smaller " << *(*it
).second
<< dendl
;
1156 exports
->push_back((*it
).second
);
1157 already_exporting
.insert((*it
).second
);
1158 have
+= (*it
).first
;
1163 // apparently not enough; drill deeper into the hierarchy (if non-replicated)
1164 for (const auto& dir
: bigger_unrep
) {
1165 dout(15) << " descending into " << *dir
<< dendl
;
1166 find_exports(dir
, amount
, exports
, have
, already_exporting
);
1171 // ok fine, use smaller bits
1173 it
!= smaller
.rend();
1175 dout(7) << " taking (much) smaller " << it
->first
<< " " << *(*it
).second
<< dendl
;
1177 exports
->push_back((*it
).second
);
1178 already_exporting
.insert((*it
).second
);
1179 have
+= (*it
).first
;
1184 // ok fine, drill into replicated dirs
1185 for (const auto& dir
: bigger_rep
) {
1186 dout(7) << " descending into replicated " << *dir
<< dendl
;
1187 find_exports(dir
, amount
, exports
, have
, already_exporting
);
1193 void MDBalancer::hit_inode(CInode
*in
, int type
, int who
)
1196 in
->pop
.get(type
).hit();
1198 if (in
->get_parent_dn())
1199 hit_dir(in
->get_parent_dn()->get_dir(), type
, who
);
// Decide whether dirfrag *dir should be queued for a split or a merge.
// A split is queued when the frag crossed its size threshold or was
// flagged "hot" by the caller; a merge when a non-root frag shrank.
//
// NOTE(review): this chunk is a lossy extraction -- brace-only lines,
// at least one additional guard between the interval check and the
// is_base() check, the else-branches of the split logic, and the
// queue_merge() call are not visible here.  Only comments were added;
// all visible code tokens are untouched.
void MDBalancer::maybe_fragment(CDir *dir, bool hot)
  // fragmentation must be enabled and rate-limited via the interval
  if (bal_fragment_dirs && bal_fragment_interval > 0 &&
      !dir->inode->is_base() &&   // not root/mdsdir (for now at least)
      !dir->inode->is_stray()) {  // not straydir

    // split: either the dirfrag grew past its threshold or it is "hot"
    if (dir->should_split() || hot) {
      if (split_pending.count(dir->dirfrag()) == 0) {
        queue_split(dir, false);
        // NOTE(review): the following belongs to an elided else-branch:
        // a split is already pending, so escalate to a fast split if
        // warranted, otherwise just log.
        if (dir->should_split_fast()) {
          queue_split(dir, true);
          dout(10) << ": fragment already enqueued to split: "

    // merge: only non-root frags, and only if no merge is pending yet
    if (dir->get_frag() != frag_t() && dir->should_merge() &&
        merge_pending.count(dir->dirfrag()) == 0) {
// Record a popularity hit of `amount` on dirfrag *dir and propagate it
// up the ancestor chain; as side effects this may queue fragmentation
// (via maybe_fragment) and replicate/unreplicate the dirfrag.
//
// NOTE(review): this chunk is a lossy extraction -- brace-only lines,
// the ancestor-walk loop header, and several small guards (presumably
// `if (rd_adj != 0.0)` / `if (hit_subtree)`) are elided.  Only comments
// were added; all visible code tokens are untouched.
void MDBalancer::hit_dir(CDir *dir, int type, int who, double amount)
  // stray dirs are never balanced
  if (dir->inode->is_stray())

  // hit the dirfrag's own counter; v is the post-hit rate
  double v = dir->pop_me.get(type).hit(amount);

  // "hot" if the read/write rate crossed the configured split threshold
  const bool hot = (v > g_conf()->mds_bal_split_rd && type == META_POP_IRD) ||
    (v > g_conf()->mds_bal_split_wr && type == META_POP_IWR);

  dout(20) << type << " pop is " << v << ", frag " << dir->get_frag()
           << " size " << dir->get_frag_size() << " " << dir->pop_me << dendl;

  // queue a split/merge if warranted
  maybe_fragment(dir, hot);

  // track which ranks are reading, to measure read spread
  if (type == META_POP_IRD && who >= 0) {
    dir->pop_spread.hit(who);

  double rd_adj = 0.0;
  // re-evaluate replication at most once per popularity sample epoch
  if (type == META_POP_IRD &&
      dir->last_popularity_sample < last_sample) {
    double dir_pop = dir->pop_auth_subtree.get(type).get();  // hmm??
    dir->last_popularity_sample = last_sample;
    double pop_sp = dir->pop_spread.get();
    // reads spread across many ranks inflate the effective popularity
    dir_pop += pop_sp * 10;

    //if (dir->ino() == inodeno_t(0x10000000002))
    dout(20) << type << " pop " << dir_pop << " spread " << pop_sp
             << " " << dir->pop_spread.last[0]
             << " " << dir->pop_spread.last[1]
             << " " << dir->pop_spread.last[2]
             << " " << dir->pop_spread.last[3]
             << " in " << *dir << dendl;

    // only the unambiguous auth may change the replication state
    if (dir->is_auth() && !dir->is_ambiguous_auth()) {
      // replicate when hot enough
      if (dir->can_rep() &&
          dir_pop >= g_conf()->mds_bal_replicate_threshold) {
        // rd_adj is the (negative) share of the read load this rank
        // gives away once all in-MDS ranks serve replicas
        double rdp = dir->pop_me.get(META_POP_IRD).get();
        rd_adj = rdp / mds->get_mds_map()->get_num_in_mds() - rdp;
        rd_adj /= 2.0;  // temper somewhat

        dout(5) << "replicating dir " << *dir << " pop " << dir_pop << " .. rdp " << rdp << " adj " << rd_adj << dendl;

        dir->dir_rep = CDir::REP_ALL;
        mds->mdcache->send_dir_updates(dir, true);

        // fixme this should adjust the whole pop hierarchy
        dir->pop_me.get(META_POP_IRD).adjust(rd_adj);
        dir->pop_auth_subtree.get(META_POP_IRD).adjust(rd_adj);

      // unreplicate when load drops; never for the root (ino 1).
      // NOTE(review): an additional is_rep()-style guard is likely
      // elided between these two conditions -- confirm.
      if (dir->ino() != 1 &&
          dir_pop < g_conf()->mds_bal_unreplicate_threshold) {
        dout(5) << "unreplicating dir " << *dir << " pop " << dir_pop << dendl;

        dir->dir_rep = CDir::REP_NONE;
        mds->mdcache->send_dir_updates(dir);

  // walk up the ancestors, charging nested / auth-subtree counters
  bool hit_subtree = dir->is_auth();         // current auth subtree (if any)
  bool hit_subtree_nested = dir->is_auth();  // all nested auth subtrees

    // body of the (elided) ancestor-walk loop:
    CDir *pdir = dir->inode->get_parent_dir();
    dir->pop_nested.get(type).hit(amount);
    // presumably guarded by `if (rd_adj != 0.0)` in the elided lines
    dir->pop_nested.get(META_POP_IRD).adjust(rd_adj);

    // presumably guarded by `if (hit_subtree)` in the elided lines
    dir->pop_auth_subtree.get(type).hit(amount);

    dir->pop_auth_subtree.get(META_POP_IRD).adjust(rd_adj);

    if (dir->is_subtree_root())
      hit_subtree = false;  // end of auth domain, stop hitting auth counters.
    // keep the parent's LRU of recently-active subdirs fresh
    pdir->pop_lru_subdirs.push_front(&dir->get_inode()->item_pop_lru);

    if (hit_subtree_nested) {
      dir->pop_auth_subtree_nested.get(type).hit(amount);

      dir->pop_auth_subtree_nested.get(META_POP_IRD).adjust(rd_adj);
1334 * subtract off an exported chunk.
1335 * this excludes *dir itself (encode_export_dir should have taken care of that)
1336 * we _just_ do the parents' nested counters.
1338 * NOTE: call me _after_ forcing *dir into a subtree root,
1339 * but _before_ doing the encode_export_dirs.
1341 void MDBalancer::subtract_export(CDir
*dir
)
1343 dirfrag_load_vec_t subload
= dir
->pop_auth_subtree
;
1346 dir
= dir
->inode
->get_parent_dir();
1349 dir
->pop_nested
.sub(subload
);
1350 dir
->pop_auth_subtree_nested
.sub(subload
);
1355 void MDBalancer::add_import(CDir
*dir
)
1357 dirfrag_load_vec_t subload
= dir
->pop_auth_subtree
;
1360 dir
= dir
->inode
->get_parent_dir();
1363 dir
->pop_nested
.add(subload
);
1364 dir
->pop_auth_subtree_nested
.add(subload
);
// After a rename moved the subtree under *dir, shift dir's popularity
// contributions onto (inc=true) or off of (inc=false) the ancestor
// chain starting at *pdir.
//
// NOTE(review): lossy extraction -- the enclosing ancestor-walk loop,
// the declaration of the `cur` local, and the if(inc)/else branch
// structure are partially elided.  Only comments were added; all
// visible code tokens are untouched.
void MDBalancer::adjust_pop_for_rename(CDir *pdir, CDir *dir, bool inc)
  // auth-subtree counters are adjusted only while we remain inside
  // dir's auth subtree; the nested variant covers all nested subtrees
  bool adjust_subtree_nest = dir->is_auth();
  bool adjust_subtree = adjust_subtree_nest && !dir->is_subtree_root();

    // inc branch: credit the new ancestors with dir's load
    pdir->pop_nested.add(dir->pop_nested);
    if (adjust_subtree) {
      pdir->pop_auth_subtree.add(dir->pop_auth_subtree);
      // `cur` is declared in an elided line (tracks the child of pdir)
      pdir->pop_lru_subdirs.push_front(&cur->get_inode()->item_pop_lru);

    if (adjust_subtree_nest)
      pdir->pop_auth_subtree_nested.add(dir->pop_auth_subtree_nested);

    // (elided else) dec branch: debit the old ancestors
    pdir->pop_nested.sub(dir->pop_nested);

    if (adjust_subtree)
      pdir->pop_auth_subtree.sub(dir->pop_auth_subtree);

    if (adjust_subtree_nest)
      pdir->pop_auth_subtree_nested.sub(dir->pop_auth_subtree_nested);

    // crossing a subtree root means leaving the auth domain
    if (pdir->is_subtree_root())
      adjust_subtree = false;
    // climb to the next ancestor
    pdir = pdir->inode->get_parent_dir();
// React to the failure of MDS rank `who` by discarding the cached
// per-rank "last epoch under" map so stale balancing state is not
// applied against a replacement daemon.
// NOTE(review): lossy extraction -- surrounding lines are elided; this
// clear() is presumably guarded (likely on rank 0, which drives the
// balancer) -- confirm against repository history.
void MDBalancer::handle_mds_failure(mds_rank_t who)
  mds_last_epoch_under_map.clear();
1407 int MDBalancer::dump_loads(Formatter
*f
) const
1409 std::deque
<CDir
*> dfs
;
1410 if (mds
->mdcache
->get_root()) {
1411 mds
->mdcache
->get_root()->get_dirfrags(dfs
);
1413 dout(10) << "no root" << dendl
;
1416 f
->open_object_section("loads");
1418 f
->open_array_section("dirfrags");
1419 while (!dfs
.empty()) {
1420 CDir
*dir
= dfs
.front();
1423 f
->open_object_section("dir");
1427 for (auto it
= dir
->begin(); it
!= dir
->end(); ++it
) {
1428 CInode
*in
= it
->second
->get_linkage()->get_inode();
1429 if (!in
|| !in
->is_dir())
1432 auto&& ls
= in
->get_dirfrags();
1433 for (const auto& subdir
: ls
) {
1434 if (subdir
->pop_nested
.meta_load() < .001)
1436 dfs
.push_back(subdir
);
1440 f
->close_section(); // dirfrags array
1442 f
->open_object_section("mds_load");
1445 auto dump_mds_load
= [f
](const mds_load_t
& load
) {
1446 f
->dump_float("request_rate", load
.req_rate
);
1447 f
->dump_float("cache_hit_rate", load
.cache_hit_rate
);
1448 f
->dump_float("queue_length", load
.queue_len
);
1449 f
->dump_float("cpu_load", load
.cpu_load_avg
);
1450 f
->dump_float("mds_load", load
.mds_load());
1452 f
->open_object_section("auth_dirfrags");
1455 f
->open_object_section("all_dirfrags");
1460 for (const auto& [rank
, load
] : mds_load
) {
1461 CachedStackStringStream css
;
1462 *css
<< "mds." << rank
;
1463 f
->open_object_section(css
->strv());
1464 dump_mds_load(load
);
1468 f
->close_section(); // mds_load
1470 f
->open_object_section("mds_meta_load");
1471 for (auto& [rank
, mload
] : mds_meta_load
) {
1472 CachedStackStringStream css
;
1473 *css
<< "mds." << rank
;
1474 f
->dump_float(css
->strv(), mload
);
1476 f
->close_section(); // mds_meta_load
1478 f
->open_object_section("mds_import_map");
1479 for (auto& [rank
, imports
] : mds_import_map
) {
1481 CachedStackStringStream css
;
1482 *css
<< "mds." << rank
;
1483 f
->open_array_section(css
->strv());
1485 for (auto& [rank_from
, mload
] : imports
) {
1486 f
->open_object_section("from");
1487 CachedStackStringStream css
;
1488 *css
<< "mds." << rank_from
;
1489 f
->dump_float(css
->strv(), mload
);
1492 f
->close_section(); // mds.? array
1494 f
->close_section(); // mds_import_map
1496 f
->close_section(); // loads