// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
// vim: ts=8 sw=2 smarttab
/*
 * Ceph - scalable distributed file system
 *
 * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
 *
 * This is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License version 2.1, as published by the Free Software
 * Foundation. See file COPYING.
 */
15 #include "include/compat.h"
18 #include "mon/MonClient.h"
19 #include "MDBalancer.h"
28 #include "include/Context.h"
29 #include "msg/Messenger.h"
36 using std::chrono::duration_cast
;
38 #include "common/config.h"
39 #include "common/errno.h"
41 #define dout_context g_ceph_context
43 #define dout_prefix *_dout << "mds." << mds->get_nodeid() << ".bal " << __func__ << " "
47 auto subsys = ceph_subsys_mds;\
48 if ((dout_context)->_conf->subsys.should_gather(ceph_subsys_mds_balancer, lvl)) {\
49 subsys = ceph_subsys_mds_balancer;\
51 dout_impl(dout_context, ceph::dout::need_dynamic(subsys), lvl) dout_prefix
53 #define dendl dendl_impl; } while (0)
56 #define MIN_LOAD 50 // ??
57 #define MIN_REEXPORT 5 // will automatically reexport
58 #define MIN_OFFLOAD 10 // point at which i stop trying, close enough
61 int MDBalancer::proc_message(const cref_t
<Message
> &m
)
63 switch (m
->get_type()) {
65 case MSG_MDS_HEARTBEAT
:
66 handle_heartbeat(ref_cast
<MHeartbeat
>(m
));
70 derr
<< " balancer unknown message " << m
->get_type() << dendl_impl
;
71 ceph_abort_msg("balancer unknown message");
77 MDBalancer::MDBalancer(MDSRank
*m
, Messenger
*msgr
, MonClient
*monc
) :
78 mds(m
), messenger(msgr
), mon_client(monc
)
80 bal_fragment_dirs
= g_conf().get_val
<bool>("mds_bal_fragment_dirs");
81 bal_fragment_interval
= g_conf().get_val
<int64_t>("mds_bal_fragment_interval");
84 void MDBalancer::handle_conf_change(const std::set
<std::string
>& changed
, const MDSMap
& mds_map
)
86 if (changed
.count("mds_bal_fragment_dirs"))
87 bal_fragment_dirs
= g_conf().get_val
<bool>("mds_bal_fragment_dirs");
88 if (changed
.count("mds_bal_fragment_interval"))
89 bal_fragment_interval
= g_conf().get_val
<int64_t>("mds_bal_fragment_interval");
92 void MDBalancer::handle_export_pins(void)
94 const mds_rank_t max_mds
= mds
->mdsmap
->get_max_mds();
95 auto mdcache
= mds
->mdcache
;
97 auto &q
= mdcache
->export_pin_queue
;
99 dout(20) << "export_pin_queue size=" << q
.size() << dendl
;
100 while (it
!= q
.end()) {
103 ceph_assert(in
->is_dir());
105 mds_rank_t export_pin
= in
->get_export_pin(false);
106 in
->check_pin_policy(export_pin
);
108 if (export_pin
>= max_mds
) {
109 dout(20) << " delay export_pin=" << export_pin
<< " on " << *in
<< dendl
;
110 in
->state_clear(CInode::STATE_QUEUEDEXPORTPIN
);
113 in
->state_set(CInode::STATE_DELAYEDEXPORTPIN
);
114 mdcache
->export_pin_delayed_queue
.insert(in
);
118 dout(20) << " executing export_pin=" << export_pin
<< " on " << *in
<< dendl
;
119 unsigned min_frag_bits
= 0;
120 mds_rank_t target
= MDS_RANK_NONE
;
123 else if (export_pin
== MDS_RANK_EPHEMERAL_RAND
)
124 target
= mdcache
->hash_into_rank_bucket(in
->ino());
125 else if (export_pin
== MDS_RANK_EPHEMERAL_DIST
)
126 min_frag_bits
= mdcache
->get_ephemeral_dist_frag_bits();
129 for (auto&& dir
: in
->get_dirfrags()) {
133 if (export_pin
== MDS_RANK_EPHEMERAL_DIST
) {
134 if (dir
->get_frag().bits() < min_frag_bits
) {
135 if (!dir
->state_test(CDir::STATE_CREATING
) &&
136 !dir
->is_frozen() && !dir
->is_freezing()) {
137 queue_split(dir
, true);
142 target
= mdcache
->hash_into_rank_bucket(in
->ino(), dir
->get_frag());
145 if (target
== MDS_RANK_NONE
) {
146 if (dir
->state_test(CDir::STATE_AUXSUBTREE
)) {
147 if (dir
->is_frozen() || dir
->is_freezing()) {
152 dout(10) << " clear auxsubtree on " << *dir
<< dendl
;
153 dir
->state_clear(CDir::STATE_AUXSUBTREE
);
154 mds
->mdcache
->try_subtree_merge(dir
);
156 } else if (target
== mds
->get_nodeid()) {
157 if (dir
->state_test(CDir::STATE_AUXSUBTREE
)) {
158 ceph_assert(dir
->is_subtree_root());
159 } else if (dir
->state_test(CDir::STATE_CREATING
) ||
160 dir
->is_frozen() || dir
->is_freezing()) {
164 } else if (!dir
->is_subtree_root()) {
165 dir
->state_set(CDir::STATE_AUXSUBTREE
);
166 mds
->mdcache
->adjust_subtree_auth(dir
, mds
->get_nodeid());
167 dout(10) << " create aux subtree on " << *dir
<< dendl
;
169 dout(10) << " set auxsubtree bit on " << *dir
<< dendl
;
170 dir
->state_set(CDir::STATE_AUXSUBTREE
);
173 /* Only export a directory if it's non-empty. An empty directory will
174 * be sent back by the importer.
176 if (dir
->get_num_head_items() > 0) {
177 mds
->mdcache
->migrator
->export_dir(dir
, target
);
184 in
->state_clear(CInode::STATE_QUEUEDEXPORTPIN
);
189 std::vector
<CDir
*> authsubs
= mdcache
->get_auth_subtrees();
190 bool print_auth_subtrees
= true;
192 if (authsubs
.size() > AUTH_TREES_THRESHOLD
&&
193 !g_conf()->subsys
.should_gather
<ceph_subsys_mds
, 25>()) {
194 dout(15) << "number of auth trees = " << authsubs
.size() << "; not "
195 "printing auth trees" << dendl
;
196 print_auth_subtrees
= false;
199 for (auto &cd
: authsubs
) {
200 mds_rank_t export_pin
= cd
->inode
->get_export_pin();
201 cd
->inode
->check_pin_policy(export_pin
);
203 if (export_pin
== MDS_RANK_EPHEMERAL_DIST
) {
204 export_pin
= mdcache
->hash_into_rank_bucket(cd
->ino(), cd
->get_frag());
205 } else if (export_pin
== MDS_RANK_EPHEMERAL_RAND
) {
206 export_pin
= mdcache
->hash_into_rank_bucket(cd
->ino());
209 if (print_auth_subtrees
)
210 dout(25) << "auth tree " << *cd
<< " export_pin=" << export_pin
<< dendl
;
212 if (export_pin
>= 0 && export_pin
!= mds
->get_nodeid() &&
213 export_pin
< mds
->mdsmap
->get_max_mds()) {
214 mdcache
->migrator
->export_dir(cd
, export_pin
);
219 void MDBalancer::tick()
221 static int num_bal_times
= g_conf()->mds_bal_max
;
222 auto bal_interval
= g_conf().get_val
<int64_t>("mds_bal_interval");
223 auto bal_max_until
= g_conf().get_val
<int64_t>("mds_bal_max_until");
224 time now
= clock::now();
226 if (g_conf()->mds_bal_export_pin
) {
227 handle_export_pins();
231 if (chrono::duration
<double>(now
-last_sample
).count() >
232 g_conf()->mds_bal_sample_interval
) {
233 dout(15) << "tick last_sample now " << now
<< dendl
;
237 // We can use duration_cast below, although the result is an int,
238 // because the values from g_conf are also integers.
240 if (mds
->get_nodeid() == 0
243 && duration_cast
<chrono::seconds
>(now
- last_heartbeat
).count() >= bal_interval
244 && (num_bal_times
|| (bal_max_until
>= 0 && mds
->get_uptime().count() > bal_max_until
))) {
245 last_heartbeat
= now
;
250 mds
->mdcache
->show_subtrees(10, true);
256 class C_Bal_SendHeartbeat
: public MDSInternalContext
{
258 explicit C_Bal_SendHeartbeat(MDSRank
*mds_
) : MDSInternalContext(mds_
) { }
259 void finish(int f
) override
{
260 mds
->balancer
->send_heartbeat();
265 double mds_load_t::mds_load() const
267 switch(g_conf()->mds_bal_mode
) {
270 .8 * auth
.meta_load() +
271 .2 * all
.meta_load() +
276 return req_rate
+ 10.0*queue_len
;
286 mds_load_t
MDBalancer::get_load()
288 auto now
= clock::now();
290 mds_load_t load
{DecayRate()}; /* zero DecayRate! */
292 if (mds
->mdcache
->get_root()) {
293 auto&& ls
= mds
->mdcache
->get_root()->get_dirfrags();
295 load
.auth
.add(d
->pop_auth_subtree_nested
);
296 load
.all
.add(d
->pop_nested
);
299 dout(20) << "no root, no load" << dendl
;
302 uint64_t num_requests
= mds
->get_num_requests();
304 uint64_t cpu_time
= 1;
306 string stat_path
= PROCPREFIX
"/proc/self/stat";
307 ifstream
stat_file(stat_path
);
308 if (stat_file
.is_open()) {
309 vector
<string
> stat_vec(std::istream_iterator
<string
>{stat_file
},
310 std::istream_iterator
<string
>());
311 if (stat_vec
.size() >= 15) {
313 cpu_time
= strtoll(stat_vec
[13].c_str(), nullptr, 10) +
314 strtoll(stat_vec
[14].c_str(), nullptr, 10);
316 derr
<< "input file '" << stat_path
<< "' not resolvable" << dendl_impl
;
319 derr
<< "input file '" << stat_path
<< "' not found" << dendl_impl
;
323 load
.queue_len
= messenger
->get_dispatch_queue_len();
325 bool update_last
= true;
326 if (last_get_load
!= clock::zero() &&
327 now
> last_get_load
) {
328 double el
= std::chrono::duration
<double>(now
-last_get_load
).count();
330 if (num_requests
> last_num_requests
)
331 load
.req_rate
= (num_requests
- last_num_requests
) / el
;
332 if (cpu_time
> last_cpu_time
)
333 load
.cpu_load_avg
= (cpu_time
- last_cpu_time
) / el
;
335 auto p
= mds_load
.find(mds
->get_nodeid());
336 if (p
!= mds_load
.end()) {
337 load
.req_rate
= p
->second
.req_rate
;
338 load
.cpu_load_avg
= p
->second
.cpu_load_avg
;
340 if (num_requests
>= last_num_requests
&& cpu_time
>= last_cpu_time
)
346 last_num_requests
= num_requests
;
347 last_cpu_time
= cpu_time
;
351 dout(15) << load
<< dendl
;
356 * Read synchronously from RADOS using a timeout. We cannot do daemon-local
357 * fallbacks (i.e. kick off async read when we are processing the map and
358 * check status when we get here) with the way the mds is structured.
360 int MDBalancer::localize_balancer()
362 /* reset everything */
366 ceph::mutex lock
= ceph::make_mutex("lock");
367 ceph::condition_variable cond
;
369 /* we assume that balancer is in the metadata pool */
370 object_t oid
= object_t(mds
->mdsmap
->get_balancer());
371 object_locator_t
oloc(mds
->get_metadata_pool());
372 ceph_tid_t tid
= mds
->objecter
->read(oid
, oloc
, 0, 0, CEPH_NOSNAP
, &lua_src
, 0,
373 new C_SafeCond(lock
, cond
, &ack
, &r
));
374 dout(15) << "launched non-blocking read tid=" << tid
375 << " oid=" << oid
<< " oloc=" << oloc
<< dendl
;
377 /* timeout: if we waste half our time waiting for RADOS, then abort! */
378 std::cv_status ret_t
= [&] {
379 auto bal_interval
= g_conf().get_val
<int64_t>("mds_bal_interval");
380 std::unique_lock locker
{lock
};
381 return cond
.wait_for(locker
, std::chrono::seconds(bal_interval
/ 2));
383 /* success: store the balancer in memory and set the version. */
385 if (ret_t
== std::cv_status::timeout
) {
386 mds
->objecter
->op_cancel(tid
, -CEPHFS_ECANCELED
);
387 return -CEPHFS_ETIMEDOUT
;
389 bal_code
.assign(lua_src
.to_str());
390 bal_version
.assign(oid
.name
);
391 dout(10) "bal_code=" << bal_code
<< dendl
;
396 void MDBalancer::send_heartbeat()
398 if (mds
->is_cluster_degraded()) {
399 dout(10) << "degraded" << dendl
;
403 if (!mds
->mdcache
->is_open()) {
404 dout(10) << "not open" << dendl
;
405 mds
->mdcache
->wait_for_open(new C_Bal_SendHeartbeat(mds
));
409 if (mds
->get_nodeid() == 0) {
415 mds_load_t load
= get_load();
416 mds
->logger
->set(l_mds_load_cent
, 100 * load
.mds_load());
417 mds
->logger
->set(l_mds_dispatch_queue_len
, load
.queue_len
);
419 auto em
= mds_load
.emplace(std::piecewise_construct
, std::forward_as_tuple(mds
->get_nodeid()), std::forward_as_tuple(load
));
421 em
.first
->second
= load
;
424 // import_map -- how much do i import from whom
425 map
<mds_rank_t
, float> import_map
;
426 for (auto& im
: mds
->mdcache
->get_auth_subtrees()) {
427 mds_rank_t from
= im
->inode
->authority().first
;
428 if (from
== mds
->get_nodeid()) continue;
429 if (im
->get_inode()->is_stray()) continue;
430 import_map
[from
] += im
->pop_auth_subtree
.meta_load();
432 mds_import_map
[ mds
->get_nodeid() ] = import_map
;
435 dout(3) << " epoch " << beat_epoch
<< " load " << load
<< dendl
;
436 for (const auto& [rank
, load
] : import_map
) {
437 dout(5) << " import_map from " << rank
<< " -> " << load
<< dendl
;
442 mds
->get_mds_map()->get_up_mds_set(up
);
443 for (const auto& r
: up
) {
444 if (r
== mds
->get_nodeid())
446 auto hb
= make_message
<MHeartbeat
>(load
, beat_epoch
);
447 hb
->get_import_map() = import_map
;
448 mds
->send_message_mds(hb
, r
);
452 void MDBalancer::handle_heartbeat(const cref_t
<MHeartbeat
> &m
)
454 mds_rank_t who
= mds_rank_t(m
->get_source().num());
455 dout(25) << "=== got heartbeat " << m
->get_beat() << " from " << m
->get_source().num() << " " << m
->get_load() << dendl
;
457 if (!mds
->is_active())
460 if (!mds
->mdcache
->is_open()) {
461 dout(10) << "opening root on handle_heartbeat" << dendl
;
462 mds
->mdcache
->wait_for_open(new C_MDS_RetryMessage(mds
, m
));
466 if (mds
->is_cluster_degraded()) {
467 dout(10) << " degraded, ignoring" << dendl
;
471 if (mds
->get_nodeid() != 0 && m
->get_beat() > beat_epoch
) {
472 dout(10) << "receive next epoch " << m
->get_beat() << " from mds." << who
<< " before mds0" << dendl
;
474 beat_epoch
= m
->get_beat();
475 // clear the mds load info whose epoch is less than beat_epoch
480 dout(20) << " from mds0, new epoch " << m
->get_beat() << dendl
;
481 if (beat_epoch
!= m
->get_beat()) {
482 beat_epoch
= m
->get_beat();
488 mds
->mdcache
->show_subtrees();
489 } else if (mds
->get_nodeid() == 0) {
490 if (beat_epoch
!= m
->get_beat()) {
491 dout(10) << " old heartbeat epoch, ignoring" << dendl
;
497 auto em
= mds_load
.emplace(std::piecewise_construct
, std::forward_as_tuple(who
), std::forward_as_tuple(m
->get_load()));
499 em
.first
->second
= m
->get_load();
502 mds_import_map
[who
] = m
->get_import_map();
505 unsigned cluster_size
= mds
->get_mds_map()->get_num_in_mds();
506 if (mds_load
.size() == cluster_size
) {
508 //export_empties(); // no!
510 /* avoid spamming ceph -w if user does not turn mantle on */
511 if (mds
->mdsmap
->get_balancer() != "") {
512 int r
= mantle_prep_rebalance();
514 mds
->clog
->warn() << "using old balancer; mantle failed for "
515 << "balancer=" << mds
->mdsmap
->get_balancer()
516 << " : " << cpp_strerror(r
);
518 prep_rebalance(m
->get_beat());
523 double MDBalancer::try_match(balance_state_t
& state
, mds_rank_t ex
, double& maxex
,
524 mds_rank_t im
, double& maxim
)
526 if (maxex
<= 0 || maxim
<= 0) return 0.0;
528 double howmuch
= std::min(maxex
, maxim
);
530 dout(5) << " - mds." << ex
<< " exports " << howmuch
<< " to mds." << im
<< dendl
;
532 if (ex
== mds
->get_nodeid())
533 state
.targets
[im
] += howmuch
;
535 state
.exported
[ex
] += howmuch
;
536 state
.imported
[im
] += howmuch
;
544 void MDBalancer::queue_split(const CDir
*dir
, bool fast
)
546 dout(10) << __func__
<< " enqueuing " << *dir
547 << " (fast=" << fast
<< ")" << dendl
;
549 const dirfrag_t df
= dir
->dirfrag();
551 auto callback
= [this, df
](int r
) {
552 if (split_pending
.erase(df
) == 0) {
553 // Someone beat me to it. This can happen in the fast splitting
554 // path, because we spawn two contexts, one with mds->timer and
555 // one with mds->queue_waiter. The loser can safely just drop
560 auto mdcache
= mds
->mdcache
;
562 CDir
*dir
= mdcache
->get_dirfrag(df
);
564 dout(10) << "drop split on " << df
<< " because not in cache" << dendl
;
567 if (!dir
->is_auth()) {
568 dout(10) << "drop split on " << df
<< " because non-auth" << dendl
;
572 // Pass on to MDCache: note that the split might still not
573 // happen if the checks in MDCache::can_fragment fail.
574 dout(10) << __func__
<< " splitting " << *dir
<< dendl
;
575 int bits
= g_conf()->mds_bal_split_bits
;
576 if (dir
->inode
->is_ephemeral_dist()) {
577 unsigned min_frag_bits
= mdcache
->get_ephemeral_dist_frag_bits();
578 if (df
.frag
.bits() + bits
< min_frag_bits
)
579 bits
= min_frag_bits
- df
.frag
.bits();
581 mdcache
->split_dir(dir
, bits
);
584 auto ret
= split_pending
.insert(df
);
585 bool is_new
= ret
.second
;
588 // Do the split ASAP: enqueue it in the MDSRank waiters which are
589 // run at the end of dispatching the current request
590 mds
->queue_waiter(new MDSInternalContextWrapper(mds
,
591 new LambdaContext(std::move(callback
))));
593 // Set a timer to really do the split: we don't do it immediately
594 // so that bursts of ops on a directory have a chance to go through
595 // before we freeze it.
596 mds
->timer
.add_event_after(bal_fragment_interval
,
597 new LambdaContext(std::move(callback
)));
601 void MDBalancer::queue_merge(CDir
*dir
)
603 const auto frag
= dir
->dirfrag();
604 auto callback
= [this, frag
](int r
) {
605 ceph_assert(frag
.frag
!= frag_t());
607 // frag must be in this set because only one context is in flight
608 // for a given frag at a time (because merge_pending is checked before
609 // starting one), and this context is the only one that erases it.
610 merge_pending
.erase(frag
);
612 auto mdcache
= mds
->mdcache
;
613 CDir
*dir
= mdcache
->get_dirfrag(frag
);
615 dout(10) << "drop merge on " << frag
<< " because not in cache" << dendl
;
618 ceph_assert(dir
->dirfrag() == frag
);
620 if(!dir
->is_auth()) {
621 dout(10) << "drop merge on " << *dir
<< " because lost auth" << dendl
;
625 dout(10) << "merging " << *dir
<< dendl
;
627 CInode
*diri
= dir
->get_inode();
629 unsigned min_frag_bits
= 0;
630 if (diri
->is_ephemeral_dist())
631 min_frag_bits
= mdcache
->get_ephemeral_dist_frag_bits();
633 frag_t fg
= dir
->get_frag();
634 while (fg
.bits() > min_frag_bits
) {
635 frag_t sibfg
= fg
.get_sibling();
636 auto&& [complete
, sibs
] = diri
->get_dirfrags_under(sibfg
);
638 dout(10) << " not all sibs under " << sibfg
<< " in cache (have " << sibs
<< ")" << dendl
;
642 for (auto& sib
: sibs
) {
643 if (!sib
->is_auth() || !sib
->should_merge()) {
649 dout(10) << " not all sibs under " << sibfg
<< " " << sibs
<< " should_merge" << dendl
;
652 dout(10) << " all sibs under " << sibfg
<< " " << sibs
<< " should merge" << dendl
;
656 if (fg
!= dir
->get_frag())
657 mdcache
->merge_dir(diri
, fg
);
660 if (merge_pending
.count(frag
) == 0) {
661 dout(20) << " enqueued dir " << *dir
<< dendl
;
662 merge_pending
.insert(frag
);
663 mds
->timer
.add_event_after(bal_fragment_interval
,
664 new LambdaContext(std::move(callback
)));
666 dout(20) << " dir already in queue " << *dir
<< dendl
;
670 void MDBalancer::prep_rebalance(int beat
)
672 balance_state_t state
;
674 if (g_conf()->mds_thrash_exports
) {
675 //we're going to randomly export to all the mds in the cluster
676 set
<mds_rank_t
> up_mds
;
677 mds
->get_mds_map()->get_up_mds_set(up_mds
);
678 for (const auto &rank
: up_mds
) {
679 state
.targets
[rank
] = 0.0;
682 int cluster_size
= mds
->get_mds_map()->get_num_in_mds();
683 mds_rank_t whoami
= mds
->get_nodeid();
684 rebalance_time
= clock::now();
686 dout(7) << "cluster loads are" << dendl
;
688 mds
->mdcache
->migrator
->clear_export_queue();
690 // rescale! turn my mds_load back into meta_load units
691 double load_fac
= 1.0;
692 map
<mds_rank_t
, mds_load_t
>::iterator m
= mds_load
.find(whoami
);
693 if ((m
!= mds_load
.end()) && (m
->second
.mds_load() > 0)) {
694 double metald
= m
->second
.auth
.meta_load();
695 double mdsld
= m
->second
.mds_load();
696 load_fac
= metald
/ mdsld
;
697 dout(7) << " load_fac is " << load_fac
698 << " <- " << m
->second
.auth
<< " " << metald
703 mds_meta_load
.clear();
705 double total_load
= 0.0;
706 multimap
<double,mds_rank_t
> load_map
;
707 for (mds_rank_t i
=mds_rank_t(0); i
< mds_rank_t(cluster_size
); i
++) {
708 mds_load_t
& load
= mds_load
.at(i
);
710 double l
= load
.mds_load() * load_fac
;
711 mds_meta_load
[i
] = l
;
714 dout(7) << " mds." << i
716 << " = " << load
.mds_load()
717 << " ~ " << l
<< dendl
;
719 if (whoami
== i
) my_load
= l
;
722 load_map
.insert(pair
<double,mds_rank_t
>( l
, i
));
726 target_load
= total_load
/ (double)cluster_size
;
727 dout(7) << "my load " << my_load
728 << " target " << target_load
729 << " total " << total_load
733 for (const auto& [load
, rank
] : load_map
) {
734 if (load
< target_load
* (1.0 + g_conf()->mds_bal_min_rebalance
)) {
735 dout(7) << " mds." << rank
<< " is underloaded or barely overloaded." << dendl
;
736 mds_last_epoch_under_map
[rank
] = beat_epoch
;
740 int last_epoch_under
= mds_last_epoch_under_map
[whoami
];
741 if (last_epoch_under
== beat_epoch
) {
742 dout(7) << " i am underloaded or barely overloaded, doing nothing." << dendl
;
745 // am i over long enough?
746 if (last_epoch_under
&& beat_epoch
- last_epoch_under
< 2) {
747 dout(7) << " i am overloaded, but only for " << (beat_epoch
- last_epoch_under
) << " epochs" << dendl
;
751 dout(7) << " i am sufficiently overloaded" << dendl
;
754 // first separate exporters and importers
755 multimap
<double,mds_rank_t
> importers
;
756 multimap
<double,mds_rank_t
> exporters
;
757 set
<mds_rank_t
> importer_set
;
758 set
<mds_rank_t
> exporter_set
;
760 for (multimap
<double,mds_rank_t
>::iterator it
= load_map
.begin();
761 it
!= load_map
.end();
763 if (it
->first
< target_load
) {
764 dout(15) << " mds." << it
->second
<< " is importer" << dendl
;
765 importers
.insert(pair
<double,mds_rank_t
>(it
->first
,it
->second
));
766 importer_set
.insert(it
->second
);
768 int mds_last_epoch_under
= mds_last_epoch_under_map
[it
->second
];
769 if (!(mds_last_epoch_under
&& beat_epoch
- mds_last_epoch_under
< 2)) {
770 dout(15) << " mds." << it
->second
<< " is exporter" << dendl
;
771 exporters
.insert(pair
<double,mds_rank_t
>(it
->first
,it
->second
));
772 exporter_set
.insert(it
->second
);
778 // determine load transfer mapping
781 // analyze import_map; do any matches i can
783 dout(15) << " matching exporters to import sources" << dendl
;
785 // big -> small exporters
786 for (multimap
<double,mds_rank_t
>::reverse_iterator ex
= exporters
.rbegin();
787 ex
!= exporters
.rend();
789 double maxex
= get_maxex(state
, ex
->second
);
790 if (maxex
<= .001) continue;
792 // check importers. for now, just in arbitrary order (no intelligent matching).
793 for (map
<mds_rank_t
, float>::iterator im
= mds_import_map
[ex
->second
].begin();
794 im
!= mds_import_map
[ex
->second
].end();
796 double maxim
= get_maxim(state
, im
->first
);
797 if (maxim
<= .001) continue;
798 try_match(state
, ex
->second
, maxex
, im
->first
, maxim
);
799 if (maxex
<= .001) break;
806 dout(15) << " matching big exporters to big importers" << dendl
;
807 // big exporters to big importers
808 multimap
<double,mds_rank_t
>::reverse_iterator ex
= exporters
.rbegin();
809 multimap
<double,mds_rank_t
>::iterator im
= importers
.begin();
810 while (ex
!= exporters
.rend() &&
811 im
!= importers
.end()) {
812 double maxex
= get_maxex(state
, ex
->second
);
813 double maxim
= get_maxim(state
, im
->second
);
814 if (maxex
< .001 || maxim
< .001) break;
815 try_match(state
, ex
->second
, maxex
, im
->second
, maxim
);
816 if (maxex
<= .001) ++ex
;
817 if (maxim
<= .001) ++im
;
820 dout(15) << " matching small exporters to big importers" << dendl
;
821 // small exporters to big importers
822 multimap
<double,mds_rank_t
>::iterator ex
= exporters
.begin();
823 multimap
<double,mds_rank_t
>::iterator im
= importers
.begin();
824 while (ex
!= exporters
.end() &&
825 im
!= importers
.end()) {
826 double maxex
= get_maxex(state
, ex
->second
);
827 double maxim
= get_maxim(state
, im
->second
);
828 if (maxex
< .001 || maxim
< .001) break;
829 try_match(state
, ex
->second
, maxex
, im
->second
, maxim
);
830 if (maxex
<= .001) ++ex
;
831 if (maxim
<= .001) ++im
;
835 try_rebalance(state
);
838 int MDBalancer::mantle_prep_rebalance()
840 balance_state_t state
;
842 /* refresh balancer if it has changed */
843 if (bal_version
!= mds
->mdsmap
->get_balancer()) {
844 bal_version
.assign("");
845 int r
= localize_balancer();
848 /* only spam the cluster log from 1 mds on version changes */
849 if (mds
->get_nodeid() == 0)
850 mds
->clog
->info() << "mantle balancer version changed: " << bal_version
;
853 /* prepare for balancing */
854 int cluster_size
= mds
->get_mds_map()->get_num_in_mds();
855 rebalance_time
= clock::now();
856 mds
->mdcache
->migrator
->clear_export_queue();
858 /* fill in the metrics for each mds by grabbing load struct */
859 vector
< map
<string
, double> > metrics (cluster_size
);
860 for (mds_rank_t i
=mds_rank_t(0); i
< mds_rank_t(cluster_size
); i
++) {
861 mds_load_t
& load
= mds_load
.at(i
);
863 metrics
[i
] = {{"auth.meta_load", load
.auth
.meta_load()},
864 {"all.meta_load", load
.all
.meta_load()},
865 {"req_rate", load
.req_rate
},
866 {"queue_len", load
.queue_len
},
867 {"cpu_load_avg", load
.cpu_load_avg
}};
870 /* execute the balancer */
872 int ret
= mantle
.balance(bal_code
, mds
->get_nodeid(), metrics
, state
.targets
);
873 dout(7) << " mantle decided that new targets=" << state
.targets
<< dendl
;
875 /* mantle doesn't know about cluster size, so check target len here */
876 if ((int) state
.targets
.size() != cluster_size
)
877 return -CEPHFS_EINVAL
;
881 try_rebalance(state
);
887 void MDBalancer::try_rebalance(balance_state_t
& state
)
889 if (g_conf()->mds_thrash_exports
) {
890 dout(5) << "mds_thrash is on; not performing standard rebalance operation!"
895 // make a sorted list of my imports
896 multimap
<double, CDir
*> import_pop_map
;
897 multimap
<mds_rank_t
, pair
<CDir
*, double> > import_from_map
;
899 for (auto& dir
: mds
->mdcache
->get_fullauth_subtrees()) {
900 CInode
*diri
= dir
->get_inode();
901 if (diri
->is_mdsdir())
903 if (diri
->get_export_pin(false) != MDS_RANK_NONE
)
905 if (dir
->is_freezing() || dir
->is_frozen())
906 continue; // export pbly already in progress
908 mds_rank_t from
= diri
->authority().first
;
909 double pop
= dir
->pop_auth_subtree
.meta_load();
910 if (g_conf()->mds_bal_idle_threshold
> 0 &&
911 pop
< g_conf()->mds_bal_idle_threshold
&&
912 diri
!= mds
->mdcache
->get_root() &&
913 from
!= mds
->get_nodeid()) {
914 dout(5) << " exporting idle (" << pop
<< ") import " << *dir
915 << " back to mds." << from
<< dendl
;
916 mds
->mdcache
->migrator
->export_dir_nicely(dir
, from
);
920 dout(15) << " map: i imported " << *dir
<< " from " << from
<< dendl
;
921 import_pop_map
.insert(make_pair(pop
, dir
));
922 import_from_map
.insert(make_pair(from
, make_pair(dir
, pop
)));
926 map
<mds_rank_t
, double> export_pop_map
;
928 for (auto &it
: state
.targets
) {
929 mds_rank_t target
= it
.first
;
930 double amount
= it
.second
;
932 if (amount
< MIN_OFFLOAD
)
934 if (amount
* 10 * state
.targets
.size() < target_load
)
937 dout(5) << "want to send " << amount
<< " to mds." << target
938 //<< " .. " << (*it).second << " * " << load_fac
940 << dendl
;//" .. fudge is " << fudge << dendl;
942 double& have
= export_pop_map
[target
];
944 mds
->mdcache
->show_subtrees();
946 // search imports from target
947 if (import_from_map
.count(target
)) {
948 dout(7) << " aha, looking through imports from target mds." << target
<< dendl
;
949 for (auto p
= import_from_map
.equal_range(target
);
950 p
.first
!= p
.second
; ) {
951 CDir
*dir
= p
.first
->second
.first
;
952 double pop
= p
.first
->second
.second
;
953 dout(7) << "considering " << *dir
<< " from " << (*p
.first
).first
<< dendl
;
954 auto plast
= p
.first
++;
956 if (dir
->inode
->is_base())
958 ceph_assert(dir
->inode
->authority().first
== target
); // cuz that's how i put it in the map, dummy
960 if (pop
<= amount
-have
) {
961 dout(7) << "reexporting " << *dir
<< " pop " << pop
962 << " back to mds." << target
<< dendl
;
963 mds
->mdcache
->migrator
->export_dir_nicely(dir
, target
);
965 import_from_map
.erase(plast
);
966 for (auto q
= import_pop_map
.equal_range(pop
);
967 q
.first
!= q
.second
; ) {
968 if (q
.first
->second
== dir
) {
969 import_pop_map
.erase(q
.first
);
975 dout(7) << "can't reexport " << *dir
<< ", too big " << pop
<< dendl
;
977 if (amount
-have
< MIN_OFFLOAD
)
984 for (auto &it
: state
.targets
) {
985 mds_rank_t target
= it
.first
;
986 double amount
= it
.second
;
988 if (!export_pop_map
.count(target
))
990 double& have
= export_pop_map
[target
];
991 if (amount
-have
< MIN_OFFLOAD
)
994 for (auto p
= import_pop_map
.begin();
995 p
!= import_pop_map
.end(); ) {
996 CDir
*dir
= p
->second
;
997 if (dir
->inode
->is_base()) {
1002 double pop
= p
->first
;
1003 if (pop
<= amount
-have
&& pop
> MIN_REEXPORT
) {
1004 dout(5) << "reexporting " << *dir
<< " pop " << pop
1005 << " to mds." << target
<< dendl
;
1007 mds
->mdcache
->migrator
->export_dir_nicely(dir
, target
);
1008 import_pop_map
.erase(p
++);
1012 if (amount
-have
< MIN_OFFLOAD
)
1017 set
<CDir
*> already_exporting
;
1019 for (auto &it
: state
.targets
) {
1020 mds_rank_t target
= it
.first
;
1021 double amount
= it
.second
;
1023 if (!export_pop_map
.count(target
))
1025 double& have
= export_pop_map
[target
];
1026 if (amount
-have
< MIN_OFFLOAD
)
1029 // okay, search for fragments of my workload
1030 std::vector
<CDir
*> exports
;
1032 for (auto p
= import_pop_map
.rbegin();
1033 p
!= import_pop_map
.rend();
1035 CDir
*dir
= p
->second
;
1036 find_exports(dir
, amount
, &exports
, have
, already_exporting
);
1037 if (amount
-have
< MIN_OFFLOAD
)
1040 //fudge = amount - have;
1042 for (const auto& dir
: exports
) {
1043 dout(5) << " - exporting " << dir
->pop_auth_subtree
1044 << " " << dir
->pop_auth_subtree
.meta_load()
1045 << " to mds." << target
<< " " << *dir
<< dendl
;
1046 mds
->mdcache
->migrator
->export_dir_nicely(dir
, target
);
1050 dout(7) << "done" << dendl
;
1051 mds
->mdcache
->show_subtrees();
1054 void MDBalancer::find_exports(CDir
*dir
,
1056 std::vector
<CDir
*>* exports
,
1058 set
<CDir
*>& already_exporting
)
1060 auto now
= clock::now();
1061 auto duration
= std::chrono::duration
<double>(now
-rebalance_time
).count();
1062 if (duration
> 0.1) {
1063 derr
<< " balancer runs too long" << dendl_impl
;
1068 ceph_assert(dir
->is_auth());
1070 double need
= amount
- have
;
1071 if (need
< amount
* g_conf()->mds_bal_min_start
)
1072 return; // good enough!
1074 double needmax
= need
* g_conf()->mds_bal_need_max
;
1075 double needmin
= need
* g_conf()->mds_bal_need_min
;
1076 double midchunk
= need
* g_conf()->mds_bal_midchunk
;
1077 double minchunk
= need
* g_conf()->mds_bal_minchunk
;
1079 std::vector
<CDir
*> bigger_rep
, bigger_unrep
;
1080 multimap
<double, CDir
*> smaller
;
1082 double dir_pop
= dir
->pop_auth_subtree
.meta_load();
1083 dout(7) << "in " << dir_pop
<< " " << *dir
<< " need " << need
<< " (" << needmin
<< " - " << needmax
<< ")" << dendl
;
1085 double subdir_sum
= 0;
1086 for (elist
<CInode
*>::iterator it
= dir
->pop_lru_subdirs
.begin_use_current();
1091 ceph_assert(in
->is_dir());
1092 ceph_assert(in
->get_parent_dir() == dir
);
1094 auto&& dfls
= in
->get_nested_dirfrags();
1096 size_t num_idle_frags
= 0;
1097 for (const auto& subdir
: dfls
) {
1098 if (already_exporting
.count(subdir
))
1101 // we know all ancestor dirfrags up to subtree root are not freezing or frozen.
1102 // It's more efficient to use CDir::is_{freezing,frozen}_tree_root()
1103 if (subdir
->is_frozen_dir() || subdir
->is_frozen_tree_root() ||
1104 subdir
->is_freezing_dir() || subdir
->is_freezing_tree_root())
1105 continue; // can't export this right now!
1108 double pop
= subdir
->pop_auth_subtree
.meta_load();
1110 dout(15) << " subdir pop " << pop
<< " " << *subdir
<< dendl
;
1112 if (pop
< minchunk
) {
1118 if (pop
> needmin
&& pop
< needmax
) {
1119 exports
->push_back(subdir
);
1120 already_exporting
.insert(subdir
);
1126 if (subdir
->is_rep())
1127 bigger_rep
.push_back(subdir
);
1129 bigger_unrep
.push_back(subdir
);
1131 smaller
.insert(pair
<double,CDir
*>(pop
, subdir
));
1133 if (dfls
.size() == num_idle_frags
)
1134 in
->item_pop_lru
.remove_myself();
1136 dout(15) << " sum " << subdir_sum
<< " / " << dir_pop
<< dendl
;
1138 // grab some sufficiently big small items
1139 multimap
<double,CDir
*>::reverse_iterator it
;
1140 for (it
= smaller
.rbegin();
1141 it
!= smaller
.rend();
1144 if ((*it
).first
< midchunk
)
1147 dout(7) << " taking smaller " << *(*it
).second
<< dendl
;
1149 exports
->push_back((*it
).second
);
1150 already_exporting
.insert((*it
).second
);
1151 have
+= (*it
).first
;
1156 // apprently not enough; drill deeper into the hierarchy (if non-replicated)
1157 for (const auto& dir
: bigger_unrep
) {
1158 dout(15) << " descending into " << *dir
<< dendl
;
1159 find_exports(dir
, amount
, exports
, have
, already_exporting
);
1164 // ok fine, use smaller bits
1166 it
!= smaller
.rend();
1168 dout(7) << " taking (much) smaller " << it
->first
<< " " << *(*it
).second
<< dendl
;
1170 exports
->push_back((*it
).second
);
1171 already_exporting
.insert((*it
).second
);
1172 have
+= (*it
).first
;
1177 // ok fine, drill into replicated dirs
1178 for (const auto& dir
: bigger_rep
) {
1179 dout(7) << " descending into replicated " << *dir
<< dendl
;
1180 find_exports(dir
, amount
, exports
, have
, already_exporting
);
1186 void MDBalancer::hit_inode(CInode
*in
, int type
, int who
)
1189 in
->pop
.get(type
).hit();
1191 if (in
->get_parent_dn())
1192 hit_dir(in
->get_parent_dn()->get_dir(), type
, who
);
1195 void MDBalancer::maybe_fragment(CDir
*dir
, bool hot
)
1198 if (bal_fragment_dirs
&& bal_fragment_interval
> 0 &&
1200 !dir
->inode
->is_base() && // not root/mdsdir (for now at least)
1201 !dir
->inode
->is_stray()) { // not straydir
1204 if (dir
->should_split() || hot
) {
1205 if (split_pending
.count(dir
->dirfrag()) == 0) {
1206 queue_split(dir
, false);
1208 if (dir
->should_split_fast()) {
1209 queue_split(dir
, true);
1211 dout(10) << ": fragment already enqueued to split: "
1218 if (dir
->get_frag() != frag_t() && dir
->should_merge() &&
1219 merge_pending
.count(dir
->dirfrag()) == 0) {
1225 void MDBalancer::hit_dir(CDir
*dir
, int type
, int who
, double amount
)
1227 if (dir
->inode
->is_stray())
1230 double v
= dir
->pop_me
.get(type
).hit(amount
);
1232 const bool hot
= (v
> g_conf()->mds_bal_split_rd
&& type
== META_POP_IRD
) ||
1233 (v
> g_conf()->mds_bal_split_wr
&& type
== META_POP_IWR
);
1235 dout(20) << type
<< " pop is " << v
<< ", frag " << dir
->get_frag()
1236 << " size " << dir
->get_frag_size() << " " << dir
->pop_me
<< dendl
;
1238 maybe_fragment(dir
, hot
);
1241 if (type
== META_POP_IRD
&& who
>= 0) {
1242 dir
->pop_spread
.hit(who
);
1245 double rd_adj
= 0.0;
1246 if (type
== META_POP_IRD
&&
1247 dir
->last_popularity_sample
< last_sample
) {
1248 double dir_pop
= dir
->pop_auth_subtree
.get(type
).get(); // hmm??
1249 dir
->last_popularity_sample
= last_sample
;
1250 double pop_sp
= dir
->pop_spread
.get();
1251 dir_pop
+= pop_sp
* 10;
1253 //if (dir->ino() == inodeno_t(0x10000000002))
1255 dout(20) << type
<< " pop " << dir_pop
<< " spread " << pop_sp
1256 << " " << dir
->pop_spread
.last
[0]
1257 << " " << dir
->pop_spread
.last
[1]
1258 << " " << dir
->pop_spread
.last
[2]
1259 << " " << dir
->pop_spread
.last
[3]
1260 << " in " << *dir
<< dendl
;
1263 if (dir
->is_auth() && !dir
->is_ambiguous_auth()) {
1264 if (dir
->can_rep() &&
1265 dir_pop
>= g_conf()->mds_bal_replicate_threshold
) {
1267 double rdp
= dir
->pop_me
.get(META_POP_IRD
).get();
1268 rd_adj
= rdp
/ mds
->get_mds_map()->get_num_in_mds() - rdp
;
1269 rd_adj
/= 2.0; // temper somewhat
1271 dout(5) << "replicating dir " << *dir
<< " pop " << dir_pop
<< " .. rdp " << rdp
<< " adj " << rd_adj
<< dendl
;
1273 dir
->dir_rep
= CDir::REP_ALL
;
1274 mds
->mdcache
->send_dir_updates(dir
, true);
1276 // fixme this should adjust the whole pop hierarchy
1277 dir
->pop_me
.get(META_POP_IRD
).adjust(rd_adj
);
1278 dir
->pop_auth_subtree
.get(META_POP_IRD
).adjust(rd_adj
);
1281 if (dir
->ino() != 1 &&
1283 dir_pop
< g_conf()->mds_bal_unreplicate_threshold
) {
1285 dout(5) << "unreplicating dir " << *dir
<< " pop " << dir_pop
<< dendl
;
1287 dir
->dir_rep
= CDir::REP_NONE
;
1288 mds
->mdcache
->send_dir_updates(dir
);
1294 bool hit_subtree
= dir
->is_auth(); // current auth subtree (if any)
1295 bool hit_subtree_nested
= dir
->is_auth(); // all nested auth subtrees
1298 CDir
*pdir
= dir
->inode
->get_parent_dir();
1299 dir
->pop_nested
.get(type
).hit(amount
);
1301 dir
->pop_nested
.get(META_POP_IRD
).adjust(rd_adj
);
1304 dir
->pop_auth_subtree
.get(type
).hit(amount
);
1307 dir
->pop_auth_subtree
.get(META_POP_IRD
).adjust(rd_adj
);
1309 if (dir
->is_subtree_root())
1310 hit_subtree
= false; // end of auth domain, stop hitting auth counters.
1312 pdir
->pop_lru_subdirs
.push_front(&dir
->get_inode()->item_pop_lru
);
1315 if (hit_subtree_nested
) {
1316 dir
->pop_auth_subtree_nested
.get(type
).hit(amount
);
1318 dir
->pop_auth_subtree_nested
.get(META_POP_IRD
).adjust(rd_adj
);
1327 * subtract off an exported chunk.
1328  * this excludes *dir itself (encode_export_dir should have taken care of that)
1329 * we _just_ do the parents' nested counters.
1331 * NOTE: call me _after_ forcing *dir into a subtree root,
1332 * but _before_ doing the encode_export_dirs.
1334 void MDBalancer::subtract_export(CDir
*dir
)
1336 dirfrag_load_vec_t subload
= dir
->pop_auth_subtree
;
1339 dir
= dir
->inode
->get_parent_dir();
1342 dir
->pop_nested
.sub(subload
);
1343 dir
->pop_auth_subtree_nested
.sub(subload
);
1348 void MDBalancer::add_import(CDir
*dir
)
1350 dirfrag_load_vec_t subload
= dir
->pop_auth_subtree
;
1353 dir
= dir
->inode
->get_parent_dir();
1356 dir
->pop_nested
.add(subload
);
1357 dir
->pop_auth_subtree_nested
.add(subload
);
1361 void MDBalancer::adjust_pop_for_rename(CDir
*pdir
, CDir
*dir
, bool inc
)
1363 bool adjust_subtree_nest
= dir
->is_auth();
1364 bool adjust_subtree
= adjust_subtree_nest
&& !dir
->is_subtree_root();
1368 pdir
->pop_nested
.add(dir
->pop_nested
);
1369 if (adjust_subtree
) {
1370 pdir
->pop_auth_subtree
.add(dir
->pop_auth_subtree
);
1371 pdir
->pop_lru_subdirs
.push_front(&cur
->get_inode()->item_pop_lru
);
1374 if (adjust_subtree_nest
)
1375 pdir
->pop_auth_subtree_nested
.add(dir
->pop_auth_subtree_nested
);
1377 pdir
->pop_nested
.sub(dir
->pop_nested
);
1379 pdir
->pop_auth_subtree
.sub(dir
->pop_auth_subtree
);
1381 if (adjust_subtree_nest
)
1382 pdir
->pop_auth_subtree_nested
.sub(dir
->pop_auth_subtree_nested
);
1385 if (pdir
->is_subtree_root())
1386 adjust_subtree
= false;
1388 pdir
= pdir
->inode
->get_parent_dir();
1393 void MDBalancer::handle_mds_failure(mds_rank_t who
)
1396 mds_last_epoch_under_map
.clear();
1400 int MDBalancer::dump_loads(Formatter
*f
) const
1402 std::deque
<CDir
*> dfs
;
1403 if (mds
->mdcache
->get_root()) {
1404 mds
->mdcache
->get_root()->get_dirfrags(dfs
);
1406 dout(10) << "no root" << dendl
;
1409 f
->open_object_section("loads");
1411 f
->open_array_section("dirfrags");
1412 while (!dfs
.empty()) {
1413 CDir
*dir
= dfs
.front();
1416 f
->open_object_section("dir");
1420 for (auto it
= dir
->begin(); it
!= dir
->end(); ++it
) {
1421 CInode
*in
= it
->second
->get_linkage()->get_inode();
1422 if (!in
|| !in
->is_dir())
1425 auto&& ls
= in
->get_dirfrags();
1426 for (const auto& subdir
: ls
) {
1427 if (subdir
->pop_nested
.meta_load() < .001)
1429 dfs
.push_back(subdir
);
1433 f
->close_section(); // dirfrags array
1435 f
->open_object_section("mds_load");
1438 auto dump_mds_load
= [f
](const mds_load_t
& load
) {
1439 f
->dump_float("request_rate", load
.req_rate
);
1440 f
->dump_float("cache_hit_rate", load
.cache_hit_rate
);
1441 f
->dump_float("queue_length", load
.queue_len
);
1442 f
->dump_float("cpu_load", load
.cpu_load_avg
);
1443 f
->dump_float("mds_load", load
.mds_load());
1445 f
->open_object_section("auth_dirfrags");
1448 f
->open_object_section("all_dirfrags");
1453 for (const auto& [rank
, load
] : mds_load
) {
1454 CachedStackStringStream css
;
1455 *css
<< "mds." << rank
;
1456 f
->open_object_section(css
->strv());
1457 dump_mds_load(load
);
1461 f
->close_section(); // mds_load
1463 f
->open_object_section("mds_meta_load");
1464 for (auto& [rank
, mload
] : mds_meta_load
) {
1465 CachedStackStringStream css
;
1466 *css
<< "mds." << rank
;
1467 f
->dump_float(css
->strv(), mload
);
1469 f
->close_section(); // mds_meta_load
1471 f
->open_object_section("mds_import_map");
1472 for (auto& [rank
, imports
] : mds_import_map
) {
1474 CachedStackStringStream css
;
1475 *css
<< "mds." << rank
;
1476 f
->open_array_section(css
->strv());
1478 for (auto& [rank_from
, mload
] : imports
) {
1479 f
->open_object_section("from");
1480 CachedStackStringStream css
;
1481 *css
<< "mds." << rank_from
;
1482 f
->dump_float(css
->strv(), mload
);
1485 f
->close_section(); // mds.? array
1487 f
->close_section(); // mds_import_map
1489 f
->close_section(); // loads