]> git.proxmox.com Git - ceph.git/blame - ceph/src/mds/MDBalancer.h
Import ceph 15.2.8
[ceph.git] / ceph / src / mds / MDBalancer.h
CommitLineData
7c673cae
FG
1// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
2// vim: ts=8 sw=2 smarttab
3/*
4 * Ceph - scalable distributed file system
5 *
6 * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
7 *
8 * This is free software; you can redistribute it and/or
9 * modify it under the terms of the GNU Lesser General Public
10 * License version 2.1, as published by the Free Software
11 * Foundation. See file COPYING.
12 *
13 */
7c673cae
FG
14#ifndef CEPH_MDBALANCER_H
15#define CEPH_MDBALANCER_H
16
7c673cae
FG
17#include "include/types.h"
18#include "common/Clock.h"
19#include "common/Cond.h"
20
11fdf7f2
TL
21#include "msg/Message.h"
22#include "messages/MHeartbeat.h"
23
24#include "MDSMap.h"
25
7c673cae 26class MDSRank;
7c673cae
FG
27class MHeartbeat;
28class CInode;
29class CDir;
30class Messenger;
31class MonClient;
32
33class MDBalancer {
7c673cae 34public:
11fdf7f2
TL
35 using clock = ceph::coarse_mono_clock;
36 using time = ceph::coarse_mono_time;
37 friend class C_Bal_SendHeartbeat;
38
91327a77
AA
39 MDBalancer(MDSRank *m, Messenger *msgr, MonClient *monc);
40
92f5a8d4 41 void handle_conf_change(const std::set<std::string>& changed, const MDSMap& mds_map);
7c673cae 42
9f95a23c 43 int proc_message(const cref_t<Message> &m);
7c673cae
FG
44
45 /**
46 * Regularly called upkeep function.
47 *
48 * Sends MHeartbeat messages to the mons.
49 */
50 void tick();
51
f6b5b4d7
TL
52 void handle_export_pins(void);
53
11fdf7f2
TL
54 void subtract_export(CDir *ex);
55 void add_import(CDir *im);
56 void adjust_pop_for_rename(CDir *pdir, CDir *dir, bool inc);
7c673cae 57
11fdf7f2
TL
58 void hit_inode(CInode *in, int type, int who=-1);
59 void hit_dir(CDir *dir, int type, int who=-1, double amount=1.0);
7c673cae
FG
60
61 void queue_split(const CDir *dir, bool fast);
62 void queue_merge(CDir *dir);
63
64 /**
65 * Based on size and configuration, decide whether to issue a queue_split
66 * or queue_merge for this CDir.
67 *
68 * \param hot whether the directory's temperature is enough to split it
69 */
70 void maybe_fragment(CDir *dir, bool hot);
71
224ce89b
WB
72 void handle_mds_failure(mds_rank_t who);
73
9f95a23c 74 int dump_loads(Formatter *f) const;
28e407b8 75
7c673cae
FG
76private:
77 typedef struct {
78 std::map<mds_rank_t, double> targets;
79 std::map<mds_rank_t, double> imported;
80 std::map<mds_rank_t, double> exported;
81 } balance_state_t;
82
83 //set up the rebalancing targets for export and do one if the
84 //MDSMap is up to date
85 void prep_rebalance(int beat);
86 int mantle_prep_rebalance();
87
11fdf7f2 88 mds_load_t get_load();
7c673cae 89 int localize_balancer();
7c673cae 90 void send_heartbeat();
9f95a23c 91 void handle_heartbeat(const cref_t<MHeartbeat> &m);
7c673cae
FG
92 void find_exports(CDir *dir,
93 double amount,
9f95a23c 94 std::vector<CDir*>* exports,
7c673cae
FG
95 double& have,
96 set<CDir*>& already_exporting);
97
98 double try_match(balance_state_t &state,
99 mds_rank_t ex, double& maxex,
100 mds_rank_t im, double& maxim);
101
102 double get_maxim(balance_state_t &state, mds_rank_t im) {
103 return target_load - mds_meta_load[im] - state.imported[im];
104 }
105 double get_maxex(balance_state_t &state, mds_rank_t ex) {
106 return mds_meta_load[ex] - target_load - state.exported[ex];
107 }
108
109 /**
110 * Try to rebalance.
111 *
112 * Check if the monitor has recorded the current export targets;
113 * if it has then do the actual export. Otherwise send off our
114 * export targets message again.
115 */
116 void try_rebalance(balance_state_t& state);
117
9f95a23c
TL
118 bool bal_fragment_dirs;
119 int64_t bal_fragment_interval;
120 static const unsigned int AUTH_TREES_THRESHOLD = 5;
121
7c673cae
FG
122 MDSRank *mds;
123 Messenger *messenger;
124 MonClient *mon_client;
28e407b8 125 int beat_epoch = 0;
7c673cae 126
7c673cae
FG
127 string bal_code;
128 string bal_version;
129
11fdf7f2
TL
130 time last_heartbeat = clock::zero();
131 time last_sample = clock::zero();
132 time rebalance_time = clock::zero(); //ensure a consistent view of load for rebalance
7c673cae 133
11fdf7f2 134 time last_get_load = clock::zero();
28e407b8 135 uint64_t last_num_requests = 0;
91327a77 136 uint64_t last_cpu_time = 0;
28e407b8 137
7c673cae
FG
138 // Dirfrags which are marked to be passed on to MDCache::[split|merge]_dir
139 // just as soon as a delayed context comes back and triggers it.
140 // These sets just prevent us from spawning extra timer contexts for
141 // dirfrags that already have one in flight.
9f95a23c 142 set<dirfrag_t> split_pending, merge_pending;
7c673cae
FG
143
144 // per-epoch scatter/gathered info
9f95a23c
TL
145 std::map<mds_rank_t, mds_load_t> mds_load;
146 std::map<mds_rank_t, double> mds_meta_load;
11fdf7f2
TL
147 std::map<mds_rank_t, map<mds_rank_t, float> > mds_import_map;
148 std::map<mds_rank_t, int> mds_last_epoch_under_map;
7c673cae
FG
149
150 // per-epoch state
28e407b8
AA
151 double my_load = 0;
152 double target_load = 0;
7c673cae 153};
7c673cae 154#endif