]>
Commit | Line | Data |
---|---|---|
7c673cae FG |
1 | // -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- |
2 | // vim: ts=8 sw=2 smarttab | |
3 | /* | |
4 | * Ceph - scalable distributed file system | |
5 | * | |
6 | * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net> | |
7 | * | |
8 | * This is free software; you can redistribute it and/or | |
9 | * modify it under the terms of the GNU Lesser General Public | |
10 | * License version 2.1, as published by the Free Software | |
11 | * Foundation. See file COPYING. | |
12 | * | |
13 | */ | |
7c673cae FG |
14 | #ifndef CEPH_MDBALANCER_H |
15 | #define CEPH_MDBALANCER_H | |
16 | ||
7c673cae FG |
17 | #include "include/types.h" |
18 | #include "common/Clock.h" | |
19 | #include "common/Cond.h" | |
20 | ||
11fdf7f2 TL |
21 | #include "msg/Message.h" |
22 | #include "messages/MHeartbeat.h" | |
23 | ||
24 | #include "MDSMap.h" | |
25 | ||
7c673cae | 26 | class MDSRank; |
7c673cae FG |
27 | class MHeartbeat; |
28 | class CInode; | |
29 | class CDir; | |
30 | class Messenger; | |
31 | class MonClient; | |
32 | ||
33 | class MDBalancer { | |
7c673cae | 34 | public: |
11fdf7f2 TL |
35 | using clock = ceph::coarse_mono_clock; |
36 | using time = ceph::coarse_mono_time; | |
37 | friend class C_Bal_SendHeartbeat; | |
38 | ||
91327a77 AA |
39 | MDBalancer(MDSRank *m, Messenger *msgr, MonClient *monc); |
40 | ||
92f5a8d4 | 41 | void handle_conf_change(const std::set<std::string>& changed, const MDSMap& mds_map); |
7c673cae | 42 | |
9f95a23c | 43 | int proc_message(const cref_t<Message> &m); |
7c673cae FG |
44 | |
45 | /** | |
46 | * Regularly called upkeep function. | |
47 | * | |
48 | * Sends MHeartbeat messages to the mons. | |
49 | */ | |
50 | void tick(); | |
51 | ||
f6b5b4d7 TL |
52 | void handle_export_pins(void); |
53 | ||
11fdf7f2 TL |
54 | void subtract_export(CDir *ex); |
55 | void add_import(CDir *im); | |
56 | void adjust_pop_for_rename(CDir *pdir, CDir *dir, bool inc); | |
7c673cae | 57 | |
11fdf7f2 TL |
58 | void hit_inode(CInode *in, int type, int who=-1); |
59 | void hit_dir(CDir *dir, int type, int who=-1, double amount=1.0); | |
7c673cae FG |
60 | |
61 | void queue_split(const CDir *dir, bool fast); | |
62 | void queue_merge(CDir *dir); | |
63 | ||
64 | /** | |
65 | * Based on size and configuration, decide whether to issue a queue_split | |
66 | * or queue_merge for this CDir. | |
67 | * | |
68 | * \param hot whether the directory's temperature is enough to split it | |
69 | */ | |
70 | void maybe_fragment(CDir *dir, bool hot); | |
71 | ||
224ce89b WB |
72 | void handle_mds_failure(mds_rank_t who); |
73 | ||
9f95a23c | 74 | int dump_loads(Formatter *f) const; |
28e407b8 | 75 | |
7c673cae FG |
76 | private: |
77 | typedef struct { | |
78 | std::map<mds_rank_t, double> targets; | |
79 | std::map<mds_rank_t, double> imported; | |
80 | std::map<mds_rank_t, double> exported; | |
81 | } balance_state_t; | |
82 | ||
83 | //set up the rebalancing targets for export and do one if the | |
84 | //MDSMap is up to date | |
85 | void prep_rebalance(int beat); | |
86 | int mantle_prep_rebalance(); | |
87 | ||
11fdf7f2 | 88 | mds_load_t get_load(); |
7c673cae | 89 | int localize_balancer(); |
7c673cae | 90 | void send_heartbeat(); |
9f95a23c | 91 | void handle_heartbeat(const cref_t<MHeartbeat> &m); |
7c673cae FG |
92 | void find_exports(CDir *dir, |
93 | double amount, | |
9f95a23c | 94 | std::vector<CDir*>* exports, |
7c673cae FG |
95 | double& have, |
96 | set<CDir*>& already_exporting); | |
97 | ||
98 | double try_match(balance_state_t &state, | |
99 | mds_rank_t ex, double& maxex, | |
100 | mds_rank_t im, double& maxim); | |
101 | ||
102 | double get_maxim(balance_state_t &state, mds_rank_t im) { | |
103 | return target_load - mds_meta_load[im] - state.imported[im]; | |
104 | } | |
105 | double get_maxex(balance_state_t &state, mds_rank_t ex) { | |
106 | return mds_meta_load[ex] - target_load - state.exported[ex]; | |
107 | } | |
108 | ||
109 | /** | |
110 | * Try to rebalance. | |
111 | * | |
112 | * Check if the monitor has recorded the current export targets; | |
113 | * if it has then do the actual export. Otherwise send off our | |
114 | * export targets message again. | |
115 | */ | |
116 | void try_rebalance(balance_state_t& state); | |
117 | ||
9f95a23c TL |
118 | bool bal_fragment_dirs; |
119 | int64_t bal_fragment_interval; | |
120 | static const unsigned int AUTH_TREES_THRESHOLD = 5; | |
121 | ||
7c673cae FG |
122 | MDSRank *mds; |
123 | Messenger *messenger; | |
124 | MonClient *mon_client; | |
28e407b8 | 125 | int beat_epoch = 0; |
7c673cae | 126 | |
7c673cae FG |
127 | string bal_code; |
128 | string bal_version; | |
129 | ||
11fdf7f2 TL |
130 | time last_heartbeat = clock::zero(); |
131 | time last_sample = clock::zero(); | |
132 | time rebalance_time = clock::zero(); //ensure a consistent view of load for rebalance | |
7c673cae | 133 | |
11fdf7f2 | 134 | time last_get_load = clock::zero(); |
28e407b8 | 135 | uint64_t last_num_requests = 0; |
91327a77 | 136 | uint64_t last_cpu_time = 0; |
28e407b8 | 137 | |
7c673cae FG |
138 | // Dirfrags which are marked to be passed on to MDCache::[split|merge]_dir |
139 | // just as soon as a delayed context comes back and triggers it. | |
140 | // These sets just prevent us from spawning extra timer contexts for | |
141 | // dirfrags that already have one in flight. | |
9f95a23c | 142 | set<dirfrag_t> split_pending, merge_pending; |
7c673cae FG |
143 | |
144 | // per-epoch scatter/gathered info | |
9f95a23c TL |
145 | std::map<mds_rank_t, mds_load_t> mds_load; |
146 | std::map<mds_rank_t, double> mds_meta_load; | |
11fdf7f2 TL |
147 | std::map<mds_rank_t, map<mds_rank_t, float> > mds_import_map; |
148 | std::map<mds_rank_t, int> mds_last_epoch_under_map; | |
7c673cae FG |
149 | |
150 | // per-epoch state | |
28e407b8 AA |
151 | double my_load = 0; |
152 | double target_load = 0; | |
7c673cae | 153 | }; |
7c673cae | 154 | #endif |