]>
Commit | Line | Data |
---|---|---|
7c673cae FG |
1 | // -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- |
2 | // vim: ts=8 sw=2 smarttab | |
3 | /* | |
4 | * Ceph - scalable distributed file system | |
5 | * | |
6 | * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net> | |
7 | * | |
8 | * This is free software; you can redistribute it and/or | |
9 | * modify it under the terms of the GNU Lesser General Public | |
10 | * License version 2.1, as published by the Free Software | |
11 | * Foundation. See file COPYING. | |
12 | * | |
13 | */ | |
14 | ||
15 | ||
16 | ||
17 | #ifndef CEPH_MDBALANCER_H | |
18 | #define CEPH_MDBALANCER_H | |
19 | ||
20 | #include <list> | |
21 | #include <map> | |
22 | using std::list; | |
23 | using std::map; | |
24 | ||
25 | #include "include/types.h" | |
26 | #include "common/Clock.h" | |
27 | #include "common/Cond.h" | |
28 | ||
29 | class MDSRank; | |
30 | class Message; | |
31 | class MHeartbeat; | |
32 | class CInode; | |
33 | class CDir; | |
34 | class Messenger; | |
35 | class MonClient; | |
36 | ||
37 | class MDBalancer { |
38 | friend class C_Bal_SendHeartbeat; |
39 | public: |
40 | MDBalancer(MDSRank *m, Messenger *msgr, MonClient *monc) : |
28e407b8 | 41 | mds(m), messenger(msgr), mon_client(monc) { } |
7c673cae FG |
42 | |
43 | int proc_message(Message *m); |
44 | ||
45 | /** |
46 | * Regularly called upkeep function. | |
47 | * | |
48 | * Sends MHeartbeat messages to the mons. | |
49 | */ | |
50 | void tick(); | |
51 | ||
52 | void subtract_export(CDir *ex, utime_t now); | |
53 | void add_import(CDir *im, utime_t now); | |
28e407b8 | 54 | void adjust_pop_for_rename(CDir *pdir, CDir *dir, utime_t now, bool inc); |
7c673cae | 55 | |
28e407b8 AA |
56 | void hit_inode(const utime_t& now, CInode *in, int type, int who=-1); |
57 | void hit_dir(const utime_t& now, CDir *dir, int type, int who=-1, double amount=1.0); | |
7c673cae FG |
58 | |
59 | void queue_split(const CDir *dir, bool fast); | |
60 | void queue_merge(CDir *dir); | |
61 | ||
62 | /** | |
63 | * Based on size and configuration, decide whether to issue a queue_split | |
64 | * or queue_merge for this CDir. | |
65 | * \param dir the CDir to evaluate for splitting or merging | |
66 | * \param hot whether the directory's temperature is enough to split it | |
67 | */ | |
68 | void maybe_fragment(CDir *dir, bool hot); | |
69 | ||
224ce89b WB |
70 | void handle_mds_failure(mds_rank_t who); |
71 | ||
28e407b8 AA |
72 | int dump_loads(Formatter *f); |
73 | ||
7c673cae FG |
74 | private: |
75 | typedef struct { | |
76 | std::map<mds_rank_t, double> targets; | |
77 | std::map<mds_rank_t, double> imported; | |
78 | std::map<mds_rank_t, double> exported; | |
79 | } balance_state_t; | |
80 | ||
81 | // Set up the rebalancing targets for export and do an export if the | |
82 | // MDSMap is up to date. | |
83 | void prep_rebalance(int beat); | |
84 | int mantle_prep_rebalance(); | |
85 | ||
86 | void handle_export_pins(void); | |
87 | ||
28e407b8 | 88 | mds_load_t get_load(utime_t now); |
7c673cae | 89 | int localize_balancer(); |
7c673cae FG |
90 | void send_heartbeat(); |
91 | void handle_heartbeat(MHeartbeat *m); | |
92 | void find_exports(CDir *dir, | |
93 | double amount, | |
94 | list<CDir*>& exports, | |
95 | double& have, | |
96 | set<CDir*>& already_exporting); | |
97 | ||
98 | double try_match(balance_state_t &state, | |
99 | mds_rank_t ex, double& maxex, | |
100 | mds_rank_t im, double& maxim); | |
101 | ||
102 | double get_maxim(balance_state_t &state, mds_rank_t im) { | |
103 | return target_load - mds_meta_load[im] - state.imported[im]; | |
104 | } | |
105 | double get_maxex(balance_state_t &state, mds_rank_t ex) { | |
106 | return mds_meta_load[ex] - target_load - state.exported[ex]; | |
107 | } | |
108 | ||
109 | /** | |
110 | * Try to rebalance. | |
111 | * | |
112 | * Check if the monitor has recorded the current export targets; | |
113 | * if it has then do the actual export. Otherwise send off our | |
114 | * export targets message again. | |
115 | */ | |
116 | void try_rebalance(balance_state_t& state); | |
117 | ||
118 | MDSRank *mds; | |
119 | Messenger *messenger; | |
120 | MonClient *mon_client; | |
28e407b8 | 121 | int beat_epoch = 0; |
7c673cae | 122 | |
7c673cae FG |
123 | string bal_code; |
124 | string bal_version; | |
125 | ||
126 | utime_t last_heartbeat; | |
127 | utime_t last_sample; | |
128 | utime_t rebalance_time; // ensures a consistent view of load for rebalance | |
129 | ||
28e407b8 AA |
130 | utime_t last_get_load; |
131 | uint64_t last_num_requests = 0; | |
132 | ||
7c673cae FG |
133 | // Dirfrags which are marked to be passed on to MDCache::[split|merge]_dir |
134 | // just as soon as a delayed context comes back and triggers it. | |
135 | // These sets just prevent us from spawning extra timer contexts for | |
136 | // dirfrags that already have one in flight. | |
137 | set<dirfrag_t> split_pending, merge_pending; | |
138 | ||
139 | // per-epoch scatter/gathered info | |
140 | map<mds_rank_t, mds_load_t> mds_load; | |
141 | map<mds_rank_t, double> mds_meta_load; | |
142 | map<mds_rank_t, map<mds_rank_t, float> > mds_import_map; | |
28e407b8 | 143 | map<mds_rank_t, int> mds_last_epoch_under_map; |
7c673cae FG |
144 | |
145 | // per-epoch state | |
28e407b8 AA |
146 | double my_load = 0; |
147 | double target_load = 0; | |
7c673cae FG |
148 | }; |
149 | ||
150 | #endif |