]>
Commit | Line | Data |
---|---|---|
1 | package PVE::HA::Manager; | |
2 | ||
3 | use strict; | |
4 | use warnings; | |
5 | use Digest::MD5 qw(md5_base64); | |
6 | ||
7 | use Data::Dumper; | |
8 | ||
9 | use PVE::HA::NodeStatus; | |
10 | ||
# Constructor.
#
# Arguments:
#   $this  - class name, or an existing object whose class is reused
#   $haenv - HA environment object; read_manager_status() and nodename()
#            are called on it here
#
# Reads the stored manager status, sets its 'master_node' entry to
# $haenv->nodename(), wraps the stored node status in a
# PVE::HA::NodeStatus object and returns the blessed manager.
sub new {
    my ($this, $haenv) = @_;

    my $class = ref($this) || $this;

    my $ms = $haenv->read_manager_status();
    $ms->{master_node} = $haenv->nodename();

    # fall back to empty hashes when nothing was stored yet
    my $stored_node_status = $ms->{node_status} || {};
    my $ns = PVE::HA::NodeStatus->new($haenv, $stored_node_status);

    # fixme: use separate class PVE::HA::ServiceStatus
    my $ss = $ms->{service_status} || {};

    my %fields = (
        haenv => $haenv,
        ms    => $ms,    # master status
        ns    => $ns,    # PVE::HA::NodeStatus
        ss    => $ss,    # service status
    );

    return bless \%fields, $class;
}
34 | ||
# Cleanup hook. Currently a placeholder that does nothing.
#
# Returns nothing (empty list / undef), instead of leaking the value of
# the argument-unpacking assignment as an implicit return value.
sub cleanup {
    my ($self) = @_;

    # todo: ?

    return;
}
40 | ||
# Copy the current node status and service status into the master
# status hash and hand it to the HA environment for writing.
#
# Returns whatever $haenv->write_manager_status() returns.
sub flush_master_status {
    my ($self) = @_;

    my $ms = $self->{ms};

    $ms->{node_status}    = $self->{ns}->{status};
    $ms->{service_status} = $self->{ss};

    return $self->{haenv}->write_manager_status($ms);
}
51 | ||
# Attention: must be idempotent (always return the same result for same input!)
#
# Pick the node a service should run on.
#
# Arguments:
#   $service_conf - service configuration hash; its 'node' entry is the
#                   preferred node
#
# Returns the preferred node if the node status reports it online,
# otherwise the first entry of the online node list (undef if that list
# is empty).
sub select_service_node {
    my ($self, $service_conf) = @_;

    my $ns = $self->{ns};

    my $pref_node = $service_conf->{node};

    return $pref_node if $ns->node_is_online($pref_node);

    my $online_nodes = $ns->list_online_nodes();

    # Read the first element instead of shift()ing it: the list belongs to
    # the node status object, and removing an entry from it would change
    # the result of later calls if that array is ever shared or cached.
    return $online_nodes->[0];
}
66 | ||
# Per-process counter mixed into every generated service uid, so that two
# state changes within the same second still get different uids.
my $uid_counter = 0;

# Set of service states accepted by change_service_state().
my $valid_service_states = {
    stopped      => 1,
    request_stop => 1,
    started      => 1,
    fence        => 1,
    move         => 1,
    migrate      => 1,
    error        => 1,
};

# change_service_state($self, $sid, $new_state, %params)
#
# Move service $sid to $new_state and store the extra %params in its
# status entry.
#
# Arguments:
#   $self      - manager object (uses its 'haenv' and 'ss' members)
#   $sid       - service id; must exist in the service status hash
#   $new_state - one of the keys of $valid_service_states
#   %params    - additional key/value pairs to store in the status entry
#
# Every call assigns a fresh 'uid' to the status entry, computed with
# md5_base64 from the new state, the process id, the current time and
# $uid_counter.
#
# Dies if the service is unknown, if the state would not change, or if
# $new_state is not a valid state.
#
# Returns whatever $haenv->log() returns.
my $change_service_state = sub {
    my ($self, $sid, $new_state, %params) = @_;

    my ($haenv, $ss) = ($self->{haenv}, $self->{ss});

    my $sd = $ss->{$sid} || die "no such service '$sid'";

    my $old_state = $sd->{state};

    die "no state change" if $old_state eq $new_state; # just to be sure

    die "invalid CRM service state '$new_state'\n" if !$valid_service_states->{$new_state};

    # Walk the parameters in sorted order so the logged change list is
    # the same on every run (hash key order is not stable).
    my @changed;
    foreach my $k (sort keys %params) {
        my $v = $params{$k};
        next if defined($sd->{$k}) && $sd->{$k} eq $v;
        push @changed, "$k = $v";
        $sd->{$k} = $v;
    }

    $sd->{state} = $new_state;
    $uid_counter++;
    $sd->{uid} = md5_base64($new_state . $$ . time() . $uid_counter);

    # fixme: cleanup state (remove unused values)

    my $changes = @changed ? ' (' . join(', ', @changed) . ')' : '';
    $haenv->log('info', "service '$sid': state changed to '$new_state' $changes\n");
};
110 | ||
# read LRM status for all active nodes
#
# Asks the HA environment for the LRM status of every node in the online
# node list and merges the per-node hashes into a single hash keyed by
# uid. Returns a reference to that merged hash.
sub read_lrm_status {
    my ($self) = @_;

    my $online = $self->{ns}->list_online_nodes();
    my $haenv = $self->{haenv};

    my %merged;

    for my $node (@$online) {
        my $node_result = $haenv->read_lrm_status($node);
        for my $uid (keys %$node_result) {
            # an entry that is already set is kept; a second entry for
            # the same uid should not happen
            $merged{$uid} ||= $node_result->{$uid};
        }
    }

    return \%merged;
}
130 | ||
# Run one round of the manager state machine.
#
# Steps, in order:
#   1. update the node status from $haenv->get_node_info(); log and
#      return early if the local node is not reported online
#   2. read the LRM status and the service configuration
#   3. add a status entry (state 'started') for every configured service
#      that has none yet
#   4. repeatedly compute the next state of every service and handle
#      fencing, until a pass over the services changes no state
#   5. write the master status via flush_master_status()
#
# Services are visited in sorted order of their id, so that log output,
# uid assignment and fencing order do not depend on hash key order.
#
# Dies on a service in state 'migrate' (not implemented) or in an
# unknown state.
sub manage {
    my ($self) = @_;

    my ($haenv, $ns, $ss) = ($self->{haenv}, $self->{ns}, $self->{ss});

    $ns->update($haenv->get_node_info());

    if (!$ns->node_is_online($haenv->nodename())) {
        $haenv->log('info', "master seems offline\n");
        return;
    }

    my $lrm_status = $self->read_lrm_status();

    my $sc = $haenv->read_service_config();

    # compute new service status

    # add new service
    foreach my $sid (sort keys %$sc) {
        next if $ss->{$sid}; # already there
        $haenv->log('info', "Adding new service '$sid'\n");
        # assume we are running to avoid relocate running service at add
        $ss->{$sid} = { state => 'started', node => $sc->{$sid}->{node} };
    }

    while (1) {
        my $repeat = 0;

        foreach my $sid (sort keys %$ss) {
            my $sd = $ss->{$sid};
            # a service without configuration entry is treated as disabled
            my $cd = $sc->{$sid} || { state => 'disabled' };

            my $lrm_res = $sd->{uid} ? $lrm_status->{$sd->{uid}} : undef;

            my $last_state = $sd->{state};

            if ($last_state eq 'stopped') {

                $self->next_state_stopped($sid, $cd, $sd);

            } elsif ($last_state eq 'started') {

                $self->next_state_started($sid, $cd, $sd);

            } elsif ($last_state eq 'migrate') {

                die "implement me";

            } elsif ($last_state eq 'move') {

                #die "implement me";

            } elsif ($last_state eq 'fence') {

                # do nothing here - wait until fenced

            } elsif ($last_state eq 'request_stop') {

                # check result from LRM daemon
                if ($lrm_res) {
                    my $exit_code = $lrm_res->{exit_code};
                    if ($exit_code == 0) {
                        $change_service_state->($self, $sid, 'stopped');
                    } else {
                        $change_service_state->($self, $sid, 'error'); # fixme: what state?
                    }
                }

            } else {

                die "unknown service state '$last_state'";
            }

            # a state change may enable further transitions - do another pass
            $repeat = 1 if $sd->{state} ne $last_state;
        }

        # handle fencing
        # (state changes made here do not set $repeat)
        my $fenced_nodes = {}; # node => result of fence_node(), one call per node and pass
        foreach my $sid (sort keys %$ss) {
            my $sd = $ss->{$sid};
            next if $sd->{state} ne 'fence';

            if (!defined($fenced_nodes->{$sd->{node}})) {
                $fenced_nodes->{$sd->{node}} = $ns->fence_node($sd->{node}) || 0;
            }

            next if !$fenced_nodes->{$sd->{node}};

            # node fence was successful - mark service as stopped
            $change_service_state->($self, $sid, 'stopped');
        }

        last if !$repeat;
    }

    # remove stale services
    # fixme:

    $self->flush_master_status();
}
232 | ||
233 | # functions to compute next service states | |
234 | # $cd: service configuration data (read only) | |
235 | # $sd: service status data (read only) | |
236 | # | |
237 | # Note: use change_service_state() to alter state | |
238 | # | |
239 | ||
# Compute the next state for a service that is currently 'stopped'.
#
# Arguments:
#   $sid - service id
#   $cd  - service configuration data (read only)
#   $sd  - service status data (read only)
#
# Only a configuration state of 'enabled' leads to an action: a node is
# chosen with select_service_node(); if one is found, the service
# location is changed when it differs from the recorded node, and the
# service is switched to 'started' on that node.
sub next_state_stopped {
    my ($self, $sid, $cd, $sd) = @_;

    my $haenv = $self->{haenv};

    my $wanted = $cd->{state};

    if ($wanted eq 'disabled') {
        # do nothing
    } elsif ($wanted eq 'enabled') {
        my $target = $self->select_service_node($cd);
        if ($target) {
            $haenv->change_service_location($sid, $target)
                if $sd->{node} ne $target;
            $change_service_state->($self, $sid, 'started', node => $target);
        } else {
            # fixme: warn
        }
    } else {
        # do nothing - todo: log something?
    }
}
260 | ||
# Compute the next state for a service that is currently 'started'.
#
# Arguments:
#   $sid - service id
#   $cd  - service configuration data (read only)
#   $sd  - service status data (read only)
#
# Transitions:
#   - recorded node not online                  -> 'fence'
#   - configuration state 'disabled'            -> 'request_stop'
#   - configuration state 'enabled' and
#     select_service_node() names another node  -> 'migrate'
#   - anything else                             -> no change
sub next_state_started {
    my ($self, $sid, $cd, $sd) = @_;

    my $node_status = $self->{ns};
    my $wanted = $cd->{state};

    if (!$node_status->node_is_online($sd->{node})) {

        $change_service_state->($self, $sid, 'fence');

    } elsif ($wanted eq 'disabled') {

        $change_service_state->($self, $sid, 'request_stop');

    } elsif ($wanted eq 'enabled') {

        my $target = $self->select_service_node($cd);
        if ($target && ($sd->{node} ne $target)) {
            $change_service_state->($self, $sid, 'migrate');
        } else {
            # do nothing
        }

    } else {
        # do nothing - todo: log something?
    }
}
287 | ||
288 | 1; |