]>
Commit | Line | Data |
---|---|---|
c0bbd038 DM |
1 | package PVE::HA::Manager; |
2 | ||
3 | use strict; | |
4 | use warnings; | |
c4a221bc | 5 | use Digest::MD5 qw(md5_base64); |
c0bbd038 DM |
6 | |
7 | use Data::Dumper; | |
8 | ||
9 | use PVE::HA::NodeStatus; | |
10 | ||
# Constructor.
#
# $haenv is the HA environment object; it is asked for the persisted manager
# status and for the local node name. The local node is recorded as
# 'master_node' in the manager status.
#
# Returns a blessed object holding:
#   haenv - the environment passed in
#   ms    - master (manager) status hash
#   ns    - PVE::HA::NodeStatus object built from the stored node status
#   ss    - service status hash (fixme: use separate class PVE::HA::ServiceStatus)
sub new {
    my ($this, $haenv) = @_;

    # allow both Class->new(...) and $obj->new(...)
    my $class = ref($this) || $this;

    my $ms = $haenv->read_manager_status();
    $ms->{master_node} = $haenv->nodename();

    my $members = {
        haenv => $haenv,
        ms => $ms,
        ns => PVE::HA::NodeStatus->new($haenv, $ms->{node_status} || {}),
        ss => $ms->{service_status} || {},
    };

    return bless $members, $class;
}
34 | ||
d84da043 DM |
# Hook for releasing manager resources.
# Currently a placeholder: nothing is cleaned up yet.
sub cleanup {
    my ($self) = @_;

    # todo: decide what (if anything) needs to be released here
}
40 | ||
# Copy the current node and service status into the master status hash and
# hand it to the environment for writing.
sub flush_master_status {
    my ($self) = @_;

    my $ns = $self->{ns};
    my $ms = $self->{ms};

    $ms->{node_status} = $ns->{status};
    $ms->{service_status} = $self->{ss};

    $self->{haenv}->write_manager_status($ms);
}
c0bbd038 | 51 | |
# Pick the node a service should run on.
#
# $service_conf is the configuration entry of the service; its 'node' value
# is the preferred node. The preferred node is returned when it is online,
# otherwise the first entry of the online node list (undef when no node is
# online).
#
# Attention: must be idempotent (always return the same result for the same
# input!)
sub select_service_node {
    my ($self, $service_conf) = @_;

    my $ns = $self->{ns};

    my $pref_node = $service_conf->{node};

    return $pref_node if $ns->node_is_online($pref_node);

    my $online_nodes = $ns->list_online_nodes();

    # Read the first entry instead of shift()ing it: shift would modify the
    # array handed out by the node status object, so a second call on the
    # same list could yield a different node.
    return $online_nodes->[0];
}
66 | ||
c4a221bc DM |
# Counter mixed into every generated uid, so that several state changes made
# by the same process within the same second still get different uids.
my $uid_counter = 0;

# Set of service states the CRM is allowed to assign.
my $valid_service_states = {
    stopped => 1,
    request_stop => 1,
    started => 1,
    fence => 1,
    migrate => 1,
    relocate => 1,
    error => 1,
};

# Switch service $sid to $new_state.
#
# The service status record is emptied in place (callers keep references to
# it) and rebuilt from: the new state, the previous node, every key/value
# pair in %params (which may override 'node'), and a freshly generated uid.
# The change is written to the log at level 'info'.
#
# Dies if the service is unknown, if the state would not change, or if
# $new_state is not a valid CRM service state.
my $change_service_state = sub {
    my ($self, $sid, $new_state, %params) = @_;

    my ($haenv, $ss) = ($self->{haenv}, $self->{ss});

    my $sd = $ss->{$sid} || die "no such service '$sid'";

    my $old_state = $sd->{state};
    my $old_node = $sd->{node};

    die "no state change" if $old_state eq $new_state; # just to be sure

    die "invalid CRM service state '$new_state'\n" if !$valid_service_states->{$new_state};

    # drop everything belonging to the old state, but keep the same hash
    %$sd = ();

    $sd->{state} = $new_state;
    $sd->{node} = $old_node;

    # sorted, so that the log line does not depend on hash ordering
    my @text_parts;
    foreach my $k (sort keys %params) {
        my $v = $params{$k};
        push @text_parts, "$k = $v";
        $sd->{$k} = $v;
    }

    $uid_counter++;
    $sd->{uid} = md5_base64($new_state . $$ . time() . $uid_counter);

    my $text_state = @text_parts ? " (" . join(", ", @text_parts) . ")" : '';
    $haenv->log('info', "service '$sid': state changed from '${old_state}' to '${new_state}' $text_state\n");
};
112 | ||
# Gather the LRM status of every node that is currently online.
#
# Returns a hash reference keyed by uid. The per-node results are merged into
# one hash; if the same uid shows up more than once, the entry that was seen
# first is kept.
sub read_lrm_status {
    my ($self) = @_;

    my $haenv = $self->{haenv};
    my $online = $self->{ns}->list_online_nodes();

    my $merged = {};

    for my $nodename (@$online) {
        my $node_result = $haenv->read_lrm_status($nodename);
        for my $uid (keys %$node_result) {
            # duplicates should not happen - first one wins
            $merged->{$uid} ||= $node_result->{$uid};
        }
    }

    return $merged;
}
132 | ||
aa98a844 DM |
# Fetch pending CRM commands from the environment and attach the accepted
# ones to the service status.
#
# Each line has the form "<migrate|relocate> <sid> <node>". A command is
# stored as [ $task, $node ] under the 'cmd' key of the service status
# record. Lines that cannot be parsed, name an unknown service, target an
# offline node, or target the node the service is already on are only logged.
sub update_crm_commands {
    my ($self) = @_;

    my ($haenv, $ns, $ss) = @$self{qw(haenv ns ss)};

    my $cmdlist = $haenv->read_crm_commands();

    for my $cmd (split(/\n/, $cmdlist)) {
        chomp $cmd;

        my ($task, $sid, $node) = $cmd =~ m/^(migrate|relocate)\s+(\S+)\s+(\S+)$/;

        if (!defined($task)) {
            $haenv->log('err', "unable to parse crm command: $cmd");
            next;
        }

        my $sd = $ss->{$sid};
        if (!$sd) {
            $haenv->log('err', "crm command error - no such service: $cmd");
            next;
        }

        if (!$ns->node_is_online($node)) {
            $haenv->log('err', "crm command error - node not online: $cmd");
            next;
        }

        if ($node eq $sd->{node}) {
            $haenv->log('info', "ignore crm command - service already on target node: $cmd");
            next;
        }

        $haenv->log('info', "got crm command: $cmd");
        $sd->{cmd} = [ $task, $node];
    }
}
167 | ||
8f0bb968 DM |
# One round of the manager state machine.
#
# Steps:
#   1. refresh the node status; bail out if the local (master) node is not
#      online
#   2. read the LRM results and the service configuration
#   3. register services that are configured but not yet tracked
#   4. pick up new CRM commands
#   5. advance every service according to its current state, then try to
#      fence the nodes of all services waiting in state 'fence'; repeat until
#      a pass changes no service state in the per-service step
#   6. write the resulting master status
#
# Services are always walked in sorted order, so that log output and the
# order of fence attempts do not depend on hash ordering.
sub manage {
    my ($self) = @_;

    my ($haenv, $ns, $ss) = ($self->{haenv}, $self->{ns}, $self->{ss});

    $ns->update($haenv->get_node_info());

    if (!$ns->node_is_online($haenv->nodename())) {
        $haenv->log('info', "master seems offline\n");
        return;
    }

    my $lrm_status = $self->read_lrm_status();

    my $sc = $haenv->read_service_config();

    # compute new service status

    # add new service
    foreach my $sid (sort keys %$sc) {
        next if $ss->{$sid}; # already there
        $haenv->log('info', "Adding new service '$sid'\n");
        # assume we are running to avoid relocate running service at add
        $ss->{$sid} = { state => 'started', node => $sc->{$sid}->{node}};
    }

    $self->update_crm_commands();

    for (;;) {
        my $repeat = 0;

        foreach my $sid (sort keys %$ss) {
            my $sd = $ss->{$sid};
            # services missing from the configuration are handled as disabled
            my $cd = $sc->{$sid} || { state => 'disabled' };

            # LRM result for the current state of this service, if any
            my $lrm_res = $sd->{uid} ? $lrm_status->{$sd->{uid}} : undef;

            my $last_state = $sd->{state};

            if ($last_state eq 'stopped') {

                $self->next_state_stopped($sid, $cd, $sd);

            } elsif ($last_state eq 'started') {

                $self->next_state_started($sid, $cd, $sd);

            } elsif ($last_state eq 'migrate' || $last_state eq 'relocate') {

                # check result from LRM daemon
                if ($lrm_res) {
                    my $exit_code = $lrm_res->{exit_code};
                    if ($exit_code == 0) {
                        $change_service_state->($self, $sid, 'started', node => $sd->{target});
                    } else {
                        $haenv->log('err', "service '$sid' - migration failed (exit code $exit_code)");
                        $change_service_state->($self, $sid, 'started', node => $sd->{node});
                    }
                }

            } elsif ($last_state eq 'fence') {

                # do nothing here - wait until fenced

            } elsif ($last_state eq 'request_stop') {

                # check result from LRM daemon
                if ($lrm_res) {
                    my $exit_code = $lrm_res->{exit_code};
                    if ($exit_code == 0) {
                        $change_service_state->($self, $sid, 'stopped');
                    } else {
                        $change_service_state->($self, $sid, 'error'); # fixme: what state?
                    }
                }

            } elsif ($last_state eq 'error') {

                # fixme:

            } else {

                die "unknown service state '$last_state'";
            }

            $repeat = 1 if $sd->{state} ne $last_state;
        }

        # handle fencing
        # cache the fence result per node, so each node is asked only once
        # per pass
        my $fenced_nodes = {};
        foreach my $sid (sort keys %$ss) {
            my $sd = $ss->{$sid};
            next if $sd->{state} ne 'fence';

            if (!defined($fenced_nodes->{$sd->{node}})) {
                $fenced_nodes->{$sd->{node}} = $ns->fence_node($sd->{node}) || 0;
            }

            next if !$fenced_nodes->{$sd->{node}};

            # node fence was successful - mark service as stopped
            $change_service_state->($self, $sid, 'stopped');
        }

        last if !$repeat;
    }

    # remove stale services
    # fixme:

    $self->flush_master_status();
}
280 | ||
a875fbe8 DM |
281 | # functions to compute next service states |
282 | # $cd: service configuration data (read only) | |
283 | # $sd: service status data (read only) | |
284 | # | |
285 | # Note: use change_service_state() to alter state | |
286 | # | |
287 | ||
# Compute the next state of a service that is currently 'stopped'.
#
# $sid: service id
# $cd:  service configuration data
# $sd:  service status data
#
# - If the configuration names a node that differs from the recorded one,
#   the recorded node is corrected.
# - A pending migrate/relocate command is consumed; for a stopped service
#   this only changes the service location.
# - A disabled service stays stopped, an enabled one is started on the node
#   chosen by select_service_node(). Any other configured state is logged as
#   an error.
sub next_state_stopped {
    my ($self, $sid, $cd, $sd) = @_;

    my $haenv = $self->{haenv};
    my $ns = $self->{ns};

    # Only fix up the location when the configuration actually names a node.
    # A service that vanished from the configuration has no 'node' entry,
    # and the recorded node must not be overwritten with undef then.
    if (defined($cd->{node}) && $sd->{node} ne $cd->{node}) {
        # this can happen if we fence a node with active migrations
        # hack: modify $sd (normally this should be considered read-only)
        $haenv->log('info', "fixup service '$sid' location ($sd->{node} => $cd->{node})");
        $sd->{node} = $cd->{node};
    }

    if ($sd->{cmd}) {
        my ($cmd, $target) = @{$sd->{cmd}};
        delete $sd->{cmd};

        if ($cmd eq 'migrate' || $cmd eq 'relocate') {
            if (!$ns->node_is_online($target)) {
                $haenv->log('err', "ignore service '$sid' $cmd request - node '$target' not online");
            } elsif ($sd->{node} eq $target) {
                $haenv->log('info', "ignore service '$sid' $cmd request - service already on node '$target'");
            } else {
                $haenv->change_service_location($sid, $target);
                $cd->{node} = $sd->{node} = $target; # fixme: $sd is read-only??!!
                $haenv->log('info', "$cmd service '$sid' to node '$target' (stopped)");
            }
        } else {
            $haenv->log('err', "unknown command '$cmd' for service '$sid'");
        }
    }

    if ($cd->{state} eq 'disabled') {
        # do nothing
        return;
    }

    if ($cd->{state} eq 'enabled') {
        if (my $node = $self->select_service_node($cd)) {
            if ($sd->{node} ne $node) {
                $haenv->change_service_location($sid, $node);
            }
            $change_service_state->($self, $sid, 'started', node => $node);
        } else {
            # fixme: warn
        }

        return;
    }

    $haenv->log('err', "service '$sid' - unknown state '$cd->{state}' in service configuration");
}
340 | ||
# Compute the next state of a service that is currently 'started'.
#
# $sid: service id
# $cd:  service configuration data
# $sd:  service status data
#
# Decision order:
#   1. node of the service is offline   -> 'fence'
#   2. service is configured 'disabled' -> 'request_stop'
#   3. service is configured 'enabled':
#        - a pending migrate/relocate command is consumed and, if its target
#          is online and differs from the current node, turned into the
#          matching state
#        - without a command, the service is migrated when
#          select_service_node() names a different node
#   4. any other configured state is logged as an error
sub next_state_started {
    my ($self, $sid, $cd, $sd) = @_;

    my ($haenv, $ns) = ($self->{haenv}, $self->{ns});

    if (!$ns->node_is_online($sd->{node})) {
        $change_service_state->($self, $sid, 'fence');
        return;
    }

    if ($cd->{state} eq 'disabled') {
        $change_service_state->($self, $sid, 'request_stop');
        return;
    }

    if ($cd->{state} ne 'enabled') {
        return $haenv->log('err', "service '$sid' - unknown state '$cd->{state}' in service configuration");
    }

    if ($sd->{cmd}) {
        # take the request off the record - it is handled exactly once
        my ($cmd, $target) = @{ delete $sd->{cmd} };

        if ($cmd ne 'migrate' && $cmd ne 'relocate') {
            $haenv->log('err', "unknown command '$cmd' for service '$sid'");
        } elsif (!$ns->node_is_online($target)) {
            $haenv->log('err', "ignore service '$sid' $cmd request - node '$target' not online");
        } elsif ($sd->{node} eq $target) {
            $haenv->log('info', "ignore service '$sid' $cmd request - service already on node '$target'");
        } else {
            $haenv->log('info', "$cmd service '$sid' to node '$target' (running)");
            $change_service_state->($self, $sid, $cmd, node => $sd->{node}, target => $target);
        }

        return;
    }

    # no explicit request - follow the node selection
    my $node = $self->select_service_node($cd);

    if ($node && ($sd->{node} ne $node)) {
        $haenv->log('info', "migrate service '$sid' to node '$node' (running)");
        $change_service_state->($self, $sid, 'migrate', node => $sd->{node}, target => $node);
    }

    return;
}
c0bbd038 DM |
393 | |
394 | 1; |