]>
Commit | Line | Data |
---|---|---|
c0bbd038 DM |
1 | package PVE::HA::Manager; |
2 | ||
3 | use strict; | |
4 | use warnings; | |
c4a221bc | 5 | use Digest::MD5 qw(md5_base64); |
c0bbd038 DM |
6 | |
7 | use Data::Dumper; | |
8 | ||
9 | use PVE::HA::NodeStatus; | |
10 | ||
# Constructor.
#
# Parameters:
#   $this  - class name, or an existing instance whose class is reused
#   $haenv - HA environment object; must provide read_manager_status()
#            and nodename()
#
# Returns a new blessed manager object holding the environment, the
# manager status, a PVE::HA::NodeStatus object and the service status.
sub new {
    my ($this, $haenv) = @_;

    my $class = ref($this) || $this;

    # load the stored manager status and store the value returned by
    # nodename() as master node
    my $manager_status = $haenv->read_manager_status();
    $manager_status->{master_node} = $haenv->nodename();

    my $node_status = PVE::HA::NodeStatus->new(
        $haenv,
        $manager_status->{node_status} || {},
    );

    # fixme: use separate class PVE::HA::ServiceStatus
    my $service_status = $manager_status->{service_status} || {};

    my $self = {
        haenv => $haenv,
        ms => $manager_status,  # master status
        ns => $node_status,     # PVE::HA::NodeStatus
        ss => $service_status,  # service status
    };

    return bless $self, $class;
}
34 | ||
d84da043 DM |
# Cleanup hook. Currently does nothing.
#
# Parameters:
#   $self - manager object
#
# Returns nothing (empty list / undef).
sub cleanup {
    my ($self) = @_;

    # todo: ?

    # Explicit empty return: without it the sub would implicitly return
    # the value of the argument-unpacking assignment above.
    return;
}
40 | ||
# Store the current node status and service status in the manager
# status hash and pass that hash to write_manager_status() of the HA
# environment.
#
# Returns whatever write_manager_status() returns.
sub flush_master_status {
    my ($self) = @_;

    my $manager_status = $self->{ms};

    $manager_status->{node_status} = $self->{ns}->{status};
    $manager_status->{service_status} = $self->{ss};

    return $self->{haenv}->write_manager_status($manager_status);
}
c0bbd038 | 51 | |
# Select the node a service should run on.
#
# Parameters:
#   $groups            - group configuration; groups are looked up in
#                        $groups->{ids}->{<group name>}
#   $online_node_usage - hash: node name => usage count; a node without
#                        a defined entry is treated as offline
#   $service_conf      - service configuration; reads {group} and {node}
#   $current_node      - node the service is currently assigned to
#                        (may be undef)
#   $try_next          - if true, return the node following the current
#                        node in the candidate list (wrapping to the
#                        first one) instead of staying on it
#
# Group node entries have the form "node" or "node:priority"; entries
# without a priority get priority 0. For groups that are not restricted,
# all other online nodes are added with priority -1.
#
# Returns the selected node name, or undef if there is no candidate.
sub select_service_node {
    my ($groups, $online_node_usage, $service_conf, $current_node, $try_next) = @_;

    my $group = { 'nodes' => $service_conf->{node} }; # default group

    $group = $groups->{ids}->{$service_conf->{group}} if $service_conf->{group} &&
        $groups->{ids}->{$service_conf->{group}};

    my $pri_groups = {};
    my $group_members = {};
    foreach my $entry (PVE::Tools::split_list($group->{nodes})) {
        my ($node, $pri) = ($entry, 0);
        if ($entry =~ m/^(\S+):(\d+)$/) {
            ($node, $pri) = ($1, $2);
        }
        next if !defined($online_node_usage->{$node}); # offline
        $pri_groups->{$pri}->{$node} = 1;
        $group_members->{$node} = $pri;
    }

    # add non-group members to unrestricted groups (priority -1)
    if (!$group->{restricted}) {
        my $pri = -1;
        foreach my $node (keys %$online_node_usage) {
            next if defined($group_members->{$node});
            $pri_groups->{$pri}->{$node} = 1;
            $group_members->{$node} = $pri;
        }
    }

    my @pri_list = sort { $b <=> $a } keys %$pri_groups;

    # callers test the result in scalar context, so keep returning an
    # explicit undef here
    return undef if !scalar(@pri_list);

    my $have_current = defined($current_node);

    if (!$try_next && $group->{nofailback} && $have_current &&
        defined($group_members->{$current_node})) {
        return $current_node;
    }

    # select node from top priority node list

    my $top_pri = $pri_list[0];

    # Sort by usage, then by node name. The name tie-break makes the
    # result independent of hash ordering, so equally used nodes are
    # always selected (and cycled through with $try_next) in the same
    # order.
    my @nodes = sort {
        $online_node_usage->{$a} <=> $online_node_usage->{$b} || $a cmp $b
    } keys %{$pri_groups->{$top_pri}};

    # position of the current node in the candidate list (if any)
    my $found;
    if ($have_current) {
        foreach my $i (0 .. $#nodes) {
            if ($nodes[$i] eq $current_node) {
                $found = $i;
                last;
            }
        }
    }

    if ($try_next) {
        # advance to the next candidate, wrapping around at the end
        return $nodes[$found + 1] if defined($found) && ($found < $#nodes);
        return $nodes[0];
    }

    # stay on the current node if it is a top priority candidate
    return $nodes[$found] if defined($found);

    return $nodes[0];
}
122 | ||
c4a221bc DM |
# running counter, starts at zero
my $uid_counter = 0;

# lookup table of the known CRM service state names
my $valid_service_states = {
    map { $_ => 1 } qw(
        stopped
        request_stop
        started
        fence
        migrate
        relocate
        error
    )
};
134 | ||
270d4406 DM |
# Recompute how many services are accounted to each online node and
# store the result in $self->{online_node_usage}.
#
# The result hash has exactly one entry per node returned by
# list_online_nodes(); nodes without an entry count as offline.
#
# Accounting rules, applied to services whose node is online:
#   started, request_stop, fence, error - counted on the service node
#   migrate, relocate                   - counted on the target node,
#                                         if that node is online
#   stopped                             - not counted
#
# Dies on any other state.
sub recompute_online_node_usage {
    my ($self) = @_;

    my $online_node_usage = {};

    foreach my $node (@{$self->{ns}->list_online_nodes()}) {
        $online_node_usage->{$node} = 0;
    }

    foreach my $sid (keys %{$self->{ss}}) {
        my $sd = $self->{ss}->{$sid};
        my $state = $sd->{state};
        my $node = $sd->{node};

        # ignore services without a node or located on an offline node
        next if !defined($node) || !defined($online_node_usage->{$node});

        if (($state eq 'started') || ($state eq 'request_stop') ||
            ($state eq 'fence') || ($state eq 'error')) {
            $online_node_usage->{$node}++;
        } elsif (($state eq 'migrate') || ($state eq 'relocate')) {
            my $target = $sd->{target};
            # Only count online targets. Incrementing the entry of an
            # offline target would create it, and a defined entry marks
            # a node as online in this hash.
            $online_node_usage->{$target}++
                if defined($target) && defined($online_node_usage->{$target});
        } elsif ($state eq 'stopped') {
            # do nothing
        } else {
            die "should not be reached";
        }
    }

    $self->{online_node_usage} = $online_node_usage;
}
165 | ||
4e01bc86 DM |
# Switch the service $sid to $new_state.
#
# Parameters:
#   $self      - manager object
#   $sid       - service id, must exist in $self->{ss}
#   $new_state - new state, must be a key of $valid_service_states and
#                differ from the current state
#   %params    - additional key/value pairs stored in the service status
#                (a 'node' parameter overrides the previous node)
#
# The service status hash is reset in place, so references to it held
# elsewhere see the new state. Only the previous node is carried over;
# afterwards the online node usage is recomputed, a new uid is assigned
# and the change is logged.
#
# Dies if the service is unknown, the state does not change, or the new
# state is invalid.
my $change_service_state = sub {
    my ($self, $sid, $new_state, %params) = @_;

    my ($haenv, $ss) = ($self->{haenv}, $self->{ss});

    my $sd = $ss->{$sid} || die "no such service '$sid'";

    my $old_state = $sd->{state};
    my $old_node = $sd->{node};

    die "no state change" if $old_state eq $new_state; # just to be sure

    die "invalid CRM service state '$new_state'\n" if !$valid_service_states->{$new_state};

    # drop all old entries, but keep the hash itself
    %$sd = ();

    $sd->{state} = $new_state;
    $sd->{node} = $old_node;

    # apply parameters in sorted key order, so the log text does not
    # depend on hash ordering
    my @param_texts;
    foreach my $k (sort keys %params) {
        my $v = $params{$k};
        push @param_texts, "$k = $v";
        $sd->{$k} = $v;
    }
    my $text_state = join(', ', @param_texts);

    $self->recompute_online_node_usage();

    $uid_counter++;
    $sd->{uid} = md5_base64($new_state . $$ . time() . $uid_counter);

    $text_state = " ($text_state)" if $text_state;
    $haenv->log('info', "service '$sid': state changed from '${old_state}' to '${new_state}' $text_state\n");
};
201 | ||
# Read the LRM status of every online node and merge the entries into
# one hash keyed by uid.
#
# Returns a hash reference: uid => LRM status entry.
sub read_lrm_status {
    my ($self) = @_;

    my $online_nodes = $self->{ns}->list_online_nodes();
    my $haenv = $self->{haenv};

    my %merged;

    for my $node (@$online_nodes) {
        my $node_status = $haenv->read_lrm_status($node);
        for my $uid (keys %$node_status) {
            # an entry that is already set is kept (should not happen)
            $merged{$uid} ||= $node_status->{$uid};
        }
    }

    return \%merged;
}
221 | ||
aa98a844 DM |
# Read new crm commands and save them into the service status.
#
# The command list is split into lines. A line of the form
# "migrate <sid> <node>" or "relocate <sid> <node>" is stored as
# [ task, node ] in the 'cmd' entry of the service, provided the
# service exists, the node is online and the service is not already
# located on that node. Everything else is logged and skipped.
sub update_crm_commands {
    my ($self) = @_;

    my ($haenv, $ns, $ss) = @{$self}{qw(haenv ns ss)};

    my $cmdlist = $haenv->read_crm_commands();

    foreach my $cmd (split(/\n/, $cmdlist)) {
        chomp $cmd;

        my ($task, $sid, $node) = $cmd =~ m/^(migrate|relocate)\s+(\S+)\s+(\S+)$/;

        if (!defined($task)) {
            $haenv->log('err', "unable to parse crm command: $cmd");
            next;
        }

        my $sd = $ss->{$sid};
        if (!$sd) {
            $haenv->log('err', "crm command error - no such service: $cmd");
            next;
        }

        if (!$ns->node_is_online($node)) {
            $haenv->log('err', "crm command error - node not online: $cmd");
            next;
        }

        if ($node eq $sd->{node}) {
            $haenv->log('info', "ignore crm command - service already on target node: $cmd");
            next;
        }

        $haenv->log('info', "got crm command: $cmd");
        $sd->{cmd} = [ $task, $node];
    }
}
256 | ||
8f0bb968 DM |
# One manager run:
#   - update the node status and stop if this node is not online
#   - read the LRM results, the service and the group configuration
#   - add services that are new in the configuration
#   - take over pending crm commands
#   - run the per-state handlers and the fencing step until no service
#     changed its state inside the handler loop
#   - write the manager status
sub manage {
    my ($self) = @_;

    my ($haenv, $ns, $ss) = @{$self}{qw(haenv ns ss)};

    $ns->update($haenv->get_node_info());

    if (!$ns->node_is_online($haenv->nodename())) {
        $haenv->log('info', "master seems offline\n");
        return;
    }

    my $lrm_status = $self->read_lrm_status();

    my $sc = $haenv->read_service_config();

    $self->{groups} = $haenv->read_group_config(); # update

    # compute new service status

    # add new service
    foreach my $sid (keys %$sc) {
        next if $ss->{$sid}; # already there
        $haenv->log('info', "Adding new service '$sid'\n");
        # assume we are running to avoid relocate running service at add
        $ss->{$sid} = { state => 'started', node => $sc->{$sid}->{node}};
    }

    $self->update_crm_commands();

    # state name => method computing the next state; states mapped to
    # undef are known, but have no handler
    my %handler_for = (
        stopped => 'next_state_stopped',
        started => 'next_state_started',
        migrate => 'next_state_migrate_relocate',
        relocate => 'next_state_migrate_relocate',
        request_stop => 'next_state_request_stop',
        fence => undef, # do nothing here - wait until fenced
        error => undef, # fixme:
    );

    my $repeat = 1;
    while ($repeat) {
        $repeat = 0;

        $self->recompute_online_node_usage();

        foreach my $sid (keys %$ss) {
            my $sd = $ss->{$sid};
            # services without configuration are handled as disabled
            my $cd = $sc->{$sid} || { state => 'disabled' };

            my $lrm_res = $sd->{uid} ? $lrm_status->{$sd->{uid}} : undef;

            my $last_state = $sd->{state};

            die "unknown service state '$last_state'"
                if !exists($handler_for{$last_state});

            if (my $handler = $handler_for{$last_state}) {
                $self->$handler($sid, $cd, $sd, $lrm_res);
            }

            # run another round if the handler changed the state
            $repeat = 1 if $sd->{state} ne $last_state;
        }

        # handle fencing
        my %fence_result; # node => result of fence_node(), or 0
        foreach my $sid (keys %$ss) {
            my $sd = $ss->{$sid};
            next if $sd->{state} ne 'fence';

            # call fence_node() only once per node and round
            if (!defined($fence_result{$sd->{node}})) {
                $fence_result{$sd->{node}} = $ns->fence_node($sd->{node}) || 0;
            }

            next if !$fence_result{$sd->{node}};

            # node fence was successful - mark service as stopped
            $change_service_state->($self, $sid, 'stopped');
        }
    }

    # remove stale services
    # fixme:

    return $self->flush_master_status();
}
356 | ||
a875fbe8 DM |
357 | # functions to compute next service states |
358 | # $cd: service configuration data (read only) | |
359 | # $sd: service status data (read only) | |
360 | # | |
361 | # Note: use change_service_state() to alter state | |
362 | # | |
363 | ||
0df5b3dd DM |
# Compute the next state of a service in state 'request_stop'.
#
# If an LRM result is available, the service changes to 'stopped' for
# exit code 0 and to 'error' otherwise. Without a result, the service
# changes to 'fence' if its node is not online.
sub next_state_request_stop {
    my ($self, $sid, $cd, $sd, $lrm_res) = @_;

    # check result from LRM daemon
    if ($lrm_res) {
        my $new_state = ($lrm_res->{exit_code} == 0)
            ? 'stopped'
            : 'error'; # fixme: what state?
        $change_service_state->($self, $sid, $new_state);
        return;
    }

    if (!$self->{ns}->node_is_online($sd->{node})) {
        $change_service_state->($self, $sid, 'fence');
        return;
    }
}
387 | ||
8aaa0e36 DM |
# Compute the next state of a service in state 'migrate' or 'relocate'.
#
# If an LRM result is available, the service changes to 'started': on
# the target node for exit code 0, on its current node otherwise (the
# failure is logged). Without a result, the service changes to 'fence'
# if its node is not online.
sub next_state_migrate_relocate {
    my ($self, $sid, $cd, $sd, $lrm_res) = @_;

    my ($haenv, $ns) = @{$self}{qw(haenv ns)};

    # check result from LRM daemon
    if ($lrm_res) {
        my $exit_code = $lrm_res->{exit_code};

        my $final_node = $sd->{target};
        if ($exit_code != 0) {
            $haenv->log('err', "service '$sid' - migration failed (exit code $exit_code)");
            $final_node = $sd->{node};
        }

        $change_service_state->($self, $sid, 'started', node => $final_node);
        return;
    }

    if (!$ns->node_is_online($sd->{node})) {
        $change_service_state->($self, $sid, 'fence');
        return;
    }
}
412 | ||
413 | ||
# Compute the next state of a service in state 'stopped'.
#
# Parameters:
#   $sid     - service id
#   $cd      - service configuration data
#   $sd      - service status data
#   $lrm_res - LRM result for the service (not used here)
#
# Steps:
#   - align the recorded service node with the configured node
#   - execute a pending migrate/relocate command by changing the
#     service location directly
#   - do nothing for disabled services; for enabled services select a
#     node and change to state 'started'
sub next_state_stopped {
    my ($self, $sid, $cd, $sd, $lrm_res) = @_;

    my $haenv = $self->{haenv};
    my $ns = $self->{ns};

    # Only fix up the location when the configuration names a node.
    # A configuration without a node (for example a placeholder that
    # only carries a state) must not wipe the recorded node.
    if (defined($cd->{node}) && ($sd->{node} ne $cd->{node})) {
        # this can happen if we fence a node with active migrations
        # hack: modify $sd (normally this should be considered read-only)
        $haenv->log('info', "fixup service '$sid' location ($sd->{node} => $cd->{node})");
        $sd->{node} = $cd->{node};
    }

    if ($sd->{cmd}) {
        my ($cmd, $target) = @{$sd->{cmd}};
        delete $sd->{cmd};

        if ($cmd eq 'migrate' || $cmd eq 'relocate') {
            if (!$ns->node_is_online($target)) {
                $haenv->log('err', "ignore service '$sid' $cmd request - node '$target' not online");
            } elsif ($sd->{node} eq $target) {
                $haenv->log('info', "ignore service '$sid' $cmd request - service already on node '$target'");
            } else {
                $haenv->change_service_location($sid, $target);
                $cd->{node} = $sd->{node} = $target; # fixme: $sd is read-only??!!
                $haenv->log('info', "$cmd service '$sid' to node '$target' (stopped)");
            }
        } else {
            $haenv->log('err', "unknown command '$cmd' for service '$sid'");
        }
    }

    if ($cd->{state} eq 'disabled') {
        # do nothing
        return;
    }

    if ($cd->{state} eq 'enabled') {
        if (my $node = select_service_node($self->{groups}, $self->{online_node_usage}, $cd, $sd->{node})) {
            # $node is known to be true inside this branch
            if ($sd->{node} ne $node) {
                $haenv->change_service_location($sid, $node);
            }
            $change_service_state->($self, $sid, 'started', node => $node);
        } else {
            # fixme: warn
        }

        return;
    }

    $haenv->log('err', "service '$sid' - unknown state '$cd->{state}' in service configuration");
}
466 | ||
# Compute the next state of a service in state 'started'.
#
#   - node not online:   change to 'fence'
#   - config 'disabled': change to 'request_stop'
#   - config 'enabled':  execute a pending migrate/relocate command, or
#                        else migrate if select_service_node() returns
#                        a node other than the current one
#   - any other config state is logged as error
sub next_state_started {
    my ($self, $sid, $cd, $sd, $lrm_res) = @_;

    my ($haenv, $ns) = @{$self}{qw(haenv ns)};

    if (!$ns->node_is_online($sd->{node})) {
        $change_service_state->($self, $sid, 'fence');
        return;
    }

    if ($cd->{state} eq 'disabled') {
        $change_service_state->($self, $sid, 'request_stop');
        return;
    }

    if ($cd->{state} eq 'enabled') {

        if (my $request = $sd->{cmd}) {
            my ($cmd, $target) = @$request;
            delete $sd->{cmd};

            my $is_move = ($cmd eq 'migrate') || ($cmd eq 'relocate');

            if (!$is_move) {
                $haenv->log('err', "unknown command '$cmd' for service '$sid'");
            } elsif (!$ns->node_is_online($target)) {
                $haenv->log('err', "ignore service '$sid' $cmd request - node '$target' not online");
            } elsif ($sd->{node} eq $target) {
                $haenv->log('info', "ignore service '$sid' $cmd request - service already on node '$target'");
            } else {
                $haenv->log('info', "$cmd service '$sid' to node '$target' (running)");
                $change_service_state->($self, $sid, $cmd, node => $sd->{node}, target => $target);
            }
        } else {
            # a non-zero LRM exit code requests the next candidate node
            # fixme: other exit codes?
            my $try_next = ($lrm_res && ($lrm_res->{exit_code} != 0)) ? 1 : 0;

            my $node = select_service_node(
                $self->{groups}, $self->{online_node_usage},
                $cd, $sd->{node}, $try_next);

            if ($node && ($sd->{node} ne $node)) {
                $haenv->log('info', "migrate service '$sid' to node '$node' (running)");
                $change_service_state->($self, $sid, 'migrate', node => $sd->{node}, target => $node);
            }
        }

        return;
    }

    $haenv->log('err', "service '$sid' - unknown state '$cd->{state}' in service configuration");
}
c0bbd038 DM |
525 | |
526 | 1; |