]>
Commit | Line | Data |
---|---|---|
c0bbd038 DM |
1 | package PVE::HA::Manager; |
2 | ||
3 | use strict; | |
4 | use warnings; | |
c4a221bc | 5 | use Digest::MD5 qw(md5_base64); |
c0bbd038 DM |
6 | |
7 | use Data::Dumper; | |
8 | ||
9 | use PVE::HA::NodeStatus; | |
10 | ||
# Constructor: create a manager object bound to the given HA environment.
#
# The stored manager status is fetched with $haenv->read_manager_status(),
# the local node name is recorded in it as 'master_node', and the stored
# node status (or an empty hash) is handed to PVE::HA::NodeStatus->new().
#
# Parameters:
#   $this  - class name, or an existing object whose class is reused
#   $haenv - HA environment object; read_manager_status() and nodename()
#            are called on it here
#
# Returns the blessed manager object with the keys haenv, ms, ns and ss.
sub new {
    my ($this, $haenv) = @_;

    my $class = ref($this) || $this;

    my $ms = $haenv->read_manager_status();
    $ms->{master_node} = $haenv->nodename();

    my $stored_node_status = $ms->{node_status} || {};
    my $ns = PVE::HA::NodeStatus->new($haenv, $stored_node_status);

    # fixme: use separate class PVE::HA::ServiceStatus
    my $ss = $ms->{service_status} || {};

    my %state = (
        haenv => $haenv,
        ms    => $ms,    # master status
        ns    => $ns,    # PVE::HA::NodeStatus
        ss    => $ss,    # service status
    );

    return bless \%state, $class;
}
34 | ||
d84da043 DM |
# Cleanup hook of the manager object.
#
# This is currently a placeholder: it unpacks its invocant and does nothing
# else.
sub cleanup {
    my ($self) = @_;

    # todo: ?
}
40 | ||
# Write the in-memory manager state back through the HA environment.
#
# The node status hash held by the NodeStatus object ($self->{ns}->{status})
# and the service status hash ($self->{ss}) are copied by reference into the
# master status, which is then passed to $haenv->write_manager_status().
sub flush_master_status {
    my ($self) = @_;

    my $ms = $self->{ms};

    $ms->{node_status} = $self->{ns}->{status};
    $ms->{service_status} = $self->{ss};

    $self->{haenv}->write_manager_status($ms);
}
c0bbd038 | 51 | |
819c61f0 | 52 | # Attention: must be idempotent (always return the same result for the same input!) |
# Pick the node a service should run on.
#
# The result must be a pure function of the service configuration, the group
# configuration and the set of online nodes, so that repeated calls with the
# same input select the same node.
#
# Parameters:
#   $service_conf - service configuration hash; the keys 'node' (currently
#                   configured node) and 'group' (optional group id) are read
#   $try_next     - if true, prefer the node following the current one in the
#                   sorted top-priority node list (wrapping to the first)
#
# The candidate list comes from the configured group, or from a default group
# containing only the configured node. Group entries have the form 'node' or
# 'node:priority'; offline nodes are skipped. Unless the group is flagged
# 'restricted', all other online nodes are added with priority -1.
#
# Returns the selected node name, or undef if no candidate node is online.
#
# NOTE(review): PVE::Tools::split_list() is called here, but no 'use
# PVE::Tools' is visible among this file's use lines - confirm the module is
# loaded elsewhere.
sub select_service_node {
    my ($self, $service_conf, $try_next) = @_;

    my $ns = $self->{ns};

    my $group = { 'nodes' => $service_conf->{node} }; # default group

    my $group_id = $service_conf->{group};
    if ($group_id && $self->{groups}->{ids}->{$group_id}) {
        $group = $self->{groups}->{ids}->{$group_id};
    }

    my $pri_groups = {};
    my $group_members = {};
    foreach my $entry (PVE::Tools::split_list($group->{nodes})) {
        my ($node, $pri) = ($entry, 0);
        if ($entry =~ m/^(\S+):(\d+)$/) {
            # normalize the priority to a number: priorities are used as hash
            # keys, so '01' and '1' would otherwise end up in two different
            # groups that compare as equal when sorted numerically
            ($node, $pri) = ($1, int($2));
        }
        next if !$ns->node_is_online($node);
        $pri_groups->{$pri}->{$node} = 1;
        $group_members->{$node} = $pri;
    }

    my $online_nodes = $ns->list_online_nodes();

    # add non-group members to unrestricted groups (priority -1)
    if (!$group->{restricted}) {
        foreach my $node (@$online_nodes) {
            next if defined($group_members->{$node});
            $pri_groups->{-1}->{$node} = 1;
            $group_members->{$node} = -1;
        }
    }

    my @pri_list = sort { $b <=> $a } keys %$pri_groups;

    # callers use the result in scalar context, so keep returning undef
    return undef if !scalar(@pri_list);

    my $current_node = $service_conf->{node};
    my $current_is_candidate =
        defined($current_node) && defined($group_members->{$current_node});

    # with 'nofailback' the service stays where it is as long as the current
    # node is a usable candidate
    if (!$try_next && $group->{nofailback} && $current_is_candidate) {
        return $current_node;
    }

    # select node from top priority node list
    my @nodes = sort keys %{$pri_groups->{$pri_list[0]}};

    # position of the current node inside the top priority list, if any
    my $found;
    if (defined($current_node)) {
        foreach my $i (0 .. $#nodes) {
            next if $nodes[$i] ne $current_node;
            $found = $i;
            last;
        }
    }

    if ($try_next) {
        # move on to the next node in the list, wrapping around at the end
        return $nodes[$found + 1] if defined($found) && $found < $#nodes;
        return $nodes[0];
    }

    return $nodes[$found] if defined($found);

    return $nodes[0];
}
129 | ||
c4a221bc DM |
# Counter that is incremented for every service state change and mixed into
# the generated state uid.
my $uid_counter = 0;

# Set of service states accepted as target of a service state change.
my $valid_service_states = {
    map { $_ => 1 } qw(
        stopped
        request_stop
        started
        fence
        migrate
        relocate
        error
    )
};
141 | ||
4e01bc86 DM |
# Switch a service to a new CRM state.
#
# Parameters:
#   $self      - manager object (its 'haenv' and 'ss' entries are used)
#   $sid       - service id, must exist in the service status
#   $new_state - target state, must be listed in $valid_service_states
#   %params    - extra key/value pairs stored in the new service status
#                (these may override 'node')
#
# The service status hash is cleared in place and rebuilt from the new state,
# the previous node and %params, so entries of the old state (for example a
# pending 'cmd' or a 'target') do not survive a state change. A fresh 'uid'
# is generated for every change.
#
# Dies if the service is unknown, if the state would not change, or if
# $new_state is not a valid state.
my $change_service_state = sub {
    my ($self, $sid, $new_state, %params) = @_;

    my ($haenv, $ss) = ($self->{haenv}, $self->{ss});

    my $sd = $ss->{$sid} || die "no such service '$sid'";

    my $old_state = $sd->{state};
    my $old_node = $sd->{node};

    die "no state change" if $old_state eq $new_state; # just to be sure

    die "invalid CRM service state '$new_state'\n" if !$valid_service_states->{$new_state};

    # reset the status in place - other code holds references to this hash
    %$sd = ();

    $sd->{state} = $new_state;
    $sd->{node} = $old_node;

    # sort the keys so that the log line is the same on every run
    my @param_texts;
    foreach my $k (sort keys %params) {
        my $v = $params{$k};
        push @param_texts, "$k = $v";
        $sd->{$k} = $v;
    }

    $uid_counter++;
    $sd->{uid} = md5_base64($new_state . $$ . time() . $uid_counter);

    my $text_state = join(', ', @param_texts);
    $text_state = " ($text_state)" if $text_state;
    $haenv->log('info', "service '$sid': state changed from '${old_state}' to '${new_state}' $text_state\n");
};
175 | ||
332170bd | 176 | # read LRM status for all active nodes |
# Collect the LRM status entries of all online nodes.
#
# For every node returned by the NodeStatus object's list_online_nodes(),
# $haenv->read_lrm_status($node) is called and its entries are merged into a
# single hash keyed by uid.
#
# Returns a hash reference mapping uid => LRM status entry.
sub read_lrm_status {
    my ($self) = @_;

    my $online = $self->{ns}->list_online_nodes();
    my $haenv = $self->{haenv};

    my $merged = {};

    for my $node (@$online) {
        my $node_status = $haenv->read_lrm_status($node);
        for my $uid (keys %$node_status) {
            # an entry that is already set is kept (should not happen)
            $merged->{$uid} ||= $node_status->{$uid};
        }
    }

    return $merged;
}
195 | ||
aa98a844 DM |
196 | # read new crm commands and save them into crm master status |
# Read the pending CRM commands and attach them to the affected services.
#
# The command list from $haenv->read_crm_commands() is processed line by
# line. Accepted lines have the form
#
#   (migrate|relocate) <sid> <node>
#
# A command is stored as [ $task, $node ] in the 'cmd' entry of the service
# status. It is rejected with a log message if the line cannot be parsed, the
# service is unknown, the target node is not online, or the service is
# already on the target node.
sub update_crm_commands {
    my ($self) = @_;

    my ($haenv, $ns, $ss) = ($self->{haenv}, $self->{ns}, $self->{ss});

    my $cmdlist = $haenv->read_crm_commands();

    for my $cmd (split(/\n/, $cmdlist)) {
        chomp $cmd;

        my ($task, $sid, $node) = $cmd =~ m/^(migrate|relocate)\s+(\S+)\s+(\S+)$/;

        if (!defined($task)) {
            $haenv->log('err', "unable to parse crm command: $cmd");
            next;
        }

        my $sd = $ss->{$sid};
        if (!$sd) {
            $haenv->log('err', "crm command error - no such service: $cmd");
            next;
        }

        if (!$ns->node_is_online($node)) {
            $haenv->log('err', "crm command error - node not online: $cmd");
            next;
        }

        if ($node eq $sd->{node}) {
            $haenv->log('info', "ignore crm command - service already on target node: $cmd");
            next;
        }

        $haenv->log('info', "got crm command: $cmd");
        $ss->{$sid}->{cmd} = [ $task, $node];
    }

}
230 | ||
8f0bb968 DM |
# Run one round of the manager: refresh node information, compute the next
# state of every service, handle fencing and write the result back.
#
# Steps:
#   1. update the node status; stop early if the local (master) node is not
#      considered online
#   2. read the LRM results, the service configuration and the group
#      configuration
#   3. add services that appear in the configuration but not yet in the
#      service status
#   4. queue pending CRM commands (update_crm_commands)
#   5. loop over all services and advance their state until a pass over the
#      services changes nothing; after each pass, try to fence the nodes of
#      all services in state 'fence'
#   6. flush the master status
#
# Services are processed in sorted order so that the sequence of state
# changes and log messages is the same for the same input.
#
# Dies if a service has an unknown state.
sub manage {
    my ($self) = @_;

    my ($haenv, $ns, $ss) = ($self->{haenv}, $self->{ns}, $self->{ss});

    $ns->update($haenv->get_node_info());

    if (!$ns->node_is_online($haenv->nodename())) {
        $haenv->log('info', "master seems offline\n");
        return;
    }

    my $lrm_status = $self->read_lrm_status();

    my $sc = $haenv->read_service_config();

    $self->{groups} = $haenv->read_group_config(); # update

    # compute new service status

    # add new service
    foreach my $sid (sort keys %$sc) {
        next if $ss->{$sid}; # already there
        $haenv->log('info', "Adding new service '$sid'\n");
        # assume we are running to avoid relocate running service at add
        $ss->{$sid} = { state => 'started', node => $sc->{$sid}->{node}};
    }

    $self->update_crm_commands();

    for (;;) {
        my $repeat = 0;

        foreach my $sid (sort keys %$ss) {
            my $sd = $ss->{$sid};
            # services without configuration entry are treated as disabled
            my $cd = $sc->{$sid} || { state => 'disabled' };

            # LRM result belonging to the current state (matched by uid)
            my $lrm_res = $sd->{uid} ? $lrm_status->{$sd->{uid}} : undef;

            my $last_state = $sd->{state};

            if ($last_state eq 'stopped') {

                $self->next_state_stopped($sid, $cd, $sd, $lrm_res);

            } elsif ($last_state eq 'started') {

                $self->next_state_started($sid, $cd, $sd, $lrm_res);

            } elsif ($last_state eq 'migrate' || $last_state eq 'relocate') {

                # check result from LRM daemon
                if ($lrm_res) {
                    my $exit_code = $lrm_res->{exit_code};
                    if ($exit_code == 0) {
                        $change_service_state->($self, $sid, 'started', node => $sd->{target});
                    } else {
                        $haenv->log('err', "service '$sid' - migration failed (exit code $exit_code)");
                        $change_service_state->($self, $sid, 'started', node => $sd->{node});
                    }
                }

            } elsif ($last_state eq 'fence') {

                # do nothing here - wait until fenced

            } elsif ($last_state eq 'request_stop') {

                # check result from LRM daemon
                if ($lrm_res) {
                    my $exit_code = $lrm_res->{exit_code};
                    if ($exit_code == 0) {
                        $change_service_state->($self, $sid, 'stopped');
                    } else {
                        $change_service_state->($self, $sid, 'error'); # fixme: what state?
                    }
                }

            } elsif ($last_state eq 'error') {

                # fixme:

            } else {

                die "unknown service state '$last_state'";
            }

            # a changed state may allow further transitions in this round
            $repeat = 1 if $sd->{state} ne $last_state;
        }

        # handle fencing
        my $fenced_nodes = {}; # node => result of fence_node(), one call per node and pass
        foreach my $sid (sort keys %$ss) {
            my $sd = $ss->{$sid};
            next if $sd->{state} ne 'fence';

            if (!defined($fenced_nodes->{$sd->{node}})) {
                $fenced_nodes->{$sd->{node}} = $ns->fence_node($sd->{node}) || 0;
            }

            next if !$fenced_nodes->{$sd->{node}};

            # node fence was successful - mark service as stopped
            $change_service_state->($self, $sid, 'stopped');
        }

        last if !$repeat;
    }

    # remove stale services
    # fixme:

    $self->flush_master_status();
}
345 | ||
a875fbe8 DM |
346 | # functions to compute next service states |
347 | # $cd: service configuration data (read only) | |
348 | # $sd: service status data (read only) | |
349 | # | |
350 | # Note: use change_service_state() to alter state | |
351 | # | |
352 | ||
# Compute the next state of a service that is currently 'stopped'.
#
# Parameters:
#   $sid     - service id
#   $cd      - service configuration data
#   $sd      - service status data
#   $lrm_res - LRM result for the current state (not used here)
#
# Behaviour:
#   - if the recorded node differs from the configured node, the recorded
#     node is corrected to the configured one
#   - a pending migrate/relocate command is executed right away by changing
#     the service location, because the service is not running
#   - a 'disabled' service stays stopped; an 'enabled' service is started on
#     the node chosen by select_service_node()
#   - any other configured state is logged as an error
sub next_state_stopped {
    my ($self, $sid, $cd, $sd, $lrm_res) = @_;

    my $haenv = $self->{haenv};
    my $ns = $self->{ns};

    # $cd has no 'node' entry when the service is missing from the service
    # configuration (the caller then passes a bare 'disabled' entry) - keep
    # the recorded node in that case instead of overwriting it with undef
    if (defined($cd->{node}) && $sd->{node} ne $cd->{node}) {
        # this can happen if we fence a node with active migrations
        # hack: modify $sd (normally this should be considered read-only)
        $haenv->log('info', "fixup service '$sid' location ($sd->{node} => $cd->{node})");
        $sd->{node} = $cd->{node};
    }

    if ($sd->{cmd}) {
        my ($cmd, $target) = @{$sd->{cmd}};
        delete $sd->{cmd}; # a command is handled (or dropped) exactly once

        if ($cmd eq 'migrate' || $cmd eq 'relocate') {
            if (!$ns->node_is_online($target)) {
                $haenv->log('err', "ignore service '$sid' $cmd request - node '$target' not online");
            } elsif ($sd->{node} eq $target) {
                $haenv->log('info', "ignore service '$sid' $cmd request - service already on node '$target'");
            } else {
                $haenv->change_service_location($sid, $target);
                $cd->{node} = $sd->{node} = $target; # fixme: $sd is read-only??!!
                $haenv->log('info', "$cmd service '$sid' to node '$target' (stopped)");
            }
        } else {
            $haenv->log('err', "unknown command '$cmd' for service '$sid'");
        }
    }

    if ($cd->{state} eq 'disabled') {
        # do nothing
        return;
    }

    if ($cd->{state} eq 'enabled') {
        if (my $node = $self->select_service_node($cd)) {
            if ($sd->{node} ne $node) {
                $haenv->change_service_location($sid, $node);
            }
            $change_service_state->($self, $sid, 'started', node => $node);
        } else {
            # fixme: warn
        }

        return;
    }

    $haenv->log('err', "service '$sid' - unknown state '$cd->{state}' in service configuration");
}
405 | ||
# Compute the next state of a service that is currently 'started'.
#
# Parameters:
#   $sid     - service id
#   $cd      - service configuration data
#   $sd      - service status data
#   $lrm_res - LRM result for the current state, if there is one
#
# Behaviour:
#   - the service is marked for fencing if its node is not online
#   - a service configured as 'disabled' gets a stop request
#   - for an 'enabled' service a pending migrate/relocate command is
#     executed; without a pending command the preferred node is computed and
#     the service is migrated if that node differs from the current one
#     (after a failed LRM result the next candidate node is tried)
#   - any other configured state is logged as an error
sub next_state_started {
    my ($self, $sid, $cd, $sd, $lrm_res) = @_;

    my ($haenv, $ns) = ($self->{haenv}, $self->{ns});

    if (!$ns->node_is_online($sd->{node})) {
        &$change_service_state($self, $sid, 'fence');
        return;
    }

    my $wanted = $cd->{state};

    if ($wanted eq 'disabled') {
        &$change_service_state($self, $sid, 'request_stop');
        return;
    }

    if ($wanted eq 'enabled') {

        my $request = $sd->{cmd};

        if ($request) {
            # a command is handled (or dropped) exactly once
            my ($cmd, $target) = @$request;
            delete $sd->{cmd};

            if ($cmd ne 'migrate' && $cmd ne 'relocate') {
                $haenv->log('err', "unknown command '$cmd' for service '$sid'");
            } elsif (!$ns->node_is_online($target)) {
                $haenv->log('err', "ignore service '$sid' $cmd request - node '$target' not online");
            } elsif ($sd->{node} eq $target) {
                $haenv->log('info', "ignore service '$sid' $cmd request - service already on node '$target'");
            } else {
                $haenv->log('info', "$cmd service '$sid' to node '$target' (running)");
                &$change_service_state($self, $sid, $cmd, node => $sd->{node}, target => $target);
            }

        } else {

            # fixme: other exit codes?
            my $try_next = ($lrm_res && ($lrm_res->{exit_code} != 0)) ? 1 : 0;

            my $node = $self->select_service_node($cd, $try_next);

            if ($node && ($sd->{node} ne $node)) {
                $haenv->log('info', "migrate service '$sid' to node '$node' (running)");
                &$change_service_state($self, $sid, 'migrate', node => $sd->{node}, target => $node);
            }
            # otherwise the service stays where it is
        }

        return;
    }

    $haenv->log('err', "service '$sid' - unknown state '$cd->{state}' in service configuration");
}
c0bbd038 DM |
463 | |
464 | 1; |