]> git.proxmox.com Git - pve-ha-manager.git/blame - PVE/HA/Manager.pm
add parser for ha groups (similar to failover domains)
[pve-ha-manager.git] / PVE / HA / Manager.pm
CommitLineData
c0bbd038
DM
1package PVE::HA::Manager;
2
3use strict;
4use warnings;
c4a221bc 5use Digest::MD5 qw(md5_base64);
c0bbd038
DM
6
7use Data::Dumper;
8
9use PVE::HA::NodeStatus;
10
# Constructor: build a manager object around the given HA environment.
#
# $this  - class name, or an existing object whose class is reused
# $haenv - HA environment object; read_manager_status() and nodename()
#          are called on it here
#
# Returns the blessed manager object with the keys haenv, ms, ns and ss.
sub new {
    my ($this, $haenv) = @_;

    my $class = ref($this) || $this;

    # load the stored manager status and record the local node as master
    my $master_status = $haenv->read_manager_status();
    $master_status->{master_node} = $haenv->nodename();

    my $node_status = PVE::HA::NodeStatus->new($haenv, $master_status->{node_status} || {});

    # fixme: use separate class PVE::HA::ServiceStatus
    my $service_status = $master_status->{service_status} || {};

    my %fields = (
        haenv => $haenv,
        ms    => $master_status,  # master status
        ns    => $node_status,    # PVE::HA::NodeStatus
        ss    => $service_status, # service status
    );

    return bless { %fields }, $class;
}
34
d84da043
DM
# Cleanup hook of the manager. Currently a placeholder that does nothing.
#
# Returns nothing (empty list / undef); the explicit return keeps the
# argument unpacking from leaking out as an accidental return value.
sub cleanup {
    my ($self) = @_;

    # todo: ?

    return;
}
40
# Store the current node status and service status inside the master
# status hash and pass that hash to the HA environment for writing.
#
# Returns whatever write_manager_status() of the environment returns.
sub flush_master_status {
    my ($self) = @_;

    my $env           = $self->{haenv};
    my $master_status = $self->{ms};
    my $node_status   = $self->{ns};

    $master_status->{node_status}    = $node_status->{status};
    $master_status->{service_status} = $self->{ss};

    return $env->write_manager_status($master_status);
}
c0bbd038 51
# Attention: must be idempotent (always return the same result for the same input!)
#
# Select the node a service should run on.
#
# $service_conf - service configuration hash; only its 'node' entry
#                 (the preferred node) is read
#
# Returns the preferred node if it is online, otherwise the first entry
# of the online node list, or undef when that list is empty.
sub select_service_node {
    my ($self, $service_conf) = @_;

    my $ns = $self->{ns};

    my $pref_node = $service_conf->{node};

    # a configuration without preferred node simply falls through
    return $pref_node if defined($pref_node) && $ns->node_is_online($pref_node);

    my $online_nodes = $ns->list_online_nodes();

    # Read the first entry instead of shift()ing it: shift would modify
    # the array handed out by list_online_nodes(), and if that array is
    # shared a second call would pick a different node.
    return $online_nodes->[0];
}
66
c4a221bc
DM
# running counter, mixed into the uid generated on each service state change
my $uid_counter = 0;

# set of service state names accepted as new state by change_service_state
my $valid_service_states = {
    map { $_ => 1 } qw(
        stopped
        request_stop
        started
        fence
        migrate
        relocate
        error
    )
};
78
4e01bc86
DM
# Move service $sid into $new_state and rebuild its status record.
#
# $self      - manager object (haenv and ss are used)
# $sid       - service id, must exist in the service status
# $new_state - one of the keys of $valid_service_states
# %params    - extra key/value pairs stored in the new status record
#
# The record is emptied in place, then state, the previous node, the
# given params and a fresh uid are stored. Params are applied after
# state/node, so a 'node' param replaces the previous node.
#
# Dies if the service is unknown, the state does not change or the new
# state is not valid. Nothing is modified in those cases.
my $change_service_state = sub {
    my ($self, $sid, $new_state, %params) = @_;

    my ($haenv, $ss) = ($self->{haenv}, $self->{ss});

    my $sd = $ss->{$sid} || die "no such service '$sid'";

    my $old_state = $sd->{state};
    my $old_node = $sd->{node};

    die "no state change" if $old_state eq $new_state; # just to be sure

    die "invalid CRM service state '$new_state'\n" if !$valid_service_states->{$new_state};

    # empty the record in place, the hash itself stays the same object
    %$sd = ();

    $sd->{state} = $new_state;
    $sd->{node} = $old_node;

    # sorted, so the log text does not depend on hash ordering
    my @param_texts;
    foreach my $k (sort keys %params) {
        my $v = $params{$k};
        push @param_texts, "$k = $v";
        $sd->{$k} = $v;
    }
    my $text_state = join(", ", @param_texts);

    # new uid for every state change, built from state, pid, time and counter
    $uid_counter++;
    $sd->{uid} = md5_base64($new_state . $$ . time() . $uid_counter);

    $text_state = " ($text_state)" if $text_state;
    $haenv->log('info', "service '$sid': state changed from '${old_state}' to '${new_state}' $text_state\n");
};
112
# read LRM status for all active nodes
#
# Asks the HA environment for the LRM status of every node in the online
# node list and merges the results into one hash keyed by uid.
#
# Returns a hash reference uid => LRM status entry.
sub read_lrm_status {
    my ($self) = @_;

    my $online = $self->{ns}->list_online_nodes();
    my $env = $self->{haenv};

    my %merged;

    for my $node (@$online) {
        my $node_result = $env->read_lrm_status($node);
        for my $uid (keys %$node_result) {
            # an entry already collected is kept (duplicates should not happen)
            $merged{$uid} ||= $node_result->{$uid};
        }
    }

    return \%merged;
}
132
aa98a844
DM
# read new crm commands and save them into crm master status
#
# Each line of the command list has the form
#   (migrate|relocate) <service id> <target node>
# A valid command is stored as [ task, node ] in the 'cmd' entry of the
# service status; everything else is only logged.
sub update_crm_commands {
    my ($self) = @_;

    my ($env, $node_status, $services) = @$self{qw(haenv ns ss)};

    my $cmdlist = $env->read_crm_commands();

    foreach my $cmd (split(/\n/, $cmdlist)) {
        chomp $cmd;

        my ($task, $sid, $node) = $cmd =~ m/^(migrate|relocate)\s+(\S+)\s+(\S+)$/;

        if (!defined($task)) {
            $env->log('err', "unable to parse crm command: $cmd");
            next;
        }

        my $sd = $services->{$sid};
        if (!$sd) {
            $env->log('err', "crm command error - no such service: $cmd");
            next;
        }

        if (!$node_status->node_is_online($node)) {
            $env->log('err', "crm command error - node not online: $cmd");
        } elsif ($node eq $sd->{node}) {
            $env->log('info', "ignore crm command - service already on target node: $cmd");
        } else {
            $env->log('info', "got crm command: $cmd");
            $sd->{cmd} = [ $task, $node ];
        }
    }

}
167
8f0bb968
DM
# One round of the cluster resource manager.
#
# Steps, in this order:
#  - update the node status from the environment; stop if the local
#    node is not online
#  - read the LRM results and the service configuration
#  - add status records for newly configured services
#  - take over pending crm commands
#  - run the per-service state machine until no state changes anymore,
#    handling fencing after each pass
#  - write the manager status
#
# Services are always visited in sorted order of their id, so state
# changes, fence calls and log lines come in a reproducible order.
sub manage {
    my ($self) = @_;

    my ($haenv, $ns, $ss) = ($self->{haenv}, $self->{ns}, $self->{ss});

    $ns->update($haenv->get_node_info());

    if (!$ns->node_is_online($haenv->nodename())) {
        $haenv->log('info', "master seems offline\n");
        return;
    }

    my $lrm_status = $self->read_lrm_status();

    my $sc = $haenv->read_service_config();

    # compute new service status

    # add new service
    foreach my $sid (sort keys %$sc) {
        next if $ss->{$sid}; # already there
        $haenv->log('info', "Adding new service '$sid'\n");
        # assume we are running to avoid relocate running service at add
        $ss->{$sid} = { state => 'started', node => $sc->{$sid}->{node}};
    }

    $self->update_crm_commands();

    for (;;) {
        my $repeat = 0;

        foreach my $sid (sort keys %$ss) {
            my $sd = $ss->{$sid};
            # services without configuration entry are handled as disabled
            my $cd = $sc->{$sid} || { state => 'disabled' };

            # LRM result belonging to the current state (matched by uid)
            my $lrm_res = $sd->{uid} ? $lrm_status->{$sd->{uid}} : undef;

            my $last_state = $sd->{state};

            if ($last_state eq 'stopped') {

                $self->next_state_stopped($sid, $cd, $sd);

            } elsif ($last_state eq 'started') {

                $self->next_state_started($sid, $cd, $sd);

            } elsif ($last_state eq 'migrate' || $last_state eq 'relocate') {

                # check result from LRM daemon
                if ($lrm_res) {
                    my $exit_code = $lrm_res->{exit_code};
                    if ($exit_code == 0) {
                        $change_service_state->($self, $sid, 'started', node => $sd->{target});
                    } else {
                        $haenv->log('err', "service '$sid' - migration failed (exit code $exit_code)");
                        $change_service_state->($self, $sid, 'started', node => $sd->{node});
                    }
                }

            } elsif ($last_state eq 'fence') {

                # do nothing here - wait until fenced

            } elsif ($last_state eq 'request_stop') {

                # check result from LRM daemon
                if ($lrm_res) {
                    my $exit_code = $lrm_res->{exit_code};
                    if ($exit_code == 0) {
                        $change_service_state->($self, $sid, 'stopped');
                    } else {
                        $change_service_state->($self, $sid, 'error'); # fixme: what state?
                    }
                }

            } elsif ($last_state eq 'error') {

                # fixme:

            } else {

                die "unknown service state '$last_state'";
            }

            # another pass is needed whenever a state changed in this one
            $repeat = 1 if $sd->{state} ne $last_state;
        }

        # handle fencing
        # fence_node() is called at most once per node and pass, its
        # result is remembered for the other services on the same node
        my $fenced_nodes = {};
        foreach my $sid (sort keys %$ss) {
            my $sd = $ss->{$sid};
            next if $sd->{state} ne 'fence';

            if (!defined($fenced_nodes->{$sd->{node}})) {
                $fenced_nodes->{$sd->{node}} = $ns->fence_node($sd->{node}) || 0;
            }

            next if !$fenced_nodes->{$sd->{node}};

            # node fence was successful - mark service as stopped
            $change_service_state->($self, $sid, 'stopped');
        }

        last if !$repeat;
    }

    # remove stale services
    # fixme:

    $self->flush_master_status();
}
280
a875fbe8
DM
281# functions to compute next service states
282# $cd: service configuration data (read only)
283# $sd: service status data (read only)
284#
285# Note: use change_service_state() to alter state
286#
287
# Compute the next state of a service which is currently 'stopped'.
#
# $sid - service id
# $cd  - service configuration data
# $sd  - service status data
#
# Handles, in this order: location fixup against the configuration, a
# pending migrate/relocate command (a stopped service is just assigned
# to the target node) and finally the start of an enabled service.
sub next_state_stopped {
    my ($self, $sid, $cd, $sd) = @_;

    my $haenv = $self->{haenv};
    my $ns = $self->{ns};

    # only fix up when the configuration names a node, otherwise the
    # known location would be replaced by undef
    if (defined($cd->{node}) && $sd->{node} ne $cd->{node}) {
        # this can happen if we fence a node with active migrations
        # hack: modify $sd (normally this should be considered read-only)
        $haenv->log('info', "fixup service '$sid' location ($sd->{node} => $cd->{node})");
        $sd->{node} = $cd->{node};
    }

    if ($sd->{cmd}) {
        my ($cmd, $target) = @{$sd->{cmd}};
        # a command is consumed here, whether it gets executed or not
        delete $sd->{cmd};

        if ($cmd eq 'migrate' || $cmd eq 'relocate') {
            if (!$ns->node_is_online($target)) {
                $haenv->log('err', "ignore service '$sid' $cmd request - node '$target' not online");
            } elsif ($sd->{node} eq $target) {
                $haenv->log('info', "ignore service '$sid' $cmd request - service already on node '$target'");
            } else {
                $haenv->change_service_location($sid, $target);
                $cd->{node} = $sd->{node} = $target; # fixme: $sd is read-only??!!
                $haenv->log('info', "$cmd service '$sid' to node '$target' (stopped)");
            }
        } else {
            $haenv->log('err', "unknown command '$cmd' for service '$sid'");
        }
    }

    if ($cd->{state} eq 'disabled') {
        # do nothing
        return;
    }

    if ($cd->{state} eq 'enabled') {
        if (my $node = $self->select_service_node($cd)) {
            if ($sd->{node} ne $node) {
                $haenv->change_service_location($sid, $node);
            }
            $change_service_state->($self, $sid, 'started', node => $node);
        } else {
            # fixme: warn
        }

        return;
    }

    $haenv->log('err', "service '$sid' - unknown state '$cd->{state}' in service configuration");
}
340
# Compute the next state of a service which is currently 'started'.
#
# $sid - service id
# $cd  - service configuration data
# $sd  - service status data
#
# Checks in this order: node of the service offline -> 'fence',
# service disabled -> 'request_stop', pending migrate/relocate command,
# and otherwise a migration to the node chosen by select_service_node().
sub next_state_started {
    my ($self, $sid, $cd, $sd) = @_;

    my ($env, $node_status) = ($self->{haenv}, $self->{ns});

    # the node of the service is gone
    unless ($node_status->node_is_online($sd->{node})) {
        $change_service_state->($self, $sid, 'fence');
        return;
    }

    if ($cd->{state} eq 'disabled') {
        $change_service_state->($self, $sid, 'request_stop');
        return;
    }

    if ($cd->{state} ne 'enabled') {
        return $env->log('err', "service '$sid' - unknown state '$cd->{state}' in service configuration");
    }

    my $pending = $sd->{cmd};

    unless ($pending) {
        # no command pending - follow the node selection
        my $node = $self->select_service_node($cd);

        if ($node && ($sd->{node} ne $node)) {
            $env->log('info', "migrate service '$sid' to node '$node' (running)");
            $change_service_state->($self, $sid, 'migrate', node => $sd->{node}, target => $node);
        }

        return;
    }

    # the command is consumed, whether it gets executed or not
    my ($cmd, $target) = @$pending;
    delete $sd->{cmd};

    if ($cmd ne 'migrate' && $cmd ne 'relocate') {
        $env->log('err', "unknown command '$cmd' for service '$sid'");
    } elsif (!$node_status->node_is_online($target)) {
        $env->log('err', "ignore service '$sid' $cmd request - node '$target' not online");
    } elsif ($sd->{node} eq $target) {
        $env->log('info', "ignore service '$sid' $cmd request - service already on node '$target'");
    } else {
        $env->log('info', "$cmd service '$sid' to node '$target' (running)");
        $change_service_state->($self, $sid, $cmd, node => $sd->{node}, target => $target);
    }

    return;
}
c0bbd038
DM
393
3941;