]> git.proxmox.com Git - pve-ha-manager.git/blame - PVE/HA/LRM.pm
code cleanup: split out next_state functions
[pve-ha-manager.git] / PVE / HA / LRM.pm
CommitLineData
5f095798
DM
1package PVE::HA::LRM;
2
3# Local Resource Manager
4
5use strict;
6use warnings;
c4a221bc
DM
7use Data::Dumper;
8use POSIX qw(:sys_wait_h);
5f095798
DM
9
10use PVE::SafeSyslog;
11use PVE::Tools;
12use PVE::HA::Tools;
13
# Server can have several states:
#
#   wait_for_agent_lock - the agent lock is not held yet (state set by new())
#   active              - the agent lock is held
#   lost_agent_lock     - the agent lock could not be renewed while active
#
# The values are human readable descriptions. Within this file they are
# only tested for truth, to validate a state name.

my $valid_states = {
    wait_for_agent_lock => "waiting for agent lock",
    active => "got agent_lock",
    lost_agent_lock => "lost agent_lock",
};
21
# Constructor.
#
# $this  - class name, or an existing instance whose class is reused
# $haenv - HA environment object; stored as $self->{haenv} and used by the
#          other methods for logging, time, locking and watchdog access
#
# The object starts in the internal pseudo state 'startup' with empty
# worker and result tables, and is then switched to 'wait_for_agent_lock'
# through set_local_status().
#
# Returns the new blessed object.
sub new {
    my ($this, $haenv) = @_;

    my $pkg = ref($this) || $this;

    my $self = {
        haenv => $haenv,
        status => { state => 'startup' },
        workers => {},
        results => {},
    };
    bless $self, $pkg;

    $self->set_local_status({ state => 'wait_for_agent_lock' });

    return $self;
}
38
# Flag this LRM instance for shutdown. The flag is only recorded here
# (as $self->{shutdown_request}); do_one_iteration() acts on it.
#
# Returns 1 (the value stored in the flag).
sub shutdown_request {
    my $self = shift;

    return $self->{shutdown_request} = 1;
}
44
# Return the current local status hash reference ($self->{status}).
# The reference itself is returned, not a copy.
sub get_local_status {
    my $self = shift;

    my $current = $self->{status};

    return $current;
}
50
# Replace the local status with $new.
#
# $new - hash reference with at least a 'state' key; the state must be one
#        of the keys of $valid_states, otherwise this dies.
#
# If the state differs from the current one, the change is logged,
# $new->{state_change_time} is set to the current environment time and
# $new becomes the stored status (and is returned). If the state is
# unchanged nothing happens, so the stored state_change_time is kept.
sub set_local_status {
    my ($self, $new) = @_;

    unless ($valid_states->{$new->{state}}) {
        die "invalid state '$new->{state}'";
    }

    my $old = $self->{status};

    # important: only update if the state really changed
    if ($old->{state} eq $new->{state}) {
        return;
    }

    my $haenv = $self->{haenv};

    $haenv->log('info', "status change $old->{state} => $new->{state}");

    $new->{state_change_time} = $haenv->get_time();

    $self->{status} = $new;
}
69
# Try to acquire (or renew) the HA agent lock and keep the watchdog in sync.
#
# On success the watchdog handle stored in $self->{ha_agent_wd} is updated,
# or a new watchdog is opened and stored there if none exists yet.
#
# On failure the attempt is repeated with a one second environment sleep
# in between. The loop gives up after the sixth failed attempt, or once
# more than 5 seconds (environment time) have passed since the start.
#
# Returns 1 if the lock is held, 0 otherwise.
sub get_protected_ha_agent_lock {
    my ($self) = @_;

    my $haenv = $self->{haenv};

    my $failed_attempts = 0;
    my $begin = $haenv->get_time();

    while (1) {

        if ($haenv->get_ha_agent_lock()) {
            my $wd = $self->{ha_agent_wd};
            if ($wd) {
                $haenv->watchdog_update($wd);
            } else {
                $self->{ha_agent_wd} = $haenv->watchdog_open();
            }
            return 1;
        }

        # limit the number of attempts ...
        $failed_attempts++;
        last if $failed_attempts > 5;

        # ... and the total time spent trying
        my $elapsed = $haenv->get_time() - $begin;
        last if $elapsed > 5;

        $haenv->sleep(1);
    }

    return 0;
}
100
# Run one iteration of the LRM main loop.
#
# The iteration has two phases:
#
#   1. state transition - depending on the current state, try to acquire
#      or renew the protected agent lock and switch the state accordingly
#   2. work - act according to the (possibly new) state
#
# Returns 0 when a pending shutdown request was honored in this iteration
# and 1 otherwise. Dies if the local status holds an unknown state.
sub do_one_iteration {
    my ($self) = @_;

    my $haenv = $self->{haenv};

    my $current = $self->get_local_status();
    my $state = $current->{state};

    # close and forget the watchdog handle, if we hold one
    my $release_watchdog = sub {
        return if !$self->{ha_agent_wd};
        $haenv->watchdog_close($self->{ha_agent_wd});
        delete $self->{ha_agent_wd};
    };

    # phase 1: do state changes first

    my $ctime = $haenv->get_time(); # value is currently unused

    if ($state eq 'wait_for_agent_lock') {

        my $service_count = 1; # todo: correctly compute

        if ($service_count && $haenv->quorate() && $self->get_protected_ha_agent_lock()) {
            $self->set_local_status({ state => 'active' });
        }

    } elsif ($state eq 'lost_agent_lock') {

        if ($haenv->quorate() && $self->get_protected_ha_agent_lock()) {
            $self->set_local_status({ state => 'active' });
        }

    } elsif ($state eq 'active') {

        $self->set_local_status({ state => 'lost_agent_lock'})
            if !$self->get_protected_ha_agent_lock();
    }

    # the state may have changed above, so read it again
    $current = $self->get_local_status();
    $state = $current->{state};

    # phase 2: do work

    $self->{service_status} = {};

    if ($state eq 'wait_for_agent_lock') {

        return 0 if $self->{shutdown_request};

        $haenv->sleep(5);

        return 1;
    }

    if ($state eq 'active') {

        my $work_start = $haenv->get_time();

        my $max_time = 10;

        my $shutdown = 0;

        # do work (max_time seconds)
        eval {
            # fixme: set alert timer

            if ($self->{shutdown_request}) {

                # fixme: request service stop or relocate ?

                my $service_count = 0; # fixme

                if ($service_count == 0) {
                    $release_watchdog->();
                    $shutdown = 1;
                }
            } else {
                my $ms = $haenv->read_manager_status();

                $self->{service_status} = $ms->{service_status} || {};

                $self->manage_resources();
            }
        };
        if (my $err = $@) {
            # errors are only logged, the iteration continues normally
            $haenv->log('err', "got unexpected error - $err");
        }

        return 0 if $shutdown;

        # pad the iteration to max_time seconds
        $haenv->sleep_until($work_start + $max_time);

        return 1;
    }

    if ($state eq 'lost_agent_lock') {

        # Note: the watchdog is active and will trigger soon!
        # So we hope to get the lock back soon!
        #
        # NOTE(review): unlike the other states, this branch does not sleep
        # when no shutdown is requested - confirm that the caller does not
        # spin while the node is not quorate.

        return 1 if !$self->{shutdown_request};

        my $running_services = 0; # fixme: correctly compute

        if ($running_services > 0) {
            $haenv->log('err', "get shutdown request in state 'lost_agent_lock' - " .
                "killing running services");

            # fixme: kill all services as fast as possible
        }

        # now all services are stopped, so we can close the watchdog
        $release_watchdog->();

        return 0;
    }

    die "got unexpected status '$state'\n";
}
229
c4a221bc
DM
# Queue and start resource agent commands for the services of this node.
#
# Step 1: walk $self->{service_status} and queue a command (through
# queue_resource_command) for every entry that has a node, a uid and a
# defined requested state, and whose node is the local node name. Errors
# while queuing are reported with warn and do not stop the loop.
#
# Step 2: for less than 5 seconds of wall-clock time, collect finished
# workers and fork new ones for queued commands, with at most $max_workers
# running in parallel. Each child runs
# $haenv->exec_resource_agent($sid, $state) and exits with its result, or
# with -1 if the call died. The loop ends early when no worker is active
# or when fork fails (remaining commands stay queued for a later call).
#
# The manager status is not read here; the sub only consumes
# $self->{service_status}, which do_one_iteration() fills in before it
# calls this sub.
#
# Returns nothing.
sub manage_resources {
    my ($self) = @_;

    my $haenv = $self->{haenv};

    my $nodename = $haenv->nodename();

    my $ss = $self->{service_status};

    foreach my $sid (keys %$ss) {
        my $sd = $ss->{$sid};
        next if !$sd->{node};
        next if !$sd->{uid};
        next if $sd->{node} ne $nodename;
        my $req_state = $sd->{state};
        next if !defined($req_state);

        eval {
            $self->queue_resource_command($sid, $sd->{uid}, $req_state);
        };
        if (my $err = $@) {
            warn "unable to run resource agent for '$sid' - $err"; # fixme
        }
    }

    my $starttime = time();

    # start workers
    my $max_workers = 4;

    while ((time() - $starttime) < 5) {
        # reap finished workers and count the ones still running
        my $count = $self->check_active_workers();

        foreach my $sid (keys %{$self->{workers}}) {
            last if $count >= $max_workers;
            my $w = $self->{workers}->{$sid};
            next if $w->{pid}; # already running

            my $pid = fork();
            if (!defined($pid)) {
                warn "fork worker failed - $!\n";
                $count = 0; last; # abort, try later
            } elsif ($pid == 0) {
                # child: do the work, then leave without running any
                # END blocks or destructors of the parent
                my $res = -1;
                eval {
                    $res = $haenv->exec_resource_agent($sid, $w->{state});
                };
                if (my $err = $@) {
                    warn $err;
                    POSIX::_exit(-1);
                }
                POSIX::_exit($res);
            } else {
                # parent: remember the worker
                $count++;
                $w->{pid} = $pid;
            }
        }

        last if !$count;

        sleep(1);
    }

    return;
}
296
# fixme: use a queue and limit the number of parallel workers?
#
# Queue a resource agent command for service $sid.
#
# $sid   - service id, used as key of the $self->{workers} table
# $uid   - unique id of the request, reported back with the result
# $state - requested state, passed to the resource agent
#
# If a worker for $sid is already running (it has a pid) the request is
# ignored. A command that is queued but not yet started is replaced.
sub queue_resource_command {
    my ($self, $sid, $uid, $state) = @_;

    my $existing = $self->{workers}->{$sid};

    # already started, do not touch
    return if $existing && $existing->{pid};

    # new entry, or overwrite of a not yet started one
    $self->{workers}->{$sid} = {
        sid => $sid,
        uid => $uid,
        state => $state,
    };
}
313
# Collect finished workers and count the ones still considered active.
#
# Every entry of $self->{workers} that has a pid is polled with a
# non-blocking waitpid. If the child has exited, its wait status is handed
# to resource_command_finished(); in every other case (including a waitpid
# result that is not the worker pid) the worker is counted as active.
# Entries without a pid are queued only and are not counted.
#
# Returns the number of active workers.
sub check_active_workers {
    my ($self) = @_;

    my $active = 0;

    foreach my $sid (keys %{$self->{workers}}) {
        my $worker = $self->{workers}->{$sid};

        my $pid = $worker->{pid};
        next if !$pid; # not started yet

        my $reaped = waitpid($pid, WNOHANG);
        if (defined($reaped) && ($reaped == $pid)) {
            # $? still holds the wait status of the reaped child
            $self->resource_command_finished($sid, $worker->{uid}, $?);
        } else {
            $active++;
        }
    }

    return $active;
}
334
# Record the result of a finished resource agent worker.
#
# $sid    - service id of the worker; its entry is removed from
#           $self->{workers}
# $uid    - unique id of the request, used as key of the result table
# $status - wait status of the worker process (as found in $?)
#
# The exit code is taken from the high byte of $status. It stays -1 when
# $status is -1 or when the low 7 bits of $status indicate a signal.
# After storing the result, all results whose uid is no longer referenced
# by $self->{service_status} are dropped, and the remaining results are
# written out through $haenv->write_lrm_status().
#
# Does nothing if there is no worker entry for $sid.
sub resource_command_finished {
    my ($self, $sid, $uid, $status) = @_;

    my $haenv = $self->{haenv};

    my $worker = delete $self->{workers}->{$sid};
    return if !$worker; # should not happen

    # decode the wait status
    my $exit_code = -1;

    if ($status == -1) {
        print "RA $sid finished - failed to execute\n";
    } elsif (my $sig = ($status & 127)) {
        print "RA $sid finished - got signal $sig\n";
    } else {
        $exit_code = ($status >> 8);
        print "RA $sid finished - exit code ${exit_code}\n";
    }

    $self->{results}->{$uid} = {
        sid => $worker->{sid},
        state => $worker->{state},
        exit_code => $exit_code,
    };

    # collect the uids which are still referenced by the service status
    my $ss = $self->{service_status};

    my %known_uid;
    foreach my $service_id (keys %$ss) {
        my $entry = $ss->{$service_id};
        my $entry_uid = $entry->{uid};
        $known_uid{$entry_uid} = 1 if $entry_uid;
    }

    # keep only results which belong to a known uid
    my %kept = map { $_ => $self->{results}->{$_} }
        grep { $known_uid{$_} } keys %{$self->{results}};

    $self->{results} = \%kept;

    $haenv->write_lrm_status($self->{results});
}
379
5f095798 3801;