]> git.proxmox.com Git - pve-ha-manager.git/blob - PVE/HA/LRM.pm
fixup watchdog to cleanly compile with systemd
[pve-ha-manager.git] / PVE / HA / LRM.pm
1 package PVE::HA::LRM;
2
3 # Local Resource Manager
4
5 use strict;
6 use warnings;
7 use Data::Dumper;
8 use POSIX qw(:sys_wait_h);
9
10 use PVE::SafeSyslog;
11 use PVE::Tools;
12 use PVE::HA::Tools;
13
14 # Server can have several states:
15
# Maps each state the LRM may enter to a short human readable description.
# set_local_status() only checks that a state has an entry here.
my $valid_states = {
    wait_for_agent_lock => "waiting for agent lock",
    active => "got agent_lock",
    lost_agent_lock => "lost agent_lock",
};
21
# Constructor.
#
# $this  - class name or an existing instance (its class is reused)
# $haenv - HA environment object, stored under the 'haenv' key
#
# The object starts in the internal 'startup' state and is switched to
# 'wait_for_agent_lock' right away through set_local_status().
sub new {
    my ($this, $haenv) = @_;

    my $class = ref($this) || $this;

    my %fields = (
        haenv => $haenv,
        status => { state => 'startup' },
        workers => {},
        results => {},
    );

    my $self = bless \%fields, $class;

    $self->set_local_status({ state => 'wait_for_agent_lock' });

    return $self;
}
38
# Flag that a shutdown was requested; do_one_iteration() reads this flag.
sub shutdown_request {
    my $self = shift;

    $self->{shutdown_request} = 1;
}
44
# Return the current status hash reference (the stored one, not a copy).
sub get_local_status {
    my $self = shift;

    return $self->{status};
}
50
# Replace the local status with $new (a hash reference with a 'state' key).
#
# Dies if the requested state has no entry in $valid_states. Does nothing
# when the state is unchanged. Otherwise logs the transition, stamps $new
# with 'state_change_time' and stores it.
sub set_local_status {
    my ($self, $new) = @_;

    my $wanted = $new->{state};

    die "invalid state '$wanted'" if !$valid_states->{$wanted};

    my $current = $self->{status};
    my $previous = $current->{state};

    # important: only update if the state really changed
    return if $previous eq $wanted;

    my $haenv = $self->{haenv};

    $haenv->log('info', "status change $previous => $wanted");

    $new->{state_change_time} = $haenv->get_time();

    $self->{status} = $new;
}
69
# Try to get the HA agent lock and keep the watchdog in sync with it.
#
# On success the watchdog handle stored in $self->{ha_agent_wd} is updated,
# or a new watchdog is opened and stored if there is none yet, and 1 is
# returned.
#
# After a failed attempt the loop waits one second and retries. It gives up
# and returns 0 once more than five attempts have failed or more than five
# seconds have passed since the first attempt.
sub get_protected_ha_agent_lock {
    my ($self) = @_;

    my $haenv = $self->{haenv};

    my $failed = 0;
    my $begin = $haenv->get_time();

    while (1) {

        if ($haenv->get_ha_agent_lock()) {
            my $wd = $self->{ha_agent_wd};
            if ($wd) {
                $haenv->watchdog_update($wd);
            } else {
                my $handle = $haenv->watchdog_open();
                $self->{ha_agent_wd} = $handle;
            }
            return 1;
        }

        $failed++;
        return 0 if $failed > 5;

        my $elapsed = $haenv->get_time() - $begin;
        return 0 if $elapsed > 5;

        $haenv->sleep(1);
    }
}
100
# Run one iteration of the LRM main loop.
#
# The iteration has two phases: first the local state is updated depending
# on quorum and on whether the protected HA agent lock can be acquired or
# kept, then the work belonging to the resulting state is done.
#
# Returns 1 if the caller should run another iteration and 0 if the LRM
# should stop. Dies if the local state is none of the handled states.
sub do_one_iteration {
    my ($self) = @_;

    my $haenv = $self->{haenv};

    my $status = $self->get_local_status();
    my $state = $status->{state};

    # do state changes first

    if ($state eq 'wait_for_agent_lock') {

        my $service_count = 1; # todo: correctly compute

        if ($service_count && $haenv->quorate()) {
            if ($self->get_protected_ha_agent_lock()) {
                $self->set_local_status({ state => 'active' });
            }
        }

    } elsif ($state eq 'lost_agent_lock') {

        if ($haenv->quorate()) {
            if ($self->get_protected_ha_agent_lock()) {
                $self->set_local_status({ state => 'active' });
            }
        }

    } elsif ($state eq 'active') {

        if (!$self->get_protected_ha_agent_lock()) {
            $self->set_local_status({ state => 'lost_agent_lock'});
        }
    }

    # the state may have changed above, so read it again
    $status = $self->get_local_status();
    $state = $status->{state};

    # do work

    # forget the service status of the previous iteration
    $self->{service_status} = {};

    if ($state eq 'wait_for_agent_lock') {

        return 0 if $self->{shutdown_request};

        $haenv->sleep(5);

    } elsif ($state eq 'active') {

        my $starttime = $haenv->get_time();

        my $max_time = 10;

        my $shutdown = 0;

        # do work (max_time seconds)
        eval {
            # fixme: set alert timer

            if ($self->{shutdown_request}) {

                # fixme: request service stop or relocate ?

                my $service_count = 0; # fixme

                if ($service_count == 0) {

                    # close the watchdog before we stop
                    if ($self->{ha_agent_wd}) {
                        $haenv->watchdog_close($self->{ha_agent_wd});
                        delete $self->{ha_agent_wd};
                    }

                    $shutdown = 1;
                }
            } else {
                my $ms = $haenv->read_manager_status();

                $self->{service_status} = $ms->{service_status} || {};

                $self->manage_resources();
            }
        };
        if (my $err = $@) {
            # errors are logged only, the loop keeps running
            $haenv->log('err', "got unexpected error - $err");
        }

        return 0 if $shutdown;

        # pace the loop: one active iteration takes at least $max_time seconds
        $haenv->sleep_until($starttime + $max_time);

    } elsif ($state eq 'lost_agent_lock') {

        # Note: watchdog is active and will trigger soon!

        # so we hope to get the lock back soon!

        # NOTE(review): without a shutdown request this branch neither works
        # nor sleeps - verify that the caller can cope with an immediate return.

        if ($self->{shutdown_request}) {

            my $running_services = 0; # fixme: correctly compute

            if ($running_services > 0) {
                $haenv->log('err', "get shutdown request in state 'lost_agent_lock' - " .
                            "killing running services");

                # fixme: kill all services as fast as possible
            }

            # now all services are stopped, so we can close the watchdog

            if ($self->{ha_agent_wd}) {
                $haenv->watchdog_close($self->{ha_agent_wd});
                delete $self->{ha_agent_wd};
            }

            return 0;
        }

    } else {

        die "got unexpected status '$state'\n";

    }

    return 1;
}
229
# Queue a resource agent command for every service in
# $self->{service_status} that is assigned to this node, then fork workers
# for the queued commands.
#
# Services without node, uid or requested state are skipped. At most
# $max_workers workers run at the same time. The start loop polls once per
# second for up to five seconds and ends early as soon as no worker is
# running (or fork failed, in which case starting is retried on a later
# call).
#
# Time and sleeping go through the HA environment, like everywhere else in
# this module.
sub manage_resources {
    my ($self) = @_;

    my $haenv = $self->{haenv};

    my $nodename = $haenv->nodename();

    my $ss = $self->{service_status};

    foreach my $sid (keys %$ss) {
        my $sd = $ss->{$sid};
        next if !$sd->{node};
        next if !$sd->{uid};
        next if $sd->{node} ne $nodename;
        my $req_state = $sd->{state};
        next if !defined($req_state);

        eval {
            $self->queue_resource_command($sid, $sd->{uid}, $req_state, $sd->{target});
        };
        if (my $err = $@) {
            warn "unable to run resource agent for '$sid' - $err"; # fixme
        }
    }

    my $starttime = $haenv->get_time();

    # start workers
    my $max_workers = 4;

    while (($haenv->get_time() - $starttime) < 5) {
        # reaps finished workers and returns the number still running
        my $count = $self->check_active_workers();

        foreach my $sid (keys %{$self->{workers}}) {
            last if $count >= $max_workers;
            my $w = $self->{workers}->{$sid};
            if (!$w->{pid}) {
                my $pid = fork();
                if (!defined($pid)) {
                    warn "fork worker failed\n";
                    $count = 0; last; # abort, try later
                } elsif ($pid == 0) {
                    # child: do work, then leave via _exit so that control
                    # never returns into the parent's code path
                    my $res = -1;
                    eval {
                        $res = $haenv->exec_resource_agent($sid, $w->{state}, $w->{target});
                    };
                    if (my $err = $@) {
                        warn $err;
                        POSIX::_exit(-1);
                    }
                    POSIX::_exit($res);
                } else {
                    # parent: remember the worker
                    $count++;
                    $w->{pid} = $pid;
                }
            }
        }

        last if !$count;

        $haenv->sleep(1);
    }
}
296
# fixme: use a queue and limit number of parallel workers?
#
# Register the command ($state, optional $target) to run for service $sid
# under request id $uid. A command whose worker already has a pid is left
# alone; a command that was queued but not started yet is replaced.
sub queue_resource_command {
    my ($self, $sid, $uid, $state, $target) = @_;

    my $queued = $self->{workers}->{$sid};
    return if $queued && $queued->{pid}; # already started

    # not started yet (or not queued at all): (re)place the queue entry
    my %entry = (
        sid => $sid,
        uid => $uid,
        state => $state,
    );
    $self->{workers}->{$sid} = \%entry;

    $entry{target} = $target if $target;
}
315
# Poll all started workers without blocking.
#
# A worker whose process was reaped is handed to
# resource_command_finished() together with its wait status. Returns the
# number of started workers that were not reaped by this call.
sub check_active_workers {
    my ($self) = @_;

    my $running = 0;

    for my $sid (keys %{$self->{workers}}) {
        my $worker = $self->{workers}->{$sid};

        my $pid = $worker->{pid};
        next if !$pid; # queued only, not started yet

        my $reaped = waitpid($pid, WNOHANG);
        if (defined($reaped) && ($reaped == $pid)) {
            $self->resource_command_finished($sid, $worker->{uid}, $?);
            next;
        }

        $running++;
    }

    return $running;
}
336
# Record the outcome of a finished worker and publish the result set.
#
# $sid    - service id, its entry is removed from $self->{workers}
# $uid    - request id under which the result is stored
# $status - wait status of the worker process
#
# The stored exit code is the worker's exit code, or -1 if the status is
# -1 or the process was terminated by a signal. Results whose uid does not
# occur in $self->{service_status} are dropped before the result set is
# passed to write_lrm_status() of the HA environment.
sub resource_command_finished {
    my ($self, $sid, $uid, $status) = @_;

    my $haenv = $self->{haenv};

    my $worker = delete $self->{workers}->{$sid};
    return if !$worker; # should not happen

    my $exit_code = -1;

    if ($status == -1) {
        $haenv->log('err', "resource agent $sid finished - failed to execute");
    } elsif (my $signal = ($status & 127)) {
        $haenv->log('err', "resource agent $sid finished - got signal $signal");
    } else {
        $exit_code = ($status >> 8);
    }

    $self->{results}->{$uid} = {
        sid => $worker->{sid},
        state => $worker->{state},
        exit_code => $exit_code,
    };

    # collect the uids the manager currently knows about
    my $ss = $self->{service_status};
    my %known_uid;
    for my $service (keys %$ss) {
        my $sd = $ss->{$service};
        my $service_uid = $sd->{uid};
        $known_uid{$service_uid} = 1 if $service_uid;
    }

    # keep only the results that belong to a known uid
    my %kept = map { $_ => $self->{results}->{$_} }
               grep { $known_uid{$_} }
               keys %{$self->{results}};

    $self->{results} = \%kept;

    return $haenv->write_lrm_status(\%kept);
}
380
381 1;