]> git.proxmox.com Git - pve-ha-manager.git/blame - src/PVE/HA/LRM.pm
use option nofailback for default simulator groups
[pve-ha-manager.git] / src / PVE / HA / LRM.pm
CommitLineData
5f095798
DM
1package PVE::HA::LRM;
2
3# Local Resource Manager
4
5use strict;
6use warnings;
c4a221bc
DM
7use Data::Dumper;
8use POSIX qw(:sys_wait_h);
5f095798
DM
9
10use PVE::SafeSyslog;
11use PVE::Tools;
12use PVE::HA::Tools;
13
# The local resource manager is always in exactly one of the states below.
# Keys are the state names accepted by set_local_status(); values are short
# human readable descriptions.
my $valid_states = {
    'wait_for_agent_lock' => 'waiting for agent lock',
    'active'              => 'got agent_lock',
    'lost_agent_lock'     => 'lost agent_lock',
};
21
# Constructor.
#
# $haenv is the HA environment object; it is stored and later used for
# logging, time, locking and watchdog handling. The new object starts in
# the internal 'startup' state and is moved to 'wait_for_agent_lock' right
# away through set_local_status().
sub new {
    my ($this, $haenv) = @_;

    # allow calling on a class name as well as on an existing instance
    my $class = ref($this) || $this;

    my %fields = (
        haenv   => $haenv,
        status  => { state => 'startup' },
        workers => {},   # queued/running resource commands, keyed by service id
        results => {},   # finished resource commands, keyed by uid
    );
    my $self = bless \%fields, $class;

    $self->set_local_status({ state => 'wait_for_agent_lock' });

    return $self;
}
38
# Remember that a shutdown was requested. The flag is only recorded here;
# do_one_iteration() checks it on its next pass.
sub shutdown_request {
    my $self = shift;

    return $self->{shutdown_request} = 1;
}
44
# Return the current status hash (a reference, not a copy).
sub get_local_status {
    my $self = shift;

    my $current = $self->{status};

    return $current;
}
50
# Switch to a new status.
#
# $new is a hash reference with at least a 'state' key, which must be one
# of the names listed in $valid_states (dies otherwise). If the state is
# unchanged nothing happens; otherwise the transition is logged, the
# change time is recorded in $new->{state_change_time} and $new becomes
# the current status.
sub set_local_status {
    my ($self, $new) = @_;

    if (!$valid_states->{$new->{state}}) {
        die "invalid state '$new->{state}'";
    }

    my $old = $self->{status};

    # important: only update (and log) if the state really changed
    return if $old->{state} eq $new->{state};

    my $haenv = $self->{haenv};

    $haenv->log('info', "status change $old->{state} => $new->{state}");

    $new->{state_change_time} = $haenv->get_time();

    $self->{status} = $new;
}
69
# Try to get the HA agent lock and make sure a watchdog covers it.
#
# On success the already opened watchdog is updated, or a new one is
# opened and remembered in $self->{ha_agent_wd}; returns 1.
# On failure the attempt is repeated with one second pauses until either
# the retry counter exceeds 5 or more than 5 seconds have passed; then
# returns 0.
sub get_protected_ha_agent_lock {
    my ($self) = @_;

    my $haenv = $self->{haenv};

    my $attempts = 0;
    my $begin = $haenv->get_time();

    while (1) {

        if ($haenv->get_ha_agent_lock()) {
            my $wd = $self->{ha_agent_wd};
            if ($wd) {
                $haenv->watchdog_update($wd);
            } else {
                $self->{ha_agent_wd} = $haenv->watchdog_open();
            }
            return 1;
        }

        $attempts++;
        last if $attempts > 5; # bounded number of retries

        my $elapsed = $haenv->get_time() - $begin;
        last if $elapsed > 5; # for max 5 seconds

        $haenv->sleep(1);
    }

    return 0;
}
100
# Run one pass of the LRM main loop.
#
# The pass has two phases: first the state machine is advanced depending
# on quorum and on whether the (watchdog protected) agent lock can be
# obtained, then the work belonging to the resulting state is done.
#
# Returns 0 when a shutdown request has been handled (the watchdog, if
# any, is closed in that case for the 'active' and 'lost_agent_lock'
# states), and 1 otherwise. Dies if the state is not one of the three
# known ones.
sub do_one_iteration {
    my ($self) = @_;

    my $haenv = $self->{haenv};

    my $state = $self->get_local_status()->{state};

    # phase 1: do state changes first

    my $ctime = $haenv->get_time();

    if ($state eq 'wait_for_agent_lock') {

        my $service_count = 1; # todo: correctly compute

        if ($service_count && $haenv->quorate() && $self->get_protected_ha_agent_lock()) {
            $self->set_local_status({ state => 'active' });
        }

    } elsif ($state eq 'lost_agent_lock') {

        if ($haenv->quorate() && $self->get_protected_ha_agent_lock()) {
            $self->set_local_status({ state => 'active' });
        }

    } elsif ($state eq 'active') {

        $self->set_local_status({ state => 'lost_agent_lock'})
            if !$self->get_protected_ha_agent_lock();
    }

    # the state may have changed above, so read it again
    $state = $self->get_local_status()->{state};

    # phase 2: do work

    # start every pass with an empty view of the requested service states
    $self->{service_status} = {};

    if ($state eq 'wait_for_agent_lock') {

        return 0 if $self->{shutdown_request};

        $haenv->sleep(5);

    } elsif ($state eq 'active') {

        # one active pass lasts until 10 seconds after it started
        my $deadline = $haenv->get_time() + 10;

        my $stop_now = 0;

        # do work; errors are logged and must not abort the loop
        eval {
            # fixme: set alert timer

            if (!$self->{shutdown_request}) {

                my $ms = $haenv->read_manager_status();

                $self->{service_status} = $ms->{service_status} || {};

                $self->manage_resources();

            } else {

                # fixme: request service stop or relocate ?

                my $service_count = 0; # fixme

                if ($service_count == 0) {

                    if ($self->{ha_agent_wd}) {
                        $haenv->watchdog_close($self->{ha_agent_wd});
                        delete $self->{ha_agent_wd};
                    }

                    $stop_now = 1;
                }
            }
        };
        if (my $err = $@) {
            $haenv->log('err', "got unexpected error - $err");
        }

        return 0 if $stop_now;

        $haenv->sleep_until($deadline);

    } elsif ($state eq 'lost_agent_lock') {

        # Note: the watchdog is still active and will trigger soon,
        # so we hope to get the lock back before that happens.

        if ($self->{shutdown_request}) {

            my $running_services = 0; # fixme: correctly compute

            if ($running_services > 0) {
                $haenv->log('err', "get shutdown request in state 'lost_agent_lock' - " .
                            "killing running services");

                # fixme: kill all services as fast as possible
            }

            # now all services are stopped, so we can close the watchdog

            if ($self->{ha_agent_wd}) {
                $haenv->watchdog_close($self->{ha_agent_wd});
                delete $self->{ha_agent_wd};
            }

            return 0;
        }

        $haenv->sleep(5);

    } else {

        die "got unexpected status '$state'\n";

    }

    return 1;
}
231
c4a221bc
DM
# Queue and run resource agent commands for the services of this node.
#
# Step 1: for every entry of $self->{service_status} that has a uid, is
# assigned to the local node and has a requested state, a command is
# queued via queue_resource_command(). Failures to queue are reported
# with warn() and do not stop the loop.
#
# Step 2: for up to 5 seconds, finished workers are collected and queued
# commands are started as forked child processes, with at most
# $max_workers children active at once. The loop ends early as soon as no
# worker is active any more.
#
# Time and sleeping go through $haenv (get_time/sleep), like everywhere
# else in this module, instead of calling time()/sleep() directly.
# Services are visited in sorted order so that queueing and start order
# are deterministic.
sub manage_resources {
    my ($self) = @_;

    my $haenv = $self->{haenv};

    my $nodename = $haenv->nodename();

    my $ss = $self->{service_status};

    foreach my $sid (sort keys %$ss) {
        my $sd = $ss->{$sid};
        next if !$sd->{node};
        next if !$sd->{uid};
        next if $sd->{node} ne $nodename;
        my $req_state = $sd->{state};
        next if !defined($req_state);

        eval {
            $self->queue_resource_command($sid, $sd->{uid}, $req_state, $sd->{target});
        };
        if (my $err = $@) {
            warn "unable to run resource agent for '$sid' - $err"; # fixme
        }
    }

    my $starttime = $haenv->get_time();

    # start workers
    my $max_workers = 4;

    my $sc = $haenv->read_service_config();

    while (($haenv->get_time() - $starttime) < 5) {
        # reap finished children and count the ones still running
        my $count = $self->check_active_workers();

        foreach my $sid (sort keys %{$self->{workers}}) {
            last if $count >= $max_workers;
            my $w = $self->{workers}->{$sid};
            my $cd = $sc->{$sid};
            if (!$cd) {
                warn "missing resource configuration for '$sid'\n";
                next;
            }
            next if $w->{pid}; # already running

            my $pid = fork();
            if (!defined($pid)) {
                warn "fork worker failed\n";
                $count = 0; last; # abort, try later
            } elsif ($pid == 0) {
                # child: run the resource agent and report its result as
                # exit status; _exit() is used so that the child leaves
                # immediately without running the parent's cleanup code
                my $res = -1;
                eval {
                    $res = $haenv->exec_resource_agent($sid, $cd, $w->{state}, $w->{target});
                };
                if (my $err = $@) {
                    warn $err;
                    POSIX::_exit(-1);
                }
                POSIX::_exit($res);
            } else {
                # parent: remember the child so it can be reaped later
                $count++;
                $w->{pid} = $pid;
            }
        }

        last if !$count;

        $haenv->sleep(1);
    }
}
305
306# fixme: use a queue an limit number of parallel workers?
# Queue a resource agent command for service $sid.
#
# $uid identifies the request, $state is the requested state and $target
# an optional target (only stored when true). If a command for $sid has
# already been started (its entry has a pid) the call is a no-op; a
# command that is queued but not yet started is replaced by the new one.
sub queue_resource_command {
    my ($self, $sid, $uid, $state, $target) = @_;

    my $pending = $self->{workers}->{$sid};

    # already started - leave it alone
    return if $pending && $pending->{pid};

    # a fresh entry overwrites whatever was queued before
    my $entry = {
        sid => $sid,
        uid => $uid,
        state => $state,
    };
    $self->{workers}->{$sid} = $entry;

    $entry->{target} = $target if $target;
}
324
# Collect finished workers and count the ones that are still running.
#
# Every worker entry with a pid is polled with a non-blocking waitpid():
#  - the child has exited: resource_command_finished() is called with the
#    wait status from $?;
#  - the child does not exist any more (waitpid returns -1 with ECHILD):
#    resource_command_finished() is called with status -1, so the entry
#    is released instead of being counted as running forever;
#  - otherwise the worker is still running and is counted.
# Entries without a pid (queued, not started) are ignored.
#
# Returns the number of workers still running.
sub check_active_workers {
    my ($self) = @_;

    # finish/count workers
    my $count = 0;
    foreach my $sid (keys %{$self->{workers}}) {
        my $w = $self->{workers}->{$sid};
        my $pid = $w->{pid};
        next if !$pid;

        # check status
        my $waitpid = waitpid($pid, WNOHANG);
        if (defined($waitpid) && ($waitpid == $pid)) {
            $self->resource_command_finished($sid, $w->{uid}, $?);
        } elsif (defined($waitpid) && ($waitpid == -1) && $!{ECHILD}) {
            # the child is gone and its exit status can no longer be
            # retrieved; report it as failed rather than waiting forever
            $self->resource_command_finished($sid, $w->{uid}, -1);
        } else {
            $count++;
        }
    }

    return $count;
}
345
# Record the outcome of a finished resource agent command.
#
# $status is a wait status as found in $?: -1 means the command could not
# be executed, a non-zero low 7 bits means it was killed by that signal
# (both are logged and yield exit code -1), otherwise the exit code is
# taken from the high byte.
#
# The worker entry for $sid is removed, the result is stored under $uid,
# results whose uid no longer appears in $self->{service_status} are
# dropped, and the remaining results are written with write_lrm_status().
sub resource_command_finished {
    my ($self, $sid, $uid, $status) = @_;

    my $haenv = $self->{haenv};

    my $w = delete $self->{workers}->{$sid};
    return if !$w; # should not happen

    my $exit_code;

    if ($status == -1) {
        $haenv->log('err', "resource agent $sid finished - failed to execute");
        $exit_code = -1;
    } elsif (my $sig = ($status & 127)) {
        $haenv->log('err', "resource agent $sid finished - got signal $sig");
        $exit_code = -1;
    } else {
        $exit_code = ($status >> 8);
    }

    $self->{results}->{$uid} = {
        sid => $w->{sid},
        state => $w->{state},
        exit_code => $exit_code,
    };

    my $ss = $self->{service_status};

    # collect the uids that are still referenced by a service
    my %is_valid;
    foreach my $service (keys %$ss) {
        my $service_uid = $ss->{$service}->{uid};
        $is_valid{$service_uid} = 1 if $service_uid;
    }

    # keep only the results that belong to such a uid
    my $kept = {};
    foreach my $id (grep { $is_valid{$_} } keys %{$self->{results}}) {
        $kept->{$id} = $self->{results}->{$id};
    }
    $self->{results} = $kept;

    $haenv->write_lrm_status($kept);
}
389
5f095798 3901;