]> git.proxmox.com Git - pve-ha-manager.git/blame - src/PVE/HA/LRM.pm
implement service_config_exists() - test if ha is enabled
[pve-ha-manager.git] / src / PVE / HA / LRM.pm
CommitLineData
5f095798
DM
1package PVE::HA::LRM;
2
3# Local Resource Manager
4
5use strict;
6use warnings;
c4a221bc
DM
7use Data::Dumper;
8use POSIX qw(:sys_wait_h);
5f095798
DM
9
10use PVE::SafeSyslog;
11use PVE::Tools;
12use PVE::HA::Tools;
13
14# Server can have several states:
15
# All states the LRM may be set to, mapped to a short human readable
# description. set_local_status() refuses any state which is not a key of
# this hash.
my $valid_states = {
    wait_for_agent_lock => "waiting for agent lock",
    active => "got agent_lock",
    lost_agent_lock => "lost agent_lock",
};
21
# Constructor.
#
# $haenv is the HA environment object all further calls are delegated to.
# The new object starts with empty 'workers' and 'results' hashes and the
# internal 'startup' state, and is switched to 'wait_for_agent_lock' before
# it is returned.
sub new {
    my ($this, $haenv) = @_;

    # allow calling new() on a class name as well as on an existing object
    my $class = ref($this) || $this;

    my $self = {
        haenv => $haenv,
        status => { state => 'startup' },
        workers => {},
        results => {},
    };
    bless $self, $class;

    $self->set_local_status({ state => 'wait_for_agent_lock' });

    return $self;
}
38
# Flag this LRM for shutdown. The flag is only stored here, it gets
# evaluated by do_one_iteration().
sub shutdown_request {
    my $self = shift;

    $self->{shutdown_request} = 1;
}
44
# Return the current status hash (the same reference which is stored in
# the object, not a copy).
sub get_local_status {
    my $self = shift;

    my $status = $self->{status};

    return $status;
}
50
# Switch to the status described by the hash reference $new.
#
# Dies if $new->{state} is not one of the valid states. If the state did
# not change nothing is done at all; otherwise the change is logged, the
# time of the change is recorded in $new->{state_change_time} and $new
# becomes the current status.
sub set_local_status {
    my ($self, $new) = @_;

    if (!$valid_states->{$new->{state}}) {
        die "invalid state '$new->{state}'";
    }

    my $haenv = $self->{haenv};
    my $current = $self->{status};

    # important: only update if the state really changed, else we would
    # reset state_change_time on every call
    if ($current->{state} eq $new->{state}) {
        return;
    }

    $haenv->log('info', "status change $current->{state} => $new->{state}");

    $new->{state_change_time} = $haenv->get_time();

    $self->{status} = $new;
}
69
# Try to get the HA agent lock and protect it with the watchdog.
#
# On success the watchdog is opened (first time, handle is kept in
# $self->{ha_agent_wd}) or updated (handle already there) and 1 is
# returned. On failure the attempt is repeated with one second of sleep in
# between until either the retry counter or the 5 second time limit is
# exceeded, then 0 is returned.
sub get_protected_ha_agent_lock {
    my ($self) = @_;

    my $haenv = $self->{haenv};

    my $failed = 0;
    my $begin = $haenv->get_time();

    while (1) {

        if ($haenv->get_ha_agent_lock()) {
            my $wd = $self->{ha_agent_wd};
            if ($wd) {
                $haenv->watchdog_update($wd);
            } else {
                $self->{ha_agent_wd} = $haenv->watchdog_open();
            }
            return 1;
        }

        # give up once more than 5 attempts failed ...
        $failed++;
        return 0 if $failed > 5;

        # ... or once we tried for more than 5 seconds
        my $elapsed = $haenv->get_time() - $begin;
        return 0 if $elapsed > 5;

        $haenv->sleep(1);
    }
}
100
# Close the HA agent watchdog, if one is open, and forget its handle.
sub _close_ha_agent_watchdog {
    my ($self) = @_;

    return if !$self->{ha_agent_wd};

    $self->{haenv}->watchdog_close($self->{ha_agent_wd});
    delete $self->{ha_agent_wd};

    return;
}

# Run a single iteration of the LRM loop.
#
# First the state machine is advanced, depending on quorum and on whether
# the protected HA agent lock can be acquired or kept. Then the work for
# the resulting state is done.
#
# Returns 1 if the caller should keep iterating and 0 once a shutdown
# request was honored. Dies if the local status holds an unknown state.
sub do_one_iteration {
    my ($self) = @_;

    my $haenv = $self->{haenv};

    my $status = $self->get_local_status();
    my $state = $status->{state};

    # do state changes first

    if ($state eq 'wait_for_agent_lock') {

        my $service_count = 1; # todo: correctly compute

        if ($service_count && $haenv->quorate()) {
            if ($self->get_protected_ha_agent_lock()) {
                $self->set_local_status({ state => 'active' });
            }
        }

    } elsif ($state eq 'lost_agent_lock') {

        if ($haenv->quorate()) {
            if ($self->get_protected_ha_agent_lock()) {
                $self->set_local_status({ state => 'active' });
            }
        }

    } elsif ($state eq 'active') {

        if (!$self->get_protected_ha_agent_lock()) {
            $self->set_local_status({ state => 'lost_agent_lock'});
        }
    }

    # re-read the status, the state may have been changed above
    $status = $self->get_local_status();
    $state = $status->{state};

    # do work

    # start every iteration with an empty service status; it only gets
    # filled below when we are active and not shutting down
    $self->{service_status} = {};

    if ($state eq 'wait_for_agent_lock') {

        return 0 if $self->{shutdown_request};

        $haenv->sleep(5);

    } elsif ($state eq 'active') {

        my $start_time = $haenv->get_time();

        # an iteration in state 'active' is padded to this many seconds
        # by the sleep_until() call below
        my $max_time = 10;

        my $shutdown = 0;

        # errors raised by the actual work are logged, they must not
        # terminate the loop
        eval {
            # fixme: set alert timer

            if ($self->{shutdown_request}) {

                # fixme: request service stop or relocate ?

                my $service_count = 0; # fixme

                if ($service_count == 0) {
                    $self->_close_ha_agent_watchdog();

                    $shutdown = 1;
                }
            } else {
                my $ms = $haenv->read_manager_status();

                $self->{service_status} = $ms->{service_status} || {};

                $self->manage_resources();
            }
        };
        if (my $err = $@) {
            $haenv->log('err', "got unexpected error - $err");
        }

        return 0 if $shutdown;

        $haenv->sleep_until($start_time + $max_time);

    } elsif ($state eq 'lost_agent_lock') {

        # Note: the watchdog is active and will trigger soon, so we hope
        # to get the lock back soon!

        # NOTE(review): nothing sleeps in this state when the node is not
        # quorate - confirm that the caller throttles the iterations.

        if ($self->{shutdown_request}) {

            my $running_services = 0; # fixme: correctly compute

            if ($running_services > 0) {
                $haenv->log('err', "get shutdown request in state 'lost_agent_lock' - " .
                            "killing running services");

                # fixme: kill all services as fast as possible
            }

            # now all services are stopped, so we can close the watchdog
            $self->_close_ha_agent_watchdog();

            return 0;
        }

    } else {

        die "got unexpected status '$state'\n";

    }

    return 1;
}
229
c4a221bc
DM
# Queue and run the resource agent commands requested for this node.
#
# A command is queued for every entry of $self->{service_status} which is
# assigned to the local node and carries a uid and a requested state.
# Queued commands are then started as forked worker processes, at most
# $max_workers at the same time. The workers are polled once per second
# for about 5 seconds; the sub returns earlier as soon as no worker is
# active.
#
# Note: the manager status is not read here, the caller is expected to
# have stored the current service status in $self->{service_status}.
sub manage_resources {
    my ($self) = @_;

    my $haenv = $self->{haenv};

    my $nodename = $haenv->nodename();

    my $ss = $self->{service_status};

    foreach my $sid (keys %$ss) {
        my $sd = $ss->{$sid};
        next if !$sd->{node};
        next if !$sd->{uid};
        next if $sd->{node} ne $nodename;
        my $req_state = $sd->{state};
        next if !defined($req_state);

        eval {
            $self->queue_resource_command($sid, $sd->{uid}, $req_state, $sd->{target});
        };
        if (my $err = $@) {
            warn "unable to run resource agent for '$sid' - $err"; # fixme
        }
    }

    my $starttime = time();

    # start workers
    my $max_workers = 4;

    my $sc = $haenv->read_service_config();

    while ((time() - $starttime) < 5) {
        # reap finished workers, $count is the number of running ones
        my $count = $self->check_active_workers();

        foreach my $sid (keys %{$self->{workers}}) {
            last if $count >= $max_workers;
            my $w = $self->{workers}->{$sid};
            my $cd = $sc->{$sid};
            if (!$cd) {
                warn "missing resource configuration for '$sid'\n";
                next;
            }
            if (!$w->{pid}) {
                my $pid = fork();
                if (!defined($pid)) {
                    warn "fork worker failed\n";
                    $count = 0; last; # abort, try later
                } elsif ($pid == 0) {
                    # child: run the resource agent and pass its result
                    # to the parent as exit code
                    my $res = -1;
                    eval {
                        $res = $haenv->exec_resource_agent($sid, $cd, $w->{state}, $w->{target});
                    };
                    if (my $err = $@) {
                        warn $err;
                        POSIX::_exit(-1);
                    }
                    POSIX::_exit($res);
                } else {
                    # parent: remember the worker as started
                    $count++;
                    $w->{pid} = $pid;
                }
            }
        }

        # nothing is running (or fork failed), no need to wait any longer
        last if !$count;

        sleep(1);
    }
}
303
# Queue the command "bring service $sid into $state" for execution.
#
# An entry which is queued but not started yet gets replaced by the new
# command, while a command which is already running (has a pid) is left
# alone. $target is optional and only stored if it is set.
#
# fixme: use a queue and limit the number of parallel workers?
sub queue_resource_command {
    my ($self, $sid, $uid, $state, $target) = @_;

    my $queued = $self->{workers}->{$sid};
    if ($queued) {
        return if $queued->{pid}; # already started
        # else, drop the queue entry, it gets overwritten with the new command
        delete $self->{workers}->{$sid};
    }

    my $entry = {
        sid => $sid,
        uid => $uid,
        state => $state,
    };
    $self->{workers}->{$sid} = $entry;

    $entry->{target} = $target if $target;
}
322
# Reap finished worker processes and count the ones still running.
#
# Every worker which has a pid is polled with a non-blocking waitpid().
# Finished workers are passed to resource_command_finished() together
# with their wait status. If waitpid() reports that there is no such child
# process (-1), the worker cannot finish anymore; it is reported with
# status -1 instead of being counted as running forever.
#
# Returns the number of workers which are still running.
sub check_active_workers {
    my ($self) = @_;

    # finish/count workers
    my $count = 0;
    foreach my $sid (keys %{$self->{workers}}) {
        my $w = $self->{workers}->{$sid};
        my $pid = $w->{pid};
        next if !$pid; # queued only, not started yet

        # check status
        my $waitpid = waitpid($pid, WNOHANG);
        if (defined($waitpid) && ($waitpid == $pid)) {
            $self->resource_command_finished($sid, $w->{uid}, $?);
        } elsif (defined($waitpid) && ($waitpid == -1)) {
            # no such child process, so there is no wait status to fetch
            $self->resource_command_finished($sid, $w->{uid}, -1);
        } else {
            $count++;
        }
    }

    return $count;
}
343
# Record the result of a finished resource agent command.
#
# $status is a wait status as found in $?: -1 means the command could not
# be executed, a non-zero value in the lower 7 bits is the signal which
# terminated it; in both cases an error is logged and the exit code is
# recorded as -1. Otherwise the exit code is taken from the upper byte.
#
# The result is stored under $uid. After that all results whose uid is not
# referenced by $self->{service_status} are dropped, and the remaining
# ones are written out through write_lrm_status().
sub resource_command_finished {
    my ($self, $sid, $uid, $status) = @_;

    my $haenv = $self->{haenv};

    my $worker = delete $self->{workers}->{$sid};
    if (!$worker) {
        return; # should not happen
    }

    my $exit_code = -1;
    my $signal = $status & 127;

    if ($status == -1) {
        $haenv->log('err', "resource agent $sid finished - failed to execute");
    } elsif ($signal) {
        $haenv->log('err', "resource agent $sid finished - got signal $signal");
    } else {
        $exit_code = ($status >> 8);
    }

    $self->{results}->{$uid} = {
        sid => $worker->{sid},
        state => $worker->{state},
        exit_code => $exit_code,
    };

    # compute the set of valid/existing uids
    my $ss = $self->{service_status};
    my %is_valid_uid =
        map { ($_->{uid} => 1) }
        grep { $_->{uid} }
        values %$ss;

    # only keep results which belong to a valid uid
    my $all_results = $self->{results};
    my $kept = {
        map { ($_ => $all_results->{$_}) }
        grep { $is_valid_uid{$_} }
        keys %$all_results
    };
    $self->{results} = $kept;

    $haenv->write_lrm_status($kept);
}
387
5f095798 3881;