]> git.proxmox.com Git - pve-ha-manager.git/blame - src/PVE/HA/LRM.pm
bump version to 0.4-1
[pve-ha-manager.git] / src / PVE / HA / LRM.pm
CommitLineData
5f095798
DM
1package PVE::HA::LRM;
2
3# Local Resource Manager
4
5use strict;
6use warnings;
c4a221bc
DM
7use Data::Dumper;
8use POSIX qw(:sys_wait_h);
5f095798
DM
9
10use PVE::SafeSyslog;
11use PVE::Tools;
12use PVE::HA::Tools;
13
14# Server can have several states:
15
# Table of the states accepted by set_local_status(), each mapped to a
# short human readable description. Any state that is not a key of this
# hash is rejected with "invalid state".
my $valid_states = {
    wait_for_agent_lock => 'waiting for agent lock',
    active              => 'got agent_lock',
    lost_agent_lock     => 'lost agent_lock',
};
21
# Constructor for the local resource manager.
#
# $this  - class name, or an existing instance whose class is reused
# $haenv - HA environment object; stored as $self->{haenv}
#
# The object is created in the internal 'startup' state with empty
# worker/result tables and no pending shutdown request, and is then
# switched to 'wait_for_agent_lock' through set_local_status().
#
# Returns the blessed instance.
sub new {
    my ($this, $haenv) = @_;

    my $class = ref($this) || $this;

    my %fields = (
        haenv            => $haenv,
        status           => { state => 'startup' },
        workers          => {},
        results          => {},
        shutdown_request => 0,
    );
    my $self = bless \%fields, $class;

    $self->set_local_status({ state => 'wait_for_agent_lock' });

    return $self;
}
39
# Record that a shutdown was requested by setting
# $self->{shutdown_request}; do_one_iteration() inspects this flag.
# Returns 1 (the value just stored).
sub shutdown_request {
    my $self = shift;

    return $self->{shutdown_request} = 1;
}
45
# Return the current status hash reference ($self->{status}). The
# reference itself is returned, not a copy.
sub get_local_status {
    my $self = shift;

    my $status = $self->{status};

    return $status;
}
51
# Switch the LRM to a new status.
#
# $new - hash reference with at least a 'state' key; the state must be a
#        key of $valid_states, otherwise this dies with "invalid state".
#
# Nothing happens when the state is unchanged. Otherwise the transition
# is logged at 'info' level, $new->{state_change_time} is stamped with
# the environment's current time, and $new becomes $self->{status}.
sub set_local_status {
    my ($self, $new) = @_;

    $valid_states->{$new->{state}}
        or die "invalid state '$new->{state}'";

    my $previous = $self->{status};

    # important: only update if the state really changed
    return if $new->{state} eq $previous->{state};

    my $haenv = $self->{haenv};

    $haenv->log('info', "status change $previous->{state} => $new->{state}");

    $new->{state_change_time} = $haenv->get_time();

    $self->{status} = $new;
}
70
# Try to acquire the HA agent lock and keep the watchdog in step with it.
#
# On every successful lock acquisition the watchdog stored in
# $self->{ha_agent_wd} is updated, or opened (and remembered) if there is
# none yet.
#
# The lock is retried with a one second pause in between; retrying stops
# once the attempt counter exceeds 5 or more than 5 seconds (as reported
# by the environment clock) have passed since the first attempt.
#
# Returns 1 when the lock was obtained, 0 otherwise.
sub get_protected_ha_agent_lock {
    my ($self) = @_;

    my $haenv = $self->{haenv};

    my $attempts = 0;
    my $begin = $haenv->get_time();

    while (1) {

        if ($haenv->get_ha_agent_lock()) {
            if (my $wd = $self->{ha_agent_wd}) {
                $haenv->watchdog_update($wd);
            } else {
                $self->{ha_agent_wd} = $haenv->watchdog_open();
            }
            return 1;
        }

        $attempts++;
        last if $attempts > 5; # give up after too many attempts

        my $elapsed = $haenv->get_time() - $begin;
        last if $elapsed > 5; # give up after 5 seconds

        $haenv->sleep(1);
    }

    return 0;
}
101
546e2f1f
DM
# Count the services in $self->{service_status} that are assigned to the
# local node (as reported by the environment's nodename()) and whose
# requested state is defined and not 'stopped'.
#
# Returns the number of such services.
sub active_service_count {
    my ($self) = @_;

    my $haenv = $self->{haenv};

    my $local_node = $haenv->nodename();

    my $services = $self->{service_status};

    my @active = grep {
        my $entry = $services->{$_};
        $entry->{node}
            && $entry->{node} eq $local_node
            && defined($entry->{state})
            && $entry->{state} ne 'stopped';
    } keys %$services;

    return scalar(@active);
}
067cdf33 126
5f095798
DM
# Run one iteration of the LRM state machine.
#
# The iteration has two phases:
#
#  1. state changes: re-read the manager status, then move between
#     'wait_for_agent_lock', 'active' and 'lost_agent_lock' depending on
#     pending fence requests, quorum and whether the protected agent lock
#     can be obtained.
#  2. work: act according to the (possibly new) state - idle, manage
#     resources, or handle a pending shutdown request.
#
# Returns 0 on the shutdown paths (shutdown requested and nothing left to
# do) and 1 otherwise. Dies if the state is not one of the three above.
sub do_one_iteration {
    my ($self) = @_;

    my $haenv = $self->{haenv};

    my $state = $self->get_local_status()->{state};

    # refresh our copy of the manager's service table
    my $manager_status = $haenv->read_manager_status();
    $self->{service_status} = $manager_status->{service_status} || {};

    my $fence_request = PVE::HA::Tools::count_fenced_services(
        $self->{service_status}, $haenv->nodename());

    # closes and forgets the watchdog, if one is open; shared by both
    # shutdown paths below
    my $close_watchdog = sub {
        return if !$self->{ha_agent_wd};
        $haenv->watchdog_close($self->{ha_agent_wd});
        delete $self->{ha_agent_wd};
    };

    # phase 1: do state changes first

    my $ctime = $haenv->get_time(); # currently not used any further

    if ($state eq 'wait_for_agent_lock') {

        my $service_count = $self->active_service_count();

        # only grab the lock if there is something to manage here
        if (!$fence_request && $service_count && $haenv->quorate()
            && $self->get_protected_ha_agent_lock()) {
            $self->set_local_status({ state => 'active' });
        }

    } elsif ($state eq 'lost_agent_lock') {

        if (!$fence_request && $haenv->quorate()
            && $self->get_protected_ha_agent_lock()) {
            $self->set_local_status({ state => 'active' });
        }

    } elsif ($state eq 'active') {

        if ($fence_request) {
            $haenv->log('err', "node need to be fenced - releasing agent_lock\n");
            $self->set_local_status({ state => 'lost_agent_lock'});
        } elsif (!$self->get_protected_ha_agent_lock()) {
            $self->set_local_status({ state => 'lost_agent_lock'});
        }
    }

    # phase 2: do work, based on the state we are in now

    $state = $self->get_local_status()->{state};

    if ($state eq 'wait_for_agent_lock') {

        return 0 if $self->{shutdown_request};

        $haenv->sleep(5);

        return 1;
    }

    if ($state eq 'active') {

        my $work_start = $haenv->get_time();

        my $work_budget = 10; # seconds reserved for one work round

        my $shutdown = 0;

        # do work (at most $work_budget seconds)
        eval {
            # fixme: set alert timer

            if (!$self->{shutdown_request}) {

                $self->manage_resources();

            } elsif ($self->active_service_count() == 0) {

                # fixme: request service stop or relocate ?

                $close_watchdog->();

                $shutdown = 1;
            }
        };
        if (my $err = $@) {
            $haenv->log('err', "got unexpected error - $err");
        }

        return 0 if $shutdown;

        $haenv->sleep_until($work_start + $work_budget);

        return 1;
    }

    if ($state eq 'lost_agent_lock') {

        # Note: the watchdog is active and will trigger soon!
        # So we hope to get the lock back soon!

        if ($self->{shutdown_request}) {

            my $service_count = $self->active_service_count();

            if ($service_count > 0) {
                $haenv->log('err', "get shutdown request in state 'lost_agent_lock' - " .
                            "detected $service_count running services");
            } else {
                # all services are stopped, so we can close the watchdog
                $close_watchdog->();

                return 0;
            }
        }

        $haenv->sleep(5);

        return 1;
    }

    die "got unexpected status '$state'\n";
}
261
c4a221bc
DM
# Queue and execute resource agent commands for all services assigned to
# this node.
#
# First, every entry of $self->{service_status} that belongs to the local
# node and carries both a uid and a requested state is queued through
# queue_resource_command(); queueing errors are logged and skipped.
#
# Then, for up to 5 seconds (environment clock), queued workers are
# started - at most 4 at a time. If the environment can fork, each worker
# runs in a child process whose exit code is the agent's return value and
# is collected later by check_active_workers(). Otherwise the agent runs
# synchronously and its result is reported right away.
#
# Workers without a matching entry in the service configuration are
# logged and left in the queue.
sub manage_resources {
    my ($self) = @_;

    my $haenv = $self->{haenv};

    my $nodename = $haenv->nodename();

    my $ss = $self->{service_status};

    foreach my $sid (keys %$ss) {
        my $sd = $ss->{$sid};
        next if !$sd->{node};
        next if !$sd->{uid};
        next if $sd->{node} ne $nodename;
        my $req_state = $sd->{state};
        next if !defined($req_state);
        eval {
            $self->queue_resource_command($sid, $sd->{uid}, $req_state, $sd->{target});
        };
        if (my $err = $@) {
            $haenv->log('err', "unable to run resource agent for '$sid' - $err"); # fixme
        }
    }

    my $starttime = $haenv->get_time();

    # start workers
    my $max_workers = 4;

    my $sc = $haenv->read_service_config();

    while (($haenv->get_time() - $starttime) < 5) {
        # reap finished children and learn how many are still running
        my $count = $self->check_active_workers();

        foreach my $sid (keys %{$self->{workers}}) {
            last if $count >= $max_workers;
            my $w = $self->{workers}->{$sid};
            my $cd = $sc->{$sid};
            if (!$cd) {
                $haenv->log('err', "missing resource configuration for '$sid'");
                next;
            }
            if (!$w->{pid}) {
                if ($haenv->can_fork()) {
                    my $pid = fork();
                    if (!defined($pid)) {
                        $haenv->log('err', "fork worker failed");
                        $count = 0; last; # abort, try later
                    } elsif ($pid == 0) {
                        # child: do the work and exit with the agent's result
                        my $res = -1;
                        eval {
                            $res = $haenv->exec_resource_agent($sid, $cd, $w->{state}, $w->{target});
                        };
                        if (my $err = $@) {
                            $haenv->log('err', $err);
                            POSIX::_exit(-1);
                        }
                        POSIX::_exit($res);
                    } else {
                        # parent: remember the child, it counts as active
                        $count++;
                        $w->{pid} = $pid;
                    }
                } else {
                    # no fork available: run the agent synchronously
                    my $res = -1;
                    eval {
                        $res = $haenv->exec_resource_agent($sid, $cd, $w->{state}, $w->{target});
                        # resource_command_finished() decodes its argument
                        # like a wait() status (signal in the low 7 bits,
                        # exit code in the high byte). Encode the plain
                        # return code accordingly, otherwise e.g. a return
                        # code of 1 is misreported as "got signal 1".
                        $res = $res << 8 if defined($res) && $res > 0;
                    };
                    if (my $err = $@) {
                        $haenv->log('err', $err);
                    }
                    $self->resource_command_finished($sid, $w->{uid}, $res);
                }
            }
        }

        # nothing running in the background - no need to wait any longer
        last if !$count;

        $haenv->sleep(1);
    }
}
343
344# fixme: use a queue and limit number of parallel workers?
# Queue a resource agent command for a service.
#
# $sid    - service id, used as key in $self->{workers}
# $uid    - uid of the request; later used as key for the result
# $state  - requested service state
# $target - optional; only stored when it has a true value
#
# A command whose worker is already running (it has a pid) is left
# untouched. An entry that was queued but not started yet is replaced by
# the new command.
sub queue_resource_command {
    my ($self, $sid, $uid, $state, $target) = @_;

    my $pending = $self->{workers}->{$sid};

    return if $pending && $pending->{pid}; # already started

    # not started (or not queued at all): (re)place the queue entry
    my $entry = { sid => $sid, uid => $uid, state => $state };
    $self->{workers}->{$sid} = $entry;

    $entry->{target} = $target if $target;
}
362
# Reap finished worker processes and count the ones still running.
#
# Every entry of $self->{workers} that has a pid is polled with a
# non-blocking waitpid():
#  - child exited: its wait status ($?) is passed on to
#    resource_command_finished()
#  - child does not exist any more (waitpid returns -1, e.g. because it
#    was already reaped elsewhere): the worker is reported as finished
#    with status -1, so it does not stay in the worker table forever and
#    block any further command for that service
#  - otherwise the worker is still running and is counted
#
# Returns the number of workers that are still running.
sub check_active_workers {
    my ($self) = @_;

    # finish/count workers
    my $count = 0;
    foreach my $sid (keys %{$self->{workers}}) {
        my $w = $self->{workers}->{$sid};
        my $pid = $w->{pid};
        next if !$pid; # queued, but not started yet

        # check status
        my $waitpid = waitpid($pid, WNOHANG);
        if (defined($waitpid) && ($waitpid == $pid)) {
            $self->resource_command_finished($sid, $w->{uid}, $?);
        } elsif (defined($waitpid) && ($waitpid == -1)) {
            # no such child - the real exit status is unknown
            $self->resource_command_finished($sid, $w->{uid}, -1);
        } else {
            $count++;
        }
    }

    return $count;
}
383
# Record the outcome of a finished resource agent command.
#
# $sid    - service id; its entry is removed from $self->{workers}
# $uid    - uid of the request, used as key in the result table
# $status - wait()-style status: -1 means the command could not be
#           executed, a non-zero value in the low 7 bits is a signal
#           number, otherwise the exit code is in the bits above bit 7
#
# The stored exit_code is -1 for "failed to execute" and for "killed by
# signal" (both are logged as errors). Afterwards all results whose uid
# is no longer present in $self->{service_status} are dropped, and the
# remaining results are written out through write_lrm_status().
#
# Does nothing if no worker is registered for $sid.
sub resource_command_finished {
    my ($self, $sid, $uid, $status) = @_;

    my $haenv = $self->{haenv};

    my $worker = delete $self->{workers}->{$sid};
    return if !$worker; # should not happen

    my $exit_code;

    if ($status == -1) {
        $haenv->log('err', "resource agent $sid finished - failed to execute");
        $exit_code = -1;
    } elsif (my $signal = ($status & 127)) {
        $haenv->log('err', "resource agent $sid finished - got signal $signal");
        $exit_code = -1;
    } else {
        $exit_code = ($status >> 8);
    }

    $self->{results}->{$uid} = {
        sid => $worker->{sid},
        state => $worker->{state},
        exit_code => $exit_code,
    };

    # collect the uids the manager still knows about
    my $services = $self->{service_status};
    my %is_valid_uid = map {
        my $entry = $services->{$_};
        $entry->{uid} ? ($entry->{uid} => 1) : ();
    } keys %$services;

    # keep only the results that belong to a valid uid
    my %kept = map { $_ => $self->{results}->{$_} }
        grep { $is_valid_uid{$_} } keys %{$self->{results}};

    $self->{results} = \%kept;

    $haenv->write_lrm_status($self->{results});
}
427
5f095798 4281;