]> git.proxmox.com Git - pve-ha-manager.git/blame - src/PVE/HA/LRM.pm
TestHardware: correct shutdown/reboot behaviour of CRM and LRM
[pve-ha-manager.git] / src / PVE / HA / LRM.pm
CommitLineData
5f095798
DM
1package PVE::HA::LRM;
2
3# Local Resource Manager
4
5use strict;
6use warnings;
c4a221bc
DM
7use Data::Dumper;
8use POSIX qw(:sys_wait_h);
5f095798
DM
9
10use PVE::SafeSyslog;
11use PVE::Tools;
a89ff919 12use PVE::HA::Tools ':exit_codes';
5f095798
DM
13
# States the LRM state machine may be switched to via set_local_status()
# (state name => short human readable description):
my $valid_states = {
    wait_for_agent_lock => "waiting for agent lock",
    active              => "got agent_lock",
    lost_agent_lock     => "lost agent_lock",
};
21
# Constructor.
#
# $this  - class name, or an existing instance whose class is reused
# $haenv - HA environment object, stored as $self->{haenv}
#
# The object starts in the pseudo state 'startup' and is switched to
# 'wait_for_agent_lock' through set_local_status() before it is returned.
sub new {
    my ($this, $haenv) = @_;

    my $class = ref($this) || $this;

    my %fields = (
        haenv => $haenv,
        status => { state => 'startup' },
        workers => {},
        results => {},
        restart_tries => {},
        shutdown_request => 0,
        shutdown_errors => 0,
        # mode can be: active, reboot, shutdown, restart
        mode => 'active',
    );

    my $self = bless { %fields }, $class;

    $self->set_local_status({ state => 'wait_for_agent_lock' });

    return $self;
}
43
# Handle a request to shut down the LRM.
#
# Does nothing if a shutdown request was already registered. Otherwise:
#  - if the HA environment reports a node shutdown, mode becomes 'shutdown'
#    and a 'request_stop' job is queued for every service located on this
#    node
#  - else mode becomes 'restart' (services get frozen, not stopped)
#
# Finally the shutdown_request flag is set and the LRM status is written.
# A failure to write the status is logged and does not propagate.
sub shutdown_request {
    my ($self) = @_;

    return if $self->{shutdown_request}; # already in shutdown mode

    my $haenv = $self->{haenv};

    my $nodename = $haenv->nodename();

    my $shutdown = $haenv->is_node_shutdown();

    if ($shutdown) {
        $haenv->log('info', "shutdown LRM, stop all services");
        $self->{mode} = 'shutdown';

        # queue stop jobs for all services

        my $ss = $self->{service_status};

        foreach my $sid (keys %$ss) {
            my $sd = $ss->{$sid};
            next if !$sd->{node};
            next if $sd->{node} ne $nodename;
            # Note: use undef uid to mark shutdown/stop jobs
            $self->queue_resource_command($sid, undef, 'request_stop');
        }

    } else {
        $haenv->log('info', "restart LRM, freeze all services");
        $self->{mode} = 'restart';
    }

    $self->{shutdown_request} = 1;

    eval { $self->update_lrm_status(); };
    if (my $err = $@) {
        # log through the HA environment, this class has no log() method
        $haenv->log('err', "unable to update lrm status file - $err");
    }
}
83
# Accessor: returns the current local status (hash reference with at least
# a 'state' entry) as stored in $self->{status}.
sub get_local_status {
    my $self = shift;

    return $self->{status};
}
89
# Switch the local status to $new (hash reference with a 'state' entry).
#
# Dies if $new->{state} is not listed in $valid_states. If the state is
# unchanged nothing happens; otherwise the change is logged, the current
# time is recorded as $new->{state_change_time} and $new replaces
# $self->{status}.
sub set_local_status {
    my ($self, $new) = @_;

    my $new_state = $new->{state};

    die "invalid state '$new_state'" if !$valid_states->{$new_state};

    my $current = $self->{status};

    # important: only update if the state really changed
    return if $current->{state} eq $new_state;

    my $haenv = $self->{haenv};

    $haenv->log('info', "status change $current->{state} => $new_state");

    $new->{state_change_time} = $haenv->get_time();

    $self->{status} = $new;
}
108
9c7d068b
DM
# Write the LRM status (mode, results, current timestamp) through the HA
# environment.
#
# Returns 0 without writing when the environment is not quorate, 0 when
# writing failed (the error is logged), and 1 on success.
sub update_lrm_status {
    my ($self) = @_;

    my $haenv = $self->{haenv};

    $haenv->quorate() or return 0;

    my %lrm_status = (
        mode => $self->{mode},
        results => $self->{results},
        timestamp => $haenv->get_time(),
    );

    eval { $haenv->write_lrm_status(\%lrm_status); };
    my $err = $@;

    return 1 if !$err;

    $haenv->log('err', "unable to write lrm status file - $err");

    return 0;
}
130
5f095798
DM
# Try to acquire the HA agent lock and protect it with the watchdog.
#
# On success the already open watchdog ($self->{ha_agent_wd}) is updated,
# or a new one is opened and remembered, and 1 is returned.
#
# On failure the attempt is repeated after sleeping one second, until the
# attempt counter exceeds 5 or more than 5 seconds have passed since the
# first attempt; then 0 is returned.
sub get_protected_ha_agent_lock {
    my ($self) = @_;

    my $haenv = $self->{haenv};

    my $attempts = 0;
    my $begin = $haenv->get_time();

    while (1) {

        if ($haenv->get_ha_agent_lock()) {
            if (my $wd = $self->{ha_agent_wd}) {
                $haenv->watchdog_update($wd);
            } else {
                $self->{ha_agent_wd} = $haenv->watchdog_open();
            }
            return 1;
        }

        $attempts++;
        return 0 if $attempts > 5;

        my $elapsed = $haenv->get_time() - $begin;
        return 0 if $elapsed > 5;

        $haenv->sleep(1);
    }
}
161
546e2f1f
DM
# Count the services from $self->{service_status} which are located on
# this node and have a defined requested state other than 'stopped' or
# 'freeze'.
sub active_service_count {
    my ($self) = @_;

    my $nodename = $self->{haenv}->nodename();

    my $ss = $self->{service_status};

    my @active = grep {
        my $sd = $ss->{$_};
        $sd->{node}
            && $sd->{node} eq $nodename
            && defined($sd->{state})
            && $sd->{state} ne 'stopped'
            && $sd->{state} ne 'freeze';
    } keys %$ss;

    return scalar(@active);
}
5bd7aa54
DM
187
# Becomes 1 after the LRM status was written successfully for the first time.
my $wrote_lrm_status_at_startup = 0;

# Run one iteration of the LRM main loop.
#
# Until the LRM status could be written once, an iteration only retries
# that write. Afterwards each iteration reads the manager status, performs
# the state transition for the current state (wait_for_agent_lock, active,
# lost_agent_lock) and then does the work belonging to the resulting state.
#
# Returns 1 if another iteration should follow and 0 if the loop should
# end. Dies if the local status holds an unknown state.
sub do_one_iteration {
    my ($self) = @_;

    my $haenv = $self->{haenv};

    if (!$wrote_lrm_status_at_startup) {
        if (!$self->update_lrm_status()) {
            # do nothing
            $haenv->sleep(5);
            return $self->{shutdown_request} ? 0 : 1;
        }
        $wrote_lrm_status_at_startup = 1;
    }

    # closes and forgets the agent watchdog, if one is open
    my $close_watchdog = sub {
        return if !$self->{ha_agent_wd};
        $haenv->watchdog_close($self->{ha_agent_wd});
        delete $self->{ha_agent_wd};
    };

    my $state = $self->get_local_status()->{state};

    my $ms = $haenv->read_manager_status();
    $self->{service_status} = $ms->{service_status} || {};

    my $fence_request = PVE::HA::Tools::count_fenced_services(
        $self->{service_status}, $haenv->nodename());

    # do state changes first

    my $ctime = $haenv->get_time(); # not used further in this sub

    if ($state eq 'wait_for_agent_lock') {

        my $service_count = $self->active_service_count();

        # only try to get the lock if there is something to manage
        if (!$fence_request && $service_count && $haenv->quorate()
            && $self->get_protected_ha_agent_lock()) {
            $self->set_local_status({ state => 'active' });
        }

    } elsif ($state eq 'lost_agent_lock') {

        if (!$fence_request && $haenv->quorate()
            && $self->get_protected_ha_agent_lock()) {
            $self->set_local_status({ state => 'active' });
        }

    } elsif ($state eq 'active') {

        if ($fence_request) {
            $haenv->log('err', "node need to be fenced - releasing agent_lock\n");
            $self->set_local_status({ state => 'lost_agent_lock'});
        } elsif (!$self->get_protected_ha_agent_lock()) {
            $self->set_local_status({ state => 'lost_agent_lock'});
        }
    }

    # the transitions above may have changed the state
    $state = $self->get_local_status()->{state};

    # do work

    if ($state eq 'wait_for_agent_lock') {

        return 0 if $self->{shutdown_request};

        $self->update_lrm_status();

        $haenv->sleep(5);

    } elsif ($state eq 'active') {

        my $begin = $haenv->get_time();

        my $max_time = 10;

        my $shutdown = 0;

        # do work (max_time seconds)
        eval {
            # fixme: set alert timer

            if (!$self->{shutdown_request}) {

                $self->manage_resources();

            } elsif ($self->{mode} eq 'restart') {

                # restart: done as soon as nothing is active and no
                # worker is left
                if ($self->active_service_count() == 0
                    && $self->run_workers() == 0) {
                    $close_watchdog->();
                    $shutdown = 1;
                }

            } elsif ($self->run_workers() == 0) {

                # the watchdog stays open if a stop job failed
                $close_watchdog->() if $self->{shutdown_errors} == 0;

                $shutdown = 1;

                # shutdown with all services stopped thus release the lock
                $haenv->release_ha_agent_lock();
            }
        };
        if (my $err = $@) {
            $haenv->log('err', "got unexpected error - $err");
        }

        $self->update_lrm_status();

        return 0 if $shutdown;

        $haenv->sleep_until($begin + $max_time);

    } elsif ($state eq 'lost_agent_lock') {

        # Note: the watchdog is active and will trigger soon!

        # so we hope to get the lock back soon!

        if ($self->{shutdown_request}) {

            my $service_count = $self->active_service_count();

            if ($service_count <= 0) {

                # all services are stopped, so we can close the watchdog
                $close_watchdog->();

                return 0;
            }

            $haenv->log('err', "get shutdown request in state 'lost_agent_lock' - detected $service_count running services");
        }

        $haenv->sleep(5);

    } else {

        die "got unexpected status '$state'\n";

    }

    return 1;
}
356
# Start queued workers and collect finished ones, for roughly 5 seconds at
# most (measured with the HA environment's clock).
#
# At most 4 workers are counted as running at the same time. If the
# environment can fork, each queued command runs in a child process whose
# exit status is collected by check_active_workers(); otherwise the
# resource agent is executed inline and its result is reported right away.
# Queue entries without a service configuration are logged and skipped.
#
# Returns the number of entries left in $self->{workers}.
sub run_workers {
    my ($self) = @_;

    my $haenv = $self->{haenv};

    my $begin = $haenv->get_time();

    # start workers
    my $max_workers = 4;

    my $sc = $haenv->read_service_config();

    while (($haenv->get_time() - $begin) < 5) {
        my $count = $self->check_active_workers();

        SERVICE:
        foreach my $sid (keys %{$self->{workers}}) {
            last SERVICE if $count >= $max_workers;

            my $w = $self->{workers}->{$sid};
            my $cd = $sc->{$sid};
            if (!$cd) {
                $haenv->log('err', "missing resource configuration for '$sid'");
                next SERVICE;
            }

            next SERVICE if $w->{pid}; # already running

            if (!$haenv->can_fork()) {
                # run inline and report the result immediately
                my $res = -1;
                eval {
                    $res = $haenv->exec_resource_agent($sid, $cd, $w->{state}, $w->{target});
                    # encode like a wait status, as the finish handlers expect
                    $res = $res << 8 if $res > 0;
                };
                if (my $err = $@) {
                    $haenv->log('err', $err);
                }
                if (defined($w->{uid})) {
                    $self->resource_command_finished($sid, $w->{uid}, $res);
                } else {
                    $self->stop_command_finished($sid, $res);
                }
                next SERVICE;
            }

            my $pid = fork();
            if (!defined($pid)) {
                $haenv->log('err', "fork worker failed");
                $count = 0;
                last SERVICE; # abort, try later
            }

            if ($pid == 0) {
                # child: do work, never returns
                my $res = -1;
                eval {
                    $res = $haenv->exec_resource_agent($sid, $cd, $w->{state}, $w->{target});
                };
                if (my $err = $@) {
                    $haenv->log('err', $err);
                    POSIX::_exit(-1);
                }
                POSIX::_exit($res);
            }

            # parent: remember the child
            $count++;
            $w->{pid} = $pid;
        }

        last if !$count;

        $haenv->sleep(1);
    }

    return scalar(keys %{$self->{workers}});
}
426
# Queue the requested state of every service assigned to this node as a
# resource command, then run the workers.
#
# Services without node, without uid, located on another node, without a
# requested state or with requested state 'freeze' are skipped.
#
# Returns the result of run_workers().
sub manage_resources {
    my ($self) = @_;

    my $nodename = $self->{haenv}->nodename();

    my $ss = $self->{service_status};

    foreach my $sid (keys %$ss) {
        my $sd = $ss->{$sid};

        next if !($sd->{node} && $sd->{uid});
        next if $sd->{node} ne $nodename;

        my $req_state = $sd->{state};
        next if !defined($req_state) || $req_state eq 'freeze';

        $self->queue_resource_command($sid, $sd->{uid}, $req_state, $sd->{target});
    }

    return $self->run_workers();
}
449
# Queue a command for service $sid in $self->{workers}.
#
# $uid    - id of the request, undef marks shutdown/stop jobs
# $state  - requested state, i.e. the command to execute
# $target - optional target, only stored if true
#
# A command which was already started (its queue entry has a pid) is left
# untouched; a queued but not yet started entry is replaced.
sub queue_resource_command {
    my ($self, $sid, $uid, $state, $target) = @_;

    my $queued = $self->{workers}->{$sid};

    return if $queued && $queued->{pid}; # already started

    # else, overwrite queue entry with new command
    $self->{workers}->{$sid} = {
        sid => $sid,
        uid => $uid,
        state => $state,
    };

    $self->{workers}->{$sid}->{target} = $target if $target;
}
467
# Reap finished worker processes and count the ones still running.
#
# For every queue entry with a pid a non-blocking waitpid() is done. If
# the child has exited, its wait status is passed on to
# resource_command_finished() (entry has a uid) or stop_command_finished()
# (entry without uid); otherwise the worker is counted.
#
# Returns the number of workers with a pid which were not reaped.
sub check_active_workers {
    my ($self) = @_;

    # finish/count workers
    my $running = 0;

    foreach my $sid (keys %{$self->{workers}}) {
        my $w = $self->{workers}->{$sid};

        my $pid = $w->{pid};
        next if !$pid; # queued only, not started

        # check status
        my $reaped = waitpid($pid, WNOHANG);
        my $status = $?;

        if (!defined($reaped) || $reaped != $pid) {
            $running++;
            next;
        }

        if (defined($w->{uid})) {
            $self->resource_command_finished($sid, $w->{uid}, $status);
        } else {
            $self->stop_command_finished($sid, $status);
        }
    }

    return $running;
}
492
116dea30
DM
# Called when a shutdown/stop job (queue entry without uid) has finished.
#
# $status is a wait status as found in $?: -1 means the agent could not be
# executed, a non-zero low 7 bits value means it was killed by a signal,
# otherwise the exit code is $status >> 8.
#
# Removes the queue entry for $sid and increments
# $self->{shutdown_errors} unless the exit code is 0.
sub stop_command_finished {
    my ($self, $sid, $status) = @_;

    my $haenv = $self->{haenv};

    my $w = delete $self->{workers}->{$sid};
    return if !$w; # should not happen

    my $exit_code = -1;

    if ($status == -1) {
        $haenv->log('err', "resource agent $sid finished - failed to execute");
    } else {
        my $sig = $status & 127;
        if ($sig) {
            $haenv->log('err', "resource agent $sid finished - got signal $sig");
        } else {
            $exit_code = $status >> 8;
        }
    }

    $self->{shutdown_errors}++ if $exit_code != 0;
}
515
c4a221bc
DM
# Called when the resource command with request id $uid for service $sid
# has finished.
#
# $status is a wait status as found in $?: -1 means the agent could not be
# executed, a non-zero low 7 bits value means it was killed by a signal,
# otherwise the exit code is $status >> 8. The exit code is passed through
# handle_service_exitcode() before it is stored.
#
# Removes the queue entry for $sid, records the result under
# $self->{results}->{$uid} and afterwards drops all results whose uid is
# not referenced by $self->{service_status} anymore.
sub resource_command_finished {
    my ($self, $sid, $uid, $status) = @_;

    my $haenv = $self->{haenv};

    my $w = delete $self->{workers}->{$sid};
    return if !$w; # should not happen

    my $exit_code = -1;

    if ($status == -1) {
        $haenv->log('err', "resource agent $sid finished - failed to execute");
    } else {
        my $sig = $status & 127;
        if ($sig) {
            $haenv->log('err', "resource agent $sid finished - got signal $sig");
        } else {
            $exit_code = $status >> 8;
        }
    }

    $exit_code = $self->handle_service_exitcode($sid, $w->{state}, $exit_code);

    $self->{results}->{$uid} = {
        sid => $w->{sid},
        state => $w->{state},
        exit_code => $exit_code,
    };

    my $ss = $self->{service_status};

    # compute hash of valid/existing uids
    my %valid_uid;
    foreach my $service (keys %$ss) {
        my $known_uid = $ss->{$service}->{uid};
        $valid_uid{$known_uid} = 1 if $known_uid;
    }

    # keep only results which still belong to a known uid
    my $kept = {};
    foreach my $id (grep { $valid_uid{$_} } keys %{$self->{results}}) {
        $kept->{$id} = $self->{results}->{$id};
    }
    $self->{results} = $kept;
}
559
ea4443cc
TL
# Processes the exit code from a finished resource agent, so that the CRM
# knows if the LRM wants to retry an action based on the current recovery
# policies for the failed service, or the CRM itself must try to recover
# from the failure.
#
# $sid       - service id
# $cmd       - the executed command (requested state); only 'started' is
#              subject to the restart policy
# $exit_code - exit code of the resource agent
#
# For a failed start (ERROR) the service is retried on this node up to
# 'max_restart' times (taken from the service configuration, 0 if the
# service or the setting is missing): ETRY_AGAIN is returned while retries
# are left, ERROR once they are used up. The per-service try counter in
# $self->{restart_tries} is reset on success and when giving up.
#
# All other exit codes and commands are returned unchanged.
sub handle_service_exitcode {
    my ($self, $sid, $cmd, $exit_code) = @_;

    my $haenv = $self->{haenv};
    my $tries = $self->{restart_tries};

    my $sc = $haenv->read_service_config();

    # a service without configuration entry gets no restart attempts
    my $max_restart = 0;
    if (my $cd = $sc->{$sid}) {
        $max_restart = $cd->{max_restart} if defined($cd->{max_restart});
    }

    if ($cmd eq 'started') {

        if ($exit_code == SUCCESS) {

            $tries->{$sid} = 0;

            return $exit_code;

        } elsif ($exit_code == ERROR) {

            $tries->{$sid} = 0 if !defined($tries->{$sid});

            # compare before counting, else the first failure would already
            # use up one of the allowed restarts
            if ($tries->{$sid} >= $max_restart) {
                $haenv->log('err', "unable to start service $sid on local node".
                            " after $tries->{$sid} retries");
                $tries->{$sid} = 0;
                return ERROR;
            }

            $tries->{$sid}++;

            # tell CRM that we retry the start
            return ETRY_AGAIN;
        }
    }

    return $exit_code;

}
600
5f095798 6011;