]> git.proxmox.com Git - pve-ha-manager.git/blame - src/PVE/HA/LRM.pm
bump version to 1.0-19
[pve-ha-manager.git] / src / PVE / HA / LRM.pm
CommitLineData
5f095798
DM
1package PVE::HA::LRM;
2
3# Local Resource Manager
4
5use strict;
6use warnings;
c4a221bc
DM
7use Data::Dumper;
8use POSIX qw(:sys_wait_h);
5f095798
DM
9
10use PVE::SafeSyslog;
11use PVE::Tools;
a89ff919 12use PVE::HA::Tools ':exit_codes';
5f095798
DM
13
# States accepted by set_local_status(); any other state name makes it die.
# Each value is a short human-readable description of the state.

my $valid_states = {
    'wait_for_agent_lock' => "waiting for agent lock",
    'active'              => "got agent_lock",
    'lost_agent_lock'     => "lost agent_lock",
};
21
# Constructor. Takes the HA environment object and returns a new LRM
# instance. May be invoked on the class name or on an existing instance.
sub new {
    my ($this, $haenv) = @_;

    # when called on an instance, reuse the class of that instance
    my $class = ref($this) || $this;

    my %fields = (
        haenv => $haenv,
        status => { state => 'startup' },
        workers => {},
        results => {},
        restart_tries => {},
        shutdown_request => 0,
        shutdown_errors => 0,
        # mode can be: active, reboot, shutdown, restart
        mode => 'active',
    );

    my $self = bless \%fields, $class;

    # leave the initial 'startup' state right away
    $self->set_local_status({ state => 'wait_for_agent_lock' });

    return $self;
}
43
# Put the LRM into shutdown handling. Does nothing if a shutdown was already
# requested.
#
# If the environment reports a node shutdown, the mode becomes 'shutdown'
# and a 'request_stop' command is queued for every service located on this
# node. Otherwise the mode becomes 'restart'. In both cases the LRM status
# file is updated afterwards; a failure to do so is logged, not raised.
sub shutdown_request {
    my ($self) = @_;

    return if $self->{shutdown_request}; # already in shutdown mode

    my $haenv = $self->{haenv};

    my $nodename = $haenv->nodename();

    my $shutdown = $haenv->is_node_shutdown();

    if ($shutdown) {
        $haenv->log('info', "shutdown LRM, stop all services");
        $self->{mode} = 'shutdown';

        # queue stop jobs for all services

        # service_status is only filled in by do_one_iteration()
        my $ss = $self->{service_status} || {};

        foreach my $sid (keys %$ss) {
            my $sd = $ss->{$sid};
            next if !$sd->{node};
            next if $sd->{node} ne $nodename;
            # Note: use undef uid to mark shutdown/stop jobs
            $self->queue_resource_command($sid, undef, 'request_stop');
        }

    } else {
        $haenv->log('info', "restart LRM, freeze all services");
        $self->{mode} = 'restart';
    }

    $self->{shutdown_request} = 1;

    eval { $self->update_lrm_status(); };
    if (my $err = $@) {
        # log through the environment, this class has no log method itself
        $haenv->log('err', "unable to update lrm status file - $err");
    }
}
83
# Return the current local status hash (the value stored under 'status').
sub get_local_status {
    my $self = shift;

    return $self->{status};
}
89
# Switch to the status described by $new (a hash with a 'state' key).
# Dies if the state is not listed in $valid_states. When the state differs
# from the current one, the change is logged, the change time is recorded in
# $new->{state_change_time} and $new becomes the current status.
sub set_local_status {
    my ($self, $new) = @_;

    if (!$valid_states->{$new->{state}}) {
        die "invalid state '$new->{state}'";
    }

    my ($haenv, $old) = @{$self}{qw(haenv status)};

    # important: only update if the state really changed
    return if $old->{state} eq $new->{state};

    $haenv->log('info', "status change $old->{state} => $new->{state}");

    $new->{state_change_time} = $haenv->get_time();

    $self->{status} = $new;
}
108
9c7d068b
DM
# Write the current mode, the collected results and a timestamp to the LRM
# status file through the environment.
#
# Returns 1 on success. Returns 0 without writing when the environment is
# not quorate, and 0 after logging when the write itself fails.
sub update_lrm_status {
    my ($self) = @_;

    my $env = $self->{haenv};

    if (!$env->quorate()) {
        return 0;
    }

    my %snapshot = (
        mode => $self->{mode},
        results => $self->{results},
        timestamp => $env->get_time(),
    );

    eval { $env->write_lrm_status(\%snapshot); };
    if (my $err = $@) {
        $env->log('err', "unable to write lrm status file - $err");
        return 0;
    }

    return 1;
}
130
5f095798
DM
# Try to acquire the HA agent lock, retrying once per second for a limited
# time. On success the watchdog is updated, or opened and remembered in
# $self->{ha_agent_wd} if none is open yet.
#
# Returns 1 when the lock was acquired, 0 otherwise.
sub get_protected_ha_agent_lock {
    my ($self) = @_;

    my $haenv = $self->{haenv};

    my $attempts = 0;
    my $begin = $haenv->get_time();

    while (1) {

        if ($haenv->get_ha_agent_lock()) {
            if (my $wd = $self->{ha_agent_wd}) {
                $haenv->watchdog_update($wd);
            } else {
                $self->{ha_agent_wd} = $haenv->watchdog_open();
            }
            return 1;
        }

        # give up after more than 5 failed attempts ...
        $attempts++;
        last if $attempts > 5;

        # ... or once more than 5 seconds have passed
        my $elapsed = $haenv->get_time() - $begin;
        last if $elapsed > 5;

        $haenv->sleep(1);
    }

    return 0;
}
161
546e2f1f
DM
# Count the services in $self->{service_status} that are located on this
# node and have a defined requested state other than 'stopped' or 'freeze'.
sub active_service_count {
    my ($self) = @_;

    my $haenv = $self->{haenv};

    my $nodename = $haenv->nodename();

    my $ss = $self->{service_status};

    my @active = grep {
        my $sd = $ss->{$_};
        my $req_state = $sd->{state};
        $sd->{node}
            && $sd->{node} eq $nodename
            && defined($req_state)
            && $req_state ne 'stopped'
            && $req_state ne 'freeze';
    } keys %$ss;

    return scalar(@active);
}
5bd7aa54
DM
187
# Set once the LRM status file was written successfully for the first time.
my $wrote_lrm_status_at_startup = 0;

# Run one pass of the LRM main loop.
#
# Until the first status write succeeds, nothing else is done. After that
# the manager status is read, the local state machine is advanced
# (wait_for_agent_lock / active / lost_agent_lock) and the work belonging to
# the resulting state is carried out.
#
# Returns 1 if the caller should run another iteration, 0 if the LRM is done.
# Dies if the local state is none of the three known states.
sub do_one_iteration {
    my ($self) = @_;

    my $haenv = $self->{haenv};

    if (!$wrote_lrm_status_at_startup) {
        if (!$self->update_lrm_status()) {
            # do nothing else until the status could be written
            $haenv->sleep(5);
            return $self->{shutdown_request} ? 0 : 1;
        }
        $wrote_lrm_status_at_startup = 1;
    }

    my $state = $self->get_local_status()->{state};

    my $manager_status = $haenv->read_manager_status();
    $self->{service_status} = $manager_status->{service_status} || {};

    my $fence_request = PVE::HA::Tools::count_fenced_services(
        $self->{service_status}, $haenv->nodename());

    # do state changes first

    # value is not used below; the call is kept as in the original sequence
    my $ctime = $haenv->get_time();

    if ($state eq 'wait_for_agent_lock') {

        my $service_count = $self->active_service_count();

        # only take the lock when not fenced, with active services and quorum
        if (!$fence_request && $service_count && $haenv->quorate()
            && $self->get_protected_ha_agent_lock()) {
            $self->set_local_status({ state => 'active' });
        }

    } elsif ($state eq 'lost_agent_lock') {

        if (!$fence_request && $haenv->quorate()
            && $self->get_protected_ha_agent_lock()) {
            $self->set_local_status({ state => 'active' });
        }

    } elsif ($state eq 'active') {

        my $lock_lost;
        if ($fence_request) {
            $haenv->log('err', "node need to be fenced - releasing agent_lock\n");
            $lock_lost = 1;
        } else {
            $lock_lost = !$self->get_protected_ha_agent_lock();
        }
        $self->set_local_status({ state => 'lost_agent_lock'}) if $lock_lost;
    }

    $state = $self->get_local_status()->{state};

    # close and forget the watchdog handle, if one is open
    my $close_watchdog = sub {
        return if !$self->{ha_agent_wd};
        $haenv->watchdog_close($self->{ha_agent_wd});
        delete $self->{ha_agent_wd};
    };

    # do work

    if ($state eq 'wait_for_agent_lock') {

        return 0 if $self->{shutdown_request};

        $self->update_lrm_status();

        $haenv->sleep(5);

        return 1;
    }

    if ($state eq 'active') {

        my $begin = $haenv->get_time();

        my $max_time = 10;

        my $finished = 0;

        # do work (max_time seconds)
        eval {
            # fixme: set alert timer

            if (!$self->{shutdown_request}) {

                $self->manage_resources();

            } elsif ($self->{mode} eq 'restart') {

                if ($self->active_service_count() == 0 && $self->run_workers() == 0) {
                    $close_watchdog->();

                    $finished = 1;

                    # restart with no or freezed services, release the lock
                    $haenv->release_ha_agent_lock();
                }

            } elsif ($self->run_workers() == 0) {

                # the watchdog is only closed if no stop job failed
                $close_watchdog->() if $self->{shutdown_errors} == 0;

                $finished = 1;

                # shutdown with all services stopped thus release the lock
                $haenv->release_ha_agent_lock();
            }
        };
        if (my $err = $@) {
            $haenv->log('err', "got unexpected error - $err");
        }

        $self->update_lrm_status();

        return 0 if $finished;

        $haenv->sleep_until($begin + $max_time);

        return 1;
    }

    if ($state eq 'lost_agent_lock') {

        # Note: the watchdog is active and will trigger soon!

        # so we hope to get the lock back soon!

        if ($self->{shutdown_request}) {

            my $service_count = $self->active_service_count();

            if ($service_count > 0) {
                $haenv->log('err', "get shutdown request in state 'lost_agent_lock' - " .
                            "detected $service_count running services");
            } else {
                # all services are stopped, so we can close the watchdog
                $close_watchdog->();

                return 0;
            }
        }

        $haenv->sleep(5);

        return 1;
    }

    die "got unexpected status '$state'\n";
}
359
# Start queued resource commands and wait for running ones, for at most
# about 5 seconds.
#
# Each queued entry in $self->{workers} without a pid is executed through
# the environment's resource agent: in a forked child if the environment
# can fork, otherwise synchronously. No new worker is started once
# $max_workers are active. Entries without a service configuration are
# logged and left in the queue.
#
# Returns the number of entries still present in $self->{workers}.
sub run_workers {
    my ($self) = @_;

    my $haenv = $self->{haenv};

    my $begin = $haenv->get_time();

    # start workers
    my $max_workers = 4;

    my $service_config = $haenv->read_service_config();

    while (($haenv->get_time() - $begin) < 5) {
        my $active = $self->check_active_workers();

        foreach my $sid (keys %{$self->{workers}}) {
            last if $active >= $max_workers;

            my $worker = $self->{workers}->{$sid};
            my $conf = $service_config->{$sid};
            if (!$conf) {
                $haenv->log('err', "missing resource configuration for '$sid'");
                next;
            }

            next if $worker->{pid}; # already running

            if (!$haenv->can_fork()) {
                # run the agent synchronously in this process
                my $status = -1;
                eval {
                    $status = $haenv->exec_resource_agent($sid, $conf, $worker->{state}, $worker->{target});
                    # shift positive exit codes so they look like a wait status
                    $status = $status << 8 if $status > 0;
                };
                if (my $err = $@) {
                    $haenv->log('err', $err);
                }
                if (defined($worker->{uid})) {
                    $self->resource_command_finished($sid, $worker->{uid}, $status);
                } else {
                    $self->stop_command_finished($sid, $status);
                }
                next;
            }

            my $pid = fork();

            if (!defined($pid)) {
                $haenv->log('err', "fork worker failed");
                $active = 0; last; # abort, try later
            }

            if ($pid == 0) {
                # child: do the work and exit with the agent result
                my $status = -1;
                eval {
                    $status = $haenv->exec_resource_agent($sid, $conf, $worker->{state}, $worker->{target});
                };
                if (my $err = $@) {
                    $haenv->log('err', $err);
                    POSIX::_exit(-1);
                }
                POSIX::_exit($status);
            }

            # parent: remember the child
            $active++;
            $worker->{pid} = $pid;
        }

        last if !$active;

        $haenv->sleep(1);
    }

    return scalar(keys %{$self->{workers}});
}
429
# Queue the requested command for every service that is located on this
# node, has a uid and a defined requested state other than 'freeze', then
# run the workers. Returns whatever run_workers() returns.
sub manage_resources {
    my ($self) = @_;

    my $haenv = $self->{haenv};

    my $nodename = $haenv->nodename();

    my $ss = $self->{service_status};

    for my $sid (keys %$ss) {
        my $sd = $ss->{$sid};

        next unless $sd->{node};
        next unless $sd->{uid};
        next unless $sd->{node} eq $nodename;

        my $wanted = $sd->{state};
        next if !defined($wanted) || $wanted eq 'freeze';

        $self->queue_resource_command($sid, $sd->{uid}, $wanted, $sd->{target});
    }

    return $self->run_workers();
}
452
# Queue a command for service $sid in $self->{workers}.
#
# If a worker for the service is already running (it has a pid), the call
# does nothing. A queued but not yet started entry is replaced by the new
# command. $target is only stored when it is a true value.
sub queue_resource_command {
    my ($self, $sid, $uid, $state, $target) = @_;

    my $pending = $self->{workers}->{$sid};

    # already started
    return if $pending && $pending->{pid};

    # no entry yet, or one that was not started: store the new command
    my %job = (
        sid => $sid,
        uid => $uid,
        state => $state,
    );
    $self->{workers}->{$sid} = \%job;

    $self->{workers}->{$sid}->{target} = $target if $target;
}
470
# Reap finished worker processes and count the ones still running.
#
# For every worker with a pid, a non-blocking waitpid is done. A worker that
# was reaped is handed, together with its wait status, to
# resource_command_finished() when it has a uid, or to
# stop_command_finished() otherwise. Returns the number of workers with a
# pid that were not reaped.
sub check_active_workers {
    my ($self) = @_;

    my $running = 0;

    foreach my $sid (keys %{$self->{workers}}) {
        my $worker = $self->{workers}->{$sid};

        my $pid = $worker->{pid};
        next if !$pid; # queued only, not started

        # check status
        my $reaped = waitpid($pid, WNOHANG);
        unless (defined($reaped) && ($reaped == $pid)) {
            $running++;
            next;
        }

        if (defined($worker->{uid})) {
            $self->resource_command_finished($sid, $worker->{uid}, $?);
        } else {
            $self->stop_command_finished($sid, $?);
        }
    }

    return $running;
}
495
116dea30
DM
# Handle the end of a stop job (a worker queued without uid).
#
# Removes the worker for $sid and decodes the wait status $status: -1 means
# the agent could not be executed, a non-zero low 7 bits means it got a
# signal, otherwise the exit code is $status >> 8. Every outcome other than
# exit code 0 increments $self->{shutdown_errors}.
sub stop_command_finished {
    my ($self, $sid, $status) = @_;

    my $haenv = $self->{haenv};

    my $worker = delete $self->{workers}->{$sid};
    return if !$worker; # should not happen

    my $exit_code;

    if ($status == -1) {
        $haenv->log('err', "resource agent $sid finished - failed to execute");
        $exit_code = -1;
    } elsif (my $sig = ($status & 127)) {
        $haenv->log('err', "resource agent $sid finished - got signal $sig");
        $exit_code = -1;
    } else {
        $exit_code = ($status >> 8);
    }

    $self->{shutdown_errors}++ if $exit_code != 0;
}
518
c4a221bc
DM
# Handle the end of a resource command that was queued with a uid.
#
# Removes the worker for $sid, decodes the wait status $status into an exit
# code (-1 if the agent could not be executed or got a signal), passes it
# through handle_service_exitcode() and stores the outcome under $uid in
# $self->{results}. Afterwards all results whose uid no longer appears in
# $self->{service_status} are dropped.
sub resource_command_finished {
    my ($self, $sid, $uid, $status) = @_;

    my $haenv = $self->{haenv};

    my $worker = delete $self->{workers}->{$sid};
    return if !$worker; # should not happen

    my $exit_code;

    if ($status == -1) {
        $haenv->log('err', "resource agent $sid finished - failed to execute");
        $exit_code = -1;
    } elsif (my $sig = ($status & 127)) {
        $haenv->log('err', "resource agent $sid finished - got signal $sig");
        $exit_code = -1;
    } else {
        $exit_code = ($status >> 8);
    }

    $exit_code = $self->handle_service_exitcode($sid, $worker->{state}, $exit_code);

    $self->{results}->{$uid} = {
        sid => $worker->{sid},
        state => $worker->{state},
        exit_code => $exit_code,
    };

    my $ss = $self->{service_status};

    # collect the uids that are still referenced by the service status
    my %is_valid_uid;
    foreach my $known_sid (keys %$ss) {
        my $sd = $ss->{$known_sid};
        $is_valid_uid{$sd->{uid}} = 1 if $sd->{uid};
    }

    # keep only the results that belong to one of those uids
    my %kept = map { $_ => $self->{results}->{$_} }
        grep { $is_valid_uid{$_} } keys %{$self->{results}};

    $self->{results} = \%kept;
}
562
ea4443cc
TL
# processes the exit code from a finished resource agent, so that the CRM knows
# if the LRM wants to retry an action based on the current recovery policies for
# the failed service, or the CRM itself must try to recover from the failure.
#
# Only the 'started' command is handled specially:
#  - SUCCESS resets the restart counter of the service
#  - ERROR increments the counter; ETRY_AGAIN is returned until the counter
#    reaches the service's 'max_restart' setting, then the counter is reset
#    and ERROR is returned
# Every other command or exit code is returned unchanged.
#
# A service without configuration entry, or without 'max_restart' setting,
# is treated as if max_restart were 0, so its first failed start yields ERROR.
sub handle_service_exitcode {
    my ($self, $sid, $cmd, $exit_code) = @_;

    my $haenv = $self->{haenv};
    my $tries = $self->{restart_tries};

    return $exit_code if $cmd ne 'started';

    # Note: the exit code constants are called with explicit parentheses,
    # so they are plain sub calls resolved at run time.
    if ($exit_code == SUCCESS()) {

        $tries->{$sid} = 0;

        return $exit_code;
    }

    return $exit_code if $exit_code != ERROR();

    # the service configuration is only needed to look up the retry limit
    my $sc = $haenv->read_service_config();
    my $cd = $sc->{$sid};

    # avoid comparing against an undefined limit
    my $max_restart = 0;
    if ($cd && defined($cd->{max_restart})) {
        $max_restart = $cd->{max_restart};
    }

    $tries->{$sid} = 0 if !defined($tries->{$sid});

    $tries->{$sid}++;
    if ($tries->{$sid} >= $max_restart) {
        $haenv->log('err', "unable to start service $sid on local node".
                   " after $tries->{$sid} retries");
        $tries->{$sid} = 0;
        return ERROR();
    }

    # tell CRM that we retry the start
    return ETRY_AGAIN();
}
603
5f095798 6041;