]> git.proxmox.com Git - pve-ha-manager.git/blame - src/PVE/HA/LRM.pm
manager: make recovery actual state in FSM
[pve-ha-manager.git] / src / PVE / HA / LRM.pm
CommitLineData
5f095798
DM
1package PVE::HA::LRM;
2
3# Local Resource Manager
4
5use strict;
6use warnings;
c4a221bc 7use POSIX qw(:sys_wait_h);
5f095798
DM
8
9use PVE::SafeSyslog;
10use PVE::Tools;
a89ff919 11use PVE::HA::Tools ':exit_codes';
2a045f55 12use PVE::HA::Resources;
5f095798
DM
13
# States the local resource manager may switch to. set_local_status() dies for
# any state name that has no true value in this table; the values themselves
# are short human readable descriptions.
my $valid_states = {
    'wait_for_agent_lock' => 'waiting for agent lock',
    'active'              => 'got agent_lock',
    'maintenance'         => 'going into maintenance',
    'lost_agent_lock'     => 'lost agent_lock',
};
22
# Create a new local resource manager (LRM) instance.
#
# $this  - class name, or an existing instance whose class gets reused
# $haenv - HA environment object; kept in $self->{haenv} and used by the other
#          methods for logging, time, locking and status I/O
#
# The instance starts with the internal 'startup' state and is immediately
# moved to 'wait_for_agent_lock' through set_local_status().
sub new {
    my ($this, $haenv) = @_;

    my $class = ref($this) || $this;

    my %fields = (
        haenv => $haenv,
        status => { state => 'startup' },
        # queued/running resource commands, keyed by service ID
        workers => {},
        # results of finished resource commands, keyed by command uid
        results => {},
        # start retry counters, keyed by service ID
        restart_tries => {},
        shutdown_request => 0,
        shutdown_errors => 0,
        # values assigned in this file: active, restart, shutdown, maintenance
        mode => 'active',
        cluster_state_update => 0,
    );

    my $self = bless \%fields, $class;

    $self->set_local_status({ state => 'wait_for_agent_lock' });

    return $self;
}
45
# Put the LRM into shutdown handling.
#
# Does nothing if a shutdown request was already recorded. Otherwise it asks
# the environment whether the node itself shuts down or reboots, reads the
# 'shutdown_policy' HA setting (default 'conditional') and derives the LRM
# mode from both:
#   - node shutdown + policy 'migrate'  -> mode 'maintenance'
#   - node shutdown + freeze requested  -> mode 'restart'
#   - node shutdown otherwise           -> mode 'shutdown'
#   - no node shutdown (LRM restart)    -> mode 'restart'
# On a node shutdown without maintenance, a 'request_stop' command is queued
# for every service located on this node.
#
# Finally the request time is stored in $self->{shutdown_request} and the LRM
# status is written out; a failure to do so is only logged.
sub shutdown_request {
    my ($self) = @_;

    return if $self->{shutdown_request}; # already in shutdown mode

    my $haenv = $self->{haenv};

    my $nodename = $haenv->nodename();

    my ($shutdown, $reboot) = $haenv->is_node_shutdown();

    my $dc_ha_cfg = $haenv->get_ha_settings();
    my $shutdown_policy = $dc_ha_cfg->{shutdown_policy} // 'conditional';

    if ($shutdown) { # don't log this on service restart, only on node shutdown
        $haenv->log('info', "got shutdown request with shutdown policy '$shutdown_policy'");
    }

    my $freeze_all;
    my $maintenance;
    if ($shutdown_policy eq 'conditional') {
        $freeze_all = $reboot;
    } elsif ($shutdown_policy eq 'freeze') {
        $freeze_all = 1;
    } elsif ($shutdown_policy eq 'failover') {
        $freeze_all = 0;
    } elsif ($shutdown_policy eq 'migrate') {
        $maintenance = 1;
    } else {
        $haenv->log('err', "unknown shutdown policy '$shutdown_policy', fall back to conditional");
        $freeze_all = $reboot;
    }

    if ($maintenance) {
        # we get marked as unavailable by the manager, then all services will
        # be migrated away, we'll still have the same "can we exit" clause as
        # a normal shutdown -> no running service on this node
        # FIXME: after X minutes, add shutdown command for remaining services,
        # e.g., if they have no alternative node???
    } elsif ($shutdown) {
        # *always* queue stop jobs for all services if the node shuts down,
        # independent if it's a reboot or a poweroff, else we may corrupt
        # services or hinder node shutdown
        my $ss = $self->{service_status};

        foreach my $sid (keys %$ss) {
            my $sd = $ss->{$sid};
            next if !$sd->{node};
            next if $sd->{node} ne $nodename;
            # Note: use undef uid to mark shutdown/stop jobs
            $self->queue_resource_command($sid, undef, 'request_stop');
        }
    }

    if ($shutdown) {
        my $shutdown_type = $reboot ? 'reboot' : 'shutdown';
        if ($maintenance) {
            $haenv->log('info', "$shutdown_type LRM, doing maintenance, removing this node from active list");
            $self->{mode} = 'maintenance';
        } elsif ($freeze_all) {
            $haenv->log('info', "$shutdown_type LRM, stop and freeze all services");
            $self->{mode} = 'restart';
        } else {
            $haenv->log('info', "shutdown LRM, stop all services");
            $self->{mode} = 'shutdown';
        }
    } else {
        $haenv->log('info', "restart LRM, freeze all services");
        $self->{mode} = 'restart';
    }

    $self->{shutdown_request} = $haenv->get_time();

    eval { $self->update_lrm_status() or die "not quorate?\n"; };
    if (my $err = $@) {
        # log through the environment like everywhere else in this file;
        # no log() sub is defined in this file
        $haenv->log('err', "unable to update lrm status file - $err");
    }
}
124
# Return the current local status hash (the reference stored in
# $self->{status}, not a copy).
sub get_local_status {
    my $self = shift;

    return $self->{status};
}
130
# Switch the local status to $new (a hash with at least a 'state' key).
#
# Dies if the requested state is not listed in $valid_states. If the state
# does not differ from the current one nothing happens, so the stored
# 'state_change_time' stays untouched. On a real change the transition is
# logged, $new gets the current time as 'state_change_time' and replaces
# $self->{status}.
sub set_local_status {
    my ($self, $new) = @_;

    my $wanted = $new->{state};
    die "invalid state '$wanted'" if !$valid_states->{$wanted};

    my $haenv = $self->{haenv};
    my $old = $self->{status};
    my $current = $old->{state};

    # important: only update if it really changed
    return if $current eq $wanted;

    $haenv->log('info', "status change $current => $wanted");

    $new->{state_change_time} = $haenv->get_time();

    $self->{status} = $new;
}
149
9c7d068b
DM
# Write the current LRM status (state, mode, command results and a timestamp)
# through the environment.
#
# Returns 0 without writing anything if the environment reports that we are
# not quorate, 0 if writing failed (the error is logged), and 1 on success.
sub update_lrm_status {
    my ($self) = @_;

    my $haenv = $self->{haenv};

    if (!$haenv->quorate()) {
        return 0;
    }

    my $lrm_status = {};
    $lrm_status->{state} = $self->{status}->{state};
    $lrm_status->{mode} = $self->{mode};
    $lrm_status->{results} = $self->{results};
    $lrm_status->{timestamp} = $haenv->get_time();

    eval { $haenv->write_lrm_status($lrm_status); };
    my $err = $@;

    return 1 if !$err;

    $haenv->log('err', "unable to write lrm status file - $err");

    return 0;
}
172
8e940b68
TL
# Refresh $self->{service_status} from the manager status read through the
# environment.
#
# Returns 1 on success; a missing 'service_status' entry results in an empty
# hash. If reading dies, the error is logged, the previously stored service
# status is left as it is and undef is returned.
sub update_service_status {
    my ($self) = @_;

    my $haenv = $self->{haenv};

    my $ms = eval { $haenv->read_manager_status(); };
    my $err = $@;

    if ($err) {
        $haenv->log('err', "updating service status from manager failed: $err");
        return undef;
    }

    $self->{service_status} = $ms->{service_status} || {};

    return 1;
}
187
5f095798
DM
# Try to get the HA agent lock and make sure it is covered by a watchdog.
#
# As soon as the environment reports the lock as held, the already opened
# watchdog handle ($self->{ha_agent_wd}) is updated, or a new one is opened and
# stored, and 1 is returned.
#
# While the lock cannot be obtained the attempt is repeated with one second
# pauses; it gives up and returns 0 once the attempt counter exceeds 5 or more
# than 5 seconds (environment time) have passed since the first attempt.
sub get_protected_ha_agent_lock {
    my ($self) = @_;

    my $haenv = $self->{haenv};

    my $attempts = 0;
    my $began = $haenv->get_time();

    while (1) {
        if ($haenv->get_ha_agent_lock()) {
            my $wd = $self->{ha_agent_wd};
            if ($wd) {
                $haenv->watchdog_update($wd);
            } else {
                $self->{ha_agent_wd} = $haenv->watchdog_open();
            }
            return 1;
        }

        $attempts++;
        return 0 if $attempts > 5;

        my $elapsed = $haenv->get_time() - $began;
        return 0 if $elapsed > 5;

        $haenv->sleep(1);
    }
}
218
546e2f1f
DM
# Count the services from $self->{service_status} which are located on this
# node and have a requested state other than 'stopped', 'freeze' or 'error'.
# Entries without a node or without a state are ignored.
sub active_service_count {
    my ($self) = @_;

    my $nodename = $self->{haenv}->nodename();
    my $ss = $self->{service_status};

    # requested states which do not count as active; erroneous services are
    # not managed by HA, so they are not counted either
    my %inactive = map { $_ => 1 } qw(stopped freeze error);

    my $active = 0;

    foreach my $sid (keys %$ss) {
        my $sd = $ss->{$sid};

        next if !$sd->{node} || $sd->{node} ne $nodename;

        my $req_state = $sd->{state};
        next if !defined($req_state) || $inactive{$req_state};

        $active++;
    }

    return $active;
}
5bd7aa54
DM
246
247my $wrote_lrm_status_at_startup = 0;
248
5f095798
DM
# Run a single LRM round: call the environment's loop start hook, store the
# result of its cluster state update in $self->{cluster_state_update}, do the
# actual work() and call the loop end hook.
#
# Returns whatever work() returned.
sub do_one_iteration {
    my $self = shift;

    my $env = $self->{haenv};

    $env->loop_start_hook();

    my $updated = $env->cluster_state_update();
    $self->{cluster_state_update} = $updated;

    my $work_result = $self->work();

    $env->loop_end_hook();

    return $work_result;
}
264
# Do one round of LRM work.
#
# The round consists of two phases:
#   1. state changes: depending on fence requests, quorum and whether the
#      watchdog protected agent lock can be obtained, the local state moves
#      between 'wait_for_agent_lock', 'active', 'maintenance' and
#      'lost_agent_lock'.
#   2. work for the resulting state: manage resources, handle a pending
#      shutdown request, write the LRM status and sleep.
#
# Returns 1 if the caller should continue with another round and 0 if the LRM
# is done (shutdown request fulfilled or given up).
# Dies on a state not handled here.
sub work {
    my ($self) = @_;

    my $haenv = $self->{haenv};

    if (!$wrote_lrm_status_at_startup) {
        if ($self->update_lrm_status()) {
            $wrote_lrm_status_at_startup = 1;
        } else {
            # do nothing
            $haenv->sleep(5);
            return $self->{shutdown_request} ? 0 : 1;
        }
    }

    my $status = $self->get_local_status();
    my $state = $status->{state};

    # the result is intentionally not checked here; on failure the previously
    # stored service status stays in place
    $self->update_service_status();

    my $fence_request = PVE::HA::Tools::count_fenced_services($self->{service_status}, $haenv->nodename());

    # do state changes first

    if ($state eq 'wait_for_agent_lock') {

        my $service_count = $self->active_service_count();

        # only go for the lock if there is something to do on this node
        if (!$fence_request && $service_count && $haenv->quorate()) {
            if ($self->get_protected_ha_agent_lock()) {
                $self->set_local_status({ state => 'active' });
            }
        }

    } elsif ($state eq 'lost_agent_lock') {

        if (!$fence_request && $haenv->quorate()) {
            if ($self->get_protected_ha_agent_lock()) {
                $self->set_local_status({ state => 'active' });
            }
        }

    } elsif ($state eq 'active') {

        if ($fence_request) {
            $haenv->log('err', "node need to be fenced - releasing agent_lock\n");
            $self->set_local_status({ state => 'lost_agent_lock'});
        } elsif (!$self->get_protected_ha_agent_lock()) {
            $self->set_local_status({ state => 'lost_agent_lock'});
        } elsif ($self->{mode} eq 'maintenance') {
            $self->set_local_status({ state => 'maintenance'});
        }

    } elsif ($state eq 'maintenance') {

        if ($fence_request) {
            $haenv->log('err', "node need to be fenced during maintenance mode - releasing agent_lock\n");
            $self->set_local_status({ state => 'lost_agent_lock'});
        } elsif (!$self->get_protected_ha_agent_lock()) {
            $self->set_local_status({ state => 'lost_agent_lock'});
        }
    }

    # the state may have changed above
    $status = $self->get_local_status();
    $state = $status->{state};

    # do work

    if ($state eq 'wait_for_agent_lock') {

        return 0 if $self->{shutdown_request};

        $self->update_lrm_status();

        $haenv->sleep(5);

    } elsif ($state eq 'active') {

        my $start_time = $haenv->get_time();

        my $max_time = 10;

        my $shutdown = 0;

        # do work (max_time seconds)
        eval {
            # fixme: set alert timer

            # if we could not get the current service status there's no point
            # in doing anything, try again next round.
            # Note: this only leaves the eval block, not work() itself.
            return if !$self->update_service_status();

            if ($self->{shutdown_request}) {

                if ($self->{mode} eq 'restart') {

                    my $service_count = $self->active_service_count();

                    if ($service_count == 0) {

                        if ($self->run_workers() == 0) {
                            if ($self->{ha_agent_wd}) {
                                $haenv->watchdog_close($self->{ha_agent_wd});
                                delete $self->{ha_agent_wd};
                            }

                            $shutdown = 1;

                            # restart with no or frozen services, release the lock
                            $haenv->release_ha_agent_lock();
                        }
                    }
                } else {

                    if ($self->run_workers() == 0) {
                        # the watchdog is only closed and the lock only
                        # released if all stop commands succeeded
                        if ($self->{shutdown_errors} == 0) {
                            if ($self->{ha_agent_wd}) {
                                $haenv->watchdog_close($self->{ha_agent_wd});
                                delete $self->{ha_agent_wd};
                            }

                            # shutdown with all services stopped thus release the lock
                            $haenv->release_ha_agent_lock();
                        }

                        $shutdown = 1;
                    }
                }
            } else {
                if (!$self->{cluster_state_update}) {
                    # update failed but we could still renew our lock (cfs restart?),
                    # safely skip manage and expect to update just fine next round
                    $haenv->log('notice', "temporary inconsistent cluster state " .
                        "(cfs restart?), skip round");
                    return;
                }

                $self->manage_resources();

            }
        };
        if (my $err = $@) {
            $haenv->log('err', "got unexpected error - $err");
        }

        $self->update_lrm_status();

        return 0 if $shutdown;

        $haenv->sleep_until($start_time + $max_time);

    } elsif ($state eq 'lost_agent_lock') {

        # Note: watchdog is active and will trigger soon!

        # so we hope to get the lock back soon!

        if ($self->{shutdown_request}) {

            my $service_count = $self->active_service_count();

            if ($service_count > 0) {
                $haenv->log('err', "get shutdown request in state 'lost_agent_lock' - " .
                    "detected $service_count running services");

                if ($self->{mode} eq 'restart') {
                    my $state_mt = $self->{status}->{state_change_time};

                    # watchdog should have already triggered, so either it's
                    # set to noboot or it failed. As we are in restart mode, and
                    # have infinity stoptimeout -> exit now - we don't touch services
                    # or change state, so this is safe, relatively speaking
                    if (($haenv->get_time() - $state_mt) > 90) {
                        $haenv->log('err', "lost agent lock and restart request for over 90 seconds - giving up!");
                        return 0;
                    }
                }
            } else {

                # all services are stopped, so we can close the watchdog

                if ($self->{ha_agent_wd}) {
                    $haenv->watchdog_close($self->{ha_agent_wd});
                    delete $self->{ha_agent_wd};
                }

                return 0;
            }
        }

        $haenv->sleep(5);

    } elsif ($state eq 'maintenance') {

        my $start_time = $haenv->get_time();

        if (!$self->update_service_status()) {
            # Without a current service status we cannot tell if services are
            # still active here, so try again next round - like the 'active'
            # state does. A bare 'return' would hand a false value to the
            # caller, exactly like the 'return 0' used for leaving the LRM.
            $self->update_lrm_status();
            $haenv->sleep_until($start_time + 5);
            return 1;
        }

        # wait until all active services moved away
        my $service_count = $self->active_service_count();

        my $exit_lrm = 0;

        if ($self->{shutdown_request}) {
            if ($service_count == 0 && $self->run_workers() == 0) {
                if ($self->{ha_agent_wd}) {
                    $haenv->watchdog_close($self->{ha_agent_wd});
                    delete $self->{ha_agent_wd};
                }

                $exit_lrm = 1;

                # restart with no or frozen services, release the lock
                $haenv->release_ha_agent_lock();
            }
        }

        $self->manage_resources() if !$exit_lrm;

        $self->update_lrm_status();

        return 0 if $exit_lrm;

        $haenv->sleep_until($start_time + 5);

    } else {

        die "got unexpected status '$state'\n";

    }

    return 1;
}
498
# Start queued resource commands and collect finished ones, for roughly five
# seconds of environment time at most.
#
# With a worker limit greater than zero each queued command is executed in a
# forked child process, never running more than that many children at once.
# With a limit of zero the command is executed directly in this process (e.g.
# for regression tests) and its result is processed right away.
#
# Returns the number of entries still present in $self->{workers}.
sub run_workers {
    my ($self) = @_;

    my $haenv = $self->{haenv};

    my $began = $haenv->get_time();

    # number of workers to start, if 0 we exec the command directly without forking
    my $max_workers = $haenv->get_max_workers();
    my $use_fork = $max_workers > 0;

    my $sc = $haenv->read_service_config();

    while (($haenv->get_time() - $began) < 5) {
        # reaps finished children and tells us how many are still running
        my $running = $self->check_active_workers();

        foreach my $sid (sort keys %{$self->{workers}}) {
            last if $use_fork && $running >= $max_workers;

            my $w = $self->{workers}->{$sid};
            next if $w->{pid}; # already started

            if (!$use_fork) {
                # direct execution, an exception leaves the result at -1
                my $res = -1;
                eval {
                    $res = $self->exec_resource_agent($sid, $sc->{$sid}, $w->{state}, $w->{params});
                    # mimic the wait status layout expected by the
                    # *_command_finished methods (exit code in the high byte)
                    $res = $res << 8 if $res > 0;
                };
                if (my $err = $@) {
                    $haenv->log('err', $err);
                }
                if (defined($w->{uid})) {
                    $self->resource_command_finished($sid, $w->{uid}, $res);
                } else {
                    $self->stop_command_finished($sid, $res);
                }
                next;
            }

            my $pid = fork();
            if (!defined($pid)) {
                $haenv->log('err', "fork worker failed");
                $running = 0;
                last; # abort, try later
            }

            if ($pid != 0) {
                # parent: remember the child, it gets reaped by
                # check_active_workers()
                $running++;
                $w->{pid} = $pid;
                next;
            }

            # child process from here on
            $haenv->after_fork(); # cleanup

            # do work
            my $res = -1;
            eval {
                $res = $self->exec_resource_agent($sid, $sc->{$sid}, $w->{state}, $w->{params});
            };
            if (my $err = $@) {
                $haenv->log('err', $err);
                POSIX::_exit(-1);
            }
            POSIX::_exit($res);
        }

        last if !$running;

        $haenv->sleep(1);
    }

    return scalar(keys %{$self->{workers}});
}
568
# Queue the requested command for every service of this node and run the
# workers.
#
# Retry counters of services which are no longer part of the service status
# are dropped first. A command is queued for each service status entry that
# is located on this node, has a uid and a requested state other than
# 'freeze'; 'target' and 'timeout' of the entry are passed along as
# parameters.
#
# Returns the result of run_workers().
sub manage_resources {
    my ($self) = @_;

    my $nodename = $self->{haenv}->nodename();

    my $ss = $self->{service_status};

    # forget retry counters of services that vanished
    my @stale = grep { !$ss->{$_} } keys %{$self->{restart_tries}};
    delete @{$self->{restart_tries}}{@stale};

    foreach my $sid (keys %$ss) {
        my $sd = $ss->{$sid};

        next if !$sd->{node} || !$sd->{uid};
        next if $sd->{node} ne $nodename;

        my $req_state = $sd->{state};
        next if !defined($req_state) || $req_state eq 'freeze';

        my $params = {
            'target' => $sd->{target},
            'timeout' => $sd->{timeout},
        };
        $self->queue_resource_command($sid, $sd->{uid}, $req_state, $params);
    }

    return $self->run_workers();
}
595
# Queue a resource command for a service.
#
# $sid    - service ID, at most one command is queued per service
# $uid    - unique ID of the command; undef marks shutdown/stop jobs
# $state  - requested state, i.e. the command to execute
# $params - optional hash of command parameters
#
# A command whose uid already has a stored result is not queued again, as
# executing exactly the same command twice may lead to an inconsistent HA
# state: the first try may have failed without the CRM having processed that
# failure yet, and a second try would happen without the CRM knowing of it
# (race condition). The 'stopped' command is an exception, as its result is
# not processed by the CRM and it should always be executed (even with no
# active CRM).
#
# A command which was already started (has a pid) is left alone, a merely
# queued one gets replaced by the new command.
sub queue_resource_command {
    my ($self, $sid, $uid, $state, $params) = @_;

    return if $state ne 'stopped' && $uid && defined($self->{results}->{$uid});

    my $queue = ($self->{workers} //= {});

    my $pending = $queue->{$sid};
    return if $pending && $pending->{pid}; # already started

    # overwrites a queued but not yet started command, if any
    my $entry = {
        sid => $sid,
        uid => $uid,
        state => $state,
    };
    $queue->{$sid} = $entry;

    $entry->{params} = $params if $params;
}
621
# Reap finished worker processes and count the ones still running.
#
# For each worker with a pid a non-blocking waitpid() is done. If it reports
# that pid, the wait status is handed to resource_command_finished(), or to
# stop_command_finished() for workers without a uid. All other workers with a
# pid are counted as still active.
#
# Returns the number of still active workers; queued workers without a pid are
# not counted.
sub check_active_workers {
    my ($self) = @_;

    my $active = 0;

    foreach my $sid (keys %{$self->{workers}}) {
        my $w = $self->{workers}->{$sid};

        my $pid = $w->{pid};
        next if !$pid; # not started yet

        my $reaped = waitpid($pid, WNOHANG);
        my $wait_status = $?;

        if (!defined($reaped) || $reaped != $pid) {
            $active++;
            next;
        }

        if (defined($w->{uid})) {
            $self->resource_command_finished($sid, $w->{uid}, $wait_status);
        } else {
            $self->stop_command_finished($sid, $wait_status);
        }
    }

    return $active;
}
646
116dea30
DM
# Process the result of a finished shutdown/stop job (a worker without uid).
#
# $sid    - service ID of the worker, its queue entry gets removed
# $status - wait status: -1 means the command could not be executed, a
#           non-zero low 7 bit value is the number of the signal which ended
#           it, else the exit code is taken from the bits above the low byte
#
# Every outcome other than exit code 0 increments $self->{shutdown_errors}.
sub stop_command_finished {
    my ($self, $sid, $status) = @_;

    my $haenv = $self->{haenv};

    my $w = delete $self->{workers}->{$sid};
    return if !$w; # should not happen

    # stays at -1 (failure) unless a regular exit code could be extracted
    my $exit_code = -1;

    if ($status == -1) {
        $haenv->log('err', "resource agent $sid finished - failed to execute");
    } else {
        my $sig = $status & 127;
        if ($sig) {
            $haenv->log('err', "resource agent $sid finished - got signal $sig");
        } else {
            $exit_code = $status >> 8;
        }
    }

    $self->{shutdown_errors}++ if $exit_code != 0;
}
669
c4a221bc
DM
# Process the result of a finished resource command (a worker with uid).
#
# $sid    - service ID of the worker, its queue entry gets removed
# $uid    - unique ID of the command, used as key for the stored result
# $status - wait status: -1 means the command could not be executed, a
#           non-zero low 7 bit value is the number of the signal which ended
#           it, else the exit code is taken from the bits above the low byte
#
# The exit code is passed through handle_service_exitcode(); if that asks for
# another try (ETRY_AGAIN) nothing gets recorded. Otherwise the result is
# stored in $self->{results}, and all results whose uid is not referenced by
# the current service status anymore are dropped.
sub resource_command_finished {
    my ($self, $sid, $uid, $status) = @_;

    my $haenv = $self->{haenv};

    my $w = delete $self->{workers}->{$sid};
    return if !$w; # should not happen

    # stays at -1 (failure) unless a regular exit code could be extracted
    my $exit_code = -1;

    if ($status == -1) {
        $haenv->log('err', "resource agent $sid finished - failed to execute");
    } else {
        my $sig = $status & 127;
        if ($sig) {
            $haenv->log('err', "resource agent $sid finished - got signal $sig");
        } else {
            $exit_code = $status >> 8;
        }
    }

    $exit_code = $self->handle_service_exitcode($sid, $w->{state}, $exit_code);

    return if $exit_code == ETRY_AGAIN; # tell nobody, simply retry

    $self->{results}->{$uid} = {
        sid => $w->{sid},
        state => $w->{state},
        exit_code => $exit_code,
    };

    my $ss = $self->{service_status};

    # collect the uids still referenced by the service status
    my %uid_in_use;
    foreach my $known_sid (keys %$ss) {
        my $sd = $ss->{$known_sid};
        $uid_in_use{$sd->{uid}} = 1 if $sd->{uid};
    }

    # keep only results belonging to one of those uids
    my %kept = map { $_ => $self->{results}->{$_} }
        grep { $uid_in_use{$_} } keys %{$self->{results}};

    $self->{results} = \%kept;
}
715
ea4443cc
TL
# Processes the exit code from a finished resource agent, so that the CRM knows
# if the LRM wants to retry an action based on the current recovery policies for
# the failed service, or the CRM itself must try to recover from the failure.
#
# $sid       - service ID
# $cmd       - the executed command; only 'started' gets special treatment
# $exit_code - exit code of the resource agent
#
# For a failed start (ERROR) the per service retry counter is compared with
# the 'max_restart' setting from the service configuration (0 if the service
# or the setting is missing): as long as the limit is not reached the counter
# is incremented and ETRY_AGAIN is returned, else the counter is reset and
# ERROR is returned. A successful start resets the counter.
#
# In all other cases the exit code is returned unchanged.
sub handle_service_exitcode {
    my ($self, $sid, $cmd, $exit_code) = @_;

    my $haenv = $self->{haenv};
    my $tries = $self->{restart_tries};

    my $sc = $haenv->read_service_config();

    my $max_restart = 0;

    if (my $cd = $sc->{$sid}) {
        # an entry without max_restart means no retries, and must not trigger
        # an "uninitialized value" warning in the comparison below
        $max_restart = $cd->{max_restart} // 0;
    }

    if ($cmd eq 'started') {

        if ($exit_code == SUCCESS) {

            $tries->{$sid} = 0;

            return $exit_code;

        } elsif ($exit_code == ERROR) {

            $tries->{$sid} = 0 if !defined($tries->{$sid});

            if ($tries->{$sid} >= $max_restart) {
                $haenv->log('err', "unable to start service $sid on local node".
                    " after $tries->{$sid} retries");
                $tries->{$sid} = 0;
                return ERROR;
            }

            $tries->{$sid}++;

            $haenv->log('warning', "restart policy: retry number $tries->{$sid}" .
                " for service '$sid'");
            # tell CRM that we retry the start
            return ETRY_AGAIN;
        }
    }

    return $exit_code;

}
764
# Execute a single command for a service through its resource plugin.
#
# $sid            - service ID, parsed by the environment into type and name
# $service_config - configuration entry of the service (must contain 'node')
# $cmd            - 'started', 'request_stop', 'stopped', 'migrate',
#                   'relocate' or 'error'
# $params         - optional hash; 'timeout' is used when stopping, 'target'
#                   is required for migrate/relocate
#
# Returns one of the exit code constants imported from PVE::HA::Tools.
# Dies if a migrate/relocate command has no target.
#
# Note: this overwrites $ENV{PATH} of the calling process.
sub exec_resource_agent {
    my ($self, $sid, $service_config, $cmd, $params) = @_;

    # setup execution environment

    $ENV{'PATH'} = '/sbin:/bin:/usr/sbin:/usr/bin';

    my $haenv = $self->{haenv};

    my $nodename = $haenv->nodename();

    my (undef, $service_type, $service_name) = $haenv->parse_sid($sid);

    my $plugin = PVE::HA::Resources->lookup($service_type);
    if (!$plugin) {
        $haenv->log('err', "service type '$service_type' not implemented");
        return EUNKNOWN_SERVICE_TYPE;
    }

    if (!$service_config) {
        $haenv->log('err', "missing resource configuration for '$sid'");
        return EUNKNOWN_SERVICE;
    }

    # process error state early
    if ($cmd eq 'error') {

        $haenv->log('err', "service $sid is in an error state and needs manual " .
            "intervention. Look up 'ERROR RECOVERY' in the documentation.");

        return SUCCESS; # error always succeeds
    }

    if ($service_config->{node} ne $nodename) {
        $haenv->log('err', "service '$sid' not on this node");
        return EWRONG_NODE;
    }

    my $id = $service_name;

    my $running = $plugin->check_running($haenv, $id);

    if ($cmd eq 'started') {

        return SUCCESS if $running;

        $haenv->log("info", "starting service $sid");

        $plugin->start($haenv, $id);

        # the result of start() is not used, only the re-checked run state
        $running = $plugin->check_running($haenv, $id);

        if ($running) {
            $haenv->log("info", "service status $sid started");
            return SUCCESS;
        } else {
            $haenv->log("warning", "unable to start service $sid");
            return ERROR;
        }

    } elsif ($cmd eq 'request_stop' || $cmd eq 'stopped') {

        return SUCCESS if !$running;

        if (defined($params->{timeout})) {
            $haenv->log("info", "stopping service $sid (timeout=$params->{timeout})");
        } else {
            $haenv->log("info", "stopping service $sid");
        }

        $plugin->shutdown($haenv, $id, $params->{timeout});

        $running = $plugin->check_running($haenv, $id);

        if (!$running) {
            $haenv->log("info", "service status $sid stopped");
            return SUCCESS;
        } else {
            $haenv->log("info", "unable to stop stop service $sid (still running)");
            return ERROR;
        }

    } elsif ($cmd eq 'migrate' || $cmd eq 'relocate') {

        my $target = $params->{target};
        # a missing target always raised an exception here; the former
        # 'return EINVALID_PARAMETER' after the die could never be reached
        die "$cmd '$sid' failed - missing target\n" if !defined($target);

        if ($service_config->{node} eq $target) {
            # already there
            return SUCCESS;
        }

        my $online = ($cmd eq 'migrate') ? 1 : 0;

        my $res = $plugin->migrate($haenv, $id, $target, $online);

        # something went wrong if service is still on this node
        if (!$res) {
            $haenv->log("err", "service $sid not moved (migration error)");
            return ERROR;
        }

        return SUCCESS;

    }

    $haenv->log("err", "implement me (cmd '$cmd')");
    return EUNKNOWN_COMMAND;
}
877
878
5f095798 8791;