Commit
61ae38eb6fc5ab351fb61f2323776819e20538b7 which ensured that
services get freezed on a node reboot had a side effect where running
services did not get gracefully shutdown on node reboot.
This may lead to data loss as the services then get hard killed, or
they may even prevent a node reboot because a storage cannot get
unmounted as a service still access it.
This commits addresses this issue but does not changes behavior of
the freeze logic for now, but we should evaluate if a freeze makes
really sense here or at least make it configurable.
The changed regression test is a result of the fact that we did not
adapt the correct behavior for the is_node_shutdown command in the
problematic commit. The simulation envrionment returned true
everytime a node shutdown (reboot and poweroff) and the real world
environment just returned true if a poweroff happened but not on a
reboot.
Now the simulation acts the same way as the real environment.
Further I moved the simulation implemenentation to the base class so
that both simulator and regression test system behave the same.
Signed-off-by: Thomas Lamprecht <t.lamprecht@proxmox.com>
my ($self) = @_;
my $shutdown = 0;
+ my $reboot = 0;
my $code = sub {
my $line = shift;
# ensure we match the full unit name by matching /^JOB_ID UNIT /
- $shutdown = 1 if ($line =~ m/^\d+\s+(poweroff|halt)\.target\s+/);
+ # see: man systemd.special
+ $shutdown = 1 if ($line =~ m/^\d+\s+shutdown\.target\s+/);
+ $reboot = 1 if ($line =~ m/^\d+\s+reboot\.target\s+/);
};
my $cmd = ['/bin/systemctl', '--full', 'list-jobs'];
eval { PVE::Tools::run_command($cmd, outfunc => $code, noerr => 1); };
- return $shutdown;
+ return ($shutdown, $reboot);
}
sub queue_crm_commands {
my $nodename = $haenv->nodename();
- my $shutdown = $haenv->is_node_shutdown();
+ my ($shutdown, $reboot) = $haenv->is_node_shutdown();
if ($shutdown) {
- $haenv->log('info', "shutdown LRM, stop all services");
- $self->{mode} = 'shutdown';
-
- # queue stop jobs for all services
-
+ # *always* queue stop jobs for all services if the node shuts down,
+ # independent if it's a reboot or a poweroff, else we may corrupt
+ # services or hinder node shutdown
my $ss = $self->{service_status};
foreach my $sid (keys %$ss) {
# Note: use undef uid to mark shutdown/stop jobs
$self->queue_resource_command($sid, undef, 'request_stop');
}
+ }
+ if ($shutdown) {
+ if ($reboot) {
+ $haenv->log('info', "reboot LRM, stop and freeze all services");
+ $self->{mode} = 'restart';
+ } else {
+ $haenv->log('info', "shutdown LRM, stop all services");
+ $self->{mode} = 'shutdown';
+ }
} else {
$haenv->log('info', "restart LRM, freeze all services");
$self->{mode} = 'restart';
sub is_node_shutdown {
my ($self) = @_;
- return 0; # default to freezing services if not overwritten by subclass
+ my $node = $self->{nodename};
+ my $cstatus = $self->{hardware}->read_hardware_status_nolock();
+
+ die "undefined node status for node '$node'" if !defined($cstatus->{$node});
+
+ my ($shutdown, $reboot) = (0, 0);
+
+ if (my $target = $cstatus->{$node}->{shutdown}) {
+ if ($target eq 'shutdown') {
+ $shutdown = 1;
+ } elsif ($target eq 'reboot') {
+ $shutdown = 1;
+ $reboot = 1;
+ } else {
+ die "unknown shutdown target '$target'";
+ }
+ }
+
+ return ($shutdown, $reboot);
}
sub read_service_config {
$self->{cur_time} += 1; # easier for simulation
}
-sub is_node_shutdown {
- my ($self) = @_;
-
- my $node = $self->{nodename};
- my $cstatus = $self->{hardware}->read_hardware_status_nolock();
-
- die "undefined node status for node '$node'" if !defined($cstatus->{$node});
-
- return defined($cstatus->{$node}->{shutdown}) ? 1 : 0;
-}
-
# must be 0 as we do not want to fork in the regression tests
sub get_max_workers {
my ($self) = @_;
info 25 node3/lrm: starting service vm:103
info 25 node3/lrm: service status vm:103 started
info 120 cmdlist: execute reboot node3
-info 120 node3/lrm: shutdown LRM, stop all services
+info 120 node3/lrm: reboot LRM, stop and freeze all services
+info 120 node1/crm: service 'vm:103': state changed from 'started' to 'freeze'
info 125 node3/lrm: stopping service vm:103
info 125 node3/lrm: service status vm:103 stopped
info 126 node3/lrm: exit (loop end)
info 145 reboot: execute power node3 on
info 145 node3/crm: status change startup => wait_for_quorum
info 140 node3/lrm: status change startup => wait_for_agent_lock
-info 145 node3/lrm: got lock 'ha_agent_node3_lock'
-info 145 node3/lrm: status change wait_for_agent_lock => active
-info 145 node3/lrm: starting service vm:103
-info 145 node3/lrm: service status vm:103 started
+info 160 node1/crm: service 'vm:103': state changed from 'freeze' to 'started'
info 164 node3/crm: status change wait_for_quorum => slave
+info 165 node3/lrm: got lock 'ha_agent_node3_lock'
+info 165 node3/lrm: status change wait_for_agent_lock => active
+info 165 node3/lrm: starting service vm:103
+info 165 node3/lrm: service status vm:103 started
info 720 hardware: exit simulation - done