use PVE::HA::Tools ':exit_codes';
use PVE::HA::NodeStatus;
+## Variable Name & Abbreviations Convention
+#
+# The HA stack has some variables it uses frequently and thus abbreviates them, which may be
+# confusing for new readers. Here's a short list of the most commonly used ones.
+#
+# NOTE: variables should be assumed to be read-only unless stated otherwise; only use the dedicated
+# methods to re-compute/read/alter them.
+#
+# - $haenv -> HA environment, the main interface to the simulator/test/real world
+# - $sid -> Service ID, unique identifier for a service, `type:vmid` is common
+#
+# - $ms -> Master/Manager Status, contains runtime info from the current active manager
+# - $ns -> Node Status, hash holding online/offline status about all nodes
+#
+# - $ss -> Service Status, hash holding the current state (last LRM cmd result, failed starts
+# or migrates, maintenance fallback node, ...) for *all* services
+# - $sd -> Service Data, the service status of a *single* service, iow. $ss->{$sid}
+#
+# - $sc -> Service Configuration, hash for all services including target state, group, ...
+# - $cd -> Configuration Data, the service config of a *single* service, iow. $sc->{$sid}
+#
+# Try to avoid adding new two-letter (or similarly over-abbreviated) names, but also don't send
+# patches for changing the above, as that set is mostly sensible and should be easy to remember once
+# you have spent a bit of time in the HA code base.
+
sub new {
my ($this, $haenv) = @_;
}
sub select_service_node {
- my ($groups, $online_node_usage, $service_conf, $current_node, $try_next, $tried_nodes, $maintenance_fallback) = @_;
+ my ($groups, $online_node_usage, $sid, $service_conf, $current_node, $try_next, $tried_nodes, $maintenance_fallback) = @_;
my $group = get_service_group($groups, $online_node_usage, $service_conf);
} keys %{$pri_groups->{$top_pri}};
my $found;
- my $found_maintenace_fallback;
+ my $found_maintenance_fallback;
for (my $i = scalar(@nodes) - 1; $i >= 0; $i--) {
my $node = $nodes[$i];
if ($node eq $current_node) {
$found = $i;
}
if (defined($maintenance_fallback) && $node eq $maintenance_fallback) {
- $found_maintenace_fallback = $i;
+ $found_maintenance_fallback = $i;
}
}
- if (defined($found_maintenace_fallback)) {
- return $nodes[$found_maintenace_fallback];
+ if (defined($found_maintenance_fallback)) {
+ return $nodes[$found_maintenance_fallback];
}
if ($try_next) {
foreach my $sid (keys %{$self->{ss}}) {
my $sd = $self->{ss}->{$sid};
my $state = $sd->{state};
+ my $target = $sd->{target}; # optional
if (defined($online_node_usage->{$sd->{node}})) {
if (
$state eq 'started' || $state eq 'request_stop' || $state eq 'fence' ||
} elsif (($state eq 'migrate') || ($state eq 'relocate')) {
# count it for both, source and target as load is put on both
$online_node_usage->{$sd->{node}}++;
- $online_node_usage->{$sd->{target}}++;
+ $online_node_usage->{$target}++;
} elsif ($state eq 'stopped') {
# do nothing
} else {
die "should not be reached (sid = '$sid', state = '$state')";
}
+ } elsif (defined($target) && defined($online_node_usage->{$target})) {
+ if ($state eq 'migrate' || $state eq 'relocate') {
+ # to correctly track maintenance modi and also consider the target as used for the
+ # case a node dies, as we cannot really know if the to-be-aborted incoming migration
+ # has already cleaned up all used resources
+ $online_node_usage->{$target}++;
+ }
}
}
# handle fencing
my $fenced_nodes = {};
foreach my $sid (sort keys %$ss) {
- my $sd = $ss->{$sid};
- next if $sd->{state} ne 'fence';
-
- my $service_node = $sd->{node};
+ my ($service_state, $service_node) = $ss->{$sid}->@{'state', 'node'};
+ next if $service_state ne 'fence';
if (!defined($fenced_nodes->{$service_node})) {
- $fenced_nodes->{$service_node} = $ns->fence_node($sd->{node}) || 0;
+ $fenced_nodes->{$service_node} = $ns->fence_node($service_node) || 0;
}
next if !$fenced_nodes->{$service_node};
$repeat = 1; # for faster recovery execution
}
+ # Avoid that a node without services in 'fence' state (e.g., removed
+ # manually by admin) is stuck with the 'fence' node state.
+ for my $node (sort grep { !defined($fenced_nodes->{$_}) } keys $ns->{status}->%*) {
+ next if $ns->get_node_state($node) ne 'fence';
+
+ $haenv->log('notice', "node '$node' in fence state but no services to-fence! admin interference?!");
+ $repeat = 1 if $ns->fence_node($node);
+ }
+
last if !$repeat;
}
my $node = select_service_node(
$self->{groups},
$self->{online_node_usage},
+ $sid,
$cd,
$sd->{node},
$try_next,
my $recovery_node = select_service_node(
$self->{groups},
$self->{online_node_usage},
+ $sid,
$cd,
$sd->{node},
);