use PVE::Tools;
use PVE::HA::Tools ':exit_codes';
use PVE::HA::NodeStatus;
+use PVE::HA::Usage::Basic;
+
+## Variable Name & Abbreviations Convention
+#
+# The HA stack has some variables it uses frequently and thus abbreviates them, which may be
+# confusing for new readers. Here's a short list of the most commonly used ones.
+#
+# NOTE: variables should be assumed to be read-only if not otherwise stated; only use the
+# specific methods to re-compute/read/alter them.
+#
+# - $haenv -> HA environment, the main interface to the simulator/test/real world
+# - $sid -> Service ID, unique identifier for a service, `type:vmid` is common
+#
+# - $ms -> Master/Manager Status, contains runtime info from the current active manager
+# - $ns -> Node Status, hash holding online/offline status about all nodes
+#
+# - $ss -> Service Status, hash holding the current state (last LRM cmd result, failed starts
+#   or migrates, maintenance fallback node, ...) for *all* services
+# - $sd -> Service Data, the service status of a *single* service, iow. $ss->{$sid}
+#
+# - $sc -> Service Configuration, hash for all services including target state, group, ...
+# - $cd -> Configuration Data, the service config of a *single* service, iow. $sc->{$sid}
+#
+# Try to avoid adding new two letter (or similar over abbreviated) names, but also don't send
+# patches for changing the above, as that set is mostly sensible and should be easy to remember
+# once you've spent a bit of time in the HA code base.
sub new {
my ($this, $haenv) = @_;
$self->{ms} = { master_node => $haenv->nodename() };
+ my $dc_cfg = $haenv->get_datacenter_settings();
+ $self->{'scheduler-mode'} = $dc_cfg->{crs}->{ha} ? $dc_cfg->{crs}->{ha} : 'basic';
+ $haenv->log('info', "using scheduler mode '$self->{'scheduler-mode'}'")
+ if $self->{'scheduler-mode'} ne 'basic';
+
return $self;
}
my $group = {};
# add all online nodes to default group to allow try_next when no group set
- foreach my $node (keys %$online_node_usage) {
- $group->{nodes}->{$node} = 1;
- }
+ $group->{nodes}->{$_} = 1 for $online_node_usage->list_nodes();
# overwrite default if service is bound to a specific group
if (my $group_id = $service_conf->{group}) {
if ($entry =~ m/^(\S+):(\d+)$/) {
($node, $pri) = ($1, $2);
}
- next if !defined($online_node_usage->{$node}); # offline
+ next if !$online_node_usage->contains_node($node); # offline
$pri_groups->{$pri}->{$node} = 1;
$group_members->{$node} = $pri;
}
# add non-group members to unrestricted groups (priority -1)
if (!$group->{restricted}) {
my $pri = -1;
- foreach my $node (keys %$online_node_usage) {
+ for my $node ($online_node_usage->list_nodes()) {
next if defined($group_members->{$node});
$pri_groups->{$pri}->{$node} = 1;
$group_members->{$node} = -1;
}
sub select_service_node {
- my ($groups, $online_node_usage, $service_conf, $current_node, $try_next, $tried_nodes, $maintenance_fallback) = @_;
+ my ($groups, $online_node_usage, $sid, $service_conf, $current_node, $try_next, $tried_nodes, $maintenance_fallback) = @_;
my $group = get_service_group($groups, $online_node_usage, $service_conf);
}
}
+ my $scores = $online_node_usage->score_nodes_to_start_service($sid, $current_node);
my @nodes = sort {
- $online_node_usage->{$a} <=> $online_node_usage->{$b} || $a cmp $b
+ $scores->{$a} <=> $scores->{$b} || $a cmp $b
} keys %{$pri_groups->{$top_pri}};
my $found;
- my $found_maintenace_fallback;
+ my $found_maintenance_fallback;
for (my $i = scalar(@nodes) - 1; $i >= 0; $i--) {
my $node = $nodes[$i];
if ($node eq $current_node) {
$found = $i;
}
if (defined($maintenance_fallback) && $node eq $maintenance_fallback) {
- $found_maintenace_fallback = $i;
+ $found_maintenance_fallback = $i;
}
}
- if (defined($found_maintenace_fallback)) {
- return $nodes[$found_maintenace_fallback];
+ if (defined($found_maintenance_fallback)) {
+ return $nodes[$found_maintenance_fallback];
}
if ($try_next) {
sub recompute_online_node_usage {
my ($self) = @_;
- my $online_node_usage = {};
+ my $online_node_usage = PVE::HA::Usage::Basic->new($self->{haenv});
my $online_nodes = $self->{ns}->list_online_nodes();
- foreach my $node (@$online_nodes) {
- $online_node_usage->{$node} = 0;
- }
+ $online_node_usage->add_node($_) for $online_nodes->@*;
foreach my $sid (keys %{$self->{ss}}) {
my $sd = $self->{ss}->{$sid};
my $state = $sd->{state};
- if (defined($online_node_usage->{$sd->{node}})) {
+ my $target = $sd->{target}; # optional
+ if ($online_node_usage->contains_node($sd->{node})) {
if (
$state eq 'started' || $state eq 'request_stop' || $state eq 'fence' ||
$state eq 'freeze' || $state eq 'error' || $state eq 'recovery'
) {
- $online_node_usage->{$sd->{node}}++;
+ $online_node_usage->add_service_usage_to_node($sd->{node}, $sid, $sd->{node});
} elsif (($state eq 'migrate') || ($state eq 'relocate')) {
+ my $source = $sd->{node};
# count it for both, source and target as load is put on both
- $online_node_usage->{$sd->{node}}++;
- $online_node_usage->{$sd->{target}}++;
+ $online_node_usage->add_service_usage_to_node($source, $sid, $source, $target);
+ $online_node_usage->add_service_usage_to_node($target, $sid, $source, $target);
} elsif ($state eq 'stopped') {
# do nothing
} else {
die "should not be reached (sid = '$sid', state = '$state')";
}
+ } elsif (defined($target) && $online_node_usage->contains_node($target)) {
+ if ($state eq 'migrate' || $state eq 'relocate') {
+                # to correctly track maintenance modes and also consider the target as used in
+                # case a node dies, as we cannot really know if the to-be-aborted incoming
+                # migration has already cleaned up all used resources
+ $online_node_usage->add_service_usage_to_node($target, $sid, $sd->{node}, $target);
+ }
}
}
# handle fencing
my $fenced_nodes = {};
foreach my $sid (sort keys %$ss) {
- my $sd = $ss->{$sid};
- next if $sd->{state} ne 'fence';
+ my ($service_state, $service_node) = $ss->{$sid}->@{'state', 'node'};
+ next if $service_state ne 'fence';
- if (!defined($fenced_nodes->{$sd->{node}})) {
- $fenced_nodes->{$sd->{node}} = $ns->fence_node($sd->{node}) || 0;
+ if (!defined($fenced_nodes->{$service_node})) {
+ $fenced_nodes->{$service_node} = $ns->fence_node($service_node) || 0;
}
- next if !$fenced_nodes->{$sd->{node}};
+ next if !$fenced_nodes->{$service_node};
# node fence was successful - recover service
$change_service_state->($self, $sid, 'recovery');
- $repeat = 1; # for faster execution
+ $repeat = 1; # for faster recovery execution
+ }
+
+ # Avoid that a node without services in 'fence' state (e.g., removed
+ # manually by admin) is stuck with the 'fence' node state.
+ for my $node (sort grep { !defined($fenced_nodes->{$_}) } keys $ns->{status}->%*) {
+ next if $ns->get_node_state($node) ne 'fence';
+
+ $haenv->log('notice', "node '$node' in fence state but no services to-fence! admin interference?!");
+ $repeat = 1 if $ns->fence_node($node);
}
last if !$repeat;
my $node = select_service_node(
$self->{groups},
$self->{online_node_usage},
+ $sid,
$cd,
$sd->{node},
$try_next,
);
if ($node && ($sd->{node} ne $node)) {
- $self->{online_node_usage}->{$node}++;
+ $self->{online_node_usage}->add_service_usage_to_node($node, $sid, $sd->{node});
if (defined(my $fallback = $sd->{maintenance_node})) {
if ($node eq $fallback) {
my $recovery_node = select_service_node(
$self->{groups},
$self->{online_node_usage},
+ $sid,
$cd,
$sd->{node},
);
$fence_recovery_cleanup->($self, $sid, $fenced_node);
$haenv->steal_service($sid, $sd->{node}, $recovery_node);
- $self->{online_node_usage}->{$recovery_node}++;
+ $self->{online_node_usage}->add_service_usage_to_node($recovery_node, $sid, $recovery_node);
# NOTE: $sd *is normally read-only*, fencing is the exception
$cd->{node} = $sd->{node} = $recovery_node;