}
sub select_service_node {
- my ($groups, $online_node_usage, $service_conf, $current_node, $try_next, $tried_nodes) = @_;
+ my ($groups, $online_node_usage, $service_conf, $current_node, $try_next, $tried_nodes, $maintenance_fallback) = @_;
my $group = get_service_group($groups, $online_node_usage, $service_conf);
} keys %{$pri_groups->{$top_pri}};
my $found;
+ my $found_maintenace_fallback;
for (my $i = scalar(@nodes) - 1; $i >= 0; $i--) {
my $node = $nodes[$i];
if ($node eq $current_node) {
$found = $i;
- last;
}
+ if (defined($maintenance_fallback) && $node eq $maintenance_fallback) {
+ $found_maintenace_fallback = $i;
+ }
+ }
+
+ if (defined($found_maintenace_fallback)) {
+ return $nodes[$found_maintenace_fallback];
}
if ($try_next) {
my $old_state = $sd->{state};
my $old_node = $sd->{node};
my $old_failed_nodes = $sd->{failed_nodes};
+ my $old_maintenance_node = $sd->{maintenance_node};
die "no state change" if $old_state eq $new_state; # just to be sure
$sd->{state} = $new_state;
$sd->{node} = $old_node;
$sd->{failed_nodes} = $old_failed_nodes if defined($old_failed_nodes);
+ $sd->{maintenance_node} = $old_maintenance_node if defined($old_maintenance_node);
my $text_state = '';
foreach my $k (sort keys %params) {
}
if ($ns->get_node_state($sd->{node}) ne 'maintenance') {
return;
+ } else {
+ # save current node as fallback for when it comes out of
+ # maintenance
+ $sd->{maintenance_node} = $sd->{node};
}
}
}
}
- my $node = select_service_node($self->{groups}, $self->{online_node_usage},
- $cd, $sd->{node}, $try_next, $sd->{failed_nodes});
+ my $node = select_service_node(
+ $self->{groups},
+ $self->{online_node_usage},
+ $cd,
+ $sd->{node},
+ $try_next,
+ $sd->{failed_nodes},
+ $sd->{maintenance_node},
+ );
if ($node && ($sd->{node} ne $node)) {
$self->{online_node_usage}->{$node}++;
+
+ if (defined(my $fallback = $sd->{maintenance_node})) {
+ if ($node eq $fallback) {
+ $haenv->log('info', "moving service '$sid' back to '$fallback', node came back from maintenance.");
+ delete $sd->{maintenance_node};
+ } elsif ($sd->{node} ne $fallback) {
+ $haenv->log('info', "dropping maintenance fallback node '$fallback' for '$sid'");
+ delete $sd->{maintenance_node};
+ }
+ }
+
if ($cd->{type} eq 'vm') {
$haenv->log('info', "migrate service '$sid' to node '$node' (running)");
&$change_service_state($self, $sid, 'migrate', node => $sd->{node}, target => $node);
info 345 node3/crm: status change startup => wait_for_quorum
info 340 node3/lrm: status change startup => wait_for_agent_lock
info 360 node1/crm: node 'node3': state changed from 'maintenance' => 'online'
+info 360 node1/crm: moving service 'fa:109' back to 'node3', node came back from maintenance.
+info 360 node1/crm: relocate service 'fa:109' to node 'node3'
+info 360 node1/crm: service 'fa:109': state changed from 'started' to 'relocate' (node = node2, target = node3)
+info 360 node1/crm: moving service 'vm:103' back to 'node3', node came back from maintenance.
+info 360 node1/crm: migrate service 'vm:103' to node 'node3' (running)
+info 360 node1/crm: service 'vm:103': state changed from 'started' to 'migrate' (node = node1, target = node3)
+info 361 node1/lrm: service vm:103 - start migrate to node 'node3'
+info 361 node1/lrm: service vm:103 - end migrate to node 'node3'
+err 363 node2/lrm: service fa:109 not moved (migration error)
info 364 node3/crm: status change wait_for_quorum => slave
+err 380 node1/crm: service 'fa:109' - migration failed (exit code 1)
+info 380 node1/crm: service 'fa:109': state changed from 'relocate' to 'started' (node = node2)
+info 380 node1/crm: service 'vm:103': state changed from 'migrate' to 'started' (node = node3)
+info 385 node3/lrm: got lock 'ha_agent_node3_lock'
+info 385 node3/lrm: status change wait_for_agent_lock => active
+info 385 node3/lrm: starting service vm:103
+info 385 node3/lrm: service status vm:103 started
info 720 hardware: exit simulation - done
info 165 node3/crm: status change startup => wait_for_quorum
info 160 node3/lrm: status change startup => wait_for_agent_lock
info 180 node1/crm: node 'node3': state changed from 'maintenance' => 'online'
+info 180 node1/crm: moving service 'ct:102' back to 'node3', node came back from maintenance.
+info 180 node1/crm: relocate service 'ct:102' to node 'node3'
+info 180 node1/crm: service 'ct:102': state changed from 'started' to 'relocate' (node = node1, target = node3)
+info 180 node1/crm: moving service 'vm:103' back to 'node3', node came back from maintenance.
+info 180 node1/crm: migrate service 'vm:103' to node 'node3' (running)
+info 180 node1/crm: service 'vm:103': state changed from 'started' to 'migrate' (node = node1, target = node3)
+info 181 node1/lrm: service ct:102 - start relocate to node 'node3'
+info 181 node1/lrm: stopping service ct:102 (relocate)
+info 181 node1/lrm: service status ct:102 stopped
+info 181 node1/lrm: service ct:102 - end relocate to node 'node3'
+info 181 node1/lrm: service vm:103 - start migrate to node 'node3'
+info 181 node1/lrm: service vm:103 - end migrate to node 'node3'
info 184 node3/crm: status change wait_for_quorum => slave
+info 200 node1/crm: service 'ct:102': state changed from 'relocate' to 'started' (node = node3)
+info 200 node1/crm: service 'vm:103': state changed from 'migrate' to 'started' (node = node3)
+info 205 node3/lrm: got lock 'ha_agent_node3_lock'
+info 205 node3/lrm: status change wait_for_agent_lock => active
+info 205 node3/lrm: starting service ct:102
+info 205 node3/lrm: service status ct:102 started
+info 205 node3/lrm: starting service vm:103
+info 205 node3/lrm: service status vm:103 started
info 720 hardware: exit simulation - done