$self->{service_status} = $ms->{service_status} || {};
my $nodename = $haenv->nodename();
$self->{node_status} = $ms->{node_status}->{$nodename} || 'unknown';
+
+ # FIXME: method name is a bit confusing for doing this, either rename or move
+ if (!$self->{shutdown_request}) {
+ my $request = $ms->{node_request}->{$nodename} // {};
+ if ($request->{maintenance}) {
+ $self->{mode} = 'maintenance';
+ } elsif ($self->{mode} eq 'maintenance') {
+ $self->{mode} = 'active';
+ }
+ }
+
return 1;
}
}
if (!$self->get_protected_ha_agent_lock()) {
$self->set_local_status({ state => 'lost_agent_lock'});
}
+ } elsif (!$self->is_maintenance_requested()) {
+ # empty && no maintenance mode && not exited -> need to switch active again
+ if ($self->get_protected_ha_agent_lock()) {
+ $self->set_local_status({ state => 'active' });
+ } else {
+ $self->set_local_status({ state => 'lost_agent_lock'});
+ }
}
}
} else {
$haenv->log('err', "crm command error - no such service: $cmd");
}
+ } elsif ($cmd =~ m/^enable-node-maintenance\s+(\S+)$/) {
+ my $node = $1;
+
+ my $state = $ns->get_node_state($node);
+ if ($state eq 'online') {
+ $ms->{node_request}->{$node}->{maintenance} = 1;
+ } elsif ($state eq 'maintenance') {
+ $haenv->log('info', "ignoring crm command - node $node is already in maintenance state");
+ } else {
+ $haenv->log('err', "crm command error - node not online: $cmd");
+ }
+ } elsif ($cmd =~ m/^disable-node-maintenance\s+(\S+)$/) {
+ my $node = $1;
+
+ my $state = $ns->get_node_state($node);
+ if ($state ne 'maintenance') {
+ $haenv->log(
+ 'warn', "clearing maintenance of node $node requested, but it's in state $state");
+ }
+ delete $ms->{node_request}->{$node}->{maintenance}; # gets flushed out at the end of the CRM loop
} else {
$haenv->log('err', "unable to parse crm command: $cmd");
}
}
} elsif ($action eq 'start') {
$d->{crm} = $self->crm_control('start', $d, $lock_fh) if !defined($d->{crm});
+ } elsif ($action eq 'enable-node-maintenance' || $action eq 'disable-node-maintenance') {
+ $self->queue_crm_commands_nolock("$action $node");
} else {
die "sim_hardware_cmd: unknown action '$action'";
}
--- /dev/null
+Test setting the maintenance mode for a node manually, then reboot it and
+ensure that maintenance mode doesn't get cleared automatically that way.
--- /dev/null
+[
+ [ "power node1 on", "power node2 on", "power node3 on"],
+ [ "crm node3 enable-node-maintenance" ],
+ [ "power node3 off" ],
+ [ "power node3 on" ],
+ [ "crm node3 disable-node-maintenance" ]
+]
--- /dev/null
+{
+ "node1": { "power": "off", "network": "off" },
+ "node2": { "power": "off", "network": "off" },
+ "node3": { "power": "off", "network": "off" }
+}
--- /dev/null
+info 0 hardware: starting simulation
+info 20 cmdlist: execute power node1 on
+info 20 node1/crm: status change startup => wait_for_quorum
+info 20 node1/lrm: status change startup => wait_for_agent_lock
+info 20 cmdlist: execute power node2 on
+info 20 node2/crm: status change startup => wait_for_quorum
+info 20 node2/lrm: status change startup => wait_for_agent_lock
+info 20 cmdlist: execute power node3 on
+info 20 node3/crm: status change startup => wait_for_quorum
+info 20 node3/lrm: status change startup => wait_for_agent_lock
+info 20 node1/crm: got lock 'ha_manager_lock'
+info 20 node1/crm: status change wait_for_quorum => master
+info 20 node1/crm: node 'node1': state changed from 'unknown' => 'online'
+info 20 node1/crm: node 'node2': state changed from 'unknown' => 'online'
+info 20 node1/crm: node 'node3': state changed from 'unknown' => 'online'
+info 20 node1/crm: adding new service 'vm:101' on node 'node1'
+info 20 node1/crm: adding new service 'vm:102' on node 'node2'
+info 20 node1/crm: adding new service 'vm:103' on node 'node3'
+info 20 node1/crm: service 'vm:101': state changed from 'request_start' to 'started' (node = node1)
+info 20 node1/crm: service 'vm:103': state changed from 'request_start' to 'started' (node = node3)
+info 21 node1/lrm: got lock 'ha_agent_node1_lock'
+info 21 node1/lrm: status change wait_for_agent_lock => active
+info 21 node1/lrm: starting service vm:101
+info 21 node1/lrm: service status vm:101 started
+info 22 node2/crm: status change wait_for_quorum => slave
+info 23 node2/lrm: got lock 'ha_agent_node2_lock'
+info 23 node2/lrm: status change wait_for_agent_lock => active
+info 24 node3/crm: status change wait_for_quorum => slave
+info 25 node3/lrm: got lock 'ha_agent_node3_lock'
+info 25 node3/lrm: status change wait_for_agent_lock => active
+info 25 node3/lrm: starting service vm:103
+info 25 node3/lrm: service status vm:103 started
+info 40 node1/crm: service 'vm:102': state changed from 'request_stop' to 'stopped'
+info 120 cmdlist: execute crm node3 enable-node-maintenance
+info 125 node3/lrm: status change active => maintenance
+info 140 node1/crm: node 'node3': state changed from 'online' => 'maintenance'
+info 140 node1/crm: migrate service 'vm:103' to node 'node2' (running)
+info 140 node1/crm: service 'vm:103': state changed from 'started' to 'migrate' (node = node3, target = node2)
+info 145 node3/lrm: service vm:103 - start migrate to node 'node2'
+info 145 node3/lrm: service vm:103 - end migrate to node 'node2'
+info 160 node1/crm: service 'vm:103': state changed from 'migrate' to 'started' (node = node2)
+info 163 node2/lrm: starting service vm:103
+info 163 node2/lrm: service status vm:103 started
+info 220 cmdlist: execute power node3 off
+info 220 node3/crm: killed by poweroff
+info 220 node3/lrm: killed by poweroff
+info 320 cmdlist: execute power node3 on
+info 320 node3/crm: status change startup => wait_for_quorum
+info 320 node3/lrm: status change startup => wait_for_agent_lock
+info 324 node3/crm: status change wait_for_quorum => slave
+info 420 cmdlist: execute crm node3 disable-node-maintenance
+info 440 node1/crm: node 'node3': state changed from 'maintenance' => 'online'
+info 440 node1/crm: moving service 'vm:103' back to 'node3', node came back from maintenance.
+info 440 node1/crm: migrate service 'vm:103' to node 'node3' (running)
+info 440 node1/crm: service 'vm:103': state changed from 'started' to 'migrate' (node = node2, target = node3)
+info 443 node2/lrm: service vm:103 - start migrate to node 'node3'
+info 443 node2/lrm: service vm:103 - end migrate to node 'node3'
+info 460 node1/crm: service 'vm:103': state changed from 'migrate' to 'started' (node = node3)
+info 465 node3/lrm: got lock 'ha_agent_node3_lock'
+info 465 node3/lrm: status change wait_for_agent_lock => active
+info 465 node3/lrm: starting service vm:103
+info 465 node3/lrm: service status vm:103 started
+info 1020 hardware: exit simulation - done
--- /dev/null
+{}
\ No newline at end of file
--- /dev/null
+{
+ "vm:101": { "node": "node1", "state": "enabled" },
+ "vm:102": { "node": "node2" },
+ "vm:103": { "node": "node3", "state": "enabled" }
+}
\ No newline at end of file
--- /dev/null
+Test that setting the maintenance mode for a offline node (node3) manually fails, then re-power it
+(node3) and enable mainteancne mode for an online node (node2). Now trigger a manual migrate of the
+for maintenance moved service. Disable maitenance mode again (node2) and currently the HA stack will
+migrate the service that got moved twice, once automatically for maintenance and once manually, back
+due to the maitenance fail-back logic.
+
+If that is the right behavior, or if a manual migration should clear any maintenance fallback might
+be up for debate, this test just encodes the actual current behavior to ensure its not changed by
+mistake.
--- /dev/null
+[
+ [ "power node1 on", "power node2 on", "power node3 on"],
+ [ "network node3 off" ],
+ [ "crm node3 enable-node-maintenance" ],
+ [ "power node3 on" ],
+ [ "crm node2 enable-node-maintenance" ],
+ [ "service vm:103 migrate node1" ],
+ [ "crm node2 disable-node-maintenance" ]
+]
--- /dev/null
+{
+ "node1": { "power": "off", "network": "off" },
+ "node2": { "power": "off", "network": "off" },
+ "node3": { "power": "off", "network": "off" }
+}
--- /dev/null
+info 0 hardware: starting simulation
+info 20 cmdlist: execute power node1 on
+info 20 node1/crm: status change startup => wait_for_quorum
+info 20 node1/lrm: status change startup => wait_for_agent_lock
+info 20 cmdlist: execute power node2 on
+info 20 node2/crm: status change startup => wait_for_quorum
+info 20 node2/lrm: status change startup => wait_for_agent_lock
+info 20 cmdlist: execute power node3 on
+info 20 node3/crm: status change startup => wait_for_quorum
+info 20 node3/lrm: status change startup => wait_for_agent_lock
+info 20 node1/crm: got lock 'ha_manager_lock'
+info 20 node1/crm: status change wait_for_quorum => master
+info 20 node1/crm: node 'node1': state changed from 'unknown' => 'online'
+info 20 node1/crm: node 'node2': state changed from 'unknown' => 'online'
+info 20 node1/crm: node 'node3': state changed from 'unknown' => 'online'
+info 20 node1/crm: adding new service 'vm:101' on node 'node1'
+info 20 node1/crm: adding new service 'vm:102' on node 'node2'
+info 20 node1/crm: adding new service 'vm:103' on node 'node3'
+info 20 node1/crm: service 'vm:101': state changed from 'request_start' to 'started' (node = node1)
+info 20 node1/crm: service 'vm:103': state changed from 'request_start' to 'started' (node = node3)
+info 21 node1/lrm: got lock 'ha_agent_node1_lock'
+info 21 node1/lrm: status change wait_for_agent_lock => active
+info 21 node1/lrm: starting service vm:101
+info 21 node1/lrm: service status vm:101 started
+info 22 node2/crm: status change wait_for_quorum => slave
+info 23 node2/lrm: got lock 'ha_agent_node2_lock'
+info 23 node2/lrm: status change wait_for_agent_lock => active
+info 24 node3/crm: status change wait_for_quorum => slave
+info 25 node3/lrm: got lock 'ha_agent_node3_lock'
+info 25 node3/lrm: status change wait_for_agent_lock => active
+info 25 node3/lrm: starting service vm:103
+info 25 node3/lrm: service status vm:103 started
+info 40 node1/crm: service 'vm:102': state changed from 'request_stop' to 'stopped'
+info 120 cmdlist: execute network node3 off
+info 120 node1/crm: node 'node3': state changed from 'online' => 'unknown'
+info 124 node3/crm: status change slave => wait_for_quorum
+info 125 node3/lrm: status change active => lost_agent_lock
+info 160 node1/crm: service 'vm:103': state changed from 'started' to 'fence'
+info 160 node1/crm: node 'node3': state changed from 'unknown' => 'fence'
+emai 160 node1/crm: FENCE: Try to fence node 'node3'
+info 166 watchdog: execute power node3 off
+info 165 node3/crm: killed by poweroff
+info 166 node3/lrm: killed by poweroff
+info 166 hardware: server 'node3' stopped by poweroff (watchdog)
+info 220 cmdlist: execute crm node3 enable-node-maintenance
+err 220 node1/crm: crm command error - node not online: enable-node-maintenance node3
+info 240 node1/crm: got lock 'ha_agent_node3_lock'
+info 240 node1/crm: fencing: acknowledged - got agent lock for node 'node3'
+info 240 node1/crm: node 'node3': state changed from 'fence' => 'unknown'
+emai 240 node1/crm: SUCCEED: fencing: acknowledged - got agent lock for node 'node3'
+info 240 node1/crm: service 'vm:103': state changed from 'fence' to 'recovery'
+info 240 node1/crm: recover service 'vm:103' from fenced node 'node3' to node 'node2'
+info 240 node1/crm: service 'vm:103': state changed from 'recovery' to 'started' (node = node2)
+info 243 node2/lrm: starting service vm:103
+info 243 node2/lrm: service status vm:103 started
+info 320 cmdlist: execute power node3 on
+info 320 node3/crm: status change startup => wait_for_quorum
+info 320 node3/lrm: status change startup => wait_for_agent_lock
+info 320 node1/crm: node 'node3': state changed from 'unknown' => 'online'
+info 324 node3/crm: status change wait_for_quorum => slave
+info 420 cmdlist: execute crm node2 enable-node-maintenance
+info 423 node2/lrm: status change active => maintenance
+info 440 node1/crm: node 'node2': state changed from 'online' => 'maintenance'
+info 440 node1/crm: migrate service 'vm:103' to node 'node3' (running)
+info 440 node1/crm: service 'vm:103': state changed from 'started' to 'migrate' (node = node2, target = node3)
+info 443 node2/lrm: service vm:103 - start migrate to node 'node3'
+info 443 node2/lrm: service vm:103 - end migrate to node 'node3'
+info 460 node1/crm: service 'vm:103': state changed from 'migrate' to 'started' (node = node3)
+info 465 node3/lrm: got lock 'ha_agent_node3_lock'
+info 465 node3/lrm: status change wait_for_agent_lock => active
+info 465 node3/lrm: starting service vm:103
+info 465 node3/lrm: service status vm:103 started
+info 520 cmdlist: execute service vm:103 migrate node1
+info 520 node1/crm: got crm command: migrate vm:103 node1
+info 520 node1/crm: migrate service 'vm:103' to node 'node1'
+info 520 node1/crm: service 'vm:103': state changed from 'started' to 'migrate' (node = node3, target = node1)
+info 525 node3/lrm: service vm:103 - start migrate to node 'node1'
+info 525 node3/lrm: service vm:103 - end migrate to node 'node1'
+info 540 node1/crm: service 'vm:103': state changed from 'migrate' to 'started' (node = node1)
+info 541 node1/lrm: starting service vm:103
+info 541 node1/lrm: service status vm:103 started
+info 620 cmdlist: execute crm node2 disable-node-maintenance
+info 623 node2/lrm: got lock 'ha_agent_node2_lock'
+info 623 node2/lrm: status change maintenance => active
+info 640 node1/crm: node 'node2': state changed from 'maintenance' => 'online'
+info 640 node1/crm: moving service 'vm:103' back to 'node2', node came back from maintenance.
+info 640 node1/crm: migrate service 'vm:103' to node 'node2' (running)
+info 640 node1/crm: service 'vm:103': state changed from 'started' to 'migrate' (node = node1, target = node2)
+info 641 node1/lrm: service vm:103 - start migrate to node 'node2'
+info 641 node1/lrm: service vm:103 - end migrate to node 'node2'
+info 660 node1/crm: service 'vm:103': state changed from 'migrate' to 'started' (node = node2)
+info 663 node2/lrm: starting service vm:103
+info 663 node2/lrm: service status vm:103 started
+info 1220 hardware: exit simulation - done
--- /dev/null
+{}
\ No newline at end of file
--- /dev/null
+{
+ "vm:101": { "node": "node1", "state": "enabled" },
+ "vm:102": { "node": "node2" },
+ "vm:103": { "node": "node3", "state": "enabled" }
+}
\ No newline at end of file