From: Thomas Lamprecht
Date: Fri, 2 Jul 2021 15:32:42 +0000 (+0200)
Subject: fix #3415: never switch in error state on recovery, try harder
X-Git-Url: https://git.proxmox.com/?p=pve-ha-manager.git;a=commitdiff_plain;h=90a247552cc27d84f13c31a5dfa560ee9ae10af6

fix #3415: never switch in error state on recovery, try harder

With the new 'recovery' state introduced in a previous commit we get a
clean transition, and thus an actual difference, between to-be-fenced
and fenced. Use that to avoid going into the error state when we did
not find any possible new node we could recover the service to.

That can happen if the user uses the HA manager for local services,
which is an OK use case as long as the service is restricted to a
group containing only that node. But previously we could never recover
such services if their node failed, as they always got put into the
"error" dummy/final state. That's just artificially limiting ourselves
to get a false sense of safety.

Nobody touches a service while it's in the recovery state, neither the
LRM nor anything else (any normal API call just gets routed to the HA
stack anyway), so there's no chance of a bad double-start of the same
service, with resource access collisions and all the other bad stuff
that could happen. (Note that in practice this only matters for
restricted services, which normally use only local resources, so here
it wouldn't even matter if it wasn't safe already - but it is, double
time!)

So, the usual transition guarantees still hold:
* only the current master does transitions
* there needs to be an OK, quorate partition to have a master

And, for getting into recovery the following holds:
* the old node's lock was acquired by the master, which means it was
  (self-)fenced -> resource not running

So, as "recovery" is a no-op state that we only get into once the node
was fenced, we can keep retrying recovery, i.e., try to find a new
node for the failed services.

Tests:
* adapt the existing recovery test output to match the endless retry
  for finding a new node (vs. the previous "go into error immediately"
  behavior)
* add a test where the node comes up again eventually, so that we also
  cover recovery back to the same node the service was on previous to
  the failure
* add a test with a non-empty start state where the restricted, failed
  node is online again. This ensures that the service won't get
  started until the HA manager has actively recovered it, even if it
  stays on that node.

Signed-off-by: Thomas Lamprecht
---

diff --git a/src/PVE/HA/LRM.pm b/src/PVE/HA/LRM.pm
index b3074d5..8dcabca 100644
--- a/src/PVE/HA/LRM.pm
+++ b/src/PVE/HA/LRM.pm
@@ -618,8 +618,16 @@ sub manage_resources {
         next if $sd->{node} ne $nodename;
         my $req_state = $sd->{state};
         next if !defined($req_state);
+        # can only happen for restricted groups where the failed node itself needs to be the
+        # recovery target. Always let the master do so first; it will then be marked as 'stopped'
+        # and we can just continue normally. But we must NOT do anything with it while still in recovery.
+        next if $req_state eq 'recovery';
         next if $req_state eq 'freeze';
-        $self->queue_resource_command($sid, $sd->{uid}, $req_state, {'target' => $sd->{target}, 'timeout' => $sd->{timeout}});
+
+        $self->queue_resource_command($sid, $sd->{uid}, $req_state, {
+            'target' => $sd->{target},
+            'timeout' => $sd->{timeout},
+        });
     }
 
     return $self->run_workers();
diff --git a/src/PVE/HA/Manager.pm b/src/PVE/HA/Manager.pm
index d65fb88..d1bd477 100644
--- a/src/PVE/HA/Manager.pm
+++ b/src/PVE/HA/Manager.pm
@@ -811,7 +811,12 @@ sub next_state_recovery {
     );
 
     if ($recovery_node) {
-        $haenv->log('info', "recover service '$sid' from fenced node '$fenced_node' to node '$recovery_node'");
+        my $msg = "recover service '$sid' from fenced node '$fenced_node' to node '$recovery_node'";
+        if ($recovery_node eq $fenced_node) {
+            # can happen with restricted groups when the node came up again OK
+            $msg = "recover service '$sid' to previous failed and fenced node '$fenced_node' again";
+        }
+        $haenv->log('info', "$msg");
 
         $fence_recovery_cleanup->($self, $sid, $fenced_node);
 
@@ -825,7 +830,6 @@ sub next_state_recovery {
 
     } else {
         # no possible node found, cannot recover - but retry later, as we always try to make it available
        $haenv->log('err', "recovering service '$sid' from fenced node '$fenced_node' failed, no recovery node found");
-        $change_service_state->($self, $sid, 'error');
     }
 }
diff --git a/src/test/test-recovery1/README b/src/test/test-recovery1/README
index 8753ad2..5652a8d 100644
--- a/src/test/test-recovery1/README
+++ b/src/test/test-recovery1/README
@@ -1,4 +1,5 @@
-Test what happens if a service needs to get recovered but
-select_service_node cannot return any possible node.
+Test what happens if a service needs to get recovered but select_service_node
+cannot return any possible node.
 
-Avoid endless loops by placing the service in the error state.
+Try recovery forever, as this is HA and its single job is to get that service
+available again, no matter how long it takes.
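For readers following the state-machine argument above: after this patch, next_state_recovery simply stays in 'recovery' whenever no target node is found, and the CRM re-runs it on every manager round - that is what produces the periodic "no recovery node found" lines in the expected logs below. A minimal Perl sketch of that retry behaviour follows; the helper and variable names here are illustrative stand-ins, not the actual pve-ha-manager internals:

    use strict;
    use warnings;

    # Simplified sketch of the post-patch recovery handling; $find_node and $log
    # are caller-supplied stand-ins for select_service_node() and the env logger.
    sub sketch_next_state_recovery {
        my ($sd, $find_node, $log) = @_;    # $sd: per-service state hash

        my $recovery_node = $find_node->($sd);

        if (!defined($recovery_node)) {
            # No candidate (e.g. a restricted group whose only node is still down):
            # keep the 'recovery' state untouched so the next CRM round retries.
            $log->('err', "recovering service '$sd->{sid}' failed, no recovery node found");
            return;
        }

        # A node was found - possibly the fenced node itself once it is back
        # online. Move the service there and let the LRM start it again.
        $sd->{node}  = $recovery_node;
        $sd->{state} = 'started';
        $log->('info', "recover service '$sd->{sid}' to node '$recovery_node'");
    }

The real code additionally runs $fence_recovery_cleanup and uses the proper state-change helpers; the point of the sketch is only the "no node found => do nothing and retry" branch that replaces the old transition to 'error'.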
diff --git a/src/test/test-recovery1/log.expect b/src/test/test-recovery1/log.expect
index e4496b5..bea4c42 100644
--- a/src/test/test-recovery1/log.expect
+++ b/src/test/test-recovery1/log.expect
@@ -37,5 +37,27 @@ info 240 node1/crm: node 'node2': state changed from 'fence' => 'unknown'
 emai 240 node1/crm: SUCCEED: fencing: acknowledged - got agent lock for node 'node2'
 info 240 node1/crm: service 'vm:102': state changed from 'fence' to 'recovery'
 err 240 node1/crm: recovering service 'vm:102' from fenced node 'node2' failed, no recovery node found
-info 240 node1/crm: service 'vm:102': state changed from 'recovery' to 'error'
+err 260 node1/crm: recovering service 'vm:102' from fenced node 'node2' failed, no recovery node found
+err 280 node1/crm: recovering service 'vm:102' from fenced node 'node2' failed, no recovery node found
+err 300 node1/crm: recovering service 'vm:102' from fenced node 'node2' failed, no recovery node found
+err 320 node1/crm: recovering service 'vm:102' from fenced node 'node2' failed, no recovery node found
+err 340 node1/crm: recovering service 'vm:102' from fenced node 'node2' failed, no recovery node found
+err 360 node1/crm: recovering service 'vm:102' from fenced node 'node2' failed, no recovery node found
+err 380 node1/crm: recovering service 'vm:102' from fenced node 'node2' failed, no recovery node found
+err 400 node1/crm: recovering service 'vm:102' from fenced node 'node2' failed, no recovery node found
+err 420 node1/crm: recovering service 'vm:102' from fenced node 'node2' failed, no recovery node found
+err 440 node1/crm: recovering service 'vm:102' from fenced node 'node2' failed, no recovery node found
+err 460 node1/crm: recovering service 'vm:102' from fenced node 'node2' failed, no recovery node found
+err 480 node1/crm: recovering service 'vm:102' from fenced node 'node2' failed, no recovery node found
+err 500 node1/crm: recovering service 'vm:102' from fenced node 'node2' failed, no recovery node found
+err 520 node1/crm: recovering service 'vm:102' from fenced node 'node2' failed, no recovery node found
+err 540 node1/crm: recovering service 'vm:102' from fenced node 'node2' failed, no recovery node found
+err 560 node1/crm: recovering service 'vm:102' from fenced node 'node2' failed, no recovery node found
+err 580 node1/crm: recovering service 'vm:102' from fenced node 'node2' failed, no recovery node found
+err 600 node1/crm: recovering service 'vm:102' from fenced node 'node2' failed, no recovery node found
+err 620 node1/crm: recovering service 'vm:102' from fenced node 'node2' failed, no recovery node found
+err 640 node1/crm: recovering service 'vm:102' from fenced node 'node2' failed, no recovery node found
+err 660 node1/crm: recovering service 'vm:102' from fenced node 'node2' failed, no recovery node found
+err 680 node1/crm: recovering service 'vm:102' from fenced node 'node2' failed, no recovery node found
+err 700 node1/crm: recovering service 'vm:102' from fenced node 'node2' failed, no recovery node found
 info 720 hardware: exit simulation - done
diff --git a/src/test/test-recovery2/README b/src/test/test-recovery2/README
new file mode 100644
index 0000000..017d0f2
--- /dev/null
+++ b/src/test/test-recovery2/README
@@ -0,0 +1,3 @@
+Test what happens if a service needs to get recovered but select_service_node
+cannot return any possible node due to restricted groups, but after a while the
+original node comes up, in which case the service must be recovered.
diff --git a/src/test/test-recovery2/cmdlist b/src/test/test-recovery2/cmdlist
new file mode 100644
index 0000000..6696c45
--- /dev/null
+++ b/src/test/test-recovery2/cmdlist
@@ -0,0 +1,15 @@
+[
+    [ "power node1 on", "power node2 on", "power node3 on"],
+    [ "network node2 off" ],
+    [ "delay 0" ],
+    [ "delay 0" ],
+    [ "delay 0" ],
+    [ "delay 0" ],
+    [ "delay 0" ],
+    [ "delay 0" ],
+    [ "delay 0" ],
+    [ "delay 0" ],
+    [ "delay 0" ],
+    [ "delay 0" ],
+    [ "power node2 on", "network node2 on" ]
+]
diff --git a/src/test/test-recovery2/groups b/src/test/test-recovery2/groups
new file mode 100644
index 0000000..06c7f76
--- /dev/null
+++ b/src/test/test-recovery2/groups
@@ -0,0 +1,4 @@
+group: prefer_node2
+    nodes node2
+    restricted 1
+
diff --git a/src/test/test-recovery2/hardware_status b/src/test/test-recovery2/hardware_status
new file mode 100644
index 0000000..451beb1
--- /dev/null
+++ b/src/test/test-recovery2/hardware_status
@@ -0,0 +1,5 @@
+{
+  "node1": { "power": "off", "network": "off" },
+  "node2": { "power": "off", "network": "off" },
+  "node3": { "power": "off", "network": "off" }
+}
diff --git a/src/test/test-recovery2/log.expect b/src/test/test-recovery2/log.expect
new file mode 100644
index 0000000..31a9647
--- /dev/null
+++ b/src/test/test-recovery2/log.expect
@@ -0,0 +1,100 @@
+info 0 hardware: starting simulation
+info 20 cmdlist: execute power node1 on
+info 20 node1/crm: status change startup => wait_for_quorum
+info 20 node1/lrm: status change startup => wait_for_agent_lock
+info 20 cmdlist: execute power node2 on
+info 20 node2/crm: status change startup => wait_for_quorum
+info 20 node2/lrm: status change startup => wait_for_agent_lock
+info 20 cmdlist: execute power node3 on
+info 20 node3/crm: status change startup => wait_for_quorum
+info 20 node3/lrm: status change startup => wait_for_agent_lock
+info 20 node1/crm: got lock 'ha_manager_lock'
+info 20 node1/crm: status change wait_for_quorum => master
+info 20 node1/crm: node 'node1': state changed from 'unknown' => 'online'
+info 20 node1/crm: node 'node2': state changed from 'unknown' => 'online'
+info 20 node1/crm: node 'node3': state changed from 'unknown' => 'online'
+info 20 node1/crm: adding new service 'vm:102' on node 'node2'
+info 22 node2/crm: status change wait_for_quorum => slave
+info 23 node2/lrm: got lock 'ha_agent_node2_lock'
+info 23 node2/lrm: status change wait_for_agent_lock => active
+info 23 node2/lrm: starting service vm:102
+info 23 node2/lrm: service status vm:102 started
+info 24 node3/crm: status change wait_for_quorum => slave
+info 120 cmdlist: execute network node2 off
+info 120 node1/crm: node 'node2': state changed from 'online' => 'unknown'
+info 122 node2/crm: status change slave => wait_for_quorum
+info 123 node2/lrm: status change active => lost_agent_lock
+info 160 node1/crm: service 'vm:102': state changed from 'started' to 'fence'
+info 160 node1/crm: node 'node2': state changed from 'unknown' => 'fence'
+emai 160 node1/crm: FENCE: Try to fence node 'node2'
+info 164 watchdog: execute power node2 off
+info 163 node2/crm: killed by poweroff
+info 164 node2/lrm: killed by poweroff
+info 164 hardware: server 'node2' stopped by poweroff (watchdog)
+info 240 node1/crm: got lock 'ha_agent_node2_lock'
+info 240 node1/crm: fencing: acknowledged - got agent lock for node 'node2'
+info 240 node1/crm: node 'node2': state changed from 'fence' => 'unknown'
+emai 240 node1/crm: SUCCEED: fencing: acknowledged - got agent lock for node 'node2'
+info 240 node1/crm: service 'vm:102': state changed from 'fence' to 'recovery'
+err 240 node1/crm: recovering service 'vm:102' from fenced node 'node2' failed, no recovery node found
+err 260 node1/crm: recovering service 'vm:102' from fenced node 'node2' failed, no recovery node found
+err 280 node1/crm: recovering service 'vm:102' from fenced node 'node2' failed, no recovery node found
+err 300 node1/crm: recovering service 'vm:102' from fenced node 'node2' failed, no recovery node found
+err 320 node1/crm: recovering service 'vm:102' from fenced node 'node2' failed, no recovery node found
+err 340 node1/crm: recovering service 'vm:102' from fenced node 'node2' failed, no recovery node found
+err 360 node1/crm: recovering service 'vm:102' from fenced node 'node2' failed, no recovery node found
+err 380 node1/crm: recovering service 'vm:102' from fenced node 'node2' failed, no recovery node found
+err 400 node1/crm: recovering service 'vm:102' from fenced node 'node2' failed, no recovery node found
+err 420 node1/crm: recovering service 'vm:102' from fenced node 'node2' failed, no recovery node found
+err 440 node1/crm: recovering service 'vm:102' from fenced node 'node2' failed, no recovery node found
+err 460 node1/crm: recovering service 'vm:102' from fenced node 'node2' failed, no recovery node found
+err 480 node1/crm: recovering service 'vm:102' from fenced node 'node2' failed, no recovery node found
+err 500 node1/crm: recovering service 'vm:102' from fenced node 'node2' failed, no recovery node found
+err 520 node1/crm: recovering service 'vm:102' from fenced node 'node2' failed, no recovery node found
+err 540 node1/crm: recovering service 'vm:102' from fenced node 'node2' failed, no recovery node found
+err 560 node1/crm: recovering service 'vm:102' from fenced node 'node2' failed, no recovery node found
+err 580 node1/crm: recovering service 'vm:102' from fenced node 'node2' failed, no recovery node found
+err 600 node1/crm: recovering service 'vm:102' from fenced node 'node2' failed, no recovery node found
+err 620 node1/crm: recovering service 'vm:102' from fenced node 'node2' failed, no recovery node found
+err 640 node1/crm: recovering service 'vm:102' from fenced node 'node2' failed, no recovery node found
+err 660 node1/crm: recovering service 'vm:102' from fenced node 'node2' failed, no recovery node found
+err 680 node1/crm: recovering service 'vm:102' from fenced node 'node2' failed, no recovery node found
+err 700 node1/crm: recovering service 'vm:102' from fenced node 'node2' failed, no recovery node found
+err 720 node1/crm: recovering service 'vm:102' from fenced node 'node2' failed, no recovery node found
+err 740 node1/crm: recovering service 'vm:102' from fenced node 'node2' failed, no recovery node found
+err 760 node1/crm: recovering service 'vm:102' from fenced node 'node2' failed, no recovery node found
+err 780 node1/crm: recovering service 'vm:102' from fenced node 'node2' failed, no recovery node found
+err 800 node1/crm: recovering service 'vm:102' from fenced node 'node2' failed, no recovery node found
+err 820 node1/crm: recovering service 'vm:102' from fenced node 'node2' failed, no recovery node found
+err 840 node1/crm: recovering service 'vm:102' from fenced node 'node2' failed, no recovery node found
+err 860 node1/crm: recovering service 'vm:102' from fenced node 'node2' failed, no recovery node found
+err 880 node1/crm: recovering service 'vm:102' from fenced node 'node2' failed, no recovery node found
+err 900 node1/crm: recovering service 'vm:102' from fenced node 'node2' failed, no recovery node found
+err 920 node1/crm: recovering service 'vm:102' from fenced node 'node2' failed, no recovery node found
+err 940 node1/crm: recovering service 'vm:102' from fenced node 'node2' failed, no recovery node found
+err 960 node1/crm: recovering service 'vm:102' from fenced node 'node2' failed, no recovery node found
+err 980 node1/crm: recovering service 'vm:102' from fenced node 'node2' failed, no recovery node found
+err 1000 node1/crm: recovering service 'vm:102' from fenced node 'node2' failed, no recovery node found
+err 1020 node1/crm: recovering service 'vm:102' from fenced node 'node2' failed, no recovery node found
+err 1040 node1/crm: recovering service 'vm:102' from fenced node 'node2' failed, no recovery node found
+err 1060 node1/crm: recovering service 'vm:102' from fenced node 'node2' failed, no recovery node found
+err 1080 node1/crm: recovering service 'vm:102' from fenced node 'node2' failed, no recovery node found
+err 1100 node1/crm: recovering service 'vm:102' from fenced node 'node2' failed, no recovery node found
+err 1120 node1/crm: recovering service 'vm:102' from fenced node 'node2' failed, no recovery node found
+err 1140 node1/crm: recovering service 'vm:102' from fenced node 'node2' failed, no recovery node found
+err 1160 node1/crm: recovering service 'vm:102' from fenced node 'node2' failed, no recovery node found
+err 1180 node1/crm: recovering service 'vm:102' from fenced node 'node2' failed, no recovery node found
+err 1200 node1/crm: recovering service 'vm:102' from fenced node 'node2' failed, no recovery node found
+info 1220 cmdlist: execute power node2 on
+info 1220 node2/crm: status change startup => wait_for_quorum
+info 1220 node2/lrm: status change startup => wait_for_agent_lock
+info 1220 cmdlist: execute network node2 on
+info 1220 node1/crm: node 'node2': state changed from 'unknown' => 'online'
+info 1220 node1/crm: recover service 'vm:102' to previous failed and fenced node 'node2' again
+info 1220 node1/crm: service 'vm:102': state changed from 'recovery' to 'started'  (node = node2)
+info 1222 node2/crm: status change wait_for_quorum => slave
+info 1223 node2/lrm: got lock 'ha_agent_node2_lock'
+info 1223 node2/lrm: status change wait_for_agent_lock => active
+info 1223 node2/lrm: starting service vm:102
+info 1223 node2/lrm: service status vm:102 started
+info 1820 hardware: exit simulation - done
diff --git a/src/test/test-recovery2/manager_status b/src/test/test-recovery2/manager_status
new file mode 100644
index 0000000..0967ef4
--- /dev/null
+++ b/src/test/test-recovery2/manager_status
@@ -0,0 +1 @@
+{}
diff --git a/src/test/test-recovery2/service_config b/src/test/test-recovery2/service_config
new file mode 100644
index 0000000..39a05e5
--- /dev/null
+++ b/src/test/test-recovery2/service_config
@@ -0,0 +1,3 @@
+{
+  "vm:102": { "node": "node2", "state": "enabled", "group": "prefer_node2" }
+}
diff --git a/src/test/test-recovery3/README b/src/test/test-recovery3/README
new file mode 100644
index 0000000..a59688c
--- /dev/null
+++ b/src/test/test-recovery3/README
@@ -0,0 +1,9 @@
+This starts out with an "active scenario", i.e., a non-empty/default manager and
+HW status.
+
+We test what happens if a service needs to get recovered but
+select_service_node cannot return any possible node, while the original node
+it was on is already online again (after fencing).
+
+Ensures that the node does not start the service before the HA manager
+transitions it away from the "recovery" state.
diff --git a/src/test/test-recovery3/cmdlist b/src/test/test-recovery3/cmdlist
new file mode 100644
index 0000000..89cfdb1
--- /dev/null
+++ b/src/test/test-recovery3/cmdlist
@@ -0,0 +1,4 @@
+[
+    [ "power node2 on" ],
+    [ "delay 0" ]
+]
diff --git a/src/test/test-recovery3/groups b/src/test/test-recovery3/groups
new file mode 100644
index 0000000..06c7f76
--- /dev/null
+++ b/src/test/test-recovery3/groups
@@ -0,0 +1,4 @@
+group: prefer_node2
+    nodes node2
+    restricted 1
+
diff --git a/src/test/test-recovery3/hardware_status b/src/test/test-recovery3/hardware_status
new file mode 100644
index 0000000..4fe0194
--- /dev/null
+++ b/src/test/test-recovery3/hardware_status
@@ -0,0 +1,5 @@
+{
+  "node1": { "power": "on", "network": "on" },
+  "node2": { "power": "off", "network": "off" },
+  "node3": { "power": "on", "network": "on" }
+}
diff --git a/src/test/test-recovery3/log.expect b/src/test/test-recovery3/log.expect
new file mode 100644
index 0000000..9b48409
--- /dev/null
+++ b/src/test/test-recovery3/log.expect
@@ -0,0 +1,14 @@
+info 0 hardware: starting simulation
+info 20 cmdlist: execute power node2 on
+info 20 node2/crm: status change startup => wait_for_quorum
+info 20 node2/lrm: status change startup => wait_for_agent_lock
+info 20 node2/crm: got lock 'ha_manager_lock'
+info 20 node2/crm: status change wait_for_quorum => master
+info 20 node2/crm: node 'node2': state changed from 'unknown' => 'online'
+info 20 node2/crm: recover service 'vm:102' to previous failed and fenced node 'node2' again
+info 20 node2/crm: service 'vm:102': state changed from 'recovery' to 'started'  (node = node2)
+info 21 node2/lrm: got lock 'ha_agent_node2_lock'
+info 21 node2/lrm: status change wait_for_agent_lock => active
+info 21 node2/lrm: starting service vm:102
+info 21 node2/lrm: service status vm:102 started
+info 720 hardware: exit simulation - done
diff --git a/src/test/test-recovery3/manager_status b/src/test/test-recovery3/manager_status
new file mode 100644
index 0000000..08459a3
--- /dev/null
+++ b/src/test/test-recovery3/manager_status
@@ -0,0 +1 @@
+{"master_node":"node1","service_status":{"vm:102":{"state":"recovery","node":"node2","uid":"w69Yqf4/pmeb8ymueYNPvQ"}},"timestamp":700,"node_status":{"node1":"online","node2":"unknown","node3":"online"}}
\ No newline at end of file
diff --git a/src/test/test-recovery3/service_config b/src/test/test-recovery3/service_config
new file mode 100644
index 0000000..39a05e5
--- /dev/null
+++ b/src/test/test-recovery3/service_config
@@ -0,0 +1,3 @@
+{
+  "vm:102": { "node": "node2", "state": "enabled", "group": "prefer_node2" }
+}