add ignore state for resources

author Thomas Lamprecht <t.lamprecht@proxmox.com>

Tue, 24 Jan 2017 17:37:22 +0000 (18:37 +0100)

committer Fabian Grünbichler <f.gruenbichler@proxmox.com>

Fri, 13 Oct 2017 08:50:16 +0000 (10:50 +0200)
author Thomas Lamprecht <t.lamprecht@proxmox.com>
Tue, 24 Jan 2017 17:37:22 +0000 (18:37 +0100)
committer Fabian Grünbichler <f.gruenbichler@proxmox.com>
Fri, 13 Oct 2017 08:50:16 +0000 (10:50 +0200)
diff --git a/src/PVE/HA/Config.pm b/src/PVE/HA/Config.pm

index a7a7e3089d949f7550eaec642e853ace47dfcc76..bf37b04ff193969f8eed6592c6287e6fdba1faaf 100644 (file)
--- a/src/PVE/HA/Config.pm
+++ b/src/PVE/HA/Config.pm
@@ -201,7 +201,11 @@ my $service_check_ha_state = sub {
      my ($conf, $sid, $has_state) = @_;
  
      if (my $d = $conf->{ids}->{$sid}) {
-       return 1 if !defined($has_state);
+       if (!defined($has_state)) {
+           # ignored services behave as if they were not managed by HA
+           return 0 if defined($d->{state}) && $d->{state} eq 'ignored';
+           return 1;
+       }
  
         # backward compatibility
         $has_state = 'started' if $has_state eq 'enabled';
diff --git a/src/PVE/HA/Manager.pm b/src/PVE/HA/Manager.pm

index bbcf5e7c79ba541964203fb72f2dc8578ecadb63..25a7398d732303cb223203ac6ba6caac452520d5 100644 (file)
--- a/src/PVE/HA/Manager.pm
+++ b/src/PVE/HA/Manager.pm
@@ -380,6 +380,8 @@ sub manage {
      foreach my $sid (sort keys %$sc) {
         next if $ss->{$sid}; # already there
         my $cd = $sc->{$sid};
+       next if $cd->{state} eq 'ignored';
+
         $haenv->log('info', "adding new service '$sid' on node '$cd->{node}'");
         # assume we are running to avoid relocate running service at add
         my $state = ($cd->{state} eq 'started') ? 'started' : 'request_stop';
@@ -387,10 +389,13 @@ sub manage {
                         uid => compute_new_uuid('started') };
      }
  
-    # remove stale service from manager state
+    # remove stale or ignored services from manager state
      foreach my $sid (keys %$ss) {
-       next if $sc->{$sid};
-       $haenv->log('info', "removing stale service '$sid' (no config)");
+       next if $sc->{$sid} && $sc->{$sid}->{state} ne 'ignored';
+
+       my $reason =  defined($sc->{$sid}) ? 'ignored state requested' : 'no config';
+       $haenv->log('info', "removing stale service '$sid' ($reason)");
+
         # remove all service related state information
         delete $ss->{$sid};
      }
diff --git a/src/PVE/HA/Resources.pm b/src/PVE/HA/Resources.pm

index c0111768830170474d221a7dfcce464ed600ef3f..4d5544227f450bcb86dd072b2481c7654397bc49 100644 (file)
--- a/src/PVE/HA/Resources.pm
+++ b/src/PVE/HA/Resources.pm
@@ -16,7 +16,7 @@ my $defaultData = {
                                    { completion => \&PVE::HA::Tools::complete_sid }),
         state => {
             type => 'string',
-           enum => ['started', 'stopped', 'enabled', 'disabled'],
+           enum => ['started', 'stopped', 'enabled', 'disabled', 'ignored'],
             optional => 1,
             default => 'started',
             description => "Requested resource state.",
@@ -43,6 +43,14 @@ to relocate the resources on node failures. The main purpose of this
  state is error recovery, because it is the only way to move a resource out
  of the `error` state.
  
+`ignored`;;
+
+The resource gets removed from the manager status and so the CRM and the LRM do
+not touch the resource anymore. All {pve} API calls affecting this resource
+will be executed, directly bypassing the HA stack. CRM commands will be thrown
+away while the resource is in this state. The resource will not get relocated
+on node failures.
+
  EODESC
         },
         group => get_standard_option('pve-ha-group-id',
diff --git a/src/PVE/HA/Sim/Hardware.pm b/src/PVE/HA/Sim/Hardware.pm

index 605a7cdb743de41c8e590ba70bd04e0c26236505..6ba2210d6d8a77f5c5f08cb0173a5e6fbf40e71a 100644 (file)
--- a/src/PVE/HA/Sim/Hardware.pm
+++ b/src/PVE/HA/Sim/Hardware.pm
@@ -500,7 +500,7 @@ sub get_node_info {
  # reboot <node>
  # shutdown <node>
  # restart-lrm <node>
-# service <sid> <started|disabled|stopped>
+# service <sid> <started|disabled|stopped|ignored>
  # service <sid> <migrate|relocate> <target>
  # service <sid> lock/unlock [lockname]
  
@@ -597,7 +597,8 @@ sub sim_hardware_cmd {
             }
  
         } elsif ($cmd eq 'service') {
-           if ($action eq 'started' || $action eq 'disabled' || $action eq 'stopped') {
+           if ($action eq 'started' || $action eq 'disabled' ||
+               $action eq 'stopped' || $action eq 'ignored') {
  
                 $self->set_service_state($sid, $action);
  
diff --git a/src/PVE/HA/Sim/RTHardware.pm b/src/PVE/HA/Sim/RTHardware.pm

index d3f48a6ab1fcca43bd6f76fc9341e722d3cb1027..ccac6ce8d207cfb727100dc8ad6631846e46d442 100644 (file)
--- a/src/PVE/HA/Sim/RTHardware.pm
+++ b/src/PVE/HA/Sim/RTHardware.pm
@@ -580,7 +580,7 @@ sub new_service_gui_entry {
      $sgrid->attach($w, 1, $row, 1, 1);
  
      my $count = 0;
-    foreach my $state (qw(started stopped disabled)) {
+    foreach my $state (qw(started stopped disabled ignored)) {
         $w->append_text($state);
         $w->set_active($count) if $d->{state} eq $state;
         $count++;
diff --git a/src/test/test-service-ignore1/README b/src/test/test-service-ignore1/README

new file mode 100644 (file)

index 0000000..2a01fce
--- /dev/null
+++ b/src/test/test-service-ignore1/README
@@ -0,0 +1,2 @@
+Test a user triggered change of a service to the 'ignored' state. The service
+should get removed from the manager status and not be touched afterwards.
diff --git a/src/test/test-service-ignore1/cmdlist b/src/test/test-service-ignore1/cmdlist

new file mode 100644 (file)

index 0000000..597e469
--- /dev/null
+++ b/src/test/test-service-ignore1/cmdlist
@@ -0,0 +1,4 @@
+[
+    [ "power node1 on", "power node2 on", "power node3 on"],
+    [ "service vm:103 ignored" ]
+]
diff --git a/src/test/test-service-ignore1/hardware_status b/src/test/test-service-ignore1/hardware_status

new file mode 100644 (file)

index 0000000..451beb1
--- /dev/null
+++ b/src/test/test-service-ignore1/hardware_status
@@ -0,0 +1,5 @@
+{
+  "node1": { "power": "off", "network": "off" },
+  "node2": { "power": "off", "network": "off" },
+  "node3": { "power": "off", "network": "off" }
+}
diff --git a/src/test/test-service-ignore1/log.expect b/src/test/test-service-ignore1/log.expect

new file mode 100644 (file)

index 0000000..1b98303
--- /dev/null
+++ b/src/test/test-service-ignore1/log.expect
@@ -0,0 +1,25 @@
+info      0     hardware: starting simulation
+info     20      cmdlist: execute power node1 on
+info     20    node1/crm: status change startup => wait_for_quorum
+info     20    node1/lrm: status change startup => wait_for_agent_lock
+info     20      cmdlist: execute power node2 on
+info     20    node2/crm: status change startup => wait_for_quorum
+info     20    node2/lrm: status change startup => wait_for_agent_lock
+info     20      cmdlist: execute power node3 on
+info     20    node3/crm: status change startup => wait_for_quorum
+info     20    node3/lrm: status change startup => wait_for_agent_lock
+info     20    node1/crm: got lock 'ha_manager_lock'
+info     20    node1/crm: status change wait_for_quorum => master
+info     20    node1/crm: node 'node1': state changed from 'unknown' => 'online'
+info     20    node1/crm: node 'node2': state changed from 'unknown' => 'online'
+info     20    node1/crm: node 'node3': state changed from 'unknown' => 'online'
+info     20    node1/crm: adding new service 'vm:103' on node 'node3'
+info     22    node2/crm: status change wait_for_quorum => slave
+info     24    node3/crm: status change wait_for_quorum => slave
+info     25    node3/lrm: got lock 'ha_agent_node3_lock'
+info     25    node3/lrm: status change wait_for_agent_lock => active
+info     25    node3/lrm: starting service vm:103
+info     25    node3/lrm: service status vm:103 started
+info    120      cmdlist: execute service vm:103 ignored
+info    120    node1/crm: removing stale service 'vm:103' (ignored state requested)
+info    720     hardware: exit simulation - done
diff --git a/src/test/test-service-ignore1/manager_status b/src/test/test-service-ignore1/manager_status

new file mode 100644 (file)

index 0000000..0967ef4
--- /dev/null
+++ b/src/test/test-service-ignore1/manager_status
@@ -0,0 +1 @@
+{}
diff --git a/src/test/test-service-ignore1/service_config b/src/test/test-service-ignore1/service_config

new file mode 100644 (file)

index 0000000..c6860e7
--- /dev/null
+++ b/src/test/test-service-ignore1/service_config
@@ -0,0 +1,3 @@
+{
+    "vm:103": { "node": "node3", "state": "enabled" }
+}
diff --git a/src/test/test-service-ignore2/README b/src/test/test-service-ignore2/README

new file mode 100644 (file)

index 0000000..c605145
--- /dev/null
+++ b/src/test/test-service-ignore2/README
@@ -0,0 +1,4 @@
+Set the request state of a service to ignored. Then simulate a node failure
+through network outage. The HA stack should not touch the 'ignored' service.
+
+Then set the service to 'started' again; now it should be fenced and recovered.
diff --git a/src/test/test-service-ignore2/cmdlist b/src/test/test-service-ignore2/cmdlist

new file mode 100644 (file)

index 0000000..4cf29f4
--- /dev/null
+++ b/src/test/test-service-ignore2/cmdlist
@@ -0,0 +1,6 @@
+[
+    [ "power node1 on", "power node2 on", "power node3 on"],
+    [ "service vm:103 ignored" ],
+    [ "network node3 off" ],
+    [ "service vm:103 started" ]
+]
diff --git a/src/test/test-service-ignore2/hardware_status b/src/test/test-service-ignore2/hardware_status

new file mode 100644 (file)

index 0000000..451beb1
--- /dev/null
+++ b/src/test/test-service-ignore2/hardware_status
@@ -0,0 +1,5 @@
+{
+  "node1": { "power": "off", "network": "off" },
+  "node2": { "power": "off", "network": "off" },
+  "node3": { "power": "off", "network": "off" }
+}
diff --git a/src/test/test-service-ignore2/log.expect b/src/test/test-service-ignore2/log.expect

new file mode 100644 (file)

index 0000000..27e52f8
--- /dev/null
+++ b/src/test/test-service-ignore2/log.expect
@@ -0,0 +1,46 @@
+info      0     hardware: starting simulation
+info     20      cmdlist: execute power node1 on
+info     20    node1/crm: status change startup => wait_for_quorum
+info     20    node1/lrm: status change startup => wait_for_agent_lock
+info     20      cmdlist: execute power node2 on
+info     20    node2/crm: status change startup => wait_for_quorum
+info     20    node2/lrm: status change startup => wait_for_agent_lock
+info     20      cmdlist: execute power node3 on
+info     20    node3/crm: status change startup => wait_for_quorum
+info     20    node3/lrm: status change startup => wait_for_agent_lock
+info     20    node1/crm: got lock 'ha_manager_lock'
+info     20    node1/crm: status change wait_for_quorum => master
+info     20    node1/crm: node 'node1': state changed from 'unknown' => 'online'
+info     20    node1/crm: node 'node2': state changed from 'unknown' => 'online'
+info     20    node1/crm: node 'node3': state changed from 'unknown' => 'online'
+info     20    node1/crm: adding new service 'vm:103' on node 'node3'
+info     22    node2/crm: status change wait_for_quorum => slave
+info     24    node3/crm: status change wait_for_quorum => slave
+info     25    node3/lrm: got lock 'ha_agent_node3_lock'
+info     25    node3/lrm: status change wait_for_agent_lock => active
+info     25    node3/lrm: starting service vm:103
+info     25    node3/lrm: service status vm:103 started
+info    120      cmdlist: execute service vm:103 ignored
+info    120    node1/crm: removing stale service 'vm:103' (ignored state requested)
+info    220      cmdlist: execute network node3 off
+info    220    node1/crm: node 'node3': state changed from 'online' => 'unknown'
+info    224    node3/crm: status change slave => wait_for_quorum
+info    225    node3/lrm: status change active => lost_agent_lock
+info    266     watchdog: execute power node3 off
+info    265    node3/crm: killed by poweroff
+info    266    node3/lrm: killed by poweroff
+info    266     hardware: server 'node3' stopped by poweroff (watchdog)
+info    320      cmdlist: execute service vm:103 started
+info    320    node1/crm: adding new service 'vm:103' on node 'node3'
+info    320    node1/crm: service 'vm:103': state changed from 'started' to 'fence'
+info    320    node1/crm: node 'node3': state changed from 'unknown' => 'fence'
+info    340    node1/crm: got lock 'ha_agent_node3_lock'
+info    340    node1/crm: fencing: acknowledged - got agent lock for node 'node3'
+info    340    node1/crm: node 'node3': state changed from 'fence' => 'unknown'
+info    340    node1/crm: recover service 'vm:103' from fenced node 'node3' to node 'node1'
+info    340    node1/crm: service 'vm:103': state changed from 'fence' to 'started'  (node = node1)
+info    341    node1/lrm: got lock 'ha_agent_node1_lock'
+info    341    node1/lrm: status change wait_for_agent_lock => active
+info    341    node1/lrm: starting service vm:103
+info    341    node1/lrm: service status vm:103 started
+info    920     hardware: exit simulation - done
diff --git a/src/test/test-service-ignore2/manager_status b/src/test/test-service-ignore2/manager_status

new file mode 100644 (file)

index 0000000..0967ef4
--- /dev/null
+++ b/src/test/test-service-ignore2/manager_status
@@ -0,0 +1 @@
+{}
diff --git a/src/test/test-service-ignore2/service_config b/src/test/test-service-ignore2/service_config

new file mode 100644 (file)

index 0000000..c6860e7
--- /dev/null
+++ b/src/test/test-service-ignore2/service_config
@@ -0,0 +1,3 @@
+{
+    "vm:103": { "node": "node3", "state": "enabled" }
+}
author	Thomas Lamprecht <t.lamprecht@proxmox.com>
	Tue, 24 Jan 2017 17:37:22 +0000 (18:37 +0100)
committer	Fabian Grünbichler <f.gruenbichler@proxmox.com>
	Fri, 13 Oct 2017 08:50:16 +0000 (10:50 +0200)
src/PVE/HA/Config.pm		patch \| blob \| blame \| history
src/PVE/HA/Manager.pm		patch \| blob \| blame \| history
src/PVE/HA/Resources.pm		patch \| blob \| blame \| history
src/PVE/HA/Sim/Hardware.pm		patch \| blob \| blame \| history
src/PVE/HA/Sim/RTHardware.pm		patch \| blob \| blame \| history
src/test/test-service-ignore1/README	[new file with mode: 0644]	patch \| blob
src/test/test-service-ignore1/cmdlist	[new file with mode: 0644]	patch \| blob
src/test/test-service-ignore1/hardware_status	[new file with mode: 0644]	patch \| blob
src/test/test-service-ignore1/log.expect	[new file with mode: 0644]	patch \| blob
src/test/test-service-ignore1/manager_status	[new file with mode: 0644]	patch \| blob
src/test/test-service-ignore1/service_config	[new file with mode: 0644]	patch \| blob
src/test/test-service-ignore2/README	[new file with mode: 0644]	patch \| blob
src/test/test-service-ignore2/cmdlist	[new file with mode: 0644]	patch \| blob
src/test/test-service-ignore2/hardware_status	[new file with mode: 0644]	patch \| blob
src/test/test-service-ignore2/log.expect	[new file with mode: 0644]	patch \| blob
src/test/test-service-ignore2/manager_status	[new file with mode: 0644]	patch \| blob
src/test/test-service-ignore2/service_config	[new file with mode: 0644]	patch \| blob