]> git.proxmox.com Git - pve-ha-manager.git/commitdiff
move cfs update to common code
author: Thomas Lamprecht <t.lamprecht@proxmox.com>
Wed, 22 Nov 2017 10:53:11 +0000 (11:53 +0100)
committer: Thomas Lamprecht <t.lamprecht@proxmox.com>
Tue, 30 Jan 2018 08:33:16 +0000 (09:33 +0100)
We updated the CRM and LRM view of the cluster state only in the PVE2
environment, outside of all regression testing and simulation scope.

Further, we ignored whether this update failed and happily worked with
an empty state, resulting in strange actions, e.g., the removal of all
(not so) "stale" services or changing the state of all nodes but the
master's to unknown.

This patch tries to improve this by moving the update out into its own
environment method, cluster_state_update, calling it in the LRM and
CRM and saving its result.
As with the functionality we introduced to simulate cfs rw or update
errors, we can also simulate failures of this state update with the RT
system.

Signed-off-by: Thomas Lamprecht <t.lamprecht@proxmox.com>
Reviewed-by: Dominik Csapak <d.csapak@proxmox.com>
Tested-by: Dominik Csapak <d.csapak@proxmox.com>
src/PVE/HA/CRM.pm
src/PVE/HA/Env.pm
src/PVE/HA/Env/PVE2.pm
src/PVE/HA/LRM.pm
src/PVE/HA/Sim/Env.pm
src/test/test-cfs-unavailable2/README [new file with mode: 0644]
src/test/test-cfs-unavailable2/cmdlist [new file with mode: 0644]
src/test/test-cfs-unavailable2/hardware_status [new file with mode: 0644]
src/test/test-cfs-unavailable2/manager_status [new file with mode: 0644]
src/test/test-cfs-unavailable2/service_config [new file with mode: 0644]

index 21a0acccb850078b82da8637f416595381db2e25..d149c58af69d84c8ab6c2909ac0bda03daec32fd 100644 (file)
@@ -28,6 +28,7 @@ sub new {
        haenv => $haenv,
        manager => undef,
        status => { state => 'startup' },
+       cluster_state_update => 0,
     }, $class;
 
     $self->set_local_status({ state => 'wait_for_quorum' });
@@ -146,6 +147,8 @@ sub do_one_iteration {
 
     $haenv->loop_start_hook();
 
+    $self->{cluster_state_update} = $haenv->cluster_state_update();
+
     my $res = $self->work();
 
     $haenv->loop_end_hook();
@@ -243,6 +246,7 @@ sub work {
                $shutdown = 1;
 
            } else {
+
                $manager->manage();
            }
        };
index 55f66843bf9ee54b69e78fc63f4d424403776cd9..50441ea8ced445701100061077980c9f8da75347 100644 (file)
@@ -209,6 +209,12 @@ sub loop_end_hook {
     return $self->{plug}->loop_end_hook(@args);
 }
 
+sub cluster_state_update { # forward the cluster state refresh to the environment plugin
+    my ($self) = @_;
+
+    return $self->{plug}->cluster_state_update(); # hand back the plugin's result unchanged
+}
+
 sub watchdog_open {
     my ($self) = @_;
 
index 8baf2d01ef6f899996ea17bca8197d7544919193..9d198b9e335c787fe85d723224a8abc5e9c4e818 100644 (file)
@@ -348,9 +348,8 @@ sub sleep_until {
 sub loop_start_hook {
     my ($self) = @_;
 
-    PVE::Cluster::cfs_update();
-
     $self->{loop_start} = $self->get_time();
+
 }
 
 sub loop_end_hook {
@@ -361,6 +360,18 @@ sub loop_end_hook {
     warn "loop take too long ($delay seconds)\n" if $delay > 30;
 }
 
+sub cluster_state_update { # refresh the cluster file system view; returns 1 on success, 0 on failure
+    my ($self) = @_;
+
+    eval { PVE::Cluster::cfs_update(1) }; # NOTE(review): the 1 presumably makes cfs_update die on error - confirm in PVE::Cluster
+    if (my $err = $@) { # the eval trapped an exception from the update
+       $self->log('warn', "cluster file system update failed - $err");
+       return 0; # report the failure to the caller instead of re-throwing
+    }
+
+    return 1;
+}
+
 my $watchdog_fh;
 
 sub watchdog_open {
index 0fc8acba23c6b13f06a75edc6b6e4b6f110f5790..afca084b7b38e1edf8915036e5b32d51ea84def7 100644 (file)
@@ -34,6 +34,7 @@ sub new {
        shutdown_errors => 0,
        # mode can be: active, reboot, shutdown, restart
        mode => 'active',
+       cluster_state_update => 0,
     }, $class;
 
     $self->set_local_status({ state =>         'wait_for_agent_lock' });   
@@ -219,6 +220,8 @@ sub do_one_iteration {
 
     $haenv->loop_start_hook();
 
+    $self->{cluster_state_update} = $haenv->cluster_state_update();
+
     my $res = $self->work();
 
     $haenv->loop_end_hook();
index 34848b1ebd3445482016cb7aac4240789dc9ca3a..7344b041b1ea5f8629cdc8ced1088b160114f635 100644 (file)
@@ -366,6 +366,13 @@ sub loop_end_hook {
     # do nothing, overwrite in subclass
 }
 
+
+sub cluster_state_update { # simulation counterpart: the result comes from the simulated hardware
+    my ($self) = @_;
+
+    return $self->{hardware}->get_cfs_state($self->{nodename}, 'update'); # this node's 'update' cfs state; presumably set by "cfs <node> update fail|work" test commands - verify
+}
+
 sub watchdog_open {
     my ($self) = @_;
 
diff --git a/src/test/test-cfs-unavailable2/README b/src/test/test-cfs-unavailable2/README
new file mode 100644 (file)
index 0000000..6fe7fc6
--- /dev/null
@@ -0,0 +1 @@
+Test cfs update behavior, e.g., when cfs_update fails (temporarily)
diff --git a/src/test/test-cfs-unavailable2/cmdlist b/src/test/test-cfs-unavailable2/cmdlist
new file mode 100644 (file)
index 0000000..590215d
--- /dev/null
@@ -0,0 +1,5 @@
+[
+    [ "power node1 on", "power node2 on", "power node3 on"],
+    [ "cfs node1 update fail", "service vm:101 stopped" ],
+    [ "cfs node1 update work" ]
+]
diff --git a/src/test/test-cfs-unavailable2/hardware_status b/src/test/test-cfs-unavailable2/hardware_status
new file mode 100644 (file)
index 0000000..451beb1
--- /dev/null
@@ -0,0 +1,5 @@
+{
+  "node1": { "power": "off", "network": "off" },
+  "node2": { "power": "off", "network": "off" },
+  "node3": { "power": "off", "network": "off" }
+}
diff --git a/src/test/test-cfs-unavailable2/manager_status b/src/test/test-cfs-unavailable2/manager_status
new file mode 100644 (file)
index 0000000..0967ef4
--- /dev/null
@@ -0,0 +1 @@
+{}
diff --git a/src/test/test-cfs-unavailable2/service_config b/src/test/test-cfs-unavailable2/service_config
new file mode 100644 (file)
index 0000000..70f11d6
--- /dev/null
@@ -0,0 +1,5 @@
+{
+    "vm:101": { "node": "node1", "state": "enabled" },
+    "vm:102": { "node": "node2" },
+    "vm:103": { "node": "node3", "state": "enabled" }
+}