We updated the CRM and LRM view of the cluster state only in the PVE2
environment, outside of all regression testing and simulation scope.
Further, we ignored whether this update failed and happily worked with an
empty state, resulting in strange actions, e.g., the removal of all
(not actually) "stale" services, or changing the state of every node
but the master's to unknown.
This patch tries to improve this by moving the update out into its own
environment method, cluster_state_update, calling it in both the LRM and
the CRM and saving its result.
As with our previously introduced functionality to simulate cfs
read/write or update errors, we can now also simulate failures of this
state update with the regression-test (RT) system.
Signed-off-by: Thomas Lamprecht <t.lamprecht@proxmox.com>
Reviewed-by: Dominik Csapak <d.csapak@proxmox.com>
Tested-by: Dominik Csapak <d.csapak@proxmox.com>
haenv => $haenv,
manager => undef,
status => { state => 'startup' },
haenv => $haenv,
manager => undef,
status => { state => 'startup' },
+ cluster_state_update => 0,
}, $class;
$self->set_local_status({ state => 'wait_for_quorum' });
}, $class;
$self->set_local_status({ state => 'wait_for_quorum' });
$haenv->loop_start_hook();
$haenv->loop_start_hook();
+ $self->{cluster_state_update} = $haenv->cluster_state_update();
+
my $res = $self->work();
$haenv->loop_end_hook();
my $res = $self->work();
$haenv->loop_end_hook();
return $self->{plug}->loop_end_hook(@args);
}
return $self->{plug}->loop_end_hook(@args);
}
+sub cluster_state_update {
+ my ($self) = @_;
+
+ return $self->{plug}->cluster_state_update();
+}
+
sub watchdog_open {
my ($self) = @_;
sub watchdog_open {
my ($self) = @_;
sub loop_start_hook {
my ($self) = @_;
sub loop_start_hook {
my ($self) = @_;
- PVE::Cluster::cfs_update();
-
$self->{loop_start} = $self->get_time();
$self->{loop_start} = $self->get_time();
warn "loop take too long ($delay seconds)\n" if $delay > 30;
}
warn "loop take too long ($delay seconds)\n" if $delay > 30;
}
+sub cluster_state_update {
+ my ($self) = @_;
+
+ eval { PVE::Cluster::cfs_update(1) };
+ if (my $err = $@) {
+ $self->log('warn', "cluster file system update failed - $err");
+ return 0;
+ }
+
+ return 1;
+}
+
my $watchdog_fh;
sub watchdog_open {
my $watchdog_fh;
sub watchdog_open {
shutdown_errors => 0,
# mode can be: active, reboot, shutdown, restart
mode => 'active',
shutdown_errors => 0,
# mode can be: active, reboot, shutdown, restart
mode => 'active',
+ cluster_state_update => 0,
}, $class;
$self->set_local_status({ state => 'wait_for_agent_lock' });
}, $class;
$self->set_local_status({ state => 'wait_for_agent_lock' });
$haenv->loop_start_hook();
$haenv->loop_start_hook();
+ $self->{cluster_state_update} = $haenv->cluster_state_update();
+
my $res = $self->work();
$haenv->loop_end_hook();
my $res = $self->work();
$haenv->loop_end_hook();
# do nothing, overwrite in subclass
}
# do nothing, overwrite in subclass
}
+
+sub cluster_state_update {
+ my ($self) = @_;
+
+ return $self->{hardware}->get_cfs_state($self->{nodename}, 'update');
+}
+
sub watchdog_open {
my ($self) = @_;
sub watchdog_open {
my ($self) = @_;
--- /dev/null
+Test the cfs update behavior, e.g., when cfs_update fails (temporarily)
--- /dev/null
+[
+ [ "power node1 on", "power node2 on", "power node3 on"],
+ [ "cfs node1 update fail", "service vm:101 stopped" ],
+ [ "cfs node1 update work" ]
+]
--- /dev/null
+{
+ "node1": { "power": "off", "network": "off" },
+ "node2": { "power": "off", "network": "off" },
+ "node3": { "power": "off", "network": "off" }
+}
--- /dev/null
+{
+ "vm:101": { "node": "node1", "state": "enabled" },
+ "vm:102": { "node": "node2" },
+ "vm:103": { "node": "node3", "state": "enabled" }
+}