=== Watchdog ===
We need a reliable watchdog mechanism, which is able to provide hard
-timeouts. It must be guaranteed that the node reboot withing specified
+timeouts. It must be guaranteed that the node reboots within the specified
timeout if we do not update the watchdog. To me it looks like neither
systemd nor the standard watchdog(8) daemon provides such guarantees.
== Self fencing ==
-A node needs to aquire a special 'ha_agent_${node}_lock' (one separate
+A node needs to acquire a special 'ha_agent_${node}_lock' (one separate
lock for each node) before starting HA resources, and the node updates
the watchdog device once it gets that lock. If the node loses quorum,
or is unable to get the 'ha_agent_${node}_lock', the watchdog is no
long as there are running services on that node.
The HA manager can assume that the watchdog triggered a reboot when it
-is able to aquire the 'ha_agent_${node}_lock' for that node.
+is able to acquire the 'ha_agent_${node}_lock' for that node.
=== Problems with "two_node" Clusters ===
# Hand the manager status object over to the HA config layer.
#
# Parameters:
#   $self       - the invocant; not used by this method.
#   $status_obj - status data, passed through unchanged.
#
# Returns whatever PVE::HA::Config::write_manager_status() returns, since
# that call is the last statement of the sub.
sub write_manager_status {
my ($self, $status_obj) = @_;
-
+
PVE::HA::Config::write_manager_status($status_obj);
}
my ($self, $status_obj) = @_;
my $node = $self->{nodename};
-
+
PVE::HA::Config::write_lrm_status($node, $status_obj);
}
# Thin wrapper around PVE::HA::Config::resources_config_exists().
#
# Parameters:
#   $self - the invocant; not used by this method.
#
# Returns the result of resources_config_exists() unchanged.
# NOTE(review): presumably a true value when the HA resources config is
# present - confirm against PVE::HA::Config.
sub service_config_exists {
my ($self) = @_;
-
+
return PVE::HA::Config::resources_config_exists();
}
my ($self) = @_;
my $res = PVE::HA::Config::read_resources_config();
-
+
my $vmlist = PVE::Cluster::get_vmlist();
my $conf = {};
}
}
}
-
+
return $conf;
}
my ($self) = @_;
my ($node_info, $quorate) = ({}, 0);
-
+
my $nodename = $self->{nodename};
$quorate = PVE::Cluster::check_cfs_quorum(1) || 0;
foreach my $node (keys %$members) {
my $d = $members->{$node};
- $node_info->{$node}->{online} = $d->{online};
+ $node_info->{$node}->{online} = $d->{online};
}
-
+
$node_info->{$nodename}->{online} = 1; # local node is always up
-
+
return ($node_info, $quorate);
}
my $retry = 0;
my $retry_timeout = 100; # fixme: what timeout
-
+
eval {
mkdir $lockdir;
# $self->log('err', $err) if $err; # for debugging
return 0;
}
-
+
$last_lock_status->{$lockid} = $got_lock ? $ctime : 0;
if (!!$got_lock != !!$last) {
if ($got_lock) {
- $self->log('info', "successfully aquired lock '$lockid'");
+ $self->log('info', "successfully acquired lock '$lockid'");
} else {
my $msg = "lost lock '$lockid";
- $msg .= " - $err" if $err;
+ $msg .= " - $err" if $err;
$self->log('err', $msg);
}
} else {
sub get_ha_agent_lock {
my ($self, $node) = @_;
-
+
$node = $self->nodename() if !defined($node);
return $self->get_pve_lock("ha_agent_${node}_lock");
my ($self) = @_;
my $quorate = 0;
- eval {
- $quorate = PVE::Cluster::check_cfs_quorum();
+ eval {
+ $quorate = PVE::Cluster::check_cfs_quorum();
};
-
+
return $quorate;
}
my ($self) = @_;
PVE::Cluster::cfs_update();
-
+
$self->{loop_start} = $self->get_time();
}
my ($self) = @_;
my $delay = $self->get_time() - $self->{loop_start};
-
+
warn "loop take too long ($delay seconds)\n" if $delay > 30;
}
Type => SOCK_STREAM(),
Peer => "/run/watchdog-mux.sock") ||
die "unable to open watchdog socket - $!\n";
-
+
$self->log('info', "watchdog active");
}
my ($self, $sid, $service_config, $cmd, @params) = @_;
# setup execution environment
-
+
$ENV{'PATH'} = '/sbin:/bin:/usr/sbin:/usr/bin';
PVE::INotify::inotify_close();
-
+
PVE::INotify::inotify_init();
PVE::Cluster::cfs_update();
-
+
my $nodename = $self->{nodename};
# fixme: return valid_exit code (instead of using die) ?
} elsif ($cmd eq 'error') {
-
- if($running) {
+ if ($running) {
$self->log("err", "service $sid is in an error state while running");
} else {
$self->log("warning", "service $sid is not running and in an error state");