]>
git.proxmox.com Git - pve-ha-manager.git/blob - src/PVE/HA/NodeStatus.pm
1 package PVE
::HA
::NodeStatus
;
11 my ($this, $haenv, $status) = @_;
13 my $class = ref($this) || $this;
24 # possible node state:
25 my $valid_node_states = {
26 online
=> "node online and member of quorate partition",
27 maintenance
=> "node is a member of quorate partition but currently not able to do work",
28 unknown
=> "not member of quorate partition, but possibly still running",
29 fence
=> "node needs to be fenced",
30 gone
=> "node vanished from cluster members list, possibly deleted"
34 my ($self, $node) = @_;
36 $self->{status
}->{$node} = 'unknown'
37 if !$self->{status
}->{$node};
39 return $self->{status
}->{$node};
# Report whether $node can currently be treated as operational.
#
# A node counts as operational when node_is_online() is true for it, or
# when its recorded state (see get_node_state) is 'maintenance'.
#
# Returns a true value in either of those cases, a false value otherwise.
sub node_is_operational {
    my ($self, $node) = @_;

    # Prefer the online check; hand back its own true value unchanged.
    my $online = $self->node_is_online($node);
    return $online if $online;

    return $self->get_node_state($node) eq 'maintenance';
}
48 my ($self, $node) = @_;
50 return $self->get_node_state($node) eq 'online';
53 sub node_is_offline_delayed
{
54 my ($self, $node, $delay) = @_;
56 $delay = $fence_delay if !defined($delay);
58 my $haenv = $self->{haenv
};
60 return undef if $self->get_node_state($node) eq 'online';
62 my $last_online = $self->{last_online
}->{$node};
64 my $ctime = $haenv->get_time();
66 if (!defined($last_online)) {
67 $self->{last_online
}->{$node} = $ctime;
71 return ($ctime - $last_online) >= $delay;
77 return [sort keys %{$self->{status
}}];
80 sub list_online_nodes
{
85 foreach my $node (sort keys %{$self->{status
}}) {
86 next if $self->{status
}->{$node} ne 'online';
93 my $delete_node = sub {
94 my ($self, $node) = @_;
96 return undef if $self->get_node_state($node) ne 'gone';
98 my $haenv = $self->{haenv
};
100 delete $self->{last_online
}->{$node};
101 delete $self->{status
}->{$node};
103 $haenv->log('notice', "deleting gone node '$node', not a cluster member".
# Private helper: move $node into $state and log the transition.
#
# Parameters:
#   $self  - the node status object (provides {haenv} and {status})
#   $node  - name of the node whose state is changed
#   $state - new state; must be a key of $valid_node_states
#
# Dies with "unknown node state ..." if $state is not a valid state.
# Does nothing (no write, no log entry) if the node is already in $state.
my $set_node_state = sub {
    my ($self, $node, $state) = @_;

    my $haenv = $self->{haenv};

    if (!defined($valid_node_states->{$state})) {
        die "unknown node state '$state'\n";
    }

    # get_node_state() also initializes a missing entry to 'unknown'.
    my $previous = $self->get_node_state($node);
    return if $state eq $previous;

    $self->{status}->{$node} = $state;

    $haenv->log('info', "node '$node': state changed from '$previous' => '$state'\n");
};
126 my ($self, $node_info, $lrm_modes) = @_;
128 my $haenv = $self->{haenv
};
130 foreach my $node (sort keys %$node_info) {
131 my $d = $node_info->{$node};
132 my $lrm_mode = $lrm_modes->{$node} // 'unkown';
133 next if !$d->{online
};
135 # record last time the node was online (required to implement fence delay)
136 $self->{last_online
}->{$node} = $haenv->get_time();
138 my $state = $self->get_node_state($node);
140 if ($state eq 'online') {
141 if ($lrm_mode eq 'maintenance') {
142 $set_node_state->($self, $node, 'maintenance');
144 # $set_node_state->($self, $node, 'online');
145 } elsif ($state eq 'unknown' || $state eq 'gone') {
146 $set_node_state->($self, $node, 'online');
147 } elsif ($state eq 'fence') {
148 # do nothing, wait until fenced
149 } elsif ($state eq 'maintenance') {
150 if ($lrm_mode ne 'maintenance') {
151 $set_node_state->($self, $node, 'online');
154 die "detected unknown node state '$state";
158 foreach my $node (sort keys %{$self->{status
}}) {
159 my $d = $node_info->{$node};
160 next if $d && $d->{online
};
162 my $state = $self->get_node_state($node);
164 # node is not inside quorate partition, possibly not active
166 if ($state eq 'online') {
167 $set_node_state->($self, $node, 'unknown');
168 } elsif ($state eq 'maintenance') {
169 my $lrm_mode = $lrm_modes->{$node} // 'unkown';
170 if ($lrm_mode ne 'maintenance') {
171 $set_node_state->($self, $node, 'unknown');
173 } elsif ($state eq 'unknown') {
175 # node isn't in the member list anymore, deleted from the cluster?
176 $set_node_state->($self, $node, 'gone') if !defined($d) ;
178 } elsif ($state eq 'fence') {
179 # do nothing, wait until fenced
180 } elsif($state eq 'gone') {
181 if ($self->node_is_offline_delayed($node, 3600)) {
182 $delete_node->($self, $node);
185 die "detected unknown node state '$state";
191 # assembles a common text for fence emails
192 my $send_fence_state_email = sub {
193 my ($self, $subject_prefix, $subject, $node) = @_;
195 my $haenv = $self->{haenv
};
197 my $mail_text = <<EOF
198 The node '$node' failed and needs manual intervention.
200 The PVE HA manager tries to fence it and recover the
201 configured HA resources to a healthy node if possible.
203 Current fence status: $subject_prefix
207 Overall Cluster status:
208 -----------------------
212 my $mail_subject = $subject_prefix . ': ' . $subject;
214 my $status = $haenv->read_manager_status();
215 my $data = { manager_status
=> $status, node_status
=> $self->{status
} };
217 $mail_text .= to_json
($data, { pretty
=> 1, canonical
=> 1});
219 $haenv->sendmail($mail_subject, $mail_text);
225 my ($self, $node) = @_;
227 my $haenv = $self->{haenv
};
229 my $state = $self->get_node_state($node);
231 if ($state ne 'fence') {
232 $set_node_state->($self, $node, 'fence');
233 my $msg = "Try to fence node '$node'";
234 $send_fence_state_email->($self, 'FENCE', $msg, $node);
237 my $success = $haenv->get_ha_agent_lock($node);
240 my $msg = "fencing: acknowledged - got agent lock for node '$node'";
241 $haenv->log("info", $msg);
242 $set_node_state->($self, $node, 'unknown');
243 $send_fence_state_email->($self, 'SUCCEED', $msg, $node);