]> git.proxmox.com Git - pve-ha-manager.git/blame - src/PVE/HA/NodeStatus.pm
env: switch to matcher-based notification system
[pve-ha-manager.git] / src / PVE / HA / NodeStatus.pm
CommitLineData
cbca2c55
DM
1package PVE::HA::NodeStatus;
2
3use strict;
4use warnings;
5
854cecf3 6use JSON;
cbca2c55 7
b0e9158d
TL
# Default delay used by node_is_offline_delayed() when the caller passes none.
# It is compared against differences of $haenv->get_time() values
# (presumably seconds - confirm against the HA environment).
my $fence_delay = 60;
9
# Constructor.
#
# $haenv  - HA environment object, stored as-is under 'haenv'
# $status - hash reference (node name => state), stored as-is (not copied)
#
# Returns the new object; 'last_online' starts out as an empty hash.
sub new {
    my ($this, $haenv, $status) = @_;

    # works as class method and as instance method
    my $class = ref($this) || $this;

    my %fields = (
        haenv => $haenv,
        status => $status,
        last_online => {},
    );

    return bless \%fields, $class;
}
23
# Every state a node may be tracked in, mapped to a short description.
# $set_node_state dies for any state that has no entry here.
my $valid_node_states = {
    online =>
        "node online and member of quorate partition",
    maintenance =>
        "node is a member of quorate partition but currently not able to do work",
    unknown =>
        "not member of quorate partition, but possibly still running",
    fence =>
        "node needs to be fenced",
    gone =>
        "node vanished from cluster members list, possibly deleted",
};
cbca2c55
DM
32
# Returns the tracked state of $node.
#
# A node without a (true) entry in the status hash is recorded as 'unknown'
# as a side effect, and 'unknown' is returned.
sub get_node_state {
    my ($self, $node) = @_;

    return $self->{status}->{$node} ||= 'unknown';
}
41
99278e06
TL
# True if $node is either 'online' or in 'maintenance' state.
sub node_is_operational {
    my ($self, $node) = @_;

    my $online = $self->node_is_online($node);
    return $online if $online;

    return $self->get_node_state($node) eq 'maintenance';
}
46
f7ccd1b3
DM
# True if the tracked state of $node is exactly 'online'.
sub node_is_online {
    my ($self, $node) = @_;

    my $state = $self->get_node_state($node);

    return $state eq 'online';
}
52
5385a606
DM
# Checks whether $node has been non-online for at least $delay.
#
# $delay falls back to $fence_delay when not defined.
#
# Returns undef while the node is 'online'. If no last-online time is
# recorded for the node yet, the current time is recorded and undef is
# returned. Otherwise returns whether (current time - last online) >= $delay.
sub node_is_offline_delayed {
    my ($self, $node, $delay) = @_;

    $delay //= $fence_delay;

    return undef if $self->get_node_state($node) eq 'online';

    my $seen = $self->{last_online}->{$node};
    my $now = $self->{haenv}->get_time();

    if (defined($seen)) {
        return ($now - $seen) >= $delay;
    }

    # first time we notice this node as not online, start counting from now
    $self->{last_online}->{$node} = $now;

    return undef;
}
73
9c7d068b
DM
# Returns an array reference with the names of all tracked nodes, sorted.
sub list_nodes {
    my ($self) = @_;

    my @names = sort keys %{ $self->{status} };

    return \@names;
}
79
f7ccd1b3
DM
# Returns an array reference with the sorted names of all nodes whose tracked
# state is 'online'.
sub list_online_nodes {
    my ($self) = @_;

    my @online =
        grep { $self->{status}->{$_} eq 'online' }
        sort keys %{ $self->{status} };

    return \@online;
}
92
7dd15f22
TL
# Forgets $node completely (status and last-online time), but only if its
# state is 'gone'; returns undef without doing anything otherwise.
my $delete_node = sub {
    my ($self, $node) = @_;

    my $state = $self->get_node_state($node);
    return undef if $state ne 'gone';

    # drop all bookkeeping kept for this node
    for my $table (qw(last_online status)) {
        delete $self->{$table}->{$node};
    }

    my $haenv = $self->{haenv};
    $haenv->log(
        'notice',
        "deleting gone node '$node', not a cluster member anymore.",
    );
};
106
cbca2c55
DM
# Moves $node into $state and logs the transition.
#
# Dies if $state has no entry in $valid_node_states. Does nothing (and logs
# nothing) if the node already is in $state.
my $set_node_state = sub {
    my ($self, $node, $state) = @_;

    my $known = defined($valid_node_states->{$state});
    die "unknown node state '$state'\n" if !$known;

    my $previous = $self->get_node_state($node);
    return if $previous eq $state;

    $self->{status}->{$node} = $state;

    my $message = "node '$node': state changed from '$previous' => '$state'\n";
    $self->{haenv}->log('info', $message);
};
124
125sub update {
99278e06 126 my ($self, $node_info, $lrm_modes) = @_;
cbca2c55 127
d8b6f99b
DM
128 my $haenv = $self->{haenv};
129
130 foreach my $node (sort keys %$node_info) {
cbca2c55 131 my $d = $node_info->{$node};
99278e06 132 my $lrm_mode = $lrm_modes->{$node} // 'unkown';
cbca2c55
DM
133 next if !$d->{online};
134
5385a606 135 # record last time the node was online (required to implement fence delay)
d8b6f99b 136 $self->{last_online}->{$node} = $haenv->get_time();
5385a606 137
cbca2c55
DM
138 my $state = $self->get_node_state($node);
139
f7ccd1b3 140 if ($state eq 'online') {
99278e06 141 if ($lrm_mode eq 'maintenance') {
99278e06
TL
142 $set_node_state->($self, $node, 'maintenance');
143 }
da5ba16a 144 # $set_node_state->($self, $node, 'online');
7dd15f22 145 } elsif ($state eq 'unknown' || $state eq 'gone') {
da5ba16a 146 $set_node_state->($self, $node, 'online');
f7ccd1b3 147 } elsif ($state eq 'fence') {
c0bbd038 148 # do nothing, wait until fenced
99278e06
TL
149 } elsif ($state eq 'maintenance') {
150 if ($lrm_mode ne 'maintenance') {
151 $set_node_state->($self, $node, 'online');
152 }
c0bbd038
DM
153 } else {
154 die "detected unknown node state '$state";
cbca2c55
DM
155 }
156 }
157
9b2dbc2a 158 foreach my $node (sort keys %{$self->{status}}) {
cbca2c55
DM
159 my $d = $node_info->{$node};
160 next if $d && $d->{online};
161
162 my $state = $self->get_node_state($node);
163
c0bbd038
DM
164 # node is not inside quorate partition, possibly not active
165
1388fcc1 166 if ($state eq 'online') {
da5ba16a 167 $set_node_state->($self, $node, 'unknown');
1388fcc1
TL
168 } elsif ($state eq 'maintenance') {
169 my $lrm_mode = $lrm_modes->{$node} // 'unkown';
170 if ($lrm_mode ne 'maintenance') {
171 $set_node_state->($self, $node, 'unknown');
172 }
c0bbd038 173 } elsif ($state eq 'unknown') {
7dd15f22
TL
174
175 # node isn't in the member list anymore, deleted from the cluster?
da5ba16a 176 $set_node_state->($self, $node, 'gone') if !defined($d) ;
7dd15f22 177
f7ccd1b3 178 } elsif ($state eq 'fence') {
c0bbd038 179 # do nothing, wait until fenced
7dd15f22 180 } elsif($state eq 'gone') {
5d880e15 181 if ($self->node_is_offline_delayed($node, 3600)) {
da5ba16a 182 $delete_node->($self, $node);
7dd15f22 183 }
c0bbd038
DM
184 } else {
185 die "detected unknown node state '$state";
186 }
187
cbca2c55
DM
188 }
189}
190
4cb3b2cf
LW
# Body template handed to $haenv->send_notification(). The {{...}}
# placeholders refer to the keys of the template data built in
# $send_fence_state_email ('node', 'subject-prefix', 'subject', 'status-data').
my $body_template = <<'EOT';
{{#verbatim}}
The node '{{node}}' failed and needs manual intervention.

The PVE HA manager tries to fence it and recover the configured HA resources to
a healthy node if possible.

Current fence status: {{subject-prefix}}
{{subject}}
{{/verbatim}}

{{heading-2 "Overall Cluster status:"}}
{{object status-data}}
EOT

# Subject template handed to $haenv->send_notification().
my $subject_template = '{{subject-prefix}}: {{subject}}';
854cecf3 207
4cb3b2cf
LW
# Assembles the common data for fence notifications and sends it through
# $haenv->send_notification().
#
# $subject_prefix, $subject - fill the 'subject-prefix' / 'subject' placeholders
# $node                     - node the notification is about
my $send_fence_state_email = sub {
    my ($self, $subject_prefix, $subject, $node) = @_;

    my $haenv = $self->{haenv};

    # fetched into a scalar first so the call is made in scalar context
    my $manager_status = $haenv->read_manager_status();

    my %status_data = (
        manager_status => $manager_status,
        node_status => $self->{status},
    );

    my %template_data = (
        "status-data" => \%status_data,
        "node" => $node,
        "subject-prefix" => $subject_prefix,
        "subject" => $subject,
    );

    my %metadata_fields = (
        type => 'fencing',
        hostname => $node,
    );

    $haenv->send_notification(
        $subject_template,
        $body_template,
        \%template_data,
        \%metadata_fields,
    );
};
238
239
# Starts (or continues) fencing of $node.
#
# On the first call the node is moved into state 'fence' and a 'FENCE'
# notification is sent. Every call then tries to get the HA agent lock of the
# node; once that succeeds the node is moved to 'unknown' and a 'SUCCEED'
# notification is sent.
#
# Returns the result of $haenv->get_ha_agent_lock($node).
sub fence_node {
    my ($self, $node) = @_;

    my $haenv = $self->{haenv};

    if ($self->get_node_state($node) ne 'fence') {
        $set_node_state->($self, $node, 'fence');
        my $start_msg = "Try to fence node '$node'";
        $send_fence_state_email->($self, 'FENCE', $start_msg, $node);
    }

    my $success = $haenv->get_ha_agent_lock($node);

    # lock not acquired (yet), nothing more to do in this round
    return $success if !$success;

    my $ack_msg = "fencing: acknowledged - got agent lock for node '$node'";
    $haenv->log("info", $ack_msg);
    $set_node_state->($self, $node, 'unknown');
    $send_fence_state_email->($self, 'SUCCEED', $ack_msg, $node);

    return $success;
}
265
cbca2c55 2661;