]> git.proxmox.com Git - pve-ha-manager.git/blob - src/PVE/HA/NodeStatus.pm
manager: send notifications via new notification module
[pve-ha-manager.git] / src / PVE / HA / NodeStatus.pm
1 package PVE::HA::NodeStatus;
2
3 use strict;
4 use warnings;
5
6 use JSON;
7
# Default delay used by node_is_offline_delayed() when the caller passes none.
# It is compared against differences of $haenv->get_time() values, so it is
# presumably expressed in seconds -- TODO confirm against the HA environment.
my $fence_delay = 60;
9
# Constructor.
#
# $haenv  - HA environment object, stored as-is under the 'haenv' key
# $status - hash reference (node name => state string); the reference itself
#           is stored, it is not copied
#
# May be invoked on a class name or on an existing instance; in the latter
# case the new object is blessed into the class of that instance. The
# per-node 'last_online' bookkeeping always starts out empty.
sub new {
    my ($this, $haenv, $status) = @_;

    my $class = ref($this) || $this;

    my %fields = (
        haenv => $haenv,
        status => $status,
        last_online => {},
    );

    return bless \%fields, $class;
}
23
# All states a node can be in, each mapped to a short human readable
# description. A state name is considered valid if it is a key of this hash.
my $valid_node_states = {
    'online'      => "node online and member of quorate partition",
    'maintenance' => "node is a member of quorate partition but currently not able to do work",
    'unknown'     => "not member of quorate partition, but possibly still running",
    'fence'       => "node needs to be fenced",
    'gone'        => "node vanished from cluster members list, possibly deleted",
};
32
# Return the recorded state of $node.
#
# A node that has no (true) entry in the status hash yet is recorded as
# 'unknown' first, so asking for a new node name also adds it to the hash.
sub get_node_state {
    my ($self, $node) = @_;

    return $self->{status}->{$node} ||= 'unknown';
}
41
# True if $node is either online or in the 'maintenance' state.
sub node_is_operational {
    my ($self, $node) = @_;

    my $online = $self->node_is_online($node);
    return $online if $online;

    return $self->get_node_state($node) eq 'maintenance';
}
46
# True if the recorded state of $node is exactly 'online'.
sub node_is_online {
    my ($self, $node) = @_;

    my $state = $self->get_node_state($node);

    return $state eq 'online';
}
52
# Check whether $node has been away for at least $delay.
#
# $delay defaults to the file-level $fence_delay. The age is the difference
# between the environment's current time and the recorded last-online time.
#
# Returns undef while the node is online. If no last-online time is recorded
# yet, the current time is recorded and undef is returned, so the delay only
# starts counting from that call. Otherwise returns true once the node has
# been away for $delay or longer.
sub node_is_offline_delayed {
    my ($self, $node, $delay) = @_;

    $delay //= $fence_delay;

    return undef if $self->get_node_state($node) eq 'online';

    my $now = $self->{haenv}->get_time();
    my $seen = $self->{last_online}->{$node};

    if (defined($seen)) {
        return ($now - $seen) >= $delay;
    }

    $self->{last_online}->{$node} = $now;

    return undef;
}
73
# Return an array reference with the names of all nodes that have a status
# entry, sorted as strings.
sub list_nodes {
    my ($self) = @_;

    my @names = sort keys %{$self->{status}};

    return \@names;
}
79
# Return an array reference with the names of all nodes whose recorded state
# is 'online', sorted as strings.
sub list_online_nodes {
    my ($self) = @_;

    my @online =
        grep { $self->{status}->{$_} eq 'online' }
        sort keys %{$self->{status}};

    return \@online;
}
92
# Forget a node completely: drop its status entry and its last-online time,
# then log a notice. Only acts on nodes in state 'gone'; for any other state
# it returns undef without touching anything.
my $delete_node = sub {
    my ($self, $node) = @_;

    my $state = $self->get_node_state($node);
    return undef if $state ne 'gone';

    for my $table (qw(last_online status)) {
        delete $self->{$table}->{$node};
    }

    $self->{haenv}->log(
        'notice',
        "deleting gone node '$node', not a cluster member anymore.",
    );
};
106
# Record a new state for $node and log the transition.
#
# Dies if $state is not a key of $valid_node_states. Does nothing (and logs
# nothing) if the node already is in the requested state.
my $set_node_state = sub {
    my ($self, $node, $state) = @_;

    if (!defined($valid_node_states->{$state})) {
        die "unknown node state '$state'\n";
    }

    my $previous = $self->get_node_state($node);
    return if $previous eq $state;

    $self->{status}->{$node} = $state;

    $self->{haenv}->log(
        'info',
        "node '$node': state changed from '$previous' => '$state'\n",
    );
};
124
# Bring the recorded node states in line with the current cluster view.
#
# $node_info - hash reference, node name => hash reference; only its 'online'
#              flag is read here
# $lrm_modes - hash reference, node name => LRM mode string; only the value
#              'maintenance' is treated specially, a missing entry counts as
#              'unknown'
#
# Dies if a node is found in a state this method does not handle.
sub update {
    my ($self, $node_info, $lrm_modes) = @_;

    my $haenv = $self->{haenv};

    # Pass 1: nodes that are reported as online right now.
    foreach my $node (sort keys %$node_info) {
        my $d = $node_info->{$node};
        next if !$d->{online};

        my $lrm_mode = $lrm_modes->{$node} // 'unknown';

        # record last time the node was online (required to implement fence delay)
        $self->{last_online}->{$node} = $haenv->get_time();

        my $state = $self->get_node_state($node);

        if ($state eq 'online') {
            # an online node only changes state if its LRM entered maintenance
            if ($lrm_mode eq 'maintenance') {
                $set_node_state->($self, $node, 'maintenance');
            }
        } elsif ($state eq 'unknown' || $state eq 'gone') {
            $set_node_state->($self, $node, 'online');
        } elsif ($state eq 'fence') {
            # do nothing, wait until fenced
        } elsif ($state eq 'maintenance') {
            if ($lrm_mode ne 'maintenance') {
                $set_node_state->($self, $node, 'online');
            }
        } else {
            die "detected unknown node state '$state'";
        }
    }

    # Pass 2: known nodes that are not reported as online. The key list is
    # materialized by sort before the loop runs, so deleting entries from the
    # status hash inside the loop is safe.
    foreach my $node (sort keys %{$self->{status}}) {
        my $d = $node_info->{$node};
        next if $d && $d->{online};

        my $state = $self->get_node_state($node);

        # node is not inside quorate partition, possibly not active

        if ($state eq 'online') {
            $set_node_state->($self, $node, 'unknown');
        } elsif ($state eq 'maintenance') {
            # keep 'maintenance' as long as the LRM still reports that mode
            my $lrm_mode = $lrm_modes->{$node} // 'unknown';
            if ($lrm_mode ne 'maintenance') {
                $set_node_state->($self, $node, 'unknown');
            }
        } elsif ($state eq 'unknown') {
            # node isn't in the member list anymore, deleted from the cluster?
            $set_node_state->($self, $node, 'gone') if !defined($d);
        } elsif ($state eq 'fence') {
            # do nothing, wait until fenced
        } elsif ($state eq 'gone') {
            # forget the node once it has been away for 3600 time units
            # (presumably seconds, i.e. one hour -- depends on get_time())
            if ($self->node_is_offline_delayed($node, 3600)) {
                $delete_node->($self, $node);
            }
        } else {
            die "detected unknown node state '$state'";
        }
    }
}
190
# Body template for fence notifications. The {{...}} placeholders are left
# for the notification system to fill in; the placeholder names match the
# keys of the properties hash that is passed alongside the template.
my $body_template = <<'EOT';
{{#verbatim}}
The node '{{node}}' failed and needs manual intervention.

The PVE HA manager tries to fence it and recover the configured HA resources to
a healthy node if possible.

Current fence status: {{subject-prefix}}
{{subject}}
{{/verbatim}}

{{heading-2 "Overall Cluster status:"}}
{{object status-data}}
EOT

# Subject template for fence notifications, e.g. "<prefix>: <subject>".
my $subject_template = '{{subject-prefix}}: {{subject}}';
207
# Assemble the properties common to all fence notifications and pass them,
# together with the subject and body templates, to the HA environment.
#
# $subject_prefix - short status tag, filled into {{subject-prefix}}
# $subject        - message text, filled into {{subject}}
# $node           - name of the affected node, filled into {{node}}
my $send_fence_state_email = sub {
    my ($self, $subject_prefix, $subject, $node) = @_;

    my $haenv = $self->{haenv};

    # assign to a scalar first so the call cannot expand to a list inside
    # the hash constructor below
    my $manager_status = $haenv->read_manager_status();

    my %properties = (
        "status-data" => {
            manager_status => $manager_status,
            node_status => $self->{status},
        },
        "node" => $node,
        "subject-prefix" => $subject_prefix,
        "subject" => $subject,
    );

    $haenv->send_notification($subject_template, $body_template, \%properties);
};
231
232
# Start (or continue) fencing $node.
#
# On the first call for a node its state is switched to 'fence' and a 'FENCE'
# notification is sent. Every call then tries to get the node's HA agent
# lock from the environment. Once that succeeds, the node is set back to
# 'unknown', the success is logged and a 'SUCCEED' notification is sent.
#
# Returns whatever get_ha_agent_lock() returned.
sub fence_node {
    my ($self, $node) = @_;

    my $haenv = $self->{haenv};

    unless ($self->get_node_state($node) eq 'fence') {
        $set_node_state->($self, $node, 'fence');
        $send_fence_state_email->($self, 'FENCE', "Try to fence node '$node'", $node);
    }

    my $success = $haenv->get_ha_agent_lock($node);
    return $success if !$success;

    my $msg = "fencing: acknowledged - got agent lock for node '$node'";
    $haenv->log("info", $msg);
    $set_node_state->($self, $node, 'unknown');
    $send_fence_state_email->($self, 'SUCCEED', $msg, $node);

    return $success;
}
258
259 1;