]>
Commit | Line | Data |
---|---|---|
1 | package PVE::HA::NodeStatus; | |
2 | ||
3 | use strict; | |
4 | use warnings; | |
5 | ||
6 | use JSON; | |
7 | ||
# Default delay used by node_is_offline_delayed() when the caller does not
# pass one. It is compared against a difference of $haenv->get_time() values
# (presumably seconds - confirm against the environment implementation).
my $fence_delay = 60;
9 | ||
# Constructor.
#
# Parameters:
#   $this   - class name, or an existing instance whose class is reused
#   $haenv  - HA environment object, stored under {haenv}
#   $status - hash ref mapping node name => state string; stored by
#             reference (not copied) under {status}
#
# Returns the new blessed object. {last_online} starts out as an empty hash.
sub new {
    my ($this, $haenv, $status) = @_;

    my %fields = (
        haenv       => $haenv,
        status      => $status,
        last_online => {},
    );

    return bless \%fields, ref($this) || $this;
}
23 | ||
# Every state a node may be in, mapped to a human readable description.
# $set_node_state refuses any state that is not a key of this table.
my $valid_node_states = {
    online      => 'node online and member of quorate partition',
    maintenance => 'node is a member of quorate partition but currently not able to do work',
    unknown     => 'not member of quorate partition, but possibly still running',
    fence       => 'node needs to be fenced',
    gone        => 'node vanished from cluster members list, possibly deleted',
};
32 | ||
# Return the recorded state of $node.
#
# Side effect: a node without a (true) recorded state is stored as 'unknown'
# in {status} before that value is returned, so merely asking about a node
# makes it appear in the status table.
sub get_node_state {
    my ($self, $node) = @_;

    return ($self->{status}->{$node} ||= 'unknown');
}
41 | ||
# True when $node is either 'online' or in 'maintenance' state.
sub node_is_operational {
    my ($self, $node) = @_;

    my $online = $self->node_is_online($node);
    return $online if $online;

    return $self->get_node_state($node) eq 'maintenance';
}
46 | ||
# True when the recorded state of $node is exactly 'online'.
sub node_is_online {
    my ($self, $node) = @_;

    my $state = $self->get_node_state($node);

    return $state eq 'online';
}
52 | ||
# Tell whether $node has been not-online for at least $delay.
#
# Parameters:
#   $node  - node name
#   $delay - optional; defaults to $fence_delay when undefined
#
# Returns undef while the node is 'online'. If no last-online timestamp is
# recorded for the node yet, the current time is recorded as a starting point
# and undef is returned. Otherwise returns whether the time elapsed since the
# recorded timestamp is >= $delay.
sub node_is_offline_delayed {
    my ($self, $node, $delay) = @_;

    $delay //= $fence_delay;

    return undef if $self->get_node_state($node) eq 'online';

    my $last_seen = $self->{last_online}->{$node};
    my $now       = $self->{haenv}->get_time();

    if (defined($last_seen)) {
        return ($now - $last_seen) >= $delay;
    }

    # first time we notice this node as not online: start the clock now
    $self->{last_online}->{$node} = $now;

    return undef;
}
73 | ||
# Return an array ref with the names of all tracked nodes, sorted as strings.
sub list_nodes {
    my ($self) = @_;

    my @names = sort keys %{ $self->{status} };

    return \@names;
}
79 | ||
# Return an array ref with the names of all nodes whose recorded state is
# 'online', sorted as strings.
sub list_online_nodes {
    my ($self) = @_;

    my @online =
        grep { $self->{status}->{$_} eq 'online' }
        sort keys %{ $self->{status} };

    return \@online;
}
92 | ||
# Private helper: forget a node completely.
#
# Only acts on nodes in state 'gone' (returns undef otherwise). Removes the
# node from both {last_online} and {status} and logs a notice.
my $delete_node = sub {
    my ($self, $node) = @_;

    return undef unless $self->get_node_state($node) eq 'gone';

    delete $self->{$_}->{$node} for qw(last_online status);

    $self->{haenv}->log(
        'notice',
        "deleting gone node '$node', not a cluster member anymore.",
    );
};
106 | ||
# Private helper: move $node into $state.
#
# Dies if $state is not a key of $valid_node_states. Does nothing when the
# node already is in that state; otherwise stores the new state and logs the
# transition at 'info' level.
my $set_node_state = sub {
    my ($self, $node, $state) = @_;

    die "unknown node state '$state'\n"
        unless defined($valid_node_states->{$state});

    my $previous = $self->get_node_state($node);
    return if $previous eq $state;

    $self->{status}->{$node} = $state;

    my $message = "node '$node': state changed from '$previous' => '$state'\n";
    $self->{haenv}->log('info', $message);
};
124 | ||
# Synchronize the tracked node states with the current cluster view.
#
# Parameters:
#   $node_info - hash ref: node name => hash ref; only its {online} flag is
#                read here
#   $lrm_modes - hash ref: node name => LRM mode string; only the value
#                'maintenance' is treated specially, a missing entry counts
#                as "not in maintenance"
#
# Pass 1 (nodes reported online): records the current time as last-online
# timestamp and applies
#   online      -> maintenance  if the LRM mode is 'maintenance'
#   unknown     -> online
#   gone        -> online
#   maintenance -> online       if the LRM mode is no longer 'maintenance'
#   fence       -> (unchanged, wait until fenced)
#
# Pass 2 (tracked nodes not reported online) applies
#   online      -> unknown
#   maintenance -> unknown      if the LRM mode is no longer 'maintenance'
#   unknown     -> gone         if the node is absent from $node_info
#   gone        -> (deleted)    once node_is_offline_delayed($node, 3600) is true
#   fence       -> (unchanged, wait until fenced)
#
# Dies if a node is found in a state not handled above.
sub update {
    my ($self, $node_info, $lrm_modes) = @_;

    my $haenv = $self->{haenv};

    foreach my $node (sort keys %$node_info) {
        my $d = $node_info->{$node};
        my $lrm_mode = $lrm_modes->{$node} // 'unknown';
        next if !$d->{online};

        # record last time the node was online (required to implement fence delay)
        $self->{last_online}->{$node} = $haenv->get_time();

        my $state = $self->get_node_state($node);

        if ($state eq 'online') {
            # already online; only a maintenance request changes anything
            if ($lrm_mode eq 'maintenance') {
                $set_node_state->($self, $node, 'maintenance');
            }
        } elsif ($state eq 'unknown' || $state eq 'gone') {
            $set_node_state->($self, $node, 'online');
        } elsif ($state eq 'fence') {
            # do nothing, wait until fenced
        } elsif ($state eq 'maintenance') {
            if ($lrm_mode ne 'maintenance') {
                $set_node_state->($self, $node, 'online');
            }
        } else {
            die "detected unknown node state '$state'";
        }
    }

    # The key list is computed once up front, so deleting entries from
    # {status} inside the loop (via $delete_node) is safe.
    foreach my $node (sort keys %{$self->{status}}) {
        my $d = $node_info->{$node};
        next if $d && $d->{online};

        my $state = $self->get_node_state($node);

        # node is not inside quorate partition, possibly not active

        if ($state eq 'online') {
            $set_node_state->($self, $node, 'unknown');
        } elsif ($state eq 'maintenance') {
            my $lrm_mode = $lrm_modes->{$node} // 'unknown';
            if ($lrm_mode ne 'maintenance') {
                $set_node_state->($self, $node, 'unknown');
            }
        } elsif ($state eq 'unknown') {
            # node isn't in the member list anymore, deleted from the cluster?
            $set_node_state->($self, $node, 'gone') if !defined($d);
        } elsif ($state eq 'fence') {
            # do nothing, wait until fenced
        } elsif ($state eq 'gone') {
            if ($self->node_is_offline_delayed($node, 3600)) {
                $delete_node->($self, $node);
            }
        } else {
            die "detected unknown node state '$state'";
        }
    }
}
190 | ||
# Notification templates handed to $haenv->send_notification(). The {{...}}
# placeholders are not Perl interpolation; they are filled in by whatever
# renders the notification from the properties passed alongside.

# Subject line: "<prefix>: <subject>".
my $subject_template = '{{subject-prefix}}: {{subject}}';

# Message body: a verbatim explanation followed by the cluster status dump.
my $body_template = <<'EOT';
{{#verbatim}}
The node '{{node}}' failed and needs manual intervention.

The PVE HA manager tries to fence it and recover the configured HA resources to
a healthy node if possible.

Current fence status: {{subject-prefix}}
{{subject}}
{{/verbatim}}

{{heading-2 "Overall Cluster status:"}}
{{object status-data}}
EOT
207 | ||
# Private helper: assemble the common text for fence notifications and send it.
#
# Parameters:
#   $subject_prefix - value for the 'subject-prefix' placeholder
#   $subject        - value for the 'subject' placeholder
#   $node           - name of the affected node
#
# The manager status read from the environment and the current node status
# table are attached as 'status-data'.
my $send_fence_state_email = sub {
    my ($self, $subject_prefix, $subject, $node) = @_;

    my $haenv = $self->{haenv};

    # fetched in scalar context on purpose; do not inline into the hash below
    my $manager_status = $haenv->read_manager_status();

    my %properties = (
        'status-data' => {
            manager_status => $manager_status,
            node_status    => $self->{status},
        },
        'node'           => $node,
        'subject-prefix' => $subject_prefix,
        'subject'        => $subject,
    );

    $haenv->send_notification($subject_template, $body_template, \%properties);
};
231 | ||
232 | ||
# Start (or continue) fencing $node.
#
# On the first call for a node the state is switched to 'fence' and a 'FENCE'
# notification is sent. Every call then tries to obtain the node's HA agent
# lock from the environment; once that succeeds the node is set back to
# 'unknown', the success is logged and a 'SUCCEED' notification is sent.
#
# Returns whatever get_ha_agent_lock() returned (true on success).
sub fence_node {
    my ($self, $node) = @_;

    my $haenv = $self->{haenv};

    unless ($self->get_node_state($node) eq 'fence') {
        $set_node_state->($self, $node, 'fence');
        my $announce = "Try to fence node '$node'";
        $send_fence_state_email->($self, 'FENCE', $announce, $node);
    }

    my $got_lock = $haenv->get_ha_agent_lock($node);

    # lock not (yet) acquired: stay in 'fence' state and report failure
    return $got_lock if !$got_lock;

    my $ack = "fencing: acknowledged - got agent lock for node '$node'";
    $haenv->log("info", $ack);
    $set_node_state->($self, $node, 'unknown');
    $send_fence_state_email->($self, 'SUCCEED', $ack, $node);

    return $got_lock;
}
258 | ||
259 | 1; |