]>
Commit | Line | Data |
---|---|---|
cbca2c55 DM |
1 | package PVE::HA::NodeStatus; |
2 | ||
3 | use strict; | |
4 | use warnings; | |
5 | ||
854cecf3 | 6 | use JSON; |
cbca2c55 | 7 | |
# Fallback delay used by node_is_offline_delayed() when the caller does not
# pass one. It is compared against a difference of two $haenv->get_time()
# values, so it is presumably in seconds - TODO confirm against the env.
my $fence_delay = 60;
9 | ||
# Constructor.
#
#   $haenv  - HA environment object; stored as-is and later used for
#             get_time(), log() and the notification/lock calls
#   $status - hash ref mapping node name => state string; it is stored by
#             reference, so the object reads and modifies the caller's hash
#
# Returns the new object with an empty 'last_online' bookkeeping hash.
sub new {
    my ($this, $haenv, $status) = @_;

    # works as a class method and as an instance method alike
    my $class = ref($this) || $this;

    my %fields = (
        haenv => $haenv,
        status => $status,
        last_online => {},
    );

    return bless \%fields, $class;
}
23 | ||
# Every node state known to this module, keyed by state name. The values are
# human readable descriptions; $set_node_state only checks that the requested
# state is a key of this table.
my $valid_node_states = {
    online      => 'node online and member of quorate partition',
    maintenance => 'node is a member of quorate partition but currently not able to do work',
    unknown     => 'not member of quorate partition, but possibly still running',
    fence       => 'node needs to be fenced',
    gone        => 'node vanished from cluster members list, possibly deleted',
};
cbca2c55 DM |
32 | |
# Return the recorded state string of $node.
#
# Side effect: a node without a (true) entry in the status hash is recorded
# as 'unknown' before its state is returned, so merely asking about a node
# makes it show up in list_nodes().
sub get_node_state {
    my ($self, $node) = @_;

    $self->{status}->{$node} ||= 'unknown';

    return $self->{status}->{$node};
}
41 | ||
# True when $node is either 'online' or in 'maintenance' state.
sub node_is_operational {
    my ($self, $node) = @_;

    my $online = $self->node_is_online($node);
    return $online if $online;

    return $self->get_node_state($node) eq 'maintenance';
}
46 | ||
# True when the recorded state of $node is exactly 'online'.
sub node_is_online {
    my ($self, $node) = @_;

    my $state = $self->get_node_state($node);

    return $state eq 'online';
}
52 | ||
# Tell whether $node has been away from the 'online' state for at least
# $delay (defaults to $fence_delay) units of $haenv->get_time().
#
# Returns undef while the node is online, and also on the first call for a
# node that has no 'last_online' timestamp yet - that call only starts the
# clock by storing the current time. Otherwise returns the boolean result of
# comparing the elapsed time against $delay.
sub node_is_offline_delayed {
    my ($self, $node, $delay) = @_;

    $delay //= $fence_delay;

    my $haenv = $self->{haenv};

    my $state = $self->get_node_state($node);
    return undef if $state eq 'online';

    my $reference_time = $self->{last_online}->{$node};
    my $now = $haenv->get_time();

    if (defined($reference_time)) {
        return ($now - $reference_time) >= $delay;
    }

    # nothing recorded yet: remember "now" and report "not yet delayed"
    $self->{last_online}->{$node} = $now;
    return undef;
}
73 | ||
# Return an array ref with the names of all nodes present in the status
# hash, in sort() default (string) order.
sub list_nodes {
    my ($self) = @_;

    my @names = sort keys %{$self->{status}};

    return \@names;
}
79 | ||
# Return an array ref with the names of all nodes whose recorded state is
# 'online', in sort() default (string) order.
sub list_online_nodes {
    my ($self) = @_;

    my @online =
        grep { $self->{status}->{$_} eq 'online' }
        sort keys %{$self->{status}};

    return \@online;
}
92 | ||
# Private helper: forget a node that is in state 'gone'.
#
# Returns undef without touching anything when the node is in any other
# state. Otherwise removes the node from both the 'last_online' and the
# 'status' hash and logs a notice through the HA environment.
my $delete_node = sub {
    my ($self, $node) = @_;

    my $state = $self->get_node_state($node);
    return undef if $state ne 'gone';

    my $haenv = $self->{haenv};

    delete $self->{$_}->{$node} for qw(last_online status);

    $haenv->log('notice', "deleting gone node '$node', not a cluster member anymore.");
};
106 | ||
# Private helper: move $node into $state and log the transition.
#
# Dies when $state is not a key of $valid_node_states. Does nothing (and
# logs nothing) when the node is already in the requested state.
my $set_node_state = sub {
    my ($self, $node, $state) = @_;

    my $haenv = $self->{haenv};

    if (!defined($valid_node_states->{$state})) {
        die "unknown node state '$state'\n";
    }

    my $previous = $self->get_node_state($node);
    return if $previous eq $state;

    $self->{status}->{$node} = $state;

    my $msg = "node '$node': state changed from '$previous' => '$state'\n";
    $haenv->log('info', $msg);
};
124 | ||
# Bring the recorded node states in line with the current cluster view.
#
#   $node_info - hash ref: node name => hash ref; only the 'online' flag of
#                each entry is read here
#   $lrm_modes - hash ref: node name => LRM mode string; only the value
#                'maintenance' is treated specially, a missing entry is
#                handled like any other non-maintenance mode
#
# Pass 1 handles nodes reported online, pass 2 every known node that is not
# reported online (including nodes missing from $node_info altogether).
# Dies when a node carries a state that none of the branches handles.
sub update {
    my ($self, $node_info, $lrm_modes) = @_;

    my $haenv = $self->{haenv};

    foreach my $node (sort keys %$node_info) {
        my $d = $node_info->{$node};
        next if !$d->{online};

        my $lrm_mode = $lrm_modes->{$node} // 'unknown';

        # record last time the node was online (required to implement fence delay)
        $self->{last_online}->{$node} = $haenv->get_time();

        my $state = $self->get_node_state($node);

        if ($state eq 'online') {
            # stays online unless its LRM switched to maintenance mode
            if ($lrm_mode eq 'maintenance') {
                $set_node_state->($self, $node, 'maintenance');
            }
        } elsif ($state eq 'unknown' || $state eq 'gone') {
            $set_node_state->($self, $node, 'online');
        } elsif ($state eq 'fence') {
            # do nothing, wait until fenced
        } elsif ($state eq 'maintenance') {
            if ($lrm_mode ne 'maintenance') {
                $set_node_state->($self, $node, 'online');
            }
        } else {
            die "detected unknown node state '$state'";
        }
    }

    # 'sort keys' yields a copy of the key list, so $delete_node may remove
    # entries from the status hash while this loop is running
    foreach my $node (sort keys %{$self->{status}}) {
        my $d = $node_info->{$node};
        next if $d && $d->{online};

        my $state = $self->get_node_state($node);

        # node is not inside quorate partition, possibly not active

        if ($state eq 'online') {
            $set_node_state->($self, $node, 'unknown');
        } elsif ($state eq 'maintenance') {
            my $lrm_mode = $lrm_modes->{$node} // 'unknown';
            if ($lrm_mode ne 'maintenance') {
                $set_node_state->($self, $node, 'unknown');
            }
        } elsif ($state eq 'unknown') {
            # node isn't in the member list anymore, deleted from the cluster?
            $set_node_state->($self, $node, 'gone') if !defined($d);
        } elsif ($state eq 'fence') {
            # do nothing, wait until fenced
        } elsif ($state eq 'gone') {
            # only forget the node after it stayed away for a long time
            if ($self->node_is_offline_delayed($node, 3600)) {
                $delete_node->($self, $node);
            }
        } else {
            die "detected unknown node state '$state'";
        }
    }
}
190 | ||
# Body of the fence notification. The {{...}} placeholder names match the
# property keys that $send_fence_state_email hands to send_notification();
# the rendering itself happens outside of this file.
my $body_template = <<'EOT';
{{#verbatim}}
The node '{{node}}' failed and needs manual intervention.

The PVE HA manager tries to fence it and recover the configured HA resources to
a healthy node if possible.

Current fence status: {{subject-prefix}}
{{subject}}
{{/verbatim}}

{{heading-2 "Overall Cluster status:"}}
{{object status-data}}
EOT

# Subject line of the fence notification, using the same placeholder names.
my $subject_template = '{{subject-prefix}}: {{subject}}';
854cecf3 | 207 | |
# Private helper: assemble the common properties for a fence notification
# and send it through the HA environment.
#
#   $subject_prefix - short status tag (fence_node() passes 'FENCE' or 'SUCCEED')
#   $subject        - one line message describing the event
#   $node           - name of the node the notification is about
#
# The notification carries the manager status read from the environment and
# this object's node status hash as 'status-data'.
my $send_fence_state_email = sub {
    my ($self, $subject_prefix, $subject, $node) = @_;

    my $haenv = $self->{haenv};
    my $manager_status = $haenv->read_manager_status();

    my %status_data = (
        manager_status => $manager_status,
        node_status => $self->{status},
    );

    my %properties = (
        'status-data' => \%status_data,
        'node' => $node,
        'subject-prefix' => $subject_prefix,
        'subject' => $subject,
    );

    $haenv->send_notification($subject_template, $body_template, \%properties);
};
231 | ||
232 | ||
# Start fencing $node, or continue an already running fence attempt.
#
# On the first call for a node it is moved to state 'fence' and a 'FENCE'
# notification is sent. Every call then tries to get the HA agent lock of
# the node from the environment; once that succeeds the node is moved back
# to 'unknown', the event is logged and a 'SUCCEED' notification is sent.
#
# Returns whatever get_ha_agent_lock() returned (true on success).
sub fence_node {
    my ($self, $node) = @_;

    my $haenv = $self->{haenv};

    if ($self->get_node_state($node) ne 'fence') {
        $set_node_state->($self, $node, 'fence');
        $send_fence_state_email->($self, 'FENCE', "Try to fence node '$node'", $node);
    }

    my $success = $haenv->get_ha_agent_lock($node);

    # lock not acquired yet: the node stays in 'fence' state
    return $success if !$success;

    my $msg = "fencing: acknowledged - got agent lock for node '$node'";
    $haenv->log('info', $msg);
    $set_node_state->($self, $node, 'unknown');
    $send_fence_state_email->($self, 'SUCCEED', $msg, $node);

    return $success;
}
258 | ||
cbca2c55 | 259 | 1; |