]>
Commit | Line | Data |
---|---|---|
cbca2c55 DM |
1 | package PVE::HA::NodeStatus; |
2 | ||
3 | use strict; | |
4 | use warnings; | |
5 | ||
854cecf3 | 6 | use JSON; |
cbca2c55 | 7 | |
b0e9158d TL |
# Default used by node_is_offline_delayed() when the caller passes no explicit
# delay. It is compared against differences of $haenv->get_time() values
# (presumably seconds - confirm against the HA environment implementation).
my $fence_delay = 60;
9 | ||
# Constructor.
#
# Parameters:
#   $this   - class name, or an existing object whose class is reused
#   $haenv  - HA environment object, stored as-is under {haenv}
#   $status - hash ref mapping node name => state string, stored as-is
#             (not copied) under {status}
#
# Returns the new blessed object; {last_online} starts out as an empty hash.
sub new {
    my ($this, $haenv, $status) = @_;

    my %fields = (
        haenv       => $haenv,
        status      => $status,
        last_online => {},
    );

    # allow both Class->new(...) and $object->new(...)
    return bless \%fields, ref($this) || $this;
}
23 | ||
# All node states this module accepts, each mapped to a short description.
# $set_node_state rejects any state that is not a key of this table.
my $valid_node_states = {
    online      => 'node online and member of quorate partition',
    maintenance => 'node is a member of quorate partition but currently not able to do work',
    unknown     => 'not member of quorate partition, but possibly still running',
    fence       => 'node needs to be fenced',
    gone        => 'node vanished from cluster members list, possibly deleted',
};
cbca2c55 DM |
32 | |
# Return the recorded state of $node.
#
# A node without a (true) recorded state is registered as 'unknown' first, so
# looking up a node that was never seen adds it to the status table.
sub get_node_state {
    my ($self, $node) = @_;

    $self->{status}->{$node} ||= 'unknown';

    return $self->{status}->{$node};
}
41 | ||
99278e06 TL |
# True if $node is in state 'online' or in state 'maintenance'.
sub node_is_operational {
    my ($self, $node) = @_;

    my $online = $self->node_is_online($node);
    return $online if $online;

    return $self->get_node_state($node) eq 'maintenance';
}
46 | ||
f7ccd1b3 DM |
# True if the recorded state of $node is exactly 'online'.
sub node_is_online {
    my ($self, $node) = @_;

    my $state = $self->get_node_state($node);

    return $state eq 'online';
}
52 | ||
5385a606 DM |
# Tell whether $node has been away from state 'online' for at least $delay.
#
# Parameters:
#   $node  - node name
#   $delay - optional; falls back to the file-level $fence_delay when undefined
#
# Returns undef while the node is 'online', and also on the first call for a
# node without a last_online timestamp (that call stores the current time as
# the starting point). Otherwise returns the result of comparing the elapsed
# time, taken from $haenv->get_time(), against $delay.
sub node_is_offline_delayed {
    my ($self, $node, $delay) = @_;

    $delay //= $fence_delay;

    return undef if $self->get_node_state($node) eq 'online';

    my $seen = $self->{last_online}->{$node};
    my $now = $self->{haenv}->get_time();

    if (defined($seen)) {
        return ($now - $seen) >= $delay;
    }

    # no reference point yet: start measuring from now
    $self->{last_online}->{$node} = $now;
    return undef;
}
73 | ||
9c7d068b DM |
# Return an array ref with the names of all tracked nodes, sorted as strings.
sub list_nodes {
    my ($self) = @_;

    my @names = sort keys %{$self->{status}};

    return \@names;
}
79 | ||
f7ccd1b3 DM |
# Return an array ref with the names of all nodes whose recorded state is
# 'online', sorted as strings.
sub list_online_nodes {
    my ($self) = @_;

    return [
        grep { $self->{status}->{$_} eq 'online' }
        sort keys %{$self->{status}}
    ];
}
92 | ||
7dd15f22 TL |
# Forget a node completely: drops its status and last_online entries and logs
# a notice. Does nothing (returns undef) unless the node is in state 'gone'.
my $delete_node = sub {
    my ($self, $node) = @_;

    my $state = $self->get_node_state($node);
    return undef if $state ne 'gone';

    delete $self->{$_}->{$node} for qw(last_online status);

    $self->{haenv}->log(
        'notice',
        "deleting gone node '$node', not a cluster member anymore.",
    );
};
106 | ||
cbca2c55 DM |
# Move $node to $state and log the transition.
#
# Dies if $state is not a key of $valid_node_states. Returns early, without
# logging, when the node already is in the requested state.
my $set_node_state = sub {
    my ($self, $node, $state) = @_;

    if (!defined($valid_node_states->{$state})) {
        die "unknown node state '$state'\n";
    }

    my $previous = $self->get_node_state($node);
    return if $previous eq $state;

    $self->{status}->{$node} = $state;

    $self->{haenv}->log(
        'info',
        "node '$node': state changed from '$previous' => '$state'\n",
    );
};
124 | ||
# Reconcile the tracked node states with fresh membership information.
#
# Parameters:
#   $node_info - hash ref keyed by node name; only the {online} flag of each
#                entry is read here
#   $lrm_modes - hash ref keyed by node name with the reported LRM mode; only
#                the value 'maintenance' is treated specially
#
# Pass 1 handles every node flagged online in $node_info: its last_online
# timestamp is refreshed and its state is moved towards 'online' or
# 'maintenance'. Nodes in state 'fence' are left alone.
#
# Pass 2 handles every tracked node that is not flagged online: it is moved
# to 'unknown', then to 'gone' once it has no $node_info entry at all, and is
# finally deleted after it stayed away for the 3600 delay passed to
# node_is_offline_delayed().
#
# Dies if a tracked node carries a state that none of the branches handles.
sub update {
    my ($self, $node_info, $lrm_modes) = @_;

    my $haenv = $self->{haenv};

    foreach my $node (sort keys %$node_info) {
        my $d = $node_info->{$node};
        next if !$d->{online};

        my $lrm_mode = $lrm_modes->{$node} // 'unknown';

        # record last time the node was online (required to implement fence delay)
        $self->{last_online}->{$node} = $haenv->get_time();

        my $state = $self->get_node_state($node);

        if ($state eq 'online') {
            if ($lrm_mode eq 'maintenance') {
                $set_node_state->($self, $node, 'maintenance');
            }
        } elsif ($state eq 'unknown' || $state eq 'gone') {
            $set_node_state->($self, $node, 'online');
        } elsif ($state eq 'fence') {
            # do nothing, wait until fenced
        } elsif ($state eq 'maintenance') {
            if ($lrm_mode ne 'maintenance') {
                $set_node_state->($self, $node, 'online');
            }
        } else {
            # no trailing newline on purpose: the appended file/line tells the
            # two identical messages in this sub apart
            die "detected unknown node state '$state'";
        }
    }

    # 'sort keys' yields a copy of the key list, so $delete_node may remove
    # entries from $self->{status} while we iterate
    foreach my $node (sort keys %{$self->{status}}) {
        my $d = $node_info->{$node};
        next if $d && $d->{online};

        my $state = $self->get_node_state($node);

        # node is not inside quorate partition, possibly not active

        if ($state eq 'online') {
            $set_node_state->($self, $node, 'unknown');
        } elsif ($state eq 'maintenance') {
            my $lrm_mode = $lrm_modes->{$node} // 'unknown';
            if ($lrm_mode ne 'maintenance') {
                $set_node_state->($self, $node, 'unknown');
            }
        } elsif ($state eq 'unknown') {
            # node isn't in the member list anymore, deleted from the cluster?
            $set_node_state->($self, $node, 'gone') if !defined($d);
        } elsif ($state eq 'fence') {
            # do nothing, wait until fenced
        } elsif ($state eq 'gone') {
            if ($self->node_is_offline_delayed($node, 3600)) {
                $delete_node->($self, $node);
            }
        } else {
            die "detected unknown node state '$state'";
        }
    }
}
190 | ||
854cecf3 TL |
# assembles a common text for fence emails
#
# The mail body is a fixed explanation naming $node and the current fence
# status, followed by a JSON dump of the manager status and of the node status
# table. The mail subject is "$subject_prefix: $subject".
my $send_fence_state_email = sub {
    my ($self, $subject_prefix, $subject, $node) = @_;

    my $haenv = $self->{haenv};

    my $body = <<EOF;
The node '$node' failed and needs manual intervention.

The PVE HA manager tries to fence it and recover the
configured HA resources to a healthy node if possible.

Current fence status: $subject_prefix
$subject


Overall Cluster status:
-----------------------

EOF

    # fetch into a scalar first so the call is made in scalar context
    my $manager_status = $haenv->read_manager_status();
    my $report = {
        manager_status => $manager_status,
        node_status => $self->{status},
    };

    $body .= to_json($report, { pretty => 1, canonical => 1 });

    $haenv->sendmail(join(': ', $subject_prefix, $subject), $body);
};
221 | ||
222 | ||
# start fencing
#
# Puts $node into state 'fence' (sending a 'FENCE' mail on that transition)
# and then tries to get the node's HA agent lock. Once the lock is obtained
# the fence counts as acknowledged: this is logged, the node is set to
# 'unknown' and a 'SUCCEED' mail is sent.
#
# Returns the result of get_ha_agent_lock(), so the caller can call again
# until it is true.
sub fence_node {
    my ($self, $node) = @_;

    my $haenv = $self->{haenv};

    my $already_fencing = $self->get_node_state($node) eq 'fence';
    if (!$already_fencing) {
        $set_node_state->($self, $node, 'fence');
        $send_fence_state_email->($self, 'FENCE', "Try to fence node '$node'", $node);
    }

    my $success = $haenv->get_ha_agent_lock($node);
    return $success if !$success;

    my $ack = "fencing: acknowledged - got agent lock for node '$node'";
    $haenv->log("info", $ack);
    $set_node_state->($self, $node, 'unknown');
    $send_fence_state_email->($self, 'SUCCEED', $ack, $node);

    return $success;
}
248 | ||
cbca2c55 | 249 | 1; |