]>
Commit | Line | Data |
---|---|---|
cbca2c55 DM |
1 | package PVE::HA::NodeStatus; |
2 | ||
3 | use strict; | |
4 | use warnings; | |
5 | ||
854cecf3 | 6 | use JSON; |
cbca2c55 DM |
7 | use Data::Dumper; |
8 | ||
b0e9158d TL |
# Default used by node_is_offline_delayed() when the caller passes no $delay.
# It is compared against differences of $haenv->get_time() values
# (presumably seconds -- TODO confirm the unit against the HA environment).
9 | my $fence_delay = 60; |
10 | ||
# Constructor.
#
# Arguments:
#   $this   - class name, or an existing object whose class is reused
#   $haenv  - HA environment object, stored unchanged under 'haenv'
#   $status - node name => state mapping, stored unchanged under 'status'
#
# Returns the new object; its 'last_online' map starts out empty.
sub new {
    my ($this, $haenv, $status) = @_;

    # allow both Class->new(...) and $object->new(...)
    my $class = ref($this) || $this;

    my %fields = (
        haenv       => $haenv,
        status      => $status,
        last_online => {},
    );

    return bless \%fields, $class;
}
24 | ||
# All states a node can be in, each mapped to a human readable description.
# Only the keys of this hash are accepted as a new state by $set_node_state.
my $valid_node_states = {
    'online'  => "node online and member of quorate partition",
    'unknown' => "not member of quorate partition, but possibly still running",
    'fence'   => "node needs to be fenced",
    'gone'    => "node vanished from cluster members list, possibly deleted",
};
cbca2c55 DM |
32 | |
# Return the recorded state of $node.
#
# A node without a (true) recorded state is registered as 'unknown' first,
# so calling this for a new node name adds that node to the status map.
sub get_node_state {
    my ($self, $node) = @_;

    my $known = $self->{status}->{$node};
    return $known if $known;

    $self->{status}->{$node} = 'unknown';
    return $self->{status}->{$node};
}
41 | ||
f7ccd1b3 DM |
# Return true if the recorded state of $node is 'online', false otherwise.
# The lookup goes through get_node_state(), so an unseen node gets
# registered (as 'unknown') by this call.
sub node_is_online {
    my ($self, $node) = @_;

    my $state = $self->get_node_state($node);

    return $state eq 'online';
}
47 | ||
5385a606 DM |
# Tell whether $node has been non-online for at least $delay.
#
# Arguments:
#   $node  - node name
#   $delay - optional; falls back to $fence_delay when undefined. Same unit
#            as the values returned by $haenv->get_time().
#
# Returns:
#   undef       - the node is 'online', or no reference time was recorded
#                 for it yet (the current time is recorded in that case)
#   true/false  - whether (current time - recorded time) >= $delay
sub node_is_offline_delayed {
    my ($self, $node, $delay) = @_;

    $delay = $fence_delay unless defined($delay);

    return undef if $self->get_node_state($node) eq 'online';

    my $since = $self->{last_online}->{$node};
    my $now = $self->{haenv}->get_time();

    if (defined($since)) {
        return ($now - $since) >= $delay;
    }

    # nothing recorded for this node so far: start measuring from now
    $self->{last_online}->{$node} = $now;

    return undef;
}
68 | ||
9c7d068b DM |
# Return an array reference with the names of all nodes present in the
# status map, sorted with Perl's default (string) sort order.
sub list_nodes {
    my ($self) = @_;

    my @names = sort keys %{$self->{status}};

    return \@names;
}
74 | ||
f7ccd1b3 DM |
# Return an array reference with the names of all nodes whose recorded
# state is exactly 'online', sorted with Perl's default (string) sort order.
sub list_online_nodes {
    my ($self) = @_;

    my @online =
        grep { $self->{status}->{$_} eq 'online' }
        sort keys %{$self->{status}};

    return \@online;
}
87 | ||
7dd15f22 TL |
# Private helper: forget a node that is in state 'gone'.
#
# Does nothing (and returns undef) for a node in any other state. Otherwise
# the node is removed from both the 'last_online' and the 'status' map and
# the removal is logged with level 'notice'.
my $delete_node = sub {
    my ($self, $node) = @_;

    my $state = $self->get_node_state($node);
    return undef if $state ne 'gone';

    my $haenv = $self->{haenv};

    for my $map (qw(last_online status)) {
        delete $self->{$map}->{$node};
    }

    $haenv->log(
        'notice',
        "deleting gone node '$node', not a cluster member anymore.",
    );
};
101 | ||
cbca2c55 DM |
# Private helper: change the recorded state of $node to $state.
#
# Dies if $state is not a key of $valid_node_states. If the node already is
# in $state nothing happens; otherwise the new state is stored and the
# transition is logged with level 'info'.
my $set_node_state = sub {
    my ($self, $node, $state) = @_;

    my $haenv = $self->{haenv};

    unless (defined($valid_node_states->{$state})) {
        die "unknown node state '$state'\n";
    }

    my $previous = $self->get_node_state($node);

    # no transition, nothing to record or log
    return if $state eq $previous;

    $self->{status}->{$node} = $state;

    $haenv->log(
        'info',
        "node '$node': state changed from '$previous' => '$state'\n",
    );
};
119 | ||
# Bring the recorded node states in line with the current membership info.
#
# Arguments:
#   $node_info - hash reference, node name => hash reference; only the
#                'online' entry of each per-node hash is evaluated here.
#
# Pass 1 handles every node reported online: its 'last_online' time is
# refreshed and an 'unknown' or 'gone' node becomes 'online' again.
# Pass 2 handles every already known node that is not reported online:
#   online  -> unknown
#   unknown -> gone, but only if the node is missing from $node_info
#   gone    -> removed once node_is_offline_delayed($node, 3600) is true
# Nodes in state 'fence' are left alone in both passes.
#
# Dies if a node is found in a state this method does not know about.
sub update {
    my ($self, $node_info) = @_;

    my $haenv = $self->{haenv};

    foreach my $node (sort keys %$node_info) {
        my $d = $node_info->{$node};
        next if !$d->{online};

        # record last time the node was online (required to implement fence delay)
        $self->{last_online}->{$node} = $haenv->get_time();

        my $state = $self->get_node_state($node);

        if ($state eq 'online') {
            # already marked online, nothing to change
        } elsif ($state eq 'unknown' || $state eq 'gone') {
            $set_node_state->($self, $node, 'online');
        } elsif ($state eq 'fence') {
            # do nothing, wait until fenced
        } else {
            die "detected unknown node state '$state'";
        }
    }

    # The key list is built before the loop starts, so removing a node from
    # the status map inside the loop does not disturb the iteration.
    foreach my $node (sort keys %{$self->{status}}) {
        my $d = $node_info->{$node};
        next if $d && $d->{online};

        my $state = $self->get_node_state($node);

        # node is not inside quorate partition, possibly not active

        if ($state eq 'online') {
            $set_node_state->($self, $node, 'unknown');
        } elsif ($state eq 'unknown') {
            # node isn't in the member list anymore, deleted from the cluster?
            $set_node_state->($self, $node, 'gone') if !defined($d);
        } elsif ($state eq 'fence') {
            # do nothing, wait until fenced
        } elsif ($state eq 'gone') {
            if ($self->node_is_offline_delayed($node, 3600)) {
                $delete_node->($self, $node);
            }
        } else {
            die "detected unknown node state '$state'";
        }
    }
}
172 | ||
854cecf3 TL |
# Private helper: assembles a common text for fence emails and sends it.
#
# Arguments:
#   $subject_prefix - first part of the mail subject, also shown in the body
#   $subject        - second part of the mail subject, also shown in the body
#   $node           - name of the node the mail is about
#
# The body ends with a pretty printed, canonical JSON dump of the manager
# status (as read from the HA environment) and of the node status map.
my $send_fence_state_email = sub {
    my ($self, $subject_prefix, $subject, $node) = @_;

    my $haenv = $self->{haenv};

    my $mail_subject = join(': ', $subject_prefix, $subject);

    my $mail_text = <<EOF;
The node '$node' failed and needs manual intervention.

The PVE HA manager tries to fence it and recover the
configured HA resources to a healthy node if possible.

Current fence status: $subject_prefix
$subject


Overall Cluster status:
-----------------------

EOF

    my $data = {
        manager_status => $haenv->read_manager_status(),
        node_status    => $self->{status},
    };

    $mail_text .= to_json($data, { pretty => 1, canonical => 1});

    $haenv->sendmail($mail_subject, $mail_text);
};
203 | ||
204 | ||
# Start (or continue) fencing $node.
#
# If the node is not yet in state 'fence' it is moved there and a 'FENCE'
# notification mail is sent. Afterwards the HA agent lock of the node is
# requested from the HA environment; getting it is treated as the
# acknowledgement that the node is fenced: this is logged, the node state
# is set to 'unknown' and a 'SUCCEED' notification mail is sent.
#
# Returns the result of $haenv->get_ha_agent_lock($node), i.e. a true value
# once the node counts as fenced.
sub fence_node {
    my ($self, $node) = @_;

    my $haenv = $self->{haenv};

    my $state = $self->get_node_state($node);

    if ($state ne 'fence') {
        $set_node_state->($self, $node, 'fence');
        my $msg = "Try to fence node '$node'";
        $send_fence_state_email->($self, 'FENCE', $msg, $node);
    }

    my $success = $haenv->get_ha_agent_lock($node);

    if ($success) {
        my $msg = "fencing: acknowledged - got agent lock for node '$node'";
        $haenv->log("info", $msg);
        $set_node_state->($self, $node, 'unknown');
        $send_fence_state_email->($self, 'SUCCEED', $msg, $node);
    }

    return $success;
}
230 | ||
cbca2c55 | 231 | 1; |