]>
Commit | Line | Data |
---|---|---|
cbca2c55 DM |
1 | package PVE::HA::NodeStatus; |
2 | ||
3 | use strict; | |
4 | use warnings; | |
5 | ||
854cecf3 | 6 | use JSON; |
cbca2c55 | 7 | |
# Default delay used by node_is_offline_delayed() when the caller does not
# pass one. It is compared against differences of $haenv->get_time() values
# (presumably seconds - confirm against the environment implementation).
my $fence_delay = 60;
9 | ||
# Constructor.
#
# $haenv  - HA environment object, stored as-is in the 'haenv' slot
# $status - hash reference mapping node name => state, stored as-is
#
# May be invoked on a class name or on an existing instance; in the latter
# case the new object is blessed into the instance's class.
sub new {
    my ($this, $haenv, $status) = @_;

    my $class = ref($this) ? ref($this) : $this;

    my %fields = (
        haenv => $haenv,
        status => $status,
        # per node timestamps, filled in by update() and
        # node_is_offline_delayed()
        last_online => {},
    );

    return bless \%fields, $class;
}
23 | ||
# All states a node can be in, each mapped to a short description.
# Only the keys of this table are accepted by $set_node_state.
my $valid_node_states = {
    'online' =>
        "node online and member of quorate partition",
    'unknown' =>
        "not member of quorate partition, but possibly still running",
    'fence' =>
        "node needs to be fenced",
    'gone' =>
        "node vanished from cluster members list, possibly deleted",
};
cbca2c55 DM |
31 | |
# Return the recorded state of $node.
#
# A node without a (true) state entry is registered as 'unknown' as a side
# effect, so this never returns an empty value.
sub get_node_state {
    my ($self, $node) = @_;

    $self->{status}->{$node} ||= 'unknown';

    return $self->{status}->{$node};
}
40 | ||
# True if the recorded state of $node is 'online'.
# Goes through get_node_state(), so an untracked node gets registered as
# 'unknown' by this call.
sub node_is_online {
    my ($self, $node) = @_;

    my $state = $self->get_node_state($node);

    return $state eq 'online';
}
46 | ||
# Check whether $node has been away for at least $delay time units
# (same unit as $haenv->get_time(); defaults to $fence_delay).
#
# Returns undef while the node is 'online', and also on the first call for
# a node without a recorded last-online time - that call only records the
# current time as the starting point. Otherwise returns the result of the
# comparison (elapsed time >= $delay).
sub node_is_offline_delayed {
    my ($self, $node, $delay) = @_;

    $delay = $fence_delay unless defined($delay);

    if ($self->get_node_state($node) eq 'online') {
        return undef;
    }

    my $seen = $self->{last_online}->{$node};
    my $now = $self->{haenv}->get_time();

    if (defined($seen)) {
        return ($now - $seen) >= $delay;
    }

    # never seen online so far, start counting from now
    $self->{last_online}->{$node} = $now;

    return undef;
}
67 | ||
# Return an array reference with the names of all tracked nodes,
# sorted as strings.
sub list_nodes {
    my ($self) = @_;

    my @names = sort keys %{$self->{status}};

    return \@names;
}
73 | ||
# Return an array reference with the names of all nodes whose recorded
# state is 'online', sorted as strings.
sub list_online_nodes {
    my ($self) = @_;

    my @online = grep {
        $self->{status}->{$_} eq 'online'
    } sort keys %{$self->{status}};

    return \@online;
}
86 | ||
# Forget a node that is in state 'gone': drops its last-online timestamp
# and its state entry, then logs a notice.
# Returns undef without touching anything if the node is in any other state.
my $delete_node = sub {
    my ($self, $node) = @_;

    if ($self->get_node_state($node) ne 'gone') {
        return undef;
    }

    foreach my $slot (qw(last_online status)) {
        delete $self->{$slot}->{$node};
    }

    my $msg = "deleting gone node '$node', not a cluster member anymore.";

    return $self->{haenv}->log('notice', $msg);
};
100 | ||
# Switch $node to $state and log the transition.
#
# Dies if $state is not a key of $valid_node_states. Does nothing (and logs
# nothing) when the node already is in the requested state.
my $set_node_state = sub {
    my ($self, $node, $state) = @_;

    if (!defined($valid_node_states->{$state})) {
        die "unknown node state '$state'\n";
    }

    my $previous = $self->get_node_state($node);

    return if $previous eq $state;

    $self->{status}->{$node} = $state;

    my $msg = "node '$node': state changed from '$previous' => '$state'\n";

    return $self->{haenv}->log('info', $msg);
};
118 | ||
# Bring the tracked node states in line with a fresh membership view.
#
# $node_info is a hash reference keyed by node name; each value is a hash
# reference whose 'online' entry is true while the node counts as online.
# A tracked node with no entry in $node_info is treated as removed from the
# member list.
#
# Transitions performed here:
#   reported online:      unknown/gone -> online (fence is left alone)
#   not reported online:  online -> unknown
#                         unknown -> gone, only if missing from $node_info
#                         gone -> deleted, once node_is_offline_delayed()
#                                 confirms a delay of 3600 has passed
#
# Dies if a node carries a state that is not handled here.
sub update {
    my ($self, $node_info) = @_;

    my $haenv = $self->{haenv};

    foreach my $node (sort keys %$node_info) {
        my $d = $node_info->{$node};
        next if !$d->{online};

        # record last time the node was online (required to implement fence delay)
        $self->{last_online}->{$node} = $haenv->get_time();

        my $state = $self->get_node_state($node);

        if ($state eq 'online' || $state eq 'fence') {
            # online: nothing to change
            # fence: do nothing, wait until fenced
        } elsif ($state eq 'unknown' || $state eq 'gone') {
            $set_node_state->($self, $node, 'online');
        } else {
            die "detected unknown node state '$state'";
        }
    }

    # 'sort keys' yields a copy of the key list, so deleting entries from
    # $self->{status} inside the loop is safe
    foreach my $node (sort keys %{$self->{status}}) {
        my $d = $node_info->{$node};
        next if $d && $d->{online};

        my $state = $self->get_node_state($node);

        # node is not inside quorate partition, possibly not active

        if ($state eq 'online') {
            $set_node_state->($self, $node, 'unknown');
        } elsif ($state eq 'unknown') {
            # node isn't in the member list anymore, deleted from the cluster?
            $set_node_state->($self, $node, 'gone') if !defined($d);
        } elsif ($state eq 'fence') {
            # do nothing, wait until fenced
        } elsif ($state eq 'gone') {
            if ($self->node_is_offline_delayed($node, 3600)) {
                $delete_node->($self, $node);
            }
        } else {
            die "detected unknown node state '$state'";
        }
    }

    return;
}
171 | ||
# Assemble the common text for fence emails and send it.
#
# The body consists of a fixed introduction mentioning $node and the
# current fence status ($subject_prefix / $subject), followed by a JSON dump
# of the manager status and of the node states tracked by this object.
# The mail subject is "$subject_prefix: $subject".
my $send_fence_state_email = sub {
    my ($self, $subject_prefix, $subject, $node) = @_;

    my $haenv = $self->{haenv};

    my $body = <<EOF;
The node '$node' failed and needs manual intervention.

The PVE HA manager tries to fence it and recover the
configured HA resources to a healthy node if possible.

Current fence status: $subject_prefix
$subject


Overall Cluster status:
-----------------------

EOF

    my $title = join(': ', $subject_prefix, $subject);

    # keep the scalar assignment: read_manager_status() must be called in
    # scalar context
    my $manager_status = $haenv->read_manager_status();

    my %report = (
        manager_status => $manager_status,
        node_status => $self->{status},
    );

    $body .= to_json(\%report, { pretty => 1, canonical => 1});

    return $haenv->sendmail($title, $body);
};
202 | ||
203 | ||
# Start (or continue) fencing $node.
#
# On the first call for a node the state is switched to 'fence' and a
# 'FENCE' mail is sent. Every call then tries to get the HA agent lock of
# the node; once that succeeds the fencing is logged as acknowledged, the
# node is put back to 'unknown' and a 'SUCCEED' mail is sent.
#
# Returns whatever $haenv->get_ha_agent_lock() returned.
sub fence_node {
    my ($self, $node) = @_;

    my $haenv = $self->{haenv};

    unless ($self->get_node_state($node) eq 'fence') {
        $set_node_state->($self, $node, 'fence');
        my $notice = "Try to fence node '$node'";
        $send_fence_state_email->($self, 'FENCE', $notice, $node);
    }

    my $got_lock = $haenv->get_ha_agent_lock($node);

    # lock not available yet, the caller has to try again later
    return $got_lock if !$got_lock;

    my $ack = "fencing: acknowledged - got agent lock for node '$node'";
    $haenv->log("info", $ack);
    $set_node_state->($self, $node, 'unknown');
    $send_fence_state_email->($self, 'SUCCEED', $ack, $node);

    return $got_lock;
}
229 | ||
cbca2c55 | 230 | 1; |