]> git.proxmox.com Git - pve-ha-manager.git/blame - src/PVE/HA/NodeStatus.pm
fix #1919, #1920: improve handling zombie (without node) services
[pve-ha-manager.git] / src / PVE / HA / NodeStatus.pm
CommitLineData
cbca2c55
DM
1package PVE::HA::NodeStatus;
2
3use strict;
4use warnings;
5
854cecf3 6use JSON;
cbca2c55 7
b0e9158d
TL
# Default delay used by node_is_offline_delayed() when the caller passes none.
# It is compared against differences of $haenv->get_time() values
# (presumably seconds - confirm against the environment implementation).
my $fence_delay = 60;
9
# Constructor.
#
# Parameters:
#   $this   - class name, or an existing instance whose class is reused
#   $haenv  - HA environment object, stored as-is under 'haenv'
#   $status - hash ref mapping node name => state, stored as-is (not copied)
#
# Returns the new blessed object; 'last_online' starts out as an empty hash.
sub new {
    my ($this, $haenv, $status) = @_;

    my $class = ref($this) || $this;

    my %fields = (
        haenv => $haenv,
        status => $status,
        last_online => {},
    );

    return bless \%fields, $class;
}
23
# Node states accepted by $set_node_state, each mapped to a short
# human-readable description.
my $valid_node_states = {
    online  => "node online and member of quorate partition",
    unknown => "not member of quorate partition, but possibly still running",
    fence   => "node needs to be fenced",
    gone    => "node vanished from cluster members list, possibly deleted",
};
cbca2c55
DM
31
# Returns the recorded state of $node.
#
# A node without a (true) recorded state is registered as 'unknown' as a side
# effect, so merely querying a node adds it to the status hash.
sub get_node_state {
    my ($self, $node) = @_;

    return $self->{status}->{$node} ||= 'unknown';
}
40
f7ccd1b3
DM
# Returns true if the recorded state of $node is 'online'.
#
# Goes through get_node_state(), so an unseen node gets registered as
# 'unknown' by this call.
sub node_is_online {
    my ($self, $node) = @_;

    my $state = $self->get_node_state($node);

    return $state eq 'online';
}
46
5385a606
DM
# Tells whether $node has been non-online for at least $delay.
#
# Parameters:
#   $node  - node name
#   $delay - optional; falls back to the file-level $fence_delay when undefined
#
# Returns:
#   undef  - if the node is currently 'online', or if no 'last_online'
#            timestamp was recorded yet (the current time is then stored, so
#            the delay starts counting from this call)
#   bool   - otherwise, whether (current time - last_online) >= $delay
sub node_is_offline_delayed {
    my ($self, $node, $delay) = @_;

    $delay = $fence_delay unless defined($delay);

    my $state = $self->get_node_state($node);
    return undef if $state eq 'online';

    my $last_seen = $self->{last_online}->{$node};
    my $now = $self->{haenv}->get_time();

    if (defined($last_seen)) {
        return ($now - $last_seen) >= $delay;
    }

    # first time we look at this node while it is not online
    $self->{last_online}->{$node} = $now;
    return undef;
}
67
9c7d068b
DM
# Returns an array ref with the names of all nodes present in the status
# hash, sorted with Perl's default (string) ordering.
sub list_nodes {
    my ($self) = @_;

    my @names = sort keys %{$self->{status}};

    return \@names;
}
73
f7ccd1b3
DM
# Returns an array ref with the names of all nodes whose recorded state is
# exactly 'online', sorted with Perl's default (string) ordering.
sub list_online_nodes {
    my ($self) = @_;

    my @online =
        grep { $self->{status}->{$_} eq 'online' }
        sort keys %{$self->{status}};

    return \@online;
}
86
7dd15f22
TL
# Private helper: forget a node that is in state 'gone'.
#
# Returns undef without touching anything if the node is in any other state.
# Otherwise removes the node from both the 'last_online' and 'status' hashes
# and logs a notice through the HA environment.
my $delete_node = sub {
    my ($self, $node) = @_;

    my $state = $self->get_node_state($node);
    return undef if $state ne 'gone';

    my $haenv = $self->{haenv};

    delete $self->{$_}->{$node} for qw(last_online status);

    $haenv->log('notice', "deleting gone node '$node', not a cluster member anymore.");
};
100
cbca2c55
DM
# Private helper: move $node into $state.
#
# Dies with "unknown node state '...'" if $state is not a key of
# $valid_node_states. Does nothing (bare return) when the node already is in
# that state; otherwise stores the new state and logs the transition at
# 'info' level through the HA environment.
my $set_node_state = sub {
    my ($self, $node, $state) = @_;

    my $haenv = $self->{haenv};

    if (!defined($valid_node_states->{$state})) {
        die "unknown node state '$state'\n";
    }

    my $previous = $self->get_node_state($node);
    return if $previous eq $state;

    $self->{status}->{$node} = $state;

    $haenv->log('info', "node '$node': state changed from '$previous' => '$state'\n");
};
118
# Synchronizes the recorded node states with fresh cluster membership data.
#
# Parameters:
#   $node_info - hash ref mapping node name => hash ref; only the 'online'
#                flag of each entry is read here
#
# First pass (nodes reported online): refreshes their 'last_online' timestamp
# and moves 'unknown' or 'gone' nodes back to 'online'. Nodes in state
# 'fence' stay there until fencing has completed.
#
# Second pass (known nodes that are not reported online):
#   online  -> unknown
#   unknown -> gone, but only if the node has no entry in $node_info at all
#   fence   -> unchanged
#   gone    -> removed once node_is_offline_delayed($node, 3600) is true
#
# Dies if a node carries a state that is none of the four handled ones.
sub update {
    my ($self, $node_info) = @_;

    my $haenv = $self->{haenv};

    foreach my $node (sort keys %$node_info) {
        my $d = $node_info->{$node};
        next if !$d->{online};

        # record last time the node was online (required to implement fence delay)
        $self->{last_online}->{$node} = $haenv->get_time();

        my $state = $self->get_node_state($node);

        if ($state eq 'online') {
            # already online, nothing to change
        } elsif ($state eq 'unknown' || $state eq 'gone') {
            $set_node_state->($self, $node, 'online');
        } elsif ($state eq 'fence') {
            # do nothing, wait until fenced
        } else {
            die "detected unknown node state '$state'";
        }
    }

    # the key list is computed before the loop body runs, so deleting nodes
    # from the status hash while iterating is safe
    foreach my $node (sort keys %{$self->{status}}) {
        my $d = $node_info->{$node};
        next if $d && $d->{online};

        my $state = $self->get_node_state($node);

        # node is not inside quorate partition, possibly not active

        if ($state eq 'online') {
            $set_node_state->($self, $node, 'unknown');
        } elsif ($state eq 'unknown') {
            # node isn't in the member list anymore, deleted from the cluster?
            $set_node_state->($self, $node, 'gone') if !defined($d);
        } elsif ($state eq 'fence') {
            # do nothing, wait until fenced
        } elsif ($state eq 'gone') {
            if ($self->node_is_offline_delayed($node, 3600)) {
                $delete_node->($self, $node);
            }
        } else {
            die "detected unknown node state '$state'";
        }
    }
}
171
854cecf3
TL
# Private helper: assembles the common text for fence emails and sends it.
#
# Parameters:
#   $subject_prefix - short status tag; used in the mail subject and body
#   $subject        - status message; used in the mail subject and body
#   $node           - name of the affected node
#
# The body ends with a pretty-printed, canonical JSON dump of the manager
# status (as read from the HA environment) and of this object's node status.
my $send_fence_state_email = sub {
    my ($self, $subject_prefix, $subject, $node) = @_;

    my $haenv = $self->{haenv};

    my $mail_text = <<EOF;
The node '$node' failed and needs manual intervention.

The PVE HA manager tries to fence it and recover the
configured HA resources to a healthy node if possible.

Current fence status: $subject_prefix
$subject


Overall Cluster status:
-----------------------

EOF

    my $mail_subject = join(': ', $subject_prefix, $subject);

    my $manager_status = $haenv->read_manager_status();
    my $dump = {
        manager_status => $manager_status,
        node_status => $self->{status},
    };

    $mail_text .= to_json($dump, { pretty => 1, canonical => 1});

    $haenv->sendmail($mail_subject, $mail_text);
};
202
203
# Starts (or continues) fencing of $node.
#
# If the node is not yet in state 'fence', it is moved there and a 'FENCE'
# notification mail is sent. Each call then tries to get the HA agent lock
# for the node from the HA environment; once that succeeds the fence is
# logged as acknowledged, the node goes back to state 'unknown' and a
# 'SUCCEED' mail is sent.
#
# Returns whatever get_ha_agent_lock() returned (true once the lock was
# obtained).
sub fence_node {
    my ($self, $node) = @_;

    my $haenv = $self->{haenv};

    if ($self->get_node_state($node) ne 'fence') {
        $set_node_state->($self, $node, 'fence');
        my $start_msg = "Try to fence node '$node'";
        $send_fence_state_email->($self, 'FENCE', $start_msg, $node);
    }

    my $got_lock = $haenv->get_ha_agent_lock($node);

    if ($got_lock) {
        my $ack_msg = "fencing: acknowledged - got agent lock for node '$node'";
        $haenv->log("info", $ack_msg);
        $set_node_state->($self, $node, 'unknown');
        $send_fence_state_email->($self, 'SUCCEED', $ack_msg, $node);
    }

    return $got_lock;
}
229
cbca2c55 2301;