]> git.proxmox.com Git - pve-ha-manager.git/blame - src/PVE/HA/NodeStatus.pm
sort some more keys for regression test determinism
[pve-ha-manager.git] / src / PVE / HA / NodeStatus.pm
CommitLineData
cbca2c55
DM
1package PVE::HA::NodeStatus;
2
3use strict;
4use warnings;
5
854cecf3 6use JSON;
cbca2c55
DM
7use Data::Dumper;
8
# Default delay used by node_is_offline_delayed() when the caller passes none.
# It is compared against differences of $haenv->get_time() values
# (presumably seconds - confirm against the environment implementation).
my $fence_delay = 60;
10
# Constructor.
#
# $haenv  - HA environment object; this module calls get_time(), log(),
#           read_manager_status(), sendmail() and get_ha_agent_lock() on it.
# $status - hash reference mapping node name => node state string. It is
#           stored by reference (not copied) and modified in place later.
#
# May be called on a class name or on an existing instance, in which case the
# instance's class is reused. Returns the new blessed object.
sub new {
    my ($this, $haenv, $status) = @_;

    my $class = ref($this) || $this;

    my $self = bless {
        haenv => $haenv,
        status => $status,
        # node name => timestamp used for the offline delay calculation
        last_online => {},
    }, $class;

    return $self;
}
24
# Possible node states, mapped to a human readable description.
# $set_node_state refuses any state that is not a key of this hash.
my $valid_node_states = {
    online => "node online and member of quorate partition",
    unknown => "not member of quorate partition, but possibly still running",
    fence => "node needs to be fenced",
    gone => "node vanished from cluster members list, possibly deleted"
};
cbca2c55
DM
32
# Returns the recorded state string for $node.
#
# Side effect: a node without a (true) recorded state is registered in the
# status hash as 'unknown' before the value is returned.
sub get_node_state {
    my ($self, $node) = @_;

    $self->{status}->{$node} ||= 'unknown';

    return $self->{status}->{$node};
}
41
f7ccd1b3
DM
# Returns true if the recorded state of $node is 'online'.
# Goes through get_node_state(), so an unseen node gets registered as 'unknown'.
sub node_is_online {
    my ($self, $node) = @_;

    return $self->get_node_state($node) eq 'online';
}
47
5385a606
DM
# Tells whether $node has been not-online for at least $delay time units.
#
# $delay is optional and defaults to $fence_delay. Time is taken from
# $haenv->get_time().
#
# Returns undef while the node is 'online', and also on the first call for a
# node without a recorded last_online timestamp (that call only records the
# current time as starting point). Otherwise returns the boolean result of
# (now - last_online) >= $delay.
sub node_is_offline_delayed {
    my ($self, $node, $delay) = @_;

    $delay = $fence_delay if !defined($delay);

    my $haenv = $self->{haenv};

    return undef if $self->get_node_state($node) eq 'online';

    my $last_online = $self->{last_online}->{$node};

    my $ctime = $haenv->get_time();

    if (!defined($last_online)) {
        # never seen online: start measuring the delay from now
        $self->{last_online}->{$node} = $ctime;
        return undef;
    }

    return ($ctime - $last_online) >= $delay;
}
68
9c7d068b
DM
# Returns an array reference with the names of all nodes that have a status
# entry, sorted with Perl's default (string) sort.
sub list_nodes {
    my ($self) = @_;

    return [sort keys %{$self->{status}}];
}
74
f7ccd1b3
DM
# Returns an array reference with the names of all nodes whose recorded state
# is 'online', sorted with Perl's default (string) sort.
sub list_online_nodes {
    my ($self) = @_;

    my $status = $self->{status};

    return [ grep { $status->{$_} eq 'online' } sort keys %$status ];
}
87
7dd15f22
TL
# Private helper: forget a node completely.
#
# Only acts on nodes in state 'gone'; for any other state it returns undef
# without touching anything. Otherwise the node's last_online and status
# entries are removed and a 'notice' message is logged through $haenv.
my $delete_node = sub {
    my ($self, $node) = @_;

    return undef if $self->get_node_state($node) ne 'gone';

    my $haenv = $self->{haenv};

    delete $self->{last_online}->{$node};
    delete $self->{status}->{$node};

    $haenv->log('notice', "deleting gone node '$node', not a cluster member".
        " anymore.");
};
101
cbca2c55
DM
# Private helper: change the recorded state of $node to $state.
#
# Dies if $state is not a key of $valid_node_states. Does nothing when the
# node already is in $state; otherwise stores the new state and logs the
# transition at 'info' level through $haenv.
my $set_node_state = sub {
    my ($self, $node, $state) = @_;

    my $haenv = $self->{haenv};

    die "unknown node state '$state'\n"
        if !defined($valid_node_states->{$state});

    my $last_state = $self->get_node_state($node);

    return if $state eq $last_state;

    $self->{status}->{$node} = $state;

    $haenv->log('info', "node '$node': state changed from " .
        "'$last_state' => '$state'\n");
};
119
# Synchronize the recorded node states with fresh membership information.
#
# $node_info - hash reference mapping node name => hash reference; only the
#              'online' flag of each entry is read here.
#
# Pass 1 handles every node reported online: its last_online timestamp is
# refreshed and an 'unknown' or 'gone' node becomes 'online'.
# Pass 2 handles every known node that is not reported online:
#   online  -> unknown
#   unknown -> gone, but only if the node is missing from $node_info entirely
#   gone    -> deleted once node_is_offline_delayed($node, 3600) is true
# Nodes in state 'fence' are left alone in both passes.
#
# Dies if a node carries a state this module does not know.
sub update {
    my ($self, $node_info) = @_;

    my $haenv = $self->{haenv};

    foreach my $node (sort keys %$node_info) {
        my $d = $node_info->{$node};
        next if !$d->{online};

        # record last time the node was online (required to implement fence delay)
        $self->{last_online}->{$node} = $haenv->get_time();

        my $state = $self->get_node_state($node);

        if ($state eq 'online') {
            # already online, nothing to change
        } elsif ($state eq 'unknown' || $state eq 'gone') {
            $set_node_state->($self, $node, 'online');
        } elsif ($state eq 'fence') {
            # do nothing, wait until fenced
        } else {
            die "detected unknown node state '$state'";
        }
    }

    # 'sort keys' yields a copy of the key list, so deleting nodes from
    # $self->{status} inside this loop is safe
    foreach my $node (sort keys %{$self->{status}}) {
        my $d = $node_info->{$node};
        next if $d && $d->{online};

        my $state = $self->get_node_state($node);

        # node is not inside quorate partition, possibly not active

        if ($state eq 'online') {
            $set_node_state->($self, $node, 'unknown');
        } elsif ($state eq 'unknown') {
            # node isn't in the member list anymore, deleted from the cluster?
            $set_node_state->($self, $node, 'gone') if !defined($d);
        } elsif ($state eq 'fence') {
            # do nothing, wait until fenced
        } elsif ($state eq 'gone') {
            if ($self->node_is_offline_delayed($node, 3600)) {
                $delete_node->($self, $node);
            }
        } else {
            die "detected unknown node state '$state'";
        }
    }
}
172
854cecf3
TL
# Private helper: assembles a common text for fence emails and sends it.
#
# $subject_prefix - short fence status tag, also shown in the mail body
# $subject        - message text; the mail subject is "$subject_prefix: $subject"
# $node           - name of the node the mail is about
#
# The mail body is the fixed text below followed by a pretty-printed JSON dump
# (canonical key order) of the manager status read from $haenv and of this
# object's node status hash. Sending is delegated to $haenv->sendmail().
my $send_fence_state_email = sub {
    my ($self, $subject_prefix, $subject, $node) = @_;

    my $haenv = $self->{haenv};

    my $mail_text = <<EOF;
The node '$node' failed and needs manual intervention.

The PVE HA manager tries to fence it and recover the
configured HA resources to a healthy node if possible.

Current fence status: $subject_prefix
$subject


Overall Cluster status:
-----------------------

EOF

    my $mail_subject = $subject_prefix . ': ' . $subject;

    my $status = $haenv->read_manager_status();
    my $data = { manager_status => $status, node_status => $self->{status} };

    $mail_text .= to_json($data, { pretty => 1, canonical => 1});

    $haenv->sendmail($mail_subject, $mail_text);
};
203
204
# Start (or continue) fencing of $node.
#
# On the first call for a node not yet in state 'fence', the node is switched
# to 'fence' and a 'FENCE' notification mail is sent. Every call then tries to
# obtain the node's HA agent lock through $haenv->get_ha_agent_lock(). Once
# that succeeds, the success is logged, the node is set back to 'unknown' and
# a 'SUCCEED' notification mail is sent.
#
# Returns the result of get_ha_agent_lock() (true once the lock was obtained).
sub fence_node {
    my ($self, $node) = @_;

    my $haenv = $self->{haenv};

    my $state = $self->get_node_state($node);

    if ($state ne 'fence') {
        $set_node_state->($self, $node, 'fence');
        my $msg = "Try to fence node '$node'";
        $send_fence_state_email->($self, 'FENCE', $msg, $node);
    }

    my $success = $haenv->get_ha_agent_lock($node);

    if ($success) {
        my $msg = "fencing: acknowledged - got agent lock for node '$node'";
        $haenv->log("info", $msg);
        $set_node_state->($self, $node, 'unknown');
        $send_fence_state_email->($self, 'SUCCEED', $msg, $node);
    }

    return $success;
}
230
cbca2c55 2311;