]> git.proxmox.com Git - pve-ha-manager.git/blame - src/PVE/HA/NodeStatus.pm
fix service name for pve-ha-crm
[pve-ha-manager.git] / src / PVE / HA / NodeStatus.pm
CommitLineData
cbca2c55
DM
1package PVE::HA::NodeStatus;
2
3use strict;
4use warnings;
5
854cecf3 6use JSON;
cbca2c55 7
b0e9158d
TL
# Default delay used by node_is_offline_delayed() when the caller passes none.
# It is compared against differences of $haenv->get_time() values
# (presumably seconds - confirm against the environment implementation).
my $fence_delay = 60;
9
# Constructor.
#
# $this   - class name, or an existing object whose class gets reused
# $haenv  - HA environment object, stored untouched under 'haenv'
# $status - hash reference mapping node name => node state string
#
# Returns the blessed object; the 'last_online' bookkeeping starts out empty.
sub new {
    my ($this, $haenv, $status) = @_;

    my $class = ref($this) || $this;

    my %fields = (
        haenv => $haenv,
        status => $status,
        last_online => {},
    );

    return bless \%fields, $class;
}
23
# All states a node can be in, each mapped to a short human readable
# description. Within this file only the keys matter: a state is accepted by
# the state setter if it has an entry here.
my $valid_node_states = {
    'online'      => "node online and member of quorate partition",
    'maintenance' => "node is a member of quorate partition but currently not able to do work",
    'unknown'     => "not member of quorate partition, but possibly still running",
    'fence'       => "node needs to be fenced",
    'gone'        => "node vanished from cluster members list, possibly deleted",
};
cbca2c55
DM
32
# Return the recorded state of $node.
#
# Side effect: a node without an entry (or with a false one) gets the state
# 'unknown' stored in the status hash before it is returned, so merely asking
# for a node makes it show up in the node list.
sub get_node_state {
    my ($self, $node) = @_;

    return $self->{status}->{$node} ||= 'unknown';
}
41
99278e06
TL
# True if $node is either in state 'online' or in state 'maintenance'.
sub node_is_operational {
    my ($self, $node) = @_;

    my $online = $self->node_is_online($node);
    return $online if $online;

    return $self->get_node_state($node) eq 'maintenance';
}
46
f7ccd1b3
DM
# True if the recorded state of $node is exactly 'online'.
sub node_is_online {
    my ($self, $node) = @_;

    my $state = $self->get_node_state($node);

    return $state eq 'online';
}
52
5385a606
DM
# Tell whether $node has been away for at least $delay time units.
#
# $delay is optional and falls back to the file-level $fence_delay.
#
# Returns undef while the node is in state 'online', and also on the first
# call for a node without a 'last_online' timestamp (that call starts the
# clock by recording the current time). Otherwise returns the result of
# comparing the elapsed time since 'last_online' against $delay.
sub node_is_offline_delayed {
    my ($self, $node, $delay) = @_;

    $delay //= $fence_delay;

    my $haenv = $self->{haenv};

    return undef if $self->get_node_state($node) eq 'online';

    my $seen = $self->{last_online}->{$node};

    my $now = $haenv->get_time();

    if (defined($seen)) {
        return ($now - $seen) >= $delay;
    }

    # never seen online by this instance: remember "now" as reference point
    $self->{last_online}->{$node} = $now;

    return undef;
}
73
9c7d068b
DM
# Return an array reference with the names of all tracked nodes, sorted.
sub list_nodes {
    my ($self) = @_;

    my @names = sort keys %{$self->{status}};

    return \@names;
}
79
f7ccd1b3
DM
# Return an array reference with the sorted names of all nodes whose recorded
# state is exactly 'online'.
sub list_online_nodes {
    my ($self) = @_;

    return [
        grep { $self->{status}->{$_} eq 'online' }
        sort keys %{$self->{status}}
    ];
}
92
7dd15f22
TL
# Forget a node completely, but only if it is currently in state 'gone'.
# Removes both its 'last_online' timestamp and its status entry and writes a
# 'notice' log line. For any other state nothing happens and undef is returned.
my $delete_node = sub {
    my ($self, $node) = @_;

    my $state = $self->get_node_state($node);
    return undef if $state ne 'gone';

    my $haenv = $self->{haenv};

    delete $self->{$_}->{$node} for qw(last_online status);

    $haenv->log(
        'notice',
        "deleting gone node '$node', not a cluster member anymore.",
    );
};
106
cbca2c55
DM
# Move $node into $state and log the transition at 'info' level.
#
# Dies if $state has no entry in $valid_node_states. Does nothing (no write,
# no log line) when the node already is in the requested state.
my $set_node_state = sub {
    my ($self, $node, $state) = @_;

    die "unknown node state '$state'\n"
        unless defined($valid_node_states->{$state});

    my $previous = $self->get_node_state($node);
    return if $previous eq $state;

    $self->{status}->{$node} = $state;

    my $haenv = $self->{haenv};
    $haenv->log('info', "node '$node': state changed from '$previous' => '$state'\n");
};
124
# Reconcile the recorded node states with the current cluster view.
#
# $node_info - hash ref: node name => hash ref; only its 'online' flag is
#              read here
# $lrm_modes - hash ref: node name => LRM mode string; only the value
#              'maintenance' is treated specially, a missing entry counts as
#              not being in maintenance
#
# Pass 1 looks at every node reported online, pass 2 at every tracked node
# that is not reported online (or not reported at all). Dies if a node carries
# a state none of the branches below knows about.
sub update {
    my ($self, $node_info, $lrm_modes) = @_;

    my $haenv = $self->{haenv};

    foreach my $node (sort keys %$node_info) {
        my $d = $node_info->{$node};
        next if !$d->{online};

        my $lrm_mode = $lrm_modes->{$node} // 'unknown';

        # record last time the node was online (required to implement fence delay)
        $self->{last_online}->{$node} = $haenv->get_time();

        my $state = $self->get_node_state($node);

        if ($state eq 'online') {
            if ($lrm_mode eq 'maintenance') {
                $set_node_state->($self, $node, 'maintenance');
            }
        } elsif ($state eq 'unknown' || $state eq 'gone') {
            # node (re)appeared in the quorate partition
            $set_node_state->($self, $node, 'online');
        } elsif ($state eq 'fence') {
            # do nothing, wait until fenced
        } elsif ($state eq 'maintenance') {
            if ($lrm_mode ne 'maintenance') {
                $set_node_state->($self, $node, 'online');
            }
        } else {
            die "detected unknown node state '$state'";
        }
    }

    # The key list is materialized by sort before the loop runs, so removing
    # entries from the status hash inside the loop is safe.
    foreach my $node (sort keys %{$self->{status}}) {
        my $d = $node_info->{$node};
        next if $d && $d->{online};

        my $state = $self->get_node_state($node);

        # node is not inside quorate partition, possibly not active

        if ($state eq 'online') {
            $set_node_state->($self, $node, 'unknown');
        } elsif ($state eq 'maintenance') {
            my $lrm_mode = $lrm_modes->{$node} // 'unknown';
            if ($lrm_mode ne 'maintenance') {
                $set_node_state->($self, $node, 'unknown');
            }
        } elsif ($state eq 'unknown') {
            # node isn't in the member list anymore, deleted from the cluster?
            $set_node_state->($self, $node, 'gone') if !defined($d);
        } elsif ($state eq 'fence') {
            # do nothing, wait until fenced
        } elsif ($state eq 'gone') {
            # only drop a vanished node after it has been away for 3600 time units
            if ($self->node_is_offline_delayed($node, 3600)) {
                $delete_node->($self, $node);
            }
        } else {
            die "detected unknown node state '$state'";
        }
    }
}
190
854cecf3
TL
# Assemble the common text for fence emails and send it.
#
# The subject is "<subject_prefix>: <subject>". The body is a fixed
# introduction followed by a JSON dump (pretty printed, keys sorted) of the
# manager status read from the environment and of the node status hash.
my $send_fence_state_email = sub {
    my ($self, $subject_prefix, $subject, $node) = @_;

    my $haenv = $self->{haenv};

    my $mail_subject = "${subject_prefix}: ${subject}";

    my $intro = <<"EOF";
The node '$node' failed and needs manual intervention.

The PVE HA manager tries to fence it and recover the
configured HA resources to a healthy node if possible.

Current fence status: $subject_prefix
$subject


Overall Cluster status:
-----------------------

EOF

    my $manager_status = $haenv->read_manager_status();
    my $cluster_state = {
        manager_status => $manager_status,
        node_status => $self->{status},
    };

    my $mail_text = $intro . to_json($cluster_state, { pretty => 1, canonical => 1});

    $haenv->sendmail($mail_subject, $mail_text);
};
221
222
# Start (or continue) fencing $node.
#
# On the first call for a node the state is switched to 'fence' and a 'FENCE'
# mail is sent. Every call then tries to get the HA agent lock of that node
# from the environment; once that works the node is put back to 'unknown',
# the acknowledgement is logged and a 'SUCCEED' mail is sent.
#
# Returns whatever get_ha_agent_lock() returned.
sub fence_node {
    my ($self, $node) = @_;

    my $haenv = $self->{haenv};

    if ($self->get_node_state($node) ne 'fence') {
        $set_node_state->($self, $node, 'fence');
        $send_fence_state_email->($self, 'FENCE', "Try to fence node '$node'", $node);
    }

    my $got_lock = $haenv->get_ha_agent_lock($node);
    return $got_lock if !$got_lock;

    my $ack = "fencing: acknowledged - got agent lock for node '$node'";
    $haenv->log("info", $ack);
    $set_node_state->($self, $node, 'unknown');
    $send_fence_state_email->($self, 'SUCCEED', $ack, $node);

    return $got_lock;
}
248
cbca2c55 2491;