]> git.proxmox.com Git - pve-ha-manager.git/blame - src/PVE/HA/CRM.pm
do not do active work if cfs update failed
[pve-ha-manager.git] / src / PVE / HA / CRM.pm
CommitLineData
6cd38bc6
DM
1package PVE::HA::CRM;
2
3# Cluster Resource Manager
f25a336a
DM
4
5use strict;
6use warnings;
7
f25a336a 8use PVE::Tools;
17654a06 9use PVE::HA::Tools;
f25a336a
DM
10
11use PVE::HA::Manager;
12
13# A server can be in one of several states:
f25a336a
DM
14
# States accepted by set_local_status(). Each key is a state name, each value
# a human-readable description of that state.
my $valid_states = {
    wait_for_quorum   => "cluster is not quorate, waiting",
    master            => "quorate, and we got the ha_manager lock",
    lost_manager_lock => "we lost the ha_manager lock (watchdog active)",
    slave             => "quorate, but we do not own the ha_manager lock",
};
21
# Constructor.
#
# Parameters:
#   $this  - class name, or an existing object whose class is reused
#   $haenv - HA environment object, stored under the 'haenv' key
#
# Returns the new blessed object. The object starts with the placeholder
# status 'startup' and is then handed to set_local_status() with the state
# 'wait_for_quorum' before being returned.
sub new {
    my ($this, $haenv) = @_;

    my $class = ref($this) || $this;

    my %fields = (
        haenv                => $haenv,
        manager              => undef,
        status               => { state => 'startup' },
        cluster_state_update => 0,
    );
    my $self = bless \%fields, $class;

    $self->set_local_status({ state => 'wait_for_quorum' });

    return $self;
}
38
b6044542
DM
# Record that a shutdown was requested.
#
# Logs an 'info' message the first time it is called (while the
# 'shutdown_request' flag is still unset) and then sets the flag to 1.
sub shutdown_request {
    my ($self) = @_;

    if (!$self->{shutdown_request}) {
        $self->{haenv}->log('info', "server received shutdown request");
    }

    $self->{shutdown_request} = 1;
}
47
f25a336a
DM
# Return the current status hash reference (the stored reference itself,
# not a copy).
sub get_local_status {
    my $self = shift;

    return $self->{status};
}
53
# Switch to a new status.
#
# Parameters:
#   $new - hash reference with at least a 'state' key; the state must be one
#          of the keys of $valid_states, otherwise this dies.
#
# Nothing happens if the state is unchanged. Otherwise the change is logged,
# 'state_change_time' is stamped onto $new, and $new becomes the current
# status. Entering 'master' creates a fresh PVE::HA::Manager; entering any
# other state cleans up and drops an existing manager.
sub set_local_status {
    my ($self, $new) = @_;

    die "invalid state '$new->{state}'" if !$valid_states->{$new->{state}};

    my $haenv = $self->{haenv};
    my $current = $self->{status};

    # important: only update if the state really changed
    return if $current->{state} eq $new->{state};

    $haenv->log('info', "status change $current->{state} => $new->{state}");

    $new->{state_change_time} = $haenv->get_time();
    $self->{status} = $new;

    # fixme: do not use extra class
    if ($new->{state} eq 'master') {
        $self->{manager} = PVE::HA::Manager->new($haenv);
    } elsif ($self->{manager}) {
        # fixme: what should we do here?
        $self->{manager}->cleanup();
        $self->{manager} = undef;
    }
}
83
# Try to acquire the ha_manager lock and keep the watchdog in step with it.
#
# On success the watchdog handle stored in 'ha_manager_wd' is updated, or a
# new watchdog is opened and stored if there is none yet, and 1 is returned.
#
# After a failed attempt the loop gives up once more than 5 attempts have
# failed or more than 5 seconds have passed since the start; otherwise it
# sleeps one second and tries again. Returns 0 when giving up.
sub get_protected_ha_manager_lock {
    my ($self) = @_;

    my $haenv = $self->{haenv};

    my $failed_attempts = 0;
    my $started_at = $haenv->get_time();

    while (1) {
        if ($haenv->get_ha_manager_lock()) {
            if (my $wd = $self->{ha_manager_wd}) {
                $haenv->watchdog_update($wd);
            } else {
                $self->{ha_manager_wd} = $haenv->watchdog_open();
            }
            return 1;
        }

        $failed_attempts++;
        last if $failed_attempts > 5;

        my $elapsed = $haenv->get_time() - $started_at;
        last if $elapsed > 5;

        $haenv->sleep(1);
    }

    return 0;
}
114
30b4f397
TL
# Check whether this node may do active CRM work right now.
#
# Parameters:
#   $allow_no_service - when true, skip the check that at least one service
#                       is configured
#
# Returns 1 only if all of the following hold:
#   - the environment reports quorum,
#   - the last cluster state update succeeded ('cluster_state_update'),
#   - the manager status can be read and
#     PVE::HA::Tools::count_fenced_services() returns a false value for its
#     service status and this node's name,
#   - unless $allow_no_service is set, the service config can be read and is
#     not empty.
# Returns 0 in every other case. Read errors are logged with level 'err'.
sub can_get_active {
    my ($self, $allow_no_service) = @_;

    my $haenv = $self->{haenv};

    return 0 if !$haenv->quorate();

    # we may not do any active work with an inconsistent cluster state
    return 0 if !$self->{cluster_state_update};

    my $manager_status = eval { $haenv->read_manager_status() };
    if (my $err = $@) {
        $haenv->log('err', "could not read manager status: $err");
        return 0;
    }
    my $ss = $manager_status->{service_status};
    return 0 if PVE::HA::Tools::count_fenced_services($ss, $haenv->nodename());

    if (!$allow_no_service) {
        my $conf = eval { $haenv->read_service_config() };
        if (my $err = $@) {
            $haenv->log('err', "could not read service config: $err");
            # same falsy value as every other "not possible" path
            return 0;
        }
        # treat a missing config like an empty one instead of dying on the
        # hash dereference
        return 0 if !defined($conf) || !scalar(%{$conf});
    }

    return 1;
}
145
f25a336a
DM
# Run a single pass of the CRM loop.
#
# Calls the environment's loop start hook, stores the result of
# cluster_state_update() in 'cluster_state_update', runs work(), calls the
# loop end hook and returns whatever work() returned.
sub do_one_iteration {
    my $self = shift;

    my $haenv = $self->{haenv};

    $haenv->loop_start_hook();

    # remembered on the object so that later checks can see whether the
    # update succeeded
    $self->{cluster_state_update} = $haenv->cluster_state_update();

    my $work_result = $self->work();

    $haenv->loop_end_hook();

    return $work_result;
}
161
# Perform one round of CRM work: first apply state transitions, then do the
# work belonging to the resulting state.
#
# Transitions (based on can_get_active() and get_protected_ha_manager_lock()):
#   wait_for_quorum   -> master or slave, if we can get active
#   slave             -> master if we can get active and got the lock,
#                        wait_for_quorum if we cannot get active
#   lost_manager_lock -> master if we can get active (services not required)
#                        and got the lock
#   master            -> lost_manager_lock if the lock could not be renewed
#
# Returns 0 when a shutdown was requested and handled in the current state,
# 1 otherwise. Dies if the state is unknown or if the state is 'master' but
# no manager object exists.
sub work {
    my ($self) = @_;

    my $haenv = $self->{haenv};

    my $state = $self->get_local_status()->{state};

    # do state changes first

    if ($state eq 'wait_for_quorum') {

        if ($self->can_get_active()) {
            my $got_lock = $self->get_protected_ha_manager_lock();
            $self->set_local_status({ state => $got_lock ? 'master' : 'slave' });
        }

    } elsif ($state eq 'slave') {

        if (!$self->can_get_active()) {
            $self->set_local_status({ state => 'wait_for_quorum' });
        } elsif ($self->get_protected_ha_manager_lock()) {
            $self->set_local_status({ state => 'master' });
        }

    } elsif ($state eq 'lost_manager_lock') {

        $self->set_local_status({ state => 'master' })
            if $self->can_get_active(1) && $self->get_protected_ha_manager_lock();

    } elsif ($state eq 'master') {

        $self->set_local_status({ state => 'lost_manager_lock' })
            if !$self->get_protected_ha_manager_lock();

    }

    # re-read, the transitions above may have changed the state
    $state = $self->get_local_status()->{state};

    # do work

    if ($state eq 'wait_for_quorum') {

        return 0 if $self->{shutdown_request};

        $haenv->sleep(5);

    } elsif ($state eq 'master') {

        my $manager = $self->{manager};

        die "no manager" if !defined($manager);

        my $round_start = $haenv->get_time();
        my $round_length = 10;

        my $shutdown = 0;

        # do work (max $round_length seconds)
        eval {
            # fixme: set alert timer

            if ($self->{shutdown_request}) {

                if ($self->{ha_manager_wd}) {
                    $haenv->watchdog_close($self->{ha_manager_wd});
                    delete $self->{ha_manager_wd};
                }

                # release the manager lock, so another CRM slave can get it
                # and continue to work without waiting for the lock timeout
                $haenv->log('info', "voluntary release CRM lock");
                if (!$haenv->release_ha_manager_lock()) {
                    $haenv->log('notice', "CRM lock release failed, let the lock timeout");
                }

                $shutdown = 1;

            } elsif ($self->{cluster_state_update}) {

                $manager->manage();

            } else {
                # update failed but we could still renew our lock (cfs restart?),
                # safely skip manage and expect to update just fine next round
                $haenv->log(
                    'notice',
                    "temporary inconsistent cluster state (cfs restart?), skip round",
                );
            }
        };
        if (my $err = $@) {
            $haenv->log('err', "got unexpected error - $err");
        }

        return 0 if $shutdown;

        $haenv->sleep_until($round_start + $round_length);

    } elsif ($state eq 'lost_manager_lock') {

        if ($self->{ha_manager_wd}) {
            $haenv->watchdog_close($self->{ha_manager_wd});
            delete $self->{ha_manager_wd};
        }

        return 0 if $self->{shutdown_request};

        $self->set_local_status({ state => 'wait_for_quorum' });

    } elsif ($state eq 'slave') {

        return 0 if $self->{shutdown_request};

        # wait until we get master

    } else {

        die "got unexpected status '$state'\n";
    }

    return 1;
}
295
2961;