]> git.proxmox.com Git - pve-ha-manager.git/blob - src/PVE/HA/CRM.pm
do not do active work if cfs update failed
[pve-ha-manager.git] / src / PVE / HA / CRM.pm
package PVE::HA::CRM;

# Cluster Resource Manager
#
# Runs the CRM state machine: the node holding the shared 'ha_manager'
# lock becomes the active master and drives a PVE::HA::Manager
# instance; all other quorate nodes wait as slaves.

use strict;
use warnings;

use PVE::Tools;
use PVE::HA::Tools;

use PVE::HA::Manager;

# The server can be in one of several states:
my $valid_states = {
    wait_for_quorum => "cluster is not quorate, waiting",
    master => "quorate, and we got the ha_manager lock",
    lost_manager_lock => "we lost the ha_manager lock (watchdog active)",
    slave => "quorate, but we do not own the ha_manager lock",
};
21
# Constructor. Takes the HA environment object and returns a CRM
# instance that starts out waiting for quorum.
sub new {
    my ($this, $haenv) = @_;

    # allow invocation on either the class name or an existing instance
    my $class = ref($this) || $this;

    my %init = (
        haenv => $haenv,
        manager => undef, # created lazily on promotion to 'master'
        status => { state => 'startup' },
        cluster_state_update => 0, # refreshed every iteration
    );
    my $self = bless { %init }, $class;

    # transition from the synthetic 'startup' state to the real
    # initial state (also stamps state_change_time)
    $self->set_local_status({ state => 'wait_for_quorum' });

    return $self;
}
38
# Record a shutdown request. Idempotent: only the first request is
# logged, repeated calls are silent.
sub shutdown_request {
    my ($self) = @_;

    if (!$self->{shutdown_request}) {
        $self->{haenv}->log('info' , "server received shutdown request");
    }

    $self->{shutdown_request} = 1;
}
47
# Accessor: return the current local status hash
# ({ state => ..., state_change_time => ... }).
sub get_local_status {
    my ($self) = @_;
    return $self->{status};
}
53
# Transition to a new local state. No-op when the state is unchanged;
# otherwise logs the change, stamps $new->{state_change_time}, and
# manages the PVE::HA::Manager lifecycle: created on promotion to
# 'master', cleaned up and dropped on any transition away from it.
# Dies on a state name not listed in $valid_states.
sub set_local_status {
    my ($self, $new) = @_;

    die "invalid state '$new->{state}'" if !$valid_states->{$new->{state}};

    my $haenv = $self->{haenv};

    my $old = $self->{status};

    # important: only update if it really changed
    return if $old->{state} eq $new->{state};

    $haenv->log('info', "status change $old->{state} => $new->{state}");

    $new->{state_change_time} = $haenv->get_time();

    $self->{status} = $new;

    # fixme: do not use extra class
    if ($new->{state} eq 'master') {
        $self->{manager} = PVE::HA::Manager->new($haenv);
    } else {
        if ($self->{manager}) {
            # fixme: what should we do here?
            $self->{manager}->cleanup();
            $self->{manager} = undef;
        }
    }
}
83
# Try to acquire the cluster-wide ha_manager lock, retrying at most 5
# times and for at most ~5 seconds. On success the software watchdog
# is opened (first acquisition) or fed (renewal), so holding the lock
# always implies an active watchdog. Returns 1 on success, 0 on
# failure.
sub get_protected_ha_manager_lock {
    my ($self) = @_;

    my $haenv = $self->{haenv};

    my $tries = 0;
    my $starttime = $haenv->get_time();

    while (1) {

        if ($haenv->get_ha_manager_lock()) {
            # lock held - keep the watchdog alive
            if (my $wd = $self->{ha_manager_wd}) {
                $haenv->watchdog_update($wd);
            } else {
                $self->{ha_manager_wd} = $haenv->watchdog_open();
            }
            return 1;
        }

        $tries++;
        last if $tries > 5; # retry at most 5 times

        last if $haenv->get_time() - $starttime > 5; # for max 5 seconds

        $haenv->sleep(1);
    }

    return 0;
}
114
# Check whether this node is allowed to do active (master) work:
# requires quorum, a consistent cluster state view, no pending fence
# jobs for the local node, and - unless $allow_no_service is set - at
# least one configured service. Returns 1 if active work is allowed,
# 0 otherwise.
sub can_get_active {
    my ($self, $allow_no_service) = @_;

    my $haenv = $self->{haenv};

    return 0 if !$haenv->quorate();

    # we may not do any active work with an inconsistent cluster state
    return 0 if !$self->{cluster_state_update};

    my $manager_status = eval { $haenv->read_manager_status() };
    if (my $err = $@) {
        $haenv->log('err', "could not read manager status: $err");
        return 0;
    }
    my $ss = $manager_status->{service_status};
    return 0 if PVE::HA::Tools::count_fenced_services($ss, $haenv->nodename());

    if (!$allow_no_service) {
        my $conf = eval { $haenv->read_service_config() };
        if (my $err = $@) {
            $haenv->log('err', "could not read service config: $err");
            # return 0, not undef: keeps the error paths consistent and
            # avoids the one-element (true-ish) list that a bare
            # 'return undef' produces in list context
            return 0;
        }
        return 0 if !scalar(%{$conf});
    }

    return 1;
}
145
# Execute one CRM loop iteration: run the environment's loop hooks
# around a cluster-state refresh and one work() step. Returns work()'s
# result (0 = stop the CRM loop, 1 = keep going).
sub do_one_iteration {
    my ($self) = @_;

    my $haenv = $self->{haenv};

    $haenv->loop_start_hook();

    # refresh the consistency flag consumed by can_get_active()/work()
    $self->{cluster_state_update} = $haenv->cluster_state_update();

    my $result = $self->work();

    $haenv->loop_end_hook();

    return $result;
}
161
# Run one step of the CRM state machine. Phase 1 handles transitions
# (acquiring or losing the ha_manager lock); phase 2 performs the work
# for the resulting state. Returns 0 when the CRM should exit
# (shutdown completed), 1 to continue looping.
sub work {
    my ($self) = @_;

    my $haenv = $self->{haenv};

    my $status = $self->get_local_status();
    my $state = $status->{state};

    # do state changes first

    if ($state eq 'wait_for_quorum') {

        if ($self->can_get_active()) {
            if ($self->get_protected_ha_manager_lock()) {
                $self->set_local_status({ state => 'master' });
            } else {
                $self->set_local_status({ state => 'slave' });
            }
        }

    } elsif ($state eq 'slave') {

        if ($self->can_get_active()) {
            if ($self->get_protected_ha_manager_lock()) {
                $self->set_local_status({ state => 'master' });
            }
        } else {
            $self->set_local_status({ state => 'wait_for_quorum' });
        }

    } elsif ($state eq 'lost_manager_lock') {

        # try to regain activity; pass allow_no_service=1 so we can
        # recover the lock even with an empty service config
        if ($self->can_get_active(1)) {
            if ($self->get_protected_ha_manager_lock()) {
                $self->set_local_status({ state => 'master' });
            }
        }

    } elsif ($state eq 'master') {

        # renewing the lock also feeds the watchdog (see
        # get_protected_ha_manager_lock)
        if (!$self->get_protected_ha_manager_lock()) {
            $self->set_local_status({ state => 'lost_manager_lock'});
        }
    }

    # re-read: the transitions above may have changed the state
    $status = $self->get_local_status();
    $state = $status->{state};

    # do work

    if ($state eq 'wait_for_quorum') {

        return 0 if $self->{shutdown_request};

        $haenv->sleep(5);

    } elsif ($state eq 'master') {

        my $manager = $self->{manager};

        # set_local_status('master') created the manager instance
        die "no manager" if !defined($manager);

        my $startime = $haenv->get_time();

        my $max_time = 10;

        my $shutdown = 0;

        # do work (max_time seconds)
        eval {
            # fixme: set alert timer

            if ($self->{shutdown_request}) {

                # close the watchdog before giving up the lock so a
                # clean shutdown cannot trigger a self-fence
                if ($self->{ha_manager_wd}) {
                    $haenv->watchdog_close($self->{ha_manager_wd});
                    delete $self->{ha_manager_wd};
                }

                # release the manager lock, so another CRM slave can get it
                # and continue to work without waiting for the lock timeout
                $haenv->log('info', "voluntary release CRM lock");
                if (!$haenv->release_ha_manager_lock()) {
                    $haenv->log('notice', "CRM lock release failed, let the" .
                                " lock timeout");
                }

                $shutdown = 1;

            } else {
                if (!$self->{cluster_state_update}) {
                    # update failed but we could still renew our lock (cfs restart?),
                    # safely skip manage and expect to update just fine next round
                    $haenv->log('notice', "temporary inconsistent cluster state " .
                                "(cfs restart?), skip round");
                    return;
                }

                $manager->manage();
            }
        };
        if (my $err = $@) {
            $haenv->log('err', "got unexpected error - $err");
        }

        return 0 if $shutdown;

        # pace the loop: one master iteration per $max_time seconds
        $haenv->sleep_until($startime + $max_time);

    } elsif ($state eq 'lost_manager_lock') {

        # without the lock we must not keep the watchdog armed
        if ($self->{ha_manager_wd}) {
            $haenv->watchdog_close($self->{ha_manager_wd});
            delete $self->{ha_manager_wd};
        }

        return 0 if $self->{shutdown_request};

        $self->set_local_status({ state => 'wait_for_quorum' });

    } elsif ($state eq 'slave') {

        return 0 if $self->{shutdown_request};

        # wait until we get master

    } else {

        die "got unexpected status '$state'\n";
    }

    return 1;
}

1; # module must end with a true value