]>
Commit | Line | Data |
---|---|---|
6cd38bc6 DM |
1 | package PVE::HA::CRM; |
2 | ||
3 | # Cluster Resource Manager | |
f25a336a DM |
4 | |
5 | use strict; | |
6 | use warnings; | |
7 | ||
f25a336a | 8 | use PVE::Tools; |
17654a06 | 9 | use PVE::HA::Tools; |
f25a336a DM |
10 | |
11 | use PVE::HA::Manager; | |
12 | ||
13 | # Server can have several states: | |
f25a336a DM |
14 | |
# Maps each state name accepted by set_local_status() to a human readable
# description; set_local_status() dies for any state that has no entry here.
15 | my $valid_states = { | |
b6044542 DM |
16 | wait_for_quorum => "cluster is not quorate, waiting", |
17 | master => "quorate, and we got the ha_manager lock", | |
959d7afe | 18 | lost_manager_lock => "we lost the ha_manager lock (watchdog active)", |
b6044542 | 19 | slave => "quorate, but we do not own the ha_manager lock", |
f25a336a DM |
20 | }; |
21 | ||
# Constructor: builds a CRM object around the given HA environment.
#
# Parameters:
#   $this  - class name, or an existing instance whose class is reused
#   $haenv - HA environment object, stored as $self->{haenv}
#
# The object starts with the placeholder status 'startup', no manager and a
# cleared cluster_state_update flag. It is then switched to 'wait_for_quorum'
# through set_local_status() before being returned.
sub new {
    my ($this, $haenv) = @_;

    # works both as Class->new(...) and as $instance->new(...)
    my $class = ref($this) || $this;

    my %fields = (
        haenv => $haenv,
        manager => undef,
        status => { state => 'startup' },
        cluster_state_update => 0,
    );

    my $self = bless \%fields, $class;

    $self->set_local_status({ state => 'wait_for_quorum' });

    return $self;
}
38 | ||
b6044542 DM |
# Records that a shutdown was requested.
#
# The request is logged at 'info' level only the first time, i.e. while
# $self->{shutdown_request} is still false; the flag is set to 1 on every call.
sub shutdown_request {
    my ($self) = @_;

    unless ($self->{shutdown_request}) {
        $self->{haenv}->log('info', "server received shutdown request");
    }

    $self->{shutdown_request} = 1;
}
47 | ||
f25a336a DM |
# Accessor: returns the current status hash reference ($self->{status}).
# The reference itself is handed out, not a copy.
sub get_local_status {
    my $self = shift;

    my $status = $self->{status};

    return $status;
}
53 | ||
# Switches the local CRM status to $new (a hash ref with at least a 'state'
# key).
#
# Dies if $new->{state} has no entry in $valid_states. Does nothing when the
# state is unchanged. Otherwise the transition is logged, the current time is
# stored in $new->{state_change_time} (the passed hash is modified and kept as
# $self->{status}), and the manager object is created or torn down to match
# the new state.
sub set_local_status {
    my ($self, $new) = @_;

    die "invalid state '$new->{state}'" if !$valid_states->{$new->{state}};

    my $haenv = $self->{haenv};
    my $previous = $self->{status};

    # important: only update if the state really changed
    if ($previous->{state} eq $new->{state}) {
        return;
    }

    $haenv->log('info', "status change $previous->{state} => $new->{state}");

    $new->{state_change_time} = $haenv->get_time();
    $self->{status} = $new;

    # fixme: do not use extra class
    if ($new->{state} eq 'master') {
        # entering master always creates a fresh manager instance
        $self->{manager} = PVE::HA::Manager->new($haenv);
    } elsif ($self->{manager}) {
        # leaving master: release the manager we still hold
        # fixme: what should we do here?
        $self->{manager}->cleanup();
        $self->{manager} = undef;
    }
}
83 | ||
# Tries to obtain the ha_manager lock and couples it with a watchdog.
#
# On success the watchdog stored in $self->{ha_manager_wd} is updated, or a new
# one is opened and stored if none exists yet, and 1 is returned.
#
# On failure the attempt is repeated after $haenv->sleep(1). The loop gives up
# and returns 0 once the failed-attempt counter exceeds 5 or once the time
# reported by $haenv->get_time() has advanced by more than 5 since the call
# started. Because the counter is checked after the attempt, up to six lock
# attempts can be made.
sub get_protected_ha_manager_lock {
    my ($self) = @_;

    my $haenv = $self->{haenv};

    my $failed_attempts = 0;
    my $began = $haenv->get_time();

    while (1) {
        if ($haenv->get_ha_manager_lock()) {
            if (my $watchdog = $self->{ha_manager_wd}) {
                $haenv->watchdog_update($watchdog);
            } else {
                $self->{ha_manager_wd} = $haenv->watchdog_open();
            }
            return 1;
        }

        # bounded by number of failed attempts ...
        $failed_attempts++;
        last if $failed_attempts > 5;

        # ... and by elapsed time
        my $elapsed = $haenv->get_time() - $began;
        last if $elapsed > 5;

        $haenv->sleep(1);
    }

    return 0;
}
114 | ||
30b4f397 TL |
115 | # checks quorum, for no active pending fence jobs and if services are configured |
# Decides whether this CRM may do active work. Returns 1 if so, 0 otherwise.
#
# Parameters:
#   $allow_no_service - when true, the check for configured services is
#                       skipped
#
# 0 is returned when:
#   - the environment reports no quorum,
#   - the last cluster state update ($self->{cluster_state_update}) failed,
#   - the manager status cannot be read (logged at 'err' level),
#   - PVE::HA::Tools::count_fenced_services() returns a true value for the
#     local node name,
#   - the service config cannot be read (logged at 'err' level) or contains
#     no entries, unless $allow_no_service is set.
#
# Every negative outcome is reported as 0. Earlier the failed service config
# read returned undef, which differed from all other paths and would have been
# a one-element (true) list in list context.
sub can_get_active {
    my ($self, $allow_no_service) = @_;

    my $haenv = $self->{haenv};

    return 0 if !$haenv->quorate();

    # we may not do any active work with an inconsistent cluster state
    return 0 if !$self->{cluster_state_update};

    my $manager_status = eval { $haenv->read_manager_status() };
    if (my $err = $@) {
        $haenv->log('err', "could not read manager status: $err");
        return 0;
    }
    my $ss = $manager_status->{service_status};
    return 0 if PVE::HA::Tools::count_fenced_services($ss, $haenv->nodename());

    if (!$allow_no_service) {
        my $conf = eval { $haenv->read_service_config() };
        if (my $err = $@) {
            $haenv->log('err', "could not read service config: $err");
            return 0;
        }
        return 0 if !scalar(%{$conf});
    }

    return 1;
}
145 | ||
f25a336a DM |
# Runs a single CRM loop iteration and returns whatever work() returned.
#
# Sequence: loop_start_hook(), refresh of $self->{cluster_state_update} from
# $haenv->cluster_state_update(), work(), loop_end_hook().
sub do_one_iteration {
    my ($self) = @_;

    my $haenv = $self->{haenv};

    $haenv->loop_start_hook();

    # remember whether the cluster state could be refreshed; can_get_active()
    # and work() consult this flag
    my $update_ok = $haenv->cluster_state_update();
    $self->{cluster_state_update} = $update_ok;

    my $work_result = $self->work();

    $haenv->loop_end_hook();

    return $work_result;
}
161 | ||
# Performs one round of CRM work in two phases.
#
# Phase 1 decides on a state transition, based on the current local state,
# can_get_active() and get_protected_ha_manager_lock().
# Phase 2 re-reads the (possibly changed) state and does the work for it.
#
# Returns 0 when a shutdown was requested and the current state allows
# stopping (for 'master' after closing the watchdog and trying to release the
# manager lock), 1 otherwise. Dies on an unknown state, or if the state is
# 'master' but no manager object exists.
sub work {
    my ($self) = @_;

    my $haenv = $self->{haenv};

    my $state = $self->get_local_status()->{state};

    # phase 1: state changes

    if ($state eq 'wait_for_quorum') {

        if ($self->can_get_active()) {
            my $next = $self->get_protected_ha_manager_lock() ? 'master' : 'slave';
            $self->set_local_status({ state => $next });
        }

    } elsif ($state eq 'slave') {

        if (!$self->can_get_active()) {
            $self->set_local_status({ state => 'wait_for_quorum' });
        } elsif ($self->get_protected_ha_manager_lock()) {
            $self->set_local_status({ state => 'master' });
        }

    } elsif ($state eq 'lost_manager_lock') {

        # services need not be configured to regain the lock here
        if ($self->can_get_active(1) && $self->get_protected_ha_manager_lock()) {
            $self->set_local_status({ state => 'master' });
        }

    } elsif ($state eq 'master') {

        # the lock (and watchdog) must be renewed every round
        $self->set_local_status({ state => 'lost_manager_lock' })
            if !$self->get_protected_ha_manager_lock();
    }

    # the state may have changed above, so fetch it again
    $state = $self->get_local_status()->{state};

    # phase 2: do work

    if ($state eq 'wait_for_quorum') {

        return 0 if $self->{shutdown_request};

        $haenv->sleep(5);

    } elsif ($state eq 'master') {

        my $manager = $self->{manager};

        die "no manager" if !defined($manager);

        my $round_start = $haenv->get_time();

        my $max_time = 10;

        my $shutdown = 0;

        # do work (max_time seconds)
        eval {
            # fixme: set alert timer

            if ($self->{shutdown_request}) {

                if ($self->{ha_manager_wd}) {
                    $haenv->watchdog_close($self->{ha_manager_wd});
                    delete $self->{ha_manager_wd};
                }

                # release the manager lock, so another CRM slave can get it
                # and continue to work without waiting for the lock timeout
                $haenv->log('info', "voluntary release CRM lock");
                if (!$haenv->release_ha_manager_lock()) {
                    $haenv->log('notice', "CRM lock release failed, let the lock timeout");
                }

                $shutdown = 1;

            } elsif (!$self->{cluster_state_update}) {

                # update failed but we could still renew our lock (cfs restart?),
                # safely skip manage and expect to update just fine next round
                $haenv->log('notice', "temporary inconsistent cluster state (cfs restart?), skip round");

            } else {

                $manager->manage();
            }
        };
        if (my $err = $@) {
            $haenv->log('err', "got unexpected error - $err");
        }

        return 0 if $shutdown;

        # pad the round so that it ends max_time after it started
        $haenv->sleep_until($round_start + $max_time);

    } elsif ($state eq 'lost_manager_lock') {

        if ($self->{ha_manager_wd}) {
            $haenv->watchdog_close($self->{ha_manager_wd});
            delete $self->{ha_manager_wd};
        }

        return 0 if $self->{shutdown_request};

        $self->set_local_status({ state => 'wait_for_quorum' });

    } elsif ($state eq 'slave') {

        return 0 if $self->{shutdown_request};

        # wait until we get master

    } else {

        die "got unexpected status '$state'\n";
    }

    return 1;
}
295 | ||
296 | 1; |