]>
Commit | Line | Data |
---|---|---|
6cd38bc6 DM |
1 | package PVE::HA::CRM; |
2 | ||
3 | # Cluster Resource Manager | |
f25a336a DM |
4 | |
5 | use strict; | |
6 | use warnings; | |
7 | ||
8 | use PVE::SafeSyslog; | |
9 | use PVE::Tools; | |
17654a06 | 10 | use PVE::HA::Tools; |
f25a336a DM |
11 | |
12 | use PVE::HA::Manager; | |
13 | ||
14 | # A server can be in one of several states: | |
f25a336a DM |
15 | |
# All states set_local_status() accepts as a target, each mapped to a short
# human readable description.
my $valid_states = {
    wait_for_quorum   => "cluster is not quorate, waiting",
    master            => "quorate, and we got the ha_manager lock",
    lost_manager_lock => "we lost the ha_manager lock (watchdog active)",
    slave             => "quorate, but we do not own the ha_manager lock",
};
22 | ||
# Constructor.
#
# $this  - class name, or an existing object whose class is reused
# $haenv - HA environment object, stored under the 'haenv' key
#
# The object starts with the placeholder state 'startup' and no manager, and
# is then switched to 'wait_for_quorum' through set_local_status().
# Returns the new blessed object.
sub new {
    my ($this, $haenv) = @_;

    # works as class method and as instance method
    my $class = ref($this) || $this;

    my $self = {
        haenv => $haenv,
        manager => undef,
        status => { state => 'startup' },
    };
    bless $self, $class;

    $self->set_local_status({ state => 'wait_for_quorum' });

    return $self;
}
38 | ||
b6044542 DM |
# Flag this CRM instance for shutdown.
#
# Only the first request is written to syslog; further requests just keep the
# 'shutdown_request' flag set.
sub shutdown_request {
    my ($self) = @_;

    if (!$self->{shutdown_request}) {
        syslog('info' , "server received shutdown request");
    }

    $self->{shutdown_request} = 1;
}
47 | ||
f25a336a DM |
# Return the current local status, i.e. the hash reference stored under the
# 'status' key (not a copy).
sub get_local_status {
    my ($self) = @_;

    my $status = $self->{status};

    return $status;
}
53 | ||
# Switch the local CRM status to $new, a hash reference with a 'state' key.
#
# Dies if the requested state is not listed in $valid_states. A request for
# the state we are already in is ignored. Otherwise the change is logged
# through the HA environment, $new gets a 'state_change_time' stamp and
# replaces the stored status. Entering 'master' creates a new
# PVE::HA::Manager; entering any other state cleans up and drops an existing
# manager.
sub set_local_status {
    my ($self, $new) = @_;

    die "invalid state '$new->{state}'" if !$valid_states->{$new->{state}};

    my $haenv = $self->{haenv};
    my $current = $self->{status};

    # important: only act when the state really changes
    if ($current->{state} eq $new->{state}) {
        return;
    }

    $haenv->log('info', "status change $current->{state} => $new->{state}");

    $new->{state_change_time} = $haenv->get_time();
    $self->{status} = $new;

    # fixme: do not use extra class
    if ($new->{state} eq 'master') {
        $self->{manager} = PVE::HA::Manager->new($haenv);
    } elsif ($self->{manager}) {
        # fixme: what should we do here?
        $self->{manager}->cleanup();
        $self->{manager} = undef;
    }
}
83 | ||
# Try to get the ha_manager lock and protect it with a watchdog.
#
# On success an already open watchdog (stored in $self->{ha_manager_wd}) is
# updated, otherwise a new one is opened and remembered there.
#
# A failed attempt is retried after sleeping 1 via the HA environment. We
# give up once more than 5 attempts have failed (so at most 5 retries) or
# once more than 5 seconds passed since the first attempt.
#
# Returns 1 if we hold the lock, 0 otherwise.
sub get_protected_ha_manager_lock {
    my ($self) = @_;

    my $haenv = $self->{haenv};

    my $failed = 0;
    my $begin = $haenv->get_time();

    while (!$haenv->get_ha_manager_lock()) {
        return 0 if ++$failed > 5;

        my $elapsed = $haenv->get_time() - $begin;
        return 0 if $elapsed > 5;

        $haenv->sleep(1);
    }

    # lock acquired - keep the watchdog alive, or arm it on first success
    if ($self->{ha_manager_wd}) {
        $haenv->watchdog_update($self->{ha_manager_wd});
    } else {
        $self->{ha_manager_wd} = $haenv->watchdog_open();
    }

    return 1;
}
114 | ||
49777d09 DM |
# Plain function (not a method): tell whether $node still has fenced services.
#
# $manager_status - manager status hash; only its 'service_status' entry is
#                   looked at
# $node           - node name passed on to the counting helper
#
# Returns 1 if PVE::HA::Tools::count_fenced_services() yields a true value
# for that node, 0 otherwise.
sub check_pending_fencing {
    my ($manager_status, $node) = @_;

    my $service_status = $manager_status->{service_status};
    my $fenced = PVE::HA::Tools::count_fenced_services($service_status, $node);

    return $fenced ? 1 : 0;
}
124 | ||
f25a336a DM |
# Run one round of the CRM state machine.
#
# A round has two phases. First the local state is re-evaluated, based on
# pending fencing for this node, quorum, configured services and the
# ha_manager lock. Then the work belonging to the resulting state is done.
#
# Returns 0 when a shutdown was requested and the current state allows to
# leave, 1 otherwise. Dies on an unknown state, or when we are master but
# have no manager object.
sub do_one_iteration {
    my ($self) = @_;

    my $haenv = $self->{haenv};

    my $state = $self->get_local_status()->{state};

    my $manager_status = $haenv->read_manager_status();
    my $pending_fencing = check_pending_fencing($manager_status, $haenv->nodename());

    # precondition for trying to become master from wait_for_quorum/slave
    my $may_become_master = sub {
        return !$pending_fencing
            && $haenv->quorate()
            && PVE::HA::Tools::has_services($haenv);
    };

    # close and forget the watchdog protecting the manager lock, if any
    my $close_watchdog = sub {
        return if !$self->{ha_manager_wd};
        $haenv->watchdog_close($self->{ha_manager_wd});
        delete $self->{ha_manager_wd};
    };

    # phase 1: state changes

    if ($state eq 'master') {

        # keep (and watchdog-protect) the lock, else step down
        if (!$self->get_protected_ha_manager_lock()) {
            $self->set_local_status({ state => 'lost_manager_lock'});
        }

    } elsif ($state eq 'lost_manager_lock') {

        if (!$pending_fencing && $haenv->quorate() &&
            $self->get_protected_ha_manager_lock()) {
            $self->set_local_status({ state => 'master' });
        }

    } elsif ($state eq 'slave') {

        if (!$may_become_master->()) {
            $self->set_local_status({ state => 'wait_for_quorum' });
        } elsif ($self->get_protected_ha_manager_lock()) {
            $self->set_local_status({ state => 'master' });
        }

    } elsif ($state eq 'wait_for_quorum') {

        if ($may_become_master->()) {
            my $got_lock = $self->get_protected_ha_manager_lock();
            $self->set_local_status({ state => $got_lock ? 'master' : 'slave' });
        }
    }

    # phase 2: do the work of the (possibly new) state

    $state = $self->get_local_status()->{state};

    if ($state eq 'wait_for_quorum') {

        return 0 if $self->{shutdown_request};

        $haenv->sleep(5);

    } elsif ($state eq 'slave') {

        return 0 if $self->{shutdown_request};

        # nothing else to do, wait until we get master

    } elsif ($state eq 'lost_manager_lock') {

        $close_watchdog->();

        return 0 if $self->{shutdown_request};

        $self->set_local_status({ state => 'wait_for_quorum' });

    } elsif ($state eq 'master') {

        my $manager = $self->{manager};

        die "no manager" if !defined($manager);

        my $round_start = $haenv->get_time();
        my $round_duration = 10;

        my $released = 0;

        # do work; errors are logged and do not end the round
        eval {
            # fixme: set alert timer

            if (!$self->{shutdown_request}) {
                $manager->manage();
            } else {
                $close_watchdog->();

                # release the manager lock, so another CRM slave can get it
                # and continue to work without waiting for the lock timeout
                $haenv->log('info', "voluntary release CRM lock");
                if (!$haenv->release_ha_manager_lock()) {
                    $haenv->log('notice', "CRM lock release failed, let the lock timeout");
                }

                $released = 1;
            }
        };
        if (my $err = $@) {
            $haenv->log('err', "got unexpected error - $err");
        }

        return 0 if $released;

        # a master round always lasts until $round_duration after its start
        $haenv->sleep_until($round_start + $round_duration);

    } else {

        die "got unexpected status '$state'\n";
    }

    return 1;
}
255 | ||
256 | 1; |