]>
Commit | Line | Data |
---|---|---|
6cd38bc6 DM |
1 | package PVE::HA::CRM; |
2 | ||
3 | # Cluster Resource Manager | |
f25a336a DM |
4 | |
5 | use strict; | |
6 | use warnings; | |
7 | ||
8 | use PVE::SafeSyslog; | |
9 | use PVE::Tools; | |
17654a06 | 10 | use PVE::HA::Tools; |
f25a336a DM |
11 | |
12 | use PVE::HA::Manager; | |
13 | ||
# The CRM is always in exactly one of the states listed below. The values are
# human readable descriptions; set_local_status() only looks at the keys to
# validate a requested state. Note that new() starts from a 'startup'
# placeholder which is not listed here.

my $valid_states = {
    wait_for_quorum => "cluster is not quorate, waiting",
    master => "quorate, and we got the ha_manager lock",
    lost_manager_lock => "we lost the ha_manager lock (watchdog active)",
    slave => "quorate, but we do not own the ha_manager lock",
};
22 | ||
# Constructor.
#
# $this  - class name or an existing object (its class is reused)
# $haenv - HA environment object, stored as $self->{haenv}
#
# The object starts with the placeholder state 'startup' and is switched to
# 'wait_for_quorum' through set_local_status() before it is returned.
sub new {
    my ($this, $haenv) = @_;

    my $class = ref($this) || $this;

    my $self = {
        haenv => $haenv,
        manager => undef,
        status => { state => 'startup' },
    };
    bless $self, $class;

    $self->set_local_status({ state => 'wait_for_quorum' });

    return $self;
}
38 | ||
b6044542 DM |
# Remember that a shutdown was requested. The request is written to syslog
# only the first time; repeated calls just keep the flag set.
sub shutdown_request {
    my ($self) = @_;

    if (!$self->{shutdown_request}) {
        syslog('info' , "server received shutdown request");
    }

    $self->{shutdown_request} = 1;
}
47 | ||
f25a336a DM |
# Return the current status hash reference (the same reference that is stored
# in the object, not a copy).
sub get_local_status {
    my $self = shift;

    my $status = $self->{status};

    return $status;
}
53 | ||
# Switch to a new local status.
#
# $new - hash reference with at least a 'state' key; on a real change it is
#        modified in place (state_change_time is added) and stored as the
#        current status.
#
# Dies if $new->{state} is not a key of $valid_states. Does nothing when the
# state is the same as the current one.
sub set_local_status {
    my ($self, $new) = @_;

    if (!$valid_states->{$new->{state}}) {
        die "invalid state '$new->{state}'";
    }

    my $old = $self->{status};

    # important: only update if the state really changed
    if ($old->{state} eq $new->{state}) {
        return;
    }

    my $env = $self->{haenv};

    $env->log('info', "status change $old->{state} => $new->{state}");

    $new->{state_change_time} = $env->get_time();
    $self->{status} = $new;

    # fixme: do not use extra class
    if ($new->{state} eq 'master') {
        # a manager object only exists while we are master
        $self->{manager} = PVE::HA::Manager->new($env);
    } elsif ($self->{manager}) {
        # fixme: what should we do here?
        $self->{manager}->cleanup();
        $self->{manager} = undef;
    }
}
83 | ||
# Try to get the ha_manager lock and, on success, arm or refresh the
# watchdog that belongs to it.
#
# On success an already open watchdog ($self->{ha_manager_wd}) is updated,
# otherwise a new one is opened and remembered. A failed attempt is retried
# after one second; we give up after the sixth failed attempt, or as soon as
# more than 5 seconds have passed since the first attempt.
#
# Returns 1 if the lock was acquired, 0 otherwise.
sub get_protected_ha_manager_lock {
    my ($self) = @_;

    my $env = $self->{haenv};

    my $failed = 0;
    my $begin = $env->get_time();

    while (1) {

        if ($env->get_ha_manager_lock()) {
            if (!$self->{ha_manager_wd}) {
                my $wfh = $env->watchdog_open();
                $self->{ha_manager_wd} = $wfh;
            } else {
                $env->watchdog_update($self->{ha_manager_wd});
            }
            return 1;
        }

        # limit the number of attempts ...
        $failed++;
        return 0 if $failed > 5;

        # ... and the total time spent (max 5 seconds)
        my $elapsed = $env->get_time() - $begin;
        return 0 if $elapsed > 5;

        $env->sleep(1);
    }

    return 0;
}
114 | ||
49777d09 DM |
# Plain function (not a method).
#
# $manager_status - hash reference; only its 'service_status' entry is used
# $node           - node name passed on to the counting helper
#
# Returns 1 if PVE::HA::Tools::count_fenced_services() reports a true value
# for the given service status and node, 0 otherwise.
sub check_pending_fencing {
    my ($manager_status, $node) = @_;

    my $service_status = $manager_status->{service_status};

    return PVE::HA::Tools::count_fenced_services($service_status, $node) ? 1 : 0;
}
124 | ||
f25a336a DM |
# Run a single pass of the CRM state machine.
#
# The pass has two phases: first the local state is re-evaluated (pending
# fencing, quorum and ownership of the ha_manager lock), then the work that
# belongs to the resulting state is done.
#
# Returns 0 when a shutdown request was honoured in the current state and 1
# otherwise. Dies on an unknown state, or when the state is 'master' but no
# manager object exists.
sub do_one_iteration {
    my ($self) = @_;

    my $haenv = $self->{haenv};

    my $state = $self->get_local_status()->{state};

    # a true $pending_fencing blocks every attempt to take the manager lock
    my $manager_status = $haenv->read_manager_status();
    my $pending_fencing = check_pending_fencing($manager_status, $haenv->nodename());

    # phase 1: do state changes first

    if ($state eq 'master') {

        # re-check the lock; on success this also refreshes the watchdog
        if (!$self->get_protected_ha_manager_lock()) {
            $self->set_local_status({ state => 'lost_manager_lock'});
        }

    } elsif ($state eq 'wait_for_quorum' || $state eq 'slave' || $state eq 'lost_manager_lock') {

        if (!$pending_fencing && $haenv->quorate()) {
            if ($self->get_protected_ha_manager_lock()) {
                $self->set_local_status({ state => 'master' });
            } elsif ($state eq 'wait_for_quorum') {
                # quorate, but someone else holds the lock
                $self->set_local_status({ state => 'slave' });
            }
        } elsif ($state eq 'slave') {
            # only a slave falls back when quorum is lost or fencing is pending
            $self->set_local_status({ state => 'wait_for_quorum' });
        }
    }

    # phase 2: do work, based on the (possibly changed) state

    $state = $self->get_local_status()->{state};

    if ($state eq 'wait_for_quorum') {

        return 0 if $self->{shutdown_request};

        $haenv->sleep(5);

        return 1;
    }

    if ($state eq 'slave') {

        return 0 if $self->{shutdown_request};

        # nothing else to do - wait until we get master
        return 1;
    }

    if ($state eq 'lost_manager_lock') {

        # without the lock the watchdog must not stay open
        if ($self->{ha_manager_wd}) {
            $haenv->watchdog_close($self->{ha_manager_wd});
            delete $self->{ha_manager_wd};
        }

        return 0 if $self->{shutdown_request};

        $self->set_local_status({ state => 'wait_for_quorum' });

        return 1;
    }

    if ($state ne 'master') {
        die "got unexpected status '$state'\n";
    }

    # master

    my $manager = $self->{manager};

    die "no manager" if !defined($manager);

    my $start_time = $haenv->get_time();
    my $max_time = 10;
    my $shutdown = 0;

    # do work (max_time seconds)
    eval {
        # fixme: set alert timer

        if (!$self->{shutdown_request}) {
            $manager->manage();
        } else {
            if ($self->{ha_manager_wd}) {
                $haenv->watchdog_close($self->{ha_manager_wd});
                delete $self->{ha_manager_wd};
            }

            # only reached when closing the watchdog did not die
            $shutdown = 1;
        }
    };
    if (my $err = $@) {
        $haenv->log('err', "got unexpected error - $err");
    }

    return 0 if $shutdown;

    # pace the master loop: one pass every $max_time seconds
    $haenv->sleep_until($start_time + $max_time);

    return 1;
}
245 | ||
246 | 1; |