]> git.proxmox.com Git - pve-ha-manager.git/blob - src/PVE/HA/CRM.pm
21a0acccb850078b82da8637f416595381db2e25
[pve-ha-manager.git] / src / PVE / HA / CRM.pm
1 package PVE::HA::CRM;
2
3 # Cluster Resource Manager
4
5 use strict;
6 use warnings;
7
8 use PVE::Tools;
9 use PVE::HA::Tools;
10
11 use PVE::HA::Manager;
12
# States accepted by set_local_status(), each mapped to a short
# human-readable description. Any other state name is rejected there.
my $valid_states = {
    master            => 'quorate, and we got the ha_manager lock',
    slave             => 'quorate, but we do not own the ha_manager lock',
    lost_manager_lock => 'we lost the ha_manager lock (watchdog active)',
    wait_for_quorum   => 'cluster is not quorate, waiting',
};
21
# Constructor.
#
# Parameters:
#   $this  - class name, or an existing object whose class is reused
#   $haenv - environment object; stored and used for all later calls
#
# The object starts with the placeholder state 'startup' and no manager,
# then immediately switches to 'wait_for_quorum' via set_local_status().
#
# Returns the new blessed object.
sub new {
    my ($this, $haenv) = @_;

    my $class = ref($this) || $this;

    my %fields = (
        haenv   => $haenv,
        manager => undef,
        status  => { state => 'startup' },
    );
    my $self = bless \%fields, $class;

    $self->set_local_status({ state => 'wait_for_quorum' });

    return $self;
}
37
# Flag this object as having been asked to shut down.
#
# The first call logs an 'info' message through the environment; repeated
# calls stay silent. The flag is read by work(). The value of the final
# assignment (1) is what the sub returns.
sub shutdown_request {
    my ($self) = @_;

    unless ($self->{shutdown_request}) {
        my $env = $self->{haenv};
        $env->log('info', 'server received shutdown request');
    }

    $self->{shutdown_request} = 1;
}
46
# Return the current status hash reference (the stored one, not a copy).
sub get_local_status {
    my $self = shift;

    return $self->{status};
}
52
# Switch to a new local status.
#
# Parameters:
#   $new - hash reference with at least a 'state' key; the state must be
#          one of the keys of $valid_states, otherwise this dies.
#
# Nothing happens when the state is unchanged. On a real change the
# transition is logged, $new gets a 'state_change_time' stamp from the
# environment clock, and $new replaces the stored status. Entering 'master'
# creates a fresh PVE::HA::Manager; entering any other state cleans up and
# drops an existing manager.
sub set_local_status {
    my ($self, $new) = @_;

    die "invalid state '$new->{state}'" if !$valid_states->{$new->{state}};

    my $haenv = $self->{haenv};
    my $old = $self->{status};

    # important: only update if the state really changed
    if ($old->{state} eq $new->{state}) {
        return;
    }

    $haenv->log('info', "status change $old->{state} => $new->{state}");

    $new->{state_change_time} = $haenv->get_time();
    $self->{status} = $new;

    # fixme: do not use extra class
    if ($new->{state} eq 'master') {
        $self->{manager} = PVE::HA::Manager->new($haenv);
    } elsif ($self->{manager}) {
        # fixme: what should we do here?
        $self->{manager}->cleanup();
        $self->{manager} = undef;
    }
}
82
# Try to obtain the ha_manager lock and keep the watchdog in step with it.
#
# Each time the lock is obtained, an already open watchdog handle
# ($self->{ha_manager_wd}) is updated; if there is none yet, one is opened
# and stored.
#
# Failed attempts are retried with a one-second sleep in between. The loop
# gives up after the sixth failed attempt, or as soon as more than 5 seconds
# (by the environment clock) have passed since the call started.
#
# Returns 1 when the lock was obtained, 0 otherwise.
sub get_protected_ha_manager_lock {
    my ($self) = @_;

    my $haenv = $self->{haenv};

    my $failures = 0;
    my $began_at = $haenv->get_time();

    while (1) {
        if ($haenv->get_ha_manager_lock()) {
            if (!$self->{ha_manager_wd}) {
                $self->{ha_manager_wd} = $haenv->watchdog_open();
            } else {
                $haenv->watchdog_update($self->{ha_manager_wd});
            }
            return 1;
        }

        $failures++;
        last if $failures > 5;

        my $elapsed = $haenv->get_time() - $began_at;
        last if $elapsed > 5;

        $haenv->sleep(1);
    }

    return 0;
}
113
114 # checks quorum, for no active pending fence jobs and if services are configured
# Decide whether this node may currently try to take the active role.
#
# Parameters:
#   $allow_no_service - when true, skip the check that the service
#                       configuration contains at least one entry
#
# Checks, in order:
#   - the environment reports quorum,
#   - the manager status can be read,
#   - PVE::HA::Tools::count_fenced_services() returns a false value for the
#     service status and the local node name,
#   - unless $allow_no_service: the service config can be read and is
#     not empty.
#
# Read errors are logged at 'err' level and count as "not allowed".
#
# Returns 1 when every check passes, 0 otherwise. Every failure path
# returns a plain 0, so the result is false in list context as well.
sub can_get_active {
    my ($self, $allow_no_service) = @_;

    my $haenv = $self->{haenv};

    return 0 if !$haenv->quorate();

    my $manager_status = eval { $haenv->read_manager_status() };
    if (my $err = $@) {
        $haenv->log('err', "could not read manager status: $err");
        return 0;
    }
    my $ss = $manager_status->{service_status};
    return 0 if PVE::HA::Tools::count_fenced_services($ss, $haenv->nodename());

    if (!$allow_no_service) {
        my $conf = eval { $haenv->read_service_config() };
        if (my $err = $@) {
            $haenv->log('err', "could not read service config: $err");
            # was 'return undef', which is a one-element (true) list in
            # list context and inconsistent with the error path above
            return 0;
        }
        # an undefined config is treated like an empty one instead of
        # dying on the dereference
        return 0 if !$conf || !%{$conf};
    }

    return 1;
}
141
# Run a single pass of work(), bracketed by the environment's loop start
# and loop end hooks.
#
# Returns whatever work() returned.
sub do_one_iteration {
    my $self = shift;

    my $env = $self->{haenv};

    $env->loop_start_hook();
    my $outcome = $self->work();
    $env->loop_end_hook();

    return $outcome;
}
155
# Perform one round of the CRM state machine.
#
# The round has two phases:
#   1. state transitions, driven by can_get_active() and
#      get_protected_ha_manager_lock();
#   2. the work belonging to the state that results from phase 1.
#
# Returns 0 when the caller should stop (a shutdown was requested and the
# current state allows leaving), 1 otherwise. Dies on a state that is not
# handled here, or when the state is 'master' but no manager object exists.
sub work {
    my ($self) = @_;

    my $haenv = $self->{haenv};

    # phase 1: do state changes first
    my $state = $self->get_local_status()->{state};

    if ($state eq 'master') {
        # we must be able to refresh the lock, otherwise we lost it
        $self->set_local_status({ state => 'lost_manager_lock' })
            if !$self->get_protected_ha_manager_lock();
    } elsif ($state eq 'lost_manager_lock') {
        # here an empty service configuration is acceptable
        $self->set_local_status({ state => 'master' })
            if $self->can_get_active(1) && $self->get_protected_ha_manager_lock();
    } elsif ($state eq 'slave') {
        if (!$self->can_get_active()) {
            $self->set_local_status({ state => 'wait_for_quorum' });
        } elsif ($self->get_protected_ha_manager_lock()) {
            $self->set_local_status({ state => 'master' });
        }
    } elsif ($state eq 'wait_for_quorum') {
        if ($self->can_get_active()) {
            my $next = $self->get_protected_ha_manager_lock() ? 'master' : 'slave';
            $self->set_local_status({ state => $next });
        }
    }

    # phase 2: do the work for the (possibly new) state
    $state = $self->get_local_status()->{state};

    if ($state eq 'wait_for_quorum') {
        return 0 if $self->{shutdown_request};

        $haenv->sleep(5);

        return 1;
    }

    if ($state eq 'slave') {
        # nothing to do but wait until we get master
        return $self->{shutdown_request} ? 0 : 1;
    }

    if ($state eq 'lost_manager_lock') {
        if (my $wd = $self->{ha_manager_wd}) {
            $haenv->watchdog_close($wd);
            delete $self->{ha_manager_wd};
        }

        return 0 if $self->{shutdown_request};

        $self->set_local_status({ state => 'wait_for_quorum' });

        return 1;
    }

    if ($state eq 'master') {
        my $manager = $self->{manager};

        die "no manager" if !defined($manager);

        my $round_start = $haenv->get_time();
        my $round_length = 10;    # seconds per master round
        my $leaving = 0;

        # errors from either branch are logged below, not propagated
        eval {
            # fixme: set alert timer

            if (!$self->{shutdown_request}) {
                $manager->manage();
            } else {
                if ($self->{ha_manager_wd}) {
                    $haenv->watchdog_close($self->{ha_manager_wd});
                    delete $self->{ha_manager_wd};
                }

                # release the manager lock, so another CRM slave can get it
                # and continue to work without waiting for the lock timeout
                $haenv->log('info', "voluntary release CRM lock");
                $haenv->log('notice', "CRM lock release failed, let the lock timeout")
                    if !$haenv->release_ha_manager_lock();

                $leaving = 1;
            }
        };
        if (my $err = $@) {
            $haenv->log('err', "got unexpected error - $err");
        }

        return 0 if $leaving;

        $haenv->sleep_until($round_start + $round_length);

        return 1;
    }

    die "got unexpected status '$state'\n";
}
281
282 1;