]> git.proxmox.com Git - pve-ha-manager.git/blob - src/PVE/HA/CRM.pm
crm: simply wait if there is no resource config
[pve-ha-manager.git] / src / PVE / HA / CRM.pm
1 package PVE::HA::CRM;
2
3 # Cluster Resource Manager
4
5 use strict;
6 use warnings;
7
8 use PVE::SafeSyslog;
9 use PVE::Tools;
10 use PVE::HA::Tools;
11
12 use PVE::HA::Manager;
13
# The CRM is always in exactly one of the states below. Only the states
# listed here are accepted by set_local_status(); each value is a short
# human readable description of what the state means.
#
# Note: 'startup' (the placeholder assigned in new()) is not listed here,
# so it cannot be entered through set_local_status().

my $valid_states = {
    wait_for_quorum => "cluster is not quorate, waiting",
    master => "quorate, and we got the ha_manager lock",
    lost_manager_lock => "we lost the ha_manager lock (watchdog active)",
    slave => "quorate, but we do not own the ha_manager lock",
};
22
# Constructor.
#
# $this  - class name, or an existing object whose class is reused
# $haenv - HA environment object; stored and used for all later
#          interaction with the cluster (logging, time, locks, ...)
#
# The object starts with the placeholder state 'startup' and is switched
# to 'wait_for_quorum' through set_local_status() before it is returned.
sub new {
    my ($this, $haenv) = @_;

    my $class = ref($this) || $this;

    my $self = {
        haenv => $haenv,
        manager => undef,
        status => { state => 'startup' },
    };
    bless $self, $class;

    $self->set_local_status({ state => 'wait_for_quorum' });

    return $self;
}
38
# Flag this CRM instance for shutdown.
#
# The request is written to syslog only the first time it is received;
# repeated calls just keep the flag set. do_one_iteration() reads the
# flag to decide when to stop.
sub shutdown_request {
    my ($self) = @_;

    if (!$self->{shutdown_request}) {
        syslog('info' , "server received shutdown request");
    }

    $self->{shutdown_request} = 1;
}
47
# Return the current status hash reference (the same reference that is
# stored in the object, not a copy). Its 'state' key holds the state name.
sub get_local_status {
    my $self = shift;

    my $status = $self->{status};

    return $status;
}
53
# Switch the CRM to a new state.
#
# $new - hash reference with at least a 'state' key; it is stored as the
#        new status and gets a 'state_change_time' entry added.
#
# Dies if the requested state is not listed in $valid_states. Does nothing
# (and does not touch the state change time) when the state is unchanged.
# Entering 'master' creates a fresh PVE::HA::Manager; entering any other
# state cleans up and drops an existing manager.
sub set_local_status {
    my ($self, $new) = @_;

    die "invalid state '$new->{state}'" if !$valid_states->{$new->{state}};

    my $previous = $self->{status};

    # important: only update if the state really changed
    return if $new->{state} eq $previous->{state};

    my $haenv = $self->{haenv};

    $haenv->log('info', "status change $previous->{state} => $new->{state}");

    $new->{state_change_time} = $haenv->get_time();

    $self->{status} = $new;

    # fixme: do not use extra class
    if ($new->{state} eq 'master') {
        $self->{manager} = PVE::HA::Manager->new($haenv);
    } elsif (my $manager = $self->{manager}) {
        # fixme: what should we do here?
        $manager->cleanup();
        $self->{manager} = undef;
    }
}
83
# Try to acquire (or renew) the ha_manager lock and keep the watchdog that
# protects it alive.
#
# On success the watchdog handle in $self->{ha_manager_wd} is updated, or
# opened and stored there if no handle exists yet, and 1 is returned.
#
# After a failed attempt the method sleeps one second and tries again. It
# gives up and returns 0 as soon as the failure counter exceeds 5 (that is,
# after the sixth failed attempt) or more than 5 seconds have passed since
# the first attempt, whichever happens first.
sub get_protected_ha_manager_lock {
    my ($self) = @_;

    my $haenv = $self->{haenv};

    my $failed_tries = 0;
    my $began = $haenv->get_time();

    while (1) {

        if ($haenv->get_ha_manager_lock()) {
            my $wd = $self->{ha_manager_wd};
            if ($wd) {
                $haenv->watchdog_update($wd);
            } else {
                $self->{ha_manager_wd} = $haenv->watchdog_open();
            }
            return 1;
        }

        # limit the number of attempts ...
        $failed_tries++;
        return 0 if $failed_tries > 5;

        # ... and the total time spent trying
        my $elapsed = $haenv->get_time() - $began;
        return 0 if $elapsed > 5;

        $haenv->sleep(1);
    }
}
114
# Plain function (not a method): tell whether fencing is still pending
# for $node.
#
# $manager_status - manager status hash; only its 'service_status' entry
#                   is looked at
# $node           - node name to check
#
# Returns 1 if PVE::HA::Tools::count_fenced_services() yields a true value
# for that service status and node, 0 otherwise.
sub check_pending_fencing {
    my ($manager_status, $node) = @_;

    my $fenced = PVE::HA::Tools::count_fenced_services($manager_status->{service_status}, $node);

    return $fenced ? 1 : 0;
}
124
# Run one pass of the CRM loop.
#
# The pass has two phases:
#   1. state transitions, decided from pending fencing, quorum, existence
#      of the resource configuration and ownership of the ha_manager lock;
#   2. the work that belongs to the (possibly new) state.
#
# Returns 0 when a pending shutdown request was honoured during this pass
# and 1 otherwise (presumably the caller keeps iterating while the result
# is true - confirm against the caller).
#
# Dies if the state is 'master' but no manager object exists, or if the
# state is not one of the four handled states.
sub do_one_iteration {
    my ($self) = @_;

    my $haenv = $self->{haenv};

    my $state = $self->get_local_status()->{state};

    my $manager_status = $haenv->read_manager_status();
    my $pending_fencing = check_pending_fencing($manager_status, $haenv->nodename());

    # close and forget the watchdog handle, if we hold one
    my $release_watchdog = sub {
        return if !$self->{ha_manager_wd};
        $haenv->watchdog_close($self->{ha_manager_wd});
        delete $self->{ha_manager_wd};
    };

    # phase 1: do state changes first

    if ($state eq 'master') {

        # renewing the lock failed, so we are no longer the master
        $self->set_local_status({ state => 'lost_manager_lock' })
            if !$self->get_protected_ha_manager_lock();

    } elsif ($state eq 'lost_manager_lock') {

        if (!$pending_fencing && $haenv->quorate()) {
            $self->set_local_status({ state => 'master' })
                if $self->get_protected_ha_manager_lock();
        }

    } elsif ($state eq 'wait_for_quorum' || $state eq 'slave') {

        # we only compete for the lock if we are not about to be fenced,
        # the cluster is quorate and there is a resource configuration
        my $may_compete = !$pending_fencing
            && $haenv->quorate()
            && $haenv->service_config_exists();

        if ($may_compete) {
            if ($self->get_protected_ha_manager_lock()) {
                $self->set_local_status({ state => 'master' });
            } elsif ($state eq 'wait_for_quorum') {
                $self->set_local_status({ state => 'slave' });
            }
        } elsif ($state eq 'slave') {
            $self->set_local_status({ state => 'wait_for_quorum' });
        }
    }

    # phase 2: do the work of the state we are in now

    $state = $self->get_local_status()->{state};

    if ($state eq 'master') {

        my $manager = $self->{manager};

        die "no manager" if !defined($manager);

        my $work_started = $haenv->get_time();

        # one master pass is padded to this many seconds
        my $max_time = 10;

        my $shutdown = 0;

        eval {
            # fixme: set alert timer

            if ($self->{shutdown_request}) {
                # $shutdown is only set once the watchdog was released
                $release_watchdog->();
                $shutdown = 1;
            } else {
                $manager->manage();
            }
        };
        if (my $err = $@) {
            $haenv->log('err', "got unexpected error - $err");
        }

        return 0 if $shutdown;

        $haenv->sleep_until($work_started + $max_time);

    } elsif ($state eq 'lost_manager_lock') {

        $release_watchdog->();

        return 0 if $self->{shutdown_request};

        $self->set_local_status({ state => 'wait_for_quorum' });

    } elsif ($state eq 'wait_for_quorum') {

        return 0 if $self->{shutdown_request};

        $haenv->sleep(5);

    } elsif ($state eq 'slave') {

        return 0 if $self->{shutdown_request};

        # nothing to do, wait until we get master

    } else {

        die "got unexpected status '$state'\n";
    }

    return 1;
}
247
248 1;