]>
Commit | Line | Data |
---|---|---|
1 | package PVE::HA::CRM; | |
2 | ||
3 | # Cluster Resource Manager | |
4 | ||
5 | use strict; | |
6 | use warnings; | |
7 | ||
8 | use PVE::SafeSyslog; | |
9 | use PVE::Tools; | |
10 | use PVE::HA::Tools; | |
11 | ||
12 | use PVE::HA::Manager; | |
13 | ||
# States that set_local_status() accepts as a target state. The keys are the
# state names; the values are human-readable descriptions of each state.

my $valid_states = {
    slave             => 'quorate, but we do not own the ha_manager lock',
    lost_manager_lock => 'we lost the ha_manager lock (watchgog active)',
    master            => 'quorate, and we got the ha_manager lock',
    wait_for_quorum   => 'cluster is not quorate, waiting',
};
22 | ||
# Constructor.
#
# $this  - a class name, or an existing object whose class is reused
# $haenv - HA environment object; kept in $self->{haenv}
#
# The object starts with the placeholder state 'startup' and is switched to
# 'wait_for_quorum' through set_local_status() before it is returned.
sub new {
    my ($this, $haenv) = @_;

    my $class = ref($this) ? ref($this) : $this;

    my %fields = (
        haenv   => $haenv,
        manager => undef,
        status  => { state => 'startup' },
    );
    my $self = bless \%fields, $class;

    $self->set_local_status({ state => 'wait_for_quorum' });

    return $self;
}
38 | ||
# Record that a shutdown was requested.
#
# Sets $self->{shutdown_request} to 1. The syslog message is written only
# when the flag was not already set, so repeated calls log once.
sub shutdown_request {
    my ($self) = @_;

    if (!$self->{shutdown_request}) {
        syslog('info' , "server received shutdown request");
    }

    return ($self->{shutdown_request} = 1);
}
47 | ||
# Accessor: returns the current status hash reference stored in
# $self->{status} (the same reference, not a copy).
sub get_local_status {
    my $self = shift;

    return $self->{status};
}
53 | ||
# Switch the CRM to a new status.
#
# $new - hash reference with at least a 'state' key; the state must be one of
#        the keys of $valid_states, otherwise this dies.
#
# Nothing happens when the state is unchanged. On a real change the transition
# is logged through the HA environment, $new gets a 'state_change_time' entry
# (the passed hash itself is modified) and becomes $self->{status}.
sub set_local_status {
    my ($self, $new) = @_;

    die "invalid state '$new->{state}'" if !$valid_states->{$new->{state}};

    my $env = $self->{haenv};
    my $previous = $self->{status};

    # important: only update if the state really changed
    return if $previous->{state} eq $new->{state};

    $env->log('info', "status change $previous->{state} => $new->{state}");

    $new->{state_change_time} = $env->get_time();
    $self->{status} = $new;

    # fixme: do not use extra class
    if ($new->{state} eq 'master') {
        # entering 'master' always creates a fresh manager object
        $self->{manager} = PVE::HA::Manager->new($env);
    } elsif ($self->{manager}) {
        # fixme: what should we do here?
        $self->{manager}->cleanup();
        $self->{manager} = undef;
    }
}
83 | ||
# Try to acquire the ha_manager lock and keep the watchdog in step with it.
#
# On success the watchdog handle in $self->{ha_manager_wd} is updated, or a
# new one is opened and stored there if none exists yet, and 1 is returned.
#
# On failure the attempt is repeated after sleeping one second. The loop gives
# up after 6 attempts in total, or once more than 5 seconds (as reported by
# the HA environment's get_time()) have passed since the first attempt, and
# then returns 0.
sub get_protected_ha_manager_lock {
    my ($self) = @_;

    my $haenv = $self->{haenv};

    my $started = $haenv->get_time();
    my $max_attempts = 6;

    for my $attempt (1 .. $max_attempts) {

        if ($haenv->get_ha_manager_lock()) {
            my $wd = $self->{ha_manager_wd};
            if ($wd) {
                $haenv->watchdog_update($wd);
            } else {
                $self->{ha_manager_wd} = $haenv->watchdog_open();
            }
            return 1;
        }

        # no time check and no sleep after the final attempt
        last if $attempt == $max_attempts;

        my $elapsed = $haenv->get_time() - $started;
        last if $elapsed > 5; # for max 5 seconds

        $haenv->sleep(1);
    }

    return 0;
}
114 | ||
# Run one step of the CRM state machine.
#
# The step has two phases:
#   1. decide whether the state has to change, based on quorum and on whether
#      the protected ha_manager lock can be obtained;
#   2. do the work that belongs to the (possibly new) state.
#
# Returns 0 when a shutdown was requested and the current state allows
# stopping ('wait_for_quorum', 'lost_manager_lock', 'slave'); returns 1
# otherwise. Dies on an unknown state or when the state is 'master' but no
# manager object exists.
#
# NOTE(review): the 'master' work branch never looks at shutdown_request, so
# this method keeps returning 1 while we are master - confirm this is intended.
sub do_one_iteration {
    my ($self) = @_;

    my $haenv = $self->{haenv};

    my $state = $self->get_local_status()->{state};

    # phase 1: state changes

    if ($state eq 'master') {

        $self->set_local_status({ state => 'lost_manager_lock'})
            if !$self->get_protected_ha_manager_lock();

    } elsif ($state eq 'wait_for_quorum') {

        if ($haenv->quorate()) {
            my $next = $self->get_protected_ha_manager_lock() ? 'master' : 'slave';
            $self->set_local_status({ state => $next });
        }

    } elsif ($state eq 'slave') {

        if (!$haenv->quorate()) {
            $self->set_local_status({ state => 'wait_for_quorum' });
        } elsif ($self->get_protected_ha_manager_lock()) {
            $self->set_local_status({ state => 'master' });
        }

    } elsif ($state eq 'lost_manager_lock') {

        # the lock is only retried while the cluster is quorate
        if ($haenv->quorate() && $self->get_protected_ha_manager_lock()) {
            $self->set_local_status({ state => 'master' });
        }
    }

    # phase 1 may have changed the state, so read it again
    $state = $self->get_local_status()->{state};

    # phase 2: do work

    if ($state eq 'master') {

        my $manager = $self->{manager};

        die "no manager" if !defined($manager);

        # each master iteration lasts until 10 seconds after its start
        my $deadline = $haenv->get_time() + 10;

        eval {
            # fixme: set alert timer
            $manager->manage();
        };
        my $err = $@;
        $haenv->log('err', "got unexpected error - $err") if $err;

        $haenv->sleep_until($deadline);

    } elsif ($state eq 'wait_for_quorum') {

        return 0 if $self->{shutdown_request};

        $haenv->sleep(5);

    } elsif ($state eq 'lost_manager_lock') {

        # close the watchdog first, then forget the handle
        my $wd = $self->{ha_manager_wd};
        if ($wd) {
            $haenv->watchdog_close($wd);
            delete $self->{ha_manager_wd};
        }

        return 0 if $self->{shutdown_request};

        $self->set_local_status({ state => 'wait_for_quorum' });

    } elsif ($state eq 'slave') {

        return 0 if $self->{shutdown_request};

        # nothing to do: wait until we get master

    } else {

        die "got unexpected status '$state'\n";
    }

    return 1;
}
216 | ||
217 | 1; |