# Cluster Resource Manager

# Server can have several states:
my $valid_states = {
    wait_for_quorum => "cluster is not quorate, waiting",
    master => "quorate, and we got the ha_manager lock",
    lost_manager_lock => "we lost the ha_manager lock (watchdog active)",
    slave => "quorate, but we do not own the ha_manager lock",
};
# Constructor: bind a CRM instance to the given HA environment object.
# The instance starts in the internal 'startup' state and immediately
# transitions to 'wait_for_quorum' via set_local_status().
sub new {
    my ($this, $haenv) = @_;

    my $class = ref($this) || $this;

    # NOTE(review): the original hash initializer was truncated in this copy;
    # only the keys referenced elsewhere in the file are reconstructed here —
    # verify against the pristine original.
    my $self = bless {
        haenv => $haenv,
        status => { state => 'startup' },
        cluster_state_update => 0,
    }, $class;

    $self->set_local_status({ state => 'wait_for_quorum' });

    return $self;
}
# Request a graceful CRM shutdown; idempotent. The actual shutdown is
# performed by work() once the state machine notices the flag.
sub shutdown_request {
    my ($self) = @_;

    # log only on the first request to avoid repeated messages
    $self->{haenv}->log('info', "server received shutdown request")
        if !$self->{shutdown_request};

    $self->{shutdown_request} = 1;
}
# Return the local status hash (at least: { state => ..., and, after any
# state change, state_change_time => ... }).
sub get_local_status {
    my ($self) = @_;

    return $self->{status};
}
# Switch the local state machine to $new ({ state => ... }): validate the
# state name, stamp the change time, and create/clean up the
# PVE::HA::Manager instance when we gain/lose the 'master' role.
sub set_local_status {
    my ($self, $new) = @_;

    die "invalid state '$new->{state}'" if !$valid_states->{$new->{state}};

    my $haenv = $self->{haenv};

    my $old = $self->{status};

    # important: only update if it really changed
    return if $old->{state} eq $new->{state};

    $haenv->log('info', "status change $old->{state} => $new->{state}");

    $new->{state_change_time} = $haenv->get_time();

    $self->{status} = $new;

    # fixme: do not use extra class
    if ($new->{state} eq 'master') {
        $self->{manager} = PVE::HA::Manager->new($haenv);
    } else {
        if ($self->{manager}) {
            # fixme: what should we do here?
            $self->{manager}->cleanup();
            $self->{manager} = undef;
        }
    }
}
# Try to acquire (or renew) the cluster-wide ha_manager lock and keep the
# watchdog armed/updated while we hold it. Retries for at most 5 attempts
# and at most ~5 seconds. Returns 1 on success, 0 otherwise.
sub get_protected_ha_manager_lock {
    my ($self) = @_;

    my $haenv = $self->{haenv};

    my $count = 0;
    my $starttime = $haenv->get_time();

    for (;;) {

        if ($haenv->get_ha_manager_lock()) {
            if ($self->{ha_manager_wd}) {
                # we already hold a watchdog - just refresh it
                $haenv->watchdog_update($self->{ha_manager_wd});
            } else {
                # first successful acquisition - arm the watchdog
                my $wfh = $haenv->watchdog_open();
                $self->{ha_manager_wd} = $wfh;
            }
            return 1;
        }

        last if ++$count > 5; # try max 5 times

        my $delay = $haenv->get_time() - $starttime;
        last if $delay > 5; # for max 5 seconds

        # NOTE(review): the retry pause was lost in this mangled copy;
        # presumably a 1-second sleep between attempts - confirm upstream.
        $haenv->sleep(1);
    }

    return 0;
}
# checks quorum, for no active pending fence jobs and if services are configured
# Returns 1 if this node may become (or stay) the active CRM, 0 otherwise.
# $allow_no_service skips the "any service configured?" check.
sub can_get_active {
    my ($self, $allow_no_service) = @_;

    my $haenv = $self->{haenv};

    return 0 if !$haenv->quorate();

    # we may not do any active work with an inconsistent cluster state
    return 0 if !$self->{cluster_state_update};

    my $manager_status = eval { $haenv->read_manager_status() };
    if (my $err = $@) {
        $haenv->log('err', "could not read manager status: $err");
        return 0;
    }

    # never go active while one of our services is still being fenced
    my $ss = $manager_status->{service_status};
    return 0 if PVE::HA::Tools::count_fenced_services($ss, $haenv->nodename());

    if (!$allow_no_service) {
        my $conf = eval { $haenv->read_service_config() };
        if (my $err = $@) {
            $haenv->log('err', "could not read service config: $err");
            return 0;
        }
        return 0 if !scalar(%{$conf});
    }

    return 1;
}
# Run one main-loop iteration: refresh the cached cluster-state flag and
# delegate to work(). Returns work()'s result (0 => terminate daemon loop).
sub do_one_iteration {
    my ($self) = @_;

    my $haenv = $self->{haenv};

    $haenv->loop_start_hook();

    # cache whether the cluster state could be updated this round; work()
    # and can_get_active() consult this flag before doing active work
    $self->{cluster_state_update} = $haenv->cluster_state_update();

    my $res = $self->work();

    $haenv->loop_end_hook();

    return $res;
}
# One step of the CRM state machine. First handle state *transitions*
# (wait_for_quorum/slave <-> master, master -> lost_manager_lock), then do
# the *work* for the resulting state. Returns 0 to request daemon shutdown,
# 1 otherwise.
# NOTE(review): several lines of this sub were lost in the mangled copy
# (sub header, sleep interval, eval boundaries, $max_time); reconstructed
# parts are marked below - verify against the pristine original.
sub work {
    my ($self) = @_;

    my $haenv = $self->{haenv};

    my $status = $self->get_local_status();
    my $state = $status->{state};

    # do state changes first

    if ($state eq 'wait_for_quorum') {

        if ($self->can_get_active()) {
            if ($self->get_protected_ha_manager_lock()) {
                $self->set_local_status({ state => 'master' });
            } else {
                $self->set_local_status({ state => 'slave' });
            }
        }

    } elsif ($state eq 'slave') {

        if ($self->can_get_active()) {
            if ($self->get_protected_ha_manager_lock()) {
                $self->set_local_status({ state => 'master' });
            }
        } else {
            $self->set_local_status({ state => 'wait_for_quorum' });
        }

    } elsif ($state eq 'lost_manager_lock') {

        # allow_no_service: we still hold manager/watchdog state here
        if ($self->can_get_active(1)) {
            if ($self->get_protected_ha_manager_lock()) {
                $self->set_local_status({ state => 'master' });
            }
        }

    } elsif ($state eq 'master') {

        if (!$self->get_protected_ha_manager_lock()) {
            $self->set_local_status({ state => 'lost_manager_lock'});
        }
    }

    $status = $self->get_local_status();
    $state = $status->{state};

    # do work for the (possibly new) state

    if ($state eq 'wait_for_quorum') {

        return 0 if $self->{shutdown_request};

        $haenv->sleep(5); # reconstructed pause - TODO confirm interval

    } elsif ($state eq 'master') {

        my $manager = $self->{manager};

        die "no manager" if !defined($manager);

        my $startime = $haenv->get_time();

        my $max_time = 10; # reconstructed - TODO confirm round length

        my $shutdown = 0;

        # do work (max_time seconds)
        eval {
            # fixme: set alert timer

            if ($self->{shutdown_request}) {

                if ($self->{ha_manager_wd}) {
                    $haenv->watchdog_close($self->{ha_manager_wd});
                    delete $self->{ha_manager_wd};
                }

                # release the manager lock, so another CRM slave can get it
                # and continue to work without waiting for the lock timeout
                $haenv->log('info', "voluntary release CRM lock");
                if (!$haenv->release_ha_manager_lock()) {
                    $haenv->log('notice', "CRM lock release failed, let the" .
                        " lock timeout");
                }

                $shutdown = 1;

            } else {
                if (!$self->{cluster_state_update}) {
                    # update failed but we could still renew our lock (cfs restart?),
                    # safely skip manage and expect to update just fine next round
                    $haenv->log('notice', "temporary inconsistent cluster state " .
                        "(cfs restart?), skip round");
                } else {
                    $manager->manage();
                }
            }
        };
        if (my $err = $@) {
            $haenv->log('err', "got unexpected error - $err");
        }

        return 0 if $shutdown;

        $haenv->sleep_until($startime + $max_time);

    } elsif ($state eq 'lost_manager_lock') {

        if ($self->{ha_manager_wd}) {
            $haenv->watchdog_close($self->{ha_manager_wd});
            delete $self->{ha_manager_wd};
        }

        return 0 if $self->{shutdown_request};

        $self->set_local_status({ state => 'wait_for_quorum' });

    } elsif ($state eq 'slave') {

        return 0 if $self->{shutdown_request};

        # wait until we get master

    } else {

        die "got unexpected status '$state'\n";
    }

    return 1;
}