3 # Cluster Resource Manager
# A server can be in one of several states. The keys of this hash are the
# valid state names (checked by set_local_status); the values are
# human-readable descriptions.
my $valid_states = {
    wait_for_quorum => "cluster is not quorate, waiting",
    master => "quorate, and we got the ha_manager lock",
    lost_manager_lock => "we lost the ha_manager lock (watchdog active)",
    slave => "quorate, but we do not own the ha_manager lock",
};
23 my ($this, $haenv) = @_;
25 my $class = ref($this) || $this;
30 status
=> { state => 'startup' },
33 $self->set_local_status({ state => 'wait_for_quorum' });
# Record that a shutdown of this server was requested.
#
# Logs an 'info' message through the environment object the first time it is
# called, then sets the 'shutdown_request' flag on the object. Further calls
# only set the flag again and do not log.
sub shutdown_request {
    my ($self) = @_;

    # log only on the transition from "not requested" to "requested"
    $self->{haenv}->log('info' , "server received shutdown request")
        if !$self->{shutdown_request};

    $self->{shutdown_request} = 1;
}
# Return the current local status, i.e. the hash reference stored under the
# object's 'status' key (the same reference, not a copy).
sub get_local_status {
    my ($self) = @_;

    return $self->{status};
}
# Switch the local status to $new (a hash reference with a 'state' key).
#
# Dies if $new->{state} is not a key of $valid_states. Does nothing when the
# state is unchanged. Otherwise the transition is logged, the current time
# is stored in $new->{state_change_time} (the passed hash is modified), and
# $new becomes the object's status. Entering 'master' creates a
# PVE::HA::Manager instance; entering any other state cleans up and drops
# an existing one.
sub set_local_status {
    my ($self, $new) = @_;

    die "invalid state '$new->{state}'" if !$valid_states->{$new->{state}};

    my $haenv = $self->{haenv};

    my $old = $self->{status};

    # important: only update if the state really changed
    return if $old->{state} eq $new->{state};

    $haenv->log('info', "status change $old->{state} => $new->{state}");

    $new->{state_change_time} = $haenv->get_time();

    $self->{status} = $new;

    # fixme: do not use extra class
    if ($new->{state} eq 'master') {
        $self->{manager} = PVE::HA::Manager->new($haenv);
    } else {
        if ($self->{manager}) {
            # fixme: what should we do here?
            $self->{manager}->cleanup();
            $self->{manager} = undef;
        }
    }
}
83 sub get_protected_ha_manager_lock
{
86 my $haenv = $self->{haenv
};
89 my $starttime = $haenv->get_time();
93 if ($haenv->get_ha_manager_lock()) {
94 if ($self->{ha_manager_wd
}) {
95 $haenv->watchdog_update($self->{ha_manager_wd
});
97 my $wfh = $haenv->watchdog_open();
98 $self->{ha_manager_wd
} = $wfh;
103 last if ++$count > 5; # try max 5 time
105 my $delay = $haenv->get_time() - $starttime;
106 last if $delay > 5; # for max 5 seconds
# Checks for quorum, for the absence of active pending fence jobs, and
# (unless $allow_no_service is set) for configured services.
116 my ($self, $allow_no_service) = @_;
118 my $haenv = $self->{haenv
};
120 return 0 if !$haenv->quorate();
122 my $manager_status = eval { $haenv->read_manager_status() };
124 $haenv->log('err', "could not read manager status: $err");
127 my $ss = $manager_status->{service_status
};
128 return 0 if PVE
::HA
::Tools
::count_fenced_services
($ss, $haenv->nodename());
130 if (!$allow_no_service) {
131 my $conf = eval { $haenv->read_service_config() };
133 $haenv->log('err', "could not read service config: $err");
136 return 0 if !scalar(%{$conf});
# Run a single iteration of the main loop.
#
# Calls the environment's loop_start_hook, then $self->work(), then the
# environment's loop_end_hook, and returns the result of work().
# NOTE(review): the trailing 'return $res' was reconstructed from the
# otherwise unused $res assignment - confirm against the upstream file.
sub do_one_iteration {
    my ($self) = @_;

    my $haenv = $self->{haenv};

    $haenv->loop_start_hook();

    my $res = $self->work();

    $haenv->loop_end_hook();

    return $res;
}
159 my $haenv = $self->{haenv
};
161 my $status = $self->get_local_status();
162 my $state = $status->{state};
164 # do state changes first
166 if ($state eq 'wait_for_quorum') {
168 if ($self->can_get_active()) {
169 if ($self->get_protected_ha_manager_lock()) {
170 $self->set_local_status({ state => 'master' });
172 $self->set_local_status({ state => 'slave' });
176 } elsif ($state eq 'slave') {
178 if ($self->can_get_active()) {
179 if ($self->get_protected_ha_manager_lock()) {
180 $self->set_local_status({ state => 'master' });
183 $self->set_local_status({ state => 'wait_for_quorum' });
186 } elsif ($state eq 'lost_manager_lock') {
188 if ($self->can_get_active(1)) {
189 if ($self->get_protected_ha_manager_lock()) {
190 $self->set_local_status({ state => 'master' });
194 } elsif ($state eq 'master') {
196 if (!$self->get_protected_ha_manager_lock()) {
197 $self->set_local_status({ state => 'lost_manager_lock'});
201 $status = $self->get_local_status();
202 $state = $status->{state};
206 if ($state eq 'wait_for_quorum') {
208 return 0 if $self->{shutdown_request
};
212 } elsif ($state eq 'master') {
214 my $manager = $self->{manager
};
216 die "no manager" if !defined($manager);
218 my $startime = $haenv->get_time();
224 # do work (max_time seconds)
226 # fixme: set alert timer
228 if ($self->{shutdown_request
}) {
230 if ($self->{ha_manager_wd
}) {
231 $haenv->watchdog_close($self->{ha_manager_wd
});
232 delete $self->{ha_manager_wd
};
235 # release the manager lock, so another CRM slave can get it
236 # and continue to work without waiting for the lock timeout
237 $haenv->log('info', "voluntary release CRM lock");
238 if (!$haenv->release_ha_manager_lock()) {
239 $haenv->log('notice', "CRM lock release failed, let the" .
250 $haenv->log('err', "got unexpected error - $err");
253 return 0 if $shutdown;
255 $haenv->sleep_until($startime + $max_time);
257 } elsif ($state eq 'lost_manager_lock') {
259 if ($self->{ha_manager_wd
}) {
260 $haenv->watchdog_close($self->{ha_manager_wd
});
261 delete $self->{ha_manager_wd
};
264 return 0 if $self->{shutdown_request
};
266 $self->set_local_status({ state => 'wait_for_quorum' });
268 } elsif ($state eq 'slave') {
270 return 0 if $self->{shutdown_request
};
272 # wait until we get master
276 die "got unexpected status '$state'\n";