3 # Local Resource Manager
8 use POSIX
qw(:sys_wait_h);
14 # Server can have several states:
17 wait_for_agent_lock
=> "waiting for agent lock",
18 active
=> "got agent_lock",
19 lost_agent_lock
=> "lost agent_lock",
23 my ($this, $haenv) = @_;
25 my $class = ref($this) || $this;
29 status
=> { state => 'startup' },
32 shutdown_request
=> 0,
35 $self->set_local_status({ state => 'wait_for_agent_lock' });
# Records a shutdown request by setting the shutdown_request field of the
# object to 1. do_one_iteration checks that field on each pass.
40 sub shutdown_request
{
43 $self->{shutdown_request
} = 1;
# Returns the status hash reference currently stored in the status field
# (the same reference, not a copy).
46 sub get_local_status
{
49 return $self->{status
};
# Switches the local status to $new, a hash reference with a state key.
#
# Dies with "invalid state ..." unless $valid_states has a true entry for
# the requested state. When the state equals the current one nothing is
# changed (the stored hash and its state_change_time stay as they are).
# Otherwise the transition is logged at info level, $new is stamped with
# state_change_time from $haenv->get_time(), and $new replaces the stored
# status.
52 sub set_local_status
{
53 my ($self, $new) = @_;
55 die "invalid state '$new->{state}'" if !$valid_states->{$new->{state}};
57 my $haenv = $self->{haenv
};
59 my $old = $self->{status
};
61 # important: only update if the state really changed
62 return if $old->{state} eq $new->{state};
64 $haenv->log('info', "status change $old->{state} => $new->{state}");
# Note: this writes into the hash passed in by the caller.
66 $new->{state_change_time
} = $haenv->get_time();
68 $self->{status
} = $new;
# Tries to obtain the HA agent lock through $haenv->get_ha_agent_lock() and
# ties it to a watchdog handle kept in the ha_agent_wd field.
#
# Callers in do_one_iteration treat a true result as "lock is held".
# NOTE(review): the retry loop header, the branch structure around the
# watchdog calls and the return statements are not visible in this excerpt;
# confirm against the full file.
71 sub get_protected_ha_agent_lock
{
74 my $haenv = $self->{haenv
};
# Reference point for the 5 second time limit checked below.
77 my $starttime = $haenv->get_time();
81 if ($haenv->get_ha_agent_lock()) {
# A watchdog handle is already stored: refresh it.
82 if ($self->{ha_agent_wd
}) {
83 $haenv->watchdog_update($self->{ha_agent_wd
});
# Presumably the no-handle branch: open a watchdog and remember the handle.
85 my $wfh = $haenv->watchdog_open();
86 $self->{ha_agent_wd
} = $wfh;
91 last if ++$count > 5; # try max 5 times
93 my $delay = $haenv->get_time() - $starttime;
94 last if $delay > 5; # for max 5 seconds
# Walks the cached service_status hash and filters it down to services that
# are assigned to this node and whose requested state is defined and not
# "stopped".
#
# NOTE(review): the counter and the return statement are not visible in this
# excerpt; callers use the result as a number (compared with 0), so this
# presumably returns how many entries pass the filters.
102 sub active_service_count
{
105 my $haenv = $self->{haenv
};
107 my $nodename = $haenv->nodename();
109 my $ss = $self->{service_status
};
113 foreach my $sid (keys %$ss) {
114 my $sd = $ss->{$sid};
# Skip services without a node, or assigned to another node.
115 next if !$sd->{node
};
116 next if $sd->{node
} ne $nodename;
117 my $req_state = $sd->{state};
# Skip services with no requested state, or requested to be stopped.
118 next if !defined($req_state);
119 next if $req_state eq 'stopped';
# Runs one pass of the main loop.
#
# Step 1: refresh the cached service_status from the manager status and ask
#         PVE::HA::Tools::count_fenced_services about this node.
# Step 2: apply state transitions. Those visible here are
#           wait_for_agent_lock -> active
#               no fence request, non-zero active_service_count, quorate,
#               and the protected agent lock was obtained
#           lost_agent_lock -> active
#               no fence request, quorate, and the lock was obtained
#           active -> lost_agent_lock
#               a fence request is pending, or the lock could not be kept
# Step 3: re-read the state and do the work for it.
#
# Returns 0 on the shutdown paths visible below and dies on an unknown
# state.
# NOTE(review): several lines (eval wrapper, declarations of $shutdown,
# $max_time and $err, the final return) are not visible in this excerpt.
127 sub do_one_iteration
{
130 my $haenv = $self->{haenv
};
132 my $status = $self->get_local_status();
133 my $state = $status->{state};
135 my $ms = $haenv->read_manager_status();
# Fall back to an empty hash when the manager status has no service_status.
136 $self->{service_status
} = $ms->{service_status
} || {};
138 my $fence_request = PVE
::HA
::Tools
::count_fenced_services
($self->{service_status
}, $haenv->nodename());
140 # do state changes first
142 my $ctime = $haenv->get_time();
144 if ($state eq 'wait_for_agent_lock') {
146 my $service_count = $self->active_service_count();
# Only try for the lock when there is something to manage on this node.
148 if (!$fence_request && $service_count && $haenv->quorate()) {
149 if ($self->get_protected_ha_agent_lock()) {
150 $self->set_local_status({ state => 'active' });
154 } elsif ($state eq 'lost_agent_lock') {
# Unlike the wait_for_agent_lock case, no service count check is made here.
156 if (!$fence_request && $haenv->quorate()) {
157 if ($self->get_protected_ha_agent_lock()) {
158 $self->set_local_status({ state => 'active' });
162 } elsif ($state eq 'active') {
164 if ($fence_request) {
165 $haenv->log('err', "node need to be fenced - releasing agent_lock\n");
166 $self->set_local_status({ state => 'lost_agent_lock'});
167 } elsif (!$self->get_protected_ha_agent_lock()) {
168 $self->set_local_status({ state => 'lost_agent_lock'});
# Re-read the status: the transitions above may have changed it.
172 $status = $self->get_local_status();
173 $state = $status->{state};
177 if ($state eq 'wait_for_agent_lock') {
179 return 0 if $self->{shutdown_request
};
183 } elsif ($state eq 'active') {
185 my $startime = $haenv->get_time();
191 # do work (max_time seconds)
193 # fixme: set alert timer
195 if ($self->{shutdown_request
}) {
197 # fixme: request service stop or relocate ?
199 my $service_count = $self->active_service_count();
# Nothing active on this node any more: the watchdog can be released.
201 if ($service_count == 0) {
203 if ($self->{ha_agent_wd
}) {
204 $haenv->watchdog_close($self->{ha_agent_wd
});
205 delete $self->{ha_agent_wd
};
212 $self->manage_resources();
217 $haenv->log('err', "got unexpected error - $err");
220 return 0 if $shutdown;
# Pace the loop: wait until max_time has elapsed since this pass started.
222 $haenv->sleep_until($startime + $max_time);
224 } elsif ($state eq 'lost_agent_lock') {
226 # Note: watchdog is active and will trigger soon!
228 # so we hope to get the lock back soon!
230 if ($self->{shutdown_request
}) {
232 my $service_count = $self->active_service_count();
234 if ($service_count > 0) {
235 $haenv->log('err', "get shutdown request in state 'lost_agent_lock' - " .
236 "detected $service_count running services");
240 # all services are stopped, so we can close the watchdog
242 if ($self->{ha_agent_wd
}) {
243 $haenv->watchdog_close($self->{ha_agent_wd
});
244 delete $self->{ha_agent_wd
};
255 die "got unexpected status '$state'\n";
# Queues and runs resource agent commands for the services of this node.
#
# Phase 1: for every entry of the cached service_status that is assigned to
#          this node and has a defined requested state, queue a command via
#          queue_resource_command.
# Phase 2: for up to 5 seconds, poll check_active_workers and start queued
#          workers while the count stays below $max_workers. When
#          $haenv->can_fork() is true the agent is run in a forked child;
#          otherwise it is run inline and reported straight to
#          resource_command_finished.
#
# NOTE(review): the eval wrappers, the fork call, the declarations of
# $max_workers, $pid, $res and $err, and the child exit handling are not
# visible in this excerpt; confirm against the full file.
262 sub manage_resources
{
265 my $haenv = $self->{haenv
};
267 my $nodename = $haenv->nodename();
269 my $ss = $self->{service_status
};
271 foreach my $sid (keys %$ss) {
272 my $sd = $ss->{$sid};
# Only services assigned to this node with a defined requested state.
273 next if !$sd->{node
};
275 next if $sd->{node
} ne $nodename;
276 my $req_state = $sd->{state};
277 next if !defined($req_state);
279 $self->queue_resource_command($sid, $sd->{uid
}, $req_state, $sd->{target
});
282 $haenv->log('err', "unable to run resource agent for '$sid' - $err"); # fixme
# Reference point for the 5 second budget of the worker loop below.
286 my $starttime = $haenv->get_time();
291 my $sc = $haenv->read_service_config();
293 while (($haenv->get_time() - $starttime) < 5) {
294 my $count = $self->check_active_workers();
296 foreach my $sid (keys %{$self->{workers
}}) {
# Stop starting workers once the parallelism limit is reached.
297 last if $count >= $max_workers;
298 my $w = $self->{workers
}->{$sid};
299 my $cd = $sc->{$sid};
301 $haenv->log('err', "missing resource configuration for '$sid'");
305 if ($haenv->can_fork()) {
307 if (!defined($pid)) {
308 $haenv->log('err', "fork worker failed");
309 $count = 0; last; # abort, try later
# $pid == 0 is the forked child: it runs the resource agent itself.
310 } elsif ($pid == 0) {
314 $res = $haenv->exec_resource_agent($sid, $cd, $w->{state}, $w->{target
});
317 $haenv->log('err', $err);
# Presumably the non-forking path: run the agent inline and report at once.
328 $res = $haenv->exec_resource_agent($sid, $cd, $w->{state}, $w->{target
});
331 $haenv->log('err', $err);
333 $self->resource_command_finished($sid, $w->{uid
}, $res);
344 # fixme: use a queue and limit number of parallel workers?
# Queues a command for service $sid in the workers hash, keyed by $sid.
#
# If an entry for $sid already exists and has a pid (already started), the
# call is a no-op. An entry without a pid is deleted and replaced by the new
# command. The target key is only set when $target is true.
# NOTE(review): the fields of the new entry are not visible in this excerpt;
# other subs read the uid, state and pid keys from these entries.
345 sub queue_resource_command
{
346 my ($self, $sid, $uid, $state, $target) = @_;
348 if (my $w = $self->{workers
}->{$sid}) {
349 return if $w->{pid
}; # already started
350 # else, delete and overwrite queue entry with new command
351 delete $self->{workers
}->{$sid};
354 $self->{workers
}->{$sid} = {
360 $self->{workers
}->{$sid}->{target
} = $target if $target;
# Polls every queued worker that has a pid, without blocking (waitpid with
# WNOHANG). A worker whose child has exited is handed to
# resource_command_finished together with the child wait status.
#
# NOTE(review): the counter and the return statement are not visible in this
# excerpt; manage_resources compares the result with $max_workers, so this
# presumably returns the number of workers still running.
363 sub check_active_workers
{
366 # finish/count workers
368 foreach my $sid (keys %{$self->{workers
}}) {
369 my $w = $self->{workers
}->{$sid};
# Entries without a pid are queued but not started yet.
370 if (my $pid = $w->{pid
}) {
372 my $waitpid = waitpid($pid, WNOHANG
);
# waitpid returns the pid itself once that child has terminated.
373 if (defined($waitpid) && ($waitpid == $pid)) {
374 $self->resource_command_finished($sid, $w->{uid
}, $?);
# Records the outcome of a finished resource agent command.
#
# Parameters: $sid is the service id, $uid the command uid used as result
# key, and $status a wait-style status word (low 7 bits are the signal
# number, the exit code sits above bit 8).
#
# The worker entry for $sid is removed; if there was none the call returns
# without doing anything else. The result is stored in the results hash
# under $uid, then results whose uid no longer appears in the cached
# service_status are dropped and the remaining results are written with
# $haenv->write_lrm_status.
# NOTE(review): the declarations of $exit_code, $valid_uids and $results and
# the first condition of the status check are not visible in this excerpt.
384 sub resource_command_finished
{
385 my ($self, $sid, $uid, $status) = @_;
387 my $haenv = $self->{haenv
};
389 my $w = delete $self->{workers
}->{$sid};
390 return if !$w; # should not happen
395 $haenv->log('err', "resource agent $sid finished - failed to execute");
# Low 7 bits of the status hold the terminating signal number, if any.
396 } elsif (my $sig = ($status & 127)) {
397 $haenv->log('err', "resource agent $sid finished - got signal $sig");
399 $exit_code = ($status >> 8);
402 $self->{results
}->{$uid} = {
404 state => $w->{state},
405 exit_code
=> $exit_code,
408 my $ss = $self->{service_status
};
410 # compute hash of valid/existing uids
412 foreach my $sid (keys %$ss) {
413 my $sd = $ss->{$sid};
415 $valid_uids->{$sd->{uid
}} = 1;
# Keep only results whose uid is still referenced by a known service.
419 foreach my $id (keys %{$self->{results
}}) {
420 next if !$valid_uids->{$id};
421 $results->{$id} = $self->{results
}->{$id};
423 $self->{results
} = $results;
425 $haenv->write_lrm_status($results);