3 # Local Resource Manager
8 use POSIX
qw(:sys_wait_h);
14 # Server can have several states:
# Each entry maps a state name to a human-readable description.
# NOTE(review): presumably these are the entries of the $valid_states table
# that set_local_status checks new states against - confirm.
17 wait_for_agent_lock
=> "waiting for agent lock",
18 active
=> "got agent_lock",
19 lost_agent_lock
=> "lost agent_lock",
# Constructor arguments: the invocant and the HA environment object.
23 my ($this, $haenv) = @_;
# Allow the invocant to be either a class name or an existing object.
25 my $class = ref($this) || $this;
# Initial object fields: the status starts in state 'startup' and no
# shutdown has been requested yet.
29 status
=> { state => 'startup' },
32 shutdown_request
=> 0,
# Switch from 'startup' to the first regular state through
# set_local_status, so the new state is validated and the change is logged.
35 $self->set_local_status({ state => 'wait_for_agent_lock' });
# Mark this object as having received a shutdown request by setting its
# shutdown_request field to 1. do_one_iteration reads this field.
40 sub shutdown_request
{
43 $self->{shutdown_request
} = 1;
# Return the current status hash reference stored in the object's
# 'status' field (its 'state' key holds the current state name).
46 sub get_local_status
{
49 return $self->{status
};
# Replace the current status with $new.
#
# $new is a hash reference with at least a 'state' key. The call dies when
# that state has no true entry in $valid_states. If the state is the same
# as the current one nothing happens; otherwise the change is logged,
# $new gets a 'state_change_time' stamp and becomes the stored status.
52 sub set_local_status
{
53 my ($self, $new) = @_;
# Reject states that are not listed in $valid_states.
55 die "invalid state '$new->{state}'" if !$valid_states->{$new->{state}};
57 my $haenv = $self->{haenv
};
59 my $old = $self->{status
};
61 # important: only update if the state really changed
62 return if $old->{state} eq $new->{state};
64 $haenv->log('info', "status change $old->{state} => $new->{state}");
# Note: this writes into the hash passed by the caller.
66 $new->{state_change_time
} = $haenv->get_time();
68 $self->{status
} = $new;
# Try to obtain the HA agent lock from the environment and keep the
# watchdog handle stored in the object's 'ha_agent_wd' field in step with it.
# The attempts are bounded both by a retry counter and by elapsed time.
# NOTE(review): the return value is not visible here; callers in
# do_one_iteration use it as a boolean "lock held" - confirm.
71 sub get_protected_ha_agent_lock
{
74 my $haenv = $self->{haenv
};
# Reference point for the elapsed-time limit checked below.
77 my $starttime = $haenv->get_time();
81 if ($haenv->get_ha_agent_lock()) {
# A watchdog handle is already stored: update it.
82 if ($self->{ha_agent_wd
}) {
83 $haenv->watchdog_update($self->{ha_agent_wd
});
# Open a watchdog and remember its handle
# (presumably only when none was stored yet - confirm).
85 my $wfh = $haenv->watchdog_open();
86 $self->{ha_agent_wd
} = $wfh;
91 last if ++$count > 5; # try max 5 times
93 my $delay = $haenv->get_time() - $starttime;
94 last if $delay > 5; # for max 5 seconds
# Inspect the cached service status (the object's 'service_status' field)
# for services assigned to this node whose requested state is 'fence'.
# NOTE(review): what is done with matching entries and what is returned is
# not visible here; presumably a count, which do_one_iteration uses as a
# boolean - confirm.
102 sub fenced_service_count
{
105 my $haenv = $self->{haenv
};
107 my $nodename = $haenv->nodename();
109 my $ss = $self->{service_status
};
113 foreach my $sid (keys %$ss) {
114 my $sd = $ss->{$sid};
# Skip services without a node or assigned to a different node.
115 next if !$sd->{node
};
116 next if $sd->{node
} ne $nodename;
117 my $req_state = $sd->{state};
# Skip services that have no requested state.
118 next if !defined($req_state);
119 if ($req_state eq 'fence') {
# Run one cycle of the state machine.
#
# First the manager status is read and cached, then the state transition
# for the current state is evaluated, and finally the work belonging to the
# (possibly new) state is done. Dies on a state it does not know.
# NOTE(review): 'return 0' presumably tells the caller to stop iterating;
# the value returned on the normal path is not visible here - confirm.
128 sub do_one_iteration
{
131 my $haenv = $self->{haenv
};
133 my $status = $self->get_local_status();
134 my $state = $status->{state};
# Cache the service status from the manager; fall back to an empty hash.
136 my $ms = $haenv->read_manager_status();
137 $self->{service_status
} = $ms->{service_status
} || {};
139 my $fence_request = $self->fenced_service_count();
141 # do state changes first
143 my $ctime = $haenv->get_time();
145 if ($state eq 'wait_for_agent_lock') {
147 my $service_count = 1; # todo: correctly compute
# Become active only without a fence request, with services to manage,
# with quorum, and once the agent lock could be obtained.
149 if (!$fence_request && $service_count && $haenv->quorate()) {
150 if ($self->get_protected_ha_agent_lock()) {
151 $self->set_local_status({ state => 'active' });
155 } elsif ($state eq 'lost_agent_lock') {
# Try to get back to 'active' when there is no fence request and we have quorum.
157 if (!$fence_request && $haenv->quorate()) {
158 if ($self->get_protected_ha_agent_lock()) {
159 $self->set_local_status({ state => 'active' });
163 } elsif ($state eq 'active') {
# Leave 'active' on a fence request or when the lock cannot be obtained.
165 if ($fence_request) {
# NOTE(review): unlike the other log calls in this file, this message ends with a newline.
166 $haenv->log('err', "node need to be fenced - releasing agent_lock\n");
167 $self->set_local_status({ state => 'lost_agent_lock'});
168 } elsif (!$self->get_protected_ha_agent_lock()) {
169 $self->set_local_status({ state => 'lost_agent_lock'});
# Re-read the state, it may have been changed by the transitions above.
173 $status = $self->get_local_status();
174 $state = $status->{state};
178 if ($state eq 'wait_for_agent_lock') {
# Nothing is held in this state, so a shutdown request can be honoured right away.
180 return 0 if $self->{shutdown_request
};
184 } elsif ($state eq 'active') {
# Start of this work cycle; used for the sleep_until call below.
186 my $startime = $haenv->get_time();
192 # do work (max_time seconds)
194 # fixme: set alert timer
196 if ($self->{shutdown_request
}) {
198 # fixme: request service stop or relocate ?
200 my $service_count = 0; # fixme
202 if ($service_count == 0) {
# Close the watchdog, if one is open, and forget its handle.
204 if ($self->{ha_agent_wd
}) {
205 $haenv->watchdog_close($self->{ha_agent_wd
});
206 delete $self->{ha_agent_wd
};
213 $self->manage_resources();
218 $haenv->log('err', "got unexpected error - $err");
221 return 0 if $shutdown;
# Wait until max_time seconds after the start of this work cycle.
223 $haenv->sleep_until($startime + $max_time);
225 } elsif ($state eq 'lost_agent_lock') {
227 # Note: watchdog is active and will trigger soon!
229 # so we hope to get the lock back soon!
231 if ($self->{shutdown_request
}) {
233 my $running_services = 0; # fixme: correctly compute
235 if ($running_services > 0) {
236 $haenv->log('err', "get shutdown request in state 'lost_agent_lock' - " .
237 "killing running services");
239 # fixme: kill all services as fast as possible
242 # now all services are stopped, so we can close the watchdog
244 if ($self->{ha_agent_wd
}) {
245 $haenv->watchdog_close($self->{ha_agent_wd
});
246 delete $self->{ha_agent_wd
};
# Any other state is a programming error.
256 die "got unexpected status '$state'\n";
# Queue a resource command for every service assigned to this node and then
# spend a bounded amount of time starting and collecting worker processes
# that run the resource agent.
263 sub manage_resources
{
266 my $haenv = $self->{haenv
};
268 my $nodename = $haenv->nodename();
270 my $ss = $self->{service_status
};
# Phase 1: queue the requested state of each local service.
272 foreach my $sid (keys %$ss) {
273 my $sd = $ss->{$sid};
# Skip services without a node or assigned to a different node.
274 next if !$sd->{node
};
276 next if $sd->{node
} ne $nodename;
277 my $req_state = $sd->{state};
# Skip services that have no requested state.
278 next if !defined($req_state);
280 $self->queue_resource_command($sid, $sd->{uid
}, $req_state, $sd->{target
});
283 warn "unable to run resource agent for '$sid' - $err"; # fixme
# Phase 2: work on the queue for at most 5 seconds.
# NOTE(review): this uses the system clock via time(), while the other
# subs in this file take the time from $haenv->get_time() - confirm
# whether that difference is intended.
287 my $starttime = time();
292 my $sc = $haenv->read_service_config();
294 while ((time() - $starttime) < 5) {
# NOTE(review): the loop below treats this as the number of workers in use - confirm.
295 my $count = $self->check_active_workers();
297 foreach my $sid (keys %{$self->{workers
}}) {
# Do not go beyond the allowed number of workers.
298 last if $count >= $max_workers;
299 my $w = $self->{workers
}->{$sid};
# Resource configuration for this service id.
300 my $cd = $sc->{$sid};
302 warn "missing resource configuration for '$sid'\n";
# An undefined pid means that creating the worker process failed.
307 if (!defined($pid)) {
308 warn "fork worker failed\n";
309 $count = 0; last; # abort, try later
# pid 0: this is the worker process, run the resource agent here.
310 } elsif ($pid == 0) {
314 $res = $haenv->exec_resource_agent($sid, $cd, $w->{state}, $w->{target
});
334 # fixme: use a queue and limit number of parallel workers?
# Register a command for service $sid in the object's 'workers' table.
#
# $sid    - service id, used as key of the workers table
# $uid    - uid taken from the service status entry (see manage_resources)
# $state  - requested state for the service
# $target - optional; stored in the entry only when it is a true value
#
# An entry whose worker already has a pid is left untouched; an entry that
# has not been started yet is replaced by the new command.
335 sub queue_resource_command
{
336 my ($self, $sid, $uid, $state, $target) = @_;
338 if (my $w = $self->{workers
}->{$sid}) {
339 return if $w->{pid
}; # already started
340 # else, delete and overwrite queue entry with new command
341 delete $self->{workers
}->{$sid};
# Create the new queue entry for this service.
344 $self->{workers
}->{$sid} = {
350 $self->{workers
}->{$sid}->{target
} = $target if $target;
# Go through the workers table and collect workers whose process has
# exited, without blocking on workers that are still running.
# NOTE(review): the return value is not visible here; manage_resources
# compares it against $max_workers, so presumably it is the number of
# workers still running - confirm.
353 sub check_active_workers
{
356 # finish/count workers
358 foreach my $sid (keys %{$self->{workers
}}) {
359 my $w = $self->{workers
}->{$sid};
# Only entries that have a pid have been started.
360 if (my $pid = $w->{pid
}) {
# WNOHANG makes waitpid return immediately when the child is still running.
362 my $waitpid = waitpid($pid, WNOHANG
);
# waitpid returns the pid once that child has terminated; $? then
# holds its wait status.
363 if (defined($waitpid) && ($waitpid == $pid)) {
364 $self->resource_command_finished($sid, $w->{uid
}, $?);
# Record the outcome of a finished resource command and write the
# collected results to the LRM status.
#
# $sid    - service id of the finished worker
# $uid    - uid under which the result is stored
# $status - wait status of the worker process, as found in $?
#
# The worker entry for $sid is removed, the result is stored under $uid,
# results whose uid no longer appears in the cached service status are
# dropped, and the remaining results are passed to write_lrm_status.
374 sub resource_command_finished
{
375 my ($self, $sid, $uid, $status) = @_;
377 my $haenv = $self->{haenv
};
# Take the worker entry out of the table.
379 my $w = delete $self->{workers
}->{$sid};
380 return if !$w; # should not happen
385 $haenv->log('err', "resource agent $sid finished - failed to execute");
# The low 7 bits of a wait status hold the signal that ended the process.
386 } elsif (my $sig = ($status & 127)) {
387 $haenv->log('err', "resource agent $sid finished - got signal $sig");
# The exit code is in the high byte of the wait status.
389 $exit_code = ($status >> 8);
392 $self->{results
}->{$uid} = {
394 state => $w->{state},
395 exit_code
=> $exit_code,
398 my $ss = $self->{service_status
};
400 # compute hash of valid/existing uids
# NOTE(review): this loop variable shadows the $sid parameter.
402 foreach my $sid (keys %$ss) {
403 my $sd = $ss->{$sid};
405 $valid_uids->{$sd->{uid
}} = 1;
# Keep only results whose uid is still present in the service status.
409 foreach my $id (keys %{$self->{results
}}) {
410 next if !$valid_uids->{$id};
411 $results->{$id} = $self->{results
}->{$id};
413 $self->{results
} = $results;
415 $haenv->write_lrm_status($results);