3 # Local Resource Manager
8 use POSIX
qw(:sys_wait_h);
14 # Server can have several states:
# Each entry maps a state name to a human-readable description.
# set_local_status() only tests `$valid_states->{...}` for truth; presumably
# this is that table (its declaration is not visible in this excerpt).
# NOTE(review): "agnet" in the first description looks like a typo for
# "agent"; it is a runtime string, so it is left unchanged here.
17 wait_for_agent_lock
=> "waiting for agnet lock",
18 active
=> "got agent_lock",
19 lost_agent_lock
=> "lost agent_lock",
# Constructor fragment: the `sub` line and the creation of $self are not
# visible in this excerpt (presumably `sub new` - confirm).
# Arguments: the invocant ($this) and the HA environment object ($haenv).
23 my ($this, $haenv) = @_;
# Accept either a class name or an existing object as invocant.
25 my $class = ref($this) || $this;
# The status hash starts out in state 'startup'.
29 status
=> { state => 'startup' },
# Switch to 'wait_for_agent_lock' through set_local_status(), which validates
# and logs the transition.
34 $self->set_local_status({ state => 'wait_for_agent_lock' });
# Record that a shutdown was requested by setting $self->{shutdown_request}.
# The flag is only stored here; do_one_iteration() reads it.
39 sub shutdown_request
{
42 $self->{shutdown_request
} = 1;
# Return the current status hash reference stored in $self->{status}
# (the same reference that set_local_status() stores, not a copy).
45 sub get_local_status
{
48 return $self->{status
};
# Replace the local status with $new.
#
# $new is a hash reference with at least a 'state' key. Dies if that state is
# not a true entry in $valid_states. If the state equals the current one,
# nothing is changed (the stored status and its timestamp are kept).
51 sub set_local_status
{
52 my ($self, $new) = @_;
54 die "invalid state '$new->{state}'" if !$valid_states->{$new->{state}};
56 my $haenv = $self->{haenv
};
58 my $old = $self->{status
};
60 # important: only update if really changed
61 return if $old->{state} eq $new->{state};
63 $haenv->log('info', "status change $old->{state} => $new->{state}");
# Stamp the caller's hash with the time of the transition before storing it.
65 $new->{state_change_time
} = $haenv->get_time();
67 $self->{status
} = $new;
# Try to obtain the HA agent lock and keep a watchdog handle alongside it.
#
# The retry loop, the declaration of $count and the return statements are not
# visible in this excerpt; callers use the result as a boolean.
70 sub get_protected_ha_agent_lock
{
73 my $haenv = $self->{haenv
};
# Reference point for the time limit checked further down.
76 my $starttime = $haenv->get_time();
80 if ($haenv->get_ha_agent_lock()) {
# Lock obtained: refresh the watchdog handle if one is already stored.
81 if ($self->{ha_agent_wd
}) {
82 $haenv->watchdog_update($self->{ha_agent_wd
});
# Open a watchdog and remember the handle in $self->{ha_agent_wd}
# (presumably the else branch of the check above - confirm).
84 my $wfh = $haenv->watchdog_open();
85 $self->{ha_agent_wd
} = $wfh;
90 last if ++$count > 5; # try max 5 times
92 my $delay = $haenv->get_time() - $starttime;
93 last if $delay > 5; # for max 5 seconds
# Run one pass of the LRM state machine.
#
# Phase 1 decides state transitions from the current state, quorum and the
# agent lock. Phase 2 does the work that belongs to the resulting state.
# Several lines are not visible in this excerpt, including the declarations
# of $err, $shutdown and $max_time and the final return value.
101 sub do_one_iteration
{
104 my $haenv = $self->{haenv
};
106 my $status = $self->get_local_status();
107 my $state = $status->{state};
109 # do state changes first
111 my $ctime = $haenv->get_time();
113 if ($state eq 'wait_for_agent_lock') {
115 my $service_count = 1; # todo: correctly compute
# Become active only when quorate and the agent lock could be taken.
117 if ($service_count && $haenv->quorate()) {
118 if ($self->get_protected_ha_agent_lock()) {
119 $self->set_local_status({ state => 'active' });
123 } elsif ($state eq 'lost_agent_lock') {
# Try to get the lock back while quorate.
125 if ($haenv->quorate()) {
126 if ($self->get_protected_ha_agent_lock()) {
127 $self->set_local_status({ state => 'active' });
131 } elsif ($state eq 'active') {
# Failing to get the lock while active moves us to 'lost_agent_lock'.
133 if (!$self->get_protected_ha_agent_lock()) {
134 $self->set_local_status({ state => 'lost_agent_lock'});
# Re-read the status: the transitions above may have changed it.
138 $status = $self->get_local_status();
139 $state = $status->{state};
# Start with an empty service status; the 'active' branch refills it from
# the manager status.
143 $self->{service_status
} = {};
145 if ($state eq 'wait_for_agent_lock') {
# NOTE(review): a return value of 0 presumably tells the caller to stop
# iterating - confirm against the caller.
147 return 0 if $self->{shutdown_request
};
151 } elsif ($state eq 'active') {
153 my $startime = $haenv->get_time();
159 # do work (max_time seconds)
161 # fixme: set alert timer
163 if ($self->{shutdown_request
}) {
165 # fixme: request service stop or relocate ?
167 my $service_count = 0; # fixme
169 if ($service_count == 0) {
# Close and forget the watchdog handle, if one is stored.
171 if ($self->{ha_agent_wd
}) {
172 $haenv->watchdog_close($self->{ha_agent_wd
});
173 delete $self->{ha_agent_wd
};
# Take the service status from the manager status (empty hash if absent) and
# let manage_resources() act on it.
179 my $ms = $haenv->read_manager_status();
181 $self->{service_status
} = $ms->{service_status
} || {};
183 $self->manage_resources();
187 $haenv->log('err', "got unexpected error - $err");
190 return 0 if $shutdown;
# Wait until $max_time after the start of this 'active' pass.
192 $haenv->sleep_until($startime + $max_time);
194 } elsif ($state eq 'lost_agent_lock') {
196 # Note: watchdog is active and will trigger soon!
198 # so we hope to get the lock back soon!
200 if ($self->{shutdown_request
}) {
202 my $running_services = 0; # fixme: correctly compute
204 if ($running_services > 0) {
205 $haenv->log('err', "get shutdown request in state 'lost_agent_lock' - " .
206 "killing running services");
208 # fixme: kill all services as fast as possible
211 # now all services are stopped, so we can close the watchdog
213 if ($self->{ha_agent_wd
}) {
214 $haenv->watchdog_close($self->{ha_agent_wd
});
215 delete $self->{ha_agent_wd
};
# Reached for a state that none of the branches above handle.
223 die "got unexpected status '$state'\n";
# Queue a resource command for each service assigned to this node, then spend
# up to 5 seconds starting workers for the queued commands.
#
# The fork call, the parent/child bookkeeping and the declarations of $err,
# $pid, $res and $max_workers are not visible in this excerpt.
230 sub manage_resources
{
233 my $haenv = $self->{haenv
};
235 my $nodename = $haenv->nodename();
# NOTE(review): $ms is not used in the lines visible here - confirm whether
# this read is still needed.
237 my $ms = $haenv->read_manager_status();
239 my $ss = $self->{service_status
};
241 foreach my $sid (keys %$ss) {
242 my $sd = $ss->{$sid};
# Skip services without a node, on another node, or without a state.
243 next if !$sd->{node
};
245 next if $sd->{node
} ne $nodename;
246 my $req_state = $sd->{state};
247 next if !defined($req_state);
250 $self->queue_resource_command($sid, $sd->{uid
}, $req_state, $sd->{target
});
253 warn "unable to run resource agent for '$sid' - $err"; # fixme
# Wall-clock time() is used here, not $haenv->get_time().
257 my $starttime = time();
262 while ((time() - $starttime) < 5) {
# Handle finished workers first; the result is used as the worker count below.
263 my $count = $self->check_active_workers();
265 foreach my $sid (keys %{$self->{workers
}}) {
# Stop starting workers once the limit is reached.
266 last if $count >= $max_workers;
267 my $w = $self->{workers
}->{$sid};
# $pid presumably holds the result of fork() (call not visible - confirm).
270 if (!defined($pid)) {
271 warn "fork worker failed\n";
272 $count = 0; last; # abort, try later
273 } elsif ($pid == 0) {
# Child process: run the resource agent for this service.
277 $res = $haenv->exec_resource_agent($sid, $w->{state}, $w->{target
});
297 # fixme: use a queue and limit number of parallel workers?
# Register a command for service $sid in $self->{workers}.
#
# Arguments: $sid (service id), $uid, $state (requested state) and an optional
# $target. An entry that already has a pid is left untouched; an entry without
# a pid is replaced by the new command.
298 sub queue_resource_command
{
299 my ($self, $sid, $uid, $state, $target) = @_;
301 if (my $w = $self->{workers
}->{$sid}) {
302 return if $w->{pid
}; # already started
303 # else, delete and overwrite queue entry with new command
304 delete $self->{workers
}->{$sid};
# Create the new entry (its fields are not visible in this excerpt).
307 $self->{workers
}->{$sid} = {
# The target is stored only when a true value was passed.
313 $self->{workers
}->{$sid}->{target
} = $target if $target;
# Walk all entries in $self->{workers} and finish those whose process exited.
#
# The counting logic and the return statement are not visible in this
# excerpt; manage_resources() compares the result with $max_workers.
316 sub check_active_workers
{
319 # finish/count workers
321 foreach my $sid (keys %{$self->{workers
}}) {
322 my $w = $self->{workers
}->{$sid};
# Only entries that have a pid are checked.
323 if (my $pid = $w->{pid
}) {
# WNOHANG makes waitpid non-blocking.
325 my $waitpid = waitpid($pid, WNOHANG
);
# waitpid returned this pid: hand the wait status in $? to the finish handler.
326 if (defined($waitpid) && ($waitpid == $pid)) {
327 $self->resource_command_finished($sid, $w->{uid
}, $?);
# Record the outcome of a finished worker and write the LRM status.
#
# Arguments: $sid (service id), $uid (key under which the result is stored)
# and $status (a wait status as found in $?). Some lines, including the
# declarations of $exit_code, $valid_uids and $results, are not visible in
# this excerpt.
337 sub resource_command_finished
{
338 my ($self, $sid, $uid, $status) = @_;
340 my $haenv = $self->{haenv
};
# Remove the worker entry; its stored state is reused for the result below.
342 my $w = delete $self->{workers
}->{$sid};
343 return if !$w; # should not happen
348 $haenv->log('err', "resource agent $sid finished - failed to execute");
# The low 7 bits of a wait status hold the terminating signal number.
349 } elsif (my $sig = ($status & 127)) {
350 $haenv->log('err', "resource agent $sid finished - got signal $sig");
# The exit code is in the bits above the low 8.
352 $exit_code = ($status >> 8);
355 $self->{results
}->{$uid} = {
357 state => $w->{state},
358 exit_code
=> $exit_code,
361 my $ss = $self->{service_status
};
363 # compute hash of valid/existing uids
365 foreach my $sid (keys %$ss) {
366 my $sd = $ss->{$sid};
368 $valid_uids->{$sd->{uid
}} = 1;
# Keep only results whose uid is in the current service status.
372 foreach my $id (keys %{$self->{results
}}) {
373 next if !$valid_uids->{$id};
374 $results->{$id} = $self->{results
}->{$id};
376 $self->{results
} = $results;
378 $haenv->write_lrm_status($results);