package PVE::HA::Env::PVE2;

use strict;
use warnings;

use POSIX qw(:errno_h :fcntl_h);
use IO::File;
use IO::Socket::UNIX;
use JSON; # decode_json() used by get_static_node_stats()

use PVE::SafeSyslog;
use PVE::Tools;
use PVE::Cluster qw(cfs_register_file cfs_read_file cfs_write_file cfs_lock_file);
use PVE::DataCenterConfig;
use PVE::INotify;
use PVE::RPCEnvironment;

use PVE::HA::Tools ':exit_codes';
use PVE::HA::Config;
use PVE::HA::FenceConfig;
use PVE::HA::Resources;
use PVE::HA::Resources::PVEVM;
use PVE::HA::Resources::PVECT;

# make the qemu VM and container resource plugins known to the plugin base
PVE::HA::Resources::PVEVM->register();
PVE::HA::Resources::PVECT->register();

PVE::HA::Resources->init();
# directory on the cluster filesystem used for cluster-wide locks
my $lockdir = "/etc/pve/priv/lock";

# Constructor: create a HA environment bound to the given node name.
# Dies if no nodename is passed.
sub new {
    my ($this, $nodename) = @_;

    die "missing nodename" if !$nodename;

    my $class = ref($this) || $this;

    my $self = bless {}, $class;

    $self->{nodename} = $nodename;

    return $self;
}

# Return the node name this environment was created for.
sub nodename {
    my ($self) = @_;

    return $self->{nodename};
}

# The real environment has no simulated hardware backend.
sub hardware {
    my ($self) = @_;

    die "hardware is for testing and simulation only";
}
# Read the cluster-wide CRM manager status from the cluster filesystem.
sub read_manager_status {
    my ($self) = @_;

    return PVE::HA::Config::read_manager_status();
}
# Persist the CRM manager status object to the cluster filesystem.
sub write_manager_status {
    my ($self, $status_obj) = @_;

    PVE::HA::Config::write_manager_status($status_obj);
}
# Read the LRM status of $node; defaults to the local node when no node
# is given.
sub read_lrm_status {
    my ($self, $node) = @_;

    $node = $self->{nodename} if !defined($node);

    return PVE::HA::Config::read_lrm_status($node);
}
# Persist the local node's LRM status object.
sub write_lrm_status {
    my ($self, $status_obj) = @_;

    my $node = $self->{nodename};

    PVE::HA::Config::write_lrm_status($node, $status_obj);
}
# Detect whether the local node is currently shutting down or rebooting by
# scanning the active systemd jobs. Returns the pair ($shutdown, $reboot).
sub is_node_shutdown {
    my ($self) = @_;

    my $shutdown = 0;
    my $reboot = 0;

    my $code = sub {
        my $line = shift;

        # ensure we match the full unit name by matching /^JOB_ID UNIT /
        # see: man systemd.special
        $shutdown = 1 if ($line =~ m/^\d+\s+shutdown\.target\s+/);
        $reboot = 1 if ($line =~ m/^\d+\s+reboot\.target\s+/);
    };

    my $cmd = ['/bin/systemctl', '--full', 'list-jobs'];
    # best effort - ignore errors, we just report "not shutting down" then
    eval { PVE::Tools::run_command($cmd, outfunc => $code, noerr => 1); };

    return ($shutdown, $reboot);
}
# Queue a command for the cluster resource manager (CRM) master to process.
sub queue_crm_commands {
    my ($self, $cmd) = @_;

    return PVE::HA::Config::queue_crm_commands($cmd);
}
# Fetch (and consume) the queued CRM commands.
sub read_crm_commands {
    my ($self) = @_;

    return PVE::HA::Config::read_crm_commands();
}
# Read the HA resources (services) configuration, validated/checked.
sub read_service_config {
    my ($self) = @_;

    return PVE::HA::Config::read_and_check_resources_config();
}
# Update the configuration of the service $sid with the properties in $param.
sub update_service_config {
    my ($self, $sid, $param) = @_;

    return PVE::HA::Config::update_resources_config($sid, $param);
}
# Split a service ID like 'vm:100' into its parts; delegates to the
# shared config helper.
sub parse_sid {
    my ($self, $sid) = @_;

    return PVE::HA::Config::parse_sid($sid);
}
# Read the cluster fence device configuration.
sub read_fence_config {
    my ($self) = @_;

    return PVE::HA::Config::read_fence_config();
}
# Return the configured fencing mode from datacenter.cfg, falling back to
# 'watchdog' when nothing is configured.
sub fencing_mode {
    my ($self) = @_;

    my $datacenterconfig = cfs_read_file('datacenter.cfg');

    return 'watchdog' if !$datacenterconfig->{fencing};

    return $datacenterconfig->{fencing};
}
# Run the given fence agent with the configured arguments. Replaces the
# current process image via exec(); only returns (with exit -1) on failure.
sub exec_fence_agent {
    my ($self, $agent, $node, @param) = @_;

    # setup execution environment
    $ENV{'PATH'} = '/sbin:/bin:/usr/sbin:/usr/bin';

    my $cmd = "$agent " . PVE::HA::FenceConfig::gen_arg_str(@param);

    # NOTE(review): exec/exit tail reconstructed - lost in mangled source
    exec($cmd);
    exit(-1);
}
# this is only allowed by the master to recover a _fenced_ service
# Moves the service's config file from $current_node to $new_node.
sub steal_service {
    my ($self, $sid, $current_node, $new_node) = @_;

    my (undef, $type, $name) = PVE::HA::Config::parse_sid($sid);

    if (my $plugin = PVE::HA::Resources->lookup($type)) {
        my $old = $plugin->config_file($name, $current_node);
        my $new = $plugin->config_file($name, $new_node);
        rename($old, $new)
            || die "rename '$old' to '$new' failed - $!\n";
    } else {
        die "implement me";
    }

    # Necessary for (at least) static usage plugin to always be able to read service config from new
    # node right away
    $self->cluster_state_update();

    return;
}
# Read the HA group configuration.
sub read_group_config {
    my ($self) = @_;

    return PVE::HA::Config::read_group_config();
}
# this should return a hash containing info
# what nodes are members and online.
# Returns ($node_info, $quorate).
sub get_node_info {
    my ($self) = @_;

    my ($node_info, $quorate) = ({}, 0);

    my $nodename = $self->{nodename};

    $quorate = PVE::Cluster::check_cfs_quorum(1) || 0;

    my $members = PVE::Cluster::get_members();

    foreach my $node (keys %$members) {
        my $d = $members->{$node};
        $node_info->{$node}->{online} = $d->{online};
    }

    $node_info->{$nodename}->{online} = 1; # local node is always up

    return ($node_info, $quorate);
}
# Log $msg with priority $level via syslog; strips a trailing newline first.
sub log {
    my ($self, $level, $msg) = @_;

    chomp $msg;

    syslog($level, $msg);
}
# Send a fencing notification through the notification system, honoring the
# datacenter.cfg notification target/policy. Failures are only logged.
sub send_notification {
    my ($self, $subject, $text, $properties) = @_;

    eval {
        my $dc_config = PVE::Cluster::cfs_read_file('datacenter.cfg');
        my $target = $dc_config->{notify}->{'target-fencing'} // PVE::Notify::default_target();
        my $notify = $dc_config->{notify}->{fencing} // 'always';

        if ($notify eq 'always') {
            PVE::Notify::error($target, $subject, $text, $properties);
        }
    };

    $self->log("warning", "could not notify: $@") if $@;
}
# per-lockid cache of the last acquisition time and result, used to decide
# between a cheap lock refresh (utime) and a fresh mkdir acquisition
my $last_lock_status_hash = {};

# Try to acquire or refresh the pmxcfs-backed cluster lock $lockid.
# Returns 1 when the lock is held, 0 otherwise; state transitions are logged.
sub get_pve_lock {
    my ($self, $lockid) = @_;

    my $got_lock = 0;

    my $filename = "$lockdir/$lockid";

    $last_lock_status_hash->{$lockid} //= { lock_time => 0, got_lock => 0 };
    my $last = $last_lock_status_hash->{$lockid};

    my $ctime = time();
    my $last_lock_time = $last->{lock_time} // 0;
    my $last_got_lock = $last->{got_lock};

    my $retry_timeout = 120; # hardcoded lock lifetime limit from pmxcfs

    eval {
        mkdir $lockdir;

        # pve cluster filesystem not online
        die "can't create '$lockdir' (pmxcfs not mounted?)\n" if !-d $lockdir;

        if (($ctime - $last_lock_time) < $retry_timeout) {
            # try cfs lock update request (utime)
            if (utime(0, $ctime, $filename)) {
                $got_lock = 1;
            } else {
                die "cfs lock update failed - $!\n";
            }
        } else {
            if (mkdir $filename) {
                $got_lock = 1;
            } else {
                utime 0, 0, $filename; # cfs unlock request
                die "can't get cfs lock\n";
            }
        }
    };

    my $err = $@;

    #$self->log('err', $err) if $err; # for debugging

    $last->{got_lock} = $got_lock;
    $last->{lock_time} = $ctime if $got_lock;

    # log only on state changes (acquired <-> lost)
    if (!!$got_lock != !!$last_got_lock) {
        if ($got_lock) {
            $self->log('info', "successfully acquired lock '$lockid'");
        } else {
            my $msg = "lost lock '$lockid";
            $msg .= " - $err" if $err;
            $self->log('err', $msg);
        }
    }

    return $got_lock;
}
# Acquire/refresh the cluster-wide HA manager (CRM master) lock.
sub get_ha_manager_lock {
    my ($self) = @_;

    return $self->get_pve_lock("ha_manager_lock");
}
# release the cluster wide manager lock.
# when released another CRM may step up and get the lock, thus this should only
# get called when shutting down/deactivating the current master
sub release_ha_manager_lock {
    my ($self) = @_;

    return rmdir("$lockdir/ha_manager_lock");
}
# Acquire/refresh the per-node HA agent (LRM) lock; defaults to the
# local node.
sub get_ha_agent_lock {
    my ($self, $node) = @_;

    $node = $self->nodename() if !defined($node);

    return $self->get_pve_lock("ha_agent_${node}_lock");
}
# release the respective node agent lock.
# this should only get called if the nodes LRM gracefully shuts down with
# all services already cleanly stopped!
sub release_ha_agent_lock {
    my ($self) = @_;

    my $node = $self->nodename();

    return rmdir("$lockdir/ha_agent_${node}_lock");
}
# Return 1 when the cluster filesystem is quorate, 0 otherwise; never dies
# (cfs errors count as "not quorate").
sub quorate {
    my ($self) = @_;

    my $quorate = 0;
    eval {
        $quorate = PVE::Cluster::check_cfs_quorum();
    };

    return $quorate;
}
# Current time in seconds since the epoch.
# NOTE(review): get_time()/sleep() headers reconstructed - lost in mangled
# source; sleep_until() below depends on them.
sub get_time {
    my ($self) = @_;

    return time();
}

# Sleep for $delay seconds.
sub sleep {
    my ($self, $delay) = @_;

    CORE::sleep($delay);
}

# Sleep in one-second steps until the absolute time $end_time is reached.
sub sleep_until {
    my ($self, $end_time) = @_;

    for (;;) {
        my $cur_time = time();

        last if $cur_time >= $end_time;

        $self->sleep(1);
    }
}
# Record the start time of a CRM/LRM work loop iteration.
sub loop_start_hook {
    my ($self) = @_;

    $self->{loop_start} = $self->get_time();
}

# Warn when a work loop iteration took suspiciously long (> 30 seconds).
sub loop_end_hook {
    my ($self) = @_;

    my $delay = $self->get_time() - $self->{loop_start};

    warn "loop take too long ($delay seconds)\n" if $delay > 30;
}
# Refresh the local view of the cluster filesystem state. Returns 1 on
# success, 0 when the update failed (failure is logged, not fatal).
sub cluster_state_update {
    my ($self) = @_;

    eval { PVE::Cluster::cfs_update(1) };
    if (my $err = $@) {
        $self->log('warn', "cluster file system update failed - $err");
        return 0;
    }

    return 1;
}
# connection to the local watchdog multiplexer daemon; module-level state so
# update/close can reuse the handle opened here
my $watchdog_fh;

# Open the connection to watchdog-mux, arming the hardware watchdog.
# Dies if the watchdog is already open or the socket cannot be reached.
sub watchdog_open {
    my ($self) = @_;

    die "watchdog already open\n" if defined($watchdog_fh);

    $watchdog_fh = IO::Socket::UNIX->new(
        Type => SOCK_STREAM(),
        Peer => "/run/watchdog-mux.sock",
    ) || die "unable to open watchdog socket - $!\n";

    $self->log('info', "watchdog active");
}
# Feed the watchdog by writing a single NUL byte to watchdog-mux.
# Returns 1 on success, 0 on a failed or short write (logged).
sub watchdog_update {
    my ($self, $wfh) = @_;

    my $res = $watchdog_fh->syswrite("\0", 1);
    if (!defined($res)) {
        $self->log('err', "watchdog update failed - $!\n");
        return 0;
    }
    if ($res != 1) {
        $self->log('err', "watchdog update failed - write $res bytes\n");
        return 0;
    }

    return 1;
}
# Disarm the watchdog: send the magic 'V' byte, then close the socket.
sub watchdog_close {
    my ($self, $wfh) = @_;

    $watchdog_fh->syswrite("V", 1); # magic watchdog close
    if (!$watchdog_fh->close()) {
        $self->log('err', "watchdog close failed - $!");
    } else {
        $watchdog_fh = undef;
        $self->log('info', "watchdog closed (disabled)");
    }
}
# Re-initialize per-process state in a freshly forked worker.
sub after_fork {
    my ($self) = @_;

    # close inherited inotify FD from parent and reopen our own
    PVE::INotify::inotify_close();
    PVE::INotify::inotify_init();

    PVE::Cluster::cfs_update();
}
# Maximum number of parallel LRM workers, from datacenter.cfg (default 4).
sub get_max_workers {
    my ($self) = @_;

    my $datacenterconfig = cfs_read_file('datacenter.cfg');

    return $datacenterconfig->{max_workers} || 4;
}
# return cluster wide enforced HA settings
# Reads datacenter.cfg best-effort; missing sections default to empty hashes.
sub get_datacenter_settings {
    my ($self) = @_;

    my $datacenterconfig = eval { cfs_read_file('datacenter.cfg') };
    $self->log('err', "unable to get HA settings from datacenter.cfg - $@") if $@;

    return {
        ha => $datacenterconfig->{ha} // {},
        crs => $datacenterconfig->{crs} // {},
    };
}
# Fetch the per-node static resource info (CPU/memory) from the cluster
# key-value store, JSON-decoded per node. Decode failures are logged and
# leave undef for that node.
sub get_static_node_stats {
    my ($self) = @_;

    my $stats = PVE::Cluster::get_node_kv('static-info');
    for my $node (keys $stats->%*) {
        $stats->{$node} = eval { decode_json($stats->{$node}) };
        $self->log('err', "unable to decode static node info for '$node' - $@") if $@;
    }

    return $stats;
}