]> git.proxmox.com Git - pve-ha-manager.git/blob - src/PVE/HA/Env/PVE2.pm
use PVE::DataCenterConfig
[pve-ha-manager.git] / src / PVE / HA / Env / PVE2.pm
1 package PVE::HA::Env::PVE2;
2
3 use strict;
4 use warnings;
5 use POSIX qw(:errno_h :fcntl_h);
6 use IO::File;
7 use IO::Socket::UNIX;
8
9 use PVE::SafeSyslog;
10 use PVE::Tools;
11 use PVE::Cluster qw(cfs_register_file cfs_read_file cfs_write_file cfs_lock_file);
12 use PVE::DataCenterConfig;
13 use PVE::INotify;
14 use PVE::RPCEnvironment;
15
16 use PVE::HA::Tools ':exit_codes';
17 use PVE::HA::Env;
18 use PVE::HA::Config;
19 use PVE::HA::FenceConfig;
20 use PVE::HA::Resources;
21 use PVE::HA::Resources::PVEVM;
22 use PVE::HA::Resources::PVECT;
23
# Register the HA resource plugins shipped with PVE (QEMU VMs and LXC
# containers) with the resource plugin system, then initialize it.
PVE::HA::Resources::PVEVM->register();
PVE::HA::Resources::PVECT->register();

PVE::HA::Resources->init();

# Cluster-wide lock directory on the pmxcfs; locks below are pmxcfs
# directories created/refreshed via mkdir/utime (see get_pve_lock).
my $lockdir = "/etc/pve/priv/lock";
# Constructor: build an HA environment object bound to a cluster node.
# Dies when no node name is supplied.
sub new {
    my ($this, $nodename) = @_;

    die "missing nodename" if !$nodename;

    my $class = ref($this) || $this;

    # store the node name right at construction time
    my $self = bless { nodename => $nodename }, $class;

    return $self;
}
44
# Accessor: the node name this environment was created for.
sub nodename {
    my $self = shift;

    return $self->{nodename};
}
50
# The simulated hardware layer only exists in the test/sim environments;
# the production environment must never be asked for it.
sub hardware {
    die "hardware is for testing and simulation only";
}
56
# Fetch the CRM manager status object from the cluster filesystem.
sub read_manager_status {
    my $self = shift;

    return PVE::HA::Config::read_manager_status();
}
62
# Persist the CRM manager status object to the cluster filesystem.
sub write_manager_status {
    my (undef, $status_obj) = @_; # $self unused

    PVE::HA::Config::write_manager_status($status_obj);
}
68
# Read the LRM status for $node, defaulting to the local node.
sub read_lrm_status {
    my ($self, $node) = @_;

    $node //= $self->{nodename};

    return PVE::HA::Config::read_lrm_status($node);
}
76
# Persist the local node's LRM status object.
sub write_lrm_status {
    my ($self, $status_obj) = @_;

    PVE::HA::Config::write_lrm_status($self->{nodename}, $status_obj);
}
84
# Ask systemd whether a shutdown and/or reboot job is queued.
# Returns the two flags ($shutdown, $reboot); errors from systemctl are
# swallowed and simply leave both flags at 0.
sub is_node_shutdown {
    my $self = shift;

    my ($shutdown, $reboot) = (0, 0);

    my $parse_job_line = sub {
        my ($line) = @_;

        # job listing lines start with "JOB_ID UNIT ..."; anchoring on the
        # numeric job id ensures we match the full unit name
        # see: man systemd.special
        $shutdown = 1 if $line =~ m/^\d+\s+shutdown\.target\s+/;
        $reboot = 1 if $line =~ m/^\d+\s+reboot\.target\s+/;
    };

    eval {
        PVE::Tools::run_command(
            ['/bin/systemctl', '--full', 'list-jobs'],
            outfunc => $parse_job_line,
            noerr => 1,
        );
    };

    return ($shutdown, $reboot);
}
105
# Append a command to the CRM command queue on the cluster filesystem.
sub queue_crm_commands {
    my (undef, $cmd) = @_; # $self unused

    return PVE::HA::Config::queue_crm_commands($cmd);
}
111
# Drain and return the queued CRM commands.
sub read_crm_commands {
    my $self = shift;

    return PVE::HA::Config::read_crm_commands();
}
117
# Read the HA resources configuration, with sanity checks applied.
sub read_service_config {
    my $self = shift;

    return PVE::HA::Config::read_and_check_resources_config();
}
123
# Update the configuration of service $sid with the values in $param.
sub update_service_config {
    my (undef, $sid, $param) = @_; # $self unused

    return PVE::HA::Config::update_resources_config($sid, $param);
}
129
# Split a service ID into its components (delegates to the HA config).
sub parse_sid {
    my (undef, $sid) = @_; # $self unused

    return PVE::HA::Config::parse_sid($sid);
}
135
# Read the fence device configuration.
sub read_fence_config {
    my $self = shift;

    return PVE::HA::Config::read_fence_config();
}
141
# Return the cluster-wide fencing mode from datacenter.cfg, falling back
# to 'watchdog' when nothing (or a falsy value) is configured.
sub fencing_mode {
    my $self = shift;

    my $dc_cfg = cfs_read_file('datacenter.cfg');

    return $dc_cfg->{fencing} || 'watchdog';
}
151
# Replace the current (already forked) process with the given fence agent.
# Never returns on success; exits with -1 when exec() itself fails.
sub exec_fence_agent {
    my ($self, $agent, $node, @param) = @_;

    # minimal, predictable execution environment for the agent
    $ENV{'PATH'} = '/sbin:/bin:/usr/sbin:/usr/bin';

    # NOTE(review): single-string exec() may go through the shell when the
    # command contains metacharacters - this relies on gen_arg_str()
    # producing safely quoted arguments; verify against FenceConfig.
    my $cmdline = "$agent " . PVE::HA::FenceConfig::gen_arg_str(@param);

    exec($cmdline);
    exit -1; # only reached if exec() failed
}
163
# Move a service's config file from $current_node to $new_node by renaming
# it on the cluster filesystem. Only the master may do this, and only to
# recover a service from an already _fenced_ node.
sub steal_service {
    my ($self, $sid, $current_node, $new_node) = @_;

    my (undef, $type, $name) = PVE::HA::Config::parse_sid($sid);

    my $plugin = PVE::HA::Resources->lookup($type)
        or die "implement me"; # no plugin registered for this service type

    my $old = $plugin->config_file($name, $current_node);
    my $new = $plugin->config_file($name, $new_node);

    rename($old, $new)
        or die "rename '$old' to '$new' failed - $!\n";
}
179
# Read the HA group configuration.
sub read_group_config {
    my $self = shift;

    return PVE::HA::Config::read_group_config();
}
185
# Collect cluster membership information.
# Returns ($node_info, $quorate): $node_info maps node name to
# { online => 0|1 }, $quorate is the pmxcfs quorum state (0|1).
sub get_node_info {
    my $self = shift;

    my $quorate = PVE::Cluster::check_cfs_quorum(1) || 0;

    my $members = PVE::Cluster::get_members();

    my $node_info = {};
    for my $node (keys %$members) {
        $node_info->{$node}->{online} = $members->{$node}->{online};
    }

    # the local node is always considered up
    $node_info->{$self->{nodename}}->{online} = 1;

    return ($node_info, $quorate);
}
208
# Send a message to syslog at the given level; trailing newlines are
# stripped since syslog records must not carry them.
sub log {
    my $self = shift;
    my ($level, $msg) = @_;

    chomp $msg;

    syslog($level, $msg);
}
216
# Send a notification mail to the local root account.
# Using 'root' on both ends: postfix appends the correct hostname to the
# sender, and /root/.forward (pvemailforward) redirects the mail to the
# address configured in the datacenter.
sub sendmail {
    my ($self, $subject, $text) = @_;

    PVE::Tools::sendmail('root', $subject, $text, undef, 'root');
}
228
# Per-lockid cache of the last successful acquisition time and result,
# used to decide between a cheap refresh (utime) and re-acquisition (mkdir).
my $last_lock_status_hash = {};

# Acquire or refresh a cluster-wide lock backed by a pmxcfs directory
# under $lockdir. Returns 1 when the lock is held, 0 otherwise.
# Transitions between held and lost are logged; steady state is silent.
sub get_pve_lock {
    my ($self, $lockid) = @_;

    my $got_lock = 0;

    my $filename = "$lockdir/$lockid";

    $last_lock_status_hash->{$lockid} //= { lock_time => 0, got_lock => 0};
    my $last = $last_lock_status_hash->{$lockid};

    my $ctime = time();
    my $last_lock_time = $last->{lock_time} // 0;
    my $last_got_lock = $last->{got_lock};

    my $retry_timeout = 120; # hardcoded lock lifetime limit from pmxcfs

    eval {

	mkdir $lockdir;

	# pve cluster filesystem not online
	die "can't create '$lockdir' (pmxcfs not mounted?)\n" if ! -d $lockdir;

	if (($ctime - $last_lock_time) < $retry_timeout) {
	    # we held the lock recently, so a cfs lock update request
	    # (utime) is enough to keep holding it
	    if (utime(0, $ctime, $filename)) {
		$got_lock = 1;
		return;
	    }
	    die "cfs lock update failed - $!\n";
	}

	# (re-)acquire: creating the lock directory takes the lock
	if (mkdir $filename) {
	    $got_lock = 1;
	    return;
	}

	utime 0, 0, $filename; # cfs unlock request
	die "can't get cfs lock\n";
    };

    my $err = $@;

    #$self->log('err', $err) if $err; # for debugging

    $last->{got_lock} = $got_lock;
    $last->{lock_time} = $ctime if $got_lock;

    # log only on state transitions (acquired <-> lost)
    if (!!$got_lock != !!$last_got_lock) {
	if ($got_lock) {
	    $self->log('info', "successfully acquired lock '$lockid'");
	} else {
	    # FIX: message previously lacked the closing quote after $lockid
	    my $msg = "lost lock '$lockid'";
	    $msg .= " - $err" if $err;
	    $self->log('err', $msg);
	}
    }

    return $got_lock;
}
291
# Acquire (or refresh) the cluster-wide CRM master lock.
sub get_ha_manager_lock {
    my $self = shift;

    return $self->get_pve_lock("ha_manager_lock");
}
297
# Release the cluster-wide manager lock.
# When released, another CRM may step up and grab the lock, so only call
# this when shutting down/deactivating the current master.
sub release_ha_manager_lock {
    my $self = shift;

    return rmdir("$lockdir/ha_manager_lock");
}
306
# Acquire (or refresh) the LRM agent lock for $node (default: local node).
sub get_ha_agent_lock {
    my ($self, $node) = @_;

    $node //= $self->nodename();

    return $self->get_pve_lock("ha_agent_${node}_lock");
}
314
# Release the local node's agent lock.
# Only call this when the LRM gracefully shuts down with all services
# already cleanly stopped!
sub release_ha_agent_lock {
    my $self = shift;

    my $node = $self->nodename();

    return rmdir("$lockdir/ha_agent_${node}_lock");
}
325
# Report whether the cluster filesystem currently has quorum.
# A failing quorum check is treated the same as no quorum (returns 0).
sub quorate {
    my $self = shift;

    my $quorate = 0;
    eval {
	$quorate = PVE::Cluster::check_cfs_quorum();
    };

    return $quorate;
}
336
# Current wallclock time in epoch seconds (simulation envs override this).
sub get_time {
    my $self = shift;

    return time();
}
342
# Block for $seconds using the real system sleep (sim envs override this).
sub sleep {
    my ($self, $seconds) = @_;

    CORE::sleep($seconds);
}
348
# Sleep in one-second steps until the wallclock reaches $end_time
# (epoch seconds). Returns immediately when the deadline already passed.
# Uses $self->sleep() so simulation environments can override the waiting.
sub sleep_until {
    my ($self, $end_time) = @_;

    while (time() < $end_time) {
	$self->sleep(1);
    }
}
360
# Record the work-loop start time so loop_end_hook() can measure duration.
sub loop_start_hook {
    my $self = shift;

    $self->{loop_start} = $self->get_time();
}
367
# Warn when a CRM/LRM work-loop iteration took suspiciously long (>30s).
sub loop_end_hook {
    my $self = shift;

    my $elapsed = $self->get_time() - $self->{loop_start};

    warn "loop take too long ($elapsed seconds)\n" if $elapsed > 30;
}
375
# Refresh the local view of the cluster filesystem state.
# Returns 1 on success; logs a warning and returns 0 on failure.
sub cluster_state_update {
    my $self = shift;

    eval { PVE::Cluster::cfs_update(1) };

    if (my $err = $@) {
	$self->log('warn', "cluster file system update failed - $err");
	return 0;
    }

    return 1;
}
387
# Connection to the watchdog multiplexer; shared by the watchdog_* methods.
my $watchdog_fh;

# Connect to the watchdog-mux socket, arming the watchdog for this node.
# Dies when the watchdog is already open or the socket is unreachable.
sub watchdog_open {
    my $self = shift;

    die "watchdog already open\n" if defined($watchdog_fh);

    $watchdog_fh = IO::Socket::UNIX->new(
	Type => SOCK_STREAM(),
	Peer => "/run/watchdog-mux.sock",
    ) || die "unable to open watchdog socket - $!\n";

    $self->log('info', "watchdog active");
}
402
# Reset the watchdog timer by writing a single NUL byte to the mux socket.
# $wfh is unused (kept for interface compatibility); the module-level
# handle is used instead. Returns 1 on success, 0 on a (logged) failure.
sub watchdog_update {
    my ($self, $wfh) = @_;

    my $written = $watchdog_fh->syswrite("\0", 1);

    if (!defined($written)) {
	$self->log('err', "watchdog update failed - $!\n");
	return 0;
    }

    if ($written != 1) {
	$self->log('err', "watchdog update failed - write $written bytes\n");
	return 0;
    }

    return 1;
}
418
# Cleanly disable the watchdog: send the magic 'V' byte, then close the
# socket. On close failure the handle is kept (only logged), so state
# stays consistent with the mux still watching us.
sub watchdog_close {
    my ($self, $wfh) = @_;

    $watchdog_fh->syswrite("V", 1); # magic watchdog close

    if ($watchdog_fh->close()) {
	$watchdog_fh = undef;
	$self->log('info', "watchdog closed (disabled)");
    } else {
	$self->log('err', "watchdog close failed - $!");
    }
}
430
# Re-initialize per-process state after fork(): the inotify FD must not
# be shared with the parent, and the cfs view needs a refresh.
sub after_fork {
    my $self = shift;

    PVE::INotify::inotify_close();
    PVE::INotify::inotify_init();

    PVE::Cluster::cfs_update();
}
440
# Maximum number of parallel LRM workers from datacenter.cfg (default: 4).
sub get_max_workers {
    my $self = shift;

    my $dc_cfg = cfs_read_file('datacenter.cfg');

    return $dc_cfg->{max_workers} || 4;
}
448
# Return the cluster-wide enforced HA settings from datacenter.cfg, or an
# empty hash (with an error logged) when the config cannot be read.
sub get_ha_settings {
    my $self = shift;

    my $dc_cfg = eval { cfs_read_file('datacenter.cfg') };

    if (my $err = $@) {
	$self->log('err', "unable to get HA settings from datacenter.cfg - $err");
	return {};
    }

    return $dc_cfg->{ha};
}
461
462 1;