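# PVE::HA::Env::PVE2 - production environment for the Proxmox VE HA stack.
#
# This module implements the PVE::HA::Env interface on top of the clustered
# configuration file system (pmxcfs) mounted under /etc/pve; testing and
# simulation use separate environments instead (see the hardware() hook below).
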
package PVE::HA::Env::PVE2;

use strict;
use warnings;
use POSIX qw(:errno_h :fcntl_h);
use IO::File;
use IO::Socket::UNIX;
use JSON;

use PVE::SafeSyslog;
use PVE::Tools;
use PVE::Cluster qw(cfs_register_file cfs_read_file cfs_write_file cfs_lock_file);
use PVE::DataCenterConfig;
use PVE::INotify;
use PVE::RPCEnvironment;
use PVE::Notify;

use PVE::HA::Tools ':exit_codes';
use PVE::HA::Env;
use PVE::HA::Config;
use PVE::HA::FenceConfig;
use PVE::HA::Resources;
use PVE::HA::Resources::PVEVM;
use PVE::HA::Resources::PVECT;

PVE::HA::Resources::PVEVM->register();
PVE::HA::Resources::PVECT->register();

PVE::HA::Resources->init();

my $lockdir = "/etc/pve/priv/lock";

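# Typical construction, a sketch (the CRM/LRM services pass the local node name):
#
#   my $haenv = PVE::HA::Env::PVE2->new(PVE::INotify::nodename());
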
sub new {
    my ($this, $nodename) = @_;

    die "missing nodename" if !$nodename;

    my $class = ref($this) || $this;

    my $self = bless {}, $class;

    $self->{nodename} = $nodename;

    return $self;
}

sub nodename {
    my ($self) = @_;

    return $self->{nodename};
}

sub hardware {
    my ($self) = @_;

    die "hardware is for testing and simulation only";
}

sub read_manager_status {
    my ($self) = @_;

    return PVE::HA::Config::read_manager_status();
}

sub write_manager_status {
    my ($self, $status_obj) = @_;

    PVE::HA::Config::write_manager_status($status_obj);
}

sub read_lrm_status {
    my ($self, $node) = @_;

    $node = $self->{nodename} if !defined($node);

    return PVE::HA::Config::read_lrm_status($node);
}

sub write_lrm_status {
    my ($self, $status_obj) = @_;

    my $node = $self->{nodename};

    PVE::HA::Config::write_lrm_status($node, $status_obj);
}

sub is_node_shutdown {
    my ($self) = @_;

    my $shutdown = 0;
    my $reboot = 0;

    my $code = sub {
        my $line = shift;

        # ensure we match the full unit name by matching /^JOB_ID UNIT /
        # see: man systemd.special
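        # a queued-job line looks roughly like (illustrative sample):
        #   1234 shutdown.target start waiting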
        $shutdown = 1 if ($line =~ m/^\d+\s+shutdown\.target\s+/);
        $reboot = 1 if ($line =~ m/^\d+\s+reboot\.target\s+/);
    };

    my $cmd = ['/bin/systemctl', '--full', 'list-jobs'];
    eval { PVE::Tools::run_command($cmd, outfunc => $code, noerr => 1); };

    return ($shutdown, $reboot);
}

sub queue_crm_commands {
    my ($self, $cmd) = @_;

    return PVE::HA::Config::queue_crm_commands($cmd);
}

sub read_crm_commands {
    my ($self) = @_;

    return PVE::HA::Config::read_crm_commands();
}

sub read_service_config {
    my ($self) = @_;

    return PVE::HA::Config::read_and_check_resources_config();
}

sub update_service_config {
    my ($self, $sid, $param) = @_;

    return PVE::HA::Config::update_resources_config($sid, $param);
}

sub parse_sid {
    my ($self, $sid) = @_;

    return PVE::HA::Config::parse_sid($sid);
}

sub read_fence_config {
    my ($self) = @_;

    return PVE::HA::Config::read_fence_config();
}

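# datacenter.cfg can set 'fencing' to select the fence mode; to my knowledge
# the valid values there are 'watchdog', 'hardware' and 'both', with 'watchdog'
# used as the default below.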
sub fencing_mode {
    my ($self) = @_;

    my $datacenterconfig = cfs_read_file('datacenter.cfg');

    return 'watchdog' if !$datacenterconfig->{fencing};

    return $datacenterconfig->{fencing};
}

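# exec_fence_agent replaces the current process image via exec(), so it is
# expected to run in a forked child; the exit below is merely a safety net for
# a failed exec.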
sub exec_fence_agent {
    my ($self, $agent, $node, @param) = @_;

    # setup execution environment
    $ENV{'PATH'} = '/sbin:/bin:/usr/sbin:/usr/bin';

    my $cmd = "$agent " . PVE::HA::FenceConfig::gen_arg_str(@param);

    exec($cmd);
    exit -1;
}

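# Moving a service between nodes boils down to renaming its config file into
# the target node's directory; for example (an assumption based on the PVEVM
# plugin), config_file() resolves to a per-node path like
# /etc/pve/nodes/<node>/qemu-server/<vmid>.conf.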
# this is only allowed by the master to recover a _fenced_ service
sub steal_service {
    my ($self, $sid, $current_node, $new_node) = @_;

    my (undef, $type, $name) = PVE::HA::Config::parse_sid($sid);

    if (my $plugin = PVE::HA::Resources->lookup($type)) {
        my $old = $plugin->config_file($name, $current_node);
        my $new = $plugin->config_file($name, $new_node);
        rename($old, $new) ||
            die "rename '$old' to '$new' failed - $!\n";
    } else {
        die "implement me";
    }

    # Necessary for (at least) the static usage plugin to always be able to
    # read the service config from the new node right away.
    $self->cluster_state_update();
}

sub read_group_config {
    my ($self) = @_;

    return PVE::HA::Config::read_group_config();
}

# this should return a hash containing info
# about which nodes are members and online.
sub get_node_info {
    my ($self) = @_;

    my ($node_info, $quorate) = ({}, 0);

    my $nodename = $self->{nodename};

    $quorate = PVE::Cluster::check_cfs_quorum(1) || 0;

    my $members = PVE::Cluster::get_members();

    foreach my $node (keys %$members) {
        my $d = $members->{$node};
        $node_info->{$node}->{online} = $d->{online};
    }

    $node_info->{$nodename}->{online} = 1; # local node is always up

    return ($node_info, $quorate);
}

sub log {
    my ($self, $level, $msg) = @_;

    chomp $msg;

    syslog($level, $msg);
}

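# Notification target and policy come from the 'notify' section of
# datacenter.cfg ('target-fencing' and 'fencing'); when unset, we fall back to
# the default notification target and send in any case.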
sub send_notification {
    my ($self, $subject, $text, $properties) = @_;

    eval {
        my $dc_config = PVE::Cluster::cfs_read_file('datacenter.cfg');
        my $target = $dc_config->{notify}->{'target-fencing'} // PVE::Notify::default_target();
        my $notify = $dc_config->{notify}->{fencing} // 'always';

        if ($notify eq 'always') {
            PVE::Notify::error($target, $subject, $text, $properties);
        }
    };

    $self->log("warning", "could not notify: $@") if $@;
}

my $last_lock_status_hash = {};

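# pmxcfs directory locks: a lock is acquired by mkdir() on a path below
# /etc/pve/priv/lock, renewed by bumping its mtime with utime(), and released
# by setting the mtime to 0 (an unlock request). The cluster file system
# invalidates a lock that was not renewed within its lifetime limit, so within
# that window (120 seconds, hardcoded below) only renewal is attempted.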
sub get_pve_lock {
    my ($self, $lockid) = @_;

    my $got_lock = 0;

    my $filename = "$lockdir/$lockid";

    $last_lock_status_hash->{$lockid} //= { lock_time => 0, got_lock => 0 };
    my $last = $last_lock_status_hash->{$lockid};

    my $ctime = time();
    my $last_lock_time = $last->{lock_time} // 0;
    my $last_got_lock = $last->{got_lock};

    my $retry_timeout = 120; # hardcoded lock lifetime limit from pmxcfs

    eval {
        mkdir $lockdir;

        # pve cluster filesystem not online
        die "can't create '$lockdir' (pmxcfs not mounted?)\n" if ! -d $lockdir;

        if (($ctime - $last_lock_time) < $retry_timeout) {
            # try cfs lock update request (utime)
            if (utime(0, $ctime, $filename)) {
                $got_lock = 1;
                return;
            }
            die "cfs lock update failed - $!\n";
        }

        if (mkdir $filename) {
            $got_lock = 1;
            return;
        }

        utime 0, 0, $filename; # cfs unlock request
        die "can't get cfs lock\n";
    };

    my $err = $@;

    #$self->log('err', $err) if $err; # for debugging

    $last->{got_lock} = $got_lock;
    $last->{lock_time} = $ctime if $got_lock;

    if (!!$got_lock != !!$last_got_lock) {
        if ($got_lock) {
            $self->log('info', "successfully acquired lock '$lockid'");
        } else {
            my $msg = "lost lock '$lockid'";
            $msg .= " - $err" if $err;
            $self->log('err', $msg);
        }
    }

    return $got_lock;
}

sub get_ha_manager_lock {
    my ($self) = @_;

    return $self->get_pve_lock("ha_manager_lock");
}

# Release the cluster-wide manager lock.
# Once released, another CRM may step up and acquire the lock; thus this should
# only get called when shutting down/deactivating the current master.
sub release_ha_manager_lock {
    my ($self) = @_;

    return rmdir("$lockdir/ha_manager_lock");
}

sub get_ha_agent_lock {
    my ($self, $node) = @_;

    $node = $self->nodename() if !defined($node);

    return $self->get_pve_lock("ha_agent_${node}_lock");
}

# Release the respective node agent lock.
# This should only get called if the node's LRM gracefully shuts down with
# all services already cleanly stopped!
sub release_ha_agent_lock {
    my ($self) = @_;

    my $node = $self->nodename();

    return rmdir("$lockdir/ha_agent_${node}_lock");
}

sub quorate {
    my ($self) = @_;

    my $quorate = 0;
    eval {
        $quorate = PVE::Cluster::check_cfs_quorum();
    };

    return $quorate;
}

sub get_time {
    my ($self) = @_;

    return time();
}

sub sleep {
    my ($self, $delay) = @_;

    CORE::sleep($delay);
}

sub sleep_until {
    my ($self, $end_time) = @_;

    for (;;) {
        my $cur_time = time();

        last if $cur_time >= $end_time;

        $self->sleep(1);
    }
}

sub loop_start_hook {
    my ($self) = @_;

    $self->{loop_start} = $self->get_time();
}

sub loop_end_hook {
    my ($self) = @_;

    my $delay = $self->get_time() - $self->{loop_start};

    warn "loop took too long ($delay seconds)\n" if $delay > 30;
}

sub cluster_state_update {
    my ($self) = @_;

    eval { PVE::Cluster::cfs_update(1) };
    if (my $err = $@) {
        $self->log('warn', "cluster file system update failed - $err");
        return 0;
    }

    return 1;
}

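# Communication with the watchdog multiplexer daemon over its unix socket: any
# byte written counts as a keep-alive, while writing the single character 'V'
# disarms the watchdog before closing ("magic close", as with Linux watchdog
# devices).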
my $watchdog_fh;

sub watchdog_open {
    my ($self) = @_;

    die "watchdog already open\n" if defined($watchdog_fh);

    $watchdog_fh = IO::Socket::UNIX->new(
        Type => SOCK_STREAM(),
        Peer => "/run/watchdog-mux.sock",
    ) || die "unable to open watchdog socket - $!\n";

    $self->log('info', "watchdog active");
}

sub watchdog_update {
    my ($self, $wfh) = @_;

    my $res = $watchdog_fh->syswrite("\0", 1);
    if (!defined($res)) {
        $self->log('err', "watchdog update failed - $!\n");
        return 0;
    }
    if ($res != 1) {
        $self->log('err', "watchdog update failed - wrote $res bytes\n");
        return 0;
    }

    return 1;
}

sub watchdog_close {
    my ($self, $wfh) = @_;

    $watchdog_fh->syswrite("V", 1); # magic watchdog close
    if (!$watchdog_fh->close()) {
        $self->log('err', "watchdog close failed - $!");
    } else {
        $watchdog_fh = undef;
        $self->log('info', "watchdog closed (disabled)");
    }
}

sub after_fork {
    my ($self) = @_;

    # close inherited inotify FD from parent and reopen our own
    PVE::INotify::inotify_close();
    PVE::INotify::inotify_init();

    PVE::Cluster::cfs_update();
}

sub get_max_workers {
    my ($self) = @_;

    my $datacenterconfig = cfs_read_file('datacenter.cfg');

    return $datacenterconfig->{max_workers} || 4;
}

# return cluster-wide enforced HA settings
sub get_datacenter_settings {
    my ($self) = @_;

    my $datacenterconfig = eval { cfs_read_file('datacenter.cfg') };
    $self->log('err', "unable to get HA settings from datacenter.cfg - $@") if $@;

    return {
        ha => $datacenterconfig->{ha} // {},
        crs => $datacenterconfig->{crs} // {},
    };
}

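# The 'static-info' key-value entry is broadcast per node and holds its static
# resource figures as JSON (to my understanding, CPU count and memory size),
# which the static-load scheduling mode consumes.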
sub get_static_node_stats {
    my ($self) = @_;

    my $stats = PVE::Cluster::get_node_kv('static-info');
    for my $node (keys $stats->%*) {
        $stats->{$node} = eval { decode_json($stats->{$node}) };
        $self->log('err', "unable to decode static node info for '$node' - $@") if $@;
    }

    return $stats;
}

1;