]> git.proxmox.com Git - pve-ha-manager.git/blob - src/PVE/HA/Env/PVE2.pm
7cecf3544da6879c10a3d5f8aa901cccaedb70ed
[pve-ha-manager.git] / src / PVE / HA / Env / PVE2.pm
1 package PVE::HA::Env::PVE2;
2
3 use strict;
4 use warnings;
5 use POSIX qw(:errno_h :fcntl_h);
6 use IO::File;
7 use IO::Socket::UNIX;
8 use JSON;
9
10 use PVE::SafeSyslog;
11 use PVE::Tools;
12 use PVE::Cluster qw(cfs_register_file cfs_read_file cfs_write_file cfs_lock_file);
13 use PVE::DataCenterConfig;
14 use PVE::INotify;
15 use PVE::RPCEnvironment;
16
17 use PVE::HA::Tools ':exit_codes';
18 use PVE::HA::Env;
19 use PVE::HA::Config;
20 use PVE::HA::FenceConfig;
21 use PVE::HA::Resources;
22 use PVE::HA::Resources::PVEVM;
23 use PVE::HA::Resources::PVECT;
24
# Register the VM and CT resource plugin classes, then initialize
# PVE::HA::Resources. The order (register first, init afterwards) is kept.
for my $plugin_class (qw(PVE::HA::Resources::PVEVM PVE::HA::Resources::PVECT)) {
    $plugin_class->register();
}

PVE::HA::Resources->init();

# Directory in which get_pve_lock() and the release_*_lock() helpers
# create and remove their lock entries.
my $lockdir = "/etc/pve/priv/lock";
31
# Constructor.
#
# $this     - class name, or an existing object whose class is reused
# $nodename - name of the local node; must be a true value
#
# Dies with "missing nodename" when no node name is given.
# Returns a new object that stores the name under the 'nodename' key.
sub new {
    my ($this, $nodename) = @_;

    die "missing nodename" if !$nodename;

    my $class = ref($this) || $this;

    return bless { nodename => $nodename }, $class;
}
45
# Accessor: returns the node name that was stored by new().
sub nodename {
    my ($self) = @_;

    my $name = $self->{nodename};

    return $name;
}
51
# Not available in this environment: always dies, because the hardware
# object only exists for testing and simulation.
sub hardware {
    my ($self) = @_;

    die "hardware is for testing and simulation only";
}
57
# Returns whatever PVE::HA::Config::read_manager_status() returns;
# this method adds no processing of its own.
sub read_manager_status {
    my ($self) = @_;

    return PVE::HA::Config::read_manager_status();
}
63
# Hands $status_obj to PVE::HA::Config::write_manager_status() and
# passes its result back to the caller.
sub write_manager_status {
    my ($self, $status_obj) = @_;

    return PVE::HA::Config::write_manager_status($status_obj);
}
69
# Returns the LRM status of $node as delivered by
# PVE::HA::Config::read_lrm_status(). When $node is not defined, the
# node name stored in this object is used instead.
sub read_lrm_status {
    my ($self, $node) = @_;

    $node //= $self->{nodename};

    return PVE::HA::Config::read_lrm_status($node);
}
77
# Passes $status_obj, together with the node name stored in this object,
# to PVE::HA::Config::write_lrm_status().
sub write_lrm_status {
    my ($self, $status_obj) = @_;

    return PVE::HA::Config::write_lrm_status($self->{nodename}, $status_obj);
}
85
# Scans the output of `systemctl --full list-jobs` for queued jobs of
# shutdown.target and reboot.target.
#
# Returns the list ($shutdown, $reboot); each element is 1 when a matching
# job line was seen and 0 otherwise. Any exception raised while running
# the command is swallowed, so a failure reports (0, 0) unless lines were
# already matched.
sub is_node_shutdown {
    my ($self) = @_;

    my ($shutdown, $reboot) = (0, 0);

    my $scan_job_line = sub {
        my ($line) = @_;

        # ensure we match the full unit name by matching /^JOB_ID UNIT /
        # see: man systemd.special
        if ($line =~ m/^\d+\s+shutdown\.target\s+/) {
            $shutdown = 1;
        }
        if ($line =~ m/^\d+\s+reboot\.target\s+/) {
            $reboot = 1;
        }
    };

    eval {
        PVE::Tools::run_command(
            ['/bin/systemctl', '--full', 'list-jobs'],
            outfunc => $scan_job_line,
            noerr => 1,
        );
    };

    return ($shutdown, $reboot);
}
106
# Forwards $cmd to PVE::HA::Config::queue_crm_commands() and returns
# its result.
sub queue_crm_commands {
    my ($self, $cmd) = @_;

    return PVE::HA::Config::queue_crm_commands($cmd);
}
112
# Returns the result of PVE::HA::Config::read_crm_commands() unchanged.
sub read_crm_commands {
    my ($self) = @_;

    return PVE::HA::Config::read_crm_commands();
}
118
# Returns the result of
# PVE::HA::Config::read_and_check_resources_config() unchanged.
sub read_service_config {
    my ($self) = @_;

    return PVE::HA::Config::read_and_check_resources_config();
}
124
# Forwards the service id $sid and the parameters $param to
# PVE::HA::Config::update_resources_config() and returns its result.
sub update_service_config {
    my ($self, $sid, $param) = @_;

    return PVE::HA::Config::update_resources_config($sid, $param);
}
130
# Forwards $sid to PVE::HA::Config::parse_sid() and returns its result
# in the caller's context.
sub parse_sid {
    my ($self, $sid) = @_;

    return PVE::HA::Config::parse_sid($sid);
}
136
# Returns the result of PVE::HA::Config::read_fence_config() unchanged.
sub read_fence_config {
    my ($self) = @_;

    return PVE::HA::Config::read_fence_config();
}
142
# Returns the 'fencing' value from datacenter.cfg, or 'watchdog' when
# that value is missing or otherwise false.
sub fencing_mode {
    my ($self) = @_;

    my $dc_conf = cfs_read_file('datacenter.cfg');

    return $dc_conf->{fencing} || 'watchdog';
}
152
# Replaces the current process with the fence agent.
#
# $agent - command to execute
# $node  - accepted but not used by this method
# @param - passed to PVE::HA::FenceConfig::gen_arg_str() to build the
#          argument string appended to the command
#
# PATH is set to a fixed list of system directories before the exec.
# The method does not return: if exec() itself fails, the process exits
# with exit(-1).
sub exec_fence_agent {
    my ($self, $agent, $node, @param) = @_;

    # setup execution environment
    $ENV{'PATH'} = '/sbin:/bin:/usr/sbin:/usr/bin';

    my $arg_str = PVE::HA::FenceConfig::gen_arg_str(@param);

    exec("$agent $arg_str");
    exit -1;
}
164
# this is only allowed by the master to recover a _fenced_ service
#
# Moves the configuration file of service $sid from $current_node to
# $new_node by renaming it. The paths come from the config_file() method
# of the resource plugin registered for the service type.
#
# Dies with "implement me" when no plugin is found for the type, and with
# a "rename ... failed" message when the rename itself fails.
sub steal_service {
    my ($self, $sid, $current_node, $new_node) = @_;

    my (undef, $type, $name) = PVE::HA::Config::parse_sid($sid);

    my $plugin = PVE::HA::Resources->lookup($type);
    die "implement me" if !$plugin;

    my $old = $plugin->config_file($name, $current_node);
    my $new = $plugin->config_file($name, $new_node);

    rename($old, $new) ||
        die "rename '$old' to '$new' failed - $!\n";
}
180
# Returns the result of PVE::HA::Config::read_group_config() unchanged.
sub read_group_config {
    my ($self) = @_;

    return PVE::HA::Config::read_group_config();
}
186
# this should return a hash containing info
# what nodes are members and online.
#
# Returns the list ($node_info, $quorate):
#   $node_info - hash ref, node name => { online => <value> }, built from
#                PVE::Cluster::get_members(); the local node is always
#                reported with online => 1
#   $quorate   - result of PVE::Cluster::check_cfs_quorum(1), or 0 when
#                that result is false
sub get_node_info {
    my ($self) = @_;

    my $quorate = PVE::Cluster::check_cfs_quorum(1) || 0;

    my $members = PVE::Cluster::get_members();

    my $node_info = {};
    for my $member_name (keys %$members) {
        my $member = $members->{$member_name};
        $node_info->{$member_name} = { online => $member->{online} };
    }

    my $local_node = $self->{nodename};
    $node_info->{$local_node}->{online} = 1; # local node is always up

    return ($node_info, $quorate);
}
209
# Sends $msg to syslog() with the given $level, after chomp has removed
# a trailing line ending from the message.
sub log {
    my ($self, $level, $msg) = @_;

    chomp(my $text = $msg);

    syslog($level, $text);
}
217
# Mails $text with subject $subject from 'root' to 'root' by calling
# PVE::Tools::sendmail().
sub sendmail {
    my ($self, $subject, $text) = @_;

    # /root/.forward makes pvemailforward redirect the
    # mail to the address configured in the datacenter
    my $recipient = 'root';

    # Leave it to postfix to append the correct hostname
    my $sender = 'root';

    PVE::Tools::sendmail($recipient, $subject, $text, undef, $sender);
}
229
# Per-lock bookkeeping kept across calls:
#   lock id => { lock_time => <time of last success>, got_lock => <0|1> }
my $last_lock_status_hash = {};

# Tries to acquire, or to renew, the lock entry "$lockdir/$lockid".
#
# While the last successful acquisition is less than $retry_timeout
# seconds old, the lock is renewed with utime(); a failed renewal counts
# as not holding the lock. Otherwise the lock directory is created with
# mkdir; when that fails, an unlock request (utime 0, 0) is sent and the
# attempt counts as failed.
#
# A message is logged only when the result differs from the previous call
# for the same $lockid: 'info' when the lock was acquired, 'err' when it
# was lost.
#
# Returns 1 when the lock is held after the call, 0 otherwise. Errors are
# caught internally and never propagate to the caller.
sub get_pve_lock {
    my ($self, $lockid) = @_;

    my $got_lock = 0;

    my $filename = "$lockdir/$lockid";

    $last_lock_status_hash->{$lockid} //= { lock_time => 0, got_lock => 0 };
    my $last = $last_lock_status_hash->{$lockid};

    my $ctime = time();
    my $last_lock_time = $last->{lock_time} // 0;
    my $last_got_lock = $last->{got_lock};

    my $retry_timeout = 120; # hardcoded lock lifetime limit from pmxcfs

    eval {
        mkdir $lockdir;

        # pve cluster filesystem not online
        die "can't create '$lockdir' (pmxcfs not mounted?)\n" if !-d $lockdir;

        if (($ctime - $last_lock_time) < $retry_timeout) {
            # try cfs lock update request (utime)
            if (utime(0, $ctime, $filename)) {
                $got_lock = 1;
                return; # leaves the eval block only
            }
            die "cfs lock update failed - $!\n";
        }

        if (mkdir $filename) {
            $got_lock = 1;
            return; # leaves the eval block only
        }

        utime 0, 0, $filename; # cfs unlock request
        die "can't get cfs lock\n";
    };

    my $err = $@;

    #$self->log('err', $err) if $err; # for debugging

    $last->{got_lock} = $got_lock;
    $last->{lock_time} = $ctime if $got_lock;

    # only report state transitions, not every successful renewal
    if (!!$got_lock != !!$last_got_lock) {
        if ($got_lock) {
            $self->log('info', "successfully acquired lock '$lockid'");
        } else {
            # the closing quote after the lock id was missing before
            my $msg = "lost lock '$lockid'";
            $msg .= " - $err" if $err;
            $self->log('err', $msg);
        }
    }

    return $got_lock;
}
292
# Requests the lock named "ha_manager_lock" through get_pve_lock() and
# returns that method's result.
sub get_ha_manager_lock {
    my ($self) = @_;

    my $lockid = "ha_manager_lock";

    return $self->get_pve_lock($lockid);
}
298
# release the cluster wide manager lock.
# when released another CRM may step up and get the lock, thus this should only
# get called when shutting down/deactivating the current master
#
# Returns the result of rmdir() on the manager lock directory.
sub release_ha_manager_lock {
    my ($self) = @_;

    my $lock_path = "$lockdir/ha_manager_lock";

    return rmdir($lock_path);
}
307
# Requests the agent lock "ha_agent_<node>_lock" through get_pve_lock()
# and returns that method's result. When $node is not defined, the name
# returned by nodename() is used.
sub get_ha_agent_lock {
    my ($self, $node) = @_;

    $node //= $self->nodename();

    return $self->get_pve_lock("ha_agent_${node}_lock");
}
315
# release the respective node agent lock.
# this should only get called if the nodes LRM gracefully shuts down with
# all services already cleanly stopped!
#
# Returns the result of rmdir() on the local node's agent lock directory.
sub release_ha_agent_lock {
    my ($self) = @_;

    my $node = $self->nodename();
    my $lock_path = "$lockdir/ha_agent_${node}_lock";

    return rmdir($lock_path);
}
326
# Returns the result of PVE::Cluster::check_cfs_quorum(), or 0 when that
# call throws an exception.
sub quorate {
    my ($self) = @_;

    my $is_quorate = 0;

    # an exception leaves the default of 0 in place
    eval { $is_quorate = PVE::Cluster::check_cfs_quorum(); };

    return $is_quorate;
}
337
# Returns the current time as reported by the time() builtin.
sub get_time {
    my ($self) = @_;

    my $now = time();

    return $now;
}
343
# Pauses for $delay seconds using the sleep builtin (CORE::sleep, since
# this method shares its name) and returns the builtin's result.
sub sleep {
    my ($self, $delay) = @_;

    return CORE::sleep($delay);
}
349
# Blocks until time() has reached $end_time, calling $self->sleep(1)
# between checks. Returns immediately when $end_time is already reached.
sub sleep_until {
    my ($self, $end_time) = @_;

    while (time() < $end_time) {
        $self->sleep(1);
    }
}
361
# Stores the current get_time() value under 'loop_start'; loop_end_hook()
# reads it to compute how long the loop took.
sub loop_start_hook {
    my ($self) = @_;

    my $started_at = $self->get_time();

    $self->{loop_start} = $started_at;
}
368
# Computes the time elapsed since loop_start_hook() stored 'loop_start'
# and emits a warning when it exceeds 30 seconds.
sub loop_end_hook {
    my ($self) = @_;

    my $delay = $self->get_time() - $self->{loop_start};

    if ($delay > 30) {
        warn "loop take too long ($delay seconds)\n";
    }
}
376
# Calls PVE::Cluster::cfs_update(1).
#
# Returns 1 on success. When the call throws, the error is logged at
# level 'warn' and 0 is returned.
sub cluster_state_update {
    my ($self) = @_;

    eval { PVE::Cluster::cfs_update(1) };
    my $err = $@;

    return 1 if !$err;

    $self->log('warn', "cluster file system update failed - $err");
    return 0;
}
388
# Socket connected to the watchdog multiplexer; undef while closed.
# Shared by watchdog_open(), watchdog_update() and watchdog_close().
my $watchdog_fh;

# Connects to the UNIX stream socket /run/watchdog-mux.sock and keeps the
# handle in $watchdog_fh.
#
# Dies when a handle is already open or when the connection fails.
sub watchdog_open {
    my ($self) = @_;

    die "watchdog already open\n" if defined($watchdog_fh);

    my $socket = IO::Socket::UNIX->new(
        Type => SOCK_STREAM(),
        Peer => "/run/watchdog-mux.sock",
    );
    die "unable to open watchdog socket - $!\n" if !$socket;

    $watchdog_fh = $socket;

    $self->log('info', "watchdog active");
}
403
# Writes a single NUL byte to the open watchdog socket.
#
# Returns 1 when exactly one byte was written. Otherwise an 'err' message
# is logged and 0 is returned. The second argument ($wfh) is accepted but
# not used; the module-level $watchdog_fh handle is written to.
sub watchdog_update {
    my ($self, $wfh) = @_;

    my $res = $watchdog_fh->syswrite("\0", 1);

    if (defined($res)) {
        return 1 if $res == 1;
        $self->log('err', "watchdog update failed - write $res bytes\n");
    } else {
        $self->log('err', "watchdog update failed - $!\n");
    }

    return 0;
}
419
# Writes the byte "V" to the watchdog socket and then closes it.
#
# On a successful close, $watchdog_fh is reset to undef and an 'info'
# message is logged; otherwise an 'err' message is logged and the handle
# is kept. The second argument ($wfh) is accepted but not used.
sub watchdog_close {
    my ($self, $wfh) = @_;

    $watchdog_fh->syswrite("V", 1); # magic watchdog close

    if ($watchdog_fh->close()) {
        $watchdog_fh = undef;
        $self->log('info', "watchdog closed (disabled)");
    } else {
        $self->log('err', "watchdog close failed - $!");
    }
}
431
# Intended to run in a forked child: replaces the inotify descriptor
# inherited from the parent with a fresh one, then calls
# PVE::Cluster::cfs_update().
sub after_fork {
    my ($self) = @_;

    # close inherited inotify FD from parent and reopen our own
    PVE::INotify::inotify_close();
    PVE::INotify::inotify_init();

    return PVE::Cluster::cfs_update();
}
441
# Returns the 'max_workers' value from datacenter.cfg, or 4 when that
# value is missing or otherwise false.
sub get_max_workers {
    my ($self) = @_;

    my $dc_conf = cfs_read_file('datacenter.cfg');
    my $configured = $dc_conf->{max_workers};

    return $configured || 4;
}
449
# return cluster wide enforced HA settings
#
# Returns the 'ha' entry of datacenter.cfg. The result is always a hash
# reference: an empty one is returned when the file cannot be read (the
# error is logged at level 'err') and also when the file has no 'ha'
# entry, so callers can dereference the result without a defined check.
sub get_ha_settings {
    my ($self) = @_;

    my $datacenterconfig = eval { cfs_read_file('datacenter.cfg') };
    if (my $err = $@) {
        $self->log('err', "unable to get HA settings from datacenter.cfg - $err");
        return {};
    }

    # a missing 'ha' entry yields {} as well, matching the error path above
    return $datacenterconfig->{ha} // {};
}
462
# Fetches the 'static-info' values of all nodes with
# PVE::Cluster::get_node_kv() and replaces each value with its decoded
# JSON form.
#
# An entry that cannot be decoded becomes undef and the error is logged at
# level 'err'. Returns the hash reference (node name => decoded value).
sub get_static_node_stats {
    my ($self) = @_;

    my $stats = PVE::Cluster::get_node_kv('static-info');

    foreach my $node (keys %$stats) {
        my $raw = $stats->{$node};
        $stats->{$node} = eval { decode_json($raw) };
        if ($@) {
            $self->log('err', "unable to decode static node info for '$node' - $@");
        }
    }

    return $stats;
}
474
475 1;