git.proxmox.com Git - pve-ha-manager.git/blob - src/PVE/HA/Env/PVE2.pm (blob 8baf2d01ef6f899996ea17bca8197d7544919193)
package PVE::HA::Env::PVE2;

use strict;
use warnings;
use POSIX qw(:errno_h :fcntl_h);
use IO::File;
use IO::Socket::UNIX;

use PVE::SafeSyslog;
use PVE::Tools;
use PVE::Cluster qw(cfs_register_file cfs_read_file cfs_write_file cfs_lock_file);
use PVE::INotify;
use PVE::RPCEnvironment;

use PVE::HA::Tools ':exit_codes';
use PVE::HA::Env;
use PVE::HA::Config;
use PVE::HA::FenceConfig;
use PVE::HA::Resources;
use PVE::HA::Resources::PVEVM;
use PVE::HA::Resources::PVECT;

PVE::HA::Resources::PVEVM->register();
PVE::HA::Resources::PVECT->register();

PVE::HA::Resources->init();

my $lockdir = "/etc/pve/priv/lock";

sub new {
    my ($this, $nodename) = @_;

    die "missing nodename" if !$nodename;

    my $class = ref($this) || $this;

    my $self = bless {}, $class;

    $self->{nodename} = $nodename;

    return $self;
}
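
# Usage sketch (assumption, not taken from this file): the CRM/LRM daemons are
# expected to construct this environment with the local node name, e.g.:
#
#   my $haenv = PVE::HA::Env::PVE2->new(PVE::INotify::nodename());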

sub nodename {
    my ($self) = @_;

    return $self->{nodename};
}

sub hardware {
    my ($self) = @_;

    die "hardware is for testing and simulation only";
}

sub read_manager_status {
    my ($self) = @_;

    return PVE::HA::Config::read_manager_status();
}

sub write_manager_status {
    my ($self, $status_obj) = @_;

    PVE::HA::Config::write_manager_status($status_obj);
}

sub read_lrm_status {
    my ($self, $node) = @_;

    $node = $self->{nodename} if !defined($node);

    return PVE::HA::Config::read_lrm_status($node);
}

sub write_lrm_status {
    my ($self, $status_obj) = @_;

    my $node = $self->{nodename};

    PVE::HA::Config::write_lrm_status($node, $status_obj);
}

sub is_node_shutdown {
    my ($self) = @_;

    my $shutdown = 0;
    my $reboot = 0;

    my $code = sub {
        my $line = shift;

        # ensure we match the full unit name by matching /^JOB_ID UNIT /
        # see: man systemd.special
        $shutdown = 1 if ($line =~ m/^\d+\s+shutdown\.target\s+/);
        $reboot = 1 if ($line =~ m/^\d+\s+reboot\.target\s+/);
    };

    my $cmd = ['/bin/systemctl', '--full', 'list-jobs'];
    eval { PVE::Tools::run_command($cmd, outfunc => $code, noerr => 1); };

    return ($shutdown, $reboot);
}
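
# Illustration (assumption, not taken from this file): the output lines of
# '/bin/systemctl --full list-jobs' that the regexes above are meant to match
# look roughly like
#
#   1234 shutdown.target start waiting
#   1235 reboot.target   start waiting
#
# i.e. JOB_ID followed by the full unit name, which is why the match is
# anchored as /^\d+\s+<unit>\s+/ instead of just searching for the unit name.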

sub queue_crm_commands {
    my ($self, $cmd) = @_;

    return PVE::HA::Config::queue_crm_commands($cmd);
}

sub read_crm_commands {
    my ($self) = @_;

    return PVE::HA::Config::read_crm_commands();
}

sub read_service_config {
    my ($self) = @_;

    return PVE::HA::Config::read_and_check_resources_config();
}

sub read_fence_config {
    my ($self) = @_;

    return PVE::HA::Config::read_fence_config();
}

sub fencing_mode {
    my ($self) = @_;

    my $datacenterconfig = cfs_read_file('datacenter.cfg');

    return 'watchdog' if !$datacenterconfig->{fencing};

    return $datacenterconfig->{fencing};
}

sub exec_fence_agent {
    my ($self, $agent, $node, @param) = @_;

    # setup execution environment
    $ENV{'PATH'} = '/sbin:/bin:/usr/sbin:/usr/bin';

    my $cmd = "$agent " . PVE::HA::FenceConfig::gen_arg_str(@param);

    exec($cmd);
    exit -1;
}

# this may only be used by the master to recover a _fenced_ service
sub steal_service {
    my ($self, $sid, $current_node, $new_node) = @_;

    my (undef, $type, $name) = PVE::HA::Tools::parse_sid($sid);

    if (my $plugin = PVE::HA::Resources->lookup($type)) {
        my $old = $plugin->config_file($name, $current_node);
        my $new = $plugin->config_file($name, $new_node);
        rename($old, $new) ||
            die "rename '$old' to '$new' failed - $!\n";
    } else {
        die "implement me";
    }
}
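
# Illustration (assumption, not taken from this file): for the built-in
# plugins the rename above effectively moves the guest config between the
# per-node directories on pmxcfs, roughly:
#
#   /etc/pve/nodes/$current_node/qemu-server/$name.conf
#     -> /etc/pve/nodes/$new_node/qemu-server/$name.conf   (PVEVM)
#   /etc/pve/nodes/$current_node/lxc/$name.conf
#     -> /etc/pve/nodes/$new_node/lxc/$name.conf           (PVECT)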

sub read_group_config {
    my ($self) = @_;

    return PVE::HA::Config::read_group_config();
}

# this should return a hash containing info about
# which nodes are members and online.
sub get_node_info {
    my ($self) = @_;

    my ($node_info, $quorate) = ({}, 0);

    my $nodename = $self->{nodename};

    $quorate = PVE::Cluster::check_cfs_quorum(1) || 0;

    my $members = PVE::Cluster::get_members();

    foreach my $node (keys %$members) {
        my $d = $members->{$node};
        $node_info->{$node}->{online} = $d->{online};
    }

    $node_info->{$nodename}->{online} = 1; # local node is always up

    return ($node_info, $quorate);
}
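
# Example of the returned structure (illustration only):
#
#   ({ node1 => { online => 1 }, node2 => { online => 0 } }, 1)
#
# i.e. a hash of member nodes with their online state, plus the quorum flag.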

sub log {
    my ($self, $level, $msg) = @_;

    chomp $msg;

    syslog($level, $msg);
}

sub sendmail {
    my ($self, $subject, $text) = @_;

    # Leave it to postfix to append the correct hostname
    my $mailfrom = 'root';
    # /root/.forward makes pvemailforward redirect the
    # mail to the address configured in the datacenter
    my $mailto = 'root';

    PVE::Tools::sendmail($mailto, $subject, $text, undef, $mailfrom);
}

my $last_lock_status_hash = {};

sub get_pve_lock {
    my ($self, $lockid) = @_;

    my $got_lock = 0;

    my $filename = "$lockdir/$lockid";

    $last_lock_status_hash->{$lockid} //= { lock_time => 0, got_lock => 0 };
    my $last = $last_lock_status_hash->{$lockid};

    my $ctime = time();
    my $last_lock_time = $last->{lock_time} // 0;
    my $last_got_lock = $last->{got_lock};

    my $retry_timeout = 120; # hardcoded lock lifetime limit from pmxcfs

    eval {

        mkdir $lockdir;

        # pve cluster filesystem not online
        die "can't create '$lockdir' (pmxcfs not mounted?)\n" if ! -d $lockdir;

        if (($ctime - $last_lock_time) < $retry_timeout) {
            # try cfs lock update request (utime)
            if (utime(0, $ctime, $filename)) {
                $got_lock = 1;
                return;
            }
            die "cfs lock update failed - $!\n";
        }

        if (mkdir $filename) {
            $got_lock = 1;
            return;
        }

        utime 0, 0, $filename; # cfs unlock request
        die "can't get cfs lock\n";
    };

    my $err = $@;

    #$self->log('err', $err) if $err; # for debugging

    $last->{got_lock} = $got_lock;
    $last->{lock_time} = $ctime if $got_lock;

    if (!!$got_lock != !!$last_got_lock) {
        if ($got_lock) {
            $self->log('info', "successfully acquired lock '$lockid'");
        } else {
            my $msg = "lost lock '$lockid'";
            $msg .= " - $err" if $err;
            $self->log('err', $msg);
        }
    }

    return $got_lock;
}
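
# Sketch of the pmxcfs lock handling used above (assumption derived from the
# code, not an authoritative description): the lock is a directory below
# /etc/pve/priv/lock; mkdir acquires it, utime() with the current time renews
# the lease (pmxcfs expires it after ~120 seconds), utime(0, 0, ...) asks
# pmxcfs to drop the lease, and rmdir releases the lock for good:
#
#   my $lock = "$lockdir/ha_manager_lock";
#   mkdir $lock or die "acquire failed - $!\n";              # initial acquisition
#   utime(0, time(), $lock) or die "renew failed - $!\n";    # lease renewal
#   utime(0, 0, $lock);                                      # cfs unlock request
#   rmdir $lock;                                             # final release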

sub get_ha_manager_lock {
    my ($self) = @_;

    return $self->get_pve_lock("ha_manager_lock");
}

# release the cluster wide manager lock.
# when released, another CRM may step up and get the lock, thus this should
# only get called when shutting down/deactivating the current master
sub release_ha_manager_lock {
    my ($self) = @_;

    return rmdir("$lockdir/ha_manager_lock");
}

sub get_ha_agent_lock {
    my ($self, $node) = @_;

    $node = $self->nodename() if !defined($node);

    return $self->get_pve_lock("ha_agent_${node}_lock");
}

# release the respective node agent lock.
# this should only get called if the node's LRM gracefully shuts down with
# all services already cleanly stopped!
sub release_ha_agent_lock {
    my ($self) = @_;

    my $node = $self->nodename();

    return rmdir("$lockdir/ha_agent_${node}_lock");
}

sub quorate {
    my ($self) = @_;

    my $quorate = 0;
    eval {
        $quorate = PVE::Cluster::check_cfs_quorum();
    };

    return $quorate;
}

sub get_time {
    my ($self) = @_;

    return time();
}

sub sleep {
    my ($self, $delay) = @_;

    CORE::sleep($delay);
}

sub sleep_until {
    my ($self, $end_time) = @_;

    for (;;) {
        my $cur_time = time();

        last if $cur_time >= $end_time;

        $self->sleep(1);
    }
}

sub loop_start_hook {
    my ($self) = @_;

    PVE::Cluster::cfs_update();

    $self->{loop_start} = $self->get_time();
}

sub loop_end_hook {
    my ($self) = @_;

    my $delay = $self->get_time() - $self->{loop_start};

    warn "loop took too long ($delay seconds)\n" if $delay > 30;
}

my $watchdog_fh;

sub watchdog_open {
    my ($self) = @_;

    die "watchdog already open\n" if defined($watchdog_fh);

    $watchdog_fh = IO::Socket::UNIX->new(
        Type => SOCK_STREAM(),
        Peer => "/run/watchdog-mux.sock") ||
        die "unable to open watchdog socket - $!\n";

    $self->log('info', "watchdog active");
}

sub watchdog_update {
    my ($self, $wfh) = @_;

    my $res = $watchdog_fh->syswrite("\0", 1);
    if (!defined($res)) {
        $self->log('err', "watchdog update failed - $!\n");
        return 0;
    }
    if ($res != 1) {
        $self->log('err', "watchdog update failed - wrote $res bytes\n");
        return 0;
    }

    return 1;
}

sub watchdog_close {
    my ($self, $wfh) = @_;

    $watchdog_fh->syswrite("V", 1); # magic watchdog close
    if (!$watchdog_fh->close()) {
        $self->log('err', "watchdog close failed - $!");
    } else {
        $watchdog_fh = undef;
        $self->log('info', "watchdog closed (disabled)");
    }
}
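
# Sketch of the watchdog-mux protocol as used above (assumption derived from
# the code): a client connects to /run/watchdog-mux.sock, writes a single
# "\0" byte at regular intervals to keep the watchdog armed, and writes the
# magic "V" byte right before closing the socket to disarm it cleanly:
#
#   my $wd = IO::Socket::UNIX->new(
#       Type => SOCK_STREAM(), Peer => "/run/watchdog-mux.sock");
#   $wd->syswrite("\0", 1);   # keep-alive
#   $wd->syswrite("V", 1);    # disarm before close
#   $wd->close();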

sub after_fork {
    my ($self) = @_;

    # close inherited inotify FD from parent and reopen our own
    PVE::INotify::inotify_close();
    PVE::INotify::inotify_init();

    PVE::Cluster::cfs_update();
}

sub get_max_workers {
    my ($self) = @_;

    my $datacenterconfig = cfs_read_file('datacenter.cfg');

    return $datacenterconfig->{max_workers} || 4;
}

1;