]> git.proxmox.com Git - pve-ha-manager.git/blob - src/PVE/HA/Env/PVE2.pm
fix 'change_service_location' misuse and recovery from fencing
[pve-ha-manager.git] / src / PVE / HA / Env / PVE2.pm
1 package PVE::HA::Env::PVE2;
2
3 use strict;
4 use warnings;
5 use POSIX qw(:errno_h :fcntl_h);
6 use IO::File;
7 use IO::Socket::UNIX;
8
9 use PVE::SafeSyslog;
10 use PVE::Tools;
11 use PVE::Cluster qw(cfs_register_file cfs_read_file cfs_write_file cfs_lock_file);
12 use PVE::INotify;
13 use PVE::RPCEnvironment;
14
15 use PVE::HA::Tools ':exit_codes';
16 use PVE::HA::Env;
17 use PVE::HA::Config;
18 use PVE::HA::Resources;
19 use PVE::HA::Resources::PVEVM;
20 use PVE::HA::Resources::PVECT;
21
22 PVE::HA::Resources::PVEVM->register();
23 PVE::HA::Resources::PVECT->register();
24
25 PVE::HA::Resources->init();
26
27 my $lockdir = "/etc/pve/priv/lock";
28
# Constructor.
#
# $nodename - name of the node this environment is bound to (required,
#             must be a true value).
#
# May be invoked on a class name or on an existing instance; in the
# latter case the new object is blessed into the instance's class.
# Dies with "missing nodename" when no node name is given.
sub new {
    my ($this, $nodename) = @_;

    $nodename or die "missing nodename";

    # allow both Class->new(...) and $obj->new(...)
    my $class = ref($this) || $this;

    return bless { nodename => $nodename }, $class;
}
42
# Accessor: returns the node name stored in the object's 'nodename' slot.
sub nodename {
    my $self = shift;

    return $self->{nodename};
}
48
# Not available in this environment: always dies, because the hardware
# abstraction exists only for the test and simulation environments.
sub hardware {
    my $self = shift;

    die "hardware is for testing and simulation only";
}
54
# Read the manager status.
# Thin wrapper: returns whatever PVE::HA::Config::read_manager_status()
# returns.
sub read_manager_status {
    my $self = shift;

    return PVE::HA::Config::read_manager_status();
}
60
# Write the manager status.
#
# $status_obj - status data, handed unchanged to
#               PVE::HA::Config::write_manager_status().
#
# Returns the result of that call.
sub write_manager_status {
    my ($self, $status_obj) = @_;

    return PVE::HA::Config::write_manager_status($status_obj);
}
66
# Read the LRM status of a node.
#
# $node - optional node name; when undefined the object's own node name
#         is used instead.
#
# Returns whatever PVE::HA::Config::read_lrm_status() returns for that
# node.
sub read_lrm_status {
    my ($self, $node) = @_;

    my $target = defined($node) ? $node : $self->{nodename};

    return PVE::HA::Config::read_lrm_status($target);
}
74
# Write the LRM status of the local node.
#
# $status_obj - status data, handed to PVE::HA::Config::write_lrm_status()
#               together with the object's own node name.
#
# Returns the result of that call.
sub write_lrm_status {
    my ($self, $status_obj) = @_;

    return PVE::HA::Config::write_lrm_status($self->{nodename}, $status_obj);
}
82
# Report whether the node is shutting down.
#
# Runs '/bin/systemctl list-jobs' and scans its output for a line that
# mentions 'shutdown.target'.
#
# Returns 1 if such a line was seen, 0 otherwise. Any error raised while
# running the command is swallowed, so a failing command yields whatever
# was detected up to that point (0 if nothing).
sub is_node_shutdown {
    my $self = shift;

    my $found = 0;

    # called once per output line of the command
    my $scan_line = sub {
        my ($line) = @_;

        $found = 1 if $line =~ m/shutdown\.target/;
    };

    eval {
        PVE::Tools::run_command(
            ['/bin/systemctl', 'list-jobs'],
            outfunc => $scan_line,
            noerr => 1,
        );
    };

    return $found;
}
99
# Queue a CRM command.
#
# $cmd - command, handed unchanged to
#        PVE::HA::Config::queue_crm_commands().
#
# Returns the result of that call.
sub queue_crm_commands {
    my $self = shift;
    my ($cmd) = @_;

    return PVE::HA::Config::queue_crm_commands($cmd);
}
105
# Read the queued CRM commands.
# Thin wrapper: returns whatever PVE::HA::Config::read_crm_commands()
# returns.
sub read_crm_commands {
    my $self = shift;

    return PVE::HA::Config::read_crm_commands();
}
111
# Build the effective service configuration.
#
# Reads the resources configuration and the cluster VM list, then for
# every configured service id:
#   - fills in defaults: state 'enabled', max_restart 1, max_relocate 1
#   - skips the service if no resource plugin is registered for its type
#   - if the service name is found in the VM list, takes the node from
#     there (overriding any configured node)
#   - otherwise keeps the service only if the configuration itself names
#     a node, and warns "service '<sid>' without node" if it does not
#
# Returns a hash reference mapping service id => service settings. The
# settings hashes are the ones from the resources configuration, modified
# in place.
sub read_service_config {
    my ($self) = @_;

    my $res = PVE::HA::Config::read_resources_config();

    my $vmlist = PVE::Cluster::get_vmlist();
    my $conf = {};

    foreach my $sid (keys %{$res->{ids}}) {
        my $d = $res->{ids}->{$sid};
        my (undef, undef, $name) = PVE::HA::Tools::parse_sid($sid);

        $d->{state} = 'enabled' if !defined($d->{state});
        $d->{max_restart} = 1 if !defined($d->{max_restart});
        $d->{max_relocate} = 1 if !defined($d->{max_relocate});

        # services of unknown resource types are ignored silently
        next if !PVE::HA::Resources->lookup($d->{type});

        if (my $vmd = $vmlist->{ids}->{$name}) {
            # the VM list is authoritative for the current location
            $d->{node} = $vmd->{node};
            $conf->{$sid} = $d;
        } elsif (defined($d->{node})) {
            # not in the VM list - fall back to the configured node
            $conf->{$sid} = $d;
        } else {
            warn "service '$sid' without node\n";
        }
    }

    return $conf;
}
146
# Move the configuration of service $sid from $current_node to $new_node.
#
# This is only allowed by the master to recover a _fenced_ service.
#
# The resource plugin registered for the service type supplies the
# configuration file path on each node; the file is then renamed from the
# old path to the new one. Dies if the rename fails, or with
# "implement me" if no plugin is registered for the type.
sub steal_service {
    my ($self, $sid, $current_node, $new_node) = @_;

    my (undef, $type, $name) = PVE::HA::Tools::parse_sid($sid);

    my $plugin = PVE::HA::Resources->lookup($type);
    die "implement me" if !$plugin;

    my $old = $plugin->config_file($name, $current_node);
    my $new = $plugin->config_file($name, $new_node);

    rename($old, $new) or die "rename '$old' to '$new' failed - $!\n";
}
162
# Read the HA group configuration.
# Thin wrapper: returns whatever PVE::HA::Config::read_group_config()
# returns.
sub read_group_config {
    my $self = shift;

    return PVE::HA::Config::read_group_config();
}
168
# Collect information about which nodes are cluster members and online.
#
# Returns a two element list ($node_info, $quorate):
#   $node_info - hash reference, node name => { online => <flag> }, with
#                one entry per member reported by
#                PVE::Cluster::get_members(); the local node is always
#                present and always flagged online
#   $quorate   - result of PVE::Cluster::check_cfs_quorum(1), or 0 if that
#                result is false
sub get_node_info {
    my $self = shift;

    my $quorate = PVE::Cluster::check_cfs_quorum(1) || 0;

    my $members = PVE::Cluster::get_members();

    my %info;
    for my $member (keys %$members) {
        my $state = $members->{$member};
        $info{$member} = { online => $state->{online} };
    }

    # local node is always up
    $info{$self->{nodename}}->{online} = 1;

    return (\%info, $quorate);
}
191
# Send a message to syslog.
#
# $level - syslog level, passed through to syslog()
# $msg   - message text; a trailing newline is stripped before logging
sub log {
    my $self = shift;
    my ($level, $msg) = @_;

    chomp $msg;

    return syslog($level, $msg);
}
199
# Per lock id: the time of the last successful acquire/update of that
# lock, or 0 if the last attempt did not get it.
my $last_lock_status = {};

# Try to acquire - or keep - the lock '$lockid' below $lockdir.
#
# If the lock was obtained less than $retry_timeout seconds ago it is
# updated via utime() on the lock directory, otherwise a fresh acquire is
# attempted by creating the lock directory with mkdir().
#
# Returns 1 if the lock is held after the call, 0 otherwise. Transitions
# (lock gained / lock lost) are logged through $self->log(); a failed
# update inside the retry window returns 0 without logging and without
# touching the remembered state, so that the next call tries the update
# again.
sub get_pve_lock {
    my ($self, $lockid) = @_;

    my $got_lock = 0;

    my $filename = "$lockdir/$lockid";

    my $last = $last_lock_status->{$lockid} || 0;

    my $ctime = time();

    my $retry = 0;
    my $retry_timeout = 100; # fixme: what timeout

    eval {

        mkdir $lockdir;

        # pve cluster filesystem not online
        die "can't create '$lockdir' (pmxcfs not mounted?)\n" if ! -d $lockdir;

        if ($last && (($ctime - $last) < $retry_timeout)) {
            # send cfs lock update request (utime)
            if (!utime(0, $ctime, $filename)) {
                $retry = 1;
                die "cfs lock update failed - $!\n";
            }
        } else {

            # fixme: wait some time?
            if (!(mkdir $filename)) {
                utime 0, 0, $filename; # cfs unlock request
                die "can't get cfs lock\n";
            }
        }

        $got_lock = 1;
    };

    my $err = $@;

    if ($retry) {
        # $self->log('err', $err) if $err; # for debugging
        return 0;
    }

    $last_lock_status->{$lockid} = $got_lock ? $ctime : 0;

    # only log when the lock state actually changed
    if (!!$got_lock != !!$last) {
        if ($got_lock) {
            $self->log('info', "successfully acquired lock '$lockid'");
        } else {
            # closing quote after the lock id was missing in the message
            my $msg = "lost lock '$lockid'";
            $msg .= " - $err" if $err;
            $self->log('err', $msg);
        }
    } else {
        # $self->log('err', $err) if $err; # for debugging
    }

    return $got_lock;
}
264
# Try to get the lock named 'ha_manager_lock'.
# Returns the result of $self->get_pve_lock() for that lock id.
sub get_ha_manager_lock {
    my $self = shift;

    return $self->get_pve_lock("ha_manager_lock");
}
270
# Release the cluster wide manager lock by removing its lock directory.
#
# Once released another CRM may step up and get the lock, so this should
# only get called when shutting down/deactivating the current master.
#
# Returns the result of rmdir().
sub release_ha_manager_lock {
    my $self = shift;

    my $lockpath = "$lockdir/ha_manager_lock";

    return rmdir($lockpath);
}
279
# Try to get the agent lock of a node.
#
# $node - optional node name; when undefined the name returned by
#         $self->nodename() is used.
#
# Returns the result of $self->get_pve_lock() for the lock id
# 'ha_agent_<node>_lock'.
sub get_ha_agent_lock {
    my ($self, $node) = @_;

    unless (defined($node)) {
        $node = $self->nodename();
    }

    return $self->get_pve_lock("ha_agent_${node}_lock");
}
287
# Release the agent lock of the local node by removing its lock directory.
#
# This should only get called if the node's LRM gracefully shuts down with
# all services already cleanly stopped!
#
# Returns the result of rmdir().
sub release_ha_agent_lock {
    my $self = shift;

    my $node = $self->nodename();
    my $lockpath = "$lockdir/ha_agent_${node}_lock";

    return rmdir($lockpath);
}
298
# Report the quorum state.
#
# Returns the result of PVE::Cluster::check_cfs_quorum(), or 0 if that
# call raises an error (the error itself is discarded).
sub quorate {
    my $self = shift;

    my $state = 0;

    eval { $state = PVE::Cluster::check_cfs_quorum(); };

    return $state;
}
309
# Returns the current time as reported by the builtin time().
sub get_time {
    my $self = shift;

    my $now = time();

    return $now;
}
315
# Block for $delay seconds using the builtin sleep.
# Returns the result of CORE::sleep().
sub sleep {
    my $self = shift;
    my ($delay) = @_;

    return CORE::sleep($delay);
}
321
# Block until the wall clock reaches $end_time.
#
# Sleeps through $self->sleep() in one second steps and re-checks the
# current time after each step. Returns immediately if $end_time is not
# in the future.
sub sleep_until {
    my ($self, $end_time) = @_;

    while (time() < $end_time) {
        $self->sleep(1);
    }
}
333
# Hook run at the start of each loop iteration.
#
# Calls PVE::Cluster::cfs_update() and then stores the current time
# ($self->get_time()) in $self->{loop_start}.
sub loop_start_hook {
    my $self = shift;

    PVE::Cluster::cfs_update();

    my $started = $self->get_time();

    return $self->{loop_start} = $started;
}
341
# Hook run at the end of each loop iteration.
#
# Computes the time elapsed since $self->{loop_start} and emits a warning
# if the iteration took more than 30 seconds.
sub loop_end_hook {
    my $self = shift;

    my $now = $self->get_time();
    my $delay = $now - $self->{loop_start};

    warn "loop take too long ($delay seconds)\n" if $delay > 30;
}
349
# Handle of the open watchdog socket; undefined while no watchdog
# connection is open.
my $watchdog_fh;

# Open the watchdog by connecting to the '/run/watchdog-mux.sock' unix
# stream socket.
#
# Dies if the watchdog is already open or if the connection cannot be
# established; logs "watchdog active" on success.
sub watchdog_open {
    my $self = shift;

    if (defined($watchdog_fh)) {
        die "watchdog already open\n";
    }

    my $sock = IO::Socket::UNIX->new(
        Type => SOCK_STREAM(),
        Peer => "/run/watchdog-mux.sock",
    );
    die "unable to open watchdog socket - $!\n" if !$sock;

    $watchdog_fh = $sock;

    $self->log('info', "watchdog active");
}
364
# Update the watchdog by writing a single NUL byte to the watchdog
# socket.
#
# $wfh is accepted for interface compatibility but not used; the module
# level watchdog handle is written to instead.
#
# Returns 1 if exactly one byte was written, otherwise logs an error and
# returns 0.
sub watchdog_update {
    my ($self, $wfh) = @_;

    my $res = $watchdog_fh->syswrite("\0", 1);

    if (defined($res) && $res == 1) {
        return 1;
    }

    if (defined($res)) {
        # the write itself succeeded, but not with the expected length
        $self->log('err', "watchdog update failed - write $res bytes\n");
    } else {
        $self->log('err', "watchdog update failed - $!\n");
    }

    return 0;
}
380
# Close the watchdog connection.
#
# Writes the character "V" (magic watchdog close) to the watchdog socket
# and then closes it. On success the module level handle is reset to
# undef and an info message is logged; if closing fails an error is
# logged and the handle is kept.
#
# $wfh is accepted for interface compatibility but not used.
sub watchdog_close {
    my ($self, $wfh) = @_;

    $watchdog_fh->syswrite("V", 1); # magic watchdog close

    if ($watchdog_fh->close()) {
        $watchdog_fh = undef;
        $self->log('info', "watchdog closed (disabled)");
    } else {
        $self->log('err', "watchdog close failed - $!");
    }
}
392
# Hook to run in a child process right after fork.
#
# Closes the inotify file descriptor inherited from the parent, opens a
# fresh one for this process and then calls PVE::Cluster::cfs_update().
sub after_fork {
    my $self = shift;

    # close inherited inotify FD from parent and reopen our own
    PVE::INotify::inotify_close();
    PVE::INotify::inotify_init();

    return PVE::Cluster::cfs_update();
}
402
# Maximum number of workers.
#
# Returns the 'max_workers' value read from 'datacenter.cfg', or 4 if
# that value is unset or false.
sub get_max_workers {
    my $self = shift;

    my $dc_conf = cfs_read_file('datacenter.cfg');
    my $configured = $dc_conf->{max_workers};

    return $configured || 4;
}
410
411 1;