]>
Commit | Line | Data |
---|---|---|
aff192e6 DM |
1 | #!/usr/bin/perl -w |
2 | ||
3 | use strict; | |
4 | use PVE::SafeSyslog; | |
5 | use POSIX ":sys_wait_h"; | |
6 | use Fcntl ':flock'; | |
7 | use Getopt::Long; | |
8 | use Time::HiRes qw (gettimeofday); | |
f9d4fc64 | 9 | use PVE::Tools qw(dir_glob_foreach file_read_firstline); |
aff192e6 DM |
10 | use PVE::ProcFSTools; |
11 | use Filesys::Df; | |
12 | use PVE::INotify; | |
13 | use PVE::Cluster qw(cfs_read_file); | |
14 | use PVE::Storage; | |
15 | use PVE::QemuServer; | |
b3409356 | 16 | use PVE::OpenVZ; |
aff192e6 | 17 | use PVE::RPCEnvironment; |
16b69b6c | 18 | use PVE::API2::Subscription; |
aff192e6 DM |
19 | |
# Route every Perl warning to syslog. The handler saves and restores $@
# so that logging a warning never clobbers a pending eval error.
$SIG{'__WARN__'} = sub {
    my ($text) = @_;
    my $saved_error = $@;
    chomp $text;
    syslog('warning', "WARNING: %s", $text);
    $@ = $saved_error;
};

initlog('pvestatd');

# use a fixed search path for external commands
$ENV{'PATH'} = '/sbin:/bin:/usr/sbin:/usr/bin';

# the effective uid has to be 0
if ($> != 0) {
    die "please run as root\n";
}

my $nodename = PVE::INotify::nodename();

# --debug is the only supported command line switch
my $opt_debug;

GetOptions('debug' => \$opt_debug)
    or die "USAGE: $0 [--debug]\n";

my $opt_pidfile = "/var/run/pvestatd.pid";
43 | ||
# Take an exclusive, non-blocking lock on "<pidfile>.lock".
#
# Parameters:
#   $pidfile - path of the pid file; the lock file name is derived from it
#
# Logs to syslog and dies with "ERROR: ..." when the lock file cannot be
# opened or the lock cannot be taken.
#
# FLCK is deliberately a package-level handle that is left open on
# success: the lock belongs to the open handle, so it has to outlive
# this sub.
sub lockpidfile {
    my $pidfile = shift;
    my $lkfn = "$pidfile.lock";

    # three-argument open: the file name is never parsed as a mode
    if (!open(FLCK, '>>', $lkfn)) {
        my $msg = "can't aquire lock on file '$lkfn' - $!";
        # pass the text as an argument, not as the format string
        syslog('err', "%s", $msg);
        die "ERROR: $msg\n";
    }

    if (!flock(FLCK, LOCK_EX|LOCK_NB)) {
        # build the message before close(), which may overwrite $!
        my $msg = "can't aquire lock '$lkfn' - $!";
        close(FLCK);
        syslog('err', "%s", $msg);
        die "ERROR: $msg\n";
    }

    return;
}
61 | ||
# Write the pid of the current process, followed by a newline, to $pidfile.
# An existing file is truncated.
#
# Parameters:
#   $pidfile - path of the pid file
#
# Logs to syslog and dies with "ERROR: ..." when the file cannot be opened
# or the pid cannot be written completely.
sub writepidfile {
    my $pidfile = shift;

    my $fail = sub {
        my ($msg) = @_;
        # pass the text as an argument, not as the format string
        syslog('err', "%s", $msg);
        die "ERROR: $msg\n";
    };

    # lexical handle and three-argument open
    open(my $fh, '>', $pidfile)
        or $fail->("can't open pid file '$pidfile' - $!");

    # buffered write errors only show up at print/close time
    print {$fh} "$$\n"
        or $fail->("can't write pid file '$pidfile' - $!");
    close($fh)
        or $fail->("can't write pid file '$pidfile' - $!");

    return;
}
73 | ||
# try to get the lock
lockpidfile($opt_pidfile);

# run in background
my $spid;

# set by restart_server() before it re-executes the daemon
my $restart = $ENV{RESTART_PVESTATD};

if (!$opt_debug) {
    # 'or' instead of '||': '||' binds to the preceding string (which is
    # always true), so a failed open was silently ignored
    open(STDIN, '<', '/dev/null') or die "can't read /dev/null";
    open(STDOUT, '>', '/dev/null') or die "can't write /dev/null";
}

# no fork in debug mode or after a restart
if (!$restart && !$opt_debug) {
    $spid = fork();
    if (!defined ($spid)) {
        my $msg = "can't put server into background - fork failed";
        syslog('err', "%s", $msg);
        die "ERROR: $msg\n";
    } elsif ($spid) { #parent
        exit (0);
    }
}

# written after the fork, so the file contains the pid of the daemon
writepidfile($opt_pidfile);

# send STDERR to wherever STDOUT points
open(STDERR, '>&', \*STDOUT) or die "can't close STDERR\n";
101 | ||
# Remove the lock file and the pid file of this daemon.
sub cleanup {
    my $lockfile = "$opt_pidfile.lock";

    unlink($lockfile);
    unlink($opt_pidfile);
}
106 | ||
# Common handler for INT, TERM and QUIT: log, collect children, remove
# the pid and lock files and exit with status 0.
my $shutdown_handler = sub {
    syslog('info' , "server closing");

    # from now on INT gets its default action
    $SIG{INT} = 'DEFAULT';

    # collect children that have already exited (WNOHANG does not block)
    while (waitpid(-1, POSIX::WNOHANG()) > 0) {
        # nothing else to do
    }

    cleanup();

    exit (0);
};

$SIG{$_} = $shutdown_handler for qw(QUIT TERM INT);
119 | ||
PVE::INotify::inotify_init();

# set by the HUP handler; the main loop stops waiting when it is true
my $reload_config;

# $restart is true when the daemon was re-executed by restart_server()
syslog('info' , $restart ? "restarting server" : "starting server");

$SIG{HUP} = sub { $reload_config = 1; };
133 | ||
# Collect load, cpu, memory, swap, root file system and network counters
# of this node and broadcast them as one colon separated record under
# the key "pve2-node/<nodename>".
sub update_node_status {

    # only the first value returned by read_loadavg() is reported
    my ($load1) = PVE::ProcFSTools::read_loadavg();

    my $cpustat = PVE::ProcFSTools::read_proc_stat();

    my $ifstat = PVE::ProcFSTools::read_proc_net_dev();

    my ($uptime) = PVE::ProcFSTools::read_proc_uptime();

    my $cpuinfo = PVE::ProcFSTools::read_cpuinfo();
    my $ncpus = $cpuinfo->{cpus};

    my $subinfo = PVE::INotify::read_file('subscription');
    my $sublevel = $subinfo->{level} || '';

    # traffic from/to physical interface cards
    my ($netin, $netout) = (0, 0);
    for my $dev (grep { $_ =~ m/^eth\d+$/ } keys %$ifstat) {
        my $counters = $ifstat->{$dev};
        $netin += $counters->{receive};
        $netout += $counters->{transmit};
    }

    my $meminfo = PVE::ProcFSTools::read_meminfo();

    my $rootfs = df('/', 1); # output is bytes

    my $ctime = time();

    # everything not free is considered to be used
    my $rootused = $rootfs->{blocks} - $rootfs->{bfree};

    my $data = "$uptime:$sublevel:$ctime:$load1:$ncpus:" .
        "$cpustat->{cpu}:$cpustat->{wait}:" .
        "$meminfo->{memtotal}:$meminfo->{memused}:" .
        "$meminfo->{swaptotal}:$meminfo->{swapused}:" .
        "$rootfs->{blocks}:$rootused:$netin:$netout";

    PVE::Cluster::broadcast_rrd("pve2-node/$nodename", $data);
}
176 | ||
0883a378 DM |
# Adjust the balloon target of running VMs so that the host moves towards
# about 80% memory usage.
#
# $vmstatus is a hash reference keyed by VMID. The fields balloon,
# balloon_min, maxmem, freemem and shares of each entry are read. VMs
# without a true 'balloon' or 'balloon_min' value are ignored.
#
# A single call changes the balloon of a VM by at most $maxchange bytes.
# New targets are sent with the "balloon" monitor command; a failing
# command only produces a warning.
sub auto_balloning {
    my ($vmstatus) = @_;

    # debug output, only printed when running with --debug
    my $log = sub {
        print @_ if $opt_debug;
    };

    my $hostmeminfo = PVE::ProcFSTools::read_meminfo();

    # to debug, run 'pvestatd -d' and set memtotal here
    #$hostmeminfo->{memtotal} = int(3*1024*1024*1024/0.8); # you can set this to test

    my $hostfreemem = $hostmeminfo->{memtotal} - $hostmeminfo->{memused};

    # we try to use about 80% host memory
    # goal: we want to change memory usage by this amount (positive or negative)
    my $goal = int($hostmeminfo->{memtotal}*0.8 - $hostmeminfo->{memused});

    $log->("host goal: $goal free: $hostfreemem total: $hostmeminfo->{memtotal}\n");

    # upper limit for one balloon change of one VM
    my $maxchange = 100*1024*1024;

    # Returns, for the given list of VMIDs:
    #   sum of shares, sum of 1/shares,
    #   bytes ballooned above balloon_min, bytes between balloon and maxmem
    # A missing or zero 'shares' value counts as 1000.
    my $get_summary = sub {
        my ($vmids) = @_;

        my ($shares, $freeshares, $alloc, $free) = (0, 0, 0, 0);
        for my $vmid (@$vmids) {
            my $d = $vmstatus->{$vmid};
            my $vmshares = $d->{shares} || 1000;
            $shares += $vmshares;
            $freeshares += 1/$vmshares;
            # just to be sure - never add negative amounts
            $alloc += $d->{balloon} - $d->{balloon_min}
                if $d->{balloon} > $d->{balloon_min};
            $free += $d->{maxmem} - $d->{balloon}
                if $d->{maxmem} > $d->{balloon};
        }
        return ($shares, $freeshares, $alloc, $free);
    };

    # Store a larger balloon target in $targets for the VMs in $vmids and
    # return the sum of all requested increases.
    my $grow_func = sub {
        my ($targets, $vmids, $bytes) = @_;

        my $grown = 0;
        # the weights used here are the 1/shares values (second summary value)
        my (undef, $weight_total, undef, $free_total) = $get_summary->($vmids);
        return $grown if !$weight_total;

        $log->("grow $goal\n");

        my $target = 0;
        $target = $free_total - $bytes if $bytes < $free_total;
        $log->("shares_total: $weight_total\n");
        $log->("free_total: $free_total\n");
        $log->("target: $target\n");

        for my $vmid (@$vmids) {
            my $d = $vmstatus->{$vmid};
            my $weight = 1/($d->{shares} || 1000);
            $log->("shares $vmid: $weight\n");
            next if $weight < 0; # just to be sure

            my $max = $d->{maxmem} - int(($target/$weight_total)*$weight);
            $max = $d->{balloon_min} if $max < $d->{balloon_min};

            my $new = $d->{balloon} + $maxchange;
            my $balloon = $new;
            $balloon = $max if $new > $max;

            my $diff = $balloon - $d->{balloon};
            next if !($diff > 0);

            $targets->{$vmid} = $balloon;
            $grown += $diff;
            $log->("grow request for $vmid ($targets->{$vmid}, $diff, $max, $new)\n");
        }
        return $grown;
    };

    my $idlist = []; # list of VMs with working balloon driver
    my $idlist1 = []; # list of VMs with memory pressure
    my $idlist2 = []; # list of VMs with enough free memory

    for my $vmid (keys %$vmstatus) {
        my $d = $vmstatus->{$vmid};
        next if !$d->{balloon}; # skip if balloon driver not running
        next if !$d->{balloon_min}; # skip if balloon value not set in config

        push @$idlist, $vmid;

        my $enough_free = ($goal > 0) && $d->{freemem} &&
            ($d->{freemem} > $d->{maxmem}*0.25) &&
            ($d->{balloon} >= $d->{balloon_min});

        my ($list, $tag) = $enough_free ? ($idlist2, 'idlist2') : ($idlist1, 'idlist1');
        push @$list, $vmid;
        $log->("$tag $vmid $d->{balloon}, $d->{balloon_min}, $d->{freemem}\n");
    }

    my $res = {};

    if ($goal > 10*1024*1024) {
        $log->("grow request $goal\n");
        # we prioritize VMs with memory pressure
        $grow_func->($res, $idlist1, $goal)
            or $grow_func->($res, $idlist2, $goal);
    } elsif ($goal < -10*1024*1024) {
        $log->("shrink request $goal\n");
        my ($shares_total, undef, $alloc_old) = $get_summary->($idlist);
        my $alloc_new = $alloc_old + $goal;
        $alloc_new = 0 if $alloc_new < 0;
        $log->("shares_total: $shares_total $alloc_new\n");

        for my $vmid (@$idlist) {
            my $d = $vmstatus->{$vmid};
            my $shares = $d->{shares} || 1000;
            next if $shares < 0; # just to be sure
            my $min = $d->{balloon_min} + int(($alloc_new/$shares_total)*$shares);
            my $new = $d->{balloon} - $maxchange;
            $res->{$vmid} = $min;
            $res->{$vmid} = $new if $new > $min;
        }
    } else {
        $log->("do nothing\n");
        # do nothing - requested change too small
    }

    # apply all targets that differ from the current balloon value
    for my $vmid (@$idlist) {
        next if !$res->{$vmid};
        my $d = $vmstatus->{$vmid};
        my $diff = int($res->{$vmid} - $d->{balloon});
        next if !(abs($diff) > 0);

        $log->("BALLOON $vmid to $res->{$vmid} ($diff)\n");
        eval {
            PVE::QemuServer::vm_mon_cmd($vmid, "balloon",
                                        value => int($res->{$vmid}));
        };
        warn $@ if $@;
    }
}
318 | ||
aff192e6 DM |
# Run auto ballooning and broadcast one colon separated status record per
# QEMU VM under the key "pve2-vm/<vmid>". An error raised by the
# ballooning code is logged and does not stop the status update.
sub update_qemu_status {

    my $ctime = time();

    my $vmstatus = PVE::QemuServer::vmstatus(undef, 1);

    eval { auto_balloning($vmstatus); };
    if ($@) {
        syslog('err', "auto ballooning error: $@");
    }

    for my $vmid (keys %$vmstatus) {
        my $d = $vmstatus->{$vmid};

        # a VM with a pid is running; for other VMs the runtime fields
        # stay empty
        my $data = $d->{pid}
            ? "$d->{uptime}:$d->{name}:$ctime:$d->{cpus}:$d->{cpu}:" .
              "$d->{maxmem}:$d->{mem}:$d->{maxdisk}:$d->{disk}:" .
              "$d->{netin}:$d->{netout}:$d->{diskread}:$d->{diskwrite}"
            : "0:$d->{name}:$ctime:$d->{cpus}::$d->{maxmem}::" .
              "$d->{maxdisk}:$d->{disk}::::";

        PVE::Cluster::broadcast_rrd("pve2-vm/$vmid", $data);
    }
}
346 | ||
f9d4fc64 DM |
# Scan /proc for processes whose command line is exactly
# "vzctl console <vmid>" (vzctl optionally prefixed with /usr/sbin/).
#
# Returns a hash reference that maps each VMID to an array reference with
# the pids of the matching processes.
sub find_vzctl_console_pids {

    my $pids_of = {};

    my $check_process = sub {
        my ($pid) = @_;

        my $cmdline = file_read_firstline("/proc/$pid/cmdline");
        return if !$cmdline;

        # arguments are separated by NUL bytes
        my @args = split(/\0/, $cmdline);

        # search for vzctl console <vmid>
        return if scalar(@args) != 3;

        my ($cmd, $action, $vmid) = @args;
        return if $action ne 'console';
        return if $vmid !~ m/^\d+$/;
        return if $cmd !~ m|^(/usr/sbin/)?vzctl$|;

        push @{$pids_of->{$vmid}}, $pid;
    };

    dir_glob_foreach('/proc', '\d+', $check_process);

    return $pids_of;
}
# Kill "vzctl console" processes whose VMID has no entry in the OpenVZ
# status list. Each removal is logged once per VMID; the processes get
# signal 9.
sub remove_stale_openvz_consoles {

    my $vmstatus = PVE::OpenVZ::vmstatus();
    my $pidhash = find_vzctl_console_pids();

    for my $vmid (keys %$pidhash) {
        # the container still exists - keep its consoles
        next if defined($vmstatus->{$vmid});

        syslog('info', "remove stale vzctl console for CT $vmid");
        kill(9, $_) for @{$pidhash->{$vmid}};
    }
}
385 | ||
b3409356 DM |
# Broadcast one colon separated status record per OpenVZ container under
# the key "pve2-vm/<vmid>".
sub update_openvz_status {

    my $ctime = time();

    my $vmstatus = PVE::OpenVZ::vmstatus();

    for my $vmid (keys %$vmstatus) {
        my $d = $vmstatus->{$vmid};

        # the runtime fields stay empty unless the container is running
        my $data = $d->{status} eq 'running'
            ? "$d->{uptime}:$d->{name}:$ctime:$d->{cpus}:$d->{cpu}:" .
              "$d->{maxmem}:$d->{mem}:$d->{maxdisk}:$d->{disk}:" .
              "$d->{netin}:$d->{netout}:$d->{diskread}:$d->{diskwrite}"
            : "0:$d->{name}:$ctime:$d->{cpus}::$d->{maxmem}::" .
              "$d->{maxdisk}:$d->{disk}::::";

        PVE::Cluster::broadcast_rrd("pve2-vm/$vmid", $data);
    }
}
410 | ||
aff192e6 DM |
# Broadcast "<time>:<total>:<used>" for every active storage under the
# key "pve2-storage/<nodename>/<storeid>".
sub update_storage_status {

    my $cfg = cfs_read_file("storage.cfg");

    my $ctime = time();

    my $info = PVE::Storage::storage_info($cfg);

    for my $storeid (keys %$info) {
        my $d = $info->{$storeid};

        # storages that are not active are not reported
        if (!$d->{active}) {
            next;
        }

        # everything not free is considered to be used
        my $realused = $d->{total} - $d->{avail};

        PVE::Cluster::broadcast_rrd(
            "pve2-storage/${nodename}/$storeid",
            "$ctime:$d->{total}:$realused");
    }
}
432 | ||
# Run all status collectors once. Every step runs inside its own eval, so
# a failing step is logged and the remaining steps still run.
#
# Error texts are passed to syslog() as arguments of a "%s" format. They
# are never used as the format string itself, so a '%' inside an error
# message cannot be taken for a conversion.
sub update_status {

    # update worker list. This is not really required and
    # we just call this to make sure that we have a correct
    # list in case of an unexpected crash.
    eval {
        my $tlist = PVE::RPCEnvironment::active_workers();
        PVE::Cluster::broadcast_tasklist($tlist);
    };
    my $err = $@;
    syslog('err', "%s", $err) if $err;

    eval {
        update_node_status();
    };
    $err = $@;
    syslog('err', "node status update error: %s", $err) if $err;

    eval {
        update_qemu_status();
    };
    $err = $@;
    syslog('err', "qemu status update error: %s", $err) if $err;

    eval {
        update_openvz_status();
    };
    $err = $@;
    syslog('err', "openvz status update error: %s", $err) if $err;

    eval {
        update_storage_status();
    };
    $err = $@;
    syslog('err', "storage status update error: %s", $err) if $err;

    eval {
        remove_stale_openvz_consoles();
    };
    $err = $@;
    syslog('err', "openvz console cleanup error: %s", $err) if $err;
}
475 | ||
# time() value at which the next status update is due
my $next_update = 0;

# do not update directly after startup, because install scripts
# have a problem with that (the main loop skips the update while
# $cycle is 0)
my $cycle = 0;

# seconds between two status updates
my $updatetime = 10;

# remember how we were started, restart_server() needs it
my $commandline = [ $0 ];
push @$commandline, @ARGV;

$0 = "pvestatd";
486 | ||
# Replace the running daemon with a fresh instance of itself.
#
# Parameters:
#   $waittime - optional number of seconds to sleep before the restart
#
# Sets RESTART_PVESTATD in the environment and re-executes the saved
# command line. Never returns: if exec fails, the error is logged and
# the process exits with status -1.
sub restart_server {
    my $waittime = shift;

    syslog('info', "server shutdown (restart)");

    $ENV{RESTART_PVESTATD} = 1;

    sleep($waittime) if $waittime; # avoid high server load due to restarts

    # The indirect object form always runs the program directly. A plain
    # exec() with a single-element list may hand the string to the shell.
    my $program = $commandline->[0];
    exec { $program } @$commandline
        or syslog('err', "unable to restart server - exec '%s' failed: %s", $program, "$!");

    exit (-1); # only reached when exec failed
}
499 | ||
350b3b46 DM |
# resident memory size measured during the first cycles; used as the
# baseline for the restart check below
my $initial_memory_usage;

# Main loop: run a status update every $updatetime seconds, restart the
# daemon when its resident memory has grown by more than 5MB, and restart
# it (after a 5 second pause) on any unexpected error.
#
# Error texts are passed to syslog() as "%s" arguments, never as the
# format string.
for (;;) { # forever

    eval {
        $next_update = time() + $updatetime;

        # no update in the very first cycle
        if ($cycle) {
            my ($ccsec, $cusec) = gettimeofday ();
            eval {
                $reload_config = 0;
                # syslog('info', "start status update");
                PVE::Cluster::cfs_update();
                update_status();
            };
            my $err = $@;

            if ($err) {
                syslog('err', "status update error: %s", $err);
            }

            my ($ccsec_end, $cusec_end) = gettimeofday ();
            my $cptime = ($ccsec_end-$ccsec) + ($cusec_end - $cusec)/1000000;

            # only slow updates are reported
            syslog('info', "status update time (%.3f seconds)", $cptime)
                if ($cptime > 5);
        }

        $cycle++;

        my $mem = PVE::ProcFSTools::read_memory_usage();

        # keep adjusting the baseline during the first cycles
        if (!defined($initial_memory_usage) || ($cycle < 10)) {
            $initial_memory_usage = $mem->{resident};
        } else {
            my $diff = $mem->{resident} - $initial_memory_usage;
            if ($diff > 5*1024*1024) {
                syslog ('info', "restarting server after $cycle cycles to " .
                        "reduce memory usage (free $mem->{resident} ($diff) bytes)");
                restart_server ();
            }
        }

        # wait for the next update; a HUP signal ends the wait early
        my $wcount = 0;
        while ((time() < $next_update) &&
               ($wcount < $updatetime) && # protect against time wrap
               !$reload_config) { $wcount++; sleep (1); };
    };

    my $err = $@;

    if ($err) {
        syslog ('err', "ERROR: %s", $err);
        restart_server(5);
        exit (0);
    }
}

exit (0);
559 | ||
560 | __END__ | |
561 | ||
562 | =head1 NAME | |
563 | ||
564 | pvestatd - PVE Status Daemon | |
565 | ||
566 | =head1 SYNOPSIS | |
567 | ||
568 | pvestatd | |
569 | ||
570 | =head1 DESCRIPTION | |
571 | ||
572 | Documentation is available at www.proxmox.com | |
573 | ||
574 | ||
575 | ||
576 | ||
577 |