]> git.proxmox.com Git - pve-manager.git/blame - PVE/Service/pvestatd.pm
rebalance_lxc_containers: make it work with new lxc/<ID>/ns subgroup
[pve-manager.git] / PVE / Service / pvestatd.pm
CommitLineData
efd04666
DM
1package PVE::Service::pvestatd;
2
3use strict;
4use warnings;
5
6use PVE::SafeSyslog;
7use PVE::Daemon;
8
9use Time::HiRes qw (gettimeofday);
10use PVE::Tools qw(dir_glob_foreach file_read_firstline);
11use PVE::ProcFSTools;
41db757b 12use PVE::CpuSet;
efd04666
DM
13use Filesys::Df;
14use PVE::INotify;
15use PVE::Cluster qw(cfs_read_file);
16use PVE::Storage;
17use PVE::QemuServer;
18use PVE::LXC;
41db757b 19use PVE::LXC::Config;
efd04666
DM
20use PVE::RPCEnvironment;
21use PVE::API2::Subscription;
22use PVE::AutoBalloon;
23
24use PVE::Status::Plugin;
25use PVE::Status::Graphite;
58541b94 26use PVE::Status::InfluxDB;
efd04666
DM
27
28PVE::Status::Graphite->register();
58541b94 29PVE::Status::InfluxDB->register();
efd04666
DM
30PVE::Status::Plugin->init();
31
32use base qw(PVE::Daemon);
33
# Set by init() from the daemon's command-line options; gates debug output
# (used e.g. by the auto-ballooning logger).
my $opt_debug;
# Set by hup() on SIGHUP; the main loop in run() restarts the daemon on it.
my $restart_request;

my $nodename = PVE::INotify::nodename();

# remember our own invocation so the daemon can re-exec itself on restart
my $cmdline = [$0, @ARGV];

my %daemon_options = (restart_on_error => 5, stop_wait_time => 5);
my $daemon = __PACKAGE__->new('pvestatd', $cmdline, %daemon_options);
43
# Daemon startup hook: cache the debug flag and prime the cluster
# filesystem (pmxcfs) state before the main loop starts.
sub init {
    my ($self) = @_;

    $opt_debug = $self->{debug};

    PVE::Cluster::cfs_update();
}
51
# Daemon shutdown hook: log the shutdown, reap any remaining child
# processes (non-blocking), then exit cleanly.
sub shutdown {
    my ($self) = @_;

    syslog('info', "server closing");

    # reap children until waitpid reports none left
    while (waitpid(-1, POSIX::WNOHANG()) > 0) {
        # nothing to do here - waitpid already reaped the child
    }

    $self->exit_daemon(0);
}
62
# SIGHUP handler: only flags a restart request; the main loop in run()
# acts on it at the end of the current cycle.
sub hup {
    my ($self) = @_;

    $restart_request = 1;
}
68
# Collect node-level statistics (load, cpu, memory, network, root fs),
# broadcast them as an RRD record to the cluster, and forward the same
# data to every enabled external metric plugin.
sub update_node_status {
    my ($status_cfg) = @_;

    my ($avg1, $avg5, $avg15) = PVE::ProcFSTools::read_loadavg();
    my $stat = PVE::ProcFSTools::read_proc_stat();
    my $netdev = PVE::ProcFSTools::read_proc_net_dev();
    my ($uptime) = PVE::ProcFSTools::read_proc_uptime();

    my $cpuinfo = PVE::ProcFSTools::read_cpuinfo();
    my $maxcpu = $cpuinfo->{cpus};

    my $subinfo = PVE::INotify::read_file('subscription');
    my $sublevel = $subinfo->{level} || '';

    # sum up traffic from/to physical interface cards (ethX only)
    my ($netin, $netout) = (0, 0);
    for my $dev (grep { m/^eth\d+$/ } keys %$netdev) {
        $netin += $netdev->{$dev}->{receive};
        $netout += $netdev->{$dev}->{transmit};
    }

    my $meminfo = PVE::ProcFSTools::read_meminfo();

    my $dinfo = df('/', 1); # output is bytes
    # everything not free is considered to be used
    my $dused = $dinfo->{blocks} - $dinfo->{bfree};

    my $ctime = time();

    # RRD record - field order is part of the cluster protocol, keep it
    my $data = "$uptime:$sublevel:$ctime:$avg1:$maxcpu:$stat->{cpu}:$stat->{wait}:" .
        "$meminfo->{memtotal}:$meminfo->{memused}:" .
        "$meminfo->{swaptotal}:$meminfo->{swapused}:" .
        "$dinfo->{blocks}:$dused:$netin:$netout";

    PVE::Cluster::broadcast_rrd("pve2-node/$nodename", $data);

    for my $id (keys %{$status_cfg->{ids}}) {
        my $plugin_config = $status_cfg->{ids}->{$id};
        next if $plugin_config->{disable};
        my $plugin = PVE::Status::Plugin->lookup($plugin_config->{type});

        # enrich the cpu stats with the load averages and cpu count
        $stat->{avg1} = $avg1;
        $stat->{avg5} = $avg5;
        $stat->{avg15} = $avg15;
        $stat->{cpus} = $maxcpu;

        my $d = {
            uptime => $uptime,
            cpustat => $stat,
            memory => $meminfo,
            blockstat => $dinfo,
            nics => $netdev,
        };

        $plugin->update_node_status($plugin_config, $nodename, $d, $ctime);
    }
}
131
# Automatic memory ballooning: aim for ~80% host memory usage and let
# PVE::AutoBalloon distribute the surplus/deficit across running VMs,
# then apply each computed balloon target via the QEMU monitor.
# NOTE: the sub name ("balloning") is a historic typo, but it is the
# public name used by update_qemu_status() - do not rename.
sub auto_balloning {
    my ($vmstatus) = @_;

    # debug-only logger; enabled via 'pvestatd -d'
    my $log = sub { print @_ if $opt_debug; };

    my $hostmeminfo = PVE::ProcFSTools::read_meminfo();
    # to debug, run 'pvestatd -d' and set memtotal here
    #$hostmeminfo->{memtotal} = int(2*1024*1024*1024/0.8); # you can set this to test
    my $hostfreemem = $hostmeminfo->{memtotal} - $hostmeminfo->{memused};

    # we try to use about 80% host memory
    # goal: we want to change memory usage by this amount (positive or negative)
    my $goal = int($hostmeminfo->{memtotal}*0.8 - $hostmeminfo->{memused});

    my $maxchange = 100*1024*1024; # limit change per cycle to 100 MiB
    my $res = PVE::AutoBalloon::compute_alg1($vmstatus, $goal, $maxchange);

    $log->("host goal: $goal free: $hostfreemem total: $hostmeminfo->{memtotal}\n");

    for my $vmid (keys %$vmstatus) {
        next if !$res->{$vmid};
        my $d = $vmstatus->{$vmid};
        my $diff = int($res->{$vmid} - $d->{balloon});
        if (abs($diff) > 0) {
            $log->("BALLOON $vmid to $res->{$vmid} ($diff)\n");
            eval {
                PVE::QemuServer::vm_mon_cmd(
                    $vmid, "balloon", value => int($res->{$vmid}));
            };
            warn $@ if $@;
        }
    }
}
171
# Collect per-VM (QEMU) statistics, run auto-ballooning, broadcast one
# RRD record per VM, and forward the data to enabled metric plugins.
sub update_qemu_status {
    my ($status_cfg) = @_;

    my $ctime = time();
    my $vmstatus = PVE::QemuServer::vmstatus(undef, 1);

    eval { auto_balloning($vmstatus); };
    syslog('err', "auto ballooning error: $@") if $@;

    for my $vmid (keys %$vmstatus) {
        my $d = $vmstatus->{$vmid};
        my $status = $d->{qmpstatus} || $d->{status} || 'stopped';
        my $template = $d->{template} ? $d->{template} : "0";

        # running guests report live counters; stopped guests report
        # mostly empty fields - the field layout must stay identical
        my $data;
        if ($d->{pid}) { # running
            $data = "$d->{uptime}:$d->{name}:$status:$template:" .
                "$ctime:$d->{cpus}:$d->{cpu}:" .
                "$d->{maxmem}:$d->{mem}:" .
                "$d->{maxdisk}:$d->{disk}:" .
                "$d->{netin}:$d->{netout}:" .
                "$d->{diskread}:$d->{diskwrite}";
        } else {
            $data = "0:$d->{name}:$status:$template:$ctime:$d->{cpus}::" .
                "$d->{maxmem}::" .
                "$d->{maxdisk}:$d->{disk}:" .
                ":::";
        }
        PVE::Cluster::broadcast_rrd("pve2.3-vm/$vmid", $data);

        for my $id (keys %{$status_cfg->{ids}}) {
            my $plugin_config = $status_cfg->{ids}->{$id};
            next if $plugin_config->{disable};
            my $plugin = PVE::Status::Plugin->lookup($plugin_config->{type});
            $plugin->update_qemu_status($plugin_config, $vmid, $d, $ctime);
        }
    }
}
210
# Kill lxc-console processes that belong to containers which no longer
# exist (their VMID is absent from the current container status list).
sub remove_stale_lxc_consoles {

    my $vmstatus = PVE::LXC::vmstatus();
    my $console_pids = PVE::LXC::find_lxc_console_pids();

    for my $vmid (keys %$console_pids) {
        next if defined($vmstatus->{$vmid});
        syslog('info', "remove stale lxc-console for CT $vmid");
        # SIGKILL all console processes of the vanished container at once
        kill(9, @{$console_pids->{$vmid}});
    }
}
224
# Rebalance container cpuset assignments across the host CPUs allowed
# for LXC.  For each container without a manual 'lxc.cgroup.cpuset.cpus'
# override: (1) make the cgroup cpuset match the configured core count
# (handling hot-plug/hot-unplug of cores), and (2) move containers that
# use fewer cores than available onto the least-loaded CPUs, where
# "load" is the number of containers pinned to that CPU.
sub rebalance_lxc_containers {

    return if !-d '/sys/fs/cgroup/cpuset/lxc'; # nothing to do...

    my $all_cpus = PVE::CpuSet->new_from_cgroup('lxc', 'effective_cpus');
    my @allowed_cpus = $all_cpus->members();
    my $cpucount = scalar(@allowed_cpus);
    my $max_cpuid = PVE::CpuSet::max_cpuid();

    # per-CPU count of containers currently assigned to it.
    # NOTE(review): sized $max_cpuid (indices 0..$max_cpuid-1) while the
    # guard below permits index $max_cpuid; Perl auto-extends the array,
    # so this is harmless - confirm which bound is intended.
    my @cpu_ctcount = (0) x $max_cpuid;
    my @balanced_cts; # [$vmid, $cores, $cpuset] tuples needing rebalance

    my $ctlist = PVE::LXC::config_list();

    foreach my $vmid (sort keys %$ctlist) {
        # skip containers without an active cpuset cgroup (not running)
        next if ! -d "/sys/fs/cgroup/cpuset/lxc/$vmid";

        my ($conf, $cpuset);
        eval {

            $conf = PVE::LXC::Config->load_config($vmid);

            $cpuset = PVE::CpuSet->new_from_cgroup("lxc/$vmid");
        };
        if (my $err = $@) {
            warn $err;
            next;
        }

        my @cpuset_members = $cpuset->members();

        # only manage containers without a manual cpuset override
        if (!PVE::LXC::Config->has_lxc_entry($conf, 'lxc.cgroup.cpuset.cpus')) {

            my $cores = $conf->{cores} || $cpucount;
            $cores = $cpucount if $cores > $cpucount; # clamp to host

            # see if the number of cores was hot-reduced or
            # hasn't been enacted at all yet
            my $newset = PVE::CpuSet->new();
            if ($cores < scalar(@cpuset_members)) {
                # hot-unplug: keep only the first $cores current members
                for (my $i = 0; $i < $cores; $i++) {
                    $newset->insert($cpuset_members[$i]);
                }
            } elsif ($cores > scalar(@cpuset_members)) {
                # hot-plug: keep current members and top up from the
                # allowed CPUs until $cores members are reached.
                # NOTE(review): assumes insert() returns the number of
                # newly inserted CPUs - verify against PVE::CpuSet.
                my $count = $newset->insert(@cpuset_members);
                foreach my $cpu (@allowed_cpus) {
                    $count += $newset->insert($cpu);
                    last if $count >= $cores;
                }
            } else {
                # unchanged core count - copy the current set as-is
                $newset->insert(@cpuset_members);
            }

            # Apply hot-plugged changes if any:
            if (!$newset->is_equal($cpuset)) {
                @cpuset_members = $newset->members();
                syslog('info', "detected changed cpu set for lxc/$vmid: " .
                       $newset->short_string());
                $newset->write_to_cgroup("lxc/$vmid");
            }

            # Note: no need to rebalance if we already use all cores
            push @balanced_cts, [$vmid, $cores, $newset]
                if defined($conf->{cores}) && ($cores != $cpucount);
        }

        # account this container on every CPU it occupies
        foreach my $cpu (@cpuset_members) {
            $cpu_ctcount[$cpu]++ if $cpu <= $max_cpuid;
        }
    }

    # pick a cheaper CPU from $cpulist for a container currently on $cpu.
    # The "-1" adds hysteresis: only move when the candidate is at least
    # 2 containers cheaper, so the balancer does not oscillate.
    my $find_best_cpu = sub {
        my ($cpulist, $cpu) = @_;

        my $cur_cost = $cpu_ctcount[$cpu];
        my $cur_cpu = $cpu;

        foreach my $candidate (@$cpulist) {
            my $cost = $cpu_ctcount[$candidate];
            if ($cost < ($cur_cost -1)) {
                $cur_cost = $cost;
                $cur_cpu = $candidate;
            }
        }

        return $cur_cpu;
    };

    foreach my $bct (@balanced_cts) {
        my ($vmid, $cores, $cpuset) = @$bct;

        my $newset = PVE::CpuSet->new();

        # candidate CPUs: allowed CPUs the container does not use yet
        my $rest = [];
        foreach my $cpu (@allowed_cpus) {
            next if $cpuset->has($cpu);
            push @$rest, $cpu;
        }

        # migrate each member to a better CPU if one exists, keeping the
        # global per-CPU counts up to date as we move
        my @members = $cpuset->members();
        foreach my $cpu (@members) {
            my $best = &$find_best_cpu($rest, $cpu);
            if ($best != $cpu) {
                $cpu_ctcount[$best]++;
                $cpu_ctcount[$cpu]--;
            }
            $newset->insert($best);
        }

        if (!$newset->is_equal($cpuset)) {
            syslog('info', "modified cpu set for lxc/$vmid: " .
                   $newset->short_string());
            eval {
                # allow all, so that we can set new cpuset in /ns
                $all_cpus->write_to_cgroup("lxc/$vmid");
                eval {
                    $newset->write_to_cgroup("lxc/$vmid/ns");
                };
                if (my $err = $@) {
                    warn $err;
                    # restore original
                    $cpuset->write_to_cgroup("lxc/$vmid");
                } else {
                    # also apply to container root cgroup
                    $newset->write_to_cgroup("lxc/$vmid");
                }
            };
            warn $@ if $@;
        }
    }
}
356
efd04666
DM
# Collect per-container (LXC) statistics, broadcast one RRD record per
# container, and forward the data to enabled metric plugins.
sub update_lxc_status {
    my ($status_cfg) = @_;

    my $ctime = time();
    my $vmstatus = PVE::LXC::vmstatus();

    for my $vmid (keys %$vmstatus) {
        my $d = $vmstatus->{$vmid};
        my $template = $d->{template} ? $d->{template} : "0";

        # same RRD field layout as the qemu records ("pve2.3-vm/...")
        my $data;
        if ($d->{status} eq 'running') { # running
            $data = "$d->{uptime}:$d->{name}:$d->{status}:$template:" .
                "$ctime:$d->{cpus}:$d->{cpu}:" .
                "$d->{maxmem}:$d->{mem}:" .
                "$d->{maxdisk}:$d->{disk}:" .
                "$d->{netin}:$d->{netout}:" .
                "$d->{diskread}:$d->{diskwrite}";
        } else {
            $data = "0:$d->{name}:$d->{status}:$template:$ctime:$d->{cpus}::" .
                "$d->{maxmem}::" .
                "$d->{maxdisk}:$d->{disk}:" .
                ":::";
        }
        PVE::Cluster::broadcast_rrd("pve2.3-vm/$vmid", $data);

        for my $id (keys %{$status_cfg->{ids}}) {
            my $plugin_config = $status_cfg->{ids}->{$id};
            next if $plugin_config->{disable};
            my $plugin = PVE::Status::Plugin->lookup($plugin_config->{type});
            $plugin->update_lxc_status($plugin_config, $vmid, $d, $ctime);
        }
    }
}
391
# Collect usage of every active storage, broadcast one RRD record per
# storage, and forward the data to enabled metric plugins.
sub update_storage_status {
    my ($status_cfg) = @_;

    my $cfg = PVE::Storage::config();
    my $ctime = time();
    my $info = PVE::Storage::storage_info($cfg);

    for my $storeid (keys %$info) {
        my $d = $info->{$storeid};
        next if !$d->{active}; # skip inactive/unavailable storages

        my $key = "pve2-storage/${nodename}/$storeid";
        PVE::Cluster::broadcast_rrd($key, "$ctime:$d->{total}:$d->{used}");

        for my $id (keys %{$status_cfg->{ids}}) {
            my $plugin_config = $status_cfg->{ids}->{$id};
            next if $plugin_config->{disable};
            my $plugin = PVE::Status::Plugin->lookup($plugin_config->{type});
            $plugin->update_storage_status($plugin_config, $nodename, $storeid, $d, $ctime);
        }
    }
}
418
# Run one full status update cycle: broadcast the active worker list,
# then update node, qemu, lxc, cpuset-rebalance, storage and console
# state.  Every step runs inside its own eval so a failure in one
# subsystem cannot prevent the remaining updates; errors are logged
# with the exact same messages as before.
sub update_status {

    # update worker list. This is not really required and
    # we just call this to make sure that we have a correct
    # list in case of an unexpected crash.
    eval {
        my $tlist = PVE::RPCEnvironment::active_workers();
        PVE::Cluster::broadcast_tasklist($tlist);
    };
    my $err = $@;
    syslog('err', $err) if $err;

    my $status_cfg = PVE::Cluster::cfs_read_file('status.cfg');

    # helper: run one update step, logging (never propagating) errors
    my $run_step = sub {
        my ($what, $code) = @_;
        eval { $code->(); };
        syslog('err', "$what error: $@") if $@;
    };

    $run_step->('node status update', sub { update_node_status($status_cfg) });
    $run_step->('qemu status update', sub { update_qemu_status($status_cfg) });
    $run_step->('lxc status update', sub { update_lxc_status($status_cfg) });
    $run_step->('lxc cpuset rebalance', sub { rebalance_lxc_containers() });
    $run_step->('storage status update', sub { update_storage_status($status_cfg) });
    $run_step->('lxc console cleanup', sub { remove_stale_lxc_consoles() });
}
469
# absolute time (epoch) at which the next update cycle is due
my $next_update = 0;

# do not update directly after startup, because install scripts
# have a problem with that
my $cycle = 0;
my $updatetime = 10; # seconds between status update cycles

# resident set size measured during early cycles; run() restarts the
# daemon when resident memory grows well beyond this baseline
my $initial_memory_usage;
478
# Daemon main loop: every $updatetime seconds refresh the cluster state
# and run a status update cycle.  Restarts the daemon when resident
# memory grows by more than 5 MiB over the early-cycle baseline, or
# when a SIGHUP set $restart_request.
sub run {
    my ($self) = @_;

    for (;;) { # forever

        $next_update = time() + $updatetime;

        # the very first cycle is skipped on purpose (see the comment at
        # the $cycle declaration)
        if ($cycle) {
            my ($ccsec, $cusec) = gettimeofday ();
            eval {
                # syslog('info', "start status update");
                PVE::Cluster::cfs_update();
                update_status();
            };
            my $err = $@;

            if ($err) {
                syslog('err', "status update error: $err");
            }

            my ($ccsec_end, $cusec_end) = gettimeofday ();
            my $cptime = ($ccsec_end-$ccsec) + ($cusec_end - $cusec)/1000000;

            # warn if a single update cycle took suspiciously long
            syslog('info', sprintf("status update time (%.3f seconds)", $cptime))
                if ($cptime > 5);
        }

        $cycle++;

        my $mem = PVE::ProcFSTools::read_memory_usage();

        # re-baseline during the first 10 cycles, then watch for growth
        if (!defined($initial_memory_usage) || ($cycle < 10)) {
            $initial_memory_usage = $mem->{resident};
        } else {
            my $diff = $mem->{resident} - $initial_memory_usage;
            if ($diff > 5*1024*1024) {
                # NOTE(review): the message says "free" but the value
                # logged is the resident set size - confirm wording
                syslog ('info', "restarting server after $cycle cycles to " .
                        "reduce memory usage (free $mem->{resident} ($diff) bytes)");
                $self->restart_daemon();
            }
        }

        # sleep in 1s steps so SIGHUP is noticed quickly; $wcount bounds
        # the wait in case the clock jumps backwards
        my $wcount = 0;
        while ((time() < $next_update) &&
               ($wcount < $updatetime) && # protect against time wrap
               !$restart_request) { $wcount++; sleep (1); };

        $self->restart_daemon() if $restart_request;
    }
}
529
# register the standard daemon CLI sub-commands
$daemon->register_start_command();
$daemon->register_restart_command(1);
$daemon->register_stop_command();
$daemon->register_status_command();

# command definition table consumed by the PVE CLI handler
our $cmddef = {
    start => [ __PACKAGE__, 'start', []],
    restart => [ __PACKAGE__, 'restart', []],
    stop => [ __PACKAGE__, 'stop', []],
    status => [ __PACKAGE__, 'status', [], undef, sub { print shift . "\n";} ],
};

#my $cmd = shift;
#PVE::CLIHandler::handle_cmd($cmddef, $0, $cmd, \@ARGV, undef, $0);
#exit (0);

1;
547
548__END__
549
550=head1 NAME
551
552pvestatd - PVE Status Daemon
553
554=head1 SYNOPSIS
555
556=include synopsis
557
558=head1 DESCRIPTION
559
This daemon queries the status of VMs, storages and containers at
regular intervals. The result is sent to all nodes in the cluster.
562
563=include pve_copyright
564
565
566
567
568