]> git.proxmox.com Git - pve-manager.git/blame - PVE/API2/Ceph/OSD.pm
fix #4631: ceph: osd: create: add osds-per-device
[pve-manager.git] / PVE / API2 / Ceph / OSD.pm
CommitLineData
79fa41a2
DC
1package PVE::API2::Ceph::OSD;
2
3use strict;
4use warnings;
5
6use Cwd qw(abs_path);
7use IO::File;
e907f822 8use JSON;
7783f755 9use UUID;
79fa41a2
DC
10
11use PVE::Ceph::Tools;
12use PVE::Ceph::Services;
13use PVE::CephConfig;
14use PVE::Cluster qw(cfs_read_file cfs_write_file);
15use PVE::Diskmanage;
7783f755 16use PVE::Storage::LVMPlugin;
79fa41a2
DC
17use PVE::Exception qw(raise_param_exc);
18use PVE::JSONSchema qw(get_standard_option);
a05349ab 19use PVE::INotify;
79fa41a2
DC
20use PVE::RADOS;
21use PVE::RESTHandler;
22use PVE::RPCEnvironment;
23use PVE::Tools qw(run_command file_set_contents);
3c6aa3f4 24use PVE::ProcFSTools;
05bd76ac 25use PVE::Network;
79fa41a2
DC
26
27use base qw(PVE::RESTHandler);
28
# Hostname of the local node, cached at module load time; handlers below use
# it to verify that the targeted OSD actually lives on this node.
my $nodename = PVE::INotify::nodename();
79fa41a2
DC
# Query 'osd dump' via RADOS and index its OSD entries by numeric OSD ID.
# With an $osdid argument, return only that OSD's entry (dies if the ID is
# unknown). Without one, return the whole ID => entry map; in list context
# the cluster flags string is additionally returned as a second element.
my $get_osd_status = sub {
    my ($rados, $osdid) = @_;

    my $dump = $rados->mon_command({ prefix => 'osd dump' });

    my $flags = $dump->{flags} || undef;

    my $stat_of;
    for my $entry (@{ $dump->{osds} || [] }) {
	next if !defined($entry->{osd});
	$stat_of->{$entry->{osd}} = $entry;
    }

    if (defined($osdid)) {
	die "no such OSD '$osdid'\n" if !$stat_of->{$osdid};
	return $stat_of->{$osdid};
    }

    return wantarray ? ($stat_of, $flags) : $stat_of;
};
51
# Fetch per-OSD usage statistics ('pg dump osds') and return them as a
# hashref keyed by OSD ID. On an unparseable reply, warn and return an empty
# arrayref (historical behavior, kept for compatibility with callers).
my $get_osd_usage = sub {
    my ($rados) = @_;

    my $reply = $rados->mon_command({ prefix => 'pg dump', dumpcontents => [ 'osds' ]});
    if (!($reply && ref($reply))) {
	warn "got unknown result format for 'pg dump osds' command\n";
	return [];
    }

    # Nautilus and newer wrap the OSD list in a hash under 'osd_stats'.
    my $stats = ref($reply) eq "HASH" ? $reply->{osd_stats} : $reply;

    my $usage_of = {};
    foreach my $entry (@$stats) {
	$usage_of->{$entry->{osd}} = $entry if defined($entry->{osd});
    }

    return $usage_of;
};
72
b4b39b55
TL
# Return the proportional set size (PSS) of process $pid in bytes, read from
# /proc/$pid/smaps_rollup. PSS is used instead of RSS so memory shared among
# many OSD processes is not counted multiple times.
# Returns nothing (undef in scalar context) for an undefined PID or PID <= 1;
# dies if procfs cannot be opened or no Pss line is present.
my sub get_proc_pss_from_pid {
    my ($pid) = @_;

    return if !defined($pid) || $pid <= 1; # PID 1 and below can never be an OSD

    open(my $smaps_fh, '<', "/proc/$pid/smaps_rollup")
	or die "failed to open PSS memory-stat from process - $!\n";

    my $pss_bytes;
    while (my $line = <$smaps_fh>) {
	if ($line =~ m/^Pss:\s+([0-9]+) kB$/) {
	    $pss_bytes = int($1) * 1024;
	    last;
	}
    }
    close $smaps_fh;

    return $pss_bytes if defined($pss_bytes);
    die "internal error: failed to find PSS memory-stat in procfs for PID $pid\n";
}
89
90
79fa41a2
DC
# GET / - return the Ceph OSD tree in the nested (ExtJS-style) format the GUI
# expects: CRUSH buckets become inner nodes, OSDs become leaves, each
# annotated with status, usage, metadata and per-host Ceph version info.
__PACKAGE__->register_method ({
    name => 'index',
    path => '',
    method => 'GET',
    description => "Get Ceph osd list/tree.",
    proxyto => 'node',
    protected => 1,
    permissions => {
	check => ['perm', '/', [ 'Sys.Audit', 'Datastore.Audit' ], any => 1],
    },
    parameters => {
	additionalProperties => 0,
	properties => {
	    node => get_standard_option('pve-node'),
	},
    },
    # fixme: return a list instead of extjs tree format ?
    # NOTE(review): 'items' inside an object schema is unusual (normally used
    # with arrays) — confirm this renders as intended in the API viewer.
    returns => {
	type => "object",
	items => {
	    type => "object",
	    properties => {
		flags => { type => "string" },
		root => {
		    type => "object",
		    description => "Tree with OSDs in the CRUSH map structure.",
		},
	    },
	},
    },
    code => sub {
	my ($param) = @_;

	PVE::Ceph::Tools::check_ceph_inited();

	my $rados = PVE::RADOS->new();
	# 'osd df tree' delivers the CRUSH hierarchy plus df-style usage info.
	my $res = $rados->mon_command({ prefix => 'osd df', output_method => 'tree', });

	die "no tree nodes found\n" if !($res && $res->{nodes});

	# Status ('osd dump'), usage ('pg dump osds') and 'osd metadata'
	# results, each keyed by OSD id, are merged into the tree below.
	my ($osdhash, $flags) = $get_osd_status->($rados);

	my $osd_usage = $get_osd_usage->($rados);

	my $osdmetadata_res = $rados->mon_command({ prefix => 'osd metadata' });
	my $osdmetadata = { map { $_->{id} => $_ } @$osdmetadata_res };

	my $hostversions = PVE::Ceph::Services::get_ceph_versions();

	# First pass: build one enriched node ($newnodes) per raw tree entry.
	# $nodes keeps the raw entries by id so children can be linked later.
	my $nodes = {};
	my $newnodes = {};
	foreach my $e (@{$res->{nodes}}) {
	    my ($id, $name) = $e->@{qw(id name)};

	    $nodes->{$id} = $e;

	    my $new = {
		id => $id,
		name => $name,
		type => $e->{type}
	    };

	    foreach my $opt (qw(status crush_weight reweight device_class pgs)) {
		$new->{$opt} = $e->{$opt} if defined($e->{$opt});
	    }

	    if (my $stat = $osdhash->{$id}) {
		$new->{in} = $stat->{in} if defined($stat->{in});
	    }

	    if (my $stat = $osd_usage->{$id}) {
		# 'kb' defaults to 1 to avoid a division by zero below.
		$new->{total_space} = ($stat->{kb} || 1) * 1024;
		$new->{bytes_used} = ($stat->{kb_used} || 0) * 1024;
		$new->{percent_used} = ($new->{bytes_used}*100)/$new->{total_space};
		if (my $d = $stat->{perf_stat}) {
		    $new->{commit_latency_ms} = $d->{commit_latency_ms};
		    $new->{apply_latency_ms} = $d->{apply_latency_ms};
		}
	    }

	    # OSD leaves additionally get object-store type, backing devices
	    # and the running daemon's version from 'osd metadata'.
	    my $osdmd = $osdmetadata->{$id};
	    if ($e->{type} eq 'osd' && $osdmd) {
		if ($osdmd->{bluefs}) {
		    $new->{osdtype} = 'bluestore';
		    $new->{blfsdev} = $osdmd->{bluestore_bdev_dev_node};
		    $new->{dbdev} = $osdmd->{bluefs_db_dev_node};
		    $new->{waldev} = $osdmd->{bluefs_wal_dev_node};
		} else {
		    $new->{osdtype} = 'filestore';
		}
		for my $field (qw(ceph_version ceph_version_short)) {
		    $new->{$field} = $osdmd->{$field} if $osdmd->{$field};
		}
	    }

	    $newnodes->{$id} = $new;
	}

	# Second pass: wire up parent/child links, mark leaves, record the
	# owning host name on each OSD, and attach per-host Ceph versions.
	foreach my $e (@{$res->{nodes}}) {
	    my ($id, $name) = $e->@{qw(id name)};
	    my $new = $newnodes->{$id};

	    if ($e->{children} && scalar(@{$e->{children}})) {
		$new->{children} = [];
		$new->{leaf} = 0;
		foreach my $cid (@{$e->{children}}) {
		    $nodes->{$cid}->{parent} = $id;
		    if ($nodes->{$cid}->{type} eq 'osd' && $e->{type} eq 'host') {
			$newnodes->{$cid}->{host} = $name;
		    }
		    push @{$new->{children}}, $newnodes->{$cid};
		}
	    } else {
		# Non-negative ids are OSDs (leaves); negative ids are
		# (possibly empty) CRUSH buckets.
		$new->{leaf} = ($id >= 0) ? 1 : 0;
	    }

	    if ($name && $e->{type} eq 'host') {
		$new->{version} = $hostversions->{$name}->{version}->{str};
	    }
	}

	# Entries that never got a parent assigned are CRUSH roots; they all
	# become children of a single synthetic root node for the GUI.
	my $realroots = [];
	foreach my $e (@{$res->{nodes}}) {
	    my $id = $e->{id};
	    if (!$nodes->{$id}->{parent}) {
		push @$realroots, $newnodes->{$id};
	    }
	}

	die "no root node\n" if scalar(@$realroots) < 1;

	my $data = {
	    root => {
		leaf => 0,
		children => $realroots
	    },
	};

	$data->{flags} = $flags if $flags; # we want this for the noout flag

	return $data;
    }});
233
# POST / - create a new OSD on this node.
#
# Two ceph-volume modes are used: 'lvm create' (default, allows dedicated
# DB/WAL devices with explicit sizes) and 'lvm batch' (when 'osds-per-device'
# is set, to carve several OSDs out of one fast device). Device preparation
# and the actual ceph-volume call run in a forked worker under the disk lock.
__PACKAGE__->register_method ({
    name => 'createosd',
    path => '',
    method => 'POST',
    description => "Create OSD",
    proxyto => 'node',
    protected => 1,
    parameters => {
	additionalProperties => 0,
	properties => {
	    node => get_standard_option('pve-node'),
	    dev => {
		description => "Block device name.",
		type => 'string',
	    },
	    db_dev => {
		description => "Block device name for block.db.",
		optional => 1,
		type => 'string',
	    },
	    db_dev_size => {
		description => "Size in GiB for block.db.",
		verbose_description => "If a block.db is requested but the size is not given, ".
		    "will be automatically selected by: bluestore_block_db_size from the ".
		    "ceph database (osd or global section) or config (osd or global section)".
		    "in that order. If this is not available, it will be sized 10% of the size ".
		    "of the OSD device. Fails if the available size is not enough.",
		optional => 1,
		type => 'number',
		default => 'bluestore_block_db_size or 10% of OSD size',
		requires => 'db_dev',
		minimum => 1.0,
	    },
	    wal_dev => {
		description => "Block device name for block.wal.",
		optional => 1,
		type => 'string',
	    },
	    wal_dev_size => {
		description => "Size in GiB for block.wal.",
		verbose_description => "If a block.wal is requested but the size is not given, ".
		    "will be automatically selected by: bluestore_block_wal_size from the ".
		    "ceph database (osd or global section) or config (osd or global section)".
		    "in that order. If this is not available, it will be sized 1% of the size ".
		    "of the OSD device. Fails if the available size is not enough.",
		optional => 1,
		minimum => 0.5,
		default => 'bluestore_block_wal_size or 1% of OSD size',
		requires => 'wal_dev',
		type => 'number',
	    },
	    encrypted => {
		type => 'boolean',
		optional => 1,
		default => 0,
		description => "Enables encryption of the OSD."
	    },
	    'crush-device-class' => {
		optional => 1,
		type => 'string',
		description => "Set the device class of the OSD in crush."
	    },
	    'osds-per-device' => {
		optional => 1,
		type => 'integer',
		# FIX: was the string '1'; the schema type is integer.
		minimum => 1,
		# FIX: was a single-quoted literal with broken concatenation
		# fragments ('... fast ". "NVME ...'), which leaked the quote
		# characters into the user-visible description.
		description => "OSD services per physical device. Only useful for fast ".
		    "NVME devices to utilize their performance better.",
	    },
	},
    },
    returns => { type => 'string' },
    code => sub {
	my ($param) = @_;

	my $rpcenv = PVE::RPCEnvironment::get();

	my $authuser = $rpcenv->get_user();

	# test basic requirements
	PVE::Ceph::Tools::check_ceph_inited();
	PVE::Ceph::Tools::setup_pve_symlinks();
	PVE::Ceph::Tools::check_ceph_installed('ceph_osd');
	PVE::Ceph::Tools::check_ceph_installed('ceph_volume');

	# extract parameter info and fail if a device is set more than once
	my $devs = {};

	# allow 'osds-per-device' only without dedicated db and/or wal devs. We cannot specify
	# them with 'ceph-volume lvm batch' and they don't make a lot of sense on fast NVMEs anyway.
	if ($param->{'osds-per-device'}) {
	    for my $type ( qw(db_dev wal_dev) ) {
		# FIX: typo 'canot' -> 'cannot' in the user-visible error.
		raise_param_exc({ $type => "cannot use 'osds-per-device' parameter with '${type}'" })
		    if $param->{$type};
	    }
	}

	my $ceph_conf = cfs_read_file('ceph.conf');

	# The OSD must be reachable over the cluster network (public network
	# as fallback); refuse creation if this node has no address in it.
	my $osd_network = $ceph_conf->{global}->{cluster_network};
	$osd_network //= $ceph_conf->{global}->{public_network}; # fallback

	if ($osd_network) { # check only if something is configured
	    my $cluster_net_ips = PVE::Network::get_local_ip_from_cidr($osd_network);
	    if (scalar(@$cluster_net_ips) < 1) {
		my $osd_net_obj = PVE::Network::IP_from_cidr($osd_network);
		my $osd_base_cidr = $osd_net_obj->{ip} . "/" . $osd_net_obj->{prefixlen};

		die "No address from ceph cluster network (${osd_base_cidr}) found on node '$nodename'. ".
		    "Check your network config.\n";
	    }
	}

	# Normalize dev/db_dev/wal_dev into $devs and reject using the same
	# physical device for more than one role.
	for my $type ( qw(dev db_dev wal_dev) ) {
	    next if !$param->{$type};

	    my $type_dev = PVE::Diskmanage::verify_blockdev_path($param->{$type});
	    (my $type_devname = $type_dev) =~ s|/dev/||;

	    # FIX: typo 'chose' -> 'choose' in the user-visible error.
	    raise_param_exc({ $type => "cannot choose '$type_dev' for more than one type." })
		if grep { $_->{name} eq $type_devname } values %$devs;

	    $devs->{$type} = {
		dev => $type_dev,
		name => $type_devname,
	    };

	    # Requested sizes arrive in GiB; everything below works in bytes.
	    if (my $size = $param->{"${type}_size"}) {
		$devs->{$type}->{size} = PVE::Tools::convert_size($size, 'gb' => 'b');
	    }
	}

	# Dies unless the main device is free and any DB/WAL device is either
	# unused, GPT-partitioned, or already LVM-managed.
	my $test_disk_requirements = sub {
	    my ($disklist) = @_;

	    my $dev = $devs->{dev}->{dev};
	    my $devname = $devs->{dev}->{name};
	    die "unable to get device info for '$dev'\n" if !$disklist->{$devname};
	    die "device '$dev' is already in use\n" if $disklist->{$devname}->{used};

	    for my $type ( qw(db_dev wal_dev) ) {
		my $d = $devs->{$type};
		next if !$d;
		my $name = $d->{name};
		my $info = $disklist->{$name};
		die "unable to get device info for '$d->{dev}' for type $type\n" if !$disklist->{$name};
		if (my $usage = $info->{used}) {
		    if ($usage eq 'partitions') {
			die "device '$d->{dev}' is not GPT partitioned\n" if !$info->{gpt};
		    } elsif ($usage ne 'LVM') {
			die "device '$d->{dev}' is already in use and has no LVM on it\n";
		    }
		}
	    }
	};

	# test disk requirements early
	my $devlist = [ map { $_->{name} } values %$devs ];
	my $disklist = PVE::Diskmanage::get_disks($devlist, 1, 1);
	$test_disk_requirements->($disklist);

	# get necessary ceph infos; the quorum_status call also verifies that
	# the monitors are reachable and in quorum before we touch any disk.
	my $rados = PVE::RADOS->new();
	my $monstat = $rados->mon_command({ prefix => 'quorum_status' });

	my $ceph_bootstrap_osd_keyring = PVE::Ceph::Tools::get_config('ceph_bootstrap_osd_keyring');

	# With cephx enabled, ceph-volume needs the bootstrap-osd keyring;
	# fetch (or create) it once if it is not on disk yet.
	if (! -f $ceph_bootstrap_osd_keyring && $ceph_conf->{global}->{auth_client_required} eq 'cephx') {
	    my $bindata = $rados->mon_command({
		prefix => 'auth get-or-create',
		entity => 'client.bootstrap-osd',
		caps => [
		    'mon' => 'allow profile bootstrap-osd'
		],
		format => 'plain',
	    });
	    file_set_contents($ceph_bootstrap_osd_keyring, $bindata);
	}

	# See FIXME below
	my @udev_trigger_devs = ();

	# Provision space for block.db/block.wal on $dev and return the
	# resulting 'vg/lv' (or partition path). Strategy depends on current
	# device usage: unused -> new PV/VG/LV; LVM -> new LV in an existing
	# ceph-* VG; GPT-partitioned -> append a new partition.
	my $create_part_or_lv = sub {
	    my ($dev, $size, $type) = @_;

	    $size =~ m/^(\d+)$/ or die "invalid size '$size'\n"; # untaint
	    $size = $1;

	    die "'$dev->{devpath}' is smaller than requested size '$size' bytes\n"
		if $dev->{size} < $size;

	    # sgdisk and lvcreate can only handle sizes divisible by 512b,
	    # so we round down to the nearest kb
	    $size = PVE::Tools::convert_size($size, 'b' => 'kb', 1);

	    if (!$dev->{used}) {
		# create pv,vg,lv

		my $vg = "ceph-" . UUID::uuid();
		my $lv = $type . "-" . UUID::uuid();

		PVE::Storage::LVMPlugin::lvm_create_volume_group($dev->{devpath}, $vg);
		PVE::Storage::LVMPlugin::lvcreate($vg, $lv, "${size}k");

		if (PVE::Diskmanage::is_partition($dev->{devpath})) {
		    eval { PVE::Diskmanage::change_parttype($dev->{devpath}, '8E00'); }; # Linux LVM
		    warn $@ if $@;
		}

		push @udev_trigger_devs, $dev->{devpath};

		return "$vg/$lv";

	    } elsif ($dev->{used} eq 'LVM') {
		# check pv/vg and create lv

		my $vgs = PVE::Storage::LVMPlugin::lvm_vgs(1);
		my $vg;
		for my $vgname ( sort keys %$vgs ) {
		    next if $vgname !~ /^ceph-/; # only reuse VGs created by us

		    for my $pv ( @{$vgs->{$vgname}->{pvs}} ) {
			next if $pv->{name} ne $dev->{devpath};
			$vg = $vgname;
			last;
		    }
		    last if $vg;
		}

		die "no ceph vg found on '$dev->{devpath}'\n" if !$vg;
		die "vg '$vg' has not enough free space\n" if $vgs->{$vg}->{free} < $size;

		my $lv = $type . "-" . UUID::uuid();

		PVE::Storage::LVMPlugin::lvcreate($vg, $lv, "${size}k");

		return "$vg/$lv";

	    } elsif ($dev->{used} eq 'partitions' && $dev->{gpt}) {
		# create new partition at the end; mark it with the Ceph
		# OSD-DB/OSD-WAL partition type GUIDs
		my $parttypes = {
		    'osd-db' => '30CD0809-C2B2-499C-8879-2D6B78529876',
		    'osd-wal' => '5CE17FCE-4087-4169-B7FF-056CC58473F9',
		};

		my $part = PVE::Diskmanage::append_partition($dev->{devpath}, $size * 1024);

		if (my $parttype = $parttypes->{$type}) {
		    eval { PVE::Diskmanage::change_parttype($part, $parttype); };
		    warn $@ if $@;
		}

		push @udev_trigger_devs, $part;
		return $part;
	    }

	    die "cannot use '$dev->{devpath}' for '$type'\n";
	};

	my $worker = sub {
	    my $upid = shift;

	    PVE::Diskmanage::locked_disk_action(sub {
		# update disklist and re-test requirements
		$disklist = PVE::Diskmanage::get_disks($devlist, 1, 1);
		$test_disk_requirements->($disklist);

		my $dev_class = $param->{'crush-device-class'};
		# create allows for detailed configuration of DB and WAL devices
		# batch for easy creation of multiple OSDs (per device)
		my $create_mode = $param->{'osds-per-device'} ? 'batch' : 'create';
		my $cmd = ['ceph-volume', 'lvm', $create_mode ];
		push @$cmd, '--crush-device-class', $dev_class if $dev_class;

		my $devname = $devs->{dev}->{name};
		my $devpath = $disklist->{$devname}->{devpath};
		print "create OSD on $devpath (bluestore)\n";

		push @udev_trigger_devs, $devpath;

		# Fallback DB/WAL sizes if neither the API caller nor the
		# ceph config provide one.
		my $osd_size = $disklist->{$devname}->{size};
		my $size_map = {
		    db => int($osd_size / 10), # 10% of OSD
		    wal => int($osd_size / 100), # 1% of OSD
		};

		my $sizes;
		foreach my $type ( qw(db wal) ) {
		    my $fallback_size = $size_map->{$type};
		    my $d = $devs->{"${type}_dev"};
		    next if !$d;

		    # size was not set via api, getting from config/fallback
		    if (!defined($d->{size})) {
			$sizes = PVE::Ceph::Tools::get_db_wal_sizes() if !$sizes;
			$d->{size} = $sizes->{$type} // $fallback_size;
		    }
		    print "creating block.$type on '$d->{dev}'\n";
		    my $name = $d->{name};
		    my $part_or_lv = $create_part_or_lv->($disklist->{$name}, $d->{size}, "osd-$type");

		    print "using '$part_or_lv' for block.$type\n";
		    push @$cmd, "--block.$type", $part_or_lv;
		}

		push @$cmd, '--data', $devpath if $create_mode eq 'create';
		push @$cmd, '--dmcrypt' if $param->{encrypted};

		if ($create_mode eq 'batch') {
		    push @$cmd,
			'--osds-per-device', $param->{'osds-per-device'},
			'--yes',
			'--no-auto',
			'--',
			$devpath;
		}

		# Wipe the target device right before handing it over.
		PVE::Diskmanage::wipe_blockdev($devpath);

		if (PVE::Diskmanage::is_partition($devpath)) {
		    eval { PVE::Diskmanage::change_parttype($devpath, '8E00'); }; # Linux LVM
		    warn $@ if $@;
		}

		run_command($cmd);

		# FIXME: Remove once we depend on systemd >= v249.
		# Work around udev bug https://github.com/systemd/systemd/issues/18525 to
		# ensure the udev database is updated.
		eval { run_command(['udevadm', 'trigger', @udev_trigger_devs]); };
		warn $@ if $@;
	    });
	};

	return $rpcenv->fork_worker('cephcreateosd', $devs->{dev}->{name}, $authuser, $worker);
    }});
570
e907f822
AL
# JSON schema fragment describing one device backing an OSD (data, DB or
# WAL); referenced by the 'osddetails' return schema below.
# NOTE(review): the code in 'osddetails' returns the physical disks under the
# key 'physical_device', while this schema declares 'devices' — schema and
# return value disagree; confirm which key is the intended public API.
my $OSD_DEV_RETURN_PROPS = {
    device => {
	type => 'string',
	enum => ['block', 'db', 'wal'],
	description => 'Kind of OSD device',
    },
    dev_node => {
	type => 'string',
	description => 'Device node',
    },
    devices => {
	type => 'string',
	description => 'Physical disks used',
    },
    size => {
	type => 'integer',
	description => 'Size in bytes',
    },
    support_discard => {
	type => 'boolean',
	description => 'Discard support of the physical device',
    },
    type => {
	type => 'string',
	description => 'Type of device. For example, hdd or ssd',
    },
};
598
# GET /{osdid} - static directory listing of the sub-endpoints that exist
# below a single OSD id ('metadata' and 'lv-info').
__PACKAGE__->register_method ({
    name => 'osdindex',
    path => '{osdid}',
    method => 'GET',
    permissions => { user => 'all' },
    description => "OSD index.",
    parameters => {
	additionalProperties => 0,
	properties => {
	    node => get_standard_option('pve-node'),
	    osdid => {
		description => 'OSD ID',
		type => 'integer',
	    },
	},
    },
    returns => {
	type => 'array',
	items => {
	    type => "object",
	    properties => {},
	},
	links => [ { rel => 'child', href => "{name}" } ],
    },
    code => sub {
	my ($param) = @_;

	# Fixed set of child endpoints, in their canonical order.
	return [ map { { name => $_ } } qw(metadata lv-info) ];
    }});
633
# GET /{osdid}/metadata - detailed information about one OSD: general daemon
# info from 'osd metadata', the service PID from systemd, PSS memory usage
# from procfs, and a list of backing devices (block, plus dedicated DB/WAL
# if present).
__PACKAGE__->register_method ({
    name => 'osddetails',
    path => '{osdid}/metadata',
    method => 'GET',
    description => "Get OSD details",
    proxyto => 'node',
    protected => 1,
    permissions => {
	check => ['perm', '/', [ 'Sys.Audit' ], any => 1],
    },
    parameters => {
	additionalProperties => 0,
	properties => {
	    node => get_standard_option('pve-node'),
	    osdid => {
		description => 'OSD ID',
		type => 'integer',
	    },
	},
    },
    returns => {
	type => 'object',
	properties => {
	    osd => {
		type => 'object',
		description => 'General information about the OSD',
		properties => {
		    hostname => {
			type => 'string',
			description => 'Name of the host containing the OSD.',
		    },
		    id => {
			type => 'integer',
			description => 'ID of the OSD.',
		    },
		    mem_usage => {
			type => 'integer',
			description => 'Memory usage of the OSD service.',
		    },
		    osd_data => {
			type => 'string',
			description => "Path to the OSD's data directory.",
		    },
		    osd_objectstore => {
			type => 'string',
			description => 'The type of object store used.',
		    },
		    pid => {
			type => 'integer',
			description => 'OSD process ID.',
		    },
		    version => {
			type => 'string',
			description => 'Ceph version of the OSD service.',
		    },
		    front_addr => {
			type => 'string',
			description => 'Address and port used to talk to clients and monitors.',
		    },
		    back_addr => {
			type => 'string',
			description => 'Address and port used to talk to other OSDs.',
		    },
		    hb_front_addr => {
			type => 'string',
			description => 'Heartbeat address and port for clients and monitors.',
		    },
		    hb_back_addr => {
			type => 'string',
			description => 'Heartbeat address and port for other OSDs.',
		    },
		},
	    },
	    devices => {
		type => 'array',
		description => 'Array containing data about devices',
		items => {
		    type => "object",
		    properties => $OSD_DEV_RETURN_PROPS,
		},
	    }
	}
    },
    code => sub {
	my ($param) = @_;

	PVE::Ceph::Tools::check_ceph_inited();

	my $osdid = $param->{osdid};
	my $rados = PVE::RADOS->new();
	my $metadata = $rados->mon_command({ prefix => 'osd metadata', id => int($osdid) });

	# 'osd metadata' is cluster-wide; only answer for OSDs on this node.
	die "OSD '${osdid}' does not exists on host '${nodename}'\n"
	    if $nodename ne $metadata->{hostname};

	# Extract the daemon's main PID from 'systemctl show'.
	my $pid;
	my $parser = sub {
	    my $line = shift;
	    if ($line =~ m/^MainPID=([0-9]*)$/) {
		$pid = int($1);
	    }
	};

	my $cmd = [
	    '/bin/systemctl',
	    'show',
	    "ceph-osd\@${osdid}.service",
	    '--property',
	    'MainPID',
	];
	run_command($cmd, errmsg => 'fetching OSD PID and memory usage failed', outfunc => $parser);

	# Memory usage is best-effort: report 0 if procfs cannot be read.
	my $osd_pss_memory = eval { get_proc_pss_from_pid($pid) } // 0;
	warn $@ if $@;

	my $data = {
	    osd => {
		hostname => $metadata->{hostname},
		id => $metadata->{id},
		mem_usage => $osd_pss_memory,
		osd_data => $metadata->{osd_data},
		osd_objectstore => $metadata->{osd_objectstore},
		pid => $pid,
		version => "$metadata->{ceph_version_short} ($metadata->{ceph_release})",
		front_addr => $metadata->{front_addr},
		back_addr => $metadata->{back_addr},
		hb_front_addr => $metadata->{hb_front_addr},
		hb_back_addr => $metadata->{hb_back_addr},
	    },
	};

	$data->{devices} = [];

	# Append one device record assembled from the 'osd metadata' keys
	# "<prefix>_<dev>_*" (e.g. bluestore_bdev_dev_node).
	# NOTE(review): the key 'physical_device' is not declared in
	# $OSD_DEV_RETURN_PROPS (which declares 'devices' instead) — confirm
	# which name the API is supposed to expose.
	my $get_data = sub {
	    my ($dev, $prefix, $device) = @_;
	    push (
		@{$data->{devices}},
		{
		    dev_node => $metadata->{"${prefix}_${dev}_dev_node"},
		    physical_device => $metadata->{"${prefix}_${dev}_devices"},
		    size => int($metadata->{"${prefix}_${dev}_size"}),
		    support_discard => int($metadata->{"${prefix}_${dev}_support_discard"}),
		    type => $metadata->{"${prefix}_${dev}_type"},
		    device => $device,
		}
	    );
	};

	$get_data->("bdev", "bluestore", "block");
	$get_data->("db", "bluefs", "db") if $metadata->{bluefs_dedicated_db};
	$get_data->("wal", "bluefs", "wal") if $metadata->{bluefs_dedicated_wal};

	return $data;
    }});
788
# GET /{osdid}/lv-info - LVM-level details for one of the OSD's volumes
# (block, db or wal), gathered via 'ceph-volume lvm list' plus 'lvs' for the
# creation timestamp. Only works for ceph-volume (LVM) managed OSDs.
__PACKAGE__->register_method ({
    name => 'osdvolume',
    path => '{osdid}/lv-info',
    method => 'GET',
    description => "Get OSD volume details",
    proxyto => 'node',
    protected => 1,
    permissions => {
	check => ['perm', '/', [ 'Sys.Audit' ], any => 1],
    },
    parameters => {
	additionalProperties => 0,
	properties => {
	    node => get_standard_option('pve-node'),
	    osdid => {
		description => 'OSD ID',
		type => 'integer',
	    },
	    type => {
		description => 'OSD device type',
		type => 'string',
		enum => ['block', 'db', 'wal'],
		default => 'block',
		optional => 1,
	    },
	},
    },
    returns => {
	type => 'object',
	properties => {
	    creation_time => {
		type => 'string',
		description => "Creation time as reported by `lvs`.",
	    },
	    lv_name => {
		type => 'string',
		description => 'Name of the logical volume (LV).',
	    },
	    lv_path => {
		type => 'string',
		description => 'Path to the logical volume (LV).',
	    },
	    lv_size => {
		type => 'integer',
		description => 'Size of the logical volume (LV).',
	    },
	    lv_uuid => {
		type => 'string',
		description => 'UUID of the logical volume (LV).',
	    },
	    vg_name => {
		type => 'string',
		description => 'Name of the volume group (VG).',
	    },
	},
    },
    code => sub {
	my ($param) = @_;

	PVE::Ceph::Tools::check_ceph_inited();

	my $osdid = $param->{osdid};
	my $type = $param->{type} // 'block';

	# Collect the command's stdout into $raw for JSON parsing.
	my $raw = '';
	my $parser = sub { $raw .= shift };
	my $cmd = ['/usr/sbin/ceph-volume', 'lvm', 'list', $osdid, '--format', 'json'];
	run_command($cmd, errmsg => 'listing Ceph LVM volumes failed', outfunc => $parser);

	my $result;
	if ($raw =~ m/^(\{.*\})$/s) { #untaint
	    $result = JSON::decode_json($1);
	} else {
	    die "got unexpected data from ceph-volume: '${raw}'\n";
	}
	if (!$result->{$osdid}) {
	    die "OSD '${osdid}' not found in 'ceph-volume lvm list' on node '${nodename}'.\n"
		."Maybe it was created before LVM became the default?\n";
	}

	# Re-key the OSD's volume list by volume type (block/db/wal).
	my $lv_data = { map { $_->{type} => $_ } @{$result->{$osdid}} };
	my $volume = $lv_data->{$type} || die "volume type '${type}' not found for OSD ${osdid}\n";

	# Second pass: ask lvs for the LV creation time (not part of the
	# ceph-volume output). $parser still appends to $raw.
	$raw = '';
	$cmd = ['/sbin/lvs', $volume->{lv_path}, '--reportformat', 'json', '-o', 'lv_time'];
	run_command($cmd, errmsg => 'listing logical volumes failed', outfunc => $parser);

	if ($raw =~ m/(\{.*\})$/s) { #untaint, lvs has whitespace at beginning
	    $result = JSON::decode_json($1);
	} else {
	    die "got unexpected data from lvs: '${raw}'\n";
	}

	my $data = { map { $_ => $volume->{$_} } qw(lv_name lv_path lv_uuid vg_name) };
	$data->{lv_size} = int($volume->{lv_size});

	$data->{creation_time} = @{$result->{report}}[0]->{lv}[0]->{lv_time};

	return $data;
    }});
889
220173e9
DJ
# Whether OSD $osdid is listed as a child of host $nodename in the given
# rados 'osd tree' output. The tree is passed in explicitly so this stays
# easy to unit-test. Returns a false value (0 or empty) when the tree is
# missing, the host is unknown, or the OSD is not under it; truthy otherwise.
sub osd_belongs_to_node {
    my ($tree, $nodename, $osdid) = @_;

    return 0 if !($tree && $tree->{nodes});

    # Index the host entries by name, rejecting duplicates outright.
    my %host_entries;
    for my $entry (@{$tree->{nodes}}) {
	next if !defined($entry->{type}) || $entry->{type} ne 'host';
	my $host = $entry->{name};
	die "internal error: duplicate host name found '$host'\n"
	    if $host_entries{$host};
	$host_entries{$host} = $entry;
    }

    my $osd_ids = $host_entries{$nodename}->{children};
    return 0 if !$osd_ids;

    return grep { $_ == $osdid } @$osd_ids;
}
908
79fa41a2
DC
# DELETE /{osdid} - destroy an OSD on this node. The OSD must already be
# 'out' and stopped ('down'); removal from CRUSH/auth/osd-map and the disk
# cleanup happen in a forked worker. With 'cleanup' set, backing partitions
# or PVs are wiped as well.
__PACKAGE__->register_method ({
    name => 'destroyosd',
    path => '{osdid}',
    method => 'DELETE',
    description => "Destroy OSD",
    proxyto => 'node',
    protected => 1,
    parameters => {
	additionalProperties => 0,
	properties => {
	    node => get_standard_option('pve-node'),
	    osdid => {
		description => 'OSD ID',
		type => 'integer',
	    },
	    cleanup => {
		description => "If set, we remove partition table entries.",
		type => 'boolean',
		optional => 1,
		default => 0,
	    },
	},
    },
    returns => { type => 'string' },
    code => sub {
	my ($param) = @_;

	my $rpcenv = PVE::RPCEnvironment::get();

	my $authuser = $rpcenv->get_user();

	PVE::Ceph::Tools::check_ceph_inited();

	my $osdid = $param->{osdid};
	my $cleanup = $param->{cleanup};

	my $rados = PVE::RADOS->new();

	# Refuse to act on OSDs hosted by a different cluster node.
	my $osd_belongs_to_node = osd_belongs_to_node(
	    $rados->mon_command({ prefix => 'osd tree' }),
	    $param->{node},
	    $osdid,
	);
	die "OSD osd.$osdid does not belong to node $param->{node}!"
	    if !$osd_belongs_to_node;

	# dies if osdid is unknown
	my $osdstat = $get_osd_status->($rados, $osdid);

	die "osd is in use (in == 1)\n" if $osdstat->{in};
	#&$run_ceph_cmd(['osd', 'out', $osdid]);

	die "osd is still running (up == 1)\n" if $osdstat->{up};

	my $osdsection = "osd.$osdid";

	my $worker = sub {
	    my $upid = shift;

	    # reopen with longer timeout
	    $rados = PVE::RADOS->new(timeout => PVE::Ceph::Tools::get_config('long_rados_timeout'));

	    print "destroy OSD $osdsection\n";

	    # Stopping/disabling the service is best-effort only.
	    eval {
		PVE::Ceph::Services::ceph_service_cmd('stop', $osdsection);
		PVE::Ceph::Services::ceph_service_cmd('disable', $osdsection);
	    };
	    warn $@ if $@;

	    # Remove the OSD from the cluster: CRUSH map, auth key, osd map.
	    print "Remove $osdsection from the CRUSH map\n";
	    $rados->mon_command({ prefix => "osd crush remove", name => $osdsection, format => 'plain' });

	    print "Remove the $osdsection authentication key.\n";
	    $rados->mon_command({ prefix => "auth del", entity => $osdsection, format => 'plain' });

	    print "Remove OSD $osdsection\n";
	    $rados->mon_command({ prefix => "osd rm", ids => [ $osdsection ], format => 'plain' });

	    # try to unmount from standard mount point
	    my $mountpoint = "/var/lib/ceph/osd/ceph-$osdid";

	    # See FIXME below
	    my $udev_trigger_devs = {};

	    # Wipe one partition and delete its partition-table entry.
	    my $remove_partition = sub {
		my ($part) = @_;

		return if !$part || (! -b $part );
		my $partnum = PVE::Diskmanage::get_partnum($part);
		my $devpath = PVE::Diskmanage::get_blockdev($part);

		$udev_trigger_devs->{$devpath} = 1;

		PVE::Diskmanage::wipe_blockdev($part);
		print "remove partition $part (disk '${devpath}', partnum $partnum)\n";
		eval { run_command(['/sbin/sgdisk', '-d', $partnum, "${devpath}"]); };
		warn $@ if $@;
	    };

	    my $osd_list = PVE::Ceph::Tools::ceph_volume_list();

	    if ($osd_list->{$osdid}) { # ceph-volume managed

		eval { PVE::Ceph::Tools::ceph_volume_zap($osdid, $cleanup) };
		warn $@ if $@;

		if ($cleanup) {
		    # try to remove pvs, but do not fail if it does not work
		    for my $osd_part (@{$osd_list->{$osdid}}) {
			for my $dev (@{$osd_part->{devices}}) {
			    ($dev) = ($dev =~ m|^(/dev/[-_.a-zA-Z0-9\/]+)$|); #untaint

			    eval { run_command(['/sbin/pvremove', $dev], errfunc => sub {}) };
			    warn $@ if $@;

			    $udev_trigger_devs->{$dev} = 1;
			}
		    }
		}
	    } else {
		# Legacy (pre ceph-volume) OSD: collect the partitions backing
		# it before unmounting, so they can be removed afterwards.
		my $partitions_to_remove = [];
		if ($cleanup) {
		    if (my $mp = PVE::ProcFSTools::parse_proc_mounts()) {
			foreach my $line (@$mp) {
			    my ($dev, $path, $fstype) = @$line;
			    next if !($dev && $path && $fstype);
			    next if $dev !~ m|^/dev/|;

			    if ($path eq $mountpoint) {
				abs_path($dev) =~ m|^(/.+)| or die "invalid dev: $dev\n";
				push @$partitions_to_remove, $1;
				last;
			    }
			}
		    }

		    # Follow the data-dir symlinks to the journal/block devices.
		    foreach my $path (qw(journal block block.db block.wal)) {
			abs_path("$mountpoint/$path") =~ m|^(/.+)| or die "invalid path: $path\n";
			push @$partitions_to_remove, $1;
		    }
		}

		print "Unmount OSD $osdsection from $mountpoint\n";
		eval { run_command(['/bin/umount', $mountpoint]); };
		if (my $err = $@) {
		    warn $err;
		} elsif ($cleanup) {
		    #be aware of the ceph udev rules which can remount.
		    foreach my $part (@$partitions_to_remove) {
			$remove_partition->($part);
		    }
		}
	    }

	    # FIXME: Remove once we depend on systemd >= v249.
	    # Work around udev bug https://github.com/systemd/systemd/issues/18525 to
	    # ensure the udev database is updated.
	    if ($cleanup) {
		eval { run_command(['udevadm', 'trigger', keys $udev_trigger_devs->%*]); };
		warn $@ if $@;
	    }
	};

	return $rpcenv->fork_worker('cephdestroyosd', $osdsection, $authuser, $worker);
    }});
1075
__PACKAGE__->register_method ({
    name => 'in',
    path => '{osdid}/in',
    method => 'POST',
    description => "ceph osd in",
    proxyto => 'node',
    protected => 1,
    permissions => {
	check => ['perm', '/', [ 'Sys.Modify' ]],
    },
    parameters => {
	additionalProperties => 0,
	properties => {
	    node => get_standard_option('pve-node'),
	    osdid => {
		description => 'OSD ID',
		type => 'integer',
	    },
	},
    },
    returns => { type => "null" },
    code => sub {
	my ($param) = @_;

	PVE::Ceph::Tools::check_ceph_inited();

	my $id = $param->{osdid};
	my $rados = PVE::RADOS->new();

	# bail out early if there is no such OSD
	$get_osd_status->($rados, $id);

	# mark the OSD as "in" so it receives data again
	$rados->mon_command({
	    prefix => "osd in",
	    ids => [ "osd.$id" ],
	    format => 'plain',
	});

	return undef;
    }});
1114
__PACKAGE__->register_method ({
    name => 'out',
    path => '{osdid}/out',
    method => 'POST',
    description => "ceph osd out",
    proxyto => 'node',
    protected => 1,
    permissions => {
	check => ['perm', '/', [ 'Sys.Modify' ]],
    },
    parameters => {
	additionalProperties => 0,
	properties => {
	    node => get_standard_option('pve-node'),
	    osdid => {
		description => 'OSD ID',
		type => 'integer',
	    },
	},
    },
    returns => { type => "null" },
    code => sub {
	my ($param) = @_;

	PVE::Ceph::Tools::check_ceph_inited();

	my $id = $param->{osdid};
	my $rados = PVE::RADOS->new();

	# bail out early if there is no such OSD
	$get_osd_status->($rados, $id);

	# mark the OSD as "out" so data gets migrated away from it
	$rados->mon_command({
	    prefix => "osd out",
	    ids => [ "osd.$id" ],
	    format => 'plain',
	});

	return undef;
    }});
1153
__PACKAGE__->register_method ({
    name => 'scrub',
    path => '{osdid}/scrub',
    method => 'POST',
    description => "Instruct the OSD to scrub.",
    proxyto => 'node',
    protected => 1,
    permissions => {
	check => ['perm', '/', [ 'Sys.Modify' ]],
    },
    parameters => {
	additionalProperties => 0,
	properties => {
	    node => get_standard_option('pve-node'),
	    osdid => {
		description => 'OSD ID',
		type => 'integer',
	    },
	    deep => {
		description => 'If set, instructs a deep scrub instead of a normal one.',
		type => 'boolean',
		optional => 1,
		default => 0,
	    },
	},
    },
    returns => { type => "null" },
    code => sub {
	my ($param) = @_;

	PVE::Ceph::Tools::check_ceph_inited();

	my $id = $param->{osdid};

	my $rados = PVE::RADOS->new();

	# ensure the OSD actually exists before asking it to scrub
	$get_osd_status->($rados, $id);

	# a deep scrub verifies object data, a normal one only metadata
	my $cmd = 'osd scrub';
	if ($param->{deep} // 0) {
	    $cmd = 'osd deep-scrub';
	}

	$rados->mon_command({ prefix => $cmd, who => $id });

	return undef;
    }});
1198
79fa41a2 11991;