package PVE::API2::Ceph::OSD;

use strict;
use warnings;

use Cwd qw(abs_path);
use IO::File;
use JSON;
use UUID;

use PVE::Ceph::Tools;
use PVE::Ceph::Services;
use PVE::CephConfig;
use PVE::Cluster qw(cfs_read_file cfs_write_file);
use PVE::Diskmanage;
use PVE::Storage::LVMPlugin;
use PVE::Exception qw(raise_param_exc);
use PVE::JSONSchema qw(get_standard_option);
use PVE::INotify;
use PVE::RADOS;
use PVE::RESTHandler;
use PVE::RPCEnvironment;
use PVE::Tools qw(run_command file_set_contents);
use PVE::ProcFSTools;
use PVE::Network;

use base qw(PVE::RESTHandler);

my $nodename = PVE::INotify::nodename();

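# Map the 'osd dump' output to a hash indexed by OSD id. With an $osdid
# argument, return only that OSD's stats (or die if it does not exist);
# otherwise return the whole map and, in list context, the cluster flags too.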
my $get_osd_status = sub {
    my ($rados, $osdid) = @_;

    my $stat = $rados->mon_command({ prefix => 'osd dump' });

    my $osdlist = $stat->{osds} || [];

    my $flags = $stat->{flags} || undef;

    my $osdstat;
    foreach my $d (@$osdlist) {
        $osdstat->{$d->{osd}} = $d if defined($d->{osd});
    }
    if (defined($osdid)) {
        die "no such OSD '$osdid'\n" if !$osdstat->{$osdid};
        return $osdstat->{$osdid};
    }

    return wantarray ? ($osdstat, $flags) : $osdstat;
};

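# Fetch per-OSD usage statistics via 'pg dump osds', again as a hash indexed
# by OSD id.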
my $get_osd_usage = sub {
    my ($rados) = @_;

    my $osdlist = $rados->mon_command({ prefix => 'pg dump', dumpcontents => [ 'osds' ]});
    if (!($osdlist && ref($osdlist))) {
        warn "got unknown result format for 'pg dump osds' command\n";
        return [];
    }

    if (ref($osdlist) eq "HASH") { # since nautilus
        $osdlist = $osdlist->{osd_stats};
    }

    my $osdstat = {};
    for my $d (@$osdlist) {
        $osdstat->{$d->{osd}} = $d if defined($d->{osd});
    }

    return $osdstat;
};

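# Read a process' PSS (proportional set size) from procfs. PSS divides each
# shared page's size among all processes mapping it, so summed over many OSD
# daemons it is a fairer memory measure than RSS.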
my sub get_proc_pss_from_pid {
    my ($pid) = @_;
    return if !defined($pid) || $pid <= 1;

    open (my $SMAPS_FH, '<', "/proc/$pid/smaps_rollup")
        or die "failed to open PSS memory-stat from process - $!\n";

    while (my $line = <$SMAPS_FH>) {
        if ($line =~ m/^Pss:\s+([0-9]+) kB$/) { # using PSS avoids bias with many OSDs
            close $SMAPS_FH;
            return int($1) * 1024;
        }
    }
    close $SMAPS_FH;
    die "internal error: failed to find PSS memory-stat in procfs for PID $pid\n";
}

__PACKAGE__->register_method ({
    name => 'index',
    path => '',
    method => 'GET',
    description => "Get Ceph osd list/tree.",
    proxyto => 'node',
    protected => 1,
    permissions => {
        check => ['perm', '/', [ 'Sys.Audit', 'Datastore.Audit' ], any => 1],
    },
    parameters => {
        additionalProperties => 0,
        properties => {
            node => get_standard_option('pve-node'),
        },
    },
    # fixme: return a list instead of extjs tree format ?
    returns => {
        type => "object",
        items => {
            type => "object",
            properties => {
                flags => { type => "string" },
                root => {
                    type => "object",
                    description => "Tree with OSDs in the CRUSH map structure.",
                },
            },
        },
    },
    code => sub {
        my ($param) = @_;

        PVE::Ceph::Tools::check_ceph_inited();

        my $rados = PVE::RADOS->new();
        my $res = $rados->mon_command({ prefix => 'osd df', output_method => 'tree', });

        die "no tree nodes found\n" if !($res && $res->{nodes});

        my ($osdhash, $flags) = $get_osd_status->($rados);

        my $osd_usage = $get_osd_usage->($rados);

        my $osdmetadata_res = $rados->mon_command({ prefix => 'osd metadata' });
        my $osdmetadata = { map { $_->{id} => $_ } @$osdmetadata_res };

        my $hostversions = PVE::Ceph::Services::get_ceph_versions();

        my $nodes = {};
        my $newnodes = {};
        foreach my $e (@{$res->{nodes}}) {
            my ($id, $name) = $e->@{qw(id name)};

            $nodes->{$id} = $e;

            my $new = {
                id => $id,
                name => $name,
                type => $e->{type}
            };

            foreach my $opt (qw(status crush_weight reweight device_class pgs)) {
                $new->{$opt} = $e->{$opt} if defined($e->{$opt});
            }

            if (my $stat = $osdhash->{$id}) {
                $new->{in} = $stat->{in} if defined($stat->{in});
            }

            if (my $stat = $osd_usage->{$id}) {
                $new->{total_space} = ($stat->{kb} || 1) * 1024;
                $new->{bytes_used} = ($stat->{kb_used} || 0) * 1024;
                $new->{percent_used} = ($new->{bytes_used}*100)/$new->{total_space};
                if (my $d = $stat->{perf_stat}) {
                    $new->{commit_latency_ms} = $d->{commit_latency_ms};
                    $new->{apply_latency_ms} = $d->{apply_latency_ms};
                }
            }

            my $osdmd = $osdmetadata->{$id};
            if ($e->{type} eq 'osd' && $osdmd) {
                if ($osdmd->{bluefs}) {
                    $new->{osdtype} = 'bluestore';
                    $new->{blfsdev} = $osdmd->{bluestore_bdev_dev_node};
                    $new->{dbdev} = $osdmd->{bluefs_db_dev_node};
                    $new->{waldev} = $osdmd->{bluefs_wal_dev_node};
                } else {
                    $new->{osdtype} = 'filestore';
                }
                for my $field (qw(ceph_version ceph_version_short)) {
                    $new->{$field} = $osdmd->{$field} if $osdmd->{$field};
                }
            }

            $newnodes->{$id} = $new;
        }

        foreach my $e (@{$res->{nodes}}) {
            my ($id, $name) = $e->@{qw(id name)};
            my $new = $newnodes->{$id};

            if ($e->{children} && scalar(@{$e->{children}})) {
                $new->{children} = [];
                $new->{leaf} = 0;
                foreach my $cid (@{$e->{children}}) {
                    $nodes->{$cid}->{parent} = $id;
                    if ($nodes->{$cid}->{type} eq 'osd' && $e->{type} eq 'host') {
                        $newnodes->{$cid}->{host} = $name;
                    }
                    push @{$new->{children}}, $newnodes->{$cid};
                }
            } else {
                $new->{leaf} = ($id >= 0) ? 1 : 0;
            }

            if ($name && $e->{type} eq 'host') {
                $new->{version} = $hostversions->{$name}->{version}->{str};
            }
        }

        my $realroots = [];
        foreach my $e (@{$res->{nodes}}) {
            my $id = $e->{id};
            if (!$nodes->{$id}->{parent}) {
                push @$realroots, $newnodes->{$id};
            }
        }

        die "no root node\n" if scalar(@$realroots) < 1;

        my $data = {
            root => {
                leaf => 0,
                children => $realroots
            },
        };

        $data->{flags} = $flags if $flags; # we want this for the noout flag

        return $data;
    }});

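# Example invocations via the 'pveceph' CLI wrapper of this API (device
# paths and sizes are illustrative):
#   pveceph osd create /dev/sdb --db_dev /dev/sdc --db_dev_size 64
#   pveceph osd create /dev/nvme0n1 --osds-per-device 2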
__PACKAGE__->register_method ({
    name => 'createosd',
    path => '',
    method => 'POST',
    description => "Create OSD",
    proxyto => 'node',
    protected => 1,
    parameters => {
        additionalProperties => 0,
        properties => {
            node => get_standard_option('pve-node'),
            dev => {
                description => "Block device name.",
                type => 'string',
            },
            db_dev => {
                description => "Block device name for block.db.",
                optional => 1,
                type => 'string',
            },
            db_dev_size => {
                description => "Size in GiB for block.db.",
                verbose_description => "If a block.db is requested but the size is not given, ".
                    "it will be selected automatically, in this order: bluestore_block_db_size ".
                    "from the ceph configuration database (osd or global section) or from the ".
                    "config file (osd or global section). If none of these is available, the ".
                    "block.db is sized at 10% of the OSD device. Fails if the available space ".
                    "is not enough.",
                optional => 1,
                type => 'number',
                default => 'bluestore_block_db_size or 10% of OSD size',
                requires => 'db_dev',
                minimum => 1.0,
            },
            wal_dev => {
                description => "Block device name for block.wal.",
                optional => 1,
                type => 'string',
            },
            wal_dev_size => {
                description => "Size in GiB for block.wal.",
                verbose_description => "If a block.wal is requested but the size is not given, ".
                    "it will be selected automatically, in this order: bluestore_block_wal_size ".
                    "from the ceph configuration database (osd or global section) or from the ".
                    "config file (osd or global section). If none of these is available, the ".
                    "block.wal is sized at 1% of the OSD device. Fails if the available space ".
                    "is not enough.",
                optional => 1,
                minimum => 0.5,
                default => 'bluestore_block_wal_size or 1% of OSD size',
                requires => 'wal_dev',
                type => 'number',
            },
            encrypted => {
                type => 'boolean',
                optional => 1,
                default => 0,
                description => "Enables encryption of the OSD."
            },
            'crush-device-class' => {
                optional => 1,
                type => 'string',
                description => "Set the device class of the OSD in crush."
            },
            'osds-per-device' => {
                optional => 1,
                type => 'integer',
                minimum => 1,
                description => "OSD services per physical device. Only useful for fast ".
                    "NVMe devices to utilize their performance better.",
            },
        },
    },
    returns => { type => 'string' },
    code => sub {
        my ($param) = @_;

        my $rpcenv = PVE::RPCEnvironment::get();

        my $authuser = $rpcenv->get_user();

        # test basic requirements
        PVE::Ceph::Tools::check_ceph_inited();
        PVE::Ceph::Tools::setup_pve_symlinks();
        PVE::Ceph::Tools::check_ceph_installed('ceph_osd');
        PVE::Ceph::Tools::check_ceph_installed('ceph_volume');

        # extract parameter info and fail if a device is set more than once
        my $devs = {};

        # Allow 'osds-per-device' only without dedicated db and/or wal devices. We cannot
        # specify them with 'ceph-volume lvm batch' and they don't make a lot of sense on
        # fast NVMes anyway.
        if ($param->{'osds-per-device'}) {
            for my $type ( qw(db_dev wal_dev) ) {
                raise_param_exc({ $type => "cannot use 'osds-per-device' parameter with '${type}'" })
                    if $param->{$type};
            }
        }

        my $ceph_conf = cfs_read_file('ceph.conf');

        my $osd_network = $ceph_conf->{global}->{cluster_network};
        $osd_network //= $ceph_conf->{global}->{public_network}; # fallback

        if ($osd_network) { # check only if something is configured
            my $cluster_net_ips = PVE::Network::get_local_ip_from_cidr($osd_network);
            if (scalar(@$cluster_net_ips) < 1) {
                my $osd_net_obj = PVE::Network::IP_from_cidr($osd_network);
                my $osd_base_cidr = $osd_net_obj->{ip} . "/" . $osd_net_obj->{prefixlen};

                die "No address from ceph cluster network (${osd_base_cidr}) found on node '$nodename'. ".
                    "Check your network config.\n";
            }
        }

        for my $type ( qw(dev db_dev wal_dev) ) {
            next if !$param->{$type};

            my $type_dev = PVE::Diskmanage::verify_blockdev_path($param->{$type});
            (my $type_devname = $type_dev) =~ s|/dev/||;

            raise_param_exc({ $type => "cannot choose '$type_dev' for more than one type." })
                if grep { $_->{name} eq $type_devname } values %$devs;

            $devs->{$type} = {
                dev => $type_dev,
                name => $type_devname,
            };

            if (my $size = $param->{"${type}_size"}) {
                $devs->{$type}->{size} = PVE::Tools::convert_size($size, 'gb' => 'b');
            }
        }

        my $test_disk_requirements = sub {
            my ($disklist) = @_;

            my $dev = $devs->{dev}->{dev};
            my $devname = $devs->{dev}->{name};
            die "unable to get device info for '$dev'\n" if !$disklist->{$devname};
            die "device '$dev' is already in use\n" if $disklist->{$devname}->{used};

            for my $type ( qw(db_dev wal_dev) ) {
                my $d = $devs->{$type};
                next if !$d;
                my $name = $d->{name};
                my $info = $disklist->{$name};
                die "unable to get device info for '$d->{dev}' for type $type\n" if !$disklist->{$name};
                if (my $usage = $info->{used}) {
                    if ($usage eq 'partitions') {
                        die "device '$d->{dev}' is not GPT partitioned\n" if !$info->{gpt};
                    } elsif ($usage ne 'LVM') {
                        die "device '$d->{dev}' is already in use and has no LVM on it\n";
                    }
                }
            }
        };

        # test disk requirements early
        my $devlist = [ map { $_->{name} } values %$devs ];
        my $disklist = PVE::Diskmanage::get_disks($devlist, 1, 1);
        $test_disk_requirements->($disklist);

        # get necessary ceph infos
        my $rados = PVE::RADOS->new();
        my $monstat = $rados->mon_command({ prefix => 'quorum_status' });

        my $ceph_bootstrap_osd_keyring = PVE::Ceph::Tools::get_config('ceph_bootstrap_osd_keyring');

        if (! -f $ceph_bootstrap_osd_keyring && $ceph_conf->{global}->{auth_client_required} eq 'cephx') {
            my $bindata = $rados->mon_command({
                prefix => 'auth get-or-create',
                entity => 'client.bootstrap-osd',
                caps => [
                    'mon' => 'allow profile bootstrap-osd'
                ],
                format => 'plain',
            });
            file_set_contents($ceph_bootstrap_osd_keyring, $bindata);
        }

        # See FIXME below
        my @udev_trigger_devs = ();

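        # Carve out a block.db/block.wal volume of $size bytes on $dev: on an
        # unused disk, create a fresh PV/VG/LV; on a disk already holding a
        # 'ceph-' VG, allocate a new LV there; on a GPT-partitioned disk,
        # append a new partition instead.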
        my $create_part_or_lv = sub {
            my ($dev, $size, $type) = @_;

            $size =~ m/^(\d+)$/ or die "invalid size '$size'\n";
            $size = $1;

            die "'$dev->{devpath}' is smaller than requested size '$size' bytes\n"
                if $dev->{size} < $size;

            # sgdisk and lvcreate can only handle sizes divisible by 512b,
            # so we round down to the nearest KiB
            $size = PVE::Tools::convert_size($size, 'b' => 'kb', 1);

            if (!$dev->{used}) {
                # create pv,vg,lv

                my $vg = "ceph-" . UUID::uuid();
                my $lv = $type . "-" . UUID::uuid();

                PVE::Storage::LVMPlugin::lvm_create_volume_group($dev->{devpath}, $vg);
                PVE::Storage::LVMPlugin::lvcreate($vg, $lv, "${size}k");

                if (PVE::Diskmanage::is_partition($dev->{devpath})) {
                    eval { PVE::Diskmanage::change_parttype($dev->{devpath}, '8E00'); };
                    warn $@ if $@;
                }

                push @udev_trigger_devs, $dev->{devpath};

                return "$vg/$lv";

            } elsif ($dev->{used} eq 'LVM') {
                # check pv/vg and create lv

                my $vgs = PVE::Storage::LVMPlugin::lvm_vgs(1);
                my $vg;
                for my $vgname ( sort keys %$vgs ) {
                    next if $vgname !~ /^ceph-/;

                    for my $pv ( @{$vgs->{$vgname}->{pvs}} ) {
                        next if $pv->{name} ne $dev->{devpath};
                        $vg = $vgname;
                        last;
                    }
                    last if $vg;
                }

                die "no ceph vg found on '$dev->{devpath}'\n" if !$vg;
                die "vg '$vg' does not have enough free space\n" if $vgs->{$vg}->{free} < $size;

                my $lv = $type . "-" . UUID::uuid();

                PVE::Storage::LVMPlugin::lvcreate($vg, $lv, "${size}k");

                return "$vg/$lv";

            } elsif ($dev->{used} eq 'partitions' && $dev->{gpt}) {
                # create new partition at the end
                my $parttypes = {
                    'osd-db' => '30CD0809-C2B2-499C-8879-2D6B78529876',
                    'osd-wal' => '5CE17FCE-4087-4169-B7FF-056CC58473F9',
                };

                my $part = PVE::Diskmanage::append_partition($dev->{devpath}, $size * 1024);

                if (my $parttype = $parttypes->{$type}) {
                    eval { PVE::Diskmanage::change_parttype($part, $parttype); };
                    warn $@ if $@;
                }

                push @udev_trigger_devs, $part;
                return $part;
            }

            die "cannot use '$dev->{devpath}' for '$type'\n";
        };

        my $worker = sub {
            my $upid = shift;

            PVE::Diskmanage::locked_disk_action(sub {
                # update disklist and re-test requirements
                $disklist = PVE::Diskmanage::get_disks($devlist, 1, 1);
                $test_disk_requirements->($disklist);

                my $dev_class = $param->{'crush-device-class'};
                # create allows for detailed configuration of DB and WAL devices
                # batch for easy creation of multiple OSDs (per device)
                my $create_mode = $param->{'osds-per-device'} ? 'batch' : 'create';
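                # The resulting calls look roughly like this (illustrative):
                #   ceph-volume lvm create --data /dev/sdX [--block.db VG/LV] [--block.wal VG/LV]
                #   ceph-volume lvm batch --osds-per-device N --yes --no-auto -- /dev/sdX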
                my $cmd = ['ceph-volume', 'lvm', $create_mode];
                push @$cmd, '--crush-device-class', $dev_class if $dev_class;

                my $devname = $devs->{dev}->{name};
                my $devpath = $disklist->{$devname}->{devpath};
                print "create OSD on $devpath (bluestore)\n";

                push @udev_trigger_devs, $devpath;

                my $osd_size = $disklist->{$devname}->{size};
                my $size_map = {
                    db => int($osd_size / 10), # 10% of OSD
                    wal => int($osd_size / 100), # 1% of OSD
                };

                my $sizes;
                foreach my $type ( qw(db wal) ) {
                    my $fallback_size = $size_map->{$type};
                    my $d = $devs->{"${type}_dev"};
                    next if !$d;

                    # size was not set via api, getting from config/fallback
                    if (!defined($d->{size})) {
                        $sizes = PVE::Ceph::Tools::get_db_wal_sizes() if !$sizes;
                        $d->{size} = $sizes->{$type} // $fallback_size;
                    }
                    print "creating block.$type on '$d->{dev}'\n";
                    my $name = $d->{name};
                    my $part_or_lv = $create_part_or_lv->($disklist->{$name}, $d->{size}, "osd-$type");

                    print "using '$part_or_lv' for block.$type\n";
                    push @$cmd, "--block.$type", $part_or_lv;
                }

                push @$cmd, '--data', $devpath if $create_mode eq 'create';
                push @$cmd, '--dmcrypt' if $param->{encrypted};

                if ($create_mode eq 'batch') {
                    push @$cmd,
                        '--osds-per-device', $param->{'osds-per-device'},
                        '--yes',
                        '--no-auto',
                        '--',
                        $devpath;
                }
                PVE::Diskmanage::wipe_blockdev($devpath);

                if (PVE::Diskmanage::is_partition($devpath)) {
                    eval { PVE::Diskmanage::change_parttype($devpath, '8E00'); };
                    warn $@ if $@;
                }

                run_command($cmd);

                # FIXME: Remove once we depend on systemd >= v249.
                # Work around udev bug https://github.com/systemd/systemd/issues/18525 to ensure the
                # udev database is updated.
                eval { run_command(['udevadm', 'trigger', @udev_trigger_devs]); };
                warn $@ if $@;
            });
        };

        return $rpcenv->fork_worker('cephcreateosd', $devs->{dev}->{name}, $authuser, $worker);
    }});

my $OSD_DEV_RETURN_PROPS = {
    device => {
        type => 'string',
        enum => ['block', 'db', 'wal'],
        description => 'Kind of OSD device',
    },
    dev_node => {
        type => 'string',
        description => 'Device node',
    },
    devices => {
        type => 'string',
        description => 'Physical disks used',
    },
    size => {
        type => 'integer',
        description => 'Size in bytes',
    },
    support_discard => {
        type => 'boolean',
        description => 'Discard support of the physical device',
    },
    type => {
        type => 'string',
        description => 'Type of device. For example, hdd or ssd',
    },
};

__PACKAGE__->register_method ({
    name => 'osdindex',
    path => '{osdid}',
    method => 'GET',
    permissions => { user => 'all' },
    description => "OSD index.",
    parameters => {
        additionalProperties => 0,
        properties => {
            node => get_standard_option('pve-node'),
            osdid => {
                description => 'OSD ID',
                type => 'integer',
            },
        },
    },
    returns => {
        type => 'array',
        items => {
            type => "object",
            properties => {},
        },
        links => [ { rel => 'child', href => "{name}" } ],
    },
    code => sub {
        my ($param) = @_;

        my $result = [
            { name => 'metadata' },
            { name => 'lv-info' },
        ];

        return $result;
    }});

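# Example, with <node> and <osdid> as placeholders (assuming the usual
# /nodes/{node}/ceph/osd API mount point):
#   pvesh get /nodes/<node>/ceph/osd/<osdid>/metadata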
__PACKAGE__->register_method ({
    name => 'osddetails',
    path => '{osdid}/metadata',
    method => 'GET',
    description => "Get OSD details",
    proxyto => 'node',
    protected => 1,
    permissions => {
        check => ['perm', '/', [ 'Sys.Audit' ], any => 1],
    },
    parameters => {
        additionalProperties => 0,
        properties => {
            node => get_standard_option('pve-node'),
            osdid => {
                description => 'OSD ID',
                type => 'integer',
            },
        },
    },
    returns => {
        type => 'object',
        properties => {
            osd => {
                type => 'object',
                description => 'General information about the OSD',
                properties => {
                    hostname => {
                        type => 'string',
                        description => 'Name of the host containing the OSD.',
                    },
                    id => {
                        type => 'integer',
                        description => 'ID of the OSD.',
                    },
                    mem_usage => {
                        type => 'integer',
                        description => 'Memory usage of the OSD service.',
                    },
                    osd_data => {
                        type => 'string',
                        description => "Path to the OSD's data directory.",
                    },
                    osd_objectstore => {
                        type => 'string',
                        description => 'The type of object store used.',
                    },
                    pid => {
                        type => 'integer',
                        description => 'OSD process ID.',
                    },
                    version => {
                        type => 'string',
                        description => 'Ceph version of the OSD service.',
                    },
                    front_addr => {
                        type => 'string',
                        description => 'Address and port used to talk to clients and monitors.',
                    },
                    back_addr => {
                        type => 'string',
                        description => 'Address and port used to talk to other OSDs.',
                    },
                    hb_front_addr => {
                        type => 'string',
                        description => 'Heartbeat address and port for clients and monitors.',
                    },
                    hb_back_addr => {
                        type => 'string',
                        description => 'Heartbeat address and port for other OSDs.',
                    },
                },
            },
            devices => {
                type => 'array',
                description => 'Array containing data about devices',
                items => {
                    type => "object",
                    properties => $OSD_DEV_RETURN_PROPS,
                },
            }
        }
    },
    code => sub {
        my ($param) = @_;

        PVE::Ceph::Tools::check_ceph_inited();

        my $osdid = $param->{osdid};
        my $rados = PVE::RADOS->new();
        my $metadata = $rados->mon_command({ prefix => 'osd metadata', id => int($osdid) });

726 die "OSD '${osdid}' does not exists on host '${nodename}'\n"
727 if $nodename ne $metadata->{hostname};

        my $pid;
        my $parser = sub {
            my $line = shift;
            if ($line =~ m/^MainPID=([0-9]*)$/) {
                $pid = int($1);
            }
        };

        my $cmd = [
            '/bin/systemctl',
            'show',
            "ceph-osd\@${osdid}.service",
            '--property',
            'MainPID',
        ];
        run_command($cmd, errmsg => 'fetching OSD PID and memory usage failed', outfunc => $parser);

        my $osd_pss_memory = eval { get_proc_pss_from_pid($pid) } // 0;
        warn $@ if $@;

        my $data = {
            osd => {
                hostname => $metadata->{hostname},
                id => $metadata->{id},
                mem_usage => $osd_pss_memory,
                osd_data => $metadata->{osd_data},
                osd_objectstore => $metadata->{osd_objectstore},
                pid => $pid,
                version => "$metadata->{ceph_version_short} ($metadata->{ceph_release})",
                front_addr => $metadata->{front_addr},
                back_addr => $metadata->{back_addr},
                hb_front_addr => $metadata->{hb_front_addr},
                hb_back_addr => $metadata->{hb_back_addr},
            },
        };

        $data->{devices} = [];

        my $get_data = sub {
            my ($dev, $prefix, $device) = @_;
            push (
                @{$data->{devices}},
                {
                    dev_node => $metadata->{"${prefix}_${dev}_dev_node"},
                    physical_device => $metadata->{"${prefix}_${dev}_devices"},
                    size => int($metadata->{"${prefix}_${dev}_size"}),
                    support_discard => int($metadata->{"${prefix}_${dev}_support_discard"}),
                    type => $metadata->{"${prefix}_${dev}_type"},
                    device => $device,
                }
            );
        };

        $get_data->("bdev", "bluestore", "block");
        $get_data->("db", "bluefs", "db") if $metadata->{bluefs_dedicated_db};
        $get_data->("wal", "bluefs", "wal") if $metadata->{bluefs_dedicated_wal};

        return $data;
    }});

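# Example, analogous to the metadata endpoint above:
#   pvesh get /nodes/<node>/ceph/osd/<osdid>/lv-info --type db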
__PACKAGE__->register_method ({
    name => 'osdvolume',
    path => '{osdid}/lv-info',
    method => 'GET',
    description => "Get OSD volume details",
    proxyto => 'node',
    protected => 1,
    permissions => {
        check => ['perm', '/', [ 'Sys.Audit' ], any => 1],
    },
    parameters => {
        additionalProperties => 0,
        properties => {
            node => get_standard_option('pve-node'),
            osdid => {
                description => 'OSD ID',
                type => 'integer',
            },
            type => {
                description => 'OSD device type',
                type => 'string',
                enum => ['block', 'db', 'wal'],
                default => 'block',
                optional => 1,
            },
        },
    },
    returns => {
        type => 'object',
        properties => {
            creation_time => {
                type => 'string',
                description => "Creation time as reported by `lvs`.",
            },
            lv_name => {
                type => 'string',
                description => 'Name of the logical volume (LV).',
            },
            lv_path => {
                type => 'string',
                description => 'Path to the logical volume (LV).',
            },
            lv_size => {
                type => 'integer',
                description => 'Size of the logical volume (LV).',
            },
            lv_uuid => {
                type => 'string',
                description => 'UUID of the logical volume (LV).',
            },
            vg_name => {
                type => 'string',
                description => 'Name of the volume group (VG).',
            },
        },
    },
    code => sub {
        my ($param) = @_;

        PVE::Ceph::Tools::check_ceph_inited();

        my $osdid = $param->{osdid};
        my $type = $param->{type} // 'block';

        my $raw = '';
        my $parser = sub { $raw .= shift };
        my $cmd = ['/usr/sbin/ceph-volume', 'lvm', 'list', $osdid, '--format', 'json'];
        run_command($cmd, errmsg => 'listing Ceph LVM volumes failed', outfunc => $parser);

        my $result;
        if ($raw =~ m/^(\{.*\})$/s) { # untaint
            $result = JSON::decode_json($1);
        } else {
            die "got unexpected data from ceph-volume: '${raw}'\n";
        }
        if (!$result->{$osdid}) {
            die "OSD '${osdid}' not found in 'ceph-volume lvm list' on node '${nodename}'.\n"
                ."Maybe it was created before LVM became the default?\n";
        }

        my $lv_data = { map { $_->{type} => $_ } @{$result->{$osdid}} };
        my $volume = $lv_data->{$type} || die "volume type '${type}' not found for OSD ${osdid}\n";

        $raw = '';
        $cmd = ['/sbin/lvs', $volume->{lv_path}, '--reportformat', 'json', '-o', 'lv_time'];
        run_command($cmd, errmsg => 'listing logical volumes failed', outfunc => $parser);

        if ($raw =~ m/(\{.*\})$/s) { # untaint; lvs output has leading whitespace
            $result = JSON::decode_json($1);
        } else {
            die "got unexpected data from lvs: '${raw}'\n";
        }

        my $data = { map { $_ => $volume->{$_} } qw(lv_name lv_path lv_uuid vg_name) };
        $data->{lv_size} = int($volume->{lv_size});

        $data->{creation_time} = $result->{report}->[0]->{lv}->[0]->{lv_time};

        return $data;
    }});

# Check if $osdid belongs to $nodename
# $tree ... rados osd tree (passing the tree makes it easy to test)
sub osd_belongs_to_node {
    my ($tree, $nodename, $osdid) = @_;
    return 0 if !($tree && $tree->{nodes});

    my $node_map = {};
    for my $el (grep { defined($_->{type}) && $_->{type} eq 'host' } @{$tree->{nodes}}) {
        my $name = $el->{name};
        die "internal error: duplicate host name found '$name'\n" if $node_map->{$name};
        $node_map->{$name} = $el;
    }

    my $osds = $node_map->{$nodename}->{children};
    return 0 if !$osds;

    return grep($_ == $osdid, @$osds);
}

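# Example via the 'pveceph' CLI wrapper (OSD id is illustrative):
#   pveceph osd destroy 7 --cleanup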
__PACKAGE__->register_method ({
    name => 'destroyosd',
    path => '{osdid}',
    method => 'DELETE',
    description => "Destroy OSD",
    proxyto => 'node',
    protected => 1,
    parameters => {
        additionalProperties => 0,
        properties => {
            node => get_standard_option('pve-node'),
            osdid => {
                description => 'OSD ID',
                type => 'integer',
            },
            cleanup => {
                description => "If set, we remove partition table entries.",
                type => 'boolean',
                optional => 1,
                default => 0,
            },
        },
    },
    returns => { type => 'string' },
    code => sub {
        my ($param) = @_;

        my $rpcenv = PVE::RPCEnvironment::get();

        my $authuser = $rpcenv->get_user();

        PVE::Ceph::Tools::check_ceph_inited();

        my $osdid = $param->{osdid};
        my $cleanup = $param->{cleanup};

        my $rados = PVE::RADOS->new();

        my $osd_belongs_to_node = osd_belongs_to_node(
            $rados->mon_command({ prefix => 'osd tree' }),
            $param->{node},
            $osdid,
        );
952 die "OSD osd.$osdid does not belong to node $param->{node}!"
953 if !$osd_belongs_to_node;
954
955 # dies if osdid is unknown
956 my $osdstat = $get_osd_status->($rados, $osdid);
957
958 die "osd is in use (in == 1)\n" if $osdstat->{in};
959 #&$run_ceph_cmd(['osd', 'out', $osdid]);
960
961 die "osd is still running (up == 1)\n" if $osdstat->{up};

        my $osdsection = "osd.$osdid";

        my $worker = sub {
            my $upid = shift;

            # reopen with longer timeout
            $rados = PVE::RADOS->new(timeout => PVE::Ceph::Tools::get_config('long_rados_timeout'));

            print "destroy OSD $osdsection\n";

            eval {
                PVE::Ceph::Services::ceph_service_cmd('stop', $osdsection);
                PVE::Ceph::Services::ceph_service_cmd('disable', $osdsection);
            };
            warn $@ if $@;

            print "Remove $osdsection from the CRUSH map\n";
            $rados->mon_command({ prefix => "osd crush remove", name => $osdsection, format => 'plain' });

            print "Remove the $osdsection authentication key.\n";
            $rados->mon_command({ prefix => "auth del", entity => $osdsection, format => 'plain' });

            print "Remove OSD $osdsection\n";
            $rados->mon_command({ prefix => "osd rm", ids => [ $osdsection ], format => 'plain' });

            # try to unmount from standard mount point
            my $mountpoint = "/var/lib/ceph/osd/ceph-$osdid";

            # See FIXME below
            my $udev_trigger_devs = {};

            my $remove_partition = sub {
                my ($part) = @_;

                return if !$part || (! -b $part );
                my $partnum = PVE::Diskmanage::get_partnum($part);
                my $devpath = PVE::Diskmanage::get_blockdev($part);

                $udev_trigger_devs->{$devpath} = 1;

                PVE::Diskmanage::wipe_blockdev($part);
                print "remove partition $part (disk '${devpath}', partnum $partnum)\n";
                eval { run_command(['/sbin/sgdisk', '-d', $partnum, "${devpath}"]); };
                warn $@ if $@;
            };

            my $osd_list = PVE::Ceph::Tools::ceph_volume_list();

            if ($osd_list->{$osdid}) { # ceph-volume managed

                eval { PVE::Ceph::Tools::ceph_volume_zap($osdid, $cleanup) };
                warn $@ if $@;

                if ($cleanup) {
                    # try to remove pvs, but do not fail if it does not work
                    for my $osd_part (@{$osd_list->{$osdid}}) {
                        for my $dev (@{$osd_part->{devices}}) {
                            ($dev) = ($dev =~ m|^(/dev/[-_.a-zA-Z0-9\/]+)$|); # untaint

                            eval { run_command(['/sbin/pvremove', $dev], errfunc => sub {}) };
                            warn $@ if $@;

                            $udev_trigger_devs->{$dev} = 1;
                        }
                    }
                }
            } else {
                my $partitions_to_remove = [];
                if ($cleanup) {
                    if (my $mp = PVE::ProcFSTools::parse_proc_mounts()) {
                        foreach my $line (@$mp) {
                            my ($dev, $path, $fstype) = @$line;
                            next if !($dev && $path && $fstype);
                            next if $dev !~ m|^/dev/|;

                            if ($path eq $mountpoint) {
                                abs_path($dev) =~ m|^(/.+)| or die "invalid dev: $dev\n";
                                push @$partitions_to_remove, $1;
                                last;
                            }
                        }
                    }

                    foreach my $path (qw(journal block block.db block.wal)) {
                        abs_path("$mountpoint/$path") =~ m|^(/.+)| or die "invalid path: $path\n";
                        push @$partitions_to_remove, $1;
                    }
                }

                print "Unmount OSD $osdsection from $mountpoint\n";
                eval { run_command(['/bin/umount', $mountpoint]); };
                if (my $err = $@) {
                    warn $err;
                } elsif ($cleanup) {
                    # be aware of the ceph udev rules which can remount
                    foreach my $part (@$partitions_to_remove) {
                        $remove_partition->($part);
                    }
                }
            }

            # FIXME: Remove once we depend on systemd >= v249.
            # Work around udev bug https://github.com/systemd/systemd/issues/18525 to ensure the
            # udev database is updated.
            if ($cleanup) {
                eval { run_command(['udevadm', 'trigger', keys $udev_trigger_devs->%*]); };
                warn $@ if $@;
            }
        };

        return $rpcenv->fork_worker('cephdestroyosd', $osdsection, $authuser, $worker);
    }});

__PACKAGE__->register_method ({
    name => 'in',
    path => '{osdid}/in',
    method => 'POST',
    description => "ceph osd in",
    proxyto => 'node',
    protected => 1,
    permissions => {
        check => ['perm', '/', [ 'Sys.Modify' ]],
    },
    parameters => {
        additionalProperties => 0,
        properties => {
            node => get_standard_option('pve-node'),
            osdid => {
                description => 'OSD ID',
                type => 'integer',
            },
        },
    },
    returns => { type => "null" },
    code => sub {
        my ($param) = @_;

        PVE::Ceph::Tools::check_ceph_inited();

        my $osdid = $param->{osdid};

        my $rados = PVE::RADOS->new();

        $get_osd_status->($rados, $osdid); # osd exists?

        my $osdsection = "osd.$osdid";

        $rados->mon_command({ prefix => "osd in", ids => [ $osdsection ], format => 'plain' });

        return undef;
    }});

__PACKAGE__->register_method ({
    name => 'out',
    path => '{osdid}/out',
    method => 'POST',
    description => "ceph osd out",
    proxyto => 'node',
    protected => 1,
    permissions => {
        check => ['perm', '/', [ 'Sys.Modify' ]],
    },
    parameters => {
        additionalProperties => 0,
        properties => {
            node => get_standard_option('pve-node'),
            osdid => {
                description => 'OSD ID',
                type => 'integer',
            },
        },
    },
    returns => { type => "null" },
    code => sub {
        my ($param) = @_;

        PVE::Ceph::Tools::check_ceph_inited();

        my $osdid = $param->{osdid};

        my $rados = PVE::RADOS->new();

        $get_osd_status->($rados, $osdid); # osd exists?

        my $osdsection = "osd.$osdid";

        $rados->mon_command({ prefix => "osd out", ids => [ $osdsection ], format => 'plain' });

        return undef;
    }});

__PACKAGE__->register_method ({
    name => 'scrub',
    path => '{osdid}/scrub',
    method => 'POST',
    description => "Instruct the OSD to scrub.",
    proxyto => 'node',
    protected => 1,
    permissions => {
        check => ['perm', '/', [ 'Sys.Modify' ]],
    },
    parameters => {
        additionalProperties => 0,
        properties => {
            node => get_standard_option('pve-node'),
            osdid => {
                description => 'OSD ID',
                type => 'integer',
            },
            deep => {
                description => 'If set, instructs a deep scrub instead of a normal one.',
                type => 'boolean',
                optional => 1,
                default => 0,
            },
        },
    },
    returns => { type => "null" },
    code => sub {
        my ($param) = @_;

        PVE::Ceph::Tools::check_ceph_inited();

        my $osdid = $param->{osdid};
        my $deep = $param->{deep} // 0;

        my $rados = PVE::RADOS->new();

        $get_osd_status->($rados, $osdid); # osd exists?

        my $prefix = $deep ? 'osd deep-scrub' : 'osd scrub';
        $rados->mon_command({ prefix => $prefix, who => $osdid });

        return undef;
    }});

1;