]>
git.proxmox.com Git - pve-manager.git/blob - PVE/API2/Ceph/OSD.pm
93433b3a2bc06a87d98f986dfa1ed751432eaadb
1 package PVE
::API2
::Ceph
::OSD
;
11 use PVE
::Ceph
::Services
;
13 use PVE
::Cluster
qw(cfs_read_file cfs_write_file);
15 use PVE
::Storage
::LVMPlugin
;
16 use PVE
::Exception
qw(raise_param_exc);
17 use PVE
::JSONSchema
qw(get_standard_option);
21 use PVE
::RPCEnvironment
;
22 use PVE
::Tools
qw(run_command file_set_contents);
26 use base
qw(PVE::RESTHandler);
28 my $nodename = PVE
::INotify
::nodename
();
30 my $get_osd_status = sub {
31 my ($rados, $osdid) = @_;
33 my $stat = $rados->mon_command({ prefix
=> 'osd dump' });
35 my $osdlist = $stat->{osds
} || [];
37 my $flags = $stat->{flags
} || undef;
40 foreach my $d (@$osdlist) {
41 $osdstat->{$d->{osd
}} = $d if defined($d->{osd
});
43 if (defined($osdid)) {
44 die "no such OSD '$osdid'\n" if !$osdstat->{$osdid};
45 return $osdstat->{$osdid};
48 return wantarray ?
($osdstat, $flags) : $osdstat;
51 my $get_osd_usage = sub {
54 my $osdlist = $rados->mon_command({ prefix
=> 'pg dump', dumpcontents
=> [ 'osds' ]});
55 if (!($osdlist && ref($osdlist))) {
56 warn "got unknown result format for 'pg dump osds' command\n";
60 if (ref($osdlist) eq "HASH") { # since nautilus
61 $osdlist = $osdlist->{osd_stats
};
65 for my $d (@$osdlist) {
66 $osdstat->{$d->{osd
}} = $d if defined($d->{osd
});
72 __PACKAGE__-
>register_method ({
76 description
=> "Get Ceph osd list/tree.",
80 check
=> ['perm', '/', [ 'Sys.Audit', 'Datastore.Audit' ], any
=> 1],
83 additionalProperties
=> 0,
85 node
=> get_standard_option
('pve-node'),
88 # fixme: return a list instead of extjs tree format ?
95 PVE
::Ceph
::Tools
::check_ceph_inited
();
97 my $rados = PVE
::RADOS-
>new();
98 my $res = $rados->mon_command({ prefix
=> 'osd tree' });
100 die "no tree nodes found\n" if !($res && $res->{nodes
});
102 my ($osdhash, $flags) = $get_osd_status->($rados);
104 my $osd_usage = $get_osd_usage->($rados);
106 my $osdmetadata_res = $rados->mon_command({ prefix
=> 'osd metadata' });
107 my $osdmetadata = { map { $_->{id
} => $_ } @$osdmetadata_res };
109 my $hostversions = PVE
::Ceph
::Services
::get_ceph_versions
();
113 foreach my $e (@{$res->{nodes
}}) {
114 my ($id, $name) = $e->@{qw(id name)};
124 foreach my $opt (qw(status crush_weight reweight device_class)) {
125 $new->{$opt} = $e->{$opt} if defined($e->{$opt});
128 if (my $stat = $osdhash->{$id}) {
129 $new->{in} = $stat->{in} if defined($stat->{in});
132 if (my $stat = $osd_usage->{$id}) {
133 $new->{total_space
} = ($stat->{kb
} || 1) * 1024;
134 $new->{bytes_used
} = ($stat->{kb_used
} || 0) * 1024;
135 $new->{percent_used
} = ($new->{bytes_used
}*100)/$new->{total_space
};
136 if (my $d = $stat->{perf_stat
}) {
137 $new->{commit_latency_ms
} = $d->{commit_latency_ms
};
138 $new->{apply_latency_ms
} = $d->{apply_latency_ms
};
142 my $osdmd = $osdmetadata->{$id};
143 if ($e->{type
} eq 'osd' && $osdmd) {
144 if ($osdmd->{bluefs
}) {
145 $new->{osdtype
} = 'bluestore';
146 $new->{blfsdev
} = $osdmd->{bluestore_bdev_dev_node
};
147 $new->{dbdev
} = $osdmd->{bluefs_db_dev_node
};
148 $new->{waldev
} = $osdmd->{bluefs_wal_dev_node
};
150 $new->{osdtype
} = 'filestore';
152 for my $field (qw(ceph_version ceph_version_short)) {
153 $new->{$field} = $osdmd->{$field} if $osdmd->{$field};
157 $newnodes->{$id} = $new;
160 foreach my $e (@{$res->{nodes
}}) {
161 my ($id, $name) = $e->@{qw(id name)};
162 my $new = $newnodes->{$id};
164 if ($e->{children
} && scalar(@{$e->{children
}})) {
165 $new->{children
} = [];
167 foreach my $cid (@{$e->{children
}}) {
168 $nodes->{$cid}->{parent
} = $id;
169 if ($nodes->{$cid}->{type
} eq 'osd' && $e->{type
} eq 'host') {
170 $newnodes->{$cid}->{host
} = $name;
172 push @{$new->{children
}}, $newnodes->{$cid};
175 $new->{leaf
} = ($id >= 0) ?
1 : 0;
178 if ($name && $e->{type
} eq 'host') {
179 $new->{version
} = $hostversions->{$name}->{version
}->{str
};
184 foreach my $e (@{$res->{nodes
}}) {
186 if (!$nodes->{$id}->{parent
}) {
187 push @$realroots, $newnodes->{$id};
191 die "no root node\n" if scalar(@$realroots) < 1;
196 children
=> $realroots
200 $data->{flags
} = $flags if $flags; # we want this for the noout flag
205 __PACKAGE__-
>register_method ({
209 description
=> "Create OSD",
213 additionalProperties
=> 0,
215 node
=> get_standard_option
('pve-node'),
217 description
=> "Block device name.",
221 description
=> "Block device name for block.db.",
226 description
=> "Size in GiB for block.db.",
227 verbose_description
=> "If a block.db is requested but the size is not given, ".
228 "will be automatically selected by: bluestore_block_db_size from the ".
229 "ceph database (osd or global section) or config (osd or global section)".
230 "in that order. If this is not available, it will be sized 10% of the size ".
231 "of the OSD device. Fails if the available size is not enough.",
234 default => 'bluestore_block_db_size or 10% of OSD size',
235 requires
=> 'db_dev',
239 description
=> "Block device name for block.wal.",
244 description
=> "Size in GiB for block.wal.",
245 verbose_description
=> "If a block.wal is requested but the size is not given, ".
246 "will be automatically selected by: bluestore_block_wal_size from the ".
247 "ceph database (osd or global section) or config (osd or global section)".
248 "in that order. If this is not available, it will be sized 1% of the size ".
249 "of the OSD device. Fails if the available size is not enough.",
252 default => 'bluestore_block_wal_size or 1% of OSD size',
253 requires
=> 'wal_dev',
260 description
=> "Enables encryption of the OSD."
262 'crush-device-class' => {
265 description
=> "Set the device class of the OSD in crush."
269 returns
=> { type
=> 'string' },
273 my $rpcenv = PVE
::RPCEnvironment
::get
();
275 my $authuser = $rpcenv->get_user();
277 # test basic requirements
278 PVE
::Ceph
::Tools
::check_ceph_inited
();
279 PVE
::Ceph
::Tools
::setup_pve_symlinks
();
280 PVE
::Ceph
::Tools
::check_ceph_installed
('ceph_osd');
281 PVE
::Ceph
::Tools
::check_ceph_installed
('ceph_volume');
283 # extract parameter info and fail if a device is set more than once
286 my $ceph_conf = cfs_read_file
('ceph.conf');
288 my $osd_network = $ceph_conf->{global
}->{cluster_network
};
289 $osd_network //= $ceph_conf->{global
}->{public_network
}; # fallback
291 if ($osd_network) { # check only if something is configured
292 my $cluster_net_ips = PVE
::Network
::get_local_ip_from_cidr
($osd_network);
293 if (scalar(@$cluster_net_ips) < 1) {
294 my $osd_net_obj = PVE
::Network
::IP_from_cidr
($osd_network);
295 my $osd_base_cidr = $osd_net_obj->{ip
} . "/" . $osd_net_obj->{prefixlen
};
297 die "No address from ceph cluster network (${osd_base_cidr}) found on node '$nodename'. ".
298 "Check your network config.\n";
302 for my $type ( qw(dev db_dev wal_dev) ) {
303 next if !$param->{$type};
305 my $type_dev = PVE
::Diskmanage
::verify_blockdev_path
($param->{$type});
306 (my $type_devname = $type_dev) =~ s
|/dev/||;
308 raise_param_exc
({ $type => "cannot chose '$type_dev' for more than one type." })
309 if grep { $_->{name
} eq $type_devname } values %$devs;
313 name
=> $type_devname,
316 if (my $size = $param->{"${type}_size"}) {
317 $devs->{$type}->{size
} = PVE
::Tools
::convert_size
($size, 'gb' => 'b') ;
321 my $test_disk_requirements = sub {
324 my $dev = $devs->{dev
}->{dev
};
325 my $devname = $devs->{dev
}->{name
};
326 die "unable to get device info for '$dev'\n" if !$disklist->{$devname};
327 die "device '$dev' is already in use\n" if $disklist->{$devname}->{used
};
329 for my $type ( qw(db_dev wal_dev) ) {
330 my $d = $devs->{$type};
332 my $name = $d->{name
};
333 my $info = $disklist->{$name};
334 die "unable to get device info for '$d->{dev}' for type $type\n" if !$disklist->{$name};
335 if (my $usage = $info->{used
}) {
336 if ($usage eq 'partitions') {
337 die "device '$d->{dev}' is not GPT partitioned\n" if !$info->{gpt
};
338 } elsif ($usage ne 'LVM') {
339 die "device '$d->{dev}' is already in use and has no LVM on it\n";
346 # test disk requirements early
347 my $devlist = [ map { $_->{name
} } values %$devs ];
348 my $disklist = PVE
::Diskmanage
::get_disks
($devlist, 1, 1);
349 $test_disk_requirements->($disklist);
351 # get necessary ceph infos
352 my $rados = PVE
::RADOS-
>new();
353 my $monstat = $rados->mon_command({ prefix
=> 'quorum_status' });
355 die "unable to get fsid\n" if !$monstat->{monmap
} || !$monstat->{monmap
}->{fsid
};
356 my $fsid = $monstat->{monmap
}->{fsid
};
357 $fsid = $1 if $fsid =~ m/^([0-9a-f\-]+)$/;
359 my $ceph_bootstrap_osd_keyring = PVE
::Ceph
::Tools
::get_config
('ceph_bootstrap_osd_keyring');
361 if (! -f
$ceph_bootstrap_osd_keyring && $ceph_conf->{global
}->{auth_client_required
} eq 'cephx') {
362 my $bindata = $rados->mon_command({
363 prefix
=> 'auth get-or-create',
364 entity
=> 'client.bootstrap-osd',
366 'mon' => 'allow profile bootstrap-osd'
370 file_set_contents
($ceph_bootstrap_osd_keyring, $bindata);
374 my @udev_trigger_devs = ();
376 my $create_part_or_lv = sub {
377 my ($dev, $size, $type) = @_;
379 $size =~ m/^(\d+)$/ or die "invalid size '$size'\n";
382 die "'$dev->{devpath}' is smaller than requested size '$size' bytes\n"
383 if $dev->{size
} < $size;
385 # sgdisk and lvcreate can only sizes divisible by 512b
386 # so we round down to the nearest kb
387 $size = PVE
::Tools
::convert_size
($size, 'b' => 'kb', 1);
392 my $vg = "ceph-" . UUID
::uuid
();
393 my $lv = $type . "-" . UUID
::uuid
();
395 PVE
::Storage
::LVMPlugin
::lvm_create_volume_group
($dev->{devpath
}, $vg);
396 PVE
::Storage
::LVMPlugin
::lvcreate
($vg, $lv, "${size}k");
398 if (PVE
::Diskmanage
::is_partition
($dev->{devpath
})) {
399 eval { PVE
::Diskmanage
::change_parttype
($dev->{devpath
}, '8E00'); };
403 push @udev_trigger_devs, $dev->{devpath
};
407 } elsif ($dev->{used
} eq 'LVM') {
408 # check pv/vg and create lv
410 my $vgs = PVE
::Storage
::LVMPlugin
::lvm_vgs
(1);
412 for my $vgname ( sort keys %$vgs ) {
413 next if $vgname !~ /^ceph-/;
415 for my $pv ( @{$vgs->{$vgname}->{pvs
}} ) {
416 next if $pv->{name
} ne $dev->{devpath
};
423 die "no ceph vg found on '$dev->{devpath}'\n" if !$vg;
424 die "vg '$vg' has not enough free space\n" if $vgs->{$vg}->{free
} < $size;
426 my $lv = $type . "-" . UUID
::uuid
();
428 PVE
::Storage
::LVMPlugin
::lvcreate
($vg, $lv, "${size}k");
432 } elsif ($dev->{used
} eq 'partitions' && $dev->{gpt
}) {
433 # create new partition at the end
435 'osd-db' => '30CD0809-C2B2-499C-8879-2D6B78529876',
436 'osd-wal' => '5CE17FCE-4087-4169-B7FF-056CC58473F9',
439 my $part = PVE
::Diskmanage
::append_partition
($dev->{devpath
}, $size * 1024);
441 if (my $parttype = $parttypes->{$type}) {
442 eval { PVE
::Diskmanage
::change_parttype
($part, $parttype); };
446 push @udev_trigger_devs, $part;
450 die "cannot use '$dev->{devpath}' for '$type'\n";
456 PVE
::Diskmanage
::locked_disk_action
(sub {
457 # update disklist and re-test requirements
458 $disklist = PVE
::Diskmanage
::get_disks
($devlist, 1, 1);
459 $test_disk_requirements->($disklist);
461 my $dev_class = $param->{'crush-device-class'};
462 my $cmd = ['ceph-volume', 'lvm', 'create', '--cluster-fsid', $fsid ];
463 push @$cmd, '--crush-device-class', $dev_class if $dev_class;
465 my $devname = $devs->{dev
}->{name
};
466 my $devpath = $disklist->{$devname}->{devpath
};
467 print "create OSD on $devpath (bluestore)\n";
469 push @udev_trigger_devs, $devpath;
471 my $osd_size = $disklist->{$devname}->{size
};
473 db
=> int($osd_size / 10), # 10% of OSD
474 wal
=> int($osd_size / 100), # 1% of OSD
478 foreach my $type ( qw(db wal) ) {
479 my $fallback_size = $size_map->{$type};
480 my $d = $devs->{"${type}_dev"};
483 # size was not set via api, getting from config/fallback
484 if (!defined($d->{size
})) {
485 $sizes = PVE
::Ceph
::Tools
::get_db_wal_sizes
() if !$sizes;
486 $d->{size
} = $sizes->{$type} // $fallback_size;
488 print "creating block.$type on '$d->{dev}'\n";
489 my $name = $d->{name
};
490 my $part_or_lv = $create_part_or_lv->($disklist->{$name}, $d->{size
}, "osd-$type");
492 print "using '$part_or_lv' for block.$type\n";
493 push @$cmd, "--block.$type", $part_or_lv;
496 push @$cmd, '--data', $devpath;
497 push @$cmd, '--dmcrypt' if $param->{encrypted
};
499 PVE
::Diskmanage
::wipe_blockdev
($devpath);
501 if (PVE
::Diskmanage
::is_partition
($devpath)) {
502 eval { PVE
::Diskmanage
::change_parttype
($devpath, '8E00'); };
508 # FIXME: Remove once we depend on systemd >= v249.
509 # Work around udev bug https://github.com/systemd/systemd/issues/18525 to ensure the
510 # udev database is updated.
511 eval { run_command
(['udevadm', 'trigger', @udev_trigger_devs]); };
516 return $rpcenv->fork_worker('cephcreateosd', $devs->{dev
}->{name
}, $authuser, $worker);
519 # Check if $osdid belongs to $nodename
520 # $tree ... rados osd tree (passing the tree makes it easy to test)
521 sub osd_belongs_to_node
{
522 my ($tree, $nodename, $osdid) = @_;
523 return 0 if !($tree && $tree->{nodes
});
526 for my $el (grep { defined($_->{type
}) && $_->{type
} eq 'host' } @{$tree->{nodes
}}) {
527 my $name = $el->{name
};
528 die "internal error: duplicate host name found '$name'\n" if $node_map->{$name};
529 $node_map->{$name} = $el;
532 my $osds = $node_map->{$nodename}->{children
};
535 return grep($_ == $osdid, @$osds);
538 __PACKAGE__-
>register_method ({
539 name
=> 'destroyosd',
542 description
=> "Destroy OSD",
546 additionalProperties
=> 0,
548 node
=> get_standard_option
('pve-node'),
550 description
=> 'OSD ID',
554 description
=> "If set, we remove partition table entries.",
561 returns
=> { type
=> 'string' },
565 my $rpcenv = PVE
::RPCEnvironment
::get
();
567 my $authuser = $rpcenv->get_user();
569 PVE
::Ceph
::Tools
::check_ceph_inited
();
571 my $osdid = $param->{osdid
};
572 my $cleanup = $param->{cleanup
};
574 my $rados = PVE
::RADOS-
>new();
576 my $osd_belongs_to_node = osd_belongs_to_node
(
577 $rados->mon_command({ prefix
=> 'osd tree' }),
581 die "OSD osd.$osdid does not belong to node $param->{node}!"
582 if !$osd_belongs_to_node;
584 # dies if osdid is unknown
585 my $osdstat = $get_osd_status->($rados, $osdid);
587 die "osd is in use (in == 1)\n" if $osdstat->{in};
588 #&$run_ceph_cmd(['osd', 'out', $osdid]);
590 die "osd is still running (up == 1)\n" if $osdstat->{up
};
592 my $osdsection = "osd.$osdid";
597 # reopen with longer timeout
598 $rados = PVE
::RADOS-
>new(timeout
=> PVE
::Ceph
::Tools
::get_config
('long_rados_timeout'));
600 print "destroy OSD $osdsection\n";
603 PVE
::Ceph
::Services
::ceph_service_cmd
('stop', $osdsection);
604 PVE
::Ceph
::Services
::ceph_service_cmd
('disable', $osdsection);
608 print "Remove $osdsection from the CRUSH map\n";
609 $rados->mon_command({ prefix
=> "osd crush remove", name
=> $osdsection, format
=> 'plain' });
611 print "Remove the $osdsection authentication key.\n";
612 $rados->mon_command({ prefix
=> "auth del", entity
=> $osdsection, format
=> 'plain' });
614 print "Remove OSD $osdsection\n";
615 $rados->mon_command({ prefix
=> "osd rm", ids
=> [ $osdsection ], format
=> 'plain' });
617 # try to unmount from standard mount point
618 my $mountpoint = "/var/lib/ceph/osd/ceph-$osdid";
621 my $udev_trigger_devs = {};
623 my $remove_partition = sub {
626 return if !$part || (! -b
$part );
627 my $partnum = PVE
::Diskmanage
::get_partnum
($part);
628 my $devpath = PVE
::Diskmanage
::get_blockdev
($part);
630 $udev_trigger_devs->{$devpath} = 1;
632 PVE
::Diskmanage
::wipe_blockdev
($part);
633 print "remove partition $part (disk '${devpath}', partnum $partnum)\n";
634 eval { run_command
(['/sbin/sgdisk', '-d', $partnum, "${devpath}"]); };
638 my $osd_list = PVE
::Ceph
::Tools
::ceph_volume_list
();
640 if ($osd_list->{$osdid}) { # ceph-volume managed
642 eval { PVE
::Ceph
::Tools
::ceph_volume_zap
($osdid, $cleanup) };
646 # try to remove pvs, but do not fail if it does not work
647 for my $osd_part (@{$osd_list->{$osdid}}) {
648 for my $dev (@{$osd_part->{devices
}}) {
649 ($dev) = ($dev =~ m
|^(/dev/[-_
.a-zA-Z0-9\
/]+)$|); #untaint
651 eval { run_command
(['/sbin/pvremove', $dev], errfunc
=> sub {}) };
654 $udev_trigger_devs->{$dev} = 1;
659 my $partitions_to_remove = [];
661 if (my $mp = PVE
::ProcFSTools
::parse_proc_mounts
()) {
662 foreach my $line (@$mp) {
663 my ($dev, $path, $fstype) = @$line;
664 next if !($dev && $path && $fstype);
665 next if $dev !~ m
|^/dev/|;
667 if ($path eq $mountpoint) {
668 abs_path
($dev) =~ m
|^(/.+)| or die "invalid dev: $dev\n";
669 push @$partitions_to_remove, $1;
675 foreach my $path (qw(journal block block.db block.wal)) {
676 abs_path
("$mountpoint/$path") =~ m
|^(/.+)| or die "invalid path: $path\n";
677 push @$partitions_to_remove, $1;
681 print "Unmount OSD $osdsection from $mountpoint\n";
682 eval { run_command
(['/bin/umount', $mountpoint]); };
686 #be aware of the ceph udev rules which can remount.
687 foreach my $part (@$partitions_to_remove) {
688 $remove_partition->($part);
693 # FIXME: Remove once we depend on systemd >= v249.
694 # Work around udev bug https://github.com/systemd/systemd/issues/18525 to ensure the
695 # udev database is updated.
697 eval { run_command
(['udevadm', 'trigger', keys $udev_trigger_devs->%*]); };
702 return $rpcenv->fork_worker('cephdestroyosd', $osdsection, $authuser, $worker);
705 __PACKAGE__-
>register_method ({
707 path
=> '{osdid}/in',
709 description
=> "ceph osd in",
713 check
=> ['perm', '/', [ 'Sys.Modify' ]],
716 additionalProperties
=> 0,
718 node
=> get_standard_option
('pve-node'),
720 description
=> 'OSD ID',
725 returns
=> { type
=> "null" },
729 PVE
::Ceph
::Tools
::check_ceph_inited
();
731 my $osdid = $param->{osdid
};
733 my $rados = PVE
::RADOS-
>new();
735 $get_osd_status->($rados, $osdid); # osd exists?
737 my $osdsection = "osd.$osdid";
739 $rados->mon_command({ prefix
=> "osd in", ids
=> [ $osdsection ], format
=> 'plain' });
744 __PACKAGE__-
>register_method ({
746 path
=> '{osdid}/out',
748 description
=> "ceph osd out",
752 check
=> ['perm', '/', [ 'Sys.Modify' ]],
755 additionalProperties
=> 0,
757 node
=> get_standard_option
('pve-node'),
759 description
=> 'OSD ID',
764 returns
=> { type
=> "null" },
768 PVE
::Ceph
::Tools
::check_ceph_inited
();
770 my $osdid = $param->{osdid
};
772 my $rados = PVE
::RADOS-
>new();
774 $get_osd_status->($rados, $osdid); # osd exists?
776 my $osdsection = "osd.$osdid";
778 $rados->mon_command({ prefix
=> "osd out", ids
=> [ $osdsection ], format
=> 'plain' });
783 __PACKAGE__-
>register_method ({
785 path
=> '{osdid}/scrub',
787 description
=> "Instruct the OSD to scrub.",
791 check
=> ['perm', '/', [ 'Sys.Modify' ]],
794 additionalProperties
=> 0,
796 node
=> get_standard_option
('pve-node'),
798 description
=> 'OSD ID',
802 description
=> 'If set, instructs a deep scrub instead of a normal one.',
809 returns
=> { type
=> "null" },
813 PVE
::Ceph
::Tools
::check_ceph_inited
();
815 my $osdid = $param->{osdid
};
816 my $deep = $param->{deep
} // 0;
818 my $rados = PVE
::RADOS-
>new();
820 $get_osd_status->($rados, $osdid); # osd exists?
822 my $prefix = $deep ?
'osd deep-scrub' : 'osd scrub';
823 $rados->mon_command({ prefix
=> $prefix, who
=> $osdid });