cmake_minimum_required(VERSION 2.8.11)
project(ceph)
-set(VERSION 12.2.1)
+set(VERSION 12.2.2)
if(POLICY CMP0046)
# Tweak policies (this one disables "missing" dependency warning)
Copyright (C) Copyright Howard Hinnant
Copyright (C) Copyright 2010-2011 Vicente J. Botet Escriba
License: Boost Software License, Version 1.0
-
-Files: src/msg/async/AsyncConnection.cc, src/msg/simple/Pipe.cc (sigpipe suppression)
- Copyright (C) 2010 Tomash Brechko. All rights reserved.
- License: GPL3
-
limit (5% by default). Limits by inode count are still supported using
mds_cache_size. Setting mds_cache_size to 0 (the default) disables the
inode limit.
+
+* The maximum number of PGs per OSD before the monitor issues a
+ warning has been reduced from 300 to 200 PGs. 200 is still twice
+ the generally recommended target of 100 PGs per OSD. This limit can
+ be adjusted via the ``mon_max_pg_per_osd`` option on the
+ monitors. The older ``mon_pg_warn_max_per_osd`` option has been removed.
+
+* Creating pools or adjusting pg_num will now fail if the change would
+ make the number of PGs per OSD exceed the configured
+ ``mon_max_pg_per_osd`` limit. The option can be adjusted if it
+ is really necessary to create a pool with more PGs.
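+
+  For example, the limit could be raised in ``ceph.conf`` on the monitors
+  (the value shown is only illustrative)::
+
+    [mon]
+    mon_max_pg_per_osd = 300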
-Sphinx == 1.1.3
--e git+https://github.com/ceph/sphinx-ditaa.git#egg=sphinx-ditaa
+Sphinx == 1.6.3
+-e git+https://github.com/ceph/sphinx-ditaa.git@py3#egg=sphinx-ditaa
-e git+https://github.com/michaeljones/breathe#egg=breathe
# Contributor: John Coyle <dx9err@gmail.com>
# Maintainer: John Coyle <dx9err@gmail.com>
pkgname=ceph
-pkgver=12.2.1
+pkgver=12.2.2
pkgrel=0
pkgdesc="Ceph is a distributed object store and file system"
pkgusers="ceph"
xmlstarlet
yasm
"
-source="ceph-12.2.1.tar.bz2"
+source="ceph-12.2.2.tar.bz2"
subpackages="
$pkgname-base
$pkgname-common
_udevrulesdir=/etc/udev/rules.d
_python_sitelib=/usr/lib/python2.7/site-packages
-builddir=$srcdir/ceph-12.2.1
+builddir=$srcdir/ceph-12.2.2
build() {
export CEPH_BUILD_VIRTUALENV=$builddir
# main package definition
#################################################################################
Name: ceph
-Version: 12.2.1
+Version: 12.2.2
Release: 0%{?dist}
%if 0%{?fedora} || 0%{?rhel}
Epoch: 2
%endif
-# define %_epoch_prefix macro which will expand to the empty string if %epoch is undefined
+# define _epoch_prefix macro which will expand to the empty string if epoch is
+# undefined
%global _epoch_prefix %{?epoch:%{epoch}:}
Summary: User space components of the Ceph file system
Group: System/Filesystems
%endif
URL: http://ceph.com/
-Source0: http://ceph.com/download/ceph-12.2.1.tar.bz2
+Source0: http://ceph.com/download/ceph-12.2.2.tar.bz2
%if 0%{?suse_version}
%if 0%{?is_opensuse}
ExclusiveArch: x86_64 aarch64 ppc64 ppc64le
%if 0%{?suse_version}
BuildRequires: python-CherryPy
BuildRequires: python-Werkzeug
+BuildRequires: python-numpy-devel
%endif
BuildRequires: python-pecan
BuildRequires: socat
# common
#################################################################################
%prep
-%autosetup -p1 -n ceph-12.2.1
+%autosetup -p1 -n ceph-12.2.2
%build
%if 0%{with cephfs_java}
install -m 0644 -D src/logrotate.conf %{buildroot}%{_sysconfdir}/logrotate.d/ceph
chmod 0644 %{buildroot}%{_docdir}/ceph/sample.ceph.conf
install -m 0644 -D COPYING %{buildroot}%{_docdir}/ceph/COPYING
+install -m 0644 -D src/90-ceph-osd.conf %{buildroot}%{_sysctldir}/90-ceph-osd.conf
# firewall templates and /sbin/mount.ceph symlink
%if 0%{?suse_version}
%{_udevrulesdir}/95-ceph-osd.rules
%{_mandir}/man8/ceph-clsinfo.8*
%{_mandir}/man8/ceph-osd.8*
+%{_mandir}/man8/ceph-bluestore-tool.8*
%if 0%{?rhel} && ! 0%{?centos}
%attr(0755,-,-) %{_sysconfdir}/cron.hourly/subman
%endif
%{_unitdir}/ceph-osd@.service
%{_unitdir}/ceph-osd.target
%attr(750,ceph,ceph) %dir %{_localstatedir}/lib/ceph/osd
+%config(noreplace) %{_sysctldir}/90-ceph-osd.conf
%post osd
%if 0%{?suse_version}
if [ $1 -eq 1 ] ; then
/usr/bin/systemctl start ceph-osd.target >/dev/null 2>&1 || :
fi
+%if 0%{?sysctl_apply}
+ %sysctl_apply 90-ceph-osd.conf
+%else
+ /usr/lib/systemd/systemd-sysctl %{_sysctldir}/90-ceph-osd.conf > /dev/null 2>&1 || :
+%endif
%preun osd
%if 0%{?suse_version}
Epoch: 2
%endif
-# define %_epoch_prefix macro which will expand to the empty string if %epoch is undefined
+# define _epoch_prefix macro which will expand to the empty string if epoch is
+# undefined
%global _epoch_prefix %{?epoch:%{epoch}:}
Summary: User space components of the Ceph file system
%if 0%{?suse_version}
BuildRequires: python-CherryPy
BuildRequires: python-Werkzeug
+BuildRequires: python-numpy-devel
%endif
BuildRequires: python-pecan
BuildRequires: socat
install -m 0644 -D src/logrotate.conf %{buildroot}%{_sysconfdir}/logrotate.d/ceph
chmod 0644 %{buildroot}%{_docdir}/ceph/sample.ceph.conf
install -m 0644 -D COPYING %{buildroot}%{_docdir}/ceph/COPYING
+install -m 0644 -D src/90-ceph-osd.conf %{buildroot}%{_sysctldir}/90-ceph-osd.conf
# firewall templates and /sbin/mount.ceph symlink
%if 0%{?suse_version}
%{_udevrulesdir}/95-ceph-osd.rules
%{_mandir}/man8/ceph-clsinfo.8*
%{_mandir}/man8/ceph-osd.8*
+%{_mandir}/man8/ceph-bluestore-tool.8*
%if 0%{?rhel} && ! 0%{?centos}
%attr(0755,-,-) %{_sysconfdir}/cron.hourly/subman
%endif
%{_unitdir}/ceph-osd@.service
%{_unitdir}/ceph-osd.target
%attr(750,ceph,ceph) %dir %{_localstatedir}/lib/ceph/osd
+%config(noreplace) %{_sysctldir}/90-ceph-osd.conf
%post osd
%if 0%{?suse_version}
if [ $1 -eq 1 ] ; then
/usr/bin/systemctl start ceph-osd.target >/dev/null 2>&1 || :
fi
+%if 0%{?sysctl_apply}
+ %sysctl_apply 90-ceph-osd.conf
+%else
+ /usr/lib/systemd/systemd-sysctl %{_sysctldir}/90-ceph-osd.conf > /dev/null 2>&1 || :
+%endif
%preun osd
%if 0%{?suse_version}
usr/share/man/man8/ceph-volume.8
usr/share/man/man8/ceph-volume-systemd.8
usr/share/man/man8/ceph-osd.8
+usr/share/man/man8/ceph-bluestore-tool.8
+etc/sysctl.d/30-ceph-osd.conf
case "$1" in
configure)
+ [ -x /etc/init.d/procps ] && invoke-rc.d procps restart || :
[ -x /sbin/start ] && start ceph-osd-all || :
;;
abort-upgrade|abort-remove|abort-deconfigure)
+ceph (12.2.2-1) stable; urgency=medium
+
+ * New upstream release
+
+ -- Ceph Release Team <ceph-maintainers@ceph.com> Thu, 30 Nov 2017 14:59:26 +0000
+
ceph (12.2.1-1) stable; urgency=medium
* New upstream release
install -D -m 644 udev/95-ceph-osd.rules $(DESTDIR)/lib/udev/rules.d/95-ceph-osd.rules
install -D -m 644 udev/60-ceph-by-parttypeuuid.rules $(DESTDIR)/lib/udev/rules.d/60-ceph-by-parttypeuuid.rules
install -D -m 644 src/etc-rbdmap $(DESTDIR)/etc/ceph/rbdmap
+ install -D -m 644 src/90-ceph-osd.conf $(DESTDIR)/etc/sysctl.d/30-ceph-osd.conf
# doc/changelog is a directory, which confuses dh_installchangelogs
override_dh_installchangelogs:
ceph-volume
===========
Deploy OSDs with different device technologies like lvm or physical disks using
-pluggable tools (:doc:`lvm/index` itself is treated like a plugin). It tries to
-follow the workflow of ``ceph-disk`` for deploying OSDs, with a predictable,
-and robust way of preparing, activating, and starting OSDs.
+pluggable tools (:doc:`lvm/index` itself is treated like a plugin), following a
+predictable and robust way of preparing, activating, and starting OSDs.
:ref:`Overview <ceph-volume-overview>` |
:ref:`Plugin Guide <ceph-volume-plugins>` |
**Command Line Subcommands**
-Although currently there is support for ``lvm``, the plan is to support other
-technologies, including plain disks.
+There is currently support for ``lvm``, and for plain disks (with GPT
+partitions) that may have been deployed with ``ceph-disk``.
* :ref:`ceph-volume-lvm`
+* :ref:`ceph-volume-simple`
+
+
+Migrating
+---------
+Starting with Ceph version 12.2.2, ``ceph-disk`` is deprecated. Deprecation
+warnings will show up that link to this page. It is strongly suggested
+that users start migrating to ``ceph-volume``.
+
+New deployments
+^^^^^^^^^^^^^^^
+For new deployments, :ref:`ceph-volume-lvm` is recommended: it can use any
+logical volume as input for data OSDs, or it can set up a minimal/naive logical
+volume from a device.
+
+Existing OSDs
+^^^^^^^^^^^^^
+If the cluster has OSDs that were provisioned with ``ceph-disk``, then
+``ceph-volume`` can take over the management of these with
+:ref:`ceph-volume-simple`. A scan is done on the data device or OSD directory,
+and ``ceph-disk`` is fully disabled.
+
+Encrypted OSDs
+^^^^^^^^^^^^^^
+``ceph-volume`` does not currently support encrypted OSDs (although support is
+coming soon). In this case, it is OK to continue to use ``ceph-disk`` until
+``ceph-volume`` fully supports encryption. This page will be updated when that
+happens.
.. toctree::
:hidden:
:caption: Contents:
intro
+ systemd
lvm/index
lvm/activate
lvm/prepare
lvm/scan
lvm/systemd
+ lvm/list
+ lvm/zap
+ simple/index
+ simple/activate
+ simple/scan
+ simple/systemd
To activate newly prepared OSDs both the :term:`OSD id` and :term:`OSD uuid`
need to be supplied. For example::
- ceph-volume activate --filestore 0 0263644D-0BF1-4D6D-BC34-28BD98AE3BC8
+ ceph-volume lvm activate --bluestore 0 0263644D-0BF1-4D6D-BC34-28BD98AE3BC8
.. note:: The UUID is stored in the ``osd_fsid`` file in the OSD path, which is
generated when :ref:`ceph-volume-lvm-prepare` is used.
Would start the discovery process for the OSD with an id of ``0`` and a UUID of
``8715BEB4-15C5-49DE-BA6F-401086EC7B41``.
-.. note:: for more details on the systemd workflow see :ref:`ceph-volume-systemd`
+.. note:: for more details on the systemd workflow see :ref:`ceph-volume-lvm-systemd`
The systemd unit will look for the matching OSD device, and by looking at its
:term:`LVM tags` will proceed to:
# start the ``ceph-osd@0`` systemd unit
+.. note:: The system infers the objectstore type (filestore or bluestore) by
+   inspecting the LVM tags applied to the OSD devices.
+
Existing OSDs
-------------
For existing OSDs that have been deployed with different tooling, the only way
Summary
-------
-To recap the ``activate`` process:
+To recap the ``activate`` process for :term:`bluestore`:
+
+#. require both :term:`OSD id` and :term:`OSD uuid`
+#. enable the system unit with matching id and uuid
+#. create the ``tmpfs`` mount at the OSD directory in
+   ``/var/lib/ceph/osd/$cluster-$id/``
+#. recreate all the files needed with ``ceph-bluestore-tool prime-osd-dir``,
+   pointing it to the OSD ``block`` device
+#. the systemd unit will ensure all devices are ready and linked
+#. the matching ``ceph-osd`` systemd unit will get started
+
+And for :term:`filestore`:
#. require both :term:`OSD id` and :term:`OSD uuid`
#. enable the system unit with matching id and uuid
--- /dev/null
+.. _ceph-volume-lvm-create:
+
+``create``
+===========
+This subcommand wraps the two-step process to provision a new OSD (calling
+``prepare`` first and then ``activate``) into a single step. The reason to
+prefer ``prepare`` and then ``activate`` separately is to gradually introduce
+new OSDs into a cluster, avoiding large amounts of data being rebalanced at
+once.
+
+The single-call process unifies exactly what :ref:`ceph-volume-lvm-prepare` and
+:ref:`ceph-volume-lvm-activate` do, with the convenience of doing it all at
+once.
+
+The process is otherwise the same, except that the OSD will become *up* and
+*in* immediately after completion.
+
+The backing objectstore can be specified with:
+
+* :ref:`--filestore <ceph-volume-lvm-prepare_filestore>`
+* :ref:`--bluestore <ceph-volume-lvm-prepare_bluestore>`
+
+All command line flags and options are the same as ``ceph-volume lvm prepare``.
+Please refer to :ref:`ceph-volume-lvm-prepare` for details.
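+
+A minimal usage sketch, assuming an existing ``vg/lv`` logical volume and
+a bluestore backend::
+
+   ceph-volume lvm create --bluestore --data vg/lv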
* :ref:`ceph-volume-lvm-activate`
+* :ref:`ceph-volume-lvm-create`
+
+* :ref:`ceph-volume-lvm-list`
+
.. not yet implemented
.. * :ref:`ceph-volume-lvm-scan`
exposed to the user, these sections explain how these pieces work together,
clarifying the workflows of the tool.
-:ref:`Systemd Units <ceph-volume-systemd>` |
+:ref:`Systemd Units <ceph-volume-lvm-systemd>` |
:ref:`lvm <ceph-volume-lvm-api>`
--- /dev/null
+.. _ceph-volume-lvm-list:
+
+``list``
+========
+This subcommand will list any devices (logical and physical) that may be
+associated with a Ceph cluster, as long as they contain enough metadata to
+allow for that discovery.
+
+Output is grouped by the OSD ID associated with the devices, and unlike
+``ceph-disk`` it does not provide any information for devices that aren't
+associated with Ceph.
+
+Command line options:
+
+* ``--format`` Allows a ``json`` or ``pretty`` value. Defaults to ``pretty``
+ which will group the device information in a human-readable format.
+
+Full Reporting
+--------------
+When no positional arguments are used, a full report will be presented. This
+means that all devices and logical volumes found in the system will be
+displayed.
+
+Full ``pretty`` reporting for two OSDs, one using an lv as a journal and
+another using a physical device, may look similar to::
+
+ # ceph-volume lvm list
+
+
+ ====== osd.1 =======
+
+ [journal] /dev/journals/journal1
+
+ journal uuid C65n7d-B1gy-cqX3-vZKY-ZoE0-IEYM-HnIJzs
+ osd id 1
+ cluster fsid ce454d91-d748-4751-a318-ff7f7aa18ffd
+ type journal
+ osd fsid 661b24f8-e062-482b-8110-826ffe7f13fa
+ data uuid SlEgHe-jX1H-QBQk-Sce0-RUls-8KlY-g8HgcZ
+ journal device /dev/journals/journal1
+ data device /dev/test_group/data-lv2
+
+ [data] /dev/test_group/data-lv2
+
+ journal uuid C65n7d-B1gy-cqX3-vZKY-ZoE0-IEYM-HnIJzs
+ osd id 1
+ cluster fsid ce454d91-d748-4751-a318-ff7f7aa18ffd
+ type data
+ osd fsid 661b24f8-e062-482b-8110-826ffe7f13fa
+ data uuid SlEgHe-jX1H-QBQk-Sce0-RUls-8KlY-g8HgcZ
+ journal device /dev/journals/journal1
+ data device /dev/test_group/data-lv2
+
+ ====== osd.0 =======
+
+ [data] /dev/test_group/data-lv1
+
+ journal uuid cd72bd28-002a-48da-bdf6-d5b993e84f3f
+ osd id 0
+ cluster fsid ce454d91-d748-4751-a318-ff7f7aa18ffd
+ type data
+ osd fsid 943949f0-ce37-47ca-a33c-3413d46ee9ec
+ data uuid TUpfel-Q5ZT-eFph-bdGW-SiNW-l0ag-f5kh00
+ journal device /dev/sdd1
+ data device /dev/test_group/data-lv1
+
+ [journal] /dev/sdd1
+
+ PARTUUID cd72bd28-002a-48da-bdf6-d5b993e84f3f
+
+.. note:: Tags are displayed in a readable format. The ``osd id`` key is stored
+ as a ``ceph.osd_id`` tag. For more information on lvm tag conventions
+ see :ref:`ceph-volume-lvm-tag-api`
+
+Single Reporting
+----------------
+Single reporting can consume both devices and logical volumes as input
+(positional parameters). For logical volumes, it is required to use the group
+name as well as the logical volume name.
+
+For example the ``data-lv2`` logical volume, in the ``test_group`` volume group
+can be listed in the following way::
+
+ # ceph-volume lvm list test_group/data-lv2
+
+
+ ====== osd.1 =======
+
+ [data] /dev/test_group/data-lv2
+
+ journal uuid C65n7d-B1gy-cqX3-vZKY-ZoE0-IEYM-HnIJzs
+ osd id 1
+ cluster fsid ce454d91-d748-4751-a318-ff7f7aa18ffd
+ type data
+ osd fsid 661b24f8-e062-482b-8110-826ffe7f13fa
+ data uuid SlEgHe-jX1H-QBQk-Sce0-RUls-8KlY-g8HgcZ
+ journal device /dev/journals/journal1
+ data device /dev/test_group/data-lv2
+
+
+.. note:: Tags are displayed in a readable format. The ``osd id`` key is stored
+ as a ``ceph.osd_id`` tag. For more information on lvm tag conventions
+ see :ref:`ceph-volume-lvm-tag-api`
+
+
+For plain disks, the full path to the device is required. For example, for
+a device like ``/dev/sdd1`` it can look like::
+
+
+ # ceph-volume lvm list /dev/sdd1
+
+
+ ====== osd.0 =======
+
+ [journal] /dev/sdd1
+
+ PARTUUID cd72bd28-002a-48da-bdf6-d5b993e84f3f
+
+
+
+``json`` output
+---------------
+All output using ``--format=json`` will show everything the system has stored
+as metadata for the devices, including tags.
+
+No changes for readability are done with ``json`` reporting, and all
+information is presented as-is. Full output as well as single devices can be
+listed.
+
+For brevity, this is how a single logical volume would look with ``json``
+output (note how tags aren't modified)::
+
+ # ceph-volume lvm list --format=json test_group/data-lv1
+ {
+ "0": [
+ {
+ "lv_name": "data-lv1",
+ "lv_path": "/dev/test_group/data-lv1",
+ "lv_tags": "ceph.cluster_fsid=ce454d91-d748-4751-a318-ff7f7aa18ffd,ceph.data_device=/dev/test_group/data-lv1,ceph.data_uuid=TUpfel-Q5ZT-eFph-bdGW-SiNW-l0ag-f5kh00,ceph.journal_device=/dev/sdd1,ceph.journal_uuid=cd72bd28-002a-48da-bdf6-d5b993e84f3f,ceph.osd_fsid=943949f0-ce37-47ca-a33c-3413d46ee9ec,ceph.osd_id=0,ceph.type=data",
+ "lv_uuid": "TUpfel-Q5ZT-eFph-bdGW-SiNW-l0ag-f5kh00",
+ "name": "data-lv1",
+ "path": "/dev/test_group/data-lv1",
+ "tags": {
+ "ceph.cluster_fsid": "ce454d91-d748-4751-a318-ff7f7aa18ffd",
+ "ceph.data_device": "/dev/test_group/data-lv1",
+ "ceph.data_uuid": "TUpfel-Q5ZT-eFph-bdGW-SiNW-l0ag-f5kh00",
+ "ceph.journal_device": "/dev/sdd1",
+ "ceph.journal_uuid": "cd72bd28-002a-48da-bdf6-d5b993e84f3f",
+ "ceph.osd_fsid": "943949f0-ce37-47ca-a33c-3413d46ee9ec",
+ "ceph.osd_id": "0",
+ "ceph.type": "data"
+ },
+ "type": "data",
+ "vg_name": "test_group"
+ }
+ ]
+ }
+
+
+Synchronized information
+------------------------
+Before producing any listing, the lvm API is queried to ensure that physical
+devices that may be in use haven't changed names. It is possible that
+non-persistent device names like ``/dev/sda1`` could change to ``/dev/sdb1``.
+
+The detection is possible because the ``PARTUUID`` is stored as part of the
+metadata in the logical volume for the data lv. Even in the case of a journal
+that is a physical device, this information is still stored on the data logical
+volume associated with it.
+
+If the name is no longer the same (as reported by ``blkid`` when using the
+``PARTUUID``), the tag will get updated and the report will use the newly
+refreshed information.
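+
+As an illustrative sketch, this check is similar to asking ``blkid`` which
+device currently holds a given ``PARTUUID`` (reusing the UUID from the
+example above)::
+
+   blkid -t PARTUUID="cd72bd28-002a-48da-bdf6-d5b993e84f3f" -o device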
``prepare``
===========
-This subcommand allows a :term:`filestore` setup (:term:`bluestore` support is
-planned) and currently consumes only logical volumes for both the data and
-journal. It will not create or modify the logical volumes except for adding
-extra metadata.
+This subcommand allows a :term:`filestore` or :term:`bluestore` setup. It is
+recommended to pre-provision a logical volume before using it with
+``ceph-volume lvm``.
+
+Logical volumes are not altered except for adding extra metadata.
.. note:: This is part of a two step process to deploy an OSD. If looking for
a single-call way, please see :ref:`ceph-volume-lvm-create`
* :ref:`--filestore <ceph-volume-lvm-prepare_filestore>`
-* ``--bluestore``
-
-.. when available, this will need to be updated to:
-.. * :ref:`--bluestore <ceph-volume-prepare_bluestore>`
+* :ref:`--bluestore <ceph-volume-lvm-prepare_bluestore>`
.. _ceph-volume-lvm-prepare_filestore:
``filestore``
-------------
-This is the default OSD backend and allows preparation of logical volumes for
-a :term:`filestore` OSD.
+This is the OSD backend that allows preparation of logical volumes for
+a :term:`filestore` objectstore OSD.
-The process is *very* strict, it requires two logical volumes that are ready to
-be used. No special preparation is needed for these volumes other than
-following the minimum size requirements for data and journal.
+It can use a logical volume for the OSD data and a partitioned physical device
+or logical volume for the journal. No special preparation is needed for these
+volumes other than following the minimum size requirements for data and
+journal.
The API call looks like::
ceph-volume lvm prepare --filestore --data data --journal journal
-The journal *must* be a logical volume, just like the data volume, and that
-argument is always required even if both live under the same group.
+There is flexibility to use a raw device or partition for ``--data`` as well;
+it will be converted to a logical volume. This is not ideal in all situations,
+since ``ceph-volume`` is just going to create a unique volume group and
+a logical volume from that device.
+
+When using logical volumes for ``--data``, the value *must* be a volume group
+name and a logical volume name separated by a ``/``. Since logical volume names
+are not enforced for uniqueness, this prevents using the wrong volume. The
+``--journal`` can be either a logical volume *or* a partition.
+
+When using a partition, it *must* contain a ``PARTUUID`` discoverable by
+``blkid``, so that it can later be identified correctly regardless of the
+device name (or path).
+
+When using a partition, this is how it would look for ``/dev/sdc1``::
+
+   ceph-volume lvm prepare --filestore --data volume_group/lv_name --journal /dev/sdc1
+
+For a logical volume, just like for ``--data``, a volume group and logical
+volume name are required::
+
+   ceph-volume lvm prepare --filestore --data volume_group/lv_name --journal volume_group/journal_lv
A generated uuid is used to ask the cluster for a new OSD. These two pieces are
crucial for identifying an OSD and will later be used throughout the
``bluestore``
-------------
-This subcommand is planned but not currently implemented.
+The :term:`bluestore` objectstore is the default for new OSDs. It offers a bit
+more flexibility for devices. Bluestore supports the following configurations:
+
+* A block device, a block.wal, and a block.db device
+* A block device and a block.wal device
+* A block device and a block.db device
+* A single block device
+
+It can accept a whole device (or partition), or a logical volume for ``block``.
+If a physical device is provided it will then be turned into a logical volume.
+This allows a simpler approach to using LVM, but at the cost of flexibility:
+there are no options or configurations to change how the LV is created.
+
+The ``block`` is specified with the ``--data`` flag, and in its simplest use
+case it looks like::
+
+ ceph-volume lvm prepare --bluestore --data vg/lv
+
+A raw device can be specified in the same way::
+
+ ceph-volume lvm prepare --bluestore --data /path/to/device
+
+
+If a ``block.db`` or a ``block.wal`` is needed (they are optional for
+bluestore) they can be specified with ``--block.db`` and ``--block.wal``,
+respectively. Each can be either a physical device (it **must** be
+a partition) or a logical volume.
+
+Unlike ``block``, partitions used for ``block.db`` and ``block.wal`` aren't
+made into logical volumes, because they can be used as-is.
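+
+For example, a sketch that assumes an existing ``vg/lv`` for the data and
+a ``/dev/sdb1`` partition for ``block.db``::
+
+   ceph-volume lvm prepare --bluestore --data vg/lv --block.db /dev/sdb1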
+
+While creating the OSD directory, the process will use a ``tmpfs`` mount to
+place all the files needed for the OSD. These files are initially created by
+``ceph-osd --mkfs`` and are fully ephemeral.
+
+A symlink is always created for the ``block`` device, and optionally for
+``block.db`` and ``block.wal``. For a cluster with a default name, and an OSD
+id of 0, the directory could look like::
+
+ # ls -l /var/lib/ceph/osd/ceph-0
+ lrwxrwxrwx. 1 ceph ceph 93 Oct 20 13:05 block -> /dev/ceph-be2b6fbd-bcf2-4c51-b35d-a35a162a02f0/osd-block-25cf0a05-2bc6-44ef-9137-79d65bd7ad62
+ lrwxrwxrwx. 1 ceph ceph 93 Oct 20 13:05 block.db -> /dev/sda1
+ lrwxrwxrwx. 1 ceph ceph 93 Oct 20 13:05 block.wal -> /dev/ceph/osd-wal-0
+ -rw-------. 1 ceph ceph 37 Oct 20 13:05 ceph_fsid
+ -rw-------. 1 ceph ceph 37 Oct 20 13:05 fsid
+ -rw-------. 1 ceph ceph 55 Oct 20 13:05 keyring
+ -rw-------. 1 ceph ceph 6 Oct 20 13:05 ready
+ -rw-------. 1 ceph ceph 10 Oct 20 13:05 type
+ -rw-------. 1 ceph ceph 2 Oct 20 13:05 whoami
+
+In the above case, a device was used for ``block``, so ``ceph-volume`` created
+a volume group and a logical volume using the following convention:
+
+* volume group name: ``ceph-{cluster fsid}`` or if the vg exists already
+ ``ceph-{random uuid}``
+
+* logical volume name: ``osd-block-{osd_fsid}``
Storing metadata
----------------
-The following tags will get applied as part of the prepartion process
-regardless of the type of volume (journal or data) and also regardless of the
-OSD backend:
+The following tags will get applied as part of the preparation process
+regardless of the type of volume (journal or data) or OSD objectstore:
* ``cluster_fsid``
-* ``data_device``
-* ``journal_device``
* ``encrypted``
* ``osd_fsid``
* ``osd_id``
-* ``block``
-* ``db``
-* ``wal``
-* ``lockbox_device``
+
+For :term:`filestore` these tags will be added:
+
+* ``journal_device``
+* ``journal_uuid``
+
+For :term:`bluestore` these tags will be added:
+
+* ``block_device``
+* ``block_uuid``
+* ``db_device``
+* ``db_uuid``
+* ``wal_device``
+* ``wal_uuid``
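+
+As an illustrative sketch, the applied tags can be inspected directly with
+LVM (reusing the lv from the ``list`` examples; tag names follow the
+``ceph.`` prefix convention)::
+
+   lvs -o lv_tags /dev/test_group/data-lv2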
.. note:: For the complete lvm tag conventions see :ref:`ceph-volume-lvm-tag-api`
Summary
-------
-To recap the ``prepare`` process:
+To recap the ``prepare`` process for :term:`bluestore`:
+
+#. Accept a logical volume for block or a raw device (that will get converted
+ to an lv)
+#. Accept partitions or logical volumes for ``block.wal`` or ``block.db``
+#. Generate a UUID for the OSD
+#. Ask the monitor for an OSD ID, reusing the generated UUID
+#. OSD data directory is created on a tmpfs mount
+#. ``block``, ``block.wal``, and ``block.db`` are symlinked if defined
+#. monmap is fetched for activation
+#. Data directory is populated by ``ceph-osd``
+#. Logical volumes are assigned all the Ceph metadata using lvm tags
+
+
+And the ``prepare`` process for :term:`filestore`:
#. Accept only logical volumes for data and journal (both required)
#. Generate a UUID for the OSD
-.. _ceph-volume-systemd:
+.. _ceph-volume-lvm-systemd:
systemd
=======
-As part of the :ref:`ceph-volume-lvm-activate` process, a few systemd units will get enabled
-that will use the OSD id and uuid as part of their name. These units will be
-run when the system boots, and will proceed to activate their corresponding
-volumes.
-
-The API for activation requires both the :term:`OSD id` and :term:`OSD uuid`,
-which get persisted by systemd. Internally, the activation process enables the
-systemd unit using the following convention::
-
- ceph-volume@<type>-<extra metadata>
-
-Where ``type`` is the sub-command used to parse the extra metadata, and ``extra
-metadata`` is any additional information needed by the sub-command to be able
-to activate the OSD. For example an OSD with an ID of 0, for the ``lvm``
-sub-command would look like::
-
- systemctl enable ceph-volume@lvm-0-0A3E1ED2-DA8A-4F0E-AA95-61DEC71768D6
-
-
-Process
--------
-The systemd unit is a :term:`systemd oneshot` service, meant to start at boot after the
-local filesystem is ready to be used.
-
Upon startup, it will identify the logical volume using :term:`LVM tags`,
finding a matching ID and later ensuring it is the right one with
the :term:`OSD uuid`.
/var/lib/ceph/osd/ceph-0
+
Once that process is complete, a call will be made to start the OSD::
systemctl start ceph-osd@0
+
+The systemd portion of this process is handled by the ``ceph-volume lvm
+trigger`` sub-command, which is only in charge of parsing the metadata coming
+from systemd at startup, and then dispatching to ``ceph-volume lvm activate``,
+which proceeds with activation.
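+
+For example (a sketch; the argument format is an assumption mirroring the
+``{id}-{uuid}`` portion of the unit's instance name)::
+
+   ceph-volume lvm trigger 0-0A3E1ED2-DA8A-4F0E-AA95-61DEC71768D6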
--- /dev/null
+.. _ceph-volume-lvm-zap:
+
+``zap``
+=======
+
+This subcommand is used to zap lvs or partitions that have been used
+by Ceph OSDs so that they may be reused. If given a path to a logical
+volume, it must be in the ``vg/lv`` format. Any filesystems present
+on the given lv or partition will be removed and all data will be purged.
+
+.. note:: The lv or partition will be kept intact.
+
+Zapping a logical volume::
+
+ ceph-volume lvm zap {vg name/lv name}
+
+Zapping a partition::
+
+ ceph-volume lvm zap /dev/sdc1
--- /dev/null
+.. _ceph-volume-simple-activate:
+
+``activate``
+============
+Once :ref:`ceph-volume-simple-scan` has been completed, and all the metadata
+captured for an OSD has been persisted to ``/etc/ceph/osd/{id}-{uuid}.json``
+the OSD is now ready to get "activated".
+
+This activation process **disables** all ``ceph-disk`` systemd units by masking
+them, to prevent the UDEV/ceph-disk interaction that will attempt to start them
+up at boot time.
+
+The disabling of ``ceph-disk`` units is done only when calling ``ceph-volume
+simple activate`` directly, but it is avoided when being called by systemd when
+the system is booting up.
+
+The activation process requires using both the :term:`OSD id` and
+:term:`OSD uuid`. To activate parsed OSDs::
+
+ ceph-volume simple activate 0 6cc43680-4f6e-4feb-92ff-9c7ba204120e
+
+The above command will assume that a JSON configuration will be found in::
+
+ /etc/ceph/osd/0-6cc43680-4f6e-4feb-92ff-9c7ba204120e.json
+
+Alternatively, using a path to a JSON file directly is also possible::
+
+ ceph-volume simple activate --file /etc/ceph/osd/0-6cc43680-4f6e-4feb-92ff-9c7ba204120e.json
+
+requiring uuids
+^^^^^^^^^^^^^^^
+The :term:`OSD uuid` is required as an extra step to ensure that the
+right OSD is being activated. It is entirely possible that a previous OSD with
+the same id exists, which could otherwise end up activating the wrong one.
+
+
+Discovery
+---------
+With OSDs previously scanned by ``ceph-volume``, a *discovery* process is
+performed using ``blkid`` and ``lvm``. There is currently support only for
+devices with GPT partitions and LVM logical volumes.
+
+The GPT partitions will have a ``PARTUUID`` that can be queried by calling out
+to ``blkid``, and the logical volumes will have a ``lv_uuid`` that can be
+queried against ``lvs`` (the LVM tool to list logical volumes).
+
+This discovery process ensures that devices can be correctly detected even if
+they are repurposed into another system or if their name changes (as in the
+case of non-persistent names like ``/dev/sda1``).
+
+The JSON configuration file used to map what devices go to what OSD will then
+coordinate the mounting and symlinking as part of activation.
+
+To ensure that the symlinks are always correct, if they exist in the OSD
+directory, the symlinks will be re-done.
+
+A systemd unit will capture the :term:`OSD id` and :term:`OSD uuid` and
+persist them. Internally, the activation will enable the unit like so::
+
+ systemctl enable ceph-volume@simple-$id-$uuid
+
+For example::
+
+ systemctl enable ceph-volume@simple-0-8715BEB4-15C5-49DE-BA6F-401086EC7B41
+
+Would start the discovery process for the OSD with an id of ``0`` and a UUID of
+``8715BEB4-15C5-49DE-BA6F-401086EC7B41``.
+
+
+The systemd process will call out to activate passing the information needed to
+identify the OSD and its devices, and it will proceed to:
+
+#. mount the device in the corresponding location (by convention this is
+   ``/var/lib/ceph/osd/<cluster name>-<osd id>/``)
+
+#. ensure that all required devices are ready for that OSD and properly
+   linked, regardless of objectstore used (filestore or bluestore). The
+   symbolic link will **always** be re-done to ensure that the correct device
+   is linked.
+
+#. start the ``ceph-osd@0`` systemd unit
--- /dev/null
+.. _ceph-volume-simple:
+
+``simple``
+==========
+Implements the functionality needed to manage OSDs from the ``simple`` subcommand:
+``ceph-volume simple``
+
+**Command Line Subcommands**
+
+* :ref:`ceph-volume-simple-scan`
+
+* :ref:`ceph-volume-simple-activate`
+
+* :ref:`ceph-volume-simple-systemd`
+
+
+By *taking over* management, it disables all ``ceph-disk`` systemd units used
+to trigger devices at startup, relying on basic (customizable) JSON
+configuration and systemd for starting up OSDs.
--- /dev/null
+.. _ceph-volume-simple-scan:
+
+``scan``
+========
+Scanning allows capturing any important details from an already-deployed OSD
+so that ``ceph-volume`` can manage it without the need for any other startup
+workflows or tools (like ``udev`` or ``ceph-disk``).
+
+The command can inspect a running OSD by examining the directory where the
+OSD data is stored, or by consuming the data partition.
+
+Once scanned, the metadata will (by default) be persisted as JSON in a file in
+``/etc/ceph/osd``. This ``JSON`` file will use the naming convention of
+``{OSD ID}-{OSD FSID}.json``. For an OSD with an id of 1 and an FSID of
+``86ebd829-1405-43d3-8fd6-4cbc9b6ecf96``, the absolute path of the file would
+be::
+
+ /etc/ceph/osd/1-86ebd829-1405-43d3-8fd6-4cbc9b6ecf96.json
+
+The ``scan`` subcommand will refuse to write to this file if it already exists.
+If overwriting the contents is needed, the ``--force`` flag must be used::
+
+ ceph-volume simple scan --force {path}
+
+If there is no need to persist the ``JSON`` metadata, there is support to send
+the contents to ``stdout`` (no file will be written)::
+
+ ceph-volume simple scan --stdout {path}
+
+
+.. _ceph-volume-simple-scan-directory:
+
+Directory scan
+--------------
+The directory scan will capture OSD file contents from interesting files. There
+are a few files that must exist in order to have a successful scan:
+
+* ``ceph_fsid``
+* ``fsid``
+* ``keyring``
+* ``ready``
+* ``type``
+* ``whoami``
+
+In the case of any other file, as long as it is not a binary or a directory, it
+will also get captured and persisted as part of the JSON object.
+
+The convention for the keys in the JSON object is that any file name will be
+a key, and its contents will be its value. If the contents are a single line
+(as in the case of ``whoami``) the contents are trimmed, and the newline
+is dropped. For example, with an OSD with an id of 1, this is how the JSON
+entry would look::
+
+ "whoami": "1",
+
+For files that may have more than one line, the contents are left as-is, for
+example, a ``keyring`` could look like this::
+
+ "keyring": "[osd.1]\n\tkey = AQBBJ/dZp57NIBAAtnuQS9WOS0hnLVe0rZnE6Q==\n",
+
+For a directory like ``/var/lib/ceph/osd/ceph-1``, the command could look
+like::
+
+   ceph-volume simple scan /var/lib/ceph/osd/ceph-1
+
+
+.. note:: There is no support for encrypted OSDs
+
+
+.. _ceph-volume-simple-scan-device:
+
+Device scan
+-----------
+When an OSD directory is not available (OSD is not running, or device is not
+mounted) the ``scan`` command is able to introspect the device to capture
+required data. Just like :ref:`ceph-volume-simple-scan-directory`, it would
+still require a few files present. This means that the device to be scanned
+**must be** the data partition of the OSD.
+
+As long as the data partition of the OSD is being passed in as an argument, the
+sub-command can scan its contents.
+
+In the case where the device is already mounted, the tool can detect this
+scenario and capture file contents from that directory.
+
+If the device is not mounted, a temporary directory will be created, and the
+device will be mounted temporarily just for scanning the contents. Once
+contents are scanned, the device will be unmounted.
+
+For a device like ``/dev/sda1`` which **must** be a data partition, the command
+could look like::
+
+ ceph-volume simple scan /dev/sda1
+
+
+.. note:: There is no support for encrypted OSDs
+
+
+.. _ceph-volume-simple-scan-json:
+
+``JSON`` contents
+-----------------
+The contents of the JSON object are very simple. The scan will not only persist
+information from the special OSD files and their contents, but will also
+validate paths and device UUIDs. Unlike ``ceph-disk``, which stored them in
+``{device type}_uuid`` files, the tool will persist them as part of the
+device type key.
+
+For example, a ``block.db`` device would look something like::
+
+ "block.db": {
+ "path": "/dev/disk/by-partuuid/6cc43680-4f6e-4feb-92ff-9c7ba204120e",
+ "uuid": "6cc43680-4f6e-4feb-92ff-9c7ba204120e"
+ },
+
+But it will also persist the ``ceph-disk`` special file generated, like so::
+
+ "block.db_uuid": "6cc43680-4f6e-4feb-92ff-9c7ba204120e",
+
+This duplication is in place because the tool is trying to ensure the
+following:
+
+#. Support OSDs that may not have ceph-disk special files
+#. Check the most up-to-date information on the device, by querying against
+   LVM and ``blkid``
+#. Support both logical volumes and GPT devices
+
+This is a sample ``JSON`` metadata, from an OSD that is using ``bluestore``::
+
+ {
+ "active": "ok",
+ "block": {
+ "path": "/dev/disk/by-partuuid/40fd0a64-caa5-43a3-9717-1836ac661a12",
+ "uuid": "40fd0a64-caa5-43a3-9717-1836ac661a12"
+ },
+ "block.db": {
+ "path": "/dev/disk/by-partuuid/6cc43680-4f6e-4feb-92ff-9c7ba204120e",
+ "uuid": "6cc43680-4f6e-4feb-92ff-9c7ba204120e"
+ },
+ "block.db_uuid": "6cc43680-4f6e-4feb-92ff-9c7ba204120e",
+ "block_uuid": "40fd0a64-caa5-43a3-9717-1836ac661a12",
+ "bluefs": "1",
+ "ceph_fsid": "c92fc9eb-0610-4363-aafc-81ddf70aaf1b",
+ "cluster_name": "ceph",
+ "data": {
+ "path": "/dev/sdr1",
+ "uuid": "86ebd829-1405-43d3-8fd6-4cbc9b6ecf96"
+ },
+ "fsid": "86ebd829-1405-43d3-8fd6-4cbc9b6ecf96",
+ "keyring": "[osd.3]\n\tkey = AQBBJ/dZp57NIBAAtnuQS9WOS0hnLVe0rZnE6Q==\n",
+ "kv_backend": "rocksdb",
+ "magic": "ceph osd volume v026",
+ "mkfs_done": "yes",
+ "ready": "ready",
+ "systemd": "",
+ "type": "bluestore",
+ "whoami": "3"
+ }
--- /dev/null
+.. _ceph-volume-simple-systemd:
+
+systemd
+=======
+Upon startup, it will identify the OSD by loading the JSON file in
+``/etc/ceph/osd/{id}-{uuid}.json`` corresponding to the instance name of the
+systemd unit.
+
+After identifying the correct volume it will then proceed to mount it by using
+the OSD destination conventions, that is::
+
+ /var/lib/ceph/osd/{cluster name}-{osd id}
+
+For our example OSD with an id of ``0``, that means the identified device will
+be mounted at::
+
+
+ /var/lib/ceph/osd/ceph-0
+
+
+Once that process is complete, a call will be made to start the OSD::
+
+ systemctl start ceph-osd@0
+
+The systemd portion of this process is handled by the ``ceph-volume simple
+trigger`` sub-command, which is only in charge of parsing the metadata coming
+from systemd at startup, and then dispatching to ``ceph-volume simple
+activate``, which proceeds with activation.
--- /dev/null
+.. _ceph-volume-systemd:
+
+systemd
+=======
+As part of the activation process (either with :ref:`ceph-volume-lvm-activate`
+or :ref:`ceph-volume-simple-activate`), systemd units will get enabled that
+will use the OSD id and uuid as part of their name. These units will be run
+when the system boots, and will proceed to activate their corresponding
+volumes via their sub-command implementation.
+
+The API for activation is a bit loose: it only requires two parts, the
+subcommand to use and any extra metadata, separated by a dash. This
+convention makes the units look like::
+
+ ceph-volume@{command}-{extra metadata}
+
+The *extra metadata* can be anything the subcommand implementing the
+processing might need. In the case of :ref:`ceph-volume-lvm` and
+:ref:`ceph-volume-simple`, both consume the :term:`OSD id` and :term:`OSD uuid`,
+but this is not a hard requirement; it is just how the sub-commands are
+implemented.
+
+Both the command and extra metadata get persisted by systemd as part of the
+*"instance name"* of the unit. For example, an OSD with an ID of 0, for the
+``lvm`` sub-command, would look like::
+
+ systemctl enable ceph-volume@lvm-0-0A3E1ED2-DA8A-4F0E-AA95-61DEC71768D6
+
+The enabled unit is a :term:`systemd oneshot` service, meant to start at boot
+after the local filesystem is ready to be used.
+
+
+Failure and Retries
+-------------------
+It is common to have failures when a system is coming up online. Devices
+are sometimes not fully available, and this unpredictable behavior may cause
+an OSD to not be ready to be used.
+
+There are two configurable environment variables used to set the retry
+behavior:
+
+* ``CEPH_VOLUME_SYSTEMD_TRIES``: Defaults to 30
+* ``CEPH_VOLUME_SYSTEMD_INTERVAL``: Defaults to 5
+
+The *"tries"* is a number that sets the maximum amount of times the unit will
+attempt to activate an OSD before giving up.
+
+The *"interval"* is a value in seconds that determines the waiting time before
+initiating another try at activating the OSD.
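+
+For example, a hypothetical systemd drop-in could adjust both values (the
+unit, file name, and values here are assumptions)::
+
+   # /etc/systemd/system/ceph-volume@.service.d/retry.conf
+   [Service]
+   Environment=CEPH_VOLUME_SYSTEMD_TRIES=10
+   Environment=CEPH_VOLUME_SYSTEMD_INTERVAL=15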
:Type: Boolean
:Default: ``false``
+
+
+``mds min caps per client``
+
+:Description: Set the minimum number of capabilities a client may hold.
+:Type: Integer
+:Default: ``100``
+
+
+``mds max ratio caps per client``
+
+:Description: Set the maximum ratio of current caps that may be recalled during MDS cache pressure.
+:Type: Float
+:Default: ``0.8``
'sphinx.ext.autodoc',
'sphinx.ext.graphviz',
'sphinx.ext.todo',
- 'sphinx_ditaa',
+ 'sphinxcontrib.ditaa',
'breathe',
]
+ditaa = 'ditaa'
todo_include_todos = True
top_level = os.path.dirname(
ceph-volume.rst
ceph-volume-systemd.rst
ceph-osd.rst
- osdmaptool.rst)
+ osdmaptool.rst
+ ceph-bluestore-tool.rst)
set(mon_srcs
ceph-mon.rst
--- /dev/null
+:orphan:
+
+======================================================
+ ceph-bluestore-tool -- bluestore administrative tool
+======================================================
+
+.. program:: ceph-bluestore-tool
+
+Synopsis
+========
+
+| **ceph-bluestore-tool** *command*
+ [ --dev *device* ... ]
+ [ --path *osd path* ]
+ [ --out-dir *dir* ]
+ [ --log-file | -l *filename* ]
+ [ --deep ]
+| **ceph-bluestore-tool** fsck|repair --path *osd path* [ --deep ]
+| **ceph-bluestore-tool** show-label --dev *device* ...
+| **ceph-bluestore-tool** prime-osd-dir --dev *device* --path *osd path*
+| **ceph-bluestore-tool** bluefs-export --path *osd path* --out-dir *dir*
+| **ceph-bluestore-tool** bluefs-bdev-sizes --path *osd path*
+| **ceph-bluestore-tool** bluefs-bdev-expand --path *osd path*
+
+
+Description
+===========
+
+**ceph-bluestore-tool** is a utility to perform low-level administrative
+operations on a BlueStore instance.
+
+Commands
+========
+
+.. option:: help
+
+ show help
+
+.. option:: fsck
+
+ run consistency check on BlueStore metadata. If *--deep* is specified, also read all object data and verify checksums.
+
+.. option:: repair
+
+ Run a consistency check *and* repair any errors we can.
+
+.. option:: bluefs-export
+
+ Export the contents of BlueFS (i.e., rocksdb files) to an output directory.
+
+.. option:: bluefs-bdev-sizes --path *osd path*
+
+ Print the device sizes, as understood by BlueFS, to stdout.
+
+.. option:: bluefs-bdev-expand --path *osd path*
+
+ Instruct BlueFS to check the size of its block devices and, if they have expanded, make use of the additional space.
+
+.. option:: show-label --dev *device* [...]
+
+ Show device label(s).
+
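+For example, a deep consistency check of a stopped OSD could look like this
+(the OSD path shown is just an example)::
+
+   ceph-bluestore-tool fsck --path /var/lib/ceph/osd/ceph-0 --deep
+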
+Options
+=======
+
+.. option:: --dev *device*
+
+ Add *device* to the list of devices to consider
+
+.. option:: --path *osd path*
+
+ Specify an osd path. In most cases, the device list is inferred from the symlinks present in *osd path*. This is usually simpler than explicitly specifying the device(s) with --dev.
+
+.. option:: --out-dir *dir*
+
+ Output directory for bluefs-export
+
+.. option:: -l, --log-file *log file*
+
+ file to log to
+
+.. option:: --log-level *num*
+
+ debug log level. Default is 30 (extremely verbose), 20 is very
+ verbose, 10 is verbose, and 1 is not very verbose.
+
+.. option:: --deep
+
+ deep scrub/repair (read and validate object data, not just metadata)
+
+Device labels
+=============
+
+Every BlueStore block device has a single block label at the beginning of the
+device. You can dump the contents of the label with::
+
+ ceph-bluestore-tool show-label --dev *device*
+
+The main device will have a lot of metadata, including information
+that used to be stored in small files in the OSD data directory. The
+auxiliary devices (db and wal) will only have the minimum required
+fields (OSD UUID, size, device type, birth time).
+
+OSD directory priming
+=====================
+
+You can generate the content for an OSD data directory that can start up a
+BlueStore OSD with the *prime-osd-dir* command::
+
+ ceph-bluestore-tool prime-osd-dir --dev *main device* --path /var/lib/ceph/osd/ceph-*id*
+
+
+Availability
+============
+
+**ceph-bluestore-tool** is part of Ceph, a massively scalable,
+open-source, distributed storage system. Please refer to the Ceph
+documentation at http://ceph.com/docs for more information.
+
+
+See also
+========
+
+:doc:`ceph-osd <ceph-osd>`\(8)
If you want to pre-empt failover, you can explicitly mark a ceph-mgr
daemon as failed using ``ceph mgr fail <mgr name>``.
+Using modules
+-------------
+
+Use the command ``ceph mgr module ls`` to see which modules are
+available, and which are currently enabled. Enable or disable modules
+using the commands ``ceph mgr module enable <module>`` and
+``ceph mgr module disable <module>`` respectively.
+
+If a module is *enabled* then the active ceph-mgr daemon will load
+and execute it. In the case of modules that provide a service,
+such as an HTTP server, the module may publish its address when it
+is loaded. To see the addresses of such modules, use the command
+``ceph mgr services``.
+
+Some modules may also implement a special standby mode which runs on
+standby ceph-mgr daemons as well as the active daemon. This enables
+modules that provide services to redirect their clients to the active
+daemon, if the client tries to connect to a standby.
+
+Consult the documentation pages for individual manager modules for more
+information about what functionality each module provides.
+
+Here is an example of enabling the ``dashboard`` module:
+
+::
+
+ $ ceph mgr module ls
+ {
+ "enabled_modules": [
+ "restful",
+ "status"
+ ],
+ "disabled_modules": [
+ "dashboard"
+ ]
+ }
+
+ $ ceph mgr module enable dashboard
+ $ ceph mgr module ls
+ {
+ "enabled_modules": [
+ "restful",
+ "status",
+ "dashboard"
+ ],
+ "disabled_modules": [
+ ]
+ }
+
+ $ ceph mgr services
+ {
+ "dashboard": "http://myserver.com:7789/",
+ "restful": "https://myserver.com:8789/"
+ }
+
+
Calling module commands
-----------------------
If the address is not configured, the web app will bind to ``::``,
which corresponds to all available IPv4 and IPv6 addresses.
+You can configure a prefix for all URLs::
+
+ ceph config-key set mgr/dashboard/url_prefix $PREFIX
+
+so you can access the dashboard at ``http://$IP:$PORT/$PREFIX/``.
+
+
Load balancer
-------------
dashboard available via a consistent URL regardless of which manager
daemon is currently active, you may want to set up a load balancer
front-end to direct traffic to whichever manager endpoint is
-available.
+available. If you use a reverse http proxy that forwards a subpath to
+the dashboard, you need to configure ``url_prefix`` (see above).
:maxdepth: 1
Installation and Configuration <administrator>
+ Writing plugins <plugins>
Dashboard plugin <dashboard>
+ Local pool plugin <localpool>
RESTful plugin <restful>
Zabbix plugin <zabbix>
Prometheus plugin <prometheus>
- Writing plugins <plugins>
+ Influx plugin <influx>
--- /dev/null
+=============
+Influx Plugin
+=============
+
+The influx plugin continuously collects and sends time series data to an
+influxdb database.
+
+The influx plugin was introduced in the 13.x *Mimic* release.
+
+--------
+Enabling
+--------
+
+To enable the module, use the following command:
+
+::
+
+ ceph mgr module enable influx
+
+If you wish to subsequently disable the module, you can use the equivalent
+*disable* command:
+
+::
+
+ ceph mgr module disable influx
+
+-------------
+Configuration
+-------------
+
+For the influx module to send statistics to an InfluxDB server, it
+is necessary to configure the server's address and some authentication
+credentials.
+
+Set configuration values using the following command:
+
+::
+
+ ceph config-key set mgr/influx/<key> <value>
+
+
+The most important settings are ``hostname``, ``username`` and ``password``.
+For example, a typical configuration might look like this:
+
+::
+
+ ceph config-key set mgr/influx/hostname influx.mydomain.com
+ ceph config-key set mgr/influx/username admin123
+ ceph config-key set mgr/influx/password p4ssw0rd
+
+Additional optional configuration settings are:
+
+:interval: Time between reports to InfluxDB. Default 5 seconds.
+:database: InfluxDB database name. Default "ceph". You will need to create this database and grant write privileges to the configured username or the username must have admin privileges to create it.
+:port: InfluxDB server port. Default 8086
+
+
+---------
+Debugging
+---------
+
+By default, a few debugging statements as well as error statements are printed to the log files. Users can add more if necessary.
+To make use of the debugging option in the module:
+
+- Add this to the ceph.conf file.::
+
+ [mgr]
+ debug_mgr = 20
+
+- Use this command ``ceph tell mgr.<mymonitor> influx self-test``.
+- Check the log files. Users may find it easier to filter the log files using *mgr[influx]*.
+
+--------------------
+Interesting counters
+--------------------
+
+The following tables describe a subset of the values output by
+this module.
+
+^^^^^
+Pools
+^^^^^
+
++---------------+-----------------------------------------------------+
+|Counter | Description |
++===============+=====================================================+
+|bytes_used | Bytes used in the pool not including copies |
++---------------+-----------------------------------------------------+
+|max_avail | Max available number of bytes in the pool |
++---------------+-----------------------------------------------------+
+|objects | Number of objects in the pool |
++---------------+-----------------------------------------------------+
+|wr_bytes | Number of bytes written in the pool |
++---------------+-----------------------------------------------------+
+|dirty | Number of bytes dirty in the pool |
++---------------+-----------------------------------------------------+
+|rd_bytes | Number of bytes read in the pool |
++---------------+-----------------------------------------------------+
+|raw_bytes_used | Bytes used in pool including copies made |
++---------------+-----------------------------------------------------+
+
+^^^^
+OSDs
+^^^^
+
++------------+------------------------------------+
+|Counter | Description |
++============+====================================+
+|op_w | Client write operations |
++------------+------------------------------------+
+|op_in_bytes | Client operations total write size |
++------------+------------------------------------+
+|op_r | Client read operations |
++------------+------------------------------------+
+|op_out_bytes| Client operations total read size |
++------------+------------------------------------+
+
+
++------------------------+--------------------------------------------------------------------------+
+|Counter | Description |
++========================+==========================================================================+
+|op_wip | Replication operations currently being processed (primary) |
++------------------------+--------------------------------------------------------------------------+
+|op_latency | Latency of client operations (including queue time) |
++------------------------+--------------------------------------------------------------------------+
+|op_process_latency | Latency of client operations (excluding queue time) |
++------------------------+--------------------------------------------------------------------------+
+|op_prepare_latency | Latency of client operations (excluding queue time and wait for finished)|
++------------------------+--------------------------------------------------------------------------+
+|op_r_latency | Latency of read operation (including queue time) |
++------------------------+--------------------------------------------------------------------------+
+|op_r_process_latency | Latency of read operation (excluding queue time) |
++------------------------+--------------------------------------------------------------------------+
+|op_w_in_bytes | Client data written |
++------------------------+--------------------------------------------------------------------------+
+|op_w_latency | Latency of write operation (including queue time) |
++------------------------+--------------------------------------------------------------------------+
+|op_w_process_latency | Latency of write operation (excluding queue time) |
++------------------------+--------------------------------------------------------------------------+
+|op_w_prepare_latency | Latency of write operations (excluding queue time and wait for finished) |
++------------------------+--------------------------------------------------------------------------+
+|op_rw | Client read-modify-write operations |
++------------------------+--------------------------------------------------------------------------+
+|op_rw_in_bytes | Client read-modify-write operations write in |
++------------------------+--------------------------------------------------------------------------+
+|op_rw_out_bytes | Client read-modify-write operations read out |
++------------------------+--------------------------------------------------------------------------+
+|op_rw_latency | Latency of read-modify-write operation (including queue time) |
++------------------------+--------------------------------------------------------------------------+
+|op_rw_process_latency | Latency of read-modify-write operation (excluding queue time) |
++------------------------+--------------------------------------------------------------------------+
+|op_rw_prepare_latency | Latency of read-modify-write operations (excluding queue time |
+| | and wait for finished) |
++------------------------+--------------------------------------------------------------------------+
+|op_before_queue_op_lat | Latency of IO before calling queue (before really queue into ShardedOpWq)|
+| | op_before_dequeue_op_lat |
++------------------------+--------------------------------------------------------------------------+
+|op_before_dequeue_op_lat| Latency of IO before calling dequeue_op(already dequeued and get PG lock)|
++------------------------+--------------------------------------------------------------------------+
+
+Latency counters are measured in microseconds unless otherwise specified in the description.
+
--- /dev/null
+Local pool plugin
+=================
+
+The *localpool* plugin can automatically create RADOS pools that are
+localized to a subset of the overall cluster. For example, by default, it will
+create a pool for each distinct rack in the cluster. This can be useful for
+some deployments that want to distribute some data locally as well as globally
+across the cluster.
+
+Enabling
+--------
+
+The *localpool* module is enabled with::
+
+ ceph mgr module enable localpool
+
+Configuring
+-----------
+
+The *localpool* module understands the following options:
+
+* **subtree** (default: `rack`): which CRUSH subtree type the module
+ should create a pool for.
+* **failure_domain** (default: `host`): what failure domain we should
+ separate data replicas across.
+* **pg_num** (default: `128`): number of PGs to create for each pool
+* **num_rep** (default: `3`): number of replicas for each pool.
+ (Currently, pools are always replicated.)
+* **min_size** (default: none): value to set min_size to (unchanged from Ceph's default if this option is not set)
+* **prefix** (default: `by-$subtreetype-`): prefix for the pool name.
+
+These options are set via the config-key interface. For example, to
+change the replication level to 2x with only 64 PGs, ::
+
+ ceph config-key set mgr/localpool/num_rep 2
+ ceph config-key set mgr/localpool/pg_num 64
instance is triggered, with notify_type set to "command", and
notify_id set to the tag of the command.
+Implementing standby mode
+-------------------------
+
+For some modules, it is useful to run on standby manager daemons as well
+as on the active daemon. For example, an HTTP server can usefully
+serve HTTP redirect responses from the standby managers so that
+the user can point their browser at any of the manager daemons without
+having to worry about which one is active.
+
+Standby manager daemons look for a class called ``StandbyModule``
+in each module. If the class is not found then the module is not
+used at all on standby daemons. If the class is found, then
+its ``serve`` method is called. Implementations of ``StandbyModule``
+must inherit from ``mgr_module.MgrStandbyModule``.
+
+The interface of ``MgrStandbyModule`` is much more restricted compared to
+``MgrModule``: none of the Ceph cluster state is available to
+the module. The ``serve`` and ``shutdown`` methods are used in the same
+way as a normal module class. The ``get_active_uri`` method enables
+the standby module to discover the address of its active peer in
+order to make redirects. See the ``MgrStandbyModule`` definition
+in the Ceph source code for the full list of methods.
+
+For an example of how to use this interface, look at the source code
+of the ``dashboard`` module.
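+
+A minimal sketch of such a class (the names come from the description above;
+the body is illustrative only)::
+
+    from mgr_module import MgrStandbyModule
+
+    class StandbyModule(MgrStandbyModule):
+        def serve(self):
+            # Blocking entry point run on standby daemons. A real module
+            # (e.g. the dashboard) would start an HTTP server here and
+            # redirect clients to the address of the active daemon.
+            uri = self.get_active_uri()
+            ...
+
+        def shutdown(self):
+            # Called when the standby module should stop serving.
+            pass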
Logging
-------
+=================
Prometheus plugin
=================
for all reporting entities are returned in text exposition format.
(See the Prometheus `documentation <https://prometheus.io/docs/instrumenting/exposition_formats/#text-format-details>`_.)
-Enabling
---------
+Enabling prometheus output
+==========================
The *prometheus* module is enabled with::
``mgr/prometheus/server_addr`` and ``mgr/prometheus/server_port``.
This port is registered with Prometheus's `registry <https://github.com/prometheus/prometheus/wiki/Default-port-allocations>`_.
+Statistic names and labels
+==========================
+
+The names of the stats are exactly as Ceph names them, with
+illegal characters ``.``, ``-`` and ``::`` translated to ``_``,
+and ``ceph_`` prefixed to all names.
+
+
+All *daemon* statistics have a ``ceph_daemon`` label such as "osd.123"
+that identifies the type and ID of the daemon they come from. Some
+statistics can come from different types of daemon, so when querying
+e.g. an OSD's RocksDB stats, you would probably want to filter
+on ``ceph_daemon`` values starting with "osd" to avoid mixing in the
+monitor's RocksDB stats.
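+
+For example, the following PromQL expression (the metric name here is
+illustrative only) keeps just the OSD series for a RocksDB counter::
+
+    ceph_rocksdb_submit_latency{ceph_daemon=~"osd.*"}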
+
+
+The *cluster* statistics (i.e. those global to the Ceph cluster)
+have labels appropriate to what they report on. For example,
+metrics relating to pools have a ``pool_id`` label.
+
+Pool and OSD metadata series
+----------------------------
+
+Special series are output to enable displaying and querying on
+certain metadata fields.
+
+Pools have a ``ceph_pool_metadata`` field like this:
+
+::
+
+ ceph_pool_metadata{pool_id="2",name="cephfs_metadata_a"} 0.0
+
+OSDs have a ``ceph_osd_metadata`` field like this:
+
+::
+
+ ceph_osd_metadata{cluster_addr="172.21.9.34:6802/19096",device_class="ssd",id="0",public_addr="172.21.9.34:6801/19096",weight="1.0"} 0.0
+
+
+Correlating drive statistics with node_exporter
+-----------------------------------------------
+
+The Prometheus output from Ceph is designed to be used in conjunction
+with the generic host monitoring from the Prometheus node_exporter.
+
+To enable correlation of Ceph OSD statistics with node_exporter's
+drive statistics, special series are output like this:
+
+::
+
+ ceph_disk_occupation{ceph_daemon="osd.0",device="sdd",instance="myhost",job="ceph"}
+
+To use this to get disk statistics by OSD ID, use the ``and on`` syntax
+in your Prometheus query like this:
+
+::
+
+ rate(node_disk_bytes_written[30s]) and on (device,instance) ceph_disk_occupation{ceph_daemon="osd.0"}
+
+See the Prometheus documentation for more information about constructing
+queries.
+
+Note that for this mechanism to work, Ceph and node_exporter must agree
+about the values of the ``instance`` label. See the following section
+for guidance on how to set up Prometheus in a way that sets
+``instance`` properly.
+
+Configuring Prometheus server
+=============================
+
+See the Prometheus documentation for full details of how to add
+scrape endpoints: the notes in this section are tips on how to
+configure Prometheus to capture the Ceph statistics in the most
+usefully-labelled form.
+
+This configuration is necessary because Ceph reports metrics from
+many hosts and services via a single endpoint, and because some
+metrics (such as pool statistics) do not relate to any physical host.
+
+honor_labels
+------------
+
+To enable Ceph to output properly-labelled data relating to any host,
+use the ``honor_labels`` setting when adding the ceph-mgr endpoints
+to your Prometheus configuration.
+
+Without this setting, any ``instance`` labels that Ceph outputs, such
+as those in the ``ceph_disk_occupation`` series, will be overridden
+by Prometheus (note the ``honor_labels: true`` line in the example
+configuration below).
+
+Ceph instance label
+-------------------
+
+By default, Prometheus applies an ``instance`` label that includes
+the hostname and port of the endpoint that the series came from. Because
+Ceph clusters have multiple manager daemons, this results in an ``instance``
+label that changes spuriously when the active manager daemon changes.
+
+Set a custom ``instance`` label in your Prometheus target configuration:
+you might wish to set it to the hostname of your first monitor, or something
+completely arbitrary like "ceph_cluster".
+
+node_exporter instance labels
+-----------------------------
+
+Set your ``instance`` labels to match what appears in Ceph's OSD metadata
+in the ``hostname`` field. This is generally the short hostname of the node.
+
+This is only necessary if you want to correlate Ceph stats with host
+stats, but you may find it useful to set the labels anyway, in case
+you ever want to do the correlation in the future.
+
+Example configuration
+---------------------
+
+This example shows a single node configuration running ceph-mgr and
+node_exporter on a server called ``senta04``.
+
+This is just an example: there are other ways to configure Prometheus
+scrape targets and label rewrite rules.
+
+prometheus.yml
+~~~~~~~~~~~~~~
+
+::
+
+ global:
+ scrape_interval: 15s
+ evaluation_interval: 15s
+
+ scrape_configs:
+ - job_name: 'node'
+ file_sd_configs:
+ - files:
+ - node_targets.yml
+ - job_name: 'ceph'
+ honor_labels: true
+ file_sd_configs:
+ - files:
+ - ceph_targets.yml
+
+
+ceph_targets.yml
+~~~~~~~~~~~~~~~~
+
+
+::
+
+ [
+ {
+ "targets": [ "senta04.mydomain.com:9283" ],
+ "labels": {
+ "instance": "ceph_cluster"
+ }
+ }
+ ]
+
+
+node_targets.yml
+~~~~~~~~~~~~~~~~
+
+::
+
+ [
+ {
+ "targets": [ "senta04.mydomain.com:9100" ],
+ "labels": {
+ "instance": "senta04"
+ }
+ }
+ ]
+
+
Notes
------
+=====
Counters and gauges are exported; currently histograms and long-running
averages are not. It's possible that Ceph's 2-D histograms could be
reduced to two separate 1-D histograms, and that long-running averages
could be exported as Prometheus' Summary type.
-The names of the stats are exactly as Ceph names them, with
-illegal characters ``.`` and ``-`` translated to ``_``. There is one
-label applied, ``daemon``, and its value is the daemon.id for the
-daemon in question (e.g. ``{daemon=mon.hosta}`` or ``{daemon=osd.11}``).
-
Timestamps, as with many Prometheus exporters, are established by
the server's scrape time (Prometheus expects that it is polling the
actual counter process synchronously). It is possible to supply a
:Type: 32-bit Integer
:Default: ``45``
+``osd max pg per osd hard ratio``
+
+:Description: The ratio of the number of PGs per OSD allowed by the cluster
+ before an OSD refuses to create new PGs. An OSD stops creating new PGs if
+ the number of PGs it serves exceeds
+ ``osd max pg per osd hard ratio`` \* ``mon max pg per osd``. For example,
+ with the defaults of ``2`` and ``200``, an OSD stops creating new PGs once
+ it serves more than 400 of them.
+
+:Type: Float
+:Default: ``2``
.. _pool: ../../operations/pools
.. _Monitoring OSDs and PGs: ../../operations/monitoring-osd-pg#peering
____________
The number of PGs in use in the cluster is above the configurable
-threshold of ``mon_pg_warn_max_per_osd`` PGs per OSD. This can lead
+threshold of ``mon_max_pg_per_osd`` PGs per OSD. If this threshold is
+exceeded, the cluster will not allow new pools to be created, pool ``pg_num`` to
+be increased, or pool replication to be increased (any of which would lead to
+more PGs in the cluster). A large number of PGs can lead
to higher memory utilization for OSD daemons, slower peering after
cluster state changes (like OSD restarts, additions, or removals), and
higher load on the Manager and Monitor daemons.
-The ``pg_num`` value for existing pools cannot currently be reduced.
-However, the ``pgp_num`` value can, which effectively collocates some
-PGs on the same sets of OSDs, mitigating some of the negative impacts
-described above. The ``pgp_num`` value can be adjusted with::
+The simplest way to mitigate the problem is to increase the number of
+OSDs in the cluster by adding more hardware. Note that the OSD count
+used for the purposes of this health check is the number of "in" OSDs,
+so marking "out" OSDs "in" (if there are any) can also help::
- ceph osd pool set <pool> pgp_num <value>
+ ceph osd in <osd id(s)>
Please refer to
:doc:`placement-groups#Choosing-the-number-of-Placement-Groups` for
ceph osd pool set <pool> pgp_num <pg-num-value>
-
MANY_OBJECTS_PER_PG
___________________
)
def read_input(self, input_lines):
+ previous_line = None
for line in input_lines:
self.get_state(line)
self.get_event(line)
- self.get_context(line)
-
- def get_context(self, line):
- match = re.search(r"(\w+::)*::(?P<tag>\w+)::\w+\(const (?P<event>\w+)",
- line)
+ # pass two lines at a time to get the context so that regexes can
+ # match on split signatures
+ self.get_context(line, previous_line)
+ previous_line = line
+
+ def get_context(self, line, previous_line):
+ match = re.search(r"(\w+::)*::(?P<tag>\w+)::\w+\(const (?P<event>\w+)", line)
+ if match is None and previous_line is not None:
+            # the signature may be split across two lines, so join the
+            # previous line to this one and try the match again
+ joined_line = ' '.join([previous_line, line])
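+            # e.g. the joined line lets the regex match a signature split
+            # across two lines, such as:
+            #   boost::statechart::result PG::RecoveryState::Stray::react(
+            #       const MQuery& query)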
+ match = re.search(r"(\w+::)*::(?P<tag>\w+)::\w+\(\s*const (?P<event>\w+)", joined_line)
if match is not None:
self.context.append((match.group('tag'), self.context_depth, match.group('event')))
if '{' in line:
r"boost::statechart::state_machine<\s*(\w*),\s*(\w*)\s*>",
line)
if tokens is None:
- raise "Error: malformed state_machine line: " + line
+ raise Exception("Error: malformed state_machine line: " + line)
self.machines[tokens.group(1)] = tokens.group(2)
self.context.append((tokens.group(1), self.context_depth, ""))
return
r"boost::statechart::state<\s*(\w*),\s*(\w*)\s*,?\s*(\w*)\s*>",
line)
if tokens is None:
- raise "Error: malformed state line: " + line
+ raise Exception("Error: malformed state line: " + line)
self.states[tokens.group(1)] = tokens.group(2)
if tokens.group(2) not in self.state_contents.keys():
self.state_contents[tokens.group(2)] = []
if i.group(1) not in self.edges.keys():
self.edges[i.group(1)] = []
if len(self.context) is 0:
- raise "no context at line: " + line
+ raise Exception("no context at line: " + line)
self.edges[i.group(1)].append((self.context[-1][0], i.group(2)))
i = re.search("return\s+transit<\s*(\w*)\s*>()", line)
if i is not None:
if len(self.context) is 0:
- raise "no context at line: " + line
+ raise Exception("no context at line: " + line)
if self.context[-1][2] is "":
- raise "no event in context at line: " + line
+ raise Exception("no event in context at line: " + line)
if self.context[-1][2] not in self.edges.keys():
self.edges[self.context[-1][2]] = []
self.edges[self.context[-1][2]].append((self.context[-1][0], i.group(1)))
# Increase tcmalloc cache size
TCMALLOC_MAX_TOTAL_THREAD_CACHE_BYTES=134217728
-
-## use jemalloc instead of tcmalloc
-#
-# jemalloc is generally faster for small IO workloads and when
-# ceph-osd is backed by SSDs. However, memory usage is usually
-# higher by 200-300mb.
-#
-#LD_PRELOAD=/usr/lib/x86_64-linux-gnu/libjemalloc.so.1
# Increase tcmalloc cache size
TCMALLOC_MAX_TOTAL_THREAD_CACHE_BYTES=134217728
-## use jemalloc instead of tcmalloc
-#
-# jemalloc is generally faster for small IO workloads and when
-# ceph-osd is backed by SSDs. However, memory usage is usually
-# higher by 200-300mb.
-#
-#LD_PRELOAD=/usr/lib64/libjemalloc.so.1
-
## automatically restart systemd units on upgrade
#
# By default, it is left to the administrator to restart
roles:
-- [mon.a, mon.c, mgr.y, mds.a, osd.0, osd.1, osd.2]
-- [mon.b, mgr.x, mds.b, mds.c, osd.3, osd.4, osd.5]
+- [mon.a, mon.c, mgr.y, mds.a, osd.0, osd.1, osd.2, osd.3]
+- [mon.b, mgr.x, mds.b, mds.c, osd.4, osd.5, osd.6, osd.7]
- [client.0]
roles:
-- [mon.a, mon.c, mgr.y, mds.a, mds.b, mds.c, mds.d, osd.0, osd.1, osd.2]
-- [mon.b, mgr.x, mds.e, mds.f, mds.g, mds.h, mds.i, osd.3, osd.4, osd.5]
+- [mon.a, mon.c, mgr.y, mds.a, mds.b, mds.c, mds.d, osd.0, osd.1, osd.2, osd.3]
+- [mon.b, mgr.x, mds.e, mds.f, mds.g, mds.h, mds.i, osd.4, osd.5, osd.6, osd.7]
- [client.0]
--- /dev/null
+overrides:
+ thrashosds:
+ bdev_inject_crash: 2
+ bdev_inject_crash_probability: .5
+ ceph:
+ fs: xfs
+ cephfs_ec_profile:
+ - m=2
+ - k=2
+ - crush-failure-domain=osd
+ conf:
+ osd:
+ osd objectstore: bluestore
+ bluestore block size: 96636764160
+ debug bluestore: 20
+ debug bluefs: 20
+ debug rocksdb: 10
+ bluestore compression mode: aggressive
+ bluestore fsck on mount: true
+ # lower the full ratios since we can fill up a 100gb osd so quickly
+ mon osd full ratio: .9
+ mon osd backfillfull_ratio: .85
+ mon osd nearfull ratio: .8
+ osd failsafe full ratio: .95
+
+# this doesn't work with failures because the log writes are not atomic across the two backends
+# bluestore bluefs env mirror: true
+
--- /dev/null
+overrides:
+ thrashosds:
+ bdev_inject_crash: 2
+ bdev_inject_crash_probability: .5
+ ceph:
+ fs: xfs
+ conf:
+ osd:
+ osd objectstore: bluestore
+ bluestore block size: 96636764160
+ debug bluestore: 20
+ debug bluefs: 20
+ debug rocksdb: 10
+ bluestore compression mode: aggressive
+ bluestore fsck on mount: true
+ # lower the full ratios since we can fill up a 100gb osd so quickly
+ mon osd full ratio: .9
+ mon osd backfillfull_ratio: .85
+ mon osd nearfull ratio: .8
+ osd failsafe full ratio: .95
+
+# this doesn't work with failures because the log writes are not atomic across the two backends
+# bluestore bluefs env mirror: true
--- /dev/null
+overrides:
+ thrashosds:
+ bdev_inject_crash: 2
+ bdev_inject_crash_probability: .5
+ ceph:
+ fs: xfs
+ cephfs_ec_profile:
+ - m=2
+ - k=2
+ - crush-failure-domain=osd
+ conf:
+ osd:
+ osd objectstore: bluestore
+ bluestore block size: 96636764160
+ debug bluestore: 20
+ debug bluefs: 20
+ debug rocksdb: 10
+ bluestore fsck on mount: true
+ # lower the full ratios since we can fill up a 100gb osd so quickly
+ mon osd full ratio: .9
+ mon osd backfillfull_ratio: .85
+ mon osd nearfull ratio: .8
+ osd failsafe full ratio: .95
+# this doesn't work with failures because the log writes are not atomic across the two backends
+# bluestore bluefs env mirror: true
+ ceph-deploy:
+ fs: xfs
+ bluestore: yes
+ conf:
+ osd:
+ osd objectstore: bluestore
+ bluestore block size: 96636764160
+ debug bluestore: 20
+ debug bluefs: 20
+ debug rocksdb: 10
+ bluestore fsck on mount: true
+ # lower the full ratios since we can fill up a 100gb osd so quickly
+ mon osd full ratio: .9
+ mon osd backfillfull_ratio: .85
+ mon osd nearfull ratio: .8
+ osd failsafe full ratio: .95
+
--- /dev/null
+overrides:
+ thrashosds:
+ bdev_inject_crash: 2
+ bdev_inject_crash_probability: .5
+ ceph:
+ fs: xfs
+ conf:
+ osd:
+ osd objectstore: bluestore
+ bluestore block size: 96636764160
+ debug bluestore: 20
+ debug bluefs: 20
+ debug rocksdb: 10
+ bluestore fsck on mount: true
+ # lower the full ratios since we can fill up a 100gb osd so quickly
+ mon osd full ratio: .9
+ mon osd backfillfull_ratio: .85
+ mon osd nearfull ratio: .8
+ osd failsafe full ratio: .95
+# this doesn't work with failures because the log writes are not atomic across the two backends
+# bluestore bluefs env mirror: true
+ ceph-deploy:
+ fs: xfs
+ bluestore: yes
+ conf:
+ osd:
+ osd objectstore: bluestore
+ bluestore block size: 96636764160
+ debug bluestore: 20
+ debug bluefs: 20
+ debug rocksdb: 10
+ bluestore fsck on mount: true
+ # lower the full ratios since we can fill up a 100gb osd so quickly
+ mon osd full ratio: .9
+ mon osd backfillfull_ratio: .85
+ mon osd nearfull ratio: .8
+ osd failsafe full ratio: .95
+
--- /dev/null
+overrides:
+ ceph:
+ fs: xfs
+ conf:
+ osd:
+ osd objectstore: filestore
+ osd sloppy crc: true
+ ceph-deploy:
+ fs: xfs
+ filestore: True
+ conf:
+ osd:
+ osd objectstore: filestore
+ osd sloppy crc: true
+
--- /dev/null
+os_type: centos
+os_version: "7.4"
-../all/centos_7.3.yaml
\ No newline at end of file
+../all/centos_7.4.yaml
\ No newline at end of file
- exec:
osd.0:
- ceph osd require-osd-release luminous
- - ceph osd set-require-min-compat-client luminous
- ceph.healthy:
overrides:
ceph:
conf:
mon:
mon warn on osd down out interval zero: false
+ log-whitelist:
+ - ruleset-
mon warn on osd down out interval zero: false
log-whitelist:
- no active mgr
+ - ruleset-
# the fix for http://tracker.ceph.com/issues/7387. If it turns out
# to not be OK (when is the default encoding *not* UTF-8?), maybe
# the character '黄' can be replaced with the escape $'\xe9\xbb\x84'
- ceph osd pool create 黄 1024 || return 1
+ ceph osd pool create 黄 16 || return 1
ceph osd lspools 2>&1 | \
grep "黄" || return 1
ceph -f json-pretty osd dump | \
--- /dev/null
+#!/usr/bin/env python
+
+from __future__ import print_function
+from subprocess import call
+try:
+ from subprocess import check_output
+except ImportError:
+ def check_output(*popenargs, **kwargs):
+ import subprocess
+ # backported from python 2.7 stdlib
+ process = subprocess.Popen(
+ stdout=subprocess.PIPE, *popenargs, **kwargs)
+ output, unused_err = process.communicate()
+ retcode = process.poll()
+ if retcode:
+ cmd = kwargs.get("args")
+ if cmd is None:
+ cmd = popenargs[0]
+ error = subprocess.CalledProcessError(retcode, cmd)
+ error.output = output
+ raise error
+ return output
+
+import filecmp
+import os
+import subprocess
+import math
+import time
+import sys
+import re
+import logging
+import json
+import tempfile
+import platform
+
+try:
+ from subprocess import DEVNULL
+except ImportError:
+ DEVNULL = open(os.devnull, "wb")
+
+logging.basicConfig(format='%(levelname)s:%(message)s', level=logging.WARNING)
+
+
+if sys.version_info[0] >= 3:
+ def decode(s):
+ return s.decode('utf-8')
+
+ def check_output(*args, **kwargs):
+ return decode(subprocess.check_output(*args, **kwargs))
+else:
+ def decode(s):
+ return s
+
+
+
+def wait_for_health():
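+    # Poll "ceph health" once per second (up to 150 tries) until it
+    # reports nothing other than HEALTH_OK or HEALTH_WARN.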
+ print("Wait for health_ok...", end="")
+ tries = 0
+ while call("{path}/ceph health 2> /dev/null | grep -v 'HEALTH_OK\|HEALTH_WARN' > /dev/null".format(path=CEPH_BIN), shell=True) == 0:
+ tries += 1
+ if tries == 150:
+ raise Exception("Time exceeded to go to health")
+ time.sleep(1)
+ print("DONE")
+
+
+def get_pool_id(name, nullfd):
+ cmd = "{path}/ceph osd pool stats {pool}".format(pool=name, path=CEPH_BIN).split()
+    # output starts with "pool {pool} id <id> ..."; grab the 4th field
+ return check_output(cmd, stderr=nullfd).split()[3]
+
+
+# return a list of unique PGS given an osd subdirectory
+def get_osd_pgs(SUBDIR, ID):
+ PGS = []
+ if ID:
+ endhead = re.compile("{id}.*_head$".format(id=ID))
+ DIR = os.path.join(SUBDIR, "current")
+ PGS += [f for f in os.listdir(DIR) if os.path.isdir(os.path.join(DIR, f)) and (ID is None or endhead.match(f))]
+ PGS = [re.sub("_head", "", p) for p in PGS if "_head" in p]
+ return PGS
+
+
+# return a sorted list of unique PGs given a directory
+def get_pgs(DIR, ID):
+ OSDS = [f for f in os.listdir(DIR) if os.path.isdir(os.path.join(DIR, f)) and f.find("osd") == 0]
+ PGS = []
+ for d in OSDS:
+ SUBDIR = os.path.join(DIR, d)
+ PGS += get_osd_pgs(SUBDIR, ID)
+ return sorted(set(PGS))
+
+
+# return a sorted list of PGS a subset of ALLPGS that contain objects with prefix specified
+def get_objs(ALLPGS, prefix, DIR, ID):
+ OSDS = [f for f in os.listdir(DIR) if os.path.isdir(os.path.join(DIR, f)) and f.find("osd") == 0]
+ PGS = []
+ for d in OSDS:
+ DIRL2 = os.path.join(DIR, d)
+ SUBDIR = os.path.join(DIRL2, "current")
+ for p in ALLPGS:
+ PGDIR = p + "_head"
+ if not os.path.isdir(os.path.join(SUBDIR, PGDIR)):
+ continue
+ FINALDIR = os.path.join(SUBDIR, PGDIR)
+ # See if there are any objects there
+ if any(f for f in [val for _, _, fl in os.walk(FINALDIR) for val in fl] if f.startswith(prefix)):
+ PGS += [p]
+ return sorted(set(PGS))
+
+
+# return a sorted list of OSDS which have data from a given PG
+def get_osds(PG, DIR):
+ ALLOSDS = [f for f in os.listdir(DIR) if os.path.isdir(os.path.join(DIR, f)) and f.find("osd") == 0]
+ OSDS = []
+ for d in ALLOSDS:
+ DIRL2 = os.path.join(DIR, d)
+ SUBDIR = os.path.join(DIRL2, "current")
+ PGDIR = PG + "_head"
+ if not os.path.isdir(os.path.join(SUBDIR, PGDIR)):
+ continue
+ OSDS += [d]
+ return sorted(OSDS)
+
+
+def get_lines(filename):
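+    # Read lines from filename (stopping at the first empty line),
+    # delete the file, and return the lines as a list.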
+ tmpfd = open(filename, "r")
+ line = True
+ lines = []
+ while line:
+ line = tmpfd.readline().rstrip('\n')
+ if line:
+ lines += [line]
+ tmpfd.close()
+ os.unlink(filename)
+ return lines
+
+
+def cat_file(level, filename):
+ if level < logging.getLogger().getEffectiveLevel():
+ return
+ print("File: " + filename)
+ with open(filename, "r") as f:
+ while True:
+ line = f.readline().rstrip('\n')
+ if not line:
+ break
+ print(line)
+ print("<EOF>")
+
+
+def vstart(new, opt=""):
+ print("vstarting....", end="")
+ NEW = new and "-n" or "-N"
+ call("MON=1 OSD=4 MDS=0 MGR=1 CEPH_PORT=7400 {path}/src/vstart.sh --filestore --short -l {new} -d {opt} > /dev/null 2>&1".format(new=NEW, opt=opt, path=CEPH_ROOT), shell=True)
+ print("DONE")
+
+
+def test_failure(cmd, errmsg, tty=False):
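+    # Run cmd, expecting it to fail with errmsg on stderr.  Returns 0 if
+    # the expected failure occurs (or the test is skipped for lack of a
+    # tty), 1 otherwise.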
+ if tty:
+ try:
+            # open the tty read/write so it can serve as stdin and stdout below
+            ttyfd = open("/dev/tty", "r+b")
+ except Exception as e:
+ logging.info(str(e))
+ logging.info("SKIP " + cmd)
+ return 0
+ TMPFILE = r"/tmp/tmp.{pid}".format(pid=os.getpid())
+ tmpfd = open(TMPFILE, "wb")
+
+ logging.debug(cmd)
+ if tty:
+ ret = call(cmd, shell=True, stdin=ttyfd, stdout=ttyfd, stderr=tmpfd)
+ ttyfd.close()
+ else:
+ ret = call(cmd, shell=True, stderr=tmpfd)
+ tmpfd.close()
+ if ret == 0:
+ logging.error(cmd)
+ logging.error("Should have failed, but got exit 0")
+ return 1
+ lines = get_lines(TMPFILE)
+ matched = [ l for l in lines if errmsg in l ]
+ if any(matched):
+ logging.info("Correctly failed with message \"" + matched[0] + "\"")
+ return 0
+ else:
+ logging.error("Command: " + cmd )
+ logging.error("Bad messages to stderr \"" + str(lines) + "\"")
+ logging.error("Expected \"" + errmsg + "\"")
+ return 1
+
+
+def get_nspace(num):
+ if num == 0:
+ return ""
+ return "ns{num}".format(num=num)
+
+
+def verify(DATADIR, POOL, NAME_PREFIX, db):
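+    # Read each head object back via rados (data, xattrs, omap header
+    # and omap values) and compare against what was recorded in db when
+    # the objects were created.  Returns the number of mismatches.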
+ TMPFILE = r"/tmp/tmp.{pid}".format(pid=os.getpid())
+ ERRORS = 0
+ for rawnsfile in [f for f in os.listdir(DATADIR) if f.split('-')[1].find(NAME_PREFIX) == 0]:
+ nsfile = rawnsfile.split("__")[0]
+ clone = rawnsfile.split("__")[1]
+ nspace = nsfile.split("-")[0]
+ file = nsfile.split("-")[1]
+ # Skip clones
+ if clone != "head":
+ continue
+ path = os.path.join(DATADIR, rawnsfile)
+ try:
+ os.unlink(TMPFILE)
+ except:
+ pass
+ cmd = "{path}/rados -p {pool} -N '{nspace}' get {file} {out}".format(pool=POOL, file=file, out=TMPFILE, nspace=nspace, path=CEPH_BIN)
+ logging.debug(cmd)
+ call(cmd, shell=True, stdout=DEVNULL, stderr=DEVNULL)
+ cmd = "diff -q {src} {result}".format(src=path, result=TMPFILE)
+ logging.debug(cmd)
+ ret = call(cmd, shell=True)
+ if ret != 0:
+ logging.error("{file} data not imported properly".format(file=file))
+ ERRORS += 1
+ try:
+ os.unlink(TMPFILE)
+ except:
+ pass
+ for key, val in db[nspace][file]["xattr"].items():
+ cmd = "{path}/rados -p {pool} -N '{nspace}' getxattr {name} {key}".format(pool=POOL, name=file, key=key, nspace=nspace, path=CEPH_BIN)
+ logging.debug(cmd)
+ getval = check_output(cmd, shell=True, stderr=DEVNULL)
+ logging.debug("getxattr {key} {val}".format(key=key, val=getval))
+ if getval != val:
+ logging.error("getxattr of key {key} returned wrong val: {get} instead of {orig}".format(key=key, get=getval, orig=val))
+ ERRORS += 1
+ continue
+ hdr = db[nspace][file].get("omapheader", "")
+ cmd = "{path}/rados -p {pool} -N '{nspace}' getomapheader {name} {file}".format(pool=POOL, name=file, nspace=nspace, file=TMPFILE, path=CEPH_BIN)
+ logging.debug(cmd)
+ ret = call(cmd, shell=True, stderr=DEVNULL)
+ if ret != 0:
+ logging.error("rados getomapheader returned {ret}".format(ret=ret))
+ ERRORS += 1
+ else:
+ getlines = get_lines(TMPFILE)
+ assert(len(getlines) == 0 or len(getlines) == 1)
+ if len(getlines) == 0:
+ gethdr = ""
+ else:
+ gethdr = getlines[0]
+ logging.debug("header: {hdr}".format(hdr=gethdr))
+ if gethdr != hdr:
+ logging.error("getomapheader returned wrong val: {get} instead of {orig}".format(get=gethdr, orig=hdr))
+ ERRORS += 1
+ for key, val in db[nspace][file]["omap"].items():
+ cmd = "{path}/rados -p {pool} -N '{nspace}' getomapval {name} {key} {file}".format(pool=POOL, name=file, key=key, nspace=nspace, file=TMPFILE, path=CEPH_BIN)
+ logging.debug(cmd)
+ ret = call(cmd, shell=True, stderr=DEVNULL)
+ if ret != 0:
+ logging.error("getomapval returned {ret}".format(ret=ret))
+ ERRORS += 1
+ continue
+ getlines = get_lines(TMPFILE)
+ if len(getlines) != 1:
+ logging.error("Bad data from getomapval {lines}".format(lines=getlines))
+ ERRORS += 1
+ continue
+ getval = getlines[0]
+ logging.debug("getomapval {key} {val}".format(key=key, val=getval))
+ if getval != val:
+ logging.error("getomapval returned wrong val: {get} instead of {orig}".format(get=getval, orig=val))
+ ERRORS += 1
+ try:
+ os.unlink(TMPFILE)
+ except:
+ pass
+ return ERRORS
+
+
+def check_journal(jsondict):
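+    # Sanity-check the top-level structure of --op dump-journal JSON output.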
+ errors = 0
+ if 'header' not in jsondict:
+ logging.error("Key 'header' not in dump-journal")
+ errors += 1
+ elif 'max_size' not in jsondict['header']:
+ logging.error("Key 'max_size' not in dump-journal header")
+ errors += 1
+ else:
+ print("\tJournal max_size = {size}".format(size=jsondict['header']['max_size']))
+ if 'entries' not in jsondict:
+ logging.error("Key 'entries' not in dump-journal output")
+ errors += 1
+ elif len(jsondict['entries']) == 0:
+ logging.info("No entries in journal found")
+ else:
+ errors += check_journal_entries(jsondict['entries'])
+ return errors
+
+
+def check_journal_entries(entries):
+ errors = 0
+ for enum in range(len(entries)):
+ if 'offset' not in entries[enum]:
+ logging.error("No 'offset' key in entry {e}".format(e=enum))
+ errors += 1
+ if 'seq' not in entries[enum]:
+ logging.error("No 'seq' key in entry {e}".format(e=enum))
+ errors += 1
+ if 'transactions' not in entries[enum]:
+ logging.error("No 'transactions' key in entry {e}".format(e=enum))
+ errors += 1
+ elif len(entries[enum]['transactions']) == 0:
+ logging.error("No transactions found in entry {e}".format(e=enum))
+ errors += 1
+ else:
+ errors += check_entry_transactions(entries[enum], enum)
+ return errors
+
+
+def check_entry_transactions(entry, enum):
+ errors = 0
+ for tnum in range(len(entry['transactions'])):
+ if 'trans_num' not in entry['transactions'][tnum]:
+ logging.error("Key 'trans_num' missing from entry {e} trans {t}".format(e=enum, t=tnum))
+ errors += 1
+ elif entry['transactions'][tnum]['trans_num'] != tnum:
+ ft = entry['transactions'][tnum]['trans_num']
+ logging.error("Bad trans_num ({ft}) entry {e} trans {t}".format(ft=ft, e=enum, t=tnum))
+ errors += 1
+ if 'ops' not in entry['transactions'][tnum]:
+ logging.error("Key 'ops' missing from entry {e} trans {t}".format(e=enum, t=tnum))
+ errors += 1
+ else:
+ errors += check_transaction_ops(entry['transactions'][tnum]['ops'], enum, tnum)
+ return errors
+
+
+def check_transaction_ops(ops, enum, tnum):
+    if len(ops) == 0:
+ logging.warning("No ops found in entry {e} trans {t}".format(e=enum, t=tnum))
+ errors = 0
+ for onum in range(len(ops)):
+ if 'op_num' not in ops[onum]:
+ logging.error("Key 'op_num' missing from entry {e} trans {t} op {o}".format(e=enum, t=tnum, o=onum))
+ errors += 1
+ elif ops[onum]['op_num'] != onum:
+ fo = ops[onum]['op_num']
+ logging.error("Bad op_num ({fo}) from entry {e} trans {t} op {o}".format(fo=fo, e=enum, t=tnum, o=onum))
+ errors += 1
+ if 'op_name' not in ops[onum]:
+ logging.error("Key 'op_name' missing from entry {e} trans {t} op {o}".format(e=enum, t=tnum, o=onum))
+ errors += 1
+ return errors
+
+
+def test_dump_journal(CFSD_PREFIX, osds):
+ ERRORS = 0
+ pid = os.getpid()
+ TMPFILE = r"/tmp/tmp.{pid}".format(pid=pid)
+
+ for osd in osds:
+ # Test --op dump-journal by loading json
+ cmd = (CFSD_PREFIX + "--op dump-journal --format json").format(osd=osd)
+ logging.debug(cmd)
+ tmpfd = open(TMPFILE, "wb")
+ ret = call(cmd, shell=True, stdout=tmpfd)
+ if ret != 0:
+ logging.error("Bad exit status {ret} from {cmd}".format(ret=ret, cmd=cmd))
+ ERRORS += 1
+ continue
+ tmpfd.close()
+ tmpfd = open(TMPFILE, "r")
+ jsondict = json.load(tmpfd)
+ tmpfd.close()
+ os.unlink(TMPFILE)
+
+ journal_errors = check_journal(jsondict)
+        if journal_errors != 0:
+ logging.error(jsondict)
+ ERRORS += journal_errors
+
+ return ERRORS
+
+CEPH_BUILD_DIR = os.environ.get('CEPH_BUILD_DIR')
+CEPH_BIN = os.environ.get('CEPH_BIN')
+CEPH_ROOT = os.environ.get('CEPH_ROOT')
+
+if not CEPH_BUILD_DIR:
+ CEPH_BUILD_DIR=os.getcwd()
+ os.putenv('CEPH_BUILD_DIR', CEPH_BUILD_DIR)
+ CEPH_BIN=os.path.join(CEPH_BUILD_DIR, 'bin')
+ os.putenv('CEPH_BIN', CEPH_BIN)
+ CEPH_ROOT=os.path.dirname(CEPH_BUILD_DIR)
+ os.putenv('CEPH_ROOT', CEPH_ROOT)
+ CEPH_LIB=os.path.join(CEPH_BUILD_DIR, 'lib')
+ os.putenv('CEPH_LIB', CEPH_LIB)
+
+try:
+ os.mkdir("td")
+except:
+ pass # ok if this is already there
+CEPH_DIR = os.path.join(CEPH_BUILD_DIR, os.path.join("td", "cot_dir"))
+CEPH_CONF = os.path.join(CEPH_DIR, 'ceph.conf')
+
+def kill_daemons():
+ call("{path}/init-ceph -c {conf} stop > /dev/null 2>&1".format(conf=CEPH_CONF, path=CEPH_BIN), shell=True)
+
+
+def check_data(DATADIR, TMPFILE, OSDDIR, SPLIT_NAME):
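+    # Find every on-disk replica of each head object under OSDDIR and
+    # diff it against the source file in DATADIR.
+    # Returns (error count, number of replicas checked).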
+ repcount = 0
+ ERRORS = 0
+ for rawnsfile in [f for f in os.listdir(DATADIR) if f.split('-')[1].find(SPLIT_NAME) == 0]:
+ nsfile = rawnsfile.split("__")[0]
+ clone = rawnsfile.split("__")[1]
+ nspace = nsfile.split("-")[0]
+ file = nsfile.split("-")[1] + "__" + clone
+ # Skip clones
+ if clone != "head":
+ continue
+ path = os.path.join(DATADIR, rawnsfile)
+ tmpfd = open(TMPFILE, "wb")
+ cmd = "find {dir} -name '{file}_*_{nspace}_*'".format(dir=OSDDIR, file=file, nspace=nspace)
+ logging.debug(cmd)
+ ret = call(cmd, shell=True, stdout=tmpfd)
+ if ret:
+ logging.critical("INTERNAL ERROR")
+ return 1
+ tmpfd.close()
+ obj_locs = get_lines(TMPFILE)
+ if len(obj_locs) == 0:
+ logging.error("Can't find imported object {name}".format(name=file))
+ ERRORS += 1
+ for obj_loc in obj_locs:
+ # For btrfs skip snap_* dirs
+ if re.search("/snap_[0-9]*/", obj_loc) is not None:
+ continue
+ repcount += 1
+ cmd = "diff -q {src} {obj_loc}".format(src=path, obj_loc=obj_loc)
+ logging.debug(cmd)
+ ret = call(cmd, shell=True)
+ if ret != 0:
+ logging.error("{file} data not imported properly into {obj}".format(file=file, obj=obj_loc))
+ ERRORS += 1
+ return ERRORS, repcount
+
+
+def set_osd_weight(CFSD_PREFIX, osd_ids, osd_path, weight):
+    # change the weight of the given osds to 'weight' in the newest osdmap
+    # found in the store at osd_path
+ osdmap_file = tempfile.NamedTemporaryFile(delete=True)
+ cmd = (CFSD_PREFIX + "--op get-osdmap --file {osdmap_file}").format(osd=osd_path,
+ osdmap_file=osdmap_file.name)
+ output = check_output(cmd, shell=True)
+    epoch = int(re.findall(r'#(\d+)', output)[0])
+
+ new_crush_file = tempfile.NamedTemporaryFile(delete=True)
+ old_crush_file = tempfile.NamedTemporaryFile(delete=True)
+ ret = call("{path}/osdmaptool --export-crush {crush_file} {osdmap_file}".format(osdmap_file=osdmap_file.name,
+ crush_file=old_crush_file.name, path=CEPH_BIN),
+ stdout=DEVNULL,
+ stderr=DEVNULL,
+ shell=True)
+ assert(ret == 0)
+
+ for osd_id in osd_ids:
+ cmd = "{path}/crushtool -i {crush_file} --reweight-item osd.{osd} {weight} -o {new_crush_file}".format(osd=osd_id,
+ crush_file=old_crush_file.name,
+ weight=weight,
+ new_crush_file=new_crush_file.name, path=CEPH_BIN)
+ ret = call(cmd, stdout=DEVNULL, shell=True)
+ assert(ret == 0)
+ old_crush_file, new_crush_file = new_crush_file, old_crush_file
+
+    # change them back, since we don't need to prepare for another round
+ old_crush_file, new_crush_file = new_crush_file, old_crush_file
+ old_crush_file.close()
+
+ ret = call("{path}/osdmaptool --import-crush {crush_file} {osdmap_file}".format(osdmap_file=osdmap_file.name,
+ crush_file=new_crush_file.name, path=CEPH_BIN),
+ stdout=DEVNULL,
+ stderr=DEVNULL,
+ shell=True)
+ assert(ret == 0)
+
+ # Minimum test of --dry-run by using it, but not checking anything
+ cmd = CFSD_PREFIX + "--op set-osdmap --file {osdmap_file} --epoch {epoch} --force --dry-run"
+ cmd = cmd.format(osd=osd_path, osdmap_file=osdmap_file.name, epoch=epoch)
+ ret = call(cmd, stdout=DEVNULL, shell=True)
+ assert(ret == 0)
+
+ # osdmaptool increases the epoch of the changed osdmap, so we need to force the tool
+    # to use a different epoch than the one in the osdmap
+ cmd = CFSD_PREFIX + "--op set-osdmap --file {osdmap_file} --epoch {epoch} --force"
+ cmd = cmd.format(osd=osd_path, osdmap_file=osdmap_file.name, epoch=epoch)
+ ret = call(cmd, stdout=DEVNULL, shell=True)
+
+ return ret == 0
+
+def get_osd_weights(CFSD_PREFIX, osd_ids, osd_path):
+ osdmap_file = tempfile.NamedTemporaryFile(delete=True)
+ cmd = (CFSD_PREFIX + "--op get-osdmap --file {osdmap_file}").format(osd=osd_path,
+ osdmap_file=osdmap_file.name)
+ ret = call(cmd, stdout=DEVNULL, shell=True)
+ if ret != 0:
+ return None
+    # we have to read the weights from the crush map; we could also query
+    # them with osdmaptool, but keep in mind the two are different:
+    # item weights in the crush map versus the weight associated with each
+    # osd in the osdmap
+ crush_file = tempfile.NamedTemporaryFile(delete=True)
+ ret = call("{path}/osdmaptool --export-crush {crush_file} {osdmap_file}".format(osdmap_file=osdmap_file.name,
+ crush_file=crush_file.name, path=CEPH_BIN),
+ stdout=DEVNULL,
+ shell=True)
+ assert(ret == 0)
+ output = check_output("{path}/crushtool --tree -i {crush_file} | tail -n {num_osd}".format(crush_file=crush_file.name,
+ num_osd=len(osd_ids), path=CEPH_BIN),
+ stderr=DEVNULL,
+ shell=True)
+ weights = []
+ for line in output.strip().split('\n'):
+ print(line)
+        linev = re.split(r'\s+', line)
+        if linev[0] == '':
+ linev.pop(0)
+ print('linev %s' % linev)
+ weights.append(float(linev[2]))
+
+ return weights
+
+
+def test_get_set_osdmap(CFSD_PREFIX, osd_ids, osd_paths):
+ print("Testing get-osdmap and set-osdmap")
+ errors = 0
+ kill_daemons()
+ weight = 1 / math.e # just some magic number in [0, 1]
+ changed = []
+ for osd_path in osd_paths:
+ if set_osd_weight(CFSD_PREFIX, osd_ids, osd_path, weight):
+ changed.append(osd_path)
+ else:
+ logging.warning("Failed to change the weights: {0}".format(osd_path))
+    # it is an error if none of the stores were changed
+ if not changed:
+ errors += 1
+
+ for osd_path in changed:
+ weights = get_osd_weights(CFSD_PREFIX, osd_ids, osd_path)
+ if not weights:
+ errors += 1
+ continue
+ if any(abs(w - weight) > 1e-5 for w in weights):
+ logging.warning("Weight is not changed: {0} != {1}".format(weights, weight))
+ errors += 1
+ return errors
+
+def test_get_set_inc_osdmap(CFSD_PREFIX, osd_path):
+ # incrementals are not used unless we need to build an MOSDMap to update
+ # OSD's peers, so an obvious way to test it is simply overwrite an epoch
+ # with a different copy, and read it back to see if it matches.
+ kill_daemons()
+ file_e2 = tempfile.NamedTemporaryFile(delete=True)
+ cmd = (CFSD_PREFIX + "--op get-inc-osdmap --file {file}").format(osd=osd_path,
+ file=file_e2.name)
+ output = check_output(cmd, shell=True)
+    epoch = int(re.findall(r'#(\d+)', output)[0])
+ # backup e1 incremental before overwriting it
+ epoch -= 1
+ file_e1_backup = tempfile.NamedTemporaryFile(delete=True)
+ cmd = CFSD_PREFIX + "--op get-inc-osdmap --epoch {epoch} --file {file}"
+ ret = call(cmd.format(osd=osd_path, epoch=epoch, file=file_e1_backup.name), shell=True)
+ if ret: return 1
+ # overwrite e1 with e2
+ cmd = CFSD_PREFIX + "--op set-inc-osdmap --force --epoch {epoch} --file {file}"
+ ret = call(cmd.format(osd=osd_path, epoch=epoch, file=file_e2.name), shell=True)
+ if ret: return 1
+ # Use dry-run to set back to e1 which shouldn't happen
+ cmd = CFSD_PREFIX + "--op set-inc-osdmap --dry-run --epoch {epoch} --file {file}"
+ ret = call(cmd.format(osd=osd_path, epoch=epoch, file=file_e1_backup.name), shell=True)
+ if ret: return 1
+ # read from e1
+ file_e1_read = tempfile.NamedTemporaryFile(delete=True)
+ cmd = CFSD_PREFIX + "--op get-inc-osdmap --epoch {epoch} --file {file}"
+ ret = call(cmd.format(osd=osd_path, epoch=epoch, file=file_e1_read.name), shell=True)
+ if ret: return 1
+ errors = 0
+ try:
+ if not filecmp.cmp(file_e2.name, file_e1_read.name, shallow=False):
+ logging.error("{{get,set}}-inc-osdmap mismatch {0} != {1}".format(file_e2.name, file_e1_read.name))
+ errors += 1
+ finally:
+ # revert the change with file_e1_backup
+ cmd = CFSD_PREFIX + "--op set-inc-osdmap --epoch {epoch} --file {file}"
+ ret = call(cmd.format(osd=osd_path, epoch=epoch, file=file_e1_backup.name), shell=True)
+ if ret:
+ logging.error("Failed to revert the changed inc-osdmap")
+ errors += 1
+
+ return errors
+
+
+def test_removeall(CFSD_PREFIX, db, OBJREPPGS, REP_POOL, CEPH_BIN, OSDDIR, REP_NAME, NUM_CLONED_REP_OBJECTS):
+ # Test removeall
+ TMPFILE = r"/tmp/tmp.{pid}".format(pid=os.getpid())
+ nullfd = open(os.devnull, "w")
+ errors=0
+ print("Test removeall")
+ kill_daemons()
+ for nspace in db.keys():
+ for basename in db[nspace].keys():
+ JSON = db[nspace][basename]['json']
+ for pg in OBJREPPGS:
+ OSDS = get_osds(pg, OSDDIR)
+ for osd in OSDS:
+ DIR = os.path.join(OSDDIR, os.path.join(osd, os.path.join("current", "{pg}_head".format(pg=pg))))
+ fnames = [f for f in os.listdir(DIR) if os.path.isfile(os.path.join(DIR, f))
+ and f.split("_")[0] == basename and f.split("_")[4] == nspace]
+ if not fnames:
+ continue
+
+ if int(basename.split(REP_NAME)[1]) <= int(NUM_CLONED_REP_OBJECTS):
+ cmd = (CFSD_PREFIX + "'{json}' remove").format(osd=osd, json=JSON)
+ errors += test_failure(cmd, "Snapshots are present, use removeall to delete everything")
+
+ cmd = (CFSD_PREFIX + " --force --dry-run '{json}' remove").format(osd=osd, json=JSON)
+ logging.debug(cmd)
+ ret = call(cmd, shell=True, stdout=nullfd, stderr=nullfd)
+ if ret != 0:
+ logging.error("remove with --force failed for {json}".format(json=JSON))
+ errors += 1
+
+ cmd = (CFSD_PREFIX + " --dry-run '{json}' removeall").format(osd=osd, json=JSON)
+ logging.debug(cmd)
+ ret = call(cmd, shell=True, stdout=nullfd, stderr=nullfd)
+ if ret != 0:
+ logging.error("removeall failed for {json}".format(json=JSON))
+ errors += 1
+
+ cmd = (CFSD_PREFIX + " '{json}' removeall").format(osd=osd, json=JSON)
+ logging.debug(cmd)
+ ret = call(cmd, shell=True, stdout=nullfd, stderr=nullfd)
+ if ret != 0:
+ logging.error("removeall failed for {json}".format(json=JSON))
+ errors += 1
+
+ tmpfd = open(TMPFILE, "w")
+ cmd = (CFSD_PREFIX + "--op list --pgid {pg} --namespace {ns} {name}").format(osd=osd, pg=pg, ns=nspace, name=basename)
+ logging.debug(cmd)
+ ret = call(cmd, shell=True, stdout=tmpfd)
+ if ret != 0:
+ logging.error("Bad exit status {ret} from {cmd}".format(ret=ret, cmd=cmd))
+ errors += 1
+ tmpfd.close()
+ lines = get_lines(TMPFILE)
+ if len(lines) != 0:
+ logging.error("Removeall didn't remove all objects {ns}/{name} : {lines}".format(ns=nspace, name=basename, lines=lines))
+ errors += 1
+ vstart(new=False)
+ wait_for_health()
+ cmd = "{path}/rados -p {pool} rmsnap snap1".format(pool=REP_POOL, path=CEPH_BIN)
+ logging.debug(cmd)
+ ret = call(cmd, shell=True, stdout=nullfd, stderr=nullfd)
+ if ret != 0:
+ logging.error("rados rmsnap failed")
+ errors += 1
+ time.sleep(2)
+ wait_for_health()
+ return errors
+
+
+def main(argv):
+ if sys.version_info[0] < 3:
+ sys.stdout = stdout = os.fdopen(sys.stdout.fileno(), 'wb', 0)
+ else:
+ stdout = sys.stdout.buffer
+ if len(argv) > 1 and argv[1] == "debug":
+ nullfd = stdout
+ else:
+ nullfd = DEVNULL
+
+ call("rm -fr {dir}; mkdir -p {dir}".format(dir=CEPH_DIR), shell=True)
+ os.chdir(CEPH_DIR)
+ os.environ["CEPH_DIR"] = CEPH_DIR
+ OSDDIR = "dev"
+ REP_POOL = "rep_pool"
+ REP_NAME = "REPobject"
+ EC_POOL = "ec_pool"
+ EC_NAME = "ECobject"
+ if len(argv) > 0 and argv[0] == 'large':
+ PG_COUNT = 12
+ NUM_REP_OBJECTS = 800
+ NUM_CLONED_REP_OBJECTS = 100
+ NUM_EC_OBJECTS = 12
+ NUM_NSPACES = 4
+ # Larger data sets for first object per namespace
+ DATALINECOUNT = 50000
+ # Number of objects to do xattr/omap testing on
+ ATTR_OBJS = 10
+ else:
+ PG_COUNT = 4
+ NUM_REP_OBJECTS = 2
+ NUM_CLONED_REP_OBJECTS = 2
+ NUM_EC_OBJECTS = 2
+ NUM_NSPACES = 2
+ # Larger data sets for first object per namespace
+ DATALINECOUNT = 10
+ # Number of objects to do xattr/omap testing on
+ ATTR_OBJS = 2
+ ERRORS = 0
+ pid = os.getpid()
+ TESTDIR = "/tmp/test.{pid}".format(pid=pid)
+ DATADIR = "/tmp/data.{pid}".format(pid=pid)
+ CFSD_PREFIX = CEPH_BIN + "/ceph-objectstore-tool --data-path " + OSDDIR + "/{osd} "
+ PROFNAME = "testecprofile"
+
+ os.environ['CEPH_CONF'] = CEPH_CONF
+ vstart(new=True)
+ wait_for_health()
+
+ cmd = "{path}/ceph osd pool create {pool} {pg} {pg} replicated".format(pool=REP_POOL, pg=PG_COUNT, path=CEPH_BIN)
+ logging.debug(cmd)
+ call(cmd, shell=True, stdout=nullfd, stderr=nullfd)
+ time.sleep(2)
+ REPID = get_pool_id(REP_POOL, nullfd)
+
+ print("Created Replicated pool #{repid}".format(repid=REPID))
+
+ cmd = "{path}/ceph osd erasure-code-profile set {prof} crush-failure-domain=osd".format(prof=PROFNAME, path=CEPH_BIN)
+ logging.debug(cmd)
+ call(cmd, shell=True, stdout=nullfd, stderr=nullfd)
+ cmd = "{path}/ceph osd erasure-code-profile get {prof}".format(prof=PROFNAME, path=CEPH_BIN)
+ logging.debug(cmd)
+ call(cmd, shell=True, stdout=nullfd, stderr=nullfd)
+ cmd = "{path}/ceph osd pool create {pool} {pg} {pg} erasure {prof}".format(pool=EC_POOL, prof=PROFNAME, pg=PG_COUNT, path=CEPH_BIN)
+ logging.debug(cmd)
+ call(cmd, shell=True, stdout=nullfd, stderr=nullfd)
+ ECID = get_pool_id(EC_POOL, nullfd)
+
+ print("Created Erasure coded pool #{ecid}".format(ecid=ECID))
+
+ print("Creating {objs} objects in replicated pool".format(objs=(NUM_REP_OBJECTS*NUM_NSPACES)))
+ cmd = "mkdir -p {datadir}".format(datadir=DATADIR)
+ logging.debug(cmd)
+ call(cmd, shell=True)
+
+ db = {}
+
+ objects = range(1, NUM_REP_OBJECTS + 1)
+ nspaces = range(NUM_NSPACES)
+ for n in nspaces:
+ nspace = get_nspace(n)
+
+ db[nspace] = {}
+
+ for i in objects:
+ NAME = REP_NAME + "{num}".format(num=i)
+ LNAME = nspace + "-" + NAME
+ DDNAME = os.path.join(DATADIR, LNAME)
+ DDNAME += "__head"
+
+ cmd = "rm -f " + DDNAME
+ logging.debug(cmd)
+ call(cmd, shell=True)
+
+ if i == 1:
+ dataline = range(DATALINECOUNT)
+ else:
+ dataline = range(1)
+ fd = open(DDNAME, "w")
+ data = "This is the replicated data for " + LNAME + "\n"
+ for _ in dataline:
+ fd.write(data)
+ fd.close()
+
+ cmd = "{path}/rados -p {pool} -N '{nspace}' put {name} {ddname}".format(pool=REP_POOL, name=NAME, ddname=DDNAME, nspace=nspace, path=CEPH_BIN)
+ logging.debug(cmd)
+ ret = call(cmd, shell=True, stderr=nullfd)
+ if ret != 0:
+ logging.critical("Rados put command failed with {ret}".format(ret=ret))
+ return 1
+
+ db[nspace][NAME] = {}
+
+ if i < ATTR_OBJS + 1:
+ keys = range(i)
+ else:
+ keys = range(0)
+ db[nspace][NAME]["xattr"] = {}
+ for k in keys:
+ if k == 0:
+ continue
+ mykey = "key{i}-{k}".format(i=i, k=k)
+ myval = "val{i}-{k}".format(i=i, k=k)
+ cmd = "{path}/rados -p {pool} -N '{nspace}' setxattr {name} {key} {val}".format(pool=REP_POOL, name=NAME, key=mykey, val=myval, nspace=nspace, path=CEPH_BIN)
+ logging.debug(cmd)
+ ret = call(cmd, shell=True)
+ if ret != 0:
+ logging.error("setxattr failed with {ret}".format(ret=ret))
+ ERRORS += 1
+ db[nspace][NAME]["xattr"][mykey] = myval
+
+ # Create omap header in all objects but REPobject1
+ if i < ATTR_OBJS + 1 and i != 1:
+ myhdr = "hdr{i}".format(i=i)
+ cmd = "{path}/rados -p {pool} -N '{nspace}' setomapheader {name} {hdr}".format(pool=REP_POOL, name=NAME, hdr=myhdr, nspace=nspace, path=CEPH_BIN)
+ logging.debug(cmd)
+ ret = call(cmd, shell=True)
+ if ret != 0:
+ logging.critical("setomapheader failed with {ret}".format(ret=ret))
+ ERRORS += 1
+ db[nspace][NAME]["omapheader"] = myhdr
+
+ db[nspace][NAME]["omap"] = {}
+ for k in keys:
+ if k == 0:
+ continue
+ mykey = "okey{i}-{k}".format(i=i, k=k)
+ myval = "oval{i}-{k}".format(i=i, k=k)
+ cmd = "{path}/rados -p {pool} -N '{nspace}' setomapval {name} {key} {val}".format(pool=REP_POOL, name=NAME, key=mykey, val=myval, nspace=nspace, path=CEPH_BIN)
+ logging.debug(cmd)
+ ret = call(cmd, shell=True)
+ if ret != 0:
+ logging.critical("setomapval failed with {ret}".format(ret=ret))
+ db[nspace][NAME]["omap"][mykey] = myval
+
+ # Create some clones
+ cmd = "{path}/rados -p {pool} mksnap snap1".format(pool=REP_POOL, path=CEPH_BIN)
+ logging.debug(cmd)
+ call(cmd, shell=True)
+
+ objects = range(1, NUM_CLONED_REP_OBJECTS + 1)
+ nspaces = range(NUM_NSPACES)
+ for n in nspaces:
+ nspace = get_nspace(n)
+
+ for i in objects:
+ NAME = REP_NAME + "{num}".format(num=i)
+ LNAME = nspace + "-" + NAME
+ DDNAME = os.path.join(DATADIR, LNAME)
+ # First clone
+ CLONENAME = DDNAME + "__1"
+ DDNAME += "__head"
+
+ cmd = "mv -f " + DDNAME + " " + CLONENAME
+ logging.debug(cmd)
+ call(cmd, shell=True)
+
+ if i == 1:
+ dataline = range(DATALINECOUNT)
+ else:
+ dataline = range(1)
+ fd = open(DDNAME, "w")
+ data = "This is the replicated data after a snapshot for " + LNAME + "\n"
+ for _ in dataline:
+ fd.write(data)
+ fd.close()
+
+ cmd = "{path}/rados -p {pool} -N '{nspace}' put {name} {ddname}".format(pool=REP_POOL, name=NAME, ddname=DDNAME, nspace=nspace, path=CEPH_BIN)
+ logging.debug(cmd)
+ ret = call(cmd, shell=True, stderr=nullfd)
+ if ret != 0:
+ logging.critical("Rados put command failed with {ret}".format(ret=ret))
+ return 1
+
+ print("Creating {objs} objects in erasure coded pool".format(objs=(NUM_EC_OBJECTS*NUM_NSPACES)))
+
+ objects = range(1, NUM_EC_OBJECTS + 1)
+ nspaces = range(NUM_NSPACES)
+ for n in nspaces:
+ nspace = get_nspace(n)
+
+ for i in objects:
+ NAME = EC_NAME + "{num}".format(num=i)
+ LNAME = nspace + "-" + NAME
+ DDNAME = os.path.join(DATADIR, LNAME)
+ DDNAME += "__head"
+
+ cmd = "rm -f " + DDNAME
+ logging.debug(cmd)
+ call(cmd, shell=True)
+
+ if i == 1:
+ dataline = range(DATALINECOUNT)
+ else:
+ dataline = range(1)
+ fd = open(DDNAME, "w")
+ data = "This is the erasure coded data for " + LNAME + "\n"
+            for _ in dataline:
+ fd.write(data)
+ fd.close()
+
+ cmd = "{path}/rados -p {pool} -N '{nspace}' put {name} {ddname}".format(pool=EC_POOL, name=NAME, ddname=DDNAME, nspace=nspace, path=CEPH_BIN)
+ logging.debug(cmd)
+ ret = call(cmd, shell=True, stderr=nullfd)
+ if ret != 0:
+ logging.critical("Erasure coded pool creation failed with {ret}".format(ret=ret))
+ return 1
+
+ db[nspace][NAME] = {}
+
+ db[nspace][NAME]["xattr"] = {}
+ if i < ATTR_OBJS + 1:
+ keys = range(i)
+ else:
+ keys = range(0)
+ for k in keys:
+ if k == 0:
+ continue
+ mykey = "key{i}-{k}".format(i=i, k=k)
+ myval = "val{i}-{k}".format(i=i, k=k)
+ cmd = "{path}/rados -p {pool} -N '{nspace}' setxattr {name} {key} {val}".format(pool=EC_POOL, name=NAME, key=mykey, val=myval, nspace=nspace, path=CEPH_BIN)
+ logging.debug(cmd)
+ ret = call(cmd, shell=True)
+ if ret != 0:
+ logging.error("setxattr failed with {ret}".format(ret=ret))
+ ERRORS += 1
+ db[nspace][NAME]["xattr"][mykey] = myval
+
+ # Omap isn't supported in EC pools
+ db[nspace][NAME]["omap"] = {}
+
+ logging.debug(db)
+
+ kill_daemons()
+
+ if ERRORS:
+ logging.critical("Unable to set up test")
+ return 1
+
+ ALLREPPGS = get_pgs(OSDDIR, REPID)
+ logging.debug(ALLREPPGS)
+ ALLECPGS = get_pgs(OSDDIR, ECID)
+ logging.debug(ALLECPGS)
+
+ OBJREPPGS = get_objs(ALLREPPGS, REP_NAME, OSDDIR, REPID)
+ logging.debug(OBJREPPGS)
+ OBJECPGS = get_objs(ALLECPGS, EC_NAME, OSDDIR, ECID)
+ logging.debug(OBJECPGS)
+
+ ONEPG = ALLREPPGS[0]
+ logging.debug(ONEPG)
+ osds = get_osds(ONEPG, OSDDIR)
+ ONEOSD = osds[0]
+ logging.debug(ONEOSD)
+
+ print("Test invalid parameters")
+ # On export can't use stdout to a terminal
+ cmd = (CFSD_PREFIX + "--op export --pgid {pg}").format(osd=ONEOSD, pg=ONEPG)
+ ERRORS += test_failure(cmd, "stdout is a tty and no --file filename specified", tty=True)
+
+ # On export can't use stdout to a terminal
+ cmd = (CFSD_PREFIX + "--op export --pgid {pg} --file -").format(osd=ONEOSD, pg=ONEPG)
+ ERRORS += test_failure(cmd, "stdout is a tty and no --file filename specified", tty=True)
+
+ # Prep a valid ec export file for import failure tests
+ ONEECPG = ALLECPGS[0]
+ osds = get_osds(ONEECPG, OSDDIR)
+ ONEECOSD = osds[0]
+ OTHERFILE = "/tmp/foo.{pid}".format(pid=pid)
+ cmd = (CFSD_PREFIX + "--op export --pgid {pg} --file {file}").format(osd=ONEECOSD, pg=ONEECPG, file=OTHERFILE)
+ logging.debug(cmd)
+ call(cmd, shell=True, stdout=nullfd, stderr=nullfd)
+
+ # On import can't specify a different shard
+ BADPG = ONEECPG.split('s')[0] + "s10"
+ cmd = (CFSD_PREFIX + "--op import --pgid {pg} --file {file}").format(osd=ONEECOSD, pg=BADPG, file=OTHERFILE)
+ ERRORS += test_failure(cmd, "Can't specify a different shard, must be")
+
+ os.unlink(OTHERFILE)
+
+ # Prep a valid export file for import failure tests
+ OTHERFILE = "/tmp/foo.{pid}".format(pid=pid)
+ cmd = (CFSD_PREFIX + "--op export --pgid {pg} --file {file}").format(osd=ONEOSD, pg=ONEPG, file=OTHERFILE)
+ logging.debug(cmd)
+ call(cmd, shell=True, stdout=nullfd, stderr=nullfd)
+
+ # On import can't specify a PG with a non-existent pool
+ cmd = (CFSD_PREFIX + "--op import --pgid {pg} --file {file}").format(osd=ONEOSD, pg="10.0", file=OTHERFILE)
+ ERRORS += test_failure(cmd, "Can't specify a different pgid pool, must be")
+
+ # On import can't specify shard for a replicated export
+ cmd = (CFSD_PREFIX + "--op import --pgid {pg}s0 --file {file}").format(osd=ONEOSD, pg=ONEPG, file=OTHERFILE)
+ ERRORS += test_failure(cmd, "Can't specify a sharded pgid with a non-sharded export")
+
+ # On import can't specify a PG with a bad seed
+ TMPPG="{pool}.80".format(pool=REPID)
+ cmd = (CFSD_PREFIX + "--op import --pgid {pg} --file {file}").format(osd=ONEOSD, pg=TMPPG, file=OTHERFILE)
+ ERRORS += test_failure(cmd, "Illegal pgid, the seed is larger than current pg_num")
+
+ os.unlink(OTHERFILE)
+ cmd = (CFSD_PREFIX + "--op import --file {FOO}").format(osd=ONEOSD, FOO=OTHERFILE)
+ ERRORS += test_failure(cmd, "file: {FOO}: No such file or directory".format(FOO=OTHERFILE))
+
+ cmd = "{path}/ceph-objectstore-tool --data-path BAD_DATA_PATH --op list".format(osd=ONEOSD, path=CEPH_BIN)
+ ERRORS += test_failure(cmd, "data-path: BAD_DATA_PATH: No such file or directory")
+
+ cmd = "{path}/ceph-objectstore-tool --journal-path BAD_JOURNAL_PATH --op dump-journal".format(path=CEPH_BIN)
+ ERRORS += test_failure(cmd, "journal-path: BAD_JOURNAL_PATH: (2) No such file or directory")
+
+ cmd = (CFSD_PREFIX + "--journal-path BAD_JOURNAL_PATH --op list").format(osd=ONEOSD)
+ ERRORS += test_failure(cmd, "journal-path: BAD_JOURNAL_PATH: No such file or directory")
+
+ cmd = (CFSD_PREFIX + "--journal-path /bin --op list").format(osd=ONEOSD)
+ ERRORS += test_failure(cmd, "journal-path: /bin: (21) Is a directory")
+
+ # On import can't use stdin from a terminal
+ cmd = (CFSD_PREFIX + "--op import --pgid {pg}").format(osd=ONEOSD, pg=ONEPG)
+ ERRORS += test_failure(cmd, "stdin is a tty and no --file filename specified", tty=True)
+
+ # On import can't use stdin from a terminal
+ cmd = (CFSD_PREFIX + "--op import --pgid {pg} --file -").format(osd=ONEOSD, pg=ONEPG)
+ ERRORS += test_failure(cmd, "stdin is a tty and no --file filename specified", tty=True)
+
+ # Specify a bad --type
+ os.mkdir(OSDDIR + "/fakeosd")
+ cmd = ("{path}/ceph-objectstore-tool --data-path " + OSDDIR + "/{osd} --type foobar --op list --pgid {pg}").format(osd="fakeosd", pg=ONEPG, path=CEPH_BIN)
+ ERRORS += test_failure(cmd, "Unable to create store of type foobar")
+
+ # Don't specify a data-path
+ cmd = "{path}/ceph-objectstore-tool --type memstore --op list --pgid {pg}".format(dir=OSDDIR, osd=ONEOSD, pg=ONEPG, path=CEPH_BIN)
+ ERRORS += test_failure(cmd, "Must provide --data-path")
+
+ cmd = (CFSD_PREFIX + "--op remove --pgid 2.0").format(osd=ONEOSD)
+ ERRORS += test_failure(cmd, "Please use export-remove or you must use --force option")
+
+ cmd = (CFSD_PREFIX + "--force --op remove").format(osd=ONEOSD)
+ ERRORS += test_failure(cmd, "Must provide pgid")
+
+    # Don't specify an --op or object command
+ cmd = CFSD_PREFIX.format(osd=ONEOSD)
+ ERRORS += test_failure(cmd, "Must provide --op or object command...")
+
+ # Specify a bad --op command
+ cmd = (CFSD_PREFIX + "--op oops").format(osd=ONEOSD)
+ ERRORS += test_failure(cmd, "Must provide --op (info, log, remove, mkfs, fsck, export, export-remove, import, list, fix-lost, list-pgs, rm-past-intervals, dump-journal, dump-super, meta-list, get-osdmap, set-osdmap, get-inc-osdmap, set-inc-osdmap, mark-complete)")
+
+ # Provide just the object param not a command
+ cmd = (CFSD_PREFIX + "object").format(osd=ONEOSD)
+ ERRORS += test_failure(cmd, "Invalid syntax, missing command")
+
+ # Provide an object name that doesn't exist
+ cmd = (CFSD_PREFIX + "NON_OBJECT get-bytes").format(osd=ONEOSD)
+ ERRORS += test_failure(cmd, "No object id 'NON_OBJECT' found")
+
+ # Provide an invalid object command
+ cmd = (CFSD_PREFIX + "--pgid {pg} '' notacommand").format(osd=ONEOSD, pg=ONEPG)
+ ERRORS += test_failure(cmd, "Unknown object command 'notacommand'")
+
+ cmd = (CFSD_PREFIX + "foo list-omap").format(osd=ONEOSD, pg=ONEPG)
+ ERRORS += test_failure(cmd, "No object id 'foo' found or invalid JSON specified")
+
+ cmd = (CFSD_PREFIX + "'{{\"oid\":\"obj4\",\"key\":\"\",\"snapid\":-1,\"hash\":2826278768,\"max\":0,\"pool\":1,\"namespace\":\"\"}}' list-omap").format(osd=ONEOSD, pg=ONEPG)
+ ERRORS += test_failure(cmd, "Without --pgid the object '{\"oid\":\"obj4\",\"key\":\"\",\"snapid\":-1,\"hash\":2826278768,\"max\":0,\"pool\":1,\"namespace\":\"\"}' must be a JSON array")
+
+ cmd = (CFSD_PREFIX + "'[]' list-omap").format(osd=ONEOSD, pg=ONEPG)
+ ERRORS += test_failure(cmd, "Object '[]' must be a JSON array with 2 elements")
+
+ cmd = (CFSD_PREFIX + "'[\"1.0\"]' list-omap").format(osd=ONEOSD, pg=ONEPG)
+ ERRORS += test_failure(cmd, "Object '[\"1.0\"]' must be a JSON array with 2 elements")
+
+ cmd = (CFSD_PREFIX + "'[\"1.0\", 5, 8, 9]' list-omap").format(osd=ONEOSD, pg=ONEPG)
+ ERRORS += test_failure(cmd, "Object '[\"1.0\", 5, 8, 9]' must be a JSON array with 2 elements")
+
+ cmd = (CFSD_PREFIX + "'[1, 2]' list-omap").format(osd=ONEOSD, pg=ONEPG)
+ ERRORS += test_failure(cmd, "Object '[1, 2]' must be a JSON array with the first element a string")
+
+ cmd = (CFSD_PREFIX + "'[\"1.3\",{{\"snapid\":\"not an int\"}}]' list-omap").format(osd=ONEOSD, pg=ONEPG)
+ ERRORS += test_failure(cmd, "Decode object JSON error: value type is 2 not 4")
+
+ TMPFILE = r"/tmp/tmp.{pid}".format(pid=pid)
+ ALLPGS = OBJREPPGS + OBJECPGS
+ OSDS = get_osds(ALLPGS[0], OSDDIR)
+ osd = OSDS[0]
+
+ print("Test all --op dump-journal")
+ ALLOSDS = [f for f in os.listdir(OSDDIR) if os.path.isdir(os.path.join(OSDDIR, f)) and f.find("osd") == 0]
+ ERRORS += test_dump_journal(CFSD_PREFIX, ALLOSDS)
+
+ # Test --op list and generate json for all objects
+ print("Test --op list variants")
+
+ # retrieve all objects from all PGs
+ tmpfd = open(TMPFILE, "wb")
+ cmd = (CFSD_PREFIX + "--op list --format json").format(osd=osd)
+ logging.debug(cmd)
+ ret = call(cmd, shell=True, stdout=tmpfd)
+ if ret != 0:
+ logging.error("Bad exit status {ret} from {cmd}".format(ret=ret, cmd=cmd))
+ ERRORS += 1
+ tmpfd.close()
+ lines = get_lines(TMPFILE)
+ JSONOBJ = sorted(set(lines))
+ (pgid, coll, jsondict) = json.loads(JSONOBJ[0])[0]
+
+ # retrieve all objects in a given PG
+ tmpfd = open(OTHERFILE, "ab")
+ cmd = (CFSD_PREFIX + "--op list --pgid {pg} --format json").format(osd=osd, pg=pgid)
+ logging.debug(cmd)
+ ret = call(cmd, shell=True, stdout=tmpfd)
+ if ret != 0:
+ logging.error("Bad exit status {ret} from {cmd}".format(ret=ret, cmd=cmd))
+ ERRORS += 1
+ tmpfd.close()
+ lines = get_lines(OTHERFILE)
+ JSONOBJ = sorted(set(lines))
+ (other_pgid, other_coll, other_jsondict) = json.loads(JSONOBJ[0])[0]
+
+ if pgid != other_pgid or jsondict != other_jsondict or coll != other_coll:
+ logging.error("the first line of --op list is different "
+ "from the first line of --op list --pgid {pg}".format(pg=pgid))
+ ERRORS += 1
+
+ # retrieve all objects with a given name in a given PG
+ tmpfd = open(OTHERFILE, "wb")
+ cmd = (CFSD_PREFIX + "--op list --pgid {pg} {object} --format json").format(osd=osd, pg=pgid, object=jsondict['oid'])
+ logging.debug(cmd)
+ ret = call(cmd, shell=True, stdout=tmpfd)
+ if ret != 0:
+ logging.error("Bad exit status {ret} from {cmd}".format(ret=ret, cmd=cmd))
+ ERRORS += 1
+ tmpfd.close()
+ lines = get_lines(OTHERFILE)
+ JSONOBJ = sorted(set(lines))
+    (other_pgid, other_coll, other_jsondict) = json.loads(JSONOBJ[0])[0]
+
+ if pgid != other_pgid or jsondict != other_jsondict or coll != other_coll:
+ logging.error("the first line of --op list is different "
+ "from the first line of --op list --pgid {pg} {object}".format(pg=pgid, object=jsondict['oid']))
+ ERRORS += 1
+
+ print("Test --op list by generating json for all objects using default format")
+ for pg in ALLPGS:
+ OSDS = get_osds(pg, OSDDIR)
+ for osd in OSDS:
+ tmpfd = open(TMPFILE, "ab")
+ cmd = (CFSD_PREFIX + "--op list --pgid {pg}").format(osd=osd, pg=pg)
+ logging.debug(cmd)
+ ret = call(cmd, shell=True, stdout=tmpfd)
+ if ret != 0:
+ logging.error("Bad exit status {ret} from --op list request".format(ret=ret))
+ ERRORS += 1
+
+ tmpfd.close()
+ lines = get_lines(TMPFILE)
+ JSONOBJ = sorted(set(lines))
+ for JSON in JSONOBJ:
+ (pgid, jsondict) = json.loads(JSON)
+ # Skip clones for now
+ if jsondict['snapid'] != -2:
+ continue
+ db[jsondict['namespace']][jsondict['oid']]['json'] = json.dumps((pgid, jsondict))
+ # print db[jsondict['namespace']][jsondict['oid']]['json']
+ if jsondict['oid'].find(EC_NAME) == 0 and 'shard_id' not in jsondict:
+ logging.error("Malformed JSON {json}".format(json=JSON))
+ ERRORS += 1
+
+ # Test get-bytes
+ print("Test get-bytes and set-bytes")
+ for nspace in db.keys():
+ for basename in db[nspace].keys():
+ file = os.path.join(DATADIR, nspace + "-" + basename + "__head")
+ JSON = db[nspace][basename]['json']
+ GETNAME = "/tmp/getbytes.{pid}".format(pid=pid)
+ TESTNAME = "/tmp/testbytes.{pid}".format(pid=pid)
+ SETNAME = "/tmp/setbytes.{pid}".format(pid=pid)
+ BADNAME = "/tmp/badbytes.{pid}".format(pid=pid)
+ for pg in OBJREPPGS:
+ OSDS = get_osds(pg, OSDDIR)
+ for osd in OSDS:
+ DIR = os.path.join(OSDDIR, os.path.join(osd, os.path.join("current", "{pg}_head".format(pg=pg))))
+ fnames = [f for f in os.listdir(DIR) if os.path.isfile(os.path.join(DIR, f))
+ and f.split("_")[0] == basename and f.split("_")[4] == nspace]
+ if not fnames:
+ continue
+ try:
+ os.unlink(GETNAME)
+ except:
+ pass
+ cmd = (CFSD_PREFIX + " --pgid {pg} '{json}' get-bytes {fname}").format(osd=osd, pg=pg, json=JSON, fname=GETNAME)
+ logging.debug(cmd)
+ ret = call(cmd, shell=True)
+ if ret != 0:
+ logging.error("Bad exit status {ret}".format(ret=ret))
+ ERRORS += 1
+ continue
+ cmd = "diff -q {file} {getfile}".format(file=file, getfile=GETNAME)
+ ret = call(cmd, shell=True)
+ if ret != 0:
+ logging.error("Data from get-bytes differ")
+ logging.debug("Got:")
+ cat_file(logging.DEBUG, GETNAME)
+ logging.debug("Expected:")
+ cat_file(logging.DEBUG, file)
+ ERRORS += 1
+ fd = open(SETNAME, "w")
+ data = "put-bytes going into {file}\n".format(file=file)
+ fd.write(data)
+ fd.close()
+ cmd = (CFSD_PREFIX + "--pgid {pg} '{json}' set-bytes {sname}").format(osd=osd, pg=pg, json=JSON, sname=SETNAME)
+ logging.debug(cmd)
+ ret = call(cmd, shell=True)
+ if ret != 0:
+ logging.error("Bad exit status {ret} from set-bytes".format(ret=ret))
+ ERRORS += 1
+ fd = open(TESTNAME, "wb")
+ cmd = (CFSD_PREFIX + "--pgid {pg} '{json}' get-bytes -").format(osd=osd, pg=pg, json=JSON)
+ logging.debug(cmd)
+ ret = call(cmd, shell=True, stdout=fd)
+ fd.close()
+ if ret != 0:
+ logging.error("Bad exit status {ret} from get-bytes".format(ret=ret))
+ ERRORS += 1
+ cmd = "diff -q {setfile} {testfile}".format(setfile=SETNAME, testfile=TESTNAME)
+ logging.debug(cmd)
+ ret = call(cmd, shell=True)
+ if ret != 0:
+ logging.error("Data after set-bytes differ")
+ logging.debug("Got:")
+ cat_file(logging.DEBUG, TESTNAME)
+ logging.debug("Expected:")
+ cat_file(logging.DEBUG, SETNAME)
+ ERRORS += 1
+
+ # Use set-bytes with --dry-run and make sure contents haven't changed
+ fd = open(BADNAME, "w")
+ data = "Bad data for --dry-run in {file}\n".format(file=file)
+ fd.write(data)
+ fd.close()
+ cmd = (CFSD_PREFIX + "--dry-run --pgid {pg} '{json}' set-bytes {sname}").format(osd=osd, pg=pg, json=JSON, sname=BADNAME)
+ logging.debug(cmd)
+ ret = call(cmd, shell=True, stdout=nullfd, stderr=nullfd)
+ if ret != 0:
+ logging.error("Bad exit status {ret} from set-bytes --dry-run".format(ret=ret))
+ ERRORS += 1
+ fd = open(TESTNAME, "wb")
+ cmd = (CFSD_PREFIX + "--pgid {pg} '{json}' get-bytes -").format(osd=osd, pg=pg, json=JSON)
+ logging.debug(cmd)
+ ret = call(cmd, shell=True, stdout=fd)
+ fd.close()
+ if ret != 0:
+ logging.error("Bad exit status {ret} from get-bytes".format(ret=ret))
+ ERRORS += 1
+ cmd = "diff -q {setfile} {testfile}".format(setfile=SETNAME, testfile=TESTNAME)
+ logging.debug(cmd)
+ ret = call(cmd, shell=True)
+ if ret != 0:
+ logging.error("Data after set-bytes --dry-run changed!")
+ logging.debug("Got:")
+ cat_file(logging.DEBUG, TESTNAME)
+ logging.debug("Expected:")
+ cat_file(logging.DEBUG, SETNAME)
+ ERRORS += 1
+
+ fd = open(file, "rb")
+ cmd = (CFSD_PREFIX + "--pgid {pg} '{json}' set-bytes").format(osd=osd, pg=pg, json=JSON)
+ logging.debug(cmd)
+ ret = call(cmd, shell=True, stdin=fd)
+ if ret != 0:
+ logging.error("Bad exit status {ret} from set-bytes to restore object".format(ret=ret))
+ ERRORS += 1
+ fd.close()
+
+ try:
+ os.unlink(GETNAME)
+ except:
+ pass
+ try:
+ os.unlink(TESTNAME)
+ except:
+ pass
+ try:
+ os.unlink(SETNAME)
+ except:
+ pass
+ try:
+ os.unlink(BADNAME)
+ except:
+ pass
+
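+ # Naming convention exercised below: user xattrs are stored with a leading
+ # "_" (hence attrkey = "_" + key), while internal attrs such as "_"
+ # (object info), "snapset" and "hinfo_key" are unprefixed; the list-attrs
+ # verification later skips those internal keys.
+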
+ # Test get-attr, set-attr, rm-attr, get-omaphdr, set-omaphdr, get-omap, set-omap, rm-omap
+ print("Test get-attr, set-attr, rm-attr, get-omaphdr, set-omaphdr, get-omap, set-omap, rm-omap")
+ for nspace in db.keys():
+ for basename in db[nspace].keys():
+ file = os.path.join(DATADIR, nspace + "-" + basename + "__head")
+ JSON = db[nspace][basename]['json']
+ for pg in OBJREPPGS:
+ OSDS = get_osds(pg, OSDDIR)
+ for osd in OSDS:
+ DIR = os.path.join(OSDDIR, os.path.join(osd, os.path.join("current", "{pg}_head".format(pg=pg))))
+ fnames = [f for f in os.listdir(DIR) if os.path.isfile(os.path.join(DIR, f))
+ and f.split("_")[0] == basename and f.split("_")[4] == nspace]
+ if not fnames:
+ continue
+ for key, val in db[nspace][basename]["xattr"].items():
+ attrkey = "_" + key
+ cmd = (CFSD_PREFIX + " '{json}' get-attr {key}").format(osd=osd, json=JSON, key=attrkey)
+ logging.debug(cmd)
+ getval = check_output(cmd, shell=True)
+ if getval != val:
+ logging.error("get-attr of key {key} returned wrong val: {get} instead of {orig}".format(key=attrkey, get=getval, orig=val))
+ ERRORS += 1
+ continue
+ # set-attr to bogus value "foobar"
+ cmd = ("echo -n foobar | " + CFSD_PREFIX + " --pgid {pg} '{json}' set-attr {key}").format(osd=osd, pg=pg, json=JSON, key=attrkey)
+ logging.debug(cmd)
+ ret = call(cmd, shell=True)
+ if ret != 0:
+ logging.error("Bad exit status {ret} from set-attr".format(ret=ret))
+ ERRORS += 1
+ continue
+ # Test set-attr with dry-run
+ cmd = ("echo -n dryrunbroken | " + CFSD_PREFIX + "--dry-run '{json}' set-attr {key}").format(osd=osd, pg=pg, json=JSON, key=attrkey)
+ logging.debug(cmd)
+ ret = call(cmd, shell=True, stdout=nullfd)
+ if ret != 0:
+ logging.error("Bad exit status {ret} from set-attr".format(ret=ret))
+ ERRORS += 1
+ continue
+ # Check the set-attr
+ cmd = (CFSD_PREFIX + " --pgid {pg} '{json}' get-attr {key}").format(osd=osd, pg=pg, json=JSON, key=attrkey)
+ logging.debug(cmd)
+ try:
+ getval = check_output(cmd, shell=True)
+ except subprocess.CalledProcessError as e:
+ logging.error("Bad exit status {ret} from get-attr".format(ret=e.returncode))
+ ERRORS += 1
+ continue
+ if getval != "foobar":
+ logging.error("Check of set-attr failed because we got {val}".format(val=getval))
+ ERRORS += 1
+ continue
+ # Test rm-attr
+ cmd = (CFSD_PREFIX + "'{json}' rm-attr {key}").format(osd=osd, pg=pg, json=JSON, key=attrkey)
+ logging.debug(cmd)
+ ret = call(cmd, shell=True)
+ if ret != 0:
+ logging.error("Bad exit status {ret} from rm-attr".format(ret=ret))
+ ERRORS += 1
+ continue
+ # Check rm-attr with dry-run
+ cmd = (CFSD_PREFIX + "--dry-run '{json}' rm-attr {key}").format(osd=osd, pg=pg, json=JSON, key=attrkey)
+ logging.debug(cmd)
+ ret = call(cmd, shell=True, stdout=nullfd)
+ if ret != 0:
+ logging.error("Bad exit status {ret} from rm-attr".format(ret=ret))
+ ERRORS += 1
+ continue
+ cmd = (CFSD_PREFIX + "'{json}' get-attr {key}").format(osd=osd, pg=pg, json=JSON, key=attrkey)
+ logging.debug(cmd)
+ ret = call(cmd, shell=True, stderr=nullfd, stdout=nullfd)
+ if ret == 0:
+ logging.error("For rm-attr expect get-attr to fail, but it succeeded")
+ ERRORS += 1
+ # Put back value
+ cmd = ("echo -n {val} | " + CFSD_PREFIX + " --pgid {pg} '{json}' set-attr {key}").format(osd=osd, pg=pg, json=JSON, key=attrkey, val=val)
+ logging.debug(cmd)
+ ret = call(cmd, shell=True)
+ if ret != 0:
+ logging.error("Bad exit status {ret} from set-attr".format(ret=ret))
+ ERRORS += 1
+ continue
+
+ hdr = db[nspace][basename].get("omapheader", "")
+ cmd = (CFSD_PREFIX + "'{json}' get-omaphdr").format(osd=osd, json=JSON)
+ logging.debug(cmd)
+ gethdr = check_output(cmd, shell=True)
+ if gethdr != hdr:
+ logging.error("get-omaphdr was wrong: {get} instead of {orig}".format(get=gethdr, orig=hdr))
+ ERRORS += 1
+ continue
+ # set-omaphdr to bogus value "foobar"
+ cmd = ("echo -n foobar | " + CFSD_PREFIX + "'{json}' set-omaphdr").format(osd=osd, pg=pg, json=JSON)
+ logging.debug(cmd)
+ ret = call(cmd, shell=True)
+ if ret != 0:
+ logging.error("Bad exit status {ret} from set-omaphdr".format(ret=ret))
+ ERRORS += 1
+ continue
+ # Check the set-omaphdr
+ cmd = (CFSD_PREFIX + "'{json}' get-omaphdr").format(osd=osd, pg=pg, json=JSON)
+ logging.debug(cmd)
+ try:
+ gethdr = check_output(cmd, shell=True)
+ except subprocess.CalledProcessError as e:
+ logging.error("Bad exit status {ret} from get-omaphdr".format(ret=e.returncode))
+ ERRORS += 1
+ continue
+ if gethdr != "foobar":
+ logging.error("Check of set-omaphdr failed because we got {val}".format(val=getval))
+ ERRORS += 1
+ continue
+ # Test dry-run with set-omaphdr
+ cmd = ("echo -n dryrunbroken | " + CFSD_PREFIX + "--dry-run '{json}' set-omaphdr").format(osd=osd, pg=pg, json=JSON)
+ logging.debug(cmd)
+ ret = call(cmd, shell=True, stdout=nullfd)
+ if ret != 0:
+ logging.error("Bad exit status {ret} from set-omaphdr".format(ret=ret))
+ ERRORS += 1
+ continue
+ # Put back value
+ cmd = ("echo -n {val} | " + CFSD_PREFIX + "'{json}' set-omaphdr").format(osd=osd, pg=pg, json=JSON, val=hdr)
+ logging.debug(cmd)
+ ret = call(cmd, shell=True)
+ if ret != 0:
+ logging.error("Bad exit status {ret} from set-omaphdr".format(ret=ret))
+ ERRORS += 1
+ continue
+
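+ # The omap header is a single opaque blob per object, separate from the
+ # omap key/value pairs checked below; both travel over stdin/stdout, hence
+ # the "echo -n ... |" pipelines for the set- variants and check_output()
+ # for the get- variants.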
+ for omapkey, val in db[nspace][basename]["omap"].items():
+ cmd = (CFSD_PREFIX + " '{json}' get-omap {key}").format(osd=osd, json=JSON, key=omapkey)
+ logging.debug(cmd)
+ getval = check_output(cmd, shell=True)
+ if getval != val:
+ logging.error("get-omap of key {key} returned wrong val: {get} instead of {orig}".format(key=omapkey, get=getval, orig=val))
+ ERRORS += 1
+ continue
+ # set-omap to bogus value "foobar"
+ cmd = ("echo -n foobar | " + CFSD_PREFIX + " --pgid {pg} '{json}' set-omap {key}").format(osd=osd, pg=pg, json=JSON, key=omapkey)
+ logging.debug(cmd)
+ ret = call(cmd, shell=True)
+ if ret != 0:
+ logging.error("Bad exit status {ret} from set-omap".format(ret=ret))
+ ERRORS += 1
+ continue
+ # Check set-omap with dry-run
+ cmd = ("echo -n dryrunbroken | " + CFSD_PREFIX + "--dry-run --pgid {pg} '{json}' set-omap {key}").format(osd=osd, pg=pg, json=JSON, key=omapkey)
+ logging.debug(cmd)
+ ret = call(cmd, shell=True, stdout=nullfd)
+ if ret != 0:
+ logging.error("Bad exit status {ret} from set-omap".format(ret=ret))
+ ERRORS += 1
+ continue
+ # Check the set-omap
+ cmd = (CFSD_PREFIX + " --pgid {pg} '{json}' get-omap {key}").format(osd=osd, pg=pg, json=JSON, key=omapkey)
+ logging.debug(cmd)
+ try:
+ getval = check_output(cmd, shell=True)
+ except subprocess.CalledProcessError as e:
+ logging.error("Bad exit status {ret} from get-omap".format(ret=e.returncode))
+ ERRORS += 1
+ continue
+ if getval != "foobar":
+ logging.error("Check of set-omap failed because we got {val}".format(val=getval))
+ ERRORS += 1
+ continue
+ # Test rm-omap
+ cmd = (CFSD_PREFIX + "'{json}' rm-omap {key}").format(osd=osd, pg=pg, json=JSON, key=omapkey)
+ logging.debug(cmd)
+ ret = call(cmd, shell=True)
+ if ret != 0:
+ logging.error("Bad exit status {ret} from rm-omap".format(ret=ret))
+ ERRORS += 1
+ # Check rm-omap with dry-run
+ cmd = (CFSD_PREFIX + "--dry-run '{json}' rm-omap {key}").format(osd=osd, pg=pg, json=JSON, key=omapkey)
+ logging.debug(cmd)
+ ret = call(cmd, shell=True, stdout=nullfd)
+ if ret != 0:
+ logging.error("Bad exit status {ret} from rm-omap".format(ret=ret))
+ ERRORS += 1
+ cmd = (CFSD_PREFIX + "'{json}' get-omap {key}").format(osd=osd, pg=pg, json=JSON, key=omapkey)
+ logging.debug(cmd)
+ ret = call(cmd, shell=True, stderr=nullfd, stdout=nullfd)
+ if ret == 0:
+ logging.error("For rm-omap expect get-omap to fail, but it succeeded")
+ ERRORS += 1
+ # Put back value
+ cmd = ("echo -n {val} | " + CFSD_PREFIX + " --pgid {pg} '{json}' set-omap {key}").format(osd=osd, pg=pg, json=JSON, key=omapkey, val=val)
+ logging.debug(cmd)
+ ret = call(cmd, shell=True)
+ if ret != 0:
+ logging.error("Bad exit status {ret} from set-omap".format(ret=ret))
+ ERRORS += 1
+ continue
+
+ # Test dump
+ print("Test dump")
+ for nspace in db.keys():
+ for basename in db[nspace].keys():
+ file = os.path.join(DATADIR, nspace + "-" + basename + "__head")
+ JSON = db[nspace][basename]['json']
+ GETNAME = "/tmp/getbytes.{pid}".format(pid=pid)
+ for pg in OBJREPPGS:
+ OSDS = get_osds(pg, OSDDIR)
+ for osd in OSDS:
+ DIR = os.path.join(OSDDIR, os.path.join(osd, os.path.join("current", "{pg}_head".format(pg=pg))))
+ fnames = [f for f in os.listdir(DIR) if os.path.isfile(os.path.join(DIR, f))
+ and f.split("_")[0] == basename and f.split("_")[4] == nspace]
+ if not fnames:
+ continue
+ if int(basename.split(REP_NAME)[1]) > int(NUM_CLONED_REP_OBJECTS):
+ continue
+ cmd = (CFSD_PREFIX + " '{json}' dump | grep '\"snap\": 1,' > /dev/null").format(osd=osd, json=JSON)
+ logging.debug(cmd)
+ ret = call(cmd, shell=True)
+ if ret != 0:
+ logging.error("Invalid dump for {json}".format(json=JSON))
+ ERRORS += 1
+
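+ # The dump test above is only a minimal sanity check: "dump" emits the
+ # object's metadata (including snapshot information), and grepping for
+ # '"snap": 1,' merely confirms that clone metadata survived; it does not
+ # validate the full dump format.
+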
+ print("Test list-attrs get-attr")
+ ATTRFILE = r"/tmp/attrs.{pid}".format(pid=pid)
+ VALFILE = r"/tmp/val.{pid}".format(pid=pid)
+ for nspace in db.keys():
+ for basename in db[nspace].keys():
+ file = os.path.join(DATADIR, nspace + "-" + basename)
+ JSON = db[nspace][basename]['json']
+ jsondict = json.loads(JSON)
+
+ if 'shard_id' in jsondict:
+ logging.debug("ECobject " + JSON)
+ found = 0
+ for pg in OBJECPGS:
+ OSDS = get_osds(pg, OSDDIR)
+ # Fix shard_id since we only have one json instance for each object
+ jsondict['shard_id'] = int(pg.split('s')[1])
+ JSON = json.dumps(jsondict)
+ for osd in OSDS:
+ cmd = (CFSD_PREFIX + "--pgid {pg} '{json}' get-attr hinfo_key").format(osd=osd, pg=pg, json=JSON)
+ logging.debug("TRY: " + cmd)
+ try:
+ out = check_output(cmd, shell=True, stderr=subprocess.STDOUT)
+ logging.debug("FOUND: {json} in {osd} has value '{val}'".format(osd=osd, json=JSON, val=out))
+ found += 1
+ except subprocess.CalledProcessError as e:
+ if "No such file or directory" not in e.output and "No data available" not in e.output:
+ raise
+ # Assuming k=2 m=1 for the default ec pool
+ if found != 3:
+ logging.error("{json} hinfo_key found {found} times instead of 3".format(json=JSON, found=found))
+ ERRORS += 1
+
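+ # For erasure coded objects the pgid carries the shard (e.g. "2.0s1") and
+ # the object spec must name the matching shard_id, which is why the loops
+ # above and below patch jsondict['shard_id'] from the 's' suffix before
+ # re-serializing the spec with json.dumps().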
+ for pg in ALLPGS:
+ # Make sure rep obj with rep pg or ec obj with ec pg
+ if ('shard_id' in jsondict) != (pg.find('s') > 0):
+ continue
+ if 'shard_id' in jsondict:
+ # Fix shard_id since we only have one json instance for each object
+ jsondict['shard_id'] = int(pg.split('s')[1])
+ JSON = json.dumps(jsondict)
+ OSDS = get_osds(pg, OSDDIR)
+ for osd in OSDS:
+ DIR = os.path.join(OSDDIR, os.path.join(osd, os.path.join("current", "{pg}_head".format(pg=pg))))
+ fnames = [f for f in os.listdir(DIR) if os.path.isfile(os.path.join(DIR, f))
+ and f.split("_")[0] == basename and f.split("_")[4] == nspace]
+ if not fnames:
+ continue
+ afd = open(ATTRFILE, "wb")
+ cmd = (CFSD_PREFIX + "--pgid {pg} '{json}' list-attrs").format(osd=osd, pg=pg, json=JSON)
+ logging.debug(cmd)
+ ret = call(cmd, shell=True, stdout=afd)
+ afd.close()
+ if ret != 0:
+ logging.error("list-attrs failed with {ret}".format(ret=ret))
+ ERRORS += 1
+ continue
+ keys = get_lines(ATTRFILE)
+ values = dict(db[nspace][basename]["xattr"])
+ for key in keys:
+ if key == "_" or key == "snapset" or key == "hinfo_key":
+ continue
+ key = key.strip("_")
+ if key not in values:
+ logging.error("Unexpected key {key} present".format(key=key))
+ ERRORS += 1
+ continue
+ exp = values.pop(key)
+ vfd = open(VALFILE, "wb")
+ cmd = (CFSD_PREFIX + "--pgid {pg} '{json}' get-attr {key}").format(osd=osd, pg=pg, json=JSON, key="_" + key)
+ logging.debug(cmd)
+ ret = call(cmd, shell=True, stdout=vfd)
+ vfd.close()
+ if ret != 0:
+ logging.error("get-attr failed with {ret}".format(ret=ret))
+ ERRORS += 1
+ continue
+ lines = get_lines(VALFILE)
+ val = lines[0]
+ if exp != val:
+ logging.error("For key {key} got value {got} instead of {expected}".format(key=key, got=val, expected=exp))
+ ERRORS += 1
+ if len(values) != 0:
+ logging.error("Not all keys found, remaining keys:")
+ print(values)
+
+ print("Test --op meta-list")
+ tmpfd = open(TMPFILE, "wb")
+ cmd = (CFSD_PREFIX + "--op meta-list").format(osd=ONEOSD)
+ logging.debug(cmd)
+ ret = call(cmd, shell=True, stdout=tmpfd)
+ if ret != 0:
+ logging.error("Bad exit status {ret} from --op meta-list request".format(ret=ret))
+ ERRORS += 1
+
+ print("Test get-bytes on meta")
+ tmpfd.close()
+ lines = get_lines(TMPFILE)
+ JSONOBJ = sorted(set(lines))
+ for JSON in JSONOBJ:
+ (pgid, jsondict) = json.loads(JSON)
+ if pgid != "meta":
+ logging.error("pgid incorrect for --op meta-list {pgid}".format(pgid=pgid))
+ ERRORS += 1
+ if jsondict['namespace'] != "":
+ logging.error("namespace non null --op meta-list {ns}".format(ns=jsondict['namespace']))
+ ERRORS += 1
+ logging.info(JSON)
+ try:
+ os.unlink(GETNAME)
+ except:
+ pass
+ cmd = (CFSD_PREFIX + "'{json}' get-bytes {fname}").format(osd=ONEOSD, json=JSON, fname=GETNAME)
+ logging.debug(cmd)
+ ret = call(cmd, shell=True)
+ if ret != 0:
+ logging.error("Bad exit status {ret}".format(ret=ret))
+ ERRORS += 1
+
+ try:
+ os.unlink(GETNAME)
+ except:
+ pass
+ try:
+ os.unlink(TESTNAME)
+ except:
+ pass
+
+ print("Test pg info")
+ for pg in ALLREPPGS + ALLECPGS:
+ for osd in get_osds(pg, OSDDIR):
+ cmd = (CFSD_PREFIX + "--op info --pgid {pg} | grep '\"pgid\": \"{pg}\"'").format(osd=osd, pg=pg)
+ logging.debug(cmd)
+ ret = call(cmd, shell=True, stdout=nullfd)
+ if ret != 0:
+ logging.error("Getting info failed for pg {pg} from {osd} with {ret}".format(pg=pg, osd=osd, ret=ret))
+ ERRORS += 1
+
+ print("Test pg logging")
+ if len(ALLREPPGS + ALLECPGS) == len(OBJREPPGS + OBJECPGS):
+ logging.warning("All PGs have objects, so no log without modify entries")
+ for pg in ALLREPPGS + ALLECPGS:
+ for osd in get_osds(pg, OSDDIR):
+ tmpfd = open(TMPFILE, "wb")
+ cmd = (CFSD_PREFIX + "--op log --pgid {pg}").format(osd=osd, pg=pg)
+ logging.debug(cmd)
+ ret = call(cmd, shell=True, stdout=tmpfd)
+ if ret != 0:
+ logging.error("Getting log failed for pg {pg} from {osd} with {ret}".format(pg=pg, osd=osd, ret=ret))
+ ERRORS += 1
+ HASOBJ = pg in OBJREPPGS + OBJECPGS
+ MODOBJ = False
+ for line in get_lines(TMPFILE):
+ if line.find("modify") != -1:
+ MODOBJ = True
+ break
+ if HASOBJ != MODOBJ:
+ logging.error("Bad log for pg {pg} from {osd}".format(pg=pg, osd=osd))
+ MSG = "" if HASOBJ else "NOT "
+ print("Log should {msg}have a modify entry".format(msg=MSG))
+ ERRORS += 1
+
+ try:
+ os.unlink(TMPFILE)
+ except:
+ pass
+
+ print("Test list-pgs")
+ for osd in [f for f in os.listdir(OSDDIR) if os.path.isdir(os.path.join(OSDDIR, f)) and f.find("osd") == 0]:
+
+ CHECK_PGS = get_osd_pgs(os.path.join(OSDDIR, osd), None)
+ CHECK_PGS = sorted(CHECK_PGS)
+
+ cmd = (CFSD_PREFIX + "--op list-pgs").format(osd=osd)
+ logging.debug(cmd)
+ TEST_PGS = check_output(cmd, shell=True).split("\n")
+ TEST_PGS = sorted(TEST_PGS)[1:] # Skip extra blank line
+
+ if TEST_PGS != CHECK_PGS:
+ logging.error("list-pgs got wrong result for osd.{osd}".format(osd=osd))
+ logging.error("Expected {pgs}".format(pgs=CHECK_PGS))
+ logging.error("Got {pgs}".format(pgs=TEST_PGS))
+ ERRORS += 1
+
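+ # The export tests below exercise three equivalent output paths: shell
+ # redirection of stdout, "--file -", and "--file <path>". With --dry-run
+ # the tool should create no output file and write no data to stdout,
+ # which is what the next two checks verify.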
+ EXP_ERRORS = 0
+ print("Test pg export --dry-run")
+ pg = ALLREPPGS[0]
+ osd = get_osds(pg, OSDDIR)[0]
+ fname = "/tmp/fname.{pid}".format(pid=pid)
+ cmd = (CFSD_PREFIX + "--dry-run --op export --pgid {pg} --file {file}").format(osd=osd, pg=pg, file=fname)
+ logging.debug(cmd)
+ ret = call(cmd, shell=True, stdout=nullfd, stderr=nullfd)
+ if ret != 0:
+ logging.error("Exporting --dry-run failed for pg {pg} on {osd} with {ret}".format(pg=pg, osd=osd, ret=ret))
+ EXP_ERRORS += 1
+ elif os.path.exists(fname):
+ logging.error("Exporting --dry-run created file")
+ EXP_ERRORS += 1
+
+ cmd = (CFSD_PREFIX + "--dry-run --op export --pgid {pg} > {file}").format(osd=osd, pg=pg, file=fname)
+ logging.debug(cmd)
+ ret = call(cmd, shell=True, stdout=nullfd, stderr=nullfd)
+ if ret != 0:
+ logging.error("Exporting --dry-run failed for pg {pg} on {osd} with {ret}".format(pg=pg, osd=osd, ret=ret))
+ EXP_ERRORS += 1
+ else:
+ outdata = get_lines(fname)
+ if len(outdata) > 0:
+ logging.error("Exporting --dry-run to stdout not empty")
+ logging.error("Data: " + outdata)
+ EXP_ERRORS += 1
+
+ os.mkdir(TESTDIR)
+ for osd in [f for f in os.listdir(OSDDIR) if os.path.isdir(os.path.join(OSDDIR, f)) and f.find("osd") == 0]:
+ os.mkdir(os.path.join(TESTDIR, osd))
+ print("Test pg export")
+ for pg in ALLREPPGS + ALLECPGS:
+ for osd in get_osds(pg, OSDDIR):
+ mydir = os.path.join(TESTDIR, osd)
+ fname = os.path.join(mydir, pg)
+ if pg == ALLREPPGS[0]:
+ cmd = (CFSD_PREFIX + "--op export --pgid {pg} > {file}").format(osd=osd, pg=pg, file=fname)
+ elif pg == ALLREPPGS[1]:
+ cmd = (CFSD_PREFIX + "--op export --pgid {pg} --file - > {file}").format(osd=osd, pg=pg, file=fname)
+ else:
+ cmd = (CFSD_PREFIX + "--op export --pgid {pg} --file {file}").format(osd=osd, pg=pg, file=fname)
+ logging.debug(cmd)
+ ret = call(cmd, shell=True, stdout=nullfd, stderr=nullfd)
+ if ret != 0:
+ logging.error("Exporting failed for pg {pg} on {osd} with {ret}".format(pg=pg, osd=osd, ret=ret))
+ EXP_ERRORS += 1
+
+ ERRORS += EXP_ERRORS
+
+ print("Test pg removal")
+ RM_ERRORS = 0
+ for pg in ALLREPPGS + ALLECPGS:
+ for osd in get_osds(pg, OSDDIR):
+ # This should do nothing
+ cmd = (CFSD_PREFIX + "--op remove --pgid {pg} --dry-run").format(pg=pg, osd=osd)
+ logging.debug(cmd)
+ ret = call(cmd, shell=True, stdout=nullfd)
+ if ret != 0:
+ logging.error("Removing --dry-run failed for pg {pg} on {osd} with {ret}".format(pg=pg, osd=osd, ret=ret))
+ RM_ERRORS += 1
+ cmd = (CFSD_PREFIX + "--force --op remove --pgid {pg}").format(pg=pg, osd=osd)
+ logging.debug(cmd)
+ ret = call(cmd, shell=True, stdout=nullfd)
+ if ret != 0:
+ logging.error("Removing failed for pg {pg} on {osd} with {ret}".format(pg=pg, osd=osd, ret=ret))
+ RM_ERRORS += 1
+
+ ERRORS += RM_ERRORS
+
+ IMP_ERRORS = 0
+ if EXP_ERRORS == 0 and RM_ERRORS == 0:
+ print("Test pg import")
+ for osd in [f for f in os.listdir(OSDDIR) if os.path.isdir(os.path.join(OSDDIR, f)) and f.find("osd") == 0]:
+ dir = os.path.join(TESTDIR, osd)
+ PGS = [f for f in os.listdir(dir) if os.path.isfile(os.path.join(dir, f))]
+ for pg in PGS:
+ file = os.path.join(dir, pg)
+ # This should do nothing
+ cmd = (CFSD_PREFIX + "--op import --file {file} --dry-run").format(osd=osd, file=file)
+ logging.debug(cmd)
+ ret = call(cmd, shell=True, stdout=nullfd)
+ if ret != 0:
+ logging.error("Import failed from {file} with {ret}".format(file=file, ret=ret))
+ IMP_ERRORS += 1
+ if pg == PGS[0]:
+ cmd = ("cat {file} |".format(file=file) + CFSD_PREFIX + "--op import").format(osd=osd)
+ elif pg == PGS[1]:
+ cmd = (CFSD_PREFIX + "--op import --file - --pgid {pg} < {file}").format(osd=osd, file=file, pg=pg)
+ else:
+ cmd = (CFSD_PREFIX + "--op import --file {file}").format(osd=osd, file=file)
+ logging.debug(cmd)
+ ret = call(cmd, shell=True, stdout=nullfd)
+ if ret != 0:
+ logging.error("Import failed from {file} with {ret}".format(file=file, ret=ret))
+ IMP_ERRORS += 1
+ else:
+ logging.warning("SKIPPING IMPORT TESTS DUE TO PREVIOUS FAILURES")
+
+ ERRORS += IMP_ERRORS
+
+ if EXP_ERRORS == 0 and RM_ERRORS == 0 and IMP_ERRORS == 0:
+ print("Verify replicated import data")
+ data_errors, _ = check_data(DATADIR, TMPFILE, OSDDIR, REP_NAME)
+ ERRORS += data_errors
+ else:
+ logging.warning("SKIPPING CHECKING IMPORT DATA DUE TO PREVIOUS FAILURES")
+
+ print("Test all --op dump-journal again")
+ ALLOSDS = [f for f in os.listdir(OSDDIR) if os.path.isdir(os.path.join(OSDDIR, f)) and f.find("osd") == 0]
+ ERRORS += test_dump_journal(CFSD_PREFIX, ALLOSDS)
+
+ vstart(new=False)
+ wait_for_health()
+
+ if EXP_ERRORS == 0 and RM_ERRORS == 0 and IMP_ERRORS == 0:
+ print("Verify erasure coded import data")
+ ERRORS += verify(DATADIR, EC_POOL, EC_NAME, db)
+ # Check replicated data/xattr/omap using rados
+ print("Verify replicated import data using rados")
+ ERRORS += verify(DATADIR, REP_POOL, REP_NAME, db)
+
+ if EXP_ERRORS == 0:
+ NEWPOOL = "rados-import-pool"
+ cmd = "{path}/rados mkpool {pool}".format(pool=NEWPOOL, path=CEPH_BIN)
+ logging.debug(cmd)
+ ret = call(cmd, shell=True, stdout=nullfd, stderr=nullfd)
+
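+ # Three passes follow: a --dry-run that must leave the pool empty, a real
+ # import, and a --no-overwrite import over the now-populated pool; all
+ # three are expected to exit 0.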
+ print("Test rados import")
+ first = True
+ for osd in [f for f in os.listdir(OSDDIR) if os.path.isdir(os.path.join(OSDDIR, f)) and f.find("osd") == 0]:
+ dir = os.path.join(TESTDIR, osd)
+ for pg in [f for f in os.listdir(dir) if os.path.isfile(os.path.join(dir, f))]:
+ if pg.find("{id}.".format(id=REPID)) != 0:
+ continue
+ file = os.path.join(dir, pg)
+ if first:
+ first = False
+ # This should do nothing
+ cmd = "{path}/rados import -p {pool} --dry-run {file}".format(pool=NEWPOOL, file=file, path=CEPH_BIN)
+ logging.debug(cmd)
+ ret = call(cmd, shell=True, stdout=nullfd)
+ if ret != 0:
+ logging.error("Rados import --dry-run failed from {file} with {ret}".format(file=file, ret=ret))
+ ERRORS += 1
+ cmd = "{path}/rados -p {pool} ls".format(pool=NEWPOOL, path=CEPH_BIN)
+ logging.debug(cmd)
+ data = check_output(cmd, shell=True)
+ if data:
+ logging.error("'{data}'".format(data=data))
+ logging.error("Found objects after dry-run")
+ ERRORS += 1
+ cmd = "{path}/rados import -p {pool} {file}".format(pool=NEWPOOL, file=file, path=CEPH_BIN)
+ logging.debug(cmd)
+ ret = call(cmd, shell=True, stdout=nullfd)
+ if ret != 0:
+ logging.error("Rados import failed from {file} with {ret}".format(file=file, ret=ret))
+ ERRORS += 1
+ cmd = "{path}/rados import -p {pool} --no-overwrite {file}".format(pool=NEWPOOL, file=file, path=CEPH_BIN)
+ logging.debug(cmd)
+ ret = call(cmd, shell=True, stdout=nullfd)
+ if ret != 0:
+ logging.error("Rados import --no-overwrite failed from {file} with {ret}".format(file=file, ret=ret))
+ ERRORS += 1
+
+ ERRORS += verify(DATADIR, NEWPOOL, REP_NAME, db)
+ else:
+ logging.warning("SKIPPING IMPORT-RADOS TESTS DUE TO PREVIOUS FAILURES")
+
+ # Clear directories of previous portion
+ call("/bin/rm -rf {dir}".format(dir=TESTDIR), shell=True)
+ call("/bin/rm -rf {dir}".format(dir=DATADIR), shell=True)
+ os.mkdir(TESTDIR)
+ os.mkdir(DATADIR)
+
+ # Cause SPLIT_POOL to split and test import with object/log filtering
+ print("Testing import all objects after a split")
+ SPLIT_POOL = "split_pool"
+ PG_COUNT = 1
+ SPLIT_OBJ_COUNT = 5
+ SPLIT_NSPACE_COUNT = 2
+ SPLIT_NAME = "split"
+ cmd = "{path}/ceph osd pool create {pool} {pg} {pg} replicated".format(pool=SPLIT_POOL, pg=PG_COUNT, path=CEPH_BIN)
+ logging.debug(cmd)
+ call(cmd, shell=True, stdout=nullfd, stderr=nullfd)
+ SPLITID = get_pool_id(SPLIT_POOL, nullfd)
+ pool_size = int(check_output("{path}/ceph osd pool get {pool} size".format(pool=SPLIT_POOL, path=CEPH_BIN), shell=True, stderr=nullfd).split(" ")[1])
+ EXP_ERRORS = 0
+ RM_ERRORS = 0
+ IMP_ERRORS = 0
+
+ objects = range(1, SPLIT_OBJ_COUNT + 1)
+ nspaces = range(SPLIT_NSPACE_COUNT)
+ for n in nspaces:
+ nspace = get_nspace(n)
+
+ for i in objects:
+ NAME = SPLIT_NAME + "{num}".format(num=i)
+ LNAME = nspace + "-" + NAME
+ DDNAME = os.path.join(DATADIR, LNAME)
+ DDNAME += "__head"
+
+ cmd = "rm -f " + DDNAME
+ logging.debug(cmd)
+ call(cmd, shell=True)
+
+ if i == 1:
+ dataline = range(DATALINECOUNT)
+ else:
+ dataline = range(1)
+ fd = open(DDNAME, "w")
+ data = "This is the split data for " + LNAME + "\n"
+ for _ in dataline:
+ fd.write(data)
+ fd.close()
+
+ cmd = "{path}/rados -p {pool} -N '{nspace}' put {name} {ddname}".format(pool=SPLIT_POOL, name=NAME, ddname=DDNAME, nspace=nspace, path=CEPH_BIN)
+ logging.debug(cmd)
+ ret = call(cmd, shell=True, stderr=nullfd)
+ if ret != 0:
+ logging.critical("Rados put command failed with {ret}".format(ret=ret))
+ return 1
+
+ wait_for_health()
+ kill_daemons()
+
+ for osd in [f for f in os.listdir(OSDDIR) if os.path.isdir(os.path.join(OSDDIR, f)) and f.find("osd") == 0]:
+ os.mkdir(os.path.join(TESTDIR, osd))
+
+ pg = "{pool}.0".format(pool=SPLITID)
+ EXPORT_PG = pg
+
+ export_osds = get_osds(pg, OSDDIR)
+ for osd in export_osds:
+ mydir = os.path.join(TESTDIR, osd)
+ fname = os.path.join(mydir, pg)
+ cmd = (CFSD_PREFIX + "--op export --pgid {pg} --file {file}").format(osd=osd, pg=pg, file=fname)
+ logging.debug(cmd)
+ ret = call(cmd, shell=True, stdout=nullfd, stderr=nullfd)
+ if ret != 0:
+ logging.error("Exporting failed for pg {pg} on {osd} with {ret}".format(pg=pg, osd=osd, ret=ret))
+ EXP_ERRORS += 1
+
+ ERRORS += EXP_ERRORS
+
+ if EXP_ERRORS == 0:
+ vstart(new=False)
+ wait_for_health()
+
+ cmd = "{path}/ceph osd pool set {pool} pg_num 2".format(pool=SPLIT_POOL, path=CEPH_BIN)
+ logging.debug(cmd)
+ ret = call(cmd, shell=True, stdout=nullfd, stderr=nullfd)
+ time.sleep(5)
+ wait_for_health()
+
+ kill_daemons()
+
+ # Now 2 PGs, poolid.0 and poolid.1
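+ # Importing the whole pre-split export into each child pg relies on the
+ # tool filtering objects and log entries by the target --pgid (the
+ # "object/log filtering" noted above), so each child ends up with only
+ # the objects that map to it after the split.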
+ for seed in range(2):
+ pg = "{pool}.{seed}".format(pool=SPLITID, seed=seed)
+
+ which = 0
+ for osd in get_osds(pg, OSDDIR):
+ cmd = (CFSD_PREFIX + "--force --op remove --pgid {pg}").format(pg=pg, osd=osd)
+ logging.debug(cmd)
+ ret = call(cmd, shell=True, stdout=nullfd)
+
+ # The export files all come from EXPORT_PG as it existed before the
+ # split, one copy per OSD that held it then. Use 'which' to cycle
+ # through all of the export copies during import.
+ mydir = os.path.join(TESTDIR, export_osds[which])
+ fname = os.path.join(mydir, EXPORT_PG)
+ which += 1
+ cmd = (CFSD_PREFIX + "--op import --pgid {pg} --file {file}").format(osd=osd, pg=pg, file=fname)
+ logging.debug(cmd)
+ ret = call(cmd, shell=True, stdout=nullfd)
+ if ret != 0:
+ logging.error("Import failed from {file} with {ret}".format(file=file, ret=ret))
+ IMP_ERRORS += 1
+
+ ERRORS += IMP_ERRORS
+
+ # Start up again to make sure imports didn't corrupt anything
+ if IMP_ERRORS == 0:
+ print("Verify split import data")
+ data_errors, count = check_data(DATADIR, TMPFILE, OSDDIR, SPLIT_NAME)
+ ERRORS += data_errors
+ if count != (SPLIT_OBJ_COUNT * SPLIT_NSPACE_COUNT * pool_size):
+ logging.error("Incorrect number of replicas seen {count}".format(count=count))
+ ERRORS += 1
+ vstart(new=False)
+ wait_for_health()
+
+ call("/bin/rm -rf {dir}".format(dir=TESTDIR), shell=True)
+ call("/bin/rm -rf {dir}".format(dir=DATADIR), shell=True)
+
+ ERRORS += test_removeall(CFSD_PREFIX, db, OBJREPPGS, REP_POOL, CEPH_BIN, OSDDIR, REP_NAME, NUM_CLONED_REP_OBJECTS)
+
+ # vstart() starts 4 OSDs
+ ERRORS += test_get_set_osdmap(CFSD_PREFIX, list(range(4)), ALLOSDS)
+ ERRORS += test_get_set_inc_osdmap(CFSD_PREFIX, ALLOSDS[0])
+
+ kill_daemons()
+ CORES = [f for f in os.listdir(CEPH_DIR) if f.startswith("core.")]
+ if CORES:
+ CORE_DIR = os.path.join("/tmp", "cores.{pid}".format(pid=os.getpid()))
+ os.mkdir(CORE_DIR)
+ call("/bin/mv {ceph_dir}/core.* {core_dir}".format(ceph_dir=CEPH_DIR, core_dir=CORE_DIR), shell=True)
+ logging.error("Failure due to cores found")
+ logging.error("See {core_dir} for cores".format(core_dir=CORE_DIR))
+ ERRORS += len(CORES)
+
+ if ERRORS == 0:
+ print("TEST PASSED")
+ return 0
+ else:
+ print("TEST FAILED WITH {errcount} ERRORS".format(errcount=ERRORS))
+ return 1
+
+
+def remove_btrfs_subvolumes(path):
+ if platform.system() == "FreeBSD":
+ return
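+ # 'stat -f -c %T <path>' prints the filesystem type name, which the
+ # btrfs check below keys off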
+ result = subprocess.Popen("stat -f -c '%%T' %s" % path, shell=True, stdout=subprocess.PIPE)
+ for line in result.stdout:
+ filesystem = decode(line).rstrip('\n')
+ if filesystem == "btrfs":
+ result = subprocess.Popen("sudo btrfs subvolume list %s" % path, shell=True, stdout=subprocess.PIPE)
+ for line in result.stdout:
+ subvolume = decode(line).split()[8]
+ # extracting the relative volume name
+ m = re.search(".*(%s.*)" % path, subvolume)
+ if m:
+ found = m.group(1)
+ call("sudo btrfs subvolume delete %s" % found, shell=True)
+
+
+if __name__ == "__main__":
+ status = 1
+ try:
+ status = main(sys.argv[1:])
+ finally:
+ kill_daemons()
+ os.chdir(CEPH_BUILD_DIR)
+ remove_btrfs_subvolumes(CEPH_DIR)
+ call("/bin/rm -fr {dir}".format(dir=CEPH_DIR), shell=True)
+ sys.exit(status)
+++ /dev/null
-os_type: centos
-os_version: "7.3"
--- /dev/null
+../../../../../distros/supported/centos_latest.yaml
\ No newline at end of file
+++ /dev/null
-os_type: ubuntu
-os_version: "16.04"
--- /dev/null
+../../../../../distros/supported/ubuntu_latest.yaml
\ No newline at end of file
--- /dev/null
+meta:
+- desc: "Build the ceph cluster using ceph-ansible"
+
+overrides:
+ ceph_ansible:
+ vars:
+ ceph_conf_overrides:
+ global:
+ osd pool default size: 2
+ mon pg warn min per osd: 2
+ osd pool default pg num: 64
+ osd pool default pgp num: 64
+ mon_max_pg_per_osd: 1024
+ ceph_test: true
+ ceph_stable_release: luminous
+ osd_scenario: collocated
+ journal_size: 1024
+ osd_auto_discovery: false
+ ceph_origin: repository
+ ceph_repository: dev
+ ceph_mgr_modules:
+ - status
+ - restful
+ cephfs_pools:
+ - name: "cephfs_data"
+ pgs: "64"
+ - name: "cephfs_metadata"
+ pgs: "64"
+tasks:
+- ssh-keys:
+- ceph_ansible:
+- install.ship_utilities:
+++ /dev/null
-meta:
-- desc: "Build the ceph cluster using ceph-ansible"
-
-overrides:
- ceph_ansible:
- vars:
- ceph_conf_overrides:
- global:
- osd default pool size: 2
- mon pg warn min per osd: 2
- ceph_dev: true
- ceph_dev_key: https://download.ceph.com/keys/autobuild.asc
- ceph_origin: upstream
- ceph_test: true
- journal_collocation: true
- journal_size: 1024
- osd_auto_discovery: false
-
-tasks:
-- ssh-keys:
-- ceph_ansible:
-- install.ship_utilities:
--- /dev/null
+meta:
+- desc: "use bluestore + dmcrypt option"
+
+overrides:
+ ceph_ansible:
+ vars:
+ osd_objectstore: bluestore
+ dmcrypt: True
--- /dev/null
+meta:
+- desc: "without dmcrypt"
+
+overrides:
+ ceph_ansible:
+ vars:
+ dmcrypt: False
--- /dev/null
+meta:
+- desc: "use dmcrypt option"
+
+overrides:
+ ceph_ansible:
+ vars:
+ dmcrypt: True
+++ /dev/null
-meta:
-- desc: "Run ceph-admin-commands.sh"
-tasks:
-- workunit:
- clients:
- client.0:
- - ceph-tests/ceph-admin-commands.sh
+++ /dev/null
-meta:
-- desc: "Run the rados cls tests"
-tasks:
-- workunit:
- clients:
- client.0:
- - cls
+++ /dev/null
-meta:
-- desc: "Run the rbd import/export tests"
-tasks:
-- workunit:
- clients:
- client.0:
- - rbd/import_export.sh
--- /dev/null
+meta:
+- desc: "Run ceph-admin-commands.sh"
+tasks:
+- workunit:
+ clients:
+ client.0:
+ - ceph-tests/ceph-admin-commands.sh
--- /dev/null
+meta:
+- desc: "Run the rbd import/export tests"
+tasks:
+- workunit:
+ clients:
+ client.0:
+ - rbd/import_export.sh
--- /dev/null
+tasks:
+- exec:
+ mgr.x:
+ - systemctl stop ceph-mgr.target
+ - sleep 5
+ - ceph -s
+- exec:
+ mon.a:
+ - ceph restful create-key admin
+ - ceph restful create-self-signed-cert
+ - ceph restful restart
+- workunit:
+ clients:
+ client.0:
+ - rest/test-restful.sh
+++ /dev/null
-../../../objectstore_cephfs
\ No newline at end of file
--- /dev/null
+../../../cephfs/objectstore-ec
\ No newline at end of file
roles:
-- [mon.a, mgr.x, osd.0, mds.a, mds.b, client.1, client.2, client.3]
-- [client.0, osd.1, osd.2]
+- [mon.a, mgr.x, osd.0, osd.1, osd.2, osd.3, mds.a, mds.b, client.1, client.2, client.3]
+- [client.0, osd.4, osd.5, osd.6, osd.7]
openstack:
- volumes: # attached to each instance
count: 2
--- /dev/null
+../../../../cephfs/objectstore-ec/bluestore-ec-root.yaml
\ No newline at end of file
+++ /dev/null
-../../../objectstore_cephfs
\ No newline at end of file
--- /dev/null
+../../../cephfs/objectstore-ec/
\ No newline at end of file
roles:
-- [mon.a, mon.b, mon.c, mgr.x, mds.a, osd.0, osd.1, osd.2]
+- [mon.a, mon.b, mon.c, mgr.x, mds.a, osd.0, osd.1, osd.2, osd.3]
- [client.2]
- [client.1]
- [client.0]
roles:
-- [mon.a, mon.b, mon.c, mgr.x, mds.a, osd.0, osd.1, osd.2]
+- [mon.a, mon.b, mon.c, mgr.x, mds.a, osd.0, osd.1, osd.2, osd.3]
- [client.1]
- [client.0]
+++ /dev/null
-../../../objectstore_cephfs
\ No newline at end of file
--- /dev/null
+../../../cephfs/objectstore-ec/
\ No newline at end of file
roles:
-- [mon.a, mgr.x, osd.0, mon.b, mds.a, mds.b, client.1]
-- [mds.c, mds.d, mon.c, client.0, osd.1, osd.2]
+- [mon.a, mgr.x, osd.0, osd.1, osd.2, osd.3, mon.b, mds.a, mds.b, client.1]
+- [mds.c, mds.d, mon.c, client.0, osd.4, osd.5, osd.6, osd.7]
openstack:
- volumes: # attached to each instance
count: 2
+++ /dev/null
-../../../objectstore_cephfs
\ No newline at end of file
--- /dev/null
+../../../cephfs/objectstore-ec/
\ No newline at end of file
+++ /dev/null
-../../../objectstore_cephfs
\ No newline at end of file
--- /dev/null
+../../../cephfs/objectstore-ec/
\ No newline at end of file
+++ /dev/null
-../../../objectstore_cephfs
\ No newline at end of file
--- /dev/null
+../../../cephfs/objectstore-ec/
\ No newline at end of file
+++ /dev/null
-../../../objectstore_cephfs
\ No newline at end of file
--- /dev/null
+../../../cephfs/objectstore-ec/
\ No newline at end of file
+++ /dev/null
-../../../objectstore_cephfs
\ No newline at end of file
--- /dev/null
+../../../cephfs/objectstore-ec/
\ No newline at end of file
+++ /dev/null
-../../../objectstore_cephfs
\ No newline at end of file
--- /dev/null
+../../../cephfs/objectstore-ec/
\ No newline at end of file
+++ /dev/null
-../../../objectstore_cephfs
\ No newline at end of file
--- /dev/null
+../../../cephfs/objectstore-ec
\ No newline at end of file
+++ /dev/null
-../../../objectstore_cephfs
\ No newline at end of file
--- /dev/null
+../../../cephfs/objectstore-ec
\ No newline at end of file
roles:
-- [mon.a, osd.0, mds.a, mds.c, client.2]
-- [mgr.x, osd.1, osd.2, mds.b, mds.d, client.3]
+- [mon.a, osd.0, osd.1, osd.2, osd.3, mds.a, mds.c, client.2]
+- [mgr.x, osd.4, osd.5, osd.6, osd.7, mds.b, mds.d, client.3]
- [client.0]
- [client.1]
openstack:
+++ /dev/null
-../../../objectstore_cephfs
\ No newline at end of file
--- /dev/null
+../../../cephfs/objectstore-ec
\ No newline at end of file
+++ /dev/null
-../../../objectstore_cephfs
\ No newline at end of file
--- /dev/null
+../../../cephfs/objectstore-ec
\ No newline at end of file
+++ /dev/null
-../../../objectstore_cephfs
\ No newline at end of file
--- /dev/null
+../../../cephfs/objectstore-ec
\ No newline at end of file
+++ /dev/null
-../../../objectstore_cephfs
\ No newline at end of file
--- /dev/null
+../../../cephfs/objectstore-ec
\ No newline at end of file
+++ /dev/null
-../../../objectstore_cephfs
\ No newline at end of file
--- /dev/null
+../../../cephfs/objectstore-ec
\ No newline at end of file
+++ /dev/null
-../thrash/d-require-luminous/
\ No newline at end of file
--- /dev/null
+# do not require luminous osds at mkfs time; only set flag at
+# the end of the test run, then do a final scrub (to convert any
+# legacy snapsets), and verify we are healthy.
+tasks:
+- full_sequential_finally:
+ - exec:
+ mon.a:
+ - ceph osd require-osd-release luminous
+ - ceph osd pool application enable base rados || true
+# make sure osds have latest map
+ - rados -p rbd bench 5 write -b 4096
+ - ceph.healthy:
+ - ceph.osd_scrub_pgs:
+ cluster: ceph
+ - exec:
+ mon.a:
+ - sleep 15
+ - ceph osd dump | grep purged_snapdirs
+ - ceph pg dump -f json-pretty
+ - "ceph pg dump sum -f json-pretty | grep num_legacy_snapsets | head -1 | grep ': 0'"
+overrides:
+ ceph:
+ conf:
+ global:
+ mon debug no require luminous: true
+
+# setting luminous triggers peering, which *might* trigger health alerts
+ log-whitelist:
+ - overall HEALTH_
+ - \(PG_AVAILABILITY\)
+ - \(PG_DEGRADED\)
+ thrashosds:
+ chance_thrash_cluster_full: 0
roles:
- [mgr.x, mon.a, mon.c, mds.a, mds.c, osd.0, client.0]
-- [mgr.y, mon.b, mds.b, osd.1, osd.2, client.1]
+- [mgr.y, mgr.z, mon.b, mds.b, osd.1, osd.2, client.1]
log-rotate:
ceph-mds: 10G
ceph-osd: 10G
--- /dev/null
+
+tasks:
+ - install:
+ - ceph:
+ # tests may leave mgrs broken, so don't try to call into them
+ # (e.g. to invoke pg dump) during teardown.
+ wait-for-scrub: false
+ log-whitelist:
+ - overall HEALTH_
+ - \(MGR_DOWN\)
+ - \(PG_
+ - replacing it with standby
+ - No standby daemons available
+ - cephfs_test_runner:
+ modules:
+ - tasks.mgr.test_dashboard
--- /dev/null
+
+tasks:
+ - install:
+ - ceph:
+ # tests may leave mgrs broken, so don't try to call into them
+ # (e.g. to invoke pg dump) during teardown.
+ wait-for-scrub: false
+ log-whitelist:
+ - overall HEALTH_
+ - \(MGR_DOWN\)
+ - \(PG_
+ - replacing it with standby
+ - No standby daemons available
+ - Reduced data availability
+ - Degraded data redundancy
+ - objects misplaced
+ - cephfs_test_runner:
+ modules:
+ - tasks.mgr.test_module_selftest
--- /dev/null
+tasks:
+ - install:
+ - ceph:
+ # tests may leave mgrs broken, so don't try to call into them
+ # (e.g. to invoke pg dump) during teardown.
+ wait-for-scrub: false
+ log-whitelist:
+ - overall HEALTH_
+ - \(MGR_DOWN\)
+ - \(PG_
+ - replacing it with standby
+ - No standby daemons available
+ - workunit:
+ clients:
+ client.0:
+ - mgr
\ No newline at end of file
-../thrash/d-require-luminous/
\ No newline at end of file
+../basic/d-require-luminous
\ No newline at end of file
log-whitelist:
- overall HEALTH_
- \(MGR_DOWN\)
+ - \(PG_
+ - \(OSD_
+ - \(OBJECT_
- exec:
mon.a:
- ceph restful create-key admin
--- /dev/null
+roles:
+- - mon.a
+ - mgr.x
+ - mds.a
+ - osd.0
+ - osd.1
+- - mon.b
+ - mon.c
+ - osd.2
+ - osd.3
+ - client.0
+
+openstack:
+- volumes: # attached to each instance
+ count: 2
+ size: 10 # GB
+
+tasks:
+- install:
+- ceph:
+ fs: xfs
+ log-whitelist:
+ - overall HEALTH
+ - \(OSDMAP_FLAGS\)
+ - \(OSD_
+ - \(PG_
+ - \(POOL_
+ - \(CACHE_POOL_
+ - \(SMALLER_PGP_NUM\)
+ - \(OBJECT_
+ - \(REQUEST_SLOW\)
+ - \(SLOW_OPS\)
+ - \(TOO_FEW_PGS\)
+ - but it is still running
+ conf:
+ client.rest0:
+ debug ms: 1
+ debug objecter: 20
+ debug rados: 20
+- rest-api: [client.0]
+- workunit:
+ clients:
+ client.0:
+ - rest/test.py
- (OSDMAP_FLAGS)
- (OSD_FULL)
- (MDS_READ_ONLY)
+ - (POOL_FULL)
tasks:
- install:
- ceph:
--- /dev/null
+roles:
+- - mon.a
+ - mgr.x
+ - osd.0
+ - osd.1
+openstack:
+ - volumes: # attached to each instance
+ count: 2
+ size: 10 # GB
+overrides:
+ ceph:
+ create_rbd_pool: False
+ conf:
+ mon:
+ osd pool default size: 2
+ osd:
+ mon max pg per osd: 2
+ osd max pg per osd hard ratio: 1
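+ # a hard ratio of 1 makes the osd hard limit equal to the mon limit, so
+ # the osd_max_pg_per_osd task below can drive pg counts right up against
+ # 'mon max pg per osd'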
+ log-whitelist:
+ - \(TOO_FEW_PGS\)
+tasks:
+- install:
+- ceph:
+- osd_max_pg_per_osd:
+ test_create_from_mon: True
+ pg_num: 2
--- /dev/null
+roles:
+- - mon.a
+ - mgr.x
+ - osd.0
+ - osd.1
+ - osd.2
+ - osd.3
+openstack:
+ - volumes: # attached to each instance
+ count: 4
+ size: 10 # GB
+overrides:
+ ceph:
+ create_rbd_pool: False
+ conf:
+ mon:
+ osd pool default size: 2
+ osd:
+ mon max pg per osd: 1
+ osd max pg per osd hard ratio: 1
+ log-whitelist:
+ - \(TOO_FEW_PGS\)
+ - \(PG_
+tasks:
+- install:
+- ceph:
+- osd_max_pg_per_osd:
+ test_create_from_mon: False
+ pg_num: 1
+ pool_size: 2
+ from_primary: True
--- /dev/null
+roles:
+- - mon.a
+ - mgr.x
+ - osd.0
+ - osd.1
+ - osd.2
+ - osd.3
+openstack:
+ - volumes: # attached to each instance
+ count: 4
+ size: 10 # GB
+overrides:
+ ceph:
+ create_rbd_pool: False
+ conf:
+ mon:
+ osd pool default size: 2
+ osd:
+ mon max pg per osd: 1
+ osd max pg per osd hard ratio: 1
+ log-whitelist:
+ - \(TOO_FEW_PGS\)
+ - \(PG_
+tasks:
+- install:
+- ceph:
+- osd_max_pg_per_osd:
+ test_create_from_mon: False
+ pg_num: 1
+ pool_size: 2
+ from_primary: False
osd:
debug monc: 1
debug ms: 1
+ log-whitelist:
+ - overall HEALTH
+ - Manager daemon
+ - \(MGR_DOWN\)
- mon_seesaw:
- ceph_manager.create_pool:
kwargs:
--- /dev/null
+roles:
+- - mon.a
+ - mon.b
+ - mon.c
+ - mgr.x
+ - osd.0
+ - osd.1
+ - osd.2
+ - osd.3
+openstack:
+ - volumes: # attached to each instance
+ count: 3
+ size: 20 # GB
+tasks:
+- install:
+- ceph:
+ conf:
+ osd:
+ osd recovery sleep: .1
+ osd min pg log entries: 100
+ osd max pg log entries: 1000
+ log-whitelist:
+ - \(POOL_APP_NOT_ENABLED\)
+ - \(OSDMAP_FLAGS\)
+ - \(OSD_
+ - \(OBJECT_
+ - \(PG_
+ - overall HEALTH
+- exec:
+ osd.0:
+ - ceph osd pool create foo 128
+ - ceph osd pool application enable foo foo
+ - rados -p foo bench 30 write -b 4096 --no-cleanup
+ - ceph osd out 0
+ - sleep 5
+ - ceph osd set noup
+- ceph.restart:
+ daemons: [osd.1]
+ wait-for-up: false
+ wait-for-healthy: false
+- exec:
+ osd.0:
+ - rados -p foo bench 3 write -b 4096 --no-cleanup
+ - ceph osd unset noup
+ - sleep 10
+ - ceph tell osd.* config set osd_recovery_sleep 0
+ - ceph tell osd.* config set osd_recovery_max_active 20
+- ceph.healthy:
+- exec:
+ osd.0:
+ - egrep '(defer backfill|defer recovery)' /var/log/ceph/ceph-osd.*.log
--- /dev/null
+overrides:
+ ceph:
+ conf:
+ mgr:
+ debug osd: 20
+tasks:
+- exec:
+ mon.a:
+ - while ! ceph balancer status ; do sleep 1 ; done
+ - ceph balancer mode crush-compat
+ - ceph balancer on
--- /dev/null
+overrides:
+ ceph:
+ conf:
+ mgr:
+ debug osd: 20
+tasks:
+- exec:
+ mon.a:
+ - while ! ceph balancer status ; do sleep 1 ; done
+ - ceph balancer mode upmap
+ - ceph balancer on
-../thrash/d-require-luminous/
\ No newline at end of file
+../basic/d-require-luminous
\ No newline at end of file
clients:
client.0:
- cls/test_cls_rbd.sh
+ - cls/test_cls_lock.sh
+ - cls/test_cls_journal.sh
+++ /dev/null
-roles:
-- - mon.a
- - mgr.x
- - mds.a
- - osd.0
- - osd.1
-- - mon.b
- - mon.c
- - osd.2
- - osd.3
- - client.0
-
-openstack:
-- volumes: # attached to each instance
- count: 2
- size: 10 # GB
-
-tasks:
-- install:
-- ceph:
- fs: xfs
- log-whitelist:
- - but it is still running
- conf:
- client.rest0:
- debug ms: 1
- debug objecter: 20
- debug rados: 20
-- rest-api: [client.0]
-- workunit:
- clients:
- client.0:
- - rest/test.py
-os_type: centos
-os_version: "7.3"
-machine_type: vps
+machine_type: ovh
+openstack:
+- volumes: # attached to each instance
+ count: 3
+ size: 10 # GB
overrides:
ceph_ansible:
vars:
osd default pool size: 2
osd pool default pg num: 128
osd pool default pgp num: 128
- debug rgw: 20
+ debug rgw: 20
debug ms: 1
ceph_test: true
- ceph_dev: true
- ceph_dev_key: https://download.ceph.com/keys/autobuild.asc
- ceph_origin: upstream
journal_collocation: true
osd_auto_discovery: false
journal_size: 1024
+ ceph_stable_release: luminous
+ osd_scenario: collocated
+ ceph_origin: repository
+ ceph_repository: dev
roles:
- [mon.a, osd.0, osd.1, osd.2, rgw.0]
- [osd.3, osd.4, osd.5]
- rgw: [client.0]
- s3tests:
client.0:
- force-branch: ceph-master
+ force-branch: ceph-luminous
rgw_server: client.0
overrides:
ceph:
tasks:
- s3tests:
client.0:
- force-branch: ceph-master
+ force-branch: ceph-luminous
rgw_server: client.0
overrides:
ceph:
valgrind: [--tool=memcheck]
- s3tests:
client.0:
- force-branch: ceph-master
+ force-branch: ceph-luminous
rgw_server: client.0
overrides:
ceph:
--- /dev/null
+../../../../../distros/supported/centos_latest.yaml
\ No newline at end of file
--- /dev/null
+../../../../../distros/supported/ubuntu_latest.yaml
\ No newline at end of file
--- /dev/null
+meta:
+- desc: |
+ Set up a 4-node ceph cluster using ceph-deploy, with the latest
+ stable jewel as the initial release. Upgrade to luminous, set up
+ mgr nodes after the upgrade, and check that the cluster reaches a
+ healthy state. After the upgrade, run the kernel tar/untar task and
+ the systemd task. This test will detect ceph upgrade issues and
+ systemd issues.
+overrides:
+ ceph-deploy:
+ fs: xfs
+ conf:
+ global:
+ mon pg warn min per osd: 2
+ osd:
+ osd pool default size: 2
+ osd objectstore: filestore
+ osd sloppy crc: true
+ client:
+ rbd default features: 5
+openstack:
+- machine:
+ disk: 100
+- volumes:
+ count: 3
+ size: 30
+# reluctantly :( hard-coded machine type;
+# it will override command line args passed to teuthology-suite
+machine_type: vps
+roles:
+- - mon.a
+ - mds.a
+ - osd.0
+ - osd.1
+ - osd.2
+ - mgr.x
+- - mon.b
+ - mgr.y
+- - mon.c
+ - osd.3
+ - osd.4
+ - osd.5
+- - osd.6
+ - osd.7
+ - osd.8
+ - client.0
+tasks:
+- ssh-keys:
+- print: "**** done ssh-keys"
+- ceph-deploy:
+ branch:
+ stable: jewel
+ skip-mgr: True
+- print: "**** done initial ceph-deploy"
+- ceph-deploy.upgrade:
+ branch:
+ dev: luminous
+ setup-mgr-node: True
+ check-for-healthy: True
+ roles:
+ - mon.a
+ - mon.b
+ - mon.c
+ - osd.6
+- print: "**** done ceph-deploy upgrade"
+- exec:
+ osd.0:
+ - ceph osd require-osd-release luminous
+ - ceph osd set-require-min-compat-client luminous
+- print: "**** done `ceph osd require-osd-release luminous`"
+- workunit:
+ clients:
+ all:
+ - kernel_untar_build.sh
+- print: "**** done kernel_untar_build.sh"
+- systemd:
+- print: "**** done systemd"
+- workunit:
+ clients:
+ all:
+ - rados/load-gen-mix.sh
+- print: "**** done rados/load-gen-mix.sh"
- client.1
- client.2
- client.3
+- - client.4
overrides:
ceph:
log-whitelist:
+overrides:
+ ceph:
+ conf:
+ client.0:
+ debug ms: 1
+ debug client: 10
+ debug monc: 10
+ client.1:
+ debug ms: 1
+ debug client: 10
+ debug monc: 10
+ client.2:
+ debug ms: 1
+ debug client: 10
+ debug monc: 10
+ client.3:
+ debug ms: 1
+ debug client: 10
+ debug monc: 10
meta:
- desc: |
install ceph/jewel latest
workload:
full_sequential:
- sequential:
- - ceph-fuse:
+ - ceph-fuse: [client.2]
- print: "**** done ceph-fuse 2-workload"
- workunit:
clients:
+++ /dev/null
-../../../../releases/luminous.yaml
\ No newline at end of file
--- /dev/null
+# this is the same fragment as ../../../../releases/luminous.yaml
+# but without line "ceph osd set-require-min-compat-client luminous"
+
+tasks:
+- exec:
+ mgr.x:
+ - mkdir -p /var/lib/ceph/mgr/ceph-x
+ - ceph auth get-or-create-key mgr.x mon 'allow profile mgr'
+ - ceph auth export mgr.x > /var/lib/ceph/mgr/ceph-x/keyring
+- ceph.restart:
+ daemons: [mgr.x]
+ wait-for-healthy: false
+- exec:
+ osd.0:
+ - ceph osd require-osd-release luminous
+- ceph.healthy:
+overrides:
+ ceph:
+ conf:
+ mon:
+ mon warn on osd down out interval zero: false
+ log-whitelist:
+ - no active mgr
+++ /dev/null
-meta:
-- desc: |
- run a cephfs stress test
- mount ceph-fuse on client.3 before running workunit
-tasks:
-- sequential:
- - ceph-fuse:
- - print: "**** done ceph-fuse 5-final-workload"
- - workunit:
- clients:
- client.3:
- - suites/blogbench.sh
- - print: "**** done suites/blogbench.sh 5-final-workload"
+++ /dev/null
-meta:
-- desc: |
- randomized correctness test for rados operations on a replicated pool with snapshots
-tasks:
- - rados:
- clients: [client.1]
- ops: 4000
- objects: 50
- write_append_excl: false
- op_weights:
- read: 100
- write: 100
- delete: 50
- snap_create: 50
- snap_remove: 50
- rollback: 50
- - print: "**** done rados 4-final-workload"
+++ /dev/null
-meta:
-- desc: |
- generate read/write load with rados objects ranging from 1 byte to 1MB
-tasks:
- - workunit:
- clients:
- client.1:
- - rados/load-gen-mix.sh
- - print: "**** done rados/load-gen-mix.sh 4-final-workload"
+++ /dev/null
-meta:
-- desc: |
- librados C and C++ api tests
-overrides:
- ceph:
- log-whitelist:
- - reached quota
-tasks:
- - mon_thrash:
- revive_delay: 20
- thrash_delay: 1
- - print: "**** done mon_thrash 4-final-workload"
- - workunit:
- branch: jewel
- clients:
- client.1:
- - rados/test-upgrade-v11.0.0.sh
- - print: "**** done rados/test-upgrade-v11.0.0.sh 4-final-workload"
+++ /dev/null
-meta:
-- desc: |
- rbd object class functional tests
-tasks:
- - workunit:
- clients:
- client.1:
- - cls/test_cls_rbd.sh
- - print: "**** done cls/test_cls_rbd.sh 4-final-workload"
+++ /dev/null
-meta:
-- desc: |
- run basic import/export cli tests for rbd
-tasks:
- - workunit:
- clients:
- client.1:
- - rbd/import_export.sh
- env:
- RBD_CREATE_ARGS: --new-format
- - print: "**** done rbd/import_export.sh 4-final-workload"
+++ /dev/null
-meta:
-- desc: |
- swift api tests for rgw
-overrides:
- rgw:
- frontend: civetweb
-tasks:
- - rgw: [client.1]
- - print: "**** done rgw 4-final-workload"
- - swift:
- client.1:
- rgw_server: client.1
- - print: "**** done swift 4-final-workload"
--- /dev/null
+meta:
+- desc: |
+ run basic rbd import/export cli tests on the non-upgraded client.4
+ (covers issue http://tracker.ceph.com/issues/21660)
+tasks:
+ - workunit:
+ branch: jewel
+ clients:
+ client.4:
+ - rbd/import_export.sh
+ - print: "**** done rbd/import_export.sh 5-workload"
--- /dev/null
+../../../../releases/luminous-with-mgr.yaml
\ No newline at end of file
--- /dev/null
+tasks:
+- exec:
+ mon.a:
+ - ceph osd set-require-min-compat-client jewel
+ - ceph osd crush set-all-straw-buckets-to-straw2
+ - ceph osd crush weight-set create-compat
+ - ceph osd crush weight-set reweight-compat osd.0 .9
+ - ceph osd crush weight-set reweight-compat osd.1 1.2
--- /dev/null
+meta:
+- desc: |
+ run a cephfs stress test
+ mount ceph-fuse on client.3 before running workunit
+tasks:
+- sequential:
+ - ceph-fuse: [client.3]
+ - print: "**** done ceph-fuse 5-final-workload"
+ - workunit:
+ clients:
+ client.3:
+ - suites/blogbench.sh
+ - print: "**** done suites/blogbench.sh 7-final-workload"
--- /dev/null
+meta:
+- desc: |
+ randomized correctness test for rados operations on a replicated pool with snapshots
+tasks:
+ - rados:
+ clients: [client.1]
+ ops: 4000
+ objects: 50
+ write_append_excl: false
+ op_weights:
+ read: 100
+ write: 100
+ delete: 50
+ snap_create: 50
+ snap_remove: 50
+ rollback: 50
+ - print: "**** done rados 7-final-workload"
--- /dev/null
+meta:
+- desc: |
+ generate read/write load with rados objects ranging from 1 byte to 1MB
+tasks:
+ - workunit:
+ clients:
+ client.1:
+ - rados/load-gen-mix.sh
+ - print: "**** done rados/load-gen-mix.sh 7-final-workload"
--- /dev/null
+meta:
+- desc: |
+ librados C and C++ api tests
+overrides:
+ ceph:
+ log-whitelist:
+ - reached quota
+tasks:
+ - mon_thrash:
+ revive_delay: 20
+ thrash_delay: 1
+ - print: "**** done mon_thrash 4-final-workload"
+ - workunit:
+ branch: jewel
+ clients:
+ client.1:
+ - rados/test-upgrade-v11.0.0.sh
+ - print: "**** done rados/test-upgrade-v11.0.0.sh 7-final-workload"
--- /dev/null
+meta:
+- desc: |
+ rbd object class functional tests
+tasks:
+ - workunit:
+ clients:
+ client.1:
+ - cls/test_cls_rbd.sh
+ - print: "**** done cls/test_cls_rbd.sh 7-final-workload"
--- /dev/null
+meta:
+- desc: |
+ run basic import/export cli tests for rbd
+tasks:
+ - workunit:
+ clients:
+ client.1:
+ - rbd/import_export.sh
+ env:
+ RBD_CREATE_ARGS: --new-format
+ - print: "**** done rbd/import_export.sh 7-final-workload"
--- /dev/null
+meta:
+- desc: |
+ swift api tests for rgw
+overrides:
+ rgw:
+ frontend: civetweb
+tasks:
+ - rgw: [client.1]
+ - print: "**** done rgw 7-final-workload"
+ - swift:
+ client.1:
+ rgw_server: client.1
+ - print: "**** done swift 7-final-workload"
--- /dev/null
+5-workload.yaml
\ No newline at end of file
- scrub
- osd_map_max_advance
- wrongly marked
+ - overall HEALTH_
+ - \(MGR_DOWN\)
+ - \(OSD_
+ - \(PG_
+ - \(CACHE_
fs: xfs
conf:
+ global:
+ mon warn on pool no app: false
mon:
mon debug unsafe allow tier with nonempty snaps: true
- mon warn on pool no app: false
osd:
osd map max advance: 1000
+ osd map cache size: 1100
roles:
- - mon.a
- mds.a
branch: jewel
clients:
client.1:
- - rados/test-upgrade-v11.0.0.sh
+ - rados/test-upgrade-v11.0.0-noec.sh
- cls
env:
CLS_RBD_GTEST_FILTER: '*:-TestClsRbd.mirror_image'
branch: jewel
clients:
client.0:
- - rados/test-upgrade-v11.0.0.sh
+ - rados/test-upgrade-v11.0.0-noec.sh
- cls
- print: "**** done rados/test-upgrade-v11.0.0.sh & cls workload_x upgraded client"
- rgw: [client.1]
--- /dev/null
+../parallel/6.5-crush-compat.yaml
\ No newline at end of file
--- /dev/null
+meta:
+- desc: |
+ Set up a 4-node ceph cluster using ceph-deploy, with the latest
+ stable kraken as the initial release. Upgrade to luminous, set up
+ mgr nodes after the upgrade, and check that the cluster reaches a
+ healthy state. After the upgrade, run the kernel tar/untar task and
+ the systemd task. This test will detect ceph upgrade issues and
+ systemd issues.
+overrides:
+ ceph-deploy:
+ fs: xfs
+ conf:
+ global:
+ mon pg warn min per osd: 2
+ osd:
+ osd pool default size: 2
+ osd objectstore: filestore
+ osd sloppy crc: true
+ client:
+ rbd default features: 5
+roles:
+- - mon.a
+ - mds.a
+ - osd.0
+ - osd.1
+ - osd.2
+ - mgr.x
+- - mon.b
+ - mgr.y
+- - mon.c
+ - osd.3
+ - osd.4
+ - osd.5
+- - osd.6
+ - osd.7
+ - osd.8
+ - client.0
+tasks:
+- ssh-keys:
+- ceph-deploy:
+ branch:
+ stable: kraken
+ skip-mgr: True
+- ceph-deploy.upgrade:
+ branch:
+ dev: luminous
+ setup-mgr-node: True
+ check-for-healthy: True
+ roles:
+ - mon.a
+ - mon.b
+ - mon.c
+- workunit:
+ clients:
+ all:
+ - kernel_untar_build.sh
+- systemd:
+- workunit:
+ clients:
+ all:
+ - rados/load-gen-mix.sh
- client.1
- client.2
- client.3
+- - client.4
overrides:
ceph:
log-whitelist:
+++ /dev/null
-../../../../releases/luminous-with-mgr.yaml
\ No newline at end of file
--- /dev/null
+tasks:
+- exec:
+ osd.0:
+ - ceph osd require-osd-release luminous
+++ /dev/null
-meta:
-- desc: |
- run a cephfs stress test
- mount ceph-fuse on client.3 before running workunit
-tasks:
-- sequential:
- - ceph-fuse:
- - print: "**** done ceph-fuse 5-final-workload"
- - workunit:
- clients:
- client.3:
- - suites/blogbench.sh
- - print: "**** done suites/blogbench.sh 5-final-workload"
+++ /dev/null
-meta:
-- desc: |
- randomized correctness test for rados operations on a replicated pool with snapshots
-tasks:
- - rados:
- clients: [client.1]
- ops: 4000
- objects: 50
- write_append_excl: false
- op_weights:
- read: 100
- write: 100
- delete: 50
- snap_create: 50
- snap_remove: 50
- rollback: 50
- - print: "**** done rados 4-final-workload"
+++ /dev/null
-meta:
-- desc: |
- generate read/write load with rados objects ranging from 1 byte to 1MB
-tasks:
- - workunit:
- clients:
- client.1:
- - rados/load-gen-mix.sh
- - print: "**** done rados/load-gen-mix.sh 4-final-workload"
+++ /dev/null
-meta:
-- desc: |
- librados C and C++ api tests
-overrides:
- ceph:
- log-whitelist:
- - reached quota
-tasks:
- - mon_thrash:
- revive_delay: 20
- thrash_delay: 1
- - print: "**** done mon_thrash 4-final-workload"
- - workunit:
- branch: kraken
- clients:
- client.1:
- - rados/test.sh
- - print: "**** done rados/test.sh 4-final-workload"
+++ /dev/null
-meta:
-- desc: |
- rbd object class functional tests
-tasks:
- - workunit:
- clients:
- client.1:
- - cls/test_cls_rbd.sh
- - print: "**** done cls/test_cls_rbd.sh 4-final-workload"
+++ /dev/null
-meta:
-- desc: |
- run basic import/export cli tests for rbd
-tasks:
- - workunit:
- clients:
- client.1:
- - rbd/import_export.sh
- env:
- RBD_CREATE_ARGS: --new-format
- - print: "**** done rbd/import_export.sh 4-final-workload"
+++ /dev/null
-meta:
-- desc: |
- swift api tests for rgw
-overrides:
- rgw:
- frontend: civetweb
-tasks:
- - rgw: [client.1]
- - print: "**** done rgw 4-final-workload"
- - swift:
- client.1:
- rgw_server: client.1
- - print: "**** done swift 4-final-workload"
--- /dev/null
+meta:
+- desc: |
+    run basic import/export cli tests for rbd on the non-upgraded client.4
+ (covers issue http://tracker.ceph.com/issues/21660)
+tasks:
+ - workunit:
+ branch: kraken
+ clients:
+ client.4:
+ - rbd/import_export.sh
+ - print: "**** done rbd/import_export.sh 5-workload"
--- /dev/null
+../../../../releases/luminous-with-mgr.yaml
\ No newline at end of file
--- /dev/null
+meta:
+- desc: |
+ run a cephfs stress test
+ mount ceph-fuse on client.3 before running workunit
+tasks:
+- sequential:
+ - ceph-fuse:
+ - print: "**** done ceph-fuse 5-final-workload"
+ - workunit:
+ clients:
+ client.3:
+ - suites/blogbench.sh
+ - print: "**** done suites/blogbench.sh 5-final-workload"
--- /dev/null
+meta:
+- desc: |
+ randomized correctness test for rados operations on a replicated pool with snapshots
+tasks:
+ - rados:
+ clients: [client.1]
+ ops: 4000
+ objects: 50
+ write_append_excl: false
+ op_weights:
+ read: 100
+ write: 100
+ delete: 50
+ snap_create: 50
+ snap_remove: 50
+ rollback: 50
+ - print: "**** done rados 4-final-workload"
--- /dev/null
+meta:
+- desc: |
+ generate read/write load with rados objects ranging from 1 byte to 1MB
+tasks:
+ - workunit:
+ clients:
+ client.1:
+ - rados/load-gen-mix.sh
+ - print: "**** done rados/load-gen-mix.sh 4-final-workload"
--- /dev/null
+meta:
+- desc: |
+ librados C and C++ api tests
+overrides:
+ ceph:
+ log-whitelist:
+ - reached quota
+tasks:
+ - mon_thrash:
+ revive_delay: 20
+ thrash_delay: 1
+ - print: "**** done mon_thrash 4-final-workload"
+ - workunit:
+ branch: kraken
+ clients:
+ client.1:
+ - rados/test.sh
+ - print: "**** done rados/test.sh 4-final-workload"
--- /dev/null
+meta:
+- desc: |
+ rbd object class functional tests
+tasks:
+ - workunit:
+ clients:
+ client.1:
+ - cls/test_cls_rbd.sh
+ - print: "**** done cls/test_cls_rbd.sh 4-final-workload"
--- /dev/null
+meta:
+- desc: |
+ run basic import/export cli tests for rbd
+tasks:
+ - workunit:
+ clients:
+ client.1:
+ - rbd/import_export.sh
+ env:
+ RBD_CREATE_ARGS: --new-format
+ - print: "**** done rbd/import_export.sh 4-final-workload"
--- /dev/null
+meta:
+- desc: |
+ swift api tests for rgw
+overrides:
+ rgw:
+ frontend: civetweb
+tasks:
+ - rgw: [client.1]
+ - print: "**** done rgw 4-final-workload"
+ - swift:
+ client.1:
+ rgw_server: client.1
+ - print: "**** done swift 4-final-workload"
- client.1
- client.2
- client.3
+- - client.4
overrides:
ceph:
log-whitelist:
+++ /dev/null
-meta:
-- desc: |
- run basic import/export cli tests for rbd
-tasks:
- - workunit:
- clients:
- client.1:
- - rbd/import_export.sh
- env:
- RBD_CREATE_ARGS: --new-format
- - print: "**** done rbd/import_export.sh 4-final-workload"
--- /dev/null
+meta:
+- desc: |
+ run basic import/export cli tests for rbd
+    on the non-upgraded client
+tasks:
+ - workunit:
+ branch: luminous
+ clients:
+ client.4:
+ - rbd/import_export.sh
+ env:
+ RBD_CREATE_ARGS: --new-format
+ - print: "**** done rbd/import_export.sh 4-final-workload on NO upgrated client"
--- /dev/null
+meta:
+- desc: |
+ run basic import/export cli tests for rbd
+    on the upgraded client
+tasks:
+ - workunit:
+ clients:
+ client.1:
+ - rbd/import_export.sh
+ env:
+ RBD_CREATE_ARGS: --new-format
+ - print: "**** done rbd/import_export.sh 4-final-workload on upgrated client"
--- /dev/null
+../stress-split/1-ceph-install/
\ No newline at end of file
osd='allow *',
),
mgr=dict(
- mon='allow *',
+ mon='allow profile mgr',
+ osd='allow *',
+ mds='allow *',
),
mds=dict(
mon='allow *',
remote=mon_remote,
ceph_cluster=cluster_name,
)
- log.info('Creating RBD pool')
- mon_remote.run(
- args=['sudo', 'ceph', '--cluster', cluster_name,
- 'osd', 'pool', 'create', 'rbd', '8'])
- mon_remote.run(
- args=[
- 'sudo', 'ceph', '--cluster', cluster_name,
- 'osd', 'pool', 'application', 'enable',
- 'rbd', 'rbd', '--yes-i-really-mean-it'
- ],
- check_status=False)
+ if config.get('create_rbd_pool', True):
+ log.info('Creating RBD pool')
+ mon_remote.run(
+ args=['sudo', 'ceph', '--cluster', cluster_name,
+ 'osd', 'pool', 'create', 'rbd', '8'])
+ mon_remote.run(
+ args=[
+ 'sudo', 'ceph', '--cluster', cluster_name,
+ 'osd', 'pool', 'application', 'enable',
+ 'rbd', 'rbd', '--yes-i-really-mean-it'
+ ],
+ check_status=False)
yield
@contextlib.contextmanager
if mdss.remotes:
log.info('Setting up CephFS filesystem...')
- fs = Filesystem(ctx, name='cephfs', create=True)
+ fs = Filesystem(ctx, name='cephfs', create=True,
+ ec_profile=config.get('cephfs_ec_profile', None))
is_active_mds = lambda role: 'mds.' in role and not role.endswith('-s') and '-s-' not in role
all_roles = [item for remote_roles in mdss.remotes.values() for item in remote_roles]
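For context, a sketch of how an erasure-coded CephFS data pool would be requested through this new hook (the profile values here are illustrative, not defaults)::

    # hypothetical teuthology fragment: ceph task config carrying an EC profile
    config = {'cephfs_ec_profile': ['k=2', 'm=1', 'crush-failure-domain=osd']}
    fs = Filesystem(ctx, name='cephfs', create=True,
                    ec_profile=config.get('cephfs_ec_profile', None))
    # Filesystem.create() (see the filesystem.py hunk further below) then runs:
    #   osd erasure-code-profile set <data_pool> k=2 m=1 crush-failure-domain=osd
    #   osd pool create <data_pool> <pg_num> erasure <data_pool>
    #   osd pool set <data_pool> allow_ec_overwrites true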
from teuthology.task import install as install_fn
from teuthology.orchestra import run
from tasks.cephfs.filesystem import Filesystem
+from teuthology.misc import wait_until_healthy
log = logging.getLogger(__name__)
    will use that instead. The `bootstrap` script is run, with the argument
obtained from `python_version`, if specified.
"""
- ceph_admin = ctx.cluster.only(teuthology.get_first_mon(ctx, config))
+ # use mon.a for ceph_admin
+ (ceph_admin,) = ctx.cluster.only('mon.a').remotes.iterkeys()
try:
py_ver = str(config['python_version'])
))
log.info("Installing Python")
- for admin in ceph_admin.remotes:
- system_type = teuthology.get_system_type(admin)
+ system_type = teuthology.get_system_type(ceph_admin)
if system_type == 'rpm':
package = 'python34' if py_ver == '3' else 'python'
# Prepare a modified version of cluster.remotes with ceph-deploy-ized names
modified_remotes = {}
-
+ ceph_deploy_mapped = dict()
for _remote, roles_for_host in ctx.cluster.remotes.iteritems():
modified_remotes[_remote] = []
for svc_id in roles_for_host:
nodes_of_interest.append(fqdn)
else:
nodes_of_interest.append(nodename)
-
- modified_remotes[_remote].append(
- "{0}.{1}".format(target_role, nodename))
+ mapped_role = "{0}.{1}".format(target_role, nodename)
+ modified_remotes[_remote].append(mapped_role)
+ # keep dict of mapped role for later use by tasks
+ # eg. mon.a => mon.node1
+ ceph_deploy_mapped[svc_id] = mapped_role
else:
modified_remotes[_remote].append(svc_id)
ctx.cluster.remotes = modified_remotes
+ ctx.cluster.mapped_role = ceph_deploy_mapped
return nodes_of_interest
# Expect to find ceph_admin on the first mon by ID, same place that the download task
# puts it. Remember this here, because subsequently IDs will change from those in
# the test config to those that ceph-deploy invents.
- (ceph_admin,) = ctx.cluster.only(
- teuthology.get_first_mon(ctx, config)).remotes.iterkeys()
+
+ (ceph_admin,) = ctx.cluster.only('mon.a').remotes.iterkeys()
def execute_ceph_deploy(cmd):
"""Remotely execute a ceph_deploy command"""
mds_nodes = " ".join(mds_nodes)
mon_node = get_nodes_using_role(ctx, 'mon')
mon_nodes = " ".join(mon_node)
- mgr_nodes = get_nodes_using_role(ctx, 'mgr')
- mgr_nodes = " ".join(mgr_nodes)
+ # skip mgr based on config item
+ # this is needed when test uses latest code to install old ceph
+ # versions
+ skip_mgr = config.get('skip-mgr', False)
+ if not skip_mgr:
+ mgr_nodes = get_nodes_using_role(ctx, 'mgr')
+ mgr_nodes = " ".join(mgr_nodes)
new_mon = './ceph-deploy new' + " " + mon_nodes
- mgr_create = './ceph-deploy mgr create' + " " + mgr_nodes
+ if not skip_mgr:
+ mgr_create = './ceph-deploy mgr create' + " " + mgr_nodes
mon_hostname = mon_nodes.split(' ')[0]
mon_hostname = str(mon_hostname)
gather_keys = './ceph-deploy gatherkeys' + " " + mon_hostname
estatus_gather = execute_ceph_deploy(gather_keys)
- execute_ceph_deploy(mgr_create)
+ if not skip_mgr:
+ execute_ceph_deploy(mgr_create)
if mds_nodes:
estatus_mds = execute_ceph_deploy(deploy_mds)
# first check for filestore, default is bluestore with ceph-deploy
if config.get('filestore') is not None:
osd_create_cmd += '--filestore '
- else:
+ elif config.get('bluestore') is not None:
osd_create_cmd += '--bluestore '
if config.get('dmcrypt') is not None:
osd_create_cmd += '--dmcrypt '
if mds_nodes:
log.info('Configuring CephFS...')
- ceph_fs = Filesystem(ctx, create=True)
+ Filesystem(ctx, create=True)
elif not config.get('only_mon'):
raise RuntimeError(
"The cluster is NOT operational due to insufficient OSDs")
"""Either use git path or repo path """
args = ['cd', conf_dir, run.Raw(';')]
if path:
- args.append('{path}/ceph-deploy/ceph-deploy'.format(path=path));
+ args.append('{path}/ceph-deploy/ceph-deploy'.format(path=path))
else:
args.append('ceph-deploy')
args.append(run.Raw(cmd))
log.info("Waiting for cluster to become healthy")
with contextutil.safe_while(sleep=10, tries=6,
action='check health') as proceed:
- while proceed():
- r = remote.run(args=['sudo', 'ceph', 'health'], stdout=StringIO())
- out = r.stdout.getvalue()
- if (out.split(None,1)[0] == 'HEALTH_OK'):
- break
+ while proceed():
+ r = remote.run(args=['sudo', 'ceph', 'health'], stdout=StringIO())
+ out = r.stdout.getvalue()
+ if (out.split(None, 1)[0] == 'HEALTH_OK'):
+ break
rgw_install = 'install {branch} --rgw {node}'.format(
branch=test_branch,
node=nodename,
yield
+@contextlib.contextmanager
+def upgrade(ctx, config):
+ """
+ Upgrade using ceph-deploy
+ eg:
+ ceph-deploy.upgrade:
+        # to upgrade to a specific branch, use
+        branch:
+           stable: jewel
+        # to set up the mgr node, use
+        setup-mgr-node: True
+        # to wait for the cluster to be healthy after the whole upgrade, use
+        wait-for-healthy: True
+        roles: (upgrades the below roles serially)
+ mon.a
+ mon.b
+ osd.0
+ """
+ roles = config.get('roles')
+ # get the roles that are mapped as per ceph-deploy
+ # roles are mapped for mon/mds eg: mon.a => mon.host_short_name
+ mapped_role = ctx.cluster.mapped_role
+ if config.get('branch'):
+ branch = config.get('branch')
+ (var, val) = branch.items()[0]
+ ceph_branch = '--{var}={val}'.format(var=var, val=val)
+ else:
+ # default to master
+ ceph_branch = '--dev=master'
+ # get the node used for initial deployment which is mon.a
+ mon_a = mapped_role.get('mon.a')
+ (ceph_admin,) = ctx.cluster.only(mon_a).remotes.iterkeys()
+ testdir = teuthology.get_testdir(ctx)
+ cmd = './ceph-deploy install ' + ceph_branch
+ for role in roles:
+ # check if this role is mapped (mon or mds)
+ if mapped_role.get(role):
+ role = mapped_role.get(role)
+ remotes_and_roles = ctx.cluster.only(role).remotes
+ for remote, roles in remotes_and_roles.iteritems():
+ nodename = remote.shortname
+ cmd = cmd + ' ' + nodename
+ log.info("Upgrading ceph on %s", nodename)
+ ceph_admin.run(
+ args=[
+ 'cd',
+ '{tdir}/ceph-deploy'.format(tdir=testdir),
+ run.Raw('&&'),
+ run.Raw(cmd),
+ ],
+ )
+            # restart all ceph services; ideally the upgrade would do this, but it does not
+ remote.run(
+ args=[
+ 'sudo', 'systemctl', 'restart', 'ceph.target'
+ ]
+ )
+ ceph_admin.run(args=['sudo', 'ceph', '-s'])
+
+ # workaround for http://tracker.ceph.com/issues/20950
+ # write the correct mgr key to disk
+ if config.get('setup-mgr-node', None):
+ mons = ctx.cluster.only(teuthology.is_type('mon'))
+ for remote, roles in mons.remotes.iteritems():
+ remote.run(
+ args=[
+ run.Raw('sudo ceph auth get client.bootstrap-mgr'),
+ run.Raw('|'),
+ run.Raw('sudo tee'),
+ run.Raw('/var/lib/ceph/bootstrap-mgr/ceph.keyring')
+ ]
+ )
+
+ if config.get('setup-mgr-node', None):
+ mgr_nodes = get_nodes_using_role(ctx, 'mgr')
+ mgr_nodes = " ".join(mgr_nodes)
+ mgr_install = './ceph-deploy install --mgr ' + ceph_branch + " " + mgr_nodes
+ mgr_create = './ceph-deploy mgr create' + " " + mgr_nodes
+ # install mgr
+ ceph_admin.run(
+ args=[
+ 'cd',
+ '{tdir}/ceph-deploy'.format(tdir=testdir),
+ run.Raw('&&'),
+ run.Raw(mgr_install),
+ ],
+ )
+ # create mgr
+ ceph_admin.run(
+ args=[
+ 'cd',
+ '{tdir}/ceph-deploy'.format(tdir=testdir),
+ run.Raw('&&'),
+ run.Raw(mgr_create),
+ ],
+ )
+ ceph_admin.run(args=['sudo', 'ceph', '-s'])
+ if config.get('wait-for-healthy', None):
+ wait_until_healthy(ctx, ceph_admin, use_sudo=True)
+ yield
+
+
@contextlib.contextmanager
def task(ctx, config):
"""
branch:
stable: bobtail
mon_initial_members: 1
+ ceph-deploy-branch: my-ceph-deploy-branch
only_mon: true
keep_running: true
# either choose bluestore or filestore, default is bluestore
bluestore: True
# or
filestore: True
+      # skip installing mgr for old releases using the flag below
+      skip-mgr: True (default is False)
tasks:
- install:
self.stopping = False
self.logger = logger
self.config = config
- self.revive_timeout = self.config.get("revive_timeout", 150)
+ self.revive_timeout = self.config.get("revive_timeout", 360)
self.pools_to_fix_pgp_num = set()
if self.config.get('powercycle'):
self.revive_timeout += 120
self.clean_wait = self.config.get('clean_wait', 0)
- self.minin = self.config.get("min_in", 3)
+ self.minin = self.config.get("min_in", 4)
self.chance_move_pg = self.config.get('chance_move_pg', 1.0)
self.sighup_delay = self.config.get('sighup_delay')
self.optrack_toggle_delay = self.config.get('optrack_toggle_delay')
pg=pg,
id=exp_osd))
# export
+ # Can't use new export-remove op since this is part of upgrade testing
cmd = prefix + "--op export --pgid {pg} --file {file}"
cmd = cmd.format(id=exp_osd, pg=pg, file=exp_path)
proc = exp_remote.run(args=cmd)
"export failure with status {ret}".
format(ret=proc.exitstatus))
# remove
- cmd = prefix + "--op remove --pgid {pg}"
+ cmd = prefix + "--force --op remove --pgid {pg}"
cmd = cmd.format(id=exp_osd, pg=pg)
proc = exp_remote.run(args=cmd)
if proc.exitstatus:
osd_debug_skip_full_check_in_backfill_reservation to force
the more complicated check in do_scan to be exercised.
- Then, verify that all backfills stop.
+ Then, verify that all backfillings stop.
"""
self.log("injecting backfill full")
for i in self.live_osds:
check_status=True, timeout=30, stdout=DEVNULL)
for i in range(30):
status = self.ceph_manager.compile_pg_status()
- if 'backfill' not in status.keys():
+ if 'backfilling' not in status.keys():
break
self.log(
- "waiting for {still_going} backfills".format(
- still_going=status.get('backfill')))
+ "waiting for {still_going} backfillings".format(
+ still_going=status.get('backfilling')))
time.sleep(1)
- assert('backfill' not in self.ceph_manager.compile_pg_status().keys())
+ assert('backfilling' not in self.ceph_manager.compile_pg_status().keys())
for i in self.live_osds:
self.ceph_manager.set_config(
i,
for pg in pgs:
if (pg['state'].count('active') and
not pg['state'].count('recover') and
- not pg['state'].count('backfill') and
+ not pg['state'].count('backfilling') and
not pg['state'].count('stale')):
num += 1
return num
else:
self.log("no progress seen, keeping timeout for now")
if now - start >= timeout:
+ if self.is_recovered():
+ break
self.log('dumping pgs')
out = self.raw_cluster_cmd('pg', 'dump')
self.log(out)
time.sleep(3)
self.log("active!")
+ def wait_till_pg_convergence(self, timeout=None):
+ start = time.time()
+ old_stats = None
+ active_osds = [osd['osd'] for osd in self.get_osd_dump()
+ if osd['in'] and osd['up']]
+ while True:
+            # strictly speaking, there is no need to wait for the mon. but due
+            # to the "ms inject socket failures" setting, the osdmap could be
+            # delayed, so the mgr is likely to ignore pg-stat messages for pgs
+            # serving newly created pools that are not yet known to the mgr.
+            # so, to make sure the mgr is updated with the latest pg-stats,
+            # waiting for mon/mgr is necessary.
+ self.flush_pg_stats(active_osds)
+ new_stats = dict((stat['pgid'], stat['state'])
+ for stat in self.get_pg_stats())
+ if old_stats == new_stats:
+ return old_stats
+ if timeout is not None:
+ assert time.time() - start < timeout, \
+ 'failed to reach convergence before %d secs' % timeout
+ old_stats = new_stats
+ # longer than mgr_stats_period
+ time.sleep(5 + 1)
+
def mark_out_osd(self, osd):
"""
Wrapper to mark osd out.
time.sleep(2)
self.ctx.daemons.get_daemon('osd', osd, self.cluster).stop()
- def revive_osd(self, osd, timeout=150, skip_admin_check=False):
+ def revive_osd(self, osd, timeout=360, skip_admin_check=False):
"""
Revive osds by either power cycling (if indicated by the config)
or by restarting.
continue
for pg in pgs[osdid]:
- cmd = ((prefix + "--op remove --pgid {pg}").
+ cmd = ((prefix + "--force --op remove --pgid {pg}").
format(pg=pg, id=osdid))
proc = remote.run(args=cmd, check_status=False,
stdout=StringIO())
This object is for driving a CephFS filesystem. The MDS daemons driven by
MDSCluster may be shared with other Filesystems.
"""
- def __init__(self, ctx, fscid=None, name=None, create=False):
+ def __init__(self, ctx, fscid=None, name=None, create=False,
+ ec_profile=None):
super(Filesystem, self).__init__(ctx)
self.name = name
+ self.ec_profile = ec_profile
self.id = None
self.metadata_pool_name = None
self.metadata_overlay = False
self.name, self.metadata_pool_name, data_pool_name,
'--allow-dangerous-metadata-overlay')
else:
- self.mon_manager.raw_cluster_cmd('osd', 'pool', 'create',
- data_pool_name, pgs_per_fs_pool.__str__())
+ if self.ec_profile:
+ log.info("EC profile is %s", self.ec_profile)
+ cmd = ['osd', 'erasure-code-profile', 'set', data_pool_name]
+ cmd.extend(self.ec_profile)
+ self.mon_manager.raw_cluster_cmd(*cmd)
+ self.mon_manager.raw_cluster_cmd(
+ 'osd', 'pool', 'create',
+ data_pool_name, pgs_per_fs_pool.__str__(), 'erasure',
+ data_pool_name)
+ self.mon_manager.raw_cluster_cmd(
+ 'osd', 'pool', 'set',
+ data_pool_name, 'allow_ec_overwrites', 'true')
+ else:
+ self.mon_manager.raw_cluster_cmd(
+ 'osd', 'pool', 'create',
+ data_pool_name, pgs_per_fs_pool.__str__())
self.mon_manager.raw_cluster_cmd('fs', 'new',
self.name, self.metadata_pool_name, data_pool_name)
self.check_pool_application(self.metadata_pool_name)
REQUIRE_KCLIENT_REMOTE = True
CLIENTS_REQUIRED = 2
- def _test_client_pin(self, use_subdir):
+ def _test_client_pin(self, use_subdir, open_files):
"""
When a client pins an inode in its cache, for example because the file is held open,
it should reject requests from the MDS to trim these caps. The MDS should complain
:param use_subdir: whether to put test files in a subdir or use root
"""
- cache_size = 100
- open_files = 200
+ cache_size = open_files/2
self.set_conf('mds', 'mds cache size', cache_size)
self.fs.mds_fail_restart()
self.fs.wait_for_daemons()
+ mds_min_caps_per_client = int(self.fs.get_config("mds_min_caps_per_client"))
+ self.assertTrue(open_files >= mds_min_caps_per_client)
+ mds_max_ratio_caps_per_client = float(self.fs.get_config("mds_max_ratio_caps_per_client"))
+
mount_a_client_id = self.mount_a.get_global_id()
path = "subdir/mount_a" if use_subdir else "mount_a"
open_proc = self.mount_a.open_n_background(path, open_files)
# MDS should not be happy about that, as the client is failing to comply
# with the SESSION_RECALL messages it is being sent
mds_recall_state_timeout = float(self.fs.get_config("mds_recall_state_timeout"))
- self.wait_for_health("MDS_CLIENT_RECALL",
- mds_recall_state_timeout + 10)
+ self.wait_for_health("MDS_CLIENT_RECALL", mds_recall_state_timeout+10)
# We can also test that the MDS health warning for oversized
# cache is functioning as intended.
# The remaining caps should comply with the numbers sent from MDS in SESSION_RECALL message,
# which depend on the caps outstanding, cache size and overall ratio
- self.wait_until_equal(
- lambda: self.get_session(mount_a_client_id)['num_caps'],
- int(open_files * 0.2),
- timeout=30,
- reject_fn=lambda x: x < int(open_files*0.2))
+ recall_expected_value = int((1.0-mds_max_ratio_caps_per_client)*(open_files+2))
+ def expected_caps():
+ num_caps = self.get_session(mount_a_client_id)['num_caps']
+ if num_caps < mds_min_caps_per_client:
+ raise RuntimeError("client caps fell below min!")
+ elif num_caps == mds_min_caps_per_client:
+ return True
+ elif recall_expected_value*.95 <= num_caps <= recall_expected_value*1.05:
+ return True
+ else:
+ return False
+
+ self.wait_until_true(expected_caps, timeout=60)
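For concreteness, a worked example of the acceptance window the check above computes (illustrative numbers; the 0.8 ratio is an assumed value, not necessarily the configured default)::

    mds_max_ratio_caps_per_client = 0.8   # assumed, for illustration only
    open_files = 400                      # as in test_client_pin_root below

    recall_expected_value = int((1.0 - mds_max_ratio_caps_per_client) * (open_files + 2))
    assert recall_expected_value == 80    # int(0.2 * 402)
    # expected_caps() then passes once num_caps lands inside the +/-5% band
    # [76.0, 84.0], or drops exactly to mds_min_caps_per_client.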
@needs_trimming
def test_client_pin_root(self):
- self._test_client_pin(False)
+ self._test_client_pin(False, 400)
@needs_trimming
def test_client_pin(self):
- self._test_client_pin(True)
+ self._test_client_pin(True, 800)
+
+ @needs_trimming
+ def test_client_pin_mincaps(self):
+ self._test_client_pin(True, 200)
def test_client_release_bug(self):
"""
:return:
"""
- # Because the teuthology config template sets mon_pg_warn_max_per_osd to
+ # Because the teuthology config template sets mon_max_pg_per_osd to
# 10000 (i.e. it just tries to ignore health warnings), reset it to something
# sane before using volume_client, to avoid creating pools with absurdly large
# numbers of PGs.
- self.set_conf("global", "mon pg warn max per osd", "300")
+ self.set_conf("global", "mon max pg per osd", "300")
for mon_daemon_state in self.ctx.daemons.iter_daemons_of_role('mon'):
mon_daemon_state.restart()
# Calculate how many PGs we'll expect the new volume pool to have
osd_map = json.loads(self.fs.mon_manager.raw_cluster_cmd('osd', 'dump', '--format=json-pretty'))
- max_per_osd = int(self.fs.get_config('mon_pg_warn_max_per_osd'))
+ max_per_osd = int(self.fs.get_config('mon_max_pg_per_osd'))
osd_count = len(osd_map['osds'])
max_overall = osd_count * max_per_osd
# auth ID belongs to, the auth ID's authorized access levels
# for different volumes, versioning details, etc.
expected_auth_metadata = {
- u"version": 1,
+ u"version": 2,
u"compat_version": 1,
u"dirty": False,
u"tenant_id": u"tenant1",
# Verify that the volume metadata file stores info about auth IDs
# and their access levels to the volume, versioning details, etc.
expected_vol_metadata = {
- u"version": 1,
+ u"version": 2,
u"compat_version": 1,
u"auths": {
u"guest": {
volume_id=volume_id,
auth_id=guestclient["auth_id"],
)))
+
+ def test_put_object(self):
+ vc_mount = self.mounts[1]
+ vc_mount.umount_wait()
+ self._configure_vc_auth(vc_mount, "manila")
+
+ obj_data = 'test data'
+ obj_name = 'test_vc_obj_1'
+ pool_name = self.fs.get_data_pool_names()[0]
+
+ self._volume_client_python(vc_mount, dedent("""
+ vc.put_object("{pool_name}", "{obj_name}", b"{obj_data}")
+ """.format(
+ pool_name = pool_name,
+ obj_name = obj_name,
+ obj_data = obj_data
+ )))
+
+ read_data = self.fs.rados(['get', obj_name, '-'], pool=pool_name)
+ self.assertEqual(obj_data, read_data)
+
+ def test_get_object(self):
+ vc_mount = self.mounts[1]
+ vc_mount.umount_wait()
+ self._configure_vc_auth(vc_mount, "manila")
+
+ obj_data = 'test_data'
+ obj_name = 'test_vc_ob_2'
+ pool_name = self.fs.get_data_pool_names()[0]
+
+ self.fs.rados(['put', obj_name, '-'], pool=pool_name, stdin_data=obj_data)
+
+ self._volume_client_python(vc_mount, dedent("""
+ data_read = vc.get_object("{pool_name}", "{obj_name}")
+ assert data_read == b"{obj_data}"
+ """.format(
+ pool_name = pool_name,
+ obj_name = obj_name,
+ obj_data = obj_data
+ )))
+
+ def test_delete_object(self):
+ vc_mount = self.mounts[1]
+ vc_mount.umount_wait()
+ self._configure_vc_auth(vc_mount, "manila")
+
+ obj_data = 'test data'
+ obj_name = 'test_vc_obj_3'
+ pool_name = self.fs.get_data_pool_names()[0]
+
+ self.fs.rados(['put', obj_name, '-'], pool=pool_name, stdin_data=obj_data)
+
+ self._volume_client_python(vc_mount, dedent("""
+ data_read = vc.delete_object("{pool_name}", "{obj_name}")
+ """.format(
+ pool_name = pool_name,
+ obj_name = obj_name,
+ )))
+
+ with self.assertRaises(CommandFailedError):
+ self.fs.rados(['stat', obj_name], pool=pool_name)
+
+ # Check idempotency -- no error raised trying to delete non-existent
+ # object
+ self._volume_client_python(vc_mount, dedent("""
+ data_read = vc.delete_object("{pool_name}", "{obj_name}")
+ """.format(
+ pool_name = pool_name,
+ obj_name = obj_name,
+ )))
+
+ def test_21501(self):
+ """
+ Reproducer for #21501 "ceph_volume_client: sets invalid caps for
+ existing IDs with no caps" (http://tracker.ceph.com/issues/21501)
+ """
+
+ vc_mount = self.mounts[1]
+ vc_mount.umount_wait()
+
+ # Configure vc_mount as the handle for driving volumeclient
+ self._configure_vc_auth(vc_mount, "manila")
+
+ # Create a volume
+ group_id = "grpid"
+ volume_id = "volid"
+ mount_path = self._volume_client_python(vc_mount, dedent("""
+ vp = VolumePath("{group_id}", "{volume_id}")
+ create_result = vc.create_volume(vp, 1024*1024*10)
+ print create_result['mount_path']
+ """.format(
+ group_id=group_id,
+ volume_id=volume_id
+ )))
+
+ # Create an auth ID with no caps
+ guest_id = '21501'
+ self.fs.mon_manager.raw_cluster_cmd_result(
+ 'auth', 'get-or-create', 'client.{0}'.format(guest_id))
+
+ guest_mount = self.mounts[2]
+ guest_mount.umount_wait()
+
+ # Set auth caps for the auth ID using the volumeclient
+ self._configure_guest_auth(vc_mount, guest_mount, guest_id, mount_path)
+
+ # Mount the volume in the guest using the auth ID to assert that the
+ # auth caps are valid
+ guest_mount.mount(mount_path=mount_path)
format(fpath=FSPATH, jpath=JPATH))
pid = os.getpid()
expfile = os.path.join(testdir, "exp.{pid}.out".format(pid=pid))
- cmd = ((prefix + "--op export --pgid 2.0 --file {file}").
- format(id=divergent, file=expfile))
- proc = exp_remote.run(args=cmd, wait=True,
- check_status=False, stdout=StringIO())
- assert proc.exitstatus == 0
-
- cmd = ((prefix + "--op remove --pgid 2.0").
+ cmd = ((prefix + "--op export-remove --pgid 2.0 --file {file}").
format(id=divergent, file=expfile))
proc = exp_remote.run(args=cmd, wait=True,
check_status=False, stdout=StringIO())
from unittest import case
import json
+import logging
from teuthology import misc
from tasks.ceph_test_case import CephTestCase
-# TODO move definition of CephCluster
+# TODO move definition of CephCluster away from the CephFS stuff
from tasks.cephfs.filesystem import CephCluster
+log = logging.getLogger(__name__)
+
+
class MgrCluster(CephCluster):
def __init__(self, ctx):
super(MgrCluster, self).__init__(ctx)
def get_standby_ids(self):
return [s['name'] for s in self.get_mgr_map()["standbys"]]
+ def set_module_localized_conf(self, module, mgr_id, key, val):
+ self.mon_manager.raw_cluster_cmd("config-key", "set",
+ "mgr/{0}/{1}/{2}".format(
+ module, mgr_id, key
+ ), val)
+
class MgrTestCase(CephTestCase):
MGRS_REQUIRED = 1
self.wait_until_true(
lambda: set(self.mgr_cluster.get_standby_ids()) == expect_standbys,
timeout=20)
+
+ def _load_module(self, module_name):
+ loaded = json.loads(self.mgr_cluster.mon_manager.raw_cluster_cmd(
+ "mgr", "module", "ls"))['enabled_modules']
+ if module_name in loaded:
+ # The enable command is idempotent, but our wait for a restart
+ # isn't, so let's return now if it's already loaded
+ return
+
+ initial_gid = self.mgr_cluster.get_mgr_map()['active_gid']
+ self.mgr_cluster.mon_manager.raw_cluster_cmd("mgr", "module", "enable",
+ module_name)
+
+ # Wait for the module to load
+ def has_restarted():
+ mgr_map = self.mgr_cluster.get_mgr_map()
+ done = mgr_map['active_gid'] != initial_gid and mgr_map['available']
+ if done:
+ log.info("Restarted after module load (new active {0}/{1})".format(
+                    mgr_map['active_name'], mgr_map['active_gid']))
+ return done
+ self.wait_until_true(has_restarted, timeout=30)
+
+
+ def _get_uri(self, service_name):
+ # Little dict hack so that I can assign into this from
+ # the get_or_none function
+ mgr_map = {'x': None}
+
+ def _get_or_none():
+ mgr_map['x'] = self.mgr_cluster.get_mgr_map()
+ result = mgr_map['x']['services'].get(service_name, None)
+ return result
+
+ self.wait_until_true(lambda: _get_or_none() is not None, 30)
+
+ uri = mgr_map['x']['services'][service_name]
+
+ log.info("Found {0} at {1} (daemon {2}/{3})".format(
+ service_name, uri, mgr_map['x']['active_name'],
+ mgr_map['x']['active_gid']))
+
+ return uri
+
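The mutable-dict trick above is only needed because this file targets Python 2, which lacks ``nonlocal``. A minimal Python 3 sketch of the same polling pattern (hypothetical helper, not part of this change)::

    import time

    def get_service_uri(get_mgr_map, service_name, timeout=30):
        mgr_map = None

        def _get_or_none():
            nonlocal mgr_map              # py3: rebind the enclosing name directly
            mgr_map = get_mgr_map()
            return mgr_map['services'].get(service_name, None)

        deadline = time.time() + timeout
        while _get_or_none() is None:
            if time.time() > deadline:
                raise RuntimeError('%s never appeared in the mgr map' % service_name)
            time.sleep(1)
        return mgr_map['services'][service_name]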
+
+ def _assign_ports(self, module_name, config_name, min_port=7789):
+ """
+ To avoid the need to run lots of hosts in teuthology tests to
+ get different URLs per mgr, we will hand out different ports
+ to each mgr here.
+
+ This is already taken care of for us when running in a vstart
+ environment.
+ """
+ # Start handing out ports well above Ceph's range.
+ assign_port = min_port
+
+ for mgr_id in self.mgr_cluster.mgr_ids:
+ self.mgr_cluster.mgr_stop(mgr_id)
+ self.mgr_cluster.mgr_fail(mgr_id)
+
+ for mgr_id in self.mgr_cluster.mgr_ids:
+ log.info("Using port {0} for {1} on mgr.{2}".format(
+ assign_port, module_name, mgr_id
+ ))
+ self.mgr_cluster.set_module_localized_conf(module_name, mgr_id,
+ config_name,
+ str(assign_port))
+ assign_port += 1
+
+ for mgr_id in self.mgr_cluster.mgr_ids:
+ self.mgr_cluster.mgr_restart(mgr_id)
+
+ def is_available():
+ mgr_map = self.mgr_cluster.get_mgr_map()
+ done = mgr_map['available']
+ if done:
+ log.info("Available after assign ports (new active {0}/{1})".format(
+                    mgr_map['active_name'], mgr_map['active_gid']))
+ return done
+ self.wait_until_true(is_available, timeout=30)
--- /dev/null
+
+
+from mgr_test_case import MgrTestCase
+
+import logging
+import requests
+
+
+log = logging.getLogger(__name__)
+
+
+class TestDashboard(MgrTestCase):
+ MGRS_REQUIRED = 3
+
+ def test_standby(self):
+ self._assign_ports("dashboard", "server_port")
+ self._load_module("dashboard")
+
+ original_active = self.mgr_cluster.get_active_id()
+
+ original_uri = self._get_uri("dashboard")
+ log.info("Originally running at {0}".format(original_uri))
+
+ self.mgr_cluster.mgr_fail(original_active)
+
+ failed_over_uri = self._get_uri("dashboard")
+ log.info("After failover running at {0}".format(original_uri))
+
+ self.assertNotEqual(original_uri, failed_over_uri)
+
+ # The original active daemon should have come back up as a standby
+ # and be doing redirects to the new active daemon
+ r = requests.get(original_uri, allow_redirects=False)
+ self.assertEqual(r.status_code, 303)
+ self.assertEqual(r.headers['Location'], failed_over_uri)
+
+ def test_urls(self):
+ self._assign_ports("dashboard", "server_port")
+ self._load_module("dashboard")
+
+ base_uri = self._get_uri("dashboard")
+
+ # This is a very simple smoke test to check that the dashboard can
+ # give us a 200 response to requests. We're not testing that
+ # the content is correct or even renders!
+
+ urls = [
+ "/health",
+ "/servers",
+ "/osd/",
+ "/osd/perf/0",
+ "/rbd_mirroring",
+ "/rbd_iscsi"
+ ]
+
+ failures = []
+
+ for url in urls:
+ r = requests.get(base_uri + url, allow_redirects=False)
+ if r.status_code >= 300 and r.status_code < 400:
+ log.error("Unexpected redirect to: {0} (from {1})".format(
+ r.headers['Location'], base_uri))
+ if r.status_code != 200:
+ failures.append(url)
+
+ log.info("{0}: {1} ({2} bytes)".format(
+ url, r.status_code, len(r.content)
+ ))
+
+ self.assertListEqual(failures, [])
--- /dev/null
+
+import time
+import requests
+
+from tasks.mgr.mgr_test_case import MgrTestCase
+
+
+class TestModuleSelftest(MgrTestCase):
+ """
+ That modules with a self-test command can be loaded and execute it
+ without errors.
+
+ This is not a substitute for really testing the modules, but it
+ is quick and is designed to catch regressions that could occur
+ if data structures change in a way that breaks how the modules
+ touch them.
+ """
+ MGRS_REQUIRED = 1
+
+ def _selftest_plugin(self, module_name):
+ self._load_module(module_name)
+
+ # Execute the module's self-test routine
+ self.mgr_cluster.mon_manager.raw_cluster_cmd(module_name, "self-test")
+
+ def test_zabbix(self):
+ self._selftest_plugin("zabbix")
+
+ def test_prometheus(self):
+ self._selftest_plugin("prometheus")
+
+ def test_influx(self):
+ self._selftest_plugin("influx")
+
+ def test_selftest_run(self):
+ self._load_module("selftest")
+ self.mgr_cluster.mon_manager.raw_cluster_cmd("mgr", "self-test", "run")
+
+ def test_selftest_command_spam(self):
+ # Use the selftest module to stress the mgr daemon
+ self._load_module("selftest")
+
+ # Use the dashboard to test that the mgr is still able to do its job
+ self._assign_ports("dashboard", "server_port")
+ self._load_module("dashboard")
+
+ original_active = self.mgr_cluster.get_active_id()
+ original_standbys = self.mgr_cluster.get_standby_ids()
+
+ self.mgr_cluster.mon_manager.raw_cluster_cmd("mgr", "self-test",
+ "background", "start",
+ "command_spam")
+
+ dashboard_uri = self._get_uri("dashboard")
+
+ delay = 10
+ periods = 10
+ for i in range(0, periods):
+ t1 = time.time()
+ # Check that an HTTP module remains responsive
+ r = requests.get(dashboard_uri)
+ self.assertEqual(r.status_code, 200)
+
+ # Check that a native non-module command remains responsive
+ self.mgr_cluster.mon_manager.raw_cluster_cmd("osd", "df")
+
+ time.sleep(delay - (time.time() - t1))
+
+ self.mgr_cluster.mon_manager.raw_cluster_cmd("mgr", "self-test",
+ "background", "stop")
+
+ # Check that all mgr daemons are still running
+ self.assertEqual(original_active, self.mgr_cluster.get_active_id())
+ self.assertEqual(original_standbys, self.mgr_cluster.get_standby_ids())
--- /dev/null
+import logging
+import random
+
+
+log = logging.getLogger(__name__)
+
+
+def pg_num_in_all_states(pgs, *states):
+ return sum(1 for state in pgs.itervalues()
+ if all(s in state for s in states))
+
+
+def pg_num_in_any_state(pgs, *states):
+ return sum(1 for state in pgs.itervalues()
+ if any(s in state for s in states))
+
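A quick illustration of how the two counting helpers above treat pg state strings (substring matching against ``get_pg_stats``-style states; the values are made up)::

    pgs = {
        '1.0': 'active+clean',
        '1.1': 'active+clean',
        '2.0': 'creating',
        '2.1': 'unknown',
    }
    assert pg_num_in_all_states(pgs, 'active', 'clean') == 2     # both substrings present
    assert pg_num_in_any_state(pgs, 'unknown', 'creating') == 2  # either substring present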
+
+def test_create_from_mon(ctx, config):
+ """
+    The osd should stop creating new pgs if the number of pgs it serves
+    exceeds the max-pg-per-osd setting, and it should resume the previously
+    suspended pg creations once its pg number drops back below the setting.
+    How it works::
+    1. set the hard limit of pg-per-osd to "2"
+    2. create pool.a with pg_num=2
+       # all pgs should be active+clean
+    3. create pool.b with pg_num=2
+       # new pgs belonging to this pool should be unknown (the primary osd
+       # reaches the limit) or creating (a replica osd reaches the limit)
+    4. remove pool.a
+    5. all pgs belonging to pool.b should be active+clean
+ """
+ pg_num = config.get('pg_num', 2)
+ manager = ctx.managers['ceph']
+ log.info('1. creating pool.a')
+ pool_a = manager.create_pool_with_unique_name(pg_num)
+ manager.wait_for_clean()
+ assert manager.get_num_active_clean() == pg_num
+
+ log.info('2. creating pool.b')
+ pool_b = manager.create_pool_with_unique_name(pg_num)
+ pg_states = manager.wait_till_pg_convergence(300)
+ pg_created = pg_num_in_all_states(pg_states, 'active', 'clean')
+ assert pg_created == pg_num
+ pg_pending = pg_num_in_any_state(pg_states, 'unknown', 'creating')
+ assert pg_pending == pg_num
+
+ log.info('3. removing pool.a')
+ manager.remove_pool(pool_a)
+ pg_states = manager.wait_till_pg_convergence(300)
+ assert len(pg_states) == pg_num
+ pg_created = pg_num_in_all_states(pg_states, 'active', 'clean')
+ assert pg_created == pg_num
+
+ # cleanup
+ manager.remove_pool(pool_b)
+
+
+def test_create_from_peer(ctx, config):
+ """
+    The osd should stop creating new pgs if the number of pgs it serves
+    exceeds the max-pg-per-osd setting, and it should resume the previously
+    suspended pg creations once its pg number drops back below the setting.
+
+ How it works::
+ 0. create 4 OSDs.
+ 1. create pool.a with pg_num=1, size=2
+       the pg will be mapped to osd.0 and osd.1, and it should be active+clean
+    2. create pool.b with pg_num=1, size=2.
+       if the pgs get stuck in creating, delete the pool and try
+       again; eventually we'll get the pool to land on the other 2 osds that
+ aren't occupied by pool.a. (this will also verify that pgs for deleted
+ pools get cleaned out of the creating wait list.)
+ 3. mark an osd out. verify that some pgs get stuck stale or peering.
+ 4. delete a pool, verify pgs go active.
+ """
+ pg_num = config.get('pg_num', 1)
+ pool_size = config.get('pool_size', 2)
+ from_primary = config.get('from_primary', True)
+
+ manager = ctx.managers['ceph']
+ log.info('1. creating pool.a')
+ pool_a = manager.create_pool_with_unique_name(pg_num)
+ manager.wait_for_clean()
+ assert manager.get_num_active_clean() == pg_num
+
+ log.info('2. creating pool.b')
+ while True:
+ pool_b = manager.create_pool_with_unique_name(pg_num)
+ pg_states = manager.wait_till_pg_convergence(300)
+ pg_created = pg_num_in_all_states(pg_states, 'active', 'clean')
+ assert pg_created >= pg_num
+ pg_pending = pg_num_in_any_state(pg_states, 'unknown', 'creating')
+ assert pg_pending == pg_num * 2 - pg_created
+ if pg_created == pg_num * 2:
+ break
+ manager.remove_pool(pool_b)
+
+ log.info('3. mark an osd out')
+ pg_stats = manager.get_pg_stats()
+ pg = random.choice(pg_stats)
+ if from_primary:
+ victim = pg['acting'][-1]
+ else:
+ victim = pg['acting'][0]
+ manager.mark_out_osd(victim)
+ pg_states = manager.wait_till_pg_convergence(300)
+ pg_stuck = pg_num_in_any_state(pg_states, 'activating', 'stale', 'peering')
+ assert pg_stuck > 0
+
+ log.info('4. removing pool.b')
+ manager.remove_pool(pool_b)
+ manager.wait_for_clean(30)
+
+ # cleanup
+ manager.remove_pool(pool_a)
+
+
+def task(ctx, config):
+ assert isinstance(config, dict), \
+ 'osd_max_pg_per_osd task only accepts a dict for config'
+ manager = ctx.managers['ceph']
+ if config.get('test_create_from_mon', True):
+ test_create_from_mon(ctx, config)
+ else:
+ test_create_from_peer(ctx, config)
format(fpath=FSPATH, jpath=JPATH))
pid = os.getpid()
expfile = os.path.join(testdir, "exp.{pid}.out".format(pid=pid))
- cmd = ((prefix + "--op export --pgid 2.0 --file {file}").
+ cmd = ((prefix + "--op export-remove --pgid 2.0 --file {file}").
format(id=divergent, file=expfile))
proc = exp_remote.run(args=cmd, wait=True,
check_status=False, stdout=StringIO())
assert proc.exitstatus == 0
- # Remove the same pg that was exported
- cmd = ((prefix + "--op remove --pgid 2.0").
- format(id=divergent))
- proc = exp_remote.run(args=cmd, wait=True,
- check_status=False, stdout=StringIO())
- assert proc.exitstatus == 0
-
# Kill one of non-divergent OSDs
log.info('killing osd.%d' % non_divergent[0])
manager.kill_osd(non_divergent[0])
# manager.mark_out_osd(non_divergent[0])
# An empty collection for pg 2.0 might need to be cleaned up
- cmd = ((prefix + "--op remove --pgid 2.0").
+ cmd = ((prefix + "--force --op remove --pgid 2.0").
format(id=non_divergent[0]))
proc = exp_remote.run(args=cmd, wait=True,
check_status=False, stdout=StringIO())
fix_rgw_config(rgw_node, dnsmasq_name)
setup_user_bucket(rgw_node, dnsmasq_name, access_key, secret_key, bucket_name, testdir)
if hadoop_ver.startswith('2.8'):
- test_options = '-Dit.test=ITestS3A* -Dparallel-tests -Dscale -Dfs.s3a.scale.test.huge.filesize=128M verify'
+        # run all the ITest cases but skip the AWS test that uses the public
+        # bucket landsat-pds, which is not reachable from within this test
+ test_options = '-Dit.test=ITestS3A* -Dit.test=\!ITestS3AAWSCredentialsProvider* -Dparallel-tests -Dscale -Dfs.s3a.scale.test.huge.filesize=128M verify'
else:
test_options = 'test -Dtest=S3a*,TestS3A*'
try:
cluster: (default 'ceph') the name of the cluster to thrash
- min_in: (default 3) the minimum number of OSDs to keep in the
+ min_in: (default 4) the minimum number of OSDs to keep in the
cluster
min_out: (default 0) the minimum number of OSDs to keep out of the
if application:
remote.run(args=[
'sudo', 'ceph', 'osd', 'pool', 'application', 'enable', name, application, '--cluster', cluster_name
- ])
+ ], check_status=False) # may fail as EINVAL when run in jewel upgrade test
def create_replicated_pool(remote, name, pgnum, cluster_name="ceph", application=None):
remote.run(args=[
if application:
remote.run(args=[
'sudo', 'ceph', 'osd', 'pool', 'application', 'enable', name, application, '--cluster', cluster_name
- ])
+ ], check_status=False)
def create_cache_pool(remote, base_name, cache_name, pgnum, size, cluster_name="ceph"):
remote.run(args=[
LOG.debug(self.unused_disks('sd.'))
if self.unused_disks('sd.'):
return
- modprobe = "modprobe scsi_debug vpd_use_hostno=0 add_host=1 dev_size_mb=200 ; udevadm settle"
+ modprobe = "modprobe scsi_debug vpd_use_hostno=0 add_host=1 dev_size_mb=300 ; udevadm settle"
try:
self.sh(modprobe)
except:
exit 1
fi
-sudo env PATH=$(dirname $0):$(dirname $0)/..:$PATH ${PYTHON} -m pytest -s -v $(dirname $0)/ceph-disk-test.py
+sudo env PATH=$(dirname $0):$(dirname $0)/..:$PATH PYTHONWARNINGS=ignore ${PYTHON} -m pytest -s -v $(dirname $0)/ceph-disk-test.py
result=$?
sudo rm -f /lib/udev/rules.d/60-ceph-by-partuuid.rules
# When CEPH_CLI_TEST_DUP_COMMAND is set, osd create
# is repeated and consumes two osd id, not just one.
#
- local next_osd
- if test "$CEPH_CLI_TEST_DUP_COMMAND" ; then
- next_osd=$((gap_start + 1))
- else
- next_osd=$gap_start
- fi
- id=`ceph osd create`
- [ "$id" = "$next_osd" ]
-
- next_osd=$((id + 1))
+ local next_osd=$gap_start
id=`ceph osd create $(uuidgen)`
[ "$id" = "$next_osd" ]
ceph osd erasure-code-profile set fooprofile a=b c=d e=f --force
ceph osd erasure-code-profile set fooprofile a=b c=d e=f
expect_false ceph osd erasure-code-profile set fooprofile a=b c=d e=f g=h
- #
- # cleanup by removing profile 'fooprofile'
+ # ruleset-foo will work for luminous only
+ ceph osd erasure-code-profile set barprofile ruleset-failure-domain=host
+ ceph osd erasure-code-profile set barprofile crush-failure-domain=host
+ # clean up
ceph osd erasure-code-profile rm fooprofile
+ ceph osd erasure-code-profile rm barprofile
}
function test_mon_osd_misc()
--- /dev/null
+#!/bin/sh -e
+
+GTEST_FILTER=${CLS_JOURNAL_GTEST_FILTER:-*}
+ceph_test_cls_journal --gtest_filter=${GTEST_FILTER}
+
+exit 0
--- /dev/null
+#!/bin/sh -ex
+
+ceph config-key set mgr/localpool/subtree host
+ceph config-key set mgr/localpool/failure_domain osd
+ceph mgr module enable localpool
+
+while ! ceph osd pool ls | grep '^by-host-'
+do
+ sleep 5
+done
+
+ceph mgr module disable localpool
+for p in `ceph osd pool ls | grep '^by-host-'`
+do
+ ceph osd pool rm $p $p --yes-i-really-really-mean-it
+done
+
+ceph config-key rm mgr/localpool/subtree
+ceph config-key rm mgr/localpool/failure_domain
+
+echo OK
$CEPH_TOOL osd pool set-quota $p max_objects 1
V1=`mktemp fooattrXXXXXXX`
$RADOS_TOOL put $OBJ $V1 -p $p
- while ! $CEPH_TOOL osd dump | grep 'full max_objects'
+ while ! $CEPH_TOOL osd dump | grep 'full_no_quota max_objects'
do
sleep 2
done
admin_daemon ${CLUSTER1} rbd mirror flush
admin_daemon ${CLUSTER1} rbd mirror status
+testlog "TEST: test image rename"
+new_name="${image}_RENAMED"
+rename_image ${CLUSTER2} ${POOL} ${image} ${new_name}
+wait_for_image_replay_started ${CLUSTER1} ${POOL} ${new_name}
+wait_for_status_in_pool_dir ${CLUSTER1} ${POOL} ${new_name} 'up+replaying'
+admin_daemon ${CLUSTER1} rbd mirror status ${POOL}/${new_name}
+admin_daemon ${CLUSTER1} rbd mirror restart ${POOL}/${new_name}
+wait_for_image_replay_started ${CLUSTER1} ${POOL} ${new_name}
+wait_for_status_in_pool_dir ${CLUSTER1} ${POOL} ${new_name} 'up+replaying'
+rename_image ${CLUSTER2} ${POOL} ${new_name} ${image}
+wait_for_image_replay_started ${CLUSTER1} ${POOL} ${image}
+
testlog "TEST: failover and failback"
start_mirror ${CLUSTER2}
rbd --cluster ${cluster} -p ${pool} image-meta set ${image} $key $val
}
+rename_image()
+{
+ local cluster=$1
+ local pool=$2
+ local image=$3
+ local new_name=$4
+
+ rbd --cluster=${cluster} -p ${pool} rename ${image} ${new_name}
+}
+
remove_image()
{
local cluster=$1
allow ceph_t sysfs_t:dir read;
allow ceph_t sysfs_t:file { read getattr open };
-allow ceph_t sysfs_t:lnk_file read;
+allow ceph_t sysfs_t:lnk_file { read getattr };
allow ceph_t random_device_t:chr_file getattr;
allow ceph_t urandom_device_t:chr_file getattr;
-3e7492b9ada8bdc9a5cd0feafd42fbca27f9c38e
-v12.2.1
+cf0baeeeeba3b47f9427c6c97e2144b094b7e5ba
+v12.2.2
--- /dev/null
+fs.aio-max-nr = 1048576
${auth_files}
${mds_files})
+CHECK_C_COMPILER_FLAG("-fvar-tracking-assignments" HAS_VTA)
if(HAS_VTA)
- set_source_files_properties(common/config.cc
+ set_source_files_properties(
+ common/config.cc
+ common/options.cc
PROPERTIES COMPILE_FLAGS -fno-var-tracking-assignments)
endif()
mgr/DaemonState.cc
mgr/DaemonServer.cc
mgr/ClusterState.cc
- mgr/PyModules.cc
+ mgr/ActivePyModules.cc
+ mgr/StandbyPyModules.cc
+ mgr/PyModuleRegistry.cc
+ mgr/PyModuleRunner.cc
mgr/PyFormatter.cc
- mgr/PyState.cc
- mgr/MgrPyModule.cc
+ mgr/PyOSDMap.cc
+ mgr/BaseMgrModule.cc
+ mgr/BaseMgrStandbyModule.cc
+ mgr/ActivePyModule.cc
mgr/MgrStandby.cc
mgr/Mgr.cc
+ mgr/Gil.cc
mgr/mgr_commands.cc)
add_executable(ceph-mgr ${mgr_srcs}
$<TARGET_OBJECTS:heap_profiler_objs>)
add_subdirectory(ceph-detect-init)
## dencoder
-CHECK_C_COMPILER_FLAG("-fvar-tracking-assignments" HAS_VTA)
if(HAS_VTA)
set_source_files_properties(test/encoding/ceph_dencoder.cc
PROPERTIES COMPILE_FLAGS -fno-var-tracking-assignments)
+#include "acconfig.h"
#include "arch/probe.h"
/* flags we export */
ceph_arch_neon = (get_hwcap() & HWCAP_NEON) == HWCAP_NEON;
#elif __aarch64__ && __linux__
ceph_arch_neon = (get_hwcap() & HWCAP_ASIMD) == HWCAP_ASIMD;
-# ifdef HWCAP_CRC32
+# if defined(HAVE_ARMV8_CRC) && defined(HWCAP_CRC32)
ceph_arch_aarch64_crc32 = (get_hwcap() & HWCAP_CRC32) == HWCAP_CRC32;
-# else
- ceph_arch_aarch64_crc32 = 0; // sorry!
# endif
#else
if (0)
import base64
import errno
import fcntl
+import functools
import json
import logging
import os
import grp
import textwrap
import glob
+import warnings
CEPH_OSD_ONDISK_MAGIC = 'ceph osd volume v026'
CEPH_LOCKBOX_ONDISK_MAGIC = 'ceph lockbox volume v001'
KEY_MANAGEMENT_MODE_V1 = 'ceph-mon v1'
+DEPRECATION_WARNING = """
+*******************************************************************************
+This tool is now deprecated in favor of ceph-volume.
+It is recommended to use ceph-volume for OSD deployments. For details see:
+
+ http://docs.ceph.com/docs/master/ceph-volume/#migrating
+
+*******************************************************************************
+"""
+
PTYPE = {
'regular': {
'journal': {
return None
+def retry(on_error=Exception, max_tries=10, wait=0.2, backoff=0):
+ def wrapper(func):
+ @functools.wraps(func)
+ def repeat(*args, **kwargs):
+ for tries in range(max_tries - 1):
+ try:
+ return func(*args, **kwargs)
+ except on_error:
+ time.sleep(wait + backoff * tries)
+ return func(*args, **kwargs)
+ return repeat
+ return wrapper
+
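A usage sketch for the ``retry`` decorator just defined (hypothetical function; note that ``max_tries`` counts total attempts: the loop covers the first ``max_tries - 1`` and the final call lets the last exception propagate)::

    attempts = []

    @retry(on_error=IOError, max_tries=3, wait=0.01, backoff=0.01)
    def flaky():
        attempts.append(1)
        if len(attempts) < 3:
            raise IOError('transient')
        return 'ok'

    assert flaky() == 'ok' and len(attempts) == 3
    # sleeps between failures grow as wait + backoff * tries: 0.01s, then 0.02s here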
+
+@retry(Error)
def get_partition_dev(dev, pnum):
"""
get the device name for a partition
sda 1 -> sda1
cciss/c0d1 1 -> cciss!c0d1p1
"""
- max_retry = 10
- for retry in range(0, max_retry + 1):
- partname = None
- error_msg = ""
- if is_mpath(dev):
- partname = get_partition_mpath(dev, pnum)
- else:
- name = get_dev_name(os.path.realpath(dev))
- sys_entry = os.path.join(BLOCKDIR, name)
- error_msg = " in %s" % sys_entry
- for f in os.listdir(sys_entry):
- if f.startswith(name) and f.endswith(str(pnum)):
- # we want the shortest name that starts with the base name
- # and ends with the partition number
- if not partname or len(f) < len(partname):
- partname = f
- if partname:
- if retry:
- LOG.info('Found partition %d for %s after %d tries' %
- (pnum, dev, retry))
- return get_dev_path(partname)
- else:
- if retry < max_retry:
- LOG.info('Try %d/%d : partition %d for %s does not exist%s' %
- (retry + 1, max_retry, pnum, dev, error_msg))
- time.sleep(.2)
- continue
- else:
- raise Error('partition %d for %s does not appear to exist%s' %
- (pnum, dev, error_msg))
+ partname = None
+ error_msg = ""
+ if is_mpath(dev):
+ partname = get_partition_mpath(dev, pnum)
+ else:
+ name = get_dev_name(os.path.realpath(dev))
+ sys_entry = os.path.join(BLOCKDIR, name)
+ error_msg = " in %s" % sys_entry
+ for f in os.listdir(sys_entry):
+ if f.startswith(name) and f.endswith(str(pnum)):
+ # we want the shortest name that starts with the base name
+ # and ends with the partition number
+ if not partname or len(f) < len(partname):
+ partname = f
+ if partname:
+ return get_dev_path(partname)
+ else:
+ raise Error('partition %d for %s does not appear to exist%s' %
+ (pnum, dev, error_msg))
def list_all_partitions():
raise Error('unable to map device', rawdev, e)
-def dmcrypt_unmap(
- _uuid
-):
+@retry(Error, max_tries=10, wait=0.5, backoff=1.0)
+def dmcrypt_unmap(_uuid):
if not os.path.exists('/dev/mapper/' + _uuid):
return
- retries = 0
- while True:
- try:
- command_check_call(['cryptsetup', 'remove', _uuid])
- break
- except subprocess.CalledProcessError as e:
- if retries == 10:
- raise Error('unable to unmap device', _uuid, e)
- else:
- time.sleep(0.5 + retries * 1.0)
- retries += 1
+ try:
+ command_check_call(['cryptsetup', 'remove', _uuid])
+ except subprocess.CalledProcessError as e:
+ raise Error('unable to unmap device', _uuid, e)
def mount(
return path
+@retry(UnmountError, max_tries=3, wait=0.5, backoff=1.0)
def unmount(
path,
do_rm=True,
"""
    Unmount and remove the given mount point.
"""
- retries = 0
- while True:
- try:
- LOG.debug('Unmounting %s', path)
- command_check_call(
- [
- '/bin/umount',
- '--',
- path,
- ],
- )
- break
- except subprocess.CalledProcessError as e:
- # on failure, retry 3 times with incremental backoff
- if retries == 3:
- raise UnmountError(e)
- else:
- time.sleep(0.5 + retries * 1.0)
- retries += 1
+ try:
+ LOG.debug('Unmounting %s', path)
+ command_check_call(
+ [
+ '/bin/umount',
+ '--',
+ path,
+ ],
+ )
+ except subprocess.CalledProcessError as e:
+ raise UnmountError(e)
if not do_rm:
return
os.rmdir(path)
return self.ptype_map[name]['ready']
@staticmethod
+ @retry(OSError)
def factory(path, dev, args):
dmcrypt_type = CryptHelpers.get_dmcrypt_type(args)
if ((path is not None and is_mpath(path)) or
osd_id,
):
systemd_disable(path, osd_id)
- if is_mounted(path):
+ if os.path.ismount(path):
style = ['--runtime']
else:
style = []
)
osd_data = get_mount_point(cluster, osd_id)
+ args.cluster = cluster
+ if args.dmcrypt:
+ for name in Space.NAMES:
+ # Check if encrypted device in journal
+ dev_path = os.path.join(osd_data, name + '_dmcrypt')
+ if not os.path.exists(dev_path):
+ continue
+ partition = DevicePartition.factory(
+ path=None,
+ dev=dev_path,
+ args=args)
+ partition.rawdev = args.path
+ partition.map()
+
elif stat.S_ISDIR(mode):
(cluster, osd_id) = activate_dir(
path=args.path,
def main(argv):
+ # Deprecate from the very beginning
+ warnings.warn(DEPRECATION_WARNING)
args = parse_args(argv)
setup_logging(args.verbose, args.log_stdout)
CEPH_PREF_GROUP = args.setgroup
if args.verbose:
- args.func(args)
+ try:
+ args.func(args)
+ except Exception:
+ # warn on any exception when running with verbosity
+ warnings.warn(DEPRECATION_WARNING)
+ # but still raise the original issue
+ raise
+
else:
main_catch(args.func, args)
+    # if there aren't any errors, still warn again at the very bottom
+ warnings.warn(DEPRECATION_WARNING)
+
def setup_logging(verbose, log_stdout):
loglevel = logging.WARNING
func(args)
except Error as e:
+ # warn on generic 'error' exceptions
+ warnings.warn(DEPRECATION_WARNING)
raise SystemExit(
'{prog}: {msg}'.format(
prog=args.prog,
)
except CephDiskException as error:
+ # warn on ceph-disk exceptions
+ warnings.warn(DEPRECATION_WARNING)
exc_name = error.__class__.__name__
raise SystemExit(
'{prog} {exc_name}: {msg}'.format(
coverage report --show-missing
[testenv:flake8]
-commands = flake8 --ignore=H105,H405,E127 ceph_disk tests
+commands = flake8 --ignore=H105,H405,E127,E722 ceph_disk tests
--- /dev/null
+"""
+Device API that can be shared among other implementations.
+"""
--- /dev/null
+"""
+API for CRUD lvm tag operations. Follows the Ceph LVM tag naming convention
+that prefixes tags with ``ceph.`` and uses ``=`` for assignment, and provides
+a set of utilities for interacting with LVM.
+"""
+from ceph_volume import process
+from ceph_volume.exceptions import MultipleLVsError, MultipleVGsError, MultiplePVsError
+
+
+def _output_parser(output, fields):
+ """
+    Newer versions of LVM allow ``--reportformat=json``, but older versions,
+    like the one included in Xenial, do not. LVM has the ability to filter and
+    format its output, so we assume the output will be in a format this parser
+    can handle (using ';' as the delimiter).
+
+ :param fields: A string, possibly using ',' to group many items, as it
+ would be used on the CLI
+ :param output: The CLI output from the LVM call
+ """
+ field_items = fields.split(',')
+ report = []
+ for line in output:
+ # clear the leading/trailing whitespace
+ line = line.strip()
+
+ # remove the extra '"' in each field
+ line = line.replace('"', '')
+
+ # prevent moving forward with empty contents
+ if not line:
+ continue
+
+        # splitting on ';' because that is what the lvm call uses as
+        # '--separator'
+        output_items = [i.strip() for i in line.split(';')]
+        # map the output to the fields
+ report.append(
+ dict(zip(field_items, output_items))
+ )
+
+ return report
+
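Example of what ``_output_parser`` produces for one line of ``--separator=';'`` output (illustrative values; note that every parsed value stays a string)::

    line = '  "osd_vg";"3";"1"  '
    report = _output_parser([line], 'vg_name,pv_count,lv_count')
    assert report == [{'vg_name': 'osd_vg', 'pv_count': '3', 'lv_count': '1'}]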
+
+def parse_tags(lv_tags):
+ """
+ Return a dictionary mapping of all the tags associated with
+ a Volume from the comma-separated tags coming from the LVM API
+
+    Input looks like::
+
+ "ceph.osd_fsid=aaa-fff-bbbb,ceph.osd_id=0"
+
+ For the above example, the expected return value would be::
+
+ {
+ "ceph.osd_fsid": "aaa-fff-bbbb",
+ "ceph.osd_id": "0"
+ }
+ """
+ if not lv_tags:
+ return {}
+ tag_mapping = {}
+ tags = lv_tags.split(',')
+ for tag_assignment in tags:
+ key, value = tag_assignment.split('=', 1)
+ tag_mapping[key] = value
+
+ return tag_mapping
+
+
+def get_api_vgs():
+ """
+ Return the list of group volumes available in the system using flags to
+ include common metadata associated with them
+
+    Command and sample delimited output should look like::
+
+ $ sudo vgs --noheadings --separator=';' \
+ -o vg_name,pv_count,lv_count,snap_count,vg_attr,vg_size,vg_free
+ ubuntubox-vg;1;2;0;wz--n-;299.52g;12.00m
+ osd_vg;3;1;0;wz--n-;29.21g;9.21g
+
+ """
+ fields = 'vg_name,pv_count,lv_count,snap_count,vg_attr,vg_size,vg_free'
+ stdout, stderr, returncode = process.call(
+ ['sudo', 'vgs', '--noheadings', '--separator=";"', '-o', fields]
+ )
+ return _output_parser(stdout, fields)
+
+
+def get_api_lvs():
+ """
+ Return the list of logical volumes available in the system using flags to include common
+ metadata associated with them
+
+    Command and delimited output should look like::
+
+ $ sudo lvs --noheadings --separator=';' -o lv_tags,lv_path,lv_name,vg_name
+ ;/dev/ubuntubox-vg/root;root;ubuntubox-vg
+ ;/dev/ubuntubox-vg/swap_1;swap_1;ubuntubox-vg
+
+ """
+ fields = 'lv_tags,lv_path,lv_name,vg_name,lv_uuid'
+ stdout, stderr, returncode = process.call(
+ ['sudo', 'lvs', '--noheadings', '--separator=";"', '-o', fields]
+ )
+ return _output_parser(stdout, fields)
+
+
+def get_api_pvs():
+ """
+ Return the list of physical volumes configured for lvm and available in the
+ system using flags to include common metadata associated with them like the uuid
+
+    Command and delimited output should look like::
+
+ $ sudo pvs --noheadings --separator=';' -o pv_name,pv_tags,pv_uuid
+ /dev/sda1;;
+ /dev/sdv;;07A4F654-4162-4600-8EB3-88D1E42F368D
+
+ """
+ fields = 'pv_name,pv_tags,pv_uuid'
+
+ # note the use of `pvs -a` which will return every physical volume including
+ # ones that have not been initialized as "pv" by LVM
+ stdout, stderr, returncode = process.call(
+ ['sudo', 'pvs', '-a', '--no-heading', '--separator=";"', '-o', fields]
+ )
+
+ return _output_parser(stdout, fields)
+
+
+def get_lv_from_argument(argument):
+ """
+ Helper proxy function that consumes a possible logical volume passed in from the CLI
+ in the form of `vg/lv`, but with some validation so that an argument that is a full
+ path to a device can be ignored
+ """
+ if argument.startswith('/'):
+ lv = get_lv(lv_path=argument)
+ return lv
+ try:
+ vg_name, lv_name = argument.split('/')
+ except (ValueError, AttributeError):
+ return None
+ return get_lv(lv_name=lv_name, vg_name=vg_name)
+
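How ``get_lv_from_argument`` above dispatches, assuming a hypothetical ``vg0/osd-data`` logical volume exists::

    get_lv_from_argument('/dev/vg0/osd-data')  # full path -> get_lv(lv_path=...)
    get_lv_from_argument('vg0/osd-data')       # vg/lv form -> get_lv(lv_name='osd-data', vg_name='vg0')
    get_lv_from_argument('osd-data')           # no '/': the ValueError is caught, returns None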
+
+def get_lv(lv_name=None, vg_name=None, lv_path=None, lv_uuid=None, lv_tags=None):
+ """
+ Return a matching lv for the current system, requiring ``lv_name``,
+ ``vg_name``, ``lv_path`` or ``tags``. Raises an error if more than one lv
+ is found.
+
+ It is useful to use ``tags`` when trying to find a specific logical volume,
+    but it can also lead to multiple lvs being found, since a lot of metadata
+    is shared between the lvs of a single OSD.
+ """
+ if not any([lv_name, vg_name, lv_path, lv_uuid, lv_tags]):
+ return None
+ lvs = Volumes()
+ return lvs.get(
+ lv_name=lv_name, vg_name=vg_name, lv_path=lv_path, lv_uuid=lv_uuid,
+ lv_tags=lv_tags
+ )
+
+
+def get_pv(pv_name=None, pv_uuid=None, pv_tags=None):
+ """
+ Return a matching pv (physical volume) for the current system, requiring
+ ``pv_name``, ``pv_uuid``, or ``pv_tags``. Raises an error if more than one
+ pv is found.
+ """
+ if not any([pv_name, pv_uuid, pv_tags]):
+ return None
+ pvs = PVolumes()
+ return pvs.get(pv_name=pv_name, pv_uuid=pv_uuid, pv_tags=pv_tags)
+
+
+def create_pv(device):
+ """
+ Create a physical volume from a device, useful when devices need to be later mapped
+ to journals.
+ """
+ process.run([
+ 'sudo',
+ 'pvcreate',
+ '-v', # verbose
+ '-f', # force it
+ '--yes', # answer yes to any prompts
+ device
+ ])
+
+
+def create_vg(name, *devices):
+ """
+ Create a Volume Group. Command looks like::
+
+ vgcreate --force --yes group_name device
+
+    Once created, the volume group is returned as a ``VolumeGroup`` object.
+ """
+ process.run([
+ 'sudo',
+ 'vgcreate',
+ '--force',
+ '--yes',
+ name] + list(devices)
+ )
+
+ vg = get_vg(vg_name=name)
+ return vg
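+
+# Usage sketch (editor's illustration, hypothetical device names): create a vg
+# spanning two devices and get back the ``VolumeGroup`` object:
+#
+#   >>> vg = create_vg('ceph-backing', '/dev/sdb', '/dev/sdc')
+#   >>> vg.name
+#   'ceph-backing'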
+
+
+def remove_lv(path):
+ """
+    Removes a logical volume given its absolute path.
+
+ Will return True if the lv is successfully removed or
+ raises a RuntimeError if the removal fails.
+ """
+ stdout, stderr, returncode = process.call(
+ [
+ 'sudo',
+ 'lvremove',
+ '-v', # verbose
+ '-f', # force it
+ path
+ ],
+ show_command=True,
+ terminal_verbose=True,
+ )
+ if returncode != 0:
+        raise RuntimeError("Unable to remove %s" % path)
+ return True
+
+
+def create_lv(name, group, size=None, tags=None):
+ """
+ Create a Logical Volume in a Volume Group. Command looks like::
+
+ lvcreate -L 50G -n gfslv vg0
+
+    ``name`` and ``group`` are required. If ``size`` is provided it must follow
+    lvm's size notation (like 1G, or 20M). ``tags`` is an optional dictionary
+    whose keys are expected to conform to the convention of being prefixed
+    with "ceph.", like::
+
+ {"ceph.block_device": "/dev/ceph/osd-1"}
+ """
+ # XXX add CEPH_VOLUME_LVM_DEBUG to enable -vvvv on lv operations
+ type_path_tag = {
+ 'journal': 'ceph.journal_device',
+ 'data': 'ceph.data_device',
+ 'block': 'ceph.block_device',
+ 'wal': 'ceph.wal_device',
+ 'db': 'ceph.db_device',
+ 'lockbox': 'ceph.lockbox_device', # XXX might not ever need this lockbox sorcery
+ }
+ if size:
+ process.run([
+ 'sudo',
+ 'lvcreate',
+ '--yes',
+ '-L',
+ '%s' % size,
+ '-n', name, group
+ ])
+    # create the lv with all the space available; this is needed because the
+    # lvcreate invocation is different when no size is given
+ else:
+ process.run([
+ 'sudo',
+ 'lvcreate',
+ '--yes',
+ '-l',
+ '100%FREE',
+ '-n', name, group
+ ])
+
+    lv = get_lv(lv_name=name, vg_name=group)
+    tags = tags or {}  # guard against a None default before iterating tags
+    lv.set_tags(tags)
+
+ # when creating a distinct type, the caller doesn't know what the path will
+ # be so this function will set it after creation using the mapping
+ path_tag = type_path_tag.get(tags.get('ceph.type'))
+ if path_tag:
+ lv.set_tags(
+ {path_tag: lv.lv_path}
+ )
+ return lv
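+
+# Usage sketch (editor's illustration, hypothetical names): create a 10G lv
+# tagged for bluestore; the 'ceph.type' tag drives the extra *_device tag set
+# from the mapping above:
+#
+#   >>> lv = create_lv('osd-block', 'ceph-vg', size='10G',
+#   ...                tags={'ceph.type': 'block'})
+#   >>> lv.tags['ceph.block_device'] == lv.lv_path
+#   True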
+
+
+def get_vg(vg_name=None, vg_tags=None):
+ """
+ Return a matching vg for the current system, requires ``vg_name`` or
+ ``tags``. Raises an error if more than one vg is found.
+
+ It is useful to use ``tags`` when trying to find a specific volume group,
+ but it can also lead to multiple vgs being found.
+ """
+ if not any([vg_name, vg_tags]):
+ return None
+ vgs = VolumeGroups()
+ return vgs.get(vg_name=vg_name, vg_tags=vg_tags)
+
+
+class VolumeGroups(list):
+ """
+ A list of all known volume groups for the current system, with the ability
+ to filter them via keyword arguments.
+ """
+
+ def __init__(self):
+ self._populate()
+
+ def _populate(self):
+ # get all the vgs in the current system
+ for vg_item in get_api_vgs():
+ self.append(VolumeGroup(**vg_item))
+
+ def _purge(self):
+ """
+ Deplete all the items in the list, used internally only so that we can
+ dynamically allocate the items when filtering without the concern of
+ messing up the contents
+ """
+ self[:] = []
+
+ def _filter(self, vg_name=None, vg_tags=None):
+ """
+ The actual method that filters using a new list. Useful so that other
+ methods that do not want to alter the contents of the list (e.g.
+ ``self.find``) can operate safely.
+
+ .. note:: ``vg_tags`` is not yet implemented
+ """
+ filtered = [i for i in self]
+ if vg_name:
+ filtered = [i for i in filtered if i.vg_name == vg_name]
+
+ # at this point, `filtered` has either all the volumes in self or is an
+ # actual filtered list if any filters were applied
+ if vg_tags:
+ tag_filtered = []
+ for volume in filtered:
+ matches = all(volume.tags.get(k) == str(v) for k, v in vg_tags.items())
+ if matches:
+ tag_filtered.append(volume)
+ return tag_filtered
+
+ return filtered
+
+ def filter(self, vg_name=None, vg_tags=None):
+ """
+ Filter out groups on top level attributes like ``vg_name`` or by
+ ``vg_tags`` where a dict is required. For example, to find a Ceph group
+ with dmcache as the type, the filter would look like::
+
+ vg_tags={'ceph.type': 'dmcache'}
+
+ .. warning:: These tags are not documented because they are currently
+ unused, but are here to maintain API consistency
+ """
+ if not any([vg_name, vg_tags]):
+ raise TypeError('.filter() requires vg_name or vg_tags (none given)')
+ # first find the filtered volumes with the values in self
+ filtered_groups = self._filter(
+ vg_name=vg_name,
+ vg_tags=vg_tags
+ )
+ # then purge everything
+ self._purge()
+ # and add the filtered items
+ self.extend(filtered_groups)
+
+ def get(self, vg_name=None, vg_tags=None):
+ """
+        This is a bit expensive, since it will filter the list for all the
+        matching items with the criteria given, and then return the single
+        matching item.
+
+ This method does *not* alter the list, and it will raise an error if
+ multiple VGs are matched
+
+ It is useful to use ``tags`` when trying to find a specific volume group,
+ but it can also lead to multiple vgs being found (although unlikely)
+ """
+ if not any([vg_name, vg_tags]):
+ return None
+ vgs = self._filter(
+ vg_name=vg_name,
+ vg_tags=vg_tags
+ )
+ if not vgs:
+ return None
+ if len(vgs) > 1:
+ # this is probably never going to happen, but it is here to keep
+ # the API code consistent
+ raise MultipleVGsError(vg_name)
+ return vgs[0]
+
+
+class Volumes(list):
+ """
+ A list of all known (logical) volumes for the current system, with the ability
+ to filter them via keyword arguments.
+ """
+
+ def __init__(self):
+ self._populate()
+
+ def _populate(self):
+ # get all the lvs in the current system
+ for lv_item in get_api_lvs():
+ self.append(Volume(**lv_item))
+
+ def _purge(self):
+ """
+ Deplete all the items in the list, used internally only so that we can
+ dynamically allocate the items when filtering without the concern of
+ messing up the contents
+ """
+ self[:] = []
+
+ def _filter(self, lv_name=None, vg_name=None, lv_path=None, lv_uuid=None, lv_tags=None):
+ """
+ The actual method that filters using a new list. Useful so that other
+ methods that do not want to alter the contents of the list (e.g.
+ ``self.find``) can operate safely.
+ """
+ filtered = [i for i in self]
+ if lv_name:
+ filtered = [i for i in filtered if i.lv_name == lv_name]
+
+ if vg_name:
+ filtered = [i for i in filtered if i.vg_name == vg_name]
+
+ if lv_uuid:
+ filtered = [i for i in filtered if i.lv_uuid == lv_uuid]
+
+ if lv_path:
+ filtered = [i for i in filtered if i.lv_path == lv_path]
+
+ # at this point, `filtered` has either all the volumes in self or is an
+ # actual filtered list if any filters were applied
+ if lv_tags:
+ tag_filtered = []
+ for volume in filtered:
+ # all the tags we got need to match on the volume
+ matches = all(volume.tags.get(k) == str(v) for k, v in lv_tags.items())
+ if matches:
+ tag_filtered.append(volume)
+ return tag_filtered
+
+ return filtered
+
+ def filter(self, lv_name=None, vg_name=None, lv_path=None, lv_uuid=None, lv_tags=None):
+ """
+ Filter out volumes on top level attributes like ``lv_name`` or by
+ ``lv_tags`` where a dict is required. For example, to find a volume
+ that has an OSD ID of 0, the filter would look like::
+
+ lv_tags={'ceph.osd_id': '0'}
+
+ """
+ if not any([lv_name, vg_name, lv_path, lv_uuid, lv_tags]):
+ raise TypeError('.filter() requires lv_name, vg_name, lv_path, lv_uuid, or tags (none given)')
+ # first find the filtered volumes with the values in self
+ filtered_volumes = self._filter(
+ lv_name=lv_name,
+ vg_name=vg_name,
+ lv_path=lv_path,
+ lv_uuid=lv_uuid,
+ lv_tags=lv_tags
+ )
+ # then purge everything
+ self._purge()
+ # and add the filtered items
+ self.extend(filtered_volumes)
+
+ def get(self, lv_name=None, vg_name=None, lv_path=None, lv_uuid=None, lv_tags=None):
+ """
+        This is a bit expensive, since it will filter the list for all the
+        matching items with the criteria given, and then return the single
+        matching item.
+
+ This method does *not* alter the list, and it will raise an error if
+ multiple LVs are matched
+
+ It is useful to use ``tags`` when trying to find a specific logical volume,
+ but it can also lead to multiple lvs being found, since a lot of metadata
+        is shared between the lvs of a single OSD.
+ """
+ if not any([lv_name, vg_name, lv_path, lv_uuid, lv_tags]):
+ return None
+ lvs = self._filter(
+ lv_name=lv_name,
+ vg_name=vg_name,
+ lv_path=lv_path,
+ lv_uuid=lv_uuid,
+ lv_tags=lv_tags
+ )
+ if not lvs:
+ return None
+ if len(lvs) > 1:
+ raise MultipleLVsError(lv_name, lv_path)
+ return lvs[0]
+
+
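+# Filtering sketch (editor's illustration, not part of the module): tags can
+# narrow a Volumes listing down to the lvs of a single OSD; for a filestore
+# OSD whose journal is also an lv this would look like:
+#
+#   >>> lvs = Volumes()
+#   >>> lvs.filter(lv_tags={'ceph.osd_id': '0'})
+#   >>> sorted(lv.tags.get('ceph.type') for lv in lvs)
+#   ['data', 'journal']
+
+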
+class PVolumes(list):
+ """
+ A list of all known (physical) volumes for the current system, with the ability
+ to filter them via keyword arguments.
+ """
+
+ def __init__(self):
+ self._populate()
+
+ def _populate(self):
+ # get all the pvs in the current system
+ for pv_item in get_api_pvs():
+ self.append(PVolume(**pv_item))
+
+ def _purge(self):
+ """
+ Deplete all the items in the list, used internally only so that we can
+ dynamically allocate the items when filtering without the concern of
+ messing up the contents
+ """
+ self[:] = []
+
+ def _filter(self, pv_name=None, pv_uuid=None, pv_tags=None):
+ """
+ The actual method that filters using a new list. Useful so that other
+ methods that do not want to alter the contents of the list (e.g.
+ ``self.find``) can operate safely.
+ """
+ filtered = [i for i in self]
+ if pv_name:
+ filtered = [i for i in filtered if i.pv_name == pv_name]
+
+ if pv_uuid:
+ filtered = [i for i in filtered if i.pv_uuid == pv_uuid]
+
+ # at this point, `filtered` has either all the physical volumes in self
+ # or is an actual filtered list if any filters were applied
+ if pv_tags:
+ tag_filtered = []
+ for pvolume in filtered:
+ matches = all(pvolume.tags.get(k) == str(v) for k, v in pv_tags.items())
+ if matches:
+ tag_filtered.append(pvolume)
+ # return the tag_filtered pvolumes here, the `filtered` list is no
+            # longer usable
+ return tag_filtered
+
+ return filtered
+
+ def filter(self, pv_name=None, pv_uuid=None, pv_tags=None):
+ """
+ Filter out volumes on top level attributes like ``pv_name`` or by
+ ``pv_tags`` where a dict is required. For example, to find a physical volume
+ that has an OSD ID of 0, the filter would look like::
+
+ pv_tags={'ceph.osd_id': '0'}
+
+ """
+ if not any([pv_name, pv_uuid, pv_tags]):
+ raise TypeError('.filter() requires pv_name, pv_uuid, or pv_tags (none given)')
+ # first find the filtered volumes with the values in self
+ filtered_volumes = self._filter(
+ pv_name=pv_name,
+ pv_uuid=pv_uuid,
+ pv_tags=pv_tags
+ )
+ # then purge everything
+ self._purge()
+ # and add the filtered items
+ self.extend(filtered_volumes)
+
+ def get(self, pv_name=None, pv_uuid=None, pv_tags=None):
+ """
+        This is a bit expensive, since it will filter the list for all the
+        matching items with the criteria given, and then return the single
+        matching item.
+
+ This method does *not* alter the list, and it will raise an error if
+ multiple pvs are matched
+
+        It is useful to use ``tags`` when trying to find a specific physical
+        volume, but it can also lead to multiple pvs being found, since a lot
+        of metadata is shared between the pvs of a single OSD.
+ """
+ if not any([pv_name, pv_uuid, pv_tags]):
+ return None
+ pvs = self._filter(
+ pv_name=pv_name,
+ pv_uuid=pv_uuid,
+ pv_tags=pv_tags
+ )
+ if not pvs:
+ return None
+ if len(pvs) > 1:
+ raise MultiplePVsError(pv_name)
+ return pvs[0]
+
+
+class VolumeGroup(object):
+ """
+ Represents an LVM group, with some top-level attributes like ``vg_name``
+ """
+
+ def __init__(self, **kw):
+ for k, v in kw.items():
+ setattr(self, k, v)
+ self.name = kw['vg_name']
+ self.tags = parse_tags(kw.get('vg_tags', ''))
+
+ def __str__(self):
+ return '<%s>' % self.name
+
+ def __repr__(self):
+ return self.__str__()
+
+
+class Volume(object):
+ """
+ Represents a Logical Volume from LVM, with some top-level attributes like
+ ``lv_name`` and parsed tags as a dictionary of key/value pairs.
+ """
+
+ def __init__(self, **kw):
+ for k, v in kw.items():
+ setattr(self, k, v)
+ self.lv_api = kw
+ self.name = kw['lv_name']
+ self.tags = parse_tags(kw['lv_tags'])
+
+ def __str__(self):
+ return '<%s>' % self.lv_api['lv_path']
+
+ def __repr__(self):
+ return self.__str__()
+
+ def as_dict(self):
+ obj = {}
+ obj.update(self.lv_api)
+ obj['tags'] = self.tags
+ obj['name'] = self.name
+ obj['type'] = self.tags['ceph.type']
+ obj['path'] = self.lv_path
+ return obj
+
+ def clear_tags(self):
+ """
+ Removes all tags from the Logical Volume.
+ """
+ for k, v in self.tags.items():
+ tag = "%s=%s" % (k, v)
+ process.run(['sudo', 'lvchange', '--deltag', tag, self.lv_path])
+
+ def set_tags(self, tags):
+ """
+ :param tags: A dictionary of tag names and values, like::
+
+ {
+ "ceph.osd_fsid": "aaa-fff-bbbb",
+ "ceph.osd_id": "0"
+ }
+
+ At the end of all modifications, the tags are refreshed to reflect
+ LVM's most current view.
+ """
+ for k, v in tags.items():
+ self.set_tag(k, v)
+ # after setting all the tags, refresh them for the current object, use the
+ # lv_* identifiers to filter because those shouldn't change
+ lv_object = get_lv(lv_name=self.lv_name, lv_path=self.lv_path)
+ self.tags = lv_object.tags
+
+ def set_tag(self, key, value):
+ """
+ Set the key/value pair as an LVM tag. Does not "refresh" the values of
+ the current object for its tags. Meant to be a "fire and forget" type
+ of modification.
+ """
+ # remove it first if it exists
+ if self.tags.get(key):
+ current_value = self.tags[key]
+ tag = "%s=%s" % (key, current_value)
+ process.call(['sudo', 'lvchange', '--deltag', tag, self.lv_api['lv_path']])
+
+ process.call(
+ [
+ 'sudo', 'lvchange',
+ '--addtag', '%s=%s' % (key, value), self.lv_path
+ ]
+ )
+
+
+class PVolume(object):
+ """
+ Represents a Physical Volume from LVM, with some top-level attributes like
+ ``pv_name`` and parsed tags as a dictionary of key/value pairs.
+ """
+
+ def __init__(self, **kw):
+ for k, v in kw.items():
+ setattr(self, k, v)
+ self.pv_api = kw
+ self.name = kw['pv_name']
+ self.tags = parse_tags(kw['pv_tags'])
+
+ def __str__(self):
+ return '<%s>' % self.pv_api['pv_name']
+
+ def __repr__(self):
+ return self.__str__()
+
+ def set_tags(self, tags):
+ """
+ :param tags: A dictionary of tag names and values, like::
+
+ {
+ "ceph.osd_fsid": "aaa-fff-bbbb",
+ "ceph.osd_id": "0"
+ }
+
+ At the end of all modifications, the tags are refreshed to reflect
+ LVM's most current view.
+ """
+ for k, v in tags.items():
+ self.set_tag(k, v)
+ # after setting all the tags, refresh them for the current object, use the
+ # pv_* identifiers to filter because those shouldn't change
+ pv_object = get_pv(pv_name=self.pv_name, pv_uuid=self.pv_uuid)
+ self.tags = pv_object.tags
+
+ def set_tag(self, key, value):
+ """
+ Set the key/value pair as an LVM tag. Does not "refresh" the values of
+ the current object for its tags. Meant to be a "fire and forget" type
+ of modification.
+
+ **warning**: Altering tags on a PV has to be done ensuring that the
+ device is actually the one intended. ``pv_name`` is *not* a persistent
+ value, only ``pv_uuid`` is. Using ``pv_uuid`` is the best way to make
+ sure the device getting changed is the one needed.
+ """
+ # remove it first if it exists
+ if self.tags.get(key):
+ current_value = self.tags[key]
+ tag = "%s=%s" % (key, current_value)
+ process.call(['sudo', 'pvchange', '--deltag', tag, self.pv_name])
+
+ process.call(
+ [
+ 'sudo', 'pvchange',
+ '--addtag', '%s=%s' % (key, value), self.pv_name
+ ]
+ )
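+
+
+# End-to-end sketch (editor's illustration, reusing the uuid from the pvs
+# sample above): tag a pv by its persistent ``pv_uuid``, as the warning in
+# ``set_tag`` recommends:
+#
+#   >>> pv = get_pv(pv_uuid='07A4F654-4162-4600-8EB3-88D1E42F368D')
+#   >>> pv.set_tags({'ceph.osd_id': '0'})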
try:
return f(*a, **kw)
except catch as e:
+ import logging
+ logger = logging.getLogger('ceph_volume')
+ logger.exception('exception caught by decorator')
if os.environ.get('CEPH_VOLUME_DEBUG'):
raise
if handler:
-from . import lvm # noqa
+from . import lvm, simple # noqa
from __future__ import print_function
import argparse
+import logging
+import os
from textwrap import dedent
from ceph_volume import process, conf, decorators
from ceph_volume.util import system, disk
+from ceph_volume.util import prepare as prepare_utils
from ceph_volume.systemd import systemctl
-from . import api
+from ceph_volume.api import lvm as api
+
+
+logger = logging.getLogger(__name__)
def activate_filestore(lvs):
# find the osd
osd_lv = lvs.get(lv_tags={'ceph.type': 'data'})
+ if not osd_lv:
+ raise RuntimeError('Unable to find a data LV for filestore activation')
osd_id = osd_lv.tags['ceph.osd_id']
+ conf.cluster = osd_lv.tags['ceph.cluster_name']
# it may have a volume with a journal
osd_journal_lv = lvs.get(lv_tags={'ceph.type': 'journal'})
# TODO: add sensible error reporting if this is ever the case
# mount the osd
source = osd_lv.lv_path
destination = '/var/lib/ceph/osd/%s-%s' % (conf.cluster, osd_id)
- if not system.is_mounted(source, destination=destination):
+ if not system.device_is_mounted(source, destination=destination):
process.run(['sudo', 'mount', '-v', source, destination])
# always re-do the symlink regardless if it exists, so that the journal
systemctl.start_osd(osd_id)
+def get_osd_device_path(osd_lv, lvs, device_type):
+ """
+    ``device_type`` can be one of ``db``, ``wal`` or ``block`` so that
+    ``lvs`` (a ``Volumes`` object) can be queried for the device's uuid tag,
+    falling back to ``blkid`` if no matching lv is found.
+
+    Return a path if possible, otherwise ``None``, since some of these
+    devices are optional.
+ """
+ uuid_tag = 'ceph.%s_uuid' % device_type
+ device_uuid = osd_lv.tags.get(uuid_tag)
+ if not device_uuid:
+ return None
+
+ device_lv = lvs.get(lv_uuid=device_uuid)
+ if device_lv:
+ return device_lv.lv_path
+ else:
+ # this could be a regular device, so query it with blkid
+ physical_device = disk.get_device_from_partuuid(device_uuid)
+ return physical_device or None
+
+
def activate_bluestore(lvs):
- # TODO
- pass
+ # find the osd
+ osd_lv = lvs.get(lv_tags={'ceph.type': 'block'})
+ osd_id = osd_lv.tags['ceph.osd_id']
+ conf.cluster = osd_lv.tags['ceph.cluster_name']
+ osd_fsid = osd_lv.tags['ceph.osd_fsid']
+ db_device_path = get_osd_device_path(osd_lv, lvs, 'db')
+ wal_device_path = get_osd_device_path(osd_lv, lvs, 'wal')
+
+ # mount on tmpfs the osd directory
+ osd_path = '/var/lib/ceph/osd/%s-%s' % (conf.cluster, osd_id)
+ if not system.path_is_mounted(osd_path):
+ # mkdir -p and mount as tmpfs
+ prepare_utils.create_osd_path(osd_id, tmpfs=True)
+ # XXX This needs to be removed once ceph-bluestore-tool can deal with
+ # symlinks that exist in the osd dir
+ for link_name in ['block', 'block.db', 'block.wal']:
+ link_path = os.path.join(osd_path, link_name)
+ if os.path.exists(link_path):
+ os.unlink(os.path.join(osd_path, link_name))
+    # Once symlinks are removed, the osd dir can be 'primed' again.
+ process.run([
+ 'sudo', 'ceph-bluestore-tool', '--cluster=%s' % conf.cluster,
+ 'prime-osd-dir', '--dev', osd_lv.lv_path,
+ '--path', osd_path])
+ # always re-do the symlink regardless if it exists, so that the block,
+ # block.wal, and block.db devices that may have changed can be mapped
+ # correctly every time
+ process.run(['sudo', 'ln', '-snf', osd_lv.lv_path, os.path.join(osd_path, 'block')])
+ system.chown(os.path.join(osd_path, 'block'))
+ system.chown(osd_path)
+ if db_device_path:
+ destination = os.path.join(osd_path, 'block.db')
+ process.run(['sudo', 'ln', '-snf', db_device_path, destination])
+ system.chown(db_device_path)
+ if wal_device_path:
+ destination = os.path.join(osd_path, 'block.wal')
+ process.run(['sudo', 'ln', '-snf', wal_device_path, destination])
+ system.chown(wal_device_path)
+
+ # enable the ceph-volume unit for this OSD
+ systemctl.enable_volume(osd_id, osd_fsid, 'lvm')
+
+ # start the OSD
+ systemctl.start_osd(osd_id)
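+
+# Resulting layout sketch (editor's illustration, hypothetical paths): after
+# activation the tmpfs-backed osd dir holds symlinks into LVM, roughly:
+#
+#   /var/lib/ceph/osd/ceph-0/
+#       block    -> /dev/ceph-vg/osd-block
+#       block.db -> /dev/sdc1   (only present when a db device was tagged)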
class Activate(object):
lvs.filter(lv_tags={'ceph.osd_fsid': args.osd_fsid})
if not lvs:
raise RuntimeError('could not find osd.%s with fsid %s' % (args.osd_id, args.osd_fsid))
- activate_filestore(lvs)
+ # This argument is only available when passed in directly or via
+ # systemd, not when ``create`` is being used
+ if getattr(args, 'auto_detect_objectstore', False):
+ logger.info('auto detecting objectstore')
+ # may get multiple lvs, so can't do lvs.get() calls here
+ for lv in lvs:
+ has_journal = lv.tags.get('ceph.journal_uuid')
+ if has_journal:
+ logger.info('found a journal associated with the OSD, assuming filestore')
+ return activate_filestore(lvs)
+ logger.info('unable to find a journal associated with the OSD, assuming bluestore')
+ return activate_bluestore(lvs)
+ if args.bluestore:
+ activate_bluestore(lvs)
+ elif args.filestore:
+ activate_filestore(lvs)
def main(self):
sub_command_help = dedent("""
nargs='?',
help='The FSID of the OSD, similar to a SHA1'
)
+ parser.add_argument(
+ '--auto-detect-objectstore',
+ action='store_true',
+ help='Autodetect the objectstore by inspecting the OSD',
+ )
parser.add_argument(
'--bluestore',
- action='store_true', default=False,
+ action='store_true',
     help='bluestore objectstore',
)
parser.add_argument(
'--filestore',
- action='store_true', default=True,
+ action='store_true',
     help='filestore objectstore',
)
if len(self.argv) == 0:
print(sub_command_help)
return
args = parser.parse_args(self.argv)
+ # Default to bluestore here since defaulting it in add_argument may
+ # cause both to be True
+ if not args.bluestore and not args.filestore:
+ args.bluestore = True
self.activate(args)
+++ /dev/null
-"""
-API for CRUD lvm tag operations. Follows the Ceph LVM tag naming convention
-that prefixes tags with ``ceph.`` and uses ``=`` for assignment, and provides
-set of utilities for interacting with LVM.
-"""
-from ceph_volume import process
-from ceph_volume.exceptions import MultipleLVsError, MultipleVGsError, MultiplePVsError
-
-
-def _output_parser(output, fields):
- """
- Newer versions of LVM allow ``--reportformat=json``, but older versions,
- like the one included in Xenial do not. LVM has the ability to filter and
- format its output so we assume the output will be in a format this parser
- can handle (using ',' as a delimiter)
-
- :param fields: A string, possibly using ',' to group many items, as it
- would be used on the CLI
- :param output: The CLI output from the LVM call
- """
- field_items = fields.split(',')
- report = []
- for line in output:
- # clear the leading/trailing whitespace
- line = line.strip()
-
- # remove the extra '"' in each field
- line = line.replace('"', '')
-
- # prevent moving forward with empty contents
- if not line:
- continue
-
- # spliting on ';' because that is what the lvm call uses as
- # '--separator'
- output_items = [i.strip() for i in line.split(';')]
- # map the output to the fiels
- report.append(
- dict(zip(field_items, output_items))
- )
-
- return report
-
-
-def parse_tags(lv_tags):
- """
- Return a dictionary mapping of all the tags associated with
- a Volume from the comma-separated tags coming from the LVM API
-
- Input look like::
-
- "ceph.osd_fsid=aaa-fff-bbbb,ceph.osd_id=0"
-
- For the above example, the expected return value would be::
-
- {
- "ceph.osd_fsid": "aaa-fff-bbbb",
- "ceph.osd_id": "0"
- }
- """
- if not lv_tags:
- return {}
- tag_mapping = {}
- tags = lv_tags.split(',')
- for tag_assignment in tags:
- key, value = tag_assignment.split('=', 1)
- tag_mapping[key] = value
-
- return tag_mapping
-
-
-def get_api_vgs():
- """
- Return the list of group volumes available in the system using flags to
- include common metadata associated with them
-
- Command and sample delimeted output, should look like::
-
- $ sudo vgs --noheadings --separator=';' \
- -o vg_name,pv_count,lv_count,snap_count,vg_attr,vg_size,vg_free
- ubuntubox-vg;1;2;0;wz--n-;299.52g;12.00m
- osd_vg;3;1;0;wz--n-;29.21g;9.21g
-
- """
- fields = 'vg_name,pv_count,lv_count,snap_count,vg_attr,vg_size,vg_free'
- stdout, stderr, returncode = process.call(
- ['sudo', 'vgs', '--noheadings', '--separator=";"', '-o', fields]
- )
- return _output_parser(stdout, fields)
-
-
-def get_api_lvs():
- """
- Return the list of logical volumes available in the system using flags to include common
- metadata associated with them
-
- Command and delimeted output, should look like::
-
- $ sudo lvs --noheadings --separator=';' -o lv_tags,lv_path,lv_name,vg_name
- ;/dev/ubuntubox-vg/root;root;ubuntubox-vg
- ;/dev/ubuntubox-vg/swap_1;swap_1;ubuntubox-vg
-
- """
- fields = 'lv_tags,lv_path,lv_name,vg_name,lv_uuid'
- stdout, stderr, returncode = process.call(
- ['sudo', 'lvs', '--noheadings', '--separator=";"', '-o', fields]
- )
- return _output_parser(stdout, fields)
-
-
-def get_api_pvs():
- """
- Return the list of physical volumes configured for lvm and available in the
- system using flags to include common metadata associated with them like the uuid
-
- Command and delimeted output, should look like::
-
- $ sudo pvs --noheadings --separator=';' -o pv_name,pv_tags,pv_uuid
- /dev/sda1;;
- /dev/sdv;;07A4F654-4162-4600-8EB3-88D1E42F368D
-
- """
- fields = 'pv_name,pv_tags,pv_uuid'
-
- # note the use of `pvs -a` which will return every physical volume including
- # ones that have not been initialized as "pv" by LVM
- stdout, stderr, returncode = process.call(
- ['sudo', 'pvs', '-a', '--no-heading', '--separator=";"', '-o', fields]
- )
-
- return _output_parser(stdout, fields)
-
-
-def get_lv(lv_name=None, vg_name=None, lv_path=None, lv_uuid=None, lv_tags=None):
- """
- Return a matching lv for the current system, requiring ``lv_name``,
- ``vg_name``, ``lv_path`` or ``tags``. Raises an error if more than one lv
- is found.
-
- It is useful to use ``tags`` when trying to find a specific logical volume,
- but it can also lead to multiple lvs being found, since a lot of metadata
- is shared between lvs of a distinct OSD.
- """
- if not any([lv_name, vg_name, lv_path, lv_uuid, lv_tags]):
- return None
- lvs = Volumes()
- return lvs.get(
- lv_name=lv_name, vg_name=vg_name, lv_path=lv_path, lv_uuid=lv_uuid,
- lv_tags=lv_tags
- )
-
-
-def get_pv(pv_name=None, pv_uuid=None, pv_tags=None):
- """
- Return a matching pv (physical volume) for the current system, requiring
- ``pv_name``, ``pv_uuid``, or ``pv_tags``. Raises an error if more than one
- pv is found.
- """
- if not any([pv_name, pv_uuid, pv_tags]):
- return None
- pvs = PVolumes()
- return pvs.get(pv_name=pv_name, pv_uuid=pv_uuid, pv_tags=pv_tags)
-
-
-def create_pv(device):
- """
- Create a physical volume from a device, useful when devices need to be later mapped
- to journals.
- """
- process.run([
- 'sudo',
- 'pvcreate',
- '-v', # verbose
- '-f', # force it
- '--yes', # answer yes to any prompts
- device
- ])
-
-
-def create_lv(name, group, size=None, **tags):
- """
- Create a Logical Volume in a Volume Group. Command looks like::
-
- lvcreate -L 50G -n gfslv vg0
-
- ``name``, ``group``, and ``size`` are required. Tags are optional and are "translated" to include
- the prefixes for the Ceph LVM tag API.
-
- """
- # XXX add CEPH_VOLUME_LVM_DEBUG to enable -vvvv on lv operations
- type_path_tag = {
- 'journal': 'ceph.journal_device',
- 'data': 'ceph.data_device',
- 'block': 'ceph.block',
- 'wal': 'ceph.wal',
- 'db': 'ceph.db',
- 'lockbox': 'ceph.lockbox_device',
- }
- if size:
- process.run([
- 'sudo',
- 'lvcreate',
- '--yes',
- '-L',
- '%sG' % size,
- '-n', name, group
- ])
- # create the lv with all the space available, this is needed because the
- # system call is different for LVM
- else:
- process.run([
- 'sudo',
- 'lvcreate',
- '--yes',
- '-l',
- '100%FREE',
- '-n', name, group
- ])
-
- lv = get_lv(lv_name=name, vg_name=group)
- ceph_tags = {}
- for k, v in tags.items():
- ceph_tags['ceph.%s' % k] = v
- lv.set_tags(ceph_tags)
-
- # when creating a distinct type, the caller doesn't know what the path will
- # be so this function will set it after creation using the mapping
- path_tag = type_path_tag[tags['type']]
- lv.set_tags(
- {path_tag: lv.lv_path}
- )
- return lv
-
-
-def get_vg(vg_name=None, vg_tags=None):
- """
- Return a matching vg for the current system, requires ``vg_name`` or
- ``tags``. Raises an error if more than one vg is found.
-
- It is useful to use ``tags`` when trying to find a specific volume group,
- but it can also lead to multiple vgs being found.
- """
- if not any([vg_name, vg_tags]):
- return None
- vgs = VolumeGroups()
- return vgs.get(vg_name=vg_name, vg_tags=vg_tags)
-
-
-class VolumeGroups(list):
- """
- A list of all known volume groups for the current system, with the ability
- to filter them via keyword arguments.
- """
-
- def __init__(self):
- self._populate()
-
- def _populate(self):
- # get all the vgs in the current system
- for vg_item in get_api_vgs():
- self.append(VolumeGroup(**vg_item))
-
- def _purge(self):
- """
- Deplete all the items in the list, used internally only so that we can
- dynamically allocate the items when filtering without the concern of
- messing up the contents
- """
- self[:] = []
-
- def _filter(self, vg_name=None, vg_tags=None):
- """
- The actual method that filters using a new list. Useful so that other
- methods that do not want to alter the contents of the list (e.g.
- ``self.find``) can operate safely.
-
- .. note:: ``vg_tags`` is not yet implemented
- """
- filtered = [i for i in self]
- if vg_name:
- filtered = [i for i in filtered if i.vg_name == vg_name]
-
- # at this point, `filtered` has either all the volumes in self or is an
- # actual filtered list if any filters were applied
- if vg_tags:
- tag_filtered = []
- for volume in filtered:
- matches = all(volume.tags.get(k) == str(v) for k, v in vg_tags.items())
- if matches:
- tag_filtered.append(volume)
- return tag_filtered
-
- return filtered
-
- def filter(self, vg_name=None, vg_tags=None):
- """
- Filter out groups on top level attributes like ``vg_name`` or by
- ``vg_tags`` where a dict is required. For example, to find a Ceph group
- with dmcache as the type, the filter would look like::
-
- vg_tags={'ceph.type': 'dmcache'}
-
- .. warning:: These tags are not documented because they are currently
- unused, but are here to maintain API consistency
- """
- if not any([vg_name, vg_tags]):
- raise TypeError('.filter() requires vg_name or vg_tags (none given)')
- # first find the filtered volumes with the values in self
- filtered_groups = self._filter(
- vg_name=vg_name,
- vg_tags=vg_tags
- )
- # then purge everything
- self._purge()
- # and add the filtered items
- self.extend(filtered_groups)
-
- def get(self, vg_name=None, vg_tags=None):
- """
- This is a bit expensive, since it will try to filter out all the
- matching items in the list, filter them out applying anything that was
- added and return the matching item.
-
- This method does *not* alter the list, and it will raise an error if
- multiple VGs are matched
-
- It is useful to use ``tags`` when trying to find a specific volume group,
- but it can also lead to multiple vgs being found (although unlikely)
- """
- if not any([vg_name, vg_tags]):
- return None
- vgs = self._filter(
- vg_name=vg_name,
- vg_tags=vg_tags
- )
- if not vgs:
- return None
- if len(vgs) > 1:
- # this is probably never going to happen, but it is here to keep
- # the API code consistent
- raise MultipleVGsError(vg_name)
- return vgs[0]
-
-
-class Volumes(list):
- """
- A list of all known (logical) volumes for the current system, with the ability
- to filter them via keyword arguments.
- """
-
- def __init__(self):
- self._populate()
-
- def _populate(self):
- # get all the lvs in the current system
- for lv_item in get_api_lvs():
- self.append(Volume(**lv_item))
-
- def _purge(self):
- """
- Deplete all the items in the list, used internally only so that we can
- dynamically allocate the items when filtering without the concern of
- messing up the contents
- """
- self[:] = []
-
- def _filter(self, lv_name=None, vg_name=None, lv_path=None, lv_uuid=None, lv_tags=None):
- """
- The actual method that filters using a new list. Useful so that other
- methods that do not want to alter the contents of the list (e.g.
- ``self.find``) can operate safely.
- """
- filtered = [i for i in self]
- if lv_name:
- filtered = [i for i in filtered if i.lv_name == lv_name]
-
- if vg_name:
- filtered = [i for i in filtered if i.vg_name == vg_name]
-
- if lv_uuid:
- filtered = [i for i in filtered if i.lv_uuid == lv_uuid]
-
- if lv_path:
- filtered = [i for i in filtered if i.lv_path == lv_path]
-
- # at this point, `filtered` has either all the volumes in self or is an
- # actual filtered list if any filters were applied
- if lv_tags:
- tag_filtered = []
- for volume in filtered:
- # all the tags we got need to match on the volume
- matches = all(volume.tags.get(k) == str(v) for k, v in lv_tags.items())
- if matches:
- tag_filtered.append(volume)
- return tag_filtered
-
- return filtered
-
- def filter(self, lv_name=None, vg_name=None, lv_path=None, lv_uuid=None, lv_tags=None):
- """
- Filter out volumes on top level attributes like ``lv_name`` or by
- ``lv_tags`` where a dict is required. For example, to find a volume
- that has an OSD ID of 0, the filter would look like::
-
- lv_tags={'ceph.osd_id': '0'}
-
- """
- if not any([lv_name, vg_name, lv_path, lv_uuid, lv_tags]):
- raise TypeError('.filter() requires lv_name, vg_name, lv_path, lv_uuid, or tags (none given)')
- # first find the filtered volumes with the values in self
- filtered_volumes = self._filter(
- lv_name=lv_name,
- vg_name=vg_name,
- lv_path=lv_path,
- lv_uuid=lv_uuid,
- lv_tags=lv_tags
- )
- # then purge everything
- self._purge()
- # and add the filtered items
- self.extend(filtered_volumes)
-
- def get(self, lv_name=None, vg_name=None, lv_path=None, lv_uuid=None, lv_tags=None):
- """
- This is a bit expensive, since it will try to filter out all the
- matching items in the list, filter them out applying anything that was
- added and return the matching item.
-
- This method does *not* alter the list, and it will raise an error if
- multiple LVs are matched
-
- It is useful to use ``tags`` when trying to find a specific logical volume,
- but it can also lead to multiple lvs being found, since a lot of metadata
- is shared between lvs of a distinct OSD.
- """
- if not any([lv_name, vg_name, lv_path, lv_uuid, lv_tags]):
- return None
- lvs = self._filter(
- lv_name=lv_name,
- vg_name=vg_name,
- lv_path=lv_path,
- lv_uuid=lv_uuid,
- lv_tags=lv_tags
- )
- if not lvs:
- return None
- if len(lvs) > 1:
- raise MultipleLVsError(lv_name, lv_path)
- return lvs[0]
-
-
-class PVolumes(list):
- """
- A list of all known (physical) volumes for the current system, with the ability
- to filter them via keyword arguments.
- """
-
- def __init__(self):
- self._populate()
-
- def _populate(self):
- # get all the pvs in the current system
- for pv_item in get_api_pvs():
- self.append(PVolume(**pv_item))
-
- def _purge(self):
- """
- Deplete all the items in the list, used internally only so that we can
- dynamically allocate the items when filtering without the concern of
- messing up the contents
- """
- self[:] = []
-
- def _filter(self, pv_name=None, pv_uuid=None, pv_tags=None):
- """
- The actual method that filters using a new list. Useful so that other
- methods that do not want to alter the contents of the list (e.g.
- ``self.find``) can operate safely.
- """
- filtered = [i for i in self]
- if pv_name:
- filtered = [i for i in filtered if i.pv_name == pv_name]
-
- if pv_uuid:
- filtered = [i for i in filtered if i.pv_uuid == pv_uuid]
-
- # at this point, `filtered` has either all the physical volumes in self
- # or is an actual filtered list if any filters were applied
- if pv_tags:
- tag_filtered = []
- for pvolume in filtered:
- matches = all(pvolume.tags.get(k) == str(v) for k, v in pv_tags.items())
- if matches:
- tag_filtered.append(pvolume)
- # return the tag_filtered pvolumes here, the `filtered` list is no
- # longer useable
- return tag_filtered
-
- return filtered
-
- def filter(self, pv_name=None, pv_uuid=None, pv_tags=None):
- """
- Filter out volumes on top level attributes like ``pv_name`` or by
- ``pv_tags`` where a dict is required. For example, to find a physical volume
- that has an OSD ID of 0, the filter would look like::
-
- pv_tags={'ceph.osd_id': '0'}
-
- """
- if not any([pv_name, pv_uuid, pv_tags]):
- raise TypeError('.filter() requires pv_name, pv_uuid, or pv_tags (none given)')
- # first find the filtered volumes with the values in self
- filtered_volumes = self._filter(
- pv_name=pv_name,
- pv_uuid=pv_uuid,
- pv_tags=pv_tags
- )
- # then purge everything
- self._purge()
- # and add the filtered items
- self.extend(filtered_volumes)
-
- def get(self, pv_name=None, pv_uuid=None, pv_tags=None):
- """
- This is a bit expensive, since it will try to filter out all the
- matching items in the list, filter them out applying anything that was
- added and return the matching item.
-
- This method does *not* alter the list, and it will raise an error if
- multiple pvs are matched
-
- It is useful to use ``tags`` when trying to find a specific logical volume,
- but it can also lead to multiple pvs being found, since a lot of metadata
- is shared between pvs of a distinct OSD.
- """
- if not any([pv_name, pv_uuid, pv_tags]):
- return None
- pvs = self._filter(
- pv_name=pv_name,
- pv_uuid=pv_uuid,
- pv_tags=pv_tags
- )
- if not pvs:
- return None
- if len(pvs) > 1:
- raise MultiplePVsError(pv_name)
- return pvs[0]
-
-
-class VolumeGroup(object):
- """
- Represents an LVM group, with some top-level attributes like ``vg_name``
- """
-
- def __init__(self, **kw):
- for k, v in kw.items():
- setattr(self, k, v)
- self.name = kw['vg_name']
- self.tags = parse_tags(kw.get('vg_tags', ''))
-
- def __str__(self):
- return '<%s>' % self.name
-
- def __repr__(self):
- return self.__str__()
-
-
-class Volume(object):
- """
- Represents a Logical Volume from LVM, with some top-level attributes like
- ``lv_name`` and parsed tags as a dictionary of key/value pairs.
- """
-
- def __init__(self, **kw):
- for k, v in kw.items():
- setattr(self, k, v)
- self.lv_api = kw
- self.name = kw['lv_name']
- self.tags = parse_tags(kw['lv_tags'])
-
- def __str__(self):
- return '<%s>' % self.lv_api['lv_path']
-
- def __repr__(self):
- return self.__str__()
-
- def set_tags(self, tags):
- """
- :param tags: A dictionary of tag names and values, like::
-
- {
- "ceph.osd_fsid": "aaa-fff-bbbb",
- "ceph.osd_id": "0"
- }
-
- At the end of all modifications, the tags are refreshed to reflect
- LVM's most current view.
- """
- for k, v in tags.items():
- self.set_tag(k, v)
- # after setting all the tags, refresh them for the current object, use the
- # lv_* identifiers to filter because those shouldn't change
- lv_object = get_lv(lv_name=self.lv_name, lv_path=self.lv_path)
- self.tags = lv_object.tags
-
- def set_tag(self, key, value):
- """
- Set the key/value pair as an LVM tag. Does not "refresh" the values of
- the current object for its tags. Meant to be a "fire and forget" type
- of modification.
- """
- # remove it first if it exists
- if self.tags.get(key):
- current_value = self.tags[key]
- tag = "%s=%s" % (key, current_value)
- process.call(['sudo', 'lvchange', '--deltag', tag, self.lv_api['lv_path']])
-
- process.call(
- [
- 'sudo', 'lvchange',
- '--addtag', '%s=%s' % (key, value), self.lv_path
- ]
- )
-
-
-class PVolume(object):
- """
- Represents a Physical Volume from LVM, with some top-level attributes like
- ``pv_name`` and parsed tags as a dictionary of key/value pairs.
- """
-
- def __init__(self, **kw):
- for k, v in kw.items():
- setattr(self, k, v)
- self.pv_api = kw
- self.name = kw['pv_name']
- self.tags = parse_tags(kw['pv_tags'])
-
- def __str__(self):
- return '<%s>' % self.pv_api['pv_name']
-
- def __repr__(self):
- return self.__str__()
-
- def set_tags(self, tags):
- """
- :param tags: A dictionary of tag names and values, like::
-
- {
- "ceph.osd_fsid": "aaa-fff-bbbb",
- "ceph.osd_id": "0"
- }
-
- At the end of all modifications, the tags are refreshed to reflect
- LVM's most current view.
- """
- for k, v in tags.items():
- self.set_tag(k, v)
- # after setting all the tags, refresh them for the current object, use the
- # pv_* identifiers to filter because those shouldn't change
- pv_object = get_pv(pv_name=self.pv_name, pv_uuid=self.pv_uuid)
- self.tags = pv_object.tags
-
- def set_tag(self, key, value):
- """
- Set the key/value pair as an LVM tag. Does not "refresh" the values of
- the current object for its tags. Meant to be a "fire and forget" type
- of modification.
-
- **warning**: Altering tags on a PV has to be done ensuring that the
- device is actually the one intended. ``pv_name`` is *not* a persistent
- value, only ``pv_uuid`` is. Using ``pv_uuid`` is the best way to make
- sure the device getting changed is the one needed.
- """
- # remove it first if it exists
- if self.tags.get(key):
- current_value = self.tags[key]
- tag = "%s=%s" % (key, current_value)
- process.call(['sudo', 'pvchange', '--deltag', tag, self.pv_name])
-
- process.call(
- [
- 'sudo', 'pvchange',
- '--addtag', '%s=%s' % (key, value), self.pv_name
- ]
- )
required_args = parser.add_argument_group('required arguments')
parser.add_argument(
'--journal',
- help='A logical volume (vg_name/lv_name), or path to a device',
+ help='(filestore) A logical volume (vg_name/lv_name), or path to a device',
)
required_args.add_argument(
'--data',
required=True,
type=arg_validators.LVPath(),
- help='A logical volume (vg_name/lv_name) for OSD data',
+ help='OSD data path. A physical device or logical volume',
)
parser.add_argument(
'--journal-size',
default=5,
metavar='GB',
type=int,
- help='Size (in GB) A logical group name or a path to a logical volume',
+ help='(filestore) Size (in GB) for the journal',
)
parser.add_argument(
'--bluestore',
- action='store_true', default=False,
- help='Use the bluestore objectstore (not currently supported)',
+ action='store_true',
+ help='Use the bluestore objectstore',
)
parser.add_argument(
'--filestore',
- action='store_true', default=True,
- help='Use the filestore objectstore (currently the only supported object store)',
+ action='store_true',
+ help='Use the filestore objectstore',
)
parser.add_argument(
'--osd-id',
'--osd-fsid',
help='Reuse an existing OSD fsid',
)
+ parser.add_argument(
+ '--block.db',
+ dest='block_db',
+ help='(bluestore) Path to bluestore block.db logical volume or device',
+ )
+ parser.add_argument(
+ '--block.wal',
+ dest='block_wal',
+ help='(bluestore) Path to bluestore block.wal logical volume or device',
+ )
# Do not parse args, so that consumers can do something before the args get
# parsed triggering argparse behavior
return parser
print(sub_command_help)
return
args = parser.parse_args(self.argv)
+ # Default to bluestore here since defaulting it in add_argument may
+ # cause both to be True
+        if not args.bluestore and not args.filestore:
+ args.bluestore = True
self.create(args)
--- /dev/null
+from __future__ import print_function
+import argparse
+import json
+import logging
+from textwrap import dedent
+from ceph_volume import decorators
+from ceph_volume.util import disk
+from ceph_volume.api import lvm as api
+
+logger = logging.getLogger(__name__)
+
+
+osd_list_header_template = """\n
+{osd_id:=^20}"""
+
+
+osd_device_header_template = """
+
+ [{type: >4}] {path}
+"""
+
+device_metadata_item_template = """
+ {tag_name: <25} {value}"""
+
+
+def readable_tag(tag):
+ actual_name = tag.split('.')[-1]
+ return actual_name.replace('_', ' ')
+
+
+def pretty_report(report):
+ output = []
+ for _id, devices in report.items():
+ output.append(
+ osd_list_header_template.format(osd_id=" osd.%s " % _id)
+ )
+ for device in devices:
+ output.append(
+ osd_device_header_template.format(
+ type=device['type'],
+ path=device['path']
+ )
+ )
+ for tag_name, value in device.get('tags', {}).items():
+ output.append(
+ device_metadata_item_template.format(
+ tag_name=readable_tag(tag_name),
+ value=value
+ )
+ )
+ print(''.join(output))
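+
+# Rendered sketch (editor's illustration): for a single OSD the templates
+# above produce output roughly like:
+#
+#   ====== osd.0 =======
+#
+#     [data] /dev/ceph/osd-data
+#
+#       osd id                    0
+#       osd fsid                  aaa-fff-bbbb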
+
+
+class List(object):
+
+ help = 'list logical volumes and devices associated with Ceph'
+
+ def __init__(self, argv):
+ self.argv = argv
+
+ @decorators.needs_root
+ def list(self, args):
+ # ensure everything is up to date before calling out
+ # to list lv's
+ self.update()
+ report = self.generate(args)
+ if args.format == 'json':
+ # If the report is empty, we don't return a non-zero exit status
+ # because it is assumed this is going to be consumed by automated
+ # systems like ceph-ansible which would be forced to ignore the
+ # non-zero exit status if all they need is the information in the
+ # JSON object
+ print(json.dumps(report, indent=4, sort_keys=True))
+ else:
+ if not report:
+ raise SystemExit('No valid Ceph devices found')
+ pretty_report(report)
+
+ def update(self):
+ """
+        Ensure all journal (and block/wal/db) device tags are up to date if
+        the device isn't a logical volume
+ """
+ lvs = api.Volumes()
+ for lv in lvs:
+ try:
+ lv.tags['ceph.osd_id']
+ except KeyError:
+ # only consider ceph-based logical volumes, everything else
+ # will get ignored
+ continue
+
+ for device_type in ['journal', 'block', 'wal', 'db']:
+ device_name = 'ceph.%s_device' % device_type
+ device_uuid = lv.tags.get('ceph.%s_uuid' % device_type)
+ if not device_uuid:
+ # bluestore will not have a journal, filestore will not have
+ # a block/wal/db, so we must skip if not present
+ continue
+ disk_device = disk.get_device_from_partuuid(device_uuid)
+ if disk_device:
+ if lv.tags[device_name] != disk_device:
+ # this means that the device has changed, so it must be updated
+ # on the API to reflect this
+ lv.set_tags({device_name: disk_device})
+
+ def generate(self, args):
+ """
+ Generate reports for an individual device or for all Ceph-related
+ devices, logical or physical, as long as they have been prepared by
+ this tool before and contain enough metadata.
+ """
+ if args.device:
+ return self.single_report(args.device)
+ else:
+ return self.full_report()
+
+ def single_report(self, device):
+ """
+ Generate a report for a single device. This can be either a logical
+ volume in the form of vg/lv or a device with an absolute path like
+ /dev/sda1
+ """
+ lvs = api.Volumes()
+ report = {}
+ lv = api.get_lv_from_argument(device)
+ if lv:
+ try:
+ _id = lv.tags['ceph.osd_id']
+ except KeyError:
+ logger.warning('device is not part of ceph: %s', device)
+ return report
+
+ report.setdefault(_id, [])
+ report[_id].append(
+ lv.as_dict()
+ )
+
+ else:
+ # this has to be a journal/wal/db device (not a logical volume) so try
+ # to find the PARTUUID that should be stored in the OSD logical
+ # volume
+ for device_type in ['journal', 'block', 'wal', 'db']:
+ device_tag_name = 'ceph.%s_device' % device_type
+ device_tag_uuid = 'ceph.%s_uuid' % device_type
+ associated_lv = lvs.get(lv_tags={device_tag_name: device})
+ if associated_lv:
+ _id = associated_lv.tags['ceph.osd_id']
+ uuid = associated_lv.tags[device_tag_uuid]
+
+ report.setdefault(_id, [])
+ report[_id].append(
+ {
+ 'tags': {'PARTUUID': uuid},
+ 'type': device_type,
+ 'path': device,
+ }
+ )
+ return report
+
+ def full_report(self):
+ """
+ Generate a report for all the logical volumes and associated devices
+ that have been previously prepared by Ceph
+ """
+ lvs = api.Volumes()
+ report = {}
+ for lv in lvs:
+ try:
+ _id = lv.tags['ceph.osd_id']
+ except KeyError:
+ # only consider ceph-based logical volumes, everything else
+ # will get ignored
+ continue
+
+ report.setdefault(_id, [])
+ report[_id].append(
+ lv.as_dict()
+ )
+
+ for device_type in ['journal', 'block', 'wal', 'db']:
+ device_uuid = lv.tags.get('ceph.%s_uuid' % device_type)
+ if not device_uuid:
+ # bluestore will not have a journal, filestore will not have
+ # a block/wal/db, so we must skip if not present
+ continue
+ if not api.get_lv(lv_uuid=device_uuid):
+ # means we have a regular device, so query blkid
+ disk_device = disk.get_device_from_partuuid(device_uuid)
+ if disk_device:
+ report[_id].append(
+ {
+ 'tags': {'PARTUUID': device_uuid},
+ 'type': device_type,
+ 'path': disk_device,
+ }
+ )
+
+ return report
+
+ def main(self):
+ sub_command_help = dedent("""
+ List devices or logical volumes associated with Ceph. An association is
+ determined if a device has information relating to an OSD. This is
+ verified by querying LVM's metadata and correlating it with devices.
+
+ The lvs associated with the OSD need to have been prepared previously,
+ so that all needed tags and metadata exist.
+
+ Full listing of all system devices associated with a cluster::
+
+ ceph-volume lvm list
+
+ List a particular device, reporting all metadata about it::
+
+ ceph-volume lvm list /dev/sda1
+
+ List a logical volume, along with all its metadata (vg is a volume
+ group, and lv the logical volume name)::
+
+ ceph-volume lvm list {vg/lv}
+ """)
+ parser = argparse.ArgumentParser(
+ prog='ceph-volume lvm list',
+ formatter_class=argparse.RawDescriptionHelpFormatter,
+ description=sub_command_help,
+ )
+
+ parser.add_argument(
+ 'device',
+ metavar='DEVICE',
+ nargs='?',
+ help='Path to an lv (as vg/lv) or to a device like /dev/sda1'
+ )
+
+ parser.add_argument(
+ '--format',
+ help='output format, defaults to "pretty"',
+ default='pretty',
+ choices=['json', 'pretty'],
+ )
+
+ args = parser.parse_args(self.argv)
+ self.list(args)
from . import prepare
from . import create
from . import trigger
+from . import listing
+from . import zap
class LVM(object):
'prepare': prepare.Prepare,
'create': create.Create,
'trigger': trigger.Trigger,
+ 'list': listing.List,
+ 'zap': zap.Zap,
}
def __init__(self, argv):
from __future__ import print_function
import json
-import os
+import uuid
from textwrap import dedent
from ceph_volume.util import prepare as prepare_utils
from ceph_volume.util import system, disk
from ceph_volume import conf, decorators, terminal
-from . import api
+from ceph_volume.api import lvm as api
from .common import prepare_parser
def prepare_filestore(device, journal, secrets, id_=None, fsid=None):
"""
- :param device: The name of the volume group or lvm to work with
+ :param device: The name of the logical volume to work with
:param journal: similar to device but can also be a regular/plain disk
:param secrets: A dict with the secrets needed to create the osd (e.g. cephx)
:param id_: The OSD id
# allow re-using an id, in case a prepare failed
osd_id = id_ or prepare_utils.create_id(fsid, json_secrets)
# create the directory
- prepare_utils.create_path(osd_id)
+ prepare_utils.create_osd_path(osd_id)
# format the device
prepare_utils.format_device(device)
# mount the data device
# get the latest monmap
prepare_utils.get_monmap(osd_id)
# prepare the osd filesystem
- prepare_utils.osd_mkfs(osd_id, fsid)
+ prepare_utils.osd_mkfs_filestore(osd_id, fsid)
# write the OSD keyring if it doesn't exist already
prepare_utils.write_keyring(osd_id, cephx_secret)
-def prepare_bluestore():
- raise NotImplemented()
+def prepare_bluestore(block, wal, db, secrets, id_=None, fsid=None):
+ """
+ :param block: The name of the logical volume for the bluestore data
+ :param wal: a regular/plain disk or logical volume, to be used for block.wal
+ :param db: a regular/plain disk or logical volume, to be used for block.db
+ :param secrets: A dict with the secrets needed to create the osd (e.g. cephx)
+ :param id_: The OSD id
+ :param fsid: The OSD fsid, also known as the OSD UUID
+ """
+ cephx_secret = secrets.get('cephx_secret', prepare_utils.create_key())
+ json_secrets = json.dumps(secrets)
+
+ # allow re-using an existing fsid, in case prepare failed
+ fsid = fsid or system.generate_uuid()
+ # allow re-using an id, in case a prepare failed
+ osd_id = id_ or prepare_utils.create_id(fsid, json_secrets)
+ # create the directory
+ prepare_utils.create_osd_path(osd_id, tmpfs=True)
+ # symlink the block
+ prepare_utils.link_block(block, osd_id)
+ # get the latest monmap
+ prepare_utils.get_monmap(osd_id)
+ # write the OSD keyring if it doesn't exist already
+ prepare_utils.write_keyring(osd_id, cephx_secret)
+ # prepare the osd filesystem
+ prepare_utils.osd_mkfs_bluestore(
+ osd_id, fsid,
+ keyring=cephx_secret,
+ wal=wal,
+ db=db
+ )
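+
+# Call sketch (editor's illustration, hypothetical values): a bluestore
+# prepare with the db on a plain partition and no separate wal would be:
+#
+#   secrets = {'cephx_secret': prepare_utils.create_key()}
+#   prepare_bluestore('/dev/ceph-vg/osd-block', None, '/dev/sdc1', secrets)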
class Prepare(object):
def __init__(self, argv):
self.argv = argv
- def get_journal_ptuuid(self, argument):
+ def get_ptuuid(self, argument):
uuid = disk.get_partuuid(argument)
if not uuid:
terminal.error('blkid could not detect a PARTUUID for device: %s' % argument)
- raise RuntimeError('unable to use device for a journal')
+ raise RuntimeError('unable to use device')
return uuid
- def get_journal_lv(self, argument):
+ def get_lv(self, argument):
"""
- Perform some parsing of the value of ``--journal`` so that the process
- can determine correctly if it got a device path or an lv
- :param argument: The value of ``--journal``, that will need to be split
- to retrieve the actual lv
+ Perform some parsing of the command-line value so that the process
+ can determine correctly if it got a device path or an lv.
+
+ :param argument: The command-line value that will need to be split to
+ retrieve the actual lv
"""
try:
vg_name, lv_name = argument.split('/')
return None
return api.get_lv(lv_name=lv_name, vg_name=vg_name)
+ def setup_device(self, device_type, device_name, tags):
+ """
+        Check if ``device_name`` is an lv; if so, set the tags, making sure to
+ update the tags with the lv_uuid and lv_path which the incoming tags
+ will not have.
+
+ If the device is not a logical volume, then retrieve the partition UUID
+ by querying ``blkid``
+ """
+ if device_name is None:
+ return '', '', tags
+ tags['ceph.type'] = device_type
+ lv = self.get_lv(device_name)
+ if lv:
+ uuid = lv.lv_uuid
+ path = lv.lv_path
+ tags['ceph.%s_uuid' % device_type] = uuid
+ tags['ceph.%s_device' % device_type] = path
+ lv.set_tags(tags)
+ else:
+ # otherwise assume this is a regular disk partition
+ uuid = self.get_ptuuid(device_name)
+ path = device_name
+ tags['ceph.%s_uuid' % device_type] = uuid
+ tags['ceph.%s_device' % device_type] = path
+ return path, uuid, tags
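+
+    # Sketch (editor's illustration, hypothetical partition): for a wal on a
+    # plain partition this returns the path, its PARTUUID, and the tags dict
+    # grown by the two corresponding entries:
+    #
+    #   path, uuid, tags = self.setup_device('wal', '/dev/sdb2', tags)
+    #   tags['ceph.wal_device']  # '/dev/sdb2'
+    #   tags['ceph.wal_uuid']    # the PARTUUID reported by blkid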
+
+ def prepare_device(self, arg, device_type, cluster_fsid, osd_fsid):
+ """
+ Check if ``arg`` is a device or partition to create an LV out of it
+ with a distinct volume group name, assigning LV tags on it and
+ ultimately, returning the logical volume object. Failing to detect
+ a device or partition will result in error.
+
+ :param arg: The value of ``--data`` when parsing args
+ :param device_type: Usually, either ``data`` or ``block`` (filestore vs. bluestore)
+ :param cluster_fsid: The cluster fsid/uuid
+ :param osd_fsid: The OSD fsid/uuid
+ """
+ if disk.is_partition(arg) or disk.is_device(arg):
+ # we must create a vg, and then a single lv
+ vg_name = "ceph-%s" % cluster_fsid
+ if api.get_vg(vg_name=vg_name):
+ # means we already have a group for this, make a different one
+ # XXX this could end up being annoying for an operator, maybe?
+ vg_name = "ceph-%s" % str(uuid.uuid4())
+ api.create_vg(vg_name, arg)
+ lv_name = "osd-%s-%s" % (device_type, osd_fsid)
+ return api.create_lv(
+ lv_name,
+ vg_name, # the volume group
+ tags={'ceph.type': device_type})
+ else:
+            error = [
+                'Cannot use device (%s).' % arg,
+                'A vg/lv path or an existing device is needed']
+            raise RuntimeError(' '.join(error))
+
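+    # Sketch (editor's illustration, hypothetical device): pointing --data at
+    # a raw device creates a volume group named after the cluster fsid with a
+    # single lv inside it:
+    #
+    #   lv = self.prepare_device('/dev/sdb', 'block', cluster_fsid, osd_fsid)
+    #   lv.lv_path  # e.g. /dev/ceph-<cluster fsid>/osd-block-<osd fsid>
+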
@decorators.needs_root
def prepare(self, args):
# FIXME we don't allow re-using a keyring, we always generate one for the
secrets = {'cephx_secret': prepare_utils.create_key()}
cluster_fsid = conf.ceph.get('global', 'fsid')
- fsid = args.osd_fsid or system.generate_uuid()
- #osd_id = args.osd_id or prepare_utils.create_id(fsid)
+ osd_fsid = args.osd_fsid or system.generate_uuid()
# allow re-using an id, in case a prepare failed
- osd_id = args.osd_id or prepare_utils.create_id(fsid, json.dumps(secrets))
- vg_name, lv_name = args.data.split('/')
+ osd_id = args.osd_id or prepare_utils.create_id(osd_fsid, json.dumps(secrets))
if args.filestore:
- data_lv = api.get_lv(lv_name=lv_name, vg_name=vg_name)
-
- # we must have either an existing data_lv or a newly created, so lets make
- # sure that the tags are correct
- if not data_lv:
- raise RuntimeError('no data logical volume found with: %s' % args.data)
-
if not args.journal:
raise RuntimeError('--journal is required when using --filestore')
- journal_lv = self.get_journal_lv(args.journal)
- if journal_lv:
- journal_device = journal_lv.lv_path
- journal_uuid = journal_lv.lv_uuid
- # we can only set tags on an lv, the pv (if any) can't as we
- # aren't making it part of an lvm group (vg)
- journal_lv.set_tags({
- 'ceph.type': 'journal',
- 'ceph.osd_fsid': fsid,
- 'ceph.osd_id': osd_id,
- 'ceph.cluster_fsid': cluster_fsid,
- 'ceph.journal_device': journal_device,
- 'ceph.journal_uuid': journal_uuid,
- 'ceph.data_device': data_lv.lv_path,
- 'ceph.data_uuid': data_lv.lv_uuid,
- })
-
- # allow a file
- elif os.path.isfile(args.journal):
- journal_uuid = ''
- journal_device = args.journal
-
- # otherwise assume this is a regular disk partition
- else:
- journal_uuid = self.get_journal_ptuuid(args.journal)
- journal_device = args.journal
+ data_lv = self.get_lv(args.data)
+ if not data_lv:
+ data_lv = self.prepare_device(args.data, 'data', cluster_fsid, osd_fsid)
- data_lv.set_tags({
- 'ceph.type': 'data',
- 'ceph.osd_fsid': fsid,
+ tags = {
+ 'ceph.osd_fsid': osd_fsid,
'ceph.osd_id': osd_id,
'ceph.cluster_fsid': cluster_fsid,
- 'ceph.journal_device': journal_device,
- 'ceph.journal_uuid': journal_uuid,
+ 'ceph.cluster_name': conf.cluster,
'ceph.data_device': data_lv.lv_path,
'ceph.data_uuid': data_lv.lv_uuid,
- })
+ }
+
+ journal_device, journal_uuid, tags = self.setup_device('journal', args.journal, tags)
+
+ tags['ceph.type'] = 'data'
+ data_lv.set_tags(tags)
prepare_filestore(
data_lv.lv_path,
journal_device,
secrets,
id_=osd_id,
- fsid=fsid,
+ fsid=osd_fsid,
)
elif args.bluestore:
- prepare_bluestore(args)
+ block_lv = self.get_lv(args.data)
+ if not block_lv:
+ block_lv = self.prepare_device(args.data, 'block', cluster_fsid, osd_fsid)
+
+ tags = {
+ 'ceph.osd_fsid': osd_fsid,
+ 'ceph.osd_id': osd_id,
+ 'ceph.cluster_fsid': cluster_fsid,
+ 'ceph.cluster_name': conf.cluster,
+ 'ceph.block_device': block_lv.lv_path,
+ 'ceph.block_uuid': block_lv.lv_uuid,
+ }
+
+ wal_device, wal_uuid, tags = self.setup_device('wal', args.block_wal, tags)
+ db_device, db_uuid, tags = self.setup_device('db', args.block_db, tags)
+
+ tags['ceph.type'] = 'block'
+ block_lv.set_tags(tags)
+
+ prepare_bluestore(
+ block_lv.lv_path,
+ wal_device,
+ db_device,
+ secrets,
+ id_=osd_id,
+ fsid=osd_fsid,
+ )
def main(self):
sub_command_help = dedent("""
Existing logical volume (lv) or device:
- ceph-volume lvm prepare --data {logical volume} --journal /path/to/{lv}|{device}
+ ceph-volume lvm prepare --filestore --data {vg/lv} --journal /path/to/device
Or:
- ceph-volume lvm prepare --data {data volume group} --journal {journal volume group}
+ ceph-volume lvm prepare --filestore --data {vg/lv} --journal {vg/lv}
+
+ Existing block device, which will be turned into a volume group with a single logical volume:
+
+ ceph-volume lvm prepare --filestore --data /path/to/device --journal {vg/lv}
+
+ Bluestore
+ ---------
+
+ Existing logical volume (lv):
+
+ ceph-volume lvm prepare --bluestore --data {vg/lv}
+
+ Existing block device, which will be turned into a volume group with a single logical volume:
- Collocated (same group) for data and journal
- --------------------------------------------
+ ceph-volume lvm prepare --bluestore --data /path/to/device
- ceph-volume lvm prepare --data {volume group}
+ Optionally, it can consume db and wal devices or logical volumes:
+ ceph-volume lvm prepare --bluestore --data {vg/lv} --block.wal {device} --block.db {vg/lv}
""")
parser = prepare_parser(
prog='ceph-volume lvm prepare',
print(sub_command_help)
return
args = parser.parse_args(self.argv)
+ # Default to bluestore here since defaulting it in add_argument may
+ # cause both to be True
+ if args.bluestore is None and args.filestore is None:
+ args.bluestore = True
self.prepare(args)
args = parser.parse_args(self.argv)
osd_id = parse_osd_id(args.systemd_data)
osd_uuid = parse_osd_uuid(args.systemd_data)
- Activate([osd_id, osd_uuid]).main()
+ Activate(['--auto-detect-objectstore', osd_id, osd_uuid]).main()
--- /dev/null
+import argparse
+import logging
+
+from textwrap import dedent
+
+from ceph_volume import decorators, terminal, process
+from ceph_volume.api import lvm as api
+
+logger = logging.getLogger(__name__)
+
+
+def wipefs(path):
+ """
+ Removes the filesystem from an lv or partition.
+ """
+ process.run([
+ 'sudo',
+ 'wipefs',
+ '--all',
+ path
+ ])
+
+
+def zap_data(path):
+ """
+ Clears all data from the given path. Path should be
+ an absolute path to an lv or partition.
+
+ 10M of data is written to the path to make sure that
+ there is no trace left of any previous filesystem.
+ """
+ process.run([
+ 'dd',
+ 'if=/dev/zero',
+ 'of={path}'.format(path=path),
+ 'bs=1M',
+ 'count=10',
+ ])
+
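+ # For a hypothetical path like /dev/sdb1, the call above amounts to
+ # running:
+ #
+ #   dd if=/dev/zero of=/dev/sdb1 bs=1M count=10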
+
+class Zap(object):
+
+ help = 'Removes all data and filesystems from a logical volume or partition.'
+
+ def __init__(self, argv):
+ self.argv = argv
+
+ @decorators.needs_root
+ def zap(self, args):
+ device = args.device
+ lv = api.get_lv_from_argument(device)
+ if lv:
+ # we are zapping a logical volume
+ path = lv.lv_path
+ else:
+ # we are zapping a partition
+ # TODO: ensure device is a partition
+ path = device
+
+ logger.info("Zapping: %s", path)
+ terminal.write("Zapping: %s" % path)
+
+ wipefs(path)
+ zap_data(path)
+
+ if lv:
+ # remove all lvm metadata
+ lv.clear_tags()
+
+ terminal.success("Zapping successful for: %s" % path)
+
+ def main(self):
+ sub_command_help = dedent("""
+ Zaps the given logical volume or partition. If given a path to a logical
+ volume it must be in the format of vg/lv. Any filesystems present
+ on the given lv or partition will be removed and all data will be purged.
+
+ However, the lv or partition will be kept intact.
+
+ Example calls for supported scenarios:
+
+ Zapping a logical volume:
+
+ ceph-volume lvm zap {vg name/lv name}
+
+ Zapping a partition:
+
+ ceph-volume lvm zap /dev/sdc1
+
+ """)
+ parser = argparse.ArgumentParser(
+ prog='ceph-volume lvm zap',
+ formatter_class=argparse.RawDescriptionHelpFormatter,
+ description=sub_command_help,
+ )
+
+ parser.add_argument(
+ 'device',
+ metavar='DEVICE',
+ nargs='?',
+ help='Path to an lv (as vg/lv) or to a partition like /dev/sda1'
+ )
+ if len(self.argv) == 0:
+ print(sub_command_help)
+ return
+ args = parser.parse_args(self.argv)
+ self.zap(args)
--- /dev/null
+from .main import Simple # noqa
--- /dev/null
+from __future__ import print_function
+import argparse
+import json
+import logging
+import os
+from textwrap import dedent
+from ceph_volume import process, decorators, terminal
+from ceph_volume.util import system, disk
+from ceph_volume.systemd import systemctl
+
+
+logger = logging.getLogger(__name__)
+
+
+class Activate(object):
+
+ help = 'Enable systemd units to mount configured devices and start a Ceph OSD'
+
+ def __init__(self, argv, systemd=False):
+ self.argv = argv
+ self.systemd = systemd
+
+ @decorators.needs_root
+ def activate(self, args):
+ with open(args.json_config, 'r') as fp:
+ osd_metadata = json.load(fp)
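+ # An illustrative (hypothetical, not exhaustive) shape of the scanned
+ # metadata consumed below; keys mirror what ``ceph-volume simple scan``
+ # persists:
+ #
+ #   {
+ #       "whoami": "0",
+ #       "fsid": "a9d50838-e823-43d6-b01f-2f8d0a77afc2",
+ #       "cluster_name": "ceph",
+ #       "data": {"path": "/dev/sda1", "uuid": "..."},
+ #       "journal": {"path": "/dev/sda2", "uuid": "..."}
+ #   }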
+
+ osd_id = osd_metadata.get('whoami', args.osd_id)
+ osd_fsid = osd_metadata.get('fsid', args.osd_fsid)
+
+ cluster_name = osd_metadata.get('cluster_name', 'ceph')
+ osd_dir = '/var/lib/ceph/osd/%s-%s' % (cluster_name, osd_id)
+ data_uuid = osd_metadata.get('data', {}).get('uuid')
+ if not data_uuid:
+ raise RuntimeError(
+ 'Unable to activate OSD %s - no "uuid" key found for data' % args.osd_id
+ )
+ data_device = disk.get_device_from_partuuid(data_uuid)
+ journal_device = disk.get_device_from_partuuid(osd_metadata.get('journal', {}).get('uuid'))
+ block_device = disk.get_device_from_partuuid(osd_metadata.get('block', {}).get('uuid'))
+ block_db_device = disk.get_device_from_partuuid(osd_metadata.get('block.db', {}).get('uuid'))
+ block_wal_device = disk.get_device_from_partuuid(
+ osd_metadata.get('block.wal', {}).get('uuid')
+ )
+
+ if not system.device_is_mounted(data_device, destination=osd_dir):
+ process.run(['sudo', 'mount', '-v', data_device, osd_dir])
+
+ device_map = {
+ 'journal': journal_device,
+ 'block': block_device,
+ 'block.db': block_db_device,
+ 'block.wal': block_wal_device
+ }
+
+ for name, device in device_map.items():
+ if not device:
+ continue
+ # always re-do the symlink regardless of whether it exists, so that
+ # a device path that may have changed can be mapped correctly every
+ # time
+ destination = os.path.join(osd_dir, name)
+ process.run(['sudo', 'ln', '-snf', device, destination])
+
+ # make sure that the device has proper permissions
+ system.chown(device)
+
+ if not self.systemd:
+ # enable the ceph-volume unit for this OSD
+ systemctl.enable_volume(osd_id, osd_fsid, 'simple')
+
+ # disable any/all ceph-disk units
+ systemctl.mask_ceph_disk()
+
+ # enable the OSD
+ systemctl.enable_osd(osd_id)
+
+ # start the OSD
+ systemctl.start_osd(osd_id)
+
+ if not self.systemd:
+ terminal.success('Successfully activated OSD %s with FSID %s' % (osd_id, osd_fsid))
+ terminal.warning(
+ ('All ceph-disk systemd units have been disabled to '
+ 'prevent OSDs from getting triggered by UDEV events')
+ )
+
+ def main(self):
+ sub_command_help = dedent("""
+ Activate OSDs by mounting devices previously configured to their
+ appropriate destination::
+
+ ceph-volume simple activate {ID} {FSID}
+
+ Or using a JSON file directly::
+
+ ceph-volume simple activate --file /etc/ceph/osd/{ID}-{FSID}.json
+
+ The OSD must have been "scanned" previously (see ``ceph-volume simple
+ scan``), so that all needed OSD device information and metadata exist.
+
+ A previously scanned OSD will have its metadata persisted at::
+
+ /etc/ceph/osd/{ID}-{FSID}.json
+
+
+ Environment variables supported:
+
+ CEPH_VOLUME_SIMPLE_JSON_DIR: Directory location for scanned OSD JSON configs
+ """)
+ parser = argparse.ArgumentParser(
+ prog='ceph-volume simple activate',
+ formatter_class=argparse.RawDescriptionHelpFormatter,
+ description=sub_command_help,
+ )
+ parser.add_argument(
+ 'osd_id',
+ metavar='ID',
+ nargs='?',
+ help='The ID of the OSD, usually an integer, like 0'
+ )
+ parser.add_argument(
+ 'osd_fsid',
+ metavar='FSID',
+ nargs='?',
+ help='The FSID of the OSD, a unique identifier (UUID)'
+ )
+ parser.add_argument(
+ '--file',
+ help='The path to a JSON file, from a scanned OSD'
+ )
+ if len(self.argv) == 0:
+ print(sub_command_help)
+ return
+ args = parser.parse_args(self.argv)
+ if not args.file:
+ if not args.osd_id and not args.osd_fsid:
+ terminal.error('ID and FSID are required to find the right OSD to activate')
+ terminal.error('from a scanned OSD location in /etc/ceph/osd/')
+ raise RuntimeError('Unable to activate without both ID and FSID')
+ # don't allow a CLI flag to specify the JSON dir, because that might
+ # implicitly indicate that it is possible to activate a JSON file at
+ # a non-default location, which would not work at boot time unless the
+ # custom location is also exposed through the ENV var
+ json_dir = os.environ.get('CEPH_VOLUME_SIMPLE_JSON_DIR', '/etc/ceph/osd/')
+ if args.file:
+ json_config = args.file
+ else:
+ json_config = os.path.join(json_dir, '%s-%s.json' % (args.osd_id, args.osd_fsid))
+ if not os.path.exists(json_config):
+ raise RuntimeError('Expected JSON config path not found: %s' % json_config)
+ args.json_config = json_config
+ self.activate(args)
--- /dev/null
+import argparse
+from textwrap import dedent
+from ceph_volume import terminal
+from . import scan
+from . import activate
+from . import trigger
+
+
+class Simple(object):
+
+ help = 'Manage already deployed OSDs with ceph-volume'
+
+ _help = dedent("""
+ Take over a deployed OSD, persisting its metadata in /etc/ceph/osd/ so that it can be managed
+ with ceph-volume directly. Avoids UDEV and ceph-disk handling.
+
+ {sub_help}
+ """)
+
+ mapper = {
+ 'scan': scan.Scan,
+ 'activate': activate.Activate,
+ 'trigger': trigger.Trigger,
+ }
+
+ def __init__(self, argv):
+ self.argv = argv
+
+ def print_help(self, sub_help):
+ return self._help.format(sub_help=sub_help)
+
+ def main(self):
+ terminal.dispatch(self.mapper, self.argv)
+ parser = argparse.ArgumentParser(
+ prog='ceph-volume simple',
+ formatter_class=argparse.RawDescriptionHelpFormatter,
+ description=self.print_help(terminal.subhelp(self.mapper)),
+ )
+ parser.parse_args(self.argv)
+ if len(self.argv) <= 1:
+ return parser.print_help()
--- /dev/null
+from __future__ import print_function
+import argparse
+import json
+import logging
+import os
+from textwrap import dedent
+from ceph_volume import decorators, terminal, conf
+from ceph_volume.api import lvm
+from ceph_volume.util import arg_validators, system, disk
+
+
+logger = logging.getLogger(__name__)
+
+
+class Scan(object):
+
+ help = 'Capture metadata from an OSD data partition or directory'
+
+ def __init__(self, argv):
+ self.argv = argv
+ self._etc_path = '/etc/ceph/osd/'
+
+ @property
+ def etc_path(self):
+ if os.path.isdir(self._etc_path):
+ return self._etc_path
+
+ if not os.path.exists(self._etc_path):
+ os.mkdir(self._etc_path)
+ return self._etc_path
+
+ error = "OSD Configuration path (%s) needs to be a directory" % self._etc_path
+ raise RuntimeError(error)
+
+ def get_contents(self, path):
+ with open(path, 'r') as fp:
+ contents = fp.readlines()
+ if len(contents) > 1:
+ return ''.join(contents)
+ return ''.join(contents).strip().strip('\n')
+
+ def scan_device(self, path):
+ device_metadata = {'path': None, 'uuid': None}
+ if not path:
+ return device_metadata
+ # the path may be a symlink (e.g. when the OSD dir lives on tmpfs);
+ # resolve it to the underlying device
+ if os.path.islink(path):
+ device = os.readlink(path)
+ else:
+ device = path
+ lvm_device = lvm.get_lv_from_argument(device)
+ if lvm_device:
+ device_uuid = lvm_device.lv_uuid
+ else:
+ device_uuid = disk.get_partuuid(device)
+
+ device_metadata['uuid'] = device_uuid
+ device_metadata['path'] = device
+
+ return device_metadata
+
+ def scan_directory(self, path):
+ osd_metadata = {'cluster_name': conf.cluster}
+ path_mounts = system.get_mounts(paths=True)
+ for _file in os.listdir(path):
+ file_path = os.path.join(path, _file)
+ if os.path.islink(file_path):
+ osd_metadata[_file] = self.scan_device(file_path)
+ if os.path.isdir(file_path):
+ continue
+ # the binary check needs to come before the regular-file check, to
+ # avoid capturing data from binary files while still capturing
+ # contents from actual text files
+ if system.is_binary(file_path):
+ continue
+ if os.path.isfile(file_path):
+ osd_metadata[_file] = self.get_contents(file_path)
+
+ device = path_mounts.get(path)
+ # it is possible to have more than one device mounted here; pick the
+ # first one, even though more than one device could be 'data'
+ if not device:
+ terminal.error('Unable to detect device mounted for path: %s' % path)
+ raise RuntimeError('Cannot activate OSD')
+ osd_metadata['data'] = self.scan_device(device[0] if len(device) else None)
+
+ return osd_metadata
+
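+ # Illustrative result for a scanned filestore OSD (values hypothetical;
+ # the exact keys depend on the files present in the directory):
+ #
+ #   {'cluster_name': 'ceph',
+ #    'whoami': '0',
+ #    'fsid': 'a9d50838-e823-43d6-b01f-2f8d0a77afc2',
+ #    'data': {'path': '/dev/sda1', 'uuid': '...'}}
+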
+ @decorators.needs_root
+ def scan(self, args):
+ osd_metadata = {'cluster_name': conf.cluster}
+ device_mounts = system.get_mounts(devices=True)
+ osd_path = None
+ logger.info('detecting if argument is a device or a directory: %s', args.osd_path)
+ if os.path.isdir(args.osd_path):
+ logger.info('will scan directly, path is a directory')
+ osd_path = args.osd_path
+ else:
+ # assume this is a device, check if it is mounted and use that path
+ logger.info('path is not a directory, will check if mounted')
+ if system.device_is_mounted(args.osd_path):
+ logger.info('argument is a device, which is mounted')
+ mounted_osd_paths = device_mounts.get(args.osd_path)
+ osd_path = mounted_osd_paths[0] if len(mounted_osd_paths) else None
+
+ # argument is not a directory, and it is not a device that is mounted
+ # somewhere so temporarily mount it to poke inside, otherwise, scan
+ # directly
+ if not osd_path:
+ logger.info('device is not mounted, will mount it temporarily to scan')
+ with system.tmp_mount(args.osd_path) as osd_path:
+ osd_metadata = self.scan_directory(osd_path)
+ else:
+ logger.info('will scan OSD directory at path: %s', osd_path)
+ osd_metadata = self.scan_directory(osd_path)
+
+ osd_id = osd_metadata['whoami']
+ osd_fsid = osd_metadata['fsid']
+ filename = '%s-%s.json' % (osd_id, osd_fsid)
+ json_path = os.path.join(self.etc_path, filename)
+ if os.path.exists(json_path) and not args.stdout:
+ if not args.force:
+ raise RuntimeError(
+ '--force was not used and OSD metadata file exists: %s' % json_path
+ )
+
+ if args.stdout:
+ print(json.dumps(osd_metadata, indent=4, sort_keys=True, ensure_ascii=False))
+ else:
+ with open(json_path, 'w') as fp:
+ json.dump(osd_metadata, fp, indent=4, sort_keys=True, ensure_ascii=False)
+ terminal.success(
+ 'OSD %s got scanned and metadata persisted to file: %s' % (
+ osd_id,
+ json_path
+ )
+ )
+ terminal.success(
+ 'To take over management of this scanned OSD, and disable ceph-disk and udev, run:'
+ )
+ terminal.success(' ceph-volume simple activate %s %s' % (osd_id, osd_fsid))
+
+ if not osd_metadata.get('data'):
+ msg = 'Unable to determine device mounted on %s' % args.osd_path
+ logger.warning(msg)
+ terminal.warning(msg)
+ terminal.warning('OSD will not be able to start without this information:')
+ terminal.warning(' "data": "/path/to/device",')
+
+ def main(self):
+ sub_command_help = dedent("""
+ Scan an OSD directory for files and configurations that will allow
+ ceph-volume to take over the management of the OSD.
+
+ Scanned OSDs will get their configurations stored in
+ /etc/ceph/osd/<id>-<fsid>.json
+
+ For an OSD ID of 0 with fsid of ``a9d50838-e823-43d6-b01f-2f8d0a77afc2``
+ that could mean a scan command that looks like::
+
+ ceph-volume simple scan /var/lib/ceph/osd/ceph-0
+
+ Which would store the metadata in a JSON file at::
+
+ /etc/ceph/osd/0-a9d50838-e823-43d6-b01f-2f8d0a77afc2.json
+
+ To scan an existing, running OSD:
+
+ ceph-volume simple scan /var/lib/ceph/osd/{cluster}-{osd id}
+
+ And to scan a device (mounted or unmounted) that has OSD data in it, for example /dev/sda1:
+
+ ceph-volume simple scan /dev/sda1
+ """)
+ parser = argparse.ArgumentParser(
+ prog='ceph-volume simple scan',
+ formatter_class=argparse.RawDescriptionHelpFormatter,
+ description=sub_command_help,
+ )
+
+ parser.add_argument(
+ '-f', '--force',
+ action='store_true',
+ help='If the OSD has already been scanned, the JSON file will be overwritten'
+ )
+
+ parser.add_argument(
+ '--stdout',
+ action='store_true',
+ help='Do not save to a file, output metadata to stdout'
+ )
+
+ parser.add_argument(
+ 'osd_path',
+ metavar='OSD_PATH',
+ type=arg_validators.OSDPath(),
+ nargs='?',
+ help='Path to an existing OSD directory or OSD data partition'
+ )
+
+ if len(self.argv) == 0:
+ print(sub_command_help)
+ return
+ args = parser.parse_args(self.argv)
+ self.scan(args)
--- /dev/null
+from __future__ import print_function
+import argparse
+from textwrap import dedent
+from ceph_volume.exceptions import SuffixParsingError
+from ceph_volume import decorators
+from .activate import Activate
+
+
+def parse_osd_id(string):
+ osd_id = string.split('-', 1)[0]
+ if not osd_id:
+ raise SuffixParsingError('OSD id', string)
+ if osd_id.isdigit():
+ return osd_id
+ raise SuffixParsingError('OSD id', string)
+
+
+def parse_osd_uuid(string):
+ osd_id = '%s-' % parse_osd_id(string)
+ # remove the id first
+ osd_uuid = string.split(osd_id, 1)[-1]
+ if not osd_uuid:
+ raise SuffixParsingError('OSD uuid', string)
+ return osd_uuid
+
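+ # Example (hypothetical id/uuid): given the systemd instance data
+ # '0-8715BEB4-15C5-49DE-BA6F-401086EC7B41', parse_osd_id returns '0'
+ # and parse_osd_uuid returns '8715BEB4-15C5-49DE-BA6F-401086EC7B41'.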
+
+class Trigger(object):
+
+ help = 'systemd helper to activate an OSD'
+
+ def __init__(self, argv):
+ self.argv = argv
+
+ @decorators.needs_root
+ def main(self):
+ sub_command_help = dedent("""
+ ** DO NOT USE DIRECTLY **
+ This tool is meant to help the systemd unit that knows about OSDs.
+
+ Proxy OSD activation to ``ceph-volume simple activate`` by parsing the
+ input from systemd, detecting the UUID and ID associated with an OSD::
+
+ ceph-volume simple trigger {SYSTEMD-DATA}
+
+ The systemd "data" is expected to be in the format of::
+
+ {OSD ID}-{OSD UUID}
+
+ The devices associated with the OSD need to have been scanned previously,
+ so that all needed metadata can be used for starting the OSD process.
+ """)
+ parser = argparse.ArgumentParser(
+ prog='ceph-volume simple trigger',
+ formatter_class=argparse.RawDescriptionHelpFormatter,
+ description=sub_command_help,
+ )
+
+ parser.add_argument(
+ 'systemd_data',
+ metavar='SYSTEMD_DATA',
+ nargs='?',
+ help='Data from a systemd unit containing ID and UUID of the OSD, like 0-asdf-lkjh'
+ )
+ if len(self.argv) == 0:
+ print(sub_command_help)
+ return
+ args = parser.parse_args(self.argv)
+ osd_id = parse_osd_id(args.systemd_data)
+ osd_uuid = parse_osd_uuid(args.systemd_data)
+ Activate([osd_id, osd_uuid], systemd=True).main()
"""
def __init__(self, argv=None, parse=True):
- self.mapper = {'lvm': devices.lvm.LVM}
+ self.mapper = {'lvm': devices.lvm.LVM, 'simple': devices.simple.Simple}
self.plugin_help = "No plugins found/loaded"
if argv is None:
self.argv = sys.argv
pass
+def obfuscate(command_, on=None):
+ """
+ Certain commands that are useful to log might contain information that
+ should be replaced by '*', like when creating OSDs and keyrings are
+ being passed, which should not be logged.
+
+ :param on: A string (will match a flag) or an integer (will match an index)
+
+ If matching on a flag (when ``on`` is a string) it will obfuscate the
+ value for that flag. That is, for a command like ['ls', '-l', '/'],
+ calling `obfuscate(command, on='-l')` will obfuscate '/', which is the
+ value for `-l`.
+
+ The reason ``on`` allows either a string or an integer, altering the
+ behavior for both, is that it is easier for ``run`` and ``call`` to just
+ pop a single value to obfuscate (vs. allowing both an index and a flag).
+ """
+ command = command_[:]
+ msg = "Running command: %s" % ' '.join(command)
+ if on in [None, False]:
+ return msg
+
+ if isinstance(on, int):
+ index = on
+
+ else:
+ try:
+ index = command.index(on) + 1
+ except ValueError:
+ # if the flag just doesn't exist, it doesn't matter; return the
+ # base msg
+ return msg
+
+ try:
+ command[index] = '*' * len(command[index])
+ except IndexError: # the index was completely out of range
+ return msg
+
+ return "Running command: %s" % ' '.join(command)
+
+
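+ # For example (illustrative only), obfuscating by flag or by index:
+ #
+ #   obfuscate(['ceph', 'auth', 'add', '--key', 'AQAx...'], on='--key')
+ #   # -> "Running command: ceph auth add --key *******"
+ #   obfuscate(['mkfs', 'secret'], on=1)
+ #   # -> "Running command: mkfs ******"
+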
def run(command, **kw):
"""
A real-time-logging implementation of a remote subprocess.Popen call where
:param stop_on_error: If a nonzero exit status is returned, it raises a ``RuntimeError``
"""
stop_on_error = kw.pop('stop_on_error', True)
- command_msg = "Running command: %s" % ' '.join(command)
+ command_msg = obfuscate(command, kw.pop('obfuscate', None))
stdin = kw.pop('stdin', None)
logger.info(command_msg)
terminal.write(command_msg)
it is forcefully set to True if a return code is non-zero
"""
terminal_verbose = kw.pop('terminal_verbose', False)
+ show_command = kw.pop('show_command', False)
command_msg = "Running command: %s" % ' '.join(command)
stdin = kw.pop('stdin', None)
logger.info(command_msg)
- terminal.write(command_msg)
+ if show_command:
+ terminal.write(command_msg)
process = subprocess.Popen(
command,
process.run(['sudo', 'systemctl', 'disable', unit])
+def mask(unit):
+ process.run(['sudo', 'systemctl', 'mask', unit])
+
+
def start_osd(id_):
return start(osd_unit % id_)
return enable(volume_unit % (device_type, id_, fsid))
+def mask_ceph_disk():
+ # systemctl allows using a glob like '*' for masking, but there was a bug
+ # where it wouldn't allow this for service templates. This means that
+ # masking ceph-disk@* will not work, so we must link the service directly.
+ # /etc/systemd takes precedence regardless of the location of the unit
+ process.run(
+ ['sudo', 'ln', '-sf', '/dev/null', '/etc/systemd/system/ceph-disk@.service']
+ )
+
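+ # (For reference: the symlink above is exactly what ``systemctl mask``
+ # creates for a concrete unit; it is done by hand here so the template
+ # itself, ceph-disk@.service, ends up masked for every instance.)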
+
#
# templates
#
osd_unit = "ceph-osd@%s"
+ceph_disk_unit = "ceph-disk@%s"
volume_unit = "ceph-volume@%s-%s-%s"
--- /dev/null
+import pytest
+from ceph_volume import process, exceptions
+from ceph_volume.api import lvm as api
+
+
+class TestParseTags(object):
+
+ def test_no_tags_means_empty_dict(self):
+ result = api.parse_tags('')
+ assert result == {}
+
+ def test_single_tag_gets_parsed(self):
+ result = api.parse_tags('ceph.osd_something=1')
+ assert result == {'ceph.osd_something': '1'}
+
+ def test_multiple_csv_expands_in_dict(self):
+ result = api.parse_tags('ceph.osd_something=1,ceph.foo=2,ceph.fsid=0000')
+ # assert them piecemeal to avoid the un-ordered dict nature
+ assert result['ceph.osd_something'] == '1'
+ assert result['ceph.foo'] == '2'
+ assert result['ceph.fsid'] == '0000'
+
+
+class TestGetAPIVgs(object):
+
+ def test_report_is_empty(self, monkeypatch):
+ monkeypatch.setattr(api.process, 'call', lambda x: ('\n\n', '', 0))
+ assert api.get_api_vgs() == []
+
+ def test_report_has_stuff(self, monkeypatch):
+ report = [' VolGroup00']
+ monkeypatch.setattr(api.process, 'call', lambda x: (report, '', 0))
+ assert api.get_api_vgs() == [{'vg_name': 'VolGroup00'}]
+
+ def test_report_has_stuff_with_empty_attrs(self, monkeypatch):
+ report = [' VolGroup00 ;;;;;;9g']
+ monkeypatch.setattr(api.process, 'call', lambda x: (report, '', 0))
+ result = api.get_api_vgs()[0]
+ assert len(result.keys()) == 7
+ assert result['vg_name'] == 'VolGroup00'
+ assert result['vg_free'] == '9g'
+
+ def test_report_has_multiple_items(self, monkeypatch):
+ report = [' VolGroup00;;;;;;;', ' ceph_vg;;;;;;;']
+ monkeypatch.setattr(api.process, 'call', lambda x: (report, '', 0))
+ result = api.get_api_vgs()
+ assert result[0]['vg_name'] == 'VolGroup00'
+ assert result[1]['vg_name'] == 'ceph_vg'
+
+
+class TestGetAPILvs(object):
+
+ def test_report_is_empty(self, monkeypatch):
+ monkeypatch.setattr(api.process, 'call', lambda x: ('', '', 0))
+ assert api.get_api_lvs() == []
+
+ def test_report_has_stuff(self, monkeypatch):
+ report = [' ;/path;VolGroup00;root']
+ monkeypatch.setattr(api.process, 'call', lambda x: (report, '', 0))
+ result = api.get_api_lvs()
+ assert result[0]['lv_name'] == 'VolGroup00'
+
+ def test_report_has_multiple_items(self, monkeypatch):
+ report = [' ;/path;VolName;root', ';/dev/path;ceph_lv;ceph_vg']
+ monkeypatch.setattr(api.process, 'call', lambda x: (report, '', 0))
+ result = api.get_api_lvs()
+ assert result[0]['lv_name'] == 'VolName'
+ assert result[1]['lv_name'] == 'ceph_lv'
+
+
+@pytest.fixture
+def volumes(monkeypatch):
+ monkeypatch.setattr(process, 'call', lambda x: ('', '', 0))
+ volumes = api.Volumes()
+ volumes._purge()
+ # also patch api.Volumes so that when it is called, it will use the newly
+ # created fixture, with whatever the test method wants to append to it
+ monkeypatch.setattr(api, 'Volumes', lambda: volumes)
+ return volumes
+
+
+@pytest.fixture
+def pvolumes(monkeypatch):
+ monkeypatch.setattr(process, 'call', lambda x: ('', '', 0))
+ pvolumes = api.PVolumes()
+ pvolumes._purge()
+ return pvolumes
+
+
+@pytest.fixture
+def volume_groups(monkeypatch):
+ monkeypatch.setattr(process, 'call', lambda x: ('', '', 0))
+ vgs = api.VolumeGroups()
+ vgs._purge()
+ return vgs
+
+
+class TestGetLV(object):
+
+ def test_nothing_is_passed_in(self):
+ # so we return a None
+ assert api.get_lv() is None
+
+ def test_single_lv_is_matched(self, volumes, monkeypatch):
+ FooVolume = api.Volume(lv_name='foo', lv_path='/dev/vg/foo', lv_tags="ceph.type=data")
+ volumes.append(FooVolume)
+ monkeypatch.setattr(api, 'Volumes', lambda: volumes)
+ assert api.get_lv(lv_name='foo') == FooVolume
+
+ def test_single_lv_is_matched_by_uuid(self, volumes, monkeypatch):
+ FooVolume = api.Volume(
+ lv_name='foo', lv_path='/dev/vg/foo',
+ lv_uuid='1111', lv_tags="ceph.type=data")
+ volumes.append(FooVolume)
+ monkeypatch.setattr(api, 'Volumes', lambda: volumes)
+ assert api.get_lv(lv_uuid='1111') == FooVolume
+
+
+class TestGetPV(object):
+
+ def test_nothing_is_passed_in(self):
+ # so we return a None
+ assert api.get_pv() is None
+
+ def test_single_pv_is_not_matched(self, pvolumes, monkeypatch):
+ FooPVolume = api.PVolume(pv_name='/dev/sda', pv_uuid="0000", pv_tags={})
+ pvolumes.append(FooPVolume)
+ monkeypatch.setattr(api, 'PVolumes', lambda: pvolumes)
+ assert api.get_pv(pv_uuid='foo') is None
+
+ def test_single_pv_is_matched(self, pvolumes, monkeypatch):
+ FooPVolume = api.PVolume(pv_name='/dev/sda', pv_uuid="0000", pv_tags={})
+ pvolumes.append(FooPVolume)
+ monkeypatch.setattr(api, 'PVolumes', lambda: pvolumes)
+ assert api.get_pv(pv_uuid='0000') == FooPVolume
+
+ def test_single_pv_is_matched_by_uuid(self, pvolumes, monkeypatch):
+ FooPVolume = api.PVolume(
+ pv_name='/dev/vg/foo',
+ pv_uuid='1111', pv_tags="ceph.type=data")
+ pvolumes.append(FooPVolume)
+ monkeypatch.setattr(api, 'PVolumes', lambda: pvolumes)
+ assert api.get_pv(pv_uuid='1111') == FooPVolume
+
+
+class TestPVolumes(object):
+
+ def test_filter_by_tag_does_not_match_one(self, pvolumes, monkeypatch):
+ pv_tags = "ceph.type=journal,ceph.osd_id=1,ceph.fsid=000-aaa"
+ FooPVolume = api.PVolume(
+ pv_name='/dev/vg/foo',
+ pv_uuid='1111', pv_tags=pv_tags)
+ pvolumes.append(FooPVolume)
+ pvolumes.filter(pv_tags={'ceph.type': 'journal', 'ceph.osd_id': '2'})
+ assert pvolumes == []
+
+ def test_filter_by_tags_matches(self, pvolumes, monkeypatch):
+ pv_tags = "ceph.type=journal,ceph.osd_id=1"
+ FooPVolume = api.PVolume(
+ pv_name='/dev/vg/foo',
+ pv_uuid='1111', pv_tags=pv_tags)
+ pvolumes.append(FooPVolume)
+ pvolumes.filter(pv_tags={'ceph.type': 'journal', 'ceph.osd_id': '1'})
+ assert pvolumes == [FooPVolume]
+
+
+class TestGetVG(object):
+
+ def test_nothing_is_passed_in(self):
+ # so we return a None
+ assert api.get_vg() is None
+
+ def test_single_vg_is_matched(self, volume_groups, monkeypatch):
+ FooVG = api.VolumeGroup(vg_name='foo')
+ volume_groups.append(FooVG)
+ monkeypatch.setattr(api, 'VolumeGroups', lambda: volume_groups)
+ assert api.get_vg(vg_name='foo') == FooVG
+
+
+class TestVolumes(object):
+
+ def test_volume_get_has_no_volumes(self, volumes):
+ assert volumes.get() is None
+
+ def test_volume_get_filtered_has_no_volumes(self, volumes):
+ assert volumes.get(lv_name='ceph') is None
+
+ def test_volume_has_multiple_matches(self, volumes):
+ volume1 = volume2 = api.Volume(lv_name='foo', lv_path='/dev/vg/lv', lv_tags='')
+ volumes.append(volume1)
+ volumes.append(volume2)
+ with pytest.raises(exceptions.MultipleLVsError):
+ volumes.get(lv_name='foo')
+
+ def test_as_dict_infers_type_from_tags(self, volumes):
+ lv_tags = "ceph.type=data,ceph.fsid=000-aaa"
+ osd = api.Volume(lv_name='volume1', lv_path='/dev/vg/lv', lv_tags=lv_tags)
+ volumes.append(osd)
+ result = volumes.get(lv_tags={'ceph.type': 'data'}).as_dict()
+ assert result['type'] == 'data'
+
+ def test_as_dict_populates_path_from_lv_api(self, volumes):
+ lv_tags = "ceph.type=data,ceph.fsid=000-aaa"
+ osd = api.Volume(lv_name='volume1', lv_path='/dev/vg/lv', lv_tags=lv_tags)
+ volumes.append(osd)
+ result = volumes.get(lv_tags={'ceph.type': 'data'}).as_dict()
+ assert result['path'] == '/dev/vg/lv'
+
+ def test_find_the_correct_one(self, volumes):
+ volume1 = api.Volume(lv_name='volume1', lv_path='/dev/vg/lv', lv_tags='')
+ volume2 = api.Volume(lv_name='volume2', lv_path='/dev/vg/lv', lv_tags='')
+ volumes.append(volume1)
+ volumes.append(volume2)
+ assert volumes.get(lv_name='volume1') == volume1
+
+ def test_filter_by_tag(self, volumes):
+ lv_tags = "ceph.type=data,ceph.fsid=000-aaa"
+ osd = api.Volume(lv_name='volume1', lv_path='/dev/vg/lv', lv_tags=lv_tags)
+ journal = api.Volume(lv_name='volume2', lv_path='/dev/vg/lv', lv_tags='ceph.type=journal')
+ volumes.append(osd)
+ volumes.append(journal)
+ volumes.filter(lv_tags={'ceph.type': 'data'})
+ assert len(volumes) == 1
+ assert volumes[0].lv_name == 'volume1'
+
+ def test_filter_by_tag_does_not_match_one(self, volumes):
+ lv_tags = "ceph.type=data,ceph.fsid=000-aaa"
+ osd = api.Volume(lv_name='volume1', lv_path='/dev/vg/lv', lv_tags=lv_tags)
+ journal = api.Volume(lv_name='volume2', lv_path='/dev/vg/lv', lv_tags='ceph.osd_id=1,ceph.type=journal')
+ volumes.append(osd)
+ volumes.append(journal)
+ # note the different osd_id!
+ volumes.filter(lv_tags={'ceph.type': 'data', 'ceph.osd_id': '2'})
+ assert volumes == []
+
+ def test_filter_by_vg_name(self, volumes):
+ lv_tags = "ceph.type=data,ceph.fsid=000-aaa"
+ osd = api.Volume(lv_name='volume1', vg_name='ceph_vg', lv_tags=lv_tags)
+ journal = api.Volume(lv_name='volume2', vg_name='system_vg', lv_tags='ceph.type=journal')
+ volumes.append(osd)
+ volumes.append(journal)
+ volumes.filter(vg_name='ceph_vg')
+ assert len(volumes) == 1
+ assert volumes[0].lv_name == 'volume1'
+
+ def test_filter_by_lv_path(self, volumes):
+ osd = api.Volume(lv_name='volume1', lv_path='/dev/volume1', lv_tags='')
+ journal = api.Volume(lv_name='volume2', lv_path='/dev/volume2', lv_tags='')
+ volumes.append(osd)
+ volumes.append(journal)
+ volumes.filter(lv_path='/dev/volume1')
+ assert len(volumes) == 1
+ assert volumes[0].lv_name == 'volume1'
+
+ def test_filter_by_lv_uuid(self, volumes):
+ osd = api.Volume(lv_name='volume1', lv_path='/dev/volume1', lv_uuid='1111', lv_tags='')
+ journal = api.Volume(lv_name='volume2', lv_path='/dev/volume2', lv_uuid='', lv_tags='')
+ volumes.append(osd)
+ volumes.append(journal)
+ volumes.filter(lv_uuid='1111')
+ assert len(volumes) == 1
+ assert volumes[0].lv_name == 'volume1'
+
+ def test_filter_by_lv_uuid_nothing_found(self, volumes):
+ osd = api.Volume(lv_name='volume1', lv_path='/dev/volume1', lv_uuid='1111', lv_tags='')
+ journal = api.Volume(lv_name='volume2', lv_path='/dev/volume2', lv_uuid='', lv_tags='')
+ volumes.append(osd)
+ volumes.append(journal)
+ volumes.filter(lv_uuid='22222')
+ assert volumes == []
+
+ def test_filter_requires_params(self, volumes):
+ with pytest.raises(TypeError):
+ volumes.filter()
+
+
+class TestVolumeGroups(object):
+
+ def test_volume_get_has_no_volume_groups(self, volume_groups):
+ assert volume_groups.get() is None
+
+ def test_volume_get_filtered_has_no_volumes(self, volume_groups):
+ assert volume_groups.get(vg_name='ceph') is None
+
+ def test_volume_has_multiple_matches(self, volume_groups):
+ volume1 = volume2 = api.VolumeGroup(vg_name='foo', lv_path='/dev/vg/lv', lv_tags='')
+ volume_groups.append(volume1)
+ volume_groups.append(volume2)
+ with pytest.raises(exceptions.MultipleVGsError):
+ volume_groups.get(vg_name='foo')
+
+ def test_find_the_correct_one(self, volume_groups):
+ volume1 = api.VolumeGroup(vg_name='volume1', lv_tags='')
+ volume2 = api.VolumeGroup(vg_name='volume2', lv_tags='')
+ volume_groups.append(volume1)
+ volume_groups.append(volume2)
+ assert volume_groups.get(vg_name='volume1') == volume1
+
+ def test_filter_by_tag(self, volume_groups):
+ vg_tags = "ceph.group=dmcache"
+ osd = api.VolumeGroup(vg_name='volume1', vg_tags=vg_tags)
+ journal = api.VolumeGroup(vg_name='volume2', vg_tags='ceph.group=plain')
+ volume_groups.append(osd)
+ volume_groups.append(journal)
+ volume_groups.filter(vg_tags={'ceph.group': 'dmcache'})
+ assert len(volume_groups) == 1
+ assert volume_groups[0].vg_name == 'volume1'
+
+ def test_filter_by_tag_does_not_match_one(self, volume_groups):
+ vg_tags = "ceph.group=dmcache,ceph.disk_type=ssd"
+ osd = api.VolumeGroup(vg_name='volume1', vg_path='/dev/vg/lv', vg_tags=vg_tags)
+ volume_groups.append(osd)
+ volume_groups.filter(vg_tags={'ceph.group': 'data', 'ceph.disk_type': 'ssd'})
+ assert volume_groups == []
+
+ def test_filter_by_vg_name(self, volume_groups):
+ vg_tags = "ceph.type=data,ceph.fsid=000-aaa"
+ osd = api.VolumeGroup(vg_name='ceph_vg', vg_tags=vg_tags)
+ journal = api.VolumeGroup(vg_name='volume2', vg_tags='ceph.type=journal')
+ volume_groups.append(osd)
+ volume_groups.append(journal)
+ volume_groups.filter(vg_name='ceph_vg')
+ assert len(volume_groups) == 1
+ assert volume_groups[0].vg_name == 'ceph_vg'
+
+ def test_filter_requires_params(self, volume_groups):
+ with pytest.raises(TypeError):
+ volume_groups.filter()
+
+
+class TestGetLVFromArgument(object):
+
+ def setup(self):
+ self.foo_volume = api.Volume(
+ lv_name='foo', lv_path='/path/to/lv',
+ vg_name='foo_group', lv_tags=''
+ )
+
+ def test_non_absolute_path_is_not_valid(self, volumes):
+ volumes.append(self.foo_volume)
+ assert api.get_lv_from_argument('foo') is None
+
+ def test_too_many_slashes_is_invalid(self, volumes):
+ volumes.append(self.foo_volume)
+ assert api.get_lv_from_argument('path/to/lv') is None
+
+ def test_absolute_path_is_not_lv(self, volumes):
+ volumes.append(self.foo_volume)
+ assert api.get_lv_from_argument('/path') is None
+
+ def test_absolute_path_is_lv(self, volumes):
+ volumes.append(self.foo_volume)
+ assert api.get_lv_from_argument('/path/to/lv') == self.foo_volume
+
+
+class TestRemoveLV(object):
+
+ def test_removes_lv(self, monkeypatch):
+ def mock_call(cmd, **kw):
+ return ('', '', 0)
+ monkeypatch.setattr(process, 'call', mock_call)
+ assert api.remove_lv("vg/lv")
+
+ def test_fails_to_remove_lv(self, monkeypatch):
+ def mock_call(cmd, **kw):
+ return ('', '', 1)
+ monkeypatch.setattr(process, 'call', mock_call)
+ with pytest.raises(RuntimeError):
+ api.remove_lv("vg/lv")
+
+
+class TestCreateLV(object):
+
+ def setup(self):
+ self.foo_volume = api.Volume(lv_name='foo', lv_path='/path', vg_name='foo_group', lv_tags='')
+
+ def test_uses_size(self, monkeypatch, capture):
+ monkeypatch.setattr(process, 'run', capture)
+ monkeypatch.setattr(process, 'call', capture)
+ monkeypatch.setattr(api, 'get_lv', lambda *a, **kw: self.foo_volume)
+ api.create_lv('foo', 'foo_group', size='5G', tags={'ceph.type': 'data'})
+ expected = ['sudo', 'lvcreate', '--yes', '-L', '5G', '-n', 'foo', 'foo_group']
+ assert capture.calls[0]['args'][0] == expected
+
+ def test_calls_to_set_type_tag(self, monkeypatch, capture):
+ monkeypatch.setattr(process, 'run', capture)
+ monkeypatch.setattr(process, 'call', capture)
+ monkeypatch.setattr(api, 'get_lv', lambda *a, **kw: self.foo_volume)
+ api.create_lv('foo', 'foo_group', size='5G', tags={'ceph.type': 'data'})
+ ceph_tag = ['sudo', 'lvchange', '--addtag', 'ceph.type=data', '/path']
+ assert capture.calls[1]['args'][0] == ceph_tag
+
+ def test_calls_to_set_data_tag(self, monkeypatch, capture):
+ monkeypatch.setattr(process, 'run', capture)
+ monkeypatch.setattr(process, 'call', capture)
+ monkeypatch.setattr(api, 'get_lv', lambda *a, **kw: self.foo_volume)
+ api.create_lv('foo', 'foo_group', size='5G', tags={'ceph.type': 'data'})
+ data_tag = ['sudo', 'lvchange', '--addtag', 'ceph.data_device=/path', '/path']
+ assert capture.calls[2]['args'][0] == data_tag
+import os
import pytest
-from ceph_volume.devices.lvm import api
+from ceph_volume.api import lvm as lvm_api
+
class Capture(object):
self.calls.append({'args': a, 'kwargs': kw})
+class Factory(object):
+
+ def __init__(self, **kw):
+ for k, v in kw.items():
+ setattr(self, k, v)
+
+
+@pytest.fixture
+def factory():
+ return Factory
+
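+ # (Used by tests to build simple argument objects on the fly, e.g.
+ # ``factory(format='json', device=None)`` as in the listing tests.)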
+
@pytest.fixture
def capture():
return Capture()
@pytest.fixture
def volumes(monkeypatch):
monkeypatch.setattr('ceph_volume.process.call', lambda x: ('', '', 0))
- volumes = api.Volumes()
+ volumes = lvm_api.Volumes()
volumes._purge()
return volumes
@pytest.fixture
def volume_groups(monkeypatch):
monkeypatch.setattr('ceph_volume.process.call', lambda x: ('', '', 0))
- vgs = api.VolumeGroups()
+ vgs = lvm_api.VolumeGroups()
vgs._purge()
return vgs
is root (or is sudoing to superuser) can continue as-is
"""
monkeypatch.setattr('os.getuid', lambda: 0)
+
+
+@pytest.fixture
+def tmpfile(tmpdir):
+ """
+ Create a temporary file, optionally filling it with contents; returns
+ an absolute path to the file when called
+ """
+ def generate_file(name='file', contents=''):
+ path = os.path.join(str(tmpdir), name)
+ with open(path, 'w') as fp:
+ fp.write(contents)
+ return path
+ return generate_file
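+
+ # Hypothetical usage:
+ #
+ #   conf_path = tmpfile(name='ceph.conf', contents='[global]\nfsid = 0000')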
import pytest
-from ceph_volume.devices.lvm import activate, api
+from ceph_volume.devices.lvm import activate
+from ceph_volume.api import lvm as api
class Args(object):
def __init__(self, **kw):
+ # default flags
+ self.bluestore = False
+ self.filestore = False
+ self.auto_detect_objectstore = None
for k, v in kw.items():
setattr(self, k, v)
volumes.append(FooVolume)
monkeypatch.setattr(api, 'Volumes', lambda: volumes)
monkeypatch.setattr(activate, 'activate_filestore', capture)
- args = Args(osd_id=None, osd_fsid='1234')
+ args = Args(osd_id=None, osd_fsid='1234', filestore=True)
+ activate.Activate([]).activate(args)
+ assert capture.calls[0]['args'][0] == [FooVolume]
+
+ def test_no_osd_id_matches_fsid_bluestore(self, is_root, volumes, monkeypatch, capture):
+ FooVolume = api.Volume(lv_name='foo', lv_path='/dev/vg/foo', lv_tags="ceph.osd_fsid=1234")
+ volumes.append(FooVolume)
+ monkeypatch.setattr(api, 'Volumes', lambda: volumes)
+ monkeypatch.setattr(activate, 'activate_bluestore', capture)
+ args = Args(osd_id=None, osd_fsid='1234', bluestore=True)
activate.Activate([]).activate(args)
assert capture.calls[0]['args'][0] == [FooVolume]
args = Args(osd_id=None, osd_fsid='1234')
with pytest.raises(RuntimeError):
activate.Activate([]).activate(args)
+
+
+class TestActivateFlags(object):
+
+ def test_default_objectstore(self, capture):
+ args = ['0', 'asdf-ljh-asdf']
+ activation = activate.Activate(args)
+ activation.activate = capture
+ activation.main()
+ parsed_args = capture.calls[0]['args'][0]
+ assert parsed_args.filestore is False
+ assert parsed_args.bluestore is True
+
+ def test_uses_filestore(self, capture):
+ args = ['--filestore', '0', 'asdf-ljh-asdf']
+ activation = activate.Activate(args)
+ activation.activate = capture
+ activation.main()
+ parsed_args = capture.calls[0]['args'][0]
+ assert parsed_args.filestore is True
+ assert parsed_args.bluestore is False
+
+ def test_uses_bluestore(self, capture):
+ args = ['--bluestore', '0', 'asdf-ljh-asdf']
+ activation = activate.Activate(args)
+ activation.activate = capture
+ activation.main()
+ parsed_args = capture.calls[0]['args'][0]
+ assert parsed_args.filestore is False
+ assert parsed_args.bluestore is True
+++ /dev/null
-import pytest
-from ceph_volume import process, exceptions
-from ceph_volume.devices.lvm import api
-
-
-class TestParseTags(object):
-
- def test_no_tags_means_empty_dict(self):
- result = api.parse_tags('')
- assert result == {}
-
- def test_single_tag_gets_parsed(self):
- result = api.parse_tags('ceph.osd_something=1')
- assert result == {'ceph.osd_something': '1'}
-
- def test_multiple_csv_expands_in_dict(self):
- result = api.parse_tags('ceph.osd_something=1,ceph.foo=2,ceph.fsid=0000')
- # assert them piecemeal to avoid the un-ordered dict nature
- assert result['ceph.osd_something'] == '1'
- assert result['ceph.foo'] == '2'
- assert result['ceph.fsid'] == '0000'
-
-
-class TestGetAPIVgs(object):
-
- def test_report_is_emtpy(self, monkeypatch):
- monkeypatch.setattr(api.process, 'call', lambda x: ('\n\n', '', 0))
- assert api.get_api_vgs() == []
-
- def test_report_has_stuff(self, monkeypatch):
- report = [' VolGroup00']
- monkeypatch.setattr(api.process, 'call', lambda x: (report, '', 0))
- assert api.get_api_vgs() == [{'vg_name': 'VolGroup00'}]
-
- def test_report_has_stuff_with_empty_attrs(self, monkeypatch):
- report = [' VolGroup00 ;;;;;;9g']
- monkeypatch.setattr(api.process, 'call', lambda x: (report, '', 0))
- result = api.get_api_vgs()[0]
- assert len(result.keys()) == 7
- assert result['vg_name'] == 'VolGroup00'
- assert result['vg_free'] == '9g'
-
- def test_report_has_multiple_items(self, monkeypatch):
- report = [' VolGroup00;;;;;;;', ' ceph_vg;;;;;;;']
- monkeypatch.setattr(api.process, 'call', lambda x: (report, '', 0))
- result = api.get_api_vgs()
- assert result[0]['vg_name'] == 'VolGroup00'
- assert result[1]['vg_name'] == 'ceph_vg'
-
-
-class TestGetAPILvs(object):
-
- def test_report_is_emtpy(self, monkeypatch):
- monkeypatch.setattr(api.process, 'call', lambda x: ('', '', 0))
- assert api.get_api_lvs() == []
-
- def test_report_has_stuff(self, monkeypatch):
- report = [' ;/path;VolGroup00;root']
- monkeypatch.setattr(api.process, 'call', lambda x: (report, '', 0))
- result = api.get_api_lvs()
- assert result[0]['lv_name'] == 'VolGroup00'
-
- def test_report_has_multiple_items(self, monkeypatch):
- report = [' ;/path;VolName;root', ';/dev/path;ceph_lv;ceph_vg']
- monkeypatch.setattr(api.process, 'call', lambda x: (report, '', 0))
- result = api.get_api_lvs()
- assert result[0]['lv_name'] == 'VolName'
- assert result[1]['lv_name'] == 'ceph_lv'
-
-
-@pytest.fixture
-def volumes(monkeypatch):
- monkeypatch.setattr(process, 'call', lambda x: ('', '', 0))
- volumes = api.Volumes()
- volumes._purge()
- return volumes
-
-
-@pytest.fixture
-def pvolumes(monkeypatch):
- monkeypatch.setattr(process, 'call', lambda x: ('', '', 0))
- pvolumes = api.PVolumes()
- pvolumes._purge()
- return pvolumes
-
-
-@pytest.fixture
-def volume_groups(monkeypatch):
- monkeypatch.setattr(process, 'call', lambda x: ('', '', 0))
- vgs = api.VolumeGroups()
- vgs._purge()
- return vgs
-
-
-class TestGetLV(object):
-
- def test_nothing_is_passed_in(self):
- # so we return a None
- assert api.get_lv() is None
-
- def test_single_lv_is_matched(self, volumes, monkeypatch):
- FooVolume = api.Volume(lv_name='foo', lv_path='/dev/vg/foo', lv_tags="ceph.type=data")
- volumes.append(FooVolume)
- monkeypatch.setattr(api, 'Volumes', lambda: volumes)
- assert api.get_lv(lv_name='foo') == FooVolume
-
- def test_single_lv_is_matched_by_uuid(self, volumes, monkeypatch):
- FooVolume = api.Volume(
- lv_name='foo', lv_path='/dev/vg/foo',
- lv_uuid='1111', lv_tags="ceph.type=data")
- volumes.append(FooVolume)
- monkeypatch.setattr(api, 'Volumes', lambda: volumes)
- assert api.get_lv(lv_uuid='1111') == FooVolume
-
-
-class TestGetPV(object):
-
- def test_nothing_is_passed_in(self):
- # so we return a None
- assert api.get_pv() is None
-
- def test_single_pv_is_not_matched(self, pvolumes, monkeypatch):
- FooPVolume = api.PVolume(pv_name='/dev/sda', pv_uuid="0000", pv_tags={})
- pvolumes.append(FooPVolume)
- monkeypatch.setattr(api, 'PVolumes', lambda: pvolumes)
- assert api.get_pv(pv_uuid='foo') is None
-
- def test_single_pv_is_matched(self, pvolumes, monkeypatch):
- FooPVolume = api.PVolume(pv_name='/dev/sda', pv_uuid="0000", pv_tags={})
- pvolumes.append(FooPVolume)
- monkeypatch.setattr(api, 'PVolumes', lambda: pvolumes)
- assert api.get_pv(pv_uuid='0000') == FooPVolume
-
- def test_single_pv_is_matched_by_uuid(self, pvolumes, monkeypatch):
- FooPVolume = api.PVolume(
- pv_name='/dev/vg/foo',
- pv_uuid='1111', pv_tags="ceph.type=data")
- pvolumes.append(FooPVolume)
- monkeypatch.setattr(api, 'PVolumes', lambda: pvolumes)
- assert api.get_pv(pv_uuid='1111') == FooPVolume
-
-
-class TestPVolumes(object):
-
- def test_filter_by_tag_does_not_match_one(self, pvolumes, monkeypatch):
- pv_tags = "ceph.type=journal,ceph.osd_id=1,ceph.fsid=000-aaa"
- FooPVolume = api.PVolume(
- pv_name='/dev/vg/foo',
- pv_uuid='1111', pv_tags=pv_tags)
- pvolumes.append(FooPVolume)
- pvolumes.filter(pv_tags={'ceph.type': 'journal', 'ceph.osd_id': '2'})
- assert pvolumes == []
-
- def test_filter_by_tags_matches(self, pvolumes, monkeypatch):
- pv_tags = "ceph.type=journal,ceph.osd_id=1"
- FooPVolume = api.PVolume(
- pv_name='/dev/vg/foo',
- pv_uuid='1111', pv_tags=pv_tags)
- pvolumes.append(FooPVolume)
- pvolumes.filter(pv_tags={'ceph.type': 'journal', 'ceph.osd_id': '1'})
- assert pvolumes == [FooPVolume]
-
-
-class TestGetVG(object):
-
- def test_nothing_is_passed_in(self):
- # so we return a None
- assert api.get_vg() is None
-
- def test_single_vg_is_matched(self, volume_groups, monkeypatch):
- FooVG = api.VolumeGroup(vg_name='foo')
- volume_groups.append(FooVG)
- monkeypatch.setattr(api, 'VolumeGroups', lambda: volume_groups)
- assert api.get_vg(vg_name='foo') == FooVG
-
-
-class TestVolumes(object):
-
- def test_volume_get_has_no_volumes(self, volumes):
- assert volumes.get() is None
-
- def test_volume_get_filtered_has_no_volumes(self, volumes):
- assert volumes.get(lv_name='ceph') is None
-
- def test_volume_has_multiple_matches(self, volumes):
- volume1 = volume2 = api.Volume(lv_name='foo', lv_path='/dev/vg/lv', lv_tags='')
- volumes.append(volume1)
- volumes.append(volume2)
- with pytest.raises(exceptions.MultipleLVsError):
- volumes.get(lv_name='foo')
-
- def test_find_the_correct_one(self, volumes):
- volume1 = api.Volume(lv_name='volume1', lv_path='/dev/vg/lv', lv_tags='')
- volume2 = api.Volume(lv_name='volume2', lv_path='/dev/vg/lv', lv_tags='')
- volumes.append(volume1)
- volumes.append(volume2)
- assert volumes.get(lv_name='volume1') == volume1
-
- def test_filter_by_tag(self, volumes):
- lv_tags = "ceph.type=data,ceph.fsid=000-aaa"
- osd = api.Volume(lv_name='volume1', lv_path='/dev/vg/lv', lv_tags=lv_tags)
- journal = api.Volume(lv_name='volume2', lv_path='/dev/vg/lv', lv_tags='ceph.type=journal')
- volumes.append(osd)
- volumes.append(journal)
- volumes.filter(lv_tags={'ceph.type': 'data'})
- assert len(volumes) == 1
- assert volumes[0].lv_name == 'volume1'
-
- def test_filter_by_tag_does_not_match_one(self, volumes):
- lv_tags = "ceph.type=data,ceph.fsid=000-aaa"
- osd = api.Volume(lv_name='volume1', lv_path='/dev/vg/lv', lv_tags=lv_tags)
- journal = api.Volume(lv_name='volume2', lv_path='/dev/vg/lv', lv_tags='ceph.osd_id=1,ceph.type=journal')
- volumes.append(osd)
- volumes.append(journal)
- # note the different osd_id!
- volumes.filter(lv_tags={'ceph.type': 'data', 'ceph.osd_id': '2'})
- assert volumes == []
-
- def test_filter_by_vg_name(self, volumes):
- lv_tags = "ceph.type=data,ceph.fsid=000-aaa"
- osd = api.Volume(lv_name='volume1', vg_name='ceph_vg', lv_tags=lv_tags)
- journal = api.Volume(lv_name='volume2', vg_name='system_vg', lv_tags='ceph.type=journal')
- volumes.append(osd)
- volumes.append(journal)
- volumes.filter(vg_name='ceph_vg')
- assert len(volumes) == 1
- assert volumes[0].lv_name == 'volume1'
-
- def test_filter_by_lv_path(self, volumes):
- osd = api.Volume(lv_name='volume1', lv_path='/dev/volume1', lv_tags='')
- journal = api.Volume(lv_name='volume2', lv_path='/dev/volume2', lv_tags='')
- volumes.append(osd)
- volumes.append(journal)
- volumes.filter(lv_path='/dev/volume1')
- assert len(volumes) == 1
- assert volumes[0].lv_name == 'volume1'
-
- def test_filter_by_lv_uuid(self, volumes):
- osd = api.Volume(lv_name='volume1', lv_path='/dev/volume1', lv_uuid='1111', lv_tags='')
- journal = api.Volume(lv_name='volume2', lv_path='/dev/volume2', lv_uuid='', lv_tags='')
- volumes.append(osd)
- volumes.append(journal)
- volumes.filter(lv_uuid='1111')
- assert len(volumes) == 1
- assert volumes[0].lv_name == 'volume1'
-
- def test_filter_by_lv_uuid_nothing_found(self, volumes):
- osd = api.Volume(lv_name='volume1', lv_path='/dev/volume1', lv_uuid='1111', lv_tags='')
- journal = api.Volume(lv_name='volume2', lv_path='/dev/volume2', lv_uuid='', lv_tags='')
- volumes.append(osd)
- volumes.append(journal)
- volumes.filter(lv_uuid='22222')
- assert volumes == []
-
- def test_filter_requires_params(self, volumes):
- with pytest.raises(TypeError):
- volumes.filter()
-
-
-class TestVolumeGroups(object):
-
- def test_volume_get_has_no_volume_groups(self, volume_groups):
- assert volume_groups.get() is None
-
- def test_volume_get_filtered_has_no_volumes(self, volume_groups):
- assert volume_groups.get(vg_name='ceph') is None
-
- def test_volume_has_multiple_matches(self, volume_groups):
- volume1 = volume2 = api.VolumeGroup(vg_name='foo', lv_path='/dev/vg/lv', lv_tags='')
- volume_groups.append(volume1)
- volume_groups.append(volume2)
- with pytest.raises(exceptions.MultipleVGsError):
- volume_groups.get(vg_name='foo')
-
- def test_find_the_correct_one(self, volume_groups):
- volume1 = api.VolumeGroup(vg_name='volume1', lv_tags='')
- volume2 = api.VolumeGroup(vg_name='volume2', lv_tags='')
- volume_groups.append(volume1)
- volume_groups.append(volume2)
- assert volume_groups.get(vg_name='volume1') == volume1
-
- def test_filter_by_tag(self, volume_groups):
- vg_tags = "ceph.group=dmcache"
- osd = api.VolumeGroup(vg_name='volume1', vg_tags=vg_tags)
- journal = api.VolumeGroup(vg_name='volume2', vg_tags='ceph.group=plain')
- volume_groups.append(osd)
- volume_groups.append(journal)
- volume_groups.filter(vg_tags={'ceph.group': 'dmcache'})
- assert len(volume_groups) == 1
- assert volume_groups[0].vg_name == 'volume1'
-
- def test_filter_by_tag_does_not_match_one(self, volume_groups):
- vg_tags = "ceph.group=dmcache,ceph.disk_type=ssd"
- osd = api.VolumeGroup(vg_name='volume1', vg_path='/dev/vg/lv', vg_tags=vg_tags)
- volume_groups.append(osd)
- volume_groups.filter(vg_tags={'ceph.group': 'data', 'ceph.disk_type': 'ssd'})
- assert volume_groups == []
-
- def test_filter_by_vg_name(self, volume_groups):
- vg_tags = "ceph.type=data,ceph.fsid=000-aaa"
- osd = api.VolumeGroup(vg_name='ceph_vg', vg_tags=vg_tags)
- journal = api.VolumeGroup(vg_name='volume2', vg_tags='ceph.type=journal')
- volume_groups.append(osd)
- volume_groups.append(journal)
- volume_groups.filter(vg_name='ceph_vg')
- assert len(volume_groups) == 1
- assert volume_groups[0].vg_name == 'ceph_vg'
-
- def test_filter_requires_params(self, volume_groups):
- with pytest.raises(TypeError):
- volume_groups.filter()
-
-
-class TestCreateLV(object):
-
- def setup(self):
- self.foo_volume = api.Volume(lv_name='foo', lv_path='/path', vg_name='foo_group', lv_tags='')
-
- def test_uses_size(self, monkeypatch, capture):
- monkeypatch.setattr(process, 'run', capture)
- monkeypatch.setattr(process, 'call', capture)
- monkeypatch.setattr(api, 'get_lv', lambda *a, **kw: self.foo_volume)
- api.create_lv('foo', 'foo_group', size=5, type='data')
- expected = ['sudo', 'lvcreate', '--yes', '-L', '5G', '-n', 'foo', 'foo_group']
- assert capture.calls[0]['args'][0] == expected
-
- def test_calls_to_set_type_tag(self, monkeypatch, capture):
- monkeypatch.setattr(process, 'run', capture)
- monkeypatch.setattr(process, 'call', capture)
- monkeypatch.setattr(api, 'get_lv', lambda *a, **kw: self.foo_volume)
- api.create_lv('foo', 'foo_group', size=5, type='data')
- ceph_tag = ['sudo', 'lvchange', '--addtag', 'ceph.type=data', '/path']
- assert capture.calls[1]['args'][0] == ceph_tag
-
- def test_calls_to_set_data_tag(self, monkeypatch, capture):
- monkeypatch.setattr(process, 'run', capture)
- monkeypatch.setattr(process, 'call', capture)
- monkeypatch.setattr(api, 'get_lv', lambda *a, **kw: self.foo_volume)
- api.create_lv('foo', 'foo_group', size=5, type='data')
- data_tag = ['sudo', 'lvchange', '--addtag', 'ceph.data_device=/path', '/path']
- assert capture.calls[2]['args'][0] == data_tag
--- /dev/null
+import pytest
+from ceph_volume.devices import lvm
+from ceph_volume.api import lvm as api
+
+
+class TestReadableTag(object):
+
+ def test_dots_get_replaced(self):
+ result = lvm.listing.readable_tag('ceph.foo')
+ assert result == 'foo'
+
+ def test_underscores_are_replaced_with_spaces(self):
+ result = lvm.listing.readable_tag('ceph.long_tag')
+ assert result == 'long tag'
+
+
+class TestPrettyReport(object):
+
+ def test_is_empty(self, capsys):
+ lvm.listing.pretty_report({})
+ stdout, stderr = capsys.readouterr()
+ assert stdout == '\n'
+
+ def test_type_and_path_are_reported(self, capsys):
+ lvm.listing.pretty_report({0: [{'type': 'data', 'path': '/dev/sda1'}]})
+ stdout, stderr = capsys.readouterr()
+ assert '[data] /dev/sda1' in stdout
+
+ def test_osd_id_header_is_reported(self, capsys):
+ lvm.listing.pretty_report({0: [{'type': 'data', 'path': '/dev/sda1'}]})
+ stdout, stderr = capsys.readouterr()
+ assert '====== osd.0 =======' in stdout
+
+ def test_tags_are_included(self, capsys):
+ lvm.listing.pretty_report(
+ {0: [{
+ 'type': 'data',
+ 'path': '/dev/sda1',
+ 'tags': {'ceph.osd_id': '0'}
+ }]}
+ )
+ stdout, stderr = capsys.readouterr()
+ assert 'osd id' in stdout
+
+
+class TestList(object):
+
+ def test_empty_full_json_zero_exit_status(self, is_root, volumes, factory, capsys):
+ args = factory(format='json', device=None)
+ lvm.listing.List([]).list(args)
+ stdout, stderr = capsys.readouterr()
+ assert stdout == '{}\n'
+
+ def test_empty_device_json_zero_exit_status(self, is_root, volumes, factory, capsys):
+ args = factory(format='json', device='/dev/sda1')
+ lvm.listing.List([]).list(args)
+ stdout, stderr = capsys.readouterr()
+ assert stdout == '{}\n'
+
+ def test_empty_full_zero_exit_status(self, is_root, volumes, factory):
+ args = factory(format='pretty', device=None)
+ with pytest.raises(SystemExit):
+ lvm.listing.List([]).list(args)
+
+ def test_empty_device_zero_exit_status(self, is_root, volumes, factory):
+ args = factory(format='pretty', device='/dev/sda1')
+ with pytest.raises(SystemExit):
+ lvm.listing.List([]).list(args)
+
+
+class TestFullReport(object):
+
+ def test_no_ceph_lvs(self, volumes, monkeypatch):
+        # Ceph LVs are detected by inspecting their tags
+ osd = api.Volume(lv_name='volume1', lv_path='/dev/VolGroup/lv', lv_tags={})
+ volumes.append(osd)
+ monkeypatch.setattr(lvm.listing.api, 'Volumes', lambda: volumes)
+ result = lvm.listing.List([]).full_report()
+ assert result == {}
+
+ def test_ceph_data_lv_reported(self, volumes, monkeypatch):
+ tags = 'ceph.osd_id=0,ceph.journal_uuid=x,ceph.type=data'
+ osd = api.Volume(
+ lv_name='volume1', lv_uuid='y', lv_path='/dev/VolGroup/lv', lv_tags=tags)
+ volumes.append(osd)
+ monkeypatch.setattr(lvm.listing.api, 'Volumes', lambda: volumes)
+ result = lvm.listing.List([]).full_report()
+ assert result['0'][0]['name'] == 'volume1'
+
+ def test_ceph_journal_lv_reported(self, volumes, monkeypatch):
+ tags = 'ceph.osd_id=0,ceph.journal_uuid=x,ceph.type=data'
+ journal_tags = 'ceph.osd_id=0,ceph.journal_uuid=x,ceph.type=journal'
+ osd = api.Volume(
+ lv_name='volume1', lv_uuid='y', lv_path='/dev/VolGroup/lv', lv_tags=tags)
+ journal = api.Volume(
+ lv_name='journal', lv_uuid='x', lv_path='/dev/VolGroup/journal', lv_tags=journal_tags)
+ volumes.append(osd)
+ volumes.append(journal)
+ monkeypatch.setattr(lvm.listing.api, 'Volumes', lambda: volumes)
+ result = lvm.listing.List([]).full_report()
+ assert result['0'][0]['name'] == 'volume1'
+ assert result['0'][1]['name'] == 'journal'
+
+ def test_ceph_wal_lv_reported(self, volumes, monkeypatch):
+ tags = 'ceph.osd_id=0,ceph.wal_uuid=x,ceph.type=data'
+ wal_tags = 'ceph.osd_id=0,ceph.wal_uuid=x,ceph.type=wal'
+ osd = api.Volume(
+ lv_name='volume1', lv_uuid='y', lv_path='/dev/VolGroup/lv', lv_tags=tags)
+ wal = api.Volume(
+ lv_name='wal', lv_uuid='x', lv_path='/dev/VolGroup/wal', lv_tags=wal_tags)
+ volumes.append(osd)
+ volumes.append(wal)
+ monkeypatch.setattr(lvm.listing.api, 'Volumes', lambda: volumes)
+ result = lvm.listing.List([]).full_report()
+ assert result['0'][0]['name'] == 'volume1'
+ assert result['0'][1]['name'] == 'wal'
+
+ def test_physical_journal_gets_reported(self, volumes, monkeypatch):
+ tags = 'ceph.osd_id=0,ceph.journal_uuid=x,ceph.type=data'
+ osd = api.Volume(
+ lv_name='volume1', lv_uuid='y', lv_path='/dev/VolGroup/lv', lv_tags=tags)
+ volumes.append(osd)
+ monkeypatch.setattr(lvm.listing.api, 'Volumes', lambda: volumes)
+ monkeypatch.setattr(lvm.listing.disk, 'get_device_from_partuuid', lambda x: '/dev/sda1')
+ result = lvm.listing.List([]).full_report()
+ assert result['0'][1]['path'] == '/dev/sda1'
+ assert result['0'][1]['tags'] == {'PARTUUID': 'x'}
+ assert result['0'][1]['type'] == 'journal'
+
+ def test_physical_wal_gets_reported(self, volumes, monkeypatch):
+ tags = 'ceph.osd_id=0,ceph.wal_uuid=x,ceph.type=data'
+ osd = api.Volume(
+ lv_name='volume1', lv_uuid='y', lv_path='/dev/VolGroup/lv', lv_tags=tags)
+ volumes.append(osd)
+ monkeypatch.setattr(lvm.listing.api, 'Volumes', lambda: volumes)
+ monkeypatch.setattr(lvm.listing.disk, 'get_device_from_partuuid', lambda x: '/dev/sda1')
+ result = lvm.listing.List([]).full_report()
+ assert result['0'][1]['path'] == '/dev/sda1'
+ assert result['0'][1]['tags'] == {'PARTUUID': 'x'}
+ assert result['0'][1]['type'] == 'wal'
+
+
+class TestSingleReport(object):
+
+ def test_not_a_ceph_lv(self, volumes, monkeypatch):
+        # Ceph LVs are detected by inspecting their tags
+ lv = api.Volume(
+ lv_name='lv', vg_name='VolGroup', lv_path='/dev/VolGroup/lv', lv_tags={})
+ volumes.append(lv)
+ monkeypatch.setattr(lvm.listing.api, 'Volumes', lambda: volumes)
+ result = lvm.listing.List([]).single_report('VolGroup/lv')
+ assert result == {}
+
+ def test_report_a_ceph_lv(self, volumes, monkeypatch):
+        # Ceph LVs are detected by inspecting their tags
+ tags = 'ceph.osd_id=0,ceph.journal_uuid=x,ceph.type=data'
+ lv = api.Volume(
+ lv_name='lv', vg_name='VolGroup', lv_path='/dev/VolGroup/lv', lv_tags=tags)
+ volumes.append(lv)
+ monkeypatch.setattr(lvm.listing.api, 'Volumes', lambda: volumes)
+ result = lvm.listing.List([]).single_report('VolGroup/lv')
+ assert result['0'][0]['name'] == 'lv'
+ assert result['0'][0]['lv_tags'] == tags
+ assert result['0'][0]['path'] == '/dev/VolGroup/lv'
+
+ def test_report_a_ceph_journal_device(self, volumes, monkeypatch):
+        # Ceph LVs are detected by inspecting their tags
+ tags = 'ceph.osd_id=0,ceph.journal_uuid=x,ceph.type=data,ceph.journal_device=/dev/sda1'
+ lv = api.Volume(
+ lv_name='lv', vg_name='VolGroup', lv_path='/dev/VolGroup/lv', lv_tags=tags)
+ volumes.append(lv)
+ monkeypatch.setattr(lvm.listing.api, 'Volumes', lambda: volumes)
+ result = lvm.listing.List([]).single_report('/dev/sda1')
+ assert result['0'][0]['tags'] == {'PARTUUID': 'x'}
+ assert result['0'][0]['type'] == 'journal'
+ assert result['0'][0]['path'] == '/dev/sda1'
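
The listing tests above exercise the human-readable report end to end. A minimal
sketch that satisfies those assertions (the real helpers live in
``ceph_volume.devices.lvm.listing``; the format strings here are inferred from
the tests, not copied from the implementation)::

    import sys

    def readable_tag(tag):
        # 'ceph.journal_uuid' -> 'journal uuid'
        name = tag.split('.')[-1]
        return name.replace('_', ' ')

    def pretty_report(report):
        # an empty report still prints the single newline the tests expect
        output = ['\n']
        for osd_id, devices in sorted(report.items()):
            output.append('====== osd.%s =======\n\n' % osd_id)
            for device in devices:
                output.append('  [%s] %s\n' % (device['type'], device['path']))
                for tag, value in sorted(device.get('tags', {}).items()):
                    output.append('      %s: %s\n' % (readable_tag(tag), value))
        sys.stdout.write(''.join(output))
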
with pytest.raises(SystemExit):
lvm.prepare.Prepare(argv=['--help']).main()
stdout, stderr = capsys.readouterr()
- assert 'required arguments:' in stdout
- assert 'A logical group name or a path' in stdout
+ assert 'Use the filestore objectstore' in stdout
+ assert 'Use the bluestore objectstore' in stdout
+ assert 'A physical device or logical' in stdout
class TestGetJournalLV(object):
def test_no_journal_on_invalid_path(self, monkeypatch, arg):
monkeypatch.setattr(lvm.prepare.api, 'get_lv', lambda **kw: False)
prepare = lvm.prepare.Prepare([])
- assert prepare.get_journal_lv(arg) is None
+ assert prepare.get_lv(arg) is None
def test_no_journal_lv_found(self, monkeypatch):
# patch it with 0 so we know we are getting to get_lv
monkeypatch.setattr(lvm.prepare.api, 'get_lv', lambda **kw: 0)
prepare = lvm.prepare.Prepare([])
- assert prepare.get_journal_lv('vg/lv') == 0
+ assert prepare.get_lv('vg/lv') == 0
class TestActivate(object):
--- /dev/null
+import os
+import pytest
+from ceph_volume.devices.simple import activate
+
+
+class TestActivate(object):
+
+ def test_no_data_uuid(self, factory, tmpfile, is_root, monkeypatch, capture):
+ json_config = tmpfile(contents='{}')
+ args = factory(osd_id='0', osd_fsid='1234', json_config=json_config)
+ with pytest.raises(RuntimeError):
+ activate.Activate([]).activate(args)
+
+ def test_invalid_json_path(self):
+ os.environ['CEPH_VOLUME_SIMPLE_JSON_DIR'] = '/non/existing/path'
+ with pytest.raises(RuntimeError) as error:
+ activate.Activate(['1', 'asdf']).main()
+ assert 'RuntimeError: Expected JSON config path not found' in str(error)
+
+ def test_main_spits_help_with_no_arguments(self, capsys):
+ activate.Activate([]).main()
+ stdout, stderr = capsys.readouterr()
+ assert 'Activate OSDs by mounting devices previously configured' in stdout
--- /dev/null
+import os
+import pytest
+from ceph_volume.devices.simple import scan
+
+
+class TestScan(object):
+
+ def test_main_spits_help_with_no_arguments(self, capsys):
+ scan.Scan([]).main()
+ stdout, stderr = capsys.readouterr()
+ assert 'Scan an OSD directory for files' in stdout
+
+
+class TestGetContents(object):
+
+ def test_multiple_lines_are_left_as_is(self, tmpfile):
+ magic_file = tmpfile(contents='first\nsecond\n')
+ scanner = scan.Scan([])
+ assert scanner.get_contents(magic_file) == 'first\nsecond\n'
+
+ def test_extra_whitespace_gets_removed(self, tmpfile):
+ magic_file = tmpfile(contents='first ')
+ scanner = scan.Scan([])
+ assert scanner.get_contents(magic_file) == 'first'
+
+ def test_single_newline_values_are_trimmed(self, tmpfile):
+ magic_file = tmpfile(contents='first\n')
+ scanner = scan.Scan([])
+ assert scanner.get_contents(magic_file) == 'first'
+
+
+class TestEtcPath(object):
+
+ def test_directory_is_valid(self, tmpdir):
+ path = str(tmpdir)
+ scanner = scan.Scan([])
+ scanner._etc_path = path
+ assert scanner.etc_path == path
+
+ def test_directory_does_not_exist_gets_created(self, tmpdir):
+ path = os.path.join(str(tmpdir), 'subdir')
+ scanner = scan.Scan([])
+ scanner._etc_path = path
+ assert scanner.etc_path == path
+ assert os.path.isdir(path)
+
+ def test_complains_when_file(self, tmpfile):
+ path = tmpfile()
+ scanner = scan.Scan([])
+ scanner._etc_path = path
+ with pytest.raises(RuntimeError):
+ scanner.etc_path
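
``TestGetContents`` pins down simple trimming rules: multi-line files pass
through untouched while single-value files lose stray whitespace. A standalone
sketch of that behavior (``Scan.get_contents`` is a method in the real module)::

    def get_contents(path):
        with open(path) as f:
            lines = f.readlines()
        if len(lines) > 1:
            # multi-line files (e.g. a keyring) are returned untouched
            return ''.join(lines)
        # single-value files get surrounding whitespace and newlines trimmed
        return ''.join(lines).strip()
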
--- /dev/null
+import pytest
+from ceph_volume import exceptions
+from ceph_volume.devices.simple import trigger
+
+
+class TestParseOSDid(object):
+
+ def test_no_id_found_if_no_digit(self):
+ with pytest.raises(exceptions.SuffixParsingError):
+ trigger.parse_osd_id('asdlj-ljahsdfaslkjhdfa')
+
+ def test_no_id_found(self):
+ with pytest.raises(exceptions.SuffixParsingError):
+ trigger.parse_osd_id('ljahsdfaslkjhdfa')
+
+ def test_id_found(self):
+ result = trigger.parse_osd_id('1-ljahsdfaslkjhdfa')
+ assert result == '1'
+
+
+class TestParseOSDUUID(object):
+
+ def test_uuid_is_parsed(self):
+ result = trigger.parse_osd_uuid('1-asdf-ljkh-asdf-ljkh-asdf')
+ assert result == 'asdf-ljkh-asdf-ljkh-asdf'
+
+ def test_uuid_is_parsed_longer_sha1(self):
+ result = trigger.parse_osd_uuid('1-foo-bar-asdf-ljkh-asdf-ljkh-asdf')
+ assert result == 'foo-bar-asdf-ljkh-asdf-ljkh-asdf'
+
+ def test_uuid_is_not_found(self):
+ with pytest.raises(exceptions.SuffixParsingError):
+ trigger.parse_osd_uuid('ljahsdfaslkjhdfa')
+
+ def test_uuid_is_not_found_missing_id(self):
+ with pytest.raises(exceptions.SuffixParsingError):
+ trigger.parse_osd_uuid('ljahs-dfa-slkjhdfa-foo')
+
+ def test_robust_double_id_in_uuid(self):
+        # the OSD id digit can also appear inside the UUID itself;
+        # parsing must still strip only the leading '<id>-' prefix
+        result = trigger.parse_osd_uuid('1-abc959fd-1ec9-4864-b141-3154f9b9f8ed')
+ assert result == 'abc959fd-1ec9-4864-b141-3154f9b9f8ed'
+
+
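
The trigger suffix has the shape ``<osd id>-<osd uuid>`` and is split on the
leading dash only, which is why an id digit occurring inside the UUID is
harmless. A sketch matching the assertions above (the ``SuffixParsingError``
arguments are an assumption)::

    from ceph_volume import exceptions

    def parse_osd_id(string):
        osd_id = string.split('-', 1)[0]
        if osd_id and osd_id.isdigit():
            return osd_id
        raise exceptions.SuffixParsingError('OSD id', string)

    def parse_osd_uuid(string):
        # strip the leading '<id>-' prefix; everything after it is the uuid
        prefix = '%s-' % parse_osd_id(string)
        osd_uuid = string.split(prefix, 1)[-1]
        if not osd_uuid:
            raise exceptions.SuffixParsingError('OSD uuid', string)
        return osd_uuid
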
--- /dev/null
+import pytest
+from ceph_volume.devices import lvm
+
+
+class TestZap(object):
+
+ def test_main_spits_help_with_no_arguments(self, capsys):
+ lvm.zap.Zap([]).main()
+ stdout, stderr = capsys.readouterr()
+ assert 'Zaps the given logical volume or partition' in stdout
+
+ def test_main_shows_full_help(self, capsys):
+ with pytest.raises(SystemExit):
+ lvm.zap.Zap(argv=['--help']).main()
+ stdout, stderr = capsys.readouterr()
+ assert 'optional arguments' in stdout
+ assert 'positional arguments' in stdout
+++ /dev/null
-../../Vagrantfile
\ No newline at end of file
+++ /dev/null
----
-
-ceph_dev: True
-cluster: ceph
-public_network: "192.168.3.0/24"
-cluster_network: "192.168.4.0/24"
-monitor_interface: eth1
-journal_size: 100
-osd_objectstore: "filestore"
-osd_scenario: lvm
-ceph_origin: 'repository'
-ceph_repository: 'dev'
-copy_admin_key: true
-# test-volume is created by tests/functional/lvm_setup.yml from /dev/sda
-lvm_volumes:
- - data: data-lv1
- journal: /dev/sdc1
- data_vg: test_group
- - data: data-lv2
- journal: journal1
- data_vg: test_group
- journal_vg: journals
-os_tuning_params:
- - { name: kernel.pid_max, value: 4194303 }
- - { name: fs.file-max, value: 26234859 }
-ceph_conf_overrides:
- global:
- osd_pool_default_pg_num: 8
- osd_pool_default_size: 1
+++ /dev/null
-[mons]
-mon0
-
-[osds]
-osd0
+++ /dev/null
----
-
-# DEFINE THE NUMBER OF VMS TO RUN
-mon_vms: 1
-osd_vms: 1
-mds_vms: 0
-rgw_vms: 0
-nfs_vms: 0
-rbd_mirror_vms: 0
-client_vms: 0
-iscsi_gw_vms: 0
-mgr_vms: 0
-
-# SUBNETS TO USE FOR THE VMS
-public_subnet: 192.168.3
-cluster_subnet: 192.168.4
-
-# MEMORY
-# set 1024 for CentOS
-memory: 512
-
-# Ethernet interface name
-# use eth1 for libvirt and ubuntu precise, enp0s8 for CentOS and ubuntu xenial
-eth: 'eth1'
-
-
-# VAGRANT BOX
-# Ceph boxes are *strongly* suggested. They are under better control and will
-# not get updated frequently unless required for build systems. These are (for
-# now):
-#
-# * ceph/ubuntu-xenial
-#
-# Ubuntu: ceph/ubuntu-xenial bento/ubuntu-16.04 or ubuntu/trusty64 or ubuntu/wily64
-# CentOS: bento/centos-7.1 or puppetlabs/centos-7.0-64-puppet
-# libvirt CentOS: centos/7
-# parallels Ubuntu: parallels/ubuntu-14.04
-# Debian: deb/jessie-amd64 - be careful the storage controller is named 'SATA Controller'
-# For more boxes have a look at:
-# - https://atlas.hashicorp.com/boxes/search?utf8=✓&sort=&provider=virtualbox&q=
-# - https://download.gluster.org/pub/gluster/purpleidea/vagrant/
-vagrant_box: centos/7
-#ssh_private_key_path: "~/.ssh/id_rsa"
-# The sync directory changes based on vagrant box
-# Set to /home/vagrant/sync for Centos/7, /home/{ user }/vagrant for openstack and defaults to /vagrant
-#vagrant_sync_dir: /home/vagrant/sync
-#vagrant_sync_dir: /
-# Disables synced folder creation. Not needed for testing, will skip mounting
-# the vagrant directory on the remote box regardless of the provider.
-vagrant_disable_synced_folder: true
-# VAGRANT URL
-# This is a URL to download an image from an alternate location. vagrant_box
-# above should be set to the filename of the image.
-# Fedora virtualbox: https://download.fedoraproject.org/pub/fedora/linux/releases/22/Cloud/x86_64/Images/Fedora-Cloud-Base-Vagrant-22-20150521.x86_64.vagrant-virtualbox.box
-# Fedora libvirt: https://download.fedoraproject.org/pub/fedora/linux/releases/22/Cloud/x86_64/Images/Fedora-Cloud-Base-Vagrant-22-20150521.x86_64.vagrant-libvirt.box
-# vagrant_box_url: https://download.fedoraproject.org/pub/fedora/linux/releases/22/Cloud/x86_64/Images/Fedora-Cloud-Base-Vagrant-22-20150521.x86_64.vagrant-virtualbox.box
--- /dev/null
+../../../../Vagrantfile
\ No newline at end of file
--- /dev/null
+---
+
+ceph_dev: True
+cluster: ceph
+public_network: "192.168.3.0/24"
+cluster_network: "192.168.4.0/24"
+monitor_interface: eth1
+journal_size: 100
+osd_objectstore: "bluestore"
+osd_scenario: lvm
+ceph_origin: 'repository'
+ceph_repository: 'dev'
+copy_admin_key: true
+lvm_volumes:
+ - data: data-lv1
+ data_vg: test_group
+ - data: data-lv2
+ data_vg: test_group
+ db: journal1
+ db_vg: journals
+ - data: /dev/sdd1
+os_tuning_params:
+ - { name: kernel.pid_max, value: 4194303 }
+ - { name: fs.file-max, value: 26234859 }
+ceph_conf_overrides:
+ global:
+ osd_pool_default_pg_num: 8
+ osd_pool_default_size: 1
--- /dev/null
+[mons]
+mon0
+
+[osds]
+osd0
--- /dev/null
+../../../playbooks/setup_partitions.yml
\ No newline at end of file
--- /dev/null
+---
+
+# DEFINE THE NUMBER OF VMS TO RUN
+mon_vms: 1
+osd_vms: 1
+mds_vms: 0
+rgw_vms: 0
+nfs_vms: 0
+rbd_mirror_vms: 0
+client_vms: 0
+iscsi_gw_vms: 0
+mgr_vms: 0
+
+# SUBNETS TO USE FOR THE VMS
+public_subnet: 192.168.3
+cluster_subnet: 192.168.4
+
+# MEMORY
+# set 1024 for CentOS
+memory: 512
+
+# Ethernet interface name
+# use eth1 for libvirt and ubuntu precise, enp0s8 for CentOS and ubuntu xenial
+eth: 'eth1'
+
+
+# VAGRANT BOX
+# Ceph boxes are *strongly* suggested. They are under better control and will
+# not get updated frequently unless required for build systems. These are (for
+# now):
+#
+# * ceph/ubuntu-xenial
+#
+# Ubuntu: ceph/ubuntu-xenial bento/ubuntu-16.04 or ubuntu/trusty64 or ubuntu/wily64
+# CentOS: bento/centos-7.1 or puppetlabs/centos-7.0-64-puppet
+# libvirt CentOS: centos/7
+# parallels Ubuntu: parallels/ubuntu-14.04
+# Debian: deb/jessie-amd64 - be careful the storage controller is named 'SATA Controller'
+# For more boxes have a look at:
+# - https://atlas.hashicorp.com/boxes/search?utf8=✓&sort=&provider=virtualbox&q=
+# - https://download.gluster.org/pub/gluster/purpleidea/vagrant/
+vagrant_box: centos/7
+#ssh_private_key_path: "~/.ssh/id_rsa"
+# The sync directory changes based on vagrant box
+# Set to /home/vagrant/sync for Centos/7, /home/{ user }/vagrant for openstack and defaults to /vagrant
+#vagrant_sync_dir: /home/vagrant/sync
+#vagrant_sync_dir: /
+# Disables synced folder creation. Not needed for testing, will skip mounting
+# the vagrant directory on the remote box regardless of the provider.
+vagrant_disable_synced_folder: true
+# VAGRANT URL
+# This is a URL to download an image from an alternate location. vagrant_box
+# above should be set to the filename of the image.
+# Fedora virtualbox: https://download.fedoraproject.org/pub/fedora/linux/releases/22/Cloud/x86_64/Images/Fedora-Cloud-Base-Vagrant-22-20150521.x86_64.vagrant-virtualbox.box
+# Fedora libvirt: https://download.fedoraproject.org/pub/fedora/linux/releases/22/Cloud/x86_64/Images/Fedora-Cloud-Base-Vagrant-22-20150521.x86_64.vagrant-libvirt.box
+# vagrant_box_url: https://download.fedoraproject.org/pub/fedora/linux/releases/22/Cloud/x86_64/Images/Fedora-Cloud-Base-Vagrant-22-20150521.x86_64.vagrant-virtualbox.box
--- /dev/null
+../../../../Vagrantfile
\ No newline at end of file
--- /dev/null
+---
+
+ceph_dev: True
+cluster: ceph
+public_network: "192.168.3.0/24"
+cluster_network: "192.168.4.0/24"
+monitor_interface: eth1
+journal_size: 100
+osd_objectstore: "filestore"
+osd_scenario: lvm
+ceph_origin: 'repository'
+ceph_repository: 'dev'
+copy_admin_key: true
+# test-volume is created by tests/functional/lvm_setup.yml from /dev/sda
+lvm_volumes:
+ - data: data-lv1
+ journal: /dev/sdc1
+ data_vg: test_group
+ - data: data-lv2
+ journal: journal1
+ data_vg: test_group
+ journal_vg: journals
+ - data: /dev/sdd1
+ journal: /dev/sdd2
+os_tuning_params:
+ - { name: kernel.pid_max, value: 4194303 }
+ - { name: fs.file-max, value: 26234859 }
+ceph_conf_overrides:
+ global:
+ osd_pool_default_pg_num: 8
+ osd_pool_default_size: 1
--- /dev/null
+[mons]
+mon0
+
+[osds]
+osd0
--- /dev/null
+../../../playbooks/setup_partitions.yml
\ No newline at end of file
--- /dev/null
+---
+
+# DEFINE THE NUMBER OF VMS TO RUN
+mon_vms: 1
+osd_vms: 1
+mds_vms: 0
+rgw_vms: 0
+nfs_vms: 0
+rbd_mirror_vms: 0
+client_vms: 0
+iscsi_gw_vms: 0
+mgr_vms: 0
+
+# SUBNETS TO USE FOR THE VMS
+public_subnet: 192.168.3
+cluster_subnet: 192.168.4
+
+# MEMORY
+# set 1024 for CentOS
+memory: 512
+
+# Ethernet interface name
+# use eth1 for libvirt and ubuntu precise, enp0s8 for CentOS and ubuntu xenial
+eth: 'eth1'
+
+
+# VAGRANT BOX
+# Ceph boxes are *strongly* suggested. They are under better control and will
+# not get updated frequently unless required for build systems. These are (for
+# now):
+#
+# * ceph/ubuntu-xenial
+#
+# Ubuntu: ceph/ubuntu-xenial bento/ubuntu-16.04 or ubuntu/trusty64 or ubuntu/wily64
+# CentOS: bento/centos-7.1 or puppetlabs/centos-7.0-64-puppet
+# libvirt CentOS: centos/7
+# parallels Ubuntu: parallels/ubuntu-14.04
+# Debian: deb/jessie-amd64 - be careful the storage controller is named 'SATA Controller'
+# For more boxes have a look at:
+# - https://atlas.hashicorp.com/boxes/search?utf8=✓&sort=&provider=virtualbox&q=
+# - https://download.gluster.org/pub/gluster/purpleidea/vagrant/
+vagrant_box: centos/7
+#ssh_private_key_path: "~/.ssh/id_rsa"
+# The sync directory changes based on vagrant box
+# Set to /home/vagrant/sync for Centos/7, /home/{ user }/vagrant for openstack and defaults to /vagrant
+#vagrant_sync_dir: /home/vagrant/sync
+#vagrant_sync_dir: /
+# Disables synced folder creation. Not needed for testing, will skip mounting
+# the vagrant directory on the remote box regardless of the provider.
+vagrant_disable_synced_folder: true
+# VAGRANT URL
+# This is a URL to download an image from an alternate location. vagrant_box
+# above should be set to the filename of the image.
+# Fedora virtualbox: https://download.fedoraproject.org/pub/fedora/linux/releases/22/Cloud/x86_64/Images/Fedora-Cloud-Base-Vagrant-22-20150521.x86_64.vagrant-virtualbox.box
+# Fedora libvirt: https://download.fedoraproject.org/pub/fedora/linux/releases/22/Cloud/x86_64/Images/Fedora-Cloud-Base-Vagrant-22-20150521.x86_64.vagrant-libvirt.box
+# vagrant_box_url: https://download.fedoraproject.org/pub/fedora/linux/releases/22/Cloud/x86_64/Images/Fedora-Cloud-Base-Vagrant-22-20150521.x86_64.vagrant-virtualbox.box
--- /dev/null
+---
+
+- hosts: osds
+ gather_facts: false
+ become: yes
+ tasks:
+
+ - name: partition /dev/sdd for lvm data usage
+ parted:
+ device: /dev/sdd
+ number: 1
+ part_start: 0%
+ part_end: 50%
+ unit: '%'
+ label: gpt
+ state: present
+
+  - name: partition /dev/sdd for lvm journal usage
+ parted:
+ device: /dev/sdd
+ number: 2
+ part_start: 50%
+ part_end: 100%
+ unit: '%'
+ state: present
+ label: gpt
+
--- /dev/null
+[tox]
+envlist = {centos7,xenial}-{filestore,bluestore}-{create,prepare_activate}
+skipsdist = True
+
+[testenv]
+whitelist_externals =
+ vagrant
+ bash
+ git
+passenv=*
+setenv=
+ ANSIBLE_SSH_ARGS = -F {changedir}/vagrant_ssh_config
+ ANSIBLE_STDOUT_CALLBACK = debug
+ ANSIBLE_RETRY_FILES_ENABLED = False
+ VAGRANT_CWD = {changedir}
+ CEPH_VOLUME_DEBUG = 1
+deps=
+ ansible==2.4.1
+ testinfra==1.7.1
+ pytest-xdist
+changedir=
+ centos7-filestore-create: {toxinidir}/centos7/filestore/create
+ centos7-bluestore-create: {toxinidir}/centos7/bluestore/create
+ xenial-filestore-create: {toxinidir}/xenial/filestore/create
+ xenial-bluestore-create: {toxinidir}/xenial/bluestore/create
+ # TODO: these are placeholders for now, eventually we want to
+ # test the prepare/activate workflow of ceph-volume as well
+ xenial-filestore-prepare_activate: {toxinidir}/xenial/filestore/prepare_activate
+ xenial-bluestore-prepare_activate: {toxinidir}/xenial/bluestore/prepare_activate
+ centos7-filestore-prepare_activate: {toxinidir}/xenial/filestore/prepare_activate
+ centos7-bluestore-prepare_activate: {toxinidir}/xenial/bluestore/prepare_activate
+commands=
+ git clone -b {env:CEPH_ANSIBLE_BRANCH:master} --single-branch https://github.com/ceph/ceph-ansible.git {envdir}/tmp/ceph-ansible
+
+ vagrant up --no-provision {posargs:--provider=virtualbox}
+ bash {toxinidir}/../scripts/generate_ssh_config.sh {changedir}
+
+ # create logical volumes to test with on the vms
+ ansible-playbook -vv -i {changedir}/hosts {envdir}/tmp/ceph-ansible/tests/functional/lvm_setup.yml
+
+ # ad-hoc/local test setup for lvm
+ ansible-playbook -vv -i {changedir}/hosts {changedir}/setup.yml
+
+ # use ceph-ansible to deploy a ceph cluster on the vms
+ ansible-playbook -vv -i {changedir}/hosts {envdir}/tmp/ceph-ansible/site.yml.sample --extra-vars "fetch_directory={changedir}/fetch ceph_dev_branch={env:CEPH_DEV_BRANCH:master} ceph_dev_sha1={env:CEPH_DEV_SHA1:latest}"
+
+ # prepare nodes for testing with testinfra
+ ansible-playbook -vv -i {changedir}/hosts {envdir}/tmp/ceph-ansible/tests/functional/setup.yml
+
+ # test cluster state using ceph-ansible tests
+ testinfra -n 4 --sudo -v --connection=ansible --ansible-inventory={changedir}/hosts {envdir}/tmp/ceph-ansible/tests/functional/tests
+
+ # reboot all vms
+ vagrant reload --no-provision
+
+ # retest to ensure cluster came back up correctly after rebooting
+ testinfra -n 4 --sudo -v --connection=ansible --ansible-inventory={changedir}/hosts {envdir}/tmp/ceph-ansible/tests/functional/tests
+
+ vagrant destroy --force
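
Each environment name encodes distro, objectstore, and workflow, so a single
scenario can be run locally with e.g. ``tox -e centos7-bluestore-create``; as
the commands above show, VirtualBox is the default Vagrant provider unless
overridden through posargs.
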
--- /dev/null
+../../../../Vagrantfile
\ No newline at end of file
--- /dev/null
+---
+
+ceph_dev: True
+cluster: ceph
+public_network: "192.168.3.0/24"
+cluster_network: "192.168.4.0/24"
+monitor_interface: eth1
+journal_size: 100
+osd_objectstore: "bluestore"
+osd_scenario: lvm
+ceph_origin: 'repository'
+ceph_repository: 'dev'
+copy_admin_key: true
+lvm_volumes:
+ - data: data-lv1
+ data_vg: test_group
+ - data: data-lv2
+ data_vg: test_group
+ db: journal1
+ db_vg: journals
+ - data: /dev/sdd1
+os_tuning_params:
+ - { name: kernel.pid_max, value: 4194303 }
+ - { name: fs.file-max, value: 26234859 }
+ceph_conf_overrides:
+ global:
+ osd_pool_default_pg_num: 8
+ osd_pool_default_size: 1
--- /dev/null
+[mons]
+mon0
+
+[osds]
+osd0
--- /dev/null
+../../../playbooks/setup_partitions.yml
\ No newline at end of file
--- /dev/null
+---
+
+# DEFINE THE NUMBER OF VMS TO RUN
+mon_vms: 1
+osd_vms: 1
+mds_vms: 0
+rgw_vms: 0
+nfs_vms: 0
+rbd_mirror_vms: 0
+client_vms: 0
+iscsi_gw_vms: 0
+mgr_vms: 0
+
+# SUBNETS TO USE FOR THE VMS
+public_subnet: 192.168.3
+cluster_subnet: 192.168.4
+
+# MEMORY
+# set 1024 for CentOS
+memory: 512
+
+# Ethernet interface name
+# use eth1 for libvirt and ubuntu precise, enp0s8 for CentOS and ubuntu xenial
+eth: 'eth1'
+
+
+# VAGRANT BOX
+# Ceph boxes are *strongly* suggested. They are under better control and will
+# not get updated frequently unless required for build systems. These are (for
+# now):
+#
+# * ceph/ubuntu-xenial
+#
+# Ubuntu: ceph/ubuntu-xenial bento/ubuntu-16.04 or ubuntu/trusty64 or ubuntu/wily64
+# CentOS: bento/centos-7.1 or puppetlabs/centos-7.0-64-puppet
+# libvirt CentOS: centos/7
+# parallels Ubuntu: parallels/ubuntu-14.04
+# Debian: deb/jessie-amd64 - be careful the storage controller is named 'SATA Controller'
+# For more boxes have a look at:
+# - https://atlas.hashicorp.com/boxes/search?utf8=✓&sort=&provider=virtualbox&q=
+# - https://download.gluster.org/pub/gluster/purpleidea/vagrant/
+vagrant_box: ceph/ubuntu-xenial
+#ssh_private_key_path: "~/.ssh/id_rsa"
+# The sync directory changes based on vagrant box
+# Set to /home/vagrant/sync for Centos/7, /home/{ user }/vagrant for openstack and defaults to /vagrant
+#vagrant_sync_dir: /home/vagrant/sync
+#vagrant_sync_dir: /
+# Disables synced folder creation. Not needed for testing, will skip mounting
+# the vagrant directory on the remote box regardless of the provider.
+vagrant_disable_synced_folder: true
+# VAGRANT URL
+# This is a URL to download an image from an alternate location. vagrant_box
+# above should be set to the filename of the image.
+# Fedora virtualbox: https://download.fedoraproject.org/pub/fedora/linux/releases/22/Cloud/x86_64/Images/Fedora-Cloud-Base-Vagrant-22-20150521.x86_64.vagrant-virtualbox.box
+# Fedora libvirt: https://download.fedoraproject.org/pub/fedora/linux/releases/22/Cloud/x86_64/Images/Fedora-Cloud-Base-Vagrant-22-20150521.x86_64.vagrant-libvirt.box
+# vagrant_box_url: https://download.fedoraproject.org/pub/fedora/linux/releases/22/Cloud/x86_64/Images/Fedora-Cloud-Base-Vagrant-22-20150521.x86_64.vagrant-virtualbox.box
--- /dev/null
+../../../../Vagrantfile
\ No newline at end of file
--- /dev/null
+---
+
+ceph_dev: True
+cluster: ceph
+public_network: "192.168.3.0/24"
+cluster_network: "192.168.4.0/24"
+monitor_interface: eth1
+journal_size: 100
+osd_objectstore: "filestore"
+osd_scenario: lvm
+ceph_origin: 'repository'
+ceph_repository: 'dev'
+copy_admin_key: true
+# test-volume is created by tests/functional/lvm_setup.yml from /dev/sda
+lvm_volumes:
+ - data: data-lv1
+ journal: /dev/sdc1
+ data_vg: test_group
+ - data: data-lv2
+ journal: journal1
+ data_vg: test_group
+ journal_vg: journals
+ - data: /dev/sdd1
+ journal: /dev/sdd2
+os_tuning_params:
+ - { name: kernel.pid_max, value: 4194303 }
+ - { name: fs.file-max, value: 26234859 }
+ceph_conf_overrides:
+ global:
+ osd_pool_default_pg_num: 8
+ osd_pool_default_size: 1
--- /dev/null
+[mons]
+mon0
+
+[osds]
+osd0
--- /dev/null
+../../../playbooks/setup_partitions.yml
\ No newline at end of file
--- /dev/null
+---
+# DEFINE THE NUMBER OF VMS TO RUN
+mon_vms: 1
+osd_vms: 1
+mds_vms: 0
+rgw_vms: 0
+nfs_vms: 0
+rbd_mirror_vms: 0
+client_vms: 0
+iscsi_gw_vms: 0
+mgr_vms: 0
+
+# SUBNETS TO USE FOR THE VMS
+public_subnet: 192.168.3
+cluster_subnet: 192.168.4
+
+# MEMORY
+# set 1024 for CentOS
+memory: 512
+
+# Ethernet interface name
+# use eth1 for libvirt and ubuntu precise, enp0s8 for CentOS and ubuntu xenial
+eth: 'eth1'
+
+# VAGRANT BOX
+# Ceph boxes are *strongly* suggested. They are under better control and will
+# not get updated frequently unless required for build systems. These are (for
+# now):
+#
+# * ceph/ubuntu-xenial
+#
+# Ubuntu: ceph/ubuntu-xenial bento/ubuntu-16.04 or ubuntu/trusty64 or ubuntu/wily64
+# CentOS: bento/centos-7.1 or puppetlabs/centos-7.0-64-puppet
+# libvirt CentOS: centos/7
+# parallels Ubuntu: parallels/ubuntu-14.04
+# Debian: deb/jessie-amd64 - be careful the storage controller is named 'SATA Controller'
+# For more boxes have a look at:
+# - https://atlas.hashicorp.com/boxes/search?utf8=✓&sort=&provider=virtualbox&q=
+# - https://download.gluster.org/pub/gluster/purpleidea/vagrant/
+vagrant_box: ceph/ubuntu-xenial
+#ssh_private_key_path: "~/.ssh/id_rsa"
+# The sync directory changes based on vagrant box
+# Set to /home/vagrant/sync for Centos/7, /home/{ user }/vagrant for openstack and defaults to /vagrant
+#vagrant_sync_dir: /home/vagrant/sync
+#vagrant_sync_dir: /
+# Disables synced folder creation. Not needed for testing, will skip mounting
+# the vagrant directory on the remote box regardless of the provider.
+vagrant_disable_synced_folder: true
+# VAGRANT URL
+# This is a URL to download an image from an alternate location. vagrant_box
+# above should be set to the filename of the image.
+# Fedora virtualbox: https://download.fedoraproject.org/pub/fedora/linux/releases/22/Cloud/x86_64/Images/Fedora-Cloud-Base-Vagrant-22-20150521.x86_64.vagrant-virtualbox.box
+# Fedora libvirt: https://download.fedoraproject.org/pub/fedora/linux/releases/22/Cloud/x86_64/Images/Fedora-Cloud-Base-Vagrant-22-20150521.x86_64.vagrant-libvirt.box
+# vagrant_box_url: https://download.fedoraproject.org/pub/fedora/linux/releases/22/Cloud/x86_64/Images/Fedora-Cloud-Base-Vagrant-22-20150521.x86_64.vagrant-virtualbox.box
--- /dev/null
+../../../../Vagrantfile
\ No newline at end of file
--- /dev/null
+---
+
+ceph_dev: True
+cluster: test
+public_network: "192.168.1.0/24"
+cluster_network: "192.168.2.0/24"
+monitor_interface: eth1
+journal_size: 100
+osd_objectstore: "bluestore"
+ceph_origin: 'repository'
+ceph_repository: 'dev'
+copy_admin_key: true
+os_tuning_params:
+ - { name: kernel.pid_max, value: 4194303 }
+ - { name: fs.file-max, value: 26234859 }
+ceph_conf_overrides:
+ global:
+ osd_pool_default_pg_num: 8
+ osd_pool_default_size: 1
--- /dev/null
+---
+
+devices:
+ - '/dev/sdb'
+dedicated_devices:
+ - '/dev/sdc'
+osd_scenario: "non-collocated"
--- /dev/null
+---
+
+devices:
+ - '/dev/sdb'
+ - '/dev/sdc'
+osd_scenario: "collocated"
--- /dev/null
+[mons]
+mon0 monitor_interface=eth1
+
+[osds]
+osd0
+osd1
+
+[mgrs]
+mon0
--- /dev/null
+---
+
+- hosts: osds
+ become: yes
+ tasks:
+
+ - name: list all OSD directories
+ find:
+ paths: /var/lib/ceph/osd
+ file_type: directory
+ register: osd_paths
+
+ - name: scan all OSD directories
+ command: "ceph-volume --cluster={{ cluster }} simple scan {{ item.path }}"
+ environment:
+ CEPH_VOLUME_DEBUG: 1
+ with_items:
+ - "{{ osd_paths.files }}"
+
+ - name: list all OSD JSON files
+ find:
+ paths: /etc/ceph/osd
+ file_type: file
+ register: osd_configs
+
+ - name: activate all scanned OSDs
+ command: "ceph-volume --cluster={{ cluster }} simple activate --file {{ item.path }}"
+ environment:
+ CEPH_VOLUME_DEBUG: 1
+ with_items:
+ - "{{ osd_configs.files }}"
--- /dev/null
+---
+
+# DEPLOY CONTAINERIZED DAEMONS
+docker: false
+
+# DEFINE THE NUMBER OF VMS TO RUN
+mon_vms: 1
+osd_vms: 2
+mds_vms: 0
+rgw_vms: 0
+nfs_vms: 0
+rbd_mirror_vms: 0
+client_vms: 0
+iscsi_gw_vms: 0
+mgr_vms: 0
+
+
+# INSTALL SOURCE OF CEPH
+# valid values are 'stable' and 'dev'
+ceph_install_source: stable
+
+# SUBNETS TO USE FOR THE VMS
+public_subnet: 192.168.1
+cluster_subnet: 192.168.2
+
+# MEMORY
+# set 1024 for CentOS
+memory: 512
+
+# Ethernet interface name
+# use eth1 for libvirt and ubuntu precise, enp0s8 for CentOS and ubuntu xenial
+eth: 'eth1'
+
+# Disks
+# For libvirt use disks: "[ '/dev/vdb', '/dev/vdc' ]"
+# For CentOS7 use disks: "[ '/dev/sda', '/dev/sdb' ]"
+disks: "[ '/dev/sdb', '/dev/sdc' ]"
+
+# VAGRANT BOX
+# Ceph boxes are *strongly* suggested. They are under better control and will
+# not get updated frequently unless required for build systems. These are (for
+# now):
+#
+# * ceph/ubuntu-xenial
+#
+# Ubuntu: ceph/ubuntu-xenial bento/ubuntu-16.04 or ubuntu/trusty64 or ubuntu/wily64
+# CentOS: bento/centos-7.1 or puppetlabs/centos-7.0-64-puppet
+# libvirt CentOS: centos/7
+# parallels Ubuntu: parallels/ubuntu-14.04
+# Debian: deb/jessie-amd64 - be careful the storage controller is named 'SATA Controller'
+# For more boxes have a look at:
+# - https://atlas.hashicorp.com/boxes/search?utf8=✓&sort=&provider=virtualbox&q=
+# - https://download.gluster.org/pub/gluster/purpleidea/vagrant/
+vagrant_box: centos/7
+#ssh_private_key_path: "~/.ssh/id_rsa"
+# The sync directory changes based on vagrant box
+# Set to /home/vagrant/sync for Centos/7, /home/{ user }/vagrant for openstack and defaults to /vagrant
+#vagrant_sync_dir: /home/vagrant/sync
+#vagrant_sync_dir: /
+# Disables synced folder creation. Not needed for testing, will skip mounting
+# the vagrant directory on the remote box regardless of the provider.
+vagrant_disable_synced_folder: true
+# VAGRANT URL
+# This is a URL to download an image from an alternate location. vagrant_box
+# above should be set to the filename of the image.
+# Fedora virtualbox: https://download.fedoraproject.org/pub/fedora/linux/releases/22/Cloud/x86_64/Images/Fedora-Cloud-Base-Vagrant-22-20150521.x86_64.vagrant-virtualbox.box
+# Fedora libvirt: https://download.fedoraproject.org/pub/fedora/linux/releases/22/Cloud/x86_64/Images/Fedora-Cloud-Base-Vagrant-22-20150521.x86_64.vagrant-libvirt.box
+# vagrant_box_url: https://download.fedoraproject.org/pub/fedora/linux/releases/22/Cloud/x86_64/Images/Fedora-Cloud-Base-Vagrant-22-20150521.x86_64.vagrant-virtualbox.box
+
+os_tuning_params:
+ - { name: kernel.pid_max, value: 4194303 }
+ - { name: fs.file-max, value: 26234859 }
+
--- /dev/null
+../../../../Vagrantfile
\ No newline at end of file
--- /dev/null
+---
+
+ceph_dev: True
+cluster: test
+public_network: "192.168.1.0/24"
+cluster_network: "192.168.2.0/24"
+monitor_interface: eth1
+journal_size: 100
+osd_objectstore: "filestore"
+ceph_origin: 'repository'
+ceph_repository: 'dev'
+copy_admin_key: true
+os_tuning_params:
+ - { name: kernel.pid_max, value: 4194303 }
+ - { name: fs.file-max, value: 26234859 }
+ceph_conf_overrides:
+ global:
+ osd_pool_default_pg_num: 8
+ osd_pool_default_size: 1
--- /dev/null
+---
+
+devices:
+ - '/dev/sdb'
+dedicated_devices:
+ - '/dev/sdc'
+osd_scenario: "non-collocated"
--- /dev/null
+---
+
+devices:
+ - '/dev/sdb'
+ - '/dev/sdc'
+osd_scenario: "collocated"
--- /dev/null
+[mons]
+mon0 monitor_interface=eth1
+
+[osds]
+osd0
+osd1
+
+[mgrs]
+mon0
--- /dev/null
+---
+
+- hosts: osds
+ become: yes
+ tasks:
+
+ - name: list all OSD directories
+ find:
+ paths: /var/lib/ceph/osd
+ file_type: directory
+ register: osd_paths
+
+ - name: scan all OSD directories
+ command: "ceph-volume --cluster={{ cluster }} simple scan {{ item.path }}"
+ environment:
+ CEPH_VOLUME_DEBUG: 1
+ with_items:
+ - "{{ osd_paths.files }}"
+
+ - name: list all OSD JSON files
+ find:
+ paths: /etc/ceph/osd
+ file_type: file
+ register: osd_configs
+
+ - name: activate all scanned OSDs
+ command: "ceph-volume --cluster={{ cluster }} simple activate --file {{ item.path }}"
+ environment:
+ CEPH_VOLUME_DEBUG: 1
+ with_items:
+ - "{{ osd_configs.files }}"
--- /dev/null
+---
+
+# DEPLOY CONTAINERIZED DAEMONS
+docker: false
+
+# DEFINE THE NUMBER OF VMS TO RUN
+mon_vms: 1
+osd_vms: 2
+mds_vms: 0
+rgw_vms: 0
+nfs_vms: 0
+rbd_mirror_vms: 0
+client_vms: 0
+iscsi_gw_vms: 0
+mgr_vms: 0
+
+
+# INSTALL SOURCE OF CEPH
+# valid values are 'stable' and 'dev'
+ceph_install_source: stable
+
+# SUBNETS TO USE FOR THE VMS
+public_subnet: 192.168.1
+cluster_subnet: 192.168.2
+
+# MEMORY
+# set 1024 for CentOS
+memory: 512
+
+# Ethernet interface name
+# use eth1 for libvirt and ubuntu precise, enp0s8 for CentOS and ubuntu xenial
+eth: 'eth1'
+
+# Disks
+# For libvirt use disks: "[ '/dev/vdb', '/dev/vdc' ]"
+# For CentOS7 use disks: "[ '/dev/sda', '/dev/sdb' ]"
+disks: "[ '/dev/sdb', '/dev/sdc' ]"
+
+# VAGRANT BOX
+# Ceph boxes are *strongly* suggested. They are under better control and will
+# not get updated frequently unless required for build systems. These are (for
+# now):
+#
+# * ceph/ubuntu-xenial
+#
+# Ubuntu: ceph/ubuntu-xenial bento/ubuntu-16.04 or ubuntu/trusty64 or ubuntu/wily64
+# CentOS: bento/centos-7.1 or puppetlabs/centos-7.0-64-puppet
+# libvirt CentOS: centos/7
+# parallels Ubuntu: parallels/ubuntu-14.04
+# Debian: deb/jessie-amd64 - be careful the storage controller is named 'SATA Controller'
+# For more boxes have a look at:
+# - https://atlas.hashicorp.com/boxes/search?utf8=✓&sort=&provider=virtualbox&q=
+# - https://download.gluster.org/pub/gluster/purpleidea/vagrant/
+vagrant_box: centos/7
+#ssh_private_key_path: "~/.ssh/id_rsa"
+# The sync directory changes based on vagrant box
+# Set to /home/vagrant/sync for Centos/7, /home/{ user }/vagrant for openstack and defaults to /vagrant
+#vagrant_sync_dir: /home/vagrant/sync
+#vagrant_sync_dir: /
+# Disables synced folder creation. Not needed for testing, will skip mounting
+# the vagrant directory on the remote box regardless of the provider.
+vagrant_disable_synced_folder: true
+# VAGRANT URL
+# This is a URL to download an image from an alternate location. vagrant_box
+# above should be set to the filename of the image.
+# Fedora virtualbox: https://download.fedoraproject.org/pub/fedora/linux/releases/22/Cloud/x86_64/Images/Fedora-Cloud-Base-Vagrant-22-20150521.x86_64.vagrant-virtualbox.box
+# Fedora libvirt: https://download.fedoraproject.org/pub/fedora/linux/releases/22/Cloud/x86_64/Images/Fedora-Cloud-Base-Vagrant-22-20150521.x86_64.vagrant-libvirt.box
+# vagrant_box_url: https://download.fedoraproject.org/pub/fedora/linux/releases/22/Cloud/x86_64/Images/Fedora-Cloud-Base-Vagrant-22-20150521.x86_64.vagrant-virtualbox.box
+
+os_tuning_params:
+ - { name: kernel.pid_max, value: 4194303 }
+ - { name: fs.file-max, value: 26234859 }
+
--- /dev/null
+[tox]
+envlist = {centos7,xenial}-{filestore,bluestore}-{activate}
+skipsdist = True
+
+[testenv]
+whitelist_externals =
+ vagrant
+ bash
+ git
+passenv=*
+setenv=
+ ANSIBLE_SSH_ARGS = -F {changedir}/vagrant_ssh_config
+ ANSIBLE_STDOUT_CALLBACK = debug
+ ANSIBLE_RETRY_FILES_ENABLED = False
+ VAGRANT_CWD = {changedir}
+ CEPH_VOLUME_DEBUG = 1
+deps=
+ ansible==2.4.1
+ testinfra==1.7.1
+ pytest-xdist
+changedir=
+ centos7-filestore-activate: {toxinidir}/centos7/filestore/activate
+ centos7-bluestore-activate: {toxinidir}/centos7/bluestore/activate
+ xenial-filestore-activate: {toxinidir}/xenial/filestore/activate
+ xenial-bluestore-activate: {toxinidir}/xenial/bluestore/activate
+commands=
+ git clone -b {env:CEPH_ANSIBLE_BRANCH:master} --single-branch https://github.com/ceph/ceph-ansible.git {envdir}/tmp/ceph-ansible
+
+ vagrant up --no-provision {posargs:--provider=virtualbox}
+ bash {toxinidir}/../scripts/generate_ssh_config.sh {changedir}
+
+ # use ceph-ansible to deploy a ceph cluster on the vms
+ ansible-playbook -vv -i {changedir}/hosts {envdir}/tmp/ceph-ansible/site.yml.sample --extra-vars "fetch_directory={changedir}/fetch ceph_dev_branch={env:CEPH_DEV_BRANCH:master} ceph_dev_sha1={env:CEPH_DEV_SHA1:latest}"
+
+ # prepare nodes for testing with testinfra
+ ansible-playbook -vv -i {changedir}/hosts {envdir}/tmp/ceph-ansible/tests/functional/setup.yml
+
+ # test cluster state using ceph-ansible tests
+ testinfra -n 4 --sudo -v --connection=ansible --ansible-inventory={changedir}/hosts {envdir}/tmp/ceph-ansible/tests/functional/tests
+
+ # make ceph-volume simple take over all the OSDs that got deployed, disabling ceph-disk
+ ansible-playbook -vv -i {changedir}/hosts {changedir}/test.yml
+
+ # reboot all vms
+ vagrant reload --no-provision
+
+ # retest to ensure cluster came back up correctly after rebooting
+ testinfra -n 4 --sudo -v --connection=ansible --ansible-inventory={changedir}/hosts {envdir}/tmp/ceph-ansible/tests/functional/tests
+
+ vagrant destroy --force
--- /dev/null
+../../../../Vagrantfile
\ No newline at end of file
--- /dev/null
+---
+
+ceph_dev: True
+cluster: test
+public_network: "192.168.1.0/24"
+cluster_network: "192.168.2.0/24"
+monitor_interface: eth1
+journal_size: 100
+osd_objectstore: "bluestore"
+ceph_origin: 'repository'
+ceph_repository: 'dev'
+copy_admin_key: true
+os_tuning_params:
+ - { name: kernel.pid_max, value: 4194303 }
+ - { name: fs.file-max, value: 26234859 }
+ceph_conf_overrides:
+ global:
+ osd_pool_default_pg_num: 8
+ osd_pool_default_size: 1
--- /dev/null
+---
+
+devices:
+ - '/dev/sdb'
+dedicated_devices:
+ - '/dev/sdc'
+osd_scenario: "non-collocated"
--- /dev/null
+---
+
+devices:
+ - '/dev/sdb'
+ - '/dev/sdc'
+osd_scenario: "collocated"
--- /dev/null
+[mons]
+mon0 monitor_interface=eth1
+
+[osds]
+osd0
+osd1
+
+[mgrs]
+mon0
--- /dev/null
+---
+
+- hosts: osds
+ become: yes
+ tasks:
+
+ - name: list all OSD directories
+ find:
+ paths: /var/lib/ceph/osd
+ file_type: directory
+ register: osd_paths
+
+ - name: scan all OSD directories
+ command: "ceph-volume --cluster={{ cluster }} simple scan {{ item.path }}"
+ environment:
+ CEPH_VOLUME_DEBUG: 1
+ with_items:
+ - "{{ osd_paths.files }}"
+
+ - name: list all OSD JSON files
+ find:
+ paths: /etc/ceph/osd
+ file_type: file
+ register: osd_configs
+
+ - name: activate all scanned OSDs
+ command: "ceph-volume --cluster={{ cluster }} simple activate --file {{ item.path }}"
+ environment:
+ CEPH_VOLUME_DEBUG: 1
+ with_items:
+ - "{{ osd_configs.files }}"
--- /dev/null
+---
+
+# DEPLOY CONTAINERIZED DAEMONS
+docker: false
+
+# DEFINE THE NUMBER OF VMS TO RUN
+mon_vms: 1
+osd_vms: 2
+mds_vms: 0
+rgw_vms: 0
+nfs_vms: 0
+rbd_mirror_vms: 0
+client_vms: 0
+iscsi_gw_vms: 0
+mgr_vms: 0
+
+
+# INSTALL SOURCE OF CEPH
+# valid values are 'stable' and 'dev'
+ceph_install_source: stable
+
+# SUBNETS TO USE FOR THE VMS
+public_subnet: 192.168.1
+cluster_subnet: 192.168.2
+
+# MEMORY
+# set 1024 for CentOS
+memory: 512
+
+# Ethernet interface name
+# use eth1 for libvirt and ubuntu precise, enp0s8 for CentOS and ubuntu xenial
+eth: 'eth1'
+
+# Disks
+# For libvirt use disks: "[ '/dev/vdb', '/dev/vdc' ]"
+# For CentOS7 use disks: "[ '/dev/sda', '/dev/sdb' ]"
+disks: "[ '/dev/sdb', '/dev/sdc' ]"
+
+# VAGRANT BOX
+# Ceph boxes are *strongly* suggested. They are under better control and will
+# not get updated frequently unless required for build systems. These are (for
+# now):
+#
+# * ceph/ubuntu-xenial
+#
+# Ubuntu: ceph/ubuntu-xenial bento/ubuntu-16.04 or ubuntu/trusty64 or ubuntu/wily64
+# CentOS: bento/centos-7.1 or puppetlabs/centos-7.0-64-puppet
+# libvirt CentOS: centos/7
+# parallels Ubuntu: parallels/ubuntu-14.04
+# Debian: deb/jessie-amd64 - be careful the storage controller is named 'SATA Controller'
+# For more boxes have a look at:
+# - https://atlas.hashicorp.com/boxes/search?utf8=✓&sort=&provider=virtualbox&q=
+# - https://download.gluster.org/pub/gluster/purpleidea/vagrant/
+vagrant_box: ceph/ubuntu-xenial
+#ssh_private_key_path: "~/.ssh/id_rsa"
+# The sync directory changes based on vagrant box
+# Set to /home/vagrant/sync for Centos/7, /home/{ user }/vagrant for openstack and defaults to /vagrant
+#vagrant_sync_dir: /home/vagrant/sync
+#vagrant_sync_dir: /
+# Disables synced folder creation. Not needed for testing, will skip mounting
+# the vagrant directory on the remote box regardless of the provider.
+vagrant_disable_synced_folder: true
+# VAGRANT URL
+# This is a URL to download an image from an alternate location. vagrant_box
+# above should be set to the filename of the image.
+# Fedora virtualbox: https://download.fedoraproject.org/pub/fedora/linux/releases/22/Cloud/x86_64/Images/Fedora-Cloud-Base-Vagrant-22-20150521.x86_64.vagrant-virtualbox.box
+# Fedora libvirt: https://download.fedoraproject.org/pub/fedora/linux/releases/22/Cloud/x86_64/Images/Fedora-Cloud-Base-Vagrant-22-20150521.x86_64.vagrant-libvirt.box
+# vagrant_box_url: https://download.fedoraproject.org/pub/fedora/linux/releases/22/Cloud/x86_64/Images/Fedora-Cloud-Base-Vagrant-22-20150521.x86_64.vagrant-virtualbox.box
+
+os_tuning_params:
+ - { name: kernel.pid_max, value: 4194303 }
+ - { name: fs.file-max, value: 26234859 }
+
--- /dev/null
+../../../../Vagrantfile
\ No newline at end of file
--- /dev/null
+---
+
+ceph_dev: True
+cluster: test
+public_network: "192.168.1.0/24"
+cluster_network: "192.168.2.0/24"
+monitor_interface: eth1
+journal_size: 100
+osd_objectstore: "filestore"
+ceph_origin: 'repository'
+ceph_repository: 'dev'
+copy_admin_key: true
+os_tuning_params:
+ - { name: kernel.pid_max, value: 4194303 }
+ - { name: fs.file-max, value: 26234859 }
+ceph_conf_overrides:
+ global:
+ osd_pool_default_pg_num: 8
+ osd_pool_default_size: 1
--- /dev/null
+---
+
+devices:
+ - '/dev/sdb'
+dedicated_devices:
+ - '/dev/sdc'
+osd_scenario: "non-collocated"
--- /dev/null
+---
+
+devices:
+ - '/dev/sdb'
+ - '/dev/sdc'
+osd_scenario: "collocated"
--- /dev/null
+[mons]
+mon0 monitor_interface=eth1
+
+[osds]
+osd0
+osd1
+
+[mgrs]
+mon0
--- /dev/null
+---
+
+- hosts: osds
+ become: yes
+ tasks:
+
+ - name: list all OSD directories
+ find:
+ paths: /var/lib/ceph/osd
+ file_type: directory
+ register: osd_paths
+
+ - name: scan all OSD directories
+ command: "ceph-volume --cluster={{ cluster }} simple scan {{ item.path }}"
+ environment:
+ CEPH_VOLUME_DEBUG: 1
+ with_items:
+ - "{{ osd_paths.files }}"
+
+ - name: list all OSD JSON files
+ find:
+ paths: /etc/ceph/osd
+ file_type: file
+ register: osd_configs
+
+ - name: activate all scanned OSDs
+ command: "ceph-volume --cluster={{ cluster }} simple activate --file {{ item.path }}"
+ environment:
+ CEPH_VOLUME_DEBUG: 1
+ with_items:
+ - "{{ osd_configs.files }}"
--- /dev/null
+---
+
+# DEPLOY CONTAINERIZED DAEMONS
+docker: false
+
+# DEFINE THE NUMBER OF VMS TO RUN
+mon_vms: 1
+osd_vms: 2
+mds_vms: 0
+rgw_vms: 0
+nfs_vms: 0
+rbd_mirror_vms: 0
+client_vms: 0
+iscsi_gw_vms: 0
+mgr_vms: 0
+
+
+# INSTALL SOURCE OF CEPH
+# valid values are 'stable' and 'dev'
+ceph_install_source: stable
+
+# SUBNETS TO USE FOR THE VMS
+public_subnet: 192.168.1
+cluster_subnet: 192.168.2
+
+# MEMORY
+# set 1024 for CentOS
+memory: 512
+
+# Ethernet interface name
+# use eth1 for libvirt and ubuntu precise, enp0s8 for CentOS and ubuntu xenial
+eth: 'eth1'
+
+# Disks
+# For libvirt use disks: "[ '/dev/vdb', '/dev/vdc' ]"
+# For CentOS7 use disks: "[ '/dev/sda', '/dev/sdb' ]"
+disks: "[ '/dev/sdb', '/dev/sdc' ]"
+
+# VAGRANT BOX
+# Ceph boxes are *strongly* suggested. They are under better control and will
+# not get updated frequently unless required for build systems. These are (for
+# now):
+#
+# * ceph/ubuntu-xenial
+#
+# Ubuntu: ceph/ubuntu-xenial bento/ubuntu-16.04 or ubuntu/trusty64 or ubuntu/wily64
+# CentOS: bento/centos-7.1 or puppetlabs/centos-7.0-64-puppet
+# libvirt CentOS: centos/7
+# parallels Ubuntu: parallels/ubuntu-14.04
+# Debian: deb/jessie-amd64 - be careful the storage controller is named 'SATA Controller'
+# For more boxes have a look at:
+# - https://atlas.hashicorp.com/boxes/search?utf8=✓&sort=&provider=virtualbox&q=
+# - https://download.gluster.org/pub/gluster/purpleidea/vagrant/
+vagrant_box: ceph/ubuntu-xenial
+#ssh_private_key_path: "~/.ssh/id_rsa"
+# The sync directory changes based on vagrant box
+# Set to /home/vagrant/sync for Centos/7, /home/{ user }/vagrant for openstack and defaults to /vagrant
+#vagrant_sync_dir: /home/vagrant/sync
+#vagrant_sync_dir: /
+# Disables synced folder creation. Not needed for testing, will skip mounting
+# the vagrant directory on the remote box regardless of the provider.
+vagrant_disable_synced_folder: true
+# VAGRANT URL
+# This is a URL to download an image from an alternate location. vagrant_box
+# above should be set to the filename of the image.
+# Fedora virtualbox: https://download.fedoraproject.org/pub/fedora/linux/releases/22/Cloud/x86_64/Images/Fedora-Cloud-Base-Vagrant-22-20150521.x86_64.vagrant-virtualbox.box
+# Fedora libvirt: https://download.fedoraproject.org/pub/fedora/linux/releases/22/Cloud/x86_64/Images/Fedora-Cloud-Base-Vagrant-22-20150521.x86_64.vagrant-libvirt.box
+# vagrant_box_url: https://download.fedoraproject.org/pub/fedora/linux/releases/22/Cloud/x86_64/Images/Fedora-Cloud-Base-Vagrant-22-20150521.x86_64.vagrant-virtualbox.box
+
+os_tuning_params:
+ - { name: kernel.pid_max, value: 4194303 }
+ - { name: fs.file-max, value: 26234859 }
+
+++ /dev/null
-[tox]
-envlist = {centos7,xenial}-{create,prepare_activate}
-skipsdist = True
-
-[testenv]
-whitelist_externals =
- vagrant
- bash
- git
-passenv=*
-setenv=
- ANSIBLE_SSH_ARGS = -F {changedir}/vagrant_ssh_config
- ANSIBLE_STDOUT_CALLBACK = debug
- ANSIBLE_RETRY_FILES_ENABLED = False
- VAGRANT_CWD = {changedir}
- CEPH_VOLUME_DEBUG = 1
-deps=
- ansible==2.3.1
- testinfra==1.6.0
- pytest-xdist
-changedir=
- centos7-create: {toxinidir}/centos7/create
- xenial-create: {toxinidir}/xenial/create
- # TODO: these are placeholders for now, eventually we want to
- # test the prepare/activate workflow of ceph-volume as well
- xenial-prepare_activate: {toxinidir}/xenial/prepare_activate
- centos7-prepare_activate: {toxinidir}/xenial/prepare_activate
-commands=
- git clone -b {env:CEPH_ANSIBLE_BRANCH:master} --single-branch https://github.com/ceph/ceph-ansible.git {envdir}/tmp/ceph-ansible
-
- vagrant up --no-provision {posargs:--provider=virtualbox}
- bash {toxinidir}/scripts/generate_ssh_config.sh {changedir}
-
- # create logical volumes to test with on the vms
- ansible-playbook -vv -i {changedir}/hosts {envdir}/tmp/ceph-ansible/tests/functional/lvm_setup.yml
-
- # use ceph-ansible to deploy a ceph cluster on the vms
- ansible-playbook -vv -i {changedir}/hosts {envdir}/tmp/ceph-ansible/site.yml.sample --extra-vars "fetch_directory={changedir}/fetch ceph_dev_branch={env:CEPH_DEV_BRANCH:master} ceph_dev_sha1={env:CEPH_DEV_SHA1:latest}"
-
- # prepare nodes for testing with testinfra
- ansible-playbook -vv -i {changedir}/hosts {envdir}/tmp/ceph-ansible/tests/functional/setup.yml
-
- # test cluster state using ceph-ansible tests
- testinfra -n 4 --sudo -v --connection=ansible --ansible-inventory={changedir}/hosts {envdir}/tmp/ceph-ansible/tests/functional/tests
-
- # reboot all vms
- vagrant reload --no-provision
-
- # retest to ensure cluster came back up correctly after rebooting
- testinfra -n 4 --sudo -v --connection=ansible --ansible-inventory={changedir}/hosts {envdir}/tmp/ceph-ansible/tests/functional/tests
-
- vagrant destroy --force
+++ /dev/null
-../../Vagrantfile
\ No newline at end of file
+++ /dev/null
----
-
-ceph_dev: True
-cluster: ceph
-public_network: "192.168.3.0/24"
-cluster_network: "192.168.4.0/24"
-monitor_interface: eth1
-journal_size: 100
-osd_objectstore: "filestore"
-osd_scenario: lvm
-ceph_origin: 'repository'
-ceph_repository: 'dev'
-copy_admin_key: true
-# test-volume is created by tests/functional/lvm_setup.yml from /dev/sda
-lvm_volumes:
- - data: data-lv1
- journal: /dev/sdc1
- data_vg: test_group
- - data: data-lv2
- journal: journal1
- data_vg: test_group
- journal_vg: journals
-os_tuning_params:
- - { name: kernel.pid_max, value: 4194303 }
- - { name: fs.file-max, value: 26234859 }
-ceph_conf_overrides:
- global:
- osd_pool_default_pg_num: 8
- osd_pool_default_size: 1
+++ /dev/null
-[mons]
-mon0
-
-[osds]
-osd0
+++ /dev/null
----
-# DEFINE THE NUMBER OF VMS TO RUN
-mon_vms: 1
-osd_vms: 1
-mds_vms: 0
-rgw_vms: 0
-nfs_vms: 0
-rbd_mirror_vms: 0
-client_vms: 0
-iscsi_gw_vms: 0
-mgr_vms: 0
-
-# SUBNETS TO USE FOR THE VMS
-public_subnet: 192.168.3
-cluster_subnet: 192.168.4
-
-# MEMORY
-# set 1024 for CentOS
-memory: 512
-
-# Ethernet interface name
-# use eth1 for libvirt and ubuntu precise, enp0s8 for CentOS and ubuntu xenial
-eth: 'eth1'
-
-# VAGRANT BOX
-# Ceph boxes are *strongly* suggested. They are under better control and will
-# not get updated frequently unless required for build systems. These are (for
-# now):
-#
-# * ceph/ubuntu-xenial
-#
-# Ubuntu: ceph/ubuntu-xenial bento/ubuntu-16.04 or ubuntu/trusty64 or ubuntu/wily64
-# CentOS: bento/centos-7.1 or puppetlabs/centos-7.0-64-puppet
-# libvirt CentOS: centos/7
-# parallels Ubuntu: parallels/ubuntu-14.04
-# Debian: deb/jessie-amd64 - be careful the storage controller is named 'SATA Controller'
-# For more boxes have a look at:
-# - https://atlas.hashicorp.com/boxes/search?utf8=✓&sort=&provider=virtualbox&q=
-# - https://download.gluster.org/pub/gluster/purpleidea/vagrant/
-vagrant_box: ceph/ubuntu-xenial
-#ssh_private_key_path: "~/.ssh/id_rsa"
-# The sync directory changes based on vagrant box
-# Set to /home/vagrant/sync for Centos/7, /home/{ user }/vagrant for openstack and defaults to /vagrant
-#vagrant_sync_dir: /home/vagrant/sync
-#vagrant_sync_dir: /
-# Disables synced folder creation. Not needed for testing, will skip mounting
-# the vagrant directory on the remote box regardless of the provider.
-vagrant_disable_synced_folder: true
-# VAGRANT URL
-# This is a URL to download an image from an alternate location. vagrant_box
-# above should be set to the filename of the image.
-# Fedora virtualbox: https://download.fedoraproject.org/pub/fedora/linux/releases/22/Cloud/x86_64/Images/Fedora-Cloud-Base-Vagrant-22-20150521.x86_64.vagrant-virtualbox.box
-# Fedora libvirt: https://download.fedoraproject.org/pub/fedora/linux/releases/22/Cloud/x86_64/Images/Fedora-Cloud-Base-Vagrant-22-20150521.x86_64.vagrant-libvirt.box
-# vagrant_box_url: https://download.fedoraproject.org/pub/fedora/linux/releases/22/Cloud/x86_64/Images/Fedora-Cloud-Base-Vagrant-22-20150521.x86_64.vagrant-virtualbox.box
import pytest
import argparse
+from ceph_volume import exceptions
from ceph_volume.util import arg_validators
invalid_lv_paths = [
- '', 'lv_name', '///', '/lv_name', 'lv_name/',
+ '', 'lv_name', '/lv_name', 'lv_name/',
'/dev/lv_group/lv_name'
]
def test_is_valid(self):
path = 'vg/lv'
assert self.validator(path) == path
+
+ def test_abspath_is_valid(self):
+ path = '/'
+ assert self.validator(path) == path
+
+
+class TestOSDPath(object):
+
+ def setup(self):
+ self.validator = arg_validators.OSDPath()
+
+ def test_is_not_root(self):
+ with pytest.raises(exceptions.SuperUserError):
+ self.validator('')
+
+ def test_path_is_not_a_directory(self, is_root, tmpfile, monkeypatch):
+ monkeypatch.setattr(arg_validators.disk, 'is_partition', lambda x: False)
+ validator = arg_validators.OSDPath()
+ with pytest.raises(argparse.ArgumentError):
+ validator(tmpfile())
+
+ def test_files_are_missing(self, is_root, tmpdir, monkeypatch):
+ tmppath = str(tmpdir)
+ monkeypatch.setattr(arg_validators.disk, 'is_partition', lambda x: False)
+ validator = arg_validators.OSDPath()
+ with pytest.raises(argparse.ArgumentError) as error:
+ validator(tmppath)
+ assert 'Required file (ceph_fsid) was not found in OSD' in str(error)
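
The ``OSDPath`` tests outline the validator contract: superuser privileges are
required, the argument must be a partition or an existing directory, and an OSD
directory must contain its metadata files. An illustrative shape only; the
exception signatures and messages are assumptions, not the exact
``arg_validators`` code::

    import argparse
    import os

    from ceph_volume import exceptions
    from ceph_volume.util import disk

    class OSDPath(object):

        def __call__(self, string):
            if os.getuid() != 0:
                # assumed to carry its own default message
                raise exceptions.SuperUserError()
            if disk.is_partition(string):
                return string
            if not os.path.isdir(string):
                raise argparse.ArgumentError(
                    None, '%s is not a directory or partition' % string)
            # an OSD directory must ship these metadata files
            for required in ('ceph_fsid', 'fsid', 'keyring'):
                if not os.path.exists(os.path.join(string, required)):
                    raise argparse.ArgumentError(
                        None, 'Required file (%s) was not found in OSD dir %s'
                        % (required, string))
            return string
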
import os
import pwd
import getpass
+import pytest
from textwrap import dedent
from ceph_volume.util import system
assert os.path.isdir(path)
-class TestIsMounted(object):
+@pytest.fixture
+def fake_proc(tmpdir, monkeypatch):
+ PROCDIR = str(tmpdir)
+ proc_path = os.path.join(PROCDIR, 'mounts')
+ with open(proc_path, 'w') as f:
+ f.write(dedent("""nfsd /proc/fs/nfsd nfsd rw,relatime 0 0
+ rootfs / rootfs rw 0 0
+ sysfs /sys sysfs rw,seclabel,nosuid,nodev,noexec,relatime 0 0
+ proc /proc proc rw,nosuid,nodev,noexec,relatime 0 0
+ devtmpfs /dev devtmpfs rw,seclabel,nosuid,size=238292k,nr_inodes=59573,mode=755 0 0
+ securityfs /sys/kernel/security securityfs rw,nosuid,nodev,noexec,relatime 0 0
+ tmpfs /dev/shm tmpfs rw,seclabel,nosuid,nodev 0 0
+ devpts /dev/pts devpts rw,seclabel,nosuid,noexec,relatime,gid=5,mode=620,ptmxmode=000 0 0
+ tmpfs /run tmpfs rw,seclabel,nosuid,nodev,mode=755 0 0
+ tmpfs /sys/fs/cgroup tmpfs ro,seclabel,nosuid,nodev,noexec,mode=755 0 0
+ cgroup /sys/fs/cgroup/systemd cgroup rw,nosuid,nodev,noexec,relatime,xattr,release_agent=/usr/lib/systemd/systemd-cgroups-agent,name=systemd 0 0
+ cgroup /sys/fs/cgroup/freezer cgroup rw,nosuid,nodev,noexec,relatime,freezer 0 0
+ configfs /sys/kernel/config configfs rw,relatime 0 0
+ /dev/mapper/VolGroup00-LogVol00 / xfs rw,seclabel,relatime,attr2,inode64,noquota 0 0
+ selinuxfs /sys/fs/selinux selinuxfs rw,relatime 0 0
+ debugfs /sys/kernel/debug debugfs rw,relatime 0 0
+ hugetlbfs /dev/hugepages hugetlbfs rw,seclabel,relatime 0 0
+ mqueue /dev/mqueue mqueue rw,seclabel,relatime 0 0
+ sunrpc /far/lib/nfs/rpc_pipefs rpc_pipefs rw,relatime 0 0
+ /dev/sde4 /two/field/path
+ nfsd /proc/fs/nfsd nfsd rw,relatime 0 0
+ /dev/sde2 /boot xfs rw,seclabel,relatime,attr2,inode64,noquota 0 0
+ tmpfs /far/lib/ceph/osd/ceph-5 tmpfs rw,seclabel,relatime 0 0
+ tmpfs /far/lib/ceph/osd/ceph-7 tmpfs rw,seclabel,relatime 0 0
+ /dev/sda1 /far/lib/ceph/osd/ceph-0 xfs rw,seclabel,noatime,attr2,inode64,noquota 0 0
+ tmpfs /run/user/1000 tmpfs rw,seclabel,nosuid,nodev,relatime,size=50040k,mode=700,uid=1000,gid=1000 0 0
+ /dev/sdc2 /boot xfs rw,seclabel,relatime,attr2,inode64,noquota 0 0
+ tmpfs /run/user/1000 tmpfs rw,seclabel,mode=700,uid=1000,gid=1000 0 0"""))
+ monkeypatch.setattr(system, 'PROCDIR', PROCDIR)
+ monkeypatch.setattr(os.path, 'exists', lambda x: True)
+
+
+class TestPathIsMounted(object):
+
+ def test_is_mounted(self, fake_proc):
+ assert system.path_is_mounted('/boot') is True
+
+ def test_is_not_mounted(self, fake_proc):
+ assert system.path_is_mounted('/far/fib/feph') is False
+
+ def test_is_not_mounted_at_destination(self, fake_proc):
+ assert system.path_is_mounted('/boot', destination='/dev/sda1') is False
+
+ def test_is_mounted_at_destination(self, fake_proc):
+ assert system.path_is_mounted('/boot', destination='/dev/sdc2') is True
+
+
+class TestDeviceIsMounted(object):
+
+ def test_is_mounted(self, fake_proc):
+ assert system.device_is_mounted('/dev/sda1') is True
+
+ def test_path_is_not_device(self, fake_proc):
+ assert system.device_is_mounted('/far/lib/ceph/osd/ceph-7') is False
+
+ def test_is_not_mounted_at_destination(self, fake_proc):
+ assert system.device_is_mounted('/dev/sda1', destination='/far/lib/ceph/osd/test-1') is False
+
+ def test_is_mounted_at_destination(self, fake_proc):
+        assert system.device_is_mounted('/dev/sda1', destination='/far/lib/ceph/osd/ceph-0') is True
+
+
+class TestGetMounts(object):
def test_not_mounted(self, tmpdir, monkeypatch):
        PROCDIR = str(tmpdir)
        proc_path = os.path.join(PROCDIR, 'mounts')
        with open(proc_path, 'w') as f:
f.write('')
monkeypatch.setattr(system, 'PROCDIR', PROCDIR)
- assert system.is_mounted('sdb') is False
+ assert system.get_mounts() == {}
- def test_is_mounted_(self, tmpdir, monkeypatch):
- PROCDIR = str(tmpdir)
- proc_path = os.path.join(PROCDIR, 'mounts')
- with open(proc_path, 'w') as f:
- f.write(dedent("""nfsd /proc/fs/nfsd nfsd rw,relatime 0 0
- /dev/sdc2 /boot xfs rw,seclabel,relatime,attr2,inode64,noquota 0 0
- tmpfs /run/user/1000 tmpfs rw,seclabel,mode=700,uid=1000,gid=1000 0 0"""))
- monkeypatch.setattr(system, 'PROCDIR', PROCDIR)
- monkeypatch.setattr(os.path, 'exists', lambda x: True)
- assert system.is_mounted('/dev/sdc2') is True
+    def test_is_mounted(self, fake_proc):
+ result = system.get_mounts()
+ assert result['/dev/sdc2'] == ['/boot']
- def test_ignores_two_fields(self, tmpdir, monkeypatch):
- PROCDIR = str(tmpdir)
- proc_path = os.path.join(PROCDIR, 'mounts')
- with open(proc_path, 'w') as f:
- f.write(dedent("""nfsd /proc/fs/nfsd nfsd rw,relatime 0 0
- /dev/sdc2 /boot
- tmpfs /run/user/1000 tmpfs rw,seclabel,mode=700,uid=1000,gid=1000 0 0"""))
- monkeypatch.setattr(system, 'PROCDIR', PROCDIR)
- monkeypatch.setattr(os.path, 'exists', lambda x: True)
- assert system.is_mounted('/dev/sdc2') is False
+ def test_ignores_two_fields(self, fake_proc):
+ result = system.get_mounts()
+ assert result.get('/dev/sde4') is None
- def test_not_mounted_at_destination(self, tmpdir, monkeypatch):
- PROCDIR = str(tmpdir)
- proc_path = os.path.join(PROCDIR, 'mounts')
- with open(proc_path, 'w') as f:
- f.write(dedent("""nfsd /proc/fs/nfsd nfsd rw,relatime 0 0
- /dev/sdc2 /var/lib/ceph/osd/ceph-9 xfs rw,attr2,inode64,noquota 0 0
- tmpfs /run/user/1000 tmpfs rw,seclabel,mode=700,uid=1000,gid=1000 0 0"""))
- monkeypatch.setattr(system, 'PROCDIR', PROCDIR)
- monkeypatch.setattr(os.path, 'exists', lambda x: True)
- assert system.is_mounted('/dev/sdc2', '/var/lib/ceph/osd/ceph-0') is False
+ def test_tmpfs_is_reported(self, fake_proc):
+ result = system.get_mounts()
+ assert result['tmpfs'][0] == '/dev/shm'
+
+ def test_non_skip_devs_arent_reported(self, fake_proc):
+ result = system.get_mounts()
+ assert result.get('cgroup') is None
+
+ def test_multiple_mounts_are_appended(self, fake_proc):
+ result = system.get_mounts()
+ assert len(result['tmpfs']) == 7
- def test_is_mounted_at_destination(self, tmpdir, monkeypatch):
+ def test_nonexistent_devices_are_skipped(self, tmpdir, monkeypatch):
PROCDIR = str(tmpdir)
proc_path = os.path.join(PROCDIR, 'mounts')
with open(proc_path, 'w') as f:
f.write(dedent("""nfsd /proc/fs/nfsd nfsd rw,relatime 0 0
- /dev/sdc2 /var/lib/ceph/osd/ceph-0 xfs rw,attr2,inode64,noquota 0 0
- tmpfs /run/user/1000 tmpfs rw,seclabel,mode=700,uid=1000,gid=1000 0 0"""))
+ /dev/sda1 /far/lib/ceph/osd/ceph-0 xfs rw,attr2,inode64,noquota 0 0
+ /dev/sda2 /far/lib/ceph/osd/ceph-1 xfs rw,attr2,inode64,noquota 0 0"""))
monkeypatch.setattr(system, 'PROCDIR', PROCDIR)
- monkeypatch.setattr(os.path, 'exists', lambda x: True)
- assert system.is_mounted('/dev/sdc2', '/var/lib/ceph/osd/ceph-0') is True
+ monkeypatch.setattr(os.path, 'exists', lambda x: False if x == '/dev/sda1' else True)
+ result = system.get_mounts()
+ assert result.get('/dev/sda1') is None
+
+
+class TestIsBinary(object):
+
+ def test_is_binary(self, tmpfile):
+ binary_path = tmpfile(contents='asd\n\nlkjh\x00')
+ assert system.is_binary(binary_path)
+
+ def test_is_not_binary(self, tmpfile):
+ binary_path = tmpfile(contents='asd\n\nlkjh0')
+ assert system.is_binary(binary_path) is False
import argparse
+import os
+from ceph_volume import terminal
+from ceph_volume import decorators
+from ceph_volume.util import disk
class LVPath(object):
<vg name>/<lv name>
+ Or a full path to a device, like ``/dev/sda``
+
    Because for LVM it is better to be specific about which volume group
    an lv belongs to.
"""
def __call__(self, string):
error = None
+ if string.startswith('/'):
+ if not os.path.exists(string):
+ error = "Argument (device) does not exist: %s" % string
+ raise argparse.ArgumentError(None, error)
+ else:
+ return string
try:
vg, lv = string.split('/')
except ValueError:
if error:
raise argparse.ArgumentError(None, error)
return string
+
+
+class OSDPath(object):
+ """
+    Validate that the path exists and looks like an OSD directory.
+ """
+
+ @decorators.needs_root
+ def __call__(self, string):
+ if not os.path.exists(string):
+ error = "Path does not exist: %s" % string
+ raise argparse.ArgumentError(None, error)
+
+ arg_is_partition = disk.is_partition(string)
+ if arg_is_partition:
+ return os.path.abspath(string)
+ absolute_path = os.path.abspath(string)
+ if not os.path.isdir(absolute_path):
+ error = "Argument is not a directory or device which is required to scan"
+ raise argparse.ArgumentError(None, error)
+ key_files = ['ceph_fsid', 'fsid', 'keyring', 'ready', 'type', 'whoami']
+ dir_files = os.listdir(absolute_path)
+ for key_file in key_files:
+ if key_file not in dir_files:
+ terminal.error('All following files must exist in path: %s' % ' '.join(key_files))
+ error = "Required file (%s) was not found in OSD dir path: %s" % (
+ key_file,
+ absolute_path
+ )
+ raise argparse.ArgumentError(None, error)
+
+ return os.path.abspath(string)
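For orientation, a minimal sketch of wiring this validator into argparse; the ``--osd-path`` flag is hypothetical and, because of ``decorators.needs_root``, the validator only runs as root::

    import argparse
    from ceph_volume.util import arg_validators

    parser = argparse.ArgumentParser()
    # argparse invokes the callable with the raw string value; an
    # argparse.ArgumentError raised inside it surfaces as a usage error
    parser.add_argument('--osd-path', type=arg_validators.OSDPath())
    args = parser.parse_args(['--osd-path', '/var/lib/ceph/osd/ceph-0'])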
+import os
+import stat
from ceph_volume import process
['sudo', 'blkid', '-t', 'PARTUUID="%s"' % partuuid, '-o', 'device']
)
return ' '.join(out).strip()
+
+
+def _stat_is_device(stat_obj):
+ """
+ Helper function that will interpret ``os.stat`` output directly, so that other
+ functions can call ``os.stat`` once and interpret that result several times
+ """
+ return stat.S_ISBLK(stat_obj)
+
+
+def lsblk(device, columns=None):
+ """
+ Create a dictionary of identifying values for a device using ``lsblk``.
+    Each supported column is a key, in its *raw* format (usually all
+    uppercase). ``lsblk`` has support for certain "columns" (in blkid these
+ would be labels), and these columns vary between distributions and
+ ``lsblk`` versions. The newer versions support a richer set of columns,
+ while older ones were a bit limited.
+
+    These are the default lsblk columns reported, which are safe to use on
+    Ubuntu 14.04.5 LTS:
+
+ NAME device name
+ KNAME internal kernel device name
+ MAJ:MIN major:minor device number
+ FSTYPE filesystem type
+ MOUNTPOINT where the device is mounted
+ LABEL filesystem LABEL
+ UUID filesystem UUID
+ RO read-only device
+ RM removable device
+ MODEL device identifier
+ SIZE size of the device
+ STATE state of the device
+ OWNER user name
+ GROUP group name
+ MODE device node permissions
+ ALIGNMENT alignment offset
+ MIN-IO minimum I/O size
+ OPT-IO optimal I/O size
+ PHY-SEC physical sector size
+ LOG-SEC logical sector size
+ ROTA rotational device
+ SCHED I/O scheduler name
+ RQ-SIZE request queue size
+ TYPE device type
+ DISC-ALN discard alignment offset
+ DISC-GRAN discard granularity
+ DISC-MAX discard max bytes
+ DISC-ZERO discard zeroes data
+
+    There is a bug in ``lsblk`` where using all the available (supported)
+    columns will result in no output (!). To work around this, the following
+    columns have been removed from the default reporting columns:
+
+ * RQ-SIZE (request queue size)
+ * MIN-IO minimum I/O size
+ * OPT-IO optimal I/O size
+
+    These should still be available, however, when using ``columns``. For example::
+
+ >>> lsblk('/dev/sda1', columns=['OPT-IO'])
+ {'OPT-IO': '0'}
+
+    Normal CLI output, as filtered by the flags in this function, will look like::
+
+ $ sudo lsblk --nodeps -P -o NAME,KNAME,MAJ:MIN,FSTYPE,MOUNTPOINT
+ NAME="sda1" KNAME="sda1" MAJ:MIN="8:1" FSTYPE="ext4" MOUNTPOINT="/"
+
+    :param columns: A list of columns to report as keys, in their original form.
+ """
+ default_columns = [
+ 'NAME', 'KNAME', 'MAJ:MIN', 'FSTYPE', 'MOUNTPOINT', 'LABEL', 'UUID',
+ 'RO', 'RM', 'MODEL', 'SIZE', 'STATE', 'OWNER', 'GROUP', 'MODE',
+ 'ALIGNMENT', 'PHY-SEC', 'LOG-SEC', 'ROTA', 'SCHED', 'TYPE', 'DISC-ALN',
+ 'DISC-GRAN', 'DISC-MAX', 'DISC-ZERO'
+ ]
+ device = device.rstrip('/')
+ columns = columns or default_columns
+ # --nodeps -> Avoid adding children/parents to the device, only give information
+ # on the actual device we are querying for
+ # -P -> Produce pairs of COLUMN="value"
+ # -o -> Use the columns specified or default ones provided by this function
+ command = ['sudo', 'lsblk', '--nodeps', '-P', '-o']
+ command.append(','.join(columns))
+ command.append(device)
+ out, err, rc = process.call(command)
+
+ if rc != 0:
+ return {}
+
+ # parse the COLUMN="value" output to construct the dictionary
+ pairs = ' '.join(out).split()
+ parsed = {}
+ for pair in pairs:
+ try:
+ column, value = pair.split('=')
+ except ValueError:
+ continue
+        parsed[column] = value.strip().strip('"')
+ return parsed
+
+
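The COLUMN="value" parsing above can be exercised standalone; a small illustrative sketch (not part of the patch)::

    line = 'NAME="sda1" KNAME="sda1" MAJ:MIN="8:1" FSTYPE="ext4" MOUNTPOINT="/"'
    parsed = {}
    for pair in line.split():
        try:
            column, value = pair.split('=')
        except ValueError:
            continue
        parsed[column] = value.strip().strip('"')
    # parsed == {'NAME': 'sda1', 'KNAME': 'sda1', 'MAJ:MIN': '8:1',
    #            'FSTYPE': 'ext4', 'MOUNTPOINT': '/'}
    # caveat: splitting on whitespace means values that contain spaces
    # (e.g. some MODEL strings) would not survive this parser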
+def _lsblk_type(device):
+ """
+    Helper function that uses the ``PARTUUID`` value reported by ``blkid`` to
+    help determine if a device is a partition or a disk.
+    It does not process the output to return a boolean; it returns the raw
+    string value so that callers can decide.
+ """
+ out, err, rc = process.call(
+ ['sudo', 'blkid', '-s', 'PARTUUID', '-o', 'value', device]
+ )
+ return ' '.join(out).strip()
+
+
+def is_device(dev):
+ """
+ Boolean to determine if a given device is a block device (**not**
+ a partition!)
+
+ For example: /dev/sda would return True, but not /dev/sdc1
+ """
+ if not os.path.exists(dev):
+ return False
+ # use lsblk first, fall back to using stat
+ TYPE = lsblk(dev).get('TYPE')
+ if TYPE:
+ return TYPE == 'disk'
+
+ # fallback to stat
+ return _stat_is_device(os.lstat(dev).st_mode)
+
+
+def is_partition(dev):
+ """
+ Boolean to determine if a given device is a partition, like /dev/sda1
+ """
+ if not os.path.exists(dev):
+ return False
+ # use lsblk first, fall back to using stat
+ TYPE = lsblk(dev).get('TYPE')
+ if TYPE:
+ return TYPE == 'part'
+
+ # fallback to stat
+ stat_obj = os.stat(dev)
+ if _stat_is_device(stat_obj.st_mode):
+ return False
+
+ major = os.major(stat_obj.st_rdev)
+ minor = os.minor(stat_obj.st_rdev)
+ if os.path.exists('/sys/dev/block/%d:%d/partition' % (major, minor)):
+ return True
+ return False
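Taken together, the two predicates classify device nodes; a quick behavioral sketch (device names are examples only)::

    from ceph_volume.util import disk

    disk.is_device('/dev/sda')      # True for a whole disk
    disk.is_device('/dev/sda1')     # False
    disk.is_partition('/dev/sda1')  # True for a partition
    disk.is_partition('/dev/sda')   # False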
return ' '.join(stdout).strip()
-def create_path(osd_id):
+def mount_tmpfs(path):
+ process.run([
+ 'sudo',
+ 'mount',
+ '-t',
+ 'tmpfs', 'tmpfs',
+ path
+ ])
+
+
+def create_osd_path(osd_id, tmpfs=False):
+ path = '/var/lib/ceph/osd/%s-%s' % (conf.cluster, osd_id)
system.mkdir_p('/var/lib/ceph/osd/%s-%s' % (conf.cluster, osd_id))
+ if tmpfs:
+ mount_tmpfs(path)
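A sketch of the new ``tmpfs`` flag (assuming the default 'ceph' cluster name)::

    create_osd_path('0', tmpfs=True)
    # creates /var/lib/ceph/osd/ceph-0 and then mounts a tmpfs on it,
    # as used for bluestore OSD directories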
def format_device(device):
process.run(command)
-def link_journal(journal_device, osd_id):
- journal_path = '/var/lib/ceph/osd/%s-%s/journal' % (
+def _link_device(device, device_type, osd_id):
+ """
+    Allow linking any device type in an OSD directory. ``device`` must be the
+    source, with an absolute path, and ``device_type`` will be the destination
+    name, like 'journal' or 'block'
+ """
+ device_path = '/var/lib/ceph/osd/%s-%s/%s' % (
conf.cluster,
- osd_id
+ osd_id,
+ device_type
)
- command = ['sudo', 'ln', '-s', journal_device, journal_path]
+ command = ['sudo', 'ln', '-s', device, device_path]
+ system.chown(device)
+
process.run(command)
+def link_journal(journal_device, osd_id):
+ _link_device(journal_device, 'journal', osd_id)
+
+
+def link_block(block_device, osd_id):
+ _link_device(block_device, 'block', osd_id)
+
+
+def link_wal(wal_device, osd_id):
+ _link_device(wal_device, 'block.wal', osd_id)
+
+
+def link_db(db_device, osd_id):
+ _link_device(db_device, 'block.db', osd_id)
+
+
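Each of the four helpers above delegates to ``_link_device``; a sketch of the effect (the lv path and 'ceph' cluster name are illustrative)::

    link_db('/dev/ceph-vg/db-lv', '0')
    # chowns the source device, then effectively runs:
    #   sudo ln -s /dev/ceph-vg/db-lv /var/lib/ceph/osd/ceph-0/block.db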
def get_monmap(osd_id):
"""
Before creating the OSD files, a monmap needs to be retrieved so that it
])
-def osd_mkfs(osd_id, fsid):
+def osd_mkfs_bluestore(osd_id, fsid, keyring=None, wal=False, db=False):
+ """
+ Create the files for the OSD to function. A normal call will look like:
+
+ ceph-osd --cluster ceph --mkfs --mkkey -i 0 \
+ --monmap /var/lib/ceph/osd/ceph-0/activate.monmap \
+ --osd-data /var/lib/ceph/osd/ceph-0 \
+ --osd-uuid 8d208665-89ae-4733-8888-5d3bfbeeec6c \
+ --keyring /var/lib/ceph/osd/ceph-0/keyring \
+ --setuser ceph --setgroup ceph
+
+    In some cases it is required to use the keyring. When it is passed in as
+    a keyword argument, it is used as part of the ceph-osd command
+ """
+ path = '/var/lib/ceph/osd/%s-%s/' % (conf.cluster, osd_id)
+ monmap = os.path.join(path, 'activate.monmap')
+
+ system.chown(path)
+
+ base_command = [
+ 'sudo',
+ 'ceph-osd',
+ '--cluster', conf.cluster,
+ # undocumented flag, sets the `type` file to contain 'bluestore'
+ '--osd-objectstore', 'bluestore',
+ '--mkfs',
+ '-i', osd_id,
+ '--monmap', monmap,
+ ]
+
+ supplementary_command = [
+ '--osd-data', path,
+ '--osd-uuid', fsid,
+ '--setuser', 'ceph',
+ '--setgroup', 'ceph'
+ ]
+
+ if keyring is not None:
+ base_command.extend(['--key', keyring])
+
+ if wal:
+ base_command.extend(
+ ['--bluestore-block-wal-path', wal]
+ )
+ system.chown(wal)
+
+ if db:
+ base_command.extend(
+ ['--bluestore-block-db-path', db]
+ )
+ system.chown(db)
+
+ command = base_command + supplementary_command
+
+ process.run(command, obfuscate='--key')
+
+
+def osd_mkfs_filestore(osd_id, fsid):
"""
Create the files for the OSD to function. A normal call will look like:
'sudo',
'ceph-osd',
'--cluster', conf.cluster,
+ # undocumented flag, sets the `type` file to contain 'filestore'
+ '--osd-objectstore', 'filestore',
'--mkfs',
'-i', osd_id,
'--monmap', monmap,
import os
import pwd
import platform
+import tempfile
import uuid
from ceph_volume import process
from . import as_string
os.chown(path, uid, gid)
-def is_mounted(source, destination=None):
+def is_binary(path):
"""
- Check if the given device is mounted, optionally validating destination.
- This relies on absolute path devices, it will ignore non-absolute
- entries like::
+    Detect if a file path points to a binary file. Will falsely report as
+    binary when utf-16 encoded. In the ceph universe there is no such risk (yet).
+ """
+ with open(path, 'rb') as fp:
+ contents = fp.read(8192)
+ if b'\x00' in contents: # a null byte may signal binary
+ return True
+ return False
+
+
+class tmp_mount(object):
+ """
+ Temporarily mount a device on a temporary directory,
+ and unmount it upon exit
+ """
+
+ def __init__(self, device):
+ self.device = device
+ self.path = None
+
+ def __enter__(self):
+ self.path = tempfile.mkdtemp()
+ process.run([
+ 'sudo',
+ 'mount',
+ '-v',
+ self.device,
+ self.path
+ ])
+ return self.path
+
+ def __exit__(self, exc_type, exc_val, exc_tb):
+ process.run([
+ 'sudo',
+ 'umount',
+ '-v',
+ self.path
+ ])
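A usage sketch for the context manager (the device path is an example)::

    import os
    from ceph_volume.util import system

    with system.tmp_mount('/dev/sdb1') as path:
        # the device stays mounted on a mkdtemp() directory in this block
        print(os.listdir(path))
    # on exit the device is unmounted; note that __exit__ does not remove
    # the temporary directory itself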
+
+
+def path_is_mounted(path, destination=None):
+ """
+ Check if the given path is mounted
+ """
+ mounts = get_mounts(paths=True)
+ realpath = os.path.realpath(path)
+ mounted_locations = mounts.get(realpath, [])
+
+ if destination:
+ if destination.startswith('/'):
+ destination = os.path.realpath(destination)
+ return destination in mounted_locations
+ return mounted_locations != []
+
- tmpfs /run tmpfs rw,seclabel,nosuid,nodev,mode=755 0 0
+def device_is_mounted(dev, destination=None):
+ """
+    Check if the given device is mounted, optionally validating that it is
+    mounted at the given destination
+ """
+ mounts = get_mounts(devices=True)
+ realpath = os.path.realpath(dev) if dev.startswith('/') else dev
+ destination = os.path.realpath(destination) if destination else None
+ mounted_locations = mounts.get(realpath, [])
+
+ if destination:
+ return destination in mounted_locations
+ return mounted_locations != []
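Behavior of the two predicates against a mount table like the test fixture's::

    system.device_is_mounted('/dev/sda1')                              # True
    system.device_is_mounted('/dev/sda1', '/far/lib/ceph/osd/ceph-0')  # True
    system.device_is_mounted('/far/lib/ceph/osd/ceph-7')               # False, paths are not devices
    system.path_is_mounted('/boot')                                    # True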
+
+
+def get_mounts(devices=False, paths=False):
+ """
+    Create a mapping of all available system mounts so that other helpers can
+    easily detect what path or device is mounted
- But will parse paths that are absolute like::
+    It ignores (most) non-existing devices, but since some setups might need
+    extra device information, it will make an exception for:
- /dev/sdc2 /boot xfs rw,attr2,inode64,noquota 0 0
+ - tmpfs
+ - devtmpfs
- When destination is passed in, it will check that the entry where the
- source appears is mounted to where destination defines. This is useful so
- that an error message can report that a source is not mounted at an
- expected destination.
+    If ``devices`` is set to ``True``, the mapping will be device-to-path(s);
+    if ``paths`` is set to ``True``, the mapping will be
+    path-to-device(s)
"""
- dev = os.path.realpath(source)
- with open(PROCDIR + '/mounts', 'rb') as proc_mounts:
- for line in proc_mounts:
- fields = line.split()
- if len(fields) < 3:
+ devices_mounted = {}
+ paths_mounted = {}
+ do_not_skip = ['tmpfs', 'devtmpfs']
+ default_to_devices = devices is False and paths is False
+
+ with open(PROCDIR + '/mounts', 'rb') as mounts:
+ proc_mounts = mounts.readlines()
+
+ for line in proc_mounts:
+ fields = [as_string(f) for f in line.split()]
+ if len(fields) < 3:
+ continue
+ device = os.path.realpath(fields[0]) if fields[0].startswith('/') else fields[0]
+ path = os.path.realpath(fields[1])
+ # only care about actual existing devices
+ if not os.path.exists(device) or not device.startswith('/'):
+ if device not in do_not_skip:
continue
- mounted_device = fields[0]
- mounted_path = fields[1]
- if os.path.isabs(mounted_device) and os.path.exists(mounted_device):
- mounted_device = os.path.realpath(mounted_device)
- if as_string(mounted_device) == dev:
- if destination:
- destination = os.path.realpath(destination)
- return destination == as_string(os.path.realpath(mounted_path))
- else:
- return True
- return False
+ if device in devices_mounted.keys():
+ devices_mounted[device].append(path)
+ else:
+ devices_mounted[device] = [path]
+ if path in paths_mounted.keys():
+ paths_mounted[path].append(device)
+ else:
+ paths_mounted[path] = [device]
+
+    # Default to returning information for devices if no flag is set
+ if devices is True or default_to_devices:
+ return devices_mounted
+ else:
+ return paths_mounted
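The two return shapes, roughly, using the fixture's mount table as an example::

    system.get_mounts(devices=True)
    # {'/dev/sdc2': ['/boot'], 'tmpfs': ['/dev/shm', '/run', ...], ...}
    system.get_mounts(paths=True)
    # {'/boot': ['/dev/sde2', '/dev/sdc2'], '/dev/shm': ['tmpfs'], ...}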
PRIO_UNINTERESTING = 2
PRIO_DEBUGONLY = 0
-PRIO_DEFAULT = PRIO_USEFUL
+PRIO_DEFAULT = PRIO_INTERESTING
# Make life easier on developers:
# If our parent dir contains CMakeCache.txt and bin/init-ceph,
file=sys.stderr)
return False
- if service_id in exist_ids:
+    if service_id in exist_ids or (len(exist_ids) > 0 and service_id == '*'):
return True
else:
print('WARN: the service id you provided does not exist. service id should '
#include <Python.h>
+#include <pthread.h>
+
#include "include/types.h"
+#include "include/compat.h"
#include "common/config.h"
#include "common/ceph_argparse.h"
#include "common/errno.h"
*/
int main(int argc, const char **argv)
{
+ ceph_pthread_setname(pthread_self(), "ceph-mgr");
+
vector<const char*> args;
argv_to_vec(argc, argv, args);
env_to_vec(args);
flags, "mon_data");
ceph_heap_profiler_init();
- uuid_d fsid;
std::string val;
for (std::vector<const char*>::iterator i = args.begin(); i != args.end(); ) {
if (ceph_argparse_double_dash(args, i)) {
MonMap monmap;
// load or generate monmap
- if (g_conf->monmap.length()) {
- int err = monmapbl.read_file(g_conf->monmap.c_str(), &error);
+ const auto monmap_fn = g_conf->get_val<string>("monmap");
+ if (monmap_fn.length()) {
+ int err = monmapbl.read_file(monmap_fn.c_str(), &error);
if (err < 0) {
- derr << argv[0] << ": error reading " << g_conf->monmap << ": " << error << dendl;
+ derr << argv[0] << ": error reading " << monmap_fn << ": " << error << dendl;
exit(1);
}
try {
// always mark seed/mkfs monmap as epoch 0
monmap.set_epoch(0);
- }
- catch (const buffer::error& e) {
- derr << argv[0] << ": error decoding monmap " << g_conf->monmap << ": " << e.what() << dendl;
+ } catch (const buffer::error& e) {
+ derr << argv[0] << ": error decoding monmap " << monmap_fn << ": " << e.what() << dendl;
exit(1);
}
} else {
}
}
- if (!g_conf->fsid.is_zero()) {
- monmap.fsid = g_conf->fsid;
- dout(0) << argv[0] << ": set fsid to " << g_conf->fsid << dendl;
+ const auto fsid = g_conf->get_val<uuid_d>("fsid");
+ if (!fsid.is_zero()) {
+ monmap.fsid = fsid;
+ dout(0) << argv[0] << ": set fsid to " << fsid << dendl;
}
if (monmap.fsid.is_zero()) {
cephd_preload_embedded_plugins();
#endif
- if (mkfs) {
- common_init_finish(g_ceph_context);
- MonClient mc(g_ceph_context);
- if (mc.build_initial_monmap() < 0)
- return -1;
- if (mc.get_monmap_privately() < 0)
- return -1;
-
- if (mc.monmap.fsid.is_zero()) {
- derr << "must specify cluster fsid" << dendl;
- return -EINVAL;
- }
-
- int err = OSD::mkfs(g_ceph_context, store, g_conf->osd_data,
- mc.monmap.fsid, whoami);
- if (err < 0) {
- derr << TEXT_RED << " ** ERROR: error creating empty object store in "
- << g_conf->osd_data << ": " << cpp_strerror(-err) << TEXT_NORMAL << dendl;
- exit(1);
- }
- derr << "created object store " << g_conf->osd_data
- << " for osd." << whoami << " fsid " << mc.monmap.fsid << dendl;
- }
if (mkkey) {
common_init_finish(g_ceph_context);
KeyRing *keyring = KeyRing::create_empty();
derr << "created new key in keyring " << g_conf->keyring << dendl;
}
}
+ if (mkfs) {
+ common_init_finish(g_ceph_context);
+ MonClient mc(g_ceph_context);
+ if (mc.build_initial_monmap() < 0)
+ return -1;
+ if (mc.get_monmap_privately() < 0)
+ return -1;
+
+ if (mc.monmap.fsid.is_zero()) {
+ derr << "must specify cluster fsid" << dendl;
+ return -EINVAL;
+ }
+
+ int err = OSD::mkfs(g_ceph_context, store, g_conf->osd_data,
+ mc.monmap.fsid, whoami);
+ if (err < 0) {
+ derr << TEXT_RED << " ** ERROR: error creating empty object store in "
+ << g_conf->osd_data << ": " << cpp_strerror(-err) << TEXT_NORMAL << dendl;
+ exit(1);
+ }
+ derr << "created object store " << g_conf->osd_data
+ << " for osd." << whoami << " fsid " << mc.monmap.fsid << dendl;
+ }
if (mkfs || mkkey)
exit(0);
if (mkjournal) {
ldout(cct, 2) << "unmounted." << dendl;
}
-
-
-class C_C_Tick : public Context {
- Client *client;
-public:
- explicit C_C_Tick(Client *c) : client(c) {}
- void finish(int r) override {
- // Called back via Timer, which takes client_lock for us
- assert(client->client_lock.is_locked_by_me());
- client->tick();
- }
-};
-
void Client::flush_cap_releases()
{
// send any cap releases
}
ldout(cct, 21) << "tick" << dendl;
- tick_event = new C_C_Tick(this);
- timer.add_event_after(cct->_conf->client_tick_interval, tick_event);
-
+ tick_event = timer.add_event_after(
+ cct->_conf->client_tick_interval,
+ new FunctionContext([this](int) {
+ // Called back via Timer, which takes client_lock for us
+ assert(client_lock.is_locked_by_me());
+ tick();
+ }));
utime_t now = ceph_clock_now();
if (!mounted && !mds_requests.empty()) {
friend class C_Client_CacheInvalidate; // calls ino_invalidate_cb
friend class C_Client_DentryInvalidate; // calls dentry_invalidate_cb
friend class C_Block_Sync; // Calls block map and protected helpers
- friend class C_C_Tick; // Asserts on client_lock
friend class C_Client_RequestInterrupt;
friend class C_Client_Remount;
friend void intrusive_ptr_release(Inode *in);
if (tag.tid >= minimum_tag_tid) {
// no need to check for tag classes beyond this point
vals.clear();
+ more = false;
break;
}
}
// completed calculation of tag class minimums
if (tag.tid >= minimum_tag_tid) {
vals.clear();
+ more = false;
break;
}
} else if (tag_pass == TAG_PASS_LIST) {
CLS_ERR("object map footer read failed");
return r;
}
-
+
try {
bufferlist::iterator it = footer_bl.begin();
object_map.decode_footer(it);
}
bool updated = false;
- for (uint64_t object_no = start_object_no; object_no < end_object_no;
- ++object_no) {
- uint8_t state = object_map[object_no];
+ auto it = object_map.begin() + start_object_no;
+ auto end_it = object_map.begin() + end_object_no;
+ for (; it != end_it; ++it) {
+ uint8_t state = *it;
if ((!current_object_state || state == *current_object_state ||
(*current_object_state == OBJECT_EXISTS &&
state == OBJECT_EXISTS_CLEAN)) && state != new_object_state) {
- object_map[object_no] = new_object_state;
+ *it = new_object_state;
updated = true;
}
}
return 0;
}
+int list_watchers(cls_method_context_t hctx,
+ std::set<entity_inst_t> *entities) {
+ obj_list_watch_response_t watchers;
+ int r = cls_cxx_list_watchers(hctx, &watchers);
+ if (r < 0 && r != -ENOENT) {
+ CLS_ERR("error listing watchers: '%s'", cpp_strerror(r).c_str());
+ return r;
+ }
+
+ entities->clear();
+ for (auto &w : watchers.entries) {
+ entities->emplace(w.name, w.addr);
+ }
+ return 0;
+}
+
int read_peers(cls_method_context_t hctx,
std::vector<cls::rbd::MirrorPeer> *peers) {
std::string last_read = PEER_KEY_PREFIX;
}
int image_status_get(cls_method_context_t hctx, const string &global_image_id,
+ const std::set<entity_inst_t> &watchers,
cls::rbd::MirrorImageStatus *status) {
bufferlist bl;
return -EIO;
}
- obj_list_watch_response_t watchers;
- r = cls_cxx_list_watchers(hctx, &watchers);
- if (r < 0 && r != -ENOENT) {
- CLS_ERR("error listing watchers: '%s'", cpp_strerror(r).c_str());
- return r;
- }
*status = static_cast<cls::rbd::MirrorImageStatus>(ondisk_status);
- status->up = false;
- for (auto &w : watchers.entries) {
- if (w.name == ondisk_status.origin.name &&
- w.addr == ondisk_status.origin.addr) {
- status->up = true;
- break;
- }
- }
-
+ status->up = (watchers.find(ondisk_status.origin) != watchers.end());
return 0;
}
int max_read = RBD_MAX_KEYS_READ;
bool more = true;
+ std::set<entity_inst_t> watchers;
+ int r = list_watchers(hctx, &watchers);
+ if (r < 0) {
+ return r;
+ }
+
while (more && mirror_images->size() < max_return) {
std::map<std::string, bufferlist> vals;
CLS_LOG(20, "last_read = '%s'", last_read.c_str());
- int r = cls_cxx_map_get_vals(hctx, last_read, IMAGE_KEY_PREFIX, max_read,
- &vals, &more);
+ r = cls_cxx_map_get_vals(hctx, last_read, IMAGE_KEY_PREFIX, max_read, &vals,
+ &more);
if (r < 0) {
CLS_ERR("error reading mirror image directory by name: %s",
cpp_strerror(r).c_str());
(*mirror_images)[image_id] = mirror_image;
cls::rbd::MirrorImageStatus status;
- int r1 = image_status_get(hctx, mirror_image.global_image_id, &status);
+ int r1 = image_status_get(hctx, mirror_image.global_image_id, watchers,
+ &status);
if (r1 < 0) {
continue;
}
int image_status_get_summary(cls_method_context_t hctx,
std::map<cls::rbd::MirrorImageStatusState, int> *states) {
- obj_list_watch_response_t watchers_;
- int r = cls_cxx_list_watchers(hctx, &watchers_);
+ std::set<entity_inst_t> watchers;
+ int r = list_watchers(hctx, &watchers);
if (r < 0) {
- if (r != -ENOENT) {
- CLS_ERR("error listing watchers: '%s'", cpp_strerror(r).c_str());
- }
return r;
}
- set<entity_inst_t> watchers;
- for (auto &w : watchers_.entries) {
- watchers.insert(entity_inst_t(w.name, w.addr));
- }
-
states->clear();
string last_read = IMAGE_KEY_PREFIX;
}
cls::rbd::MirrorImageStatus status;
- image_status_get(hctx, mirror_image.global_image_id, &status);
+ image_status_get(hctx, mirror_image.global_image_id, watchers, &status);
cls::rbd::MirrorImageStatusState state = status.up ? status.state :
cls::rbd::MIRROR_IMAGE_STATUS_STATE_UNKNOWN;
}
int image_status_remove_down(cls_method_context_t hctx) {
- obj_list_watch_response_t watchers_;
- int r = cls_cxx_list_watchers(hctx, &watchers_);
+ std::set<entity_inst_t> watchers;
+ int r = list_watchers(hctx, &watchers);
if (r < 0) {
- if (r != -ENOENT) {
- CLS_ERR("error listing watchers: '%s'", cpp_strerror(r).c_str());
- }
return r;
}
- set<entity_inst_t> watchers;
- for (auto &w : watchers_.entries) {
- watchers.insert(entity_inst_t(w.name, w.addr));
- }
-
string last_read = STATUS_GLOBAL_KEY_PREFIX;
int max_read = RBD_MAX_KEYS_READ;
bool more = true;
return -EINVAL;
}
+ std::set<entity_inst_t> watchers;
+ int r = mirror::list_watchers(hctx, &watchers);
+ if (r < 0) {
+ return r;
+ }
+
cls::rbd::MirrorImageStatus status;
- int r = mirror::image_status_get(hctx, global_image_id, &status);
+ r = mirror::image_status_get(hctx, global_image_id, watchers, &status);
if (r < 0) {
return r;
}
CLS_LOG(20, "entry %s[%s] is not valid\n", key.name.c_str(), key.instance.c_str());
continue;
}
-
- if (!op.list_versions && !entry.is_visible()) {
+
+ // filter out noncurrent versions, delete markers, and initial marker
+ if (!op.list_versions && (!entry.is_visible() || op.start_obj.name == key.name)) {
CLS_LOG(20, "entry %s[%s] is not visible\n", key.name.c_str(), key.instance.c_str());
continue;
}
unaccount_entry(header, remove_entry);
if (op.log_op && !header.syncstopped) {
+ ++header.ver; // increment index version, or we'll overwrite keys previously written
rc = log_index_operation(hctx, remove_key, CLS_RGW_OP_DEL, op.tag, remove_entry.meta.mtime,
remove_entry.ver, CLS_RGW_STATE_COMPLETE, header.ver, header.max_marker, op.bilog_flags, NULL, NULL, &op.zones_trace);
if (rc < 0)
return 0;
}
-int rgw_dir_suggest_changes(cls_method_context_t hctx, bufferlist *in, bufferlist *out)
+int rgw_dir_suggest_changes(cls_method_context_t hctx,
+ bufferlist *in, bufferlist *out)
{
CLS_LOG(1, "rgw_dir_suggest_changes()");
}
break;
case CEPH_RGW_UPDATE:
+ if (!cur_disk.exists) {
+ // this update would only have been sent by the rgw client
+ // if the rgw_bucket_dir_entry existed, however between that
+ // check and now the entry has diappeared, so we were likely
+ // in the midst of a delete op, and we will not recreate the
+ // entry
+ CLS_LOG(10,
+ "CEPH_RGW_UPDATE not applied because rgw_bucket_dir_entry"
+ " no longer exists\n");
+ break;
+ }
+
CLS_LOG(10, "CEPH_RGW_UPDATE name=%s instance=%s total_entries: %" PRId64 " -> %" PRId64 "\n",
cur_change.key.name.c_str(), cur_change.key.instance.c_str(), stats.num_entries, stats.num_entries + 1);
+
stats.num_entries++;
stats.total_size += cur_change.meta.accounted_size;
stats.total_size_rounded += cls_rgw_get_rounded_size(cur_change.meta.accounted_size);
}
}
break;
- }
- }
-
- }
+ } // switch(op)
+ } // if (cur_disk.pending_map.empty())
+ } // while (!in_iter.end())
if (header_changed) {
return write_bucket_header(hctx, &header);
bool by_user = !user.empty();
uint32_t i = 0;
string user_key;
-
- if (truncated)
- *truncated = false;
+ bool truncated_status = false;
if (!by_user) {
usage_record_prefix_by_time(end, end_key);
}
CLS_LOG(20, "usage_iterate_range start_key=%s", start_key.c_str());
- int ret = cls_cxx_map_get_vals(hctx, start_key, filter_prefix, max_entries, &keys, truncated);
+ int ret = cls_cxx_map_get_vals(hctx, start_key, filter_prefix, max_entries, &keys, &truncated_status);
if (ret < 0)
return ret;
-
+ if (truncated) {
+ *truncated = truncated_status;
+ }
+
map<string, bufferlist>::iterator iter = keys.begin();
if (iter == keys.end())
return 0;
if (!by_user && key.compare(end_key) >= 0) {
CLS_LOG(20, "usage_iterate_range reached key=%s, done", key.c_str());
+ if (truncated_status) {
+ key_iter = key;
+ }
return 0;
}
if (by_user && key.compare(0, user_key.size(), user_key) != 0) {
CLS_LOG(20, "usage_iterate_range reached key=%s, done", key.c_str());
+ if (truncated_status) {
+ key_iter = key;
+ }
return 0;
}
if (!op.add){
apply_entry_stats(update_entry, &entry);
}
-
entry.user_stats_sync = true;
ret = write_entry(hctx, key, entry);
cls_user_bucket bucket;
size_t size;
size_t size_rounded;
- real_time creation_time;
+ ceph::real_time creation_time;
uint64_t count;
bool user_stats_sync;
cls_user_bucket_entry() : size(0), size_rounded(0), count(0), user_stats_sync(false) {}
void encode(bufferlist& bl) const {
- ENCODE_START(7, 5, bl);
+ ENCODE_START(9, 5, bl);
uint64_t s = size;
__u32 mt = ceph::real_clock::to_time_t(creation_time);
string empty_str; // originally had the bucket name here, but we encode bucket later
::encode(s, bl);
::encode(user_stats_sync, bl);
::encode(creation_time, bl);
+ //::encode(placement_rule, bl); removed in v9
ENCODE_FINISH(bl);
}
void decode(bufferlist::iterator& bl) {
- DECODE_START_LEGACY_COMPAT_LEN(6, 5, 5, bl);
+ DECODE_START_LEGACY_COMPAT_LEN(9, 5, 5, bl);
__u32 mt;
uint64_t s;
string empty_str; // backward compatibility
::decode(user_stats_sync, bl);
if (struct_v >= 7)
::decode(creation_time, bl);
+ if (struct_v == 8) { // added in v8, removed in v9
+ std::string placement_rule;
+ ::decode(placement_rule, bl);
+ }
DECODE_FINISH(bl);
}
void dump(Formatter *f) const;
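The version gating above follows the usual encode/decode compatibility rules; the same idea as a minimal Python sketch (illustrative only, ``fields`` standing in for the bufferlist iterator)::

    def decode_entry(fields, struct_v):
        entry = {}
        if struct_v >= 7:
            entry['creation_time'] = fields.pop(0)
        if struct_v == 8:
            # placement_rule was added in v8 and removed again in v9,
            # so decoders must still consume it from v8-encoded data
            fields.pop(0)
        return entry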
#include "common/Finisher.h"
#include "common/Formatter.h"
+#define rdout(x) lgeneric_subdout(cct,reserver,x)
+
/**
 * Manages a configurable number of asynchronous reservations.
*
*/
template <typename T>
class AsyncReserver {
+ CephContext *cct;
Finisher *f;
unsigned max_allowed;
unsigned min_priority;
Mutex lock;
- map<unsigned, list<pair<T, Context*> > > queues;
- map<T, pair<unsigned, typename list<pair<T, Context*> >::iterator > > queue_pointers;
- set<T> in_progress;
+ struct Reservation {
+ T item;
+ unsigned prio = 0;
+ Context *grant = 0;
+ Context *preempt = 0;
+ Reservation() {}
+ Reservation(T i, unsigned pr, Context *g, Context *p = 0)
+ : item(i), prio(pr), grant(g), preempt(p) {}
+ void dump(Formatter *f) const {
+ f->dump_stream("item") << item;
+ f->dump_unsigned("prio", prio);
+ f->dump_bool("can_preempt", !!preempt);
+ }
+ friend ostream& operator<<(ostream& out, const Reservation& r) {
+ return out << r.item << "(prio " << r.prio << " grant " << r.grant
+ << " preempt " << r.preempt << ")";
+ }
+ };
+
+ map<unsigned, list<Reservation>> queues;
+ map<T, pair<unsigned, typename list<Reservation>::iterator>> queue_pointers;
+ map<T,Reservation> in_progress;
+ set<pair<unsigned,T>> preempt_by_prio; ///< in_progress that can be preempted
+
+ void preempt_one() {
+ assert(!preempt_by_prio.empty());
+ auto q = in_progress.find(preempt_by_prio.begin()->second);
+ assert(q != in_progress.end());
+ Reservation victim = q->second;
+ rdout(10) << __func__ << " preempt " << victim << dendl;
+ f->queue(victim.preempt);
+ victim.preempt = nullptr;
+ in_progress.erase(q);
+ preempt_by_prio.erase(preempt_by_prio.begin());
+ }
void do_queues() {
- typename map<unsigned, list<pair<T, Context*> > >::reverse_iterator it;
- for (it = queues.rbegin();
- it != queues.rend() &&
- in_progress.size() < max_allowed &&
- it->first >= min_priority;
- ++it) {
- while (in_progress.size() < max_allowed &&
- !it->second.empty()) {
- pair<T, Context*> p = it->second.front();
- queue_pointers.erase(p.first);
- it->second.pop_front();
- f->queue(p.second);
- in_progress.insert(p.first);
+ rdout(20) << __func__ << ":\n";
+ JSONFormatter jf(true);
+ jf.open_object_section("queue");
+ _dump(&jf);
+ jf.close_section();
+ jf.flush(*_dout);
+ *_dout << dendl;
+
+ // in case min_priority was adjusted up or max_allowed was adjusted down
+ while (!preempt_by_prio.empty() &&
+ (in_progress.size() > max_allowed ||
+ preempt_by_prio.begin()->first < min_priority)) {
+ preempt_one();
+ }
+
+ while (!queues.empty()) {
+ // choose highest priority queue
+ auto it = queues.end();
+ --it;
+ assert(!it->second.empty());
+ if (it->first < min_priority) {
+ break;
+ }
+ if (in_progress.size() >= max_allowed &&
+ !preempt_by_prio.empty() &&
+ it->first > preempt_by_prio.begin()->first) {
+ preempt_one();
+ }
+ if (in_progress.size() >= max_allowed) {
+ break; // no room
+ }
+ // grant
+ Reservation p = it->second.front();
+ rdout(10) << __func__ << " grant " << p << dendl;
+ queue_pointers.erase(p.item);
+ it->second.pop_front();
+ if (it->second.empty()) {
+ queues.erase(it);
+ }
+ f->queue(p.grant);
+ p.grant = nullptr;
+ in_progress[p.item] = p;
+ if (p.preempt) {
+ preempt_by_prio.insert(make_pair(p.prio, p.item));
}
}
}
public:
AsyncReserver(
+ CephContext *cct,
Finisher *f,
unsigned max_allowed,
unsigned min_priority = 0)
- : f(f),
+ : cct(cct),
+ f(f),
max_allowed(max_allowed),
min_priority(min_priority),
lock("AsyncReserver::lock") {}
void dump(Formatter *f) {
Mutex::Locker l(lock);
+ _dump(f);
+ }
+ void _dump(Formatter *f) {
f->dump_unsigned("max_allowed", max_allowed);
f->dump_unsigned("min_priority", min_priority);
f->open_array_section("queues");
- for (typename map<unsigned, list<pair<T, Context*> > > ::const_iterator p =
- queues.begin(); p != queues.end(); ++p) {
+ for (auto& p : queues) {
f->open_object_section("queue");
- f->dump_unsigned("priority", p->first);
+ f->dump_unsigned("priority", p.first);
f->open_array_section("items");
- for (typename list<pair<T, Context*> >::const_iterator q =
- p->second.begin(); q != p->second.end(); ++q) {
- f->dump_stream("item") << q->first;
+ for (auto& q : p.second) {
+ f->dump_object("item", q);
}
f->close_section();
f->close_section();
}
f->close_section();
f->open_array_section("in_progress");
- for (typename set<T>::const_iterator p = in_progress.begin();
- p != in_progress.end();
- ++p) {
- f->dump_stream("item") << *p;
+ for (auto& p : in_progress) {
+ f->dump_object("item", p.second);
}
f->close_section();
}
void request_reservation(
T item, ///< [in] reservation key
Context *on_reserved, ///< [in] callback to be called on reservation
- unsigned prio
+ unsigned prio, ///< [in] priority
+ Context *on_preempt = 0 ///< [in] callback to be called if we are preempted (optional)
) {
Mutex::Locker l(lock);
+ Reservation r(item, prio, on_reserved, on_preempt);
+ rdout(10) << __func__ << " queue " << r << dendl;
assert(!queue_pointers.count(item) &&
!in_progress.count(item));
- queues[prio].push_back(make_pair(item, on_reserved));
- queue_pointers.insert(make_pair(item, make_pair(prio,--(queues[prio]).end())));
+ queues[prio].push_back(r);
+ queue_pointers.insert(make_pair(item,
+ make_pair(prio,--(queues[prio]).end())));
do_queues();
}
T item ///< [in] key for reservation to cancel
) {
Mutex::Locker l(lock);
- if (queue_pointers.count(item)) {
- unsigned prio = queue_pointers[item].first;
- delete queue_pointers[item].second->second;
- queues[prio].erase(queue_pointers[item].second);
- queue_pointers.erase(item);
+ auto i = queue_pointers.find(item);
+ if (i != queue_pointers.end()) {
+ unsigned prio = i->second.first;
+ const Reservation& r = *i->second.second;
+ rdout(10) << __func__ << " cancel " << r << " (was queued)" << dendl;
+ delete r.grant;
+ delete r.preempt;
+ queues[prio].erase(i->second.second);
+ if (queues[prio].empty()) {
+ queues.erase(prio);
+ }
+ queue_pointers.erase(i);
} else {
- in_progress.erase(item);
+ auto p = in_progress.find(item);
+ if (p != in_progress.end()) {
+ rdout(10) << __func__ << " cancel " << p->second
+ << " (was in progress)" << dendl;
+ if (p->second.preempt) {
+ preempt_by_prio.erase(make_pair(p->second.prio, p->second.item));
+ delete p->second.preempt;
+ }
+ in_progress.erase(p);
+ } else {
+ rdout(10) << __func__ << " cancel " << item << " (not found)" << dendl;
+ }
}
do_queues();
}
static const unsigned MAX_PRIORITY = (unsigned)-1;
};
+#undef rdout
#endif
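As a mental model of the new grant-and-preempt flow, a minimal Python sketch; illustrative only, since the real AsyncReserver also handles cancellation, min_priority, and state dumping::

    import heapq

    class Reserver(object):
        # Grant up to max_allowed reservations; a strictly higher-priority
        # request may preempt an in-progress one that supplied a preempt
        # callback (mirroring preempt_by_prio above).
        def __init__(self, max_allowed):
            self.max_allowed = max_allowed
            self.queue = []        # heap of (-prio, seq, item, grant, preempt)
            self.in_progress = {}  # item -> (prio, preempt callback or None)
            self.seq = 0           # tie-breaker; the heap never compares items

        def request(self, item, prio, grant, preempt=None):
            heapq.heappush(self.queue, (-prio, self.seq, item, grant, preempt))
            self.seq += 1
            self.do_queues()

        def do_queues(self):
            while self.queue:
                top_prio = -self.queue[0][0]
                if len(self.in_progress) >= self.max_allowed:
                    victims = sorted((p, i) for i, (p, cb)
                                     in self.in_progress.items() if cb)
                    # only a strictly higher priority may preempt
                    if not victims or victims[0][0] >= top_prio:
                        break
                    self.in_progress.pop(victims[0][1])[1]()  # fire preempt
                _, _, item, grant, preempt = heapq.heappop(self.queue)
                self.in_progress[item] = (top_prio, preempt)
                grant()

    r = Reserver(max_allowed=1)
    r.request('pg1', 1, lambda: print('pg1 granted'),
              lambda: print('pg1 preempted'))
    r.request('pg2', 5, lambda: print('pg2 granted'))
    # prints: pg1 granted, pg1 preempted, pg2 granted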
return r;
}
- fsid = cct->_conf->fsid;
+ fsid = cct->_conf->get_val<uuid_d>("fsid");
host = cct->_conf->host;
return 0;
}
lock.Unlock();
}
-bool SafeTimer::add_event_after(double seconds, Context *callback)
+Context* SafeTimer::add_event_after(double seconds, Context *callback)
{
assert(lock.is_locked());
return add_event_at(when, callback);
}
-bool SafeTimer::add_event_at(utime_t when, Context *callback)
+Context* SafeTimer::add_event_at(utime_t when, Context *callback)
{
assert(lock.is_locked());
ldout(cct,10) << __func__ << " " << when << " -> " << callback << dendl;
if (stopping) {
ldout(cct,5) << __func__ << " already shutdown, event not added" << dendl;
delete callback;
- return false;
+ return nullptr;
}
scheduled_map_t::value_type s_val(when, callback);
scheduled_map_t::iterator i = schedule.insert(s_val);
* adjust our timeout. */
if (i == schedule.begin())
cond.Signal();
- return true;
+ return callback;
}
bool SafeTimer::cancel_event(Context *callback)
/* Schedule an event in the future
* Call with the event_lock LOCKED */
- bool add_event_after(double seconds, Context *callback);
- bool add_event_at(utime_t when, Context *callback);
+ Context* add_event_after(double seconds, Context *callback);
+ Context* add_event_at(utime_t when, Context *callback);
/* Cancel an event.
* Call with the event_lock LOCKED
#include "common/Formatter.h"
#include "include/assert.h"
#include "include/encoding.h"
+#include <utility>
namespace ceph {
// must be power of 2
BOOST_STATIC_ASSERT((_bit_count != 0) && !(_bit_count & (_bit_count - 1)));
BOOST_STATIC_ASSERT(_bit_count <= BITS_PER_BYTE);
-public:
- static const uint32_t BLOCK_SIZE;
- class ConstReference {
+ template <typename DataIterator>
+ class ReferenceImpl {
+ protected:
+ DataIterator m_data_iterator;
+ uint64_t m_shift;
+
+ ReferenceImpl(const DataIterator& data_iterator, uint64_t shift)
+ : m_data_iterator(data_iterator), m_shift(shift) {
+ }
+ ReferenceImpl(DataIterator&& data_iterator, uint64_t shift)
+ : m_data_iterator(std::move(data_iterator)), m_shift(shift) {
+ }
+
public:
- operator uint8_t() const;
+ inline operator uint8_t() const {
+ return (*m_data_iterator >> m_shift) & MASK;
+ }
+ };
+
+public:
+
+ class ConstReference : public ReferenceImpl<bufferlist::const_iterator> {
private:
friend class BitVector;
- const BitVector &m_bit_vector;
- uint64_t m_offset;
- ConstReference(const BitVector &bit_vector, uint64_t offset);
+ ConstReference(const bufferlist::const_iterator& data_iterator,
+ uint64_t shift)
+ : ReferenceImpl<bufferlist::const_iterator>(data_iterator, shift) {
+ }
+ ConstReference(bufferlist::const_iterator&& data_iterator, uint64_t shift)
+ : ReferenceImpl<bufferlist::const_iterator>(std::move(data_iterator),
+ shift) {
+ }
};
- class Reference {
+ class Reference : public ReferenceImpl<bufferlist::iterator> {
public:
- operator uint8_t() const;
Reference& operator=(uint8_t v);
+
+ private:
+ friend class BitVector;
+
+ Reference(const bufferlist::iterator& data_iterator, uint64_t shift)
+ : ReferenceImpl<bufferlist::iterator>(data_iterator, shift) {
+ }
+ Reference(bufferlist::iterator&& data_iterator, uint64_t shift)
+ : ReferenceImpl<bufferlist::iterator>(std::move(data_iterator), shift) {
+ }
+ };
+
+public:
+ template <typename BitVectorT, typename DataIterator>
+ class IteratorImpl {
private:
friend class BitVector;
- BitVector &m_bit_vector;
- uint64_t m_offset;
- Reference(BitVector &bit_vector, uint64_t offset);
+ uint64_t m_offset = 0;
+ BitVectorT *m_bit_vector;
+
+ // cached derived values
+ uint64_t m_index = 0;
+ uint64_t m_shift = 0;
+ DataIterator m_data_iterator;
+
+ IteratorImpl(BitVectorT *bit_vector, uint64_t offset)
+ : m_bit_vector(bit_vector),
+ m_data_iterator(bit_vector->m_data.begin()) {
+ *this += offset;
+ }
+
+ public:
+ inline IteratorImpl& operator++() {
+ ++m_offset;
+
+ uint64_t index;
+ compute_index(m_offset, &index, &m_shift);
+
+ assert(index == m_index || index == m_index + 1);
+ if (index > m_index) {
+ m_index = index;
+ ++m_data_iterator;
+ }
+ return *this;
+ }
+ inline IteratorImpl& operator+=(uint64_t offset) {
+ m_offset += offset;
+ compute_index(m_offset, &m_index, &m_shift);
+ if (m_offset < m_bit_vector->size()) {
+ m_data_iterator.seek(m_index);
+ } else {
+ m_data_iterator = m_bit_vector->m_data.end();
+ }
+ return *this;
+ }
+
+ inline IteratorImpl operator++(int) {
+ IteratorImpl iterator_impl(*this);
+ ++iterator_impl;
+ return iterator_impl;
+ }
+ inline IteratorImpl operator+(uint64_t offset) {
+ IteratorImpl iterator_impl(*this);
+ iterator_impl += offset;
+ return iterator_impl;
+ }
+
+ inline bool operator==(const IteratorImpl& rhs) const {
+ return (m_offset == rhs.m_offset && m_bit_vector == rhs.m_bit_vector);
+ }
+ inline bool operator!=(const IteratorImpl& rhs) const {
+ return (m_offset != rhs.m_offset || m_bit_vector != rhs.m_bit_vector);
+ }
+
+ inline ConstReference operator*() const {
+ return ConstReference(m_data_iterator, m_shift);
+ }
+ inline Reference operator*() {
+ return Reference(m_data_iterator, m_shift);
+ }
};
+ typedef IteratorImpl<const BitVector,
+ bufferlist::const_iterator> ConstIterator;
+ typedef IteratorImpl<BitVector, bufferlist::iterator> Iterator;
+
+ static const uint32_t BLOCK_SIZE;
static const uint8_t BIT_COUNT = _bit_count;
BitVector();
+ inline ConstIterator begin() const {
+ return ConstIterator(this, 0);
+ }
+ inline ConstIterator end() const {
+ return ConstIterator(this, m_size);
+ }
+ inline Iterator begin() {
+ return Iterator(this, 0);
+ }
+ inline Iterator end() {
+ return Iterator(this, m_size);
+ }
+
void set_crc_enabled(bool enabled) {
m_crc_enabled = enabled;
}
template <uint8_t _b>
typename BitVector<_b>::Reference BitVector<_b>::operator[](uint64_t offset) {
- return Reference(*this, offset);
-}
-
-template <uint8_t _b>
-typename BitVector<_b>::ConstReference BitVector<_b>::operator[](uint64_t offset) const {
- return ConstReference(*this, offset);
-}
-
-template <uint8_t _b>
-BitVector<_b>::ConstReference::ConstReference(const BitVector<_b> &bit_vector,
- uint64_t offset)
- : m_bit_vector(bit_vector), m_offset(offset)
-{
-}
-
-template <uint8_t _b>
-BitVector<_b>::ConstReference::operator uint8_t() const {
uint64_t index;
uint64_t shift;
- this->m_bit_vector.compute_index(this->m_offset, &index, &shift);
+ compute_index(offset, &index, &shift);
- return (this->m_bit_vector.m_data[index] >> shift) & MASK;
+ bufferlist::iterator data_iterator(m_data.begin());
+ data_iterator.seek(index);
+ return Reference(std::move(data_iterator), shift);
}
template <uint8_t _b>
-BitVector<_b>::Reference::Reference(BitVector<_b> &bit_vector, uint64_t offset)
- : m_bit_vector(bit_vector), m_offset(offset)
-{
-}
-
-template <uint8_t _b>
-BitVector<_b>::Reference::operator uint8_t() const {
+typename BitVector<_b>::ConstReference BitVector<_b>::operator[](uint64_t offset) const {
uint64_t index;
uint64_t shift;
- this->m_bit_vector.compute_index(this->m_offset, &index, &shift);
+ compute_index(offset, &index, &shift);
- return (this->m_bit_vector.m_data[index] >> shift) & MASK;
+ bufferlist::const_iterator data_iterator(m_data.begin());
+ data_iterator.seek(index);
+ return ConstReference(std::move(data_iterator), shift);
}
template <uint8_t _b>
typename BitVector<_b>::Reference& BitVector<_b>::Reference::operator=(uint8_t v) {
- uint64_t index;
- uint64_t shift;
- this->m_bit_vector.compute_index(this->m_offset, &index, &shift);
-
- uint8_t mask = MASK << shift;
- char packed_value = (this->m_bit_vector.m_data[index] & ~mask) |
- ((v << shift) & mask);
- this->m_bit_vector.m_data.copy_in(index, 1, &packed_value);
+ uint8_t mask = MASK << this->m_shift;
+ char packed_value = (*this->m_data_iterator & ~mask) |
+ ((v << this->m_shift) & mask);
+ bufferlist::iterator it(this->m_data_iterator);
+ it.copy_in(1, &packed_value, true);
return *this;
}
char *data;
unsigned len;
std::atomic<unsigned> nref { 0 };
- int mempool = mempool::mempool_buffer_anon;
+ int mempool;
mutable std::atomic_flag crc_spinlock = ATOMIC_FLAG_INIT;
map<pair<size_t, size_t>, pair<uint32_t, uint32_t> > crc_map;
- explicit raw(unsigned l)
- : data(NULL), len(l), nref(0) {
+ explicit raw(unsigned l, int mempool=mempool::mempool_buffer_anon)
+ : data(NULL), len(l), nref(0), mempool(mempool) {
mempool::get_pool(mempool::pool_index_t(mempool)).adjust_count(1, len);
}
- raw(char *c, unsigned l)
- : data(c), len(l), nref(0) {
+ raw(char *c, unsigned l, int mempool=mempool::mempool_buffer_anon)
+ : data(c), len(l), nref(0), mempool(mempool) {
mempool::get_pool(mempool::pool_index_t(mempool)).adjust_count(1, len);
}
virtual ~raw() {
class buffer::raw_combined : public buffer::raw {
size_t alignment;
public:
- raw_combined(char *dataptr, unsigned l, unsigned align=0)
- : raw(dataptr, l),
+ raw_combined(char *dataptr, unsigned l, unsigned align,
+ int mempool)
+ : raw(dataptr, l, mempool),
alignment(align) {
inc_total_alloc(len);
inc_history_alloc(len);
return create(len, alignment);
}
- static raw_combined *create(unsigned len, unsigned align=0) {
+ static raw_combined *create(unsigned len,
+ unsigned align,
+ int mempool = mempool::mempool_buffer_anon) {
if (!align)
align = sizeof(size_t);
size_t rawlen = ROUND_UP_TO(sizeof(buffer::raw_combined),
// actual data first, since it has presumably larger alignment restriction
// then put the raw_combined at the end
- return new (ptr + datalen) raw_combined(ptr, len, align);
+ return new (ptr + datalen) raw_combined(ptr, len, align, mempool);
}
static void operator delete(void *ptr) {
buffer::raw* buffer::create(unsigned len) {
return buffer::create_aligned(len, sizeof(size_t));
}
+ buffer::raw* buffer::create_in_mempool(unsigned len, int mempool) {
+ return buffer::create_aligned_in_mempool(len, sizeof(size_t), mempool);
+ }
buffer::raw* buffer::claim_char(unsigned len, char *buf) {
return new raw_claimed_char(len, buf);
}
return new raw_claim_buffer(buf, len, std::move(del));
}
- buffer::raw* buffer::create_aligned(unsigned len, unsigned align) {
+ buffer::raw* buffer::create_aligned_in_mempool(
+ unsigned len, unsigned align, int mempool) {
// If alignment is a page multiple, use a separate buffer::raw to
// avoid fragmenting the heap.
//
return new raw_hack_aligned(len, align);
#endif
}
- return raw_combined::create(len, align);
+ return raw_combined::create(len, align, mempool);
+ }
+ buffer::raw* buffer::create_aligned(
+ unsigned len, unsigned align) {
+ return create_aligned_in_mempool(len, align,
+ mempool::mempool_buffer_anon);
}
buffer::raw* buffer::create_page_aligned(unsigned len) {
bool buffer::ptr::at_buffer_tail() const { return _off + _len == _raw->len; }
+ int buffer::ptr::get_mempool() const {
+ if (_raw) {
+ return _raw->mempool;
+ }
+ return mempool::mempool_buffer_anon;
+ }
+
+ void buffer::ptr::reassign_to_mempool(int pool) {
+ if (_raw) {
+ _raw->reassign_to_mempool(pool);
+ }
+ }
+ void buffer::ptr::try_assign_to_mempool(int pool) {
+ if (_raw) {
+ _raw->try_assign_to_mempool(pool);
+ }
+ }
+
const char *buffer::ptr::c_str() const {
assert(_raw);
if (buffer_track_c_str)
{
std::swap(_len, other._len);
std::swap(_memcopy_count, other._memcopy_count);
- std::swap(_mempool, other._mempool);
_buffers.swap(other._buffers);
append_buffer.swap(other.append_buffer);
//last_p.swap(other.last_p);
return is_aligned(CEPH_PAGE_SIZE);
}
+ int buffer::list::get_mempool() const
+ {
+ if (_buffers.empty()) {
+ return mempool::mempool_buffer_anon;
+ }
+ return _buffers.back().get_mempool();
+ }
+
void buffer::list::reassign_to_mempool(int pool)
{
- _mempool = pool;
if (append_buffer.get_raw()) {
append_buffer.get_raw()->reassign_to_mempool(pool);
}
void buffer::list::try_assign_to_mempool(int pool)
{
- _mempool = pool;
if (append_buffer.get_raw()) {
append_buffer.get_raw()->try_assign_to_mempool(pool);
}
void buffer::list::reserve(size_t prealloc)
{
if (append_buffer.unused_tail_length() < prealloc) {
- append_buffer = buffer::create(prealloc);
- if (_mempool >= 0) {
- append_buffer.get_raw()->reassign_to_mempool(_mempool);
- }
+ append_buffer = buffer::create_in_mempool(prealloc, get_mempool());
append_buffer.set_length(0); // unused, so far.
}
}
unsigned gap = append_buffer.unused_tail_length();
if (!gap) {
// make a new append_buffer!
- append_buffer = raw_combined::create(CEPH_BUFFER_APPEND_SIZE);
+ append_buffer = raw_combined::create(CEPH_BUFFER_APPEND_SIZE, 0,
+ get_mempool());
append_buffer.set_length(0); // unused, so far.
- if (_mempool >= 0) {
- append_buffer.get_raw()->reassign_to_mempool(_mempool);
- }
}
append(append_buffer, append_buffer.append(c) - 1, 1); // add segment to the list
}
size_t need = ROUND_UP_TO(len, sizeof(size_t)) + sizeof(raw_combined);
size_t alen = ROUND_UP_TO(need, CEPH_BUFFER_ALLOC_UNIT) -
sizeof(raw_combined);
- append_buffer = raw_combined::create(alen);
+ append_buffer = raw_combined::create(alen, 0, get_mempool());
append_buffer.set_length(0); // unused, so far.
- if (_mempool >= 0) {
- append_buffer.get_raw()->reassign_to_mempool(_mempool);
- }
}
}
}
if (log->graylog() && changed.count("fsid")) {
- log->graylog()->set_fsid(conf->fsid);
+ log->graylog()->set_fsid(conf->get_val<uuid_d>("fsid"));
}
}
};
conf->set_val_or_die("err_to_stderr", "false");
conf->set_val_or_die("log_flush_on_exit", "false");
}
+ if (code_env != CODE_ENVIRONMENT_DAEMON) {
+ // NOTE: disable ms subsystem gathering in clients by default
+ conf->set_val_or_die("debug_ms", "0/0");
+ }
return cct;
}
set_val_or_die("client_mountpoint", val.c_str());
}
else {
- parse_option(args, i, NULL);
+ int r = parse_option(args, i, NULL);
+ if (r < 0) {
+ return r;
+ }
}
}
std::string as_option("--");
as_option += "debug_";
as_option += subsys.get_name(o);
- if (ceph_argparse_witharg(args, i, &val,
+ ostringstream err;
+ if (ceph_argparse_witharg(args, i, &val, err,
as_option.c_str(), (char*)NULL)) {
+ if (err.tellp()) {
+ if (oss) {
+ *oss << err.str();
+ }
+ ret = -EINVAL;
+ break;
+ }
int log, gather;
int r = sscanf(val.c_str(), "%d/%d", &log, &gather);
if (r >= 1) {
/* note: no header guard */
OPTION(host, OPT_STR) // "" means that ceph will use short hostname
-OPTION(fsid, OPT_UUID)
OPTION(public_addr, OPT_ADDR)
OPTION(public_bind_addr, OPT_ADDR)
OPTION(cluster_addr, OPT_ADDR)
OPTION(public_network, OPT_STR)
OPTION(cluster_network, OPT_STR)
-OPTION(monmap, OPT_STR)
-OPTION(mon_host, OPT_STR)
-OPTION(mon_dns_srv_name, OPT_STR)
OPTION(lockdep, OPT_BOOL)
OPTION(lockdep_force_backtrace, OPT_BOOL) // always gather current backtrace at every lock
OPTION(run_dir, OPT_STR) // the "/var/run/ceph" dir, created on daemon startup
OPTION(mon_timecheck_skew_interval, OPT_FLOAT) // on leader, timecheck (clock drift check) interval when in presence of a skew (seconds)
OPTION(mon_pg_stuck_threshold, OPT_INT) // number of seconds after which pgs can be considered stuck inactive, unclean, etc (see doc/control.rst under dump_stuck for more info)
OPTION(mon_pg_min_inactive, OPT_U64) // the number of PGs which have to be inactive longer than 'mon_pg_stuck_threshold' before health goes into ERR. 0 means disabled, never go into ERR.
-OPTION(mon_pg_warn_min_per_osd, OPT_INT) // min # pgs per (in) osd before we warn the admin
-OPTION(mon_pg_warn_max_per_osd, OPT_INT) // max # pgs per (in) osd before we warn the admin
 OPTION(mon_pg_warn_max_object_skew, OPT_FLOAT) // max skew from average in objects per pg
OPTION(mon_pg_warn_min_objects, OPT_INT) // do not warn below this object #
OPTION(mon_pg_warn_min_pool_objects, OPT_INT) // do not warn on pools below this object #
OPTION(mon_max_osd, OPT_INT)
OPTION(mon_probe_timeout, OPT_DOUBLE)
OPTION(mon_client_bytes, OPT_U64) // client msg data allowed in memory (in bytes)
-OPTION(mon_mgr_proxy_client_bytes_ratio, OPT_FLOAT) // ratio of mon_client_bytes that can be consumed by proxied mgr commands before we error out to client
OPTION(mon_log_max_summary, OPT_U64)
OPTION(mon_daemon_bytes, OPT_U64) // mds, osd message memory cap (in bytes)
OPTION(mon_max_log_entries_per_event, OPT_INT)
OPTION(rgw_swift_versioning_enabled, OPT_BOOL) // whether swift object versioning feature is enabled
-OPTION(mgr_module_path, OPT_STR) // where to load python modules from
-OPTION(mgr_initial_modules, OPT_STR) // Which modules to load
-OPTION(mgr_data, OPT_STR) // where to find keyring etc
-OPTION(mgr_tick_period, OPT_INT) // How frequently to tick
-OPTION(mgr_stats_period, OPT_INT) // How frequently clients send stats
-OPTION(mgr_client_bytes, OPT_U64) // bytes from clients
-OPTION(mgr_client_messages, OPT_U64) // messages from clients
-OPTION(mgr_osd_bytes, OPT_U64) // bytes from osds
-OPTION(mgr_osd_messages, OPT_U64) // messages from osds
-OPTION(mgr_mds_bytes, OPT_U64) // bytes from mdss
-OPTION(mgr_mds_messages, OPT_U64) // messages from mdss
-OPTION(mgr_mon_bytes, OPT_U64) // bytes from mons
-OPTION(mgr_mon_messages, OPT_U64) // messages from mons
-
-OPTION(mgr_connect_retry_interval, OPT_DOUBLE)
-OPTION(mgr_service_beacon_grace, OPT_DOUBLE)
-
-OPTION(mon_mgr_digest_period, OPT_INT) // How frequently to send digests
-OPTION(mon_mgr_beacon_grace, OPT_INT) // How long to wait to failover
-OPTION(mon_mgr_inactive_grace, OPT_INT) // How long before health WARN -> ERR
-OPTION(mon_mgr_mkfs_grace, OPT_INT) // How long before we complain about MGR_DOWN
OPTION(rgw_crypt_require_ssl, OPT_BOOL) // requests including encryption key headers must be sent over ssl
OPTION(rgw_crypt_default_encryption_key, OPT_STR) // base64 encoded key for encryption of rgw objects
OPTION(rgw_crypt_s3_kms_encryption_keys, OPT_STR) // extra keys that may be used for aws:kms
#include <boost/lexical_cast.hpp>
#include <boost/regex.hpp>
+// Definitions for enums
+#include "common/perf_counters.h"
+
void Option::dump_value(const char *field_name,
const Option::value_t &v, Formatter *f) const
Option("public_network", Option::TYPE_STR, Option::LEVEL_ADVANCED)
.add_service({"mon", "mds", "osd", "mgr"})
.add_tag("network")
- .set_description(""),
+ .set_description("Network(s) from which to choose a public address to bind to"),
+
+ Option("public_network_interface", Option::TYPE_STR, Option::LEVEL_ADVANCED)
+ .add_service({"mon", "mds", "osd", "mgr"})
+ .add_tag("network")
+ .set_description("Interface name(s) from which to choose an address from a public_network to bind to; public_network must also be specified.")
+ .add_see_also("public_network"),
Option("cluster_network", Option::TYPE_STR, Option::LEVEL_ADVANCED)
.add_service("osd")
.add_tag("network")
- .set_description(""),
+ .set_description("Network(s) from which to choose a cluster address to bind to"),
+
+ Option("cluster_network_interface", Option::TYPE_STR, Option::LEVEL_ADVANCED)
+ .add_service({"mon", "mds", "osd", "mgr"})
+ .add_tag("network")
+ .set_description("Interface name(s) from which to choose an address from a cluster_network to bind to; cluster_network must also be specified.")
+ .add_see_also("cluster_network"),
Option("monmap", Option::TYPE_STR, Option::LEVEL_ADVANCED)
.set_description("path to MonMap file")
.add_service("common"),
Option("mon_dns_srv_name", Option::TYPE_STR, Option::LEVEL_ADVANCED)
+ .set_default("ceph-mon")
.set_description("name of DNS SRV record to check for monitor addresses")
.add_service("common")
.add_tag("network")
Option("key", Option::TYPE_STR, Option::LEVEL_ADVANCED)
.set_default("")
- .set_description(""),
+ .set_description("Authentication key")
+ .set_long_description("A CephX authentication key, base64 encoded. It normally looks something like 'AQAtut9ZdMbNJBAAHz6yBAWyJyz2yYRyeMWDag=='.")
+ .add_see_also("keyfile")
+ .add_see_also("keyring"),
Option("keyfile", Option::TYPE_STR, Option::LEVEL_ADVANCED)
.set_default("")
- .set_description(""),
+ .set_description("Path to a file containing a key")
+ .set_long_description("The file should contain a CephX authentication key and optionally a trailing newline, but nothing else.")
+ .add_see_also("key"),
Option("keyring", Option::TYPE_STR, Option::LEVEL_ADVANCED)
.set_default(
"/usr/local/etc/ceph/keyring,/usr/local/etc/ceph/keyring.bin,"
#endif
)
- .set_description(""),
+ .set_description("Path to a keyring file.")
+ .set_long_description("A keyring file is an INI-style formatted file where the section names are client or daemon names (e.g., 'osd.0') and each section contains a 'key' property with CephX authentication key as the value.")
+ .add_see_also("key")
+ .add_see_also("keyfile"),
Option("heartbeat_interval", Option::TYPE_INT, Option::LEVEL_ADVANCED)
.set_default(5)
.set_default(1)
.set_description(""),
- Option("mon_pg_warn_min_per_osd", Option::TYPE_INT, Option::LEVEL_ADVANCED)
+ Option("mon_pg_warn_min_per_osd", Option::TYPE_UINT, Option::LEVEL_ADVANCED)
.set_default(30)
- .set_description(""),
+ .set_description("minimal number PGs per (in) osd before we warn the admin"),
- Option("mon_pg_warn_max_per_osd", Option::TYPE_INT, Option::LEVEL_ADVANCED)
- .set_default(300)
- .set_description(""),
+ Option("mon_max_pg_per_osd", Option::TYPE_UINT, Option::LEVEL_ADVANCED)
+ .set_default(200)
+ .set_description("Max number of PGs per OSD the cluster will allow"),
Option("mon_pg_warn_max_object_skew", Option::TYPE_FLOAT, Option::LEVEL_ADVANCED)
.set_default(10.0)
.set_default(100ul << 20)
.set_description(""),
- Option("mon_mgr_proxy_client_bytes_ratio", Option::TYPE_FLOAT, Option::LEVEL_ADVANCED)
+ Option("mon_mgr_proxy_client_bytes_ratio", Option::TYPE_FLOAT, Option::LEVEL_DEV)
.set_default(.3)
- .set_description(""),
+ .set_description("ratio of mon_client_bytes that can be consumed by "
+ "proxied mgr commands before we error out to client"),
Option("mon_log_max_summary", Option::TYPE_UINT, Option::LEVEL_ADVANCED)
.set_default(50)
.set_default(false)
.set_description(""),
+ Option("mon_fixup_legacy_erasure_code_profiles", Option::TYPE_BOOL, Option::LEVEL_ADVANCED)
+ .set_default(true)
+ .set_description("Automatically adjust ruleset-* to crush-* so that legacy apps can set modern erasure code profiles without modification"),
+
Option("mon_debug_deprecated_as_obsolete", Option::TYPE_BOOL, Option::LEVEL_DEV)
.set_default(false)
.set_description(""),
.set_default(100)
.set_description(""),
+ Option("osd_max_pg_per_osd_hard_ratio", Option::TYPE_FLOAT, Option::LEVEL_ADVANCED)
+ .set_default(2)
+ .set_min(1)
+ .set_description("Maximum number of PG per OSD, a factor of 'mon_max_pg_per_osd'")
+ .set_long_description("OSD will refuse to instantiate PG if the number of PG it serves exceeds this number.")
+ .add_see_also("mon_max_pg_per_osd"),
+
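+ // A worked example with the defaults above: with mon_max_pg_per_osd = 200
+ // and this ratio at its default of 2, an OSD refuses to instantiate new
+ // PGs once it already serves 200 * 2 = 400 PGs.
+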
Option("osd_op_complaint_time", Option::TYPE_FLOAT, Option::LEVEL_ADVANCED)
.set_default(30)
.set_description(""),
.set_default(false)
.set_description(""),
+ Option("osd_debug_shutdown", Option::TYPE_BOOL, Option::LEVEL_DEV)
+ .set_default(false)
+ .set_description("Turn up debug levels during shutdown"),
+
Option("osd_debug_crash_on_ignored_backoff", Option::TYPE_BOOL, Option::LEVEL_DEV)
.set_default(false)
.set_description(""),
// --------------------------
// bluestore
+ Option("bdev_inject_bad_size", Option::TYPE_BOOL, Option::LEVEL_DEV)
+ .set_default(false)
+ .set_description(""),
+
Option("bdev_debug_inflight_ios", Option::TYPE_BOOL, Option::LEVEL_DEV)
.set_default(false)
.set_description(""),
.set_default(1*1024*1024*1024)
.set_description("minimum disk space allocated to BlueFS (e.g., at mkfs)"),
+ Option("bluestore_bluefs_min_free", Option::TYPE_UINT, Option::LEVEL_ADVANCED)
+ .set_default(1*1024*1024*1024)
+ .set_description("minimum free space allocated to BlueFS"),
+
Option("bluestore_bluefs_min_ratio", Option::TYPE_FLOAT, Option::LEVEL_ADVANCED)
.set_default(.02)
.set_description("Minimum fraction of free space devoted to BlueFS"),
// filestore
Option("filestore_rocksdb_options", Option::TYPE_STR, Option::LEVEL_ADVANCED)
- .set_default("")
+ .set_default("max_background_compactions=8,compaction_readahead_size=2097152,compression=kNoCompression")
.set_description(""),
Option("filestore_omap_backend", Option::TYPE_STR, Option::LEVEL_ADVANCED)
.set_default(0)
.set_description(""),
+ Option("mgr_stats_threshold", Option::TYPE_INT, Option::LEVEL_ADVANCED)
+ .set_default((int64_t)PerfCountersBuilder::PRIO_USEFUL)
+ .set_description("Lowest perfcounter priority collected by mgr")
+ .set_long_description("Daemons only set perf counter data to the manager "
+ "daemon if the counter has a priority higher than this.")
+ .set_min_max((int64_t)PerfCountersBuilder::PRIO_DEBUGONLY,
+ (int64_t)PerfCountersBuilder::PRIO_CRITICAL),
+
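+ // For example, at the default threshold of PRIO_USEFUL (5), counters
+ // tagged PRIO_INTERESTING (8) or PRIO_CRITICAL (10) are reported to the
+ // manager, while PRIO_UNINTERESTING (2) and PRIO_DEBUGONLY (0) are not.
+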
Option("journal_zero_on_create", Option::TYPE_BOOL, Option::LEVEL_ADVANCED)
.set_default(false)
.set_description(""),
Option("mgr_module_path", Option::TYPE_STR, Option::LEVEL_ADVANCED)
.set_default(CEPH_PKGLIBDIR "/mgr")
- .set_description(""),
+ .add_service("mgr")
+ .set_description("Filesystem path to manager modules."),
- Option("mgr_initial_modules", Option::TYPE_STR, Option::LEVEL_ADVANCED)
- .set_default("restful status")
- .set_description(""),
+ Option("mgr_initial_modules", Option::TYPE_STR, Option::LEVEL_BASIC)
+ .set_default("restful status balancer")
+ .add_service("mon")
+ .set_description("List of manager modules to enable when the cluster is "
+ "first started")
+ .set_long_description("This list of module names is read by the monitor "
+ "when the cluster is first started after installation, to populate "
+ "the list of enabled manager modules. Subsequent updates are done using "
+ "the 'mgr module [enable|disable]' commands. List may be comma "
+ "or space separated."),
Option("mgr_data", Option::TYPE_STR, Option::LEVEL_ADVANCED)
.set_default("/var/lib/ceph/mgr/$cluster-$id")
- .set_description(""),
+ .add_service("mgr")
+ .set_description("Filesystem path to the ceph-mgr data directory, used to "
+ "contain keyring."),
Option("mgr_tick_period", Option::TYPE_INT, Option::LEVEL_ADVANCED)
.set_default(2)
- .set_description(""),
+ .add_service("mgr")
+ .set_description("Period in seconds of beacon messages to monitor"),
- Option("mgr_stats_period", Option::TYPE_INT, Option::LEVEL_ADVANCED)
+ Option("mgr_stats_period", Option::TYPE_INT, Option::LEVEL_BASIC)
.set_default(5)
- .set_description(""),
-
- Option("mgr_client_bytes", Option::TYPE_UINT, Option::LEVEL_ADVANCED)
+ .add_service("mgr")
+ .set_description("Period in seconds of OSD/MDS stats reports to manager")
+ .set_long_description("Use this setting to control the granularity of "
+ "time series data collection from daemons. Adjust "
+ "upwards if the manager CPU load is too high, or "
+ "if you simply do not require the most up to date "
+ "performance counter data."),
+
+ Option("mgr_client_bytes", Option::TYPE_UINT, Option::LEVEL_DEV)
.set_default(128*1048576)
- .set_description(""),
+ .add_service("mgr"),
- Option("mgr_client_messages", Option::TYPE_UINT, Option::LEVEL_ADVANCED)
+ Option("mgr_client_messages", Option::TYPE_UINT, Option::LEVEL_DEV)
.set_default(512)
- .set_description(""),
+ .add_service("mgr"),
- Option("mgr_osd_bytes", Option::TYPE_UINT, Option::LEVEL_ADVANCED)
+ Option("mgr_osd_bytes", Option::TYPE_UINT, Option::LEVEL_DEV)
.set_default(512*1048576)
- .set_description(""),
+ .add_service("mgr"),
- Option("mgr_osd_messages", Option::TYPE_UINT, Option::LEVEL_ADVANCED)
+ Option("mgr_osd_messages", Option::TYPE_UINT, Option::LEVEL_DEV)
.set_default(8192)
- .set_description(""),
+ .add_service("mgr"),
- Option("mgr_mds_bytes", Option::TYPE_UINT, Option::LEVEL_ADVANCED)
+ Option("mgr_mds_bytes", Option::TYPE_UINT, Option::LEVEL_DEV)
.set_default(128*1048576)
- .set_description(""),
+ .add_service("mgr"),
- Option("mgr_mds_messages", Option::TYPE_UINT, Option::LEVEL_ADVANCED)
+ Option("mgr_mds_messages", Option::TYPE_UINT, Option::LEVEL_DEV)
.set_default(128)
- .set_description(""),
+ .add_service("mgr"),
- Option("mgr_mon_bytes", Option::TYPE_UINT, Option::LEVEL_ADVANCED)
+ Option("mgr_mon_bytes", Option::TYPE_UINT, Option::LEVEL_DEV)
.set_default(128*1048576)
- .set_description(""),
+ .add_service("mgr"),
- Option("mgr_mon_messages", Option::TYPE_UINT, Option::LEVEL_ADVANCED)
+ Option("mgr_mon_messages", Option::TYPE_UINT, Option::LEVEL_DEV)
.set_default(128)
- .set_description(""),
+ .add_service("mgr"),
- Option("mgr_connect_retry_interval", Option::TYPE_FLOAT, Option::LEVEL_ADVANCED)
+ Option("mgr_connect_retry_interval", Option::TYPE_FLOAT, Option::LEVEL_DEV)
.set_default(1.0)
- .set_description(""),
+ .add_service("common"),
Option("mgr_service_beacon_grace", Option::TYPE_FLOAT, Option::LEVEL_ADVANCED)
.set_default(60.0)
- .set_description(""),
+ .add_service("mgr")
+ .set_description("Period in seconds from last beacon to manager dropping "
+ "state about a monitored service (RGW, rbd-mirror etc)"),
- Option("mon_mgr_digest_period", Option::TYPE_INT, Option::LEVEL_ADVANCED)
+ Option("mon_mgr_digest_period", Option::TYPE_INT, Option::LEVEL_DEV)
.set_default(5)
- .set_description(""),
+ .add_service("mon")
+ .set_description("Period in seconds between monitor-to-manager "
+ "health/status updates"),
Option("mon_mgr_beacon_grace", Option::TYPE_INT, Option::LEVEL_ADVANCED)
.set_default(30)
- .set_description(""),
+ .add_service("mon")
+ .set_description("Period in seconds from last beacon to monitor marking "
+ "a manager daemon as failed"),
Option("mon_mgr_inactive_grace", Option::TYPE_INT, Option::LEVEL_ADVANCED)
.set_default(60)
- .set_description(""),
+ .add_service("mon")
+ .set_description("Period in seconds after cluster creation during which "
+ "cluster may have no active manager")
+ .set_long_description("This grace period enables the cluster to come "
+ "up cleanly without raising spurious health check "
+ "failures about managers that aren't online yet"),
Option("mon_mgr_mkfs_grace", Option::TYPE_INT, Option::LEVEL_ADVANCED)
.set_default(60)
- .set_description(""),
+ .add_service("mon")
+ .set_description("Period in seconds that the cluster may have no active "
+ "manager before this is reported as an ERR rather than "
+ "a WARN"),
Option("mutex_perf_counter", Option::TYPE_BOOL, Option::LEVEL_ADVANCED)
.set_default(false)
.set_default(1 * 1024 * 1024)
.set_description(""),
+ Option("rgw_max_attr_size", Option::TYPE_UINT, Option::LEVEL_ADVANCED)
+ .set_default(0)
+ .set_description("The maximum length of metadata value. 0 skips the check"),
+
+ Option("rgw_max_attr_name_len", Option::TYPE_UINT, Option::LEVEL_ADVANCED)
+ .set_default(0)
+ .set_description("The maximum length of metadata name. 0 skips the check"),
+
+ Option("rgw_max_attrs_num_in_req", Option::TYPE_UINT, Option::LEVEL_ADVANCED)
+ .set_default(0)
+ .set_description("The maximum number of metadata items that can be put via single request"),
+
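+ // A hedged example (values are illustrative only, not shipped defaults):
+ // an operator wanting Swift-style metadata limits might set
+ //   rgw_max_attr_name_len = 128
+ //   rgw_max_attr_size = 256
+ //   rgw_max_attrs_num_in_req = 90
+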
Option("rgw_override_bucket_index_max_shards", Option::TYPE_UINT, Option::LEVEL_ADVANCED)
.set_default(0)
.set_description(""),
Option("mds_client_writeable_range_max_inc_objs", Option::TYPE_UINT, Option::LEVEL_ADVANCED)
.set_default(1024)
.set_description(""),
+
+ Option("mds_min_caps_per_client", Option::TYPE_UINT, Option::LEVEL_ADVANCED)
+ .set_default(100)
+ .set_description("minimum number of capabilities a client may hold"),
+
+ Option("mds_max_ratio_caps_per_client", Option::TYPE_FLOAT, Option::LEVEL_ADVANCED)
+ .set_default(.8)
+ .set_description("maximum ratio of current caps that may be recalled during MDS cache pressure"),
});
}
path += ".";
path += data.name;
- by_path[path] = &data;
+ by_path[path] = {&data, l};
}
}
} else {
f->dump_string("nick", "");
}
- if (d->prio) {
- int p = std::max(std::min(d->prio + prio_adjust,
- (int)PerfCountersBuilder::PRIO_CRITICAL),
- 0);
- f->dump_int("priority", p);
- }
+ f->dump_int("priority", get_adjusted_priority(d->prio));
f->close_section();
} else {
if (d->type & PERFCOUNTER_LONGRUNAVG) {
assert(strlen(nick) <= 4);
}
data.nick = nick;
- data.prio = prio;
+ data.prio = prio ? prio : prio_default;
data.type = (enum perfcounter_type_d)ty;
data.histogram = std::move(histogram);
}
};
+/* Class for constructing a PerfCounters object.
+ *
+ * This class performs some validation that the parameters we have supplied are
+ * correct in create_perf_counters().
+ *
+ * In the future, we will probably get rid of the first/last arguments, since
+ * PerfCountersBuilder can deduce them itself.
+ */
+class PerfCountersBuilder
+{
+public:
+ PerfCountersBuilder(CephContext *cct, const std::string &name,
+ int first, int last);
+ ~PerfCountersBuilder();
+
+ // prio values: higher is better, and higher values get included in
+ // 'ceph daemonperf' (and similar) results.
+ // Use of priorities enables us to add large numbers of counters
+ // internally without necessarily overwhelming consumers.
+ enum {
+ PRIO_CRITICAL = 10,
+ // 'interesting' is the default threshold for `daemonperf` output
+ PRIO_INTERESTING = 8,
+ // `useful` is the default threshold for transmission to ceph-mgr
+ // and inclusion in prometheus/influxdb plugin output
+ PRIO_USEFUL = 5,
+ PRIO_UNINTERESTING = 2,
+ PRIO_DEBUGONLY = 0,
+ };
+ void add_u64(int key, const char *name,
+ const char *description=NULL, const char *nick = NULL,
+ int prio=0);
+ void add_u64_counter(int key, const char *name,
+ const char *description=NULL,
+ const char *nick = NULL,
+ int prio=0);
+ void add_u64_avg(int key, const char *name,
+ const char *description=NULL,
+ const char *nick = NULL,
+ int prio=0);
+ void add_time(int key, const char *name,
+ const char *description=NULL,
+ const char *nick = NULL,
+ int prio=0);
+ void add_time_avg(int key, const char *name,
+ const char *description=NULL,
+ const char *nick = NULL,
+ int prio=0);
+ void add_u64_counter_histogram(
+ int key, const char* name,
+ PerfHistogramCommon::axis_config_d x_axis_config,
+ PerfHistogramCommon::axis_config_d y_axis_config,
+ const char *description=NULL,
+ const char* nick = NULL,
+ int prio=0);
+
+ void set_prio_default(int prio_)
+ {
+ prio_default = prio_;
+ }
+
+ PerfCounters* create_perf_counters();
+private:
+ PerfCountersBuilder(const PerfCountersBuilder &rhs);
+ PerfCountersBuilder& operator=(const PerfCountersBuilder &rhs);
+ void add_impl(int idx, const char *name,
+ const char *description, const char *nick, int prio, int ty,
+ unique_ptr<PerfHistogram<>> histogram = nullptr);
+
+ PerfCounters *m_perf_counters;
+
+ int prio_default = 0;
+};
+
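+/* A minimal usage sketch (the "foo" logger and the l_foo_* keys are
+ * illustrative, not part of the tree): counters registered with prio=0
+ * inherit the builder's default priority set via set_prio_default().
+ *
+ *   enum { l_foo_first = 1000, l_foo_reads, l_foo_read_lat, l_foo_last };
+ *
+ *   PerfCounters *build_foo_counters(CephContext *cct) {
+ *     PerfCountersBuilder plb(cct, "foo", l_foo_first, l_foo_last);
+ *     plb.set_prio_default(PerfCountersBuilder::PRIO_USEFUL);
+ *     plb.add_u64_counter(l_foo_reads, "reads");  // takes PRIO_USEFUL
+ *     plb.add_time_avg(l_foo_read_lat, "read_lat", nullptr, nullptr,
+ *                      PerfCountersBuilder::PRIO_CRITICAL);
+ *     return plb.create_perf_counters();
+ *   }
+ */
+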
/*
* A PerfCounters object is usually associated with a single subsystem.
* It contains counters which we modify to track performance and throughput
const char *name;
const char *description;
const char *nick;
- int prio = 0;
+ uint8_t prio = 0;
enum perfcounter_type_d type;
std::atomic<uint64_t> u64 = { 0 };
std::atomic<uint64_t> avgcount = { 0 };
prio_adjust = p;
}
+ int get_adjusted_priority(int p) const {
+ return std::max(std::min(p + prio_adjust,
+ (int)PerfCountersBuilder::PRIO_CRITICAL),
+ 0);
+ }
+
private:
PerfCounters(CephContext *cct, const std::string &name,
int lower_bound, int upper_bound);
dump_formatted_generic(f, schema, true, logger, counter);
}
+ // A reference to a perf_counter_data_any_d, with an accompanying
+ // pointer to the enclosing PerfCounters, so that the consumer
+ // can see the prio_adjust
+ class PerfCounterRef
+ {
+ public:
+ PerfCounters::perf_counter_data_any_d *data;
+ PerfCounters *perf_counters;
+ };
typedef std::map<std::string,
- PerfCounters::perf_counter_data_any_d *> CounterMap;
+ PerfCounterRef> CounterMap;
void with_counters(std::function<void(const CounterMap &)>) const;
perf_counters_set_t m_loggers;
- std::map<std::string, PerfCounters::perf_counter_data_any_d *> by_path;
+ CounterMap by_path;
friend class PerfCountersCollectionTest;
};
-/* Class for constructing a PerfCounters object.
- *
- * This class performs some validation that the parameters we have supplied are
- * correct in create_perf_counters().
- *
- * In the future, we will probably get rid of the first/last arguments, since
- * PerfCountersBuilder can deduce them itself.
- */
-class PerfCountersBuilder
-{
-public:
- PerfCountersBuilder(CephContext *cct, const std::string &name,
- int first, int last);
- ~PerfCountersBuilder();
-
- // prio values: higher is better, and higher values get included in
- // 'ceph daemonperf' (and similar) results.
- enum {
- PRIO_CRITICAL = 10,
- PRIO_INTERESTING = 8,
- PRIO_USEFUL = 5,
- PRIO_UNINTERESTING = 2,
- PRIO_DEBUGONLY = 0,
- };
- void add_u64(int key, const char *name,
- const char *description=NULL, const char *nick = NULL,
- int prio=0);
- void add_u64_counter(int key, const char *name,
- const char *description=NULL,
- const char *nick = NULL,
- int prio=0);
- void add_u64_avg(int key, const char *name,
- const char *description=NULL,
- const char *nick = NULL,
- int prio=0);
- void add_time(int key, const char *name,
- const char *description=NULL,
- const char *nick = NULL,
- int prio=0);
- void add_time_avg(int key, const char *name,
- const char *description=NULL,
- const char *nick = NULL,
- int prio=0);
- void add_u64_counter_histogram(
- int key, const char* name,
- PerfHistogramCommon::axis_config_d x_axis_config,
- PerfHistogramCommon::axis_config_d y_axis_config,
- const char *description=NULL,
- const char* nick = NULL,
- int prio=0);
-
- PerfCounters* create_perf_counters();
-private:
- PerfCountersBuilder(const PerfCountersBuilder &rhs);
- PerfCountersBuilder& operator=(const PerfCountersBuilder &rhs);
- void add_impl(int idx, const char *name,
- const char *description, const char *nick, int prio, int ty,
- unique_ptr<PerfHistogram<>> histogram = nullptr);
- PerfCounters *m_perf_counters;
-};
#endif
#define dout_subsys ceph_subsys_
-static const struct sockaddr *find_ip_in_subnet_list(CephContext *cct,
- const struct ifaddrs *ifa,
- const std::string &networks)
+const struct sockaddr *find_ip_in_subnet_list(
+ CephContext *cct,
+ const struct ifaddrs *ifa,
+ const std::string &networks,
+ const std::string &interfaces)
{
std::list<string> nets;
get_str_list(networks, nets);
+ std::list<string> ifs;
+ get_str_list(interfaces, ifs);
+
+ // filter interfaces by name
+ const struct ifaddrs *filtered = 0;
+ if (ifs.empty()) {
+ filtered = ifa;
+ } else {
+ if (nets.empty()) {
+ lderr(cct) << "interface names specified but not network names" << dendl;
+ exit(1);
+ }
+ const struct ifaddrs *t = ifa;
+ struct ifaddrs *head = 0;
+ while (t != NULL) {
+ bool match = false;
+ for (auto& i : ifs) {
+ if (strcmp(i.c_str(), t->ifa_name) == 0) {
+ match = true;
+ break;
+ }
+ }
+ if (match) {
+ struct ifaddrs *n = new ifaddrs;
+ memcpy(n, t, sizeof(*t));
+ n->ifa_next = head;
+ head = n;
+ }
+ t = t->ifa_next;
+ }
+ if (head == NULL) {
+ lderr(cct) << "no interfaces matching " << ifs << dendl;
+ exit(1);
+ }
+ filtered = head;
+ }
- for(std::list<string>::iterator s = nets.begin(); s != nets.end(); ++s) {
- struct sockaddr_storage net;
- unsigned int prefix_len;
+ struct sockaddr *r = NULL;
+ for (std::list<string>::iterator s = nets.begin(); s != nets.end(); ++s) {
+ struct sockaddr_storage net;
+ unsigned int prefix_len;
- if (!parse_network(s->c_str(), &net, &prefix_len)) {
- lderr(cct) << "unable to parse network: " << *s << dendl;
- exit(1);
- }
+ if (!parse_network(s->c_str(), &net, &prefix_len)) {
+ lderr(cct) << "unable to parse network: " << *s << dendl;
+ exit(1);
+ }
+
+ const struct ifaddrs *found = find_ip_in_subnet(
+ filtered,
+ (struct sockaddr *) &net, prefix_len);
+ if (found) {
+ r = found->ifa_addr;
+ break;
+ }
+ }
- const struct ifaddrs *found = find_ip_in_subnet(ifa,
- (struct sockaddr *) &net, prefix_len);
- if (found)
- return found->ifa_addr;
+ if (filtered != ifa) {
+ while (filtered) {
+ struct ifaddrs *t = filtered->ifa_next;
+ delete filtered;
+ filtered = t;
}
+ }
- return NULL;
+ return r;
}
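
// For example (hypothetical values): with networks = "10.0.0.0/24" and
// interfaces = "eth0", an address 10.0.0.5 configured on eth1 is ignored,
// while the same subnet on eth0 is returned.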
// observe this change
static void fill_in_one_address(CephContext *cct,
const struct ifaddrs *ifa,
const string networks,
+ const string interfaces,
const char *conf_var)
{
- const struct sockaddr *found = find_ip_in_subnet_list(cct, ifa, networks);
+ const struct sockaddr *found = find_ip_in_subnet_list(cct, ifa, networks,
+ interfaces);
if (!found) {
- lderr(cct) << "unable to find any IP address in networks: " << networks << dendl;
+ lderr(cct) << "unable to find any IP address in networks '" << networks
+ << "' interfaces '" << interfaces << "'" << dendl;
exit(1);
}
exit(1);
}
-
if ((needs & CEPH_PICK_ADDRESS_PUBLIC)
&& cct->_conf->public_addr.is_blank_ip()
&& !cct->_conf->public_network.empty()) {
- fill_in_one_address(cct, ifa, cct->_conf->public_network, "public_addr");
+ fill_in_one_address(cct, ifa, cct->_conf->public_network,
+ cct->_conf->get_val<string>("public_network_interface"),
+ "public_addr");
}
if ((needs & CEPH_PICK_ADDRESS_CLUSTER)
&& cct->_conf->cluster_addr.is_blank_ip()) {
if (!cct->_conf->cluster_network.empty()) {
- fill_in_one_address(cct, ifa, cct->_conf->cluster_network, "cluster_addr");
+ fill_in_one_address(
+ cct, ifa, cct->_conf->cluster_network,
+ cct->_conf->get_val<string>("cluster_network_interface"),
+ "cluster_addr");
} else {
if (!cct->_conf->public_network.empty()) {
lderr(cct) << "Public network was set, but cluster network was not set " << dendl;
lderr(cct) << " Using public network also for cluster network" << dendl;
- fill_in_one_address(cct, ifa, cct->_conf->public_network, "cluster_addr");
+ fill_in_one_address(
+ cct, ifa, cct->_conf->public_network,
+ cct->_conf->get_val<string>("public_network_interface"),
+ "cluster_addr");
}
}
}
*/
bool have_local_addr(CephContext *cct, const list<entity_addr_t>& ls, entity_addr_t *match);
+
+const struct sockaddr *find_ip_in_subnet_list(
+ CephContext *cct,
+ const struct ifaddrs *ifa,
+ const std::string &networks,
+ const std::string &interfaces);
+
#endif
SUBSYS(auth, 1, 5)
SUBSYS(crypto, 1, 5)
SUBSYS(finisher, 1, 1)
+SUBSYS(reserver, 1, 1)
SUBSYS(heartbeatmap, 1, 5)
SUBSYS(perfcounter, 1, 5)
SUBSYS(rgw, 1, 5) // log level for the Rados gateway
explicit Dumper(const CrushWrapper *crush_,
const name_map_t& weight_set_names_)
: crush(crush_), weight_set_names(weight_set_names_) {
- crush->find_nonshadow_roots(roots);
+ crush->find_nonshadow_roots(&roots);
root = roots.begin();
}
explicit Dumper(const CrushWrapper *crush_,
bool show_shadow)
: crush(crush_), weight_set_names(weight_set_names_) {
if (show_shadow) {
- crush->find_roots(roots);
+ crush->find_roots(&roots);
} else {
- crush->find_nonshadow_roots(roots);
+ crush->find_nonshadow_roots(&roots);
}
root = roots.begin();
}
#define dout_subsys ceph_subsys_crush
-bool CrushWrapper::has_legacy_rulesets() const
+bool CrushWrapper::has_legacy_rule_ids() const
{
for (unsigned i=0; i<crush->max_rules; i++) {
crush_rule *r = crush->rules[i];
return false;
}
-int CrushWrapper::renumber_rules_by_ruleset()
+std::map<int, int> CrushWrapper::renumber_rules()
{
- int max_ruleset = 0;
+ std::map<int, int> result;
for (unsigned i=0; i<crush->max_rules; i++) {
crush_rule *r = crush->rules[i];
- if (r && r->mask.ruleset >= max_ruleset) {
- max_ruleset = r->mask.ruleset + 1;
+ if (r && r->mask.ruleset != i) {
+ result[r->mask.ruleset] = i;
+ r->mask.ruleset = i;
}
}
- struct crush_rule **newrules =
- (crush_rule**)calloc(1, max_ruleset * sizeof(crush_rule*));
- for (unsigned i=0; i<crush->max_rules; i++) {
- crush_rule *r = crush->rules[i];
- if (!r)
- continue;
- if (newrules[r->mask.ruleset]) {
- // collision, we can't do it.
- free(newrules);
- return -EINVAL;
- }
- newrules[r->mask.ruleset] = r;
- }
-
- // success, swap!
- free(crush->rules);
- crush->rules = newrules;
- crush->max_rules = max_ruleset;
- return 0;
-}
-
-bool CrushWrapper::has_multirule_rulesets() const
-{
- for (unsigned i=0; i<crush->max_rules; i++) {
- crush_rule *r = crush->rules[i];
- if (!r)
- continue;
- for (unsigned j=i+1; j<crush->max_rules; j++) {
- crush_rule *s = crush->rules[j];
- if (!s)
- continue;
- if (r->mask.ruleset == s->mask.ruleset)
- return true;
- }
- }
- return false;
+ return result;
}
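
// For example, a rule stored at array position 2 whose legacy mask.ruleset
// is 5 is renumbered so that mask.ruleset == 2, and the returned map
// contains {5: 2} so the caller can repoint pools that referenced the old
// ruleset ID.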
bool CrushWrapper::has_non_straw2_buckets() const
return 0;
}
-void CrushWrapper::find_takes(set<int>& roots) const
+void CrushWrapper::find_takes(set<int> *roots) const
{
for (unsigned i=0; i<crush->max_rules; i++) {
crush_rule *r = crush->rules[i];
continue;
for (unsigned j=0; j<r->len; j++) {
if (r->steps[j].op == CRUSH_RULE_TAKE)
- roots.insert(r->steps[j].arg1);
+ roots->insert(r->steps[j].arg1);
}
}
}
-void CrushWrapper::find_roots(set<int>& roots) const
+void CrushWrapper::find_roots(set<int> *roots) const
{
for (int i = 0; i < crush->max_buckets; i++) {
if (!crush->buckets[i])
continue;
crush_bucket *b = crush->buckets[i];
if (!_search_item_exists(b->id))
- roots.insert(b->id);
+ roots->insert(b->id);
}
}
// finish constructing the containing buckets.
map<int,map<int,vector<int>>> cmap_item_weight; // cargs -> bno -> weights
set<int> roots;
- find_nonshadow_roots(roots);
+ find_nonshadow_roots(&roots);
for (auto &r : roots) {
if (r >= 0)
continue;
int CrushWrapper::trim_roots_with_class()
{
set<int> roots;
- find_shadow_roots(roots);
+ find_shadow_roots(&roots);
for (auto &r : roots) {
if (r >= 0)
continue;
void CrushWrapper::reweight(CephContext *cct)
{
set<int> roots;
- find_roots(roots);
+ find_roots(&roots);
for (set<int>::iterator p = roots.begin(); p != roots.end(); ++p) {
if (*p >= 0)
continue;
rule_type, -1, err);
}
-int CrushWrapper::get_rule_weight_osd_map(unsigned ruleno, map<int,float> *pmap)
+float CrushWrapper::_get_take_weight_osd_map(int root,
+ map<int,float> *pmap) const
+{
+ float sum = 0.0;
+ list<int> q;
+ q.push_back(root);
+ // breadth-first iteration over the OSD tree
+ while (!q.empty()) {
+ int bno = q.front();
+ q.pop_front();
+ crush_bucket *b = crush->buckets[-1-bno];
+ assert(b);
+ for (unsigned j=0; j<b->size; ++j) {
+ int item_id = b->items[j];
+ if (item_id >= 0) { //it's an OSD
+ float w = crush_get_bucket_item_weight(b, j);
+ (*pmap)[item_id] = w;
+ sum += w;
+ } else { //not an OSD, expand the child later
+ q.push_back(item_id);
+ }
+ }
+ }
+ return sum;
+}
+
+void CrushWrapper::_normalize_weight_map(float sum,
+ const map<int,float>& m,
+ map<int,float> *pmap) const
+{
+ for (auto& p : m) {
+ map<int,float>::iterator q = pmap->find(p.first);
+ if (q == pmap->end()) {
+ (*pmap)[p.first] = p.second / sum;
+ } else {
+ q->second += p.second / sum;
+ }
+ }
+}
+
+int CrushWrapper::get_take_weight_osd_map(int root, map<int,float> *pmap) const
+{
+ map<int,float> m;
+ float sum = _get_take_weight_osd_map(root, &m);
+ _normalize_weight_map(sum, m, pmap);
+ return 0;
+}
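+
+// A worked example of the helpers above (hypothetical weights):
+// _get_take_weight_osd_map() yields m = {osd.0: 1.0, osd.1: 3.0} with
+// sum = 4.0, so _normalize_weight_map() stores pmap = {osd.0: 0.25,
+// osd.1: 0.75}, i.e. relative weights that sum to 1.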
+
+int CrushWrapper::get_rule_weight_osd_map(unsigned ruleno,
+ map<int,float> *pmap) const
{
if (ruleno >= crush->max_rules)
return -ENOENT;
m[n] = 1.0;
sum = 1.0;
} else {
- list<int> q;
- q.push_back(n);
- //breadth first iterate the OSD tree
- while (!q.empty()) {
- int bno = q.front();
- q.pop_front();
- crush_bucket *b = crush->buckets[-1-bno];
- assert(b);
- for (unsigned j=0; j<b->size; ++j) {
- int item_id = b->items[j];
- if (item_id >= 0) { //it's an OSD
- float w = crush_get_bucket_item_weight(b, j);
- m[item_id] = w;
- sum += w;
- } else { //not an OSD, expand the child later
- q.push_back(item_id);
- }
- }
- }
- }
- }
- for (map<int,float>::iterator p = m.begin(); p != m.end(); ++p) {
- map<int,float>::iterator q = pmap->find(p->first);
- if (q == pmap->end()) {
- (*pmap)[p->first] = p->second / sum;
- } else {
- q->second += p->second / sum;
+ sum += _get_take_weight_osd_map(n, &m);
}
}
+ _normalize_weight_map(sum, m, pmap);
}
return 0;
return 0;
}
+int CrushWrapper::bucket_set_alg(int bid, int alg)
+{
+ crush_bucket *b = get_bucket(bid);
+ if (!b) {
+ return -ENOENT;
+ }
+ b->alg = alg;
+ return 0;
+}
+
int CrushWrapper::update_device_class(int id,
const string& class_name,
const string& name,
return 0;
}
+// return rules that might reference the given osd
+int CrushWrapper::get_rules_by_osd(int osd, set<int> *rules)
+{
+ assert(rules);
+ rules->clear();
+ if (osd < 0) {
+ return -EINVAL;
+ }
+ for (unsigned i = 0; i < crush->max_rules; ++i) {
+ crush_rule *r = crush->rules[i];
+ if (!r)
+ continue;
+ for (unsigned j = 0; j < r->len; ++j) {
+ if (r->steps[j].op == CRUSH_RULE_TAKE) {
+ int step_item = r->steps[j].arg1;
+ list<int> unordered;
+ int rc = _get_leaves(step_item, &unordered);
+ if (rc < 0) {
+ return rc; // propagate fatal errors!
+ }
+ bool match = false;
+ for (auto &o: unordered) {
+ assert(o >= 0);
+ if (o == osd) {
+ match = true;
+ break;
+ }
+ }
+ if (match) {
+ rules->insert(i);
+ break;
+ }
+ }
+ }
+ }
+ return 0;
+}
+
bool CrushWrapper::_class_is_dead(int class_id)
{
for (auto &p: class_map) {
void dump(Formatter *f) {
set<int> roots;
- crush->find_roots(roots);
+ crush->find_roots(&roots);
for (set<int>::iterator root = roots.begin(); root != roots.end(); ++root) {
dump_item(Item(*root, 0, 0, crush->get_bucket_weightf(*root)), f);
}
set_tunables_default();
}
- /// true if any rule has a ruleset != the rule id
- bool has_legacy_rulesets() const;
-
- /// fix rules whose ruleid != ruleset
- int renumber_rules_by_ruleset();
+ /**
+ * true if any rule has a rule id != its position in the array
+ *
+ * These indicate "ruleset" IDs that were created by older versions
+ * of Ceph. They are cleaned up in renumber_rules so that eventually
+ * we can remove the code for handling them.
+ */
+ bool has_legacy_rule_ids() const;
- /// true if any ruleset has more than 1 rule
- bool has_multirule_rulesets() const;
+ /**
+ * fix rules whose rule id != ruleset
+ *
+ * These rules were created in older versions of Ceph. The concept
+ * of a ruleset no longer exists.
+ *
+ * Return a map of old ID -> new ID. Caller must update OSDMap
+ * to use new IDs.
+ */
+ std::map<int, int> renumber_rules();
/// true if any buckets that aren't straw2
bool has_non_straw2_buckets() const;
*
* Note that these may not be parentless roots.
*/
- void find_takes(set<int>& roots) const;
+ void find_takes(set<int> *roots) const;
/**
* find tree roots
*
* These are parentless nodes in the map.
*/
- void find_roots(set<int>& roots) const;
+ void find_roots(set<int> *roots) const;
/**
* find tree roots that contain shadow (device class) items only
*/
- void find_shadow_roots(set<int>& roots) const {
+ void find_shadow_roots(set<int> *roots) const {
set<int> all;
- find_roots(all);
+ find_roots(&all);
for (auto& p: all) {
if (is_shadow_item(p)) {
- roots.insert(p);
+ roots->insert(p);
}
}
}
* These are parentless nodes in the map that are not shadow
* items for device classes.
*/
- void find_nonshadow_roots(set<int>& roots) const {
+ void find_nonshadow_roots(set<int> *roots) const {
set<int> all;
- find_roots(all);
+ find_roots(&all);
for (auto& p: all) {
if (!is_shadow_item(p)) {
- roots.insert(p);
+ roots->insert(p);
}
}
}
return true;
return false;
}
+ bool rule_has_take(unsigned ruleno, int take) const {
+ if (!crush) return false;
+ crush_rule *rule = get_rule(ruleno);
+ if (IS_ERR(rule)) return false;
+ for (unsigned i = 0; i < rule->len; ++i) {
+ if (rule->steps[i].op == CRUSH_RULE_TAKE &&
+ rule->steps[i].arg1 == take) {
+ return true;
+ }
+ }
+ return false;
+ }
int get_rule_len(unsigned ruleno) const {
crush_rule *r = get_rule(ruleno);
if (IS_ERR(r)) return PTR_ERR(r);
return s->arg2;
}
+private:
+ float _get_take_weight_osd_map(int root, map<int,float> *pmap) const;
+ void _normalize_weight_map(float sum, const map<int,float>& m,
+ map<int,float> *pmap) const;
+
+public:
/**
* calculate a map of osds to weights for a given rule
*
* @param pmap [out] map of osd to weight
* @return 0 for success, or negative error code
*/
- int get_rule_weight_osd_map(unsigned ruleno, map<int,float> *pmap);
+ int get_rule_weight_osd_map(unsigned ruleno, map<int,float> *pmap) const;
+
+ /**
+ * calculate a map of osds to weights for a given starting root
+ *
+ * Generate a map of which OSDs get how much relative weight for a
+ * given starting root
+ *
+ * @param root the root node (bucket) to start from
+ * @param pmap [out] map of osd to weight
+ * @return 0 for success, or negative error code
+ */
+ int get_take_weight_osd_map(int root, map<int,float> *pmap) const;
/* modifiers */
void finalize() {
assert(crush);
crush_finalize(crush);
- have_uniform_rules = !has_legacy_rulesets();
+ have_uniform_rules = !has_legacy_rule_ids();
}
+ int bucket_set_alg(int id, int alg);
int update_device_class(int id, const string& class_name, const string& name, ostream *ss);
int remove_device_class(CephContext *cct, int id, ostream *ss);
int populate_classes(
const std::map<int32_t, map<int32_t, int32_t>>& old_class_bucket);
int get_rules_by_class(const string &class_name, set<int> *rules);
+ int get_rules_by_osd(int osd, set<int> *rules);
bool _class_is_dead(int class_id);
void cleanup_dead_classes();
int rebuild_roots_with_classes();
/**
* Return the lowest numbered ruleset of type `type`
*
- * @returns a ruleset ID, or -1 if no matching rulesets found.
+ * @returns a ruleset ID, or -1 if no matching rules are found.
*/
int find_first_ruleset(int type) const {
int result = -1;
*/
raw* copy(const char *c, unsigned len);
raw* create(unsigned len);
+ raw* create_in_mempool(unsigned len, int mempool);
raw* claim_char(unsigned len, char *buf);
raw* create_malloc(unsigned len);
raw* claim_malloc(unsigned len, char *buf);
raw* create_static(unsigned len, char *buf);
raw* create_aligned(unsigned len, unsigned align);
+ raw* create_aligned_in_mempool(unsigned len, unsigned align, int mempool);
raw* create_page_aligned(unsigned len);
raw* create_zero_copy(unsigned len, int fd, int64_t *offset);
raw* create_unshareable(unsigned len);
return have_raw() && (start() > 0 || end() < raw_length());
}
+ int get_mempool() const;
+ void reassign_to_mempool(int pool);
+ void try_assign_to_mempool(int pool);
+
// accessors
raw *get_raw() const { return _raw; }
const char *c_str() const;
unsigned _len;
unsigned _memcopy_count; //the total of memcopy using rebuild().
ptr append_buffer; // where i put small appends.
- int _mempool = -1;
public:
class iterator;
void advance(int o);
void seek(unsigned o);
+ using iterator_impl<false>::operator*;
char operator*();
iterator& operator++();
ptr get_current_ptr();
_memcopy_count = other._memcopy_count;
last_p = begin();
append_buffer.swap(other.append_buffer);
- _mempool = other._mempool;
other.clear();
return *this;
}
const ptr& front() const { return _buffers.front(); }
const ptr& back() const { return _buffers.back(); }
+ int get_mempool() const;
void reassign_to_mempool(int pool);
void try_assign_to_mempool(int pool);
#define LIBRGW_FILE_VER_MAJOR 1
#define LIBRGW_FILE_VER_MINOR 1
-#define LIBRGW_FILE_VER_EXTRA 4
+#define LIBRGW_FILE_VER_EXTRA 6
#define LIBRGW_FILE_VERSION(maj, min, extra) ((maj << 16) + (min << 8) + extra)
#define LIBRGW_FILE_VERSION_CODE LIBRGW_FILE_VERSION(LIBRGW_FILE_VER_MAJOR, LIBRGW_FILE_VER_MINOR, LIBRGW_FILE_VER_EXTRA)
const char *secret, struct rgw_fs **rgw_fs,
uint32_t flags);
+int rgw_mount2(librgw_t rgw, const char *uid, const char *key,
+ const char *secret, const char *root, struct rgw_fs **rgw_fs,
+ uint32_t flags);
+
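+/* A minimal sketch (credentials and bucket name are hypothetical): unlike
+ * rgw_mount(), rgw_mount2() attaches at an arbitrary root path; flags are
+ * passed as 0 here:
+ *
+ *   struct rgw_fs *fs = nullptr;
+ *   int r = rgw_mount2(rgw, "testuser", "<access-key>", "<secret-key>",
+ *                      "/mybucket", &fs, 0);
+ */
+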
/*
register invalidate callbacks
*/
rgw_readdir_cb rcb, void *cb_arg, bool *eof,
uint32_t flags);
+/* enumeration continuing from the given dirent name */
+int rgw_readdir2(struct rgw_fs *rgw_fs,
+ struct rgw_file_handle *parent_fh, const char *name,
+ rgw_readdir_cb rcb, void *cb_arg, bool *eof,
+ uint32_t flags);
+
/* project offset of dirent name */
#define RGW_DIRENT_OFFSET_FLAG_NONE 0x0000
# define MSG_MORE 0
#endif
+/*
+ * On BSD, SO_NOSIGPIPE can be set via setsockopt(2) to suppress SIGPIPE on a socket.
+ */
+#ifndef MSG_NOSIGNAL
+# define MSG_NOSIGNAL 0
+# ifdef SO_NOSIGPIPE
+# define CEPH_USE_SO_NOSIGPIPE
+# else
+# define CEPH_USE_SIGPIPE_BLOCKER
+# warning "Using SIGPIPE blocking instead of suppression; this is not well-tested upstream!"
+# endif
+#endif
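+
+/*
+ * A minimal sketch (illustrative only, not the tree's actual messenger
+ * code) of the SO_NOSIGPIPE path defined above: set the option once per
+ * socket so that later sends cannot raise SIGPIPE:
+ *
+ *   #ifdef CEPH_USE_SO_NOSIGPIPE
+ *   int val = 1;
+ *   if (::setsockopt(sd, SOL_SOCKET, SO_NOSIGPIPE, &val, sizeof(val)) < 0) {
+ *     // fall back to the SIGPIPE-blocking path
+ *   }
+ *   #endif
+ */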
+
#endif
assert(m_lock.is_locked());
assert(m_commit_position_ctx != nullptr);
if (m_commit_position_task_ctx == NULL) {
- m_commit_position_task_ctx = new C_CommitPositionTask(this);
- m_timer->add_event_after(m_settings.commit_interval,
- m_commit_position_task_ctx);
+ m_commit_position_task_ctx =
+ m_timer->add_event_after(m_settings.commit_interval,
+ new C_CommitPositionTask(this));
}
}
}
ldout(m_cct, 20) << __func__ << ": " << m_oid << " scheduling watch" << dendl;
- assert(m_watch_task == NULL);
- m_watch_task = new C_WatchTask(this);
- m_timer.add_event_after(m_watch_interval, m_watch_task);
+ assert(m_watch_task == nullptr);
+ m_watch_task = m_timer.add_event_after(
+ m_watch_interval,
+ new FunctionContext([this](int) {
+ handle_watch_task();
+ }));
}
bool ObjectPlayer::cancel_watch() {
on_finish->complete(r);
}
-void ObjectPlayer::C_WatchTask::finish(int r) {
- object_player->handle_watch_task();
-}
-
void ObjectPlayer::C_WatchFetch::finish(int r) {
object_player->handle_watch_fetched(r);
}
}
void finish(int r) override;
};
- struct C_WatchTask : public Context {
- ObjectPlayerPtr object_player;
- C_WatchTask(ObjectPlayer *o) : object_player(o) {
- }
- void finish(int r) override;
- };
struct C_WatchFetch : public Context {
ObjectPlayerPtr object_player;
C_WatchFetch(ObjectPlayer *o) : object_player(o) {
m_timer_lock(timer_lock), m_handler(handler), m_order(order),
m_soft_max_size(1 << m_order), m_flush_interval(flush_interval),
m_flush_bytes(flush_bytes), m_flush_age(flush_age), m_flush_handler(this),
- m_append_task(NULL), m_lock(lock), m_append_tid(0), m_pending_bytes(0),
+ m_lock(lock), m_append_tid(0), m_pending_bytes(0),
m_size(0), m_overflowed(false), m_object_closed(false),
m_in_flight_flushes(false), m_aio_scheduled(false) {
m_ioctx.dup(ioctx);
void ObjectRecorder::schedule_append_task() {
Mutex::Locker locker(m_timer_lock);
- if (m_append_task == NULL && m_flush_age > 0) {
- m_append_task = new C_AppendTask(this);
- m_timer.add_event_after(m_flush_age, m_append_task);
+ if (m_append_task == nullptr && m_flush_age > 0) {
+ m_append_task = m_timer.add_event_after(
+ m_flush_age, new FunctionContext([this](int) {
+ handle_append_task();
+ }));
}
}
object_recorder->flush(future);
}
};
- struct C_AppendTask : public Context {
- ObjectRecorder *object_recorder;
- C_AppendTask(ObjectRecorder *o) : object_recorder(o) {
- }
- void finish(int r) override {
- object_recorder->handle_append_task();
- }
- };
struct C_AppendFlush : public Context {
ObjectRecorder *object_recorder;
uint64_t tid;
FlushHandler m_flush_handler;
- C_AppendTask *m_append_task;
+ Context *m_append_task = nullptr;
mutable std::shared_ptr<Mutex> m_lock;
AppendBuffers m_append_buffers;
#include <boost/scoped_ptr.hpp>
#include "include/encoding.h"
#include "common/Formatter.h"
+#include "common/perf_counters.h"
using std::string;
/**
virtual void get_statistics(Formatter *f) {
return;
}
+
+ /**
+ * Return your perf counters if you have any. Subclasses are not
+ * required to implement this, and callers must respect a null return
+ * value.
+ */
+ virtual PerfCounters *get_perf_counters() {
+ return nullptr;
+ }
protected:
/// List of matching prefixes and merge operators
std::vector<std::pair<std::string,
void close() override;
+ PerfCounters *get_perf_counters() override
+ {
+ return logger;
+ }
+
class LevelDBTransactionImpl : public KeyValueDB::TransactionImpl {
public:
leveldb::WriteBatch bat;
void split_stats(const std::string &s, char delim, std::vector<std::string> &elems);
void get_statistics(Formatter *f) override;
+ PerfCounters *get_perf_counters() override
+ {
+ return logger;
+ }
+
struct RocksWBHandler: public rocksdb::WriteBatch::Handler {
std::string seen ;
int num_seen = 0;
}
template <typename I>
-bool ObjectMap<I>::update_required(uint64_t object_no, uint8_t new_state) {
+bool ObjectMap<I>::update_required(const ceph::BitVector<2>::Iterator& it,
+ uint8_t new_state) {
assert(m_image_ctx.object_map_lock.is_wlocked());
- uint8_t state = (*this)[object_no];
-
+ uint8_t state = *it;
if ((state == new_state) ||
(new_state == OBJECT_PENDING && state == OBJECT_NONEXISTENT) ||
(new_state == OBJECT_NONEXISTENT && state != OBJECT_PENDING)) {
BlockGuardCell *cell;
int r = m_update_guard->detain({op.start_object_no, op.end_object_no},
- &op, &cell);
+ &op, &cell);
if (r < 0) {
lderr(cct) << "failed to detain object map update: " << cpp_strerror(r)
<< dendl;
return;
}
- uint64_t object_no;
- for (object_no = start_object_no; object_no < end_object_no; ++object_no) {
- if (update_required(object_no, new_state)) {
+ auto it = m_object_map.begin() + start_object_no;
+ auto end_it = m_object_map.begin() + end_object_no;
+ for (; it != end_it; ++it) {
+ if (update_required(it, new_state)) {
break;
}
}
- if (object_no == end_object_no) {
+ if (it == end_it) {
ldout(cct, 20) << "object map update not required" << dendl;
m_image_ctx.op_work_queue->queue(on_finish, 0);
return;
const ZTracer::Trace &parent_trace, T *callback_object) {
assert(start_object_no < end_object_no);
if (snap_id == CEPH_NOSNAP) {
- uint64_t object_no;
- for (object_no = start_object_no; object_no < end_object_no;
- ++object_no) {
- if (update_required(object_no, new_state)) {
+ auto it = m_object_map.begin() + start_object_no;
+ auto end_it = m_object_map.begin() + end_object_no;
+ for (; it != end_it; ++it) {
+ if (update_required(it, new_state)) {
break;
}
}
- if (object_no == end_object_no) {
+ if (it == end_it) {
return false;
}
uint64_t end_object_no, uint8_t new_state,
const boost::optional<uint8_t> ¤t_state,
const ZTracer::Trace &parent_trace, Context *on_finish);
- bool update_required(uint64_t object_no, uint8_t new_state);
+ bool update_required(const ceph::BitVector<2>::Iterator &it,
+ uint8_t new_state);
};
std::map<std::string, std::string> mirror_images;
r = cls_client::mirror_image_list(&io_ctx, last_read, max_read,
&mirror_images);
- if (r < 0) {
+ if (r < 0 && r != -ENOENT) {
lderr(cct) << "error listing mirrored image directory: "
<< cpp_strerror(r) << dendl;
return r;
for (auto it = images_.begin(); it != images_.end(); ++it) {
auto &image_id = it->first;
auto &info = it->second;
+ if (info.state == cls::rbd::MIRROR_IMAGE_STATE_DISABLED) {
+ continue;
+ }
+
auto &image_name = id_to_name[image_id];
if (image_name.empty()) {
lderr(cct) << "failed to find image name for image " << image_id << ", "
std::map<cls::rbd::MirrorImageStatusState, int> states_;
int r = cls_client::mirror_image_status_get_summary(&io_ctx, &states_);
- if (r < 0) {
+ if (r < 0 && r != -ENOENT) {
lderr(cct) << "failed to get mirror status summary: "
<< cpp_strerror(r) << dendl;
return r;
object_off, snapc, parent_trace, completion);
}
+template <typename I>
+ObjectRequest<I>*
+ObjectRequest<I>::create_trim(I *ictx, const std::string &oid,
+ uint64_t object_no, const ::SnapContext &snapc,
+ bool post_object_map_update,
+ Context *completion) {
+ return new ObjectTrimRequest(util::get_image_ctx(ictx), oid, object_no,
+ snapc, post_object_map_update, completion);
+}
+
template <typename I>
ObjectRequest<I>*
ObjectRequest<I>::create_write(I *ictx, const std::string &oid,
const ::SnapContext &snapc,
const ZTracer::Trace &parent_trace,
Context *completion);
+ static ObjectRequest* create_trim(ImageCtxT *ictx, const std::string &oid,
+ uint64_t object_no,
+ const ::SnapContext &snapc,
+ bool post_object_map_update,
+ Context *completion);
static ObjectRequest* create_write(ImageCtxT *ictx, const std::string &oid,
uint64_t object_no,
uint64_t object_off,
#include "common/dout.h"
#include "librbd/ImageCtx.h"
#include "librbd/ObjectMap.h"
+#include "librbd/Utils.h"
#include "cls/lock/cls_lock_client.h"
#include <string>
#define dout_subsys ceph_subsys_rbd
#undef dout_prefix
-#define dout_prefix *_dout << "librbd::object_map::UpdateRequest: "
+#define dout_prefix *_dout << "librbd::object_map::UpdateRequest: " << this \
+ << " " << __func__ << ": "
namespace librbd {
namespace object_map {
+namespace {
+
+// keep aligned to bit_vector 4K block sizes
+const uint64_t MAX_OBJECTS_PER_UPDATE = 256 * (1 << 10);
+
+}
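+
+// A worked check of the constant above (assuming 2 bits of object-map state
+// per object): 256K objects * 2 bits = 64 KiB per batch, an exact multiple
+// of the 4 KiB bit_vector block size.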
+
template <typename I>
void UpdateRequest<I>::send() {
+ update_object_map();
+}
+
+template <typename I>
+void UpdateRequest<I>::update_object_map() {
assert(m_image_ctx.snap_lock.is_locked());
assert(m_image_ctx.object_map_lock.is_locked());
CephContext *cct = m_image_ctx.cct;
- // safe to update in-memory state first without handling rollback since any
- // failures will invalidate the object map
+ // break very large requests into manageable batches
+ m_update_end_object_no = MIN(
+ m_end_object_no, m_update_start_object_no + MAX_OBJECTS_PER_UPDATE);
+
std::string oid(ObjectMap<>::object_map_name(m_image_ctx.id, m_snap_id));
- ldout(cct, 20) << this << " updating object map"
- << ": ictx=" << &m_image_ctx << ", oid=" << oid << ", ["
- << m_start_object_no << "," << m_end_object_no << ") = "
+ ldout(cct, 20) << "ictx=" << &m_image_ctx << ", oid=" << oid << ", "
+ << "[" << m_update_start_object_no << ","
+ << m_update_end_object_no << ") = "
<< (m_current_state ?
stringify(static_cast<uint32_t>(*m_current_state)) : "")
<< "->" << static_cast<uint32_t>(m_new_state)
if (m_snap_id == CEPH_NOSNAP) {
rados::cls::lock::assert_locked(&op, RBD_LOCK_NAME, LOCK_EXCLUSIVE, "", "");
}
- cls_client::object_map_update(&op, m_start_object_no, m_end_object_no,
- m_new_state, m_current_state);
+ cls_client::object_map_update(&op, m_update_start_object_no,
+ m_update_end_object_no, m_new_state,
+ m_current_state);
- librados::AioCompletion *rados_completion = create_callback_completion();
+ auto rados_completion = librbd::util::create_rados_callback<
+ UpdateRequest<I>, &UpdateRequest<I>::handle_update_object_map>(this);
std::vector<librados::snap_t> snaps;
int r = m_image_ctx.md_ctx.aio_operate(
oid, rados_completion, &op, 0, snaps,
}
template <typename I>
-void UpdateRequest<I>::finish_request() {
- RWLock::RLocker snap_locker(m_image_ctx.snap_lock);
- RWLock::WLocker object_map_locker(m_image_ctx.object_map_lock);
- ldout(m_image_ctx.cct, 20) << this << " on-disk object map updated"
- << dendl;
+void UpdateRequest<I>::handle_update_object_map(int r) {
+ ldout(m_image_ctx.cct, 20) << "r=" << r << dendl;
+
+ {
+ RWLock::RLocker snap_locker(m_image_ctx.snap_lock);
+ RWLock::WLocker object_map_locker(m_image_ctx.object_map_lock);
+ update_in_memory_object_map();
+
+ if (m_update_end_object_no < m_end_object_no) {
+ m_update_start_object_no = m_update_end_object_no;
+ update_object_map();
+ return;
+ }
+ }
+
+ // no more batch updates to send
+ complete(r);
+}
+
+template <typename I>
+void UpdateRequest<I>::update_in_memory_object_map() {
+ assert(m_image_ctx.snap_lock.is_locked());
+ assert(m_image_ctx.object_map_lock.is_locked());
// rebuilding the object map might update on-disk only
if (m_snap_id == m_image_ctx.snap_id) {
- for (uint64_t object_no = m_start_object_no;
- object_no < MIN(m_end_object_no, m_object_map.size());
- ++object_no) {
- uint8_t state = m_object_map[object_no];
+ ldout(m_image_ctx.cct, 20) << dendl;
+
+ auto it = m_object_map.begin() +
+ MIN(m_update_start_object_no, m_object_map.size());
+ auto end_it = m_object_map.begin() +
+ MIN(m_update_end_object_no, m_object_map.size());
+ for (; it != end_it; ++it) {
+ auto state_ref = *it;
+ uint8_t state = state_ref;
if (!m_current_state || state == *m_current_state ||
(*m_current_state == OBJECT_EXISTS && state == OBJECT_EXISTS_CLEAN)) {
- m_object_map[object_no] = m_new_state;
+ state_ref = m_new_state;
}
}
}
}
+template <typename I>
+void UpdateRequest<I>::finish_request() {
+}
+
} // namespace object_map
} // namespace librbd
const ZTracer::Trace &parent_trace, Context *on_finish)
: Request(image_ctx, snap_id, on_finish), m_object_map(*object_map),
m_start_object_no(start_object_no), m_end_object_no(end_object_no),
- m_new_state(new_state), m_current_state(current_state),
+ m_update_start_object_no(start_object_no), m_new_state(new_state),
+ m_current_state(current_state),
m_trace(util::create_trace(image_ctx, "update object map", parent_trace))
{
m_trace.event("start");
void finish_request() override;
private:
+ /**
+ * @verbatim
+ *
+ * <start>
+ * |
+ * |/------------------\
+ * v | (repeat in batches)
+ * UPDATE_OBJECT_MAP -----/
+ * |
+ * v
+ * <finish>
+ *
+ * @endverbatim
+ */
+
ceph::BitVector<2> &m_object_map;
uint64_t m_start_object_no;
uint64_t m_end_object_no;
+ uint64_t m_update_start_object_no;
+ uint64_t m_update_end_object_no = 0;
uint8_t m_new_state;
boost::optional<uint8_t> m_current_state;
ZTracer::Trace m_trace;
+
+ void update_object_map();
+ void handle_update_object_map(int r);
+
+ void update_in_memory_object_map();
+
};
} // namespace object_map
librados::AioCompletion *rados_completion = create_rados_callback<
SnapshotCreateRequest<I>,
&SnapshotCreateRequest<I>::handle_allocate_snap_id>(this);
- image_ctx.md_ctx.aio_selfmanaged_snap_create(&m_snap_id, rados_completion);
+ image_ctx.data_ctx.aio_selfmanaged_snap_create(&m_snap_id, rados_completion);
rados_completion->release();
}
librados::AioCompletion *rados_completion = create_rados_callback<
SnapshotCreateRequest<I>,
&SnapshotCreateRequest<I>::handle_release_snap_id>(this);
- image_ctx.md_ctx.aio_selfmanaged_snap_remove(m_snap_id, rados_completion);
+ image_ctx.data_ctx.aio_selfmanaged_snap_remove(m_snap_id, rados_completion);
rados_completion->release();
}
librados::AioCompletion *rados_completion =
this->create_callback_completion();
- image_ctx.md_ctx.aio_selfmanaged_snap_remove(m_snap_id, rados_completion);
+ image_ctx.data_ctx.aio_selfmanaged_snap_remove(m_snap_id, rados_completion);
rados_completion->release();
}
string oid = image_ctx.get_object_name(m_object_no);
ldout(image_ctx.cct, 10) << "removing (with copyup) " << oid << dendl;
- auto req = new io::ObjectTrimRequest(&image_ctx, oid, m_object_no,
- m_snapc, false, this);
+ auto req = io::ObjectRequest<I>::create_trim(&image_ctx, oid, m_object_no,
+ m_snapc, false, this);
req->send();
return 0;
}
template <typename I>
class C_RemoveObject : public C_AsyncObjectThrottle<I> {
public:
- C_RemoveObject(AsyncObjectThrottle<I> &throttle, ImageCtx *image_ctx,
+ C_RemoveObject(AsyncObjectThrottle<I> &throttle, I *image_ctx,
uint64_t object_no)
: C_AsyncObjectThrottle<I>(throttle, *image_ctx), m_object_no(object_no)
{
m_delete_off = MIN(new_num_periods * period, original_size);
// first object we can delete free and clear
m_delete_start = new_num_periods * image_ctx.get_stripe_count();
+ m_delete_start_min = m_delete_start;
m_num_objects = Striper::get_num_objects(image_ctx.layout, original_size);
CephContext *cct = image_ctx.cct;
RWLock::RLocker owner_lock(image_ctx.owner_lock);
switch (m_state) {
- case STATE_PRE_COPYUP:
- ldout(cct, 5) << " PRE_COPYUP" << dendl;
+ case STATE_PRE_TRIM:
+ ldout(cct, 5) << " PRE_TRIM" << dendl;
send_copyup_objects();
break;
case STATE_COPYUP_OBJECTS:
ldout(cct, 5) << " COPYUP_OBJECTS" << dendl;
- send_post_copyup();
- break;
-
- case STATE_POST_COPYUP:
- ldout(cct, 5) << " POST_COPYUP" << dendl;
- send_pre_remove();
- break;
-
- case STATE_PRE_REMOVE:
- ldout(cct, 5) << " PRE_REMOVE" << dendl;
send_remove_objects();
break;
case STATE_REMOVE_OBJECTS:
ldout(cct, 5) << " REMOVE_OBJECTS" << dendl;
- send_post_remove();
+ send_post_trim();
break;
- case STATE_POST_REMOVE:
- ldout(cct, 5) << " POST_OBJECTS" << dendl;
+ case STATE_POST_TRIM:
+ ldout(cct, 5) << " POST_TRIM" << dendl;
send_clean_boundary();
break;
template <typename I>
void TrimRequest<I>::send() {
- send_pre_copyup();
+ send_pre_trim();
}
template<typename I>
-void TrimRequest<I>::send_copyup_objects() {
+void TrimRequest<I>::send_pre_trim() {
I &image_ctx = this->m_image_ctx;
assert(image_ctx.owner_lock.is_locked());
- ldout(image_ctx.cct, 5) << this << " send_copyup_objects: "
- << " start object=" << m_copyup_start << ", "
- << " end object=" << m_copyup_end << dendl;
- m_state = STATE_COPYUP_OBJECTS;
+ if (m_delete_start >= m_num_objects) {
+ send_clean_boundary();
+ return;
+ }
- ::SnapContext snapc;
{
RWLock::RLocker snap_locker(image_ctx.snap_lock);
- RWLock::RLocker parent_locker(image_ctx.parent_lock);
- snapc = image_ctx.snapc;
- }
-
- Context *ctx = this->create_callback_context();
- typename AsyncObjectThrottle<I>::ContextFactory context_factory(
- boost::lambda::bind(boost::lambda::new_ptr<C_CopyupObject<I> >(),
- boost::lambda::_1, &image_ctx, snapc, boost::lambda::_2));
- AsyncObjectThrottle<I> *throttle = new AsyncObjectThrottle<I>(
- this, image_ctx, context_factory, ctx, &m_prog_ctx, m_copyup_start,
- m_copyup_end);
- throttle->start_ops(image_ctx.concurrent_management_ops);
-}
+ if (image_ctx.object_map != nullptr) {
+ ldout(image_ctx.cct, 5) << this << " send_pre_trim: "
+ << " delete_start_min=" << m_delete_start_min
+ << " num_objects=" << m_num_objects << dendl;
+ m_state = STATE_PRE_TRIM;
-template <typename I>
-void TrimRequest<I>::send_remove_objects() {
- I &image_ctx = this->m_image_ctx;
- assert(image_ctx.owner_lock.is_locked());
+ assert(image_ctx.exclusive_lock->is_lock_owner());
- ldout(image_ctx.cct, 5) << this << " send_remove_objects: "
- << " delete_start=" << m_delete_start
- << " num_objects=" << m_num_objects << dendl;
- m_state = STATE_REMOVE_OBJECTS;
+ RWLock::WLocker object_map_locker(image_ctx.object_map_lock);
+ if (image_ctx.object_map->template aio_update<AsyncRequest<I> >(
+ CEPH_NOSNAP, m_delete_start_min, m_num_objects, OBJECT_PENDING,
+ OBJECT_EXISTS, {}, this)) {
+ return;
+ }
+ }
+ }
- Context *ctx = this->create_callback_context();
- typename AsyncObjectThrottle<I>::ContextFactory context_factory(
- boost::lambda::bind(boost::lambda::new_ptr<C_RemoveObject<I> >(),
- boost::lambda::_1, &image_ctx, boost::lambda::_2));
- AsyncObjectThrottle<I> *throttle = new AsyncObjectThrottle<I>(
- this, image_ctx, context_factory, ctx, &m_prog_ctx, m_delete_start,
- m_num_objects);
- throttle->start_ops(image_ctx.concurrent_management_ops);
+ send_copyup_objects();
}
template<typename I>
-void TrimRequest<I>::send_pre_copyup() {
+void TrimRequest<I>::send_copyup_objects() {
I &image_ctx = this->m_image_ctx;
assert(image_ctx.owner_lock.is_locked());
- if (m_delete_start >= m_num_objects) {
- send_clean_boundary();
- return;
- }
-
+ ::SnapContext snapc;
bool has_snapshots;
uint64_t parent_overlap;
{
RWLock::RLocker snap_locker(image_ctx.snap_lock);
RWLock::RLocker parent_locker(image_ctx.parent_lock);
+ snapc = image_ctx.snapc;
has_snapshots = !image_ctx.snaps.empty();
int r = image_ctx.get_parent_overlap(CEPH_NOSNAP, &parent_overlap);
assert(r == 0);
}
// copyup is only required for portion of image that overlaps parent
- m_copyup_end = Striper::get_num_objects(image_ctx.layout, parent_overlap);
+ uint64_t copyup_end = Striper::get_num_objects(image_ctx.layout,
+ parent_overlap);
// TODO: protect against concurrent shrink and snap create?
// skip to remove if no copyup is required.
- if (m_copyup_end <= m_delete_start || !has_snapshots) {
- send_pre_remove();
+ if (copyup_end <= m_delete_start || !has_snapshots) {
+ send_remove_objects();
return;
}
- m_copyup_start = m_delete_start;
- m_delete_start = m_copyup_end;
-
- {
- RWLock::RLocker snap_locker(image_ctx.snap_lock);
- if (image_ctx.object_map != nullptr) {
- ldout(image_ctx.cct, 5) << this << " send_pre_copyup: "
- << " copyup_start=" << m_copyup_start
- << " copyup_end=" << m_copyup_end << dendl;
- m_state = STATE_PRE_COPYUP;
-
- assert(image_ctx.exclusive_lock->is_lock_owner());
+ uint64_t copyup_start = m_delete_start;
+ m_delete_start = copyup_end;
- RWLock::WLocker object_map_locker(image_ctx.object_map_lock);
- if (image_ctx.object_map->template aio_update<AsyncRequest<I> >(
- CEPH_NOSNAP, m_copyup_start, m_copyup_end, OBJECT_PENDING,
- OBJECT_EXISTS, {}, this)) {
- return;
- }
- }
- }
+ ldout(image_ctx.cct, 5) << this << " send_copyup_objects: "
+ << " start object=" << copyup_start << ", "
+ << " end object=" << copyup_end << dendl;
+ m_state = STATE_COPYUP_OBJECTS;
- send_copyup_objects();
+ Context *ctx = this->create_callback_context();
+ typename AsyncObjectThrottle<I>::ContextFactory context_factory(
+ boost::lambda::bind(boost::lambda::new_ptr<C_CopyupObject<I> >(),
+ boost::lambda::_1, &image_ctx, snapc, boost::lambda::_2));
+ AsyncObjectThrottle<I> *throttle = new AsyncObjectThrottle<I>(
+ this, image_ctx, context_factory, ctx, &m_prog_ctx, copyup_start,
+ copyup_end);
+ throttle->start_ops(image_ctx.concurrent_management_ops);
}
template <typename I>
-void TrimRequest<I>::send_pre_remove() {
+void TrimRequest<I>::send_remove_objects() {
I &image_ctx = this->m_image_ctx;
assert(image_ctx.owner_lock.is_locked());
- if (m_delete_start >= m_num_objects) {
- send_clean_boundary();
- return;
- }
-
- {
- RWLock::RLocker snap_locker(image_ctx.snap_lock);
- if (image_ctx.object_map != nullptr) {
- ldout(image_ctx.cct, 5) << this << " send_pre_remove: "
- << " delete_start=" << m_delete_start
- << " num_objects=" << m_num_objects << dendl;
- m_state = STATE_PRE_REMOVE;
-
- assert(image_ctx.exclusive_lock->is_lock_owner());
- // flag the objects as pending deletion
- RWLock::WLocker object_map_locker(image_ctx.object_map_lock);
- if (image_ctx.object_map->template aio_update<AsyncRequest<I> >(
- CEPH_NOSNAP, m_delete_start, m_num_objects, OBJECT_PENDING,
- OBJECT_EXISTS, {}, this)) {
- return;
- }
- }
- }
+ ldout(image_ctx.cct, 5) << this << " send_remove_objects: "
+ << " delete_start=" << m_delete_start
+ << " num_objects=" << m_num_objects << dendl;
+ m_state = STATE_REMOVE_OBJECTS;
- // no object map update required
- send_remove_objects();
+ Context *ctx = this->create_callback_context();
+ typename AsyncObjectThrottle<I>::ContextFactory context_factory(
+ boost::lambda::bind(boost::lambda::new_ptr<C_RemoveObject<I> >(),
+ boost::lambda::_1, &image_ctx, boost::lambda::_2));
+ AsyncObjectThrottle<I> *throttle = new AsyncObjectThrottle<I>(
+ this, image_ctx, context_factory, ctx, &m_prog_ctx, m_delete_start,
+ m_num_objects);
+ throttle->start_ops(image_ctx.concurrent_management_ops);
}
template<typename I>
-void TrimRequest<I>::send_post_copyup() {
- I &image_ctx = this->m_image_ctx;
- assert(image_ctx.owner_lock.is_locked());
-
- {
- RWLock::RLocker snap_locker(image_ctx.snap_lock);
- if (image_ctx.object_map != nullptr) {
- ldout(image_ctx.cct, 5) << this << " send_post_copyup:"
- << " copyup_start=" << m_copyup_start
- << " copyup_end=" << m_copyup_end << dendl;
- m_state = STATE_POST_COPYUP;
-
- assert(image_ctx.exclusive_lock->is_lock_owner());
-
- RWLock::WLocker object_map_locker(image_ctx.object_map_lock);
- if (image_ctx.object_map->template aio_update<AsyncRequest<I> >(
- CEPH_NOSNAP, m_copyup_start, m_copyup_end, OBJECT_NONEXISTENT,
- OBJECT_PENDING, {}, this)) {
- return;
- }
- }
- }
-
- send_pre_remove();
-}
-
-template <typename I>
-void TrimRequest<I>::send_post_remove() {
+void TrimRequest<I>::send_post_trim() {
I &image_ctx = this->m_image_ctx;
assert(image_ctx.owner_lock.is_locked());
{
RWLock::RLocker snap_locker(image_ctx.snap_lock);
if (image_ctx.object_map != nullptr) {
- ldout(image_ctx.cct, 5) << this << " send_post_remove: "
- << " delete_start=" << m_delete_start
- << " num_objects=" << m_num_objects << dendl;
- m_state = STATE_POST_REMOVE;
+ ldout(image_ctx.cct, 5) << this << " send_post_trim:"
+ << " delete_start_min=" << m_delete_start_min
+ << " num_objects=" << m_num_objects << dendl;
+ m_state = STATE_POST_TRIM;
assert(image_ctx.exclusive_lock->is_lock_owner());
- // flag the pending objects as removed
RWLock::WLocker object_map_locker(image_ctx.object_map_lock);
if (image_ctx.object_map->template aio_update<AsyncRequest<I> >(
- CEPH_NOSNAP, m_delete_start, m_num_objects, OBJECT_NONEXISTENT,
+ CEPH_NOSNAP, m_delete_start_min, m_num_objects, OBJECT_NONEXISTENT,
OBJECT_PENDING, {}, this)) {
return;
}
}
}
- // no object map update required
send_clean_boundary();
}
ldout(cct, 20) << " ex " << *p << dendl;
Context *req_comp = new C_ContextCompletion(*completion);
- io::ObjectRequest<> *req;
+ io::ObjectRequest<I> *req;
if (p->offset == 0) {
- req = new io::ObjectTrimRequest(&image_ctx, p->oid.name, p->objectno,
- snapc, true, req_comp);
+ req = io::ObjectRequest<I>::create_trim(&image_ctx, p->oid.name,
+ p->objectno, snapc, true,
+ req_comp);
} else {
- req = new io::ObjectTruncateRequest(&image_ctx, p->oid.name, p->objectno,
- p->offset, snapc, {}, req_comp);
+ req = io::ObjectRequest<I>::create_truncate(&image_ctx, p->oid.name,
+ p->objectno, p->offset, snapc,
+ {}, req_comp);
}
req->send();
}
prog_ctx);
}
+ TrimRequest(ImageCtxT &image_ctx, Context *on_finish,
+ uint64_t original_size, uint64_t new_size,
+ ProgressContext &prog_ctx);
+
void send() override;
protected:
*
* @verbatim
*
- * <start> . . . . > STATE_FINISHED . . . . . . . . .
- * | . . . . . . . . . . > . . . . . . . . . .
- * | / . .
- * STATE_PRE_COPYUP ---> STATE_COPYUP_OBJECTS . .
- * | . .
- * /-----------------------/ v .
- * | . .
- * v . .
- * STATE_POST_COPYUP. . . > . . .
- * | . . . . . . . . . . < . . . . . . . . . .
- * | | . .
- * v v v .
- * STATE_PRE_REMOVE ---> STATE_REMOVE_OBJECTS .
- * | . . .
- * /-----------------------/ . . . . . . . .
- * | . . .
- * v v v v
- * STATE_POST_REMOVE --> STATE_CLEAN_BOUNDARY ---> <finish>
- * . ^
- * . .
- * . . . . . . . . . . . . . . . . . . . . . . .
- *
- * @endverbatim
+ * <start> . . . . . . . . . . . . . . . . .
+ * | .
+ * v (skip if not needed) .
+ * STATE_PRE_TRIM .
+ * | .
+ * v (skip if not needed) .
+ * STATE_COPYUP_OBJECTS .
+ * | .
+ * v (skip if not needed) .
+ * STATE_REMOVE_OBJECTS .
+ * | .
+ * v (skip if not needed) .
+ * STATE_POST_TRIM .
+ * | .
+ * v (skip if not needed) .
+ * STATE_CLEAN_BOUNDARY .
+ * | .
+ * v .
+ * STATE_FINISHED < . . . . . . . . . . . . . . .
+ * |
+ * v
+ * <finish>
*
* The _COPYUP_OBJECTS state is skipped if there is no parent overlap
* within the new image size and the image does not have any snapshots.
- * The _PRE_REMOVE/_POST_REMOVE states are skipped if the object map
+ * The _PRE_TRIM/_POST_TRIM states are skipped if the object map
* isn't enabled. The _REMOVE_OBJECTS state is skipped if no whole objects
* are removed. The _CLEAN_BOUNDARY state is skipped if no boundary
* objects are cleaned. The state machine will immediately transition
* to _FINISHED state if there are no bytes to trim.
- */
+ */
enum State {
- STATE_PRE_COPYUP,
+ STATE_PRE_TRIM,
STATE_COPYUP_OBJECTS,
- STATE_POST_COPYUP,
- STATE_PRE_REMOVE,
STATE_REMOVE_OBJECTS,
- STATE_POST_REMOVE,
+ STATE_POST_TRIM,
STATE_CLEAN_BOUNDARY,
STATE_FINISHED
};
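  // How the states advance, as a hedged sketch (the actual dispatch lives in
  // should_complete(), which is outside this hunk): each stage's completion
  // kicks off the next send_*() call, with inapplicable stages skipped as
  // described above:
  //
  //   case STATE_PRE_TRIM:       send_copyup_objects(); break;
  //   case STATE_COPYUP_OBJECTS: send_remove_objects(); break;
  //   case STATE_REMOVE_OBJECTS: send_post_trim();      break;
  //   case STATE_POST_TRIM:      send_clean_boundary(); break;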
bool should_complete(int r) override;
- State m_state;
+ State m_state = STATE_PRE_TRIM;
private:
uint64_t m_delete_start;
+ uint64_t m_delete_start_min = 0;
uint64_t m_num_objects;
uint64_t m_delete_off;
uint64_t m_new_size;
ProgressContext &m_prog_ctx;
- uint64_t m_copyup_start;
- uint64_t m_copyup_end;
-
- TrimRequest(ImageCtxT &image_ctx, Context *on_finish,
- uint64_t original_size, uint64_t new_size,
- ProgressContext &prog_ctx);
-
- void send_pre_copyup();
+ void send_pre_trim();
void send_copyup_objects();
- void send_post_copyup();
-
- void send_pre_remove();
void send_remove_objects();
- void send_post_remove();
+ void send_post_trim();
void send_clean_boundary();
void send_finish(int r);
#define dout_prefix *_dout << "mds.beacon." << name << ' '
-class Beacon::C_MDS_BeaconSender : public Context {
-public:
- explicit C_MDS_BeaconSender(Beacon *beacon_) : beacon(beacon_) {}
- void finish(int r) override {
- assert(beacon->lock.is_locked_by_me());
- beacon->sender = NULL;
- beacon->_send();
- }
-private:
- Beacon *beacon;
-};
-
Beacon::Beacon(CephContext *cct_, MonClient *monc_, std::string name_) :
Dispatcher(cct_), lock("Beacon"), monc(monc_), timer(g_ceph_context, lock),
name(name_), standby_for_rank(MDS_RANK_NONE),
awaiting_seq(-1)
{
last_seq = 0;
- sender = NULL;
was_laggy = false;
epoch = 0;
if (sender) {
timer.cancel_event(sender);
}
- sender = new C_MDS_BeaconSender(this);
- timer.add_event_after(g_conf->mds_beacon_interval, sender);
+ sender = timer.add_event_after(
+ g_conf->mds_beacon_interval,
+ new FunctionContext([this](int) {
+ assert(lock.is_locked_by_me());
+ sender = nullptr;
+ _send();
+ }));
if (!cct->get_heartbeat_map()->is_healthy()) {
/* If anything isn't progressing, let's avoid sending a beacon so that
MDSHealth health;
// Ticker
- class C_MDS_BeaconSender;
- C_MDS_BeaconSender *sender;
+ Context *sender = nullptr;
version_t awaiting_seq;
Cond waiting_cond;
static const int MASK_STATE_EXPORTED =
(STATE_DIRTY|STATE_NEEDSRECOVER|STATE_DIRTYPARENT|STATE_DIRTYPOOL);
static const int MASK_STATE_EXPORT_KEPT =
- (STATE_FROZEN|STATE_AMBIGUOUSAUTH|STATE_EXPORTINGCAPS);
+ (STATE_FROZEN|STATE_AMBIGUOUSAUTH|STATE_EXPORTINGCAPS|STATE_QUEUEDEXPORTPIN);
// -- waiters --
static const uint64_t WAIT_DIR = (1<<0);
void FSMap::decode(bufferlist::iterator& p)
{
- // Because the mon used to store an MDSMap where we now
- // store an FSMap, FSMap knows how to decode the legacy
- // MDSMap format (it never needs to encode it though).
- MDSMap legacy_mds_map;
-
// The highest MDSMap encoding version before we changed the
// MDSMonitor to store an FSMap instead of an MDSMap was
// 5, so anything older than 6 is decoded as an MDSMap,
// and anything newer is decoded as an FSMap.
DECODE_START_LEGACY_COMPAT_LEN_16(7, 4, 4, p);
if (struct_v < 6) {
+ // Because the mon used to store an MDSMap where we now
+ // store an FSMap, FSMap knows how to decode the legacy
+ // MDSMap format (it never needs to encode it though).
+ MDSMap legacy_mds_map;
+
// Decoding an MDSMap (upgrade)
::decode(epoch, p);
::decode(legacy_mds_map.flags, p);
DECODE_FINISH(p);
}
+void FSMap::sanitize(std::function<bool(int64_t pool)> pool_exists)
+{
+ for (auto &fs : filesystems) {
+ fs.second->mds_map.sanitize(pool_exists);
+ }
+}
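+
+// Usage sketch (the call site is an assumption, not part of this hunk): the
+// monitor can prune references to since-deleted pools right after decoding,
+// using an existence predicate backed by the current OSDMap:
+//
+//   fsmap.sanitize([&osdmap](int64_t pool) {
+//     return osdmap.have_pg_pool(pool);
+//   });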
void Filesystem::encode(bufferlist& bl, uint64_t features) const
{
bufferlist::iterator p = bl.begin();
decode(p);
}
+ void sanitize(std::function<bool(int64_t pool)> pool_exists);
void print(ostream& out) const;
void print_summary(Formatter *f, ostream *out) const;
mut->add_cow_inode(oldin);
if (pcow_inode)
*pcow_inode = oldin;
- CDentry *olddn = dn->dir->add_primary_dentry(dn->name, oldin, oldfirst, follows);
+ CDentry *olddn = dn->dir->add_primary_dentry(dn->name, oldin, oldfirst, oldin->last);
oldin->inode.version = olddn->pre_dirty();
dout(10) << " olddn " << *olddn << dendl;
bool need_snapflush = !oldin->client_snap_caps.empty();
unexpirables.push_back(dn);
} else {
trimmed++;
+ if (count > 0) count--;
}
- count--;
}
for (auto &dn : unexpirables) {
#undef dout_prefix
#define dout_prefix *_dout << "mds." << name << ' '
-
-class MDSDaemon::C_MDS_Tick : public Context {
- protected:
- MDSDaemon *mds_daemon;
-public:
- explicit C_MDS_Tick(MDSDaemon *m) : mds_daemon(m) {}
- void finish(int r) override {
- assert(mds_daemon->mds_lock.is_locked_by_me());
- mds_daemon->tick();
- }
-};
-
// cons/des
MDSDaemon::MDSDaemon(const std::string &n, Messenger *m, MonClient *mc) :
Dispatcher(m->cct),
mgrc(m->cct, m),
log_client(m->cct, messenger, &mc->monmap, LogClient::NO_FLAGS),
mds_rank(NULL),
- tick_event(0),
asok_hook(NULL)
{
orig_argc = 0;
if (tick_event) timer.cancel_event(tick_event);
// schedule
- tick_event = new C_MDS_Tick(this);
- timer.add_event_after(g_conf->mds_tick_interval, tick_event);
+ tick_event = timer.add_event_after(
+ g_conf->mds_tick_interval,
+ new FunctionContext([this](int) {
+ assert(mds_lock.is_locked_by_me());
+ tick();
+ }));
}
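// Both Beacon::_send() and MDSDaemon::reset_tick() now use the same idiom:
// SafeTimer::add_event_after() returns the Context* it schedules, so the
// caller can keep that pointer for a later cancel_event() without declaring
// a dedicated Context subclass. A minimal sketch (names hypothetical):
//
//   event = timer.add_event_after(interval, new FunctionContext([this](int) {
//     event = nullptr;  // the timer completes (and owns) the context
//     do_work();
//   }));
//   ...
//   if (event) timer.cancel_event(event);  // only if still pending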
void MDSDaemon::tick()
const std::set <std::string> &changed) override;
protected:
// tick and other timer fun
- class C_MDS_Tick;
- C_MDS_Tick *tick_event;
+ Context *tick_event = nullptr;
void reset_tick();
void wait_for_omap_osds();
*
*/
+#include "common/debug.h"
+#include "mon/health_check.h"
#include "MDSMap.h"
#include <sstream>
using std::stringstream;
-#include "mon/health_check.h"
-
+#define dout_context g_ceph_context
+#define dout_subsys ceph_subsys_
// features
CompatSet get_mdsmap_compat_set_all() {
ENCODE_FINISH(bl);
}
+void MDSMap::sanitize(std::function<bool(int64_t pool)> pool_exists)
+{
+ /* Before we did stricter checking, it was possible to remove a data pool
+ * without also deleting it from the MDSMap. Check for that here after
+ * decoding the data pools.
+ */
+
+ for (auto it = data_pools.begin(); it != data_pools.end();) {
+ if (!pool_exists(*it)) {
+      dout(0) << "removed non-existent data pool " << *it << " from MDSMap" << dendl;
+ it = data_pools.erase(it);
+ } else {
+ it++;
+ }
+ }
+}
+
void MDSMap::decode(bufferlist::iterator& p)
{
std::map<mds_rank_t,int32_t> inc; // Legacy field, parse and drop
bufferlist::iterator p = bl.begin();
decode(p);
}
-
+ void sanitize(std::function<bool(int64_t pool)> pool_exists);
void print(ostream& out) const;
void print_summary(Formatter *f, ostream *out) const;
dout(2) << "boot_start " << step << ": opening mds log" << dendl;
mdlog->open(gather.new_sub());
+ if (is_starting()) {
+ dout(2) << "boot_start " << step << ": opening purge queue" << dendl;
+ purge_queue.open(new C_IO_Wrapper(this, gather.new_sub()));
+ } else if (!standby_replaying) {
+ dout(2) << "boot_start " << step << ": opening purge queue (async)" << dendl;
+ purge_queue.open(NULL);
+ }
+
if (mdsmap->get_tableserver() == whoami) {
dout(2) << "boot_start " << step << ": opening snap table" << dendl;
snapserver->set_rank(whoami);
mdcache->open_mydir_inode(gather.new_sub());
- purge_queue.open(new C_IO_Wrapper(this, gather.new_sub()));
-
if (is_starting() ||
whoami == mdsmap->get_root()) { // load root inode off disk if we are auth
mdcache->open_root_inode(gather.new_sub());
break;
case MDS_BOOT_PREPARE_LOG:
if (is_any_replay()) {
- dout(2) << "boot_start " << step << ": replaying mds log" << dendl;
- mdlog->replay(new C_MDS_BootStart(this, MDS_BOOT_REPLAY_DONE));
+ dout(2) << "boot_start " << step << ": replaying mds log" << dendl;
+ MDSGatherBuilder gather(g_ceph_context,
+ new C_MDS_BootStart(this, MDS_BOOT_REPLAY_DONE));
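+        // The gather completes C_MDS_BootStart only once every sub created
+        // via new_sub() has finished and activate() has been called, i.e.
+        // log replay and (unless standby-replaying) purge queue recovery
+        // must both complete before we advance to MDS_BOOT_REPLAY_DONE.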
+
+ if (!standby_replaying) {
+          dout(2) << "boot_start " << step << ": waiting for purge queue recovery" << dendl;
+ purge_queue.wait_for_recovery(new C_IO_Wrapper(this, gather.new_sub()));
+ }
+
+ mdlog->replay(gather.new_sub());
+ gather.activate();
} else {
dout(2) << "boot_start " << step << ": positioning at end of old mds log" << dendl;
mdlog->append();
}
}
-inline void MDSRank::standby_replay_restart()
+class MDSRank::C_MDS_StandbyReplayRestart : public MDSInternalContext {
+public:
+ explicit C_MDS_StandbyReplayRestart(MDSRank *m) : MDSInternalContext(m) {}
+ void finish(int r) override {
+ assert(!r);
+ mds->standby_replay_restart();
+ }
+};
+
+void MDSRank::standby_replay_restart()
{
if (standby_replaying) {
/* Go around for another pass of replaying in standby */
/* We are transitioning out of standby: wait for OSD map update
before making final pass */
dout(1) << "standby_replay_restart (final takeover pass)" << dendl;
- Context *fin = new C_IO_Wrapper(this, new C_MDS_BootStart(this, MDS_BOOT_PREPARE_LOG));
- bool const ready =
- objecter->wait_for_map(mdsmap->get_last_failure_osd_epoch(), fin);
+ Context *fin = new C_IO_Wrapper(this, new C_MDS_StandbyReplayRestart(this));
+ bool ready = objecter->wait_for_map(mdsmap->get_last_failure_osd_epoch(), fin);
if (ready) {
delete fin;
mdlog->get_journaler()->reread_head_and_probe(
new C_MDS_StandbyReplayRestartFinish(
this,
mdlog->get_journaler()->get_read_pos()));
+
+ dout(1) << " opening purge queue (async)" << dendl;
+ purge_queue.open(NULL);
} else {
dout(1) << " waiting for osdmap " << mdsmap->get_last_failure_osd_epoch()
<< " (which blacklists prior instance)" << dendl;
}
}
-class MDSRank::C_MDS_StandbyReplayRestart : public MDSInternalContext {
-public:
- explicit C_MDS_StandbyReplayRestart(MDSRank *m) : MDSInternalContext(m) {}
- void finish(int r) override {
- assert(!r);
- mds->standby_replay_restart();
- }
-};
-
void MDSRank::replay_done()
{
dout(1) << "replay_done" << (standby_replaying ? " (as standby)" : "") << dendl;
max_purge_ops(0),
drain_initial(0),
draining(false),
- delayed_flush(nullptr)
+ delayed_flush(nullptr),
+ recovered(false)
{
assert(cct != nullptr);
assert(on_error != nullptr);
Mutex::Locker l(lock);
- journaler.recover(new FunctionContext([this, completion](int r){
+ if (completion)
+ waiting_for_recovery.push_back(completion);
+
+ journaler.recover(new FunctionContext([this](int r){
if (r == -ENOENT) {
dout(1) << "Purge Queue not found, assuming this is an upgrade and "
"creating it." << dendl;
- create(completion);
+ create(NULL);
} else if (r == 0) {
Mutex::Locker l(lock);
dout(4) << "open complete" << dendl;
if (journaler.last_committed.write_pos < journaler.get_write_pos()) {
dout(4) << "recovering write_pos" << dendl;
journaler.set_read_pos(journaler.last_committed.write_pos);
- _recover(completion);
+ _recover();
return;
}
journaler.set_writeable();
- completion->complete(0);
+ recovered = true;
+ finish_contexts(g_ceph_context, waiting_for_recovery);
} else {
derr << "Error " << r << " loading Journaler" << dendl;
on_error->complete(r);
}));
}
+void PurgeQueue::wait_for_recovery(Context* c)
+{
+ Mutex::Locker l(lock);
+ if (recovered)
+ c->complete(0);
+ else
+ waiting_for_recovery.push_back(c);
+}
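+
+// wait_for_recovery() is the waiting half of a completion gate; open(),
+// create() and _recover() form the firing half, each ending with
+//
+//   recovered = true;
+//   finish_contexts(g_ceph_context, waiting_for_recovery);
+//
+// so contexts registered before recovery finishes are completed in bulk,
+// while later registrations complete immediately.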
-void PurgeQueue::_recover(Context *completion)
+void PurgeQueue::_recover()
{
assert(lock.is_locked_by_me());
if (!journaler.is_readable() &&
!journaler.get_error() &&
journaler.get_read_pos() < journaler.get_write_pos()) {
- journaler.wait_for_readable(new FunctionContext([this, completion](int r) {
+ journaler.wait_for_readable(new FunctionContext([this](int r) {
Mutex::Locker l(lock);
- _recover(completion);
+ _recover();
}));
return;
}
// restore original read_pos
journaler.set_read_pos(journaler.last_committed.expire_pos);
journaler.set_writeable();
- completion->complete(0);
+ recovered = true;
+ finish_contexts(g_ceph_context, waiting_for_recovery);
return;
}
dout(4) << "creating" << dendl;
Mutex::Locker l(lock);
+ if (fin)
+ waiting_for_recovery.push_back(fin);
+
file_layout_t layout = file_layout_t::get_default();
layout.pool_id = metadata_pool;
journaler.set_writeable();
journaler.create(&layout, JOURNAL_FORMAT_RESILIENT);
- journaler.write_head(fin);
+ journaler.write_head(new FunctionContext([this](int r) {
+ Mutex::Locker l(lock);
+ recovered = true;
+ finish_contexts(g_ceph_context, waiting_for_recovery);
+ }));
}
/**
bool draining;
// recover the journal write_pos (drop any partially written entry)
- void _recover(Context *completion);
+ void _recover();
/**
* @return true if we were in a position to try and consume something:
void _execute_item_complete(
uint64_t expire_to);
+ bool recovered;
+ std::list<Context*> waiting_for_recovery;
public:
void init();
// Read the Journaler header for an existing queue and start consuming
void open(Context *completion);
+ void wait_for_recovery(Context *c);
+
// Submit one entry to the work queue. Call back when it is persisted
// to the queue (there is no callback for when it is executed)
void push(const PurgeItem &pi, Context *completion);
session->is_stale() ||
session->is_killing()) {
dout(10) << "currently open|opening|stale|killing, dropping this req" << dendl;
+ // set client metadata for session opened by prepare_force_open_sessions
+ if (!m->client_meta.empty())
+ session->set_client_metadata(m->client_meta);
m->put();
return;
}
<< " initial v " << mds->sessionmap.get_version() << dendl;
- int sessions_inserted = 0;
for (map<client_t,entity_inst_t>::iterator p = cm.begin(); p != cm.end(); ++p) {
- sessions_inserted++;
Session *session = mds->sessionmap.get_session(p->second.name);
assert(session);
void Server::recall_client_state(void)
{
/* try to recall at least 80% of all caps */
- uint64_t max_caps_per_client = (Capability::count() * .8);
- uint64_t min_caps_per_client = 100;
+ uint64_t max_caps_per_client = Capability::count() * g_conf->get_val<double>("mds_max_ratio_caps_per_client");
+ uint64_t min_caps_per_client = g_conf->get_val<uint64_t>("mds_min_caps_per_client");
+ if (max_caps_per_client < min_caps_per_client) {
+ dout(0) << "max_caps_per_client " << max_caps_per_client
+ << " < min_caps_per_client " << min_caps_per_client << dendl;
+ max_caps_per_client = min_caps_per_client + 1;
+ }
+
/* unless this ratio is smaller: */
/* ratio: determine the amount of caps to recall from each client. Use
* percentage full over the cache reservation. Cap the ratio at 80% of client
<< ", leases " << session->leases.size()
<< dendl;
- if (session->caps.size() > min_caps_per_client) {
- uint64_t newlim = MIN((session->caps.size() * ratio), max_caps_per_client);
- if (session->caps.size() > newlim) {
- MClientSession *m = new MClientSession(CEPH_SESSION_RECALL_STATE);
- m->head.max_caps = newlim;
- mds->send_message_client(m, session);
- session->notify_recall_sent(newlim);
- }
+ uint64_t newlim = MAX(MIN((session->caps.size() * ratio), max_caps_per_client), min_caps_per_client);
+ if (session->caps.size() > newlim) {
+ MClientSession *m = new MClientSession(CEPH_SESSION_RECALL_STATE);
+ m->head.max_caps = newlim;
+ mds->send_message_client(m, session);
+ session->notify_recall_sent(newlim);
}
}
}
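// Worked example (numbers illustrative): with mds_max_ratio_caps_per_client
// = 0.8 (matching the old hard-coded .8) and Capability::count() = 100000,
// max_caps_per_client = 80000. A session holding 50000 caps with ratio = 0.5
// gets newlim = MAX(MIN(50000 * 0.5, 80000), min_caps_per_client) = 25000
// and is asked to shrink to 25000 caps; a session already at or below
// min_caps_per_client is never sent a recall.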
class MMgrBeacon : public PaxosServiceMessage {
- static const int HEAD_VERSION = 5;
+ static const int HEAD_VERSION = 6;
static const int COMPAT_VERSION = 1;
protected:
std::set<std::string> available_modules;
map<string,string> metadata; ///< misc metadata about this osd
+ // From active daemon to populate MgrMap::services
+ std::map<std::string, std::string> services;
+
// Only populated during activation
std::vector<MonCommand> command_descs;
return metadata;
}
+ const std::map<std::string,std::string>& get_services() const {
+ return services;
+ }
+
+ void set_services(const std::map<std::string, std::string> &svcs)
+ {
+ services = svcs;
+ }
+
void set_command_descs(const std::vector<MonCommand> &cmds)
{
command_descs = cmds;
::encode(available_modules, payload);
::encode(command_descs, payload);
::encode(metadata, payload);
+ ::encode(services, payload);
}
void decode_payload() override {
bufferlist::iterator p = payload.begin();
if (header.version >= 5) {
::decode(metadata, p);
}
+ if (header.version >= 6) {
+ ::decode(services, p);
+ }
}
};
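// Wire-compatibility idiom used here (and by MMgrConfigure and MOSDPGTemp
// below): bump HEAD_VERSION when appending a field, always encode the new
// field, and gate its decode on header.version, so a payload from an older
// peer (which simply ends earlier) still parses cleanly:
//
//   if (header.version >= 6) {
//     ::decode(services, p);
//   }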
*/
class MMgrConfigure : public Message
{
- static const int HEAD_VERSION = 1;
+ static const int HEAD_VERSION = 2;
static const int COMPAT_VERSION = 1;
public:
uint32_t stats_period;
+
+ // Default 0 means if unspecified will include all stats
+ uint32_t stats_threshold = 0;
void decode_payload() override
{
bufferlist::iterator p = payload.begin();
::decode(stats_period, p);
+ if (header.version >= 2) {
+ ::decode(stats_threshold, p);
+ }
}
void encode_payload(uint64_t features) override {
::encode(stats_period, payload);
+ ::encode(stats_threshold, payload);
}
const char *get_type_name() const override { return "mgrconfigure"; }
void print(ostream& out) const override {
- out << get_type_name() << "()";
+ out << get_type_name() << "(period=" << stats_period
+ << ", threshold=" << stats_threshold << ")";
}
MMgrConfigure()
std::string nick;
enum perfcounter_type_d type;
+ // For older clients that did not send priority, pretend everything
+ // is "useful" so that mgr plugins filtering on prio will get some
+ // data (albeit probably more than they wanted)
+ uint8_t priority = PerfCountersBuilder::PRIO_USEFUL;
+
void encode(bufferlist &bl) const
{
// TODO: decide whether to drop the per-type
    // encoding here; we could rely on the MgrReport
    // versioning instead.
- ENCODE_START(1, 1, bl);
+ ENCODE_START(2, 1, bl);
::encode(path, bl);
::encode(description, bl);
::encode(nick, bl);
static_assert(sizeof(type) == 1, "perfcounter_type_d must be one byte");
::encode((uint8_t)type, bl);
+ ::encode(priority, bl);
ENCODE_FINISH(bl);
}
void decode(bufferlist::iterator &p)
{
- DECODE_START(1, p);
+ DECODE_START(2, p);
::decode(path, p);
::decode(description, p);
::decode(nick, p);
::decode((uint8_t&)type, p);
+ if (struct_v >= 2) {
+ ::decode(priority, p);
+ }
DECODE_FINISH(p);
}
};
(features & CEPH_FEATURE_PGPOOL3) == 0 ||
(features & CEPH_FEATURE_OSDENC) == 0 ||
(features & CEPH_FEATURE_OSDMAP_ENC) == 0 ||
- (features & CEPH_FEATURE_MSG_ADDR2) == 0) {
+ (features & CEPH_FEATURE_MSG_ADDR2) == 0 ||
+ !HAVE_FEATURE(features, SERVER_LUMINOUS)) {
if ((features & CEPH_FEATURE_PGID64) == 0 ||
(features & CEPH_FEATURE_PGPOOL3) == 0)
header.version = 1; // old old_client version
inc.fullmap.clear();
m.encode(inc.fullmap, features | CEPH_FEATURE_RESERVED);
}
+ if (inc.crush.length()) {
+ // embedded crush map
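+    // (decoded and re-encoded against the peer's feature bits so that
+    // pre-luminous clients receive a crush encoding they can decode)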
+ CrushWrapper c;
+ auto p = inc.crush.begin();
+ c.decode(p);
+ inc.crush.clear();
+ c.encode(inc.crush, features);
+ }
inc.encode(p->second, features | CEPH_FEATURE_RESERVED);
}
for (map<epoch_t,bufferlist>::iterator p = maps.begin();
public:
epoch_t map_epoch = 0;
map<pg_t, vector<int32_t> > pg_temp;
+ bool forced = false;
- MOSDPGTemp(epoch_t e) : PaxosServiceMessage(MSG_OSD_PGTEMP, e), map_epoch(e) { }
- MOSDPGTemp() : PaxosServiceMessage(MSG_OSD_PGTEMP, 0) {}
+ MOSDPGTemp(epoch_t e)
+ : PaxosServiceMessage(MSG_OSD_PGTEMP, e, HEAD_VERSION, COMPAT_VERSION),
+ map_epoch(e)
+ {}
+ MOSDPGTemp()
+ : MOSDPGTemp(0)
+ {}
private:
~MOSDPGTemp() override {}
paxos_encode();
::encode(map_epoch, payload);
::encode(pg_temp, payload);
+ ::encode(forced, payload);
}
void decode_payload() override {
bufferlist::iterator p = payload.begin();
paxos_decode(p);
::decode(map_epoch, p);
::decode(pg_temp, p);
+ if (header.version >= 2) {
+ ::decode(forced, p);
+ }
}
const char *get_type_name() const override { return "osd_pgtemp"; }
void print(ostream &out) const override {
out << "osd_pgtemp(e" << map_epoch << " " << pg_temp << " v" << version << ")";
}
-
+private:
+ static constexpr int HEAD_VERSION = 2;
+ static constexpr int COMPAT_VERSION = 1;
};
#endif
--- /dev/null
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2016 John Spray <john.spray@redhat.com>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ */
+
+#include "BaseMgrModule.h"
+
+#include "PyFormatter.h"
+
+#include "common/debug.h"
+
+#include "ActivePyModule.h"
+
+//XXX courtesy of http://stackoverflow.com/questions/1418015/how-to-get-python-exception-text
+#include <boost/python.hpp>
+#include "include/assert.h" // boost clobbers this
+
+
+#define dout_context g_ceph_context
+#define dout_subsys ceph_subsys_mgr
+#undef dout_prefix
+#define dout_prefix *_dout << "mgr " << __func__ << " "
+
+// decode a Python exception into a string
+std::string handle_pyerror()
+{
+ using namespace boost::python;
+ using namespace boost;
+
+ PyObject *exc, *val, *tb;
+ object formatted_list, formatted;
+ PyErr_Fetch(&exc, &val, &tb);
+ handle<> hexc(exc), hval(allow_null(val)), htb(allow_null(tb));
+ object traceback(import("traceback"));
+ if (!tb) {
+ object format_exception_only(traceback.attr("format_exception_only"));
+ formatted_list = format_exception_only(hexc, hval);
+ } else {
+ object format_exception(traceback.attr("format_exception"));
+ formatted_list = format_exception(hexc,hval, htb);
+ }
+ formatted = str("").join(formatted_list);
+ return extract<std::string>(formatted);
+}
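+
+// Typical call site, as a sketch of the uses further down: the Python error
+// indicator must still be set when handle_pyerror() runs, so call it right
+// after a failed call and before touching any other Python API:
+//
+//   auto pValue = PyObject_CallMethod(instance, method, format, ...);
+//   if (pValue == nullptr) {
+//     derr << handle_pyerror() << dendl;
+//   }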
+
+int ActivePyModule::load(ActivePyModules *py_modules)
+{
+ assert(py_modules);
+ Gil gil(pMyThreadState, true);
+
+ // We tell the module how we name it, so that it can be consistent
+ // with us in logging etc.
+ auto pThisPtr = PyCapsule_New(this, nullptr, nullptr);
+ auto pPyModules = PyCapsule_New(py_modules, nullptr, nullptr);
+ auto pModuleName = PyString_FromString(module_name.c_str());
+ auto pArgs = PyTuple_Pack(3, pModuleName, pPyModules, pThisPtr);
+
+ pClassInstance = PyObject_CallObject(pClass, pArgs);
+ Py_DECREF(pClass);
+ Py_DECREF(pModuleName);
+ Py_DECREF(pArgs);
+ if (pClassInstance == nullptr) {
+ derr << "Failed to construct class in '" << module_name << "'" << dendl;
+ derr << handle_pyerror() << dendl;
+ return -EINVAL;
+ } else {
+ dout(1) << "Constructed class from module: " << module_name << dendl;
+ }
+
+ return load_commands();
+}
+
+void ActivePyModule::notify(const std::string &notify_type, const std::string &notify_id)
+{
+ assert(pClassInstance != nullptr);
+
+ Gil gil(pMyThreadState, true);
+
+ // Execute
+ auto pValue = PyObject_CallMethod(pClassInstance,
+ const_cast<char*>("notify"), const_cast<char*>("(ss)"),
+ notify_type.c_str(), notify_id.c_str());
+
+ if (pValue != NULL) {
+ Py_DECREF(pValue);
+ } else {
+ derr << module_name << ".notify:" << dendl;
+ derr << handle_pyerror() << dendl;
+ // FIXME: callers can't be expected to handle a python module
+ // that has spontaneously broken, but Mgr() should provide
+ // a hook to unload misbehaving modules when they have an
+ // error somewhere like this
+ }
+}
+
+void ActivePyModule::notify_clog(const LogEntry &log_entry)
+{
+ assert(pClassInstance != nullptr);
+
+ Gil gil(pMyThreadState, true);
+
+ // Construct python-ized LogEntry
+ PyFormatter f;
+ log_entry.dump(&f);
+ auto py_log_entry = f.get();
+
+ // Execute
+ auto pValue = PyObject_CallMethod(pClassInstance,
+ const_cast<char*>("notify"), const_cast<char*>("(sN)"),
+ "clog", py_log_entry);
+
+ if (pValue != NULL) {
+ Py_DECREF(pValue);
+ } else {
+ derr << module_name << ".notify_clog:" << dendl;
+ derr << handle_pyerror() << dendl;
+ // FIXME: callers can't be expected to handle a python module
+ // that has spontaneously broken, but Mgr() should provide
+ // a hook to unload misbehaving modules when they have an
+ // error somewhere like this
+ }
+}
+
+int ActivePyModule::load_commands()
+{
+ // Don't need a Gil here -- this is called from ActivePyModule::load(),
+ // which already has one.
+ PyObject *command_list = PyObject_GetAttrString(pClassInstance, "COMMANDS");
+ if (command_list == nullptr) {
+    // Even modules that don't define commands should still inherit COMMANDS
+    // from the MgrModule definition. Something is wrong!
+ derr << "Module " << get_name() << " has missing COMMANDS member" << dendl;
+ return -EINVAL;
+ }
+ if (!PyObject_TypeCheck(command_list, &PyList_Type)) {
+ // Relatively easy mistake for human to make, e.g. defining COMMANDS
+ // as a {} instead of a []
+ derr << "Module " << get_name() << " has COMMANDS member of wrong type ("
+ "should be a list)" << dendl;
+ return -EINVAL;
+ }
+ const size_t list_size = PyList_Size(command_list);
+ for (size_t i = 0; i < list_size; ++i) {
+ PyObject *command = PyList_GetItem(command_list, i);
+ assert(command != nullptr);
+
+ ModuleCommand item;
+
+ PyObject *pCmd = PyDict_GetItemString(command, "cmd");
+ assert(pCmd != nullptr);
+ item.cmdstring = PyString_AsString(pCmd);
+
+ dout(20) << "loaded command " << item.cmdstring << dendl;
+
+ PyObject *pDesc = PyDict_GetItemString(command, "desc");
+ assert(pDesc != nullptr);
+ item.helpstring = PyString_AsString(pDesc);
+
+ PyObject *pPerm = PyDict_GetItemString(command, "perm");
+ assert(pPerm != nullptr);
+ item.perm = PyString_AsString(pPerm);
+
+ item.handler = this;
+
+ commands.push_back(item);
+ }
+ Py_DECREF(command_list);
+
+ dout(10) << "loaded " << commands.size() << " commands" << dendl;
+
+ return 0;
+}
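+
+// The Python-side shape that load_commands() expects, as an illustrative
+// module definition (not part of this change):
+//
+//   COMMANDS = [
+//     {
+//       "cmd": "example status",
+//       "desc": "Show example status",
+//       "perm": "r",
+//     },
+//   ]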
+
+int ActivePyModule::handle_command(
+ const cmdmap_t &cmdmap,
+ std::stringstream *ds,
+ std::stringstream *ss)
+{
+ assert(ss != nullptr);
+ assert(ds != nullptr);
+
+ Gil gil(pMyThreadState, true);
+
+ PyFormatter f;
+ cmdmap_dump(cmdmap, &f);
+ PyObject *py_cmd = f.get();
+
+ auto pResult = PyObject_CallMethod(pClassInstance,
+ const_cast<char*>("handle_command"), const_cast<char*>("(O)"), py_cmd);
+
+ Py_DECREF(py_cmd);
+
+ int r = 0;
+ if (pResult != NULL) {
+ if (PyTuple_Size(pResult) != 3) {
+ r = -EINVAL;
+ } else {
+ r = PyInt_AsLong(PyTuple_GetItem(pResult, 0));
+ *ds << PyString_AsString(PyTuple_GetItem(pResult, 1));
+ *ss << PyString_AsString(PyTuple_GetItem(pResult, 2));
+ }
+
+ Py_DECREF(pResult);
+ } else {
+ *ds << "";
+ *ss << handle_pyerror();
+ r = -EINVAL;
+ }
+
+ return r;
+}
+
+void ActivePyModule::get_health_checks(health_check_map_t *checks)
+{
+ checks->merge(health_checks);
+}
+
--- /dev/null
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2016 John Spray <john.spray@redhat.com>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ */
+
+
+#pragma once
+
+// Python.h comes first because otherwise it clobbers ceph's assert
+#include "Python.h"
+
+#include "common/cmdparse.h"
+#include "common/LogEntry.h"
+#include "common/Mutex.h"
+#include "common/Thread.h"
+#include "mon/health_check.h"
+#include "mgr/Gil.h"
+
+#include "PyModuleRunner.h"
+
+#include <vector>
+#include <string>
+
+
+class ActivePyModule;
+class ActivePyModules;
+
+/**
+ * A Ceph CLI command description provided from a Python module
+ */
+class ModuleCommand {
+public:
+ std::string cmdstring;
+ std::string helpstring;
+ std::string perm;
+ ActivePyModule *handler;
+};
+
+class ActivePyModule : public PyModuleRunner
+{
+private:
+ health_check_map_t health_checks;
+
+ std::vector<ModuleCommand> commands;
+
+ int load_commands();
+
+ // Optional, URI exposed by plugins that implement serve()
+ std::string uri;
+
+public:
+ ActivePyModule(const std::string &module_name_,
+ PyObject *pClass_,
+ const SafeThreadState &my_ts_)
+ : PyModuleRunner(module_name_, pClass_, my_ts_)
+ {}
+
+ int load(ActivePyModules *py_modules);
+  void notify(const std::string &notify_type, const std::string &notify_id);
+ void notify_clog(const LogEntry &le);
+
+ const std::vector<ModuleCommand> &get_commands() const
+ {
+ return commands;
+ }
+
+ int handle_command(
+ const cmdmap_t &cmdmap,
+ std::stringstream *ds,
+ std::stringstream *ss);
+
+ void set_health_checks(health_check_map_t&& c) {
+ health_checks = std::move(c);
+ }
+ void get_health_checks(health_check_map_t *checks);
+
+ void set_uri(const std::string &str)
+ {
+ uri = str;
+ }
+
+ std::string get_uri() const
+ {
+ return uri;
+ }
+};
+
+std::string handle_pyerror();
+
--- /dev/null
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2014 John Spray <john.spray@inktank.com>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ */
+
+// Include this first to get python headers earlier
+#include "BaseMgrModule.h"
+#include "Gil.h"
+
+#include "common/errno.h"
+#include "include/stringify.h"
+
+#include "PyFormatter.h"
+
+#include "osd/OSDMap.h"
+#include "mon/MonMap.h"
+
+#include "mgr/MgrContext.h"
+
+// For ::config_prefix
+#include "PyModuleRegistry.h"
+
+#include "ActivePyModules.h"
+
+#define dout_context g_ceph_context
+#define dout_subsys ceph_subsys_mgr
+#undef dout_prefix
+#define dout_prefix *_dout << "mgr " << __func__ << " "
+
+
+ActivePyModules::ActivePyModules(PyModuleConfig const &config_,
+ DaemonStateIndex &ds, ClusterState &cs,
+ MonClient &mc, LogChannelRef clog_, Objecter &objecter_,
+ Client &client_, Finisher &f)
+ : config_cache(config_), daemon_state(ds), cluster_state(cs),
+ monc(mc), clog(clog_), objecter(objecter_), client(client_), finisher(f),
+ lock("ActivePyModules")
+{}
+
+ActivePyModules::~ActivePyModules() = default;
+
+void ActivePyModules::dump_server(const std::string &hostname,
+ const DaemonStateCollection &dmc,
+ Formatter *f)
+{
+ f->dump_string("hostname", hostname);
+ f->open_array_section("services");
+ std::string ceph_version;
+
+ for (const auto &i : dmc) {
+ Mutex::Locker l(i.second->lock);
+ const auto &key = i.first;
+ const std::string &str_type = key.first;
+ const std::string &svc_name = key.second;
+
+ // TODO: pick the highest version, and make sure that
+ // somewhere else (during health reporting?) we are
+ // indicating to the user if we see mixed versions
+ auto ver_iter = i.second->metadata.find("ceph_version");
+ if (ver_iter != i.second->metadata.end()) {
+ ceph_version = i.second->metadata.at("ceph_version");
+ }
+
+ f->open_object_section("service");
+ f->dump_string("type", str_type);
+ f->dump_string("id", svc_name);
+ f->close_section();
+ }
+ f->close_section();
+
+ f->dump_string("ceph_version", ceph_version);
+}
+
+
+
+PyObject *ActivePyModules::get_server_python(const std::string &hostname)
+{
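+  // Drop the GIL before taking the ActivePyModules lock, then re-acquire it
+  // once the lock is held: a module thread may hold the GIL while calling in
+  // here, and another thread may hold this lock while waiting for the GIL,
+  // so holding the GIL across the lock acquisition could deadlock. The same
+  // save/lock/restore pattern recurs throughout this file.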
+ PyThreadState *tstate = PyEval_SaveThread();
+ Mutex::Locker l(lock);
+ PyEval_RestoreThread(tstate);
+ dout(10) << " (" << hostname << ")" << dendl;
+
+ auto dmc = daemon_state.get_by_server(hostname);
+
+ PyFormatter f;
+ dump_server(hostname, dmc, &f);
+ return f.get();
+}
+
+
+PyObject *ActivePyModules::list_servers_python()
+{
+ PyThreadState *tstate = PyEval_SaveThread();
+ Mutex::Locker l(lock);
+ PyEval_RestoreThread(tstate);
+ dout(10) << " >" << dendl;
+
+ PyFormatter f(false, true);
+ daemon_state.with_daemons_by_server([this, &f]
+ (const std::map<std::string, DaemonStateCollection> &all) {
+ for (const auto &i : all) {
+ const auto &hostname = i.first;
+
+ f.open_object_section("server");
+ dump_server(hostname, i.second, &f);
+ f.close_section();
+ }
+ });
+
+ return f.get();
+}
+
+PyObject *ActivePyModules::get_metadata_python(
+ const std::string &svc_type,
+ const std::string &svc_id)
+{
+ auto metadata = daemon_state.get(DaemonKey(svc_type, svc_id));
+ if (metadata == nullptr) {
+ derr << "Requested missing service " << svc_type << "." << svc_id << dendl;
+ Py_RETURN_NONE;
+ }
+
+ Mutex::Locker l(metadata->lock);
+ PyFormatter f;
+ f.dump_string("hostname", metadata->hostname);
+ for (const auto &i : metadata->metadata) {
+ f.dump_string(i.first.c_str(), i.second);
+ }
+
+ return f.get();
+}
+
+PyObject *ActivePyModules::get_daemon_status_python(
+ const std::string &svc_type,
+ const std::string &svc_id)
+{
+ auto metadata = daemon_state.get(DaemonKey(svc_type, svc_id));
+ if (metadata == nullptr) {
+ derr << "Requested missing service " << svc_type << "." << svc_id << dendl;
+ Py_RETURN_NONE;
+ }
+
+ Mutex::Locker l(metadata->lock);
+ PyFormatter f;
+ for (const auto &i : metadata->service_status) {
+ f.dump_string(i.first.c_str(), i.second);
+ }
+ return f.get();
+}
+
+PyObject *ActivePyModules::get_python(const std::string &what)
+{
+ PyThreadState *tstate = PyEval_SaveThread();
+ Mutex::Locker l(lock);
+ PyEval_RestoreThread(tstate);
+
+ if (what == "fs_map") {
+ PyFormatter f;
+ cluster_state.with_fsmap([&f](const FSMap &fsmap) {
+ fsmap.dump(&f);
+ });
+ return f.get();
+ } else if (what == "osdmap_crush_map_text") {
+ bufferlist rdata;
+ cluster_state.with_osdmap([&rdata](const OSDMap &osd_map){
+ osd_map.crush->encode(rdata, CEPH_FEATURES_SUPPORTED_DEFAULT);
+ });
+ std::string crush_text = rdata.to_str();
+ return PyString_FromString(crush_text.c_str());
+ } else if (what.substr(0, 7) == "osd_map") {
+ PyFormatter f;
+ cluster_state.with_osdmap([&f, &what](const OSDMap &osd_map){
+ if (what == "osd_map") {
+ osd_map.dump(&f);
+ } else if (what == "osd_map_tree") {
+ osd_map.print_tree(&f, nullptr);
+ } else if (what == "osd_map_crush") {
+ osd_map.crush->dump(&f);
+ }
+ });
+ return f.get();
+ } else if (what == "config") {
+ PyFormatter f;
+ g_conf->show_config(&f);
+ return f.get();
+ } else if (what == "mon_map") {
+ PyFormatter f;
+ cluster_state.with_monmap(
+ [&f](const MonMap &monmap) {
+ monmap.dump(&f);
+ }
+ );
+ return f.get();
+ } else if (what == "service_map") {
+ PyFormatter f;
+ cluster_state.with_servicemap(
+ [&f](const ServiceMap &service_map) {
+ service_map.dump(&f);
+ }
+ );
+ return f.get();
+ } else if (what == "osd_metadata") {
+ PyFormatter f;
+ auto dmc = daemon_state.get_by_service("osd");
+ for (const auto &i : dmc) {
+ Mutex::Locker l(i.second->lock);
+ f.open_object_section(i.first.second.c_str());
+ f.dump_string("hostname", i.second->hostname);
+ for (const auto &j : i.second->metadata) {
+ f.dump_string(j.first.c_str(), j.second);
+ }
+ f.close_section();
+ }
+ return f.get();
+ } else if (what == "pg_summary") {
+ PyFormatter f;
+ cluster_state.with_pgmap(
+ [&f](const PGMap &pg_map) {
+ std::map<std::string, std::map<std::string, uint32_t> > osds;
+ std::map<std::string, std::map<std::string, uint32_t> > pools;
+ std::map<std::string, uint32_t> all;
+ for (const auto &i : pg_map.pg_stat) {
+ const auto pool = i.first.m_pool;
+ const std::string state = pg_state_string(i.second.state);
+ // Insert to per-pool map
+ pools[stringify(pool)][state]++;
+ for (const auto &osd_id : i.second.acting) {
+ osds[stringify(osd_id)][state]++;
+ }
+ all[state]++;
+ }
+ f.open_object_section("by_osd");
+ for (const auto &i : osds) {
+ f.open_object_section(i.first.c_str());
+ for (const auto &j : i.second) {
+ f.dump_int(j.first.c_str(), j.second);
+ }
+ f.close_section();
+ }
+ f.close_section();
+ f.open_object_section("by_pool");
+ for (const auto &i : pools) {
+ f.open_object_section(i.first.c_str());
+ for (const auto &j : i.second) {
+ f.dump_int(j.first.c_str(), j.second);
+ }
+ f.close_section();
+ }
+ f.close_section();
+ f.open_object_section("all");
+ for (const auto &i : all) {
+ f.dump_int(i.first.c_str(), i.second);
+ }
+ f.close_section();
+ }
+ );
+ return f.get();
+ } else if (what == "pg_status") {
+ PyFormatter f;
+ cluster_state.with_pgmap(
+ [&f](const PGMap &pg_map) {
+ pg_map.print_summary(&f, nullptr);
+ }
+ );
+ return f.get();
+ } else if (what == "pg_dump") {
+ PyFormatter f;
+ cluster_state.with_pgmap(
+ [&f](const PGMap &pg_map) {
+ pg_map.dump(&f);
+ }
+ );
+ return f.get();
+ } else if (what == "df") {
+ PyFormatter f;
+
+ cluster_state.with_osdmap([this, &f](const OSDMap &osd_map){
+ cluster_state.with_pgmap(
+ [&osd_map, &f](const PGMap &pg_map) {
+ pg_map.dump_fs_stats(nullptr, &f, true);
+ pg_map.dump_pool_stats_full(osd_map, nullptr, &f, true);
+ });
+ });
+ return f.get();
+ } else if (what == "osd_stats") {
+ PyFormatter f;
+ cluster_state.with_pgmap(
+ [&f](const PGMap &pg_map) {
+ pg_map.dump_osd_stats(&f);
+ });
+ return f.get();
+ } else if (what == "health" || what == "mon_status") {
+ PyFormatter f;
+ bufferlist json;
+ if (what == "health") {
+ json = cluster_state.get_health();
+ } else if (what == "mon_status") {
+ json = cluster_state.get_mon_status();
+ } else {
+ assert(false);
+ }
+ f.dump_string("json", json.to_str());
+ return f.get();
+ } else if (what == "mgr_map") {
+ PyFormatter f;
+ cluster_state.with_mgrmap([&f](const MgrMap &mgr_map) {
+ mgr_map.dump(&f);
+ });
+ return f.get();
+ } else {
+ derr << "Python module requested unknown data '" << what << "'" << dendl;
+ Py_RETURN_NONE;
+ }
+}
+
+int ActivePyModules::start_one(std::string const &module_name,
+ PyObject *pClass, const SafeThreadState &pMyThreadState)
+{
+ Mutex::Locker l(lock);
+
+ assert(modules.count(module_name) == 0);
+
+ modules[module_name].reset(new ActivePyModule(
+ module_name, pClass,
+ pMyThreadState));
+
+ int r = modules[module_name]->load(this);
+ if (r != 0) {
+ return r;
+ } else {
+ dout(4) << "Starting thread for " << module_name << dendl;
+ // Giving Thread the module's module_name member as its
+ // char* thread name: thread must not outlive module class lifetime.
+ modules[module_name]->thread.create(
+ modules[module_name]->get_name().c_str());
+
+ return 0;
+ }
+}
+
+void ActivePyModules::shutdown()
+{
+ Mutex::Locker locker(lock);
+
+ // Signal modules to drop out of serve() and/or tear down resources
+ for (auto &i : modules) {
+ auto module = i.second.get();
+ const auto& name = i.first;
+
+ lock.Unlock();
+ dout(10) << "calling module " << name << " shutdown()" << dendl;
+ module->shutdown();
+ dout(10) << "module " << name << " shutdown() returned" << dendl;
+ lock.Lock();
+ }
+
+ // For modules implementing serve(), finish the threads where we
+ // were running that.
+ for (auto &i : modules) {
+ lock.Unlock();
+ dout(10) << "joining module " << i.first << dendl;
+ i.second->thread.join();
+ dout(10) << "joined module " << i.first << dendl;
+ lock.Lock();
+ }
+
+ modules.clear();
+}
+
+void ActivePyModules::notify_all(const std::string &notify_type,
+  const std::string &notify_id)
+{
+ Mutex::Locker l(lock);
+
+ dout(10) << __func__ << ": notify_all " << notify_type << dendl;
+ for (auto& i : modules) {
+ auto module = i.second.get();
+ // Send all python calls down a Finisher to avoid blocking
+ // C++ code, and avoid any potential lock cycles.
+ finisher.queue(new FunctionContext([module, notify_type, notify_id](int r){
+ module->notify(notify_type, notify_id);
+ }));
+ }
+}
+
+void ActivePyModules::notify_all(const LogEntry &log_entry)
+{
+ Mutex::Locker l(lock);
+
+ dout(10) << __func__ << ": notify_all (clog)" << dendl;
+ for (auto& i : modules) {
+ auto module = i.second.get();
+ // Send all python calls down a Finisher to avoid blocking
+ // C++ code, and avoid any potential lock cycles.
+ //
+ // Note intentional use of non-reference lambda binding on
+ // log_entry: we take a copy because caller's instance is
+ // probably ephemeral.
+ finisher.queue(new FunctionContext([module, log_entry](int r){
+ module->notify_clog(log_entry);
+ }));
+ }
+}
+
+bool ActivePyModules::get_config(const std::string &module_name,
+ const std::string &key, std::string *val) const
+{
+ PyThreadState *tstate = PyEval_SaveThread();
+ Mutex::Locker l(lock);
+ PyEval_RestoreThread(tstate);
+
+ const std::string global_key = PyModuleRegistry::config_prefix
+ + module_name + "/" + key;
+
+  dout(4) << __func__ << " key: " << global_key << dendl;
+
+ if (config_cache.count(global_key)) {
+ *val = config_cache.at(global_key);
+ return true;
+ } else {
+ return false;
+ }
+}
+
+PyObject *ActivePyModules::get_config_prefix(const std::string &module_name,
+ const std::string &prefix) const
+{
+ PyThreadState *tstate = PyEval_SaveThread();
+ Mutex::Locker l(lock);
+ PyEval_RestoreThread(tstate);
+
+ const std::string base_prefix = PyModuleRegistry::config_prefix
+ + module_name + "/";
+ const std::string global_prefix = base_prefix + prefix;
+  dout(4) << __func__ << " prefix: " << global_prefix << dendl;
+
+ PyFormatter f;
+ for (auto p = config_cache.lower_bound(global_prefix);
+ p != config_cache.end() && p->first.find(global_prefix) == 0;
+ ++p) {
+ f.dump_string(p->first.c_str() + base_prefix.size(), p->second);
+ }
+ return f.get();
+}
+
+void ActivePyModules::set_config(const std::string &module_name,
+ const std::string &key, const boost::optional<std::string>& val)
+{
+ const std::string global_key = PyModuleRegistry::config_prefix
+ + module_name + "/" + key;
+
+ Command set_cmd;
+ {
+ PyThreadState *tstate = PyEval_SaveThread();
+ Mutex::Locker l(lock);
+ PyEval_RestoreThread(tstate);
+ if (val) {
+ config_cache[global_key] = *val;
+ } else {
+ config_cache.erase(global_key);
+ }
+
+ std::ostringstream cmd_json;
+ JSONFormatter jf;
+ jf.open_object_section("cmd");
+ if (val) {
+ jf.dump_string("prefix", "config-key set");
+ jf.dump_string("key", global_key);
+ jf.dump_string("val", *val);
+ } else {
+ jf.dump_string("prefix", "config-key del");
+ jf.dump_string("key", global_key);
+ }
+ jf.close_section();
+ jf.flush(cmd_json);
+ set_cmd.run(&monc, cmd_json.str());
+ }
+ set_cmd.wait();
+
+ if (set_cmd.r != 0) {
+ // config-key set will fail if mgr's auth key has insufficient
+ // permission to set config keys
+ // FIXME: should this somehow raise an exception back into Python land?
+ dout(0) << "`config-key set " << global_key << " " << val << "` failed: "
+ << cpp_strerror(set_cmd.r) << dendl;
+ dout(0) << "mon returned " << set_cmd.r << ": " << set_cmd.outs << dendl;
+ }
+}
+
+std::vector<ModuleCommand> ActivePyModules::get_py_commands() const
+{
+ Mutex::Locker l(lock);
+
+ std::vector<ModuleCommand> result;
+ for (const auto& i : modules) {
+ auto module = i.second.get();
+ auto mod_commands = module->get_commands();
+ for (auto j : mod_commands) {
+ result.push_back(j);
+ }
+ }
+
+ return result;
+}
+
+std::vector<MonCommand> ActivePyModules::get_commands() const
+{
+ std::vector<ModuleCommand> commands = get_py_commands();
+ std::vector<MonCommand> result;
+ for (auto &pyc: commands) {
+ result.push_back({pyc.cmdstring, pyc.helpstring, "mgr",
+ pyc.perm, "cli", MonCommand::FLAG_MGR});
+ }
+ return result;
+}
+
+
+std::map<std::string, std::string> ActivePyModules::get_services() const
+{
+ std::map<std::string, std::string> result;
+ Mutex::Locker l(lock);
+ for (const auto& i : modules) {
+ const auto &module = i.second.get();
+ std::string svc_str = module->get_uri();
+ if (!svc_str.empty()) {
+ result[module->get_name()] = svc_str;
+ }
+ }
+
+ return result;
+}
+
+PyObject* ActivePyModules::get_counter_python(
+ const std::string &svc_name,
+ const std::string &svc_id,
+ const std::string &path)
+{
+ PyThreadState *tstate = PyEval_SaveThread();
+ Mutex::Locker l(lock);
+ PyEval_RestoreThread(tstate);
+
+ PyFormatter f;
+ f.open_array_section(path.c_str());
+
+ auto metadata = daemon_state.get(DaemonKey(svc_name, svc_id));
+ if (metadata) {
+ Mutex::Locker l2(metadata->lock);
+ if (metadata->perf_counters.instances.count(path)) {
+ auto counter_instance = metadata->perf_counters.instances.at(path);
+ const auto &data = counter_instance.get_data();
+ for (const auto &datapoint : data) {
+ f.open_array_section("datapoint");
+ f.dump_unsigned("t", datapoint.t.sec());
+ f.dump_unsigned("v", datapoint.v);
+ f.close_section();
+
+ }
+ } else {
+ dout(4) << "Missing counter: '" << path << "' ("
+ << svc_name << "." << svc_id << ")" << dendl;
+ dout(20) << "Paths are:" << dendl;
+ for (const auto &i : metadata->perf_counters.instances) {
+ dout(20) << i.first << dendl;
+ }
+ }
+ } else {
+    dout(4) << "No daemon state for "
+              << svc_name << "." << svc_id << dendl;
+ }
+ f.close_section();
+ return f.get();
+}
+
+PyObject* ActivePyModules::get_perf_schema_python(
+ const std::string svc_type,
+ const std::string &svc_id)
+{
+ PyThreadState *tstate = PyEval_SaveThread();
+ Mutex::Locker l(lock);
+ PyEval_RestoreThread(tstate);
+
+ DaemonStateCollection daemons;
+
+  if (svc_type.empty()) {
+    daemons = daemon_state.get_all();
+  } else if (svc_id.empty()) {
+    daemons = daemon_state.get_by_service(svc_type);
+ } else {
+ auto key = DaemonKey(svc_type, svc_id);
+ // so that the below can be a loop in all cases
+ auto got = daemon_state.get(key);
+ if (got != nullptr) {
+ daemons[key] = got;
+ }
+ }
+
+ PyFormatter f;
+ if (!daemons.empty()) {
+ for (auto statepair : daemons) {
+ auto key = statepair.first;
+ auto state = statepair.second;
+
+ std::ostringstream daemon_name;
+ daemon_name << key.first << "." << key.second;
+ f.open_object_section(daemon_name.str().c_str());
+
+ Mutex::Locker l(state->lock);
+ for (auto ctr_inst_iter : state->perf_counters.instances) {
+ const auto &counter_name = ctr_inst_iter.first;
+ f.open_object_section(counter_name.c_str());
+ auto type = state->perf_counters.types[counter_name];
+ f.dump_string("description", type.description);
+ if (!type.nick.empty()) {
+ f.dump_string("nick", type.nick);
+ }
+ f.dump_unsigned("type", type.type);
+ f.dump_unsigned("priority", type.priority);
+ f.close_section();
+ }
+ f.close_section();
+ }
+ } else {
+    dout(4) << __func__ << ": No daemon state found for "
+              << svc_type << "." << svc_id << dendl;
+ }
+ return f.get();
+}
+
+PyObject *ActivePyModules::get_context()
+{
+ PyThreadState *tstate = PyEval_SaveThread();
+ Mutex::Locker l(lock);
+ PyEval_RestoreThread(tstate);
+
+ // Construct a capsule containing ceph context.
+ // Not incrementing/decrementing ref count on the context because
+ // it's the global one and it has process lifetime.
+ auto capsule = PyCapsule_New(g_ceph_context, nullptr, nullptr);
+ return capsule;
+}
+
+/**
+ * Helper for our wrapped types that take a capsule in their constructor.
+ */
+PyObject *construct_with_capsule(
+ const std::string &module_name,
+ const std::string &clsname,
+ void *wrapped)
+{
+ // Look up the OSDMap type which we will construct
+ PyObject *module = PyImport_ImportModule(module_name.c_str());
+ if (!module) {
+ derr << "Failed to import python module:" << dendl;
+ derr << handle_pyerror() << dendl;
+ }
+ assert(module);
+
+ PyObject *wrapper_type = PyObject_GetAttrString(
+ module, (const char*)clsname.c_str());
+ if (!wrapper_type) {
+ derr << "Failed to get python type:" << dendl;
+ derr << handle_pyerror() << dendl;
+ }
+ assert(wrapper_type);
+
+ // Construct a capsule containing an OSDMap.
+ auto wrapped_capsule = PyCapsule_New(wrapped, nullptr, nullptr);
+ assert(wrapped_capsule);
+
+ // Construct the python OSDMap
+ auto pArgs = PyTuple_Pack(1, wrapped_capsule);
+ auto wrapper_instance = PyObject_CallObject(wrapper_type, pArgs);
+ if (wrapper_instance == nullptr) {
+ derr << "Failed to construct python OSDMap:" << dendl;
+ derr << handle_pyerror() << dendl;
+ }
+ assert(wrapper_instance != nullptr);
+ Py_DECREF(pArgs);
+ Py_DECREF(wrapped_capsule);
+
+ Py_DECREF(wrapper_type);
+ Py_DECREF(module);
+
+ return wrapper_instance;
+}
+
+PyObject *ActivePyModules::get_osdmap()
+{
+ PyThreadState *tstate = PyEval_SaveThread();
+ Mutex::Locker l(lock);
+ PyEval_RestoreThread(tstate);
+
+ OSDMap *newmap = new OSDMap;
+
+ cluster_state.with_osdmap([&](const OSDMap& o) {
+ newmap->deepish_copy_from(o);
+ });
+
+ return construct_with_capsule("mgr_module", "OSDMap", (void*)newmap);
+}
+
+void ActivePyModules::set_health_checks(const std::string& module_name,
+ health_check_map_t&& checks)
+{
+ Mutex::Locker l(lock);
+ auto p = modules.find(module_name);
+ if (p != modules.end()) {
+ p->second->set_health_checks(std::move(checks));
+ }
+}
+
+void ActivePyModules::get_health_checks(health_check_map_t *checks)
+{
+ Mutex::Locker l(lock);
+ for (auto& p : modules) {
+ p.second->get_health_checks(checks);
+ }
+}
+
+void ActivePyModules::set_uri(const std::string& module_name,
+ const std::string &uri)
+{
+ Mutex::Locker l(lock);
+
+ dout(4) << " module " << module_name << " set URI '" << uri << "'" << dendl;
+
+ modules[module_name]->set_uri(uri);
+}
+
--- /dev/null
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2014 John Spray <john.spray@inktank.com>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ */
+
+#pragma once
+
+#include "ActivePyModule.h"
+
+#include "common/Finisher.h"
+#include "common/Mutex.h"
+
+#include "osdc/Objecter.h"
+#include "client/Client.h"
+#include "common/LogClient.h"
+#include "mon/MgrMap.h"
+#include "mon/MonCommand.h"
+
+#include "DaemonState.h"
+#include "ClusterState.h"
+
+class health_check_map_t;
+
+typedef std::map<std::string, std::string> PyModuleConfig;
+
+class ActivePyModules
+{
+
+ std::map<std::string, std::unique_ptr<ActivePyModule>> modules;
+ PyModuleConfig config_cache;
+ DaemonStateIndex &daemon_state;
+ ClusterState &cluster_state;
+ MonClient &monc;
+ LogChannelRef clog;
+ Objecter &objecter;
+ Client &client;
+ Finisher &finisher;
+
+
+ mutable Mutex lock{"ActivePyModules::lock"};
+
+public:
+ ActivePyModules(PyModuleConfig const &config_,
+ DaemonStateIndex &ds, ClusterState &cs, MonClient &mc,
+ LogChannelRef clog_, Objecter &objecter_, Client &client_,
+ Finisher &f);
+
+ ~ActivePyModules();
+
+ // FIXME: wrap for send_command?
+ MonClient &get_monc() {return monc;}
+ Objecter &get_objecter() {return objecter;}
+ Client &get_client() {return client;}
+
+ PyObject *get_python(const std::string &what);
+ PyObject *get_server_python(const std::string &hostname);
+ PyObject *list_servers_python();
+ PyObject *get_metadata_python(
+ const std::string &svc_type, const std::string &svc_id);
+ PyObject *get_daemon_status_python(
+ const std::string &svc_type, const std::string &svc_id);
+ PyObject *get_counter_python(
+ const std::string &svc_type,
+ const std::string &svc_id,
+ const std::string &path);
+ PyObject *get_perf_schema_python(
+ const std::string svc_type,
+ const std::string &svc_id);
+ PyObject *get_context();
+ PyObject *get_osdmap();
+
+ bool get_config(const std::string &module_name,
+ const std::string &key, std::string *val) const;
+ PyObject *get_config_prefix(const std::string &module_name,
+ const std::string &prefix) const;
+ void set_config(const std::string &module_name,
+ const std::string &key, const boost::optional<std::string> &val);
+
+ void set_health_checks(const std::string& module_name,
+ health_check_map_t&& checks);
+ void get_health_checks(health_check_map_t *checks);
+
+ void set_uri(const std::string& module_name, const std::string &uri);
+
+ // Python command definitions, including callback
+ std::vector<ModuleCommand> get_py_commands() const;
+
+ // Monitor command definitions, suitable for CLI
+ std::vector<MonCommand> get_commands() const;
+
+ std::map<std::string, std::string> get_services() const;
+
+ // Public so that MonCommandCompletion can use it
+ // FIXME: for send_command completion notifications,
+ // send it to only the module that sent the command, not everyone
+ void notify_all(const std::string ¬ify_type,
+ const std::string ¬ify_id);
+ void notify_all(const LogEntry &log_entry);
+
+ int init();
+ void shutdown();
+
+ int start_one(std::string const &module_name,
+ PyObject *pClass,
+ const SafeThreadState &pMyThreadState);
+
+ void dump_server(const std::string &hostname,
+ const DaemonStateCollection &dmc,
+ Formatter *f);
+};
+
--- /dev/null
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2016 John Spray <john.spray@redhat.com>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ */
+
+/**
+ * The interface we present to python code that runs within
+ * ceph-mgr. This is implemented as a Python class from which
+ * all modules must inherit -- access to the Ceph state is then
+ * available as methods on that object.
+ */
+
+#include "Python.h"
+
+#include "Mgr.h"
+
+#include "mon/MonClient.h"
+#include "common/errno.h"
+#include "common/version.h"
+
+#include "BaseMgrModule.h"
+#include "Gil.h"
+
+#define dout_context g_ceph_context
+#define dout_subsys ceph_subsys_mgr
+
+#define PLACEHOLDER ""
+
+
+typedef struct {
+ PyObject_HEAD
+ ActivePyModules *py_modules;
+ ActivePyModule *this_module;
+} BaseMgrModule;
+
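+/**
+ * Relay the asynchronous result of a Ceph command back into Python:
+ * finish() invokes the Python completion object's
+ * complete(r, outbl, outs) method, then notifies all modules.
+ */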
+class MonCommandCompletion : public Context
+{
+ ActivePyModules *py_modules;
+ PyObject *python_completion;
+ const std::string tag;
+ SafeThreadState pThreadState;
+
+public:
+ std::string outs;
+ bufferlist outbl;
+
+ MonCommandCompletion(
+ ActivePyModules *py_modules_, PyObject* ev,
+ const std::string &tag_, PyThreadState *ts_)
+ : py_modules(py_modules_), python_completion(ev),
+ tag(tag_), pThreadState(ts_)
+ {
+ assert(python_completion != nullptr);
+ Py_INCREF(python_completion);
+ }
+
+ ~MonCommandCompletion() override
+ {
+ if (python_completion) {
+      // The Py_DECREF usually happens in finish(): this path is only
+      // taken if we are destroyed without completing.
+ Gil gil(pThreadState, true);
+ Py_DECREF(python_completion);
+ python_completion = nullptr;
+ }
+ }
+
+ void finish(int r) override
+ {
+ assert(python_completion != nullptr);
+
+ dout(10) << "MonCommandCompletion::finish()" << dendl;
+ {
+ // Scoped so the Gil is released before calling notify_all()
+ // Create new thread state because this is called via the MonClient
+ // Finisher, not the PyModules finisher.
+ Gil gil(pThreadState, true);
+
+ auto set_fn = PyObject_GetAttrString(python_completion, "complete");
+ assert(set_fn != nullptr);
+
+ auto pyR = PyInt_FromLong(r);
+ auto pyOutBl = PyString_FromString(outbl.to_str().c_str());
+ auto pyOutS = PyString_FromString(outs.c_str());
+ auto args = PyTuple_Pack(3, pyR, pyOutBl, pyOutS);
+ Py_DECREF(pyR);
+ Py_DECREF(pyOutBl);
+ Py_DECREF(pyOutS);
+
+ auto rtn = PyObject_CallObject(set_fn, args);
+ if (rtn != nullptr) {
+ Py_DECREF(rtn);
+ }
+ Py_DECREF(args);
+ Py_DECREF(set_fn);
+
+ Py_DECREF(python_completion);
+ python_completion = nullptr;
+ }
+ py_modules->notify_all("command", tag);
+ }
+};
+
+
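+/**
+ * Send a JSON-formatted command to a mon, osd, mds or pg on behalf
+ * of a python module.  Args are (completion, type, name, cmd_json,
+ * tag); the completion object must expose a callable "complete".
+ */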
+static PyObject*
+ceph_send_command(BaseMgrModule *self, PyObject *args)
+{
+ // Like mon, osd, mds
+ char *type = nullptr;
+
+ // Like "23" for an OSD or "myid" for an MDS
+ char *name = nullptr;
+
+ char *cmd_json = nullptr;
+ char *tag = nullptr;
+ PyObject *completion = nullptr;
+ if (!PyArg_ParseTuple(args, "Ossss:ceph_send_command",
+ &completion, &type, &name, &cmd_json, &tag)) {
+ return nullptr;
+ }
+
+ auto set_fn = PyObject_GetAttrString(completion, "complete");
+ if (set_fn == nullptr) {
+ ceph_abort(); // TODO raise python exception instead
+ } else {
+ assert(PyCallable_Check(set_fn));
+ }
+ Py_DECREF(set_fn);
+
+ auto c = new MonCommandCompletion(self->py_modules,
+ completion, tag, PyThreadState_Get());
+ if (std::string(type) == "mon") {
+ self->py_modules->get_monc().start_mon_command(
+ {cmd_json},
+ {},
+ &c->outbl,
+ &c->outs,
+ c);
+ } else if (std::string(type) == "osd") {
+ std::string err;
+ uint64_t osd_id = strict_strtoll(name, 10, &err);
+ if (!err.empty()) {
+ delete c;
+ string msg("invalid osd_id: ");
+ msg.append("\"").append(name).append("\"");
+ PyErr_SetString(PyExc_ValueError, msg.c_str());
+ return nullptr;
+ }
+
+ ceph_tid_t tid;
+ self->py_modules->get_objecter().osd_command(
+ osd_id,
+ {cmd_json},
+ {},
+ &tid,
+ &c->outbl,
+ &c->outs,
+ c);
+ } else if (std::string(type) == "mds") {
+ int r = self->py_modules->get_client().mds_command(
+ name,
+ {cmd_json},
+ {},
+ &c->outbl,
+ &c->outs,
+ c);
+    if (r != 0) {
+      // the command was never dispatched, so the completion won't fire
+      delete c;
+      string msg("failed to send command to mds: ");
+      msg.append(cpp_strerror(r));
+      PyErr_SetString(PyExc_RuntimeError, msg.c_str());
+      return nullptr;
+    }
+ } else if (std::string(type) == "pg") {
+ pg_t pgid;
+ if (!pgid.parse(name)) {
+ delete c;
+ string msg("invalid pgid: ");
+ msg.append("\"").append(name).append("\"");
+ PyErr_SetString(PyExc_ValueError, msg.c_str());
+ return nullptr;
+ }
+
+ ceph_tid_t tid;
+ self->py_modules->get_objecter().pg_command(
+ pgid,
+ {cmd_json},
+ {},
+ &tid,
+ &c->outbl,
+ &c->outs,
+ c);
+ } else {
+ delete c;
+ string msg("unknown service type: ");
+ msg.append(type);
+ PyErr_SetString(PyExc_ValueError, msg.c_str());
+ return nullptr;
+ }
+
+ Py_RETURN_NONE;
+}
+
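+/**
+ * Replace this module's health checks.  Expects a dict shaped like:
+ *   {"CHECK_NAME": {"severity": "warning" or "error",
+ *                   "summary": "...",
+ *                   "detail": ["...", ...]}}
+ */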
+static PyObject*
+ceph_set_health_checks(BaseMgrModule *self, PyObject *args)
+{
+ PyObject *checks = NULL;
+ if (!PyArg_ParseTuple(args, "O:ceph_set_health_checks", &checks)) {
+ return NULL;
+ }
+ if (!PyDict_Check(checks)) {
+ derr << __func__ << " arg not a dict" << dendl;
+ Py_RETURN_NONE;
+ }
+ PyObject *checksls = PyDict_Items(checks);
+ health_check_map_t out_checks;
+ for (int i = 0; i < PyList_Size(checksls); ++i) {
+ PyObject *kv = PyList_GET_ITEM(checksls, i);
+ char *check_name = nullptr;
+ PyObject *check_info = nullptr;
+ if (!PyArg_ParseTuple(kv, "sO:pair", &check_name, &check_info)) {
+ derr << __func__ << " dict item " << i
+ << " not a size 2 tuple" << dendl;
+ continue;
+ }
+ if (!PyDict_Check(check_info)) {
+ derr << __func__ << " item " << i << " " << check_name
+ << " value not a dict" << dendl;
+ continue;
+ }
+ health_status_t severity = HEALTH_OK;
+ string summary;
+ list<string> detail;
+ PyObject *infols = PyDict_Items(check_info);
+ for (int j = 0; j < PyList_Size(infols); ++j) {
+ PyObject *pair = PyList_GET_ITEM(infols, j);
+ if (!PyTuple_Check(pair)) {
+ derr << __func__ << " item " << i << " pair " << j
+ << " not a tuple" << dendl;
+ continue;
+ }
+ char *k = nullptr;
+ PyObject *v = nullptr;
+ if (!PyArg_ParseTuple(pair, "sO:pair", &k, &v)) {
+ derr << __func__ << " item " << i << " pair " << j
+ << " not a size 2 tuple" << dendl;
+ continue;
+ }
+ string ks(k);
+ if (ks == "severity") {
+ if (!PyString_Check(v)) {
+ derr << __func__ << " check " << check_name
+ << " severity value not string" << dendl;
+ continue;
+ }
+ string vs(PyString_AsString(v));
+ if (vs == "warning") {
+ severity = HEALTH_WARN;
+ } else if (vs == "error") {
+ severity = HEALTH_ERR;
+ }
+ } else if (ks == "summary") {
+ if (!PyString_Check(v)) {
+ derr << __func__ << " check " << check_name
+ << " summary value not string" << dendl;
+ continue;
+ }
+ summary = PyString_AsString(v);
+ } else if (ks == "detail") {
+ if (!PyList_Check(v)) {
+ derr << __func__ << " check " << check_name
+ << " detail value not list" << dendl;
+ continue;
+ }
+	for (int n = 0; n < PyList_Size(v); ++n) {
+	  PyObject *di = PyList_GET_ITEM(v, n);
+	  if (!PyString_Check(di)) {
+	    derr << __func__ << " check " << check_name
+		 << " detail item " << n << " not a string" << dendl;
+	    continue;
+	  }
+	  detail.push_back(PyString_AsString(di));
+ }
+ } else {
+ derr << __func__ << " check " << check_name
+ << " unexpected key " << k << dendl;
+ }
+    }
+    Py_DECREF(infols);
+    auto& d = out_checks.add(check_name, severity, summary);
+    d.detail.swap(detail);
+  }
+  Py_DECREF(checksls);
+
+ JSONFormatter jf(true);
+ dout(10) << "module " << self->this_module->get_name()
+ << " health checks:\n";
+ out_checks.dump(&jf);
+ jf.flush(*_dout);
+ *_dout << dendl;
+
+ PyThreadState *tstate = PyEval_SaveThread();
+ self->py_modules->set_health_checks(self->this_module->get_name(),
+ std::move(out_checks));
+ PyEval_RestoreThread(tstate);
+
+ Py_RETURN_NONE;
+}
+
+
+static PyObject*
+ceph_state_get(BaseMgrModule *self, PyObject *args)
+{
+ char *what = NULL;
+ if (!PyArg_ParseTuple(args, "s:ceph_state_get", &what)) {
+ return NULL;
+ }
+
+ return self->py_modules->get_python(what);
+}
+
+
+static PyObject*
+ceph_get_server(BaseMgrModule *self, PyObject *args)
+{
+ char *hostname = NULL;
+ if (!PyArg_ParseTuple(args, "z:ceph_get_server", &hostname)) {
+ return NULL;
+ }
+
+ if (hostname) {
+ return self->py_modules->get_server_python(hostname);
+ } else {
+ return self->py_modules->list_servers_python();
+ }
+}
+
+static PyObject*
+ceph_get_mgr_id(BaseMgrModule *self, PyObject *args)
+{
+ return PyString_FromString(g_conf->name.get_id().c_str());
+}
+
+static PyObject*
+ceph_config_get(BaseMgrModule *self, PyObject *args)
+{
+ char *what = nullptr;
+ if (!PyArg_ParseTuple(args, "s:ceph_config_get", &what)) {
+ derr << "Invalid args!" << dendl;
+ return nullptr;
+ }
+
+ std::string value;
+ bool found = self->py_modules->get_config(self->this_module->get_name(),
+ what, &value);
+ if (found) {
+ dout(10) << "ceph_config_get " << what << " found: " << value.c_str() << dendl;
+ return PyString_FromString(value.c_str());
+ } else {
+ dout(4) << "ceph_config_get " << what << " not found " << dendl;
+ Py_RETURN_NONE;
+ }
+}
+
+static PyObject*
+ceph_config_get_prefix(BaseMgrModule *self, PyObject *args)
+{
+ char *prefix = nullptr;
+ if (!PyArg_ParseTuple(args, "s:ceph_config_get", &prefix)) {
+ derr << "Invalid args!" << dendl;
+ return nullptr;
+ }
+
+ return self->py_modules->get_config_prefix(self->this_module->get_name(),
+ prefix);
+}
+
+static PyObject*
+ceph_config_set(BaseMgrModule *self, PyObject *args)
+{
+ char *key = nullptr;
+ char *value = nullptr;
+ if (!PyArg_ParseTuple(args, "sz:ceph_config_set", &key, &value)) {
+ return nullptr;
+ }
+ boost::optional<string> val;
+ if (value) {
+ val = value;
+ }
+ self->py_modules->set_config(self->this_module->get_name(), key, val);
+
+ Py_RETURN_NONE;
+}
+
+static PyObject*
+get_metadata(BaseMgrModule *self, PyObject *args)
+{
+ char *svc_name = NULL;
+ char *svc_id = NULL;
+ if (!PyArg_ParseTuple(args, "ss:get_metadata", &svc_name, &svc_id)) {
+ return nullptr;
+ }
+ return self->py_modules->get_metadata_python(svc_name, svc_id);
+}
+
+static PyObject*
+get_daemon_status(BaseMgrModule *self, PyObject *args)
+{
+ char *svc_name = NULL;
+ char *svc_id = NULL;
+ if (!PyArg_ParseTuple(args, "ss:get_daemon_status", &svc_name,
+ &svc_id)) {
+ return nullptr;
+ }
+ return self->py_modules->get_daemon_status_python(svc_name, svc_id);
+}
+
+static PyObject*
+ceph_log(BaseMgrModule *self, PyObject *args)
+{
+
+ int level = 0;
+ char *record = nullptr;
+ if (!PyArg_ParseTuple(args, "is:log", &level, &record)) {
+ return nullptr;
+ }
+
+ assert(self->this_module);
+
+ self->this_module->log(level, record);
+
+ Py_RETURN_NONE;
+}
+
+static PyObject *
+ceph_get_version(BaseMgrModule *self, PyObject *args)
+{
+ return PyString_FromString(pretty_version_to_str().c_str());
+}
+
+static PyObject *
+ceph_get_context(BaseMgrModule *self, PyObject *args)
+{
+ return self->py_modules->get_context();
+}
+
+static PyObject*
+get_counter(BaseMgrModule *self, PyObject *args)
+{
+ char *svc_name = nullptr;
+ char *svc_id = nullptr;
+ char *counter_path = nullptr;
+ if (!PyArg_ParseTuple(args, "sss:get_counter", &svc_name,
+ &svc_id, &counter_path)) {
+ return nullptr;
+ }
+ return self->py_modules->get_counter_python(
+ svc_name, svc_id, counter_path);
+}
+
+static PyObject*
+get_perf_schema(BaseMgrModule *self, PyObject *args)
+{
+ char *type_str = nullptr;
+ char *svc_id = nullptr;
+ if (!PyArg_ParseTuple(args, "ss:get_perf_schema", &type_str,
+ &svc_id)) {
+ return nullptr;
+ }
+
+ return self->py_modules->get_perf_schema_python(type_str, svc_id);
+}
+
+static PyObject *
+ceph_get_osdmap(BaseMgrModule *self, PyObject *args)
+{
+ return self->py_modules->get_osdmap();
+}
+
+static PyObject*
+ceph_set_uri(BaseMgrModule *self, PyObject *args)
+{
+ char *svc_str = nullptr;
+ if (!PyArg_ParseTuple(args, "s:ceph_advertize_service",
+ &svc_str)) {
+ return nullptr;
+ }
+
+  // We call down into ActivePyModules even though we have an
+  // ActivePyModule reference here, because the module's fields are
+  // protected by ActivePyModules' lock.
+ PyThreadState *tstate = PyEval_SaveThread();
+ self->py_modules->set_uri(self->this_module->get_name(), svc_str);
+ PyEval_RestoreThread(tstate);
+
+ Py_RETURN_NONE;
+}
+
+
+PyMethodDef BaseMgrModule_methods[] = {
+ {"_ceph_get", (PyCFunction)ceph_state_get, METH_VARARGS,
+ "Get a cluster object"},
+
+ {"_ceph_get_server", (PyCFunction)ceph_get_server, METH_VARARGS,
+ "Get a server object"},
+
+ {"_ceph_get_metadata", (PyCFunction)get_metadata, METH_VARARGS,
+ "Get a service's metadata"},
+
+ {"_ceph_get_daemon_status", (PyCFunction)get_daemon_status, METH_VARARGS,
+ "Get a service's status"},
+
+ {"_ceph_send_command", (PyCFunction)ceph_send_command, METH_VARARGS,
+ "Send a mon command"},
+
+ {"_ceph_set_health_checks", (PyCFunction)ceph_set_health_checks, METH_VARARGS,
+ "Set health checks for this module"},
+
+ {"_ceph_get_mgr_id", (PyCFunction)ceph_get_mgr_id, METH_NOARGS,
+ "Get the name of the Mgr daemon where we are running"},
+
+ {"_ceph_get_config", (PyCFunction)ceph_config_get, METH_VARARGS,
+ "Get a configuration value"},
+
+ {"_ceph_get_config_prefix", (PyCFunction)ceph_config_get_prefix, METH_VARARGS,
+ "Get all configuration values with a given prefix"},
+
+ {"_ceph_set_config", (PyCFunction)ceph_config_set, METH_VARARGS,
+ "Set a configuration value"},
+
+ {"_ceph_get_counter", (PyCFunction)get_counter, METH_VARARGS,
+ "Get a performance counter"},
+
+ {"_ceph_get_perf_schema", (PyCFunction)get_perf_schema, METH_VARARGS,
+ "Get the performance counter schema"},
+
+ {"_ceph_log", (PyCFunction)ceph_log, METH_VARARGS,
+ "Emit a (local) log message"},
+
+ {"_ceph_get_version", (PyCFunction)ceph_get_version, METH_VARARGS,
+ "Get the ceph version of this process"},
+
+ {"_ceph_get_context", (PyCFunction)ceph_get_context, METH_NOARGS,
+ "Get a CephContext* in a python capsule"},
+
+ {"_ceph_get_osdmap", (PyCFunction)ceph_get_osdmap, METH_NOARGS,
+ "Get an OSDMap* in a python capsule"},
+
+ {"_ceph_set_uri", (PyCFunction)ceph_set_uri, METH_VARARGS,
+ "Advertize a service URI served by this module"},
+
+ {NULL, NULL, 0, NULL}
+};
+
+
+static PyObject *
+BaseMgrModule_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
+{
+ BaseMgrModule *self;
+
+ self = (BaseMgrModule *)type->tp_alloc(type, 0);
+
+ return (PyObject *)self;
+}
+
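+// Unpack the two PyCapsules handed to us by the C++ side, giving this
+// instance its pointers back to ActivePyModules and its own
+// ActivePyModule.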
+static int
+BaseMgrModule_init(BaseMgrModule *self, PyObject *args, PyObject *kwds)
+{
+ PyObject *py_modules_capsule = nullptr;
+ PyObject *this_module_capsule = nullptr;
+ static const char *kwlist[] = {"py_modules", "this_module", NULL};
+
+ if (! PyArg_ParseTupleAndKeywords(args, kwds, "OO",
+ const_cast<char**>(kwlist),
+ &py_modules_capsule,
+ &this_module_capsule)) {
+ return -1;
+ }
+
+ self->py_modules = (ActivePyModules*)PyCapsule_GetPointer(
+ py_modules_capsule, nullptr);
+ assert(self->py_modules);
+ self->this_module = (ActivePyModule*)PyCapsule_GetPointer(
+ this_module_capsule, nullptr);
+ assert(self->this_module);
+
+ return 0;
+}
+
+PyTypeObject BaseMgrModuleType = {
+ PyVarObject_HEAD_INIT(NULL, 0)
+ "ceph_module.BaseMgrModule", /* tp_name */
+ sizeof(BaseMgrModule), /* tp_basicsize */
+ 0, /* tp_itemsize */
+ 0, /* tp_dealloc */
+ 0, /* tp_print */
+ 0, /* tp_getattr */
+ 0, /* tp_setattr */
+ 0, /* tp_compare */
+ 0, /* tp_repr */
+ 0, /* tp_as_number */
+ 0, /* tp_as_sequence */
+ 0, /* tp_as_mapping */
+ 0, /* tp_hash */
+ 0, /* tp_call */
+ 0, /* tp_str */
+ 0, /* tp_getattro */
+ 0, /* tp_setattro */
+ 0, /* tp_as_buffer */
+ Py_TPFLAGS_DEFAULT | Py_TPFLAGS_BASETYPE, /* tp_flags */
+ "ceph-mgr Python Plugin", /* tp_doc */
+ 0, /* tp_traverse */
+ 0, /* tp_clear */
+ 0, /* tp_richcompare */
+ 0, /* tp_weaklistoffset */
+ 0, /* tp_iter */
+ 0, /* tp_iternext */
+ BaseMgrModule_methods, /* tp_methods */
+ 0, /* tp_members */
+ 0, /* tp_getset */
+ 0, /* tp_base */
+ 0, /* tp_dict */
+ 0, /* tp_descr_get */
+ 0, /* tp_descr_set */
+ 0, /* tp_dictoffset */
+ (initproc)BaseMgrModule_init, /* tp_init */
+ 0, /* tp_alloc */
+ BaseMgrModule_new, /* tp_new */
+};
+
--- /dev/null
+
+#pragma once
+
+#include "Python.h"
+
+extern PyTypeObject BaseMgrModuleType;
+
--- /dev/null
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2016 John Spray <john.spray@redhat.com>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ */
+
+
+#include "BaseMgrStandbyModule.h"
+
+#include "StandbyPyModules.h"
+
+
+#define dout_context g_ceph_context
+#define dout_subsys ceph_subsys_mgr
+
+typedef struct {
+ PyObject_HEAD
+ StandbyPyModule *this_module;
+} BaseMgrStandbyModule;
+
+static PyObject *
+BaseMgrStandbyModule_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
+{
+ BaseMgrStandbyModule *self;
+
+ self = (BaseMgrStandbyModule *)type->tp_alloc(type, 0);
+
+ return (PyObject *)self;
+}
+
+static int
+BaseMgrStandbyModule_init(BaseMgrStandbyModule *self, PyObject *args, PyObject *kwds)
+{
+ PyObject *this_module_capsule = nullptr;
+ static const char *kwlist[] = {"this_module", NULL};
+
+ if (! PyArg_ParseTupleAndKeywords(args, kwds, "O",
+ const_cast<char**>(kwlist),
+ &this_module_capsule)) {
+ return -1;
+ }
+
+ self->this_module = (StandbyPyModule*)PyCapsule_GetPointer(
+ this_module_capsule, nullptr);
+ assert(self->this_module);
+
+ return 0;
+}
+
+static PyObject*
+ceph_get_mgr_id(BaseMgrStandbyModule *self, PyObject *args)
+{
+ return PyString_FromString(g_conf->name.get_id().c_str());
+}
+
+static PyObject*
+ceph_config_get(BaseMgrStandbyModule *self, PyObject *args)
+{
+ char *what = nullptr;
+ if (!PyArg_ParseTuple(args, "s:ceph_config_get", &what)) {
+ derr << "Invalid args!" << dendl;
+ return nullptr;
+ }
+
+ std::string value;
+ bool found = self->this_module->get_config(what, &value);
+ if (found) {
+ dout(10) << "ceph_config_get " << what << " found: " << value.c_str() << dendl;
+ return PyString_FromString(value.c_str());
+ } else {
+ dout(4) << "ceph_config_get " << what << " not found " << dendl;
+ Py_RETURN_NONE;
+ }
+}
+
+static PyObject*
+ceph_get_active_uri(BaseMgrStandbyModule *self, PyObject *args)
+{
+ return PyString_FromString(self->this_module->get_active_uri().c_str());
+}
+
+static PyObject*
+ceph_log(BaseMgrStandbyModule *self, PyObject *args)
+{
+ int level = 0;
+ char *record = nullptr;
+ if (!PyArg_ParseTuple(args, "is:log", &level, &record)) {
+ return nullptr;
+ }
+
+ assert(self->this_module);
+
+ self->this_module->log(level, record);
+
+ Py_RETURN_NONE;
+}
+
+PyMethodDef BaseMgrStandbyModule_methods[] = {
+
+ {"_ceph_get_mgr_id", (PyCFunction)ceph_get_mgr_id, METH_NOARGS,
+ "Get the name of the Mgr daemon where we are running"},
+
+ {"_ceph_get_config", (PyCFunction)ceph_config_get, METH_VARARGS,
+ "Get a configuration value"},
+
+ {"_ceph_get_active_uri", (PyCFunction)ceph_get_active_uri, METH_NOARGS,
+ "Get the URI of the active instance of this module, if any"},
+
+ {"_ceph_log", (PyCFunction)ceph_log, METH_VARARGS,
+ "Emit a log message"},
+
+ {NULL, NULL, 0, NULL}
+};
+
+PyTypeObject BaseMgrStandbyModuleType = {
+ PyVarObject_HEAD_INIT(NULL, 0)
+ "ceph_module.BaseMgrStandbyModule", /* tp_name */
+ sizeof(BaseMgrStandbyModule), /* tp_basicsize */
+ 0, /* tp_itemsize */
+ 0, /* tp_dealloc */
+ 0, /* tp_print */
+ 0, /* tp_getattr */
+ 0, /* tp_setattr */
+ 0, /* tp_compare */
+ 0, /* tp_repr */
+ 0, /* tp_as_number */
+ 0, /* tp_as_sequence */
+ 0, /* tp_as_mapping */
+ 0, /* tp_hash */
+ 0, /* tp_call */
+ 0, /* tp_str */
+ 0, /* tp_getattro */
+ 0, /* tp_setattro */
+ 0, /* tp_as_buffer */
+ Py_TPFLAGS_DEFAULT | Py_TPFLAGS_BASETYPE, /* tp_flags */
+ "ceph-mgr Standby Python Plugin", /* tp_doc */
+ 0, /* tp_traverse */
+ 0, /* tp_clear */
+ 0, /* tp_richcompare */
+ 0, /* tp_weaklistoffset */
+ 0, /* tp_iter */
+ 0, /* tp_iternext */
+ BaseMgrStandbyModule_methods, /* tp_methods */
+ 0, /* tp_members */
+ 0, /* tp_getset */
+ 0, /* tp_base */
+ 0, /* tp_dict */
+ 0, /* tp_descr_get */
+ 0, /* tp_descr_set */
+ 0, /* tp_dictoffset */
+ (initproc)BaseMgrStandbyModule_init, /* tp_init */
+ 0, /* tp_alloc */
+ BaseMgrStandbyModule_new, /* tp_new */
+};
--- /dev/null
+
+#pragma once
+
+#include "Python.h"
+
+extern PyTypeObject BaseMgrStandbyModuleType;
+
Finisher &finisher_,
DaemonStateIndex &daemon_state_,
ClusterState &cluster_state_,
- PyModules &py_modules_,
+ PyModuleRegistry &py_modules_,
LogChannelRef clog_,
LogChannelRef audit_clog_)
: Dispatcher(g_ceph_context),
client_byte_throttler(new Throttle(g_ceph_context, "mgr_client_bytes",
- g_conf->mgr_client_bytes)),
+ g_conf->get_val<uint64_t>("mgr_client_bytes"))),
client_msg_throttler(new Throttle(g_ceph_context, "mgr_client_messages",
- g_conf->mgr_client_messages)),
+ g_conf->get_val<uint64_t>("mgr_client_messages"))),
osd_byte_throttler(new Throttle(g_ceph_context, "mgr_osd_bytes",
- g_conf->mgr_osd_bytes)),
+ g_conf->get_val<uint64_t>("mgr_osd_bytes"))),
osd_msg_throttler(new Throttle(g_ceph_context, "mgr_osd_messsages",
- g_conf->mgr_osd_messages)),
+ g_conf->get_val<uint64_t>("mgr_osd_messages"))),
mds_byte_throttler(new Throttle(g_ceph_context, "mgr_mds_bytes",
- g_conf->mgr_mds_bytes)),
+ g_conf->get_val<uint64_t>("mgr_mds_bytes"))),
mds_msg_throttler(new Throttle(g_ceph_context, "mgr_mds_messsages",
- g_conf->mgr_mds_messages)),
+ g_conf->get_val<uint64_t>("mgr_mds_messages"))),
mon_byte_throttler(new Throttle(g_ceph_context, "mgr_mon_bytes",
- g_conf->mgr_mon_bytes)),
+ g_conf->get_val<uint64_t>("mgr_mon_bytes"))),
mon_msg_throttler(new Throttle(g_ceph_context, "mgr_mon_messsages",
- g_conf->mgr_mon_messages)),
+ g_conf->get_val<uint64_t>("mgr_mon_messages"))),
msgr(nullptr),
monc(monc_),
finisher(finisher_),
g_conf->auth_supported.empty() ?
g_conf->auth_cluster_required :
g_conf->auth_supported),
- lock("DaemonServer")
-{}
+ lock("DaemonServer"),
+ pgmap_ready(false)
+{
+ g_conf->add_observer(this);
+}
DaemonServer::~DaemonServer() {
delete msgr;
+ g_conf->remove_observer(this);
}
int DaemonServer::init(uint64_t gid, entity_addr_t client_addr)
dout(10) << "unregistering osd." << session->osd_id
<< " session " << session << " con " << con << dendl;
osd_cons[session->osd_id].erase(con);
+
+ auto iter = daemon_connections.find(con);
+ if (iter != daemon_connections.end()) {
+ daemon_connections.erase(iter);
+ }
}
return false;
}
bool DaemonServer::ms_dispatch(Message *m)
{
- Mutex::Locker l(lock);
-
+ // Note that we do *not* take ::lock here, in order to avoid
+ // serializing all message handling. It's up to each handler
+ // to take whatever locks it needs.
switch (m->get_type()) {
case MSG_PGSTATS:
cluster_state.ingest_pgstats(static_cast<MPGStats*>(m));
void DaemonServer::maybe_ready(int32_t osd_id)
{
- if (!pgmap_ready && reported_osds.find(osd_id) == reported_osds.end()) {
- dout(4) << "initial report from osd " << osd_id << dendl;
- reported_osds.insert(osd_id);
- std::set<int32_t> up_osds;
+ if (pgmap_ready.load()) {
+ // Fast path: we don't need to take lock because pgmap_ready
+ // is already set
+ } else {
+ Mutex::Locker l(lock);
- cluster_state.with_osdmap([&](const OSDMap& osdmap) {
- osdmap.get_up_osds(up_osds);
- });
+ if (reported_osds.find(osd_id) == reported_osds.end()) {
+ dout(4) << "initial report from osd " << osd_id << dendl;
+ reported_osds.insert(osd_id);
+ std::set<int32_t> up_osds;
- std::set<int32_t> unreported_osds;
- std::set_difference(up_osds.begin(), up_osds.end(),
- reported_osds.begin(), reported_osds.end(),
- std::inserter(unreported_osds, unreported_osds.begin()));
+ cluster_state.with_osdmap([&](const OSDMap& osdmap) {
+ osdmap.get_up_osds(up_osds);
+ });
- if (unreported_osds.size() == 0) {
- dout(4) << "all osds have reported, sending PG state to mon" << dendl;
- pgmap_ready = true;
- reported_osds.clear();
- // Avoid waiting for next tick
- send_report();
- } else {
- dout(4) << "still waiting for " << unreported_osds.size() << " osds"
- " to report in before PGMap is ready" << dendl;
+ std::set<int32_t> unreported_osds;
+ std::set_difference(up_osds.begin(), up_osds.end(),
+ reported_osds.begin(), reported_osds.end(),
+ std::inserter(unreported_osds, unreported_osds.begin()));
+
+ if (unreported_osds.size() == 0) {
+ dout(4) << "all osds have reported, sending PG state to mon" << dendl;
+ pgmap_ready = true;
+ reported_osds.clear();
+ // Avoid waiting for next tick
+ send_report();
+ } else {
+ dout(4) << "still waiting for " << unreported_osds.size() << " osds"
+ " to report in before PGMap is ready" << dendl;
+ }
}
}
}
bool DaemonServer::handle_open(MMgrOpen *m)
{
+ Mutex::Locker l(lock);
+
DaemonKey key;
if (!m->service_name.empty()) {
key.first = m->service_name;
dout(4) << "from " << m->get_connection() << " " << key << dendl;
- auto configure = new MMgrConfigure();
- configure->stats_period = g_conf->mgr_stats_period;
- m->get_connection()->send_message(configure);
+ _send_configure(m->get_connection());
DaemonStatePtr daemon;
if (daemon_state.exists(key)) {
if (daemon) {
dout(20) << "updating existing DaemonState for " << m->daemon_name << dendl;
Mutex::Locker l(daemon->lock);
- daemon_state.get(key)->perf_counters.clear();
+ daemon->perf_counters.clear();
}
if (m->service_daemon) {
}
}
+ if (m->get_connection()->get_peer_type() != entity_name_t::TYPE_CLIENT &&
+ m->service_name.empty())
+ {
+ // Store in set of the daemon/service connections, i.e. those
+ // connections that require an update in the event of stats
+ // configuration changes.
+ daemon_connections.insert(m->get_connection());
+ }
+
m->put();
return true;
}
return true;
}
+ // Look up the DaemonState
DaemonStatePtr daemon;
if (daemon_state.exists(key)) {
dout(20) << "updating existing DaemonState for " << key << dendl;
// daemons without sessions, and ensuring that session open
// always contains metadata.
}
+
+ // Update the DaemonState
assert(daemon != nullptr);
- auto &daemon_counters = daemon->perf_counters;
{
Mutex::Locker l(daemon->lock);
+ auto &daemon_counters = daemon->perf_counters;
daemon_counters.update(m);
+
+ if (daemon->service_daemon) {
+ utime_t now = ceph_clock_now();
+ if (m->daemon_status) {
+ daemon->service_status = *m->daemon_status;
+ daemon->service_status_stamp = now;
+ }
+ daemon->last_service_beacon = now;
+ } else if (m->daemon_status) {
+ derr << "got status from non-daemon " << key << dendl;
+ }
}
+
// if there are any schema updates, notify the python modules
if (!m->declare_types.empty() || !m->undeclare_types.empty()) {
ostringstream oss;
py_modules.notify_all("perf_schema_update", oss.str());
}
- if (daemon->service_daemon) {
- utime_t now = ceph_clock_now();
- if (m->daemon_status) {
- daemon->service_status = *m->daemon_status;
- daemon->service_status_stamp = now;
- }
- daemon->last_service_beacon = now;
- } else if (m->daemon_status) {
- derr << "got status from non-daemon " << key << dendl;
- }
-
m->put();
return true;
}
bool DaemonServer::handle_command(MCommand *m)
{
+ Mutex::Locker l(lock);
int r = 0;
std::stringstream ss;
std::string prefix;
return true;
}
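+  // Runtime-only update of a mgr config option; the change is not
+  // persisted (see the "config set" entry in the mgr command table).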
+ if (prefix == "config set") {
+ std::string key;
+ std::string val;
+ cmd_getval(cct, cmdctx->cmdmap, "key", key);
+ cmd_getval(cct, cmdctx->cmdmap, "value", val);
+ r = cct->_conf->set_val(key, val, true, &ss);
+ if (r == 0) {
+ cct->_conf->apply_changes(nullptr);
+ }
+    cmdctx->reply(r, ss);
+ return true;
+ }
+
// -----------
// PG commands
}
break;
case OFR_BACKFILL:
- if ((workpg.state & (PG_STATE_DEGRADED | PG_STATE_BACKFILL_WAIT | PG_STATE_BACKFILL)) == 0) {
+ if ((workpg.state & (PG_STATE_DEGRADED | PG_STATE_BACKFILL_WAIT | PG_STATE_BACKFILLING)) == 0) {
ss << "pg " << pstr << " doesn't require backfilling; ";
continue;
} else if (workpg.state & PG_STATE_FORCED_BACKFILL) {
}
// None of the special native commands,
- MgrPyModule *handler = nullptr;
+ ActivePyModule *handler = nullptr;
auto py_commands = py_modules.get_py_commands();
for (const auto &pyc : py_commands) {
auto pyc_prefix = cmddesc_get_prefix(pyc.cmdstring);
void DaemonServer::_prune_pending_service_map()
{
utime_t cutoff = ceph_clock_now();
- cutoff -= g_conf->mgr_service_beacon_grace;
+ cutoff -= g_conf->get_val<double>("mgr_service_beacon_grace");
auto p = pending_service_map.services.begin();
while (p != pending_service_map.services.end()) {
auto q = p->second.daemons.begin();
void DaemonServer::send_report()
{
if (!pgmap_ready) {
- if (ceph_clock_now() - started_at > g_conf->mgr_stats_period * 4.0) {
+ if (ceph_clock_now() - started_at > g_conf->get_val<int64_t>("mgr_stats_period") * 4.0) {
pgmap_ready = true;
reported_osds.clear();
dout(1) << "Giving up on OSDs that haven't reported yet, sending "
daemon_state.cull(p.first, names);
}
}
+
+
+const char** DaemonServer::get_tracked_conf_keys() const
+{
+ static const char *KEYS[] = {
+ "mgr_stats_threshold",
+ "mgr_stats_period",
+ nullptr
+ };
+
+ return KEYS;
+}
+
+void DaemonServer::handle_conf_change(const struct md_config_t *conf,
+ const std::set <std::string> &changed)
+{
+ dout(4) << "ohai" << dendl;
+ // We may be called within lock (via MCommand `config set`) or outwith the
+ // lock (via admin socket `config set`), so handle either case.
+ const bool initially_locked = lock.is_locked_by_me();
+ if (!initially_locked) {
+ lock.Lock();
+ }
+
+ if (changed.count("mgr_stats_threshold") || changed.count("mgr_stats_period")) {
+ dout(4) << "Updating stats threshold/period on "
+ << daemon_connections.size() << " clients" << dendl;
+ // Send a fresh MMgrConfigure to all clients, so that they can follow
+ // the new policy for transmitting stats
+ for (auto &c : daemon_connections) {
+ _send_configure(c);
+ }
+ }
+}
+
+void DaemonServer::_send_configure(ConnectionRef c)
+{
+ assert(lock.is_locked_by_me());
+
+ auto configure = new MMgrConfigure();
+ configure->stats_period = g_conf->get_val<int64_t>("mgr_stats_period");
+ configure->stats_threshold = g_conf->get_val<int64_t>("mgr_stats_threshold");
+ c->send_message(configure);
+}
+
#ifndef DAEMON_SERVER_H_
#define DAEMON_SERVER_H_
-#include "PyModules.h"
+#include "PyModuleRegistry.h"
#include <set>
#include <string>
* Server used in ceph-mgr to communicate with Ceph daemons like
* MDSs and OSDs.
*/
-class DaemonServer : public Dispatcher
+class DaemonServer : public Dispatcher, public md_config_obs_t
{
protected:
boost::scoped_ptr<Throttle> client_byte_throttler;
Finisher &finisher;
DaemonStateIndex &daemon_state;
ClusterState &cluster_state;
- PyModules &py_modules;
+ PyModuleRegistry &py_modules;
LogChannelRef clog, audit_clog;
AuthAuthorizeHandlerRegistry auth_registry;
+ // Connections for daemons, and clients with service names set
+ // (i.e. those MgrClients that are allowed to send MMgrReports)
+ std::set<ConnectionRef> daemon_connections;
+
/// connections for osds
ceph::unordered_map<int,set<ConnectionRef>> osd_cons;
ServiceMap pending_service_map; // uncommitted
+
epoch_t pending_service_map_dirty = 0;
Mutex lock;
void _prune_pending_service_map();
utime_t started_at;
- bool pgmap_ready = false;
+ std::atomic<bool> pgmap_ready;
std::set<int32_t> reported_osds;
void maybe_ready(int32_t osd_id);
Finisher &finisher_,
DaemonStateIndex &daemon_state_,
ClusterState &cluster_state_,
- PyModules &py_modules_,
+ PyModuleRegistry &py_modules_,
LogChannelRef cl,
LogChannelRef auditcl);
~DaemonServer() override;
bool handle_command(MCommand *m);
void send_report();
void got_service_map();
+
+ void _send_configure(ConnectionRef c);
+
+  const char** get_tracked_conf_keys() const override;
+  void handle_conf_change(const struct md_config_t *conf,
+                          const std::set<std::string> &changed) override;
};
#endif
#include "DaemonState.h"
+#include "MgrSession.h"
+
#define dout_context g_ceph_context
#define dout_subsys ceph_subsys_mgr
#undef dout_prefix
void DaemonStateIndex::insert(DaemonStatePtr dm)
{
- Mutex::Locker l(lock);
+ RWLock::WLocker l(lock);
if (all.count(dm->key)) {
_erase(dm->key);
void DaemonStateIndex::_erase(const DaemonKey& dmk)
{
- assert(lock.is_locked_by_me());
+ assert(lock.is_wlocked());
const auto to_erase = all.find(dmk);
assert(to_erase != all.end());
DaemonStateCollection DaemonStateIndex::get_by_service(
const std::string& svc) const
{
- Mutex::Locker l(lock);
+ RWLock::RLocker l(lock);
DaemonStateCollection result;
DaemonStateCollection DaemonStateIndex::get_by_server(
const std::string &hostname) const
{
- Mutex::Locker l(lock);
+ RWLock::RLocker l(lock);
if (by_server.count(hostname)) {
return by_server.at(hostname);
bool DaemonStateIndex::exists(const DaemonKey &key) const
{
- Mutex::Locker l(lock);
+ RWLock::RLocker l(lock);
return all.count(key) > 0;
}
DaemonStatePtr DaemonStateIndex::get(const DaemonKey &key)
{
- Mutex::Locker l(lock);
+ RWLock::RLocker l(lock);
- return all.at(key);
+ auto iter = all.find(key);
+ if (iter != all.end()) {
+ return iter->second;
+ } else {
+ return nullptr;
+ }
}
void DaemonStateIndex::cull(const std::string& svc_name,
{
std::vector<string> victims;
- Mutex::Locker l(lock);
+ RWLock::WLocker l(lock);
auto begin = all.lower_bound({svc_name, ""});
auto end = all.end();
for (auto &i = begin; i != end; ++i) {
<< types.size() << " types, got "
<< report->packed.length() << " bytes of data" << dendl;
+ // Retrieve session state
+ MgrSessionRef session(static_cast<MgrSession*>(
+ report->get_connection()->get_priv()));
+
// Load any newly declared types
for (const auto &t : report->declare_types) {
types.insert(std::make_pair(t.path, t));
- declared_types.insert(t.path);
+ session->declared_types.insert(t.path);
}
// Remove any old types
for (const auto &t : report->undeclare_types) {
- declared_types.erase(t);
+ session->declared_types.erase(t);
}
const auto now = ceph_clock_now();
// Parse packed data according to declared set of types
bufferlist::iterator p = report->packed.begin();
DECODE_START(1, p);
- for (const auto &t_path : declared_types) {
+ for (const auto &t_path : session->declared_types) {
const auto &t = types.at(t_path);
uint64_t val = 0;
uint64_t avgcount = 0;
#include <set>
#include <boost/circular_buffer.hpp>
-#include "common/Mutex.h"
+#include "common/RWLock.h"
#include "msg/msg_types.h"
std::map<std::string, PerfCounterInstance> instances;
- // FIXME: this state is really local to DaemonServer, it's part
- // of the protocol rather than being part of what other classes
- // mgiht want to read. Maybe have a separate session object
- // inside DaemonServer instead of stashing session-ish state here?
- std::set<std::string> declared_types;
-
void update(MMgrReport *report);
void clear()
{
instances.clear();
- declared_types.clear();
}
};
class DaemonStateIndex
{
private:
+ mutable RWLock lock = {"DaemonStateIndex", true, true, true};
+
std::map<std::string, DaemonStateCollection> by_server;
DaemonStateCollection all;
-
std::set<DaemonKey> updating;
- mutable Mutex lock;
+ void _erase(const DaemonKey& dmk);
public:
-
- DaemonStateIndex() : lock("DaemonState") {}
+ DaemonStateIndex() {}
// FIXME: shouldn't really be public, maybe construct DaemonState
// objects internally to avoid this.
PerfCounterTypes types;
void insert(DaemonStatePtr dm);
- void _erase(const DaemonKey& dmk);
-
bool exists(const DaemonKey &key) const;
DaemonStatePtr get(const DaemonKey &key);
+
+ // Note that these return by value rather than reference to avoid
+ // callers needing to stay in lock while using result. Callers must
+ // still take the individual DaemonState::lock on each entry though.
DaemonStateCollection get_by_server(const std::string &hostname) const;
DaemonStateCollection get_by_service(const std::string &svc_name) const;
-
- const DaemonStateCollection &get_all() const {return all;}
- const std::map<std::string, DaemonStateCollection> &get_all_servers() const
- {
- return by_server;
+ DaemonStateCollection get_all() const {return all;}
+
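+  // Run a callback against the server -> daemons map while holding
+  // the read lock, rather than handing out a reference.  A minimal
+  // sketch of a caller (hypothetical):
+  //   auto n = daemon_state.with_daemons_by_server(
+  //       [](const std::map<std::string, DaemonStateCollection> &m) {
+  //         return m.size();
+  //       });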
+ template<typename Callback, typename...Args>
+ auto with_daemons_by_server(Callback&& cb, Args&&... args) const ->
+ decltype(cb(by_server, std::forward<Args>(args)...)) {
+ RWLock::RLocker l(lock);
+
+ return std::forward<Callback>(cb)(by_server, std::forward<Args>(args)...);
}
- void notify_updating(const DaemonKey &k) { updating.insert(k); }
- void clear_updating(const DaemonKey &k) { updating.erase(k); }
- bool is_updating(const DaemonKey &k) { return updating.count(k) > 0; }
+ void notify_updating(const DaemonKey &k) {
+ RWLock::WLocker l(lock);
+ updating.insert(k);
+ }
+ void clear_updating(const DaemonKey &k) {
+ RWLock::WLocker l(lock);
+ updating.erase(k);
+ }
+ bool is_updating(const DaemonKey &k) {
+ RWLock::RLocker l(lock);
+ return updating.count(k) > 0;
+ }
/**
* Remove state for all daemons of this type whose names are
--- /dev/null
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2017 SUSE LLC
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+
+
+#include "Python.h"
+
+#include "common/debug.h"
+
+#define dout_context g_ceph_context
+#define dout_subsys ceph_subsys_mgr
+#undef dout_prefix
+#define dout_prefix *_dout << "mgr " << __func__ << " "
+
+#include "Gil.h"
+
+SafeThreadState::SafeThreadState(PyThreadState *ts_)
+ : ts(ts_)
+{
+ assert(ts != nullptr);
+ thread = pthread_self();
+}
+
+Gil::Gil(SafeThreadState &ts, bool new_thread) : pThreadState(ts)
+{
+ // Acquire the GIL, set the current thread state
+ PyEval_RestoreThread(pThreadState.ts);
+ dout(25) << "GIL acquired for thread state " << pThreadState.ts << dendl;
+
+ //
+ // If called from a separate OS thread (i.e. a thread not created
+  // by Python, that doesn't already have a python thread state that
+ // was created when that thread was active), we need to manually
+ // create and switch to a python thread state specifically for this
+ // OS thread.
+ //
+  // Note that instead of requiring the caller to set new_thread == true
+ // when calling this from a separate OS thread, we could figure out
+ // if this was necessary automatically, as follows:
+ //
+ // if (pThreadState->thread_id != PyThread_get_thread_ident()) {
+ //
+ // However, this means we're accessing pThreadState->thread_id, but
+ // the Python C API docs say that "The only public data member is
+ // PyInterpreterState *interp", i.e. doing this would violate
+ // something that's meant to be a black box.
+ //
+ if (new_thread) {
+ pNewThreadState = PyThreadState_New(pThreadState.ts->interp);
+ PyThreadState_Swap(pNewThreadState);
+ dout(20) << "Switched to new thread state " << pNewThreadState << dendl;
+ } else {
+ assert(pthread_self() == pThreadState.thread);
+ }
+}
+
+Gil::~Gil()
+{
+ if (pNewThreadState != nullptr) {
+ dout(20) << "Destroying new thread state " << pNewThreadState << dendl;
+ PyThreadState_Swap(pThreadState.ts);
+ PyThreadState_Clear(pNewThreadState);
+ PyThreadState_Delete(pNewThreadState);
+ }
+ // Release the GIL, reset the thread state to NULL
+ PyEval_SaveThread();
+ dout(25) << "GIL released for thread state " << pThreadState.ts << dendl;
+}
+
*
*/
-#ifndef GIL_H_
-#define GIL_H_
+#pragma once
-#include "Python.h"
+struct _ts;
+typedef struct _ts PyThreadState;
-#include "common/debug.h"
+#include <pthread.h>
-#define dout_context g_ceph_context
-#define dout_subsys ceph_subsys_mgr
-#undef dout_prefix
-#define dout_prefix *_dout << "mgr " << __func__ << " "
+
+/**
+ * Wrap PyThreadState to carry a record of which POSIX thread
+ * the thread state relates to. This allows the Gil class to
+ * validate that we're being used from the right thread.
+ */
+class SafeThreadState
+{
+ public:
+ SafeThreadState(PyThreadState *ts_);
+
+ SafeThreadState()
+ : ts(nullptr), thread(0)
+ {
+ }
+
+ PyThreadState *ts;
+ pthread_t thread;
+
+ void set(PyThreadState *ts_)
+ {
+ ts = ts_;
+ thread = pthread_self();
+ }
+};
//
// Use one of these in any scope in which you need to hold Python's
// If in doubt, explicitly put a scope around the block of code you
// know you need the GIL in.
//
-// See the comment below for when to set new_thread == true
+// See the comment in Gil::Gil for when to set new_thread == true
//
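+// A minimal usage sketch (hypothetical caller, assuming a
+// SafeThreadState member named pMyThreadState):
+//   {
+//     Gil gil(pMyThreadState);   // acquires the GIL for this scope
+//     // ... Python C API calls ...
+//   }                            // GIL released when gil destructs
+//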
class Gil {
public:
Gil(const Gil&) = delete;
Gil& operator=(const Gil&) = delete;
- Gil(PyThreadState *ts, bool new_thread = false) : pThreadState(ts)
- {
- assert(pThreadState != nullptr);
-
- // Acquire the GIL, set the current thread state
- PyEval_RestoreThread(pThreadState);
- dout(20) << "GIL acquired for thread state " << pThreadState << dendl;
-
- //
- // If called from a separate OS thread (i.e. a thread not created
- // by Python, that does't already have a python thread state that
- // was created when that thread was active), we need to manually
- // create and switch to a python thread state specifically for this
- // OS thread.
- //
- // Note that instead of requring the caller to set new_thread == true
- // when calling this from a separate OS thread, we could figure out
- // if this was necessary automatically, as follows:
- //
- // if (pThreadState->thread_id != PyThread_get_thread_ident()) {
- //
- // However, this means we're accessing pThreadState->thread_id, but
- // the Python C API docs say that "The only public data member is
- // PyInterpreterState *interp", i.e. doing this would violate
- // something that's meant to be a black box.
- //
- if (new_thread) {
- pNewThreadState = PyThreadState_New(pThreadState->interp);
- PyThreadState_Swap(pNewThreadState);
- dout(20) << "Switched to new thread state " << pNewThreadState << dendl;
- }
- }
-
- ~Gil()
- {
- if (pNewThreadState != nullptr) {
- dout(20) << "Destroying new thread state " << pNewThreadState << dendl;
- PyThreadState_Swap(pThreadState);
- PyThreadState_Clear(pNewThreadState);
- PyThreadState_Delete(pNewThreadState);
- }
- // Release the GIL, reset the thread state to NULL
- PyEval_SaveThread();
- dout(20) << "GIL released for thread state " << pThreadState << dendl;
- }
+ Gil(SafeThreadState &ts, bool new_thread = false);
+ ~Gil();
private:
- PyThreadState *pThreadState;
+ SafeThreadState &pThreadState;
PyThreadState *pNewThreadState = nullptr;
};
-#endif
-
#include "mgr/MgrContext.h"
#include "mgr/mgr_commands.h"
-#include "MgrPyModule.h"
+//#include "MgrPyModule.h"
#include "DaemonServer.h"
#include "messages/MMgrBeacon.h"
#include "messages/MMgrDigest.h"
Mgr::Mgr(MonClient *monc_, const MgrMap& mgrmap,
+ PyModuleRegistry *py_module_registry_,
Messenger *clientm_, Objecter *objecter_,
Client* client_, LogChannelRef clog_, LogChannelRef audit_clog_) :
monc(monc_),
timer(g_ceph_context, lock),
finisher(g_ceph_context, "Mgr", "mgr-fin"),
digest_received(false),
- py_modules(daemon_state, cluster_state, *monc, clog_, *objecter, *client,
- finisher),
+ py_module_registry(py_module_registry_),
cluster_state(monc, nullptr, mgrmap),
- server(monc, finisher, daemon_state, cluster_state, py_modules,
+ server(monc, finisher, daemon_state, cluster_state, *py_module_registry,
clog_, audit_clog_),
+ clog(clog_),
+ audit_clog(audit_clog_),
initialized(false),
initializing(false)
{
// Preload config keys (`get` for plugins is to be a fast local
  // operation, so we don't have to synchronize these later because
// all sets will come via mgr)
- load_config();
+ auto loaded_config = load_config();
// Wait for MgrDigest...
dout(4) << "waiting for MgrDigest..." << dendl;
}
// assume finisher already initialized in background_init
- dout(4) << "starting PyModules..." << dendl;
- py_modules.init();
- py_modules.start();
+ dout(4) << "starting python modules..." << dendl;
+ py_module_registry->active_start(loaded_config, daemon_state, cluster_state, *monc,
+ clog, *objecter, *client, finisher);
dout(4) << "Complete." << dendl;
initializing = false;
}
}
-void Mgr::load_config()
+std::map<std::string, std::string> Mgr::load_config()
{
assert(lock.is_locked_by_me());
std::string const key = key_str.get_str();
dout(20) << "saw key '" << key << "'" << dendl;
- const std::string config_prefix = PyModules::config_prefix;
+ const std::string config_prefix = PyModuleRegistry::config_prefix;
if (key.substr(0, config_prefix.size()) == config_prefix) {
dout(20) << "fetching '" << key << "'" << dendl;
}
}
- py_modules.insert_config(loaded);
+ return loaded;
}
void Mgr::shutdown()
server.shutdown();
}
// after the messenger is stopped, signal modules to shutdown via finisher
- py_modules.shutdown();
+ py_module_registry->active_shutdown();
}));
// Then stop the finisher to ensure its enqueued contexts aren't going
void Mgr::handle_log(MLog *m)
{
for (const auto &e : m->entries) {
- py_modules.notify_all(e);
+ py_module_registry->notify_all(e);
}
m->put();
handle_mgr_digest(static_cast<MMgrDigest*>(m));
break;
case CEPH_MSG_MON_MAP:
- py_modules.notify_all("mon_map", "");
+ py_module_registry->notify_all("mon_map", "");
m->put();
break;
case CEPH_MSG_FS_MAP:
- py_modules.notify_all("fs_map", "");
+ py_module_registry->notify_all("fs_map", "");
handle_fs_map((MFSMap*)m);
return false; // I shall let this pass through for Client
break;
case CEPH_MSG_OSD_MAP:
handle_osd_map();
- py_modules.notify_all("osd_map", "");
+ py_module_registry->notify_all("osd_map", "");
// Continuous subscribe, so that we can generate notifications
// for our MgrPyModules
break;
case MSG_SERVICE_MAP:
handle_service_map((MServiceMap*)m);
- py_modules.notify_all("service_map", "");
+ py_module_registry->notify_all("service_map", "");
m->put();
break;
case MSG_LOG:
dout(10) << m->mon_status_json.length() << dendl;
dout(10) << m->health_json.length() << dendl;
cluster_state.load_digest(m);
- py_modules.notify_all("mon_status", "");
- py_modules.notify_all("health", "");
+ py_module_registry->notify_all("mon_status", "");
+ py_module_registry->notify_all("health", "");
// Hack: use this as a tick/opportunity to prompt python-land that
// the pgmap might have changed since last time we were here.
- py_modules.notify_all("pg_summary", "");
+ py_module_registry->notify_all("pg_summary", "");
dout(10) << "done." << dendl;
m->put();
Mutex::Locker l(lock);
std::vector<MonCommand> commands = mgr_commands;
- std::vector<MonCommand> py_commands = py_modules.get_commands();
+ std::vector<MonCommand> py_commands = py_module_registry->get_commands();
commands.insert(commands.end(), py_commands.begin(), py_commands.end());
return commands;
}
+std::map<std::string, std::string> Mgr::get_services() const
+{
+ Mutex::Locker l(lock);
+
+ return py_module_registry->get_services();
+}
+
#include "mon/MgrMap.h"
#include "DaemonServer.h"
-#include "PyModules.h"
+#include "PyModuleRegistry.h"
#include "DaemonState.h"
#include "ClusterState.h"
class Objecter;
class Client;
-class MgrPyModule;
-
class Mgr {
protected:
MonClient *monc;
bool digest_received;
Cond digest_cond;
- PyModules py_modules;
+ PyModuleRegistry *py_module_registry;
DaemonStateIndex daemon_state;
ClusterState cluster_state;
DaemonServer server;
- void load_config();
+ LogChannelRef clog;
+ LogChannelRef audit_clog;
+
+ PyModuleConfig load_config();
void load_all_metadata();
void init();
public:
Mgr(MonClient *monc_, const MgrMap& mgrmap,
+ PyModuleRegistry *py_module_registry_,
Messenger *clientm_, Objecter *objecter_,
Client *client_, LogChannelRef clog_, LogChannelRef audit_clog_);
~Mgr();
void shutdown();
std::vector<MonCommand> get_command_set() const;
+ std::map<std::string, std::string> get_services() const;
};
#endif
if (last_connect_attempt != utime_t()) {
utime_t now = ceph_clock_now();
utime_t when = last_connect_attempt;
- when += cct->_conf->mgr_connect_retry_interval;
+ when += cct->_conf->get_val<double>("mgr_connect_retry_interval");
if (now < when) {
if (!connect_retry_callback) {
- connect_retry_callback = new FunctionContext([this](int r){
- connect_retry_callback = nullptr;
- reconnect();
- });
- timer.add_event_at(when, connect_retry_callback);
+ connect_retry_callback = timer.add_event_at(
+ when,
+ new FunctionContext([this](int r){
+ connect_retry_callback = nullptr;
+ reconnect();
+ }));
}
ldout(cct, 4) << "waiting to retry connect until " << when << dendl;
return;
pcc->with_counters([this, report](
const PerfCountersCollection::CounterMap &by_path)
{
+ // Helper for checking whether a counter should be included
+ auto include_counter = [this](
+ const PerfCounters::perf_counter_data_any_d &ctr,
+ const PerfCounters &perf_counters)
+ {
+ return perf_counters.get_adjusted_priority(ctr.prio) >= (int)stats_threshold;
+ };
+
+ // Helper for cases where we want to forget a counter
+ auto undeclare = [report, this](const std::string &path)
+ {
+ report->undeclare_types.push_back(path);
+ ldout(cct,20) << " undeclare " << path << dendl;
+ session->declared.erase(path);
+ };
+
ENCODE_START(1, 1, report->packed);
+
+ // Find counters that no longer exist, and undeclare them
for (auto p = session->declared.begin(); p != session->declared.end(); ) {
- if (by_path.count(*p) == 0) {
- report->undeclare_types.push_back(*p);
- ldout(cct,20) << __func__ << " undeclare " << *p << dendl;
- p = session->declared.erase(p);
- } else {
- ++p;
+ const auto &path = *(p++);
+ if (by_path.count(path) == 0) {
+ undeclare(path);
}
}
+
for (const auto &i : by_path) {
auto& path = i.first;
- auto& data = *(i.second);
+ auto& data = *(i.second.data);
+ auto& perf_counters = *(i.second.perf_counters);
+
+ // Find counters that still exist, but are no longer permitted by
+ // stats_threshold
+ if (!include_counter(data, perf_counters)) {
+ if (session->declared.count(path)) {
+ undeclare(path);
+ }
+ continue;
+ }
if (session->declared.count(path) == 0) {
- ldout(cct,20) << __func__ << " declare " << path << dendl;
+ ldout(cct,20) << " declare " << path << dendl;
PerfCounterType type;
type.path = path;
if (data.description) {
type.nick = data.nick;
}
type.type = data.type;
+ type.priority = perf_counters.get_adjusted_priority(data.prio);
report->declare_types.push_back(std::move(type));
session->declared.insert(path);
}
}
ENCODE_FINISH(report->packed);
- ldout(cct, 20) << by_path.size() << " counters, of which "
- << report->declare_types.size() << " new" << dendl;
+ ldout(cct, 20) << "sending " << session->declared.size() << " counters ("
+ "of possible " << by_path.size() << "), "
+ << report->declare_types.size() << " new, "
+ << report->undeclare_types.size() << " removed"
+ << dendl;
});
ldout(cct, 20) << "encoded " << report->packed.length() << " bytes" << dendl;
ldout(cct, 4) << "stats_period=" << m->stats_period << dendl;
+ if (stats_threshold != m->stats_threshold) {
+ ldout(cct, 4) << "updated stats threshold: " << m->stats_threshold << dendl;
+ stats_threshold = m->stats_threshold;
+ }
+
bool starting = (stats_period == 0) && (m->stats_period != 0);
stats_period = m->stats_period;
if (starting) {
Mutex lock = {"MgrClient::lock"};
uint32_t stats_period = 0;
+ uint32_t stats_threshold = 0;
SafeTimer timer;
CommandTable<MgrCommand> command_table;
"dump service map", "service", "r", "cli,rest")
COMMAND("service status",
"dump service state", "service", "r", "cli,rest")
+
+COMMAND("config set " \
+ "name=key,type=CephString name=value,type=CephString",
+ "Set a configuration option at runtime (not persistent)",
+ "mgr", "rw", "cli,rest")
+++ /dev/null
-// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
-// vim: ts=8 sw=2 smarttab
-/*
- * Ceph - scalable distributed file system
- *
- * Copyright (C) 2016 John Spray <john.spray@redhat.com>
- *
- * This is free software; you can redistribute it and/or
- * modify it under the terms of the GNU Lesser General Public
- * License version 2.1, as published by the Free Software
- * Foundation. See file COPYING.
- */
-
-#include "PyState.h"
-#include "Gil.h"
-
-#include "PyFormatter.h"
-
-#include "common/debug.h"
-
-#include "MgrPyModule.h"
-
-//XXX courtesy of http://stackoverflow.com/questions/1418015/how-to-get-python-exception-text
-#include <boost/python.hpp>
-#include "include/assert.h" // boost clobbers this
-
-// decode a Python exception into a string
-std::string handle_pyerror()
-{
- using namespace boost::python;
- using namespace boost;
-
- PyObject *exc, *val, *tb;
- object formatted_list, formatted;
- PyErr_Fetch(&exc, &val, &tb);
- handle<> hexc(exc), hval(allow_null(val)), htb(allow_null(tb));
- object traceback(import("traceback"));
- if (!tb) {
- object format_exception_only(traceback.attr("format_exception_only"));
- formatted_list = format_exception_only(hexc, hval);
- } else {
- object format_exception(traceback.attr("format_exception"));
- formatted_list = format_exception(hexc,hval, htb);
- }
- formatted = str("").join(formatted_list);
- return extract<std::string>(formatted);
-}
-
-#define dout_context g_ceph_context
-#define dout_subsys ceph_subsys_mgr
-
-#undef dout_prefix
-#define dout_prefix *_dout << "mgr[py] "
-
-namespace {
- PyObject* log_write(PyObject*, PyObject* args) {
- char* m = nullptr;
- if (PyArg_ParseTuple(args, "s", &m)) {
- auto len = strlen(m);
- if (len && m[len-1] == '\n') {
- m[len-1] = '\0';
- }
- dout(4) << m << dendl;
- }
- Py_RETURN_NONE;
- }
-
- PyObject* log_flush(PyObject*, PyObject*){
- Py_RETURN_NONE;
- }
-
- static PyMethodDef log_methods[] = {
- {"write", log_write, METH_VARARGS, "write stdout and stderr"},
- {"flush", log_flush, METH_VARARGS, "flush"},
- {nullptr, nullptr, 0, nullptr}
- };
-}
-
-#undef dout_prefix
-#define dout_prefix *_dout << "mgr " << __func__ << " "
-
-MgrPyModule::MgrPyModule(const std::string &module_name_, const std::string &sys_path, PyThreadState *main_ts_)
- : module_name(module_name_),
- pClassInstance(nullptr),
- pMainThreadState(main_ts_)
-{
- assert(pMainThreadState != nullptr);
-
- Gil gil(pMainThreadState);
-
- pMyThreadState = Py_NewInterpreter();
- if (pMyThreadState == nullptr) {
- derr << "Failed to create python sub-interpreter for '" << module_name << '"' << dendl;
- } else {
- // Some python modules do not cope with an unpopulated argv, so lets
- // fake one. This step also picks up site-packages into sys.path.
- const char *argv[] = {"ceph-mgr"};
- PySys_SetArgv(1, (char**)argv);
-
- if (g_conf->daemonize) {
- auto py_logger = Py_InitModule("ceph_logger", log_methods);
-#if PY_MAJOR_VERSION >= 3
- PySys_SetObject("stderr", py_logger);
- PySys_SetObject("stdout", py_logger);
-#else
- PySys_SetObject(const_cast<char*>("stderr"), py_logger);
- PySys_SetObject(const_cast<char*>("stdout"), py_logger);
-#endif
- }
- // Populate python namespace with callable hooks
- Py_InitModule("ceph_state", CephStateMethods);
-
- PySys_SetPath(const_cast<char*>(sys_path.c_str()));
- }
-}
-
-MgrPyModule::~MgrPyModule()
-{
- if (pMyThreadState != nullptr) {
- Gil gil(pMyThreadState);
-
- Py_XDECREF(pClassInstance);
-
- //
- // Ideally, now, we'd be able to do this:
- //
- // Py_EndInterpreter(pMyThreadState);
- // PyThreadState_Swap(pMainThreadState);
- //
- // Unfortunately, if the module has any other *python* threads active
- // at this point, Py_EndInterpreter() will abort with:
- //
- // Fatal Python error: Py_EndInterpreter: not the last thread
- //
- // This can happen when using CherryPy in a module, becuase CherryPy
- // runs an extra thread as a timeout monitor, which spends most of its
- // life inside a time.sleep(60). Unless you are very, very lucky with
- // the timing calling this destructor, that thread will still be stuck
- // in a sleep, and Py_EndInterpreter() will abort.
- //
- // This could of course also happen with a poorly written module which
- // made no attempt to clean up any additional threads it created.
- //
- // The safest thing to do is just not call Py_EndInterpreter(), and
- // let Py_Finalize() kill everything after all modules are shut down.
- //
- }
-}
-
-int MgrPyModule::load()
-{
- if (pMyThreadState == nullptr) {
- derr << "No python sub-interpreter exists for module '" << module_name << "'" << dendl;
- return -EINVAL;
- }
-
- Gil gil(pMyThreadState);
-
- // Load the module
- PyObject *pName = PyString_FromString(module_name.c_str());
- auto pModule = PyImport_Import(pName);
- Py_DECREF(pName);
- if (pModule == nullptr) {
- derr << "Module not found: '" << module_name << "'" << dendl;
- derr << handle_pyerror() << dendl;
- return -ENOENT;
- }
-
- // Find the class
- // TODO: let them call it what they want instead of just 'Module'
- auto pClass = PyObject_GetAttrString(pModule, (const char*)"Module");
- Py_DECREF(pModule);
- if (pClass == nullptr) {
- derr << "Class not found in module '" << module_name << "'" << dendl;
- derr << handle_pyerror() << dendl;
- return -EINVAL;
- }
-
-
- // Just using the module name as the handle, replace with a
- // uuidish thing if needed
- auto pyHandle = PyString_FromString(module_name.c_str());
- auto pArgs = PyTuple_Pack(1, pyHandle);
- pClassInstance = PyObject_CallObject(pClass, pArgs);
- Py_DECREF(pClass);
- Py_DECREF(pyHandle);
- Py_DECREF(pArgs);
- if (pClassInstance == nullptr) {
- derr << "Failed to construct class in '" << module_name << "'" << dendl;
- derr << handle_pyerror() << dendl;
- return -EINVAL;
- } else {
- dout(1) << "Constructed class from module: " << module_name << dendl;
- }
-
- return load_commands();
-}
-
-int MgrPyModule::serve()
-{
- assert(pClassInstance != nullptr);
-
- // This method is called from a separate OS thread (i.e. a thread not
- // created by Python), so tell Gil to wrap this in a new thread state.
- Gil gil(pMyThreadState, true);
-
- auto pValue = PyObject_CallMethod(pClassInstance,
- const_cast<char*>("serve"), nullptr);
-
- int r = 0;
- if (pValue != NULL) {
- Py_DECREF(pValue);
- } else {
- derr << module_name << ".serve:" << dendl;
- derr << handle_pyerror() << dendl;
- return -EINVAL;
- }
-
- return r;
-}
-
-// FIXME: DRY wrt serve
-void MgrPyModule::shutdown()
-{
- assert(pClassInstance != nullptr);
-
- Gil gil(pMyThreadState);
-
- auto pValue = PyObject_CallMethod(pClassInstance,
- const_cast<char*>("shutdown"), nullptr);
-
- if (pValue != NULL) {
- Py_DECREF(pValue);
- } else {
- derr << "Failed to invoke shutdown() on " << module_name << dendl;
- derr << handle_pyerror() << dendl;
- }
-}
-
-void MgrPyModule::notify(const std::string ¬ify_type, const std::string ¬ify_id)
-{
- assert(pClassInstance != nullptr);
-
- Gil gil(pMyThreadState);
-
- // Execute
- auto pValue = PyObject_CallMethod(pClassInstance,
- const_cast<char*>("notify"), const_cast<char*>("(ss)"),
- notify_type.c_str(), notify_id.c_str());
-
- if (pValue != NULL) {
- Py_DECREF(pValue);
- } else {
- derr << module_name << ".notify:" << dendl;
- derr << handle_pyerror() << dendl;
- // FIXME: callers can't be expected to handle a python module
- // that has spontaneously broken, but Mgr() should provide
- // a hook to unload misbehaving modules when they have an
- // error somewhere like this
- }
-}
-
-void MgrPyModule::notify_clog(const LogEntry &log_entry)
-{
- assert(pClassInstance != nullptr);
-
- Gil gil(pMyThreadState);
-
- // Construct python-ized LogEntry
- PyFormatter f;
- log_entry.dump(&f);
- auto py_log_entry = f.get();
-
- // Execute
- auto pValue = PyObject_CallMethod(pClassInstance,
- const_cast<char*>("notify"), const_cast<char*>("(sN)"),
- "clog", py_log_entry);
-
- if (pValue != NULL) {
- Py_DECREF(pValue);
- } else {
- derr << module_name << ".notify_clog:" << dendl;
- derr << handle_pyerror() << dendl;
- // FIXME: callers can't be expected to handle a python module
- // that has spontaneously broken, but Mgr() should provide
- // a hook to unload misbehaving modules when they have an
- // error somewhere like this
- }
-}
-
-int MgrPyModule::load_commands()
-{
- // Don't need a Gil here -- this is called from MgrPyModule::load(),
- // which already has one.
- PyObject *command_list = PyObject_GetAttrString(pClassInstance, "COMMANDS");
- assert(command_list != nullptr);
- const size_t list_size = PyList_Size(command_list);
- for (size_t i = 0; i < list_size; ++i) {
- PyObject *command = PyList_GetItem(command_list, i);
- assert(command != nullptr);
-
- ModuleCommand item;
-
- PyObject *pCmd = PyDict_GetItemString(command, "cmd");
- assert(pCmd != nullptr);
- item.cmdstring = PyString_AsString(pCmd);
-
- dout(20) << "loaded command " << item.cmdstring << dendl;
-
- PyObject *pDesc = PyDict_GetItemString(command, "desc");
- assert(pDesc != nullptr);
- item.helpstring = PyString_AsString(pDesc);
-
- PyObject *pPerm = PyDict_GetItemString(command, "perm");
- assert(pPerm != nullptr);
- item.perm = PyString_AsString(pPerm);
-
- item.handler = this;
-
- commands.push_back(item);
- }
- Py_DECREF(command_list);
-
- dout(10) << "loaded " << commands.size() << " commands" << dendl;
-
- return 0;
-}
-
-int MgrPyModule::handle_command(
- const cmdmap_t &cmdmap,
- std::stringstream *ds,
- std::stringstream *ss)
-{
- assert(ss != nullptr);
- assert(ds != nullptr);
-
- Gil gil(pMyThreadState);
-
- PyFormatter f;
- cmdmap_dump(cmdmap, &f);
- PyObject *py_cmd = f.get();
-
- auto pResult = PyObject_CallMethod(pClassInstance,
- const_cast<char*>("handle_command"), const_cast<char*>("(O)"), py_cmd);
-
- Py_DECREF(py_cmd);
-
- int r = 0;
- if (pResult != NULL) {
- if (PyTuple_Size(pResult) != 3) {
- r = -EINVAL;
- } else {
- r = PyInt_AsLong(PyTuple_GetItem(pResult, 0));
- *ds << PyString_AsString(PyTuple_GetItem(pResult, 1));
- *ss << PyString_AsString(PyTuple_GetItem(pResult, 2));
- }
-
- Py_DECREF(pResult);
- } else {
- *ds << "";
- *ss << handle_pyerror();
- r = -EINVAL;
- }
-
- return r;
-}
-
-void MgrPyModule::get_health_checks(health_check_map_t *checks)
-{
- checks->merge(health_checks);
-}
+++ /dev/null
-
-// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
-// vim: ts=8 sw=2 smarttab
-/*
- * Ceph - scalable distributed file system
- *
- * Copyright (C) 2016 John Spray <john.spray@redhat.com>
- *
- * This is free software; you can redistribute it and/or
- * modify it under the terms of the GNU Lesser General Public
- * License version 2.1, as published by the Free Software
- * Foundation. See file COPYING.
- */
-
-
-#ifndef MGR_PY_MODULE_H_
-#define MGR_PY_MODULE_H_
-
-// Python.h comes first because otherwise it clobbers ceph's assert
-#include "Python.h"
-
-#include "common/cmdparse.h"
-#include "common/LogEntry.h"
-#include "common/Mutex.h"
-#include "mon/health_check.h"
-
-#include <vector>
-#include <string>
-
-
-class MgrPyModule;
-
-/**
- * A Ceph CLI command description provided from a Python module
- */
-class ModuleCommand {
-public:
- std::string cmdstring;
- std::string helpstring;
- std::string perm;
- MgrPyModule *handler;
-};
-
-class MgrPyModule
-{
-private:
- const std::string module_name;
- PyObject *pClassInstance;
- PyThreadState *pMainThreadState;
- PyThreadState *pMyThreadState = nullptr;
-
- health_check_map_t health_checks;
-
- std::vector<ModuleCommand> commands;
-
- int load_commands();
-
-public:
- MgrPyModule(const std::string &module_name, const std::string &sys_path, PyThreadState *main_ts);
- ~MgrPyModule();
-
- int load();
- int serve();
- void shutdown();
- void notify(const std::string ¬ify_type, const std::string ¬ify_id);
- void notify_clog(const LogEntry &le);
-
- const std::vector<ModuleCommand> &get_commands() const
- {
- return commands;
- }
-
- const std::string &get_name() const
- {
- return module_name;
- }
-
- int handle_command(
- const cmdmap_t &cmdmap,
- std::stringstream *ds,
- std::stringstream *ss);
-
- void set_health_checks(health_check_map_t&& c) {
- health_checks = std::move(c);
- }
- void get_health_checks(health_check_map_t *checks);
-};
-
-std::string handle_pyerror();
-
-#endif
-
// mon caps are suitably generic for mgr
MonCap caps;
+ std::set<std::string> declared_types;
+
MgrSession(CephContext *cct) : RefCountedObject(cct, 0) {}
~MgrSession() override {}
};
audit_clog(log_client.create_channel(CLOG_CHANNEL_AUDIT)),
lock("MgrStandby::lock"),
timer(g_ceph_context, lock),
+ py_module_registry(clog),
active_mgr(nullptr),
orig_argc(argc),
orig_argv(argv),
dout(1) << state_str() << dendl;
set<string> modules;
- PyModules::list_modules(&modules);
+ PyModuleRegistry::list_modules(&modules);
// Whether I think I am available (request MgrMonitor to set me
// as available in the map)
bool available = active_mgr != nullptr && active_mgr->is_initialized();
auto addr = available ? active_mgr->get_server_addr() : entity_addr_t();
- dout(10) << "sending beacon as gid " << monc.get_global_id()
- << " modules " << modules << dendl;
+ dout(10) << "sending beacon as gid " << monc.get_global_id() << dendl;
map<string,string> metadata;
collect_sys_info(&metadata, g_ceph_context);
modules,
std::move(metadata));
- if (available && !available_in_map) {
- // We are informing the mon that we are done initializing: inform
- // it of our command set. This has to happen after init() because
- // it needs the python modules to have loaded.
- m->set_command_descs(active_mgr->get_command_set());
- dout(4) << "going active, including " << m->get_command_descs().size()
- << " commands in beacon" << dendl;
+ if (available) {
+ if (!available_in_map) {
+ // We are informing the mon that we are done initializing: inform
+ // it of our command set. This has to happen after init() because
+ // it needs the python modules to have loaded.
+ m->set_command_descs(active_mgr->get_command_set());
+ dout(4) << "going active, including " << m->get_command_descs().size()
+ << " commands in beacon" << dendl;
+ }
+
+ m->set_services(active_mgr->get_services());
}
monc.send_mon_message(m);
dout(10) << __func__ << dendl;
send_beacon();
- if (active_mgr) {
+ if (active_mgr && active_mgr->is_initialized()) {
active_mgr->tick();
}
- timer.add_event_after(g_conf->mgr_tick_period, new FunctionContext(
- [this](int r){
+ timer.add_event_after(g_conf->get_val<int64_t>("mgr_tick_period"),
+ new FunctionContext([this](int r){
tick();
- }
+ }
));
}
// Expect already to be locked as we're called from signal handler
assert(lock.is_locked_by_me());
+ dout(4) << "Shutting down" << dendl;
+
// stop sending beacon first, i use monc to talk with monitors
timer.shutdown();
// client uses monc and objecter
if (active_mgr) {
active_mgr->shutdown();
}
+
+ py_module_registry.shutdown();
+
// objecter is used by monc and active_mgr
objecter.shutdown();
// client_messenger is used by all of them, so stop it in the end
const bool active_in_map = map.active_gid == monc.get_global_id();
dout(4) << "active in map: " << active_in_map
<< " active is " << map.active_gid << dendl;
+
+ if (!py_module_registry.is_initialized()) {
+ int r = py_module_registry.init(map);
+
+ // FIXME: error handling
+ assert(r == 0);
+ } else {
+ bool need_respawn = py_module_registry.handle_mgr_map(map);
+ if (need_respawn) {
+ respawn();
+ }
+ }
+
if (active_in_map) {
if (!active_mgr) {
dout(1) << "Activating!" << dendl;
- active_mgr.reset(new Mgr(&monc, map, client_messenger.get(), &objecter,
+ active_mgr.reset(new Mgr(&monc, map, &py_module_registry,
+ client_messenger.get(), &objecter,
&client, clog, audit_clog));
active_mgr->background_init(new FunctionContext(
[this](int r){
dout(4) << "Map now says I am available" << dendl;
available_in_map = true;
}
+ } else if (active_mgr != nullptr) {
+ derr << "I was active but no longer am" << dendl;
+ respawn();
} else {
- if (active_mgr != nullptr) {
- derr << "I was active but no longer am" << dendl;
- respawn();
+ if (map.active_gid != 0 && map.active_name != g_conf->name.get_id()) {
+ // I am the standby and someone else is active, start modules
+ // in standby mode to do redirects if needed
+ if (!py_module_registry.is_standby_running()) {
+ py_module_registry.standby_start(&monc);
+ }
}
}
std::string MgrStandby::state_str()
{
- return active_mgr == nullptr ? "standby" : "active";
+ if (active_mgr == nullptr) {
+ return "standby";
+ } else if (active_mgr->is_initialized()) {
+ return "active";
+ } else {
+ return "active (starting)";
+ }
}
#include "client/Client.h"
#include "mon/MonClient.h"
#include "osdc/Objecter.h"
+#include "PyModuleRegistry.h"
class MMgrMap;
Mutex lock;
SafeTimer timer;
+ PyModuleRegistry py_module_registry;
std::shared_ptr<Mgr> active_mgr;
int orig_argc;
--- /dev/null
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2017 John Spray <john.spray@redhat.com>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ */
+
+
+#include "include/stringify.h"
+#include "common/errno.h"
+#include "common/backport14.h"
+
+#include "BaseMgrModule.h"
+#include "PyOSDMap.h"
+#include "BaseMgrStandbyModule.h"
+#include "Gil.h"
+
+#include "ActivePyModules.h"
+
+#include "PyModuleRegistry.h"
+
+// definition for non-const static member
+std::string PyModuleRegistry::config_prefix;
+
+
+
+#define dout_context g_ceph_context
+#define dout_subsys ceph_subsys_mgr
+
+#undef dout_prefix
+#define dout_prefix *_dout << "mgr[py] "
+
+namespace {
+ PyObject* log_write(PyObject*, PyObject* args) {
+ char* m = nullptr;
+ if (PyArg_ParseTuple(args, "s", &m)) {
+ auto len = strlen(m);
+ if (len && m[len-1] == '\n') {
+ m[len-1] = '\0';
+ }
+ dout(4) << m << dendl;
+ }
+ Py_RETURN_NONE;
+ }
+
+ PyObject* log_flush(PyObject*, PyObject*){
+ Py_RETURN_NONE;
+ }
+
+ static PyMethodDef log_methods[] = {
+ {"write", log_write, METH_VARARGS, "write stdout and stderr"},
+ {"flush", log_flush, METH_VARARGS, "flush"},
+ {nullptr, nullptr, 0, nullptr}
+ };
+}
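+
+// A note on the effect (nothing new here, just the intent): once
+// PyModule::load() registers this table via Py_InitModule("ceph_logger",
+// log_methods) and points sys.stdout/sys.stderr at the resulting module,
+// Python-side print output lands in log_write(), which strips the trailing
+// newline and routes the text through dout(4) -- so module output reaches
+// the ceph-mgr log instead of the daemonized process's stdio.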
+
+#undef dout_prefix
+#define dout_prefix *_dout << "mgr " << __func__ << " "
+
+
+
+std::string PyModule::get_site_packages()
+{
+ std::stringstream site_packages;
+
+ // CPython doesn't auto-add site-packages dirs to sys.path for us,
+ // but it does provide a module that we can ask for them.
+ auto site_module = PyImport_ImportModule("site");
+ assert(site_module);
+
+ auto site_packages_fn = PyObject_GetAttrString(site_module, "getsitepackages");
+ if (site_packages_fn != nullptr) {
+ auto site_packages_list = PyObject_CallObject(site_packages_fn, nullptr);
+ assert(site_packages_list);
+
+ auto n = PyList_Size(site_packages_list);
+ for (Py_ssize_t i = 0; i < n; ++i) {
+ if (i != 0) {
+ site_packages << ":";
+ }
+ site_packages << PyString_AsString(PyList_GetItem(site_packages_list, i));
+ }
+
+ Py_DECREF(site_packages_list);
+ Py_DECREF(site_packages_fn);
+ } else {
+ // Fall back to generating our own site-packages paths by imitating
+ // what the standard site.py does. This is annoying but it lets us
+ // run inside virtualenvs :-/
+
+ auto site_packages_fn = PyObject_GetAttrString(site_module, "addsitepackages");
+ assert(site_packages_fn);
+
+ auto known_paths = PySet_New(nullptr);
+ auto pArgs = PyTuple_Pack(1, known_paths);
+ PyObject_CallObject(site_packages_fn, pArgs);
+ Py_DECREF(pArgs);
+ Py_DECREF(known_paths);
+ Py_DECREF(site_packages_fn);
+
+ auto sys_module = PyImport_ImportModule("sys");
+ assert(sys_module);
+ auto sys_path = PyObject_GetAttrString(sys_module, "path");
+ assert(sys_path);
+
+ dout(1) << "sys.path:" << dendl;
+ auto n = PyList_Size(sys_path);
+ bool first = true;
+ for (Py_ssize_t i = 0; i < n; ++i) {
+ dout(1) << " " << PyString_AsString(PyList_GetItem(sys_path, i)) << dendl;
+ if (first) {
+ first = false;
+ } else {
+ site_packages << ":";
+ }
+ site_packages << PyString_AsString(PyList_GetItem(sys_path, i));
+ }
+
+ Py_DECREF(sys_path);
+ Py_DECREF(sys_module);
+ }
+
+ Py_DECREF(site_module);
+
+ return site_packages.str();
+}
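+
+// The returned value is a ":"-joined path list, e.g. (hypothetical)
+// "/usr/lib/python2.7/site-packages:/usr/lib64/python2.7/site-packages";
+// PyModule::load() splices it between Py_GetPath() and mgr_module_path
+// when computing the sub-interpreter's sys.path.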
+
+int PyModuleRegistry::init(const MgrMap &map)
+{
+ Mutex::Locker locker(lock);
+
+  // Don't try to init me if you don't really have a map
+ assert(map.epoch > 0);
+
+ mgr_map = map;
+
+ // namespace in config-key prefixed by "mgr/"
+ config_prefix = std::string(g_conf->name.get_type_str()) + "/";
+
+ // Set up global python interpreter
+ Py_SetProgramName(const_cast<char*>(PYTHON_EXECUTABLE));
+ Py_InitializeEx(0);
+
+ // Let CPython know that we will be calling it back from other
+ // threads in future.
+ if (! PyEval_ThreadsInitialized()) {
+ PyEval_InitThreads();
+ }
+
+ // Drop the GIL and remember the main thread state (current
+ // thread state becomes NULL)
+ pMainThreadState = PyEval_SaveThread();
+ assert(pMainThreadState != nullptr);
+
+ std::list<std::string> failed_modules;
+
+ // Load python code
+ for (const auto& module_name : mgr_map.modules) {
+ dout(1) << "Loading python module '" << module_name << "'" << dendl;
+ auto mod = ceph::make_unique<PyModule>(module_name);
+ int r = mod->load(pMainThreadState);
+ if (r != 0) {
+ // Don't use handle_pyerror() here; we don't have the GIL
+ // or the right thread state (this is deliberate).
+ derr << "Error loading module '" << module_name << "': "
+ << cpp_strerror(r) << dendl;
+ failed_modules.push_back(module_name);
+ // Don't drop out here, load the other modules
+ } else {
+ // Success!
+ modules[module_name] = std::move(mod);
+ }
+ }
+
+ if (!failed_modules.empty()) {
+ clog->error() << "Failed to load ceph-mgr modules: " << joinify(
+ failed_modules.begin(), failed_modules.end(), std::string(", "));
+ }
+
+ return 0;
+}
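+
+// Intended call sequence (a sketch; this mirrors how MgrStandby drives the
+// registry, and active_start() is presumably invoked once this daemon
+// becomes the active mgr):
+//
+//   PyModuleRegistry registry(clog);
+//   registry.init(map);             // once, with a real (epoch > 0) MgrMap
+//   registry.standby_start(&monc);  // while another mgr is active
+//   registry.active_start(...);     // when this mgr becomes active
+//   registry.shutdown();            // tears down all interpreters at exit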
+
+
+int PyModule::load(PyThreadState *pMainThreadState)
+{
+ assert(pMainThreadState != nullptr);
+
+ // Configure sub-interpreter and construct C++-generated python classes
+ {
+ SafeThreadState sts(pMainThreadState);
+ Gil gil(sts);
+
+ auto thread_state = Py_NewInterpreter();
+ if (thread_state == nullptr) {
+      derr << "Failed to create python sub-interpreter for '" << module_name << "'" << dendl;
+ return -EINVAL;
+ } else {
+ pMyThreadState.set(thread_state);
+      // Some python modules do not cope with an unpopulated argv, so let's
+ // fake one. This step also picks up site-packages into sys.path.
+ const char *argv[] = {"ceph-mgr"};
+ PySys_SetArgv(1, (char**)argv);
+
+ if (g_conf->daemonize) {
+ auto py_logger = Py_InitModule("ceph_logger", log_methods);
+#if PY_MAJOR_VERSION >= 3
+ PySys_SetObject("stderr", py_logger);
+ PySys_SetObject("stdout", py_logger);
+#else
+ PySys_SetObject(const_cast<char*>("stderr"), py_logger);
+ PySys_SetObject(const_cast<char*>("stdout"), py_logger);
+#endif
+ }
+
+ // Configure sys.path to include mgr_module_path
+ std::string sys_path = std::string(Py_GetPath()) + ":" + get_site_packages()
+ + ":" + g_conf->get_val<std::string>("mgr_module_path");
+ dout(10) << "Computed sys.path '" << sys_path << "'" << dendl;
+
+ PySys_SetPath(const_cast<char*>(sys_path.c_str()));
+ }
+
+ PyMethodDef ModuleMethods[] = {
+ {nullptr}
+ };
+
+ // Initialize module
+ PyObject *ceph_module = Py_InitModule("ceph_module", ModuleMethods);
+ assert(ceph_module != nullptr);
+
+ auto load_class = [ceph_module](const char *name, PyTypeObject *type)
+ {
+ type->tp_new = PyType_GenericNew;
+ if (PyType_Ready(type) < 0) {
+ assert(0);
+ }
+ Py_INCREF(type);
+
+ PyModule_AddObject(ceph_module, name, (PyObject *)type);
+ };
+
+ load_class("BaseMgrModule", &BaseMgrModuleType);
+ load_class("BaseMgrStandbyModule", &BaseMgrStandbyModuleType);
+ load_class("BasePyOSDMap", &BasePyOSDMapType);
+ load_class("BasePyOSDMapIncremental", &BasePyOSDMapIncrementalType);
+ load_class("BasePyCRUSH", &BasePyCRUSHType);
+ }
+
+ // Environment is all good, import the external module
+ {
+ Gil gil(pMyThreadState);
+
+ // Load the module
+ PyObject *pName = PyString_FromString(module_name.c_str());
+ auto pModule = PyImport_Import(pName);
+ Py_DECREF(pName);
+ if (pModule == nullptr) {
+ derr << "Module not found: '" << module_name << "'" << dendl;
+ derr << handle_pyerror() << dendl;
+ return -ENOENT;
+ }
+
+ // Find the class
+ // TODO: let them call it what they want instead of just 'Module'
+ pClass = PyObject_GetAttrString(pModule, (const char*)"Module");
+ if (pClass == nullptr) {
+ derr << "Class not found in module '" << module_name << "'" << dendl;
+ derr << handle_pyerror() << dendl;
+ return -EINVAL;
+ }
+
+ pStandbyClass = PyObject_GetAttrString(pModule,
+ (const char*)"StandbyModule");
+ if (pStandbyClass) {
+ dout(4) << "Standby mode available in module '" << module_name
+ << "'" << dendl;
+ } else {
+ dout(4) << "Standby mode not provided by module '" << module_name
+ << "'" << dendl;
+ PyErr_Clear();
+ }
+
+ Py_DECREF(pModule);
+ }
+
+ return 0;
+}
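+
+// A note on the assumed contract load() establishes: pClass holds the
+// module's mandatory `Module` class and pStandbyClass its optional
+// `StandbyModule` class. The registry hands exactly these handles (plus
+// pMyThreadState) to ActivePyModules::start_one() / StandbyPyModules::
+// start_one() below, which -- roughly, as the old MgrPyModule::load()
+// did -- instantiate them with something like:
+//
+//   Gil gil(pMyThreadState);
+//   PyObject *instance = PyObject_CallObject(pClass, pArgs);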
+
+PyModule::~PyModule()
+{
+ if (pMyThreadState.ts != nullptr) {
+ Gil gil(pMyThreadState, true);
+ Py_XDECREF(pClass);
+ Py_XDECREF(pStandbyClass);
+ }
+}
+
+void PyModuleRegistry::standby_start(MonClient *monc)
+{
+ Mutex::Locker l(lock);
+ assert(active_modules == nullptr);
+ assert(standby_modules == nullptr);
+ assert(is_initialized());
+
+ dout(4) << "Starting modules in standby mode" << dendl;
+
+ standby_modules.reset(new StandbyPyModules(monc, mgr_map));
+
+ std::set<std::string> failed_modules;
+ for (const auto &i : modules) {
+ if (i.second->pStandbyClass) {
+ dout(4) << "starting module " << i.second->get_name() << dendl;
+ int r = standby_modules->start_one(i.first,
+ i.second->pStandbyClass,
+ i.second->pMyThreadState);
+ if (r != 0) {
+ derr << "failed to start module '" << i.second->get_name()
+           << "'" << dendl;
+ failed_modules.insert(i.second->get_name());
+ // Continue trying to load any other modules
+ }
+ } else {
+ dout(4) << "skipping module '" << i.second->get_name() << "' because "
+ "it does not implement a standby mode" << dendl;
+ }
+ }
+
+ if (!failed_modules.empty()) {
+ clog->error() << "Failed to execute ceph-mgr module(s) in standby mode: "
+ << joinify(failed_modules.begin(), failed_modules.end(),
+ std::string(", "));
+ }
+}
+
+void PyModuleRegistry::active_start(
+ PyModuleConfig &config_,
+ DaemonStateIndex &ds, ClusterState &cs, MonClient &mc,
+ LogChannelRef clog_, Objecter &objecter_, Client &client_,
+ Finisher &f)
+{
+ Mutex::Locker locker(lock);
+
+ dout(4) << "Starting modules in active mode" << dendl;
+
+ assert(active_modules == nullptr);
+ assert(is_initialized());
+
+ if (standby_modules != nullptr) {
+ standby_modules->shutdown();
+ standby_modules.reset();
+ }
+
+ active_modules.reset(new ActivePyModules(
+ config_, ds, cs, mc, clog_, objecter_, client_, f));
+
+ for (const auto &i : modules) {
+ dout(4) << "Starting " << i.first << dendl;
+ int r = active_modules->start_one(i.first,
+ i.second->pClass,
+ i.second->pMyThreadState);
+ if (r != 0) {
+ derr << "Failed to run module in active mode ('" << i.first << "')"
+ << dendl;
+ }
+ }
+}
+
+void PyModuleRegistry::active_shutdown()
+{
+ Mutex::Locker locker(lock);
+
+ if (active_modules != nullptr) {
+ active_modules->shutdown();
+ active_modules.reset();
+ }
+}
+
+void PyModuleRegistry::shutdown()
+{
+ Mutex::Locker locker(lock);
+
+ if (standby_modules != nullptr) {
+ standby_modules->shutdown();
+ standby_modules.reset();
+ }
+
+ // Ideally, now, we'd be able to do this for all modules:
+ //
+ // Py_EndInterpreter(pMyThreadState);
+ // PyThreadState_Swap(pMainThreadState);
+ //
+ // Unfortunately, if the module has any other *python* threads active
+ // at this point, Py_EndInterpreter() will abort with:
+ //
+ // Fatal Python error: Py_EndInterpreter: not the last thread
+ //
+  // This can happen when using CherryPy in a module, because CherryPy
+ // runs an extra thread as a timeout monitor, which spends most of its
+ // life inside a time.sleep(60). Unless you are very, very lucky with
+ // the timing calling this destructor, that thread will still be stuck
+ // in a sleep, and Py_EndInterpreter() will abort.
+ //
+ // This could of course also happen with a poorly written module which
+ // made no attempt to clean up any additional threads it created.
+ //
+ // The safest thing to do is just not call Py_EndInterpreter(), and
+ // let Py_Finalize() kill everything after all modules are shut down.
+
+ modules.clear();
+
+ PyEval_RestoreThread(pMainThreadState);
+ Py_Finalize();
+}
+
+static void _list_modules(
+ const std::string path,
+ std::set<std::string> *modules)
+{
+ DIR *dir = opendir(path.c_str());
+ if (!dir) {
+ return;
+ }
+ struct dirent *entry = NULL;
+ while ((entry = readdir(dir)) != NULL) {
+ string n(entry->d_name);
+ string fn = path + "/" + n;
+ struct stat st;
+ int r = ::stat(fn.c_str(), &st);
+ if (r == 0 && S_ISDIR(st.st_mode)) {
+ string initfn = fn + "/module.py";
+ r = ::stat(initfn.c_str(), &st);
+ if (r == 0) {
+ modules->insert(n);
+ }
+ }
+ }
+ closedir(dir);
+}
+
+void PyModuleRegistry::list_modules(std::set<std::string> *modules)
+{
+ _list_modules(g_conf->get_val<std::string>("mgr_module_path"), modules);
+}
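+
+// _list_modules() therefore expects a layout like (paths hypothetical):
+//
+//   <mgr_module_path>/
+//     balancer/module.py
+//     status/module.py
+//
+// i.e. a module is any directory under mgr_module_path containing a
+// module.py entry point; the directory name becomes the module name.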
+
--- /dev/null
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2017 John Spray <john.spray@redhat.com>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ */
+
+
+#pragma once
+
+// Python.h comes first because otherwise it clobbers ceph's assert
+#include "Python.h"
+
+#include <string>
+#include <map>
+#include <memory>
+
+#include "common/LogClient.h"
+
+#include "ActivePyModules.h"
+#include "StandbyPyModules.h"
+
+class PyModule
+{
+private:
+ const std::string module_name;
+ std::string get_site_packages();
+
+public:
+ SafeThreadState pMyThreadState;
+ PyObject *pClass = nullptr;
+ PyObject *pStandbyClass = nullptr;
+
+ PyModule(const std::string &module_name_)
+ : module_name(module_name_)
+ {
+ }
+
+ ~PyModule();
+
+ int load(PyThreadState *pMainThreadState);
+
+ std::string get_name() const {
+ return module_name;
+ }
+};
+
+/**
+ * This class is responsible for setting up the python runtime environment
+ * and importing the python modules.
+ *
+ * It is *not* responsible for constructing instances of the modules'
+ * BaseMgrModule subclasses.
+ */
+class PyModuleRegistry
+{
+private:
+ mutable Mutex lock{"PyModuleRegistry::lock"};
+
+ LogChannelRef clog;
+
+ std::map<std::string, std::unique_ptr<PyModule>> modules;
+
+ std::unique_ptr<ActivePyModules> active_modules;
+ std::unique_ptr<StandbyPyModules> standby_modules;
+
+ PyThreadState *pMainThreadState;
+
+ // We have our own copy of MgrMap, because we are constructed
+ // before ClusterState exists.
+ MgrMap mgr_map;
+
+public:
+ static std::string config_prefix;
+
+ static void list_modules(std::set<std::string> *modules);
+
+ PyModuleRegistry(LogChannelRef clog_)
+ : clog(clog_)
+ {}
+
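+  // Returns true when the set of enabled modules has changed; the caller
+  // (MgrStandby::handle_mgr_map) treats that as a signal to respawn the
+  // daemon rather than attempting to reload modules in place.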
+ bool handle_mgr_map(const MgrMap &mgr_map_)
+ {
+ Mutex::Locker l(lock);
+
+ bool modules_changed = mgr_map_.modules != mgr_map.modules;
+ mgr_map = mgr_map_;
+
+ if (standby_modules != nullptr) {
+ standby_modules->handle_mgr_map(mgr_map_);
+ }
+
+ return modules_changed;
+ }
+
+ bool is_initialized() const
+ {
+ return mgr_map.epoch > 0;
+ }
+
+ int init(const MgrMap &map);
+
+ void active_start(
+ PyModuleConfig &config_,
+ DaemonStateIndex &ds, ClusterState &cs, MonClient &mc,
+ LogChannelRef clog_, Objecter &objecter_, Client &client_,
+ Finisher &f);
+ void standby_start(
+ MonClient *monc);
+
+ bool is_standby_running() const
+ {
+ return standby_modules != nullptr;
+ }
+
+ void active_shutdown();
+ void shutdown();
+
+ template<typename Callback, typename...Args>
+ void with_active_modules(Callback&& cb, Args&&...args) const
+ {
+ Mutex::Locker l(lock);
+ assert(active_modules != nullptr);
+
+ std::forward<Callback>(cb)(*active_modules, std::forward<Args>(args)...);
+ }
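+
+  // Usage sketch (hypothetical caller):
+  //
+  //   registry.with_active_modules([](ActivePyModules &m) {
+  //     m.notify_all("mon_map", "");
+  //   });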
+
+ // FIXME: breaking interface so that I don't have to go rewrite all
+ // the places that call into these (for now)
+ // >>>
+ void notify_all(const std::string ¬ify_type,
+ const std::string ¬ify_id)
+ {
+ if (active_modules) {
+ active_modules->notify_all(notify_type, notify_id);
+ }
+ }
+
+ void notify_all(const LogEntry &log_entry)
+ {
+ if (active_modules) {
+ active_modules->notify_all(log_entry);
+ }
+ }
+
+ std::vector<MonCommand> get_commands() const
+ {
+ assert(active_modules);
+ return active_modules->get_commands();
+ }
+ std::vector<ModuleCommand> get_py_commands() const
+ {
+ assert(active_modules);
+ return active_modules->get_py_commands();
+ }
+ void get_health_checks(health_check_map_t *checks)
+ {
+ assert(active_modules);
+ active_modules->get_health_checks(checks);
+ }
+ std::map<std::string, std::string> get_services() const
+ {
+ assert(active_modules);
+ return active_modules->get_services();
+ }
+ // <<< (end of ActivePyModules cheeky call-throughs)
+};
--- /dev/null
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2016 John Spray <john.spray@redhat.com>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ */
+
+
+// Python.h comes first because otherwise it clobbers ceph's assert
+#include "Python.h"
+
+#include "common/debug.h"
+#include "mgr/Gil.h"
+
+#include "PyModuleRunner.h"
+
+#define dout_context g_ceph_context
+#define dout_subsys ceph_subsys_mgr
+
+
+std::string handle_pyerror();
+
+PyModuleRunner::~PyModuleRunner()
+{
+ Gil gil(pMyThreadState, true);
+
+ if (pClassInstance) {
+ Py_XDECREF(pClassInstance);
+ pClassInstance = nullptr;
+ }
+
+ Py_DECREF(pClass);
+ pClass = nullptr;
+}
+
+int PyModuleRunner::serve()
+{
+ assert(pClassInstance != nullptr);
+
+ // This method is called from a separate OS thread (i.e. a thread not
+ // created by Python), so tell Gil to wrap this in a new thread state.
+ Gil gil(pMyThreadState, true);
+
+ auto pValue = PyObject_CallMethod(pClassInstance,
+ const_cast<char*>("serve"), nullptr);
+
+ int r = 0;
+ if (pValue != NULL) {
+ Py_DECREF(pValue);
+ } else {
+ derr << module_name << ".serve:" << dendl;
+ derr << handle_pyerror() << dendl;
+ return -EINVAL;
+ }
+
+ return r;
+}
+
+void PyModuleRunner::shutdown()
+{
+ assert(pClassInstance != nullptr);
+
+ Gil gil(pMyThreadState, true);
+
+ auto pValue = PyObject_CallMethod(pClassInstance,
+ const_cast<char*>("shutdown"), nullptr);
+
+ if (pValue != NULL) {
+ Py_DECREF(pValue);
+ } else {
+ derr << "Failed to invoke shutdown() on " << module_name << dendl;
+ derr << handle_pyerror() << dendl;
+ }
+}
+
+void PyModuleRunner::log(int level, const std::string &record)
+{
+#undef dout_prefix
+#define dout_prefix *_dout << "mgr[" << module_name << "] "
+ dout(level) << record << dendl;
+#undef dout_prefix
+#define dout_prefix *_dout << "mgr " << __func__ << " "
+}
+
+void* PyModuleRunner::PyModuleRunnerThread::entry()
+{
+ // No need to acquire the GIL here; the module does it.
+ dout(4) << "Entering thread for " << mod->get_name() << dendl;
+ mod->serve();
+ return nullptr;
+}
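+
+// Expected driving pattern (a sketch; StandbyPyModules/ActivePyModules are
+// assumed to do the equivalent when they start_one() a module):
+//
+//   runner->thread.create(thread_name);  // thread enters serve() above
+//   ...
+//   runner->shutdown();        // ask the module to drop out of serve()
+//   runner->thread.join();     // then reap the thread before destruction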
--- /dev/null
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2016 John Spray <john.spray@redhat.com>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ */
+
+
+#pragma once
+
+#include "common/Thread.h"
+#include "mgr/Gil.h"
+
+/**
+ * Implement the pattern of calling serve() on a module in a thread,
+ * until shutdown() is called.
+ */
+class PyModuleRunner
+{
+protected:
+ const std::string module_name;
+
+ // Passed in by whoever loaded our python module and looked up
+ // the symbols in it.
+ PyObject *pClass = nullptr;
+
+ // Passed in by whoever created our subinterpreter for us
+ SafeThreadState pMyThreadState = nullptr;
+
+ // Populated when we construct our instance of pClass in load()
+ PyObject *pClassInstance = nullptr;
+
+ class PyModuleRunnerThread : public Thread
+ {
+ PyModuleRunner *mod;
+
+ public:
+ PyModuleRunnerThread(PyModuleRunner *mod_)
+ : mod(mod_) {}
+
+ void *entry() override;
+ };
+
+public:
+ int serve();
+ void shutdown();
+ void log(int level, const std::string &record);
+
+ PyModuleRunner(
+ const std::string &module_name_,
+ PyObject *pClass_,
+ const SafeThreadState &pMyThreadState_)
+ :
+ module_name(module_name_),
+ pClass(pClass_), pMyThreadState(pMyThreadState_),
+ thread(this)
+ {
+ assert(pClass != nullptr);
+ assert(pMyThreadState.ts != nullptr);
+ assert(!module_name.empty());
+ }
+
+ ~PyModuleRunner();
+
+ PyModuleRunnerThread thread;
+
+ std::string const &get_name() const { return module_name; }
+};
+
+
+++ /dev/null
-// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
-// vim: ts=8 sw=2 smarttab
-/*
- * Ceph - scalable distributed file system
- *
- * Copyright (C) 2014 John Spray <john.spray@inktank.com>
- *
- * This is free software; you can redistribute it and/or
- * modify it under the terms of the GNU Lesser General Public
- * License version 2.1, as published by the Free Software
- * Foundation. See file COPYING.
- */
-
-// Include this first to get python headers earlier
-#include "PyState.h"
-#include "Gil.h"
-
-#include "common/errno.h"
-#include "include/stringify.h"
-
-#include "PyFormatter.h"
-
-#include "osd/OSDMap.h"
-#include "mon/MonMap.h"
-
-#include "mgr/MgrContext.h"
-
-#include "PyModules.h"
-
-#define dout_context g_ceph_context
-#define dout_subsys ceph_subsys_mgr
-#undef dout_prefix
-#define dout_prefix *_dout << "mgr " << __func__ << " "
-
-// definition for non-const static member
-std::string PyModules::config_prefix;
-
-// constructor/destructor implementations cannot be in .h,
-// because ServeThread is still an "incomplete" type there
-
-PyModules::PyModules(DaemonStateIndex &ds, ClusterState &cs,
- MonClient &mc, LogChannelRef clog_, Objecter &objecter_,
- Client &client_, Finisher &f)
- : daemon_state(ds), cluster_state(cs), monc(mc), clog(clog_),
- objecter(objecter_), client(client_), finisher(f),
- lock("PyModules")
-{}
-
-PyModules::~PyModules() = default;
-
-void PyModules::dump_server(const std::string &hostname,
- const DaemonStateCollection &dmc,
- Formatter *f)
-{
- f->dump_string("hostname", hostname);
- f->open_array_section("services");
- std::string ceph_version;
-
- for (const auto &i : dmc) {
- Mutex::Locker l(i.second->lock);
- const auto &key = i.first;
- const std::string &str_type = key.first;
- const std::string &svc_name = key.second;
-
- // TODO: pick the highest version, and make sure that
- // somewhere else (during health reporting?) we are
- // indicating to the user if we see mixed versions
- auto ver_iter = i.second->metadata.find("ceph_version");
- if (ver_iter != i.second->metadata.end()) {
- ceph_version = i.second->metadata.at("ceph_version");
- }
-
- f->open_object_section("service");
- f->dump_string("type", str_type);
- f->dump_string("id", svc_name);
- f->close_section();
- }
- f->close_section();
-
- f->dump_string("ceph_version", ceph_version);
-}
-
-
-
-PyObject *PyModules::get_server_python(const std::string &hostname)
-{
- PyThreadState *tstate = PyEval_SaveThread();
- Mutex::Locker l(lock);
- PyEval_RestoreThread(tstate);
- dout(10) << " (" << hostname << ")" << dendl;
-
- auto dmc = daemon_state.get_by_server(hostname);
-
- PyFormatter f;
- dump_server(hostname, dmc, &f);
- return f.get();
-}
-
-
-PyObject *PyModules::list_servers_python()
-{
- PyThreadState *tstate = PyEval_SaveThread();
- Mutex::Locker l(lock);
- PyEval_RestoreThread(tstate);
- dout(10) << " >" << dendl;
-
- PyFormatter f(false, true);
- const auto &all = daemon_state.get_all_servers();
- for (const auto &i : all) {
- const auto &hostname = i.first;
-
- f.open_object_section("server");
- dump_server(hostname, i.second, &f);
- f.close_section();
- }
-
- return f.get();
-}
-
-PyObject *PyModules::get_metadata_python(
- std::string const &handle,
- const std::string &svc_name,
- const std::string &svc_id)
-{
- auto metadata = daemon_state.get(DaemonKey(svc_name, svc_id));
- Mutex::Locker l(metadata->lock);
- PyFormatter f;
- f.dump_string("hostname", metadata->hostname);
- for (const auto &i : metadata->metadata) {
- f.dump_string(i.first.c_str(), i.second);
- }
-
- return f.get();
-}
-
-PyObject *PyModules::get_daemon_status_python(
- std::string const &handle,
- const std::string &svc_name,
- const std::string &svc_id)
-{
- auto metadata = daemon_state.get(DaemonKey(svc_name, svc_id));
- Mutex::Locker l(metadata->lock);
- PyFormatter f;
- for (const auto &i : metadata->service_status) {
- f.dump_string(i.first.c_str(), i.second);
- }
- return f.get();
-}
-
-PyObject *PyModules::get_python(const std::string &what)
-{
- PyThreadState *tstate = PyEval_SaveThread();
- Mutex::Locker l(lock);
- PyEval_RestoreThread(tstate);
-
- if (what == "fs_map") {
- PyFormatter f;
- cluster_state.with_fsmap([&f](const FSMap &fsmap) {
- fsmap.dump(&f);
- });
- return f.get();
- } else if (what == "osdmap_crush_map_text") {
- bufferlist rdata;
- cluster_state.with_osdmap([&rdata](const OSDMap &osd_map){
- osd_map.crush->encode(rdata, CEPH_FEATURES_SUPPORTED_DEFAULT);
- });
- std::string crush_text = rdata.to_str();
- return PyString_FromString(crush_text.c_str());
- } else if (what.substr(0, 7) == "osd_map") {
- PyFormatter f;
- cluster_state.with_osdmap([&f, &what](const OSDMap &osd_map){
- if (what == "osd_map") {
- osd_map.dump(&f);
- } else if (what == "osd_map_tree") {
- osd_map.print_tree(&f, nullptr);
- } else if (what == "osd_map_crush") {
- osd_map.crush->dump(&f);
- }
- });
- return f.get();
- } else if (what == "config") {
- PyFormatter f;
- g_conf->show_config(&f);
- return f.get();
- } else if (what == "mon_map") {
- PyFormatter f;
- cluster_state.with_monmap(
- [&f](const MonMap &monmap) {
- monmap.dump(&f);
- }
- );
- return f.get();
- } else if (what == "service_map") {
- PyFormatter f;
- cluster_state.with_servicemap(
- [&f](const ServiceMap &service_map) {
- service_map.dump(&f);
- }
- );
- return f.get();
- } else if (what == "osd_metadata") {
- PyFormatter f;
- auto dmc = daemon_state.get_by_service("osd");
- for (const auto &i : dmc) {
- Mutex::Locker l(i.second->lock);
- f.open_object_section(i.first.second.c_str());
- f.dump_string("hostname", i.second->hostname);
- for (const auto &j : i.second->metadata) {
- f.dump_string(j.first.c_str(), j.second);
- }
- f.close_section();
- }
- return f.get();
- } else if (what == "pg_summary") {
- PyFormatter f;
- cluster_state.with_pgmap(
- [&f](const PGMap &pg_map) {
- std::map<std::string, std::map<std::string, uint32_t> > osds;
- std::map<std::string, std::map<std::string, uint32_t> > pools;
- std::map<std::string, uint32_t> all;
- for (const auto &i : pg_map.pg_stat) {
- const auto pool = i.first.m_pool;
- const std::string state = pg_state_string(i.second.state);
- // Insert to per-pool map
- pools[stringify(pool)][state]++;
- for (const auto &osd_id : i.second.acting) {
- osds[stringify(osd_id)][state]++;
- }
- all[state]++;
- }
- f.open_object_section("by_osd");
- for (const auto &i : osds) {
- f.open_object_section(i.first.c_str());
- for (const auto &j : i.second) {
- f.dump_int(j.first.c_str(), j.second);
- }
- f.close_section();
- }
- f.close_section();
- f.open_object_section("by_pool");
- for (const auto &i : pools) {
- f.open_object_section(i.first.c_str());
- for (const auto &j : i.second) {
- f.dump_int(j.first.c_str(), j.second);
- }
- f.close_section();
- }
- f.close_section();
- f.open_object_section("all");
- for (const auto &i : all) {
- f.dump_int(i.first.c_str(), i.second);
- }
- f.close_section();
- }
- );
- return f.get();
-
- } else if (what == "df") {
- PyFormatter f;
-
- cluster_state.with_osdmap([this, &f](const OSDMap &osd_map){
- cluster_state.with_pgmap(
- [&osd_map, &f](const PGMap &pg_map) {
- pg_map.dump_fs_stats(nullptr, &f, true);
- pg_map.dump_pool_stats_full(osd_map, nullptr, &f, true);
- });
- });
- return f.get();
- } else if (what == "osd_stats") {
- PyFormatter f;
- cluster_state.with_pgmap(
- [&f](const PGMap &pg_map) {
- pg_map.dump_osd_stats(&f);
- });
- return f.get();
- } else if (what == "health" || what == "mon_status") {
- PyFormatter f;
- bufferlist json;
- if (what == "health") {
- json = cluster_state.get_health();
- } else if (what == "mon_status") {
- json = cluster_state.get_mon_status();
- } else {
- assert(false);
- }
- f.dump_string("json", json.to_str());
- return f.get();
- } else if (what == "mgr_map") {
- PyFormatter f;
- cluster_state.with_mgrmap([&f](const MgrMap &mgr_map) {
- mgr_map.dump(&f);
- });
- return f.get();
- } else {
- derr << "Python module requested unknown data '" << what << "'" << dendl;
- Py_RETURN_NONE;
- }
-}
-
-std::string PyModules::get_site_packages()
-{
- std::stringstream site_packages;
-
- // CPython doesn't auto-add site-packages dirs to sys.path for us,
- // but it does provide a module that we can ask for them.
- auto site_module = PyImport_ImportModule("site");
- assert(site_module);
-
- auto site_packages_fn = PyObject_GetAttrString(site_module, "getsitepackages");
- if (site_packages_fn != nullptr) {
- auto site_packages_list = PyObject_CallObject(site_packages_fn, nullptr);
- assert(site_packages_list);
-
- auto n = PyList_Size(site_packages_list);
- for (Py_ssize_t i = 0; i < n; ++i) {
- if (i != 0) {
- site_packages << ":";
- }
- site_packages << PyString_AsString(PyList_GetItem(site_packages_list, i));
- }
-
- Py_DECREF(site_packages_list);
- Py_DECREF(site_packages_fn);
- } else {
- // Fall back to generating our own site-packages paths by imitating
- // what the standard site.py does. This is annoying but it lets us
- // run inside virtualenvs :-/
-
- auto site_packages_fn = PyObject_GetAttrString(site_module, "addsitepackages");
- assert(site_packages_fn);
-
- auto known_paths = PySet_New(nullptr);
- auto pArgs = PyTuple_Pack(1, known_paths);
- PyObject_CallObject(site_packages_fn, pArgs);
- Py_DECREF(pArgs);
- Py_DECREF(known_paths);
- Py_DECREF(site_packages_fn);
-
- auto sys_module = PyImport_ImportModule("sys");
- assert(sys_module);
- auto sys_path = PyObject_GetAttrString(sys_module, "path");
- assert(sys_path);
-
- dout(1) << "sys.path:" << dendl;
- auto n = PyList_Size(sys_path);
- bool first = true;
- for (Py_ssize_t i = 0; i < n; ++i) {
- dout(1) << " " << PyString_AsString(PyList_GetItem(sys_path, i)) << dendl;
- if (first) {
- first = false;
- } else {
- site_packages << ":";
- }
- site_packages << PyString_AsString(PyList_GetItem(sys_path, i));
- }
-
- Py_DECREF(sys_path);
- Py_DECREF(sys_module);
- }
-
- Py_DECREF(site_module);
-
- return site_packages.str();
-}
-
-
-int PyModules::init()
-{
- Mutex::Locker locker(lock);
-
- global_handle = this;
- // namespace in config-key prefixed by "mgr/"
- config_prefix = std::string(g_conf->name.get_type_str()) + "/";
-
- // Set up global python interpreter
- Py_SetProgramName(const_cast<char*>(PYTHON_EXECUTABLE));
- Py_InitializeEx(0);
-
- // Let CPython know that we will be calling it back from other
- // threads in future.
- if (! PyEval_ThreadsInitialized()) {
- PyEval_InitThreads();
- }
-
- // Configure sys.path to include mgr_module_path
- std::string sys_path = std::string(Py_GetPath()) + ":" + get_site_packages()
- + ":" + g_conf->mgr_module_path;
- dout(10) << "Computed sys.path '" << sys_path << "'" << dendl;
-
- // Drop the GIL and remember the main thread state (current
- // thread state becomes NULL)
- pMainThreadState = PyEval_SaveThread();
-
- std::list<std::string> failed_modules;
-
- // Load python code
- set<string> ls;
- cluster_state.with_mgrmap([&](const MgrMap& m) {
- ls = m.modules;
- });
- for (const auto& module_name : ls) {
- dout(1) << "Loading python module '" << module_name << "'" << dendl;
- auto mod = std::unique_ptr<MgrPyModule>(new MgrPyModule(module_name, sys_path, pMainThreadState));
- int r = mod->load();
- if (r != 0) {
- // Don't use handle_pyerror() here; we don't have the GIL
- // or the right thread state (this is deliberate).
- derr << "Error loading module '" << module_name << "': "
- << cpp_strerror(r) << dendl;
- failed_modules.push_back(module_name);
- // Don't drop out here, load the other modules
- } else {
- // Success!
- modules[module_name] = std::move(mod);
- }
- }
-
- if (!failed_modules.empty()) {
- clog->error() << "Failed to load ceph-mgr modules: " << joinify(
- failed_modules.begin(), failed_modules.end(), std::string(", "));
- }
-
- return 0;
-}
-
-class ServeThread : public Thread
-{
- MgrPyModule *mod;
-
-public:
- bool running;
-
- ServeThread(MgrPyModule *mod_)
- : mod(mod_) {}
-
- void *entry() override
- {
- running = true;
-
- // No need to acquire the GIL here; the module does it.
- dout(4) << "Entering thread for " << mod->get_name() << dendl;
- mod->serve();
-
- running = false;
- return nullptr;
- }
-};
-
-void PyModules::start()
-{
- Mutex::Locker l(lock);
-
- dout(1) << "Creating threads for " << modules.size() << " modules" << dendl;
- for (auto& i : modules) {
- auto thread = new ServeThread(i.second.get());
- serve_threads[i.first].reset(thread);
- }
-
- for (auto &i : serve_threads) {
- std::ostringstream thread_name;
- thread_name << "mgr." << i.first;
- dout(4) << "Starting thread for " << i.first << dendl;
- i.second->create(thread_name.str().c_str());
- }
-}
-
-void PyModules::shutdown()
-{
- Mutex::Locker locker(lock);
- assert(global_handle);
-
- // Signal modules to drop out of serve() and/or tear down resources
- for (auto &i : modules) {
- auto module = i.second.get();
- const auto& name = i.first;
- dout(10) << "waiting for module " << name << " to shutdown" << dendl;
- lock.Unlock();
- module->shutdown();
- lock.Lock();
- dout(10) << "module " << name << " shutdown" << dendl;
- }
-
- // For modules implementing serve(), finish the threads where we
- // were running that.
- for (auto &i : serve_threads) {
- lock.Unlock();
- i.second->join();
- lock.Lock();
- }
- serve_threads.clear();
-
- modules.clear();
-
- PyEval_RestoreThread(pMainThreadState);
- Py_Finalize();
-
- // nobody needs me anymore.
- global_handle = nullptr;
-}
-
-void PyModules::notify_all(const std::string ¬ify_type,
- const std::string ¬ify_id)
-{
- Mutex::Locker l(lock);
-
- dout(10) << __func__ << ": notify_all " << notify_type << dendl;
- for (auto& i : modules) {
- auto module = i.second.get();
- if (!serve_threads[i.first]->running)
- continue;
- // Send all python calls down a Finisher to avoid blocking
- // C++ code, and avoid any potential lock cycles.
- finisher.queue(new FunctionContext([module, notify_type, notify_id](int r){
- module->notify(notify_type, notify_id);
- }));
- }
-}
-
-void PyModules::notify_all(const LogEntry &log_entry)
-{
- Mutex::Locker l(lock);
-
- dout(10) << __func__ << ": notify_all (clog)" << dendl;
- for (auto& i : modules) {
- auto module = i.second.get();
- if (!serve_threads[i.first]->running)
- continue;
- // Send all python calls down a Finisher to avoid blocking
- // C++ code, and avoid any potential lock cycles.
- //
- // Note intentional use of non-reference lambda binding on
- // log_entry: we take a copy because caller's instance is
- // probably ephemeral.
- finisher.queue(new FunctionContext([module, log_entry](int r){
- module->notify_clog(log_entry);
- }));
- }
-}
-
-bool PyModules::get_config(const std::string &handle,
- const std::string &key, std::string *val) const
-{
- PyThreadState *tstate = PyEval_SaveThread();
- Mutex::Locker l(lock);
- PyEval_RestoreThread(tstate);
-
- const std::string global_key = config_prefix + handle + "/" + key;
-
- dout(4) << __func__ << "key: " << global_key << dendl;
-
- if (config_cache.count(global_key)) {
- *val = config_cache.at(global_key);
- return true;
- } else {
- return false;
- }
-}
-
-PyObject *PyModules::get_config_prefix(const std::string &handle,
- const std::string &prefix) const
-{
- PyThreadState *tstate = PyEval_SaveThread();
- Mutex::Locker l(lock);
- PyEval_RestoreThread(tstate);
-
- const std::string base_prefix = config_prefix + handle + "/";
- const std::string global_prefix = base_prefix + prefix;
- dout(4) << __func__ << "prefix: " << global_prefix << dendl;
-
- PyFormatter f;
- for (auto p = config_cache.lower_bound(global_prefix);
- p != config_cache.end() && p->first.find(global_prefix) == 0;
- ++p) {
- f.dump_string(p->first.c_str() + base_prefix.size(), p->second);
- }
- return f.get();
-}
-
-void PyModules::set_config(const std::string &handle,
- const std::string &key, const boost::optional<std::string>& val)
-{
- const std::string global_key = config_prefix + handle + "/" + key;
-
- Command set_cmd;
- {
- PyThreadState *tstate = PyEval_SaveThread();
- Mutex::Locker l(lock);
- PyEval_RestoreThread(tstate);
- if (val) {
- config_cache[global_key] = *val;
- } else {
- config_cache.erase(global_key);
- }
-
- std::ostringstream cmd_json;
- JSONFormatter jf;
- jf.open_object_section("cmd");
- if (val) {
- jf.dump_string("prefix", "config-key set");
- jf.dump_string("key", global_key);
- jf.dump_string("val", *val);
- } else {
- jf.dump_string("prefix", "config-key del");
- jf.dump_string("key", global_key);
- }
- jf.close_section();
- jf.flush(cmd_json);
- set_cmd.run(&monc, cmd_json.str());
- }
- set_cmd.wait();
-
- if (set_cmd.r != 0) {
- // config-key set will fail if mgr's auth key has insufficient
- // permission to set config keys
- // FIXME: should this somehow raise an exception back into Python land?
- dout(0) << "`config-key set " << global_key << " " << val << "` failed: "
- << cpp_strerror(set_cmd.r) << dendl;
- dout(0) << "mon returned " << set_cmd.r << ": " << set_cmd.outs << dendl;
- }
-}
-
-std::vector<ModuleCommand> PyModules::get_py_commands() const
-{
- Mutex::Locker l(lock);
-
- std::vector<ModuleCommand> result;
- for (const auto& i : modules) {
- auto module = i.second.get();
- auto mod_commands = module->get_commands();
- for (auto j : mod_commands) {
- result.push_back(j);
- }
- }
-
- return result;
-}
-
-std::vector<MonCommand> PyModules::get_commands() const
-{
- std::vector<ModuleCommand> commands = get_py_commands();
- std::vector<MonCommand> result;
- for (auto &pyc: commands) {
- result.push_back({pyc.cmdstring, pyc.helpstring, "mgr",
- pyc.perm, "cli", MonCommand::FLAG_MGR});
- }
- return result;
-}
-
-void PyModules::insert_config(const std::map<std::string,
- std::string> &new_config)
-{
- Mutex::Locker l(lock);
-
- dout(4) << "Loaded " << new_config.size() << " config settings" << dendl;
- config_cache = new_config;
-}
-
-void PyModules::log(const std::string &handle,
- int level, const std::string &record)
-{
-#undef dout_prefix
-#define dout_prefix *_dout << "mgr[" << handle << "] "
- dout(level) << record << dendl;
-#undef dout_prefix
-#define dout_prefix *_dout << "mgr " << __func__ << " "
-}
-
-PyObject* PyModules::get_counter_python(
- const std::string &handle,
- const std::string &svc_name,
- const std::string &svc_id,
- const std::string &path)
-{
- PyThreadState *tstate = PyEval_SaveThread();
- Mutex::Locker l(lock);
- PyEval_RestoreThread(tstate);
-
- PyFormatter f;
- f.open_array_section(path.c_str());
-
- auto metadata = daemon_state.get(DaemonKey(svc_name, svc_id));
-
- Mutex::Locker l2(metadata->lock);
- if (metadata) {
- if (metadata->perf_counters.instances.count(path)) {
- auto counter_instance = metadata->perf_counters.instances.at(path);
- const auto &data = counter_instance.get_data();
- for (const auto &datapoint : data) {
- f.open_array_section("datapoint");
- f.dump_unsigned("t", datapoint.t.sec());
- f.dump_unsigned("v", datapoint.v);
- f.close_section();
-
- }
- } else {
- dout(4) << "Missing counter: '" << path << "' ("
- << svc_name << "." << svc_id << ")" << dendl;
- dout(20) << "Paths are:" << dendl;
- for (const auto &i : metadata->perf_counters.instances) {
- dout(20) << i.first << dendl;
- }
- }
- } else {
- dout(4) << "No daemon state for "
- << svc_name << "." << svc_id << ")" << dendl;
- }
- f.close_section();
- return f.get();
-}
-
-PyObject* PyModules::get_perf_schema_python(
- const std::string &handle,
- const std::string svc_type,
- const std::string &svc_id)
-{
- PyThreadState *tstate = PyEval_SaveThread();
- Mutex::Locker l(lock);
- PyEval_RestoreThread(tstate);
-
- DaemonStateCollection states;
-
- if (svc_type == "") {
- states = daemon_state.get_all();
- } else if (svc_id.empty()) {
- states = daemon_state.get_by_service(svc_type);
- } else {
- auto key = DaemonKey(svc_type, svc_id);
- // so that the below can be a loop in all cases
- if (daemon_state.exists(key)) {
- states[key] = daemon_state.get(key);
- }
- }
-
- PyFormatter f;
- f.open_object_section("perf_schema");
-
- // FIXME: this is unsafe, I need to either be inside DaemonStateIndex's
- // lock or put a lock on individual DaemonStates
- if (!states.empty()) {
- for (auto statepair : states) {
- std::ostringstream daemon_name;
- auto key = statepair.first;
- auto state = statepair.second;
- Mutex::Locker l(state->lock);
- daemon_name << key.first << "." << key.second;
- f.open_object_section(daemon_name.str().c_str());
-
- for (auto typestr : state->perf_counters.declared_types) {
- f.open_object_section(typestr.c_str());
- auto type = state->perf_counters.types[typestr];
- f.dump_string("description", type.description);
- if (!type.nick.empty()) {
- f.dump_string("nick", type.nick);
- }
- f.dump_unsigned("type", type.type);
- f.close_section();
- }
- f.close_section();
- }
- } else {
- dout(4) << __func__ << ": No daemon state found for "
- << svc_type << "." << svc_id << ")" << dendl;
- }
- f.close_section();
- return f.get();
-}
-
-PyObject *PyModules::get_context()
-{
- PyThreadState *tstate = PyEval_SaveThread();
- Mutex::Locker l(lock);
- PyEval_RestoreThread(tstate);
-
- // Construct a capsule containing ceph context.
- // Not incrementing/decrementing ref count on the context because
- // it's the global one and it has process lifetime.
- auto capsule = PyCapsule_New(g_ceph_context, nullptr, nullptr);
- return capsule;
-}
-
-static void _list_modules(
- const std::string path,
- std::set<std::string> *modules)
-{
- DIR *dir = opendir(path.c_str());
- if (!dir) {
- return;
- }
- struct dirent *entry = NULL;
- while ((entry = readdir(dir)) != NULL) {
- string n(entry->d_name);
- string fn = path + "/" + n;
- struct stat st;
- int r = ::stat(fn.c_str(), &st);
- if (r == 0 && S_ISDIR(st.st_mode)) {
- string initfn = fn + "/module.py";
- r = ::stat(initfn.c_str(), &st);
- if (r == 0) {
- modules->insert(n);
- }
- }
- }
- closedir(dir);
-}
-
-void PyModules::list_modules(std::set<std::string> *modules)
-{
- _list_modules(g_conf->mgr_module_path, modules);
-}
-
-void PyModules::set_health_checks(const std::string& handle,
- health_check_map_t&& checks)
-{
- Mutex::Locker l(lock);
- auto p = modules.find(handle);
- if (p != modules.end()) {
- p->second->set_health_checks(std::move(checks));
- }
-}
-
-void PyModules::get_health_checks(health_check_map_t *checks)
-{
- Mutex::Locker l(lock);
- for (auto& p : modules) {
- p.second->get_health_checks(checks);
- }
-}
+++ /dev/null
-// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
-// vim: ts=8 sw=2 smarttab
-/*
- * Ceph - scalable distributed file system
- *
- * Copyright (C) 2014 John Spray <john.spray@inktank.com>
- *
- * This is free software; you can redistribute it and/or
- * modify it under the terms of the GNU Lesser General Public
- * License version 2.1, as published by the Free Software
- * Foundation. See file COPYING.
- */
-
-#ifndef PY_MODULES_H_
-#define PY_MODULES_H_
-
-#include "MgrPyModule.h"
-
-#include "common/Finisher.h"
-#include "common/Mutex.h"
-#include "common/Thread.h"
-
-#include "osdc/Objecter.h"
-#include "client/Client.h"
-#include "common/LogClient.h"
-#include "mon/MgrMap.h"
-#include "mon/MonCommand.h"
-
-#include "DaemonState.h"
-#include "ClusterState.h"
-
-class ServeThread;
-class health_check_map_t;
-
-class PyModules
-{
- std::map<std::string, std::unique_ptr<MgrPyModule>> modules;
- std::map<std::string, std::unique_ptr<ServeThread>> serve_threads;
- DaemonStateIndex &daemon_state;
- ClusterState &cluster_state;
- MonClient &monc;
- LogChannelRef clog;
- Objecter &objecter;
- Client &client;
- Finisher &finisher;
-
- mutable Mutex lock{"PyModules::lock"};
-
- std::string get_site_packages();
-
- PyThreadState *pMainThreadState = nullptr;
-
-public:
- static std::string config_prefix;
-
- PyModules(DaemonStateIndex &ds, ClusterState &cs, MonClient &mc,
- LogChannelRef clog_, Objecter &objecter_, Client &client_,
- Finisher &f);
-
- ~PyModules();
-
- // FIXME: wrap for send_command?
- MonClient &get_monc() {return monc;}
- Objecter &get_objecter() {return objecter;}
- Client &get_client() {return client;}
-
- PyObject *get_python(const std::string &what);
- PyObject *get_server_python(const std::string &hostname);
- PyObject *list_servers_python();
- PyObject *get_metadata_python(
- std::string const &handle,
- const std::string &svc_name, const std::string &svc_id);
- PyObject *get_daemon_status_python(
- std::string const &handle,
- const std::string &svc_name, const std::string &svc_id);
- PyObject *get_counter_python(
- std::string const &handle,
- const std::string &svc_name,
- const std::string &svc_id,
- const std::string &path);
- PyObject *get_perf_schema_python(
- const std::string &handle,
- const std::string svc_type,
- const std::string &svc_id);
- PyObject *get_context();
-
- std::map<std::string, std::string> config_cache;
-
- // Python command definitions, including callback
- std::vector<ModuleCommand> get_py_commands() const;
-
- // Monitor command definitions, suitable for CLI
- std::vector<MonCommand> get_commands() const;
-
- void insert_config(const std::map<std::string, std::string> &new_config);
-
- // Public so that MonCommandCompletion can use it
- // FIXME: for send_command completion notifications,
- // send it to only the module that sent the command, not everyone
- void notify_all(const std::string ¬ify_type,
- const std::string ¬ify_id);
- void notify_all(const LogEntry &log_entry);
-
- int init();
- void start();
- void shutdown();
-
- void dump_server(const std::string &hostname,
- const DaemonStateCollection &dmc,
- Formatter *f);
-
- bool get_config(const std::string &handle,
- const std::string &key, std::string *val) const;
- PyObject *get_config_prefix(const std::string &handle,
- const std::string &prefix) const;
- void set_config(const std::string &handle,
- const std::string &key, const boost::optional<std::string> &val);
-
- void set_health_checks(const std::string& handle,
- health_check_map_t&& checks);
- void get_health_checks(health_check_map_t *checks);
-
- void log(const std::string &handle,
- int level, const std::string &record);
-
- static void list_modules(std::set<std::string> *modules);
-};
-
-#endif
-
--- /dev/null
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#include "Mgr.h"
+
+#include "osd/OSDMap.h"
+#include "common/errno.h"
+#include "common/version.h"
+#include "include/stringify.h"
+
+#include "PyOSDMap.h"
+#include "PyFormatter.h"
+#include "Gil.h"
+
+#define dout_context g_ceph_context
+#define dout_subsys ceph_subsys_mgr
+
+
+typedef struct {
+ PyObject_HEAD
+ OSDMap *osdmap;
+} BasePyOSDMap;
+
+typedef struct {
+ PyObject_HEAD
+ OSDMap::Incremental *inc;
+} BasePyOSDMapIncremental;
+
+typedef struct {
+ PyObject_HEAD
+ ceph::shared_ptr<CrushWrapper> crush;
+} BasePyCRUSH;
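+
+// Each of these structs is a Python object (note PyObject_HEAD) wrapping a
+// native Ceph object. Instances are built from a PyCapsule that smuggles
+// the native pointer through the Python-level constructor into the
+// tp_init functions below; see construct_with_capsule() in PyOSDMap.h.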
+
+// ----------
+
+static PyObject *osdmap_get_epoch(BasePyOSDMap *self, PyObject *obj)
+{
+ return PyInt_FromLong(self->osdmap->get_epoch());
+}
+
+static PyObject *osdmap_get_crush_version(BasePyOSDMap* self, PyObject *obj)
+{
+ return PyInt_FromLong(self->osdmap->get_crush_version());
+}
+
+static PyObject *osdmap_dump(BasePyOSDMap* self, PyObject *obj)
+{
+ PyFormatter f;
+ self->osdmap->dump(&f);
+ return f.get();
+}
+
+static PyObject *osdmap_new_incremental(BasePyOSDMap *self, PyObject *obj)
+{
+ OSDMap::Incremental *inc = new OSDMap::Incremental;
+
+ inc->fsid = self->osdmap->get_fsid();
+ inc->epoch = self->osdmap->get_epoch() + 1;
+ // always include latest crush map here... this is okay since we never
+ // actually use this map in the real world (and even if we did it would
+ // be a no-op).
+ self->osdmap->crush->encode(inc->crush, CEPH_FEATURES_ALL);
+ dout(10) << __func__ << " " << inc << dendl;
+
+ return construct_with_capsule("mgr_module", "OSDMapIncremental",
+ (void*)(inc));
+}
+
+static PyObject *osdmap_apply_incremental(BasePyOSDMap *self,
+ BasePyOSDMapIncremental *incobj)
+{
+  if (!PyObject_TypeCheck(incobj, &BasePyOSDMapIncrementalType)) {
+    derr << "Wrong type in osdmap_apply_incremental!" << dendl;
+    // Returning nullptr without an exception set would surface as a bare
+    // SystemError in Python, so raise a TypeError explicitly.
+    PyErr_SetString(PyExc_TypeError, "expected an OSDMapIncremental");
+    return nullptr;
+  }
+
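+  // Clone the current map by round-tripping it through encode/decode, so
+  // the incremental is applied to a private copy rather than mutating the
+  // OSDMap this wrapper still references.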
+ bufferlist bl;
+ self->osdmap->encode(bl, CEPH_FEATURES_ALL|CEPH_FEATURE_RESERVED);
+ OSDMap *next = new OSDMap;
+ next->decode(bl);
+ next->apply_incremental(*(incobj->inc));
+ dout(10) << __func__ << " map " << self->osdmap << " inc " << incobj->inc
+ << " next " << next << dendl;
+
+ return construct_with_capsule("mgr_module", "OSDMap", (void*)next);
+}
+
+static PyObject *osdmap_get_crush(BasePyOSDMap* self, PyObject *obj)
+{
+ return construct_with_capsule("mgr_module", "CRUSHMap",
+ (void*)(&(self->osdmap->crush)));
+}
+
+static PyObject *osdmap_get_pools_by_take(BasePyOSDMap* self, PyObject *args)
+{
+ int take;
+ if (!PyArg_ParseTuple(args, "i:get_pools_by_take",
+ &take)) {
+ return nullptr;
+ }
+
+ PyFormatter f;
+ f.open_array_section("pools");
+ for (auto& p : self->osdmap->get_pools()) {
+ if (self->osdmap->crush->rule_has_take(p.second.crush_rule, take)) {
+ f.dump_int("pool", p.first);
+ }
+ }
+ f.close_section();
+ return f.get();
+}
+
+static PyObject *osdmap_calc_pg_upmaps(BasePyOSDMap* self, PyObject *args)
+{
+ PyObject *pool_list;
+ BasePyOSDMapIncremental *incobj;
+ double max_deviation = 0;
+ int max_iterations = 0;
+ if (!PyArg_ParseTuple(args, "OdiO:calc_pg_upmaps",
+ &incobj, &max_deviation,
+ &max_iterations, &pool_list)) {
+ return nullptr;
+ }
+
+ dout(10) << __func__ << " osdmap " << self->osdmap << " inc " << incobj->inc
+ << " max_deviation " << max_deviation
+ << " max_iterations " << max_iterations
+ << dendl;
+ set<int64_t> pools;
+ // FIXME: unpack pool_list and translate to pools set
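+  // (an empty set appears to mean "consider all pools" downstream)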
+ int r = self->osdmap->calc_pg_upmaps(g_ceph_context,
+ max_deviation,
+ max_iterations,
+ pools,
+ incobj->inc);
+ dout(10) << __func__ << " r = " << r << dendl;
+ return PyInt_FromLong(r);
+}
+
+static PyObject *osdmap_map_pool_pgs_up(BasePyOSDMap* self, PyObject *args)
+{
+ int poolid;
+ if (!PyArg_ParseTuple(args, "i:map_pool_pgs_up",
+ &poolid)) {
+ return nullptr;
+ }
+ auto pi = self->osdmap->get_pg_pool(poolid);
+  if (!pi) {
+    // raise ValueError instead of a bare error return
+    PyErr_Format(PyExc_ValueError, "pool %d does not exist", poolid);
+    return nullptr;
+  }
+ map<pg_t,vector<int>> pm;
+ for (unsigned ps = 0; ps < pi->get_pg_num(); ++ps) {
+ pg_t pgid(ps, poolid);
+ self->osdmap->pg_to_up_acting_osds(pgid, &pm[pgid], nullptr, nullptr, nullptr);
+ }
+ PyFormatter f;
+ for (auto p : pm) {
+ string pg = stringify(p.first);
+ f.open_array_section(pg.c_str());
+ for (auto o : p.second) {
+ f.dump_int("osd", o);
+ }
+ f.close_section();
+ }
+ return f.get();
+}
+
+static int
+BasePyOSDMap_init(BasePyOSDMap *self, PyObject *args, PyObject *kwds)
+{
+ PyObject *osdmap_capsule = nullptr;
+ static const char *kwlist[] = {"osdmap_capsule", NULL};
+
+ if (! PyArg_ParseTupleAndKeywords(args, kwds, "O",
+ const_cast<char**>(kwlist),
+ &osdmap_capsule)) {
+ assert(0);
+ return -1;
+ }
+ assert(PyObject_TypeCheck(osdmap_capsule, &PyCapsule_Type));
+
+ self->osdmap = (OSDMap*)PyCapsule_GetPointer(
+ osdmap_capsule, nullptr);
+ assert(self->osdmap);
+
+ return 0;
+}
+
+
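+// Ownership note: tp_init above took the raw OSDMap* out of the capsule;
+// the wrapper owns it from then on, and this dealloc deletes it when the
+// Python object is collected.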
+static void
+BasePyOSDMap_dealloc(BasePyOSDMap *self)
+{
+ if (self->osdmap) {
+ delete self->osdmap;
+ self->osdmap = nullptr;
+ } else {
+ derr << "Destroying improperly initialized BasePyOSDMap " << self << dendl;
+ }
+ Py_TYPE(self)->tp_free(self);
+}
+
+
+PyMethodDef BasePyOSDMap_methods[] = {
+ {"_get_epoch", (PyCFunction)osdmap_get_epoch, METH_NOARGS, "Get OSDMap epoch"},
+ {"_get_crush_version", (PyCFunction)osdmap_get_crush_version, METH_NOARGS,
+ "Get CRUSH version"},
+ {"_dump", (PyCFunction)osdmap_dump, METH_NOARGS, "Dump OSDMap::Incremental"},
+ {"_new_incremental", (PyCFunction)osdmap_new_incremental, METH_NOARGS,
+ "Create OSDMap::Incremental"},
+ {"_apply_incremental", (PyCFunction)osdmap_apply_incremental, METH_O,
+ "Apply OSDMap::Incremental and return the resulting OSDMap"},
+ {"_get_crush", (PyCFunction)osdmap_get_crush, METH_NOARGS, "Get CrushWrapper"},
+ {"_get_pools_by_take", (PyCFunction)osdmap_get_pools_by_take, METH_VARARGS,
+ "Get pools that have CRUSH rules that TAKE the given root"},
+ {"_calc_pg_upmaps", (PyCFunction)osdmap_calc_pg_upmaps, METH_VARARGS,
+ "Calculate new pg-upmap values"},
+ {"_map_pool_pgs_up", (PyCFunction)osdmap_map_pool_pgs_up, METH_VARARGS,
+ "Calculate up set mappings for all PGs in a pool"},
+ {NULL, NULL, 0, NULL}
+};
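+
+// These underscore-prefixed entry points are plumbing for a friendlier
+// Python-side wrapper class; illustrative use from a mgr module (method
+// and accessor names assumed, not taken from this file):
+//
+//   osdmap = self.get_osdmap()             # a wrapped BasePyOSDMap
+//   inc = osdmap._new_incremental()        # OSDMap::Incremental wrapper
+//   r = osdmap._calc_pg_upmaps(inc, 0.01, 10, [])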
+
+PyTypeObject BasePyOSDMapType = {
+ PyVarObject_HEAD_INIT(NULL, 0)
+ "ceph_module.BasePyOSDMap", /* tp_name */
+ sizeof(BasePyOSDMap), /* tp_basicsize */
+ 0, /* tp_itemsize */
+ (destructor)BasePyOSDMap_dealloc, /* tp_dealloc */
+ 0, /* tp_print */
+ 0, /* tp_getattr */
+ 0, /* tp_setattr */
+ 0, /* tp_compare */
+ 0, /* tp_repr */
+ 0, /* tp_as_number */
+ 0, /* tp_as_sequence */
+ 0, /* tp_as_mapping */
+ 0, /* tp_hash */
+ 0, /* tp_call */
+ 0, /* tp_str */
+ 0, /* tp_getattro */
+ 0, /* tp_setattro */
+ 0, /* tp_as_buffer */
+ Py_TPFLAGS_DEFAULT | Py_TPFLAGS_BASETYPE, /* tp_flags */
+ "Ceph OSDMap", /* tp_doc */
+ 0, /* tp_traverse */
+ 0, /* tp_clear */
+ 0, /* tp_richcompare */
+ 0, /* tp_weaklistoffset */
+ 0, /* tp_iter */
+ 0, /* tp_iternext */
+ BasePyOSDMap_methods, /* tp_methods */
+ 0, /* tp_members */
+ 0, /* tp_getset */
+ 0, /* tp_base */
+ 0, /* tp_dict */
+ 0, /* tp_descr_get */
+ 0, /* tp_descr_set */
+ 0, /* tp_dictoffset */
+ (initproc)BasePyOSDMap_init, /* tp_init */
+ 0, /* tp_alloc */
+ 0, /* tp_new */
+};
+
+// ----------
+
+
+static int
+BasePyOSDMapIncremental_init(BasePyOSDMapIncremental *self,
+ PyObject *args, PyObject *kwds)
+{
+ PyObject *inc_capsule = nullptr;
+ static const char *kwlist[] = {"inc_capsule", NULL};
+
+ if (! PyArg_ParseTupleAndKeywords(args, kwds, "O",
+ const_cast<char**>(kwlist),
+ &inc_capsule)) {
+ assert(0);
+ return -1;
+ }
+ assert(PyObject_TypeCheck(inc_capsule, &PyCapsule_Type));
+
+ self->inc = (OSDMap::Incremental*)PyCapsule_GetPointer(
+ inc_capsule, nullptr);
+ assert(self->inc);
+
+ return 0;
+}
+
+static void
+BasePyOSDMapIncremental_dealloc(BasePyOSDMapIncremental *self)
+{
+ if (self->inc) {
+ delete self->inc;
+ self->inc = nullptr;
+ } else {
+ derr << "Destroying improperly initialized BasePyOSDMap " << self << dendl;
+ }
+ Py_TYPE(self)->tp_free(self);
+}
+
+static PyObject *osdmap_inc_get_epoch(BasePyOSDMapIncremental *self,
+ PyObject *obj)
+{
+ return PyInt_FromLong(self->inc->epoch);
+}
+
+static PyObject *osdmap_inc_dump(BasePyOSDMapIncremental *self,
+ PyObject *obj)
+{
+ PyFormatter f;
+ self->inc->dump(&f);
+ return f.get();
+}
+
+static int get_int_float_map(PyObject *obj, map<int,double> *out)
+{
+  if (!PyDict_Check(obj)) {
+    // Raise instead of crashing in PyDict_Items on a non-dict argument.
+    PyErr_SetString(PyExc_TypeError, "expected a dict");
+    return -1;
+  }
+  PyObject *ls = PyDict_Items(obj);
+ for (int j = 0; j < PyList_Size(ls); ++j) {
+ PyObject *pair = PyList_GET_ITEM(ls, j);
+ if (!PyTuple_Check(pair)) {
+ derr << __func__ << " item " << j << " not a tuple" << dendl;
+ Py_DECREF(ls);
+ return -1;
+ }
+ int k;
+ double v;
+ if (!PyArg_ParseTuple(pair, "id:pair", &k, &v)) {
+ derr << __func__ << " item " << j << " not a size 2 tuple" << dendl;
+ Py_DECREF(ls);
+ return -1;
+ }
+ (*out)[k] = v;
+ }
+
+ Py_DECREF(ls);
+ return 0;
+}
+
+static PyObject *osdmap_inc_set_osd_reweights(BasePyOSDMapIncremental *self,
+ PyObject *weightobj)
+{
+ map<int,double> wm;
+ if (get_int_float_map(weightobj, &wm) < 0) {
+ return nullptr;
+ }
+
+ for (auto i : wm) {
+ self->inc->new_weight[i.first] = std::max(0.0, std::min(1.0, i.second)) * 0x10000;
+ }
+ Py_RETURN_NONE;
+}
+
+static PyObject *osdmap_inc_set_compat_weight_set_weights(
+ BasePyOSDMapIncremental *self, PyObject *weightobj)
+{
+ map<int,double> wm;
+ if (get_int_float_map(weightobj, &wm) < 0) {
+ return nullptr;
+ }
+
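+  // The pending incremental always carries a freshly encoded CRUSH map
+  // (see osdmap_new_incremental), so decode it, adjust the compat
+  // weight-set through choose_args, and re-encode it into the incremental.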
+ CrushWrapper crush;
+ assert(self->inc->crush.length()); // see new_incremental
+ auto p = self->inc->crush.begin();
+ ::decode(crush, p);
+ crush.create_choose_args(CrushWrapper::DEFAULT_CHOOSE_ARGS, 1);
+ for (auto i : wm) {
+ crush.choose_args_adjust_item_weightf(
+ g_ceph_context,
+ crush.choose_args_get(CrushWrapper::DEFAULT_CHOOSE_ARGS),
+ i.first,
+ { i.second },
+ nullptr);
+ }
+ self->inc->crush.clear();
+ crush.encode(self->inc->crush, CEPH_FEATURES_ALL);
+ Py_RETURN_NONE;
+}
+
+PyMethodDef BasePyOSDMapIncremental_methods[] = {
+ {"_get_epoch", (PyCFunction)osdmap_inc_get_epoch, METH_NOARGS,
+ "Get OSDMap::Incremental epoch"},
+ {"_dump", (PyCFunction)osdmap_inc_dump, METH_NOARGS,
+ "Dump OSDMap::Incremental"},
+ {"_set_osd_reweights", (PyCFunction)osdmap_inc_set_osd_reweights,
+ METH_O, "Set osd reweight values"},
+ {"_set_crush_compat_weight_set_weights",
+ (PyCFunction)osdmap_inc_set_compat_weight_set_weights, METH_O,
+ "Set weight values in the pending CRUSH compat weight-set"},
+ {NULL, NULL, 0, NULL}
+};
+
+PyTypeObject BasePyOSDMapIncrementalType = {
+ PyVarObject_HEAD_INIT(NULL, 0)
+ "ceph_module.BasePyOSDMapIncremental", /* tp_name */
+ sizeof(BasePyOSDMapIncremental), /* tp_basicsize */
+ 0, /* tp_itemsize */
+ (destructor)BasePyOSDMapIncremental_dealloc, /* tp_dealloc */
+ 0, /* tp_print */
+ 0, /* tp_getattr */
+ 0, /* tp_setattr */
+ 0, /* tp_compare */
+ 0, /* tp_repr */
+ 0, /* tp_as_number */
+ 0, /* tp_as_sequence */
+ 0, /* tp_as_mapping */
+ 0, /* tp_hash */
+ 0, /* tp_call */
+ 0, /* tp_str */
+ 0, /* tp_getattro */
+ 0, /* tp_setattro */
+ 0, /* tp_as_buffer */
+ Py_TPFLAGS_DEFAULT | Py_TPFLAGS_BASETYPE, /* tp_flags */
+ "Ceph OSDMapIncremental", /* tp_doc */
+ 0, /* tp_traverse */
+ 0, /* tp_clear */
+ 0, /* tp_richcompare */
+ 0, /* tp_weaklistoffset */
+ 0, /* tp_iter */
+ 0, /* tp_iternext */
+ BasePyOSDMapIncremental_methods, /* tp_methods */
+ 0, /* tp_members */
+ 0, /* tp_getset */
+ 0, /* tp_base */
+ 0, /* tp_dict */
+ 0, /* tp_descr_get */
+ 0, /* tp_descr_set */
+ 0, /* tp_dictoffset */
+ (initproc)BasePyOSDMapIncremental_init, /* tp_init */
+ 0, /* tp_alloc */
+ 0, /* tp_new */
+};
+
+
+// ----------
+
+static int
+BasePyCRUSH_init(BasePyCRUSH *self,
+ PyObject *args, PyObject *kwds)
+{
+ PyObject *crush_capsule = nullptr;
+ static const char *kwlist[] = {"crush_capsule", NULL};
+
+ if (! PyArg_ParseTupleAndKeywords(args, kwds, "O",
+ const_cast<char**>(kwlist),
+ &crush_capsule)) {
+ assert(0);
+ return -1;
+ }
+ assert(PyObject_TypeCheck(crush_capsule, &PyCapsule_Type));
+
+ auto ptr_ref = (ceph::shared_ptr<CrushWrapper>*)(
+ PyCapsule_GetPointer(crush_capsule, nullptr));
+
+  // The capsule carries a pointer *to* a shared_ptr, which is awkward but
+  // enough to get it through the Python-level constructor. The assignment
+  // below is a genuine shared_ptr copy (bumping the refcount), after which
+  // the pointer to the shared_ptr itself can be discarded.
+ self->crush = *ptr_ref;
+ assert(self->crush);
+
+ return 0;
+}
+
+static void
+BasePyCRUSH_dealloc(BasePyCRUSH *self)
+{
+ self->crush.reset();
+ Py_TYPE(self)->tp_free(self);
+}
+
+static PyObject *crush_dump(BasePyCRUSH *self, PyObject *obj)
+{
+ PyFormatter f;
+ self->crush->dump(&f);
+ return f.get();
+}
+
+static PyObject *crush_get_item_name(BasePyCRUSH *self, PyObject *args)
+{
+ int item;
+ if (!PyArg_ParseTuple(args, "i:get_item_name", &item)) {
+ return nullptr;
+ }
+ if (!self->crush->item_exists(item)) {
+ Py_RETURN_NONE;
+ }
+ return PyString_FromString(self->crush->get_item_name(item));
+}
+
+static PyObject *crush_get_item_weight(BasePyCRUSH *self, PyObject *args)
+{
+ int item;
+ if (!PyArg_ParseTuple(args, "i:get_item_weight", &item)) {
+ return nullptr;
+ }
+ if (!self->crush->item_exists(item)) {
+ Py_RETURN_NONE;
+ }
+ return PyFloat_FromDouble(self->crush->get_item_weightf(item));
+}
+
+static PyObject *crush_find_takes(BasePyCRUSH *self, PyObject *obj)
+{
+ set<int> takes;
+ self->crush->find_takes(&takes);
+ PyFormatter f;
+ f.open_array_section("takes");
+ for (auto root : takes) {
+ f.dump_int("root", root);
+ }
+ f.close_section();
+ return f.get();
+}
+
+static PyObject *crush_get_take_weight_osd_map(BasePyCRUSH *self, PyObject *args)
+{
+ int root;
+ if (!PyArg_ParseTuple(args, "i:get_take_weight_osd_map",
+ &root)) {
+ return nullptr;
+ }
+ map<int,float> wmap;
+
+  if (!self->crush->item_exists(root)) {
+    PyErr_Format(PyExc_ValueError, "CRUSH item %d does not exist", root);
+    return nullptr;
+  }
+
+ self->crush->get_take_weight_osd_map(root, &wmap);
+ PyFormatter f;
+ f.open_object_section("weights");
+ for (auto& p : wmap) {
+ string n = stringify(p.first); // ick
+ f.dump_float(n.c_str(), p.second);
+ }
+ f.close_section();
+ return f.get();
+}
+
+PyMethodDef BasePyCRUSH_methods[] = {
+ {"_dump", (PyCFunction)crush_dump, METH_NOARGS, "Dump map"},
+ {"_get_item_name", (PyCFunction)crush_get_item_name, METH_VARARGS,
+ "Get item name"},
+ {"_get_item_weight", (PyCFunction)crush_get_item_weight, METH_VARARGS,
+ "Get item weight"},
+ {"_find_takes", (PyCFunction)crush_find_takes, METH_NOARGS,
+ "Find distinct TAKE roots"},
+ {"_get_take_weight_osd_map", (PyCFunction)crush_get_take_weight_osd_map,
+ METH_VARARGS, "Get OSD weight map for a given TAKE root node"},
+ {NULL, NULL, 0, NULL}
+};
+
+PyTypeObject BasePyCRUSHType = {
+ PyVarObject_HEAD_INIT(NULL, 0)
+ "ceph_module.BasePyCRUSH", /* tp_name */
+ sizeof(BasePyCRUSH), /* tp_basicsize */
+ 0, /* tp_itemsize */
+ (destructor)BasePyCRUSH_dealloc, /* tp_dealloc */
+ 0, /* tp_print */
+ 0, /* tp_getattr */
+ 0, /* tp_setattr */
+ 0, /* tp_compare */
+ 0, /* tp_repr */
+ 0, /* tp_as_number */
+ 0, /* tp_as_sequence */
+ 0, /* tp_as_mapping */
+ 0, /* tp_hash */
+ 0, /* tp_call */
+ 0, /* tp_str */
+ 0, /* tp_getattro */
+ 0, /* tp_setattro */
+ 0, /* tp_as_buffer */
+ Py_TPFLAGS_DEFAULT | Py_TPFLAGS_BASETYPE, /* tp_flags */
+ "Ceph OSDMapIncremental", /* tp_doc */
+ 0, /* tp_traverse */
+ 0, /* tp_clear */
+ 0, /* tp_richcompare */
+ 0, /* tp_weaklistoffset */
+ 0, /* tp_iter */
+ 0, /* tp_iternext */
+ BasePyCRUSH_methods, /* tp_methods */
+ 0, /* tp_members */
+ 0, /* tp_getset */
+ 0, /* tp_base */
+ 0, /* tp_dict */
+ 0, /* tp_descr_get */
+ 0, /* tp_descr_set */
+ 0, /* tp_dictoffset */
+ (initproc)BasePyCRUSH_init, /* tp_init */
+ 0, /* tp_alloc */
+ 0, /* tp_new */
+};
--- /dev/null
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#pragma once
+
+#include <string>
+
+#include "Python.h"
+
+
+extern PyTypeObject BasePyOSDMapType;
+extern PyTypeObject BasePyOSDMapIncrementalType;
+extern PyTypeObject BasePyCRUSHType;
+
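+// Instantiate the named Python class (looked up in the given module) with
+// a single PyCapsule argument wrapping `wrapped`; the class's __init__ is
+// expected to unpack the capsule. The definition lives elsewhere in the
+// mgr module machinery.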
+PyObject *construct_with_capsule(
+ const std::string &module,
+ const std::string &clsname,
+ void *wrapped);
+
+++ /dev/null
-// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
-// vim: ts=8 sw=2 smarttab
-/*
- * Ceph - scalable distributed file system
- *
- * Copyright (C) 2016 John Spray <john.spray@redhat.com>
- *
- * This is free software; you can redistribute it and/or
- * modify it under the terms of the GNU Lesser General Public
- * License version 2.1, as published by the Free Software
- * Foundation. See file COPYING.
- */
-
-/**
- * The interface we present to python code that runs within
- * ceph-mgr.
- */
-
-#include "Mgr.h"
-
-#include "mon/MonClient.h"
-#include "common/errno.h"
-#include "common/version.h"
-
-#include "PyState.h"
-#include "Gil.h"
-
-#define dout_context g_ceph_context
-#define dout_subsys ceph_subsys_mgr
-
-PyModules *global_handle = NULL;
-
-
-class MonCommandCompletion : public Context
-{
- PyObject *python_completion;
- const std::string tag;
- PyThreadState *pThreadState;
-
-public:
- std::string outs;
- bufferlist outbl;
-
- MonCommandCompletion(PyObject* ev, const std::string &tag_, PyThreadState *ts_)
- : python_completion(ev), tag(tag_), pThreadState(ts_)
- {
- assert(python_completion != nullptr);
- Py_INCREF(python_completion);
- }
-
- ~MonCommandCompletion() override
- {
- Py_DECREF(python_completion);
- }
-
- void finish(int r) override
- {
- dout(10) << "MonCommandCompletion::finish()" << dendl;
- {
- // Scoped so the Gil is released before calling notify_all()
- // Create new thread state because this is called via the MonClient
- // Finisher, not the PyModules finisher.
- Gil gil(pThreadState, true);
-
- auto set_fn = PyObject_GetAttrString(python_completion, "complete");
- assert(set_fn != nullptr);
-
- auto pyR = PyInt_FromLong(r);
- auto pyOutBl = PyString_FromString(outbl.to_str().c_str());
- auto pyOutS = PyString_FromString(outs.c_str());
- auto args = PyTuple_Pack(3, pyR, pyOutBl, pyOutS);
- Py_DECREF(pyR);
- Py_DECREF(pyOutBl);
- Py_DECREF(pyOutS);
-
- auto rtn = PyObject_CallObject(set_fn, args);
- if (rtn != nullptr) {
- Py_DECREF(rtn);
- }
- Py_DECREF(args);
- }
- global_handle->notify_all("command", tag);
- }
-};
-
-
-static PyObject*
-ceph_send_command(PyObject *self, PyObject *args)
-{
- char *handle = nullptr;
-
- // Like mon, osd, mds
- char *type = nullptr;
-
- // Like "23" for an OSD or "myid" for an MDS
- char *name = nullptr;
-
- char *cmd_json = nullptr;
- char *tag = nullptr;
- PyObject *completion = nullptr;
- if (!PyArg_ParseTuple(args, "sOssss:ceph_send_command",
- &handle, &completion, &type, &name, &cmd_json, &tag)) {
- return nullptr;
- }
-
- auto set_fn = PyObject_GetAttrString(completion, "complete");
- if (set_fn == nullptr) {
- ceph_abort(); // TODO raise python exception instead
- } else {
- assert(PyCallable_Check(set_fn));
- }
- Py_DECREF(set_fn);
-
- auto c = new MonCommandCompletion(completion, tag, PyThreadState_Get());
- if (std::string(type) == "mon") {
- global_handle->get_monc().start_mon_command(
- {cmd_json},
- {},
- &c->outbl,
- &c->outs,
- c);
- } else if (std::string(type) == "osd") {
- std::string err;
- uint64_t osd_id = strict_strtoll(name, 10, &err);
- if (!err.empty()) {
- delete c;
- string msg("invalid osd_id: ");
- msg.append("\"").append(name).append("\"");
- PyErr_SetString(PyExc_ValueError, msg.c_str());
- return nullptr;
- }
-
- ceph_tid_t tid;
- global_handle->get_objecter().osd_command(
- osd_id,
- {cmd_json},
- {},
- &tid,
- &c->outbl,
- &c->outs,
- c);
- } else if (std::string(type) == "mds") {
- int r = global_handle->get_client().mds_command(
- name,
- {cmd_json},
- {},
- &c->outbl,
- &c->outs,
- c);
- if (r != 0) {
- string msg("failed to send command to mds: ");
- msg.append(cpp_strerror(r));
- PyErr_SetString(PyExc_RuntimeError, msg.c_str());
- return nullptr;
- }
- } else if (std::string(type) == "pg") {
- pg_t pgid;
- if (!pgid.parse(name)) {
- delete c;
- string msg("invalid pgid: ");
- msg.append("\"").append(name).append("\"");
- PyErr_SetString(PyExc_ValueError, msg.c_str());
- return nullptr;
- }
-
- ceph_tid_t tid;
- global_handle->get_objecter().pg_command(
- pgid,
- {cmd_json},
- {},
- &tid,
- &c->outbl,
- &c->outs,
- c);
- return nullptr;
- } else {
- delete c;
- string msg("unknown service type: ");
- msg.append(type);
- PyErr_SetString(PyExc_ValueError, msg.c_str());
- return nullptr;
- }
-
- Py_RETURN_NONE;
-}
-
-static PyObject*
-ceph_set_health_checks(PyObject *self, PyObject *args)
-{
- char *handle = nullptr;
- PyObject *checks = NULL;
- if (!PyArg_ParseTuple(args, "sO:ceph_set_health_checks", &handle, &checks)) {
- return NULL;
- }
- if (!PyDict_Check(checks)) {
- derr << __func__ << " arg not a dict" << dendl;
- Py_RETURN_NONE;
- }
- PyObject *checksls = PyDict_Items(checks);
- health_check_map_t out_checks;
- for (int i = 0; i < PyList_Size(checksls); ++i) {
- PyObject *kv = PyList_GET_ITEM(checksls, i);
- char *check_name = nullptr;
- PyObject *check_info = nullptr;
- if (!PyArg_ParseTuple(kv, "sO:pair", &check_name, &check_info)) {
- derr << __func__ << " dict item " << i
- << " not a size 2 tuple" << dendl;
- continue;
- }
- if (!PyDict_Check(check_info)) {
- derr << __func__ << " item " << i << " " << check_name
- << " value not a dict" << dendl;
- continue;
- }
- health_status_t severity = HEALTH_OK;
- string summary;
- list<string> detail;
- PyObject *infols = PyDict_Items(check_info);
- for (int j = 0; j < PyList_Size(infols); ++j) {
- PyObject *pair = PyList_GET_ITEM(infols, j);
- if (!PyTuple_Check(pair)) {
- derr << __func__ << " item " << i << " pair " << j
- << " not a tuple" << dendl;
- continue;
- }
- char *k = nullptr;
- PyObject *v = nullptr;
- if (!PyArg_ParseTuple(pair, "sO:pair", &k, &v)) {
- derr << __func__ << " item " << i << " pair " << j
- << " not a size 2 tuple" << dendl;
- continue;
- }
- string ks(k);
- if (ks == "severity") {
- if (!PyString_Check(v)) {
- derr << __func__ << " check " << check_name
- << " severity value not string" << dendl;
- continue;
- }
- string vs(PyString_AsString(v));
- if (vs == "warning") {
- severity = HEALTH_WARN;
- } else if (vs == "error") {
- severity = HEALTH_ERR;
- }
- } else if (ks == "summary") {
- if (!PyString_Check(v)) {
- derr << __func__ << " check " << check_name
- << " summary value not string" << dendl;
- continue;
- }
- summary = PyString_AsString(v);
- } else if (ks == "detail") {
- if (!PyList_Check(v)) {
- derr << __func__ << " check " << check_name
- << " detail value not list" << dendl;
- continue;
- }
- for (int k = 0; k < PyList_Size(v); ++k) {
- PyObject *di = PyList_GET_ITEM(v, k);
- if (!PyString_Check(di)) {
- derr << __func__ << " check " << check_name
- << " detail item " << k << " not a string" << dendl;
- continue;
- }
- detail.push_back(PyString_AsString(di));
- }
- } else {
- derr << __func__ << " check " << check_name
- << " unexpected key " << k << dendl;
- }
- }
- auto& d = out_checks.add(check_name, severity, summary);
- d.detail.swap(detail);
- }
-
- JSONFormatter jf(true);
- dout(10) << "module " << handle << " health checks:\n";
- out_checks.dump(&jf);
- jf.flush(*_dout);
- *_dout << dendl;
-
- global_handle->set_health_checks(handle, std::move(out_checks));
-
- Py_RETURN_NONE;
-}
-
-
-static PyObject*
-ceph_state_get(PyObject *self, PyObject *args)
-{
- char *handle = nullptr;
- char *what = NULL;
- if (!PyArg_ParseTuple(args, "ss:ceph_state_get", &handle, &what)) {
- return NULL;
- }
-
- return global_handle->get_python(what);
-}
-
-
-static PyObject*
-ceph_get_server(PyObject *self, PyObject *args)
-{
- char *handle = nullptr;
- char *hostname = NULL;
- if (!PyArg_ParseTuple(args, "sz:ceph_get_server", &handle, &hostname)) {
- return NULL;
- }
-
- if (hostname) {
- return global_handle->get_server_python(hostname);
- } else {
- return global_handle->list_servers_python();
- }
-}
-
-static PyObject*
-ceph_get_mgr_id(PyObject *self, PyObject *args)
-{
- return PyString_FromString(g_conf->name.get_id().c_str());
-}
-
-static PyObject*
-ceph_config_get(PyObject *self, PyObject *args)
-{
- char *handle = nullptr;
- char *what = nullptr;
- if (!PyArg_ParseTuple(args, "ss:ceph_config_get", &handle, &what)) {
- derr << "Invalid args!" << dendl;
- return nullptr;
- }
-
- std::string value;
- bool found = global_handle->get_config(handle, what, &value);
- if (found) {
- dout(10) << "ceph_config_get " << what << " found: " << value.c_str() << dendl;
- return PyString_FromString(value.c_str());
- } else {
- dout(4) << "ceph_config_get " << what << " not found " << dendl;
- Py_RETURN_NONE;
- }
-}
-
-static PyObject*
-ceph_config_get_prefix(PyObject *self, PyObject *args)
-{
- char *handle = nullptr;
- char *prefix = nullptr;
- if (!PyArg_ParseTuple(args, "ss:ceph_config_get", &handle, &prefix)) {
- derr << "Invalid args!" << dendl;
- return nullptr;
- }
-
- return global_handle->get_config_prefix(handle, prefix);
-}
-
-static PyObject*
-ceph_config_set(PyObject *self, PyObject *args)
-{
- char *handle = nullptr;
- char *key = nullptr;
- char *value = nullptr;
- if (!PyArg_ParseTuple(args, "ssz:ceph_config_set", &handle, &key, &value)) {
- return nullptr;
- }
- boost::optional<string> val;
- if (value) {
- val = value;
- }
- global_handle->set_config(handle, key, val);
-
- Py_RETURN_NONE;
-}
-
-static PyObject*
-get_metadata(PyObject *self, PyObject *args)
-{
- char *handle = nullptr;
- char *svc_name = NULL;
- char *svc_id = NULL;
- if (!PyArg_ParseTuple(args, "sss:get_metadata", &handle, &svc_name, &svc_id)) {
- return nullptr;
- }
- return global_handle->get_metadata_python(handle, svc_name, svc_id);
-}
-
-static PyObject*
-get_daemon_status(PyObject *self, PyObject *args)
-{
- char *handle = nullptr;
- char *svc_name = NULL;
- char *svc_id = NULL;
- if (!PyArg_ParseTuple(args, "sss:get_daemon_status", &handle, &svc_name,
- &svc_id)) {
- return nullptr;
- }
- return global_handle->get_daemon_status_python(handle, svc_name, svc_id);
-}
-
-static PyObject*
-ceph_log(PyObject *self, PyObject *args)
-{
- int level = 0;
- char *record = nullptr;
- char *handle = nullptr;
- if (!PyArg_ParseTuple(args, "sis:log", &handle, &level, &record)) {
- return nullptr;
- }
-
- global_handle->log(handle, level, record);
-
- Py_RETURN_NONE;
-}
-
-static PyObject *
-ceph_get_version(PyObject *self, PyObject *args)
-{
- return PyString_FromString(pretty_version_to_str().c_str());
-}
-
-static PyObject *
-ceph_get_context(PyObject *self, PyObject *args)
-{
- return global_handle->get_context();
-}
-
-static PyObject*
-get_counter(PyObject *self, PyObject *args)
-{
- char *handle = nullptr;
- char *svc_name = nullptr;
- char *svc_id = nullptr;
- char *counter_path = nullptr;
- if (!PyArg_ParseTuple(args, "ssss:get_counter", &handle, &svc_name,
- &svc_id, &counter_path)) {
- return nullptr;
- }
- return global_handle->get_counter_python(
- handle, svc_name, svc_id, counter_path);
-}
-
-static PyObject*
-get_perf_schema(PyObject *self, PyObject *args)
-{
- char *handle = nullptr;
- char *type_str = nullptr;
- char *svc_id = nullptr;
- if (!PyArg_ParseTuple(args, "sss:get_perf_schema", &handle, &type_str,
- &svc_id)) {
- return nullptr;
- }
-
- return global_handle->get_perf_schema_python(handle, type_str, svc_id);
-}
-
-PyMethodDef CephStateMethods[] = {
- {"get", ceph_state_get, METH_VARARGS,
- "Get a cluster object"},
- {"get_server", ceph_get_server, METH_VARARGS,
- "Get a server object"},
- {"get_metadata", get_metadata, METH_VARARGS,
- "Get a service's metadata"},
- {"get_daemon_status", get_daemon_status, METH_VARARGS,
- "Get a service's status"},
- {"send_command", ceph_send_command, METH_VARARGS,
- "Send a mon command"},
- {"set_health_checks", ceph_set_health_checks, METH_VARARGS,
- "Set health checks for this module"},
- {"get_mgr_id", ceph_get_mgr_id, METH_NOARGS,
- "Get the mgr id"},
- {"get_config", ceph_config_get, METH_VARARGS,
- "Get a configuration value"},
- {"get_config_prefix", ceph_config_get_prefix, METH_VARARGS,
- "Get all configuration values with a given prefix"},
- {"set_config", ceph_config_set, METH_VARARGS,
- "Set a configuration value"},
- {"get_counter", get_counter, METH_VARARGS,
- "Get a performance counter"},
- {"get_perf_schema", get_perf_schema, METH_VARARGS,
- "Get the performance counter schema"},
- {"log", ceph_log, METH_VARARGS,
- "Emit a (local) log message"},
- {"get_version", ceph_get_version, METH_VARARGS,
- "Get the ceph version of this process"},
- {"get_context", ceph_get_context, METH_NOARGS,
- "Get a CephContext* in a python capsule"},
- {NULL, NULL, 0, NULL}
-};
-
+++ /dev/null
-#ifndef PYSTATE_H_
-#define PYSTATE_H_
-
-#include "Python.h"
-
-class PyModules;
-
-extern PyModules *global_handle;
-extern PyMethodDef CephStateMethods[];
-
-#endif
-
--- /dev/null
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2016 John Spray <john.spray@redhat.com>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ */
+
+#include "StandbyPyModules.h"
+
+#include "common/debug.h"
+
+#include "mgr/MgrContext.h"
+#include "mgr/Gil.h"
+
+
+#include <boost/python.hpp>
+#include "include/assert.h" // boost clobbers this
+
+// For ::config_prefix
+#include "PyModuleRegistry.h"
+
+#define dout_context g_ceph_context
+#define dout_subsys ceph_subsys_mgr
+#undef dout_prefix
+#define dout_prefix *_dout << "mgr " << __func__ << " "
+
+// Declaration fulfilled by ActivePyModules
+std::string handle_pyerror();
+
+
+StandbyPyModules::StandbyPyModules(MonClient *monc_, const MgrMap &mgr_map_)
+ : monc(monc_), load_config_thread(monc, &state)
+{
+ state.set_mgr_map(mgr_map_);
+}
+
+// FIXME: completely identical to ActivePyModules
+void StandbyPyModules::shutdown()
+{
+ Mutex::Locker locker(lock);
+
+ if (!state.is_config_loaded && load_config_thread.is_started()) {
+ // FIXME: handle cases where initial load races with shutdown
+    // this is actually not super rare because the initial config load
+    // blocks on the mons, so a shutdown shortly after startup (or while
+    // the mons are unresponsive) can arrive before the load completes
+ assert(0);
+ //load_config_thread.kill(SIGKILL);
+ }
+
+ // Signal modules to drop out of serve() and/or tear down resources
+ for (auto &i : modules) {
+ auto module = i.second.get();
+ const auto& name = i.first;
+ dout(10) << "waiting for module " << name << " to shutdown" << dendl;
+ lock.Unlock();
+ module->shutdown();
+ lock.Lock();
+ dout(10) << "module " << name << " shutdown" << dendl;
+ }
+
+ // For modules implementing serve(), finish the threads where we
+ // were running that.
+ for (auto &i : modules) {
+ lock.Unlock();
+ dout(10) << "joining thread for module " << i.first << dendl;
+ i.second->thread.join();
+ dout(10) << "joined thread for module " << i.first << dendl;
+ lock.Lock();
+ }
+
+ modules.clear();
+}
+
+int StandbyPyModules::start_one(std::string const &module_name,
+ PyObject *pClass, const SafeThreadState &pMyThreadState)
+{
+ Mutex::Locker l(lock);
+
+ assert(modules.count(module_name) == 0);
+
+ modules[module_name].reset(new StandbyPyModule(
+ state,
+ module_name, pClass,
+ pMyThreadState));
+
+ if (modules.size() == 1) {
+ load_config_thread.create("LoadConfig");
+ }
+
+ int r = modules[module_name]->load();
+ if (r != 0) {
+ modules.erase(module_name);
+ return r;
+ } else {
+ dout(4) << "Starting thread for " << module_name << dendl;
+ // Giving Thread the module's module_name member as its
+ // char* thread name: thread must not outlive module class lifetime.
+ modules[module_name]->thread.create(
+ modules[module_name]->get_name().c_str());
+ return 0;
+ }
+}
+
+int StandbyPyModule::load()
+{
+ Gil gil(pMyThreadState, true);
+
+ // We tell the module how we name it, so that it can be consistent
+ // with us in logging etc.
+ auto pThisPtr = PyCapsule_New(this, nullptr, nullptr);
+ assert(pThisPtr != nullptr);
+ auto pModuleName = PyString_FromString(module_name.c_str());
+ assert(pModuleName != nullptr);
+ auto pArgs = PyTuple_Pack(2, pModuleName, pThisPtr);
+ Py_DECREF(pThisPtr);
+ Py_DECREF(pModuleName);
+
+ pClassInstance = PyObject_CallObject(pClass, pArgs);
+ Py_DECREF(pArgs);
+ if (pClassInstance == nullptr) {
+ derr << "Failed to construct class in '" << module_name << "'" << dendl;
+ derr << handle_pyerror() << dendl;
+ return -EINVAL;
+ } else {
+ dout(1) << "Constructed class from module: " << module_name << dendl;
+ return 0;
+ }
+}
+
+void *StandbyPyModules::LoadConfigThread::entry()
+{
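+  // A standby mgr has no ActivePyModules to consult, so pull module
+  // configuration straight from the mon config-key store, keeping only
+  // keys under the module config prefix.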
+ dout(10) << "listing keys" << dendl;
+ JSONCommand cmd;
+ cmd.run(monc, "{\"prefix\": \"config-key ls\"}");
+ cmd.wait();
+ assert(cmd.r == 0);
+
+ std::map<std::string, std::string> loaded;
+
+ for (auto &key_str : cmd.json_result.get_array()) {
+ std::string const key = key_str.get_str();
+ dout(20) << "saw key '" << key << "'" << dendl;
+
+ const std::string config_prefix = PyModuleRegistry::config_prefix;
+
+ if (key.substr(0, config_prefix.size()) == config_prefix) {
+ dout(20) << "fetching '" << key << "'" << dendl;
+ Command get_cmd;
+ std::ostringstream cmd_json;
+ cmd_json << "{\"prefix\": \"config-key get\", \"key\": \"" << key << "\"}";
+ get_cmd.run(monc, cmd_json.str());
+ get_cmd.wait();
+ assert(get_cmd.r == 0);
+ loaded[key] = get_cmd.outbl.to_str();
+ }
+ }
+ state->loaded_config(loaded);
+
+ return nullptr;
+}
+
+bool StandbyPyModule::get_config(const std::string &key,
+ std::string *value) const
+{
+  const std::string global_key = PyModuleRegistry::config_prefix
+    + module_name + "/" + key;
+
+  dout(4) << __func__ << " key: " << global_key << dendl;
+
+  // Release the GIL while in with_config(), which can block waiting for
+  // the initial config load to complete.
+  PyThreadState *tstate = PyEval_SaveThread();
+  bool found = state.with_config([global_key, value](const PyModuleConfig &config){
+    if (config.count(global_key)) {
+      *value = config.at(global_key);
+      return true;
+    } else {
+      return false;
+    }
+  });
+  PyEval_RestoreThread(tstate);
+  return found;
+}
+
+std::string StandbyPyModule::get_active_uri() const
+{
+ std::string result;
+ state.with_mgr_map([&result, this](const MgrMap &mgr_map){
+ auto iter = mgr_map.services.find(module_name);
+ if (iter != mgr_map.services.end()) {
+ result = iter->second;
+ }
+ });
+
+ return result;
+}
+
--- /dev/null
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2016 John Spray <john.spray@redhat.com>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ */
+
+#pragma once
+
+#include "Python.h"
+
+#include <string>
+#include <map>
+
+#include "common/Thread.h"
+#include "common/Mutex.h"
+
+#include "mgr/Gil.h"
+#include "mon/MonClient.h"
+#include "mon/MgrMap.h"
+#include "mgr/PyModuleRunner.h"
+
+typedef std::map<std::string, std::string> PyModuleConfig;
+
+/**
+ * State that is read by all modules running in standby mode
+ */
+class StandbyPyModuleState
+{
+ mutable Mutex lock{"StandbyPyModuleState::lock"};
+
+ MgrMap mgr_map;
+ PyModuleConfig config_cache;
+
+ mutable Cond config_loaded;
+
+public:
+
+ bool is_config_loaded = false;
+
+ void set_mgr_map(const MgrMap &mgr_map_)
+ {
+ Mutex::Locker l(lock);
+
+ mgr_map = mgr_map_;
+ }
+
+ void loaded_config(const PyModuleConfig &config_)
+ {
+ Mutex::Locker l(lock);
+
+ config_cache = config_;
+ is_config_loaded = true;
+ config_loaded.Signal();
+ }
+
+ template<typename Callback, typename...Args>
+ void with_mgr_map(Callback&& cb, Args&&...args) const
+ {
+ Mutex::Locker l(lock);
+ std::forward<Callback>(cb)(mgr_map, std::forward<Args>(args)...);
+ }
+
+ template<typename Callback, typename...Args>
+ auto with_config(Callback&& cb, Args&&... args) const ->
+ decltype(cb(config_cache, std::forward<Args>(args)...)) {
+ Mutex::Locker l(lock);
+
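+    // May block until LoadConfigThread signals config_loaded; callers on
+    // Python threads should drop the GIL first (see
+    // StandbyPyModule::get_config).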
+ if (!is_config_loaded) {
+ config_loaded.Wait(lock);
+ }
+
+ return std::forward<Callback>(cb)(config_cache, std::forward<Args>(args)...);
+ }
+};
+
+
+class StandbyPyModule : public PyModuleRunner
+{
+ StandbyPyModuleState &state;
+
+ public:
+
+ StandbyPyModule(
+ StandbyPyModuleState &state_,
+ const std::string &module_name_,
+ PyObject *pClass_,
+ const SafeThreadState &pMyThreadState_)
+ :
+ PyModuleRunner(module_name_, pClass_, pMyThreadState_),
+ state(state_)
+ {
+ }
+
+ bool get_config(const std::string &key, std::string *value) const;
+ std::string get_active_uri() const;
+
+ int load();
+};
+
+class StandbyPyModules
+{
+private:
+ mutable Mutex lock{"StandbyPyModules::lock"};
+ std::map<std::string, std::unique_ptr<StandbyPyModule>> modules;
+
+ MonClient *monc;
+
+ StandbyPyModuleState state;
+
+ void load_config();
+ class LoadConfigThread : public Thread
+ {
+ protected:
+ MonClient *monc;
+ StandbyPyModuleState *state;
+ public:
+ LoadConfigThread(MonClient *monc_, StandbyPyModuleState *state_)
+ : monc(monc_), state(state_)
+ {}
+ void *entry() override;
+ };
+
+ LoadConfigThread load_config_thread;
+
+public:
+
+ StandbyPyModules(
+ MonClient *monc_,
+ const MgrMap &mgr_map_);
+
+ int start_one(std::string const &module_name,
+ PyObject *pClass,
+ const SafeThreadState &pMyThreadState);
+
+ void shutdown();
+
+ void handle_mgr_map(const MgrMap &mgr_map)
+ {
+ state.set_mgr_map(mgr_map);
+ }
+
+};
}
}
- // add bootstrap key
- {
+ // add bootstrap key if it does not already exist
+ // (might have already been get-or-create'd by
+ // ceph-create-keys)
+ EntityName bootstrap_mgr_name;
+ int r = bootstrap_mgr_name.from_str("client.bootstrap-mgr");
+ assert(r);
+ if (!mon->key_server.contains(bootstrap_mgr_name)) {
KeyServerData::Incremental auth_inc;
- bool r = auth_inc.name.from_str("client.bootstrap-mgr");
- assert(r);
+ auth_inc.name = bootstrap_mgr_name;
::encode("allow profile bootstrap-mgr", auth_inc.auth.caps["mon"]);
auth_inc.op = KeyServerData::AUTH_INC_ADD;
// generate key
* as far as we know, we may even be dead); so, just propose ourselves as the
* Leader.
*/
- expire_event = new C_MonContext(mon, [this](int) {
- expire();
- });
- mon->timer.add_event_after(g_conf->mon_election_timeout + plus,
- expire_event);
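+  // add_event_after() now returns the scheduled event (nullptr if the
+  // event could not be scheduled, e.g. timer shutdown), so capture its
+  // return value instead of registering a separately constructed Context.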
+ expire_event = mon->timer.add_event_after(
+ g_conf->mon_election_timeout + plus,
+ new C_MonContext(mon, [this](int) {
+ expire();
+ }));
}
};
auto rp = summary.tail.rbegin();
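+  // Scan backwards until we have seen `num` matching entries, then walk
+  // forward again from that point printing the matches in order.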
- while (num > 0 && rp != summary.tail.rend()) {
+ for (; num > 0 && rp != summary.tail.rend(); ++rp) {
if (match(*rp)) {
num--;
}
- ++rp;
+ }
+ if (rp == summary.tail.rend()) {
+ --rp;
}
ostringstream ss;
- auto p = summary.tail.begin();
- for ( ; p != summary.tail.end(); ++p) {
- if (!match(*p)) {
+ for (; rp != summary.tail.rbegin(); --rp) {
+ if (!match(*rp)) {
continue;
}
if (f) {
- f->dump_object("entry", *p);
+ f->dump_object("entry", *rp);
} else {
- ss << *p << "\n";
+ ss << *rp << "\n";
}
}
if (f) {
if (graylogs.count(channel) == 0) {
auto graylog(std::make_shared<ceph::logging::Graylog>("mon"));
- graylog->set_fsid(g_conf->fsid);
+ graylog->set_fsid(g_conf->get_val<uuid_d>("fsid"));
graylog->set_hostname(g_conf->host);
graylog->set_destination(get_str_map_key(log_to_graylog_host, channel,
&CLOG_CONFIG_DEFAULT_KEY),
<< ").mds e" << fsmap.get_epoch() << " ";
}
+static const string MDS_METADATA_PREFIX("mds_metadata");
+static const string MDS_HEALTH_PREFIX("mds_health");
+
+
/*
* Specialized implementation of cmd_getval to allow us to parse
* out strongly-typedef'd types
return cmd_getval(cct, cmdmap, k, (int64_t&)val);
}
-static const string MDS_METADATA_PREFIX("mds_metadata");
-
-
// my methods
void MDSMonitor::print_map(FSMap &m, int dbl)
dout(10) << "create_initial" << dendl;
}
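+// List the store prefixes this service writes, so that a syncing monitor
+// copies the mds metadata and health keys along with the service's own
+// paxos keys.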
+void MDSMonitor::get_store_prefixes(std::set<string>& s)
+{
+ s.insert(service_name);
+ s.insert(MDS_METADATA_PREFIX);
+ s.insert(MDS_HEALTH_PREFIX);
+}
void MDSMonitor::update_from_paxos(bool *need_bootstrap)
{
pending_fsmap = fsmap;
pending_fsmap.epoch++;
+ if (mon->osdmon()->is_readable()) {
+ auto &osdmap = mon->osdmon()->osdmap;
+ pending_fsmap.sanitize([&osdmap](int64_t pool){return osdmap.have_pg_pool(pool);});
+ }
+
dout(10) << "create_pending e" << pending_fsmap.epoch << dendl;
}
class MMDSMap;
class FileSystemCommandHandler;
-#define MDS_HEALTH_PREFIX "mds_health"
-
class MDSMonitor : public PaxosService {
public:
MDSMonitor(Monitor *mn, Paxos *p, string service_name);
// service methods
void create_initial() override;
+ void get_store_prefixes(std::set<string>& s) override;
void update_from_paxos(bool *need_bootstrap) override;
void init() override;
void create_pending() override;
std::set<std::string> modules;
std::set<std::string> available_modules;
+ // Map of module name to URI, indicating services exposed by
+ // running modules on the active mgr daemon.
+ std::map<std::string, std::string> services;
+
epoch_t get_epoch() const { return epoch; }
entity_addr_t get_active_addr() const { return active_addr; }
uint64_t get_active_gid() const { return active_gid; }
void encode(bufferlist& bl, uint64_t features) const
{
- ENCODE_START(2, 1, bl);
+ ENCODE_START(3, 1, bl);
::encode(epoch, bl);
::encode(active_addr, bl, features);
::encode(active_gid, bl);
::encode(standbys, bl);
::encode(modules, bl);
::encode(available_modules, bl);
+ ::encode(services, bl);
ENCODE_FINISH(bl);
}
::decode(modules, p);
::decode(available_modules, p);
}
+ if (struct_v >= 3) {
+ ::decode(services, p);
+ }
DECODE_FINISH(p);
}
f->dump_string("module", j);
}
f->close_section();
+
+ f->open_object_section("services");
+ for (const auto &i : services) {
+ f->dump_string(i.first.c_str(), i.second);
+ }
+ f->close_section();
}
static void generate_test_instances(list<MgrMap*> &l) {
void MgrMonitor::create_initial()
{
- boost::tokenizer<> tok(g_conf->mgr_initial_modules);
+ // Take a local copy of initial_modules for tokenizer to iterate over.
+ auto initial_modules = g_conf->get_val<std::string>("mgr_initial_modules");
+ boost::tokenizer<> tok(initial_modules);
for (auto& m : tok) {
pending_map.modules.insert(m);
}
<< dendl;
}
+void MgrMonitor::get_store_prefixes(std::set<string>& s)
+{
+ s.insert(service_name);
+ s.insert(command_descs_prefix);
+ s.insert(MGR_METADATA_PREFIX);
+}
+
void MgrMonitor::update_from_paxos(bool *need_bootstrap)
{
version_t version = get_last_committed();
check_subs();
if (version == 1
- || (map.get_available()
- && (!old_available || old_gid != map.get_active_gid()))) {
+ || command_descs.empty()
+ || (map.get_available()
+ && (!old_available || old_gid != map.get_active_gid()))) {
dout(4) << "mkfs or daemon transitioned to available, loading commands"
<< dendl;
bufferlist loaded_commands;
{
pending_map = map;
pending_map.epoch++;
+
+ if (map.get_epoch() == 1 &&
+ command_descs.empty() &&
+ pending_command_descs.empty()) {
+ // we've been through the initial map and we haven't populated the
+ // command_descs vector. This likely means we came from kraken, where
+ // we wouldn't populate the vector, nor would we write it to disk, on
+ // create_initial().
+ create_initial();
+ }
}
health_status_t MgrMonitor::should_warn_about_mgr_down()
// no OSDs are ever created.
if (ever_had_active_mgr ||
(mon->osdmon()->osdmap.get_num_osds() > 0 &&
- now > mon->monmap->created + g_conf->mon_mgr_mkfs_grace)) {
+ now > mon->monmap->created + g_conf->get_val<int64_t>("mon_mgr_mkfs_grace"))) {
health_status_t level = HEALTH_WARN;
if (first_seen_inactive != utime_t() &&
- now - first_seen_inactive > g_conf->mon_mgr_inactive_grace) {
+ now - first_seen_inactive > g_conf->get_val<int64_t>("mon_mgr_inactive_grace")) {
level = HEALTH_ERR;
}
return level;
bool updated = false;
if (pending_map.active_gid == m->get_gid()) {
+ if (pending_map.services != m->get_services()) {
+ dout(4) << "updated services from mgr." << m->get_name()
+ << ": " << m->get_services() << dendl;
+ pending_map.services = m->get_services();
+ updated = true;
+ }
+
// A beacon from the currently active daemon
if (pending_map.active_addr != m->get_server_addr()) {
dout(4) << "learned address " << m->get_server_addr()
sub->session->con->send_message(mdigest);
}
- digest_event = new C_MonContext(mon, [this](int){
+ digest_event = mon->timer.add_event_after(
+ g_conf->get_val<int64_t>("mon_mgr_digest_period"),
+ new C_MonContext(mon, [this](int) {
send_digests();
- });
- mon->timer.add_event_after(g_conf->mon_mgr_digest_period, digest_event);
+ }));
}
void MgrMonitor::cancel_timer()
if (mon->osdmon()->osdmap.require_osd_release >= CEPH_RELEASE_LUMINOUS) {
utime_t now = ceph_clock_now();
if (first_seen_inactive != utime_t() &&
- now - first_seen_inactive > g_conf->mon_mgr_inactive_grace) {
+ now - first_seen_inactive > g_conf->get_val<int64_t>("mon_mgr_inactive_grace")) {
level = HEALTH_ERR;
}
}
return;
const auto now = ceph::coarse_mono_clock::now();
- const auto cutoff = now - std::chrono::seconds(g_conf->mon_mgr_beacon_grace);
+
+ const auto mgr_beacon_grace = std::chrono::seconds(
+ g_conf->get_val<int64_t>("mon_mgr_beacon_grace"));
+
+ // Note that this is the mgr daemon's tick period, not ours (the
+ // beacon is sent with this period).
+ const auto mgr_tick_period = std::chrono::seconds(
+ g_conf->get_val<int64_t>("mgr_tick_period"));
+
+ if (last_tick != ceph::coarse_mono_clock::time_point::min()
+ && (now - last_tick > (mgr_beacon_grace - mgr_tick_period))) {
+ // This case handles either local slowness (calls being delayed
+ // for whatever reason) or cluster election slowness (a long gap
+ // between calls while an election happened)
+ dout(4) << __func__ << ": resetting beacon timeouts due to mon delay "
+ "(slow election?) of " << now - last_tick << " seconds" << dendl;
+ for (auto &i : last_beacon) {
+ i.second = now;
+ }
+ }
+
+ last_tick = now;
// Populate any missing beacons (i.e. no beacon since MgrMonitor
// instantiation) with the current time, so that they will
// Cull standbys first so that any remaining standbys
// will be eligible to take over from the active if we cull him.
std::list<uint64_t> dead_standbys;
+ const auto cutoff = now - mgr_beacon_grace;
for (const auto &i : pending_map.standbys) {
auto last_beacon_time = last_beacon.at(i.first);
if (last_beacon_time < cutoff) {
if (promote_standby()) {
dout(4) << "Promoted standby " << pending_map.active_gid << dendl;
mon->clog->info() << "Activating manager daemon "
- << pending_map.active_name;
+ << pending_map.active_name;
propose = true;
}
}
if (!pending_map.available &&
!ever_had_active_mgr &&
should_warn_about_mgr_down() != HEALTH_OK) {
- dout(10) << " exceeded mon_mgr_mkfs_grace " << g_conf->mon_mgr_mkfs_grace
- << " seconds" << dendl;
+ dout(10) << " exceeded mon_mgr_mkfs_grace "
+ << g_conf->get_val<int64_t>("mon_mgr_mkfs_grace")
+ << " seconds" << dendl;
propose = true;
}
{
// Clear out the leader-specific state.
last_beacon.clear();
+ last_tick = ceph::coarse_mono_clock::now();
}
pending_map.active_gid = 0;
pending_map.available = false;
pending_map.active_addr = entity_addr_t();
+ pending_map.services.clear();
// So that when new active mgr subscribes to mgrdigest, it will
// get an immediate response instead of waiting for next timer
}
f->flush(rdata);
} else if (prefix == "mgr module ls") {
- f->open_array_section("modules");
- for (auto& p : map.modules) {
- f->dump_string("module", p);
+ f->open_object_section("modules");
+ {
+ f->open_array_section("enabled_modules");
+ for (auto& p : map.modules) {
+ f->dump_string("module", p);
+ }
+ f->close_section();
+ f->open_array_section("disabled_modules");
+ for (auto& p : map.available_modules) {
+ if (map.modules.count(p) == 0) {
+ f->dump_string("module", p);
+ }
+ }
+ f->close_section();
+ }
+ f->close_section();
+ f->flush(rdata);
+ } else if (prefix == "mgr services") {
+ f->open_object_section("services");
+ for (const auto &i : map.services) {
+ f->dump_string(i.first.c_str(), i.second);
}
f->close_section();
f->flush(rdata);
bool in_use() const { return map.epoch > 0; }
void create_initial() override;
+ void get_store_prefixes(std::set<string>& s) override;
void update_from_paxos(bool *need_bootstrap) override;
void create_pending() override;
void encode_pending(MonitorDBStore::TransactionRef t) override;
void count_metadata(const string& field, std::map<string,int> *out);
friend class C_Updated;
+
+ // When did the mon last call into our tick() method? Used for detecting
+ // when the mon was not updating us for some period (e.g. during slow
+ // election) to reset last_beacon timeouts
+ ceph::coarse_mono_clock::time_point last_tick;
};
#endif
"name=args,type=CephString,n=N,goodchars=[A-Za-z0-9-_.=]", \
"add or update crushmap position and weight for <name> with <weight> and location <args>", \
"osd", "rw", "cli,rest")
+COMMAND("osd crush set-all-straw-buckets-to-straw2",
+ "convert all CRUSH current straw buckets to use the straw2 algorithm",
+ "osd", "rw", "cli,rest")
COMMAND("osd crush set-device-class " \
"name=class,type=CephString " \
"name=ids,type=CephString,n=N", \
"list all erasure code profiles", \
"osd", "r", "cli,rest")
COMMAND("osd set " \
- "name=key,type=CephChoices,strings=full|pause|noup|nodown|noout|noin|nobackfill|norebalance|norecover|noscrub|nodeep-scrub|notieragent|sortbitwise|recovery_deletes|require_jewel_osds|require_kraken_osds", \
+ "name=key,type=CephChoices,strings=full|pause|noup|nodown|noout|noin|nobackfill|norebalance|norecover|noscrub|nodeep-scrub|notieragent|sortbitwise|recovery_deletes|require_jewel_osds|require_kraken_osds " \
+ "name=sure,type=CephChoices,strings=--yes-i-really-mean-it,req=false", \
"set <key>", "osd", "rw", "cli,rest")
COMMAND("osd unset " \
"name=key,type=CephChoices,strings=full|pause|noup|nodown|noout|noin|nobackfill|norebalance|norecover|noscrub|nodeep-scrub|notieragent", \
"unset <key>", "osd", "rw", "cli,rest")
COMMAND("osd require-osd-release "\
- "name=release,type=CephChoices,strings=luminous",
+ "name=release,type=CephChoices,strings=luminous " \
+ "name=sure,type=CephChoices,strings=--yes-i-really-mean-it,req=false", \
"set the minimum allowed OSD release to participate in the cluster",
"osd", "rw", "cli,rest")
COMMAND("osd cluster_snap", "take cluster snapshot (disabled)", \
"treat the named manager daemon as failed", "mgr", "rw", "cli,rest")
COMMAND("mgr module ls",
"list active mgr modules", "mgr", "r", "cli,rest")
+COMMAND("mgr services",
+ "list service endpoints provided by mgr modules",
+ "mgr", "r", "cli,rest")
COMMAND("mgr module enable " \
"name=module,type=CephString " \
"name=force,type=CephChoices,strings=--force,req=false",
{
const md_config_t *conf = cct->_conf;
// file?
- if (!conf->monmap.empty()) {
+ const auto monmap = conf->get_val<std::string>("monmap");
+ if (!monmap.empty()) {
int r;
try {
- r = read(conf->monmap.c_str());
+ r = read(monmap.c_str());
}
catch (const buffer::error &e) {
r = -EINVAL;
}
if (r >= 0)
return 0;
- errout << "unable to read/decode monmap from " << conf->monmap
+ errout << "unable to read/decode monmap from " << monmap
<< ": " << cpp_strerror(-r) << std::endl;
return r;
}
// fsid from conf?
- if (!cct->_conf->fsid.is_zero()) {
- fsid = cct->_conf->fsid;
+ const auto new_fsid = conf->get_val<uuid_d>("fsid");
+ if (!new_fsid.is_zero()) {
+ fsid = new_fsid;
}
// -m foo?
- if (!conf->mon_host.empty()) {
- int r = build_from_host_list(conf->mon_host, "noname-");
+ const auto mon_host = conf->get_val<std::string>("mon_host");
+ if (!mon_host.empty()) {
+ int r = build_from_host_list(mon_host, "noname-");
if (r < 0) {
- errout << "unable to parse addrs in '" << conf->mon_host << "'"
+ errout << "unable to parse addrs in '" << mon_host << "'"
<< std::endl;
return r;
}
if (size() == 0) {
// no info found from conf options lets try use DNS SRV records
- string srv_name = conf->mon_dns_srv_name;
+ string srv_name = conf->get_val<std::string>("mon_dns_srv_name");
string domain;
// check if domain is also provided and extract it from srv_name
size_t idx = srv_name.find("_");
assert(!logger);
{
PerfCountersBuilder pcb(g_ceph_context, "mon", l_mon_first, l_mon_last);
- pcb.add_u64(l_mon_num_sessions, "num_sessions", "Open sessions", "sess");
- pcb.add_u64_counter(l_mon_session_add, "session_add", "Created sessions", "sadd");
- pcb.add_u64_counter(l_mon_session_rm, "session_rm", "Removed sessions", "srm");
- pcb.add_u64_counter(l_mon_session_trim, "session_trim", "Trimmed sessions");
- pcb.add_u64_counter(l_mon_num_elections, "num_elections", "Elections participated in");
- pcb.add_u64_counter(l_mon_election_call, "election_call", "Elections started");
- pcb.add_u64_counter(l_mon_election_win, "election_win", "Elections won");
- pcb.add_u64_counter(l_mon_election_lose, "election_lose", "Elections lost");
+ pcb.add_u64(l_mon_num_sessions, "num_sessions", "Open sessions", "sess",
+ PerfCountersBuilder::PRIO_USEFUL);
+ pcb.add_u64_counter(l_mon_session_add, "session_add", "Created sessions",
+ "sadd", PerfCountersBuilder::PRIO_INTERESTING);
+ pcb.add_u64_counter(l_mon_session_rm, "session_rm", "Removed sessions",
+ "srm", PerfCountersBuilder::PRIO_INTERESTING);
+ pcb.add_u64_counter(l_mon_session_trim, "session_trim", "Trimmed sessions",
+ "strm", PerfCountersBuilder::PRIO_USEFUL);
+ pcb.add_u64_counter(l_mon_num_elections, "num_elections", "Elections participated in",
+ "ecnt", PerfCountersBuilder::PRIO_USEFUL);
+ pcb.add_u64_counter(l_mon_election_call, "election_call", "Elections started",
+ "estt", PerfCountersBuilder::PRIO_INTERESTING);
+ pcb.add_u64_counter(l_mon_election_win, "election_win", "Elections won",
+ "ewon", PerfCountersBuilder::PRIO_INTERESTING);
+ pcb.add_u64_counter(l_mon_election_lose, "election_lose", "Elections lost",
+ "elst", PerfCountersBuilder::PRIO_INTERESTING);
logger = pcb.create_perf_counters();
cct->get_perfcounters_collection()->add(logger);
}
dout(10) << __func__ << dendl;
if (sync_timeout_event)
timer.cancel_event(sync_timeout_event);
- sync_timeout_event = new C_MonContext(this, [this](int) {
- sync_timeout();
- });
- timer.add_event_after(g_conf->mon_sync_timeout, sync_timeout_event);
+ sync_timeout_event = timer.add_event_after(
+ g_conf->mon_sync_timeout,
+ new C_MonContext(this, [this](int) {
+ sync_timeout();
+ }));
}
void Monitor::sync_finish(version_t last_committed)
probe_timeout(r);
});
double t = g_conf->mon_probe_timeout;
- timer.add_event_after(t, probe_timeout_event);
- dout(10) << "reset_probe_timeout " << probe_timeout_event << " after " << t << " seconds" << dendl;
+ if (timer.add_event_after(t, probe_timeout_event)) {
+ dout(10) << "reset_probe_timeout " << probe_timeout_event
+ << " after " << t << " seconds" << dendl;
+ } else {
+ probe_timeout_event = nullptr;
+ }
}
void Monitor::probe_timeout(int r)
dout(15) << __func__ << dendl;
health_tick_stop();
- health_tick_event = new C_MonContext(this, [this](int r) {
- if (r < 0)
- return;
- do_health_to_clog();
- health_tick_start();
- });
- timer.add_event_after(cct->_conf->mon_health_to_clog_tick_interval,
- health_tick_event);
+ health_tick_event = timer.add_event_after(
+ cct->_conf->mon_health_to_clog_tick_interval,
+ new C_MonContext(this, [this](int r) {
+ if (r < 0)
+ return;
+ do_health_to_clog();
+ health_tick_start();
+ }));
}
void Monitor::health_tick_stop()
return;
do_health_to_clog_interval();
});
- timer.add_event_at(next, health_interval_event);
+ if (!timer.add_event_at(next, health_interval_event)) {
+ health_interval_event = nullptr;
+ }
}
void Monitor::health_interval_stop()
*plain += "\n";
}
+ const std::string old_fields_message = "'ceph health' JSON format has "
+ "changed in luminous. If you see this your monitoring system is "
+ "scraping the wrong fields. Disable this with 'mon health preluminous "
+ "compat warning = false'";
+
if (f && (compat || compat_warn)) {
health_status_t cr = compat_warn ? min(HEALTH_WARN, r) : r;
+ f->open_array_section("summary");
+ if (compat_warn) {
+ f->open_object_section("item");
+ f->dump_stream("severity") << HEALTH_WARN;
+ f->dump_string("summary", old_fields_message);
+ f->close_section();
+ }
if (compat) {
- f->open_array_section("summary");
- if (compat_warn) {
- f->open_object_section("item");
- f->dump_stream("severity") << HEALTH_WARN;
- f->dump_string("summary", "'ceph health' JSON format has changed in luminous; update your health monitoring scripts");
- f->close_section();
- }
for (auto& svc : paxos_service) {
- svc->get_health_checks().dump_summary_compat(f);
+ svc->get_health_checks().dump_summary_compat(f);
}
- f->close_section();
}
+ f->close_section();
f->dump_stream("overall_status") << cr;
}
if (f && (compat || compat_warn)) {
f->open_array_section("detail");
if (compat_warn) {
- f->dump_string("item", "'ceph health' JSON format has changed in luminous. If you see this your monitoring system is scraping the wrong fields. Disable this with 'mon health preluminous compat warning = false'");
+ f->dump_string("item", old_fields_message);
}
}
osdmon()->osdmap.require_osd_release >= CEPH_RELEASE_LUMINOUS) {
const auto& hdr = m->get_header();
uint64_t size = hdr.front_len + hdr.middle_len + hdr.data_len;
- uint64_t max =
- g_conf->mon_client_bytes * g_conf->mon_mgr_proxy_client_bytes_ratio;
+ uint64_t max = g_conf->get_val<uint64_t>("mon_client_bytes")
+ * g_conf->get_val<double>("mon_mgr_proxy_client_bytes_ratio");
if (mgr_proxy_bytes + size > max) {
dout(10) << __func__ << " current mgr proxy bytes " << mgr_proxy_bytes
<< " + " << size << " > max " << max << dendl;
<< " rounds_since_clean " << timecheck_rounds_since_clean
<< dendl;
- timecheck_event = new C_MonContext(this, [this](int) {
- timecheck_start_round();
- });
- timer.add_event_after(delay, timecheck_event);
+ timecheck_event = timer.add_event_after(
+ delay,
+ new C_MonContext(this, [this](int) {
+ timecheck_start_round();
+ }));
}
void Monitor::timecheck_check_skews()
return;
}
- scrub_event = new C_MonContext(this, [this](int) {
+ scrub_event = timer.add_event_after(
+ cct->_conf->mon_scrub_interval,
+ new C_MonContext(this, [this](int) {
scrub_start();
- });
- timer.add_event_after(cct->_conf->mon_scrub_interval, scrub_event);
+ }));
}
void Monitor::scrub_event_cancel()
{
dout(15) << __func__ << " reset timeout event" << dendl;
scrub_cancel_timeout();
-
- scrub_timeout_event = new C_MonContext(this, [this](int) {
+ scrub_timeout_event = timer.add_event_after(
+ g_conf->mon_scrub_timeout,
+ new C_MonContext(this, [this](int) {
scrub_timeout();
- });
- timer.add_event_after(g_conf->mon_scrub_timeout, scrub_timeout_event);
+ }));
}
/************ TICK ***************/
db->init(g_conf->mon_rocksdb_options);
else
db->init();
}
int open(ostream &out) {
r = db->open(out);
if (r < 0)
return r;
+
+ // Monitors are few in number, so the resource cost of exposing
+ // very detailed stats is low: ramp up the priority of all the
+ // KV store's perf counters. Do this after open, because the backend may
+ // not have constructed its PerfCounters earlier.
+ if (db->get_perf_counters()) {
+ db->get_perf_counters()->set_prio_adjust(
+ PerfCountersBuilder::PRIO_USEFUL - PerfCountersBuilder::PRIO_DEBUGONLY);
+ }
+
io_work.start();
is_open = true;
return 0;
#include <boost/algorithm/string/predicate.hpp>
#define dout_subsys ceph_subsys_mon
-#define OSD_PG_CREATING_PREFIX "osd_pg_creating"
+static const string OSD_PG_CREATING_PREFIX("osd_pg_creating");
+static const string OSD_METADATA_PREFIX("osd_metadata");
namespace {
{
s.insert(service_name);
s.insert(OSD_PG_CREATING_PREFIX);
+ s.insert(OSD_METADATA_PREFIX);
}
void OSDMonitor::update_from_paxos(bool *need_bootstrap)
void OSDMonitor::on_restart()
{
last_osd_report.clear();
-
- if (mon->is_leader()) {
- // fix ruleset != ruleid
- if (osdmap.crush->has_legacy_rulesets() &&
- !osdmap.crush->has_multirule_rulesets()) {
- CrushWrapper newcrush;
- _get_pending_crush(newcrush);
- int r = newcrush.renumber_rules_by_ruleset();
- if (r >= 0) {
- dout(1) << __func__ << " crush map has ruleset != rule id; fixing" << dendl;
- pending_inc.crush.clear();
- newcrush.encode(pending_inc.crush, mon->get_quorum_con_features());
- } else {
- dout(10) << __func__ << " unable to renumber rules by ruleset" << dendl;
- }
- }
- }
}
void OSDMonitor::on_shutdown()
<< pending_inc.new_nearfull_ratio << dendl;
}
}
+
+ // Rewrite CRUSH rule IDs if they are using legacy "ruleset"
+ // structure.
+ if (osdmap.crush->has_legacy_rule_ids()) {
+ CrushWrapper newcrush;
+ _get_pending_crush(newcrush);
+
+ // First, for all pools, work out which rule they really used
+ // by resolving ruleset to rule.
+ for (const auto &i : osdmap.get_pools()) {
+ const auto pool_id = i.first;
+ const auto &pool = i.second;
+ int new_rule_id = newcrush.find_rule(pool.crush_rule,
+ pool.type, pool.size);
+
+ dout(1) << __func__ << " rewriting pool "
+ << osdmap.get_pool_name(pool_id) << " crush ruleset "
+ << pool.crush_rule << " -> rule id " << new_rule_id << dendl;
+ if (pending_inc.new_pools.count(pool_id) == 0) {
+ pending_inc.new_pools[pool_id] = pool;
+ }
+ pending_inc.new_pools[pool_id].crush_rule = new_rule_id;
+ }
+
+ // Now, go ahead and renumber all the rules so that their
+ // rule_id field corresponds to their position in the array
+ auto old_to_new = newcrush.renumber_rules();
+ dout(1) << __func__ << " Rewrote " << old_to_new << " crush IDs:" << dendl;
+ for (const auto &i : old_to_new) {
+ dout(1) << __func__ << " " << i.first << " -> " << i.second << dendl;
+ }
+ pending_inc.crush.clear();
+ newcrush.encode(pending_inc.crush, mon->get_quorum_con_features());
+ }
}
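
Resolution has to precede renumbering because a pool's crush_rule field historically held a ruleset id, which need not equal the array position of the rule that actually serves the pool. A toy illustration of the two passes (stand-in types, not CrushWrapper):

    #include <cstdio>
    #include <map>
    #include <vector>

    struct ToyRule { int ruleset; int id; };  // legacy: ruleset may differ from position

    int main() {
      std::vector<ToyRule> rules = { {5, 5}, {1, 1} };   // at positions 0 and 1
      std::map<int, int> pool_rule = { {0, 5}, {1, 1} }; // pool -> legacy ruleset

      // pass 1: resolve each pool's ruleset to the index of its rule
      for (auto& [pool, rs] : pool_rule)
        for (size_t i = 0; i < rules.size(); ++i)
          if (rules[i].ruleset == rs) { rs = (int)i; break; }

      // pass 2: renumber so rule id (and ruleset) equal the array position
      for (size_t i = 0; i < rules.size(); ++i)
        rules[i].ruleset = rules[i].id = (int)i;

      for (auto& [pool, rule] : pool_rule)
        std::printf("pool %d now uses rule id %d\n", pool, rule);
    }
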
creating_pgs_t
tmp.apply_incremental(pending_inc);
if (tmp.require_osd_release >= CEPH_RELEASE_LUMINOUS) {
- // set or clear full/nearfull?
- int full, backfill, nearfull;
- tmp.count_full_nearfull_osds(&full, &backfill, &nearfull);
- if (full > 0) {
- if (!tmp.test_flag(CEPH_OSDMAP_FULL)) {
- dout(10) << __func__ << " setting full flag" << dendl;
- add_flag(CEPH_OSDMAP_FULL);
- remove_flag(CEPH_OSDMAP_NEARFULL);
- }
- } else {
- if (tmp.test_flag(CEPH_OSDMAP_FULL)) {
- dout(10) << __func__ << " clearing full flag" << dendl;
- remove_flag(CEPH_OSDMAP_FULL);
- }
- if (nearfull > 0) {
- if (!tmp.test_flag(CEPH_OSDMAP_NEARFULL)) {
- dout(10) << __func__ << " setting nearfull flag" << dendl;
- add_flag(CEPH_OSDMAP_NEARFULL);
- }
- } else {
- if (tmp.test_flag(CEPH_OSDMAP_NEARFULL)) {
- dout(10) << __func__ << " clearing nearfull flag" << dendl;
- remove_flag(CEPH_OSDMAP_NEARFULL);
- }
- }
+ // remove any legacy osdmap nearfull/full flags
+ {
+ if (tmp.test_flag(CEPH_OSDMAP_FULL | CEPH_OSDMAP_NEARFULL)) {
+ dout(10) << __func__ << " clearing legacy osdmap nearfull/full flag"
+ << dendl;
+ remove_flag(CEPH_OSDMAP_NEARFULL);
+ remove_flag(CEPH_OSDMAP_FULL);
+ }
+ }
+ // collect which pools are currently affected by
+ // the near/backfill/full osd(s),
+ // and set per-pool near/backfill/full flag instead
+ set<int64_t> full_pool_ids;
+ set<int64_t> backfillfull_pool_ids;
+ set<int64_t> nearfull_pool_ids;
+ tmp.get_full_pools(g_ceph_context,
+ &full_pool_ids,
+ &backfillfull_pool_ids,
+ &nearfull_pool_ids);
+ if (full_pool_ids.empty() ||
+ backfillfull_pool_ids.empty() ||
+ nearfull_pool_ids.empty()) {
+ // normal case - no nearfull, backfillfull or full osds
+ // try to cancel any improper nearfull/backfillfull/full pool
+ // flags first
+ for (auto &pool: tmp.get_pools()) {
+ auto p = pool.first;
+ if (tmp.get_pg_pool(p)->has_flag(pg_pool_t::FLAG_NEARFULL) &&
+ nearfull_pool_ids.empty()) {
+ dout(10) << __func__ << " clearing pool '" << tmp.pool_name[p]
+ << "'s nearfull flag" << dendl;
+ if (pending_inc.new_pools.count(p) == 0) {
+ // load original pool info first!
+ pending_inc.new_pools[p] = pool.second;
+ }
+ pending_inc.new_pools[p].flags &= ~pg_pool_t::FLAG_NEARFULL;
+ }
+ if (tmp.get_pg_pool(p)->has_flag(pg_pool_t::FLAG_BACKFILLFULL) &&
+ backfillfull_pool_ids.empty()) {
+ dout(10) << __func__ << " clearing pool '" << tmp.pool_name[p]
+ << "'s backfillfull flag" << dendl;
+ if (pending_inc.new_pools.count(p) == 0) {
+ pending_inc.new_pools[p] = pool.second;
+ }
+ pending_inc.new_pools[p].flags &= ~pg_pool_t::FLAG_BACKFILLFULL;
+ }
+ if (tmp.get_pg_pool(p)->has_flag(pg_pool_t::FLAG_FULL) &&
+ full_pool_ids.empty()) {
+ if (tmp.get_pg_pool(p)->has_flag(pg_pool_t::FLAG_FULL_NO_QUOTA)) {
+ // set by EQUOTA, skipping
+ continue;
+ }
+ dout(10) << __func__ << " clearing pool '" << tmp.pool_name[p]
+ << "'s full flag" << dendl;
+ if (pending_inc.new_pools.count(p) == 0) {
+ pending_inc.new_pools[p] = pool.second;
+ }
+ pending_inc.new_pools[p].flags &= ~pg_pool_t::FLAG_FULL;
+ }
+ }
+ }
+ if (!full_pool_ids.empty()) {
+ dout(10) << __func__ << " marking pool(s) " << full_pool_ids
+ << " as full" << dendl;
+ for (auto &p: full_pool_ids) {
+ if (tmp.get_pg_pool(p)->has_flag(pg_pool_t::FLAG_FULL)) {
+ continue;
+ }
+ if (pending_inc.new_pools.count(p) == 0) {
+ pending_inc.new_pools[p] = tmp.pools[p];
+ }
+ pending_inc.new_pools[p].flags |= pg_pool_t::FLAG_FULL;
+ pending_inc.new_pools[p].flags &= ~pg_pool_t::FLAG_BACKFILLFULL;
+ pending_inc.new_pools[p].flags &= ~pg_pool_t::FLAG_NEARFULL;
+ }
+ // cancel FLAG_FULL for pools which are no longer full too
+ for (auto &pool: tmp.get_pools()) {
+ auto p = pool.first;
+ if (full_pool_ids.count(p)) {
+ // skip pools we have just marked as full above
+ continue;
+ }
+ if (!tmp.get_pg_pool(p)->has_flag(pg_pool_t::FLAG_FULL) ||
+ tmp.get_pg_pool(p)->has_flag(pg_pool_t::FLAG_FULL_NO_QUOTA)) {
+ // don't touch if it is not currently full,
+ // or is running out of quota (and hence considered full)
+ continue;
+ }
+ dout(10) << __func__ << " clearing pool '" << tmp.pool_name[p]
+ << "'s full flag" << dendl;
+ if (pending_inc.new_pools.count(p) == 0) {
+ pending_inc.new_pools[p] = pool.second;
+ }
+ pending_inc.new_pools[p].flags &= ~pg_pool_t::FLAG_FULL;
+ }
+ }
+ if (!backfillfull_pool_ids.empty()) {
+ for (auto &p: backfillfull_pool_ids) {
+ if (full_pool_ids.count(p)) {
+ // skip pools we have already considered as full above
+ continue;
+ }
+ if (tmp.get_pg_pool(p)->has_flag(pg_pool_t::FLAG_FULL_NO_QUOTA)) {
+ // make sure FLAG_FULL is truly set, so we are safe not
+ // to set an extra (redundant) FLAG_BACKFILLFULL flag
+ assert(tmp.get_pg_pool(p)->has_flag(pg_pool_t::FLAG_FULL));
+ continue;
+ }
+ if (tmp.get_pg_pool(p)->has_flag(pg_pool_t::FLAG_BACKFILLFULL)) {
+ // don't bother if pool is already marked as backfillfull
+ continue;
+ }
+ dout(10) << __func__ << " marking pool '" << tmp.pool_name[p]
+ << "'s as backfillfull" << dendl;
+ if (pending_inc.new_pools.count(p) == 0) {
+ pending_inc.new_pools[p] = tmp.pools[p];
+ }
+ pending_inc.new_pools[p].flags |= pg_pool_t::FLAG_BACKFILLFULL;
+ pending_inc.new_pools[p].flags &= ~pg_pool_t::FLAG_NEARFULL;
+ }
+ // cancel FLAG_BACKFILLFULL for pools
+ // which are no longer backfillfull too
+ for (auto &pool: tmp.get_pools()) {
+ auto p = pool.first;
+ if (full_pool_ids.count(p) || backfillfull_pool_ids.count(p)) {
+ // skip pools we have just marked as backfillfull/full above
+ continue;
+ }
+ if (!tmp.get_pg_pool(p)->has_flag(pg_pool_t::FLAG_BACKFILLFULL)) {
+ // and don't touch if it is not currently backfillfull
+ continue;
+ }
+ dout(10) << __func__ << " clearing pool '" << tmp.pool_name[p]
+ << "'s backfillfull flag" << dendl;
+ if (pending_inc.new_pools.count(p) == 0) {
+ pending_inc.new_pools[p] = pool.second;
+ }
+ pending_inc.new_pools[p].flags &= ~pg_pool_t::FLAG_BACKFILLFULL;
+ }
+ }
+ if (!nearfull_pool_ids.empty()) {
+ for (auto &p: nearfull_pool_ids) {
+ if (full_pool_ids.count(p) || backfillfull_pool_ids.count(p)) {
+ continue;
+ }
+ if (tmp.get_pg_pool(p)->has_flag(pg_pool_t::FLAG_FULL_NO_QUOTA)) {
+ // make sure FLAG_FULL is truly set, so we are safe not
+ // to set an extra (redundant) FLAG_NEARFULL flag
+ assert(tmp.get_pg_pool(p)->has_flag(pg_pool_t::FLAG_FULL));
+ continue;
+ }
+ if (tmp.get_pg_pool(p)->has_flag(pg_pool_t::FLAG_NEARFULL)) {
+ // don't bother if pool is already marked as nearfull
+ continue;
+ }
+ dout(10) << __func__ << " marking pool '" << tmp.pool_name[p]
+ << "'s as nearfull" << dendl;
+ if (pending_inc.new_pools.count(p) == 0) {
+ pending_inc.new_pools[p] = tmp.pools[p];
+ }
+ pending_inc.new_pools[p].flags |= pg_pool_t::FLAG_NEARFULL;
+ }
+ // cancel FLAG_NEARFULL for pools
+ // which are no longer nearfull too
+ for (auto &pool: tmp.get_pools()) {
+ auto p = pool.first;
+ if (full_pool_ids.count(p) ||
+ backfillfull_pool_ids.count(p) ||
+ nearfull_pool_ids.count(p)) {
+ // skip pools we have just marked as
+ // nearfull/backfillfull/full above
+ continue;
+ }
+ if (!tmp.get_pg_pool(p)->has_flag(pg_pool_t::FLAG_NEARFULL)) {
+ // and don't touch if it is not currently nearfull
+ continue;
+ }
+ dout(10) << __func__ << " clearing pool '" << tmp.pool_name[p]
+ << "'s nearfull flag" << dendl;
+ if (pending_inc.new_pools.count(p) == 0) {
+ pending_inc.new_pools[p] = pool.second;
+ }
+ pending_inc.new_pools[p].flags &= ~pg_pool_t::FLAG_NEARFULL;
+ }
}
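
The net effect is a strict precedence among the per-pool states: setting FULL clears BACKFILLFULL and NEARFULL, and setting BACKFILLFULL clears NEARFULL. A compact sketch of that bit discipline (flag values assumed for illustration; the real bits live in pg_pool_t):

    #include <cstdint>
    #include <cstdio>

    // Assumed flag bits, for illustration only.
    enum : uint64_t { NEARFULL = 1, BACKFILLFULL = 2, FULL = 4 };

    int main() {
      uint64_t flags = NEARFULL | BACKFILLFULL;
      // marking a pool full clears the weaker states, as the code above does
      flags |= FULL;
      flags &= ~(uint64_t)(NEARFULL | BACKFILLFULL);
      std::printf("flags = 0x%llx\n", (unsigned long long)flags);  // 0x4
    }
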
// min_compat_client?
goto ignore;
}
+ if (m->forced) {
+ return false;
+ }
+
for (auto p = m->pg_temp.begin(); p != m->pg_temp.end(); ++p) {
dout(20) << " " << p->first
<< (osdmap.pg_temp->count(p->first) ? osdmap.pg_temp->get(p->first) : empty)
}
}
- if (osdmap.crush->has_multirule_rulesets()) {
- ostringstream ss;
- ss << "CRUSH map contains multirule rulesets";
- summary.push_back(make_pair(HEALTH_WARN, ss.str()));
- if (detail) {
- ss << "; please manually fix the map";
- detail->push_back(make_pair(HEALTH_WARN, ss.str()));
- }
- }
-
// Not using 'sortbitwise' and should be?
if (!osdmap.test_flag(CEPH_OSDMAP_SORTBITWISE) &&
(osdmap.get_up_osd_features() &
return true;
}
-void OSDMonitor::update_pool_flags(int64_t pool_id, uint64_t flags)
+void OSDMonitor::set_pool_flags(int64_t pool_id, uint64_t flags)
+{
+ pg_pool_t *pool = pending_inc.get_new_pool(pool_id,
+ osdmap.get_pg_pool(pool_id));
+ assert(pool);
+ pool->set_flag(flags);
+}
+
+void OSDMonitor::clear_pool_flags(int64_t pool_id, uint64_t flags)
{
- const pg_pool_t *pool = osdmap.get_pg_pool(pool_id);
- pending_inc.get_new_pool(pool_id, pool)->flags = flags;
+ pg_pool_t *pool = pending_inc.get_new_pool(pool_id,
+ osdmap.get_pg_pool(pool_id));
+ assert(pool);
+ pool->unset_flag(flags);
}
bool OSDMonitor::update_pools_status()
(pool.quota_max_bytes > 0 && (uint64_t)sum.num_bytes >= pool.quota_max_bytes) ||
(pool.quota_max_objects > 0 && (uint64_t)sum.num_objects >= pool.quota_max_objects);
- if (pool.has_flag(pg_pool_t::FLAG_FULL)) {
+ if (pool.has_flag(pg_pool_t::FLAG_FULL_NO_QUOTA)) {
if (pool_is_full)
continue;
mon->clog->info() << "pool '" << pool_name
- << "' no longer full; removing FULL flag";
-
- update_pool_flags(it->first, pool.get_flags() & ~pg_pool_t::FLAG_FULL);
+ << "' no longer out of quota; removing NO_QUOTA flag";
+ // below we cancel FLAG_FULL too, we'll set it again in
+ // OSDMonitor::encode_pending if it still fails the osd-full checking.
+ clear_pool_flags(it->first,
+ pg_pool_t::FLAG_FULL_NO_QUOTA | pg_pool_t::FLAG_FULL);
ret = true;
} else {
if (!pool_is_full)
<< " (reached quota's max_objects: "
<< pool.quota_max_objects << ")";
}
- update_pool_flags(it->first, pool.get_flags() | pg_pool_t::FLAG_FULL);
+ // set both FLAG_FULL_NO_QUOTA and FLAG_FULL
+ // note that below we try to cancel FLAG_BACKFILLFULL/NEARFULL too
+ // since FLAG_FULL should always take precedence
+ set_pool_flags(it->first,
+ pg_pool_t::FLAG_FULL_NO_QUOTA | pg_pool_t::FLAG_FULL);
+ clear_pool_flags(it->first,
+ pg_pool_t::FLAG_NEARFULL |
+ pg_pool_t::FLAG_BACKFILLFULL);
ret = true;
}
}
user_map[*i] = string();
(*erasure_code_profile_map)[*i] = string();
} else {
- const string key = i->substr(0, equal);
+ string key = i->substr(0, equal);
equal++;
const string value = i->substr(equal);
+ if (key.find("ruleset-") == 0) {
+ if (osdmap.require_osd_release >= CEPH_RELEASE_LUMINOUS &&
+ g_conf->get_val<bool>("mon_fixup_legacy_erasure_code_profiles")) {
+ mon->clog->warn() << "erasure code profile property '" << key
+ << "' is no longer supported; try "
+ << "'crush-" << key.substr(8) << "' instead";
+ key = string("crush-") + key.substr(8);
+ } else {
+ *ss << "property '" << key << "' is no longer supported; try "
+ << "'crush-" << key.substr(8) << "' instead";
+ return -EINVAL;
+ }
+ }
user_map[key] = value;
(*erasure_code_profile_map)[key] = value;
}
return 0;
}
+int OSDMonitor::check_pg_num(int64_t pool, int pg_num, int size, ostream *ss)
+{
+ auto max_pgs_per_osd = g_conf->get_val<uint64_t>("mon_max_pg_per_osd");
+ auto num_osds = std::max(osdmap.get_num_in_osds(), 3u); // assume min cluster size 3
+ auto max_pgs = max_pgs_per_osd * num_osds;
+ uint64_t projected = 0;
+ if (pool < 0) {
+ projected += pg_num * size;
+ }
+ for (const auto& i : osdmap.get_pools()) {
+ if (i.first == pool) {
+ projected += pg_num * size;
+ } else {
+ projected += i.second.get_pg_num() * i.second.get_size();
+ }
+ }
+ if (projected > max_pgs) {
+ if (pool >= 0) {
+ *ss << "pool id " << pool;
+ }
+ *ss << " pg_num " << pg_num << " size " << size
+ << " would mean " << projected
+ << " total pgs, which exceeds max " << max_pgs
+ << " (mon_max_pg_per_osd " << max_pgs_per_osd
+ << " * num_in_osds " << num_osds << ")";
+ return -ERANGE;
+ }
+ return 0;
+}
+
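
A worked example of the projection, with assumed numbers: at ``mon_max_pg_per_osd = 200`` and 10 in OSDs the budget is 200 * 10 = 2000 PG instances; an existing pool with pg_num 512 at size 3 already accounts for 1536, so creating a pool with pg_num 256 at size 3 (768 more) projects 2304 and is refused with -ERANGE. The same arithmetic as a standalone sketch:

    #include <algorithm>
    #include <cstdint>
    #include <cstdio>

    int main() {
      uint64_t max_pg_per_osd = 200;                 // assumed option value
      unsigned num_osds = std::max(10u, 3u);         // min cluster size 3
      uint64_t max_pgs = max_pg_per_osd * num_osds;  // 2000

      uint64_t projected = 0;
      projected += 512ull * 3;  // existing pool: pg_num 512, size 3 -> 1536
      projected += 256ull * 3;  // proposed pool: pg_num 256, size 3 -> 768

      std::printf("projected %llu vs max %llu -> %s\n",
                  (unsigned long long)projected, (unsigned long long)max_pgs,
                  projected > max_pgs ? "rejected (-ERANGE)" : "allowed");
    }
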
/**
* @param name The name of the new pool
* @param auid The auid of the pool owner. Can be -1
dout(10) << " prepare_pool_size returns " << r << dendl;
return r;
}
+ r = check_pg_num(-1, pg_num, size, ss);
+ if (r) {
+ dout(10) << " prepare_pool_size returns " << r << dendl;
+ return r;
+ }
if (!osdmap.crush->check_crush_rule(crush_rule, pool_type, size, *ss)) {
return -EINVAL;
ss << "pool size must be between 1 and 10";
return -EINVAL;
}
+ int r = check_pg_num(pool, p.get_pg_num(), n, &ss);
+ if (r < 0) {
+ return r;
+ }
p.size = n;
if (n < p.min_size)
p.min_size = n;
<< " (you may adjust 'mon max pool pg num' for higher values)";
return -ERANGE;
}
+ int r = check_pg_num(pool, n, p.get_size(), &ss);
+ if (r) {
+ return r;
+ }
string force;
cmd_getval(g_ceph_context,cmdmap, "force", force);
if (p.cache_mode != pg_pool_t::CACHEMODE_NONE &&
bloomp->set_fpp(f);
} else if (var == "use_gmt_hitset") {
if (val == "true" || (interr.empty() && n == 1)) {
- if (!(osdmap.get_up_osd_features() & CEPH_FEATURE_OSD_HITSET_GMT)) {
+ string force;
+ cmd_getval(g_ceph_context, cmdmap, "force", force);
+ if (!osdmap.get_num_up_osds() && force != "--yes-i-really-mean-it") {
+ ss << "Not advisable to continue since no OSDs are up. Pass "
+ << "--yes-i-really-mean-it if you really wish to continue.";
+ return -EPERM;
+ }
+ if (!(osdmap.get_up_osd_features() & CEPH_FEATURE_OSD_HITSET_GMT)
+ && force != "--yes-i-really-mean-it") {
ss << "not all OSDs support GMT hit set.";
return -EINVAL;
}
}
}
- if (crush.has_legacy_rulesets()) {
+ if (crush.has_legacy_rule_ids()) {
err = -EINVAL;
ss << "crush maps with ruleset != ruleid are no longer allowed";
goto reply;
goto reply;
}
- const auto& osdmap_pools = osdmap.get_pools();
- for (auto pit = osdmap_pools.begin(); pit != osdmap_pools.end(); ++pit) {
- const int64_t pool_id = pit->first;
- const pg_pool_t &pool = pit->second;
- int ruleno = pool.get_crush_rule();
- if (!crush.rule_exists(ruleno)) {
- ss << " the crush rule no "<< ruleno << " for pool id " << pool_id << " is in use";
- err = -EINVAL;
- goto reply;
- }
+ err = osdmap.validate_crush_rules(&crush, &ss);
+ if (err < 0) {
+ goto reply;
}
if (g_conf->mon_osd_crush_smoke_test) {
ss << osdmap.get_crush_version() + 1;
goto update;
+ } else if (prefix == "osd crush set-all-straw-buckets-to-straw2") {
+ CrushWrapper newcrush;
+ _get_pending_crush(newcrush);
+ for (int b = 0; b < newcrush.get_max_buckets(); ++b) {
+ int bid = -1 - b;
+ if (newcrush.bucket_exists(bid) &&
+ newcrush.get_bucket_alg(bid) == CRUSH_BUCKET_STRAW) {
+ dout(20) << " bucket " << bid << " is straw, can convert" << dendl;
+ newcrush.bucket_set_alg(bid, CRUSH_BUCKET_STRAW2);
+ }
+ }
+ if (!validate_crush_against_features(&newcrush, ss)) {
+ err = -EINVAL;
+ goto reply;
+ }
+ pending_inc.crush.clear();
+ newcrush.encode(pending_inc.crush, mon->get_quorum_con_features());
+ wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, rs,
+ get_last_committed() + 1));
+ return true;
} else if (prefix == "osd crush set-device-class") {
if (osdmap.require_osd_release < CEPH_RELEASE_LUMINOUS) {
ss << "you must complete the upgrade and 'ceph osd require-osd-release "
// FIXME: this is ok in some situations, but let's not bother with that
// complexity now.
int ruleset = newcrush.get_rule_mask_ruleset(ruleno);
- if (osdmap.crush_ruleset_in_use(ruleset)) {
+ if (osdmap.crush_rule_in_use(ruleset)) {
ss << "crush ruleset " << name << " " << ruleset << " is in use";
err = -EBUSY;
goto reply;
return prepare_unset_flag(op, CEPH_OSDMAP_PAUSERD | CEPH_OSDMAP_PAUSEWR);
} else if (prefix == "osd set") {
+ string sure;
+ cmd_getval(g_ceph_context, cmdmap, "sure", sure);
string key;
cmd_getval(g_ceph_context, cmdmap, "key", key);
if (key == "full")
else if (key == "notieragent")
return prepare_set_flag(op, CEPH_OSDMAP_NOTIERAGENT);
else if (key == "sortbitwise") {
- if (osdmap.get_up_osd_features() & CEPH_FEATURE_OSD_BITWISE_HOBJ_SORT) {
+ if (!osdmap.get_num_up_osds() && sure != "--yes-i-really-mean-it") {
+ ss << "Not advisable to continue since no OSDs are up. Pass "
+ << "--yes-i-really-mean-it if you really wish to continue.";
+ err = -EPERM;
+ goto reply;
+ }
+ if ((osdmap.get_up_osd_features() & CEPH_FEATURE_OSD_BITWISE_HOBJ_SORT)
+ || sure == "--yes-i-really-mean-it") {
return prepare_set_flag(op, CEPH_OSDMAP_SORTBITWISE);
} else {
ss << "not all up OSDs have OSD_BITWISE_HOBJ_SORT feature";
goto reply;
}
} else if (key == "recovery_deletes") {
- if (HAVE_FEATURE(osdmap.get_up_osd_features(), OSD_RECOVERY_DELETES)) {
+ if (!osdmap.get_num_up_osds() && sure != "--yes-i-really-mean-it") {
+ ss << "Not advisable to continue since no OSDs are up. Pass "
+ << "--yes-i-really-mean-it if you really wish to continue.";
+ err = -EPERM;
+ goto reply;
+ }
+ if (HAVE_FEATURE(osdmap.get_up_osd_features(), OSD_RECOVERY_DELETES)
+ || sure == "--yes-i-really-mean-it") {
return prepare_set_flag(op, CEPH_OSDMAP_RECOVERY_DELETES);
} else {
ss << "not all up OSDs have OSD_RECOVERY_DELETES feature";
goto reply;
}
} else if (key == "require_jewel_osds") {
+ if (!osdmap.get_num_up_osds() && sure != "--yes-i-really-mean-it") {
+ ss << "Not advisable to continue since no OSDs are up. Pass "
+ << "--yes-i-really-mean-it if you really wish to continue.";
+ err = -EPERM;
+ goto reply;
+ }
if (!osdmap.test_flag(CEPH_OSDMAP_SORTBITWISE)) {
ss << "the sortbitwise flag must be set before require_jewel_osds";
err = -EPERM;
ss << "require_osd_release is already >= jewel";
err = 0;
goto reply;
- } else if (HAVE_FEATURE(osdmap.get_up_osd_features(), SERVER_JEWEL)) {
+ } else if (HAVE_FEATURE(osdmap.get_up_osd_features(), SERVER_JEWEL)
+ || sure == "--yes-i-really-mean-it") {
return prepare_set_flag(op, CEPH_OSDMAP_REQUIRE_JEWEL);
} else {
ss << "not all up OSDs have CEPH_FEATURE_SERVER_JEWEL feature";
err = -EPERM;
}
} else if (key == "require_kraken_osds") {
+ if (!osdmap.get_num_up_osds() && sure != "--yes-i-really-mean-it") {
+ ss << "Not advisable to continue since no OSDs are up. Pass "
+ << "--yes-i-really-mean-it if you really wish to continue.";
+ err = -EPERM;
+ goto reply;
+ }
if (!osdmap.test_flag(CEPH_OSDMAP_SORTBITWISE)) {
ss << "the sortbitwise flag must be set before require_kraken_osds";
err = -EPERM;
ss << "require_osd_release is already >= kraken";
err = 0;
goto reply;
- } else if (HAVE_FEATURE(osdmap.get_up_osd_features(), SERVER_KRAKEN)) {
+ } else if (HAVE_FEATURE(osdmap.get_up_osd_features(), SERVER_KRAKEN)
+ || sure == "--yes-i-really-mean-it") {
bool r = prepare_set_flag(op, CEPH_OSDMAP_REQUIRE_KRAKEN);
// ensure JEWEL is also set
pending_inc.new_flags |= CEPH_OSDMAP_REQUIRE_JEWEL;
} else if (prefix == "osd require-osd-release") {
string release;
cmd_getval(g_ceph_context, cmdmap, "release", release);
+ string sure;
+ cmd_getval(g_ceph_context, cmdmap, "sure", sure);
if (!osdmap.test_flag(CEPH_OSDMAP_SORTBITWISE)) {
ss << "the sortbitwise flag must be set first";
err = -EPERM;
if (osdmap.is_up(osd)) {
msg << ", while it was still marked up";
} else {
- msg << ", after it was down for " << int(down_pending_out[osd].sec())
+ auto period = ceph_clock_now() - down_pending_out[osd];
+ msg << ", after it was down for " << int(period.sec())
<< " seconds";
}
#include "erasure-code/ErasureCodeInterface.h"
#include "mon/MonOpRequest.h"
-#define OSD_METADATA_PREFIX "osd_metadata"
-
/// information about a particular peer's failure reports for one osd
struct failure_reporter_t {
utime_t failed_since; ///< when they think it failed
const string &erasure_code_profile,
unsigned *stripe_width,
ostream *ss);
+ int check_pg_num(int64_t pool, int pg_num, int size, ostream* ss);
int prepare_new_pool(string& name, uint64_t auid,
int crush_rule,
const string &crush_rule_name,
ostream *ss);
int prepare_new_pool(MonOpRequestRef op);
- void update_pool_flags(int64_t pool_id, uint64_t flags);
+ void set_pool_flags(int64_t pool_id, uint64_t flags);
+ void clear_pool_flags(int64_t pool_id, uint64_t flags);
bool update_pools_status();
bool prepare_set_flag(MonOpRequestRef op, int flag);
curr_object_copies_rate = (float)(sum.num_object_copies - sum.num_objects_degraded) / sum.num_object_copies;
float used = 0.0;
+ // note avail passed in is raw_avail, calc raw_used here.
if (avail) {
- used = sum.num_bytes * curr_object_copies_rate;
+ used = sum.num_bytes * raw_used_rate * curr_object_copies_rate;
used /= used + avail;
} else if (sum.num_bytes) {
used = 1.0;
}
// TOO_FEW_PGS
- int num_in = osdmap.get_num_in_osds();
- int sum_pg_up = MAX(pg_sum.up, static_cast<int32_t>(pg_stat.size()));
- if (num_in &&
- cct->_conf->mon_pg_warn_min_per_osd > 0 &&
- osdmap.get_pools().size() > 0) {
- int per = sum_pg_up / num_in;
- if (per < cct->_conf->mon_pg_warn_min_per_osd && per) {
+ unsigned num_in = osdmap.get_num_in_osds();
+ auto sum_pg_up = std::max(static_cast<size_t>(pg_sum.up), pg_stat.size());
+ const auto min_pg_per_osd =
+ cct->_conf->get_val<uint64_t>("mon_pg_warn_min_per_osd");
+ if (num_in && min_pg_per_osd > 0 && osdmap.get_pools().size() > 0) {
+ auto per = sum_pg_up / num_in;
+ if (per < min_pg_per_osd && per) {
ostringstream ss;
ss << "too few PGs per OSD (" << per
- << " < min " << cct->_conf->mon_pg_warn_min_per_osd << ")";
+ << " < min " << min_pg_per_osd << ")";
checks->add("TOO_FEW_PGS", HEALTH_WARN, ss.str());
}
}
// TOO_MANY_PGS
- if (num_in && cct->_conf->mon_pg_warn_max_per_osd > 0) {
- int per = sum_pg_up / num_in;
- if (per > cct->_conf->mon_pg_warn_max_per_osd) {
+ auto max_pg_per_osd = cct->_conf->get_val<uint64_t>("mon_max_pg_per_osd");
+ if (num_in && max_pg_per_osd > 0) {
+ auto per = sum_pg_up / num_in;
+ if (per > max_pg_per_osd) {
ostringstream ss;
ss << "too many PGs per OSD (" << per
- << " > max " << cct->_conf->mon_pg_warn_max_per_osd << ")";
+ << " > max " << max_pg_per_osd << ")";
checks->add("TOO_MANY_PGS", HEALTH_WARN, ss.str());
}
}
note["incomplete"] += p->second;
if (p->first & PG_STATE_BACKFILL_WAIT)
note["backfill_wait"] += p->second;
- if (p->first & PG_STATE_BACKFILL)
+ if (p->first & PG_STATE_BACKFILLING)
note["backfilling"] += p->second;
if (p->first & PG_STATE_BACKFILL_TOOFULL)
note["backfill_toofull"] += p->second;
PG_STATE_RECOVERY_TOOFULL |
PG_STATE_INCOMPLETE |
PG_STATE_BACKFILL_WAIT |
- PG_STATE_BACKFILL |
+ PG_STATE_BACKFILLING |
PG_STATE_BACKFILL_TOOFULL)) &&
stuck_pgs.count(p->first) == 0) {
if (max > 0) {
}
// pg skew
- int num_in = osdmap.get_num_in_osds();
- int sum_pg_up = MAX(pg_sum.up, static_cast<int32_t>(pg_stat.size()));
+ auto num_in = osdmap.get_num_in_osds();
+ auto sum_pg_up = MAX(static_cast<unsigned>(pg_sum.up), pg_stat.size());
int sum_objects = pg_sum.stats.sum.num_objects;
if (sum_objects < cct->_conf->mon_pg_warn_min_objects) {
return;
}
- if (num_in && cct->_conf->mon_pg_warn_min_per_osd > 0) {
- int per = sum_pg_up / num_in;
- if (per < cct->_conf->mon_pg_warn_min_per_osd && per) {
+ const auto min_pg_per_osd =
+ cct->_conf->get_val<uint64_t>("mon_pg_warn_min_per_osd");
+ if (num_in && min_pg_per_osd > 0) {
+ auto per = sum_pg_up / num_in;
+ if (per < min_pg_per_osd && per) {
ostringstream ss;
- ss << "too few PGs per OSD (" << per << " < min " << cct->_conf->mon_pg_warn_min_per_osd << ")";
+ ss << "too few PGs per OSD (" << per << " < min " << min_pg_per_osd << ")";
summary.push_back(make_pair(HEALTH_WARN, ss.str()));
if (detail)
detail->push_back(make_pair(HEALTH_WARN, ss.str()));
}
}
- if (num_in && cct->_conf->mon_pg_warn_max_per_osd > 0) {
+ int64_t max_pg_per_osd = cct->_conf->get_val<uint64_t>("mon_max_pg_per_osd");
+ if (num_in && max_pg_per_osd > 0) {
int per = sum_pg_up / num_in;
- if (per > cct->_conf->mon_pg_warn_max_per_osd) {
+ if (per > max_pg_per_osd) {
ostringstream ss;
- ss << "too many PGs per OSD (" << per << " > max " << cct->_conf->mon_pg_warn_max_per_osd << ")";
+ ss << "too many PGs per OSD (" << per << " > max "
+ << max_pg_per_osd << ")";
summary.push_back(make_pair(HEALTH_WARN, ss.str()));
if (detail)
detail->push_back(make_pair(HEALTH_WARN, ss.str()));
state = -1;
break;
} else {
- int filter = pg_string_state(state_str);
- if (filter < 0) {
+ auto filter = pg_string_state(state_str);
+ if (!filter) {
*ss << "'" << state_str << "' is not a valid pg state,"
<< " available choices: " << pg_state_string(0xFFFFFFFF);
return -EINVAL;
}
- state |= filter;
+ state |= *filter;
}
states.pop_back();
void Paxos::init_logger()
{
PerfCountersBuilder pcb(g_ceph_context, "paxos", l_paxos_first, l_paxos_last);
+
+ // Because monitors are so few in number, the resource cost of capturing
+ // almost all their perf counters at USEFUL is trivial.
+ pcb.set_prio_default(PerfCountersBuilder::PRIO_USEFUL);
+
pcb.add_u64_counter(l_paxos_start_leader, "start_leader", "Starts in leader role");
pcb.add_u64_counter(l_paxos_start_peon, "start_peon", "Starts in peon role");
pcb.add_u64_counter(l_paxos_restart, "restart", "Restarts");
}
// set timeout event
- collect_timeout_event = new C_MonContext(mon, [this](int r) {
+ collect_timeout_event = mon->timer.add_event_after(
+ g_conf->mon_accept_timeout_factor *
+ g_conf->mon_lease,
+ new C_MonContext(mon, [this](int r) {
if (r == -ECANCELED)
return;
collect_timeout();
- });
- mon->timer.add_event_after(g_conf->mon_accept_timeout_factor *
- g_conf->mon_lease,
- collect_timeout_event);
+ }));
}
}
// set timeout event
- accept_timeout_event = new C_MonContext(mon, [this](int r) {
- if (r == -ECANCELED)
- return;
- accept_timeout();
- });
- mon->timer.add_event_after(g_conf->mon_accept_timeout_factor *
- g_conf->mon_lease,
- accept_timeout_event);
+ accept_timeout_event = mon->timer.add_event_after(
+ g_conf->mon_accept_timeout_factor * g_conf->mon_lease,
+ new C_MonContext(mon, [this](int r) {
+ if (r == -ECANCELED)
+ return;
+ accept_timeout();
+ }));
}
// peon
// set timeout event.
// if old timeout is still in place, leave it.
if (!lease_ack_timeout_event) {
- lease_ack_timeout_event = new C_MonContext(mon, [this](int r) {
- if (r == -ECANCELED)
- return;
- lease_ack_timeout();
- });
- mon->timer.add_event_after(g_conf->mon_lease_ack_timeout_factor *
- g_conf->mon_lease,
- lease_ack_timeout_event);
+ lease_ack_timeout_event = mon->timer.add_event_after(
+ g_conf->mon_lease_ack_timeout_factor * g_conf->mon_lease,
+ new C_MonContext(mon, [this](int r) {
+ if (r == -ECANCELED)
+ return;
+ lease_ack_timeout();
+ }));
}
// set renew event
- lease_renew_event = new C_MonContext(mon, [this](int r) {
- if (r == -ECANCELED)
- return;
- lease_renew_timeout();
- });
utime_t at = lease_expire;
at -= g_conf->mon_lease;
at += g_conf->mon_lease_renew_interval_factor * g_conf->mon_lease;
- mon->timer.add_event_at(at, lease_renew_event);
+ lease_renew_event = mon->timer.add_event_at(
+ at, new C_MonContext(mon, [this](int r) {
+ if (r == -ECANCELED)
+ return;
+ lease_renew_timeout();
+ }));
}
void Paxos::warn_on_future_time(utime_t t, entity_name_t from)
dout(20) << "reset_lease_timeout - setting timeout event" << dendl;
if (lease_timeout_event)
mon->timer.cancel_event(lease_timeout_event);
- lease_timeout_event = new C_MonContext(mon, [this](int r) {
- if (r == -ECANCELED)
- return;
- lease_timeout();
- });
- mon->timer.add_event_after(g_conf->mon_lease_ack_timeout_factor *
- g_conf->mon_lease,
- lease_timeout_event);
+ lease_timeout_event = mon->timer.add_event_after(
+ g_conf->mon_lease_ack_timeout_factor * g_conf->mon_lease,
+ new C_MonContext(mon, [this](int r) {
+ if (r == -ECANCELED)
+ return;
+ lease_timeout();
+ }));
}
void Paxos::lease_timeout()
* Callback class used to propose the pending value once the proposal_timer
* fires up.
*/
- proposal_timer = new C_MonContext(mon, [this](int r) {
+ auto do_propose = new C_MonContext(mon, [this](int r) {
proposal_timer = 0;
if (r >= 0) {
propose_pending();
assert(0 == "bad return value for proposal_timer");
}
});
- dout(10) << " setting proposal_timer " << proposal_timer
+ dout(10) << " setting proposal_timer " << do_propose
<< " with delay of " << delay << dendl;
- mon->timer.add_event_after(delay, proposal_timer);
+ proposal_timer = mon->timer.add_event_after(delay, do_propose);
} else {
dout(10) << " proposal_timer already set" << dendl;
}
#include <errno.h>
#include <sstream>
+#include <signal.h>
#define SOCKET_PRIORITY_MIN_DELAY 6
/**
* @} // Subclass Interfacing
*/
+public:
+#ifdef CEPH_USE_SIGPIPE_BLOCKER
+ /**
+ * We need to disable SIGPIPE on all platforms, and if they
+ * don't give us a better mechanism (read: are on Solaris) that
+ * means blocking the signal whenever we do a send or sendmsg...
+ * That means any implementation must invoke MSGR_SIGPIPE_STOPPER in scope
+ * whenever doing so. On most systems that's blank, but on systems where
+ * it's needed we construct an RAII object to plug and un-plug the SIGPIPE.
+ * See http://www.microhowto.info/howto/ignore_sigpipe_without_affecting_other_threads_in_a_process.html
+ */
+ struct sigpipe_stopper {
+ bool blocked;
+ sigset_t existing_mask;
+ sigset_t pipe_mask;
+ sigpipe_stopper() {
+ sigemptyset(&pipe_mask);
+ sigaddset(&pipe_mask, SIGPIPE);
+ sigset_t signals;
+ sigemptyset(&signals);
+ sigpending(&signals);
+ if (sigismember(&signals, SIGPIPE)) {
+ blocked = false;
+ } else {
+ blocked = true;
+ int r = pthread_sigmask(SIG_BLOCK, &pipe_mask, &existing_mask);
+ assert(r == 0);
+ }
+ }
+ ~sigpipe_stopper() {
+ if (blocked) {
+ struct timespec nowait{0};
+ int r = sigtimedwait(&pipe_mask, 0, &nowait);
+ assert(r == SIGPIPE || (r == -1 && errno == EAGAIN));
+ r = pthread_sigmask(SIG_SETMASK, &existing_mask, 0);
+ assert(r == 0);
+ }
+ }
+ };
+# define MSGR_SIGPIPE_STOPPER Messenger::sigpipe_stopper stopper
+#else
+# define MSGR_SIGPIPE_STOPPER
+#endif
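
A usage sketch (``guarded_send`` is a hypothetical caller, not part of Messenger; the real call sites are the sendmsg()/send() loops further below): the stopper must stay alive across the write so a peer-closed socket surfaces as EPIPE rather than a fatal SIGPIPE, and where MSG_NOSIGNAL or SO_NOSIGPIPE exist the macro compiles away to nothing:

    #include <sys/socket.h>
    #include <sys/types.h>
    #include "msg/Messenger.h"  // for MSGR_SIGPIPE_STOPPER (assumed include path)

    // Hypothetical helper for illustration only.
    static ssize_t guarded_send(int fd, const void* buf, size_t len) {
      MSGR_SIGPIPE_STOPPER;  // RAII block of SIGPIPE only where needed
      return ::send(fd, buf, len, 0);
    }
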
/**
* @defgroup Dispatcher Interfacing
* @{
*/
-public:
/**
* Determine whether a message can be fast-dispatched. We will
* query each Dispatcher in sequence to determine if they are
#include <atomic>
#include <pthread.h>
-#include <signal.h>
#include <climits>
#include <list>
#include <mutex>
#include "include/buffer.h"
#include "include/str_list.h"
-#include "include/sock_compat.h"
#include "common/errno.h"
#include "common/strtol.h"
#include "common/dout.h"
#include "common/simple_spin.h"
+#include "msg/Messenger.h"
+#include "include/sock_compat.h"
#define dout_subsys ceph_subsys_ms
#undef dout_prefix
int _fd;
entity_addr_t sa;
bool connected;
-#if !defined(MSG_NOSIGNAL) && !defined(SO_NOSIGPIPE)
- sigset_t sigpipe_mask;
- bool sigpipe_pending;
- bool sigpipe_unblock;
-#endif
public:
explicit PosixConnectedSocketImpl(NetHandler &h, const entity_addr_t &sa, int f, bool connected)
return r;
}
- /*
- SIGPIPE suppression - for platforms without SO_NOSIGPIPE or MSG_NOSIGNAL
- http://krokisplace.blogspot.in/2010/02/suppressing-sigpipe-in-library.html
- http://www.microhowto.info/howto/ignore_sigpipe_without_affecting_other_threads_in_a_process.html
- */
- static void suppress_sigpipe()
- {
- #if !defined(MSG_NOSIGNAL) && !defined(SO_NOSIGPIPE)
- /*
- We want to ignore possible SIGPIPE that we can generate on write.
- SIGPIPE is delivered *synchronously* and *only* to the thread
- doing the write. So if it is reported as already pending (which
- means the thread blocks it), then we do nothing: if we generate
- SIGPIPE, it will be merged with the pending one (there's no
- queuing), and that suits us well. If it is not pending, we block
- it in this thread (and we avoid changing signal action, because it
- is per-process).
- */
- sigset_t pending;
- sigemptyset(&pending);
- sigpending(&pending);
- sigpipe_pending = sigismember(&pending, SIGPIPE);
- if (!sigpipe_pending) {
- sigset_t blocked;
- sigemptyset(&blocked);
- pthread_sigmask(SIG_BLOCK, &sigpipe_mask, &blocked);
-
- /* Maybe is was blocked already? */
- sigpipe_unblock = ! sigismember(&blocked, SIGPIPE);
- }
- #endif /* !defined(MSG_NOSIGNAL) && !defined(SO_NOSIGPIPE) */
- }
-
- static void restore_sigpipe()
- {
- #if !defined(MSG_NOSIGNAL) && !defined(SO_NOSIGPIPE)
- /*
- If SIGPIPE was pending already we do nothing. Otherwise, if it
- become pending (i.e., we generated it), then we sigwait() it (thus
- clearing pending status). Then we unblock SIGPIPE, but only if it
- were us who blocked it.
- */
- if (!sigpipe_pending) {
- sigset_t pending;
- sigemptyset(&pending);
- sigpending(&pending);
- if (sigismember(&pending, SIGPIPE)) {
- /*
- Protect ourselves from a situation when SIGPIPE was sent
- by the user to the whole process, and was delivered to
- other thread before we had a chance to wait for it.
- */
- static const struct timespec nowait = { 0, 0 };
- TEMP_FAILURE_RETRY(sigtimedwait(&sigpipe_mask, NULL, &nowait));
- }
-
- if (sigpipe_unblock)
- pthread_sigmask(SIG_UNBLOCK, &sigpipe_mask, NULL);
- }
- #endif /* !defined(MSG_NOSIGNAL) && !defined(SO_NOSIGPIPE) */
- }
-
// return the sent length
// < 0 means error occured
static ssize_t do_sendmsg(int fd, struct msghdr &msg, unsigned len, bool more)
{
- suppress_sigpipe();
-
size_t sent = 0;
while (1) {
+ MSGR_SIGPIPE_STOPPER;
ssize_t r;
- #if defined(MSG_NOSIGNAL)
r = ::sendmsg(fd, &msg, MSG_NOSIGNAL | (more ? MSG_MORE : 0));
- #else
- r = ::sendmsg(fd, &msg, (more ? MSG_MORE : 0));
- #endif /* defined(MSG_NOSIGNAL) */
-
if (r < 0) {
if (errno == EINTR) {
continue;
}
}
}
- restore_sigpipe();
return (ssize_t)sent;
}
}
// block ESIGPIPE
-#ifdef SO_NOSIGPIPE
+#ifdef CEPH_USE_SO_NOSIGPIPE
int val = 1;
r = ::setsockopt(sd, SOL_SOCKET, SO_NOSIGPIPE, (void*)&val, sizeof(val));
if (r) {
}
// block ESIGPIPE
-#if defined(SO_NOSIGPIPE)
+#ifdef CEPH_USE_SO_NOSIGPIPE
int val = 1;
int r = ::setsockopt(sd, SOL_SOCKET, SO_NOSIGPIPE, (void*)&val, sizeof(val));
if (r) {
return ret;
}
-/*
- SIGPIPE suppression - for platforms without SO_NOSIGPIPE or MSG_NOSIGNAL
- http://krokisplace.blogspot.in/2010/02/suppressing-sigpipe-in-library.html
- http://www.microhowto.info/howto/ignore_sigpipe_without_affecting_other_threads_in_a_process.html
-*/
-void Pipe::suppress_sigpipe()
-{
-#if !defined(MSG_NOSIGNAL) && !defined(SO_NOSIGPIPE)
- /*
- We want to ignore possible SIGPIPE that we can generate on write.
- SIGPIPE is delivered *synchronously* and *only* to the thread
- doing the write. So if it is reported as already pending (which
- means the thread blocks it), then we do nothing: if we generate
- SIGPIPE, it will be merged with the pending one (there's no
- queuing), and that suits us well. If it is not pending, we block
- it in this thread (and we avoid changing signal action, because it
- is per-process).
- */
- sigset_t pending;
- sigemptyset(&pending);
- sigpending(&pending);
- sigpipe_pending = sigismember(&pending, SIGPIPE);
- if (!sigpipe_pending) {
- sigset_t blocked;
- sigemptyset(&blocked);
- pthread_sigmask(SIG_BLOCK, &sigpipe_mask, &blocked);
-
- /* Maybe is was blocked already? */
- sigpipe_unblock = ! sigismember(&blocked, SIGPIPE);
- }
-#endif /* !defined(MSG_NOSIGNAL) && !defined(SO_NOSIGPIPE) */
-}
-
-
-void Pipe::restore_sigpipe()
-{
-#if !defined(MSG_NOSIGNAL) && !defined(SO_NOSIGPIPE)
- /*
- If SIGPIPE was pending already we do nothing. Otherwise, if it
- become pending (i.e., we generated it), then we sigwait() it (thus
- clearing pending status). Then we unblock SIGPIPE, but only if it
- were us who blocked it.
- */
- if (!sigpipe_pending) {
- sigset_t pending;
- sigemptyset(&pending);
- sigpending(&pending);
- if (sigismember(&pending, SIGPIPE)) {
- /*
- Protect ourselves from a situation when SIGPIPE was sent
- by the user to the whole process, and was delivered to
- other thread before we had a chance to wait for it.
- */
- static const struct timespec nowait = { 0, 0 };
- TEMP_FAILURE_RETRY(sigtimedwait(&sigpipe_mask, NULL, &nowait));
- }
-
- if (sigpipe_unblock)
- pthread_sigmask(SIG_UNBLOCK, &sigpipe_mask, NULL);
- }
-#endif /* !defined(MSG_NOSIGNAL) && !defined(SO_NOSIGPIPE) */
-}
-
-
int Pipe::do_sendmsg(struct msghdr *msg, unsigned len, bool more)
{
- suppress_sigpipe();
+ MSGR_SIGPIPE_STOPPER;
while (len > 0) {
int r;
-#if defined(MSG_NOSIGNAL)
r = ::sendmsg(sd, msg, MSG_NOSIGNAL | (more ? MSG_MORE : 0));
-#else
- r = ::sendmsg(sd, msg, (more ? MSG_MORE : 0));
-#endif
if (r == 0)
ldout(msgr->cct,10) << "do_sendmsg hmm do_sendmsg got r==0!" << dendl;
if (r < 0) {
r = -errno;
ldout(msgr->cct,1) << "do_sendmsg error " << cpp_strerror(r) << dendl;
- restore_sigpipe();
return r;
}
if (state == STATE_CLOSED) {
ldout(msgr->cct,10) << "do_sendmsg oh look, state == CLOSED, giving up" << dendl;
- restore_sigpipe();
return -EINTR; // close enough
}
}
}
}
- restore_sigpipe();
return 0;
}
//lgeneric_dout(cct, DBL) << "tcp_write writing " << len << dendl;
assert(len > 0);
- suppress_sigpipe();
-
while (len > 0) {
- int did;
-#if defined(MSG_NOSIGNAL)
- did = ::send( sd, buf, len, MSG_NOSIGNAL );
-#else
- did = ::send( sd, buf, len, 0);
-#endif
+ MSGR_SIGPIPE_STOPPER;
+ int did = ::send( sd, buf, len, MSG_NOSIGNAL );
if (did < 0) {
//lgeneric_dout(cct, 1) << "tcp_write error did = " << did << " " << cpp_strerror(errno) << dendl;
//lgeneric_derr(cct, 1) << "tcp_write error did = " << did << " " << cpp_strerror(errno) << dendl;
buf += did;
//lgeneric_dout(cct, DBL) << "tcp_write did " << did << ", " << len << " left" << dendl;
}
- restore_sigpipe();
-
return 0;
}
private:
int sd;
struct iovec msgvec[SM_IOV_MAX];
-#if !defined(MSG_NOSIGNAL) && !defined(SO_NOSIGPIPE)
- sigset_t sigpipe_mask;
- bool sigpipe_pending;
- bool sigpipe_unblock;
-#endif
public:
int port;
int write_keepalive();
int write_keepalive2(char tag, const utime_t &t);
- void suppress_sigpipe();
- void restore_sigpipe();
-
-
void fault(bool reader=false);
void was_session_reset();
const SequencerPosition *spos=0 ///< [in] Sequencer
) { return 0; }
- virtual int check(std::ostream &out, bool repair = false) { return 0; }
+ virtual int check(std::ostream &out, bool repair = false, bool force = false) { return 0; }
virtual void compact() {}
virtual int fsck(bool deep) {
return -EOPNOTSUPP;
}
+ virtual int repair(bool deep) {
+ return -EOPNOTSUPP;
+ }
virtual void set_cache_shards(unsigned num) { }
{
}
-int BitmapFreelistManager::create(uint64_t new_size, KeyValueDB::Transaction txn)
+int BitmapFreelistManager::create(uint64_t new_size, uint64_t min_alloc_size,
+ KeyValueDB::Transaction txn)
{
- bytes_per_block = cct->_conf->bdev_block_size;
+ bytes_per_block = std::max(cct->_conf->bdev_block_size,
+ (int64_t)min_alloc_size);
assert(ISP2(bytes_per_block));
size = P2ALIGN(new_size, bytes_per_block);
blocks_per_key = cct->_conf->bluestore_freelist_blocks_per_key;
return 0;
}
-int BitmapFreelistManager::init()
+int BitmapFreelistManager::init(uint64_t dev_size)
{
dout(1) << __func__ << dendl;
<< " blocks_per_key 0x" << blocks_per_key
<< std::dec << dendl;
_init_misc();
+
+ // check for http://tracker.ceph.com/issues/21089 inconsistency
+ {
+ uint64_t new_size = P2ALIGN(dev_size, bytes_per_block);
+ if (new_size != size) {
+ uint64_t bad_size = new_size & ~bytes_per_block;
+ if (size == bad_size) {
+ derr << __func__ << " size is 0x" << std::hex << size << " should be 0x"
+ << new_size << " and appears to be due to #21089" << std::dec
+ << dendl;
+
+ uint64_t new_blocks = new_size / bytes_per_block;
+ if (new_blocks / blocks_per_key * blocks_per_key != new_blocks) {
+ new_blocks = (new_blocks / blocks_per_key + 1) *
+ blocks_per_key;
+ }
+
+ KeyValueDB::Transaction t = kvdb->get_transaction();
+ {
+ bufferlist sizebl;
+ ::encode(new_size, sizebl);
+ t->set(meta_prefix, "size", sizebl);
+ }
+ if (new_blocks != blocks) {
+ derr << "blocks is 0x" << std::hex << blocks << " should be 0x"
+ << new_blocks << std::dec << dendl;
+ bufferlist bl;
+ ::encode(new_blocks, bl);
+ t->set(meta_prefix, "blocks", bl);
+ _xor(new_size, new_blocks * bytes_per_block - new_size, t);
+ } else {
+ derr << "blocks are ok" << dendl;
+ _xor(bad_size, bytes_per_block, t);
+ }
+ int r = kvdb->submit_transaction_sync(t);
+ assert(r == 0);
+ size = new_size;
+ blocks = new_blocks;
+ derr << __func__ << " fixed inconsistency, size now 0x" << std::hex
+ << size << " blocks 0x" << blocks << std::dec << dendl;
+ }
+ }
+ }
return 0;
}
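
For concreteness, assumed numbers reproducing the inconsistency pattern this check detects: the correct size is P2ALIGN(dev_size, bytes_per_block), i.e. dev_size masked with ~(bytes_per_block - 1); stores damaged by #21089 instead hold that value with the single bytes_per_block bit cleared, which is what the ``size == bad_size`` comparison matches:

    #include <cstdint>
    #include <cstdio>

    static uint64_t p2align(uint64_t x, uint64_t align) { return x & ~(align - 1); }

    int main() {
      uint64_t dev_size = (10ull << 30) + 0x38123;  // assumed, not block-aligned
      uint64_t bytes_per_block = 0x10000;           // 64 KiB min_alloc_size

      uint64_t good = p2align(dev_size, bytes_per_block);  // 0x280030000
      uint64_t bad  = good & ~bytes_per_block;             // 0x280020000: bit 16 lost
      std::printf("expected 0x%llx, #21089 stores 0x%llx\n",
                  (unsigned long long)good, (unsigned long long)bad);
    }
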
static void setup_merge_operator(KeyValueDB *db, string prefix);
- int create(uint64_t size, KeyValueDB::Transaction txn) override;
+ int create(uint64_t size, uint64_t min_alloc_size,
+ KeyValueDB::Transaction txn) override;
- int init() override;
+ int init(uint64_t dev_size) override;
void shutdown() override;
void dump() override;
f->close_section();
}
+void BlueFS::dump_block_extents(ostream& out)
+{
+ for (unsigned i = 0; i < MAX_BDEV; ++i) {
+ if (!bdev[i]) {
+ continue;
+ }
+ out << i << " : size 0x" << std::hex << bdev[i]->get_size()
+ << " : own 0x" << block_all[i] << std::dec << "\n";
+ }
+}
void BlueFS::get_usage(vector<pair<uint64_t,uint64_t>> *usage)
{
new_log = new File;
new_log->fnode.ino = 0; // so that _flush_range won't try to log the fnode
+ // 0. wait for any racing flushes to complete. (We do not want to block
+ // in _flush_sync_log with jump_to set or else a racing thread might flush
+ // our entries and our jump_to update won't be correct.)
+ while (log_flushing) {
+ dout(10) << __func__ << " log is currently flushing, waiting" << dendl;
+ log_cond.wait(l);
+ }
+
// 1. allocate new log space and jump to it.
old_log_jump_to = log_file->fnode.get_allocated();
uint64_t need = old_log_jump_to + cct->_conf->bluefs_max_log_runway;
while (log_flushing) {
dout(10) << __func__ << " want_seq " << want_seq
<< " log is currently flushing, waiting" << dendl;
+ assert(!jump_to);
log_cond.wait(l);
}
if (want_seq && want_seq <= log_seq_stable) {
dout(10) << __func__ << " want_seq " << want_seq << " <= log_seq_stable "
<< log_seq_stable << ", done" << dendl;
+ assert(!jump_to);
return 0;
}
if (log_t.empty() && dirty_files.empty()) {
dout(10) << __func__ << " want_seq " << want_seq
<< " " << log_t << " not dirty, dirty_files empty, no-op" << dendl;
+ assert(!jump_to);
return 0;
}
derr << __func__ << " allocated: 0x" << std::hex << allocated
<< " offset: 0x" << offset << " length: 0x" << length << std::dec
<< dendl;
+ assert(0 == "bluefs enospc");
return r;
}
h->file->fnode.recalc_allocated();
void get_usage(vector<pair<uint64_t,uint64_t>> *usage); // [<free,total> ...]
void dump_perf_counters(Formatter *f);
+ void dump_block_extents(ostream& out);
+
/// get current extents that we own for given block device
int get_block_extents(unsigned id, interval_set<uint64_t> *extents);
return false;
}
+void BlueStore::OnodeSpace::dump(CephContext *cct, int lvl)
+{
+ for (auto& i : onode_map) {
+ ldout(cct, lvl) << i.first << " : " << i.second << dendl;
+ }
+}
// SharedBlob
<< " removing self from set " << get_parent()
<< dendl;
if (get_parent()) {
- if (get_parent()->remove(this)) {
+ if (get_parent()->try_remove(this)) {
delete this;
} else {
ldout(coll->store->cct, 20)
}
}
+// SharedBlobSet
+
+#undef dout_prefix
+#define dout_prefix *_dout << "bluestore.sharedblobset(" << this << ") "
+
+void BlueStore::SharedBlobSet::dump(CephContext *cct, int lvl)
+{
+ std::lock_guard<std::mutex> l(lock);
+ for (auto& i : sb_map) {
+ ldout(cct, lvl) << i.first << " : " << *i.second << dendl;
+ }
+}
+
// Blob
#undef dout_prefix
unsigned n;
// we need to encode inline_bl to measure encoded length
bool never_happen = encode_some(0, OBJECT_MAX_SIZE, inline_bl, &n);
+ inline_bl.reassign_to_mempool(mempool::mempool_bluestore_cache_other);
assert(!never_happen);
size_t len = inline_bl.length();
dout(20) << __func__ << " inline shard " << len << " bytes from " << n
on->exists = true;
bufferptr::iterator p = v.front().begin_deep();
on->onode.decode(p);
+ for (auto& i : on->onode.attrs) {
+ i.second.reassign_to_mempool(mempool::mempool_bluestore_cache_other);
+ }
// initialize extent_map
on->extent_map.decode_spanning_blobs(p);
if (on->onode.extent_map_shards.empty()) {
denc(on->extent_map.inline_bl, p);
on->extent_map.decode_some(on->extent_map.inline_bl);
+ on->extent_map.inline_bl.reassign_to_mempool(
+ mempool::mempool_bluestore_cache_other);
} else {
on->extent_map.init_shards(false, false);
}
continue;
}
ldout(store->cct, 20) << __func__ << " moving " << *sb << dendl;
- sb->coll = dest;
if (sb->get_sbid()) {
ldout(store->cct, 20) << __func__
<< " moving registration " << *sb << dendl;
shared_blob_set.remove(sb);
dest->shared_blob_set.add(dest, sb);
}
+ sb->coll = dest;
if (dest->cache != cache) {
for (auto& i : sb->bc.buffer_map) {
if (!i.second->is_writing()) {
return;
}
- if (cct->_conf->bluestore_compression_max_blob_size) {
- comp_min_blob_size = cct->_conf->bluestore_compression_max_blob_size;
+ if (cct->_conf->bluestore_compression_min_blob_size) {
+ comp_min_blob_size = cct->_conf->bluestore_compression_min_blob_size;
} else {
assert(bdev);
if (bdev->is_rotational()) {
return 0;
}
+int BlueStore::write_meta(const std::string& key, const std::string& value)
+{
+ bluestore_bdev_label_t label;
+ string p = path + "/block";
+ int r = _read_bdev_label(cct, p, &label);
+ if (r < 0) {
+ return ObjectStore::write_meta(key, value);
+ }
+ label.meta[key] = value;
+ r = _write_bdev_label(cct, p, label);
+ assert(r == 0);
+ return ObjectStore::write_meta(key, value);
+}
+
+int BlueStore::read_meta(const std::string& key, std::string *value)
+{
+ bluestore_bdev_label_t label;
+ string p = path + "/block";
+ int r = _read_bdev_label(cct, p, &label);
+ if (r < 0) {
+ return ObjectStore::read_meta(key, value);
+ }
+ auto i = label.meta.find(key);
+ if (i == label.meta.end()) {
+ return ObjectStore::read_meta(key, value);
+ }
+ *value = i->second;
+ return 0;
+}
+
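
The effect is a two-level lookup: the copy embedded in the block device label wins when it is present and readable, and the generic ObjectStore metadata is the fallback (writes update both). A minimal sketch of the pattern with stand-in types:

    #include <cstdio>
    #include <map>
    #include <optional>
    #include <string>

    struct ToyLabel { std::map<std::string, std::string> meta; };

    // Stand-in for the fallback store (ObjectStore::read_meta in the real code).
    static std::map<std::string, std::string> fallback;

    std::optional<std::string> toy_read_meta(const ToyLabel* label,
                                             const std::string& key) {
      if (label) {                       // label readable: prefer its copy
        auto i = label->meta.find(key);
        if (i != label->meta.end()) return i->second;
      }
      auto i = fallback.find(key);       // otherwise fall back
      if (i == fallback.end()) return std::nullopt;
      return i->second;
    }

    int main() {
      ToyLabel label;
      label.meta["path_block.db"] = "/dev/vg/db";
      fallback["path_block.db"] = "stale";
      auto v = toy_read_meta(&label, "path_block.db");
      std::printf("%s\n", v ? v->c_str() : "(unset)");  // prints the label copy
    }
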
void BlueStore::_init_logger()
{
PerfCountersBuilder b(cct, "bluestore",
path_fd = -1;
}
-int BlueStore::_write_bdev_label(string path, bluestore_bdev_label_t label)
+int BlueStore::_write_bdev_label(CephContext *cct,
+ string path, bluestore_bdev_label_t label)
{
dout(10) << __func__ << " path " << path << " label " << label << dendl;
bufferlist bl;
derr << __func__ << " failed to write to " << path
<< ": " << cpp_strerror(r) << dendl;
}
+ r = ::fsync(fd);
+ if (r < 0) {
+ derr << __func__ << " failed to fsync " << path
+ << ": " << cpp_strerror(r) << dendl;
+ }
VOID_TEMP_FAILURE_RETRY(::close(fd));
return r;
}
label.size = size;
label.btime = ceph_clock_now();
label.description = desc;
- int r = _write_bdev_label(path, label);
+ int r = _write_bdev_label(cct, path, label);
if (r < 0)
return r;
} else {
bl.append(freelist_type);
t->set(PREFIX_SUPER, "freelist_type", bl);
}
- fm->create(bdev->get_size(), t);
+ fm->create(bdev->get_size(), min_alloc_size, t);
// allocate superblock reserved space. note that we do not mark
// bluefs space as allocated in the freelist; we instead rely on
// bluefs_extents.
- fm->allocate(0, SUPER_RESERVED, t);
+ uint64_t reserved = ROUND_UP_TO(MAX(SUPER_RESERVED, min_alloc_size),
+ min_alloc_size);
+ fm->allocate(0, reserved, t);
- uint64_t reserved = 0;
if (cct->_conf->bluestore_bluefs) {
assert(bluefs_extents.num_intervals() == 1);
interval_set<uint64_t>::iterator p = bluefs_extents.begin();
- reserved = p.get_start() + p.get_len();
+ reserved = ROUND_UP_TO(p.get_start() + p.get_len(), min_alloc_size);
dout(20) << __func__ << " reserved 0x" << std::hex << reserved << std::dec
<< " for bluefs" << dendl;
bufferlist bl;
t->set(PREFIX_SUPER, "bluefs_extents", bl);
dout(20) << __func__ << " bluefs_extents 0x" << std::hex << bluefs_extents
<< std::dec << dendl;
- } else {
- reserved = SUPER_RESERVED;
}
if (cct->_conf->bluestore_debug_prefill > 0) {
db->submit_transaction_sync(t);
}
- int r = fm->init();
+ int r = fm->init(bdev->get_size());
if (r < 0) {
derr << __func__ << " freelist init failed: " << cpp_strerror(r) << dendl;
delete fm;
string bfn;
struct stat st;
- bfn = path + "/block.db";
+ if (read_meta("path_block.db", &bfn) < 0) {
+ bfn = path + "/block.db";
+ }
if (::stat(bfn.c_str(), &st) == 0) {
r = bluefs->add_block_device(BlueFS::BDEV_DB, bfn);
if (r < 0) {
}
// shared device
- bfn = path + "/block";
+ if (read_meta("path_block", &bfn) < 0) {
+ bfn = path + "/block";
+ }
r = bluefs->add_block_device(bluefs_shared_bdev, bfn);
if (r < 0) {
derr << __func__ << " add block device(" << bfn << ") returned: "
bdev->get_size() * (cct->_conf->bluestore_bluefs_min_ratio +
cct->_conf->bluestore_bluefs_gift_ratio);
initial = MAX(initial, cct->_conf->bluestore_bluefs_min);
+ if (cct->_conf->bluefs_alloc_size % min_alloc_size) {
+ derr << __func__ << " bluefs_alloc_size 0x" << std::hex
+ << cct->_conf->bluefs_alloc_size << " is not a multiple of "
+ << "min_alloc_size 0x" << min_alloc_size << std::dec << dendl;
+ r = -EINVAL;
+ goto free_bluefs;
+ }
// align to bluefs's alloc_size
initial = P2ROUNDUP(initial, cct->_conf->bluefs_alloc_size);
// put bluefs in the middle of the device in case it is an HDD
bluefs_extents.insert(start, initial);
}
- bfn = path + "/block.wal";
+ if (read_meta("path_block.wal", &bfn) < 0) {
+ bfn = path + "/block.wal";
+ }
if (::stat(bfn.c_str(), &st) == 0) {
r = bluefs->add_block_device(BlueFS::BDEV_WAL, bfn);
if (r < 0) {
<< " > max_ratio " << cct->_conf->bluestore_bluefs_max_ratio
<< ", should reclaim " << pretty_si_t(reclaim) << dendl;
}
+
+ // don't take over too much of the freespace
+ uint64_t free_cap = cct->_conf->bluestore_bluefs_max_ratio * total_free;
if (bluefs_total < cct->_conf->bluestore_bluefs_min &&
- cct->_conf->bluestore_bluefs_min <
- (uint64_t)(cct->_conf->bluestore_bluefs_max_ratio * total_free)) {
+ cct->_conf->bluestore_bluefs_min < free_cap) {
uint64_t g = cct->_conf->bluestore_bluefs_min - bluefs_total;
dout(10) << __func__ << " bluefs_total " << bluefs_total
<< " < min " << cct->_conf->bluestore_bluefs_min
gift = g;
reclaim = 0;
}
+ uint64_t min_free = cct->_conf->get_val<uint64_t>("bluestore_bluefs_min_free");
+ if (bluefs_free < min_free &&
+ min_free < free_cap) {
+ uint64_t g = min_free - bluefs_free;
+ dout(10) << __func__ << " bluefs_free " << bluefs_total
+ << " < min " << min_free
+ << ", should gift " << pretty_si_t(g) << dendl;
+ if (g > gift)
+ gift = g;
+ reclaim = 0;
+ }
if (gift) {
// round up to alloc size
if (r < 0)
goto out_close_fsid;
+ {
+ string wal_path = cct->_conf->get_val<string>("bluestore_block_wal_path");
+ if (wal_path.size()) {
+ write_meta("path_block.wal", wal_path);
+ }
+ string db_path = cct->_conf->get_val<string>("bluestore_block_db_path");
+ if (db_path.size()) {
+ write_meta("path_block.db", db_path);
+ }
+ }
+
+ // choose min_alloc_size
+ if (cct->_conf->bluestore_min_alloc_size) {
+ min_alloc_size = cct->_conf->bluestore_min_alloc_size;
+ } else {
+ assert(bdev);
+ if (bdev->is_rotational()) {
+ min_alloc_size = cct->_conf->bluestore_min_alloc_size_hdd;
+ } else {
+ min_alloc_size = cct->_conf->bluestore_min_alloc_size_ssd;
+ }
+ }
+
+ // make sure min_alloc_size is power of 2 aligned.
+ if (!ISP2(min_alloc_size)) {
+ derr << __func__ << " min_alloc_size 0x"
+ << std::hex << min_alloc_size << std::dec
+ << " is not power of 2 aligned!"
+ << dendl;
+ r = -EINVAL;
+ goto out_close_bdev;
+ }
+
r = _open_db(true);
if (r < 0)
goto out_close_bdev;
t->set(PREFIX_SUPER, "blobid_max", bl);
}
- // choose min_alloc_size
- if (cct->_conf->bluestore_min_alloc_size) {
- min_alloc_size = cct->_conf->bluestore_min_alloc_size;
- } else {
- assert(bdev);
- if (bdev->is_rotational()) {
- min_alloc_size = cct->_conf->bluestore_min_alloc_size_hdd;
- } else {
- min_alloc_size = cct->_conf->bluestore_min_alloc_size_ssd;
- }
- }
-
- // make sure min_alloc_size is power of 2 aligned.
- if (!ISP2(min_alloc_size)) {
- derr << __func__ << " min_alloc_size 0x"
- << std::hex << min_alloc_size << std::dec
- << " is not power of 2 aligned!"
- << dendl;
- r = -EINVAL;
- goto out_close_fm;
- }
-
{
bufferlist bl;
::encode((uint64_t)min_alloc_size, bl);
if (r < 0)
goto out_close_fm;
- r = write_meta("bluefs", stringify((int)cct->_conf->bluestore_bluefs));
+ r = write_meta("bluefs", stringify(bluefs ? 1 : 0));
if (r < 0)
goto out_close_fm;
{
dout(1) << __func__ << " path " << path << dendl;
+ _kv_only = kv_only;
+
{
string type;
int r = read_meta("type", &type);
int BlueStore::umount()
{
- assert(mounted);
+ assert(_kv_only || mounted);
dout(1) << __func__ << dendl;
_osr_drain_all();
_osr_unregister_all();
- mempool_thread.shutdown();
-
- dout(20) << __func__ << " stopping kv thread" << dendl;
- _kv_stop();
- _reap_collections();
- _flush_cache();
- dout(20) << __func__ << " closing" << dendl;
-
mounted = false;
- _close_alloc();
- _close_fm();
+ if (!_kv_only) {
+ mempool_thread.shutdown();
+ dout(20) << __func__ << " stopping kv thread" << dendl;
+ _kv_stop();
+ _reap_collections();
+ _flush_cache();
+ dout(20) << __func__ << " closing" << dendl;
+
+ _close_alloc();
+ _close_fm();
+ }
_close_db();
_close_bdev();
_close_fsid();
}
bool already = false;
apply(
- e.offset, e.length, block_size, used_blocks, __func__,
+ e.offset, e.length, min_alloc_size, used_blocks, __func__,
[&](uint64_t pos, mempool_dynamic_bitset &bs) {
if (bs.test(pos))
already = true;
return errors;
}
-int BlueStore::fsck(bool deep)
+int BlueStore::_fsck(bool deep, bool repair)
{
- dout(1) << __func__ << (deep ? " (deep)" : " (shallow)") << " start" << dendl;
+ dout(1) << __func__
+ << (repair ? " repair" : " fsck")
+ << (deep ? " (deep)" : " (shallow)") << " start" << dendl;
int errors = 0;
+ int repaired = 0;
typedef btree::btree_set<
uint64_t,std::less<uint64_t>,
if (r < 0)
goto out_scan;
- used_blocks.resize(bdev->get_size() / block_size);
+ used_blocks.resize(bdev->get_size() / min_alloc_size);
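+ // note: tracking used space at min_alloc_size rather than block_size
+ // granularity also shrinks this bitmap; e.g. (illustrative numbers) a
+ // 4 TiB device needs 2^26 bits (~8 MiB) at a 64 KiB min_alloc_size
+ // versus 2^30 bits (~128 MiB) at a 4 KiB block size.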
apply(
- 0, SUPER_RESERVED, block_size, used_blocks, "0~SUPER_RESERVED",
+ 0, MAX(min_alloc_size, SUPER_RESERVED), min_alloc_size, used_blocks,
+ "0~SUPER_RESERVED",
[&](uint64_t pos, mempool_dynamic_bitset &bs) {
bs.set(pos);
}
if (bluefs) {
for (auto e = bluefs_extents.begin(); e != bluefs_extents.end(); ++e) {
apply(
- e.get_start(), e.get_len(), block_size, used_blocks, "bluefs",
+ e.get_start(), e.get_len(), min_alloc_size, used_blocks, "bluefs",
[&](uint64_t pos, mempool_dynamic_bitset &bs) {
bs.set(pos);
}
if (is_extent_shard_key(it->key())) {
while (!expecting_shards.empty() &&
expecting_shards.front() < it->key()) {
- derr << __func__ << " error: missing shard key "
+ derr << "fsck error: missing shard key "
<< pretty_binary_string(expecting_shards.front())
<< dendl;
++errors;
uint32_t offset;
string okey;
get_key_extent_shard(it->key(), &okey, &offset);
- derr << __func__ << " error: stray shard 0x" << std::hex << offset
+ derr << "fsck error: stray shard 0x" << std::hex << offset
<< std::dec << dendl;
if (expecting_shards.empty()) {
- derr << __func__ << " error: " << pretty_binary_string(it->key())
+ derr << "fsck error: " << pretty_binary_string(it->key())
<< " is unexpected" << dendl;
++errors;
continue;
}
while (expecting_shards.front() > it->key()) {
- derr << __func__ << " error: saw " << pretty_binary_string(it->key())
+ derr << "fsck error: saw " << pretty_binary_string(it->key())
<< dendl;
- derr << __func__ << " error: exp "
+ derr << "fsck error: exp "
<< pretty_binary_string(expecting_shards.front()) << dendl;
++errors;
expecting_shards.pop_front();
ghobject_t oid;
int r = get_key_object(it->key(), &oid);
if (r < 0) {
- derr << __func__ << " error: bad object key "
+ derr << "fsck error: bad object key "
<< pretty_binary_string(it->key()) << dendl;
++errors;
continue;
}
}
if (!c) {
- derr << __func__ << " error: stray object " << oid
+ derr << "fsck error: stray object " << oid
<< " not owned by any collection" << dendl;
++errors;
continue;
if (!expecting_shards.empty()) {
for (auto &k : expecting_shards) {
- derr << __func__ << " error: missing shard key "
+ derr << "fsck error: missing shard key "
<< pretty_binary_string(k) << dendl;
}
++errors;
OnodeRef o = c->get_onode(oid, false);
if (o->onode.nid) {
if (o->onode.nid > nid_max) {
- derr << __func__ << " error: " << oid << " nid " << o->onode.nid
+ derr << "fsck error: " << oid << " nid " << o->onode.nid
<< " > nid_max " << nid_max << dendl;
++errors;
}
if (used_nids.count(o->onode.nid)) {
- derr << __func__ << " error: " << oid << " nid " << o->onode.nid
+ derr << "fsck error: " << oid << " nid " << o->onode.nid
<< " already in use" << dendl;
++errors;
continue; // go for next object
get_extent_shard_key(o->key, s.shard_info->offset,
&expecting_shards.back());
if (s.shard_info->offset >= o->onode.size) {
- derr << __func__ << " error: " << oid << " shard 0x" << std::hex
+ derr << "fsck error: " << oid << " shard 0x" << std::hex
<< s.shard_info->offset << " past EOF at 0x" << o->onode.size
<< std::dec << dendl;
++errors;
for (auto& l : o->extent_map.extent_map) {
dout(20) << __func__ << " " << l << dendl;
if (l.logical_offset < pos) {
- derr << __func__ << " error: " << oid << " lextent at 0x"
+ derr << "fsck error: " << oid << " lextent at 0x"
<< std::hex << l.logical_offset
<< " overlaps with the previous, which ends at 0x" << pos
<< std::dec << dendl;
++errors;
}
if (o->extent_map.spans_shard(l.logical_offset, l.length)) {
- derr << __func__ << " error: " << oid << " lextent at 0x"
+ derr << "fsck error: " << oid << " lextent at 0x"
<< std::hex << l.logical_offset << "~" << l.length
<< " spans a shard boundary"
<< std::dec << dendl;
<< std::dec << " for " << *i.first << dendl;
const bluestore_blob_t& blob = i.first->get_blob();
if (i.second & blob.unused) {
- derr << __func__ << " error: " << oid << " blob claims unused 0x"
+ derr << "fsck error: " << oid << " blob claims unused 0x"
<< std::hex << blob.unused
<< " but extents reference 0x" << i.second
<< " on blob " << *i.first << dendl;
if ((blob.unused & mask) == mask) {
// this csum chunk region is marked unused
if (blob.get_csum_item(p) != 0) {
- derr << __func__ << " error: " << oid
+ derr << "fsck error: " << oid
<< " blob claims csum chunk 0x" << std::hex << pos
<< "~" << csum_chunk_size
<< " is unused (mask 0x" << mask << " of unused 0x"
const bluestore_blob_t& blob = i.first->get_blob();
bool equal = i.first->get_blob_use_tracker().equal(i.second);
if (!equal) {
- derr << __func__ << " error: " << oid << " blob " << *i.first
+ derr << "fsck error: " << oid << " blob " << *i.first
<< " doesn't match expected ref_map " << i.second << dendl;
++errors;
}
}
if (blob.is_shared()) {
if (i.first->shared_blob->get_sbid() > blobid_max) {
- derr << __func__ << " error: " << oid << " blob " << blob
+ derr << "fsck error: " << oid << " blob " << blob
<< " sbid " << i.first->shared_blob->get_sbid() << " > blobid_max "
<< blobid_max << dendl;
++errors;
} else if (i.first->shared_blob->get_sbid() == 0) {
- derr << __func__ << " error: " << oid << " blob " << blob
+ derr << "fsck error: " << oid << " blob " << blob
<< " marked as shared but has uninitialized sbid"
<< dendl;
++errors;
int r = _do_read(c.get(), o, 0, o->onode.size, bl, 0);
if (r < 0) {
++errors;
- derr << __func__ << " error: " << oid << " error during read: "
+ derr << "fsck error: " << oid << " error during read: "
<< cpp_strerror(r) << dendl;
}
}
// omap
if (o->onode.has_omap()) {
if (used_omap_head.count(o->onode.nid)) {
- derr << __func__ << " error: " << oid << " omap_head " << o->onode.nid
+ derr << "fsck error: " << oid << " omap_head " << o->onode.nid
<< " already in use" << dendl;
++errors;
} else {
string key = it->key();
uint64_t sbid;
if (get_key_shared_blob(key, &sbid)) {
- derr << __func__ << " error: bad key '" << key
+ derr << "fsck error: bad key '" << key
<< "' in shared blob namespace" << dendl;
++errors;
continue;
}
auto p = sb_info.find(sbid);
if (p == sb_info.end()) {
- derr << __func__ << " error: found stray shared blob data for sbid 0x"
+ derr << "fsck error: found stray shared blob data for sbid 0x"
<< std::hex << sbid << std::dec << dendl;
++errors;
} else {
::decode(shared_blob, blp);
dout(20) << __func__ << " " << *sbi.sb << " " << shared_blob << dendl;
if (shared_blob.ref_map != sbi.ref_map) {
- derr << __func__ << " error: shared blob 0x" << std::hex << sbid
+ derr << "fsck error: shared blob 0x" << std::hex << sbid
<< std::dec << " ref_map " << shared_blob.ref_map
<< " != expected " << sbi.ref_map << dendl;
++errors;
}
}
for (auto &p : sb_info) {
- derr << __func__ << " error: shared_blob 0x" << p.first
+ derr << "fsck error: shared_blob 0x" << p.first
<< " key is missing (" << *p.second.sb << ")" << dendl;
++errors;
}
if (!(actual_statfs == expected_statfs)) {
- derr << __func__ << " error: actual " << actual_statfs
+ derr << "fsck error: actual " << actual_statfs
<< " != expected " << expected_statfs << dendl;
++errors;
}
uint64_t omap_head;
_key_decode_u64(it->key().c_str(), &omap_head);
if (used_omap_head.count(omap_head) == 0) {
- derr << __func__ << " error: found stray omap data on omap_head "
+ derr << "fsck error: found stray omap data on omap_head "
<< omap_head << dendl;
++errors;
}
try {
::decode(wt, p);
} catch (buffer::error& e) {
- derr << __func__ << " error: failed to decode deferred txn "
+ derr << "fsck error: failed to decode deferred txn "
<< pretty_binary_string(it->key()) << dendl;
r = -EIO;
goto out_scan;
<< " released 0x" << std::hex << wt.released << std::dec << dendl;
for (auto e = wt.released.begin(); e != wt.released.end(); ++e) {
apply(
- e.get_start(), e.get_len(), block_size, used_blocks, "deferred",
+ e.get_start(), e.get_len(), min_alloc_size, used_blocks, "deferred",
[&](uint64_t pos, mempool_dynamic_bitset &bs) {
bs.set(pos);
}
// know they are allocated.
for (auto e = bluefs_extents.begin(); e != bluefs_extents.end(); ++e) {
apply(
- e.get_start(), e.get_len(), block_size, used_blocks, "bluefs_extents",
+ e.get_start(), e.get_len(), min_alloc_size, used_blocks,
+ "bluefs_extents",
[&](uint64_t pos, mempool_dynamic_bitset &bs) {
bs.reset(pos);
}
while (fm->enumerate_next(&offset, &length)) {
bool intersects = false;
apply(
- offset, length, block_size, used_blocks, "free",
+ offset, length, min_alloc_size, used_blocks, "free",
[&](uint64_t pos, mempool_dynamic_bitset &bs) {
if (bs.test(pos)) {
intersects = true;
}
);
if (intersects) {
- derr << __func__ << " error: free extent 0x" << std::hex << offset
- << "~" << length << std::dec
- << " intersects allocated blocks" << dendl;
- ++errors;
- }
- }
- fm->enumerate_reset();
- size_t count = used_blocks.count();
- if (used_blocks.size() == count + 1) {
- // this due to http://tracker.ceph.com/issues/21089
- bufferlist fm_bpb_bl, fm_blocks_bl, fm_bpk_bl;
- db->get(PREFIX_ALLOC, "bytes_per_block", &fm_bpb_bl);
- db->get(PREFIX_ALLOC, "blocks", &fm_blocks_bl);
- db->get(PREFIX_ALLOC, "blocks_per_key", &fm_bpk_bl);
- uint64_t fm_blocks = 0;
- uint64_t fm_bsize = 1;
- uint64_t fm_blocks_per_key = 1;
- try {
- auto p = fm_blocks_bl.begin();
- ::decode(fm_blocks, p);
- auto q = fm_bpb_bl.begin();
- ::decode(fm_bsize, q);
- auto r = fm_bpk_bl.begin();
- ::decode(fm_blocks_per_key, r);
- } catch (buffer::error& e) {
- }
- uint64_t dev_bsize = bdev->get_block_size();
- uint64_t bad_size = bdev->get_size() & ~fm_bsize;
- if (used_blocks.test(bad_size / dev_bsize) == 0) {
- // this is the last block of the device that we previously
- // (incorrectly) truncated off of the effective device size. this
- // prevented BitmapFreelistManager from marking it as used along with
- // the other "past-eof" blocks in the last key slot. mark it used
- // now.
- derr << __func__ << " warning: fixing leaked block 0x" << std::hex
- << bad_size << "~" << fm_bsize << std::dec << " due to old bug"
- << dendl;
- KeyValueDB::Transaction t = db->get_transaction();
- // fix freelistmanager metadata (the internal 'blocks' count is
- // rounded up to include the trailing key, past eof)
- uint64_t new_blocks = bdev->get_size() / fm_bsize;
- if (new_blocks / fm_blocks_per_key * fm_blocks_per_key != new_blocks) {
- new_blocks = (new_blocks / fm_blocks_per_key + 1) *
- fm_blocks_per_key;
- }
- if (new_blocks != fm_blocks) {
- // the fm block count increased
- derr << __func__ << " freelist block and key count changed, fixing 0x"
- << std::hex << bdev->get_size() << "~"
- << ((new_blocks * fm_bsize) - bdev->get_size()) << std::dec
- << dendl;
- bufferlist bl;
- ::encode(new_blocks, bl);
- t->set(PREFIX_ALLOC, "blocks", bl);
- fm->allocate(bdev->get_size(),
- (new_blocks * fm_bsize) - bdev->get_size(),
- t);
+ if (offset == SUPER_RESERVED &&
+ length == min_alloc_size - SUPER_RESERVED) {
+ // this is due to the change just after luminous to min_alloc_size
+ // granularity allocations, and our baked in assumption at the top
+ // of _fsck that 0~ROUND_UP_TO(SUPER_RESERVED,min_alloc_size) is used
+ // (vs luminous's ROUND_UP_TO(SUPER_RESERVED,block_size)). harmless,
+ // since we will never allocate this region below min_alloc_size.
+ dout(10) << __func__ << " ignoring free extent between SUPER_RESERVED"
+ << " and min_alloc_size, 0x" << std::hex << offset << "~"
+ << length << dendl;
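+ // (worked example, assuming SUPER_RESERVED = 8192 and a 64 KiB
+ // min_alloc_size: the freelist legitimately reports 0x2000~0xe000 as
+ // free, but we already marked 0x0~0x10000 used above, so skip it)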
} else {
- // block count is the same, but size changed; fix just the size
- derr << __func__ << " fixing just the stray block at 0x"
- << std::hex << bad_size << "~" << fm_bsize << std::dec << dendl;
- fm->allocate(bad_size, fm_bsize, t);
+ derr << "fsck error: free extent 0x" << std::hex << offset
+ << "~" << length << std::dec
+ << " intersects allocated blocks" << dendl;
+ ++errors;
}
- bufferlist sizebl;
- ::encode(bdev->get_size(), sizebl);
- t->set(PREFIX_ALLOC, "size", sizebl);
- int r = db->submit_transaction_sync(t);
- assert(r == 0);
-
- used_blocks.set(bad_size / dev_bsize);
- ++count;
}
}
+ fm->enumerate_reset();
+ size_t count = used_blocks.count();
if (used_blocks.size() != count) {
assert(used_blocks.size() > count);
++errors;
while (true) {
size_t next = used_blocks.find_next(cur);
if (next != cur + 1) {
- derr << __func__ << " error: leaked extent 0x" << std::hex
- << ((uint64_t)start * block_size) << "~"
- << ((cur + 1 - start) * block_size) << std::dec
+ derr << "fsck error: leaked extent 0x" << std::hex
+ << ((uint64_t)start * min_alloc_size) << "~"
+ << ((cur + 1 - start) * min_alloc_size) << std::dec
<< dendl;
start = next;
break;
<< dendl;
utime_t duration = ceph_clock_now() - start;
- dout(1) << __func__ << " finish with " << errors << " errors in "
+ dout(1) << __func__ << " finish with " << errors << " errors, " << repaired
+ << " repaired, " << (errors - repaired) << " remaining in "
<< duration << " seconds" << dendl;
- return errors;
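+ // repair() thus reports only the errors it could not fix; plain fsck
+ // (repaired == 0) still reports the full count.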
+ return errors - repaired;
}
void BlueStore::collect_metadata(map<string,string> *pm)
bdev->aio_submit(&b->ioc);
}
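+// Named replacement for the anonymous FunctionContext lambda previously
+// queued in _deferred_aio_finish(); it carries an explicit BlueStore
+// pointer, so the queued callback is self-contained.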
+struct C_DeferredTrySubmit : public Context {
+ BlueStore *store;
+ C_DeferredTrySubmit(BlueStore *s) : store(s) {}
+ void finish(int r) {
+ store->deferred_try_submit();
+ }
+};
+
void BlueStore::_deferred_aio_finish(OpSequencer *osr)
{
dout(10) << __func__ << " osr " << osr << dendl;
deferred_queue.erase(q);
} else if (deferred_aggressive) {
dout(20) << __func__ << " queuing async deferred_try_submit" << dendl;
- deferred_finisher.queue(new FunctionContext([&](int) {
- deferred_try_submit();
- }));
+ deferred_finisher.queue(new C_DeferredTrySubmit(this));
} else {
dout(20) << __func__ << " leaving queued, more pending" << dendl;
}
<< dendl;
++deferred_aggressive;
deferred_try_submit();
+ {
+ // wake up any previously finished deferred events
+ std::lock_guard<std::mutex> l(kv_lock);
+ kv_cond.notify_one();
+ }
throttle_deferred_bytes.get(txc->cost);
--deferred_aggressive;
}
dout(20) << __func__ << " txc " << txc
<< " " << wctx->writes.size() << " blobs"
<< dendl;
-
- uint64_t need = 0;
- auto max_bsize = MAX(wctx->target_blob_size, min_alloc_size);
- for (auto &wi : wctx->writes) {
- need += wi.blob_length;
- }
- int r = alloc->reserve(need);
- if (r < 0) {
- derr << __func__ << " failed to reserve 0x" << std::hex << need << std::dec
- << dendl;
- return r;
+ if (wctx->writes.empty()) {
+ return 0;
}
- uint64_t hint = 0;
CompressorRef c;
double crr = 0;
if (wctx->compress) {
cct->_conf->bluestore_compression_required_ratio,
[&]() {
double val;
- if(coll->pool_opts.get(pool_opts_t::COMPRESSION_REQUIRED_RATIO, &val)) {
+ if (coll->pool_opts.get(pool_opts_t::COMPRESSION_REQUIRED_RATIO, &val)) {
return boost::optional<double>(val);
}
return boost::optional<double>();
csum,
[&]() {
int val;
- if(coll->pool_opts.get(pool_opts_t::CSUM_TYPE, &val)) {
+ if (coll->pool_opts.get(pool_opts_t::CSUM_TYPE, &val)) {
return boost::optional<int>(val);
}
return boost::optional<int>();
}
);
+ // compress (as needed) and calc needed space
+ uint64_t need = 0;
+ auto max_bsize = MAX(wctx->target_blob_size, min_alloc_size);
for (auto& wi : wctx->writes) {
- BlobRef b = wi.b;
- bluestore_blob_t& dblob = b->dirty_blob();
- uint64_t b_off = wi.b_off;
- bufferlist *l = &wi.bl;
- uint64_t final_length = wi.blob_length;
- uint64_t csum_length = wi.blob_length;
- unsigned csum_order = block_size_order;
- bufferlist compressed_bl;
- bool compressed = false;
- if(c && wi.blob_length > min_alloc_size) {
-
+ if (c && wi.blob_length > min_alloc_size) {
utime_t start = ceph_clock_now();
// compress
- assert(b_off == 0);
- assert(wi.blob_length == l->length());
- bluestore_compression_header_t chdr;
- chdr.type = c->get_type();
+ assert(wi.b_off == 0);
+ assert(wi.blob_length == wi.bl.length());
+
// FIXME: memory alignment here is bad
bufferlist t;
-
- r = c->compress(*l, t);
+ int r = c->compress(wi.bl, t);
assert(r == 0);
+ bluestore_compression_header_t chdr;
+ chdr.type = c->get_type();
chdr.length = t.length();
- ::encode(chdr, compressed_bl);
- compressed_bl.claim_append(t);
- uint64_t rawlen = compressed_bl.length();
- uint64_t newlen = P2ROUNDUP(rawlen, min_alloc_size);
- uint64_t want_len_raw = final_length * crr;
+ ::encode(chdr, wi.compressed_bl);
+ wi.compressed_bl.claim_append(t);
+
+ wi.compressed_len = wi.compressed_bl.length();
+ uint64_t newlen = P2ROUNDUP(wi.compressed_len, min_alloc_size);
+ uint64_t want_len_raw = wi.blob_length * crr;
uint64_t want_len = P2ROUNDUP(want_len_raw, min_alloc_size);
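+ // worked example (illustrative values): blob_length 0x10000, required
+ // ratio crr 0.875, min_alloc_size 0x1000 -> want_len 0xe000; the
+ // compressed copy is kept only if its rounded-up size fits in 0xe000
+ // and is strictly smaller than the raw 0x10000.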
- if (newlen <= want_len && newlen < final_length) {
- // Cool. We compressed at least as much as we were hoping to.
- // pad out to min_alloc_size
- compressed_bl.append_zero(newlen - rawlen);
- logger->inc(l_bluestore_write_pad_bytes, newlen - rawlen);
+ if (newlen <= want_len && newlen < wi.blob_length) {
+ // Cool. We compressed at least as much as we were hoping to.
+ // pad out to min_alloc_size
+ wi.compressed_bl.append_zero(newlen - wi.compressed_len);
+ logger->inc(l_bluestore_write_pad_bytes, newlen - wi.compressed_len);
dout(20) << __func__ << std::hex << " compressed 0x" << wi.blob_length
- << " -> 0x" << rawlen << " => 0x" << newlen
+ << " -> 0x" << wi.compressed_len << " => 0x" << newlen
<< " with " << c->get_type()
<< std::dec << dendl;
- txc->statfs_delta.compressed() += rawlen;
- txc->statfs_delta.compressed_original() += l->length();
+ txc->statfs_delta.compressed() += wi.compressed_len;
+ txc->statfs_delta.compressed_original() += wi.blob_length;
txc->statfs_delta.compressed_allocated() += newlen;
- l = &compressed_bl;
- final_length = newlen;
- csum_length = newlen;
- csum_order = ctz(newlen);
- dblob.set_compressed(wi.blob_length, rawlen);
- compressed = true;
- logger->inc(l_bluestore_compress_success_count);
+ logger->inc(l_bluestore_compress_success_count);
+ wi.compressed = true;
+ need += newlen;
} else {
- dout(20) << __func__ << std::hex << " 0x" << l->length()
- << " compressed to 0x" << rawlen << " -> 0x" << newlen
- << " with " << c->get_type()
- << ", which is more than required 0x" << want_len_raw
+ dout(20) << __func__ << std::hex << " 0x" << wi.blob_length
+ << " compressed to 0x" << wi.compressed_len << " -> 0x" << newlen
+ << " with " << c->get_type()
+ << ", which is more than required 0x" << want_len_raw
<< " -> 0x" << want_len
- << ", leaving uncompressed"
- << std::dec << dendl;
- logger->inc(l_bluestore_compress_rejected_count);
+ << ", leaving uncompressed"
+ << std::dec << dendl;
+ logger->inc(l_bluestore_compress_rejected_count);
+ need += wi.blob_length;
}
logger->tinc(l_bluestore_compress_lat,
ceph_clock_now() - start);
+ } else {
+ need += wi.blob_length;
}
- if (!compressed && wi.new_blob) {
+ }
+ int r = alloc->reserve(need);
+ if (r < 0) {
+ derr << __func__ << " failed to reserve 0x" << std::hex << need << std::dec
+ << dendl;
+ return r;
+ }
+ AllocExtentVector prealloc;
+ prealloc.reserve(2 * wctx->writes.size());
+ int prealloc_left = 0;
+ prealloc_left = alloc->allocate(
+ need, min_alloc_size, need,
+ 0, &prealloc);
+ assert(prealloc_left == (int64_t)need);
+ dout(20) << __func__ << " prealloc " << prealloc << dendl;
+ auto prealloc_pos = prealloc.begin();
+
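+ // Each blob below carves its extents out of this single up-front
+ // allocation rather than calling alloc->allocate() per blob; the loop
+ // splits a preallocated extent when a blob needs only part of it.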
+ for (auto& wi : wctx->writes) {
+ BlobRef b = wi.b;
+ bluestore_blob_t& dblob = b->dirty_blob();
+ uint64_t b_off = wi.b_off;
+ bufferlist *l = &wi.bl;
+ uint64_t final_length = wi.blob_length;
+ uint64_t csum_length = wi.blob_length;
+ unsigned csum_order = block_size_order;
+ if (wi.compressed) {
+ final_length = wi.compressed_bl.length();
+ csum_length = final_length;
+ csum_order = ctz(csum_length);
+ l = &wi.compressed_bl;
+ dblob.set_compressed(wi.blob_length, wi.compressed_len);
+ } else if (wi.new_blob) {
// initialize newly created blob only
assert(dblob.is_mutable());
if (l->length() != wi.blob_length) {
}
AllocExtentVector extents;
- extents.reserve(4); // 4 should be (more than) enough for most allocations
- int64_t got = alloc->allocate(final_length, min_alloc_size,
- max_alloc_size.load(),
- hint, &extents);
- assert(got == (int64_t)final_length);
- need -= got;
- txc->statfs_delta.allocated() += got;
+ int64_t left = final_length;
+ while (left > 0) {
+ assert(prealloc_left > 0);
+ if (prealloc_pos->length <= left) {
+ prealloc_left -= prealloc_pos->length;
+ left -= prealloc_pos->length;
+ txc->statfs_delta.allocated() += prealloc_pos->length;
+ extents.push_back(*prealloc_pos);
+ ++prealloc_pos;
+ } else {
+ extents.emplace_back(prealloc_pos->offset, left);
+ prealloc_pos->offset += left;
+ prealloc_pos->length -= left;
+ prealloc_left -= left;
+ txc->statfs_delta.allocated() += left;
+ left = 0;
+ break;
+ }
+ }
for (auto& p : extents) {
- bluestore_pextent_t e = bluestore_pextent_t(p);
- txc->allocated.insert(e.offset, e.length);
- hint = p.end();
+ txc->allocated.insert(p.offset, p.length);
}
dblob.allocated(P2ALIGN(b_off, min_alloc_size), final_length, extents);
}
}
}
- if (need > 0) {
- alloc->unreserve(need);
- }
+ assert(prealloc_pos == prealloc.end());
+ assert(prealloc_left == 0);
return 0;
}
if (b.is_shared() &&
sb->loaded &&
maybe_unshared_blobs.count(sb)) {
- b.map(e.blob_offset, e.length, [&](uint64_t off, uint64_t len) {
- expect[sb].get(off, len);
- return 0;
- });
+ if (b.is_compressed()) {
+ expect[sb].get(0, b.get_ondisk_length());
+ } else {
+ b.map(e.blob_offset, e.length, [&](uint64_t off, uint64_t len) {
+ expect[sb].get(off, len);
+ return 0;
+ });
+ }
}
}
<< " " << name << " (" << val.length() << " bytes)"
<< dendl;
int r = 0;
- if (val.is_partial())
- o->onode.attrs[name.c_str()] = bufferptr(val.c_str(), val.length());
- else
- o->onode.attrs[name.c_str()] = val;
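+ // either branch parks the xattr buffer in the bluestore cache mempool
+ // so its memory is accounted alongside the rest of the cache: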
+ if (val.is_partial()) {
+ auto& b = o->onode.attrs[name.c_str()] = bufferptr(val.c_str(),
+ val.length());
+ b.reassign_to_mempool(mempool::mempool_bluestore_cache_other);
+ } else {
+ auto& b = o->onode.attrs[name.c_str()] = val;
+ b.reassign_to_mempool(mempool::mempool_bluestore_cache_other);
+ }
txc->write_onode(o);
dout(10) << __func__ << " " << c->cid << " " << o->oid
<< " " << name << " (" << val.length() << " bytes)"
int r = 0;
for (map<string,bufferptr>::const_iterator p = aset.begin();
p != aset.end(); ++p) {
- if (p->second.is_partial())
- o->onode.attrs[p->first.c_str()] =
+ if (p->second.is_partial()) {
+ auto& b = o->onode.attrs[p->first.c_str()] =
bufferptr(p->second.c_str(), p->second.length());
- else
- o->onode.attrs[p->first.c_str()] = p->second;
+ b.reassign_to_mempool(mempool::mempool_bluestore_cache_other);
+ } else {
+ auto& b = o->onode.attrs[p->first.c_str()] = p->second;
+ b.reassign_to_mempool(mempool::mempool_bluestore_cache_other);
+ }
}
txc->write_onode(o);
dout(10) << __func__ << " " << c->cid << " " << o->oid
assert(i->empty());
}
for (auto& p : coll_map) {
+ if (!p.second->onode_map.empty()) {
+ derr << __func__ << "stray onodes on " << p.first << dendl;
+ p.second->onode_map.dump(cct, 0);
+ }
+ if (!p.second->shared_blob_set.empty()) {
+ derr << __func__ << " stray shared blobs on " << p.first << dendl;
+ p.second->shared_blob_set.dump(cct, 0);
+ }
assert(p.second->onode_map.empty());
assert(p.second->shared_blob_set.empty());
}
sb->coll = coll;
}
- bool remove(SharedBlob *sb) {
+ bool try_remove(SharedBlob *sb) {
std::lock_guard<std::mutex> l(lock);
if (sb->nref == 0) {
assert(sb->get_parent() == this);
return false;
}
+ void remove(SharedBlob *sb) {
+ std::lock_guard<std::mutex> l(lock);
+ assert(sb->get_parent() == this);
+ sb_map.erase(sb->get_sbid());
+ }
+
bool empty() {
std::lock_guard<std::mutex> l(lock);
return sb_map.empty();
}
+
+ void dump(CephContext *cct, int lvl);
};
//#define CACHE_BLOB_BL // not sure if this is a win yet or not... :/
void clear();
bool empty();
+ void dump(CephContext *cct, int lvl);
+
/// return true if f true for any item
bool map_any(std::function<bool(OnodeRef)> f);
};
KVSyncThread kv_sync_thread;
std::mutex kv_lock;
std::condition_variable kv_cond;
+ bool _kv_only = false;
bool kv_sync_started = false;
bool kv_stop = false;
bool kv_finalize_started = false;
int _setup_block_symlink_or_file(string name, string path, uint64_t size,
bool create);
- int _write_bdev_label(string path, bluestore_bdev_label_t label);
public:
+ static int _write_bdev_label(CephContext* cct,
+ string path, bluestore_bdev_label_t label);
static int _read_bdev_label(CephContext* cct, string path,
bluestore_bdev_label_t *label);
private:
bluestore_deferred_op_t *_get_deferred_op(TransContext *txc, OnodeRef o);
void _deferred_queue(TransContext *txc);
+public:
void deferred_try_submit();
+private:
void _deferred_submit_unlock(OpSequencer *osr);
void _deferred_aio_finish(OpSequencer *osr);
int _deferred_replay();
return 0;
}
- int fsck(bool deep) override;
+ int write_meta(const std::string& key, const std::string& value) override;
+ int read_meta(const std::string& key, std::string *value) override;
+
+ int fsck(bool deep) override {
+ return _fsck(deep, false);
+ }
+ int repair(bool deep) override {
+ return _fsck(deep, true);
+ }
+ int _fsck(bool deep, bool repair);
void set_cache_shards(unsigned num) override;
bool mark_unused;
bool new_blob; ///< whether new blob was created
+ bool compressed = false;
+ bufferlist compressed_bl;
+ size_t compressed_len = 0;
+
write_item(
uint64_t logical_offs,
BlobRef b,
static void setup_merge_operators(KeyValueDB *db);
- virtual int create(uint64_t size, KeyValueDB::Transaction txn) = 0;
+ virtual int create(uint64_t size, uint64_t min_alloc_size,
+ KeyValueDB::Transaction txn) = 0;
- virtual int init() = 0;
+ virtual int init(uint64_t dev_size) = 0;
virtual void shutdown() = 0;
virtual void dump() = 0;
} else {
size = st.st_size;
}
+ if (cct->_conf->get_val<bool>("bdev_inject_bad_size")) {
+ derr << "injecting bad size; actual 0x" << std::hex << size
+ << " but using 0x" << (size & ~block_size) << std::dec << dendl;
+ size &= ~(block_size);
+ }
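+ // (presumably a test hook: clearing the block_size bit understates the
+ // device size by one block when that bit is set, recreating the stale
+ // size condition that fsck/repair must now tolerate)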
{
char partition[PATH_MAX], devname[PATH_MAX];
#include "common/ceph_argparse.h"
#include "include/stringify.h"
#include "common/errno.h"
+#include "common/safe_io.h"
#include "os/bluestore/BlueFS.h"
#include "os/bluestore/BlueStore.h"
}
}
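+// Shared helper for the bluefs-* actions below: reads each device label to
+// pick its BlueFS slot ("bluefs db" -> BDEV_DB, "bluefs wal" -> BDEV_WAL;
+// the main device takes BDEV_DB unless a dedicated db device exists, in
+// which case it becomes BDEV_SLOW), then mounts the result.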
+BlueFS *open_bluefs(
+ CephContext *cct,
+ const string& path,
+ const vector<string>& devs)
+{
+ validate_path(cct, path, true);
+ BlueFS *fs = new BlueFS(cct);
+
+ string main;
+ set<int> got;
+ for (auto& i : devs) {
+ bluestore_bdev_label_t label;
+ int r = BlueStore::_read_bdev_label(cct, i, &label);
+ if (r < 0) {
+ cerr << "unable to read label for " << i << ": "
+ << cpp_strerror(r) << std::endl;
+ exit(EXIT_FAILURE);
+ }
+ int id = -1;
+ if (label.description == "main")
+ main = i;
+ else if (label.description == "bluefs db")
+ id = BlueFS::BDEV_DB;
+ else if (label.description == "bluefs wal")
+ id = BlueFS::BDEV_WAL;
+ if (id >= 0) {
+ got.insert(id);
+ cout << " slot " << id << " " << i << std::endl;
+ int r = fs->add_block_device(id, i);
+ if (r < 0) {
+ cerr << "unable to open " << i << ": " << cpp_strerror(r) << std::endl;
+ exit(EXIT_FAILURE);
+ }
+ }
+ }
+ if (main.length()) {
+ int id = BlueFS::BDEV_DB;
+ if (got.count(BlueFS::BDEV_DB))
+ id = BlueFS::BDEV_SLOW;
+ cout << " slot " << id << " " << main << std::endl;
+ int r = fs->add_block_device(id, main);
+ if (r < 0) {
+ cerr << "unable to open " << main << ": " << cpp_strerror(r)
+ << std::endl;
+ exit(EXIT_FAILURE);
+ }
+ }
+
+ int r = fs->mount();
+ if (r < 0) {
+ cerr << "unable to mount bluefs: " << cpp_strerror(r)
+ << std::endl;
+ exit(EXIT_FAILURE);
+ }
+ return fs;
+}
+
int main(int argc, char **argv)
{
string out_dir;
vector<string> devs;
string path;
string action;
+ string log_file;
+ string key, value;
+ int log_level = 30;
bool fsck_deep = false;
po::options_description po_options("Options");
po_options.add_options()
("help,h", "produce help message")
("path", po::value<string>(&path), "bluestore path")
("out-dir", po::value<string>(&out_dir), "output directory")
+ ("log-file,l", po::value<string>(&log_file), "log file")
+ ("log-level", po::value<int>(&log_level), "log level (30=most, 20=lots, 10=some, 1=little)")
("dev", po::value<vector<string>>(&devs), "device(s)")
("deep", po::value<bool>(&fsck_deep), "deep fsck (read all data)")
+ ("key,k", po::value<string>(&key), "label metadata key name")
+ ("value,v", po::value<string>(&value), "label metadata value")
;
po::options_description po_positional("Positional options");
po_positional.add_options()
- ("command", po::value<string>(&action), "fsck, bluefs-export, show-label")
+ ("command", po::value<string>(&action), "fsck, repair, bluefs-export, bluefs-bdev-sizes, bluefs-bdev-expand, show-label, set-label-key, rm-label-key, prime-osd-dir")
;
po::options_description po_all("All options");
po_all.add(po_options).add(po_positional);
exit(EXIT_FAILURE);
}
- if (action == "fsck") {
+ if (action == "fsck" || action == "repair") {
if (path.empty()) {
cerr << "must specify bluestore path" << std::endl;
exit(EXIT_FAILURE);
}
}
+ if (action == "prime-osd-dir") {
+ if (devs.size() != 1) {
+ cerr << "must specify the main bluestore device" << std::endl;
+ exit(EXIT_FAILURE);
+ }
+ if (path.empty()) {
+ cerr << "must specify osd dir to prime" << std::endl;
+ exit(EXIT_FAILURE);
+ }
+ }
+ if (action == "set-label-key" ||
+ action == "rm-label-key") {
+ if (devs.size() != 1) {
+ cerr << "must specify the main bluestore device" << std::endl;
+ exit(EXIT_FAILURE);
+ }
+ if (key.size() == 0) {
+ cerr << "must specify a key name with -k" << std::endl;
+ exit(EXIT_FAILURE);
+ }
+ if (action == "set-label-key" && value.size() == 0) {
+ cerr << "must specify a value with -v" << std::endl;
+ exit(EXIT_FAILURE);
+ }
+ }
if (action == "show-label") {
if (devs.empty() && path.empty()) {
cerr << "must specify bluestore path *or* raw device(s)" << std::endl;
exit(EXIT_FAILURE);
}
- cout << "infering bluefs devices from bluestore path" << std::endl;
- for (auto fn : {"block", "block.wal", "block.db"}) {
- string p = path + "/" + fn;
- struct stat st;
- if (::stat(p.c_str(), &st) == 0) {
- devs.push_back(p);
+ if (devs.empty()) {
+ cout << "infering bluefs devices from bluestore path" << std::endl;
+ for (auto fn : {"block", "block.wal", "block.db"}) {
+ string p = path + "/" + fn;
+ struct stat st;
+ if (::stat(p.c_str(), &st) == 0) {
+ devs.push_back(p);
+ }
}
}
}
}
}
}
+ if (action == "bluefs-bdev-sizes" || action == "bluefs-bdev-expand") {
+ if (path.empty()) {
+ cerr << "must specify bluestore path" << std::endl;
+ exit(EXIT_FAILURE);
+ }
+ cout << "infering bluefs devices from bluestore path" << std::endl;
+ for (auto fn : {"block", "block.wal", "block.db"}) {
+ string p = path + "/" + fn;
+ struct stat st;
+ if (::stat(p.c_str(), &st) == 0) {
+ devs.push_back(p);
+ }
+ }
+ }
vector<const char*> args;
+ if (log_file.size()) {
+ args.push_back("--log-file");
+ args.push_back(log_file.c_str());
+ static char ll[10];
+ snprintf(ll, sizeof(ll), "%d", log_level);
+ args.push_back("--debug-bluestore");
+ args.push_back(ll);
+ args.push_back("--debug-bluefs");
+ args.push_back(ll);
+ }
+ args.push_back("--no-log-to-stderr");
+ args.push_back("--err-to-stderr");
+
for (auto& i : ceph_option_strings) {
args.push_back(i.c_str());
}
CODE_ENVIRONMENT_UTILITY, 0);
common_init_finish(cct.get());
- cout << "action " << action << std::endl;
-
if (action == "fsck" ||
- action == "fsck-deep") {
+ action == "repair") {
validate_path(cct.get(), path, false);
BlueStore bluestore(cct.get(), path);
- int r = bluestore.fsck(fsck_deep);
+ int r;
+ if (action == "fsck") {
+ r = bluestore.fsck(fsck_deep);
+ } else {
+ r = bluestore.repair(fsck_deep);
+ }
if (r < 0) {
cerr << "error from fsck: " << cpp_strerror(r) << std::endl;
exit(EXIT_FAILURE);
}
+ cout << action << " success" << std::endl;
+ }
+ else if (action == "prime-osd-dir") {
+ bluestore_bdev_label_t label;
+ int r = BlueStore::_read_bdev_label(cct.get(), devs.front(), &label);
+ if (r < 0) {
+ cerr << "failed to read label for " << devs.front() << ": "
+ << cpp_strerror(r) << std::endl;
+ exit(EXIT_FAILURE);
+ }
+
+ // kludge some things into the map that we want to populate into
+ // target dir
+ label.meta["path_block"] = devs.front();
+ label.meta["type"] = "bluestore";
+ label.meta["fsid"] = stringify(label.osd_uuid);
+
+ for (auto kk : {
+ "whoami",
+ "osd_key",
+ "path_block", "path_block.db", "path_block.wal",
+ "ceph_fsid",
+ "fsid",
+ "type",
+ "ready" }) {
+ string k = kk;
+ auto i = label.meta.find(k);
+ if (i == label.meta.end()) {
+ continue;
+ }
+ string p = path + "/" + k;
+ string v = i->second;
+ if (k == "osd_key") {
+ p = path + "/keyring";
+ v = "[osd.";
+ v += label.meta["whoami"];
+ v += "]\nkey = " + i->second;
+ }
+ if (k.find("path_") == 0) {
+ p = path + "/" + k.substr(5);
+ int r = ::symlink(v.c_str(), p.c_str());
+ if (r < 0 && errno == EEXIST) {
+ struct stat st;
+ r = ::stat(p.c_str(), &st);
+ if (r == 0 && S_ISLNK(st.st_mode)) {
+ char target[PATH_MAX];
+ r = ::readlink(p.c_str(), target, sizeof(target) - 1);
+ if (r > 0) {
+ target[r] = 0; // readlink() does not NUL-terminate
+ if (v == target) {
+ r = 0; // already matches our target
+ } else {
+ ::unlink(p.c_str());
+ r = ::symlink(v.c_str(), p.c_str());
+ }
+ } else {
+ cerr << "error reading existing link at " << p << ": " << cpp_strerror(errno)
+ << std::endl;
+ }
+ }
+ }
+ if (r < 0) {
+ cerr << "error symlinking " << p << ": " << cpp_strerror(errno)
+ << std::endl;
+ exit(EXIT_FAILURE);
+ }
+ } else {
+ v += "\n";
+ int fd = ::open(p.c_str(), O_CREAT|O_TRUNC|O_WRONLY, 0600);
+ if (fd < 0) {
+ cerr << "error writing " << p << ": " << cpp_strerror(errno)
+ << std::endl;
+ exit(EXIT_FAILURE);
+ }
+ int r = safe_write(fd, v.c_str(), v.size());
+ if (r < 0) {
+ cerr << "error writing to " << p << ": " << cpp_strerror(errno)
+ << std::endl;
+ exit(EXIT_FAILURE);
+ }
+ ::close(fd);
+ }
+ }
}
else if (action == "show-label") {
JSONFormatter jf(true);
- jf.open_array_section("devices");
+ jf.open_object_section("devices");
for (auto& i : devs) {
bluestore_bdev_label_t label;
int r = BlueStore::_read_bdev_label(cct.get(), i, &label);
jf.close_section();
jf.flush(cout);
}
- else if (action == "bluefs-export") {
- validate_path(cct.get(), path, true);
- BlueFS fs(&(*cct));
- string main;
- set<int> got;
- for (auto& i : devs) {
- bluestore_bdev_label_t label;
- int r = BlueStore::_read_bdev_label(cct.get(), i, &label);
- if (r < 0) {
- cerr << "unable to read label for " << i << ": "
- << cpp_strerror(r) << std::endl;
- exit(EXIT_FAILURE);
- }
- int id = -1;
- if (label.description == "main")
- main = i;
- else if (label.description == "bluefs db")
- id = BlueFS::BDEV_DB;
- else if (label.description == "bluefs wal")
- id = BlueFS::BDEV_WAL;
- if (id >= 0) {
- got.insert(id);
- cout << " slot " << id << " " << i << std::endl;
- int r = fs.add_block_device(id, i);
- if (r < 0) {
- cerr << "unable to open " << i << ": " << cpp_strerror(r) << std::endl;
- exit(EXIT_FAILURE);
- }
- }
+ else if (action == "set-label-key") {
+ bluestore_bdev_label_t label;
+ int r = BlueStore::_read_bdev_label(cct.get(), devs.front(), &label);
+ if (r < 0) {
+ cerr << "unable to read label for " << devs.front() << ": "
+ << cpp_strerror(r) << std::endl;
+ exit(EXIT_FAILURE);
}
- if (main.length()) {
- int id = BlueFS::BDEV_DB;
- if (got.count(BlueFS::BDEV_DB))
- id = BlueFS::BDEV_SLOW;
- cout << " slot " << id << " " << main << std::endl;
- int r = fs.add_block_device(id, main);
- if (r < 0) {
- cerr << "unable to open " << main << ": " << cpp_strerror(r)
- << std::endl;
- exit(EXIT_FAILURE);
- }
+ label.meta[key] = value;
+ r = BlueStore::_write_bdev_label(cct.get(), devs.front(), label);
+ if (r < 0) {
+ cerr << "unable to write label for " << devs.front() << ": "
+ << cpp_strerror(r) << std::endl;
+ exit(EXIT_FAILURE);
}
-
- int r = fs.mount();
+ }
+ else if (action == "rm-label-key") {
+ bluestore_bdev_label_t label;
+ int r = BlueStore::_read_bdev_label(cct.get(), devs.front(), &label);
if (r < 0) {
- cerr << "unable to mount bluefs: " << cpp_strerror(r)
- << std::endl;
+ cerr << "unable to read label for " << devs.front() << ": "
+ << cpp_strerror(r) << std::endl;
+ exit(EXIT_FAILURE);
+ }
+ if (!label.meta.count(key)) {
+ cerr << "key '" << key << "' not present" << std::endl;
+ exit(EXIT_FAILURE);
+ }
+ label.meta.erase(key);
+ r = BlueStore::_write_bdev_label(cct.get(), devs.front(), label);
+ if (r < 0) {
+ cerr << "unable to write label for " << devs.front() << ": "
+ << cpp_strerror(r) << std::endl;
exit(EXIT_FAILURE);
}
+ }
+ else if (action == "bluefs-bdev-sizes") {
+ BlueFS *fs = open_bluefs(cct.get(), path, devs);
+ fs->dump_block_extents(cout);
+ delete fs;
+ }
+ else if (action == "bluefs-bdev-expand") {
+ BlueFS *fs = open_bluefs(cct.get(), path, devs);
+ cout << "start:" << std::endl;
+ fs->dump_block_extents(cout);
+ for (int devid : { BlueFS::BDEV_WAL, BlueFS::BDEV_DB }) {
+ interval_set<uint64_t> before;
+ fs->get_block_extents(devid, &before);
+ uint64_t end = before.range_end();
+ uint64_t size = fs->get_block_device_size(devid);
+ if (end < size) {
+ cout << "expanding dev " << devid << " from 0x" << std::hex
+ << end << " to 0x" << size << std::dec << std::endl;
+ fs->add_block_extent(devid, end, size-end);
+ }
+ }
+ delete fs;
+ }
+ else if (action == "bluefs-export") {
+ BlueFS *fs = open_bluefs(cct.get(), path, devs);
vector<string> dirs;
- r = fs.readdir("", &dirs);
+ int r = fs->readdir("", &dirs);
if (r < 0) {
cerr << "readdir in root failed: " << cpp_strerror(r) << std::endl;
exit(EXIT_FAILURE);
continue;
cout << dir << "/" << std::endl;
vector<string> ls;
- r = fs.readdir(dir, &ls);
+ r = fs->readdir(dir, &ls);
if (r < 0) {
cerr << "readdir " << dir << " failed: " << cpp_strerror(r) << std::endl;
exit(EXIT_FAILURE);
cout << dir << "/" << file << std::endl;
uint64_t size;
utime_t mtime;
- r = fs.stat(dir, file, &size, &mtime);
+ r = fs->stat(dir, file, &size, &mtime);
if (r < 0) {
cerr << "stat " << file << " failed: " << cpp_strerror(r) << std::endl;
exit(EXIT_FAILURE);
assert(fd >= 0);
if (size > 0) {
BlueFS::FileReader *h;
- r = fs.open_for_read(dir, file, &h, false);
+ r = fs->open_for_read(dir, file, &h, false);
if (r < 0) {
cerr << "open_for_read " << dir << "/" << file << " failed: "
<< cpp_strerror(r) << std::endl;
int left = size;
while (left) {
bufferlist bl;
- r = fs.read(h, &h->buf, pos, left, &bl, NULL);
+ r = fs->read(h, &h->buf, pos, left, &bl, NULL);
if (r <= 0) {
cerr << "read " << dir << "/" << file << " from " << pos
<< " failed: " << cpp_strerror(r) << std::endl;
::close(fd);
}
}
- fs.umount();
+ fs->umount();
+ delete fs;
} else {
cerr << "unrecognized action " << action << std::endl;
return 1;
bl.append("bluestore block device\n");
bl.append(stringify(osd_uuid));
bl.append("\n");
- ENCODE_START(1, 1, bl);
+ ENCODE_START(2, 1, bl);
::encode(osd_uuid, bl);
::encode(size, bl);
::encode(btime, bl);
::encode(description, bl);
+ ::encode(meta, bl);
ENCODE_FINISH(bl);
}
void bluestore_bdev_label_t::decode(bufferlist::iterator& p)
{
p.advance(60); // see above
- DECODE_START(1, p);
+ DECODE_START(2, p);
::decode(osd_uuid, p);
::decode(size, p);
::decode(btime, p);
::decode(description, p);
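+ // v1 labels (written before this release) simply lack the meta map;
+ // decoding them leaves meta empty.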
+ if (struct_v >= 2) {
+ ::decode(meta, p);
+ }
DECODE_FINISH(p);
}
f->dump_unsigned("size", size);
f->dump_stream("btime") << btime;
f->dump_string("description", description);
+ for (auto& i : meta) {
+ f->dump_string(i.first.c_str(), i.second);
+ }
}
void bluestore_bdev_label_t::generate_test_instances(
o.back()->size = 123;
o.back()->btime = utime_t(4, 5);
o.back()->description = "fakey";
+ o.back()->meta["foo"] = "bar";
}
ostream& operator<<(ostream& out, const bluestore_bdev_label_t& l)
{
return out << "bdev(osd_uuid " << l.osd_uuid
- << " size 0x" << std::hex << l.size << std::dec
- << " btime " << l.btime
- << " desc " << l.description << ")";
+ << ", size 0x" << std::hex << l.size << std::dec
+ << ", btime " << l.btime
+ << ", desc " << l.description
+ << ", " << l.meta.size() << " meta"
+ << ")";
}
// cnode_t
utime_t btime; ///< birth time
string description; ///< device description
+ map<string,string> meta; ///< {read,write}_meta() content from ObjectStore
+
void encode(bufferlist& bl) const;
void decode(bufferlist::iterator& p);
void dump(Formatter *f) const;
int len;
denc_varint(len, p);
csum_data = p.get_ptr(len);
+ csum_data.reassign_to_mempool(mempool::mempool_bluestore_cache_other);
}
if (has_unused()) {
denc(unused, p);
csum_chunk_order = order;
csum_data = buffer::create(get_csum_value_size() * len / get_csum_chunk_size());
csum_data.zero();
+ csum_data.reassign_to_mempool(mempool::mempool_bluestore_cache_other);
}
/// calculate csum for the buffer at the given b_off
}
}
-int DBObjectMap::check(std::ostream &out, bool repair)
+int DBObjectMap::check(std::ostream &out, bool repair, bool force)
{
- int errors = 0;
+ int errors = 0, comp_errors = 0;
bool repaired = false;
map<uint64_t, uint64_t> parent_to_num_children;
map<uint64_t, uint64_t> parent_to_actual_num_children;
if (header.seq != 0)
parent_to_actual_num_children[header.seq] = header.num_children;
- // Check complete table
- bool complete_error = false;
- boost::optional<string> prev;
- KeyValueDB::Iterator complete_iter = db->get_iterator(USER_PREFIX + header_key(header.seq) + COMPLETE_PREFIX);
- for (complete_iter->seek_to_first(); complete_iter->valid();
- complete_iter->next()) {
- if (prev && prev >= complete_iter->key()) {
- out << "Bad complete for " << header.oid << std::endl;
- complete_error = true;
- break;
- }
- prev = string(complete_iter->value().c_str(), complete_iter->value().length() - 1);
- }
- if (complete_error) {
- out << "Complete mapping for " << header.seq << " :" << std::endl;
- for (complete_iter->seek_to_first(); complete_iter->valid();
- complete_iter->next()) {
- out << complete_iter->key() << " -> " << string(complete_iter->value().c_str(), complete_iter->value().length() - 1) << std::endl;
- }
- if (repair) {
- repaired = true;
- KeyValueDB::Transaction t = db->get_transaction();
- t->rmkeys_by_prefix(USER_PREFIX + header_key(header.seq) + COMPLETE_PREFIX);
- db->submit_transaction(t);
- out << "Cleared complete mapping to repair" << std::endl;
- } else {
- errors++; // Only count when not repaired
- }
+ if (state.v == 2 || force) {
+ // Check complete table
+ bool complete_error = false;
+ boost::optional<string> prev;
+ KeyValueDB::Iterator complete_iter = db->get_iterator(USER_PREFIX + header_key(header.seq) + COMPLETE_PREFIX);
+ for (complete_iter->seek_to_first(); complete_iter->valid();
+ complete_iter->next()) {
+ if (prev && prev >= complete_iter->key()) {
+ out << "Bad complete for " << header.oid << std::endl;
+ complete_error = true;
+ break;
+ }
+ prev = string(complete_iter->value().c_str(), complete_iter->value().length() - 1);
+ }
+ if (complete_error) {
+ out << "Complete mapping for " << header.seq << " :" << std::endl;
+ for (complete_iter->seek_to_first(); complete_iter->valid();
+ complete_iter->next()) {
+ out << complete_iter->key() << " -> " << string(complete_iter->value().c_str(), complete_iter->value().length() - 1) << std::endl;
+ }
+ if (repair) {
+ repaired = true;
+ KeyValueDB::Transaction t = db->get_transaction();
+ t->rmkeys_by_prefix(USER_PREFIX + header_key(header.seq) + COMPLETE_PREFIX);
+ db->submit_transaction(t);
+ out << "Cleared complete mapping to repair" << std::endl;
+ } else {
+ errors++; // Only count when not repaired
+ comp_errors++; // Track errors here for version update
+ }
+ }
}
if (header.parent == 0)
}
parent_to_actual_num_children.erase(i->first);
}
+
+ // Only advance the version from 2 to 3 here
+ // Mark as legacy because there are still older structures
+ // we don't update. The legacy flag gates both internal assertions
+ // and cleanup of the old complete tables (see clear_header()).
+ if (comp_errors == 0 && state.v == 2 && repair) {
+ state.v = 3;
+ state.legacy = true;
+ set_state();
+ }
+
if (errors == 0 && repaired)
return -1;
return errors;
return db->submit_transaction(t);
}
- assert(state.v < 3);
+ assert(state.legacy);
{
// We only get here for legacy (v2) stores
const ghobject_t &target,
const SequencerPosition *spos)
{
- state.v = 2;
+ state.legacy = true;
if (oid == target)
return 0;
state.v = 2;
+ set_state();
+ return 0;
+}
+
+void DBObjectMap::set_state()
+{
Mutex::Locker l(header_lock);
KeyValueDB::Transaction t = db->get_transaction();
write_state(t);
- db->submit_transaction_sync(t);
+ int ret = db->submit_transaction_sync(t);
+ assert(ret == 0);
dout(1) << __func__ << " done" << dendl;
- return 0;
+ return;
}
-int DBObjectMap::init(bool do_upgrade)
+int DBObjectMap::get_state()
{
map<string, bufferlist> result;
set<string> to_get;
if (!result.empty()) {
bufferlist::iterator bliter = result.begin()->second.begin();
state.decode(bliter);
- if (state.v < 1) {
- dout(1) << "DBObjectMap is *very* old; upgrade to an older version first"
- << dendl;
- return -ENOTSUP;
- }
- if (state.v < 2) { // Needs upgrade
- if (!do_upgrade) {
- dout(1) << "DOBjbectMap requires an upgrade,"
- << " set filestore_update_to"
- << dendl;
- return -ENOTSUP;
- } else {
- r = upgrade_to_v2();
- if (r < 0)
- return r;
- }
- }
} else {
// New store
- // Version 3 means that complete regions never used
- state.v = 3;
+ state.v = State::CUR_VERSION;
state.seq = 1;
+ state.legacy = false;
+ }
+ return 0;
+}
+
+int DBObjectMap::init(bool do_upgrade)
+{
+ int ret = get_state();
+ if (ret < 0)
+ return ret;
+ if (state.v < 1) {
+ dout(1) << "DBObjectMap is *very* old; upgrade to an older version first"
+ << dendl;
+ return -ENOTSUP;
+ }
+ if (state.v < 2) { // Needs upgrade
+ if (!do_upgrade) {
+ dout(1) << "DOBjbectMap requires an upgrade,"
+ << " set filestore_update_to"
+ << dendl;
+ return -ENOTSUP;
+ } else {
+ int r = upgrade_to_v2();
+ if (r < 0)
+ return r;
+ }
}
ostringstream ss;
int errors = check(ss, true);
dout(20) << "clear_header: clearing seq " << header->seq << dendl;
t->rmkeys_by_prefix(user_prefix(header));
t->rmkeys_by_prefix(sys_prefix(header));
- if (state.v < 3)
+ if (state.legacy)
t->rmkeys_by_prefix(complete_prefix(header)); // Needed when header.parent != 0
t->rmkeys_by_prefix(xattr_prefix(header));
set<string> keys;
);
/// Read initial state from backing store
+ int get_state();
+ /// Write current state settings to DB
+ void set_state();
+ /// Read initial state and upgrade or initialize state
int init(bool upgrade = false);
/// Upgrade store to current version
int upgrade_to_v2();
/// Consistency check, debug, there must be no parallel writes
- int check(std::ostream &out, bool repair = false) override;
+ int check(std::ostream &out, bool repair = false, bool force = false) override;
/// Ensure that all previous operations are durable
int sync(const ghobject_t *oid=0, const SequencerPosition *spos=0) override;
/// persistent state for store @see generate_header
struct State {
+ static const __u8 CUR_VERSION = 3;
__u8 v;
uint64_t seq;
- State() : v(0), seq(1) {}
- explicit State(uint64_t seq) : v(0), seq(seq) {}
+ // legacy is false when complete regions were never used
+ bool legacy;
+ State() : v(0), seq(1), legacy(false) {}
+ explicit State(uint64_t seq) : v(0), seq(seq), legacy(false) {}
void encode(bufferlist &bl) const {
- ENCODE_START(2, 1, bl);
+ ENCODE_START(3, 1, bl);
::encode(v, bl);
::encode(seq, bl);
+ ::encode(legacy, bl);
ENCODE_FINISH(bl);
}
void decode(bufferlist::iterator &bl) {
- DECODE_START(2, bl);
+ DECODE_START(3, bl);
if (struct_v >= 2)
::decode(v, bl);
else
v = 0;
::decode(seq, bl);
+ if (struct_v >= 3)
+ ::decode(legacy, bl);
+ else
+ legacy = false;
DECODE_FINISH(bl);
}
void dump(Formatter *f) const {
+ f->dump_unsigned("v", v);
f->dump_unsigned("seq", seq);
+ f->dump_bool("legacy", legacy);
}
static void generate_test_instances(list<State*> &o) {
journaled_seq = seq;
return true;
}
+ } else {
+ derr << "do_read_entry(" << pos << "): " << ss.str() << dendl;
}
if (seq && seq < header.committed_up_to) {
}
}
- dout(25) << ss.str() << dendl;
dout(2) << "No further valid entries found, journal is most likely valid"
<< dendl;
return false;
(*pm)["filestore_f_type"] = ss.str();
if (cct->_conf->filestore_collect_device_partition_information) {
- rc = get_device_by_uuid(get_fsid(), "PARTUUID", partition_path,
- dev_node);
+ rc = get_device_by_fd(fsid_fd, partition_path, dev_node, PATH_MAX);
} else {
rc = -EINVAL;
}
recovery_sleep_lock("OSDService::recovery_sleep_lock"),
recovery_sleep_timer(cct, recovery_sleep_lock, false),
reserver_finisher(cct),
- local_reserver(&reserver_finisher, cct->_conf->osd_max_backfills,
+ local_reserver(cct, &reserver_finisher, cct->_conf->osd_max_backfills,
cct->_conf->osd_min_recovery_priority),
- remote_reserver(&reserver_finisher, cct->_conf->osd_max_backfills,
+ remote_reserver(cct, &reserver_finisher, cct->_conf->osd_max_backfills,
cct->_conf->osd_min_recovery_priority),
pg_temp_lock("OSDService::pg_temp_lock"),
snap_sleep_lock("OSDService::snap_sleep_lock"),
scrub_sleep_lock("OSDService::scrub_sleep_lock"),
scrub_sleep_timer(
osd->client_messenger->cct, scrub_sleep_lock, false /* relax locking */),
- snap_reserver(&reserver_finisher,
+ snap_reserver(cct, &reserver_finisher,
cct->_conf->osd_max_trimming_pgs),
recovery_lock("OSDService::recovery_lock"),
recovery_ops_active(0),
waiter.wait();
}
- ret = write_meta(store, sb.cluster_fsid, sb.osd_fsid, whoami);
+ ret = write_meta(cct, store, sb.cluster_fsid, sb.osd_fsid, whoami);
if (ret) {
derr << "OSD::mkfs: failed to write fsid file: error "
<< cpp_strerror(ret) << dendl;
return ret;
}
-int OSD::write_meta(ObjectStore *store, uuid_d& cluster_fsid, uuid_d& osd_fsid, int whoami)
+int OSD::write_meta(CephContext *cct, ObjectStore *store, uuid_d& cluster_fsid, uuid_d& osd_fsid, int whoami)
{
char val[80];
int r;
if (r < 0)
return r;
+ string key = cct->_conf->get_val<string>("key");
+ lderr(cct) << "key " << key << dendl;
+ if (key.size()) {
+ r = store->write_meta("osd_key", key);
+ if (r < 0)
+ return r;
+ }
+
r = store->write_meta("ready", "ready");
if (r < 0)
return r;
};
+ // All the basic OSD operation stats are to be considered useful
+ osd_plb.set_prio_default(PerfCountersBuilder::PRIO_USEFUL);
+
osd_plb.add_u64(
l_osd_op_wip, "op_wip",
"Replication operations currently being processed (primary)");
l_osd_op_rw_prepare_lat, "op_rw_prepare_latency",
"Latency of read-modify-write operations (excluding queue time and wait for finished)");
+ // Now we move on to some more obscure stats, revert to assuming things
+ // are low priority unless otherwise specified.
+ osd_plb.set_prio_default(PerfCountersBuilder::PRIO_DEBUGONLY);
+
osd_plb.add_time_avg(l_osd_op_before_queue_op_lat, "op_before_queue_op_lat",
"Latency of IO before calling queue(before really queue into ShardedOpWq)"); // client io before queue op_wq latency
osd_plb.add_time_avg(l_osd_op_before_dequeue_op_lat, "op_before_dequeue_op_lat",
l_osd_map_bl_cache_miss, "osd_map_bl_cache_miss",
"OSDMap buffer cache misses");
- osd_plb.add_u64(l_osd_stat_bytes, "stat_bytes", "OSD size");
- osd_plb.add_u64(l_osd_stat_bytes_used, "stat_bytes_used", "Used space");
+ osd_plb.add_u64(
+ l_osd_stat_bytes, "stat_bytes", "OSD size", "size",
+ PerfCountersBuilder::PRIO_USEFUL);
+ osd_plb.add_u64(
+ l_osd_stat_bytes_used, "stat_bytes_used", "Used space", "used",
+ PerfCountersBuilder::PRIO_USEFUL);
osd_plb.add_u64(l_osd_stat_bytes_avail, "stat_bytes_avail", "Available space");
osd_plb.add_u64_counter(
set_state(STATE_STOPPING);
// Debugging
- cct->_conf->set_val("debug_osd", "100");
- cct->_conf->set_val("debug_journal", "100");
- cct->_conf->set_val("debug_filestore", "100");
- cct->_conf->set_val("debug_ms", "100");
- cct->_conf->apply_changes(NULL);
+ if (cct->_conf->get_val<bool>("osd_debug_shutdown")) {
+ cct->_conf->set_val("debug_osd", "100");
+ cct->_conf->set_val("debug_journal", "100");
+ cct->_conf->set_val("debug_filestore", "100");
+ cct->_conf->set_val("debug_bluestore", "100");
+ cct->_conf->set_val("debug_ms", "100");
+ cct->_conf->apply_changes(NULL);
+ }
// stop MgrClient earlier as it's more like an internal consumer of OSD
mgrc.shutdown();
++i) {
PG *pg = i->second;
+ // Ignore PGs only partially created (DNE)
+ if (pg->info.dne()) {
+ continue;
+ }
+
auto rpib = pg->get_required_past_interval_bounds(
pg->info,
superblock.oldest_map);
ceph_abort();
}
+ const bool is_mon_create =
+ evt->get_event().dynamic_type() == PG::NullEvt::static_type();
+ if (maybe_wait_for_max_pg(pgid, is_mon_create)) {
+ return -EAGAIN;
+ }
// do we need to resurrect a deleting pg?
spg_t resurrected;
PGRef old_pg_state;
}
}
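+// Returns true (and defers the creation) once this OSD already maps the
+// configured hard limit of PGs; illustrative values: a limit of 200 with a
+// hard ratio of 2 stalls new creations at 400 PGs on this OSD.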
+bool OSD::maybe_wait_for_max_pg(spg_t pgid, bool is_mon_create)
+{
+ const auto max_pgs_per_osd =
+ (cct->_conf->get_val<uint64_t>("mon_max_pg_per_osd") *
+ cct->_conf->get_val<double>("osd_max_pg_per_osd_hard_ratio"));
+
+ RWLock::RLocker pg_map_locker{pg_map_lock};
+ if (pg_map.size() < max_pgs_per_osd) {
+ return false;
+ }
+ lock_guard<mutex> pending_creates_locker{pending_creates_lock};
+ if (is_mon_create) {
+ pending_creates_from_mon++;
+ } else {
+ pending_creates_from_osd.emplace(pgid.pgid);
+ }
+ dout(5) << __func__ << " withhold creation of pg " << pgid
+ << ": " << pg_map.size() << " >= "<< max_pgs_per_osd << dendl;
+ return true;
+}
+
+// to re-trigger a peering, we have to twiddle the pg mapping a little bit,
+// see PG::should_restart_peering(). OSDMap::pg_to_up_acting_osds() falls back
+// to the up set if pg_temp is empty, so an empty pg_temp won't work.
+static vector<int32_t> twiddle(const vector<int>& acting) {
+ if (acting.size() > 1) {
+ return {acting[0]};
+ } else {
+ vector<int32_t> twiddled(acting.begin(), acting.end());
+ twiddled.push_back(-1);
+ return twiddled;
+ }
+}
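+// e.g. acting [3,1,2] -> pg_temp [3]; acting [5] -> pg_temp [5,-1]; either
+// differs from the current mapping, which is enough to restart peering.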
+
+void OSD::resume_creating_pg()
+{
+ bool do_sub_pg_creates = false;
+ MOSDPGTemp *pgtemp = nullptr;
+ {
+ const auto max_pgs_per_osd =
+ (cct->_conf->get_val<uint64_t>("mon_max_pg_per_osd") *
+ cct->_conf->get_val<double>("osd_max_pg_per_osd_hard_ratio"));
+ RWLock::RLocker l(pg_map_lock);
+ if (max_pgs_per_osd <= pg_map.size()) {
+ // this could happen if admin decreases this setting before a PG is removed
+ return;
+ }
+ unsigned spare_pgs = max_pgs_per_osd - pg_map.size();
+ lock_guard<mutex> pending_creates_locker{pending_creates_lock};
+ if (pending_creates_from_mon > 0) {
+ do_sub_pg_creates = true;
+ if (pending_creates_from_mon >= spare_pgs) {
+ spare_pgs = pending_creates_from_mon = 0;
+ } else {
+ spare_pgs -= pending_creates_from_mon;
+ pending_creates_from_mon = 0;
+ }
+ }
+ auto pg = pending_creates_from_osd.cbegin();
+ while (spare_pgs > 0 && pg != pending_creates_from_osd.cend()) {
+ if (!pgtemp) {
+ pgtemp = new MOSDPGTemp{osdmap->get_epoch()};
+ }
+ vector<int> acting;
+ osdmap->pg_to_up_acting_osds(*pg, nullptr, nullptr, &acting, nullptr);
+ pgtemp->pg_temp[*pg] = twiddle(acting);
+ pg = pending_creates_from_osd.erase(pg);
+ spare_pgs--;
+ }
+ }
+ if (do_sub_pg_creates) {
+ if (monc->sub_want("osd_pg_creates", last_pg_create_epoch, 0)) {
+ dout(4) << __func__ << ": resolicit pg creates from mon since "
+ << last_pg_create_epoch << dendl;
+ monc->renew_subs();
+ }
+ }
+ if (pgtemp) {
+ pgtemp->forced = true;
+ monc->send_mon_message(pgtemp);
+ }
+}
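The spare-slot accounting above boils down to: monitor-requested creations drain the spare capacity first, and whatever remains resumes peering-triggered (OSD-side) creations. A simplified sketch of that ordering (the real code additionally re-solicits creates from the monitor and sends a forced pg_temp message):

    def resume_order(spare, pending_from_mon, pending_from_osd):
        # mon-requested creations are satisfied before osd-requested ones
        from_mon = min(spare, pending_from_mon)
        spare -= from_mon
        resumed_from_osd = list(pending_from_osd)[:spare]
        return from_mon, resumed_from_osd

    assert resume_order(3, 2, ['1.a', '1.b', '1.c']) == (2, ['1.a'])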
void OSD::build_initial_pg_history(
spg_t pgid,
sched_scrub();
}
service.promote_throttle_recalibrate();
+ resume_creating_pg();
bool need_send_beacon = false;
const auto now = ceph::coarse_mono_clock::now();
{
assert(osd_lock.is_locked());
dout(7) << "consume_map version " << osdmap->get_epoch() << dendl;
+  /* make sure the cluster is speaking in SORTBITWISE, because we don't
+   * speak the older sorting version any more. Be careful not to force
+   * a shutdown if we are merely processing old maps, though.
+   */
+ if (!osdmap->test_flag(CEPH_OSDMAP_SORTBITWISE) && is_active()) {
+ derr << __func__ << " SORTBITWISE flag is not set" << dendl;
+ ceph_abort();
+ }
+
int num_pg_primary = 0, num_pg_replica = 0, num_pg_stray = 0;
list<PGRef> to_remove;
pg->unlock();
}
+
+ lock_guard<mutex> pending_creates_locker{pending_creates_lock};
+ for (auto pg = pending_creates_from_osd.cbegin();
+ pg != pending_creates_from_osd.cend();) {
+ if (osdmap->get_pg_acting_rank(*pg, whoami) < 0) {
+ pg = pending_creates_from_osd.erase(pg);
+ } else {
+ ++pg;
+ }
+ }
}
for (list<PGRef>::iterator i = to_remove.begin();
dout(7) << "activate_map version " << osdmap->get_epoch() << dendl;
- if (!osdmap->test_flag(CEPH_OSDMAP_SORTBITWISE)) {
- derr << __func__ << " SORTBITWISE flag is not set" << dendl;
- ceph_abort();
- }
-
if (osdmap->test_flag(CEPH_OSDMAP_FULL)) {
dout(10) << " osdmap flagged full, doing onetime osdmap subscribe" << dendl;
osdmap_subscribe(osdmap->get_epoch() + 1, false);
<< dendl;
continue;
}
-
if (handle_pg_peering_evt(
pgid,
history,
service.send_pg_created(pgid.pgid);
}
}
- last_pg_create_epoch = m->epoch;
+ {
+ lock_guard<mutex> pending_creates_locker{pending_creates_lock};
+ if (pending_creates_from_mon == 0) {
+ last_pg_create_epoch = m->epoch;
+ }
+ }
maybe_update_heartbeat_peers();
}
continue;
}
service.share_map_peer(it->first, con.get(), curmap);
- dout(7) << __func__ << " osd " << it->first
+ dout(7) << __func__ << " osd." << it->first
<< " on " << it->second.size() << " PGs" << dendl;
MOSDPGNotify *m = new MOSDPGNotify(curmap->get_epoch(),
it->second);
m->query_epoch,
PG::RemoteBackfillReserved()));
} else if (m->type == MBackfillReserve::REJECT) {
+    // NOTE: this is replica -> primary "I reject your request",
+    // and also primary -> replica "cancel my previously-granted request"
evt = PG::CephPeeringEvtRef(
new PG::CephPeeringEvt(
m->query_epoch,
pg->put("PGMap"); // since we've taken it out of map
}
-
// =========================================================
// RECOVERY
i->lock();
int pgstate = i->get_state();
if ( ((newstate == PG_STATE_FORCED_RECOVERY) && (pgstate & (PG_STATE_DEGRADED | PG_STATE_RECOVERY_WAIT | PG_STATE_RECOVERING))) ||
- ((newstate == PG_STATE_FORCED_BACKFILL) && (pgstate & (PG_STATE_DEGRADED | PG_STATE_BACKFILL_WAIT | PG_STATE_BACKFILL))) )
+ ((newstate == PG_STATE_FORCED_BACKFILL) && (pgstate & (PG_STATE_DEGRADED | PG_STATE_BACKFILL_WAIT | PG_STATE_BACKFILLING))) )
i->_change_recovery_force_mode(newstate, false);
i->unlock();
}
pg->discover_all_missing(*rctx.query_map);
if (rctx.query_map->empty()) {
string action;
- if (pg->state_test(PG_STATE_BACKFILL)) {
+ if (pg->state_test(PG_STATE_BACKFILLING)) {
auto evt = PG::CephPeeringEvtRef(new PG::CephPeeringEvt(
queued,
queued,
- PG::CancelBackfill()));
+ PG::DeferBackfill(cct->_conf->osd_recovery_retry_interval)));
pg->queue_peering_event(evt);
action = "in backfill";
} else if (pg->state_test(PG_STATE_RECOVERING)) {
auto evt = PG::CephPeeringEvtRef(new PG::CephPeeringEvt(
queued,
queued,
- PG::CancelRecovery()));
+ PG::DeferRecovery(cct->_conf->osd_recovery_retry_interval)));
pg->queue_peering_event(evt);
action = "in recovery";
} else {
RWLock pg_map_lock; // this lock orders *above* individual PG _locks
ceph::unordered_map<spg_t, PG*> pg_map; // protected by pg_map lock
+ std::mutex pending_creates_lock;
+ std::set<pg_t> pending_creates_from_osd;
+ unsigned pending_creates_from_mon = 0;
+
map<spg_t, list<PG::CephPeeringEvtRef> > peering_wait_for_split;
PGRecoveryStats pg_recovery_stats;
const PastIntervals& pi,
epoch_t epoch,
PG::CephPeeringEvtRef evt);
-
+ bool maybe_wait_for_max_pg(spg_t pgid, bool is_mon_create);
+ void resume_creating_pg();
+
void load_pgs();
void build_past_intervals_parallel();
int update_crush_device_class();
int update_crush_location();
- static int write_meta(ObjectStore *store,
+ static int write_meta(CephContext *cct,
+ ObjectStore *store,
uuid_d& cluster_fsid, uuid_d& osd_fsid, int whoami);
void handle_pg_scrub(struct MOSDScrub *m, PG* pg);
#include "OSDMap.h"
#include <algorithm>
#include "common/config.h"
+#include "common/errno.h"
#include "common/Formatter.h"
#include "common/TextTable.h"
#include "include/ceph_features.h"
return num_osd;
}
-void OSDMap::count_full_nearfull_osds(int *full, int *backfill, int *nearfull) const
+void OSDMap::get_full_pools(CephContext *cct,
+ set<int64_t> *full,
+ set<int64_t> *backfillfull,
+ set<int64_t> *nearfull) const
{
- *full = 0;
- *backfill = 0;
- *nearfull = 0;
+ assert(full);
+ assert(backfillfull);
+ assert(nearfull);
+ full->clear();
+ backfillfull->clear();
+ nearfull->clear();
+
+ vector<int> full_osds;
+ vector<int> backfillfull_osds;
+ vector<int> nearfull_osds;
for (int i = 0; i < max_osd; ++i) {
if (exists(i) && is_up(i) && is_in(i)) {
if (osd_state[i] & CEPH_OSD_FULL)
- ++(*full);
+ full_osds.push_back(i);
else if (osd_state[i] & CEPH_OSD_BACKFILLFULL)
- ++(*backfill);
+ backfillfull_osds.push_back(i);
else if (osd_state[i] & CEPH_OSD_NEARFULL)
- ++(*nearfull);
+ nearfull_osds.push_back(i);
}
}
+
+ for (auto i: full_osds) {
+ get_pool_ids_by_osd(cct, i, full);
+ }
+ for (auto i: backfillfull_osds) {
+ get_pool_ids_by_osd(cct, i, backfillfull);
+ }
+ for (auto i: nearfull_osds) {
+ get_pool_ids_by_osd(cct, i, nearfull);
+ }
}
static bool get_osd_utilization(
if (!is_up(osd))
continue;
const osd_xinfo_t &xi = get_xinfo(osd);
+ if (xi.features == 0)
+ continue; // bogus xinfo, maybe #20751 or similar, skipping
if (first) {
cached_up_osd_features = xi.features;
first = false;
out << " nearfull";
}
-bool OSDMap::crush_ruleset_in_use(int ruleset) const
+bool OSDMap::crush_rule_in_use(int rule_id) const
{
for (const auto &pool : pools) {
- if (pool.second.crush_rule == ruleset)
+ if (pool.second.crush_rule == rule_id)
return true;
}
return false;
}
+int OSDMap::validate_crush_rules(CrushWrapper *newcrush,
+ ostream *ss) const
+{
+ for (auto& i : pools) {
+ auto& pool = i.second;
+ int ruleno = pool.get_crush_rule();
+ if (!newcrush->rule_exists(ruleno)) {
+ *ss << "pool " << i.first << " references crush_rule " << ruleno
+ << " but it is not present";
+ return -EINVAL;
+ }
+ if (newcrush->get_rule_mask_ruleset(ruleno) != ruleno) {
+ *ss << "rule " << ruleno << " mask ruleset does not match rule id";
+ return -EINVAL;
+ }
+ if (newcrush->get_rule_mask_type(ruleno) != (int)pool.get_type()) {
+ *ss << "pool " << i.first << " type does not match rule " << ruleno;
+ return -EINVAL;
+ }
+ if (pool.get_size() < (int)newcrush->get_rule_mask_min_size(ruleno) ||
+ pool.get_size() > (int)newcrush->get_rule_mask_max_size(ruleno)) {
+ *ss << "pool " << i.first << " size " << pool.get_size() << " does not"
+ << " fall within rule " << ruleno
+ << " min_size " << newcrush->get_rule_mask_min_size(ruleno)
+ << " and max_size " << newcrush->get_rule_mask_max_size(ruleno);
+ return -EINVAL;
+ }
+ }
+ return 0;
+}
+
int OSDMap::build_simple_optioned(CephContext *cct, epoch_t e, uuid_d &fsid,
int nosd, int pg_bits, int pgp_bits,
bool default_pool)
tmp.crush->get_rule_weight_osd_map(ruleno, &pmap);
ldout(cct,30) << __func__ << " pool " << i.first << " ruleno " << ruleno << dendl;
for (auto p : pmap) {
- osd_weight[p.first] += p.second;
- osd_weight_total += p.second;
+ auto adjusted_weight = tmp.get_weightf(p.first) * p.second;
+ osd_weight[p.first] += adjusted_weight;
+ osd_weight_total += adjusted_weight;
}
}
for (auto& i : osd_weight) {
return crush->get_leaves(name, osds);
}
+// get pools whose crush rules might reference the given osd
+void OSDMap::get_pool_ids_by_osd(CephContext *cct,
+ int osd,
+ set<int64_t> *pool_ids) const
+{
+ assert(pool_ids);
+ set<int> raw_rules;
+ int r = crush->get_rules_by_osd(osd, &raw_rules);
+ if (r < 0) {
+ lderr(cct) << __func__ << " get_rules_by_osd failed: " << cpp_strerror(r)
+ << dendl;
+ assert(r >= 0);
+ }
+ set<int> rules;
+ for (auto &i: raw_rules) {
+ // exclude any dead rule
+ if (crush_rule_in_use(i)) {
+ rules.insert(i);
+ }
+ }
+ for (auto &r: rules) {
+ get_pool_ids_by_rule(r, pool_ids);
+ }
+}
+
template <typename F>
class OSDUtilizationDumper : public CrushTreeDumper::Dumper<F> {
public:
{
// warn about flags
uint64_t warn_flags =
+ CEPH_OSDMAP_NEARFULL |
CEPH_OSDMAP_FULL |
CEPH_OSDMAP_PAUSERD |
CEPH_OSDMAP_PAUSEWR |
// OSD_UPGRADE_FINISHED
// none of these (yet) since we don't run until luminous upgrade is done.
- // POOL_FULL
+ // POOL_NEARFULL/BACKFILLFULL/FULL
{
- list<string> detail;
+ list<string> full_detail, backfillfull_detail, nearfull_detail;
for (auto it : get_pools()) {
const pg_pool_t &pool = it.second;
+ const string& pool_name = get_pool_name(it.first);
if (pool.has_flag(pg_pool_t::FLAG_FULL)) {
- const string& pool_name = get_pool_name(it.first);
stringstream ss;
- ss << "pool '" << pool_name << "' is full";
- detail.push_back(ss.str());
+ if (pool.has_flag(pg_pool_t::FLAG_FULL_NO_QUOTA)) {
+        // the pool may be running out of space too, but we want
+        // EQUOTA to take precedence
+ ss << "pool '" << pool_name << "' is full (no quota)";
+ } else {
+ ss << "pool '" << pool_name << "' is full (no space)";
+ }
+ full_detail.push_back(ss.str());
+ } else if (pool.has_flag(pg_pool_t::FLAG_BACKFILLFULL)) {
+ stringstream ss;
+ ss << "pool '" << pool_name << "' is backfillfull";
+ backfillfull_detail.push_back(ss.str());
+ } else if (pool.has_flag(pg_pool_t::FLAG_NEARFULL)) {
+ stringstream ss;
+ ss << "pool '" << pool_name << "' is nearfull";
+ nearfull_detail.push_back(ss.str());
}
}
- if (!detail.empty()) {
+ if (!full_detail.empty()) {
ostringstream ss;
- ss << detail.size() << " pool(s) full";
+ ss << full_detail.size() << " pool(s) full";
auto& d = checks->add("POOL_FULL", HEALTH_WARN, ss.str());
- d.detail.swap(detail);
+ d.detail.swap(full_detail);
+ }
+ if (!backfillfull_detail.empty()) {
+ ostringstream ss;
+ ss << backfillfull_detail.size() << " pool(s) backfillfull";
+ auto& d = checks->add("POOL_BACKFILLFULL", HEALTH_WARN, ss.str());
+ d.detail.swap(backfillfull_detail);
+ }
+ if (!nearfull_detail.empty()) {
+ ostringstream ss;
+ ss << nearfull_detail.size() << " pool(s) nearfull";
+ auto& d = checks->add("POOL_NEARFULL", HEALTH_WARN, ss.str());
+ d.detail.swap(nearfull_detail);
}
}
}
float get_nearfull_ratio() const {
return nearfull_ratio;
}
- void count_full_nearfull_osds(int *full, int *backfill, int *nearfull) const;
void get_full_osd_util(
const mempool::pgmap::unordered_map<int32_t,osd_stat_t> &osd_stat,
map<int, float> *full,
map<int, float> *backfill,
map<int, float> *nearfull) const;
-
+ void get_full_pools(CephContext *cct,
+ set<int64_t> *full,
+ set<int64_t> *backfillfull,
+ set<int64_t> *nearfull) const;
void get_full_osd_counts(set<int> *full, set<int> *backfill,
set<int> *nearfull) const;
mempool::osdmap::map<int64_t,pg_pool_t>& get_pools() {
return pools;
}
+ void get_pool_ids_by_rule(int rule_id, set<int64_t> *pool_ids) const {
+ assert(pool_ids);
+ for (auto &p: pools) {
+ if ((int)p.second.get_crush_rule() == rule_id) {
+ pool_ids->insert(p.first);
+ }
+ }
+ }
+ void get_pool_ids_by_osd(CephContext *cct,
+ int osd,
+ set<int64_t> *pool_ids) const;
const string& get_pool_name(int64_t p) const {
auto i = pool_name.find(p);
assert(i != pool_name.end());
const string& root,
ostream *ss);
- bool crush_ruleset_in_use(int ruleset) const;
+ bool crush_rule_in_use(int rule_id) const;
+
+ int validate_crush_rules(CrushWrapper *crush, ostream *ss) const;
void clear_temp() {
pg_temp->clear();
get_osdmap()->get_epoch());
}
-void PG::schedule_backfill_full_retry()
+void PG::schedule_backfill_retry(float delay)
{
Mutex::Locker lock(osd->recovery_request_lock);
osd->recovery_request_timer.add_event_after(
- cct->_conf->osd_backfill_retry_interval,
+ delay,
new QueuePeeringEvt<RequestBackfill>(
this, get_osdmap()->get_epoch(),
RequestBackfill()));
}
-void PG::schedule_recovery_full_retry()
+void PG::schedule_recovery_retry(float delay)
{
Mutex::Locker lock(osd->recovery_request_lock);
osd->recovery_request_timer.add_event_after(
- cct->_conf->osd_recovery_retry_interval,
+ delay,
new QueuePeeringEvt<DoRecovery>(
this, get_osdmap()->get_epoch(),
DoRecovery()));
upacting_features &= osdmap->get_xinfo(*p).features;
}
- assert(osdmap->test_flag(CEPH_OSDMAP_SORTBITWISE));
-
_on_new_interval();
}
pg->queue_recovery();
pg->state_clear(PG_STATE_BACKFILL_TOOFULL);
pg->state_clear(PG_STATE_BACKFILL_WAIT);
- pg->state_set(PG_STATE_BACKFILL);
+ pg->state_set(PG_STATE_BACKFILLING);
pg->publish_stats_to_osd();
}
boost::statechart::result
-PG::RecoveryState::Backfilling::react(const CancelBackfill &)
+PG::RecoveryState::Backfilling::react(const DeferBackfill &c)
{
PG *pg = context< RecoveryMachine >().pg;
+ ldout(pg->cct, 10) << "defer backfill, retry delay " << c.delay << dendl;
pg->osd->local_reserver.cancel_reservation(pg->info.pgid);
- // XXX: Add a new pg state so user can see why backfill isn't proceeding
- // Can't use PG_STATE_BACKFILL_WAIT since it means waiting for reservations
- //pg->state_set(PG_STATE_BACKFILL_STALLED????);
+
+ pg->state_set(PG_STATE_BACKFILL_WAIT);
+ pg->state_clear(PG_STATE_BACKFILLING);
for (set<pg_shard_t>::iterator it = pg->backfill_targets.begin();
it != pg->backfill_targets.end();
}
}
- pg->waiting_on_backfill.clear();
- pg->schedule_backfill_full_retry();
+ if (!pg->waiting_on_backfill.empty()) {
+ pg->waiting_on_backfill.clear();
+ pg->finish_recovery_op(hobject_t::get_max());
+ }
+
+ pg->schedule_backfill_retry(c.delay);
return transit<NotBackfilling>();
}
}
}
- pg->waiting_on_backfill.clear();
- pg->finish_recovery_op(hobject_t::get_max());
+ if (!pg->waiting_on_backfill.empty()) {
+ pg->waiting_on_backfill.clear();
+ pg->finish_recovery_op(hobject_t::get_max());
+ }
- pg->schedule_backfill_full_retry();
+ pg->schedule_backfill_retry(pg->cct->_conf->osd_recovery_retry_interval);
return transit<NotBackfilling>();
}
PG *pg = context< RecoveryMachine >().pg;
pg->backfill_reserved = false;
pg->backfill_reserving = false;
- pg->state_clear(PG_STATE_BACKFILL);
+ pg->state_clear(PG_STATE_BACKFILLING);
pg->state_clear(PG_STATE_FORCED_BACKFILL | PG_STATE_FORCED_RECOVERY);
utime_t dur = ceph_clock_now() - enter_time;
pg->osd->recoverystate_perf->tinc(rs_backfilling_latency, dur);
pg->state_set(PG_STATE_BACKFILL_TOOFULL);
pg->publish_stats_to_osd();
- pg->schedule_backfill_full_retry();
+ pg->schedule_backfill_retry(pg->cct->_conf->osd_recovery_retry_interval);
return transit<NotBackfilling>();
}
new QueuePeeringEvt<LocalBackfillReserved>(
pg, pg->get_osdmap()->get_epoch(),
LocalBackfillReserved()),
- pg->get_backfill_priority());
+ pg->get_backfill_priority(),
+ new QueuePeeringEvt<DeferBackfill>(
+ pg, pg->get_osdmap()->get_epoch(),
+ DeferBackfill(0.0)));
pg->publish_stats_to_osd();
}
context< RecoveryMachine >().log_enter(state_name);
}
+boost::statechart::result
+PG::RecoveryState::RepNotRecovering::react(const RejectRemoteReservation &evt)
+{
+ PG *pg = context< RecoveryMachine >().pg;
+ pg->reject_reservation();
+ post_event(RemoteReservationRejected());
+ return discard_event();
+}
+
void PG::RecoveryState::RepNotRecovering::exit()
{
context< RecoveryMachine >().log_exit(state_name, enter_time);
return transit<RepRecovering>();
}
+boost::statechart::result
+PG::RecoveryState::RepWaitRecoveryReserved::react(
+ const RemoteReservationCanceled &evt)
+{
+ PG *pg = context< RecoveryMachine >().pg;
+ pg->osd->remote_reserver.cancel_reservation(pg->info.pgid);
+ return transit<RepNotRecovering>();
+}
+
void PG::RecoveryState::RepWaitRecoveryReserved::exit()
{
context< RecoveryMachine >().log_exit(state_name, enter_time);
(rand()%1000 < (pg->cct->_conf->osd_debug_reject_backfill_probability*1000.0))) {
ldout(pg->cct, 10) << "backfill reservation rejected: failure injection"
<< dendl;
- post_event(RemoteReservationRejected());
+ post_event(RejectRemoteReservation());
} else if (!pg->cct->_conf->osd_debug_skip_full_check_in_backfill_reservation &&
pg->osd->check_backfill_full(ss)) {
ldout(pg->cct, 10) << "backfill reservation rejected: "
<< ss.str() << dendl;
- post_event(RemoteReservationRejected());
+ post_event(RejectRemoteReservation());
} else {
pg->osd->remote_reserver.request_reservation(
pg->info.pgid,
(rand()%1000 < (pg->cct->_conf->osd_debug_reject_backfill_probability*1000.0))) {
ldout(pg->cct, 10) << "backfill reservation rejected after reservation: "
<< "failure injection" << dendl;
- pg->osd->remote_reserver.cancel_reservation(pg->info.pgid);
- post_event(RemoteReservationRejected());
+ post_event(RejectRemoteReservation());
return discard_event();
} else if (!pg->cct->_conf->osd_debug_skip_full_check_in_backfill_reservation &&
pg->osd->check_backfill_full(ss)) {
ldout(pg->cct, 10) << "backfill reservation rejected after reservation: "
<< ss.str() << dendl;
- pg->osd->remote_reserver.cancel_reservation(pg->info.pgid);
- post_event(RemoteReservationRejected());
+ post_event(RejectRemoteReservation());
return discard_event();
} else {
pg->osd->send_message_osd_cluster(
}
boost::statechart::result
-PG::RecoveryState::RepWaitBackfillReserved::react(const RemoteReservationRejected &evt)
+PG::RecoveryState::RepWaitBackfillReserved::react(
+ const RejectRemoteReservation &evt)
{
PG *pg = context< RecoveryMachine >().pg;
pg->reject_reservation();
+ post_event(RemoteReservationRejected());
+ return discard_event();
+}
+
+boost::statechart::result
+PG::RecoveryState::RepWaitBackfillReserved::react(
+ const RemoteReservationRejected &evt)
+{
+ PG *pg = context< RecoveryMachine >().pg;
+ pg->osd->remote_reserver.cancel_reservation(pg->info.pgid);
+ return transit<RepNotRecovering>();
+}
+
+boost::statechart::result
+PG::RecoveryState::RepWaitBackfillReserved::react(
+ const RemoteReservationCanceled &evt)
+{
+ PG *pg = context< RecoveryMachine >().pg;
+ pg->osd->remote_reserver.cancel_reservation(pg->info.pgid);
return transit<RepNotRecovering>();
}
new QueuePeeringEvt<LocalRecoveryReserved>(
pg, pg->get_osdmap()->get_epoch(),
LocalRecoveryReserved()),
- pg->get_recovery_priority());
+ pg->get_recovery_priority(),
+ new QueuePeeringEvt<DeferRecovery>(
+ pg, pg->get_osdmap()->get_epoch(),
+ DeferRecovery(0.0)));
pg->publish_stats_to_osd();
}
{
PG *pg = context< RecoveryMachine >().pg;
pg->state_set(PG_STATE_RECOVERY_TOOFULL);
- pg->schedule_recovery_full_retry();
+ pg->schedule_recovery_retry(pg->cct->_conf->osd_recovery_retry_interval);
return transit<NotRecovering>();
}
pg->state_clear(PG_STATE_RECOVERING);
pg->state_clear(PG_STATE_FORCED_RECOVERY);
release_reservations();
+ pg->osd->local_reserver.cancel_reservation(pg->info.pgid);
return transit<Recovered>();
}
pg->state_clear(PG_STATE_RECOVERING);
pg->state_clear(PG_STATE_FORCED_RECOVERY);
release_reservations();
- return transit<WaitRemoteBackfillReserved>();
+ pg->osd->local_reserver.cancel_reservation(pg->info.pgid);
+ return transit<WaitLocalBackfillReserved>();
}
boost::statechart::result
-PG::RecoveryState::Recovering::react(const CancelRecovery &evt)
+PG::RecoveryState::Recovering::react(const DeferRecovery &evt)
{
PG *pg = context< RecoveryMachine >().pg;
+ ldout(pg->cct, 10) << "defer recovery, retry delay " << evt.delay << dendl;
pg->state_clear(PG_STATE_RECOVERING);
+ pg->state_set(PG_STATE_RECOVERY_WAIT);
pg->osd->local_reserver.cancel_reservation(pg->info.pgid);
release_reservations(true);
- pg->schedule_recovery_full_retry();
+ pg->schedule_recovery_retry(evt.delay);
return transit<NotRecovering>();
}
context< RecoveryMachine >().log_enter(state_name);
PG *pg = context< RecoveryMachine >().pg;
- pg->osd->local_reserver.cancel_reservation(pg->info.pgid);
assert(!pg->needs_recovery());
void handle_scrub_reserve_release(OpRequestRef op);
void reject_reservation();
- void schedule_backfill_full_retry();
- void schedule_recovery_full_retry();
+ void schedule_backfill_retry(float retry);
+ void schedule_recovery_retry(float retry);
// -- recovery state --
*out << #T; \
} \
};
+ struct DeferBackfill : boost::statechart::event<DeferBackfill> {
+ float delay;
+ explicit DeferBackfill(float delay) : delay(delay) {}
+ void print(std::ostream *out) const {
+ *out << "DeferBackfill: delay " << delay;
+ }
+ };
+ struct DeferRecovery : boost::statechart::event<DeferRecovery> {
+ float delay;
+ explicit DeferRecovery(float delay) : delay(delay) {}
+ void print(std::ostream *out) const {
+ *out << "DeferRecovery: delay " << delay;
+ }
+ };
+
TrivialEvent(Initialize)
TrivialEvent(Load)
TrivialEvent(GotInfo)
TrivialEvent(Backfilled)
TrivialEvent(LocalBackfillReserved)
TrivialEvent(RemoteBackfillReserved)
+ TrivialEvent(RejectRemoteReservation)
TrivialEvent(RemoteReservationRejected)
- TrivialEvent(CancelBackfill)
+ TrivialEvent(RemoteReservationCanceled)
TrivialEvent(RequestBackfill)
TrivialEvent(RequestRecovery)
TrivialEvent(RecoveryDone)
TrivialEvent(BackfillTooFull)
TrivialEvent(RecoveryTooFull)
- TrivialEvent(CancelRecovery)
+
+ TrivialEvent(MakePrimary)
+ TrivialEvent(MakeStray)
+ TrivialEvent(NeedActingChange)
+ TrivialEvent(IsIncomplete)
+ TrivialEvent(IsDown)
TrivialEvent(AllReplicasRecovered)
TrivialEvent(DoRecovery)
}
};
- struct MakePrimary : boost::statechart::event< MakePrimary > {
- MakePrimary() : boost::statechart::event< MakePrimary >() {}
- };
- struct MakeStray : boost::statechart::event< MakeStray > {
- MakeStray() : boost::statechart::event< MakeStray >() {}
- };
struct Primary;
struct Stray;
struct Peering;
struct WaitActingChange;
- struct NeedActingChange : boost::statechart::event< NeedActingChange > {
- NeedActingChange() : boost::statechart::event< NeedActingChange >() {}
- };
struct Incomplete;
- struct IsIncomplete : boost::statechart::event< IsIncomplete > {
- IsIncomplete() : boost::statechart::event< IsIncomplete >() {}
- };
struct Down;
- struct IsDown : boost::statechart::event< IsDown > {
- IsDown() : boost::statechart::event< IsDown >() {}
- };
struct Primary : boost::statechart::state< Primary, Started, Peering >, NamedState {
explicit Primary(my_context ctx);
boost::statechart::custom_reaction< MNotifyRec >,
boost::statechart::custom_reaction< MLogRec >,
boost::statechart::custom_reaction< Backfilled >,
- boost::statechart::custom_reaction< AllReplicasActivated >
+ boost::statechart::custom_reaction< AllReplicasActivated >,
+ boost::statechart::custom_reaction< DeferRecovery >,
+ boost::statechart::custom_reaction< DeferBackfill >
> reactions;
boost::statechart::result react(const QueryState& q);
boost::statechart::result react(const ActMap&);
return discard_event();
}
boost::statechart::result react(const AllReplicasActivated&);
+ boost::statechart::result react(const DeferRecovery& evt) {
+ return discard_event();
+ }
+ boost::statechart::result react(const DeferBackfill& evt) {
+ return discard_event();
+ }
};
struct Clean : boost::statechart::state< Clean, Active >, NamedState {
struct Backfilling : boost::statechart::state< Backfilling, Active >, NamedState {
typedef boost::mpl::list<
boost::statechart::transition< Backfilled, Recovered >,
- boost::statechart::custom_reaction< CancelBackfill >,
+ boost::statechart::custom_reaction< DeferBackfill >,
boost::statechart::custom_reaction< RemoteReservationRejected >
> reactions;
explicit Backfilling(my_context ctx);
boost::statechart::result react(const RemoteReservationRejected& evt);
- boost::statechart::result react(const CancelBackfill& evt);
+ boost::statechart::result react(const DeferBackfill& evt);
void exit();
};
struct NotRecovering : boost::statechart::state< NotRecovering, Active>, NamedState {
typedef boost::mpl::list<
boost::statechart::transition< DoRecovery, WaitLocalRecoveryReserved >,
- boost::statechart::custom_reaction< CancelRecovery >
+ boost::statechart::custom_reaction< DeferRecovery >
> reactions;
explicit NotRecovering(my_context ctx);
- boost::statechart::result react(const CancelRecovery& evt) {
+ boost::statechart::result react(const DeferRecovery& evt) {
/* no-op */
return discard_event();
}
boost::statechart::custom_reaction< MQuery >,
boost::statechart::custom_reaction< MInfoRec >,
boost::statechart::custom_reaction< MLogRec >,
- boost::statechart::custom_reaction< Activate >
+ boost::statechart::custom_reaction< Activate >,
+ boost::statechart::custom_reaction< DeferRecovery >,
+ boost::statechart::custom_reaction< DeferBackfill >
> reactions;
boost::statechart::result react(const QueryState& q);
boost::statechart::result react(const MInfoRec& infoevt);
boost::statechart::result react(const ActMap&);
boost::statechart::result react(const MQuery&);
boost::statechart::result react(const Activate&);
+ boost::statechart::result react(const DeferRecovery& evt) {
+ return discard_event();
+ }
+ boost::statechart::result react(const DeferBackfill& evt) {
+ return discard_event();
+ }
};
struct RepRecovering : boost::statechart::state< RepRecovering, ReplicaActive >, NamedState {
typedef boost::mpl::list<
boost::statechart::transition< RecoveryDone, RepNotRecovering >,
+ // for compat with old peers
boost::statechart::transition< RemoteReservationRejected, RepNotRecovering >,
+ boost::statechart::transition< RemoteReservationCanceled, RepNotRecovering >,
boost::statechart::custom_reaction< BackfillTooFull >
> reactions;
explicit RepRecovering(my_context ctx);
struct RepWaitBackfillReserved : boost::statechart::state< RepWaitBackfillReserved, ReplicaActive >, NamedState {
typedef boost::mpl::list<
boost::statechart::custom_reaction< RemoteBackfillReserved >,
- boost::statechart::custom_reaction< RemoteReservationRejected >
+ boost::statechart::custom_reaction< RejectRemoteReservation >,
+ boost::statechart::custom_reaction< RemoteReservationRejected >,
+ boost::statechart::custom_reaction< RemoteReservationCanceled >
> reactions;
explicit RepWaitBackfillReserved(my_context ctx);
void exit();
boost::statechart::result react(const RemoteBackfillReserved &evt);
+ boost::statechart::result react(const RejectRemoteReservation &evt);
boost::statechart::result react(const RemoteReservationRejected &evt);
+ boost::statechart::result react(const RemoteReservationCanceled &evt);
};
struct RepWaitRecoveryReserved : boost::statechart::state< RepWaitRecoveryReserved, ReplicaActive >, NamedState {
typedef boost::mpl::list<
- boost::statechart::custom_reaction< RemoteRecoveryReserved >
+ boost::statechart::custom_reaction< RemoteRecoveryReserved >,
+ // for compat with old peers
+ boost::statechart::custom_reaction< RemoteReservationRejected >,
+ boost::statechart::custom_reaction< RemoteReservationCanceled >
> reactions;
explicit RepWaitRecoveryReserved(my_context ctx);
void exit();
boost::statechart::result react(const RemoteRecoveryReserved &evt);
+ boost::statechart::result react(const RemoteReservationRejected &evt) {
+ // for compat with old peers
+ post_event(RemoteReservationCanceled());
+ return discard_event();
+ }
+ boost::statechart::result react(const RemoteReservationCanceled &evt);
};
struct RepNotRecovering : boost::statechart::state< RepNotRecovering, ReplicaActive>, NamedState {
typedef boost::mpl::list<
boost::statechart::custom_reaction< RequestBackfillPrio >,
boost::statechart::transition< RequestRecovery, RepWaitRecoveryReserved >,
+ boost::statechart::custom_reaction< RejectRemoteReservation >,
+ boost::statechart::transition< RemoteReservationRejected, RepNotRecovering >,
+ boost::statechart::transition< RemoteReservationCanceled, RepNotRecovering >,
boost::statechart::transition< RecoveryDone, RepNotRecovering > // for compat with pre-reservation peers
> reactions;
explicit RepNotRecovering(my_context ctx);
boost::statechart::result react(const RequestBackfillPrio &evt);
+ boost::statechart::result react(const RejectRemoteReservation &evt);
void exit();
};
struct Recovering : boost::statechart::state< Recovering, Active >, NamedState {
typedef boost::mpl::list <
boost::statechart::custom_reaction< AllReplicasRecovered >,
- boost::statechart::custom_reaction< CancelRecovery >,
+ boost::statechart::custom_reaction< DeferRecovery >,
boost::statechart::custom_reaction< RequestBackfill >
> reactions;
explicit Recovering(my_context ctx);
void exit();
void release_reservations(bool cancel = false);
boost::statechart::result react(const AllReplicasRecovered &evt);
- boost::statechart::result react(const CancelRecovery& evt);
+ boost::statechart::result react(const DeferRecovery& evt);
boost::statechart::result react(const RequestBackfill &evt);
};
if (!is_degraded() &&
!state_test(PG_STATE_RECOVERING |
PG_STATE_RECOVERY_WAIT |
- PG_STATE_BACKFILL |
+ PG_STATE_BACKFILLING |
PG_STATE_BACKFILL_WAIT |
PG_STATE_BACKFILL_TOOFULL))
return;
if (is_degraded() ||
state_test(PG_STATE_RECOVERING |
PG_STATE_RECOVERY_WAIT |
- PG_STATE_BACKFILL |
+ PG_STATE_BACKFILLING |
PG_STATE_BACKFILL_WAIT |
PG_STATE_BACKFILL_TOOFULL)) {
target = cct->_conf->osd_max_pg_log_entries;
if (write_ordered && is_degraded_or_backfilling_object(head)) {
if (can_backoff && g_conf->osd_backoff_on_degraded) {
add_backoff(session, head, head);
+ maybe_kick_recovery(head);
} else {
wait_for_degraded_object(head, op);
}
dout(20) << __func__ << " r=" << r << dendl;
assert(op->may_write());
const osd_reqid_t &reqid = static_cast<const MOSDOp*>(op->get_req())->get_reqid();
- ObjectContextRef obc;
mempool::osd_pglog::list<pg_log_entry_t> entries;
entries.push_back(pg_log_entry_t(pg_log_entry_t::ERROR, soid,
get_next_version(), eversion_t(), 0,
dout(20) << __func__ << dendl;
ceph_osd_op& op = osd_op.op;
- if (!ctx->obs->exists || ctx->obs->oi.is_whiteout()) {
+ auto& oi = ctx->new_obs.oi;
+ uint64_t size = oi.size;
+ if ((oi.truncate_seq < op.extent.truncate_seq) &&
+ (op.extent.offset + op.extent.length > op.extent.truncate_size)) {
+ size = op.extent.truncate_size;
+ }
+
+ if (op.extent.offset >= size) {
+ op.extent.length = 0;
+ } else if (op.extent.offset + op.extent.length > size) {
+ op.extent.length = size - op.extent.offset;
+ }
+
+ if (op.extent.length == 0) {
+ dout(20) << __func__ << " zero length extent" << dendl;
+ return finish_extent_cmp(osd_op, bufferlist{});
+ } else if (!ctx->obs->exists || ctx->obs->oi.is_whiteout()) {
dout(20) << __func__ << " object DNE" << dendl;
return finish_extent_cmp(osd_op, {});
} else if (pool.info.require_rollback()) {
// If there is a data digest and it is possible we are reading the
// entire object, pass the digest.
- auto& oi = ctx->new_obs.oi;
boost::optional<uint32_t> maybe_crc;
if (oi.is_data_digest() && op.checksum.offset == 0 &&
op.checksum.length >= oi.size) {
}
}
} else {
- legacy = false;
+ legacy = true;
}
dout(20) << __func__ << " " << soid << " whiteout=" << (int)whiteout
<< " no_whiteout=" << (int)no_whiteout
assert(is_primary());
if (!state_test(PG_STATE_RECOVERING) &&
- !state_test(PG_STATE_BACKFILL)) {
+ !state_test(PG_STATE_BACKFILLING)) {
/* TODO: I think this case is broken and will make do_recovery()
* unhappy since we're returning false */
dout(10) << "recovery raced and were queued twice, ignoring!" << dendl;
bool deferred_backfill = false;
if (recovering.empty() &&
- state_test(PG_STATE_BACKFILL) &&
+ state_test(PG_STATE_BACKFILLING) &&
!backfill_targets.empty() && started < max &&
missing.num_missing() == 0 &&
waiting_on_backfill.empty()) {
if (state_test(PG_STATE_RECOVERING)) {
state_clear(PG_STATE_RECOVERING);
state_clear(PG_STATE_FORCED_RECOVERY);
+ if (get_osdmap()->get_pg_size(info.pgid.pgid) <= acting.size()) {
+ state_clear(PG_STATE_DEGRADED);
+ }
if (needs_backfill()) {
dout(10) << "recovery done, queuing backfill" << dendl;
queue_peering_event(
AllReplicasRecovered())));
}
} else { // backfilling
- state_clear(PG_STATE_BACKFILL);
+ state_clear(PG_STATE_BACKFILLING);
state_clear(PG_STATE_FORCED_BACKFILL);
state_clear(PG_STATE_FORCED_RECOVERY);
dout(10) << "recovery done, backfill done" << dendl;
};
auto *pg = context< SnapTrimmer >().pg;
if (pg->cct->_conf->osd_snap_trim_sleep > 0) {
- wakeup = new OnTimer{pg, pg->get_osdmap()->get_epoch()};
Mutex::Locker l(pg->osd->snap_sleep_lock);
- pg->osd->snap_sleep_timer.add_event_after(
- pg->cct->_conf->osd_snap_trim_sleep, wakeup);
+ wakeup = pg->osd->snap_sleep_timer.add_event_after(
+ pg->cct->_conf->osd_snap_trim_sleep,
+ new OnTimer{pg, pg->get_osdmap()->get_epoch()});
} else {
post_event(SnapTrimTimerReady());
}
void check_recovery_sources(const OSDMapRef& osdmap) override;
- /// @see PGBackend::delay_message_until_active
bool can_handle_while_inactive(OpRequestRef op) override;
/// @see PGBackend::handle_message
{
osd->watch_lock.Lock();
cb = new NotifyTimeoutCB(self.lock());
- osd->watch_timer.add_event_after(
- timeout,
- cb);
+ if (!osd->watch_timer.add_event_after(timeout, cb)) {
+ cb = nullptr;
+ }
osd->watch_lock.Unlock();
}
}
dout(15) << "registering callback, timeout: " << timeout << dendl;
}
cb = new HandleWatchTimeout(self.lock());
- osd->watch_timer.add_event_after(
- timeout,
- cb);
+ if (!osd->watch_timer.add_event_after(timeout, cb)) {
+ cb = nullptr;
+ }
}
void Watch::unregister_cb()
oss << "peering+";
if (state & PG_STATE_REPAIR)
oss << "repair+";
- if ((state & PG_STATE_BACKFILL_WAIT) &&
- !(state &PG_STATE_BACKFILL))
+ if (state & PG_STATE_BACKFILL_WAIT)
oss << "backfill_wait+";
- if (state & PG_STATE_BACKFILL)
+ if (state & PG_STATE_BACKFILLING)
oss << "backfilling+";
if (state & PG_STATE_FORCED_BACKFILL)
oss << "forced_backfill+";
return ret;
}
-int pg_string_state(const std::string& state)
+boost::optional<uint64_t> pg_string_state(const std::string& state)
{
- int type;
+ boost::optional<uint64_t> type;
if (state == "active")
type = PG_STATE_ACTIVE;
else if (state == "clean")
type = PG_STATE_REMAPPED;
else if (state == "deep_scrub")
type = PG_STATE_DEEP_SCRUB;
- else if (state == "backfill")
- type = PG_STATE_BACKFILL;
+ else if (state == "backfilling")
+ type = PG_STATE_BACKFILLING;
else if (state == "forced_backfill")
type = PG_STATE_FORCED_BACKFILL;
else if (state == "backfill_toofull")
else if (state == "snaptrim_error")
type = PG_STATE_SNAPTRIM_ERROR;
else
- type = -1;
+ type = boost::none;
return type;
}
#define PG_STATE_STALE (1<<17) // our state for this pg is stale, unknown.
#define PG_STATE_REMAPPED (1<<18) // pg is explicitly remapped to different OSDs than CRUSH
#define PG_STATE_DEEP_SCRUB (1<<19) // deep scrub: check CRC32 on files
-#define PG_STATE_BACKFILL (1<<20) // [active] backfilling pg content
+#define PG_STATE_BACKFILLING (1<<20) // [active] backfilling pg content
#define PG_STATE_BACKFILL_TOOFULL (1<<21) // backfill can't proceed: too full
#define PG_STATE_RECOVERY_WAIT (1<<22) // waiting for recovery reservations
#define PG_STATE_UNDERSIZED (1<<23) // pg acting < pool size
std::string pg_state_string(int state);
std::string pg_vector_string(const vector<int32_t> &a);
-int pg_string_state(const std::string& state);
+boost::optional<uint64_t> pg_string_state(const std::string& state);
/*
FLAG_WRITE_FADVISE_DONTNEED = 1<<7, // write mode with LIBRADOS_OP_FLAG_FADVISE_DONTNEED
FLAG_NOSCRUB = 1<<8, // block periodic scrub
FLAG_NODEEP_SCRUB = 1<<9, // block periodic deep-scrub
+ FLAG_FULL_NO_QUOTA = 1<<10, // pool is currently running out of quota, will set FLAG_FULL too
+ FLAG_NEARFULL = 1<<11, // pool is nearfull
+ FLAG_BACKFILLFULL = 1<<12, // pool is backfillfull
};
static const char *get_flag_name(int f) {
case FLAG_WRITE_FADVISE_DONTNEED: return "write_fadvise_dontneed";
case FLAG_NOSCRUB: return "noscrub";
case FLAG_NODEEP_SCRUB: return "nodeep-scrub";
+ case FLAG_FULL_NO_QUOTA: return "full_no_quota";
+ case FLAG_NEARFULL: return "nearfull";
+ case FLAG_BACKFILLFULL: return "backfillfull";
default: return "???";
}
}
return FLAG_NOSCRUB;
if (name == "nodeep-scrub")
return FLAG_NODEEP_SCRUB;
+ if (name == "full_no_quota")
+ return FLAG_FULL_NO_QUOTA;
+ if (name == "nearfull")
+ return FLAG_NEARFULL;
+ if (name == "backfillfull")
+ return FLAG_BACKFILLFULL;
return 0;
}
#include "include/assert.h"
#define MAX_FLUSH_UNDER_LOCK 20 ///< max bh's we start writeback on
+#define BUFFER_MEMORY_WEIGHT 12    // assumed memory footprint of a BufferHead, as a power of two (1<<12 = 4 KiB)
using std::chrono::seconds;
/// while holding the lock
flush_set_callback_arg(flush_callback_arg),
last_read_tid(0), flusher_stop(false), flusher_thread(this),finisher(cct),
stat_clean(0), stat_zero(0), stat_dirty(0), stat_rx(0), stat_tx(0),
- stat_missing(0), stat_error(0), stat_dirty_waiting(0), reads_outstanding(0)
+ stat_missing(0), stat_error(0), stat_dirty_waiting(0),
+ stat_nr_dirty_waiters(0), reads_outstanding(0)
{
perf_start();
finisher.start();
<< get_stat_clean() << ", objects: max " << max_objects
<< " current " << ob_lru.lru_get_size() << dendl;
- while (get_stat_clean() > 0 && (uint64_t) get_stat_clean() > max_size) {
+ uint64_t max_clean_bh = max_size >> BUFFER_MEMORY_WEIGHT;
+ uint64_t nr_clean_bh = bh_lru_rest.lru_get_size() - bh_lru_rest.lru_get_num_pinned();
+ while (get_stat_clean() > 0 &&
+ ((uint64_t)get_stat_clean() > max_size ||
+ nr_clean_bh > max_clean_bh)) {
BufferHead *bh = static_cast<BufferHead*>(bh_lru_rest.lru_expire());
if (!bh)
break;
bh_remove(ob, bh);
delete bh;
+ --nr_clean_bh;
+
if (ob->complete) {
ldout(cct, 10) << "trim clearing complete on " << *ob << dendl;
ob->complete = false;
// - do not wait for bytes other waiters are waiting on. This means that
// threads do not wait for each other, which effectively allows the cache
// size to balloon in proportion to the data that is in flight.
+
+ uint64_t max_dirty_bh = max_dirty >> BUFFER_MEMORY_WEIGHT;
while (get_stat_dirty() + get_stat_tx() > 0 &&
- (uint64_t) (get_stat_dirty() + get_stat_tx()) >=
- max_dirty + get_stat_dirty_waiting()) {
+ (((uint64_t)(get_stat_dirty() + get_stat_tx()) >=
+ max_dirty + get_stat_dirty_waiting()) ||
+ (dirty_or_tx_bh.size() >=
+ max_dirty_bh + get_stat_nr_dirty_waiters()))) {
+
if (blocked == 0) {
trace->event("start wait for writeback");
}
<< get_stat_dirty_waiting() << dendl;
flusher_cond.Signal();
stat_dirty_waiting += len;
+ ++stat_nr_dirty_waiters;
stat_cond.Wait(lock);
stat_dirty_waiting -= len;
+ --stat_nr_dirty_waiters;
++blocked;
ldout(cct, 10) << __func__ << " woke up" << dendl;
}
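Both of the new BufferHead-count limits are derived from the existing byte limits by charging each BufferHead a nominal 4 KiB (1 << BUFFER_MEMORY_WEIGHT); a quick check of the arithmetic:

    BUFFER_MEMORY_WEIGHT = 12  # assumed cost per BufferHead: 1 << 12 = 4096 bytes

    def max_bh(limit_bytes):
        # mirrors max_size >> BUFFER_MEMORY_WEIGHT and max_dirty >> BUFFER_MEMORY_WEIGHT
        return limit_bytes >> BUFFER_MEMORY_WEIGHT

    # a 64 MiB byte limit caps the cache at 16384 BufferHeads
    assert max_bh(64 * 1024 * 1024) == 16384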
loff_t stat_error;
loff_t stat_dirty_waiting; // bytes that writers are waiting on to write
+ size_t stat_nr_dirty_waiters;
+
void verify_stats() const;
void bh_stat_add(BufferHead *bh);
loff_t get_stat_tx() const { return stat_tx; }
loff_t get_stat_rx() const { return stat_rx; }
loff_t get_stat_dirty() const { return stat_dirty; }
- loff_t get_stat_dirty_waiting() const { return stat_dirty_waiting; }
loff_t get_stat_clean() const { return stat_clean; }
loff_t get_stat_zero() const { return stat_zero; }
+ loff_t get_stat_dirty_waiting() const { return stat_dirty_waiting; }
+ size_t get_stat_nr_dirty_waiters() const { return stat_nr_dirty_waiters; }
void touch_bh(BufferHead *bh) {
if (bh->is_dirty())
RADOS_TIMEOUT = 10
-SNAP_DIR = ".snap"
log = logging.getLogger(__name__)
CephFSVolumeClient Version History:
* 1 - Initial version
+ * 2 - Added get_object, put_object, delete_object methods to CephFSVolumeClient
"""
"""
# Current version
- version = 1
- # Earliest compatible version
- compat_version = 1
+ version = 2
# Where shall we create our volumes?
POOL_PREFIX = "fsvolume_"
# We can't query the actual cluster config remotely, but since this is
# just a heuristic we'll assume that the ceph.conf we have locally reflects
# that in use in the rest of the cluster.
- pg_warn_max_per_osd = int(self.rados.conf_get('mon_pg_warn_max_per_osd'))
+ pg_warn_max_per_osd = int(self.rados.conf_get('mon_max_pg_per_osd'))
other_pgs = 0
for pool in osd_map['pools']:
that encoded the metadata.
"""
data['compat_version'] = 1
- data['version'] = 1
+ data['version'] = self.version
return self._metadata_set(self._auth_metadata_path(auth_id), data)
def _volume_metadata_path(self, volume_path):
that encoded the metadata.
"""
data['compat_version'] = 1
- data['version'] = 1
+ data['version'] = self.version
return self._metadata_set(self._volume_metadata_path(volume_path), data)
def authorize(self, volume_path, auth_id, readonly=False, tenant_id=None):
# occurrence of wanted auth caps and no occurrence of
# conflicting auth caps.
+ if not orig:
+ return want
+
cap_tokens = set(orig.split(","))
cap_tokens.discard(unwanted)
def _snapshot_path(self, dir_path, snapshot_name):
return os.path.join(
- dir_path, SNAP_DIR, snapshot_name
+ dir_path, self.rados.conf_get('client_snapdir'), snapshot_name
)
def _snapshot_create(self, dir_path, snapshot_name):
src_snapshot_path = self._snapshot_path(self._get_path(src_volume_path), src_snapshot_name)
self._cp_r(src_snapshot_path, dest_fs_path)
+
+ def put_object(self, pool_name, object_name, data):
+ """
+ Synchronously write data to an object.
+
+ :param pool_name: name of the pool
+ :type pool_name: str
+ :param object_name: name of the object
+ :type object_name: str
+ :param data: data to write
+ :type data: bytes
+ """
+ ioctx = self.rados.open_ioctx(pool_name)
+ max_size = int(self.rados.conf_get('osd_max_write_size')) * 1024 * 1024
+ if len(data) > max_size:
+ msg = ("Data to be written to object '{0}' exceeds "
+ "{1} bytes".format(object_name, max_size))
+ log.error(msg)
+ raise CephFSVolumeClientError(msg)
+ try:
+ ioctx.write_full(object_name, data)
+ finally:
+ ioctx.close()
+
+ def get_object(self, pool_name, object_name):
+ """
+ Synchronously read data from object.
+
+ :param pool_name: name of the pool
+ :type pool_name: str
+ :param object_name: name of the object
+ :type object_name: str
+
+ :returns: bytes - data read from object
+ """
+ ioctx = self.rados.open_ioctx(pool_name)
+ max_size = int(self.rados.conf_get('osd_max_write_size')) * 1024 * 1024
+ try:
+ bytes_read = ioctx.read(object_name, max_size)
+ if ((len(bytes_read) == max_size) and
+ (ioctx.read(object_name, 1, offset=max_size))):
+            log.warning("Object '{0}' is larger than the {1} bytes that "
+                        "were read".format(object_name, max_size))
+ finally:
+ ioctx.close()
+ return bytes_read
+
+ def delete_object(self, pool_name, object_name):
+ ioctx = self.rados.open_ioctx(pool_name)
+ try:
+ ioctx.remove_object(object_name)
+ except rados.ObjectNotFound:
+ log.warn("Object '{0}' was already removed".format(object_name))
+ finally:
+ ioctx.close()
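A hedged usage sketch of the new object helpers; the client handle and pool name below are placeholders, not part of this change:

    # 'vc' is assumed to be an already-connected CephFSVolumeClient and
    # 'fsvolume_data' a placeholder pool name.
    vc.put_object('fsvolume_data', 'some_key', b'payload')
    assert vc.get_object('fsvolume_data', 'some_key') == b'payload'
    vc.delete_object('fsvolume_data', 'some_key')
    vc.delete_object('fsvolume_data', 'some_key')  # second delete only logs a warning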
--- /dev/null
+
+from module import * # NOQA
--- /dev/null
+
+"""
+Balance PG distribution across OSDs.
+"""
+
+import copy
+import errno
+import json
+import math
+import random
+import time
+from mgr_module import MgrModule, CommandResult
+from threading import Event
+
+# available modes: 'none', 'crush', 'crush-compat', 'upmap', 'osd_weight'
+default_mode = 'none'
+default_sleep_interval = 60 # seconds
+default_max_misplaced = .05 # max ratio of pgs misplaced at a time
+
+TIME_FORMAT = '%Y-%m-%d_%H:%M:%S'
+
+
+class MappingState:
+ def __init__(self, osdmap, pg_dump, desc=''):
+ self.desc = desc
+ self.osdmap = osdmap
+ self.osdmap_dump = self.osdmap.dump()
+ self.crush = osdmap.get_crush()
+ self.crush_dump = self.crush.dump()
+ self.pg_dump = pg_dump
+ self.pg_stat = {
+ i['pgid']: i['stat_sum'] for i in pg_dump.get('pg_stats', [])
+ }
+ self.poolids = [p['pool'] for p in self.osdmap_dump.get('pools', [])]
+ self.pg_up = {}
+ self.pg_up_by_poolid = {}
+ for poolid in self.poolids:
+ self.pg_up_by_poolid[poolid] = osdmap.map_pool_pgs_up(poolid)
+ for a,b in self.pg_up_by_poolid[poolid].iteritems():
+ self.pg_up[a] = b
+
+ def calc_misplaced_from(self, other_ms):
+ num = len(other_ms.pg_up)
+ misplaced = 0
+ for pgid, before in other_ms.pg_up.iteritems():
+ if before != self.pg_up.get(pgid, []):
+ misplaced += 1
+ if num > 0:
+ return float(misplaced) / float(num)
+ return 0.0
+
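calc_misplaced_from() is simply the fraction of PGs whose up set changed between two mapping states; a standalone sketch:

    def misplaced_ratio(before_up, after_up):
        # count PGs whose up set differs between the two states
        if not before_up:
            return 0.0
        moved = sum(1 for pgid, up in before_up.items()
                    if after_up.get(pgid, []) != up)
        return float(moved) / len(before_up)

    assert misplaced_ratio({'1.0': [0, 1], '1.1': [1, 2]},
                           {'1.0': [0, 1], '1.1': [2, 1]}) == 0.5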
+class Plan:
+ def __init__(self, name, ms):
+ self.mode = 'unknown'
+ self.name = name
+ self.initial = ms
+
+ self.osd_weights = {}
+ self.compat_ws = {}
+ self.inc = ms.osdmap.new_incremental()
+
+ def final_state(self):
+ self.inc.set_osd_reweights(self.osd_weights)
+ self.inc.set_crush_compat_weight_set_weights(self.compat_ws)
+ return MappingState(self.initial.osdmap.apply_incremental(self.inc),
+ self.initial.pg_dump,
+ 'plan %s final' % self.name)
+
+ def dump(self):
+ return json.dumps(self.inc.dump(), indent=4)
+
+ def show(self):
+ ls = []
+ ls.append('# starting osdmap epoch %d' % self.initial.osdmap.get_epoch())
+ ls.append('# starting crush version %d' %
+ self.initial.osdmap.get_crush_version())
+ ls.append('# mode %s' % self.mode)
+ if len(self.compat_ws) and \
+ '-1' not in self.initial.crush_dump.get('choose_args', {}):
+ ls.append('ceph osd crush weight-set create-compat')
+ for osd, weight in self.compat_ws.iteritems():
+ ls.append('ceph osd crush weight-set reweight-compat %s %f' %
+ (osd, weight))
+ for osd, weight in self.osd_weights.iteritems():
+ ls.append('ceph osd reweight osd.%d %f' % (osd, weight))
+ incdump = self.inc.dump()
+ for pgid in incdump.get('old_pg_upmap_items', []):
+ ls.append('ceph osd rm-pg-upmap-items %s' % pgid)
+ for item in incdump.get('new_pg_upmap_items', []):
+ osdlist = []
+ for m in item['mappings']:
+ osdlist += [m['from'], m['to']]
+ ls.append('ceph osd pg-upmap-items %s %s' %
+ (item['pgid'], ' '.join([str(a) for a in osdlist])))
+ return '\n'.join(ls)
+
+
+class Eval:
+ root_ids = {} # root name -> id
+ pool_name = {} # pool id -> pool name
+ pool_id = {} # pool name -> id
+ pool_roots = {} # pool name -> root name
+ root_pools = {} # root name -> pools
+ target_by_root = {} # root name -> target weight map
+ count_by_pool = {}
+ count_by_root = {}
+ actual_by_pool = {} # pool -> by_* -> actual weight map
+ actual_by_root = {} # pool -> by_* -> actual weight map
+ total_by_pool = {} # pool -> by_* -> total
+ total_by_root = {} # root -> by_* -> total
+ stats_by_pool = {} # pool -> by_* -> stddev or avg -> value
+ stats_by_root = {} # root -> by_* -> stddev or avg -> value
+
+ score_by_pool = {}
+ score_by_root = {}
+
+ score = 0.0
+
+ def __init__(self, ms):
+ self.ms = ms
+
+ def show(self, verbose=False):
+ if verbose:
+ r = self.ms.desc + '\n'
+ r += 'target_by_root %s\n' % self.target_by_root
+ r += 'actual_by_pool %s\n' % self.actual_by_pool
+ r += 'actual_by_root %s\n' % self.actual_by_root
+ r += 'count_by_pool %s\n' % self.count_by_pool
+ r += 'count_by_root %s\n' % self.count_by_root
+ r += 'total_by_pool %s\n' % self.total_by_pool
+ r += 'total_by_root %s\n' % self.total_by_root
+ r += 'stats_by_root %s\n' % self.stats_by_root
+ r += 'score_by_pool %s\n' % self.score_by_pool
+ r += 'score_by_root %s\n' % self.score_by_root
+ else:
+ r = self.ms.desc + ' '
+ r += 'score %f (lower is better)\n' % self.score
+ return r
+
+ def calc_stats(self, count, target, total):
+ num = max(len(target), 1)
+ r = {}
+ for t in ('pgs', 'objects', 'bytes'):
+ avg = float(total[t]) / float(num)
+ dev = 0.0
+
+ # score is a measure of how uneven the data distribution is.
+ # score lies between [0, 1), 0 means perfect distribution.
+ score = 0.0
+ sum_weight = 0.0
+
+ for k, v in count[t].iteritems():
+ # adjust/normalize by weight
+ if target[k]:
+ adjusted = float(v) / target[k] / float(num)
+ else:
+ adjusted = 0.0
+
+                # Overweighted devices and their weights are factors used to
+                # calculate reweight_urgency. One 10%-underfilled device with
+                # five 2%-overfilled devices is arguably a better situation
+                # than one 10%-overfilled device with five 2%-underfilled ones.
+ if adjusted > avg:
+                    '''
+                    We use F(x) = 2*phi(x) - 1 = erf(x / sqrt(2)), where phi(x)
+                    is the cdf of the standard normal distribution and
+                    x = (adjusted - avg) / avg.
+                    Since we only consider over-weighted devices, x >= 0 and
+                    phi(x) lies in [0.5, 1); the "2*phi(x) - 1" form rescales
+                    the range of F(x) to [0, 1).
+
+                    In general we need a function F(x) of x = (adjusted - avg)/avg
+                    1. that is bounded between 0 and 1, so that reweight_urgency
+                       is also bounded;
+                    2. where a larger x implies more urgency to reweight;
+                    3. whose values for different large x stay close together;
+                    4. that approaches 1 (highest urgency to reweight) steeply.
+
+                    F(x) = 1 - e^(-x) would also work, but it converges to 1
+                    more slowly than the function in use.
+
+                    cdf of standard normal distribution: https://stackoverflow.com/a/29273201
+                    '''
+ score += target[k] * (math.erf(((adjusted - avg)/avg) / math.sqrt(2.0)))
+ sum_weight += target[k]
+ dev += (avg - adjusted) * (avg - adjusted)
+ stddev = math.sqrt(dev / float(max(num - 1, 1)))
+ score = score / max(sum_weight, 1)
+ r[t] = {
+ 'avg': avg,
+ 'stddev': stddev,
+ 'sum_weight': sum_weight,
+ 'score': score,
+ }
+ return r
+
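A worked check of the urgency function used in calc_stats(), F(x) = erf(x / sqrt(2)), which equals 2*phi(x) - 1 for the standard normal cdf phi:

    import math

    def urgency(adjusted, avg):
        # F(x) = erf(x / sqrt(2)) with x = (adjusted - avg) / avg, adjusted > avg
        x = (adjusted - avg) / avg
        return math.erf(x / math.sqrt(2.0))

    # a device 10% over its fair share scores noticeably higher than one 2% over
    assert round(urgency(1.10, 1.0), 4) == 0.0797
    assert round(urgency(1.02, 1.0), 4) == 0.0160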
+class Module(MgrModule):
+ COMMANDS = [
+ {
+ "cmd": "balancer status",
+ "desc": "Show balancer status",
+ "perm": "r",
+ },
+ {
+ "cmd": "balancer mode name=mode,type=CephChoices,strings=none|crush-compat|upmap",
+ "desc": "Set balancer mode",
+ "perm": "rw",
+ },
+ {
+ "cmd": "balancer on",
+ "desc": "Enable automatic balancing",
+ "perm": "rw",
+ },
+ {
+ "cmd": "balancer off",
+ "desc": "Disable automatic balancing",
+ "perm": "rw",
+ },
+ {
+ "cmd": "balancer eval name=plan,type=CephString,req=false",
+ "desc": "Evaluate data distribution for the current cluster or specific plan",
+ "perm": "r",
+ },
+ {
+ "cmd": "balancer eval-verbose name=plan,type=CephString,req=false",
+ "desc": "Evaluate data distribution for the current cluster or specific plan (verbosely)",
+ "perm": "r",
+ },
+ {
+ "cmd": "balancer optimize name=plan,type=CephString",
+ "desc": "Run optimizer to create a new plan",
+ "perm": "rw",
+ },
+ {
+ "cmd": "balancer show name=plan,type=CephString",
+ "desc": "Show details of an optimization plan",
+ "perm": "r",
+ },
+ {
+ "cmd": "balancer rm name=plan,type=CephString",
+ "desc": "Discard an optimization plan",
+ "perm": "rw",
+ },
+ {
+ "cmd": "balancer reset",
+ "desc": "Discard all optimization plans",
+ "perm": "rw",
+ },
+ {
+ "cmd": "balancer dump name=plan,type=CephString",
+ "desc": "Show an optimization plan",
+ "perm": "r",
+ },
+ {
+ "cmd": "balancer execute name=plan,type=CephString",
+ "desc": "Execute an optimization plan",
+ "perm": "r",
+ },
+ ]
+ active = False
+ run = True
+ plans = {}
+ mode = ''
+
+ def __init__(self, *args, **kwargs):
+ super(Module, self).__init__(*args, **kwargs)
+ self.event = Event()
+
+ def handle_command(self, command):
+ self.log.warn("Handling command: '%s'" % str(command))
+ if command['prefix'] == 'balancer status':
+ s = {
+ 'plans': self.plans.keys(),
+ 'active': self.active,
+ 'mode': self.get_config('mode', default_mode),
+ }
+ return (0, json.dumps(s, indent=4), '')
+ elif command['prefix'] == 'balancer mode':
+ self.set_config('mode', command['mode'])
+ return (0, '', '')
+ elif command['prefix'] == 'balancer on':
+ if not self.active:
+ self.set_config('active', '1')
+ self.active = True
+ self.event.set()
+ return (0, '', '')
+ elif command['prefix'] == 'balancer off':
+ if self.active:
+ self.set_config('active', '')
+ self.active = False
+ self.event.set()
+ return (0, '', '')
+ elif command['prefix'] == 'balancer eval' or command['prefix'] == 'balancer eval-verbose':
+ verbose = command['prefix'] == 'balancer eval-verbose'
+ if 'plan' in command:
+ plan = self.plans.get(command['plan'])
+ if not plan:
+ return (-errno.ENOENT, '', 'plan %s not found' %
+ command['plan'])
+ ms = plan.final_state()
+ else:
+ ms = MappingState(self.get_osdmap(),
+ self.get("pg_dump"),
+ 'current cluster')
+ return (0, self.evaluate(ms, verbose=verbose), '')
+ elif command['prefix'] == 'balancer optimize':
+ plan = self.plan_create(command['plan'])
+ self.optimize(plan)
+ return (0, '', '')
+ elif command['prefix'] == 'balancer rm':
+            self.plan_rm(command['plan'])
+ return (0, '', '')
+ elif command['prefix'] == 'balancer reset':
+ self.plans = {}
+ return (0, '', '')
+ elif command['prefix'] == 'balancer dump':
+ plan = self.plans.get(command['plan'])
+ if not plan:
+ return (-errno.ENOENT, '', 'plan %s not found' % command['plan'])
+ return (0, plan.dump(), '')
+ elif command['prefix'] == 'balancer show':
+ plan = self.plans.get(command['plan'])
+ if not plan:
+ return (-errno.ENOENT, '', 'plan %s not found' % command['plan'])
+ return (0, plan.show(), '')
+ elif command['prefix'] == 'balancer execute':
+ plan = self.plans.get(command['plan'])
+ if not plan:
+ return (-errno.ENOENT, '', 'plan %s not found' % command['plan'])
+ self.execute(plan)
+            self.plan_rm(plan.name)
+ return (0, '', '')
+ else:
+ return (-errno.EINVAL, '',
+ "Command not found '{0}'".format(command['prefix']))
+
+ def shutdown(self):
+ self.log.info('Stopping')
+ self.run = False
+ self.event.set()
+
+ def time_in_interval(self, tod, begin, end):
+ if begin <= end:
+ return tod >= begin and tod < end
+ else:
+ return tod >= begin or tod < end
+
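time_in_interval() deliberately handles scheduling windows that wrap past midnight; zero-padded HHMM strings compare correctly as plain strings:

    def time_in_interval(tod, begin, end):
        # same logic as the method above: [begin, end), possibly wrapping midnight
        if begin <= end:
            return begin <= tod < end
        return tod >= begin or tod < end

    assert time_in_interval('1200', '0900', '1700')      # plain window
    assert time_in_interval('0130', '2300', '0600')      # wraps midnight
    assert not time_in_interval('1200', '2300', '0600')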
+ def serve(self):
+ self.log.info('Starting')
+ while self.run:
+            self.active = self.get_config('active', '') != ''
+ begin_time = self.get_config('begin_time') or '0000'
+ end_time = self.get_config('end_time') or '2400'
+ timeofday = time.strftime('%H%M', time.localtime())
+ self.log.debug('Waking up [%s, scheduled for %s-%s, now %s]',
+ "active" if self.active else "inactive",
+ begin_time, end_time, timeofday)
+ sleep_interval = float(self.get_config('sleep_interval',
+ default_sleep_interval))
+ if self.active and self.time_in_interval(timeofday, begin_time, end_time):
+ self.log.debug('Running')
+ name = 'auto_%s' % time.strftime(TIME_FORMAT, time.gmtime())
+ plan = self.plan_create(name)
+ if self.optimize(plan):
+ self.execute(plan)
+ self.plan_rm(name)
+ self.log.debug('Sleeping for %d', sleep_interval)
+ self.event.wait(sleep_interval)
+ self.event.clear()
+
+ def plan_create(self, name):
+ plan = Plan(name, MappingState(self.get_osdmap(),
+ self.get("pg_dump"),
+ 'plan %s initial' % name))
+ self.plans[name] = plan
+ return plan
+
+ def plan_rm(self, name):
+ if name in self.plans:
+ del self.plans[name]
+
+ def calc_eval(self, ms):
+ pe = Eval(ms)
+ pool_rule = {}
+ pool_info = {}
+ for p in ms.osdmap_dump.get('pools',[]):
+ pe.pool_name[p['pool']] = p['pool_name']
+ pe.pool_id[p['pool_name']] = p['pool']
+ pool_rule[p['pool_name']] = p['crush_rule']
+ pe.pool_roots[p['pool_name']] = []
+ pool_info[p['pool_name']] = p
+ pools = pe.pool_id.keys()
+ if len(pools) == 0:
+ return pe
+ self.log.debug('pool_name %s' % pe.pool_name)
+ self.log.debug('pool_id %s' % pe.pool_id)
+ self.log.debug('pools %s' % pools)
+ self.log.debug('pool_rule %s' % pool_rule)
+
+ osd_weight = { a['osd']: a['weight']
+ for a in ms.osdmap_dump.get('osds',[]) }
+
+ # get expected distributions by root
+ actual_by_root = {}
+ rootids = ms.crush.find_takes()
+ roots = []
+ for rootid in rootids:
+ root = ms.crush.get_item_name(rootid)
+ pe.root_ids[root] = rootid
+ roots.append(root)
+ ls = ms.osdmap.get_pools_by_take(rootid)
+ pe.root_pools[root] = []
+ for poolid in ls:
+ pe.pool_roots[pe.pool_name[poolid]].append(root)
+ pe.root_pools[root].append(pe.pool_name[poolid])
+ weight_map = ms.crush.get_take_weight_osd_map(rootid)
+ adjusted_map = {
+ osd: cw * osd_weight.get(osd, 1.0)
+ for osd,cw in weight_map.iteritems()
+ }
+ sum_w = sum(adjusted_map.values()) or 1.0
+ pe.target_by_root[root] = { osd: w / sum_w
+ for osd,w in adjusted_map.iteritems() }
+ actual_by_root[root] = {
+ 'pgs': {},
+ 'objects': {},
+ 'bytes': {},
+ }
+ for osd in pe.target_by_root[root].iterkeys():
+ actual_by_root[root]['pgs'][osd] = 0
+ actual_by_root[root]['objects'][osd] = 0
+ actual_by_root[root]['bytes'][osd] = 0
+ pe.total_by_root[root] = {
+ 'pgs': 0,
+ 'objects': 0,
+ 'bytes': 0,
+ }
+ self.log.debug('pool_roots %s' % pe.pool_roots)
+ self.log.debug('root_pools %s' % pe.root_pools)
+ self.log.debug('target_by_root %s' % pe.target_by_root)
+
+ # pool and root actual
+ for pool, pi in pool_info.iteritems():
+ poolid = pi['pool']
+ pm = ms.pg_up_by_poolid[poolid]
+ pgs = 0
+ objects = 0
+ bytes = 0
+ pgs_by_osd = {}
+ objects_by_osd = {}
+ bytes_by_osd = {}
+ for root in pe.pool_roots[pool]:
+ for osd in pe.target_by_root[root].iterkeys():
+ pgs_by_osd[osd] = 0
+ objects_by_osd[osd] = 0
+ bytes_by_osd[osd] = 0
+ for pgid, up in pm.iteritems():
+ for osd in [int(osd) for osd in up]:
+ pgs_by_osd[osd] += 1
+ objects_by_osd[osd] += ms.pg_stat[pgid]['num_objects']
+ bytes_by_osd[osd] += ms.pg_stat[pgid]['num_bytes']
+ # pick a root to associate this pg instance with.
+ # note that this is imprecise if the roots have
+ # overlapping children.
+ # FIXME: divide bytes by k for EC pools.
+ for root in pe.pool_roots[pool]:
+ if osd in pe.target_by_root[root]:
+ actual_by_root[root]['pgs'][osd] += 1
+ actual_by_root[root]['objects'][osd] += ms.pg_stat[pgid]['num_objects']
+ actual_by_root[root]['bytes'][osd] += ms.pg_stat[pgid]['num_bytes']
+ pgs += 1
+ objects += ms.pg_stat[pgid]['num_objects']
+ bytes += ms.pg_stat[pgid]['num_bytes']
+ pe.total_by_root[root]['pgs'] += 1
+ pe.total_by_root[root]['objects'] += ms.pg_stat[pgid]['num_objects']
+ pe.total_by_root[root]['bytes'] += ms.pg_stat[pgid]['num_bytes']
+ break
+ pe.count_by_pool[pool] = {
+ 'pgs': {
+ k: v
+ for k, v in pgs_by_osd.iteritems()
+ },
+ 'objects': {
+ k: v
+ for k, v in objects_by_osd.iteritems()
+ },
+ 'bytes': {
+ k: v
+ for k, v in bytes_by_osd.iteritems()
+ },
+ }
+ pe.actual_by_pool[pool] = {
+ 'pgs': {
+ k: float(v) / float(max(pgs, 1))
+ for k, v in pgs_by_osd.iteritems()
+ },
+ 'objects': {
+ k: float(v) / float(max(objects, 1))
+ for k, v in objects_by_osd.iteritems()
+ },
+ 'bytes': {
+ k: float(v) / float(max(bytes, 1))
+ for k, v in bytes_by_osd.iteritems()
+ },
+ }
+ pe.total_by_pool[pool] = {
+ 'pgs': pgs,
+ 'objects': objects,
+ 'bytes': bytes,
+ }
+ for root, m in pe.total_by_root.iteritems():
+ pe.count_by_root[root] = {
+ 'pgs': {
+ k: float(v)
+ for k, v in actual_by_root[root]['pgs'].iteritems()
+ },
+ 'objects': {
+ k: float(v)
+ for k, v in actual_by_root[root]['objects'].iteritems()
+ },
+ 'bytes': {
+ k: float(v)
+ for k, v in actual_by_root[root]['bytes'].iteritems()
+ },
+ }
+ pe.actual_by_root[root] = {
+ 'pgs': {
+ k: float(v) / float(max(pe.total_by_root[root]['pgs'], 1))
+ for k, v in actual_by_root[root]['pgs'].iteritems()
+ },
+ 'objects': {
+ k: float(v) / float(max(pe.total_by_root[root]['objects'], 1))
+ for k, v in actual_by_root[root]['objects'].iteritems()
+ },
+ 'bytes': {
+ k: float(v) / float(max(pe.total_by_root[root]['bytes'], 1))
+ for k, v in actual_by_root[root]['bytes'].iteritems()
+ },
+ }
+ self.log.debug('actual_by_pool %s' % pe.actual_by_pool)
+ self.log.debug('actual_by_root %s' % pe.actual_by_root)
+
+ # average and stddev and score
+ pe.stats_by_root = {
+ a: pe.calc_stats(
+ b,
+ pe.target_by_root[a],
+ pe.total_by_root[a]
+ ) for a, b in pe.count_by_root.iteritems()
+ }
+
+ # the scores are already normalized
+ pe.score_by_root = {
+ r: {
+ 'pgs': pe.stats_by_root[r]['pgs']['score'],
+ 'objects': pe.stats_by_root[r]['objects']['score'],
+ 'bytes': pe.stats_by_root[r]['bytes']['score'],
+ } for r in pe.total_by_root.keys()
+ }
+
+ # total score is just average of normalized stddevs
+ pe.score = 0.0
+ for r, vs in pe.score_by_root.iteritems():
+ for k, v in vs.iteritems():
+ pe.score += v
+ pe.score /= 3 * len(roots)
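+        # e.g. two roots scoring 0.1 on each of pgs/objects/bytes sum to
+        # 0.6, which normalizes back to an overall score of 0.1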
+ return pe
+
+ def evaluate(self, ms, verbose=False):
+ pe = self.calc_eval(ms)
+ return pe.show(verbose=verbose)
+
+ def optimize(self, plan):
+ self.log.info('Optimize plan %s' % plan.name)
+ plan.mode = self.get_config('mode', default_mode)
+ max_misplaced = float(self.get_config('max_misplaced',
+ default_max_misplaced))
+ self.log.info('Mode %s, max misplaced %f' %
+ (plan.mode, max_misplaced))
+
+ info = self.get('pg_status')
+ unknown = info.get('unknown_pgs_ratio', 0.0)
+ degraded = info.get('degraded_ratio', 0.0)
+ inactive = info.get('inactive_pgs_ratio', 0.0)
+ misplaced = info.get('misplaced_ratio', 0.0)
+ self.log.debug('unknown %f degraded %f inactive %f misplaced %g',
+ unknown, degraded, inactive, misplaced)
+ if unknown > 0.0:
+ self.log.info('Some PGs (%f) are unknown; waiting', unknown)
+ elif degraded > 0.0:
+ self.log.info('Some objects (%f) are degraded; waiting', degraded)
+ elif inactive > 0.0:
+ self.log.info('Some PGs (%f) are inactive; waiting', inactive)
+ elif misplaced >= max_misplaced:
+ self.log.info('Too many objects (%f > %f) are misplaced; waiting',
+ misplaced, max_misplaced)
+ else:
+ if plan.mode == 'upmap':
+ return self.do_upmap(plan)
+ elif plan.mode == 'crush-compat':
+ return self.do_crush_compat(plan)
+ elif plan.mode == 'none':
+ self.log.info('Idle')
+ else:
+ self.log.info('Unrecognized mode %s' % plan.mode)
+ return False
+
+ ##
+
+ def do_upmap(self, plan):
+ self.log.info('do_upmap')
+        # config values come back as strings; coerce before use
+        max_iterations = int(self.get_config('upmap_max_iterations', 10))
+        max_deviation = float(self.get_config('upmap_max_deviation', .01))
+
+ ms = plan.initial
+ pools = [str(i['pool_name']) for i in ms.osdmap_dump.get('pools',[])]
+ if len(pools) == 0:
+ self.log.info('no pools, nothing to do')
+ return False
+ # shuffle pool list so they all get equal (in)attention
+ random.shuffle(pools)
+ self.log.info('pools %s' % pools)
+
+ inc = plan.inc
+ total_did = 0
+ left = max_iterations
+ for pool in pools:
+ did = ms.osdmap.calc_pg_upmaps(inc, max_deviation, left, [pool])
+ total_did += did
+ left -= did
+ if left <= 0:
+ break
+ self.log.info('prepared %d/%d changes' % (total_did, max_iterations))
+ return True
+
+ def do_crush_compat(self, plan):
+ self.log.info('do_crush_compat')
+        max_iterations = int(self.get_config('crush_compat_max_iterations', 25))
+        if max_iterations < 1:
+            return False
+        step = float(self.get_config('crush_compat_step', .5))
+ if step <= 0 or step >= 1.0:
+ return False
+ max_misplaced = float(self.get_config('max_misplaced',
+ default_max_misplaced))
+ min_pg_per_osd = 2
+
+ ms = plan.initial
+ osdmap = ms.osdmap
+ crush = osdmap.get_crush()
+ pe = self.calc_eval(ms)
+ if pe.score == 0:
+ self.log.info('Distribution is already perfect')
+ return False
+
+ # get current osd reweights
+ orig_osd_weight = { a['osd']: a['weight']
+ for a in ms.osdmap_dump.get('osds',[]) }
+ reweighted_osds = [ a for a,b in orig_osd_weight.iteritems()
+ if b < 1.0 and b > 0.0 ]
+
+ # get current compat weight-set weights
+ orig_ws = self.get_compat_weight_set_weights()
+ orig_ws = { a: b for a, b in orig_ws.iteritems() if a >= 0 }
+
+ # Make sure roots don't overlap their devices. If so, we
+ # can't proceed.
+ roots = pe.target_by_root.keys()
+ self.log.debug('roots %s', roots)
+ visited = {}
+ overlap = {}
+ root_ids = {}
+ for root, wm in pe.target_by_root.iteritems():
+ for osd in wm.iterkeys():
+ if osd in visited:
+ overlap[osd] = 1
+ visited[osd] = 1
+ if len(overlap) > 0:
+            self.log.error('some osds belong to multiple subtrees: %s' %
+                           overlap)
+ return False
+
+ key = 'pgs' # pgs objects or bytes
+
+ # go
+ best_ws = copy.deepcopy(orig_ws)
+ best_ow = copy.deepcopy(orig_osd_weight)
+ best_pe = pe
+ left = max_iterations
+ bad_steps = 0
+ next_ws = copy.deepcopy(best_ws)
+ next_ow = copy.deepcopy(best_ow)
+ while left > 0:
+ # adjust
+ self.log.debug('best_ws %s' % best_ws)
+ random.shuffle(roots)
+ for root in roots:
+ pools = best_pe.root_pools[root]
+ pgs = len(best_pe.target_by_root[root])
+ min_pgs = pgs * min_pg_per_osd
+                if best_pe.total_by_root[root]['pgs'] < min_pgs:
+                    self.log.info('Skipping root %s (pools %s), total pgs %d '
+                                  '< minimum %d (%d per osd)',
+                                  root, pools,
+                                  best_pe.total_by_root[root]['pgs'],
+                                  min_pgs, min_pg_per_osd)
+ continue
+ self.log.info('Balancing root %s (pools %s) by %s' %
+ (root, pools, key))
+ target = best_pe.target_by_root[root]
+ actual = best_pe.actual_by_root[root][key]
+ queue = sorted(actual.keys(),
+ key=lambda osd: -abs(target[osd] - actual[osd]))
+ for osd in queue:
+ if orig_osd_weight[osd] == 0:
+ self.log.debug('skipping out osd.%d', osd)
+ else:
+ deviation = target[osd] - actual[osd]
+ if deviation == 0:
+ break
+ self.log.debug('osd.%d deviation %f', osd, deviation)
+ weight = best_ws[osd]
+ ow = orig_osd_weight[osd]
+ if actual[osd] > 0:
+ calc_weight = target[osd] / actual[osd] * weight * ow
+ else:
+ # not enough to go on here... keep orig weight
+ calc_weight = weight / orig_osd_weight[osd]
+ new_weight = weight * (1.0 - step) + calc_weight * step
+ self.log.debug('Reweight osd.%d %f -> %f', osd, weight,
+ new_weight)
+ next_ws[osd] = new_weight
+ if ow < 1.0:
+ new_ow = min(1.0, max(step + (1.0 - step) * ow,
+ ow + .005))
+ self.log.debug('Reweight osd.%d reweight %f -> %f',
+ osd, ow, new_ow)
+ next_ow[osd] = new_ow
+
+ # normalize weights under this root
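+            # keeping the weight-set sum equal to the root's CRUSH weight
+            # means each step only shifts relative weights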
+ root_weight = crush.get_item_weight(pe.root_ids[root])
+ root_sum = sum(b for a,b in next_ws.iteritems()
+ if a in target.keys())
+ if root_sum > 0 and root_weight > 0:
+ factor = root_sum / root_weight
+ self.log.debug('normalizing root %s %d, weight %f, '
+ 'ws sum %f, factor %f',
+ root, pe.root_ids[root], root_weight,
+ root_sum, factor)
+ for osd in actual.keys():
+ next_ws[osd] = next_ws[osd] / factor
+
+ # recalc
+ plan.compat_ws = copy.deepcopy(next_ws)
+ next_ms = plan.final_state()
+ next_pe = self.calc_eval(next_ms)
+ next_misplaced = next_ms.calc_misplaced_from(ms)
+ self.log.debug('Step result score %f -> %f, misplacing %f',
+ best_pe.score, next_pe.score, next_misplaced)
+
+ if next_misplaced > max_misplaced:
+ if best_pe.score < pe.score:
+ self.log.debug('Step misplaced %f > max %f, stopping',
+ next_misplaced, max_misplaced)
+ break
+ step /= 2.0
+ next_ws = copy.deepcopy(best_ws)
+ next_ow = copy.deepcopy(best_ow)
+ self.log.debug('Step misplaced %f > max %f, reducing step to %f',
+ next_misplaced, max_misplaced, step)
+ else:
+ if next_pe.score > best_pe.score * 1.0001:
+                if bad_steps < 5 and random.randint(0, 100) < 70:
+                    # count the bad step so we eventually back off
+                    bad_steps += 1
+                    self.log.debug('Score got worse, taking another step')
+ else:
+ step /= 2.0
+ next_ws = copy.deepcopy(best_ws)
+ next_ow = copy.deepcopy(best_ow)
+ self.log.debug('Score got worse, trying smaller step %f',
+ step)
+ else:
+ bad_steps = 0
+ best_pe = next_pe
+ best_ws = next_ws
+ best_ow = next_ow
+ if best_pe.score == 0:
+ break
+ left -= 1
+
+ # allow a small regression if we are phasing out osd weights
+ fudge = 0
+ if next_ow != orig_osd_weight:
+ fudge = .001
+
+ if best_pe.score < pe.score + fudge:
+ self.log.info('Success, score %f -> %f', pe.score, best_pe.score)
+ plan.compat_ws = best_ws
+ for osd, w in best_ow.iteritems():
+ if w != orig_osd_weight[osd]:
+ self.log.debug('osd.%d reweight %f', osd, w)
+ plan.osd_weights[osd] = w
+ return True
+ else:
+ self.log.info('Failed to find further optimization, score %f',
+ pe.score)
+ return False
+
+ def get_compat_weight_set_weights(self):
+ # enable compat weight-set
+ self.log.debug('ceph osd crush weight-set create-compat')
+ result = CommandResult('')
+ self.send_command(result, 'mon', '', json.dumps({
+ 'prefix': 'osd crush weight-set create-compat',
+ 'format': 'json',
+ }), '')
+ r, outb, outs = result.wait()
+ if r != 0:
+ self.log.error('Error creating compat weight-set')
+ return
+
+ result = CommandResult('')
+ self.send_command(result, 'mon', '', json.dumps({
+ 'prefix': 'osd crush dump',
+ 'format': 'json',
+ }), '')
+ r, outb, outs = result.wait()
+ if r != 0:
+ self.log.error('Error dumping crush map')
+ return
+ try:
+ crushmap = json.loads(outb)
+        except ValueError:
+ raise RuntimeError('unable to parse crush map')
+
+ raw = crushmap.get('choose_args',{}).get('-1', [])
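+        # each choose_args entry pairs a bucket id with weight_set vectors;
+        # position i in weight_set[0] lines up with bucket['items'][i]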
+ weight_set = {}
+ for b in raw:
+ bucket = None
+ for t in crushmap['buckets']:
+ if t['id'] == b['bucket_id']:
+ bucket = t
+ break
+ if not bucket:
+ raise RuntimeError('could not find bucket %s' % b['bucket_id'])
+ self.log.debug('bucket items %s' % bucket['items'])
+ self.log.debug('weight set %s' % b['weight_set'][0])
+ if len(bucket['items']) != len(b['weight_set'][0]):
+ raise RuntimeError('weight-set size does not match bucket items')
+ for pos in range(len(bucket['items'])):
+ weight_set[bucket['items'][pos]['id']] = b['weight_set'][0][pos]
+
+ self.log.debug('weight_set weights %s' % weight_set)
+ return weight_set
+
+ def do_crush(self):
+ self.log.info('do_crush (not yet implemented)')
+
+ def do_osd_weight(self):
+ self.log.info('do_osd_weight (not yet implemented)')
+
+ def execute(self, plan):
+ self.log.info('Executing plan %s' % plan.name)
+
+ commands = []
+
+ # compat weight-set
+ if len(plan.compat_ws) and \
+ '-1' not in plan.initial.crush_dump.get('choose_args', {}):
+ self.log.debug('ceph osd crush weight-set create-compat')
+ result = CommandResult('')
+ self.send_command(result, 'mon', '', json.dumps({
+ 'prefix': 'osd crush weight-set create-compat',
+ 'format': 'json',
+ }), '')
+ r, outb, outs = result.wait()
+ if r != 0:
+ self.log.error('Error creating compat weight-set')
+ return
+
+ for osd, weight in plan.compat_ws.iteritems():
+ self.log.info('ceph osd crush weight-set reweight-compat osd.%d %f',
+ osd, weight)
+ result = CommandResult('foo')
+ self.send_command(result, 'mon', '', json.dumps({
+ 'prefix': 'osd crush weight-set reweight-compat',
+ 'format': 'json',
+ 'item': 'osd.%d' % osd,
+ 'weight': [weight],
+ }), 'foo')
+ commands.append(result)
+
+ # new_weight
+ reweightn = {}
+ for osd, weight in plan.osd_weights.iteritems():
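+            # osd weights are 16.16 fixed-point on the wire: 1.0 -> 0x10000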
+ reweightn[str(osd)] = str(int(weight * float(0x10000)))
+ if len(reweightn):
+ self.log.info('ceph osd reweightn %s', reweightn)
+ result = CommandResult('foo')
+ self.send_command(result, 'mon', '', json.dumps({
+ 'prefix': 'osd reweightn',
+ 'format': 'json',
+ 'weights': json.dumps(reweightn),
+ }), 'foo')
+ commands.append(result)
+
+ # upmap
+ incdump = plan.inc.dump()
+ for pgid in incdump.get('old_pg_upmap_items', []):
+ self.log.info('ceph osd rm-pg-upmap-items %s', pgid)
+ result = CommandResult('foo')
+ self.send_command(result, 'mon', '', json.dumps({
+ 'prefix': 'osd rm-pg-upmap-items',
+ 'format': 'json',
+ 'pgid': pgid,
+ }), 'foo')
+ commands.append(result)
+
+ for item in incdump.get('new_pg_upmap_items', []):
+ self.log.info('ceph osd pg-upmap-items %s mappings %s', item['pgid'],
+ item['mappings'])
+ osdlist = []
+ for m in item['mappings']:
+ osdlist += [m['from'], m['to']]
+ result = CommandResult('foo')
+ self.send_command(result, 'mon', '', json.dumps({
+ 'prefix': 'osd pg-upmap-items',
+ 'format': 'json',
+ 'pgid': item['pgid'],
+ 'id': osdlist,
+ }), 'foo')
+ commands.append(result)
+
+ # wait for commands
+ self.log.debug('commands %s' % commands)
+ for result in commands:
+ r, outb, outs = result.wait()
+ if r != 0:
+ self.log.error('Error on command')
+ return
+ self.log.debug('done')
<meta content="width=device-width, initial-scale=1, maximum-scale=1, user-scalable=no"
name="viewport">
<link rel="stylesheet"
- href="/static/AdminLTE-2.3.7/bootstrap/css/bootstrap.min.css">
+ href="{{ url_prefix }}/static/AdminLTE-2.3.7/bootstrap/css/bootstrap.min.css">
<link rel="stylesheet"
href="https://cdnjs.cloudflare.com/ajax/libs/font-awesome/4.5.0/css/font-awesome.min.css">
<link rel="stylesheet"
href="https://cdnjs.cloudflare.com/ajax/libs/ionicons/2.0.1/css/ionicons.min.css">
<link rel="stylesheet"
- href="/static/AdminLTE-2.3.7/dist/css/AdminLTE.min.css">
+ href="{{ url_prefix }}/static/AdminLTE-2.3.7/dist/css/AdminLTE.min.css">
<link rel="stylesheet"
- href="/static/AdminLTE-2.3.7/dist/css/skins/skin-blue.min.css">
+ href="{{ url_prefix }}/static/AdminLTE-2.3.7/dist/css/skins/skin-blue.min.css">
<link rel="stylesheet"
- href="/static/AdminLTE-2.3.7/plugins/datatables/jquery.dataTables.css">
+ href="{{ url_prefix }}/static/AdminLTE-2.3.7/plugins/datatables/jquery.dataTables.css">
- <script src="/static/AdminLTE-2.3.7/plugins/jQuery/jquery-2.2.3.min.js"></script>
- <script src="/static/AdminLTE-2.3.7/plugins/sparkline/jquery.sparkline.min.js"></script>
+ <script src="{{ url_prefix }}/static/AdminLTE-2.3.7/plugins/jQuery/jquery-2.2.3.min.js"></script>
+ <script src="{{ url_prefix }}/static/AdminLTE-2.3.7/plugins/sparkline/jquery.sparkline.min.js"></script>
- <script src="/static/rivets.bundled.min.js"></script>
- <script src="/static/underscore-min.js"></script>
+ <script src="{{ url_prefix }}/static/rivets.bundled.min.js"></script>
+ <script src="{{ url_prefix }}/static/underscore-min.js"></script>
- <script src="/static/AdminLTE-2.3.7/bootstrap/js/bootstrap.min.js"></script>
- <script src="/static/AdminLTE-2.3.7/dist/js/app.min.js"></script>
- <script src="/static/AdminLTE-2.3.7/plugins/datatables/jquery.dataTables.min.js"></script>
+ <script src="{{ url_prefix }}/static/AdminLTE-2.3.7/bootstrap/js/bootstrap.min.js"></script>
+ <script src="{{ url_prefix }}/static/AdminLTE-2.3.7/dist/js/app.min.js"></script>
+ <script src="{{ url_prefix }}/static/AdminLTE-2.3.7/plugins/datatables/jquery.dataTables.min.js"></script>
<script src="https://cdnjs.cloudflare.com/ajax/libs/moment.js/2.17.1/moment.min.js"></script>
<script src="https://cdnjs.cloudflare.com/ajax/libs/Chart.js/2.4.0/Chart.min.js"></script>
var refresh_interval = 5000;
var refresh = function() {
- $.get("/toplevel_data", function(data) {
+ $.get("{{ url_prefix }}/toplevel_data", function(data) {
_.extend(toplevel_data, data);
setTimeout(refresh, refresh_interval);
});
var width=4;
var unit = 0;
+ if (n == null) {
+ // People shouldn't really be passing null, but let's
+ // do something sensible instead of barfing.
+ return "-";
+ }
+
while (Math.floor(n / (divisor**unit)).toString().length > width - 1) {
unit = unit + 1;
}
});
</script>
- <link rel="shortcut icon" href="http://ceph.com/wp-content/themes/ceph/favicon.ico">
- <link rel="shortcut icon" href="/static/favicon.ico">
+ <link rel="shortcut icon" href="https://ceph.com/wp-content/themes/cephTheme/Resources/Favicons/favicon-96x96.png">
+ <link rel="shortcut icon" href="{{ url_prefix }}/static/favicon.ico">
<style>
div.box {
<!-- Main Header -->
<header class="main-header">
<!-- Logo -->
- <a href="/" class="logo">
+ <a href="{{ url_prefix }}/" class="logo">
<span class="logo-lg">
- <img src="/static/Ceph_Logo_Standard_RGB_White_120411_fa.png"
+ <img src="{{ url_prefix }}/static/Ceph_Logo_Standard_RGB_White_120411_fa.png"
width="123px" height="34px"/>
</span>
<span class="logo-mini">
- <img src="/static/logo-mini.png"
+ <img src="{{ url_prefix }}/static/logo-mini.png"
width="34px" height="34px"/>
</span>
</a>
<ul class="sidebar-menu">
<!-- Optionally, you can add icons to the links -->
<li class="{%if path_info=='/' or path_info.startswith('/health')%}active{%endif%}">
- <a href="/health">
+ <a href="{{ url_prefix }}/health">
<i class="fa fa-heartbeat" rv-style="health_status | health_color"></i>
<span>Cluster health</span></a>
</li>
</a>
<ul class="treeview-menu menu-open">
<li>
- <a href="/servers">Servers</a>
+ <a href="{{ url_prefix }}/servers">Servers</a>
</li>
<li>
- <a href="/osd">OSDs</a>
+ <a href="{{ url_prefix }}/osd">OSDs</a>
</li>
</ul>
</li>
</a>
<ul class="treeview-menu menu-open">
<li>
- <a href="/rbd_mirroring">
+ <a href="{{ url_prefix }}/rbd_mirroring">
<i class="fa fa-exchange"></i> Mirroring
<span class="pull-right-container">
<small rv-hide="rbd_mirroring.warnings | hide_count_box" class="label pull-right bg-yellow">{rbd_mirroring.warnings}</small>
</a>
</li>
<li>
- <a href="/rbd_iscsi">
+ <a href="{{ url_prefix }}/rbd_iscsi">
<i class="fa fa-upload"></i> iSCSI
<span class="pull-right-container" />
</a>
<strong>Copyright © 2016 by Ceph Contributors.</strong> Free software (LGPL 2.1)
</footer>
- <!-- Control Sidebar -->
- <aside class="control-sidebar control-sidebar-dark">
- <!-- Create the tabs -->
- <ul class="nav nav-tabs nav-justified control-sidebar-tabs">
- <li class="active"><a href="#control-sidebar-home-tab" data-toggle="tab"><i class="fa fa-home"></i></a></li>
- <li><a href="#control-sidebar-settings-tab" data-toggle="tab"><i class="fa fa-gears"></i></a></li>
- </ul>
- <!-- Tab panes -->
- <div class="tab-content">
- <!-- Home tab content -->
- <div class="tab-pane active" id="control-sidebar-home-tab">
- <h3 class="control-sidebar-heading">Recent Activity</h3>
- <ul class="control-sidebar-menu">
- <li>
- <a href="javascript::;">
- <i class="menu-icon fa fa-birthday-cake bg-red"></i>
-
- <div class="menu-info">
- <h4 class="control-sidebar-subheading">Langdon's Birthday</h4>
-
- <p>Will be 23 on April 24th</p>
- </div>
- </a>
- </li>
- </ul>
- <!-- /.control-sidebar-menu -->
-
- <h3 class="control-sidebar-heading">Tasks Progress</h3>
- <ul class="control-sidebar-menu">
- <li>
- <a href="javascript::;">
- <h4 class="control-sidebar-subheading">
- Custom Template Design
- <span class="pull-right-container">
- <span class="label label-danger pull-right">70%</span>
- </span>
- </h4>
-
- <div class="progress progress-xxs">
- <div class="progress-bar progress-bar-danger" style="width: 70%"></div>
- </div>
- </a>
- </li>
- </ul>
- <!-- /.control-sidebar-menu -->
-
- </div>
- <!-- /.tab-pane -->
- <!-- Stats tab content -->
- <div class="tab-pane" id="control-sidebar-stats-tab">Stats Tab Content</div>
- <!-- /.tab-pane -->
- <!-- Settings tab content -->
- <div class="tab-pane" id="control-sidebar-settings-tab">
- <form method="post">
- <h3 class="control-sidebar-heading">General Settings</h3>
-
- <div class="form-group">
- <label class="control-sidebar-subheading">
- Report panel usage
- <input type="checkbox" class="pull-right" checked>
- </label>
-
- <p>
- Some information about this general settings option
- </p>
- </div>
- <!-- /.form-group -->
- </form>
- </div>
- <!-- /.tab-pane -->
- </div>
- </aside>
- <!-- /.control-sidebar -->
- <!-- Add the sidebar's background. This div must be placed
- immediately after the control sidebar -->
- <div class="control-sidebar-bg"></div>
-
</div>
<!--
var content_data = {{ content_data }};
var refresh = function() {
- $.get("/clients_data/" + content_data.fscid + "/", function(data) {
+ $.get("{{ url_prefix }}/clients_data/" + content_data.fscid + "/", function(data) {
content_data.clients = data;
setTimeout(refresh, 5000);
});
var content_data = {{ content_data }};
var refresh = function() {
- $.get("/filesystem_data/" + content_data.fs_status.filesystem.id + "/", function(data) {
+ $.get("{{ url_prefix }}/filesystem_data/" + content_data.fs_status.filesystem.id + "/", function(data) {
_.extend(content_data.fs_status, data);
setTimeout(refresh, 5000);
});
var rhs_transform = delta_timeseries;
var draw_chart = function() {
- $.get("/mds_counters/" + content_data.fs_status.filesystem.id + "/", function(data) {
+ $.get("{{ url_prefix }}/mds_counters/" + content_data.fs_status.filesystem.id + "/", function(data) {
var top_chart = true;
// Cull any chart elements that correspond to MDSs no
<div class="box-body">
<table>
<thead>
- <tr>
+ <tr rv-show="standbys | length">
<th>Daemon</th>
</tr>
</thead>
<tr rv-each-standby="standbys">
<td>{standby.name}</td>
</tr>
+ <tr class="ceph-none-found" rv-hide="standbys | length">
+ <td>None found</td>
+ </tr>
</tbody>
</table>
</div>
};
rivets.formatters.pg_status_style = function(pg_status) {
-    var unhealthy = false;
-    var scrubbing = false;
+    var style = "color: #00bb00";
$.each(pg_status, function(state, count) {
-        if (state == "active+clean") {
-
-        } else if (state == "active+clean+scrubbing"
-            || state == "active+clean+scrubbing+deep") {
-            scrubbing = true;
+        if (state == "active+clean"
+            || state == "active+clean+scrubbing"
+            || state == "active+clean+scrubbing+deep") {
+            // healthy; keep the default green style
} else {
-            unhealthy = true;
+            // a return here would only exit the $.each callback, not
+            // the formatter, so record the style in a variable instead
+            style = "color: #FFC200";
}
});
-
-    if (unhealthy) {
-        return "color: #FFC200";
-    } else if (scrubbing) {
-        return "color: #0000bb";
-    } else {
-        return "color: #00bb00";
-    }
+    return style;
};
rivets.formatters.pg_status = function(pg_status) {
rivets.bind($("#content"), content_data);
var refresh = function() {
- $.get("/health_data", function(data) {
+ $.get("{{ url_prefix }}/health_data", function(data) {
_.extend(content_data, data);
draw_usage_charts();
setTimeout(refresh, 5000);
import sys
import time
import threading
+import socket
import cherrypy
import jinja2
-from mgr_module import MgrModule, CommandResult
+from mgr_module import MgrModule, MgrStandbyModule, CommandResult
from types import OsdMap, NotFound, Config, FsMap, MonMap, \
PgSummary, Health, MonStatus
LOG_BUFFER_SIZE = 30
# cherrypy likes to sys.exit on error. don't let it take us down too!
-def os_exit_noop():
+def os_exit_noop(*args, **kwargs):
pass
os._exit = os_exit_noop
log.info("%s %d (%s)" % (path, sys.getrefcount(root), root.__class__))
+def get_prefixed_url(url):
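+    # e.g. with url_prefix '/ceph', '/health' becomes '/ceph/health'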
+ return global_instance().url_prefix + url
+
+
+
+class StandbyModule(MgrStandbyModule):
+ def serve(self):
+ server_addr = self.get_localized_config('server_addr', '::')
+ server_port = self.get_localized_config('server_port', '7000')
+ if server_addr is None:
+ raise RuntimeError('no server_addr configured; try "ceph config-key set mgr/dashboard/server_addr <ip>"')
+ log.info("server_addr: %s server_port: %s" % (server_addr, server_port))
+ cherrypy.config.update({
+ 'server.socket_host': server_addr,
+ 'server.socket_port': int(server_port),
+ 'engine.autoreload.on': False
+ })
+
+ current_dir = os.path.dirname(os.path.abspath(__file__))
+ jinja_loader = jinja2.FileSystemLoader(current_dir)
+ env = jinja2.Environment(loader=jinja_loader)
+
+ module = self
+
+ class Root(object):
+ @cherrypy.expose
+ def index(self):
+ active_uri = module.get_active_uri()
+ if active_uri:
+ log.info("Redirecting to active '{0}'".format(active_uri))
+ raise cherrypy.HTTPRedirect(active_uri)
+ else:
+ template = env.get_template("standby.html")
+ return template.render(delay=5)
+
+ cherrypy.tree.mount(Root(), "/", {})
+ log.info("Starting engine...")
+ cherrypy.engine.start()
+ log.info("Waiting for engine...")
+ cherrypy.engine.wait(state=cherrypy.engine.states.STOPPED)
+ log.info("Engine done.")
+
+ def shutdown(self):
+ log.info("Stopping server...")
+ cherrypy.engine.wait(state=cherrypy.engine.states.STARTED)
+ cherrypy.engine.stop()
+ log.info("Stopped server")
+
class Module(MgrModule):
def __init__(self, *args, **kwargs):
self.pool_stats = defaultdict(lambda: defaultdict(
lambda: collections.deque(maxlen=10)))
+ # A prefix for all URLs to use the dashboard with a reverse http proxy
+ self.url_prefix = ''
+
@property
def rados(self):
"""
if self._rados:
return self._rados
- from mgr_module import ceph_state
- ctx_capsule = ceph_state.get_context()
+ ctx_capsule = self.get_context()
self._rados = rados.Rados(context=ctx_capsule)
self._rados.connect()
"id": fs_id,
"name": mdsmap['fs_name'],
"client_count": client_count,
- "clients_url": "/clients/{0}/".format(fs_id),
+ "clients_url": get_prefixed_url("/clients/{0}/".format(fs_id)),
"ranks": rank_table,
"pools": pools_table
},
rbd_pools = sorted([
{
"name": name,
- "url": "/rbd_pool/{0}/".format(name)
+ "url": get_prefixed_url("/rbd_pool/{0}/".format(name))
}
for name in data
], key=lambda k: k['name'])
{
"id": f['id'],
"name": f['mdsmap']['fs_name'],
- "url": "/filesystem/{0}/".format(f['id'])
+ "url": get_prefixed_url("/filesystem/{0}/".format(f['id']))
}
for f in fsmap.data['filesystems']
]
}
return template.render(
+ url_prefix = global_instance().url_prefix,
ceph_version=global_instance().version,
path_info=cherrypy.request.path_info,
toplevel_data=json.dumps(toplevel_data, indent=2),
"clients": clients,
"fs_name": fs_name,
"fscid": fscid,
- "fs_url": "/filesystem/" + fscid_str + "/"
+ "fs_url": get_prefixed_url("/filesystem/" + fscid_str + "/")
}
template = env.get_template("clients.html")
return template.render(
+ url_prefix = global_instance().url_prefix,
ceph_version=global_instance().version,
path_info=cherrypy.request.path_info,
toplevel_data=json.dumps(self._toplevel_data(), indent=2),
}
return template.render(
+ url_prefix = global_instance().url_prefix,
ceph_version=global_instance().version,
path_info=cherrypy.request.path_info,
toplevel_data=json.dumps(toplevel_data, indent=2),
content_data = self._rbd_mirroring()
return template.render(
+ url_prefix = global_instance().url_prefix,
ceph_version=global_instance().version,
path_info=cherrypy.request.path_info,
toplevel_data=json.dumps(toplevel_data, indent=2),
content_data = self._rbd_iscsi()
return template.render(
+ url_prefix = global_instance().url_prefix,
ceph_version=global_instance().version,
path_info=cherrypy.request.path_info,
toplevel_data=json.dumps(toplevel_data, indent=2),
def health(self):
template = env.get_template("health.html")
return template.render(
+ url_prefix = global_instance().url_prefix,
ceph_version=global_instance().version,
path_info=cherrypy.request.path_info,
toplevel_data=json.dumps(self._toplevel_data(), indent=2),
def servers(self):
template = env.get_template("servers.html")
return template.render(
+ url_prefix = global_instance().url_prefix,
ceph_version=global_instance().version,
path_info=cherrypy.request.path_info,
toplevel_data=json.dumps(self._toplevel_data(), indent=2),
ret[k1][k2] = sorted_dict
return ret
+        url_prefix = self.get_config('url_prefix')
+        if url_prefix is None:
+            url_prefix = ''
+        elif len(url_prefix) != 0:
+            # ensure exactly one leading '/' and no trailing '/'
+            if url_prefix[0] != '/':
+                url_prefix = '/' + url_prefix
+            if url_prefix[-1] == '/':
+                url_prefix = url_prefix[:-1]
+        self.url_prefix = url_prefix
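+        # e.g. 'ceph', '/ceph' and '/ceph/' all normalize to '/ceph'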
+
server_addr = self.get_localized_config('server_addr', '::')
server_port = self.get_localized_config('server_port', '7000')
if server_addr is None:
'engine.autoreload.on': False
})
+ osdmap = self.get_osdmap()
+ log.info("latest osdmap is %d" % osdmap.get_epoch())
+
+ # Publish the URI that others may use to access the service we're
+ # about to start serving
+ self.set_uri("http://{0}:{1}/".format(
+ socket.getfqdn() if server_addr == "::" else server_addr,
+ server_port
+ ))
+
static_dir = os.path.join(current_dir, 'static')
conf = {
"/static": {
toplevel_data = self._toplevel_data()
return template.render(
+ url_prefix = global_instance().url_prefix,
ceph_version=global_instance().version,
path_info='/osd' + cherrypy.request.path_info,
toplevel_data=json.dumps(toplevel_data, indent=2),
result['up'] = osd_info['up']
result['in'] = osd_info['in']
- result['url'] = "/osd/perf/{0}".format(osd_id)
+ result['url'] = get_prefixed_url("/osd/perf/{0}".format(osd_id))
return result
for server in servers:
hostname = server['hostname']
services = server['services']
- first = True
for s in services:
if s["type"] == "osd":
osd_id = int(s["id"])
summary = self._osd_summary(osd_id,
osd_map.osds_by_id[osd_id])
- if first:
- # A little helper for rendering
- summary['first'] = True
- first = False
result[hostname].append(summary)
+ result[hostname].sort(key=lambda a: a['id'])
+ if len(result[hostname]):
+ result[hostname][0]['first'] = True
+
global_instance().log.warn("result.size {0} servers.size {1}".format(
len(result), len(servers)
))
# Return list form for convenience of rendering
- return result.items()
+ return sorted(result.items(), key=lambda a: a[0])
@cherrypy.expose
def index(self):
}
return template.render(
+ url_prefix = global_instance().url_prefix,
ceph_version=global_instance().version,
path_info='/osd' + cherrypy.request.path_info,
toplevel_data=json.dumps(toplevel_data, indent=2),
content_data=json.dumps(content_data, indent=2)
)
- cherrypy.tree.mount(Root(), "/", conf)
- cherrypy.tree.mount(OSDEndpoint(), "/osd", conf)
+ cherrypy.tree.mount(Root(), get_prefixed_url("/"), conf)
+ cherrypy.tree.mount(OSDEndpoint(), get_prefixed_url("/osd"), conf)
- log.info("Starting engine...")
+ log.info("Starting engine on {0}:{1}...".format(
+ server_addr, server_port))
cherrypy.engine.start()
log.info("Waiting for engine...")
cherrypy.engine.block()
post_load();
var refresh = function() {
- $.get("/osd/perf_data/" + content_data.osd.osd + "/", function(data) {
+ $.get("{{ url_prefix }}/osd/perf_data/" + content_data.osd.osd + "/", function(data) {
_.extend(content_data.osd_histogram, data.osd_histogram);
_.extend(content_data.osd, data.osd);
_.extend(content_data.osd_metadata, data.osd_metadata);
var content_data = {{ content_data }};
var refresh = function() {
- $.get("/osd/list_data/", function(data) {
+ $.get("{{ url_prefix }}/osd/list_data/", function(data) {
content_data.osds_by_server = data;
$('.inlinesparkline').sparkline();
setTimeout(refresh, 5000);
var content_data = {{ content_data }};
var refresh = function() {
- $.get("/rbd_iscsi_data", function(data) {
+ $.get("{{ url_prefix }}/rbd_iscsi_data", function(data) {
_.extend(content_data, data);
setTimeout(refresh, 30000);
});
class RbdPoolLs(RemoteViewCache):
def _get(self):
- from mgr_module import ceph_state
- ctx_capsule = ceph_state.get_context()
+ ctx_capsule = self._module.get_context()
+
osd_map = self._module.get_sync_object(OsdMap).data
osd_pools = [pool['pool_name'] for pool in osd_map['pools']]
var content_data = {{ content_data }};
var refresh = function() {
- $.get("/rbd_mirroring_data", function(data) {
+ $.get("{{ url_prefix }}/rbd_mirroring_data", function(data) {
_.extend(content_data, data);
setTimeout(refresh, 30000);
});
var content_data = {{ content_data }};
var refresh = function() {
- $.get("/rbd_pool_data/" + content_data.pool_name + "/", function(data) {
+ $.get("{{ url_prefix }}/rbd_pool_data/" + content_data.pool_name + "/", function(data) {
content_data.images = data;
setTimeout(refresh, 10000);
});
var content_data = {{ content_data }};
var refresh = function() {
- $.get("/servers_data", function(data) {
+ $.get("{{ url_prefix }}/servers_data", function(data) {
_.extend(content_data, data);
setTimeout(refresh, 5000);
});
--- /dev/null
+
+<html>
+ <!-- Note: this is only displayed when the standby
+ does not know an active URI to redirect to, otherwise
+ a simple redirect is returned instead -->
+ <head>
+ <title>Ceph</title>
+ <meta http-equiv="refresh" content="{{delay}}">
+ </head>
+ <body>
+ No active ceph-mgr instance is currently running
+ the dashboard. A failover may be in progress.
+ Retrying in {{delay}} seconds...
+ </body>
+</html>
--- /dev/null
+from module import * # NOQA
--- /dev/null
+
+from datetime import datetime
+from threading import Event
+import json
+import errno
+
+from mgr_module import MgrModule
+
+try:
+ from influxdb import InfluxDBClient
+ from influxdb.exceptions import InfluxDBClientError
+except ImportError:
+ InfluxDBClient = None
+
+class Module(MgrModule):
+ COMMANDS = [
+ {
+ "cmd": "influx self-test",
+ "desc": "debug the module",
+ "perm": "rw"
+ },
+ ]
+
+
+ def __init__(self, *args, **kwargs):
+ super(Module, self).__init__(*args, **kwargs)
+ self.event = Event()
+ self.run = True
+
+
+ def get_latest(self, daemon_type, daemon_name, stat):
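+        # get_counter returns {stat: [[timestamp, value], ...]}; use the
+        # newest sample, or 0 if nothing has been collected yet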
+ data = self.get_counter(daemon_type, daemon_name, stat)[stat]
+ if data:
+ return data[-1][1]
+ else:
+ return 0
+
+
+ def get_df_stats(self):
+ df = self.get("df")
+ data = []
+
+ df_types = [
+ 'bytes_used',
+ 'dirty',
+ 'rd_bytes',
+ 'raw_bytes_used',
+ 'wr_bytes',
+ 'objects',
+ 'max_avail'
+ ]
+
+ for df_type in df_types:
+ for pool in df['pools']:
+ point = {
+ "measurement": "ceph_pool_stats",
+ "tags": {
+ "pool_name" : pool['name'],
+ "pool_id" : pool['id'],
+ "type_instance" : df_type,
+ "mgr_id" : self.get_mgr_id(),
+ },
+ "time" : datetime.utcnow().isoformat() + 'Z',
+ "fields": {
+ "value" : pool['stats'][df_type],
+ }
+ }
+ data.append(point)
+ return data
+
+ def get_daemon_stats(self):
+ data = []
+
+ for daemon, counters in self.get_all_perf_counters().iteritems():
+ svc_type, svc_id = daemon.split(".")
+ metadata = self.get_metadata(svc_type, svc_id)
+
+ for path, counter_info in counters.items():
+ if counter_info['type'] & self.PERFCOUNTER_HISTOGRAM:
+ continue
+
+ value = counter_info['value']
+
+ data.append({
+ "measurement": "ceph_daemon_stats",
+ "tags": {
+ "ceph_daemon": daemon,
+ "type_instance": path,
+ "host": metadata['hostname']
+ },
+ "time": datetime.utcnow().isoformat() + 'Z',
+ "fields": {
+ "value": value
+ }
+ })
+
+ return data
+
+ def send_to_influx(self):
+ host = self.get_config("hostname")
+ if not host:
+            self.log.error("No InfluxDB server configured; please set the "
+                           "`hostname` configuration key.")
+ return
+
+ port = int(self.get_config("port", default="8086"))
+ database = self.get_config("database", default="ceph")
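+        # settings come from the mgr config-key store, e.g.:
+        #   ceph config-key set mgr/influx/hostname <influxdb host>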
+
+ # If influx server has authentication turned off then
+ # missing username/password is valid.
+ username = self.get_config("username", default="")
+ password = self.get_config("password", default="")
+
+ client = InfluxDBClient(host, port, username, password, database)
+
+        # The influx client's get_list_database() requires admin privs;
+        # instead, catch the "not found" exception and inform the user if
+        # the database can't be created.
+ try:
+ client.write_points(self.get_df_stats(), 'ms')
+ client.write_points(self.get_daemon_stats(), 'ms')
+ except InfluxDBClientError as e:
+ if e.code == 404:
+                self.log.info("Database '{0}' not found, trying to create it "
+                              "(requires admin privs). You can also create it "
+                              "manually and grant write privs to user "
+                              "'{1}'".format(database, username))
+ client.create_database(database)
+ else:
+ raise
+
+ def shutdown(self):
+ self.log.info('Stopping influx module')
+ self.run = False
+ self.event.set()
+
+ def handle_command(self, cmd):
+ if cmd['prefix'] == 'influx self-test':
+ daemon_stats = self.get_daemon_stats()
+ assert len(daemon_stats)
+ df_stats = self.get_df_stats()
+ result = {
+ 'daemon_stats': daemon_stats,
+ 'df_stats': df_stats
+ }
+ return 0, json.dumps(result, indent=2), 'Self-test OK'
+ else:
+ return (-errno.EINVAL, '',
+ "Command not found '{0}'".format(cmd['prefix']))
+
+ def serve(self):
+ if InfluxDBClient is None:
+ self.log.error("Cannot transmit statistics: influxdb python "
+ "module not found. Did you install it?")
+ return
+
+ self.log.info('Starting influx module')
+ self.run = True
+ while self.run:
+ self.send_to_influx()
+ self.log.debug("Running interval loop")
+            interval = self.get_config("interval")
+            if interval is None:
+                interval = 5
+            else:
+                # config values are stored as strings
+                interval = int(interval)
+            self.log.debug("sleeping for %d seconds", interval)
+ self.event.wait(interval)
+
--- /dev/null
+
+from module import * # NOQA
--- /dev/null
+from mgr_module import MgrModule, CommandResult
+import json
+import threading
+
+class Module(MgrModule):
+ def __init__(self, *args, **kwargs):
+ super(Module, self).__init__(*args, **kwargs)
+ self.serve_event = threading.Event()
+
+ def notify(self, notify_type, notify_id):
+ if notify_type == 'osd_map':
+ self.handle_osd_map()
+
+ def handle_osd_map(self):
+ """
+ Check pools on each OSDMap change
+ """
+ subtree_type = self.get_config('subtree') or 'rack'
+ failure_domain = self.get_config('failure_domain') or 'host'
+ pg_num = self.get_config('pg_num') or '128'
+ num_rep = self.get_config('num_rep') or '3'
+ min_size = self.get_config('min_size')
+ prefix = self.get_config('prefix') or 'by-' + subtree_type + '-'
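+        # e.g. with the defaults, racks 'r1' and 'r2' in the CRUSH tree
+        # yield replicated pools 'by-rack-r1' and 'by-rack-r2'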
+
+ osdmap = self.get("osd_map")
+ lpools = []
+ for pool in osdmap['pools']:
+            if pool['pool_name'].startswith(prefix):
+ lpools.append(pool['pool_name'])
+
+ self.log.debug('localized pools = %s', lpools)
+ subtrees = []
+ tree = self.get('osd_map_tree')
+ for node in tree['nodes']:
+ if node['type'] == subtree_type:
+ subtrees.append(node['name'])
+ pool_name = prefix + node['name']
+ if pool_name not in lpools:
+ self.log.info('Creating localized pool %s', pool_name)
+                    # create a replicated CRUSH rule scoped to this subtree
+ result = CommandResult("")
+ self.send_command(result, "mon", "", json.dumps({
+ "prefix": "osd crush rule create-replicated",
+ "format": "json",
+ "name": pool_name,
+ "root": node['name'],
+ "type": failure_domain,
+ }), "")
+ r, outb, outs = result.wait()
+
+ result = CommandResult("")
+ self.send_command(result, "mon", "", json.dumps({
+ "prefix": "osd pool create",
+ "format": "json",
+ "pool": pool_name,
+ 'rule': pool_name,
+ 'erasure_code_profile': pool_name,
+ "pool_type": 'replicated',
+ 'pg_num': str(pg_num),
+ }), "")
+ r, outb, outs = result.wait()
+
+ result = CommandResult("")
+ self.send_command(result, "mon", "", json.dumps({
+ "prefix": "osd pool set",
+ "format": "json",
+ "pool": pool_name,
+ 'var': 'size',
+ "val": str(num_rep),
+ }), "")
+ r, outb, outs = result.wait()
+
+ if min_size:
+ result = CommandResult("")
+ self.send_command(result, "mon", "", json.dumps({
+ "prefix": "osd pool set",
+ "format": "json",
+ "pool": pool_name,
+ 'var': 'min_size',
+ "val": str(min_size),
+ }), "")
+ r, outb, outs = result.wait()
+
+ # TODO remove pools for hosts that don't exist?
+
+ def serve(self):
+ self.handle_osd_map()
+ self.serve_event.wait()
+ self.serve_event.clear()
+
+ def shutdown(self):
+ self.serve_event.set()
-import ceph_state #noqa
+import ceph_module # noqa
+#import ceph_osdmap #noqa
+#import ceph_osdmap_incremental #noqa
+#import ceph_crushmap #noqa
+
import json
import logging
import threading
+from collections import defaultdict
+
+
+class CPlusPlusHandler(logging.Handler):
+ def __init__(self, module_inst):
+ super(CPlusPlusHandler, self).__init__()
+ self._module = module_inst
+
+ def emit(self, record):
+ if record.levelno <= logging.DEBUG:
+ ceph_level = 20
+ elif record.levelno <= logging.INFO:
+ ceph_level = 4
+ elif record.levelno <= logging.WARNING:
+ ceph_level = 1
+ else:
+ ceph_level = 0
+
+ self._module._ceph_log(ceph_level, self.format(record))
+
+
+def configure_logger(module_inst, name):
+ logger = logging.getLogger(name)
+
+ # Don't filter any logs at the python level, leave it to C++
+ logger.setLevel(logging.DEBUG)
+
+ # FIXME: we should learn the log level from C++ land, and then
+ # avoid calling the C++ level log when we know a message is of
+ # an insufficient level to be ultimately output
+ logger.addHandler(CPlusPlusHandler(module_inst))
+
+    return logger
+
+
+def unconfigure_logger(module_inst, name):
+ logger = logging.getLogger(name)
+ rm_handlers = [h for h in logger.handlers if isinstance(h, CPlusPlusHandler)]
+ for h in rm_handlers:
+ logger.removeHandler(h)
+
class CommandResult(object):
"""
Use with MgrModule.send_command
return self.r, self.outb, self.outs
-class MgrModule(object):
+class OSDMap(ceph_module.BasePyOSDMap):
+ def get_epoch(self):
+ return self._get_epoch()
+
+ def get_crush_version(self):
+ return self._get_crush_version()
+
+ def dump(self):
+ return self._dump()
+
+ def new_incremental(self):
+ return self._new_incremental()
+
+ def apply_incremental(self, inc):
+ return self._apply_incremental(inc)
+
+ def get_crush(self):
+ return self._get_crush()
+
+ def get_pools_by_take(self, take):
+ return self._get_pools_by_take(take).get('pools', [])
+
+ def calc_pg_upmaps(self, inc,
+ max_deviation=.01, max_iterations=10, pools=[]):
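+        # fills 'inc' (an OSDMapIncremental) with pg-upmap-items changes
+        # and returns the number of changes prepared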
+ return self._calc_pg_upmaps(
+ inc,
+ max_deviation, max_iterations, pools)
+
+ def map_pool_pgs_up(self, poolid):
+ return self._map_pool_pgs_up(poolid)
+
+class OSDMapIncremental(ceph_module.BasePyOSDMapIncremental):
+ def get_epoch(self):
+ return self._get_epoch()
+
+ def dump(self):
+ return self._dump()
+
+ def set_osd_reweights(self, weightmap):
+ """
+ weightmap is a dict, int to float. e.g. { 0: .9, 1: 1.0, 3: .997 }
+ """
+ return self._set_osd_reweights(weightmap)
+
+ def set_crush_compat_weight_set_weights(self, weightmap):
+ """
+ weightmap is a dict, int to float. devices only. e.g.,
+ { 0: 3.4, 1: 3.3, 2: 3.334 }
+ """
+ return self._set_crush_compat_weight_set_weights(weightmap)
+
+class CRUSHMap(ceph_module.BasePyCRUSH):
+ def dump(self):
+ return self._dump()
+
+ def get_item_weight(self, item):
+ return self._get_item_weight(item)
+
+ def get_item_name(self, item):
+ return self._get_item_name(item)
+
+ def find_takes(self):
+ return self._find_takes().get('takes', [])
+
+ def get_take_weight_osd_map(self, root):
+ uglymap = self._get_take_weight_osd_map(root)
+ return { int(k): v for k, v in uglymap.get('weights', {}).iteritems() }
+
+class MgrStandbyModule(ceph_module.BaseMgrStandbyModule):
+ """
+    Standby modules only implement a serve and shutdown method; they
+    are not permitted to implement commands and they do not receive
+    any notifications.
+
+    They only have access to the mgrmap (for accessing service URI info
+ from their active peer), and to configuration settings (read only).
+ """
+
+ def __init__(self, module_name, capsule):
+ super(MgrStandbyModule, self).__init__(capsule)
+ self.module_name = module_name
+ self._logger = configure_logger(self, module_name)
+
+ def __del__(self):
+ unconfigure_logger(self, self.module_name)
+
+ @property
+ def log(self):
+ return self._logger
+
+ def serve(self):
+ """
+ The serve method is mandatory for standby modules.
+ :return:
+ """
+ raise NotImplementedError()
+
+ def get_mgr_id(self):
+ return self._ceph_get_mgr_id()
+
+ def get_config(self, key):
+ return self._ceph_get_config(key)
+
+ def get_active_uri(self):
+ return self._ceph_get_active_uri()
+
+ def get_localized_config(self, key, default=None):
+ r = self.get_config(self.get_mgr_id() + '/' + key)
+ if r is None:
+ r = self.get_config(key)
+
+ if r is None:
+ r = default
+ return r
+
+class MgrModule(ceph_module.BaseMgrModule):
COMMANDS = []
- def __init__(self, handle):
- self._handle = handle
- self._logger = logging.getLogger(handle)
+ # Priority definitions for perf counters
+ PRIO_CRITICAL = 10
+ PRIO_INTERESTING = 8
+ PRIO_USEFUL = 5
+ PRIO_UNINTERESTING = 2
+ PRIO_DEBUGONLY = 0
+
+ # counter value types
+ PERFCOUNTER_TIME = 1
+ PERFCOUNTER_U64 = 2
+
+ # counter types
+ PERFCOUNTER_LONGRUNAVG = 4
+ PERFCOUNTER_COUNTER = 8
+ PERFCOUNTER_HISTOGRAM = 0x10
+ PERFCOUNTER_TYPE_MASK = ~2
- # Don't filter any logs at the python level, leave it to C++
- self._logger.setLevel(logging.DEBUG)
+ def __init__(self, module_name, py_modules_ptr, this_ptr):
+ self.module_name = module_name
- # FIXME: we should learn the log level from C++ land, and then
- # avoid calling ceph_state.log when we know a message is of
- # an insufficient level to be ultimately output
+ # If we're taking over from a standby module, let's make sure
+ # its logger was unconfigured before we hook ours up
+ unconfigure_logger(self, self.module_name)
+ self._logger = configure_logger(self, module_name)
- class CPlusPlusHandler(logging.Handler):
- def emit(self, record):
- if record.levelno <= logging.DEBUG:
- ceph_level = 20
- elif record.levelno <= logging.INFO:
- ceph_level = 4
- elif record.levelno <= logging.WARNING:
- ceph_level = 1
- else:
- ceph_level = 0
+ super(MgrModule, self).__init__(py_modules_ptr, this_ptr)
- ceph_state.log(handle, ceph_level, self.format(record))
+ self._version = self._ceph_get_version()
- self._logger.addHandler(CPlusPlusHandler())
+ self._perf_schema_cache = None
- self._version = ceph_state.get_version()
+ def __del__(self):
+ unconfigure_logger(self, self.module_name)
+
+ def update_perf_schema(self, daemon_type, daemon_name):
+ """
+ For plugins that use get_all_perf_counters, call this when
+ receiving a notification of type 'perf_schema_update', to
+ prompt MgrModule to update its cache of counter schemas.
+
+ :param daemon_type:
+ :param daemon_name:
+ :return:
+ """
@property
def log(self):
def version(self):
return self._version
+ def get_context(self):
+ """
+ :return: a Python capsule containing a C++ CephContext pointer
+ """
+ return self._ceph_get_context()
+
def notify(self, notify_type, notify_id):
"""
Called by the ceph-mgr service to notify the Python plugin
"""
Called by the plugin to load some cluster state from ceph-mgr
"""
- return ceph_state.get(self._handle, data_name)
+ return self._ceph_get(data_name)
def get_server(self, hostname):
"""
:param hostname: a hostame
"""
- return ceph_state.get_server(self._handle, hostname)
+ return self._ceph_get_server(hostname)
def get_perf_schema(self, svc_type, svc_name):
"""
:param svc_name:
:return: list of dicts describing the counters requested
"""
- return ceph_state.get_perf_schema(self._handle, svc_type, svc_name)
+ return self._ceph_get_perf_schema(svc_type, svc_name)
def get_counter(self, svc_type, svc_name, path):
"""
:param path:
:return: A list of two-element lists containing time and value
"""
- return ceph_state.get_counter(self._handle, svc_type, svc_name, path)
+ return self._ceph_get_counter(svc_type, svc_name, path)
def list_servers(self):
"""
Like ``get_server``, but instead of returning information
about just one node, return all the nodes in an array.
"""
- return ceph_state.get_server(self._handle, None)
+ return self._ceph_get_server(None)
def get_metadata(self, svc_type, svc_id):
"""
:param svc_id: string
:return: dict
"""
- return ceph_state.get_metadata(self._handle, svc_type, svc_id)
+ return self._ceph_get_metadata(svc_type, svc_id)
def get_daemon_status(self, svc_type, svc_id):
"""
:param svc_id: string
:return: dict
"""
- return ceph_state.get_daemon_status(self._handle, svc_type, svc_id)
+ return self._ceph_get_daemon_status(svc_type, svc_id)
def send_command(self, *args, **kwargs):
"""
Called by the plugin to send a command to the mon
cluster.
"""
- ceph_state.send_command(self._handle, *args, **kwargs)
+ self._ceph_send_command(*args, **kwargs)
def set_health_checks(self, checks):
"""
:param list: dict of health check dicts
"""
- ceph_state.set_health_checks(self._handle, checks)
+ self._ceph_set_health_checks(checks)
def handle_command(self, cmd):
"""
:return: str
"""
- return ceph_state.get_mgr_id()
+ return self._ceph_get_mgr_id()
- def get_config(self, key):
+ def get_config(self, key, default=None):
"""
Retrieve the value of a persistent configuration setting
:param key: str
:return: str
"""
- return ceph_state.get_config(self._handle, key)
+ r = self._ceph_get_config(key)
+ if r is None:
+ return default
+ else:
+ return r
def get_config_prefix(self, key_prefix):
"""
:param key_prefix: str
:return: str
"""
- return ceph_state.get_config_prefix(self._handle, key_prefix)
+ return self._ceph_get_config_prefix(key_prefix)
def get_localized_config(self, key, default=None):
"""
:param key: str
:param val: str
"""
- ceph_state.set_config(self._handle, key, val)
+ self._ceph_set_config(key, val)
def set_localized_config(self, key, val):
"""
:param default: str
:return: str
"""
- return self.set_config(self.get_mgr_id() + '/' + key, val)
+ return self._ceph_set_config(self.get_mgr_id() + '/' + key, val)
def set_config_json(self, key, val):
"""
:param key: str
:param val: json-serializable object
"""
- self.set_config(key, json.dumps(val))
+ self._ceph_set_config(key, json.dumps(val))
def get_config_json(self, key):
"""
:return: bool
"""
pass
+
+ def get_osdmap(self):
+ """
+        Get a handle to the latest OSDMap.
+ :return: OSDMap
+ """
+ return self._ceph_get_osdmap()
+
+ def get_all_perf_counters(self, prio_limit=PRIO_USEFUL):
+ """
+ Return the perf counters currently known to this ceph-mgr
+ instance, filtered by priority equal to or greater than `prio_limit`.
+
+        The result is a map of string to dict, associating services
+ (like "osd.123") with their counters. The counter
+ dict for each service maps counter paths to a counter
+ info structure, which is the information from
+ the schema, plus an additional "value" member with the latest
+ value.
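+
+        Example shape (hypothetical names and values)::
+
+            {
+                "osd.123": {
+                    "osd.op_w": {..schema fields.., "value": 1234},
+                },
+            }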
+ """
+
+ result = defaultdict(dict)
+
+ # TODO: improve C++->Python interface to return just
+ # the latest if that's all we want.
+ def get_latest(daemon_type, daemon_name, counter):
+ data = self.get_counter(daemon_type, daemon_name, counter)[counter]
+ if data:
+ return data[-1][1]
+ else:
+ return 0
+
+ for server in self.list_servers():
+ for service in server['services']:
+ if service['type'] not in ("mds", "osd", "mon"):
+ continue
+
+ schema = self.get_perf_schema(service['type'], service['id'])
+ if not schema:
+ self.log.warn("No perf counter schema for {0}.{1}".format(
+ service['type'], service['id']
+ ))
+ continue
+
+ # Value is returned in a potentially-multi-service format,
+ # get just the service we're asking about
+ svc_full_name = "{0}.{1}".format(service['type'], service['id'])
+ schema = schema[svc_full_name]
+
+ # Populate latest values
+ for counter_path, counter_schema in schema.items():
+ # self.log.debug("{0}: {1}".format(
+ # counter_path, json.dumps(counter_schema)
+ # ))
+ if counter_schema['priority'] < prio_limit:
+ continue
+
+ counter_info = counter_schema
+ counter_info['value'] = get_latest(service['type'], service['id'], counter_path)
+ result[svc_full_name][counter_path] = counter_info
+
+        self.log.debug("returning counters for {0} services".format(
+            len(result)))
+
+ return result
+
+ def set_uri(self, uri):
+ """
+ If the module exposes a service, then call this to publish the
+ address once it is available.
+
+ :return: a string
+ """
+ return self._ceph_set_uri(uri)
import cherrypy
+import json
+import errno
import math
import os
-import time
from collections import OrderedDict
from mgr_module import MgrModule
# cherrypy likes to sys.exit on error. don't let it take us down too!
-def os_exit_noop():
+def os_exit_noop(*args, **kwargs):
pass
return _global_instance['plugin']
-# counter value types
-PERFCOUNTER_TIME = 1
-PERFCOUNTER_U64 = 2
+def health_status_to_number(status):
-# counter types
-PERFCOUNTER_LONGRUNAVG = 4
-PERFCOUNTER_COUNTER = 8
-PERFCOUNTER_HISTOGRAM = 0x10
-PERFCOUNTER_TYPE_MASK = ~2
+ if status == 'HEALTH_OK':
+ return 0
+ elif status == 'HEALTH_WARN':
+ return 1
+ elif status == 'HEALTH_ERR':
+ return 2
+PG_STATES = ['creating', 'active', 'clean', 'down', 'scrubbing', 'degraded',
+ 'inconsistent', 'peering', 'repair', 'recovering', 'forced-recovery',
+ 'backfill', 'forced-backfill', 'wait-backfill', 'backfill-toofull',
+ 'incomplete', 'stale', 'remapped', 'undersized', 'peered']
-def stattype_to_str(stattype):
+DF_CLUSTER = ['total_bytes', 'total_used_bytes', 'total_objects']
- typeonly = stattype & PERFCOUNTER_TYPE_MASK
- if typeonly == 0:
- return 'gauge'
- if typeonly == PERFCOUNTER_LONGRUNAVG:
- # this lie matches the DaemonState decoding: only val, no counts
- return 'counter'
- if typeonly == PERFCOUNTER_COUNTER:
- return 'counter'
- if typeonly == PERFCOUNTER_HISTOGRAM:
- return 'histogram'
+DF_POOL = ['max_avail', 'bytes_used', 'raw_bytes_used', 'objects', 'dirty',
+ 'quota_bytes', 'quota_objects', 'rd', 'rd_bytes', 'wr', 'wr_bytes']
- return ''
+OSD_METADATA = ('cluster_addr', 'device_class', 'id', 'public_addr')
+
+OSD_STATUS = ['weight', 'up', 'in']
+
+POOL_METADATA = ('pool_id', 'name')
+
+DISK_OCCUPATION = ('instance', 'device', 'ceph_daemon')
class Metric(object):
def promethize(path):
''' replace illegal metric name characters '''
- return path.replace('.', '_').replace('-', '_')
+ result = path.replace('.', '_').replace('+', '_plus').replace('::', '_')
+
+ # Hyphens usually turn into underscores, unless they are
+ # trailing
+ if result.endswith("-"):
+ result = result[0:-1] + "_minus"
+ else:
+ result = result.replace("-", "_")
+
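+    # e.g. 'osd.op_w' -> 'ceph_osd_op_w' and
+    # 'mds_mem.ino-' -> 'ceph_mds_mem_ino_minus'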
+ return "ceph_{0}".format(result)
def floatstr(value):
''' represent as Go-compatible float '''
class Module(MgrModule):
+ COMMANDS = [
+ {
+ "cmd": "prometheus self-test",
+ "desc": "Run a self test on the prometheus module",
+ "perm": "rw"
+ },
+ ]
def __init__(self, *args, **kwargs):
super(Module, self).__init__(*args, **kwargs)
self.notified = False
self.serving = False
- self.metrics = dict()
+ self.metrics = self._setup_static_metrics()
self.schema = OrderedDict()
_global_instance['plugin'] = self
- def _get_ordered_schema(self, **kwargs):
-
- '''
- fetch an ordered-by-key performance counter schema
- ['perf_schema'][daemontype.id][countername] with keys
- 'nick' (if present)
- 'description'
- 'type' (counter type....counter/gauge/avg/histogram/etc.)
- '''
-
- daemon_type = kwargs.get('daemon_type', '')
- daemon_id = kwargs.get('daemon_id', '')
-
- schema = self.get_perf_schema(daemon_type, daemon_id)
- if not schema:
- self.log.warning('_get_ordered_schema: no data')
- return
-
- new_schema = dict()
- for k1 in schema.keys(): # 'perf_schema', but assume only one
- for k2 in sorted(schema[k1].keys()):
- sorted_dict = OrderedDict(
- sorted(schema[k1][k2].items(), key=lambda i: i[0])
- )
- new_schema[k2] = sorted_dict
- for k in sorted(new_schema.keys()):
- self.log.debug("updating schema for %s" % k)
- self.schema[k] = new_schema[k]
-
- def shutdown(self):
- self.serving = False
- pass
-
- # XXX duplicated from dashboard; factor out?
- def get_latest(self, daemon_type, daemon_name, stat):
- data = self.get_counter(daemon_type, daemon_name, stat)[stat]
- if data:
- return data[-1][1]
- else:
- return 0
-
- def get_stat(self, daemon, path):
+ def _stattype_to_str(self, stattype):
+
+ typeonly = stattype & self.PERFCOUNTER_TYPE_MASK
+ if typeonly == 0:
+ return 'gauge'
+ if typeonly == self.PERFCOUNTER_LONGRUNAVG:
+ # this lie matches the DaemonState decoding: only val, no counts
+ return 'counter'
+ if typeonly == self.PERFCOUNTER_COUNTER:
+ return 'counter'
+ if typeonly == self.PERFCOUNTER_HISTOGRAM:
+ return 'histogram'
+
+ return ''
+
+ def _setup_static_metrics(self):
+ metrics = {}
+ metrics['health_status'] = Metric(
+ 'untyped',
+ 'health_status',
+ 'Cluster health status'
+ )
+ metrics['mon_quorum_count'] = Metric(
+ 'gauge',
+ 'mon_quorum_count',
+ 'Monitors in quorum'
+ )
+ metrics['osd_metadata'] = Metric(
+ 'untyped',
+ 'osd_metadata',
+ 'OSD Metadata',
+ OSD_METADATA
+ )
- perfcounter = self.schema[daemon][path]
- stattype = stattype_to_str(perfcounter['type'])
- # XXX simplify first effort: no histograms
- # averages are already collapsed to one value for us
- if not stattype or stattype == 'histogram':
- self.log.debug('ignoring %s, type %s' % (path, stattype))
- return
+        # This is kept separate from OSD_METADATA so that we can stably
+        # use the same tag names that the Prometheus node_exporter does
+ metrics['disk_occupation'] = Metric(
+            'untyped',
+ 'disk_occupation',
+ 'Associate Ceph daemon with disk used',
+ DISK_OCCUPATION
+ )
- if path not in self.metrics:
- self.metrics[path] = Metric(
- stattype,
+ metrics['pool_metadata'] = Metric(
+ 'untyped',
+ 'pool_metadata',
+            'Pool Metadata',
+ POOL_METADATA
+ )
+ for state in OSD_STATUS:
+ path = 'osd_{}'.format(state)
+ self.log.debug("init: creating {}".format(path))
+ metrics[path] = Metric(
+ 'untyped',
path,
- perfcounter['description'],
- ('daemon',),
+ 'OSD status {}'.format(state),
+ ('ceph_daemon',)
)
+ for state in PG_STATES:
+ path = 'pg_{}'.format(state)
+ self.log.debug("init: creating {}".format(path))
+ metrics[path] = Metric(
+ 'gauge',
+ path,
+ 'PG {}'.format(state),
+ )
+ for state in DF_CLUSTER:
+ path = 'cluster_{}'.format(state)
+ self.log.debug("init: creating {}".format(path))
+ metrics[path] = Metric(
+ 'gauge',
+ path,
+ 'DF {}'.format(state),
+ )
+ for state in DF_POOL:
+ path = 'pool_{}'.format(state)
+ self.log.debug("init: creating {}".format(path))
+ metrics[path] = Metric(
+ 'gauge',
+ path,
+ 'DF pool {}'.format(state),
+ ('pool_id',)
+ )
+
+ return metrics
- daemon_type, daemon_id = daemon.split('.')
+ def shutdown(self):
+ self.serving = False
- self.metrics[path].set(
- self.get_latest(daemon_type, daemon_id, path),
- (daemon,)
+ def get_health(self):
+ health = json.loads(self.get('health')['json'])
+ self.metrics['health_status'].set(
+ health_status_to_number(health['status'])
)
+ def get_df(self):
+ # maybe get the to-be-exported metrics from a config?
+ df = self.get('df')
+ for stat in DF_CLUSTER:
+ path = 'cluster_{}'.format(stat)
+ self.metrics[path].set(df['stats'][stat])
+
+ for pool in df['pools']:
+ for stat in DF_POOL:
+ path = 'pool_{}'.format(stat)
+ self.metrics[path].set(pool['stats'][stat], (pool['id'],))
+
+ def get_quorum_status(self):
+ mon_status = json.loads(self.get('mon_status')['json'])
+ self.metrics['mon_quorum_count'].set(len(mon_status['quorum']))
+
+ def get_pg_status(self):
+ # TODO add per pool status?
+ pg_s = self.get('pg_summary')['all']
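+        # e.g. a pg_summary entry {'active+clean': 100} is reported under
+        # both pg_active and pg_clean with a value of 100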
+        reported_pg_s = [(s, v) for key, v in pg_s.items() for s in
+                         key.split('+')]
+ for state, value in reported_pg_s:
+ path = 'pg_{}'.format(state)
+ self.metrics[path].set(value)
+ reported_states = [s[0] for s in reported_pg_s]
+ for state in PG_STATES:
+ path = 'pg_{}'.format(state)
+ if state not in reported_states:
+ self.metrics[path].set(0)
+
+ def get_metadata_and_osd_status(self):
+ osd_map = self.get('osd_map')
+ osd_devices = self.get('osd_map_crush')['devices']
+ for osd in osd_map['osds']:
+ id_ = osd['osd']
+ p_addr = osd['public_addr'].split(':')[0]
+ c_addr = osd['cluster_addr'].split(':')[0]
+            dev_class = next((dev for dev in osd_devices if dev['id'] == id_))
+ self.metrics['osd_metadata'].set(0, (
+ c_addr,
+ dev_class['class'],
+ id_,
+ p_addr
+ ))
+ for state in OSD_STATUS:
+ status = osd[state]
+ self.metrics['osd_{}'.format(state)].set(
+ status,
+ ('osd.{}'.format(id_),))
+
+ osd_metadata = self.get_metadata("osd", str(id_))
+ dev_keys = ("backend_filestore_dev_node", "bluestore_bdev_dev_node")
+ osd_dev_node = None
+ for dev_key in dev_keys:
+ val = osd_metadata.get(dev_key, None)
+ if val and val != "unknown":
+ osd_dev_node = val
+ break
+ osd_hostname = osd_metadata.get('hostname', None)
+ if osd_dev_node and osd_hostname:
+ self.log.debug("Got dev for osd {0}: {1}/{2}".format(
+ id_, osd_hostname, osd_dev_node))
+ self.metrics['disk_occupation'].set(0, (
+ osd_hostname,
+ osd_dev_node,
+ "osd.{0}".format(id_)
+ ))
+ else:
+ self.log.info("Missing dev node metadata for osd {0}, skipping "
+ "occupation record for this osd".format(id_))
+
+ for pool in osd_map['pools']:
+ id_ = pool['pool']
+ name = pool['pool_name']
+ self.metrics['pool_metadata'].set(0, (id_, name))
+
def collect(self):
- for daemon in self.schema.keys():
- for path in self.schema[daemon].keys():
- self.get_stat(daemon, path)
+ self.get_health()
+ self.get_df()
+ self.get_quorum_status()
+ self.get_metadata_and_osd_status()
+ self.get_pg_status()
+
+        for daemon, counters in self.get_all_perf_counters().items():
+ for path, counter_info in counters.items():
+ stattype = self._stattype_to_str(counter_info['type'])
+ # XXX simplify first effort: no histograms
+ # averages are already collapsed to one value for us
+ if not stattype or stattype == 'histogram':
+ self.log.debug('ignoring %s, type %s' % (path, stattype))
+ continue
+
+ if path not in self.metrics:
+ self.metrics[path] = Metric(
+ stattype,
+ path,
+ counter_info['description'],
+ ("ceph_daemon",),
+ )
+
+ self.metrics[path].set(
+ counter_info['value'],
+ (daemon,)
+ )
+
return self.metrics
- def notify(self, ntype, nid):
- ''' Just try to sync and not run until we're notified once '''
- if not self.notified:
- self.serving = True
- self.notified = True
- if ntype == 'perf_schema_update':
- daemon_type, daemon_id = nid.split('.')
- self._get_ordered_schema(
- daemon_type=daemon_type,
- daemon_id=daemon_id
- )
+ def handle_command(self, cmd):
+ if cmd['prefix'] == 'prometheus self-test':
+ self.collect()
+ return 0, '', 'Self-test OK'
+ else:
+ return (-errno.EINVAL, '',
+ "Command not found '{0}'".format(cmd['prefix']))
def serve(self):
@cherrypy.expose
def index(self):
+ return '''<!DOCTYPE html>
+<html>
+ <head><title>Ceph Exporter</title></head>
+ <body>
+ <h1>Ceph Exporter</h1>
+ <p><a href='/metrics'>Metrics</a></p>
+ </body>
+</html>'''
+
+ @cherrypy.expose
+ def metrics(self):
metrics = global_instance().collect()
cherrypy.response.headers['Content-Type'] = 'text/plain'
if metrics:
"server_addr: %s server_port: %s" %
(server_addr, server_port)
)
- # wait for first notification (of any kind) to start up
- while not self.serving:
- time.sleep(1)
cherrypy.config.update({
'server.socket_host': server_addr,
- 'server.socket_port': server_port,
+ 'server.socket_port': int(server_port),
'engine.autoreload.on': False
})
cherrypy.tree.mount(Root(), "/")
import tempfile
import threading
import traceback
+import socket
import common
instance = None
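+
+# Raised when the REST endpoint cannot start (e.g. no server_addr or no
+# certificate configured); caught around _serve() and logged as a warning
+# rather than a full traceback.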
+class CannotServe(Exception):
+ pass
+
class CommandsRequest(object):
"""
try:
self._serve()
self.server.socket.close()
+ except CannotServe as cs:
+ self.log.warn("server not running: {0}".format(cs.message))
except:
self.log.error(str(traceback.format_exc()))
server_addr = self.get_localized_config('server_addr', '::')
if server_addr is None:
- raise RuntimeError('no server_addr configured; try "ceph config-key set mgr/restful/server_addr <ip>"')
+ raise CannotServe('no server_addr configured; try "ceph config-key set mgr/restful/server_addr <ip>"')
+
server_port = int(self.get_localized_config('server_port', '8003'))
self.log.info('server_addr: %s server_port: %d',
server_addr, server_port)
pkey_fname = self.get_localized_config('key_file')
if not cert_fname or not pkey_fname:
- raise RuntimeError('no certificate configured')
+ raise CannotServe('no certificate configured')
if not os.path.isfile(cert_fname):
- raise RuntimeError('certificate %s does not exist' % cert_fname)
+ raise CannotServe('certificate %s does not exist' % cert_fname)
if not os.path.isfile(pkey_fname):
- raise RuntimeError('private key %s does not exist' % pkey_fname)
+ raise CannotServe('private key %s does not exist' % pkey_fname)
+
+ # Publish the URI that others may use to access the service we're
+ # about to start serving
+ self.set_uri("https://{0}:{1}/".format(
+ socket.gethostname() if server_addr == "::" else server_addr,
+ server_port
+ ))
# Create the HTTPS werkzeug server serving pecan app
self.server = make_server(
--- /dev/null
+
+from module import *
+
--- /dev/null
+
+from mgr_module import MgrModule, CommandResult
+import threading
+import random
+import json
+import errno
+
+
+class Module(MgrModule):
+ """
+ This module is for testing the ceph-mgr python interface from within
+ a running ceph-mgr daemon.
+
+    It implements a synchronous self-test command for calling the functions
+ in the MgrModule interface one by one, and a background "workload"
+ command for causing the module to perform some thrashing-type
+ activities in its serve() thread.
+ """
+
+ WORKLOAD_COMMAND_SPAM = "command_spam"
+ SHUTDOWN = "shutdown"
+
+ WORKLOADS = (WORKLOAD_COMMAND_SPAM, )
+
+ COMMANDS = [
+ {
+ "cmd": "mgr self-test run",
+ "desc": "Run mgr python interface tests",
+ "perm": "r"
+ },
+ {
+ "cmd": "mgr self-test background start name=workload,type=CephString",
+ "desc": "Activate a background workload (one of {0})".format(
+ ", ".join(WORKLOADS)),
+ "perm": "r"
+ },
+ {
+ "cmd": "mgr self-test background stop",
+ "desc": "Stop background workload if any is running",
+ "perm": "r"
+ },
+ ]
+
+ def __init__(self, *args, **kwargs):
+ super(Module, self).__init__(*args, **kwargs)
+ self._event = threading.Event()
+ self._workload = None
+
+ def handle_command(self, command):
+ if command['prefix'] == 'mgr self-test run':
+ self._self_test()
+ return 0, '', 'Self-test succeeded'
+
+ elif command['prefix'] == 'mgr self-test background start':
+ if command['workload'] not in self.WORKLOADS:
+ return (-errno.EINVAL, '',
+ "Workload not found '{0}'".format(command['workload']))
+ self._workload = command['workload']
+ self._event.set()
+ return 0, '', 'Running `{0}` in background'.format(self._workload)
+
+ elif command['prefix'] == 'mgr self-test background stop':
+ if self._workload:
+ was_running = self._workload
+ self._workload = None
+ self._event.set()
+ return 0, '', 'Stopping background workload `{0}`'.format(
+ was_running)
+ else:
+ return 0, '', 'No background workload was running'
+
+ else:
+ return (-errno.EINVAL, '',
+ "Command not found '{0}'".format(command['prefix']))
+
+ def _self_test(self):
+ self.log.info("Running self-test procedure...")
+
+ self._self_test_osdmap()
+ self._self_test_getters()
+ self._self_test_config()
+ self._self_test_misc()
+        self._self_test_perf_counters()
+
+        self.log.info("Finished self-test procedure.")
+
+ def _self_test_getters(self):
+ self.version
+ self.get_context()
+ self.get_mgr_id()
+
+ # In this function, we will assume that the system is in a steady
+ # state, i.e. if a server/service appears in one call, it will
+        # not have gone away by the time we call another function referring to it
+
+ objects = [
+ "fs_map",
+ "osdmap_crush_map_text",
+ "osd_map",
+ "config",
+ "mon_map",
+ "service_map",
+ "osd_metadata",
+ "pg_summary",
+ "pg_status",
+ "pg_dump",
+ "df",
+ "osd_stats",
+ "health",
+ "mon_status",
+ "mgr_map"
+ ]
+ for obj in objects:
+ self.get(obj)
+
+ servers = self.list_servers()
+ for server in servers:
+ self.get_server(server['hostname'])
+
+ osdmap = self.get('osd_map')
+ for o in osdmap['osds']:
+ osd_id = o['osd']
+ self.get_metadata("osd", str(osd_id))
+
+ self.get_daemon_status("osd", "0")
+ #send_command
+
+ def _self_test_config(self):
+ # This is not a strong test (can't tell if values really
+ # persisted), it's just for the python interface bit.
+
+ self.set_config("testkey", "testvalue")
+ assert self.get_config("testkey") == "testvalue"
+
+ self.set_localized_config("testkey", "testvalue")
+ assert self.get_localized_config("testkey") == "testvalue"
+
+ self.set_config_json("testjsonkey", {"testblob": 2})
+ assert self.get_config_json("testjsonkey") == {"testblob": 2}
+
+ assert sorted(self.get_config_prefix("test").keys()) == sorted(
+ ["testkey", "testjsonkey"])
+
+ def _self_test_perf_counters(self):
+ self.get_perf_schema("osd", "0")
+ self.get_counter("osd", "0", "osd.op")
+ #get_counter
+        #get_all_perf_counters
+
+ def _self_test_misc(self):
+ self.set_uri("http://this.is.a.test.com")
+ self.set_health_checks({})
+
+ def _self_test_osdmap(self):
+ osdmap = self.get_osdmap()
+ osdmap.get_epoch()
+ osdmap.get_crush_version()
+ osdmap.dump()
+
+ inc = osdmap.new_incremental()
+ osdmap.apply_incremental(inc)
+ inc.get_epoch()
+ inc.dump()
+
+ crush = osdmap.get_crush()
+ crush.dump()
+ crush.get_item_name(-1)
+ crush.get_item_weight(-1)
+ crush.find_takes()
+ crush.get_take_weight_osd_map(-1)
+
+ #osdmap.get_pools_by_take()
+ #osdmap.calc_pg_upmaps()
+ #osdmap.map_pools_pgs_up()
+
+ #inc.set_osd_reweights
+ #inc.set_crush_compat_weight_set_weights
+
+ def shutdown(self):
+ self._workload = self.SHUTDOWN
+ self._event.set()
+
+ def _command_spam(self):
+ self.log.info("Starting command_spam workload...")
+ while not self._event.is_set():
+ osdmap = self.get_osdmap()
+ dump = osdmap.dump()
+ count = len(dump['osds'])
+ i = int(random.random() * count)
+ w = random.random()
+
+ result = CommandResult('')
+ self.send_command(result, 'mon', '', json.dumps({
+ 'prefix': 'osd reweight',
+ 'id': i,
+ 'weight': w
+ }), '')
+
+ crush = osdmap.get_crush().dump()
+ r, outb, outs = result.wait()
+
+ self._event.clear()
+ self.log.info("Ended command_spam workload...")
+
+ def serve(self):
+ while True:
+ if self._workload == self.WORKLOAD_COMMAND_SPAM:
+ self._command_spam()
+ elif self._workload == self.SHUTDOWN:
+ self.log.info("Shutting down...")
+ break
+ else:
+ self.log.info("Waiting for workload request...")
+ self._event.wait()
+ self._event.clear()
"""
return self.BOLD_SEQ + msg + self.RESET_SEQ
- def format_dimless(self, n, width, colored=True):
+ def format_units(self, n, width, colored, decimal):
"""
        Format a number so that it fits into `width` characters, substituting
        an appropriate unit suffix (k, M, G, ...).
+
+ Use decimal for dimensionless things, use base 2 (decimal=False) for byte sizes/rates.
"""
+
+ factor = 1000 if decimal else 1024
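+        # e.g. n=1536, width=4, decimal=False: factor=1024, the loop below
+        # stops at unit index 1 ('k') and 1536/1024.0 is truncated to "1.5"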
units = [' ', 'k', 'M', 'G', 'T', 'P']
unit = 0
- while len("%s" % (int(n) // (1000**unit))) > width - 1:
+ while len("%s" % (int(n) // (factor**unit))) > width - 1:
unit += 1
if unit > 0:
- truncated_float = ("%f" % (n / (1000.0 ** unit)))[0:width - 1]
+ truncated_float = ("%f" % (n / (float(factor) ** unit)))[0:width - 1]
if truncated_float[-1] == '.':
truncated_float = " " + truncated_float[0:-1]
else:
else:
return formatted
+ def format_dimless(self, n, width, colored=True):
+ return self.format_units(n, width, colored, decimal=True)
+
+ def format_bytes(self, n, width, colored=True):
+ return self.format_units(n, width, colored, decimal=False)
+
def get_latest(self, daemon_type, daemon_name, stat):
data = self.get_counter(daemon_type, daemon_name, stat)[stat]
#self.log.error("get_latest {0} data={1}".format(stat, data))
stats = pool_stats[pool_id]
pools_table.add_row([
pools[pool_id]['pool_name'], pool_type,
- self.format_dimless(stats['bytes_used'], 5),
- self.format_dimless(stats['max_avail'], 5)
+ self.format_bytes(stats['bytes_used'], 5),
+ self.format_bytes(stats['max_avail'], 5)
])
output += "{0} - {1} clients\n".format(
stats = osd_stats[osd_id]
osd_table.add_row([osd_id, metadata['hostname'],
- self.format_dimless(stats['kb_used'] * 1024, 5),
- self.format_dimless(stats['kb_avail'] * 1024, 5),
+ self.format_bytes(stats['kb_used'] * 1024, 5),
+ self.format_bytes(stats['kb_avail'] * 1024, 5),
self.format_dimless(self.get_rate("osd", osd_id.__str__(), "osd.op_w") +
self.get_rate("osd", osd_id.__str__(), "osd.op_rw"), 5),
- self.format_dimless(self.get_rate("osd", osd_id.__str__(), "osd.op_in_bytes"), 5),
+ self.format_bytes(self.get_rate("osd", osd_id.__str__(), "osd.op_in_bytes"), 5),
self.format_dimless(self.get_rate("osd", osd_id.__str__(), "osd.op_r"), 5),
- self.format_dimless(self.get_rate("osd", osd_id.__str__(), "osd.op_out_bytes"), 5),
+ self.format_bytes(self.get_rate("osd", osd_id.__str__(), "osd.op_out_bytes"), 5),
])
return 0, "", osd_table.get_string()
def avg(data):
- return sum(data) / float(len(data))
+ if len(data):
+ return sum(data) / float(len(data))
+ else:
+ return 0
class ZabbixSender(object):
while self.run:
self.log.debug('Waking up for new iteration')
- # Sometimes fetching data fails, should be fixed by PR #16020
try:
self.send()
            except Exception:
- self.log.error(exc)
+ # Shouldn't happen, but let's log it and retry next interval,
+ # rather than dying completely.
+ self.log.exception("Unexpected error during send():")
interval = self.config['interval']
self.log.debug('Sleeping for %d seconds', interval)
unmount_unmap() {
local rbd_dev=$1
- local mnt=$(findmnt --mtab --source ${rbd_dev} --noheadings \
+ local mnts=$(findmnt --mtab --source ${rbd_dev} --noheadings \
        | awk '{print $1}')
logger -p "daemon.debug" -t rbdmap "Unmapping '${rbd_dev}'"
- if [ -n "${mnt}" ]; then
+ for mnt in ${mnts}; do
logger -p "daemon.debug" -t rbdmap "Unmounting '${mnt}'"
umount "${mnt}" >>/dev/null 2>&1
- fi
- if mountpoint -q "${mnt}"; then
- ## Un-mounting failed.
- logger -p "daemon.warning" -t rbdmap "Failed to unmount '${mnt}'"
- return 1
- fi
+ if mountpoint -q "${mnt}"; then
+ ## Un-mounting failed.
+ logger -p "daemon.warning" -t rbdmap "Failed to unmount '${mnt}'"
+ return 1
+ fi
+ done
## Un-mapping.
rbd unmap $rbd_dev >>/dev/null 2>&1
if [ $? -ne 0 ]; then
#include <boost/optional.hpp>
#include "auth/Crypto.h"
+#include "compressor/Compressor.h"
#include "common/armor.h"
#include "common/ceph_json.h"
cerr << "ERROR: --placement-id not specified" << std::endl;
return EINVAL;
}
+ // validate compression type
+ if (compression_type && *compression_type != "random"
+ && !Compressor::get_comp_alg_type(*compression_type)) {
+ std::cerr << "Unrecognized compression type" << std::endl;
+ return EINVAL;
+ }
+
RGWZoneParams zone(zone_id, zone_name);
int ret = zone.init(g_ceph_context, store);
if (ret < 0) {
using keystone_cache_t = rgw::keystone::TokenCache;
using EC2Engine = rgw::auth::keystone::EC2Engine;
- EC2Engine keystone_engine;
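+  /* wrapped in boost::optional so the engine is constructed (via the
+   * emplace() in the constructor below) only when rgw_s3_auth_use_keystone
+   * is enabled */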
+  boost::optional<EC2Engine> keystone_engine;
LDAPEngine ldap_engine;
aplptr_t create_apl_remote(CephContext* const cct,
RGWRados* const store,
AWSEngine::VersionAbstractor* const ver_abstractor)
: store(store),
- keystone_engine(cct, ver_abstractor,
- static_cast<rgw::auth::RemoteApplier::Factory*>(this),
- keystone_config_t::get_instance(),
- keystone_cache_t::get_instance<keystone_config_t>()),
ldap_engine(cct, store, *ver_abstractor,
static_cast<rgw::auth::RemoteApplier::Factory*>(this)) {
if (cct->_conf->rgw_s3_auth_use_keystone &&
! cct->_conf->rgw_keystone_url.empty()) {
- add_engine(Control::SUFFICIENT, keystone_engine);
+
+ keystone_engine.emplace(cct, ver_abstractor,
+ static_cast<rgw::auth::RemoteApplier::Factory*>(this),
+ keystone_config_t::get_instance(),
+ keystone_cache_t::get_instance<keystone_config_t>());
+ add_engine(Control::SUFFICIENT, *keystone_engine);
+
}
if (cct->_conf->rgw_s3_auth_use_ldap &&
}
const std::string& get_tenant() const {
- ceph_assert(t != Wildcard);
return u.tenant;
}
const std::string& get_id() const {
- ceph_assert(t != Wildcard && t != Tenant);
return u.id;
}
{
int ret;
buckets.clear();
- string buckets_obj_id;
+ std::string buckets_obj_id;
rgw_get_buckets_obj(user_id, buckets_obj_id);
rgw_raw_obj obj(store->get_zone_params().user_uid_pool, buckets_obj_id);
- list<cls_user_bucket_entry> entries;
bool truncated = false;
string m = marker;
}
do {
+ std::list<cls_user_bucket_entry> entries;
ret = store->cls_user_list_buckets(obj, m, end_marker, max - total, entries, &m, &truncated);
- if (ret == -ENOENT)
+ if (ret == -ENOENT) {
ret = 0;
+ }
- if (ret < 0)
+ if (ret < 0) {
return ret;
+ }
- for (const auto& entry : entries) {
- buckets.add(RGWBucketEnt(user_id, entry));
+ for (auto& entry : entries) {
+ buckets.add(RGWBucketEnt(user_id, std::move(entry)));
total++;
}
return 0;
}
-int rgw_link_bucket(RGWRados *store, const rgw_user& user_id, rgw_bucket& bucket, real_time creation_time, bool update_entrypoint)
+int rgw_link_bucket(RGWRados* const store,
+ const rgw_user& user_id,
+ rgw_bucket& bucket,
+ ceph::real_time creation_time,
+ bool update_entrypoint)
{
int ret;
string& tenant_name = bucket.tenant;
cout << "bucket info mismatch: expected " << actual_bucket << " got " << bucket << std::endl;
if (fix) {
cout << "fixing" << std::endl;
- r = rgw_link_bucket(store, user_id, actual_bucket, bucket_info.creation_time);
+ r = rgw_link_bucket(store, user_id, actual_bucket,
+ bucket_info.creation_time);
if (r < 0) {
cerr << "failed to fix bucket: " << cpp_strerror(-r) << std::endl;
}
return r;
}
- r = rgw_link_bucket(store, user_info.user_id, bucket_info.bucket, real_time());
+ r = rgw_link_bucket(store, user_info.user_id, bucket_info.bucket,
+ ceph::real_time());
if (r < 0) {
return r;
}
if (be.linked) {
ret = rgw_link_bucket(store, be.owner, be.bucket, be.creation_time, false);
} else {
- ret = rgw_unlink_bucket(store, be.owner, be.bucket.tenant, be.bucket.name, false);
+ ret = rgw_unlink_bucket(store, be.owner, be.bucket.tenant,
+ be.bucket.name, false);
}
return ret;
*/
class RGWUserBuckets
{
- map<string, RGWBucketEnt> buckets;
+ std::map<std::string, RGWBucketEnt> buckets;
public:
- RGWUserBuckets() {}
+ RGWUserBuckets() = default;
+ RGWUserBuckets(RGWUserBuckets&&) = default;
+
+ RGWUserBuckets& operator=(const RGWUserBuckets&) = default;
+
void encode(bufferlist& bl) const {
::encode(buckets, bl);
}
bool* is_truncated,
uint64_t default_amount = 1000);
-extern int rgw_link_bucket(RGWRados *store, const rgw_user& user_id, rgw_bucket& bucket, real_time creation_time, bool update_entrypoint = true);
+extern int rgw_link_bucket(RGWRados* store,
+ const rgw_user& user_id,
+ rgw_bucket& bucket,
+ ceph::real_time creation_time,
+ bool update_entrypoint = true);
extern int rgw_unlink_bucket(RGWRados *store, const rgw_user& user_id,
const string& tenant_name, const string& bucket_name, bool update_entrypoint = true);
{ ERR_AMZ_CONTENT_SHA256_MISMATCH, {400, "XAmzContentSHA256Mismatch" }},
{ ERR_INVALID_TAG, {400, "InvalidTag"}},
{ ERR_MALFORMED_ACL_ERROR, {400, "MalformedACLError" }},
+ { ERR_INVALID_ENCRYPTION_ALGORITHM, {400, "InvalidEncryptionAlgorithmError" }},
{ ERR_LENGTH_REQUIRED, {411, "MissingContentLength" }},
{ EACCES, {403, "AccessDenied" }},
{ EPERM, {403, "AccessDenied" }},
rgw_http_errors rgw_http_swift_errors({
{ EACCES, {403, "AccessDenied" }},
{ EPERM, {401, "AccessDenied" }},
+ { ENAMETOOLONG, {400, "Metadata name too long" }},
{ ERR_USER_SUSPENDED, {401, "UserSuspended" }},
{ ERR_INVALID_UTF8, {412, "Invalid UTF8" }},
{ ERR_BAD_URL, {412, "Bad URL" }},
{ ERR_NOT_SLO_MANIFEST, {400, "Not an SLO manifest" }},
{ ERR_QUOTA_EXCEEDED, {413, "QuotaExceeded" }},
+ { ENOTEMPTY, {409, "There was a conflict when trying "
+ "to complete your request." }},
/* FIXME(rzarzynski): we need to find a way to apply Swift's error handling
* procedures also for ERR_ZERO_IN_URL. This make a problem as the validation
* is performed very early, even before setting the req_state::proto_flags. */
{
if (s) {
set_req_state_err(s, err_no);
- s->err.message = err_msg;
+ if (s->prot_flags & RGW_REST_SWIFT && !err_msg.empty()) {
+      /* TODO(rzarzynski): there should never be a check like this one.
+       * It's here only for the sake of the patch's backportability. Further
+       * commits will move the logic to a per-RGWHandler replacement of
+       * the end_header() function. Alternatively, we might consider making
+       * that just for the dump(). Please take a look at @cbodley's comments
+ * in PR #10690 (https://github.com/ceph/ceph/pull/10690). */
+ s->err.err_code = err_msg;
+ } else {
+ s->err.message = err_msg;
+ }
}
}
#define ERR_ZERO_IN_URL 2211
#define ERR_MALFORMED_ACL_ERROR 2212
#define ERR_ZONEGROUP_DEFAULT_PLACEMENT_MISCONFIGURATION 2213
+#define ERR_INVALID_ENCRYPTION_ALGORITHM 2214
#define ERR_BUSY_RESHARDING 2300
RGW_OP_STAT_ACCOUNT,
RGW_OP_LIST_BUCKET,
RGW_OP_GET_BUCKET_LOGGING,
+ RGW_OP_GET_BUCKET_LOCATION,
RGW_OP_GET_BUCKET_VERSIONING,
RGW_OP_SET_BUCKET_VERSIONING,
RGW_OP_GET_BUCKET_WEBSITE,
WRITE_CLASS_ENCODER(RGWUserInfo)
struct rgw_pool {
- string name;
- string ns;
+ std::string name;
+ std::string ns;
- rgw_pool() {}
+ rgw_pool() = default;
rgw_pool(const rgw_pool& _p) : name(_p.name), ns(_p.ns) {}
+ rgw_pool(rgw_pool&&) = default;
rgw_pool(const string& _s) {
from_str(_s);
}
DECODE_FINISH(bl);
}
+ rgw_pool& operator=(const rgw_pool&) = default;
+
bool operator==(const rgw_pool& p) const {
return (compare(p) == 0);
}
rgw_pool data_extra_pool;
rgw_pool index_pool;
- rgw_data_placement_target() {}
+ rgw_data_placement_target() = default;
+ rgw_data_placement_target(const rgw_data_placement_target&) = default;
+ rgw_data_placement_target(rgw_data_placement_target&&) = default;
+
+ rgw_data_placement_target(const rgw_pool& data_pool,
+ const rgw_pool& data_extra_pool,
+ const rgw_pool& index_pool)
+ : data_pool(data_pool),
+ data_extra_pool(data_extra_pool),
+ index_pool(index_pool) {
+ }
- rgw_data_placement_target(const rgw_pool& _data_pool, const rgw_pool& _data_extra_pool, const rgw_pool& _index_pool)
- : data_pool(_data_pool), data_extra_pool(_data_extra_pool), index_pool(_index_pool) {}
+ rgw_data_placement_target&
+ operator=(const rgw_data_placement_target&) = default;
const rgw_pool& get_data_extra_pool() const {
if (data_extra_pool.empty()) {
explicit_placement(b.explicit_placement.data_pool,
b.explicit_placement.data_extra_pool,
b.explicit_placement.index_pool) {}
+ rgw_bucket(const rgw_bucket&) = default;
+ rgw_bucket(rgw_bucket&&) = default;
void convert(cls_user_bucket *b) const {
b->name = name;
void decode_json(JSONObj *obj);
static void generate_test_instances(list<rgw_bucket*>& o);
+ rgw_bucket& operator=(const rgw_bucket&) = default;
+
bool operator<(const rgw_bucket& b) const {
return name.compare(b.name) < 0;
}
rgw_bucket bucket;
size_t size;
size_t size_rounded;
- real_time creation_time;
+ ceph::real_time creation_time;
uint64_t count;
- RGWBucketEnt() : size(0), size_rounded(0), count(0) {}
+  /* The placement_rule is necessary to calculate per-storage-policy
+   * statistics for the Swift API. Although the info is available in
+   * RGWBucketInfo, we duplicate it here to avoid hurting the performance
+   * of bucket listing. */
+ std::string placement_rule;
- explicit RGWBucketEnt(const rgw_user& u, const cls_user_bucket_entry& e)
- : bucket(u, e.bucket),
+ RGWBucketEnt()
+ : size(0),
+ size_rounded(0),
+ count(0) {
+ }
+ RGWBucketEnt(const RGWBucketEnt&) = default;
+ RGWBucketEnt(RGWBucketEnt&&) = default;
+ explicit RGWBucketEnt(const rgw_user& u, cls_user_bucket_entry&& e)
+ : bucket(u, std::move(e.bucket)),
size(e.size),
size_rounded(e.size_rounded),
creation_time(e.creation_time),
count(e.count) {
}
+ RGWBucketEnt& operator=(const RGWBucketEnt&) = default;
+
void convert(cls_user_bucket_entry *b) const {
bucket.convert(&b->bucket);
b->size = size;
}
void encode(bufferlist& bl) const {
- ENCODE_START(6, 5, bl);
+ ENCODE_START(7, 5, bl);
uint64_t s = size;
__u32 mt = ceph::real_clock::to_time_t(creation_time);
string empty_str; // originally had the bucket name here, but we encode bucket later
s = size_rounded;
::encode(s, bl);
::encode(creation_time, bl);
+ ::encode(placement_rule, bl);
ENCODE_FINISH(bl);
}
void decode(bufferlist::iterator& bl) {
size_rounded = s;
if (struct_v >= 6)
::decode(creation_time, bl);
+ if (struct_v >= 7)
+ ::decode(placement_rule, bl);
DECODE_FINISH(bl);
}
void dump(Formatter *f) const;
map<string, string>::iterator it = str_map.find(std::string(key_id));
if (it != str_map.end() ) {
- std::string master_key = from_base64((*it).second);
+ std::string master_key;
+ try {
+ master_key = from_base64((*it).second);
+ } catch (...) {
+ ldout(cct, 5) << "ERROR: get_actual_key_from_kms invalid encryption key id "
+ << "which contains character that is not base64 encoded."
+ << dendl;
+ return -EINVAL;
+ }
+
if (master_key.length() == AES_256_KEYSIZE) {
uint8_t _actual_key[AES_256_KEYSIZE];
if (AES_256_ECB_encrypt(cct,
ldout(s->cct, 5) << "ERROR: Invalid value for header "
<< "x-amz-server-side-encryption-customer-algorithm"
<< dendl;
- return -ERR_INVALID_REQUEST;
+ s->err.message = "The requested encryption algorithm is not valid, must be AES256.";
+ return -ERR_INVALID_ENCRYPTION_ALGORITHM;
}
if (s->cct->_conf->rgw_crypt_require_ssl &&
!s->info.env->exists("SERVER_PORT_SECURE")) {
ldout(s->cct, 5) << "ERROR: Insecure request, rgw_crypt_require_ssl is set" << dendl;
return -ERR_INVALID_REQUEST;
}
- std::string key_bin = from_base64(
+
+ std::string key_bin;
+ try {
+ key_bin = from_base64(
get_crypt_attribute(s->info.env, parts, X_AMZ_SERVER_SIDE_ENCRYPTION_CUSTOMER_KEY) );
+ } catch (...) {
+ ldout(s->cct, 5) << "ERROR: rgw_s3_prepare_encrypt invalid encryption "
+ << "key which contains character that is not base64 encoded."
+ << dendl;
+ s->err.message = "Requests specifying Server Side Encryption with Customer "
+ "provided keys must provide an appropriate secret key.";
+ return -EINVAL;
+ }
+
if (key_bin.size() != AES_256_CBC::AES_256_KEYSIZE) {
ldout(s->cct, 5) << "ERROR: invalid encryption key size" << dendl;
- return -ERR_INVALID_REQUEST;
+ s->err.message = "Requests specifying Server Side Encryption with Customer "
+ "provided keys must provide an appropriate secret key.";
+ return -EINVAL;
}
+
boost::string_view keymd5 =
get_crypt_attribute(s->info.env, parts, X_AMZ_SERVER_SIDE_ENCRYPTION_CUSTOMER_KEY_MD5);
- std::string keymd5_bin = from_base64(keymd5);
+
+ std::string keymd5_bin;
+ try {
+ keymd5_bin = from_base64(keymd5);
+ } catch (...) {
+ ldout(s->cct, 5) << "ERROR: rgw_s3_prepare_encrypt invalid encryption key "
+ << "md5 which contains character that is not base64 encoded."
+ << dendl;
+ s->err.message = "Requests specifying Server Side Encryption with Customer "
+ "provided keys must provide an appropriate secret key md5.";
+ return -EINVAL;
+ }
+
if (keymd5_bin.size() != CEPH_CRYPTO_MD5_DIGESTSIZE) {
ldout(s->cct, 5) << "ERROR: Invalid key md5 size" << dendl;
- return -ERR_INVALID_DIGEST;
+ s->err.message = "Requests specifying Server Side Encryption with Customer "
+ "provided keys must provide an appropriate secret key md5.";
+ return -EINVAL;
}
+
MD5 key_hash;
byte key_hash_res[CEPH_CRYPTO_MD5_DIGESTSIZE];
key_hash.Update(reinterpret_cast<const byte*>(key_bin.c_str()), key_bin.size());
if (memcmp(key_hash_res, keymd5_bin.c_str(), CEPH_CRYPTO_MD5_DIGESTSIZE) != 0) {
ldout(s->cct, 5) << "ERROR: Invalid key md5 hash" << dendl;
- return -ERR_INVALID_DIGEST;
+ s->err.message = "The calculated MD5 hash of the key did not match the hash that was provided.";
+ return -EINVAL;
}
set_attr(attrs, RGW_ATTR_CRYPT_MODE, "SSE-C-AES256");
crypt_http_responses["x-amz-server-side-encryption-customer-algorithm"] = "AES256";
crypt_http_responses["x-amz-server-side-encryption-customer-key-MD5"] = keymd5.to_string();
return 0;
+ } else {
+ boost::string_view customer_key =
+ get_crypt_attribute(s->info.env, parts, X_AMZ_SERVER_SIDE_ENCRYPTION_CUSTOMER_KEY);
+ if (!customer_key.empty()) {
+ ldout(s->cct, 5) << "ERROR: SSE-C encryption request is missing the header "
+ << "x-amz-server-side-encryption-customer-algorithm"
+ << dendl;
+ s->err.message = "Requests specifying Server Side Encryption with Customer "
+ "provided keys must provide a valid encryption algorithm.";
+ return -EINVAL;
+ }
+
+ boost::string_view customer_key_md5 =
+ get_crypt_attribute(s->info.env, parts, X_AMZ_SERVER_SIDE_ENCRYPTION_CUSTOMER_KEY_MD5);
+ if (!customer_key_md5.empty()) {
+ ldout(s->cct, 5) << "ERROR: SSE-C encryption request is missing the header "
+ << "x-amz-server-side-encryption-customer-algorithm"
+ << dendl;
+ s->err.message = "Requests specifying Server Side Encryption with Customer "
+ "provided keys must provide a valid encryption algorithm.";
+ return -EINVAL;
+ }
}
+
/* AMAZON server side encryption with KMS (key management service) */
boost::string_view req_sse =
get_crypt_attribute(s->info.env, parts, X_AMZ_SERVER_SIDE_ENCRYPTION);
if (req_sse != "aws:kms") {
ldout(s->cct, 5) << "ERROR: Invalid value for header x-amz-server-side-encryption"
<< dendl;
- return -ERR_INVALID_REQUEST;
+ s->err.message = "Server Side Encryption with KMS managed key requires "
+ "HTTP header x-amz-server-side-encryption : aws:kms";
+ return -EINVAL;
}
if (s->cct->_conf->rgw_crypt_require_ssl &&
!s->info.env->exists("SERVER_PORT_SECURE")) {
boost::string_view key_id =
get_crypt_attribute(s->info.env, parts, X_AMZ_SERVER_SIDE_ENCRYPTION_AWS_KMS_KEY_ID);
if (key_id.empty()) {
+ ldout(s->cct, 5) << "ERROR: not provide a valid key id" << dendl;
+ s->err.message = "Server Side Encryption with KMS managed key requires "
+ "HTTP header x-amz-server-side-encryption-aws-kms-key-id";
return -ERR_INVALID_ACCESS_KEY;
}
/* try to retrieve actual key */
std::string key_selector = create_random_key_selector(s->cct);
std::string actual_key;
res = get_actual_key_from_kms(s->cct, key_id, key_selector, actual_key);
- if (res != 0)
+ if (res != 0) {
+ ldout(s->cct, 5) << "ERROR: failed to retrieve actual key from key_id: " << key_id << dendl;
+ s->err.message = "Failed to retrieve the actual key, kms-keyid: " + key_id.to_string();
return res;
+ }
if (actual_key.size() != AES_256_KEYSIZE) {
ldout(s->cct, 5) << "ERROR: key obtained from key_id:" <<
key_id << " is not 256 bit size" << dendl;
+ s->err.message = "KMS provided an invalid key for the given kms-keyid.";
return -ERR_INVALID_ACCESS_KEY;
}
set_attr(attrs, RGW_ATTR_CRYPT_MODE, "SSE-KMS");
*block_crypt = std::move(aes);
}
actual_key.replace(0, actual_key.length(), actual_key.length(), '\000');
+
+ crypt_http_responses["x-amz-server-side-encryption"] = "aws:kms";
+ crypt_http_responses["x-amz-server-side-encryption-aws-kms-key-id"] = key_id.to_string();
return 0;
+ } else {
+ boost::string_view key_id =
+ get_crypt_attribute(s->info.env, parts, X_AMZ_SERVER_SIDE_ENCRYPTION_AWS_KMS_KEY_ID);
+ if (!key_id.empty()) {
+ ldout(s->cct, 5) << "ERROR: SSE-KMS encryption request is missing the header "
+ << "x-amz-server-side-encryption"
+ << dendl;
+ s->err.message = "Server Side Encryption with KMS managed key requires "
+ "HTTP header x-amz-server-side-encryption : aws:kms";
+ return -EINVAL;
+ }
}
/* no other encryption mode, check if default encryption is selected */
if (s->cct->_conf->rgw_crypt_default_encryption_key != "") {
- std::string master_encryption_key =
- from_base64(s->cct->_conf->rgw_crypt_default_encryption_key);
+ std::string master_encryption_key;
+ try {
+ master_encryption_key = from_base64(s->cct->_conf->rgw_crypt_default_encryption_key);
+ } catch (...) {
+ ldout(s->cct, 5) << "ERROR: rgw_s3_prepare_encrypt invalid default encryption key "
+ << "which contains character that is not base64 encoded."
+ << dendl;
+ s->err.message = "Requests specifying Server Side Encryption with Customer "
+ "provided keys must provide an appropriate secret key.";
+ return -EINVAL;
+ }
+
if (master_encryption_key.size() != 256 / 8) {
ldout(s->cct, 0) << "ERROR: failed to decode 'rgw crypt default encryption key' to 256 bit string" << dendl;
/* not an error to return; missing encryption does not inhibit processing */
const char *req_cust_alg =
s->info.env->get("HTTP_X_AMZ_SERVER_SIDE_ENCRYPTION_CUSTOMER_ALGORITHM", NULL);
- if ((nullptr == req_cust_alg) || (strcmp(req_cust_alg, "AES256") != 0)) {
- ldout(s->cct, 5) << "ERROR: Invalid value for header "
+ if (nullptr == req_cust_alg) {
+ ldout(s->cct, 5) << "ERROR: Request for SSE-C encrypted object missing "
<< "x-amz-server-side-encryption-customer-algorithm"
<< dendl;
- return -ERR_INVALID_REQUEST;
+ s->err.message = "Requests specifying Server Side Encryption with Customer "
+ "provided keys must provide a valid encryption algorithm.";
+ return -EINVAL;
+ } else if (strcmp(req_cust_alg, "AES256") != 0) {
+ ldout(s->cct, 5) << "ERROR: The requested encryption algorithm is not valid, must be AES256." << dendl;
+ s->err.message = "The requested encryption algorithm is not valid, must be AES256.";
+ return -ERR_INVALID_ENCRYPTION_ALGORITHM;
+ }
+
+ std::string key_bin;
+ try {
+ key_bin = from_base64(s->info.env->get("HTTP_X_AMZ_SERVER_SIDE_ENCRYPTION_CUSTOMER_KEY", ""));
+ } catch (...) {
+ ldout(s->cct, 5) << "ERROR: rgw_s3_prepare_decrypt invalid encryption key "
+ << "which contains character that is not base64 encoded."
+ << dendl;
+ s->err.message = "Requests specifying Server Side Encryption with Customer "
+ "provided keys must provide an appropriate secret key.";
+ return -EINVAL;
}
- std::string key_bin =
- from_base64(s->info.env->get("HTTP_X_AMZ_SERVER_SIDE_ENCRYPTION_CUSTOMER_KEY", ""));
if (key_bin.size() != AES_256_CBC::AES_256_KEYSIZE) {
ldout(s->cct, 5) << "ERROR: Invalid encryption key size" << dendl;
- return -ERR_INVALID_REQUEST;
+ s->err.message = "Requests specifying Server Side Encryption with Customer "
+ "provided keys must provide an appropriate secret key.";
+ return -EINVAL;
}
std::string keymd5 =
s->info.env->get("HTTP_X_AMZ_SERVER_SIDE_ENCRYPTION_CUSTOMER_KEY_MD5", "");
- std::string keymd5_bin = from_base64(keymd5);
+ std::string keymd5_bin;
+ try {
+ keymd5_bin = from_base64(keymd5);
+ } catch (...) {
+ ldout(s->cct, 5) << "ERROR: rgw_s3_prepare_decrypt invalid encryption key md5 "
+ << "which contains character that is not base64 encoded."
+ << dendl;
+ s->err.message = "Requests specifying Server Side Encryption with Customer "
+ "provided keys must provide an appropriate secret key md5.";
+ return -EINVAL;
+ }
+
if (keymd5_bin.size() != CEPH_CRYPTO_MD5_DIGESTSIZE) {
ldout(s->cct, 5) << "ERROR: Invalid key md5 size " << dendl;
- return -ERR_INVALID_DIGEST;
+ s->err.message = "Requests specifying Server Side Encryption with Customer "
+ "provided keys must provide an appropriate secret key md5.";
+ return -EINVAL;
}
MD5 key_hash;
if ((memcmp(key_hash_res, keymd5_bin.c_str(), CEPH_CRYPTO_MD5_DIGESTSIZE) != 0) ||
(get_str_attribute(attrs, RGW_ATTR_CRYPT_KEYMD5) != keymd5_bin)) {
- return -ERR_INVALID_DIGEST;
+ s->err.message = "The calculated MD5 hash of the key did not match the hash that was provided.";
+ return -EINVAL;
}
auto aes = std::unique_ptr<AES_256_CBC>(new AES_256_CBC(s->cct));
aes->set_key(reinterpret_cast<const uint8_t*>(key_bin.c_str()), AES_256_CBC::AES_256_KEYSIZE);
std::string actual_key;
res = get_actual_key_from_kms(s->cct, key_id, key_selector, actual_key);
if (res != 0) {
- ldout(s->cct, 10) << "No encryption key for key-id=" << key_id << dendl;
+ ldout(s->cct, 10) << "ERROR: failed to retrieve actual key from key_id: " << key_id << dendl;
+ s->err.message = "Failed to retrieve the actual key, kms-keyid: " + key_id;
return res;
}
if (actual_key.size() != AES_256_KEYSIZE) {
ldout(s->cct, 0) << "ERROR: key obtained from key_id:" <<
key_id << " is not 256 bit size" << dendl;
+ s->err.message = "KMS provided an invalid key for the given kms-keyid.";
return -ERR_INVALID_ACCESS_KEY;
}
}
if (stored_mode == "RGW-AUTO") {
- std::string master_encryption_key =
- from_base64(std::string(s->cct->_conf->rgw_crypt_default_encryption_key));
+ std::string master_encryption_key;
+ try {
+ master_encryption_key = from_base64(std::string(s->cct->_conf->rgw_crypt_default_encryption_key));
+ } catch (...) {
+ ldout(s->cct, 5) << "ERROR: rgw_s3_prepare_decrypt invalid default encryption key "
+ << "which contains character that is not base64 encoded."
+ << dendl;
+ s->err.message = "The default encryption key is not valid base64.";
+ return -EINVAL;
+ }
+
if (master_encryption_key.size() != 256 / 8) {
ldout(s->cct, 0) << "ERROR: failed to decode 'rgw crypt default encryption key' to 256 bit string" << dendl;
return -EIO;
int operate() override {
reenter(this) {
- entries_index = new RGWShardedOmapCRManager(sync_env->async_rados, store, this, num_shards,
- store->get_zone_params().log_pool,
- oid_prefix);
yield {
string entrypoint = string("/admin/metadata/bucket.instance");
/* FIXME: need a better scaling solution here, requires streaming output */
call(new RGWReadRESTResourceCR<list<string> >(store->ctx(), sync_env->conn, sync_env->http_manager,
entrypoint, NULL, &result));
}
- if (get_ret_status() < 0) {
+ if (retcode < 0) {
ldout(sync_env->cct, 0) << "ERROR: failed to fetch metadata for section bucket.index" << dendl;
- return set_state(RGWCoroutine_Error);
+ return set_cr_error(retcode);
}
+ entries_index = new RGWShardedOmapCRManager(sync_env->async_rados, store, this, num_shards,
+ store->get_zone_params().log_pool,
+ oid_prefix);
+ yield; // yield so OmapAppendCRs can start
for (iter = result.begin(); iter != result.end(); ++iter) {
ldout(sync_env->cct, 20) << "list metadata: section=bucket.index key=" << *iter << dendl;
RGWDataSyncEnv *sync_env;
uint32_t num_shards;
+ static constexpr bool exit_on_error = false; // retry on all errors
public:
- RGWDataSyncControlCR(RGWDataSyncEnv *_sync_env, uint32_t _num_shards) : RGWBackoffControlCR(_sync_env->cct, true),
+ RGWDataSyncControlCR(RGWDataSyncEnv *_sync_env, uint32_t _num_shards) : RGWBackoffControlCR(_sync_env->cct, exit_on_error),
sync_env(_sync_env), num_shards(_num_shards) {
}
auto ux_key = req.get_attr(RGW_ATTR_UNIX_KEY1);
auto ux_attrs = req.get_attr(RGW_ATTR_UNIX1);
if (ux_key && ux_attrs) {
- bool old_key = rgw_fh->decode_attrs(ux_key, ux_attrs);
- if (old_key) {
- update_fhk(rgw_fh);
- }
+ DecodeAttrsResult dar = rgw_fh->decode_attrs(ux_key, ux_attrs);
+ if (get<0>(dar) || get<1>(dar)) {
+ update_fh(rgw_fh);
+ }
}
if (! (flags & RGWFileHandle::FLAG_LOCKED)) {
rgw_fh->mtx.unlock();
auto ux_key = req.get_attr(RGW_ATTR_UNIX_KEY1);
auto ux_attrs = req.get_attr(RGW_ATTR_UNIX1);
if (ux_key && ux_attrs) {
- bool old_key = rgw_fh->decode_attrs(ux_key, ux_attrs);
- if (old_key) {
- update_fhk(rgw_fh);
- }
+ DecodeAttrsResult dar = rgw_fh->decode_attrs(ux_key, ux_attrs);
+ if (get<0>(dar) || get<1>(dar)) {
+ update_fh(rgw_fh);
+ }
}
}
goto done;
auto ux_key = req.get_attr(RGW_ATTR_UNIX_KEY1);
auto ux_attrs = req.get_attr(RGW_ATTR_UNIX1);
if (ux_key && ux_attrs) {
- bool old_key = rgw_fh->decode_attrs(ux_key, ux_attrs);
- if (old_key) {
- update_fhk(rgw_fh);
- }
+ DecodeAttrsResult dar = rgw_fh->decode_attrs(ux_key, ux_attrs);
+ if (get<0>(dar) || get<1>(dar)) {
+ update_fh(rgw_fh);
+ }
}
}
goto done;
} /* RGWLibFS::setattr */
/* called under rgw_fh->mtx held */
- void RGWLibFS::update_fhk(RGWFileHandle *rgw_fh)
+ void RGWLibFS::update_fh(RGWFileHandle *rgw_fh)
{
int rc, rc2;
string obj_name{rgw_fh->relative_object_name()};
lsubdout(get_context(), rgw, 17)
<< __func__
- << " update old versioned fhk : " << obj_name
+ << " update old versioned fh : " << obj_name
<< dendl;
RGWSetAttrsRequest req(cct, get_user(), rgw_fh->bucket_name(), obj_name);
rgw_fh->encode_attrs(ux_key, ux_attrs);
- /* update ux_key only */
req.emplace_attr(RGW_ATTR_UNIX_KEY1, std::move(ux_key));
+ req.emplace_attr(RGW_ATTR_UNIX1, std::move(ux_attrs));
rc = rgwlib.get_fe()->execute_req(&req);
rc2 = req.get_ret();
if ((rc != 0) || (rc2 != 0)) {
lsubdout(get_context(), rgw, 17)
<< __func__
- << " update fhk failed : " << obj_name
+ << " update fh failed : " << obj_name
<< dendl;
}
- } /* RGWLibFS::update_fhk */
+ } /* RGWLibFS::update_fh */
void RGWLibFS::close()
{
{
RGWLibFS* fs;
public:
- ObjUnref(RGWLibFS* fs) : fs(fs) {}
+ ObjUnref(RGWLibFS* _fs) : fs(_fs) {}
void operator()(RGWFileHandle* fh) const {
lsubdout(fs->get_context(), rgw, 5)
<< __func__
fs->fh_cache.remove(fh.fh_hk.object, this, FHCache::FLAG_LOCK);
}
/* cond-unref parent */
- if (parent && (! parent->is_root())) {
+ if (parent && (! parent->is_mount())) {
/* safe because if parent->unref causes its deletion,
* there are a) by refcnt, no other objects/paths pointing
* to it and b) by the semantics of valid iteration of
rgw::encode(*this, ux_attrs1);
} /* RGWFileHandle::encode_attrs */
- bool RGWFileHandle::decode_attrs(const ceph::buffer::list* ux_key1,
- const ceph::buffer::list* ux_attrs1)
+ DecodeAttrsResult RGWFileHandle::decode_attrs(const ceph::buffer::list* ux_key1,
+ const ceph::buffer::list* ux_attrs1)
{
- bool old_key = false;
+ DecodeAttrsResult dar { false, false };
fh_key fhk;
auto bl_iter_key1 = const_cast<buffer::list*>(ux_key1)->begin();
rgw::decode(fhk, bl_iter_key1);
if (fhk.version >= 2) {
assert(this->fh.fh_hk == fhk.fh_hk);
} else {
- old_key = true;
+ get<0>(dar) = true;
}
auto bl_iter_unix1 = const_cast<buffer::list*>(ux_attrs1)->begin();
rgw::decode(*this, bl_iter_unix1);
+ if (this->state.version < 2) {
+ get<1>(dar) = true;
+ }
- return old_key;
+ return dar;
} /* RGWFileHandle::decode_attrs */
bool RGWFileHandle::reclaim() {
return false;
}
- int RGWFileHandle::readdir(rgw_readdir_cb rcb, void *cb_arg, uint64_t *offset,
+ std::ostream& operator<<(std::ostream &os,
+ RGWFileHandle::readdir_offset const &offset)
+ {
+ using boost::get;
+ if (unlikely(!! get<uint64_t*>(&offset))) {
+ uint64_t* ioff = get<uint64_t*>(offset);
+ os << *ioff;
+ }
+ else
+ os << get<const char*>(offset);
+ return os;
+ }
+
+ int RGWFileHandle::readdir(rgw_readdir_cb rcb, void *cb_arg,
+ readdir_offset offset,
bool *eof, uint32_t flags)
{
using event = RGWLibFS::event;
+ using boost::get;
int rc = 0;
struct timespec now;
CephContext* cct = fs->get_context();
- if ((*offset == 0) &&
- (flags & RGW_READDIR_FLAG_DOTDOT)) {
- /* send '.' and '..' with their NFS-defined offsets */
- rcb(".", cb_arg, 1, RGW_LOOKUP_FLAG_DIR);
- rcb("..", cb_arg, 2, RGW_LOOKUP_FLAG_DIR);
- }
-
- lsubdout(fs->get_context(), rgw, 15)
- << __func__
- << " offset=" << *offset
- << dendl;
-
directory* d = get<directory>(&variant_type);
if (d) {
(void) clock_gettime(CLOCK_MONOTONIC_COARSE, &now); /* !LOCKED */
d->last_readdir = now;
}
+ bool initial_off;
+ if (likely(!! get<const char*>(&offset))) {
+ initial_off = ! get<const char*>(offset);
+ } else {
+ initial_off = (*get<uint64_t*>(offset) == 0);
+ }
+
if (is_root()) {
RGWListBucketsRequest req(cct, fs->get_user(), this, rcb, cb_arg,
offset);
(void) clock_gettime(CLOCK_MONOTONIC_COARSE, &now); /* !LOCKED */
lock_guard guard(mtx);
state.atime = now;
- if (*offset == 0)
+ if (initial_off)
set_nlink(2);
inc_nlink(req.d_count);
*eof = req.eof();
(void) clock_gettime(CLOCK_MONOTONIC_COARSE, &now); /* !LOCKED */
lock_guard guard(mtx);
state.atime = now;
- if (*offset == 0)
+ if (initial_off)
set_nlink(2);
inc_nlink(req.d_count);
*eof = req.eof();
}
}
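+    /* if this write begins before the current offset of the in-flight
+     * sequential write (real_ofs) but overlaps it, skip the
+     * already-written prefix and resume at real_ofs */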
+ int overlap = 0;
+ if ((static_cast<off_t>(off) < f->write_req->real_ofs) &&
+ ((f->write_req->real_ofs - off) <= len)) {
+ overlap = f->write_req->real_ofs - off;
+ off = f->write_req->real_ofs;
+ buffer = static_cast<char*>(buffer) + overlap;
+ len -= overlap;
+ }
+
buffer::list bl;
/* XXXX */
#if 0
rc = -EIO;
}
- *bytes_written = (rc == 0) ? len : 0;
+ *bytes_written = (rc == 0) ? (len + overlap) : 0;
return rc;
} /* RGWFileHandle::write */
struct timespec omtime = rgw_fh->get_mtime();
real_time appx_t = real_clock::now();
- s->obj_size = ofs; // XXX check ofs
+ s->obj_size = bytes_written;
perfcounter->inc(l_rgw_put_b, s->obj_size);
op_ret = get_store()->check_quota(s->bucket_owner.get_id(), s->bucket,
goto done;
}
- op_ret = get_store()->check_bucket_shards(s->bucket_info, s->bucket, bucket_quota);
+ op_ret = get_store()->check_bucket_shards(s->bucket_info, s->bucket,
+ bucket_quota);
if (op_ret < 0) {
goto done;
}
attrbl.append(val.c_str(), val.size() + 1);
}
- rgw_get_request_metadata(s->cct, s->info, attrs);
+ op_ret = rgw_get_request_metadata(s->cct, s->info, attrs);
+ if (op_ret < 0) {
+ goto done;
+ }
encode_delete_at_attr(delete_at, attrs);
/* Add a custom metadata to expose the information whether an object
/* stash access data for "mount" */
RGWLibFS* new_fs = new RGWLibFS(static_cast<CephContext*>(rgw), uid, acc_key,
- sec_key);
+ sec_key, "/");
+ assert(new_fs);
+
+ rc = new_fs->authorize(rgwlib.get_store());
+ if (rc != 0) {
+ delete new_fs;
+ return -EINVAL;
+ }
+
+ /* register fs for shared gc */
+ rgwlib.get_fe()->get_process()->register_fs(new_fs);
+
+ struct rgw_fs *fs = new_fs->get_fs();
+ fs->rgw = rgw;
+
+ /* XXX we no longer assume "/" is unique, but we aren't tracking the
+ * roots atm */
+
+ *rgw_fs = fs;
+
+ return 0;
+}
+
+int rgw_mount2(librgw_t rgw, const char *uid, const char *acc_key,
+ const char *sec_key, const char *root, struct rgw_fs **rgw_fs,
+ uint32_t flags)
+{
+ int rc = 0;
+
+ /* stash access data for "mount" */
+ RGWLibFS* new_fs = new RGWLibFS(static_cast<CephContext*>(rgw), uid, acc_key,
+ sec_key, root);
assert(new_fs);
rc = new_fs->authorize(rgwlib.get_store());
vfs_st->f_bavail = UINT64_MAX;
vfs_st->f_files = 1024; /* object count, do we have an est? */
vfs_st->f_ffree = UINT64_MAX;
- vfs_st->f_fsid[0] = fs->get_inst();
- vfs_st->f_fsid[1] = fs->get_inst();
+ vfs_st->f_fsid[0] = fs->get_fsid();
+ vfs_st->f_fsid[1] = fs->get_fsid();
vfs_st->f_flag = 0;
vfs_st->f_namemax = 4096;
return 0;
/* bad parent */
return -EINVAL;
}
+
+ lsubdout(parent->get_fs()->get_context(), rgw, 15)
+ << __func__
+ << " offset=" << *offset
+ << dendl;
+
+ if ((*offset == 0) &&
+ (flags & RGW_READDIR_FLAG_DOTDOT)) {
+ /* send '.' and '..' with their NFS-defined offsets */
+ rcb(".", cb_arg, 1, RGW_LOOKUP_FLAG_DIR);
+ rcb("..", cb_arg, 2, RGW_LOOKUP_FLAG_DIR);
+ }
+
int rc = parent->readdir(rcb, cb_arg, offset, eof, flags);
return rc;
-}
+} /* rgw_readdir */
+
+/* enumeration continuing from name */
+int rgw_readdir2(struct rgw_fs *rgw_fs,
+ struct rgw_file_handle *parent_fh, const char *name,
+ rgw_readdir_cb rcb, void *cb_arg, bool *eof,
+ uint32_t flags)
+{
+ RGWFileHandle* parent = get_rgwfh(parent_fh);
+ if (! parent) {
+ /* bad parent */
+ return -EINVAL;
+ }
+
+ lsubdout(parent->get_fs()->get_context(), rgw, 15)
+ << __func__
+ << " offset=" << name
+ << dendl;
+
+ if ((! name) &&
+ (flags & RGW_READDIR_FLAG_DOTDOT)) {
+ /* send '.' and '..' with their NFS-defined offsets */
+ rcb(".", cb_arg, 1, RGW_LOOKUP_FLAG_DIR);
+ rcb("..", cb_arg, 2, RGW_LOOKUP_FLAG_DIR);
+ }
+
+ int rc = parent->readdir(rcb, cb_arg, name, eof, flags);
+ return rc;
+} /* rgw_readdir2 */
/* project offset of dirent name */
int rgw_dirent_offset(struct rgw_fs *rgw_fs,
if (! rgw_fh->is_file())
return -EISDIR;
- if (! rgw_fh->is_open())
- return -EPERM;
+ if (! rgw_fh->is_open()) {
+ if (flags & RGW_OPEN_FLAG_V3) {
+ rc = rgw_fh->open(flags);
+ if (!! rc)
+ return rc;
+ } else
+ return -EPERM;
+ }
rc = rgw_fh->write(offset, length, bytes_written, buffer);
using boost::variant;
using boost::container::flat_map;
+ typedef std::tuple<bool, bool> DecodeAttrsResult;
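+  /* <0>: the stored fh_key predates v2; <1>: the stored unix attrs
+   * predate v2. Either flag makes the caller rewrite both via
+   * update_fh(). */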
+
class RGWFileHandle : public cohort::lru::Object
{
struct rgw_file_handle fh;
struct timespec ctime;
struct timespec mtime;
struct timespec atime;
+ uint32_t version;
State() : dev(0), size(0), nlink(1), owner_uid(0), owner_gid(0),
- ctime{0,0}, mtime{0,0}, atime{0,0} {}
+ ctime{0,0}, mtime{0,0}, atime{0,0}, version(0) {}
} state;
struct file {
static constexpr uint32_t FLAG_LOCKED = 0x0200;
static constexpr uint32_t FLAG_STATELESS_OPEN = 0x0400;
static constexpr uint32_t FLAG_EXACT_MATCH = 0x0800;
+ static constexpr uint32_t FLAG_MOUNT = 0x1000;
#define CREATE_FLAGS(x) \
((x) & ~(RGWFileHandle::FLAG_CREATE|RGWFileHandle::FLAG_LOCK))
friend class RGWLibFS;
private:
- RGWFileHandle(RGWLibFS* _fs, uint32_t fs_inst)
+ RGWFileHandle(RGWLibFS* _fs)
: fs(_fs), bucket(nullptr), parent(nullptr), variant_type{directory()},
- depth(0), flags(FLAG_ROOT)
+ depth(0), flags(FLAG_NONE)
{
/* root */
fh.fh_type = RGW_FS_TYPE_DIRECTORY;
variant_type = directory();
/* stat */
- state.dev = fs_inst;
state.unix_mode = RGW_RWXMODE|S_IFDIR;
/* pointer to self */
fh.fh_private = this;
}
- void init_rootfs(std::string& fsid, const std::string& object_name) {
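+  /* the fsid (and hence st_dev) is now derived deterministically by
+   * hashing the uid, replacing the old per-instance counter */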
+ uint64_t init_fsid(std::string& uid) {
+ return XXH64(uid.c_str(), uid.length(), fh_key::seed);
+ }
+
+ void init_rootfs(std::string& fsid, const std::string& object_name,
+ bool is_bucket) {
/* fh_key */
fh.fh_hk.bucket = XXH64(fsid.c_str(), fsid.length(), fh_key::seed);
fh.fh_hk.object = XXH64(object_name.c_str(), object_name.length(),
fh_key::seed);
fhk = fh.fh_hk;
name = object_name;
+
+ state.dev = init_fsid(fsid);
+
+ if (is_bucket) {
+ flags |= RGWFileHandle::FLAG_BUCKET | RGWFileHandle::FLAG_MOUNT;
+ bucket = this;
+ depth = 1;
+ } else {
+ flags |= RGWFileHandle::FLAG_ROOT | RGWFileHandle::FLAG_MOUNT;
+ }
}
public:
- RGWFileHandle(RGWLibFS* fs, uint32_t fs_inst, RGWFileHandle* _parent,
+ RGWFileHandle(RGWLibFS* _fs, RGWFileHandle* _parent,
const fh_key& _fhk, std::string& _name, uint32_t _flags)
- : fs(fs), bucket(nullptr), parent(_parent), name(std::move(_name)),
+ : fs(_fs), bucket(nullptr), parent(_parent), name(std::move(_name)),
fhk(_fhk), flags(_flags) {
if (parent->is_root()) {
/* save constant fhk */
fh.fh_hk = fhk.fh_hk; /* XXX redundant in fh_hk */
- /* stat */
- state.dev = fs_inst;
+ /* inherits parent's fsid */
+ state.dev = parent->state.dev;
switch (fh.fh_type) {
case RGW_FS_TYPE_DIRECTORY:
bool is_open() const { return flags & FLAG_OPEN; }
bool is_root() const { return flags & FLAG_ROOT; }
+ bool is_mount() const { return flags & FLAG_MOUNT; }
bool is_bucket() const { return flags & FLAG_BUCKET; }
bool is_object() const { return !is_bucket(); }
bool is_file() const { return (fh.fh_type == RGW_FS_TYPE_FILE); }
return -EPERM;
}
- int readdir(rgw_readdir_cb rcb, void *cb_arg, uint64_t *offset, bool *eof,
- uint32_t flags);
+ typedef boost::variant<uint64_t*, const char*> readdir_offset;
+
+ int readdir(rgw_readdir_cb rcb, void *cb_arg, readdir_offset offset,
+ bool *eof, uint32_t flags);
+
int write(uint64_t off, size_t len, size_t *nbytes, void *buffer);
int commit(uint64_t offset, uint64_t length, uint32_t flags) {
}
void encode(buffer::list& bl) const {
- ENCODE_START(1, 1, bl);
+ ENCODE_START(2, 1, bl);
::encode(uint32_t(fh.fh_type), bl);
::encode(state.dev, bl);
::encode(state.size, bl);
for (const auto& t : { state.ctime, state.mtime, state.atime }) {
::encode(real_clock::from_timespec(t), bl);
}
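+    /* unix-attr layout version; decode() reads this back into
+     * state.version */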
+ ::encode((uint32_t)2, bl);
ENCODE_FINISH(bl);
}
void decode(bufferlist::iterator& bl) {
- DECODE_START(1, bl);
+ DECODE_START(2, bl);
uint32_t fh_type;
::decode(fh_type, bl);
assert(fh.fh_type == fh_type);
::decode(enc_time, bl);
*t = real_clock::to_timespec(enc_time);
}
+ if (struct_v >= 2) {
+ ::decode(state.version, bl);
+ }
DECODE_FINISH(bl);
}
void encode_attrs(ceph::buffer::list& ux_key1,
ceph::buffer::list& ux_attrs1);
- bool decode_attrs(const ceph::buffer::list* ux_key1,
- const ceph::buffer::list* ux_attrs1);
+ DecodeAttrsResult decode_attrs(const ceph::buffer::list* ux_key1,
+ const ceph::buffer::list* ux_attrs1);
void invalidate();
{
public:
RGWLibFS* fs;
- uint32_t fs_inst;
RGWFileHandle* parent;
const fh_key& fhk;
std::string& name;
Factory() = delete;
- Factory(RGWLibFS* fs, uint32_t fs_inst, RGWFileHandle* parent,
- const fh_key& fhk, std::string& name, uint32_t flags)
- : fs(fs), fs_inst(fs_inst), parent(parent), fhk(fhk), name(name),
- flags(flags) {}
+ Factory(RGWLibFS* _fs, RGWFileHandle* _parent,
+ const fh_key& _fhk, std::string& _name, uint32_t _flags)
+ : fs(_fs), parent(_parent), fhk(_fhk), name(_name),
+ flags(_flags) {}
void recycle (cohort::lru::Object* o) override {
/* re-use an existing object */
o->~Object(); // call lru::Object virtual dtor
// placement new!
- new (o) RGWFileHandle(fs, fs_inst, parent, fhk, name, flags);
+ new (o) RGWFileHandle(fs, parent, fhk, name, flags);
}
cohort::lru::Object* alloc() override {
- return new RGWFileHandle(fs, fs_inst, parent, fhk, name, flags);
+ return new RGWFileHandle(fs, parent, fhk, name, flags);
}
}; /* Factory */
static std::atomic<uint32_t> fs_inst_counter;
static uint32_t write_completion_interval_s;
- std::string fsid;
using lock_guard = std::lock_guard<std::mutex>;
using unique_lock = std::unique_lock<std::mutex>;
};
RGWLibFS(CephContext* _cct, const char *_uid, const char *_user_id,
- const char* _key)
- : cct(_cct), root_fh(this, new_inst()), invalidate_cb(nullptr),
+ const char* _key, const char *root)
+ : cct(_cct), root_fh(this), invalidate_cb(nullptr),
invalidate_arg(nullptr), shutdown(false), refcnt(1),
fh_cache(cct->_conf->rgw_nfs_fhcache_partitions,
cct->_conf->rgw_nfs_fhcache_size),
cct->_conf->rgw_nfs_lru_lane_hiwat),
uid(_uid), key(_user_id, _key) {
- /* no bucket may be named rgw_fs_inst-(.*) */
- fsid = RGWFileHandle::root_name + "rgw_fs_inst-" +
- std::to_string(get_inst());
-
- root_fh.init_rootfs(fsid /* bucket */, RGWFileHandle::root_name);
+ if (!root || !strcmp(root, "/")) {
+ root_fh.init_rootfs(uid, RGWFileHandle::root_name, false);
+ } else {
+ root_fh.init_rootfs(uid, root, true);
+ }
/* pointer to self */
fs.fs_private = this;
/* expose public root fh */
fs.root_fh = root_fh.get_fh();
+
+ new_inst();
}
friend void intrusive_ptr_add_ref(const RGWLibFS* fs) {
fh->mtx.unlock(); /* ! LOCKED */
} else {
/* make or re-use handle */
- RGWFileHandle::Factory prototype(this, get_inst(), parent, fhk,
+ RGWFileHandle::Factory prototype(this, parent, fhk,
obj_name, CREATE_FLAGS(flags));
fh = static_cast<RGWFileHandle*>(
fh_lru.insert(&prototype,
fh_cache.insert_latched(fh, lat, RGWFileHandle::FHCache::FLAG_UNLOCK);
get<1>(fhr) |= RGWFileHandle::FLAG_CREATE;
/* ref parent (non-initial ref cannot fail on valid object) */
- if (! parent->is_root()) {
+ if (! parent->is_mount()) {
(void) fh_lru.ref(parent, cohort::lru::FLAG_NONE);
}
goto out; /* !LATCHED */
} /* lookup_fh(RGWFileHandle*, const char *, const uint32_t) */
inline void unref(RGWFileHandle* fh) {
- if (likely(! fh->is_root())) {
+ if (likely(! fh->is_mount())) {
(void) fh_lru.unref(fh, cohort::lru::FLAG_NONE);
}
}
inline RGWFileHandle* ref(RGWFileHandle* fh) {
- if (likely(! fh->is_root())) {
+ if (likely(! fh->is_mount())) {
fh_lru.ref(fh, cohort::lru::FLAG_NONE);
}
return fh;
int setattr(RGWFileHandle* rgw_fh, struct stat* st, uint32_t mask,
uint32_t flags);
- void update_fhk(RGWFileHandle *rgw_fh);
-
+ void update_fh(RGWFileHandle *rgw_fh);
LookupFHResult stat_bucket(RGWFileHandle* parent, const char *path,
RGWLibFS::BucketStats& bs,
struct rgw_fs* get_fs() { return &fs; }
- uint32_t get_inst() { return root_fh.state.dev; }
+ uint64_t get_fsid() { return root_fh.state.dev; }
RGWUserInfo* get_user() { return &user; }
{
public:
RGWFileHandle* rgw_fh;
- uint64_t* offset;
+ RGWFileHandle::readdir_offset offset;
void* cb_arg;
rgw_readdir_cb rcb;
+ uint64_t* ioff;
size_t ix;
uint32_t d_count;
RGWListBucketsRequest(CephContext* _cct, RGWUserInfo *_user,
RGWFileHandle* _rgw_fh, rgw_readdir_cb _rcb,
- void* _cb_arg, uint64_t* _offset)
+ void* _cb_arg, RGWFileHandle::readdir_offset& _offset)
: RGWLibRequest(_cct, _user), rgw_fh(_rgw_fh), offset(_offset),
- cb_arg(_cb_arg), rcb(_rcb), ix(0), d_count(0) {
- const auto& mk = rgw_fh->find_marker(*offset);
- if (mk) {
- marker = mk->name;
+ cb_arg(_cb_arg), rcb(_rcb), ioff(nullptr), ix(0), d_count(0) {
+
+ using boost::get;
+
+ if (unlikely(!! get<uint64_t*>(&offset))) {
+ ioff = get<uint64_t*>(offset);
+ const auto& mk = rgw_fh->find_marker(*ioff);
+ if (mk) {
+ marker = mk->name;
+ }
+ } else {
+ const char* mk = get<const char*>(offset);
+ if (mk) {
+ marker = mk;
+ }
}
op = this;
}
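
The constructor above relies on the two query forms of boost::get: the
pointer overload returns nullptr when the variant holds a different
alternative (so it doubles as a type test), while the reference overload
throws boost::bad_get on a mismatch. A self-contained sketch:

    #include <boost/variant.hpp>
    #include <cstdint>

    uint64_t cookie = 42;
    boost::variant<uint64_t*, const char*> v{&cookie};
    if (uint64_t** p = boost::get<uint64_t*>(&v)) {   // nullptr if not held
      **p = 0;                                        // numeric-cookie branch
    } else {
      const char* mk = boost::get<const char*>(v);    // throws bad_get on mismatch
      (void) mk;
    }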
int operator()(const boost::string_ref& name,
const boost::string_ref& marker) {
uint64_t off = XXH64(name.data(), name.length(), fh_key::seed);
- *offset = off;
+ if (!! ioff) {
+ *ioff = off;
+ }
/* update traversal cache */
rgw_fh->add_marker(off, rgw_obj_key{marker.data(), ""},
RGW_FS_TYPE_DIRECTORY);
}
bool eof() {
- lsubdout(cct, rgw, 15) << "READDIR offset: " << *offset
+ lsubdout(cct, rgw, 15) << "READDIR offset: " << offset
<< " is_truncated: " << is_truncated
<< dendl;
return !is_truncated;
{
public:
RGWFileHandle* rgw_fh;
- uint64_t* offset;
+ RGWFileHandle::readdir_offset offset;
void* cb_arg;
rgw_readdir_cb rcb;
+ uint64_t* ioff;
size_t ix;
uint32_t d_count;
RGWReaddirRequest(CephContext* _cct, RGWUserInfo *_user,
RGWFileHandle* _rgw_fh, rgw_readdir_cb _rcb,
- void* _cb_arg, uint64_t* _offset)
+ void* _cb_arg, RGWFileHandle::readdir_offset& _offset)
: RGWLibRequest(_cct, _user), rgw_fh(_rgw_fh), offset(_offset),
- cb_arg(_cb_arg), rcb(_rcb), ix(0), d_count(0) {
- const auto& mk = rgw_fh->find_marker(*offset);
- if (mk) {
- marker = *mk;
+ cb_arg(_cb_arg), rcb(_rcb), ioff(nullptr), ix(0), d_count(0) {
+
+ using boost::get;
+
+ if (unlikely(!! get<uint64_t*>(&offset))) {
+ ioff = get<uint64_t*>(offset);
+ const auto& mk = rgw_fh->find_marker(*ioff);
+ if (mk) {
+ marker = *mk;
+ }
+ } else {
+ const char* mk = get<const char*>(offset);
+ if (mk) {
+ std::string tmark{rgw_fh->relative_object_name()};
+ tmark += "/";
+ tmark += mk;
+ marker = rgw_obj_key{std::move(tmark), "", ""};
+ }
}
+
default_max = 1000; // XXX was being omitted
op = this;
}
/* hash offset of name in parent (short name) for NFS readdir cookie */
uint64_t off = XXH64(name.data(), name.length(), fh_key::seed);
- *offset = off;
+ if (unlikely(!! ioff)) {
+ *ioff = off;
+ }
/* update traversal cache */
rgw_fh->add_marker(off, marker, type);
++d_count;
}
bool eof() {
- lsubdout(cct, rgw, 15) << "READDIR offset: " << *offset
+ lsubdout(cct, rgw, 15) << "READDIR offset: " << offset
<< " next marker: " << next_marker
<< " is_truncated: " << is_truncated
<< dendl;
if ((s == "*") && wildcards) {
return ARN(Partition::wildcard, Service::wildcard, "*", "*", "*");
} else if (regex_match(s, match, wildcards ? rx_wild : rx_no_wild)) {
- ceph_assert(match.size() == 6);
+ if (match.size() != 6) {
+ return boost::none;
+ }
ARN a;
{
ECMAScript | optimize);
smatch match;
if (regex_match(a->resource, match, rx)) {
- ceph_assert(match.size() == 3);
+ if (match.size() != 3) {
+ return boost::none;
+ }
if (match[1] == "user") {
return Principal::user(std::move(a->account),
// Principals
} else if (w->kind == TokenKind::princ_type) {
- ceph_assert(pp->s.size() > 1);
+ if (pp->s.size() <= 1) {
+ return false;
+ }
auto& pri = pp->s[pp->s.size() - 2].w->id == TokenID::Principal ?
t->princ : t->noprinc;
#include "rgw_iam_policy_keywords.h"
#include "rgw_string.h"
-#include "include/assert.h" // razzin' frazzin' ...grrr.
-
class RGWRados;
namespace rgw {
namespace auth {
inline bool operator ==(const MaskedIP& l, const MaskedIP& r) {
auto shift = std::max((l.v6 ? 128 : 32) - l.prefix,
(r.v6 ? 128 : 32) - r.prefix);
- ceph_assert(shift > 0);
return (l.addr >> shift) == (r.addr >> shift);
}
utime_t ut(creation_time);
encode_json("mtime", ut, f); /* mtime / creation time discrepency needed for backward compatibility */
encode_json("count", count, f);
+ encode_json("placement_rule", placement_rule, f);
}
void RGWUploadPartInfo::dump(Formatter *f) const
~TokenCache() {
down_flag = true;
- revocator.stop();
- revocator.join();
+ // Only stop and join if revocator thread is started.
+ if (revocator.is_started()) {
+ revocator.stop();
+ revocator.join();
+ }
}
public:
localtime_r(&start_date, &bdt);
if (cct->_conf->rgw_lc_debug_interval > 0) {
- /* We're debugging, so say we can run */
- return false;
+      return now - start_date < cct->_conf->rgw_lc_debug_interval;
}
bdt.tm_hour = 0;
l.unlock(&store->lc_pool_ctx, obj_names[index]);
ret = bucket_lc_process(entry.first);
bucket_lc_post(index, max_lock_secs, entry, ret);
- return 0;
+ }while(1);
+
exit:
l.unlock(&store->lc_pool_ctx, obj_names[index]);
return 0;
-
- }while(1);
-
}
void RGWLC::start_processor()
*/
if (store->get_zonegroup().is_master_zonegroup() && s->system_request) {
/*If this is the master, don't redirect*/
+ } else if (s->op_type == RGW_OP_GET_BUCKET_LOCATION ) {
+ /* If op is get bucket location, don't redirect */
} else if (!s->local_source ||
(s->op != OP_PUT && s->op != OP_COPY) ||
s->object.empty()) {
/* start gettorrent */
if (torrent.get_flag())
{
+ attr_iter = attrs.find(RGW_ATTR_CRYPT_MODE);
+ if (attr_iter != attrs.end() && attr_iter->second.to_str() == "SSE-C-AES256") {
+ op_ret = -ERR_INVALID_REQUEST;
+ goto done_err;
+ }
torrent.init(s, store);
- torrent.get_torrent_file(op_ret, read_op, total_len, bl, obj);
+ op_ret = torrent.get_torrent_file(read_op, total_len, bl, obj);
if (op_ret < 0)
{
ldout(s->cct, 0) << "ERROR: failed to get_torrent_file ret= " << op_ret
bool started = false;
uint64_t total_count = 0;
- uint64_t max_buckets = s->cct->_conf->rgw_list_buckets_max_chunk;
+ const uint64_t max_buckets = s->cct->_conf->rgw_list_buckets_max_chunk;
op_ret = get_params();
if (op_ret < 0) {
<< s->user->user_id << dendl;
break;
}
- map<string, RGWBucketEnt>& m = buckets.get_buckets();
- map<string, RGWBucketEnt>::iterator iter;
- for (iter = m.begin(); iter != m.end(); ++iter) {
- RGWBucketEnt& bucket = iter->second;
- buckets_size += bucket.size;
- buckets_size_rounded += bucket.size_rounded;
- buckets_objcount += bucket.count;
+
+      /* We need to have stats for all our policies - even if a given policy
+       * isn't actually used in a given account. In such a situation its
+       * usage stats will simply be full of zeros. */
+ for (const auto& policy : store->get_zonegroup().placement_targets) {
+ policies_stats.emplace(policy.second.name,
+ decltype(policies_stats)::mapped_type());
+ }
+
+ std::map<std::string, RGWBucketEnt>& m = buckets.get_buckets();
+ for (const auto& kv : m) {
+ const auto& bucket = kv.second;
+
+ global_stats.bytes_used += bucket.size;
+ global_stats.bytes_used_rounded += bucket.size_rounded;
+ global_stats.objects_count += bucket.count;
+
+        /* operator[] can still create a new entry for a storage policy
+         * seen for the first time. */
+ auto& policy_stats = policies_stats[bucket.placement_rule];
+ policy_stats.bytes_used += bucket.size;
+ policy_stats.bytes_used_rounded += bucket.size_rounded;
+ policy_stats.buckets_count++;
+ policy_stats.objects_count += bucket.count;
}
- buckets_count += m.size();
+ global_stats.buckets_count += m.size();
total_count += m.size();
done = (m.size() < read_count || (limit >= 0 && total_count >= (uint64_t)limit));
}
if (!m.empty()) {
- send_response_data(buckets);
-
map<string, RGWBucketEnt>::reverse_iterator riter = m.rbegin();
marker = riter->first;
+
+ handle_listing_chunk(std::move(buckets));
}
} while (is_truncated && !done);
<< s->user->user_id << dendl;
break;
} else {
- map<string, RGWBucketEnt>& m = buckets.get_buckets();
- map<string, RGWBucketEnt>::iterator iter;
- for (iter = m.begin(); iter != m.end(); ++iter) {
- RGWBucketEnt& bucket = iter->second;
- buckets_size += bucket.size;
- buckets_size_rounded += bucket.size_rounded;
- buckets_objcount += bucket.count;
-
- marker = iter->first;
+    /* We need to have stats for all our policies - even if a given policy
+     * isn't actually used in a given account. In such a situation its
+     * usage stats will simply be full of zeros. */
+ for (const auto& policy : store->get_zonegroup().placement_targets) {
+ policies_stats.emplace(policy.second.name,
+ decltype(policies_stats)::mapped_type());
+ }
+
+ std::map<std::string, RGWBucketEnt>& m = buckets.get_buckets();
+ for (const auto& kv : m) {
+ const auto& bucket = kv.second;
+
+ global_stats.bytes_used += bucket.size;
+ global_stats.bytes_used_rounded += bucket.size_rounded;
+ global_stats.objects_count += bucket.count;
+
+      /* operator[] can still create a new entry for a storage policy
+       * seen for the first time. */
+ auto& policy_stats = policies_stats[bucket.placement_rule];
+ policy_stats.bytes_used += bucket.size;
+ policy_stats.bytes_used_rounded += bucket.size_rounded;
+ policy_stats.buckets_count++;
+ policy_stats.objects_count += bucket.count;
}
- buckets_count += m.size();
+ global_stats.buckets_count += m.size();
}
} while (is_truncated);
int RGWGetBucketVersioning::verify_permission()
{
- if (false == s->auth.identity->is_owner_of(s->bucket_owner.get_id())) {
- return -EACCES;
+ if (s->iam_policy) {
+ if (s->iam_policy->eval(s->env, *s->auth.identity,
+ rgw::IAM::s3GetBucketVersioning,
+ ARN(s->bucket)) == Effect::Allow) {
+ return 0;
+ }
+ } else if (s->auth.identity->is_owner_of(s->bucket_owner.get_id())) {
+ return 0;
}
-
- return 0;
+ return -EACCES;
}
void RGWGetBucketVersioning::pre_exec()
int RGWSetBucketVersioning::verify_permission()
{
- if (false == s->auth.identity->is_owner_of(s->bucket_owner.get_id())) {
- return -EACCES;
+ if (s->iam_policy) {
+ if (s->iam_policy->eval(s->env, *s->auth.identity,
+ rgw::IAM::s3PutBucketVersioning,
+ ARN(s->bucket)) == Effect::Allow) {
+ return 0;
+ }
+ } else if (s->auth.identity->is_owner_of(s->bucket_owner.get_id())) {
+ return 0;
}
-
- return 0;
+ return -EACCES;
}
void RGWSetBucketVersioning::pre_exec()
int RGWGetBucketWebsite::verify_permission()
{
- if (s->user->user_id.compare(s->bucket_owner.get_id()) != 0)
- return -EACCES;
+ if (s->iam_policy) {
+ if (s->iam_policy->eval(s->env, *s->auth.identity,
+ rgw::IAM::s3GetBucketWebsite,
+ ARN(s->bucket)) == Effect::Allow) {
+ return 0;
+ }
+ } else if (s->auth.identity->is_owner_of(s->bucket_owner.get_id())) {
+ return 0;
+ }
- return 0;
+ return -EACCES;
}
void RGWGetBucketWebsite::pre_exec()
int RGWSetBucketWebsite::verify_permission()
{
- if (s->user->user_id.compare(s->bucket_owner.get_id()) != 0)
- return -EACCES;
+ if (s->iam_policy) {
+ if (s->iam_policy->eval(s->env, *s->auth.identity,
+ rgw::IAM::s3PutBucketWebsite,
+ ARN(s->bucket)) == Effect::Allow) {
+ return 0;
+ }
+ } else if (s->auth.identity->is_owner_of(s->bucket_owner.get_id())) {
+ return 0;
+ }
- return 0;
+ return -EACCES;
}
void RGWSetBucketWebsite::pre_exec()
int RGWGetBucketLocation::verify_permission()
{
- if (false == s->auth.identity->is_owner_of(s->bucket_owner.get_id())) {
- return -EACCES;
+ if (s->iam_policy) {
+ if (s->iam_policy->eval(s->env, *s->auth.identity,
+ rgw::IAM::s3GetBucketLocation,
+ ARN(s->bucket)) == Effect::Allow) {
+ return 0;
+ }
+ } else if (s->auth.identity->is_owner_of(s->bucket_owner.get_id())) {
+ return 0;
}
-
- return 0;
+ return -EACCES;
}
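
The same allow-by-policy-else-fall-back-to-owner check recurs across all of
these verify_permission() overrides. A hypothetical consolidation (not part
of this patch; the exact type of the action parameter is an assumption):

    static int verify_bucket_owner_or_policy(struct req_state* const s,
                                             const uint64_t action)
    {
      if (s->iam_policy) {
        if (s->iam_policy->eval(s->env, *s->auth.identity, action,
                                ARN(s->bucket)) == Effect::Allow) {
          return 0;
        }
      } else if (s->auth.identity->is_owner_of(s->bucket_owner.get_id())) {
        return 0;
      }
      return -EACCES;
    }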
int RGWCreateBucket::verify_permission()
if (need_metadata_upload()) {
/* It's supposed that following functions WILL NOT change any special
* attributes (like RGW_ATTR_ACL) if they are already present in attrs. */
- rgw_get_request_metadata(s->cct, s->info, attrs, false);
+ op_ret = rgw_get_request_metadata(s->cct, s->info, attrs, false);
+ if (op_ret < 0) {
+ return;
+ }
prepare_add_del_attrs(s->bucket_attrs, rmattr_names, attrs);
populate_with_generic_attrs(s, attrs);
attrs.clear();
- rgw_get_request_metadata(s->cct, s->info, attrs, false);
+ op_ret = rgw_get_request_metadata(s->cct, s->info, attrs, false);
+ if (op_ret < 0) {
+ return;
+ }
prepare_add_del_attrs(s->bucket_attrs, rmattr_names, attrs);
populate_with_generic_attrs(s, attrs);
op_ret = filter_out_quota_info(attrs, rmattr_names, s->bucket_info.quota);
emplace_attr(RGW_ATTR_ETAG, std::move(bl));
populate_with_generic_attrs(s, attrs);
- rgw_get_request_metadata(s->cct, s->info, attrs);
+ op_ret = rgw_get_request_metadata(s->cct, s->info, attrs);
+ if (op_ret < 0) {
+ goto done;
+ }
encode_delete_at_attr(delete_at, attrs);
encode_obj_tags_attr(obj_tags.get(), attrs);
attrs.emplace(RGW_ATTR_ACL, std::move(acl_bl));
}
- rgw_get_request_metadata(s->cct, s->info, attrs, false);
+ op_ret = rgw_get_request_metadata(s->cct, s->info, attrs, false);
+ if (op_ret < 0) {
+ return op_ret;
+ }
prepare_add_del_attrs(orig_attrs, rmattr_names, attrs);
populate_with_generic_attrs(s, attrs);
return;
}
- rgw_get_request_metadata(s->cct, s->info, attrs, false);
+ op_ret = rgw_get_request_metadata(s->cct, s->info, attrs, false);
+ if (op_ret < 0) {
+ return;
+ }
if (!placement_rule.empty() &&
placement_rule != s->bucket_info.placement_rule) {
return;
}
- rgw_get_request_metadata(s->cct, s->info, attrs);
+ op_ret = rgw_get_request_metadata(s->cct, s->info, attrs);
+ if (op_ret < 0) {
+ return;
+ }
+
/* check if obj exists, read orig attrs */
op_ret = get_obj_attrs(store, s, obj, orig_attrs);
if (op_ret < 0) {
dest_policy.encode(aclbl);
emplace_attr(RGW_ATTR_ACL, std::move(aclbl));
- rgw_get_request_metadata(s->cct, s->info, attrs);
+ op_ret = rgw_get_request_metadata(s->cct, s->info, attrs);
+ if (op_ret < 0) {
+ return op_ret;
+ }
populate_with_generic_attrs(s, attrs);
return 0;
int RGWGetCORS::verify_permission()
{
- if (false == s->auth.identity->is_owner_of(s->bucket_owner.get_id())) {
- return -EACCES;
+ if (s->iam_policy) {
+ if (s->iam_policy->eval(s->env, *s->auth.identity,
+			   rgw::IAM::s3GetBucketCORS,
+ ARN(s->bucket)) == Effect::Allow) {
+ return 0;
+ }
+ } else if (s->auth.identity->is_owner_of(s->bucket_owner.get_id())) {
+ return 0;
}
-
- return 0;
+ return -EACCES;
}
void RGWGetCORS::execute()
int RGWPutCORS::verify_permission()
{
- if (false == s->auth.identity->is_owner_of(s->bucket_owner.get_id())) {
- return -EACCES;
+ if (s->iam_policy) {
+ if (s->iam_policy->eval(s->env, *s->auth.identity,
+ rgw::IAM::s3PutBucketCORS,
+ ARN(s->bucket)) == Effect::Allow) {
+ return 0;
+ }
+ } else if (s->auth.identity->is_owner_of(s->bucket_owner.get_id())) {
+ return 0;
}
-
- return 0;
+ return -EACCES;
}
void RGWPutCORS::execute()
int RGWGetRequestPayment::verify_permission()
{
+ if (s->iam_policy &&
+ s->iam_policy->eval(s->env, *s->auth.identity,
+ rgw::IAM::s3GetBucketRequestPayment,
+ ARN(s->bucket)) != Effect::Allow) {
+ return -EACCES;
+ }
return 0;
}
int RGWSetRequestPayment::verify_permission()
{
- if (false == s->auth.identity->is_owner_of(s->bucket_owner.get_id())) {
- return -EACCES;
+ if (s->iam_policy) {
+ if (s->iam_policy->eval(s->env, *s->auth.identity,
+ rgw::IAM::s3PutBucketRequestPayment,
+ ARN(s->bucket)) == Effect::Allow) {
+ return 0;
+ }
+ } else if (s->auth.identity->is_owner_of(s->bucket_owner.get_id())) {
+ return 0;
}
-
- return 0;
+ return -EACCES;
}
void RGWSetRequestPayment::pre_exec()
if (op_ret != 0)
return;
- rgw_get_request_metadata(s->cct, s->info, attrs);
+ op_ret = rgw_get_request_metadata(s->cct, s->info, attrs);
+ if (op_ret < 0) {
+ return;
+ }
do {
char buf[33];
from deleting the parts*/
rgw_pool meta_pool;
rgw_raw_obj raw_obj;
- librados::ObjectWriteOperation op;
- librados::IoCtx ioctx;
- rados::cls::lock::Lock l("RGWCompleteMultipart");
- int max_lock_secs_mp = s->cct->_conf->get_val<int64_t>("rgw_mp_lock_max_time");
+ int max_lock_secs_mp =
+ s->cct->_conf->get_val<int64_t>("rgw_mp_lock_max_time");
+ utime_t dur(max_lock_secs_mp, 0);
- op.assert_exists();
store->obj_to_raw((s->bucket_info).placement_rule, meta_obj, &raw_obj);
- store->get_obj_data_pool((s->bucket_info).placement_rule,meta_obj,&meta_pool);
- store->open_pool_ctx(meta_pool, ioctx);
-
- const string raw_meta_oid = raw_obj.oid;
- utime_t time(max_lock_secs_mp, 0);
- l.set_duration(time);
- l.lock_exclusive(&op);
- op_ret = ioctx.operate(raw_meta_oid, &op);
+ store->get_obj_data_pool((s->bucket_info).placement_rule,
+ meta_obj,&meta_pool);
+ store->open_pool_ctx(meta_pool, serializer.ioctx);
+ op_ret = serializer.try_lock(raw_obj.oid, dur);
if (op_ret < 0) {
dout(0) << "RGWCompleteMultipart::execute() failed to acquire lock " << dendl;
op_ret = -ERR_INTERNAL_ERROR;
obj_op.meta.owner = s->owner.get_id();
obj_op.meta.flags = PUT_OBJ_CREATE;
obj_op.meta.modify_tail = true;
+ obj_op.meta.completeMultipart = true;
op_ret = obj_op.write_meta(ofs, accounted_size, attrs);
if (op_ret < 0)
return;
// remove the upload obj
int r = store->delete_obj(*static_cast<RGWObjectCtx *>(s->obj_ctx),
s->bucket_info, meta_obj, 0);
- if (r < 0) {
- ldout(store->ctx(), 0) << "WARNING: failed to remove object " << meta_obj << dendl;
- r = l.unlock(&ioctx, raw_meta_oid);
+ if (r >= 0) {
+ /* serializer's exclusive lock is released */
+ serializer.clear_locked();
+ } else {
+ ldout(store->ctx(), 0) << "WARNING: failed to remove object "
+ << meta_obj << dendl;
+ }
+}
+
+int RGWCompleteMultipart::MPSerializer::try_lock(
+ const std::string& _oid,
+ utime_t dur)
+{
+ oid = _oid;
+ op.assert_exists();
+ lock.set_duration(dur);
+ lock.lock_exclusive(&op);
+ int ret = ioctx.operate(oid, &op);
+ if (! ret) {
+ locked = true;
+ }
+ return ret;
+}
+
+void RGWCompleteMultipart::complete()
+{
+  /* release the exclusive lock, if it has not been released already */
+ if (unlikely(serializer.locked)) {
+ int r = serializer.unlock();
if (r < 0) {
- ldout(store->ctx(), 0) << "WARNING: failed to unlock " << raw_meta_oid << dendl;
+ ldout(store->ctx(), 0) << "WARNING: failed to unlock "
+ << serializer.oid << dendl;
}
}
+ send_response();
}
int RGWAbortMultipart::verify_permission()
#include "rgw_lc.h"
#include "rgw_torrent.h"
#include "rgw_tag.h"
+#include "cls/lock/cls_lock_client.h"
+#include "cls/rgw/cls_rgw_client.h"
#include "include/assert.h"
}; /* RGWBulkUploadOp::AlignedStreamGetter */
+struct RGWUsageStats {
+ uint64_t bytes_used = 0;
+ uint64_t bytes_used_rounded = 0;
+ uint64_t buckets_count = 0;
+ uint64_t objects_count = 0;
+};
+
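
The per-policy accumulation earlier in this patch leans on the guarantee that
std::map::operator[] value-initializes a missing mapped value, so a policy
seen for the first time starts from the all-zero counters declared above. A
tiny illustration:

    #include <cassert>
    #include <map>
    #include <string>

    std::map<std::string, RGWUsageStats> ps;
    ps["new-policy"].bytes_used += 42;            // entry created zeroed, then bumped
    assert(ps["new-policy"].objects_count == 0);  // untouched members stay zero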
#define RGW_LIST_BUCKETS_LIMIT_MAX 10000
class RGWListBuckets : public RGWOp {
protected:
bool sent_data;
- string marker;
- string end_marker;
+ std::string marker;
+ std::string end_marker;
int64_t limit;
uint64_t limit_max;
- uint32_t buckets_count;
- uint64_t buckets_objcount;
- uint64_t buckets_size;
- uint64_t buckets_size_rounded;
- map<string, bufferlist> attrs;
+ std::map<std::string, ceph::bufferlist> attrs;
bool is_truncated;
+ RGWUsageStats global_stats;
+ std::map<std::string, RGWUsageStats> policies_stats;
+
virtual uint64_t get_default_max() const {
return 1000;
}
public:
- RGWListBuckets() : sent_data(false) {
- limit = limit_max = RGW_LIST_BUCKETS_LIMIT_MAX;
- buckets_count = 0;
- buckets_objcount = 0;
- buckets_size = 0;
- buckets_size_rounded = 0;
- is_truncated = false;
+ RGWListBuckets()
+ : sent_data(false),
+ limit(RGW_LIST_BUCKETS_LIMIT_MAX),
+ limit_max(RGW_LIST_BUCKETS_LIMIT_MAX),
+ is_truncated(false) {
}
int verify_permission() override;
void execute() override;
virtual int get_params() = 0;
+ virtual void handle_listing_chunk(RGWUserBuckets&& buckets) {
+    /* The default implementation, used by e.g. S3, just generates a new
+     * part of the listing and sends it to the client immediately. Swift can
+     * behave differently: when the reverse option is requested, all incoming
+ * instances of RGWUserBuckets are buffered and finally reversed. */
+ return send_response_data(buckets);
+ }
virtual void send_response_begin(bool has_buckets) = 0;
virtual void send_response_data(RGWUserBuckets& buckets) = 0;
virtual void send_response_end() = 0;
class RGWStatAccount : public RGWOp {
protected:
- uint32_t buckets_count;
- uint64_t buckets_objcount;
- uint64_t buckets_size;
- uint64_t buckets_size_rounded;
+ RGWUsageStats global_stats;
+ std::map<std::string, RGWUsageStats> policies_stats;
public:
- RGWStatAccount() {
- buckets_count = 0;
- buckets_objcount = 0;
- buckets_size = 0;
- buckets_size_rounded = 0;
- }
+ RGWStatAccount() = default;
int verify_permission() override;
void execute() override;
void send_response() override = 0;
- const string name() override { return "stat_account"; }
+ const std::string name() override { return "stat_account"; }
RGWOpType get_type() override { return RGW_OP_STAT_ACCOUNT; }
uint32_t op_mask() override { return RGW_OP_TYPE_READ; }
};
void send_response() override = 0;
const string name() override { return "get_bucket_location"; }
+ RGWOpType get_type() override { return RGW_OP_GET_BUCKET_LOCATION; }
uint32_t op_mask() override { return RGW_OP_TYPE_READ; }
};
char *data;
int len;
+ struct MPSerializer {
+ librados::IoCtx ioctx;
+ rados::cls::lock::Lock lock;
+ librados::ObjectWriteOperation op;
+ std::string oid;
+ bool locked;
+
+ MPSerializer() : lock("RGWCompleteMultipart"), locked(false)
+ {}
+
+ int try_lock(const std::string& oid, utime_t dur);
+
+ int unlock() {
+ return lock.unlock(&ioctx, oid);
+ }
+
+ void clear_locked() {
+ locked = false;
+ }
+ } serializer;
+
public:
RGWCompleteMultipart() {
data = NULL;
int verify_permission() override;
void pre_exec() override;
void execute() override;
+ void complete() override;
virtual int get_params() = 0;
void send_response() override = 0;
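
A condensed view of the serializer's life cycle, pieced together from the
execute()/complete() hunks above (error handling elided):

    utime_t dur(max_lock_secs_mp, 0);
    if (serializer.try_lock(raw_obj.oid, dur) < 0) {
      // lock contended: execute() fails with -ERR_INTERNAL_ERROR
    }
    // ... assemble and write the completed multipart object ...
    // if the meta object was deleted, its lock went with it:
    serializer.clear_locked();
    // otherwise complete() later falls back to serializer.unlock()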
* map(<attr_name, attr_contents>, where attr_name is RGW_ATTR_PREFIX.HTTP_NAME)
* s: The request state
* attrs: will be filled up with attrs mapped as <attr_name, attr_contents>
+ * On success returns 0.
+ * On failure returns a negative error code.
*
*/
-static inline void rgw_get_request_metadata(CephContext *cct,
- struct req_info& info,
- map<string, bufferlist>& attrs,
- const bool allow_empty_attrs = true)
+static inline int rgw_get_request_metadata(CephContext* const cct,
+ struct req_info& info,
+ std::map<std::string, ceph::bufferlist>& attrs,
+ const bool allow_empty_attrs = true)
{
static const std::set<std::string> blacklisted_headers = {
"x-amz-server-side-encryption-customer-algorithm",
"x-amz-server-side-encryption-customer-key",
"x-amz-server-side-encryption-customer-key-md5"
};
- map<string, string>::iterator iter;
- for (iter = info.x_meta_map.begin(); iter != info.x_meta_map.end(); ++iter) {
- const string &name(iter->first);
- string &xattr(iter->second);
+
+ size_t valid_meta_count = 0;
+ for (auto& kv : info.x_meta_map) {
+ const std::string& name = kv.first;
+ std::string& xattr = kv.second;
+
if (blacklisted_headers.count(name) == 1) {
lsubdout(cct, rgw, 10) << "skipping x>> " << name << dendl;
continue;
- }
- if (allow_empty_attrs || !xattr.empty()) {
+ } else if (allow_empty_attrs || !xattr.empty()) {
lsubdout(cct, rgw, 10) << "x>> " << name << ":" << xattr << dendl;
format_xattr(xattr);
- string attr_name(RGW_ATTR_PREFIX);
+
+ std::string attr_name(RGW_ATTR_PREFIX);
attr_name.append(name);
- map<string, bufferlist>::value_type v(attr_name, bufferlist());
- std::pair < map<string, bufferlist>::iterator, bool >
- rval(attrs.insert(v));
- bufferlist& bl(rval.first->second);
+
+        /* Roughly check whether we are exceeding the limit on attribute name
+         * length. Passing this check doesn't guarantee that an OSD will
+         * accept the attribute, as ObjectStore::get_max_attr_name_length()
+         * can set the limit even lower than the "osd_max_attr_name_len"
+         * configurable. */
+ const size_t max_attr_name_len = \
+ cct->_conf->get_val<size_t>("rgw_max_attr_name_len");
+ if (max_attr_name_len && attr_name.length() > max_attr_name_len) {
+ return -ENAMETOOLONG;
+ }
+
+        /* Similar remarks apply to the check for value size. We're verifying
+         * it early on the RGW side, as the limit is advertised in /info. */
+ const size_t max_attr_size = \
+ cct->_conf->get_val<size_t>("rgw_max_attr_size");
+ if (max_attr_size && xattr.length() > max_attr_size) {
+ return -EFBIG;
+ }
+
+        /* Swift allows administrators to limit the number of metadata items
+         * sent _in a single request_. */
+ const auto rgw_max_attrs_num_in_req = \
+ cct->_conf->get_val<size_t>("rgw_max_attrs_num_in_req");
+ if (rgw_max_attrs_num_in_req &&
+ ++valid_meta_count > rgw_max_attrs_num_in_req) {
+ return -E2BIG;
+ }
+
+ auto rval = attrs.emplace(std::move(attr_name), ceph::bufferlist());
+ /* At the moment the value of the freshly created attribute key-value
+ * pair is an empty bufferlist. */
+
+ ceph::bufferlist& bl = rval.first->second;
bl.append(xattr.c_str(), xattr.size() + 1);
}
}
+
+ return 0;
} /* rgw_get_request_metadata */
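
Each of the three checks is gated on a configurable and skipped when that
option is 0, per the `if (max_... && ...)` guards above. A hypothetical
ceph.conf fragment exercising them; the option names and error codes come
from the code, while the values and section name are illustrative assumptions:

    [client.rgw]
    rgw_max_attr_name_len = 64       # exceeding it yields -ENAMETOOLONG
    rgw_max_attr_size = 1024         # bytes per value; exceeding it yields -EFBIG
    rgw_max_attrs_num_in_req = 25    # items per request; exceeding it yields -E2BIG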
static inline void encode_delete_at_attr(boost::optional<ceph::real_time> delete_at,
quota.max_size_soft_threshold = quota.max_size * store->ctx()->_conf->rgw_bucket_quota_soft_threshold;
}
- const auto cached_stats_num_kb_rounded = rgw_rounded_kb(cached_stats.size_rounded);
- if (cached_stats_num_kb_rounded >= (uint64_t)quota.max_size_soft_threshold) {
+ if (cached_stats.size_rounded >= (uint64_t)quota.max_size_soft_threshold) {
ldout(store->ctx(), 20) << "quota: can't use cached stats, exceeded soft threshold (size): "
- << cached_stats_num_kb_rounded << " >= " << quota.max_size_soft_threshold << dendl;
+ << cached_stats.size_rounded << " >= " << quota.max_size_soft_threshold << dendl;
return false;
}
}
result->clear();
rgw_obj_key marker_obj(params.marker.name, params.marker.instance, params.ns);
-
- rgw_obj_key end_marker_obj;
- rgw_obj_index_key cur_end_marker;
- if (!params.ns.empty()) {
- end_marker_obj = rgw_obj_key(params.end_marker.name, params.end_marker.instance, params.ns);
- end_marker_obj.ns = params.ns;
- end_marker_obj.get_index_key(&cur_end_marker);
- }
rgw_obj_index_key cur_marker;
marker_obj.get_index_key(&cur_marker);
+ rgw_obj_key end_marker_obj(params.end_marker.name, params.end_marker.instance,
+ params.ns);
+ rgw_obj_index_key cur_end_marker;
+ end_marker_obj.get_index_key(&cur_end_marker);
const bool cur_end_marker_valid = !params.end_marker.empty();
rgw_obj_key prefix_obj(params.prefix);
meta.canceled = false;
/* update quota cache */
- store->quota_handler->update_stats(meta.owner, obj.bucket, (orig_exists ? 0 : 1),
- accounted_size, orig_size);
+  if (meta.completeMultipart) {
+    store->quota_handler->update_stats(meta.owner, obj.bucket,
+                                       (orig_exists ? 0 : 1), 0, orig_size);
+  } else {
+    store->quota_handler->update_stats(meta.owner, obj.bucket,
+                                       (orig_exists ? 0 : 1),
+                                       accounted_size, orig_size);
+  }
return 0;
done_cancel:
if (tail_placement.bucket.name.empty()) {
manifest.set_tail_placement(tail_placement.placement_rule, src_obj.bucket);
}
+ string ref_tag;
for (; miter != astate->manifest.obj_end(); ++miter) {
ObjectWriteOperation op;
- cls_refcount_get(op, tag, true);
+ ref_tag = tag + '\0';
+ cls_refcount_get(op, ref_tag, true);
const rgw_raw_obj& loc = miter.get_location().get_raw_obj(this);
ref.ioctx.locator_set_key(loc.loc);
index_op.set_zones_trace(params.zones_trace);
index_op.set_bilog_flags(params.bilog_flags);
-
r = index_op.prepare(CLS_RGW_OP_DEL, &state->write_tag);
if (r < 0)
return r;
return r;
bl.append(tag.c_str(), tag.size() + 1);
-
op.setxattr(RGW_ATTR_ID_TAG, bl);
}
+
+ real_time mtime = real_clock::now();
+ struct timespec mtime_ts = real_clock::to_timespec(mtime);
+ op.mtime2(&mtime_ts);
r = ref.ioctx.operate(ref.oid, &op);
if (state) {
if (r >= 0) {
string content_type(content_type_bl.c_str(), content_type_bl.length());
uint64_t epoch = ref.ioctx.get_last_version();
int64_t poolid = ref.ioctx.get_id();
- real_time mtime = real_clock::now();
r = index_op.complete(poolid, epoch, state->size, state->accounted_size,
mtime, etag, content_type, &acl_bl,
RGW_OBJ_CATEGORY_MAIN, NULL);
ent.size_rounded += stats.total_size_rounded;
}
}
+
+ // fill in placement_rule from the bucket instance for use in swift's
+ // per-storage policy statistics
+ ent.placement_rule = std::move(bucket_info.placement_rule);
}
return m.size();
const string& name = vcurrents[pos]->first;
struct rgw_bucket_dir_entry& dirent = vcurrents[pos]->second;
- bool force_check = force_check_filter && force_check_filter(dirent.key.name);
- if ((!dirent.exists && !dirent.is_delete_marker()) || !dirent.pending_map.empty() || force_check) {
+ bool force_check = force_check_filter &&
+ force_check_filter(dirent.key.name);
+ if ((!dirent.exists && !dirent.is_delete_marker()) ||
+ !dirent.pending_map.empty() ||
+ force_check) {
/* there are uncommitted ops. We need to check the current state,
* and if the tags are old we need to do cleanup as well. */
librados::IoCtx sub_ctx;
int get_zonegroup(RGWZoneGroup& zonegroup,
const string& zonegroup_id);
- bool is_single_zonegroup()
+ bool is_single_zonegroup() const
{
return (period_map.zonegroups.size() == 1);
}
const string *user_data;
rgw_zone_set *zones_trace;
bool modify_tail;
+ bool completeMultipart;
MetaParams() : mtime(NULL), rmattrs(NULL), data(NULL), manifest(NULL), ptag(NULL),
remove_objs(NULL), category(RGW_OBJ_CATEGORY_MAIN), flags(0),
if_match(NULL), if_nomatch(NULL), olh_epoch(0), canceled(false), user_data(nullptr), zones_trace(nullptr),
- modify_tail(false) {}
+ modify_tail(false), completeMultipart(false) {}
} meta;
explicit Write(RGWRados::Object *_target) : target(_target) {}
(get_zonegroup().zones.size() > 1 || current_period.is_multi_zonegroups_with_zones());
}
+ bool can_reshard() const {
+ return current_period.get_id().empty() ||
+ (zonegroup.zones.size() == 1 && current_period.is_single_zonegroup());
+ }
+
librados::Rados* get_rados_handle();
int delete_raw_obj_aio(const rgw_raw_obj& obj, list<librados::AioCompletion *>& handles);
int RGWReshard::add(cls_rgw_reshard_entry& entry)
{
+ if (!store->can_reshard()) {
+ ldout(store->ctx(), 20) << __func__ << " Resharding is disabled" << dendl;
+ return 0;
+ }
+
string logshard_oid;
get_bucket_logshard_oid(entry.tenant, entry.bucket_name, &logshard_oid);
int RGWReshard::process_all_logshards()
{
+ if (!store->can_reshard()) {
+ ldout(store->ctx(), 20) << __func__ << " Resharding is disabled" << dendl;
+ return 0;
+ }
int ret = 0;
for (int i = 0; i < num_logshards; i++) {
utime_t last_run;
do {
utime_t start = ceph_clock_now();
- ldout(cct, 2) << "object expiration: start" << dendl;
if (reshard->process_all_logshards()) {
/* All shards have been processed properly. Next time we can start
* from this moment. */
last_run = start;
}
- ldout(cct, 2) << "object expiration: stop" << dendl;
-
if (reshard->going_down())
break;
extern void dump_header(struct req_state* s,
const boost::string_ref& name,
const utime_t& val);
+
template <class... Args>
static inline void dump_header_prefixed(struct req_state* s,
const boost::string_ref& name_prefix,
return dump_header(s, std::move(full_name), std::forward<Args>(args)...);
}
+template <class... Args>
+static inline void dump_header_infixed(struct req_state* s,
+                                       const boost::string_ref& prefix,
+                                       const boost::string_ref& infix,
+                                       const boost::string_ref& suffix,
+                                       Args&&... args) {
+  char full_name_buf[prefix.size() + infix.size() + suffix.size() + 1];
+  const auto len = snprintf(full_name_buf, sizeof(full_name_buf), "%.*s%.*s%.*s",
+                            static_cast<int>(prefix.length()),
+                            prefix.data(),
+                            static_cast<int>(infix.length()),
+                            infix.data(),
+                            static_cast<int>(suffix.length()),
+                            suffix.data());
+  boost::string_ref full_name(full_name_buf, len);
+  return dump_header(s, std::move(full_name), std::forward<Args>(args)...);
+}
+
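
Usage of the new helper, as it appears in the Swift account-stats hunk later
in this patch; the three pieces are spliced into a single header name:

    dump_header_infixed(s, "X-Account-Storage-Policy-", policy_name,
                        "-Bytes-Used", policy_stats.bytes_used);
    // emits: X-Account-Storage-Policy-<PolicyName>-Bytes-Used: <value>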
template <class... Args>
static inline void dump_header_quoted(struct req_state* s,
const boost::string_ref& name,
// vim: ts=8 sw=2 smarttab
#include <boost/algorithm/string/predicate.hpp>
+#include <boost/format.hpp>
#include <boost/optional.hpp>
#include <boost/utility/in_place_factory.hpp>
prefix = s->info.args.get("prefix");
marker = s->info.args.get("marker");
end_marker = s->info.args.get("end_marker");
+ wants_reversed = s->info.args.exists("reverse");
- string limit_str = s->info.args.get("limit");
+ if (wants_reversed) {
+ std::swap(marker, end_marker);
+ }
+
+ std::string limit_str = s->info.args.get("limit");
if (!limit_str.empty()) {
- string err;
+ std::string err;
long l = strict_strtol(limit_str.c_str(), 10, &err);
if (!err.empty()) {
return -EINVAL;
}
static void dump_account_metadata(struct req_state * const s,
- const uint32_t buckets_count,
- const uint64_t buckets_object_count,
- const uint64_t buckets_size,
- const uint64_t buckets_size_rounded,
+ const RGWUsageStats& global_stats,
+ const std::map<std::string, RGWUsageStats> policies_stats,
/* const */map<string, bufferlist>& attrs,
const RGWQuotaInfo& quota,
const RGWAccessControlPolicy_SWIFTAcct &policy)
/* Adding X-Timestamp to keep align with Swift API */
dump_header(s, "X-Timestamp", ceph_clock_now());
- dump_header(s, "X-Account-Container-Count", buckets_count);
- dump_header(s, "X-Account-Object-Count", buckets_object_count);
- dump_header(s, "X-Account-Bytes-Used", buckets_size);
- dump_header(s, "X-Account-Bytes-Used-Actual", buckets_size_rounded);
+ dump_header(s, "X-Account-Container-Count", global_stats.buckets_count);
+ dump_header(s, "X-Account-Object-Count", global_stats.objects_count);
+ dump_header(s, "X-Account-Bytes-Used", global_stats.bytes_used);
+ dump_header(s, "X-Account-Bytes-Used-Actual", global_stats.bytes_used_rounded);
+
+ for (const auto& kv : policies_stats) {
+ const auto& policy_name = camelcase_dash_http_attr(kv.first);
+ const auto& policy_stats = kv.second;
+
+ dump_header_infixed(s, "X-Account-Storage-Policy-", policy_name,
+ "-Container-Count", policy_stats.buckets_count);
+ dump_header_infixed(s, "X-Account-Storage-Policy-", policy_name,
+ "-Object-Count", policy_stats.objects_count);
+ dump_header_infixed(s, "X-Account-Storage-Policy-", policy_name,
+ "-Bytes-Used", policy_stats.bytes_used);
+ dump_header_infixed(s, "X-Account-Storage-Policy-", policy_name,
+ "-Bytes-Used-Actual", policy_stats.bytes_used_rounded);
+ }
/* Dump TempURL-related stuff */
if (s->perm_mask == RGW_PERM_FULL_CONTROL) {
if (! s->cct->_conf->rgw_swift_enforce_content_length) {
/* Adding account stats in the header to keep align with Swift API */
dump_account_metadata(s,
- buckets_count,
- buckets_objcount,
- buckets_size,
- buckets_size_rounded,
+ global_stats,
+ policies_stats,
attrs,
user_quota,
static_cast<RGWAccessControlPolicy_SWIFTAcct&>(*s->user_acl));
}
}
+void RGWListBuckets_ObjStore_SWIFT::handle_listing_chunk(RGWUserBuckets&& buckets)
+{
+ if (wants_reversed) {
+ /* Just store in the reversal buffer. Its content will be handled later,
+ * in send_response_end(). */
+ reverse_buffer.emplace(std::begin(reverse_buffer), std::move(buckets));
+ } else {
+ return send_response_data(buckets);
+ }
+}
+
void RGWListBuckets_ObjStore_SWIFT::send_response_data(RGWUserBuckets& buckets)
{
if (! sent_data) {
for (auto iter = m.lower_bound(prefix);
iter != m.end() && boost::algorithm::starts_with(iter->first, prefix);
++iter) {
- const RGWBucketEnt& obj = iter->second;
+ dump_bucket_entry(iter->second);
+ }
+}
- s->formatter->open_object_section("container");
- s->formatter->dump_string("name", obj.bucket.name);
- if (need_stats) {
- s->formatter->dump_int("count", obj.count);
- s->formatter->dump_int("bytes", obj.size);
- }
- s->formatter->close_section();
- if (! s->cct->_conf->rgw_swift_enforce_content_length) {
- rgw_flush_formatter(s, s->formatter);
- }
+void RGWListBuckets_ObjStore_SWIFT::dump_bucket_entry(const RGWBucketEnt& obj)
+{
+ s->formatter->open_object_section("container");
+ s->formatter->dump_string("name", obj.bucket.name);
+
+ if (need_stats) {
+ s->formatter->dump_int("count", obj.count);
+ s->formatter->dump_int("bytes", obj.size);
+ }
+
+ s->formatter->close_section();
+
+ if (! s->cct->_conf->rgw_swift_enforce_content_length) {
+ rgw_flush_formatter(s, s->formatter);
+ }
+}
+
+void RGWListBuckets_ObjStore_SWIFT::send_response_data_reversed(RGWUserBuckets& buckets)
+{
+ if (! sent_data) {
+ return;
+ }
+
+  /* Take care of the prefix parameter of the Swift API. There is no point
+   * in applying the filter earlier, as we need to go through all entries
+   * regardless of it (headers like X-Account-Container-Count aren't
+   * affected by specifying a prefix). */
+ std::map<std::string, RGWBucketEnt>& m = buckets.get_buckets();
+
+ auto iter = m.rbegin();
+ for (/* initialized above */;
+ iter != m.rend() && !boost::algorithm::starts_with(iter->first, prefix);
+ ++iter) {
+ /* NOP */;
+ }
+
+ for (/* iter carried */;
+ iter != m.rend() && boost::algorithm::starts_with(iter->first, prefix);
+ ++iter) {
+ dump_bucket_entry(iter->second);
}
}
void RGWListBuckets_ObjStore_SWIFT::send_response_end()
{
+ if (wants_reversed) {
+ for (auto& buckets : reverse_buffer) {
+ send_response_data_reversed(buckets);
+ }
+ }
+
if (sent_data) {
s->formatter->close_section();
}
if (s->cct->_conf->rgw_swift_enforce_content_length) {
/* Adding account stats in the header to keep align with Swift API */
dump_account_metadata(s,
- buckets_count,
- buckets_objcount,
- buckets_size,
- buckets_size_rounded,
+ global_stats,
+ policies_stats,
attrs,
user_quota,
static_cast<RGWAccessControlPolicy_SWIFTAcct&>(*s->user_acl));
dump_errno(s);
- end_header(s, NULL, NULL, s->formatter->get_len(), true);
+ end_header(s, nullptr, nullptr, s->formatter->get_len(), true);
}
if (sent_data || s->cct->_conf->rgw_swift_enforce_content_length) {
if (op_ret >= 0) {
op_ret = STATUS_NO_CONTENT;
dump_account_metadata(s,
- buckets_count,
- buckets_objcount,
- buckets_size,
- buckets_size_rounded,
+ global_stats,
+ policies_stats,
attrs,
user_quota,
static_cast<RGWAccessControlPolicy_SWIFTAcct&>(*s->user_acl));
return get_swift_versioning_settings(s, swift_ver_location);
}
+static inline int handle_metadata_errors(req_state* const s, const int op_ret)
+{
+ if (op_ret == -EFBIG) {
+ /* Handle the custom error message of exceeding maximum custom attribute
+ * (stored as xattr) size. */
+ const auto error_message = boost::str(
+ boost::format("Metadata value longer than %lld")
+ % s->cct->_conf->get_val<size_t>("rgw_max_attr_size"));
+ set_req_state_err(s, EINVAL, error_message);
+ return -EINVAL;
+ } else if (op_ret == -E2BIG) {
+ const auto error_message = boost::str(
+ boost::format("Too many metadata items; max %lld")
+ % s->cct->_conf->get_val<size_t>("rgw_max_attrs_num_in_req"));
+ set_req_state_err(s, EINVAL, error_message);
+ return -EINVAL;
+ }
+
+ return op_ret;
+}
+
void RGWCreateBucket_ObjStore_SWIFT::send_response()
{
- if (! op_ret)
- op_ret = STATUS_CREATED;
- else if (op_ret == -ERR_BUCKET_EXISTS)
- op_ret = STATUS_ACCEPTED;
- set_req_state_err(s, op_ret);
+ const auto meta_ret = handle_metadata_errors(s, op_ret);
+ if (meta_ret != op_ret) {
+ op_ret = meta_ret;
+ } else {
+ if (!op_ret) {
+ op_ret = STATUS_CREATED;
+ } else if (op_ret == -ERR_BUCKET_EXISTS) {
+ op_ret = STATUS_ACCEPTED;
+ }
+ set_req_state_err(s, op_ret);
+ }
+
dump_errno(s);
/* Propose ending HTTP header with 0 Content-Length header. */
end_header(s, NULL, NULL, 0);
void RGWPutObj_ObjStore_SWIFT::send_response()
{
- if (! op_ret) {
- op_ret = STATUS_CREATED;
+ const auto meta_ret = handle_metadata_errors(s, op_ret);
+  if (meta_ret != op_ret) {
+ op_ret = meta_ret;
+ } else {
+ if (!op_ret) {
+ op_ret = STATUS_CREATED;
+ }
+ set_req_state_err(s, op_ret);
}
if (! lo_etag.empty()) {
void RGWPutMetadataAccount_ObjStore_SWIFT::send_response()
{
- if (! op_ret) {
- op_ret = STATUS_NO_CONTENT;
+ const auto meta_ret = handle_metadata_errors(s, op_ret);
+ if (meta_ret != op_ret) {
+ op_ret = meta_ret;
+ } else {
+ if (!op_ret) {
+ op_ret = STATUS_NO_CONTENT;
+ }
+ set_req_state_err(s, op_ret);
}
- set_req_state_err(s, op_ret);
+
dump_errno(s);
end_header(s, this);
rgw_flush_formatter_and_reset(s, s->formatter);
void RGWPutMetadataBucket_ObjStore_SWIFT::send_response()
{
- if (!op_ret && (op_ret != -EINVAL)) {
- op_ret = STATUS_NO_CONTENT;
+ const auto meta_ret = handle_metadata_errors(s, op_ret);
+ if (meta_ret != op_ret) {
+ op_ret = meta_ret;
+ } else {
+ if (!op_ret && (op_ret != -EINVAL)) {
+ op_ret = STATUS_NO_CONTENT;
+ }
+ set_req_state_err(s, op_ret);
}
- set_req_state_err(s, op_ret);
+
dump_errno(s);
end_header(s, this);
rgw_flush_formatter_and_reset(s, s->formatter);
void RGWPutMetadataObject_ObjStore_SWIFT::send_response()
{
- if (! op_ret) {
- op_ret = STATUS_ACCEPTED;
+ const auto meta_ret = handle_metadata_errors(s, op_ret);
+ if (meta_ret != op_ret) {
+ op_ret = meta_ret;
+ } else {
+ if (!op_ret) {
+ op_ret = STATUS_ACCEPTED;
+ }
+ set_req_state_err(s, op_ret);
}
- set_req_state_err(s, op_ret);
+
if (!s->is_err()) {
dump_content_length(s, 0);
}
+
dump_errno(s);
end_header(s, this);
rgw_flush_formatter_and_reset(s, s->formatter);
string ceph_version(CEPH_GIT_NICE_VER);
formatter.dump_string("version", ceph_version);
- formatter.dump_int("max_meta_name_length", 81);
+
+ const size_t max_attr_name_len = \
+ g_conf->get_val<size_t>("rgw_max_attr_name_len");
+ if (max_attr_name_len) {
+ const size_t meta_name_limit = \
+ max_attr_name_len - strlen(RGW_ATTR_PREFIX RGW_AMZ_META_PREFIX);
+ formatter.dump_int("max_meta_name_length", meta_name_limit);
+ }
+
+ const size_t meta_value_limit = g_conf->get_val<size_t>("rgw_max_attr_size");
+ if (meta_value_limit) {
+ formatter.dump_int("max_meta_value_length", meta_value_limit);
+ }
+
+ const size_t meta_num_limit = \
+ g_conf->get_val<size_t>("rgw_max_attrs_num_in_req");
+ if (meta_num_limit) {
+ formatter.dump_int("max_meta_count", meta_num_limit);
+ }
formatter.open_array_section("policies");
RGWZoneGroup& zonegroup = store.get_zonegroup();
int RGWHandler_REST_SWIFT::validate_bucket_name(const string& bucket)
{
- int ret = RGWHandler_REST::validate_bucket_name(bucket);
- if (ret < 0)
- return ret;
+ const size_t len = bucket.size();
- int len = bucket.size();
+ if (len > MAX_BUCKET_NAME_LEN) {
+    /* Bucket name too long. Generate a custom error message. */
+ const auto msg = boost::str(
+ boost::format("Container name length of %lld longer than %lld")
+ % len % int(MAX_BUCKET_NAME_LEN));
+ set_req_state_err(s, ERR_INVALID_BUCKET_NAME, msg);
+ return -ERR_INVALID_BUCKET_NAME;
+ }
+
+ const auto ret = RGWHandler_REST::validate_bucket_name(bucket);
+ if (ret < 0) {
+ return ret;
+ }
if (len == 0)
return 0;
const char *s = bucket.c_str();
- for (int i = 0; i < len; ++i, ++s) {
+ for (size_t i = 0; i < len; ++i, ++s) {
if (*(unsigned char *)s == 0xff)
return -ERR_INVALID_BUCKET_NAME;
}
class RGWListBuckets_ObjStore_SWIFT : public RGWListBuckets_ObjStore {
bool need_stats;
+ bool wants_reversed;
std::string prefix;
+ std::vector<RGWUserBuckets> reverse_buffer;
uint64_t get_default_max() const override {
return 0;
}
+
public:
- RGWListBuckets_ObjStore_SWIFT() : need_stats(true) {}
+ RGWListBuckets_ObjStore_SWIFT()
+ : need_stats(true),
+ wants_reversed(false) {
+ }
~RGWListBuckets_ObjStore_SWIFT() override {}
int get_params() override;
+ void handle_listing_chunk(RGWUserBuckets&& buckets) override;
void send_response_begin(bool has_buckets) override;
void send_response_data(RGWUserBuckets& buckets) override;
+ void send_response_data_reversed(RGWUserBuckets& buckets);
+ void dump_bucket_entry(const RGWBucketEnt& obj);
void send_response_end() override;
bool should_get_stats() override { return need_stats; }
}
~RGWHandler_REST_SWIFT() override = default;
- static int validate_bucket_name(const string& bucket);
+ int validate_bucket_name(const string& bucket);
int init(RGWRados *store, struct req_state *s, rgw::io::BasicClient *cio) override;
int authorize() override;
std::string uid_str;
bool fetch_stats;
+ bool sync_stats;
RESTArgs::get_string(s, "uid", uid_str, &uid_str);
RESTArgs::get_bool(s, "stats", false, &fetch_stats);
+ RESTArgs::get_bool(s, "sync", false, &sync_stats);
+
op_state.set_user_id(uid);
op_state.set_fetch_stats(fetch_stats);
+ op_state.set_sync_stats(sync_stats);
http_ret = RGWUserAdminOp_User::info(store, op_state, flusher);
}
const boost::optional<const std::string&> prefix;
public:
- PrefixableSignatureHelper(const std::string& decoded_uri,
+ PrefixableSignatureHelper(const std::string& _decoded_uri,
const std::string& object_name,
const boost::optional<const std::string&> prefix)
- : decoded_uri(decoded_uri),
+ : decoded_uri(_decoded_uri),
object_name(object_name),
prefix(prefix) {
- /* Transform: v1/acct/cont/obj - > v1/acct/cont/ */
+    /* Transform: v1/acct/cont/obj -> v1/acct/cont/
+     *
+     * NOTE(rzarzynski): we really want to substr() on boost::string_view,
+     * not std::string. Otherwise we would end up with no_obj_uri referencing
+     * a temporary. */
no_obj_uri = \
decoded_uri.substr(0, decoded_uri.length() - object_name.length());
}
/* The engines. */
const rgw::auth::swift::TempURLEngine tempurl_engine;
const rgw::auth::swift::SignedTokenEngine signed_engine;
- const rgw::auth::keystone::TokenEngine keystone_engine;
+ boost::optional <const rgw::auth::keystone::TokenEngine> keystone_engine;
const rgw::auth::swift::ExternalTokenEngine external_engine;
const rgw::auth::swift::SwiftAnonymousEngine anon_engine;
store,
static_cast<rgw::auth::TokenExtractor*>(this),
static_cast<rgw::auth::LocalApplier::Factory*>(this)),
- keystone_engine(cct,
- static_cast<rgw::auth::TokenExtractor*>(this),
- static_cast<rgw::auth::RemoteApplier::Factory*>(this),
- keystone_config_t::get_instance(),
- keystone_cache_t::get_instance<keystone_config_t>()),
external_engine(cct,
store,
static_cast<rgw::auth::TokenExtractor*>(this),
    /* The auth strategy is responsible for deciding whether a particular
     * engine is disabled or not. */
if (! cct->_conf->rgw_keystone_url.empty()) {
- add_engine(Control::SUFFICIENT, keystone_engine);
+ keystone_engine.emplace(cct,
+ static_cast<rgw::auth::TokenExtractor*>(this),
+ static_cast<rgw::auth::RemoteApplier::Factory*>(this),
+ keystone_config_t::get_instance(),
+ keystone_cache_t::get_instance<keystone_config_t>());
+
+ add_engine(Control::SUFFICIENT, *keystone_engine);
}
if (! cct->_conf->rgw_swift_auth_url.empty()) {
add_engine(Control::SUFFICIENT, external_engine);
store = p_store;
}
-void seed::get_torrent_file(int &op_ret, RGWRados::Object::Read &read_op, uint64_t &total_len,
- bufferlist &bl_data, rgw_obj &obj)
+int seed::get_torrent_file(RGWRados::Object::Read &read_op,
+ uint64_t &total_len,
+ ceph::bufferlist &bl_data,
+ rgw_obj &obj)
{
/* add other field if config is set */
dencode.bencode_dict(bl);
ldout(s->cct, 0) << "NOTICE: head obj oid= " << oid << dendl;
obj_key.insert(RGW_OBJ_TORRENT);
- op_ret = read_op.state.io_ctx.omap_get_vals_by_keys(oid, obj_key, &m);
+ const int op_ret = read_op.state.io_ctx.omap_get_vals_by_keys(oid, obj_key, &m);
if (op_ret < 0)
{
- ldout(s->cct, 0) << "ERROR: failed to omap_get_vals_by_keys op_ret = " << op_ret << dendl;
- return;
+ ldout(s->cct, 0) << "ERROR: failed to omap_get_vals_by_keys op_ret = "
+ << op_ret << dendl;
+ return op_ret;
}
map<string, bufferlist>::iterator iter;
bl_data = bl;
total_len = bl.length();
- return;
+ return 0;
}
bool seed::get_flag()
int get_params();
void init(struct req_state *p_req, RGWRados *p_store);
- void get_torrent_file(int &op_ret, RGWRados::Object::Read &read_op,
- uint64_t &total_len, bufferlist &bl_data, rgw_obj &obj);
+ int get_torrent_file(RGWRados::Object::Read &read_op,
+ uint64_t &total_len,
+ ceph::bufferlist &bl_data,
+ rgw_obj &obj);
off_t get_data_len();
bool get_flag();
{
bool found = false;
std::string swift_user;
- rgw_user& uid = op_state.get_user_id();
+ user_id = op_state.get_user_id();
std::string user_email = op_state.get_user_email();
std::string access_key = op_state.get_access_key();
std::string subuser = op_state.get_subuser();
clear_populated();
- if (uid.empty() && !subuser.empty()) {
+ if (user_id.empty() && !subuser.empty()) {
size_t pos = subuser.find(':');
if (pos != string::npos) {
- uid = subuser.substr(0, pos);
- op_state.set_user_id(uid);
+ user_id = subuser.substr(0, pos);
+ op_state.set_user_id(user_id);
}
}
- if (!uid.empty() && (uid.compare(RGW_USER_ANON_ID) != 0)) {
- found = (rgw_get_user_info_by_uid(store, uid, user_info, &op_state.objv) >= 0);
+ if (!user_id.empty() && (user_id.compare(RGW_USER_ANON_ID) != 0)) {
+ found = (rgw_get_user_info_by_uid(store, user_id, user_info, &op_state.objv) >= 0);
op_state.found_by_uid = found;
}
if (!user_email.empty() && !found) {
set_populated();
}
- user_id = user_info.user_id;
+ if (user_id.empty()) {
+ user_id = user_info.user_id;
+ }
op_state.set_initialized();
// this may have been called by a helper object
if (ret < 0)
return ret;
+ if (op_state.sync_stats) {
+ ret = rgw_user_sync_all_stats(store, info.user_id);
+ if (ret < 0) {
+ return ret;
+ }
+ }
+
RGWStorageStats stats;
RGWStorageStats *arg_stats = NULL;
if (op_state.fetch_stats) {
__u8 system;
__u8 exclusive;
__u8 fetch_stats;
+ __u8 sync_stats;
std::string caps;
RGWObjVersionTracker objv;
uint32_t op_mask;
fetch_stats = is_fetch_stats;
}
+ void set_sync_stats(__u8 is_sync_stats) {
+ sync_stats = is_sync_stats;
+ }
+
void set_user_info(RGWUserInfo& user_info) {
user_id = user_info.user_id;
info = user_info;
add_subdirectory(cls_numops)
add_subdirectory(cls_sdk)
if(WITH_RBD)
+ add_subdirectory(cls_journal)
add_subdirectory(cls_rbd)
endif(WITH_RBD)
add_subdirectory(cls_refcount)
${EXTRALIBS}
)
+# ceph_test_librgw_file_marker (READDIR with string and uint64 offsets)
+add_executable(ceph_test_librgw_file_marker
+ librgw_file_marker.cc
+ )
+set_target_properties(ceph_test_librgw_file_marker PROPERTIES COMPILE_FLAGS
+ ${UNITTEST_CXX_FLAGS})
+target_link_libraries(ceph_test_librgw_file_marker
+ rgw
+ librados
+ ceph-common
+ ${UNITTEST_LIBS}
+ ${EXTRALIBS}
+ )
+
# ceph_test_rgw_token
add_executable(ceph_test_rgw_token
test_rgw_token.cc
+++ /dev/null
-#!/usr/bin/env python
-
-from __future__ import print_function
-from subprocess import call
-try:
- from subprocess import check_output
-except ImportError:
- def check_output(*popenargs, **kwargs):
- import subprocess
- # backported from python 2.7 stdlib
- process = subprocess.Popen(
- stdout=subprocess.PIPE, *popenargs, **kwargs)
- output, unused_err = process.communicate()
- retcode = process.poll()
- if retcode:
- cmd = kwargs.get("args")
- if cmd is None:
- cmd = popenargs[0]
- error = subprocess.CalledProcessError(retcode, cmd)
- error.output = output
- raise error
- return output
-
-import filecmp
-import os
-import subprocess
-import math
-import time
-import sys
-import re
-import logging
-import json
-import tempfile
-import platform
-
-try:
- from subprocess import DEVNULL
-except ImportError:
- DEVNULL = open(os.devnull, "wb")
-
-logging.basicConfig(format='%(levelname)s:%(message)s', level=logging.WARNING)
-
-
-if sys.version_info[0] >= 3:
- def decode(s):
- return s.decode('utf-8')
-
- def check_output(*args, **kwargs):
- return decode(subprocess.check_output(*args, **kwargs))
-else:
- def decode(s):
- return s
-
-
-
-def wait_for_health():
- print("Wait for health_ok...", end="")
- tries = 0
- while call("{path}/ceph health 2> /dev/null | grep -v 'HEALTH_OK\|HEALTH_WARN' > /dev/null".format(path=CEPH_BIN), shell=True) == 0:
- tries += 1
- if tries == 150:
- raise Exception("Time exceeded to go to health")
- time.sleep(1)
- print("DONE")
-
-
-def get_pool_id(name, nullfd):
- cmd = "{path}/ceph osd pool stats {pool}".format(pool=name, path=CEPH_BIN).split()
- # pool {pool} id # .... grab the 4 field
- return check_output(cmd, stderr=nullfd).split()[3]
-
-
-# return a list of unique PGS given an osd subdirectory
-def get_osd_pgs(SUBDIR, ID):
- PGS = []
- if ID:
- endhead = re.compile("{id}.*_head$".format(id=ID))
- DIR = os.path.join(SUBDIR, "current")
- PGS += [f for f in os.listdir(DIR) if os.path.isdir(os.path.join(DIR, f)) and (ID is None or endhead.match(f))]
- PGS = [re.sub("_head", "", p) for p in PGS if "_head" in p]
- return PGS
-
-
-# return a sorted list of unique PGs given a directory
-def get_pgs(DIR, ID):
- OSDS = [f for f in os.listdir(DIR) if os.path.isdir(os.path.join(DIR, f)) and f.find("osd") == 0]
- PGS = []
- for d in OSDS:
- SUBDIR = os.path.join(DIR, d)
- PGS += get_osd_pgs(SUBDIR, ID)
- return sorted(set(PGS))
-
-
-# return a sorted list of PGS a subset of ALLPGS that contain objects with prefix specified
-def get_objs(ALLPGS, prefix, DIR, ID):
- OSDS = [f for f in os.listdir(DIR) if os.path.isdir(os.path.join(DIR, f)) and f.find("osd") == 0]
- PGS = []
- for d in OSDS:
- DIRL2 = os.path.join(DIR, d)
- SUBDIR = os.path.join(DIRL2, "current")
- for p in ALLPGS:
- PGDIR = p + "_head"
- if not os.path.isdir(os.path.join(SUBDIR, PGDIR)):
- continue
- FINALDIR = os.path.join(SUBDIR, PGDIR)
- # See if there are any objects there
- if any(f for f in [val for _, _, fl in os.walk(FINALDIR) for val in fl] if f.startswith(prefix)):
- PGS += [p]
- return sorted(set(PGS))
-
-
-# return a sorted list of OSDS which have data from a given PG
-def get_osds(PG, DIR):
- ALLOSDS = [f for f in os.listdir(DIR) if os.path.isdir(os.path.join(DIR, f)) and f.find("osd") == 0]
- OSDS = []
- for d in ALLOSDS:
- DIRL2 = os.path.join(DIR, d)
- SUBDIR = os.path.join(DIRL2, "current")
- PGDIR = PG + "_head"
- if not os.path.isdir(os.path.join(SUBDIR, PGDIR)):
- continue
- OSDS += [d]
- return sorted(OSDS)
-
-
-def get_lines(filename):
- tmpfd = open(filename, "r")
- line = True
- lines = []
- while line:
- line = tmpfd.readline().rstrip('\n')
- if line:
- lines += [line]
- tmpfd.close()
- os.unlink(filename)
- return lines
-
-
-def cat_file(level, filename):
- if level < logging.getLogger().getEffectiveLevel():
- return
- print("File: " + filename)
- with open(filename, "r") as f:
- while True:
- line = f.readline().rstrip('\n')
- if not line:
- break
- print(line)
- print("<EOF>")
-
-
-def vstart(new, opt=""):
- print("vstarting....", end="")
- NEW = new and "-n" or "-N"
- call("MON=1 OSD=4 MDS=0 MGR=1 CEPH_PORT=7400 {path}/src/vstart.sh --short -l {new} -d {opt} > /dev/null 2>&1".format(new=NEW, opt=opt, path=CEPH_ROOT), shell=True)
- print("DONE")
-
-
-def test_failure(cmd, errmsg, tty=False):
- if tty:
- try:
- ttyfd = open("/dev/tty", "rwb")
- except Exception as e:
- logging.info(str(e))
- logging.info("SKIP " + cmd)
- return 0
- TMPFILE = r"/tmp/tmp.{pid}".format(pid=os.getpid())
- tmpfd = open(TMPFILE, "wb")
-
- logging.debug(cmd)
- if tty:
- ret = call(cmd, shell=True, stdin=ttyfd, stdout=ttyfd, stderr=tmpfd)
- ttyfd.close()
- else:
- ret = call(cmd, shell=True, stderr=tmpfd)
- tmpfd.close()
- if ret == 0:
- logging.error(cmd)
- logging.error("Should have failed, but got exit 0")
- return 1
- lines = get_lines(TMPFILE)
- matched = [ l for l in lines if errmsg in l ]
- if any(matched):
- logging.info("Correctly failed with message \"" + matched[0] + "\"")
- return 0
- else:
- logging.error("Command: " + cmd )
- logging.error("Bad messages to stderr \"" + str(lines) + "\"")
- logging.error("Expected \"" + errmsg + "\"")
- return 1
-
-
-def get_nspace(num):
- if num == 0:
- return ""
- return "ns{num}".format(num=num)
-
-
-def verify(DATADIR, POOL, NAME_PREFIX, db):
- TMPFILE = r"/tmp/tmp.{pid}".format(pid=os.getpid())
- ERRORS = 0
- for rawnsfile in [f for f in os.listdir(DATADIR) if f.split('-')[1].find(NAME_PREFIX) == 0]:
- nsfile = rawnsfile.split("__")[0]
- clone = rawnsfile.split("__")[1]
- nspace = nsfile.split("-")[0]
- file = nsfile.split("-")[1]
- # Skip clones
- if clone != "head":
- continue
- path = os.path.join(DATADIR, rawnsfile)
- try:
- os.unlink(TMPFILE)
- except:
- pass
- cmd = "{path}/rados -p {pool} -N '{nspace}' get {file} {out}".format(pool=POOL, file=file, out=TMPFILE, nspace=nspace, path=CEPH_BIN)
- logging.debug(cmd)
- call(cmd, shell=True, stdout=DEVNULL, stderr=DEVNULL)
- cmd = "diff -q {src} {result}".format(src=path, result=TMPFILE)
- logging.debug(cmd)
- ret = call(cmd, shell=True)
- if ret != 0:
- logging.error("{file} data not imported properly".format(file=file))
- ERRORS += 1
- try:
- os.unlink(TMPFILE)
- except:
- pass
- for key, val in db[nspace][file]["xattr"].items():
- cmd = "{path}/rados -p {pool} -N '{nspace}' getxattr {name} {key}".format(pool=POOL, name=file, key=key, nspace=nspace, path=CEPH_BIN)
- logging.debug(cmd)
- getval = check_output(cmd, shell=True, stderr=DEVNULL)
- logging.debug("getxattr {key} {val}".format(key=key, val=getval))
- if getval != val:
- logging.error("getxattr of key {key} returned wrong val: {get} instead of {orig}".format(key=key, get=getval, orig=val))
- ERRORS += 1
- continue
- hdr = db[nspace][file].get("omapheader", "")
- cmd = "{path}/rados -p {pool} -N '{nspace}' getomapheader {name} {file}".format(pool=POOL, name=file, nspace=nspace, file=TMPFILE, path=CEPH_BIN)
- logging.debug(cmd)
- ret = call(cmd, shell=True, stderr=DEVNULL)
- if ret != 0:
- logging.error("rados getomapheader returned {ret}".format(ret=ret))
- ERRORS += 1
- else:
- getlines = get_lines(TMPFILE)
- assert(len(getlines) == 0 or len(getlines) == 1)
- if len(getlines) == 0:
- gethdr = ""
- else:
- gethdr = getlines[0]
- logging.debug("header: {hdr}".format(hdr=gethdr))
- if gethdr != hdr:
- logging.error("getomapheader returned wrong val: {get} instead of {orig}".format(get=gethdr, orig=hdr))
- ERRORS += 1
- for key, val in db[nspace][file]["omap"].items():
- cmd = "{path}/rados -p {pool} -N '{nspace}' getomapval {name} {key} {file}".format(pool=POOL, name=file, key=key, nspace=nspace, file=TMPFILE, path=CEPH_BIN)
- logging.debug(cmd)
- ret = call(cmd, shell=True, stderr=DEVNULL)
- if ret != 0:
- logging.error("getomapval returned {ret}".format(ret=ret))
- ERRORS += 1
- continue
- getlines = get_lines(TMPFILE)
- if len(getlines) != 1:
- logging.error("Bad data from getomapval {lines}".format(lines=getlines))
- ERRORS += 1
- continue
- getval = getlines[0]
- logging.debug("getomapval {key} {val}".format(key=key, val=getval))
- if getval != val:
- logging.error("getomapval returned wrong val: {get} instead of {orig}".format(get=getval, orig=val))
- ERRORS += 1
- try:
- os.unlink(TMPFILE)
- except:
- pass
- return ERRORS
-
-
-def check_journal(jsondict):
- errors = 0
- if 'header' not in jsondict:
- logging.error("Key 'header' not in dump-journal")
- errors += 1
- elif 'max_size' not in jsondict['header']:
- logging.error("Key 'max_size' not in dump-journal header")
- errors += 1
- else:
- print("\tJournal max_size = {size}".format(size=jsondict['header']['max_size']))
- if 'entries' not in jsondict:
- logging.error("Key 'entries' not in dump-journal output")
- errors += 1
- elif len(jsondict['entries']) == 0:
- logging.info("No entries in journal found")
- else:
- errors += check_journal_entries(jsondict['entries'])
- return errors
-
-
-def check_journal_entries(entries):
- errors = 0
- for enum in range(len(entries)):
- if 'offset' not in entries[enum]:
- logging.error("No 'offset' key in entry {e}".format(e=enum))
- errors += 1
- if 'seq' not in entries[enum]:
- logging.error("No 'seq' key in entry {e}".format(e=enum))
- errors += 1
- if 'transactions' not in entries[enum]:
- logging.error("No 'transactions' key in entry {e}".format(e=enum))
- errors += 1
- elif len(entries[enum]['transactions']) == 0:
- logging.error("No transactions found in entry {e}".format(e=enum))
- errors += 1
- else:
- errors += check_entry_transactions(entries[enum], enum)
- return errors
-
-
-def check_entry_transactions(entry, enum):
- errors = 0
- for tnum in range(len(entry['transactions'])):
- if 'trans_num' not in entry['transactions'][tnum]:
- logging.error("Key 'trans_num' missing from entry {e} trans {t}".format(e=enum, t=tnum))
- errors += 1
- elif entry['transactions'][tnum]['trans_num'] != tnum:
- ft = entry['transactions'][tnum]['trans_num']
- logging.error("Bad trans_num ({ft}) entry {e} trans {t}".format(ft=ft, e=enum, t=tnum))
- errors += 1
- if 'ops' not in entry['transactions'][tnum]:
- logging.error("Key 'ops' missing from entry {e} trans {t}".format(e=enum, t=tnum))
- errors += 1
- else:
- errors += check_transaction_ops(entry['transactions'][tnum]['ops'], enum, tnum)
- return errors
-
-
-def check_transaction_ops(ops, enum, tnum):
- if len(ops) == 0:
- logging.warning("No ops found in entry {e} trans {t}".format(e=enum, t=tnum))
- errors = 0
- for onum in range(len(ops)):
- if 'op_num' not in ops[onum]:
- logging.error("Key 'op_num' missing from entry {e} trans {t} op {o}".format(e=enum, t=tnum, o=onum))
- errors += 1
- elif ops[onum]['op_num'] != onum:
- fo = ops[onum]['op_num']
- logging.error("Bad op_num ({fo}) from entry {e} trans {t} op {o}".format(fo=fo, e=enum, t=tnum, o=onum))
- errors += 1
- if 'op_name' not in ops[onum]:
- logging.error("Key 'op_name' missing from entry {e} trans {t} op {o}".format(e=enum, t=tnum, o=onum))
- errors += 1
- return errors
-
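-
-# For reference, a minimal dump-journal document that satisfies the checks
-# above is shaped roughly like this (a sketch with illustrative values; the
-# checks only care about the keys and the trans_num/op_num index fields):
-EXAMPLE_JOURNAL = {
- 'header': {'max_size': 104857600},
- 'entries': [{'offset': 4096, 'seq': 1,
- 'transactions': [{'trans_num': 0,
- 'ops': [{'op_num': 0, 'op_name': 'write'}]}]}],
-}
-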
-
-def test_dump_journal(CFSD_PREFIX, osds):
- ERRORS = 0
- pid = os.getpid()
- TMPFILE = r"/tmp/tmp.{pid}".format(pid=pid)
-
- for osd in osds:
- # Test --op dump-journal by loading json
- cmd = (CFSD_PREFIX + "--op dump-journal --format json").format(osd=osd)
- logging.debug(cmd)
- tmpfd = open(TMPFILE, "wb")
- ret = call(cmd, shell=True, stdout=tmpfd)
- if ret != 0:
- logging.error("Bad exit status {ret} from {cmd}".format(ret=ret, cmd=cmd))
- ERRORS += 1
- continue
- tmpfd.close()
- tmpfd = open(TMPFILE, "r")
- jsondict = json.load(tmpfd)
- tmpfd.close()
- os.unlink(TMPFILE)
-
- journal_errors = check_journal(jsondict)
- if journal_errors != 0:
- logging.error(jsondict)
- ERRORS += journal_errors
-
- return ERRORS
-
-CEPH_BUILD_DIR = os.environ.get('CEPH_BUILD_DIR')
-CEPH_BIN = os.environ.get('CEPH_BIN')
-CEPH_ROOT = os.environ.get('CEPH_ROOT')
-
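-# If the CEPH_* variables are not exported (e.g. when the script is run by
-# hand from a build directory), fall back to the current directory layout.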
-if not CEPH_BUILD_DIR:
- CEPH_BUILD_DIR=os.getcwd()
- os.putenv('CEPH_BUILD_DIR', CEPH_BUILD_DIR)
- CEPH_BIN=CEPH_BUILD_DIR
- os.putenv('CEPH_BIN', CEPH_BIN)
- CEPH_ROOT=os.path.dirname(CEPH_BUILD_DIR)
- os.putenv('CEPH_ROOT', CEPH_ROOT)
- CEPH_LIB=os.path.join(CEPH_BIN, '.libs')
- os.putenv('CEPH_LIB', CEPH_LIB)
-
-CEPH_DIR = CEPH_BUILD_DIR + "/cot_dir"
-CEPH_CONF = os.path.join(CEPH_DIR, 'ceph.conf')
-
-def kill_daemons():
- call("{path}/init-ceph -c {conf} stop > /dev/null 2>&1".format(conf=CEPH_CONF, path=CEPH_BIN), shell=True)
-
-
-def check_data(DATADIR, TMPFILE, OSDDIR, SPLIT_NAME):
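- # Locate every on-disk replica of each head object under OSDDIR and diff
- # it against the reference copy in DATADIR; also return the replica count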
- repcount = 0
- ERRORS = 0
- for rawnsfile in [f for f in os.listdir(DATADIR) if f.split('-')[1].find(SPLIT_NAME) == 0]:
- nsfile = rawnsfile.split("__")[0]
- clone = rawnsfile.split("__")[1]
- nspace = nsfile.split("-")[0]
- file = nsfile.split("-")[1] + "__" + clone
- # Skip clones
- if clone != "head":
- continue
- path = os.path.join(DATADIR, rawnsfile)
- tmpfd = open(TMPFILE, "wb")
- cmd = "find {dir} -name '{file}_*_{nspace}_*'".format(dir=OSDDIR, file=file, nspace=nspace)
- logging.debug(cmd)
- ret = call(cmd, shell=True, stdout=tmpfd)
- if ret:
- logging.critical("INTERNAL ERROR")
- return 1
- tmpfd.close()
- obj_locs = get_lines(TMPFILE)
- if len(obj_locs) == 0:
- logging.error("Can't find imported object {name}".format(name=file))
- ERRORS += 1
- for obj_loc in obj_locs:
- # For btrfs skip snap_* dirs
- if re.search("/snap_[0-9]*/", obj_loc) is not None:
- continue
- repcount += 1
- cmd = "diff -q {src} {obj_loc}".format(src=path, obj_loc=obj_loc)
- logging.debug(cmd)
- ret = call(cmd, shell=True)
- if ret != 0:
- logging.error("{file} data not imported properly into {obj}".format(file=file, obj=obj_loc))
- ERRORS += 1
- return ERRORS, repcount
-
-
-def set_osd_weight(CFSD_PREFIX, osd_ids, osd_path, weight):
- # change the weight of the given osds to the given weight in the newest osdmap stored under osd_path
- osdmap_file = tempfile.NamedTemporaryFile(delete=True)
- cmd = (CFSD_PREFIX + "--op get-osdmap --file {osdmap_file}").format(osd=osd_path,
- osdmap_file=osdmap_file.name)
- output = check_output(cmd, shell=True)
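- # the tool reports the epoch of the exported map; parse the number after '#'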
- epoch = int(re.findall(r'#(\d+)', output)[0])
-
- new_crush_file = tempfile.NamedTemporaryFile(delete=True)
- old_crush_file = tempfile.NamedTemporaryFile(delete=True)
- ret = call("{path}/osdmaptool --export-crush {crush_file} {osdmap_file}".format(osdmap_file=osdmap_file.name,
- crush_file=old_crush_file.name, path=CEPH_BIN),
- stdout=DEVNULL,
- stderr=DEVNULL,
- shell=True)
- assert(ret == 0)
-
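- # reweight one osd at a time, swapping the two temp crush files so the
- # most recent map is always the input of the next crushtool run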
- for osd_id in osd_ids:
- cmd = "{path}/crushtool -i {crush_file} --reweight-item osd.{osd} {weight} -o {new_crush_file}".format(osd=osd_id,
- crush_file=old_crush_file.name,
- weight=weight,
- new_crush_file=new_crush_file.name, path=CEPH_BIN)
- ret = call(cmd, stdout=DEVNULL, shell=True)
- assert(ret == 0)
- old_crush_file, new_crush_file = new_crush_file, old_crush_file
-
- # swap them back, since we don't need to prepare for another round
- old_crush_file, new_crush_file = new_crush_file, old_crush_file
- old_crush_file.close()
-
- ret = call("{path}/osdmaptool --import-crush {crush_file} {osdmap_file}".format(osdmap_file=osdmap_file.name,
- crush_file=new_crush_file.name, path=CEPH_BIN),
- stdout=DEVNULL,
- stderr=DEVNULL,
- shell=True)
- assert(ret == 0)
-
- # Minimum test of --dry-run by using it, but not checking anything
- cmd = CFSD_PREFIX + "--op set-osdmap --file {osdmap_file} --epoch {epoch} --force --dry-run"
- cmd = cmd.format(osd=osd_path, osdmap_file=osdmap_file.name, epoch=epoch)
- ret = call(cmd, stdout=DEVNULL, shell=True)
- assert(ret == 0)
-
- # osdmaptool increases the epoch of the changed osdmap, so we need to
- # force the tool to use a different epoch than the one in the osdmap
- cmd = CFSD_PREFIX + "--op set-osdmap --file {osdmap_file} --epoch {epoch} --force"
- cmd = cmd.format(osd=osd_path, osdmap_file=osdmap_file.name, epoch=epoch)
- ret = call(cmd, stdout=DEVNULL, shell=True)
-
- return ret == 0
-
-def get_osd_weights(CFSD_PREFIX, osd_ids, osd_path):
- osdmap_file = tempfile.NamedTemporaryFile(delete=True)
- cmd = (CFSD_PREFIX + "--op get-osdmap --file {osdmap_file}").format(osd=osd_path,
- osdmap_file=osdmap_file.name)
- ret = call(cmd, stdout=DEVNULL, shell=True)
- if ret != 0:
- return None
- # we have to read the weights from the crush map; we could also query
- # them with osdmaptool, but keep in mind they are different things:
- # item weights in the crush map versus the weight associated with each
- # osd in the osdmap
- crush_file = tempfile.NamedTemporaryFile(delete=True)
- ret = call("{path}/osdmaptool --export-crush {crush_file} {osdmap_file}".format(osdmap_file=osdmap_file.name,
- crush_file=crush_file.name, path=CEPH_BIN),
- stdout=DEVNULL,
- shell=True)
- assert(ret == 0)
- output = check_output("{path}/crushtool --tree -i {crush_file} | tail -n {num_osd}".format(crush_file=crush_file.name,
- num_osd=len(osd_ids), path=CEPH_BIN),
- stderr=DEVNULL,
- shell=True)
- weights = []
- for line in output.strip().split('\n'):
- print(line)
- linev = re.split(r'\s+', line)
- if linev[0] == '':
- linev.pop(0)
- print('linev %s' % linev)
- weights.append(float(linev[1]))
-
- return weights
-
-
-def test_get_set_osdmap(CFSD_PREFIX, osd_ids, osd_paths):
- print("Testing get-osdmap and set-osdmap")
- errors = 0
- kill_daemons()
- weight = 1 / math.e # just some magic number in [0, 1]
- changed = []
- for osd_path in osd_paths:
- if set_osd_weight(CFSD_PREFIX, osd_ids, osd_path, weight):
- changed.append(osd_path)
- else:
- logging.warning("Failed to change the weights: {0}".format(osd_path))
- # it is an error if none of the stores gets changed
- if not changed:
- errors += 1
-
- for osd_path in changed:
- weights = get_osd_weights(CFSD_PREFIX, osd_ids, osd_path)
- if not weights:
- errors += 1
- continue
- if any(abs(w - weight) > 1e-5 for w in weights):
- logging.warning("Weight is not changed: {0} != {1}".format(weights, weight))
- errors += 1
- return errors
-
-def test_get_set_inc_osdmap(CFSD_PREFIX, osd_path):
- # incrementals are not used unless we need to build an MOSDMap to update
- # an OSD's peers, so an obvious way to test them is to simply overwrite an
- # epoch with a different copy and read it back to see if it matches.
- kill_daemons()
- file_e2 = tempfile.NamedTemporaryFile(delete=True)
- cmd = (CFSD_PREFIX + "--op get-inc-osdmap --file {file}").format(osd=osd_path,
- file=file_e2.name)
- output = check_output(cmd, shell=True)
- epoch = int(re.findall(r'#(\d+)', output)[0])
- # backup e1 incremental before overwriting it
- epoch -= 1
- file_e1_backup = tempfile.NamedTemporaryFile(delete=True)
- cmd = CFSD_PREFIX + "--op get-inc-osdmap --epoch {epoch} --file {file}"
- ret = call(cmd.format(osd=osd_path, epoch=epoch, file=file_e1_backup.name), shell=True)
- if ret: return 1
- # overwrite e1 with e2
- cmd = CFSD_PREFIX + "--op set-inc-osdmap --force --epoch {epoch} --file {file}"
- ret = call(cmd.format(osd=osd_path, epoch=epoch, file=file_e2.name), shell=True)
- if ret: return 1
- # Use --dry-run to set back to e1; the change should not actually be applied
- cmd = CFSD_PREFIX + "--op set-inc-osdmap --dry-run --epoch {epoch} --file {file}"
- ret = call(cmd.format(osd=osd_path, epoch=epoch, file=file_e1_backup.name), shell=True)
- if ret: return 1
- # read from e1
- file_e1_read = tempfile.NamedTemporaryFile(delete=True)
- cmd = CFSD_PREFIX + "--op get-inc-osdmap --epoch {epoch} --file {file}"
- ret = call(cmd.format(osd=osd_path, epoch=epoch, file=file_e1_read.name), shell=True)
- if ret: return 1
- errors = 0
- try:
- if not filecmp.cmp(file_e2.name, file_e1_read.name, shallow=False):
- logging.error("{{get,set}}-inc-osdmap mismatch {0} != {1}".format(file_e2.name, file_e1_read.name))
- errors += 1
- finally:
- # revert the change with file_e1_backup
- cmd = CFSD_PREFIX + "--op set-inc-osdmap --epoch {epoch} --file {file}"
- ret = call(cmd.format(osd=osd_path, epoch=epoch, file=file_e1_backup.name), shell=True)
- if ret:
- logging.error("Failed to revert the changed inc-osdmap")
- errors += 1
-
- return errors
-
-
-def test_removeall(CFSD_PREFIX, db, OBJREPPGS, REP_POOL, CEPH_BIN, OSDDIR, REP_NAME, NUM_CLONED_REP_OBJECTS):
- # Test removeall
- TMPFILE = r"/tmp/tmp.{pid}".format(pid=os.getpid())
- nullfd = open(os.devnull, "w")
- errors=0
- print("Test removeall")
- kill_daemons()
- for nspace in db.keys():
- for basename in db[nspace].keys():
- JSON = db[nspace][basename]['json']
- for pg in OBJREPPGS:
- OSDS = get_osds(pg, OSDDIR)
- for osd in OSDS:
- DIR = os.path.join(OSDDIR, os.path.join(osd, os.path.join("current", "{pg}_head".format(pg=pg))))
- fnames = [f for f in os.listdir(DIR) if os.path.isfile(os.path.join(DIR, f))
- and f.split("_")[0] == basename and f.split("_")[4] == nspace]
- if not fnames:
- continue
-
- if int(basename.split(REP_NAME)[1]) <= int(NUM_CLONED_REP_OBJECTS):
- cmd = (CFSD_PREFIX + "'{json}' remove").format(osd=osd, json=JSON)
- errors += test_failure(cmd, "Snapshots are present, use removeall to delete everything")
-
- cmd = (CFSD_PREFIX + " --force --dry-run '{json}' remove").format(osd=osd, json=JSON)
- logging.debug(cmd)
- ret = call(cmd, shell=True, stdout=nullfd, stderr=nullfd)
- if ret != 0:
- logging.error("remove with --force failed for {json}".format(json=JSON))
- errors += 1
-
- cmd = (CFSD_PREFIX + " --dry-run '{json}' removeall").format(osd=osd, json=JSON)
- logging.debug(cmd)
- ret = call(cmd, shell=True, stdout=nullfd, stderr=nullfd)
- if ret != 0:
- logging.error("removeall failed for {json}".format(json=JSON))
- errors += 1
-
- cmd = (CFSD_PREFIX + " '{json}' removeall").format(osd=osd, json=JSON)
- logging.debug(cmd)
- ret = call(cmd, shell=True, stdout=nullfd, stderr=nullfd)
- if ret != 0:
- logging.error("removeall failed for {json}".format(json=JSON))
- errors += 1
-
- tmpfd = open(TMPFILE, "w")
- cmd = (CFSD_PREFIX + "--op list --pgid {pg} --namespace {ns} {name}").format(osd=osd, pg=pg, ns=nspace, name=basename)
- logging.debug(cmd)
- ret = call(cmd, shell=True, stdout=tmpfd)
- if ret != 0:
- logging.error("Bad exit status {ret} from {cmd}".format(ret=ret, cmd=cmd))
- errors += 1
- tmpfd.close()
- lines = get_lines(TMPFILE)
- if len(lines) != 0:
- logging.error("Removeall didn't remove all objects {ns}/{name} : {lines}".format(ns=nspace, name=basename, lines=lines))
- errors += 1
- vstart(new=False)
- wait_for_health()
- cmd = "{path}/rados -p {pool} rmsnap snap1".format(pool=REP_POOL, path=CEPH_BIN)
- logging.debug(cmd)
- ret = call(cmd, shell=True, stdout=nullfd, stderr=nullfd)
- if ret != 0:
- logging.error("rados rmsnap failed")
- errors += 1
- time.sleep(2)
- wait_for_health()
- return errors
-
-
-def main(argv):
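- # Use a binary view of stdout (unbuffered on Python 2), presumably so raw
- # byte output lines up with child-process output on both Python versions.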
- if sys.version_info[0] < 3:
- sys.stdout = stdout = os.fdopen(sys.stdout.fileno(), 'wb', 0)
- else:
- stdout = sys.stdout.buffer
- if len(argv) > 1 and argv[1] == "debug":
- nullfd = stdout
- else:
- nullfd = DEVNULL
-
- call("rm -fr {dir}; mkdir {dir}".format(dir=CEPH_DIR), shell=True)
- os.environ["CEPH_DIR"] = CEPH_DIR
- OSDDIR = os.path.join(CEPH_DIR, "dev")
- REP_POOL = "rep_pool"
- REP_NAME = "REPobject"
- EC_POOL = "ec_pool"
- EC_NAME = "ECobject"
- if len(argv) > 0 and argv[0] == 'large':
- PG_COUNT = 12
- NUM_REP_OBJECTS = 800
- NUM_CLONED_REP_OBJECTS = 100
- NUM_EC_OBJECTS = 12
- NUM_NSPACES = 4
- # Larger data sets for first object per namespace
- DATALINECOUNT = 50000
- # Number of objects to do xattr/omap testing on
- ATTR_OBJS = 10
- else:
- PG_COUNT = 4
- NUM_REP_OBJECTS = 2
- NUM_CLONED_REP_OBJECTS = 2
- NUM_EC_OBJECTS = 2
- NUM_NSPACES = 2
- # Larger data sets for first object per namespace
- DATALINECOUNT = 10
- # Number of objects to do xattr/omap testing on
- ATTR_OBJS = 2
- ERRORS = 0
- pid = os.getpid()
- TESTDIR = "/tmp/test.{pid}".format(pid=pid)
- DATADIR = "/tmp/data.{pid}".format(pid=pid)
- CFSD_PREFIX = CEPH_BIN + "/ceph-objectstore-tool --data-path " + OSDDIR + "/{osd} "
- PROFNAME = "testecprofile"
-
- os.environ['CEPH_CONF'] = CEPH_CONF
- vstart(new=True)
- wait_for_health()
-
- cmd = "{path}/ceph osd pool create {pool} {pg} {pg} replicated".format(pool=REP_POOL, pg=PG_COUNT, path=CEPH_BIN)
- logging.debug(cmd)
- call(cmd, shell=True, stdout=nullfd, stderr=nullfd)
- REPID = get_pool_id(REP_POOL, nullfd)
-
- print("Created Replicated pool #{repid}".format(repid=REPID))
-
- cmd = "{path}/ceph osd erasure-code-profile set {prof} crush-failure-domain=osd".format(prof=PROFNAME, path=CEPH_BIN)
- logging.debug(cmd)
- call(cmd, shell=True, stdout=nullfd, stderr=nullfd)
- cmd = "{path}/ceph osd erasure-code-profile get {prof}".format(prof=PROFNAME, path=CEPH_BIN)
- logging.debug(cmd)
- call(cmd, shell=True, stdout=nullfd, stderr=nullfd)
- cmd = "{path}/ceph osd pool create {pool} {pg} {pg} erasure {prof}".format(pool=EC_POOL, prof=PROFNAME, pg=PG_COUNT, path=CEPH_BIN)
- logging.debug(cmd)
- call(cmd, shell=True, stdout=nullfd, stderr=nullfd)
- ECID = get_pool_id(EC_POOL, nullfd)
-
- print("Created Erasure coded pool #{ecid}".format(ecid=ECID))
-
- print("Creating {objs} objects in replicated pool".format(objs=(NUM_REP_OBJECTS*NUM_NSPACES)))
- cmd = "mkdir -p {datadir}".format(datadir=DATADIR)
- logging.debug(cmd)
- call(cmd, shell=True)
-
- db = {}
-
- objects = range(1, NUM_REP_OBJECTS + 1)
- nspaces = range(NUM_NSPACES)
- for n in nspaces:
- nspace = get_nspace(n)
-
- db[nspace] = {}
-
- for i in objects:
- NAME = REP_NAME + "{num}".format(num=i)
- LNAME = nspace + "-" + NAME
- DDNAME = os.path.join(DATADIR, LNAME)
- DDNAME += "__head"
-
- cmd = "rm -f " + DDNAME
- logging.debug(cmd)
- call(cmd, shell=True)
-
- if i == 1:
- dataline = range(DATALINECOUNT)
- else:
- dataline = range(1)
- fd = open(DDNAME, "w")
- data = "This is the replicated data for " + LNAME + "\n"
- for _ in dataline:
- fd.write(data)
- fd.close()
-
- cmd = "{path}/rados -p {pool} -N '{nspace}' put {name} {ddname}".format(pool=REP_POOL, name=NAME, ddname=DDNAME, nspace=nspace, path=CEPH_BIN)
- logging.debug(cmd)
- ret = call(cmd, shell=True, stderr=nullfd)
- if ret != 0:
- logging.critical("Rados put command failed with {ret}".format(ret=ret))
- return 1
-
- db[nspace][NAME] = {}
-
- if i < ATTR_OBJS + 1:
- keys = range(i)
- else:
- keys = range(0)
- db[nspace][NAME]["xattr"] = {}
- for k in keys:
- if k == 0:
- continue
- mykey = "key{i}-{k}".format(i=i, k=k)
- myval = "val{i}-{k}".format(i=i, k=k)
- cmd = "{path}/rados -p {pool} -N '{nspace}' setxattr {name} {key} {val}".format(pool=REP_POOL, name=NAME, key=mykey, val=myval, nspace=nspace, path=CEPH_BIN)
- logging.debug(cmd)
- ret = call(cmd, shell=True)
- if ret != 0:
- logging.error("setxattr failed with {ret}".format(ret=ret))
- ERRORS += 1
- db[nspace][NAME]["xattr"][mykey] = myval
-
- # Create omap header in all objects but REPobject1
- if i < ATTR_OBJS + 1 and i != 1:
- myhdr = "hdr{i}".format(i=i)
- cmd = "{path}/rados -p {pool} -N '{nspace}' setomapheader {name} {hdr}".format(pool=REP_POOL, name=NAME, hdr=myhdr, nspace=nspace, path=CEPH_BIN)
- logging.debug(cmd)
- ret = call(cmd, shell=True)
- if ret != 0:
- logging.critical("setomapheader failed with {ret}".format(ret=ret))
- ERRORS += 1
- db[nspace][NAME]["omapheader"] = myhdr
-
- db[nspace][NAME]["omap"] = {}
- for k in keys:
- if k == 0:
- continue
- mykey = "okey{i}-{k}".format(i=i, k=k)
- myval = "oval{i}-{k}".format(i=i, k=k)
- cmd = "{path}/rados -p {pool} -N '{nspace}' setomapval {name} {key} {val}".format(pool=REP_POOL, name=NAME, key=mykey, val=myval, nspace=nspace, path=CEPH_BIN)
- logging.debug(cmd)
- ret = call(cmd, shell=True)
- if ret != 0:
- logging.critical("setomapval failed with {ret}".format(ret=ret))
- db[nspace][NAME]["omap"][mykey] = myval
-
- # Create some clones
- cmd = "{path}/rados -p {pool} mksnap snap1".format(pool=REP_POOL, path=CEPH_BIN)
- logging.debug(cmd)
- call(cmd, shell=True)
-
- objects = range(1, NUM_CLONED_REP_OBJECTS + 1)
- nspaces = range(NUM_NSPACES)
- for n in nspaces:
- nspace = get_nspace(n)
-
- for i in objects:
- NAME = REP_NAME + "{num}".format(num=i)
- LNAME = nspace + "-" + NAME
- DDNAME = os.path.join(DATADIR, LNAME)
- # First clone
- CLONENAME = DDNAME + "__1"
- DDNAME += "__head"
-
- cmd = "mv -f " + DDNAME + " " + CLONENAME
- logging.debug(cmd)
- call(cmd, shell=True)
-
- if i == 1:
- dataline = range(DATALINECOUNT)
- else:
- dataline = range(1)
- fd = open(DDNAME, "w")
- data = "This is the replicated data after a snapshot for " + LNAME + "\n"
- for _ in dataline:
- fd.write(data)
- fd.close()
-
- cmd = "{path}/rados -p {pool} -N '{nspace}' put {name} {ddname}".format(pool=REP_POOL, name=NAME, ddname=DDNAME, nspace=nspace, path=CEPH_BIN)
- logging.debug(cmd)
- ret = call(cmd, shell=True, stderr=nullfd)
- if ret != 0:
- logging.critical("Rados put command failed with {ret}".format(ret=ret))
- return 1
-
- print("Creating {objs} objects in erasure coded pool".format(objs=(NUM_EC_OBJECTS*NUM_NSPACES)))
-
- objects = range(1, NUM_EC_OBJECTS + 1)
- nspaces = range(NUM_NSPACES)
- for n in nspaces:
- nspace = get_nspace(n)
-
- for i in objects:
- NAME = EC_NAME + "{num}".format(num=i)
- LNAME = nspace + "-" + NAME
- DDNAME = os.path.join(DATADIR, LNAME)
- DDNAME += "__head"
-
- cmd = "rm -f " + DDNAME
- logging.debug(cmd)
- call(cmd, shell=True)
-
- if i == 1:
- dataline = range(DATALINECOUNT)
- else:
- dataline = range(1)
- fd = open(DDNAME, "w")
- data = "This is the erasure coded data for " + LNAME + "\n"
- for _ in dataline:
- fd.write(data)
- fd.close()
-
- cmd = "{path}/rados -p {pool} -N '{nspace}' put {name} {ddname}".format(pool=EC_POOL, name=NAME, ddname=DDNAME, nspace=nspace, path=CEPH_BIN)
- logging.debug(cmd)
- ret = call(cmd, shell=True, stderr=nullfd)
- if ret != 0:
- logging.critical("Erasure coded pool creation failed with {ret}".format(ret=ret))
- return 1
-
- db[nspace][NAME] = {}
-
- db[nspace][NAME]["xattr"] = {}
- if i < ATTR_OBJS + 1:
- keys = range(i)
- else:
- keys = range(0)
- for k in keys:
- if k == 0:
- continue
- mykey = "key{i}-{k}".format(i=i, k=k)
- myval = "val{i}-{k}".format(i=i, k=k)
- cmd = "{path}/rados -p {pool} -N '{nspace}' setxattr {name} {key} {val}".format(pool=EC_POOL, name=NAME, key=mykey, val=myval, nspace=nspace, path=CEPH_BIN)
- logging.debug(cmd)
- ret = call(cmd, shell=True)
- if ret != 0:
- logging.error("setxattr failed with {ret}".format(ret=ret))
- ERRORS += 1
- db[nspace][NAME]["xattr"][mykey] = myval
-
- # Omap isn't supported in EC pools
- db[nspace][NAME]["omap"] = {}
-
- logging.debug(db)
-
- kill_daemons()
-
- if ERRORS:
- logging.critical("Unable to set up test")
- return 1
-
- ALLREPPGS = get_pgs(OSDDIR, REPID)
- logging.debug(ALLREPPGS)
- ALLECPGS = get_pgs(OSDDIR, ECID)
- logging.debug(ALLECPGS)
-
- OBJREPPGS = get_objs(ALLREPPGS, REP_NAME, OSDDIR, REPID)
- logging.debug(OBJREPPGS)
- OBJECPGS = get_objs(ALLECPGS, EC_NAME, OSDDIR, ECID)
- logging.debug(OBJECPGS)
-
- ONEPG = ALLREPPGS[0]
- logging.debug(ONEPG)
- osds = get_osds(ONEPG, OSDDIR)
- ONEOSD = osds[0]
- logging.debug(ONEOSD)
-
- print("Test invalid parameters")
- # On export can't use stdout to a terminal
- cmd = (CFSD_PREFIX + "--op export --pgid {pg}").format(osd=ONEOSD, pg=ONEPG)
- ERRORS += test_failure(cmd, "stdout is a tty and no --file filename specified", tty=True)
-
- # On export can't use stdout to a terminal
- cmd = (CFSD_PREFIX + "--op export --pgid {pg} --file -").format(osd=ONEOSD, pg=ONEPG)
- ERRORS += test_failure(cmd, "stdout is a tty and no --file filename specified", tty=True)
-
- # Prep a valid ec export file for import failure tests
- ONEECPG = ALLECPGS[0]
- osds = get_osds(ONEECPG, OSDDIR)
- ONEECOSD = osds[0]
- OTHERFILE = "/tmp/foo.{pid}".format(pid=pid)
- cmd = (CFSD_PREFIX + "--op export --pgid {pg} --file {file}").format(osd=ONEECOSD, pg=ONEECPG, file=OTHERFILE)
- logging.debug(cmd)
- call(cmd, shell=True, stdout=nullfd, stderr=nullfd)
-
- # On import can't specify a different shard
- BADPG = ONEECPG.split('s')[0] + "s10"
- cmd = (CFSD_PREFIX + "--op import --pgid {pg} --file {file}").format(osd=ONEECOSD, pg=BADPG, file=OTHERFILE)
- ERRORS += test_failure(cmd, "Can't specify a different shard, must be")
-
- os.unlink(OTHERFILE)
-
- # Prep a valid export file for import failure tests
- OTHERFILE = "/tmp/foo.{pid}".format(pid=pid)
- cmd = (CFSD_PREFIX + "--op export --pgid {pg} --file {file}").format(osd=ONEOSD, pg=ONEPG, file=OTHERFILE)
- logging.debug(cmd)
- call(cmd, shell=True, stdout=nullfd, stderr=nullfd)
-
- # On import can't specify a PG with a non-existent pool
- cmd = (CFSD_PREFIX + "--op import --pgid {pg} --file {file}").format(osd=ONEOSD, pg="10.0", file=OTHERFILE)
- ERRORS += test_failure(cmd, "Can't specify a different pgid pool, must be")
-
- # On import can't specify shard for a replicated export
- cmd = (CFSD_PREFIX + "--op import --pgid {pg}s0 --file {file}").format(osd=ONEOSD, pg=ONEPG, file=OTHERFILE)
- ERRORS += test_failure(cmd, "Can't specify a sharded pgid with a non-sharded export")
-
- # On import can't specify a PG with a bad seed
- TMPPG="{pool}.80".format(pool=REPID)
- cmd = (CFSD_PREFIX + "--op import --pgid {pg} --file {file}").format(osd=ONEOSD, pg=TMPPG, file=OTHERFILE)
- ERRORS += test_failure(cmd, "Illegal pgid, the seed is larger than current pg_num")
-
- os.unlink(OTHERFILE)
- cmd = (CFSD_PREFIX + "--op import --file {FOO}").format(osd=ONEOSD, FOO=OTHERFILE)
- ERRORS += test_failure(cmd, "file: {FOO}: No such file or directory".format(FOO=OTHERFILE))
-
- cmd = "{path}/ceph-objectstore-tool --data-path BAD_DATA_PATH --op list".format(osd=ONEOSD, path=CEPH_BIN)
- ERRORS += test_failure(cmd, "data-path: BAD_DATA_PATH: No such file or directory")
-
- cmd = "{path}/ceph-objectstore-tool --journal-path BAD_JOURNAL_PATH --op dump-journal".format(path=CEPH_BIN)
- ERRORS += test_failure(cmd, "journal-path: BAD_JOURNAL_PATH: (2) No such file or directory")
-
- # On import can't use stdin from a terminal
- cmd = (CFSD_PREFIX + "--op import --pgid {pg}").format(osd=ONEOSD, pg=ONEPG)
- ERRORS += test_failure(cmd, "stdin is a tty and no --file filename specified", tty=True)
-
- # On import can't use stdin from a terminal
- cmd = (CFSD_PREFIX + "--op import --pgid {pg} --file -").format(osd=ONEOSD, pg=ONEPG)
- ERRORS += test_failure(cmd, "stdin is a tty and no --file filename specified", tty=True)
-
- # Specify a bad --type
- os.mkdir(OSDDIR + "/fakeosd")
- cmd = ("{path}/ceph-objectstore-tool --data-path " + OSDDIR + "/{osd} --type foobar --op list --pgid {pg}").format(osd="fakeosd", pg=ONEPG, path=CEPH_BIN)
- ERRORS += test_failure(cmd, "Unable to create store of type foobar")
-
- # Don't specify a data-path
- cmd = "{path}/ceph-objectstore-tool --type memstore --op list --pgid {pg}".format(dir=OSDDIR, osd=ONEOSD, pg=ONEPG, path=CEPH_BIN)
- ERRORS += test_failure(cmd, "Must provide --data-path")
-
- cmd = (CFSD_PREFIX + "--op remove").format(osd=ONEOSD)
- ERRORS += test_failure(cmd, "Must provide pgid")
-
- # Don't specify a --op or an object command
- cmd = CFSD_PREFIX.format(osd=ONEOSD)
- ERRORS += test_failure(cmd, "Must provide --op or object command...")
-
- # Specify a bad --op command
- cmd = (CFSD_PREFIX + "--op oops").format(osd=ONEOSD)
- ERRORS += test_failure(cmd, "Must provide --op (info, log, remove, mkfs, fsck, export, import, list, fix-lost, list-pgs, rm-past-intervals, dump-journal, dump-super, meta-list, get-osdmap, set-osdmap, get-inc-osdmap, set-inc-osdmap, mark-complete)")
-
- # Provide just the object param not a command
- cmd = (CFSD_PREFIX + "object").format(osd=ONEOSD)
- ERRORS += test_failure(cmd, "Invalid syntax, missing command")
-
- # Provide an object name that doesn't exist
- cmd = (CFSD_PREFIX + "NON_OBJECT get-bytes").format(osd=ONEOSD)
- ERRORS += test_failure(cmd, "No object id 'NON_OBJECT' found")
-
- # Provide an invalid object command
- cmd = (CFSD_PREFIX + "--pgid {pg} '' notacommand").format(osd=ONEOSD, pg=ONEPG)
- ERRORS += test_failure(cmd, "Unknown object command 'notacommand'")
-
- cmd = (CFSD_PREFIX + "foo list-omap").format(osd=ONEOSD, pg=ONEPG)
- ERRORS += test_failure(cmd, "No object id 'foo' found or invalid JSON specified")
-
- cmd = (CFSD_PREFIX + "'{{\"oid\":\"obj4\",\"key\":\"\",\"snapid\":-1,\"hash\":2826278768,\"max\":0,\"pool\":1,\"namespace\":\"\"}}' list-omap").format(osd=ONEOSD, pg=ONEPG)
- ERRORS += test_failure(cmd, "Without --pgid the object '{\"oid\":\"obj4\",\"key\":\"\",\"snapid\":-1,\"hash\":2826278768,\"max\":0,\"pool\":1,\"namespace\":\"\"}' must be a JSON array")
-
- cmd = (CFSD_PREFIX + "'[]' list-omap").format(osd=ONEOSD, pg=ONEPG)
- ERRORS += test_failure(cmd, "Object '[]' must be a JSON array with 2 elements")
-
- cmd = (CFSD_PREFIX + "'[\"1.0\"]' list-omap").format(osd=ONEOSD, pg=ONEPG)
- ERRORS += test_failure(cmd, "Object '[\"1.0\"]' must be a JSON array with 2 elements")
-
- cmd = (CFSD_PREFIX + "'[\"1.0\", 5, 8, 9]' list-omap").format(osd=ONEOSD, pg=ONEPG)
- ERRORS += test_failure(cmd, "Object '[\"1.0\", 5, 8, 9]' must be a JSON array with 2 elements")
-
- cmd = (CFSD_PREFIX + "'[1, 2]' list-omap").format(osd=ONEOSD, pg=ONEPG)
- ERRORS += test_failure(cmd, "Object '[1, 2]' must be a JSON array with the first element a string")
-
- cmd = (CFSD_PREFIX + "'[\"1.3\",{{\"snapid\":\"not an int\"}}]' list-omap").format(osd=ONEOSD, pg=ONEPG)
- ERRORS += test_failure(cmd, "Decode object JSON error: value type is 2 not 4")
-
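- # To summarize the object-specifier grammar exercised above: with --pgid an
- # object may be given as a bare JSON attribute dict; without --pgid it must
- # be a two-element JSON array of [pgid, attribute-dict], for example
- # (illustrative values only):
- # '["1.0", {"oid":"obj4","key":"","snapid":-1,"hash":2826278768,"max":0,"pool":1,"namespace":""}]'
-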
- TMPFILE = r"/tmp/tmp.{pid}".format(pid=pid)
- ALLPGS = OBJREPPGS + OBJECPGS
- OSDS = get_osds(ALLPGS[0], OSDDIR)
- osd = OSDS[0]
-
- print("Test all --op dump-journal")
- ALLOSDS = [f for f in os.listdir(OSDDIR) if os.path.isdir(os.path.join(OSDDIR, f)) and f.find("osd") == 0]
- ERRORS += test_dump_journal(CFSD_PREFIX, ALLOSDS)
-
- # Test --op list and generate json for all objects
- print("Test --op list variants")
-
- # retrieve all objects from all PGs
- tmpfd = open(TMPFILE, "wb")
- cmd = (CFSD_PREFIX + "--op list --format json").format(osd=osd)
- logging.debug(cmd)
- ret = call(cmd, shell=True, stdout=tmpfd)
- if ret != 0:
- logging.error("Bad exit status {ret} from {cmd}".format(ret=ret, cmd=cmd))
- ERRORS += 1
- tmpfd.close()
- lines = get_lines(TMPFILE)
- JSONOBJ = sorted(set(lines))
- (pgid, coll, jsondict) = json.loads(JSONOBJ[0])[0]
-
- # retrieve all objects in a given PG
- tmpfd = open(OTHERFILE, "ab")
- cmd = (CFSD_PREFIX + "--op list --pgid {pg} --format json").format(osd=osd, pg=pgid)
- logging.debug(cmd)
- ret = call(cmd, shell=True, stdout=tmpfd)
- if ret != 0:
- logging.error("Bad exit status {ret} from {cmd}".format(ret=ret, cmd=cmd))
- ERRORS += 1
- tmpfd.close()
- lines = get_lines(OTHERFILE)
- JSONOBJ = sorted(set(lines))
- (other_pgid, other_coll, other_jsondict) = json.loads(JSONOBJ[0])[0]
-
- if pgid != other_pgid or jsondict != other_jsondict or coll != other_coll:
- logging.error("the first line of --op list is different "
- "from the first line of --op list --pgid {pg}".format(pg=pgid))
- ERRORS += 1
-
- # retrieve all objects with a given name in a given PG
- tmpfd = open(OTHERFILE, "wb")
- cmd = (CFSD_PREFIX + "--op list --pgid {pg} {object} --format json").format(osd=osd, pg=pgid, object=jsondict['oid'])
- logging.debug(cmd)
- ret = call(cmd, shell=True, stdout=tmpfd)
- if ret != 0:
- logging.error("Bad exit status {ret} from {cmd}".format(ret=ret, cmd=cmd))
- ERRORS += 1
- tmpfd.close()
- lines = get_lines(OTHERFILE)
- JSONOBJ = sorted(set(lines))
- (other_pgid, other_coll, other_jsondict) = json.loads(JSONOBJ[0])[0]
-
- if pgid != other_pgid or jsondict != other_jsondict or coll != other_coll:
- logging.error("the first line of --op list is different "
- "from the first line of --op list --pgid {pg} {object}".format(pg=pgid, object=jsondict['oid']))
- ERRORS += 1
-
- print("Test --op list by generating json for all objects using default format")
- for pg in ALLPGS:
- OSDS = get_osds(pg, OSDDIR)
- for osd in OSDS:
- tmpfd = open(TMPFILE, "ab")
- cmd = (CFSD_PREFIX + "--op list --pgid {pg}").format(osd=osd, pg=pg)
- logging.debug(cmd)
- ret = call(cmd, shell=True, stdout=tmpfd)
- if ret != 0:
- logging.error("Bad exit status {ret} from --op list request".format(ret=ret))
- ERRORS += 1
-
- tmpfd.close()
- lines = get_lines(TMPFILE)
- JSONOBJ = sorted(set(lines))
- for JSON in JSONOBJ:
- (pgid, jsondict) = json.loads(JSON)
- # Skip clones for now
- if jsondict['snapid'] != -2:
- continue
- db[jsondict['namespace']][jsondict['oid']]['json'] = json.dumps((pgid, jsondict))
- # print db[jsondict['namespace']][jsondict['oid']]['json']
- if jsondict['oid'].find(EC_NAME) == 0 and 'shard_id' not in jsondict:
- logging.error("Malformed JSON {json}".format(json=JSON))
- ERRORS += 1
-
- # Test get-bytes
- print("Test get-bytes and set-bytes")
- for nspace in db.keys():
- for basename in db[nspace].keys():
- file = os.path.join(DATADIR, nspace + "-" + basename + "__head")
- JSON = db[nspace][basename]['json']
- GETNAME = "/tmp/getbytes.{pid}".format(pid=pid)
- TESTNAME = "/tmp/testbytes.{pid}".format(pid=pid)
- SETNAME = "/tmp/setbytes.{pid}".format(pid=pid)
- BADNAME = "/tmp/badbytes.{pid}".format(pid=pid)
- for pg in OBJREPPGS:
- OSDS = get_osds(pg, OSDDIR)
- for osd in OSDS:
- DIR = os.path.join(OSDDIR, os.path.join(osd, os.path.join("current", "{pg}_head".format(pg=pg))))
- fnames = [f for f in os.listdir(DIR) if os.path.isfile(os.path.join(DIR, f))
- and f.split("_")[0] == basename and f.split("_")[4] == nspace]
- if not fnames:
- continue
- try:
- os.unlink(GETNAME)
- except:
- pass
- cmd = (CFSD_PREFIX + " --pgid {pg} '{json}' get-bytes {fname}").format(osd=osd, pg=pg, json=JSON, fname=GETNAME)
- logging.debug(cmd)
- ret = call(cmd, shell=True)
- if ret != 0:
- logging.error("Bad exit status {ret}".format(ret=ret))
- ERRORS += 1
- continue
- cmd = "diff -q {file} {getfile}".format(file=file, getfile=GETNAME)
- ret = call(cmd, shell=True)
- if ret != 0:
- logging.error("Data from get-bytes differ")
- logging.debug("Got:")
- cat_file(logging.DEBUG, GETNAME)
- logging.debug("Expected:")
- cat_file(logging.DEBUG, file)
- ERRORS += 1
- fd = open(SETNAME, "w")
- data = "put-bytes going into {file}\n".format(file=file)
- fd.write(data)
- fd.close()
- cmd = (CFSD_PREFIX + "--pgid {pg} '{json}' set-bytes {sname}").format(osd=osd, pg=pg, json=JSON, sname=SETNAME)
- logging.debug(cmd)
- ret = call(cmd, shell=True)
- if ret != 0:
- logging.error("Bad exit status {ret} from set-bytes".format(ret=ret))
- ERRORS += 1
- fd = open(TESTNAME, "wb")
- cmd = (CFSD_PREFIX + "--pgid {pg} '{json}' get-bytes -").format(osd=osd, pg=pg, json=JSON)
- logging.debug(cmd)
- ret = call(cmd, shell=True, stdout=fd)
- fd.close()
- if ret != 0:
- logging.error("Bad exit status {ret} from get-bytes".format(ret=ret))
- ERRORS += 1
- cmd = "diff -q {setfile} {testfile}".format(setfile=SETNAME, testfile=TESTNAME)
- logging.debug(cmd)
- ret = call(cmd, shell=True)
- if ret != 0:
- logging.error("Data after set-bytes differ")
- logging.debug("Got:")
- cat_file(logging.DEBUG, TESTNAME)
- logging.debug("Expected:")
- cat_file(logging.DEBUG, SETNAME)
- ERRORS += 1
-
- # Use set-bytes with --dry-run and make sure contents haven't changed
- fd = open(BADNAME, "w")
- data = "Bad data for --dry-run in {file}\n".format(file=file)
- fd.write(data)
- fd.close()
- cmd = (CFSD_PREFIX + "--dry-run --pgid {pg} '{json}' set-bytes {sname}").format(osd=osd, pg=pg, json=JSON, sname=BADNAME)
- logging.debug(cmd)
- ret = call(cmd, shell=True, stdout=nullfd, stderr=nullfd)
- if ret != 0:
- logging.error("Bad exit status {ret} from set-bytes --dry-run".format(ret=ret))
- ERRORS += 1
- fd = open(TESTNAME, "wb")
- cmd = (CFSD_PREFIX + "--pgid {pg} '{json}' get-bytes -").format(osd=osd, pg=pg, json=JSON)
- logging.debug(cmd)
- ret = call(cmd, shell=True, stdout=fd)
- fd.close()
- if ret != 0:
- logging.error("Bad exit status {ret} from get-bytes".format(ret=ret))
- ERRORS += 1
- cmd = "diff -q {setfile} {testfile}".format(setfile=SETNAME, testfile=TESTNAME)
- logging.debug(cmd)
- ret = call(cmd, shell=True)
- if ret != 0:
- logging.error("Data after set-bytes --dry-run changed!")
- logging.debug("Got:")
- cat_file(logging.DEBUG, TESTNAME)
- logging.debug("Expected:")
- cat_file(logging.DEBUG, SETNAME)
- ERRORS += 1
-
- fd = open(file, "rb")
- cmd = (CFSD_PREFIX + "--pgid {pg} '{json}' set-bytes").format(osd=osd, pg=pg, json=JSON)
- logging.debug(cmd)
- ret = call(cmd, shell=True, stdin=fd)
- if ret != 0:
- logging.error("Bad exit status {ret} from set-bytes to restore object".format(ret=ret))
- ERRORS += 1
- fd.close()
-
- try:
- os.unlink(GETNAME)
- except:
- pass
- try:
- os.unlink(TESTNAME)
- except:
- pass
- try:
- os.unlink(SETNAME)
- except:
- pass
- try:
- os.unlink(BADNAME)
- except:
- pass
-
- # Test get-attr, set-attr, rm-attr, get-omaphdr, set-omaphdr, get-omap, set-omap, rm-omap
- print("Test get-attr, set-attr, rm-attr, get-omaphdr, set-omaphdr, get-omap, set-omap, rm-omap")
- for nspace in db.keys():
- for basename in db[nspace].keys():
- file = os.path.join(DATADIR, nspace + "-" + basename + "__head")
- JSON = db[nspace][basename]['json']
- for pg in OBJREPPGS:
- OSDS = get_osds(pg, OSDDIR)
- for osd in OSDS:
- DIR = os.path.join(OSDDIR, os.path.join(osd, os.path.join("current", "{pg}_head".format(pg=pg))))
- fnames = [f for f in os.listdir(DIR) if os.path.isfile(os.path.join(DIR, f))
- and f.split("_")[0] == basename and f.split("_")[4] == nspace]
- if not fnames:
- continue
- for key, val in db[nspace][basename]["xattr"].items():
- attrkey = "_" + key
- cmd = (CFSD_PREFIX + " '{json}' get-attr {key}").format(osd=osd, json=JSON, key=attrkey)
- logging.debug(cmd)
- getval = check_output(cmd, shell=True)
- if getval != val:
- logging.error("get-attr of key {key} returned wrong val: {get} instead of {orig}".format(key=attrkey, get=getval, orig=val))
- ERRORS += 1
- continue
- # set-attr to bogus value "foobar"
- cmd = ("echo -n foobar | " + CFSD_PREFIX + " --pgid {pg} '{json}' set-attr {key}").format(osd=osd, pg=pg, json=JSON, key=attrkey)
- logging.debug(cmd)
- ret = call(cmd, shell=True)
- if ret != 0:
- logging.error("Bad exit status {ret} from set-attr".format(ret=ret))
- ERRORS += 1
- continue
- # Test set-attr with dry-run
- cmd = ("echo -n dryrunbroken | " + CFSD_PREFIX + "--dry-run '{json}' set-attr {key}").format(osd=osd, pg=pg, json=JSON, key=attrkey)
- logging.debug(cmd)
- ret = call(cmd, shell=True, stdout=nullfd)
- if ret != 0:
- logging.error("Bad exit status {ret} from set-attr".format(ret=ret))
- ERRORS += 1
- continue
- # Check the set-attr
- cmd = (CFSD_PREFIX + " --pgid {pg} '{json}' get-attr {key}").format(osd=osd, pg=pg, json=JSON, key=attrkey)
- logging.debug(cmd)
- try:
- getval = check_output(cmd, shell=True)
- except subprocess.CalledProcessError as e:
- logging.error("Bad exit status {ret} from get-attr".format(ret=e.returncode))
- ERRORS += 1
- continue
- if getval != "foobar":
- logging.error("Check of set-attr failed because we got {val}".format(val=getval))
- ERRORS += 1
- continue
- # Test rm-attr
- cmd = (CFSD_PREFIX + "'{json}' rm-attr {key}").format(osd=osd, pg=pg, json=JSON, key=attrkey)
- logging.debug(cmd)
- ret = call(cmd, shell=True)
- if ret != 0:
- logging.error("Bad exit status {ret} from rm-attr".format(ret=ret))
- ERRORS += 1
- continue
- # Check rm-attr with dry-run
- cmd = (CFSD_PREFIX + "--dry-run '{json}' rm-attr {key}").format(osd=osd, pg=pg, json=JSON, key=attrkey)
- logging.debug(cmd)
- ret = call(cmd, shell=True, stdout=nullfd)
- if ret != 0:
- logging.error("Bad exit status {ret} from rm-attr".format(ret=ret))
- ERRORS += 1
- continue
- cmd = (CFSD_PREFIX + "'{json}' get-attr {key}").format(osd=osd, pg=pg, json=JSON, key=attrkey)
- logging.debug(cmd)
- ret = call(cmd, shell=True, stderr=nullfd, stdout=nullfd)
- if ret == 0:
- logging.error("For rm-attr expect get-attr to fail, but it succeeded")
- ERRORS += 1
- # Put back value
- cmd = ("echo -n {val} | " + CFSD_PREFIX + " --pgid {pg} '{json}' set-attr {key}").format(osd=osd, pg=pg, json=JSON, key=attrkey, val=val)
- logging.debug(cmd)
- ret = call(cmd, shell=True)
- if ret != 0:
- logging.error("Bad exit status {ret} from set-attr".format(ret=ret))
- ERRORS += 1
- continue
-
- hdr = db[nspace][basename].get("omapheader", "")
- cmd = (CFSD_PREFIX + "'{json}' get-omaphdr").format(osd=osd, json=JSON)
- logging.debug(cmd)
- gethdr = check_output(cmd, shell=True)
- if gethdr != hdr:
- logging.error("get-omaphdr was wrong: {get} instead of {orig}".format(get=gethdr, orig=hdr))
- ERRORS += 1
- continue
- # set-omaphdr to bogus value "foobar"
- cmd = ("echo -n foobar | " + CFSD_PREFIX + "'{json}' set-omaphdr").format(osd=osd, pg=pg, json=JSON)
- logging.debug(cmd)
- ret = call(cmd, shell=True)
- if ret != 0:
- logging.error("Bad exit status {ret} from set-omaphdr".format(ret=ret))
- ERRORS += 1
- continue
- # Check the set-omaphdr
- cmd = (CFSD_PREFIX + "'{json}' get-omaphdr").format(osd=osd, pg=pg, json=JSON)
- logging.debug(cmd)
- try:
- gethdr = check_output(cmd, shell=True)
- except subprocess.CalledProcessError as e:
- logging.error("Bad exit status {ret} from get-omaphdr".format(ret=e.returncode))
- ERRORS += 1
- continue
- if gethdr != "foobar":
- logging.error("Check of set-omaphdr failed because we got {val}".format(val=getval))
- ERRORS += 1
- continue
- # Test dry-run with set-omaphdr
- cmd = ("echo -n dryrunbroken | " + CFSD_PREFIX + "--dry-run '{json}' set-omaphdr").format(osd=osd, pg=pg, json=JSON)
- logging.debug(cmd)
- ret = call(cmd, shell=True, stdout=nullfd)
- if ret != 0:
- logging.error("Bad exit status {ret} from set-omaphdr".format(ret=ret))
- ERRORS += 1
- continue
- # Put back value
- cmd = ("echo -n {val} | " + CFSD_PREFIX + "'{json}' set-omaphdr").format(osd=osd, pg=pg, json=JSON, val=hdr)
- logging.debug(cmd)
- ret = call(cmd, shell=True)
- if ret != 0:
- logging.error("Bad exit status {ret} from set-omaphdr".format(ret=ret))
- ERRORS += 1
- continue
-
- for omapkey, val in db[nspace][basename]["omap"].items():
- cmd = (CFSD_PREFIX + " '{json}' get-omap {key}").format(osd=osd, json=JSON, key=omapkey)
- logging.debug(cmd)
- getval = check_output(cmd, shell=True)
- if getval != val:
- logging.error("get-omap of key {key} returned wrong val: {get} instead of {orig}".format(key=omapkey, get=getval, orig=val))
- ERRORS += 1
- continue
- # set-omap to bogus value "foobar"
- cmd = ("echo -n foobar | " + CFSD_PREFIX + " --pgid {pg} '{json}' set-omap {key}").format(osd=osd, pg=pg, json=JSON, key=omapkey)
- logging.debug(cmd)
- ret = call(cmd, shell=True)
- if ret != 0:
- logging.error("Bad exit status {ret} from set-omap".format(ret=ret))
- ERRORS += 1
- continue
- # Check set-omap with dry-run
- cmd = ("echo -n dryrunbroken | " + CFSD_PREFIX + "--dry-run --pgid {pg} '{json}' set-omap {key}").format(osd=osd, pg=pg, json=JSON, key=omapkey)
- logging.debug(cmd)
- ret = call(cmd, shell=True, stdout=nullfd)
- if ret != 0:
- logging.error("Bad exit status {ret} from set-omap".format(ret=ret))
- ERRORS += 1
- continue
- # Check the set-omap
- cmd = (CFSD_PREFIX + " --pgid {pg} '{json}' get-omap {key}").format(osd=osd, pg=pg, json=JSON, key=omapkey)
- logging.debug(cmd)
- try:
- getval = check_output(cmd, shell=True)
- except subprocess.CalledProcessError as e:
- logging.error("Bad exit status {ret} from get-omap".format(ret=e.returncode))
- ERRORS += 1
- continue
- if getval != "foobar":
- logging.error("Check of set-omap failed because we got {val}".format(val=getval))
- ERRORS += 1
- continue
- # Test rm-omap
- cmd = (CFSD_PREFIX + "'{json}' rm-omap {key}").format(osd=osd, pg=pg, json=JSON, key=omapkey)
- logging.debug(cmd)
- ret = call(cmd, shell=True)
- if ret != 0:
- logging.error("Bad exit status {ret} from rm-omap".format(ret=ret))
- ERRORS += 1
- # Check rm-omap with dry-run
- cmd = (CFSD_PREFIX + "--dry-run '{json}' rm-omap {key}").format(osd=osd, pg=pg, json=JSON, key=omapkey)
- logging.debug(cmd)
- ret = call(cmd, shell=True, stdout=nullfd)
- if ret != 0:
- logging.error("Bad exit status {ret} from rm-omap".format(ret=ret))
- ERRORS += 1
- cmd = (CFSD_PREFIX + "'{json}' get-omap {key}").format(osd=osd, pg=pg, json=JSON, key=omapkey)
- logging.debug(cmd)
- ret = call(cmd, shell=True, stderr=nullfd, stdout=nullfd)
- if ret == 0:
- logging.error("For rm-omap expect get-omap to fail, but it succeeded")
- ERRORS += 1
- # Put back value
- cmd = ("echo -n {val} | " + CFSD_PREFIX + " --pgid {pg} '{json}' set-omap {key}").format(osd=osd, pg=pg, json=JSON, key=omapkey, val=val)
- logging.debug(cmd)
- ret = call(cmd, shell=True)
- if ret != 0:
- logging.error("Bad exit status {ret} from set-omap".format(ret=ret))
- ERRORS += 1
- continue
-
- # Test dump
- print("Test dump")
- for nspace in db.keys():
- for basename in db[nspace].keys():
- file = os.path.join(DATADIR, nspace + "-" + basename + "__head")
- JSON = db[nspace][basename]['json']
- GETNAME = "/tmp/getbytes.{pid}".format(pid=pid)
- for pg in OBJREPPGS:
- OSDS = get_osds(pg, OSDDIR)
- for osd in OSDS:
- DIR = os.path.join(OSDDIR, os.path.join(osd, os.path.join("current", "{pg}_head".format(pg=pg))))
- fnames = [f for f in os.listdir(DIR) if os.path.isfile(os.path.join(DIR, f))
- and f.split("_")[0] == basename and f.split("_")[4] == nspace]
- if not fnames:
- continue
- if int(basename.split(REP_NAME)[1]) > int(NUM_CLONED_REP_OBJECTS):
- continue
- cmd = (CFSD_PREFIX + " '{json}' dump | grep '\"snap\": 1,' > /dev/null").format(osd=osd, json=JSON)
- logging.debug(cmd)
- ret = call(cmd, shell=True)
- if ret != 0:
- logging.error("Invalid dump for {json}".format(json=JSON))
- ERRORS += 1
-
- print("Test list-attrs get-attr")
- ATTRFILE = r"/tmp/attrs.{pid}".format(pid=pid)
- VALFILE = r"/tmp/val.{pid}".format(pid=pid)
- for nspace in db.keys():
- for basename in db[nspace].keys():
- file = os.path.join(DATADIR, nspace + "-" + basename)
- JSON = db[nspace][basename]['json']
- jsondict = json.loads(JSON)
-
- if 'shard_id' in jsondict:
- logging.debug("ECobject " + JSON)
- found = 0
- for pg in OBJECPGS:
- OSDS = get_osds(pg, OSDDIR)
- # Fix shard_id since we only have one json instance for each object
- jsondict['shard_id'] = int(pg.split('s')[1])
- JSON = json.dumps(jsondict)
- for osd in OSDS:
- cmd = (CFSD_PREFIX + "--pgid {pg} '{json}' get-attr hinfo_key").format(osd=osd, pg=pg, json=JSON)
- logging.debug("TRY: " + cmd)
- try:
- out = check_output(cmd, shell=True, stderr=subprocess.STDOUT)
- logging.debug("FOUND: {json} in {osd} has value '{val}'".format(osd=osd, json=JSON, val=out))
- found += 1
- except subprocess.CalledProcessError as e:
- if "No such file or directory" not in e.output and "No data available" not in e.output:
- raise
- # Assuming k=2 m=1 for the default ec pool
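- # each object then has k+m = 3 shards, so hinfo_key should be found 3 times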
- if found != 3:
- logging.error("{json} hinfo_key found {found} times instead of 3".format(json=JSON, found=found))
- ERRORS += 1
-
- for pg in ALLPGS:
- # Make sure rep obj with rep pg or ec obj with ec pg
- if ('shard_id' in jsondict) != (pg.find('s') > 0):
- continue
- if 'shard_id' in jsondict:
- # Fix shard_id since we only have one json instance for each object
- jsondict['shard_id'] = int(pg.split('s')[1])
- JSON = json.dumps(jsondict)
- OSDS = get_osds(pg, OSDDIR)
- for osd in OSDS:
- DIR = os.path.join(OSDDIR, os.path.join(osd, os.path.join("current", "{pg}_head".format(pg=pg))))
- fnames = [f for f in os.listdir(DIR) if os.path.isfile(os.path.join(DIR, f))
- and f.split("_")[0] == basename and f.split("_")[4] == nspace]
- if not fnames:
- continue
- afd = open(ATTRFILE, "wb")
- cmd = (CFSD_PREFIX + "--pgid {pg} '{json}' list-attrs").format(osd=osd, pg=pg, json=JSON)
- logging.debug(cmd)
- ret = call(cmd, shell=True, stdout=afd)
- afd.close()
- if ret != 0:
- logging.error("list-attrs failed with {ret}".format(ret=ret))
- ERRORS += 1
- continue
- keys = get_lines(ATTRFILE)
- values = dict(db[nspace][basename]["xattr"])
- for key in keys:
- if key == "_" or key == "snapset" or key == "hinfo_key":
- continue
- key = key.strip("_")
- if key not in values:
- logging.error("Unexpected key {key} present".format(key=key))
- ERRORS += 1
- continue
- exp = values.pop(key)
- vfd = open(VALFILE, "wb")
- cmd = (CFSD_PREFIX + "--pgid {pg} '{json}' get-attr {key}").format(osd=osd, pg=pg, json=JSON, key="_" + key)
- logging.debug(cmd)
- ret = call(cmd, shell=True, stdout=vfd)
- vfd.close()
- if ret != 0:
- logging.error("get-attr failed with {ret}".format(ret=ret))
- ERRORS += 1
- continue
- lines = get_lines(VALFILE)
- val = lines[0]
- if exp != val:
- logging.error("For key {key} got value {got} instead of {expected}".format(key=key, got=val, expected=exp))
- ERRORS += 1
- if len(values) != 0:
- logging.error("Not all keys found, remaining keys:")
- print(values)
-
- print("Test --op meta-list")
- tmpfd = open(TMPFILE, "wb")
- cmd = (CFSD_PREFIX + "--op meta-list").format(osd=ONEOSD)
- logging.debug(cmd)
- ret = call(cmd, shell=True, stdout=tmpfd)
- if ret != 0:
- logging.error("Bad exit status {ret} from --op meta-list request".format(ret=ret))
- ERRORS += 1
-
- print("Test get-bytes on meta")
- tmpfd.close()
- lines = get_lines(TMPFILE)
- JSONOBJ = sorted(set(lines))
- for JSON in JSONOBJ:
- (pgid, jsondict) = json.loads(JSON)
- if pgid != "meta":
- logging.error("pgid incorrect for --op meta-list {pgid}".format(pgid=pgid))
- ERRORS += 1
- if jsondict['namespace'] != "":
- logging.error("namespace non null --op meta-list {ns}".format(ns=jsondict['namespace']))
- ERRORS += 1
- logging.info(JSON)
- try:
- os.unlink(GETNAME)
- except:
- pass
- cmd = (CFSD_PREFIX + "'{json}' get-bytes {fname}").format(osd=ONEOSD, json=JSON, fname=GETNAME)
- logging.debug(cmd)
- ret = call(cmd, shell=True)
- if ret != 0:
- logging.error("Bad exit status {ret}".format(ret=ret))
- ERRORS += 1
-
- try:
- os.unlink(GETNAME)
- except:
- pass
- try:
- os.unlink(TESTNAME)
- except:
- pass
-
- print("Test pg info")
- for pg in ALLREPPGS + ALLECPGS:
- for osd in get_osds(pg, OSDDIR):
- cmd = (CFSD_PREFIX + "--op info --pgid {pg} | grep '\"pgid\": \"{pg}\"'").format(osd=osd, pg=pg)
- logging.debug(cmd)
- ret = call(cmd, shell=True, stdout=nullfd)
- if ret != 0:
- logging.error("Getting info failed for pg {pg} from {osd} with {ret}".format(pg=pg, osd=osd, ret=ret))
- ERRORS += 1
-
- print("Test pg logging")
- if len(ALLREPPGS + ALLECPGS) == len(OBJREPPGS + OBJECPGS):
- logging.warning("All PGs have objects, so no log without modify entries")
- for pg in ALLREPPGS + ALLECPGS:
- for osd in get_osds(pg, OSDDIR):
- tmpfd = open(TMPFILE, "wb")
- cmd = (CFSD_PREFIX + "--op log --pgid {pg}").format(osd=osd, pg=pg)
- logging.debug(cmd)
- ret = call(cmd, shell=True, stdout=tmpfd)
- if ret != 0:
- logging.error("Getting log failed for pg {pg} from {osd} with {ret}".format(pg=pg, osd=osd, ret=ret))
- ERRORS += 1
- HASOBJ = pg in OBJREPPGS + OBJECPGS
- MODOBJ = False
- for line in get_lines(TMPFILE):
- if line.find("modify") != -1:
- MODOBJ = True
- break
- if HASOBJ != MODOBJ:
- logging.error("Bad log for pg {pg} from {osd}".format(pg=pg, osd=osd))
- MSG = "" if HASOBJ else "NOT "
- print("Log should {msg}have a modify entry".format(msg=MSG))
- ERRORS += 1
-
- try:
- os.unlink(TMPFILE)
- except:
- pass
-
- print("Test list-pgs")
- for osd in [f for f in os.listdir(OSDDIR) if os.path.isdir(os.path.join(OSDDIR, f)) and f.find("osd") == 0]:
-
- CHECK_PGS = get_osd_pgs(os.path.join(OSDDIR, osd), None)
- CHECK_PGS = sorted(CHECK_PGS)
-
- cmd = (CFSD_PREFIX + "--op list-pgs").format(osd=osd)
- logging.debug(cmd)
- TEST_PGS = check_output(cmd, shell=True).split("\n")
- TEST_PGS = sorted(TEST_PGS)[1:] # Skip extra blank line
-
- if TEST_PGS != CHECK_PGS:
- logging.error("list-pgs got wrong result for osd.{osd}".format(osd=osd))
- logging.error("Expected {pgs}".format(pgs=CHECK_PGS))
- logging.error("Got {pgs}".format(pgs=TEST_PGS))
- ERRORS += 1
-
- EXP_ERRORS = 0
- print("Test pg export --dry-run")
- pg = ALLREPPGS[0]
- osd = get_osds(pg, OSDDIR)[0]
- fname = "/tmp/fname.{pid}".format(pid=pid)
- cmd = (CFSD_PREFIX + "--dry-run --op export --pgid {pg} --file {file}").format(osd=osd, pg=pg, file=fname)
- logging.debug(cmd)
- ret = call(cmd, shell=True, stdout=nullfd, stderr=nullfd)
- if ret != 0:
- logging.error("Exporting --dry-run failed for pg {pg} on {osd} with {ret}".format(pg=pg, osd=osd, ret=ret))
- EXP_ERRORS += 1
- elif os.path.exists(fname):
- logging.error("Exporting --dry-run created file")
- EXP_ERRORS += 1
-
- cmd = (CFSD_PREFIX + "--dry-run --op export --pgid {pg} > {file}").format(osd=osd, pg=pg, file=fname)
- logging.debug(cmd)
- ret = call(cmd, shell=True, stdout=nullfd, stderr=nullfd)
- if ret != 0:
- logging.error("Exporting --dry-run failed for pg {pg} on {osd} with {ret}".format(pg=pg, osd=osd, ret=ret))
- EXP_ERRORS += 1
- else:
- outdata = get_lines(fname)
- if len(outdata) > 0:
- logging.error("Exporting --dry-run to stdout not empty")
- logging.error("Data: " + outdata)
- EXP_ERRORS += 1
-
- os.mkdir(TESTDIR)
- for osd in [f for f in os.listdir(OSDDIR) if os.path.isdir(os.path.join(OSDDIR, f)) and f.find("osd") == 0]:
- os.mkdir(os.path.join(TESTDIR, osd))
- print("Test pg export")
- for pg in ALLREPPGS + ALLECPGS:
- for osd in get_osds(pg, OSDDIR):
- mydir = os.path.join(TESTDIR, osd)
- fname = os.path.join(mydir, pg)
- if pg == ALLREPPGS[0]:
- cmd = (CFSD_PREFIX + "--op export --pgid {pg} > {file}").format(osd=osd, pg=pg, file=fname)
- elif pg == ALLREPPGS[1]:
- cmd = (CFSD_PREFIX + "--op export --pgid {pg} --file - > {file}").format(osd=osd, pg=pg, file=fname)
- else:
- cmd = (CFSD_PREFIX + "--op export --pgid {pg} --file {file}").format(osd=osd, pg=pg, file=fname)
- logging.debug(cmd)
- ret = call(cmd, shell=True, stdout=nullfd, stderr=nullfd)
- if ret != 0:
- logging.error("Exporting failed for pg {pg} on {osd} with {ret}".format(pg=pg, osd=osd, ret=ret))
- EXP_ERRORS += 1
-
- ERRORS += EXP_ERRORS
-
- print("Test pg removal")
- RM_ERRORS = 0
- for pg in ALLREPPGS + ALLECPGS:
- for osd in get_osds(pg, OSDDIR):
- # This should do nothing
- cmd = (CFSD_PREFIX + "--op remove --pgid {pg} --dry-run").format(pg=pg, osd=osd)
- logging.debug(cmd)
- ret = call(cmd, shell=True, stdout=nullfd)
- if ret != 0:
- logging.error("Removing --dry-run failed for pg {pg} on {osd} with {ret}".format(pg=pg, osd=osd, ret=ret))
- RM_ERRORS += 1
- cmd = (CFSD_PREFIX + "--op remove --pgid {pg}").format(pg=pg, osd=osd)
- logging.debug(cmd)
- ret = call(cmd, shell=True, stdout=nullfd)
- if ret != 0:
- logging.error("Removing failed for pg {pg} on {osd} with {ret}".format(pg=pg, osd=osd, ret=ret))
- RM_ERRORS += 1
-
- ERRORS += RM_ERRORS
-
- IMP_ERRORS = 0
- if EXP_ERRORS == 0 and RM_ERRORS == 0:
- print("Test pg import")
- for osd in [f for f in os.listdir(OSDDIR) if os.path.isdir(os.path.join(OSDDIR, f)) and f.find("osd") == 0]:
- dir = os.path.join(TESTDIR, osd)
- PGS = [f for f in os.listdir(dir) if os.path.isfile(os.path.join(dir, f))]
- for pg in PGS:
- file = os.path.join(dir, pg)
- # This should do nothing
- cmd = (CFSD_PREFIX + "--op import --file {file} --dry-run").format(osd=osd, file=file)
- logging.debug(cmd)
- ret = call(cmd, shell=True, stdout=nullfd)
- if ret != 0:
- logging.error("Import failed from {file} with {ret}".format(file=file, ret=ret))
- IMP_ERRORS += 1
- if pg == PGS[0]:
- cmd = ("cat {file} |".format(file=file) + CFSD_PREFIX + "--op import").format(osd=osd)
- elif pg == PGS[1]:
- cmd = (CFSD_PREFIX + "--op import --file - --pgid {pg} < {file}").format(osd=osd, file=file, pg=pg)
- else:
- cmd = (CFSD_PREFIX + "--op import --file {file}").format(osd=osd, file=file)
- logging.debug(cmd)
- ret = call(cmd, shell=True, stdout=nullfd)
- if ret != 0:
- logging.error("Import failed from {file} with {ret}".format(file=file, ret=ret))
- IMP_ERRORS += 1
- else:
- logging.warning("SKIPPING IMPORT TESTS DUE TO PREVIOUS FAILURES")
-
- ERRORS += IMP_ERRORS
- logging.debug(cmd)
-
- if EXP_ERRORS == 0 and RM_ERRORS == 0 and IMP_ERRORS == 0:
- print("Verify replicated import data")
- data_errors, _ = check_data(DATADIR, TMPFILE, OSDDIR, REP_NAME)
- ERRORS += data_errors
- else:
- logging.warning("SKIPPING CHECKING IMPORT DATA DUE TO PREVIOUS FAILURES")
-
- print("Test all --op dump-journal again")
- ALLOSDS = [f for f in os.listdir(OSDDIR) if os.path.isdir(os.path.join(OSDDIR, f)) and f.find("osd") == 0]
- ERRORS += test_dump_journal(CFSD_PREFIX, ALLOSDS)
-
- vstart(new=False)
- wait_for_health()
-
- if EXP_ERRORS == 0 and RM_ERRORS == 0 and IMP_ERRORS == 0:
- print("Verify erasure coded import data")
- ERRORS += verify(DATADIR, EC_POOL, EC_NAME, db)
- # Check replicated data/xattr/omap using rados
- print("Verify replicated import data using rados")
- ERRORS += verify(DATADIR, REP_POOL, REP_NAME, db)
-
- if EXP_ERRORS == 0:
- NEWPOOL = "rados-import-pool"
- cmd = "{path}/rados mkpool {pool}".format(pool=NEWPOOL, path=CEPH_BIN)
- logging.debug(cmd)
- ret = call(cmd, shell=True, stdout=nullfd, stderr=nullfd)
-
- print("Test rados import")
- first = True
- for osd in [f for f in os.listdir(OSDDIR) if os.path.isdir(os.path.join(OSDDIR, f)) and f.find("osd") == 0]:
- dir = os.path.join(TESTDIR, osd)
- for pg in [f for f in os.listdir(dir) if os.path.isfile(os.path.join(dir, f))]:
- if pg.find("{id}.".format(id=REPID)) != 0:
- continue
- file = os.path.join(dir, pg)
- if first:
- first = False
- # This should do nothing
- cmd = "{path}/rados import -p {pool} --dry-run {file}".format(pool=NEWPOOL, file=file, path=CEPH_BIN)
- logging.debug(cmd)
- ret = call(cmd, shell=True, stdout=nullfd)
- if ret != 0:
- logging.error("Rados import --dry-run failed from {file} with {ret}".format(file=file, ret=ret))
- ERRORS += 1
- cmd = "{path}/rados -p {pool} ls".format(pool=NEWPOOL, path=CEPH_BIN)
- logging.debug(cmd)
- data = check_output(cmd, shell=True)
- if data:
- logging.error("'{data}'".format(data=data))
- logging.error("Found objects after dry-run")
- ERRORS += 1
- cmd = "{path}/rados import -p {pool} {file}".format(pool=NEWPOOL, file=file, path=CEPH_BIN)
- logging.debug(cmd)
- ret = call(cmd, shell=True, stdout=nullfd)
- if ret != 0:
- logging.error("Rados import failed from {file} with {ret}".format(file=file, ret=ret))
- ERRORS += 1
- cmd = "{path}/rados import -p {pool} --no-overwrite {file}".format(pool=NEWPOOL, file=file, path=CEPH_BIN)
- logging.debug(cmd)
- ret = call(cmd, shell=True, stdout=nullfd)
- if ret != 0:
- logging.error("Rados import --no-overwrite failed from {file} with {ret}".format(file=file, ret=ret))
- ERRORS += 1
-
- ERRORS += verify(DATADIR, NEWPOOL, REP_NAME, db)
- else:
- logging.warning("SKIPPING IMPORT-RADOS TESTS DUE TO PREVIOUS FAILURES")
-
- # Clear directories of previous portion
- call("/bin/rm -rf {dir}".format(dir=TESTDIR), shell=True)
- call("/bin/rm -rf {dir}".format(dir=DATADIR), shell=True)
- os.mkdir(TESTDIR)
- os.mkdir(DATADIR)
-
- # Cause SPLIT_POOL to split and test import with object/log filtering
- print("Testing import all objects after a split")
- SPLIT_POOL = "split_pool"
- PG_COUNT = 1
- SPLIT_OBJ_COUNT = 5
- SPLIT_NSPACE_COUNT = 2
- SPLIT_NAME = "split"
- cmd = "{path}/ceph osd pool create {pool} {pg} {pg} replicated".format(pool=SPLIT_POOL, pg=PG_COUNT, path=CEPH_BIN)
- logging.debug(cmd)
- call(cmd, shell=True, stdout=nullfd, stderr=nullfd)
- SPLITID = get_pool_id(SPLIT_POOL, nullfd)
- pool_size = int(check_output("{path}/ceph osd pool get {pool} size".format(pool=SPLIT_POOL, path=CEPH_BIN), shell=True, stderr=nullfd).split(" ")[1])
- EXP_ERRORS = 0
- RM_ERRORS = 0
- IMP_ERRORS = 0
-
- objects = range(1, SPLIT_OBJ_COUNT + 1)
- nspaces = range(SPLIT_NSPACE_COUNT)
- for n in nspaces:
- nspace = get_nspace(n)
-
- for i in objects:
- NAME = SPLIT_NAME + "{num}".format(num=i)
- LNAME = nspace + "-" + NAME
- DDNAME = os.path.join(DATADIR, LNAME)
- DDNAME += "__head"
-
- cmd = "rm -f " + DDNAME
- logging.debug(cmd)
- call(cmd, shell=True)
-
- if i == 1:
- dataline = range(DATALINECOUNT)
- else:
- dataline = range(1)
- fd = open(DDNAME, "w")
- data = "This is the split data for " + LNAME + "\n"
- for _ in dataline:
- fd.write(data)
- fd.close()
-
- cmd = "{path}/rados -p {pool} -N '{nspace}' put {name} {ddname}".format(pool=SPLIT_POOL, name=NAME, ddname=DDNAME, nspace=nspace, path=CEPH_BIN)
- logging.debug(cmd)
- ret = call(cmd, shell=True, stderr=nullfd)
- if ret != 0:
- logging.critical("Rados put command failed with {ret}".format(ret=ret))
- return 1
-
- wait_for_health()
- kill_daemons()
-
- for osd in [f for f in os.listdir(OSDDIR) if os.path.isdir(os.path.join(OSDDIR, f)) and f.find("osd") == 0]:
- os.mkdir(os.path.join(TESTDIR, osd))
-
- pg = "{pool}.0".format(pool=SPLITID)
- EXPORT_PG = pg
-
- export_osds = get_osds(pg, OSDDIR)
- for osd in export_osds:
- mydir = os.path.join(TESTDIR, osd)
- fname = os.path.join(mydir, pg)
- cmd = (CFSD_PREFIX + "--op export --pgid {pg} --file {file}").format(osd=osd, pg=pg, file=fname)
- logging.debug(cmd)
- ret = call(cmd, shell=True, stdout=nullfd, stderr=nullfd)
- if ret != 0:
- logging.error("Exporting failed for pg {pg} on {osd} with {ret}".format(pg=pg, osd=osd, ret=ret))
- EXP_ERRORS += 1
-
- ERRORS += EXP_ERRORS
-
- if EXP_ERRORS == 0:
- vstart(new=False)
- wait_for_health()
-
- cmd = "{path}/ceph osd pool set {pool} pg_num 2".format(pool=SPLIT_POOL, path=CEPH_BIN)
- logging.debug(cmd)
- ret = call(cmd, shell=True, stdout=nullfd, stderr=nullfd)
- time.sleep(5)
- wait_for_health()
-
- kill_daemons()
-
- # Now 2 PGs, poolid.0 and poolid.1
- for seed in range(2):
- pg = "{pool}.{seed}".format(pool=SPLITID, seed=seed)
-
- which = 0
- for osd in get_osds(pg, OSDDIR):
- cmd = (CFSD_PREFIX + "--op remove --pgid {pg}").format(pg=pg, osd=osd)
- logging.debug(cmd)
- ret = call(cmd, shell=True, stdout=nullfd)
-
- # This is weird. The export files are based on only the EXPORT_PG
- # and where that pg was before the split. Use 'which' to use all
- # export copies in import.
- mydir = os.path.join(TESTDIR, export_osds[which])
- fname = os.path.join(mydir, EXPORT_PG)
- which += 1
- cmd = (CFSD_PREFIX + "--op import --pgid {pg} --file {file}").format(osd=osd, pg=pg, file=fname)
- logging.debug(cmd)
- ret = call(cmd, shell=True, stdout=nullfd)
- if ret != 0:
- logging.error("Import failed from {file} with {ret}".format(file=file, ret=ret))
- IMP_ERRORS += 1
-
- ERRORS += IMP_ERRORS
-
- # Start up again to make sure imports didn't corrupt anything
- if IMP_ERRORS == 0:
- print("Verify split import data")
- data_errors, count = check_data(DATADIR, TMPFILE, OSDDIR, SPLIT_NAME)
- ERRORS += data_errors
- if count != (SPLIT_OBJ_COUNT * SPLIT_NSPACE_COUNT * pool_size):
- logging.error("Incorrect number of replicas seen {count}".format(count=count))
- ERRORS += 1
- vstart(new=False)
- wait_for_health()
-
- call("/bin/rm -rf {dir}".format(dir=TESTDIR), shell=True)
- call("/bin/rm -rf {dir}".format(dir=DATADIR), shell=True)
-
- ERRORS += test_removeall(CFSD_PREFIX, db, OBJREPPGS, REP_POOL, CEPH_BIN, OSDDIR, REP_NAME, NUM_CLONED_REP_OBJECTS)
-
- # vstart() starts 4 OSDs
- ERRORS += test_get_set_osdmap(CFSD_PREFIX, list(range(4)), ALLOSDS)
- ERRORS += test_get_set_inc_osdmap(CFSD_PREFIX, ALLOSDS[0])
- if ERRORS == 0:
- print("TEST PASSED")
- return 0
- else:
- print("TEST FAILED WITH {errcount} ERRORS".format(errcount=ERRORS))
- return 1
-
-
-def remove_btrfs_subvolumes(path):
- if platform.system() == "FreeBSD":
- return
- result = subprocess.Popen("stat -f -c '%%T' %s" % path, shell=True, stdout=subprocess.PIPE)
- for line in result.stdout:
- filesystem = decode(line).rstrip('\n')
- if filesystem == "btrfs":
- result = subprocess.Popen("sudo btrfs subvolume list %s" % path, shell=True, stdout=subprocess.PIPE)
- for line in result.stdout:
- subvolume = decode(line).split()[8]
- # extracting the relative volume name
- m = re.search(".*(%s.*)" % path, subvolume)
- if m:
- found = m.group(1)
- call("sudo btrfs subvolume delete %s" % found, shell=True)
-
-
-if __name__ == "__main__":
- status = 1
- try:
- status = main(sys.argv[1:])
- finally:
- kill_daemons()
- remove_btrfs_subvolumes(CEPH_DIR)
- call("/bin/rm -fr {dir}".format(dir=CEPH_DIR), shell=True)
- sys.exit(status)
# display a warning if there is more than one root
#
$ crushtool --outfn "$map" --build --num_osds 5 node straw 2 rack straw 1
- .* The crush rulesets will use the root rack0 (re)
+ The crush rulesets will use the root rack0 (re)
and ignore the others.
There are 3 roots, they can be
grouped into a single root by appending something like:
--test-map-pgs-dump-all [--pool <poolid>] map all pgs to osds
--health dump health checks
--mark-up-in mark osds up and in (but do not persist)
+ --mark-out <osdid> mark an osd as out (but do not persist)
--with-default-pool include default pool when creating map
--clear-temp clear pg_temp and primary_temp
--test-random do random placements
--test-map-pgs-dump-all [--pool <poolid>] map all pgs to osds
--health dump health checks
--mark-up-in mark osds up and in (but do not persist)
+ --mark-out <osdid> mark an osd as out (but do not persist)
--with-default-pool include default pool when creating map
--clear-temp clear pg_temp and primary_temp
--test-random do random placements
--- /dev/null
+ $ osdmaptool --create-from-conf om -c $TESTDIR/ceph.conf.withracks --with-default-pool
+ osdmaptool: osdmap file 'om'
+ osdmaptool: writing epoch 1 to om
+ $ osdmaptool om --mark-up-in --mark-out 147 --upmap-max 11 --upmap c
+ osdmaptool: osdmap file 'om'
+ marking all OSDs up and in
+ marking OSD@147 as out
+ writing upmap command output to: c
+ checking for upmap cleanups
+ upmap, max-count 11, max deviation 0.01
+ $ cat c
+ ceph osd pg-upmap-items 1.7 142 145
+ ceph osd pg-upmap-items 1.8 219 223 99 103
+ ceph osd pg-upmap-items 1.17 171 173 201 202
+ ceph osd pg-upmap-items 1.1a 201 202 115 114
+ ceph osd pg-upmap-items 1.1c 171 173 201 202 127 130
+ ceph osd pg-upmap-items 1.20 88 87 201 202
+ ceph osd pg-upmap-items 1.21 207 206 142 145
+ ceph osd pg-upmap-items 1.51 201 202 65 64 186 189
+ ceph osd pg-upmap-items 1.62 219 223
+ ceph osd pg-upmap-items 1.6f 219 223 108 111
+ ceph osd pg-upmap-items 1.82 219 223 157 158 6 3
+ $ rm -f om c
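
For reference, the upmap output checked above consists of ordinary ``ceph osd pg-upmap-items`` CLI commands. A minimal Python sketch of applying such an output file, assuming a reachable cluster and the ``ceph`` binary on PATH (the file name "c" and the helper below are illustrative, not part of the test):

    # Apply the "ceph osd pg-upmap-items ..." commands that
    # "osdmaptool --upmap <file>" wrote to its output file.
    # Assumes a reachable cluster and the ceph CLI on PATH.
    import shlex
    import subprocess
    import sys

    def apply_upmap_file(path="c"):
        failures = 0
        with open(path) as f:
            for line in f:
                line = line.strip()
                if not line.startswith("ceph osd pg-upmap-items"):
                    continue  # ignore anything that is not an upmap command
                if subprocess.call(shlex.split(line)) != 0:
                    print("failed: {cmd}".format(cmd=line))
                    failures += 1
        return failures

    if __name__ == "__main__":
        sys.exit(1 if apply_upmap_file(*sys.argv[1:]) else 0)
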
--- /dev/null
+# cls_test_cls_journal
+add_executable(ceph_test_cls_journal
+ test_cls_journal.cc
+ $<TARGET_OBJECTS:common_texttable_obj>)
+set_target_properties(ceph_test_cls_journal PROPERTIES COMPILE_FLAGS
+ ${UNITTEST_CXX_FLAGS})
+target_link_libraries(ceph_test_cls_journal
+ cls_journal_client
+ librados
+ global
+ ${UNITTEST_LIBS}
+ ${CMAKE_DL_LIBS}
+ ${CRYPTO_LIBS}
+ ${EXTRALIBS}
+ radostest)
+install(TARGETS
+ ceph_test_cls_journal
+ DESTINATION ${CMAKE_INSTALL_BINDIR})
bufferlist()));
ASSERT_EQ(0, client::tag_create(ioctx, oid, 1, Tag::TAG_CLASS_NEW,
bufferlist()));
- ASSERT_EQ(0, client::tag_create(ioctx, oid, 2, 1, bufferlist()));
+
+ for (uint32_t i = 2; i <= 96; ++i) {
+ ASSERT_EQ(0, client::tag_create(ioctx, oid, i, 1, bufferlist()));
+ }
librados::ObjectWriteOperation op1;
- client::client_commit(&op1, "id1", {{{1, 2, 120}}});
+ client::client_commit(&op1, "id1", {{{1, 32, 120}}});
ASSERT_EQ(0, ioctx.operate(oid, &op1));
ASSERT_EQ(0, client::client_unregister(ioctx, oid, "id2"));
- std::set<Tag> expected_tags = {{0, 0, {}}, {2, 1, {}}};
+ std::set<Tag> expected_tags = {{0, 0, {}}};
+ for (uint32_t i = 32; i <= 96; ++i) {
+ expected_tags.insert({i, 1, {}});
+ }
std::set<Tag> tags;
ASSERT_EQ(0, client::tag_list(ioctx, oid, "id1",
boost::optional<uint64_t>(), &tags));
ASSERT_EQ(0, client::tag_list(ioctx, oid, "id1", boost::optional<uint64_t>(0),
&tags));
ASSERT_EQ(expected_filtered_tags, tags);
+
+ librados::ObjectWriteOperation op1;
+ client::client_commit(&op1, "id1", {{{96, 0, 120}}});
+ ASSERT_EQ(0, ioctx.operate(oid, &op1));
+
+ ASSERT_EQ(0, client::tag_list(ioctx, oid, "id1", boost::optional<uint64_t>(),
+ &tags));
+ ASSERT_EQ(expected_all_tags, tags);
}
TEST_F(TestClsJournal, GuardAppend) {
ASSERT_THROW(bit_vector2.decode_data(data_it, byte_offset),
buffer::malformed_input);
}
+
+TYPED_TEST(BitVectorTest, iterator) {
+ typename TestFixture::bit_vector_t bit_vector;
+
+ uint64_t radix = 1 << bit_vector.BIT_COUNT;
+ uint64_t size = 25 * (1ULL << 20);
+ uint64_t offset = 0;
+
+ // create fragmented in-memory bufferlist layout
+ uint64_t resize = 0;
+ while (resize < size) {
+ resize += 4096;
+ if (resize > size) {
+ resize = size;
+ }
+ bit_vector.resize(resize);
+ }
+
+ for (auto it = bit_vector.begin(); it != bit_vector.end(); ++it, ++offset) {
+ *it = offset % radix;
+ }
+
+ offset = 123;
+ auto end_it = bit_vector.begin() + (size - 1024);
+ for (auto it = bit_vector.begin() + offset; it != end_it; ++it, ++offset) {
+ ASSERT_EQ(offset % radix, *it);
+ }
+}
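
The typed test above depends only on each element holding ``BIT_COUNT`` bits, so legal values span [0, 1 << BIT_COUNT). A minimal Python model of a 2-bit-per-element vector illustrating that radix arithmetic (an illustration, not Ceph's ceph::BitVector):

    # 2-bit-per-element vector; mirrors the radix arithmetic used by the
    # iterator test above (radix = 1 << BIT_COUNT). Illustrative only.
    class TwoBitVector(object):
        BIT_COUNT = 2

        def __init__(self, size):
            self._size = size
            self._data = bytearray((size * self.BIT_COUNT + 7) // 8)

        def __len__(self):
            return self._size

        def __getitem__(self, idx):
            byte, shift = divmod(idx * self.BIT_COUNT, 8)
            return (self._data[byte] >> shift) & 0x3

        def __setitem__(self, idx, value):
            byte, shift = divmod(idx * self.BIT_COUNT, 8)
            self._data[byte] &= 0xff ^ (0x3 << shift)
            self._data[byte] |= (value & 0x3) << shift

    radix = 1 << TwoBitVector.BIT_COUNT
    vec = TwoBitVector(1024)
    for offset in range(len(vec)):
        vec[offset] = offset % radix
    assert all(vec[i] == i % radix for i in range(len(vec)))
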
TEST(DaemonConfig, SimpleSet) {
int ret;
ret = g_ceph_context->_conf->set_val("log_graylog_port", "21");
- ASSERT_EQ(ret, 0);
+ ASSERT_EQ(0, ret);
g_ceph_context->_conf->apply_changes(NULL);
char buf[128];
memset(buf, 0, sizeof(buf));
char *tmp = buf;
ret = g_ceph_context->_conf->get_val("log_graylog_port", &tmp, sizeof(buf));
- ASSERT_EQ(ret, 0);
+ ASSERT_EQ(0, ret);
ASSERT_EQ(string("21"), string(buf));
}
TEST(DaemonConfig, Substitution) {
int ret;
ret = g_ceph_context->_conf->set_val("internal_safe_to_start_threads", "false");
- ASSERT_EQ(ret, 0);
+ ASSERT_EQ(0, ret);
ret = g_ceph_context->_conf->set_val("host", "foo");
- ASSERT_EQ(ret, 0);
+ ASSERT_EQ(0, ret);
ret = g_ceph_context->_conf->set_val("public_network", "bar$host.baz", false);
- ASSERT_EQ(ret, 0);
+ ASSERT_EQ(0, ret);
g_ceph_context->_conf->apply_changes(NULL);
char buf[128];
memset(buf, 0, sizeof(buf));
char *tmp = buf;
ret = g_ceph_context->_conf->get_val("public_network", &tmp, sizeof(buf));
- ASSERT_EQ(ret, 0);
+ ASSERT_EQ(0, ret);
ASSERT_EQ(string("barfoo.baz"), string(buf));
}
TEST(DaemonConfig, SubstitutionTrailing) {
int ret;
ret = g_ceph_context->_conf->set_val("internal_safe_to_start_threads", "false");
- ASSERT_EQ(ret, 0);
+ ASSERT_EQ(0, ret);
ret = g_ceph_context->_conf->set_val("host", "foo");
- ASSERT_EQ(ret, 0);
+ ASSERT_EQ(0, ret);
ret = g_ceph_context->_conf->set_val("public_network", "bar$host", false);
- ASSERT_EQ(ret, 0);
+ ASSERT_EQ(0, ret);
g_ceph_context->_conf->apply_changes(NULL);
char buf[128];
memset(buf, 0, sizeof(buf));
char *tmp = buf;
ret = g_ceph_context->_conf->get_val("public_network", &tmp, sizeof(buf));
- ASSERT_EQ(ret, 0);
+ ASSERT_EQ(0, ret);
ASSERT_EQ(string("barfoo"), string(buf));
}
TEST(DaemonConfig, SubstitutionBraces) {
int ret;
ret = g_ceph_context->_conf->set_val("internal_safe_to_start_threads", "false");
- ASSERT_EQ(ret, 0);
+ ASSERT_EQ(0, ret);
ret = g_ceph_context->_conf->set_val("host", "foo");
- ASSERT_EQ(ret, 0);
+ ASSERT_EQ(0, ret);
ret = g_ceph_context->_conf->set_val("public_network", "bar${host}baz", false);
- ASSERT_EQ(ret, 0);
+ ASSERT_EQ(0, ret);
g_ceph_context->_conf->apply_changes(NULL);
char buf[128];
memset(buf, 0, sizeof(buf));
char *tmp = buf;
ret = g_ceph_context->_conf->get_val("public_network", &tmp, sizeof(buf));
- ASSERT_EQ(ret, 0);
+ ASSERT_EQ(0, ret);
ASSERT_EQ(string("barfoobaz"), string(buf));
}
TEST(DaemonConfig, SubstitutionBracesTrailing) {
int ret;
ret = g_ceph_context->_conf->set_val("internal_safe_to_start_threads", "false");
- ASSERT_EQ(ret, 0);
+ ASSERT_EQ(0, ret);
ret = g_ceph_context->_conf->set_val("host", "foo");
- ASSERT_EQ(ret, 0);
+ ASSERT_EQ(0, ret);
ret = g_ceph_context->_conf->set_val("public_network", "bar${host}", false);
- ASSERT_EQ(ret, 0);
+ ASSERT_EQ(0, ret);
g_ceph_context->_conf->apply_changes(NULL);
char buf[128];
memset(buf, 0, sizeof(buf));
char *tmp = buf;
ret = g_ceph_context->_conf->get_val("public_network", &tmp, sizeof(buf));
- ASSERT_EQ(ret, 0);
+ ASSERT_EQ(0, ret);
ASSERT_EQ(string("barfoo"), string(buf));
}
TEST(DaemonConfig, SubstitutionMultiple) {
int ret;
ret = g_ceph_context->_conf->set_val("mon_host", "localhost", false);
- ASSERT_EQ(ret, 0);
+ ASSERT_EQ(0, ret);
ret = g_ceph_context->_conf->set_val("keyring", "$mon_host/$cluster.keyring,$mon_host/$cluster.mon.keyring", false);
- ASSERT_EQ(ret, 0);
+ ASSERT_EQ(0, ret);
g_ceph_context->_conf->apply_changes(NULL);
char buf[512];
memset(buf, 0, sizeof(buf));
char *tmp = buf;
ret = g_ceph_context->_conf->get_val("keyring", &tmp, sizeof(buf));
- ASSERT_EQ(ret, 0);
+ ASSERT_EQ(0, ret);
ASSERT_EQ(string("localhost/ceph.keyring,localhost/ceph.mon.keyring"), tmp);
ASSERT_TRUE(strchr(buf, '$') == NULL);
}
char *tmp = buf;
memset(buf, 0, sizeof(buf));
ret = g_ceph_context->_conf->get_val("keyfile", &tmp, sizeof(buf));
- ASSERT_EQ(ret, 0);
+ ASSERT_EQ(0, ret);
ASSERT_EQ(string("/tmp/my-keyfile"), string(buf));
memset(buf, 0, sizeof(buf));
ret = g_ceph_context->_conf->get_val("log_graylog_port", &tmp, sizeof(buf));
- ASSERT_EQ(ret, 0);
+ ASSERT_EQ(0, ret);
ASSERT_EQ(string("22"), string(buf));
ASSERT_EQ(0, g_ceph_context->_conf->set_val("internal_safe_to_start_threads",
int ret;
std::string injection("--log-graylog-port 56 --leveldb-max-open-files 42");
ret = g_ceph_context->_conf->injectargs(injection, &cout);
- ASSERT_EQ(ret, 0);
+ ASSERT_EQ(0, ret);
char buf[128];
char *tmp = buf;
memset(buf, 0, sizeof(buf));
ret = g_ceph_context->_conf->get_val("leveldb_max_open_files", &tmp, sizeof(buf));
- ASSERT_EQ(ret, 0);
+ ASSERT_EQ(0, ret);
ASSERT_EQ(string("42"), string(buf));
memset(buf, 0, sizeof(buf));
ret = g_ceph_context->_conf->get_val("log_graylog_port", &tmp, sizeof(buf));
- ASSERT_EQ(ret, 0);
+ ASSERT_EQ(0, ret);
ASSERT_EQ(string("56"), string(buf));
injection = "--log-graylog-port 57";
ret = g_ceph_context->_conf->injectargs(injection, &cout);
- ASSERT_EQ(ret, 0);
+ ASSERT_EQ(0, ret);
ret = g_ceph_context->_conf->get_val("log_graylog_port", &tmp, sizeof(buf));
- ASSERT_EQ(ret, 0);
+ ASSERT_EQ(0, ret);
ASSERT_EQ(string("57"), string(buf));
}
// We should complain about the garbage in the input
std::string injection("--random-garbage-in-injectargs 26 --log-graylog-port 28");
ret = g_ceph_context->_conf->injectargs(injection, &cout);
- ASSERT_EQ(ret, -EINVAL);
+ ASSERT_EQ(-EINVAL, ret);
  // But, log_graylog_port should still be set...
memset(buf, 0, sizeof(buf));
ret = g_ceph_context->_conf->get_val("log_graylog_port", &tmp, sizeof(buf));
- ASSERT_EQ(ret, 0);
+ ASSERT_EQ(0, ret);
ASSERT_EQ(string("28"), string(buf));
// What's the current value of osd_data?
memset(buf, 0, sizeof(buf));
ret = g_ceph_context->_conf->get_val("osd_data", &tmp, sizeof(buf));
- ASSERT_EQ(ret, 0);
+ ASSERT_EQ(0, ret);
// Injectargs shouldn't let us change this, since it is a string-valued
// variable and there isn't an observer for it.
std::string injection2("--osd_data /tmp/some-other-directory --log-graylog-port 4");
ret = g_ceph_context->_conf->injectargs(injection2, &cout);
- ASSERT_EQ(ret, -ENOSYS);
+ ASSERT_EQ(-ENOSYS, ret);
// It should be unchanged.
memset(buf2, 0, sizeof(buf2));
ret = g_ceph_context->_conf->get_val("osd_data", &tmp2, sizeof(buf2));
- ASSERT_EQ(ret, 0);
+ ASSERT_EQ(0, ret);
ASSERT_EQ(string(buf), string(buf2));
+
+ // We should complain about the missing arguments.
+ std::string injection3("--log-graylog-port 28 --debug_ms");
+ ret = g_ceph_context->_conf->injectargs(injection3, &cout);
+ ASSERT_EQ(-EINVAL, ret);
}
TEST(DaemonConfig, InjectArgsBooleans) {
// Change log_to_syslog
std::string injection("--log_to_syslog --log-graylog-port 28");
ret = g_ceph_context->_conf->injectargs(injection, &cout);
- ASSERT_EQ(ret, 0);
+ ASSERT_EQ(0, ret);
// log_to_syslog should be set...
memset(buf, 0, sizeof(buf));
ret = g_ceph_context->_conf->get_val("log_to_syslog", &tmp, sizeof(buf));
- ASSERT_EQ(ret, 0);
+ ASSERT_EQ(0, ret);
ASSERT_EQ(string("true"), string(buf));
// Turn off log_to_syslog
injection = "--log_to_syslog=false --log-graylog-port 28";
ret = g_ceph_context->_conf->injectargs(injection, &cout);
- ASSERT_EQ(ret, 0);
+ ASSERT_EQ(0, ret);
// log_to_syslog should be cleared...
memset(buf, 0, sizeof(buf));
ret = g_ceph_context->_conf->get_val("log_to_syslog", &tmp, sizeof(buf));
- ASSERT_EQ(ret, 0);
+ ASSERT_EQ(0, ret);
ASSERT_EQ(string("false"), string(buf));
// Turn on log_to_syslog
injection = "--log-graylog-port=1 --log_to_syslog=true --leveldb-max-open-files 40";
ret = g_ceph_context->_conf->injectargs(injection, &cout);
- ASSERT_EQ(ret, 0);
+ ASSERT_EQ(0, ret);
// log_to_syslog should be set...
memset(buf, 0, sizeof(buf));
ret = g_ceph_context->_conf->get_val("log_to_syslog", &tmp, sizeof(buf));
- ASSERT_EQ(ret, 0);
+ ASSERT_EQ(0, ret);
ASSERT_EQ(string("true"), string(buf));
// parse error
injection = "--log-graylog-port 1 --log_to_syslog=falsey --leveldb-max-open-files 42";
ret = g_ceph_context->_conf->injectargs(injection, &cout);
- ASSERT_EQ(ret, -EINVAL);
+ ASSERT_EQ(-EINVAL, ret);
// log_to_syslog should still be set...
memset(buf, 0, sizeof(buf));
ret = g_ceph_context->_conf->get_val("log_to_syslog", &tmp, sizeof(buf));
- ASSERT_EQ(ret, 0);
+ ASSERT_EQ(0, ret);
ASSERT_EQ(string("true"), string(buf));
  // leveldb_max_open_files should still become 42...
memset(buf, 0, sizeof(buf));
ret = g_ceph_context->_conf->get_val("leveldb_max_open_files", &tmp, sizeof(buf));
- ASSERT_EQ(ret, 0);
+ ASSERT_EQ(0, ret);
ASSERT_EQ(string("42"), string(buf));
}
injection += tmpfile;
// We're allowed to change log_file because there is an observer.
ret = g_ceph_context->_conf->injectargs(injection, &cout);
- ASSERT_EQ(ret, 0);
+ ASSERT_EQ(0, ret);
// It should have taken effect.
char buf[128];
char *tmp = buf;
memset(buf, 0, sizeof(buf));
ret = g_ceph_context->_conf->get_val("log_file", &tmp, sizeof(buf));
- ASSERT_EQ(ret, 0);
+ ASSERT_EQ(0, ret);
ASSERT_EQ(string(buf), string(tmpfile));
// The logfile should exist.
- ASSERT_EQ(access(tmpfile, R_OK), 0);
+ ASSERT_EQ(0, access(tmpfile, R_OK));
// Let's turn off the logfile.
ret = g_ceph_context->_conf->set_val("log_file", "");
- ASSERT_EQ(ret, 0);
+ ASSERT_EQ(0, ret);
g_ceph_context->_conf->apply_changes(NULL);
ret = g_ceph_context->_conf->get_val("log_file", &tmp, sizeof(buf));
- ASSERT_EQ(ret, 0);
+ ASSERT_EQ(0, ret);
ASSERT_EQ(string(""), string(buf));
// Clean up the garbage
// Verify that we can't change this, since internal_safe_to_start_threads has
// been set.
ret = g_ceph_context->_conf->set_val("osd_data", "");
- ASSERT_EQ(ret, -ENOSYS);
+ ASSERT_EQ(-ENOSYS, ret);
ASSERT_EQ(0, g_ceph_context->_conf->set_val("internal_safe_to_start_threads",
"false"));
// OSD threads running, we know changing osd_data won't actually blow up the
// world.
ret = g_ceph_context->_conf->set_val("osd_data", "/tmp/crazydata");
- ASSERT_EQ(ret, 0);
+ ASSERT_EQ(0, ret);
char buf[128];
char *tmp = buf;
memset(buf, 0, sizeof(buf));
ret = g_ceph_context->_conf->get_val("osd_data", &tmp, sizeof(buf));
- ASSERT_EQ(ret, 0);
+ ASSERT_EQ(0, ret);
ASSERT_EQ(string("/tmp/crazydata"), string(buf));
ASSERT_EQ(0, g_ceph_context->_conf->set_val("internal_safe_to_start_threads",
"false"));
- ASSERT_EQ(ret, 0);
+ ASSERT_EQ(0, ret);
}
TEST(DaemonConfig, InvalidIntegers) {
{
int ret = g_ceph_context->_conf->set_val("log_graylog_port", "rhubarb");
- ASSERT_EQ(ret, -EINVAL);
+ ASSERT_EQ(-EINVAL, ret);
}
{
string str = boost::lexical_cast<string>(max);
str = str + "999"; // some extra digits to take us out of bounds
int ret = g_ceph_context->_conf->set_val("log_graylog_port", str);
- ASSERT_EQ(ret, -EINVAL);
+ ASSERT_EQ(-EINVAL, ret);
}
}
double bad_value = 2 * (double)std::numeric_limits<float>::max();
string str = boost::lexical_cast<string>(-bad_value);
int ret = g_ceph_context->_conf->set_val("log_stop_at_utilization", str);
- ASSERT_EQ(ret, -EINVAL);
+ ASSERT_EQ(-EINVAL, ret);
}
{
double bad_value = 2 * (double)std::numeric_limits<float>::max();
string str = boost::lexical_cast<string>(bad_value);
int ret = g_ceph_context->_conf->set_val("log_stop_at_utilization", str);
- ASSERT_EQ(ret, -EINVAL);
+ ASSERT_EQ(-EINVAL, ret);
}
{
int ret = g_ceph_context->_conf->set_val("log_stop_at_utilization", "not a float");
- ASSERT_EQ(ret, -EINVAL);
+ ASSERT_EQ(-EINVAL, ret);
}
}
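
The substitution tests above assert the config metavariable rules: ``$var`` and ``${var}`` inside an option value expand to the referenced option's value before the option is used, and expansion applies recursively until the value is stable. A minimal Python model of that behavior (illustrative only, not the md_config_t implementation):

    import re

    # "$var" / "${var}" expansion as asserted by the substitution tests
    # above; unknown variables are left untouched. Illustrative model.
    def expand(value, options):
        pattern = re.compile(r"\$\{(\w+)\}|\$(\w+)")

        def repl(m):
            name = m.group(1) or m.group(2)
            return options.get(name, m.group(0))

        for _ in range(10):  # bounded, in case values expand to new $vars
            new_value = pattern.sub(repl, value)
            if new_value == value:
                break
            value = new_value
        return value

    options = {"host": "foo", "cluster": "ceph"}
    assert expand("bar$host.baz", options) == "barfoo.baz"
    assert expand("bar${host}baz", options) == "barfoobaz"
    assert expand("$host/$cluster.keyring", options) == "foo/ceph.keyring"
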
typedef RadosTest LibRadosMisc;
typedef RadosTestPP LibRadosMiscPP;
+typedef RadosTestECPP LibRadosMiscECPP;
TEST(LibRadosMiscVersion, Version) {
int major, minor, extra;
ASSERT_EQ(0, ioctx.application_metadata_list("app1", &meta));
ASSERT_EQ(expected_meta, meta);
}
+
+TEST_F(LibRadosMiscECPP, CompareExtentRange) {
+ bufferlist bl1;
+ bl1.append("ceph");
+ ObjectWriteOperation write;
+ write.write(0, bl1);
+ ASSERT_EQ(0, ioctx.operate("foo", &write));
+
+ bufferlist bl2;
+ bl2.append("ph");
+ bl2.append(std::string(2, '\0'));
+ ObjectReadOperation read1;
+ read1.cmpext(2, bl2, nullptr);
+ ASSERT_EQ(0, ioctx.operate("foo", &read1, nullptr));
+
+ bufferlist bl3;
+ bl3.append(std::string(4, '\0'));
+ ObjectReadOperation read2;
+ read2.cmpext(2097152, bl3, nullptr);
+ ASSERT_EQ(0, ioctx.operate("foo", &read2, nullptr));
+}
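
The new test depends on cmpext comparing the supplied buffer against the object's data at the given offset, with reads past the end of the object comparing as zero bytes; that is why "ph\0\0" matches at offset 2 and four zero bytes match at offset 2097152 of the 4-byte object. A minimal Python model of that comparison; the -MAX_ERRNO - offset mismatch encoding follows the librados cmpext documentation and should be treated as an assumption here:

    MAX_ERRNO = 4095  # assumed constant; mismatch returns -MAX_ERRNO - offset

    # Compare cmp_buf against object data at the given offset; bytes past
    # the end of the object compare as zeros. Illustrative model only.
    def cmpext(object_data, offset, cmp_buf):
        for i, want in enumerate(cmp_buf):
            pos = offset + i
            have = object_data[pos] if pos < len(object_data) else 0
            if have != want:
                return -MAX_ERRNO - i  # first mismatching byte in cmp_buf
        return 0

    data = b"ceph"
    assert cmpext(data, 2, b"ph\x00\x00") == 0              # first read op
    assert cmpext(data, 2097152, b"\x00\x00\x00\x00") == 0  # second read op
    assert cmpext(data, 0, b"x") == -MAX_ERRNO
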
operation/test_mock_SnapshotRemoveRequest.cc
operation/test_mock_SnapshotRollbackRequest.cc
operation/test_mock_SnapshotUnprotectRequest.cc
+ operation/test_mock_TrimRequest.cc
watcher/test_mock_RewatchRequest.cc
)
add_executable(unittest_librbd
cls::rbd::SnapshotNamespace *out_snap_namespace));
MOCK_CONST_METHOD2(get_parent_spec, int(librados::snap_t in_snap_id,
ParentSpec *pspec));
+ MOCK_CONST_METHOD2(get_parent_overlap, int(librados::snap_t in_snap_id,
+ uint64_t *overlap));
MOCK_CONST_METHOD2(is_snap_protected, int(librados::snap_t in_snap_id,
bool *is_protected));
MOCK_METHOD8(write_to_cache, void(object_t, const bufferlist&, size_t,
uint64_t, Context *, int, uint64_t, ZTracer::Trace *));
+ MOCK_CONST_METHOD0(get_stripe_count, uint64_t());
+ MOCK_CONST_METHOD0(get_stripe_period, uint64_t());
+
ImageCtx *image_ctx;
CephContext *cct;
PerfCounters *perfcounter;
MOCK_METHOD3(aio_resize, void(uint64_t new_size, uint8_t default_object_state,
Context *on_finish));
- template <typename T, void(T::*MF)(int)>
+ template <typename T, void(T::*MF)(int) = &T::complete>
bool aio_update(uint64_t snap_id, uint64_t start_object_no, uint8_t new_state,
const boost::optional<uint8_t> ¤t_state,
const ZTracer::Trace &parent_trace, T *callback_object) {
callback_object);
}
- template <typename T, void(T::*MF)(int)>
+ template <typename T, void(T::*MF)(int) = &T::complete>
bool aio_update(uint64_t snap_id, uint64_t start_object_no,
uint64_t end_object_no, uint8_t new_state,
const boost::optional<uint8_t> ¤t_state,
const ZTracer::Trace &parent_trace, T *callback_object) {
- return aio_update(snap_id, start_object_no, end_object_no, new_state,
- current_state, parent_trace,
- util::create_context_callback<T, MF>(callback_object));
+ auto ctx = util::create_context_callback<T, MF>(callback_object);
+ bool updated = aio_update(snap_id, start_object_no, end_object_no,
+ new_state, current_state, parent_trace, ctx);
+ if (!updated) {
+ delete ctx;
+ }
+ return updated;
}
MOCK_METHOD7(aio_update, bool(uint64_t snap_id, uint64_t start_object_no,
uint64_t end_object_no, uint8_t new_state,
const boost::optional<uint8_t> ¤t_state,
const ZTracer::Trace &parent_trace,
Context *on_finish));
+
MOCK_METHOD2(snapshot_add, void(uint64_t snap_id, Context *on_finish));
MOCK_METHOD2(snapshot_remove, void(uint64_t snap_id, Context *on_finish));
MOCK_METHOD2(rollback, void(uint64_t snap_id, Context *on_finish));
+
+ MOCK_CONST_METHOD1(object_may_exist, bool(uint64_t));
+
};
} // namespace librbd
using ::testing::_;
using ::testing::DoDefault;
+using ::testing::InSequence;
using ::testing::Return;
using ::testing::StrEq;
class TestMockObjectMapUpdateRequest : public TestMockFixture {
public:
- void expect_update(librbd::ImageCtx *ictx, uint64_t snap_id, int r) {
+ void expect_update(librbd::ImageCtx *ictx, uint64_t snap_id,
+ uint64_t start_object_no, uint64_t end_object_no,
+ uint8_t new_state,
+ const boost::optional<uint8_t>& current_state, int r) {
+ bufferlist bl;
+ ::encode(start_object_no, bl);
+ ::encode(end_object_no, bl);
+ ::encode(new_state, bl);
+ ::encode(current_state, bl);
+
std::string oid(ObjectMap<>::object_map_name(ictx->id, snap_id));
if (snap_id == CEPH_NOSNAP) {
EXPECT_CALL(get_mock_io_ctx(ictx->md_ctx),
if (r < 0) {
EXPECT_CALL(get_mock_io_ctx(ictx->md_ctx),
- exec(oid, _, StrEq("rbd"), StrEq("object_map_update"), _, _, _))
+ exec(oid, _, StrEq("rbd"), StrEq("object_map_update"),
+ ContentsEqual(bl), _, _))
.WillOnce(Return(r));
} else {
EXPECT_CALL(get_mock_io_ctx(ictx->md_ctx),
- exec(oid, _, StrEq("rbd"), StrEq("object_map_update"), _, _, _))
+ exec(oid, _, StrEq("rbd"), StrEq("object_map_update"),
+ ContentsEqual(bl), _, _))
.WillOnce(DoDefault());
}
}
ASSERT_EQ(0, open_image(m_image_name, &ictx));
ASSERT_EQ(0, acquire_exclusive_lock(*ictx));
- expect_update(ictx, CEPH_NOSNAP, 0);
+ expect_update(ictx, CEPH_NOSNAP, 0, 1, OBJECT_NONEXISTENT, OBJECT_EXISTS, 0);
ceph::BitVector<2> object_map;
object_map.resize(1);
"snap1"));
uint64_t snap_id = ictx->snap_id;
- expect_update(ictx, snap_id, 0);
+ expect_update(ictx, snap_id, 0, 1, OBJECT_NONEXISTENT, OBJECT_EXISTS, 0);
ceph::BitVector<2> object_map;
object_map.resize(1);
ASSERT_EQ(0, open_image(m_image_name, &ictx));
ASSERT_EQ(0, acquire_exclusive_lock(*ictx));
- expect_update(ictx, CEPH_NOSNAP, -EINVAL);
+ expect_update(ictx, CEPH_NOSNAP, 0, 1, OBJECT_NONEXISTENT, OBJECT_EXISTS,
+ -EINVAL);
expect_invalidate(ictx);
ceph::BitVector<2> object_map;
ASSERT_EQ(CEPH_NOSNAP, ictx->snap_id);
uint64_t snap_id = ictx->snap_info.rbegin()->first;
- expect_update(ictx, snap_id, 0);
+ expect_update(ictx, snap_id, 0, 1, OBJECT_EXISTS_CLEAN,
+ boost::optional<uint8_t>(), 0);
expect_unlock_exclusive_lock(*ictx);
ceph::BitVector<2> object_map;
ASSERT_NE(OBJECT_EXISTS_CLEAN, object_map[0]);
}
+TEST_F(TestMockObjectMapUpdateRequest, BatchUpdate) {
+ REQUIRE_FEATURE(RBD_FEATURE_OBJECT_MAP);
+
+ librbd::ImageCtx *ictx;
+ ASSERT_EQ(0, open_image(m_image_name, &ictx));
+
+ librbd::NoOpProgressContext no_progress;
+ ASSERT_EQ(0, ictx->operations->resize(712312 * ictx->get_object_size(), false,
+ no_progress));
+ ASSERT_EQ(0, acquire_exclusive_lock(*ictx));
+
+ InSequence seq;
+ expect_update(ictx, CEPH_NOSNAP, 0, 262144, OBJECT_NONEXISTENT, OBJECT_EXISTS,
+ 0);
+ expect_update(ictx, CEPH_NOSNAP, 262144, 524288, OBJECT_NONEXISTENT,
+ OBJECT_EXISTS, 0);
+ expect_update(ictx, CEPH_NOSNAP, 524288, 712312, OBJECT_NONEXISTENT,
+ OBJECT_EXISTS, 0);
+ expect_unlock_exclusive_lock(*ictx);
+
+ ceph::BitVector<2> object_map;
+ object_map.resize(712312);
+
+ C_SaferCond cond_ctx;
+ AsyncRequest<> *req = new UpdateRequest<>(
+ *ictx, &object_map, CEPH_NOSNAP, 0, object_map.size(), OBJECT_NONEXISTENT,
+ OBJECT_EXISTS, {}, &cond_ctx);
+ {
+ RWLock::RLocker snap_locker(ictx->snap_lock);
+ RWLock::WLocker object_map_locker(ictx->object_map_lock);
+ req->send();
+ }
+ ASSERT_EQ(0, cond_ctx.wait());
+}
+
} // namespace object_map
} // namespace librbd
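
The BatchUpdate test above expects the 712312-entry update to be issued in three chunks: [0, 262144), [262144, 524288), [524288, 712312). A small Python sketch of that chunking; the 262144 batch size is inferred from the expected calls, not from a named constant:

    # Split a large object-map update into fixed-size batches, matching
    # the ranges the BatchUpdate test expects. The batch size is an
    # assumption read off the expected calls above.
    def batch_ranges(num_objects, batch_size=262144):
        return [(start, min(start + batch_size, num_objects))
                for start in range(0, num_objects, batch_size)]

    assert batch_ranges(712312) == [(0, 262144), (262144, 524288),
                                    (524288, 712312)]
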
}
void expect_allocate_snap_id(MockImageCtx &mock_image_ctx, int r) {
- auto &expect = EXPECT_CALL(get_mock_io_ctx(mock_image_ctx.md_ctx),
+ auto &expect = EXPECT_CALL(get_mock_io_ctx(mock_image_ctx.data_ctx),
selfmanaged_snap_create(_));
if (r < 0 && r != -ESTALE) {
expect.WillOnce(Return(r));
}
void expect_release_snap_id(MockImageCtx &mock_image_ctx, int r) {
- auto &expect = EXPECT_CALL(get_mock_io_ctx(mock_image_ctx.md_ctx),
+ auto &expect = EXPECT_CALL(get_mock_io_ctx(mock_image_ctx.data_ctx),
selfmanaged_snap_remove(_));
if (r < 0) {
expect.WillOnce(Return(r));
}
void expect_release_snap_id(MockImageCtx &mock_image_ctx) {
- EXPECT_CALL(get_mock_io_ctx(mock_image_ctx.md_ctx),
+ EXPECT_CALL(get_mock_io_ctx(mock_image_ctx.data_ctx),
selfmanaged_snap_remove(_))
.WillOnce(DoDefault());
}
--- /dev/null
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#include "test/librbd/test_mock_fixture.h"
+#include "test/librbd/test_support.h"
+#include "test/librbd/mock/MockImageCtx.h"
+#include "test/librados_test_stub/MockTestMemIoCtxImpl.h"
+#include "common/bit_vector.hpp"
+#include "librbd/AsyncRequest.h"
+#include "librbd/internal.h"
+#include "librbd/ObjectMap.h"
+#include "librbd/Utils.h"
+#include "librbd/io/ObjectRequest.h"
+#include "librbd/operation/TrimRequest.h"
+#include "gmock/gmock.h"
+#include "gtest/gtest.h"
+
+namespace librbd {
+namespace {
+
+struct MockTestImageCtx : public MockImageCtx {
+ MockTestImageCtx(ImageCtx &image_ctx) : MockImageCtx(image_ctx) {
+ }
+};
+
+} // anonymous namespace
+
+template<>
+struct AsyncRequest<librbd::MockTestImageCtx> {
+ librbd::MockTestImageCtx& m_image_ctx;
+ Context *on_finish;
+
+ AsyncRequest(librbd::MockTestImageCtx& image_ctx, Context* on_finish)
+ : m_image_ctx(image_ctx), on_finish(on_finish) {
+ }
+ virtual ~AsyncRequest() {
+ }
+
+ Context* create_callback_context() {
+ return util::create_context_callback(this);
+ }
+
+ Context* create_async_callback_context() {
+ return util::create_context_callback<AsyncRequest,
+ &AsyncRequest::async_complete>(this);
+ }
+
+ void complete(int r) {
+ if (should_complete(r)) {
+ async_complete(r);
+ }
+ }
+
+ void async_complete(int r) {
+ on_finish->complete(r);
+ }
+
+ bool is_canceled() const {
+ return false;
+ }
+
+ virtual void send() = 0;
+ virtual bool should_complete(int r) = 0;
+};
+
+namespace io {
+
+template <>
+struct ObjectRequest<librbd::MockTestImageCtx> : public ObjectRequestHandle {
+ static ObjectRequest* s_instance;
+ Context *on_finish = nullptr;
+
+ static ObjectRequest* create_truncate(librbd::MockTestImageCtx *ictx,
+ const std::string &oid,
+ uint64_t object_no,
+ uint64_t object_off,
+ const ::SnapContext &snapc,
+ const ZTracer::Trace &parent_trace,
+ Context *completion) {
+ assert(s_instance != nullptr);
+ s_instance->on_finish = completion;
+ s_instance->construct_truncate();
+ return s_instance;
+ }
+
+ static ObjectRequest* create_trim(librbd::MockTestImageCtx *ictx,
+ const std::string &oid,
+ uint64_t object_no,
+ const ::SnapContext &snapc,
+ bool post_object_map_update,
+ Context *completion) {
+ assert(s_instance != nullptr);
+ s_instance->on_finish = completion;
+ s_instance->construct_trim();
+ return s_instance;
+ }
+
+ ObjectRequest() {
+ s_instance = this;
+ }
+
+ MOCK_METHOD0(construct_truncate, void());
+ MOCK_METHOD0(construct_trim, void());
+ MOCK_METHOD0(send, void());
+ MOCK_METHOD1(complete, void(int));
+};
+
+ObjectRequest<librbd::MockTestImageCtx>* ObjectRequest<librbd::MockTestImageCtx>::s_instance = nullptr;
+
+} // namespace io
+} // namespace librbd
+
+// template definitions
+#include "librbd/AsyncObjectThrottle.cc"
+#include "librbd/operation/TrimRequest.cc"
+
+namespace librbd {
+namespace operation {
+
+using ::testing::_;
+using ::testing::DoAll;
+using ::testing::InSequence;
+using ::testing::Invoke;
+using ::testing::Return;
+using ::testing::StrEq;
+using ::testing::WithArg;
+
+class TestMockOperationTrimRequest : public TestMockFixture {
+public:
+ typedef TrimRequest<MockTestImageCtx> MockTrimRequest;
+ typedef librbd::io::ObjectRequest<MockTestImageCtx> MockObjectRequest;
+
+ int create_snapshot(const char *snap_name) {
+ librbd::ImageCtx *ictx;
+ int r = open_image(m_image_name, &ictx);
+ if (r < 0) {
+ return r;
+ }
+
+ r = snap_create(*ictx, snap_name);
+ if (r < 0) {
+ return r;
+ }
+
+ r = snap_protect(*ictx, snap_name);
+ if (r < 0) {
+ return r;
+ }
+ close_image(ictx);
+ return 0;
+ }
+
+ void expect_is_lock_owner(MockTestImageCtx &mock_image_ctx) {
+ if (mock_image_ctx.exclusive_lock != nullptr) {
+ EXPECT_CALL(*mock_image_ctx.exclusive_lock, is_lock_owner())
+ .WillRepeatedly(Return(true));
+ }
+ }
+
+ void expect_object_map_update(MockTestImageCtx &mock_image_ctx,
+ uint64_t start_object, uint64_t end_object,
+ uint8_t state, uint8_t current_state,
+ bool updated, int ret_val) {
+ if (mock_image_ctx.object_map != nullptr) {
+ EXPECT_CALL(*mock_image_ctx.object_map,
+ aio_update(CEPH_NOSNAP, start_object, end_object, state,
+ boost::optional<uint8_t>(current_state), _, _))
+ .WillOnce(WithArg<6>(Invoke([&mock_image_ctx, updated, ret_val](Context *ctx) {
+ if (updated) {
+ mock_image_ctx.op_work_queue->queue(ctx, ret_val);
+ }
+ return updated;
+ })));
+ }
+ }
+
+ void expect_get_parent_overlap(MockTestImageCtx &mock_image_ctx,
+ uint64_t overlap) {
+ EXPECT_CALL(mock_image_ctx, get_parent_overlap(CEPH_NOSNAP, _))
+ .WillOnce(WithArg<1>(Invoke([overlap](uint64_t *o) {
+ *o = overlap;
+ return 0;
+ })));
+ }
+
+ void expect_object_may_exist(MockTestImageCtx &mock_image_ctx,
+ uint64_t object_no, bool exists) {
+ if (mock_image_ctx.object_map != nullptr) {
+ EXPECT_CALL(*mock_image_ctx.object_map, object_may_exist(object_no))
+ .WillOnce(Return(exists));
+ }
+ }
+
+ void expect_get_object_name(MockTestImageCtx &mock_image_ctx,
+ uint64_t object_no, const std::string& oid) {
+ EXPECT_CALL(mock_image_ctx, get_object_name(object_no))
+ .WillOnce(Return(oid));
+ }
+
+ void expect_aio_remove(MockTestImageCtx &mock_image_ctx,
+ const std::string& oid, int ret_val) {
+ EXPECT_CALL(get_mock_io_ctx(mock_image_ctx.data_ctx), remove(oid, _))
+ .WillOnce(Return(ret_val));
+ }
+
+ void expect_object_trim(MockImageCtx &mock_image_ctx,
+ MockObjectRequest &mock_object_request, int ret_val) {
+ EXPECT_CALL(mock_object_request, construct_trim());
+ EXPECT_CALL(mock_object_request, send())
+ .WillOnce(Invoke([&mock_image_ctx, &mock_object_request, ret_val]() {
+ mock_image_ctx.op_work_queue->queue(mock_object_request.on_finish, ret_val);
+ }));
+ }
+
+ void expect_object_truncate(MockImageCtx &mock_image_ctx,
+ MockObjectRequest &mock_object_request,
+ int ret_val) {
+ EXPECT_CALL(mock_object_request, construct_truncate());
+ EXPECT_CALL(mock_object_request, send())
+ .WillOnce(Invoke([&mock_image_ctx, &mock_object_request, ret_val]() {
+ mock_image_ctx.op_work_queue->queue(mock_object_request.on_finish, ret_val);
+ }));
+ }
+};
+
+TEST_F(TestMockOperationTrimRequest, SuccessRemove) {
+ librbd::ImageCtx *ictx;
+ ASSERT_EQ(0, open_image(m_image_name, &ictx));
+
+ MockTestImageCtx mock_image_ctx(*ictx);
+ MockExclusiveLock mock_exclusive_lock;
+ MockJournal mock_journal;
+ MockObjectMap mock_object_map;
+ initialize_features(ictx, mock_image_ctx, mock_exclusive_lock, mock_journal,
+ mock_object_map);
+ expect_op_work_queue(mock_image_ctx);
+ expect_is_lock_owner(mock_image_ctx);
+
+ InSequence seq;
+ EXPECT_CALL(mock_image_ctx, get_stripe_period()).WillOnce(Return(ictx->get_object_size()));
+ EXPECT_CALL(mock_image_ctx, get_stripe_count()).WillOnce(Return(ictx->get_stripe_count()));
+
+ // pre
+ expect_object_map_update(mock_image_ctx, 0, 1, OBJECT_PENDING, OBJECT_EXISTS,
+ true, 0);
+
+ // copy-up
+ expect_get_parent_overlap(mock_image_ctx, 0);
+
+ // remove
+ expect_object_may_exist(mock_image_ctx, 0, true);
+ expect_get_object_name(mock_image_ctx, 0, "object0");
+ expect_aio_remove(mock_image_ctx, "object0", 0);
+
+ // post
+ expect_object_map_update(mock_image_ctx, 0, 1, OBJECT_NONEXISTENT,
+ OBJECT_PENDING, true, 0);
+
+ C_SaferCond cond_ctx;
+ librbd::NoOpProgressContext progress_ctx;
+ MockTrimRequest *req = new MockTrimRequest(
+ mock_image_ctx, &cond_ctx, m_image_size, 0, progress_ctx);
+ {
+ RWLock::RLocker owner_locker(mock_image_ctx.owner_lock);
+ req->send();
+ }
+ ASSERT_EQ(0, cond_ctx.wait());
+}
+
+TEST_F(TestMockOperationTrimRequest, SuccessCopyUp) {
+ REQUIRE_FEATURE(RBD_FEATURE_LAYERING)
+ ASSERT_EQ(0, create_snapshot("snap1"));
+
+ int order = 22;
+ uint64_t features;
+ ASSERT_TRUE(::get_features(&features));
+ std::string clone_name = get_temp_image_name();
+ ASSERT_EQ(0, librbd::clone(m_ioctx, m_image_name.c_str(), "snap1", m_ioctx,
+ clone_name.c_str(), features, &order, 0, 0));
+
+ librbd::ImageCtx *ictx;
+ ASSERT_EQ(0, open_image(clone_name, &ictx));
+ ASSERT_EQ(0, snap_create(*ictx, "snap"));
+
+ MockTestImageCtx mock_image_ctx(*ictx);
+ MockExclusiveLock mock_exclusive_lock;
+ MockJournal mock_journal;
+ MockObjectMap mock_object_map;
+ initialize_features(ictx, mock_image_ctx, mock_exclusive_lock, mock_journal,
+ mock_object_map);
+ expect_op_work_queue(mock_image_ctx);
+ expect_is_lock_owner(mock_image_ctx);
+
+ InSequence seq;
+ EXPECT_CALL(mock_image_ctx, get_stripe_period()).WillOnce(Return(ictx->get_object_size()));
+ EXPECT_CALL(mock_image_ctx, get_stripe_count()).WillOnce(Return(ictx->get_stripe_count()));
+
+ // pre
+ expect_object_map_update(mock_image_ctx, 0, 2, OBJECT_PENDING, OBJECT_EXISTS,
+ true, 0);
+
+ // copy-up
+ expect_get_parent_overlap(mock_image_ctx, ictx->get_object_size());
+ expect_get_object_name(mock_image_ctx, 0, "object0");
+
+ MockObjectRequest mock_object_request;
+ expect_object_trim(mock_image_ctx, mock_object_request, 0);
+
+ // remove
+ expect_object_may_exist(mock_image_ctx, 1, true);
+ expect_get_object_name(mock_image_ctx, 1, "object1");
+ expect_aio_remove(mock_image_ctx, "object1", 0);
+
+ // post
+ expect_object_map_update(mock_image_ctx, 0, 2, OBJECT_NONEXISTENT,
+ OBJECT_PENDING, true, 0);
+
+ C_SaferCond cond_ctx;
+ librbd::NoOpProgressContext progress_ctx;
+ MockTrimRequest *req = new MockTrimRequest(
+ mock_image_ctx, &cond_ctx, 2 * ictx->get_object_size(), 0, progress_ctx);
+ {
+ RWLock::RLocker owner_locker(mock_image_ctx.owner_lock);
+ req->send();
+ }
+ ASSERT_EQ(0, cond_ctx.wait());
+}
+
+TEST_F(TestMockOperationTrimRequest, SuccessBoundary) {
+ librbd::ImageCtx *ictx;
+ ASSERT_EQ(0, open_image(m_image_name, &ictx));
+
+ MockTestImageCtx mock_image_ctx(*ictx);
+ MockExclusiveLock mock_exclusive_lock;
+ MockJournal mock_journal;
+ MockObjectMap mock_object_map;
+ initialize_features(ictx, mock_image_ctx, mock_exclusive_lock, mock_journal,
+ mock_object_map);
+ expect_op_work_queue(mock_image_ctx);
+ expect_is_lock_owner(mock_image_ctx);
+
+ InSequence seq;
+ EXPECT_CALL(mock_image_ctx, get_stripe_period()).WillOnce(Return(ictx->get_object_size()));
+ EXPECT_CALL(mock_image_ctx, get_stripe_count()).WillOnce(Return(ictx->get_stripe_count()));
+
+ // boundary
+ MockObjectRequest mock_object_request;
+ expect_object_truncate(mock_image_ctx, mock_object_request, 0);
+
+ C_SaferCond cond_ctx;
+ librbd::NoOpProgressContext progress_ctx;
+ MockTrimRequest *req = new MockTrimRequest(
+ mock_image_ctx, &cond_ctx, ictx->get_object_size(), 1, progress_ctx);
+ {
+ RWLock::RLocker owner_locker(mock_image_ctx.owner_lock);
+ req->send();
+ }
+ ASSERT_EQ(0, cond_ctx.wait());
+}
+
+TEST_F(TestMockOperationTrimRequest, SuccessNoOp) {
+ librbd::ImageCtx *ictx;
+ ASSERT_EQ(0, open_image(m_image_name, &ictx));
+
+ MockTestImageCtx mock_image_ctx(*ictx);
+ MockExclusiveLock mock_exclusive_lock;
+ MockJournal mock_journal;
+ MockObjectMap mock_object_map;
+ initialize_features(ictx, mock_image_ctx, mock_exclusive_lock, mock_journal,
+ mock_object_map);
+}
+
+TEST_F(TestMockOperationTrimRequest, RemoveError) {
+ librbd::ImageCtx *ictx;
+ ASSERT_EQ(0, open_image(m_image_name, &ictx));
+
+ MockTestImageCtx mock_image_ctx(*ictx);
+ MockExclusiveLock mock_exclusive_lock;
+ MockJournal mock_journal;
+ MockObjectMap mock_object_map;
+ initialize_features(ictx, mock_image_ctx, mock_exclusive_lock, mock_journal,
+ mock_object_map);
+ expect_op_work_queue(mock_image_ctx);
+ expect_is_lock_owner(mock_image_ctx);
+
+ InSequence seq;
+ EXPECT_CALL(mock_image_ctx, get_stripe_period()).WillOnce(Return(ictx->get_object_size()));
+ EXPECT_CALL(mock_image_ctx, get_stripe_count()).WillOnce(Return(ictx->get_stripe_count()));
+
+ // pre
+ expect_object_map_update(mock_image_ctx, 0, 1, OBJECT_PENDING, OBJECT_EXISTS,
+ false, 0);
+
+ // copy-up
+ expect_get_parent_overlap(mock_image_ctx, 0);
+
+ // remove
+ expect_object_may_exist(mock_image_ctx, 0, true);
+ expect_get_object_name(mock_image_ctx, 0, "object0");
+ expect_aio_remove(mock_image_ctx, "object0", -EPERM);
+
+ C_SaferCond cond_ctx;
+ librbd::NoOpProgressContext progress_ctx;
+ MockTrimRequest *req = new MockTrimRequest(
+ mock_image_ctx, &cond_ctx, m_image_size, 0, progress_ctx);
+ {
+ RWLock::RLocker owner_locker(mock_image_ctx.owner_lock);
+ req->send();
+ }
+ ASSERT_EQ(-EPERM, cond_ctx.wait());
+}
+
+TEST_F(TestMockOperationTrimRequest, CopyUpError) {
+ REQUIRE_FEATURE(RBD_FEATURE_LAYERING)
+ ASSERT_EQ(0, create_snapshot("snap1"));
+
+ int order = 22;
+ uint64_t features;
+ ASSERT_TRUE(::get_features(&features));
+ std::string clone_name = get_temp_image_name();
+ ASSERT_EQ(0, librbd::clone(m_ioctx, m_image_name.c_str(), "snap1", m_ioctx,
+ clone_name.c_str(), features, &order, 0, 0));
+
+ librbd::ImageCtx *ictx;
+ ASSERT_EQ(0, open_image(clone_name, &ictx));
+ ASSERT_EQ(0, snap_create(*ictx, "snap"));
+
+ MockTestImageCtx mock_image_ctx(*ictx);
+ MockExclusiveLock mock_exclusive_lock;
+ MockJournal mock_journal;
+ MockObjectMap mock_object_map;
+ initialize_features(ictx, mock_image_ctx, mock_exclusive_lock, mock_journal,
+ mock_object_map);
+ expect_op_work_queue(mock_image_ctx);
+ expect_is_lock_owner(mock_image_ctx);
+
+ InSequence seq;
+ EXPECT_CALL(mock_image_ctx, get_stripe_period()).WillOnce(Return(ictx->get_object_size()));
+ EXPECT_CALL(mock_image_ctx, get_stripe_count()).WillOnce(Return(ictx->get_stripe_count()));
+
+ // pre
+ expect_object_map_update(mock_image_ctx, 0, 2, OBJECT_PENDING, OBJECT_EXISTS,
+ false, 0);
+
+ // copy-up
+ expect_get_parent_overlap(mock_image_ctx, ictx->get_object_size());
+ expect_get_object_name(mock_image_ctx, 0, "object0");
+
+ MockObjectRequest mock_object_request;
+ expect_object_trim(mock_image_ctx, mock_object_request, -EINVAL);
+
+ C_SaferCond cond_ctx;
+ librbd::NoOpProgressContext progress_ctx;
+ MockTrimRequest *req = new MockTrimRequest(
+ mock_image_ctx, &cond_ctx, 2 * ictx->get_object_size(), 0, progress_ctx);
+ {
+ RWLock::RLocker owner_locker(mock_image_ctx.owner_lock);
+ req->send();
+ }
+ ASSERT_EQ(-EINVAL, cond_ctx.wait());
+}
+
+TEST_F(TestMockOperationTrimRequest, BoundaryError) {
+ librbd::ImageCtx *ictx;
+ ASSERT_EQ(0, open_image(m_image_name, &ictx));
+
+ MockTestImageCtx mock_image_ctx(*ictx);
+ MockExclusiveLock mock_exclusive_lock;
+ MockJournal mock_journal;
+ MockObjectMap mock_object_map;
+ initialize_features(ictx, mock_image_ctx, mock_exclusive_lock, mock_journal,
+ mock_object_map);
+ expect_op_work_queue(mock_image_ctx);
+ expect_is_lock_owner(mock_image_ctx);
+
+ InSequence seq;
+ EXPECT_CALL(mock_image_ctx, get_stripe_period()).WillOnce(Return(ictx->get_object_size()));
+ EXPECT_CALL(mock_image_ctx, get_stripe_count()).WillOnce(Return(ictx->get_stripe_count()));
+
+ // boundary
+ MockObjectRequest mock_object_request;
+ expect_object_truncate(mock_image_ctx, mock_object_request, -EINVAL);
+
+ C_SaferCond cond_ctx;
+ librbd::NoOpProgressContext progress_ctx;
+ MockTrimRequest *req = new MockTrimRequest(
+ mock_image_ctx, &cond_ctx, ictx->get_object_size(), 1, progress_ctx);
+ {
+ RWLock::RLocker owner_locker(mock_image_ctx.owner_lock);
+ req->send();
+ }
+ ASSERT_EQ(-EINVAL, cond_ctx.wait());
+}
+
+} // namespace operation
+} // namespace librbd
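
Taken together, the cases above walk TrimRequest through its phases: mark the affected object-map entries OBJECT_PENDING, issue trim/truncate object requests for objects still covered by the parent overlap (so copy-up remains possible), remove objects beyond the overlap outright, then mark the entries OBJECT_NONEXISTENT. A minimal Python sketch of how the parent overlap splits a trimmed range into the copy-up and plain-remove sets, assuming the one-object-per-stripe-period layout the mocks use (not the TrimRequest implementation):

    # Split the trimmed object range into "copy-up" (still under the
    # parent overlap, trimmed via an object request) and "plain remove".
    # Assumes one object per stripe period, as in the mocked tests above.
    def plan_trim(new_size, image_size, object_size, parent_overlap):
        first = (new_size + object_size - 1) // object_size   # first whole object
        last = (image_size + object_size - 1) // object_size  # one past the end
        overlap_end = (parent_overlap + object_size - 1) // object_size
        copy_up = range(first, min(last, max(first, overlap_end)))
        remove = range(max(first, overlap_end), last)
        return list(copy_up), list(remove)

    object_size = 1 << 22
    # SuccessCopyUp: 2-object clone, parent overlap covers object 0
    assert plan_trim(0, 2 * object_size, object_size, object_size) == ([0], [1])
    # SuccessRemove: no parent overlap, the lone object is removed outright
    assert plan_trim(0, object_size, object_size, 0) == ([], [0])
    # SuccessBoundary: a partial boundary object is truncated, not removed
    assert plan_trim(1, object_size, object_size, 0) == ([], [])
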
}
TEST(LibRGW, MOUNT) {
- int ret = rgw_mount(rgw, uid.c_str(), access_key.c_str(), secret_key.c_str(),
- &fs, RGW_MOUNT_FLAG_NONE);
+ int ret = rgw_mount2(rgw, uid.c_str(), access_key.c_str(), secret_key.c_str(),
+ "/", &fs, RGW_MOUNT_FLAG_NONE);
ASSERT_EQ(ret, 0);
ASSERT_NE(fs, nullptr);
}
}
TEST(LibRGW, MOUNT) {
- int ret = rgw_mount(rgw, userid.c_str(), access_key.c_str(),
- secret_key.c_str(), &fs, RGW_MOUNT_FLAG_NONE);
+ int ret = rgw_mount2(rgw, userid.c_str(), access_key.c_str(),
+ secret_key.c_str(), "/", &fs, RGW_MOUNT_FLAG_NONE);
ASSERT_EQ(ret, 0);
ASSERT_NE(fs, nullptr);
}
}
TEST(LibRGW, MOUNT) {
- int ret = rgw_mount(rgw, userid.c_str(), access_key.c_str(),
- secret_key.c_str(), &fs, RGW_MOUNT_FLAG_NONE);
+ int ret = rgw_mount2(rgw, userid.c_str(), access_key.c_str(),
+ secret_key.c_str(), "/", &fs, RGW_MOUNT_FLAG_NONE);
ASSERT_EQ(ret, 0);
ASSERT_NE(fs, nullptr);
}
}
TEST(LibRGW, MOUNT) {
- int ret = rgw_mount(rgw, uid.c_str(), access_key.c_str(), secret_key.c_str(),
- &fs, RGW_MOUNT_FLAG_NONE);
+ int ret = rgw_mount2(rgw, uid.c_str(), access_key.c_str(), secret_key.c_str(),
+ "/", &fs, RGW_MOUNT_FLAG_NONE);
ASSERT_EQ(ret, 0);
ASSERT_NE(fs, nullptr);
}
--- /dev/null
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2015 Red Hat, Inc.
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+
+#include <stdint.h>
+#include <tuple>
+#include <iostream>
+#include <fstream>
+#include <stack>
+
+#include "include/rados/librgw.h"
+#include "include/rados/rgw_file.h"
+#include "rgw/rgw_file.h"
+#include "rgw/rgw_lib_frontend.h" // direct requests
+
+#include "gtest/gtest.h"
+#include "common/backport14.h"
+#include "common/ceph_argparse.h"
+#include "common/debug.h"
+#include "global/global_init.h"
+#include "include/assert.h"
+
+#define dout_subsys ceph_subsys_rgw
+
+namespace {
+
+ using namespace rgw;
+ using std::get;
+ using std::string;
+
+ librgw_t rgw_h = nullptr;
+ string userid("testuser");
+ string access_key("");
+ string secret_key("");
+ struct rgw_fs *fs = nullptr;
+ CephContext* cct = nullptr;
+
+ uint32_t owner_uid = 867;
+ uint32_t owner_gid = 5309;
+
+ uint32_t create_mask = RGW_SETATTR_UID | RGW_SETATTR_GID | RGW_SETATTR_MODE;
+
+ string bucket_name("nfsroot");
+
+ class obj_rec
+ {
+ public:
+ string name;
+ struct rgw_file_handle* fh;
+ struct rgw_file_handle* parent_fh;
+ RGWFileHandle* rgw_fh; // alias into fh
+
+ struct state {
+ bool readdir;
+ state() : readdir(false) {}
+ } state;
+
+ obj_rec(string _name, struct rgw_file_handle* _fh,
+ struct rgw_file_handle* _parent_fh, RGWFileHandle* _rgw_fh)
+ : name(std::move(_name)), fh(_fh), parent_fh(_parent_fh),
+ rgw_fh(_rgw_fh) {}
+
+ void clear() {
+ fh = nullptr;
+ rgw_fh = nullptr;
+ }
+
+ void sync() {
+ if (fh)
+ rgw_fh = get_rgwfh(fh);
+ }
+
+ friend ostream& operator<<(ostream& os, const obj_rec& rec);
+ };
+
+ ostream& operator<<(ostream& os, const obj_rec& rec)
+ {
+ RGWFileHandle* rgw_fh = rec.rgw_fh;
+ if (rgw_fh) {
+ const char* type = rgw_fh->is_dir() ? "DIR " : "FILE ";
+ os << rec.rgw_fh->full_object_name()
+ << " (" << rec.rgw_fh->object_name() << "): "
+ << type;
+ }
+ return os;
+ }
+
+ std::stack<obj_rec> obj_stack;
+ std::deque<obj_rec> cleanup_queue;
+
+ typedef std::vector<obj_rec> obj_vec;
+ typedef std::tuple<obj_rec, obj_vec> dirs1_rec;
+ typedef std::vector<dirs1_rec> dirs1_vec;
+
+ dirs1_vec dirs_vec;
+
+ struct obj_rec_st
+ {
+ const obj_rec& obj;
+ const struct stat& st;
+
+ obj_rec_st(const obj_rec& _obj, const struct stat& _st)
+ : obj(_obj), st(_st) {}
+ };
+
+ ostream& operator<<(ostream& os, const obj_rec_st& rec)
+ {
+ RGWFileHandle* rgw_fh = rec.obj.rgw_fh;
+ if (rgw_fh) {
+ const char* type = rgw_fh->is_dir() ? "DIR " : "FILE ";
+ os << rgw_fh->full_object_name()
+ << " (" << rgw_fh->object_name() << "): "
+ << type;
+ const struct stat& st = rec.st;
+ switch(uint8_t(rgw_fh->is_dir())) {
+ case 1:
+ os << " mode: " << st.st_mode;
+ os << " nlinks: " << st.st_nlink;
+ break;
+ case 0:
+ default:
+ os << " mode: " << st.st_mode;
+ os << " size: " << st.st_size;
+ // xxx
+ break;
+ }
+ }
+ return os;
+ }
+
+ bool do_marker1 = false;
+ bool do_marker2 = true;
+ bool do_create = false;
+ bool do_delete = false;
+ bool verbose = false;
+
+ string marker_dir("nfs_marker");
+ struct rgw_file_handle *bucket_fh = nullptr;
+ struct rgw_file_handle *marker_fh;
+ static constexpr int marker_nobjs = 2*1024;
+ std::deque<obj_rec> marker_objs;
+
+ using dirent_t = std::tuple<std::string, uint64_t>;
+ struct dirent_vec
+ {
+ std::vector<dirent_t> obj_names;
+ uint32_t count;
+ dirent_vec() : count(0) {}
+ };
+
+ struct {
+ int argc;
+ char **argv;
+ } saved_args;
+}
+
+TEST(LibRGW, TVAR) {
+ typedef boost::variant<uint64_t*, const char*> readdir_offset;
+
+ uint64_t i1{64001};
+ std::string s1{"blunderbuss"};
+
+ readdir_offset v1{&i1};
+ readdir_offset v2{s1.c_str()};
+ readdir_offset v3{static_cast<const char*>(nullptr)};
+
+ uint64_t* pi1 = get<uint64_t*>(v1);
+ ASSERT_NE(pi1, nullptr);
+ std::cout << "read i1: " << *pi1 << std::endl;
+
+ const char* ps1 = get<const char*>(v2);
+ ASSERT_NE(ps1, nullptr);
+ std::cout << "read s1: " << ps1 << std::endl;
+
+ const char* ps3 = get<const char*>(v3);
+ ASSERT_EQ(ps3, nullptr);
+ std::cout << "read s3: " << ps3 << std::endl;
+}
+
+TEST(LibRGW, INIT) {
+ int ret = librgw_create(&rgw_h, saved_args.argc, saved_args.argv);
+ ASSERT_EQ(ret, 0);
+ ASSERT_NE(rgw_h, nullptr);
+}
+
+TEST(LibRGW, MOUNT) {
+ int ret = rgw_mount2(rgw_h, userid.c_str(), access_key.c_str(),
+ secret_key.c_str(), "/", &fs, RGW_MOUNT_FLAG_NONE);
+ ASSERT_EQ(ret, 0);
+ ASSERT_NE(fs, nullptr);
+
+ cct = static_cast<RGWLibFS*>(fs->fs_private)->get_context();
+}
+
+TEST(LibRGW, MARKER1_SETUP_BUCKET) {
+ /* "large" directory enumeration test. this one deals only with
+ * file objects */
+ struct stat st;
+ int ret;
+
+ st.st_uid = owner_uid;
+ st.st_gid = owner_gid;
+  st.st_mode = 0755;
+
+ (void) rgw_lookup(fs, fs->root_fh, bucket_name.c_str(), &bucket_fh,
+ RGW_LOOKUP_FLAG_NONE);
+ if (! bucket_fh) {
+ if (do_create) {
+ struct stat st;
+
+ st.st_uid = owner_uid;
+ st.st_gid = owner_gid;
+      st.st_mode = 0755;
+
+ ret = rgw_mkdir(fs, fs->root_fh, bucket_name.c_str(), &st, create_mask,
+ &bucket_fh, RGW_MKDIR_FLAG_NONE);
+ ASSERT_EQ(ret, 0);
+ }
+ }
+
+ ASSERT_NE(bucket_fh, nullptr);
+
+ (void) rgw_lookup(fs, bucket_fh, marker_dir.c_str(), &marker_fh,
+ RGW_LOOKUP_FLAG_NONE);
+ if (! marker_fh) {
+ if (do_create) {
+ ret = rgw_mkdir(fs, bucket_fh, marker_dir.c_str(), &st, create_mask,
+ &marker_fh, RGW_MKDIR_FLAG_NONE);
+ ASSERT_EQ(ret, 0);
+ }
+ }
+
+ ASSERT_NE(marker_fh, nullptr);
+}
+
+TEST(LibRGW, MARKER1_SETUP_OBJECTS)
+{
+ /* "large" directory enumeration test. this one deals only with
+ * file objects */
+ if (do_create) {
+ int ret;
+
+ for (int ix = 0; ix < marker_nobjs; ++ix) {
+ std::string object_name("f_");
+ object_name += to_string(ix);
+ obj_rec obj{object_name, nullptr, marker_fh, nullptr};
+ // lookup object--all operations are by handle
+ ret = rgw_lookup(fs, marker_fh, obj.name.c_str(), &obj.fh,
+ RGW_LOOKUP_FLAG_CREATE);
+ ASSERT_EQ(ret, 0);
+ obj.rgw_fh = get_rgwfh(obj.fh);
+ // open object--open transaction
+ ret = rgw_open(fs, obj.fh, 0 /* posix flags */, RGW_OPEN_FLAG_NONE);
+ ASSERT_EQ(ret, 0);
+ ASSERT_TRUE(obj.rgw_fh->is_open());
+ // unstable write data
+ size_t nbytes;
+ string data("data for ");
+ data += object_name;
+ int ret = rgw_write(fs, obj.fh, 0, data.length(), &nbytes,
+ (void*) data.c_str(), RGW_WRITE_FLAG_NONE);
+ ASSERT_EQ(ret, 0);
+ ASSERT_EQ(nbytes, data.length());
+ // commit transaction (write on close)
+ ret = rgw_close(fs, obj.fh, 0 /* flags */);
+ ASSERT_EQ(ret, 0);
+ // save for cleanup
+ marker_objs.push_back(obj);
+ }
+ }
+}
+
+extern "C" {
+ static bool r2_cb(const char* name, void *arg, uint64_t offset,
+ uint32_t flags) {
+ dirent_vec& dvec =
+ *(static_cast<dirent_vec*>(arg));
+ lsubdout(cct, rgw, 10) << __func__
+ << " bucket=" << bucket_name
+ << " dir=" << marker_dir
+ << " iv count=" << dvec.count
+ << " called back name=" << name
+ << " flags=" << flags
+ << dendl;
+
+ std::cout << __func__
+ << " bucket=" << bucket_name
+ << " dir=" << marker_dir
+ << " iv count=" << dvec.count
+ << " called back name=" << name
+ << " flags=" << flags
+ << std::endl;
+
+ string name_str{name};
+ if (! ((name_str == ".") ||
+ (name_str == ".."))) {
+ dvec.obj_names.push_back(dirent_t{std::move(name_str), offset});
+ }
+ return true; /* XXX */
+ }
+}
+
+TEST(LibRGW, MARKER1_READDIR)
+{
+ if (do_marker1) {
+ using std::get;
+
+ dirent_vec dvec;
+ uint64_t offset = 0;
+ bool eof = false;
+
+ /* because RGWReaddirRequest::default_max is 1000 (XXX make
+     * configurable?) and marker_nobjs is 2*1024, the number
+     * of required rgw_readdir operations N should be
+     * marker_nobjs/1000 < N < marker_nobjs/1000+1, i.e., 3 when
+     * marker_nobjs==2*1024 */
+ uint32_t max_iterations = marker_nobjs/1000+1;
+
+ do {
+ ASSERT_TRUE(dvec.count <= max_iterations);
+ int ret = rgw_readdir(fs, marker_fh, &offset, r2_cb, &dvec, &eof,
+ RGW_READDIR_FLAG_DOTDOT);
+ ASSERT_EQ(ret, 0);
+ ASSERT_EQ(offset, get<1>(dvec.obj_names.back())); // cookie check
+ ++dvec.count;
+ } while(!eof);
+ std::cout << "Read " << dvec.obj_names.size() << " objects in "
+ << marker_dir.c_str() << std::endl;
+ }
+}
+
+TEST(LibRGW, MARKER2_READDIR)
+{
+ if (do_marker2) {
+ using std::get;
+
+ dirent_vec dvec;
+ std::string marker{""};
+ bool eof = false;
+
+ /* because RGWReaddirRequest::default_max is 1000 (XXX make
+ * configurable?) and marker_nobjs is 5*1024, the number
+ * of required rgw_readdir operations N should be
+ * marker_nobjs/1000 < N < marker_nobjs/1000+1, i.e., 6 when
+ * marker_nobjs==5*1024 */
+ uint32_t max_iterations = marker_nobjs/1000+1;
+
+ do {
+ ASSERT_TRUE(dvec.count <= max_iterations);
+ int ret = rgw_readdir2(fs, marker_fh,
+ (marker.length() > 0) ? marker.c_str() : nullptr,
+ r2_cb, &dvec, &eof,
+ RGW_READDIR_FLAG_DOTDOT);
+ ASSERT_EQ(ret, 0);
+ marker = get<0>(dvec.obj_names.back());
+ ++dvec.count;
+ } while((!eof) && dvec.count < 4);
+ std::cout << "Read " << dvec.obj_names.size() << " objects in "
+ << marker_dir.c_str() << std::endl;
+ }
+}
+
+TEST(LibRGW, MARKER1_OBJ_CLEANUP)
+{
+ int rc;
+ for (auto& obj : marker_objs) {
+ if (obj.fh) {
+ if (do_delete) {
+ if (verbose) {
+ std::cout << "unlinking: " << bucket_name << ":" << obj.name
+ << std::endl;
+ }
+ rc = rgw_unlink(fs, marker_fh, obj.name.c_str(), RGW_UNLINK_FLAG_NONE);
+ }
+ rc = rgw_fh_rele(fs, obj.fh, 0 /* flags */);
+ ASSERT_EQ(rc, 0);
+ }
+ }
+ marker_objs.clear();
+}
+
+TEST(LibRGW, CLEANUP) {
+ int rc;
+
+ if (do_marker1) {
+ cleanup_queue.push_back(
+ obj_rec{bucket_name, bucket_fh, fs->root_fh, get_rgwfh(fs->root_fh)});
+ }
+
+ for (auto& elt : cleanup_queue) {
+ if (elt.fh) {
+ rc = rgw_fh_rele(fs, elt.fh, 0 /* flags */);
+ ASSERT_EQ(rc, 0);
+ }
+ }
+ cleanup_queue.clear();
+}
+
+TEST(LibRGW, UMOUNT) {
+ if (! fs)
+ return;
+
+ int ret = rgw_umount(fs, RGW_UMOUNT_FLAG_NONE);
+ ASSERT_EQ(ret, 0);
+}
+
+TEST(LibRGW, SHUTDOWN) {
+ librgw_shutdown(rgw_h);
+}
+
+int main(int argc, char *argv[])
+{
+ char *v{nullptr};
+ string val;
+ vector<const char*> args;
+
+ argv_to_vec(argc, const_cast<const char**>(argv), args);
+ env_to_vec(args);
+
+ v = getenv("AWS_ACCESS_KEY_ID");
+ if (v) {
+ access_key = v;
+ }
+
+ v = getenv("AWS_SECRET_ACCESS_KEY");
+ if (v) {
+ secret_key = v;
+ }
+
+ for (auto arg_iter = args.begin(); arg_iter != args.end();) {
+ if (ceph_argparse_witharg(args, arg_iter, &val, "--access",
+ (char*) nullptr)) {
+ access_key = val;
+ } else if (ceph_argparse_witharg(args, arg_iter, &val, "--secret",
+ (char*) nullptr)) {
+ secret_key = val;
+ } else if (ceph_argparse_witharg(args, arg_iter, &val, "--userid",
+ (char*) nullptr)) {
+ userid = val;
+ } else if (ceph_argparse_witharg(args, arg_iter, &val, "--bn",
+ (char*) nullptr)) {
+ bucket_name = val;
+ } else if (ceph_argparse_witharg(args, arg_iter, &val, "--uid",
+ (char*) nullptr)) {
+ owner_uid = std::stoi(val);
+ } else if (ceph_argparse_witharg(args, arg_iter, &val, "--gid",
+ (char*) nullptr)) {
+ owner_gid = std::stoi(val);
+ } else if (ceph_argparse_flag(args, arg_iter, "--marker1",
+ (char*) nullptr)) {
+ do_marker1 = true;
+ } else if (ceph_argparse_flag(args, arg_iter, "--create",
+ (char*) nullptr)) {
+ do_create = true;
+ } else if (ceph_argparse_flag(args, arg_iter, "--delete",
+ (char*) nullptr)) {
+ do_delete = true;
+ } else if (ceph_argparse_flag(args, arg_iter, "--verbose",
+ (char*) nullptr)) {
+ verbose = true;
+ } else {
+ ++arg_iter;
+ }
+ }
+
+ /* don't accidentally run as anonymous */
+ if ((access_key == "") ||
+ (secret_key == "")) {
+ std::cout << argv[0] << ": no AWS credentials, exiting" << std::endl;
+ return EPERM;
+ }
+
+ saved_args.argc = argc;
+ saved_args.argv = argv;
+
+ ::testing::InitGoogleTest(&argc, argv);
+ return RUN_ALL_TESTS();
+}
}
TEST(LibRGW, MOUNT) {
- int ret = rgw_mount(rgw_h, userid.c_str(), access_key.c_str(),
- secret_key.c_str(), &fs, RGW_MOUNT_FLAG_NONE);
+ int ret = rgw_mount2(rgw_h, userid.c_str(), access_key.c_str(),
+ secret_key.c_str(), "/", &fs, RGW_MOUNT_FLAG_NONE);
ASSERT_EQ(ret, 0);
ASSERT_NE(fs, nullptr);
float copies_rate =
(static_cast<float>(sum.num_object_copies - sum.num_objects_degraded) /
sum.num_object_copies);
- float used_bytes = sum.num_bytes * copies_rate;
+ float used_bytes = sum.num_bytes * copies_rate * pool.get_size();
float used_percent = used_bytes / (used_bytes + avail) * 100;
unsigned col = 0;
ASSERT_EQ(stringify(si_t(sum.num_bytes)), tbl.get(0, col++));
ASSERT_EQ(sd("{\"test_perfcounter_1\":{\"element1\":13,\"element2\":0.000000000,"
"\"element3\":{\"avgcount\":0,\"sum\":0.000000000,\"avgtime\":0.000000000}}}"), msg);
ASSERT_EQ("", client.do_request("{ \"prefix\": \"perf schema\", \"format\": \"json\" }", &msg));
- ASSERT_EQ(sd("{\"test_perfcounter_1\":{\"element1\":{\"type\":2,\"metric_type\":\"gauge\",\"value_type\":\"integer\",\"description\":\"\",\"nick\":\"\"},\"element2\":{\"type\":1,\"metric_type\":\"gauge\",\"value_type\":\"real\",\"description\":\"\",\"nick\":\"\"},\"element3\":{\"type\":5,\"metric_type\":\"gauge\",\"value_type\":\"real-integer-pair\",\"description\":\"\",\"nick\":\"\"}}}"), msg);
+ ASSERT_EQ(sd("{\"test_perfcounter_1\":{\"element1\":{\"type\":2,\"metric_type\":\"gauge\",\"value_type\":\"integer\",\"description\":\"\",\"nick\":\"\",\"priority\":0},\"element2\":{\"type\":1,\"metric_type\":\"gauge\",\"value_type\":\"real\",\"description\":\"\",\"nick\":\"\",\"priority\":0},\"element3\":{\"type\":5,\"metric_type\":\"gauge\",\"value_type\":\"real-integer-pair\",\"description\":\"\",\"nick\":\"\",\"priority\":0}}}"), msg);
coll->clear();
ASSERT_EQ("", client.do_request("{ \"prefix\": \"perf dump\", \"format\": \"json\" }", &msg));
ASSERT_EQ("{}", msg);
uint64_t start = Cycles::rdtsc();
Mutex::Locker l(lock);
for (int i = 0; i < count; i++) {
- timer.add_event_after(12345, c[i]);
- timer.cancel_event(c[i]);
+ if (timer.add_event_after(12345, c[i])) {
+ timer.cancel_event(c[i]);
+ }
}
uint64_t stop = Cycles::rdtsc();
delete[] c;
// register missing client in remote journal
librbd::journal::MirrorPeerClientMeta mirror_peer_client_meta;
+ mirror_peer_client_meta.state = librbd::journal::MIRROR_PEER_STATE_REPLAYING;
client_data.client_meta = mirror_peer_client_meta;
expect_journaler_register_client(mock_journaler, client_data, 0);
// re-register the client
expect_journaler_unregister_client(mock_journaler, 0);
mirror_peer_client_meta = {};
+ mirror_peer_client_meta.state = librbd::journal::MIRROR_PEER_STATE_REPLAYING;
client_data.client_meta = mirror_peer_client_meta;
expect_journaler_register_client(mock_journaler, client_data, 0);
struct Context;
struct MockSafeTimer {
- MOCK_METHOD2(add_event_after, void(double, Context*));
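+ // add_event_after now returns the scheduled Context* (presumably
+ // nullptr when the timer is shutting down), so callers can cancel
+ // only events that were actually added, as in the guarded
+ // cancel_event in the timer test above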
+ MOCK_METHOD2(add_event_after, Context*(double, Context*));
MOCK_METHOD1(cancel_event, bool(Context *));
};
using ::testing::Invoke;
using ::testing::MatcherCast;
using ::testing::Return;
+using ::testing::ReturnArg;
using ::testing::SetArgPointee;
using ::testing::WithArg;
void expect_add_event_after_repeatedly(MockThreads &mock_threads) {
EXPECT_CALL(*mock_threads.timer, add_event_after(_, _))
.WillRepeatedly(
- Invoke([this](double seconds, Context *ctx) {
- m_threads->timer->add_event_after(seconds, ctx);
- }));
+ DoAll(Invoke([this](double seconds, Context *ctx) {
+ m_threads->timer->add_event_after(seconds, ctx);
+ }),
+ ReturnArg<1>()));
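+ // ReturnArg<1>() makes the mock return its second argument (the ctx),
+ // satisfying the new Context*-returning add_event_after signature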
EXPECT_CALL(*mock_threads.timer, cancel_event(_))
.WillRepeatedly(
Invoke([this](Context *ctx) {
namespace mirror {
using ::testing::_;
+using ::testing::DoAll;
using ::testing::InSequence;
using ::testing::Invoke;
using ::testing::Return;
+using ::testing::ReturnArg;
using ::testing::ReturnRef;
using ::testing::WithArg;
void expect_add_event_after(MockThreads &mock_threads,
Context** timer_ctx = nullptr) {
EXPECT_CALL(*mock_threads.timer, add_event_after(_, _))
- .WillOnce(WithArg<1>(
- Invoke([this, &mock_threads, timer_ctx](Context *ctx) {
+ .WillOnce(DoAll(
+ WithArg<1>(Invoke([this, &mock_threads, timer_ctx](Context *ctx) {
assert(mock_threads.timer_lock.is_locked());
if (timer_ctx != nullptr) {
*timer_ctx = ctx;
ctx->complete(0);
}), 0);
}
- })));
+ })),
+ ReturnArg<1>()));
}
void expect_cancel_event(MockThreads &mock_threads, bool canceled) {
using ::testing::InSequence;
using ::testing::Invoke;
using ::testing::Return;
+using ::testing::ReturnArg;
using ::testing::StrEq;
using ::testing::WithArg;
using ::testing::WithoutArgs;
void expect_timer_add_event(MockThreads &mock_threads) {
EXPECT_CALL(*mock_threads.timer, add_event_after(_, _))
- .WillOnce(WithArg<1>(Invoke([this](Context *ctx) {
- auto wrapped_ctx = new FunctionContext([this, ctx](int r) {
- Mutex::Locker timer_locker(m_threads->timer_lock);
- ctx->complete(r);
- });
- m_threads->work_queue->queue(wrapped_ctx, 0);
- })));
+ .WillOnce(DoAll(WithArg<1>(Invoke([this](Context *ctx) {
+ auto wrapped_ctx =
+ new FunctionContext([this, ctx](int r) {
+ Mutex::Locker timer_locker(m_threads->timer_lock);
+ ctx->complete(r);
+ });
+ m_threads->work_queue->queue(wrapped_ctx, 0);
+ })),
+ ReturnArg<1>()));
}
int when_shut_down(MockPoolWatcher &mock_pool_watcher) {
except ImportError:
from itertools import zip_longest
from itertools import combinations
+from cStringIO import StringIO
import boto
import boto.s3.connection
for bucket_name in buckets:
zonegroup_bucket_checkpoint(zonegroup_conns, bucket_name)
+def test_multipart_object_sync():
+ zonegroup = realm.master_zonegroup()
+ zonegroup_conns = ZonegroupConns(zonegroup)
+ buckets, zone_bucket = create_bucket_per_zone(zonegroup_conns)
+
+ _, bucket = zone_bucket[0]
+
+ # initiate a multipart upload
+ upload = bucket.initiate_multipart_upload('MULTIPART')
+ mp = boto.s3.multipart.MultiPartUpload(bucket)
+ mp.key_name = upload.key_name
+ mp.id = upload.id
+ part_size = 5 * 1024 * 1024 # 5M min part size
+ mp.upload_part_from_file(StringIO('a' * part_size), 1)
+ mp.upload_part_from_file(StringIO('b' * part_size), 2)
+ mp.upload_part_from_file(StringIO('c' * part_size), 3)
+ mp.upload_part_from_file(StringIO('d' * part_size), 4)
+ mp.complete_upload()
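+ # the completed object is 4 parts * 5 MiB = 20 MiB; the checkpoint
+ # below waits for the bucket to sync to every zone in the zonegroup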
+
+ zonegroup_bucket_checkpoint(zonegroup_conns, bucket.name)
+
def test_encrypted_object_sync():
zonegroup = realm.master_zonegroup()
zonegroup_conns = ZonegroupConns(zonegroup)
#include "include/ipaddr.h"
+#include "common/pick_address.h"
+#include "global/global_context.h"
#include "gtest/gtest.h"
#if defined(__FreeBSD__)
ipv6(&want, "2001:1234:5678:90ab::dead:beef");
ASSERT_EQ(0, memcmp(want.sin6_addr.s6_addr, network.sin6_addr.s6_addr, sizeof(network.sin6_addr.s6_addr)));
}
+
+TEST(pick_address, find_ip_in_subnet_list)
+{
+ struct ifaddrs one, two;
+ struct sockaddr_in a_one;
+ struct sockaddr_in a_two;
+ const struct sockaddr *result;
+
+ one.ifa_next = &two;
+ one.ifa_addr = (struct sockaddr*)&a_one;
+ one.ifa_name = eth0;
+
+ two.ifa_next = NULL;
+ two.ifa_addr = (struct sockaddr*)&a_two;
+ two.ifa_name = eth1;
+
+ ipv4(&a_one, "10.1.1.2");
+ ipv4(&a_two, "10.2.1.123");
+
+ // match by network
+ result = find_ip_in_subnet_list(
+ g_ceph_context,
+ &one,
+ "10.1.0.0/16",
+ "eth0");
+ ASSERT_EQ((struct sockaddr*)&a_one, result);
+
+ result = find_ip_in_subnet_list(
+ g_ceph_context,
+ &one,
+ "10.2.0.0/16",
+ "eth1");
+ ASSERT_EQ((struct sockaddr*)&a_two, result);
+
+ // match by eth name
+ result = find_ip_in_subnet_list(
+ g_ceph_context,
+ &one,
+ "10.0.0.0/8",
+ "eth0");
+ ASSERT_EQ((struct sockaddr*)&a_one, result);
+
+ result = find_ip_in_subnet_list(
+ g_ceph_context,
+ &one,
+ "10.0.0.0/8",
+ "eth1");
+ ASSERT_EQ((struct sockaddr*)&a_two, result);
+}
target_link_libraries(ceph-osdomap-tool os global Boost::program_options)
install(TARGETS ceph-osdomap-tool DESTINATION bin)
-add_executable(ceph-monstore-tool ceph_monstore_tool.cc)
+add_executable(ceph-monstore-tool
+ ceph_monstore_tool.cc
+ ../mgr/mgr_commands.cc)
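+# mgr_commands.cc supplies the builtin mgr command table that
+# update_mgrmap() below encodes into the rebuilt monitor store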
target_link_libraries(ceph-monstore-tool os global Boost::program_options)
install(TARGETS ceph-monstore-tool DESTINATION bin)
install(PROGRAMS
class StoreTool
{
- boost::scoped_ptr<KeyValueDB> db;
+ boost::scoped_ptr<BlueStore> bluestore;
+
+ // TODO: make KeyValueDB enable_shared_from_this.
+ // BlueStore also holds *db*, so managing it here with a
+ // unique_ptr/shared_ptr would double-free it.
+ KeyValueDB* db;
+
string store_path;
public:
#ifdef HAVE_LIBAIO
// note: we'll leak this! the only user is ceph-kvstore-tool and
// we don't care.
- BlueStore *bluestore = new BlueStore(g_ceph_context, path);
+ bluestore.reset(new BlueStore(g_ceph_context, path));
int r = bluestore->start_kv_only(&db_ptr);
if (r < 0) {
exit(1);
exit(1);
}
}
- db.reset(db_ptr);
+ db = db_ptr;
+ }
+
+ ~StoreTool() {
+ if (bluestore) {
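+ // *db* is owned by the BlueStore instance (see the TODO above);
+ // umount() tears it down, so it must not be deleted here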
+ bluestore->umount();
+ }
+ else {
+ if (db) {
+ delete db;
+ }
+ }
}
uint32_t traverse(const string &prefix,
#include "auth/cephx/CephxKeyServer.h"
#include "global/global_init.h"
#include "include/stringify.h"
+#include "mgr/mgr_commands.h"
#include "mon/AuthMonitor.h"
#include "mon/MonitorDBStore.h"
#include "mon/Paxos.h"
return 0;
}
+static int update_mgrmap(MonitorDBStore& st)
+{
+ auto t = make_shared<MonitorDBStore::Transaction>();
+
+ {
+ MgrMap map;
+ // mgr expects epoch > 1
+ map.epoch++;
+ auto initial_modules =
+ get_str_vec(g_ceph_context->_conf->get_val<string>("mgr_initial_modules"));
+ copy(begin(initial_modules),
+ end(initial_modules),
+ inserter(map.modules, end(map.modules)));
+ bufferlist bl;
+ map.encode(bl, CEPH_FEATURES_ALL);
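+ // store the encoded map keyed by its epoch, with "last_committed"
+ // pointing at the newest epoch, per MonitorDBStore convention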
+ t->put("mgr", map.epoch, bl);
+ t->put("mgr", "last_committed", map.epoch);
+ }
+ {
+ auto mgr_command_descs = mgr_commands;
+ for (auto& c : mgr_command_descs) {
+ c.set_flag(MonCommand::FLAG_MGR);
+ }
+ bufferlist bl;
+ ::encode(mgr_command_descs, bl);
+ t->put("mgr_command_desc", "", bl);
+ }
+ return st.apply_transaction(t);
+}
+
static int update_paxos(MonitorDBStore& st)
{
// build a pending paxos proposal from all non-permanent k/v pairs. once the
{
MonitorDBStore::Transaction t;
vector<string> prefixes = {"auth", "osdmap",
+ "mgr", "mgr_command_desc",
"pgmap", "pgmap_pg", "pgmap_meta"};
for (const auto& prefix : prefixes) {
for (auto i = st.get_iterator(prefix); i->valid(); i->next()) {
if ((r = update_monitor(st))) {
return r;
}
+ if ((r = update_mgrmap(st))) {
+ return r;
+ }
return 0;
}
const ssize_t max_read = 1024 * 1024;
const int fd_none = INT_MIN;
bool outistty;
-bool dry_run = false;
+bool dry_run;
struct action_on_object_t {
virtual ~action_on_object_t() {}
ghobject_t biginfo_oid;
int file_fd = fd_none;
-bool debug = false;
+bool debug;
super_header sh;
uint64_t testalign;
("journal-path", po::value<string>(&jpath),
"path to journal, use if tool can't find it")
("pgid", po::value<string>(&pgidstr),
- "PG id, mandatory for info, log, remove, export, rm-past-intervals, mark-complete, and mandatory for apply-layout-settings if --pool is not specified")
+ "PG id, mandatory for info, log, remove, export, export-remove, rm-past-intervals, mark-complete, and mandatory for apply-layout-settings if --pool is not specified")
("pool", po::value<string>(&pool),
"Pool name, mandatory for apply-layout-settings if --pgid is not specified")
("op", po::value<string>(&op),
- "Arg is one of [info, log, remove, mkfs, fsck, fuse, dup, export, import, list, fix-lost, list-pgs, rm-past-intervals, dump-journal, dump-super, meta-list, "
+ "Arg is one of [info, log, remove, mkfs, fsck, repair, fuse, dup, export, export-remove, import, list, fix-lost, list-pgs, rm-past-intervals, dump-journal, dump-super, meta-list, "
"get-osdmap, set-osdmap, get-inc-osdmap, set-inc-osdmap, mark-complete, apply-layout-settings, update-mon-db]")
("epoch", po::value<unsigned>(&epoch),
"epoch# for get-osdmap and get-inc-osdmap, the current epoch in use if not specified")
("file", po::value<string>(&file),
- "path of file to export, import, get-osdmap, set-osdmap, get-inc-osdmap or set-inc-osdmap")
+ "path of file to export, export-remove, import, get-osdmap, set-osdmap, get-inc-osdmap or set-inc-osdmap")
("mon-store-path", po::value<string>(&mon_store_path),
"path of monstore to update-mon-db")
("fsid", po::value<string>(&fsid),
return 1;
}
- if (!vm.count("debug")) {
- debug = false;
- } else {
- debug = true;
- }
+ debug = (vm.count("debug") > 0);
- if (!vm.count("force")) {
- force = false;
- } else {
- force = true;
- }
+ force = (vm.count("force") > 0);
if (vm.count("namespace"))
nspace = argnspace;
- if (vm.count("dry-run"))
- dry_run = true;
+ dry_run = (vm.count("dry-run") > 0);
+
osflagbits_t flags = 0;
if (dry_run || vm.count("skip-journal-replay"))
flags |= SKIP_JOURNAL_REPLAY;
flags |= SKIP_MOUNT_OMAP;
if (op == "update-mon-db")
flags |= SKIP_JOURNAL_REPLAY;
+
head = (vm.count("head") > 0);
vector<const char *> ceph_options;
outistty = isatty(STDOUT_FILENO);
file_fd = fd_none;
- if ((op == "export" || op == "get-osdmap" || op == "get-inc-osdmap") && !dry_run) {
+ if ((op == "export" || op == "export-remove" || op == "get-osdmap" || op == "get-inc-osdmap") && !dry_run) {
if (!vm.count("file") || file == "-") {
if (outistty) {
cerr << "stdout is a tty and no --file filename specified" << std::endl;
ObjectStoreTool tool = ObjectStoreTool(file_fd, dry_run);
if (vm.count("file") && file_fd == fd_none && !dry_run) {
- cerr << "--file option only applies to import, export, "
+ cerr << "--file option only applies to import, export, export-remove, "
<< "get-osdmap, set-osdmap, get-inc-osdmap or set-inc-osdmap" << std::endl;
return 1;
}
return 1;
}
+ // Verify that the journal-path really exists
+ if (type == "filestore") {
+ if (::stat(jpath.c_str(), &st) == -1) {
+ string err = string("journal-path: ") + jpath;
+ perror(err.c_str());
+ return 1;
+ }
+ if (S_ISDIR(st.st_mode)) {
+ cerr << "journal-path: " << jpath << ": "
+ << cpp_strerror(EISDIR) << std::endl;
+ return 1;
+ }
+ }
+
ObjectStore *fs = ObjectStore::create(g_ceph_context, type, dpath, jpath, flags);
if (fs == NULL) {
cerr << "Unable to create store of type " << type << std::endl;
cout << "fsck found no errors" << std::endl;
return 0;
}
+ if (op == "repair" || op == "repair-deep") {
+ int r = fs->repair(op == "repair-deep");
+ if (r < 0) {
+ cerr << "repair failed: " << cpp_strerror(r) << std::endl;
+ return 1;
+ }
+ if (r > 0) {
+ cerr << "repair found " << r << " errors" << std::endl;
+ return 1;
+ }
+ cout << "repair found no errors" << std::endl;
+ return 0;
+ }
if (op == "mkfs") {
if (fsid.length()) {
uuid_d f;
}
int r = fs->mkfs();
if (r < 0) {
- cerr << "fsck failed: " << cpp_strerror(r) << std::endl;
+ cerr << "mkfs failed: " << cpp_strerror(r) << std::endl;
return 1;
}
return 0;
// The ops which require --pgid option are checked here and
// mentioned in the usage for --pgid.
if ((op == "info" || op == "log" || op == "remove" || op == "export"
- || op == "rm-past-intervals" || op == "mark-complete") &&
+ || op == "export-remove" || op == "rm-past-intervals" || op == "mark-complete") &&
pgidstr.length() == 0) {
cerr << "Must provide pgid" << std::endl;
usage(desc);
biginfo_oid = OSD::make_pg_biginfo_oid(pgid);
if (op == "remove") {
+ if (!force && !dry_run) {
+ cerr << "Please use export-remove or you must use --force option" << std::endl;
+ ret = -EINVAL;
+ goto out;
+ }
ret = initiate_new_remove_pg(fs, pgid, *osr);
if (ret < 0) {
cerr << "PG '" << pgid << "' not found" << std::endl;
// If not an object command nor any of the ops handled below, then output this usage
// before complaining about a bad pgid
- if (!vm.count("objcmd") && op != "export" && op != "info" && op != "log" && op != "rm-past-intervals" && op != "mark-complete") {
- cerr << "Must provide --op (info, log, remove, mkfs, fsck, export, import, list, fix-lost, list-pgs, rm-past-intervals, dump-journal, dump-super, meta-list, "
+ if (!vm.count("objcmd") && op != "export" && op != "export-remove" && op != "info" && op != "log" && op != "rm-past-intervals" && op != "mark-complete") {
+ cerr << "Must provide --op (info, log, remove, mkfs, fsck, repair, export, export-remove, import, list, fix-lost, list-pgs, rm-past-intervals, dump-journal, dump-super, meta-list, "
"get-osdmap, set-osdmap, get-inc-osdmap, set-inc-osdmap, mark-complete)"
<< std::endl;
usage(desc);
if (debug)
cerr << "struct_v " << (int)struct_ver << std::endl;
- if (op == "export") {
+ if (op == "export" || op == "export-remove") {
ret = tool.do_export(fs, coll, pgid, info, map_epoch, struct_ver, superblock, past_intervals);
- if (ret == 0)
+ if (ret == 0) {
cerr << "Export successful" << std::endl;
+ if (op == "export-remove") {
+ ret = initiate_new_remove_pg(fs, pgid, *osr);
+ // Export succeeded, so pgid is there
+ assert(ret == 0);
+ cerr << "Remove successful" << std::endl;
+ }
+ }
} else if (op == "info") {
formatter->open_object_section("info");
info.dump(formatter);
int main(int argc, char **argv) {
po::options_description desc("Allowed options");
- string store_path, cmd, out_path, oid;
+ string store_path, cmd, oid, backend;
bool debug = false;
desc.add_options()
("help", "produce help message")
("oid", po::value<string>(&oid), "Restrict to this object id when dumping objects")
("command", po::value<string>(&cmd),
"command arg is one of [dump-raw-keys, dump-raw-key-vals, dump-objects, dump-objects-with-keys, check, dump-headers, repair], mandatory")
+ ("backend", po::value<string>(&backend),
+ "DB backend (default rocksdb)")
;
po::positional_options_description p;
p.add("command", 1);
return 1;
}
- KeyValueDB* store(KeyValueDB::create(g_ceph_context, "leveldb", store_path));
+ if (vm.count("backend") == 0) {
+ backend = "rocksdb";
+ }
+
+ KeyValueDB* store(KeyValueDB::create(g_ceph_context, backend, store_path));
+ if (store == NULL) {
+ std::cerr << "Invalid backend '" << backend << "' specified" << std::endl;
+ return 1;
+ }
/*if (vm.count("paranoid")) {
std::cerr << "Enabling paranoid checks" << std::endl;
store->options.paranoid_checks = true;
// the DBObjectMap which we might want to examine for diagnostic
// reasons. Instead use --command repair.
+ omap.get_state();
+ std::cout << "Version: " << (int)omap.state.v << std::endl;
+ std::cout << "Seq: " << omap.state.seq << std::endl;
+ std::cout << "legacy: " << (omap.state.legacy ? "true" : "false") << std::endl;
+
if (cmd == "dump-raw-keys") {
KeyValueDB::WholeSpaceIterator i = store->get_iterator();
for (i->seek_to_first(); i->valid(); i->next()) {
} else if (cmd == "check" || cmd == "repair") {
ostringstream ss;
bool repair = (cmd == "repair");
- r = omap.check(ss, repair);
+ r = omap.check(ss, repair, true);
if (r) {
std::cerr << ss.str() << std::endl;
if (r > 0) {
for (auto i : headers)
std::cout << i << std::endl;
return 0;
+ } else if (cmd == "resetv2") {
+ omap.state.v = 2;
+ omap.state.legacy = false;
+ omap.set_state();
} else {
std::cerr << "Did not recognize command " << cmd << std::endl;
return 1;
{
set<int> roots;
- crush.find_roots(roots);
- if (roots.size() > 1)
- dout(1) << "The crush rulesets will use the root " << root << "\n"
- << "and ignore the others.\n"
- << "There are " << roots.size() << " roots, they can be\n"
- << "grouped into a single root by appending something like:\n"
- << " root straw 0\n"
- << dendl;
+ crush.find_roots(&roots);
+ if (roots.size() > 1) {
+ cerr << "The crush rulesets will use the root " << root << "\n"
+ << "and ignore the others.\n"
+ << "There are " << roots.size() << " roots, they can be\n"
+ << "grouped into a single root by appending something like:\n"
+ << " root straw 0\n"
+ << std::endl;
+ }
}
if (OSDMap::build_simple_crush_rules(g_ceph_context, crush, root, &cerr))
monmap.created = ceph_clock_now();
monmap.last_changed = monmap.created;
srand(getpid() + time(0));
- if (g_conf->fsid.is_zero()) {
+ if (g_conf->get_val<uuid_d>("fsid").is_zero()) {
monmap.generate_fsid();
cout << me << ": generated fsid " << monmap.fsid << std::endl;
}
modified = true;
}
- if (!g_conf->fsid.is_zero()) {
- monmap.fsid = g_conf->fsid;
+ if (!g_conf->get_val<uuid_d>("fsid").is_zero()) {
+ monmap.fsid = g_conf->get_val<uuid_d>("fsid");
cout << me << ": set fsid to " << monmap.fsid << std::endl;
modified = true;
}
cout << " --test-map-pgs-dump-all [--pool <poolid>] map all pgs to osds" << std::endl;
cout << " --health dump health checks" << std::endl;
cout << " --mark-up-in mark osds up and in (but do not persist)" << std::endl;
+ cout << " --mark-out <osdid> mark an osd as out (but do not persist)" << std::endl;
cout << " --with-default-pool include default pool when creating map" << std::endl;
cout << " --clear-temp clear pg_temp and primary_temp" << std::endl;
cout << " --test-random do random placements" << std::endl;
int range_last = -1;
int pool = -1;
bool mark_up_in = false;
+ int marked_out = -1;
bool clear_temp = false;
bool test_map_pgs = false;
bool test_map_pgs_dump = false;
create_from_conf = true;
} else if (ceph_argparse_flag(args, i, "--mark-up-in", (char*)NULL)) {
mark_up_in = true;
+ } else if (ceph_argparse_witharg(args, i, &val, "--mark-out", (char*)NULL)) {
+ marked_out = std::stoi(val);
} else if (ceph_argparse_flag(args, i, "--clear-temp", (char*)NULL)) {
clear_temp = true;
} else if (ceph_argparse_flag(args, i, "--test-map-pgs", (char*)NULL)) {
osdmap.crush->adjust_item_weightf(g_ceph_context, i, 1.0);
}
}
+
+ if (marked_out >= 0 && marked_out < osdmap.get_max_osd()) {
+ cout << "marking OSD@" << marked_out << " as out" << std::endl;
+ int id = marked_out;
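+ // weight CEPH_OSD_OUT (0) excludes the osd from data placement while
+ // leaving it up; like --mark-up-in, the change is not persisted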
+ osdmap.set_state(id, osdmap.get_state(id) | CEPH_OSD_UP);
+ osdmap.set_weight(id, CEPH_OSD_OUT);
+ osdmap.crush->adjust_item_weightf(g_ceph_context, id, 1.0);
+ }
+
if (clear_temp) {
cout << "clearing pg/primary temp" << std::endl;
osdmap.clear_temp();
namespace at = argument_types;
namespace po = boost::program_options;
+namespace {
+
+int validate_mirroring_enabled(librbd::Image& image) {
+ librbd::mirror_image_info_t mirror_image;
+ int r = image.mirror_image_get_info(&mirror_image, sizeof(mirror_image));
+ if (r < 0) {
+ std::cerr << "rbd: failed to retrieve mirror mode: "
+ << cpp_strerror(r) << std::endl;
+ return r;
+ }
+
+ if (mirror_image.state != RBD_MIRROR_IMAGE_ENABLED) {
+ std::cerr << "rbd: mirroring not enabled on the image" << std::endl;
+ return -EINVAL;
+ }
+ return 0;
+}
+
+} // anonymous namespace
void get_arguments(po::options_description *positional,
po::options_description *options) {
return r;
}
+ r = validate_mirroring_enabled(image);
+ if (r < 0) {
+ return r;
+ }
+
r = image.mirror_image_promote(force);
if (r < 0) {
std::cerr << "rbd: error promoting image to primary" << std::endl;
return r;
}
+ r = validate_mirroring_enabled(image);
+ if (r < 0) {
+ return r;
+ }
+
r = image.mirror_image_demote();
if (r < 0) {
std::cerr << "rbd: error demoting image to non-primary" << std::endl;
return r;
}
+ r = validate_mirroring_enabled(image);
+ if (r < 0) {
+ return r;
+ }
+
r = image.mirror_image_resync();
if (r < 0) {
std::cerr << "rbd: error flagging image resync" << std::endl;
return r;
}
+ r = validate_mirroring_enabled(image);
+ if (r < 0) {
+ return r;
+ }
+
librbd::mirror_image_status_t status;
r = image.mirror_image_get_status(&status, sizeof(status));
if (r < 0) {
namespace {
+int validate_mirroring_enabled(librados::IoCtx& io_ctx) {
+ librbd::RBD rbd;
+ rbd_mirror_mode_t mirror_mode;
+ int r = rbd.mirror_mode_get(io_ctx, &mirror_mode);
+ if (r < 0) {
+ std::cerr << "rbd: failed to retrieve mirror mode: "
+ << cpp_strerror(r) << std::endl;
+ return r;
+ }
+
+ if (mirror_mode == RBD_MIRROR_MODE_DISABLED) {
+ std::cerr << "rbd: mirroring not enabled on the pool" << std::endl;
+ return -EINVAL;
+ }
+ return 0;
+}
+
int validate_uuid(const std::string &uuid) {
boost::regex pattern("^[A-F0-9]{8}-[A-F0-9]{4}-[A-F0-9]{4}-[A-F0-9]{4}-[A-F0-9]{12}$",
boost::regex::icase);
protected:
bool skip_action(const librbd::mirror_image_info_t &info) const override {
- return info.primary;
+ return (info.state != RBD_MIRROR_IMAGE_ENABLED || info.primary);
}
void execute_action(librbd::Image &image,
if (r >= 0) {
(*m_counter)++;
}
+ ImageRequestBase::handle_execute_action(r);
}
std::string get_action_type() const override {
protected:
bool skip_action(const librbd::mirror_image_info_t &info) const override {
- return !info.primary;
+ return (info.state != RBD_MIRROR_IMAGE_ENABLED || !info.primary);
}
void execute_action(librbd::Image &image,
}
void finalize_action() override {
+ if (m_mirror_image_status.info.global_id.empty()) {
+ return;
+ }
+
std::string state = utils::mirror_image_status_state(m_mirror_image_status);
std::string last_update = (
m_mirror_image_status.last_update == 0 ?
if (r < 0) {
return r;
}
-
- librbd::RBD rbd;
- rbd_mirror_mode_t mirror_mode;
- r = rbd.mirror_mode_get(io_ctx, &mirror_mode);
+
+ r = validate_mirroring_enabled(io_ctx);
if (r < 0) {
- std::cerr << "rbd: failed to retrieve mirror mode: "
- << cpp_strerror(r) << std::endl;
return r;
}
-
- if (mirror_mode == RBD_MIRROR_MODE_DISABLED) {
- std::cerr << "rbd: failed to add mirror peer: "
- << "mirroring must be enabled on the pool "
- << pool_name << std::endl;
- return -EINVAL;
- }
// TODO: temporary restriction to prevent adding multiple peers
// until rbd-mirror daemon can properly handle the scenario
+ librbd::RBD rbd;
std::vector<librbd::mirror_peer_t> mirror_peers;
r = rbd.mirror_peer_list(io_ctx, &mirror_peers);
if (r < 0) {
return r;
}
+ r = validate_mirroring_enabled(io_ctx);
+ if (r < 0) {
+ return r;
+ }
+
librbd::RBD rbd;
r = rbd.mirror_peer_remove(io_ctx, uuid);
if (r < 0) {
return r;
}
+ r = validate_mirroring_enabled(io_ctx);
+ if (r < 0) {
+ return r;
+ }
+
librbd::RBD rbd;
if (key == "client") {
r = rbd.mirror_peer_set_client(io_ctx, uuid.c_str(), value.c_str());
return r;
}
+ r = validate_mirroring_enabled(io_ctx);
+ if (r < 0) {
+ return r;
+ }
+
librbd::RBD rbd;
std::map<librbd::mirror_image_status_state_t, int> states;
return r;
}
+ r = validate_mirroring_enabled(io_ctx);
+ if (r < 0) {
+ return r;
+ }
+
std::atomic<unsigned> counter = { 0 };
ImageRequestGenerator<PromoteImageRequest> generator(io_ctx, &counter,
vm["force"].as<bool>());
return r;
}
+ r = validate_mirroring_enabled(io_ctx);
+ if (r < 0) {
+ return r;
+ }
+
std::atomic<unsigned> counter { 0 };
ImageRequestGenerator<DemoteImageRequest> generator(io_ctx, &counter);
r = generator.execute();
}
};
+template <typename I>
class ImageReplayerAdminSocketCommand {
public:
+ ImageReplayerAdminSocketCommand(const std::string &desc,
+ ImageReplayer<I> *replayer)
+ : desc(desc), replayer(replayer) {
+ }
virtual ~ImageReplayerAdminSocketCommand() {}
virtual bool call(Formatter *f, stringstream *ss) = 0;
+
+ std::string desc;
+ ImageReplayer<I> *replayer;
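+ // set once register_commands() succeeds, so the destructor only
+ // unregisters commands that were actually registered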
+ bool registered = false;
};
template <typename I>
-class StatusCommand : public ImageReplayerAdminSocketCommand {
+class StatusCommand : public ImageReplayerAdminSocketCommand<I> {
public:
- explicit StatusCommand(ImageReplayer<I> *replayer) : replayer(replayer) {}
+ explicit StatusCommand(const std::string &desc, ImageReplayer<I> *replayer)
+ : ImageReplayerAdminSocketCommand<I>(desc, replayer) {
+ }
bool call(Formatter *f, stringstream *ss) override {
- replayer->print_status(f, ss);
+ this->replayer->print_status(f, ss);
return true;
}
-
-private:
- ImageReplayer<I> *replayer;
};
template <typename I>
-class StartCommand : public ImageReplayerAdminSocketCommand {
+class StartCommand : public ImageReplayerAdminSocketCommand<I> {
public:
- explicit StartCommand(ImageReplayer<I> *replayer) : replayer(replayer) {}
+ explicit StartCommand(const std::string &desc, ImageReplayer<I> *replayer)
+ : ImageReplayerAdminSocketCommand<I>(desc, replayer) {
+ }
bool call(Formatter *f, stringstream *ss) override {
- replayer->start(nullptr, true);
+ this->replayer->start(nullptr, true);
return true;
}
-
-private:
- ImageReplayer<I> *replayer;
};
template <typename I>
-class StopCommand : public ImageReplayerAdminSocketCommand {
+class StopCommand : public ImageReplayerAdminSocketCommand<I> {
public:
- explicit StopCommand(ImageReplayer<I> *replayer) : replayer(replayer) {}
+ explicit StopCommand(const std::string &desc, ImageReplayer<I> *replayer)
+ : ImageReplayerAdminSocketCommand<I>(desc, replayer) {
+ }
bool call(Formatter *f, stringstream *ss) override {
- replayer->stop(nullptr, true);
+ this->replayer->stop(nullptr, true);
return true;
}
-
-private:
- ImageReplayer<I> *replayer;
};
template <typename I>
-class RestartCommand : public ImageReplayerAdminSocketCommand {
+class RestartCommand : public ImageReplayerAdminSocketCommand<I> {
public:
- explicit RestartCommand(ImageReplayer<I> *replayer) : replayer(replayer) {}
+ explicit RestartCommand(const std::string &desc, ImageReplayer<I> *replayer)
+ : ImageReplayerAdminSocketCommand<I>(desc, replayer) {
+ }
bool call(Formatter *f, stringstream *ss) override {
- replayer->restart();
+ this->replayer->restart();
return true;
}
-
-private:
- ImageReplayer<I> *replayer;
};
template <typename I>
-class FlushCommand : public ImageReplayerAdminSocketCommand {
+class FlushCommand : public ImageReplayerAdminSocketCommand<I> {
public:
- explicit FlushCommand(ImageReplayer<I> *replayer) : replayer(replayer) {}
+ explicit FlushCommand(const std::string &desc, ImageReplayer<I> *replayer)
+ : ImageReplayerAdminSocketCommand<I>(desc, replayer) {
+ }
bool call(Formatter *f, stringstream *ss) override {
C_SaferCond cond;
- replayer->flush(&cond);
+ this->replayer->flush(&cond);
int r = cond.wait();
if (r < 0) {
*ss << "flush: " << cpp_strerror(r);
}
return true;
}
-
-private:
- ImageReplayer<I> *replayer;
};
template <typename I>
public:
ImageReplayerAdminSocketHook(CephContext *cct, const std::string &name,
ImageReplayer<I> *replayer)
- : admin_socket(cct->get_admin_socket()), name(name), replayer(replayer),
- lock("ImageReplayerAdminSocketHook::lock " +
- replayer->get_global_image_id()) {
+ : admin_socket(cct->get_admin_socket()),
+ commands{{"rbd mirror flush " + name,
+ new FlushCommand<I>("flush rbd mirror " + name, replayer)},
+ {"rbd mirror restart " + name,
+ new RestartCommand<I>("restart rbd mirror " + name, replayer)},
+ {"rbd mirror start " + name,
+ new StartCommand<I>("start rbd mirror " + name, replayer)},
+ {"rbd mirror status " + name,
+ new StatusCommand<I>("get status for rbd mirror " + name, replayer)},
+ {"rbd mirror stop " + name,
+ new StopCommand<I>("stop rbd mirror " + name, replayer)}} {
}
int register_commands() {
- std::string command;
- int r;
-
- command = "rbd mirror status " + name;
- r = admin_socket->register_command(command, command, this,
- "get status for rbd mirror " + name);
- if (r < 0) {
- return r;
- }
- commands[command] = new StatusCommand<I>(replayer);
-
- command = "rbd mirror start " + name;
- r = admin_socket->register_command(command, command, this,
- "start rbd mirror " + name);
- if (r < 0) {
- return r;
- }
- commands[command] = new StartCommand<I>(replayer);
-
- command = "rbd mirror stop " + name;
- r = admin_socket->register_command(command, command, this,
- "stop rbd mirror " + name);
- if (r < 0) {
- return r;
- }
- commands[command] = new StopCommand<I>(replayer);
-
- command = "rbd mirror restart " + name;
- r = admin_socket->register_command(command, command, this,
- "restart rbd mirror " + name);
- if (r < 0) {
- return r;
- }
- commands[command] = new RestartCommand<I>(replayer);
-
- command = "rbd mirror flush " + name;
- r = admin_socket->register_command(command, command, this,
- "flush rbd mirror " + name);
- if (r < 0) {
- return r;
+ for (auto &it : commands) {
+ int r = admin_socket->register_command(it.first, it.first, this,
+ it.second->desc);
+ if (r < 0) {
+ return r;
+ }
+ it.second->registered = true;
}
- commands[command] = new FlushCommand<I>(replayer);
-
return 0;
}
~ImageReplayerAdminSocketHook() override {
- Mutex::Locker locker(lock);
- for (Commands::const_iterator i = commands.begin(); i != commands.end();
- ++i) {
- (void)admin_socket->unregister_command(i->first);
- delete i->second;
+ for (auto &it : commands) {
+ if (it.second->registered) {
+ admin_socket->unregister_command(it.first);
+ }
+ delete it.second;
}
commands.clear();
}
bool call(std::string command, cmdmap_t& cmdmap, std::string format,
bufferlist& out) override {
- Mutex::Locker locker(lock);
- Commands::const_iterator i = commands.find(command);
+ auto i = commands.find(command);
assert(i != commands.end());
Formatter *f = Formatter::create(format);
stringstream ss;
}
private:
- typedef std::map<std::string, ImageReplayerAdminSocketCommand*> Commands;
+ typedef std::map<std::string, ImageReplayerAdminSocketCommand<I> *> Commands;
AdminSocket *admin_socket;
- std::string name;
- ImageReplayer<I> *replayer;
- Mutex lock;
Commands commands;
};
return;
}
- {
- Mutex::Locker locker(m_lock);
- std::string name = m_local_ioctx.get_pool_name() + "/" +
- m_local_image_ctx->name;
- if (m_name != name) {
- m_name = name;
- if (m_asok_hook) {
- // Re-register asok commands using the new name.
- delete m_asok_hook;
- m_asok_hook = nullptr;
- }
- }
- register_admin_socket_hook();
- }
+ on_name_changed();
update_mirror_image_status(false, boost::none);
init_remote_journaler();
image_replayer::BootstrapRequest<I> *bootstrap_request = nullptr;
bool shut_down_replay = false;
bool running = true;
- bool canceled_task = false;
{
Mutex::Locker locker(m_lock);
std::swap(m_on_stop_finish, on_finish);
m_stop_requested = true;
m_manual_stop = manual;
-
- Mutex::Locker timer_locker(m_threads->timer_lock);
- if (m_delayed_preprocess_task != nullptr) {
- canceled_task = m_threads->timer->cancel_event(
- m_delayed_preprocess_task);
- assert(canceled_task);
- m_delayed_preprocess_task = nullptr;
- }
}
}
}
bootstrap_request->put();
}
- if (canceled_task) {
- m_event_replay_tracker.finish_op();
- on_replay_interrupted();
- }
-
if (!running) {
dout(20) << "not running" << dendl;
if (on_finish) {
dout(20) << dendl;
assert(r == 0);
+ on_name_changed();
+
// attempt to process the next event
handle_replay_ready();
}
template <typename I>
void ImageReplayer<I>::shut_down(int r) {
dout(20) << "r=" << r << dendl;
+
+ bool canceled_delayed_preprocess_task = false;
+ {
+ Mutex::Locker timer_locker(m_threads->timer_lock);
+ if (m_delayed_preprocess_task != nullptr) {
+ canceled_delayed_preprocess_task = m_threads->timer->cancel_event(
+ m_delayed_preprocess_task);
+ assert(canceled_delayed_preprocess_task);
+ m_delayed_preprocess_task = nullptr;
+ }
+ }
+ if (canceled_delayed_preprocess_task) {
+ // wake up sleeping replay
+ m_event_replay_tracker.finish_op();
+ }
+
{
Mutex::Locker locker(m_lock);
assert(m_state == STATE_STOPPING);
void ImageReplayer<I>::handle_shut_down(int r) {
reschedule_update_status_task(-1);
+ bool unregister_asok_hook = false;
{
Mutex::Locker locker(m_lock);
m_local_image_id = "";
m_resync_requested = false;
if (m_delete_requested) {
- unregister_admin_socket_hook();
+ unregister_asok_hook = true;
m_delete_requested = false;
}
} else if (m_last_r == -ENOENT &&
m_local_image_id.empty() && m_remote_image.image_id.empty()) {
dout(0) << "mirror image no longer exists" << dendl;
- unregister_admin_socket_hook();
+ unregister_asok_hook = true;
m_finished = true;
}
}
+ if (unregister_asok_hook) {
+ unregister_admin_socket_hook();
+ }
+
dout(20) << "stop complete" << dendl;
m_local_ioctx.close();
template <typename I>
void ImageReplayer<I>::register_admin_socket_hook() {
- if (m_asok_hook != nullptr) {
- return;
- }
+ ImageReplayerAdminSocketHook<I> *asok_hook;
+ {
+ Mutex::Locker locker(m_lock);
+ if (m_asok_hook != nullptr) {
+ return;
+ }
- dout(20) << "registered asok hook: " << m_name << dendl;
- auto asok_hook = new ImageReplayerAdminSocketHook<I>(g_ceph_context, m_name,
- this);
- int r = asok_hook->register_commands();
- if (r < 0) {
+ dout(20) << "registered asok hook: " << m_name << dendl;
+ asok_hook = new ImageReplayerAdminSocketHook<I>(g_ceph_context, m_name,
+ this);
+ int r = asok_hook->register_commands();
+ if (r == 0) {
+ m_asok_hook = asok_hook;
+ return;
+ }
derr << "error registering admin socket commands" << dendl;
- delete asok_hook;
- asok_hook = nullptr;
- return;
}
-
- m_asok_hook = asok_hook;
+ delete asok_hook;
}
template <typename I>
void ImageReplayer<I>::unregister_admin_socket_hook() {
dout(20) << dendl;
- delete m_asok_hook;
- m_asok_hook = nullptr;
+ AdminSocketHook *asok_hook = nullptr;
+ {
+ Mutex::Locker locker(m_lock);
+ std::swap(asok_hook, m_asok_hook);
+ }
+ delete asok_hook;
+}
+
+template <typename I>
+void ImageReplayer<I>::on_name_changed() {
+ {
+ Mutex::Locker locker(m_lock);
+ std::string name = m_local_ioctx.get_pool_name() + "/" +
+ m_local_image_ctx->name;
+ if (m_name == name) {
+ return;
+ }
+ m_name = name;
+ }
+ unregister_admin_socket_hook();
+ register_admin_socket_hook();
}
template <typename I>
void register_admin_socket_hook();
void unregister_admin_socket_hook();
+
+ void on_name_changed();
};
} // namespace mirror
const std::string SERVICE_DAEMON_LOCAL_COUNT_KEY("image_local_count");
const std::string SERVICE_DAEMON_REMOTE_COUNT_KEY("image_remote_count");
+const std::vector<std::string> UNIQUE_PEER_CONFIG_KEYS {
+ {"monmap", "mon_host", "mon_dns_srv_name", "key", "keyfile", "keyring"}};
+
class PoolReplayerAdminSocketCommand {
public:
PoolReplayerAdminSocketCommand(PoolReplayer *pool_replayer)
dout(20) << "replaying for " << m_peer << dendl;
int r = init_rados(g_ceph_context->_conf->cluster,
g_ceph_context->_conf->name.to_str(),
- "local cluster", &m_local_rados);
+ "local cluster", &m_local_rados, false);
if (r < 0) {
m_callout_id = m_service_daemon->add_or_update_callout(
m_local_pool_id, m_callout_id, service_daemon::CALLOUT_LEVEL_ERROR,
r = init_rados(m_peer.cluster_name, m_peer.client_name,
std::string("remote peer ") + stringify(m_peer),
- &m_remote_rados);
+ &m_remote_rados, true);
if (r < 0) {
m_callout_id = m_service_daemon->add_or_update_callout(
m_local_pool_id, m_callout_id, service_daemon::CALLOUT_LEVEL_ERROR,
int PoolReplayer::init_rados(const std::string &cluster_name,
const std::string &client_name,
const std::string &description,
- RadosRef *rados_ref) {
+ RadosRef *rados_ref,
+ bool strip_cluster_overrides) {
rados_ref->reset(new librados::Rados());
// NOTE: manually bootstrap a CephContext here instead of via
cct->put();
return r;
}
+
+ // preserve cluster-specific config settings before applying environment/cli
+ // overrides
+ std::map<std::string, std::string> config_values;
+ if (strip_cluster_overrides) {
+ // remote peer connections shouldn't apply cluster-specific
+ // configuration settings
+ for (auto& key : UNIQUE_PEER_CONFIG_KEYS) {
+ config_values[key] = cct->_conf->get_val<std::string>(key);
+ }
+ }
+
cct->_conf->parse_env();
// librados::Rados::conf_parse_env
}
}
+ if (strip_cluster_overrides) {
+ // remote peer connections shouldn't apply cluster-specific
+ // configuration settings
+ for (auto& pair : config_values) {
+ auto value = cct->_conf->get_val<std::string>(pair.first);
+ if (pair.second != value) {
+ dout(0) << "reverting global config option override: "
+ << pair.first << ": " << value << " -> " << pair.second
+ << dendl;
+ cct->_conf->set_val_or_die(pair.first, pair.second);
+ }
+ }
+ }
+
if (!g_ceph_context->_conf->admin_socket.empty()) {
cct->_conf->set_val_or_die("admin_socket",
"$run_dir/$name.$pid.$cluster.$cctid.asok");
int init_rados(const std::string &cluster_name,
const std::string &client_name,
- const std::string &description, RadosRef *rados_ref);
+ const std::string &description, RadosRef *rados_ref,
+ bool strip_cluster_overrides);
void handle_post_acquire_leader(Context *on_finish);
void handle_pre_release_leader(Context *on_finish);
}
m_image_ids_invalid = true;
- m_timer_ctx = new FunctionContext([this](int r) {
- process_refresh_images();
- });
- m_threads->timer->add_event_after(interval, m_timer_ctx);
+ m_timer_ctx = m_threads->timer->add_event_after(
+ interval,
+ new FunctionContext([this](int r) {
+ process_refresh_images();
+ }));
}
template <typename I>
update_progress("REGISTER_CLIENT");
- // record an place-holder record
- librbd::journal::ClientData client_data{
- librbd::journal::MirrorPeerClientMeta{m_local_image_id}};
+ librbd::journal::MirrorPeerClientMeta mirror_peer_client_meta{
+ m_local_image_id};
+ mirror_peer_client_meta.state = librbd::journal::MIRROR_PEER_STATE_REPLAYING;
+
+ librbd::journal::ClientData client_data{mirror_peer_client_meta};
bufferlist client_data_bl;
::encode(client_data, client_data_bl);
m_client = {};
*m_client_meta = librbd::journal::MirrorPeerClientMeta(m_local_image_id);
+ m_client_meta->state = librbd::journal::MIRROR_PEER_STATE_REPLAYING;
+
is_primary();
}
template <typename I>
void BootstrapRequest<I>::get_remote_tags() {
- dout(20) << dendl;
-
- update_progress("GET_REMOTE_TAGS");
-
if (m_client_meta->state == librbd::journal::MIRROR_PEER_STATE_SYNCING) {
// optimization -- no need to compare remote tags if we just created
// the image locally or sync was interrupted
}
dout(20) << dendl;
+ update_progress("GET_REMOTE_TAGS");
Context *ctx = create_context_callback<
BootstrapRequest<I>, &BootstrapRequest<I>::handle_get_remote_tags>(this);
{
Mutex::Locker timer_locker(*m_timer_lock);
if (m_update_sync_ctx) {
- m_timer->add_event_after(m_update_sync_point_interval,
- m_update_sync_ctx);
+ m_update_sync_ctx = m_timer->add_event_after(
+ m_update_sync_point_interval,
+ m_update_sync_ctx);
}
}
osd copyfrom max chunk = 524288
bluestore fsck on mount = true
bluestore block create = true
+ bluestore block db path = $CEPH_DEV_DIR/osd\$id/block.db.file
bluestore block db size = 67108864
bluestore block db create = true
+ bluestore block wal path = $CEPH_DEV_DIR/osd\$id/block.wal.file
bluestore block wal size = 1048576000
bluestore block wal create = true
$COSDDEBUG
$COSDSHORT
$extra_conf
[mon]
- mgr initial modules = restful status dashboard
+ mgr initial modules = restful status dashboard balancer
mon pg warn min per osd = 3
mon osd allow primary affinity = true
mon reweight min pgs per osd = 4
echo "add osd$osd $uuid"
ceph_adm osd create $uuid
ceph_adm osd crush add osd.$osd 1.0 host=$HOSTNAME root=default
- $SUDO $CEPH_BIN/ceph-osd -i $osd $ARGS --mkfs --mkkey --osd-uuid $uuid
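+ # generate the osd key up front (instead of ceph-osd --mkkey) so the
+ # same secret can be passed to --mkfs and to the mon keyring below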
+ OSD_SECRET=$($CEPH_BIN/ceph-authtool --gen-print-key)
+ $SUDO $CEPH_BIN/ceph-osd -i $osd $ARGS --mkfs --key $OSD_SECRET --osd-uuid $uuid
local key_fn=$CEPH_DEV_DIR/osd$osd/keyring
+ cat > $key_fn<<EOF
+[osd.$osd]
+ key = $OSD_SECRET
+EOF
echo adding osd$osd key to auth repository
ceph_adm -i "$key_fn" auth add osd.$osd osd "allow *" mon "allow profile osd" mgr "allow profile osd"
fi
debug rocksdb = 10
debug bdev = 20
debug rgw = 20
+ debug reserver = 10
debug objclass = 20'
CMDSDEBUG='
debug ms = 1
Description=Ceph rbd mirror daemon
After=network-online.target local-fs.target
Wants=network-online.target local-fs.target
+PartOf=ceph-rbd-mirror.target
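+# PartOf: stopping or restarting ceph-rbd-mirror.target now propagates to
+# this daemon instance (but not the other way around)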
[Service]
LimitNOFILE=1048576