Zack Cerza <zack@redhat.com> <zack@cerza.org>
Zengran Zhang <zhangzengran@h3c.com>
Zeqiang Zhuang <zhuang.zeqiang@h3c.com>
+Zhang Lei <zhanglei@trendytech.com.cn>
+Zhang Lei <zhanglei@trendytech.com.cn> <243290414@qq.com>
Zhang Shaowen <zhang_shaowen@139.com>
Zhang Zezhu <zhang.zezhu@zte.com.cn>
Guo Zhandong <guozhandong@cmss.chinamobile.com>
Zhi Zhang <willzzhang@tencent.com>
Zhi Zhang <willzzhang@tencent.com> <zhangz.david@outlook.com>
Zhi Zhang <willzzhang@tencent.com> <zhangzhi@localhost.localdomain>
+Zhu Shangzhong <zhu.shangzhong@zte.com.cn>
Zhuang Xiaochun <zhuangxc89@163.com>
Red Hat <contact@redhat.com> Alex Elder <aelder@redhat.com>
Red Hat <contact@redhat.com> Alfredo Deza <adeza@redhat.com>
Red Hat <contact@redhat.com> Ali Maredia <amaredia@redhat.com>
+Red Hat <contact@redhat.com> Amit Kumar <amitkuma@redhat.com>
Red Hat <contact@redhat.com> Andrew Schoen <aschoen@redhat.com>
Red Hat <contact@redhat.com> Barbora Ančincová <bara@redhat.com>
Red Hat <contact@redhat.com> Boris Ranto <branto@redhat.com>
The University of Arizona <contact@arizona.edu> James Ryan Cresawn <jrcresawn@gmail.com>
Time Warner Cable Inc. <contact@twcable.com> Bryan Stillwell <bryan.stillwell@twcable.com>
Trendy Tech <contact@trendytech.com.cn> shiqi <m13913886148@gmail.com>
-Trendy Tech <contact@trendytech.com.cn> Lei Zhang <243290414@qq.com>
+Trendy Tech <contact@trendytech.com.cn> Zhang Lei <zhanglei@trendytech.com.cn>
Uber Technologies Inc. <contact@uber.com> Henrik Korkuc <henrik@uber.com>
Ubuntu Kylin <contact@ubuntukylin.com> Min Chen <minchen@ubuntukylin.com>
UMCloud <contact@umcloud.com> Jiaying Ren <mikulely@gmail.com>
ZTE <contact@zte.com.cn> Gong Chuang <gong.chuang@zte.com.cn>
ZTE <contact@zte.com.cn> Lan De <lan.de3@zte.com.cn>
ZTE <contact@zte.com.cn> Luo Kexue <luo.kexue@zte.com.cn>
-ZTE <contact@zte.com.cn> Ren Huanwen <ren.huanwen@zte.com.cn>
ZTE <contact@zte.com.cn> Luo Runbing <runsisi@zte.com.cn>
-ZTE <contact@zte.com.cn> Xie Xingguo <xie.xingguo@zte.com.cn>
-ZTE <contact@zte.com.cn> Shun Song <song.shun3@zte.com.cn>
+ZTE <contact@zte.com.cn> Ren Huanwen <ren.huanwen@zte.com.cn>
ZTE <contact@zte.com.cn> Song Baisen <song.baisen@zte.com.cn>
+ZTE <contact@zte.com.cn> Song Shun <song.shun3@zte.com.cn>
ZTE <contact@zte.com.cn> Song Weibin <song.weibin@zte.com.cn>
ZTE <contact@zte.com.cn> Tang Wenjun <tang.wenjun3@zte.com.cn>
ZTE <contact@zte.com.cn> Wei Qiaomiao <wei.qiaomiao@zte.com.cn>
+ZTE <contact@zte.com.cn> Xie Xingguo <xie.xingguo@zte.com.cn>
ZTE <contact@zte.com.cn> Yan Jun <yan.jun8@zte.com.cn>
ZTE <contact@zte.com.cn> Zhang Zezhu <zhang.zezhu@zte.com.cn>
+ZTE <contact@zte.com.cn> Zhu Shangzhong <zhu.shangzhong@zte.com.cn>
#
# Local Variables:
# compile-command: "git log --pretty='%aN <%aE>' | \
RBD (kernel) - Ilya Dryomov <idryomov@redhat.com>
RGW - Yehuda Sadeh <yehuda@redhat.com>
Matt Benjamin <mbenjami@redhat.com>
-CephFS - John Spray <jspray@redhat.com>
+CephFS - Patrick Donnelly <pdonnell@redhat.com>
CephFS (kernel) - Yan, Zheng <zyan@redhat.com>
Deployment - Alfredo Deza <adeza@redhat.com>
Teuthology - Zack Cerza <zack@redhat.com>
cmake_minimum_required(VERSION 2.8.11)
project(ceph)
-set(VERSION 12.1.1)
+set(VERSION 12.1.2)
if(POLICY CMP0046)
# Tweak policies (this one disables "missing" dependency warning)
* Added new configuration "public bind addr" to support dynamic environments
like Kubernetes. When set the Ceph MON daemon could bind locally to an IP
address and advertise a different IP address "public addr" on the network.
+* RGW: bucket index resharding now uses the reshard namespace in upgrade scenarios as well.
+ This is a change in behaviour from RC1, where a new pool for reshard was created.
12.0.0
------
* The "ceph -w" output no longer contains audit log entries by default.
Add a "--watch-channel=audit" or "--watch-channel=*" to see them.
+12.1.2
+------
+
+* New "ceph -w" behavior - the "ceph -w" output no longer contains I/O rates,
+ available space, pg info, etc. because these are no longer logged to the
+ central log (which is what "ceph -w" shows). The same information can be
+ obtained by running "ceph pg stat"; alternatively, I/O rates per pool can
+ be determined using "ceph osd pool stats". Although these commands do not
+ self-update like "ceph -w" did, they do have the ability to return formatted
+ output by providing a "--format=<format>" option.
+
+* Pools are now expected to be associated with the application using them.
+ Upon completing the upgrade to Luminous, the cluster will attempt to associate
+ existing pools to known applications (i.e. CephFS, RBD, and RGW). In-use pools
+ that are not associated to an application will generate a health warning. Any
+ unassociated pools can be manually associated using the new
+ "ceph osd pool application enable" command. For more details see
+ "Associate Pool to Application" in the documentation.
+
+* ceph-mgr now has a Zabbix plugin. Using zabbix_sender it sends trapper
+ events to a Zabbix server containing high-level information of the Ceph
+ cluster. This makes it easy to monitor a Ceph cluster's status and send
+ out notifications in case of a malfunction.
+
+* The 'mon_warn_osd_usage_min_max_delta' config option has been
+ removed and the associated health warning has been disabled because
+ it does not address clusters undergoing recovery or CRUSH rules that do
+ not target all devices in the cluster.
+
+* Specifying user authorization capabilities for RBD clients has been
+ simplified. The general syntax for using RBD capability profiles is
+ "mon 'profile rbd' osd 'profile rbd[-read-only][ pool={pool-name}[, ...]]'".
+ For more details see "User Management" in the documentation.
+
+* ``ceph config-key put`` has been deprecated in favor of ``ceph config-key set``.
\ No newline at end of file
Note that these instructions are meant for developers who are
compiling the code for development and testing. To build binaries
suitable for installation we recommend you build deb or rpm packages,
-or refer to the ceph.spec.in or debian/rules to see which
+or refer to the `ceph.spec.in` or `debian/rules` to see which
configuration options are specified for production builds.
Prerequisite: CMake 2.8.11
make
This assumes you make your build dir a subdirectory of the ceph.git
-checkout. If you put it elsewhere, just replace .. in do_cmake.sh with a
+checkout. If you put it elsewhere, just replace `..` in do_cmake.sh with a
correct path to the checkout.
To build only certain targets use:
ctest -V -R [regex matching test name(s)]
To run an tests manually and run the jobs in parallel, run `ctest` with
-the -j flag:
+the `-j` flag:
ctest -j [number of jobs]
### Prerequisites
The list of package dependencies for building the documentation can be
-found in doc_deps.deb.txt:
+found in `doc_deps.deb.txt`:
sudo apt-get install `cat doc_deps.deb.txt`
### Building the Documentation
To build the documentation, ensure that you are in the top-level
-`/ceph directory, and execute the build script. For example:
+`/ceph` directory, and execute the build script. For example:
admin/build-doc
If you cannot condense your patch set into a smaller set of patches,
then only post say 15 or so at a time and wait for review and integration.
+5. Document your changes
+------------------------
+
+If you have added or modified any user-facing functionality, such
+as CLI commands or their output, then the patch series or pull request
+must include appropriate updates to documentation.
+
+It is the submitter's responsibility to make the changes, and the reviewer's
+responsibility to make sure they are not merging changes that do not
+have the needed updates to documentation.
+
+Where there are areas that have absent documentation, or there is no
+clear place to note the change that is being made, the reviewer should
+contact the component lead, who should arrange for the missing section
+to be created with sufficient detail for the patch submitter to
+document their changes.
-5. Style check your changes
+6. Style check your changes
---------------------------
Check your patch for basic style violations, details of which can be
found in CodingStyle.
-6. No MIME, no links, no compression, no attachments. Just plain text
+7. No MIME, no links, no compression, no attachments. Just plain text
----------------------------------------------------------------------
Developers need to be able to read and comment on the changes you are
# Contributor: John Coyle <dx9err@gmail.com>
# Maintainer: John Coyle <dx9err@gmail.com>
pkgname=ceph
-pkgver=12.1.1
+pkgver=12.1.2
pkgrel=0
pkgdesc="Ceph is a distributed object store and file system"
pkgusers="ceph"
xmlstarlet
yasm
"
-source="ceph-12.1.1.tar.bz2"
+source="ceph-12.1.2.tar.bz2"
subpackages="
$pkgname-base
$pkgname-common
_udevrulesdir=/etc/udev/rules.d
_python_sitelib=/usr/lib/python2.7/site-packages
-builddir=$srcdir/ceph-12.1.1
+builddir=$srcdir/ceph-12.1.2
build() {
export CEPH_BUILD_VIRTUALENV=$builddir
%bcond_without ceph_test_package
%endif
%bcond_with make_check
-%bcond_with xio
%ifarch s390 s390x
%bcond_with tcmalloc
%else
# main package definition
#################################################################################
Name: ceph
-Version: 12.1.1
+Version: 12.1.2
Release: 0%{?dist}
%if 0%{?fedora} || 0%{?rhel}
Epoch: 2
Group: System/Filesystems
%endif
URL: http://ceph.com/
-Source0: http://ceph.com/download/ceph-12.1.1.tar.bz2
+Source0: http://ceph.com/download/ceph-12.1.2.tar.bz2
%if 0%{?suse_version}
%if 0%{?is_opensuse}
ExclusiveArch: x86_64 aarch64 ppc64 ppc64le
BuildRequires: selinux-policy-devel
BuildRequires: /usr/share/selinux/devel/policyhelp
%endif
+%if 0%{with make_check}
+%if 0%{?fedora} || 0%{?rhel}
+BuildRequires: python-cherrypy
+BuildRequires: python-werkzeug
+%endif
+%if 0%{?suse_version}
+BuildRequires: python-CherryPy
+BuildRequires: python-Werkzeug
+%endif
+BuildRequires: python-pecan
+BuildRequires: socat
+%endif
BuildRequires: bc
BuildRequires: gperf
BuildRequires: cmake
BuildRequires: python
BuildRequires: python-devel
BuildRequires: python-nose
-BuildRequires: python-pecan
BuildRequires: python-requests
BuildRequires: python-virtualenv
-BuildRequires: python-werkzeug
-BuildRequires: socat
BuildRequires: snappy-devel
BuildRequires: udev
BuildRequires: util-linux
BuildRequires: libopenssl-devel
BuildRequires: lsb-release
BuildRequires: openldap2-devel
-BuildRequires: python-CherryPy
BuildRequires: python-Cython
BuildRequires: python-PrettyTable
BuildRequires: python-Sphinx
BuildRequires: openssl-devel
BuildRequires: redhat-lsb-core
BuildRequires: Cython
-BuildRequires: python-cherrypy
BuildRequires: python-prettytable
BuildRequires: python-sphinx
%endif
%if 0%{?fedora} || 0%{?rhel}
BuildRequires: redhat-rpm-config
%endif
-# Accelio IB/RDMA
-%if 0%{with xio}
-BuildRequires: libxio-devel
-%endif
%description
Ceph is a massively scalable, open-source, distributed storage system that runs
%if 0%{?suse_version}
Recommends: ntp-daemon
%endif
-%if 0%{with xio}
-Requires: libxio
-%endif
%description base
Base is the package that includes all the files shared amongst ceph servers
%if 0%{?suse_version}
Requires(pre): pwdutils
%endif
-%if 0%{with xio}
-Requires: libxio
-%endif
%description -n ceph-common
Common utilities to mount and interact with a ceph storage cluster.
Comprised of files that are common to Ceph clients and servers.
Requires: ceph-common
Requires: xmlstarlet
Requires: jq
+Requires: socat
%description -n ceph-test
This package contains Ceph benchmarks and test tools.
%endif
# common
#################################################################################
%prep
-%autosetup -p1 -n ceph-12.1.1
+%autosetup -p1 -n ceph-12.1.2
%build
%if 0%{with cephfs_java}
-DCMAKE_INSTALL_SYSCONFDIR=%{_sysconfdir} \
-DCMAKE_INSTALL_MANDIR=%{_mandir} \
-DCMAKE_INSTALL_DOCDIR=%{_docdir}/ceph \
+ -DCMAKE_INSTALL_INCLUDEDIR=%{_includedir} \
-DWITH_EMBEDDED=OFF \
-DWITH_MANPAGE=ON \
-DWITH_PYTHON3=ON \
%if 0%{?rhel} && ! 0%{?centos}
-DWITH_SUBMAN=ON \
%endif
-%if 0%{with xio}
- -DWITH_XIO=ON \
-%endif
%if 0%{without ceph_test_package}
-DWITH_TESTS=OFF \
%endif
%endif
%if %{with lttng}
-DWITH_LTTNG=ON \
- -DWTIH_BABELTRACE=ON \
+ -DWITH_BABELTRACE=ON \
%else
-DWITH_LTTNG=OFF \
- -DWTIH_BABELTRACE=OFF \
+ -DWITH_BABELTRACE=OFF \
%endif
$CEPH_EXTRA_CMAKE_ARGS \
%if 0%{with ocf}
%bcond_without ceph_test_package
%endif
%bcond_with make_check
-%bcond_with xio
%ifarch s390 s390x
%bcond_with tcmalloc
%else
BuildRequires: selinux-policy-devel
BuildRequires: /usr/share/selinux/devel/policyhelp
%endif
+%if 0%{with make_check}
+%if 0%{?fedora} || 0%{?rhel}
+BuildRequires: python-cherrypy
+BuildRequires: python-werkzeug
+%endif
+%if 0%{?suse_version}
+BuildRequires: python-CherryPy
+BuildRequires: python-Werkzeug
+%endif
+BuildRequires: python-pecan
+BuildRequires: socat
+%endif
BuildRequires: bc
BuildRequires: gperf
BuildRequires: cmake
BuildRequires: python
BuildRequires: python-devel
BuildRequires: python-nose
-BuildRequires: python-pecan
BuildRequires: python-requests
BuildRequires: python-virtualenv
-BuildRequires: python-werkzeug
-BuildRequires: socat
BuildRequires: snappy-devel
BuildRequires: udev
BuildRequires: util-linux
BuildRequires: libopenssl-devel
BuildRequires: lsb-release
BuildRequires: openldap2-devel
-BuildRequires: python-CherryPy
BuildRequires: python-Cython
BuildRequires: python-PrettyTable
BuildRequires: python-Sphinx
BuildRequires: openssl-devel
BuildRequires: redhat-lsb-core
BuildRequires: Cython
-BuildRequires: python-cherrypy
BuildRequires: python-prettytable
BuildRequires: python-sphinx
%endif
%if 0%{?fedora} || 0%{?rhel}
BuildRequires: redhat-rpm-config
%endif
-# Accelio IB/RDMA
-%if 0%{with xio}
-BuildRequires: libxio-devel
-%endif
%description
Ceph is a massively scalable, open-source, distributed storage system that runs
%if 0%{?suse_version}
Recommends: ntp-daemon
%endif
-%if 0%{with xio}
-Requires: libxio
-%endif
%description base
Base is the package that includes all the files shared amongst ceph servers
%if 0%{?suse_version}
Requires(pre): pwdutils
%endif
-%if 0%{with xio}
-Requires: libxio
-%endif
%description -n ceph-common
Common utilities to mount and interact with a ceph storage cluster.
Comprised of files that are common to Ceph clients and servers.
Requires: ceph-common
Requires: xmlstarlet
Requires: jq
+Requires: socat
%description -n ceph-test
This package contains Ceph benchmarks and test tools.
%endif
-DCMAKE_INSTALL_SYSCONFDIR=%{_sysconfdir} \
-DCMAKE_INSTALL_MANDIR=%{_mandir} \
-DCMAKE_INSTALL_DOCDIR=%{_docdir}/ceph \
+ -DCMAKE_INSTALL_INCLUDEDIR=%{_includedir} \
-DWITH_EMBEDDED=OFF \
-DWITH_MANPAGE=ON \
-DWITH_PYTHON3=ON \
%if 0%{?rhel} && ! 0%{?centos}
-DWITH_SUBMAN=ON \
%endif
-%if 0%{with xio}
- -DWITH_XIO=ON \
-%endif
%if 0%{without ceph_test_package}
-DWITH_TESTS=OFF \
%endif
%endif
%if %{with lttng}
-DWITH_LTTNG=ON \
- -DWTIH_BABELTRACE=ON \
+ -DWITH_BABELTRACE=ON \
%else
-DWITH_LTTNG=OFF \
- -DWTIH_BABELTRACE=OFF \
+ -DWITH_BABELTRACE=OFF \
%endif
$CEPH_EXTRA_CMAKE_ARGS \
%if 0%{with ocf}
+ceph (12.1.2-1) stable; urgency=medium
+
+ * New upstream release
+
+ -- Ceph Release Team <ceph-maintainers@ceph.com> Tue, 01 Aug 2017 17:55:37 +0000
+
ceph (12.1.1-1) stable; urgency=medium
* New upstream release
Depends: ceph-common,
curl,
jq,
+ socat,
xmlstarlet,
${misc:Depends},
${shlibs:Depends},
ARGS=""
if which ccache ; then
echo "enabling ccache"
- ARGS+="-DWITH_CCACHE=ON"
+ ARGS="$ARGS -DWITH_CCACHE=ON"
fi
mkdir build
Deactivate an MDS, causing it to flush its entire journal to
backing RADOS objects and close all open client sessions. Deactivating an MDS
is primarily intended for bringing down a rank after reducing the number of
-active MDS (max_mds).
+active MDS (max_mds). Once the rank is deactivated, the MDS daemon will rejoin the
+cluster as a standby.
+``<role>`` can take one of three forms:
+
+::
+
+ <fs_name>:<rank>
+ <fs_id>:<rank>
+ <rank>
Use ``mds deactivate`` in conjunction with adjustments to ``max_mds`` to
shrink an MDS cluster. See :doc:`/cephfs/multimds`
Finally, you can regenerate metadata objects for missing files
and directories based on the contents of a data pool. This is
-a two-phase process. First, scanning *all* objects to calculate
+a three-phase process. First, scanning *all* objects to calculate
size and mtime metadata for inodes. Second, scanning the first
-object from every file to collect this metadata and inject
-it into the metadata pool.
+object from every file to collect this metadata and inject it into
+the metadata pool. Third, checking inode linkages and fixing found
+errors.
::
cephfs-data-scan scan_extents <data pool>
cephfs-data-scan scan_inodes <data pool>
+ cephfs-data-scan scan_links
-This command may take a *very long* time if there are many
-files or very large files in the data pool.
+'scan_extents' and 'scan_inodes' commands may take a *very long* time
+if there are many files or very large files in the data pool.
To accelerate the process, run multiple instances of the tool.
ceph osd pool create recovery <pg-num> replicated <crush-ruleset-name>
ceph fs new recovery-fs recovery <data pool> --allow-dangerous-metadata-overlay
cephfs-data-scan init --force-init --filesystem recovery-fs --alternate-pool recovery
- ceph fs reset recovery-fs --yes-i-realy-mean-it
+ ceph fs reset recovery-fs --yes-i-really-mean-it
cephfs-table-tool recovery-fs:all reset session
cephfs-table-tool recovery-fs:all reset snap
cephfs-table-tool recovery-fs:all reset inode
::
- cephfs-data-scan scan_extents --alternate-pool recovery --filesystem <original filesystem name>
+ cephfs-data-scan scan_extents --alternate-pool recovery --filesystem <original filesystem name> <original data pool name>
cephfs-data-scan scan_inodes --alternate-pool recovery --filesystem <original filesystem name> --force-corrupt --force-init <original data pool name>
+ cephfs-data-scan scan_links --filesystem recovery-fs
If the damaged filesystem contains dirty journal data, it may be recovered next
with:
cephfs-journal-tool --rank=<original filesystem name>:0 event recover_dentries list --alternate-pool recovery
cephfs-journal-tool --rank recovery-fs:0 journal reset --force
-After recovery, some recovered directories will have incorrect link counts.
-Ensure the parameter mds_debug_scatterstat is set to false (the default) to
-prevent the MDS from checking the link counts, then run a forward scrub to
-repair them. Ensure you have an MDS running and issue:
+After recovery, some recovered directories will have incorrect statistics.
+Ensure the parameters mds_verify_scatter and mds_debug_scatterstat are set
+to false (the default) to prevent the MDS from checking the statistics, then
+run a forward scrub to repair them. Ensure you have an MDS running and issue:
::
CephFS includes a number of experimental features which are not fully stabilized
or qualified for users to turn on in real deployments. We generally do our best
-to clearly demarcate these and fence them off so they can't be used by mistake.
+to clearly demarcate these and fence them off so they cannot be used by mistake.
Some of these features are closer to being done than others, though. We describe
each of them with an approximation of how risky they are and briefly describe
``journaler batch max``
-:Description: Maximum bytes we'll delay flushing.
+:Description: Maximum bytes we will delay flushing.
:Type: 64-bit Unsigned Integer
:Required: No
:Default: ``0``
Most of the time this guide will work but sometimes all MDSs lock up and you
cannot actually see them spill. It is much better to run this on a cluster.
-As a pre-requistie, we assume you've installed `mdtest
+As a prerequisite, we assume you have installed `mdtest
<https://sourceforge.net/projects/mdtest/>`_ or pulled the `Docker image
<https://hub.docker.com/r/michaelsevilla/mdtest/>`_. We use mdtest because we
need to generate enough load to get over the MIN_OFFLOAD threshold that is
done
-6. When you're done, you can kill all the clients with:
+6. When you are done, you can kill all the clients with:
::
in the MDBalancer. We do not want the error propagating up the call chain. The
cls_lua class wants to handle the error itself because it must fail gracefully.
For Mantle, we don't care if a Lua error crashes our balancer -- in that case,
-we'll fall back to the original balancer.
+we will fall back to the original balancer.
The performance improvement of using `lua_call` over `lua_pcall` would not be
leveraged here because the balancer is invoked every 10 seconds by default.
we have decreased max_mds, because max_mds only restricts creation
of new ranks.
-Next, use the ``ceph mds deactivate <rank>`` command to remove the
+Next, use the ``ceph mds deactivate <role>`` command to remove the
unneeded rank:
::
# fsmap e12: 1/1/1 up {0=a=up:active}, 1 up:standby
# fsmap e13: 1/1/1 up {0=a=up:active}, 2 up:standby
+See :doc:`/cephfs/administration` for more details on which forms ``<role>`` can
+take.
+
The deactivated rank will first enter the stopping state for a period
of time while it hands off its share of the metadata to the remaining
active daemons. This phase can take from seconds to minutes. If the
RADOS Health
============
-If part of the CephFS metadata or data pools is unavaible and CephFS isn't
+If part of the CephFS metadata or data pools is unavailable and CephFS is not
responding, it is probably because RADOS itself is unhealthy. Resolve those
problems first (:doc:`../../rados/troubleshooting/index`).
the operation off to the MDS log. If it is waiting on the OSDs, fix them. If
operations are stuck on a specific inode, you probably have a client holding
caps which prevent others from using it, either because the client is trying
-to flush out dirty data or because you've encountered a bug in CephFS'
+to flush out dirty data or because you have encountered a bug in CephFS'
distributed file lock code (the file "capabilities" ["caps"] system).
If it's a result of a bug in the capabilities code, restarting the MDS
is likely to resolve the problem.
-If there are no slow requests reported on the MDS, and it isn't reporting
+If there are no slow requests reported on the MDS, and it is not reporting
that clients are misbehaving, either the client has a problem or its
-requests aren't reaching the MDS.
+requests are not reaching the MDS.
ceph-fuse debugging
===================
* osdc: Dumps the current ops in-flight to OSDs (ie, file data IO)
* osdmap: Dumps the current OSDMap epoch, pools, and OSDs
-If there are no stuck requests but you have file IO which isn't progressing,
+If there are no stuck requests but you have file IO which is not progressing,
you might have a...
Disconnected+Remounted FS
Because CephFS has a "consistent cache", if your network connection is
disrupted for a long enough time, the client will be forcibly
disconnected from the system. At this point, the kernel client is in
-a bind: it can't safely write back dirty data, and many applications
+a bind: it cannot safely write back dirty data, and many applications
do not handle IO errors correctly on close().
At the moment, the kernel client will remount the FS, but outstanding filesystem
IO may or may not be satisfied. In these cases, you may need to reboot your
--- /dev/null
+=========
+ceph-disk
+=========
+
+
+device-mapper crypt
+===================
+
+Settings
+--------
+
+``osd_dmcrypt_type``
+
+:Description: this option specifies the mode in which ``cryptsetup`` works. It can be ``luks`` or ``plain``. It kicks in only if the ``--dmcrypt`` option is passed to ``ceph-disk``. See also `cryptsetup document <https://gitlab.com/cryptsetup/cryptsetup/wikis/DMCrypt#configuration-using-cryptsetup>`_ for more details.
+
+:Type: String
+:Default: ``luks``
+
+
+``osd_dmcrypt_key_size``
+
+:Description: the size of the random string in bytes used as the LUKS key. The string is read from ``/dev/urandom`` and then encoded using base64. It will be stored with the key of ``dm-crypt/osd/$uuid/luks`` using config-key.
+
+:Type: String
+:Default: 1024 if ``osd_dmcrypt_type`` is ``luks``, 256 otherwise.
+
+lockbox
+-------
+
+``ceph-disk`` supports dmcrypt (device-mapper crypt). If dmcrypt is enabled, the partitions will be encrypted using this machinery. For each OSD device, a lockbox is introduced for holding the information regarding how the dmcrypt key is stored. To prepare a lockbox, ``ceph-disk``
+
+#. creates a dedicated lockbox partition on device, and
+#. populates it with a tiny filesystem, then
+#. automounts it at ``/var/lib/ceph/osd-lockbox/$uuid``, read-only, where ``uuid`` is the lockbox's uuid.
+
+under which, settings are stored using plain files:
+
+- key-management-mode: ``ceph-mon v1``
+- osd-uuid: the OSD's uuid
+- ceph_fsid: the fsid of the cluster
+- keyring: the lockbox's keyring, allowing one to fetch the LUKS key
+- block_uuid: the partition uuid for the block device
+- journal_uuid: the partition uuid for the journal device
+- block.db_uuid: the partition uuid for the block.db device
+- block.wal_uuid: the partition uuid for the block.wal device
+- magic: a magic string indicating that this partition is a lockbox. It's not used currently.
+- ``${space_uuid}``: symbolic links named after the uuid of space partitions pointing to ``/var/lib/ceph/osd-lockbox/$uuid``. In the case of FileStore, the space partitions are ``data`` and ``journal`` partitions; for BlueStore, they are ``data``, ``block.db`` and ``block.wal``.
+
+Currently, ``ceph-mon v1`` is the only supported key-management-mode. In that case, the LUKS key is stored using the config-key in the monitor store with the key of ``dm-crypt/osd/$uuid/luks``.
+
+
+partitions
+==========
+
+``ceph-disk`` creates partitions for preparing a device for OSD deployment. Their partition numbers are hardcoded. For instance, data partition's partition number is always *1* :
+
+1. data partition
+2. journal partition, if co-located with data
+3. block.db for BlueStore, if co-located with data
+4. block.wal for BlueStore, if co-located with data
+5. lockbox
There are two ways for Ceph code to get configuration values. One way is to
read it directly from a variable named "g_conf," or equivalently,
"g_ceph_ctx->_conf." The other is to register an observer that will be called
-every time the relevant configuration values changes. This observer will be
+every time the relevant configuration values changes. This observer will be
called soon after the initial configuration is read, and every time after that
when one of the relevant values changes. Each observer tracks a set of keys
and is invoked only when one of the relevant keys changes.
Injectargs, parse_argv, and parse_env are three other functions which modify
the configuration. Just like with set_val, you should call apply_changes after
calling these functions to make sure your changes get applied.
+
+
+Defining config options
+=======================
+
+New-style config options are defined in common/options.cc. All new config
+options should go here (and not into legacy_config_opts.h).
+
+Levels
+------
+
+The Option constructor takes a "level" value:
+
+* *LEVEL_BASIC* is for basic config options that a normal operator is likely to adjust.
+* *LEVEL_ADVANCED* is for options that an operator *can* adjust, but should not touch unless they understand what they are doing. Adjusting advanced options incorrectly can lead to problems (performance or even data loss).
+* *LEVEL_DEV* is for options in place for use by developers only, either for testing purposes, or to describe constants that no user should adjust but we prefer not to compile into the code.
+
+Description and long description
+--------------------------------
+
+Short description of the option. Sentence fragment. e.g.::
+
+ .set_description("Default checksum algorithm to use")
+
+The long description is complete sentences, perhaps even multiple
+paragraphs, and may include other detailed information or notes::
+
+ .set_long_description("crc32c, xxhash32, and xxhash64 are available. The _16 and _8 variants use only a subset of the bits for more compact (but less reliable) checksumming.")
+
+Default values
+--------------
+
+There is a default value for every config option. In some cases, there may
+also be a *daemon default* that only applies to code that declares itself
+as a daemon (in this case, the regular default only applies to non-daemons).
+
+Safety
+------
+
+If an option can be safely changed at runtime::
+
+ .set_safe()
+
+Service
+-------
+
+Service is a component name, like "common", "osd", "rgw", "mds", etc. It may
+be a list of components, like::
+
+ .add_service("mon mds osd mgr")
+
+For example, the rocksdb options affect both the osd and mon.
+
+Tags
+----
+
+Tags identify options across services that relate in some way. Examples include:
+
+ - network -- options affecting network configuration
+ - mkfs -- options that only matter at mkfs time
+
+Enums
+-----
+
+For options with a defined set of allowed values::
+
+ .set_enum_allowed({"none", "crc32c", "crc32c_16", "crc32c_8", "xxhash32", "xxhash64"})
assign them a priority
* The bugs with higher priority are worked on first
-Each ``team`` is responsible for a project:
+Each ``team`` is responsible for a project, managed by leads_.
-* rgw lead is Yehuda Sadeh
-* CephFS lead is John Spray
-* rados lead is Samuel Just
-* rbd lead is Jason Dillaman
+.. _leads: index#Leads
The ``developer`` assigned to an issue is responsible for it. The
status of an open issue can be:
Documenting Ceph
==================
+User documentation
+==================
+
+The documentation on docs.ceph.com is generated from the reStructuredText
+sources in ``/doc/`` in the Ceph git repository.
+
+Please make sure that your changes are written in a way that is intended
+for end users of the software, unless you are making additions in
+``/doc/dev/``, which is the section for developers.
+
+All pull requests that modify user-facing functionality must
+include corresponding updates to documentation: see
+`Submitting Patches`_ for more detail.
+
+Check your .rst syntax is working as expected by using the "View"
+button in the github user interface when looking at a diff on
+an .rst file, or build the docs locally using the ``admin/build-doc``
+script.
+
+For more information about the Ceph documentation, see
+:doc:`/start/documenting-ceph`.
+
Code Documentation
==================
digraph "example" {
foo -> bar;
bar -> baz;
- bar -> thud;
+ bar -> thud;
}
Most of the time, you'll want to put the actual DOT source in a
SVG diagrams using Inkscape.
HTML5 will support SVG inline.
+
+.. _Submitting Patches: /SubmittingPatches.rst
.. _github: https://github.com/
-========= =============== =============
-Scope Lead GitHub nick
-========= =============== =============
-Ceph Sage Weil liewegas
-RADOS Samuel Just athanatos
-RGW Yehuda Sadeh yehudasa
-RBD Jason Dillaman dillaman
-CephFS John Spray jcsp
-Build/Ops Ken Dreyer ktdreyer
-========= =============== =============
+========= ================ =============
+Scope Lead GitHub nick
+========= ================ =============
+Ceph Sage Weil liewegas
+RADOS Samuel Just athanatos
+RGW Yehuda Sadeh yehudasa
+RBD Jason Dillaman dillaman
+CephFS Patrick Donnelly batrick
+Build/Ops Ken Dreyer ktdreyer
+========= ================ =============
The Ceph-specific acronyms in the table are explained in
:doc:`/architecture`.
The perf counters provide generic internal infrastructure for gauges and counters. The counted values can be both integer and float. There is also an "average" type (normally float) that combines a sum and num counter which can be divided to provide an average.
-The intention is that this data will be collected and aggregated by a tool like ``collectd`` or ``statsd`` and fed into a tool like ``graphite`` for graphing and analysis.
+The intention is that this data will be collected and aggregated by a tool like ``collectd`` or ``statsd`` and fed into a tool like ``graphite`` for graphing and analysis. Also, note the :doc:`../mgr/prometheus`.
Access
------
$ MON=1 MDS=1 ../src/vstart.sh -d -n -x
-The system creates three pools on startup: `cephfs_data`, `cephfs_metadata`, and `rbd`. Let's get some stats on
+The system creates two pools on startup: `cephfs_data_a` and `cephfs_metadata_a`. Let's get some stats on
the current pools:
.. code::
$ bin/ceph osd pool stats
*** DEVELOPER MODE: setting PATH, PYTHONPATH and LD_LIBRARY_PATH ***
- pool rbd id 0
- nothing is going on
-
- pool cephfs_data id 1
+ pool cephfs_data_a id 1
nothing is going on
- pool cephfs_metadata id 2
+ pool cephfs_metadata_a id 2
nothing is going on
- $ bin/ceph osd pool stats cephfs_data
+ $ bin/ceph osd pool stats cephfs_data_a
*** DEVELOPER MODE: setting PATH, PYTHONPATH and LD_LIBRARY_PATH ***
- pool cephfs_data id 1
+ pool cephfs_data_a id 1
nothing is going on
- $ ./rados df
- pool name category KB objects clones degraded unfound rd rd KB wr wr KB
- rbd - 0 0 0 0 0 0 0 0 0
- cephfs_data - 0 0 0 0 0 0 0 0 0
- cephfs_metadata - 2 20 0 40 0 0 0 21 8
- total used 12771536 20
- total avail 3697045460
- total space 3709816996
+ $ bin/rados df
+ POOL_NAME USED OBJECTS CLONES COPIES MISSING_ON_PRIMARY UNFOUND DEGRADED RD_OPS RD WR_OPS WR
+ cephfs_data_a 0 0 0 0 0 0 0 0 0 0 0
+ cephfs_metadata_a 2246 21 0 63 0 0 0 0 0 42 8192
+
+ total_objects 21
+ total_used 244G
+ total_space 1180G
Make a pool and run some benchmarks against it:
Migrating from Apache to Civetweb
---------------------------------
-If you're running the Ceph Object Gateway on Apache and FastCGI with Ceph
-Storage v0.80 or above, you're already running Civetweb--it starts with the
+If you are running the Ceph Object Gateway on Apache and FastCGI with Ceph
+Storage v0.80 or above, you are already running Civetweb--it starts with the
``ceph-radosgw`` daemon and it's running on port 7480 by default so that it
doesn't conflict with your Apache and FastCGI installation and other commonly
used web service ports. Migrating to use Civetweb basically involves removing
radosgw-admin region set < region.json
-Once you've updated your region, update the region map. For example::
+Once you have updated your region, update the region map. For example::
radosgw-admin regionmap update --name client.rgw.ceph-client
#. Generate an administrator keyring, generate a ``client.admin`` user and add
the user to the keyring. ::
- sudo ceph-authtool --create-keyring /etc/ceph/ceph.client.admin.keyring --gen-key -n client.admin --set-uid=0 --cap mon 'allow *' --cap osd 'allow *' --cap mds 'allow'
+ sudo ceph-authtool --create-keyring /etc/ceph/ceph.client.admin.keyring --gen-key -n client.admin --set-uid=0 --cap mon 'allow *' --cap osd 'allow *' --cap mds 'allow' --cap mgr 'allow *'
#. Add the ``client.admin`` key to the ``ceph.mon.keyring``. ::
Without the benefit of any helper utilities, create an OSD and add it to the
cluster and CRUSH map with the following procedure. To create the first two
-OSDs with the long form procedure, execute the following on ``node2`` and
-``node3``:
+OSDs with the long form procedure, execute the following steps for each OSD.
-#. Connect to the OSD host. ::
+.. note:: This procedure does not describe deployment on top of dm-crypt
+ making use of the dm-crypt 'lockbox'.
- ssh {node-name}
-
-#. Generate a UUID for the OSD. ::
-
- uuidgen
+#. Connect to the OSD host and become root. ::
+ ssh {node-name}
+ sudo bash
-#. Create the OSD. If no UUID is given, it will be set automatically when the
- OSD starts up. The following command will output the OSD number, which you
- will need for subsequent steps. ::
-
- ceph osd create [{uuid} [{id}]]
-
-
-#. Create the default directory on your new OSD. ::
-
- ssh {new-osd-host}
- sudo mkdir /var/lib/ceph/osd/{cluster-name}-{osd-number}
-
+#. Generate a UUID for the OSD. ::
-#. If the OSD is for a drive other than the OS drive, prepare it
- for use with Ceph, and mount it to the directory you just created::
+ UUID=$(uuidgen)
- ssh {new-osd-host}
- sudo mkfs -t {fstype} /dev/{hdd}
- sudo mount -o user_xattr /dev/{hdd} /var/lib/ceph/osd/{cluster-name}-{osd-number}
+#. Generate a cephx key for the OSD. ::
-
-#. Initialize the OSD data directory. ::
+ OSD_SECRET=$(ceph-authtool --gen-print-key)
- ssh {new-osd-host}
- sudo ceph-osd -i {osd-num} --mkfs --mkkey --osd-uuid [{uuid}]
+#. Create the OSD. Note that an OSD ID can be provided as an
+ additional argument to ``ceph osd new`` if you need to reuse a
+ previously-destroyed OSD id. We assume that the
+ ``client.bootstrap-osd`` key is present on the machine. You may
+ alternatively execute this command as ``client.admin`` on a
+ different host where that key is present. ::
- The directory must be empty before you can run ``ceph-osd`` with the
- ``--mkkey`` option. In addition, the ceph-osd tool requires specification
- of custom cluster names with the ``--cluster`` option.
-
-
-#. Register the OSD authentication key. The value of ``ceph`` for
- ``ceph-{osd-num}`` in the path is the ``$cluster-$id``. If your
- cluster name differs from ``ceph``, use your cluster name instead.::
+ ID=$(echo "{\"cephx_secret\": \"$OSD_SECRET\"}" | \
+ ceph osd new $UUID -i - \
+ -n client.bootstrap-osd -k /var/lib/ceph/bootstrap-osd/ceph.keyring)
- sudo ceph auth add osd.{osd-num} osd 'allow *' mon 'allow profile osd' -i /var/lib/ceph/osd/{cluster-name}-{osd-num}/keyring
-
-
-#. Add your Ceph Node to the CRUSH map. ::
-
- ceph [--cluster {cluster-name}] osd crush add-bucket {hostname} host
-
- For example::
+#. Create the default directory on your new OSD. ::
- ceph osd crush add-bucket node1 host
+ mkdir /var/lib/ceph/osd/ceph-$ID
+#. If the OSD is for a drive other than the OS drive, prepare it
+ for use with Ceph, and mount it to the directory you just created. ::
-#. Place the Ceph Node under the root ``default``. ::
-
- ceph osd crush move node1 root=default
+ mkfs.xfs /dev/{DEV}
+ mount /dev/{DEV} /var/lib/ceph/osd/ceph-$ID
+#. Write the secret to the OSD keyring file. ::
-#. Add the OSD to the CRUSH map so that it can begin receiving data. You may
- also decompile the CRUSH map, add the OSD to the device list, add the host as a
- bucket (if it's not already in the CRUSH map), add the device as an item in the
- host, assign it a weight, recompile it and set it. ::
+ ceph-authtool --create-keyring /var/lib/ceph/osd/ceph-$ID/keyring \
+ --name osd.$ID --add-key $OSD_SECRET
- ceph [--cluster {cluster-name}] osd crush add {id-or-name} {weight} [{bucket-type}={bucket-name} ...]
+#. Initialize the OSD data directory. ::
- For example::
+ ceph-osd -i $ID --mkfs --osd-uuid $UUID
- ceph osd crush add osd.0 1.0 host=node1
+#. Fix ownership. ::
+ chown -R ceph:ceph /var/lib/ceph/osd/ceph-$ID
#. After you add an OSD to Ceph, the OSD is in your configuration. However,
- it is not yet running. The OSD is ``down`` and ``in``. You must start
+ it is not yet running. You must start
your new OSD before it can begin receiving data.
- For Ubuntu, use Upstart::
-
- sudo start ceph-osd id={osd-num} [cluster={cluster-name}]
+ For modern systemd distributions::
+ systemctl enable ceph-osd@$ID
+ systemctl start ceph-osd@$ID
+
For example::
- sudo start ceph-osd id=0
- sudo start ceph-osd id=1
-
- For Debian/CentOS/RHEL, use sysvinit::
-
- sudo /etc/init.d/ceph start osd.{osd-num} [--cluster {cluster-name}]
-
- For example::
-
- sudo /etc/init.d/ceph start osd.0
- sudo /etc/init.d/ceph start osd.1
-
- In this case, to allow the start of the daemon at each reboot you
- must create an empty file like this::
-
- sudo touch /var/lib/ceph/osd/{cluster-name}-{osd-num}/sysvinit
-
- For example::
-
- sudo touch /var/lib/ceph/osd/ceph-0/sysvinit
- sudo touch /var/lib/ceph/osd/ceph-1/sysvinit
-
- Once you start your OSD, it is ``up`` and ``in``.
-
+ systemctl enable ceph-osd@12
+ systemctl start ceph-osd@12
Adding MDS
#. Generate an administrator keyring, generate a ``client.admin`` user and add
the user to the keyring. ::
- sudo ceph-authtool --create-keyring /etc/ceph/ceph.client.admin.keyring --gen-key -n client.admin --set-uid=0 --cap mon 'allow *' --cap osd 'allow *' --cap mds 'allow'
+ sudo ceph-authtool --create-keyring /etc/ceph/ceph.client.admin.keyring --gen-key -n client.admin --set-uid=0 --cap mon 'allow *' --cap osd 'allow *' --cap mds 'allow' --cap mgr 'allow *'
#. Add the ``client.admin`` key to the ``ceph.mon.keyring``. ::
To list all users in the cluster::
- ceph auth list
+ ceph auth ls
Options
specifies the client 'name', which is used to find the
client-specific configuration options in the config file, and
also is the name used for authentication when connecting
- to the cluster (the entity name appearing in ceph auth list output,
+ to the cluster (the entity name appearing in 'ceph auth ls' output,
for example). The default is 'client.restapi'.
.. option:: -i/--id id
| **ceph** **mon_status**
-| **ceph** **osd** [ *blacklist* \| *blocked-by* \| *create* \| *new* \| *deep-scrub* \| *df* \| *down* \| *dump* \| *erasure-code-profile* \| *find* \| *getcrushmap* \| *getmap* \| *getmaxosd* \| *in* \| *lspools* \| *map* \| *metadata* \| *out* \| *pause* \| *perf* \| *pg-temp* \| *primary-affinity* \| *primary-temp* \| *repair* \| *reweight* \| *reweight-by-pg* \| *rm* \| *destroy* \| *purge* \| *scrub* \| *set* \| *setcrushmap* \| *setmaxosd* \| *stat* \| *tree* \| *unpause* \| *unset* ] ...
+| **ceph** **osd** [ *blacklist* \| *blocked-by* \| *create* \| *new* \| *deep-scrub* \| *df* \| *down* \| *dump* \| *erasure-code-profile* \| *find* \| *getcrushmap* \| *getmap* \| *getmaxosd* \| *in* \| *lspools* \| *map* \| *metadata* \| *out* \| *pause* \| *perf* \| *pg-temp* \| *force-create-pg* \| *primary-affinity* \| *primary-temp* \| *repair* \| *reweight* \| *reweight-by-pg* \| *rm* \| *destroy* \| *purge* \| *scrub* \| *set* \| *setcrushmap* \| *setmaxosd* \| *stat* \| *tree* \| *unpause* \| *unset* ] ...
| **ceph** **osd** **crush** [ *add* \| *add-bucket* \| *create-or-move* \| *dump* \| *get-tunable* \| *link* \| *move* \| *remove* \| *rename-bucket* \| *reweight* \| *reweight-all* \| *reweight-subtree* \| *rm* \| *rule* \| *set* \| *set-tunable* \| *show-tunables* \| *tunables* \| *unlink* ] ...
ceph auth import
-Subcommand ``list`` lists authentication state.
+Subcommand ``ls`` lists authentication state.
Usage::
- ceph auth list
+ ceph auth ls
Subcommand ``print-key`` displays requested key.
Usage::
- ceph config-key list
+ ceph config-key ls
Subcommand ``dump`` dumps configuration keys and values.
ceph config-key dump
-Subcommand ``put`` puts configuration key and value.
+Subcommand ``set`` sets a configuration key and value.
Usage::
- ceph config-key put <key> {<val>}
+ ceph config-key set <key> {<val>}
daemon
ceph df {detail}
+.. _ceph features:
+
+features
+--------
+
+Show the releases and features of all daemons and clients connected
+to the cluster, along with the number of them in each bucket, grouped by the
+corresponding features/releases. Each release of Ceph supports a different set
+of features, expressed by the features bitmask. New cluster features require
+that clients support the feature, or else they are not allowed to connect to
+the cluster. As new features or capabilities are enabled after an
+upgrade, older clients are prevented from connecting.
+
+Usage::
+
+ ceph features
fs
--
ceph mon_status
+mgr
+---
+
+Ceph manager daemon configuration and management.
+
+Subcommand ``dump`` dumps the latest MgrMap, which describes the active
+and standby manager daemons.
+
+Usage::
+
+ ceph mgr dump
+
+Subcommand ``fail`` will mark a manager daemon as failed, removing it
+from the manager map. If it is the active manager daemon a standby
+will take its place.
+
+Usage::
+
+ ceph mgr fail <name>
+
+Subcommand ``module ls`` will list currently enabled manager modules (plugins).
+
+Usage::
+
+ ceph mgr module ls
+
+Subcommand ``module enable`` will enable a manager module. Available modules are included in MgrMap and visible via ``mgr dump``.
+
+Usage::
+
+ ceph mgr module enable <module>
+
+Subcommand ``module disable`` will disable an active manager module.
+
+Usage::
+
+ ceph mgr module disable <module>
+
+Subcommand ``metadata`` will report metadata about all manager daemons or, if the name is specified, a single manager daemon.
+
+Usage::
+
+ ceph mgr metadata [name]
+
+Subcommand ``versions`` will report a count of running daemon versions.
+
+Usage::
+
+ ceph mgr versions
+
+Subcommand ``count-metadata`` will report a count of any daemon metadata field.
+
+Usage::
+
+ ceph mgr count-metadata <field>
+
+
osd
---
ceph osd crush rule dump {<name>}
-Subcommand ``list`` lists crush rules.
-
-Usage::
-
- ceph osd crush rule list
-
Subcommand ``ls`` lists crush rules.
Usage::
ceph osd pg-temp <pgid> {<id> [<id>...]}
+Subcommand ``force-create-pg`` forces creation of pg <pgid>.
+
+Usage::
+
+ ceph osd force-create-pg <pgid>
+
+
Subcommand ``pool`` is used for managing data pools. It uses some additional
subcommands.
ceph osd reweight-by-utilization {<int[100-]>}
{--no-increasing}
-Subcommand ``rm`` removes osd(s) <id> [<id>...] in the cluster.
+Subcommand ``rm`` removes osd(s) <id> [<id>...] from the OSD map.
+
Usage::
ceph osd setmaxosd <int[0-]>
+Subcommand ``set-require-min-compat-client`` requires the cluster to remain backward
+compatible with the specified client version. This subcommand prevents you from
+making any changes (e.g., crush tunables, or using new features) that
+would violate the current setting. Please note that this subcommand will fail if
+any connected daemon or client is not compatible with the features offered by
+the given <version>. To see the features and releases of all clients connected
+to the cluster, please see `ceph features`_.
+
+Usage::
+
+ ceph osd set-require-min-compat-client <version>
+
Subcommand ``stat`` prints summary of OSD map.
Usage::
ceph pg dump_stuck {inactive|unclean|stale|undersized|degraded [inactive|unclean|stale|undersized|degraded...]}
{<int>}
-Subcommand ``force_create_pg`` forces creation of pg <pgid>.
-
-Usage::
-
- ceph pg force_create_pg <pgid>
-
Subcommand ``getmap`` gets binary pg map to -o/stdout.
Usage::
:command:`getomapval` [ --omap-key-file *file* ] *name* *key* [ *out-file* ]
Dump the hexadecimal value of key in the object map of object name.
- If the optional *out-file* argument isn't provided, the value will be
+ If the optional *out-file* argument is not provided, the value will be
written to standard output.
:command:`setomapval` [ --omap-key-file *file* ] *name* *key* [ *value* ]
Set the value of key in the object map of object name. If the optional
- *value* argument isn't provided, the value will be read from standard
+ *value* argument is not provided, the value will be read from standard
input.
:command:`rmomapkey` [ --omap-key-file *file* ] *name* *key*
associated snapshots within the specified pool. It can also be used against
individual images and snapshots.
- If the RBD fast-diff feature isn't enabled on images, this operation will
+ If the RBD fast-diff feature is not enabled on images, this operation will
require querying the OSDs for every potential object within the image.
:command:`info` *image-spec* | *snap-spec*
This requires image format 2.
:command:`resize` (-s | --size *size-in-M/G/T*) [--allow-shrink] *image-spec*
- Resizes rbd image. The size parameter also needs to be specified.
+ Resize rbd image. The size parameter also needs to be specified.
The --allow-shrink option lets the size be reduced.
:command:`rm` *image-spec*
- Deletes an rbd image (including all data blocks). If the image has
+ Delete an rbd image (including all data blocks). If the image has
snapshots, this fails and nothing is deleted.
:command:`export` [--export-format *format (1 or 2)*] (*image-spec* | *snap-spec*) [*dest-path*]
- Exports image to dest path (use - for stdout).
+ Export image to dest path (use - for stdout).
The --export-format accepts '1' or '2' currently. Format 2 allow us to export not only the content
of image, but also the snapshots and other properties, such as image_order, features.
:command:`import` [--export-format *format (1 or 2)*] [--image-format *format-id*] [--object-size *size-in-B/K/M*] [--stripe-unit *size-in-B/K/M* --stripe-count *num*] [--image-feature *feature-name*]... [--image-shared] *src-path* [*image-spec*]
- Creates a new image and imports its data from path (use - for
+ Create a new image and import its data from path (use - for
stdin). The import operation will try to create sparse rbd images
if possible. For import from stdin, the sparsification unit is
the data block size of the destination image (object size).
of image, but also the snapshots and other properties, such as image_order, features.
:command:`export-diff` [--from-snap *snap-name*] [--whole-object] (*image-spec* | *snap-spec*) *dest-path*
- Exports an incremental diff for an image to dest path (use - for stdout). If
+ Export an incremental diff for an image to dest path (use - for stdout). If
an initial snapshot is specified, only changes since that snapshot are included; otherwise,
any regions of the image that contain data are included. The end snapshot is specified
using the standard --snap option or @snap syntax (see below). The image diff format includes
currently only support the source incremental diff with stripe_count == 1
:command:`import-diff` *src-path* *image-spec*
- Imports an incremental diff of an image and applies it to the current image. If the diff
+ Import an incremental diff of an image and apply it to the current image. If the diff
was generated relative to a start snapshot, we verify that snapshot already exists before
continuing. If there was an end snapshot we verify it does not already exist before
applying the changes, and create the snapshot when we are done.
whether the region is known to be zeros or may contain other data.
:command:`cp` (*src-image-spec* | *src-snap-spec*) *dest-image-spec*
- Copies the content of a src-image into the newly created dest-image.
+ Copy the content of a src-image into the newly created dest-image.
dest-image will have the same size, object size, and image format as src-image.
:command:`mv` *src-image-spec* *dest-image-spec*
- Renames an image. Note: rename across pools is not supported.
+ Rename an image. Note: rename across pools is not supported.
:command:`image-meta list` *image-spec*
Show metadata held on the image. The first column is the key
Remove metadata key with the value.
:command:`object-map rebuild` *image-spec* | *snap-spec*
- Rebuilds an invalid object map for the specified image. An image snapshot can be
+ Rebuild an invalid object map for the specified image. An image snapshot can be
specified to rebuild an invalid object map for a snapshot.
:command:`snap ls` *image-spec*
- Dumps the list of snapshots inside a specific image.
+ Dump the list of snapshots inside a specific image.
:command:`snap create` *snap-spec*
- Creates a new snapshot. Requires the snapshot name parameter specified.
+ Create a new snapshot. Requires the snapshot name parameter specified.
:command:`snap rollback` *snap-spec*
Rollback image content to snapshot. This will iterate through the entire blocks
array and update the data head content to the snapshotted version.
:command:`snap rm` [--force] *snap-spec*
- Removes the specified snapshot.
+ Remove the specified snapshot.
:command:`snap purge` *image-spec*
- Removes all snapshots from an image.
+ Remove all snapshots from an image.
:command:`snap protect` *snap-spec*
Protect a snapshot from deletion, so that clones can be made of it
an image.
:command:`map` [-o | --options *krbd-options* ] [--read-only] *image-spec* | *snap-spec*
- Maps the specified image to a block device via the rbd kernel module.
+ Map the specified image to a block device via the rbd kernel module.
:command:`unmap` [-o | --options *krbd-options* ] *image-spec* | *snap-spec* | *device-path*
- Unmaps the block device that was mapped via the rbd kernel module.
+ Unmap the block device that was mapped via the rbd kernel module.
:command:`showmapped`
Show the rbd images that are mapped via the rbd kernel module.
:command:`nbd map` [--device *device-path*] [--read-only] *image-spec* | *snap-spec*
- Maps the specified image to a block device via the rbd-nbd tool.
+ Map the specified image to a block device via the rbd-nbd tool.
:command:`nbd unmap` *device-path*
- Unmaps the block device that was mapped via the rbd-nbd tool.
+ Unmap the block device that was mapped via the rbd-nbd tool.
:command:`nbd list`
Show the list of used nbd devices via the rbd-nbd tool.
Show the status of the image, including which clients have it open.
:command:`feature disable` *image-spec* *feature-name*...
- Disables the specified feature on the specified image. Multiple features can
+ Disable the specified feature on the specified image. Multiple features can
be specified.
:command:`feature enable` *image-spec* *feature-name*...
- Enables the specified feature on the specified image. Multiple features can
+ Enable the specified feature on the specified image. Multiple features can
be specified.
:command:`lock list` *image-spec*
--io-total. Defaults are: --io-size 4096, --io-threads 16, --io-total 1G,
--io-pattern seq.
+:command:`trash ls` [*pool-name*]
+ List all entries from trash.
+
+:command:`trash mv` *image-spec*
+ Move an image to the trash. Images, even ones actively in-use by
+ clones, can be moved to the trash and deleted at a later time.
+
+:command:`trash rm` *image-id*
+ Delete an image from trash. If the image deferment time has not expired,
+ you cannot remove it unless you use force. However, an image that is actively
+ in use by clones or has snapshots cannot be removed.
+
+:command:`trash restore` *image-id*
+ Restore an image from trash.
+
Image and snap specs
====================
rbd lock remove mypool/myimage mylockid client.2485
+To list images from trash::
+
+ rbd trash ls mypool
+
+To defer delete an image (use *--delay* to set delay-time, default is 0)::
+
+ rbd trash mv mypool/myimage
+
+To delete an image from trash (be careful!)::
+
+ rbd trash rm mypool/myimage-id
+
+To force delete an image from trash (be careful!)::
+
+ rbd trash rm mypool/myimage-id --force
+
+To restore an image from trash::
+
+ rbd trash restore mypool/myimage-id
+
+To restore an image from trash and rename it::
+
+ rbd trash restore mypool/myimage-id --image mynewimage
+
Availability
============
be unmounted and unmapped.
``rbdmap unmap-all`` attempts to unmount and subsequently unmap all currently
-mapped RBD images, regardless of whether or not they're listed in the
+mapped RBD images, regardless of whether or not they are listed in the
configuration file.
If successful, the ``rbd map`` operation maps the image to a ``/dev/rbdX``
Calling module commands
-----------------------
-Where a module implements command line hooks, using the Ceph CLI's
-``tell`` command to call them like this::
+Where a module implements command line hooks, the commands will
+be accessible as ordinary Ceph commands::
- ceph tell mgr <command | help>
+ ceph <command | help>
+
+If you would like to see the list of commands handled by the
+manager (where normal ``ceph help`` would show all mon and mgr commands),
+you can send a command directly to the manager daemon::
+
+ ceph tell mgr help
Note that it is not necessary to address a particular mgr instance,
simply ``mgr`` will pick the current active daemon.
-Use the ``help`` command to get a list of available commands from all
-modules.
-
Configuration
-------------
also be necessary to configure them separately. The hostname and port
can be changed via the configuration key facility::
- ceph config-key put mgr/dashboard/$name/server_addr $IP
- ceph config-key put mgr/dashboard/$name/server_port $PORT
+ ceph config-key set mgr/dashboard/$name/server_addr $IP
+ ceph config-key set mgr/dashboard/$name/server_port $PORT
where ``$name`` is the ID of the ceph-mgr who is hosting this
dashboard web app.
These settings can also be configured cluster-wide and not manager
specific. For example,::
- ceph config-key put mgr/dashboard/server_addr $IP
- ceph config-key put mgr/dashboard/server_port $PORT
+ ceph config-key set mgr/dashboard/server_addr $IP
+ ceph config-key set mgr/dashboard/server_port $PORT
If the port is not configured, the web app will bind to port ``7000``.
If the address it not configured, the web app will bind to ``::``,
:maxdepth: 1
Installation and Configuration <administrator>
- Dashboard <dashboard>
- RESTful <restful>
- Zabbix <zabbix>
+ Dashboard plugin <dashboard>
+ RESTful plugin <restful>
+ Zabbix plugin <zabbix>
+ Prometheus plugin <prometheus>
Writing plugins <plugins>
--- /dev/null
+Prometheus plugin
+=================
+
+Provides a Prometheus exporter to pass on Ceph performance counters
+from the collection point in ceph-mgr. Ceph-mgr receives MMgrReport
+messages from all MgrClient processes (mons and OSDs, for instance)
+with performance counter schema data and actual counter data, and keeps
+a circular buffer of the last N samples. This plugin creates an HTTP
+endpoint (like all Prometheus exporters) and retrieves the latest sample
+of every counter when polled (or "scraped" in Prometheus terminology).
+The HTTP path and query parameters are ignored; all extant counters
+for all reporting entities are returned in text exposition format.
+(See the Prometheus `documentation <https://prometheus.io/docs/instrumenting/exposition_formats/#text-format-details>`_.)
+
+Enabling
+--------
+
+The *prometheus* module is enabled with::
+
+ ceph mgr module enable prometheus
+
+Configuration
+-------------
+
+By default the module will accept HTTP requests on port ``9283`` on all
+IPv4 and IPv6 addresses on the host. The port and listen address are both
+configurable with ``ceph config-key set``, with keys
+``mgr/prometheus/server_addr`` and ``mgr/prometheus/server_port``.
+This port is registered with Prometheus's `registry <https://github.com/prometheus/prometheus/wiki/Default-port-allocations>`_.
+
+Notes
+-----
+
+Counters and gauges are exported; currently histograms and long-running
+averages are not. It's possible that Ceph's 2-D histograms could be
+reduced to two separate 1-D histograms, and that long-running averages
+could be exported as Prometheus' Summary type.
+
+The names of the stats are exactly as Ceph names them, with
+illegal characters ``.`` and ``-`` translated to ``_``. There is one
+label applied, ``daemon``, and its value is the daemon.id for the
+daemon in question (e.g. ``{daemon=mon.hosta}`` or ``{daemon=osd.11}``).
+
+Timestamps, as with many Prometheus exporters, are established by
+the server's scrape time (Prometheus expects that it is polling the
+actual counter process synchronously). It is possible to supply a
+timestamp along with the stat report, but the Prometheus team strongly
+advises against this. This means that timestamps will be delayed by
+an unpredictable amount; it's not clear if this will be problematic,
+but it's worth knowing about.
The ``restful.crt`` should then be signed by your organization's CA
(certificate authority). Once that is done, you can set it with::
- ceph config-key put mgr/restful/$name/crt -i restful.crt
- ceph config-key put mgr/restful/$name/key -i restful.key
+ ceph config-key set mgr/restful/$name/crt -i restful.crt
+ ceph config-key set mgr/restful/$name/key -i restful.key
where ``$name`` is the name of the ``ceph-mgr`` instance (usually the
hostname). If all manager instances are to share the same certificate,
you can leave off the ``$name`` portion::
- ceph config-key put mgr/restful/crt -i restful.crt
- ceph config-key put mgr/restful/key -i restful.key
+ ceph config-key set mgr/restful/crt -i restful.crt
+ ceph config-key set mgr/restful/key -i restful.key
Configuring IP and port
also be necessary to configure them separately. The IP and port
can be changed via the configuration key facility::
- ceph config-key put mgr/restful/$name/server_addr $IP
- ceph config-key put mgr/restful/$name/server_port $PORT
+ ceph config-key set mgr/restful/$name/server_addr $IP
+ ceph config-key set mgr/restful/$name/server_port $PORT
where ``$name`` is the ID of the ceph-mgr daemon (usually the hostname).
These settings can also be configured cluster-wide and not manager
specific. For example,::
- ceph config-key put mgr/restful/server_addr $IP
- ceph config-key put mgr/restful/server_port $PORT
+ ceph config-key set mgr/restful/server_addr $IP
+ ceph config-key set mgr/restful/server_port $PORT
If the port is not configured, *restful* will bind to port ``8003``.
If the address it not configured, the *restful* will bind to ``::``,
- Storage utilization
Requirements
-============
+------------
The plugin requires that the *zabbix_sender* executable is present on *all*
machines running ceph-mgr. It can be installed on most distributions using
the package manager.
Dependencies
-------------
+^^^^^^^^^^^^
Installing zabbix_sender can be done under Ubuntu or CentOS using either apt
or dnf.
Enabling
-========
+--------
Add this to your ceph.conf on nodes where you run ceph-mgr:
Configuration
-=============
+-------------
Two configuration keys are mandatory for the module to work:
- mgr/zabbix/zabbix_sender: /usr/bin/zabbix_sender
- mgr/zabbix/interval: 60
-Configurations keys
--------------------
+Configuration keys
+^^^^^^^^^^^^^^^^^^^
Configuration keys can be set on any machine with the proper cephx credentials,
these are usually Monitors where the *client.admin* key is present.
::
- ceph config-key put <key> <value>
+ ceph config-key set <key> <value>
For example:
::
- ceph config-key put mgr/zabbix/zabbix_host zabbix.localdomain
- ceph config-key put mgr/zabbix/identifier ceph.eu-ams02.local
+ ceph config-key set mgr/zabbix/zabbix_host zabbix.localdomain
+ ceph config-key set mgr/zabbix/identifier ceph.eu-ams02.local
Debugging
-=========
+---------
Should you want to debug the Zabbix module increase the logging level for
ceph-mgr and check the logs.
exit(1);
}
-In the end, you'll want to close your IO context and connection to RADOS with :c:func:`rados_ioctx_destroy()` and :c:func:`rados_shutdown()`::
+In the end, you will want to close your IO context and connection to RADOS with :c:func:`rados_ioctx_destroy()` and :c:func:`rados_shutdown()`::
rados_ioctx_destroy(io);
rados_shutdown(cluster);
ceph daemon osd.0 config show | less
+Reading Configuration Metadata at Runtime
+=========================================
+
+Information about the available configuration options is available via
+the ``config help`` command:
+
+::
+
+ ceph daemon {daemon-type}.{id} config help | less
+
+
+This metadata is primarily intended to be used when integrating other
+software with Ceph, such as graphical user interfaces. The output is
+a list of JSON objects, for example:
+
+::
+
+ {
+ "name": "mon_host",
+ "type": "std::string",
+ "level": "basic",
+ "desc": "list of hosts or addresses to search for a monitor",
+ "long_desc": "This is a comma, whitespace, or semicolon separated list of IP addresses or hostnames. Hostnames are resolved via DNS and all A or AAAA records are included in the search list.",
+ "default": "",
+ "daemon_default": "",
+ "tags": [],
+ "services": [
+ "common"
+ ],
+ "see_also": [],
+ "enum_values": [],
+ "min": "",
+ "max": ""
+ }
+
+type
+____
+
+The type of the setting, given as a C++ type name.
+
+level
+_____
+
+One of `basic`, `advanced`, `dev`. The `dev` options are not intended
+for use outside of development and testing.
+
+desc
+____
+
+A short description -- this is a sentence fragment suitable for display
+in small spaces like a single line in a list.
+
+long_desc
+_________
+
+A full description of what the setting does; this may be as long as needed.
+
+default
+_______
+
+The default value, if any.
+
+daemon_default
+______________
+
+An alternative default used for daemons (services) as opposed to clients.
+
+tags
+____
+
+A list of strings indicating topics to which this setting relates. Examples
+of tags are `performance` and `networking`.
+
+services
+________
+
+A list of strings indicating which Ceph services the setting relates to, such
+as `osd`, `mds`, `mon`. For settings that are relevant to any Ceph client
+or server, `common` is used.
+
+see_also
+________
+
+A list of strings indicating other configuration options that may also
+be of interest to a user setting this option.
+
+enum_values
+___________
+
+Optional: a list of strings indicating the valid settings.
+
+min, max
+________
+
+Optional: lower and upper (inclusive) bounds on valid settings.
+
+
+
Running Multiple Clusters
=========================
If Ceph Monitors discovered each other through the Ceph configuration file
instead of through the monmap, it would introduce additional risks because the
-Ceph configuration files aren't updated and distributed automatically. Ceph
+Ceph configuration files are not updated and distributed automatically. Ceph
Monitors might inadvertently use an older Ceph configuration file, fail to
recognize a Ceph Monitor, fall out of a quorum, or develop a situation where
-`Paxos`_ isn't able to determine the current state of the system accurately.
+`Paxos`_ is not able to determine the current state of the system accurately.
.. index:: Ceph Monitor; bootstrapping monitors
.. tip:: You SHOULD install NTP on your Ceph monitor hosts to
ensure that the monitor cluster operates with synchronized clocks.
-Clock drift may still be noticeable with NTP even though the discrepancy isn't
+Clock drift may still be noticeable with NTP even though the discrepancy is not
yet harmful. Ceph's clock drift / clock skew warnings may get triggered even
though NTP maintains a reasonable level of synchronization. Increasing your
clock drift may be tolerable under such circumstances; however, a number of
:Description: Initial number of worker threads used by each Async Messenger instance.
Should be at least equal to highest number of replicas, but you can
- decrease it if you're low on CPU core count and/or you host a lot of
+ decrease it if you are low on CPU core count and/or you host a lot of
OSDs on single server.
:Type: 64-bit Unsigned Integer
:Required: No
workers #1 and #2 to CPU cores #0 and #2, respectively.
NOTE: when manually setting affinity, make sure to not assign workers to
processors that are virtual CPUs created as an effect of Hyperthreading
- or similar technology, because they're slower than regular CPU cores.
+ or similar technology, because they are slower than regular CPU cores.
:Type: String
:Required: No
:Default: ``(empty)``
.. note:: Ceph uses `CIDR`_ notation for subnets (e.g., ``10.0.0.0/24``).
-When you've configured your networks, you may restart your cluster or restart
+When you have configured your networks, you may restart your cluster or restart
each daemon. Ceph daemons bind dynamically, so you do not have to restart the
entire cluster at once if you change your network configuration.
``ms bind ipv6``
:Description: Enables Ceph daemons to bind to IPv6 addresses. Currently the
- messenger *either* uses IPv4 or IPv6, but it can't do both.
+ messenger *either* uses IPv4 or IPv6, but it cannot do both.
:Type: Boolean
:Default: ``false``
:Required: No
``osd client message size cap``
:Description: The largest client data message allowed in memory.
-:Type: 64-bit Integer Unsigned
+:Type: 64-bit Unsigned Integer
:Default: 500MB default. ``500*1024L*1024L``
token bucket system which when there are sufficient tokens will
dequeue high priority queues first. If there are not enough
tokens available, queues are dequeued low priority to high priority.
- The new WeightedPriorityQueue (``wpq``) dequeues all priorities in
+ The WeightedPriorityQueue (``wpq``) dequeues all priorities in
relation to their priorities to prevent starvation of any queue.
WPQ should help in cases where a few OSDs are more overloaded
- than others. Requires a restart.
+ than others. The new mClock based OpClassQueue
+ (``mclock_opclass``) prioritizes operations based on which class
+ they belong to (recovery, scrub, snaptrim, client op, osd subop).
+ And, the mClock based ClientQueue (``mclock_client``) also
+ incorporates the client identifier in order to promote fairness
+ between clients. See `QoS Based on mClock`_. Requires a restart.
:Type: String
-:Valid Choices: prio, wpq
+:Valid Choices: prio, wpq, mclock_opclass, mclock_client
:Default: ``prio``
:Type: 32-bit Integer
:Default: ``5``
+
+QoS Based on mClock
+-------------------
+
+Ceph's use of mClock is currently in the experimental phase and should
+be approached with an exploratory mindset.
+
+Core Concepts
+`````````````
+
+The QoS support of Ceph is implemented using a queueing scheduler
+based on `the dmClock algorithm`_. This algorithm allocates the I/O
+resources of the Ceph cluster in proportion to weights, and enforces
+the constraints of minimum reservation and maximum limitation, so that
+the services can compete for the resources fairly. Currently the
+*mclock_opclass* operation queue divides Ceph services involving I/O
+resources into following buckets:
+
+- client op: the iops issued by client
+- osd subop: the iops issued by primary OSD
+- snap trim: the snap trimming related requests
+- pg recovery: the recovery related requests
+- pg scrub: the scrub related requests
+
+And the resources are partitioned using following three sets of tags. In other
+words, the share of each type of service is controlled by three tags:
+
+#. reservation: the minimum IOPS allocated for the service.
+#. limitation: the maximum IOPS allocated for the service.
+#. weight: the proportional share of capacity if there is extra capacity or
+ the system is oversubscribed.
+
+In Ceph operations are graded with "cost". And the resources allocated
+for serving various services are consumed by these "costs". So, for
+example, the more reservation a service has, the more resource it is
+guaranteed to possess, as long as it requires. Assuming there are 2
+services: recovery and client ops:
+
+- recovery: (r:1, l:5, w:1)
+- client ops: (r:2, l:0, w:9)
+
+The settings above ensure that the recovery won't get more than 5
+requests per second serviced, even if it requires so (see CURRENT
+IMPLEMENTATION NOTE below), and no other services are competing with
+it. But if the clients start to issue large amount of I/O requests,
+neither will they exhaust all the I/O resources. 1 request per second
+is always allocated for recovery jobs as long as there are any such
+requests. So the recovery jobs won't be starved even in a cluster with
+high load. And in the meantime, the client ops can enjoy a larger
+portion of the I/O resource, because its weight is "9", while its
+competitor "1". In the case of client ops, it is not clamped by the
+limit setting, so it can make use of all the resources if there is no
+recovery ongoing.
+
+Along with *mclock_opclass* another mclock operation queue named
+*mclock_client* is available. It divides operations based on category
+but also divides them based on the client making the request. This
+helps not only manage the distribution of resources spent on different
+classes of operations but also tries to ensure fairness among clients.
+
+CURRENT IMPLEMENTATION NOTE: the current experimental implementation
+does not enforce the limit values. As a first approximation we decided
+not to prevent operations that would otherwise enter the operation
+sequencer from doing so.
+
+Subtleties of mClock
+````````````````````
+
+The reservation and limit values have a unit of requests per
+second. The weight, however, does not technically have a unit and the
+weights are relative to one another. So if one class of requests has a
+weight of 1 and another a weight of 9, then the latter class of
+requests should get executed at a 9 to 1 ratio relative to the first class.
+However that will only happen once the reservations are met and those
+values include the operations executed under the reservation phase.
+
+Even though the weights do not have units, one must be careful in
+choosing their values due to how the algorithm assigns weight tags to
+requests. If the weight is *W*, then for a given class of requests,
+the next one that comes in will have a weight tag of *1/W* plus the
+previous weight tag or the current time, whichever is larger. That
+means if *W* is sufficiently large and therefore *1/W* is sufficiently
+small, the calculated tag may never be assigned as it will get a value
+of the current time. The ultimate lesson is that values for weight
+should not be too large. They should be under the number of requests
+one expects to be serviced each second.
+
+Caveats
+```````
+
+There are some factors that can reduce the impact of the mClock op
+queues within Ceph. First, requests to an OSD are sharded by their
+placement group identifier. Each shard has its own mClock queue and
+these queues neither interact nor share information among them. The
+number of shards can be controlled with the configuration options
+``osd_op_num_shards``, ``osd_op_num_shards_hdd``, and
+``osd_op_num_shards_ssd``. A lower number of shards will increase the
+impact of the mClock queues, but may have other deleterious effects.
+
+Second, requests are transferred from the operation queue to the
+operation sequencer, in which they go through the phases of
+execution. The operation queue is where mClock resides and mClock
+determines the next op to transfer to the operation sequencer. The
+number of operations allowed in the operation sequencer is a complex
+issue. In general we want to keep enough operations in the sequencer
+so it's always getting work done on some operations while it's waiting
+for disk and network access to complete on other operations. On the
+other hand, once an operation is transferred to the operation
+sequencer, mClock no longer has control over it. Therefore to maximize
+the impact of mClock, we want to keep as few operations in the
+operation sequencer as possible. So we have an inherent tension.
+
+The configuration options that influence the number of operations in
+the operation sequencer are ``bluestore_throttle_bytes``,
+``bluestore_throttle_deferred_bytes``,
+``bluestore_throttle_cost_per_io``,
+``bluestore_throttle_cost_per_io_hdd``, and
+``bluestore_throttle_cost_per_io_ssd``.
+
+A third factor that affects the impact of the mClock algorithm is that
+we're using a distributed system, where requests are made to multiple
+OSDs and each OSD has (can have) multiple shards. Yet we're currently
+using the mClock algorithm, which is not distributed (note: dmClock is
+the distributed version of mClock).
+
+Various organizations and individuals are currently experimenting with
+mClock as it exists in this code base along with their modifications
+to the code base. We hope you'll share your experiences with your
+mClock and dmClock experiments in the ceph-devel mailing list.
+
+
+``osd push per object cost``
+
+:Description: the overhead for serving a push op
+
+:Type: Unsigned Integer
+:Default: 1000
+
+``osd recovery max chunk``
+
+:Description: the maximum total size of data chunks a recovery op can carry.
+
+:Type: Unsigned Integer
+:Default: 8 MiB
+
+
+``osd op queue mclock client op res``
+
+:Description: the reservation of client op.
+
+:Type: Float
+:Default: 1000.0
+
+
+``osd op queue mclock client op wgt``
+
+:Description: the weight of client op.
+
+:Type: Float
+:Default: 500.0
+
+
+``osd op queue mclock client op lim``
+
+:Description: the limit of client op.
+
+:Type: Float
+:Default: 1000.0
+
+
+``osd op queue mclock osd subop res``
+
+:Description: the reservation of osd subop.
+
+:Type: Float
+:Default: 1000.0
+
+
+``osd op queue mclock osd subop wgt``
+
+:Description: the weight of osd subop.
+
+:Type: Float
+:Default: 500.0
+
+
+``osd op queue mclock osd subop lim``
+
+:Description: the limit of osd subop.
+
+:Type: Float
+:Default: 0.0
+
+
+``osd op queue mclock snap res``
+
+:Description: the reservation of snap trimming.
+
+:Type: Float
+:Default: 0.0
+
+
+``osd op queue mclock snap wgt``
+
+:Description: the weight of snap trimming.
+
+:Type: Float
+:Default: 1.0
+
+
+``osd op queue mclock snap lim``
+
+:Description: the limit of snap trimming.
+
+:Type: Float
+:Default: 0.001
+
+
+``osd op queue mclock recov res``
+
+:Description: the reservation of recovery.
+
+:Type: Float
+:Default: 0.0
+
+
+``osd op queue mclock recov wgt``
+
+:Description: the weight of recovery.
+
+:Type: Float
+:Default: 1.0
+
+
+``osd op queue mclock recov lim``
+
+:Description: the limit of recovery.
+
+:Type: Float
+:Default: 0.001
+
+
+``osd op queue mclock scrub res``
+
+:Description: the reservation of scrub jobs.
+
+:Type: Float
+:Default: 0.0
+
+
+``osd op queue mclock scrub wgt``
+
+:Description: the weight of scrub jobs.
+
+:Type: Float
+:Default: 1.0
+
+
+``osd op queue mclock scrub lim``
+
+:Description: the limit of scrub jobs.
+
+:Type: Float
+:Default: 0.001
+
+.. _the dmClock algorithm: https://www.usenix.org/legacy/event/osdi10/tech/full_papers/Gulati.pdf
+
+
.. index:: OSD; backfilling
Backfilling
``osd recovery max chunk``
:Description: The maximum size of a recovered chunk of data to push.
-:Type: 64-bit Integer Unsigned
+:Type: 64-bit Unsigned Integer
:Default: ``8 << 20``
:Description: The maximum number of recovery operations per OSD that will be
newly started when an OSD is recovering.
-:Type: 64-bit Integer Unsigned
+:Type: 64-bit Unsigned Integer
:Default: ``1``
``osd recovery sleep``
-:Description: Time to sleep before next recovery. Increasing this value will
- slow down recovery operation while client operations will be
- less impacted.
+:Description: Time in seconds to sleep before next recovery or backfill op.
+ Increasing this value will slow down recovery operation while
+ client operations will be less impacted.
:Type: Float
-:Default: ``0.01``
+:Default: ``0``
+
+
+``osd recovery sleep hdd``
+
+:Description: Time in seconds to sleep before next recovery or backfill op
+ for HDDs.
+
+:Type: Float
+:Default: ``0.1``
+
+
+``osd recovery sleep ssd``
+
+:Description: Time in seconds to sleep before next recovery or backfill op
+ for SSDs.
+
+:Type: Float
+:Default: ``0``
Tiering
=======
``osd default notify timeout``
:Description: The OSD default notification timeout (in seconds).
-:Type: 32-bit Integer Unsigned
+:Type: 32-bit Unsigned Integer
:Default: ``30``
.. note:: If you have specified multiple monitors in the setup of the cluster,
make sure, that all monitors are up and running. If the monitors haven't
- formed quorum, ``ceph-create-keys`` will not finish and the keys aren't
+ formed quorum, ``ceph-create-keys`` will not finish and the keys are not
generated.
Forget Keys
If monitors discovered each other through the Ceph configuration file instead of
through the monmap, it would introduce additional risks because the Ceph
-configuration files aren't updated and distributed automatically. Monitors
+configuration files are not updated and distributed automatically. Monitors
might inadvertently use an older ``ceph.conf`` file, fail to recognize a
-monitor, fall out of a quorum, or develop a situation where `Paxos`_ isn't able
+monitor, fall out of a quorum, or develop a situation where `Paxos`_ is not able
to determine the current state of the system accurately. Consequently, making
changes to an existing monitor's IP address must be done with great care.
subsequent releases.
+Replacing an OSD
+----------------
+
+When disks fail, or if an administrator wants to reprovision OSDs with a new
+backend, for instance, for switching from FileStore to BlueStore, OSDs need to
+be replaced. Unlike `Removing the OSD`_, replaced OSD's id and CRUSH map entry
+need to be kept intact after the OSD is destroyed for replacement.
+
+#. Destroy the OSD first::
+
+ ceph osd destroy {id} --yes-i-really-mean-it
+
+#. Zap a disk for the new OSD, if the disk was used before for other purposes.
+ It's not necessary for a new disk::
+
+ ceph-disk zap /dev/sdX
+
+#. Prepare the disk for replacement by using the previously destroyed OSD id::
+
+ ceph-disk prepare --bluestore /dev/sdX --osd-id {id} --osd-uuid `uuidgen`
+
+#. And activate the OSD::
+
+ ceph-disk activate /dev/sdX1
+
+
Starting the OSD
----------------
After that, you can observe the data migration which should come to its
end. The difference between marking ``out`` the OSD and reweighting it
to 0 is that in the first case the weight of the bucket which contains
- the OSD isn't changed whereas in the second case the weight of the bucket
+ the OSD is not changed whereas in the second case the weight of the bucket
is updated (and decreased of the OSD weight). The reweight command could
be sometimes favoured in the case of a "small" cluster.
``ceph.conf`` file. If your host has multiple drives, you may need to remove an
OSD for each drive by repeating this procedure.
+#. Let the cluster forget the OSD first. This step removes the OSD from the CRUSH
+ map, removes its authentication key. And it is removed from the OSD map as
+ well. Please note the `purge subcommand`_ is introduced in Luminous, for older
+ versions, please see below ::
+
+ ceph osd purge {id} --yes-i-really-mean-it
+
+#. Navigate to the host where you keep the master copy of the cluster's
+ ``ceph.conf`` file. ::
+
+ ssh {admin-host}
+ cd /etc/ceph
+ vim ceph.conf
+
+#. Remove the OSD entry from your ``ceph.conf`` file (if it exists). ::
+
+ [osd.1]
+ host = {hostname}
+
+#. From the host where you keep the master copy of the cluster's ``ceph.conf`` file,
+ copy the updated ``ceph.conf`` file to the ``/etc/ceph`` directory of other
+ hosts in your cluster.
+
+If your Ceph cluster is older than Luminous, instead of using ``ceph osd purge``,
+you need to perform this step manually:
+
#. Remove the OSD from the CRUSH map so that it no longer receives data. You may
also decompile the CRUSH map, remove the OSD from the device list, remove the
ceph osd rm {osd-num}
#for example
ceph osd rm 1
-
-#. Navigate to the host where you keep the master copy of the cluster's
- ``ceph.conf`` file. ::
- ssh {admin-host}
- cd /etc/ceph
- vim ceph.conf
-
-#. Remove the OSD entry from your ``ceph.conf`` file (if it exists). ::
-
- [osd.1]
- host = {hostname}
-
-#. From the host where you keep the master copy of the cluster's ``ceph.conf`` file,
- copy the updated ``ceph.conf`` file to the ``/etc/ceph`` directory of other
- hosts in your cluster.
-
-
.. _Remove an OSD: ../crush-map#removeosd
+.. _purge subcommand: /man/8/ceph#osd
To list the cluster's keys and their capabilities, execute the following::
- ceph auth list
+ ceph auth ls
Placement Group Subsystem
Move an existing bucket from one position in the hierarchy to another. ::
- ceph osd crush move {id} {loc1} [{loc2} ...]
+ ceph osd crush move {id} {loc1} [{loc2} ...]
Set the weight of the item given by ``{name}`` to ``{weight}``. ::
ceph osd crush reweight {name} {weight}
-Create a cluster snapshot. ::
-
- ceph osd cluster_snap {name}
-
Mark an OSD as lost. This may result in permanent data loss. Use with caution. ::
ceph osd lost {id} [--yes-i-really-mean-it]
ceph osd in {osd-num}
-List classes that are loaded in the ceph cluster. ::
-
- ceph class list
-
Set or clear the pause flags in the OSD map. If set, no IO requests
will be sent to any OSD. Clearing the flags via unpause results in
resending pending requests. ::
and forces CRUSH to re-place (1-weight) of the data that would
otherwise live on this drive. It does not change the weights assigned
to the buckets above the OSD in the crush map, and is a corrective
-measure in case the normal CRUSH distribution isn't working out quite
+measure in case the normal CRUSH distribution is not working out quite
right. For instance, if one of your OSDs is at 90% and the others are
at 50%, you could reduce this weight to try and compensate for it. ::
--- /dev/null
+Manually editing a CRUSH Map
+============================
+
+.. note:: Manually editing the CRUSH map is considered an advanced
+ administrator operation. All CRUSH changes that are
+ necessary for the overwhelming majority of installations are
+ possible via the standard ceph CLI and do not require manual
+ CRUSH map edits. If you have identified a use case where
+ manual edits *are* necessary, consider contacting the Ceph
+ developers so that future versions of Ceph can make this
+ unnecessary.
+
+To edit an existing CRUSH map:
+
+#. `Get the CRUSH map`_.
+#. `Decompile`_ the CRUSH map.
+#. Edit at least one of `Devices`_, `Buckets`_ and `Rules`_.
+#. `Recompile`_ the CRUSH map.
+#. `Set the CRUSH map`_.
+
+To activate CRUSH map rules for a specific pool, identify the common ruleset
+number for those rules and specify that ruleset number for the pool. See `Set
+Pool Values`_ for details.
+
+.. _Get the CRUSH map: #getcrushmap
+.. _Decompile: #decompilecrushmap
+.. _Devices: #crushmapdevices
+.. _Buckets: #crushmapbuckets
+.. _Rules: #crushmaprules
+.. _Recompile: #compilecrushmap
+.. _Set the CRUSH map: #setcrushmap
+.. _Set Pool Values: ../pools#setpoolvalues
+
+.. _getcrushmap:
+
+Get a CRUSH Map
+---------------
+
+To get the CRUSH map for your cluster, execute the following::
+
+ ceph osd getcrushmap -o {compiled-crushmap-filename}
+
+Ceph will output (-o) a compiled CRUSH map to the filename you specified. Since
+the CRUSH map is in a compiled form, you must decompile it first before you can
+edit it.
+
+.. _decompilecrushmap:
+
+Decompile a CRUSH Map
+---------------------
+
+To decompile a CRUSH map, execute the following::
+
+ crushtool -d {compiled-crushmap-filename} -o {decompiled-crushmap-filename}
+
+
+Sections
+--------
+
+There are six main sections to a CRUSH Map.
+
+#. **tunables:** The preamble at the top of the map describes any *tunables*
+ for CRUSH behavior that vary from the historical/legacy CRUSH behavior. These
+ correct for old bugs, optimizations, or other changes in behavior that have
+ been made over the years to improve CRUSH's behavior.
+
+#. **devices:** Devices are individual ``ceph-osd`` daemons that can
+ store data.
+
+#. **types**: Bucket ``types`` define the types of buckets used in
+ your CRUSH hierarchy. Buckets consist of a hierarchical aggregation
+ of storage locations (e.g., rows, racks, chassis, hosts, etc.) and
+ their assigned weights.
+
+#. **buckets:** Once you define bucket types, you must define each node
+ in the hierarchy, its type, and which devices or other nodes it
+ contains.
+
+#. **rules:** Rules define policy about how data is distributed across
+ devices in the hierarchy.
+
+#. **choose_args:** Choose_args are alternative weights associated with
+ the hierarchy that have been adjusted to optimize data placement. A single
+ choose_args map can be used for the entire cluster, or one can be
+ created for each individual pool.
+
+
+.. _crushmapdevices:
+
+CRUSH Map Devices
+-----------------
+
+Devices are individual ``ceph-osd`` daemons that can store data. You
+will normally have one defined here for each OSD daemon in your
+cluster. Devices are identified by an id (a non-negative integer) and
+a name, normally ``osd.N`` where ``N`` is the device id.
+
+Devices may also have a *device class* associated with them (e.g.,
+``hdd`` or ``ssd``), allowing them to be conveniently targeted by a
+crush rule.
+
+::
+
+ # devices
+ device {num} {osd.name} [class {class}]
+
+For example::
+
+ # devices
+ device 0 osd.0 class ssd
+ device 1 osd.1 class hdd
+ device 2 osd.2
+ device 3 osd.3
+
+In most cases, each device maps to a single ``ceph-osd`` daemon. This
+is normally a single storage device, a pair of devices (for example,
+one for data and one for a journal or metadata), or in some cases a
+small RAID device.
+
+
+
+
+
+CRUSH Map Bucket Types
+----------------------
+
+The second list in the CRUSH map defines 'bucket' types. Buckets facilitate
+a hierarchy of nodes and leaves. Node (or non-leaf) buckets typically represent
+physical locations in a hierarchy. Nodes aggregate other nodes or leaves.
+Leaf buckets represent ``ceph-osd`` daemons and their corresponding storage
+media.
+
+.. tip:: The term "bucket" used in the context of CRUSH means a node in
+ the hierarchy, i.e. a location or a piece of physical hardware. It
+ is a different concept from the term "bucket" when used in the
+ context of RADOS Gateway APIs.
+
+To add a bucket type to the CRUSH map, create a new line under your list of
+bucket types. Enter ``type`` followed by a unique numeric ID and a bucket name.
+By convention, there is one leaf bucket and it is ``type 0``; however, you may
+give it any name you like (e.g., osd, disk, drive, storage, etc.)::
+
+ #types
+ type {num} {bucket-name}
+
+For example::
+
+ # types
+ type 0 osd
+ type 1 host
+ type 2 chassis
+ type 3 rack
+ type 4 row
+ type 5 pdu
+ type 6 pod
+ type 7 room
+ type 8 datacenter
+ type 9 region
+ type 10 root
+
+
+
+.. _crushmapbuckets:
+
+CRUSH Map Bucket Hierarchy
+--------------------------
+
+The CRUSH algorithm distributes data objects among storage devices according
+to a per-device weight value, approximating a uniform probability distribution.
+CRUSH distributes objects and their replicas according to the hierarchical
+cluster map you define. Your CRUSH map represents the available storage
+devices and the logical elements that contain them.
+
+To map placement groups to OSDs across failure domains, a CRUSH map defines a
+hierarchical list of bucket types (i.e., under ``#types`` in the generated CRUSH
+map). The purpose of creating a bucket hierarchy is to segregate the
+leaf nodes by their failure domains, such as hosts, chassis, racks, power
+distribution units, pods, rows, rooms, and data centers. With the exception of
+the leaf nodes representing OSDs, the rest of the hierarchy is arbitrary, and
+you may define it according to your own needs.
+
+We recommend adapting your CRUSH map to your firm's hardware naming conventions
+and using instance names that reflect the physical hardware. Your naming
+practice can make it easier to administer the cluster and troubleshoot
+problems when an OSD and/or other hardware malfunctions and the administrator
+needs access to physical hardware.
+
+In the following example, the bucket hierarchy has a leaf bucket named ``osd``,
+and two node buckets named ``host`` and ``rack`` respectively.
+
+.. ditaa::
+ +-----------+
+ | {o}rack |
+ | Bucket |
+ +-----+-----+
+ |
+ +---------------+---------------+
+ | |
+ +-----+-----+ +-----+-----+
+ | {o}host | | {o}host |
+ | Bucket | | Bucket |
+ +-----+-----+ +-----+-----+
+ | |
+ +-------+-------+ +-------+-------+
+ | | | |
+ +-----+-----+ +-----+-----+ +-----+-----+ +-----+-----+
+ | osd | | osd | | osd | | osd |
+ | Bucket | | Bucket | | Bucket | | Bucket |
+ +-----------+ +-----------+ +-----------+ +-----------+
+
+.. note:: The higher numbered ``rack`` bucket type aggregates the lower
+ numbered ``host`` bucket type.
+
+Since leaf nodes reflect storage devices declared under the ``#devices`` list
+at the beginning of the CRUSH map, you do not need to declare them as bucket
+instances. The second lowest bucket type in your hierarchy usually aggregates
+the devices (i.e., it's usually the computer containing the storage media, and
+uses whatever term you prefer to describe it, such as "node", "computer",
+"server," "host", "machine", etc.). In high density environments, it is
+increasingly common to see multiple hosts/nodes per chassis. You should account
+for chassis failure too--e.g., the need to pull a chassis if a node fails may
+result in bringing down numerous hosts/nodes and their OSDs.
+
+When declaring a bucket instance, you must specify its type, give it a unique
+name (string), assign it a unique ID expressed as a negative integer (optional),
+specify a weight relative to the total capacity/capability of its item(s),
+specify the bucket algorithm (usually ``straw``), and the hash (usually ``0``,
+reflecting hash algorithm ``rjenkins1``). A bucket may have one or more items.
+The items may consist of node buckets or leaves. Items may have a weight that
+reflects the relative weight of the item.
+
+You may declare a node bucket with the following syntax::
+
+ [bucket-type] [bucket-name] {
+ id [a unique negative numeric ID]
+ weight [the relative capacity/capability of the item(s)]
+ alg [the bucket type: uniform | list | tree | straw ]
+ hash [the hash type: 0 by default]
+ item [item-name] weight [weight]
+ }
+
+For example, using the diagram above, we would define two host buckets
+and one rack bucket. The OSDs are declared as items within the host buckets::
+
+ host node1 {
+ id -1
+ alg straw
+ hash 0
+ item osd.0 weight 1.00
+ item osd.1 weight 1.00
+ }
+
+ host node2 {
+ id -2
+ alg straw
+ hash 0
+ item osd.2 weight 1.00
+ item osd.3 weight 1.00
+ }
+
+ rack rack1 {
+ id -3
+ alg straw
+ hash 0
+ item node1 weight 2.00
+ item node2 weight 2.00
+ }
+
+.. note:: In the foregoing example, note that the rack bucket does not contain
+ any OSDs. Rather it contains lower level host buckets, and includes the
+ sum total of their weight in the item entry.
+
+.. topic:: Bucket Types
+
+ Ceph supports four bucket types, each representing a tradeoff between
+ performance and reorganization efficiency. If you are unsure of which bucket
+ type to use, we recommend using a ``straw`` bucket. For a detailed
+ discussion of bucket types, refer to
+ `CRUSH - Controlled, Scalable, Decentralized Placement of Replicated Data`_,
+ and more specifically to **Section 3.4**. The bucket types are:
+
+ #. **Uniform:** Uniform buckets aggregate devices with **exactly** the same
+ weight. For example, when firms commission or decommission hardware, they
+ typically do so with many machines that have exactly the same physical
+ configuration (e.g., bulk purchases). When storage devices have exactly
+ the same weight, you may use the ``uniform`` bucket type, which allows
+ CRUSH to map replicas into uniform buckets in constant time. With
+ non-uniform weights, you should use another bucket algorithm.
+
+ #. **List**: List buckets aggregate their content as linked lists. Based on
+ the :abbr:`RUSH (Replication Under Scalable Hashing)` :sub:`P` algorithm,
+ a list is a natural and intuitive choice for an **expanding cluster**:
+ either an object is relocated to the newest device with some appropriate
+ probability, or it remains on the older devices as before. The result is
+ optimal data migration when items are added to the bucket. Items removed
+ from the middle or tail of the list, however, can result in a significant
+ amount of unnecessary movement, making list buckets most suitable for
+ circumstances in which they **never (or very rarely) shrink**.
+
+ #. **Tree**: Tree buckets use a binary search tree. They are more efficient
+ than list buckets when a bucket contains a larger set of items. Based on
+ the :abbr:`RUSH (Replication Under Scalable Hashing)` :sub:`R` algorithm,
+ tree buckets reduce the placement time to O(log :sub:`n`), making them
+ suitable for managing much larger sets of devices or nested buckets.
+
+ #. **Straw:** List and Tree buckets use a divide and conquer strategy
+ in a way that either gives certain items precedence (e.g., those
+ at the beginning of a list) or obviates the need to consider entire
+ subtrees of items at all. That improves the performance of the replica
+ placement process, but can also introduce suboptimal reorganization
+ behavior when the contents of a bucket change due to an addition, removal,
+ or re-weighting of an item. The straw bucket type allows all items to
+ fairly “compete” against each other for replica placement through a
+ process analogous to a draw of straws.
+
+.. topic:: Hash
+
+ Each bucket uses a hash algorithm. Currently, Ceph supports ``rjenkins1``.
+ Enter ``0`` as your hash setting to select ``rjenkins1``.
+
+
+.. _weightingbucketitems:
+
+.. topic:: Weighting Bucket Items
+
+ Ceph expresses bucket weights as doubles, which allows for fine
+ weighting. A weight is the relative difference between device capacities. We
+ recommend using ``1.00`` as the relative weight for a 1TB storage device.
+ In such a scenario, a weight of ``0.5`` would represent approximately 500GB,
+ and a weight of ``3.00`` would represent approximately 3TB. Higher level
+ buckets have a weight that is the sum total of the leaf items aggregated by
+ the bucket.
+
+ A bucket item weight is one dimensional, but you may also calculate your
+ item weights to reflect the performance of the storage drive. For example,
+ if you have many 1TB drives where some have relatively low data transfer
+ rate and the others have a relatively high data transfer rate, you may
+ weight them differently, even though they have the same capacity (e.g.,
+ a weight of 0.80 for the first set of drives with lower total throughput,
+ and 1.20 for the second set of drives with higher total throughput).
+
+
+.. _crushmaprules:
+
+CRUSH Map Rules
+---------------
+
+CRUSH maps support the notion of 'CRUSH rules', which are the rules that
+determine data placement for a pool. For large clusters, you will likely create
+many pools where each pool may have its own CRUSH ruleset and rules. The default
+CRUSH map has a rule for each pool, and one ruleset assigned to each of the
+default pools.
+
+.. note:: In most cases, you will not need to modify the default rules. When
+ you create a new pool, its default ruleset is ``0``.
+
+
+CRUSH rules define placement and replication strategies or distribution policies
+that allow you to specify exactly how CRUSH places object replicas. For
+example, you might create a rule selecting a pair of targets for 2-way
+mirroring, another rule for selecting three targets in two different data
+centers for 3-way mirroring, and yet another rule for erasure coding over six
+storage devices. For a detailed discussion of CRUSH rules, refer to
+`CRUSH - Controlled, Scalable, Decentralized Placement of Replicated Data`_,
+and more specifically to **Section 3.2**.
+
+A rule takes the following form::
+
+ rule <rulename> {
+
+ ruleset <ruleset>
+ type [ replicated | erasure ]
+ min_size <min-size>
+ max_size <max-size>
+ step take <bucket-name> [class <device-class>]
+ step [choose|chooseleaf] [firstn|indep] <N> <bucket-type>
+ step emit
+ }
+
+
+``ruleset``
+
+:Description: A means of classifying a rule as belonging to a set of rules.
+ Activated by `setting the ruleset in a pool`_.
+
+:Purpose: A component of the rule mask.
+:Type: Integer
+:Required: Yes
+:Default: 0
+
+.. _setting the ruleset in a pool: ../pools#setpoolvalues
+
+
+``type``
+
+:Description: Describes whether the rule is for a replicated pool
+ or an erasure-coded pool.
+
+:Purpose: A component of the rule mask.
+:Type: String
+:Required: Yes
+:Default: ``replicated``
+:Valid Values: Currently only ``replicated`` and ``erasure``
+
+``min_size``
+
+:Description: If a pool makes fewer replicas than this number, CRUSH will
+ **NOT** select this rule.
+
+:Type: Integer
+:Purpose: A component of the rule mask.
+:Required: Yes
+:Default: ``1``
+
+``max_size``
+
+:Description: If a pool makes more replicas than this number, CRUSH will
+ **NOT** select this rule.
+
+:Type: Integer
+:Purpose: A component of the rule mask.
+:Required: Yes
+:Default: 10
+
+
+``step take <bucket-name> [class <device-class>]``
+
+:Description: Takes a bucket name, and begins iterating down the tree.
+ If the ``device-class`` is specified, it must match
+ a class previously used when defining a device. All
+ devices that do not belong to the class are excluded.
+:Purpose: A component of the rule.
+:Required: Yes
+:Example: ``step take data``
+
+
+``step choose firstn {num} type {bucket-type}``
+
+:Description: Selects the number of buckets of the given type. The number is
+ usually the number of replicas in the pool (i.e., pool size).
+
+ - If ``{num} == 0``, choose ``pool-num-replicas`` buckets (all available).
+ - If ``{num} > 0 && < pool-num-replicas``, choose that many buckets.
+ - If ``{num} < 0``, it means ``pool-num-replicas - |{num}|``.
+
+:Purpose: A component of the rule.
+:Prerequisite: Follows ``step take`` or ``step choose``.
+:Example: ``step choose firstn 1 type row``
+
+
+``step chooseleaf firstn {num} type {bucket-type}``
+
+:Description: Selects a set of buckets of ``{bucket-type}`` and chooses a leaf
+ node from the subtree of each bucket in the set of buckets. The
+ number of buckets in the set is usually the number of replicas in
+ the pool (i.e., pool size).
+
+ - If ``{num} == 0``, choose ``pool-num-replicas`` buckets (all available).
+ - If ``{num} > 0 && < pool-num-replicas``, choose that many buckets.
+ - If ``{num} < 0``, it means ``pool-num-replicas - |{num}|``.
+
+:Purpose: A component of the rule. Usage removes the need to select a device using two steps.
+:Prerequisite: Follows ``step take`` or ``step choose``.
+:Example: ``step chooseleaf firstn 0 type row``
+
+
+
+``step emit``
+
+:Description: Outputs the current value and empties the stack. Typically used
+ at the end of a rule, but may also be used to pick from different
+ trees in the same rule.
+
+:Purpose: A component of the rule.
+:Prerequisite: Follows ``step choose``.
+:Example: ``step emit``
+
+.. important:: To activate one or more rules with a common ruleset number to a
+ pool, set the ruleset number of the pool.
+
+
+Placing Different Pools on Different OSDs:
+==========================================
+
+Suppose you want to have most pools default to OSDs backed by large hard drives,
+but have some pools mapped to OSDs backed by fast solid-state drives (SSDs).
+It's possible to have multiple independent CRUSH hierarchies within the same
+CRUSH map. Define two hierarchies with two different root nodes--one for hard
+disks (e.g., "root platter") and one for SSDs (e.g., "root ssd") as shown
+below::
+
+ device 0 osd.0
+ device 1 osd.1
+ device 2 osd.2
+ device 3 osd.3
+ device 4 osd.4
+ device 5 osd.5
+ device 6 osd.6
+ device 7 osd.7
+
+ host ceph-osd-ssd-server-1 {
+ id -1
+ alg straw
+ hash 0
+ item osd.0 weight 1.00
+ item osd.1 weight 1.00
+ }
+
+ host ceph-osd-ssd-server-2 {
+ id -2
+ alg straw
+ hash 0
+ item osd.2 weight 1.00
+ item osd.3 weight 1.00
+ }
+
+ host ceph-osd-platter-server-1 {
+ id -3
+ alg straw
+ hash 0
+ item osd.4 weight 1.00
+ item osd.5 weight 1.00
+ }
+
+ host ceph-osd-platter-server-2 {
+ id -4
+ alg straw
+ hash 0
+ item osd.6 weight 1.00
+ item osd.7 weight 1.00
+ }
+
+ root platter {
+ id -5
+ alg straw
+ hash 0
+ item ceph-osd-platter-server-1 weight 2.00
+ item ceph-osd-platter-server-2 weight 2.00
+ }
+
+ root ssd {
+ id -6
+ alg straw
+ hash 0
+ item ceph-osd-ssd-server-1 weight 2.00
+ item ceph-osd-ssd-server-2 weight 2.00
+ }
+
+ rule data {
+ ruleset 0
+ type replicated
+ min_size 2
+ max_size 2
+ step take platter
+ step chooseleaf firstn 0 type host
+ step emit
+ }
+
+ rule metadata {
+ ruleset 1
+ type replicated
+ min_size 0
+ max_size 10
+ step take platter
+ step chooseleaf firstn 0 type host
+ step emit
+ }
+
+ rule rbd {
+ ruleset 2
+ type replicated
+ min_size 0
+ max_size 10
+ step take platter
+ step chooseleaf firstn 0 type host
+ step emit
+ }
+
+ rule platter {
+ ruleset 3
+ type replicated
+ min_size 0
+ max_size 10
+ step take platter
+ step chooseleaf firstn 0 type host
+ step emit
+ }
+
+ rule ssd {
+ ruleset 4
+ type replicated
+ min_size 0
+ max_size 4
+ step take ssd
+ step chooseleaf firstn 0 type host
+ step emit
+ }
+
+ rule ssd-primary {
+ ruleset 5
+ type replicated
+ min_size 5
+ max_size 10
+ step take ssd
+ step chooseleaf firstn 1 type host
+ step emit
+ step take platter
+ step chooseleaf firstn -1 type host
+ step emit
+ }
+
+You can then set a pool to use the SSD rule by::
+
+ ceph osd pool set <poolname> crush_ruleset 4
+
+Similarly, using the ``ssd-primary`` rule will cause each placement group in the
+pool to be placed with an SSD as the primary and platters as the replicas.
+
+
+Tuning CRUSH, the hard way
+--------------------------
+
+If you can ensure that all clients are running recent code, you can
+adjust the tunables by extracting the CRUSH map, modifying the values,
+and reinjecting it into the cluster.
+
+* Extract the latest CRUSH map::
+
+ ceph osd getcrushmap -o /tmp/crush
+
+* Adjust tunables. These values appear to offer the best behavior
+ for both large and small clusters we tested with. You will need to
+ additionally specify the ``--enable-unsafe-tunables`` argument to
+ ``crushtool`` for this to work. Please use this option with
+ extreme care::
+
+ crushtool -i /tmp/crush --set-choose-local-tries 0 --set-choose-local-fallback-tries 0 --set-choose-total-tries 50 -o /tmp/crush.new
+
+* Reinject modified map::
+
+ ceph osd setcrushmap -i /tmp/crush.new
+
+Legacy values
+-------------
+
+For reference, the legacy values for the CRUSH tunables can be set
+with::
+
+ crushtool -i /tmp/crush --set-choose-local-tries 2 --set-choose-local-fallback-tries 5 --set-choose-total-tries 19 --set-chooseleaf-descend-once 0 --set-chooseleaf-vary-r 0 -o /tmp/crush.legacy
+
+Again, the special ``--enable-unsafe-tunables`` option is required.
+Further, as noted above, be careful running old versions of the
+``ceph-osd`` daemon after reverting to legacy values as the feature
+bit is not perfectly enforced.
replicas are on devices using different shelves, racks, power supplies,
controllers, and/or physical locations.
-When you create a configuration file and deploy Ceph with ``ceph-deploy``, Ceph
-generates a default CRUSH map for your configuration. The default CRUSH map is
-fine for your Ceph sandbox environment. However, when you deploy a large-scale
-data cluster, you should give significant consideration to developing a custom
-CRUSH map, because it will help you manage your Ceph cluster, improve
-performance and ensure data safety.
-
-For example, if an OSD goes down, a CRUSH map can help you to locate
-the physical data center, room, row and rack of the host with the failed OSD in
-the event you need to use onsite support or replace hardware.
-
-Similarly, CRUSH may help you identify faults more quickly. For example, if all
-OSDs in a particular rack go down simultaneously, the fault may lie with a
-network switch or power to the rack rather than the OSDs themselves.
-
-A custom CRUSH map can also help you identify the physical locations where
-Ceph stores redundant copies of data when the placement group(s) associated
-with a failed host are in a degraded state.
-
-.. note:: Lines of code in example boxes may extend past the edge of the box.
- Please scroll when reading or copying longer examples.
+When you deploy OSDs they are automatically placed within the CRUSH map under a
+``host`` node named with the hostname for the host they are running on. This,
+combined with the default CRUSH failure domain, ensures that replicas or erasure
+code shards are separated across hosts and a single host failure will not
+affect availability. For larger clusters, however, administrators should carefully consider their choice of failure domain. Separating replicas across racks,
+for example, is common for mid- to large-sized clusters.
CRUSH Location
==============
-The location of an OSD in terms of the CRUSH map's hierarchy is referred to
-as a 'crush location'. This location specifier takes the form of a list of
-key and value pairs describing a position. For example, if an OSD is in a
-particular row, rack, chassis and host, and is part of the 'default' CRUSH
-tree, its crush location could be described as::
+The location of an OSD in terms of the CRUSH map's hierarchy is
+referred to as a ``crush location``. This location specifier takes the
+form of a list of key and value pairs describing a position. For
+example, if an OSD is in a particular row, rack, chassis and host, and
+is part of the 'default' CRUSH tree (this is the case for the vast
+majority of clusters), its crush location could be described as::
root=default row=a rack=a2 chassis=a2a host=a2a1
automatically sets a ``ceph-osd`` daemon's location to be
``root=default host=HOSTNAME`` (based on the output from ``hostname -s``).
-ceph-crush-location hook
-------------------------
-
-By default, the ``ceph-crush-location`` utility will generate a CRUSH
-location string for a given daemon. The location is based on, in order of
-preference:
-
-#. A ``TYPE crush location`` option in ceph.conf. For example, this
- is ``osd crush location`` for OSD daemons.
-#. A ``crush location`` option in ceph.conf.
-#. A default of ``root=default host=HOSTNAME`` where the hostname is
- generated with the ``hostname -s`` command.
-
-In a typical deployment scenario, provisioning software (or the system
-administrator) can simply set the 'crush location' field in a host's
-ceph.conf to describe that machine's location within the datacenter or
-cluster. This will provide location awareness to both Ceph daemons
-and clients alike.
-
-It is possible to manage the CRUSH map entirely manually by toggling
-the hook off in the configuration::
+The crush location for an OSD is normally expressed via the ``crush location``
+config option being set in the ``ceph.conf`` file. Each time the OSD starts,
+it verifies it is in the correct location in the CRUSH map and, if it is not,
+it moves itself. To disable this automatic CRUSH map management, add the
+following to your configuration file in the ``[osd]`` section::
osd crush update on start = false
+
Custom location hooks
---------------------
-A customized location hook can be used in place of the generic hook for OSD
-daemon placement in the hierarchy. (On startup, each OSD ensures its position is
-correct.)::
+A customized location hook can be used to generate a more complete
+crush location on startup. The sample ``ceph-crush-location`` utility
+will generate a CRUSH location string for a given daemon. The
+location is based on, in order of preference:
+
+#. A ``crush location`` option in ceph.conf.
+#. A default of ``root=default host=HOSTNAME`` where the hostname is
+ generated with the ``hostname -s`` command.
+
+This is not useful by itself, as the OSD itself has the exact same
+behavior. However, the script can be modified to provide additional
+location fields (for example, the rack or datacenter), and then the
+hook enabled via the config option::
- osd crush location hook = /path/to/script
+ crush location hook = /path/to/customized-ceph-crush-location
This hook is passed several arguments (below) and should output a single line
to stdout with the CRUSH location description.::
identifier (the OSD number), and the daemon type is typically ``osd``.
-Editing a CRUSH Map
-===================
-
-To edit an existing CRUSH map:
-
-#. `Get the CRUSH map`_.
-#. `Decompile`_ the CRUSH map.
-#. Edit at least one of `Devices`_, `Buckets`_ and `Rules`_.
-#. `Recompile`_ the CRUSH map.
-#. `Set the CRUSH map`_.
-
-To activate CRUSH map rules for a specific pool, identify the common ruleset
-number for those rules and specify that ruleset number for the pool. See `Set
-Pool Values`_ for details.
-
-.. _Get the CRUSH map: #getcrushmap
-.. _Decompile: #decompilecrushmap
-.. _Devices: #crushmapdevices
-.. _Buckets: #crushmapbuckets
-.. _Rules: #crushmaprules
-.. _Recompile: #compilecrushmap
-.. _Set the CRUSH map: #setcrushmap
-.. _Set Pool Values: ../pools#setpoolvalues
-
-.. _getcrushmap:
-
-Get a CRUSH Map
----------------
-
-To get the CRUSH map for your cluster, execute the following::
-
- ceph osd getcrushmap -o {compiled-crushmap-filename}
-
-Ceph will output (-o) a compiled CRUSH map to the filename you specified. Since
-the CRUSH map is in a compiled form, you must decompile it first before you can
-edit it.
-
-.. _decompilecrushmap:
-
-Decompile a CRUSH Map
----------------------
-
-To decompile a CRUSH map, execute the following::
-
- crushtool -d {compiled-crushmap-filename} -o {decompiled-crushmap-filename}
-
-Ceph will decompile (-d) the compiled CRUSH map and output (-o) it to the
-filename you specified.
-
-
-.. _compilecrushmap:
-
-Compile a CRUSH Map
--------------------
-
-To compile a CRUSH map, execute the following::
-
- crushtool -c {decompiled-crush-map-filename} -o {compiled-crush-map-filename}
-
-Ceph will store a compiled CRUSH map to the filename you specified.
-
-
-.. _setcrushmap:
-
-Set a CRUSH Map
----------------
-
-To set the CRUSH map for your cluster, execute the following::
-
- ceph osd setcrushmap -i {compiled-crushmap-filename}
-
-Ceph will input the compiled CRUSH map of the filename you specified as the
-CRUSH map for the cluster.
-
-
-
-CRUSH Map Parameters
-====================
-
-There are four main sections to a CRUSH Map.
-
-#. **Devices:** Devices consist of any object storage device--i.e., the storage
- drive corresponding to a ``ceph-osd`` daemon. You should have a device for
- each OSD daemon in your Ceph configuration file.
-
-#. **Bucket Types**: Bucket ``types`` define the types of buckets used in your
- CRUSH hierarchy. Buckets consist of a hierarchical aggregation of storage
- locations (e.g., rows, racks, chassis, hosts, etc.) and their assigned
- weights.
-
-#. **Bucket Instances:** Once you define bucket types, you must declare bucket
- instances for your hosts, and any other failure domain partitioning
- you choose.
-
-#. **Rules:** Rules consist of the manner of selecting buckets.
-
-If you launched Ceph using one of our Quick Start guides, you'll notice
-that you didn't need to create a CRUSH map. Ceph's deployment tools generate
-a default CRUSH map that lists devices from the OSDs you defined in your
-Ceph configuration file, and it declares a bucket for each host you specified
-in the ``[osd]`` sections of your Ceph configuration file. You should create
-your own CRUSH maps with buckets that reflect your cluster's failure domains
-to better ensure data safety and availability.
+CRUSH structure
+===============
-.. note:: The generated CRUSH map doesn't take your larger grained failure
- domains into account. So you should modify your CRUSH map to account for
- larger grained failure domains such as chassis, racks, rows, data
- centers, etc.
+The CRUSH map consists of, loosely speaking, a hierarchy describing
+the physical topology of the cluster, and a set of rules defining
+policy about how we place data on those devices. The hierarchy has
+devices (``ceph-osd`` daemons) at the leaves, and internal nodes
+corresponding to other physical features or groupings: hosts, racks,
+rows, datacenters, and so on. The rules describe how replicas are
+placed in terms of that hierarchy (e.g., 'three replicas in different
+racks').
+Devices
+-------
+Devices are individual ``ceph-osd`` daemons that can store data. You
+will normally have one defined here for each OSD daemon in your
+cluster. Devices are identified by an id (a non-negative integer) and
+a name, normally ``osd.N`` where ``N`` is the device id.
-.. _crushmapdevices:
+Devices may also have a *device class* associated with them (e.g.,
+``hdd`` or ``ssd``), allowing them to be conveniently targeted by a
+crush rule.
-CRUSH Map Devices
+Types and Buckets
-----------------
-To map placement groups to OSDs, a CRUSH map requires a list of OSD devices
-(i.e., the names of the OSD daemons from the Ceph configuration file). The list
-of devices appears first in the CRUSH map. To declare a device in the CRUSH map,
-create a new line under your list of devices, enter ``device`` followed by a
-unique numeric ID, followed by the corresponding ``ceph-osd`` daemon instance.
-The device class can optionaly be added to group devices so they can be
-conveniently targetted by a crush rule.
-
-::
-
- #devices
- device {num} {osd.name} [class {class}]
-
-For example::
-
- #devices
- device 0 osd.0 class ssd
- device 1 osd.1 class hdd
- device 2 osd.2
- device 3 osd.3
-
-As a general rule, an OSD daemon maps to a single storage drive or to a RAID.
-
-
-CRUSH Map Bucket Types
-----------------------
-
-The second list in the CRUSH map defines 'bucket' types. Buckets facilitate
-a hierarchy of nodes and leaves. Node (or non-leaf) buckets typically represent
-physical locations in a hierarchy. Nodes aggregate other nodes or leaves.
-Leaf buckets represent ``ceph-osd`` daemons and their corresponding storage
-media.
-
-.. tip:: The term "bucket" used in the context of CRUSH means a node in
- the hierarchy, i.e. a location or a piece of physical hardware. It
- is a different concept from the term "bucket" when used in the
- context of RADOS Gateway APIs.
-
-To add a bucket type to the CRUSH map, create a new line under your list of
-bucket types. Enter ``type`` followed by a unique numeric ID and a bucket name.
-By convention, there is one leaf bucket and it is ``type 0``; however, you may
-give it any name you like (e.g., osd, disk, drive, storage, etc.)::
-
- #types
- type {num} {bucket-name}
-
-For example::
-
- # types
- type 0 osd
- type 1 host
- type 2 chassis
- type 3 rack
- type 4 row
- type 5 pdu
- type 6 pod
- type 7 room
- type 8 datacenter
- type 9 region
- type 10 root
-
-
-
-.. _crushmapbuckets:
-
-CRUSH Map Bucket Hierarchy
---------------------------
-
-The CRUSH algorithm distributes data objects among storage devices according
-to a per-device weight value, approximating a uniform probability distribution.
-CRUSH distributes objects and their replicas according to the hierarchical
-cluster map you define. Your CRUSH map represents the available storage
-devices and the logical elements that contain them.
-
-To map placement groups to OSDs across failure domains, a CRUSH map defines a
-hierarchical list of bucket types (i.e., under ``#types`` in the generated CRUSH
-map). The purpose of creating a bucket hierarchy is to segregate the
-leaf nodes by their failure domains, such as hosts, chassis, racks, power
-distribution units, pods, rows, rooms, and data centers. With the exception of
-the leaf nodes representing OSDs, the rest of the hierarchy is arbitrary, and
-you may define it according to your own needs.
-
-We recommend adapting your CRUSH map to your firms's hardware naming conventions
-and using instances names that reflect the physical hardware. Your naming
-practice can make it easier to administer the cluster and troubleshoot
-problems when an OSD and/or other hardware malfunctions and the administrator
-need access to physical hardware.
-
-In the following example, the bucket hierarchy has a leaf bucket named ``osd``,
-and two node buckets named ``host`` and ``rack`` respectively.
-
-.. ditaa::
- +-----------+
- | {o}rack |
- | Bucket |
- +-----+-----+
+A bucket is the CRUSH term for internal nodes in the hierarchy: hosts,
+racks, rows, etc. The CRUSH map defines a series of *types* that are
+used to describe these nodes. By default, these types include:
+
+- osd (or device)
+- host
+- chassis
+- rack
+- row
+- pdu
+- pod
+- room
+- datacenter
+- region
+- root
+
+Most clusters make use of only a handful of these types, and others
+can be defined as needed.
+
+The hierarchy is built with devices (normally type ``osd``) at the
+leaves, interior nodes with non-device types, and a root node of type
+``root``. For example,
+
+.. ditaa::
+
+ +-----------------+
+ | {o}root default |
+ +--------+--------+
|
+---------------+---------------+
| |
- +-----+-----+ +-----+-----+
- | {o}host | | {o}host |
- | Bucket | | Bucket |
- +-----+-----+ +-----+-----+
+ +-------+-------+ +-----+-------+
+ | {o}host foo | | {o}host bar |
+ +-------+-------+ +-----+-------+
| |
+-------+-------+ +-------+-------+
| | | |
+-----+-----+ +-----+-----+ +-----+-----+ +-----+-----+
- | osd | | osd | | osd | | osd |
- | Bucket | | Bucket | | Bucket | | Bucket |
+ | osd.0 | | osd.1 | | osd.2 | | osd.3 |
+-----------+ +-----------+ +-----------+ +-----------+
-.. note:: The higher numbered ``rack`` bucket type aggregates the lower
- numbered ``host`` bucket type.
-
-Since leaf nodes reflect storage devices declared under the ``#devices`` list
-at the beginning of the CRUSH map, you do not need to declare them as bucket
-instances. The second lowest bucket type in your hierarchy usually aggregates
-the devices (i.e., it's usually the computer containing the storage media, and
-uses whatever term you prefer to describe it, such as "node", "computer",
-"server," "host", "machine", etc.). In high density environments, it is
-increasingly common to see multiple hosts/nodes per chassis. You should account
-for chassis failure too--e.g., the need to pull a chassis if a node fails may
-result in bringing down numerous hosts/nodes and their OSDs.
-
-When declaring a bucket instance, you must specify its type, give it a unique
-name (string), assign it a unique ID expressed as a negative integer (optional),
-specify a weight relative to the total capacity/capability of its item(s),
-specify the bucket algorithm (usually ``straw``), and the hash (usually ``0``,
-reflecting hash algorithm ``rjenkins1``). A bucket may have one or more items.
-The items may consist of node buckets or leaves. Items may have a weight that
-reflects the relative weight of the item.
-
-You may declare a node bucket with the following syntax::
-
- [bucket-type] [bucket-name] {
- id [a unique negative numeric ID]
- weight [the relative capacity/capability of the item(s)]
- alg [the bucket type: uniform | list | tree | straw ]
- hash [the hash type: 0 by default]
- item [item-name] weight [weight]
- }
-
-For example, using the diagram above, we would define two host buckets
-and one rack bucket. The OSDs are declared as items within the host buckets::
-
- host node1 {
- id -1
- alg straw
- hash 0
- item osd.0 weight 1.00
- item osd.1 weight 1.00
- }
-
- host node2 {
- id -2
- alg straw
- hash 0
- item osd.2 weight 1.00
- item osd.3 weight 1.00
- }
-
- rack rack1 {
- id -3
- alg straw
- hash 0
- item node1 weight 2.00
- item node2 weight 2.00
- }
-
-.. note:: In the foregoing example, note that the rack bucket does not contain
- any OSDs. Rather it contains lower level host buckets, and includes the
- sum total of their weight in the item entry.
-
-.. topic:: Bucket Types
-
- Ceph supports four bucket types, each representing a tradeoff between
- performance and reorganization efficiency. If you are unsure of which bucket
- type to use, we recommend using a ``straw`` bucket. For a detailed
- discussion of bucket types, refer to
- `CRUSH - Controlled, Scalable, Decentralized Placement of Replicated Data`_,
- and more specifically to **Section 3.4**. The bucket types are:
+Each node (device or bucket) in the hierarchy has a *weight*
+associated with it, indicating the relative proportion of the total
+data that device or hierarchy subtree should store. Weights are set
+at the leaves, indicating the size of the device, and automatically
+sum up the tree from there, such that the weight of the default node
+will be the total of all devices contained beneath it. Normally
+weights are in units of terabytes (TB).
+
+You can get a simple view of the CRUSH hierarchy for your cluster,
+including the weights, with::
+
+ ceph osd crush tree
+
+Rules
+-----
+
+Rules define policy about how data is distributed across the devices
+in the hierarchy.
+
+CRUSH rules define placement and replication strategies or
+distribution policies that allow you to specify exactly how CRUSH
+places object replicas. For example, you might create a rule selecting
+a pair of targets for 2-way mirroring, another rule for selecting
+three targets in two different data centers for 3-way mirroring, and
+yet another rule for erasure coding over six storage devices. For a
+detailed discussion of CRUSH rules, refer to `CRUSH - Controlled,
+Scalable, Decentralized Placement of Replicated Data`_, and more
+specifically to **Section 3.2**.
+
+In almost all cases, CRUSH rules can be created via the CLI by
+specifying the *pool type* they will be used for (replicated or
+erasure coded), the *failure domain*, and optionally a *device class*.
+In rare cases rules must be written by hand by manually editing the
+CRUSH map.
- #. **Uniform:** Uniform buckets aggregate devices with **exactly** the same
- weight. For example, when firms commission or decommission hardware, they
- typically do so with many machines that have exactly the same physical
- configuration (e.g., bulk purchases). When storage devices have exactly
- the same weight, you may use the ``uniform`` bucket type, which allows
- CRUSH to map replicas into uniform buckets in constant time. With
- non-uniform weights, you should use another bucket algorithm.
-
- #. **List**: List buckets aggregate their content as linked lists. Based on
- the :abbr:`RUSH (Replication Under Scalable Hashing)` :sub:`P` algorithm,
- a list is a natural and intuitive choice for an **expanding cluster**:
- either an object is relocated to the newest device with some appropriate
- probability, or it remains on the older devices as before. The result is
- optimal data migration when items are added to the bucket. Items removed
- from the middle or tail of the list, however, can result in a significant
- amount of unnecessary movement, making list buckets most suitable for
- circumstances in which they **never (or very rarely) shrink**.
-
- #. **Tree**: Tree buckets use a binary search tree. They are more efficient
- than list buckets when a bucket contains a larger set of items. Based on
- the :abbr:`RUSH (Replication Under Scalable Hashing)` :sub:`R` algorithm,
- tree buckets reduce the placement time to O(log :sub:`n`), making them
- suitable for managing much larger sets of devices or nested buckets.
-
- #. **Straw:** List and Tree buckets use a divide and conquer strategy
- in a way that either gives certain items precedence (e.g., those
- at the beginning of a list) or obviates the need to consider entire
- subtrees of items at all. That improves the performance of the replica
- placement process, but can also introduce suboptimal reorganization
- behavior when the contents of a bucket change due an addition, removal,
- or re-weighting of an item. The straw bucket type allows all items to
- fairly “compete” against each other for replica placement through a
- process analogous to a draw of straws.
-
-.. topic:: Hash
-
- Each bucket uses a hash algorithm. Currently, Ceph supports ``rjenkins1``.
- Enter ``0`` as your hash setting to select ``rjenkins1``.
-
-
-.. _weightingbucketitems:
-
-.. topic:: Weighting Bucket Items
-
- Ceph expresses bucket weights as doubles, which allows for fine
- weighting. A weight is the relative difference between device capacities. We
- recommend using ``1.00`` as the relative weight for a 1TB storage device.
- In such a scenario, a weight of ``0.5`` would represent approximately 500GB,
- and a weight of ``3.00`` would represent approximately 3TB. Higher level
- buckets have a weight that is the sum total of the leaf items aggregated by
- the bucket.
-
- A bucket item weight is one dimensional, but you may also calculate your
- item weights to reflect the performance of the storage drive. For example,
- if you have many 1TB drives where some have relatively low data transfer
- rate and the others have a relatively high data transfer rate, you may
- weight them differently, even though they have the same capacity (e.g.,
- a weight of 0.80 for the first set of drives with lower total throughput,
- and 1.20 for the second set of drives with higher total throughput).
-
-
-.. _crushmaprules:
-
-CRUSH Map Rules
----------------
-
-CRUSH maps support the notion of 'CRUSH rules', which are the rules that
-determine data placement for a pool. For large clusters, you will likely create
-many pools where each pool may have its own CRUSH ruleset and rules. The default
-CRUSH map has a rule for each pool, and one ruleset assigned to each of the
-default pools.
-
-.. note:: In most cases, you will not need to modify the default rules. When
- you create a new pool, its default ruleset is ``0``.
-
-
-CRUSH rules define placement and replication strategies or distribution policies
-that allow you to specify exactly how CRUSH places object replicas. For
-example, you might create a rule selecting a pair of targets for 2-way
-mirroring, another rule for selecting three targets in two different data
-centers for 3-way mirroring, and yet another rule for erasure coding over six
-storage devices. For a detailed discussion of CRUSH rules, refer to
-`CRUSH - Controlled, Scalable, Decentralized Placement of Replicated Data`_,
-and more specifically to **Section 3.2**.
-
-A rule takes the following form::
-
- rule <rulename> {
-
- ruleset <ruleset>
- type [ replicated | erasure ]
- min_size <min-size>
- max_size <max-size>
- step take <bucket-name> [class <device-class>]
- step [choose|chooseleaf] [firstn|indep] <N> <bucket-type>
- step emit
- }
+You can see what rules are defined for your cluster with::
+ ceph osd crush rule ls
-``ruleset``
+You can view the contents of the rules with::
-:Description: A means of classifying a rule as belonging to a set of rules.
- Activated by `setting the ruleset in a pool`_.
-
-:Purpose: A component of the rule mask.
-:Type: Integer
-:Required: Yes
-:Default: 0
-
-.. _setting the ruleset in a pool: ../pools#setpoolvalues
-
-
-``type``
-
-:Description: Describes a rule for either a storage drive (replicated)
- or a RAID.
-
-:Purpose: A component of the rule mask.
-:Type: String
-:Required: Yes
-:Default: ``replicated``
-:Valid Values: Currently only ``replicated`` and ``erasure``
-
-``min_size``
-
-:Description: If a pool makes fewer replicas than this number, CRUSH will
- **NOT** select this rule.
-
-:Type: Integer
-:Purpose: A component of the rule mask.
-:Required: Yes
-:Default: ``1``
-
-``max_size``
-
-:Description: If a pool makes more replicas than this number, CRUSH will
- **NOT** select this rule.
-
-:Type: Integer
-:Purpose: A component of the rule mask.
-:Required: Yes
-:Default: 10
+ ceph osd crush rule dump
-``step take <bucket-name> [class <device-class>]``
-
-:Description: Takes a bucket name, and begins iterating down the tree.
- If the ``device-class`` is specified, it must match
- a class previously used when defining a device. All
- devices that do not belong to the class are excluded.
-:Purpose: A component of the rule.
-:Required: Yes
-:Example: ``step take data``
-
-
-``step choose firstn {num} type {bucket-type}``
-
-:Description: Selects the number of buckets of the given type. The number is
- usually the number of replicas in the pool (i.e., pool size).
-
- - If ``{num} == 0``, choose ``pool-num-replicas`` buckets (all available).
- - If ``{num} > 0 && < pool-num-replicas``, choose that many buckets.
- - If ``{num} < 0``, it means ``pool-num-replicas - {num}``.
-
-:Purpose: A component of the rule.
-:Prerequisite: Follows ``step take`` or ``step choose``.
-:Example: ``step choose firstn 1 type row``
-
-
-``step chooseleaf firstn {num} type {bucket-type}``
-
-:Description: Selects a set of buckets of ``{bucket-type}`` and chooses a leaf
- node from the subtree of each bucket in the set of buckets. The
- number of buckets in the set is usually the number of replicas in
- the pool (i.e., pool size).
-
- - If ``{num} == 0``, choose ``pool-num-replicas`` buckets (all available).
- - If ``{num} > 0 && < pool-num-replicas``, choose that many buckets.
- - If ``{num} < 0``, it means ``pool-num-replicas - {num}``.
-
-:Purpose: A component of the rule. Usage removes the need to select a device using two steps.
-:Prerequisite: Follows ``step take`` or ``step choose``.
-:Example: ``step chooseleaf firstn 0 type row``
-
-
-
-``step emit``
-
-:Description: Outputs the current value and empties the stack. Typically used
- at the end of a rule, but may also be used to pick from different
- trees in the same rule.
-
-:Purpose: A component of the rule.
-:Prerequisite: Follows ``step choose``.
-:Example: ``step emit``
-
-.. important:: To activate one or more rules with a common ruleset number to a
- pool, set the ruleset number of the pool.
-
-
-
-Primary Affinity
-================
-
-When a Ceph Client reads or writes data, it always contacts the primary OSD in
-the acting set. For set ``[2, 3, 4]``, ``osd.2`` is the primary. Sometimes an
-OSD isn't well suited to act as a primary compared to other OSDs (e.g., it has
-a slow disk or a slow controller). To prevent performance bottlenecks
-(especially on read operations) while maximizing utilization of your hardware,
-you can set a Ceph OSD's primary affinity so that CRUSH is less likely to use
-the OSD as a primary in an acting set. ::
-
- ceph osd primary-affinity <osd-id> <weight>
-
-Primary affinity is ``1`` by default (*i.e.,* an OSD may act as a primary). You
-may set the OSD primary range from ``0-1``, where ``0`` means that the OSD may
-**NOT** be used as a primary and ``1`` means that an OSD may be used as a
-primary. When the weight is ``< 1``, it is less likely that CRUSH will select
-the Ceph OSD Daemon to act as a primary.
-
+Weight sets
+------------
-Placing Different Pools on Different OSDS:
-==========================================
-
-Suppose you want to have most pools default to OSDs backed by large hard drives,
-but have some pools mapped to OSDs backed by fast solid-state drives (SSDs).
-It's possible to have multiple independent CRUSH hierarchies within the same
-CRUSH map. Define two hierarchies with two different root nodes--one for hard
-disks (e.g., "root platter") and one for SSDs (e.g., "root ssd") as shown
-below::
-
- device 0 osd.0
- device 1 osd.1
- device 2 osd.2
- device 3 osd.3
- device 4 osd.4
- device 5 osd.5
- device 6 osd.6
- device 7 osd.7
-
- host ceph-osd-ssd-server-1 {
- id -1
- alg straw
- hash 0
- item osd.0 weight 1.00
- item osd.1 weight 1.00
- }
-
- host ceph-osd-ssd-server-2 {
- id -2
- alg straw
- hash 0
- item osd.2 weight 1.00
- item osd.3 weight 1.00
- }
-
- host ceph-osd-platter-server-1 {
- id -3
- alg straw
- hash 0
- item osd.4 weight 1.00
- item osd.5 weight 1.00
- }
-
- host ceph-osd-platter-server-2 {
- id -4
- alg straw
- hash 0
- item osd.6 weight 1.00
- item osd.7 weight 1.00
- }
-
- root platter {
- id -5
- alg straw
- hash 0
- item ceph-osd-platter-server-1 weight 2.00
- item ceph-osd-platter-server-2 weight 2.00
- }
-
- root ssd {
- id -6
- alg straw
- hash 0
- item ceph-osd-ssd-server-1 weight 2.00
- item ceph-osd-ssd-server-2 weight 2.00
- }
-
- rule data {
- ruleset 0
- type replicated
- min_size 2
- max_size 2
- step take platter
- step chooseleaf firstn 0 type host
- step emit
- }
-
- rule metadata {
- ruleset 1
- type replicated
- min_size 0
- max_size 10
- step take platter
- step chooseleaf firstn 0 type host
- step emit
- }
-
- rule rbd {
- ruleset 2
- type replicated
- min_size 0
- max_size 10
- step take platter
- step chooseleaf firstn 0 type host
- step emit
- }
-
- rule platter {
- ruleset 3
- type replicated
- min_size 0
- max_size 10
- step take platter
- step chooseleaf firstn 0 type host
- step emit
- }
-
- rule ssd {
- ruleset 4
- type replicated
- min_size 0
- max_size 4
- step take ssd
- step chooseleaf firstn 0 type host
- step emit
- }
-
- rule ssd-primary {
- ruleset 5
- type replicated
- min_size 5
- max_size 10
- step take ssd
- step chooseleaf firstn 1 type host
- step emit
- step take platter
- step chooseleaf firstn -1 type host
- step emit
- }
-
-You can then set a pool to use the SSD rule by::
-
- ceph osd pool set <poolname> crush_ruleset 4
-
-Similarly, using the ``ssd-primary`` rule will cause each placement group in the
-pool to be placed with an SSD as the primary and platters as the replicas.
+A *weight set* is an alternative set of weights to use when
+calculating data placement. The normal weights associated with each
+device in the CRUSH map are set based on the device size and indicate
+how much data we *should* be storing where. However, because CRUSH is
+based on a pseudorandom placement process, there is always some
+variation from this ideal distribution, the same way that rolling a
+die sixty times will not result in rolling exactly 10 ones and 10
+sixes. Weight sets allow the cluster to do a numerical optimization
+based on the specifics of your cluster (hierarchy, pools, etc.) to achieve
+a balanced distribution.
+
+There are two types of weight sets supported:
+
+ #. A **compat** weight set is a single alternative set of weights for
+ each device and node in the cluster. This is not well-suited for
+ correcting for all anomalies (for example, placement groups for
+ different pools may be different sizes and have different load
+ levels, but will be mostly treated the same by the balancer).
+ However, compat weight sets have the huge advantage that they are
+ *backward compatible* with previous versions of Ceph, which means
+ that even though weight sets were first introduced in Luminous
+ v12.2.z, older clients (e.g., firefly) can still connect to the
+ cluster when a compat weight set is being used to balance data.
+ #. A **per-pool** weight set is more flexible in that it allows
+ placement to be optimized for each data pool. Additionally,
+ weights can be adjusted for each position of placement, allowing
+ the optimizer to correct for a subtle skew of data toward devices
+ with small weights relative to their peers (an effect that is
+ usually only apparent in very large clusters but which can cause
+ balancing problems).
+
+When weight sets are in use, the weights associated with each node in
+the hierarchy are visible as a separate column (labeled either
+``(compat)`` or the pool name) from the command::
+
+ ceph osd crush tree
+
+When both *compat* and *per-pool* weight sets are in use, data
+placement for a particular pool will use its own per-pool weight set
+if present. If not, it will use the compat weight set if present. If
+neither is present, it will use the normal CRUSH weights.
+
+Although weight sets can be set up and manipulated by hand, it is
+recommended that the *balancer* module be enabled to do so
+automatically.
+
+
+Modifying the CRUSH map
+=======================
.. _addosd:
Add/Move an OSD
-===============
+---------------
-To add or move an OSD in the CRUSH map of a running cluster, execute the
-``ceph osd crush set``. For Argonaut (v 0.48), execute the following::
+.. note:: OSDs are normally automatically added to the CRUSH map when
+ the OSD is created. This command is rarely needed.
- ceph osd crush set {id} {name} {weight} pool={pool-name} [{bucket-type}={bucket-name} ...]
-
-For Bobtail (v 0.56), execute the following::
+To add or move an OSD in the CRUSH map of a running cluster::
- ceph osd crush set {id-or-name} {weight} root={pool-name} [{bucket-type}={bucket-name} ...]
+ ceph osd crush set {name} {weight} root={root} [{bucket-type}={bucket-name} ...]
Where:
-``id``
-
-:Description: The numeric ID of the OSD.
-:Type: Integer
-:Required: Yes
-:Example: ``0``
-
-
``name``
:Description: The full name of the OSD.
``weight``
-:Description: The CRUSH weight for the OSD.
+:Description: The CRUSH weight for the OSD, normally its size measured in terabytes (TB).
:Type: Double
:Required: Yes
:Example: ``2.0``
``root``
-:Description: The root of the tree in which the OSD resides.
+:Description: The root node of the tree in which the OSD resides (normally ``default``)
:Type: Key/value pair.
:Required: Yes
:Example: ``root=default``
:Example: ``datacenter=dc1 room=room1 row=foo rack=bar host=foo-bar-1``
-The following example adds ``osd.0`` to the hierarchy, or moves the OSD from a
-previous location. ::
+The following example adds ``osd.0`` to the hierarchy, or moves the
+OSD from a previous location. ::
- ceph osd crush set osd.0 1.0 root=default datacenter=dc1 room=room1 row=foo rack=bar host=foo-bar-1
+ ceph osd crush set osd.0 1.0 root=default datacenter=dc1 room=room1 row=foo rack=bar host=foo-bar-1
-Adjust an OSD's CRUSH Weight
-============================
+Adjust OSD weight
+-----------------
+
+.. note:: Normally OSDs automatically add themselves to the CRUSH map
+ with the correct weight when they are created. This command
+ is rarely needed.
To adjust an OSD's crush weight in the CRUSH map of a running cluster, execute
the following::
- ceph osd crush reweight {name} {weight}
+ ceph osd crush reweight {name} {weight}
Where:
.. _removeosd:
Remove an OSD
-=============
+-------------
+
+.. note:: OSDs are normally removed from the CRUSH map as part of the
+ ``ceph osd purge`` command. This command is rarely needed.
-To remove an OSD from the CRUSH map of a running cluster, execute the following::
+To remove an OSD from the CRUSH map of a running cluster, execute the
+following::
- ceph osd crush remove {name}
+ ceph osd crush remove {name}
Where:
:Required: Yes
:Example: ``osd.0``
+
Add a Bucket
-============
+------------
+
+.. note:: Buckets are normally implicitly created when an OSD is added
+ that specifies a ``{bucket-type}={bucket-name}`` as part of its
+ location and a bucket with that name does not already exist. This
+ command is typically used when manually adjusting the structure of the
+ hierarchy after OSDs have been created (for example, to move a
+ series of hosts underneath a new rack-level bucket).
-To add a bucket in the CRUSH map of a running cluster, execute the ``ceph osd crush add-bucket`` command::
+To add a bucket in the CRUSH map of a running cluster, execute the
+``ceph osd crush add-bucket`` command::
- ceph osd crush add-bucket {bucket-name} {bucket-type}
+ ceph osd crush add-bucket {bucket-name} {bucket-type}
Where:
The following example adds the ``rack12`` bucket to the hierarchy::
- ceph osd crush add-bucket rack12 rack
+ ceph osd crush add-bucket rack12 rack
Move a Bucket
-=============
+-------------
-To move a bucket to a different location or position in the CRUSH map hierarchy,
-execute the following::
+To move a bucket to a different location or position in the CRUSH map
+hierarchy, execute the following::
- ceph osd crush move {bucket-name} {bucket-type}={bucket-name}, [...]
+ ceph osd crush move {bucket-name} {bucket-type}={bucket-name}, [...]
Where:
:Example: ``datacenter=dc1 room=room1 row=foo rack=bar host=foo-bar-1``
Remove a Bucket
-===============
+---------------
To remove a bucket from the CRUSH map hierarchy, execute the following::
- ceph osd crush remove {bucket-name}
+ ceph osd crush remove {bucket-name}
.. note:: A bucket must be empty before removing it from the CRUSH hierarchy.
The following example removes the ``rack12`` bucket from the hierarchy::
- ceph osd crush remove rack12
+ ceph osd crush remove rack12
+
+Creating a compat weight set
+----------------------------
+
+.. note:: This step is normally done automatically by the ``balancer``
+ module when enabled.
+
+To create a *compat* weight set::
+
+ ceph osd crush weight-set create-compat
+
+Weights for the compat weight set can be adjusted with::
+
+ ceph osd crush weight-set reweight-compat {name} {weight}
+
+The compat weight set can be destroyed with::
+
+ ceph osd crush weight-set rm-compat
+
+Creating per-pool weight sets
+-----------------------------
+
+To create a weight set for a specific pool::
+
+ ceph osd crush weight-set create {pool-name} {mode}
+
+.. note:: Per-pool weight sets require that all servers and daemons
+ run Luminous v12.2.z or later.
+
+Where:
+
+``pool-name``
+
+:Description: The name of a RADOS pool
+:Type: String
+:Required: Yes
+:Example: ``rbd``
+
+``mode``
+
+:Description: Either ``flat`` or ``positional``. A *flat* weight set
+ has a single weight for each device or bucket. A
+ *positional* weight set has a potentially different
+ weight for each position in the resulting placement
+ mapping. For example, if a pool has a replica count of
+ 3, then a positional weight set will have three weights
+ for each device and bucket.
+:Type: String
+:Required: Yes
+:Example: ``flat``
+
+To adjust the weight of an item in a weight set::
+
+ ceph osd crush weight-set reweight {pool-name} {item-name} {weight [...]}
+
+To list existing weight sets::
+
+ ceph osd crush weight-set ls
+
+To remove a weight set::
+
+ ceph osd crush weight-set rm {pool-name}
+
+Creating a rule for a replicated pool
+-------------------------------------
+
+For a replicated pool, the primary decision when creating the CRUSH
+rule is what the failure domain is going to be. For example, if a
+failure domain of ``host`` is selected, then CRUSH will ensure that
+each replica of the data is stored on a different host. If ``rack``
+is selected, then each replica will be stored in a different rack.
+What failure domain you choose primarily depends on the size of your
+cluster and how your hierarchy is structured.
+
+Normally, the entire cluster hierarchy is nested beneath a root node
+named ``default``. If you have customized your hierarchy, you may
+want to create a rule nested at some other node in the hierarchy. It
+doesn't matter what type is associated with that node (it doesn't have
+to be a ``root`` node).
+
+It is also possible to create a rule that restricts data placement to
+a specific *class* of device. By default, Ceph OSDs automatically
+classify themselves as either ``hdd`` or ``ssd``, depending on the
+underlying type of device being used. These classes can also be
+customized.
+
+To create a replicated rule::
+
+ ceph osd crush rule create-replicated {name} {root} {failure-domain-type} [{class}]
+
+Where:
+
+``name``
+
+:Description: The name of the rule
+:Type: String
+:Required: Yes
+:Example: ``rbd-rule``
+
+``root``
+
+:Description: The name of the node under which data should be placed.
+:Type: String
+:Required: Yes
+:Example: ``default``
+
+``failure-domain-type``
+
+:Description: The type of CRUSH nodes across which we should separate replicas.
+:Type: String
+:Required: Yes
+:Example: ``rack``
+
+``class``
+
+:Description: The device class data should be placed on.
+:Type: String
+:Required: No
+:Example: ``ssd``
+
+Creating a rule for an erasure coded pool
+-----------------------------------------
+
+For an erasure-coded pool, the same basic decisions need to be made as
+with a replicated pool: what is the failure domain, what node in the
+hierarchy will data be placed under (usually ``default``), and will
+placement be restricted to a specific device class. Erasure code
+pools are created a bit differently, however, because they need to be
+constructed carefully based on the erasure code being used. For this reason,
+you must include this information in the *erasure code profile*. A CRUSH
+rule will then be created from that either explicitly or automatically when
+the profile is used to create a pool.
+
+The erasure code profiles can be listed with::
+
+ ceph osd erasure-code-profile ls
+
+An existing profile can be viewed with::
+
+ ceph osd erasure-code-profile get {profile-name}
+
+Normally profiles should never be modified; instead, a new profile
+should be created and used when creating a new pool or creating a new
+rule for an existing pool.
+
+An erasure code profile consists of a set of key=value pairs. Most of
+these control the behavior of the erasure code that is encoding data
+in the pool. Those that begin with ``crush-``, however, affect the
+CRUSH rule that is created.
+
+The erasure code profile properties of interest are:
+
+ * **crush-root**: the name of the CRUSH node to place data under [default: ``default``].
+ * **crush-failure-domain**: the CRUSH type to separate erasure-coded shards across [default: ``host``].
+ * **crush-device-class**: the device class to place data on [default: none, meaning all devices are used].
+ * **k** and **m** (and, for the ``lrc`` plugin, **l**): these determine the number of erasure code shards, affecting the resulting CRUSH rule.
+
+Once a profile is defined, you can create a CRUSH rule with::
+
+ ceph osd crush rule create-erasure {name} {profile-name}
+
+.. note:: When creating a new pool, it is not actually necessary to
+ explicitly create the rule. If the erasure code profile alone is
+ specified and the rule argument is left off then Ceph will create
+ the CRUSH rule automatically.
+
+Deleting rules
+--------------
+
+Rules that are not in use by pools can be deleted with::
+
+ ceph osd crush rule rm {rule-name}
+
Tunables
========
* ``argonaut``: the legacy values supported by the original argonaut release
* ``bobtail``: the values supported by the bobtail release
* ``firefly``: the values supported by the firefly release
+ * ``hammer``: the values supported by the hammer release
+ * ``jewel``: the values supported by the jewel release
* ``optimal``: the best (ie optimal) values of the current version of Ceph
* ``default``: the default values of a new cluster installed from
scratch. These values, which depend on the current version of Ceph,
Note that this may result in some data movement.
-Tuning CRUSH, the hard way
---------------------------
-
-If you can ensure that all clients are running recent code, you can
-adjust the tunables by extracting the CRUSH map, modifying the values,
-and reinjecting it into the cluster.
-
-* Extract the latest CRUSH map::
-
- ceph osd getcrushmap -o /tmp/crush
-
-* Adjust tunables. These values appear to offer the best behavior
- for both large and small clusters we tested with. You will need to
- additionally specify the ``--enable-unsafe-tunables`` argument to
- ``crushtool`` for this to work. Please use this option with
- extreme care.::
+.. _CRUSH - Controlled, Scalable, Decentralized Placement of Replicated Data: https://ceph.com/wp-content/uploads/2016/08/weil-crush-sc06.pdf
- crushtool -i /tmp/crush --set-choose-local-tries 0 --set-choose-local-fallback-tries 0 --set-choose-total-tries 50 -o /tmp/crush.new
-* Reinject modified map::
+Primary Affinity
+================
- ceph osd setcrushmap -i /tmp/crush.new
+When a Ceph Client reads or writes data, it always contacts the primary OSD in
+the acting set. For set ``[2, 3, 4]``, ``osd.2`` is the primary. Sometimes an
+OSD is not well suited to act as a primary compared to other OSDs (e.g., it has
+a slow disk or a slow controller). To prevent performance bottlenecks
+(especially on read operations) while maximizing utilization of your hardware,
+you can set a Ceph OSD's primary affinity so that CRUSH is less likely to use
+the OSD as a primary in an acting set. ::
-Legacy values
--------------
+ ceph osd primary-affinity <osd-id> <weight>
-For reference, the legacy values for the CRUSH tunables can be set
-with::
+Primary affinity is ``1`` by default (*i.e.,* an OSD may act as a primary). You
+may set the OSD primary range from ``0-1``, where ``0`` means that the OSD may
+**NOT** be used as a primary and ``1`` means that an OSD may be used as a
+primary. When the weight is ``< 1``, it is less likely that CRUSH will select
+the Ceph OSD Daemon to act as a primary.
- crushtool -i /tmp/crush --set-choose-local-tries 2 --set-choose-local-fallback-tries 5 --set-choose-total-tries 19 --set-chooseleaf-descend-once 0 --set-chooseleaf-vary-r 0 -o /tmp/crush.legacy
-Again, the special ``--enable-unsafe-tunables`` option is required.
-Further, as noted above, be careful running old versions of the
-``ceph-osd`` daemon after reverting to legacy values as the feature
-bit is not perfectly enforced.
-.. _CRUSH - Controlled, Scalable, Decentralized Placement of Replicated Data: https://ceph.com/wp-content/uploads/2016/08/weil-crush-sc06.pdf
--- /dev/null
+
+=============
+Health checks
+=============
+
+Overview
+========
+
+There is a finite set of possible health messages that a Ceph cluster can
+raise -- these are defined as *health checks* which have unique identifiers.
+
+The identifier is a terse pseudo-human-readable (i.e. like a variable name)
+string. It is intended to enable tools (such as UIs) to make sense of
+health checks, and present them in a way that reflects their meaning.
+
+This page lists the health checks that are raised by the monitor and manager
+daemons. In addition to these, you may also see health checks that originate
+from MDS daemons (see :doc:`/cephfs/health-messages`), and health checks
+that are defined by ceph-mgr python modules.
+
+Definitions
+===========
+
+
+OSDs
+----
+
+OSD_DOWN
+________
+
+One or more OSDs are marked down. The ceph-osd daemon may have been
+stopped, or peer OSDs may be unable to reach the OSD over the network.
+Common causes include a stopped or crashed daemon, a down host, or a
+network outage.
+
+Verify the host is healthy, the daemon is started, and network is
+functioning. If the daemon has crashed, the daemon log file
+(``/var/log/ceph/ceph-osd.*``) may contain debugging information.
+
+OSD_<crush type>_DOWN
+_____________________
+
+(e.g. OSD_HOST_DOWN, OSD_ROOT_DOWN)
+
+All the OSDs within a particular CRUSH subtree are marked down, for example
+all OSDs on a host.
+
+OSD_ORPHAN
+__________
+
+An OSD is referenced in the CRUSH map hierarchy but does not exist.
+
+The OSD can be removed from the CRUSH hierarchy with::
+
+ ceph osd crush rm osd.<id>
+
+OSD_OUT_OF_ORDER_FULL
+_____________________
+
+The utilization thresholds for `nearfull`, `backfillfull`, `full`,
+and/or `failsafe_full` are not ascending. In particular, we expect
+`nearfull < backfillfull`, `backfillfull < full`, and `full <
+failsafe_full`.
+
+The thresholds can be adjusted with::
+
+ ceph osd set-backfillfull-ratio <ratio>
+ ceph osd set-nearfull-ratio <ratio>
+ ceph osd set-full-ratio <ratio>
+
+
+OSD_FULL
+________
+
+One or more OSDs has exceeded the `full` threshold and is preventing
+the cluster from servicing writes.
+
+Utilization by pool can be checked with::
+
+ ceph df
+
+The currently defined `full` ratio can be seen with::
+
+ ceph osd dump | grep full_ratio
+
+A short-term workaround to restore write availability is to raise the full
+threshold by a small amount::
+
+ ceph osd set-full-ratio <ratio>
+
+New storage should be added to the cluster by deploying more OSDs or
+existing data should be deleted in order to free up space.
+
+OSD_BACKFILLFULL
+________________
+
+One or more OSDs has exceeded the `backfillfull` threshold, which will
+prevent data from being allowed to rebalance to this device. This is
+an early warning that rebalancing may not be able to complete and that
+the cluster is approaching full.
+
+Utilization by pool can be checked with::
+
+ ceph df
+
+OSD_NEARFULL
+____________
+
+One or more OSDs has exceeded the `nearfull` threshold. This is an early
+warning that the cluster is approaching full.
+
+Utilization by pool can be checked with::
+
+ ceph df
+
+OSDMAP_FLAGS
+____________
+
+One or more cluster flags of interest has been set. These flags include:
+
+* *full* - the cluster is flagged as full and cannot service writes
+* *pauserd*, *pausewr* - paused reads or writes
+* *noup* - OSDs are not allowed to start
+* *nodown* - OSD failure reports are being ignored, such that the
+ monitors will not mark OSDs `down`
+* *noin* - OSDs that were previously marked `out` will not be marked
+ back `in` when they start
+* *noout* - down OSDs will not automatically be marked out after the
+ configured interval
+* *nobackfill*, *norecover*, *norebalance* - recovery or data
+ rebalancing is suspended
+* *noscrub*, *nodeep_scrub* - scrubbing is disabled
+* *notieragent* - cache tiering activity is suspended
+
+With the exception of *full*, these flags can be set or cleared with::
+
+ ceph osd set <flag>
+ ceph osd unset <flag>
+
+OSD_FLAGS
+_________
+
+One or more OSDs has a per-OSD flag of interest set. These flags include:
+
+* *noup*: OSD is not allowed to start
+* *nodown*: failure reports for this OSD will be ignored
+* *noin*: if this OSD was previously marked `out` automatically
+ after a failure, it will not be marked in when it starts
+* *noout*: if this OSD is down it will not automatically be marked
+ `out` after the configured interval
+
+Per-OSD flags can be set and cleared with::
+
+ ceph osd add-<flag> <osd-id>
+ ceph osd rm-<flag> <osd-id>
+
+For example, ::
+
+ ceph osd rm-nodown osd.123
+
+OLD_CRUSH_TUNABLES
+__________________
+
+The CRUSH map is using very old settings and should be updated. The
+oldest tunables that can be used (i.e., the oldest client version that
+can connect to the cluster) without triggering this health warning is
+determined by the ``mon_crush_min_required_version`` config option.
+See the Tunables section of :doc:`/rados/operations/crush-map` for more information.
+
+OLD_CRUSH_STRAW_CALC_VERSION
+____________________________
+
+The CRUSH map is using an older, non-optimal method for calculating
+intermediate weight values for ``straw`` buckets.
+
+The CRUSH map should be updated to use the newer method
+(``straw_calc_version=1``). See
+the Tunables section of :doc:`/rados/operations/crush-map` for more information.
+
+CACHE_POOL_NO_HIT_SET
+_____________________
+
+One or more cache pools is not configured with a *hit set* to track
+utilization, which will prevent the tiering agent from identifying
+cold objects to flush and evict from the cache.
+
+Hit sets can be configured on the cache pool with::
+
+ ceph osd pool set <poolname> hit_set_type <type>
+ ceph osd pool set <poolname> hit_set_period <period-in-seconds>
+ ceph osd pool set <poolname> hit_set_count <number-of-hitsets>
+ ceph osd pool set <poolname> hit_set_fpp <target-false-positive-rate>
+
+OSD_NO_SORTBITWISE
+__________________
+
+No pre-luminous v12.y.z OSDs are running but the ``sortbitwise`` flag has not
+been set.
+
+The ``sortbitwise`` flag must be set before luminous v12.y.z or newer
+OSDs can start. You can safely set the flag with::
+
+ ceph osd set sortbitwise
+
+POOL_FULL
+_________
+
+One or more pools has reached its quota and is no longer allowing writes.
+
+Pool quotas and utilization can be seen with::
+
+ ceph df detail
+
+You can either raise the pool quota with::
+
+ ceph osd pool set-quota <poolname> max_objects <num-objects>
+ ceph osd pool set-quota <poolname> max_bytes <num-bytes>
+
+or delete some existing data to reduce utilization.
+
+
+Data health (pools & placement groups)
+--------------------------------------
+
+PG_AVAILABILITY
+_______________
+
+Data availability is reduced, meaning that the cluster is unable to
+service potential read or write requests for some data in the cluster.
+Specifically, one or more PGs is in a state that does not allow IO
+requests to be serviced. Problematic PG states include *peering*,
+*stale*, *incomplete*, and the lack of *active* (if those conditions do not clear
+quickly).
+
+Detailed information about which PGs are affected is available from::
+
+ ceph health detail
+
+In most cases the root cause is that one or more OSDs is currently
+down; see the discussion for ``OSD_DOWN`` above.
+
+The state of specific problematic PGs can be queried with::
+
+ ceph tell <pgid> query
+
+PG_DEGRADED
+___________
+
+Data redundancy is reduced for some data, meaning the cluster does not
+have the desired number of replicas for all data (for replicated
+pools) or erasure code fragments (for erasure coded pools).
+Specifically, one or more PGs:
+
+* has the *degraded* or *undersized* flag set, meaning there are not
+ enough instances of that placement group in the cluster;
+* has not had the *clean* flag set for some time.
+
+Detailed information about which PGs are affected is available from::
+
+ ceph health detail
+
+In most cases the root cause is that one or more OSDs is currently
+down; see the discussion for ``OSD_DOWN`` above.
+
+The state of specific problematic PGs can be queried with::
+
+ ceph tell <pgid> query
+
+
+PG_DEGRADED_FULL
+________________
+
+Data redundancy may be reduced or at risk for some data due to a lack
+of free space in the cluster. Specifically, one or more PGs has the
+*backfill_toofull* or *recovery_toofull* flag set, meaning that the
+cluster is unable to migrate or recover data because one or more OSDs
+is above the *backfillfull* threshold.
+
+See the discussion for *OSD_BACKFILLFULL* or *OSD_FULL* above for
+steps to resolve this condition.
+
+PG_DAMAGED
+__________
+
+Data scrubbing has discovered some problems with data consistency in
+the cluster. Specifically, one or more PGs has the *inconsistent* or
+*snaptrim_error* flag set, indicating an earlier scrub operation
+found a problem, or has the *repair* flag set, meaning a repair
+for such an inconsistency is currently in progress.
+
+See :doc:`pg-repair` for more information.
+
+OSD_SCRUB_ERRORS
+________________
+
+Recent OSD scrubs have uncovered inconsistencies. This error is generally
+paired with *PG_DAMAGED* (see above).
+
+See :doc:`pg-repair` for more information.
+
+CACHE_POOL_NEAR_FULL
+____________________
+
+A cache tier pool is nearly full. Full in this context is determined
+by the ``target_max_bytes`` and ``target_max_objects`` properties on
+the cache pool. Once the pool reaches the target threshold, write
+requests to the pool may block while data is flushed and evicted
+from the cache, a state that normally leads to very high latencies and
+poor performance.
+
+The cache pool target size can be adjusted with::
+
+ ceph osd pool set <cache-pool-name> target_max_bytes <bytes>
+ ceph osd pool set <cache-pool-name> target_max_objects <objects>
+
+Normal cache flush and evict activity may also be throttled due to reduced
+availability or performance of the base tier, or overall cluster load.
+
+TOO_FEW_PGS
+___________
+
+The number of PGs in use in the cluster is below the configurable
+threshold of ``mon_pg_warn_min_per_osd`` PGs per OSD. This can lead
+to suboptimal distribution and balance of data across the OSDs in
+the cluster, and similarly reduce overall performance.
+
+This may be an expected condition if data pools have not yet been
+created.
+
+The PG count for existing pools can be increased or new pools can be
+created. Please refer to
+:doc:`placement-groups#Choosing-the-number-of-Placement-Groups` for
+more information.
+
+TOO_MANY_PGS
+____________
+
+The number of PGs in use in the cluster is above the configurable
+threshold of ``mon_pg_warn_max_per_osd`` PGs per OSD. This can lead
+to higher memory utilization for OSD daemons, slower peering after
+cluster state changes (like OSD restarts, additions, or removals), and
+higher load on the Manager and Monitor daemons.
+
+The ``pg_num`` value for existing pools cannot currently be reduced.
+However, the ``pgp_num`` value can, which effectively collocates some
+PGs on the same sets of OSDs, mitigating some of the negative impacts
+described above. The ``pgp_num`` value can be adjusted with::
+
+ ceph osd pool set <pool> pgp_num <value>
+
+Please refer to
+:doc:`placement-groups#Choosing-the-number-of-Placement-Groups` for
+more information.
+
+SMALLER_PGP_NUM
+_______________
+
+One or more pools has a ``pgp_num`` value less than ``pg_num``. This
+is normally an indication that the PG count was increased without
+also increasing the placement behavior.
+
+This is sometimes done deliberately to separate out the `split` step
+when the PG count is adjusted from the data migration that is needed
+when ``pgp_num`` is changed.
+
+This is normally resolved by setting ``pgp_num`` to match ``pg_num``,
+triggering the data migration, with::
+
+ ceph osd pool set <pool> pgp_num <pg-num-value>
+
+
+MANY_OBJECTS_PER_PG
+___________________
+
+One or more pools has an average number of objects per PG that is
+significantly higher than the overall cluster average. The specific
+threshold is controlled by the ``mon_pg_warn_max_object_skew``
+configuration value.
+
+This is usually an indication that the pool(s) containing most of the
+data in the cluster have too few PGs, and/or that other pools that do
+not contain as much data have too many PGs. See the discussion of
+*TOO_MANY_PGS* above.
+
+The threshold can be raised to silence the health warning by adjusting
+the ``mon_pg_warn_max_object_skew`` config option on the monitors.
+
+POOL_APP_NOT_ENABLED
+____________________
+
+A pool exists that contains one or more objects but has not been
+tagged for use by a particular application.
+
+Resolve this warning by labeling the pool for use by an application. For
+example, if the pool is used by RBD::
+
+ rbd pool init <poolname>
+
+If the pool is being used by a custom application 'foo', you can also label
+via the low-level command::
+
+ ceph osd pool application enable <poolname> foo
+
+For more information, see :doc:`pools.rst#associate-pool-to-application`.
+
+POOL_FULL
+_________
+
+One or more pools has reached (or is very close to reaching) its
+quota. The threshold to trigger this error condition is controlled by
+the ``mon_pool_quota_crit_threshold`` configuration option.
+
+Pool quotas can be adjusted up or down (or removed) with::
+
+ ceph osd pool set-quota <pool> max_bytes <bytes>
+ ceph osd pool set-quota <pool> max_objects <objects>
+
+Setting the quota value to 0 will disable the quota.
+
+POOL_NEAR_FULL
+______________
+
+One or more pools is approaching its quota. The threshold to trigger
+this warning condition is controlled by the
+``mon_pool_quota_warn_threshold`` configuration option.
+
+Pool quotas can be adjusted up or down (or removed) with::
+
+ ceph osd pool set-quota <pool> max_bytes <bytes>
+ ceph osd pool set-quota <pool> max_objects <objects>
+
+Setting the quota value to 0 will disable the quota.
+
+OBJECT_MISPLACED
+________________
+
+One or more objects in the cluster is not stored on the node the
+cluster would like it to be stored on. This is an indication that
+data migration due to some recent cluster change has not yet completed.
+
+Misplaced data is not a dangerous condition in and of itself; data
+consistency is never at risk, and old copies of objects are never
+removed until the desired number of new copies (in the desired
+locations) are present.
+
+OBJECT_UNFOUND
+______________
+
+One or more objects in the cluster cannot be found. Specifically, the
+OSDs know that a new or updated copy of an object should exist, but a
+copy of that version of the object has not been found on OSDs that are
+currently online.
+
+Read or write requests to unfound objects will block.
+
+Ideally, a down OSD can be brought back online that has the more
+recent copy of the unfound object. Candidate OSDs can be identified from the
+peering state for the PG(s) responsible for the unfound object::
+
+ ceph tell <pgid> query
+
+If the latest copy of the object is not available, the cluster can be
+told to roll back to a previous version of the object. See
+:doc:`troubleshooting-pg#Unfound-objects` for more information.
+
+REQUEST_SLOW
+____________
+
+One or more OSD requests is taking a long time to process. This can
+be an indication of extreme load, a slow storage device, or a software
+bug.
+
+The request queue on the OSD(s) in question can be queried with the
+following command, executed from the OSD host::
+
+ ceph daemon osd.<id> ops
+
+A summary of the slowest recent requests can be seen with::
+
+ ceph daemon osd.<id> dump_historic_ops
+
+The location of an OSD can be found with::
+
+ ceph osd find osd.<id>
+
+REQUEST_STUCK
+_____________
+
+One or more OSD requests has been blocked for an extremely long time.
+This is an indication that either the cluster has been unhealthy for
+an extended period of time (e.g., not enough running OSDs) or there is
+some internal problem with the OSD. See the discussion of
+*REQUEST_SLOW* above.
+
+PG_NOT_SCRUBBED
+_______________
+
+One or more PGs has not been scrubbed recently. PGs are normally
+scrubbed every ``mon_scrub_interval`` seconds, and this warning
+triggers when ``mon_warn_not_scrubbed`` such intervals have elapsed
+without a scrub.
+
+PGs will not scrub if they are not flagged as *clean*, which may
+happen if they are misplaced or degraded (see *PG_AVAILABILITY* and
+*PG_DEGRADED* above).
+
+You can manually initiate a scrub of a clean PG with::
+
+ ceph pg scrub <pgid>
+
+PG_NOT_DEEP_SCRUBBED
+____________________
+
+One or more PGs has not been deep scrubbed recently. PGs are normally
+deep scrubbed every ``osd_deep_scrub_interval`` seconds, and this warning
+triggers when ``mon_warn_not_deep_scrubbed`` such intervals have elapsed
+without a deep scrub.
+
+PGs will not (deep) scrub if they are not flagged as *clean*, which may
+happen if they are misplaced or degraded (see *PG_AVAILABILITY* and
+*PG_DEGRADED* above).
+
+You can manually initiate a deep scrub of a clean PG with::
+
+ ceph pg deep-scrub <pgid>
+
+CephFS
+------
+
+FS_WITH_FAILED_MDS
+__________________
+
+
+FS_DEGRADED
+___________
+
+
+MDS_INSUFFICIENT_STANDBY
+________________________
+
+
+MDS_DAMAGED
+___________
+
+
:maxdepth: 1
operating
+ health-checks
monitoring
monitoring-osd-pg
user-management
erasure-code
cache-tiering
placement-groups
+ upmap
crush-map
+ crush-map-edits
finding the `placement group`_ and the underlying OSDs at root of the problem.
.. tip:: A fault in one part of the cluster may prevent you from accessing a
- particular object, but that doesn't mean that you can't access other objects.
+ particular object, but that doesn't mean that you cannot access other objects.
When you run into a fault, don't panic. Just follow the steps for monitoring
your OSDs and placement groups. Then, begin troubleshooting.
If the number of OSDs that are ``in`` the cluster is more than the number of
OSDs that are ``up``, execute the following command to identify the ``ceph-osd``
-daemons that aren't running::
+daemons that are not running::
ceph osd tree
few cases:
- You are reaching your ``near full ratio`` or ``full ratio``.
-- Your data isn't getting distributed across the cluster due to an
+- Your data is not getting distributed across the cluster due to an
error in your CRUSH configuration.
current state. During that time period, the OSD may reflect a ``recovering``
state.
-Recovery isn't always trivial, because a hardware failure might cause a
+Recovery is not always trivial, because a hardware failure might cause a
cascading failure of multiple OSDs. For example, a network switch for a rack or
cabinet may fail, which can cause the OSDs of a number of host machines to fall
behind the current state of the cluster. Each one of the OSDs must recover once
requests when it is ready.
During the backfill operations, you may see one of several states:
-``backfill_wait`` indicates that a backfill operation is pending, but isn't
+``backfill_wait`` indicates that a backfill operation is pending, but is not
underway yet; ``backfill`` indicates that a backfill operation is underway;
and, ``backfill_too_full`` indicates that a backfill operation was requested,
but couldn't be completed due to insufficient storage capacity. When a
-placement group can't be backfilled, it may be considered ``incomplete``.
+placement group cannot be backfilled, it may be considered ``incomplete``.
Ceph provides a number of settings to manage the load spike associated with
reassigning placement groups to an OSD (especially a new OSD). By default,
-----
While Ceph uses heartbeats to ensure that hosts and daemons are running, the
-``ceph-osd`` daemons may also get into a ``stuck`` state where they aren't
+``ceph-osd`` daemons may also get into a ``stuck`` state where they are not
reporting statistics in a timely manner (e.g., a temporary network fault). By
default, OSD daemons report their placement group, up thru, boot and failure
statistics every half second (i.e., ``0.5``), which is more frequent than the
Identifying Troubled PGs
========================
-As previously noted, a placement group isn't necessarily problematic just
-because its state isn't ``active+clean``. Generally, Ceph's ability to self
+As previously noted, a placement group is not necessarily problematic just
+because its state is not ``active+clean``. Generally, Ceph's ability to self
repair may not be working when placement groups get stuck. The stuck states
include:
cluster. Monitoring a cluster typically involves checking OSD status, monitor
status, placement group status and metadata server status.
-Interactive Mode
-================
+Using the command line
+======================
+
+Interactive mode
+----------------
To run the ``ceph`` tool in interactive mode, type ``ceph`` at the command line
with no arguments. For example::
ceph> status
ceph> quorum_status
ceph> mon_status
-
-Checking Cluster Health
-=======================
-
-After you start your cluster, and before you start reading and/or
-writing data, check your cluster's health first. You can check on the
-health of your Ceph cluster with the following::
-
- ceph health
+Non-default paths
+-----------------
If you specified non-default locations for your configuration or keyring,
you may specify their locations::
ceph -c /path/to/conf -k /path/to/keyring health
-Upon starting the Ceph cluster, you will likely encounter a health
-warning such as ``HEALTH_WARN XXX num placement groups stale``. Wait a few moments and check
-it again. When your cluster is ready, ``ceph health`` should return a message
-such as ``HEALTH_OK``. At that point, it is okay to begin using the cluster.
+Checking a Cluster's Status
+===========================
+
+After you start your cluster, and before you start reading and/or
+writing data, check your cluster's status first.
-Watching a Cluster
-==================
+To check a cluster's status, execute the following::
-To watch the cluster's ongoing events, open a new terminal. Then, enter::
+ ceph status
+
+Or::
- ceph -w
+ ceph -s
+
+In interactive mode, type ``status`` and press **Enter**. ::
+
+ ceph> status
+
+Ceph will print the cluster status. For example, a tiny Ceph demonstration
+cluster with one of each service may print the following:
+
+::
+
+ cluster:
+ id: 477e46f1-ae41-4e43-9c8f-72c918ab0a20
+ health: HEALTH_OK
+
+ services:
+ mon: 1 daemons, quorum a
+ mgr: x(active)
+ mds: 1/1/1 up {0=a=up:active}
+ osd: 1 osds: 1 up, 1 in
+
+ data:
+ pools: 2 pools, 16 pgs
+ objects: 21 objects, 2246 bytes
+ usage: 546 GB used, 384 GB / 931 GB avail
+ pgs: 16 active+clean
-Ceph will print each event. For example, a tiny Ceph cluster consisting of
-one monitor, and two OSDs may print the following::
-
- cluster b370a29d-9287-4ca3-ab57-3d824f65e339
- health HEALTH_OK
- monmap e1: 1 mons at {ceph1=10.0.0.8:6789/0}, election epoch 2, quorum 0 ceph1
- osdmap e63: 2 osds: 2 up, 2 in
- pgmap v41338: 952 pgs, 20 pools, 17130 MB data, 2199 objects
- 115 GB used, 167 GB / 297 GB avail
- 952 active+clean
-
- 2014-06-02 15:45:21.655871 osd.0 [INF] 17.71 deep-scrub ok
- 2014-06-02 15:45:47.880608 osd.1 [INF] 1.0 scrub ok
- 2014-06-02 15:45:48.865375 osd.1 [INF] 1.3 scrub ok
- 2014-06-02 15:45:50.866479 osd.1 [INF] 1.4 scrub ok
- 2014-06-02 15:45:01.345821 mon.0 [INF] pgmap v41339: 952 pgs: 952 active+clean; 17130 MB data, 115 GB used, 167 GB / 297 GB avail
- 2014-06-02 15:45:05.718640 mon.0 [INF] pgmap v41340: 952 pgs: 1 active+clean+scrubbing+deep, 951 active+clean; 17130 MB data, 115 GB used, 167 GB / 297 GB avail
- 2014-06-02 15:45:53.997726 osd.1 [INF] 1.5 scrub ok
- 2014-06-02 15:45:06.734270 mon.0 [INF] pgmap v41341: 952 pgs: 1 active+clean+scrubbing+deep, 951 active+clean; 17130 MB data, 115 GB used, 167 GB / 297 GB avail
- 2014-06-02 15:45:15.722456 mon.0 [INF] pgmap v41342: 952 pgs: 952 active+clean; 17130 MB data, 115 GB used, 167 GB / 297 GB avail
- 2014-06-02 15:46:06.836430 osd.0 [INF] 17.75 deep-scrub ok
- 2014-06-02 15:45:55.720929 mon.0 [INF] pgmap v41343: 952 pgs: 1 active+clean+scrubbing+deep, 951 active+clean; 17130 MB data, 115 GB used, 167 GB / 297 GB avail
-
-
-The output provides:
-
-- Cluster ID
-- Cluster health status
-- The monitor map epoch and the status of the monitor quorum
-- The OSD map epoch and the status of OSDs
-- The placement group map version
-- The number of placement groups and pools
-- The *notional* amount of data stored and the number of objects stored; and,
-- The total amount of data stored.
.. topic:: How Ceph Calculates Data Usage
- The ``used`` value reflects the *actual* amount of raw storage used. The
+ The ``usage`` value reflects the *actual* amount of raw storage used. The
``xxx GB / xxx GB`` value means the amount available (the lesser number)
of the overall storage capacity of the cluster. The notional number reflects
the size of the stored data before it is replicated, cloned or snapshotted.
storage capacity for cloning and snapshotting.
+Watching a Cluster
+==================
+
+In addition to local logging by each daemon, Ceph clusters maintain
+a *cluster log* that records high level events about the whole system.
+This is logged to disk on monitor servers (as ``/var/log/ceph/ceph.log`` by
+default), but can also be monitored via the command line.
+
+To follow the cluster log, use the following command:
+
+::
+
+ ceph -w
+
+Ceph will print the status of the system, followed by each log message as it
+is emitted. For example:
+
+::
+
+ cluster:
+ id: 477e46f1-ae41-4e43-9c8f-72c918ab0a20
+ health: HEALTH_OK
+
+ services:
+ mon: 1 daemons, quorum a
+ mgr: x(active)
+ mds: 1/1/1 up {0=a=up:active}
+ osd: 1 osds: 1 up, 1 in
+
+ data:
+ pools: 2 pools, 16 pgs
+ objects: 21 objects, 2246 bytes
+ usage: 546 GB used, 384 GB / 931 GB avail
+ pgs: 16 active+clean
+
+
+ 2017-07-24 08:15:11.329298 mon.a mon.0 172.21.9.34:6789/0 23 : cluster [INF] osd.0 172.21.9.34:6806/20527 boot
+ 2017-07-24 08:15:14.258143 mon.a mon.0 172.21.9.34:6789/0 39 : cluster [INF] Activating manager daemon x
+ 2017-07-24 08:15:15.446025 mon.a mon.0 172.21.9.34:6789/0 47 : cluster [INF] Manager daemon x is now available
+
+
+In addition to using ``ceph -w`` to print log lines as they are emitted,
+use ``ceph log last [n]`` to see the most recent ``n`` lines from the cluster
+log.
+
+Monitoring Health Checks
+========================
+
+Ceph continuously runs various *health checks* against its own status. When
+a health check fails, this is reflected in the output of ``ceph status`` (or
+``ceph health``). In addition, messages are sent to the cluster log to
+indicate when a check fails, and when the cluster recovers.
+
+For example, when an OSD goes down, the ``health`` section of the status
+output may be updated as follows:
+
+::
+
+ health: HEALTH_WARN
+ 1 osds down
+ Degraded data redundancy: 21/63 objects degraded (33.333%), 16 pgs unclean, 16 pgs degraded
+
+At this time, cluster log messages are also emitted to record the failure of the
+health checks:
+
+::
+
+ 2017-07-25 10:08:58.265945 mon.a mon.0 172.21.9.34:6789/0 91 : cluster [WRN] Health check failed: 1 osds down (OSD_DOWN)
+ 2017-07-25 10:09:01.302624 mon.a mon.0 172.21.9.34:6789/0 94 : cluster [WRN] Health check failed: Degraded data redundancy: 21/63 objects degraded (33.333%), 16 pgs unclean, 16 pgs degraded (PG_DEGRADED)
+
+When the OSD comes back online, the cluster log records the cluster's return
+to a healthy state:
+
+::
+
+ 2017-07-25 10:11:11.526841 mon.a mon.0 172.21.9.34:6789/0 109 : cluster [WRN] Health check update: Degraded data redundancy: 2 pgs unclean, 2 pgs degraded, 2 pgs undersized (PG_DEGRADED)
+ 2017-07-25 10:11:13.535493 mon.a mon.0 172.21.9.34:6789/0 110 : cluster [INF] Health check cleared: PG_DEGRADED (was: Degraded data redundancy: 2 pgs unclean, 2 pgs degraded, 2 pgs undersized)
+ 2017-07-25 10:11:13.535577 mon.a mon.0 172.21.9.34:6789/0 111 : cluster [INF] Cluster is now healthy
+
+
+Detecting configuration issues
+==============================
+
+In addition to the health checks that Ceph continuously runs on its
+own status, there are some configuration issues that may only be detected
+by an external tool.
+
+Use the `ceph-medic`_ tool to run these additional checks on your Ceph
+cluster's configuration.
+
Checking a Cluster's Usage Stats
================================
mon_osd_full_ratio.
-Checking a Cluster's Status
-===========================
-
-To check a cluster's status, execute the following::
-
- ceph status
-
-Or::
-
- ceph -s
-
-In interactive mode, type ``status`` and press **Enter**. ::
-
- ceph> status
-
-Ceph will print the cluster status. For example, a tiny Ceph cluster consisting
-of one monitor, and two OSDs may print the following::
-
- cluster b370a29d-9287-4ca3-ab57-3d824f65e339
- health HEALTH_OK
- monmap e1: 1 mons at {ceph1=10.0.0.8:6789/0}, election epoch 2, quorum 0 ceph1
- osdmap e63: 2 osds: 2 up, 2 in
- pgmap v41332: 952 pgs, 20 pools, 17130 MB data, 2199 objects
- 115 GB used, 167 GB / 297 GB avail
- 1 active+clean+scrubbing+deep
- 951 active+clean
-
Checking OSD Status
===================
.. _Viewing a Configuration at Runtime: ../../configuration/ceph-conf#ceph-runtime-config
.. _Storage Capacity: ../../configuration/mon-config-ref#storage-capacity
+.. _ceph-medic: http://docs.ceph.com/ceph-medic/master/
--- /dev/null
+Repairing PG inconsistencies
+============================
+
+
*Recovering*
Ceph is migrating/synchronizing objects and their replicas.
+*Forced-Recovery*
+ High recovery priority of that PG is enforced by user.
+
*Backfill*
Ceph is scanning and synchronizing the entire contents of a placement group
instead of inferring what contents need to be synchronized from the logs of
recent operations. *Backfill* is a special case of recovery.
+*Forced-Backfill*
+ High backfill priority of that PG is enforced by user.
+
*Wait-backfill*
The placement group is waiting in line to start backfill.
To set the number of placement groups in a pool, you must specify the
number of placement groups at the time you create the pool.
-See `Create a Pool`_ for details. Once you've set placement groups for a
+See `Create a Pool`_ for details. Once you have set placement groups for a
pool, you may increase the number of placement groups (but you cannot
decrease the number of placement groups). To increase the number of
placement groups, execute the following::
match, a final semantic sweep ensures that all of the snapshot-related object
metadata is consistent. Errors are reported via logs.
+Prioritize backfill/recovery of a Placement Group(s)
+====================================================
+
+You may run into a situation where a bunch of placement groups will require
+recovery and/or backfill, and some particular groups hold data more important
+than others (for example, those PGs may hold data for images used by running
+machines and other PGs may be used by inactive machines/less relevant data).
+In that case, you may want to prioritize recovery of those groups so
+performance and/or availability of data stored on those groups is restored
+earlier. To do this (mark particular placement group(s) as prioritized during
+backfill or recovery), execute the following::
+
+ ceph pg force-recovery {pg-id} [{pg-id #2}] [{pg-id #3} ...]
+ ceph pg force-backfill {pg-id} [{pg-id #2}] [{pg-id #3} ...]
+
+This will cause Ceph to perform recovery or backfill on specified placement
+groups first, before other placement groups. This does not interrupt currently
+ongoing backfills or recovery, but causes specified PGs to be processed
+as soon as possible. If you change your mind or prioritize the wrong groups,
+use::
+
+ ceph pg cancel-force-recovery {pg-id} [{pg-id #2}] [{pg-id #3} ...]
+ ceph pg cancel-force-backfill {pg-id} [{pg-id #2}] [{pg-id #3} ...]
+
+This will remove the "force" flag from those PGs and they will be processed
+in default order. Again, this doesn't affect placement groups that are
+currently being processed, only those that are still queued.
+
+The "force" flag is cleared automatically after recovery or backfill of the
+group is done.
Revert Lost
===========
groups in your Ceph configuration file, as the default is NOT ideal.
For details on placement group numbers refer to `setting the number of placement groups`_
+.. note:: Starting with Luminous, all pools need to be associated to the
+ application using the pool. See `Associate Pool to Application`_ below for
+ more information.
+
For example::
osd pool default pg num = 100
:Required: No.
:Default: 0, no splitting at the pool creation time.
+Associate Pool to Application
+=============================
+
+Pools need to be associated with an application before use. Pools that will be
+used with CephFS or pools that are automatically created by RGW are
+automatically associated. Pools that are intended for use with RBD should be
+initialized using the ``rbd`` tool (see `Block Device Commands`_ for more
+information).
+
+For other cases, you can manually associate a free-form application name to
+a pool::
+
+ ceph osd pool application enable {pool-name} {application-name}
+
+.. note:: CephFS uses the application name ``cephfs``, RBD uses the
+ application name ``rbd``, and RGW uses the application name ``rgw``.
+
Set Pool Quotas
===============
If you created users with permissions strictly for a pool that no longer
exists, you should consider deleting those users too::
- ceph auth list | grep -C 5 {pool-name}
+ ceph auth ls | grep -C 5 {pool-name}
ceph auth del {user}
.. _Bloom Filter: http://en.wikipedia.org/wiki/Bloom_filter
.. _setting the number of placement groups: ../placement-groups#set-the-number-of-placement-groups
.. _Erasure Coding with Overwrites: ../erasure-code#erasure-coding-with-overwrites
+.. _Block Device Commands: ../../../rbd/rados-rbd-cmds/#create-a-block-device-pool
+
--- /dev/null
+Using the pg-upmap
+==================
+
+Starting in Luminous v12.2.z there is a new *pg-upmap* exception table
+in the OSDMap that allows the cluster to explicitly map specific PGs to
+specific OSDs. This allows the cluster to fine-tune the data
+distribution to, in most cases, perfectly distribute PGs across OSDs.
+
+The key caveat to this new mechanism is that it requires that all
+clients understand the new *pg-upmap* structure in the OSDMap.
+
+Enabling
+--------
+
+To allow use of the feature, you must tell the cluster that it only
+needs to support luminous (and newer) clients with::
+
+ ceph osd set-require-min-compat-client luminous
+
+This command will fail if any pre-luminous clients or daemons are
+connected to the monitors. You can see what client versions are in
+use with::
+
+ ceph features
+
+A word of caution
+-----------------
+
+This is a new feature and not very user friendly. At the time of this
+writing we are working on a new `balancer` module for ceph-mgr that
+will eventually do all of this automatically.
+
+Until then, use the offline optimization procedure described below.
+
+Offline optimization
+--------------------
+
+Upmap entries are updated with an offline optimizer built into ``osdmaptool``.
+
+#. Grab the latest copy of your osdmap::
+
+ ceph osd getmap -o om
+
+#. Run the optimizer::
+
+ osdmaptool om --upmap out.txt [--upmap-pool <pool>] [--upmap-max <max-count>] [--upmap-deviation <max-deviation>]
+
+ It is highly recommended that optimization be done for each pool
+ individually, or for sets of similarly-utilized pools. You can
+ specify the ``--upmap-pool`` option multiple times. "Similar pools"
+ means pools that are mapped to the same devices and store the same
+ kind of data (e.g., RBD image pools, yes; RGW index pool and RGW
+ data pool, no).
+
+ The ``max-count`` value is the maximum number of upmap entries to
+ identify in the run. The default is 100, but you may want to make
+ this a smaller number so that the tool completes more quickly (but
+ does less work). If it cannot find any additional changes to make
+ it will stop early (i.e., when the pool distribution is perfect).
+
+ The ``max-deviation`` value defaults to `.01` (i.e., 1%). If an OSD
+ utilization varies from the average by less than this amount it
+ will be considered perfect.
+
+#. The proposed changes are written to the output file ``out.txt`` in
+ the example above. These are normal ceph CLI commands that can be
+ run to apply the changes to the cluster. This can be done with::
+
+ source out.txt
+
+The above steps can be repeated as many times as necessary to achieve
+a perfect distribution of PGs for each set of pools.
+
+You can see some (gory) details about what the tool is doing by
+passing ``--debug-osd 10`` to ``osdmaptool``.
Capability syntax follows the form::
- {daemon-type} 'allow {capability}' [{daemon-type} 'allow {capability}']
+ {daemon-type} '{capspec}[, {capspec} ...]'
-
-- **Monitor Caps:** Monitor capabilities include ``r``, ``w``, ``x`` and
- ``allow profile {cap}``. For example::
+- **Monitor Caps:** Monitor capabilities include ``r``, ``w``, ``x`` access
+ settings or ``profile {name}``. For example::
mon 'allow rwx'
- mon 'allow profile osd'
+ mon 'profile osd'
-- **OSD Caps:** OSD capabilities include ``r``, ``w``, ``x``, ``class-read``,
- ``class-write`` and ``profile osd``. Additionally, OSD capabilities also
- allow for pool and namespace settings. ::
+- **OSD Caps:** OSD capabilities include ``r``, ``w``, ``x``, ``class-read``,
+ ``class-write`` access settings or ``profile {name}``. Additionally, OSD
+ capabilities also allow for pool and namespace settings. ::
- osd 'allow {capability}' [pool={poolname}] [namespace={namespace-name}]
+ osd 'allow {access} [pool={pool-name} [namespace={namespace-name}]]'
+ osd 'profile {name} [pool={pool-name} [namespace={namespace-name}]]'
- **Metadata Server Caps:** Metadata server capability simply requires ``allow``,
or blank and does not parse anything further. ::
-
+
mds 'allow'
.. note:: The Ceph Object Gateway daemon (``radosgw``) is a client of the
- Ceph Storage Cluster, so it isn't represented as a Ceph Storage
+ Ceph Storage Cluster, so it is not represented as a Ceph Storage
Cluster daemon type.
The following entries describe each capability.
admin commands.
-``profile osd``
+``profile osd`` (Monitor only)
:Description: Gives a user permissions to connect as an OSD to other OSDs or
monitors. Conferred on OSDs to enable OSDs to handle replication
heartbeat traffic and status reporting.
-``profile mds``
+``profile mds`` (Monitor only)
:Description: Gives a user permissions to connect as a MDS to other MDSs or
monitors.
-``profile bootstrap-osd``
+``profile bootstrap-osd`` (Monitor only)
:Description: Gives a user permissions to bootstrap an OSD. Conferred on
deployment tools such as ``ceph-disk``, ``ceph-deploy``, etc.
bootstrapping an OSD.
-``profile bootstrap-mds``
+``profile bootstrap-mds`` (Monitor only)
:Description: Gives a user permissions to bootstrap a metadata server.
Conferred on deployment tools such as ``ceph-deploy``, etc.
so they have permissions to add keys, etc. when bootstrapping
a metadata server.
+``profile rbd`` (Monitor and OSD)
+
+:Description: Gives a user permissions to manipulate RBD images. When used
+ as a Monitor cap, it provides the minimal privileges required
+ by an RBD client application. When used as an OSD cap, it
+ provides read-write access to an RBD client application.
+
+``profile rbd-read-only`` (OSD only)
+
+:Description: Gives a user read-only permissions to an RBD image.
Pool
namespace. Objects written to a namespace within the pool can only be accessed
by users who have access to the namespace.
-.. note:: Currently (i.e., ``firefly``), namespaces are only useful for
- applications written on top of ``librados``. Ceph clients such as block
- device, object storage and file system do not currently support this
- feature.
+.. note:: Namespaces are primarily useful for applications written on top of
+ ``librados`` where the logical grouping can alleviate the need to create
+ different pools. Ceph Object Gateway (from ``luminous``) uses namespaces for various
+ metadata objects.
The rationale for namespaces is that pools can be a computationally expensive
method of segregating data sets for the purposes of authorizing separate sets
To list the users in your cluster, execute the following::
- ceph auth list
+ ceph auth ls
Ceph will list out all users in your cluster. For example, in a two-node
-exemplary cluster, ``ceph auth list`` will output something that looks like
+exemplary cluster, ``ceph auth ls`` will output something that looks like
this::
installed auth entries:
Note also that each entry has a ``key: <value>`` entry, and one or more
``caps:`` entries.
-You may use the ``-o {filename}`` option with ``ceph auth list`` to
+You may use the ``-o {filename}`` option with ``ceph auth ls`` to
save the output to a file.
ceph auth export {TYPE.ID}
The ``auth export`` command is identical to ``auth get``, but also prints
-out the internal ``auid``, which isn't relevant to end users.
+out the internal ``auid``, which is not relevant to end users.
Second, make sure you are able to connect to ``mon.a``'s server from the
other monitors' servers. Check the ports as well. Check ``iptables`` on
- all your monitor nodes and make sure you're not dropping/rejecting
+ all your monitor nodes and make sure you are not dropping/rejecting
connections.
If this initial troubleshooting doesn't solve your problems, then it's
If you have a quorum, however, the monitor should be able to find the
remaining monitors pretty fast, as long as they can be reached. If your
- monitor is stuck probing and you've gone through with all the communication
+ monitor is stuck probing and you have gone through with all the communication
troubleshooting, then there is a fair chance that the monitor is trying
to reach the other monitors on a wrong address. ``mon_status`` outputs the
``monmap`` known to the monitor: check if the other monitor's locations
`Clock Skews`_ for more information on that. If all your clocks are properly
synchronized, it is best if you prepare some logs and reach out to the
community. This is not a state that is likely to persist and aside from
- (*really*) old bugs there isn't an obvious reason besides clock skews on
+ (*really*) old bugs there is not an obvious reason besides clock skews on
why this would happen.
What if state is ``synchronizing``?
What if state is ``leader`` or ``peon``?
This should not happen. There is a chance this might happen however, and
- it has a lot to do with clock skews -- see `Clock Skews`_. If you're not
+ it has a lot to do with clock skews -- see `Clock Skews`_. If you are not
suffering from clock skews, then please prepare your logs (see
`Preparing your logs`_) and reach out to us.
$ ceph mon getmap -o /tmp/monmap
2. No quorum? Grab the monmap directly from another monitor (this
- assumes the monitor you're grabbing the monmap from has id ID-FOO
+ assumes the monitor you are grabbing the monmap from has id ID-FOO
and has been stopped)::
$ ceph-mon -i ID-FOO --extract-monmap /tmp/monmap
- 3. Stop the monitor you're going to inject the monmap into.
+ 3. Stop the monitor you are going to inject the monmap into.
4. Inject the monmap::
# keyring with the caps, and there is no need to pass the "--keyring" option.
# i.e. just use "ceph-monstore-tool /tmp/mon-store rebuild" instead
ceph-authtool /path/to/admin.keyring -n mon. \
- --cap mon allow 'allow *'
+ --cap mon 'allow *'
ceph-authtool /path/to/admin.keyring -n client.admin \
- --cap mon allow 'allow *' --cap osd 'allow *' --cap mds 'allow *'
+ --cap mon 'allow *' --cap osd 'allow *' --cap mds 'allow *'
ceph-monstore-tool /tmp/mon-store rebuild -- --keyring /path/to/admin.keyring
# backup corrupted store.db just in case
mv /var/lib/ceph/mon/mon.0/store.db /var/lib/ceph/mon/mon.0/store.db.corrupted
to ensure you have addressed any issues related to your kernel.
- **Segment Fault:** If there is a segment fault, turn your logging up
- (if it isn't already), and try again. If it segment faults again,
+ (if it is not already), and try again. If it segment faults again,
contact the ceph-devel email list and provide your Ceph configuration
file, your monitor output and the contents of your log file(s).
.. tip:: Newer versions of Ceph provide better recovery handling by preventing
recovering OSDs from using up system resources so that ``up`` and ``in``
- OSDs aren't available or are otherwise slow.
+ OSDs are not available or are otherwise slow.
Networking Issues
We recommend using both a public (front-end) network and a cluster (back-end)
network so that you can better meet the capacity requirements of object
replication. Another advantage is that you can run a cluster network such that
-it isn't connected to the internet, thereby preventing some denial of service
+it is not connected to the internet, thereby preventing some denial of service
attacks. When OSDs peer and check heartbeats, they use the cluster (back-end)
network when it's available. See `Monitor/OSD Interaction`_ for details.
Fewer OSDs than Replicas
------------------------
-If you've brought up two OSDs to an ``up`` and ``in`` state, but you still
+If you have brought up two OSDs to an ``up`` and ``in`` state, but you still
don't see ``active + clean`` placement groups, you may have an
``osd pool default size`` set to greater than ``2``.
``ceph pg dump``), you can force the first OSD to notice the placement groups
it needs by running::
- ceph pg force_create_pg <pgid>
+ ceph osd force-create-pg <pgid>
CRUSH Map Errors
mapped to OSDs, a small number of placement groups will not distribute across
your cluster. Try creating a pool with a placement group count that is a
multiple of the number of OSDs. See `Placement Groups`_ for details. The default
-placement group count for pools isn't useful, but you can change it `here`_.
+placement group count for pools is not useful, but you can change it `here`_.
Can't Write Data
# radosgw-admin --tenant testx --uid tester --display-name "Test User" --subuser tester:test --key-type swift --access full user create
# radosgw-admin --subuser 'testx$tester:test' --key-type swift --secret test123
-Note that the subuser with explicit tenant had to be quoted in the shell.
+.. note:: The subuser with explicit tenant has to be quoted in the shell.
+
+ Tenant names may contain only alphanumeric characters and underscores.
Accessing Buckets with Explicit Tenants
=======================================
+------------------------+-----------+--------------------------------------------------------------------------------------+
| ``max-uploads`` | Integer | The maximum number of multipart uploads. The range from 1-1000. The default is 1000. |
+------------------------+-----------+--------------------------------------------------------------------------------------+
-| ``upload-id-marker`` | String | Ignored if ``key-marker`` isn't specified. Specifies the ``ID`` of first |
+| ``upload-id-marker`` | String | Ignored if ``key-marker`` is not specified. Specifies the ``ID`` of first |
| | | upload to list in lexicographical order at or following the ``ID``. |
+------------------------+-----------+--------------------------------------------------------------------------------------+
| ``VersioningConfiguration`` | Container | A container for the request. |
+-----------------------------+-----------+---------------------------------------------------------------------------+
| ``Status`` | String | Sets the versioning state of the bucket. Valid Values: Suspended/Enabled |
-+-----------------------------+-----------+---------------------------------------------------------------------------+
\ No newline at end of file
++-----------------------------+-----------+---------------------------------------------------------------------------+
.. note::
The `Amazon::S3`_ module does not have a way to generate download
- URLs, so we're going to be using another module instead. Unfortunately,
+ URLs, so we are going to be using another module instead. Unfortunately,
most modules for generating these URLs assume that you are using Amazon,
- so we've had to go with using a more obscure module, `Muck::FS::S3`_. This
+ so we have had to go with using a more obscure module, `Muck::FS::S3`_. This
should be the same as Amazon's sample S3 perl module, but this sample
module is not in CPAN. So, you can either use CPAN to install
`Muck::FS::S3`_, or install Amazon's sample S3 module manually. If you go
To delete a container, make a ``DELETE`` request with the API version, account,
and the name of the container. The container must be empty. If you'd like to check
if the container is empty, execute a ``HEAD`` request against the container. Once
-you've successfully removed the container, you'll be able to reuse the container name.
+you have successfully removed the container, you will be able to reuse the container name.
Syntax
~~~~~~
To delete an object, make a ``DELETE`` request with the API version, account,
container and object name. You must have write permissions on the container to delete
-an object within it. Once you've successfully deleted the object, you'll be able to
+an object within it. Once you have successfully deleted the object, you will be able to
reuse the object name.
Syntax
cannot be :type:unicode - `Librbd` does not know how to deal with
characters wider than a :c:type:char.
-In the end, you'll want to close the image, the IO context and the connection to RADOS::
+In the end, you will want to close the image, the IO context and the connection to RADOS::
image.close()
ioctx.close()
To create VMs that use Ceph block devices, use the procedures in the following
-sections. In the exemplary embodiment, we've used ``libvirt-pool`` for the pool
+sections. In the exemplary embodiment, we have used ``libvirt-pool`` for the pool
name, ``client.libvirt`` for the user name, and ``new-libvirt-image`` for the
image name. You may use any value you like, but ensure you replace those values
when executing commands in the subsequent procedures.
To configure Ceph for use with ``libvirt``, perform the following steps:
-#. `Create a pool`_ (or use the default). The following example uses the
+#. `Create a pool`_. The following example uses the
pool name ``libvirt-pool`` with 128 placement groups. ::
ceph osd pool create libvirt-pool 128 128
ceph osd lspools
-#. `Create a Ceph User`_ (or use ``client.admin`` for version 0.9.7 and
- earlier). The following example uses the Ceph user name ``client.libvirt``
+#. Use the ``rbd`` tool to initialize the pool for use by RBD::
+
+ rbd pool init <pool-name>
+
+#. `Create a Ceph User`_ (or use ``client.admin`` for version 0.9.7 and
+ earlier). The following example uses the Ceph user name ``client.libvirt``
and references ``libvirt-pool``. ::
- ceph auth get-or-create client.libvirt mon 'allow r' osd 'allow class-read object_prefix rbd_children, allow rwx pool=libvirt-pool'
+ ceph auth get-or-create client.libvirt mon 'profile rbd' osd 'profile rbd pool=libvirt-pool'
Verify the name exists. ::
- ceph auth list
+ ceph auth ls
**NOTE**: ``libvirt`` will access Ceph using the ID ``libvirt``,
not the Ceph name ``client.libvirt``. See `User Management - User`_ and
.. important:: To use Ceph Block Device commands, you must have access to
a running Ceph cluster.
+Create a Block Device Pool
+==========================
+
+#. On the admin node, use the ``ceph`` tool to `create a pool`_.
+
+#. On the admin node, use the ``rbd`` tool to initialize the pool for use by RBD::
+
+ rbd pool init <pool-name>
+
+.. note:: The ``rbd`` tool assumes a default pool name of 'rbd' when not
+ provided.
+
+Create a Block Device User
+==========================
+
+Unless specified, the ``rbd`` command will access the Ceph cluster using the ID
+``admin``. This ID allows full administrative access to the cluster. It is
+recommended that you utilize a more restricted user wherever possible.
+
+To `create a Ceph user`_, with ``ceph`` specify the ``auth get-or-create``
+command, user name, monitor caps, and OSD caps::
+
+ ceph auth get-or-create client.{ID} mon 'profile rbd' osd 'profile {profile name} [pool={pool-name}][, profile ...]'
+
+For example, to create a user ID named ``qemu`` with read-write access to the
+pool ``vms`` and read-only access to the pool ``images``, execute the
+following::
+
+ ceph auth get-or-create client.qemu mon 'profile rbd' osd 'profile rbd pool=vms, profile rbd-read-only pool=images'
+
+The output from the ``ceph auth get-or-create`` command will be the keyring for
+the specified user, which can be written to ``/etc/ceph/ceph.client.{ID}.keyring``.
+
+.. note:: The user ID can be specified when using the ``rbd`` command by
+ providing the ``--id {id}`` optional argument.
Creating a Block Device Image
=============================
the following::
rbd create --size {megabytes} {pool-name}/{image-name}
-
+
For example, to create a 1GB image named ``bar`` that stores information in a
pool named ``swimmingpool``, execute the following::
For example::
rbd ls swimmingpool
-
+
+To list deferred delete block devices in the ``rbd`` pool, execute the
+following::
+
+ rbd trash ls
+
+To list deferred delete block devices in a particular pool, execute the
+following, but replace ``{poolname}`` with the name of the pool::
+
+ rbd trash ls {poolname}
+
+For example::
+
+ rbd trash ls swimmingpool
+
Retrieving Image Information
============================
with the name of the image you want to remove::
rbd rm {image-name}
-
+
For example::
rbd rm foo
-
+
To remove a block device from a pool, execute the following, but replace
``{image-name}`` with the name of the image to remove and replace
``{pool-name}`` with the name of the pool::
rbd rm {pool-name}/{image-name}
-
+
For example::
rbd rm swimmingpool/bar
+To defer delete a block device from a pool, execute the following, but
+replace ``{image-name}`` with the name of the image to move and replace
+``{pool-name}`` with the name of the pool::
+
+ rbd trash mv {pool-name}/{image-name}
+
+For example::
+
+ rbd trash mv swimmingpool/bar
+
+To remove a deferred block device from a pool, execute the following, but
+replace ``{image-id}`` with the id of the image to remove and replace
+``{pool-name}`` with the name of the pool::
+
+ rbd trash rm {pool-name}/{image-id}
+
+For example::
+
+ rbd trash rm swimmingpool/2bf4474b0dc51
+
+.. note::
+
+ * You can move an image to the trash even if it has snapshot(s) or is actively
+ in use by clones, but such an image cannot be removed from the trash.
+
+ * You can use *--delay* to set the deferment time (default is 0). If the
+ deferment time has not expired, the image cannot be removed unless you
+ use force.
+
+Restoring a Block Device Image
+==============================
+
+To restore a deferred delete block device in the rbd pool, execute the
+following, but replace ``{image-id}`` with the id of the image::
+
+ rbd trash restore {image-id}
+
+For example::
+
+ rbd trash restore 2bf4474b0dc51
+
+To restore a deferred delete block device in a particular pool, execute
+the following, but replace ``{image-id}`` with the id of the image and
+replace ``{pool-name}`` with the name of the pool::
+
+ rbd trash restore {pool-name}/{image-id}
+
+For example::
+
+ rbd trash restore swimmingpool/2bf4474b0dc51
+
+You can also use *--image* to rename the image when restoring it, for
+example::
+
+ rbd trash restore swimmingpool/2bf4474b0dc51 --image new-name
+.. _create a pool: ../../rados/operations/pools/#create-a-pool
.. _Storage Pools: ../../rados/operations/pools
.. _RBD – Manage RADOS Block Device (RBD) Images: ../../man/8/rbd/
+.. _create a Ceph user: ../../rados/operations/user-management#add-a-user
for your pools, and `Placement Groups`_ for details on the number of placement
groups you should set for your pools.
+A newly created pool must be initialized prior to use. Use the ``rbd`` tool
+to initialize the pool::
+
+ rbd pool init cloudstack
+
Create a Ceph User
==================
use ``client.admin`` for this, it's recommended to create a user with only
access to the ``cloudstack`` pool. ::
- ceph auth get-or-create client.cloudstack mon 'allow r' osd 'allow class-read object_prefix rbd_children, allow rwx pool=cloudstack'
+ ceph auth get-or-create client.cloudstack mon 'profile rbd' osd 'profile rbd pool=cloudstack'
Use the information returned by the command in the next step when adding the
Primary Storage.
your pools, and `Placement Groups`_ for details on the number of placement
groups you should set for your pools.
+Newly created pools must be initialized prior to use. Use the ``rbd`` tool
+to initialize the pools::
+
+ rbd pool init volumes
+ rbd pool init images
+ rbd pool init backups
+ rbd pool init vms
+
.. _Create a Pool: ../../rados/operations/pools#createpool
.. _Placement Groups: ../../rados/operations/placement-groups
Install Ceph client packages
----------------------------
-On the ``glance-api`` node, you'll need the Python bindings for ``librbd``::
+On the ``glance-api`` node, you will need the Python bindings for ``librbd``::
sudo apt-get install python-rbd
sudo yum install python-rbd
If you have `cephx authentication`_ enabled, create a new user for Nova/Cinder
and Glance. Execute the following::
- ceph auth get-or-create client.glance mon 'allow r' osd 'allow class-read object_prefix rbd_children, allow rwx pool=images'
- ceph auth get-or-create client.cinder-backup mon 'allow r' osd 'allow class-read object_prefix rbd_children, allow rwx pool=backups'
-
-If you run an OpenStack version before Mitaka, create the following ``client.cinder`` key::
-
- ceph auth get-or-create client.cinder mon 'allow r' osd 'allow class-read object_prefix rbd_children, allow rwx pool=volumes, allow rwx pool=vms, allow rx pool=images'
-
-Since Mitaka introduced the support of RBD snapshots while doing a snapshot of a Nova instance,
-we need to allow the ``client.cinder`` key write access to the ``images`` pool; therefore, create the following key::
-
- ceph auth get-or-create client.cinder mon 'allow r' osd 'allow class-read object_prefix rbd_children, allow rwx pool=volumes, allow rwx pool=vms, allow rwx pool=images'
+ ceph auth get-or-create client.glance mon 'profile rbd' osd 'profile rbd pool=images'
+ ceph auth get-or-create client.cinder mon 'profile rbd' osd 'profile rbd pool=volumes, profile rbd pool=vms, profile rbd pool=images'
+ ceph auth get-or-create client.cinder-backup mon 'profile rbd' osd 'profile rbd pool=backups'
Add the keyrings for ``client.cinder``, ``client.glance``, and
``client.cinder-backup`` to the appropriate nodes and change their ownership::
rados_connect_timeout = -1
glance_api_version = 2
-If you're using `cephx authentication`_, also configure the user and uuid of
+If you are using `cephx authentication`_, also configure the user and uuid of
the secret you added to ``libvirt`` as documented earlier::
[ceph]
- *General*:
* Ceph now has a simple, built-in web-based dashboard for monitoring
- cluster status. FIXME DOCS.
+ cluster status. See :doc:`/mgr/dashboard/`.
- *RADOS*:
BlueStore for performance reasons.) FIXME DOCS
* *Erasure coded* pools now have full support for *overwrites*,
- allowing them to be used with RBD and CephFS. `Read more about EC overwrites`_.
+ allowing them to be used with RBD and CephFS. See :doc:`/rados/operations/erasure-code/#erasure-coding-with-overwrites`.
* *ceph-mgr*:
*ceph-mgr* for reliability. See the notes on `Upgrading`_ below.
- The *ceph-mgr* daemon includes a REST-based management API. The
API is still experimental and somewhat limited but will form the basis
- for API-based management of Ceph going forward. FIXME DOCS
+ for API-based management of Ceph going forward. See :doc:`/mgr/restful`.
+ - *ceph-mgr* also includes a Prometheus exporter plugin, which can
+ provide Ceph perfcounters to Prometheus. See :doc:`/mgr/prometheus`.
* The overall *scalability* of the cluster has improved. We have
successfully tested clusters with up to 10,000 OSDs.
- * Each OSD can now have a *device class* associated with it (e.g., `hdd` or
- `ssd`), allowing CRUSH rules to trivially map data to a subset of devices
- in the system. Manually writing CRUSH rules or manual editing of the CRUSH
- is normally not required. FIXME DOCS
- * You can now *optimize CRUSH weights* can now be optimized to
- maintain a *near-perfect distribution of data* across OSDs. FIXME DOCS
+ * Each OSD can now have a *device class* associated with it (e.g.,
+ `hdd` or `ssd`), allowing CRUSH rules to trivially map data to a
+ subset of devices in the system. Manually writing CRUSH rules or
+ manual editing of the CRUSH map is normally not required. See
+ :doc:`/rados/operations/crush-map/#crush-structure`.
+ * You can now *optimize CRUSH weights* to maintain a *near-perfect
+ distribution of data* across OSDs. FIXME DOCS
* There is also a new `upmap` exception mechanism that allows
individual PGs to be moved around to achieve a *perfect
- distribution* (this requires luminous clients). FIXME DOCS
+ distribution* (this requires luminous clients). See
+ :doc:`/rados/operations/upmap`.
* Each OSD now adjusts its default configuration based on whether the
backing device is an HDD or SSD. Manual tuning is generally not required.
- * The prototype *mclock QoS queueing algorithm* is now available. FIXME DOCS
+ * The prototype `mClock QoS queueing algorithm </rados/configuration/osd-config-ref/#qos-based-on-mclock>`_ is now available.
* There is now a *backoff* mechanism that prevents OSDs from being
overloaded by requests to objects or PGs that are not currently able to
process IO.
- * There is a *simplified OSD replacement process* that is more robust. FIXME DOCS
+ * There is a simplified OSD replacement process that is more robust (see :doc:`/rados/operations/add-or-rm-osds/#replacing-an-osd`).
* You can query the supported features and (apparent) releases of
- all connected daemons and clients with ``ceph features``. FIXME DOCS
+ all connected daemons and clients with `ceph features </man/8/ceph#features>`_.
* You can configure the oldest Ceph client version you wish to allow to
connect to the cluster via ``ceph osd set-require-min-compat-client`` and
Ceph will prevent you from enabling features that will break compatibility
- with those clients. FIXME DOCS
+ with those clients.
* Several `sleep` settings, including ``osd_recovery_sleep``,
``osd_snap_trim_sleep``, and ``osd_scrub_sleep`` have been
reimplemented to work efficiently. (These are used in some cases
* Improved discard handling when the object map feature is enabled.
* rbd CLI ``import`` and ``copy`` commands now detect and
preserve sparse regions.
- * Images and Snapshots will now include a creation timestamp
+ * Images and Snapshots will now include a creation timestamp.
- *CephFS*:
* *CLI changes*:
- The ``ceph -s`` or ``ceph status`` command has a fresh look.
- - ``ceph {osd,mds,mon} versions`` summarizes versions of running daemons.
- - ``ceph {osd,mds,mon} count-metadata <property>`` similarly
+ - ``ceph mgr metadata`` will dump metadata associated with each mgr
+ daemon.
+ - ``ceph versions`` or ``ceph {osd,mds,mon,mgr} versions``
+ summarize versions of running daemons.
+ - ``ceph {osd,mds,mon,mgr} count-metadata <property>`` similarly
tabulates any other daemon metadata visible via the ``ceph
- {osd,mds,mon} metadata`` commands.
+ {osd,mds,mon,mgr} metadata`` commands.
- ``ceph features`` summarizes features and releases of connected
clients and daemons.
- ``ceph osd require-osd-release <release>`` replaces the old
``require_RELEASE_osds`` flags.
- ``ceph osd pg-upmap``, ``ceph osd rm-pg-upmap``, ``ceph osd
pg-upmap-items``, ``ceph osd rm-pg-upmap-items`` can explicitly
- manage `upmap` items (FIXME DOCS).
+ manage `upmap` items (see :doc:`/rados/operations/upmap`).
- ``ceph osd getcrushmap`` returns a crush map version number on
stderr, and ``ceph osd setcrushmap [version]`` will only inject
an updated crush map if the version matches. This allows crush
for applying changes to entire subtrees. For example, ``ceph
osd down `ceph osd ls-tree rack1```.
- ``ceph osd {add,rm}-{noout,noin,nodown,noup}`` allow the
- `noout`, `nodown`, `noin`, and `noup` flags to be applied to
+ `noout`, `noin`, `nodown`, and `noup` flags to be applied to
specific OSDs.
- ``ceph log last [n]`` will output the last *n* lines of the cluster
log.
- ``ceph config-key dump`` dumps config-key entries and their
contents. (The existing ``ceph config-key list`` only dumps the key
names, not the values.)
+ - ``ceph config-key list`` is deprecated in favor of ``ceph config-key ls``.
+ - ``ceph auth list`` is deprecated in favor of ``ceph auth ls``.
+ - ``ceph osd crush rule list`` is deprecated in favor of ``ceph osd crush rule ls``.
- ``ceph osd set-{full,nearfull,backfillfull}-ratio`` sets the
cluster-wide ratio for various full thresholds (when the cluster
refuses IO, when the cluster warns about being close to full,
- ``ceph osd reweightn`` will specify the `reweight` values for
multiple OSDs in a single command. This is equivalent to a series of
``ceph osd reweight`` commands.
- - ``ceph osd crush class {create,rm,ls,rename}`` manage the new
+ - ``ceph osd crush class {rm,ls,ls-osd}`` manage the new
CRUSH *device class* feature. ``ceph osd crush set-device-class
<class> <osd> [<osd>...]`` will set the class for particular devices.
+ Note that if you specify a non-existent class, it will be created
+ automatically. ``ceph osd crush rm-device-class <osd> [<osd>...]``
+ will instead remove the class for particular devices.
+ And if a class contains no more devices, it will be automatically
+ destroyed.
- ``ceph osd crush rule create-replicated`` replaces the old
``ceph osd crush rule create-simple`` command to create a CRUSH
rule for a replicated pool. Notably it takes a `class` argument
these exist yet).
- ``ceph tell <daemon> help`` will now return a usage summary.
-.. _Read more about EC overwrites: ../rados/operations/erasure-code/#erasure-coding-with-overwrites
-
Major Changes from Jewel
------------------------
#. Do not create any new erasure-code pools while upgrading the monitors.
+#. You can monitor the progress of your upgrade at each stage with the
+ ``ceph versions`` command, which will tell you what ceph version is
+ running for each type of daemon.
+
#. Set the ``noout`` flag for the duration of the upgrade. (Optional
but recommended.)::
#. Upgrade monitors by installing the new packages and restarting the
monitor daemons. Note that, unlike prior releases, the ceph-mon
- daemons *must* be upgraded first.::
+ daemons *must* be upgraded first::
# systemctl restart ceph-mon.target
If you are upgrading from kraken, you may already have ceph-mgr
daemons deployed. If not, or if you are upgrading from jewel, you
can deploy new daemons with tools like ceph-deploy or ceph-ansible.
- For example,::
+ For example::
# ceph-deploy mgr create HOST
...
#. Upgrade all OSDs by installing the new packages and restarting the
- ceph-osd daemons on all hosts.::
+ ceph-osd daemons on all hosts::
# systemctl restart ceph-osd.target
You can monitor the progress of the OSD upgrades with the new
- ``ceph osd versions`` command.::
+ ``ceph versions`` or ``ceph osd versions`` command::
# ceph osd versions
{
}
#. Upgrade all CephFS daemons by upgrading packages and restarting
- daemons on all hosts.::
+ daemons on all hosts::
# systemctl restart ceph-mds.target
#. Upgrade all radosgw daemons by upgrading packages and restarting
- daemons on all hosts.::
+ daemons on all hosts::
# systemctl restart radosgw.target
(when the ``ceph osd require-osd-release luminous`` command is run)
but any provisioning tools that create erasure coded pools may need
to be updated.
+* The structure of the XML output for ``osd crush tree`` has changed
+ slightly to better match the ``osd tree`` output. The top level
+ structure is now ``nodes`` instead of ``crush_map_roots``.
* When assigning a network to the public network and not to
the cluster network the network specification of the public
network will be used for the cluster network as well.
* bluestore: os/bluestore: fix typo(s/trasnaction/transaction/) (`pr#14890 <https://github.com/ceph/ceph/pull/14890>`_, xie xingguo)
* bluestore: os/bluestore: fix use after free race with aio_wait (`pr#14956 <https://github.com/ceph/ceph/pull/14956>`_, Sage Weil)
* bluestore: os/bluestore: pre-calculate number of ghost buffers to evict (`pr#15029 <https://github.com/ceph/ceph/pull/15029>`_, xie xingguo)
-* bluestore: os/bluestore: Record l_bluestore_state_kv_queued_lat for sync_submit_… (`pr#14448 <https://github.com/ceph/ceph/pull/14448>`_, Jianpeng Ma)
+* bluestore: os/bluestore: Record l_bluestore_state_kv_queued_lat for sync\_submit\_… (`pr#14448 <https://github.com/ceph/ceph/pull/14448>`_, Jianpeng Ma)
* bluestore: os/bluestore: Remove ExtentFreeListManager. (`pr#14772 <https://github.com/ceph/ceph/pull/14772>`_, Jianpeng Ma)
* bluestore: os/bluestore: remove unused condition variable (`pr#14973 <https://github.com/ceph/ceph/pull/14973>`_, Igor Fedotov)
* bluestore: os/bluestore: rename/fix throttle options (`pr#14717 <https://github.com/ceph/ceph/pull/14717>`_, Sage Weil)
* crush: add devices class that rules can use as a filter (`issue#18943 <http://tracker.ceph.com/issues/18943>`_, `pr#13444 <http://github.com/ceph/ceph/pull/13444>`_, Loic Dachary)
* crush: add --dump to crushtool (`pr#13726 <http://github.com/ceph/ceph/pull/13726>`_, Loic Dachary)
* crush: allow uniform buckets with no items (`pr#13521 <http://github.com/ceph/ceph/pull/13521>`_, Loic Dachary)
-* crush: document tunables and rule step set_ (`pr#13722 <http://github.com/ceph/ceph/pull/13722>`_, Loic Dachary)
+* crush: document tunables and rule step set\_ (`pr#13722 <http://github.com/ceph/ceph/pull/13722>`_, Loic Dachary)
* crush: do is_out test only if we do not collide (`pr#13326 <http://github.com/ceph/ceph/pull/13326>`_, xie xingguo)
* crush: fix dprintk compilation (`pr#13424 <http://github.com/ceph/ceph/pull/13424>`_, Loic Dachary)
* debian: Add missing tp files in deb packaging (`pr#13526 <http://github.com/ceph/ceph/pull/13526>`_, Ganesh Mahalingam)
* cleanup: common/config: fix return type of string::find and use string::npos (`pr#9924 <http://github.com/ceph/ceph/pull/9924>`_, Yan Jun)
* cleanup: common/config_opts.h: remove obsolete configuration option (`pr#12659 <http://github.com/ceph/ceph/pull/12659>`_, Li Wang)
* cleanup,common: global: we need to handle the init_on_startup return value when global_init. (`pr#13018 <http://github.com/ceph/ceph/pull/13018>`_, song baisen)
-* cleanup,common: msg/async: assert if compiled code doesn't support the configured ms_… (`pr#12559 <http://github.com/ceph/ceph/pull/12559>`_, Avner BenHanoch)
+* cleanup,common: msg/async: assert if compiled code doesn't support the configured ms\_… (`pr#12559 <http://github.com/ceph/ceph/pull/12559>`_, Avner BenHanoch)
* cleanup,common: msg/async/rdma: clean line endings (`pr#12688 <http://github.com/ceph/ceph/pull/12688>`_, Adir Lev)
* cleanup,common: msg/async/rdma: Remove compilation warning (`pr#13142 <http://github.com/ceph/ceph/pull/13142>`_, Sarit Zubakov)
* cleanup,common: osd/OSDMap: get_previous_up_osd_before() may run into endless loop (`pr#12976 <http://github.com/ceph/ceph/pull/12976>`_, Mingxin Liu)
* The 'ceph osd perf' command will display 'commit_latency(ms)' and
'apply_latency(ms)'. Previously, the names of these two columns were
'fs_commit_latency(ms)' and 'fs_apply_latency(ms)'. We remove the
- prefix 'fs_', because they are not filestore specific.
+ prefix 'fs\_', because they are not filestore specific.
* Monitors will no longer allow pools to be removed by default. The
setting mon_allow_pool_delete has to be set to true (defaults to
* build/ops: rpm: Remove trailing whitespace in usermod command (SUSE) (`pr#10707 <http://github.com/ceph/ceph/pull/10707>`_, Tim Serong)
* build/ops: scripts/release-notes: allow title guesses from gh tags & description update (`pr#11399 <http://github.com/ceph/ceph/pull/11399>`_, Abhishek Lekshmanan)
* build/ops: systemd: Fix startup of ceph-mgr on Debian 8 (`pr#12555 <http://github.com/ceph/ceph/pull/12555>`_, Mark Korenberg)
-* build/ops: tracing/objectstore.tp: add missing move_ranges_... tp (`pr#11484 <http://github.com/ceph/ceph/pull/11484>`_, Sage Weil)
+* build/ops: tracing/objectstore.tp: add missing move_ranges\_... tp (`pr#11484 <http://github.com/ceph/ceph/pull/11484>`_, Sage Weil)
* build/ops: upstart: fix ceph-crush-location default (`issue#6698 <http://tracker.ceph.com/issues/6698>`_, `pr#803 <http://github.com/ceph/ceph/pull/803>`_, Jason Dillaman)
* build/ops: upstart: start ceph-all after static-network-up (`issue#17689 <http://tracker.ceph.com/issues/17689>`_, `pr#11631 <http://github.com/ceph/ceph/pull/11631>`_, Billy Olsen)
* cephfs: add gid to asok status (`pr#11487 <http://github.com/ceph/ceph/pull/11487>`_, Patrick Donnelly)
* osd: print log when osd want to kill self (`pr#9288 <http://github.com/ceph/ceph/pull/9288>`_, Haomai Wang)
* osd: Remove extra call to reg_next_scrub() during splits (`issue#16474 <http://tracker.ceph.com/issues/16474>`_, `pr#11206 <http://github.com/ceph/ceph/pull/11206>`_, David Zafman)
* osd: remove redudant call of heartbeat_check (`pr#12130 <http://github.com/ceph/ceph/pull/12130>`_, Pan Liu)
-* osd: remove the lock heartbeat_update_lock, and change heatbeat_need_… (`pr#12461 <http://github.com/ceph/ceph/pull/12461>`_, Pan Liu)
+* osd: remove the lock heartbeat_update_lock, and change heatbeat_need\_… (`pr#12461 <http://github.com/ceph/ceph/pull/12461>`_, Pan Liu)
* osd: remove the redundant clear method in consume_map function (`pr#10553 <http://github.com/ceph/ceph/pull/10553>`_, song baisen)
* osd: Remove unused '_lsb_release_' declarations (`pr#11364 <http://github.com/ceph/ceph/pull/11364>`_, Brad Hubbard)
* osd: replace hb_out and hb_in with a single hb_peers (`issue#18057 <http://tracker.ceph.com/issues/18057>`_, `pr#12178 <http://github.com/ceph/ceph/pull/12178>`_, Pan Liu)
* rgw: merge setting flags operation together and cleanups (`pr#10203 <http://github.com/ceph/ceph/pull/10203>`_, Yan Jun)
* rgw: miscellaneous cleanups (`pr#10299 <http://github.com/ceph/ceph/pull/10299>`_, Yan Jun)
* rgw: multiple fixes for Swift's object expiration (`issue#16705 <http://tracker.ceph.com/issues/16705>`_, `issue#16684 <http://tracker.ceph.com/issues/16684>`_, `pr#10330 <http://github.com/ceph/ceph/pull/10330>`_, Radoslaw Zarzynski)
-* rgw: need to 'open_object_section' before dump stats in 'RGWGetUsage_… (`issue#17499 <http://tracker.ceph.com/issues/17499>`_, `pr#11325 <http://github.com/ceph/ceph/pull/11325>`_, weiqiaomiao)
+* rgw: need to 'open_object_section' before dump stats in 'RGWGetUsage\_… (`issue#17499 <http://tracker.ceph.com/issues/17499>`_, `pr#11325 <http://github.com/ceph/ceph/pull/11325>`_, weiqiaomiao)
* rgw: obsolete 'radosgw-admin period prepare' command (`issue#17387 <http://tracker.ceph.com/issues/17387>`_, `pr#11278 <http://github.com/ceph/ceph/pull/11278>`_, Gaurav Kumar Garg)
* rgw: radosgw-admin: add "--orphan-stale-secs" to --help (`issue#17280 <http://tracker.ceph.com/issues/17280>`_, `pr#11098 <http://github.com/ceph/ceph/pull/11098>`_, Ken Dreyer)
* rgw: radosgw-admin: zone[group] modify can change realm id (`issue#16839 <http://tracker.ceph.com/issues/16839>`_, `pr#10477 <http://github.com/ceph/ceph/pull/10477>`_, Casey Bodley)
* osd: reindex properly on pg log split (`issue#18975 <http://tracker.ceph.com/issues/18975>`_, `pr#14047 <https://github.com/ceph/ceph/pull/14047>`_, Alexey Sheplyakov)
* osd: restrict want_acting to up+acting on recovery completion (`issue#18929 <http://tracker.ceph.com/issues/18929>`_, `pr#13541 <https://github.com/ceph/ceph/pull/13541>`_, Sage Weil)
* rbd-nbd: check /sys/block/nbdX/size to ensure kernel mapped correctly (`issue#18335 <http://tracker.ceph.com/issues/18335>`_, `pr#13932 <https://github.com/ceph/ceph/pull/13932>`_, Mykola Golub, Alexey Sheplyakov)
-* rbd: [api] temporarily restrict (rbd_)mirror_peer_add from adding multiple peers (`issue#19256 <http://tracker.ceph.com/issues/19256>`_, `pr#14664 <https://github.com/ceph/ceph/pull/14664>`_, Jason Dillaman)
+* rbd: [api] temporarily restrict (rbd\_)mirror_peer_add from adding multiple peers (`issue#19256 <http://tracker.ceph.com/issues/19256>`_, `pr#14664 <https://github.com/ceph/ceph/pull/14664>`_, Jason Dillaman)
* rbd: qemu crash triggered by network issues (`issue#18436 <http://tracker.ceph.com/issues/18436>`_, `pr#13244 <https://github.com/ceph/ceph/pull/13244>`_, Jason Dillaman)
* rbd: rbd --pool=x rename y z does not work (`issue#18326 <http://tracker.ceph.com/issues/18326>`_, `pr#14148 <https://github.com/ceph/ceph/pull/14148>`_, Gaurav Kumar Garg)
* rbd: systemctl stop rbdmap unmaps all rbds and not just the ones in /etc/ceph/rbdmap (`issue#18884 <http://tracker.ceph.com/issues/18884>`_, `issue#18262 <http://tracker.ceph.com/issues/18262>`_, `pr#14083 <https://github.com/ceph/ceph/pull/14083>`_, David Disseldorp, Nathan Cutler)
* For all distributions that support systemd (CentOS 7, Fedora, Debian
Jessie 8.x, OpenSUSE), ceph daemons are now managed using native systemd
- files instead of the legacy sysvinit scripts. For example,::
+ files instead of the legacy sysvinit scripts. For example::
systemctl start ceph.target # start all daemons
systemctl status ceph-osd@12 # check status of osd.12
ceph-deploy install --stable jewel HOST
- #. Stop the daemon(s).::
+ #. Stop the daemon(s)::
service ceph stop # fedora, centos, rhel, debian
stop ceph-all # ubuntu
chown -R ceph:ceph /var/lib/ceph
chown -R ceph:ceph /var/log/ceph
- #. Restart the daemon(s).::
+ #. Restart the daemon(s)::
start ceph-all # ubuntu
systemctl start ceph.target # debian, centos, fedora, rhel
* For all distributions that support systemd (CentOS 7, Fedora, Debian
Jessie 8.x, OpenSUSE), ceph daemons are now managed using native systemd
- files instead of the legacy sysvinit scripts. For example,::
+ files instead of the legacy sysvinit scripts. For example::
systemctl start ceph.target # start all daemons
systemctl status ceph-osd@12 # check status of osd.12
ceph-deploy install --stable infernalis HOST
- #. Stop the daemon(s).::
+ #. Stop the daemon(s)::
service ceph stop # fedora, centos, rhel, debian
stop ceph-all # ubuntu
chown -R ceph:ceph /var/lib/ceph
chown -R ceph:ceph /var/log/ceph
- #. Restart the daemon(s).::
+ #. Restart the daemon(s)::
start ceph-all # ubuntu
systemctl start ceph.target # debian, centos, fedora, rhel
* For all distributions that support systemd (CentOS 7, Fedora, Debian
Jessie 8.x, OpenSUSE), ceph daemons are now managed using native systemd
- files instead of the legacy sysvinit scripts. For example,::
+ files instead of the legacy sysvinit scripts. For example::
systemctl start ceph.target # start all daemons
systemctl status ceph-osd@12 # check status of osd.12
ceph-deploy install --stable infernalis HOST
- #. Stop the daemon(s).::
+ #. Stop the daemon(s)::
service ceph stop # fedora, centos, rhel, debian
stop ceph-all # ubuntu
chown -R ceph:ceph /var/lib/ceph
chown -R ceph:ceph /var/log/ceph
- #. Restart the daemon(s).::
+ #. Restart the daemon(s)::
start ceph-all # ubuntu
systemctl start ceph.target # debian, centos, fedora, rhel
Upgrading a cluster without adjusting the Ceph configuration will
likely prevent the system from starting up on its own. We recommend
first modifying the configuration to indicate that authentication is
- disabled, and only then upgrading to the latest version.::
+ disabled, and only then upgrading to the latest version::
auth client required = none
auth service required = none
| |Development|`Dumpling`_|`Emperor`_ |`Firefly`_ |`Giant`_ |`Hammer`_ |`Infernalis`_ |`Jewel`_ |`Kraken`_ |
| |Testing |LTS |Stable |LTS |Stable |LTS |Stable |LTS |Stable |
+----------------+-----------+-----------+-----------+-----------+-----------+-----------+--------------+-----------+-----------+
+| July 2017 | 12.1.1 | | | | | | |`10.2.9`_ | |
+| +-----------+-----------+-----------+-----------+-----------+-----------+--------------+-----------+-----------+
+| | | | | | | | |`10.2.8`_ | |
++----------------+-----------+-----------+-----------+-----------+-----------+-----------+--------------+-----------+-----------+
| June 2017 |`12.1.0`_ | | | | | | | | |
+----------------+-----------+-----------+-----------+-----------+-----------+-----------+--------------+-----------+-----------+
| May 2017 |`12.0.3`_ | | | | | | | | |
| January 2017 | 11.1.1 | | | | | | | |`11.2.0`_ |
+----------------+-----------+-----------+-----------+-----------+-----------+-----------+--------------+-----------+-----------+
| December 2016 | 11.1.0 | | | | | | |`10.2.5`_ | |
-+ +-----------+-----------+-----------+-----------+-----------+-----------+--------------+-----------+-----------+
+| +-----------+-----------+-----------+-----------+-----------+-----------+--------------+-----------+-----------+
| | | | | | | | |`10.2.4`_ | |
+----------------+-----------+-----------+-----------+-----------+-----------+-----------+--------------+-----------+-----------+
| October 2016 |`11.0.2`_ | | | | | | | | |
-+ +-----------+-----------+-----------+-----------+-----------+-----------+--------------+-----------+-----------+
+| +-----------+-----------+-----------+-----------+-----------+-----------+--------------+-----------+-----------+
| | 11.0.1 | | | | | | | | |
+----------------+-----------+-----------+-----------+-----------+-----------+-----------+--------------+-----------+-----------+
| September 2016 | | | | | | | |`10.2.3`_ | |
+----------------+-----------+-----------+-----------+-----------+-----------+-----------+--------------+-----------+-----------+
| August 2016 | | | | | |`0.94.9`_ | | | |
-+ +-----------+-----------+-----------+-----------+-----------+-----------+--------------+-----------+-----------+
+| +-----------+-----------+-----------+-----------+-----------+-----------+--------------+-----------+-----------+
| | | | | | |`0.94.8`_ | | | |
+----------------+-----------+-----------+-----------+-----------+-----------+-----------+--------------+-----------+-----------+
| June 2016 | 11.0.0 | | | | | | |`10.2.2`_ | |
.. _11.0.2: ../release-notes#v11-0-2-kraken
+.. _10.2.9: ../release-notes#v10-2-9-jewel
+.. _10.2.8: ../release-notes#v10-2-8-jewel
.. _10.2.7: ../release-notes#v10-2-7-jewel
.. _10.2.6: ../release-notes#v10-2-6-jewel
.. _10.2.5: ../release-notes#v10-2-5-jewel
</td></tr></tbody></table>
-Install each dependency that isn't installed on your host. For Debian/Ubuntu
+Install each dependency that is not installed on your host. For Debian/Ubuntu
distributions, execute the following::
sudo apt-get install gcc python-dev python-pip python-virtualenv libxml2-dev libxslt-dev doxygen graphviz ant ditaa
wget http://rpmfind.net/linux/centos/7/os/x86_64/Packages/python-sphinx-1.1.3-11.el7.noarch.rpm
sudo yum install python-sphinx-1.1.3-11.el7.noarch.rpm
-Ceph documentation makes extensive use of `ditaa`_, which isn't presently built
+Ceph documentation makes extensive use of `ditaa`_, which is not presently built
for CentOS/RHEL7. You must install ``ditaa`` if you are making changes to
``ditaa`` diagrams so that you can verify that they render properly before you
commit new or modified ``ditaa`` diagrams. You may retrieve compatible required
drive, but SSDs often exhibit access times that are at least 100x faster than a
hard disk drive.
-SSDs do not have moving mechanical parts so they aren't necessarily subject to
+SSDs do not have moving mechanical parts so they are not necessarily subject to
the same types of limitations as hard disk drives. SSDs do have significant
limitations though. When evaluating SSDs, it is important to consider the
performance of sequential reads and writes. An SSD that has 400MB/s sequential
</td><td><h3>Step 2: Storage Cluster</h3>
-Once you've completed your preflight checklist, you should be able to begin
+Once you have completed your preflight checklist, you should be able to begin
deploying a Ceph Storage Cluster.
.. toctree::
ceph-deploy admin node1 node2 node3
-#. Deploy a manager daemon.::
+#. Deploy a manager daemon. (Required only for luminous+ builds, i.e. >= 12.x builds)::
- ceph-deploy mgr create node1
+ ceph-deploy mgr create node1
#. Add three OSDs. For the purposes of these instructions, we assume you have an
unused disk in each node called ``/dev/vdb``. *Be sure that the device is not currently in use and does not contain any important data.*
directory. Ensure that the keyring file has appropriate read permissions
(e.g., ``sudo chmod +r /etc/ceph/ceph.client.admin.keyring``).
-Create an rbd pool
-==================
-#. On the admin node, use the ``ceph`` tool to `Create a Pool`_
+Create a Block Device Pool
+==========================
+
+#. On the admin node, use the ``ceph`` tool to `create a pool`_
(we recommend the name 'rbd').
+#. On the admin node, use the ``rbd`` tool to initialize the pool for use by RBD::
+
+ rbd pool init <pool-name>
+
Configure a Block Device
========================
See `block devices`_ for additional details.
-.. _Create a Pool: ../../rados/operations/pools#createpool
.. _Storage Cluster Quick Start: ../quick-ceph-deploy
+.. _create a pool: ../../rados/operations/pools/#create-a-pool
.. _block devices: ../../rbd/rbd
.. _FAQ: http://wiki.ceph.com/How_Can_I_Give_Ceph_a_Try
.. _OS Recommendations: ../os-recommendations
#. Add the Ceph repository to your yum configuration file at ``/etc/yum.repos.d/ceph.repo`` with the following command::
- cat >/etc/yum.repos.d/ceph.repro
+ cat >/etc/yum.repos.d/ceph.repo
[ceph-noarch]
name=Ceph noarch packages
baseurl=https://download.ceph.com/rpm/el7/noarch
fi
export LC_ALL=C # the following is vulnerable to i18n
+function munge_ceph_spec_in {
+ local OUTFILE=$1
+ sed -e 's/@//g' -e 's/%bcond_with make_check/%bcond_without make_check/g' < ceph.spec.in > $OUTFILE
+}
+
if [ x`uname`x = xFreeBSDx ]; then
$SUDO pkg install -yq \
+ devel/babeltrace \
devel/git \
devel/gperf \
devel/gmake \
lang/cython \
devel/py-virtualenv \
databases/leveldb \
- net/openldap24-client \
+ net/openldap-client \
security/nss \
security/cryptopp \
archivers/snappy \
textproc/gsed \
textproc/libxml2 \
textproc/xmlstarlet \
- textproc/jq \
- textproc/sphinx \
+ textproc/jq \
+ textproc/py-sphinx \
emulators/fuse \
java/junit \
+ lang/python \
lang/python27 \
- devel/py-pip \
+ devel/py-pip \
devel/py-argparse \
devel/py-nose \
+ devel/py-prettytable \
www/py-flask \
www/fcgi \
sysutils/flock \
fi
;;
esac
- sed -e 's/@//g' < ceph.spec.in > $DIR/ceph.spec
+ munge_ceph_spec_in $DIR/ceph.spec
$SUDO $builddepcmd $DIR/ceph.spec 2>&1 | tee $DIR/yum-builddep.out
! grep -q -i error: $DIR/yum-builddep.out || exit 1
;;
opensuse|suse|sles)
echo "Using zypper to install dependencies"
$SUDO zypper --gpg-auto-import-keys --non-interactive install lsb-release systemd-rpm-macros
- sed -e 's/@//g' < ceph.spec.in > $DIR/ceph.spec
+ munge_ceph_spec_in $DIR/ceph.spec
$SUDO zypper --non-interactive install $(rpmspec -q --buildrequires $DIR/ceph.spec) || exit 1
;;
alpine)
+++ /dev/null
-overrides:
- ceph-deploy:
- dmcrypt: yes
+++ /dev/null
-overrides:
- ceph-deploy:
- separate_journal_disk:
+++ /dev/null
-overrides:
- ceph-deploy:
- separate_journal_disk: yes
+++ /dev/null
-overrides:
- ceph-deploy:
- dmcrypt: yes
- separate_journal_disk: yes
- overall HEALTH_
- (OSD_DOWN)
- (OSD_
- - wrongly marked me down
+ - but it is still running
# MDS daemon 'b' is not responding, replacing it as rank 0 with standby 'a'
- is not responding
conf:
+++ /dev/null
-overrides:
- ceph-deploy:
- conf:
- global:
- mon pg warn min per osd: 2
- osd pool default size: 2
elif [ $2 = "kraken" ] ; then
# run kraken branch with /40 jobs
teuthology-suite -v -c $2 -m $3 -k distro -s $4 --subset $(echo "(($(date +%U) % 4) * 7) + $1" | bc)/40 -e $5 $6
+elif [ $2 = "luminous" ] ; then
+ # run luminous branch with /40 jobs
+ teuthology-suite -v -c $2 -m $3 -k distro -s $4 --subset $(echo "(($(date +%U) % 4) * 7) + $1" | bc)/40 -e $5 $6
else
# run NON master branches without --newest
teuthology-suite -v -c $2 -m $3 -k distro -s $4 --subset $(echo "(($(date +%U) % 4) * 7) + $1" | bc)/28 -e $5 $6
osd failsafe full ratio: .95
# this doesn't work with failures bc the log writes are not atomic across the two backends
# bluestore bluefs env mirror: true
+ ceph-deploy:
+ fs: xfs
+ bluestore: yes
+ conf:
+ osd:
+ osd objectstore: bluestore
+ bluestore block size: 96636764160
+ debug bluestore: 30
+ debug bdev: 20
+ debug bluefs: 20
+ debug rocksdb: 10
+ bluestore fsck on mount: true
+ # lower the full ratios since we can fill up a 100gb osd so quickly
+ mon osd full ratio: .9
+ mon osd backfillfull_ratio: .85
+ mon osd nearfull ratio: .8
+ osd failsafe full ratio: .95
+
osd:
osd objectstore: filestore
osd sloppy crc: true
+ ceph-deploy:
+ fs: xfs
+ filestore: True
+ conf:
+ osd:
+ osd objectstore: filestore
+ osd sloppy crc: true
+
overrides:
ceph:
log-whitelist:
- - wrongly marked me down
+ - but it is still running
conf:
mds:
debug mds: 20
--- /dev/null
+#!/bin/sh -ex
+
+if [ ! -e Makefile ]; then
+ echo 'run this from the build dir'
+ exit 1
+fi
+
+if [ `uname` = FreeBSD ]; then
+ # otherwise module prettytable will not be found
+ export PYTHONPATH=/usr/local/lib/python2.7/site-packages
+ exec_mode=+111
+else
+ exec_mode=/111
+fi
+
+for f in `find ../qa/standalone -perm $exec_mode -type f`
+do
+ echo '--- $f ---'
+ PATH=$PATH:bin \
+ CEPH_ROOT=.. \
+ CEPH_LIB=lib \
+ $f || exit 1
+done
+
+exit 0
--- /dev/null
+qa/standalone
+=============
+
+These scripts run standalone clusters, but not in a normal way. They make
+use of functions in ceph-helpers.sh to quickly start/stop daemons against
+toy clusters in a single directory.
+
+They are normally run via teuthology based on qa/suites/rados/standalone/*.yaml.
+
+You can run them in a git checkout + build directory as well:
+
+ * The qa/run-standalone.sh will run all of them in sequence. This is slow
+ since there is no parallelism.
+
+ * You can run an individual script by passing these environment args. For
+ example, if you are in the build/ directory,
+
+PATH=$PATH:bin CEPH_ROOT=.. CEPH_LIB=lib ../qa/standalone/mon/misc.sh
--- /dev/null
+#!/bin/bash
+#
+# Copyright (C) 2013,2014 Cloudwatt <libre.licensing@cloudwatt.com>
+# Copyright (C) 2014,2015 Red Hat <contact@redhat.com>
+# Copyright (C) 2014 Federico Gimenez <fgimenez@coit.es>
+#
+# Author: Loic Dachary <loic@dachary.org>
+# Author: Federico Gimenez <fgimenez@coit.es>
+#
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU Library Public License as published by
+# the Free Software Foundation; either version 2, or (at your option)
+# any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU Library Public License for more details.
+#
+# Defaults shared by the helpers below.
+TIMEOUT=300
+# number of placement groups given to "ceph osd pool create" by create_rbd_pool
+PG_NUM=4
+# assign /tmp to CEPH_BUILD_VIRTUALENV only when it is unset or empty
+: ${CEPH_BUILD_VIRTUALENV:=/tmp}
+
+# The xmlstarlet binary is looked up under two names (xmlstarlet, then xml);
+# sourcing this file aborts when neither is found.
+if type xmlstarlet > /dev/null 2>&1; then
+ XMLSTARLET=xmlstarlet
+elif type xml > /dev/null 2>&1; then
+ XMLSTARLET=xml
+else
+ echo "Missing xmlstarlet binary!"
+ exit 1
+fi
+
+# Pick the sed binary and the diff column options depending on the platform.
+if [ `uname` = FreeBSD ]; then
+ SED=gsed
+ DIFFCOLOPTS=""
+else
+ SED=sed
+ # extract the number following "columns" from the first line of stty -a
+ termwidth=$(stty -a | head -1 | sed -e 's/.*columns \([0-9]*\).*/\1/')
+ if [ -n "$termwidth" -a "$termwidth" != "0" ]; then
+ termwidth="-W ${termwidth}"
+ fi
+ DIFFCOLOPTS="-y $termwidth"
+fi
+
+# Extra command line options handed to the daemons started by run_mon and
+# run_mgr; they are only populated when CEPH_LIB is set.
+EXTRA_OPTS=""
+if [ -n "$CEPH_LIB" ]; then
+ EXTRA_OPTS+=" --erasure-code-dir $CEPH_LIB"
+ EXTRA_OPTS+=" --plugin-dir $CEPH_LIB"
+ EXTRA_OPTS+=" --osd-class-dir $CEPH_LIB"
+fi
+
+#! @file ceph-helpers.sh
+# @brief Toolbox to manage Ceph cluster dedicated to testing
+#
+# Example use case:
+#
+# ~~~~~~~~~~~~~~~~{.sh}
+# source ceph-helpers.sh
+#
+# function mytest() {
+# # cleanup leftovers and reset mydir
+# setup mydir
+# # create a cluster with one monitor and three osds
+# run_mon mydir a
+# run_osd mydir 0
+# run_osd mydir 2
+# run_osd mydir 3
+# # put and get an object
+# rados --pool rbd put GROUP /etc/group
+# rados --pool rbd get GROUP /tmp/GROUP
+# # stop the cluster and cleanup the directory
+# teardown mydir
+# }
+# ~~~~~~~~~~~~~~~~
+#
+# The focus is on simplicity and efficiency, in the context of
+# functional tests. The output is intentionally very verbose
+# and functions return as soon as an error is found. The caller
+# is also expected to abort on the first error so that debugging
+# can be done by looking at the end of the output.
+#
+# Each function is documented, implemented and tested independently.
+# When modifying a helper, the test and the documentation are
+# expected to be updated and it is easier if they are collocated. A
+# test for a given function can be run with
+#
+# ~~~~~~~~~~~~~~~~{.sh}
+# ceph-helpers.sh TESTS test_get_osds
+# ~~~~~~~~~~~~~~~~
+#
+# and all the tests (i.e. all functions matching test_*) are run
+# with:
+#
+# ~~~~~~~~~~~~~~~~{.sh}
+# ceph-helpers.sh TESTS
+# ~~~~~~~~~~~~~~~~
+#
+# A test function takes a single argument : the directory dedicated
+# to the tests. It is expected to not create any file outside of this
+# directory and remove it entirely when it completes successfully.
+#
+
+
+##
+# Print the directory in which the admin sockets are stored:
+# $CEPH_ASOK_DIR when it is set to a non-empty value, otherwise
+# ceph-asok.<pid of this shell> under ${TMPDIR:-/tmp}.
+#
+function get_asok_dir() {
+    if [ -z "$CEPH_ASOK_DIR" ]; then
+        echo ${TMPDIR:-/tmp}/ceph-asok.$$
+        return
+    fi
+    echo "$CEPH_ASOK_DIR"
+}
+
+##
+# Print the path of an admin socket located in the directory reported
+# by get_asok_dir. With a daemon name (e.g. mon.a) the file name is
+# ceph-<name>.asok; without argument the file name is the template
+# $cluster-$name.asok, printed with literal dollar signs.
+#
+# @param name optional daemon name
+#
+function get_asok_path() {
+    local name=$1
+    if [ -z "$name" ]; then
+        echo $(get_asok_dir)/\$cluster-\$name.asok
+        return
+    fi
+    echo $(get_asok_dir)/ceph-$name.asok
+}
+##
+# Start from a clean slate: run **teardown** on whatever is left in
+# **dir**, then create **dir** and the admin socket directory again.
+#
+# @param dir path name of the environment
+# @return 1 if teardown fails, otherwise the exit status of the last mkdir
+#
+function setup() {
+    local dir=$1
+    if ! teardown $dir; then
+        return 1
+    fi
+    mkdir -p $dir
+    mkdir -p $(get_asok_dir)
+}
+
+##
+# Check that setup creates the directory, can be run a second time on
+# an existing environment, and that teardown succeeds afterwards.
+#
+# @param dir directory dedicated to the test; when the argument is
+#            missing, the value of $dir inherited from the caller is
+#            used, as it was before
+# @return 0 on success, 1 on error
+#
+function test_setup() {
+    local dir=${1:-$dir}
+    setup $dir || return 1
+    test -d $dir || return 1
+    setup $dir || return 1
+    test -d $dir || return 1
+    teardown $dir
+}
+
+#######################################################################
+
+##
+# Tear down the environment rooted at **dir**: the daemons that left
+# a .pid file in **dir** are sent KILL through kill_daemons, the
+# btrfs subvolumes are deleted via __teardown_btrfs when the current
+# directory is on btrfs (this check is skipped on FreeBSD), and
+# finally **dir** and the admin socket directory are removed.
+#
+# @param dir path name of the environment
+# @return the exit status of the last rm
+#
+function teardown() {
+    local dir=$1
+    kill_daemons $dir KILL
+    if [ `uname` != FreeBSD ]; then
+        if [ $(stat -f -c '%T' .) == "btrfs" ]; then
+            __teardown_btrfs $dir
+        fi
+    fi
+    rm -fr $dir
+    rm -rf $(get_asok_dir)
+}
+
+##
+# Delete the btrfs subvolumes that "btrfs subvolume list" reports for
+# **btrfs_base_dir**. Both btrfs commands are run through sudo.
+#
+# @param btrfs_base_dir directory whose subvolumes are deleted
+#
+function __teardown_btrfs() {
+ local btrfs_base_dir=$1
+ # last column of the last line printed by df -P for the current directory
+ local btrfs_root=$(df -P . | tail -1 | awk '{print $NF}')
+ # NOTE(review): $btrfs_dir is never assigned in this function; unless a
+ # caller defines it, the grep pattern is just "$btrfs_base_dir/" -- confirm
+ local btrfs_dirs=$(cd $btrfs_base_dir; sudo btrfs subvolume list . -t | awk '/^[0-9]/ {print $4}' | grep "$btrfs_base_dir/$btrfs_dir")
+ for subvolume in $btrfs_dirs; do
+ sudo btrfs subvolume delete $btrfs_root/$subvolume
+ done
+}
+
+##
+# Check that teardown removes a directory created by setup.
+#
+# @param dir directory dedicated to the test; when the argument is
+#            missing, the value of $dir inherited from the caller is
+#            used, as it was before
+# @return 0 on success, 1 on error
+#
+function test_teardown() {
+    local dir=${1:-$dir}
+    setup $dir || return 1
+    teardown $dir || return 1
+    ! test -d $dir || return 1
+}
+
+#######################################################################
+
+##
+# Sends a signal to a single daemon.
+# This is a helper function for kill_daemons
+#
+# After the daemon is sent **signal**, its actual termination
+# will be verified by sending it signal 0. If the daemon is
+# still alive, kill_daemon will pause for a few seconds and
+# try again. This will repeat for a fixed number of times
+# before kill_daemon returns on failure. The list of
+# sleep intervals can be specified as **delays** and defaults
+# to:
+#
+# 0.1 0.2 1 1 1 2 3 5 5 5 10 10 20 60 60 60 120
+#
+# This sequence is designed to run first a very short sleep time (0.1)
+# if the machine is fast enough and the daemon terminates in a fraction of a
+# second. The increasing sleep numbers should give plenty of time for
+# the daemon to die even on the slowest running machine. If a daemon
+# takes more than a few minutes to stop (the sum of all sleep times),
+# there probably is no point in waiting more and a number of things
+# are likely to go wrong anyway: better give up and return on error.
+#
+# @param pidfile path of the file holding the pid of the daemon to signal
+# @param send_signal the signal to send
+# @param delays sequence of sleep times before failure
+#
+function kill_daemon() {
+ # the first argument is a file name: the pid is read from it
+ local pid=$(cat $1)
+ local send_signal=$2
+ local delays=${3:-0.1 0.2 1 1 1 2 3 5 5 5 10 10 20 60 60 60 120}
+ # stays 1 unless a kill attempt reports a failure
+ local exit_code=1
+ for try in $delays ; do
+ # kill succeeding means the signal could be delivered, i.e. the
+ # process is still there; kill failing ends the loop with success
+ if kill -$send_signal $pid 2> /dev/null ; then
+ exit_code=1
+ else
+ exit_code=0
+ break
+ fi
+ # the requested signal is only sent once, later iterations probe with signal 0
+ send_signal=0
+ sleep $try
+ done;
+ return $exit_code
+}
+
+##
+# Exercise kill_daemon against one mon, one mgr and one osd started
+# in **dir**, going through the pid files found with each prefix in turn.
+#
+# @param dir directory dedicated to the test
+# @return 0 on success, 1 on error
+#
+function test_kill_daemon() {
+ local dir=$1
+ setup $dir || return 1
+ run_mon $dir a --osd_pool_default_size=1 || return 1
+ run_mgr $dir x || return 1
+ run_osd $dir 0 || return 1
+
+ # note: name_prefix and pidfile are not declared local in this function
+ name_prefix=osd
+ for pidfile in $(find $dir 2>/dev/null | grep $name_prefix'[^/]*\.pid') ; do
+ #
+ # sending signal 0 won't kill the daemon
+ # waiting just for one second instead of the default schedule
+ # allows us to quickly verify what happens when kill fails
+ # to stop the daemon (i.e. it must return false)
+ #
+ ! kill_daemon $pidfile 0 1 || return 1
+ #
+ # killing just the osd and verify the mon still is responsive
+ #
+ kill_daemon $pidfile TERM || return 1
+ done
+
+ ceph osd dump | grep "osd.0 down" || return 1
+
+ name_prefix=mgr
+ for pidfile in $(find $dir 2>/dev/null | grep $name_prefix'[^/]*\.pid') ; do
+ #
+ # kill the mgr
+ #
+ kill_daemon $pidfile TERM || return 1
+ done
+
+ name_prefix=mon
+ for pidfile in $(find $dir 2>/dev/null | grep $name_prefix'[^/]*\.pid') ; do
+ #
+ # kill the mon and verify it cannot be reached
+ #
+ kill_daemon $pidfile TERM || return 1
+ ! timeout 5 ceph status || return 1
+ done
+
+ teardown $dir || return 1
+}
+
+##
+# Kill all daemons for which a .pid file exists in **dir**. Each
+# daemon is sent a **signal** and kill_daemons waits for it to exit
+# during a few minutes. By default all daemons are killed. If a
+# **name_prefix** is provided, only the daemons for which a pid
+# file is found matching the prefix are killed. See run_osd and
+# run_mon for more information about the name conventions for
+# the pid files.
+#
+# Send TERM to all daemons : kill_daemons $dir
+# Send KILL to all daemons : kill_daemons $dir KILL
+# Send KILL to all osds : kill_daemons $dir KILL osd
+# Send KILL to osd 1 : kill_daemons $dir KILL osd.1
+#
+# If a daemon is sent the TERM signal and does not terminate
+# within a few minutes, it will still be running even after
+# kill_daemons returns.
+#
+# If all daemons are killed successfully the function returns 0.
+# If at least one daemon remains, this is treated as an
+# error and the function returns 1.
+#
+# @param dir path name of the environment
+# @param signal name of the first signal (defaults to TERM)
+# @param name_prefix only kill matching daemons (defaults to all)
+# @param delays sequence of sleep times before failure
+# @return 0 on success, 1 on error
+#
+function kill_daemons() {
+ # remember whether xtrace is on and switch it off while the function
+ # runs; it is switched back on before returning
+ local trace=$(shopt -q -o xtrace && echo true || echo false)
+ $trace && shopt -u -o xtrace
+ local dir=$1
+ local signal=${2:-TERM}
+ local name_prefix=$3 # optional, osd, mon, osd.1
+ local delays=$4 #optional timing
+ local status=0
+ local pids=""
+
+ # name_prefix is used as a regular expression by grep; when it is empty
+ # every file under $dir whose name ends with .pid is selected
+ for pidfile in $(find $dir 2>/dev/null | grep $name_prefix'[^/]*\.pid') ; do
+ # run_in_background is defined outside this part of the file; presumably
+ # it starts kill_daemon in the background and records its pid in the
+ # variable named pids -- TODO confirm
+ run_in_background pids kill_daemon $pidfile $signal $delays
+ done
+
+ # the exit status of wait_background becomes the return value
+ wait_background pids
+ status=$?
+
+ $trace && shopt -s -o xtrace
+ return $status
+}
+
+##
+# Exercise kill_daemons against one mon, one mgr and one osd started
+# in **dir**: first with signal 0, then with TERM restricted by name
+# prefix, and finally with TERM and no prefix.
+#
+# @param dir directory dedicated to the test
+# @return 0 on success, 1 on error
+#
+function test_kill_daemons() {
+ local dir=$1
+ setup $dir || return 1
+ run_mon $dir a --osd_pool_default_size=1 || return 1
+ run_mgr $dir x || return 1
+ run_osd $dir 0 || return 1
+ #
+ # sending signal 0 won't kill the daemon
+ # waiting just for one second instead of the default schedule
+ # allows us to quickly verify what happens when kill fails
+ # to stop the daemon (i.e. it must return false)
+ #
+ ! kill_daemons $dir 0 osd 1 || return 1
+ #
+ # killing just the osd and verify the mon still is responsive
+ #
+ kill_daemons $dir TERM osd || return 1
+ ceph osd dump | grep "osd.0 down" || return 1
+ #
+ # kill the mgr
+ #
+ kill_daemons $dir TERM mgr || return 1
+ #
+ # kill the mon and verify it cannot be reached
+ #
+ kill_daemons $dir TERM || return 1
+ ! timeout 5 ceph status || return 1
+ teardown $dir || return 1
+}
+
+#######################################################################
+
+##
+# Run a monitor by the name mon.**id** with data in **dir**/**id**.
+# The logs can be found in **dir**/mon.**id**.log and the pid file
+# is **dir**/mon.**id**.pid. The admin socket is created in the
+# directory reported by get_asok_dir, at the path given by
+# get_asok_path (ceph-mon.**id**.asok for the default cluster name).
+#
+# The remaining arguments are passed verbatim to ceph-mon --mkfs
+# and the ceph-mon daemon.
+#
+# Two mandatory arguments must be provided: --fsid and --mon-host
+# Instead of adding them to every call to run_mon, they can be
+# set in the CEPH_ARGS environment variable to be read implicitly
+# by every ceph command.
+#
+# The CEPH_CONF variable is expected to be set to /dev/null to
+# only rely on arguments for configuration.
+#
+# Examples:
+#
+# CEPH_ARGS="--fsid=$(uuidgen) "
+# CEPH_ARGS+="--mon-host=127.0.0.1:7018 "
+# run_mon $dir a # spawn a mon and bind port 7018
+# run_mon $dir a --debug-filestore=20 # spawn with filestore debugging
+#
+# If mon_initial_members is not set, the default rbd pool is deleted
+# and replaced with a replicated pool with less placement groups to
+# speed up initialization. If mon_initial_members is set, no attempt
+# is made to recreate the rbd pool because it would hang forever,
+# waiting for other mons to join.
+#
+# A **dir**/ceph.conf file is created but not meant to be used by any
+# function. It is convenient for debugging a failure with:
+#
+# ceph --conf **dir**/ceph.conf -s
+#
+# @param dir path name of the environment
+# @param id mon identifier
+# @param ... can be any option valid for ceph-mon
+# @return 0 on success, 1 on error
+#
+function run_mon() {
+ local dir=$1
+ shift
+ local id=$1
+ shift
+ local data=$dir/$id
+
+ # first invocation: ceph-mon is run with --mkfs on the data directory,
+ # together with the options given by the caller
+ ceph-mon \
+ --id $id \
+ --mkfs \
+ --mon-data=$data \
+ --run-dir=$dir \
+ "$@" || return 1
+
+ # second invocation: start the monitor itself. The \$name in the log and
+ # pid file options and the template printed by get_asok_path are passed
+ # with a literal dollar sign, presumably for ceph-mon to expand. The
+ # options given by the caller come last on the command line.
+ ceph-mon \
+ --id $id \
+ --mon-osd-full-ratio=.99 \
+ --mon-data-avail-crit=1 \
+ --paxos-propose-interval=0.1 \
+ --osd-crush-chooseleaf-type=0 \
+ $EXTRA_OPTS \
+ --debug-mon 20 \
+ --debug-ms 20 \
+ --debug-paxos 20 \
+ --chdir= \
+ --mon-data=$data \
+ --log-file=$dir/\$name.log \
+ --admin-socket=$(get_asok_path) \
+ --mon-cluster-log-file=$dir/log \
+ --run-dir=$dir \
+ --pid-file=$dir/\$name.pid \
+ --mon-allow-pool-delete \
+ --mon-osd-backfillfull-ratio .99 \
+ "$@" || return 1
+
+ # write a minimal ceph.conf holding the fsid and mon host values that
+ # get_config (defined outside this part of the file) prints for this mon
+ cat > $dir/ceph.conf <<EOF
+[global]
+fsid = $(get_config mon $id fsid)
+mon host = $(get_config mon $id mon_host)
+EOF
+}
+
+##
+# Exercise run_mon: with and without --mon-initial-members, with an
+# option given as argument and with an option given via CEPH_ARGS.
+#
+# @param dir directory dedicated to the test
+# @return 0 on success, 1 on error
+#
+function test_run_mon() {
+ local dir=$1
+
+ setup $dir || return 1
+
+ run_mon $dir a --mon-initial-members=a || return 1
+ create_rbd_pool || return 1
+ # the rbd pool created just above is expected to be listed as pool 1
+ # NOTE(review): this comment used to mention pool id 0, which did not match the check
+ ceph osd dump | grep "pool 1 'rbd'" || return 1
+ kill_daemons $dir || return 1
+
+ run_mon $dir a || return 1
+ create_rbd_pool || return 1
+ # rbd has been deleted / created again, hence it must no longer be pool 1
+ ! ceph osd dump | grep "pool 1 'rbd'" || return 1
+ local size=$(CEPH_ARGS='' ceph --format=json daemon $(get_asok_path mon.a) \
+ config get osd_pool_default_size)
+ test "$size" = '{"osd_pool_default_size":"3"}' || return 1
+
+ # without CEPH_ARGS the status command is expected to fail, unless it
+ # is pointed at the ceph.conf written by run_mon
+ ! CEPH_ARGS='' ceph status || return 1
+ CEPH_ARGS='' ceph --conf $dir/ceph.conf status || return 1
+
+ kill_daemons $dir || return 1
+
+ # an option given as argument to run_mon must reach the daemon
+ run_mon $dir a --osd_pool_default_size=1 || return 1
+ local size=$(CEPH_ARGS='' ceph --format=json daemon $(get_asok_path mon.a) \
+ config get osd_pool_default_size)
+ test "$size" = '{"osd_pool_default_size":"1"}' || return 1
+ kill_daemons $dir || return 1
+
+ # an option given via CEPH_ARGS must reach the daemon as well
+ CEPH_ARGS="$CEPH_ARGS --osd_pool_default_size=2" \
+ run_mon $dir a || return 1
+ local size=$(CEPH_ARGS='' ceph --format=json daemon $(get_asok_path mon.a) \
+ config get osd_pool_default_size)
+ test "$size" = '{"osd_pool_default_size":"2"}' || return 1
+ kill_daemons $dir || return 1
+
+ teardown $dir || return 1
+}
+
+##
+# Delete the rbd pool, create it again with $PG_NUM placement groups
+# and run "rbd pool init" on it.
+#
+# @return 1 if the deletion or the creation fails, otherwise the
+#         exit status of "rbd pool init"
+#
+function create_rbd_pool() {
+    if ! ceph osd pool delete rbd rbd --yes-i-really-really-mean-it; then
+        return 1
+    fi
+    if ! ceph osd pool create rbd $PG_NUM; then
+        return 1
+    fi
+    rbd pool init rbd
+}
+
+#######################################################################
+
+##
+# Run ceph-mgr with the identifier **id**, its data in **dir**/**id**
+# and **dir** as run directory. The log file and pid file options
+# point into **dir**; the admin socket option is the template printed
+# by get_asok_path.
+#
+# The remaining arguments are passed verbatim to ceph-mgr, after the
+# options set by this function.
+#
+# @param dir path name of the environment
+# @param id mgr identifier
+# @param ... can be any option valid for ceph-mgr
+# @return 0 on success, 1 on error
+#
+function run_mgr() {
+    local dir=$1
+    shift
+    local id=$1
+    shift
+    local data=$dir/$id
+
+    # options set by this helper, in the order they are handed to ceph-mgr
+    local -a mgr_args
+    mgr_args=(
+        --id $id
+        $EXTRA_OPTS
+        --debug-mgr 20
+        --debug-objecter 20
+        --debug-ms 20
+        --debug-paxos 20
+        --chdir=
+        --mgr-data=$data
+        --log-file=$dir/\$name.log
+        --admin-socket=$(get_asok_path)
+        --run-dir=$dir
+        --pid-file=$dir/\$name.pid
+    )
+
+    ceph-mgr "${mgr_args[@]}" "$@" || return 1
+}
+
+#######################################################################
+
+##
+# Create (prepare) and run (activate) an osd by the name osd.**id**
+# with data in **dir**/**id**. The logs can be found in
+# **dir**/osd.**id**.log, the pid file is **dir**/osd.**id**.pid and
+# the admin socket is **dir**/**id**/ceph-osd.**id**.asok.
+#
+# The remaining arguments are passed verbatim to ceph-osd.
+#
+# Two mandatory arguments must be provided: --fsid and --mon-host
+# Instead of adding them to every call to run_osd, they can be
+# set in the CEPH_ARGS environment variable to be read implicitly
+# by every ceph command.
+#
+# The CEPH_CONF variable is expected to be set to /dev/null to
+# only rely on arguments for configuration.
+#
+# The run_osd function creates the OSD data directory with ceph-disk
+# prepare on the **dir**/**id** directory and relies on the
+# activate_osd function to run the daemon.
+#
+# Examples:
+#
+# CEPH_ARGS="--fsid=$(uuidgen) "
+# CEPH_ARGS+="--mon-host=127.0.0.1:7018 "
+# run_osd $dir 0 # prepare and activate an osd using the monitor listening on 7018
+#
+# @param dir path name of the environment
+# @param id osd identifier
+# @param ... can be any option valid for ceph-osd
+# @return 0 on success, 1 on error
+#
+function run_osd() {
+    local dir="$1"
+    shift
+    local id="$1"
+    shift
+    local osd_data="$dir/$id"
+
+    # point the ceph-disk state and configuration directories at $dir
+    local ceph_disk_args=" --statedir=$dir --sysconfdir=$dir --prepend-to-path="
+
+    # prepare a filestore data directory; $ceph_disk_args is left unquoted
+    # on purpose, word splitting turns it into separate options
+    mkdir -p $osd_data
+    ceph-disk $ceph_disk_args prepare --filestore $osd_data || return 1
+
+    # start the daemon, forwarding the remaining ceph-osd options
+    activate_osd $dir $id "$@"
+}
+
+function run_osd_bluestore() {
+    local dir="$1"
+    shift
+    local id="$1"
+    shift
+    local osd_data="$dir/$id"
+
+    # point the ceph-disk state and configuration directories at $dir
+    local ceph_disk_args=" --statedir=$dir --sysconfdir=$dir --prepend-to-path="
+
+    # prepare a bluestore data directory; $ceph_disk_args is left unquoted
+    # on purpose, word splitting turns it into separate options
+    mkdir -p $osd_data
+    ceph-disk $ceph_disk_args prepare --bluestore $osd_data || return 1
+
+    # start the daemon, forwarding the remaining ceph-osd options
+    activate_osd $dir $id "$@"
+}
+
+function test_run_osd() {
+    local dir="$1"
+
+    setup $dir || return 1
+
+    run_mon $dir a || return 1
+    run_mgr $dir x || return 1
+
+    run_osd $dir 0 || return 1
+    local backfills=$(CEPH_ARGS='' ceph --format=json daemon \
+        $(get_asok_path osd.0) config get osd_max_backfills)
+    echo "$backfills" | grep -q 'osd_max_backfills' || return 1
+    # an option given on the run_osd command line must reach the daemon
+    run_osd $dir 1 --osd-max-backfills 20 || return 1
+    local backfills=$(CEPH_ARGS='' ceph --format=json daemon \
+        $(get_asok_path osd.1) config get osd_max_backfills)
+    [ "$backfills" = '{"osd_max_backfills":"20"}' ] || return 1
+    # and so must an option appended to CEPH_ARGS
+    CEPH_ARGS="$CEPH_ARGS --osd-max-backfills 30" run_osd $dir 2 || return 1
+    local backfills=$(CEPH_ARGS='' ceph --format=json daemon \
+        $(get_asok_path osd.2) config get osd_max_backfills)
+    [ "$backfills" = '{"osd_max_backfills":"30"}' ] || return 1
+
+    teardown $dir || return 1
+}
+
+#######################################################################
+
+##
+# Shutdown and remove all traces of the osd by the name osd.**id**.
+#
+# The OSD is shutdown with the TERM signal. It is then removed from
+# the auth list, crush map, osd map etc and the files associated with
+# it are also removed.
+#
+# @param dir path name of the environment
+# @param id osd identifier
+# @return 0 on success, 1 on error
+#
+function destroy_osd() {
+    local dir="$1" id="$2"
+
+    # take the OSD out and stop it before purging it from the cluster
+    ceph osd out osd.$id || return 1
+    kill_daemons $dir TERM osd.$id || return 1
+    ceph osd purge osd.$id --yes-i-really-mean-it || return 1
+    teardown $dir/$id || return 1
+    rm -fr $dir/$id
+}
+
+function test_destroy_osd() {
+    local dir=$1
+    setup $dir || return 1
+    run_mon $dir a || return 1
+    run_mgr $dir x || return 1
+    run_osd $dir 0 || return 1
+    destroy_osd $dir 0 || return 1
+    # osd.0 must be gone from the osd map ($id was never set in this test)
+    ! ceph osd dump | grep "osd.0 " || return 1
+    teardown $dir || return 1
+}
+
+#######################################################################
+
+##
+# Run (activate) an osd by the name osd.**id** with data in
+# **dir**/**id**. The logs can be found in **dir**/osd.**id**.log,
+# the pid file is **dir**/osd.**id**.pid and the admin socket is
+# **dir**/**id**/ceph-osd.**id**.asok.
+#
+# The remaining arguments are passed verbatim to ceph-osd.
+#
+# Two mandatory arguments must be provided: --fsid and --mon-host
+# Instead of adding them to every call to activate_osd, they can be
+# set in the CEPH_ARGS environment variable to be read implicitly
+# by every ceph command.
+#
+# The CEPH_CONF variable is expected to be set to /dev/null to
+# only rely on arguments for configuration.
+#
+# The activate_osd function expects a valid OSD data directory
+# in **dir**/**id**, either just created via run_osd or re-using
+# one left by a previous run of ceph-osd. The ceph-osd daemon is
+# run indirectly via ceph-disk activate.
+#
+# The activate_osd function blocks until the monitor reports the osd
+# up. If it fails to do so within $TIMEOUT seconds, activate_osd
+# fails.
+#
+# Examples:
+#
+# CEPH_ARGS="--fsid=$(uuidgen) "
+# CEPH_ARGS+="--mon-host=127.0.0.1:7018 "
+# activate_osd $dir 0 # activate an osd using the monitor listening on 7018
+#
+# @param dir path name of the environment
+# @param id osd identifier
+# @param ... can be any option valid for ceph-osd
+# @return 0 on success, 1 on error
+#
+function activate_osd() {
+    local dir=$1
+    shift
+    local id=$1
+    shift
+    local osd_data=$dir/$id
+
+    local ceph_disk_args
+    ceph_disk_args+=" --statedir=$dir"
+    ceph_disk_args+=" --sysconfdir=$dir"
+    ceph_disk_args+=" --prepend-to-path="
+
+    local ceph_args="$CEPH_ARGS"
+    ceph_args+=" --osd-failsafe-full-ratio=.99"
+    ceph_args+=" --osd-journal-size=100"
+    ceph_args+=" --osd-scrub-load-threshold=2000"
+    ceph_args+=" --osd-data=$osd_data"
+    ceph_args+=" --chdir="
+    # the blank keeps $EXTRA_OPTS from being glued to the empty --chdir= value
+    ceph_args+=" $EXTRA_OPTS"
+    ceph_args+=" --run-dir=$dir"
+    ceph_args+=" --admin-socket=$(get_asok_path)"
+    ceph_args+=" --debug-osd=20"
+    ceph_args+=" --log-file=$dir/\$name.log"
+    ceph_args+=" --pid-file=$dir/\$name.pid"
+    ceph_args+=" --osd-max-object-name-len 460"
+    ceph_args+=" --osd-max-object-namespace-len 64"
+    ceph_args+=" --enable-experimental-unrecoverable-data-corrupting-features *"
+    # "$*" (not "$@") is the form meant for joining options into one string
+    ceph_args+=" $*"
+    mkdir -p $osd_data
+    CEPH_ARGS="$ceph_args " ceph-disk $ceph_disk_args \
+        activate \
+        --mark-init=none \
+        $osd_data || return 1
+
+    # the activated data directory must belong to the requested OSD id
+    [ "$id" = "$(cat $osd_data/whoami)" ] || return 1
+    wait_for_osd up $id || return 1
+}
+
+function test_activate_osd() {
+    local dir="$1"
+
+    setup $dir || return 1
+
+    run_mon $dir a || return 1
+    run_mgr $dir x || return 1
+
+    run_osd $dir 0 || return 1
+    local backfills=$(CEPH_ARGS='' ceph --format=json daemon \
+        $(get_asok_path osd.0) config get osd_max_backfills)
+    echo "$backfills" | grep -q 'osd_max_backfills' || return 1
+    # stop the OSD so that its data directory can be activated again
+    kill_daemons $dir TERM osd || return 1
+    # the option given at activation time must be reported by the daemon
+    activate_osd $dir 0 --osd-max-backfills 20 || return 1
+    local backfills=$(CEPH_ARGS='' ceph --format=json daemon \
+        $(get_asok_path osd.0) config get osd_max_backfills)
+    [ "$backfills" = '{"osd_max_backfills":"20"}' ] || return 1
+
+    teardown $dir || return 1
+}
+
+#######################################################################
+
+##
+# Wait until the OSD **id** is either up or down, as specified by
+# **state**. It fails after $TIMEOUT seconds.
+#
+# @param state either up or down
+# @param id osd identifier
+# @return 0 on success, 1 on error
+#
+function wait_for_osd() {
+    local state=$1
+    local id=$2
+    # loop state is local so that a caller's own $i / $status is not clobbered
+    local i status=1
+    for ((i=0; i < $TIMEOUT; i++)); do
+        echo $i
+        if ! ceph osd dump | grep "osd.$id $state"; then
+            sleep 1
+        else
+            status=0
+            break
+        fi
+    done
+    return $status
+}
+
+function test_wait_for_osd() {
+    local dir="$1"
+    setup $dir || return 1
+    run_mon $dir a --osd_pool_default_size=1 || return 1
+    run_mgr $dir x || return 1
+    run_osd $dir 0 || return 1
+    wait_for_osd up 0 || return 1
+    kill_daemons $dir TERM osd || return 1
+    wait_for_osd down 0 || return 1
+    ( TIMEOUT=1 ; ! wait_for_osd up 0 ) || return 1    # must time out
+    teardown $dir || return 1
+}
+
+#######################################################################
+
+##
+# Display the list of OSD ids supporting the **objectname** stored in
+# **poolname**, as reported by ceph osd map.
+#
+# @param poolname an existing pool
+# @param objectname an objectname (may or may not exist)
+# @param STDOUT white space separated list of OSD ids
+# @return 0 on success, 1 on error
+#
+function get_osds() {
+    local poolname="$1"
+    local objectname="$2"
+    # jq prints one OSD id of the acting set per line
+    local osds=$(ceph --format json osd map $poolname $objectname 2>/dev/null |
+        jq '.acting[]')
+    # unquoted on purpose: word splitting joins the ids with single blanks
+    echo $osds
+}
+
+function test_get_osds() {
+    local dir="$1"
+
+    setup $dir || return 1
+    run_mon $dir a --osd_pool_default_size=2 || return 1
+    run_mgr $dir x || return 1
+    run_osd $dir 0 || return 1
+    run_osd $dir 1 || return 1
+    create_rbd_pool || return 1
+    wait_for_clean || return 1
+    create_rbd_pool || return 1
+    get_osds rbd GROUP | grep -q '^[0-1] [0-1]$' || return 1    # two ids
+    teardown $dir || return 1
+}
+
+#######################################################################
+
+##
+# Wait for the monitor to form quorum (optionally, of size N)
+#
+# @param timeout duration (lower-bound) to wait for quorum to be formed
+# @param quorumsize size of quorum to wait for
+# @return 0 on success, 1 on error
+#
+function wait_for_quorum() {
+    # an empty or missing timeout falls back to 300 seconds
+    local timeout=${1:-300}
+    local quorumsize=$2
+    # declared local so that they do not leak into the caller's scope
+    local no_quorum=1 wait_until jqfilter jqinput res
+
+    # without a quorum size, a mon_status answer within $timeout is enough
+    if [[ -z "$quorumsize" ]]; then
+        timeout $timeout ceph mon_status --format=json >&/dev/null || return 1
+        return 0
+    fi
+    # the filter does not change between polls, build it only once
+    jqfilter='.quorum | length == '$quorumsize
+    wait_until=$(($(date +%s) + $timeout))
+    while [[ $(date +%s) -lt $wait_until ]]; do
+        jqinput="$(timeout $timeout ceph mon_status --format=json 2>/dev/null)"
+        # quoted so that the JSON is neither word split nor glob expanded
+        res=$(echo "$jqinput" | jq "$jqfilter")
+        if [[ "$res" == "true" ]]; then
+            no_quorum=0
+            break
+        fi
+    done
+    return $no_quorum
+}
+
+#######################################################################
+
+##
+# Return the PG supporting the **objectname** stored in
+# **poolname**, as reported by ceph osd map.
+#
+# @param poolname an existing pool
+# @param objectname an objectname (may or may not exist)
+# @param STDOUT a PG
+# @return 0 on success, 1 on error
+#
+function get_pg() {
+    local poolname="$1" objectname="$2"
+
+    # jq -r prints the pgid as a raw string, without the JSON quotes
+    ceph --format json osd map $poolname $objectname 2>/dev/null | jq -r '.pgid'
+}
+
+function test_get_pg() {
+    local dir="$1"
+    setup $dir || return 1
+    run_mon $dir a --osd_pool_default_size=1 || return 1
+    run_mgr $dir x || return 1
+    run_osd $dir 0 || return 1
+    create_rbd_pool || return 1
+    wait_for_clean || return 1
+    # expect one digit, a dot and a lower case hexadecimal number
+    get_pg rbd GROUP | grep -q '^[0-9]\.[0-9a-f][0-9a-f]*$' || return 1
+    teardown $dir || return 1
+}
+
+#######################################################################
+
+##
+# Return the value of the **config**, obtained via the config get command
+# of the admin socket of **daemon**.**id**.
+#
+# @param daemon mon or osd
+# @param id mon or osd ID
+# @param config the configuration variable name as found in config_opts.h
+# @param STDOUT the config value
+# @return 0 on success, 1 on error
+#
+function get_config() {
+    local daemon="$1" id="$2"
+    local config="$3"
+
+    # CEPH_ARGS is emptied for the ceph invocation only; its stderr is
+    # dropped and jq -r prints the raw value stored under the config name
+    CEPH_ARGS='' ceph --format json daemon $(get_asok_path $daemon.$id) \
+        config get $config 2> /dev/null |
+        jq -r ".$config"
+}
+
+function test_get_config() {
+    local dir="$1"
+
+    # override the default config using command line arg and check it
+    setup $dir || return 1
+    run_mon $dir a --osd_pool_default_size=1 || return 1
+    [ $(get_config mon a osd_pool_default_size) = 1 ] || return 1
+    run_mgr $dir x || return 1
+    run_osd $dir 0 --osd_max_scrubs=3 || return 1
+    [ $(get_config osd 0 osd_max_scrubs) = 3 ] || return 1
+    teardown $dir || return 1
+}
+
+#######################################################################
+
+##
+# Set the **config** to specified **value**, via the config set command
+# of the admin socket of **daemon**.**id**
+#
+# @param daemon mon or osd
+# @param id mon or osd ID
+# @param config the configuration variable name as found in config_opts.h
+# @param value the config value
+# @return 0 on success, 1 on error
+#
+function set_config() {
+    local daemon="$1" id="$2"
+    local config="$3" value="$4"
+
+    # succeeds only if the admin socket answer has a "success" key; env
+    # clears CEPH_ARGS for the ceph command run in the substitution
+    [ $(env CEPH_ARGS='' ceph --format json daemon $(get_asok_path $daemon.$id) \
+        config set $config $value 2> /dev/null |
+        jq 'has("success")') == true ]
+}
+
+function test_set_config() {
+    local dir="$1"
+    # toggle ms_crc_header back and forth, reading it back after each change
+    setup $dir || return 1
+    run_mon $dir a --osd_pool_default_size=1 || return 1
+    [ $(get_config mon a ms_crc_header) = true ] || return 1
+    set_config mon a ms_crc_header false || return 1
+    [ $(get_config mon a ms_crc_header) = false ] || return 1
+    set_config mon a ms_crc_header true || return 1
+    [ $(get_config mon a ms_crc_header) = true ] || return 1
+    teardown $dir || return 1
+}
+
+#######################################################################
+
+##
+# Return the OSD id of the primary OSD supporting the **objectname**
+# stored in **poolname**, as reported by ceph osd map.
+#
+# @param poolname an existing pool
+# @param objectname an objectname (may or may not exist)
+# @param STDOUT the primary OSD id
+# @return 0 on success, 1 on error
+#
+function get_primary() {
+    local poolname="$1" objectname="$2"
+
+    # errors from ceph are discarded, jq extracts the acting_primary field
+    ceph --format json osd map $poolname $objectname 2>/dev/null |
+        jq '.acting_primary'
+}
+
+function test_get_primary() {
+    local dir="$1"
+
+    setup $dir || return 1
+    run_mon $dir a --osd_pool_default_size=1 || return 1
+    local osd=0    # the only OSD, hence necessarily the primary
+    run_mgr $dir x || return 1
+    run_osd $dir $osd || return 1
+    create_rbd_pool || return 1
+    wait_for_clean || return 1
+    [ $(get_primary rbd GROUP) = $osd ] || return 1
+    teardown $dir || return 1
+}
+
+#######################################################################
+
+##
+# Return the id of any OSD supporting the **objectname** stored in
+# **poolname**, as reported by ceph osd map, except the primary.
+#
+# @param poolname an existing pool
+# @param objectname an objectname (may or may not exist)
+# @param STDOUT the OSD id
+# @return 0 on success, 1 on error
+#
+function get_not_primary() {
+    local poolname="$1"
+    local objectname="$2"
+
+    local primary=$(get_primary $poolname $objectname)
+    ceph --format json osd map $poolname $objectname 2>/dev/null |
+        jq "[.acting[] | select(. != $primary)] | .[0]"    # null if none
+}
+
+function test_get_not_primary() {
+    local dir="$1"
+
+    setup $dir || return 1
+    run_mon $dir a --osd_pool_default_size=2 || return 1
+    run_mgr $dir x || return 1
+    run_osd $dir 0 || return 1
+    run_osd $dir 1 || return 1
+    create_rbd_pool || return 1
+    wait_for_clean || return 1
+    local primary=$(get_primary rbd GROUP)
+    local not_primary=$(get_not_primary rbd GROUP)
+    [ $not_primary != $primary ] || return 1
+    [ $not_primary = 0 -o $not_primary = 1 ] || return 1
+    teardown $dir || return 1
+}
+
+#######################################################################
+
+##
+# Run ceph-objectstore-tool against the OSD **id** using the data path
+# **dir**. The OSD is killed with TERM prior to running
+# ceph-objectstore-tool because access to the data path is
+# exclusive. The OSD is restarted after the command completes. The
+# objectstore_tool returns after all PG are active+clean again.
+#
+# @param dir the data path of the OSD
+# @param id the OSD id
+# @param ... arguments to ceph-objectstore-tool
+# @param STDIN the input of ceph-objectstore-tool
+# @param STDOUT the output of ceph-objectstore-tool
+# @return 0 on success, 1 on error
+#
+# The value of $ceph_osd_args will be passed to restarted osds
+#
+function objectstore_tool() {
+    local dir=$1
+    shift
+    local id=$1
+    shift
+    local osd_data=$dir/$id
+
+    local osd_type=$(cat $osd_data/type)
+    # the tool needs exclusive access to the data path: stop the OSD first
+    kill_daemons $dir TERM osd.$id >&2 < /dev/null || return 1
+    # a filestore OSD is also given the journal kept in its data directory
+    local journal_args
+    if [ "$osd_type" == "filestore" ]; then
+        journal_args=" --journal-path $osd_data/journal"
+    fi
+    ceph-objectstore-tool \
+        --data-path $osd_data \
+        $journal_args \
+        "$@" || return 1
+    activate_osd $dir $id $ceph_osd_args >&2 || return 1
+    wait_for_clean >&2
+}
+
+function test_objectstore_tool() {
+    local dir=$1
+    setup $dir || return 1
+    run_mon $dir a --osd_pool_default_size=1 || return 1
+    local osd=0
+    run_mgr $dir x || return 1
+    run_osd $dir $osd || return 1
+    create_rbd_pool || return 1
+    wait_for_clean || return 1
+    rados --pool rbd put GROUP /etc/group || return 1
+    # the bytes read back must match the file stored; a mismatch is a failure
+    objectstore_tool $dir $osd GROUP get-bytes | \
+        diff - /etc/group || return 1
+    ! objectstore_tool $dir $osd NOTEXISTS get-bytes || return 1
+    teardown $dir || return 1
+}
+
+#######################################################################
+
+##
+# Predicate checking if there is an ongoing recovery in the
+# cluster. If any of the recovering_{keys,bytes,objects}_per_sec
+# counters are reported by ceph status, it means recovery is in
+# progress.
+#
+# @return 0 if recovery in progress, 1 otherwise
+#
+function get_is_making_recovery_progress() {
+    # in jq a missing counter is null and null + null is null, hence the sum
+    # only differs from null when at least one of the counters is reported
+    local filter=".recovering_keys_per_sec + .recovering_bytes_per_sec"
+    filter+=" + .recovering_objects_per_sec"
+    local progress=$(ceph --format json status 2>/dev/null |
+        jq -r ".pgmap | $filter")
+    [ "$progress" != null ]
+}
+
+function test_get_is_making_recovery_progress() {
+    local dir="$1"
+    # with a monitor and a manager only, no recovery is expected
+    setup $dir || return 1
+    run_mon $dir a || return 1
+    run_mgr $dir x || return 1
+    ! get_is_making_recovery_progress || return 1
+    teardown $dir || return 1
+}
+
+#######################################################################
+
+##
+# Return the number of active and clean PGs in the cluster. A PG is
+# counted if ceph pg dump pgs reports it both **active** and **clean**
+# and not **stale**.
+#
+# @param STDOUT the number of active PGs
+# @return 0 on success, 1 on error
+#
+function get_num_active_clean() {
+    # keep the states that mention both active and clean, but not stale
+    local expression='select(contains("active") and contains("clean"))'
+    expression+=' | select(contains("stale") | not)'
+    ceph --format json pg dump pgs 2>/dev/null |
+        jq "[.[] | .state | $expression] | length"
+}
+
+function test_get_num_active_clean() {
+    local dir="$1"
+
+    setup $dir || return 1
+    run_mon $dir a --osd_pool_default_size=1 || return 1
+    run_mgr $dir x || return 1
+    run_osd $dir 0 || return 1
+    create_rbd_pool || return 1
+    wait_for_clean || return 1
+    local num_active_clean=$(get_num_active_clean)
+    [ "$num_active_clean" = $PG_NUM ] || return 1    # every PG of the pool
+    teardown $dir || return 1
+}
+
+#######################################################################
+
+##
+# Return the number of PGs in the cluster, according to
+# ceph status.
+#
+# @param STDOUT the number of PGs
+# @return 0 on success, 1 on error
+#
+function get_num_pgs() {
+    ceph --format json status 2>/dev/null | jq '.pgmap | .num_pgs'
+}
+
+function test_get_num_pgs() {
+    local dir="$1"
+
+    setup $dir || return 1
+    run_mon $dir a --osd_pool_default_size=1 || return 1
+    run_mgr $dir x || return 1
+    run_osd $dir 0 || return 1
+    create_rbd_pool || return 1
+    wait_for_clean || return 1
+    local num_pgs=$(get_num_pgs)
+    [ "$num_pgs" -gt 0 ] || return 1
+    teardown $dir || return 1
+}
+
+#######################################################################
+
+##
+# Return the OSD ids in use by at least one PG in the cluster (either
+# in the up or the acting set), according to ceph pg dump pgs. Every
+# OSD id shows as many times as they are used in up and acting sets.
+# If an OSD id is in both the up and acting set of a given PG, it will
+# show twice.
+#
+# @param STDOUT a sorted list of OSD ids
+# @return 0 on success, 1 on error
+#
+function get_osd_id_used_by_pgs() {
+    ceph --format json pg dump pgs 2>/dev/null | jq '.[] | (.up[], .acting[])' | sort
+}
+
+function test_get_osd_id_used_by_pgs() {
+    local dir="$1"
+
+    setup $dir || return 1
+    run_mon $dir a --osd_pool_default_size=1 || return 1
+    run_mgr $dir x || return 1
+    run_osd $dir 0 || return 1
+    create_rbd_pool || return 1
+    wait_for_clean || return 1
+    local osd_ids=$(get_osd_id_used_by_pgs | uniq)    # drop the repetitions
+    [ "$osd_ids" = "0" ] || return 1
+    teardown $dir || return 1
+}
+
+#######################################################################
+
+##
+# Wait until the OSD **id** shows **count** times in the
+# PGs (see get_osd_id_used_by_pgs for more information about
+# how OSD ids are counted).
+#
+# @param id the OSD id
+# @param count the number of time it must show in the PGs
+# @return 0 on success, 1 on error
+#
+function wait_osd_id_used_by_pgs() {
+    local id=$1
+    local count=$2
+    local i status=1    # local, so that the caller's variables are untouched
+    # -F -x counts whole lines only: id 1 must not also count 10, 11, 21...
+    for ((i=0; i < $TIMEOUT / 5; i++)); do
+        echo $i
+        if ! test $(get_osd_id_used_by_pgs | grep -c -F -x "$id") = $count ; then
+            sleep 5
+        else
+            status=0
+            break
+        fi
+    done
+    return $status
+}
+
+function test_wait_osd_id_used_by_pgs() {
+    local dir="$1"
+
+    setup $dir || return 1
+    run_mon $dir a --osd_pool_default_size=1 || return 1
+    run_mgr $dir x || return 1
+    run_osd $dir 0 || return 1
+    create_rbd_pool || return 1
+    wait_for_clean || return 1
+    wait_osd_id_used_by_pgs 0 8 || return 1
+    ! TIMEOUT=1 wait_osd_id_used_by_pgs 123 5 || return 1    # no such OSD
+    teardown $dir || return 1
+}
+
+#######################################################################
+
+##
+# Return the **sname** stamp of **pgid** (by default the date and time of its
+# last completed scrub, which a repair also sets) from ceph pg dump pgs.
+#
+# @param pgid the id of the PG
+# @param sname the stamp to display, last_scrub_stamp if not set
+# @param STDOUT the date and time of the last scrub
+# @return 0 on success, 1 on error
+#
+function get_last_scrub_stamp() {
+    local pgid="$1"
+    local sname="${2:-last_scrub_stamp}"
+    ceph --format json pg dump pgs 2>/dev/null |
+        jq -r ".[] | select(.pgid==\"$pgid\") | .$sname"
+}
+
+function test_get_last_scrub_stamp() {
+    local dir=$1
+
+    setup $dir || return 1
+    run_mon $dir a --osd_pool_default_size=1 || return 1
+    run_mgr $dir x || return 1
+    run_osd $dir 0 || return 1
+    create_rbd_pool || return 1
+    wait_for_clean || return 1
+    local stamp=$(get_last_scrub_stamp 2.0)    # local: do not leak the value
+    test -n "$stamp" || return 1
+    teardown $dir || return 1
+}
+
+#######################################################################
+
+##
+# Predicate checking if the cluster is clean, i.e. all of its PGs are
+# in a clean state (see get_num_active_clean for a definition).
+#
+# @return 0 if the cluster is clean, 1 otherwise
+#
+function is_clean() {
+    local num_pgs=$(get_num_pgs)    # local: do not leak into the caller
+    [ -n "$num_pgs" ] && [ "$num_pgs" != 0 ] || return 1    # no answer, no PG
+    [ "$(get_num_active_clean)" = "$num_pgs" ] || return 1
+}
+
+function test_is_clean() {
+    local dir="$1"
+    # once wait_for_clean has succeeded, is_clean must agree with it
+    setup $dir || return 1
+    run_mon $dir a --osd_pool_default_size=1 || return 1
+    run_mgr $dir x || return 1
+    run_osd $dir 0 || return 1
+    create_rbd_pool || return 1
+    wait_for_clean || return 1
+    is_clean || return 1
+    teardown $dir || return 1
+}
+
+#######################################################################
+
+##
+# Return a list of numbers that are increasingly larger and whose
+# total is **timeout** seconds. It allows short sleep delays on a fast
+# machine, while on a slow one the larger delays avoid stressing it
+# even further or spamming the logs.
+#
+# @param timeout sum of all delays, in seconds
+# @param first_step the first delay, 1 if not set; each next one doubles
+# @return a list of sleep delays
+#
+function get_timeout_delays() {
+    local trace=$(shopt -q -o xtrace && echo true || echo false)
+    $trace && shopt -u -o xtrace    # no tracing while the delays are computed
+    local timeout=$1
+    local first_step=${2:-1}
+
+    local i
+    local total="0"
+    i=$first_step
+    # bc does the arithmetic because the delays may be fractional
+    while test "$(echo $total + $i \<= $timeout | bc -l)" = "1"; do
+        echo -n "$i "
+        total=$(echo $total + $i | bc -l)
+        i=$(echo $i \* 2 | bc -l)
+    done
+    if test "$(echo $total \< $timeout | bc -l)" = "1"; then
+        echo -n $(echo $timeout - $total | bc -l)    # what is left to reach timeout
+    fi
+    ! $trace || shopt -s -o xtrace    # restore tracing; status 0 either way
+}
+
+function test_get_timeout_delays() {
+    [ "$(get_timeout_delays 1)" = "1 " ] || return 1
+    [ "$(get_timeout_delays 5)" = "1 2 2" ] || return 1
+    [ "$(get_timeout_delays 6)" = "1 2 3" ] || return 1
+    [ "$(get_timeout_delays 7)" = "1 2 4 " ] || return 1
+    [ "$(get_timeout_delays 8)" = "1 2 4 1" ] || return 1
+    [ "$(get_timeout_delays 1 .1)" = ".1 .2 .4 .3" ] || return 1
+    [ "$(get_timeout_delays 1.5 .1)" = ".1 .2 .4 .8 " ] || return 1
+    [ "$(get_timeout_delays 5 .1)" = ".1 .2 .4 .8 1.6 1.9" ] || return 1
+    [ "$(get_timeout_delays 6 .1)" = ".1 .2 .4 .8 1.6 2.9" ] || return 1
+    [ "$(get_timeout_delays 6.3 .1)" = ".1 .2 .4 .8 1.6 3.2 " ] || return 1
+    [ "$(get_timeout_delays 20 .1)" = ".1 .2 .4 .8 1.6 3.2 6.4 7.3" ] || return 1
+}
+
+#######################################################################
+
+##
+# Wait until the cluster becomes clean, or fail if it does not make
+# progress for $TIMEOUT seconds.
+# Progress is measured either via the **get_is_making_recovery_progress**
+# predicate or if the number of clean PGs changes (as returned by get_num_active_clean)
+#
+# @return 0 if the cluster is clean, 1 otherwise
+#
+function wait_for_clean() {
+ local num_active_clean=-1 # count seen at the previous iteration
+ local cur_active_clean
+ local -a delays=($(get_timeout_delays $TIMEOUT .1))
+ local -i loop=0 # index in delays, reset whenever progress is observed
+
+ while test $(get_num_pgs) == 0 ; do # NOTE: this first wait has no timeout
+ sleep 1
+ done
+
+ while true ; do
+ # Comparing get_num_active_clean & get_num_pgs is used to determine
+ # if the cluster is clean. That's almost an inline of is_clean() to
+ # get more performance by avoiding multiple calls of get_num_active_clean.
+ cur_active_clean=$(get_num_active_clean)
+ test $cur_active_clean = $(get_num_pgs) && break
+ if test $cur_active_clean != $num_active_clean ; then
+ loop=0
+ num_active_clean=$cur_active_clean
+ elif get_is_making_recovery_progress ; then
+ loop=0
+ elif (( $loop >= ${#delays[*]} )) ; then
+ ceph report # output the cluster report before giving up
+ return 1
+ fi
+ sleep ${delays[$loop]}
+ loop+=1 # arithmetic increment, loop is declared with -i
+ done
+ return 0
+}
+
+function test_wait_for_clean() {
+    local dir="$1"
+
+    setup $dir || return 1
+    run_mon $dir a --osd_pool_default_size=1 || return 1
+    run_mgr $dir x || return 1
+    create_rbd_pool || return 1
+    ! TIMEOUT=1 wait_for_clean || return 1    # no OSD yet: must time out
+    run_osd $dir 0 || return 1
+    wait_for_clean || return 1
+    teardown $dir || return 1
+}
+
+#######################################################################
+
+##
+# Wait until ceph health detail matches **grepstr**, or fail once the
+# $TIMEOUT based retry delays are exhausted.
+# @param grepstr the pattern to look for in the output of ceph health detail
+# @return 0 if the pattern was found, 1 otherwise
+#
+function wait_for_health() {
+    local grepstr="$1"
+    local -a delays=($(get_timeout_delays $TIMEOUT .1))
+    local -i loop=0
+
+    until ceph health detail | grep "$grepstr" ; do
+        if (( loop >= ${#delays[*]} )) ; then
+            ceph health detail    # show the last state before giving up
+            return 1
+        fi
+        sleep ${delays[$loop]}
+        loop+=1
+    done
+}
+
+function wait_for_health_ok() {
+    wait_for_health HEALTH_OK || return 1    # any failure is reported as 1
+}
+
+function test_wait_for_health_ok() {
+    local dir="$1"
+
+    setup $dir || return 1
+    run_mon $dir a --osd_pool_default_size=1 --osd_failsafe_full_ratio=.99 --mon_pg_warn_min_per_osd=0 || return 1
+    run_mgr $dir x --mon_pg_warn_min_per_osd=0 || return 1
+    run_osd $dir 0 || return 1
+    kill_daemons $dir TERM osd || return 1
+    ! TIMEOUT=1 wait_for_health_ok || return 1    # OSD stopped: not healthy
+    activate_osd $dir 0 || return 1
+    wait_for_health_ok || return 1
+    teardown $dir || return 1
+}
+
+
+#######################################################################
+
+##
+# Run repair on **pgid** and wait until it completes. The repair
+# function will fail if repair does not complete within $TIMEOUT
+# seconds.
+#
+# @param pgid the id of the PG
+# @return 0 on success, 1 on error
+#
+function repair() {
+    local pgid="$1"
+    local last_scrub=$(get_last_scrub_stamp $pgid)    # stamp before the repair
+    ceph pg repair $pgid
+    wait_for_scrub $pgid "$last_scrub"    # returns once the stamp has changed
+}
+
+function test_repair() {
+    local dir="$1"
+
+    setup $dir || return 1
+    run_mon $dir a --osd_pool_default_size=1 || return 1
+    run_mgr $dir x || return 1
+    run_osd $dir 0 || return 1
+    create_rbd_pool || return 1
+    wait_for_clean || return 1
+    repair 2.0 || return 1
+    kill_daemons $dir KILL osd || return 1
+    ! TIMEOUT=1 repair 2.0 || return 1    # OSD killed: must time out
+    teardown $dir || return 1
+}
+#######################################################################
+
+##
+# Run scrub on **pgid** and wait until it completes. The pg_scrub
+# function will fail if scrub does not complete within $TIMEOUT
+# seconds. The pg_scrub is complete whenever the
+# **get_last_scrub_stamp** function reports a timestamp different from
+# the one stored before starting the scrub.
+#
+# @param pgid the id of the PG
+# @return 0 on success, 1 on error
+#
+function pg_scrub() {
+    local pgid="$1"
+    local last_scrub=$(get_last_scrub_stamp $pgid)    # stamp before the scrub
+    ceph pg scrub $pgid
+    wait_for_scrub $pgid "$last_scrub"    # returns once the stamp has changed
+}
+
+function pg_deep_scrub() {
+    local pgid="$1"
+    # a deep scrub is tracked with last_deep_scrub_stamp instead of the default
+    local last_scrub=$(get_last_scrub_stamp $pgid last_deep_scrub_stamp)
+    ceph pg deep-scrub $pgid && : ; wait_for_scrub $pgid "$last_scrub" last_deep_scrub_stamp
+}
+
+function test_pg_scrub() {
+    local dir="$1"
+
+    setup $dir || return 1
+    run_mon $dir a --osd_pool_default_size=1 || return 1
+    run_mgr $dir x || return 1
+    run_osd $dir 0 || return 1
+    create_rbd_pool || return 1
+    wait_for_clean || return 1
+    pg_scrub 2.0 || return 1
+    kill_daemons $dir KILL osd || return 1
+    ! TIMEOUT=1 pg_scrub 2.0 || return 1    # OSD killed: must time out
+    teardown $dir || return 1
+}
+
+#######################################################################
+
+##
+# Run the *command* and expect it to fail (i.e. return a non zero status).
+# The output (stderr and stdout) is stored in a temporary file in *dir*
+# and is expected to contain the string *expected*.
+#
+# Return 0 if the command failed and the string was found. Otherwise
+# return 1 and cat the full output of the command on stderr for debug.
+#
+# @param dir temporary directory to store the output
+# @param expected string to look for in the output
+# @param command ... the command and its arguments
+# @return 0 on success, 1 on error
+#
+
+function expect_failure() {
+    local dir="$1"
+    shift
+    local expected="$1"
+    shift
+    local success=false
+
+    # run the command, keeping stdout and stderr for the check below
+    if "$@" > $dir/out 2>&1 ; then
+        success=true
+    fi
+
+    # accept only a failed command whose output contains the expected string
+    if ! $success && grep --quiet "$expected" $dir/out ; then
+        return 0
+    fi
+    # otherwise show the captured output to help debugging
+    cat $dir/out >&2
+    return 1
+}
+
+function test_expect_failure() {
+    local dir="$1"
+
+    setup $dir || return 1
+    expect_failure $dir FAIL bash -c 'echo FAIL ; exit 1' || return 1
+    # the command did not fail
+    ! expect_failure $dir FAIL bash -c 'echo FAIL ; exit 0' > $dir/out || return 1
+    grep -q FAIL $dir/out || return 1
+    # the command failed but the output does not contain the expected string
+    ! expect_failure $dir FAIL bash -c 'echo UNEXPECTED ; exit 1' > $dir/out || return 1
+    ! grep -q FAIL $dir/out || return 1
+    teardown $dir || return 1
+}
+
+#######################################################################
+
+##
+# Given the *last_scrub*, wait for scrub to happen on **pgid**. It
+# will fail if scrub does not complete within $TIMEOUT seconds. The
+# scrub is complete when **get_last_scrub_stamp** reports a different stamp.
+#
+# @param pgid the id of the PG
+# @param last_scrub timestamp of the last scrub for *pgid*
+# @param sname the stamp to compare, last_scrub_stamp if not set
+# @return 0 on success, 1 on error
+#
+function wait_for_scrub() {
+    local pgid=$1
+    local last_scrub="$2"
+    local sname=${3:-last_scrub_stamp}
+    local i    # keep the loop counter out of the caller's scope
+    for ((i=0; i < $TIMEOUT; i++)); do
+        if test "$last_scrub" != "$(get_last_scrub_stamp $pgid $sname)" ; then
+            return 0
+        fi
+        sleep 1
+    done
+    return 1
+}
+
+function test_wait_for_scrub() {
+    local dir="$1"
+
+    setup $dir || return 1
+    run_mon $dir a --osd_pool_default_size=1 || return 1
+    run_mgr $dir x || return 1
+    run_osd $dir 0 || return 1
+    create_rbd_pool || return 1
+    wait_for_clean || return 1
+    local pgid=2.0
+    ceph pg repair $pgid
+    local last_scrub=$(get_last_scrub_stamp $pgid)
+    wait_for_scrub $pgid "$last_scrub" || return 1
+    kill_daemons $dir KILL osd || return 1
+    last_scrub=$(get_last_scrub_stamp $pgid)
+    ! TIMEOUT=1 wait_for_scrub $pgid "$last_scrub" || return 1    # no OSD left
+    teardown $dir || return 1
+}
+
+#######################################################################
+
+##
+# Return 0 if the erasure code *plugin* is available, 1 otherwise.
+#
+# @param plugin erasure code plugin
+# @return 0 on success, 1 on error
+#
+
+function erasure_code_plugin_exists() {
+    # Try to create a throw-away profile using *plugin*. The profile is
+    # removed again when creation worked. When it did not work and the
+    # error text is not the "plugin file is missing" message expected for
+    # this platform, the text is displayed and 1 is returned.
+    local plugin=$1
+    local status
+    local grepstr
+    local s
+
+    if [ "$(uname)" = FreeBSD ]; then
+        grepstr="Cannot open.*$plugin"
+    else
+        grepstr="$plugin.*No such file"
+    fi
+
+    s=$(ceph osd erasure-code-profile set TESTPROFILE plugin=$plugin 2>&1)
+    status=$?
+    if [ $status -ne 0 ]; then
+        if ! echo $s | grep --quiet "$grepstr" ; then
+            status=1
+            # display why the string was rejected.
+            echo $s
+        fi
+    else
+        ceph osd erasure-code-profile rm TESTPROFILE
+    fi
+    return $status
+}
+
+function test_erasure_code_plugin_exists() {
+    local dir=$1
+
+    # A monitor and a manager are started before probing the plugins.
+    setup $dir && run_mon $dir a && run_mgr $dir x || return 1
+    # jerasure must be reported as available ...
+    if ! erasure_code_plugin_exists jerasure ; then
+        return 1
+    fi
+    # ... whereas a plugin named FAKE must not.
+    if erasure_code_plugin_exists FAKE ; then
+        return 1
+    fi
+    teardown $dir || return 1
+}
+
+#######################################################################
+
+##
+# Display all log files from **dir** on stdout.
+#
+# @param dir directory in which all data is stored
+#
+
+function display_logs() {
+    local dir=$1
+
+    # Each *.log file found directly in $dir is dumped on stdout after a
+    # banner line carrying its path.
+    find $dir -maxdepth 1 -name '*.log' | while read file ; do
+        printf '%s\n' "======================= $file"
+        cat $file
+    done
+}
+
+function test_display_logs() {
+    local dir=$1
+
+    # Start then stop a monitor so that a mon.a.log file exists in $dir.
+    setup $dir && run_mon $dir a && kill_daemons $dir || return 1
+    display_logs $dir > $dir/log.out
+    # The dump must mention the monitor log.
+    if ! grep --quiet mon.a.log $dir/log.out ; then
+        return 1
+    fi
+    teardown $dir || return 1
+}
+
+#######################################################################
+##
+# Spawn a command in background and save the pid in the variable name
+# passed in argument. To make the output easier to read, each output
+# line is prepended with the process id of the invoking shell.
+#
+# Example:
+# pids1=""
+# run_in_background pids1 bash -c 'sleep 1; exit 1'
+#
+# @param pid_variable the variable name (not value) where the pids will be stored
+# @param ... the command to execute
+# @return only the pid_variable output should be considered and used with **wait_background**
+#
+function run_in_background() {
+ # Name (not value) of the caller's variable collecting the background pids.
+ local pid_variable=$1
+ # Drop the variable name: "$@" is now the command to run.
+ shift;
+ # Execute the command and prepend the output with a pid.
+ # stdout and stderr of the command (|&) go through awk, which stores every
+ # line and prints them all at END, each prefixed with "$$: ". $$ is spliced
+ # into the awk program by the shell; in bash it keeps the value of the
+ # invoking shell even inside the ( ... ) subshell.
+ # We enforce to return the exit status of the command and not the awk one.
+ # The whole subshell writes to stderr (>&2) and runs in background (&).
+ ("$@" |& awk '{ a[i++] = $0 }END{for (i = 0; i in a; ++i) { print "'$$': " a[i]} }'; return ${PIPESTATUS[0]}) >&2 &
+ # Append " <pid of the background subshell>" to the caller's variable.
+ eval "$pid_variable+=\" $!\""
+}
+
+function test_run_in_background() {
+    local pids
+    local count
+
+    # Two background jobs must leave exactly two pids in the variable.
+    run_in_background pids sleep 1
+    run_in_background pids sleep 1
+    count=$(echo $pids | wc -w)
+    if ! test $count = 2 ; then
+        return 1
+    fi
+    # Both jobs must be waitable and succeed.
+    if ! wait $pids ; then
+        return 1
+    fi
+}
+
+#######################################################################
+##
+# Wait for pids running in background to complete.
+# This function is usually used after a **run_in_background** call
+# Example:
+# pids1=""
+# run_in_background pids1 bash -c 'sleep 1; exit 1'
+# wait_background pids1
+#
+# @param pids The variable name that contains the active PIDS. Set as empty at the end of the function.
+# @return returns 1 if at least one process exits in error, otherwise returns 0
+#
+function wait_background() {
+    # Wait for every pid listed in the variable whose NAME is given in $1,
+    # then reset that variable to the empty string.
+    #
+    # Returns 1 if at least one of the waited processes exited with a
+    # non-zero status, 0 otherwise.
+    #
+    # The working variables are local so nothing leaks into the caller.
+    # They carry a _wb_ prefix on purpose: a local simply called "pids"
+    # would shadow a caller variable of the same name and the final reset
+    # would then empty the local copy instead of the caller's variable.
+    local _wb_pids=${!1}
+    local _wb_pid
+    local _wb_status=0
+
+    for _wb_pid in $_wb_pids; do
+        if ! wait $_wb_pid; then
+            # If one process failed then return 1
+            _wb_status=1
+        fi
+    done
+
+    # We empty the variable reporting that all process ended
+    eval "$1=''"
+
+    return $_wb_status
+}
+
+
+function test_wait_background() {
+    local pids=""
+    local rc
+
+    # One of the two jobs fails: wait_background must report 1.
+    run_in_background pids bash -c "sleep 1; exit 1"
+    run_in_background pids bash -c "sleep 2; exit 0"
+    wait_background pids
+    rc=$?
+    test $rc -eq 1 || return 1
+
+    # Both jobs succeed: wait_background must report 0.
+    run_in_background pids bash -c "sleep 1; exit 0"
+    run_in_background pids bash -c "sleep 2; exit 0"
+    wait_background pids
+    rc=$?
+    test $rc -eq 0 || return 1
+
+    # The pid list must have been emptied.
+    test -z "$pids" || return 1
+}
+
+function flush_pg_stats()
+{
+    # Ask every OSD listed by "ceph osd ls" to flush its PG stats, then wait
+    # until "ceph osd last-stat-seq" has caught up with the sequence number
+    # each OSD returned.
+    #
+    # $1 timeout  optional number of one-second polls allowed in total
+    #             (defaults to $TIMEOUT)
+    #
+    # Returns 1 when the polling budget is exhausted.
+    local timeout=${1:-$TIMEOUT}
+    # working variables are local so they do not clobber the caller's
+    local ids seqs osd seq s
+
+    ids=`ceph osd ls`
+    seqs=''
+    for osd in $ids; do
+        seq=`ceph tell osd.$osd flush_pg_stats`
+        # no sequence number was returned: there is nothing to wait for,
+        # and an empty value would produce a malformed "test" below
+        if test -z "$seq"; then
+            continue
+        fi
+        seqs="$seqs $osd-$seq"
+    done
+
+    for s in $seqs; do
+        # each entry is encoded as <osd>-<seq>
+        osd=`echo $s | cut -d - -f 1`
+        seq=`echo $s | cut -d - -f 2`
+        echo "waiting osd.$osd seq $seq"
+        while test $(ceph osd last-stat-seq $osd) -lt $seq; do
+            sleep 1
+            if [ $((timeout--)) -eq 0 ]; then
+                return 1
+            fi
+        done
+    done
+}
+
+function test_flush_pg_stats()
+{
+    # Self-test of flush_pg_stats: after writing one object and flushing,
+    # "ceph df detail" must report a non-zero usage for the rbd pool, and
+    # raw_bytes_used must equal bytes_used.
+    local dir=$1
+
+    setup $dir || return 1
+    run_mon $dir a --osd_pool_default_size=1 || return 1
+    run_mgr $dir x || return 1
+    run_osd $dir 0 || return 1
+    create_rbd_pool || return 1
+    rados -p rbd put obj /etc/group
+    flush_pg_stats
+    local jq_filter='.pools | .[] | select(.name == "rbd") | .stats'
+    local raw_bytes_used=`ceph df detail --format=json | jq "$jq_filter.raw_bytes_used"`
+    local bytes_used=`ceph df detail --format=json | jq "$jq_filter.bytes_used"`
+    # numeric comparison: "test $x > 0" would be parsed as a redirection
+    # to a file named 0 and succeed for any non-empty value
+    test "$raw_bytes_used" -gt 0 || return 1
+    test "$raw_bytes_used" == "$bytes_used" || return 1
+    # stop the daemons, as the other self-tests do
+    teardown $dir || return 1
+}
+
+#######################################################################
+
+##
+# Call the **run** function (which must be defined by the caller) with
+# the **dir** argument followed by the caller argument list.
+#
+# If the **run** function returns on error, all logs found in **dir**
+# are displayed for diagnostic purposes.
+#
+# **teardown** function is called when the **run** function returns
+# (on success or on error), to cleanup leftovers. The CEPH_CONF is set
+# to /dev/null and CEPH_ARGS is unset so that the tests are protected from
+# external interferences.
+#
+# It is the responsibility of the **run** function to call the
+# **setup** function to prepare the test environment (create a temporary
+# directory etc.).
+#
+# The shell is required (via PS4) to display the function and line
+# number whenever a statement is executed to help debugging.
+#
+# @param dir directory in which all data is stored
+# @param ... arguments passed transparently to **run**
+# @return 0 on success, 1 on error
+#
+function main() {
+    local dir=td/$1
+    shift
+
+    # Trace every statement, prefixed with file, line and function name.
+    set -o xtrace
+    PS4='${BASH_SOURCE[0]}:$LINENO: ${FUNCNAME[0]}: '
+
+    # make sure program from sources are preferred
+    export PATH=${CEPH_BUILD_VIRTUALENV}/ceph-disk-virtualenv/bin:${CEPH_BUILD_VIRTUALENV}/ceph-detect-init-virtualenv/bin:.:$PATH
+    #export PATH=$CEPH_ROOT/src/ceph-disk/virtualenv/bin:$CEPH_ROOT/src/ceph-detect-init/virtualenv/bin:.:$PATH # make sure program from sources are preferred
+
+    # Protect the test from external configuration.
+    export CEPH_CONF=/dev/null
+    unset CEPH_ARGS
+
+    # Logs are only displayed when run fails; teardown always happens.
+    local code=0
+    if ! run $dir "$@" ; then
+        display_logs $dir
+        code=1
+    fi
+    if ! teardown $dir ; then
+        return 1
+    fi
+    return $code
+}
+
+#######################################################################
+
+function run_tests() {
+    # Run the self-tests named in argument, or every function whose name
+    # starts with test_ when no argument is given, stopping at the first
+    # failure.
+    set -o xtrace
+    PS4='${BASH_SOURCE[0]}:$LINENO: ${FUNCNAME[0]}: '
+
+    # make sure program from sources are preferred
+    export PATH=${CEPH_BUILD_VIRTUALENV}/ceph-disk-virtualenv/bin:${CEPH_BUILD_VIRTUALENV}/ceph-detect-init-virtualenv/bin:.:$PATH
+    #export PATH=$CEPH_ROOT/src/ceph-disk/virtualenv/bin:$CEPH_ROOT/src/ceph-detect-init/virtualenv/bin:.:$PATH # make sure program from sources are preferred
+
+    export CEPH_MON="127.0.0.1:7109" # git grep '\<7109\>' : there must be only one
+    export CEPH_ARGS
+    CEPH_ARGS+="--fsid=$(uuidgen) --auth-supported=none --mon-host=$CEPH_MON "
+    export CEPH_CONF=/dev/null
+
+    local funcs=${@:-$(set | sed -n -e 's/^\(test_[0-9a-z_]*\) .*/\1/p')}
+    local dir=td/ceph-helpers
+
+    for func in $funcs ; do
+        if ! $func $dir ; then
+            return 1
+        fi
+    done
+}
+
+# When the first argument is the literal word TESTS, the remaining
+# arguments are handed to run_tests.
+if [ "$1" = TESTS ] ; then
+    shift
+    run_tests "$@"
+fi
+
+# NOTE:
+# jq only supports --exit-status|-e from version 1.4 onwards, which makes
+# returning on error waaaay prettier and straightforward.
+# However, the current automated upstream build is running with v1.3,
+# which has no idea what -e is. Hence the convoluted error checking we
+# need. Sad.
+# The next time someone changes this code, please check if v1.4 is now
+# a thing, and, if so, please change these to use -e. Thanks.
+
+# jq '.all.supported | select([.[] == "foo"] | any)'
+function jq_success() {
+  # Run the jq *filter* ($2) on *input* ($1) and succeed when jq prints
+  # either the literal true or the third argument wrapped in double quotes.
+  #
+  # $1 input    text piped to jq
+  # $2 filter   jq filter
+  # $3 expects  optional expected value, compared as "\"$3\""
+  #
+  # Returns 0 on match, 1 otherwise.
+  #
+  # A second, unreachable copy of the body that used to follow the final
+  # "return 1" has been removed, and all working variables are now local.
+  local input="$1"
+  local filter="$2"
+  local expects="\"$3\""
+  local in_escaped filter_escaped ret
+
+  # NOTE(review): every single quote is rewritten as '\'' although the
+  # values are never re-parsed by a shell; kept as is, confirm whether the
+  # callers rely on it.
+  in_escaped=$(printf %s "$input" | sed "s/'/'\\\\''/g")
+  filter_escaped=$(printf %s "$filter" | sed "s/'/'\\\\''/g")
+
+  ret=$(echo "$in_escaped" | jq "$filter_escaped")
+  # $expects always holds at least the two quote characters, so it is
+  # compared unconditionally.
+  if [[ "$ret" == "true" || "$ret" == "$expects" ]]; then
+    return 0
+  fi
+  return 1
+}
+
+# Local Variables:
+# compile-command: "cd ../../src ; make -j4 && ../qa/standalone/ceph-helpers.sh TESTS # test_get_config"
+# End:
--- /dev/null
+#!/bin/bash
+#
+# Copyright (C) 2017 Red Hat <contact@redhat.com>
+#
+# Author: Loic Dachary <loic@dachary.org>
+#
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU Library Public License as published by
+# the Free Software Foundation; either version 2, or (at your option)
+# any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU Library Public License for more details.
+#
+
+source $CEPH_ROOT/qa/standalone/ceph-helpers.sh
+
+function run() {
+    local dir=$1
+    shift
+
+    export CEPH_MON="127.0.0.1:7131" # git grep '\<7131\>' : there must be only one
+    export CEPH_ARGS
+    #
+    # Disable device auto class feature for now.
+    # The device class is non-deterministic and will
+    # crash the crushmap comparison below.
+    # (this is what --osd-class-update-on-start=false is for)
+    #
+    local option
+    for option in \
+        "--fsid=$(uuidgen)" \
+        --auth-supported=none \
+        "--mon-host=$CEPH_MON" \
+        --crush-location=root=default,host=HOST \
+        --osd-crush-initial-weight=3 \
+        --osd-class-update-on-start=false
+    do
+        CEPH_ARGS+="$option "
+    done
+
+    # Each selected TEST_* function runs between its own setup and teardown.
+    local funcs=${@:-$(set | sed -n -e 's/^\(TEST_[0-9a-z_]*\) .*/\1/p')}
+    for func in $funcs ; do
+        setup $dir && $func $dir && teardown $dir || return 1
+    done
+}
+
+function TEST_choose_args_update() {
+ #
+ # adding a weighted OSD updates the weight up to the top
+ #
+ # Outline: inject a choose_args section into the crushmap, add osd.1
+ # and compare the resulting map with a reference file, then destroy
+ # osd.1 and check the map is back to the injected one.
+ #
+ local dir=$1
+
+ run_mon $dir a || return 1
+ run_osd $dir 0 || return 1
+
+ ceph osd set-require-min-compat-client luminous
+ # Decompile the current crushmap and drop its closing marker so that a
+ # choose_args section (followed by a new marker) can be appended.
+ ceph osd getcrushmap > $dir/map || return 1
+ crushtool -d $dir/map -o $dir/map.txt || return 1
+ sed -i -e '/end crush map/d' $dir/map.txt
+ cat >> $dir/map.txt <<EOF
+# choose_args
+choose_args 0 {
+ {
+ bucket_id -1
+ weight_set [
+ [ 3.000 ]
+ [ 3.000 ]
+ ]
+ ids [ -10 ]
+ }
+ {
+ bucket_id -2
+ weight_set [
+ [ 2.000 ]
+ [ 2.000 ]
+ ]
+ ids [ -20 ]
+ }
+}
+
+# end crush map
+EOF
+ # Compile the edited text and install it as the cluster crushmap.
+ crushtool -c $dir/map.txt -o $dir/map-new || return 1
+ ceph osd setcrushmap -i $dir/map-new || return 1
+
+ # With one more OSD the decompiled map must match the reference file.
+ run_osd $dir 1 || return 1
+ ceph osd getcrushmap > $dir/map-one-more || return 1
+ crushtool -d $dir/map-one-more -o $dir/map-one-more.txt || return 1
+ cat $dir/map-one-more.txt
+ diff -u $dir/map-one-more.txt $CEPH_ROOT/src/test/crush/crush-choose-args-expected-one-more-3.txt || return 1
+
+ # Once that OSD is destroyed the decompiled map must be identical to
+ # the text that was injected above.
+ destroy_osd $dir 1 || return 1
+ ceph osd getcrushmap > $dir/map-one-less || return 1
+ crushtool -d $dir/map-one-less -o $dir/map-one-less.txt || return 1
+ diff -u $dir/map-one-less.txt $dir/map.txt || return 1
+}
+
+function TEST_no_update_weight_set() {
+ #
+ # adding a zero weight OSD does not update the weight set at all
+ #
+ # Same scenario as TEST_choose_args_update, run with
+ # --osd-crush-update-weight-set=false and compared with a different
+ # reference file.
+ #
+ local dir=$1
+
+ # CEPH_ARGS is extended for this test only and restored on the last
+ # line; the early "return 1" paths leave it modified.
+ ORIG_CEPH_ARGS="$CEPH_ARGS"
+ CEPH_ARGS+="--osd-crush-update-weight-set=false "
+
+ run_mon $dir a || return 1
+ run_osd $dir 0 || return 1
+
+ ceph osd set-require-min-compat-client luminous
+ ceph osd crush tree
+ # Decompile the current crushmap and drop its closing marker so that a
+ # choose_args section (followed by a new marker) can be appended.
+ ceph osd getcrushmap > $dir/map || return 1
+ crushtool -d $dir/map -o $dir/map.txt || return 1
+ sed -i -e '/end crush map/d' $dir/map.txt
+ cat >> $dir/map.txt <<EOF
+# choose_args
+choose_args 0 {
+ {
+ bucket_id -1
+ weight_set [
+ [ 2.000 ]
+ [ 1.000 ]
+ ]
+ ids [ -10 ]
+ }
+ {
+ bucket_id -2
+ weight_set [
+ [ 2.000 ]
+ [ 1.000 ]
+ ]
+ ids [ -20 ]
+ }
+}
+
+# end crush map
+EOF
+ # Compile the edited text and install it as the cluster crushmap.
+ crushtool -c $dir/map.txt -o $dir/map-new || return 1
+ ceph osd setcrushmap -i $dir/map-new || return 1
+ ceph osd crush tree
+
+
+ # With one more OSD the decompiled map must match the reference file.
+ run_osd $dir 1 || return 1
+ ceph osd crush tree
+ ceph osd getcrushmap > $dir/map-one-more || return 1
+ crushtool -d $dir/map-one-more -o $dir/map-one-more.txt || return 1
+ cat $dir/map-one-more.txt
+ diff -u $dir/map-one-more.txt $CEPH_ROOT/src/test/crush/crush-choose-args-expected-one-more-0.txt || return 1
+
+ # Once that OSD is destroyed the decompiled map must be identical to
+ # the text that was injected above.
+ destroy_osd $dir 1 || return 1
+ ceph osd crush tree
+ ceph osd getcrushmap > $dir/map-one-less || return 1
+ crushtool -d $dir/map-one-less -o $dir/map-one-less.txt || return 1
+ diff -u $dir/map-one-less.txt $dir/map.txt || return 1
+
+ CEPH_ARGS="$ORIG_CEPH_ARGS"
+}
+
+main crush-choose-args "$@"
+
+# Local Variables:
+# compile-command: "cd ../../../build ; ln -sf ../src/ceph-disk/ceph_disk/main.py bin/ceph-disk && make -j4 && ../src/test/crush/crush-choose-args.sh"
+# End:
--- /dev/null
+#!/bin/bash
+#
+# Copyright (C) 2017 Red Hat <contact@redhat.com>
+#
+# Author: Loic Dachary <loic@dachary.org>
+#
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU Library Public License as published by
+# the Free Software Foundation; either version 2, or (at your option)
+# any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU Library Public License for more details.
+#
+
+source $CEPH_ROOT/qa/standalone/ceph-helpers.sh
+
+function run() {
+    local dir=$1
+    shift
+
+    export CEPH_MON="127.0.0.1:7130" # git grep '\<7130\>' : there must be only one
+    export CEPH_ARGS
+    #
+    # Disable auto-class, so we can inject device class manually below
+    # (this is what --osd-class-update-on-start=false is for)
+    #
+    local option
+    for option in \
+        "--fsid=$(uuidgen)" \
+        --auth-supported=none \
+        "--mon-host=$CEPH_MON" \
+        --osd-class-update-on-start=false
+    do
+        CEPH_ARGS+="$option "
+    done
+
+    # Each selected TEST_* function runs between its own setup and teardown.
+    local funcs=${@:-$(set | sed -n -e 's/^\(TEST_[0-9a-z_]*\) .*/\1/p')}
+    for func in $funcs ; do
+        setup $dir && $func $dir && teardown $dir || return 1
+    done
+}
+
+function add_something() {
+    local dir=$1
+    local obj=${2:-SOMETHING}
+    local payload=ABCDEF
+
+    # Write the payload to $dir/ORIGINAL and store that file in the rbd
+    # pool under the name $obj (SOMETHING by default).
+    echo $payload > $dir/ORIGINAL
+    if ! rados --pool rbd put $obj $dir/ORIGINAL ; then
+        return 1
+    fi
+}
+
+function get_osds_up() {
+    local poolname=$1
+    local objectname=$2
+    local osds
+
+    # Extract every //up/osd value from the XML output of "ceph osd map",
+    # each followed by a space.
+    osds=`ceph --format xml osd map $poolname $objectname 2>/dev/null | \
+        $XMLSTARLET sel -t -m "//up/osd" -v . -o ' '`
+    # the unquoted expansion gets rid of the trailing space
+    echo $osds
+}
+
+function TEST_classes() {
+ # Check that a rule restricted to a device class only maps objects to
+ # the devices carrying that class.
+ local dir=$1
+
+ run_mon $dir a || return 1
+ run_osd $dir 0 || return 1
+ run_osd $dir 1 || return 1
+ run_osd $dir 2 || return 1
+ create_rbd_pool || return 1
+
+ # Before any class is involved the object is expected on "1 2 0".
+ test "$(get_osds_up rbd SOMETHING)" == "1 2 0" || return 1
+ add_something $dir SOMETHING || return 1
+
+ #
+ # osd.0 has class ssd and the rule is modified
+ # to only take ssd devices.
+ #
+ # Both changes are made by editing the decompiled crushmap text.
+ ceph osd getcrushmap > $dir/map || return 1
+ crushtool -d $dir/map -o $dir/map.txt || return 1
+ ${SED} -i \
+ -e '/device 0 osd.0/s/$/ class ssd/' \
+ -e '/step take default/s/$/ class ssd/' \
+ $dir/map.txt || return 1
+ crushtool -c $dir/map.txt -o $dir/map-new || return 1
+ ceph osd setcrushmap -i $dir/map-new || return 1
+
+ #
+ # There can only be one mapping since there only is
+ # one device with ssd class.
+ #
+ # Poll with increasing delays until the up set is exactly "0"; ok is
+ # a global variable.
+ ok=false
+ for delay in 2 4 8 16 32 64 128 256 ; do
+ if test "$(get_osds_up rbd SOMETHING_ELSE)" == "0" ; then
+ ok=true
+ break
+ fi
+ sleep $delay
+ ceph osd dump # for debugging purposes
+ ceph pg dump # for debugging purposes
+ done
+ $ok || return 1
+ #
+ # Writing keeps working because the pool is min_size 1 by
+ # default.
+ #
+ add_something $dir SOMETHING_ELSE || return 1
+
+ #
+ # Sanity check that the rule indeed has ssd
+ # generated bucket with a name including ~ssd.
+ #
+ ceph osd crush dump | grep -q '~ssd' || return 1
+}
+
+function TEST_set_device_class() {
+    local dir=$1
+    local id
+
+    # Start from the state left by TEST_classes.
+    if ! TEST_classes $dir ; then
+        return 1
+    fi
+
+    # Put osd.0 then osd.1 in the ssd class, checking the listing each time.
+    for id in 0 1 ; do
+        ceph osd crush set-device-class ssd osd.$id || return 1
+        ceph osd crush class ls-osd ssd | grep $id || return 1
+    done
+    ceph osd crush set-device-class ssd 0 1 || return 1 # should be idempotent
+
+    # Poll with increasing delays until the up set is exactly "0 1".
+    ok=false
+    for delay in 2 4 8 16 32 64 128 256 ; do
+        if [ "$(get_osds_up rbd SOMETHING_ELSE)" != "0 1" ] ; then
+            sleep $delay
+            ceph osd crush dump
+            ceph osd dump # for debugging purposes
+            ceph pg dump # for debugging purposes
+            continue
+        fi
+        ok=true
+        break
+    done
+    $ok || return 1
+}
+
+function TEST_mon_classes() {
+ # Exercise the "ceph osd crush" device class commands: set-device-class,
+ # rm-device-class, class ls, class rm, move and rule create-replicated.
+ # Throughout, "cmd | grep -q X || return 1" requires X to be present
+ # and "cmd | grep -q X && return 1" requires X to be absent.
+ local dir=$1
+
+ run_mon $dir a || return 1
+ run_osd $dir 0 || return 1
+ run_osd $dir 1 || return 1
+ run_osd $dir 2 || return 1
+ create_rbd_pool || return 1
+
+ test "$(get_osds_up rbd SOMETHING)" == "1 2 0" || return 1
+ add_something $dir SOMETHING || return 1
+
+ # test rm-device-class
+ # Give each OSD its own class; the class name and the matching ~class
+ # shadow entry must show up.
+ ceph osd crush set-device-class aaa osd.0 || return 1
+ ceph osd tree | grep -q 'aaa' || return 1
+ ceph osd crush dump | grep -q '~aaa' || return 1
+ ceph osd crush tree --show-shadow | grep -q '~aaa' || return 1
+ ceph osd crush set-device-class bbb osd.1 || return 1
+ ceph osd tree | grep -q 'bbb' || return 1
+ ceph osd crush dump | grep -q '~bbb' || return 1
+ ceph osd crush tree --show-shadow | grep -q '~bbb' || return 1
+ ceph osd crush set-device-class ccc osd.2 || return 1
+ ceph osd tree | grep -q 'ccc' || return 1
+ ceph osd crush dump | grep -q '~ccc' || return 1
+ ceph osd crush tree --show-shadow | grep -q '~ccc' || return 1
+ # Remove the classes one OSD at a time; every trace must disappear,
+ # including from "class ls".
+ ceph osd crush rm-device-class 0 || return 1
+ ceph osd tree | grep -q 'aaa' && return 1
+ ceph osd crush dump | grep -q '~aaa' && return 1
+ ceph osd crush tree --show-shadow | grep -q '~aaa' && return 1
+ ceph osd crush class ls | grep -q 'aaa' && return 1
+ ceph osd crush rm-device-class 1 || return 1
+ ceph osd tree | grep -q 'bbb' && return 1
+ ceph osd crush dump | grep -q '~bbb' && return 1
+ ceph osd crush tree --show-shadow | grep -q '~bbb' && return 1
+ ceph osd crush class ls | grep -q 'bbb' && return 1
+ ceph osd crush rm-device-class 2 || return 1
+ ceph osd tree | grep -q 'ccc' && return 1
+ ceph osd crush dump | grep -q '~ccc' && return 1
+ ceph osd crush tree --show-shadow | grep -q '~ccc' && return 1
+ ceph osd crush class ls | grep -q 'ccc' && return 1
+ # Same checks using the "all" keyword instead of explicit ids.
+ ceph osd crush set-device-class asdf all || return 1
+ ceph osd tree | grep -q 'asdf' || return 1
+ ceph osd crush dump | grep -q '~asdf' || return 1
+ ceph osd crush tree --show-shadow | grep -q '~asdf' || return 1
+ ceph osd crush rm-device-class all || return 1
+ ceph osd tree | grep -q 'asdf' && return 1
+ ceph osd crush dump | grep -q '~asdf' && return 1
+ ceph osd crush tree --show-shadow | grep -q '~asdf' && return 1
+
+ # test 'class rm' automatically recycles shadow trees
+ ceph osd crush set-device-class asdf 0 1 2 || return 1
+ ceph osd tree | grep -q 'asdf' || return 1
+ ceph osd crush dump | grep -q '~asdf' || return 1
+ ceph osd crush tree --show-shadow | grep -q '~asdf' || return 1
+ ceph osd crush class ls | grep -q 'asdf' || return 1
+ ceph osd crush class rm asdf || return 1
+ ceph osd tree | grep -q 'asdf' && return 1
+ ceph osd crush dump | grep -q '~asdf' && return 1
+ ceph osd crush tree --show-shadow | grep -q '~asdf' && return 1
+ ceph osd crush class ls | grep -q 'asdf' && return 1
+
+ # Move a classed OSD under a new root/rack/host; the awk filter keeps
+ # the "ceph osd tree" line whose first field is 2 and second is abc.
+ ceph osd crush set-device-class abc osd.2 || return 1
+ ceph osd crush move osd.2 root=foo rack=foo-rack host=foo-host || return 1
+ out=`ceph osd tree |awk '$1 == 2 && $2 == "abc" {print $0}'`
+ if [ "$out" == "" ]; then
+ return 1
+ fi
+
+ # verify 'crush move' too
+ ceph osd crush dump | grep -q 'foo~abc' || return 1
+ ceph osd crush tree --show-shadow | grep -q 'foo~abc' || return 1
+ ceph osd crush dump | grep -q 'foo-rack~abc' || return 1
+ ceph osd crush tree --show-shadow | grep -q 'foo-rack~abc' || return 1
+ ceph osd crush dump | grep -q 'foo-host~abc' || return 1
+ ceph osd crush tree --show-shadow | grep -q 'foo-host~abc' || return 1
+ ceph osd crush rm-device-class osd.2 || return 1
+ ceph osd crush dump | grep -q 'foo~abc' && return 1
+ ceph osd crush tree --show-shadow | grep -q 'foo~abc' && return 1
+ ceph osd crush dump | grep -q 'foo-rack~abc' && return 1
+ ceph osd crush tree --show-shadow | grep -q 'foo-rack~abc' && return 1
+ ceph osd crush dump | grep -q 'foo-host~abc' && return 1
+ ceph osd crush tree --show-shadow | grep -q 'foo-host~abc' && return 1
+ # restore class, so we can continue to test create-replicated
+ ceph osd crush set-device-class abc osd.2 || return 1
+
+ ceph osd crush rule create-replicated foo-rule foo host abc || return 1
+
+ # test class_is_in_use
+ # "class rm" is expected to fail with EBUSY for as long as a rule
+ # still references the class.
+ ceph osd crush set-device-class hdd osd.0 || return 1
+ ceph osd crush set-device-class ssd osd.1 || return 1
+ ceph osd crush rule create-replicated foo-hdd1 default host hdd || return 1
+ ceph osd crush rule create-replicated foo-hdd2 default host hdd || return 1
+ ceph osd crush rule create-replicated foo-ssd default host ssd || return 1
+ expect_failure $dir EBUSY ceph osd crush class rm hdd || return 1
+ expect_failure $dir EBUSY ceph osd crush class rm ssd || return 1
+ ceph osd crush rule rm foo-hdd1 || return 1
+ expect_failure $dir EBUSY ceph osd crush class rm hdd || return 1 # still referenced by foo-hdd2
+ ceph osd crush rule rm foo-hdd2 || return 1
+ ceph osd crush rule rm foo-ssd || return 1
+ ceph osd crush class rm hdd || return 1
+ ceph osd crush class rm ssd || return 1
+ expect_failure $dir EBUSY ceph osd crush class rm abc || return 1 # still referenced by foo-rule
+ ceph osd crush rule rm foo-rule || return 1
+ ceph osd crush class rm abc || return 1
+
+ # test set-device-class implicitly change class
+ # i.e. assigning another class to an OSD that already has one is
+ # expected to fail with EBUSY.
+ ceph osd crush set-device-class hdd osd.0 || return 1
+ expect_failure $dir EBUSY ceph osd crush set-device-class nvme osd.0 || return 1
+}
+
+main crush-classes "$@"
+
+# Local Variables:
+# compile-command: "cd ../../../build ; ln -sf ../src/ceph-disk/ceph_disk/main.py bin/ceph-disk && make -j4 && ../src/test/crush/crush-classes.sh"
+# End:
--- /dev/null
+#!/bin/bash -x
+
+source $CEPH_ROOT/qa/standalone/ceph-helpers.sh
+
+# Select the erasure code plugin names to test from the machine
+# architecture:
+#   legacy_jerasure_plugins / legacy_shec_plugins  names expected to
+#                                                  trigger a warning
+#   plugins                                        names expected not to
+arch=$(uname -m)
+
+case $arch in
+    # a single bracket expression: "i[[3456]]86*" would require a literal
+    # "]" after the digit and therefore never matched i386 ... i686
+    i[3456]86*|x86_64*|amd64*)
+        legacy_jerasure_plugins=(jerasure_generic jerasure_sse3 jerasure_sse4)
+        legacy_shec_plugins=(shec_generic shec_sse3 shec_sse4)
+        plugins=(jerasure shec lrc isa)
+        ;;
+    aarch64*|arm*)
+        legacy_jerasure_plugins=(jerasure_generic jerasure_neon)
+        legacy_shec_plugins=(shec_generic shec_neon)
+        plugins=(jerasure shec lrc)
+        ;;
+    *)
+        echo "unsupported platform ${arch}."
+        # "return" is only valid in a function or a sourced file; when the
+        # script is executed it fails and execution would carry on, hence
+        # the fallback to exit.
+        return 1 2>/dev/null || exit 1
+        ;;
+esac
+
+function run() {
+    local dir=$1
+    shift
+
+    export CEPH_MON="127.0.0.1:17110" # git grep '\<17110\>' : there must be only one
+    export CEPH_ARGS
+    CEPH_ARGS+="--fsid=$(uuidgen) --auth-supported=none --mon-host=$CEPH_MON "
+
+    # The TEST_* functions handle setup and teardown by themselves.
+    local funcs=${@:-$(set | sed -n -e 's/^\(TEST_[0-9a-z_]*\) .*/\1/p')}
+    for func in $funcs ; do
+        if ! $func $dir ; then
+            return 1
+        fi
+    done
+}
+
+function TEST_preload_warning() {
+    local dir=$1
+    local warning
+
+    # Preloading a legacy plugin must leave a warning in both the monitor
+    # and the OSD log.
+    for plugin in ${legacy_jerasure_plugins[*]} ${legacy_shec_plugins[*]}; do
+        warning="WARNING: osd_erasure_code_plugins contains plugin ${plugin}"
+        setup $dir || return 1
+        run_mon $dir a --osd_erasure_code_plugins="${plugin}" || return 1
+        run_mgr $dir x || return 1
+        CEPH_ARGS='' ceph --admin-daemon $(get_asok_path mon.a) log flush || return 1
+        run_osd $dir 0 --osd_erasure_code_plugins="${plugin}" || return 1
+        CEPH_ARGS='' ceph --admin-daemon $(get_asok_path osd.0) log flush || return 1
+        if ! grep "$warning" $dir/mon.a.log ; then
+            return 1
+        fi
+        if ! grep "$warning" $dir/osd.0.log ; then
+            return 1
+        fi
+        teardown $dir || return 1
+    done
+    return 0
+}
+
+function TEST_preload_no_warning() {
+    local dir=$1
+    local warning="WARNING: osd_erasure_code_plugins contains plugin"
+
+    # Preloading a current plugin must not leave the warning in either log.
+    for plugin in ${plugins[*]}; do
+        setup $dir || return 1
+        run_mon $dir a --osd_erasure_code_plugins="${plugin}" || return 1
+        run_mgr $dir x || return 1
+        CEPH_ARGS='' ceph --admin-daemon $(get_asok_path mon.a) log flush || return 1
+        run_osd $dir 0 --osd_erasure_code_plugins="${plugin}" || return 1
+        CEPH_ARGS='' ceph --admin-daemon $(get_asok_path osd.0) log flush || return 1
+        if grep "$warning" $dir/mon.a.log ; then
+            return 1
+        fi
+        if grep "$warning" $dir/osd.0.log ; then
+            return 1
+        fi
+        teardown $dir || return 1
+    done
+
+    return 0
+}
+
+function TEST_preload_no_warning_default() {
+    local dir=$1
+    local warning="WARNING: osd_erasure_code_plugins"
+
+    # Daemons started without --osd_erasure_code_plugins must not log the
+    # warning.
+    setup $dir || return 1
+    run_mon $dir a || return 1
+    CEPH_ARGS='' ceph --admin-daemon $(get_asok_path mon.a) log flush || return 1
+    run_mgr $dir x || return 1
+    run_osd $dir 0 || return 1
+    CEPH_ARGS='' ceph --admin-daemon $(get_asok_path osd.0) log flush || return 1
+    if grep "$warning" $dir/mon.a.log ; then
+        return 1
+    fi
+    if grep "$warning" $dir/osd.0.log ; then
+        return 1
+    fi
+    teardown $dir || return 1
+
+    return 0
+}
+
+function TEST_ec_profile_warning() {
+    local dir=$1
+
+    setup $dir && run_mon $dir a && run_mgr $dir x || return 1
+    for id in 0 1 2 ; do
+        if ! run_osd $dir $id ; then
+            return 1
+        fi
+    done
+    create_rbd_pool && wait_for_clean || return 1
+
+    # Creating a profile with a legacy jerasure plugin must be reported in
+    # the monitor log ...
+    for plugin in ${legacy_jerasure_plugins[*]}; do
+        ceph osd erasure-code-profile set prof-${plugin} crush-failure-domain=osd technique=reed_sol_van plugin=${plugin} || return 1
+        CEPH_ARGS='' ceph --admin-daemon $(get_asok_path mon.a) log flush || return 1
+        if ! grep "WARNING: erasure coding profile prof-${plugin} uses plugin ${plugin}" $dir/mon.a.log ; then
+            return 1
+        fi
+    done
+
+    # ... and so must a profile with a legacy shec plugin.
+    for plugin in ${legacy_shec_plugins[*]}; do
+        ceph osd erasure-code-profile set prof-${plugin} crush-failure-domain=osd plugin=${plugin} || return 1
+        CEPH_ARGS='' ceph --admin-daemon $(get_asok_path mon.a) log flush || return 1
+        if ! grep "WARNING: erasure coding profile prof-${plugin} uses plugin ${plugin}" $dir/mon.a.log ; then
+            return 1
+        fi
+    done
+
+    teardown $dir || return 1
+}
+
+main test-erasure-code-plugins "$@"
--- /dev/null
+#!/bin/bash
+#
+# Copyright (C) 2014 Cloudwatt <libre.licensing@cloudwatt.com>
+# Copyright (C) 2014, 2015 Red Hat <contact@redhat.com>
+#
+# Author: Loic Dachary <loic@dachary.org>
+#
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU Library Public License as published by
+# the Free Software Foundation; either version 2, or (at your option)
+# any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU Library Public License for more details.
+#
+
+source $CEPH_ROOT/qa/standalone/ceph-helpers.sh
+
+function run() {
+    local dir=$1
+    shift
+
+    export CEPH_MON="127.0.0.1:7101" # git grep '\<7101\>' : there must be only one
+    export CEPH_ARGS
+    CEPH_ARGS+="--fsid=$(uuidgen) --auth-supported=none "
+    CEPH_ARGS+="--mon-host=$CEPH_MON --mon-osd-prime-pg-temp=false"
+
+    setup $dir && run_mon $dir a && run_mgr $dir x || return 1
+    # check that erasure code plugins are preloaded
+    CEPH_ARGS='' ceph --admin-daemon $(get_asok_path mon.a) log flush || return 1
+    if ! grep 'load: jerasure.*lrc' $dir/mon.a.log ; then
+        return 1
+    fi
+    for id in $(seq 0 10) ; do
+        if ! run_osd $dir $id ; then
+            return 1
+        fi
+    done
+    create_rbd_pool && wait_for_clean || return 1
+    # check that erasure code plugins are preloaded
+    CEPH_ARGS='' ceph --admin-daemon $(get_asok_path osd.0) log flush || return 1
+    if ! grep 'load: jerasure.*lrc' $dir/osd.0.log ; then
+        return 1
+    fi
+    create_erasure_coded_pool ecpool || return 1
+
+    # All the selected TEST_* functions share this single cluster.
+    local funcs=${@:-$(set | sed -n -e 's/^\(TEST_[0-9a-z_]*\) .*/\1/p')}
+    for func in $funcs ; do
+        if ! $func $dir ; then
+            return 1
+        fi
+    done
+
+    delete_pool ecpool && teardown $dir || return 1
+}
+
+function create_erasure_coded_pool() {
+    local poolname=$1
+
+    # Define the profile "myprofile", create the erasure coded pool that
+    # uses it, then wait for the cluster to be clean.
+    if ! ceph osd erasure-code-profile set myprofile crush-failure-domain=osd ; then
+        return 1
+    fi
+    if ! ceph osd pool create $poolname 12 12 erasure myprofile ; then
+        return 1
+    fi
+    if ! wait_for_clean ; then
+        return 1
+    fi
+}
+
+function delete_pool() {
+    local poolname=$1
+    # the delete command wants the pool name twice, plus the confirmation flag
+    local -a confirmation=($poolname $poolname --yes-i-really-really-mean-it)
+
+    ceph osd pool delete ${confirmation[*]}
+}
+
+function rados_put_get() {
+ # Store an object in *poolname*, read it back and compare, then repeat
+ # the read after marking out one of the OSDs it is mapped to.
+ #
+ # $1 dir       directory holding the ORIGINAL and COPY work files
+ # $2 poolname  pool to write to
+ # $3 objname   optional object name (defaults to SOMETHING)
+ local dir=$1
+ local poolname=$2
+ local objname=${3:-SOMETHING}
+
+
+ # Payload: four markers, each right-aligned in a 1024 character field.
+ for marker in AAA BBB CCCC DDDD ; do
+ printf "%*s" 1024 $marker
+ done > $dir/ORIGINAL
+
+ #
+ # get and put an object, compare they are equal
+ #
+ rados --pool $poolname put $objname $dir/ORIGINAL || return 1
+ rados --pool $poolname get $objname $dir/COPY || return 1
+ diff $dir/ORIGINAL $dir/COPY || return 1
+ rm $dir/COPY
+
+ #
+ # take out an OSD used to store the object and
+ # check the object can still be retrieved, which implies
+ # recovery
+ #
+ # NOTE(review): get_osds is defined elsewhere; it is presumably printing
+ # the ids of the OSDs the object maps to - confirm.
+ local -a initial_osds=($(get_osds $poolname $objname))
+ local last=$((${#initial_osds[@]} - 1))
+ ceph osd out ${initial_osds[$last]} || return 1
+ # the OSD marked out must no longer appear as a whole word in the list
+ ! get_osds $poolname $objname | grep '\<'${initial_osds[$last]}'\>' || return 1
+ rados --pool $poolname get $objname $dir/COPY || return 1
+ diff $dir/ORIGINAL $dir/COPY || return 1
+ ceph osd in ${initial_osds[$last]} || return 1
+
+ rm $dir/ORIGINAL
+}
+
+function rados_osds_out_in() {
+ # Store an object in *poolname*, mark out two of the OSDs it is mapped
+ # to, check the object is remapped and still readable, then mark the
+ # OSDs in again and check the original mapping is restored.
+ #
+ # $1 dir       directory holding the ORIGINAL and COPY work files
+ # $2 poolname  pool to write to
+ # $3 objname   optional object name (defaults to SOMETHING)
+ local dir=$1
+ local poolname=$2
+ local objname=${3:-SOMETHING}
+
+
+ # Payload: four markers, each right-aligned in a 1024 character field.
+ for marker in FFFF GGGG HHHH IIII ; do
+ printf "%*s" 1024 $marker
+ done > $dir/ORIGINAL
+
+ #
+ # get and put an object, compare they are equal
+ #
+ rados --pool $poolname put $objname $dir/ORIGINAL || return 1
+ rados --pool $poolname get $objname $dir/COPY || return 1
+ diff $dir/ORIGINAL $dir/COPY || return 1
+ rm $dir/COPY
+
+ #
+ # take out two OSDs used to store the object, wait for the cluster
+ # to be clean (i.e. all PG are clean and active) again which
+ # implies the PG have been moved to use the remaining OSDs. Check
+ # the object can still be retrieved.
+ #
+ wait_for_clean || return 1
+ # osds_list keeps the raw get_osds output for the final comparison,
+ # osds is the same output split into an array.
+ local osds_list=$(get_osds $poolname $objname)
+ local -a osds=($osds_list)
+ for osd in 0 1 ; do
+ ceph osd out ${osds[$osd]} || return 1
+ done
+ wait_for_clean || return 1
+ #
+ # verify the object is no longer mapped to the osds that are out
+ #
+ for osd in 0 1 ; do
+ ! get_osds $poolname $objname | grep '\<'${osds[$osd]}'\>' || return 1
+ done
+ rados --pool $poolname get $objname $dir/COPY || return 1
+ diff $dir/ORIGINAL $dir/COPY || return 1
+ #
+ # bring the osds back in, wait for the cluster
+ # to be clean (i.e. all PG are clean and active) again which
+ # implies the PG go back to using the same osds as before
+ #
+ for osd in 0 1 ; do
+ ceph osd in ${osds[$osd]} || return 1
+ done
+ wait_for_clean || return 1
+ test "$osds_list" = "$(get_osds $poolname $objname)" || return 1
+ rm $dir/ORIGINAL
+}
+
+function TEST_rados_put_get_lrc_advanced() {
+    local dir=$1
+    local poolname=pool-lrc-a
+    local profile=profile-lrc-a
+    # lrc profile described with the explicit mapping/layers syntax
+    local -a settings=(
+        plugin=lrc
+        mapping=DD_
+        'crush-steps=[ [ "chooseleaf", "osd", 0 ] ]'
+        'layers=[ [ "DDc", "" ] ]'
+    )
+
+    if ! ceph osd erasure-code-profile set $profile "${settings[@]}" ; then
+        return 1
+    fi
+    if ! ceph osd pool create $poolname 12 12 erasure $profile ; then
+        return 1
+    fi
+
+    if ! rados_put_get $dir $poolname ; then
+        return 1
+    fi
+
+    # clean up the pool and the profile
+    delete_pool $poolname
+    ceph osd erasure-code-profile rm $profile
+}
+
+function TEST_rados_put_get_lrc_kml() {
+    local dir=$1
+    local poolname=pool-lrc
+    local profile=profile-lrc
+    # lrc profile described with the k/m/l shorthand
+    local -a settings=(plugin=lrc k=4 m=2 l=3 crush-failure-domain=osd)
+
+    if ! ceph osd erasure-code-profile set $profile "${settings[@]}" ; then
+        return 1
+    fi
+    if ! ceph osd pool create $poolname 12 12 erasure $profile ; then
+        return 1
+    fi
+
+    if ! rados_put_get $dir $poolname ; then
+        return 1
+    fi
+
+    # clean up the pool and the profile
+    delete_pool $poolname
+    ceph osd erasure-code-profile rm $profile
+}
+
+function TEST_rados_put_get_isa() {
+    # Run rados_put_get against a pool backed by the isa plugin. The test
+    # is skipped (and reported as successful) when the plugin is missing.
+    if ! erasure_code_plugin_exists isa ; then
+        echo "SKIP because plugin isa has not been built"
+        return 0
+    fi
+    local dir=$1
+    local poolname=pool-isa
+    local profile=profile-isa
+
+    ceph osd erasure-code-profile set $profile \
+        plugin=isa \
+        crush-failure-domain=osd || return 1
+    ceph osd pool create $poolname 1 1 erasure $profile \
+        || return 1
+
+    rados_put_get $dir $poolname || return 1
+
+    delete_pool $poolname
+    # remove the profile as well, as the sibling tests do, so that it is
+    # not left behind in the cluster shared with the other tests
+    ceph osd erasure-code-profile rm $profile
+}
+
+# Exercise the jerasure plugin: first round-trip an object through the
+# pool named "ecpool" (created outside this function), then create a
+# dedicated k=4 m=2 jerasure pool and run both rados_put_get and
+# rados_osds_out_in against it.
+#   $1 - test directory, forwarded to the helpers
+# Returns 1 on the first failing step; otherwise the status of the final
+# profile removal.
+function TEST_rados_put_get_jerasure() {
+ local dir=$1
+
+ rados_put_get $dir ecpool || return 1
+
+ local poolname=pool-jerasure
+ local profile=profile-jerasure
+
+ ceph osd erasure-code-profile set $profile \
+ plugin=jerasure \
+ k=4 m=2 \
+ crush-failure-domain=osd || return 1
+ ceph osd pool create $poolname 12 12 erasure $profile \
+ || return 1
+
+ rados_put_get $dir $poolname || return 1
+ rados_osds_out_in $dir $poolname || return 1
+
+ # cleanup: the status of delete_pool is not checked
+ delete_pool $poolname
+ ceph osd erasure-code-profile rm $profile
+}
+
+# Round-trip an object through an erasure-coded pool backed by the shec
+# plugin (k=2 m=1 c=1, failure domain osd), then remove pool and profile.
+#   $1 - test directory, forwarded to rados_put_get
+# Returns 1 on the first failing step; otherwise the status of the final
+# profile removal.
+function TEST_rados_put_get_shec() {
+ local dir=$1
+
+ local poolname=pool-shec
+ local profile=profile-shec
+
+ ceph osd erasure-code-profile set $profile \
+ plugin=shec \
+ k=2 m=1 c=1 \
+ crush-failure-domain=osd || return 1
+ ceph osd pool create $poolname 12 12 erasure $profile \
+ || return 1
+
+ rados_put_get $dir $poolname || return 1
+
+ # cleanup: the status of delete_pool is not checked
+ delete_pool $poolname
+ ceph osd erasure-code-profile rm $profile
+}
+
+# Write an object to "ecpool" using a rados block size that is deliberately
+# one byte short of stripe_unit * k, and require the put to succeed.
+#   $1 - test directory (optional; when omitted the caller's $dir, visible
+#        through bash dynamic scoping, is used as before)
+# Returns 1 if the put fails; otherwise the status of the final rm.
+function TEST_alignment_constraints() {
+    # declare dir locally instead of silently depending on the caller's variable
+    local dir=${1:-$dir}
+    #
+    # Verify that the rados command enforces alignment constraints
+    # imposed by the stripe width
+    # See http://tracker.ceph.com/issues/8622
+    #
+    local stripe_unit=$(ceph-conf --show-config-value osd_pool_erasure_code_stripe_unit)
+    # imports k=<value> from the profile named myprofile as a local variable
+    eval local $(ceph osd erasure-code-profile get myprofile | grep k=)
+    # one byte less than a full stripe, i.e. not aligned on the stripe width
+    local block_size=$((stripe_unit * k - 1))
+    # dd creates (and truncates) the payload file: two unaligned blocks of zeros
+    dd if=/dev/zero of=$dir/ORIGINAL bs=$block_size count=2
+    rados --block-size=$block_size \
+        --pool ecpool put UNALIGNED $dir/ORIGINAL || return 1
+    rm $dir/ORIGINAL
+}
+
+# Print the value ceph-conf reports for osd_pool_erasure_code_stripe_unit.
+# The exit status is that of echo, not of ceph-conf.
+function chunk_size() {
+    local stripe_unit
+    stripe_unit=$(ceph-conf --show-config-value osd_pool_erasure_code_stripe_unit)
+    echo $stripe_unit
+}
+
+#
+# By default an object will be split in two (k=2) with the first part
+# of the object in the first OSD of the up set and the second part in
+# the next OSD in the up set. This layout is defined by the mapping
+# parameter and this function helps verify that the first and second
+# part of the object are located in the OSD where they should be.
+#
+#   $1 - test directory (ORIGINAL/COPY scratch files live here)
+#   $2 - pool name
+#   $3 - index, in the list printed by get_osds, of the OSD expected to hold
+#        the first part of the object
+#   $4 - index of the OSD expected to hold the second part
+# Returns 1 if the put/get round trip fails or a marker is not found.
+function verify_chunk_mapping() {
+ local dir=$1
+ local poolname=$2
+ local first=$3
+ local second=$4
+
+ # two fields, each padded to chunk_size characters, carrying a distinct
+ # marker string so each part can be searched for afterwards
+ local payload=$(printf '%*s' $(chunk_size) FIRST$poolname ; printf '%*s' $(chunk_size) SECOND$poolname)
+ echo -n "$payload" > $dir/ORIGINAL
+
+ rados --pool $poolname put SOMETHING$poolname $dir/ORIGINAL || return 1
+ rados --pool $poolname get SOMETHING$poolname $dir/COPY || return 1
+ local -a osds=($(get_osds $poolname SOMETHING$poolname))
+ # ask every OSD holding the object to flush its journal before grepping
+ # (status not checked)
+ for (( i = 0; i < ${#osds[@]}; i++ )) ; do
+ ceph daemon osd.${osds[$i]} flush_journal
+ done
+ diff $dir/ORIGINAL $dir/COPY || return 1
+ rm $dir/COPY
+
+ local -a osds=($(get_osds $poolname SOMETHING$poolname))
+ # search $dir/<osd id> recursively for each marker
+ # NOTE(review): presumably $dir/<osd id> is that OSD's data directory - confirm
+ grep --quiet --recursive --text FIRST$poolname $dir/${osds[$first]} || return 1
+ grep --quiet --recursive --text SECOND$poolname $dir/${osds[$second]} || return 1
+}
+
+# Check where the two data parts of an object land, first for the pool
+# named "ecpool" (expected at positions 0 and 1) and then for a pool built
+# from an lrc profile with mapping=_DD (expected at positions 1 and 2).
+#   $1 - test directory, forwarded to verify_chunk_mapping
+# Returns 1 on the first failing step; otherwise the status of the final
+# profile removal.
+function TEST_chunk_mapping() {
+ local dir=$1
+
+ #
+ # mapping=DD_ is the default:
+ # first OSD (i.e. 0) in the up set has the first part of the object
+ # second OSD (i.e. 1) in the up set has the second part of the object
+ #
+ verify_chunk_mapping $dir ecpool 0 1 || return 1
+
+ ceph osd erasure-code-profile set remap-profile \
+ plugin=lrc \
+ layers='[ [ "_DD", "" ] ]' \
+ mapping='_DD' \
+ crush-steps='[ [ "choose", "osd", 0 ] ]' || return 1
+ # print the profile; output only, the status is not checked
+ ceph osd erasure-code-profile get remap-profile
+ ceph osd pool create remap-pool 12 12 erasure remap-profile \
+ || return 1
+
+ #
+ # mapping=_DD
+ # second OSD (i.e. 1) in the up set has the first part of the object
+ # third OSD (i.e. 2) in the up set has the second part of the object
+ #
+ verify_chunk_mapping $dir remap-pool 1 2 || return 1
+
+ # cleanup: the status of delete_pool is not checked
+ delete_pool remap-pool
+ ceph osd erasure-code-profile rm remap-profile
+}
+
+main test-erasure-code "$@"
+
+# Local Variables:
+# compile-command: "cd ../.. ; make -j4 && test/erasure-code/test-erasure-code.sh"
+# End:
--- /dev/null
+#!/bin/bash
+#
+# Copyright (C) 2015 Red Hat <contact@redhat.com>
+#
+#
+# Author: Kefu Chai <kchai@redhat.com>
+#
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU Library Public License as published by
+# the Free Software Foundation; either version 2, or (at your option)
+# any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU Library Public License for more details.
+#
+
+source $CEPH_ROOT/qa/standalone/ceph-helpers.sh
+
+# Entry point of the erasure-eio test script.
+#   $1     - test directory
+#   $2...  - optional list of test functions to run; when empty, every
+#            function whose name matches TEST_[0-9a-z_]* is run
+# For each test a fresh environment is created (setup, one mon, one mgr,
+# the rbd pool), the mon log is checked for the erasure code plugin load
+# line, the test runs, and the environment is torn down.
+# Returns 1 on the first failing step.
+function run() {
+ local dir=$1
+ shift
+
+ export CEPH_MON="127.0.0.1:7112" # git grep '\<7112\>' : there must be only one
+ export CEPH_ARGS
+ CEPH_ARGS+="--fsid=$(uuidgen) --auth-supported=none "
+ CEPH_ARGS+="--mon-host=$CEPH_MON "
+
+ # the remaining arguments, or all TEST_* functions listed by `set`
+ local funcs=${@:-$(set | sed -n -e 's/^\(TEST_[0-9a-z_]*\) .*/\1/p')}
+ for func in $funcs ; do
+ setup $dir || return 1
+ run_mon $dir a || return 1
+ run_mgr $dir x || return 1
+ create_rbd_pool || return 1
+
+ # check that erasure code plugins are preloaded
+ # (CEPH_ARGS is emptied for the admin socket call)
+ CEPH_ARGS='' ceph --admin-daemon $(get_asok_path mon.a) log flush || return 1
+ grep 'load: jerasure.*lrc' $dir/mon.a.log || return 1
+ $func $dir || return 1
+ teardown $dir || return 1
+ done
+}
+
+# Start OSDs 0 through 3, wait for the cluster to be clean, and verify in
+# the osd.0 log that the erasure code plugins were loaded.
+# Takes no arguments: $dir is not declared here and is read from the
+# calling function's scope.
+# Returns 1 on the first failing step.
+function setup_osds() {
+ for id in $(seq 0 3) ; do
+ run_osd $dir $id || return 1
+ done
+ wait_for_clean || return 1
+
+ # check that erasure code plugins are preloaded
+ CEPH_ARGS='' ceph --admin-daemon $(get_asok_path osd.0) log flush || return 1
+ grep 'load: jerasure.*lrc' $dir/osd.0.log || return 1
+}
+
+# Define the jerasure profile "myprofile" (k=2 m=1, failure domain osd),
+# create an erasure-coded pool from it and wait for the cluster to be clean.
+#   $1 - name of the pool to create
+# Returns 1 on the first failing step.
+function create_erasure_coded_pool() {
+ local poolname=$1
+
+ ceph osd erasure-code-profile set myprofile \
+ plugin=jerasure \
+ k=2 m=1 \
+ crush-failure-domain=osd || return 1
+ ceph osd pool create $poolname 1 1 erasure myprofile \
+ || return 1
+ wait_for_clean || return 1
+}
+
+# Delete the given pool, then remove the "myprofile" erasure code profile.
+#   $1 - name of the pool to delete
+# The pool deletion status is not checked; the function returns the status
+# of the profile removal.
+function delete_pool() {
+    local pool=$1
+
+    ceph osd pool delete $pool $pool \
+        --yes-i-really-really-mean-it
+    ceph osd erasure-code-profile rm myprofile
+}
+
+# Generate the reference payload $dir/ORIGINAL and store it as an object.
+#   $1 - test directory
+#   $2 - pool name
+#   $3 - object name (default: SOMETHING)
+# Returns 1 if the rados put fails.
+function rados_put() {
+ local dir=$1
+ local poolname=$2
+ local objname=${3:-SOMETHING}
+
+ # four fields, each padded to 1024 characters and ending with a marker
+ for marker in AAA BBB CCCC DDDD ; do
+ printf "%*s" 1024 $marker
+ done > $dir/ORIGINAL
+ #
+ # put the object; the read back and comparison are done by rados_get
+ #
+ rados --pool $poolname put $objname $dir/ORIGINAL || return 1
+}
+
+# Read an object back and either compare it with $dir/ORIGINAL or, when a
+# failure is expected, require the read itself to fail.
+#   $1 - test directory
+#   $2 - pool name
+#   $3 - object name (default: SOMETHING)
+#   $4 - "fail" to expect the get to fail (default: ok)
+# In "fail" mode the status is the negated status of rados get. Otherwise
+# returns 1 if the get or the comparison fails, else the status of rm.
+function rados_get() {
+    local dir=$1
+    local poolname=$2
+    local objname=${3:-SOMETHING}
+    local expect=${4:-ok}
+    local copy=$dir/COPY
+
+    if [ $expect = "fail" ] ; then
+        # the caller expects the read to be rejected
+        ! rados --pool $poolname get $objname $copy
+    else
+        # normal path: fetch the object and compare it with the reference
+        rados --pool $poolname get $objname $copy || return 1
+        diff $dir/ORIGINAL $copy || return 1
+        rm $copy
+    fi
+}
+
+# Put an object, read it back and compare; optionally also kill and revive
+# the last OSD holding the object and wait for the cluster to be clean.
+#   $1 - test directory
+#   $2 - pool name
+#   $3 - object name (default: SOMETHING)
+#   $4 - when non-empty, run the kill/out/in/restart recovery sequence
+# Returns 1 on the first failing step; otherwise the status of the final rm.
+function rados_put_get() {
+ local dir=$1
+ local poolname=$2
+ local objname=${3:-SOMETHING}
+ local recovery=$4
+
+ #
+ # get and put an object, compare they are equal
+ #
+ rados_put $dir $poolname $objname || return 1
+ # We can read even though caller injected read error on one of the shards
+ rados_get $dir $poolname $objname || return 1
+
+ if [ -n "$recovery" ];
+ then
+ #
+ # take out the last OSD used to store the object,
+ # bring it back, and check for clean PGs which means
+ # recovery didn't crash the primary.
+ #
+ local -a initial_osds=($(get_osds $poolname $objname))
+ local last=$((${#initial_osds[@]} - 1))
+ # Kill OSD
+ kill_daemons $dir TERM osd.${initial_osds[$last]} >&2 < /dev/null || return 1
+ ceph osd out ${initial_osds[$last]} || return 1
+ # the object must no longer be mapped to the OSD that was taken out
+ ! get_osds $poolname $objname | grep '\<'${initial_osds[$last]}'\>' || return 1
+ ceph osd in ${initial_osds[$last]} || return 1
+ run_osd $dir ${initial_osds[$last]} || return 1
+ wait_for_clean || return 1
+ fi
+
+ rm $dir/ORIGINAL
+}
+
+# Make one shard of an object in pool-jerasure return a read error.
+#   $1 - object name
+#   $2 - test directory (stored in a local but not otherwise used here)
+#   $3 - shard id, used as index into the list printed by get_osds
+# Enables filestore_debug_inject_read_err on the OSD at that index and then
+# issues injectdataerr through that OSD's admin socket.
+# Returns 1 if either step fails.
+function inject_eio() {
+ local objname=$1
+ shift
+ local dir=$1
+ shift
+ local shard_id=$1
+ shift
+
+ # the pool name is fixed; callers create pool-jerasure beforehand
+ local poolname=pool-jerasure
+ local -a initial_osds=($(get_osds $poolname $objname))
+ local osd_id=${initial_osds[$shard_id]}
+ set_config osd $osd_id filestore_debug_inject_read_err true || return 1
+ CEPH_ARGS='' ceph --admin-daemon $(get_asok_path osd.$osd_id) \
+ injectdataerr $poolname $objname $shard_id || return 1
+}
+
+# Inject a read error on one shard and check the object is still readable,
+# then inject on the next shard as well and check the read fails.
+#   $1 - test directory
+#   $2 - id of the first shard to inject the error on
+#   $3 - optional; when non-empty it is forwarded to rados_put_get to
+#        trigger its recovery sequence
+# Returns 1 on the first failing step.
+function rados_get_data_eio() {
+ local dir=$1
+ shift
+ local shard_id=$1
+ shift
+ local recovery=$1
+ shift
+
+ # inject eio to specified shard
+ #
+ local poolname=pool-jerasure
+ # $$ (the shell pid) and the shard id keep the object name distinct
+ local objname=obj-eio-$$-$shard_id
+ inject_eio $objname $dir $shard_id || return 1
+ rados_put_get $dir $poolname $objname $recovery || return 1
+
+ shard_id=$(expr $shard_id + 1)
+ inject_eio $objname $dir $shard_id || return 1
+ # Now 2 out of 3 shards get EIO, so should fail
+ rados_get $dir $poolname $objname fail || return 1
+}
+
+# Change the size of the specified shard
+#
+#   $1 - object name
+#   $2 - test directory ($dir/CORRUPT is used as scratch file)
+#   $3 - shard id, used as index into the list printed by get_osds
+#   $4 - number of bytes
+#   $5 - mode: "add" appends $4 random bytes to the current shard content;
+#        any other value replaces the content with $4 random bytes
+#        (an empty file when $4 is 0)
+# noout is set for the duration of the change. Returns 1 if an
+# objectstore_tool call fails; in that case noout is left set and
+# $dir/CORRUPT is not removed.
+function set_size() {
+ local objname=$1
+ shift
+ local dir=$1
+ shift
+ local shard_id=$1
+ shift
+ local bytes=$1
+ shift
+ local mode=${1}
+
+ local poolname=pool-jerasure
+ local -a initial_osds=($(get_osds $poolname $objname))
+ local osd_id=${initial_osds[$shard_id]}
+ ceph osd set noout
+ if [ "$mode" = "add" ];
+ then
+ # start from the shard's current bytes and append random data
+ objectstore_tool $dir $osd_id $objname get-bytes $dir/CORRUPT || return 1
+ dd if=/dev/urandom bs=$bytes count=1 >> $dir/CORRUPT
+ elif [ "$bytes" = "0" ];
+ then
+ # zero-length replacement
+ touch $dir/CORRUPT
+ else
+ # replacement made of exactly $bytes random bytes
+ dd if=/dev/urandom bs=$bytes count=1 of=$dir/CORRUPT
+ fi
+ objectstore_tool $dir $osd_id $objname set-bytes $dir/CORRUPT || return 1
+ rm -f $dir/CORRUPT
+ ceph osd unset noout
+}
+
+# Store an object, corrupt the size of one shard and check the object is
+# still readable, then corrupt the next shard too and check the read fails.
+#   $1 - test directory
+#   $2 - id of the first shard to modify
+#   $3 - number of bytes, forwarded to set_size
+#   $4 - mode, forwarded to set_size (default: set)
+# Returns 1 on the first failing step.
+function rados_get_data_bad_size() {
+ local dir=$1
+ shift
+ local shard_id=$1
+ shift
+ local bytes=$1
+ shift
+ local mode=${1:-set}
+
+ local poolname=pool-jerasure
+ # $$ (the shell pid), shard id and byte count keep the object name distinct
+ local objname=obj-size-$$-$shard_id-$bytes
+ rados_put $dir $poolname $objname || return 1
+
+ # Change the size of specified shard
+ #
+ set_size $objname $dir $shard_id $bytes $mode || return 1
+
+ rados_get $dir $poolname $objname || return 1
+
+ # Leave objname and modify another shard
+ shard_id=$(expr $shard_id + 1)
+ set_size $objname $dir $shard_id $bytes $mode || return 1
+ rados_get $dir $poolname $objname fail || return 1
+}
+
+#
+# These two test cases try to validate the following behavior:
+# For object on EC pool, if there is one shard having read error (
+# either primary or replica), client can still read object.
+#
+# If 2 shards have read errors the client will get an error.
+#
+# Run the read-error scenario of rados_get_data_eio starting at shard 0.
+#   $1 - test directory
+# Returns 1 on the first failing step; otherwise the status of delete_pool.
+function TEST_rados_get_subread_eio_shard_0() {
+ local dir=$1
+ setup_osds || return 1
+
+ local poolname=pool-jerasure
+ create_erasure_coded_pool $poolname || return 1
+ # inject eio on primary OSD (0) and replica OSD (1)
+ local shard_id=0
+ rados_get_data_eio $dir $shard_id || return 1
+ delete_pool $poolname
+}
+
+# Run the read-error scenario of rados_get_data_eio starting at shard 1.
+#   $1 - test directory
+# Returns 1 on the first failing step; otherwise the status of delete_pool.
+function TEST_rados_get_subread_eio_shard_1() {
+ local dir=$1
+ setup_osds || return 1
+
+ local poolname=pool-jerasure
+ create_erasure_coded_pool $poolname || return 1
+ # inject eio into replicas OSD (1) and OSD (2)
+ local shard_id=1
+ rados_get_data_eio $dir $shard_id || return 1
+ delete_pool $poolname
+}
+
+#
+# These two test cases try to validate the following behavior:
+# For an object on an EC pool, if one shard has an incorrect size,
+# this causes an internal read error but the client can still read the object.
+#
+# If 2 shards have incorrect size the client will get an error.
+#
+# Run the bad-shard-size scenario starting at shard 0 three times: with a
+# 10 byte replacement, an empty replacement, and 256 bytes appended.
+#   $1 - test directory
+# Returns 1 on the first failing step; otherwise the status of delete_pool.
+function TEST_rados_get_bad_size_shard_0() {
+ local dir=$1
+ setup_osds || return 1
+
+ local poolname=pool-jerasure
+ create_erasure_coded_pool $poolname || return 1
+ # Set incorrect size into primary OSD (0) and replica OSD (1)
+ local shard_id=0
+ rados_get_data_bad_size $dir $shard_id 10 || return 1
+ rados_get_data_bad_size $dir $shard_id 0 || return 1
+ rados_get_data_bad_size $dir $shard_id 256 add || return 1
+ delete_pool $poolname
+}
+
+# Run the bad-shard-size scenario starting at shard 1 three times: with a
+# 10 byte replacement, an empty replacement, and 256 bytes appended.
+#   $1 - test directory
+# Returns 1 on the first failing step; otherwise the status of delete_pool.
+function TEST_rados_get_bad_size_shard_1() {
+ local dir=$1
+ setup_osds || return 1
+
+ local poolname=pool-jerasure
+ create_erasure_coded_pool $poolname || return 1
+ # Set incorrect size into replicas OSD (1) and OSD (2)
+ local shard_id=1
+ rados_get_data_bad_size $dir $shard_id 10 || return 1
+ rados_get_data_bad_size $dir $shard_id 0 || return 1
+ rados_get_data_bad_size $dir $shard_id 256 add || return 1
+ delete_pool $poolname
+}
+
+# Run the read-error scenario of rados_get_data_eio starting at shard 0,
+# with the recovery sequence of rados_put_get enabled.
+#   $1 - test directory
+# Returns 1 on the first failing step; otherwise the status of delete_pool.
+function TEST_rados_get_with_subreadall_eio_shard_0() {
+    local dir=$1
+
+    setup_osds || return 1
+
+    local poolname=pool-jerasure
+    create_erasure_coded_pool $poolname || return 1
+    # inject eio on primary OSD (0)
+    local shard_id=0
+    rados_get_data_eio $dir $shard_id recovery || return 1
+
+    delete_pool $poolname
+}
+
+# Run the read-error scenario of rados_get_data_eio starting at shard 1,
+# with the recovery sequence of rados_put_get enabled.
+#   $1 - test directory
+# Returns 1 on the first failing step; otherwise the status of delete_pool.
+function TEST_rados_get_with_subreadall_eio_shard_1() {
+    local dir=$1
+
+    setup_osds || return 1
+
+    local poolname=pool-jerasure
+    create_erasure_coded_pool $poolname || return 1
+    # inject eio on replica OSD (1)
+    local shard_id=1
+    rados_get_data_eio $dir $shard_id recovery || return 1
+
+    delete_pool $poolname
+}
+
+main test-erasure-eio "$@"
+
+# Local Variables:
+# compile-command: "cd ../.. ; make -j4 && test/erasure-code/test-erasure-eio.sh"
+# End:
--- /dev/null
+#!/bin/bash
+#
+# Copyright (C) 2014 Red Hat <contact@redhat.com>
+#
+# Author: Sebastien Ponce <sebastien.ponce@cern.ch>
+#
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU Library Public License as published by
+# the Free Software Foundation; either version 2, or (at your option)
+# any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU Library Public License for more details.
+#
+source $CEPH_ROOT/qa/standalone/ceph-helpers.sh
+
+# Entry point of the rados striper test.
+#   $1 - test directory
+# Starts one monitor and three OSDs, then exercises `rados --striper`
+# (put, stat, get, truncate, xattr handling, rm) on an object named
+# "toyfile" in the rbd pool, cross-checking each step against the plain
+# object toyfile.0000000000000000.
+# Returns 1 on the first failing step.
+function run() {
+    local dir=$1
+    shift
+
+    export CEPH_MON="127.0.0.1:7116" # git grep '\<7116\>' : there must be only one
+    export CEPH_ARGS
+    CEPH_ARGS+="--fsid=$(uuidgen) --auth-supported=none "
+    CEPH_ARGS+="--mon-host=$CEPH_MON "
+
+    # setup
+    setup $dir || return 1
+
+    # create a cluster with one monitor and three osds
+    run_mon $dir a || return 1
+    run_osd $dir 0 || return 1
+    run_osd $dir 1 || return 1
+    run_osd $dir 2 || return 1
+    create_rbd_pool || return 1
+
+    # create toyfile
+    dd if=/dev/urandom of=$dir/toyfile bs=1234 count=1 || return 1
+
+    # put a striped object
+    rados --pool rbd --striper put toyfile $dir/toyfile || return 1
+
+    # stat it, with and without striping
+    # (the status tested after each pipeline is that of cut; a failing stat
+    # is caught by the diff against the reference file below)
+    rados --pool rbd --striper stat toyfile | cut -d ',' -f 2 > $dir/stripedStat || return 1
+    rados --pool rbd stat toyfile.0000000000000000 | cut -d ',' -f 2 > $dir/stat || return 1
+    echo ' size 1234' > $dir/refstat
+    diff -w $dir/stripedStat $dir/refstat || return 1
+    diff -w $dir/stat $dir/refstat || return 1
+    # without --striper there is no object called toyfile
+    rados --pool rbd stat toyfile >& $dir/staterror
+    grep -q 'No such file or directory' $dir/staterror || return 1
+
+    # get the file back with and without striping
+    rados --pool rbd --striper get toyfile $dir/stripedGroup || return 1
+    diff -w $dir/toyfile $dir/stripedGroup || return 1
+    rados --pool rbd get toyfile.0000000000000000 $dir/nonSTripedGroup || return 1
+    diff -w $dir/toyfile $dir/nonSTripedGroup || return 1
+
+    # test truncate
+    # fail immediately if the truncate itself fails, like every other step
+    rados --pool rbd --striper truncate toyfile 12 || return 1
+    rados --pool rbd --striper stat toyfile | cut -d ',' -f 2 > $dir/stripedStat || return 1
+    rados --pool rbd stat toyfile.0000000000000000 | cut -d ',' -f 2 > $dir/stat || return 1
+    echo ' size 12' > $dir/reftrunc
+    diff -w $dir/stripedStat $dir/reftrunc || return 1
+    diff -w $dir/stat $dir/reftrunc || return 1
+
+    # test xattrs
+
+    rados --pool rbd --striper setxattr toyfile somexattr somevalue || return 1
+    rados --pool rbd --striper getxattr toyfile somexattr > $dir/xattrvalue || return 1
+    rados --pool rbd getxattr toyfile.0000000000000000 somexattr > $dir/xattrvalue2 || return 1
+    echo 'somevalue' > $dir/refvalue
+    diff -w $dir/xattrvalue $dir/refvalue || return 1
+    diff -w $dir/xattrvalue2 $dir/refvalue || return 1
+    rados --pool rbd --striper listxattr toyfile > $dir/xattrlist || return 1
+    echo 'somexattr' > $dir/reflist
+    diff -w $dir/xattrlist $dir/reflist || return 1
+    # the plain object also lists attributes containing "striper"; filter them out
+    rados --pool rbd listxattr toyfile.0000000000000000 | grep -v striper > $dir/xattrlist2 || return 1
+    diff -w $dir/xattrlist2 $dir/reflist || return 1
+    rados --pool rbd --striper rmxattr toyfile somexattr || return 1
+
+    # the error text for a missing attribute differs on FreeBSD
+    local attr_not_found_str="No data available"
+    [ `uname` = FreeBSD ] && \
+        attr_not_found_str="Attribute not found"
+    expect_failure $dir "$attr_not_found_str" \
+        rados --pool rbd --striper getxattr toyfile somexattr || return 1
+    expect_failure $dir "$attr_not_found_str" \
+        rados --pool rbd getxattr toyfile.0000000000000000 somexattr || return 1
+
+    # test rm
+    rados --pool rbd --striper rm toyfile || return 1
+    expect_failure $dir 'No such file or directory' \
+        rados --pool rbd --striper stat toyfile || return 1
+    expect_failure $dir 'No such file or directory' \
+        rados --pool rbd stat toyfile.0000000000000000 || return 1
+
+    # cleanup
+    teardown $dir || return 1
+}
+
+main rados-striper "$@"
--- /dev/null
+#!/bin/bash
+#
+# Copyright (C) 2013,2014 Cloudwatt <libre.licensing@cloudwatt.com>
+# Copyright (C) 2014 Red Hat <contact@redhat.com>
+# Copyright (C) 2014 Federico Gimenez <fgimenez@coit.es>
+#
+# Author: Loic Dachary <loic@dachary.org>
+# Author: Federico Gimenez <fgimenez@coit.es>
+#
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU Library Public License as published by
+# the Free Software Foundation; either version 2, or (at your option)
+# any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU Library Public License for more details.
+#
+
+$CEPH_ROOT/qa/standalone/ceph-helpers.sh TESTS
--- /dev/null
+#!/bin/bash
+#
+# Copyright (C) 2014 Cloudwatt <libre.licensing@cloudwatt.com>
+# Copyright (C) 2014, 2015 Red Hat <contact@redhat.com>
+#
+# Author: Loic Dachary <loic@dachary.org>
+#
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU Library Public License as published by
+# the Free Software Foundation; either version 2, or (at your option)
+# any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU Library Public License for more details.
+#
+source $CEPH_ROOT/qa/standalone/ceph-helpers.sh
+
+# Entry point of the misc monitor test script.
+#   $1     - test directory
+#   $2...  - optional list of test functions to run; when empty, every
+#            function whose name matches TEST_[0-9a-z_]* is run
+# Unlike some sibling scripts, setup/teardown is left to each test.
+# Returns 1 as soon as a test function fails.
+function run() {
+ local dir=$1
+ shift
+
+ export CEPH_MON="127.0.0.1:7102" # git grep '\<7102\>' : there must be only one
+ export CEPH_ARGS
+ CEPH_ARGS+="--fsid=$(uuidgen) --auth-supported=none "
+ CEPH_ARGS+="--mon-host=$CEPH_MON "
+
+ # the remaining arguments, or all TEST_* functions listed by `set`
+ local funcs=${@:-$(set | sed -n -e 's/^\(TEST_[0-9a-z_]*\) .*/\1/p')}
+ for func in $funcs ; do
+ $func $dir || return 1
+ done
+}
+
+TEST_POOL=rbd
+
+# Exercise `ceph osd pool set` on $TEST_POOL and on an erasure-coded pool:
+# boolean flags, scrub interval options, and the accepted ranges of size
+# and min_size.
+#   $1 - test directory
+# Returns 1 on the first failing check.
+function TEST_osd_pool_get_set() {
+    local dir=$1
+
+    setup $dir || return 1
+    run_mon $dir a || return 1
+    create_rbd_pool || return 1
+    ceph osd pool create $TEST_POOL 8
+
+    # each flag must appear in `ceph osd dump` only while it is set, and
+    # must accept 0/1 as well as false/true
+    local flag
+    for flag in nodelete nopgchange nosizechange write_fadvise_dontneed noscrub nodeep-scrub; do
+        ceph osd pool set $TEST_POOL $flag 0 || return 1
+        ! ceph osd dump | grep 'pool ' | grep $flag || return 1
+        ceph osd pool set $TEST_POOL $flag 1 || return 1
+        ceph osd dump | grep 'pool ' | grep $flag || return 1
+        ceph osd pool set $TEST_POOL $flag false || return 1
+        ! ceph osd dump | grep 'pool ' | grep $flag || return 1
+        ceph osd pool set $TEST_POOL $flag false || return 1
+        # check that setting false twice does not toggle to true (bug)
+        ! ceph osd dump | grep 'pool ' | grep $flag || return 1
+        ceph osd pool set $TEST_POOL $flag true || return 1
+        ceph osd dump | grep 'pool ' | grep $flag || return 1
+        # cleanup
+        ceph osd pool set $TEST_POOL $flag 0 || return 1
+    done
+
+    local size=$(ceph osd pool get $TEST_POOL size|awk '{print $2}')
+    local min_size=$(ceph osd pool get $TEST_POOL min_size|awk '{print $2}')
+
+    # an interval shows up in the dump when set and disappears when reset to 0
+    ceph osd pool set $TEST_POOL scrub_min_interval 123456 || return 1
+    ceph osd dump | grep 'pool ' | grep 'scrub_min_interval 123456' || return 1
+    ceph osd pool set $TEST_POOL scrub_min_interval 0 || return 1
+    ceph osd dump | grep 'pool ' | grep 'scrub_min_interval' && return 1
+    ceph osd pool set $TEST_POOL scrub_max_interval 123456 || return 1
+    ceph osd dump | grep 'pool ' | grep 'scrub_max_interval 123456' || return 1
+    ceph osd pool set $TEST_POOL scrub_max_interval 0 || return 1
+    ceph osd dump | grep 'pool ' | grep 'scrub_max_interval' && return 1
+    ceph osd pool set $TEST_POOL deep_scrub_interval 123456 || return 1
+    ceph osd dump | grep 'pool ' | grep 'deep_scrub_interval 123456' || return 1
+    ceph osd pool set $TEST_POOL deep_scrub_interval 0 || return 1
+    ceph osd dump | grep 'pool ' | grep 'deep_scrub_interval' && return 1
+
+    # replicated pool size is restricted to the range 1 to 10
+    # (the "size" keyword is required: without it the command is rejected
+    # as malformed and the range check is never exercised)
+    ! ceph osd pool set $TEST_POOL size 11 || return 1
+    # replicated pool min_size must be between 1 and size
+    ! ceph osd pool set $TEST_POOL min_size $(expr $size + 1) || return 1
+    ! ceph osd pool set $TEST_POOL min_size 0 || return 1
+
+    local ecpool=erasepool
+    ceph osd pool create $ecpool 12 12 erasure default || return 1
+    # erasure pool size=k+m, min_size=k
+    local size=$(ceph osd pool get $ecpool size|awk '{print $2}')
+    local min_size=$(ceph osd pool get $ecpool min_size|awk '{print $2}')
+    local k=$(expr $min_size - 1)  # default min_size=k+1
+    # erasure pool size can't change
+    ! ceph osd pool set $ecpool size $(expr $size + 1) || return 1
+    # erasure pool min_size must be between k and size
+    ceph osd pool set $ecpool min_size $(expr $k + 1) || return 1
+    ! ceph osd pool set $ecpool min_size $(expr $k - 1) || return 1
+    ! ceph osd pool set $ecpool min_size $(expr $size + 1) || return 1
+
+    teardown $dir || return 1
+}
+
+# Start a single-monitor cluster (mon.a), start a second monitor (mon.b)
+# next to it, and check that mon.a survives, that the monmap ends up with
+# two monitors and that `ceph mon add b` completes.
+#   $1 - test directory
+# CEPH_ARGS is replaced for the duration of the test and restored before
+# the final teardown. fsid, MONA, MONB and CEPH_ARGS_orig are not declared
+# local and therefore remain set after the function returns.
+# Returns 1 on the first failing step.
+function TEST_mon_add_to_single_mon() {
+    local dir=$1
+
+    fsid=$(uuidgen)
+    MONA=127.0.0.1:7117 # git grep '\<7117\>' : there must be only one
+    MONB=127.0.0.1:7118 # git grep '\<7118\>' : there must be only one
+    CEPH_ARGS_orig=$CEPH_ARGS
+    CEPH_ARGS="--fsid=$fsid --auth-supported=none "
+    CEPH_ARGS+="--mon-initial-members=a "
+    CEPH_ARGS+="--mon-host=$MONA "
+
+    setup $dir || return 1
+    run_mon $dir a --public-addr $MONA || return 1
+    # wait for the quorum
+    timeout 120 ceph -s > /dev/null || return 1
+    run_mon $dir b --public-addr $MONB || return 1
+    teardown $dir || return 1
+
+    setup $dir || return 1
+    run_mon $dir a --public-addr $MONA || return 1
+    # without the fix of #5454, mon.a will assert failure at seeing the MMonJoin
+    # from mon.b
+    run_mon $dir b --public-addr $MONB || return 1
+    # wait for the quorum
+    timeout 120 ceph -s > /dev/null || return 1
+    local num_mons
+    num_mons=$(ceph mon dump --format=json 2>/dev/null | jq ".mons | length") || return 1
+    [ $num_mons == 2 ] || return 1
+    # no reason to take more than 120 secs to get this submitted
+    timeout 120 ceph mon add b $MONB || return 1
+    # restore the arguments saved above, as the sibling tests do, so the
+    # next test does not inherit this test's fsid and mon-host
+    CEPH_ARGS=$CEPH_ARGS_orig
+    teardown $dir || return 1
+}
+
+# Start a cephx-enabled monitor, then run a client command with a keyring
+# holding a different client.admin key and make sure the client does not
+# exit with status 139 (segmentation fault).
+#   $1 - test directory
+# CEPH_ARGS is replaced during the test and restored before teardown; it
+# is not restored when the function returns early on status 139.
+function TEST_no_segfault_for_bad_keyring() {
+ local dir=$1
+ setup $dir || return 1
+ # create a client.admin key and add it to ceph.mon.keyring
+ ceph-authtool --create-keyring $dir/ceph.mon.keyring --gen-key -n mon. --cap mon 'allow *'
+ ceph-authtool --create-keyring $dir/ceph.client.admin.keyring --gen-key -n client.admin --cap mon 'allow *'
+ ceph-authtool $dir/ceph.mon.keyring --import-keyring $dir/ceph.client.admin.keyring
+ CEPH_ARGS_TMP="--fsid=$(uuidgen) --mon-host=127.0.0.1:7102 --auth-supported=cephx "
+ CEPH_ARGS_orig=$CEPH_ARGS
+ CEPH_ARGS="$CEPH_ARGS_TMP --keyring=$dir/ceph.mon.keyring "
+ # the status of run_mon is not checked here
+ run_mon $dir a
+ # create a bad keyring and make sure no segfault occurs when using the bad keyring
+ echo -e "[client.admin]\nkey = BQAUlgtWoFePIxAAQ9YLzJSVgJX5V1lh5gyctg==" > $dir/bad.keyring
+ CEPH_ARGS="$CEPH_ARGS_TMP --keyring=$dir/bad.keyring"
+ # any exit status other than 139 is accepted
+ ceph osd dump 2> /dev/null
+ # 139(11|128) means segfault and core dumped
+ [ $? -eq 139 ] && return 1
+ CEPH_ARGS=$CEPH_ARGS_orig
+ teardown $dir || return 1
+}
+
+# Check monitor feature reporting with a three-monitor monmap: first with
+# only two monitors running (no persistent features expected), then with
+# all three in quorum (kraken and luminous expected as persistent).
+#   $1 - test directory
+# CEPH_ARGS is replaced during the test and restored before teardown.
+# fsid, MONA, MONB, MONC, jqinput and jqfilter are not declared local.
+# Returns 1 on the first failing check.
+function TEST_mon_features() {
+ local dir=$1
+ setup $dir || return 1
+
+ fsid=$(uuidgen)
+ MONA=127.0.0.1:7127 # git grep '\<7127\>' ; there must be only one
+ MONB=127.0.0.1:7128 # git grep '\<7128\>' ; there must be only one
+ MONC=127.0.0.1:7129 # git grep '\<7129\>' ; there must be only one
+ CEPH_ARGS_orig=$CEPH_ARGS
+ CEPH_ARGS="--fsid=$fsid --auth-supported=none "
+ CEPH_ARGS+="--mon-initial-members=a,b,c "
+ CEPH_ARGS+="--mon-host=$MONA,$MONB,$MONC "
+ CEPH_ARGS+="--mon-debug-no-initial-persistent-features "
+ CEPH_ARGS+="--mon-debug-no-require-luminous "
+
+ # start only two of the three monitors listed in the monmap
+ run_mon $dir a --public-addr $MONA || return 1
+ run_mon $dir b --public-addr $MONB || return 1
+ timeout 120 ceph -s > /dev/null || return 1
+
+ # expect monmap to contain 3 monitors (a, b, and c)
+ jqinput="$(ceph mon_status --format=json 2>/dev/null)"
+ jq_success "$jqinput" '.monmap.mons | length == 3' || return 1
+ # quorum contains two monitors
+ jq_success "$jqinput" '.quorum | length == 2' || return 1
+ # quorum's monitor features contain kraken and luminous
+ jqfilter='.features.quorum_mon[]|select(. == "kraken")'
+ jq_success "$jqinput" "$jqfilter" "kraken" || return 1
+ jqfilter='.features.quorum_mon[]|select(. == "luminous")'
+ jq_success "$jqinput" "$jqfilter" "luminous" || return 1
+
+ # monmap must have no persistent features set, because we
+ # don't currently have a quorum made out of all the monitors
+ # in the monmap.
+ jqfilter='.monmap.features.persistent | length == 0'
+ jq_success "$jqinput" "$jqfilter" || return 1
+
+ # nor do we have any optional features, for that matter.
+ jqfilter='.monmap.features.optional | length == 0'
+ jq_success "$jqinput" "$jqfilter" || return 1
+
+ # validate 'mon feature ls'
+
+ jqinput="$(ceph mon feature ls --format=json 2>/dev/null)"
+ # 'kraken' and 'luminous' are supported
+ jqfilter='.all.supported[] | select(. == "kraken")'
+ jq_success "$jqinput" "$jqfilter" "kraken" || return 1
+ jqfilter='.all.supported[] | select(. == "luminous")'
+ jq_success "$jqinput" "$jqfilter" "luminous" || return 1
+
+ # start third monitor
+ run_mon $dir c --public-addr $MONC || return 1
+
+ wait_for_quorum 300 3 || return 1
+
+ timeout 300 ceph -s > /dev/null || return 1
+
+ jqinput="$(ceph mon_status --format=json 2>/dev/null)"
+ # expect quorum to have all three monitors
+ jqfilter='.quorum | length == 3'
+ jq_success "$jqinput" "$jqfilter" || return 1
+ # quorum's monitor features contain kraken and luminous
+ jqfilter='.features.quorum_mon[]|select(. == "kraken")'
+ jq_success "$jqinput" "$jqfilter" "kraken" || return 1
+ jqfilter='.features.quorum_mon[]|select(. == "luminous")'
+ jq_success "$jqinput" "$jqfilter" "luminous" || return 1
+
+ # monmap must now have both 'kraken' and 'luminous' persistent
+ # features set.
+ jqfilter='.monmap.features.persistent | length == 2'
+ jq_success "$jqinput" "$jqfilter" || return 1
+ jqfilter='.monmap.features.persistent[]|select(. == "kraken")'
+ jq_success "$jqinput" "$jqfilter" "kraken" || return 1
+ jqfilter='.monmap.features.persistent[]|select(. == "luminous")'
+ jq_success "$jqinput" "$jqfilter" "luminous" || return 1
+
+ CEPH_ARGS=$CEPH_ARGS_orig
+ # that's all folks. thank you for tuning in.
+ teardown $dir || return 1
+}
+
+main misc "$@"
+
+# Local Variables:
+# compile-command: "cd ../.. ; make -j4 && test/mon/misc.sh"
+# End:
--- /dev/null
+#!/bin/bash
+#
+# Copyright (C) 2013 Cloudwatt <libre.licensing@cloudwatt.com>
+# Copyright (C) 2014 Red Hat <contact@redhat.com>
+#
+# Author: Loic Dachary <loic@dachary.org>
+#
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU Library Public License as published by
+# the Free Software Foundation; either version 2, or (at your option)
+# any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU Library Public License for more details.
+#
+# trace every command and abort the script on the first unchecked failure
+set -xe
+# xtrace prefix: source file, line number and current function
+PS4='${BASH_SOURCE[0]}:$LINENO: ${FUNCNAME[0]}: '
+
+
+# scratch directory, relative to the current working directory
+DIR=mkfs
+# do not read any configuration file and start from empty CEPH_ARGS
+export CEPH_CONF=/dev/null
+unset CEPH_ARGS
+MON_ID=a
+MON_DIR=$DIR/$MON_ID
+CEPH_MON=127.0.0.1:7110 # git grep '\<7110\>' : there must be only one
+# seconds allowed for each `ceph ... mon stat` call
+TIMEOUT=360
+
+# when CEPH_LIB is set, point the plugin and class directories at it
+EXTRAOPTS=""
+if [ -n "$CEPH_LIB" ]; then
+ EXTRAOPTS+=" --erasure-code-dir $CEPH_LIB"
+ EXTRAOPTS+=" --plugin-dir $CEPH_LIB"
+ EXTRAOPTS+=" --osd-class-dir $CEPH_LIB"
+fi
+
+# Start from a clean state: run teardown, then create the empty $DIR.
+function setup() {
+ teardown
+ mkdir $DIR
+}
+
+# Stop the daemons found by kill_daemons and remove $DIR recursively.
+function teardown() {
+ kill_daemons
+ rm -fr $DIR
+}
+
+# Run `ceph-mon --mkfs` for monitor $MON_ID in $MON_DIR with a freshly
+# generated fsid.
+#   $@ - extra options appended to the ceph-mon command line
+# The exit status is that of ceph-mon.
+function mon_mkfs() {
+ local fsid=$(uuidgen)
+
+ ceph-mon \
+ --id $MON_ID \
+ --fsid $fsid \
+ $EXTRAOPTS \
+ --mkfs \
+ --mon-data=$MON_DIR \
+ --mon-initial-members=$MON_ID \
+ --mon-host=$CEPH_MON \
+ "$@"
+}
+
+# Start monitor $MON_ID on $CEPH_MON using the data in $MON_DIR; log file,
+# cluster log file, run directory and pid file are all placed under
+# $MON_DIR (kill_daemons later looks for that pid file).
+#   $@ - extra options appended to the ceph-mon command line
+# The exit status is that of ceph-mon.
+function mon_run() {
+ ceph-mon \
+ --id $MON_ID \
+ --chdir= \
+ --mon-osd-full-ratio=.99 \
+ --mon-data-avail-crit=1 \
+ $EXTRAOPTS \
+ --mon-data=$MON_DIR \
+ --log-file=$MON_DIR/log \
+ --mon-cluster-log-file=$MON_DIR/log \
+ --run-dir=$MON_DIR \
+ --pid-file=$MON_DIR/pidfile \
+ --public-addr $CEPH_MON \
+ "$@"
+}
+
+# Send the default kill signal to every process whose pid is recorded in a
+# file named "pidfile" below $DIR. The signal is repeated with increasing
+# pauses (0, 1, 1, 1, 2, 3 seconds) until kill fails, which is taken to
+# mean the process is gone.
+# pidfile, pid and try are not declared local.
+function kill_daemons() {
+ for pidfile in $(find $DIR -name pidfile) ; do
+ pid=$(cat $pidfile)
+ for try in 0 1 1 1 2 3 ; do
+ # kill fails once the process no longer exists: stop retrying
+ kill $pid || break
+ sleep $try
+ done
+ done
+}
+
+# mkfs with authentication disabled: the monmap must be extractable from
+# the new store, no keyring may be created, and the started monitor must
+# answer `mon stat`.
+# Returns 1 when a check fails (commands without `|| return 1` abort the
+# script through set -e instead).
+function auth_none() {
+ mon_mkfs --auth-supported=none
+
+ ceph-mon \
+ --id $MON_ID \
+ --mon-osd-full-ratio=.99 \
+ --mon-data-avail-crit=1 \
+ $EXTRAOPTS \
+ --mon-data=$MON_DIR \
+ --extract-monmap $MON_DIR/monmap
+
+ [ -f $MON_DIR/monmap ] || return 1
+
+ # no keyring is expected when authentication is disabled
+ [ ! -f $MON_DIR/keyring ] || return 1
+
+ mon_run --auth-supported=none
+
+ timeout $TIMEOUT ceph --mon-host $CEPH_MON mon stat || return 1
+}
+
+# mkfs from a pre-written keyring file: the keyring must end up in
+# $MON_DIR and the started monitor must answer `mon stat` when the client
+# authenticates as mon. with that keyring.
+# Returns 1 when a check fails.
+function auth_cephx_keyring() {
+ cat > $DIR/keyring <<EOF
+[mon.]
+ key = AQDUS79S0AF9FRAA2cgRLFscVce0gROn/s9WMg==
+ caps mon = "allow *"
+EOF
+
+ mon_mkfs --keyring=$DIR/keyring
+
+ [ -f $MON_DIR/keyring ] || return 1
+
+ mon_run
+
+ timeout $TIMEOUT ceph \
+ --name mon. \
+ --keyring $MON_DIR/keyring \
+ --mon-host $CEPH_MON mon stat || return 1
+}
+
+# mkfs from a key given on the command line: a corrupted key must be
+# rejected, a generated key must be accepted and written to
+# $MON_DIR/keyring, and the started monitor must answer `mon stat`.
+# Refuses to run (returns 1) when /etc/ceph/keyring exists.
+# Returns 1 when a check fails.
+function auth_cephx_key() {
+ if [ -f /etc/ceph/keyring ] ; then
+ echo "Please move /etc/ceph/keyring away for testing!"
+ return 1
+ fi
+
+ local key=$(ceph-authtool --gen-print-key)
+
+ # mkfs must fail with a corrupted key; remove whatever the failed
+ # attempt left behind before trying again
+ if mon_mkfs --key='corrupted key' ; then
+ return 1
+ else
+ rm -fr $MON_DIR/store.db
+ rm -fr $MON_DIR/kv_backend
+ fi
+
+ mon_mkfs --key=$key
+
+ [ -f $MON_DIR/keyring ] || return 1
+ # the generated key must be present in the keyring (set -e aborts otherwise)
+ grep $key $MON_DIR/keyring
+
+ mon_run
+
+ timeout $TIMEOUT ceph \
+ --name mon. \
+ --keyring $MON_DIR/keyring \
+ --mon-host $CEPH_MON mon stat || return 1
+}
+
+# Check how mkfs treats the monitor data directory: a path whose parent
+# does not exist must be reported as "No such file", and an existing but
+# empty directory must not be reported as "already exists".
+# Returns 1 when the second check fails; a failing grep in the first check
+# aborts the script through set -e.
+function makedir() {
+ local toodeep=$MON_DIR/toodeep
+
+ # fail if recursive directory creation is needed
+ ceph-mon \
+ --id $MON_ID \
+ --mon-osd-full-ratio=.99 \
+ --mon-data-avail-crit=1 \
+ $EXTRAOPTS \
+ --mkfs \
+ --mon-data=$toodeep 2>&1 | tee $DIR/makedir.log
+ grep 'toodeep.*No such file' $DIR/makedir.log > /dev/null
+ rm $DIR/makedir.log
+
+ # an empty directory does not mean the mon exists
+ mkdir $MON_DIR
+ mon_mkfs --auth-supported=none 2>&1 | tee $DIR/makedir.log
+ ! grep "$MON_DIR already exists" $DIR/makedir.log || return 1
+}
+
+# Run mkfs twice on the same directory; the second run must report that
+# '$MON_DIR' already exists.
+# Returns 1 when that message is missing from the second run's output.
+function idempotent() {
+ mon_mkfs --auth-supported=none
+ mon_mkfs --auth-supported=none 2>&1 | tee $DIR/makedir.log
+ grep "'$MON_DIR' already exists" $DIR/makedir.log > /dev/null || return 1
+}
+
+# Run every mkfs scenario in a fixed order, each one between a fresh
+# setup and a teardown.
+# Returns 1 as soon as a scenario fails; no teardown is run in that case.
+function run() {
+    local actions="makedir idempotent auth_cephx_key auth_cephx_keyring auth_none"
+
+    for action in $actions
+    do
+        setup
+        $action || return 1
+        teardown
+    done
+}
+
+run
+
+# Local Variables:
+# compile-command: "cd ../.. ; make TESTS=test/mon/mkfs.sh check"
+# End:
--- /dev/null
+#!/bin/bash
+#
+# Copyright (C) 2017 Quantum Corp.
+#
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU Library Public License as published by
+# the Free Software Foundation; either version 2, or (at your option)
+# any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU Library Public License for more details.
+#
+source $CEPH_ROOT/qa/standalone/ceph-helpers.sh
+
+# PIDs of the background socat processes started by port_forward();
+# cleanup() kills them and resets the list.
+SOCAT_PIDS=()
+
+# Start a background socat relaying TCP connections from local port $1 to
+# localhost port $2, and remember its PID in SOCAT_PIDS.
+function port_forward() {
+    local listen_port=$1
+    local dest_port=$2
+
+    socat TCP-LISTEN:${listen_port},fork,reuseaddr TCP:localhost:${dest_port} &
+    SOCAT_PIDS+=( $! )
+}
+
+# Kill every socat forwarder recorded in SOCAT_PIDS and empty the list.
+function cleanup() {
+    local pid
+    for pid in "${SOCAT_PIDS[@]}"; do
+        kill $pid
+    done
+    SOCAT_PIDS=()
+}
+
+# stop the socat forwarders when the script is interrupted; SIGKILL is not
+# listed because it cannot be caught
+trap cleanup SIGTERM SIGQUIT SIGINT
+
+# Export the addresses and ports shared by the tests, then run the TEST_*
+# functions named in the arguments (all of them by default), each between
+# setup and teardown. The socat forwarders are cleaned up after every test,
+# whether it passed or not; the first failure returns 1.
+function run() {
+    local dir=$1
+    shift
+
+    export MON_IP=127.0.0.1
+    export MONA_PUBLIC=7132 # git grep '\<7132\>' ; there must be only one
+    export MONB_PUBLIC=7133 # git grep '\<7133\>' ; there must be only one
+    export MONC_PUBLIC=7134 # git grep '\<7134\>' ; there must be only one
+    export MONA_BIND=7135 # git grep '\<7135\>' ; there must be only one
+    export MONB_BIND=7136 # git grep '\<7136\>' ; there must be only one
+    export MONC_BIND=7137 # git grep '\<7137\>' ; there must be only one
+    export CEPH_ARGS
+    CEPH_ARGS+="--fsid=$(uuidgen) --auth-supported=none "
+
+    local funcs=${@:-$(set | sed -n -e 's/^\(TEST_[0-9a-z_]*\) .*/\1/p')}
+    for func in $funcs ; do
+        setup $dir || return 1
+        if $func $dir ; then
+            cleanup
+        else
+            cleanup
+            return 1
+        fi
+        teardown $dir
+    done
+}
+
+# With the mon bound to a port other than the advertised one and nothing
+# forwarding between the two, pinging the mon must fail within 3 seconds.
+function TEST_mon_client_connect_fails() {
+    local dir=$1
+
+    # start the mon with a public-bind-addr that is different
+    # from the public-addr.
+    CEPH_ARGS+="--mon-initial-members=a "
+    CEPH_ARGS+="--mon-host=${MON_IP}:${MONA_PUBLIC} "
+    run_mon $dir a --mon-host=${MON_IP}:${MONA_PUBLIC} --public-bind-addr=${MON_IP}:${MONA_BIND} || return 1
+
+    # the ping is expected to fail: a successful ping is the error case
+    if timeout 3 ceph ping mon.a ; then
+        return 1
+    fi
+    return 0
+}
+
+# With the advertised port forwarded to the port the mon is actually bound
+# to, pinging the mon must succeed.
+function TEST_mon_client_connect() {
+    local dir=$1
+
+    # start the mon with a public-bind-addr that is different
+    # from the public-addr.
+    CEPH_ARGS+="--mon-initial-members=a "
+    CEPH_ARGS+="--mon-host=${MON_IP}:${MONA_PUBLIC} "
+    run_mon $dir a --mon-host=${MON_IP}:${MONA_PUBLIC} --public-bind-addr=${MON_IP}:${MONA_BIND} || return 1
+
+    # relay the public port to the bind port
+    port_forward ${MONA_PUBLIC} ${MONA_BIND}
+
+    # the mon is now reachable through the forwarder
+    if ! ceph ping mon.a ; then
+        return 1
+    fi
+}
+
+# Three mons, each bound to a port other than its advertised one and reached
+# through a forwarder, must appear in the monmap and form a quorum of three.
+function TEST_mon_quorum() {
+    local dir=$1
+    local jqinput
+    local jqfilter
+
+    # start the mon with a public-bind-addr that is different
+    # from the public-addr.
+    CEPH_ARGS+="--mon-initial-members=a,b,c "
+    CEPH_ARGS+="--mon-host=${MON_IP}:${MONA_PUBLIC},${MON_IP}:${MONB_PUBLIC},${MON_IP}:${MONC_PUBLIC} "
+    run_mon $dir a --public-addr=${MON_IP}:${MONA_PUBLIC} --public-bind-addr=${MON_IP}:${MONA_BIND} || return 1
+    run_mon $dir b --public-addr=${MON_IP}:${MONB_PUBLIC} --public-bind-addr=${MON_IP}:${MONB_BIND} || return 1
+    run_mon $dir c --public-addr=${MON_IP}:${MONC_PUBLIC} --public-bind-addr=${MON_IP}:${MONC_BIND} || return 1
+
+    # now forward the public port to the bind port.
+    port_forward ${MONA_PUBLIC} ${MONA_BIND}
+    port_forward ${MONB_PUBLIC} ${MONB_BIND}
+    port_forward ${MONC_PUBLIC} ${MONC_BIND}
+
+    # expect monmap to contain 3 monitors (a, b, and c)
+    jqinput="$(ceph mon_status --format=json 2>/dev/null)"
+    jq_success "$jqinput" '.monmap.mons | length == 3' || return 1
+
+    # quorum should form
+    wait_for_quorum 300 3 || return 1
+    # expect quorum to have all three monitors; the status captured above
+    # predates the quorum, so it has to be fetched again
+    jqinput="$(ceph mon_status --format=json 2>/dev/null)"
+    jqfilter='.quorum | length == 3'
+    jq_success "$jqinput" "$jqfilter" || return 1
+}
+
+# Bring up three forwarded mons, one mgr and three OSDs, then write an
+# object to a new pool and check that reading it back yields the same bytes.
+function TEST_put_get() {
+    local dir=$1
+    local written=$dir/hello
+    local fetched=$dir/hello2
+
+    # start the mon with a public-bind-addr that is different
+    # from the public-addr.
+    CEPH_ARGS+="--mon-initial-members=a,b,c "
+    CEPH_ARGS+="--mon-host=${MON_IP}:${MONA_PUBLIC},${MON_IP}:${MONB_PUBLIC},${MON_IP}:${MONC_PUBLIC} "
+    run_mon $dir a --public-addr=${MON_IP}:${MONA_PUBLIC} --public-bind-addr=${MON_IP}:${MONA_BIND} || return 1
+    run_mon $dir b --public-addr=${MON_IP}:${MONB_PUBLIC} --public-bind-addr=${MON_IP}:${MONB_BIND} || return 1
+    run_mon $dir c --public-addr=${MON_IP}:${MONC_PUBLIC} --public-bind-addr=${MON_IP}:${MONC_BIND} || return 1
+
+    # relay each public port to the matching bind port
+    port_forward ${MONA_PUBLIC} ${MONA_BIND}
+    port_forward ${MONB_PUBLIC} ${MONB_BIND}
+    port_forward ${MONC_PUBLIC} ${MONC_BIND}
+
+    # all three mons must be in quorum before going further
+    wait_for_quorum 300 3 || return 1
+
+    run_mgr $dir x || return 1
+    run_osd $dir 0 || return 1
+    run_osd $dir 1 || return 1
+    run_osd $dir 2 || return 1
+
+    ceph osd pool create hello 8 || return 1
+
+    # round trip: put, get, compare
+    echo "hello world" > $written
+    rados --pool hello put foo $written || return 1
+    rados --pool hello get foo $fetched || return 1
+    diff $written $fetched || return 1
+}
+
+main mon-bind "$@"
--- /dev/null
+#!/bin/bash
+#
+# Copyright (C) 2015 SUSE LINUX GmbH
+#
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU Library Public License as published by
+# the Free Software Foundation; either version 2, or (at your option)
+# any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU Library Public License for more details.
+#
+source $CEPH_ROOT/qa/standalone/ceph-helpers.sh
+
+# Export the mon address and common ceph arguments, then run the TEST_*
+# functions named in the arguments (all of them by default), each between
+# setup and teardown. Returns 1 as soon as any step fails.
+function run() {
+    local dir=$1
+    shift
+
+    export CEPH_MON="127.0.0.1:7125" # git grep '\<7125\>' : there must be only one
+    export CEPH_ARGS
+    CEPH_ARGS+="--fsid=$(uuidgen) --auth-supported=none --mon-host=$CEPH_MON "
+
+    local selected=${@:-$(set | sed -n -e 's/^\(TEST_[0-9a-z_]*\) .*/\1/p')}
+    for func in $selected ; do
+        setup $dir && $func $dir && teardown $dir || return 1
+    done
+}
+
+# The last field of the "created" line printed by `ceph mon dump` must be
+# neither empty nor 0.000000.
+function TEST_mon_created_time() {
+    local dir=$1
+
+    run_mon $dir a || return 1
+
+    ceph mon dump || return 1
+
+    # a creation time must be reported ...
+    [ -n "$(ceph mon dump 2>/dev/null | sed -n '/created/p' | awk '{print $NF}')" ] || return 1
+
+    # ... and it must not be the zero timestamp
+    [ "$(ceph mon dump 2>/dev/null | sed -n '/created/p' | awk '{print $NF}')" != "0.000000" ] || return 1
+}
+
+main mon-created-time "$@"
+
+# Local Variables:
+# compile-command: "cd ../.. ; make -j4 && test/mon/mon-created-time.sh"
+# End:
--- /dev/null
+#!/bin/bash
+#
+# Copyright (C) 2013 Cloudwatt <libre.licensing@cloudwatt.com>
+# Copyright (C) 2014,2015 Red Hat <contact@redhat.com>
+#
+# Author: Loic Dachary <loic@dachary.org>
+#
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU Library Public License as published by
+# the Free Software Foundation; either version 2, or (at your option)
+# any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU Library Public License for more details.
+#
+source $CEPH_ROOT/qa/standalone/ceph-helpers.sh
+
+# Start two mons (a on $MONA, b on $MONB) and verify from their logs that a
+# command sent to mon.a is handled without forwarding, while a command sent
+# to the peon mon.b is forwarded to mon.a together with the connection
+# features. Returns 1 when an expectation does not hold.
+function run() {
+    local dir=$1
+
+    setup $dir || return 1
+
+    MONA=127.0.0.1:7300
+    MONB=127.0.0.1:7301
+    # CEPH_ARGS is only extended inside the subshell; its exit status tells
+    # whether both mons started and must not be ignored
+    (
+        FSID=$(uuidgen)
+        export CEPH_ARGS
+        CEPH_ARGS+="--fsid=$FSID --auth-supported=none "
+        CEPH_ARGS+="--mon-initial-members=a,b --mon-host=$MONA,$MONB "
+        run_mon $dir a --public-addr $MONA || return 1
+        run_mon $dir b --public-addr $MONB || return 1
+    ) || return 1
+
+    timeout 360 ceph --mon-host $MONA mon stat || return 1
+    # check that MONB is indeed a peon
+    ceph --admin-daemon $(get_asok_path mon.b) mon_status |
+        grep '"peon"' || return 1
+    # when the leader ( MONA ) is used, there is no message forwarding
+    ceph --mon-host $MONA osd pool create POOL1 12
+    CEPH_ARGS='' ceph --admin-daemon $(get_asok_path mon.a) log flush || return 1
+    # the mon logs live in $dir/mon.<id>.log, as for every other check below
+    grep 'mon_command(.*"POOL1"' $dir/mon.a.log || return 1
+    CEPH_ARGS='' ceph --admin-daemon $(get_asok_path mon.b) log flush || return 1
+    grep 'mon_command(.*"POOL1"' $dir/mon.b.log && return 1
+    # when the peon ( MONB ) is used, the message is forwarded to the leader
+    ceph --mon-host $MONB osd pool create POOL2 12
+    CEPH_ARGS='' ceph --admin-daemon $(get_asok_path mon.b) log flush || return 1
+    grep 'forward_request.*mon_command(.*"POOL2"' $dir/mon.b.log || return 1
+    CEPH_ARGS='' ceph --admin-daemon $(get_asok_path mon.a) log flush || return 1
+    grep ' forward(mon_command(.*"POOL2"' $dir/mon.a.log || return 1
+    # forwarded messages must retain features from the original connection
+    features=$(sed -n -e 's|.*127.0.0.1:0.*accept features \([0-9][0-9]*\)|\1|p' < \
+        $dir/mon.b.log)
+    grep ' forward(mon_command(.*"POOL2".*con_features '$features $dir/mon.a.log
+
+    teardown $dir || return 1
+}
+
+main mon-handle-forward "$@"
+
+# Local Variables:
+# compile-command: "cd ../.. ; make -j4 TESTS=test/mon/mon-handle-forward.sh check"
+# End:
--- /dev/null
+#!/bin/bash
+#
+# Copyright (C) 2015 SUSE LINUX GmbH
+#
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU Library Public License as published by
+# the Free Software Foundation; either version 2, or (at your option)
+# any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU Library Public License for more details.
+#
+source $CEPH_ROOT/qa/standalone/ceph-helpers.sh
+
+# Export the mon address and common ceph arguments, then run the TEST_*
+# functions named in the arguments (all of them by default), each between
+# setup and teardown. Returns 1 as soon as any step fails.
+function run() {
+    local dir=$1
+    shift
+
+    export CEPH_MON="127.0.0.1:7119" # git grep '\<7119\>' : there must be only one
+    export CEPH_ARGS
+    CEPH_ARGS+="--fsid=$(uuidgen) --auth-supported=none --mon-host=$CEPH_MON "
+
+    local selected=${@:-$(set | sed -n -e 's/^\(TEST_[0-9a-z_]*\) .*/\1/p')}
+    for func in $selected ; do
+        setup $dir && $func $dir && teardown $dir || return 1
+    done
+}
+
+# A freshly started mon.a must answer `ceph ping`.
+function TEST_mon_ping() {
+    local dir=$1
+
+    run_mon $dir a || return 1
+
+    if ! ceph ping mon.a ; then
+        return 1
+    fi
+}
+
+main mon-ping "$@"
+
+# Local Variables:
+# compile-command: "cd ../.. ; make -j4 && test/mon/mon-ping.sh"
+# End:
--- /dev/null
+#!/bin/bash
+#
+# Copyright (C) 2014 Cloudwatt <libre.licensing@cloudwatt.com>
+# Copyright (C) 2014, 2015 Red Hat <contact@redhat.com>
+#
+# Author: Loic Dachary <loic@dachary.org>
+#
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU Library Public License as published by
+# the Free Software Foundation; either version 2, or (at your option)
+# any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU Library Public License for more details.
+#
+source $CEPH_ROOT/qa/standalone/ceph-helpers.sh
+
+# Export the mon address and common ceph arguments, then run the TEST_*
+# functions named in the arguments (all of them by default), each between
+# setup and teardown. Returns 1 as soon as any step fails.
+function run() {
+    local dir=$1
+    shift
+
+    export CEPH_MON="127.0.0.1:7120" # git grep '\<7120\>' : there must be only one
+    export CEPH_ARGS
+    CEPH_ARGS+="--fsid=$(uuidgen) --auth-supported=none --mon-host=$CEPH_MON "
+
+    local selected=${@:-$(set | sed -n -e 's/^\(TEST_[0-9a-z_]*\) .*/\1/p')}
+    for func in $selected ; do
+        setup $dir && $func $dir && teardown $dir || return 1
+    done
+}
+
+# `ceph mon scrub` must succeed against a freshly started mon.a.
+function TEST_mon_scrub() {
+    local dir=$1
+
+    run_mon $dir a || return 1
+
+    if ! ceph mon scrub ; then
+        return 1
+    fi
+}
+
+main mon-scrub "$@"
+
+# Local Variables:
+# compile-command: "cd ../.. ; make -j4 && test/mon/mon-scrub.sh"
+# End:
--- /dev/null
+#!/bin/bash
+#
+# Copyright (C) 2014 Cloudwatt <libre.licensing@cloudwatt.com>
+# Copyright (C) 2014, 2015 Red Hat <contact@redhat.com>
+#
+# Author: Loic Dachary <loic@dachary.org>
+#
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU Library Public License as published by
+# the Free Software Foundation; either version 2, or (at your option)
+# any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU Library Public License for more details.
+#
+source $CEPH_ROOT/qa/standalone/ceph-helpers.sh
+
+# Export the mon address and common ceph arguments, then run the TEST_*
+# functions named in the arguments (all of them by default), each between
+# setup and teardown. Returns 1 as soon as any step fails.
+function run() {
+    local dir=$1
+    shift
+
+    export CEPH_MON="127.0.0.1:7104" # git grep '\<7104\>' : there must be only one
+    export CEPH_ARGS
+    CEPH_ARGS+="--fsid=$(uuidgen) --auth-supported=none --mon-host=$CEPH_MON "
+
+    local selected=${@:-$(set | ${SED} -n -e 's/^\(TEST_[0-9a-z_]*\) .*/\1/p')}
+    for func in $selected ; do
+        setup $dir && $func $dir && teardown $dir || return 1
+    done
+}
+
+# Exercise `osd crush rule create-simple`: check the steps of the stock
+# replicated_rule, create a rule under a new host bucket, check that creating
+# it again reports "already exists", check its steps and remove it.
+function TEST_crush_rule_create_simple() {
+    local dir=$1
+
+    run_mon $dir a || return 1
+
+    # stock rule: take "default", then choose_firstn over osd
+    ceph --format xml osd crush rule dump replicated_rule | \
+        egrep '<op>take</op><item>[^<]+</item><item_name>default</item_name>' | \
+        grep '<op>choose_firstn</op><num>0</num><type>osd</type>' || return 1
+
+    local rule=ruleset0
+    local bucket=host1
+    local domain=osd
+    ceph osd crush add-bucket $bucket host
+    ceph osd crush rule create-simple $rule $bucket $domain || return 1
+    # creating the same rule again is reported, not silently duplicated
+    ceph osd crush rule create-simple $rule $bucket $domain 2>&1 | \
+        grep "$rule already exists" || return 1
+    # new rule: take the bucket, then choose_firstn over the failure domain
+    ceph --format xml osd crush rule dump $rule | \
+        egrep "<op>take</op><item>[^<]+</item><item_name>$bucket</item_name>" | \
+        grep "<op>choose_firstn</op><num>0</num><type>$domain</type>" || return 1
+    ceph osd crush rule rm $rule || return 1
+}
+
+# `osd crush rule dump` must show a newly created erasure rule both when
+# dumped by name and exactly once in the full dump, and must fail for a rule
+# that does not exist.
+function TEST_crush_rule_dump() {
+    local dir=$1
+
+    run_mon $dir a || return 1
+
+    local rule=ruleset1
+    ceph osd crush rule create-erasure $rule || return 1
+
+    # dump of the single rule
+    local by_name=$(ceph --format json osd crush rule dump $rule | \
+        jq ".rule_name == \"$rule\"")
+    test $by_name == true || return 1
+
+    # dump of all rules
+    local in_list=$(ceph --format json osd crush rule dump | \
+        jq "map(select(.rule_name == \"$rule\")) | length == 1")
+    test $in_list == true || return 1
+
+    if ceph osd crush rule dump non_existent_ruleset ; then
+        return 1
+    fi
+    ceph osd crush rule rm $rule || return 1
+}
+
+# A rule created with create-erasure must be listed, and must no longer be
+# listed once removed.
+function TEST_crush_rule_rm() {
+    # take the test directory from the argument like every other TEST_*
+    # function instead of relying on the caller's local variable
+    local dir=$1
+    local ruleset=erasure2
+
+    run_mon $dir a || return 1
+
+    ceph osd crush rule create-erasure $ruleset default || return 1
+    ceph osd crush rule ls | grep $ruleset || return 1
+    ceph osd crush rule rm $ruleset || return 1
+    ! ceph osd crush rule ls | grep $ruleset || return 1
+}
+
+function TEST_crush_rule_create_erasure() {
+ local dir=$1
+
+ run_mon $dir a || return 1
+ # should have at least one OSD
+ run_osd $dir 0 || return 1
+
+ local ruleset=ruleset3
+ #
+ # create a new ruleset with the default profile, implicitly
+ #
+ ceph osd crush rule create-erasure $ruleset || return 1
+ ceph osd crush rule create-erasure $ruleset 2>&1 | \
+ grep "$ruleset already exists" || return 1
+ ceph --format xml osd crush rule dump $ruleset | \
+ egrep '<op>take</op><item>[^<]+</item><item_name>default</item_name>' | \
+ grep '<op>chooseleaf_indep</op><num>0</num><type>host</type>' || return 1
+ ceph osd crush rule rm $ruleset || return 1
+ ! ceph osd crush rule ls | grep $ruleset || return 1
+ #
+ # create a new ruleset with the default profile, explicitly
+ #
+ ceph osd crush rule create-erasure $ruleset default || return 1
+ ceph osd crush rule ls | grep $ruleset || return 1
+ ceph osd crush rule rm $ruleset || return 1
+ ! ceph osd crush rule ls | grep $ruleset || return 1
+ #
+ # create a new ruleset and the default profile, implicitly
+ #
+ ceph osd erasure-code-profile rm default || return 1
+ ! ceph osd erasure-code-profile ls | grep default || return 1
+ ceph osd crush rule create-erasure $ruleset || return 1
+ CEPH_ARGS='' ceph --admin-daemon $(get_asok_path mon.a) log flush || return 1
+ grep 'profile set default' $dir/mon.a.log || return 1
+ ceph osd erasure-code-profile ls | grep default || return 1
+ ceph osd crush rule rm $ruleset || return 1
+ ! ceph osd crush rule ls | grep $ruleset || return 1
+}
+
+# Return 0 when the "rule_id" and "ruleset" values shown by
+# `ceph osd crush rule dump $1` are equal, 1 otherwise.
+function check_ruleset_id_match_rule_id() {
+    local rule_name=$1
+    # both values are kept local so that they do not leak into the caller
+    local rule_id=$(ceph osd crush rule dump $rule_name | grep "\"rule_id\":" | awk -F ":|," '{print int($2)}')
+    local ruleset_id=$(ceph osd crush rule dump $rule_name | grep "\"ruleset\":"| awk -F ":|," '{print int($2)}')
+    # quoted so that a missing value is compared instead of breaking test
+    test "$ruleset_id" = "$rule_id" || return 1
+}
+
+# Create two simple rules under the bucket $root (a variable of the caller),
+# then rewrite the crush map so that ruleset numbers no longer match the rule
+# ids (0 -> 3, 2 -> 0, 1 -> 2) and inject it back.
+function generate_manipulated_rules() {
+    local dir=$1
+    local text_map=$dir/decoded_original_map
+
+    ceph osd crush add-bucket $root host
+    ceph osd crush rule create-simple test_rule1 $root osd firstn || return 1
+    ceph osd crush rule create-simple test_rule2 $root osd firstn || return 1
+    ceph osd getcrushmap -o $dir/original_map
+    crushtool -d $dir/original_map -o $text_map
+    #manipulate the rulesets , to make the rule_id != ruleset_id
+    ${SED} -i \
+        -e 's/ruleset 0/ruleset 3/' \
+        -e 's/ruleset 2/ruleset 0/' \
+        -e 's/ruleset 1/ruleset 2/' \
+        $text_map
+
+    crushtool -c $text_map -o $dir/new_map
+    ceph osd setcrushmap -i $dir/new_map
+
+    ceph osd crush rule dump
+}
+
+# After the crush map was altered so that existing rules have a ruleset
+# different from their rule id, a newly created rule must still get matching
+# rule_id and ruleset values.
+function TEST_crush_ruleset_match_rule_when_creating() {
+    local dir=$1
+    # must be named root: generate_manipulated_rules reads it
+    local root=host1
+
+    run_mon $dir a || return 1
+
+    generate_manipulated_rules $dir
+
+    ceph osd crush rule create-simple special_rule_simple $root osd firstn || return 1
+
+    ceph osd crush rule dump
+    #show special_rule_simple has same rule_id and ruleset_id
+    if ! check_ruleset_id_match_rule_id special_rule_simple ; then
+        return 1
+    fi
+}
+
+# Append rules test_rule3 .. test_rule255 to the decompiled crush map, inject
+# the result, and check that creating one more rule is refused with ENOSPC.
+function TEST_add_ruleset_failed() {
+ local dir=$1
+
+ run_mon $dir a || return 1
+
+ local root=host1
+
+ ceph osd crush add-bucket $root host
+ ceph osd crush rule create-simple test_rule1 $root osd firstn || return 1
+ ceph osd crush rule create-simple test_rule2 $root osd firstn || return 1
+ ceph osd getcrushmap > $dir/crushmap || return 1
+ crushtool --decompile $dir/crushmap > $dir/crushmap.txt || return 1
+ # one rule definition per ruleset number, appended to the text map
+ for i in $(seq 3 255)
+ do
+ cat <<EOF
+rule test_rule$i {
+ ruleset $i
+ type replicated
+ min_size 1
+ max_size 10
+ step take $root
+ step choose firstn 0 type osd
+ step emit
+}
+EOF
+ done >> $dir/crushmap.txt
+ crushtool --compile $dir/crushmap.txt -o $dir/crushmap || return 1
+ ceph osd setcrushmap -i $dir/crushmap || return 1
+ # no room is expected to be left for another rule
+ ceph osd crush rule create-simple test_rule_nospace $root osd firstn 2>&1 | grep "Error ENOSPC" || return 1
+
+}
+
+# Renaming a bucket must show up in `osd tree`, repeating the same rename
+# must succeed, and renaming a bucket that does not exist must yield ENOENT.
+function TEST_crush_rename_bucket() {
+    local dir=$1
+
+    run_mon $dir a || return 1
+
+    ceph osd crush add-bucket host1 host
+    ceph osd tree
+    # host2 does not exist yet
+    if ceph osd tree | grep host2 ; then
+        return 1
+    fi
+    ceph osd crush rename-bucket host1 host2 || return 1
+    ceph osd tree
+    ceph osd tree | grep host2 || return 1
+    ceph osd crush rename-bucket host1 host2 || return 1 # idempotency
+    ceph osd crush rename-bucket nonexistent something 2>&1 | grep "Error ENOENT" || return 1
+}
+
+# A crush map compiled from an empty text file must be refused by
+# `osd setcrushmap` with EINVAL.
+function TEST_crush_reject_empty() {
+    local dir=$1
+    run_mon $dir a || return 1
+    # should have at least one OSD
+    run_osd $dir 0 || return 1
+    create_rbd_pool || return 1
+
+    local empty_map=$dir/empty_map
+    local empty_txt=$empty_map.txt
+    local empty_bin=$empty_map.map
+    # create (or truncate) the empty source file
+    :> $empty_txt
+    crushtool -c $empty_txt -o $empty_bin || return 1
+    expect_failure $dir "Error EINVAL" ceph osd setcrushmap -i $empty_bin || return 1
+}
+
+main osd-crush "$@"
+
+# Local Variables:
+# compile-command: "cd ../.. ; make -j4 && test/mon/osd-crush.sh"
+# End:
--- /dev/null
+#!/bin/bash
+#
+# Copyright (C) 2014 Cloudwatt <libre.licensing@cloudwatt.com>
+# Copyright (C) 2014, 2015 Red Hat <contact@redhat.com>
+#
+# Author: Loic Dachary <loic@dachary.org>
+#
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU Library Public License as published by
+# the Free Software Foundation; either version 2, or (at your option)
+# any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU Library Public License for more details.
+#
+source $CEPH_ROOT/qa/standalone/ceph-helpers.sh
+
+# Export the mon address and common ceph arguments, then run the TEST_*
+# functions named in the arguments (all of them by default), each between
+# setup and teardown. Returns 1 as soon as any step fails.
+function run() {
+    local dir=$1
+    shift
+
+    export CEPH_MON="127.0.0.1:7220" # git grep '\<7220\>' : there must be only one
+    export CEPH_ARGS
+    CEPH_ARGS+="--fsid=$(uuidgen) --auth-supported=none --mon-host=$CEPH_MON "
+
+    local selected=${@:-$(set | sed -n -e 's/^\(TEST_[0-9a-z_]*\) .*/\1/p')}
+    for func in $selected ; do
+        setup $dir && $func $dir && teardown $dir || return 1
+    done
+}
+
+# Exercise `osd erasure-code-profile set`: defaults when no key=value is
+# given, explicit key=value pairs, and the need for --force to replace an
+# existing profile.
+function TEST_set() {
+    local dir=$1
+    local id=$2
+
+    run_mon $dir a || return 1
+
+    local profile=myprofile
+
+    # no key=value pairs : use the default configuration
+    ceph osd erasure-code-profile set $profile 2>&1 || return 1
+    ceph osd erasure-code-profile get $profile | grep plugin=jerasure || return 1
+    ceph osd erasure-code-profile rm $profile
+
+    # key=value pairs override the default
+    ceph osd erasure-code-profile set $profile key=value plugin=isa || return 1
+    ceph osd erasure-code-profile get $profile | grep -e key=value -e plugin=isa || return 1
+
+    # --force is required to override an existing profile
+    if ceph osd erasure-code-profile set $profile > $dir/out 2>&1 ; then
+        return 1
+    fi
+    grep 'will not override' $dir/out || return 1
+    ceph osd erasure-code-profile set $profile key=other --force || return 1
+    ceph osd erasure-code-profile get $profile | grep key=other || return 1
+
+    ceph osd erasure-code-profile rm $profile # cleanup
+}
+
+# A profile must be absent from `osd erasure-code-profile ls` before it is
+# set and present afterwards, in both plain and xml output.
+function TEST_ls() {
+    local dir=$1
+    local id=$2
+
+    run_mon $dir a || return 1
+
+    local profile=myprofile
+    if ceph osd erasure-code-profile ls | grep $profile ; then
+        return 1
+    fi
+    ceph osd erasure-code-profile set $profile 2>&1 || return 1
+    ceph osd erasure-code-profile ls | grep $profile || return 1
+    ceph --format xml osd erasure-code-profile ls | grep "<profile>$profile</profile>" || return 1
+
+    ceph osd erasure-code-profile rm $profile # cleanup
+}
+
+# Exercise `osd erasure-code-profile rm`: a removed profile is no longer
+# listed, removing an unknown profile says so, and a profile used by a pool
+# cannot be removed until the pool is deleted.
+function TEST_rm() {
+    local dir=$1
+    local id=$2
+
+    run_mon $dir a || return 1
+
+    local profile=myprofile
+
+    # plain removal
+    ceph osd erasure-code-profile set $profile 2>&1 || return 1
+    ceph osd erasure-code-profile ls | grep $profile || return 1
+    ceph osd erasure-code-profile rm $profile || return 1
+    if ceph osd erasure-code-profile ls | grep $profile ; then
+        return 1
+    fi
+    ceph osd erasure-code-profile rm WRONG 2>&1 | grep "WRONG does not exist" || return 1
+
+    # removal while a pool uses the profile
+    ceph osd erasure-code-profile set $profile || return 1
+    ceph osd pool create poolname 12 12 erasure $profile || return 1
+    if ceph osd erasure-code-profile rm $profile > $dir/out 2>&1 ; then
+        return 1
+    fi
+    grep "poolname.*using.*$profile" $dir/out || return 1
+    ceph osd pool delete poolname poolname --yes-i-really-really-mean-it || return 1
+    ceph osd erasure-code-profile rm $profile || return 1
+
+    ceph osd erasure-code-profile rm $profile # cleanup
+}
+
+# `osd erasure-code-profile get` must show plugin=jerasure for the default
+# profile (plain and xml) and fail with a message for an unknown profile.
+function TEST_get() {
+    local dir=$1
+    local id=$2
+
+    run_mon $dir a || return 1
+
+    local default_profile=default
+    ceph osd erasure-code-profile get $default_profile | grep plugin=jerasure || return 1
+    ceph --format xml osd erasure-code-profile get $default_profile | grep '<plugin>jerasure</plugin>' || return 1
+    if ceph osd erasure-code-profile get WRONG > $dir/out 2>&1 ; then
+        return 1
+    fi
+    grep -q "unknown erasure code profile 'WRONG'" $dir/out || return 1
+}
+
+# Setting a profile a second time with the same parameters must succeed, for
+# the default profile, for a jerasure profile and for an lrc profile.
+function TEST_set_idempotent() {
+    local dir=$1
+    local id=$2
+
+    run_mon $dir a || return 1
+    #
+    # The default profile is set using a code path different from
+    # ceph osd erasure-code-profile set: verify that it is idempotent,
+    # as if it was using the same code path.
+    #
+    ceph osd erasure-code-profile set default k=2 m=1 2>&1 || return 1
+    local profile
+    #
+    # Because plugin=jerasure is the default, it uses a slightly
+    # different code path where defaults (m=1 for instance) are added
+    # implicitly.
+    #
+    profile=profileidempotent1
+    if ceph osd erasure-code-profile ls | grep $profile ; then
+        return 1
+    fi
+    ceph osd erasure-code-profile set $profile k=2 crush-failure-domain=osd 2>&1 || return 1
+    ceph osd erasure-code-profile ls | grep $profile || return 1
+    ceph osd erasure-code-profile set $profile k=2 crush-failure-domain=osd 2>&1 || return 1
+    ceph osd erasure-code-profile rm $profile # cleanup
+
+    #
+    # In the general case the profile is exactly what is on
+    #
+    profile=profileidempotent2
+    if ceph osd erasure-code-profile ls | grep $profile ; then
+        return 1
+    fi
+    ceph osd erasure-code-profile set $profile plugin=lrc k=4 m=2 l=3 crush-failure-domain=osd 2>&1 || return 1
+    ceph osd erasure-code-profile ls | grep $profile || return 1
+    ceph osd erasure-code-profile set $profile plugin=lrc k=4 m=2 l=3 crush-failure-domain=osd 2>&1 || return 1
+    ceph osd erasure-code-profile rm $profile # cleanup
+}
+
+# When osd_pool_default_erasure-code-profile is valid JSON of the wrong type,
+# setting a profile must fail with "must be a JSON object".
+function TEST_format_invalid() {
+    local dir=$1
+
+    local profile=profile
+    # osd_pool_default_erasure-code-profile is
+    # valid JSON but not of the expected type
+    run_mon $dir a --osd_pool_default_erasure-code-profile 1 || return 1
+    if ceph osd erasure-code-profile set $profile > $dir/out 2>&1 ; then
+        return 1
+    fi
+    cat $dir/out
+    grep 'must be a JSON object' $dir/out || return 1
+}
+
+# A default profile given to the mon as JSON must be reflected by
+# `osd erasure-code-profile get default`.
+function TEST_format_json() {
+    local dir=$1
+
+    # osd_pool_default_erasure-code-profile is JSON
+    expected='"plugin":"isa"'
+    run_mon $dir a --osd_pool_default_erasure-code-profile "{$expected}" || return 1
+    if ! ceph --format json osd erasure-code-profile get default | grep "$expected" ; then
+        return 1
+    fi
+}
+
+# A default profile given to the mon as plain key=value text must be
+# reflected by `osd erasure-code-profile get default`.
+function TEST_format_plain() {
+    local dir=$1
+
+    # osd_pool_default_erasure-code-profile is plain text
+    expected='"plugin":"isa"'
+    run_mon $dir a --osd_pool_default_erasure-code-profile "plugin=isa" || return 1
+    if ! ceph --format json osd erasure-code-profile get default | grep "$expected" ; then
+        return 1
+    fi
+}
+
+# Setting a profile with k=1 must be refused with the expected message by
+# the lrc, isa (when available) and jerasure plugins.
+function TEST_profile_k_sanity() {
+    local dir=$1
+    local profile=profile-sanity
+
+    run_mon $dir a || return 1
+
+    # lrc
+    expect_failure $dir 'k must be a multiple of (k + m) / l' \
+        ceph osd erasure-code-profile set $profile plugin=lrc l=1 k=1 m=1 || return 1
+
+    # isa, only when the plugin is available
+    if ! erasure_code_plugin_exists isa ; then
+        echo "SKIP because plugin isa has not been built"
+    else
+        expect_failure $dir 'k=1 must be >= 2' \
+            ceph osd erasure-code-profile set $profile plugin=isa k=1 m=1 || return 1
+    fi
+
+    # jerasure
+    expect_failure $dir 'k=1 must be >= 2' \
+        ceph osd erasure-code-profile set $profile plugin=jerasure k=1 m=1 || return 1
+}
+
+main osd-erasure-code-profile "$@"
+
+# Local Variables:
+# compile-command: "cd ../.. ; make -j4 && test/mon/osd-erasure-code-profile.sh"
+# End:
--- /dev/null
+#!/bin/bash
+#
+# Copyright (C) 2013, 2014 Cloudwatt <libre.licensing@cloudwatt.com>
+# Copyright (C) 2014, 2015 Red Hat <contact@redhat.com>
+#
+# Author: Loic Dachary <loic@dachary.org>
+#
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU Library Public License as published by
+# the Free Software Foundation; either version 2, or (at your option)
+# any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU Library Public License for more details.
+#
+source $CEPH_ROOT/qa/standalone/ceph-helpers.sh
+
+# Export the mon address and common ceph arguments, then run the TEST_*
+# functions named in the arguments (all of them by default), each between
+# setup and teardown. Returns 1 as soon as any step fails.
+function run() {
+    local dir=$1
+    shift
+
+    export CEPH_MON="127.0.0.1:7105" # git grep '\<7105\>' : there must be only one
+    export CEPH_ARGS
+    CEPH_ARGS+="--fsid=$(uuidgen) --auth-supported=none --mon-host=$CEPH_MON "
+
+    local selected=${@:-$(set | sed -n -e 's/^\(TEST_[0-9a-z_]*\) .*/\1/p')}
+    for func in $selected ; do
+        setup $dir && $func $dir && teardown $dir || return 1
+    done
+}
+
+# Creating an erasure pool with an unknown profile must fail and must not
+# create that profile as a side effect.
+# Before http://tracker.ceph.com/issues/8307 the invalid profile was created
+function TEST_erasure_invalid_profile() {
+    local dir=$1
+    run_mon $dir a || return 1
+    local poolname=pool_erasure
+    local notaprofile=not-a-valid-erasure-code-profile
+    if ceph osd pool create $poolname 12 12 erasure $notaprofile ; then
+        return 1
+    fi
+    if ceph osd erasure-code-profile ls | grep $notaprofile ; then
+        return 1
+    fi
+}
+
+# Erasure pools and crush rules: an explicitly named rule is used by the
+# pool, a rule named after the pool is created when none is given, and an
+# unknown rule name is an error.
+function TEST_erasure_crush_rule() {
+    local dir=$1
+    run_mon $dir a || return 1
+
+    # choose the crush ruleset used with an erasure coded pool
+    local crush_ruleset=myruleset
+    if ceph osd crush rule ls | grep $crush_ruleset ; then
+        return 1
+    fi
+    ceph osd crush rule create-erasure $crush_ruleset
+    ceph osd crush rule ls | grep $crush_ruleset
+    local poolname
+    poolname=pool_erasure1
+    if ceph --format json osd dump | grep '"crush_rule":1' ; then
+        return 1
+    fi
+    ceph osd pool create $poolname 12 12 erasure default $crush_ruleset
+    ceph --format json osd dump | grep '"crush_rule":1' || return 1
+
+    # a crush ruleset by the name of the pool is implicitly created
+    poolname=pool_erasure2
+    ceph osd erasure-code-profile set myprofile
+    ceph osd pool create $poolname 12 12 erasure myprofile
+    ceph osd crush rule ls | grep $poolname || return 1
+
+    # a non existent crush ruleset given in argument is an error
+    # http://tracker.ceph.com/issues/9304
+    poolname=pool_erasure3
+    if ceph osd pool create $poolname 12 12 erasure myprofile INVALIDRULESET ; then
+        return 1
+    fi
+}
+
+# After the default profile was removed, creating an erasure pool that names
+# it must bring it back.
+function TEST_erasure_code_profile_default() {
+    local dir=$1
+    # the pool name was never defined here, which left the create command
+    # below without its first argument
+    local poolname=pool_erasure
+    run_mon $dir a || return 1
+    ceph osd erasure-code-profile rm default || return 1
+    ! ceph osd erasure-code-profile ls | grep default || return 1
+    ceph osd pool create $poolname 12 12 erasure default
+    ceph osd erasure-code-profile ls | grep default || return 1
+}
+
+# The stripe_width of an erasure pool created with the default settings must
+# be osd_pool_erasure_code_stripe_unit multiplied by k of the profile used.
+function TEST_erasure_crush_stripe_unit() {
+    local dir=$1
+    # the default stripe unit is used to initialize the pool
+    run_mon $dir a --public-addr $CEPH_MON
+    local stripe_unit=$(ceph-conf --show-config-value osd_pool_erasure_code_stripe_unit)
+    # the pool below does not name a profile, so k is read from the default
+    # profile; myprofile is never created in this test
+    local k=$(ceph osd erasure-code-profile get default | sed -n -e 's/^k=//p')
+    # without both values the expected width cannot be computed
+    [ -n "$stripe_unit" ] && [ -n "$k" ] || return 1
+    # no blanks around '=': with them this is a command, not an assignment
+    local stripe_width=$((stripe_unit * k))
+    ceph osd pool create pool_erasure 12 12 erasure
+    ceph --format json osd dump | tee $dir/osd.json
+    grep '"stripe_width":'$stripe_width $dir/osd.json > /dev/null || return 1
+}
+
+# A stripe unit of 2047 given to the mon must end up as a stripe_width of
+# 2048 * k for a pool created with the default jerasure profile.
+function TEST_erasure_crush_stripe_unit_padded() {
+    local dir=$1
+    # setting osd_pool_erasure_code_stripe_unit modifies the stripe_width
+    # and it is padded as required by the default plugin
+    # every variable is local: appending to a global profile would pick up
+    # whatever an earlier caller left in it
+    local profile=""
+    local k=4
+    profile+=" plugin=jerasure"
+    profile+=" technique=reed_sol_van"
+    profile+=" k=$k"
+    profile+=" m=2"
+    local actual_stripe_unit=2048
+    local desired_stripe_unit=$((actual_stripe_unit - 1))
+    local actual_stripe_width=$((actual_stripe_unit * k))
+    run_mon $dir a \
+        --osd_pool_erasure_code_stripe_unit $desired_stripe_unit \
+        --osd_pool_default_erasure_code_profile "$profile" || return 1
+    ceph osd pool create pool_erasure 12 12 erasure
+    ceph osd dump | tee $dir/osd.json
+    grep "stripe_width $actual_stripe_width" $dir/osd.json > /dev/null || return 1
+}
+
+# An erasure pool created without naming a profile uses the default profile;
+# creating it again reports "already exists", and creating it again without
+# the erasure type is refused.
+function TEST_erasure_code_pool() {
+    local dir=$1
+    local dump=$dir/osd.json
+    local expected='"erasure_code_profile":"default"'
+
+    run_mon $dir a || return 1
+
+    # no pool refers to the default profile yet
+    ceph --format json osd dump > $dump
+    if grep "$expected" $dump ; then
+        return 1
+    fi
+    ceph osd pool create erasurecodes 12 12 erasure
+    ceph --format json osd dump | tee $dump
+    grep "$expected" $dump > /dev/null || return 1
+
+    ceph osd pool create erasurecodes 12 12 erasure 2>&1 | grep 'already exists' || return 1
+    ceph osd pool create erasurecodes 12 12 2>&1 | grep 'cannot change to type replicated' || return 1
+}
+
+# Create a replicated pool that names a crush rule and compare the pool's
+# crush_rule with the id of that rule; naming a rule that does not exist
+# must be reported with "doesn't exist".
+function TEST_replicated_pool_with_ruleset() {
+ local dir=$1
+ run_mon $dir a
+ local ruleset=ruleset0
+ local root=host1
+ ceph osd crush add-bucket $root host
+ local failure_domain=osd
+ local poolname=mypool
+ ceph osd crush rule create-simple $ruleset $root $failure_domain || return 1
+ ceph osd crush rule ls | grep $ruleset
+ ceph osd pool create $poolname 12 12 replicated $ruleset || return 1
+ # NOTE(review): the separator is the character class [ :,] and $4 depends
+ # on how the dump is indented; if it comes out empty the grep below accepts
+ # any crush_rule value — confirm that rule_id is really populated
+ rule_id=`ceph osd crush rule dump $ruleset | grep "rule_id" | awk -F[' ':,] '{print $4}'`
+ ceph osd pool get $poolname crush_rule 2>&1 | \
+ grep "crush_rule: $rule_id" || return 1
+ #non-existent crush ruleset
+ ceph osd pool create newpool 12 12 replicated non-existent 2>&1 | \
+ grep "doesn't exist" || return 1
+}
+
+function TEST_erasure_code_pool_lrc() {
+    # Define an lrc erasure code profile, create a pool with it and check
+    # that the OSD map dump and the crush rule list show the result.
+    #   $1 - test directory
+    local dir=$1
+    local poolname=erasurecodes
+    local expected='"erasure_code_profile":"LRCprofile"'
+
+    if ! run_mon $dir a ; then
+        return 1
+    fi
+
+    if ! ceph osd erasure-code-profile set LRCprofile \
+            plugin=lrc \
+            mapping=DD_ \
+            layers='[ [ "DDc", "" ] ]' ; then
+        return 1
+    fi
+
+    # The profile must not be referenced by any pool yet.
+    ceph --format json osd dump > $dir/osd.json
+    if grep "$expected" $dir/osd.json ; then
+        return 1
+    fi
+
+    ceph osd pool create $poolname 12 12 erasure LRCprofile
+    ceph --format json osd dump | tee $dir/osd.json
+    if ! grep "$expected" $dir/osd.json > /dev/null ; then
+        return 1
+    fi
+    # A crush rule named after the pool is expected to be listed.
+    if ! ceph osd crush rule ls | grep $poolname ; then
+        return 1
+    fi
+}
+
+function TEST_replicated_pool() {
+    # Exercise the different spellings of "pool create" for replicated pools
+    # and the refusal to turn an existing pool into an erasure coded one.
+    #   $1 - test directory
+    local dir=$1
+
+    run_mon $dir a &&
+        ceph osd pool create replicated 12 12 replicated replicated_rule ||
+        return 1
+
+    if ! ceph osd pool create replicated 12 12 replicated replicated_rule 2>&1 | grep 'already exists' ; then
+        return 1
+    fi
+
+    # default is replicated
+    if ! ceph osd pool create replicated1 12 12 ; then
+        return 1
+    fi
+    # default is replicated, pgp_num = pg_num
+    if ! ceph osd pool create replicated2 12 ; then
+        return 1
+    fi
+
+    if ! ceph osd pool create replicated 12 12 erasure 2>&1 | grep 'cannot change to type erasure' ; then
+        return 1
+    fi
+}
+
+function TEST_no_pool_delete() {
+    # Pool deletion must fail while mon-allow-pool-delete is switched off
+    # and succeed once it is switched back on.
+    #   $1 - test directory
+    local dir=$1
+
+    run_mon $dir a &&
+        ceph osd pool create foo 1 &&
+        ceph tell mon.a injectargs -- --no-mon-allow-pool-delete ||
+        return 1
+
+    # With deletion disallowed, a successful delete is the failure case.
+    if ceph osd pool delete foo foo --yes-i-really-really-mean-it ; then
+        return 1
+    fi
+
+    ceph tell mon.a injectargs -- --mon-allow-pool-delete &&
+        ceph osd pool delete foo foo --yes-i-really-really-mean-it ||
+        return 1
+}
+
+function TEST_utf8_cli() {
+    # Create, list, dump and delete a pool whose name is a non-ASCII
+    # character. The function's status is that of the final delete.
+    #   $1 - test directory
+    local dir=$1
+    if ! run_mon $dir a ; then
+        return 1
+    fi
+    # Hopefully it's safe to include literal UTF-8 characters to test
+    # the fix for http://tracker.ceph.com/issues/7387. If it turns out
+    # to not be OK (when is the default encoding *not* UTF-8?), maybe
+    # the character '黄' can be replaced with the escape $'\xe9\xbb\x84'
+    if ! ceph osd pool create 黄 1024 ; then
+        return 1
+    fi
+    if ! ceph osd lspools 2>&1 | grep "黄" ; then
+        return 1
+    fi
+    # The JSON dump has to be parseable with the pool name in it.
+    if ! ceph -f json-pretty osd dump | python -c "import json; import sys; json.load(sys.stdin)" ; then
+        return 1
+    fi
+    ceph osd pool delete 黄 黄 --yes-i-really-really-mean-it
+}
+
+main osd-pool-create "$@"
+
+# Local Variables:
+# compile-command: "cd ../.. ; make -j4 && test/mon/osd-pool-create.sh"
+# End:
--- /dev/null
+#!/bin/bash
+
+#
+# Generic pool quota test
+#
+
+# Includes
+
+
+source $CEPH_ROOT/qa/standalone/ceph-helpers.sh
+
+function run() {
+    # Export the monitor address and the common cluster arguments, then
+    # execute the TEST_* functions named in the arguments (every TEST_*
+    # function defined in the shell when none is named).
+    #   $1   - test directory
+    #   $2.. - optional list of test functions
+    local dir=$1
+    shift
+
+    export CEPH_MON="127.0.0.1:17108" # git grep '\<17108\>' : there must be only one
+    export CEPH_ARGS
+    CEPH_ARGS+="--fsid=$(uuidgen) --auth-supported=none --mon-host=$CEPH_MON "
+
+    local funcs=${@:-$(set | sed -n -e 's/^\(TEST_[0-9a-z_]*\) .*/\1/p')}
+    for func in $funcs ; do
+        if ! $func $dir ; then
+            return 1
+        fi
+    done
+}
+
+function TEST_pool_quota() {
+    # Check that "ceph df detail" shows N/A for a pool without quotas and
+    # the configured numbers once max_objects / max_bytes are set.
+    #   $1 - test directory
+    local dir=$1
+    setup $dir || return 1
+
+    run_mon $dir a || return 1
+    run_osd $dir 0 || return 1
+    run_osd $dir 1 || return 1
+    run_osd $dir 2 || return 1
+
+    local poolname=testquoa
+    ceph osd pool create $poolname 20
+    local objects=$(ceph df detail | grep -w $poolname | awk '{print $3}')
+    local bytes=$(ceph df detail | grep -w $poolname | awk '{print $4}')
+
+    echo $objects
+    echo $bytes
+    # The operands are quoted on purpose: if the pool line is missing the
+    # values are empty, and an unquoted "[ != 'N/A' ]" is a syntax error that
+    # evaluates as false, which would let the test pass without checking
+    # anything.
+    if [ "$objects" != 'N/A' ] || [ "$bytes" != 'N/A' ] ;
+    then
+        return 1
+    fi
+
+    ceph osd pool set-quota $poolname max_objects 1000
+    ceph osd pool set-quota $poolname max_bytes 1024
+
+    objects=$(ceph df detail | grep -w $poolname | awk '{print $3}')
+    bytes=$(ceph df detail | grep -w $poolname | awk '{print $4}')
+
+    if [ "$objects" != '1000' ] || [ "$bytes" != '1024' ] ;
+    then
+        return 1
+    fi
+
+    ceph osd pool delete $poolname $poolname --yes-i-really-really-mean-it
+    teardown $dir || return 1
+}
+
+main testpoolquota
--- /dev/null
+#!/bin/bash
+#
+# Copyright (C) 2014 Cloudwatt <libre.licensing@cloudwatt.com>
+# Copyright (C) 2014, 2015 Red Hat <contact@redhat.com>
+#
+# Author: Loic Dachary <loic@dachary.org>
+#
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU Library Public License as published by
+# the Free Software Foundation; either version 2, or (at your option)
+# any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU Library Public License for more details.
+#
+
+source $CEPH_ROOT/qa/standalone/ceph-helpers.sh
+
+function run() {
+    # Export the monitor address and the common cluster arguments, then run
+    # each selected TEST_* function between a setup and a teardown of the
+    # test directory.
+    #   $1   - test directory
+    #   $2.. - optional list of test functions (default: all TEST_* functions)
+    local dir=$1
+    shift
+
+    export CEPH_MON="127.0.0.1:7106" # git grep '\<7106\>' : there must be only one
+    export CEPH_ARGS
+    CEPH_ARGS+="--fsid=$(uuidgen) --auth-supported=none --mon-host=$CEPH_MON "
+
+    local funcs=${@:-$(set | sed -n -e 's/^\(TEST_[0-9a-z_]*\) .*/\1/p')}
+    for func in $funcs ; do
+        # Any failing step aborts the whole run.
+        setup $dir && $func $dir && teardown $dir || return 1
+    done
+}
+
+function TEST_bench() {
+    # Check that "ceph tell osd.N bench" rejects requests exceeding the
+    # configured limits and accepts the defaults. The function's status is
+    # that of the last bench command.
+    #   $1 - test directory
+    local dir=$1
+
+    run_mon $dir a && run_mgr $dir x && run_osd $dir 0 || return 1
+
+    # Limits are read with CEPH_ARGS cleared for the ceph-conf call.
+    local osd_bench_small_size_max_iops
+    local osd_bench_large_size_max_throughput
+    local osd_bench_max_block_size
+    local osd_bench_duration
+    osd_bench_small_size_max_iops=$(CEPH_ARGS='' ceph-conf \
+        --show-config-value osd_bench_small_size_max_iops)
+    osd_bench_large_size_max_throughput=$(CEPH_ARGS='' ceph-conf \
+        --show-config-value osd_bench_large_size_max_throughput)
+    osd_bench_max_block_size=$(CEPH_ARGS='' ceph-conf \
+        --show-config-value osd_bench_max_block_size)
+    osd_bench_duration=$(CEPH_ARGS='' ceph-conf \
+        --show-config-value osd_bench_duration)
+
+    #
+    # block size too high
+    #
+    if ! expect_failure $dir osd_bench_max_block_size \
+            ceph tell osd.0 bench 1024 $((osd_bench_max_block_size + 1)) ; then
+        return 1
+    fi
+
+    #
+    # count too high for small (< 1MB) block sizes
+    #
+    local bsize=1024
+    local max_count=$(($bsize * $osd_bench_duration * $osd_bench_small_size_max_iops))
+    if ! expect_failure $dir bench_small_size_max_iops \
+            ceph tell osd.0 bench $(($max_count + 1)) $bsize ; then
+        return 1
+    fi
+
+    #
+    # count too high for large (>= 1MB) block sizes
+    #
+    local bsize=$((1024 * 1024 + 1))
+    local max_count=$(($osd_bench_large_size_max_throughput * $osd_bench_duration))
+    if ! expect_failure $dir osd_bench_large_size_max_throughput \
+            ceph tell osd.0 bench $(($max_count + 1)) $bsize ; then
+        return 1
+    fi
+
+    #
+    # default values should work
+    #
+    if ! ceph tell osd.0 bench ; then
+        return 1
+    fi
+
+    #
+    # test object_size < block_size
+    # (the result of this call is not checked)
+    #
+    ceph tell osd.0 bench 10 14456 4444 3
+
+    #
+    # test object_size < block_size & object_size = 0(default value)
+    #
+    ceph tell osd.0 bench 1 14456
+}
+
+main osd-bench "$@"
+
+# Local Variables:
+# compile-command: "cd ../.. ; make -j4 && test/osd/osd-bench.sh"
+# End:
--- /dev/null
+#!/bin/bash
+#
+# Copyright (C) 2014 Cloudwatt <libre.licensing@cloudwatt.com>
+# Copyright (C) 2014, 2015 Red Hat <contact@redhat.com>
+#
+# Author: Loic Dachary <loic@dachary.org>
+#
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU Library Public License as published by
+# the Free Software Foundation; either version 2, or (at your option)
+# any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU Library Public License for more details.
+#
+
+source $CEPH_ROOT/qa/standalone/ceph-helpers.sh
+
+function run() {
+    # Export the monitor address and the common cluster arguments, then run
+    # each selected TEST_* function between a setup and a teardown of the
+    # test directory.
+    #   $1   - test directory
+    #   $2.. - optional list of test functions (default: all TEST_* functions)
+    local dir=$1
+    shift
+
+    export CEPH_MON="127.0.0.1:7100" # git grep '\<7100\>' : there must be only one
+    export CEPH_ARGS
+    CEPH_ARGS+="--fsid=$(uuidgen) --auth-supported=none --mon-host=$CEPH_MON "
+
+    local funcs=${@:-$(set | sed -n -e 's/^\(TEST_[0-9a-z_]*\) .*/\1/p')}
+    for func in $funcs ; do
+        if ! setup $dir ; then
+            return 1
+        fi
+        if ! $func $dir ; then
+            return 1
+        fi
+        if ! teardown $dir ; then
+            return 1
+        fi
+    done
+}
+
+function TEST_config_init() {
+    # Start osd.0 with a map cache size smaller than osd-map-max-advance and
+    # osd-pg-epoch-persisted-max-stale, then look for both
+    # "is not > ..." messages in its log.
+    #   $1 - test directory
+    local dir=$1
+    local advance=1000
+    local stale=1000
+    local cache=500
+
+    run_mon $dir a && run_mgr $dir x || return 1
+
+    if ! run_osd $dir 0 \
+            --osd-map-max-advance $advance \
+            --osd-map-cache-size $cache \
+            --osd-pg-epoch-persisted-max-stale $stale ; then
+        return 1
+    fi
+
+    # Flush the daemon's log through its admin socket before reading it.
+    if ! CEPH_ARGS='' ceph --admin-daemon $(get_asok_path osd.0) log flush ; then
+        return 1
+    fi
+    if ! grep 'is not > osd_map_max_advance' $dir/osd.0.log ; then
+        return 1
+    fi
+    if ! grep 'is not > osd_pg_epoch_persisted_max_stale' $dir/osd.0.log ; then
+        return 1
+    fi
+}
+
+function TEST_config_track() {
+ # Inject configuration changes into a running osd.0 and check, through
+ # $dir/osd.0.log, whether each change does or does not produce the
+ # corresponding "is not > ..." message.
+ #   $1 - test directory
+ local dir=$1
+
+ run_mon $dir a || return 1
+ run_mgr $dir x || return 1
+ run_osd $dir 0 || return 1
+
+ # Read the configured values with CEPH_ARGS cleared for the ceph-conf call.
+ # NOTE(review): osd_pg_epoch_persisted_max_stale is read here but not
+ # referenced again in this function.
+ local osd_map_cache_size=$(CEPH_ARGS='' ceph-conf \
+ --show-config-value osd_map_cache_size)
+ local osd_map_max_advance=$(CEPH_ARGS='' ceph-conf \
+ --show-config-value osd_map_max_advance)
+ local osd_pg_epoch_persisted_max_stale=$(CEPH_ARGS='' ceph-conf \
+ --show-config-value osd_pg_epoch_persisted_max_stale)
+ #
+ # lower cache_size under max_advance to trigger the warning
+ #
+ # Precondition: the message is not in the log yet.
+ ! grep 'is not > osd_map_max_advance' $dir/osd.0.log || return 1
+ local cache=$(($osd_map_max_advance / 2))
+ ceph tell osd.0 injectargs "--osd-map-cache-size $cache" || return 1
+ # Flush the log through the admin socket before grepping it.
+ CEPH_ARGS='' ceph --admin-daemon $(get_asok_path osd.0) log flush || return 1
+ grep 'is not > osd_map_max_advance' $dir/osd.0.log || return 1
+ # Remove the log and ask the daemon to reopen it, so the next stanza's
+ # precondition is checked against a new file.
+ rm $dir/osd.0.log
+ CEPH_ARGS='' ceph --admin-daemon $(get_asok_path osd.0) log reopen || return 1
+
+ #
+ # reset cache_size to the default and assert that it does not trigger the warning
+ #
+ ! grep 'is not > osd_map_max_advance' $dir/osd.0.log || return 1
+ local cache=$osd_map_cache_size
+ ceph tell osd.0 injectargs "--osd-map-cache-size $cache" || return 1
+ CEPH_ARGS='' ceph --admin-daemon $(get_asok_path osd.0) log flush || return 1
+ # This time the message must stay absent after the change.
+ ! grep 'is not > osd_map_max_advance' $dir/osd.0.log || return 1
+ rm $dir/osd.0.log
+ CEPH_ARGS='' ceph --admin-daemon $(get_asok_path osd.0) log reopen || return 1
+
+ #
+ # increase the osd_map_max_advance above the default cache_size
+ #
+ ! grep 'is not > osd_map_max_advance' $dir/osd.0.log || return 1
+ local advance=$(($osd_map_cache_size * 2))
+ ceph tell osd.0 injectargs "--osd-map-max-advance $advance" || return 1
+ CEPH_ARGS='' ceph --admin-daemon $(get_asok_path osd.0) log flush || return 1
+ grep 'is not > osd_map_max_advance' $dir/osd.0.log || return 1
+ rm $dir/osd.0.log
+ CEPH_ARGS='' ceph --admin-daemon $(get_asok_path osd.0) log reopen || return 1
+
+ #
+ # increase the osd_pg_epoch_persisted_max_stale above the default cache_size
+ #
+ ! grep 'is not > osd_pg_epoch_persisted_max_stale' $dir/osd.0.log || return 1
+ local stale=$(($osd_map_cache_size * 2))
+ ceph tell osd.0 injectargs "--osd-pg-epoch-persisted-max-stale $stale" || return 1
+ CEPH_ARGS='' ceph --admin-daemon $(get_asok_path osd.0) log flush || return 1
+ grep 'is not > osd_pg_epoch_persisted_max_stale' $dir/osd.0.log || return 1
+ rm $dir/osd.0.log
+ CEPH_ARGS='' ceph --admin-daemon $(get_asok_path osd.0) log reopen || return 1
+}
+
+main osd-config "$@"
+
+# Local Variables:
+# compile-command: "cd ../.. ; make -j4 && test/osd/osd-config.sh"
+# End:
--- /dev/null
+#!/bin/bash
+#
+# Copyright (C) 2014 Cloudwatt <libre.licensing@cloudwatt.com>
+# Copyright (C) 2014, 2015 Red Hat <contact@redhat.com>
+#
+# Author: Loic Dachary <loic@dachary.org>
+# Author: Sage Weil <sage@redhat.com>
+#
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU Library Public License as published by
+# the Free Software Foundation; either version 2, or (at your option)
+# any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU Library Public License for more details.
+#
+
+source $CEPH_ROOT/qa/standalone/ceph-helpers.sh
+
+function run() {
+    # Export the monitor address and the common cluster arguments, then run
+    # each selected TEST_* function between a setup and a teardown of the
+    # test directory.
+    #   $1   - test directory
+    #   $2.. - optional list of test functions (default: all TEST_* functions)
+    local dir=$1
+    shift
+
+    export CEPH_MON="127.0.0.1:7111" # git grep '\<7111\>' : there must be only one
+    export CEPH_ARGS
+    CEPH_ARGS+="--fsid=$(uuidgen) --auth-supported=none --mon-host=$CEPH_MON "
+
+    local funcs=${@:-$(set | sed -n -e 's/^\(TEST_[0-9a-z_]*\) .*/\1/p')}
+    for func in $funcs ; do
+        # Any failing step aborts the whole run.
+        setup $dir && $func $dir && teardown $dir || return 1
+    done
+}
+
+function TEST_copy_from() {
+    # Copy an object within a pool, first normally, then with the OSDs told
+    # to inject copy-from errors, then normally again. Every step is checked;
+    # previously only the very last "stat" decided the outcome.
+    #   $1 - test directory
+    local dir=$1
+
+    run_mon $dir a || return 1
+    run_mgr $dir x || return 1
+    run_osd $dir 0 || return 1
+    run_osd $dir 1 || return 1
+    create_rbd_pool || return 1
+
+    # success
+    rados -p rbd put foo $(which rados) || return 1
+    rados -p rbd cp foo foo2 || return 1
+    rados -p rbd stat foo2 || return 1
+
+    # failure: with error injection on, the copy must fail and leave no object
+    ceph tell osd.\* injectargs -- --osd-debug-inject-copyfrom-error
+    ! rados -p rbd cp foo foo3 || return 1
+    ! rados -p rbd stat foo3 || return 1
+
+    # success again: with injection off the copy is expected to succeed, so
+    # it is no longer negated
+    ceph tell osd.\* injectargs -- --no-osd-debug-inject-copyfrom-error
+    rados -p rbd cp foo foo3 || return 1
+    rados -p rbd stat foo3 || return 1
+}
+
+main osd-copy-from "$@"
+
+# Local Variables:
+# compile-command: "cd ../.. ; make -j4 && test/osd/osd-copy-from.sh"
+# End:
--- /dev/null
+#!/bin/bash
+
+source $CEPH_ROOT/qa/standalone/ceph-helpers.sh
+
+function run() {
+    # Export the monitor address and the common cluster arguments (plus the
+    # filestore throttle limits below), then run each selected TEST_*
+    # function between a setup and a teardown of the test directory.
+    #   $1   - test directory
+    #   $2.. - optional list of test functions (default: all TEST_* functions)
+    local dir=$1
+    shift
+
+    export CEPH_MON="127.0.0.1:7146" # git grep '\<7146\>' : there must be only one
+    export CEPH_ARGS
+    CEPH_ARGS+="--fsid=$(uuidgen) --auth-supported=none --mon-host=$CEPH_MON "
+    # avoid running out of fds in rados bench
+    CEPH_ARGS+="--filestore_wbthrottle_xfs_ios_hard_limit=900 --filestore_wbthrottle_btrfs_ios_hard_limit=900 "
+
+    local funcs=${@:-$(set | sed -n -e 's/^\(TEST_[0-9a-z_]*\) .*/\1/p')}
+    for func in $funcs ; do
+        setup $dir && $func $dir && teardown $dir || return 1
+    done
+}
+
+function TEST_filestore_to_bluestore() {
+    # Write objects to a three OSD cluster, stop osd.0, duplicate its data
+    # directory into a new bluestore directory with ceph-objectstore-tool,
+    # restart osd.0 on the copy and wait for the cluster to become clean.
+    #   $1 - test directory
+    local dir=$1
+
+    # "ulimit -n" may print "unlimited", which is not a valid -lt operand.
+    local flimit=$(ulimit -n)
+    if [ "$flimit" != "unlimited" ] && [ "$flimit" -lt 1536 ]; then
+        echo "Low open file limit ($flimit), test may fail. Increase to 1536 or higher and retry if that happens."
+    fi
+
+    run_mon $dir a || return 1
+    run_mgr $dir x || return 1
+    run_osd $dir 0 || return 1
+    local osd_pid=$(cat $dir/osd.0.pid)
+    run_osd $dir 1 || return 1
+    run_osd $dir 2 || return 1
+
+    sleep 5
+
+    ceph osd pool create foo 16
+
+    # write some objects
+    rados bench -p foo 10 write -b 4096 --no-cleanup || return 1
+
+    # kill: keep signalling until the process is gone (kill then fails)
+    while kill $osd_pid; do sleep 1 ; done
+    ceph osd down 0
+
+    mv $dir/0 $dir/0.old || return 1
+    mkdir $dir/0 || return 1
+    local ofsid=$(cat $dir/0.old/fsid)
+    echo "osd fsid $ofsid"
+    # The extra logging options apply only to the two tool invocations.
+    # Passing them through the command environment leaves CEPH_ARGS
+    # untouched for the rest of the script, including when one of the
+    # invocations fails and the function returns early.
+    local cot_args="${CEPH_ARGS}--log-file $dir/cot.log --log-max-recent 0 "
+    CEPH_ARGS="$cot_args" ceph-objectstore-tool --type bluestore \
+        --data-path $dir/0 --fsid $ofsid \
+        --op mkfs || return 1
+    CEPH_ARGS="$cot_args" ceph-objectstore-tool --data-path $dir/0.old \
+        --target-data-path $dir/0 \
+        --op dup || return 1
+
+    run_osd_bluestore $dir 0 || return 1
+
+    while ! ceph osd stat | grep '3 up' ; do sleep 1 ; done
+    ceph osd metadata 0 | grep bluestore || return 1
+
+    ceph osd scrub 0
+
+    # give it some time
+    sleep 15
+    # and make sure mon is sync'ed
+    flush_pg_stats
+
+    wait_for_clean || return 1
+}
+
+main osd-dup "$@"
+
+# Local Variables:
+# compile-command: "cd ../.. ; make -j4 && test/osd/osd-dup.sh"
+# End:
--- /dev/null
+#!/bin/bash
+#
+# Copyright (C) 2016 Piotr Dałek <git@predictor.org.pl>
+# Copyright (C) 2014, 2015 Red Hat <contact@redhat.com>
+#
+# Author: Piotr Dałek <git@predictor.org.pl>
+#
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU Library Public License as published by
+# the Free Software Foundation; either version 2, or (at your option)
+# any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU Library Public License for more details.
+#
+
+source $CEPH_ROOT/qa/standalone/ceph-helpers.sh
+MAX_PROPAGATION_TIME=30
+
+function run() {
+    # Run test_fast_kill three times: once with fast failure detection
+    # disabled (where it is expected to fail), then with it enabled for the
+    # simple and the async messenger (where it must succeed).
+    #   $1 - test directory
+    local dir=$1
+    shift
+    rm -f $dir/*.pid
+    export CEPH_MON="127.0.0.1:7126" # git grep '\<7126\>' : there must be only one
+    export CEPH_ARGS
+    CEPH_ARGS+="--fsid=$(uuidgen) --auth-supported=none "
+    CEPH_ARGS+="--mon-host=$CEPH_MON "
+    #
+    # Disable device auto class feature for this testing,
+    # as it will automatically make root clones based on new class types
+    # and hence affect the down osd counting.
+    # E.g.,
+    #
+    # ID WEIGHT TYPE NAME UP/DOWN REWEIGHT PRIMARY-AFFINITY
+    # -4 3.00000 root default~hdd
+    # -3 3.00000 host gitbuilder-ceph-rpm-centos7-amd64-basic~hdd
+    # 0 1.00000 osd.0 down 1.00000 1.00000
+    # 1 1.00000 osd.1 up 1.00000 1.00000
+    # 2 1.00000 osd.2 up 1.00000 1.00000
+    # -1 3.00000 root default
+    # -2 3.00000 host gitbuilder-ceph-rpm-centos7-amd64-basic
+    # 0 1.00000 osd.0 down 1.00000 1.00000
+    # 1 1.00000 osd.1 up 1.00000 1.00000
+    # 2 1.00000 osd.2 up 1.00000 1.00000
+    #
+    CEPH_ARGS+="--osd-class-update-on-start=false "
+
+    OLD_ARGS=$CEPH_ARGS
+    CEPH_ARGS+="--osd-fast-fail-on-connection-refused=false "
+    echo "Ensuring old behavior is there..."
+    # This must be a plain "if": a "return" placed inside a ( ... ) subshell
+    # only ends the subshell, so the failure would never leave run().
+    if test_fast_kill $dir ; then
+        echo "OSDs died too early! Old behavior doesn't work."
+        return 1
+    fi
+
+    CEPH_ARGS=$OLD_ARGS"--osd-fast-fail-on-connection-refused=true "
+    OLD_ARGS=$CEPH_ARGS
+
+    CEPH_ARGS+="--ms_type=simple"
+    echo "Testing simple msgr..."
+    test_fast_kill $dir || return 1
+
+    CEPH_ARGS=$OLD_ARGS"--ms_type=async"
+    echo "Testing async msgr..."
+    test_fast_kill $dir || return 1
+
+    return 0
+
+}
+
+function test_fast_kill() {
+    # Start a three OSD cluster, kill two OSDs one after the other with
+    # SIGKILL and check that each is reported down within
+    # MAX_PROPAGATION_TIME polling rounds.
+    #   $1 - test directory (falls back to the caller's $dir when omitted,
+    #        which is what the function silently relied on before)
+    # Returns 1 when an OSD is not marked down in time or too many are down.
+    local dir=${1:-$dir}
+    local -a pids=()
+    local oi i killid previd time_left down_osds
+
+    # create cluster with 3 osds
+    setup $dir || return 1
+    run_mon $dir a --osd_pool_default_size=3 || return 1
+    run_mgr $dir x || return 1
+    for oi in {0..2}; do
+        run_osd $dir $oi || return 1
+        pids[$oi]=$(cat $dir/osd.$oi.pid)
+    done
+
+    create_rbd_pool || return 1
+
+    # write some objects to ensure connectivity between the osds
+    rados -p rbd bench 10 write -b 4096 --max-objects 128 --no-cleanup
+    sleep 1
+
+    killid=0
+    previd=0
+
+    # kill random osd and see if after max MAX_PROPAGATION_TIME, the osd count decreased.
+    for i in {1..2}; do
+        # Pick a pid different from the one killed in the previous round.
+        while [ $killid -eq $previd ]; do
+            killid=${pids[$RANDOM%${#pids[@]}]}
+        done
+        previd=$killid
+
+        kill -9 $killid
+        time_left=$MAX_PROPAGATION_TIME
+        down_osds=0
+
+        while [ $time_left -gt 0 ]; do
+            sleep 1
+            time_left=$((time_left - 1))
+
+            # Only count down osds once some osd log mentions a refused
+            # connection.
+            if ! grep -m 1 -c -F "ms_handle_refused" $dir/osd.*.log > /dev/null ; then
+                continue
+            fi
+
+            down_osds=$(ceph osd tree | grep -c down)
+            if [ $down_osds -lt $i ]; then
+                # osds not marked down yet, try again in a second
+                continue
+            elif [ $down_osds -gt $i ]; then
+                echo Too many \($down_osds\) osds died!
+                return 1
+            else
+                break
+            fi
+        done
+
+        if [ $down_osds -lt $i ]; then
+            echo Killed the OSD, yet it is not marked down
+            ceph osd tree
+            return 1
+        fi
+    done
+    pkill -SIGTERM rados
+    teardown $dir || return 1
+}
+
+main osd-fast-mark-down "$@"
+
+# Local Variables:
+# compile-command: "cd ../.. ; make -j4 && test/osd/osd-fast-mark-down.sh"
+# End:
--- /dev/null
+#!/bin/bash
+#
+# Copyright (C) 2015 Intel <contact@intel.com>
+# Copyright (C) 2014, 2015 Red Hat <contact@redhat.com>
+#
+# Author: Xiaoxi Chen <xiaoxi.chen@intel.com>
+#
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU Library Public License as published by
+# the Free Software Foundation; either version 2, or (at your option)
+# any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU Library Public License for more details.
+#
+
+source $CEPH_ROOT/qa/standalone/ceph-helpers.sh
+
+function run() {
+    # Export the monitor address and the common cluster arguments, then run
+    # each selected TEST_* function between a setup and a teardown of the
+    # test directory.
+    #   $1   - test directory
+    #   $2.. - optional list of test functions (default: all TEST_* functions)
+    local dir=$1
+    shift
+
+    export CEPH_MON="127.0.0.1:7108" # git grep '\<7108\>' : there must be only one
+    export CEPH_ARGS
+    CEPH_ARGS+="--fsid=$(uuidgen) --auth-supported=none --mon-host=$CEPH_MON "
+
+    local funcs=${@:-$(set | sed -n -e 's/^\(TEST_[0-9a-z_]*\) .*/\1/p')}
+    for func in $funcs ; do
+        if ! setup $dir ; then
+            return 1
+        fi
+        if ! $func $dir ; then
+            return 1
+        fi
+        if ! teardown $dir ; then
+            return 1
+        fi
+    done
+}
+
+function markdown_N_impl() {
+    # Mark osd.0 down repeatedly, checking before each round that it is up.
+    #   $1 - number of times to mark the OSD down
+    #   $2 - period in seconds; accepted for the callers' sake, not used here
+    #   $3 - seconds to sleep after each mark-down
+    # Returns 1 as soon as osd.0 is not reported up at the start of a round.
+    #
+    # The variables are local so the helper no longer overwrites the
+    # identically named variables of its callers.
+    local markdown_times=$1
+    local sleeptime=$3
+    local i
+    for i in $(seq 1 $markdown_times)
+    do
+        # check the OSD is UP
+        ceph osd tree
+        ceph osd tree | grep osd.0 | grep up || return 1
+        # mark the OSD down.
+        ceph osd down 0
+        sleep $sleeptime
+    done
+}
+
+
+function TEST_markdown_exceed_maxdown_count() {
+    # Mark osd.0 down one more time than osd_max_markdown_count allows
+    # within the period and expect it to be reported down afterwards.
+    #   $1 - test directory
+    local dir=$1
+
+    run_mon $dir a &&
+        run_mgr $dir x &&
+        run_osd $dir 0 &&
+        run_osd $dir 1 &&
+        run_osd $dir 2 ||
+        return 1
+
+    # 3+1 times within 300s, osd should stay dead on the 4th time
+    local count=3
+    local sleeptime=10
+    local period=300
+    if ! ceph tell osd.0 injectargs "--osd_max_markdown_count $count" ; then
+        return 1
+    fi
+    if ! ceph tell osd.0 injectargs "--osd_max_markdown_period $period" ; then
+        return 1
+    fi
+
+    markdown_N_impl $(($count+1)) $period $sleeptime
+    # down N+1 times, osd.0 should die
+    if ! ceph osd tree | grep down | grep osd.0 ; then
+        return 1
+    fi
+}
+
+function TEST_markdown_boot() {
+    # Mark osd.0 down exactly osd_max_markdown_count times within the period
+    # and expect it to be reported up again afterwards.
+    #   $1 - test directory
+    local dir=$1
+
+    run_mon $dir a &&
+        run_mgr $dir x &&
+        run_osd $dir 0 &&
+        run_osd $dir 1 &&
+        run_osd $dir 2 ||
+        return 1
+
+    # 3 times within 120s, should stay up
+    local count=3
+    local sleeptime=10
+    local period=120
+    if ! ceph tell osd.0 injectargs "--osd_max_markdown_count $count" ; then
+        return 1
+    fi
+    if ! ceph tell osd.0 injectargs "--osd_max_markdown_period $period" ; then
+        return 1
+    fi
+
+    markdown_N_impl $count $period $sleeptime
+    # down N times, osd.0 should be up
+    sleep 15 # give osd plenty of time to notice and come back up
+    if ! ceph osd tree | grep up | grep osd.0 ; then
+        return 1
+    fi
+}
+
+function TEST_markdown_boot_exceed_time() {
+    # Mark osd.0 down more often than osd_max_markdown_count, but spread
+    # over more time than osd_max_markdown_period, and expect it to be
+    # reported up again afterwards.
+    #   $1 - test directory
+    local dir=$1
+
+    run_mon $dir a &&
+        run_mgr $dir x &&
+        run_osd $dir 0 &&
+        run_osd $dir 1 &&
+        run_osd $dir 2 ||
+        return 1
+
+    # 3+1 times, but over 40s, > 20s, so should stay up
+    local count=3
+    local period=20
+    local sleeptime=10
+    if ! ceph tell osd.0 injectargs "--osd_max_markdown_count $count" ; then
+        return 1
+    fi
+    if ! ceph tell osd.0 injectargs "--osd_max_markdown_period $period" ; then
+        return 1
+    fi
+
+    markdown_N_impl $(($count+1)) $period $sleeptime
+    sleep 15 # give osd plenty of time to notice and come back up
+    if ! ceph osd tree | grep up | grep osd.0 ; then
+        return 1
+    fi
+}
+
+main osd-markdown "$@"
+
+# Local Variables:
+# compile-command: "cd ../.. ; make -j4 && test/osd/osd-markdown.sh"
+# End:
--- /dev/null
+#!/bin/bash
+#
+# Author: Vicente Cheng <freeze.bilsted@gmail.com>
+#
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU Library Public License as published by
+# the Free Software Foundation; either version 2, or (at your option)
+# any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU Library Public License for more details.
+#
+
+source $CEPH_ROOT/qa/standalone/ceph-helpers.sh
+
+function run() {
+    # Export the monitor address and the common cluster arguments, then run
+    # each selected TEST_* function between a setup and a teardown of the
+    # test directory.
+    #   $1   - test directory
+    #   $2.. - optional list of test functions (default: all TEST_* functions)
+    local dir=$1
+    shift
+
+    export CEPH_MON="127.0.0.1:7122" # git grep '\<7122\>' : there must be only one
+    export CEPH_ARGS
+    CEPH_ARGS+="--fsid=$(uuidgen) --auth-supported=none --mon-host=$CEPH_MON "
+
+    local funcs=${@:-$(set | sed -n -e 's/^\(TEST_[0-9a-z_]*\) .*/\1/p')}
+    for func in $funcs ; do
+        # Any failing step aborts the whole run.
+        setup $dir && $func $dir && teardown $dir || return 1
+    done
+}
+
+function TEST_reactivate() {
+    # Stop osd.0, remove its "ready" and "active" marker files and activate
+    # it again.
+    #   $1 - test directory
+    local dir=$1
+
+    run_mon $dir a && run_mgr $dir x && run_osd $dir 0 || return 1
+
+    if ! kill_daemons $dir TERM osd ; then
+        return 1
+    fi
+
+    ready_path="$dir/0/ready"
+    activate_path="$dir/0/active"
+    # trigger mkfs again
+    rm -rf $ready_path $activate_path
+    if ! activate_osd $dir 0 ; then
+        return 1
+    fi
+
+}
+
+main osd-reactivate "$@"
+
+# Local Variables:
+# compile-command: "cd ../.. ; make -j4 && test/osd/osd-reactivate.sh"
+# End:
--- /dev/null
+#! /bin/bash
+#
+# Copyright (C) 2015 Red Hat <contact@redhat.com>
+#
+# Author: Loic Dachary <loic@dachary.org>
+#
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU Library Public License as published by
+# the Free Software Foundation; either version 2, or (at your option)
+# any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU Library Public License for more details.
+#
+source $CEPH_ROOT/qa/standalone/ceph-helpers.sh
+
+function run() {
+    # Export the monitor address and the common cluster arguments, then
+    # execute the TEST_* functions named in the arguments (every TEST_*
+    # function defined in the shell when none is named). Setup and teardown
+    # are left to the test functions.
+    #   $1   - test directory
+    #   $2.. - optional list of test functions
+    local dir=$1
+    shift
+
+    export CEPH_MON="127.0.0.1:7123" # git grep '\<7123\>' : there must be only one
+    export CEPH_ARGS
+    CEPH_ARGS+="--fsid=$(uuidgen) --auth-supported=none --mon-host=$CEPH_MON "
+
+    local funcs=${@:-$(set | sed -n -e 's/^\(TEST_[0-9a-z_]*\) .*/\1/p')}
+    for func in $funcs ; do
+        if ! $func $dir ; then
+            return 1
+        fi
+    done
+}
+
+function TEST_reuse_id() {
+    # Destroy osd.1 in a clean two OSD cluster and start an OSD with the
+    # same id again.
+    #   $1 - test directory
+    local dir=$1
+
+    # Bring up the cluster and wait until it is clean.
+    setup $dir &&
+        run_mon $dir a --osd_pool_default_size=1 &&
+        run_mgr $dir x &&
+        run_osd $dir 0 &&
+        run_osd $dir 1 &&
+        create_rbd_pool &&
+        wait_for_clean ||
+        return 1
+
+    # Remove osd.1, then reuse its id.
+    if ! destroy_osd $dir 1 ; then
+        return 1
+    fi
+    if ! run_osd $dir 1 ; then
+        return 1
+    fi
+}
+
+main osd-reuse-id "$@"
+
+# Local Variables:
+# compile-command: "cd ../.. ; make -j4 && test/osd/osd-reuse-id.sh"
+# End:
--- /dev/null
+#!/bin/bash -x
+#
+# Copyright (C) 2014 Red Hat <contact@redhat.com>
+#
+# Author: Loic Dachary <loic@dachary.org>
+#
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU Library Public License as published by
+# the Free Software Foundation; either version 2, or (at your option)
+# any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU Library Public License for more details.
+#
+source $CEPH_ROOT/qa/standalone/ceph-helpers.sh
+
+if [ `uname` = FreeBSD ]; then
+ # erasure coding overwrites are only tested on Bluestore
+ # erasure coding on filestore is unsafe
+ # http://docs.ceph.com/docs/master/rados/operations/erasure-code/#erasure-coding-with-overwrites
+ use_ec_overwrite=false
+else
+ use_ec_overwrite=true
+fi
+
+# Test development and debugging
+# Set to "yes" in order to ignore diff errors and save results to update test
+getjson="no"
+
+# Ignore the epoch and filter out the attr '_' value because it has date information and won't match
+jqfilter='.inconsistents | (.[].shards[].attrs[]? | select(.name == "_") | .value) |= "----Stripped-by-test----"'
+# Python snippet that re-emits the JSON read from stdin with sorted keys and
+# a two space indent. print(...) with a single argument is accepted by both
+# Python 2 and Python 3; the former statement form was Python 2 only.
+sortkeys='import json; import sys ; JSON=sys.stdin.read() ; ud = json.loads(JSON) ; print(json.dumps(ud, sort_keys=True, indent=2))'
+
+# Remove items that are not consistent across runs: the pg interval and client
+sedfilter='s/\([ ]*\"\(selected_\)*object_info\":.*head[(]\)[^[:space:]]* [^[:space:]]* \(.*\)/\1\3/'
+
+function run() {
+    # Export the monitor address and the common cluster arguments, then
+    # execute the TEST_* functions named in the arguments (every TEST_*
+    # function defined in the shell when none is named). Setup and teardown
+    # are left to the test functions.
+    #   $1   - test directory
+    #   $2.. - optional list of test functions
+    local dir=$1
+    shift
+
+    export CEPH_MON="127.0.0.1:7107" # git grep '\<7107\>' : there must be only one
+    export CEPH_ARGS
+    CEPH_ARGS+="--fsid=$(uuidgen) --auth-supported=none --mon-host=$CEPH_MON "
+
+    local funcs=${@:-$(set | sed -n -e 's/^\(TEST_[0-9a-z_]*\) .*/\1/p')}
+    for func in $funcs ; do
+        if ! $func $dir ; then
+            return 1
+        fi
+    done
+}
+
+function add_something() {
+    # Set or unset the scrub flags, then store a small payload as an object.
+    #   $1 - test directory; the payload is also kept in $1/ORIGINAL
+    #   $2 - pool name
+    #   $3 - object name (default: SOMETHING)
+    #   $4 - "noscrub" (default) sets noscrub and nodeep-scrub,
+    #        anything else unsets both
+    local dir=$1
+    local poolname=$2
+    local obj=${3:-SOMETHING}
+    local scrub=${4:-noscrub}
+
+    # Both flags are always switched the same way.
+    local flag_action
+    if [ "$scrub" = "noscrub" ]; then
+        flag_action=set
+    else
+        flag_action=unset
+    fi
+    if ! ceph osd $flag_action noscrub ; then
+        return 1
+    fi
+    if ! ceph osd $flag_action nodeep-scrub ; then
+        return 1
+    fi
+
+    local payload=ABCDEF
+    echo $payload > $dir/ORIGINAL
+    if ! rados --pool $poolname put $obj $dir/ORIGINAL ; then
+        return 1
+    fi
+}
+
+#
+# Corrupt one copy of an object in a replicated pool, first on a
+# non-primary OSD and then on the primary, repairing after each.
+#   $1 - test directory
+#
+function TEST_corrupt_and_repair_replicated() {
+    local dir=$1
+    local poolname=rbd
+
+    # Two OSD cluster with two replicas per object.
+    setup $dir &&
+        run_mon $dir a --osd_pool_default_size=2 &&
+        run_mgr $dir x &&
+        run_osd $dir 0 &&
+        run_osd $dir 1 &&
+        create_rbd_pool &&
+        wait_for_clean ||
+        return 1
+
+    if ! add_something $dir $poolname ; then
+        return 1
+    fi
+    if ! corrupt_and_repair_one $dir $poolname $(get_not_primary $poolname SOMETHING) ; then
+        return 1
+    fi
+    # Reproduces http://tracker.ceph.com/issues/8914
+    if ! corrupt_and_repair_one $dir $poolname $(get_primary $poolname SOMETHING) ; then
+        return 1
+    fi
+
+    if ! teardown $dir ; then
+        return 1
+    fi
+}
+
+function corrupt_and_repair_two() {
+    # Remove the object SOMETHING from two OSDs at once, repair its PG and
+    # check that both copies and the object content are back.
+    #   $1 - test directory (holds ORIGINAL, receives COPY)
+    #   $2 - pool name
+    #   $3 - first OSD id
+    #   $4 - second OSD id
+    # Returns the status reported by wait_background when a background
+    # objectstore_tool run failed, 1 when the final read or diff fails.
+    local dir=$1
+    local poolname=$2
+    local first=$3
+    local second=$4
+
+    #
+    # 1) remove the corresponding file from the OSDs
+    #
+    pids=""
+    run_in_background pids objectstore_tool $dir $first SOMETHING remove
+    run_in_background pids objectstore_tool $dir $second SOMETHING remove
+    wait_background pids
+    return_code=$?
+    [ $return_code -eq 0 ] || return $return_code
+
+    #
+    # 2) repair the PG
+    #
+    local pg=$(get_pg $poolname SOMETHING)
+    repair $pg
+
+    #
+    # 3) The files must be back
+    #
+    pids=""
+    run_in_background pids objectstore_tool $dir $first SOMETHING list-attrs
+    run_in_background pids objectstore_tool $dir $second SOMETHING list-attrs
+    wait_background pids
+    return_code=$?
+    [ $return_code -eq 0 ] || return $return_code
+
+    if ! rados --pool $poolname get SOMETHING $dir/COPY ; then
+        return 1
+    fi
+    if ! diff $dir/ORIGINAL $dir/COPY ; then
+        return 1
+    fi
+}
+
+#
+# Expects the object SOMETHING to exist already, then:
+# 1) removes the corresponding file from a designated OSD
+# 2) repairs the PG
+# 3) checks that the file has been restored in the designated OSD and
+#    that the object still reads back as $1/ORIGINAL
+#
+#   $1 - test directory
+#   $2 - pool name
+#   $3 - OSD id
+#
+function corrupt_and_repair_one() {
+    local dir=$1
+    local poolname=$2
+    local osd=$3
+
+    # 1) remove the corresponding file from the OSD
+    if ! objectstore_tool $dir $osd SOMETHING remove ; then
+        return 1
+    fi
+
+    # 2) repair the PG
+    local pg=$(get_pg $poolname SOMETHING)
+    repair $pg
+
+    # 3) The file must be back
+    objectstore_tool $dir $osd SOMETHING list-attrs &&
+        rados --pool $poolname get SOMETHING $dir/COPY &&
+        diff $dir/ORIGINAL $dir/COPY ||
+        return 1
+}
+
+function corrupt_and_repair_erasure_coded() {
+    # Store an object in an erasure coded pool and run the corrupt/repair
+    # cycle on the primary, on one non-primary, on two non-primaries, and on
+    # the primary together with a non-primary.
+    #   $1 - test directory
+    #   $2 - pool name
+    local dir=$1
+    local poolname=$2
+
+    add_something $dir $poolname || return 1
+
+    local primary=$(get_primary $poolname SOMETHING)
+    # Collect every OSD of the object except the primary. The ids are
+    # compared as whole words: deleting the primary's id as a substring
+    # would also eat digits out of multi-digit ids (primary 1 in "10 1 2").
+    local -a osds=()
+    local osd
+    for osd in $(get_osds $poolname SOMETHING) ; do
+        if [ "$osd" != "$primary" ]; then
+            osds+=("$osd")
+        fi
+    done
+    local not_primary_first=${osds[0]}
+    local not_primary_second=${osds[1]}
+
+    # Reproduces http://tracker.ceph.com/issues/10017
+    corrupt_and_repair_one $dir $poolname $primary || return 1
+    # Reproduces http://tracker.ceph.com/issues/10409
+    corrupt_and_repair_one $dir $poolname $not_primary_first || return 1
+    corrupt_and_repair_two $dir $poolname $not_primary_first $not_primary_second || return 1
+    corrupt_and_repair_two $dir $poolname $primary $not_primary_first || return 1
+
+}
+
+function create_ec_pool() {
+    # Define the erasure code profile "myprofile" and create a one PG
+    # erasure coded pool that uses it.
+    #   $1   - pool name
+    #   $2   - "true" to enable allow_ec_overwrites on the pool
+    #   $3.. - extra key=value settings for the profile (e.g. k=2 m=1)
+    # Returns 1 if any step fails, 0 otherwise.
+    local pool_name=$1
+    local allow_overwrites=$2
+
+    ceph osd erasure-code-profile set myprofile crush-failure-domain=osd "${@:3}" || return 1
+
+    # Use this function's own argument. The body used to read $poolname,
+    # which only resolved because every caller happened to have a local
+    # variable of that name in scope.
+    ceph osd pool create "$pool_name" 1 1 erasure myprofile || return 1
+
+    if [ "$allow_overwrites" = "true" ]; then
+        ceph osd pool set "$pool_name" allow_ec_overwrites true || return 1
+    fi
+
+    wait_for_clean || return 1
+    return 0
+}
+
+function auto_repair_erasure_coded() {
+    # Remove one shard of an object in an erasure coded pool and check that
+    # a scrub with auto repair enabled brings it back.
+    #   $1 - test directory
+    #   $2 - "true" to run the OSDs on bluestore with EC overwrites allowed
+    local dir=$1
+    local allow_overwrites=$2
+    local poolname=ecpool
+
+    # Launch a cluster with 5 seconds scrub interval
+    setup $dir && run_mon $dir a && run_mgr $dir x || return 1
+
+    local ceph_osd_args="--osd-scrub-auto-repair=true --osd-deep-scrub-interval=5 --osd-scrub-max-interval=5 --osd-scrub-min-interval=5 --osd-scrub-interval-randomize-ratio=0"
+    # Pick the OSD launcher once instead of branching inside the loop.
+    local start_osd=run_osd
+    if [ "$allow_overwrites" = "true" ]; then
+        start_osd=run_osd_bluestore
+    fi
+    for id in $(seq 0 2) ; do
+        if ! $start_osd $dir $id $ceph_osd_args ; then
+            return 1
+        fi
+    done
+    create_rbd_pool && wait_for_clean || return 1
+
+    # Create an EC pool
+    if ! create_ec_pool $poolname $allow_overwrites k=2 m=1 ; then
+        return 1
+    fi
+
+    # Put an object
+    local payload=ABCDEF
+    echo $payload > $dir/ORIGINAL
+    if ! rados --pool $poolname put SOMETHING $dir/ORIGINAL ; then
+        return 1
+    fi
+
+    # Remove the object from one shard physically
+    # Restarted osd get $ceph_osd_args passed
+    if ! objectstore_tool $dir $(get_not_primary $poolname SOMETHING) SOMETHING remove ; then
+        return 1
+    fi
+    # Wait for auto repair
+    local pgid=$(get_pg $poolname SOMETHING)
+    wait_for_scrub $pgid "$(get_last_scrub_stamp $pgid)"
+    if ! wait_for_clean ; then
+        return 1
+    fi
+    # Verify - the file should be back
+    # Restarted osd get $ceph_osd_args passed
+    if ! objectstore_tool $dir $(get_not_primary $poolname SOMETHING) SOMETHING list-attrs ; then
+        return 1
+    fi
+    rados --pool $poolname get SOMETHING $dir/COPY &&
+        diff $dir/ORIGINAL $dir/COPY ||
+        return 1
+
+    # Tear down
+    if ! teardown $dir ; then
+        return 1
+    fi
+}
+
+function TEST_auto_repair_erasure_coded_appends() { # auto-repair scenario in dir $1 with allow_overwrites=false
+ auto_repair_erasure_coded $1 false
+}
+
+function TEST_auto_repair_erasure_coded_overwrites() {
+    # Auto-repair scenario with allow_overwrites=true; does nothing (and
+    # succeeds) unless $use_ec_overwrite is "true".
+    [ "$use_ec_overwrite" != "true" ] || auto_repair_erasure_coded $1 true
+}
+
+function corrupt_and_repair_jerasure() {
+    # Bring up a 4-OSD cluster, create a k=2 m=2 EC pool and run the
+    # corrupt/repair scenarios on it; $2 = "true" selects bluestore OSDs.
+    local dir=$1
+    local allow_overwrites=$2
+    local poolname=ecpool
+
+    setup $dir || return 1
+    run_mon $dir a || return 1
+    run_mgr $dir x || return 1
+    for id in $(seq 0 3) ; do
+        case "$allow_overwrites" in
+            true) run_osd_bluestore $dir $id || return 1 ;;
+            *) run_osd $dir $id || return 1 ;;
+        esac
+    done
+    create_rbd_pool || return 1
+    wait_for_clean || return 1
+
+    create_ec_pool $poolname $allow_overwrites k=2 m=2 || return 1
+    corrupt_and_repair_erasure_coded $dir $poolname || return 1
+    teardown $dir || return 1
+}
+
+function TEST_corrupt_and_repair_jerasure_appends() { # allow_overwrites is left empty, so the non-bluestore run_osd path is taken
+ corrupt_and_repair_jerasure $1
+}
+
+function TEST_corrupt_and_repair_jerasure_overwrites() {
+    # Same scenario with allow_overwrites=true; does nothing (and succeeds)
+    # unless $use_ec_overwrite is "true".
+    [ "$use_ec_overwrite" != "true" ] || corrupt_and_repair_jerasure $1 true
+}
+
+function corrupt_and_repair_lrc() {
+    # Bring up a 10-OSD cluster, create a k=4 m=2 l=3 plugin=lrc EC pool and
+    # run the corrupt/repair scenarios; $2 = "true" selects bluestore OSDs.
+    local dir=$1
+    local allow_overwrites=$2
+    local poolname=ecpool
+
+    setup $dir || return 1
+    run_mon $dir a || return 1
+    run_mgr $dir x || return 1
+    for id in $(seq 0 9) ; do
+        case "$allow_overwrites" in
+            true) run_osd_bluestore $dir $id || return 1 ;;
+            *) run_osd $dir $id || return 1 ;;
+        esac
+    done
+    create_rbd_pool || return 1
+    wait_for_clean || return 1
+
+    create_ec_pool $poolname $allow_overwrites k=4 m=2 l=3 plugin=lrc || return 1
+    corrupt_and_repair_erasure_coded $dir $poolname || return 1
+    teardown $dir || return 1
+}
+
+function TEST_corrupt_and_repair_lrc_appends() { # LRC corrupt/repair scenario in dir $1, overwrites not enabled
+    corrupt_and_repair_lrc $1 # must be the lrc driver; the jerasure one would leave the LRC pool untested
+}
+
+function TEST_corrupt_and_repair_lrc_overwrites() {
+    # LRC corrupt/repair scenario with allow_overwrites=true (the lrc driver,
+    # not the jerasure one); a successful no-op unless $use_ec_overwrite is "true".
+    [ "$use_ec_overwrite" != "true" ] || corrupt_and_repair_lrc $1 true
+}
+
+function unfound_erasure_coded() {
+    # Delete SOMETHING from three non-primary OSDs of a k=2 m=2 EC pool,
+    # repair the PG, and expect "ceph -s" to report the object as unfound
+    # while all four OSDs stay up and in.
+    local dir=$1
+    local allow_overwrites=$2
+    local poolname=ecpool
+    local payload=ABCDEF
+
+    setup $dir || return 1
+    run_mon $dir a || return 1
+    run_mgr $dir x || return 1
+    for id in $(seq 0 3) ; do
+        case "$allow_overwrites" in
+            true) run_osd_bluestore $dir $id || return 1 ;;
+            *) run_osd $dir $id || return 1 ;;
+        esac
+    done
+    create_rbd_pool || return 1
+    wait_for_clean || return 1
+
+    create_ec_pool $poolname $allow_overwrites k=2 m=2 || return 1
+    add_something $dir $poolname || return 1
+
+    # sed drops the first match of the primary's id from get_osds' output
+    local primary=$(get_primary $poolname SOMETHING)
+    local -a osds=($(get_osds $poolname SOMETHING | sed -e "s/$primary//"))
+    local not_primary_first=${osds[0]}
+    local not_primary_second=${osds[1]}
+    local not_primary_third=${osds[2]}
+
+    #
+    # 1) remove the corresponding file from the OSDs (jobs are started via
+    #    run_in_background and collected by wait_background)
+    #
+    pids=""
+    run_in_background pids objectstore_tool $dir $not_primary_first SOMETHING remove
+    run_in_background pids objectstore_tool $dir $not_primary_second SOMETHING remove
+    run_in_background pids objectstore_tool $dir $not_primary_third SOMETHING remove
+    wait_background pids
+    return_code=$?
+    [ $return_code -eq 0 ] || return $return_code
+
+    # 2) repair the PG
+    local pg=$(get_pg $poolname SOMETHING)
+    repair $pg
+
+    # 3) check pg state; the unfound count may take a bit to appear due to
+    #    mon/mgr asynchrony, so poll for up to 60 rounds first
+    for f in $(seq 1 60) ; do
+        ceph -s | grep "1/1 unfound" && break
+        sleep 1
+    done
+    ceph -s | grep "4 osds: 4 up, 4 in" || return 1
+    ceph -s | grep "1/1 unfound" || return 1
+
+    teardown $dir || return 1
+}
+
+function TEST_unfound_erasure_coded_appends() { # allow_overwrites is left empty, so the non-bluestore run_osd path is taken
+ unfound_erasure_coded $1
+}
+
+function TEST_unfound_erasure_coded_overwrites() {
+    # Unfound-object scenario with allow_overwrites=true; does nothing (and
+    # succeeds) unless $use_ec_overwrite is "true".
+    [ "$use_ec_overwrite" != "true" ] || unfound_erasure_coded $1 true
+}
+
+#
+# list_missing for EC pool
+#
+function list_missing_erasure_coded() {
+    # Remove two shards each of MOBJ0 and MOBJ1 from a k=2 m=1 EC pool while
+    # the OSDs are stopped, then check that list_missing reports both objects.
+    local dir=$1
+    local allow_overwrites=$2
+    local poolname=ecpool
+
+    setup $dir || return 1
+    run_mon $dir a || return 1
+    run_mgr $dir x || return 1
+    for id in $(seq 0 2) ; do
+        if [ "$allow_overwrites" = "true" ]; then
+            run_osd_bluestore $dir $id || return 1
+        else
+            run_osd $dir $id || return 1
+        fi
+    done
+    create_rbd_pool || return 1
+    wait_for_clean || return 1
+
+    create_ec_pool $poolname $allow_overwrites k=2 m=1 || return 1
+
+    # Put an object; two of its shards (including primary) are removed below
+    add_something $dir $poolname MOBJ0 || return 1
+    local -a osds0=($(get_osds $poolname MOBJ0))
+
+    # Put another object; two of its shards (excluding primary) are removed below
+    add_something $dir $poolname MOBJ1 || return 1
+    local -a osds1=($(get_osds $poolname MOBJ1))
+
+    # Stop all osd daemons
+    for id in $(seq 0 2) ; do
+        kill_daemons $dir TERM osd.$id >&2 < /dev/null || return 1
+    done
+
+    id=${osds0[0]}
+    ceph-objectstore-tool --data-path $dir/$id MOBJ0 remove || return 1
+    id=${osds0[1]}
+    ceph-objectstore-tool --data-path $dir/$id MOBJ0 remove || return 1
+
+    id=${osds1[1]}
+    ceph-objectstore-tool --data-path $dir/$id MOBJ1 remove || return 1
+    id=${osds1[2]}
+    ceph-objectstore-tool --data-path $dir/$id MOBJ1 remove || return 1
+
+    for id in $(seq 0 2) ; do
+        activate_osd $dir $id >&2 || return 1
+    done
+    create_rbd_pool || return 1
+    wait_for_clean || return 1
+
+    # Get the PG - both objects should be in the same PG
+    local pg=$(get_pg $poolname MOBJ0)
+
+    # Repair the PG, which triggers recovery,
+    # and should mark the objects as unfound
+    repair $pg
+
+    # Poll at most 60 times, one second apart, until both objects are listed
+    local matches=0
+    for i in $(seq 1 60) ; do
+        matches=$(ceph pg $pg list_missing | egrep "MOBJ0|MOBJ1" | wc -l)
+        [ $matches -eq 2 ] && break
+        sleep 1
+    done
+    [ $matches -eq 2 ] || return 1
+    teardown $dir || return 1
+}
+
+function TEST_list_missing_erasure_coded_appends() { # list_missing scenario in dir $1 with allow_overwrites=false
+ list_missing_erasure_coded $1 false
+}
+
+function TEST_list_missing_erasure_coded_overwrites() {
+    # list_missing scenario with allow_overwrites=true; does nothing (and
+    # succeeds) unless $use_ec_overwrite is "true".
+    [ "$use_ec_overwrite" != "true" ] || list_missing_erasure_coded $1 true
+}
+
+#
+# Corrupt one copy of a replicated pool
+#
+function TEST_corrupt_scrub_replicated() {
+ local dir=$1
+ local poolname=csr_pool
+ local total_objs=15
+
+ setup $dir || return 1
+ run_mon $dir a --osd_pool_default_size=2 || return 1
+ run_mgr $dir x || return 1
+ run_osd $dir 0 || return 1
+ run_osd $dir 1 || return 1
+ create_rbd_pool || return 1
+ wait_for_clean || return 1
+
+ ceph osd pool create foo 1 || return 1
+ ceph osd pool create $poolname 1 1 || return 1
+ wait_for_clean || return 1
+
+ for i in $(seq 1 $total_objs) ; do
+ objname=ROBJ${i}
+ add_something $dir $poolname $objname || return 1
+
+ rados --pool $poolname setomapheader $objname hdr-$objname || return 1
+ rados --pool $poolname setomapval $objname key-$objname val-$objname || return 1
+ done
+
+ local pg=$(get_pg $poolname ROBJ0)
+
+ # Compute an old omap digest and save oi
+ CEPH_ARGS='' ceph daemon $(get_asok_path osd.0) \
+ config set osd_deep_scrub_update_digest_min_age 0
+ CEPH_ARGS='' ceph daemon $(get_asok_path osd.1) \
+ config set osd_deep_scrub_update_digest_min_age 0
+ pg_deep_scrub $pg
+
+ for i in $(seq 1 $total_objs) ; do
+ objname=ROBJ${i}
+
+ # Alternate corruption between osd.0 and osd.1
+ local osd=$(expr $i % 2)
+
+ case $i in
+ 1)
+ # Size (deep scrub data_digest too)
+ local payload=UVWXYZZZ
+ echo $payload > $dir/CORRUPT
+ objectstore_tool $dir $osd $objname set-bytes $dir/CORRUPT || return 1
+ ;;
+
+ 2)
+ # digest (deep scrub only)
+ local payload=UVWXYZ
+ echo $payload > $dir/CORRUPT
+ objectstore_tool $dir $osd $objname set-bytes $dir/CORRUPT || return 1
+ ;;
+
+ 3)
+ # missing
+ objectstore_tool $dir $osd $objname remove || return 1
+ ;;
+
+ 4)
+ # Modify omap value (deep scrub only)
+ objectstore_tool $dir $osd $objname set-omap key-$objname $dir/CORRUPT || return 1
+ ;;
+
+ 5)
+ # Delete omap key (deep scrub only)
+ objectstore_tool $dir $osd $objname rm-omap key-$objname || return 1
+ ;;
+
+ 6)
+ # Add extra omap key (deep scrub only)
+ echo extra > $dir/extra-val
+ objectstore_tool $dir $osd $objname set-omap key2-$objname $dir/extra-val || return 1
+ rm $dir/extra-val
+ ;;
+
+ 7)
+ # Modify omap header (deep scrub only)
+ echo -n newheader > $dir/hdr
+ objectstore_tool $dir $osd $objname set-omaphdr $dir/hdr || return 1
+ rm $dir/hdr
+ ;;
+
+ 8)
+ rados --pool $poolname setxattr $objname key1-$objname val1-$objname || return 1
+ rados --pool $poolname setxattr $objname key2-$objname val2-$objname || return 1
+
+ # Break xattrs
+ echo -n bad-val > $dir/bad-val
+ objectstore_tool $dir $osd $objname set-attr _key1-$objname $dir/bad-val || return 1
+ objectstore_tool $dir $osd $objname rm-attr _key2-$objname || return 1
+ echo -n val3-$objname > $dir/newval
+ objectstore_tool $dir $osd $objname set-attr _key3-$objname $dir/newval || return 1
+ rm $dir/bad-val $dir/newval
+ ;;
+
+ 9)
+ objectstore_tool $dir $osd $objname get-attr _ > $dir/robj9-oi # save the current "_" attr
+ echo -n D > $dir/change
+ rados --pool $poolname put $objname $dir/change
+ objectstore_tool $dir $osd $objname set-attr _ $dir/robj9-oi # put the saved (now stale) "_" attr back on this OSD
+ rm $dir/change # keep robj9-oi (it is applied again after the first scrub check); no $dir/oi exists here
+ ;;
+
+ # ROBJ10 must be handled after digests are re-computed by a deep scrub below
+ # ROBJ11 must be handled with config change before deep scrub
+ # ROBJ12 must be handled with config change before scrubs
+ # ROBJ13 must be handled before scrubs
+
+ 14)
+ echo -n bad-val > $dir/bad-val
+ objectstore_tool $dir 0 $objname set-attr _ $dir/bad-val || return 1
+ objectstore_tool $dir 1 $objname rm-attr _ || return 1
+ rm $dir/bad-val
+ ;;
+
+ 15)
+ objectstore_tool $dir $osd $objname rm-attr _ || return 1
+
+ esac
+ done
+
+ local pg=$(get_pg $poolname ROBJ0)
+
+ set_config osd 0 filestore_debug_inject_read_err true || return 1
+ set_config osd 1 filestore_debug_inject_read_err true || return 1
+ CEPH_ARGS='' ceph --admin-daemon $(get_asok_path osd.1) \
+ injectdataerr $poolname ROBJ11 || return 1
+ CEPH_ARGS='' ceph --admin-daemon $(get_asok_path osd.0) \
+ injectmdataerr $poolname ROBJ12 || return 1
+ CEPH_ARGS='' ceph --admin-daemon $(get_asok_path osd.0) \
+ injectmdataerr $poolname ROBJ13 || return 1
+ CEPH_ARGS='' ceph --admin-daemon $(get_asok_path osd.1) \
+ injectdataerr $poolname ROBJ13 || return 1
+
+ pg_scrub $pg
+
+ rados list-inconsistent-pg $poolname > $dir/json || return 1
+ # Check pg count
+ test $(jq '. | length' $dir/json) = "1" || return 1
+ # Check pgid
+ test $(jq -r '.[0]' $dir/json) = $pg || return 1
+
+ rados list-inconsistent-obj $pg > $dir/json || return 1
+ # Get epoch for repair-get requests
+ epoch=$(jq .epoch $dir/json)
+
+ jq "$jqfilter" << EOF | python -c "$sortkeys" | sed -e "$sedfilter" > $dir/checkcsjson
+{
+ "inconsistents": [
+ {
+ "shards": [
+ {
+ "size": 7,
+ "errors": [],
+ "osd": 0
+ },
+ {
+ "size": 9,
+ "errors": [
+ "size_mismatch_oi"
+ ],
+ "osd": 1
+ }
+ ],
+ "selected_object_info": "3:ce3f1d6a:::ROBJ1:head(47'54 osd.0.0:53 dirty|omap|data_digest|omap_digest s 7 uv 3 dd 2ddbf8f5 od f5fba2c6 alloc_hint [0 0 0])",
+ "union_shard_errors": [
+ "size_mismatch_oi"
+ ],
+ "errors": [
+ "size_mismatch"
+ ],
+ "object": {
+ "version": 3,
+ "snap": "head",
+ "locator": "",
+ "nspace": "",
+ "name": "ROBJ1"
+ }
+ },
+ {
+ "shards": [
+ {
+ "errors": [
+ "stat_error"
+ ],
+ "osd": 0
+ },
+ {
+ "size": 7,
+ "errors": [],
+ "osd": 1
+ }
+ ],
+ "selected_object_info": "3:bc819597:::ROBJ12:head(47'52 osd.0.0:51 dirty|omap|data_digest|omap_digest s 7 uv 36 dd 2ddbf8f5 od 67f306a alloc_hint [0 0 0])",
+ "union_shard_errors": [
+ "stat_error"
+ ],
+ "errors": [],
+ "object": {
+ "version": 36,
+ "snap": "head",
+ "locator": "",
+ "nspace": "",
+ "name": "ROBJ12"
+ }
+ },
+ {
+ "shards": [
+ {
+ "errors": [
+ "stat_error"
+ ],
+ "osd": 0
+ },
+ {
+ "size": 7,
+ "errors": [],
+ "osd": 1
+ }
+ ],
+ "selected_object_info": "3:d60617f9:::ROBJ13:head(47'55 osd.0.0:54 dirty|omap|data_digest|omap_digest s 7 uv 39 dd 2ddbf8f5 od 6441854d alloc_hint [0 0 0])",
+ "union_shard_errors": [
+ "stat_error"
+ ],
+ "errors": [],
+ "object": {
+ "version": 39,
+ "snap": "head",
+ "locator": "",
+ "nspace": "",
+ "name": "ROBJ13"
+ }
+ },
+ {
+ "shards": [
+ {
+ "size": 7,
+ "errors": [
+ "oi_attr_corrupted"
+ ],
+ "osd": 0
+ },
+ {
+ "size": 7,
+ "errors": [
+ "oi_attr_missing"
+ ],
+ "osd": 1
+ }
+ ],
+ "union_shard_errors": [
+ "oi_attr_missing",
+ "oi_attr_corrupted"
+ ],
+ "errors": [],
+ "object": {
+ "version": 0,
+ "snap": "head",
+ "locator": "",
+ "nspace": "",
+ "name": "ROBJ14"
+ }
+ },
+ {
+ "shards": [
+ {
+ "attrs": [
+ {
+ "Base64": true,
+ "value": "",
+ "name": "_"
+ },
+ {
+ "Base64": true,
+ "value": "AwIdAAAAAAAAAAAAAAABAAAAAAAAAAAAAAAAAAAAAAAAAAA=",
+ "name": "snapset"
+ }
+ ],
+ "size": 7,
+ "errors": [],
+ "osd": 0
+ },
+ {
+ "attrs": [
+ {
+ "Base64": true,
+ "value": "AwIdAAAAAAAAAAAAAAABAAAAAAAAAAAAAAAAAAAAAAAAAAA=",
+ "name": "snapset"
+ }
+ ],
+ "size": 7,
+ "errors": [
+ "oi_attr_missing"
+ ],
+ "osd": 1
+ }
+ ],
+ "selected_object_info": "3:30259878:::ROBJ15:head(47'46 osd.0.0:45 dirty|omap|data_digest|omap_digest s 7 uv 45 dd 2ddbf8f5 od 2d2a4d6e alloc_hint [0 0 0])",
+ "union_shard_errors": [
+ "oi_attr_missing"
+ ],
+ "errors": [
+ "attr_name_mismatch"
+ ],
+ "object": {
+ "version": 45,
+ "snap": "head",
+ "locator": "",
+ "nspace": "",
+ "name": "ROBJ15"
+ }
+ },
+ {
+ "shards": [
+ {
+ "size": 7,
+ "errors": [],
+ "osd": 0
+ },
+ {
+ "errors": [
+ "missing"
+ ],
+ "osd": 1
+ }
+ ],
+ "selected_object_info": "3:f2a5b2a4:::ROBJ3:head(47'57 osd.0.0:56 dirty|omap|data_digest|omap_digest s 7 uv 9 dd 2ddbf8f5 od b35dfd alloc_hint [0 0 0])",
+ "union_shard_errors": [
+ "missing"
+ ],
+ "errors": [],
+ "object": {
+ "version": 9,
+ "snap": "head",
+ "locator": "",
+ "nspace": "",
+ "name": "ROBJ3"
+ }
+ },
+ {
+ "shards": [
+ {
+ "attrs": [
+ {
+ "Base64": true,
+ "value": "",
+ "name": "_"
+ },
+ {
+ "Base64": false,
+ "value": "bad-val",
+ "name": "_key1-ROBJ8"
+ },
+ {
+ "Base64": false,
+ "value": "val3-ROBJ8",
+ "name": "_key3-ROBJ8"
+ },
+ {
+ "Base64": true,
+ "value": "AwIdAAAAAAAAAAAAAAABAAAAAAAAAAAAAAAAAAAAAAAAAAA=",
+ "name": "snapset"
+ }
+ ],
+ "size": 7,
+ "errors": [],
+ "osd": 0
+ },
+ {
+ "attrs": [
+ {
+ "Base64": true,
+ "value": "",
+ "name": "_"
+ },
+ {
+ "Base64": false,
+ "value": "val1-ROBJ8",
+ "name": "_key1-ROBJ8"
+ },
+ {
+ "Base64": false,
+ "value": "val2-ROBJ8",
+ "name": "_key2-ROBJ8"
+ },
+ {
+ "Base64": true,
+ "value": "AwIdAAAAAAAAAAAAAAABAAAAAAAAAAAAAAAAAAAAAAAAAAA=",
+ "name": "snapset"
+ }
+ ],
+ "size": 7,
+ "errors": [],
+ "osd": 1
+ }
+ ],
+ "selected_object_info": "3:86586531:::ROBJ8:head(82'62 client.4351.0:1 dirty|omap|data_digest|omap_digest s 7 uv 62 dd 2ddbf8f5 od d6be81dc alloc_hint [0 0 0])",
+ "union_shard_errors": [],
+ "errors": [
+ "attr_value_mismatch",
+ "attr_name_mismatch"
+ ],
+ "object": {
+ "version": 62,
+ "snap": "head",
+ "locator": "",
+ "nspace": "",
+ "name": "ROBJ8"
+ }
+ },
+ {
+ "shards": [
+ {
+ "attrs": [
+ {
+ "Base64": true,
+ "value": "",
+ "name": "_"
+ },
+ {
+ "Base64": true,
+ "value": "AwIdAAAAAAAAAAAAAAABAAAAAAAAAAAAAAAAAAAAAAAAAAA=",
+ "name": "snapset"
+ }
+ ],
+ "object_info": "3:ffdb2004:::ROBJ9:head(102'63 client.4433.0:1 dirty|omap|data_digest|omap_digest s 1 uv 63 dd 2b63260d od 2eecc539 alloc_hint [0 0 0])",
+ "size": 1,
+ "errors": [],
+ "osd": 0
+ },
+ {
+ "attrs": [
+ {
+ "Base64": true,
+ "value": "",
+ "name": "_"
+ },
+ {
+ "Base64": true,
+ "value": "AwIdAAAAAAAAAAAAAAABAAAAAAAAAAAAAAAAAAAAAAAAAAA=",
+ "name": "snapset"
+ }
+ ],
+ "object_info": "3:ffdb2004:::ROBJ9:head(47'60 osd.0.0:59 dirty|omap|data_digest|omap_digest s 7 uv 27 dd 2ddbf8f5 od 2eecc539 alloc_hint [0 0 0])",
+ "size": 1,
+ "errors": [],
+ "osd": 1
+ }
+ ],
+ "selected_object_info": "3:ffdb2004:::ROBJ9:head(102'63 client.4433.0:1 dirty|omap|data_digest|omap_digest s 1 uv 63 dd 2b63260d od 2eecc539 alloc_hint [0 0 0])",
+ "union_shard_errors": [],
+ "errors": [
+ "object_info_inconsistency",
+ "attr_value_mismatch"
+ ],
+ "object": {
+ "version": 63,
+ "snap": "head",
+ "locator": "",
+ "nspace": "",
+ "name": "ROBJ9"
+ }
+ }
+ ],
+ "epoch": 0
+}
+EOF
+
+ jq "$jqfilter" $dir/json | python -c "$sortkeys" | sed -e "$sedfilter" > $dir/csjson
+ diff ${DIFFCOLOPTS} $dir/checkcsjson $dir/csjson || test $getjson = "yes" || return 1
+ if test $getjson = "yes"
+ then
+ jq '.' $dir/json > save1.json
+ fi
+
+ if which jsonschema > /dev/null;
+ then
+ jsonschema -i $dir/json $CEPH_ROOT/doc/rados/command/list-inconsistent-obj.json || return 1
+ fi
+
+ objname=ROBJ9
+ # Change data and size again because digest was recomputed
+ echo -n ZZZ > $dir/change
+ rados --pool $poolname put $objname $dir/change
+ # Set one to an even older value (the "_" attr saved in robj9-oi earlier)
+ objectstore_tool $dir 0 $objname set-attr _ $dir/robj9-oi
+ rm $dir/robj9-oi $dir/change # the saved attr lives in robj9-oi; no $dir/oi exists here
+
+ objname=ROBJ10
+ objectstore_tool $dir 1 $objname get-attr _ > $dir/oi
+ rados --pool $poolname setomapval $objname key2-$objname val2-$objname
+ objectstore_tool $dir 0 $objname set-attr _ $dir/oi
+ objectstore_tool $dir 1 $objname set-attr _ $dir/oi
+ rm $dir/oi
+
+ set_config osd 0 filestore_debug_inject_read_err true || return 1
+ set_config osd 1 filestore_debug_inject_read_err true || return 1
+ CEPH_ARGS='' ceph --admin-daemon $(get_asok_path osd.1) \
+ injectdataerr $poolname ROBJ11 || return 1
+ CEPH_ARGS='' ceph --admin-daemon $(get_asok_path osd.0) \
+ injectmdataerr $poolname ROBJ12 || return 1
+ CEPH_ARGS='' ceph --admin-daemon $(get_asok_path osd.0) \
+ injectmdataerr $poolname ROBJ13 || return 1
+ CEPH_ARGS='' ceph --admin-daemon $(get_asok_path osd.1) \
+ injectdataerr $poolname ROBJ13 || return 1
+ pg_deep_scrub $pg
+
+ rados list-inconsistent-pg $poolname > $dir/json || return 1
+ # Check pg count
+ test $(jq '. | length' $dir/json) = "1" || return 1
+ # Check pgid
+ test $(jq -r '.[0]' $dir/json) = $pg || return 1
+
+ rados list-inconsistent-obj $pg > $dir/json || return 1
+ # Get epoch for repair-get requests
+ epoch=$(jq .epoch $dir/json)
+
+ jq "$jqfilter" << EOF | python -c "$sortkeys" | sed -e "$sedfilter" > $dir/checkcsjson
+{
+ "inconsistents": [
+ {
+ "shards": [
+ {
+ "data_digest": "0x2ddbf8f5",
+ "omap_digest": "0xf5fba2c6",
+ "size": 7,
+ "errors": [],
+ "osd": 0
+ },
+ {
+ "data_digest": "0x2d4a11c2",
+ "omap_digest": "0xf5fba2c6",
+ "size": 9,
+ "errors": [
+ "data_digest_mismatch_oi",
+ "size_mismatch_oi"
+ ],
+ "osd": 1
+ }
+ ],
+ "selected_object_info": "3:ce3f1d6a:::ROBJ1:head(47'54 osd.0.0:53 dirty|omap|data_digest|omap_digest s 7 uv 3 dd 2ddbf8f5 od f5fba2c6 alloc_hint [0 0 0])",
+ "union_shard_errors": [
+ "data_digest_mismatch_oi",
+ "size_mismatch_oi"
+ ],
+ "errors": [
+ "data_digest_mismatch",
+ "size_mismatch"
+ ],
+ "object": {
+ "version": 3,
+ "snap": "head",
+ "locator": "",
+ "nspace": "",
+ "name": "ROBJ1"
+ }
+ },
+ {
+ "shards": [
+ {
+ "data_digest": "0x2ddbf8f5",
+ "omap_digest": "0xa8dd5adc",
+ "size": 7,
+ "errors": [
+ "omap_digest_mismatch_oi"
+ ],
+ "osd": 0
+ },
+ {
+ "data_digest": "0x2ddbf8f5",
+ "omap_digest": "0xa8dd5adc",
+ "size": 7,
+ "errors": [
+ "omap_digest_mismatch_oi"
+ ],
+ "osd": 1
+ }
+ ],
+ "selected_object_info": "3:b1f19cbd:::ROBJ10:head(47'51 osd.0.0:50 dirty|omap|data_digest|omap_digest s 7 uv 30 dd 2ddbf8f5 od c2025a24 alloc_hint [0 0 0])",
+ "union_shard_errors": [
+ "omap_digest_mismatch_oi"
+ ],
+ "errors": [],
+ "object": {
+ "version": 30,
+ "snap": "head",
+ "locator": "",
+ "nspace": "",
+ "name": "ROBJ10"
+ }
+ },
+ {
+ "shards": [
+ {
+ "data_digest": "0x2ddbf8f5",
+ "omap_digest": "0xa03cef03",
+ "size": 7,
+ "errors": [],
+ "osd": 0
+ },
+ {
+ "size": 7,
+ "errors": [
+ "read_error"
+ ],
+ "osd": 1
+ }
+ ],
+ "selected_object_info": "3:87abbf36:::ROBJ11:head(47'48 osd.0.0:47 dirty|omap|data_digest|omap_digest s 7 uv 33 dd 2ddbf8f5 od a03cef03 alloc_hint [0 0 0])",
+ "union_shard_errors": [
+ "read_error"
+ ],
+ "errors": [],
+ "object": {
+ "version": 33,
+ "snap": "head",
+ "locator": "",
+ "nspace": "",
+ "name": "ROBJ11"
+ }
+ },
+ {
+ "shards": [
+ {
+ "errors": [
+ "stat_error"
+ ],
+ "osd": 0
+ },
+ {
+ "data_digest": "0x2ddbf8f5",
+ "omap_digest": "0x067f306a",
+ "size": 7,
+ "errors": [],
+ "osd": 1
+ }
+ ],
+ "selected_object_info": "3:bc819597:::ROBJ12:head(47'52 osd.0.0:51 dirty|omap|data_digest|omap_digest s 7 uv 36 dd 2ddbf8f5 od 67f306a alloc_hint [0 0 0])",
+ "union_shard_errors": [
+ "stat_error"
+ ],
+ "errors": [],
+ "object": {
+ "version": 36,
+ "snap": "head",
+ "locator": "",
+ "nspace": "",
+ "name": "ROBJ12"
+ }
+ },
+ {
+ "shards": [
+ {
+ "errors": [
+ "stat_error"
+ ],
+ "osd": 0
+ },
+ {
+ "size": 7,
+ "errors": [
+ "read_error"
+ ],
+ "osd": 1
+ }
+ ],
+ "union_shard_errors": [
+ "stat_error",
+ "read_error"
+ ],
+ "errors": [],
+ "object": {
+ "version": 0,
+ "snap": "head",
+ "locator": "",
+ "nspace": "",
+ "name": "ROBJ13"
+ }
+ },
+ {
+ "shards": [
+ {
+ "data_digest": "0x2ddbf8f5",
+ "omap_digest": "0x4f14f849",
+ "size": 7,
+ "errors": [
+ "oi_attr_corrupted"
+ ],
+ "osd": 0
+ },
+ {
+ "data_digest": "0x2ddbf8f5",
+ "omap_digest": "0x4f14f849",
+ "size": 7,
+ "errors": [
+ "oi_attr_missing"
+ ],
+ "osd": 1
+ }
+ ],
+ "union_shard_errors": [
+ "oi_attr_missing",
+ "oi_attr_corrupted"
+ ],
+ "errors": [],
+ "object": {
+ "version": 0,
+ "snap": "head",
+ "locator": "",
+ "nspace": "",
+ "name": "ROBJ14"
+ }
+ },
+ {
+ "shards": [
+ {
+ "attrs": [
+ {
+ "Base64": true,
+ "value": "",
+ "name": "_"
+ },
+ {
+ "Base64": true,
+ "value": "AwIdAAAAAAAAAAAAAAABAAAAAAAAAAAAAAAAAAAAAAAAAAA=",
+ "name": "snapset"
+ }
+ ],
+ "data_digest": "0x2ddbf8f5",
+ "omap_digest": "0x2d2a4d6e",
+ "size": 7,
+ "errors": [],
+ "osd": 0
+ },
+ {
+ "attrs": [
+ {
+ "Base64": true,
+ "value": "AwIdAAAAAAAAAAAAAAABAAAAAAAAAAAAAAAAAAAAAAAAAAA=",
+ "name": "snapset"
+ }
+ ],
+ "data_digest": "0x2ddbf8f5",
+ "omap_digest": "0x2d2a4d6e",
+ "size": 7,
+ "errors": [
+ "oi_attr_missing"
+ ],
+ "osd": 1
+ }
+ ],
+ "selected_object_info": "3:30259878:::ROBJ15:head(47'46 osd.0.0:45 dirty|omap|data_digest|omap_digest s 7 uv 45 dd 2ddbf8f5 od 2d2a4d6e alloc_hint [0 0 0])",
+ "union_shard_errors": [
+ "oi_attr_missing"
+ ],
+ "errors": [
+ "attr_name_mismatch"
+ ],
+ "object": {
+ "version": 45,
+ "snap": "head",
+ "locator": "",
+ "nspace": "",
+ "name": "ROBJ15"
+ }
+ },
+ {
+ "shards": [
+ {
+ "data_digest": "0x578a4830",
+ "omap_digest": "0xf8e11918",
+ "size": 7,
+ "errors": [
+ "data_digest_mismatch_oi"
+ ],
+ "osd": 0
+ },
+ {
+ "data_digest": "0x2ddbf8f5",
+ "omap_digest": "0xf8e11918",
+ "size": 7,
+ "errors": [],
+ "osd": 1
+ }
+ ],
+ "selected_object_info": "3:e97ce31e:::ROBJ2:head(47'56 osd.0.0:55 dirty|omap|data_digest|omap_digest s 7 uv 6 dd 2ddbf8f5 od f8e11918 alloc_hint [0 0 0])",
+ "union_shard_errors": [
+ "data_digest_mismatch_oi"
+ ],
+ "errors": [
+ "data_digest_mismatch"
+ ],
+ "object": {
+ "version": 6,
+ "snap": "head",
+ "locator": "",
+ "nspace": "",
+ "name": "ROBJ2"
+ }
+ },
+ {
+ "shards": [
+ {
+ "data_digest": "0x2ddbf8f5",
+ "omap_digest": "0x00b35dfd",
+ "size": 7,
+ "errors": [],
+ "osd": 0
+ },
+ {
+ "errors": [
+ "missing"
+ ],
+ "osd": 1
+ }
+ ],
+ "selected_object_info": "3:f2a5b2a4:::ROBJ3:head(47'57 osd.0.0:56 dirty|omap|data_digest|omap_digest s 7 uv 9 dd 2ddbf8f5 od b35dfd alloc_hint [0 0 0])",
+ "union_shard_errors": [
+ "missing"
+ ],
+ "errors": [],
+ "object": {
+ "version": 9,
+ "snap": "head",
+ "locator": "",
+ "nspace": "",
+ "name": "ROBJ3"
+ }
+ },
+ {
+ "shards": [
+ {
+ "data_digest": "0x2ddbf8f5",
+ "omap_digest": "0xd7178dfe",
+ "size": 7,
+ "errors": [
+ "omap_digest_mismatch_oi"
+ ],
+ "osd": 0
+ },
+ {
+ "data_digest": "0x2ddbf8f5",
+ "omap_digest": "0xe2d46ea4",
+ "size": 7,
+ "errors": [],
+ "osd": 1
+ }
+ ],
+ "selected_object_info": "3:f4981d31:::ROBJ4:head(47'58 osd.0.0:57 dirty|omap|data_digest|omap_digest s 7 uv 12 dd 2ddbf8f5 od e2d46ea4 alloc_hint [0 0 0])",
+ "union_shard_errors": [
+ "omap_digest_mismatch_oi"
+ ],
+ "errors": [
+ "omap_digest_mismatch"
+ ],
+ "object": {
+ "version": 12,
+ "snap": "head",
+ "locator": "",
+ "nspace": "",
+ "name": "ROBJ4"
+ }
+ },
+ {
+ "shards": [
+ {
+ "data_digest": "0x2ddbf8f5",
+ "omap_digest": "0x1a862a41",
+ "size": 7,
+ "errors": [],
+ "osd": 0
+ },
+ {
+ "data_digest": "0x2ddbf8f5",
+ "omap_digest": "0x06cac8f6",
+ "size": 7,
+ "errors": [
+ "omap_digest_mismatch_oi"
+ ],
+ "osd": 1
+ }
+ ],
+ "selected_object_info": "3:f4bfd4d1:::ROBJ5:head(47'59 osd.0.0:58 dirty|omap|data_digest|omap_digest s 7 uv 15 dd 2ddbf8f5 od 1a862a41 alloc_hint [0 0 0])",
+ "union_shard_errors": [
+ "omap_digest_mismatch_oi"
+ ],
+ "errors": [
+ "omap_digest_mismatch"
+ ],
+ "object": {
+ "version": 15,
+ "snap": "head",
+ "locator": "",
+ "nspace": "",
+ "name": "ROBJ5"
+ }
+ },
+ {
+ "shards": [
+ {
+ "data_digest": "0x2ddbf8f5",
+ "omap_digest": "0x689ee887",
+ "size": 7,
+ "errors": [
+ "omap_digest_mismatch_oi"
+ ],
+ "osd": 0
+ },
+ {
+ "data_digest": "0x2ddbf8f5",
+ "omap_digest": "0x179c919f",
+ "size": 7,
+ "errors": [],
+ "osd": 1
+ }
+ ],
+ "selected_object_info": "3:a53c12e8:::ROBJ6:head(47'50 osd.0.0:49 dirty|omap|data_digest|omap_digest s 7 uv 18 dd 2ddbf8f5 od 179c919f alloc_hint [0 0 0])",
+ "union_shard_errors": [
+ "omap_digest_mismatch_oi"
+ ],
+ "errors": [
+ "omap_digest_mismatch"
+ ],
+ "object": {
+ "version": 18,
+ "snap": "head",
+ "locator": "",
+ "nspace": "",
+ "name": "ROBJ6"
+ }
+ },
+ {
+ "shards": [
+ {
+ "data_digest": "0x2ddbf8f5",
+ "omap_digest": "0xefced57a",
+ "size": 7,
+ "errors": [],
+ "osd": 0
+ },
+ {
+ "data_digest": "0x2ddbf8f5",
+ "omap_digest": "0x6a73cc07",
+ "size": 7,
+ "errors": [
+ "omap_digest_mismatch_oi"
+ ],
+ "osd": 1
+ }
+ ],
+ "selected_object_info": "3:8b55fa4b:::ROBJ7:head(47'49 osd.0.0:48 dirty|omap|data_digest|omap_digest s 7 uv 21 dd 2ddbf8f5 od efced57a alloc_hint [0 0 0])",
+ "union_shard_errors": [
+ "omap_digest_mismatch_oi"
+ ],
+ "errors": [
+ "omap_digest_mismatch"
+ ],
+ "object": {
+ "version": 21,
+ "snap": "head",
+ "locator": "",
+ "nspace": "",
+ "name": "ROBJ7"
+ }
+ },
+ {
+ "shards": [
+ {
+ "attrs": [
+ {
+ "Base64": true,
+ "value": "",
+ "name": "_"
+ },
+ {
+ "Base64": false,
+ "value": "bad-val",
+ "name": "_key1-ROBJ8"
+ },
+ {
+ "Base64": false,
+ "value": "val3-ROBJ8",
+ "name": "_key3-ROBJ8"
+ },
+ {
+ "Base64": true,
+ "value": "AwIdAAAAAAAAAAAAAAABAAAAAAAAAAAAAAAAAAAAAAAAAAA=",
+ "name": "snapset"
+ }
+ ],
+ "data_digest": "0x2ddbf8f5",
+ "omap_digest": "0xd6be81dc",
+ "size": 7,
+ "errors": [],
+ "osd": 0
+ },
+ {
+ "attrs": [
+ {
+ "Base64": true,
+ "value": "",
+ "name": "_"
+ },
+ {
+ "Base64": false,
+ "value": "val1-ROBJ8",
+ "name": "_key1-ROBJ8"
+ },
+ {
+ "Base64": false,
+ "value": "val2-ROBJ8",
+ "name": "_key2-ROBJ8"
+ },
+ {
+ "Base64": true,
+ "value": "AwIdAAAAAAAAAAAAAAABAAAAAAAAAAAAAAAAAAAAAAAAAAA=",
+ "name": "snapset"
+ }
+ ],
+ "data_digest": "0x2ddbf8f5",
+ "omap_digest": "0xd6be81dc",
+ "size": 7,
+ "errors": [],
+ "osd": 1
+ }
+ ],
+ "selected_object_info": "3:86586531:::ROBJ8:head(82'62 client.4351.0:1 dirty|omap|data_digest|omap_digest s 7 uv 62 dd 2ddbf8f5 od d6be81dc alloc_hint [0 0 0])",
+ "union_shard_errors": [],
+ "errors": [
+ "attr_value_mismatch",
+ "attr_name_mismatch"
+ ],
+ "object": {
+ "version": 62,
+ "snap": "head",
+ "locator": "",
+ "nspace": "",
+ "name": "ROBJ8"
+ }
+ },
+ {
+ "shards": [
+ {
+ "attrs": [
+ {
+ "Base64": true,
+ "value": "",
+ "name": "_"
+ },
+ {
+ "Base64": true,
+ "value": "AwIdAAAAAAAAAAAAAAABAAAAAAAAAAAAAAAAAAAAAAAAAAA=",
+ "name": "snapset"
+ }
+ ],
+ "object_info": "3:ffdb2004:::ROBJ9:head(47'60 osd.0.0:59 dirty|omap|data_digest|omap_digest s 7 uv 27 dd 2ddbf8f5 od 2eecc539 alloc_hint [0 0 0])",
+ "data_digest": "0x1f26fb26",
+ "omap_digest": "0x2eecc539",
+ "size": 3,
+ "errors": [],
+ "osd": 0
+ },
+ {
+ "attrs": [
+ {
+ "Base64": true,
+ "value": "",
+ "name": "_"
+ },
+ {
+ "Base64": true,
+ "value": "AwIdAAAAAAAAAAAAAAABAAAAAAAAAAAAAAAAAAAAAAAAAAA=",
+ "name": "snapset"
+ }
+ ],
+ "object_info": "3:ffdb2004:::ROBJ9:head(122'64 client.4532.0:1 dirty|omap|data_digest|omap_digest s 3 uv 64 dd 1f26fb26 od 2eecc539 alloc_hint [0 0 0])",
+ "data_digest": "0x1f26fb26",
+ "omap_digest": "0x2eecc539",
+ "size": 3,
+ "errors": [],
+ "osd": 1
+ }
+ ],
+ "selected_object_info": "3:ffdb2004:::ROBJ9:head(122'64 client.4532.0:1 dirty|omap|data_digest|omap_digest s 3 uv 64 dd 1f26fb26 od 2eecc539 alloc_hint [0 0 0])",
+ "union_shard_errors": [],
+ "errors": [
+ "object_info_inconsistency",
+ "attr_value_mismatch"
+ ],
+ "object": {
+ "version": 64,
+ "snap": "head",
+ "locator": "",
+ "nspace": "",
+ "name": "ROBJ9"
+ }
+ }
+ ],
+ "epoch": 0
+}
+EOF
+
+ jq "$jqfilter" $dir/json | python -c "$sortkeys" | sed -e "$sedfilter" > $dir/csjson
+ diff ${DIFFCOLOPTS} $dir/checkcsjson $dir/csjson || test $getjson = "yes" || return 1
+ if test $getjson = "yes"
+ then
+ jq '.' $dir/json > save2.json
+ fi
+
+ if which jsonschema > /dev/null;
+ then
+ jsonschema -i $dir/json $CEPH_ROOT/doc/rados/command/list-inconsistent-obj.json || return 1
+ fi
+
+ rados rmpool $poolname $poolname --yes-i-really-really-mean-it
+ teardown $dir || return 1
+}
+
+
+#
+# Test scrub errors for an erasure coded pool
+#
+function corrupt_scrub_erasure() {
+ local dir=$1
+ local allow_overwrites=$2
+ local poolname=ecpool
+ local total_objs=5
+
+ setup $dir || return 1
+ run_mon $dir a || return 1
+ run_mgr $dir x || return 1
+ for id in $(seq 0 2) ; do
+ if [ "$allow_overwrites" = "true" ]; then
+ run_osd_bluestore $dir $id || return 1
+ else
+ run_osd $dir $id || return 1
+ fi
+ done
+ create_rbd_pool || return 1
+ ceph osd pool create foo 1
+
+ create_ec_pool $poolname $allow_overwrites k=2 m=1 stripe_unit=2K --force || return 1
+ wait_for_clean || return 1
+
+ for i in $(seq 1 $total_objs) ; do
+ objname=EOBJ${i}
+ add_something $dir $poolname $objname || return 1
+
+ local osd=$(expr $i % 2)
+
+ case $i in
+ 1)
+ # Size (deep scrub data_digest too)
+ local payload=UVWXYZZZ
+ echo $payload > $dir/CORRUPT
+ objectstore_tool $dir $osd $objname set-bytes $dir/CORRUPT || return 1
+ ;;
+
+ 2)
+ # Corrupt EC shard
+ dd if=/dev/urandom of=$dir/CORRUPT bs=2048 count=1
+ objectstore_tool $dir $osd $objname set-bytes $dir/CORRUPT || return 1
+ ;;
+
+ 3)
+ # missing
+ objectstore_tool $dir $osd $objname remove || return 1
+ ;;
+
+ 4)
+ rados --pool $poolname setxattr $objname key1-$objname val1-$objname || return 1
+ rados --pool $poolname setxattr $objname key2-$objname val2-$objname || return 1
+
+ # Break xattrs
+ echo -n bad-val > $dir/bad-val
+ objectstore_tool $dir $osd $objname set-attr _key1-$objname $dir/bad-val || return 1
+ objectstore_tool $dir $osd $objname rm-attr _key2-$objname || return 1
+ echo -n val3-$objname > $dir/newval
+ objectstore_tool $dir $osd $objname set-attr _key3-$objname $dir/newval || return 1
+ rm $dir/bad-val $dir/newval
+ ;;
+
+ 5)
+ # Corrupt EC shard
+ dd if=/dev/urandom of=$dir/CORRUPT bs=2048 count=2
+ objectstore_tool $dir $osd $objname set-bytes $dir/CORRUPT || return 1
+ ;;
+
+ esac
+ done
+
+ local pg=$(get_pg $poolname EOBJ0)
+
+ pg_scrub $pg
+
+ rados list-inconsistent-pg $poolname > $dir/json || return 1
+ # Check pg count
+ test $(jq '. | length' $dir/json) = "1" || return 1
+ # Check pgid
+ test $(jq -r '.[0]' $dir/json) = $pg || return 1
+
+ rados list-inconsistent-obj $pg > $dir/json || return 1
+ # Get epoch for repair-get requests
+ epoch=$(jq .epoch $dir/json)
+
+ jq "$jqfilter" << EOF | python -c "$sortkeys" | sed -e "$sedfilter" > $dir/checkcsjson
+{
+ "inconsistents": [
+ {
+ "shards": [
+ {
+ "size": 2048,
+ "errors": [],
+ "shard": 2,
+ "osd": 0
+ },
+ {
+ "size": 9,
+ "shard": 0,
+ "errors": [
+ "size_mismatch_oi"
+ ],
+ "osd": 1
+ },
+ {
+ "size": 2048,
+ "shard": 1,
+ "errors": [],
+ "osd": 2
+ }
+ ],
+ "selected_object_info": "3:9175b684:::EOBJ1:head(21'1 client.4179.0:1 dirty|data_digest|omap_digest s 7 uv 1 dd 2ddbf8f5 od ffffffff alloc_hint [0 0 0])",
+ "union_shard_errors": [
+ "size_mismatch_oi"
+ ],
+ "errors": [
+ "size_mismatch"
+ ],
+ "object": {
+ "version": 1,
+ "snap": "head",
+ "locator": "",
+ "nspace": "",
+ "name": "EOBJ1"
+ }
+ },
+ {
+ "shards": [
+ {
+ "size": 2048,
+ "errors": [],
+ "shard": 2,
+ "osd": 0
+ },
+ {
+ "shard": 0,
+ "errors": [
+ "missing"
+ ],
+ "osd": 1
+ },
+ {
+ "size": 2048,
+ "shard": 1,
+ "errors": [],
+ "osd": 2
+ }
+ ],
+ "selected_object_info": "3:b197b25d:::EOBJ3:head(37'3 client.4251.0:1 dirty|data_digest|omap_digest s 7 uv 3 dd 2ddbf8f5 od ffffffff alloc_hint [0 0 0])",
+ "union_shard_errors": [
+ "missing"
+ ],
+ "errors": [],
+ "object": {
+ "version": 3,
+ "snap": "head",
+ "locator": "",
+ "nspace": "",
+ "name": "EOBJ3"
+ }
+ },
+ {
+ "shards": [
+ {
+ "attrs": [
+ {
+ "Base64": true,
+ "value": "",
+ "name": "_"
+ },
+ {
+ "Base64": false,
+ "value": "bad-val",
+ "name": "_key1-EOBJ4"
+ },
+ {
+ "Base64": false,
+ "value": "val3-EOBJ4",
+ "name": "_key3-EOBJ4"
+ },
+ {
+ "Base64": true,
+ "value": "AQEYAAAAAAgAAAAAAAADAAAAL6fPBLB8dlsvp88E",
+ "name": "hinfo_key"
+ },
+ {
+ "Base64": true,
+ "value": "AwIdAAAAAAAAAAAAAAABAAAAAAAAAAAAAAAAAAAAAAAAAAA=",
+ "name": "snapset"
+ }
+ ],
+ "size": 2048,
+ "errors": [],
+ "shard": 2,
+ "osd": 0
+ },
+ {
+ "osd": 1,
+ "shard": 0,
+ "errors": [],
+ "size": 2048,
+ "attrs": [
+ {
+ "Base64": true,
+ "value": "",
+ "name": "_"
+ },
+ {
+ "Base64": false,
+ "value": "val1-EOBJ4",
+ "name": "_key1-EOBJ4"
+ },
+ {
+ "Base64": false,
+ "value": "val2-EOBJ4",
+ "name": "_key2-EOBJ4"
+ },
+ {
+ "Base64": true,
+ "value": "AQEYAAAAAAgAAAAAAAADAAAAL6fPBLB8dlsvp88E",
+ "name": "hinfo_key"
+ },
+ {
+ "Base64": true,
+ "value": "AwIdAAAAAAAAAAAAAAABAAAAAAAAAAAAAAAAAAAAAAAAAAA=",
+ "name": "snapset"
+ }
+ ]
+ },
+ {
+ "osd": 2,
+ "shard": 1,
+ "errors": [],
+ "size": 2048,
+ "attrs": [
+ {
+ "Base64": true,
+ "value": "",
+ "name": "_"
+ },
+ {
+ "Base64": false,
+ "value": "val1-EOBJ4",
+ "name": "_key1-EOBJ4"
+ },
+ {
+ "Base64": false,
+ "value": "val2-EOBJ4",
+ "name": "_key2-EOBJ4"
+ },
+ {
+ "Base64": true,
+ "value": "AQEYAAAAAAgAAAAAAAADAAAAL6fPBLB8dlsvp88E",
+ "name": "hinfo_key"
+ },
+ {
+ "Base64": true,
+ "value": "AwIdAAAAAAAAAAAAAAABAAAAAAAAAAAAAAAAAAAAAAAAAAA=",
+ "name": "snapset"
+ }
+ ]
+ }
+ ],
+ "selected_object_info": "3:5e723e06:::EOBJ4:head(45'6 client.4289.0:1 dirty|data_digest|omap_digest s 7 uv 6 dd 2ddbf8f5 od ffffffff alloc_hint [0 0 0])",
+ "union_shard_errors": [],
+ "errors": [
+ "attr_value_mismatch",
+ "attr_name_mismatch"
+ ],
+ "object": {
+ "version": 6,
+ "snap": "head",
+ "locator": "",
+ "nspace": "",
+ "name": "EOBJ4"
+ }
+ },
+ {
+ "shards": [
+ {
+ "size": 2048,
+ "errors": [],
+ "shard": 2,
+ "osd": 0
+ },
+ {
+ "size": 4096,
+ "shard": 0,
+ "errors": [
+ "size_mismatch_oi"
+ ],
+ "osd": 1
+ },
+ {
+ "size": 2048,
+ "shard": 1,
+ "errors": [],
+ "osd": 2
+ }
+ ],
+ "selected_object_info": "3:8549dfb5:::EOBJ5:head(65'7 client.4441.0:1 dirty|data_digest|omap_digest s 7 uv 7 dd 2ddbf8f5 od ffffffff alloc_hint [0 0 0])",
+ "union_shard_errors": [
+ "size_mismatch_oi"
+ ],
+ "errors": [
+ "size_mismatch"
+ ],
+ "object": {
+ "version": 7,
+ "snap": "head",
+ "locator": "",
+ "nspace": "",
+ "name": "EOBJ5"
+ }
+ }
+ ],
+ "epoch": 0
+}
+EOF
+
+ jq "$jqfilter" $dir/json | python -c "$sortkeys" | sed -e "$sedfilter" > $dir/csjson
+ diff ${DIFFCOLOPTS} $dir/checkcsjson $dir/csjson || test $getjson = "yes" || return 1
+ if test $getjson = "yes"
+ then
+ jq '.' $dir/json > save3.json
+ fi
+
+ if which jsonschema > /dev/null;
+ then
+ jsonschema -i $dir/json $CEPH_ROOT/doc/rados/command/list-inconsistent-obj.json || return 1
+ fi
+
+ pg_deep_scrub $pg
+
+ rados list-inconsistent-pg $poolname > $dir/json || return 1
+ # Check pg count
+ test $(jq '. | length' $dir/json) = "1" || return 1
+ # Check pgid
+ test $(jq -r '.[0]' $dir/json) = $pg || return 1
+
+ rados list-inconsistent-obj $pg > $dir/json || return 1
+ # Get epoch for repair-get requests
+ epoch=$(jq .epoch $dir/json)
+
+ if [ "$allow_overwrites" = "true" ]
+ then
+ jq "$jqfilter" << EOF | python -c "$sortkeys" | sed -e "$sedfilter" > $dir/checkcsjson
+{
+ "inconsistents": [
+ {
+ "shards": [
+ {
+ "data_digest": "0x00000000",
+ "omap_digest": "0xffffffff",
+ "size": 2048,
+ "errors": [],
+ "shard": 2,
+ "osd": 0
+ },
+ {
+ "size": 9,
+ "shard": 0,
+ "errors": [
+ "read_error",
+ "size_mismatch_oi"
+ ],
+ "osd": 1
+ },
+ {
+ "data_digest": "0x00000000",
+ "omap_digest": "0xffffffff",
+ "size": 2048,
+ "shard": 1,
+ "errors": [],
+ "osd": 2
+ }
+ ],
+ "selected_object_info": "3:9175b684:::EOBJ1:head(27'1 client.4155.0:1 dirty|data_digest|omap_digest s 7 uv 1 dd 2ddbf8f5 od ffffffff alloc_hint [0 0 0])",
+ "union_shard_errors": [
+ "read_error",
+ "size_mismatch_oi"
+ ],
+ "errors": [
+ "size_mismatch"
+ ],
+ "object": {
+ "version": 1,
+ "snap": "head",
+ "locator": "",
+ "nspace": "",
+ "name": "EOBJ1"
+ }
+ },
+ {
+ "shards": [
+ {
+ "data_digest": "0x00000000",
+ "omap_digest": "0xffffffff",
+ "size": 2048,
+ "errors": [],
+ "shard": 2,
+ "osd": 0
+ },
+ {
+ "shard": 0,
+ "errors": [
+ "missing"
+ ],
+ "osd": 1
+ },
+ {
+ "data_digest": "0x00000000",
+ "omap_digest": "0xffffffff",
+ "size": 2048,
+ "shard": 1,
+ "errors": [],
+ "osd": 2
+ }
+ ],
+ "selected_object_info": "3:b197b25d:::EOBJ3:head(41'3 client.4199.0:1 dirty|data_digest|omap_digest s 7 uv 3 dd 2ddbf8f5 od ffffffff alloc_hint [0 0 0])",
+ "union_shard_errors": [
+ "missing"
+ ],
+ "errors": [],
+ "object": {
+ "version": 3,
+ "snap": "head",
+ "locator": "",
+ "nspace": "",
+ "name": "EOBJ3"
+ }
+ },
+ {
+ "shards": [
+ {
+ "attrs": [
+ {
+ "Base64": true,
+ "value": "",
+ "name": "_"
+ },
+ {
+ "Base64": false,
+ "value": "bad-val",
+ "name": "_key1-EOBJ4"
+ },
+ {
+ "Base64": false,
+ "value": "val3-EOBJ4",
+ "name": "_key3-EOBJ4"
+ },
+ {
+ "Base64": true,
+ "value": "AQEYAAAAAAgAAAAAAAADAAAAL6fPBLB8dlsvp88E",
+ "name": "hinfo_key"
+ },
+ {
+ "Base64": true,
+ "value": "AwIdAAAAAAAAAAAAAAABAAAAAAAAAAAAAAAAAAAAAAAAAAA=",
+ "name": "snapset"
+ }
+ ],
+ "data_digest": "0x00000000",
+ "omap_digest": "0xffffffff",
+ "size": 2048,
+ "errors": [],
+ "shard": 2,
+ "osd": 0
+ },
+ {
+ "attrs": [
+ {
+ "Base64": true,
+ "value": "",
+ "name": "_"
+ },
+ {
+ "Base64": false,
+ "value": "val1-EOBJ4",
+ "name": "_key1-EOBJ4"
+ },
+ {
+ "Base64": false,
+ "value": "val2-EOBJ4",
+ "name": "_key2-EOBJ4"
+ },
+ {
+ "Base64": true,
+ "value": "AQEYAAAAAAgAAAAAAAADAAAAL6fPBLB8dlsvp88E",
+ "name": "hinfo_key"
+ },
+ {
+ "Base64": true,
+ "value": "AwIdAAAAAAAAAAAAAAABAAAAAAAAAAAAAAAAAAAAAAAAAAA=",
+ "name": "snapset"
+ }
+ ],
+ "data_digest": "0x00000000",
+ "omap_digest": "0xffffffff",
+ "size": 2048,
+ "errors": [],
+ "shard": 0,
+ "osd": 1
+ },
+ {
+ "attrs": [
+ {
+ "Base64": true,
+ "value": "",
+ "name": "_"
+ },
+ {
+ "Base64": false,
+ "value": "val1-EOBJ4",
+ "name": "_key1-EOBJ4"
+ },
+ {
+ "Base64": false,
+ "value": "val2-EOBJ4",
+ "name": "_key2-EOBJ4"
+ },
+ {
+ "Base64": true,
+ "value": "AQEYAAAAAAgAAAAAAAADAAAAL6fPBLB8dlsvp88E",
+ "name": "hinfo_key"
+ },
+ {
+ "Base64": true,
+ "value": "AwIdAAAAAAAAAAAAAAABAAAAAAAAAAAAAAAAAAAAAAAAAAA=",
+ "name": "snapset"
+ }
+ ],
+ "data_digest": "0x00000000",
+ "omap_digest": "0xffffffff",
+ "size": 2048,
+ "errors": [],
+ "shard": 1,
+ "osd": 2
+ }
+ ],
+ "selected_object_info": "3:5e723e06:::EOBJ4:head(48'6 client.4223.0:1 dirty|data_digest|omap_digest s 7 uv 6 dd 2ddbf8f5 od ffffffff alloc_hint [0 0 0])",
+ "union_shard_errors": [],
+ "errors": [
+ "attr_value_mismatch",
+ "attr_name_mismatch"
+ ],
+ "object": {
+ "version": 6,
+ "snap": "head",
+ "locator": "",
+ "nspace": "",
+ "name": "EOBJ4"
+ }
+ },
+ {
+ "shards": [
+ {
+ "data_digest": "0x00000000",
+ "omap_digest": "0xffffffff",
+ "size": 2048,
+ "errors": [],
+ "shard": 2,
+ "osd": 0
+ },
+ {
+ "data_digest": "0x00000000",
+ "omap_digest": "0xffffffff",
+ "size": 4096,
+ "errors": [
+ "size_mismatch_oi"
+ ],
+ "shard": 0,
+ "osd": 1
+ },
+ {
+ "data_digest": "0x00000000",
+ "omap_digest": "0xffffffff",
+ "size": 2048,
+ "errors": [],
+ "shard": 1,
+ "osd": 2
+ }
+ ],
+ "selected_object_info": "3:8549dfb5:::EOBJ5:head(65'7 client.4288.0:1 dirty|data_digest|omap_digest s 7 uv 7 dd 2ddbf8f5 od ffffffff alloc_hint [0 0 0])",
+ "union_shard_errors": [
+ "size_mismatch_oi"
+ ],
+ "errors": [
+ "size_mismatch"
+ ],
+ "object": {
+ "version": 7,
+ "snap": "head",
+ "locator": "",
+ "nspace": "",
+ "name": "EOBJ5"
+ }
+ }
+ ],
+ "epoch": 0
+}
+EOF
+
+ else
+
+ jq "$jqfilter" << EOF | python -c "$sortkeys" | sed -e "$sedfilter" > $dir/checkcsjson
+{
+ "inconsistents": [
+ {
+ "shards": [
+ {
+ "data_digest": "0x04cfa72f",
+ "omap_digest": "0xffffffff",
+ "size": 2048,
+ "errors": [],
+ "shard": 2,
+ "osd": 0
+ },
+ {
+ "size": 9,
+ "shard": 0,
+ "errors": [
+ "read_error",
+ "size_mismatch_oi"
+ ],
+ "osd": 1
+ },
+ {
+ "data_digest": "0x04cfa72f",
+ "omap_digest": "0xffffffff",
+ "size": 2048,
+ "shard": 1,
+ "errors": [],
+ "osd": 2
+ }
+ ],
+ "selected_object_info": "3:9175b684:::EOBJ1:head(21'1 client.4179.0:1 dirty|data_digest|omap_digest s 7 uv 1 dd 2ddbf8f5 od ffffffff alloc_hint [0 0 0])",
+ "union_shard_errors": [
+ "read_error",
+ "size_mismatch_oi"
+ ],
+ "errors": [
+ "size_mismatch"
+ ],
+ "object": {
+ "version": 1,
+ "snap": "head",
+ "locator": "",
+ "nspace": "",
+ "name": "EOBJ1"
+ }
+ },
+ {
+ "shards": [
+ {
+ "size": 2048,
+ "errors": [
+ "ec_hash_error"
+ ],
+ "shard": 2,
+ "osd": 0
+ },
+ {
+ "data_digest": "0x04cfa72f",
+ "omap_digest": "0xffffffff",
+ "size": 2048,
+ "errors": [],
+ "shard": 0,
+ "osd": 1
+ },
+ {
+ "data_digest": "0x04cfa72f",
+ "omap_digest": "0xffffffff",
+ "size": 2048,
+ "errors": [],
+ "shard": 1,
+ "osd": 2
+ }
+ ],
+ "selected_object_info": "3:9babd184:::EOBJ2:head(29'2 client.4217.0:1 dirty|data_digest|omap_digest s 7 uv 2 dd 2ddbf8f5 od ffffffff alloc_hint [0 0 0])",
+ "union_shard_errors": [
+ "ec_hash_error"
+ ],
+ "errors": [],
+ "object": {
+ "version": 2,
+ "snap": "head",
+ "locator": "",
+ "nspace": "",
+ "name": "EOBJ2"
+ }
+ },
+ {
+ "shards": [
+ {
+ "data_digest": "0x04cfa72f",
+ "omap_digest": "0xffffffff",
+ "size": 2048,
+ "errors": [],
+ "shard": 2,
+ "osd": 0
+ },
+ {
+ "osd": 1,
+ "shard": 0,
+ "errors": [
+ "missing"
+ ]
+ },
+ {
+ "data_digest": "0x04cfa72f",
+ "omap_digest": "0xffffffff",
+ "size": 2048,
+ "shard": 1,
+ "errors": [],
+ "osd": 2
+ }
+ ],
+ "selected_object_info": "3:b197b25d:::EOBJ3:head(37'3 client.4251.0:1 dirty|data_digest|omap_digest s 7 uv 3 dd 2ddbf8f5 od ffffffff alloc_hint [0 0 0])",
+ "union_shard_errors": [
+ "missing"
+ ],
+ "errors": [],
+ "object": {
+ "version": 3,
+ "snap": "head",
+ "locator": "",
+ "nspace": "",
+ "name": "EOBJ3"
+ }
+ },
+ {
+ "shards": [
+ {
+ "attrs": [
+ {
+ "Base64": true,
+ "value": "",
+ "name": "_"
+ },
+ {
+ "Base64": false,
+ "value": "bad-val",
+ "name": "_key1-EOBJ4"
+ },
+ {
+ "Base64": false,
+ "value": "val3-EOBJ4",
+ "name": "_key3-EOBJ4"
+ },
+ {
+ "Base64": true,
+ "value": "AQEYAAAAAAgAAAAAAAADAAAAL6fPBLB8dlsvp88E",
+ "name": "hinfo_key"
+ },
+ {
+ "Base64": true,
+ "value": "AwIdAAAAAAAAAAAAAAABAAAAAAAAAAAAAAAAAAAAAAAAAAA=",
+ "name": "snapset"
+ }
+ ],
+ "data_digest": "0x04cfa72f",
+ "omap_digest": "0xffffffff",
+ "size": 2048,
+ "errors": [],
+ "shard": 2,
+ "osd": 0
+ },
+ {
+ "osd": 1,
+ "shard": 0,
+ "errors": [],
+ "size": 2048,
+ "omap_digest": "0xffffffff",
+ "data_digest": "0x04cfa72f",
+ "attrs": [
+ {
+ "Base64": true,
+ "value": "",
+ "name": "_"
+ },
+ {
+ "Base64": false,
+ "value": "val1-EOBJ4",
+ "name": "_key1-EOBJ4"
+ },
+ {
+ "Base64": false,
+ "value": "val2-EOBJ4",
+ "name": "_key2-EOBJ4"
+ },
+ {
+ "Base64": true,
+ "value": "AQEYAAAAAAgAAAAAAAADAAAAL6fPBLB8dlsvp88E",
+ "name": "hinfo_key"
+ },
+ {
+ "Base64": true,
+ "value": "AwIdAAAAAAAAAAAAAAABAAAAAAAAAAAAAAAAAAAAAAAAAAA=",
+ "name": "snapset"
+ }
+ ]
+ },
+ {
+ "osd": 2,
+ "shard": 1,
+ "errors": [],
+ "size": 2048,
+ "omap_digest": "0xffffffff",
+ "data_digest": "0x04cfa72f",
+ "attrs": [
+ {
+ "Base64": true,
+ "value": "",
+ "name": "_"
+ },
+ {
+ "Base64": false,
+ "value": "val1-EOBJ4",
+ "name": "_key1-EOBJ4"
+ },
+ {
+ "Base64": false,
+ "value": "val2-EOBJ4",
+ "name": "_key2-EOBJ4"
+ },
+ {
+ "Base64": true,
+ "value": "AQEYAAAAAAgAAAAAAAADAAAAL6fPBLB8dlsvp88E",
+ "name": "hinfo_key"
+ },
+ {
+ "Base64": true,
+ "value": "AwIdAAAAAAAAAAAAAAABAAAAAAAAAAAAAAAAAAAAAAAAAAA=",
+ "name": "snapset"
+ }
+ ]
+ }
+ ],
+ "selected_object_info": "3:5e723e06:::EOBJ4:head(45'6 client.4289.0:1 dirty|data_digest|omap_digest s 7 uv 6 dd 2ddbf8f5 od ffffffff alloc_hint [0 0 0])",
+ "union_shard_errors": [],
+ "errors": [
+ "attr_value_mismatch",
+ "attr_name_mismatch"
+ ],
+ "object": {
+ "version": 6,
+ "snap": "head",
+ "locator": "",
+ "nspace": "",
+ "name": "EOBJ4"
+ }
+ },
+ {
+ "shards": [
+ {
+ "data_digest": "0x04cfa72f",
+ "omap_digest": "0xffffffff",
+ "size": 2048,
+ "errors": [],
+ "shard": 2,
+ "osd": 0
+ },
+ {
+ "size": 4096,
+ "shard": 0,
+ "errors": [
+ "size_mismatch_oi",
+ "ec_size_error"
+ ],
+ "osd": 1
+ },
+ {
+ "data_digest": "0x04cfa72f",
+ "omap_digest": "0xffffffff",
+ "size": 2048,
+ "shard": 1,
+ "errors": [],
+ "osd": 2
+ }
+ ],
+ "selected_object_info": "3:8549dfb5:::EOBJ5:head(65'7 client.4441.0:1 dirty|data_digest|omap_digest s 7 uv 7 dd 2ddbf8f5 od ffffffff alloc_hint [0 0 0])",
+ "union_shard_errors": [
+ "size_mismatch_oi",
+ "ec_size_error"
+ ],
+ "errors": [
+ "size_mismatch"
+ ],
+ "object": {
+ "version": 7,
+ "snap": "head",
+ "locator": "",
+ "nspace": "",
+ "name": "EOBJ5"
+ }
+ }
+ ],
+ "epoch": 0
+}
+EOF
+
+ fi
+
+ jq "$jqfilter" $dir/json | python -c "$sortkeys" | sed -e "$sedfilter" > $dir/csjson
+ diff ${DIFFCOLOPTS} $dir/checkcsjson $dir/csjson || test $getjson = "yes" || return 1
+ if test $getjson = "yes"
+ then
+ if [ "$allow_overwrites" = "true" ]
+ then
+ num=4
+ else
+ num=5
+ fi
+ jq '.' $dir/json > save${num}.json
+ fi
+
+ if which jsonschema > /dev/null;
+ then
+ jsonschema -i $dir/json $CEPH_ROOT/doc/rados/command/list-inconsistent-obj.json || return 1
+ fi
+
+ rados rmpool $poolname $poolname --yes-i-really-really-mean-it
+ teardown $dir || return 1
+}
+
+function TEST_corrupt_scrub_erasure_appends() {
+    # Run the shared erasure-coded corruption scenario with the
+    # allow_overwrites argument set to "false".
+    local dir=$1
+    corrupt_scrub_erasure $dir false
+}
+
+function TEST_corrupt_scrub_erasure_overwrites() {
+    # Only meaningful when $use_ec_overwrite is "true"; otherwise the test
+    # is a no-op that reports success.
+    [ "$use_ec_overwrite" = "true" ] || return 0
+
+    # Same scenario as the "appends" variant, with allow_overwrites="true".
+    corrupt_scrub_erasure $1 true
+}
+
+#
+# Test to make sure that a periodic scrub won't cause deep-scrub info to be lost
+#
+function TEST_periodic_scrub_replicated() {
+    # Scenario:
+    #   1. corrupt one replica so that only a deep-scrub can notice it
+    #   2. deep-scrub and check the inconsistency is reported
+    #   3. trigger a scheduled scrub: it must be upgraded to a deep-scrub
+    #   4. with nodeep-scrub set, a scheduled scrub must be skipped
+    #   5. an explicitly requested regular scrub runs and drops the
+    #      deep-scrub details
+    #
+    # $1 - test directory
+    # Returns 1 as soon as any step fails.
+    local dir=$1
+    local poolname=psr_pool
+    local objname=POBJ
+
+    setup $dir || return 1
+    run_mon $dir a --osd_pool_default_size=2 || return 1
+    run_mgr $dir x || return 1
+    local ceph_osd_args="--osd-scrub-interval-randomize-ratio=0 --osd-deep-scrub-randomize-ratio=0"
+    run_osd $dir 0 $ceph_osd_args || return 1
+    run_osd $dir 1 $ceph_osd_args || return 1
+    create_rbd_pool || return 1
+    wait_for_clean || return 1
+
+    ceph osd pool create $poolname 1 1 || return 1
+    wait_for_clean || return 1
+
+    local osd=0
+    add_something $dir $poolname $objname scrub || return 1
+    local primary=$(get_primary $poolname $objname)
+    local pg=$(get_pg $poolname $objname)
+
+    # Add deep-scrub only error
+    local payload=UVWXYZ
+    echo $payload > $dir/CORRUPT
+    # Uses $ceph_osd_args for osd restart
+    objectstore_tool $dir $osd $objname set-bytes $dir/CORRUPT || return 1
+
+    # No scrub information available, so expect failure
+    # (pipefail makes a failing rados fail the whole pipeline)
+    set -o pipefail
+    ! rados list-inconsistent-obj $pg | jq '.' || return 1
+    set +o pipefail
+
+    pg_deep_scrub $pg || return 1
+
+    # Make sure bad object found
+    rados list-inconsistent-obj $pg | jq '.' | grep -q $objname || return 1
+
+    local last_scrub=$(get_last_scrub_stamp $pg)
+    # Fake a schedule scrub
+    CEPH_ARGS='' ceph --admin-daemon $(get_asok_path osd.${primary}) \
+        trigger_scrub $pg || return 1
+    # Wait for schedule regular scrub
+    wait_for_scrub $pg "$last_scrub"
+
+    # It needed to be upgraded
+    grep -q "Deep scrub errors, upgrading scrub to deep-scrub" $dir/osd.${primary}.log || return 1
+
+    # Bad object still known
+    rados list-inconsistent-obj $pg | jq '.' | grep -q $objname || return 1
+
+    # Can't upgrade with this set
+    ceph osd set nodeep-scrub
+    # Let map change propagate to OSDs
+    sleep 2
+
+    # Fake a schedule scrub
+    local last_scrub=$(get_last_scrub_stamp $pg)
+    CEPH_ARGS='' ceph --admin-daemon $(get_asok_path osd.${primary}) \
+        trigger_scrub $pg || return 1
+    # Wait for schedule regular scrub
+    # to notice scrub and skip it
+    local found=false
+    for i in $(seq 14 -1 0)
+    do
+        sleep 1
+        ! grep -q "Regular scrub skipped due to deep-scrub errors and nodeep-scrub set" $dir/osd.${primary}.log || { found=true ; break; }
+        echo Time left: $i seconds
+    done
+    test $found = "true" || return 1
+
+    # Bad object still known
+    rados list-inconsistent-obj $pg | jq '.' | grep -q $objname || return 1
+
+    # Request a regular scrub and it will be done
+    # (backoff ratio is zeroed for the request and restored afterwards)
+    local scrub_backoff_ratio=$(get_config osd ${primary} osd_scrub_backoff_ratio)
+    set_config osd ${primary} osd_scrub_backoff_ratio 0
+    pg_scrub $pg
+    sleep 1
+    set_config osd ${primary} osd_scrub_backoff_ratio $scrub_backoff_ratio
+    grep -q "Regular scrub request, deep-scrub details will be lost" $dir/osd.${primary}.log || return 1
+
+    # deep-scrub error is no longer present.
+    # "grep -qv" would succeed as soon as ANY line lacks $objname, which is
+    # always the case for multi-line jq output, so it asserted nothing.
+    # Require instead that a report was produced and that none of its lines
+    # mention the object.
+    rados list-inconsistent-obj $pg | jq '.' > $dir/json
+    test -s $dir/json || return 1
+    ! grep -q $objname $dir/json || return 1
+}
+
+
+main osd-scrub-repair "$@"
+
+# Local Variables:
+# compile-command: "cd ../.. ; make -j4 && \
+# test/osd/osd-scrub-repair.sh # TEST_corrupt_and_repair_replicated"
+# End:
--- /dev/null
+#! /bin/bash
+#
+# Copyright (C) 2015 Red Hat <contact@redhat.com>
+#
+# Author: David Zafman <dzafman@redhat.com>
+#
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU Library Public License as published by
+# the Free Software Foundation; either version 2, or (at your option)
+# any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU Library Public License for more details.
+#
+source $CEPH_ROOT/qa/standalone/ceph-helpers.sh
+
+function run() {
+    # Run the TEST_* functions named in the remaining arguments against the
+    # directory given as $1; with no names given, run every TEST_* function
+    # currently defined in the shell. Stops at the first failing test and
+    # returns 1.
+    local dir=$1
+    shift
+
+    # git grep '\<7121\>' : there must be only one
+    CEPH_MON="127.0.0.1:7121"
+    CEPH_ARGS+="--fsid=$(uuidgen) --auth-supported=none --mon-host=$CEPH_MON "
+    export CEPH_MON CEPH_ARGS
+
+    # Default list: function names scraped from the output of `set`.
+    local funcs=${@:-$(set | sed -n -e 's/^\(TEST_[0-9a-z_]*\) .*/\1/p')}
+    for func in $funcs ; do
+        if ! $func $dir ; then
+            return 1
+        fi
+    done
+}
+
+function TEST_scrub_snaps() {
+ local dir=$1
+ local poolname=test
+
+ TESTDATA="testdata.$$"
+
+ setup $dir || return 1
+ run_mon $dir a --osd_pool_default_size=1 || return 1
+ run_mgr $dir x || return 1
+ run_osd $dir 0 || return 1
+
+ create_rbd_pool || return 1
+ wait_for_clean || return 1
+
+ # Create a pool with a single pg
+ ceph osd pool create $poolname 1 1
+ poolid=$(ceph osd dump | grep "^pool.*[']test[']" | awk '{ print $2 }')
+
+ dd if=/dev/urandom of=$TESTDATA bs=1032 count=1
+ for i in `seq 1 15`
+ do
+ rados -p $poolname put obj${i} $TESTDATA
+ done
+
+ SNAP=1
+ rados -p $poolname mksnap snap${SNAP}
+ dd if=/dev/urandom of=$TESTDATA bs=256 count=${SNAP}
+ rados -p $poolname put obj1 $TESTDATA
+ rados -p $poolname put obj5 $TESTDATA
+ rados -p $poolname put obj3 $TESTDATA
+ for i in `seq 6 14`
+ do rados -p $poolname put obj${i} $TESTDATA
+ done
+
+ SNAP=2
+ rados -p $poolname mksnap snap${SNAP}
+ dd if=/dev/urandom of=$TESTDATA bs=256 count=${SNAP}
+ rados -p $poolname put obj5 $TESTDATA
+
+ SNAP=3
+ rados -p $poolname mksnap snap${SNAP}
+ dd if=/dev/urandom of=$TESTDATA bs=256 count=${SNAP}
+ rados -p $poolname put obj3 $TESTDATA
+
+ SNAP=4
+ rados -p $poolname mksnap snap${SNAP}
+ dd if=/dev/urandom of=$TESTDATA bs=256 count=${SNAP}
+ rados -p $poolname put obj5 $TESTDATA
+ rados -p $poolname put obj2 $TESTDATA
+
+ SNAP=5
+ rados -p $poolname mksnap snap${SNAP}
+ SNAP=6
+ rados -p $poolname mksnap snap${SNAP}
+ dd if=/dev/urandom of=$TESTDATA bs=256 count=${SNAP}
+ rados -p $poolname put obj5 $TESTDATA
+
+ SNAP=7
+ rados -p $poolname mksnap snap${SNAP}
+
+ rados -p $poolname rm obj4
+ rados -p $poolname rm obj2
+
+ kill_daemons $dir TERM osd || return 1
+
+ # Don't need to ceph_objectstore_tool function because osd stopped
+
+ JSON="$(ceph-objectstore-tool --data-path $dir/0 --journal-path $dir/0/journal --head --op list obj1)"
+ ceph-objectstore-tool --data-path $dir/0 --journal-path $dir/0/journal "$JSON" --force remove
+
+ JSON="$(ceph-objectstore-tool --data-path $dir/0 --journal-path $dir/0/journal --op list obj5 | grep \"snapid\":2)"
+ ceph-objectstore-tool --data-path $dir/0 --journal-path $dir/0/journal "$JSON" remove
+
+ JSON="$(ceph-objectstore-tool --data-path $dir/0 --journal-path $dir/0/journal --op list obj5 | grep \"snapid\":1)"
+ OBJ5SAVE="$JSON"
+ ceph-objectstore-tool --data-path $dir/0 --journal-path $dir/0/journal "$JSON" remove
+
+ JSON="$(ceph-objectstore-tool --data-path $dir/0 --journal-path $dir/0/journal --op list obj5 | grep \"snapid\":4)"
+ dd if=/dev/urandom of=$TESTDATA bs=256 count=18
+ ceph-objectstore-tool --data-path $dir/0 --journal-path $dir/0/journal "$JSON" set-bytes $TESTDATA
+
+ JSON="$(ceph-objectstore-tool --data-path $dir/0 --journal-path $dir/0/journal --head --op list obj3)"
+ dd if=/dev/urandom of=$TESTDATA bs=256 count=15
+ ceph-objectstore-tool --data-path $dir/0 --journal-path $dir/0/journal "$JSON" set-bytes $TESTDATA
+
+ JSON="$(ceph-objectstore-tool --data-path $dir/0 --journal-path $dir/0/journal --op list obj4 | grep \"snapid\":7)"
+ ceph-objectstore-tool --data-path $dir/0 --journal-path $dir/0/journal "$JSON" remove
+
+ JSON="$(ceph-objectstore-tool --data-path $dir/0 --journal-path $dir/0/journal --head --op list obj2)"
+ ceph-objectstore-tool --data-path $dir/0 --journal-path $dir/0/journal "$JSON" rm-attr snapset
+
+ # Create a clone which isn't in snapset and doesn't have object info
+ JSON="$(echo "$OBJ5SAVE" | sed s/snapid\":1/snapid\":7/)"
+ dd if=/dev/urandom of=$TESTDATA bs=256 count=7
+ ceph-objectstore-tool --data-path $dir/0 --journal-path $dir/0/journal "$JSON" set-bytes $TESTDATA
+
+ rm -f $TESTDATA
+
+ JSON="$(ceph-objectstore-tool --data-path $dir/0 --journal-path $dir/0/journal --head --op list obj6)"
+ ceph-objectstore-tool --data-path $dir/0 --journal-path $dir/0/journal "$JSON" clear-snapset
+ JSON="$(ceph-objectstore-tool --data-path $dir/0 --journal-path $dir/0/journal --head --op list obj7)"
+ ceph-objectstore-tool --data-path $dir/0 --journal-path $dir/0/journal "$JSON" clear-snapset corrupt
+ JSON="$(ceph-objectstore-tool --data-path $dir/0 --journal-path $dir/0/journal --head --op list obj8)"
+ ceph-objectstore-tool --data-path $dir/0 --journal-path $dir/0/journal "$JSON" clear-snapset seq
+ JSON="$(ceph-objectstore-tool --data-path $dir/0 --journal-path $dir/0/journal --head --op list obj9)"
+ ceph-objectstore-tool --data-path $dir/0 --journal-path $dir/0/journal "$JSON" clear-snapset clone_size
+ JSON="$(ceph-objectstore-tool --data-path $dir/0 --journal-path $dir/0/journal --head --op list obj10)"
+ ceph-objectstore-tool --data-path $dir/0 --journal-path $dir/0/journal "$JSON" clear-snapset clone_overlap
+ JSON="$(ceph-objectstore-tool --data-path $dir/0 --journal-path $dir/0/journal --head --op list obj11)"
+ ceph-objectstore-tool --data-path $dir/0 --journal-path $dir/0/journal "$JSON" clear-snapset clones
+ JSON="$(ceph-objectstore-tool --data-path $dir/0 --journal-path $dir/0/journal --head --op list obj12)"
+ ceph-objectstore-tool --data-path $dir/0 --journal-path $dir/0/journal "$JSON" clear-snapset head
+ JSON="$(ceph-objectstore-tool --data-path $dir/0 --journal-path $dir/0/journal --head --op list obj13)"
+ ceph-objectstore-tool --data-path $dir/0 --journal-path $dir/0/journal "$JSON" clear-snapset snaps
+ JSON="$(ceph-objectstore-tool --data-path $dir/0 --journal-path $dir/0/journal --head --op list obj14)"
+ ceph-objectstore-tool --data-path $dir/0 --journal-path $dir/0/journal "$JSON" clear-snapset size
+
+ echo "garbage" > $dir/bad
+ JSON="$(ceph-objectstore-tool --data-path $dir/0 --journal-path $dir/0/journal --head --op list obj15)"
+ ceph-objectstore-tool --data-path $dir/0 --journal-path $dir/0/journal "$JSON" set-attr snapset $dir/bad
+ rm -f $dir/bad
+
+ run_osd $dir 0 || return 1
+ create_rbd_pool || return 1
+ wait_for_clean || return 1
+
+ local pgid="${poolid}.0"
+ if ! pg_scrub "$pgid" ; then
+ cat $dir/osd.0.log
+ return 1
+ fi
+ grep 'log_channel' $dir/osd.0.log
+
+ rados list-inconsistent-pg $poolname > $dir/json || return 1
+ # Check pg count
+ test $(jq '. | length' $dir/json) = "1" || return 1
+ # Check pgid
+ test $(jq -r '.[0]' $dir/json) = $pgid || return 1
+
+ rados list-inconsistent-snapset $pgid > $dir/json || return 1
+ test $(jq '.inconsistents | length' $dir/json) = "21" || return 1
+
+ local jqfilter='.inconsistents'
+ local sortkeys='import json; import sys ; JSON=sys.stdin.read() ; ud = json.loads(JSON) ; print json.dumps(ud, sort_keys=True, indent=2)'
+
+ jq "$jqfilter" << EOF | python -c "$sortkeys" > $dir/checkcsjson
+{
+ "inconsistents": [
+ {
+ "errors": [
+ "headless"
+ ],
+ "snap": 1,
+ "locator": "",
+ "nspace": "",
+ "name": "obj1"
+ },
+ {
+ "errors": [
+ "size_mismatch"
+ ],
+ "snap": 1,
+ "locator": "",
+ "nspace": "",
+ "name": "obj10"
+ },
+ {
+ "errors": [
+ "headless"
+ ],
+ "snap": 1,
+ "locator": "",
+ "nspace": "",
+ "name": "obj11"
+ },
+ {
+ "errors": [
+ "size_mismatch"
+ ],
+ "snap": 1,
+ "locator": "",
+ "nspace": "",
+ "name": "obj14"
+ },
+ {
+ "errors": [
+ "headless"
+ ],
+ "snap": 1,
+ "locator": "",
+ "nspace": "",
+ "name": "obj6"
+ },
+ {
+ "errors": [
+ "headless"
+ ],
+ "snap": 1,
+ "locator": "",
+ "nspace": "",
+ "name": "obj7"
+ },
+ {
+ "errors": [
+ "size_mismatch"
+ ],
+ "snap": 1,
+ "locator": "",
+ "nspace": "",
+ "name": "obj9"
+ },
+ {
+ "errors": [
+ "headless"
+ ],
+ "snap": 4,
+ "locator": "",
+ "nspace": "",
+ "name": "obj2"
+ },
+ {
+ "errors": [
+ "size_mismatch"
+ ],
+ "snap": 4,
+ "locator": "",
+ "nspace": "",
+ "name": "obj5"
+ },
+ {
+ "errors": [
+ "headless"
+ ],
+ "snap": 7,
+ "locator": "",
+ "nspace": "",
+ "name": "obj2"
+ },
+ {
+ "errors": [
+ "oi_attr_missing",
+ "headless"
+ ],
+ "snap": 7,
+ "locator": "",
+ "nspace": "",
+ "name": "obj5"
+ },
+ {
+ "extra clones": [
+ 1
+ ],
+ "errors": [
+ "extra_clones"
+ ],
+ "snap": "head",
+ "locator": "",
+ "nspace": "",
+ "name": "obj11"
+ },
+ {
+ "errors": [
+ "head_mismatch"
+ ],
+ "snap": "head",
+ "locator": "",
+ "nspace": "",
+ "name": "obj12"
+ },
+ {
+ "errors": [
+ "ss_attr_corrupted"
+ ],
+ "snap": "head",
+ "locator": "",
+ "nspace": "",
+ "name": "obj15"
+ },
+ {
+ "extra clones": [
+ 7,
+ 4
+ ],
+ "errors": [
+ "ss_attr_missing",
+ "extra_clones"
+ ],
+ "snap": "head",
+ "locator": "",
+ "nspace": "",
+ "name": "obj2"
+ },
+ {
+ "errors": [
+ "size_mismatch"
+ ],
+ "snap": "head",
+ "locator": "",
+ "nspace": "",
+ "name": "obj3"
+ },
+ {
+ "missing": [
+ 7
+ ],
+ "errors": [
+ "clone_missing"
+ ],
+ "snap": "head",
+ "locator": "",
+ "nspace": "",
+ "name": "obj4"
+ },
+ {
+ "missing": [
+ 2,
+ 1
+ ],
+ "extra clones": [
+ 7
+ ],
+ "errors": [
+ "extra_clones",
+ "clone_missing"
+ ],
+ "snap": "head",
+ "locator": "",
+ "nspace": "",
+ "name": "obj5"
+ },
+ {
+ "extra clones": [
+ 1
+ ],
+ "errors": [
+ "extra_clones"
+ ],
+ "snap": "head",
+ "locator": "",
+ "nspace": "",
+ "name": "obj6"
+ },
+ {
+ "extra clones": [
+ 1
+ ],
+ "errors": [
+ "head_mismatch",
+ "extra_clones"
+ ],
+ "snap": "head",
+ "locator": "",
+ "nspace": "",
+ "name": "obj7"
+ },
+ {
+ "errors": [
+ "snapset_mismatch"
+ ],
+ "snap": "head",
+ "locator": "",
+ "nspace": "",
+ "name": "obj8"
+ }
+ ],
+ "epoch": 20
+}
+EOF
+
+ jq "$jqfilter" $dir/json | python -c "$sortkeys" > $dir/csjson
+ diff ${DIFFCOLOPTS} $dir/checkcsjson $dir/csjson || return 1
+
+ if which jsonschema > /dev/null;
+ then
+ jsonschema -i $dir/json $CEPH_ROOT/doc/rados/command/list-inconsistent-snap.json || return 1
+ fi
+
+ for i in `seq 1 7`
+ do
+ rados -p $poolname rmsnap snap$i
+ done
+
+ ERRORS=0
+
+ pidfile=$(find $dir 2>/dev/null | grep $name_prefix'[^/]*\.pid')
+ pid=$(cat $pidfile)
+ if ! kill -0 $pid
+ then
+ echo "OSD crash occurred"
+ tail -100 $dir/osd.0.log
+ ERRORS=$(expr $ERRORS + 1)
+ fi
+
+ kill_daemons $dir || return 1
+
+ declare -a err_strings
+ err_strings[0]="log_channel[(]cluster[)] log [[]ERR[]] : scrub [0-9]*[.]0 .*::obj10:.* is missing in clone_overlap"
+ err_strings[1]="log_channel[(]cluster[)] log [[]ERR[]] : scrub [0-9]*[.]0 .*::obj5:7 no '_' attr"
+ err_strings[2]="log_channel[(]cluster[)] log [[]ERR[]] : scrub [0-9]*[.]0 .*::obj5:7 is an unexpected clone"
+ err_strings[3]="log_channel[(]cluster[)] log [[]ERR[]] : scrub [0-9]*[.]0 .*::obj5:4 on disk size [(]4608[)] does not match object info size [(]512[)] adjusted for ondisk to [(]512[)]"
+ err_strings[4]="log_channel[(]cluster[)] log [[]ERR[]] : scrub [0-9]*[.]0 .*:::obj5:head expected clone .*:::obj5:2"
+ err_strings[5]="log_channel[(]cluster[)] log [[]ERR[]] : scrub [0-9]*[.]0 .*:::obj5:head expected clone .*:::obj5:1"
+ err_strings[6]="log_channel[(]cluster[)] log [[]INF[]] : scrub [0-9]*[.]0 .*:::obj5:head 2 missing clone[(]s[)]"
+ err_strings[7]="log_channel[(]cluster[)] log [[]ERR[]] : scrub [0-9]*[.]0 .*:::obj12:head snapset.head_exists=false, but head exists"
+ err_strings[8]="log_channel[(]cluster[)] log [[]ERR[]] : scrub [0-9]*[.]0 .*:::obj8:head snaps.seq not set"
+ err_strings[9]="log_channel[(]cluster[)] log [[]ERR[]] : scrub [0-9]*[.]0 .*:::obj7:head snapset.head_exists=false, but head exists"
+ err_strings[10]="log_channel[(]cluster[)] log [[]ERR[]] : scrub [0-9]*[.]0 .*:::obj7:1 is an unexpected clone"
+ err_strings[11]="log_channel[(]cluster[)] log [[]ERR[]] : scrub [0-9]*[.]0 .*:::obj3:head on disk size [(]3840[)] does not match object info size [(]768[)] adjusted for ondisk to [(]768[)]"
+ err_strings[12]="log_channel[(]cluster[)] log [[]ERR[]] : scrub [0-9]*[.]0 .*:::obj6:1 is an unexpected clone"
+ err_strings[13]="log_channel[(]cluster[)] log [[]ERR[]] : scrub [0-9]*[.]0 .*:::obj2:head no 'snapset' attr"
+ err_strings[14]="log_channel[(]cluster[)] log [[]ERR[]] : scrub [0-9]*[.]0 .*:::obj2:7 clone ignored due to missing snapset"
+ err_strings[15]="log_channel[(]cluster[)] log [[]ERR[]] : scrub [0-9]*[.]0 .*:::obj2:4 clone ignored due to missing snapset"
+ err_strings[16]="log_channel[(]cluster[)] log [[]ERR[]] : scrub [0-9]*[.]0 .*:::obj4:head expected clone .*:::obj4:7"
+ err_strings[17]="log_channel[(]cluster[)] log [[]INF[]] : scrub [0-9]*[.]0 .*:::obj4:head 1 missing clone[(]s[)]"
+ err_strings[18]="log_channel[(]cluster[)] log [[]ERR[]] : scrub [0-9]*[.]0 .*:::obj1:1 is an unexpected clone"
+ err_strings[19]="log_channel[(]cluster[)] log [[]ERR[]] : scrub [0-9]*[.]0 .*:::obj9:1 is missing in clone_size"
+ err_strings[20]="log_channel[(]cluster[)] log [[]ERR[]] : scrub [0-9]*[.]0 .*:::obj11:1 is an unexpected clone"
+ err_strings[21]="log_channel[(]cluster[)] log [[]ERR[]] : scrub [0-9]*[.]0 .*:::obj14:1 size 1032 != clone_size 1033"
+ err_strings[22]="log_channel[(]cluster[)] log [[]ERR[]] : [0-9]*[.]0 scrub 23 errors"
+ err_strings[23]="log_channel[(]cluster[)] log [[]ERR[]] : scrub [0-9]*[.]0 .*:::obj15:head can't decode 'snapset' attr buffer"
+ err_strings[24]="log_channel[(]cluster[)] log [[]ERR[]] : scrub [0-9]*[.]0 .*:::obj12:1 has no oi or legacy_snaps; cannot convert 1=[[]1[]]:[[]1[]].stray_clone_snaps=[{]1=[[]1[]][}]"
+
+ # Verify that every expected message appears in the osd.0 log.
+ # Iterate over the array elements themselves: `seq 0 ${#err_strings[@]}`
+ # ran one index past the last element, and grepping for the resulting
+ # empty pattern matches unconditionally.
+ for err_string in "${err_strings[@]}"
+ do
+     if ! grep "$err_string" $dir/osd.0.log > /dev/null;
+     then
+         echo "Missing log message '$err_string'"
+         ERRORS=$(expr $ERRORS + 1)
+     fi
+ done
+
+ teardown $dir || return 1
+
+ if [ $ERRORS != "0" ];
+ then
+ echo "TEST FAILED WITH $ERRORS ERRORS"
+ return 1
+ fi
+
+ echo "TEST PASSED"
+ return 0
+}
+
+main osd-scrub-snaps "$@"
+
+# Local Variables:
+# compile-command: "cd ../.. ; make -j4 && \
+# test/osd/osd-scrub-snaps.sh"
overrides:
ceph:
log-whitelist:
- - wrongly marked me down
+ - but it is still running
- objects unfound and apparently lost
tasks:
- thrashosds:
+++ /dev/null
-../../../ceph-deploy-overrides
\ No newline at end of file
--- /dev/null
+overrides:
+ ceph-deploy:
+ dmcrypt: yes
--- /dev/null
+overrides:
+ ceph-deploy:
+ separate_journal_disk:
--- /dev/null
+overrides:
+ ceph-deploy:
+ separate_journal_disk: yes
--- /dev/null
+overrides:
+ ceph-deploy:
+ dmcrypt: yes
+ separate_journal_disk: yes
+++ /dev/null
-../../../config_options
\ No newline at end of file
--- /dev/null
+overrides:
+ ceph-deploy:
+ conf:
+ global:
+ mon pg warn min per osd: 2
+ osd pool default size: 2
--- /dev/null
+../../../../objectstore/bluestore.yaml
\ No newline at end of file
--- /dev/null
+../../../../objectstore/filestore-xfs.yaml
\ No newline at end of file
ceph:
log-whitelist:
- evicting unresponsive client
- - wrongly marked me down
+ - but it is still running
- slow request
tasks:
- error reading table object
- error reading sessionmap
- unmatched fragstat
+ - unmatched rstat
- was unreadable, recreating it now
- Scrub error on inode
- Metadata damage detected
overrides:
ceph:
log-whitelist:
- - wrongly marked me down
+ - but it is still running
- slow request
tasks:
- install:
- ceph:
log-whitelist:
- - wrongly marked me down
+ - but it is still running
- objects unfound and apparently lost
- thrashosds:
--- /dev/null
+tasks:
+- install:
+- ceph:
--- /dev/null
+overrides:
+ ceph:
+ conf:
+ osd:
+ osd backoff on peering: true
+ osd backoff on degraded: true
+ log-whitelist:
+ - wrongly marked me down
+ - objects unfound and apparently lost
+tasks:
+- thrashosds:
+ timeout: 1200
+ chance_pgnum_grow: 1
+ chance_pgpnum_fix: 1
+++ /dev/null
-tasks:
-- install:
-- ceph:
- log-whitelist:
- - wrongly marked me down
- - objects unfound and apparently lost
-- thrashosds:
tasks:
-- install:
-- ceph:
- mon_thrash:
revive_delay: 20
thrash_delay: 1
--- /dev/null
+overrides:
+ ceph:
+ log-whitelist:
+ - but it is still running
+ - objects unfound and apparently lost
+tasks:
+- thrashosds:
+ timeout: 1200
+ chance_pgnum_grow: 2
+ chance_pgpnum_fix: 1
--- /dev/null
+overrides:
+ ceph:
+ crush_tunables: optimal
+ conf:
+ mon:
+ mon osd initial require min compat client: luminous
+ log-whitelist:
+ - wrongly marked me down
+ - objects unfound and apparently lost
+tasks:
+- thrashosds:
+ timeout: 1200
+ chance_pgnum_grow: 1
+ chance_pgpnum_fix: 1
+ chance_thrash_pg_upmap: 3
+ chance_thrash_pg_upmap_items: 3
--- /dev/null
+../../../tasks/thrashosds-health.yaml
\ No newline at end of file
tasks:
- rbd_fio:
client.0:
- fio-io-size: 90%
+ fio-io-size: 100%
formats: [2]
features: [[layering,exclusive-lock]]
- io-engine: sync
+ io-engine: libaio
rw: randrw
+ bs: 1024
+ io-depth: 256
runtime: 1200
+++ /dev/null
-tasks:
-- rbd:
- all:
- image_size: 20480
-- workunit:
- clients:
- all:
- - suites/iozone.sh
ceph:
log-whitelist:
- reached quota
+ - (POOL_APP_NOT_ENABLED)
tasks:
- ceph-fuse:
- workunit:
--- /dev/null
+overrides:
+ ceph:
+ log-whitelist:
+ - (MDS_TRIM)
ceph:
log-whitelist:
- reached quota
- - wrongly marked me down
+ - but it is still running
- overall HEALTH_
- (POOL_FULL)
- (SMALLER_PGP_NUM)
- (CACHE_POOL_NO_HIT_SET)
- (CACHE_POOL_NEAR_FULL)
+ - (POOL_APP_NOT_ENABLED)
tasks:
- workunit:
clients:
overrides:
ceph:
log-whitelist:
- - wrongly marked me down
+ - but it is still running
- overall HEALTH_
- (OSDMAP_FLAGS)
- (PG_
overrides:
ceph:
log-whitelist:
- - wrongly marked me down
+ - but it is still running
+ - overall HEALTH_
+ - (POOL_APP_NOT_ENABLED)
tasks:
- workunit:
clients:
overrides:
ceph:
log-whitelist:
- - wrongly marked me down
+ - but it is still running
+ - overall HEALTH_
+ - (POOL_APP_NOT_ENABLED)
tasks:
- workunit:
clients:
overrides:
ceph:
log-whitelist:
- - wrongly marked me down
+ - but it is still running
+ - overall HEALTH_
+ - (POOL_APP_NOT_ENABLED)
tasks:
- workunit:
clients:
- scrub [0-9]+ errors
- 'size 1 != size'
- attr name mismatch
- - Regular scrub request, losing deep-scrub details
+ - Regular scrub request, deep-scrub details will be lost
- overall HEALTH_
- (OSDMAP_FLAGS)
- (OSD_
tasks:
- rgw:
client.0:
+- ceph_manager.wait_for_pools:
+ kwargs:
+ pools:
+ - .rgw.buckets
+ - .rgw.root
+ - default.rgw.control
+ - default.rgw.meta
+ - default.rgw.log
- thrash_pool_snaps:
pools:
- .rgw.buckets
- .rgw.root
- - .rgw.control
- - .rgw
- - .users.uid
- - .users.email
- - .users
+ - default.rgw.control
+ - default.rgw.meta
+ - default.rgw.log
- s3readwrite:
client.0:
rgw_server: client.0
ms inject delay probability: .005
ms inject delay max: 1
ms inject internal delays: .002
+ mgr:
+ debug monc: 10
ceph:
log-whitelist:
- slow request
+ - overall HEALTH_
+ - (POOL_APP_NOT_ENABLED)
tasks:
- exec:
client.0:
+overrides:
+ ceph:
+ log-whitelist:
+ - overall HEALTH_
+ - (POOL_APP_NOT_ENABLED)
tasks:
- exec:
client.0:
- (REQUEST_SLOW)
- (MON_DOWN)
- (PG_
+ - (POOL_APP_NOT_ENABLED)
conf:
global:
debug objecter: 20
overrides:
ceph:
log-whitelist:
- - wrongly marked me down
+ - but it is still running
- overall HEALTH_
- (PG_
- (MON_DOWN)
- (MGR_DOWN)
- exec:
mon.a:
- - ceph tell mgr.x restful create-key admin
- - ceph tell mgr.x restful create-self-signed-cert
+ - ceph restful create-key admin
+ - ceph restful create-self-signed-cert
- ceph.restart: [mgr.x]
- workunit:
clients:
- install:
- ceph:
log-whitelist:
- - wrongly marked me down
+ - but it is still running
- had wrong client addr
- had wrong cluster addr
- must scrub before tier agent can activate
- exec:
client.0:
- ceph osd pool create base-pool 4
+ - ceph osd pool application enable base-pool rados
- ceph osd pool create cache-pool 4
- ceph osd tier add base-pool cache-pool
- ceph osd tier cache-mode cache-pool writeback
client.0:
- ceph osd pool create ec-ca 1 1
- ceph osd pool create ec 1 1 erasure default
+ - ceph osd pool application enable ec rados
- ceph osd tier add ec ec-ca
- ceph osd tier cache-mode ec-ca readproxy
- ceph osd tier set-overlay ec ec-ca
osd max object name len: 400
osd max object namespace len: 64
log-whitelist:
- - wrongly marked me down
+ - but it is still running
- overall HEALTH_
- (OSDMAP_FLAGS)
- (OSD_
- exec:
client.0:
- sudo ceph osd pool create foo 64
+ - sudo ceph osd pool application enable foo rados
- rados -p foo bench 60 write -b 1024 --no-cleanup
- sudo ceph osd pool set foo size 3
- sudo ceph osd out 0 1
git_version:
help:
config show:
+ config help:
config set filestore_dump_file /tmp/foo:
perf dump:
perf schema:
- install:
- ceph:
log-whitelist:
- - wrongly marked me down
+ - but it is still running
- overall HEALTH_
- (OSDMAP_FLAGS)
- (OSD_
--- /dev/null
+roles:
+- - mon.a
+ - mgr.x
+ - osd.0
+ - osd.1
+ - osd.2
+ - client.0
+openstack:
+ - volumes: # attached to each instance
+ count: 3
+ size: 10 # GB
+tasks:
+- install:
+- workunit:
+ clients:
+ all:
+ - erasure-code/encode-decode-non-regression.sh
+++ /dev/null
-roles:
-- - mon.a
- - mon.b
- - mon.c
- - mgr.x
- - osd.0
- - osd.1
- - client.0
-openstack:
- - volumes: # attached to each instance
- count: 2
- size: 10 # GB
-tasks:
-- install:
-- ceph:
- log-whitelist:
- - overall HEALTH_
- - (MON_DOWN)
- - (PG_
-- mon_thrash:
- revive_delay: 20
- thrash_delay: 1
-- workunit:
- clients:
- all:
- - mon/workloadgen.sh
- env:
- LOADGEN_NUM_OSDS: "5"
- VERBOSE: "1"
- DURATION: "600"
- install:
- ceph:
log-whitelist:
- - wrongly marked me down
+ - but it is still running
- overall HEALTH_
- (OSDMAP_FLAGS)
- (OSD_
- install:
- ceph:
log-whitelist:
- - wrongly marked me down
+ - but it is still running
- overall HEALTH_
- (OSDMAP_FLAGS)
- (OSD_
- install:
- ceph:
log-whitelist:
- - wrongly marked me down
+ - but it is still running
- overall HEALTH_
- (OSDMAP_FLAGS)
- (OSD_
- install:
- ceph:
log-whitelist:
- - wrongly marked me down
+ - but it is still running
- slow request
- overall HEALTH_
- (OSDMAP_FLAGS)
- exec:
client.0:
- sudo ceph osd pool create foo 128 128
+ - sudo ceph osd pool application enable foo rados
- sleep 5
- sudo ceph tell osd.0 injectargs -- --osd-inject-failure-on-pg-removal
- sudo ceph osd pool delete foo foo --yes-i-really-really-mean-it
- install:
- ceph:
log-whitelist:
- - wrongly marked me down
+ - but it is still running
- had wrong client addr
- had wrong cluster addr
- reached quota
- overall HEALTH_
- (POOL_FULL)
+ - (POOL_APP_NOT_ENABLED)
- workunit:
clients:
all:
log-whitelist:
- missing primary copy of
- objects unfound and apparently lost
+ - overall HEALTH_
+ - (POOL_APP_NOT_ENABLED)
- full_sequential:
- exec:
client.0:
- install:
- ceph:
log-whitelist:
- - wrongly marked me down
+ - but it is still running
- had wrong client addr
- overall HEALTH_
- (OSDMAP_FLAGS)
tasks:
- install:
- ceph:
+ log-whitelist:
+ - overall HEALTH_
+ - (POOL_APP_NOT_ENABLED)
- workunit:
clients:
all:
- install:
- ceph:
log-whitelist:
- - wrongly marked me down
+ - but it is still running
- missing primary copy of
- objects unfound and apparently lost
- overall HEALTH_
- install:
- ceph:
log-whitelist:
- - wrongly marked me down
+ - but it is still running
- thrashosds:
op_delay: 30
clean_interval: 120
- install:
- ceph:
log-whitelist:
- - wrongly marked me down
+ - but it is still running
- slow request
- overall HEALTH_
- (CACHE_POOL_
- exec:
client.0:
- sudo ceph osd pool create base 4
+ - sudo ceph osd pool application enable base rados
- sudo ceph osd pool create cache 4
- sudo ceph osd tier add base cache
- sudo ceph osd tier cache-mode cache writeback
conf:
global:
ms inject socket failures: 500
+ mgr:
+ debug monc: 10
--- /dev/null
+roles:
+- - mon.a
+ - mgr.x
+ - osd.0
+ - osd.1
+ - osd.2
+ - client.0
+openstack:
+ - volumes: # attached to each instance
+ count: 3
+ size: 10 # GB
+tasks:
+- install:
+- workunit:
+ basedir: qa/standalone
+ clients:
+ all:
+ - crush
--- /dev/null
+roles:
+- - mon.a
+ - mgr.x
+ - osd.0
+ - osd.1
+ - osd.2
+ - client.0
+openstack:
+ - volumes: # attached to each instance
+ count: 3
+ size: 10 # GB
+tasks:
+- install:
+- workunit:
+ basedir: qa/standalone
+ clients:
+ all:
+ - erasure-code
--- /dev/null
+roles:
+- - mon.a
+ - mgr.x
+ - osd.0
+ - osd.1
+ - osd.2
+ - client.0
+openstack:
+ - volumes: # attached to each instance
+ count: 3
+ size: 10 # GB
+tasks:
+- install:
+- workunit:
+ basedir: qa/standalone
+ clients:
+ all:
+ - misc
--- /dev/null
+roles:
+- - mon.a
+ - mgr.x
+ - osd.0
+ - osd.1
+ - osd.2
+ - client.0
+openstack:
+ - volumes: # attached to each instance
+ count: 3
+ size: 10 # GB
+tasks:
+- install:
+- workunit:
+ basedir: qa/standalone
+ clients:
+ all:
+ - mon
--- /dev/null
+roles:
+- - mon.a
+ - mgr.x
+ - osd.0
+ - osd.1
+ - osd.2
+ - client.0
+openstack:
+ - volumes: # attached to each instance
+ count: 3
+ size: 10 # GB
+tasks:
+- install:
+- workunit:
+ basedir: qa/standalone
+ clients:
+ all:
+ - osd
--- /dev/null
+roles:
+- - mon.a
+ - mgr.x
+ - osd.0
+ - osd.1
+ - osd.2
+ - client.0
+openstack:
+ - volumes: # attached to each instance
+ count: 3
+ size: 10 # GB
+tasks:
+- install:
+- workunit:
+ basedir: qa/standalone
+ clients:
+ all:
+ - scrub
overrides:
ceph:
log-whitelist:
- - wrongly marked me down
+ - but it is still running
- objects unfound and apparently lost
- slow request
conf:
overrides:
ceph:
log-whitelist:
- - wrongly marked me down
+ - but it is still running
- objects unfound and apparently lost
conf:
mon:
overrides:
ceph:
log-whitelist:
- - wrongly marked me down
+ - but it is still running
- objects unfound and apparently lost
- osd_map_cache_size
conf:
osd scrub max interval: 120
osd max backfills: 9
log-whitelist:
- - wrongly marked me down
+ - but it is still running
- objects unfound and apparently lost
tasks:
- thrashosds:
overrides:
ceph:
log-whitelist:
- - wrongly marked me down
+ - but it is still running
- objects unfound and apparently lost
conf:
osd:
overrides:
ceph:
log-whitelist:
- - wrongly marked me down
+ - but it is still running
- objects unfound and apparently lost
- slow request
conf:
overrides:
ceph:
log-whitelist:
- - wrongly marked me down
+ - but it is still running
- objects unfound and apparently lost
conf:
osd:
overrides:
ceph:
log-whitelist:
- - wrongly marked me down
+ - but it is still running
- objects unfound and apparently lost
conf:
mon:
osd scrub max interval: 120
osd max backfills: 5
log-whitelist:
- - wrongly marked me down
+ - but it is still running
- objects unfound and apparently lost
- osd_map_cache_size
tasks:
osd scrub max interval: 120
osd max backfills: 9
log-whitelist:
- - wrongly marked me down
+ - but it is still running
- objects unfound and apparently lost
tasks:
- thrashosds:
overrides:
ceph:
log-whitelist:
- - wrongly marked me down
+ - but it is still running
- objects unfound and apparently lost
conf:
osd:
- exec:
mon.a:
- ceph osd require-osd-release luminous
+ - ceph osd pool application enable base rados || true
# make sure osds have latest map
- rados -p rbd bench 5 write -b 4096
- ceph.healthy:
conf:
global:
mon debug no require luminous: true
+
+# setting luminous triggers peering, which *might* trigger health alerts
+ log-whitelist:
+ - overall HEALTH_
+ - (PG_AVAILABILITY)
+ - (PG_DEGRADED)
thrashosds:
chance_thrash_cluster_full: 0
overrides:
ceph:
log-whitelist:
- - wrongly marked me down
+ - but it is still running
- objects unfound and apparently lost
conf:
osd:
overrides:
ceph:
log-whitelist:
- - wrongly marked me down
+ - but it is still running
- objects unfound and apparently lost
- osd_map_cache_size
conf:
filestore queue throttle max multiple: 10
osd max backfills: 9
log-whitelist:
- - wrongly marked me down
+ - but it is still running
- objects unfound and apparently lost
tasks:
- thrashosds:
overrides:
ceph:
log-whitelist:
- - wrongly marked me down
+ - but it is still running
- objects unfound and apparently lost
conf:
osd:
client.0:
- sudo ceph osd erasure-code-profile set teuthologyprofile crush-failure-domain=osd m=1 k=2
- sudo ceph osd pool create base 4 4 erasure teuthologyprofile
+ - sudo ceph osd pool application enable base rados
- sudo ceph osd pool set base min_size 2
- sudo ceph osd pool create cache 4
- sudo ceph osd tier add base cache
- exec:
client.0:
- sudo ceph osd pool create base 4
+ - sudo ceph osd pool application enable base rados
- sudo ceph osd pool create cache 4
- sudo ceph osd tier add base cache
- sudo ceph osd tier cache-mode cache writeback
- exec:
client.0:
- sudo ceph osd pool create base 4
+ - sudo ceph osd pool application enable base rados
- sudo ceph osd pool create cache 4
- sudo ceph osd tier add base cache
- sudo ceph osd tier cache-mode cache readproxy
- exec:
client.0:
- sudo ceph osd pool create base 4
+ - sudo ceph osd pool application enable base rados
- sudo ceph osd pool create cache 4
- sudo ceph osd tier add base cache
- sudo ceph osd tier cache-mode cache writeback
- exec:
client.0:
- sudo ceph osd pool create base 4
+ - sudo ceph osd pool application enable base rados
- sudo ceph osd pool create cache 4
- sudo ceph osd tier add base cache
- sudo ceph osd tier cache-mode cache writeback
- exec:
client.0:
- sudo ceph osd pool create base 4
+ - sudo ceph osd pool application enable base rados
- sudo ceph osd pool create cache 4
- sudo ceph osd tier add base cache
- sudo ceph osd tier cache-mode cache writeback
ceph:
log-whitelist:
- reached quota
+ - (POOL_APP_NOT_ENABLED)
crush_tunables: hammer
conf:
client:
- print: "**** done install.upgrade osd.0"
- ceph.restart:
daemons: [mon.a, mon.b, mon.c]
+ wait-for-healthy: false
- ceph.restart:
daemons: [osd.0, osd.1, osd.2]
+ wait-for-healthy: false
- print: "**** done ceph.restart 1st half"
overrides:
ceph:
log-whitelist:
- - wrongly marked me down
+ - but it is still running
- objects unfound and apparently lost
- log bound mismatch
tasks:
chance_thrash_cluster_full: 0
chance_thrash_pg_upmap: 0
chance_thrash_pg_upmap_items: 0
+ chance_force_recovery: 0
- print: "**** done thrashosds 3-thrash"
overrides:
ceph:
log-whitelist:
- - wrongly marked me down
+ - but it is still running
- objects unfound and apparently lost
tasks:
- thrashosds:
- (MON_DOWN)
- (OSDMAP_FLAGS)
- (SMALLER_PGP_NUM)
+ - (POOL_APP_NOT_ENABLED)
tasks:
- mon_recovery:
- (SMALLER_PGP_NUM)
- (REQUEST_SLOW)
- (CACHE_POOL_NEAR_FULL)
+ - (POOL_APP_NOT_ENABLED)
conf:
client:
debug ms: 1
- sudo ceph osd erasure-code-profile set teuthologyprofile crush-failure-domain=osd m=1 k=2
- sudo ceph osd pool create datapool 4 4 erasure teuthologyprofile
- sudo ceph osd pool set datapool allow_ec_overwrites true
+ - rbd pool init datapool
overrides:
thrashosds:
- exec:
client.0:
- sudo ceph osd pool create datapool 4
+ - rbd pool init datapool
overrides:
ceph:
global:
ms inject socket failures: 5000
log-whitelist:
- - wrongly marked me down
+ - but it is still running
- exec:
client.0:
- sudo ceph osd pool create datapool 4
+ - rbd pool init datapool
overrides:
ceph:
meta:
- desc: run the rbd_mirror_ha.sh workunit to test the rbd-mirror daemon
tasks:
+- exec:
+ cluster1.client.mirror:
+ - ceph --cluster cluster1 auth caps client.mirror mon 'profile rbd' osd 'profile rbd'
+ cluster2.client.mirror:
+ - ceph --cluster cluster2 auth caps client.mirror mon 'profile rbd' osd 'profile rbd'
- workunit:
clients:
cluster1.client.mirror: [rbd/rbd_mirror_ha.sh]
admin socket: /var/run/ceph/$cluster-$name.asok
pid file: /var/run/ceph/$cluster-$name.pid
tasks:
+- exec:
+ cluster1.client.mirror:
+ - ceph --cluster cluster1 auth caps client.mirror mon 'profile rbd' osd 'profile rbd'
+ cluster2.client.mirror:
+ - ceph --cluster cluster2 auth caps client.mirror mon 'profile rbd' osd 'profile rbd'
- rbd-mirror:
client: cluster1.client.mirror
- rbd-mirror:
global:
ms inject socket failures: 5000
log-whitelist:
- - wrongly marked me down
+ - but it is still running
- sudo ceph osd pool set cache hit_set_count 8
- sudo ceph osd pool set cache hit_set_period 60
- sudo ceph osd pool set cache target_max_objects 250
+ - rbd pool init rbd
- sudo ceph osd erasure-code-profile set teuthologyprofile crush-failure-domain=osd m=1 k=2
- sudo ceph osd pool create datapool 4 4 erasure teuthologyprofile
- sudo ceph osd pool set datapool allow_ec_overwrites true
+ - rbd pool init datapool
overrides:
thrashosds:
- exec:
client.0:
- sudo ceph osd pool create datapool 4
+ - rbd pool init datapool
overrides:
ceph:
overrides:
ceph:
log-whitelist:
- - wrongly marked me down
+ - but it is still running
- objects unfound and apparently lost
tasks:
- exec:
overrides:
ceph:
log-whitelist:
- - wrongly marked me down
+ - but it is still running
- objects unfound and apparently lost
tasks:
- thrashosds:
- ceph:
fs: xfs
log-whitelist:
- - wrongly marked me down
+ - but it is still running
conf:
client.rest0:
debug ms: 1
ceph:
log-whitelist:
- reached quota
+ - (POOL_APP_NOT_ENABLED)
conf:
global:
ms inject delay max: 1
fs: ext4
log-whitelist:
- reached quota
- - wrongly marked me down
+ - but it is still running
- objects unfound and apparently lost
+ - (POOL_APP_NOT_ENABLED)
- thrashosds:
chance_pgnum_grow: 2
chance_pgpnum_fix: 1
- ceph:
fs: xfs
log-whitelist:
- - wrongly marked me down
+ - but it is still running
- objects unfound and apparently lost
- thrashosds:
chance_pgnum_grow: 2
- install: null
- ceph:
log-whitelist:
- - wrongly marked me down
+ - but it is still running
- objects unfound and apparently lost
- thrashosds:
chance_pgnum_grow: 2
- exec:
client.0:
- sudo ceph osd pool create base 4
+ - sudo ceph osd pool application enable base rados
- sudo ceph osd pool create cache 4
- sudo ceph osd tier add base cache
- sudo ceph osd tier cache-mode cache writeback
- ceph:
fs: xfs
log-whitelist:
- - wrongly marked me down
+ - but it is still running
- objects unfound and apparently lost
- thrashosds:
chance_pgnum_grow: 3
- install:
- ceph:
log-whitelist:
- - wrongly marked me down
+ - but it is still running
- ceph-fuse:
- workunit:
clients:
- install:
- ceph:
log-whitelist:
- - wrongly marked me down
+ - but it is still running
- objects unfound and apparently lost
- thrashosds:
- install:
- ceph:
log-whitelist:
- - wrongly marked me down
+ - but it is still running
- objects unfound and apparently lost
- thrashosds:
op_delay: 1
- install:
- ceph:
log-whitelist:
- - wrongly marked me down
+ - but it is still running
- objects unfound and apparently lost
- thrashosds:
chance_down: 50
mon warn on legacy crush tunables: false
mon debug unsafe allow tier with nonempty snaps: true
log-whitelist:
+ - but it is still running
- wrongly marked me down
- reached quota
roles:
mon:
mon warn on legacy crush tunables: false
log-whitelist:
+ - but it is still running
- wrongly marked me down
roles:
- - mon.a
client.0:
- ceph osd erasure-code-profile set t-profile crush-failure-domain=osd k=2 m=1
- ceph osd pool create base-pool 4 4 erasure t-profile
+ - ceph osd pool application enable base-pool rados
- exec:
client.0:
- ceph osd pool create base-pool 4
+ - ceph osd pool application enable base-pool rados
- ceph:
skip_mgr_daemons: true
add_osds_to_crush: true
+ log-whitelist:
+ - overall HEALTH_
+ - (FS_
+ - (MDS_
- print: "**** done ceph"
- install.upgrade:
mon.a:
- workload_x
- upgrade-sequence_x
- print: "**** done parallel -x branch"
+- exec:
+ osd.0:
+ - ceph osd set-require-min-compat-client luminous
# Run librados tests on the -x upgraded cluster
- install.upgrade:
client.1:
- exec:
osd.0:
- ceph osd require-osd-release luminous
- - ceph osd set-require-min-compat-client luminous
- ceph.healthy:
- print: "**** done ceph.restart all -x branch mds/osd/mon"
overrides:
ceph:
log-whitelist:
+ - but it is still running
- wrongly marked me down
- objects unfound and apparently lost
- log bound mismatch
chance_thrash_cluster_full: 0
chance_thrash_pg_upmap: 0
chance_thrash_pg_upmap_items: 0
+ chance_force_recovery: 0
- print: "**** done thrashosds 3-thrash"
--- /dev/null
+../../../../tasks/thrashosds-health.yaml
\ No newline at end of file
overrides:
ceph:
log-whitelist:
+ - but it is still running
- wrongly marked me down
- objects unfound and apparently lost
- log bound mismatch
chance_thrash_pg_upmap: 0
chance_thrash_pg_upmap_items: 0
disable_objectstore_tool_tests: true
+ chance_force_recovery: 0
- print: "**** done thrashosds 3-thrash"
--- /dev/null
+../../../../tasks/thrashosds-health.yaml
\ No newline at end of file
- scrub mismatch
- ScrubResult
- wrongly marked
+ - (POOL_APP_NOT_ENABLED)
+ - overall HEALTH_
conf:
global:
enable experimental unrecoverable data corrupting features: "*"
overrides:
ceph:
log-whitelist:
+ - but it is still running
- wrongly marked me down
- objects unfound and apparently lost
- log bound mismatch
chance_thrash_cluster_full: 0
chance_thrash_pg_upmap: 0
chance_thrash_pg_upmap_items: 0
+ chance_force_recovery: 0
- print: "**** done thrashosds 3-thrash"
overrides:
ceph:
log-whitelist:
+ - but it is still running
- wrongly marked me down
- objects unfound and apparently lost
- log bound mismatch
chance_thrash_pg_upmap: 0
chance_thrash_pg_upmap_items: 0
disable_objectstore_tool_tests: true
+ chance_force_recovery: 0
- print: "**** done thrashosds 3-thrash"
if config.get('coverage') or config.get('valgrind') is not None:
daemon_signal = 'term'
+ # create osds in order. (this only matters for pre-luminous, which might
+ # be hammer, which doesn't take an id_ argument to legacy 'osd create').
+ osd_uuids = {}
for remote, roles_for_host in daemons.remotes.iteritems():
is_type_ = teuthology.is_type(type_, cluster_name)
for role in roles_for_host:
continue
_, _, id_ = teuthology.split_role(role)
+
if type_ == 'osd':
datadir='/var/lib/ceph/osd/{cluster}-{id}'.format(
cluster=cluster_name, id=id_)
path=datadir + '/fsid',
sudo=True,
).strip()
- try:
- remote.run(
- args=[
- 'sudo', 'ceph', '--cluster', cluster_name,
- 'osd', 'new', osd_uuid, id_,
- ]
- )
- except:
- # fallback to pre-luminous (hammer or jewel)
- remote.run(
- args=[
- 'sudo', 'ceph', '--cluster', cluster_name,
- 'osd', 'create', osd_uuid,
- ]
- )
- if config.get('add_osds_to_crush'):
- remote.run(
- args=[
- 'sudo', 'ceph', '--cluster', cluster_name,
- 'osd', 'crush', 'create-or-move', 'osd.' + id_,
- '1.0', 'host=localhost', 'root=default',
- ]
- )
+ osd_uuids[id_] = osd_uuid
+ for osd_id in range(len(osd_uuids)):
+ id_ = str(osd_id)
+ osd_uuid = osd_uuids.get(id_)
+ try:
+ remote.run(
+ args=[
+ 'sudo', 'ceph', '--cluster', cluster_name,
+ 'osd', 'new', osd_uuid, id_,
+ ]
+ )
+ except:
+ # fallback to pre-luminous (hammer or jewel)
+ remote.run(
+ args=[
+ 'sudo', 'ceph', '--cluster', cluster_name,
+ 'osd', 'create', osd_uuid,
+ ]
+ )
+ if config.get('add_osds_to_crush'):
+ remote.run(
+ args=[
+ 'sudo', 'ceph', '--cluster', cluster_name,
+ 'osd', 'crush', 'create-or-move', 'osd.' + id_,
+ '1.0', 'host=localhost', 'root=default',
+ ]
+ )
+
+ for remote, roles_for_host in daemons.remotes.iteritems():
+ is_type_ = teuthology.is_type(type_, cluster_name)
+ for role in roles_for_host:
+ if not is_type_(role):
+ continue
+ _, _, id_ = teuthology.split_role(role)
run_cmd = [
'sudo',
"""
config = config if isinstance(config, dict) else dict()
cluster_name = config.get('cluster', 'ceph')
- log.info('Waiting until ceph cluster %s is healthy...', cluster_name)
+ log.info('Waiting until %s daemons up and pgs clean...', cluster_name)
+ manager = ctx.managers[cluster_name]
+ try:
+ manager.wait_for_mgr_available()
+ except run.CommandFailedError:
+ log.info('ignoring mgr wait error, probably testing upgrade')
+
firstmon = teuthology.get_first_mon(ctx, config, cluster_name)
(mon0_remote,) = ctx.cluster.only(firstmon).remotes.keys()
teuthology.wait_until_osds_up(
remote=mon0_remote,
ceph_cluster=cluster_name,
)
+
+ try:
+ manager.flush_all_pg_stats()
+ except run.CommandFailedError:
+ log.info('ignoring flush pg stats error, probably testing upgrade')
+ manager.wait_for_clean()
+
+ log.info('Waiting until ceph cluster %s is healthy...', cluster_name)
teuthology.wait_until_healthy(
ctx,
remote=mon0_remote,
if estatus != 0:
raise RuntimeError("ceph-deploy: Failed to zap osds")
osd_create_cmd = './ceph-deploy osd create '
+ # first check for filestore, default is bluestore with ceph-deploy
+ if config.get('filestore') is not None:
+ osd_create_cmd += '--filestore '
+ else:
+ osd_create_cmd += '--bluestore '
if config.get('dmcrypt') is not None:
osd_create_cmd += '--dmcrypt '
osd_create_cmd += ":".join(d)
mon_initial_members: 1
only_mon: true
keep_running: true
+ # either choose bluestore or filestore, default is bluestore
+ bluestore: True
+ # or
+ filestore: True
tasks:
- install:
self.chance_thrash_pg_upmap = self.config.get('chance_thrash_pg_upmap', 1.0)
self.chance_thrash_pg_upmap_items = self.config.get('chance_thrash_pg_upmap', 1.0)
self.random_eio = self.config.get('random_eio')
+ self.chance_force_recovery = self.config.get('chance_force_recovery', 0.3)
num_osds = self.in_osds + self.out_osds
self.max_pgs = self.config.get("max_pgs_per_pool_osd", 1200) * num_osds
except CommandFailedError:
self.log('Failed to rm-pg-upmap-items, ignoring')
+ def force_recovery(self):
+     """
+     Force recovery or backfill on some of the PGs.
+
+     Picks backfill or recovery with equal probability, asks the manager
+     for candidate pgids and, if any are returned, issues the matching
+     'pg force-backfill' / 'pg force-recovery' command for them.
+     """
+     use_backfill = random.random() >= 0.5
+     pgids = self.ceph_manager.get_pgids_to_force(use_backfill)
+     if not pgids:
+         return
+     subcommand = 'force-backfill' if use_backfill else 'force-recovery'
+     self.ceph_manager.raw_cluster_cmd('pg', subcommand, *pgids)
+
+ def cancel_force_recovery(self):
+     """
+     Cancel forced recovery/backfill on some of the PGs that have it forced
+     """
+     # choose between backfill and recovery with equal probability
+     backfill = random.random() >= 0.5
+     j = self.ceph_manager.get_pgids_to_cancel_force(backfill)
+     # nothing is sent to the cluster when no pgids were returned
+     if j:
+         if backfill:
+             self.ceph_manager.raw_cluster_cmd('pg', 'cancel-force-backfill', *j)
+         else:
+             self.ceph_manager.raw_cluster_cmd('pg', 'cancel-force-recovery', *j)
+
+ def force_cancel_recovery(self):
+     """
+     Randomly either force recovery/backfill on some PGs or cancel a
+     previously forced one; forcing is chosen when the draw is >= 0.4.
+     """
+     action = self.force_recovery if random.random() >= 0.4 else self.cancel_force_recovery
+     action()
+
def all_up(self):
"""
Make sure all osds are up and not out.
while len(self.in_osds) < (self.minin + 1):
self.in_osd()
self.log("Waiting for recovery")
- self.ceph_manager.wait_for_all_up(
+ self.ceph_manager.wait_for_all_osds_up(
timeout=self.config.get('timeout')
)
# now we wait 20s for the pg status to change, if it takes longer,
actions.append((self.thrash_pg_upmap, self.chance_thrash_pg_upmap,))
if self.chance_thrash_pg_upmap_items > 0:
actions.append((self.thrash_pg_upmap_items, self.chance_thrash_pg_upmap_items,))
+ if self.chance_force_recovery > 0:
+ actions.append((self.force_cancel_recovery, self.chance_force_recovery))
for key in ['heartbeat_inject_failure', 'filestore_inject_stall']:
for scenario in [
finally:
if self.do_revive:
self.manager.revive_osd(self.osd)
+ self.manager.wait_till_osd_is_up(self.osd, 300)
class CephManager:
'osd', 'pool', 'set', pool_name,
'allow_ec_overwrites',
'true')
+ self.raw_cluster_cmd(
+ 'osd', 'pool', 'application', 'enable',
+ pool_name, 'rados', '--yes-i-really-mean-it',
+ run.Raw('||'), 'true')
self.pools[pool_name] = pg_num
time.sleep(1)
j = json.loads('\n'.join(out.split('\n')[1:]))
return j['pg_stats']
+ def get_pgids_to_force(self, backfill):
+     """
+     Return the randomized list of PGs that can have their recovery/backfill forced
+
+     :param backfill: when true, select PGs by backfill-related states;
+                      otherwise by recovery-related states
+     :returns: list of pgids (may be empty)
+     """
+     j = self.get_pg_stats();
+     pgids = []
+     if backfill:
+         wanted = ['degraded', 'backfilling', 'backfill_wait']
+     else:
+         wanted = ['recovering', 'degraded', 'recovery_wait']
+     for pg in j:
+         # the pg state is a '+'-separated list of state names
+         status = pg['state'].split('+')
+         for t in wanted:
+             # a new random draw is made for each wanted state; PGs that are
+             # already forced (backfill or recovery) are never selected
+             if random.random() > 0.5 and not ('forced_backfill' in status or 'forced_recovery' in status) and t in status:
+                 pgids.append(pg['pgid'])
+                 # stop at the first match so a pgid is added at most once
+                 break
+     return pgids
+
+ def get_pgids_to_cancel_force(self, backfill):
+     """
+     Return a random subset of the PGs whose backfill (when backfill is
+     true) or recovery (otherwise) is currently forced
+     """
+     pg_stats = self.get_pg_stats()
+     forced_flag = 'forced_backfill' if backfill else 'forced_recovery'
+     # the random draw only happens for PGs that carry the forced flag
+     return [
+         stat['pgid']
+         for stat in pg_stats
+         if forced_flag in stat['state'].split('+') and random.random() > 0.5
+     ]
+
def compile_pg_status(self):
"""
Return a histogram of pg state values
"""
return self.get_osd_dump_json()['osds']
+ def get_mgr_dump(self):
+     """
+     Run 'mgr dump --format=json' and return the decoded JSON
+     """
+     return json.loads(self.raw_cluster_cmd('mgr', 'dump', '--format=json'))
+
def get_stuck_pgs(self, type_, threshold):
"""
:returns: stuck pg information from the cluster
x = self.get_osd_dump()
return (len(x) == sum([(y['up'] > 0) for y in x]))
- def wait_for_all_up(self, timeout=None):
+ def wait_for_all_osds_up(self, timeout=None):
"""
When this exits, either the timeout has expired, or all
osds are up.
while not self.are_all_osds_up():
if timeout is not None:
assert time.time() - start < timeout, \
- 'timeout expired in wait_for_all_up'
+ 'timeout expired in wait_for_all_osds_up'
time.sleep(3)
self.log("all up!")
+ def pool_exists(self, pool):
+     """
+     Return True if pool is among the names returned by list_pools()
+     """
+     return pool in self.list_pools()
+
+ def wait_for_pool(self, pool, timeout=300):
+     """
+     Block until the named pool exists, polling every 3 seconds.
+
+     When timeout is not None, an assertion fails once that many seconds
+     have elapsed without the pool appearing.
+     """
+     self.log('waiting for pool %s to exist' % pool)
+     began = time.time()
+     while True:
+         if self.pool_exists(pool):
+             return
+         if timeout is not None:
+             elapsed = time.time() - began
+             assert elapsed < timeout, \
+                 'timeout expired in wait_for_pool'
+         time.sleep(3)
+
+ def wait_for_pools(self, pools):
+     """
+     Wait for each pool in pools, one after another, via wait_for_pool()
+     """
+     for pool_name in pools:
+         self.wait_for_pool(pool_name)
+
+ def is_mgr_available(self):
+     """
+     Return the 'available' field of the mgr dump (False when absent)
+     """
+     return self.get_mgr_dump().get('available', False)
+
+ def wait_for_mgr_available(self, timeout=None):
+     """
+     Block until is_mgr_available() is true, polling every 3 seconds.
+
+     When timeout is not None, an assertion fails once that many seconds
+     have elapsed without the mgr becoming available.
+     """
+     self.log("waiting for mgr available")
+     began = time.time()
+     while True:
+         if self.is_mgr_available():
+             break
+         if timeout is not None:
+             elapsed = time.time() - began
+             assert elapsed < timeout, \
+                 'timeout expired in wait_for_mgr_available'
+         time.sleep(3)
+     self.log("mgr available!")
+
def wait_for_recovery(self, timeout=None):
"""
Check peering. When this exists, we have recovered.
create_pool = utility_task("create_pool")
remove_pool = utility_task("remove_pool")
wait_for_clean = utility_task("wait_for_clean")
+flush_all_pg_stats = utility_task("flush_all_pg_stats")
set_pool_property = utility_task("set_pool_property")
do_pg_scrub = utility_task("do_pg_scrub")
+wait_for_pool = utility_task("wait_for_pool")
+wait_for_pools = utility_task("wait_for_pools")
# Load an config settings of interest
for setting in self.LOAD_SETTINGS:
- setattr(self, setting, int(self.fs.mds_asok(
+ setattr(self, setting, float(self.fs.mds_asok(
['config', 'get', setting], self.mds_cluster.mds_ids[0]
)[setting]))
def auth_list(self):
"""
- Convenience wrapper on "ceph auth list"
+ Convenience wrapper on "ceph auth ls"
"""
return json.loads(self.mds_cluster.mon_manager.raw_cluster_cmd(
- "auth", "list", "--format=json-pretty"
+ "auth", "ls", "--format=json-pretty"
))['auth_dump']
def assert_session_count(self, expected, ls_data=None, mds_id=None):
except run.CommandFailedError:
log.info('Failed to unmount ceph-fuse on {name}, aborting...'.format(name=self.client_remote.name))
+ self.client_remote.run(args=[
+ 'sudo',
+ run.Raw('PATH=/usr/sbin:$PATH'),
+ 'lsof',
+ run.Raw(';'),
+ 'ps',
+ 'auxf',
+ ])
+
# abort the fuse mount, killing all hung processes
if self._fuse_conn:
self.run_python(dedent("""
if force:
cmd.append('-f')
- self.client_remote.run(args=cmd)
+ try:
+ self.client_remote.run(args=cmd)
+ except Exception as e:
+ self.client_remote.run(args=[
+ 'sudo',
+ run.Raw('PATH=/usr/sbin:$PATH'),
+ 'lsof',
+ run.Raw(';'),
+ 'ps', 'auxf',
+ ])
+ raise e
rproc = self.client_remote.run(
args=[
# MDS should not be happy about that, as the client is failing to comply
# with the SESSION_RECALL messages it is being sent
- mds_recall_state_timeout = int(self.fs.get_config("mds_recall_state_timeout"))
- self.wait_for_health("MDS_HEALTH_CLIENT_RECALL",
+ mds_recall_state_timeout = float(self.fs.get_config("mds_recall_state_timeout"))
+ self.wait_for_health("MDS_CLIENT_RECALL",
mds_recall_state_timeout + 10)
# We can also test that the MDS health warning for oversized
# After mds_revoke_cap_timeout, we should see a health warning (extra lag from
# MDS beacon period)
- mds_revoke_cap_timeout = int(self.fs.get_config("mds_revoke_cap_timeout"))
- self.wait_for_health("MDS_CLIENT_RECALL", mds_revoke_cap_timeout + 10)
+ mds_revoke_cap_timeout = float(self.fs.get_config("mds_revoke_cap_timeout"))
+ self.wait_for_health("MDS_CLIENT_LATE_RELEASE", mds_revoke_cap_timeout + 10)
# Client B should still be stuck
self.assertFalse(rproc.finished)
# Kill the rank 0 daemon's physical process
self.fs.mds_stop(original_active)
- grace = int(self.fs.get_config("mds_beacon_grace", service_type="mon"))
+ grace = float(self.fs.get_config("mds_beacon_grace", service_type="mon"))
# Wait until the monitor promotes his replacement
def promoted():
if not require_active:
raise case.SkipTest("fuse_require_active_mds is not set")
- grace = int(self.fs.get_config("mds_beacon_grace", service_type="mon"))
+ grace = float(self.fs.get_config("mds_beacon_grace", service_type="mon"))
# Check it's not laggy to begin with
(original_active, ) = self.fs.get_active_names()
# Need all my standbys up as well as the active daemons
self.wait_for_daemon_start()
- grace = int(self.fs.get_config("mds_beacon_grace", service_type="mon"))
+ grace = float(self.fs.get_config("mds_beacon_grace", service_type="mon"))
standbys = self.mds_cluster.get_standby_daemons()
self.assertGreaterEqual(len(standbys), 1)
# Remove the snapshot
self.mount_a.run_shell(["rmdir", "snapdir/.snap/snap1"])
- self.mount_a.umount_wait()
# Purging file_a doesn't happen until after we've flushed the journal, because
# it is referenced by the snapshotted subdir, and the snapshot isn't really
# gone until the journal references to it are gone
self.fs.mds_asok(["flush", "journal"])
+ # Wait for purging to complete, which requires the OSDMap to propagate to the OSDs.
+ # See also: http://tracker.ceph.com/issues/20072
+ self.wait_until_true(
+ lambda: self.fs.data_objects_absent(file_a_ino, size_mb * 1024 * 1024),
+ timeout=60
+ )
+
# See that a purge happens now
self._wait_for_counter("mds_cache", "strays_enqueued", 2)
self._wait_for_counter("purge_queue", "pq_executed", 2)
- self.assertTrue(self.fs.data_objects_absent(file_a_ino, size_mb * 1024 * 1024))
self.await_data_pool_empty()
def test_fancy_layout(self):
log.info('stopping first osd')
manager.kill_osd(0)
manager.mark_down_osd(0)
+ manager.wait_for_active(timeout)
log.info('waiting for all to be unclean')
starttime = time.time()
thrash_in_replay: [default: 0.0] likelihood that the MDS will be thrashed
during replay. Value should be between 0.0 and 1.0.
- thrash_max_mds: [default: 0.0] likelihood that the max_mds of the mds
+ thrash_max_mds: [default: 0.05] likelihood that the max_mds of the mds
cluster will be modified to a value [1, current) or (current, starting
max_mds]. When reduced, randomly selected MDSs other than rank 0 will be
deactivated to reach the new max_mds. Value should be between 0.0 and 1.0.
self.stopping = Event()
self.randomize = bool(self.config.get('randomize', True))
- self.thrash_max_mds = float(self.config.get('thrash_max_mds', 0.0))
+ self.thrash_max_mds = float(self.config.get('thrash_max_mds', 0.05))
self.max_thrash = int(self.config.get('max_thrash', 1))
self.max_thrash_delay = float(self.config.get('thrash_delay', 120.0))
self.thrash_in_replay = float(self.config.get('thrash_in_replay', False))
import contextlib
import json
import logging
+import os
import StringIO
-import re
from teuthology.parallel import parallel
from teuthology import misc as teuthology
from tempfile import NamedTemporaryFile
from teuthology.orchestra import run
+from teuthology.packaging import install_package, remove_package
log = logging.getLogger(__name__)
yield
+def get_ioengine_package_name(ioengine, remote):
+ system_type = teuthology.get_system_type(remote)
+ if ioengine == 'rbd':
+ return 'librbd1-devel' if system_type == 'rpm' else 'librbd-dev'
+ elif ioengine == 'libaio':
+ return 'libaio-devel' if system_type == 'rpm' else 'libaio-dev'
+ else:
+ return None
+
+
+def run_rbd_map(remote, image, iodepth):
+ iodepth = max(iodepth, 128) # RBD_QUEUE_DEPTH_DEFAULT
+ out = StringIO.StringIO()
+ remote.run(args=['sudo', 'rbd', 'map', '-o', 'queue_depth={}'.format(iodepth), image], stdout=out)
+ dev = out.getvalue().rstrip('\n')
+ teuthology.sudo_write_file(
+ remote,
+ '/sys/block/{}/queue/nr_requests'.format(os.path.basename(dev)),
+ str(iodepth))
+ return dev
+
+
def run_fio(remote, config, rbd_test_dir):
"""
create fio config file with options based on above config
fio_config.write('bs={bs}\n'.format(bs=bs))
else:
fio_config.write('bs=4k\n')
- fio_config.write('iodepth=2\n')
+ iodepth = config.get('io-depth', 2)
+ fio_config.write('iodepth={iod}\n'.format(iod=iodepth))
if config.get('fio-io-size'):
size=config['fio-io-size']
fio_config.write('size={size}\n'.format(size=size))
formats=[1,2]
features=[['layering'],['striping'],['exclusive-lock','object-map']]
- fio_version='2.7'
+ fio_version='2.21'
if config.get('formats'):
formats=config['formats']
if config.get('features'):
if config.get('fio-version'):
fio_version=config['fio-version']
- fio_config.write('norandommap\n')
- if ioengine == 'rbd':
- fio_config.write('invalidate=0\n')
- #handle package required for librbd engine
+ # handle package required for ioengine, if any
sn=remote.shortname
- system_type= teuthology.get_system_type(remote)
- if system_type == 'rpm' and ioengine == 'rbd':
- log.info("Installing librbd1 devel package on {sn}".format(sn=sn))
- remote.run(args=['sudo', 'yum' , 'install', 'librbd1-devel', '-y'])
- elif ioengine == 'rbd':
- log.info("Installing librbd devel package on {sn}".format(sn=sn))
- remote.run(args=['sudo', 'apt-get', '-y',
- '--force-yes',
- 'install', 'librbd-dev'])
+ ioengine_pkg = get_ioengine_package_name(ioengine, remote)
+ if ioengine_pkg:
+ install_package(ioengine_pkg, remote)
+
+ fio_config.write('norandommap\n')
if ioengine == 'rbd':
fio_config.write('clientname=admin\n')
fio_config.write('pool=rbd\n')
+ fio_config.write('invalidate=0\n')
+ elif ioengine == 'libaio':
+ fio_config.write('direct=1\n')
for frmt in formats:
for feature in features:
log.info("Creating rbd images on {sn}".format(sn=sn))
remote.run(args=create_args)
remote.run(args=['rbd', 'info', rbd_name])
if ioengine != 'rbd':
- out=StringIO.StringIO()
- remote.run(args=['sudo', 'rbd', 'map', rbd_name ],stdout=out)
- dev=re.search(r'(/dev/rbd\d+)',out.getvalue())
- rbd_dev=dev.group(1)
+ rbd_dev = run_rbd_map(remote, rbd_name, iodepth)
if config.get('test-clone-io'):
log.info("Testing clones using fio")
remote.run(args=['rbd', 'snap', 'create', rbd_snap_name])
remote.run(args=['rbd', 'snap', 'protect', rbd_snap_name])
remote.run(args=['rbd', 'clone', rbd_snap_name, rbd_clone_name])
- remote.run(args=['sudo', 'rbd', 'map', rbd_clone_name], stdout=out)
- dev=re.search(r'(/dev/rbd\d+)',out.getvalue())
- rbd_clone_dev=dev.group(1)
+ rbd_clone_dev = run_rbd_map(remote, rbd_clone_name, iodepth)
fio_config.write('[{rbd_dev}]\n'.format(rbd_dev=rbd_dev))
if config.get('rw'):
rw=config['rw']
run.Raw(';'), 'wget' , fio , run.Raw(';'), run.Raw('tar -xvf fio*tar.gz'), run.Raw(';'),
run.Raw('cd fio-fio*'), 'configure', run.Raw(';') ,'make'])
remote.run(args=['ceph', '-s'])
+ remote.run(args=[run.Raw('{tdir}/fio-fio-{v}/fio --showcmd {f}'.format(tdir=rbd_test_dir,v=fio_version,f=fio_config.name))])
remote.run(args=['sudo', run.Raw('{tdir}/fio-fio-{v}/fio {f}'.format(tdir=rbd_test_dir,v=fio_version,f=fio_config.name))])
remote.run(args=['ceph', '-s'])
finally:
remote.run(args=['sudo', 'rbd', 'unmap', str(image['device'])])
log.info("Cleaning up fio install")
remote.run(args=['rm','-rf', run.Raw(rbd_test_dir)])
- if system_type == 'rpm' and ioengine == 'rbd':
- log.info("Uninstall librbd1 devel package on {sn}".format(sn=sn))
- remote.run(args=['sudo', 'yum' , 'remove', 'librbd1-devel', '-y'])
- elif ioengine == 'rbd':
- log.info("Uninstall librbd devel package on {sn}".format(sn=sn))
- remote.run(args=['sudo', 'apt-get', '-y', 'remove', 'librbd-dev'])
+ if ioengine_pkg:
+ remove_package(ioengine_pkg, remote)
import time
from cStringIO import StringIO
+from teuthology.orchestra import run
from teuthology import misc as teuthology
from util.rados import rados
import os
# create 1 pg pool
log.info('creating foo')
manager.raw_cluster_cmd('osd', 'pool', 'create', 'foo', '1')
+ manager.raw_cluster_cmd(
+ 'osd', 'pool', 'application', 'enable',
+ 'foo', 'rados', run.Raw('||'), 'true')
# Remove extra pool to simlify log output
manager.raw_cluster_cmd('osd', 'pool', 'delete', 'rbd', 'rbd', '--yes-i-really-really-mean-it')
manager.mark_down_osd(non_divergent[0])
# manager.mark_out_osd(non_divergent[0])
- # An empty collection for pg 2.0 needs to be cleaned up
+ # An empty collection for pg 2.0 might need to be cleaned up
cmd = ((prefix + "--op remove --pgid 2.0").
format(id=non_divergent[0]))
proc = exp_remote.run(args=cmd, wait=True,
check_status=False, stdout=StringIO())
- assert proc.exitstatus == 0
cmd = ((prefix + "--op import --file {file}").
format(id=non_divergent[0], file=expfile))
- 'scrub [0-9]+ errors'
- 'size 1 != size'
- 'attr name mismatch'
- - 'Regular scrub request, losing deep-scrub details'
+ - 'Regular scrub request, deep-scrub details will be lost'
conf:
osd:
filestore debug inject read err: true
'repair_test task only accepts a dict for config'
manager = ctx.managers['ceph']
- manager.wait_for_all_up()
+ manager.wait_for_all_osds_up()
manager.raw_cluster_cmd('osd', 'set', 'noscrub')
manager.raw_cluster_cmd('osd', 'set', 'nodeep-scrub')
finally:
log.info('joining thrashosds')
thrash_proc.do_join()
- cluster_manager.wait_for_all_up()
+ cluster_manager.wait_for_all_osds_up()
cluster_manager.flush_all_pg_stats()
cluster_manager.wait_for_recovery(config.get('timeout', 360))
# Wait for OSD to come up so that subsequent injectargs etc will
# definitely succeed
- LocalCephCluster(LocalContext()).mon_manager.wait_for_all_up(timeout=30)
+ LocalCephCluster(LocalContext()).mon_manager.wait_for_all_osds_up(timeout=30)
# List of client mounts, sufficient to run the selected tests
clients = [i.__str__() for i in range(0, max_required_clients)]
backup.client.0: [foo]
client.1: [bar] # cluster is implicitly 'ceph'
+ You can also specify an alternative top-level dir to 'qa/workunits', like
+ 'qa/standalone', with::
+
+ tasks:
+ - install:
+ - workunit:
+ basedir: qa/standalone
+ clients:
+ client.0:
+ - test-ceph-helpers.sh
+
:param ctx: Context
:param config: Configuration
"""
for role, tests in clients.iteritems():
if role != "all":
p.spawn(_run_tests, ctx, refspec, role, tests,
- config.get('env'), timeout=timeout)
+ config.get('env'),
+ basedir=config.get('basedir','qa/workunits'),
+ timeout=timeout)
# Clean up dirs from any non-all workunits
for role, created in created_mountpoint.items():
if 'all' in clients:
all_tasks = clients["all"]
_spawn_on_all_clients(ctx, refspec, all_tasks, config.get('env'),
+ config.get('basedir', 'qa/workunits'),
config.get('subdir'), timeout=timeout)
return created_mountpoint
-def _spawn_on_all_clients(ctx, refspec, tests, env, subdir, timeout=None):
+def _spawn_on_all_clients(ctx, refspec, tests, env, basedir, subdir, timeout=None):
"""
Make a scratch directory for each client in the cluster, and then for each
test spawn _run_tests() for each role.
for unit in tests:
with parallel() as p:
for role, remote in client_remotes.items():
- p.spawn(_run_tests, ctx, refspec, role, [unit], env, subdir,
+ p.spawn(_run_tests, ctx, refspec, role, [unit], env,
+ basedir,
+ subdir,
timeout=timeout)
# cleanup the generated client directories
_delete_dir(ctx, role, created_mountpoint[role])
-def _run_tests(ctx, refspec, role, tests, env, subdir=None, timeout=None):
+def _run_tests(ctx, refspec, role, tests, env, basedir,
+ subdir=None, timeout=None):
"""
Run the individual test. Create a scratch directory and then extract the
workunits from git. Make the executables, and then run the tests.
else:
scratch_tmp = os.path.join(mnt, subdir)
clonedir = '{tdir}/clone.{role}'.format(tdir=testdir, role=role)
- srcdir = '{cdir}/qa/workunits'.format(cdir=clonedir)
+ srcdir = '{cdir}/{basedir}'.format(cdir=clonedir,
+ basedir=basedir)
git_url = teuth_config.get_ceph_qa_suite_git_url()
# if we are running an upgrade test, and ceph-ci does not have branches like
run.Raw('CEPH_ID="{id}"'.format(id=id_)),
run.Raw('PATH=$PATH:/usr/sbin'),
run.Raw('CEPH_BASE={dir}'.format(dir=clonedir)),
+ run.Raw('CEPH_ROOT={dir}'.format(dir=clonedir)),
]
if env is not None:
for var, val in env.iteritems():
# ln -sf /home/ubuntu/ceph/systemd/ceph-disk@.service /usr/lib/systemd/system/ceph-disk@.service
# ceph-disk.conf will be silently ignored if it is a symbolic link or a hard link /var/log/upstart for logs
# cp /home/ubuntu/ceph/src/upstart/ceph-disk.conf /etc/init/ceph-disk.conf
-# id=3 ; ceph-disk deactivate --deactivate-by-id $id ; ceph-disk destroy --zap --destroy-by-id $id
+# id=3 ; ceph-disk deactivate --deactivate-by-id $id ; ceph-disk destroy --purge --zap --destroy-by-id $id
# py.test -s -v -k test_activate_dmcrypt_luks ceph-disk-test.py
#
# CentOS 7
self.sh("""
set -xe
ceph-disk --verbose deactivate --deactivate-by-id {id}
- ceph-disk --verbose destroy --destroy-by-id {id} --zap
+ ceph-disk --verbose destroy --purge --destroy-by-id {id} --zap
""".format(id=id))
def deactivate_osd(self, uuid):
assert partition['state'] == 'active'
c.sh("ceph-disk --verbose deactivate " + partition['path'])
c.wait_for_osd_down(osd_uuid)
- c.sh("ceph-disk --verbose destroy " + partition['path'] + " --zap")
+ c.sh("ceph-disk --verbose destroy --purge " + partition['path'] + " --zap")
def test_deactivate_reactivate_dmcrypt_plain(self):
c = CephDisk()
#!/bin/bash
-if [ -f $(dirname $0)/../ceph-helpers-root.sh ]; then
- source $(dirname $0)/../ceph-helpers-root.sh
+if [ -f $(dirname $0)/../../standalone/ceph-helpers-root.sh ]; then
+ source $(dirname $0)/../../standalone/ceph-helpers-root.sh
else
- echo "$(dirname $0)/../ceph-helpers-root.sh does not exist."
+ echo "$(dirname $0)/../../standalone/ceph-helpers-root.sh does not exist."
exit 1
fi
ceph osd pool create $test_pool 4 || return 1
ceph osd pool set $test_pool size $size || return 1
ceph osd pool set $test_pool min_size $size || return 1
+ ceph osd pool application enable $test_pool rados
echo FOO > $dir/BAR
timeout $timeout rados --pool $test_pool put BAR $dir/BAR || return 1
+++ /dev/null
-#!/bin/bash
-#
-# Copyright (C) 2013,2014 Cloudwatt <libre.licensing@cloudwatt.com>
-# Copyright (C) 2014,2015 Red Hat <contact@redhat.com>
-# Copyright (C) 2014 Federico Gimenez <fgimenez@coit.es>
-#
-# Author: Loic Dachary <loic@dachary.org>
-# Author: Federico Gimenez <fgimenez@coit.es>
-#
-# This program is free software; you can redistribute it and/or modify
-# it under the terms of the GNU Library Public License as published by
-# the Free Software Foundation; either version 2, or (at your option)
-# any later version.
-#
-# This program is distributed in the hope that it will be useful,
-# but WITHOUT ANY WARRANTY; without even the implied warranty of
-# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
-# GNU Library Public License for more details.
-#
-TIMEOUT=300
-PG_NUM=4
-: ${CEPH_BUILD_VIRTUALENV:=/tmp}
-
-if type xmlstarlet > /dev/null 2>&1; then
- XMLSTARLET=xmlstarlet
-elif type xml > /dev/null 2>&1; then
- XMLSTARLET=xml
-else
- echo "Missing xmlstarlet binary!"
- exit 1
-fi
-
-if [ `uname` = FreeBSD ]; then
- SED=gsed
- DIFFCOLOPTS=""
-else
- SED=sed
- termwidth=$(stty -a | head -1 | sed -e 's/.*columns \([0-9]*\).*/\1/')
- if [ -n "$termwidth" -a "$termwidth" != "0" ]; then
- termwidth="-W ${termwidth}"
- fi
- DIFFCOLOPTS="-y $termwidth"
-fi
-
-#! @file ceph-helpers.sh
-# @brief Toolbox to manage Ceph cluster dedicated to testing
-#
-# Example use case:
-#
-# ~~~~~~~~~~~~~~~~{.sh}
-# source ceph-helpers.sh
-#
-# function mytest() {
-# # cleanup leftovers and reset mydir
-# setup mydir
-# # create a cluster with one monitor and three osds
-# run_mon mydir a
-# run_osd mydir 0
-# run_osd mydir 2
-# run_osd mydir 3
-# # put and get an object
-# rados --pool rbd put GROUP /etc/group
-# rados --pool rbd get GROUP /tmp/GROUP
-# # stop the cluster and cleanup the directory
-# teardown mydir
-# }
-# ~~~~~~~~~~~~~~~~
-#
-# The focus is on simplicity and efficiency, in the context of
-# functional tests. The output is intentionally very verbose
-# and functions return as soon as an error is found. The caller
-# is also expected to abort on the first error so that debugging
-# can be done by looking at the end of the output.
-#
-# Each function is documented, implemented and tested independently.
-# When modifying a helper, the test and the documentation are
-# expected to be updated and it is easier of they are collocated. A
-# test for a given function can be run with
-#
-# ~~~~~~~~~~~~~~~~{.sh}
-# ceph-helpers.sh TESTS test_get_osds
-# ~~~~~~~~~~~~~~~~
-#
-# and all the tests (i.e. all functions matching test_*) are run
-# with:
-#
-# ~~~~~~~~~~~~~~~~{.sh}
-# ceph-helpers.sh TESTS
-# ~~~~~~~~~~~~~~~~
-#
-# A test function takes a single argument : the directory dedicated
-# to the tests. It is expected to not create any file outside of this
-# directory and remove it entirely when it completes successfully.
-#
-
-
-##
-# Cleanup any leftovers found in **dir** via **teardown**
-# and reset **dir** as an empty environment.
-#
-# @param dir path name of the environment
-# @return 0 on success, 1 on error
-#
-function setup() {
- local dir=$1
- teardown $dir || return 1
- mkdir -p $dir
-}
-
-function test_setup() {
- local dir=$dir
- setup $dir || return 1
- test -d $dir || return 1
- setup $dir || return 1
- test -d $dir || return 1
- teardown $dir
-}
-
-#######################################################################
-
-##
-# Kill all daemons for which a .pid file exists in **dir** and remove
-# **dir**. If the file system in which **dir** is btrfs, delete all
-# subvolumes that relate to it.
-#
-# @param dir path name of the environment
-# @return 0 on success, 1 on error
-#
-function teardown() {
- local dir=$1
- kill_daemons $dir KILL
- if [ `uname` != FreeBSD ] \
- && [ $(stat -f -c '%T' .) == "btrfs" ]; then
- __teardown_btrfs $dir
- fi
- rm -fr $dir
-}
-
-function __teardown_btrfs() {
- local btrfs_base_dir=$1
- local btrfs_root=$(df -P . | tail -1 | awk '{print $NF}')
- local btrfs_dirs=$(cd $btrfs_base_dir; sudo btrfs subvolume list . -t | awk '/^[0-9]/ {print $4}' | grep "$btrfs_base_dir/$btrfs_dir")
- for subvolume in $btrfs_dirs; do
- sudo btrfs subvolume delete $btrfs_root/$subvolume
- done
-}
-
-function test_teardown() {
- local dir=$dir
- setup $dir || return 1
- teardown $dir || return 1
- ! test -d $dir || return 1
-}
-
-#######################################################################
-
-##
-# Sends a signal to a single daemon.
-# This is a helper function for kill_daemons
-#
-# After the daemon is sent **signal**, its actual termination
-# will be verified by sending it signal 0. If the daemon is
-# still alive, kill_daemon will pause for a few seconds and
-# try again. This will repeat for a fixed number of times
-# before kill_daemon returns on failure. The list of
-# sleep intervals can be specified as **delays** and defaults
-# to:
-#
-# 0.1 0.2 1 1 1 2 3 5 5 5 10 10 20 60 60 60 120
-#
-# This sequence is designed to run first a very short sleep time (0.1)
-# if the machine is fast enough and the daemon terminates in a fraction of a
-# second. The increasing sleep numbers should give plenty of time for
-# the daemon to die even on the slowest running machine. If a daemon
-# takes more than a few minutes to stop (the sum of all sleep times),
-# there probably is no point in waiting more and a number of things
-# are likely to go wrong anyway: better give up and return on error.
-#
-# @param pid the process id to send a signal
-# @param send_signal the signal to send
-# @param delays sequence of sleep times before failure
-#
-function kill_daemon() {
- local pid=$(cat $1)
- local send_signal=$2
- local delays=${3:-0.1 0.2 1 1 1 2 3 5 5 5 10 10 20 60 60 60 120}
- local exit_code=1
- for try in $delays ; do
- if kill -$send_signal $pid 2> /dev/null ; then
- exit_code=1
- else
- exit_code=0
- break
- fi
- send_signal=0
- sleep $try
- done;
- return $exit_code
-}
-
-function test_kill_daemon() {
- local dir=$1
- setup $dir || return 1
- run_mon $dir a --osd_pool_default_size=1 || return 1
- run_mgr $dir x || return 1
- run_osd $dir 0 || return 1
-
- name_prefix=osd
- for pidfile in $(find $dir 2>/dev/null | grep $name_prefix'[^/]*\.pid') ; do
- #
- # sending signal 0 won't kill the daemon
- # waiting just for one second instead of the default schedule
- # allows us to quickly verify what happens when kill fails
- # to stop the daemon (i.e. it must return false)
- #
- ! kill_daemon $pidfile 0 1 || return 1
- #
- # killing just the osd and verify the mon still is responsive
- #
- kill_daemon $pidfile TERM || return 1
- done
-
- ceph osd dump | grep "osd.0 down" || return 1
-
- name_prefix=mgr
- for pidfile in $(find $dir 2>/dev/null | grep $name_prefix'[^/]*\.pid') ; do
- #
- # kill the mgr
- #
- kill_daemon $pidfile TERM || return 1
- done
-
- name_prefix=mon
- for pidfile in $(find $dir 2>/dev/null | grep $name_prefix'[^/]*\.pid') ; do
- #
- # kill the mon and verify it cannot be reached
- #
- kill_daemon $pidfile TERM || return 1
- ! timeout 5 ceph status || return 1
- done
-
- teardown $dir || return 1
-}
-
-##
-# Kill all daemons for which a .pid file exists in **dir**. Each
-# daemon is sent a **signal** and kill_daemons waits for it to exit
-# during a few minutes. By default all daemons are killed. If a
-# **name_prefix** is provided, only the daemons for which a pid
-# file is found matching the prefix are killed. See run_osd and
-# run_mon for more information about the name conventions for
-# the pid files.
-#
-# Send TERM to all daemons : kill_daemons $dir
-# Send KILL to all daemons : kill_daemons $dir KILL
-# Send KILL to all osds : kill_daemons $dir KILL osd
-# Send KILL to osd 1 : kill_daemons $dir KILL osd.1
-#
-# If a daemon is sent the TERM signal and does not terminate
-# within a few minutes, it will still be running even after
-# kill_daemons returns.
-#
-# If all daemons are kill successfully the function returns 0
-# if at least one daemon remains, this is treated as an
-# error and the function return 1.
-#
-# @param dir path name of the environment
-# @param signal name of the first signal (defaults to TERM)
-# @param name_prefix only kill match daemons (defaults to all)
-# @param delays sequence of sleep times before failure
-# @return 0 on success, 1 on error
-#
-function kill_daemons() {
- local trace=$(shopt -q -o xtrace && echo true || echo false)
- $trace && shopt -u -o xtrace
- local dir=$1
- local signal=${2:-TERM}
- local name_prefix=$3 # optional, osd, mon, osd.1
- local delays=$4 #optional timing
- local status=0
- local pids=""
-
- for pidfile in $(find $dir 2>/dev/null | grep $name_prefix'[^/]*\.pid') ; do
- run_in_background pids kill_daemon $pidfile $signal $delays
- done
-
- wait_background pids
- status=$?
-
- $trace && shopt -s -o xtrace
- return $status
-}
-
-function test_kill_daemons() {
- local dir=$1
- setup $dir || return 1
- run_mon $dir a --osd_pool_default_size=1 || return 1
- run_mgr $dir x || return 1
- run_osd $dir 0 || return 1
- #
- # sending signal 0 won't kill the daemon
- # waiting just for one second instead of the default schedule
- # allows us to quickly verify what happens when kill fails
- # to stop the daemon (i.e. it must return false)
- #
- ! kill_daemons $dir 0 osd 1 || return 1
- #
- # killing just the osd and verify the mon still is responsive
- #
- kill_daemons $dir TERM osd || return 1
- ceph osd dump | grep "osd.0 down" || return 1
- #
- # kill the mgr
- #
- kill_daemons $dir TERM mgr || return 1
- #
- # kill the mon and verify it cannot be reached
- #
- kill_daemons $dir TERM || return 1
- ! timeout 5 ceph status || return 1
- teardown $dir || return 1
-}
-
-#######################################################################
-
-##
-# Run a monitor by the name mon.**id** with data in **dir**/**id**.
-# The logs can be found in **dir**/mon.**id**.log and the pid file
-# is **dir**/mon.**id**.pid and the admin socket is
-# **dir**/**id**/ceph-mon.**id**.asok.
-#
-# The remaining arguments are passed verbatim to ceph-mon --mkfs
-# and the ceph-mon daemon.
-#
-# Two mandatory arguments must be provided: --fsid and --mon-host
-# Instead of adding them to every call to run_mon, they can be
-# set in the CEPH_ARGS environment variable to be read implicitly
-# by every ceph command.
-#
-# The CEPH_CONF variable is expected to be set to /dev/null to
-# only rely on arguments for configuration.
-#
-# Examples:
-#
-# CEPH_ARGS="--fsid=$(uuidgen) "
-# CEPH_ARGS+="--mon-host=127.0.0.1:7018 "
-# run_mon $dir a # spawn a mon and bind port 7018
-# run_mon $dir a --debug-filestore=20 # spawn with filestore debugging
-#
-# If mon_initial_members is not set, the default rbd pool is deleted
-# and replaced with a replicated pool with less placement groups to
-# speed up initialization. If mon_initial_members is set, no attempt
-# is made to recreate the rbd pool because it would hang forever,
-# waiting for other mons to join.
-#
-# A **dir**/ceph.conf file is created but not meant to be used by any
-# function. It is convenient for debugging a failure with:
-#
-# ceph --conf **dir**/ceph.conf -s
-#
-# @param dir path name of the environment
-# @param id mon identifier
-# @param ... can be any option valid for ceph-mon
-# @return 0 on success, 1 on error
-#
-function run_mon_no_pool() {
- local dir=$1
- shift
- local id=$1
- shift
- local data=$dir/$id
-
- ceph-mon \
- --id $id \
- --mkfs \
- --mon-data=$data \
- --run-dir=$dir \
- "$@" || return 1
-
- ceph-mon \
- --id $id \
- --mon-osd-full-ratio=.99 \
- --mon-data-avail-crit=1 \
- --paxos-propose-interval=0.1 \
- --osd-crush-chooseleaf-type=0 \
- --erasure-code-dir=$CEPH_LIB \
- --plugin-dir=$CEPH_LIB \
- --debug-mon 20 \
- --debug-ms 20 \
- --debug-paxos 20 \
- --chdir= \
- --mon-data=$data \
- --log-file=$dir/\$name.log \
- --admin-socket=$dir/\$cluster-\$name.asok \
- --mon-cluster-log-file=$dir/log \
- --run-dir=$dir \
- --pid-file=$dir/\$name.pid \
- --mon-allow-pool-delete \
- "$@" || return 1
-
- cat > $dir/ceph.conf <<EOF
-[global]
-fsid = $(get_config mon $id fsid)
-mon host = $(get_config mon $id mon_host)
-EOF
-}
-
-function run_mon() {
- local dir=$1
- shift
- local id=$1
- shift
-
- run_mon_no_pool $dir $id "$@" || return 1
-
- ceph osd pool create rbd 8
-
- if test -z "$(get_config mon $id mon_initial_members)" ; then
- ceph osd pool delete rbd rbd --yes-i-really-really-mean-it || return 1
- ceph osd pool create rbd $PG_NUM || return 1
- ceph osd set-backfillfull-ratio .99
- fi
-}
-
-function test_run_mon() {
- local dir=$1
-
- setup $dir || return 1
-
- run_mon $dir a --mon-initial-members=a || return 1
- # rbd has not been deleted / created, hence it has pool id 0
- ceph osd dump | grep "pool 1 'rbd'" || return 1
- kill_daemons $dir || return 1
-
- run_mon $dir a || return 1
- # rbd has been deleted / created, hence it does not have pool id 0
- ! ceph osd dump | grep "pool 1 'rbd'" || return 1
- local size=$(CEPH_ARGS='' ceph --format=json daemon $dir/ceph-mon.a.asok \
- config get osd_pool_default_size)
- test "$size" = '{"osd_pool_default_size":"3"}' || return 1
-
- ! CEPH_ARGS='' ceph status || return 1
- CEPH_ARGS='' ceph --conf $dir/ceph.conf status || return 1
-
- kill_daemons $dir || return 1
-
- run_mon $dir a --osd_pool_default_size=1 || return 1
- local size=$(CEPH_ARGS='' ceph --format=json daemon $dir/ceph-mon.a.asok \
- config get osd_pool_default_size)
- test "$size" = '{"osd_pool_default_size":"1"}' || return 1
- kill_daemons $dir || return 1
-
- CEPH_ARGS="$CEPH_ARGS --osd_pool_default_size=2" \
- run_mon $dir a || return 1
- local size=$(CEPH_ARGS='' ceph --format=json daemon $dir/ceph-mon.a.asok \
- config get osd_pool_default_size)
- test "$size" = '{"osd_pool_default_size":"2"}' || return 1
- kill_daemons $dir || return 1
-
- teardown $dir || return 1
-}
-
-#######################################################################
-
-function run_mgr() {
- local dir=$1
- shift
- local id=$1
- shift
- local data=$dir/$id
-
- ceph-mgr \
- --id $id \
- --erasure-code-dir=$CEPH_LIB \
- --plugin-dir=$CEPH_LIB \
- --debug-mgr 20 \
- --debug-objecter 20 \
- --debug-ms 20 \
- --debug-paxos 20 \
- --chdir= \
- --mgr-data=$data \
- --log-file=$dir/\$name.log \
- --admin-socket=$dir/\$cluster-\$name.asok \
- --run-dir=$dir \
- --pid-file=$dir/\$name.pid \
- "$@" || return 1
-}
-
-#######################################################################
-
-##
-# Create (prepare) and run (activate) an osd by the name osd.**id**
-# with data in **dir**/**id**. The logs can be found in
-# **dir**/osd.**id**.log, the pid file is **dir**/osd.**id**.pid and
-# the admin socket is **dir**/**id**/ceph-osd.**id**.asok.
-#
-# The remaining arguments are passed verbatim to ceph-osd.
-#
-# Two mandatory arguments must be provided: --fsid and --mon-host
-# Instead of adding them to every call to run_osd, they can be
-# set in the CEPH_ARGS environment variable to be read implicitly
-# by every ceph command.
-#
-# The CEPH_CONF variable is expected to be set to /dev/null to
-# only rely on arguments for configuration.
-#
-# The run_osd function creates the OSD data directory with ceph-disk
-# prepare on the **dir**/**id** directory and relies on the
-# activate_osd function to run the daemon.
-#
-# Examples:
-#
-# CEPH_ARGS="--fsid=$(uuidgen) "
-# CEPH_ARGS+="--mon-host=127.0.0.1:7018 "
-# run_osd $dir 0 # prepare and activate an osd using the monitor listening on 7018
-#
-# @param dir path name of the environment
-# @param id osd identifier
-# @param ... can be any option valid for ceph-osd
-# @return 0 on success, 1 on error
-#
-function run_osd() {
- local dir=$1
- shift
- local id=$1
- shift
- local osd_data=$dir/$id
-
- local ceph_disk_args
- ceph_disk_args+=" --statedir=$dir"
- ceph_disk_args+=" --sysconfdir=$dir"
- ceph_disk_args+=" --prepend-to-path="
-
- mkdir -p $osd_data
- ceph-disk $ceph_disk_args \
- prepare --filestore $osd_data || return 1
-
- activate_osd $dir $id "$@"
-}
-
-function run_osd_bluestore() {
- local dir=$1
- shift
- local id=$1
- shift
- local osd_data=$dir/$id
-
- local ceph_disk_args
- ceph_disk_args+=" --statedir=$dir"
- ceph_disk_args+=" --sysconfdir=$dir"
- ceph_disk_args+=" --prepend-to-path="
-
- mkdir -p $osd_data
- ceph-disk $ceph_disk_args \
- prepare --bluestore $osd_data || return 1
-
- activate_osd $dir $id "$@"
-}
-
-function test_run_osd() {
- local dir=$1
-
- setup $dir || return 1
-
- run_mon $dir a || return 1
- run_mgr $dir x || return 1
-
- run_osd $dir 0 || return 1
- local backfills=$(CEPH_ARGS='' ceph --format=json daemon $dir//ceph-osd.0.asok \
- config get osd_max_backfills)
- echo "$backfills" | grep --quiet 'osd_max_backfills' || return 1
-
- run_osd $dir 1 --osd-max-backfills 20 || return 1
- local backfills=$(CEPH_ARGS='' ceph --format=json daemon $dir//ceph-osd.1.asok \
- config get osd_max_backfills)
- test "$backfills" = '{"osd_max_backfills":"20"}' || return 1
-
- CEPH_ARGS="$CEPH_ARGS --osd-max-backfills 30" run_osd $dir 2 || return 1
- local backfills=$(CEPH_ARGS='' ceph --format=json daemon $dir//ceph-osd.2.asok \
- config get osd_max_backfills)
- test "$backfills" = '{"osd_max_backfills":"30"}' || return 1
-
- teardown $dir || return 1
-}
-
-#######################################################################
-
-##
-# Shutdown and remove all traces of the osd by the name osd.**id**.
-#
-# The OSD is shutdown with the TERM signal. It is then removed from
-# the auth list, crush map, osd map etc and the files associated with
-# it are also removed.
-#
-# @param dir path name of the environment
-# @param id osd identifier
-# @return 0 on success, 1 on error
-#
-function destroy_osd() {
- local dir=$1
- local id=$2
-
- kill_daemons $dir TERM osd.$id || return 1
- ceph osd out osd.$id || return 1
- ceph auth del osd.$id || return 1
- ceph osd crush remove osd.$id || return 1
- ceph osd rm $id || return 1
- teardown $dir/$id || return 1
- rm -fr $dir/$id
-}
-
-function test_destroy_osd() {
- local dir=$1
-
- setup $dir || return 1
- run_mon $dir a || return 1
- run_mgr $dir x || return 1
- run_osd $dir 0 || return 1
- destroy_osd $dir 0 || return 1
- ! ceph osd dump | grep "osd.$id " || return 1
- teardown $dir || return 1
-}
-
-#######################################################################
-
-##
-# Run (activate) an osd by the name osd.**id** with data in
-# **dir**/**id**. The logs can be found in **dir**/osd.**id**.log,
-# the pid file is **dir**/osd.**id**.pid and the admin socket is
-# **dir**/**id**/ceph-osd.**id**.asok.
-#
-# The remaining arguments are passed verbatim to ceph-osd.
-#
-# Two mandatory arguments must be provided: --fsid and --mon-host
-# Instead of adding them to every call to activate_osd, they can be
-# set in the CEPH_ARGS environment variable to be read implicitly
-# by every ceph command.
-#
-# The CEPH_CONF variable is expected to be set to /dev/null to
-# only rely on arguments for configuration.
-#
-# The activate_osd function expects a valid OSD data directory
-# in **dir**/**id**, either just created via run_osd or re-using
-# one left by a previous run of ceph-osd. The ceph-osd daemon is
-# run indirectly via ceph-disk activate.
-#
-# The activate_osd function blocks until the monitor reports the osd
-# up. If it fails to do so within $TIMEOUT seconds, activate_osd
-# fails.
-#
-# Examples:
-#
-# CEPH_ARGS="--fsid=$(uuidgen) "
-# CEPH_ARGS+="--mon-host=127.0.0.1:7018 "
-# activate_osd $dir 0 # activate an osd using the monitor listening on 7018
-#
-# @param dir path name of the environment
-# @param id osd identifier
-# @param ... can be any option valid for ceph-osd
-# @return 0 on success, 1 on error
-#
-function activate_osd() {
- local dir=$1
- shift
- local id=$1
- shift
- local osd_data=$dir/$id
-
- local ceph_disk_args
- ceph_disk_args+=" --statedir=$dir"
- ceph_disk_args+=" --sysconfdir=$dir"
- ceph_disk_args+=" --prepend-to-path="
-
- local ceph_args="$CEPH_ARGS"
- ceph_args+=" --osd-failsafe-full-ratio=.99"
- ceph_args+=" --osd-journal-size=100"
- ceph_args+=" --osd-scrub-load-threshold=2000"
- ceph_args+=" --osd-data=$osd_data"
- ceph_args+=" --chdir="
- ceph_args+=" --erasure-code-dir=$CEPH_LIB"
- ceph_args+=" --plugin-dir=$CEPH_LIB"
- ceph_args+=" --osd-class-dir=$CEPH_LIB"
- ceph_args+=" --run-dir=$dir"
- ceph_args+=" --debug-osd=20"
- ceph_args+=" --log-file=$dir/\$name.log"
- ceph_args+=" --pid-file=$dir/\$name.pid"
- ceph_args+=" --osd-max-object-name-len 460"
- ceph_args+=" --osd-max-object-namespace-len 64"
- ceph_args+=" --enable-experimental-unrecoverable-data-corrupting-features *"
- ceph_args+=" "
- ceph_args+="$@"
- mkdir -p $osd_data
- CEPH_ARGS="$ceph_args " ceph-disk $ceph_disk_args \
- activate \
- --mark-init=none \
- $osd_data || return 1
-
- [ "$id" = "$(cat $osd_data/whoami)" ] || return 1
-
- wait_for_osd up $id || return 1
-}
-
-function test_activate_osd() {
- local dir=$1
-
- setup $dir || return 1
-
- run_mon $dir a || return 1
- run_mgr $dir x || return 1
-
- run_osd $dir 0 || return 1
- local backfills=$(CEPH_ARGS='' ceph --format=json daemon $dir//ceph-osd.0.asok \
- config get osd_max_backfills)
- echo "$backfills" | grep --quiet 'osd_max_backfills' || return 1
-
- kill_daemons $dir TERM osd || return 1
-
- activate_osd $dir 0 --osd-max-backfills 20 || return 1
- local backfills=$(CEPH_ARGS='' ceph --format=json daemon $dir//ceph-osd.0.asok \
- config get osd_max_backfills)
- test "$backfills" = '{"osd_max_backfills":"20"}' || return 1
-
- teardown $dir || return 1
-}
-
-#######################################################################
-
-##
-# Wait until the OSD **id** is either up or down, as specified by
-# **state**. It fails after $TIMEOUT seconds.
-#
-# @param state either up or down
-# @param id osd identifier
-# @return 0 on success, 1 on error
-#
-function wait_for_osd() {
- local state=$1
- local id=$2
-
- status=1
- for ((i=0; i < $TIMEOUT; i++)); do
- echo $i
- if ! ceph osd dump | grep "osd.$id $state"; then
- sleep 1
- else
- status=0
- break
- fi
- done
- return $status
-}
-
-function test_wait_for_osd() {
- local dir=$1
- setup $dir || return 1
- run_mon $dir a --osd_pool_default_size=1 || return 1
- run_mgr $dir x || return 1
- run_osd $dir 0 || return 1
- wait_for_osd up 0 || return 1
- kill_daemons $dir TERM osd || return 1
- wait_for_osd down 0 || return 1
- ( TIMEOUT=1 ; ! wait_for_osd up 0 ) || return 1
- teardown $dir || return 1
-}
-
-#######################################################################
-
-##
-# Display the list of OSD ids supporting the **objectname** stored in
-# **poolname**, as reported by ceph osd map.
-#
-# @param poolname an existing pool
-# @param objectname an objectname (may or may not exist)
-# @param STDOUT white space separated list of OSD ids
-# @return 0 on success, 1 on error
-#
-function get_osds() {
- local poolname=$1
- local objectname=$2
-
- local osds=$(ceph --format json osd map $poolname $objectname 2>/dev/null | \
- jq '.acting | .[]')
- # get rid of the trailing space
- echo $osds
-}
-
-function test_get_osds() {
- local dir=$1
-
- setup $dir || return 1
- run_mon $dir a --osd_pool_default_size=2 || return 1
- run_mgr $dir x || return 1
- run_osd $dir 0 || return 1
- run_osd $dir 1 || return 1
- wait_for_clean || return 1
- get_osds rbd GROUP | grep --quiet '^[0-1] [0-1]$' || return 1
- teardown $dir || return 1
-}
-
-#######################################################################
-
-##
-# Wait for the monitor to form quorum (optionally, of size N)
-#
-# @param timeout duration (lower-bound) to wait for quorum to be formed
-# @param quorumsize size of quorum to wait for
-# @return 0 on success, 1 on error
-#
-function wait_for_quorum() {
- local timeout=$1
- local quorumsize=$2
-
- if [[ -z "$timeout" ]]; then
- timeout=300
- fi
-
- if [[ -z "$quorumsize" ]]; then
- timeout $timeout ceph mon_status --format=json >&/dev/null || return 1
- return 0
- fi
-
- no_quorum=1
- wait_until=$((`date +%s` + $timeout))
- while [[ $(date +%s) -lt $wait_until ]]; do
- jqfilter='.quorum | length == '$quorumsize
- jqinput="$(timeout $timeout ceph mon_status --format=json 2>/dev/null)"
- res=$(echo $jqinput | jq "$jqfilter")
- if [[ "$res" == "true" ]]; then
- no_quorum=0
- break
- fi
- done
- return $no_quorum
-}
-
-#######################################################################
-
-##
-# Return the PG of supporting the **objectname** stored in
-# **poolname**, as reported by ceph osd map.
-#
-# @param poolname an existing pool
-# @param objectname an objectname (may or may not exist)
-# @param STDOUT a PG
-# @return 0 on success, 1 on error
-#
-function get_pg() {
- local poolname=$1
- local objectname=$2
-
- ceph --format json osd map $poolname $objectname 2>/dev/null | jq -r '.pgid'
-}
-
-function test_get_pg() {
- local dir=$1
-
- setup $dir || return 1
- run_mon $dir a --osd_pool_default_size=1 || return 1
- run_mgr $dir x || return 1
- run_osd $dir 0 || return 1
- wait_for_clean || return 1
- get_pg rbd GROUP | grep --quiet '^[0-9]\.[0-9a-f][0-9a-f]*$' || return 1
- teardown $dir || return 1
-}
-
-#######################################################################
-
-##
-# Return the value of the **config**, obtained via the config get command
-# of the admin socket of **daemon**.**id**.
-#
-# @param daemon mon or osd
-# @param id mon or osd ID
-# @param config the configuration variable name as found in config_opts.h
-# @param STDOUT the config value
-# @return 0 on success, 1 on error
-#
-function get_config() {
- local daemon=$1
- local id=$2
- local config=$3
-
- CEPH_ARGS='' \
- ceph --format json daemon $dir/ceph-$daemon.$id.asok \
- config get $config 2> /dev/null | \
- jq -r ".$config"
-}
-
-function test_get_config() {
- local dir=$1
-
- # override the default config using command line arg and check it
- setup $dir || return 1
- run_mon $dir a --osd_pool_default_size=1 || return 1
- test $(get_config mon a osd_pool_default_size) = 1 || return 1
- run_mgr $dir x || return 1
- run_osd $dir 0 --osd_max_scrubs=3 || return 1
- test $(get_config osd 0 osd_max_scrubs) = 3 || return 1
- teardown $dir || return 1
-}
-
-#######################################################################
-
-##
-# Set the **config** to specified **value**, via the config set command
-# of the admin socket of **daemon**.**id**
-#
-# @param daemon mon or osd
-# @param id mon or osd ID
-# @param config the configuration variable name as found in config_opts.h
-# @param value the config value
-# @return 0 on success, 1 on error
-#
-function set_config() {
- local daemon=$1
- local id=$2
- local config=$3
- local value=$4
-
- test $(env CEPH_ARGS='' ceph --format json daemon $dir/ceph-$daemon.$id.asok \
- config set $config $value 2> /dev/null | \
- jq 'has("success")') == true
-}
-
-function test_set_config() {
- local dir=$1
-
- setup $dir || return 1
- run_mon $dir a --osd_pool_default_size=1 || return 1
- test $(get_config mon a ms_crc_header) = true || return 1
- set_config mon a ms_crc_header false || return 1
- test $(get_config mon a ms_crc_header) = false || return 1
- set_config mon a ms_crc_header true || return 1
- test $(get_config mon a ms_crc_header) = true || return 1
- teardown $dir || return 1
-}
-
-#######################################################################
-
-##
-# Return the OSD id of the primary OSD supporting the **objectname**
-# stored in **poolname**, as reported by ceph osd map.
-#
-# @param poolname an existing pool
-# @param objectname an objectname (may or may not exist)
-# @param STDOUT the primary OSD id
-# @return 0 on success, 1 on error
-#
-function get_primary() {
- local poolname=$1
- local objectname=$2
-
- ceph --format json osd map $poolname $objectname 2>/dev/null | \
- jq '.acting_primary'
-}
-
-function test_get_primary() {
- local dir=$1
-
- setup $dir || return 1
- run_mon $dir a --osd_pool_default_size=1 || return 1
- local osd=0
- run_mgr $dir x || return 1
- run_osd $dir $osd || return 1
- wait_for_clean || return 1
- test $(get_primary rbd GROUP) = $osd || return 1
- teardown $dir || return 1
-}
-
-#######################################################################
-
-##
-# Return the id of any OSD supporting the **objectname** stored in
-# **poolname**, as reported by ceph osd map, except the primary.
-#
-# @param poolname an existing pool
-# @param objectname an objectname (may or may not exist)
-# @param STDOUT the OSD id
-# @return 0 on success, 1 on error
-#
-function get_not_primary() {
- local poolname=$1
- local objectname=$2
-
- local primary=$(get_primary $poolname $objectname)
- ceph --format json osd map $poolname $objectname 2>/dev/null | \
- jq ".acting | map(select (. != $primary)) | .[0]"
-}
-
-function test_get_not_primary() {
- local dir=$1
-
- setup $dir || return 1
- run_mon $dir a --osd_pool_default_size=2 || return 1
- run_mgr $dir x || return 1
- run_osd $dir 0 || return 1
- run_osd $dir 1 || return 1
- wait_for_clean || return 1
- local primary=$(get_primary rbd GROUP)
- local not_primary=$(get_not_primary rbd GROUP)
- test $not_primary != $primary || return 1
- test $not_primary = 0 -o $not_primary = 1 || return 1
- teardown $dir || return 1
-}
-
-#######################################################################
-
-##
-# Run ceph-objectstore-tool against the OSD **id** using the data path
-# **dir**. The OSD is killed with TERM prior to running
-# ceph-objectstore-tool because access to the data path is
-# exclusive. The OSD is restarted after the command completes. The
-# objectstore_tool returns after all PG are active+clean again.
-#
-# @param dir the data path of the OSD
-# @param id the OSD id
-# @param ... arguments to ceph-objectstore-tool
-# @param STDIN the input of ceph-objectstore-tool
-# @param STDOUT the output of ceph-objectstore-tool
-# @return 0 on success, 1 on error
-#
-# The value of $ceph_osd_args will be passed to restarted osds
-#
-function objectstore_tool() {
- local dir=$1
- shift
- local id=$1
- shift
- local osd_data=$dir/$id
-
- local osd_type=$(cat $osd_data/type)
-
- kill_daemons $dir TERM osd.$id >&2 < /dev/null || return 1
-
- local journal_args
- if [ "$objectstore_type" == "filestore" ]; then
- journal_args=" --journal-path $osd_data/journal"
- fi
- ceph-objectstore-tool \
- --data-path $osd_data \
- $journal_args \
- "$@" || return 1
- activate_osd $dir $id $ceph_osd_args >&2 || return 1
- wait_for_clean >&2
-}
-
-function test_objectstore_tool() {
- local dir=$1
-
- setup $dir || return 1
- run_mon $dir a --osd_pool_default_size=1 || return 1
- local osd=0
- run_mgr $dir x || return 1
- run_osd $dir $osd || return 1
- wait_for_clean || return 1
- rados --pool rbd put GROUP /etc/group || return 1
- objectstore_tool $dir $osd GROUP get-bytes | \
- diff - /etc/group
- ! objectstore_tool $dir $osd NOTEXISTS get-bytes || return 1
- teardown $dir || return 1
-}
-
-#######################################################################
-
-##
-# Predicate checking if there is an ongoing recovery in the
-# cluster. If any of the recovering_{keys,bytes,objects}_per_sec
-# counters are reported by ceph status, it means recovery is in
-# progress.
-#
-# @return 0 if recovery in progress, 1 otherwise
-#
-function get_is_making_recovery_progress() {
- local recovery_progress
- recovery_progress+=".recovering_keys_per_sec + "
- recovery_progress+=".recovering_bytes_per_sec + "
- recovery_progress+=".recovering_objects_per_sec"
- local progress=$(ceph --format json status 2>/dev/null | \
- jq -r ".pgmap | $recovery_progress")
- test "$progress" != null
-}
-
-function test_get_is_making_recovery_progress() {
- local dir=$1
-
- setup $dir || return 1
- run_mon $dir a || return 1
- run_mgr $dir x || return 1
- ! get_is_making_recovery_progress || return 1
- teardown $dir || return 1
-}
-
-#######################################################################
-
-##
-# Return the number of active PGs in the cluster. A PG is active if
-# ceph pg dump pgs reports it both **active** and **clean** and that
-# not **stale**.
-#
-# @param STDOUT the number of active PGs
-# @return 0 on success, 1 on error
-#
-function get_num_active_clean() {
- local expression
- expression+="select(contains(\"active\") and contains(\"clean\")) | "
- expression+="select(contains(\"stale\") | not)"
- ceph --format json pg dump pgs 2>/dev/null | \
- jq "[.[] | .state | $expression] | length"
-}
-
-function test_get_num_active_clean() {
- local dir=$1
-
- setup $dir || return 1
- run_mon $dir a --osd_pool_default_size=1 || return 1
- run_mgr $dir x || return 1
- run_osd $dir 0 || return 1
- wait_for_clean || return 1
- local num_active_clean=$(get_num_active_clean)
- test "$num_active_clean" = $PG_NUM || return 1
- teardown $dir || return 1
-}
-
-#######################################################################
-
-##
-# Return the number of PGs in the cluster, according to
-# ceph pg dump pgs.
-#
-# @param STDOUT the number of PGs
-# @return 0 on success, 1 on error
-#
-function get_num_pgs() {
- ceph --format json status 2>/dev/null | jq '.pgmap.num_pgs'
-}
-
-function test_get_num_pgs() {
- local dir=$1
-
- setup $dir || return 1
- run_mon $dir a --osd_pool_default_size=1 || return 1
- run_mgr $dir x || return 1
- run_osd $dir 0 || return 1
- wait_for_clean || return 1
- local num_pgs=$(get_num_pgs)
- test "$num_pgs" -gt 0 || return 1
- teardown $dir || return 1
-}
-
-#######################################################################
-
-##
-# Return the date and time of the last completed scrub for **pgid**,
-# as reported by ceph pg dump pgs. Note that a repair also sets this
-# date.
-#
-# @param pgid the id of the PG
-# @param STDOUT the date and time of the last scrub
-# @return 0 on success, 1 on error
-#
-function get_last_scrub_stamp() {
- local pgid=$1
- local sname=${2:-last_scrub_stamp}
- ceph --format json pg dump pgs 2>/dev/null | \
- jq -r ".[] | select(.pgid==\"$pgid\") | .$sname"
-}
-
-function test_get_last_scrub_stamp() {
- local dir=$1
-
- setup $dir || return 1
- run_mon $dir a --osd_pool_default_size=1 || return 1
- run_mgr $dir x || return 1
- run_osd $dir 0 || return 1
- wait_for_clean || return 1
- stamp=$(get_last_scrub_stamp 2.0)
- test -n "$stamp" || return 1
- teardown $dir || return 1
-}
-
-#######################################################################
-
-##
-# Predicate checking if the cluster is clean, i.e. all of its PGs are
-# in a clean state (see get_num_active_clean for a definition).
-#
-# @return 0 if the cluster is clean, 1 otherwise
-#
-function is_clean() {
- num_pgs=$(get_num_pgs)
- test $num_pgs != 0 || return 1
- test $(get_num_active_clean) = $num_pgs || return 1
-}
-
-function test_is_clean() {
- local dir=$1
-
- setup $dir || return 1
- run_mon $dir a --osd_pool_default_size=1 || return 1
- run_mgr $dir x || return 1
- run_osd $dir 0 || return 1
- wait_for_clean || return 1
- is_clean || return 1
- teardown $dir || return 1
-}
-
-#######################################################################
-
-##
-# Return a list of numbers that are increasingly larger and whose
-# total is **timeout** seconds. It can be used to have short sleep
-# delay while waiting for an event on a fast machine. But if running
-# very slowly the larger delays avoid stressing the machine even
-# further or spamming the logs.
-#
-# @param timeout sum of all delays, in seconds
-# @return a list of sleep delays
-#
-function get_timeout_delays() {
- local trace=$(shopt -q -o xtrace && echo true || echo false)
- $trace && shopt -u -o xtrace
- local timeout=$1
- local first_step=${2:-1}
-
- local i
- local total="0"
- i=$first_step
- while test "$(echo $total + $i \<= $timeout | bc -l)" = "1"; do
- echo -n "$i "
- total=$(echo $total + $i | bc -l)
- i=$(echo $i \* 2 | bc -l)
- done
- if test "$(echo $total \< $timeout | bc -l)" = "1"; then
- echo -n $(echo $timeout - $total | bc -l)
- fi
- $trace && shopt -s -o xtrace
-}
-
-function test_get_timeout_delays() {
- test "$(get_timeout_delays 1)" = "1 " || return 1
- test "$(get_timeout_delays 5)" = "1 2 2" || return 1
- test "$(get_timeout_delays 6)" = "1 2 3" || return 1
- test "$(get_timeout_delays 7)" = "1 2 4 " || return 1
- test "$(get_timeout_delays 8)" = "1 2 4 1" || return 1
- test "$(get_timeout_delays 1 .1)" = ".1 .2 .4 .3" || return 1
- test "$(get_timeout_delays 1.5 .1)" = ".1 .2 .4 .8 " || return 1
- test "$(get_timeout_delays 5 .1)" = ".1 .2 .4 .8 1.6 1.9" || return 1
- test "$(get_timeout_delays 6 .1)" = ".1 .2 .4 .8 1.6 2.9" || return 1
- test "$(get_timeout_delays 6.3 .1)" = ".1 .2 .4 .8 1.6 3.2 " || return 1
- test "$(get_timeout_delays 20 .1)" = ".1 .2 .4 .8 1.6 3.2 6.4 7.3" || return 1
-}
-
-#######################################################################
-
-##
-# Wait until the cluster becomes clean or if it does not make progress
-# for $TIMEOUT seconds.
-# Progress is measured either via the **get_is_making_recovery_progress**
-# predicate or if the number of clean PGs changes (as returned by get_num_active_clean)
-#
-# @return 0 if the cluster is clean, 1 otherwise
-#
-function wait_for_clean() {
- local num_active_clean=-1
- local cur_active_clean
- local -a delays=($(get_timeout_delays $TIMEOUT .1))
- local -i loop=0
-
- while test $(get_num_pgs) == 0 ; do
- sleep 1
- done
-
- while true ; do
- # Comparing get_num_active_clean & get_num_pgs is used to determine
- # if the cluster is clean. That's almost an inline of is_clean() to
- # get more performance by avoiding multiple calls of get_num_active_clean.
- cur_active_clean=$(get_num_active_clean)
- test $cur_active_clean = $(get_num_pgs) && break
- if test $cur_active_clean != $num_active_clean ; then
- loop=0
- num_active_clean=$cur_active_clean
- elif get_is_making_recovery_progress ; then
- loop=0
- elif (( $loop >= ${#delays[*]} )) ; then
- ceph report
- return 1
- fi
- sleep ${delays[$loop]}
- loop+=1
- done
- return 0
-}
-
-function test_wait_for_clean() {
- local dir=$1
-
- setup $dir || return 1
- run_mon $dir a --osd_pool_default_size=1 || return 1
- run_mgr $dir x || return 1
- ! TIMEOUT=1 wait_for_clean || return 1
- run_osd $dir 0 || return 1
- wait_for_clean || return 1
- teardown $dir || return 1
-}
-
-#######################################################################
-
-##
-# Wait until the cluster becomes HEALTH_OK again or if it does not make progress
-# for $TIMEOUT seconds.
-#
-# @return 0 if the cluster is HEALTHY, 1 otherwise
-#
-function wait_for_health() {
- local grepstr=$1
- local -a delays=($(get_timeout_delays $TIMEOUT .1))
- local -i loop=0
-
- while ! ceph health detail | grep "$grepstr" ; do
- if (( $loop >= ${#delays[*]} )) ; then
- ceph health detail
- return 1
- fi
- sleep ${delays[$loop]}
- loop+=1
- done
-}
-
-function wait_for_health_ok() {
- wait_for_health "HEALTH_OK" || return 1
-}
-
-function test_wait_for_health_ok() {
- local dir=$1
-
- setup $dir || return 1
- run_mon $dir a --osd_pool_default_size=1 --osd_failsafe_full_ratio=.99 --mon_pg_warn_min_per_osd=0 || return 1
- run_mgr $dir x --mon_pg_warn_min_per_osd=0 || return 1
- run_osd $dir 0 || return 1
- kill_daemons $dir TERM osd || return 1
- ! TIMEOUT=1 wait_for_health_ok || return 1
- activate_osd $dir 0 || return 1
- wait_for_health_ok || return 1
- teardown $dir || return 1
-}
-
-
-#######################################################################
-
-##
-# Run repair on **pgid** and wait until it completes. The repair
-# function will fail if repair does not complete within $TIMEOUT
-# seconds.
-#
-# @param pgid the id of the PG
-# @return 0 on success, 1 on error
-#
-function repair() {
- local pgid=$1
- local last_scrub=$(get_last_scrub_stamp $pgid)
- ceph pg repair $pgid
- wait_for_scrub $pgid "$last_scrub"
-}
-
-function test_repair() {
- local dir=$1
-
- setup $dir || return 1
- run_mon $dir a --osd_pool_default_size=1 || return 1
- run_mgr $dir x || return 1
- run_osd $dir 0 || return 1
- wait_for_clean || return 1
- repair 2.0 || return 1
- kill_daemons $dir KILL osd || return 1
- ! TIMEOUT=1 repair 2.0 || return 1
- teardown $dir || return 1
-}
-#######################################################################
-
-##
-# Run scrub on **pgid** and wait until it completes. The pg_scrub
-# function will fail if repair does not complete within $TIMEOUT
-# seconds. The pg_scrub is complete whenever the
-# **get_last_scrub_stamp** function reports a timestamp different from
-# the one stored before starting the scrub.
-#
-# @param pgid the id of the PG
-# @return 0 on success, 1 on error
-#
-function pg_scrub() {
- local pgid=$1
- local last_scrub=$(get_last_scrub_stamp $pgid)
- ceph pg scrub $pgid
- wait_for_scrub $pgid "$last_scrub"
-}
-
-function pg_deep_scrub() {
- local pgid=$1
- local last_scrub=$(get_last_scrub_stamp $pgid last_deep_scrub_stamp)
- ceph pg deep-scrub $pgid
- wait_for_scrub $pgid "$last_scrub" last_deep_scrub_stamp
-}
-
-function test_pg_scrub() {
- local dir=$1
-
- setup $dir || return 1
- run_mon $dir a --osd_pool_default_size=1 || return 1
- run_mgr $dir x || return 1
- run_osd $dir 0 || return 1
- wait_for_clean || return 1
- pg_scrub 2.0 || return 1
- kill_daemons $dir KILL osd || return 1
- ! TIMEOUT=1 pg_scrub 2.0 || return 1
- teardown $dir || return 1
-}
-
-#######################################################################
-
-##
-# Run the *command* and expect it to fail (i.e. return a non zero status).
-# The output (stderr and stdout) is stored in a temporary file in *dir*
-# and is expected to contain the string *expected*.
-#
-# Return 0 if the command failed and the string was found. Otherwise
-# return 1 and cat the full output of the command on stderr for debug.
-#
-# @param dir temporary directory to store the output
-# @param expected string to look for in the output
-# @param command ... the command and its arguments
-# @return 0 on success, 1 on error
-#
-
-function expect_failure() {
- local dir=$1
- shift
- local expected="$1"
- shift
- local success
-
- if "$@" > $dir/out 2>&1 ; then
- success=true
- else
- success=false
- fi
-
- if $success || ! grep --quiet "$expected" $dir/out ; then
- cat $dir/out >&2
- return 1
- else
- return 0
- fi
-}
-
-function test_expect_failure() {
- local dir=$1
-
- setup $dir || return 1
- expect_failure $dir FAIL bash -c 'echo FAIL ; exit 1' || return 1
- # the command did not fail
- ! expect_failure $dir FAIL bash -c 'echo FAIL ; exit 0' > $dir/out || return 1
- grep --quiet FAIL $dir/out || return 1
- # the command failed but the output does not contain the expected string
- ! expect_failure $dir FAIL bash -c 'echo UNEXPECTED ; exit 1' > $dir/out || return 1
- ! grep --quiet FAIL $dir/out || return 1
- teardown $dir || return 1
-}
-
-#######################################################################
-
-##
-# Given the *last_scrub*, wait for scrub to happen on **pgid**. It
-# will fail if scrub does not complete within $TIMEOUT seconds. The
-# repair is complete whenever the **get_last_scrub_stamp** function
-# reports a timestamp different from the one given in argument.
-#
-# @param pgid the id of the PG
-# @param last_scrub timestamp of the last scrub for *pgid*
-# @return 0 on success, 1 on error
-#
-function wait_for_scrub() {
- local pgid=$1
- local last_scrub="$2"
- local sname=${3:-last_scrub_stamp}
-
- for ((i=0; i < $TIMEOUT; i++)); do
- if test "$last_scrub" != "$(get_last_scrub_stamp $pgid $sname)" ; then
- return 0
- fi
- sleep 1
- done
- return 1
-}
-
-function test_wait_for_scrub() {
- local dir=$1
-
- setup $dir || return 1
- run_mon $dir a --osd_pool_default_size=1 || return 1
- run_mgr $dir x || return 1
- run_osd $dir 0 || return 1
- wait_for_clean || return 1
- local pgid=2.0
- ceph pg repair $pgid
- local last_scrub=$(get_last_scrub_stamp $pgid)
- wait_for_scrub $pgid "$last_scrub" || return 1
- kill_daemons $dir KILL osd || return 1
- last_scrub=$(get_last_scrub_stamp $pgid)
- ! TIMEOUT=1 wait_for_scrub $pgid "$last_scrub" || return 1
- teardown $dir || return 1
-}
-
-#######################################################################
-
-##
-# Return 0 if the erasure code *plugin* is available, 1 otherwise.
-#
-# @param plugin erasure code plugin
-# @return 0 on success, 1 on error
-#
-
-function erasure_code_plugin_exists() {
- local plugin=$1
- local status
- local grepstr
- local s
- case `uname` in
- FreeBSD) grepstr="Cannot open.*$plugin" ;;
- *) grepstr="$plugin.*No such file" ;;
- esac
-
- s=$(ceph osd erasure-code-profile set TESTPROFILE plugin=$plugin 2>&1)
- local status=$?
- if [ $status -eq 0 ]; then
- ceph osd erasure-code-profile rm TESTPROFILE
- elif ! echo $s | grep --quiet "$grepstr" ; then
- status=1
- # display why the string was rejected.
- echo $s
- fi
- return $status
-}
-
-function test_erasure_code_plugin_exists() {
- local dir=$1
-
- setup $dir || return 1
- run_mon $dir a || return 1
- run_mgr $dir x || return 1
- erasure_code_plugin_exists jerasure || return 1
- ! erasure_code_plugin_exists FAKE || return 1
- teardown $dir || return 1
-}
-
-#######################################################################
-
-##
-# Display all log files from **dir** on stdout.
-#
-# @param dir directory in which all data is stored
-#
-
-function display_logs() {
- local dir=$1
-
- find $dir -maxdepth 1 -name '*.log' | \
- while read file ; do
- echo "======================= $file"
- cat $file
- done
-}
-
-function test_display_logs() {
- local dir=$1
-
- setup $dir || return 1
- run_mon $dir a || return 1
- kill_daemons $dir || return 1
- display_logs $dir > $dir/log.out
- grep --quiet mon.a.log $dir/log.out || return 1
- teardown $dir || return 1
-}
-
-#######################################################################
-##
-# Spawn a command in background and save the pid in the variable name
-# passed in argument. To make the output reading easier, the output is
-# prepend with the process id.
-#
-# Example:
-# pids1=""
-# run_in_background pids1 bash -c 'sleep 1; exit 1'
-#
-# @param pid_variable the variable name (not value) where the pids will be stored
-# @param ... the command to execute
-# @return only the pid_variable output should be considered and used with **wait_background**
-#
-function run_in_background() {
- local pid_variable=$1
- shift;
- # Execute the command and prepend the output with its pid
- # We enforce to return the exit status of the command and not the awk one.
- ("$@" |& awk '{ a[i++] = $0 }END{for (i = 0; i in a; ++i) { print "'$$': " a[i]} }'; return ${PIPESTATUS[0]}) >&2 &
- eval "$pid_variable+=\" $!\""
-}
-
-function test_run_in_background() {
- local pids
- run_in_background pids sleep 1
- run_in_background pids sleep 1
- test $(echo $pids | wc -w) = 2 || return 1
- wait $pids || return 1
-}
-
-#######################################################################
-##
-# Wait for pids running in background to complete.
-# This function is usually used after a **run_in_background** call
-# Example:
-# pids1=""
-# run_in_background pids1 bash -c 'sleep 1; exit 1'
-# wait_background pids1
-#
-# @param pids The variable name that contains the active PIDS. Set as empty at then end of the function.
-# @return returns 1 if at least one process exits in error unless returns 0
-#
-function wait_background() {
- # We extract the PIDS from the variable name
- pids=${!1}
-
- return_code=0
- for pid in $pids; do
- if ! wait $pid; then
- # If one process failed then return 1
- return_code=1
- fi
- done
-
- # We empty the variable reporting that all process ended
- eval "$1=''"
-
- return $return_code
-}
-
-
-function test_wait_background() {
- local pids=""
- run_in_background pids bash -c "sleep 1; exit 1"
- run_in_background pids bash -c "sleep 2; exit 0"
- wait_background pids
- if [ $? -ne 1 ]; then return 1; fi
-
- run_in_background pids bash -c "sleep 1; exit 0"
- run_in_background pids bash -c "sleep 2; exit 0"
- wait_background pids
- if [ $? -ne 0 ]; then return 1; fi
-
- if [ ! -z "$pids" ]; then return 1; fi
-}
-
-function flush_pg_stats()
-{
- local timeout=${1:-$TIMEOUT}
-
- ids=`ceph osd ls`
- seqs=''
- for osd in $ids; do
- seq=`ceph tell osd.$osd flush_pg_stats`
- seqs="$seqs $osd-$seq"
- done
-
- for s in $seqs; do
- osd=`echo $s | cut -d - -f 1`
- seq=`echo $s | cut -d - -f 2`
- echo "waiting osd.$osd seq $seq"
- while test $(ceph osd last-stat-seq $osd) -lt $seq; do
- sleep 1
- if [ $((timeout--)) -eq 0 ]; then
- return 1
- fi
- done
- done
-}
-
-function test_flush_pg_stats()
-{
- local dir=$1
-
- setup $dir || return 1
- run_mon $dir a --osd_pool_default_size=1 || return 1
- run_mgr $dir x || return 1
- run_osd $dir 0 || return 1
- rados -p rbd put obj /etc/group
- flush_pg_stats
- local jq_filter='.pools | .[] | select(.name == "rbd") | .stats'
- raw_bytes_used=`ceph df detail --format=json | jq "$jq_filter.raw_bytes_used"`
- bytes_used=`ceph df detail --format=json | jq "$jq_filter.bytes_used"`
- test $raw_bytes_used > 0 || return 1
- test $raw_bytes_used == $bytes_used || return 1
-}
-
-#######################################################################
-
-##
-# Call the **run** function (which must be defined by the caller) with
-# the **dir** argument followed by the caller argument list.
-#
-# If the **run** function returns on error, all logs found in **dir**
-# are displayed for diagnostic purposes.
-#
-# **teardown** function is called when the **run** function returns
-# (on success or on error), to cleanup leftovers. The CEPH_CONF is set
-# to /dev/null and CEPH_ARGS is unset so that the tests are protected from
-# external interferences.
-#
-# It is the responsibility of the **run** function to call the
-# **setup** function to prepare the test environment (create a temporary
-# directory etc.).
-#
-# The shell is required (via PS4) to display the function and line
-# number whenever a statement is executed to help debugging.
-#
-# @param dir directory in which all data is stored
-# @param ... arguments passed transparently to **run**
-# @return 0 on success, 1 on error
-#
-function main() {
- local dir=td/$1
- shift
-
- shopt -s -o xtrace
- PS4='${BASH_SOURCE[0]}:$LINENO: ${FUNCNAME[0]}: '
-
- export PATH=${CEPH_BUILD_VIRTUALENV}/ceph-disk-virtualenv/bin:${CEPH_BUILD_VIRTUALENV}/ceph-detect-init-virtualenv/bin:.:$PATH # make sure program from sources are preferred
- #export PATH=$CEPH_ROOT/src/ceph-disk/virtualenv/bin:$CEPH_ROOT/src/ceph-detect-init/virtualenv/bin:.:$PATH # make sure program from sources are preferred
-
- export CEPH_CONF=/dev/null
- unset CEPH_ARGS
-
- local code
- if run $dir "$@" ; then
- code=0
- else
- display_logs $dir
- code=1
- fi
- teardown $dir || return 1
- return $code
-}
-
-#######################################################################
-
-function run_tests() {
- shopt -s -o xtrace
- PS4='${BASH_SOURCE[0]}:$LINENO: ${FUNCNAME[0]}: '
-
- export PATH=${CEPH_BUILD_VIRTUALENV}/ceph-disk-virtualenv/bin:${CEPH_BUILD_VIRTUALENV}/ceph-detect-init-virtualenv/bin:.:$PATH # make sure program from sources are preferred
- #export PATH=$CEPH_ROOT/src/ceph-disk/virtualenv/bin:$CEPH_ROOT/src/ceph-detect-init/virtualenv/bin:.:$PATH # make sure program from sources are preferred
-
- export CEPH_MON="127.0.0.1:7109" # git grep '\<7109\>' : there must be only one
- export CEPH_ARGS
- CEPH_ARGS+="--fsid=$(uuidgen) --auth-supported=none "
- CEPH_ARGS+="--mon-host=$CEPH_MON "
- export CEPH_CONF=/dev/null
-
- local funcs=${@:-$(set | sed -n -e 's/^\(test_[0-9a-z_]*\) .*/\1/p')}
- local dir=td/ceph-helpers
-
- for func in $funcs ; do
- $func $dir || return 1
- done
-}
-
-if test "$1" = TESTS ; then
- shift
- run_tests "$@"
-fi
-
-# NOTE:
-# jq only support --exit-status|-e from version 1.4 forwards, which makes
-# returning on error waaaay prettier and straightforward.
-# However, the current automated upstream build is running with v1.3,
-# which has no idea what -e is. Hence the convoluted error checking we
-# need. Sad.
-# The next time someone changes this code, please check if v1.4 is now
-# a thing, and, if so, please change these to use -e. Thanks.
-
-# jq '.all.supported | select([.[] == "foo"] | any)'
-function jq_success() {
- input="$1"
- filter="$2"
- expects="\"$3\""
-
- in_escaped=$(printf %s "$input" | sed "s/'/'\\\\''/g")
- filter_escaped=$(printf %s "$filter" | sed "s/'/'\\\\''/g")
-
- ret=$(echo "$in_escaped" | jq "$filter_escaped")
- if [[ "$ret" == "true" ]]; then
- return 0
- elif [[ -n "$expects" ]]; then
- if [[ "$ret" == "$expects" ]]; then
- return 0
- fi
- fi
- return 1
- input=$1
- filter=$2
- expects="$3"
-
- ret="$(echo $input | jq \"$filter\")"
- if [[ "$ret" == "true" ]]; then
- return 0
- elif [[ -n "$expects" && "$ret" == "$expects" ]]; then
- return 0
- fi
- return 1
-}
-
-# Local Variables:
-# compile-command: "cd ../../src ; make -j4 && ../qa/workunits/ceph-helpers.sh TESTS # test_get_config"
-# End:
# -*- mode:shell-script; tab-width:8; sh-basic-offset:2; indent-tabs-mode:t -*-
# vim: ts=8 sw=8 ft=bash smarttab
-source $(dirname $0)/../ceph-helpers.sh
+source $(dirname $0)/../../standalone/ceph-helpers.sh
set -e
set -o functrace
{
local client=$1
- if test -n "$CEPH_OUT_DIR";
+ if test -n "$CEPH_ASOK_DIR";
then
- echo $CEPH_OUT_DIR/$client.asok
+ echo $(get_asok_dir)/$client.asok
else
local cluster=$(echo $CEPH_ARGS | sed -r 's/.*--cluster[[:blank:]]*([[:alnum:]]*).*/\1/')
echo "/var/run/ceph/$cluster-$client.asok"
if [ -n "$1" ]; then
whatch_opt=--watch-$1
+ if [ -n "$2" ]; then
+ whatch_opt+=" --watch-channel $2"
+ fi
fi
CEPH_WATCH_FILE=${TEMP_DIR}/CEPH_WATCH_$$
local slow=slow_eviction
local fast=fast_eviction
ceph osd pool create $slow 1 1
+ ceph osd pool application enable $slow rados
ceph osd pool create $fast 1 1
ceph osd tier add $slow $fast
ceph osd tier cache-mode $fast writeback
{
# tiering
ceph osd pool create slow 2
+ ceph osd pool application enable slow rados
ceph osd pool create slow2 2
+ ceph osd pool application enable slow2 rados
ceph osd pool create cache 2
ceph osd pool create cache2 2
ceph osd tier add slow cache
{
# make sure we can't clobber snapshot state
ceph osd pool create snap_base 2
+ ceph osd pool application enable snap_base rados
ceph osd pool create snap_cache 2
ceph osd pool mksnap snap_cache snapname
expect_false ceph osd tier add snap_base snap_cache
{
# make sure we can't create snapshot on tier
ceph osd pool create basex 2
+ ceph osd pool application enable basex rados
ceph osd pool create cachex 2
ceph osd tier add basex cachex
expect_false ceph osd pool mksnap cache snapname
ceph osd pool create eccache 2 2 erasure
expect_false ceph osd set-require-min-compat-client bobtail
ceph osd pool create repbase 2
+ ceph osd pool application enable repbase rados
expect_false ceph osd tier add repbase eccache
ceph osd pool delete repbase repbase --yes-i-really-really-mean-it
ceph osd pool delete eccache eccache --yes-i-really-really-mean-it
{
# convenient add-cache command
ceph osd pool create slow 2
+ ceph osd pool application enable slow rados
ceph osd pool create cache3 2
ceph osd tier add-cache slow cache3 1024000
ceph osd dump | grep cache3 | grep bloom | grep 'false_positive_probability: 0.05' | grep 'target_bytes 1024000' | grep '1200s x4'
{
# check add-cache whether work
ceph osd pool create datapool 2
+ ceph osd pool application enable datapool rados
ceph osd pool create cachepool 2
ceph osd tier add-cache datapool cachepool 1024000
ceph osd tier cache-mode cachepool writeback
{
# protection against pool removal when used as tiers
ceph osd pool create datapool 2
+ ceph osd pool application enable datapool rados
ceph osd pool create cachepool 2
ceph osd tier add-cache datapool cachepool 1024000
ceph osd pool delete cachepool cachepool --yes-i-really-really-mean-it 2> $TMPFILE || true
## check health check
ceph osd set notieragent
ceph osd pool create datapool 2
+ ceph osd pool application enable datapool rados
ceph osd pool create cache4 2
ceph osd tier add-cache datapool cache4 1024000
ceph osd tier cache-mode cache4 writeback
# results in a 'pool foo is now (or already was) not a tier of bar'
#
ceph osd pool create basepoolA 2
+ ceph osd pool application enable basepoolA rados
ceph osd pool create basepoolB 2
+ ceph osd pool application enable basepoolB rados
poolA_id=$(ceph osd dump | grep 'pool.*basepoolA' | awk '{print $2;}')
poolB_id=$(ceph osd dump | grep 'pool.*basepoolB' | awk '{print $2;}')
ceph auth add client.xx -i client.xx.keyring
rm -f client.xx.keyring
ceph auth list | grep client.xx
+ ceph auth ls | grep client.xx
ceph auth get client.xx | grep caps | grep mon
ceph auth get client.xx | grep caps | grep osd
ceph auth get-key client.xx
check_response "auid = $auid"
ceph --format json-pretty auth get client.TEST > $TMPFILE
check_response '"auid": '$auid
- ceph auth list > $TMPFILE
+ ceph auth ls > $TMPFILE
check_response "auid: $auid"
- ceph --format json-pretty auth list > $TMPFILE
+ ceph --format json-pretty auth ls > $TMPFILE
check_response '"auid": '$auid
ceph auth del client.TEST
}
check_response "EACCES: access denied"
ceph -n client.xx-profile-ro -k client.xx.keyring osd set noout >& $TMPFILE || true
check_response "EACCES: access denied"
- ceph -n client.xx-profile-ro -k client.xx.keyring auth list >& $TMPFILE || true
+ ceph -n client.xx-profile-ro -k client.xx.keyring auth ls >& $TMPFILE || true
check_response "EACCES: access denied"
# read-write is allowed for all read-write commands (except auth)
ceph -n client.xx-profile-rw -k client.xx.keyring osd set noout
ceph -n client.xx-profile-rw -k client.xx.keyring osd unset noout
# read-write gets access denied for auth commands
- ceph -n client.xx-profile-rw -k client.xx.keyring auth list >& $TMPFILE || true
+ ceph -n client.xx-profile-rw -k client.xx.keyring auth ls >& $TMPFILE || true
check_response "EACCES: access denied"
# role-definer is allowed RWX 'auth' commands and read-only 'mon' commands
- ceph -n client.xx-profile-rd -k client.xx.keyring auth list
+ ceph -n client.xx-profile-rd -k client.xx.keyring auth ls
ceph -n client.xx-profile-rd -k client.xx.keyring auth export
ceph -n client.xx-profile-rd -k client.xx.keyring auth add client.xx-profile-foo
ceph -n client.xx-profile-rd -k client.xx.keyring status
ceph mon count-metadata ceph_version
ceph mon versions
+ ceph mgr metadata
+ ceph mgr versions
+ ceph mgr count-metadata ceph_version
+
+ ceph versions
+
ceph node ls
}
# in the cluster at all
function mds_exists()
{
- ceph auth list | grep "^mds"
+ ceph auth ls | grep "^mds"
}
# some of the commands are just not idempotent.
}
+function test_mon_config_key()
+{
+ # Round-trip a single config-key entry:
+ # absent -> set -> visible through get/list/dump -> rm -> absent again.
+ # The key name is an arbitrary string chosen to be unlikely to collide
+ # with anything else stored in config-key.
+ key=asdfasdfqwerqwreasdfuniquesa123df
+ # "grep -c" prints the number of matching lines; the trailing "grep 0"
+ # (or "grep 1" below) then checks that printed count.
+ ceph config-key list | grep -c $key | grep 0
+ ceph config-key get $key | grep -c bar | grep 0
+ ceph config-key set $key bar
+ ceph config-key get $key | grep bar
+ ceph config-key list | grep -c $key | grep 1
+ ceph config-key dump | grep $key | grep bar
+ ceph config-key rm $key
+ # After removal a direct get must fail and the key must no longer show
+ # up in either list or dump.
+ expect_false ceph config-key get $key
+ ceph config-key list | grep -c $key | grep 0
+ ceph config-key dump | grep -c $key | grep 0
+}
+
function test_mon_osd()
{
#
ceph osd ls
ceph osd pool create data 10
+ ceph osd pool application enable data rados
ceph osd lspools | grep data
ceph osd map data foo | grep 'pool.*data.*object.*foo.*pg.*up.*acting'
ceph osd map data foo namespace| grep 'pool.*data.*object.*namespace/foo.*pg.*up.*acting'
ceph osd tree down
ceph osd tree in
ceph osd tree out
+ ceph osd tree destroyed
ceph osd tree up in
ceph osd tree up out
ceph osd tree down in
ceph osd tree down out
ceph osd tree out down
expect_false ceph osd tree up down
+ expect_false ceph osd tree up destroyed
+ expect_false ceph osd tree down destroyed
+ expect_false ceph osd tree up down destroyed
expect_false ceph osd tree in out
expect_false ceph osd tree up foo
# osd pool
#
ceph osd pool create data 10
+ ceph osd pool application enable data rados
ceph osd pool mksnap data datasnap
rados -p data lssnap | grep datasnap
ceph osd pool rmsnap data datasnap
ceph osd pool delete data data --yes-i-really-really-mean-it
ceph osd pool create data2 10
+ ceph osd pool application enable data2 rados
ceph osd pool rename data2 data3
ceph osd lspools | grep data3
ceph osd pool delete data3 data3 --yes-i-really-really-mean-it
ceph osd pool create replicated 12 12 replicated
ceph osd pool create replicated 12 12 # default is replicated
ceph osd pool create replicated 12 # default is replicated, pgp_num = pg_num
+ ceph osd pool application enable replicated rados
# should fail because the type is not the same
expect_false ceph osd pool create replicated 12 12 erasure
ceph osd lspools | grep replicated
ceph osd pool create ec_test 1 1 erasure
+ ceph osd pool application enable ec_test rados
set +e
- ceph osd metadata | grep osd_objectstore_type | grep -qc bluestore
- if [ $? -eq 0 ]; then
+ ceph osd count-metadata osd_objectstore | grep 'bluestore'
+ if [ $? -eq 1 ]; then # enable ec_overwrites on non-bluestore pools should fail
ceph osd pool set ec_test allow_ec_overwrites true >& $TMPFILE
- check_response $? 22 "pool must only be stored on bluestore for scrubbing to work"
+ check_response "pool must only be stored on bluestore for scrubbing to work" $? 22
else
ceph osd pool set ec_test allow_ec_overwrites true || return 1
expect_false ceph osd pool set ec_test allow_ec_overwrites false
# create tmp pool
ceph osd pool create tmp-quota-pool 36
+ ceph osd pool application enable tmp-quota-pool rados
#
# set erroneous quotas
#
{
TEST_POOL_GETSET=pool_getset
ceph osd pool create $TEST_POOL_GETSET 1
+ ceph osd pool application enable $TEST_POOL_GETSET rados
wait_for_clean
ceph osd pool get $TEST_POOL_GETSET all
ceph osd pool set $TEST_POOL_GETSET size $old_size
ceph osd pool create pool_erasure 1 1 erasure
+ ceph osd pool application enable pool_erasure rados
wait_for_clean
set +e
ceph osd pool set pool_erasure size 4444 2>$TMPFILE
ceph osd pool set $TEST_POOL_GETSET pgp_num 10
old_pgs=$(ceph osd pool get $TEST_POOL_GETSET pg_num | sed -e 's/pg_num: //')
- new_pgs=$(($old_pgs+$(ceph osd stat | grep osdmap | awk '{print $3}')*32))
+ new_pgs=$(($old_pgs + $(ceph osd stat --format json | jq '.num_osds') * 32))
ceph osd pool set $TEST_POOL_GETSET pg_num $new_pgs
ceph osd pool set $TEST_POOL_GETSET pgp_num $new_pgs
wait_for_clean
old_pgs=$(ceph osd pool get $TEST_POOL_GETSET pg_num | sed -e 's/pg_num: //')
- new_pgs=$(($old_pgs+$(ceph osd stat | grep osdmap | awk '{print $3}')*32+1))
+ new_pgs=$(($old_pgs + $(ceph osd stat --format json | jq '.num_osds') * 32 + 1))
expect_false ceph osd pool set $TEST_POOL_GETSET pg_num $new_pgs
ceph osd pool set $TEST_POOL_GETSET nosizechange 1
expect_false ceph osd pool set $TEST_POOL_GETSET hashpspool 1
ceph osd pool set $TEST_POOL_GETSET hashpspool 1 --yes-i-really-mean-it
- ceph osd pool set $TEST_POOL_GETSET nodelete 1
- expect_false ceph osd pool delete $TEST_POOL_GETSET $TEST_POOL_GETSET --yes-i-really-really-mean-it
- ceph osd pool set $TEST_POOL_GETSET nodelete 0
- ceph osd pool delete $TEST_POOL_GETSET $TEST_POOL_GETSET --yes-i-really-really-mean-it
-
ceph osd pool get rbd crush_rule | grep 'crush_rule: '
ceph osd pool get $TEST_POOL_GETSET compression_mode | expect_false grep '.'
ceph osd pool set $TEST_POOL_GETSET $size 0
ceph osd pool get $TEST_POOL_GETSET $size | expect_false grep '.'
done
+
+ ceph osd pool set $TEST_POOL_GETSET nodelete 1
+ expect_false ceph osd pool delete $TEST_POOL_GETSET $TEST_POOL_GETSET --yes-i-really-really-mean-it
+ ceph osd pool set $TEST_POOL_GETSET nodelete 0
+ ceph osd pool delete $TEST_POOL_GETSET $TEST_POOL_GETSET --yes-i-really-really-mean-it
+
}
function test_mon_osd_tiered_pool_set()
# this is not a tier pool
ceph osd pool create fake-tier 2
+ ceph osd pool application enable fake-tier rados
wait_for_clean
expect_false ceph osd pool set fake-tier hit_set_type explicit_hash
sleep 1
- ceph_watch_start debug
+ ceph_watch_start debug audit
ceph tell mon.a version
ceph_watch_wait 'mon.a \[DBG\] from.*cmd=\[{"prefix": "version"}\]: dispatch'
- ceph_watch_start debug
+ ceph_watch_start debug audit
ceph tell mon.b version
ceph_watch_wait 'mon.b \[DBG\] from.*cmd=\[{"prefix": "version"}\]: dispatch'
}
# RAW USED The near raw used per pool in raw total
ceph osd pool create cephdf_for_test 32 32 replicated
+ ceph osd pool application enable cephdf_for_test rados
ceph osd pool set cephdf_for_test size 2
dd if=/dev/zero of=./cephdf_for_test bs=4k count=1
expect_false test $cal_raw_used_size != $raw_used_size
}
+function test_mon_pool_application()
+{
+ # Exercise "ceph osd pool application" enable / set / rm / disable on a
+ # scratch pool, checking the result through "osd pool ls detail" in both
+ # plain and JSON form after each step.
+ ceph osd pool create app_for_test 10
+
+ # The first application is enabled directly; enabling a second one on the
+ # same pool is refused unless --yes-i-really-mean-it is given.
+ ceph osd pool application enable app_for_test rbd
+ expect_false ceph osd pool application enable app_for_test rgw
+ ceph osd pool application enable app_for_test rgw --yes-i-really-mean-it
+ ceph osd pool ls detail | grep "application rbd,rgw"
+ ceph osd pool ls detail --format=json | grep '"application_metadata":{"rbd":{},"rgw":{}}'
+
+ # cephfs was never enabled on this pool, so setting a key under it is
+ # expected to fail; keys under rbd and rgw are accepted.
+ expect_false ceph osd pool application set app_for_test cephfs key value
+ ceph osd pool application set app_for_test rbd key1 value1
+ ceph osd pool application set app_for_test rbd key2 value2
+ ceph osd pool application set app_for_test rgw key1 value1
+
+ ceph osd pool ls detail --format=json | grep '"application_metadata":{"rbd":{"key1":"value1","key2":"value2"},"rgw":{"key1":"value1"}}'
+
+ # Remove the keys one at a time and verify the metadata shrinks accordingly.
+ ceph osd pool application rm app_for_test rgw key1
+ ceph osd pool ls detail --format=json | grep '"application_metadata":{"rbd":{"key1":"value1","key2":"value2"},"rgw":{}}'
+ ceph osd pool application rm app_for_test rbd key2
+ ceph osd pool ls detail --format=json | grep '"application_metadata":{"rbd":{"key1":"value1"},"rgw":{}}'
+ ceph osd pool application rm app_for_test rbd key1
+ ceph osd pool ls detail --format=json | grep '"application_metadata":{"rbd":{},"rgw":{}}'
+ ceph osd pool application rm app_for_test rbd key1 # should be idempotent
+
+ # Disabling an application also requires --yes-i-really-mean-it.
+ expect_false ceph osd pool application disable app_for_test rgw
+ ceph osd pool application disable app_for_test rgw --yes-i-really-mean-it
+ ceph osd pool application disable app_for_test rgw --yes-i-really-mean-it # should be idempotent
+ ceph osd pool ls detail | grep "application rbd"
+ ceph osd pool ls detail --format=json | grep '"application_metadata":{"rbd":{}}'
+
+ # Disable the last remaining application (rbd) so that the checks below
+ # really observe a pool with no application metadata. Disabling rgw here
+ # again would be a no-op and leave rbd enabled.
+ ceph osd pool application disable app_for_test rbd --yes-i-really-mean-it
+ ceph osd pool ls detail | grep -v "application "
+ ceph osd pool ls detail --format=json | grep '"application_metadata":{}'
+
+ ceph osd pool rm app_for_test app_for_test --yes-i-really-really-mean-it
+}
+
function test_mon_tell_help_command()
{
ceph tell mon.a help
expect_false ceph tell mon.zzz help
}
+function test_mon_stdin_stdout()
+{
+ # Feed the value "foo" to "config-key set" through a pipe (-i -), then read
+ # the key back with "-o -" and check that exactly one output line
+ # contains "foo" (grep -c prints the count, grep -q 1 tests it).
+ echo foo | ceph config-key set test_key -i -
+ ceph config-key get test_key -o - | grep -c foo | grep -q 1
+}
+
function test_osd_tell_help_command()
{
ceph tell osd.1 help
function test_osd_compact()
{
ceph tell osd.1 compact
- ceph daemon osd.1 compact
+ $SUDO ceph daemon osd.1 compact
}
function test_mds_tell_help_command()
function test_mgr_tell()
{
ceph tell mgr help
- ceph tell mgr fs status
+ #ceph tell mgr fs status # see http://tracker.ceph.com/issues/20761
ceph tell mgr osd status
}
MON_TESTS+=" mon_misc"
MON_TESTS+=" mon_mon"
MON_TESTS+=" mon_osd"
+MON_TESTS+=" mon_config_key"
MON_TESTS+=" mon_crush"
MON_TESTS+=" mon_osd_create_destroy"
MON_TESTS+=" mon_osd_pool"
MON_TESTS+=" mon_caps"
MON_TESTS+=" mon_cephdf_commands"
MON_TESTS+=" mon_tell_help_command"
+MON_TESTS+=" mon_stdin_stdout"
OSD_TESTS+=" osd_bench"
OSD_TESTS+=" osd_negative_filestore_merge_threshold"
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU Library Public License for more details.
#
-source $(dirname $0)/../../../src/test/detect-build-env-vars.sh
: ${CORPUS:=https://github.com/ceph/ceph-erasure-code-corpus.git}
: ${DIRECTORY:=$CEPH_ROOT/ceph-erasure-code-corpus}
expect $ret ceph auth get-key client.admin $args
expect $ret ceph auth export $args
expect $ret ceph auth export client.admin $args
- expect $ret ceph auth list $args
+ expect $ret ceph auth ls $args
expect $ret ceph auth print-key client.admin $args
expect $ret ceph auth print_key client.admin $args
}
'auth':[
{
'pre':'',
- 'cmd':('auth list', '', 'r'),
+ 'cmd':('auth ls', '', 'r'),
'post':''
},
{
],
'config-key':[
{
- 'pre':'config-key put foo bar',
+ 'pre':'config-key set foo bar',
'cmd':('config-key get', 'key=foo', 'r')
},
{
- 'pre':'config-key put foo bar',
+ 'pre':'config-key set foo bar',
'cmd':('config-key del', 'key=foo', 'rw')
}
]
expect "ceph -k $tmp.bazar.keyring --user bazar mon_status" 13
ceph auth del client.bazar
-c="'allow command \"auth list\", allow command mon_status'"
+c="'allow command \"auth ls\", allow command mon_status'"
expect "ceph auth get-or-create client.foo mon $c > $tmp.foo.keyring" 0
expect "ceph -k $tmp.foo.keyring --user foo mon_status" 0
-expect "ceph -k $tmp.foo.keyring --user foo auth list" 0
+expect "ceph -k $tmp.foo.keyring --user foo auth ls" 0
expect "ceph -k $tmp.foo.keyring --user foo auth export" 13
expect "ceph -k $tmp.foo.keyring --user foo auth del client.bazar" 13
expect "ceph -k $tmp.foo.keyring --user foo osd dump" 13
c="'allow command service with prefix=list, allow command mon_status'"
expect "ceph auth get-or-create client.bar mon $c > $tmp.bar.keyring" 0
expect "ceph -k $tmp.bar.keyring --user bar mon_status" 0
-expect "ceph -k $tmp.bar.keyring --user bar auth list" 13
+expect "ceph -k $tmp.bar.keyring --user bar auth ls" 13
expect "ceph -k $tmp.bar.keyring --user bar auth export" 13
expect "ceph -k $tmp.bar.keyring --user bar auth del client.foo" 13
expect "ceph -k $tmp.bar.keyring --user bar osd dump" 13
rm $tmp.bazar.keyring $tmp.foo.keyring $tmp.bar.keyring
-echo OK
\ No newline at end of file
+echo OK
# make sure we're at luminous+ before using crush device classes
ceph osd require-osd-release luminous
-ceph osd crush class create ssd
-ceph osd crush class create hdd
ceph osd crush set-device-class ssd osd.0
ceph osd crush set-device-class hdd osd.1
ceph osd crush rule create-replicated foo-ssd default host ssd
ceph osd crush rule rm bar
# can't delete in-use rules, tho:
+ceph osd pool create pinning_pool 1
expect_false ceph osd crush rule rm replicated_rule
+ceph osd pool rm pinning_pool pinning_pool --yes-i-really-really-mean-it
# build a simple map
expect_false ceph osd crush add-bucket foo osd
ceph osd rm osd.$o4
ceph osd rm osd.$o5
+# weight sets
+# make sure we require luminous before testing weight-sets
+ceph osd set-require-min-compat-client luminous
+ceph osd crush weight-set dump
+ceph osd crush weight-set ls
+# no pool named fooset exists yet, so reweighting its weight-set must fail
+expect_false ceph osd crush weight-set reweight fooset osd.0 .9
+ceph osd pool create fooset 8
+ceph osd pool create barset 8
+ceph osd pool set barset size 3
+# the pool exists now, but no weight-set has been created for it yet
+expect_false ceph osd crush weight-set reweight fooset osd.0 .9
+ceph osd crush weight-set create fooset flat
+ceph osd crush weight-set create barset positional
+ceph osd crush weight-set ls | grep fooset
+ceph osd crush weight-set ls | grep barset
+ceph osd crush weight-set dump
+# fooset (flat) accepts exactly one weight per item; barset (positional, on a
+# size-3 pool) accepts three and rejects a single one
+ceph osd crush weight-set reweight fooset osd.0 .9
+expect_false ceph osd crush weight-set reweight fooset osd.0 .9 .9
+expect_false ceph osd crush weight-set reweight barset osd.0 .9
+ceph osd crush weight-set reweight barset osd.0 .9 .9 .9
+# removing one weight-set drops it from "weight-set ls" and leaves the other
+ceph osd crush weight-set ls | grep -c fooset | grep -q 1
+ceph osd crush weight-set rm fooset
+ceph osd crush weight-set ls | grep -c fooset | grep -q 0
+ceph osd crush weight-set ls | grep barset
+ceph osd crush weight-set rm barset
+ceph osd crush weight-set ls | grep -c barset | grep -q 0
+# the compat weight-set is listed as "(compat)" while it exists
+ceph osd crush weight-set create-compat
+ceph osd crush weight-set ls | grep '(compat)'
+ceph osd crush weight-set rm-compat
+
echo OK
expect 'ceph osd pool mksnap test snapshot' 0
expect 'ceph osd pool rmsnap test snapshot' 0
+expect 'rbd --pool=test pool init' 0
expect 'rbd --pool=test --rbd_validate_pool=false create --size=102400 image' 0
expect 'rbd --pool=test snap create image@snapshot' 22
expect 'ceph osd pool delete test test --yes-i-really-really-mean-it' 0
expect 'ceph osd pool create test 256 256' 0
+expect 'rbd --pool=test pool init' 0
expect 'rbd --pool=test create --size=102400 image' 0
expect 'rbd --pool=test snap create image@snapshot' 0
expect 'rbd --pool=test snap ls image' 0
+++ /dev/null
-#!/bin/bash -x
-# vim: ts=8 sw=2 smarttab
-#
-# $0.sh - run mon workload generator
-
-d() {
- [[ "$VERBOSE" != "" && $VERBOSE -eq 1 ]] && echo "## DEBUG ## $*"
-}
-
-d "check for required binaries"
-
-required_bins="ceph crushtool ceph_test_mon_workloadgen"
-for b in $required_bins; do
- which $b >& /dev/null
- if [[ $? -ne 0 ]]; then
- echo "Unable to find '$b' in PATH"
- exit 1
- fi
-done
-
-d "Start workunit"
-
-crush_map_fn=test.crush.map
-create_crush=0
-clobber_crush=0
-new_cluster=0
-do_run=0
-num_osds=0
-
-# Assume the test is in PATH
-bin_test=ceph_test_mon_workloadgen
-
-num_osds=10
-if [[ "$LOADGEN_NUM_OSDS" != "" ]]; then
- num_osds=$LOADGEN_NUM_OSDS
-fi
-
-duration=300
-[ ! -z $DURATION ] && duration=$DURATION
-
-d "checking osd tree"
-
-crush_testing_root="`ceph osd tree | grep 'root[ \t]\+testing'`"
-
-d "$crush_testing_root"
-
-if [[ "$crush_testing_root" == "" ]]; then
- d "set create_crush"
- create_crush=1
-fi
-
-d "generate run_id (create_crush = $create_crush)"
-
-run_id=`uuidgen`
-
-d "run_id = $run_id ; create_crush = $create_crush"
-
-if [[ $create_crush -eq 1 ]]; then
- tmp_crush_fn="/tmp/ceph.$run_id.crush"
- ceph osd getcrushmap -o $tmp_crush_fn
- crushtool -d $tmp_crush_fn -o $tmp_crush_fn.plain
-
- highest_root_id=0
- root_ids_raw="`cat $tmp_crush_fn.plain | grep id`"
- ifs=$IFS
- IFS=$'\n'
- for l in $root_ids_raw; do
- root_id=`echo $l | sed 's/.*-\([[:digit:]]\+\).*/\1/'`
- d "root id = $root_id ; highest = $highest_root_id"
- if [[ $root_id -gt $highest_root_id ]]; then
- highest_root_id=$root_id
- fi
- done
- our_root_id=$(($highest_root_id+1))
- IFS=$ifs
-
- cat << EOF >> $tmp_crush_fn.plain
-root testing {
- id -$our_root_id
- alg straw
- hash 0 # rjenkins1
-}
-rule testingdata {
- ruleset 0
- type replicated
- min_size 1
- max_size 10
- step take testing
- step choose firstn 0 type osd
- step emit
-}
-rule testingmetadata {
- ruleset 1
- type replicated
- min_size 1
- max_size 10
- step take testing
- step choose firstn 0 type osd
- step emit
-}
-rule testingrbd {
- ruleset 2
- type replicated
- min_size 1
- max_size 10
- step take testing
- step choose firstn 0 type osd
- step emit
-}
-EOF
-
- if [[ $VERBOSE -eq 1 ]]; then
- cat $tmp_crush_fn.plain
- fi
-
- crushtool -c $tmp_crush_fn.plain -o $tmp_crush_fn
- if [[ $? -eq 1 ]]; then
- echo "Error compiling test crush map; probably need newer crushtool"
- echo "NOK"
- exit 1
- fi
-
- d "created crush"
-
- ceph osd setcrushmap -i $tmp_crush_fn
-fi
-
-keyring="/tmp/ceph.$run_id.keyring"
-
-ceph auth get-or-create-key osd.admin mon 'allow rwx' osd 'allow *'
-ceph auth export | grep -v "export" > $keyring
-
-osd_ids=""
-
-for osd in `seq 1 $num_osds`; do
- id=`ceph osd create`
- osd_ids="$osd_ids $id"
- d "osd.$id"
- ceph osd crush set $id osd.$id 1.0 host=testhost rack=testrack root=testing
-done
-
-d "osds: $osd_ids"
-
-stub_id_args=""
-f=
-l=
-for i in $osd_ids; do
- d "i: $i"
- if [[ $stub_id_args == "" ]]; then
- stub_id_args="--stub-id $i"
- f=$i
- fi
- if [[ $l != "" ]]; then
- if [[ $i -gt $(($l+1)) ]]; then
- stub_id_args="$stub_id_args..$l --stub-id $i"
- f=$i
- fi
- fi
- l=$i
-done
-if [[ $l -gt $f ]]; then
- stub_id_args="$stub_id_args..$l"
-fi
-
-args="$EXTRA_ARGS --duration $duration $stub_id_args"
-
-d "running: $args"
-
-$bin_test --keyring $keyring $args
POOL="alloc_hint-rep"
ceph osd pool create "${POOL}" "${NUM_PG}"
ceph osd pool set "${POOL}" size "${NUM_OSDS}"
+ceph osd pool application enable "${POOL}" rados
OBJ="foo"
setup_pgid "${POOL}" "${OBJ}"
ceph osd erasure-code-profile set "${PROFILE}" k=2 m=1 crush-failure-domain=osd
ceph osd erasure-code-profile get "${PROFILE}" # just so it's logged
ceph osd pool create "${POOL}" "${NUM_PG}" "${NUM_PGP}" erasure "${PROFILE}"
+ceph osd pool application enable "${POOL}" rados
OBJ="baz"
setup_pgid "${POOL}" "${OBJ}"
# create pools, set up tier relationship
ceph osd pool create base_pool 2
+ceph osd pool application enable base_pool rados
ceph osd pool create partial_wrong 2
ceph osd pool create wrong_cache 2
ceph osd tier add base_pool partial_wrong
## set of base, cache
ceph osd pool create base 8
+ceph osd pool application enable base rados
ceph osd pool create cache 8
ceph osd tier add base cache
test_mark_two_osds_same_host_down_with_classes() {
ceph osd set noup
- ceph osd crush class create ssd
- ceph osd crush class create hdd
ceph osd crush set-device-class ssd osd.0 osd.2 osd.4 osd.6 osd.8
ceph osd crush set-device-class hdd osd.1 osd.3 osd.5 osd.7 osd.9
ceph osd down osd.0 osd.1
# objects
ceph osd pool create $p 12
ceph osd pool set-quota $p max_objects 10
+ceph osd pool application enable $p rados
for f in `seq 1 10` ; do
rados -p $p put obj$f /etc/passwd
pp=`uuidgen`
ceph osd pool create $pp 12
+ceph osd pool application enable $pp rados
# set objects quota
ceph osd pool set-quota $pp max_objects 10
ceph osd pool delete test test --yes-i-really-really-mean-it || true
ceph osd pool create test 100
+ rbd pool init test
truncate -s 1 /tmp/empty /tmp/empty@snap
rbd ls | wc -l | grep 0
#!/bin/bash -e
-. $(dirname $0)/../ceph-helpers.sh
+. $(dirname $0)/../../standalone/ceph-helpers.sh
function list_tests()
{
}
ceph osd pool create repdata 24 24
+rbd pool init repdata
ceph osd erasure-code-profile set teuthologyprofile crush-failure-domain=osd m=1 k=2
ceph osd pool create ecdata 24 24 erasure teuthologyprofile
+rbd pool init ecdata
ceph osd pool set ecdata allow_ec_overwrites true
ceph osd pool create rbdnonzero 24 24
+rbd pool init rbdnonzero
ceph osd pool create clonesonly 24 24
+rbd pool init clonesonly
for pool in rbd rbdnonzero; do
rbd create --size 200 --image-format 1 $pool/img0
create_pools() {
ceph osd pool create images 100
+ rbd pool init images
ceph osd pool create volumes 100
+ rbd pool init volumes
}
delete_pools() {
#!/bin/bash -ex
-. $(dirname $0)/../ceph-helpers.sh
+. $(dirname $0)/../../standalone/ceph-helpers.sh
POOL=rbd
IMAGE=testrbdnbd$$
# demote and promote same cluster
demote_image ${CLUSTER2} ${POOL} ${image}
wait_for_image_replay_stopped ${CLUSTER1} ${POOL} ${image}
-wait_for_status_in_pool_dir ${CLUSTER1} ${POOL} ${image} 'up+stopped'
-wait_for_status_in_pool_dir ${CLUSTER2} ${POOL} ${image} 'up+stopped'
+wait_for_status_in_pool_dir ${CLUSTER1} ${POOL} ${image} 'up+unknown'
+wait_for_status_in_pool_dir ${CLUSTER2} ${POOL} ${image} 'up+unknown'
promote_image ${CLUSTER2} ${POOL} ${image}
wait_for_image_replay_started ${CLUSTER1} ${POOL} ${image}
write_image ${CLUSTER2} ${POOL} ${image} 100
# failover (unmodified)
demote_image ${CLUSTER2} ${POOL} ${image}
wait_for_image_replay_stopped ${CLUSTER1} ${POOL} ${image}
-wait_for_status_in_pool_dir ${CLUSTER1} ${POOL} ${image} 'up+stopped'
-wait_for_status_in_pool_dir ${CLUSTER2} ${POOL} ${image} 'up+stopped'
+wait_for_status_in_pool_dir ${CLUSTER1} ${POOL} ${image} 'up+unknown'
+wait_for_status_in_pool_dir ${CLUSTER2} ${POOL} ${image} 'up+unknown'
promote_image ${CLUSTER1} ${POOL} ${image}
wait_for_image_replay_started ${CLUSTER2} ${POOL} ${image}
# failback (unmodified)
demote_image ${CLUSTER1} ${POOL} ${image}
wait_for_image_replay_stopped ${CLUSTER2} ${POOL} ${image}
-wait_for_status_in_pool_dir ${CLUSTER2} ${POOL} ${image} 'up+stopped'
+wait_for_status_in_pool_dir ${CLUSTER1} ${POOL} ${image} 'up+unknown'
+wait_for_status_in_pool_dir ${CLUSTER2} ${POOL} ${image} 'up+unknown'
promote_image ${CLUSTER2} ${POOL} ${image}
wait_for_image_replay_started ${CLUSTER1} ${POOL} ${image}
wait_for_replay_complete ${CLUSTER1} ${CLUSTER2} ${POOL} ${image}
# failover
demote_image ${CLUSTER2} ${POOL} ${image}
wait_for_image_replay_stopped ${CLUSTER1} ${POOL} ${image}
-wait_for_status_in_pool_dir ${CLUSTER1} ${POOL} ${image} 'up+stopped'
-wait_for_status_in_pool_dir ${CLUSTER2} ${POOL} ${image} 'up+stopped'
+wait_for_status_in_pool_dir ${CLUSTER1} ${POOL} ${image} 'up+unknown'
+wait_for_status_in_pool_dir ${CLUSTER2} ${POOL} ${image} 'up+unknown'
promote_image ${CLUSTER1} ${POOL} ${image}
wait_for_image_replay_started ${CLUSTER2} ${POOL} ${image}
write_image ${CLUSTER1} ${POOL} ${image} 100
# failback
demote_image ${CLUSTER1} ${POOL} ${image}
wait_for_image_replay_stopped ${CLUSTER2} ${POOL} ${image}
-wait_for_status_in_pool_dir ${CLUSTER2} ${POOL} ${image} 'up+stopped'
+wait_for_status_in_pool_dir ${CLUSTER1} ${POOL} ${image} 'up+unknown'
+wait_for_status_in_pool_dir ${CLUSTER2} ${POOL} ${image} 'up+unknown'
promote_image ${CLUSTER2} ${POOL} ${image}
wait_for_image_replay_started ${CLUSTER1} ${POOL} ${image}
write_image ${CLUSTER2} ${POOL} ${image} 100
create_image ${CLUSTER2} ${POOL} ${image}
wait_for_status_in_pool_dir ${CLUSTER1} ${POOL} ${image} 'up+replaying' 'master_position'
demote_image ${CLUSTER2} ${POOL} ${image}
-wait_for_status_in_pool_dir ${CLUSTER1} ${POOL} ${image} 'up+stopped'
+wait_for_status_in_pool_dir ${CLUSTER1} ${POOL} ${image} 'up+unknown'
promote_image ${CLUSTER1} ${POOL} ${image}
write_image ${CLUSTER1} ${POOL} ${image} 10
demote_image ${CLUSTER1} ${POOL} ${image}
wait_for_status_in_pool_dir ${CLUSTER1} ${POOL} ${image} 'up+replaying' 'master_position'
testlog "TEST: no blacklists"
-ceph --cluster ${CLUSTER1} osd blacklist ls 2>&1 | grep -q "listed 0 entries"
-ceph --cluster ${CLUSTER2} osd blacklist ls 2>&1 | grep -q "listed 0 entries"
+CEPH_ARGS='--id admin' ceph --cluster ${CLUSTER1} osd blacklist ls 2>&1 | grep -q "listed 0 entries"
+CEPH_ARGS='--id admin' ceph --cluster ${CLUSTER2} osd blacklist ls 2>&1 | grep -q "listed 0 entries"
echo OK
POOL=mirror
PARENT_POOL=mirror_parent
TEMPDIR=
+USER_ID=mirror
+export CEPH_ARGS="--id ${USER_ID}"
CEPH_ROOT=$(readlink -f $(dirname $0)/../../../src)
CEPH_BIN=.
if [ -z "${RBD_MIRROR_USE_EXISTING_CLUSTER}" ]; then
cd ${CEPH_ROOT}
- ${CEPH_SRC}/mstart.sh ${CLUSTER1} -n ${RBD_MIRROR_VARGS}
- ${CEPH_SRC}/mstart.sh ${CLUSTER2} -n ${RBD_MIRROR_VARGS}
+ CEPH_ARGS='' ${CEPH_SRC}/mstart.sh ${CLUSTER1} -n ${RBD_MIRROR_VARGS}
+ CEPH_ARGS='' ${CEPH_SRC}/mstart.sh ${CLUSTER2} -n ${RBD_MIRROR_VARGS}
+
+ CEPH_ARGS='' ceph --conf run/${CLUSTER1}/ceph.conf \
+ auth get-or-create client.${USER_ID} mon 'profile rbd' osd 'profile rbd' >> \
+ run/${CLUSTER1}/keyring
+ CEPH_ARGS='' ceph --conf run/${CLUSTER2}/ceph.conf \
+ auth get-or-create client.${USER_ID} mon 'profile rbd' osd 'profile rbd' >> \
+ run/${CLUSTER2}/keyring
rm -f ${TEMPDIR}/${CLUSTER1}.conf
ln -s $(readlink -f run/${CLUSTER1}/ceph.conf) \
cd ${TEMPDIR}
fi
- ceph --cluster ${CLUSTER1} osd pool create ${POOL} 64 64
- ceph --cluster ${CLUSTER1} osd pool create ${PARENT_POOL} 64 64
- ceph --cluster ${CLUSTER2} osd pool create ${PARENT_POOL} 64 64
- ceph --cluster ${CLUSTER2} osd pool create ${POOL} 64 64
+ CEPH_ARGS='' ceph --cluster ${CLUSTER1} osd pool create ${POOL} 64 64
+ CEPH_ARGS='' ceph --cluster ${CLUSTER1} osd pool create ${PARENT_POOL} 64 64
+ CEPH_ARGS='' ceph --cluster ${CLUSTER2} osd pool create ${PARENT_POOL} 64 64
+ CEPH_ARGS='' ceph --cluster ${CLUSTER2} osd pool create ${POOL} 64 64
rbd --cluster ${CLUSTER1} mirror pool enable ${POOL} pool
rbd --cluster ${CLUSTER2} mirror pool enable ${POOL} pool
if [ -z "${RBD_MIRROR_USE_EXISTING_CLUSTER}" ]; then
cd ${CEPH_ROOT}
- ${CEPH_SRC}/mstop.sh ${CLUSTER1}
- ${CEPH_SRC}/mstop.sh ${CLUSTER2}
+ CEPH_ARGS='' ${CEPH_SRC}/mstop.sh ${CLUSTER1}
+ CEPH_ARGS='' ${CEPH_SRC}/mstop.sh ${CLUSTER2}
else
- ceph --cluster ${CLUSTER1} osd pool rm ${POOL} ${POOL} --yes-i-really-really-mean-it
- ceph --cluster ${CLUSTER2} osd pool rm ${POOL} ${POOL} --yes-i-really-really-mean-it
- ceph --cluster ${CLUSTER1} osd pool rm ${PARENT_POOL} ${PARENT_POOL} --yes-i-really-really-mean-it
- ceph --cluster ${CLUSTER2} osd pool rm ${PARENT_POOL} ${PARENT_POOL} --yes-i-really-really-mean-it
+ CEPH_ARGS='' ceph --cluster ${CLUSTER1} osd pool rm ${POOL} ${POOL} --yes-i-really-really-mean-it
+ CEPH_ARGS='' ceph --cluster ${CLUSTER2} osd pool rm ${POOL} ${POOL} --yes-i-really-really-mean-it
+ CEPH_ARGS='' ceph --cluster ${CLUSTER1} osd pool rm ${PARENT_POOL} ${PARENT_POOL} --yes-i-really-really-mean-it
+ CEPH_ARGS='' ceph --cluster ${CLUSTER2} osd pool rm ${PARENT_POOL} ${PARENT_POOL} --yes-i-really-really-mean-it
fi
test "${RBD_MIRROR_TEMDIR}" = "${TEMPDIR}" ||
rm -Rf ${TEMPDIR}
rbd-mirror \
--cluster ${cluster} \
+ --id mirror \
--pid-file=$(daemon_pid_file "${cluster}:${instance}") \
--log-file=${TEMPDIR}/rbd-mirror.${cluster}_daemon.${instance}.log \
--admin-socket=${TEMPDIR}/rbd-mirror.${cluster}_daemon.${instance}.\$cluster.asok \
chmod 0755 ${STACK_HOME_PATH}/start.sh
sudo -H -u ${STACK_USER} ${STACK_HOME_PATH}/start.sh
+# switch to rbd profile caps
+ceph auth caps client.cinder mon 'profile rbd' osd 'profile rbd pool=volumes, profile rbd pool=vms, profile rbd pool=images'
+ceph auth caps client.cinder-bak mon 'profile rbd' osd 'profile rbd pool=backups, profile rbd pool=volumes'
+ceph auth caps client.glance mon 'profile rbd' osd 'profile rbd pool=images'
+
# execute tempest
chown -R ${TEMPEST_USER}:${STACK_GROUP} ${STACK_OPT_PATH}/tempest
chown -R ${TEMPEST_USER}:${STACK_GROUP} ${STACK_OPT_PATH}/data/tempest
mkdir $TMPDIR
trap "rm -fr $TMPDIR" 0
-. $(dirname $0)/../ceph-helpers.sh
+. $(dirname $0)/../../standalone/ceph-helpers.sh
function expect_false()
{
tear_down
ceph osd pool create $POOL_NAME $PG_NUM
ceph osd pool mksnap $POOL_NAME snap
+ rbd pool init $POOL_NAME
}
trap tear_down EXIT HUP INT
$DRY_RUN ./do_cmake.sh $@ || return 1
$DRY_RUN cd build
$DRY_RUN make $BUILD_MAKEOPTS tests || return 1
- $DRY_RUN ctest $CHECK_MAKEOPTS --output-on-failure || return 1
+ if ! $DRY_RUN ctest $CHECK_MAKEOPTS --output-on-failure; then
+ rm -f ${TMPDIR:-/tmp}/ceph-asok.*
+ return 1
+ fi
}
function main() {
-f3e663a190bf2ed12c7e3cda288b9a159572c800
-v12.1.1
+b661348f156f148d764b998b65b90451f096cb27
+v12.1.2
common/ceph_hash.cc
common/ceph_strings.cc
common/ceph_frag.cc
+ common/options.cc
common/config.cc
- common/config_validators.cc
common/utf8.c
common/mime.c
common/strtol.cc
${auth_files}
${mds_files})
+if(HAS_VTA)
+ set_source_files_properties(common/config.cc
+ PROPERTIES COMPILE_FLAGS -fno-var-tracking-assignments)
+endif()
+
if(FREEBSD)
list(APPEND libcommon_files common/freebsd_errno.cc)
elseif(DARWIN)
mgr/PyState.cc
mgr/MgrPyModule.cc
mgr/MgrStandby.cc
- mgr/Mgr.cc)
+ mgr/Mgr.cc
+ mgr/mgr_commands.cc)
add_executable(ceph-mgr ${mgr_srcs}
$<TARGET_OBJECTS:heap_profiler_objs>)
target_include_directories(ceph-mgr PRIVATE "${PYTHON_INCLUDE_DIRS}")
install(TARGETS cephfs DESTINATION ${CMAKE_INSTALL_LIBDIR})
install(DIRECTORY
"${CMAKE_SOURCE_DIR}/src/include/cephfs"
- DESTINATION include)
+ DESTINATION ${CMAKE_INSTALL_INCLUDEDIR})
set(ceph_syn_srcs
ceph_syn.cc
client/SyntheticClient.cc)
}
return NULL;
}
-
-
-void AuthSessionHandler::print_auth_session_handler_stats() {
- ldout(cct,10) << "Auth Session Handler Stats " << this << dendl;
- ldout(cct,10) << " Messages Signed = " << messages_signed << dendl;
- ldout(cct,10) << " Signatures Checked = " << signatures_checked << dendl;
- ldout(cct,10) << " Signatures Matched = " << signatures_matched << dendl;
- ldout(cct,10) << " Signatures Did Not Match = " << signatures_failed << dendl;
- ldout(cct,10) << " Messages Encrypted = " << messages_encrypted << dendl;
- ldout(cct,10) << " Messages Decrypted = " << messages_decrypted << dendl;
-}
CryptoKey key;
public:
- // Keep stats on how many messages were signed, how many messages were encrypted, how many
- // signatures were properly checked, and how many messages were decrypted. PLR
- int messages_signed;
- int signatures_checked;
- int signatures_matched;
- int signatures_failed;
- int messages_encrypted;
- int messages_decrypted;
-
- explicit AuthSessionHandler(CephContext *cct_) : cct(cct_), protocol(CEPH_AUTH_UNKNOWN), messages_signed(0),
- signatures_checked(0), signatures_matched(0), signatures_failed(0), messages_encrypted(0),
- messages_decrypted(0) {}
+ explicit AuthSessionHandler(CephContext *cct_) : cct(cct_), protocol(CEPH_AUTH_UNKNOWN) {}
AuthSessionHandler(CephContext *cct_, int protocol_, CryptoKey key_) : cct(cct_),
- protocol(protocol_), key(key_), messages_signed(0), signatures_checked(0), signatures_matched(0),
- signatures_failed(0), messages_encrypted(0), messages_decrypted(0) {}
+ protocol(protocol_), key(key_) {}
virtual ~AuthSessionHandler() { }
- void print_auth_session_handler_stats() ;
-
virtual bool no_security() = 0;
virtual int sign_message(Message *message) = 0;
virtual int check_message_signature(Message *message) = 0;
ceph_msg_footer& f = m->get_footer();
f.sig = sig;
f.flags = (unsigned)f.flags | CEPH_MSG_FOOTER_SIGNED;
- messages_signed++;
ldout(cct, 20) << "Putting signature in client message(seq # " << m->get_seq()
<< "): sig = " << sig << dendl;
return 0;
if (r < 0)
return r;
- signatures_checked++;
-
if (sig != m->get_footer().sig) {
// Should have been signed, but signature check failed. PLR
if (!(m->get_footer().flags & CEPH_MSG_FOOTER_SIGNED)) {
// security failure, particularly when there are large numbers of
// them, since the latter is a potential sign of an attack. PLR
- signatures_failed++;
ldout(cct, 0) << "Signature failed." << dendl;
return (SESSION_SIGNATURE_FAILURE);
}
- // If we get here, the signature checked. PLR
- signatures_matched++;
-
return 0;
}
if rc:
#uuid is not yet set.
uid = str(uuid.uuid4())
- (rc, o, e) = run_command(['ceph', 'config-key', 'put',
+ (rc, o, e) = run_command(['ceph', 'config-key', 'set',
CLUSTER_UUID_NAME, uid])
if rc:
- raise RuntimeError("\'ceph config-key put\' failed -" + e)
+ raise RuntimeError("\'ceph config-key set\' failed -" + e)
return uid
#!/usr/bin/env python
#
-# Copyright (C) 2015, 2016 Red Hat <contact@redhat.com>
+# Copyright (C) 2015, 2016, 2017 Red Hat <contact@redhat.com>
# Copyright (C) 2014 Inktank <info@inktank.com>
# Copyright (C) 2014 Cloudwatt <libre.licensing@cloudwatt.com>
# Copyright (C) 2014 Catalyst.net Ltd
import uuid
import time
import shlex
+import shutil
import pwd
import grp
import textwrap
PROCDIR = '/compat/linux/proc'
# FreeBSD does not have blockdevices any more
BLOCKDIR = '/dev'
+ ROOTGROUP = 'wheel'
else:
FREEBSD = False
DEFAULT_FS_TYPE = 'xfs'
PROCDIR = '/proc'
BLOCKDIR = '/sys/block'
+ ROOTGROUP = 'root'
"""
OSD STATUS Definition
return _bytes2str(out), _bytes2str(err), process.returncode
+def command_with_stdin(arguments, stdin):
+ LOG.info("Running command with stdin: " + " ".join(arguments))
+ process = subprocess.Popen(
+ arguments,
+ stdin=subprocess.PIPE,
+ stdout=subprocess.PIPE,
+ stderr=subprocess.PIPE)
+ out, err = process.communicate(stdin)
+ LOG.debug(out)
+ if process.returncode != 0:
+ LOG.error(err)
+ raise SystemExit(
+ "'{cmd}' failed with status code {returncode}".format(
+ cmd=arguments,
+ returncode=process.returncode,
+ )
+ )
+ return out
+
+
def _bytes2str(string):
return string.decode('utf-8') if isinstance(string, bytes) else string
cluster,
fsid,
keyring,
+ path,
):
"""
- Accocates an OSD id on the given cluster.
+ Allocates an OSD id on the given cluster.
:raises: Error if the call to allocate the OSD id fails.
:return: The allocated OSD id.
"""
+ lockbox_path = os.path.join(STATEDIR, 'osd-lockbox', fsid)
+ lockbox_osd_id = read_one_line(lockbox_path, 'whoami')
+ osd_keyring = os.path.join(path, 'keyring')
+ if lockbox_osd_id:
+ LOG.debug('Getting OSD id from Lockbox...')
+ osd_id = lockbox_osd_id
+ shutil.move(os.path.join(lockbox_path, 'osd_keyring'),
+ osd_keyring)
+ path_set_context(osd_keyring)
+ os.unlink(os.path.join(lockbox_path, 'whoami'))
+ return osd_id
LOG.debug('Allocating OSD id...')
+ secrets = Secrets()
try:
- osd_id = _check_output(
- args=[
+ wanttobe = read_one_line(path, 'wanttobe')
+ if os.path.exists(os.path.join(path, 'wanttobe')):
+ os.unlink(os.path.join(path, 'wanttobe'))
+ id_arg = wanttobe and [wanttobe] or []
+ osd_id = command_with_stdin(
+ [
'ceph',
'--cluster', cluster,
'--name', 'client.bootstrap-osd',
'--keyring', keyring,
- 'osd', 'create', '--concise',
+ '-i', '-',
+ 'osd', 'new',
fsid,
- ],
+ ] + id_arg,
+ secrets.get_json()
)
except subprocess.CalledProcessError as e:
raise Error('ceph osd create failed', e, e.output)
osd_id = must_be_one_line(osd_id)
check_osd_id(osd_id)
+ secrets.write_osd_keyring(osd_keyring, osd_id)
return osd_id
rawdev,
] + cryptsetup_parameters
- def run(args, stdin):
- LOG.info(" ".join(args))
- process = subprocess.Popen(
- args,
- stdin=subprocess.PIPE,
- stdout=subprocess.PIPE,
- stderr=subprocess.PIPE)
- out, err = process.communicate(stdin)
- LOG.debug(out)
- LOG.error(err)
- assert process.returncode == 0
-
try:
if luks:
if format_dev:
- run(luksFormat_args, key)
- run(luksOpen_args, key)
+ command_with_stdin(luksFormat_args, key)
+ command_with_stdin(luksOpen_args, key)
else:
# Plain mode has no format function, nor any validation
# that the key is correct.
- run(create_args, key)
+ command_with_stdin(create_args, key)
# set proper ownership of mapped device
command_check_call(['chown', 'ceph:ceph', dev])
return dev
metavar='UUID',
help='unique OSD uuid to assign this disk to',
)
+ parser.add_argument(
+ '--osd-id',
+ metavar='ID',
+ help='unique OSD id to assign this disk to',
+ )
parser.add_argument(
'--crush-device-class',
help='crush device class to assign this disk to',
return None
+class Secrets(object):
+
+ def __init__(self):
+ secret, stderr, ret = command(['ceph-authtool', '--gen-print-key'])
+ LOG.debug("stderr " + stderr)
+ assert ret == 0
+ self.keys = {
+ 'cephx_secret': secret.strip(),
+ }
+
+ def write_osd_keyring(self, keyring, osd_id):
+ command_check_call(
+ [
+ 'ceph-authtool', keyring,
+ '--create-keyring',
+ '--name', 'osd.' + str(osd_id),
+ '--add-key', self.keys['cephx_secret'],
+ ])
+ path_set_context(keyring)
+
+ def get_json(self):
+ return bytearray(json.dumps(self.keys), 'ascii')
+
+
+class LockboxSecrets(Secrets):
+
+ def __init__(self, args):
+ super(LockboxSecrets, self).__init__()
+
+ key_size = CryptHelpers.get_dmcrypt_keysize(args)
+ key = open('/dev/urandom', 'rb').read(key_size / 8)
+ base64_key = base64.b64encode(key).decode('ascii')
+
+ secret, stderr, ret = command(['ceph-authtool', '--gen-print-key'])
+ LOG.debug("stderr " + stderr)
+ assert ret == 0
+
+ self.keys.update({
+ 'dmcrypt_key': base64_key,
+ 'cephx_lockbox_secret': secret.strip(),
+ })
+
+ def write_lockbox_keyring(self, path, osd_uuid):
+ keyring = os.path.join(path, 'keyring')
+ command_check_call(
+ [
+ 'ceph-authtool', keyring,
+ '--create-keyring',
+ '--name', 'client.osd-lockbox.' + osd_uuid,
+ '--add-key', self.keys['cephx_lockbox_secret'],
+ ])
+ path_set_context(keyring)
+
+
class Lockbox(object):
def __init__(self, args):
def create_partition(self):
self.device = Device.factory(self.args.lockbox, argparse.Namespace())
- partition_number = 3
+ partition_number = 5
self.device.create_partition(uuid=self.args.lockbox_uuid,
name='lockbox',
num=partition_number,
self.partition = self.create_partition()
def create_key(self):
- key_size = CryptHelpers.get_dmcrypt_keysize(self.args)
- key = open('/dev/urandom', 'rb').read(key_size / 8)
- base64_key = base64.b64encode(key)
cluster = self.args.cluster
bootstrap = self.args.prepare_key_template.format(cluster=cluster,
statedir=STATEDIR)
- command_check_call(
- [
- 'ceph',
- '--cluster', cluster,
- '--name', 'client.bootstrap-osd',
- '--keyring', bootstrap,
- 'config-key',
- 'put',
- 'dm-crypt/osd/' + self.args.osd_uuid + '/luks',
- base64_key,
- ],
- )
- keyring, stderr, ret = command(
+ path = self.get_mount_point()
+ secrets = LockboxSecrets(self.args)
+ id_arg = self.args.osd_id and [self.args.osd_id] or []
+ osd_id = command_with_stdin(
[
'ceph',
'--cluster', cluster,
'--name', 'client.bootstrap-osd',
'--keyring', bootstrap,
- 'auth',
- 'get-or-create',
- 'client.osd-lockbox.' + self.args.osd_uuid,
- 'mon',
- ('allow command "config-key get" with key="dm-crypt/osd/' +
- self.args.osd_uuid + '/luks"'),
- ],
+ '-i', '-',
+ 'osd', 'new', self.args.osd_uuid,
+ ] + id_arg,
+ secrets.get_json()
)
- LOG.debug("stderr " + stderr)
- assert ret == 0
- path = self.get_mount_point()
- open(os.path.join(path, 'keyring'), 'w').write(keyring)
+ secrets.write_lockbox_keyring(path, self.args.osd_uuid)
+ osd_id = must_be_one_line(osd_id)
+ check_osd_id(osd_id)
+ write_one_line(path, 'whoami', osd_id)
+ secrets.write_osd_keyring(os.path.join(path, 'osd_keyring'), osd_id)
write_one_line(path, 'key-management-mode', KEY_MANAGEMENT_MODE_V1)
def symlink_spaces(self, path):
write_one_line(path, 'ceph_fsid', self.args.cluster_uuid)
write_one_line(path, 'fsid', self.args.osd_uuid)
+ if self.args.osd_id:
+ write_one_line(path, 'wanttobe', self.args.osd_id)
if self.args.crush_device_class:
write_one_line(path, 'crush_device_class',
self.args.crush_device_class)
write_one_line(path, 'type', 'bluestore')
-#
-# Temporary workaround: if ceph-osd --mkfs does not
-# complete within 5 minutes, assume it is blocked
-# because of http://tracker.ceph.com/issues/13522
-# and retry a few times.
-#
-# Remove this function calls with command_check_call
-# when http://tracker.ceph.com/issues/13522 is fixed
-#
-def ceph_osd_mkfs(arguments):
- timeout = _get_command_executable(['timeout'])
- mkfs_ok = False
- error = 'unknown error'
- for delay in os.environ.get('CEPH_OSD_MKFS_DELAYS',
- '300 300 300 300 300').split():
- try:
- _check_output(timeout + [delay] + arguments)
- mkfs_ok = True
- break
- except subprocess.CalledProcessError as e:
- error = e.output
- if e.returncode == 124: # timeout fired, retry
- LOG.debug('%s timed out : %s (retry)'
- % (str(arguments), error))
- else:
- break
- if not mkfs_ok:
- raise Error('%s failed : %s' % (str(arguments), error))
-
-
def mkfs(
path,
cluster,
osd_type = read_one_line(path, 'type')
if osd_type == 'bluestore':
- ceph_osd_mkfs(
+ command_check_call(
[
'ceph-osd',
'--cluster', cluster,
'--mkfs',
- '--mkkey',
'-i', osd_id,
'--monmap', monmap,
'--osd-data', path,
'--osd-uuid', fsid,
- '--keyring', os.path.join(path, 'keyring'),
'--setuser', get_ceph_user(),
'--setgroup', get_ceph_group(),
],
)
elif osd_type == 'filestore':
- ceph_osd_mkfs(
+ command_check_call(
[
'ceph-osd',
'--cluster', cluster,
'--mkfs',
- '--mkkey',
'-i', osd_id,
'--monmap', monmap,
'--osd-data', path,
'--osd-journal', os.path.join(path, 'journal'),
'--osd-uuid', fsid,
- '--keyring', os.path.join(path, 'keyring'),
'--setuser', get_ceph_user(),
'--setgroup', get_ceph_group(),
],
raise Error('unrecognized objectstore type %s' % osd_type)
-def auth_key(
- path,
- cluster,
- osd_id,
- keyring,
-):
- try:
- # try dumpling+ cap scheme
- command_check_call(
- [
- 'ceph',
- '--cluster', cluster,
- '--name', 'client.bootstrap-osd',
- '--keyring', keyring,
- 'auth', 'add', 'osd.{osd_id}'.format(osd_id=osd_id),
- '-i', os.path.join(path, 'keyring'),
- 'osd', 'allow *',
- 'mon', 'allow profile osd',
- ],
- )
- except subprocess.CalledProcessError as err:
- if err.returncode == errno.EINVAL:
- # try old cap scheme
- command_check_call(
- [
- 'ceph',
- '--cluster', cluster,
- '--name', 'client.bootstrap-osd',
- '--keyring', keyring,
- 'auth', 'add', 'osd.{osd_id}'.format(osd_id=osd_id),
- '-i', os.path.join(path, 'keyring'),
- 'osd', 'allow *',
- 'mon', 'allow rwx',
- ],
- )
- else:
- raise
-
-
def get_mount_point(cluster, osd_id):
parent = STATEDIR + '/osd'
return os.path.join(
cluster=cluster,
fsid=fsid,
keyring=keyring,
+ path=path,
)
write_one_line(path, 'whoami', osd_id)
LOG.debug('OSD id is %s', osd_id)
pass
if not os.path.exists(os.path.join(path, 'active')):
- LOG.debug('Authorizing OSD key...')
- auth_key(
- path=path,
- cluster=cluster,
- osd_id=osd_id,
- keyring=keyring,
- )
write_one_line(path, 'active', 'ok')
LOG.debug('%s osd.%s data dir is ready at %s', cluster, osd_id, path)
return (osd_id, cluster)
###########################
-def _remove_from_crush_map(cluster, osd_id):
- LOG.info("Prepare to remove osd.%s from crush map..." % osd_id)
- command([
- 'ceph',
- 'osd',
- 'crush',
- 'remove',
- 'osd.%s' % osd_id,
- ])
-
-
-def _delete_osd_auth_key(cluster, osd_id):
- LOG.info("Prepare to delete osd.%s cephx key..." % osd_id)
- command([
- 'ceph',
- 'auth',
- 'del',
- 'osd.%s' % osd_id,
- ])
-
-
-def _deallocate_osd_id(cluster, osd_id):
- LOG.info("Prepare to deallocate the osd-id: %s..." % osd_id)
- command([
- 'ceph',
- 'osd',
- 'rm',
- '%s' % osd_id,
- ])
-
-
def _remove_lockbox(uuid, cluster):
- command([
- 'ceph',
- '--cluster', cluster,
- 'auth',
- 'del',
- 'client.osd-lockbox.' + uuid,
- ])
- command([
- 'ceph',
- '--cluster', cluster,
- 'config-key',
- 'del',
- 'dm-crypt/osd/' + uuid + '/luks',
- ])
lockbox = os.path.join(STATEDIR, 'osd-lockbox')
if not os.path.exists(lockbox):
return
raise Error("Could not destroy the active osd. (osd-id: %s)" %
osd_id)
- # Remove OSD from crush map
- _remove_from_crush_map(args.cluster, osd_id)
-
- # Remove OSD cephx key
- _delete_osd_auth_key(args.cluster, osd_id)
-
- # Deallocate OSD ID
- _deallocate_osd_id(args.cluster, osd_id)
+ if args.purge:
+ action = 'purge'
+ else:
+ action = 'destroy'
+ LOG.info("Prepare to %s osd.%s" % (action, osd_id))
+ command([
+ 'ceph',
+ 'osd',
+ action,
+ 'osd.%s' % osd_id,
+ '--yes-i-really-mean-it',
+ ])
# we remove the crypt map and device mapper (if dmcrypt is True)
if dmcrypt:
if not os.path.exists(path):
raise Error('%s does not exist' % path)
- if path_is_diskdevice(path):
+ if not path_is_diskdevice(path):
raise Error('%s is not a block device' % path)
if (is_partition(path) and
if not os.path.exists(args.dev):
raise Error('%s does not exist' % args.dev)
+ if is_suppressed(args.dev):
+ LOG.info('suppressed activate request on space %s', args.dev)
+ return
+
cluster = None
osd_id = None
osd_uuid = None
disk = os.path.realpath(path)
if not os.path.exists(disk):
raise Error('does not exist', path)
- if ldev_is_diskdevice(path):
+ if not ldev_is_diskdevice(path):
raise Error('not a block device', path)
base = get_dev_name(disk)
def main_fix(args):
# A hash table containing 'path': ('uid', 'gid', blocking, recursive)
fix_table = [
- ('/usr/bin/ceph-mon', 'root', 'root', True, False),
- ('/usr/bin/ceph-mds', 'root', 'root', True, False),
- ('/usr/bin/ceph-osd', 'root', 'root', True, False),
- ('/usr/bin/radosgw', 'root', 'root', True, False),
- ('/etc/ceph', 'root', 'root', True, True),
+ ('/usr/bin/ceph-mon', 'root', ROOTGROUP, True, False),
+ ('/usr/bin/ceph-mds', 'root', ROOTGROUP, True, False),
+ ('/usr/bin/ceph-osd', 'root', ROOTGROUP, True, False),
+ ('/usr/bin/radosgw', 'root', ROOTGROUP, True, False),
+ ('/etc/ceph', 'root', ROOTGROUP, True, True),
('/var/run/ceph', 'ceph', 'ceph', True, True),
('/var/log/ceph', 'ceph', 'ceph', True, True),
('/var/log/radosgw', 'ceph', 'ceph', True, True),
destroy_parser = subparsers.add_parser(
'destroy',
formatter_class=argparse.RawDescriptionHelpFormatter,
- description=textwrap.fill(textwrap.dedent("""\
- Destroy the OSD located at PATH.
- It removes the OSD from the cluster, the crushmap and
- deallocates the OSD id. An OSD must be down before it
- can be destroyed.
- """)),
+ description=textwrap.fill(textwrap.dedent("""\ Destroy the OSD located at PATH. It removes the OSD from the
+ cluster and marks it destroyed. An OSD must be down before it
+ can be destroyed. Once it is destroyed, a new OSD can be created
+ in its place, reusing the same OSD id and position (e.g. after
+ a failed HDD or SSD is replaced). Alternatively, if the
+ --purge option is also specified, the OSD is removed from the
+ CRUSH map and the OSD id is deallocated.""")),
help='Destroy a Ceph OSD')
destroy_parser.add_argument(
'--cluster',
action='store_true', default=False,
help='option to erase data and partition',
)
+ destroy_parser.add_argument(
+ '--purge',
+ action='store_true', default=False,
+ help='option to remove OSD from CRUSH map and deallocate the id',
+ )
destroy_parser.set_defaults(
func=main_destroy,
)
#!/bin/bash
#
# Copyright (C) 2014 Cloudwatt <libre.licensing@cloudwatt.com>
-# Copyright (C) 2014, 2015, 2016 Red Hat <contact@redhat.com>
+# Copyright (C) 2014, 2015, 2016, 2017 Red Hat <contact@redhat.com>
#
# Author: Loic Dachary <loic@dachary.org>
#
CEPH_BIN=$CEPH_ROOT
CEPH_LIB=$CEPH_ROOT/.libs
fi
-source $CEPH_ROOT/qa/workunits/ceph-helpers.sh
+source $CEPH_ROOT/qa/standalone/ceph-helpers.sh
set -x
CEPH_DISK_ARGS=
CEPH_DISK_ARGS+=" --verbose"
CEPH_DISK_ARGS+=" --prepend-to-path="
-TIMEOUT=360
+: ${CEPH_DISK_TIMEOUT:=360}
if [ `uname` != FreeBSD ]; then
PROCDIR=""
else
fi
cat=$(which cat)
-timeout=$(which timeout)
diff=$(which diff)
mkdir=$(which mkdir)
rm=$(which rm)
uuidgen=$(which uuidgen)
+if [ `uname` = FreeBSD ]; then
+ # for unknown reasons FreeBSD timeout does not return sometimes
+ timeout=""
+else
+ timeout="$(which timeout) $CEPH_DISK_TIMEOUT"
+fi
function setup() {
local dir=$1
teardown $dir
mkdir -p $dir/osd
+ mkdir -p $(get_asok_dir)
touch $dir/ceph.conf # so ceph-disk think ceph is the cluster
}
umount $mounted
done
rm -fr $dir
+ rm -rf $(get_asok_dir)
}
function command_fixture() {
shift
run_mon $dir a
+ create_rbd_pool
local osd_data=$dir/dir
$mkdir -p $osd_data
else
expected=systemd
fi
- $timeout $TIMEOUT ${CEPH_DISK} $CEPH_DISK_ARGS \
+ $timeout ${CEPH_DISK} $CEPH_DISK_ARGS \
--verbose \
activate \
--mark-init=$expected \
grep --quiet $uuid $osd_data/ceph_fsid || return 1
}
+function read_write() {
+ local dir=$1
+ local file=${2:-$(uuidgen)}
+ local pool=rbd
+
+ echo FOO > $dir/$file
+ $timeout rados --pool $pool put $file $dir/$file || return 1
+ $timeout rados --pool $pool get $file $dir/$file.copy || return 1
+ $diff $dir/$file $dir/$file.copy || return 1
+}
+
function test_pool_read_write() {
- local osd_uuid=$1
- local TEST_POOL=rbd
-
- $timeout $TIMEOUT ceph osd pool set $TEST_POOL size 1 || return 1
-
- local id=$(ceph osd create $osd_uuid)
- local weight=1
- ceph osd crush add osd.$id $weight root=default host=localhost || return 1
- echo FOO > $dir/BAR
- $timeout $TIMEOUT rados --pool $TEST_POOL put BAR $dir/BAR || return 1
- $timeout $TIMEOUT rados --pool $TEST_POOL get BAR $dir/BAR.copy || return 1
- $diff $dir/BAR $dir/BAR.copy || return 1
+ local dir=$1
+ local pool=rbd
+
+ $timeout ceph osd pool set $pool size 1 || return 1
+ read_write $dir || return 1
}
function test_activate() {
- local to_prepare=$1
- local to_activate=$2
- local osd_uuid=$($uuidgen)
- local timeoutcmd
-
- if [ `uname` = FreeBSD ]; then
- # for unknown reasons FreeBSD timeout does not return here
- # So we run without timeout
- timeoutcmd=""
- else
- timeoutcmd="${timeout} $TIMEOUT"
- fi
+ local dir=$1
+ shift
+ local osd_data=$1
+ shift
+
+ mkdir -p $osd_data
${CEPH_DISK} $CEPH_DISK_ARGS \
- prepare --filestore --osd-uuid $osd_uuid $to_prepare || return 1
+ prepare --filestore "$@" $osd_data || return 1
- $timeoutcmd ${CEPH_DISK} $CEPH_DISK_ARGS \
+ $timeout ${CEPH_DISK} $CEPH_DISK_ARGS \
activate \
--mark-init=none \
- $to_activate || return 1
+ $osd_data || return 1
+
+ test_pool_read_write $dir || return 1
+}
- test_pool_read_write $osd_uuid || return 1
+function test_reuse_osd_id() {
+ local dir=$1
+
+ run_mon $dir a || return 1
+ run_mgr $dir x || return 1
+ create_rbd_pool
+
+ test_activate $dir $dir/dir1 --osd-uuid $(uuidgen) || return 1
+
+ #
+ # add a new OSD with a given OSD id (13)
+ #
+ local osd_uuid=$($uuidgen)
+ local osd_id=13
+ test_activate $dir $dir/dir2 --osd-id $osd_id --osd-uuid $osd_uuid || return 1
+ test $osd_id = $(ceph osd new $osd_uuid) || return 1
+
+ #
+ # make sure the OSD is in use by the PGs
+ #
+ wait_osd_id_used_by_pgs $osd_id $PG_NUM || return 1
+ read_write $dir SOMETHING || return 1
+
+ #
+ # set the OSD out and verify it is no longer used by the PGs
+ #
+ ceph osd out osd.$osd_id || return 1
+ wait_osd_id_used_by_pgs $osd_id 0 || return 1
+
+ #
+ # kill the OSD and destroy it (do not purge, retain its place in the crushmap)
+ #
+ kill_daemons $dir TERM osd.$osd_id || return 1
+ ceph osd destroy osd.$osd_id --yes-i-really-mean-it || return 1
+
+ #
+ # add a new OSD with the same id as the destroyed OSD
+ #
+ osd_uuid=$($uuidgen)
+ test_activate $dir $dir/dir3 --osd-id $osd_id --osd-uuid $osd_uuid || return 1
+ test $osd_id = $(ceph osd new $osd_uuid) || return 1
}
function test_activate_dir() {
local dir=$1
shift
- run_mon $dir a
+ run_mon $dir a || return 1
+ run_mgr $dir x || return 1
+ create_rbd_pool
$@
- local osd_data=$dir/dir
- $mkdir -p $osd_data
- test_activate $osd_data $osd_data || return 1
+ test_activate $dir $dir/dir || return 1
}
function test_activate_dir_bluestore() {
local dir=$1
- run_mon $dir a
+
+ run_mon $dir a || return 1
+ run_mgr $dir x || return 1
+ create_rbd_pool
local osd_data=$dir/dir
$mkdir -p $osd_data
prepare --bluestore --block-file --osd-uuid $osd_uuid $to_prepare || return 1
CEPH_ARGS=" --osd-objectstore=bluestore --bluestore-fsck-on-mount=true --bluestore-block-db-size=67108864 --bluestore-block-wal-size=134217728 --bluestore-block-size=10737418240 $CEPH_ARGS" \
- $timeout $TIMEOUT ${CEPH_DISK} $CEPH_DISK_ARGS \
+ $timeout ${CEPH_DISK} $CEPH_DISK_ARGS \
activate \
--mark-init=none \
$to_activate || return 1
- test_pool_read_write $osd_uuid || return 1
+
+ test_pool_read_write $dir || return 1
}
function test_find_cluster_by_uuid() {
grep --quiet "keyring $dir/bootstrap-osd/ceph.keyring" $dir/test_keyring || return 1
}
-# http://tracker.ceph.com/issues/13522
-function ceph_osd_fail_once_fixture() {
- local dir=$1
- local command=ceph-osd
- local fpath=`readlink -f $(which $command)`
- [ "$fpath" = `readlink -f $CEPH_BIN/$command` ] || [ "$fpath" = `readlink -f $(pwd)/$command` ] || return 1
-
- cat > $dir/$command <<EOF
-#!/bin/bash
-if echo "\$@" | grep -e --mkfs && ! test -f $dir/used-$command ; then
- touch $dir/used-$command
- # sleep longer than the first CEPH_OSD_MKFS_DELAYS value (5) below
- sleep 600
-else
- exec $CEPH_BIN/$command "\$@"
-fi
-EOF
- chmod +x $dir/$command
-}
-
-function test_ceph_osd_mkfs() {
- local dir=$1
- ceph_osd_fail_once_fixture $dir || return 1
- CEPH_OSD_MKFS_DELAYS='5 300 300' use_path $dir test_activate_dir || return 1
- [ -f $dir/used-ceph-osd ] || return 1
-}
-
function test_crush_device_class() {
local dir=$1
shift
run_mon $dir a
+ create_rbd_pool
local osd_data=$dir/dir
$mkdir -p $osd_data
test -f $osd_data/crush_device_class || return 1
test $(cat $osd_data/crush_device_class) = CRUSH_CLASS || return 1
- ceph osd crush class create CRUSH_CLASS || return 1
-
CEPH_ARGS="--crush-location=root=default $CEPH_ARGS" \
${CEPH_DISK} $CEPH_DISK_ARGS \
--verbose \
default_actions+="test_zap "
[ `uname` != FreeBSD ] && \
default_actions+="test_activate_dir_bluestore "
- default_actions+="test_ceph_osd_mkfs "
default_actions+="test_crush_device_class "
+ default_actions+="test_reuse_osd_id "
local actions=${@:-$default_actions}
for action in $actions ; do
setup $dir || return 1
-#!/bin/bash
+#!/usr/bin/env python
#
# Copyright (C) 2015, 2016 Red Hat <contact@redhat.com>
#
list_devices=list_devices_return,
get_partition_base=lambda dev_path: '/dev/sdY',
_check_osd_status=lambda cluster, osd_id: 0,
- _remove_from_crush_map=lambda cluster, osd_id: True,
- _delete_osd_auth_key=lambda cluster, osd_id: True,
- _deallocate_osd_id=lambda cluster, osd_id: True,
zap=lambda dev: True
):
main.main_destroy(args)
self.assertRaises(Exception, main.main_destroy, args)
shutil.rmtree(data)
- def test_remove_from_crush_map_fail(self):
- cluster = 'ceph'
- osd_id = '5566'
- with patch.multiple(
- main,
- command=raise_command_error
- ):
- self.assertRaises(Exception, main._remove_from_crush_map,
- cluster, osd_id)
-
- def test_delete_osd_auth_key_fail(self):
- cluster = 'ceph'
- osd_id = '5566'
- with patch.multiple(
- main,
- command=raise_command_error
- ):
- self.assertRaises(Exception, main._delete_osd_auth_key,
- cluster, osd_id)
-
- def test_deallocate_osd_id_fail(self):
- cluster = 'ceph'
- osd_id = '5566'
- with patch.multiple(
- main,
- command=raise_command_error
- ):
- self.assertRaises(Exception, main._deallocate_osd_id,
- cluster, osd_id)
-
def test_main_fix(self):
if platform.system() == "FreeBSD":
return
-#!/bin/bash
+#!/usr/bin/env python
#
# Copyright (C) 2015, 2016 Red Hat <contact@redhat.com>
#
set_type=set_type):
data = main.PrepareData(args)
assert data.args.cluster_uuid == cluster_uuid
+
+
+class TestSecrets(Base):
+
+ @mock.patch('ceph_disk.main.command')
+ def test_secrets(self, m_command):
+ key = "KEY"
+ m_command.side_effect = lambda cmd: (key + "\n", '', 0)
+ s = main.Secrets()
+ assert {"cephx_secret": key} == s.keys
+ assert '{"cephx_secret": "' + key + '"}' == s.get_json()
+
+ @mock.patch('ceph_disk.main.open')
+ @mock.patch('ceph_disk.main.CryptHelpers.get_dmcrypt_keysize')
+ @mock.patch('ceph_disk.main.command')
+ def test_lockbox_secrets(self,
+ m_command,
+ m_get_dmcrypt_keysize,
+ m_open):
+ key = "KEY"
+ m_command.side_effect = lambda cmd: (key + "\n", '', 0)
+ m_get_dmcrypt_keysize.side_effect = lambda args: 32
+
+ class File:
+ def read(self, size):
+ return b'O' * size
+
+ m_open.side_effect = lambda path, mode: File()
+ s = main.LockboxSecrets({})
+ assert {
+ "dmcrypt_key": 'T09PTw==',
+ "cephx_secret": key,
+ "cephx_lockbox_secret": key,
+ } == s.keys
parser.add_argument('-c', '--conf', dest='cephconf',
help='ceph configuration file')
parser.add_argument('-i', '--in-file', dest='input_file',
- help='input file')
+ help='input file, or "-" for stdin')
parser.add_argument('-o', '--out-file', dest='output_file',
- help='output file')
+ help='output file, or "-" for stdout')
parser.add_argument('--id', '--user', dest='client_id',
help='client id for authentication')
parser.add_argument('--admin-daemon', dest='admin_socket',
help='submit admin-socket commands (\"help\" for help')
- parser.add_argument('--admin-socket', dest='admin_socket_nope',
- help='you probably mean --admin-daemon')
parser.add_argument('-s', '--status', action='store_true',
help='show cluster status')
i = sys.argv.index("injectargs")
sys.argv = sys.argv[:i] + ceph_args.split() + sys.argv[i:]
else:
- sys.argv.extend(ceph_args.split())
+ sys.argv.extend([arg for arg in ceph_args.split()
+ if '--admin-socket' not in arg])
parser, parsed_args, childargs = parse_cmdargs()
if parsed_args.version:
if verbose:
print("parsed_args: {0}, childargs: {1}".format(parsed_args, childargs), file=sys.stderr)
- if parsed_args.admin_socket_nope:
- print('--admin-socket is used by daemons; '
- 'you probably mean --admin-daemon/daemon', file=sys.stderr)
- return 1
-
# pass on --id, --name, --conf
name = 'client.admin'
if parsed_args.client_id:
inbuf = b''
if parsed_args.input_file:
try:
- with open(parsed_args.input_file, 'rb') as f:
- inbuf = f.read()
+ if parsed_args.input_file == '-':
+ inbuf = sys.stdin.read()
+ else:
+ with open(parsed_args.input_file, 'rb') as f:
+ inbuf = f.read()
except Exception as e:
print('Can\'t open input file {0}: {1}'.format(parsed_args.input_file, e), file=sys.stderr)
return 1
# prepare output file, if any
if parsed_args.output_file:
try:
- outf = open(parsed_args.output_file, 'wb')
+ if parsed_args.output_file == '-':
+ outf = sys.stdout
+ else:
+ outf = open(parsed_args.output_file, 'wb')
except Exception as e:
print('Can\'t open output file {0}: {1}'.format(parsed_args.output_file, e), file=sys.stderr)
return 1
sys.stdout.flush()
- if parsed_args.output_file:
+ if parsed_args.output_file and parsed_args.output_file != '-':
outf.close()
if final_ret:
void usage()
{
cout <<
-"usage: ceph-fuse [-m mon-ip-addr:mon-port] <mount point> [OPTIONS]\n"
-" --client_mountpoint/-r <root_directory>\n"
-" use root_directory as the mounted root, rather than the full Ceph tree.\n"
+"usage: ceph-fuse [-n client.username] [-m mon-ip-addr:mon-port] <mount point> [OPTIONS]\n"
+" --client_mountpoint/-r <sub_directory>\n"
+" use sub_directory as the mounted root, rather than the full Ceph tree.\n"
"\n";
fuse_usage();
generic_client_usage();
CODE_ENVIRONMENT_DAEMON, 0,
"mgr_data");
// For consumption by KeyRing::from_ceph_context in MonClient
- g_conf->set_val("keyring", "$mgr_data/keyring", false);
+ g_conf->set_val_or_die("keyring", "$mgr_data/keyring");
// Handle --help
if ((args.size() == 1 && (std::string(args[0]) == "--help" || std::string(args[0]) == "-h"))) {
// We need to specify some default values that may be overridden by the
// user, that are specific to the monitor. The options we are overriding
// are also used on the OSD (or in any other component that uses leveldb),
- // so changing them directly in common/config_opts.h is not an option.
+ // so changing the global defaults is not an option.
// This is not the prettiest way of doing this, especially since it has us
- // having a different place than common/config_opts.h defining default
- // values, but it's not horribly wrong enough to prevent us from doing it :)
+ // having a different place defining default values, but it's not horribly
+ // wrong enough to prevent us from doing it :)
//
// NOTE: user-defined options will take precedence over ours.
//
list<Client*> clients;
list<SyntheticClient*> synclients;
- Messenger* messengers[g_conf->num_client];
- MonClient* mclients[g_conf->num_client];
+ Messenger* messengers[num_client];
+ MonClient* mclients[num_client];
- cout << "ceph-syn: starting " << g_conf->num_client << " syn client(s)" << std::endl;
- for (int i=0; i<g_conf->num_client; i++) {
+ cout << "ceph-syn: starting " << num_client << " syn client(s)" << std::endl;
+ for (int i=0; i<num_client; i++) {
messengers[i] = Messenger::create_client_messenger(g_ceph_context,
"synclient");
messengers[i]->bind(g_conf->public_addr);
delete client;
}
- for (int i = 0; i < g_conf->num_client; ++i) {
+ for (int i = 0; i < num_client; ++i) {
// wait for messenger to finish
delete mclients[i];
messengers[i]->shutdown();
if (!waitfor_caps && !waitfor_commit) {
if ((have & need) == need) {
- int butnot = want & ~(have & need);
int revoking = implemented & ~have;
ldout(cct, 10) << "get_caps " << *in << " have " << ccap_string(have)
<< " need " << ccap_string(need) << " want " << ccap_string(want)
- << " but not " << ccap_string(butnot) << " revoking " << ccap_string(revoking)
+ << " revoking " << ccap_string(revoking)
<< dendl;
- if ((revoking & butnot) == 0) {
+ if ((revoking & want) == 0) {
*phave = need | (have & want);
in->get_cap_ref(need);
return 0;
}
//make new dir
r = _mkdir(cur.get(), path[i].c_str(), mode, perms, &next);
+
//check proper creation/existence
- if (r < 0) return r;
+ if(-EEXIST == r && i < path.depth() - 1) {
+ r = _lookup(cur.get(), path[i].c_str(), CEPH_CAP_AUTH_SHARED, &next, perms);
+ }
+ if (r < 0)
+ return r;
//move to new dir and continue
cur.swap(next);
ldout(cct, 20) << "mkdirs: successfully created directory "
if (in->inline_version == 0) {
int r = _getattr(in, CEPH_STAT_CAP_INLINE_DATA, f->actor_perms, true);
- if (r < 0)
+ if (r < 0) {
+ if (movepos)
+ unlock_fh_pos(f);
return r;
+ }
assert(in->inline_version > 0);
}
retry:
int have;
int r = get_caps(in, CEPH_CAP_FILE_RD, CEPH_CAP_FILE_CACHE, &have, -1);
- if (r < 0)
+ if (r < 0) {
+ if (movepos)
+ unlock_fh_pos(f);
return r;
-
+ }
if (f->flags & O_DIRECT)
have &= ~CEPH_CAP_FILE_CACHE;
if (have)
put_cap_ref(in, CEPH_CAP_FILE_RD);
- return r < 0 ? r : bl->length();
+ if (r < 0) {
+ if (movepos)
+ unlock_fh_pos(f);
+ return r;
+ } else
+ return bl->length();
}
Client::C_Readahead::C_Readahead(Client *c, Fh *f) :
//void trace_include(SyntheticClient *syn, Client *cl, string& prefix);
//void trace_openssh(SyntheticClient *syn, Client *cl, string& prefix);
-
+int num_client = 1;
list<int> syn_modes;
list<int> syn_iargs;
list<string> syn_sargs;
vector<const char*> nargs;
for (unsigned i=0; i<args.size(); i++) {
+ if (strcmp(args[i],"--num-client") == 0) {
+ num_client = atoi(args[++i]);
+ continue;
+ }
if (strcmp(args[i],"--syn") == 0) {
++i;
int SyntheticClient::create_objects(int nobj, int osize, int inflight)
{
// divy up
- int numc = client->cct->_conf->num_client ? client->cct->_conf->num_client : 1;
+ int numc = num_client ? num_client : 1;
int start, inc, end;
if (sp < 0) dirnum++;
//dout(0) << "leading dir " << filename << " " << dirnum << dendl;
- if (dirnum % client->cct->_conf->num_client != client->get_nodeid()) {
+ if (dirnum % num_client != client->get_nodeid()) {
dout(20) << "skipping leading dir " << dirnum << " " << filename << dendl;
continue;
}
void parse_syn_options(vector<const char*>& args);
+extern int num_client;
class SyntheticClient {
StandaloneClient *client;
skip_client_key = key_from_client_id(*skip_client_id);
}
- int r;
uint64_t minimum_tag_tid = std::numeric_limits<uint64_t>::max();
std::string last_read = HEADER_KEY_CLIENT_PREFIX;
+ bool more;
do {
std::map<std::string, bufferlist> vals;
- r = cls_cxx_map_get_vals(hctx, last_read, HEADER_KEY_CLIENT_PREFIX,
- MAX_KEYS_READ, &vals);
+ int r = cls_cxx_map_get_vals(hctx, last_read, HEADER_KEY_CLIENT_PREFIX,
+ MAX_KEYS_READ, &vals, &more);
if (r < 0 && r != -ENOENT) {
CLS_ERR("failed to retrieve registered clients: %s",
cpp_strerror(r).c_str());
if (!vals.empty()) {
last_read = vals.rbegin()->first;
}
- } while (r == MAX_KEYS_READ);
+ } while (more);
// cannot expire tags if a client hasn't committed yet
if (minimum_tag_tid == std::numeric_limits<uint64_t>::max()) {
last_read = HEADER_KEY_TAG_PREFIX;
do {
std::map<std::string, bufferlist> vals;
- r = cls_cxx_map_get_vals(hctx, last_read, HEADER_KEY_TAG_PREFIX,
- MAX_KEYS_READ, &vals);
+ int r = cls_cxx_map_get_vals(hctx, last_read, HEADER_KEY_TAG_PREFIX,
+ MAX_KEYS_READ, &vals, &more);
if (r < 0 && r != -ENOENT) {
CLS_ERR("failed to retrieve tags: %s", cpp_strerror(r).c_str());
return r;
}
}
- if (tag_pass != TAG_PASS_DONE && vals.size() < MAX_KEYS_READ) {
+ if (tag_pass != TAG_PASS_DONE && !more) {
last_read = HEADER_KEY_TAG_PREFIX;
++tag_pass;
} else if (!vals.empty()) {
}
std::map<std::string, bufferlist> vals;
+ bool more;
int r = cls_cxx_map_get_vals(hctx, last_read, HEADER_KEY_CLIENT_PREFIX,
- max_return, &vals);
+ max_return, &vals, &more);
if (r < 0) {
CLS_ERR("failed to retrieve omap values: %s", cpp_strerror(r).c_str());
return r;
std::string last_read = HEADER_KEY_TAG_PREFIX;
do {
std::map<std::string, bufferlist> vals;
+ bool more;
r = cls_cxx_map_get_vals(hctx, last_read, HEADER_KEY_TAG_PREFIX,
- MAX_KEYS_READ, &vals);
+ MAX_KEYS_READ, &vals, &more);
if (r < 0 && r != -ENOENT) {
CLS_ERR("failed to retrieve tags: %s", cpp_strerror(r).c_str());
return r;
}
}
- if (tag_pass != TAG_PASS_DONE && vals.size() < MAX_KEYS_READ) {
+ if (tag_pass != TAG_PASS_DONE && !more) {
last_read = HEADER_KEY_TAG_PREFIX;
++tag_pass;
} else if (!vals.empty()) {
if (!max_entries || max_entries > MAX_ENTRIES)
max_entries = MAX_ENTRIES;
- int rc = cls_cxx_map_get_vals(hctx, from_index, log_index_prefix, max_entries + 1, &keys);
+ cls_log_list_ret ret;
+
+ int rc = cls_cxx_map_get_vals(hctx, from_index, log_index_prefix, max_entries, &keys, &ret.truncated);
if (rc < 0)
return rc;
- cls_log_list_ret ret;
-
list<cls_log_entry>& entries = ret.entries;
map<string, bufferlist>::iterator iter = keys.begin();
- bool done = false;
string marker;
- size_t i;
- for (i = 0; i < max_entries && iter != keys.end(); ++i, ++iter) {
+ for (; iter != keys.end(); ++iter) {
const string& index = iter->first;
marker = index;
if (use_time_boundary && index.compare(0, to_index.size(), to_index) >= 0) {
- done = true;
+ ret.truncated = false;
break;
}
}
}
- if (iter == keys.end())
- done = true;
-
- ret.marker = marker;
- ret.truncated = !done;
+ if (ret.truncated) {
+ ret.marker = marker;
+ }
::encode(ret, *out);
#define MAX_TRIM_ENTRIES 1000
size_t max_entries = MAX_TRIM_ENTRIES;
+ bool more;
- int rc = cls_cxx_map_get_vals(hctx, from_index, log_index_prefix, max_entries, &keys);
+ int rc = cls_cxx_map_get_vals(hctx, from_index, log_index_prefix, max_entries, &keys, &more);
if (rc < 0)
return rc;
map<string, bufferlist>::iterator iter = keys.begin();
- size_t i;
bool removed = false;
- for (i = 0; i < max_entries && iter != keys.end(); ++i, ++iter) {
+ for (; iter != keys.end(); ++iter) {
const string& index = iter->first;
CLS_LOG(20, "index=%s to_index=%s", index.c_str(), to_index.c_str());
int max_to_get = luaL_checkinteger(L, 2);
std::set<string> keys;
- int ret = cls_cxx_map_get_keys(hctx, start_after, max_to_get, &keys);
+ bool more;
+ int ret = cls_cxx_map_get_keys(hctx, start_after, max_to_get, &keys, &more);
if (ret < 0)
return clslua_opresult(L, 0, ret, 0);
int max_to_get = luaL_checkinteger(L, 3);
map<string, bufferlist> kvpairs;
+ bool more;
int ret = cls_cxx_map_get_vals(hctx, start_after, filter_prefix,
- max_to_get, &kvpairs);
+ max_to_get, &kvpairs, &more);
if (ret < 0)
return clslua_opresult(L, 0, ret, 0);
int max_read = RBD_MAX_KEYS_READ;
vector<snapid_t> snap_ids;
string last_read = RBD_SNAP_KEY_PREFIX;
+ bool more;
do {
set<string> keys;
- r = cls_cxx_map_get_keys(hctx, last_read, max_read, &keys);
+ r = cls_cxx_map_get_keys(hctx, last_read, max_read, &keys, &more);
if (r < 0) {
return r;
}
if (!keys.empty()) {
last_read = *(keys.rbegin());
}
- } while (r == max_read);
+ } while (more);
}
cls_rbd_parent parent;
int max_read = RBD_MAX_KEYS_READ;
vector<snapid_t> snap_ids;
string last_read = RBD_SNAP_KEY_PREFIX;
+ bool more;
do {
set<string> keys;
- r = cls_cxx_map_get_keys(hctx, last_read, max_read, &keys);
+ r = cls_cxx_map_get_keys(hctx, last_read, max_read, &keys, &more);
if (r < 0)
return r;
}
if (!keys.empty())
last_read = *(keys.rbegin());
- } while (r == max_read);
+ } while (more);
uint64_t snap_seq;
r = read_key(hctx, "snap_seq", &snap_seq);
int max_read = RBD_MAX_KEYS_READ;
uint64_t total_read = 0;
string last_read = RBD_SNAP_KEY_PREFIX;
+ bool more;
do {
map<string, bufferlist> vals;
r = cls_cxx_map_get_vals(hctx, last_read, RBD_SNAP_KEY_PREFIX,
- max_read, &vals);
+ max_read, &vals, &more);
if (r < 0)
return r;
if (!vals.empty())
last_read = vals.rbegin()->first;
- } while (r == RBD_MAX_KEYS_READ);
+ } while (more);
// snapshot inherits parent, if any
cls_rbd_parent parent;
int max_read = RBD_MAX_KEYS_READ;
string last_read = RBD_SNAP_KEY_PREFIX;
+ bool more;
do {
map<string, bufferlist> vals;
r = cls_cxx_map_get_vals(hctx, last_read, RBD_SNAP_KEY_PREFIX,
- max_read, &vals);
+ max_read, &vals, &more);
if (r < 0)
return r;
}
if (!vals.empty())
last_read = vals.rbegin()->first;
- } while (r == RBD_MAX_KEYS_READ);
+ } while (more);
key_from_snap_id(src_snap_id, &src_snap_key);
r = read_key(hctx, src_snap_key, &snap_meta);
}
int max_read = RBD_MAX_KEYS_READ;
- int r = max_read;
map<string, string> images;
string last_read = dir_key_for_name(start_after);
+ bool more = true;
- while (r == max_read && images.size() < max_return) {
+ while (more && images.size() < max_return) {
map<string, bufferlist> vals;
CLS_LOG(20, "last_read = '%s'", last_read.c_str());
- r = cls_cxx_map_get_vals(hctx, last_read, RBD_DIR_NAME_KEY_PREFIX,
- max_read, &vals);
+ int r = cls_cxx_map_get_vals(hctx, last_read, RBD_DIR_NAME_KEY_PREFIX,
+ max_read, &vals, &more);
if (r < 0) {
CLS_ERR("error reading directory by name: %s", cpp_strerror(r).c_str());
return r;
map<string, bufferlist> data;
string last_read = metadata_key_for_name(start_after);
int max_read = max_return ? MIN(RBD_MAX_KEYS_READ, max_return) : RBD_MAX_KEYS_READ;
+ bool more;
do {
map<string, bufferlist> raw_data;
int r = cls_cxx_map_get_vals(hctx, last_read, RBD_METADATA_KEY_PREFIX,
- max_read, &raw_data);
+ max_read, &raw_data, &more);
if (r < 0) {
CLS_ERR("failed to read the vals off of disk: %s", cpp_strerror(r).c_str());
return r;
for (; it != raw_data.end(); ++it)
data[metadata_name_from_key(it->first)].swap(it->second);
- if (r < max_read)
+ if (!more)
break;
last_read = raw_data.rbegin()->first;
if (max_return)
max_read = MIN(RBD_MAX_KEYS_READ, max_return - data.size());
- } while (max_read);
+ } while (more);
::encode(data, *out);
return 0;
std::vector<cls::rbd::MirrorPeer> *peers) {
std::string last_read = PEER_KEY_PREFIX;
int max_read = RBD_MAX_KEYS_READ;
- int r = max_read;
- while (r == max_read) {
+ bool more = true;
+ while (more) {
std::map<std::string, bufferlist> vals;
- r = cls_cxx_map_get_vals(hctx, last_read, PEER_KEY_PREFIX.c_str(),
- max_read, &vals);
+ int r = cls_cxx_map_get_vals(hctx, last_read, PEER_KEY_PREFIX.c_str(),
+ max_read, &vals, &more);
if (r < 0) {
CLS_ERR("error reading peers: %s", cpp_strerror(r).c_str());
return r;
map<std::string, cls::rbd::MirrorImageStatus> *mirror_statuses) {
std::string last_read = image_key(start_after);
int max_read = RBD_MAX_KEYS_READ;
- int r = max_read;
+ bool more = true;
- while (r == max_read && mirror_images->size() < max_return) {
+ while (more && mirror_images->size() < max_return) {
std::map<std::string, bufferlist> vals;
CLS_LOG(20, "last_read = '%s'", last_read.c_str());
- r = cls_cxx_map_get_vals(hctx, last_read, IMAGE_KEY_PREFIX, max_read,
- &vals);
+ int r = cls_cxx_map_get_vals(hctx, last_read, IMAGE_KEY_PREFIX, max_read,
+ &vals, &more);
if (r < 0) {
CLS_ERR("error reading mirror image directory by name: %s",
cpp_strerror(r).c_str());
string last_read = IMAGE_KEY_PREFIX;
int max_read = RBD_MAX_KEYS_READ;
- r = max_read;
- while (r == max_read) {
+ bool more = true;
+ while (more) {
map<string, bufferlist> vals;
r = cls_cxx_map_get_vals(hctx, last_read, IMAGE_KEY_PREFIX,
- max_read, &vals);
+ max_read, &vals, &more);
if (r < 0) {
CLS_ERR("error reading mirrored images: %s", cpp_strerror(r).c_str());
return r;
string last_read = STATUS_GLOBAL_KEY_PREFIX;
int max_read = RBD_MAX_KEYS_READ;
- r = max_read;
- while (r == max_read) {
+ bool more = true;
+ while (more) {
map<string, bufferlist> vals;
r = cls_cxx_map_get_vals(hctx, last_read, STATUS_GLOBAL_KEY_PREFIX,
- max_read, &vals);
+ max_read, &vals, &more);
if (r < 0) {
CLS_ERR("error reading mirrored images: %s", cpp_strerror(r).c_str());
return r;
std::vector<std::string> *instance_ids) {
std::string last_read = INSTANCE_KEY_PREFIX;
int max_read = RBD_MAX_KEYS_READ;
- int r = max_read;
- while (r == max_read) {
+ bool more = true;
+ while (more) {
std::map<std::string, bufferlist> vals;
- r = cls_cxx_map_get_vals(hctx, last_read, INSTANCE_KEY_PREFIX.c_str(),
- max_read, &vals);
+ int r = cls_cxx_map_get_vals(hctx, last_read, INSTANCE_KEY_PREFIX.c_str(),
+ max_read, &vals, &more);
if (r < 0) {
if (r != -ENOENT) {
CLS_ERR("error reading mirror instances: %s", cpp_strerror(r).c_str());
}
int max_read = RBD_MAX_KEYS_READ;
- int r = max_read;
+ bool more = true;
std::map<std::string, std::string> mirror_images;
std::string last_read = mirror::image_key(start_after);
- while (r == max_read && mirror_images.size() < max_return) {
+ while (more && mirror_images.size() < max_return) {
std::map<std::string, bufferlist> vals;
CLS_LOG(20, "last_read = '%s'", last_read.c_str());
- r = cls_cxx_map_get_vals(hctx, last_read, mirror::IMAGE_KEY_PREFIX,
- max_read, &vals);
+ int r = cls_cxx_map_get_vals(hctx, last_read, mirror::IMAGE_KEY_PREFIX,
+ max_read, &vals, &more);
if (r < 0) {
CLS_ERR("error reading mirror image directory by name: %s",
cpp_strerror(r).c_str());
}
int max_read = RBD_MAX_KEYS_READ;
- int r = max_read;
+ bool more = true;
map<string, string> groups;
string last_read = dir_key_for_name(start_after);
- while (r == max_read && groups.size() < max_return) {
+ while (more && groups.size() < max_return) {
map<string, bufferlist> vals;
CLS_LOG(20, "last_read = '%s'", last_read.c_str());
- r = cls_cxx_map_get_vals(hctx, last_read, RBD_DIR_NAME_KEY_PREFIX,
- max_read, &vals);
+ int r = cls_cxx_map_get_vals(hctx, last_read, RBD_DIR_NAME_KEY_PREFIX,
+ max_read, &vals, &more);
if (r < 0) {
CLS_ERR("error reading directory by name: %s", cpp_strerror(r).c_str());
return r;
std::map<string, bufferlist> vals;
string last_read = start_after.image_key();
std::vector<cls::rbd::GroupImageStatus> res;
- int keys_read;
+ bool more;
do {
- keys_read = cls_cxx_map_get_vals(hctx, last_read,cls::rbd::RBD_GROUP_IMAGE_KEY_PREFIX,
- max_read, &vals);
- if (keys_read < 0)
- return keys_read;
+ int r = cls_cxx_map_get_vals(hctx, last_read,cls::rbd::RBD_GROUP_IMAGE_KEY_PREFIX,
+ max_read, &vals, &more);
+ if (r < 0)
+ return r;
for (map<string, bufferlist>::iterator it = vals.begin();
it != vals.end() && res.size() < max_return; ++it) {
last_read = res.rbegin()->spec.image_key();
}
- } while ((keys_read == RBD_MAX_KEYS_READ) && (res.size() < max_return));
+ } while (more && (res.size() < max_return));
::encode(res, *out);
return 0;
* Returns the list of trash spec entries registered in the rbd_trash
* object.
*
+ * Input:
+ * @param start_after which name to begin listing after
+ * (use the empty string to start at the beginning)
+ * @param max_return the maximum number of names to list
+ *
* Output:
* @param data the map between image id and trash spec info
*
*/
int trash_list(cls_method_context_t hctx, bufferlist *in, bufferlist *out)
{
+ string start_after;
+ uint64_t max_return;
+
+ try {
+ bufferlist::iterator iter = in->begin();
+ ::decode(start_after, iter);
+ ::decode(max_return, iter);
+ } catch (const buffer::error &err) {
+ return -EINVAL;
+ }
+
map<string, cls::rbd::TrashImageSpec> data;
- string last_read = trash::image_key("");
- int max_read = RBD_MAX_KEYS_READ;
+ string last_read = trash::image_key(start_after);
+ bool more = true;
CLS_LOG(20, "trash_get_images");
-
- do {
+ while (data.size() < max_return) {
map<string, bufferlist> raw_data;
+ // Clamp in 64 bits before converting: max_return is a caller-supplied
+ // uint64_t, and narrowing it to int32_t ahead of the comparison could
+ // produce a zero or negative read size for large values.
+ int max_read = static_cast<int>(std::min<uint64_t>(
+ RBD_MAX_KEYS_READ, max_return - data.size()));
int r = cls_cxx_map_get_vals(hctx, last_read, trash::IMAGE_KEY_PREFIX,
- max_read, &raw_data);
+ max_read, &raw_data, &more);
if (r < 0) {
- CLS_ERR("failed to read the vals off of disk: %s", cpp_strerror(r).c_str());
+ CLS_ERR("failed to read the vals off of disk: %s",
+ cpp_strerror(r).c_str());
return r;
}
if (raw_data.empty()) {
::decode(data[trash::image_id_from_key(it->first)], it->second);
}
- if (r < max_read) {
+ if (!more) {
break;
}
last_read = raw_data.rbegin()->first;
- } while (max_read);
+ }
::encode(data, *out);
-
return 0;
}
return ioctx->operate(RBD_TRASH, &op);
}
- void trash_list_start(librados::ObjectReadOperation *op)
+ void trash_list_start(librados::ObjectReadOperation *op,
+ const std::string &start, uint64_t max_return)
{
bufferlist bl;
+ ::encode(start, bl);
+ ::encode(max_return, bl);
op->exec("rbd", "trash_list", bl);
}
}
int trash_list(librados::IoCtx *ioctx,
+ const std::string &start, uint64_t max_return,
map<string, cls::rbd::TrashImageSpec> *entries)
{
librados::ObjectReadOperation op;
- trash_list_start(&op);
+ trash_list_start(&op, start, max_return);
bufferlist out_bl;
int r = ioctx->operate(RBD_TRASH, &op, &out_bl);
void trash_remove(librados::ObjectWriteOperation *op,
const std::string &id);
int trash_remove(librados::IoCtx *ioctx, const std::string &id);
- void trash_list_start(librados::ObjectReadOperation *op);
+ void trash_list_start(librados::ObjectReadOperation *op,
+ const std::string &start, uint64_t max_return);
int trash_list_finish(bufferlist::iterator *it,
map<string, cls::rbd::TrashImageSpec> *entries);
int trash_list(librados::IoCtx *ioctx,
+ const std::string &start, uint64_t max_return,
map<string, cls::rbd::TrashImageSpec> *entries);
void trash_get_start(librados::ObjectReadOperation *op,
const std::string &id);
struct obj_refcount {
map<string, bool> refs;
+ set<string> retired_refs;
obj_refcount() {}
void encode(bufferlist& bl) const {
- ENCODE_START(1, 1, bl);
+ ENCODE_START(2, 1, bl);
::encode(refs, bl);
+ ::encode(retired_refs, bl);
ENCODE_FINISH(bl);
}
void decode(bufferlist::iterator& bl) {
- DECODE_START(1, bl);
+ DECODE_START(2, bl);
::decode(refs, bl);
+ if (struct_v >= 2) {
+ ::decode(retired_refs, bl);
+ }
DECODE_FINISH(bl);
}
};
return 0;
}
-static int set_refcount(cls_method_context_t hctx, map<string, bool>& refs)
+static int set_refcount(cls_method_context_t hctx, const struct obj_refcount& objr)
{
bufferlist bl;
- struct obj_refcount objr;
-
- objr.refs = refs;
::encode(objr, bl);
objr.refs[op.tag] = true;
- ret = set_refcount(hctx, objr.refs);
+ ret = set_refcount(hctx, objr);
if (ret < 0)
return ret;
}
}
- if (!found)
+ if (!found ||
+ objr.retired_refs.find(op.tag) != objr.retired_refs.end())
return 0;
+ objr.retired_refs.insert(op.tag);
objr.refs.erase(iter);
if (objr.refs.empty()) {
return cls_cxx_remove(hctx);
}
- ret = set_refcount(hctx, objr.refs);
+ ret = set_refcount(hctx, objr);
if (ret < 0)
return ret;
objr.refs[*iter] = true;
}
- int ret = set_refcount(hctx, objr.refs);
+ int ret = set_refcount(hctx, objr);
if (ret < 0)
return ret;
* read list of objects, skips objects in the ugly namespace
*/
static int get_obj_vals(cls_method_context_t hctx, const string& start, const string& filter_prefix,
- int num_entries, map<string, bufferlist> *pkeys)
+ int num_entries, map<string, bufferlist> *pkeys, bool *pmore)
{
- int ret = cls_cxx_map_get_vals(hctx, start, filter_prefix, num_entries, pkeys);
+ int ret = cls_cxx_map_get_vals(hctx, start, filter_prefix, num_entries, pkeys, pmore);
if (ret < 0)
return ret;
string new_start = c;
/* now get some more keys */
- ret = cls_cxx_map_get_vals(hctx, new_start, filter_prefix, num_entries - pkeys->size(), &new_keys);
+ ret = cls_cxx_map_get_vals(hctx, new_start, filter_prefix, num_entries - pkeys->size(), &new_keys, pmore);
if (ret < 0)
return ret;
string start_key;
encode_list_index_key(hctx, op.start_obj, &start_key);
bool done = false;
- uint32_t left_to_read = op.num_entries + 1;
+ uint32_t left_to_read = op.num_entries;
+ bool more;
do {
- rc = get_obj_vals(hctx, start_key, op.filter_prefix, left_to_read, &keys);
+ rc = get_obj_vals(hctx, start_key, op.filter_prefix, left_to_read, &keys, &more);
if (rc < 0)
return rc;
}
} while (left_to_read > 0 && !done);
- ret.is_truncated = (left_to_read == 0) && /* we found more entries than we were requested, meaning response is truncated */
- !done;
+ ret.is_truncated = more && !done;
::encode(ret, *out);
return 0;
#define CHECK_CHUNK_SIZE 1000
bool done = false;
+ bool more;
do {
- rc = get_obj_vals(hctx, start_obj, filter_prefix, CHECK_CHUNK_SIZE, &keys);
+ rc = get_obj_vals(hctx, start_obj, filter_prefix, CHECK_CHUNK_SIZE, &keys, &more);
if (rc < 0)
return rc;
return rc;
}
- if (op.log_op) {
+ if (op.log_op && !header.syncstopped) {
rc = log_index_operation(hctx, op.key, op.op, op.tag, entry.meta.mtime,
entry.ver, info.state, header.ver, header.max_marker, op.bilog_flags, NULL, NULL, &op.zones_trace);
if (rc < 0)
bufferlist op_bl;
if (cancel) {
- if (op.log_op) {
+ if (op.log_op && !header.syncstopped) {
rc = log_index_operation(hctx, op.key, op.op, op.tag, entry.meta.mtime, entry.ver,
CLS_RGW_STATE_COMPLETE, header.ver, header.max_marker, op.bilog_flags, NULL, NULL, &op.zones_trace);
if (rc < 0)
break;
}
- if (op.log_op) {
+ if (op.log_op && !header.syncstopped) {
rc = log_index_operation(hctx, op.key, op.op, op.tag, entry.meta.mtime, entry.ver,
CLS_RGW_STATE_COMPLETE, header.ver, header.max_marker, op.bilog_flags, NULL, NULL, &op.zones_trace);
if (rc < 0)
remove_entry.key.name.c_str(), remove_entry.key.instance.c_str(), remove_entry.meta.category);
unaccount_entry(header, remove_entry);
- if (op.log_op) {
+ if (op.log_op && !header.syncstopped) {
rc = log_index_operation(hctx, remove_key, CLS_RGW_OP_DEL, op.tag, remove_entry.meta.mtime,
remove_entry.ver, CLS_RGW_STATE_COMPLETE, header.ver, header.max_marker, op.bilog_flags, NULL, NULL, &op.zones_trace);
if (rc < 0)
get_list_index_key(instance_entry, &list_idx);
/* this is the current head, need to update! */
map<string, bufferlist> keys;
+ bool more;
string filter = key.name; /* list key starts with key name, filter it to avoid a case where we cross to
different namespace */
- int ret = cls_cxx_map_get_vals(hctx, list_idx, filter, 1, &keys);
+ int ret = cls_cxx_map_get_vals(hctx, list_idx, filter, 1, &keys, &more);
if (ret < 0) {
return ret;
}
return ret;
}
- if (op.log_op) {
+ if (op.log_op && !header.syncstopped) {
rgw_bucket_dir_entry& entry = obj.get_dir_entry();
rgw_bucket_entry_ver ver;
return ret;
}
- if (op.log_op) {
+ if (op.log_op && !header.syncstopped) {
rgw_bucket_entry_ver ver;
ver.epoch = (op.olh_epoch ? op.olh_epoch : olh.get_epoch());
ret = cls_cxx_map_remove_key(hctx, cur_change_key);
if (ret < 0)
return ret;
- if (log_op && cur_disk.exists) {
+ if (log_op && cur_disk.exists && !header.syncstopped) {
ret = log_index_operation(hctx, cur_disk.key, CLS_RGW_OP_DEL, cur_disk.tag, cur_disk.meta.mtime,
cur_disk.ver, CLS_RGW_STATE_COMPLETE, header.ver, header.max_marker, 0, NULL, NULL, NULL);
if (ret < 0) {
ret = cls_cxx_map_set_val(hctx, cur_change_key, &cur_state_bl);
if (ret < 0)
return ret;
- if (log_op) {
+ if (log_op && !header.syncstopped) {
ret = log_index_operation(hctx, cur_change.key, CLS_RGW_OP_ADD, cur_change.tag, cur_change.meta.mtime,
cur_change.ver, CLS_RGW_STATE_COMPLETE, header.ver, header.max_marker, 0, NULL, NULL, NULL);
if (ret < 0) {
}
static int list_plain_entries(cls_method_context_t hctx, const string& name, const string& marker, uint32_t max,
- list<rgw_cls_bi_entry> *entries)
+ list<rgw_cls_bi_entry> *entries, bool *pmore)
{
string filter = name;
string start_key = marker;
int count = 0;
map<string, bufferlist> keys;
- do {
- if (count >= (int)max) {
+ int ret = cls_cxx_map_get_vals(hctx, start_key, filter, max, &keys, pmore);
+ if (ret < 0) {
+ return ret;
+ }
+
+ map<string, bufferlist>::iterator iter;
+ for (iter = keys.begin(); iter != keys.end(); ++iter) {
+ if (iter->first >= end_key) {
+ /* past the end of plain namespace */
return count;
}
- keys.clear();
-#define BI_GET_NUM_KEYS 128
- int ret = cls_cxx_map_get_vals(hctx, start_key, filter, BI_GET_NUM_KEYS, &keys);
- if (ret < 0) {
- return ret;
- }
-
- map<string, bufferlist>::iterator iter;
- for (iter = keys.begin(); iter != keys.end(); ++iter) {
- if (iter->first >= end_key) {
- /* past the end of plain namespace */
- return count;
- }
- rgw_cls_bi_entry entry;
- entry.type = PlainIdx;
- entry.idx = iter->first;
- entry.data = iter->second;
+ rgw_cls_bi_entry entry;
+ entry.type = PlainIdx;
+ entry.idx = iter->first;
+ entry.data = iter->second;
- bufferlist::iterator biter = entry.data.begin();
+ bufferlist::iterator biter = entry.data.begin();
- rgw_bucket_dir_entry e;
- try {
- ::decode(e, biter);
- } catch (buffer::error& err) {
- CLS_LOG(0, "ERROR: %s(): failed to decode buffer", __func__);
- return -EIO;
- }
+ rgw_bucket_dir_entry e;
+ try {
+ ::decode(e, biter);
+ } catch (buffer::error& err) {
+ CLS_LOG(0, "ERROR: %s(): failed to decode buffer", __func__);
+ return -EIO;
+ }
- CLS_LOG(20, "%s(): entry.idx=%s e.key.name=%s", __func__, escape_str(entry.idx).c_str(), escape_str(e.key.name).c_str());
+ CLS_LOG(20, "%s(): entry.idx=%s e.key.name=%s", __func__, escape_str(entry.idx).c_str(), escape_str(e.key.name).c_str());
- if (!name.empty() && e.key.name != name) {
- return count;
- }
+ if (!name.empty() && e.key.name != name) {
+ return count;
+ }
- entries->push_back(entry);
- count++;
- if (count >= (int)max) {
- return count;
- }
- start_key = entry.idx;
+ entries->push_back(entry);
+ count++;
+ if (count >= (int)max) {
+ return count;
}
- } while (!keys.empty());
+ start_key = entry.idx;
+ }
return count;
}
static int list_instance_entries(cls_method_context_t hctx, const string& name, const string& marker, uint32_t max,
- list<rgw_cls_bi_entry> *entries)
+ list<rgw_cls_bi_entry> *entries, bool *pmore)
{
cls_rgw_obj_key key(name);
string first_instance_idx;
}
int count = 0;
map<string, bufferlist> keys;
- bool started = true;
- do {
- if (count >= (int)max) {
- return count;
- }
- keys.clear();
-#define BI_GET_NUM_KEYS 128
- int ret;
- if (started) {
- ret = cls_cxx_map_get_val(hctx, start_key, &keys[start_key]);
- if (ret == -ENOENT) {
- ret = cls_cxx_map_get_vals(hctx, start_key, string(), BI_GET_NUM_KEYS, &keys);
- }
- started = false;
- } else {
- ret = cls_cxx_map_get_vals(hctx, start_key, string(), BI_GET_NUM_KEYS, &keys);
- }
+ bufferlist k;
+ int ret = cls_cxx_map_get_val(hctx, start_key, &k);
+ if (ret < 0 && ret != -ENOENT) {
+ return ret;
+ }
+ bool found_first = (ret == 0);
+ if (found_first) {
+ --max;
+ }
+ if (max > 0) {
+ ret = cls_cxx_map_get_vals(hctx, start_key, string(), max, &keys, pmore);
CLS_LOG(20, "%s(): start_key=%s first_instance_idx=%s keys.size()=%d", __func__, escape_str(start_key).c_str(), escape_str(first_instance_idx).c_str(), (int)keys.size());
if (ret < 0) {
return ret;
}
+ }
+ if (found_first) {
+ keys[start_key].claim(k);
+ }
- map<string, bufferlist>::iterator iter;
- for (iter = keys.begin(); iter != keys.end(); ++iter) {
- rgw_cls_bi_entry entry;
- entry.type = InstanceIdx;
- entry.idx = iter->first;
- entry.data = iter->second;
-
- if (!filter.empty() && entry.idx.compare(0, filter.size(), filter) != 0) {
- return count;
- }
+ map<string, bufferlist>::iterator iter;
+ for (iter = keys.begin(); iter != keys.end(); ++iter) {
+ rgw_cls_bi_entry entry;
+ entry.type = InstanceIdx;
+ entry.idx = iter->first;
+ entry.data = iter->second;
- CLS_LOG(20, "%s(): entry.idx=%s", __func__, escape_str(entry.idx).c_str());
+ if (!filter.empty() && entry.idx.compare(0, filter.size(), filter) != 0) {
+ return count;
+ }
- bufferlist::iterator biter = entry.data.begin();
+ CLS_LOG(20, "%s(): entry.idx=%s", __func__, escape_str(entry.idx).c_str());
- rgw_bucket_dir_entry e;
- try {
- ::decode(e, biter);
- } catch (buffer::error& err) {
- CLS_LOG(0, "ERROR: %s(): failed to decode buffer (size=%d)", __func__, entry.data.length());
- return -EIO;
- }
+ bufferlist::iterator biter = entry.data.begin();
- if (!name.empty() && e.key.name != name) {
- return count;
- }
+ rgw_bucket_dir_entry e;
+ try {
+ ::decode(e, biter);
+ } catch (buffer::error& err) {
+ CLS_LOG(0, "ERROR: %s(): failed to decode buffer (size=%d)", __func__, entry.data.length());
+ return -EIO;
+ }
- entries->push_back(entry);
- count++;
- start_key = entry.idx;
+ if (!name.empty() && e.key.name != name) {
+ return count;
}
- } while (!keys.empty());
+
+ entries->push_back(entry);
+ count++;
+ start_key = entry.idx;
+ }
return count;
}
static int list_olh_entries(cls_method_context_t hctx, const string& name, const string& marker, uint32_t max,
- list<rgw_cls_bi_entry> *entries)
+ list<rgw_cls_bi_entry> *entries, bool *pmore)
{
cls_rgw_obj_key key(name);
string first_instance_idx;
}
int count = 0;
map<string, bufferlist> keys;
- bool started = true;
- do {
- if (count >= (int)max) {
- return count;
- }
- keys.clear();
-#define BI_GET_NUM_KEYS 128
- int ret;
- if (started) {
- ret = cls_cxx_map_get_val(hctx, start_key, &keys[start_key]);
- if (ret == -ENOENT) {
- ret = cls_cxx_map_get_vals(hctx, start_key, string(), BI_GET_NUM_KEYS, &keys);
- }
- started = false;
- } else {
- ret = cls_cxx_map_get_vals(hctx, start_key, string(), BI_GET_NUM_KEYS, &keys);
- }
+ int ret;
+ bufferlist k;
+ ret = cls_cxx_map_get_val(hctx, start_key, &k);
+ if (ret < 0 && ret != -ENOENT) {
+ return ret;
+ }
+ bool found_first = (ret == 0);
+ if (found_first) {
+ --max;
+ }
+ if (max > 0) {
+ ret = cls_cxx_map_get_vals(hctx, start_key, string(), max, &keys, pmore);
CLS_LOG(20, "%s(): start_key=%s first_instance_idx=%s keys.size()=%d", __func__, escape_str(start_key).c_str(), escape_str(first_instance_idx).c_str(), (int)keys.size());
if (ret < 0) {
return ret;
}
+ }
- map<string, bufferlist>::iterator iter;
- for (iter = keys.begin(); iter != keys.end(); ++iter) {
- rgw_cls_bi_entry entry;
- entry.type = OLHIdx;
- entry.idx = iter->first;
- entry.data = iter->second;
+ if (found_first) {
+ keys[start_key].claim(k);
+ }
- if (!filter.empty() && entry.idx.compare(0, filter.size(), filter) != 0) {
- return count;
- }
+ map<string, bufferlist>::iterator iter;
+ for (iter = keys.begin(); iter != keys.end(); ++iter) {
+ rgw_cls_bi_entry entry;
+ entry.type = OLHIdx;
+ entry.idx = iter->first;
+ entry.data = iter->second;
- CLS_LOG(20, "%s(): entry.idx=%s", __func__, escape_str(entry.idx).c_str());
+ if (!filter.empty() && entry.idx.compare(0, filter.size(), filter) != 0) {
+ return count;
+ }
- bufferlist::iterator biter = entry.data.begin();
+ CLS_LOG(20, "%s(): entry.idx=%s", __func__, escape_str(entry.idx).c_str());
- rgw_bucket_olh_entry e;
- try {
- ::decode(e, biter);
- } catch (buffer::error& err) {
- CLS_LOG(0, "ERROR: %s(): failed to decode buffer (size=%d)", __func__, entry.data.length());
- return -EIO;
- }
+ bufferlist::iterator biter = entry.data.begin();
- if (!name.empty() && e.key.name != name) {
- return count;
- }
+ rgw_bucket_olh_entry e;
+ try {
+ ::decode(e, biter);
+ } catch (buffer::error& err) {
+ CLS_LOG(0, "ERROR: %s(): failed to decode buffer (size=%d)", __func__, entry.data.length());
+ return -EIO;
+ }
- entries->push_back(entry);
- count++;
- start_key = entry.idx;
+ if (!name.empty() && e.key.name != name) {
+ return count;
}
- } while (!keys.empty());
+
+ entries->push_back(entry);
+ count++;
+ start_key = entry.idx;
+ }
return count;
}
string filter = op.name;
#define MAX_BI_LIST_ENTRIES 1000
- int32_t max = (op.max < MAX_BI_LIST_ENTRIES ? op.max : MAX_BI_LIST_ENTRIES) + 1; /* one extra entry for identifying truncation */
+ int32_t max = (op.max < MAX_BI_LIST_ENTRIES ? op.max : MAX_BI_LIST_ENTRIES);
string start_key = op.marker;
- int ret = list_plain_entries(hctx, op.name, op.marker, max, &op_ret.entries);
+ bool more;
+ int ret = list_plain_entries(hctx, op.name, op.marker, max, &op_ret.entries, &more);
if (ret < 0) {
CLS_LOG(0, "ERROR: %s(): list_plain_entries retured ret=%d", __func__, ret);
return ret;
CLS_LOG(20, "found %d plain entries", count);
- ret = list_instance_entries(hctx, op.name, op.marker, max - count, &op_ret.entries);
- if (ret < 0) {
- CLS_LOG(0, "ERROR: %s(): list_instance_entries retured ret=%d", __func__, ret);
- return ret;
+ if (!more) {
+ ret = list_instance_entries(hctx, op.name, op.marker, max - count, &op_ret.entries, &more);
+ if (ret < 0) {
+ CLS_LOG(0, "ERROR: %s(): list_instance_entries retured ret=%d", __func__, ret);
+ return ret;
+ }
+
+ count += ret;
}
- count += ret;
+ if (!more) {
+ ret = list_olh_entries(hctx, op.name, op.marker, max - count, &op_ret.entries, &more);
+ if (ret < 0) {
+ CLS_LOG(0, "ERROR: %s(): list_olh_entries retured ret=%d", __func__, ret);
+ return ret;
+ }
- ret = list_olh_entries(hctx, op.name, op.marker, max - count, &op_ret.entries);
- if (ret < 0) {
- CLS_LOG(0, "ERROR: %s(): list_olh_entries retured ret=%d", __func__, ret);
- return ret;
+ count += ret;
}
- count += ret;
-
- op_ret.is_truncated = (count >= max);
+ op_ret.is_truncated = (count >= max) || more;
while (count >= max) {
op_ret.entries.pop_back();
count--;
string filter;
- do {
-#define BI_NUM_KEYS 128
- int ret = cls_cxx_map_get_vals(hctx, start_key, filter, BI_NUM_KEYS, &keys);
- if (ret < 0)
- return ret;
+ int ret = cls_cxx_map_get_vals(hctx, start_key, filter, max_entries, &keys, truncated);
+ if (ret < 0)
+ return ret;
- map<string, bufferlist>::iterator iter = keys.begin();
- if (iter == keys.end())
- break;
+ map<string, bufferlist>::iterator iter = keys.begin();
+ if (iter == keys.end())
+ return 0;
- for (; iter != keys.end(); ++iter) {
- const string& key = iter->first;
- rgw_bi_log_entry e;
+ uint32_t num_keys = keys.size();
- CLS_LOG(0, "bi_log_iterate_entries key=%s bl.length=%d\n", key.c_str(), (int)iter->second.length());
+ for (; iter != keys.end(); ++iter,++i) {
+ const string& key = iter->first;
+ rgw_bi_log_entry e;
- if (key.compare(end_key) > 0)
- return 0;
+ CLS_LOG(0, "bi_log_iterate_entries key=%s bl.length=%d\n", key.c_str(), (int)iter->second.length());
- ret = bi_log_record_decode(iter->second, e);
- if (ret < 0)
- return ret;
+ if (key.compare(end_key) > 0) {
+ key_iter = key;
+ return 0;
+ }
- if (max_entries && (i >= max_entries)) {
- if (truncated)
- *truncated = true;
- key_iter = key;
- return 0;
- }
+ ret = bi_log_record_decode(iter->second, e);
+ if (ret < 0)
+ return ret;
- ret = cb(hctx, key, e, param);
- if (ret < 0)
- return ret;
- i++;
+ ret = cb(hctx, key, e, param);
+ if (ret < 0)
+ return ret;
+ if (i == num_keys - 1) {
+ key_iter = key;
}
- --iter;
- start_key = iter->first;
- } while (true);
+ }
+
return 0;
}
return 0;
}
+/*
+ * Class method: resume bucket index logging on this index object.
+ *
+ * Reads the bucket dir header, stores a CLS_RGW_OP_RESYNC marker entry in the
+ * bilog under the key produced by bi_log_index_key(), raises
+ * header.max_marker if the new entry id is greater, clears
+ * header.syncstopped and writes the header back. Other ops in this file skip
+ * log_index_operation() while header.syncstopped is set, so clearing it
+ * lets them log again.
+ *
+ * Neither `in` nor `out` is read or written here.
+ *
+ * Returns the negative error from read_bucket_header() or
+ * cls_cxx_map_set_val() on failure, otherwise the result of
+ * write_bucket_header().
+ */
+static int rgw_bi_log_resync(cls_method_context_t hctx, bufferlist *in, bufferlist *out)
+{
+  struct rgw_bucket_dir_header header;
+  int rc = read_bucket_header(hctx, &header);
+  if (rc < 0) {
+    /* report under this method's own name, not a copy-pasted one */
+    CLS_LOG(1, "ERROR: %s(): failed to read header\n", __func__);
+    return rc;
+  }
+
+  struct rgw_bi_log_entry entry;
+
+  entry.timestamp = real_clock::now();
+  entry.op = RGWModifyOp::CLS_RGW_OP_RESYNC;
+  entry.state = RGWPendingState::CLS_RGW_STATE_COMPLETE;
+
+  /* key and entry.id are handed to bi_log_index_key() and used right after;
+   * presumably both are outputs -- the entry must be encoded only after
+   * this call so that the id is part of the stored record */
+  string key;
+  bi_log_index_key(hctx, key, entry.id, header.ver);
+
+  bufferlist bl;
+  ::encode(entry, bl);
+
+  if (entry.id > header.max_marker)
+    header.max_marker = entry.id;
+
+  header.syncstopped = false;
+
+  /* store the marker entry first; the header is only rewritten if that
+   * succeeded */
+  rc = cls_cxx_map_set_val(hctx, key, &bl);
+  if (rc < 0)
+    return rc;
+
+  return write_bucket_header(hctx, &header);
+}
+
+/*
+ * Class method: stop bucket index logging on this index object.
+ *
+ * Reads the bucket dir header, stores a CLS_RGW_OP_SYNCSTOP marker entry in
+ * the bilog under the key produced by bi_log_index_key(), raises
+ * header.max_marker if the new entry id is greater, sets
+ * header.syncstopped and writes the header back. Other ops in this file
+ * check header.syncstopped and skip log_index_operation() while it is set.
+ *
+ * Neither `in` nor `out` is read or written here.
+ *
+ * Returns the negative error from read_bucket_header() or
+ * cls_cxx_map_set_val() on failure, otherwise the result of
+ * write_bucket_header().
+ */
+static int rgw_bi_log_stop(cls_method_context_t hctx, bufferlist *in, bufferlist *out)
+{
+  struct rgw_bucket_dir_header header;
+  int rc = read_bucket_header(hctx, &header);
+  if (rc < 0) {
+    /* report under this method's own name, not a copy-pasted one */
+    CLS_LOG(1, "ERROR: %s(): failed to read header\n", __func__);
+    return rc;
+  }
+
+  struct rgw_bi_log_entry entry;
+
+  entry.timestamp = real_clock::now();
+  entry.op = RGWModifyOp::CLS_RGW_OP_SYNCSTOP;
+  entry.state = RGWPendingState::CLS_RGW_STATE_COMPLETE;
+
+  /* key and entry.id are handed to bi_log_index_key() and used right after;
+   * presumably both are outputs -- the entry must be encoded only after
+   * this call so that the id is part of the stored record */
+  string key;
+  bi_log_index_key(hctx, key, entry.id, header.ver);
+
+  bufferlist bl;
+  ::encode(entry, bl);
+
+  if (entry.id > header.max_marker)
+    header.max_marker = entry.id;
+
+  header.syncstopped = true;
+
+  /* store the marker entry first; the header is only rewritten if that
+   * succeeded */
+  rc = cls_cxx_map_set_val(hctx, key, &bl);
+  if (rc < 0)
+    return rc;
+
+  return write_bucket_header(hctx, &header);
+}
+
+
static void usage_record_prefix_by_time(uint64_t epoch, string& key)
{
char buf[32];
start_key = key_iter;
}
- do {
- CLS_LOG(20, "usage_iterate_range start_key=%s", start_key.c_str());
- int ret = cls_cxx_map_get_vals(hctx, start_key, filter_prefix, NUM_KEYS, &keys);
- if (ret < 0)
- return ret;
+ CLS_LOG(20, "usage_iterate_range start_key=%s", start_key.c_str());
+ int ret = cls_cxx_map_get_vals(hctx, start_key, filter_prefix, max_entries, &keys, truncated);
+ if (ret < 0)
+ return ret;
- map<string, bufferlist>::iterator iter = keys.begin();
- if (iter == keys.end())
- break;
+ map<string, bufferlist>::iterator iter = keys.begin();
+ if (iter == keys.end())
+ return 0;
- for (; iter != keys.end(); ++iter) {
- const string& key = iter->first;
- rgw_usage_log_entry e;
+ uint32_t num_keys = keys.size();
- if (!by_user && key.compare(end_key) >= 0) {
- CLS_LOG(20, "usage_iterate_range reached key=%s, done", key.c_str());
- return 0;
- }
+ for (; iter != keys.end(); ++iter,++i) {
+ const string& key = iter->first;
+ rgw_usage_log_entry e;
- if (by_user && key.compare(0, user_key.size(), user_key) != 0) {
- CLS_LOG(20, "usage_iterate_range reached key=%s, done", key.c_str());
- return 0;
- }
+ if (!by_user && key.compare(end_key) >= 0) {
+ CLS_LOG(20, "usage_iterate_range reached key=%s, done", key.c_str());
+ return 0;
+ }
- ret = usage_record_decode(iter->second, e);
- if (ret < 0)
- return ret;
+ if (by_user && key.compare(0, user_key.size(), user_key) != 0) {
+ CLS_LOG(20, "usage_iterate_range reached key=%s, done", key.c_str());
+ return 0;
+ }
- if (e.epoch < start)
- continue;
+ ret = usage_record_decode(iter->second, e);
+ if (ret < 0)
+ return ret;
- /* keys are sorted by epoch, so once we're past end we're done */
- if (e.epoch >= end)
- return 0;
+ if (e.epoch < start)
+ continue;
- ret = cb(hctx, key, e, param);
- if (ret < 0)
- return ret;
+ /* keys are sorted by epoch, so once we're past end we're done */
+ if (e.epoch >= end)
+ return 0;
+ ret = cb(hctx, key, e, param);
+ if (ret < 0)
+ return ret;
- i++;
- if (max_entries && (i > max_entries)) {
- CLS_LOG(20, "usage_iterate_range reached max_entries (%d), done", max_entries);
- *truncated = true;
- key_iter = key;
- return 0;
- }
+
+ if (i == num_keys - 1) {
+ key_iter = key;
+ return 0;
}
- --iter;
- start_key = iter->first;
- } while (true);
+ }
return 0;
}
}
string iter;
- ret = usage_iterate_range(hctx, op.start_epoch, op.end_epoch, op.user, iter, 0, NULL, usage_log_trim_cb, NULL);
+ bool more;
+#define MAX_USAGE_TRIM_ENTRIES 128
+ ret = usage_iterate_range(hctx, op.start_epoch, op.end_epoch, op.user, iter, MAX_USAGE_TRIM_ENTRIES, &more, usage_log_trim_cb, NULL);
if (ret < 0)
return ret;
string filter;
- do {
-#define GC_NUM_KEYS 32
- int ret = cls_cxx_map_get_vals(hctx, start_key, filter, GC_NUM_KEYS, &keys);
- if (ret < 0)
- return ret;
+ int ret = cls_cxx_map_get_vals(hctx, start_key, filter, max_entries, &keys, truncated);
+ if (ret < 0)
+ return ret;
- map<string, bufferlist>::iterator iter = keys.begin();
- if (iter == keys.end())
- break;
+ map<string, bufferlist>::iterator iter = keys.begin();
+ if (iter == keys.end())
+ return 0;
- for (; iter != keys.end(); ++iter) {
- const string& key = iter->first;
- cls_rgw_gc_obj_info e;
+ uint32_t num_keys = keys.size();
- CLS_LOG(10, "gc_iterate_entries key=%s\n", key.c_str());
+ for (; iter != keys.end(); ++iter, ++i) {
+ const string& key = iter->first;
+ cls_rgw_gc_obj_info e;
- if (!end_key.empty() && key.compare(end_key) >= 0)
- return 0;
+ CLS_LOG(10, "gc_iterate_entries key=%s\n", key.c_str());
- if (!key_in_index(key, GC_OBJ_TIME_INDEX))
- return 0;
+ if (!end_key.empty() && key.compare(end_key) >= 0)
+ return 0;
- ret = gc_record_decode(iter->second, e);
- if (ret < 0)
- return ret;
+ if (!key_in_index(key, GC_OBJ_TIME_INDEX))
+ return 0;
- if (max_entries && (i >= max_entries)) {
- if (truncated)
- *truncated = true;
- --iter;
- key_iter = iter->first;
- return 0;
- }
+ ret = gc_record_decode(iter->second, e);
+ if (ret < 0)
+ return ret;
- ret = cb(hctx, key, e, param);
- if (ret < 0)
- return ret;
- i++;
+ ret = cb(hctx, key, e, param);
+ if (ret < 0)
+ return ret;
+ if (i == num_keys - 1) {
+ key_iter = key;
}
- --iter;
- start_key = iter->first;
- } while (true);
+ }
+
return 0;
}
}
cls_rgw_gc_list_ret op_ret;
- int ret = gc_list_entries(hctx, op.marker, op.max, op.expired_only,
+#define GC_LIST_ENTRIES_DEFAULT 128
+ int ret = gc_list_entries(hctx, op.marker, (op.max ? op.max : GC_LIST_ENTRIES_DEFAULT), op.expired_only,
op_ret.entries, &op_ret.truncated, op_ret.next_marker);
if (ret < 0)
return ret;
map<string, bufferlist> vals;
string filter_prefix;
- int ret = cls_cxx_map_get_vals(hctx, op.marker, filter_prefix, 1, &vals);
+ bool more;
+ int ret = cls_cxx_map_get_vals(hctx, op.marker, filter_prefix, 1, &vals, &more);
if (ret < 0)
return ret;
map<string, bufferlist>::iterator it;
bufferlist::iterator iter;
map<string, bufferlist> vals;
string filter_prefix;
- int ret = cls_cxx_map_get_vals(hctx, op.marker, filter_prefix, op.max_entries, &vals);
+ int ret = cls_cxx_map_get_vals(hctx, op.marker, filter_prefix, op.max_entries, &vals, &op_ret.is_truncated);
if (ret < 0)
return ret;
map<string, bufferlist>::iterator it;
string filter_prefix;
#define MAX_RESHARD_LIST_ENTRIES 1000
/* one extra entry for identifying truncation */
- int32_t max = (op.max < MAX_RESHARD_LIST_ENTRIES ? op.max : MAX_RESHARD_LIST_ENTRIES) + 1;
- int ret = cls_cxx_map_get_vals(hctx, op.marker, filter_prefix, max, &vals);
+ int32_t max = (op.max && (op.max < MAX_RESHARD_LIST_ENTRIES) ? op.max : MAX_RESHARD_LIST_ENTRIES);
+ int ret = cls_cxx_map_get_vals(hctx, op.marker, filter_prefix, max, &vals, &op_ret.is_truncated);
if (ret < 0)
return ret;
map<string, bufferlist>::iterator it;
}
op_ret.entries.push_back(entry);
}
- op_ret.is_truncated = op.max && (vals.size() > op.max);
::encode(op_ret, *out);
return 0;
}
cls_method_handle_t h_rgw_bi_put_op;
cls_method_handle_t h_rgw_bi_list_op;
cls_method_handle_t h_rgw_bi_log_list_op;
+ cls_method_handle_t h_rgw_bi_log_resync_op;
+ cls_method_handle_t h_rgw_bi_log_stop_op;
cls_method_handle_t h_rgw_dir_suggest_changes;
cls_method_handle_t h_rgw_user_usage_log_add;
cls_method_handle_t h_rgw_user_usage_log_read;
cls_register_cxx_method(h_class, RGW_BI_LOG_TRIM, CLS_METHOD_RD | CLS_METHOD_WR, rgw_bi_log_trim, &h_rgw_bi_log_list_op);
cls_register_cxx_method(h_class, RGW_DIR_SUGGEST_CHANGES, CLS_METHOD_RD | CLS_METHOD_WR, rgw_dir_suggest_changes, &h_rgw_dir_suggest_changes);
+ cls_register_cxx_method(h_class, "bi_log_resync", CLS_METHOD_RD | CLS_METHOD_WR, rgw_bi_log_resync, &h_rgw_bi_log_resync_op);
+ cls_register_cxx_method(h_class, "bi_log_stop", CLS_METHOD_RD | CLS_METHOD_WR, rgw_bi_log_stop, &h_rgw_bi_log_stop_op);
+
/* usage logging */
cls_register_cxx_method(h_class, RGW_USER_USAGE_LOG_ADD, CLS_METHOD_RD | CLS_METHOD_WR, rgw_user_usage_log_add, &h_rgw_user_usage_log_add);
cls_register_cxx_method(h_class, RGW_USER_USAGE_LOG_READ, CLS_METHOD_RD, rgw_user_usage_log_read, &h_rgw_user_usage_log_read);
return issue_bucket_list_op(io_ctx, oid, nokey, "", 0, false, &manager, &result[shard_id]);
}
+// Builds a write operation that executes the "rgw" class method
+// "bi_log_resync" with an empty input buffer and submits it for object
+// `oid` through manager->aio_operate(); returns whatever aio_operate()
+// returns.
+static bool issue_resync_bi_log(librados::IoCtx& io_ctx, const string& oid, BucketIndexAioManager *manager)
+{
+  librados::ObjectWriteOperation resync_op;
+  bufferlist empty_input;  // nothing is encoded into the request payload
+  resync_op.exec("rgw", "bi_log_resync", empty_input);
+  return manager->aio_operate(io_ctx, oid, &resync_op);
+}
+
+// Issues the bi_log_resync request for the index object `oid`.
+// shard_id is accepted to match the issue_op() signature but is not used.
+int CLSRGWIssueResyncBucketBILog::issue_op(int shard_id, const string& oid)
+{
+  const bool issued = issue_resync_bi_log(io_ctx, oid, &manager);
+  return issued;
+}
+
+// Builds a write operation that executes the "rgw" class method
+// "bi_log_stop" with an empty input buffer and submits it for object `oid`
+// through manager->aio_operate(); returns whatever aio_operate() returns.
+static bool issue_bi_log_stop(librados::IoCtx& io_ctx, const string& oid, BucketIndexAioManager *manager)
+{
+  librados::ObjectWriteOperation stop_op;
+  bufferlist empty_input;  // nothing is encoded into the request payload
+  stop_op.exec("rgw", "bi_log_stop", empty_input);
+  return manager->aio_operate(io_ctx, oid, &stop_op);
+}
+
+// Issues the bi_log_stop request for the index object `oid`.
+// shard_id is accepted to match the issue_op() signature but is not used.
+int CLSRGWIssueBucketBILogStop::issue_op(int shard_id, const string& oid)
+{
+  const bool issued = issue_bi_log_stop(io_ctx, oid, &manager);
+  return issued;
+}
+
class GetDirHeaderCompletion : public ObjectOperationCompletion {
RGWGetDirHeader_CB *ret_ctx;
public:
uint32_t _max_aio) : CLSRGWConcurrentIO(ioc, _bucket_objs, _max_aio), entry(_entry) {}
};
+// CLSRGWConcurrentIO specialisation whose issue_op() sends the rgw
+// "bi_log_resync" class call for a bucket index object.
+// The constructor only forwards its arguments to the base class.
+class CLSRGWIssueResyncBucketBILog : public CLSRGWConcurrentIO {
+public:
+  CLSRGWIssueResyncBucketBILog(librados::IoCtx& io_ctx, map<int, string>& _bucket_objs, uint32_t max_aio)
+    : CLSRGWConcurrentIO(io_ctx, _bucket_objs, max_aio) {}
+
+protected:
+  // issue the request for one (shard_id, oid) pair
+  int issue_op(int shard_id, const string& oid);
+};
+
+// CLSRGWConcurrentIO specialisation whose issue_op() sends the rgw
+// "bi_log_stop" class call for a bucket index object.
+// The constructor only forwards its arguments to the base class.
+class CLSRGWIssueBucketBILogStop : public CLSRGWConcurrentIO {
+public:
+  CLSRGWIssueBucketBILogStop(librados::IoCtx& io_ctx, map<int, string>& _bucket_objs, uint32_t max_aio)
+    : CLSRGWConcurrentIO(io_ctx, _bucket_objs, max_aio) {}
+
+protected:
+  // issue the request for one (shard_id, oid) pair
+  int issue_op(int shard_id, const string& oid);
+};
+
int cls_rgw_get_dir_header_async(librados::IoCtx& io_ctx, string& oid, RGWGetDirHeader_CB *ctx);
void cls_rgw_encode_suggestion(char op, rgw_bucket_dir_entry& dirent, bufferlist& updates);
struct cls_rgw_lc_list_entries_ret {
map<string, int> entries;
+ bool is_truncated{false};
cls_rgw_lc_list_entries_ret() {}
void encode(bufferlist& bl) const {
- ENCODE_START(1, 1, bl);
+ ENCODE_START(2, 1, bl);
::encode(entries, bl);
+ ::encode(is_truncated, bl);
ENCODE_FINISH(bl);
}
void decode(bufferlist::iterator& bl) {
- DECODE_START(1, bl);
+ DECODE_START(2, bl);
::decode(entries, bl);
+ if (struct_v >= 2) {
+ ::decode(is_truncated, bl);
+ }
DECODE_FINISH(bl);
}
op = CLS_RGW_OP_LINK_OLH_DM;
} else if (op_str == "unlink_instance") {
op = CLS_RGW_OP_UNLINK_INSTANCE;
+ } else if (op_str == "syncstop") {
+ op = CLS_RGW_OP_SYNCSTOP;
+ } else if (op_str == "resync") {
+ op = CLS_RGW_OP_RESYNC;
} else {
op = CLS_RGW_OP_UNKNOWN;
}
case CLS_RGW_OP_UNLINK_INSTANCE:
f->dump_string("op", "unlink_instance");
break;
+ case CLS_RGW_OP_SYNCSTOP:
+ f->dump_string("op", "syncstop");
+ break;
+ case CLS_RGW_OP_RESYNC:
+ f->dump_string("op", "resync");
+ break;
default:
f->dump_string("op", "invalid");
break;
CLS_RGW_OP_LINK_OLH = 4,
CLS_RGW_OP_LINK_OLH_DM = 5, /* creation of delete marker */
CLS_RGW_OP_UNLINK_INSTANCE = 6,
+ CLS_RGW_OP_SYNCSTOP = 7,
+ CLS_RGW_OP_RESYNC = 8,
};
enum RGWBILogFlags {
uint64_t master_ver;
string max_marker;
cls_rgw_bucket_instance_entry new_instance;
+ bool syncstopped;
- rgw_bucket_dir_header() : tag_timeout(0), ver(0), master_ver(0) {}
+ rgw_bucket_dir_header() : tag_timeout(0), ver(0), master_ver(0), syncstopped(false) {}
void encode(bufferlist &bl) const {
- ENCODE_START(6, 2, bl);
+ ENCODE_START(7, 2, bl);
::encode(stats, bl);
::encode(tag_timeout, bl);
::encode(ver, bl);
::encode(master_ver, bl);
::encode(max_marker, bl);
::encode(new_instance, bl);
+ ::encode(syncstopped,bl);
ENCODE_FINISH(bl);
}
void decode(bufferlist::iterator &bl) {
} else {
new_instance = cls_rgw_bucket_instance_entry();
}
+ if (struct_v >= 7) {
+ ::decode(syncstopped,bl);
+ }
DECODE_FINISH(bl);
}
void dump(Formatter *f) const;
if (!max_entries || max_entries > MAX_ENTRIES)
max_entries = MAX_ENTRIES;
- int rc = cls_cxx_map_get_vals(hctx, from_index, match_prefix, max_entries + 1, &keys);
+ cls_statelog_list_ret ret;
+
+ int rc = cls_cxx_map_get_vals(hctx, from_index, match_prefix, max_entries, &keys, &ret.truncated);
if (rc < 0)
return rc;
CLS_LOG(20, "from_index=%s match_prefix=%s", from_index.c_str(), match_prefix.c_str());
- cls_statelog_list_ret ret;
list<cls_statelog_entry>& entries = ret.entries;
map<string, bufferlist>::iterator iter = keys.begin();
- bool done = false;
string marker;
- size_t i;
- for (i = 0; i < max_entries && iter != keys.end(); ++i, ++iter) {
+ for (; iter != keys.end(); ++iter) {
const string& index = iter->first;
marker = index;
}
}
- if (iter == keys.end())
- done = true;
- else
+ if (ret.truncated) {
ret.marker = marker;
-
- ret.truncated = !done;
+ }
::encode(ret, *out);
max_entries = MAX_LIST_ENTRIES;
}
+ cls_timeindex_list_ret ret;
+
int rc = cls_cxx_map_get_vals(hctx, from_index, TIMEINDEX_PREFIX,
- max_entries + 1, &keys);
+ max_entries, &keys, &ret.truncated);
if (rc < 0) {
return rc;
}
- cls_timeindex_list_ret ret;
-
list<cls_timeindex_entry>& entries = ret.entries;
map<string, bufferlist>::iterator iter = keys.begin();
- bool done = false;
string marker;
- for (size_t i = 0; i < max_entries && iter != keys.end(); ++i, ++iter) {
+ for (; iter != keys.end(); ++iter) {
const string& index = iter->first;
bufferlist& bl = iter->second;
if (use_time_boundary && index.compare(0, to_index.size(), to_index) >= 0) {
CLS_LOG(20, "DEBUG: cls_timeindex_list: finishing on to_index=%s",
to_index.c_str());
- done = true;
+ ret.truncated = false;
break;
}
}
}
- if (iter == keys.end()) {
- done = true;
- }
-
ret.marker = marker;
- ret.truncated = !done;
::encode(ret, *out);
to_index = op.to_marker;
}
+ bool more;
+
int rc = cls_cxx_map_get_vals(hctx, from_index, TIMEINDEX_PREFIX,
- MAX_TRIM_ENTRIES, &keys);
+ MAX_TRIM_ENTRIES, &keys, &more);
if (rc < 0) {
return rc;
}
map<string, bufferlist>::iterator iter = keys.begin();
bool removed = false;
- for (size_t i = 0; i < MAX_TRIM_ENTRIES && iter != keys.end(); ++i, ++iter) {
+ for (; iter != keys.end(); ++iter) {
const string& index = iter->first;
CLS_LOG(20, "index=%s to_index=%s", index.c_str(), to_index.c_str());
max_entries = MAX_ENTRIES;
string match_prefix;
+ cls_user_list_buckets_ret ret;
- int rc = cls_cxx_map_get_vals(hctx, from_index, match_prefix, max_entries + 1, &keys);
+ int rc = cls_cxx_map_get_vals(hctx, from_index, match_prefix, max_entries, &keys, &ret.truncated);
if (rc < 0)
return rc;
from_index.c_str(),
to_index.c_str(),
match_prefix.c_str());
- cls_user_list_buckets_ret ret;
list<cls_user_bucket_entry>& entries = ret.entries;
map<string, bufferlist>::iterator iter = keys.begin();
- bool done = false;
string marker;
- size_t i;
- for (i = 0; i < max_entries && iter != keys.end(); ++i, ++iter) {
+ for (; iter != keys.end(); ++iter) {
const string& index = iter->first;
marker = index;
- if (to_index_valid && to_index.compare(index) <= 0)
+ if (to_index_valid && to_index.compare(index) <= 0) {
+ ret.truncated = false;
break;
+ }
bufferlist& bl = iter->second;
bufferlist::iterator biter = bl.begin();
}
}
- if (iter == keys.end())
- done = true;
- else
+ if (ret.truncated) {
ret.marker = marker;
-
- ret.truncated = !done;
+ }
::encode(ret, *out);
int OutputDataSocket::dump_data(int fd)
{
m_lock.Lock();
- list<bufferlist> l;
- l = data;
+ list<bufferlist> l = std::move(data);
data.clear();
data_size = 0;
m_lock.Unlock();
// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
// vim: ts=8 sw=2 smarttab
+#include "include/scope_guard.h"
+
#include "common/Throttle.h"
#include "common/perf_counters.h"
Throttle::~Throttle()
{
- while (!cond.empty()) {
- Cond *cv = cond.front();
- delete cv;
- cond.pop_front();
+ {
+ Mutex::Locker l(lock);
+ assert(cond.empty());
}
if (!use_perf)
utime_t start;
bool waited = false;
if (_should_wait(c) || !cond.empty()) { // always wait behind other waiters.
- Cond *cv = new Cond;
- cond.push_back(cv);
- waited = true;
- ldout(cct, 2) << "_wait waiting..." << dendl;
- if (logger)
- start = ceph_clock_now();
-
- do {
- cv->Wait(lock);
- } while (_should_wait(c) || cv != cond.front());
-
- ldout(cct, 2) << "_wait finished waiting" << dendl;
- if (logger) {
- utime_t dur = ceph_clock_now() - start;
- logger->tinc(l_throttle_wait, dur);
+ {
+ auto cv = cond.insert(cond.end(), new Cond);
+ auto w = make_scope_guard([this, cv]() {
+ delete *cv;
+ cond.erase(cv);
+ });
+ waited = true;
+ ldout(cct, 2) << "_wait waiting..." << dendl;
+ if (logger)
+ start = ceph_clock_now();
+
+ do {
+ (*cv)->Wait(lock);
+ } while ((_should_wait(c) || cv != cond.begin()));
+
+ ldout(cct, 2) << "_wait finished waiting" << dendl;
+ if (logger) {
+ utime_t dur = ceph_clock_now() - start;
+ logger->tinc(l_throttle_wait, dur);
+ }
}
-
- delete cv;
- cond.pop_front();
-
// wake up the next guy
if (!cond.empty())
cond.front()->SignalOne();
BackoffThrottle::~BackoffThrottle()
{
+ {
+ locker l(lock);
+ assert(waiters.empty());
+ }
+
if (!use_perf)
return;
{
Mutex::Locker l(m_lock);
assert(m_current == 0);
+ assert(waiters == 0);
}
void SimpleThrottle::start_op()
{
Mutex::Locker l(m_lock);
- while (m_max == m_current)
+ while (m_max == m_current) {
+ waiters++;
m_cond.Wait(m_lock);
+ waiters--;
+ }
++m_current;
}
int SimpleThrottle::wait_for_ret()
{
Mutex::Locker l(m_lock);
- while (m_current > 0)
+ while (m_current > 0) {
+ waiters++;
m_cond.Wait(m_lock);
+ waiters--;
+ }
return m_ret;
}
m_ignore_enoent(ignore_enoent), m_next_tid(0), m_complete_tid(0) {
}
+// Takes m_lock and asserts that no thread is still counted in `waiters`,
+// the counter that is incremented around each m_cond.Wait() call in this
+// class.
+OrderedThrottle::~OrderedThrottle() {
+  Mutex::Locker guard(m_lock);
+  assert(waiters == 0);
+}
+
C_OrderedThrottle *OrderedThrottle::start_op(Context *on_finish) {
assert(on_finish != NULL);
complete_pending_ops();
while (m_max == m_current) {
+ ++waiters;
m_cond.Wait(m_lock);
+ --waiters;
complete_pending_ops();
}
++m_current;
complete_pending_ops();
while (m_current > 0) {
+ ++waiters;
m_cond.Wait(m_lock);
+ --waiters;
complete_pending_ops();
}
return m_ret_val;
#include <atomic>
#include <iostream>
#include <condition_variable>
+#include <stdexcept>
#include "Cond.h"
#include "include/Context.h"
Mutex lock;
list<Cond*> cond;
const bool use_perf;
+ bool shutting_down = false;
+ Cond shutdown_clear;
public:
Throttle(CephContext *cct, const std::string& n, int64_t m = 0, bool _use_perf = true);
uint64_t m_current;
int m_ret;
bool m_ignore_enoent;
+ uint32_t waiters = 0;
};
class OrderedThrottle {
public:
OrderedThrottle(uint64_t max, bool ignore_enoent);
+ ~OrderedThrottle();
C_OrderedThrottle *start_op(Context *on_finish);
void end_op(int r);
TidResult m_tid_result;
void complete_pending_ops();
+ uint32_t waiters = 0;
};
#endif
}
}
-void OpHistory::dump_ops(utime_t now, Formatter *f)
+void OpHistory::dump_ops(utime_t now, Formatter *f, set<string> filters)
{
Mutex::Locker history_lock(ops_history_lock);
cleanup(now);
arrived.begin();
i != arrived.end();
++i) {
+ if (!i->second->filter_out(filters))
+ continue;
f->open_object_section("op");
i->second->dump(now, f);
f->close_section();
f->close_section();
}
-void OpHistory::dump_ops_by_duration(utime_t now, Formatter *f)
+void OpHistory::dump_ops_by_duration(utime_t now, Formatter *f, set<string> filters)
{
Mutex::Locker history_lock(ops_history_lock);
cleanup(now);
arrived.begin();
i != arrived.end();
++i) {
+ if (!i->second->filter_out(filters))
+ continue;
durationvec.push_back(pair<double, TrackedOpRef>(i->second->get_duration(), i->second));
}
}
}
-bool OpTracker::dump_historic_ops(Formatter *f, bool by_duration)
+bool OpTracker::dump_historic_ops(Formatter *f, bool by_duration, set<string> filters)
{
RWLock::RLocker l(lock);
if (!tracking_enabled)
utime_t now = ceph_clock_now();
if (by_duration) {
- history.dump_ops_by_duration(now, f);
+ history.dump_ops_by_duration(now, f, filters);
} else {
- history.dump_ops(now, f);
+ history.dump_ops(now, f, filters);
}
return true;
}
-void OpHistory::dump_slow_ops(utime_t now, Formatter *f)
+void OpHistory::dump_slow_ops(utime_t now, Formatter *f, set<string> filters)
{
Mutex::Locker history_lock(ops_history_lock);
cleanup(now);
slow_op.begin();
i != slow_op.end();
++i) {
+ if (!i->second->filter_out(filters))
+ continue;
f->open_object_section("Op");
i->second->dump(now, f);
f->close_section();
f->close_section();
}
-bool OpTracker::dump_historic_slow_ops(Formatter *f)
+bool OpTracker::dump_historic_slow_ops(Formatter *f, set<string> filters)
{
RWLock::RLocker l(lock);
if (!tracking_enabled)
return false;
utime_t now = ceph_clock_now();
- history.dump_slow_ops(now, f);
+ history.dump_slow_ops(now, f, filters);
return true;
}
-bool OpTracker::dump_ops_in_flight(Formatter *f, bool print_only_blocked)
+bool OpTracker::dump_ops_in_flight(Formatter *f, bool print_only_blocked, set<string> filters)
{
RWLock::RLocker l(lock);
if (!tracking_enabled)
Mutex::Locker locker(sdata->ops_in_flight_lock_sharded);
for (auto& op : sdata->ops_in_flight_sharded) {
if (print_only_blocked && (now - op.get_initiated() <= complaint_time))
- break;
+ break;
+ if (!op.filter_out(filters))
+ continue;
f->open_object_section("op");
op.dump(now, f);
f->close_section(); // this TrackedOp
events.push_back(Event(stamp, event));
current = events.back().c_str();
}
- dout(6) << "seq: " << seq
+ dout(6) << " seq: " << seq
<< ", time: " << stamp
<< ", event: " << event
<< ", op: " << get_desc()
events.push_back(Event(stamp, event));
current = event;
}
- dout(6) << "seq: " << seq
+ dout(6) << " seq: " << seq
<< ", time: " << stamp
<< ", event: " << event
<< ", op: " << get_desc()
f->dump_float("age", now - get_initiated());
f->dump_float("duration", get_duration());
{
- f->open_array_section("type_data");
+ f->open_object_section("type_data");
_dump(f);
f->close_section();
}
assert(slow_op.empty());
}
void insert(utime_t now, TrackedOpRef op);
- void dump_ops(utime_t now, Formatter *f);
- void dump_ops_by_duration(utime_t now, Formatter *f);
- void dump_slow_ops(utime_t now, Formatter *f);
+ void dump_ops(utime_t now, Formatter *f, set<string> filters = {""});
+ void dump_ops_by_duration(utime_t now, Formatter *f, set<string> filters = {""});
+ void dump_slow_ops(utime_t now, Formatter *f, set<string> filters = {""});
void on_shutdown();
void set_size_and_duration(uint32_t new_size, uint32_t new_duration) {
history_size = new_size;
RWLock::WLocker l(lock);
tracking_enabled = enable;
}
- bool dump_ops_in_flight(Formatter *f, bool print_only_blocked=false);
- bool dump_historic_ops(Formatter *f, bool by_duration = false);
- bool dump_historic_slow_ops(Formatter *f);
+ bool dump_ops_in_flight(Formatter *f, bool print_only_blocked = false, set<string> filters = {""});
+ bool dump_historic_ops(Formatter *f, bool by_duration = false, set<string> filters = {""});
+ bool dump_historic_slow_ops(Formatter *f, set<string> filters = {""});
bool register_inflight_op(TrackedOp *i);
void unregister_inflight_op(TrackedOp *i);
/// return a unique descriptor of the Op; eg the message it's attached to
virtual void _dump_op_descriptor_unlocked(ostream& stream) const = 0;
/// called when the last non-OpTracker reference is dropped
- virtual void _unregistered() {};
+ virtual void _unregistered() {}
+
+ virtual bool filter_out(const set<string>& filters) { return true; }
public:
ZTracer::Trace osd_trace;
ldout(cct,10) << "worker start" << dendl;
std::stringstream ss;
- char name[16] = {0};
- ceph_pthread_getname(pthread_self(), name, sizeof(name));
- ss << name << " thread " << name;
+ ss << name << " thread " << (void *)pthread_self();
heartbeat_handle_d *hb = cct->get_heartbeat_map()->add_worker(ss.str(), pthread_self());
while (!_stop) {
ldout(cct,10) << "worker start" << dendl;
std::stringstream ss;
- char name[16] = {0};
- ceph_pthread_getname(pthread_self(), name, sizeof(name));
- ss << name << " thread " << name;
+ ss << name << " thread " << (void *)pthread_self();
heartbeat_handle_d *hb = cct->get_heartbeat_map()->add_worker(ss.str(), pthread_self());
while (!stop_threads) {
return buffer_c_str_accesses;
}
+#ifdef CEPH_HAVE_SETPIPE_SZ
static std::atomic<unsigned> buffer_max_pipe_size { 0 };
int update_max_pipe_size() {
-#ifdef CEPH_HAVE_SETPIPE_SZ
char buf[32];
int r;
std::string err;
if (!err.empty())
return -EIO;
buffer_max_pipe_size = size;
-#endif
return 0;
}
size_t get_max_pipe_size() {
-#ifdef CEPH_HAVE_SETPIPE_SZ
size_t size = buffer_max_pipe_size;
if (size)
return size;
if (update_max_pipe_size() == 0)
return buffer_max_pipe_size;
-#endif
// this is the max size hardcoded in linux before 2.6.35
return 65536;
}
+#else
+ size_t get_max_pipe_size() { return 65536; }
+#endif
+
const char * buffer::error::what() const throw () {
return "buffer::exception";
f->dump_string(var.c_str(), buf);
}
}
+ } else if (command == "config help") {
+ // With a "var" argument, dump the schema entry for that one option
+ // (or an "error" string when the key is unknown). Without it, dump
+ // every entry of ceph_options inside an "options" array.
+ std::string var;
+ if (!cmd_getval(this, cmdmap, "var", var)) {
+ f->open_array_section("options");
+ for (const auto &option : ceph_options) {
+ option.dump(f);
+ }
+ f->close_section();
+ } else {
+ std::string key = ConfFile::normalize_key_name(var);
+ const auto found = _conf->schema.find(key);
+ if (found != _conf->schema.end()) {
+ found->second.dump(f);
+ } else {
+ std::ostringstream msg;
+ msg << "Setting not found: '" << key << "'";
+ f->dump_string("error", msg.str());
+ }
+ }
} else if (command == "config diff") {
md_config_t def_conf;
def_conf.set_val("cluster", _conf->cluster);
<< "result is " << out->length() << " bytes" << dendl;
}
-
-CephContext::CephContext(uint32_t module_type_, int init_flags_)
+CephContext::CephContext(uint32_t module_type_,
+ enum code_environment_t code_env,
+ int init_flags_)
: nref(1),
- _conf(new md_config_t()),
+ _conf(new md_config_t(code_env == CODE_ENVIRONMENT_DAEMON)),
_log(NULL),
_module_type(module_type_),
_init_flags(init_flags_),
_admin_socket->register_command("perf histogram schema", "perf histogram schema", _admin_hook, "dump perf histogram schema");
_admin_socket->register_command("perf reset", "perf reset name=var,type=CephString", _admin_hook, "perf reset <name>: perf reset all or one perfcounter name");
_admin_socket->register_command("config show", "config show", _admin_hook, "dump current config settings");
+ _admin_socket->register_command("config help", "config help name=var,type=CephString,req=false", _admin_hook, "get config setting schema and descriptions");
_admin_socket->register_command("config set", "config set name=var,type=CephString name=val,type=CephString,n=N", _admin_hook, "config set <field> <val> [<val> ...]: set a config variable");
_admin_socket->register_command("config get", "config get name=var,type=CephString", _admin_hook, "config get <field>: get the config value");
_admin_socket->register_command("config diff",
_admin_socket->unregister_command("config show");
_admin_socket->unregister_command("config set");
_admin_socket->unregister_command("config get");
+ _admin_socket->unregister_command("config help");
_admin_socket->unregister_command("config diff");
_admin_socket->unregister_command("config diff get");
_admin_socket->unregister_command("log flush");
#include <boost/noncopyable.hpp>
#include "common/cmdparse.h"
+#include "common/code_environment.h"
#include "crush/CrushLocation.h"
#include "include/Spinlock.h"
*/
class CephContext {
public:
- CephContext(uint32_t module_type_, int init_flags_ = 0);
+ CephContext(uint32_t module_type_,
+ enum code_environment_t code_env=CODE_ENVIRONMENT_UTILITY,
+ int init_flags_ = 0);
// ref count!
private:
}
f->close_section();
}
+
+ // Visitor overload for a list of doubles: opens an array section named
+ // after `key` and writes each element as a float entry called "item".
+ void operator()(const std::vector<double> &operand) const
+ {
+ f->open_array_section(key.c_str());
+ for (auto it = operand.begin(); it != operand.end(); ++it) {
+ f->dump_float("item", *it);
+ }
+ f->close_section();
+ }
};
//f->open_object_section("cmdmap");
case json_spirit::array_type:
{
// array is a vector of values. Unpack it to a vector
- // of strings or int64_t, the only types we handle.
+ // of strings, doubles, or int64_t, the only types we handle.
const vector<json_spirit::mValue>& spvals = it->second.get_array();
if (spvals.empty()) {
// if an empty array is acceptable, the caller should always check for
outv.push_back(sv.get_int64());
}
(*mapp)[it->first] = std::move(outv);
+ } else if (spvals.front().type() == json_spirit::real_type) {
+ // Array whose first element is a real: unpack it into a vector<double>.
+ vector<double> outv;
+ for (const auto& sv : spvals) {
+ // Check the element being visited, not the first one again; testing
+ // spvals.front() here could never detect a mixed-type array.
+ if (sv.type() != json_spirit::real_type) {
+ throw(runtime_error("Can't handle arrays of multiple types"));
+ }
+ outv.push_back(sv.get_real());
+ }
+ (*mapp)[it->first] = std::move(outv);
} else {
throw(runtime_error("Can't handle arrays of types other than "
- "int or string"));
+ "int, string, or double"));
}
}
break;
int64_t,
double,
std::vector<std::string>,
- std::vector<int64_t>> cmd_vartype;
+ std::vector<int64_t>,
+ std::vector<double>> cmd_vartype;
typedef std::map<std::string, cmd_vartype> cmdmap_t;
std::string cmddesc_get_prefix(const std::string &cmddesc);
v = lat.p->cache[slot];
if (v) {
if (CEQ()(*v, k)) {
- if (flags & (FLAG_LOCK|FLAG_UNLOCK))
+ if ((flags & FLAG_LOCK) && (flags & FLAG_UNLOCK))
lat.lock->unlock();
return v;
}
lat.p->cache[slot] = v;
}
}
- if (flags & (FLAG_LOCK|FLAG_UNLOCK))
+ if ((flags & FLAG_LOCK) && (flags & FLAG_UNLOCK))
lat.lock->unlock();
return v;
} /* find_latch */
g_code_env = code_env;
// Create a configuration object
- CephContext *cct = new CephContext(iparams.module_type, flags);
+ CephContext *cct = new CephContext(iparams.module_type, code_env, flags);
md_config_t *conf = cct->_conf;
// add config observers here
if (data_dir_option)
conf->data_dir_option = data_dir_option;
- // Set some defaults based on code type
- switch (code_env) {
- case CODE_ENVIRONMENT_DAEMON:
- conf->set_val_or_die("daemonize", "true");
- conf->set_val_or_die("log_to_stderr", "false");
- conf->set_val_or_die("err_to_stderr", "true");
-
- // different default keyring locations for osd and mds. this is
- // for backward compatibility. moving forward, we want all keyrings
- // in these locations. the mon already forces $mon_data/keyring.
- if (conf->name.is_mds())
- conf->set_val("keyring", "$mds_data/keyring", false);
- else if (conf->name.is_osd())
- conf->set_val("keyring", "$osd_data/keyring", false);
- break;
-
- case CODE_ENVIRONMENT_UTILITY_NODOUT:
- case CODE_ENVIRONMENT_LIBRARY:
+ // different default keyring locations for osd and mds. this is
+ // for backward compatibility. moving forward, we want all keyrings
+ // in these locations. the mon already forces $mon_data/keyring.
+ if (conf->name.is_mds()) {
+ conf->set_val("keyring", "$mds_data/keyring", false);
+ } else if (conf->name.is_osd()) {
+ conf->set_val("keyring", "$osd_data/keyring", false);
+ }
+
+ if (code_env == CODE_ENVIRONMENT_LIBRARY ||
+ code_env == CODE_ENVIRONMENT_UTILITY_NODOUT) {
conf->set_val_or_die("log_to_stderr", "false");
conf->set_val_or_die("err_to_stderr", "false");
conf->set_val_or_die("log_flush_on_exit", "false");
- break;
-
- default:
- break;
- }
-
- if (flags & CINIT_FLAG_UNPRIVILEGED_DAEMON_DEFAULTS) {
- // do nothing special! we used to do no default log, pid_file,
- // admin_socket, but changed our minds. let's make ceph-fuse
- // and radosgw use the same defaults as ceph-{osd,mon,mds,...}
- } else if (code_env != CODE_ENVIRONMENT_DAEMON) {
- // no default log, pid_file, admin_socket
- conf->set_val_or_die("pid_file", "");
- conf->set_val_or_die("admin_socket", "");
- conf->set_val_or_die("log_file", "");
- // use less memory for logs
- conf->set_val_or_die("log_max_recent", "500");
}
return cct;
#include "common/ceph_argparse.h"
#include "common/common_init.h"
#include "common/config.h"
-#include "common/config_validators.h"
#include "include/str_list.h"
#include "include/stringify.h"
#include "osd/osd_types.h"
return ret;
}
-#define OPTION(name, type, def_val)
-#define OPTION_VALIDATOR(name) \
-struct md_config_t::option_##name##_t { \
- typedef decltype(md_config_t::name) type; \
-};
-#define SAFE_OPTION(name, type, def_val)
-#define SUBSYS(name, log, gather)
-#define DEFAULT_SUBSYS(log, gather)
-#include "common/config_opts.h"
+
+
+md_config_t::md_config_t(bool is_daemon)
+ : cluster(""),
+ lock("md_config_t", true, false)
+{
+ init_subsys();
+
+ // Load the compile-time list of Option into
+ // a map so that we can resolve keys quickly.
+ for (const auto &i : ceph_options) {
+ if (schema.count(i.name)) {
+ // We may be instantiated pre-logging so send this to stderr
+ std::cerr << "Duplicate config key in schema: '" << i.name << "'"
+ << std::endl;
+ assert(false);
+ }
+ schema.insert({i.name, i});
+ }
+
+ // Populate list of legacy_values according to the OPTION() definitions
+ // Note that this is just setting up our map of name->member ptr. The
+ // default values etc will get loaded in along with new-style data,
+ // as all loads write to both the values map, and the legacy
+ // members if present.
+ legacy_values = {
+#define OPTION(name, type) \
+ {std::string(STRINGIFY(name)), &md_config_t::name},
+#define SAFE_OPTION(name, type) OPTION(name, type)
+#include "common/legacy_config_opts.h"
#undef OPTION
-#undef OPTION_VALIDATOR
#undef SAFE_OPTION
-#undef SUBSYS
-#undef DEFAULT_SUBSYS
+ };
-namespace {
+ validate_schema();
-template <typename T>
-typename std::enable_if<!std::is_destructible<T>::value,
- md_config_t::validator_t>::type create_validator() {
- return md_config_t::validator_t();
-}
+ // Load default values from the schema
+ for (const auto &i : schema) {
+ const Option &opt = i.second;
+ bool has_daemon_default = !boost::get<boost::blank>(&opt.daemon_value);
+ Option::value_t default_val;
+ if (is_daemon && has_daemon_default) {
+ default_val = opt.daemon_value;
+ } else {
+ default_val = opt.value;
+ }
-template <typename T>
-typename std::enable_if<std::is_destructible<T>::value,
- md_config_t::validator_t>::type create_validator() {
- // if T is defined (and not just forward declared), it implies
- // that a validator function exists. use a dummy typed pointer to
- // pick the correct validator function
- return [](std::string *value, std::string *error_message) {
- return ::validate(reinterpret_cast<T*>(0), value, error_message);
- };
-}
+ if (opt.type == Option::TYPE_STR) {
+ // We call pre_validate as a sanity check, but also to get any
+ // side effect (value modification) from the validator.
+ std::string *def_str = boost::get<std::string>(&default_val);
+ std::string err;
+ if (opt.pre_validate(def_str, &err) != 0) {
+ std::cerr << "Default value " << opt.name << "=" << *def_str << " is "
+ "invalid: " << err << std::endl;
+
+ // This is the compiled-in default that is failing its own option's
+ // validation, so this is super-invalid and should never make it
+ // past a pull request: crash out.
+ assert(false);
+ }
+ }
-} // anonymous namespace
+ values[i.first] = default_val;
+ }
-md_config_t::md_config_t()
- : cluster(""),
+ // Copy out values (defaults) into any legacy (C struct member) fields
+ for (const auto &i : legacy_values) {
+ const auto &name = i.first;
+ const auto &option = schema.at(name);
+ auto ptr = i.second;
-#define OPTION_OPT_INT(name, def_val) name(def_val),
-#define OPTION_OPT_LONGLONG(name, def_val) name((1LL) * def_val),
-#define OPTION_OPT_STR(name, def_val) name(def_val),
-#define OPTION_OPT_DOUBLE(name, def_val) name(def_val),
-#define OPTION_OPT_FLOAT(name, def_val) name(def_val),
-#define OPTION_OPT_BOOL(name, def_val) name(def_val),
-#define OPTION_OPT_ADDR(name, def_val) name(def_val),
-#define OPTION_OPT_U32(name, def_val) name(def_val),
-#define OPTION_OPT_U64(name, def_val) name(((uint64_t)1) * def_val),
-#define OPTION_OPT_UUID(name, def_val) name(def_val),
-#define OPTION(name, type, def_val) OPTION_##type(name, def_val)
-#define OPTION_VALIDATOR(name)
-#define SAFE_OPTION(name, type, def_val) OPTION(name, type, def_val)
-#define SUBSYS(name, log, gather)
-#define DEFAULT_SUBSYS(log, gather)
-#include "common/config_opts.h"
-#undef OPTION_OPT_INT
-#undef OPTION_OPT_LONGLONG
-#undef OPTION_OPT_STR
-#undef OPTION_OPT_DOUBLE
-#undef OPTION_OPT_FLOAT
-#undef OPTION_OPT_BOOL
-#undef OPTION_OPT_ADDR
-#undef OPTION_OPT_U32
-#undef OPTION_OPT_U64
-#undef OPTION_OPT_UUID
-#undef OPTION
-#undef OPTION_VALIDATOR
-#undef SAFE_OPTION
-#undef SUBSYS
-#undef DEFAULT_SUBSYS
- lock("md_config_t", true, false)
+ update_legacy_val(option, ptr);
+ }
+}
+
+/**
+ * Sanity check schema. Assert out on failures, to ensure any bad changes
+ * cannot possibly pass any testing and make it into a release.
+ */
+void md_config_t::validate_schema()
{
- static const std::vector<md_config_t::config_option> s_config_options = {
-#define OPTION4(name, type, def_val, safe) \
- config_option{ STRINGIFY(name), type, &md_config_t::name, safe, \
- create_validator<option_##name##_t>() },
-#define OPTION(name, type, def_val) OPTION4(name, type, def_val, false)
-#define OPTION_VALIDATOR(name)
-#define SAFE_OPTION(name, type, def_val) OPTION4(name, type, def_val, true)
-#define SUBSYS(name, log, gather)
-#define DEFAULT_SUBSYS(log, gather)
-#include "common/config_opts.h"
-#undef OPTION4
-#undef OPTION
-#undef OPTION_VALIDATOR
-#undef SAFE_OPTION
-#undef SUBSYS
-#undef DEFAULT_SUBSYS
- };
- static std::shared_ptr<decltype(s_config_options)>
- s_tbl(new std::vector<md_config_t::config_option>(std::move(s_config_options)));
- config_options = s_tbl;
+ for (const auto &i : schema) {
+ const auto &opt = i.second;
+ for (const auto &see_also_key : opt.see_also) {
+ if (schema.count(see_also_key) == 0) {
+ std::cerr << "Non-existent see-also key '" << see_also_key
+ << "' on option '" << opt.name << "'" << std::endl;
+ assert(false);
+ }
+ }
+ }
- validate_default_settings();
- init_subsys();
+ for (const auto &i : legacy_values) {
+ if (schema.count(i.first) == 0) {
+ std::cerr << "Schema is missing legacy field '" << i.first << "'"
+ << std::endl;
+ assert(false);
+ }
+ }
}
void md_config_t::init_subsys()
subsys.add(ceph_subsys_##name, STRINGIFY(name), log, gather);
#define DEFAULT_SUBSYS(log, gather) \
subsys.add(ceph_subsys_, "none", log, gather);
-#define OPTION(a, b, c)
-#define OPTION_VALIDATOR(a)
-#define SAFE_OPTION(a, b, c)
-#include "common/config_opts.h"
-#undef OPTION
-#undef OPTION_VALIDATOR
-#undef SAFE_OPTION
+#include "common/subsys.h"
#undef SUBSYS
#undef DEFAULT_SUBSYS
}
string &s = *p;
if (s.find("$data_dir") != string::npos) {
if (data_dir_option.length()) {
- list<config_option const *> stack;
+ list<const Option *> stack;
expand_meta(s, NULL, stack, warnings);
p++;
} else {
std::vector <std::string> my_sections;
_get_my_sections(my_sections);
- for (auto& opt: *config_options) {
+ for (const auto &i : schema) {
+ const auto &opt = i.second;
std::string val;
int ret = _get_val_from_conf_file(my_sections, opt.name, val, false);
if (ret == 0) {
std::string error_message;
- int r = set_val_impl(val, &opt, &error_message);
+ int r = set_val_impl(val, opt, &error_message);
if (warnings != nullptr && (r != 0 || !error_message.empty())) {
*warnings << "parse error setting '" << opt.name << "' to '" << val
<< "'";
}
// subsystems?
- for (int o = 0; o < subsys.get_num(); o++) {
+ for (size_t o = 0; o < subsys.get_num(); o++) {
std::string as_option("debug_");
as_option += subsys.get_name(o);
std::string val;
f->dump_string("name", stringify(name));
f->dump_string("cluster", cluster);
}
- for (int o = 0; o < subsys.get_num(); o++) {
+ for (size_t o = 0; o < subsys.get_num(); o++) {
if (out)
*out << "debug_" << subsys.get_name(o)
<< " = " << subsys.get_log_level(o)
f->dump_string(debug_name.c_str(), ss.str());
}
}
- for (auto& opt: *config_options) {
+ for (const auto& i: schema) {
+ const Option &opt = i.second;
char *buf;
_get_val(opt.name, &buf, -1);
if (out)
*out << opt.name << " = " << buf << std::endl;
if (f)
- f->dump_string(opt.name, buf);
+ f->dump_string(opt.name.c_str(), buf);
free(buf);
}
}
ostream *oss)
{
int ret = 0;
- int o;
+ size_t o = 0;
std::string val;
// subsystems?
return ret;
}
- const char *option_name = nullptr;
+ std::string option_name;
std::string error_message;
o = 0;
- for (auto& opt_ref: *config_options) {
+ for (const auto& opt_iter: schema) {
+ const Option &opt = opt_iter.second;
ostringstream err;
- config_option const *opt = &opt_ref;
std::string as_option("--");
- as_option += opt->name;
- option_name = opt->name;
- if (opt->type == OPT_BOOL) {
+ as_option += opt.name;
+ option_name = opt.name;
+ if (opt.type == Option::TYPE_BOOL) {
int res;
if (ceph_argparse_binary_flag(args, i, &res, oss, as_option.c_str(),
(char*)NULL)) {
break;
} else {
std::string no("--no-");
- no += opt->name;
+ no += opt.name;
if (ceph_argparse_flag(args, i, no.c_str(), (char*)NULL)) {
ret = set_val_impl("false", opt, &error_message);
break;
ret = -EINVAL;
break;
}
- if (oss && ((!opt->is_safe()) &&
- (observers.find(opt->name) == observers.end()))) {
- *oss << "You cannot change " << opt->name << " using injectargs.\n";
+ if (oss && ((!opt.is_safe()) &&
+ (observers.find(opt.name) == observers.end()))) {
+ *oss << "You cannot change " << opt.name << " using injectargs.\n";
return -ENOSYS;
}
ret = set_val_impl(val, opt, &error_message);
}
if (ret != 0 || !error_message.empty()) {
- assert(option_name);
+ assert(!option_name.empty());
if (oss) {
*oss << "Parse error setting " << option_name << " to '"
<< val << "' using injectargs";
}
}
- if (o == (int)config_options->size()) {
+ if (o == schema.size()) {
// ignore
++i;
}
expand_all_meta();
+ // expand_all_meta could have modified anything. Copy it all out again.
+ for (const auto &i : legacy_values) {
+ const auto &name = i.first;
+ const auto &option = schema.at(name);
+ auto ptr = i.second;
+
+ update_legacy_val(option, ptr);
+ }
+
// create the reverse observer mapping, mapping observers to the set of
// changed keys that they'll get.
rev_obs_map_t robs;
return ret;
}
-void md_config_t::set_val_or_die(const char *key, const char *val)
+void md_config_t::set_val_or_die(const std::string &key,
+ const std::string &val,
+ bool meta)
{
- int ret = set_val(key, val);
- assert(ret == 0);
-}
-
-struct is_integer_member : public boost::static_visitor<bool> {
- template<typename T,
- typename boost::enable_if<boost::is_integral<T>, int>::type = 0>
- bool operator()(const T md_config_t::* /* member_ptr */) const {
- return true;
- }
- template<typename T,
- typename boost::enable_if_c<!boost::is_integral<T>::value, int>::type = 0>
- bool operator()(const T md_config_t::* /* member_ptr */) const {
- return false;
- }
-};
-
-struct is_float_member : public boost::static_visitor<bool> {
- template<typename T,
- typename boost::enable_if<boost::is_float<T>, int>::type = 0>
- bool operator()(const T md_config_t::* /* member_ptr */) const {
- return true;
- }
- template<typename T,
- typename boost::enable_if_c<!boost::is_float<T>::value, int>::type = 0>
- bool operator()(const T md_config_t::* /* member_ptr */) const {
- return false;
+ // Collect the reason from set_val() so a failure is explained before
+ // the assert below fires.
+ std::stringstream err;
+ int ret = set_val(key, val, meta, &err);
+ if (ret != 0) {
+ // End the line so the reason is not run together with the assert
+ // output that follows it on stderr.
+ std::cerr << "set_val_or_die(" << key << "): " << err.str() << std::endl;
}
-};
-
-bool md_config_t::config_option::is_safe() const {
- // for now integer and floating point options considered thread safe
- return safe ||
- boost::apply_visitor(is_integer_member(), md_member_ptr) ||
- boost::apply_visitor(is_float_member(), md_member_ptr);
-}
-
-md_config_t::config_option const *md_config_t::find_config_option(const std::string &normalized_key) const
-{
- auto opt_it = std::find_if(config_options->begin(),
- config_options->end(),
- [normalized_key](const config_option &opt) -> bool {
- return strcmp(normalized_key.c_str(), opt.name) == 0;
- });
- return config_options->end() == opt_it ? nullptr : &(*opt_it);
+ assert(ret == 0);
}
-int md_config_t::set_val(const char *key, const char *val, bool meta)
+int md_config_t::set_val(const std::string &key, const char *val,
+ bool meta, std::stringstream *err_ss)
{
Mutex::Locker l(lock);
- if (!key)
+ // Reject missing arguments up front; both paths report a reason
+ // through err_ss (when supplied) before returning -EINVAL.
+ if (key.empty()) {
+ if (err_ss) *err_ss << "No key specified";
return -EINVAL;
- if (!val)
+ }
+ if (!val) {
+ // Mirror the empty-key case so the caller is never left with a bare
+ // -EINVAL and an empty error stream.
+ if (err_ss) *err_ss << "No value specified";
return -EINVAL;
+ }
std::string v(val);
if (meta)
// subsystems?
if (strncmp(k.c_str(), "debug_", 6) == 0) {
- for (int o = 0; o < subsys.get_num(); o++) {
+ for (size_t o = 0; o < subsys.get_num(); o++) {
std::string as_option = "debug_" + subsys.get_name(o);
if (k == as_option) {
int log, gather;
int r = sscanf(v.c_str(), "%d/%d", &log, &gather);
if (r >= 1) {
- if (r < 2)
+ if (r < 2) {
gather = log;
- // cout << "subsys " << subsys.get_name(o) << " log " << log << " gather " << gather << std::endl;
+ }
subsys.set_log_level(o, log);
subsys.set_gather_level(o, gather);
+ if (err_ss) *err_ss << "Set " << k << " to " << log << "/" << gather;
return 0;
}
+ if (err_ss) {
+ *err_ss << "Invalid debug level, should be <int> or <int>/<int>";
+ }
return -EINVAL;
}
}
}
- config_option const *opt = find_config_option(k);
- if (opt) {
- if ((!opt->is_safe()) && internal_safe_to_start_threads) {
+ const auto &opt_iter = schema.find(k);
+ if (opt_iter != schema.end()) {
+ const Option &opt = opt_iter->second;
+ if ((!opt.is_safe()) && internal_safe_to_start_threads) {
// If threads have been started and the option is not thread safe
- if (observers.find(opt->name) == observers.end()) {
+ if (observers.find(opt.name) == observers.end()) {
// And there is no observer to safely change it...
// You lose.
+ if (err_ss) *err_ss << "Configuration option '" << key << "' may "
+ "not be modified at runtime";
return -ENOSYS;
}
}
std::string error_message;
int r = set_val_impl(v, opt, &error_message);
+ if (r == 0) {
+ if (err_ss) *err_ss << "Set " << opt.name << " to " << v;
+ } else {
+ if (err_ss) *err_ss << error_message;
+ }
return r;
}
- // couldn't find a configuration option with key 'key'
+ if (err_ss) *err_ss << "Configuration option not found: '" << key << "'";
return -ENOENT;
}
-int md_config_t::get_val(const char *key, char **buf, int len) const
+int md_config_t::get_val(const std::string &key, char **buf, int len) const
{
Mutex::Locker l(lock);
return _get_val(key, buf,len);
}
-md_config_t::config_value_t md_config_t::get_val_generic(const char *key) const
+Option::value_t md_config_t::get_val_generic(const std::string &key) const
{
Mutex::Locker l(lock);
return _get_val(key);
}
-class get_value_generic_visitor : public boost::static_visitor<md_config_t::config_value_t> {
- md_config_t const *conf;
-public:
- explicit get_value_generic_visitor(md_config_t const *conf_) : conf(conf_) { }
- template<typename T> md_config_t::config_value_t operator()(const T md_config_t::* member_ptr) {
- return md_config_t::config_value_t(conf->*member_ptr);
- }
-};
-
-md_config_t::config_value_t md_config_t::_get_val(const char *key) const
+Option::value_t md_config_t::_get_val(const std::string &key) const
{
assert(lock.is_locked());
- if (!key)
- return config_value_t(invalid_config_value_t());
+ if (key.empty()) {
+ return Option::value_t(boost::blank());
+ }
// In key names, leading and trailing whitespace are not significant.
string k(ConfFile::normalize_key_name(key));
- config_option const *opt = find_config_option(k);
- if (!opt) {
- return config_value_t(invalid_config_value_t());
+ const auto &opt_iter = schema.find(k);
+ if (opt_iter != schema.end()) {
+ // Using .at() is safe because all keys in the schema always have
+ // entries in ::values
+ return values.at(k);
+ } else {
+ return Option::value_t(boost::blank());
}
- get_value_generic_visitor gvv(this);
- return boost::apply_visitor(gvv, opt->md_member_ptr);
}
-int md_config_t::_get_val(const char *key, std::string *value) const {
+int md_config_t::_get_val(const std::string &key, std::string *value) const {
assert(lock.is_locked());
std::string normalized_key(ConfFile::normalize_key_name(key));
- config_value_t config_value = _get_val(normalized_key.c_str());
- if (!boost::get<invalid_config_value_t>(&config_value)) {
+ Option::value_t config_value = _get_val(normalized_key.c_str());
+ if (!boost::get<boost::blank>(&config_value)) {
ostringstream oss;
if (bool *flag = boost::get<bool>(&config_value)) {
oss << (*flag ? "true" : "false");
- } else if (float *fp = boost::get<float>(&config_value)) {
- oss << std::fixed << *fp ;
} else if (double *dp = boost::get<double>(&config_value)) {
- oss << std::fixed << *dp ;
+ oss << std::fixed << *dp;
} else {
oss << config_value;
}
return -ENOENT;
}
-int md_config_t::_get_val(const char *key, char **buf, int len) const
+int md_config_t::_get_val(const std::string &key, char **buf, int len) const
{
assert(lock.is_locked());
- if (!key)
+ if (key.empty())
return -EINVAL;
string val ;
- if (!_get_val(key, &val)) {
+ if (_get_val(key, &val) == 0) {
int l = val.length() + 1;
if (len == -1) {
*buf = (char*)malloc(l);
string k(ConfFile::normalize_key_name(key));
// subsys?
- for (int o = 0; o < subsys.get_num(); o++) {
+ for (size_t o = 0; o < subsys.get_num(); o++) {
std::string as_option = "debug_" + subsys.get_name(o);
if (k == as_option) {
if (len == -1) {
const std::string negative_flag_prefix("no_");
keys->clear();
- keys->reserve(config_options->size());
- for (auto& opt: *config_options) {
+ keys->reserve(schema.size());
+ for (const auto &i: schema) {
+ const Option &opt = i.second;
keys->push_back(opt.name);
- if (opt.type == OPT_BOOL) {
+ if (opt.type == Option::TYPE_BOOL) {
keys->push_back(negative_flag_prefix + opt.name);
}
}
- for (int i = 0; i < subsys.get_num(); ++i) {
+ for (size_t i = 0; i < subsys.get_num(); ++i) {
keys->push_back("debug_" + subsys.get_name(i));
}
}
}
+// Locking wrapper: takes the config lock, then delegates to
+// _get_val_from_conf_file() with the same arguments.
int md_config_t::get_val_from_conf_file(const std::vector <std::string> &sections,
- const char *key, std::string &out, bool emeta) const
+ const std::string &key, std::string &out, bool emeta) const
{
Mutex::Locker l(lock);
return _get_val_from_conf_file(sections, key, out, emeta);
}
+// Variant that does not take the lock itself: the caller must already
+// hold the config lock (asserted on entry).
int md_config_t::_get_val_from_conf_file(const std::vector <std::string> &sections,
- const char *key, std::string &out, bool emeta) const
+ const std::string &key, std::string &out, bool emeta) const
{
assert(lock.is_locked());
std::vector <std::string>::const_iterator s = sections.begin();
return -ENOENT;
}
-int md_config_t::set_val_impl(const std::string &val, config_option const *opt,
+int md_config_t::set_val_impl(const std::string &raw_val, const Option &opt,
std::string *error_message)
{
assert(lock.is_locked());
- std::string value(val);
- if (opt->validator) {
- int r = opt->validator(&value, error_message);
- if (r < 0) {
- return r;
- }
- }
- int ret = set_val_raw(value.c_str(), opt);
- if (ret)
- return ret;
- changed.insert(opt->name);
- return 0;
-}
+ std::string val = raw_val;
-template<typename T> struct strtox_helper;
+ int r = opt.pre_validate(&val, error_message);
+ if (r != 0) {
+ return r;
+ }
-template<> struct strtox_helper<float> {
- static inline void apply(const char *val, float &x, std::string &err) {
- x = strict_strtof(val, &err);
+ Option::value_t new_value;
+ if (opt.type == Option::TYPE_INT) {
+ int64_t f = strict_si_cast<int64_t>(val.c_str(), error_message);
+ if (!error_message->empty()) {
+ return -EINVAL;
+ }
+ new_value = f;
+ } else if (opt.type == Option::TYPE_UINT) {
+ uint64_t f = strict_si_cast<uint64_t>(val.c_str(), error_message);
+ if (!error_message->empty()) {
+ return -EINVAL;
+ }
+ new_value = f;
+ } else if (opt.type == Option::TYPE_STR) {
+ new_value = val;
+ } else if (opt.type == Option::TYPE_FLOAT) {
+ double f = strict_strtod(val.c_str(), error_message);
+ if (!error_message->empty()) {
+ return -EINVAL;
+ } else {
+ new_value = f;
+ }
+ } else if (opt.type == Option::TYPE_BOOL) {
+ if (strcasecmp(val.c_str(), "false") == 0) {
+ new_value = false;
+ } else if (strcasecmp(val.c_str(), "true") == 0) {
+ new_value = true;
+ } else {
+ int b = strict_strtol(val.c_str(), 10, error_message);
+ if (!error_message->empty()) {
+ return -EINVAL;
+ }
+ new_value = !!b;
+ }
+ } else if (opt.type == Option::TYPE_ADDR) {
+ entity_addr_t addr;
+ if (!addr.parse(val.c_str())){
+ // Say why we failed: set_val() forwards *error_message to its
+ // caller, which would otherwise receive an empty reason.
+ if (error_message) {
+ *error_message = "'" + val + "' is not a valid address";
+ }
+ return -EINVAL;
+ }
+ new_value = addr;
+ } else if (opt.type == Option::TYPE_UUID) {
+ uuid_d uuid;
+ if (!uuid.parse(val.c_str())) {
+ // Same as the address case: never return -EINVAL without a reason.
+ if (error_message) {
+ *error_message = "'" + val + "' is not a valid UUID";
+ }
+ return -EINVAL;
+ }
+ new_value = uuid;
+ } else {
+ ceph_abort();
}
-};
-template<> struct strtox_helper<double> {
- static inline void apply(const char *val, double &x, std::string &err) {
- x = strict_strtod(val, &err);
+ r = opt.validate(new_value, error_message);
+ if (r != 0) {
+ return r;
}
-};
-template<typename T> static inline int strict_strtox(const char *val, T &x) {
- std::string err;
- strtox_helper<T>::apply(val, x, err);
- return err.empty() ? 0 : -EINVAL;
-}
-class set_value_visitor : public boost::static_visitor<int> {
- md_config_t const *conf;
- const char *val;
-public:
- explicit set_value_visitor(md_config_t const *conf_, const char *val_) :
- conf(conf_), val(val_) { }
+ // Apply the value to its entry in the `values` map
+ values[opt.name] = new_value;
- int operator()(const std::string md_config_t::* member_ptr) {
- auto *ptr = const_cast<std::string *>(&(conf->*member_ptr));
- *ptr = val ? val : "";
- return 0;
+ // Apply the value to its legacy field, if it has one
+ auto legacy_ptr_iter = legacy_values.find(std::string(opt.name));
+ if (legacy_ptr_iter != legacy_values.end()) {
+ update_legacy_val(opt, legacy_ptr_iter->second);
}
- int operator()(const bool md_config_t::* member_ptr) {
- bool *ptr = const_cast<bool *>(&(conf->*member_ptr));
- if (strcasecmp(val, "false") == 0) {
- *ptr = false;
- } else if (strcasecmp(val, "true") == 0) {
- *ptr = true;
- } else {
- std::string err;
- int b = strict_strtol(val, 10, &err);
- if (!err.empty()) {
- return -EINVAL;
- }
- *ptr = !!b;
- }
- return 0;
- }
-
- // type has parse() member function
- template<typename T,
- typename boost::enable_if<boost::is_member_function_pointer<decltype(&T::parse)>, int>::type = 0>
- int operator()(const T md_config_t::* member_ptr) {
- T *obj = const_cast<T *>(&(conf->*member_ptr));
- if (!obj->parse(val)) {
- return -EINVAL;
- }
- return 0;
- }
+ changed.insert(opt.name);
+ return 0;
+}
- // float, double
- template<typename T,
- typename boost::enable_if<boost::is_floating_point<T>, int>::type = 0>
- int operator()(const T md_config_t::* member_ptr) {
- T* ptr = const_cast<T *>(&(conf->*member_ptr));
- return strict_strtox(val, *ptr);
- }
+/**
+ * Handles assigning from a variant-of-types to a variant-of-pointers-to-types
+ */
+class assign_visitor : public boost::static_visitor<>
+{
+ md_config_t *conf;
+ Option::value_t val;
+ public:
- // integers
- template<typename T,
- typename boost::enable_if_c<boost::is_integral<T>::value &&
- !boost::is_same<T, bool>::value, int>::type = 0>
- int operator()(const T md_config_t::* member_ptr) {
- std::string err;
- T f = strict_si_cast<T>(val, &err);
- if (!err.empty()) {
- return -EINVAL;
- }
- T *ptr = const_cast<T *>(&(conf->*member_ptr));
- *ptr = f;
- return 0;
+ assign_visitor(md_config_t *conf_, Option::value_t val_)
+ : conf(conf_), val(val_)
+ {}
+
+ template <typename T>
+ void operator()( T md_config_t::* ptr) const
+ {
+ T *member = const_cast<T *>(&(conf->*(boost::get<const T md_config_t::*>(ptr))));
+
+ *member = boost::get<T>(val);
}
};
-int md_config_t::set_val_raw(const char *val, config_option const *opt)
+// Copy the value currently stored for `opt` in the values map into the
+// legacy md_config_t member designated by member_ptr.
+void md_config_t::update_legacy_val(const Option &opt,
+ md_config_t::member_ptr_t member_ptr)
{
- assert(lock.is_locked());
- set_value_visitor svv(this, val);
- return boost::apply_visitor(svv, opt->md_member_ptr);
+ const Option::value_t &current = values.at(opt.name);
+ if (boost::get<boost::blank>(&current) != nullptr) {
+ // A blank entry is not expected here; if one does turn up, leave
+ // the legacy field untouched rather than attempt the assignment.
+ return;
+ }
+
+ boost::apply_visitor(assign_visitor(this, current), member_ptr);
}
+
static const char *CONF_METAVARIABLES[] = {
"data_dir", // put this first: it may contain some of the others
"cluster", "type", "name", "host", "num", "id", "pid", "cctid"
{
// Expand all metavariables
ostringstream oss;
- for (auto& opt: *config_options) {
- std::string *str;
- opt.conf_ptr(str, this);
- if (str) {
- list<config_option const *> stack;
+ for (const auto &i : schema) {
+ const Option &opt = i.second;
+
+ if (opt.type == Option::TYPE_STR) {
+ list<const Option*> stack;
+ std::string *str = boost::get<std::string>(&(values.at(opt.name)));
+ assert(str != nullptr); // Non-string values should never get in
expand_meta(*str, &opt, stack, &oss);
}
}
cerr << oss.str();
}
-bool md_config_t::expand_meta(std::string &origval,
+bool md_config_t::expand_meta(std::string &val, // string handed to the four-argument overload
std::ostream *oss) const // stream forwarded unchanged to that overload
{
- list<config_option const *> stack;
- return expand_meta(origval, NULL, stack, oss);
+ list<const Option*> stack; // start with an empty expansion stack
+ return expand_meta(val, NULL, stack, oss); // NULL: this value is not tied to any Option
}
bool md_config_t::expand_meta(std::string &origval,
- config_option const *opt,
- std::list<config_option const *> stack,
+ const Option *opt,
+ std::list<const Option *> stack,
std::ostream *oss) const
{
assert(lock.is_locked());
// ignore an expansion loop and create a human readable
// message about it
if (opt) {
- for (list<config_option const *>::iterator i = stack.begin();
- i != stack.end();
- ++i) {
- if (strcmp(opt->name, (*i)->name) == 0) {
+ for (const auto stack_ptr : stack) {
+ if (opt->name == stack_ptr->name) {
*oss << "variable expansion loop at "
<< opt->name << "=" << origval << std::endl;
*oss << "expansion stack: " << std::endl;
- for (list<config_option const *>::iterator j = stack.begin();
- j != stack.end();
- ++j) {
- *oss << (*j)->name << "=" << *((*j)->conf_ptr<std::string>(this)) << std::endl;
+ for (const auto j : stack) {
+ std::string val;
+ _get_val(j->name, &val);
+ *oss << j->name << "=" << val << std::endl;
}
return false;
}
}
- }
- if (opt)
stack.push_front(opt);
+ }
bool found_meta = false;
string out;
if (!expanded) {
// config option?
- for (auto& opt: *config_options) {
- if (var == opt.name) {
- string *origval;
- opt.conf_ptr(origval, const_cast<md_config_t *>(this));
- if (origval) {
- expand_meta(*origval, &opt, stack, oss);
- out += *origval;
- } else {
- char *vv = NULL;
- _get_val(opt.name, &vv, -1);
- out += vv;
- free(vv);
- }
- expanded = true;
- break;
- }
+ const auto other_opt_iter = schema.find(var);
+ if (other_opt_iter != schema.end()) {
+ const Option &other_opt = other_opt_iter->second;
+ if (other_opt.type == Option::TYPE_STR) {
+ // The referenced option is a string, it may need substitution
+ // before inserting.
+ Option::value_t *other_val_ptr = const_cast<Option::value_t*>(&(values.at(other_opt.name)));
+ std::string *other_opt_val = boost::get<std::string>(other_val_ptr);
+ expand_meta(*other_opt_val, &other_opt, stack, oss);
+ out += *other_opt_val;
+ } else {
+ // The referenced option is not a string: retrieve and insert
+ // its stringized form.
+ char *vv = NULL;
+ _get_val(other_opt.name, &vv, -1);
+ out += vv;
+ free(vv);
+ }
+ expanded = true;
}
}
}
char local_buf[4096];
char other_buf[4096];
- for (auto& opt : *config_options) {
+ for (const auto &i : schema) {
+ const Option &opt = i.second;
if (!setting.empty()) {
if (setting != opt.name) {
continue;
::complain_about_parse_errors(cct, &parse_errors);
}
-void md_config_t::validate_default_settings() {
- Mutex::Locker l(lock);
- for (auto &opt : *config_options) {
- // normalize config defaults using their validator
- if (opt.validator) {
- std::string value;
- int r = _get_val(opt.name, &value);
- assert(r == 0);
-
- std::string error_message;
- r = set_val_impl(value.c_str(), &opt, &error_message);
- assert(r == 0);
- }
- }
-}
#include "common/ConfUtils.h"
#include "common/entity_name.h"
+#include "common/code_environment.h"
#include "common/Mutex.h"
#include "log/SubsystemMap.h"
#include "common/config_obs.h"
+#include "common/options.h"
#define OSD_REP_PRIMARY 0
#define OSD_REP_SPLAY 1
*/
struct md_config_t {
public:
+ typedef boost::variant<int64_t md_config_t::*,
+ uint64_t md_config_t::*,
+ std::string md_config_t::*,
+ double md_config_t::*,
+ bool md_config_t::*,
+ entity_addr_t md_config_t::*,
+ uuid_d md_config_t::*> member_ptr_t;
+
/* Maps configuration options to the observer listening for them. */
typedef std::multimap <std::string, md_config_obs_t*> obs_map_t;
* apply_changes */
typedef std::set < std::string > changed_set_t;
- struct invalid_config_value_t { };
- typedef boost::variant<invalid_config_value_t,
- int,
- long long,
- std::string,
- double,
- float,
- bool,
- entity_addr_t,
- uint32_t,
- uint64_t,
- uuid_d> config_value_t;
- typedef boost::variant<const int md_config_t::*,
- const long long md_config_t::*,
- const std::string md_config_t::*,
- const double md_config_t::*,
- const float md_config_t::*,
- const bool md_config_t::*,
- const entity_addr_t md_config_t::*,
- const uint32_t md_config_t::*,
- const uint64_t md_config_t::*,
- const uuid_d md_config_t::*> member_ptr_t;
+ /*
+ * Mapping from legacy config option names to class members
+ */
+ std::map<std::string, md_config_t::member_ptr_t> legacy_values;
+
+ /**
+ * The configuration schema, in the form of Option objects describing
+ * possible settings.
+ */
+ std::map<std::string, const Option &> schema;
+
+ /**
+ * The current values of all settings described by the schema
+ */
+ std::map<std::string, Option::value_t> values;
typedef enum {
OPT_INT, OPT_LONGLONG, OPT_STR, OPT_DOUBLE, OPT_FLOAT, OPT_BOOL,
OPT_ADDR, OPT_U32, OPT_U64, OPT_UUID
} opt_type_t;
- typedef std::function<int(std::string*, std::string*)> validator_t;
-
- class config_option {
- public:
- const char *name;
- opt_type_t type;
- md_config_t::member_ptr_t md_member_ptr;
- bool safe; // promise to access it only via md_config_t::get_val
- validator_t validator;
- private:
- template<typename T> struct get_typed_pointer_visitor : public boost::static_visitor<T const *> {
- md_config_t const *conf;
- explicit get_typed_pointer_visitor(md_config_t const *conf_) : conf(conf_) { }
- template<typename U,
- typename boost::enable_if<boost::is_same<T, U>, int>::type = 0>
- T const *operator()(const U md_config_t::* member_ptr) {
- return &(conf->*member_ptr);
- }
- template<typename U,
- typename boost::enable_if_c<!boost::is_same<T, U>::value, int>::type = 0>
- T const *operator()(const U md_config_t::* member_ptr) {
- return nullptr;
- }
- };
- public:
- // is it OK to alter the value when threads are running?
- bool is_safe() const;
- // Given a configuration, return a pointer to this option inside
- // that configuration.
- template<typename T> void conf_ptr(T const *&ptr, md_config_t const *conf) const {
- get_typed_pointer_visitor<T> gtpv(conf);
- ptr = boost::apply_visitor(gtpv, md_member_ptr);
- }
- template<typename T> void conf_ptr(T *&ptr, md_config_t *conf) const {
- get_typed_pointer_visitor<T> gtpv(conf);
- ptr = const_cast<T *>(boost::apply_visitor(gtpv, md_member_ptr));
- }
- template<typename T> T const *conf_ptr(md_config_t const *conf) const {
- get_typed_pointer_visitor<T> gtpv(conf);
- return boost::apply_visitor(gtpv, md_member_ptr);
- }
- template<typename T> T *conf_ptr(md_config_t *conf) const {
- get_typed_pointer_visitor<T> gtpv(conf);
- return const_cast<T *>(boost::apply_visitor(gtpv, md_member_ptr));
- }
- };
-
// Create a new md_config_t structure.
- md_config_t();
+ md_config_t(bool is_daemon=false);
~md_config_t();
// Adds a new observer to this configuration. You can do this at any time,
// Set a configuration value, or crash
// Metavariables will be expanded.
- void set_val_or_die(const char *key, const char *val);
+ void set_val_or_die(const std::string &key, const std::string &val,
+ bool meta=true);
// Set a configuration value.
// Metavariables will be expanded.
- int set_val(const char *key, const char *val, bool meta=true);
- int set_val(const char *key, const string& s, bool meta=true) {
- return set_val(key, s.c_str(), meta);
+ int set_val(const std::string &key, const char *val, bool meta=true,
+ std::stringstream *err_ss=nullptr);
+ int set_val(const std::string &key, const string& s, bool meta=true,
+ std::stringstream *err_ss=nullptr) {
+ return set_val(key, s.c_str(), meta, err_ss);
}
// Get a configuration value.
// No metavariables will be returned (they will have already been expanded)
- int get_val(const char *key, char **buf, int len) const;
- int _get_val(const char *key, char **buf, int len) const;
- config_value_t get_val_generic(const char *key) const;
- template<typename T> T get_val(const char *key) const;
+ int get_val(const std::string &key, char **buf, int len) const;
+ int _get_val(const std::string &key, char **buf, int len) const;
+ Option::value_t get_val_generic(const std::string &key) const;
+ template<typename T> T get_val(const std::string &key) const;
void get_all_keys(std::vector<std::string> *keys) const;
// Get a value from the configuration file that we read earlier.
// Metavariables will be expanded if emeta is true.
int get_val_from_conf_file(const std::vector <std::string> §ions,
- const char *key, std::string &out, bool emeta) const;
+ std::string const &key, std::string &out, bool emeta) const;
/// dump all config values to a stream
void show_config(std::ostream& out);
void complain_about_parse_errors(CephContext *cct);
private:
+ void validate_schema();
void validate_default_settings();
- int _get_val(const char *key, std::string *value) const;
- config_value_t _get_val(const char *key) const;
+ int _get_val(const std::string &key, std::string *value) const;
+ Option::value_t _get_val(const std::string &key) const;
void _show_config(std::ostream *out, Formatter *f);
void _get_my_sections(std::vector <std::string> §ions) const;
int _get_val_from_conf_file(const std::vector <std::string> §ions,
- const char *key, std::string &out, bool emeta) const;
+ const std::string &key, std::string &out, bool emeta) const;
int parse_option(std::vector<const char*>& args,
std::vector<const char*>::iterator& i,
int parse_config_files_impl(const std::list<std::string> &conf_files,
std::ostream *warnings);
- int set_val_impl(const std::string &val, config_option const *opt,
+ int set_val_impl(const std::string &val, const Option &opt,
std::string *error_message);
- int set_val_raw(const char *val, config_option const *opt);
+
+ template <typename T>
+ void assign_member(member_ptr_t ptr, const Option::value_t &val);
+
+
+ void update_legacy_val(const Option &opt,
+ md_config_t::member_ptr_t member);
void init_subsys();
}
private:
bool expand_meta(std::string &val,
- config_option const *opt,
- std::list<config_option const *> stack,
+ const Option *opt,
+ std::list<const Option*> stack,
std::ostream *oss) const;
/// expand all metavariables in config structure.
/// cluster name
string cluster;
-#define OPTION_OPT_INT(name) const int name;
-#define OPTION_OPT_LONGLONG(name) const long long name;
-#define OPTION_OPT_STR(name) const std::string name;
-#define OPTION_OPT_DOUBLE(name) const double name;
-#define OPTION_OPT_FLOAT(name) const float name;
-#define OPTION_OPT_BOOL(name) const bool name;
-#define OPTION_OPT_ADDR(name) const entity_addr_t name;
-#define OPTION_OPT_U32(name) const uint32_t name;
-#define OPTION_OPT_U64(name) const uint64_t name;
-#define OPTION_OPT_UUID(name) const uuid_d name;
-#define OPTION(name, ty, init) \
+// This macro block defines C members of the md_config_t struct
+// corresponding to the definitions in legacy_config_opts.h.
+// These C members are consumed by code that was written before
+// the new options.cc infrastructure: all newer code should
+// consume options via explicit get() rather than C members.
+#define OPTION_OPT_INT(name) int64_t name;
+#define OPTION_OPT_LONGLONG(name) int64_t name;
+#define OPTION_OPT_STR(name) std::string name;
+#define OPTION_OPT_DOUBLE(name) double name;
+#define OPTION_OPT_FLOAT(name) double name;
+#define OPTION_OPT_BOOL(name) bool name;
+#define OPTION_OPT_ADDR(name) entity_addr_t name;
+#define OPTION_OPT_U32(name) uint64_t name;
+#define OPTION_OPT_U64(name) uint64_t name;
+#define OPTION_OPT_UUID(name) uuid_d name;
+#define OPTION(name, ty) \
public: \
- OPTION_##ty(name) \
- struct option_##name##_t;
-#define OPTION_VALIDATOR(name)
-#define SAFE_OPTION(name, ty, init) \
+ OPTION_##ty(name)
+#define SAFE_OPTION(name, ty) \
protected: \
- OPTION_##ty(name) \
- public: \
- struct option_##name##_t;
-#define SUBSYS(name, log, gather)
-#define DEFAULT_SUBSYS(log, gather)
-#include "common/config_opts.h"
+ OPTION_##ty(name)
+#include "common/legacy_config_opts.h"
#undef OPTION_OPT_INT
#undef OPTION_OPT_LONGLONG
#undef OPTION_OPT_STR
#undef OPTION_OPT_U64
#undef OPTION_OPT_UUID
#undef OPTION
-#undef OPTION_VALIDATOR
#undef SAFE_OPTION
-#undef SUBSYS
-#undef DEFAULT_SUBSYS
+public:
unsigned get_osd_pool_default_min_size() const {
return osd_pool_default_min_size ?
MIN(osd_pool_default_min_size, osd_pool_default_size) :
mutable Mutex lock;
friend class test_md_config_t;
-protected:
- // Tests and possibly users expect options to appear in the output
- // of ceph-conf in the same order as declared in config_opts.h
- std::shared_ptr<const std::vector<config_option>> config_options;
- config_option const *find_config_option(const std::string& normalized_key) const;
};
template<typename T>
}
};
-template<typename T> T md_config_t::get_val(const char *key) const {
- config_value_t generic_val = this->get_val_generic(key);
+template<typename T> T md_config_t::get_val(const std::string &key) const { // typed lookup of option `key`
+ Option::value_t generic_val = this->get_val_generic(key); // fetch the value as a variant first
get_typed_value_visitor<T> gtv; // visitor that produces the T to return; its mismatch handling is not visible here
return boost::apply_visitor(gtv, generic_val);
}
-inline std::ostream& operator<<(std::ostream& o, const md_config_t::invalid_config_value_t& ) {
+inline std::ostream& operator<<(std::ostream& o, const boost::blank& ) { // stream a boost::blank as a fixed placeholder
return o << "INVALID_CONFIG_VALUE"; // the argument carries no data, so it is unnamed and unused
}
int ceph_resolve_file_search(const std::string& filename_list,
std::string& result);
-typedef md_config_t::config_option config_option;
-
-
enum config_subsys_id { // one ceph_subsys_<name> enumerator per SUBSYS() entry in the included list
ceph_subsys_, // default; first enumerator, so its value is 0
-#define OPTION(a,b,c)
-#define OPTION_VALIDATOR(name)
-#define SAFE_OPTION(a,b,c)
#define SUBSYS(name, log, gather) \
ceph_subsys_##name,
#define DEFAULT_SUBSYS(log, gather)
-#include "common/config_opts.h"
+#include "common/subsys.h"
#undef SUBSYS
-#undef OPTION
-#undef OPTION_VALIDATOR
-#undef SAFE_OPTION
#undef DEFAULT_SUBSYS
ceph_subsys_max // last enumerator: equals the number of ids declared above it
};
+++ /dev/null
-// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
-// vim: ts=8 sw=2 smarttab
-/*
- * Ceph - scalable distributed file system
- *
- * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
- *
- * This is free software; you can redistribute it and/or
- * modify it under the terms of the GNU Lesser General Public
- * License version 2.1, as published by the Free Software
- * Foundation. See file COPYING.
- *
- */
-
-/* note: no header guard */
-OPTION(host, OPT_STR, "") // "" means that ceph will use short hostname
-OPTION(fsid, OPT_UUID, uuid_d())
-OPTION(public_addr, OPT_ADDR, entity_addr_t())
-OPTION(public_bind_addr, OPT_ADDR, entity_addr_t())
-OPTION(cluster_addr, OPT_ADDR, entity_addr_t())
-OPTION(public_network, OPT_STR, "")
-OPTION(cluster_network, OPT_STR, "")
-OPTION(num_client, OPT_INT, 1)
-OPTION(monmap, OPT_STR, "")
-OPTION(mon_host, OPT_STR, "")
-OPTION(mon_dns_srv_name, OPT_STR, "ceph-mon")
-OPTION(lockdep, OPT_BOOL, false)
-OPTION(lockdep_force_backtrace, OPT_BOOL, false) // always gather current backtrace at every lock
-OPTION(run_dir, OPT_STR, "/var/run/ceph") // the "/var/run/ceph" dir, created on daemon startup
-OPTION(admin_socket, OPT_STR, "$run_dir/$cluster-$name.asok") // default changed by common_preinit()
-OPTION(admin_socket_mode, OPT_STR, "") // permission bits to set for admin socket file, e.g., "0775", "0755"
-
-OPTION(daemonize, OPT_BOOL, false) // default changed by common_preinit()
-OPTION(setuser, OPT_STR, "") // uid or user name
-OPTION(setgroup, OPT_STR, "") // gid or group name
-OPTION(setuser_match_path, OPT_STR, "") // make setuser/group conditional on this path matching ownership
-OPTION(pid_file, OPT_STR, "") // default changed by common_preinit()
-OPTION(chdir, OPT_STR, "/")
-OPTION(max_open_files, OPT_LONGLONG, 0)
-OPTION(restapi_log_level, OPT_STR, "") // default set by Python code
-OPTION(restapi_base_url, OPT_STR, "") // "
-OPTION(fatal_signal_handlers, OPT_BOOL, true)
-SAFE_OPTION(erasure_code_dir, OPT_STR, CEPH_PKGLIBDIR"/erasure-code") // default location for erasure-code plugins
-
-OPTION(log_file, OPT_STR, "/var/log/ceph/$cluster-$name.log") // default changed by common_preinit()
-OPTION(log_max_new, OPT_INT, 1000) // default changed by common_preinit()
-OPTION(log_max_recent, OPT_INT, 10000) // default changed by common_preinit()
-OPTION(log_to_stderr, OPT_BOOL, true) // default changed by common_preinit()
-OPTION(err_to_stderr, OPT_BOOL, true) // default changed by common_preinit()
-OPTION(log_to_syslog, OPT_BOOL, false)
-OPTION(err_to_syslog, OPT_BOOL, false)
-OPTION(log_flush_on_exit, OPT_BOOL, true) // default changed by common_preinit()
-OPTION(log_stop_at_utilization, OPT_FLOAT, .97) // stop logging at (near) full
-OPTION(log_to_graylog, OPT_BOOL, false)
-OPTION(err_to_graylog, OPT_BOOL, false)
-OPTION(log_graylog_host, OPT_STR, "127.0.0.1")
-OPTION(log_graylog_port, OPT_INT, 12201)
-
-// options will take k/v pairs, or single-item that will be assumed as general
-// default for all, regardless of channel.
-// e.g., "info" would be taken as the same as "default=info"
-// also, "default=daemon audit=local0" would mean
-// "default all to 'daemon', override 'audit' with 'local0'
-OPTION(clog_to_monitors, OPT_STR, "default=true")
-OPTION(clog_to_syslog, OPT_STR, "false")
-OPTION(clog_to_syslog_level, OPT_STR, "info") // this level and above
-OPTION(clog_to_syslog_facility, OPT_STR, "default=daemon audit=local0")
-OPTION(clog_to_graylog, OPT_STR, "false")
-OPTION(clog_to_graylog_host, OPT_STR, "127.0.0.1")
-OPTION(clog_to_graylog_port, OPT_STR, "12201")
-
-OPTION(mon_cluster_log_to_syslog, OPT_STR, "default=false")
-OPTION(mon_cluster_log_to_syslog_level, OPT_STR, "info") // this level and above
-OPTION(mon_cluster_log_to_syslog_facility, OPT_STR, "daemon")
-OPTION(mon_cluster_log_file, OPT_STR,
- "default=/var/log/ceph/$cluster.$channel.log cluster=/var/log/ceph/$cluster.log")
-OPTION(mon_cluster_log_file_level, OPT_STR, "info")
-OPTION(mon_cluster_log_to_graylog, OPT_STR, "false")
-OPTION(mon_cluster_log_to_graylog_host, OPT_STR, "127.0.0.1")
-OPTION(mon_cluster_log_to_graylog_port, OPT_STR, "12201")
-
-OPTION(enable_experimental_unrecoverable_data_corrupting_features, OPT_STR, "")
-
-SAFE_OPTION(plugin_dir, OPT_STR, CEPH_PKGLIBDIR)
-
-OPTION(xio_trace_mempool, OPT_BOOL, false) // mempool allocation counters
-OPTION(xio_trace_msgcnt, OPT_BOOL, false) // incoming/outgoing msg counters
-OPTION(xio_trace_xcon, OPT_BOOL, false) // Xio message encode/decode trace
-OPTION(xio_queue_depth, OPT_INT, 128) // depth of Accelio msg queue
-OPTION(xio_mp_min, OPT_INT, 128) // default min mempool size
-OPTION(xio_mp_max_64, OPT_INT, 65536) // max 64-byte chunks (buffer is 40)
-OPTION(xio_mp_max_256, OPT_INT, 8192) // max 256-byte chunks
-OPTION(xio_mp_max_1k, OPT_INT, 8192) // max 1K chunks
-OPTION(xio_mp_max_page, OPT_INT, 4096) // max 1K chunks
-OPTION(xio_mp_max_hint, OPT_INT, 4096) // max size-hint chunks
-OPTION(xio_portal_threads, OPT_INT, 2) // xio portal threads per messenger
-OPTION(xio_max_conns_per_portal, OPT_INT, 32) // max xio_connections per portal/ctx
-OPTION(xio_transport_type, OPT_STR, "rdma") // xio transport type: {rdma or tcp}
-OPTION(xio_max_send_inline, OPT_INT, 512) // xio maximum threshold to send inline
-
-OPTION(compressor_zlib_isal, OPT_BOOL, false)
-OPTION(compressor_zlib_level, OPT_INT, 5) //regular zlib compression level, not applicable to isa-l optimized version
-
-OPTION(async_compressor_enabled, OPT_BOOL, false)
-OPTION(async_compressor_type, OPT_STR, "snappy")
-OPTION(async_compressor_threads, OPT_INT, 2)
-OPTION(async_compressor_thread_timeout, OPT_INT, 5)
-OPTION(async_compressor_thread_suicide_timeout, OPT_INT, 30)
-
-OPTION(plugin_crypto_accelerator, OPT_STR, "crypto_isal")
-
-OPTION(mempool_debug, OPT_BOOL, false)
-
-DEFAULT_SUBSYS(0, 5)
-SUBSYS(lockdep, 0, 1)
-SUBSYS(context, 0, 1)
-SUBSYS(crush, 1, 1)
-SUBSYS(mds, 1, 5)
-SUBSYS(mds_balancer, 1, 5)
-SUBSYS(mds_locker, 1, 5)
-SUBSYS(mds_log, 1, 5)
-SUBSYS(mds_log_expire, 1, 5)
-SUBSYS(mds_migrator, 1, 5)
-SUBSYS(buffer, 0, 1)
-SUBSYS(timer, 0, 1)
-SUBSYS(filer, 0, 1)
-SUBSYS(striper, 0, 1)
-SUBSYS(objecter, 0, 1)
-SUBSYS(rados, 0, 5)
-SUBSYS(rbd, 0, 5)
-SUBSYS(rbd_mirror, 0, 5)
-SUBSYS(rbd_replay, 0, 5)
-SUBSYS(journaler, 0, 5)
-SUBSYS(objectcacher, 0, 5)
-SUBSYS(client, 0, 5)
-SUBSYS(osd, 1, 5)
-SUBSYS(optracker, 0, 5)
-SUBSYS(objclass, 0, 5)
-SUBSYS(filestore, 1, 3)
-SUBSYS(journal, 1, 3)
-SUBSYS(ms, 0, 5)
-SUBSYS(mon, 1, 5)
-SUBSYS(monc, 0, 10)
-SUBSYS(paxos, 1, 5)
-SUBSYS(tp, 0, 5)
-SUBSYS(auth, 1, 5)
-SUBSYS(crypto, 1, 5)
-SUBSYS(finisher, 1, 1)
-SUBSYS(heartbeatmap, 1, 5)
-SUBSYS(perfcounter, 1, 5)
-SUBSYS(rgw, 1, 5) // log level for the Rados gateway
-SUBSYS(civetweb, 1, 10)
-SUBSYS(javaclient, 1, 5)
-SUBSYS(asok, 1, 5)
-SUBSYS(throttle, 1, 1)
-SUBSYS(refs, 0, 0)
-SUBSYS(xio, 1, 5)
-SUBSYS(compressor, 1, 5)
-SUBSYS(bluestore, 1, 5)
-SUBSYS(bluefs, 1, 5)
-SUBSYS(bdev, 1, 3)
-SUBSYS(kstore, 1, 5)
-SUBSYS(rocksdb, 4, 5)
-SUBSYS(leveldb, 4, 5)
-SUBSYS(memdb, 4, 5)
-SUBSYS(kinetic, 1, 5)
-SUBSYS(fuse, 1, 5)
-SUBSYS(mgr, 1, 5)
-SUBSYS(mgrc, 1, 5)
-SUBSYS(dpdk, 1, 5)
-SUBSYS(eventtrace, 1, 5)
-
-OPTION(key, OPT_STR, "")
-OPTION(keyfile, OPT_STR, "")
-OPTION(keyring, OPT_STR,
- // default changed by common_preinit() for mds and osd
- "/etc/ceph/$cluster.$name.keyring,/etc/ceph/$cluster.keyring,/etc/ceph/keyring,/etc/ceph/keyring.bin,"
-#if defined(__FreeBSD)
- "/usr/local/etc/ceph/$cluster.$name.keyring,/usr/local/etc/ceph/$cluster.keyring,"
- "/usr/local/etc/ceph/keyring,/usr/local/etc/ceph/keyring.bin,"
-#endif
- )
-OPTION(heartbeat_interval, OPT_INT, 5)
-OPTION(heartbeat_file, OPT_STR, "")
-OPTION(heartbeat_inject_failure, OPT_INT, 0) // force an unhealthy heartbeat for N seconds
-OPTION(perf, OPT_BOOL, true) // enable internal perf counters
-
-SAFE_OPTION(ms_type, OPT_STR, "async+posix") // messenger backend. It will be modified in runtime, so use SAFE_OPTION
-OPTION(ms_public_type, OPT_STR, "") // messenger backend
-OPTION(ms_cluster_type, OPT_STR, "") // messenger backend
-OPTION(ms_tcp_nodelay, OPT_BOOL, true)
-OPTION(ms_tcp_rcvbuf, OPT_INT, 0)
-OPTION(ms_tcp_prefetch_max_size, OPT_INT, 4096) // max prefetch size, we limit this to avoid extra memcpy
-OPTION(ms_initial_backoff, OPT_DOUBLE, .2)
-OPTION(ms_max_backoff, OPT_DOUBLE, 15.0)
-OPTION(ms_crc_data, OPT_BOOL, true)
-OPTION(ms_crc_header, OPT_BOOL, true)
-OPTION(ms_die_on_bad_msg, OPT_BOOL, false)
-OPTION(ms_die_on_unhandled_msg, OPT_BOOL, false)
-OPTION(ms_die_on_old_message, OPT_BOOL, false) // assert if we get a dup incoming message and shouldn't have (may be triggered by pre-541cd3c64be0dfa04e8a2df39422e0eb9541a428 code)
-OPTION(ms_die_on_skipped_message, OPT_BOOL, false) // assert if we skip a seq (kernel client does this intentionally)
-OPTION(ms_dispatch_throttle_bytes, OPT_U64, 100 << 20)
-OPTION(ms_bind_ipv6, OPT_BOOL, false)
-OPTION(ms_bind_port_min, OPT_INT, 6800)
-OPTION(ms_bind_port_max, OPT_INT, 7300)
-#if !defined(__FreeBSD__)
-OPTION(ms_bind_retry_count, OPT_INT, 3) // If binding fails, how many times do we retry to bind
-OPTION(ms_bind_retry_delay, OPT_INT, 5) // Delay between attemps to bind
-#else
-// FreeBSD does not use SO_REAUSEADDR so allow for a bit more time per default
-OPTION(ms_bind_retry_count, OPT_INT, 6) // If binding fails, how many times do we retry to bind
-OPTION(ms_bind_retry_delay, OPT_INT, 6) // Delay between attemps to bind
-#endif
-OPTION(ms_bind_before_connect, OPT_BOOL, false)
-OPTION(ms_tcp_listen_backlog, OPT_INT, 512)
-OPTION(ms_rwthread_stack_bytes, OPT_U64, 1024 << 10)
-OPTION(ms_tcp_read_timeout, OPT_U64, 900)
-OPTION(ms_pq_max_tokens_per_priority, OPT_U64, 16777216)
-OPTION(ms_pq_min_cost, OPT_U64, 65536)
-OPTION(ms_inject_socket_failures, OPT_U64, 0)
-SAFE_OPTION(ms_inject_delay_type, OPT_STR, "") // "osd mds mon client" allowed
-OPTION(ms_inject_delay_msg_type, OPT_STR, "") // the type of message to delay, as returned by Message::get_type_name(). This is an additional restriction on the general type filter ms_inject_delay_type.
-OPTION(ms_inject_delay_max, OPT_DOUBLE, 1) // seconds
-OPTION(ms_inject_delay_probability, OPT_DOUBLE, 0) // range [0, 1]
-OPTION(ms_inject_internal_delays, OPT_DOUBLE, 0) // seconds
-OPTION(ms_dump_on_send, OPT_BOOL, false) // hexdump msg to log on send
-OPTION(ms_dump_corrupt_message_level, OPT_INT, 1) // debug level to hexdump undecodeable messages at
-OPTION(ms_async_op_threads, OPT_U64, 3) // number of worker processing threads for async messenger created on init
-OPTION(ms_async_max_op_threads, OPT_U64, 5) // max number of worker processing threads for async messenger
-OPTION(ms_async_set_affinity, OPT_BOOL, true)
-// example: ms_async_affinity_cores = 0,1
-// The number of coreset is expected to equal to ms_async_op_threads, otherwise
-// extra op threads will loop ms_async_affinity_cores again.
-// If ms_async_affinity_cores is empty, all threads will be bind to current running
-// core
-OPTION(ms_async_affinity_cores, OPT_STR, "")
-OPTION(ms_async_rdma_device_name, OPT_STR, "")
-OPTION(ms_async_rdma_enable_hugepage, OPT_BOOL, false)
-OPTION(ms_async_rdma_buffer_size, OPT_INT, 128 << 10)
-OPTION(ms_async_rdma_send_buffers, OPT_U32, 1024)
-OPTION(ms_async_rdma_receive_buffers, OPT_U32, 1024)
-OPTION(ms_async_rdma_port_num, OPT_U32, 1)
-OPTION(ms_async_rdma_polling_us, OPT_U32, 1000)
-OPTION(ms_async_rdma_local_gid, OPT_STR, "") // GID format: "fe80:0000:0000:0000:7efe:90ff:fe72:6efe", no zero folding
-OPTION(ms_async_rdma_roce_ver, OPT_INT, 1) // 0=RoCEv1, 1=RoCEv2, 2=RoCEv1.5
-OPTION(ms_async_rdma_sl, OPT_INT, 3) // in RoCE, this means PCP
-OPTION(ms_async_rdma_dscp, OPT_INT, 96) // in RoCE, this means DSCP
-
-OPTION(ms_dpdk_port_id, OPT_INT, 0)
-SAFE_OPTION(ms_dpdk_coremask, OPT_STR, "1") // it is modified in unittest so that use SAFE_OPTION to declare
-OPTION(ms_dpdk_memory_channel, OPT_STR, "4")
-OPTION(ms_dpdk_hugepages, OPT_STR, "")
-OPTION(ms_dpdk_pmd, OPT_STR, "")
-SAFE_OPTION(ms_dpdk_host_ipv4_addr, OPT_STR, "")
-SAFE_OPTION(ms_dpdk_gateway_ipv4_addr, OPT_STR, "")
-SAFE_OPTION(ms_dpdk_netmask_ipv4_addr, OPT_STR, "")
-OPTION(ms_dpdk_lro, OPT_BOOL, true)
-OPTION(ms_dpdk_hw_flow_control, OPT_BOOL, true)
-// Weighing of a hardware network queue relative to a software queue (0=no work, 1= equal share)")
-OPTION(ms_dpdk_hw_queue_weight, OPT_FLOAT, 1)
-OPTION(ms_dpdk_debug_allow_loopback, OPT_BOOL, false)
-OPTION(ms_dpdk_rx_buffer_count_per_core, OPT_INT, 8192)
-
-OPTION(inject_early_sigterm, OPT_BOOL, false)
-
-OPTION(mon_data, OPT_STR, "/var/lib/ceph/mon/$cluster-$id")
-OPTION(mon_initial_members, OPT_STR, "") // list of initial cluster mon ids; if specified, need majority to form initial quorum and create new cluster
-OPTION(mon_compact_on_start, OPT_BOOL, false) // compact leveldb on ceph-mon start
-OPTION(mon_compact_on_bootstrap, OPT_BOOL, false) // trigger leveldb compaction on bootstrap
-OPTION(mon_compact_on_trim, OPT_BOOL, true) // compact (a prefix) when we trim old states
-OPTION(mon_osd_cache_size, OPT_INT, 10) // the size of osdmaps cache, not to rely on underlying store's cache
-
-OPTION(mon_cpu_threads, OPT_INT, 4)
-OPTION(mon_osd_mapping_pgs_per_chunk, OPT_INT, 4096)
-OPTION(mon_osd_max_creating_pgs, OPT_INT, 1024)
-OPTION(mon_tick_interval, OPT_INT, 5)
-OPTION(mon_session_timeout, OPT_INT, 300) // must send keepalive or subscribe
-OPTION(mon_subscribe_interval, OPT_DOUBLE, 24*3600) // for legacy clients only
-OPTION(mon_delta_reset_interval, OPT_DOUBLE, 10) // seconds of inactivity before we reset the pg delta to 0
-OPTION(mon_osd_laggy_halflife, OPT_INT, 60*60) // (seconds) how quickly our laggy estimations decay
-OPTION(mon_osd_laggy_weight, OPT_DOUBLE, .3) // weight for new 'samples's in laggy estimations
-OPTION(mon_osd_laggy_max_interval, OPT_INT, 300) // maximum value of laggy_interval in laggy estimations
-OPTION(mon_osd_adjust_heartbeat_grace, OPT_BOOL, true) // true if we should scale based on laggy estimations
-OPTION(mon_osd_adjust_down_out_interval, OPT_BOOL, true) // true if we should scale based on laggy estimations
-OPTION(mon_osd_auto_mark_in, OPT_BOOL, false) // mark any booting osds 'in'
-OPTION(mon_osd_auto_mark_auto_out_in, OPT_BOOL, true) // mark booting auto-marked-out osds 'in'
-OPTION(mon_osd_auto_mark_new_in, OPT_BOOL, true) // mark booting new osds 'in'
-OPTION(mon_osd_down_out_interval, OPT_INT, 600) // seconds
-OPTION(mon_osd_down_out_subtree_limit, OPT_STR, "rack") // smallest crush unit/type that we will not automatically mark out
-OPTION(mon_osd_min_up_ratio, OPT_DOUBLE, .3) // min osds required to be up to mark things down
-OPTION(mon_osd_min_in_ratio, OPT_DOUBLE, .75) // min osds required to be in to mark things out
-OPTION(mon_osd_warn_op_age, OPT_DOUBLE, 32) // max op age before we generate a warning (make it a power of 2)
-OPTION(mon_osd_err_op_age_ratio, OPT_DOUBLE, 128) // when to generate an error, as multiple of mon_osd_warn_op_age
-OPTION(mon_osd_max_split_count, OPT_INT, 32) // largest number of PGs per "involved" OSD to let split create
-OPTION(mon_osd_allow_primary_temp, OPT_BOOL, false) // allow primary_temp to be set in the osdmap
-OPTION(mon_osd_allow_primary_affinity, OPT_BOOL, false) // allow primary_affinity to be set in the osdmap
-OPTION(mon_osd_prime_pg_temp, OPT_BOOL, true) // prime osdmap with pg mapping changes
-OPTION(mon_osd_prime_pg_temp_max_time, OPT_FLOAT, .5) // max time to spend priming
-OPTION(mon_osd_prime_pg_temp_max_estimate, OPT_FLOAT, .25) // max estimate of pg total before we do all pgs in parallel
-OPTION(mon_osd_pool_ec_fast_read, OPT_BOOL, false) // whether turn on fast read on the pool or not
-OPTION(mon_stat_smooth_intervals, OPT_INT, 6) // smooth stats over last N PGMap maps
-OPTION(mon_election_timeout, OPT_FLOAT, 5) // on election proposer, max waiting time for all ACKs
-OPTION(mon_lease, OPT_FLOAT, 5) // lease interval
-OPTION(mon_lease_renew_interval_factor, OPT_FLOAT, .6) // on leader, to renew the lease
-OPTION(mon_lease_ack_timeout_factor, OPT_FLOAT, 2.0) // on leader, if lease isn't acked by all peons
-OPTION(mon_accept_timeout_factor, OPT_FLOAT, 2.0) // on leader, if paxos update isn't accepted
-
-OPTION(mon_clock_drift_allowed, OPT_FLOAT, .050) // allowed clock drift between monitors
-OPTION(mon_clock_drift_warn_backoff, OPT_FLOAT, 5) // exponential backoff for clock drift warnings
-OPTION(mon_timecheck_interval, OPT_FLOAT, 300.0) // on leader, timecheck (clock drift check) interval (seconds)
-OPTION(mon_timecheck_skew_interval, OPT_FLOAT, 30.0) // on leader, timecheck (clock drift check) interval when in presence of a skew (seconds)
-OPTION(mon_pg_stuck_threshold, OPT_INT, 60) // number of seconds after which pgs can be considered stuck inactive, unclean, etc (see doc/control.rst under dump_stuck for more info)
-OPTION(mon_pg_min_inactive, OPT_U64, 1) // the number of PGs which have to be inactive longer than 'mon_pg_stuck_threshold' before health goes into ERR. 0 means disabled, never go into ERR.
-OPTION(mon_pg_warn_min_per_osd, OPT_INT, 30) // min # pgs per (in) osd before we warn the admin
-OPTION(mon_pg_warn_max_per_osd, OPT_INT, 300) // max # pgs per (in) osd before we warn the admin
-OPTION(mon_pg_warn_max_object_skew, OPT_FLOAT, 10.0) // max skew few average in objects per pg
-OPTION(mon_pg_warn_min_objects, OPT_INT, 10000) // do not warn below this object #
-OPTION(mon_pg_warn_min_pool_objects, OPT_INT, 1000) // do not warn on pools below this object #
-OPTION(mon_pg_check_down_all_threshold, OPT_FLOAT, .5) // threshold of down osds after which we check all pgs
-OPTION(mon_cache_target_full_warn_ratio, OPT_FLOAT, .66) // position between pool cache_target_full and max where we start warning
-OPTION(mon_osd_full_ratio, OPT_FLOAT, .95) // what % full makes an OSD "full"
-OPTION(mon_osd_backfillfull_ratio, OPT_FLOAT, .90) // what % full makes an OSD backfill full (backfill halted)
-OPTION(mon_osd_nearfull_ratio, OPT_FLOAT, .85) // what % full makes an OSD near full
-OPTION(mon_osd_initial_require_min_compat_client, OPT_STR, "jewel")
-OPTION(mon_allow_pool_delete, OPT_BOOL, false) // allow pool deletion
-OPTION(mon_fake_pool_delete, OPT_BOOL, false) // fake pool deletion (add _DELETED suffix)
-OPTION(mon_globalid_prealloc, OPT_U32, 10000) // how many globalids to prealloc
-OPTION(mon_osd_report_timeout, OPT_INT, 900) // grace period before declaring unresponsive OSDs dead
-OPTION(mon_force_standby_active, OPT_BOOL, true) // should mons force standby-replay mds to be active
-OPTION(mon_warn_on_legacy_crush_tunables, OPT_BOOL, true) // warn if crush tunables are too old (older than mon_crush_min_required_version)
-OPTION(mon_crush_min_required_version, OPT_STR, "firefly")
-OPTION(mon_warn_on_crush_straw_calc_version_zero, OPT_BOOL, true) // warn if crush straw_calc_version==0
-OPTION(mon_warn_on_osd_down_out_interval_zero, OPT_BOOL, true) // warn if 'mon_osd_down_out_interval == 0'
-OPTION(mon_warn_on_cache_pools_without_hit_sets, OPT_BOOL, true)
-OPTION(mon_warn_osd_usage_min_max_delta, OPT_FLOAT, .40) // warn if difference between min and max OSD utilizations exceeds specified amount
-OPTION(mon_min_osdmap_epochs, OPT_INT, 500)
-OPTION(mon_max_pgmap_epochs, OPT_INT, 500)
-OPTION(mon_max_log_epochs, OPT_INT, 500)
-OPTION(mon_max_mdsmap_epochs, OPT_INT, 500)
-OPTION(mon_max_osd, OPT_INT, 10000)
-OPTION(mon_probe_timeout, OPT_DOUBLE, 2.0)
-OPTION(mon_client_bytes, OPT_U64, 100ul << 20) // client msg data allowed in memory (in bytes)
-OPTION(mon_mgr_proxy_client_bytes_ratio, OPT_FLOAT, .3) // ratio of mon_client_bytes that can be consumed by proxied mgr commands before we error out to client
-OPTION(mon_log_max_summary, OPT_U64, 50)
-OPTION(mon_daemon_bytes, OPT_U64, 400ul << 20) // mds, osd message memory cap (in bytes)
-OPTION(mon_max_log_entries_per_event, OPT_INT, 4096)
-OPTION(mon_reweight_min_pgs_per_osd, OPT_U64, 10) // min pgs per osd for reweight-by-pg command
-OPTION(mon_reweight_min_bytes_per_osd, OPT_U64, 100*1024*1024) // min bytes per osd for reweight-by-utilization command
-OPTION(mon_reweight_max_osds, OPT_INT, 4) // max osds to change per reweight-by-* command
-OPTION(mon_reweight_max_change, OPT_DOUBLE, 0.05)
-OPTION(mon_health_data_update_interval, OPT_FLOAT, 60.0)
-OPTION(mon_health_to_clog, OPT_BOOL, true)
-OPTION(mon_health_to_clog_interval, OPT_INT, 3600)
-OPTION(mon_health_to_clog_tick_interval, OPT_DOUBLE, 60.0)
-OPTION(mon_health_preluminous_compat, OPT_BOOL, false)
-OPTION(mon_health_max_detail, OPT_INT, 50) // max detailed pgs to report in health detail
-OPTION(mon_data_avail_crit, OPT_INT, 5)
-OPTION(mon_data_avail_warn, OPT_INT, 30)
-OPTION(mon_data_size_warn, OPT_U64, 15*1024*1024*1024) // issue a warning when the monitor's data store goes over 15GB (in bytes)
-OPTION(mon_warn_not_scrubbed, OPT_INT, 0)
-OPTION(mon_warn_not_deep_scrubbed, OPT_INT, 0)
-OPTION(mon_scrub_interval, OPT_INT, 3600*24) // once a day
-OPTION(mon_scrub_timeout, OPT_INT, 60*5) // let's give it 5 minutes; why not.
-OPTION(mon_scrub_max_keys, OPT_INT, 100) // max number of keys to scrub each time
-OPTION(mon_scrub_inject_crc_mismatch, OPT_DOUBLE, 0.0) // probability of injected crc mismatch [0.0, 1.0]
-OPTION(mon_scrub_inject_missing_keys, OPT_DOUBLE, 0.0) // probability of injected missing keys [0.0, 1.0]
-OPTION(mon_config_key_max_entry_size, OPT_INT, 4096) // max num bytes per config-key entry
-OPTION(mon_sync_timeout, OPT_DOUBLE, 60.0)
-OPTION(mon_sync_max_payload_size, OPT_U32, 1048576) // max size for a sync chunk payload (say, 1MB)
-OPTION(mon_sync_debug, OPT_BOOL, false) // enable sync-specific debug
-OPTION(mon_inject_sync_get_chunk_delay, OPT_DOUBLE, 0) // inject N second delay on each get_chunk request
-OPTION(mon_osd_min_down_reporters, OPT_INT, 2) // number of OSDs from different subtrees who need to report a down OSD for it to count
-OPTION(mon_osd_reporter_subtree_level , OPT_STR, "host") // in which level of parent bucket the reporters are counted
-OPTION(mon_osd_force_trim_to, OPT_INT, 0) // force mon to trim maps to this point, regardless of min_last_epoch_clean (dangerous, use with care)
-OPTION(mon_mds_force_trim_to, OPT_INT, 0) // force mon to trim mdsmaps to this point (dangerous, use with care)
-OPTION(mon_mds_skip_sanity, OPT_BOOL, false) // skip safety assertions on FSMap (in case of bugs where we want to continue anyway)
-
-// monitor debug options
-OPTION(mon_debug_deprecated_as_obsolete, OPT_BOOL, false) // consider deprecated commands as obsolete
-
-// dump transactions
-OPTION(mon_debug_dump_transactions, OPT_BOOL, false)
-OPTION(mon_debug_dump_json, OPT_BOOL, false)
-OPTION(mon_debug_dump_location, OPT_STR, "/var/log/ceph/$cluster-$name.tdump")
-OPTION(mon_debug_no_require_luminous, OPT_BOOL, false)
-OPTION(mon_debug_no_require_bluestore_for_ec_overwrites, OPT_BOOL, false)
-OPTION(mon_debug_no_initial_persistent_features, OPT_BOOL, false)
-OPTION(mon_inject_transaction_delay_max, OPT_DOUBLE, 10.0) // seconds
-OPTION(mon_inject_transaction_delay_probability, OPT_DOUBLE, 0) // range [0, 1]
-
-OPTION(mon_sync_provider_kill_at, OPT_INT, 0) // kill the sync provider at a specific point in the work flow
-OPTION(mon_sync_requester_kill_at, OPT_INT, 0) // kill the sync requester at a specific point in the work flow
-OPTION(mon_force_quorum_join, OPT_BOOL, false) // force monitor to join quorum even if it has been previously removed from the map
-OPTION(mon_keyvaluedb, OPT_STR, "rocksdb") // type of keyvaluedb backend
-
-// UNSAFE -- TESTING ONLY! Allows addition of a cache tier with preexisting snaps
-OPTION(mon_debug_unsafe_allow_tier_with_nonempty_snaps, OPT_BOOL, false)
-OPTION(mon_osd_blacklist_default_expire, OPT_DOUBLE, 60*60) // default one hour
-OPTION(mon_osd_crush_smoke_test, OPT_BOOL, true)
-
-OPTION(paxos_stash_full_interval, OPT_INT, 25) // how often (in commits) to stash a full copy of the PaxosService state
-OPTION(paxos_max_join_drift, OPT_INT, 10) // max paxos iterations before we must first sync the monitor stores
-OPTION(paxos_propose_interval, OPT_DOUBLE, 1.0) // gather updates for this long before proposing a map update
-OPTION(paxos_min_wait, OPT_DOUBLE, 0.05) // min time to gather updates for after period of inactivity
-OPTION(paxos_min, OPT_INT, 500) // minimum number of paxos states to keep around
-OPTION(paxos_trim_min, OPT_INT, 250) // number of extra proposals tolerated before trimming
-OPTION(paxos_trim_max, OPT_INT, 500) // max number of extra proposals to trim at a time
-OPTION(paxos_service_trim_min, OPT_INT, 250) // minimum amount of versions to trigger a trim (0 disables it)
-OPTION(paxos_service_trim_max, OPT_INT, 500) // maximum amount of versions to trim during a single proposal (0 disables it)
-OPTION(paxos_kill_at, OPT_INT, 0)
-OPTION(auth_cluster_required, OPT_STR, "cephx") // required of mon, mds, osd daemons
-OPTION(auth_service_required, OPT_STR, "cephx") // required by daemons of clients
-OPTION(auth_client_required, OPT_STR, "cephx, none") // what clients require of daemons
-OPTION(auth_supported, OPT_STR, "") // deprecated; default value for above if they are not defined.
-OPTION(max_rotating_auth_attempts, OPT_INT, 10)
-OPTION(cephx_require_signatures, OPT_BOOL, false) // If true, don't talk to Cephx partners if they don't support message signing; off by default
-OPTION(cephx_cluster_require_signatures, OPT_BOOL, false)
-OPTION(cephx_service_require_signatures, OPT_BOOL, false)
-OPTION(cephx_sign_messages, OPT_BOOL, true) // Default to signing session messages if supported
-OPTION(auth_mon_ticket_ttl, OPT_DOUBLE, 60*60*12)
-OPTION(auth_service_ticket_ttl, OPT_DOUBLE, 60*60)
-OPTION(auth_debug, OPT_BOOL, false) // if true, assert when weird things happen
-OPTION(mon_client_hunt_parallel, OPT_U32, 2) // how many mons to try to connect to in parallel during hunt
-OPTION(mon_client_hunt_interval, OPT_DOUBLE, 3.0) // try new mon every N seconds until we connect
-OPTION(mon_client_ping_interval, OPT_DOUBLE, 10.0) // ping every N seconds
-OPTION(mon_client_ping_timeout, OPT_DOUBLE, 30.0) // fail if we don't hear back
-OPTION(mon_client_hunt_interval_backoff, OPT_DOUBLE, 2.0) // each time we reconnect to a monitor, double our timeout
-OPTION(mon_client_hunt_interval_max_multiple, OPT_DOUBLE, 10.0) // up to a max of 10*default (30 seconds)
-OPTION(mon_client_max_log_entries_per_message, OPT_INT, 1000)
-OPTION(mon_max_pool_pg_num, OPT_INT, 65536)
-OPTION(mon_pool_quota_warn_threshold, OPT_INT, 0) // percent of quota at which to issue warnings
-OPTION(mon_pool_quota_crit_threshold, OPT_INT, 0) // percent of quota at which to issue errors
-OPTION(client_cache_size, OPT_INT, 16384)
-OPTION(client_cache_mid, OPT_FLOAT, .75)
-OPTION(client_use_random_mds, OPT_BOOL, false)
-OPTION(client_mount_timeout, OPT_DOUBLE, 300.0)
-OPTION(client_tick_interval, OPT_DOUBLE, 1.0)
-OPTION(client_trace, OPT_STR, "")
-OPTION(client_readahead_min, OPT_LONGLONG, 128*1024) // readahead at _least_ this much.
-OPTION(client_readahead_max_bytes, OPT_LONGLONG, 0) // default unlimited
-OPTION(client_readahead_max_periods, OPT_LONGLONG, 4) // as multiple of file layout period (object size * num stripes)
-OPTION(client_reconnect_stale, OPT_BOOL, false) // automatically reconnect stale session
-OPTION(client_snapdir, OPT_STR, ".snap")
-OPTION(client_mountpoint, OPT_STR, "/")
-OPTION(client_mount_uid, OPT_INT, -1)
-OPTION(client_mount_gid, OPT_INT, -1)
-OPTION(client_notify_timeout, OPT_INT, 10) // in seconds
-OPTION(osd_client_watch_timeout, OPT_INT, 30) // in seconds
-OPTION(client_caps_release_delay, OPT_INT, 5) // in seconds
-OPTION(client_quota_df, OPT_BOOL, true) // use quota for df on subdir mounts
-OPTION(client_oc, OPT_BOOL, true)
-OPTION(client_oc_size, OPT_INT, 1024*1024* 200) // MB * n
-OPTION(client_oc_max_dirty, OPT_INT, 1024*1024* 100) // MB * n (dirty OR tx.. bigish)
-OPTION(client_oc_target_dirty, OPT_INT, 1024*1024* 8) // target dirty (keep this smallish)
-OPTION(client_oc_max_dirty_age, OPT_DOUBLE, 5.0) // max age in cache before writeback
-OPTION(client_oc_max_objects, OPT_INT, 1000) // max objects in cache
-OPTION(client_debug_getattr_caps, OPT_BOOL, false) // check if MDS reply contains wanted caps
-OPTION(client_debug_force_sync_read, OPT_BOOL, false) // always read synchronously (go to osds)
-OPTION(client_debug_inject_tick_delay, OPT_INT, 0) // delay the client tick for a number of seconds
-OPTION(client_max_inline_size, OPT_U64, 4096)
-OPTION(client_inject_release_failure, OPT_BOOL, false) // synthetic client bug for testing
-OPTION(client_inject_fixed_oldest_tid, OPT_BOOL, false) // synthetic client bug for testing
-OPTION(client_metadata, OPT_STR, "")
-OPTION(client_acl_type, OPT_STR, "")
-OPTION(client_permissions, OPT_BOOL, true)
-OPTION(client_dirsize_rbytes, OPT_BOOL, true)
-
-// note: the max amount of "in flight" dirty data is roughly (max - target)
-OPTION(fuse_use_invalidate_cb, OPT_BOOL, true) // use fuse 2.8+ invalidate callback to keep page cache consistent
-OPTION(fuse_disable_pagecache, OPT_BOOL, false)
-OPTION(fuse_allow_other, OPT_BOOL, true)
-OPTION(fuse_default_permissions, OPT_BOOL, false)
-OPTION(fuse_big_writes, OPT_BOOL, true)
-OPTION(fuse_atomic_o_trunc, OPT_BOOL, true)
-OPTION(fuse_debug, OPT_BOOL, false)
-OPTION(fuse_multithreaded, OPT_BOOL, true)
-OPTION(fuse_require_active_mds, OPT_BOOL, true) // if ceph_fuse requires active mds server
-OPTION(fuse_syncfs_on_mksnap, OPT_BOOL, true)
-OPTION(fuse_set_user_groups, OPT_BOOL, false) // if ceph_fuse fills in group lists or not
-
-OPTION(client_try_dentry_invalidate, OPT_BOOL, true) // the client should try to use dentry invalidation instead of remounting, on kernels where it believes that will work
-OPTION(client_die_on_failed_remount, OPT_BOOL, true)
-OPTION(client_check_pool_perm, OPT_BOOL, true)
-OPTION(client_use_faked_inos, OPT_BOOL, false)
-OPTION(client_mds_namespace, OPT_STR, "")
-
-OPTION(crush_location, OPT_STR, "") // whitespace-separated list of key=value pairs describing crush location
-OPTION(crush_location_hook, OPT_STR, "")
-OPTION(crush_location_hook_timeout, OPT_INT, 10)
-
-OPTION(objecter_tick_interval, OPT_DOUBLE, 5.0)
-OPTION(objecter_timeout, OPT_DOUBLE, 10.0) // before we ask for a map
-OPTION(objecter_inflight_op_bytes, OPT_U64, 1024*1024*100) // max in-flight data (both directions)
-OPTION(objecter_inflight_ops, OPT_U64, 1024) // max in-flight ios
-OPTION(objecter_completion_locks_per_session, OPT_U64, 32) // num of completion locks per each session, for serializing same object responses
-OPTION(objecter_inject_no_watch_ping, OPT_BOOL, false) // suppress watch pings
-OPTION(objecter_retry_writes_after_first_reply, OPT_BOOL, false) // ignore the first reply for each write, and resend the osd op instead
-OPTION(objecter_debug_inject_relock_delay, OPT_BOOL, false)
-
-// Max number of deletes at once in a single Filer::purge call
-OPTION(filer_max_purge_ops, OPT_U32, 10)
-// Max number of truncate at once in a single Filer::truncate call
-OPTION(filer_max_truncate_ops, OPT_U32, 128)
-
-OPTION(journaler_write_head_interval, OPT_INT, 15)
-OPTION(journaler_prefetch_periods, OPT_INT, 10) // * journal object size
-OPTION(journaler_prezero_periods, OPT_INT, 5) // * journal object size
-OPTION(mds_data, OPT_STR, "/var/lib/ceph/mds/$cluster-$id")
-OPTION(mds_max_file_size, OPT_U64, 1ULL << 40) // Used when creating new CephFS. Change with 'ceph mds set max_file_size <size>' afterwards
-// max xattr kv pairs size for each dir/file
-OPTION(mds_max_xattr_pairs_size, OPT_U32, 64 << 10)
-OPTION(mds_cache_size, OPT_INT, 100000)
-OPTION(mds_cache_mid, OPT_FLOAT, .7)
-OPTION(mds_max_file_recover, OPT_U32, 32)
-OPTION(mds_dir_max_commit_size, OPT_INT, 10) // MB
-OPTION(mds_dir_keys_per_op, OPT_INT, 16384)
-OPTION(mds_decay_halflife, OPT_FLOAT, 5)
-OPTION(mds_beacon_interval, OPT_FLOAT, 4)
-OPTION(mds_beacon_grace, OPT_FLOAT, 15)
-OPTION(mds_enforce_unique_name, OPT_BOOL, true)
-OPTION(mds_blacklist_interval, OPT_FLOAT, 24.0*60.0) // how long to blacklist failed nodes
-
-OPTION(mds_session_timeout, OPT_FLOAT, 60) // cap bits and leases time out if client idle
-OPTION(mds_session_blacklist_on_timeout, OPT_BOOL, true) // whether to blacklist clients whose sessions are dropped due to timeout
-OPTION(mds_session_blacklist_on_evict, OPT_BOOL, true) // whether to blacklist clients whose sessions are dropped via admin commands
-
-OPTION(mds_sessionmap_keys_per_op, OPT_U32, 1024) // how many sessions should I try to load/store in a single OMAP operation?
-OPTION(mds_revoke_cap_timeout, OPT_FLOAT, 60) // detect clients which aren't revoking caps
-OPTION(mds_recall_state_timeout, OPT_FLOAT, 60) // detect clients which aren't trimming caps
-OPTION(mds_freeze_tree_timeout, OPT_FLOAT, 30) // detecting freeze tree deadlock
-OPTION(mds_session_autoclose, OPT_FLOAT, 300) // autoclose idle session
-OPTION(mds_health_summarize_threshold, OPT_INT, 10) // collapse N-client health metrics to a single 'many'
-OPTION(mds_health_cache_threshold, OPT_FLOAT, 1.5) // warn on cache size if it exceeds mds_cache_size by this factor
-OPTION(mds_reconnect_timeout, OPT_FLOAT, 45) // seconds to wait for clients during mds restart
- // make it (mds_session_timeout - mds_beacon_grace)
-OPTION(mds_tick_interval, OPT_FLOAT, 5)
-OPTION(mds_dirstat_min_interval, OPT_FLOAT, 1) // try to avoid propagating more often than this
-OPTION(mds_scatter_nudge_interval, OPT_FLOAT, 5) // how quickly dirstat changes propagate up the hierarchy
-OPTION(mds_client_prealloc_inos, OPT_INT, 1000)
-OPTION(mds_early_reply, OPT_BOOL, true)
-OPTION(mds_default_dir_hash, OPT_INT, CEPH_STR_HASH_RJENKINS)
-OPTION(mds_log_pause, OPT_BOOL, false)
-OPTION(mds_log_skip_corrupt_events, OPT_BOOL, false)
-OPTION(mds_log_max_events, OPT_INT, -1)
-OPTION(mds_log_events_per_segment, OPT_INT, 1024)
-OPTION(mds_log_segment_size, OPT_INT, 0) // segment size for mds log, default to default file_layout_t
-OPTION(mds_log_max_segments, OPT_U32, 30)
-OPTION(mds_log_max_expiring, OPT_INT, 20)
-OPTION(mds_bal_export_pin, OPT_BOOL, true) // allow clients to pin directory trees to ranks
-OPTION(mds_bal_sample_interval, OPT_DOUBLE, 3.0) // every 3 seconds
-OPTION(mds_bal_replicate_threshold, OPT_FLOAT, 8000)
-OPTION(mds_bal_unreplicate_threshold, OPT_FLOAT, 0)
-OPTION(mds_bal_frag, OPT_BOOL, true)
-OPTION(mds_bal_split_size, OPT_INT, 10000)
-OPTION(mds_bal_split_rd, OPT_FLOAT, 25000)
-OPTION(mds_bal_split_wr, OPT_FLOAT, 10000)
-OPTION(mds_bal_split_bits, OPT_INT, 3)
-OPTION(mds_bal_merge_size, OPT_INT, 50)
-OPTION(mds_bal_interval, OPT_INT, 10) // seconds
-OPTION(mds_bal_fragment_interval, OPT_INT, 5) // seconds
-OPTION(mds_bal_fragment_size_max, OPT_INT, 10000*10) // order of magnitude higher than split size
-OPTION(mds_bal_fragment_fast_factor, OPT_FLOAT, 1.5) // multiple of size_max that triggers immediate split
-OPTION(mds_bal_idle_threshold, OPT_FLOAT, 0)
-OPTION(mds_bal_max, OPT_INT, -1)
-OPTION(mds_bal_max_until, OPT_INT, -1)
-OPTION(mds_bal_mode, OPT_INT, 0)
-OPTION(mds_bal_min_rebalance, OPT_FLOAT, .1) // must be this much above average before we export anything
-OPTION(mds_bal_min_start, OPT_FLOAT, .2) // if we need less than this, we don't do anything
-OPTION(mds_bal_need_min, OPT_FLOAT, .8) // take within this range of what we need
-OPTION(mds_bal_need_max, OPT_FLOAT, 1.2)
-OPTION(mds_bal_midchunk, OPT_FLOAT, .3) // any sub bigger than this taken in full
-OPTION(mds_bal_minchunk, OPT_FLOAT, .001) // never take anything smaller than this
-OPTION(mds_bal_target_decay, OPT_DOUBLE, 10.0) // target decay half-life in MDSMap (2x larger is approx. 2x slower)
-OPTION(mds_replay_interval, OPT_FLOAT, 1.0) // time to wait before starting replay again
-OPTION(mds_shutdown_check, OPT_INT, 0)
-OPTION(mds_thrash_exports, OPT_INT, 0)
-OPTION(mds_thrash_fragments, OPT_INT, 0)
-OPTION(mds_dump_cache_on_map, OPT_BOOL, false)
-OPTION(mds_dump_cache_after_rejoin, OPT_BOOL, false)
-OPTION(mds_verify_scatter, OPT_BOOL, false)
-OPTION(mds_debug_scatterstat, OPT_BOOL, false)
-OPTION(mds_debug_frag, OPT_BOOL, false)
-OPTION(mds_debug_auth_pins, OPT_BOOL, false)
-OPTION(mds_debug_subtrees, OPT_BOOL, false)
-OPTION(mds_kill_mdstable_at, OPT_INT, 0)
-OPTION(mds_kill_export_at, OPT_INT, 0)
-OPTION(mds_kill_import_at, OPT_INT, 0)
-OPTION(mds_kill_link_at, OPT_INT, 0)
-OPTION(mds_kill_rename_at, OPT_INT, 0)
-OPTION(mds_kill_openc_at, OPT_INT, 0)
-OPTION(mds_kill_journal_at, OPT_INT, 0)
-OPTION(mds_kill_journal_expire_at, OPT_INT, 0)
-OPTION(mds_kill_journal_replay_at, OPT_INT, 0)
-OPTION(mds_journal_format, OPT_U32, 1) // Default to most recent JOURNAL_FORMAT_*
-OPTION(mds_kill_create_at, OPT_INT, 0)
-OPTION(mds_inject_traceless_reply_probability, OPT_DOUBLE, 0) /* percentage
- of MDS modify replies to skip sending the
- client a trace on [0-1]*/
-OPTION(mds_wipe_sessions, OPT_BOOL, 0)
-OPTION(mds_wipe_ino_prealloc, OPT_BOOL, 0)
-OPTION(mds_skip_ino, OPT_INT, 0)
-OPTION(mds_standby_for_name, OPT_STR, "")
-OPTION(mds_standby_for_rank, OPT_INT, -1)
-OPTION(mds_standby_for_fscid, OPT_INT, -1)
-OPTION(mds_standby_replay, OPT_BOOL, false)
-OPTION(mds_enable_op_tracker, OPT_BOOL, true) // enable/disable MDS op tracking
-OPTION(mds_op_history_size, OPT_U32, 20) // Max number of completed ops to track
-OPTION(mds_op_history_duration, OPT_U32, 600) // Oldest completed op to track
-OPTION(mds_op_complaint_time, OPT_FLOAT, 30) // how many seconds old makes an op complaint-worthy
-OPTION(mds_op_log_threshold, OPT_INT, 5) // how many op log messages to show in one go
-OPTION(mds_snap_min_uid, OPT_U32, 0) // The minimum UID required to create a snapshot
-OPTION(mds_snap_max_uid, OPT_U32, 4294967294) // The maximum UID allowed to create a snapshot
-OPTION(mds_snap_rstat, OPT_BOOL, false) // enable/disable nested stat for snapshot
-OPTION(mds_verify_backtrace, OPT_U32, 1)
-// detect clients which aren't trimming completed requests
-OPTION(mds_max_completed_flushes, OPT_U32, 100000)
-OPTION(mds_max_completed_requests, OPT_U32, 100000)
-
-OPTION(mds_action_on_write_error, OPT_U32, 1) // 0: ignore; 1: force readonly; 2: crash
-OPTION(mds_mon_shutdown_timeout, OPT_DOUBLE, 5)
-
-// Maximum number of concurrent stray files to purge
-OPTION(mds_max_purge_files, OPT_U32, 64)
-// Maximum number of concurrent RADOS ops to issue in purging
-OPTION(mds_max_purge_ops, OPT_U32, 8192)
-// Maximum number of concurrent RADOS ops to issue in purging, scaled by PG count
-OPTION(mds_max_purge_ops_per_pg, OPT_FLOAT, 0.5)
-
-OPTION(mds_purge_queue_busy_flush_period, OPT_FLOAT, 1.0)
-
-OPTION(mds_root_ino_uid, OPT_INT, 0) // The UID of / on new filesystems
-OPTION(mds_root_ino_gid, OPT_INT, 0) // The GID of / on new filesystems
-
-OPTION(mds_max_scrub_ops_in_progress, OPT_INT, 5) // the number of simultaneous scrubs allowed
-
-// Maximum number of damaged frags/dentries before whole MDS rank goes damaged
-OPTION(mds_damage_table_max_entries, OPT_INT, 10000)
-
-// Maximum increment for client writable range, counted by number of objects
-OPTION(mds_client_writeable_range_max_inc_objs, OPT_U32, 1024)
-
-// verify backend can support configured max object name length
-OPTION(osd_check_max_object_name_len_on_startup, OPT_BOOL, true)
-
-// Maximum number of backfills to or from a single osd
-OPTION(osd_max_backfills, OPT_U64, 1)
-
-// Minimum recovery priority (255 = max, smaller = lower)
-OPTION(osd_min_recovery_priority, OPT_INT, 0)
-
-// Seconds to wait before retrying refused backfills
-OPTION(osd_backfill_retry_interval, OPT_DOUBLE, 30.0)
-
-// Seconds to wait before retrying refused recovery
-OPTION(osd_recovery_retry_interval, OPT_DOUBLE, 30.0)
-
-// max agent flush ops
-OPTION(osd_agent_max_ops, OPT_INT, 4)
-OPTION(osd_agent_max_low_ops, OPT_INT, 2)
-OPTION(osd_agent_min_evict_effort, OPT_FLOAT, .1)
-OPTION(osd_agent_quantize_effort, OPT_FLOAT, .1)
-OPTION(osd_agent_delay_time, OPT_FLOAT, 5.0)
-
-// osd ignore history.last_epoch_started in find_best_info
-OPTION(osd_find_best_info_ignore_history_les, OPT_BOOL, false)
-
-// decay atime and hist histograms after how many objects go by
-OPTION(osd_agent_hist_halflife, OPT_INT, 1000)
-
-// must be this amount over the threshold to enable,
-// this amount below the threshold to disable.
-OPTION(osd_agent_slop, OPT_FLOAT, .02)
-
-OPTION(osd_uuid, OPT_UUID, uuid_d())
-OPTION(osd_data, OPT_STR, "/var/lib/ceph/osd/$cluster-$id")
-OPTION(osd_journal, OPT_STR, "/var/lib/ceph/osd/$cluster-$id/journal")
-OPTION(osd_journal_size, OPT_INT, 5120) // in mb
-OPTION(osd_journal_flush_on_shutdown, OPT_BOOL, true) // Flush journal to data store on shutdown
-// flags for specific control purpose during osd mount() process.
-// e.g., can be 1 to skip over replaying journal
-// or 2 to skip over mounting omap or 3 to skip over both.
-// This might be helpful in case the journal is totally corrupted
-// and we still want to bring the osd daemon back normally, etc.
-OPTION(osd_os_flags, OPT_U32, 0)
-OPTION(osd_max_write_size, OPT_INT, 90)
-OPTION(osd_max_pgls, OPT_U64, 1024) // max number of pgls entries to return
-OPTION(osd_client_message_size_cap, OPT_U64, 500*1024L*1024L) // client data allowed in-memory (in bytes)
-OPTION(osd_client_message_cap, OPT_U64, 100) // num client messages allowed in-memory
-OPTION(osd_pg_bits, OPT_INT, 6) // bits per osd
-OPTION(osd_pgp_bits, OPT_INT, 6) // bits per osd
-OPTION(osd_crush_update_weight_set, OPT_BOOL, true) // update weight set while updating weights
-OPTION(osd_crush_chooseleaf_type, OPT_INT, 1) // 1 = host
-OPTION(osd_pool_use_gmt_hitset, OPT_BOOL, true) // try to use gmt for hitset archive names if all osds in cluster support it.
-OPTION(osd_crush_update_on_start, OPT_BOOL, true)
-OPTION(osd_class_update_on_start, OPT_BOOL, true) // automatically set device class on start
-OPTION(osd_crush_initial_weight, OPT_DOUBLE, -1) // if >=0, the initial weight is for newly added osds.
-OPTION(osd_pool_default_crush_rule, OPT_INT, -1)
-OPTION(osd_pool_erasure_code_stripe_unit, OPT_U32, 4096) // in bytes
-OPTION(osd_pool_default_size, OPT_INT, 3)
-OPTION(osd_pool_default_min_size, OPT_INT, 0) // 0 means no specific default; ceph will use size-size/2
-OPTION(osd_pool_default_pg_num, OPT_INT, 8) // number of PGs for new pools. Configure in global or mon section of ceph.conf
-OPTION(osd_pool_default_pgp_num, OPT_INT, 8) // number of PGs for placement purposes. Should be equal to pg_num
-OPTION(osd_pool_default_type, OPT_STR, "replicated")
-OPTION(osd_pool_default_erasure_code_profile,
- OPT_STR,
- "plugin=jerasure "
- "technique=reed_sol_van "
- "k=2 "
- "m=1 "
- ) // default properties of osd pool create
-OPTION(osd_erasure_code_plugins, OPT_STR,
- "jerasure"
- " lrc"
-#ifdef HAVE_BETTER_YASM_ELF64
- " isa"
-#endif
- ) // list of erasure code plugins
-
-// Allows the "peered" state for recovery and backfill below min_size
-OPTION(osd_allow_recovery_below_min_size, OPT_BOOL, true)
-
-OPTION(osd_pool_default_flags, OPT_INT, 0) // default flags for new pools
-OPTION(osd_pool_default_flag_hashpspool, OPT_BOOL, true) // use new pg hashing to prevent pool/pg overlap
-OPTION(osd_pool_default_flag_nodelete, OPT_BOOL, false) // pool can't be deleted
-OPTION(osd_pool_default_flag_nopgchange, OPT_BOOL, false) // pool's pg and pgp num can't be changed
-OPTION(osd_pool_default_flag_nosizechange, OPT_BOOL, false) // pool's size and min size can't be changed
-OPTION(osd_pool_default_hit_set_bloom_fpp, OPT_FLOAT, .05)
-OPTION(osd_pool_default_cache_target_dirty_ratio, OPT_FLOAT, .4)
-OPTION(osd_pool_default_cache_target_dirty_high_ratio, OPT_FLOAT, .6)
-OPTION(osd_pool_default_cache_target_full_ratio, OPT_FLOAT, .8)
-OPTION(osd_pool_default_cache_min_flush_age, OPT_INT, 0) // seconds
-OPTION(osd_pool_default_cache_min_evict_age, OPT_INT, 0) // seconds
-OPTION(osd_pool_default_cache_max_evict_check_size, OPT_INT, 10) // max size to check for eviction
-OPTION(osd_hit_set_min_size, OPT_INT, 1000) // min target size for a HitSet
-OPTION(osd_hit_set_max_size, OPT_INT, 100000) // max target size for a HitSet
-OPTION(osd_hit_set_namespace, OPT_STR, ".ceph-internal") // rados namespace for hit_set tracking
-
-// conservative default throttling values
-OPTION(osd_tier_promote_max_objects_sec, OPT_U64, 25)
-OPTION(osd_tier_promote_max_bytes_sec, OPT_U64, 5 * 1024*1024)
-
-OPTION(osd_tier_default_cache_mode, OPT_STR, "writeback")
-OPTION(osd_tier_default_cache_hit_set_count, OPT_INT, 4)
-OPTION(osd_tier_default_cache_hit_set_period, OPT_INT, 1200)
-OPTION(osd_tier_default_cache_hit_set_type, OPT_STR, "bloom")
-OPTION(osd_tier_default_cache_min_read_recency_for_promote, OPT_INT, 1) // number of recent HitSets the object must appear in to be promoted (on read)
-OPTION(osd_tier_default_cache_min_write_recency_for_promote, OPT_INT, 1) // number of recent HitSets the object must appear in to be promoted (on write)
-OPTION(osd_tier_default_cache_hit_set_grade_decay_rate, OPT_INT, 20)
-OPTION(osd_tier_default_cache_hit_set_search_last_n, OPT_INT, 1)
-
-OPTION(osd_map_dedup, OPT_BOOL, true)
-OPTION(osd_map_max_advance, OPT_INT, 40) // make this < cache_size!
-OPTION(osd_map_cache_size, OPT_INT, 50)
-OPTION(osd_map_message_max, OPT_INT, 40) // max maps per MOSDMap message
-OPTION(osd_map_share_max_epochs, OPT_INT, 40) // cap on # of inc maps we send to peers, clients
-OPTION(osd_inject_bad_map_crc_probability, OPT_FLOAT, 0)
-OPTION(osd_inject_failure_on_pg_removal, OPT_BOOL, false)
-// shut down the OSD if its status flips more than osd_max_markdown_count times within the most recent osd_max_markdown_period seconds
-OPTION(osd_max_markdown_period , OPT_INT, 600)
-OPTION(osd_max_markdown_count, OPT_INT, 5)
-
-OPTION(osd_peering_wq_threads, OPT_INT, 2)
-OPTION(osd_peering_wq_batch_size, OPT_U64, 20)
-OPTION(osd_op_pq_max_tokens_per_priority, OPT_U64, 4194304)
-OPTION(osd_op_pq_min_cost, OPT_U64, 65536)
-OPTION(osd_disk_threads, OPT_INT, 1)
-OPTION(osd_disk_thread_ioprio_class, OPT_STR, "") // one of: rt (realtime), be (best effort), idle
-OPTION(osd_disk_thread_ioprio_priority, OPT_INT, -1) // 0-7
-OPTION(osd_recover_clone_overlap, OPT_BOOL, true) // preserve clone_overlap during recovery/migration
-OPTION(osd_op_num_threads_per_shard, OPT_INT, 0)
-OPTION(osd_op_num_threads_per_shard_hdd, OPT_INT, 1)
-OPTION(osd_op_num_threads_per_shard_ssd, OPT_INT, 2)
-OPTION(osd_op_num_shards, OPT_INT, 0)
-OPTION(osd_op_num_shards_hdd, OPT_INT, 5)
-OPTION(osd_op_num_shards_ssd, OPT_INT, 8)
-
-// PrioritizedQueue (prio), Weighted Priority Queue (wpq; default),
-// mclock_opclass, mclock_client, or debug_random. "mclock_opclass"
-// and "mclock_client" are based on the mClock/dmClock algorithm
-// (Gulati, et al. 2010). "mclock_opclass" prioritizes based on the
-// class the operation belongs to. "mclock_client" does the same but
-// also works to enforce fairness between clients. "debug_random"
-// chooses among all four with equal probability.
-OPTION(osd_op_queue, OPT_STR, "wpq")
-
-OPTION(osd_op_queue_cut_off, OPT_STR, "low") // Min priority to go to strict queue. (low, high, debug_random)
-
-// mClock priority queue parameters for five types of ops
-OPTION(osd_op_queue_mclock_client_op_res, OPT_DOUBLE, 1000.0)
-OPTION(osd_op_queue_mclock_client_op_wgt, OPT_DOUBLE, 500.0)
-OPTION(osd_op_queue_mclock_client_op_lim, OPT_DOUBLE, 0.0)
-OPTION(osd_op_queue_mclock_osd_subop_res, OPT_DOUBLE, 1000.0)
-OPTION(osd_op_queue_mclock_osd_subop_wgt, OPT_DOUBLE, 500.0)
-OPTION(osd_op_queue_mclock_osd_subop_lim, OPT_DOUBLE, 0.0)
-OPTION(osd_op_queue_mclock_snap_res, OPT_DOUBLE, 0.0)
-OPTION(osd_op_queue_mclock_snap_wgt, OPT_DOUBLE, 1.0)
-OPTION(osd_op_queue_mclock_snap_lim, OPT_DOUBLE, 0.001)
-OPTION(osd_op_queue_mclock_recov_res, OPT_DOUBLE, 0.0)
-OPTION(osd_op_queue_mclock_recov_wgt, OPT_DOUBLE, 1.0)
-OPTION(osd_op_queue_mclock_recov_lim, OPT_DOUBLE, 0.001)
-OPTION(osd_op_queue_mclock_scrub_res, OPT_DOUBLE, 0.0)
-OPTION(osd_op_queue_mclock_scrub_wgt, OPT_DOUBLE, 1.0)
-OPTION(osd_op_queue_mclock_scrub_lim, OPT_DOUBLE, 0.001)
-
-OPTION(osd_ignore_stale_divergent_priors, OPT_BOOL, false) // do not assert on divergent_prior entries which aren't in the log and whose on-disk objects are newer
-
-// Set to true for testing. Users should NOT set this.
-// If set to true even after reading enough shards to
-// decode the object, any error will be reported.
-OPTION(osd_read_ec_check_for_errors, OPT_BOOL, false) // return error if any ec shard has an error
-
-// Only use clone_overlap for recovery if there are fewer than
-// osd_recover_clone_overlap_limit entries in the overlap set
-OPTION(osd_recover_clone_overlap_limit, OPT_INT, 10)
-
-OPTION(osd_backfill_scan_min, OPT_INT, 64)
-OPTION(osd_backfill_scan_max, OPT_INT, 512)
-OPTION(osd_op_thread_timeout, OPT_INT, 15)
-OPTION(osd_op_thread_suicide_timeout, OPT_INT, 150)
-OPTION(osd_recovery_thread_timeout, OPT_INT, 30)
-OPTION(osd_recovery_thread_suicide_timeout, OPT_INT, 300)
-OPTION(osd_recovery_sleep, OPT_FLOAT, 0.01) // seconds to sleep between recovery ops
-OPTION(osd_snap_trim_sleep, OPT_DOUBLE, 0)
-OPTION(osd_scrub_invalid_stats, OPT_BOOL, true)
-OPTION(osd_remove_thread_timeout, OPT_INT, 60*60)
-OPTION(osd_remove_thread_suicide_timeout, OPT_INT, 10*60*60)
-OPTION(osd_command_thread_timeout, OPT_INT, 10*60)
-OPTION(osd_command_thread_suicide_timeout, OPT_INT, 15*60)
-OPTION(osd_heartbeat_addr, OPT_ADDR, entity_addr_t())
-OPTION(osd_heartbeat_interval, OPT_INT, 6) // (seconds) how often we ping peers
-
-// (seconds) how long before we decide a peer has failed
-// This setting is read by both the MONs and the OSDs and must be set to the same value for both in the configuration
-OPTION(osd_heartbeat_grace, OPT_INT, 20)
-OPTION(osd_heartbeat_min_peers, OPT_INT, 10) // minimum number of peers
-OPTION(osd_heartbeat_use_min_delay_socket, OPT_BOOL, false) // prio the heartbeat tcp socket and set dscp as CS6 on it if true
-OPTION(osd_heartbeat_min_size, OPT_INT, 2000) // the minimum size of OSD heartbeat messages to send
-
-// max number of parallel snap trims/pg
-OPTION(osd_pg_max_concurrent_snap_trims, OPT_U64, 2)
-// max number of trimming pgs
-OPTION(osd_max_trimming_pgs, OPT_U64, 2)
-
-// minimum number of peers that must be reachable to mark ourselves
-// back up after being wrongly marked down.
-OPTION(osd_heartbeat_min_healthy_ratio, OPT_FLOAT, .33)
-
-OPTION(osd_mon_heartbeat_interval, OPT_INT, 30) // (seconds) how often to ping monitor if no peers
-OPTION(osd_mon_report_interval_max, OPT_INT, 600)
-OPTION(osd_mon_report_interval_min, OPT_INT, 5) // pg stats, failures, up_thru, boot.
-OPTION(osd_mon_report_max_in_flight, OPT_INT, 2) // max updates in flight
-OPTION(osd_beacon_report_interval, OPT_INT, 300) // (second) how often to send beacon message to monitor
-OPTION(osd_pg_stat_report_interval_max, OPT_INT, 500) // report pg stats for any given pg at least this often
-OPTION(osd_mon_ack_timeout, OPT_DOUBLE, 30.0) // time out a mon if it doesn't ack stats
-OPTION(osd_stats_ack_timeout_factor, OPT_DOUBLE, 2.0) // multiples of mon_ack_timeout
-OPTION(osd_stats_ack_timeout_decay, OPT_DOUBLE, .9)
-OPTION(osd_default_data_pool_replay_window, OPT_INT, 45)
-OPTION(osd_auto_mark_unfound_lost, OPT_BOOL, false)
-OPTION(osd_recovery_delay_start, OPT_FLOAT, 0)
-OPTION(osd_recovery_max_active, OPT_U64, 3)
-OPTION(osd_recovery_max_single_start, OPT_U64, 1)
-OPTION(osd_recovery_max_chunk, OPT_U64, 8<<20) // max size of push chunk
-OPTION(osd_recovery_max_omap_entries_per_chunk, OPT_U64, 64000) // max number of omap entries per chunk; 0 to disable limit
-OPTION(osd_copyfrom_max_chunk, OPT_U64, 8<<20) // max size of a COPYFROM chunk
-OPTION(osd_push_per_object_cost, OPT_U64, 1000) // push cost per object
-OPTION(osd_max_push_cost, OPT_U64, 8<<20) // max size of push message
-OPTION(osd_max_push_objects, OPT_U64, 10) // max objects in single push op
-OPTION(osd_recovery_forget_lost_objects, OPT_BOOL, false) // off for now
-OPTION(osd_max_scrubs, OPT_INT, 1)
-OPTION(osd_scrub_during_recovery, OPT_BOOL, false) // Allow new scrubs to start while recovery is active on the OSD
-OPTION(osd_scrub_begin_hour, OPT_INT, 0)
-OPTION(osd_scrub_end_hour, OPT_INT, 24)
-OPTION(osd_scrub_load_threshold, OPT_FLOAT, 0.5)
-OPTION(osd_scrub_min_interval, OPT_FLOAT, 60*60*24) // if load is low
-OPTION(osd_scrub_max_interval, OPT_FLOAT, 7*60*60*24) // regardless of load
-OPTION(osd_scrub_interval_randomize_ratio, OPT_FLOAT, 0.5) // randomize the scheduled scrub in the span of [min,min*(1+randomize_ratio))
-OPTION(osd_scrub_backoff_ratio, OPT_DOUBLE, .66) // the probability to back off the scheduled scrub
-OPTION(osd_scrub_chunk_min, OPT_INT, 5)
-OPTION(osd_scrub_chunk_max, OPT_INT, 25)
-OPTION(osd_scrub_sleep, OPT_FLOAT, 0) // sleep between [deep]scrub ops
-OPTION(osd_scrub_auto_repair, OPT_BOOL, false) // whether auto-repair inconsistencies upon deep-scrubbing
-OPTION(osd_scrub_auto_repair_num_errors, OPT_U32, 5) // only auto-repair when number of errors is below this threshold
-OPTION(osd_deep_scrub_interval, OPT_FLOAT, 60*60*24*7) // once a week
-OPTION(osd_deep_scrub_randomize_ratio, OPT_FLOAT, 0.15) // scrubs will randomly become deep scrubs at this rate (0.15 -> 15% of scrubs are deep)
-OPTION(osd_deep_scrub_stride, OPT_INT, 524288)
-OPTION(osd_deep_scrub_update_digest_min_age, OPT_INT, 2*60*60) // objects must be this old (seconds) before we update the whole-object digest on scrub
-OPTION(osd_class_dir, OPT_STR, CEPH_LIBDIR "/rados-classes") // where rados plugins are stored
-OPTION(osd_open_classes_on_start, OPT_BOOL, true)
-OPTION(osd_class_load_list, OPT_STR, "cephfs hello journal lock log numops "
- "rbd refcount replica_log rgw statelog timeindex user version") // list of object classes allowed to be loaded (allow all: *)
-OPTION(osd_class_default_list, OPT_STR, "cephfs hello journal lock log numops "
- "rbd refcount replica_log rgw statelog timeindex user version") // list of object classes with default execute perm (allow all: *)
-OPTION(osd_check_for_log_corruption, OPT_BOOL, false)
-OPTION(osd_use_stale_snap, OPT_BOOL, false)
-OPTION(osd_rollback_to_cluster_snap, OPT_STR, "")
-OPTION(osd_default_notify_timeout, OPT_U32, 30) // default notify timeout in seconds
-OPTION(osd_kill_backfill_at, OPT_INT, 0)
-
-// Bounds how infrequently a new map epoch will be persisted for a pg
-OPTION(osd_pg_epoch_persisted_max_stale, OPT_U32, 40) // make this < map_cache_size!
-
-OPTION(osd_min_pg_log_entries, OPT_U32, 3000) // number of entries to keep in the pg log when trimming it
-OPTION(osd_max_pg_log_entries, OPT_U32, 10000) // max entries, say when degraded, before we trim
-OPTION(osd_force_recovery_pg_log_entries_factor, OPT_FLOAT, 1.3) // max entries factor before force recovery
-OPTION(osd_pg_log_trim_min, OPT_U32, 100)
-OPTION(osd_op_complaint_time, OPT_FLOAT, 30) // how many seconds old makes an op complaint-worthy
-OPTION(osd_command_max_records, OPT_INT, 256)
-OPTION(osd_max_pg_blocked_by, OPT_U32, 16) // max peer osds to report that are blocking our progress
-OPTION(osd_op_log_threshold, OPT_INT, 5) // how many op log messages to show in one go
-OPTION(osd_verify_sparse_read_holes, OPT_BOOL, false) // read fiemap-reported holes and verify they are zeros
-OPTION(osd_backoff_on_unfound, OPT_BOOL, true) // object unfound
-OPTION(osd_backoff_on_degraded, OPT_BOOL, false) // [mainly for debug?] object unreadable/writeable
-OPTION(osd_backoff_on_down, OPT_BOOL, true) // pg in down/incomplete state
-OPTION(osd_backoff_on_peering, OPT_BOOL, false) // [debug] pg peering
-OPTION(osd_debug_crash_on_ignored_backoff, OPT_BOOL, false) // crash osd if client ignores a backoff; useful for debugging
-OPTION(osd_debug_inject_dispatch_delay_probability, OPT_DOUBLE, 0)
-OPTION(osd_debug_inject_dispatch_delay_duration, OPT_DOUBLE, .1)
-OPTION(osd_debug_drop_ping_probability, OPT_DOUBLE, 0)
-OPTION(osd_debug_drop_ping_duration, OPT_INT, 0)
-OPTION(osd_debug_op_order, OPT_BOOL, false)
-OPTION(osd_debug_verify_missing_on_start, OPT_BOOL, false)
-OPTION(osd_debug_scrub_chance_rewrite_digest, OPT_U64, 0)
-OPTION(osd_debug_verify_snaps_on_info, OPT_BOOL, false)
-OPTION(osd_debug_verify_stray_on_activate, OPT_BOOL, false)
-OPTION(osd_debug_skip_full_check_in_backfill_reservation, OPT_BOOL, false)
-OPTION(osd_debug_reject_backfill_probability, OPT_DOUBLE, 0)
-OPTION(osd_debug_inject_copyfrom_error, OPT_BOOL, false) // inject failure during copyfrom completion
-OPTION(osd_debug_misdirected_ops, OPT_BOOL, false)
-OPTION(osd_debug_skip_full_check_in_recovery, OPT_BOOL, false)
-OPTION(osd_debug_random_push_read_error, OPT_DOUBLE, 0)
-OPTION(osd_debug_verify_cached_snaps, OPT_BOOL, false)
-OPTION(osd_enable_op_tracker, OPT_BOOL, true) // enable/disable OSD op tracking
-OPTION(osd_num_op_tracker_shard, OPT_U32, 32) // The number of shards for holding the ops
-OPTION(osd_op_history_size, OPT_U32, 20) // Max number of completed ops to track
-OPTION(osd_op_history_duration, OPT_U32, 600) // Oldest completed op to track
-OPTION(osd_op_history_slow_op_size, OPT_U32, 20) // Max number of slow ops to track
-OPTION(osd_op_history_slow_op_threshold, OPT_DOUBLE, 10.0) // track the op if over this threshold
-OPTION(osd_target_transaction_size, OPT_INT, 30) // to adjust various transactions that batch smaller items
-OPTION(osd_failsafe_full_ratio, OPT_FLOAT, .97) // what % full makes an OSD "full" (failsafe)
-OPTION(osd_fast_fail_on_connection_refused, OPT_BOOL, true) // immediately mark OSDs as down once they refuse to accept connections
-
-OPTION(osd_pg_object_context_cache_count, OPT_INT, 64)
-OPTION(osd_tracing, OPT_BOOL, false) // true if LTTng-UST tracepoints should be enabled
-OPTION(osd_function_tracing, OPT_BOOL, false) // true if function instrumentation should use LTTng
-
-OPTION(osd_fast_info, OPT_BOOL, true) // use fast info attr, if we can
-
-// determines whether PGLog::check() compares written out log to stored log
-OPTION(osd_debug_pg_log_writeout, OPT_BOOL, false)
-OPTION(osd_loop_before_reset_tphandle, OPT_U32, 64) // Max number of loop before we reset thread-pool's handle
-// default timeout while caling WaitInterval on an empty queue
-OPTION(threadpool_default_timeout, OPT_INT, 60)
-// default wait time for an empty queue before pinging the hb timeout
-OPTION(threadpool_empty_queue_max_wait, OPT_INT, 2)
-
-OPTION(leveldb_log_to_ceph_log, OPT_BOOL, true)
-OPTION(leveldb_write_buffer_size, OPT_U64, 8 *1024*1024) // leveldb write buffer size
-OPTION(leveldb_cache_size, OPT_U64, 128 *1024*1024) // leveldb cache size
-OPTION(leveldb_block_size, OPT_U64, 0) // leveldb block size
-OPTION(leveldb_bloom_size, OPT_INT, 0) // leveldb bloom bits per entry
-OPTION(leveldb_max_open_files, OPT_INT, 0) // leveldb max open files
-OPTION(leveldb_compression, OPT_BOOL, true) // leveldb uses compression
-OPTION(leveldb_paranoid, OPT_BOOL, false) // leveldb paranoid flag
-OPTION(leveldb_log, OPT_STR, "/dev/null") // enable leveldb log file
-OPTION(leveldb_compact_on_mount, OPT_BOOL, false)
-
-OPTION(kinetic_host, OPT_STR, "") // hostname or ip address of a kinetic drive to use
-OPTION(kinetic_port, OPT_INT, 8123) // port number of the kinetic drive
-OPTION(kinetic_user_id, OPT_INT, 1) // kinetic user to authenticate as
-OPTION(kinetic_hmac_key, OPT_STR, "asdfasdf") // kinetic key to authenticate with
-OPTION(kinetic_use_ssl, OPT_BOOL, false) // whether to secure kinetic traffic with TLS
-
-
-OPTION(rocksdb_separate_wal_dir, OPT_BOOL, false) // use $path.wal for wal
-SAFE_OPTION(rocksdb_db_paths, OPT_STR, "") // path,size( path,size)*
-OPTION(rocksdb_log_to_ceph_log, OPT_BOOL, true) // log to ceph log
-OPTION(rocksdb_cache_size, OPT_U64, 128*1024*1024) // rocksdb cache size (unless set by bluestore/etc)
-OPTION(rocksdb_cache_row_ratio, OPT_FLOAT, 0) // ratio of cache for row (vs block)
-OPTION(rocksdb_cache_shard_bits, OPT_INT, 4) // rocksdb block cache shard bits, 4 bit -> 16 shards
-OPTION(rocksdb_cache_type, OPT_STR, "lru") // 'lru' or 'clock'
-OPTION(rocksdb_block_size, OPT_INT, 4*1024) // default rocksdb block size
-OPTION(rocksdb_perf, OPT_BOOL, false) // Enabling this will have 5-10% impact on performance for the stats collection
-OPTION(rocksdb_collect_compaction_stats, OPT_BOOL, false) //For rocksdb, this behavior will be an overhead of 5%~10%, collected only rocksdb_perf is enabled.
-OPTION(rocksdb_collect_extended_stats, OPT_BOOL, false) //For rocksdb, this behavior will be an overhead of 5%~10%, collected only rocksdb_perf is enabled.
-OPTION(rocksdb_collect_memory_stats, OPT_BOOL, false) //For rocksdb, this behavior will be an overhead of 5%~10%, collected only rocksdb_perf is enabled.
-OPTION(rocksdb_enable_rmrange, OPT_BOOL, false) // see https://github.com/facebook/rocksdb/blob/master/include/rocksdb/db.h#L253
-
-// rocksdb options that will be used for omap(if omap_backend is rocksdb)
-OPTION(filestore_rocksdb_options, OPT_STR, "")
-// rocksdb options that will be used in monstore
-OPTION(mon_rocksdb_options, OPT_STR, "write_buffer_size=33554432,compression=kNoCompression")
-
-/**
- * osd_*_priority adjust the relative priority of client io, recovery io,
- * snaptrim io, etc
- *
- * osd_*_priority determines the ratio of available io between client and
- * recovery. Each option may be set between
- * 1..63.
- */
-OPTION(osd_client_op_priority, OPT_U32, 63)
-OPTION(osd_recovery_op_priority, OPT_U32, 3)
-
-OPTION(osd_snap_trim_priority, OPT_U32, 5)
-OPTION(osd_snap_trim_cost, OPT_U32, 1<<20) // set default cost equal to 1MB io
-
-OPTION(osd_scrub_priority, OPT_U32, 5)
-// set default cost equal to 50MB io
-OPTION(osd_scrub_cost, OPT_U32, 50<<20)
-// set requested scrub priority higher than scrub priority to make the
-// requested scrubs jump the queue of scheduled scrubs
-OPTION(osd_requested_scrub_priority, OPT_U32, 120)
-
-OPTION(osd_recovery_priority, OPT_U32, 5)
-// set default cost equal to 20MB io
-OPTION(osd_recovery_cost, OPT_U32, 20<<20)
-
-/**
- * osd_recovery_op_warn_multiple scales the normal warning threshhold,
- * osd_op_complaint_time, so that slow recovery ops won't cause noise
- */
-OPTION(osd_recovery_op_warn_multiple, OPT_U32, 16)
-
-// Max time to wait between notifying mon of shutdown and shutting down
-OPTION(osd_mon_shutdown_timeout, OPT_DOUBLE, 5)
-OPTION(osd_shutdown_pgref_assert, OPT_BOOL, false) // crash if the OSD has stray PG refs on shutdown
-
-OPTION(osd_max_object_size, OPT_U64, 128*1024L*1024L) // OSD's maximum object size
-OPTION(osd_max_object_name_len, OPT_U32, 2048) // max rados object name len
-OPTION(osd_max_object_namespace_len, OPT_U32, 256) // max rados object namespace len
-OPTION(osd_max_attr_name_len, OPT_U32, 100) // max rados attr name len; cannot go higher than 100 chars for file system backends
-OPTION(osd_max_attr_size, OPT_U64, 0)
-
-OPTION(osd_max_omap_entries_per_request, OPT_U64, 131072)
-OPTION(osd_max_omap_bytes_per_request, OPT_U64, 1<<30)
-
-OPTION(osd_objectstore, OPT_STR, "filestore") // ObjectStore backend type
-OPTION(osd_objectstore_tracing, OPT_BOOL, false) // true if LTTng-UST tracepoints should be enabled
-OPTION(osd_objectstore_fuse, OPT_BOOL, false)
-
-OPTION(osd_bench_small_size_max_iops, OPT_U32, 100) // 100 IOPS
-OPTION(osd_bench_large_size_max_throughput, OPT_U64, 100 << 20) // 100 MB/s
-OPTION(osd_bench_max_block_size, OPT_U64, 64 << 20) // cap the block size at 64MB
-OPTION(osd_bench_duration, OPT_U32, 30) // duration of 'osd bench', capped at 30s to avoid triggering timeouts
-
-OPTION(osd_blkin_trace_all, OPT_BOOL, false) // create a blkin trace for all osd requests
-OPTION(osdc_blkin_trace_all, OPT_BOOL, false) // create a blkin trace for all objecter requests
-
-OPTION(osd_discard_disconnected_ops, OPT_BOOL, true)
-
-OPTION(memstore_device_bytes, OPT_U64, 1024*1024*1024)
-OPTION(memstore_page_set, OPT_BOOL, true)
-OPTION(memstore_page_size, OPT_U64, 64 << 10)
-
-OPTION(bdev_debug_inflight_ios, OPT_BOOL, false)
-OPTION(bdev_inject_crash, OPT_INT, 0) // if N>0, then ~ 1/N IOs will complete before we crash on flush.
-OPTION(bdev_inject_crash_flush_delay, OPT_INT, 2) // wait N more seconds on flush
-OPTION(bdev_aio, OPT_BOOL, true)
-OPTION(bdev_aio_poll_ms, OPT_INT, 250) // milliseconds
-OPTION(bdev_aio_max_queue_depth, OPT_INT, 1024)
-OPTION(bdev_aio_reap_max, OPT_INT, 16)
-OPTION(bdev_block_size, OPT_INT, 4096)
-OPTION(bdev_debug_aio, OPT_BOOL, false)
-OPTION(bdev_debug_aio_suicide_timeout, OPT_FLOAT, 60.0)
-
-// if yes, osd will unbind all NVMe devices from kernel driver and bind them
-// to the uio_pci_generic driver. The purpose is to prevent the case where
-// NVMe driver is loaded while osd is running.
-OPTION(bdev_nvme_unbind_from_kernel, OPT_BOOL, false)
-OPTION(bdev_nvme_retry_count, OPT_INT, -1) // -1 means by default which is 4
-
-OPTION(objectstore_blackhole, OPT_BOOL, false)
-
-OPTION(bluefs_alloc_size, OPT_U64, 1048576)
-OPTION(bluefs_max_prefetch, OPT_U64, 1048576)
-OPTION(bluefs_min_log_runway, OPT_U64, 1048576) // alloc when we get this low
-OPTION(bluefs_max_log_runway, OPT_U64, 4194304) // alloc this much at a time
-OPTION(bluefs_log_compact_min_ratio, OPT_FLOAT, 5.0) // before we consider
-OPTION(bluefs_log_compact_min_size, OPT_U64, 16*1048576) // before we consider
-OPTION(bluefs_min_flush_size, OPT_U64, 524288) // ignore flush until its this big
-OPTION(bluefs_compact_log_sync, OPT_BOOL, false) // sync or async log compaction?
-OPTION(bluefs_buffered_io, OPT_BOOL, false)
-OPTION(bluefs_sync_write, OPT_BOOL, false)
-OPTION(bluefs_allocator, OPT_STR, "bitmap") // stupid | bitmap
-OPTION(bluefs_preextend_wal_files, OPT_BOOL, false) // this *requires* that rocksdb has recycling enabled
-
-OPTION(bluestore_bluefs, OPT_BOOL, true)
-OPTION(bluestore_bluefs_env_mirror, OPT_BOOL, false) // mirror to normal Env for debug
-OPTION(bluestore_bluefs_min, OPT_U64, 1*1024*1024*1024) // 1gb
-OPTION(bluestore_bluefs_min_ratio, OPT_FLOAT, .02) // min fs free / total free
-OPTION(bluestore_bluefs_max_ratio, OPT_FLOAT, .90) // max fs free / total free
-OPTION(bluestore_bluefs_gift_ratio, OPT_FLOAT, .02) // how much to add at a time
-OPTION(bluestore_bluefs_reclaim_ratio, OPT_FLOAT, .20) // how much to reclaim at a time
-OPTION(bluestore_bluefs_balance_interval, OPT_FLOAT, 1) // how often (sec) to balance free space between bluefs and bluestore
-// If you want to use spdk driver, you need to specify NVMe serial number here
-// with "spdk:" prefix.
-// Users can use 'lspci -vvv -d 8086:0953 | grep "Device Serial Number"' to
-// get the serial number of Intel(R) Fultondale NVMe controllers.
-// Example:
-// bluestore_block_path = spdk:55cd2e404bd73932
-// If you want to run multiple SPDK instances per node, you must specify the
-// amount of dpdk memory size in MB each instance will use, to make sure each
-// instance uses its own dpdk memory
-OPTION(bluestore_spdk_mem, OPT_U32, 512)
-// A hexadecimal bit mask of the cores to run on. Note the core numbering can change between platforms and should be determined beforehand.
-OPTION(bluestore_spdk_coremask, OPT_STR, "0x3")
-// Specify the maximal I/Os to be batched completed while checking queue pair completions.
-// Default value 0 means that let SPDK nvme library determine the value.
-OPTION(bluestore_spdk_max_io_completion, OPT_U32, 0)
-OPTION(bluestore_block_path, OPT_STR, "")
-OPTION(bluestore_block_size, OPT_U64, 10 * 1024*1024*1024) // 10gb for testing
-OPTION(bluestore_block_create, OPT_BOOL, true)
-OPTION(bluestore_block_db_path, OPT_STR, "")
-OPTION(bluestore_block_db_size, OPT_U64, 0) // rocksdb ssts (hot/warm)
-OPTION(bluestore_block_db_create, OPT_BOOL, false)
-OPTION(bluestore_block_wal_path, OPT_STR, "")
-OPTION(bluestore_block_wal_size, OPT_U64, 96 * 1024*1024) // rocksdb wal
-OPTION(bluestore_block_wal_create, OPT_BOOL, false)
-OPTION(bluestore_block_preallocate_file, OPT_BOOL, false) //whether preallocate space if block/db_path/wal_path is file rather that block device.
-OPTION(bluestore_csum_type, OPT_STR, "crc32c") // none|xxhash32|xxhash64|crc32c|crc32c_16|crc32c_8
-OPTION(bluestore_csum_min_block, OPT_U32, 4096)
-OPTION(bluestore_csum_max_block, OPT_U32, 64*1024)
-OPTION(bluestore_min_alloc_size, OPT_U32, 0)
-OPTION(bluestore_min_alloc_size_hdd, OPT_U32, 64*1024)
-OPTION(bluestore_min_alloc_size_ssd, OPT_U32, 16*1024)
-OPTION(bluestore_max_alloc_size, OPT_U32, 0)
-OPTION(bluestore_prefer_deferred_size, OPT_U32, 0)
-OPTION(bluestore_prefer_deferred_size_hdd, OPT_U32, 32768)
-OPTION(bluestore_prefer_deferred_size_ssd, OPT_U32, 0)
-OPTION(bluestore_compression_mode, OPT_STR, "none") // force|aggressive|passive|none
-OPTION(bluestore_compression_algorithm, OPT_STR, "snappy")
-OPTION(bluestore_compression_min_blob_size, OPT_U32, 0)
-OPTION(bluestore_compression_min_blob_size_hdd, OPT_U32, 128*1024)
-OPTION(bluestore_compression_min_blob_size_ssd, OPT_U32, 8*1024)
-OPTION(bluestore_compression_max_blob_size, OPT_U32, 0)
-OPTION(bluestore_compression_max_blob_size_hdd, OPT_U32, 512*1024)
-OPTION(bluestore_compression_max_blob_size_ssd, OPT_U32, 64*1024)
-/*
- * Specifies minimum expected amount of saved allocation units
- * per single blob to enable compressed blobs garbage collection
- *
- */
-OPTION(bluestore_gc_enable_blob_threshold, OPT_INT, 0)
-/*
- * Specifies minimum expected amount of saved allocation units
- * per all blobsb to enable compressed blobs garbage collection
- *
- */
-OPTION(bluestore_gc_enable_total_threshold, OPT_INT, 0)
-
-OPTION(bluestore_max_blob_size, OPT_U32, 0)
-OPTION(bluestore_max_blob_size_hdd, OPT_U32, 512*1024)
-OPTION(bluestore_max_blob_size_ssd, OPT_U32, 64*1024)
-/*
- * Require the net gain of compression at least to be at this ratio,
- * otherwise we don't compress.
- * And ask for compressing at least 12.5%(1/8) off, by default.
- */
-OPTION(bluestore_compression_required_ratio, OPT_DOUBLE, .875)
-OPTION(bluestore_extent_map_shard_max_size, OPT_U32, 1200)
-OPTION(bluestore_extent_map_shard_target_size, OPT_U32, 500)
-OPTION(bluestore_extent_map_shard_min_size, OPT_U32, 150)
-OPTION(bluestore_extent_map_shard_target_size_slop, OPT_DOUBLE, .2)
-OPTION(bluestore_extent_map_inline_shard_prealloc_size, OPT_U32, 256)
-OPTION(bluestore_cache_trim_interval, OPT_DOUBLE, .2)
-OPTION(bluestore_cache_trim_max_skip_pinned, OPT_U32, 64) // skip this many onodes pinned in cache before we give up
-OPTION(bluestore_cache_type, OPT_STR, "2q") // lru, 2q
-OPTION(bluestore_2q_cache_kin_ratio, OPT_DOUBLE, .5) // kin page slot size / max page slot size
-OPTION(bluestore_2q_cache_kout_ratio, OPT_DOUBLE, .5) // number of kout page slot / total number of page slot
-OPTION(bluestore_cache_size, OPT_U64, 0)
-OPTION(bluestore_cache_size_hdd, OPT_U64, 1*1024*1024*1024)
-OPTION(bluestore_cache_size_ssd, OPT_U64, 3*1024*1024*1024)
-OPTION(bluestore_cache_meta_ratio, OPT_DOUBLE, .01)
-OPTION(bluestore_cache_kv_ratio, OPT_DOUBLE, .99)
-OPTION(bluestore_cache_kv_max, OPT_U64, 512*1024*1024) // limit the maximum amount of cache for the kv store
-OPTION(bluestore_kvbackend, OPT_STR, "rocksdb")
-OPTION(bluestore_allocator, OPT_STR, "bitmap") // stupid | bitmap
-OPTION(bluestore_freelist_blocks_per_key, OPT_INT, 128)
-OPTION(bluestore_bitmapallocator_blocks_per_zone, OPT_INT, 1024) // must be power of 2 aligned, e.g., 512, 1024, 2048...
-OPTION(bluestore_bitmapallocator_span_size, OPT_INT, 1024) // must be power of 2 aligned, e.g., 512, 1024, 2048...
-OPTION(bluestore_max_deferred_txc, OPT_U64, 32)
-OPTION(bluestore_rocksdb_options, OPT_STR, "compression=kNoCompression,max_write_buffer_number=4,min_write_buffer_number_to_merge=1,recycle_log_file_num=4,write_buffer_size=268435456,writable_file_max_buffer_size=0,compaction_readahead_size=2097152")
-OPTION(bluestore_fsck_on_mount, OPT_BOOL, false)
-OPTION(bluestore_fsck_on_mount_deep, OPT_BOOL, true)
-OPTION(bluestore_fsck_on_umount, OPT_BOOL, false)
-OPTION(bluestore_fsck_on_umount_deep, OPT_BOOL, true)
-OPTION(bluestore_fsck_on_mkfs, OPT_BOOL, true)
-OPTION(bluestore_fsck_on_mkfs_deep, OPT_BOOL, false)
-OPTION(bluestore_sync_submit_transaction, OPT_BOOL, false) // submit kv txn in queueing thread (not kv_sync_thread)
-OPTION(bluestore_throttle_bytes, OPT_U64, 64*1024*1024)
-OPTION(bluestore_throttle_deferred_bytes, OPT_U64, 128*1024*1024)
-OPTION(bluestore_throttle_cost_per_io_hdd, OPT_U64, 670000)
-OPTION(bluestore_throttle_cost_per_io_ssd, OPT_U64, 4000)
-OPTION(bluestore_throttle_cost_per_io, OPT_U64, 0)
-OPTION(bluestore_deferred_batch_ops, OPT_U64, 0)
-OPTION(bluestore_deferred_batch_ops_hdd, OPT_U64, 64)
-OPTION(bluestore_deferred_batch_ops_ssd, OPT_U64, 16)
-OPTION(bluestore_nid_prealloc, OPT_INT, 1024)
-OPTION(bluestore_blobid_prealloc, OPT_U64, 10240)
-OPTION(bluestore_clone_cow, OPT_BOOL, true) // do copy-on-write for clones
-OPTION(bluestore_default_buffered_read, OPT_BOOL, true)
-OPTION(bluestore_default_buffered_write, OPT_BOOL, false)
-OPTION(bluestore_debug_misc, OPT_BOOL, false)
-OPTION(bluestore_debug_no_reuse_blocks, OPT_BOOL, false)
-OPTION(bluestore_debug_small_allocations, OPT_INT, 0)
-OPTION(bluestore_debug_freelist, OPT_BOOL, false)
-OPTION(bluestore_debug_prefill, OPT_FLOAT, 0)
-OPTION(bluestore_debug_prefragment_max, OPT_INT, 1048576)
-OPTION(bluestore_debug_inject_read_err, OPT_BOOL, false)
-OPTION(bluestore_debug_randomize_serial_transaction, OPT_INT, 0)
-OPTION(bluestore_debug_omit_block_device_write, OPT_BOOL, false)
-OPTION(bluestore_debug_fsck_abort, OPT_BOOL, false)
-OPTION(bluestore_debug_omit_kv_commit, OPT_BOOL, false)
-OPTION(bluestore_debug_permit_any_bdev_label, OPT_BOOL, false)
-OPTION(bluestore_shard_finishers, OPT_BOOL, false)
-OPTION(bluestore_debug_random_read_err, OPT_DOUBLE, 0)
-
-OPTION(kstore_max_ops, OPT_U64, 512)
-OPTION(kstore_max_bytes, OPT_U64, 64*1024*1024)
-OPTION(kstore_backend, OPT_STR, "rocksdb")
-OPTION(kstore_rocksdb_options, OPT_STR, "compression=kNoCompression")
-OPTION(kstore_rocksdb_bloom_bits_per_key, OPT_INT, 0)
-OPTION(kstore_fsck_on_mount, OPT_BOOL, false)
-OPTION(kstore_fsck_on_mount_deep, OPT_BOOL, true)
-OPTION(kstore_nid_prealloc, OPT_U64, 1024)
-OPTION(kstore_sync_transaction, OPT_BOOL, false)
-OPTION(kstore_sync_submit_transaction, OPT_BOOL, false)
-OPTION(kstore_onode_map_size, OPT_U64, 1024)
-OPTION(kstore_default_stripe_size, OPT_INT, 65536)
-
-OPTION(filestore_omap_backend, OPT_STR, "rocksdb")
-OPTION(filestore_omap_backend_path, OPT_STR, "")
-
-/// filestore wb throttle limits
-OPTION(filestore_wbthrottle_enable, OPT_BOOL, true)
-OPTION(filestore_wbthrottle_btrfs_bytes_start_flusher, OPT_U64, 41943040)
-OPTION(filestore_wbthrottle_btrfs_bytes_hard_limit, OPT_U64, 419430400)
-OPTION(filestore_wbthrottle_btrfs_ios_start_flusher, OPT_U64, 500)
-OPTION(filestore_wbthrottle_btrfs_ios_hard_limit, OPT_U64, 5000)
-OPTION(filestore_wbthrottle_btrfs_inodes_start_flusher, OPT_U64, 500)
-OPTION(filestore_wbthrottle_xfs_bytes_start_flusher, OPT_U64, 41943040)
-OPTION(filestore_wbthrottle_xfs_bytes_hard_limit, OPT_U64, 419430400)
-OPTION(filestore_wbthrottle_xfs_ios_start_flusher, OPT_U64, 500)
-OPTION(filestore_wbthrottle_xfs_ios_hard_limit, OPT_U64, 5000)
-OPTION(filestore_wbthrottle_xfs_inodes_start_flusher, OPT_U64, 500)
-
-/// These must be less than the fd limit
-OPTION(filestore_wbthrottle_btrfs_inodes_hard_limit, OPT_U64, 5000)
-OPTION(filestore_wbthrottle_xfs_inodes_hard_limit, OPT_U64, 5000)
-
-//Introduce a O_DSYNC write in the filestore
-OPTION(filestore_odsync_write, OPT_BOOL, false)
-
-// Tests index failure paths
-OPTION(filestore_index_retry_probability, OPT_DOUBLE, 0)
-
-// Allow object read error injection
-OPTION(filestore_debug_inject_read_err, OPT_BOOL, false)
-OPTION(filestore_debug_random_read_err, OPT_DOUBLE, 0)
-
-OPTION(filestore_debug_omap_check, OPT_BOOL, false) // Expensive debugging check on sync
-OPTION(filestore_omap_header_cache_size, OPT_INT, 1024)
-
-// Use omap for xattrs for attrs over
-// filestore_max_inline_xattr_size or
-OPTION(filestore_max_inline_xattr_size, OPT_U32, 0) //Override
-OPTION(filestore_max_inline_xattr_size_xfs, OPT_U32, 65536)
-OPTION(filestore_max_inline_xattr_size_btrfs, OPT_U32, 2048)
-OPTION(filestore_max_inline_xattr_size_other, OPT_U32, 512)
-
-// for more than filestore_max_inline_xattrs attrs
-OPTION(filestore_max_inline_xattrs, OPT_U32, 0) //Override
-OPTION(filestore_max_inline_xattrs_xfs, OPT_U32, 10)
-OPTION(filestore_max_inline_xattrs_btrfs, OPT_U32, 10)
-OPTION(filestore_max_inline_xattrs_other, OPT_U32, 2)
-
-// max xattr value size
-OPTION(filestore_max_xattr_value_size, OPT_U32, 0) //Override
-OPTION(filestore_max_xattr_value_size_xfs, OPT_U32, 64<<10)
-OPTION(filestore_max_xattr_value_size_btrfs, OPT_U32, 64<<10)
-// ext4 allows 4k xattrs total including some smallish extra fields and the
-// keys. We're allowing 2 512 inline attrs in addition some some filestore
-// replay attrs. After accounting for those, we still need to fit up to
-// two attrs of this value. That means we need this value to be around 1k
-// to be safe. This is hacky, but it's not worth complicating the code
-// to work around ext4's total xattr limit.
-OPTION(filestore_max_xattr_value_size_other, OPT_U32, 1<<10)
-
-OPTION(filestore_sloppy_crc, OPT_BOOL, false) // track sloppy crcs
-OPTION(filestore_sloppy_crc_block_size, OPT_INT, 65536)
-
-OPTION(filestore_max_alloc_hint_size, OPT_U64, 1ULL << 20) // bytes
-
-OPTION(filestore_max_sync_interval, OPT_DOUBLE, 5) // seconds
-OPTION(filestore_min_sync_interval, OPT_DOUBLE, .01) // seconds
-OPTION(filestore_btrfs_snap, OPT_BOOL, true)
-OPTION(filestore_btrfs_clone_range, OPT_BOOL, true)
-OPTION(filestore_zfs_snap, OPT_BOOL, false) // zfsonlinux is still unstable
-OPTION(filestore_fsync_flushes_journal_data, OPT_BOOL, false)
-OPTION(filestore_fiemap, OPT_BOOL, false) // (try to) use fiemap
-OPTION(filestore_punch_hole, OPT_BOOL, false)
-OPTION(filestore_seek_data_hole, OPT_BOOL, false) // (try to) use seek_data/hole
-OPTION(filestore_splice, OPT_BOOL, false)
-OPTION(filestore_fadvise, OPT_BOOL, true)
-//collect device partition information for management application to use
-OPTION(filestore_collect_device_partition_information, OPT_BOOL, true)
-
-// (try to) use extsize for alloc hint NOTE: extsize seems to trigger
-// data corruption in xfs prior to kernel 3.5. filestore will
-// implicity disable this if it cannot confirm the kernel is newer
-// than that.
-// NOTE: This option involves a tradeoff: When disabled, fragmentation is
-// worse, but large sequential writes are faster. When enabled, large
-// sequential writes are slower, but fragmentation is reduced.
-OPTION(filestore_xfs_extsize, OPT_BOOL, false)
-
-OPTION(filestore_journal_parallel, OPT_BOOL, false)
-OPTION(filestore_journal_writeahead, OPT_BOOL, false)
-OPTION(filestore_journal_trailing, OPT_BOOL, false)
-OPTION(filestore_queue_max_ops, OPT_U64, 50)
-OPTION(filestore_queue_max_bytes, OPT_U64, 100 << 20)
-
-OPTION(filestore_caller_concurrency, OPT_INT, 10)
-
-/// Expected filestore throughput in B/s
-OPTION(filestore_expected_throughput_bytes, OPT_DOUBLE, 200 << 20)
-/// Expected filestore throughput in ops/s
-OPTION(filestore_expected_throughput_ops, OPT_DOUBLE, 200)
-
-/// Filestore max delay multiple. Defaults to 0 (disabled)
-OPTION(filestore_queue_max_delay_multiple, OPT_DOUBLE, 0)
-/// Filestore high delay multiple. Defaults to 0 (disabled)
-OPTION(filestore_queue_high_delay_multiple, OPT_DOUBLE, 0)
-
-/// Use above to inject delays intended to keep the op queue between low and high
-OPTION(filestore_queue_low_threshhold, OPT_DOUBLE, 0.3)
-OPTION(filestore_queue_high_threshhold, OPT_DOUBLE, 0.9)
-
-OPTION(filestore_op_threads, OPT_INT, 2)
-OPTION(filestore_op_thread_timeout, OPT_INT, 60)
-OPTION(filestore_op_thread_suicide_timeout, OPT_INT, 180)
-OPTION(filestore_commit_timeout, OPT_FLOAT, 600)
-OPTION(filestore_fiemap_threshold, OPT_INT, 4096)
-OPTION(filestore_merge_threshold, OPT_INT, 10)
-OPTION(filestore_split_multiple, OPT_INT, 2)
-OPTION(filestore_split_rand_factor, OPT_U32, 20) // randomize the split threshold by adding 16 * [0, rand_factor)
-OPTION(filestore_update_to, OPT_INT, 1000)
-OPTION(filestore_blackhole, OPT_BOOL, false) // drop any new transactions on the floor
-OPTION(filestore_fd_cache_size, OPT_INT, 128) // FD lru size
-OPTION(filestore_fd_cache_shards, OPT_INT, 16) // FD number of shards
-OPTION(filestore_ondisk_finisher_threads, OPT_INT, 1)
-OPTION(filestore_apply_finisher_threads, OPT_INT, 1)
-OPTION(filestore_dump_file, OPT_STR, "") // file onto which store transaction dumps
-OPTION(filestore_kill_at, OPT_INT, 0) // inject a failure at the n'th opportunity
-OPTION(filestore_inject_stall, OPT_INT, 0) // artificially stall for N seconds in op queue thread
-OPTION(filestore_fail_eio, OPT_BOOL, true) // fail/crash on EIO
-OPTION(filestore_debug_verify_split, OPT_BOOL, false)
-OPTION(journal_dio, OPT_BOOL, true)
-OPTION(journal_aio, OPT_BOOL, true)
-OPTION(journal_force_aio, OPT_BOOL, false)
-OPTION(journal_block_size, OPT_INT, 4096)
-
-// max bytes to search ahead in journal searching for corruption
-OPTION(journal_max_corrupt_search, OPT_U64, 10<<20)
-OPTION(journal_block_align, OPT_BOOL, true)
-OPTION(journal_write_header_frequency, OPT_U64, 0)
-OPTION(journal_max_write_bytes, OPT_INT, 10 << 20)
-OPTION(journal_max_write_entries, OPT_INT, 100)
-
-/// Target range for journal fullness
-OPTION(journal_throttle_low_threshhold, OPT_DOUBLE, 0.6)
-OPTION(journal_throttle_high_threshhold, OPT_DOUBLE, 0.9)
-
-/// Multiple over expected at high_threshhold. Defaults to 0 (disabled).
-OPTION(journal_throttle_high_multiple, OPT_DOUBLE, 0)
-/// Multiple over expected at max. Defaults to 0 (disabled).
-OPTION(journal_throttle_max_multiple, OPT_DOUBLE, 0)
-
-OPTION(journal_align_min_size, OPT_INT, 64 << 10) // align data payloads >= this.
-OPTION(journal_replay_from, OPT_INT, 0)
-OPTION(journal_zero_on_create, OPT_BOOL, false)
-OPTION(journal_ignore_corruption, OPT_BOOL, false) // assume journal is not corrupt
-OPTION(journal_discard, OPT_BOOL, false) //using ssd disk as journal, whether support discard nouse journal-data.
-
-OPTION(fio_dir, OPT_STR, "/tmp/fio") // fio data directory for fio-objectstore
-
-OPTION(rados_mon_op_timeout, OPT_DOUBLE, 0) // how many seconds to wait for a response from the monitor before returning an error from a rados operation. 0 means no limit.
-OPTION(rados_osd_op_timeout, OPT_DOUBLE, 0) // how many seconds to wait for a response from osds before returning an error from a rados operation. 0 means no limit.
-OPTION(rados_tracing, OPT_BOOL, false) // true if LTTng-UST tracepoints should be enabled
-
-OPTION(rbd_op_threads, OPT_INT, 1)
-OPTION(rbd_op_thread_timeout, OPT_INT, 60)
-OPTION(rbd_non_blocking_aio, OPT_BOOL, true) // process AIO ops from a worker thread to prevent blocking
-OPTION(rbd_cache, OPT_BOOL, true) // whether to enable caching (writeback unless rbd_cache_max_dirty is 0)
-OPTION(rbd_cache_writethrough_until_flush, OPT_BOOL, true) // whether to make writeback caching writethrough until flush is called, to be sure the user of librbd will send flushs so that writeback is safe
-OPTION(rbd_cache_size, OPT_LONGLONG, 32<<20) // cache size in bytes
-OPTION(rbd_cache_max_dirty, OPT_LONGLONG, 24<<20) // dirty limit in bytes - set to 0 for write-through caching
-OPTION(rbd_cache_target_dirty, OPT_LONGLONG, 16<<20) // target dirty limit in bytes
-OPTION(rbd_cache_max_dirty_age, OPT_FLOAT, 1.0) // seconds in cache before writeback starts
-OPTION(rbd_cache_max_dirty_object, OPT_INT, 0) // dirty limit for objects - set to 0 for auto calculate from rbd_cache_size
-OPTION(rbd_cache_block_writes_upfront, OPT_BOOL, false) // whether to block writes to the cache before the aio_write call completes (true), or block before the aio completion is called (false)
-OPTION(rbd_concurrent_management_ops, OPT_INT, 10) // how many operations can be in flight for a management operation like deleting or resizing an image
-OPTION(rbd_balance_snap_reads, OPT_BOOL, false)
-OPTION(rbd_localize_snap_reads, OPT_BOOL, false)
-OPTION(rbd_balance_parent_reads, OPT_BOOL, false)
-OPTION(rbd_localize_parent_reads, OPT_BOOL, true)
-OPTION(rbd_readahead_trigger_requests, OPT_INT, 10) // number of sequential requests necessary to trigger readahead
-OPTION(rbd_readahead_max_bytes, OPT_LONGLONG, 512 * 1024) // set to 0 to disable readahead
-OPTION(rbd_readahead_disable_after_bytes, OPT_LONGLONG, 50 * 1024 * 1024) // how many bytes are read in total before readahead is disabled
-OPTION(rbd_clone_copy_on_read, OPT_BOOL, false)
-OPTION(rbd_blacklist_on_break_lock, OPT_BOOL, true) // whether to blacklist clients whose lock was broken
-OPTION(rbd_blacklist_expire_seconds, OPT_INT, 0) // number of seconds to blacklist - set to 0 for OSD default
-OPTION(rbd_request_timed_out_seconds, OPT_INT, 30) // number of seconds before maint request times out
-OPTION(rbd_skip_partial_discard, OPT_BOOL, false) // when trying to discard a range inside an object, set to true to skip zeroing the range.
-OPTION(rbd_enable_alloc_hint, OPT_BOOL, true) // when writing a object, it will issue a hint to osd backend to indicate the expected size object need
-OPTION(rbd_tracing, OPT_BOOL, false) // true if LTTng-UST tracepoints should be enabled
-OPTION(rbd_blkin_trace_all, OPT_BOOL, false) // create a blkin trace for all RBD requests
-OPTION(rbd_validate_pool, OPT_BOOL, true) // true if empty pools should be validated for RBD compatibility
-OPTION(rbd_validate_names, OPT_BOOL, true) // true if image specs should be validated
-OPTION(rbd_auto_exclusive_lock_until_manual_request, OPT_BOOL, true) // whether to automatically acquire/release exclusive lock until it is explicitly requested, i.e. before we know the user of librbd is properly using the lock API
-OPTION(rbd_mirroring_resync_after_disconnect, OPT_BOOL, false) // automatically start image resync after mirroring is disconnected due to being laggy
-OPTION(rbd_mirroring_replay_delay, OPT_INT, 0) // time-delay in seconds for rbd-mirror asynchronous replication
-
-OPTION(rbd_default_pool, OPT_STR, "rbd") // default pool for storing images
-OPTION_VALIDATOR(rbd_default_pool)
-
-/*
- * The following options change the behavior for librbd's image creation methods that
- * don't require all of the parameters. These are provided so that older programs
- * can take advantage of newer features without being rewritten to use new versions
- * of the image creation functions.
- *
- * rbd_create()/RBD::create() are affected by all of these options.
- *
- * rbd_create2()/RBD::create2() and rbd_clone()/RBD::clone() are affected by:
- * - rbd_default_order
- * - rbd_default_stripe_count
- * - rbd_default_stripe_size
- *
- * rbd_create3()/RBD::create3() and rbd_clone2/RBD::clone2() are only
- * affected by rbd_default_order.
- */
-OPTION(rbd_default_format, OPT_INT, 2)
-OPTION(rbd_default_order, OPT_INT, 22)
-OPTION(rbd_default_stripe_count, OPT_U64, 0) // changing requires stripingv2 feature
-OPTION(rbd_default_stripe_unit, OPT_U64, 0) // changing to non-object size requires stripingv2 feature
-OPTION(rbd_default_data_pool, OPT_STR, "") // optional default pool for storing image data blocks
-OPTION_VALIDATOR(rbd_default_data_pool)
-
-/**
- * RBD features are only applicable for v2 images. This setting accepts either
- * an integer bitmask value or comma-delimited string of RBD feature names.
- * This setting is always internally stored as an integer bitmask value. The
- * mapping between feature bitmask value and feature name is as follows:
- *
- * +1 -> layering
- * +2 -> striping
- * +4 -> exclusive-lock
- * +8 -> object-map
- * +16 -> fast-diff
- * +32 -> deep-flatten
- * +64 -> journaling
- * +128 -> data-pool
- */
-SAFE_OPTION(rbd_default_features, OPT_STR, "layering,exclusive-lock,object-map,fast-diff,deep-flatten")
-OPTION_VALIDATOR(rbd_default_features)
-
-OPTION(rbd_default_map_options, OPT_STR, "") // default rbd map -o / --options
-
-/**
- * RBD journal options.
- */
-OPTION(rbd_journal_order, OPT_U32, 24) // bits to shift to compute journal object max size, between 12 and 64
-OPTION(rbd_journal_splay_width, OPT_U32, 4) // number of active journal objects
-OPTION(rbd_journal_commit_age, OPT_DOUBLE, 5) // commit time interval, seconds
-OPTION(rbd_journal_object_flush_interval, OPT_INT, 0) // maximum number of pending commits per journal object
-OPTION(rbd_journal_object_flush_bytes, OPT_INT, 0) // maximum number of pending bytes per journal object
-OPTION(rbd_journal_object_flush_age, OPT_DOUBLE, 0) // maximum age (in seconds) for pending commits
-OPTION(rbd_journal_pool, OPT_STR, "") // pool for journal objects
-OPTION(rbd_journal_max_payload_bytes, OPT_U32, 16384) // maximum journal payload size before splitting
-OPTION(rbd_journal_max_concurrent_object_sets, OPT_INT, 0) // maximum number of object sets a journal client can be behind before it is automatically unregistered
-
-/**
- * RBD Mirror options
- */
-OPTION(rbd_mirror_journal_commit_age, OPT_DOUBLE, 5) // commit time interval, seconds
-OPTION(rbd_mirror_journal_poll_age, OPT_DOUBLE, 5) // maximum age (in seconds) between successive journal polls
-OPTION(rbd_mirror_journal_max_fetch_bytes, OPT_U32, 32768) // maximum bytes to read from each journal data object per fetch
-OPTION(rbd_mirror_sync_point_update_age, OPT_DOUBLE, 30) // number of seconds between each update of the image sync point object number
-OPTION(rbd_mirror_concurrent_image_syncs, OPT_U32, 5) // maximum number of image syncs in parallel
-OPTION(rbd_mirror_pool_replayers_refresh_interval, OPT_INT, 30) // interval to refresh peers in rbd-mirror daemon
-OPTION(rbd_mirror_delete_retry_interval, OPT_DOUBLE, 30) // interval to check and retry the failed requests in deleter
-OPTION(rbd_mirror_image_state_check_interval, OPT_INT, 30) // interval to get images from pool watcher and set sources in replayer
-OPTION(rbd_mirror_leader_heartbeat_interval, OPT_INT, 5) // interval (in seconds) between mirror leader heartbeats
-OPTION(rbd_mirror_leader_max_missed_heartbeats, OPT_INT, 2) // number of missed heartbeats for non-lock owner to attempt to acquire lock
-OPTION(rbd_mirror_leader_max_acquire_attempts_before_break, OPT_INT, 3) // number of failed attempts to acquire lock after missing heartbeats before breaking lock
-
-OPTION(nss_db_path, OPT_STR, "") // path to nss db
-
-
-OPTION(rgw_max_chunk_size, OPT_INT, 4 * 1024 * 1024)
-OPTION(rgw_put_obj_min_window_size, OPT_INT, 16 * 1024 * 1024)
-OPTION(rgw_put_obj_max_window_size, OPT_INT, 64 * 1024 * 1024)
-OPTION(rgw_max_put_size, OPT_U64, 5ULL*1024*1024*1024)
-OPTION(rgw_max_put_param_size, OPT_U64, 1 * 1024 * 1024) // max input size for PUT requests accepting json/xml params
-
-/**
- * override max bucket index shards in zone configuration (if not zero)
- *
- * Represents the number of shards for the bucket index object, a value of zero
- * indicates there is no sharding. By default (no sharding, the name of the object
- * is '.dir.{marker}', with sharding, the name is '.dir.{markder}.{sharding_id}',
- * sharding_id is zero-based value. It is not recommended to set a too large value
- * (e.g. thousand) as it increases the cost for bucket listing.
- */
-OPTION(rgw_override_bucket_index_max_shards, OPT_U32, 0)
-
-/**
- * Represents the maximum AIO pending requests for the bucket index object shards.
- */
-OPTION(rgw_bucket_index_max_aio, OPT_U32, 8)
-
-/**
- * whether or not the quota/gc threads should be started
- */
-OPTION(rgw_enable_quota_threads, OPT_BOOL, true)
-OPTION(rgw_enable_gc_threads, OPT_BOOL, true)
-OPTION(rgw_enable_lc_threads, OPT_BOOL, true)
-
-
-OPTION(rgw_data, OPT_STR, "/var/lib/ceph/radosgw/$cluster-$id")
-OPTION(rgw_enable_apis, OPT_STR, "s3, s3website, swift, swift_auth, admin")
-OPTION(rgw_cache_enabled, OPT_BOOL, true) // rgw cache enabled
-OPTION(rgw_cache_lru_size, OPT_INT, 10000) // num of entries in rgw cache
-OPTION(rgw_socket_path, OPT_STR, "") // path to unix domain socket, if not specified, rgw will not run as external fcgi
-OPTION(rgw_host, OPT_STR, "") // host for radosgw, can be an IP, default is 0.0.0.0
-OPTION(rgw_port, OPT_STR, "") // port to listen, format as "8080" "5000", if not specified, rgw will not run external fcgi
-OPTION(rgw_dns_name, OPT_STR, "") // hostname suffix on buckets
-OPTION(rgw_dns_s3website_name, OPT_STR, "") // hostname suffix on buckets for s3-website endpoint
-OPTION(rgw_content_length_compat, OPT_BOOL, false) // Check both HTTP_CONTENT_LENGTH and CONTENT_LENGTH in fcgi env
-OPTION(rgw_lifecycle_work_time, OPT_STR, "00:00-06:00") //job process lc at 00:00-06:00s
-OPTION(rgw_lc_lock_max_time, OPT_INT, 60) // total run time for a single lc processor work
-OPTION(rgw_lc_max_objs, OPT_INT, 32)
-OPTION(rgw_lc_debug_interval, OPT_INT, -1) // Debug run interval, in seconds
-OPTION(rgw_script_uri, OPT_STR, "") // alternative value for SCRIPT_URI if not set in request
-OPTION(rgw_request_uri, OPT_STR, "") // alternative value for REQUEST_URI if not set in request
-OPTION(rgw_swift_url, OPT_STR, "") // the swift url, being published by the internal swift auth
-OPTION(rgw_swift_url_prefix, OPT_STR, "swift") // entry point for which a url is considered a swift url
-OPTION(rgw_swift_auth_url, OPT_STR, "") // default URL to go and verify tokens for v1 auth (if not using internal swift auth)
-OPTION(rgw_swift_auth_entry, OPT_STR, "auth") // entry point for which a url is considered a swift auth url
-OPTION(rgw_swift_tenant_name, OPT_STR, "") // tenant name to use for swift access
-OPTION(rgw_swift_account_in_url, OPT_BOOL, false) // assume that URL always contain the account (aka tenant) part
-OPTION(rgw_swift_enforce_content_length, OPT_BOOL, false) // enforce generation of Content-Length even in cost of performance or scalability
-OPTION(rgw_keystone_url, OPT_STR, "") // url for keystone server
-OPTION(rgw_keystone_admin_token, OPT_STR, "") // keystone admin token (shared secret)
-OPTION(rgw_keystone_admin_user, OPT_STR, "") // keystone admin user name
-OPTION(rgw_keystone_admin_password, OPT_STR, "") // keystone admin user password
-OPTION(rgw_keystone_admin_tenant, OPT_STR, "") // keystone admin user tenant (for keystone v2.0)
-OPTION(rgw_keystone_admin_project, OPT_STR, "") // keystone admin user project (for keystone v3)
-OPTION(rgw_keystone_admin_domain, OPT_STR, "") // keystone admin user domain
-OPTION(rgw_keystone_barbican_user, OPT_STR, "") // keystone user to access barbican secrets
-OPTION(rgw_keystone_barbican_password, OPT_STR, "") // keystone password for barbican user
-OPTION(rgw_keystone_barbican_tenant, OPT_STR, "") // keystone barbican user tenant (for keystone v2.0)
-OPTION(rgw_keystone_barbican_project, OPT_STR, "") // keystone barbican user project (for keystone v3)
-OPTION(rgw_keystone_barbican_domain, OPT_STR, "") // keystone barbican user domain
-OPTION(rgw_keystone_api_version, OPT_INT, 2) // Version of Keystone API to use (2 or 3)
-OPTION(rgw_keystone_accepted_roles, OPT_STR, "Member, admin") // roles required to serve requests
-OPTION(rgw_keystone_accepted_admin_roles, OPT_STR, "") // list of roles allowing an user to gain admin privileges
-OPTION(rgw_keystone_token_cache_size, OPT_INT, 10000) // max number of entries in keystone token cache
-OPTION(rgw_keystone_revocation_interval, OPT_INT, 15 * 60) // seconds between tokens revocation check
-OPTION(rgw_keystone_verify_ssl, OPT_BOOL, true) // should we try to verify keystone's ssl
-OPTION(rgw_keystone_implicit_tenants, OPT_BOOL, false) // create new users in their own tenants of the same name
-OPTION(rgw_cross_domain_policy, OPT_STR, "<allow-access-from domain=\"*\" secure=\"false\" />")
-OPTION(rgw_healthcheck_disabling_path, OPT_STR, "") // path that existence causes the healthcheck to respond 503
-OPTION(rgw_s3_auth_use_rados, OPT_BOOL, true) // should we try to use the internal credentials for s3?
-OPTION(rgw_s3_auth_use_keystone, OPT_BOOL, false) // should we try to use keystone for s3?
-OPTION(rgw_s3_auth_aws4_force_boto2_compat, OPT_BOOL, true) // force aws4 auth boto2 compatibility
-OPTION(rgw_barbican_url, OPT_STR, "") // url for barbican server
-
-/* OpenLDAP-style LDAP parameter strings */
-/* rgw_ldap_uri space-separated list of LDAP servers in URI format */
-OPTION(rgw_ldap_uri, OPT_STR, "ldaps://<ldap.your.domain>")
-/* rgw_ldap_binddn LDAP entry RGW will bind with (user match) */
-OPTION(rgw_ldap_binddn, OPT_STR, "uid=admin,cn=users,dc=example,dc=com")
-/* rgw_ldap_searchdn LDAP search base (basedn) */
-OPTION(rgw_ldap_searchdn, OPT_STR, "cn=users,cn=accounts,dc=example,dc=com")
-/* rgw_ldap_dnattr LDAP attribute containing RGW user names (to form binddns)*/
-OPTION(rgw_ldap_dnattr, OPT_STR, "uid")
-/* rgw_ldap_secret file containing credentials for rgw_ldap_binddn */
-OPTION(rgw_ldap_secret, OPT_STR, "/etc/openldap/secret")
-/* rgw_s3_auth_use_ldap use LDAP for RGW auth? */
-OPTION(rgw_s3_auth_use_ldap, OPT_BOOL, false)
-/* rgw_ldap_searchfilter LDAP search filter */
-OPTION(rgw_ldap_searchfilter, OPT_STR, "")
-
-OPTION(rgw_admin_entry, OPT_STR, "admin") // entry point for which a url is considered an admin request
-OPTION(rgw_enforce_swift_acls, OPT_BOOL, true)
-OPTION(rgw_swift_token_expiration, OPT_INT, 24 * 3600) // time in seconds for swift token expiration
-OPTION(rgw_print_continue, OPT_BOOL, true) // enable if 100-Continue works
-OPTION(rgw_print_prohibited_content_length, OPT_BOOL, false) // violate RFC 7230 and send Content-Length in 204 and 304
-OPTION(rgw_remote_addr_param, OPT_STR, "REMOTE_ADDR") // e.g. X-Forwarded-For, if you have a reverse proxy
-OPTION(rgw_op_thread_timeout, OPT_INT, 10*60)
-OPTION(rgw_op_thread_suicide_timeout, OPT_INT, 0)
-OPTION(rgw_thread_pool_size, OPT_INT, 100)
-OPTION(rgw_num_control_oids, OPT_INT, 8)
-OPTION(rgw_num_rados_handles, OPT_U32, 1)
-OPTION(rgw_verify_ssl, OPT_BOOL, true) // should http_client try to verify ssl when sent https request
-
-/* The following are tunables for caches of RGW NFS (and other file
- * client) objects.
- *
- * The file handle cache is a partitioned hash table
- * (fhcache_partitions), each with a closed hash part and backing
- * b-tree mapping. The number of partions is expected to be a small
- * prime, the cache size something larger but less than 5K, the total
- * size of the cache is n_part * cache_size.
- */
-OPTION(rgw_nfs_lru_lanes, OPT_INT, 5)
-OPTION(rgw_nfs_lru_lane_hiwat, OPT_INT, 911)
-OPTION(rgw_nfs_fhcache_partitions, OPT_INT, 3)
-OPTION(rgw_nfs_fhcache_size, OPT_INT, 2017) /* 3*2017=6051 */
-OPTION(rgw_nfs_namespace_expire_secs, OPT_INT, 300) /* namespace invalidate
- * timer */
-OPTION(rgw_nfs_max_gc, OPT_INT, 300) /* max gc events per cycle */
-OPTION(rgw_nfs_write_completion_interval_s, OPT_INT, 10) /* stateless (V3)
- * commit
- * delay */
-
-OPTION(rgw_zone, OPT_STR, "") // zone name
-OPTION(rgw_zone_root_pool, OPT_STR, ".rgw.root") // pool where zone specific info is stored
-OPTION(rgw_default_zone_info_oid, OPT_STR, "default.zone") // oid where default zone info is stored
-OPTION(rgw_region, OPT_STR, "") // region name
-OPTION(rgw_region_root_pool, OPT_STR, ".rgw.root") // pool where all region info is stored
-OPTION(rgw_default_region_info_oid, OPT_STR, "default.region") // oid where default region info is stored
-OPTION(rgw_zonegroup, OPT_STR, "") // zone group name
-OPTION(rgw_zonegroup_root_pool, OPT_STR, ".rgw.root") // pool where all zone group info is stored
-OPTION(rgw_default_zonegroup_info_oid, OPT_STR, "default.zonegroup") // oid where default zone group info is stored
-OPTION(rgw_realm, OPT_STR, "") // realm name
-OPTION(rgw_realm_root_pool, OPT_STR, ".rgw.root") // pool where all realm info is stored
-OPTION(rgw_default_realm_info_oid, OPT_STR, "default.realm") // oid where default realm info is stored
-OPTION(rgw_period_root_pool, OPT_STR, ".rgw.root") // pool where all period info is stored
-OPTION(rgw_period_latest_epoch_info_oid, OPT_STR, ".latest_epoch") // oid where current period info is stored
-OPTION(rgw_log_nonexistent_bucket, OPT_BOOL, false)
-OPTION(rgw_log_object_name, OPT_STR, "%Y-%m-%d-%H-%i-%n") // man date to see codes (a subset are supported)
-OPTION(rgw_log_object_name_utc, OPT_BOOL, false)
-OPTION(rgw_usage_max_shards, OPT_INT, 32)
-OPTION(rgw_usage_max_user_shards, OPT_INT, 1)
-OPTION(rgw_enable_ops_log, OPT_BOOL, false) // enable logging every rgw operation
-OPTION(rgw_enable_usage_log, OPT_BOOL, false) // enable logging bandwidth usage
-OPTION(rgw_ops_log_rados, OPT_BOOL, true) // whether ops log should go to rados
-OPTION(rgw_ops_log_socket_path, OPT_STR, "") // path to unix domain socket where ops log can go
-OPTION(rgw_ops_log_data_backlog, OPT_INT, 5 << 20) // max data backlog for ops log
-OPTION(rgw_fcgi_socket_backlog, OPT_INT, 1024) // socket backlog for fcgi
-OPTION(rgw_usage_log_flush_threshold, OPT_INT, 1024) // threshold to flush pending log data
-OPTION(rgw_usage_log_tick_interval, OPT_INT, 30) // flush pending log data every X seconds
-OPTION(rgw_intent_log_object_name, OPT_STR, "%Y-%m-%d-%i-%n") // man date to see codes (a subset are supported)
-OPTION(rgw_intent_log_object_name_utc, OPT_BOOL, false)
-OPTION(rgw_init_timeout, OPT_INT, 300) // time in seconds
-OPTION(rgw_mime_types_file, OPT_STR, "/etc/mime.types")
-OPTION(rgw_gc_max_objs, OPT_INT, 32)
-OPTION(rgw_gc_obj_min_wait, OPT_INT, 2 * 3600) // wait time before object may be handled by gc
-OPTION(rgw_gc_processor_max_time, OPT_INT, 3600) // total run time for a single gc processor work
-OPTION(rgw_gc_processor_period, OPT_INT, 3600) // gc processor cycle time
-OPTION(rgw_s3_success_create_obj_status, OPT_INT, 0) // alternative success status response for create-obj (0 - default)
-OPTION(rgw_resolve_cname, OPT_BOOL, false) // should rgw try to resolve hostname as a dns cname record
-OPTION(rgw_obj_stripe_size, OPT_INT, 4 << 20)
-OPTION(rgw_extended_http_attrs, OPT_STR, "") // list of extended attrs that can be set on objects (beyond the default)
-OPTION(rgw_exit_timeout_secs, OPT_INT, 120) // how many seconds to wait for process to go down before exiting unconditionally
-OPTION(rgw_get_obj_window_size, OPT_INT, 16 << 20) // window size in bytes for single get obj request
-OPTION(rgw_get_obj_max_req_size, OPT_INT, 4 << 20) // max length of a single get obj rados op
-OPTION(rgw_relaxed_s3_bucket_names, OPT_BOOL, false) // enable relaxed bucket name rules for US region buckets
-OPTION(rgw_defer_to_bucket_acls, OPT_STR, "") // if the user has bucket perms, use those before key perms (recurse and full_control)
-OPTION(rgw_list_buckets_max_chunk, OPT_INT, 1000) // max buckets to retrieve in a single op when listing user buckets
-OPTION(rgw_md_log_max_shards, OPT_INT, 64) // max shards for metadata log
-OPTION(rgw_num_zone_opstate_shards, OPT_INT, 128) // max shards for keeping inter-region copy progress info
-OPTION(rgw_opstate_ratelimit_sec, OPT_INT, 30) // min time between opstate updates on a single upload (0 for disabling ratelimit)
-OPTION(rgw_curl_wait_timeout_ms, OPT_INT, 1000) // timeout for certain curl calls
-OPTION(rgw_copy_obj_progress, OPT_BOOL, true) // should dump progress during long copy operations?
-OPTION(rgw_copy_obj_progress_every_bytes, OPT_INT, 1024 * 1024) // min bytes between copy progress output
-OPTION(rgw_obj_tombstone_cache_size, OPT_INT, 1000) // how many objects in tombstone cache, which is used in multi-zone sync to keep
- // track of removed objects' mtime
-
-OPTION(rgw_data_log_window, OPT_INT, 30) // data log entries window (in seconds)
-OPTION(rgw_data_log_changes_size, OPT_INT, 1000) // number of in-memory entries to hold for data changes log
-OPTION(rgw_data_log_num_shards, OPT_INT, 128) // number of objects to keep data changes log on
-OPTION(rgw_data_log_obj_prefix, OPT_STR, "data_log") //
-OPTION(rgw_replica_log_obj_prefix, OPT_STR, "replica_log") //
-
-OPTION(rgw_bucket_quota_ttl, OPT_INT, 600) // time for cached bucket stats to be cached within rgw instance
-OPTION(rgw_bucket_quota_soft_threshold, OPT_DOUBLE, 0.95) // threshold from which we don't rely on cached info for quota decisions
-OPTION(rgw_bucket_quota_cache_size, OPT_INT, 10000) // number of entries in bucket quota cache
-OPTION(rgw_bucket_default_quota_max_objects, OPT_INT, -1) // number of objects allowed
-OPTION(rgw_bucket_default_quota_max_size, OPT_LONGLONG, -1) // Max size of object in bytes
-
-OPTION(rgw_expose_bucket, OPT_BOOL, false) // Return the bucket name in the 'Bucket' response header
-
-OPTION(rgw_frontends, OPT_STR, "civetweb port=7480") // rgw front ends
-
-OPTION(rgw_user_quota_bucket_sync_interval, OPT_INT, 180) // time period for accumulating modified buckets before syncing stats
-OPTION(rgw_user_quota_sync_interval, OPT_INT, 3600 * 24) // time period for accumulating modified buckets before syncing entire user stats
-OPTION(rgw_user_quota_sync_idle_users, OPT_BOOL, false) // whether stats for idle users be fully synced
-OPTION(rgw_user_quota_sync_wait_time, OPT_INT, 3600 * 24) // min time between two full stats sync for non-idle users
-OPTION(rgw_user_default_quota_max_objects, OPT_INT, -1) // number of objects allowed
-OPTION(rgw_user_default_quota_max_size, OPT_LONGLONG, -1) // Max size of object in bytes
-
-OPTION(rgw_multipart_min_part_size, OPT_INT, 5 * 1024 * 1024) // min size for each part (except for last one) in multipart upload
-OPTION(rgw_multipart_part_upload_limit, OPT_INT, 10000) // parts limit in multipart upload
-
-OPTION(rgw_max_slo_entries, OPT_INT, 1000) // default number of max entries in slo
-
-OPTION(rgw_olh_pending_timeout_sec, OPT_INT, 3600) // time until we retire a pending olh change
-OPTION(rgw_user_max_buckets, OPT_INT, 1000) // global option to set max buckets count for all user
-
-OPTION(rgw_objexp_gc_interval, OPT_U32, 60 * 10) // maximum time between round of expired objects garbage collecting
-OPTION(rgw_objexp_time_step, OPT_U32, 4096) // number of seconds for rounding the timestamps
-OPTION(rgw_objexp_hints_num_shards, OPT_U32, 127) // maximum number of parts in which the hint index is stored in
-OPTION(rgw_objexp_chunk_size, OPT_U32, 100) // maximum number of entries in a single operation when processing objexp data
-
-OPTION(rgw_enable_static_website, OPT_BOOL, false) // enable static website feature
-OPTION(rgw_log_http_headers, OPT_STR, "" ) // list of HTTP headers to log when seen, ignores case (e.g., http_x_forwarded_for
-
-OPTION(rgw_num_async_rados_threads, OPT_INT, 32) // num of threads to use for async rados operations
-OPTION(rgw_md_notify_interval_msec, OPT_INT, 200) // metadata changes notification interval to followers
-OPTION(rgw_run_sync_thread, OPT_BOOL, true) // whether radosgw (not radosgw-admin) spawns the sync thread
-OPTION(rgw_sync_lease_period, OPT_INT, 120) // time in second for lease that rgw takes on a specific log (or log shard)
-OPTION(rgw_sync_log_trim_interval, OPT_INT, 1200) // time in seconds between attempts to trim sync logs
-
-OPTION(rgw_sync_data_inject_err_probability, OPT_DOUBLE, 0) // range [0, 1]
-OPTION(rgw_sync_meta_inject_err_probability, OPT_DOUBLE, 0) // range [0, 1]
-
-
-OPTION(rgw_period_push_interval, OPT_DOUBLE, 2) // seconds to wait before retrying "period push"
-OPTION(rgw_period_push_interval_max, OPT_DOUBLE, 30) // maximum interval after exponential backoff
-
-OPTION(rgw_safe_max_objects_per_shard, OPT_INT, 100*1024) // safe max loading
-OPTION(rgw_shard_warning_threshold, OPT_DOUBLE, 90) // pct of safe max
- // at which to warn
-
-OPTION(rgw_swift_versioning_enabled, OPT_BOOL, false) // whether swift object versioning feature is enabled
-
-OPTION(mgr_module_path, OPT_STR, CEPH_PKGLIBDIR "/mgr") // where to load python modules from
-OPTION(mgr_initial_modules, OPT_STR, "restful status") // Which modules to load
-OPTION(mgr_data, OPT_STR, "/var/lib/ceph/mgr/$cluster-$id") // where to find keyring etc
-OPTION(mgr_tick_period, OPT_INT, 2) // How frequently to tick
-OPTION(mgr_stats_period, OPT_INT, 5) // How frequently clients send stats
-OPTION(mgr_client_bytes, OPT_U64, 128*1048576) // bytes from clients
-OPTION(mgr_client_messages, OPT_U64, 512) // messages from clients
-OPTION(mgr_osd_bytes, OPT_U64, 512*1048576) // bytes from osds
-OPTION(mgr_osd_messages, OPT_U64, 8192) // messages from osds
-OPTION(mgr_mds_bytes, OPT_U64, 128*1048576) // bytes from mdss
-OPTION(mgr_mds_messages, OPT_U64, 128) // messages from mdss
-OPTION(mgr_mon_bytes, OPT_U64, 128*1048576) // bytes from mons
-OPTION(mgr_mon_messages, OPT_U64, 128) // messages from mons
-
-OPTION(mgr_connect_retry_interval, OPT_DOUBLE, 1.0)
-OPTION(mgr_service_beacon_grace, OPT_DOUBLE, 60.0)
-
-OPTION(mon_mgr_digest_period, OPT_INT, 5) // How frequently to send digests
-OPTION(mon_mgr_beacon_grace, OPT_INT, 30) // How long to wait to failover
-OPTION(mon_mgr_inactive_grace, OPT_INT, 60) // How long before health WARN -> ERR
-OPTION(mon_mgr_mkfs_grace, OPT_INT, 60) // How long before we complain about MGR_DOWN
-OPTION(rgw_crypt_require_ssl, OPT_BOOL, true) // requests including encryption key headers must be sent over ssl
-OPTION(rgw_crypt_default_encryption_key, OPT_STR, "") // base64 encoded key for encryption of rgw objects
-OPTION(rgw_crypt_s3_kms_encryption_keys, OPT_STR, "") // extra keys that may be used for aws:kms
- // defined as map "key1=YmluCmJvb3N0CmJvb3N0LQ== key2=b3V0CnNyYwpUZXN0aW5nCg=="
-OPTION(rgw_crypt_suppress_logs, OPT_BOOL, true) // suppress logs that might print customer key
-OPTION(rgw_list_bucket_min_readahead, OPT_INT, 1000) // minimum number of entries to read from rados for bucket listing
-
-OPTION(rgw_rest_getusage_op_compat, OPT_BOOL, false) // dump description of total stats for s3 GetUsage API
-
-OPTION(mutex_perf_counter, OPT_BOOL, false) // enable/disable mutex perf counter
-OPTION(throttler_perf_counter, OPT_BOOL, true) // enable/disable throttler perf counter
-
-/* The following are tunables for torrent data */
-OPTION(rgw_torrent_flag, OPT_BOOL, false) // produce torrent function flag
-OPTION(rgw_torrent_tracker, OPT_STR, "") // torrent field annouce and annouce list
-OPTION(rgw_torrent_createby, OPT_STR, "") // torrent field created by
-OPTION(rgw_torrent_comment, OPT_STR, "") // torrent field comment
-OPTION(rgw_torrent_encoding, OPT_STR, "") // torrent field encoding
-OPTION(rgw_torrent_origin, OPT_STR, "") // torrent origin
-OPTION(rgw_torrent_sha_unit, OPT_INT, 512*1024) // torrent field piece length 512K
-
-OPTION(event_tracing, OPT_BOOL, false) // true if LTTng-UST tracepoints should be enabled
-
-// This will be set to true when it is safe to start threads.
-// Once it is true, it will never change.
-OPTION(internal_safe_to_start_threads, OPT_BOOL, false)
-
-OPTION(debug_deliberately_leak_memory, OPT_BOOL, false)
-
-OPTION(rgw_swift_custom_header, OPT_STR, "") // option to enable swift custom headers
-
-OPTION(rgw_swift_need_stats, OPT_BOOL, true) // option to enable stats on bucket listing for swift
-
-/* resharding tunables */
-OPTION(rgw_reshard_num_logs, OPT_INT, 16)
-OPTION(rgw_reshard_bucket_lock_duration, OPT_INT, 120) // duration of lock on bucket obj during resharding
-OPTION(rgw_dynamic_resharding, OPT_BOOL, true)
-OPTION(rgw_max_objs_per_shard, OPT_INT, 100000)
-OPTION(rgw_reshard_thread_interval, OPT_U32, 60 * 10) // maximum time between rounds of reshard thread processing
+++ /dev/null
-// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
-// vim: ts=8 sw=2 smarttab
-
-#include "common/config_validators.h"
-#include "include/stringify.h"
-
-#include <boost/algorithm/string.hpp>
-#include <boost/lexical_cast.hpp>
-#include <boost/regex.hpp>
-
-int validate(md_config_t::option_rbd_default_pool_t *,
- std::string *value, std::string *error_message) {
- boost::regex pattern("^[^@/]+$");
- if (!boost::regex_match (*value, pattern)) {
- *value = "rbd";
- *error_message = "invalid RBD default pool, resetting to 'rbd'";
- }
- return 0;
-}
-
-int validate(md_config_t::option_rbd_default_data_pool_t *,
- std::string *value, std::string *error_message) {
- boost::regex pattern("^[^@/]*$");
- if (!boost::regex_match (*value, pattern)) {
- *value = "";
- *error_message = "ignoring invalid RBD data pool";
- }
- return 0;
-}
-
-int validate(md_config_t::option_rbd_default_features_t *,
- std::string *value, std::string *error_message) {
- static const std::map<std::string, uint64_t> FEATURE_MAP = {
- {RBD_FEATURE_NAME_LAYERING, RBD_FEATURE_LAYERING},
- {RBD_FEATURE_NAME_STRIPINGV2, RBD_FEATURE_STRIPINGV2},
- {RBD_FEATURE_NAME_EXCLUSIVE_LOCK, RBD_FEATURE_EXCLUSIVE_LOCK},
- {RBD_FEATURE_NAME_OBJECT_MAP, RBD_FEATURE_OBJECT_MAP},
- {RBD_FEATURE_NAME_FAST_DIFF, RBD_FEATURE_FAST_DIFF},
- {RBD_FEATURE_NAME_DEEP_FLATTEN, RBD_FEATURE_DEEP_FLATTEN},
- {RBD_FEATURE_NAME_JOURNALING, RBD_FEATURE_JOURNALING},
- {RBD_FEATURE_NAME_DATA_POOL, RBD_FEATURE_DATA_POOL},
- };
- static_assert((RBD_FEATURE_DATA_POOL << 1) > RBD_FEATURES_ALL,
- "new RBD feature added");
-
- // convert user-friendly comma delimited feature name list to a bitmask
- // that is used by the librbd API
- uint64_t features = 0;
- error_message->clear();
-
- try {
- features = boost::lexical_cast<decltype(features)>(*value);
-
- uint64_t unsupported_features = (features & ~RBD_FEATURES_ALL);
- if (unsupported_features != 0ull) {
- features &= RBD_FEATURES_ALL;
-
- std::stringstream ss;
- ss << "ignoring unknown feature mask 0x"
- << std::hex << unsupported_features;
- *error_message = ss.str();
- }
- } catch (const boost::bad_lexical_cast& ) {
- int r = 0;
- std::vector<std::string> feature_names;
- boost::split(feature_names, *value, boost::is_any_of(","));
- for (auto feature_name: feature_names) {
- boost::trim(feature_name);
- auto feature_it = FEATURE_MAP.find(feature_name);
- if (feature_it != FEATURE_MAP.end()) {
- features += feature_it->second;
- } else {
- if (!error_message->empty()) {
- *error_message += ", ";
- }
- *error_message += "ignoring unknown feature " + feature_name;
- r = -EINVAL;
- }
- }
-
- if (features == 0 && r == -EINVAL) {
- features = RBD_FEATURES_DEFAULT;
- }
- }
- *value = stringify(features);
- return 0;
-}
-
+++ /dev/null
-// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
-// vim: ts=8 sw=2 smarttab
-
-#ifndef CEPH_CONFIG_VALIDATORS
-#define CEPH_CONFIG_VALIDATORS
-
-#include "config.h"
-#include <string>
-
-/**
- * Global config value validators for the Ceph project
- */
-
-int validate(md_config_t::option_rbd_default_pool_t *type,
- std::string *value, std::string *error_message);
-int validate(md_config_t::option_rbd_default_data_pool_t *type,
- std::string *value, std::string *error_message);
-int validate(md_config_t::option_rbd_default_features_t *type,
- std::string *value, std::string *error_message);
-
-#endif // CEPH_CONFIG_VALIDATORS
#ifndef CEPH_DOUT_H
#define CEPH_DOUT_H
+#include <type_traits>
+
#include "global/global_context.h"
#include "common/config.h"
#include "common/likely.h"
if (0) { \
char __array[((v >= -1) && (v <= 200)) ? 0 : -1] __attribute__((unused)); \
} \
- static size_t _log_exp_length=80; \
+ static size_t _log_exp_length = 80; \
ceph::logging::Entry *_dout_e = cct->_log->create_entry(v, sub, &_log_exp_length); \
ostream _dout_os(&_dout_e->m_streambuf); \
- CephContext *_dout_cct = cct; \
+ static_assert(std::is_convertible<decltype(&*cct), \
+ CephContext* >::value, \
+ "provided cct must be compatible with CephContext*"); \
+ auto _dout_cct = cct; \
std::ostream* _dout = &_dout_os;
#define lsubdout(cct, sub, v) dout_impl(cct, ceph_subsys_##sub, v) dout_prefix
--- /dev/null
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+
+/* note: no header guard */
+OPTION(host, OPT_STR) // "" means that ceph will use short hostname
+OPTION(fsid, OPT_UUID)
+OPTION(public_addr, OPT_ADDR)
+OPTION(public_bind_addr, OPT_ADDR)
+OPTION(cluster_addr, OPT_ADDR)
+OPTION(public_network, OPT_STR)
+OPTION(cluster_network, OPT_STR)
+OPTION(monmap, OPT_STR)
+OPTION(mon_host, OPT_STR)
+OPTION(mon_dns_srv_name, OPT_STR)
+OPTION(lockdep, OPT_BOOL)
+OPTION(lockdep_force_backtrace, OPT_BOOL) // always gather current backtrace at every lock
+OPTION(run_dir, OPT_STR) // the "/var/run/ceph" dir, created on daemon startup
+OPTION(admin_socket, OPT_STR) // default changed by common_preinit()
+OPTION(admin_socket_mode, OPT_STR) // permission bits to set for admin socket file, e.g., "0775", "0755"
+
+OPTION(daemonize, OPT_BOOL) // default changed by common_preinit()
+OPTION(setuser, OPT_STR) // uid or user name
+OPTION(setgroup, OPT_STR) // gid or group name
+OPTION(setuser_match_path, OPT_STR) // make setuser/group conditional on this path matching ownership
+OPTION(pid_file, OPT_STR) // default changed by common_preinit()
+OPTION(chdir, OPT_STR)
+OPTION(restapi_log_level, OPT_STR) // default set by Python code
+OPTION(restapi_base_url, OPT_STR) // default set by Python code
+OPTION(fatal_signal_handlers, OPT_BOOL)
+SAFE_OPTION(erasure_code_dir, OPT_STR) // default location for erasure-code plugins
+
+OPTION(log_file, OPT_STR) // default changed by common_preinit()
+OPTION(log_max_new, OPT_INT) // default changed by common_preinit()
+OPTION(log_max_recent, OPT_INT) // default changed by common_preinit()
+OPTION(log_to_stderr, OPT_BOOL) // default changed by common_preinit()
+OPTION(err_to_stderr, OPT_BOOL) // default changed by common_preinit()
+OPTION(log_to_syslog, OPT_BOOL)
+OPTION(err_to_syslog, OPT_BOOL)
+OPTION(log_flush_on_exit, OPT_BOOL) // default changed by common_preinit()
+OPTION(log_stop_at_utilization, OPT_FLOAT) // stop logging at (near) full
+OPTION(log_to_graylog, OPT_BOOL)
+OPTION(err_to_graylog, OPT_BOOL)
+OPTION(log_graylog_host, OPT_STR)
+OPTION(log_graylog_port, OPT_INT)
+
+// These options take k/v pairs, or a single item that is assumed to be the
+// general default for all channels.
+// e.g., "info" would be taken as the same as "default=info"
+// also, "default=daemon audit=local0" would mean
+// "default all to 'daemon', override 'audit' with 'local0'"
+OPTION(clog_to_monitors, OPT_STR)
+OPTION(clog_to_syslog, OPT_STR)
+OPTION(clog_to_syslog_level, OPT_STR) // this level and above
+OPTION(clog_to_syslog_facility, OPT_STR)
+OPTION(clog_to_graylog, OPT_STR)
+OPTION(clog_to_graylog_host, OPT_STR)
+OPTION(clog_to_graylog_port, OPT_STR)
+
+OPTION(mon_cluster_log_to_syslog, OPT_STR)
+OPTION(mon_cluster_log_to_syslog_level, OPT_STR) // this level and above
+OPTION(mon_cluster_log_to_syslog_facility, OPT_STR)
+OPTION(mon_cluster_log_file, OPT_STR)
+OPTION(mon_cluster_log_file_level, OPT_STR)
+OPTION(mon_cluster_log_to_graylog, OPT_STR)
+OPTION(mon_cluster_log_to_graylog_host, OPT_STR)
+OPTION(mon_cluster_log_to_graylog_port, OPT_STR)
+
+OPTION(enable_experimental_unrecoverable_data_corrupting_features, OPT_STR)
+
+SAFE_OPTION(plugin_dir, OPT_STR)
+
+OPTION(xio_trace_mempool, OPT_BOOL) // mempool allocation counters
+OPTION(xio_trace_msgcnt, OPT_BOOL) // incoming/outgoing msg counters
+OPTION(xio_trace_xcon, OPT_BOOL) // Xio message encode/decode trace
+OPTION(xio_queue_depth, OPT_INT) // depth of Accelio msg queue
+OPTION(xio_mp_min, OPT_INT) // default min mempool size
+OPTION(xio_mp_max_64, OPT_INT) // max 64-byte chunks (buffer is 40)
+OPTION(xio_mp_max_256, OPT_INT) // max 256-byte chunks
+OPTION(xio_mp_max_1k, OPT_INT) // max 1K chunks
+OPTION(xio_mp_max_page, OPT_INT) // max page-size chunks
+OPTION(xio_mp_max_hint, OPT_INT) // max size-hint chunks
+OPTION(xio_portal_threads, OPT_INT) // xio portal threads per messenger
+OPTION(xio_max_conns_per_portal, OPT_INT) // max xio_connections per portal/ctx
+OPTION(xio_transport_type, OPT_STR) // xio transport type: {rdma or tcp}
+OPTION(xio_max_send_inline, OPT_INT) // xio maximum threshold to send inline
+
+OPTION(compressor_zlib_isal, OPT_BOOL)
+OPTION(compressor_zlib_level, OPT_INT) //regular zlib compression level, not applicable to isa-l optimized version
+
+OPTION(async_compressor_enabled, OPT_BOOL)
+OPTION(async_compressor_type, OPT_STR)
+OPTION(async_compressor_threads, OPT_INT)
+OPTION(async_compressor_thread_timeout, OPT_INT)
+OPTION(async_compressor_thread_suicide_timeout, OPT_INT)
+
+OPTION(plugin_crypto_accelerator, OPT_STR)
+
+OPTION(mempool_debug, OPT_BOOL)
+
+
+
+OPTION(key, OPT_STR)
+OPTION(keyfile, OPT_STR)
+OPTION(keyring, OPT_STR)
+OPTION(heartbeat_interval, OPT_INT)
+OPTION(heartbeat_file, OPT_STR)
+OPTION(heartbeat_inject_failure, OPT_INT) // force an unhealthy heartbeat for N seconds
+OPTION(perf, OPT_BOOL) // enable internal perf counters
+
+SAFE_OPTION(ms_type, OPT_STR) // messenger backend. It is modified at runtime, so use SAFE_OPTION
+OPTION(ms_public_type, OPT_STR) // messenger backend
+OPTION(ms_cluster_type, OPT_STR) // messenger backend
+OPTION(ms_tcp_nodelay, OPT_BOOL)
+OPTION(ms_tcp_rcvbuf, OPT_INT)
+OPTION(ms_tcp_prefetch_max_size, OPT_INT) // max prefetch size, we limit this to avoid extra memcpy
+OPTION(ms_initial_backoff, OPT_DOUBLE)
+OPTION(ms_max_backoff, OPT_DOUBLE)
+OPTION(ms_crc_data, OPT_BOOL)
+OPTION(ms_crc_header, OPT_BOOL)
+OPTION(ms_die_on_bad_msg, OPT_BOOL)
+OPTION(ms_die_on_unhandled_msg, OPT_BOOL)
+OPTION(ms_die_on_old_message, OPT_BOOL) // assert if we get a dup incoming message and shouldn't have (may be triggered by pre-541cd3c64be0dfa04e8a2df39422e0eb9541a428 code)
+OPTION(ms_die_on_skipped_message, OPT_BOOL) // assert if we skip a seq (kernel client does this intentionally)
+OPTION(ms_dispatch_throttle_bytes, OPT_U64)
+OPTION(ms_bind_ipv6, OPT_BOOL)
+OPTION(ms_bind_port_min, OPT_INT)
+OPTION(ms_bind_port_max, OPT_INT)
+OPTION(ms_bind_retry_count, OPT_INT) // If binding fails, how many times do we retry to bind
+OPTION(ms_bind_retry_delay, OPT_INT) // Delay between attempts to bind
+OPTION(ms_bind_before_connect, OPT_BOOL)
+OPTION(ms_tcp_listen_backlog, OPT_INT)
+OPTION(ms_rwthread_stack_bytes, OPT_U64)
+OPTION(ms_tcp_read_timeout, OPT_U64)
+OPTION(ms_pq_max_tokens_per_priority, OPT_U64)
+OPTION(ms_pq_min_cost, OPT_U64)
+OPTION(ms_inject_socket_failures, OPT_U64)
+SAFE_OPTION(ms_inject_delay_type, OPT_STR) // "osd mds mon client" allowed
+OPTION(ms_inject_delay_msg_type, OPT_STR) // the type of message to delay. This is an additional restriction on the general type filter ms_inject_delay_type.
+OPTION(ms_inject_delay_max, OPT_DOUBLE) // seconds
+OPTION(ms_inject_delay_probability, OPT_DOUBLE) // range [0, 1]
+OPTION(ms_inject_internal_delays, OPT_DOUBLE) // seconds
+OPTION(ms_dump_on_send, OPT_BOOL) // hexdump msg to log on send
+OPTION(ms_dump_corrupt_message_level, OPT_INT) // debug level to hexdump undecodeable messages at
+OPTION(ms_async_op_threads, OPT_U64) // number of worker processing threads for async messenger created on init
+OPTION(ms_async_max_op_threads, OPT_U64) // max number of worker processing threads for async messenger
+OPTION(ms_async_set_affinity, OPT_BOOL)
+// example: ms_async_affinity_cores = 0,1
+// The number of cores listed is expected to equal ms_async_op_threads; otherwise
+// extra op threads will loop over ms_async_affinity_cores again.
+// If ms_async_affinity_cores is empty, all threads will be bound to the currently
+// running core
+OPTION(ms_async_affinity_cores, OPT_STR)
+OPTION(ms_async_rdma_device_name, OPT_STR)
+OPTION(ms_async_rdma_enable_hugepage, OPT_BOOL)
+OPTION(ms_async_rdma_buffer_size, OPT_INT)
+OPTION(ms_async_rdma_send_buffers, OPT_U32)
+OPTION(ms_async_rdma_receive_buffers, OPT_U32)
+OPTION(ms_async_rdma_port_num, OPT_U32)
+OPTION(ms_async_rdma_polling_us, OPT_U32)
+OPTION(ms_async_rdma_local_gid, OPT_STR) // GID format: "fe80:0000:0000:0000:7efe:90ff:fe72:6efe", no zero folding
+OPTION(ms_async_rdma_roce_ver, OPT_INT) // 0=RoCEv1, 1=RoCEv2, 2=RoCEv1.5
+OPTION(ms_async_rdma_sl, OPT_INT) // in RoCE, this means PCP
+OPTION(ms_async_rdma_dscp, OPT_INT) // in RoCE, this means DSCP
+
+OPTION(ms_dpdk_port_id, OPT_INT)
+SAFE_OPTION(ms_dpdk_coremask, OPT_STR) // it is modified in unit tests, so it is declared with SAFE_OPTION
+OPTION(ms_dpdk_memory_channel, OPT_STR)
+OPTION(ms_dpdk_hugepages, OPT_STR)
+OPTION(ms_dpdk_pmd, OPT_STR)
+SAFE_OPTION(ms_dpdk_host_ipv4_addr, OPT_STR)
+SAFE_OPTION(ms_dpdk_gateway_ipv4_addr, OPT_STR)
+SAFE_OPTION(ms_dpdk_netmask_ipv4_addr, OPT_STR)
+OPTION(ms_dpdk_lro, OPT_BOOL)
+OPTION(ms_dpdk_hw_flow_control, OPT_BOOL)
+// Weighting of a hardware network queue relative to a software queue (0 = no work, 1 = equal share)
+OPTION(ms_dpdk_hw_queue_weight, OPT_FLOAT)
+OPTION(ms_dpdk_debug_allow_loopback, OPT_BOOL)
+OPTION(ms_dpdk_rx_buffer_count_per_core, OPT_INT)
+
+OPTION(inject_early_sigterm, OPT_BOOL)
+
+OPTION(mon_data, OPT_STR)
+OPTION(mon_initial_members, OPT_STR) // list of initial cluster mon ids; if specified, need majority to form initial quorum and create new cluster
+OPTION(mon_compact_on_start, OPT_BOOL) // compact leveldb on ceph-mon start
+OPTION(mon_compact_on_bootstrap, OPT_BOOL) // trigger leveldb compaction on bootstrap
+OPTION(mon_compact_on_trim, OPT_BOOL) // compact (a prefix) when we trim old states
+OPTION(mon_osd_cache_size, OPT_INT) // the size of osdmaps cache, not to rely on underlying store's cache
+
+OPTION(mon_cpu_threads, OPT_INT)
+OPTION(mon_osd_mapping_pgs_per_chunk, OPT_INT)
+OPTION(mon_osd_max_creating_pgs, OPT_INT)
+OPTION(mon_tick_interval, OPT_INT)
+OPTION(mon_session_timeout, OPT_INT) // must send keepalive or subscribe
+OPTION(mon_subscribe_interval, OPT_DOUBLE) // for legacy clients only
+OPTION(mon_delta_reset_interval, OPT_DOUBLE) // seconds of inactivity before we reset the pg delta to 0
+OPTION(mon_osd_laggy_halflife, OPT_INT) // (seconds) how quickly our laggy estimations decay
+OPTION(mon_osd_laggy_weight, OPT_DOUBLE) // weight for new samples in laggy estimations
+OPTION(mon_osd_laggy_max_interval, OPT_INT) // maximum value of laggy_interval in laggy estimations
+OPTION(mon_osd_adjust_heartbeat_grace, OPT_BOOL) // true if we should scale based on laggy estimations
+OPTION(mon_osd_adjust_down_out_interval, OPT_BOOL) // true if we should scale based on laggy estimations
+OPTION(mon_osd_auto_mark_in, OPT_BOOL) // mark any booting osds 'in'
+OPTION(mon_osd_auto_mark_auto_out_in, OPT_BOOL) // mark booting auto-marked-out osds 'in'
+OPTION(mon_osd_auto_mark_new_in, OPT_BOOL) // mark booting new osds 'in'
+OPTION(mon_osd_destroyed_out_interval, OPT_INT) // seconds
+OPTION(mon_osd_down_out_interval, OPT_INT) // seconds
+OPTION(mon_osd_down_out_subtree_limit, OPT_STR) // smallest crush unit/type that we will not automatically mark out
+OPTION(mon_osd_min_up_ratio, OPT_DOUBLE) // min osds required to be up to mark things down
+OPTION(mon_osd_min_in_ratio, OPT_DOUBLE) // min osds required to be in to mark things out
+OPTION(mon_osd_warn_op_age, OPT_DOUBLE) // max op age before we generate a warning (make it a power of 2)
+OPTION(mon_osd_err_op_age_ratio, OPT_DOUBLE) // when to generate an error, as multiple of mon_osd_warn_op_age
+OPTION(mon_osd_max_split_count, OPT_INT) // largest number of PGs per "involved" OSD to let split create
+OPTION(mon_osd_allow_primary_temp, OPT_BOOL) // allow primary_temp to be set in the osdmap
+OPTION(mon_osd_allow_primary_affinity, OPT_BOOL) // allow primary_affinity to be set in the osdmap
+OPTION(mon_osd_prime_pg_temp, OPT_BOOL) // prime osdmap with pg mapping changes
+OPTION(mon_osd_prime_pg_temp_max_time, OPT_FLOAT) // max time to spend priming
+OPTION(mon_osd_prime_pg_temp_max_estimate, OPT_FLOAT) // max estimate of pg total before we do all pgs in parallel
+OPTION(mon_osd_pool_ec_fast_read, OPT_BOOL) // whether to turn on fast read on the pool or not
+OPTION(mon_stat_smooth_intervals, OPT_INT) // smooth stats over last N PGMap maps
+OPTION(mon_election_timeout, OPT_FLOAT) // on election proposer, max waiting time for all ACKs
+OPTION(mon_lease, OPT_FLOAT) // lease interval
+OPTION(mon_lease_renew_interval_factor, OPT_FLOAT) // on leader, to renew the lease
+OPTION(mon_lease_ack_timeout_factor, OPT_FLOAT) // on leader, if lease isn't acked by all peons
+OPTION(mon_accept_timeout_factor, OPT_FLOAT) // on leader, if paxos update isn't accepted
+
+OPTION(mon_clock_drift_allowed, OPT_FLOAT) // allowed clock drift between monitors
+OPTION(mon_clock_drift_warn_backoff, OPT_FLOAT) // exponential backoff for clock drift warnings
+OPTION(mon_timecheck_interval, OPT_FLOAT) // on leader, timecheck (clock drift check) interval (seconds)
+OPTION(mon_timecheck_skew_interval, OPT_FLOAT) // on leader, timecheck (clock drift check) interval when in presence of a skew (seconds)
+OPTION(mon_pg_stuck_threshold, OPT_INT) // number of seconds after which pgs can be considered stuck inactive, unclean, etc (see doc/control.rst under dump_stuck for more info)
+OPTION(mon_pg_min_inactive, OPT_U64) // the number of PGs which have to be inactive longer than 'mon_pg_stuck_threshold' before health goes into ERR. 0 means disabled, never go into ERR.
+OPTION(mon_pg_warn_min_per_osd, OPT_INT) // min # pgs per (in) osd before we warn the admin
+OPTION(mon_pg_warn_max_per_osd, OPT_INT) // max # pgs per (in) osd before we warn the admin
+OPTION(mon_pg_warn_max_object_skew, OPT_FLOAT) // max skew from average in objects per pg
+OPTION(mon_pg_warn_min_objects, OPT_INT) // do not warn below this object #
+OPTION(mon_pg_warn_min_pool_objects, OPT_INT) // do not warn on pools below this object #
+OPTION(mon_pg_check_down_all_threshold, OPT_FLOAT) // threshold of down osds after which we check all pgs
+OPTION(mon_cache_target_full_warn_ratio, OPT_FLOAT) // position between pool cache_target_full and max where we start warning
+OPTION(mon_osd_full_ratio, OPT_FLOAT) // what % full makes an OSD "full"
+OPTION(mon_osd_backfillfull_ratio, OPT_FLOAT) // what % full makes an OSD backfill full (backfill halted)
+OPTION(mon_osd_nearfull_ratio, OPT_FLOAT) // what % full makes an OSD near full
+OPTION(mon_osd_initial_require_min_compat_client, OPT_STR)
+OPTION(mon_allow_pool_delete, OPT_BOOL) // allow pool deletion
+OPTION(mon_fake_pool_delete, OPT_BOOL) // fake pool deletion (add _DELETED suffix)
+OPTION(mon_globalid_prealloc, OPT_U32) // how many globalids to prealloc
+OPTION(mon_osd_report_timeout, OPT_INT) // grace period before declaring unresponsive OSDs dead
+OPTION(mon_force_standby_active, OPT_BOOL) // should mons force standby-replay mds to be active
+OPTION(mon_warn_on_legacy_crush_tunables, OPT_BOOL) // warn if crush tunables are too old (older than mon_crush_min_required_version)
+OPTION(mon_crush_min_required_version, OPT_STR)
+OPTION(mon_warn_on_crush_straw_calc_version_zero, OPT_BOOL) // warn if crush straw_calc_version==0
+OPTION(mon_warn_on_osd_down_out_interval_zero, OPT_BOOL) // warn if 'mon_osd_down_out_interval == 0'
+OPTION(mon_warn_on_cache_pools_without_hit_sets, OPT_BOOL)
+OPTION(mon_min_osdmap_epochs, OPT_INT)
+OPTION(mon_max_pgmap_epochs, OPT_INT)
+OPTION(mon_max_log_epochs, OPT_INT)
+OPTION(mon_max_mdsmap_epochs, OPT_INT)
+OPTION(mon_max_osd, OPT_INT)
+OPTION(mon_probe_timeout, OPT_DOUBLE)
+OPTION(mon_client_bytes, OPT_U64) // client msg data allowed in memory (in bytes)
+OPTION(mon_mgr_proxy_client_bytes_ratio, OPT_FLOAT) // ratio of mon_client_bytes that can be consumed by proxied mgr commands before we error out to client
+OPTION(mon_log_max_summary, OPT_U64)
+OPTION(mon_daemon_bytes, OPT_U64) // mds, osd message memory cap (in bytes)
+OPTION(mon_max_log_entries_per_event, OPT_INT)
+OPTION(mon_reweight_min_pgs_per_osd, OPT_U64) // min pgs per osd for reweight-by-pg command
+OPTION(mon_reweight_min_bytes_per_osd, OPT_U64) // min bytes per osd for reweight-by-utilization command
+OPTION(mon_reweight_max_osds, OPT_INT) // max osds to change per reweight-by-* command
+OPTION(mon_reweight_max_change, OPT_DOUBLE)
+OPTION(mon_health_data_update_interval, OPT_FLOAT)
+OPTION(mon_health_to_clog, OPT_BOOL)
+OPTION(mon_health_to_clog_interval, OPT_INT)
+OPTION(mon_health_to_clog_tick_interval, OPT_DOUBLE)
+OPTION(mon_health_preluminous_compat, OPT_BOOL)
+OPTION(mon_health_max_detail, OPT_INT) // max detailed pgs to report in health detail
+OPTION(mon_data_avail_crit, OPT_INT)
+OPTION(mon_data_avail_warn, OPT_INT)
+OPTION(mon_data_size_warn, OPT_U64) // issue a warning when the monitor's data store goes over 15GB (in bytes)
+OPTION(mon_warn_not_scrubbed, OPT_INT)
+OPTION(mon_warn_not_deep_scrubbed, OPT_INT)
+OPTION(mon_scrub_interval, OPT_INT) // once a day
+OPTION(mon_scrub_timeout, OPT_INT) // let's give it 5 minutes; why not.
+OPTION(mon_scrub_max_keys, OPT_INT) // max number of keys to scrub each time
+OPTION(mon_scrub_inject_crc_mismatch, OPT_DOUBLE) // probability of injected crc mismatch [0.0, 1.0]
+OPTION(mon_scrub_inject_missing_keys, OPT_DOUBLE) // probability of injected missing keys [0.0, 1.0]
+OPTION(mon_config_key_max_entry_size, OPT_INT) // max num bytes per config-key entry
+OPTION(mon_sync_timeout, OPT_DOUBLE)
+OPTION(mon_sync_max_payload_size, OPT_U32) // max size for a sync chunk payload (say)
+OPTION(mon_sync_debug, OPT_BOOL) // enable sync-specific debug
+OPTION(mon_inject_sync_get_chunk_delay, OPT_DOUBLE) // inject N second delay on each get_chunk request
+OPTION(mon_osd_min_down_reporters, OPT_INT) // number of OSDs from different subtrees who need to report a down OSD for it to count
+OPTION(mon_osd_reporter_subtree_level , OPT_STR) // in which level of parent bucket the reporters are counted
+OPTION(mon_osd_force_trim_to, OPT_INT) // force mon to trim maps to this point, regardless of min_last_epoch_clean (dangerous)
+OPTION(mon_mds_force_trim_to, OPT_INT) // force mon to trim mdsmaps to this point (dangerous)
+OPTION(mon_mds_skip_sanity, OPT_BOOL) // skip safety assertions on FSMap (in case of bugs where we want to continue anyway)
+
+// monitor debug options
+OPTION(mon_debug_deprecated_as_obsolete, OPT_BOOL) // consider deprecated commands as obsolete
+
+// dump transactions
+OPTION(mon_debug_dump_transactions, OPT_BOOL)
+OPTION(mon_debug_dump_json, OPT_BOOL)
+OPTION(mon_debug_dump_location, OPT_STR)
+OPTION(mon_debug_no_require_luminous, OPT_BOOL)
+OPTION(mon_debug_no_require_bluestore_for_ec_overwrites, OPT_BOOL)
+OPTION(mon_debug_no_initial_persistent_features, OPT_BOOL)
+OPTION(mon_inject_transaction_delay_max, OPT_DOUBLE) // seconds
+OPTION(mon_inject_transaction_delay_probability, OPT_DOUBLE) // range [0, 1]
+
+OPTION(mon_sync_provider_kill_at, OPT_INT) // kill the sync provider at a specific point in the work flow
+OPTION(mon_sync_requester_kill_at, OPT_INT) // kill the sync requester at a specific point in the work flow
+OPTION(mon_force_quorum_join, OPT_BOOL) // force monitor to join quorum even if it has been previously removed from the map
+OPTION(mon_keyvaluedb, OPT_STR) // type of keyvaluedb backend
+
+// UNSAFE -- TESTING ONLY! Allows addition of a cache tier with preexisting snaps
+OPTION(mon_debug_unsafe_allow_tier_with_nonempty_snaps, OPT_BOOL)
+OPTION(mon_osd_blacklist_default_expire, OPT_DOUBLE) // default one hour
+OPTION(mon_osd_crush_smoke_test, OPT_BOOL)
+
+OPTION(paxos_stash_full_interval, OPT_INT) // how often (in commits) to stash a full copy of the PaxosService state
+OPTION(paxos_max_join_drift, OPT_INT) // max paxos iterations before we must first sync the monitor stores
+OPTION(paxos_propose_interval, OPT_DOUBLE) // gather updates for this long before proposing a map update
+OPTION(paxos_min_wait, OPT_DOUBLE) // min time to gather updates for after period of inactivity
+OPTION(paxos_min, OPT_INT) // minimum number of paxos states to keep around
+OPTION(paxos_trim_min, OPT_INT) // number of extra proposals tolerated before trimming
+OPTION(paxos_trim_max, OPT_INT) // max number of extra proposals to trim at a time
+OPTION(paxos_service_trim_min, OPT_INT) // minimum amount of versions to trigger a trim (0 disables it)
+OPTION(paxos_service_trim_max, OPT_INT) // maximum amount of versions to trim during a single proposal (0 disables it)
+OPTION(paxos_kill_at, OPT_INT)
+OPTION(auth_cluster_required, OPT_STR) // required of mon, mds, osd daemons
+OPTION(auth_service_required, OPT_STR) // required by daemons of clients
+OPTION(auth_client_required, OPT_STR) // what clients require of daemons
+OPTION(auth_supported, OPT_STR) // deprecated; default value for above if they are not defined.
+OPTION(max_rotating_auth_attempts, OPT_INT)
+OPTION(cephx_require_signatures, OPT_BOOL) // If true, don't talk to Cephx partners if they don't support message signing; off by default
+OPTION(cephx_cluster_require_signatures, OPT_BOOL)
+OPTION(cephx_service_require_signatures, OPT_BOOL)
+OPTION(cephx_sign_messages, OPT_BOOL) // Default to signing session messages if supported
+OPTION(auth_mon_ticket_ttl, OPT_DOUBLE)
+OPTION(auth_service_ticket_ttl, OPT_DOUBLE)
+OPTION(auth_debug, OPT_BOOL) // if true, assert when weird things happen
+OPTION(mon_client_hunt_parallel, OPT_U32) // how many mons to try to connect to in parallel during hunt
+OPTION(mon_client_hunt_interval, OPT_DOUBLE) // try new mon every N seconds until we connect
+OPTION(mon_client_ping_interval, OPT_DOUBLE) // ping every N seconds
+OPTION(mon_client_ping_timeout, OPT_DOUBLE) // fail if we don't hear back
+OPTION(mon_client_hunt_interval_backoff, OPT_DOUBLE) // each time we reconnect to a monitor, double our timeout
+OPTION(mon_client_hunt_interval_max_multiple, OPT_DOUBLE) // up to a max of 10*default (30 seconds)
+OPTION(mon_client_max_log_entries_per_message, OPT_INT)
+OPTION(mon_max_pool_pg_num, OPT_INT)
+OPTION(mon_pool_quota_warn_threshold, OPT_INT) // percent of quota at which to issue warnings
+OPTION(mon_pool_quota_crit_threshold, OPT_INT) // percent of quota at which to issue errors
+OPTION(client_cache_size, OPT_INT)
+OPTION(client_cache_mid, OPT_FLOAT)
+OPTION(client_use_random_mds, OPT_BOOL)
+OPTION(client_mount_timeout, OPT_DOUBLE)
+OPTION(client_tick_interval, OPT_DOUBLE)
+OPTION(client_trace, OPT_STR)
+OPTION(client_readahead_min, OPT_LONGLONG) // readahead at _least_ this much.
+OPTION(client_readahead_max_bytes, OPT_LONGLONG) // default unlimited
+OPTION(client_readahead_max_periods, OPT_LONGLONG) // as multiple of file layout period (object size * num stripes)
+OPTION(client_reconnect_stale, OPT_BOOL) // automatically reconnect stale session
+OPTION(client_snapdir, OPT_STR)
+OPTION(client_mountpoint, OPT_STR)
+OPTION(client_mount_uid, OPT_INT)
+OPTION(client_mount_gid, OPT_INT)
+OPTION(client_notify_timeout, OPT_INT) // in seconds
+OPTION(osd_client_watch_timeout, OPT_INT) // in seconds
+OPTION(client_caps_release_delay, OPT_INT) // in seconds
+OPTION(client_quota_df, OPT_BOOL) // use quota for df on subdir mounts
+OPTION(client_oc, OPT_BOOL)
+OPTION(client_oc_size, OPT_INT) // MB * n
+OPTION(client_oc_max_dirty, OPT_INT) // MB * n (dirty OR tx.. bigish)
+OPTION(client_oc_target_dirty, OPT_INT) // target dirty (keep this smallish)
+OPTION(client_oc_max_dirty_age, OPT_DOUBLE) // max age in cache before writeback
+OPTION(client_oc_max_objects, OPT_INT) // max objects in cache
+OPTION(client_debug_getattr_caps, OPT_BOOL) // check if MDS reply contains wanted caps
+OPTION(client_debug_force_sync_read, OPT_BOOL) // always read synchronously (go to osds)
+OPTION(client_debug_inject_tick_delay, OPT_INT) // delay the client tick for a number of seconds
+OPTION(client_max_inline_size, OPT_U64)
+OPTION(client_inject_release_failure, OPT_BOOL) // synthetic client bug for testing
+OPTION(client_inject_fixed_oldest_tid, OPT_BOOL) // synthetic client bug for testing
+OPTION(client_metadata, OPT_STR)
+OPTION(client_acl_type, OPT_STR)
+OPTION(client_permissions, OPT_BOOL)
+OPTION(client_dirsize_rbytes, OPT_BOOL)
+
+// note (client_oc_* options): the max amount of "in flight" dirty data is roughly (client_oc_max_dirty - client_oc_target_dirty)
+OPTION(fuse_use_invalidate_cb, OPT_BOOL) // use fuse 2.8+ invalidate callback to keep page cache consistent
+OPTION(fuse_disable_pagecache, OPT_BOOL)
+OPTION(fuse_allow_other, OPT_BOOL)
+OPTION(fuse_default_permissions, OPT_BOOL)
+OPTION(fuse_big_writes, OPT_BOOL)
+OPTION(fuse_atomic_o_trunc, OPT_BOOL)
+OPTION(fuse_debug, OPT_BOOL)
+OPTION(fuse_multithreaded, OPT_BOOL)
+OPTION(fuse_require_active_mds, OPT_BOOL) // if ceph_fuse requires active mds server
+OPTION(fuse_syncfs_on_mksnap, OPT_BOOL)
+OPTION(fuse_set_user_groups, OPT_BOOL) // if ceph_fuse fills in group lists or not
+
+OPTION(client_try_dentry_invalidate, OPT_BOOL) // the client should try to use dentry invalidation instead of remounting, on kernels where it believes that will work
+OPTION(client_die_on_failed_remount, OPT_BOOL)
+OPTION(client_check_pool_perm, OPT_BOOL)
+OPTION(client_use_faked_inos, OPT_BOOL)
+OPTION(client_mds_namespace, OPT_STR)
+
+OPTION(crush_location, OPT_STR) // whitespace-separated list of key=value pairs describing crush location
+OPTION(crush_location_hook, OPT_STR)
+OPTION(crush_location_hook_timeout, OPT_INT)
+
+OPTION(objecter_tick_interval, OPT_DOUBLE)
+OPTION(objecter_timeout, OPT_DOUBLE) // before we ask for a map
+OPTION(objecter_inflight_op_bytes, OPT_U64) // max in-flight data (both directions)
+OPTION(objecter_inflight_ops, OPT_U64) // max in-flight ios
+OPTION(objecter_completion_locks_per_session, OPT_U64) // num of completion locks per each session, for serializing same object responses
+OPTION(objecter_inject_no_watch_ping, OPT_BOOL) // suppress watch pings
+OPTION(objecter_retry_writes_after_first_reply, OPT_BOOL) // ignore the first reply for each write, and resend the osd op instead
+OPTION(objecter_debug_inject_relock_delay, OPT_BOOL)
+
+// Max number of deletes at once in a single Filer::purge call
+OPTION(filer_max_purge_ops, OPT_U32)
+// Max number of truncate at once in a single Filer::truncate call
+OPTION(filer_max_truncate_ops, OPT_U32)
+
+OPTION(journaler_write_head_interval, OPT_INT)
+OPTION(journaler_prefetch_periods, OPT_INT) // * journal object size
+OPTION(journaler_prezero_periods, OPT_INT) // * journal object size
+OPTION(mds_data, OPT_STR)
+OPTION(mds_max_file_size, OPT_U64) // Used when creating new CephFS. Change with 'ceph mds set max_file_size <size>' afterwards
+// max xattr kv pairs size for each dir/file
+OPTION(mds_max_xattr_pairs_size, OPT_U32)
+OPTION(mds_cache_size, OPT_INT)
+OPTION(mds_cache_mid, OPT_FLOAT)
+OPTION(mds_max_file_recover, OPT_U32)
+OPTION(mds_dir_max_commit_size, OPT_INT) // MB
+OPTION(mds_dir_keys_per_op, OPT_INT)
+OPTION(mds_decay_halflife, OPT_FLOAT)
+OPTION(mds_beacon_interval, OPT_FLOAT)
+OPTION(mds_beacon_grace, OPT_FLOAT)
+OPTION(mds_enforce_unique_name, OPT_BOOL)
+OPTION(mds_blacklist_interval, OPT_FLOAT) // how long to blacklist failed nodes
+
+OPTION(mds_session_timeout, OPT_FLOAT) // cap bits and leases time out if client idle
+OPTION(mds_session_blacklist_on_timeout, OPT_BOOL) // whether to blacklist clients whose sessions are dropped due to timeout
+OPTION(mds_session_blacklist_on_evict, OPT_BOOL) // whether to blacklist clients whose sessions are dropped via admin commands
+
+OPTION(mds_sessionmap_keys_per_op, OPT_U32) // how many sessions should I try to load/store in a single OMAP operation?
+OPTION(mds_revoke_cap_timeout, OPT_FLOAT) // detect clients which aren't revoking caps
+OPTION(mds_recall_state_timeout, OPT_FLOAT) // detect clients which aren't trimming caps
+OPTION(mds_freeze_tree_timeout, OPT_FLOAT) // detecting freeze tree deadlock
+OPTION(mds_session_autoclose, OPT_FLOAT) // autoclose idle session
+OPTION(mds_health_summarize_threshold, OPT_INT) // collapse N-client health metrics to a single 'many'
+OPTION(mds_health_cache_threshold, OPT_FLOAT) // warn on cache size if it exceeds mds_cache_size by this factor
+OPTION(mds_reconnect_timeout, OPT_FLOAT) // seconds to wait for clients during mds restart
+ // make it (mds_session_timeout - mds_beacon_grace)
+OPTION(mds_tick_interval, OPT_FLOAT)
+OPTION(mds_dirstat_min_interval, OPT_FLOAT) // try to avoid propagating more often than this
+OPTION(mds_scatter_nudge_interval, OPT_FLOAT) // how quickly dirstat changes propagate up the hierarchy
+OPTION(mds_client_prealloc_inos, OPT_INT)
+OPTION(mds_early_reply, OPT_BOOL)
+OPTION(mds_default_dir_hash, OPT_INT)
+OPTION(mds_log_pause, OPT_BOOL)
+OPTION(mds_log_skip_corrupt_events, OPT_BOOL)
+OPTION(mds_log_max_events, OPT_INT)
+OPTION(mds_log_events_per_segment, OPT_INT)
+OPTION(mds_log_segment_size, OPT_INT) // segment size for mds log, default to default file_layout_t
+OPTION(mds_log_max_segments, OPT_U32)
+OPTION(mds_log_max_expiring, OPT_INT)
+OPTION(mds_bal_export_pin, OPT_BOOL) // allow clients to pin directory trees to ranks
+OPTION(mds_bal_sample_interval, OPT_DOUBLE) // every 3 seconds
+OPTION(mds_bal_replicate_threshold, OPT_FLOAT)
+OPTION(mds_bal_unreplicate_threshold, OPT_FLOAT)
+OPTION(mds_bal_frag, OPT_BOOL)
+OPTION(mds_bal_split_size, OPT_INT)
+OPTION(mds_bal_split_rd, OPT_FLOAT)
+OPTION(mds_bal_split_wr, OPT_FLOAT)
+OPTION(mds_bal_split_bits, OPT_INT)
+OPTION(mds_bal_merge_size, OPT_INT)
+OPTION(mds_bal_interval, OPT_INT) // seconds
+OPTION(mds_bal_fragment_interval, OPT_INT) // seconds
+OPTION(mds_bal_fragment_size_max, OPT_INT) // order of magnitude higher than split size
+OPTION(mds_bal_fragment_fast_factor, OPT_FLOAT) // multiple of size_max that triggers immediate split
+OPTION(mds_bal_idle_threshold, OPT_FLOAT)
+OPTION(mds_bal_max, OPT_INT)
+OPTION(mds_bal_max_until, OPT_INT)
+OPTION(mds_bal_mode, OPT_INT)
+OPTION(mds_bal_min_rebalance, OPT_FLOAT) // must be this much above average before we export anything
+OPTION(mds_bal_min_start, OPT_FLOAT) // if we need less than this, we don't do anything
+OPTION(mds_bal_need_min, OPT_FLOAT) // take within this range of what we need
+OPTION(mds_bal_need_max, OPT_FLOAT)
+OPTION(mds_bal_midchunk, OPT_FLOAT) // any sub bigger than this taken in full
+OPTION(mds_bal_minchunk, OPT_FLOAT) // never take anything smaller than this
+OPTION(mds_bal_target_decay, OPT_DOUBLE) // target decay half-life in MDSMap (2x larger is approx. 2x slower)
+OPTION(mds_replay_interval, OPT_FLOAT) // time to wait before starting replay again
+OPTION(mds_shutdown_check, OPT_INT)
+OPTION(mds_thrash_exports, OPT_INT)
+OPTION(mds_thrash_fragments, OPT_INT)
+OPTION(mds_dump_cache_on_map, OPT_BOOL)
+OPTION(mds_dump_cache_after_rejoin, OPT_BOOL)
+OPTION(mds_verify_scatter, OPT_BOOL)
+OPTION(mds_debug_scatterstat, OPT_BOOL)
+OPTION(mds_debug_frag, OPT_BOOL)
+OPTION(mds_debug_auth_pins, OPT_BOOL)
+OPTION(mds_debug_subtrees, OPT_BOOL)
+OPTION(mds_kill_mdstable_at, OPT_INT)
+OPTION(mds_kill_export_at, OPT_INT)
+OPTION(mds_kill_import_at, OPT_INT)
+OPTION(mds_kill_link_at, OPT_INT)
+OPTION(mds_kill_rename_at, OPT_INT)
+OPTION(mds_kill_openc_at, OPT_INT)
+OPTION(mds_kill_journal_expire_at, OPT_INT)
+OPTION(mds_kill_journal_replay_at, OPT_INT)
+OPTION(mds_journal_format, OPT_U32) // Default to most recent JOURNAL_FORMAT_*
+OPTION(mds_kill_create_at, OPT_INT)
+OPTION(mds_inject_traceless_reply_probability, OPT_DOUBLE) /* percentage
+ of MDS modify replies to skip sending the
+ client a trace on [0-1]*/
+OPTION(mds_wipe_sessions, OPT_BOOL)
+OPTION(mds_wipe_ino_prealloc, OPT_BOOL)
+OPTION(mds_skip_ino, OPT_INT)
+OPTION(mds_standby_for_name, OPT_STR)
+OPTION(mds_standby_for_rank, OPT_INT)
+OPTION(mds_standby_for_fscid, OPT_INT)
+OPTION(mds_standby_replay, OPT_BOOL)
+OPTION(mds_enable_op_tracker, OPT_BOOL) // enable/disable MDS op tracking
+OPTION(mds_op_history_size, OPT_U32) // Max number of completed ops to track
+OPTION(mds_op_history_duration, OPT_U32) // Oldest completed op to track
+OPTION(mds_op_complaint_time, OPT_FLOAT) // how many seconds old makes an op complaint-worthy
+OPTION(mds_op_log_threshold, OPT_INT) // how many op log messages to show in one go
+OPTION(mds_snap_min_uid, OPT_U32) // The minimum UID required to create a snapshot
+OPTION(mds_snap_max_uid, OPT_U32) // The maximum UID allowed to create a snapshot
+OPTION(mds_snap_rstat, OPT_BOOL) // enable/disable nested stat for snapshot
+OPTION(mds_verify_backtrace, OPT_U32)
+// detect clients which aren't trimming completed requests
+OPTION(mds_max_completed_flushes, OPT_U32)
+OPTION(mds_max_completed_requests, OPT_U32)
+
+OPTION(mds_action_on_write_error, OPT_U32) // 0: ignore; 1: force readonly; 2: crash
+OPTION(mds_mon_shutdown_timeout, OPT_DOUBLE)
+
+// Maximum number of concurrent stray files to purge
+OPTION(mds_max_purge_files, OPT_U32)
+// Maximum number of concurrent RADOS ops to issue in purging
+OPTION(mds_max_purge_ops, OPT_U32)
+// Maximum number of concurrent RADOS ops to issue in purging, scaled by PG count
+OPTION(mds_max_purge_ops_per_pg, OPT_FLOAT)
+
+OPTION(mds_purge_queue_busy_flush_period, OPT_FLOAT)
+
+OPTION(mds_root_ino_uid, OPT_INT) // The UID of / on new filesystems
+OPTION(mds_root_ino_gid, OPT_INT) // The GID of / on new filesystems
+
+OPTION(mds_max_scrub_ops_in_progress, OPT_INT) // the number of simultaneous scrubs allowed
+
+// Maximum number of damaged frags/dentries before whole MDS rank goes damaged
+OPTION(mds_damage_table_max_entries, OPT_INT)
+
+// Maximum increment for client writable range, counted by number of objects
+OPTION(mds_client_writeable_range_max_inc_objs, OPT_U32)
+
+// verify backend can support configured max object name length
+OPTION(osd_check_max_object_name_len_on_startup, OPT_BOOL)
+
+// Maximum number of backfills to or from a single osd
+OPTION(osd_max_backfills, OPT_U64)
+
+// Minimum recovery priority (255 = max, smaller = lower)
+OPTION(osd_min_recovery_priority, OPT_INT)
+
+// Seconds to wait before retrying refused backfills
+OPTION(osd_backfill_retry_interval, OPT_DOUBLE)
+
+// Seconds to wait before retrying refused recovery
+OPTION(osd_recovery_retry_interval, OPT_DOUBLE)
+
+// max agent flush ops
+OPTION(osd_agent_max_ops, OPT_INT)
+OPTION(osd_agent_max_low_ops, OPT_INT)
+OPTION(osd_agent_min_evict_effort, OPT_FLOAT)
+OPTION(osd_agent_quantize_effort, OPT_FLOAT)
+OPTION(osd_agent_delay_time, OPT_FLOAT)
+
+// osd ignore history.last_epoch_started in find_best_info
+OPTION(osd_find_best_info_ignore_history_les, OPT_BOOL)
+
+// decay atime and hist histograms after how many objects go by
+OPTION(osd_agent_hist_halflife, OPT_INT)
+
+// must be this amount over the threshold to enable,
+// this amount below the threshold to disable.
+OPTION(osd_agent_slop, OPT_FLOAT)
+
+OPTION(osd_uuid, OPT_UUID)
+OPTION(osd_data, OPT_STR)
+OPTION(osd_journal, OPT_STR)
+OPTION(osd_journal_size, OPT_INT) // in mb
+OPTION(osd_journal_flush_on_shutdown, OPT_BOOL) // Flush journal to data store on shutdown
+// flags for specific control purpose during osd mount() process.
+// e.g., can be 1 to skip over replaying journal
+// or 2 to skip over mounting omap or 3 to skip over both.
+// This might be helpful in case the journal is totally corrupted
+// and we still want to bring the osd daemon back normally, etc.
+OPTION(osd_os_flags, OPT_U32)
+OPTION(osd_max_write_size, OPT_INT)
+OPTION(osd_max_pgls, OPT_U64) // max number of pgls entries to return
+OPTION(osd_client_message_size_cap, OPT_U64) // client data allowed in-memory (in bytes)
+OPTION(osd_client_message_cap, OPT_U64) // num client messages allowed in-memory
+OPTION(osd_pg_bits, OPT_INT) // bits per osd
+OPTION(osd_pgp_bits, OPT_INT) // bits per osd
+OPTION(osd_crush_update_weight_set, OPT_BOOL) // update weight set while updating weights
+OPTION(osd_crush_chooseleaf_type, OPT_INT) // 1 = host
+OPTION(osd_pool_use_gmt_hitset, OPT_BOOL) // try to use gmt for hitset archive names if all osds in cluster support it.
+OPTION(osd_crush_update_on_start, OPT_BOOL)
+OPTION(osd_class_update_on_start, OPT_BOOL) // automatically set device class on start
+OPTION(osd_crush_initial_weight, OPT_DOUBLE) // if >=0, the initial weight is for newly added osds.
+OPTION(osd_pool_default_crush_rule, OPT_INT)
+OPTION(osd_pool_erasure_code_stripe_unit, OPT_U32) // in bytes
+OPTION(osd_pool_default_size, OPT_INT)
+OPTION(osd_pool_default_min_size, OPT_INT) // 0 means no specific default; ceph will use size-size/2
+OPTION(osd_pool_default_pg_num, OPT_INT) // number of PGs for new pools. Configure in global or mon section of ceph.conf
+OPTION(osd_pool_default_pgp_num, OPT_INT) // number of PGs for placement purposes. Should be equal to pg_num
+OPTION(osd_pool_default_type, OPT_STR)
+OPTION(osd_pool_default_erasure_code_profile, OPT_STR) // default properties of osd pool create
+OPTION(osd_erasure_code_plugins, OPT_STR) // list of erasure code plugins
+
+// Allows the "peered" state for recovery and backfill below min_size
+OPTION(osd_allow_recovery_below_min_size, OPT_BOOL)
+
+OPTION(osd_pool_default_flags, OPT_INT) // default flags for new pools
+OPTION(osd_pool_default_flag_hashpspool, OPT_BOOL) // use new pg hashing to prevent pool/pg overlap
+OPTION(osd_pool_default_flag_nodelete, OPT_BOOL) // pool can't be deleted
+OPTION(osd_pool_default_flag_nopgchange, OPT_BOOL) // pool's pg and pgp num can't be changed
+OPTION(osd_pool_default_flag_nosizechange, OPT_BOOL) // pool's size and min size can't be changed
+OPTION(osd_pool_default_hit_set_bloom_fpp, OPT_FLOAT)
+OPTION(osd_pool_default_cache_target_dirty_ratio, OPT_FLOAT)
+OPTION(osd_pool_default_cache_target_dirty_high_ratio, OPT_FLOAT)
+OPTION(osd_pool_default_cache_target_full_ratio, OPT_FLOAT)
+OPTION(osd_pool_default_cache_min_flush_age, OPT_INT) // seconds
+OPTION(osd_pool_default_cache_min_evict_age, OPT_INT) // seconds
+OPTION(osd_pool_default_cache_max_evict_check_size, OPT_INT) // max size to check for eviction
+OPTION(osd_hit_set_min_size, OPT_INT) // min target size for a HitSet
+OPTION(osd_hit_set_max_size, OPT_INT) // max target size for a HitSet
+OPTION(osd_hit_set_namespace, OPT_STR) // rados namespace for hit_set tracking
+
+// conservative default throttling values
+OPTION(osd_tier_promote_max_objects_sec, OPT_U64)
+OPTION(osd_tier_promote_max_bytes_sec, OPT_U64)
+
+OPTION(osd_tier_default_cache_mode, OPT_STR)
+OPTION(osd_tier_default_cache_hit_set_count, OPT_INT)
+OPTION(osd_tier_default_cache_hit_set_period, OPT_INT)
+OPTION(osd_tier_default_cache_hit_set_type, OPT_STR)
+OPTION(osd_tier_default_cache_min_read_recency_for_promote, OPT_INT) // number of recent HitSets the object must appear in to be promoted (on read)
+OPTION(osd_tier_default_cache_min_write_recency_for_promote, OPT_INT) // number of recent HitSets the object must appear in to be promoted (on write)
+OPTION(osd_tier_default_cache_hit_set_grade_decay_rate, OPT_INT)
+OPTION(osd_tier_default_cache_hit_set_search_last_n, OPT_INT)
+
+OPTION(osd_map_dedup, OPT_BOOL)
+OPTION(osd_map_max_advance, OPT_INT) // make this < cache_size!
+OPTION(osd_map_cache_size, OPT_INT)
+OPTION(osd_map_message_max, OPT_INT) // max maps per MOSDMap message
+OPTION(osd_map_share_max_epochs, OPT_INT) // cap on # of inc maps we send to peers, clients
+OPTION(osd_inject_bad_map_crc_probability, OPT_FLOAT)
+OPTION(osd_inject_failure_on_pg_removal, OPT_BOOL)
+// shut down the OSD if its status flips more than max_markdown_count times within the most recent max_markdown_period seconds
+OPTION(osd_max_markdown_period , OPT_INT)
+OPTION(osd_max_markdown_count, OPT_INT)
+
+OPTION(osd_peering_wq_threads, OPT_INT)
+OPTION(osd_peering_wq_batch_size, OPT_U64)
+OPTION(osd_op_pq_max_tokens_per_priority, OPT_U64)
+OPTION(osd_op_pq_min_cost, OPT_U64)
+OPTION(osd_disk_threads, OPT_INT)
+OPTION(osd_disk_thread_ioprio_class, OPT_STR) // rt realtime be best effort idle
+OPTION(osd_disk_thread_ioprio_priority, OPT_INT) // 0-7
+OPTION(osd_recover_clone_overlap, OPT_BOOL) // preserve clone_overlap during recovery/migration
+OPTION(osd_op_num_threads_per_shard, OPT_INT)
+OPTION(osd_op_num_threads_per_shard_hdd, OPT_INT)
+OPTION(osd_op_num_threads_per_shard_ssd, OPT_INT)
+OPTION(osd_op_num_shards, OPT_INT)
+OPTION(osd_op_num_shards_hdd, OPT_INT)
+OPTION(osd_op_num_shards_ssd, OPT_INT)
+
+// PrioritizedQueue (prio), Weighted Priority Queue (wpq ; default),
+// mclock_opclass, mclock_client, or debug_random. "mclock_opclass"
+// and "mclock_client" are based on the mClock/dmClock algorithm
+// (Gulati, et al. 2010). "mclock_opclass" prioritizes based on the
+// class the operation belongs to. "mclock_client" does the same but
+// also works to enforce fairness between clients. "debug_random"
+// chooses among all four with equal probability.
+OPTION(osd_op_queue, OPT_STR)
+
+OPTION(osd_op_queue_cut_off, OPT_STR) // Min priority to go to strict queue. (low, high)
+
+// mClock priority queue parameters for five types of ops
+OPTION(osd_op_queue_mclock_client_op_res, OPT_DOUBLE)
+OPTION(osd_op_queue_mclock_client_op_wgt, OPT_DOUBLE)
+OPTION(osd_op_queue_mclock_client_op_lim, OPT_DOUBLE)
+OPTION(osd_op_queue_mclock_osd_subop_res, OPT_DOUBLE)
+OPTION(osd_op_queue_mclock_osd_subop_wgt, OPT_DOUBLE)
+OPTION(osd_op_queue_mclock_osd_subop_lim, OPT_DOUBLE)
+OPTION(osd_op_queue_mclock_snap_res, OPT_DOUBLE)
+OPTION(osd_op_queue_mclock_snap_wgt, OPT_DOUBLE)
+OPTION(osd_op_queue_mclock_snap_lim, OPT_DOUBLE)
+OPTION(osd_op_queue_mclock_recov_res, OPT_DOUBLE)
+OPTION(osd_op_queue_mclock_recov_wgt, OPT_DOUBLE)
+OPTION(osd_op_queue_mclock_recov_lim, OPT_DOUBLE)
+OPTION(osd_op_queue_mclock_scrub_res, OPT_DOUBLE)
+OPTION(osd_op_queue_mclock_scrub_wgt, OPT_DOUBLE)
+OPTION(osd_op_queue_mclock_scrub_lim, OPT_DOUBLE)
+
+OPTION(osd_ignore_stale_divergent_priors, OPT_BOOL) // do not assert on divergent_prior entries which aren't in the log and whose on-disk objects are newer
+
+// Set to true for testing. Users should NOT set this.
+// If set to true even after reading enough shards to
+// decode the object, any error will be reported.
+OPTION(osd_read_ec_check_for_errors, OPT_BOOL) // return error if any ec shard has an error
+
+// Only use clone_overlap for recovery if there are fewer than
+// osd_recover_clone_overlap_limit entries in the overlap set
+OPTION(osd_recover_clone_overlap_limit, OPT_INT)
+
+OPTION(osd_backfill_scan_min, OPT_INT)
+OPTION(osd_backfill_scan_max, OPT_INT)
+OPTION(osd_op_thread_timeout, OPT_INT)
+OPTION(osd_op_thread_suicide_timeout, OPT_INT)
+OPTION(osd_recovery_thread_timeout, OPT_INT)
+OPTION(osd_recovery_thread_suicide_timeout, OPT_INT)
+OPTION(osd_recovery_sleep, OPT_FLOAT) // seconds to sleep between recovery ops
+OPTION(osd_recovery_sleep_hdd, OPT_FLOAT)
+OPTION(osd_recovery_sleep_ssd, OPT_FLOAT)
+OPTION(osd_snap_trim_sleep, OPT_DOUBLE)
+OPTION(osd_scrub_invalid_stats, OPT_BOOL)
+OPTION(osd_remove_thread_timeout, OPT_INT)
+OPTION(osd_remove_thread_suicide_timeout, OPT_INT)
+OPTION(osd_command_thread_timeout, OPT_INT)
+OPTION(osd_command_thread_suicide_timeout, OPT_INT)
+OPTION(osd_heartbeat_addr, OPT_ADDR)
+OPTION(osd_heartbeat_interval, OPT_INT) // (seconds) how often we ping peers
+
+// (seconds) how long before we decide a peer has failed
+// This setting is read by both the MONs and the OSDs and must be set to the same value for both in the configuration
+OPTION(osd_heartbeat_grace, OPT_INT)
+OPTION(osd_heartbeat_min_peers, OPT_INT) // minimum number of peers
+OPTION(osd_heartbeat_use_min_delay_socket, OPT_BOOL) // prio the heartbeat tcp socket and set dscp as CS6 on it if true
+OPTION(osd_heartbeat_min_size, OPT_INT) // the minimum size of OSD heartbeat messages to send
+
+// max number of parallel snap trims/pg
+OPTION(osd_pg_max_concurrent_snap_trims, OPT_U64)
+// max number of trimming pgs
+OPTION(osd_max_trimming_pgs, OPT_U64)
+
+// minimum number of peers that must be reachable to mark ourselves
+// back up after being wrongly marked down.
+OPTION(osd_heartbeat_min_healthy_ratio, OPT_FLOAT)
+
+OPTION(osd_mon_heartbeat_interval, OPT_INT) // (seconds) how often to ping monitor if no peers
+OPTION(osd_mon_report_interval_max, OPT_INT)
+OPTION(osd_mon_report_interval_min, OPT_INT) // pg stats, failures, up_thru, boot.
+OPTION(osd_mon_report_max_in_flight, OPT_INT) // max updates in flight
+OPTION(osd_beacon_report_interval, OPT_INT) // (seconds) how often to send beacon message to monitor
+OPTION(osd_pg_stat_report_interval_max, OPT_INT) // report pg stats for any given pg at least this often
+OPTION(osd_mon_ack_timeout, OPT_DOUBLE) // time out a mon if it doesn't ack stats
+OPTION(osd_stats_ack_timeout_factor, OPT_DOUBLE) // multiples of mon_ack_timeout
+OPTION(osd_stats_ack_timeout_decay, OPT_DOUBLE)
+OPTION(osd_default_data_pool_replay_window, OPT_INT)
+OPTION(osd_auto_mark_unfound_lost, OPT_BOOL)
+OPTION(osd_recovery_delay_start, OPT_FLOAT)
+OPTION(osd_recovery_max_active, OPT_U64)
+OPTION(osd_recovery_max_single_start, OPT_U64)
+OPTION(osd_recovery_max_chunk, OPT_U64) // max size of push chunk
+OPTION(osd_recovery_max_omap_entries_per_chunk, OPT_U64) // max number of omap entries per chunk; 0 to disable limit
+OPTION(osd_copyfrom_max_chunk, OPT_U64) // max size of a COPYFROM chunk
+OPTION(osd_push_per_object_cost, OPT_U64) // push cost per object
+OPTION(osd_max_push_cost, OPT_U64) // max size of push message
+OPTION(osd_max_push_objects, OPT_U64) // max objects in single push op
+OPTION(osd_recovery_forget_lost_objects, OPT_BOOL) // off for now
+OPTION(osd_max_scrubs, OPT_INT)
+OPTION(osd_scrub_during_recovery, OPT_BOOL) // Allow new scrubs to start while recovery is active on the OSD
+OPTION(osd_scrub_begin_hour, OPT_INT)
+OPTION(osd_scrub_end_hour, OPT_INT)
+OPTION(osd_scrub_load_threshold, OPT_FLOAT)
+OPTION(osd_scrub_min_interval, OPT_FLOAT) // if load is low
+OPTION(osd_scrub_max_interval, OPT_FLOAT) // regardless of load
+OPTION(osd_scrub_interval_randomize_ratio, OPT_FLOAT) // randomize the scheduled scrub in the span of [min,min*(1+randomize_ratio))
+OPTION(osd_scrub_backoff_ratio, OPT_DOUBLE) // the probability to back off the scheduled scrub
+OPTION(osd_scrub_chunk_min, OPT_INT)
+OPTION(osd_scrub_chunk_max, OPT_INT)
+OPTION(osd_scrub_sleep, OPT_FLOAT) // sleep between [deep]scrub ops
+OPTION(osd_scrub_auto_repair, OPT_BOOL) // whether auto-repair inconsistencies upon deep-scrubbing
+OPTION(osd_scrub_auto_repair_num_errors, OPT_U32) // only auto-repair when number of errors is below this threshold
+OPTION(osd_deep_scrub_interval, OPT_FLOAT) // once a week
+OPTION(osd_deep_scrub_randomize_ratio, OPT_FLOAT) // scrubs will randomly become deep scrubs at this rate (0.15 -> 15% of scrubs are deep)
+OPTION(osd_deep_scrub_stride, OPT_INT)
+OPTION(osd_deep_scrub_update_digest_min_age, OPT_INT) // objects must be this old (seconds) before we update the whole-object digest on scrub
+OPTION(osd_class_dir, OPT_STR) // where rados plugins are stored
+OPTION(osd_open_classes_on_start, OPT_BOOL)
+OPTION(osd_class_load_list, OPT_STR) // list of object classes allowed to be loaded (allow all: *)
+OPTION(osd_class_default_list, OPT_STR) // list of object classes with default execute perm (allow all: *)
+OPTION(osd_check_for_log_corruption, OPT_BOOL)
+OPTION(osd_use_stale_snap, OPT_BOOL)
+OPTION(osd_rollback_to_cluster_snap, OPT_STR)
+OPTION(osd_default_notify_timeout, OPT_U32) // default notify timeout in seconds
+OPTION(osd_kill_backfill_at, OPT_INT)
+
+// Bounds how infrequently a new map epoch will be persisted for a pg
+OPTION(osd_pg_epoch_persisted_max_stale, OPT_U32) // make this < map_cache_size!
+
+OPTION(osd_min_pg_log_entries, OPT_U32) // number of entries to keep in the pg log when trimming it
+OPTION(osd_max_pg_log_entries, OPT_U32) // max entries, say when degraded, before we trim
+OPTION(osd_pg_log_dups_tracked, OPT_U32) // how many versions back to track combined in both pglog's regular + dup logs
+OPTION(osd_force_recovery_pg_log_entries_factor, OPT_FLOAT) // max entries factor before force recovery
+OPTION(osd_pg_log_trim_min, OPT_U32)
+OPTION(osd_op_complaint_time, OPT_FLOAT) // how many seconds old makes an op complaint-worthy
+OPTION(osd_command_max_records, OPT_INT)
+OPTION(osd_max_pg_blocked_by, OPT_U32) // max peer osds to report that are blocking our progress
+OPTION(osd_op_log_threshold, OPT_INT) // how many op log messages to show in one go
+OPTION(osd_verify_sparse_read_holes, OPT_BOOL) // read fiemap-reported holes and verify they are zeros
+OPTION(osd_backoff_on_unfound, OPT_BOOL) // object unfound
+OPTION(osd_backoff_on_degraded, OPT_BOOL) // [mainly for debug?] object unreadable/writeable
+OPTION(osd_backoff_on_down, OPT_BOOL) // pg in down/incomplete state
+OPTION(osd_backoff_on_peering, OPT_BOOL) // [debug] pg peering
+OPTION(osd_debug_crash_on_ignored_backoff, OPT_BOOL) // crash osd if client ignores a backoff; useful for debugging
+OPTION(osd_debug_inject_dispatch_delay_probability, OPT_DOUBLE)
+OPTION(osd_debug_inject_dispatch_delay_duration, OPT_DOUBLE)
+OPTION(osd_debug_drop_ping_probability, OPT_DOUBLE)
+OPTION(osd_debug_drop_ping_duration, OPT_INT)
+OPTION(osd_debug_op_order, OPT_BOOL)
+OPTION(osd_debug_verify_missing_on_start, OPT_BOOL)
+OPTION(osd_debug_scrub_chance_rewrite_digest, OPT_U64)
+OPTION(osd_debug_verify_snaps_on_info, OPT_BOOL)
+OPTION(osd_debug_verify_stray_on_activate, OPT_BOOL)
+OPTION(osd_debug_skip_full_check_in_backfill_reservation, OPT_BOOL)
+OPTION(osd_debug_reject_backfill_probability, OPT_DOUBLE)
+OPTION(osd_debug_inject_copyfrom_error, OPT_BOOL) // inject failure during copyfrom completion
+OPTION(osd_debug_misdirected_ops, OPT_BOOL)
+OPTION(osd_debug_skip_full_check_in_recovery, OPT_BOOL)
+OPTION(osd_debug_random_push_read_error, OPT_DOUBLE)
+OPTION(osd_debug_verify_cached_snaps, OPT_BOOL)
+OPTION(osd_enable_op_tracker, OPT_BOOL) // enable/disable OSD op tracking
+OPTION(osd_num_op_tracker_shard, OPT_U32) // The number of shards for holding the ops
+OPTION(osd_op_history_size, OPT_U32) // Max number of completed ops to track
+OPTION(osd_op_history_duration, OPT_U32) // Oldest completed op to track
+OPTION(osd_op_history_slow_op_size, OPT_U32) // Max number of slow ops to track
+OPTION(osd_op_history_slow_op_threshold, OPT_DOUBLE) // track the op if over this threshold
+OPTION(osd_target_transaction_size, OPT_INT) // to adjust various transactions that batch smaller items
+OPTION(osd_failsafe_full_ratio, OPT_FLOAT) // what % full makes an OSD "full" (failsafe)
+OPTION(osd_fast_fail_on_connection_refused, OPT_BOOL) // immediately mark OSDs as down once they refuse to accept connections
+
+OPTION(osd_pg_object_context_cache_count, OPT_INT)
+OPTION(osd_tracing, OPT_BOOL) // true if LTTng-UST tracepoints should be enabled
+OPTION(osd_function_tracing, OPT_BOOL) // true if function instrumentation should use LTTng
+
+OPTION(osd_fast_info, OPT_BOOL) // use fast info attr, if we can
+
+// determines whether PGLog::check() compares written out log to stored log
+OPTION(osd_debug_pg_log_writeout, OPT_BOOL)
+OPTION(osd_loop_before_reset_tphandle, OPT_U32) // Max number of loops before we reset thread-pool's handle
+// default timeout while calling WaitInterval on an empty queue
+OPTION(threadpool_default_timeout, OPT_INT)
+// default wait time for an empty queue before pinging the hb timeout
+OPTION(threadpool_empty_queue_max_wait, OPT_INT)
+
+OPTION(leveldb_log_to_ceph_log, OPT_BOOL)
+OPTION(leveldb_write_buffer_size, OPT_U64) // leveldb write buffer size
+OPTION(leveldb_cache_size, OPT_U64) // leveldb cache size
+OPTION(leveldb_block_size, OPT_U64) // leveldb block size
+OPTION(leveldb_bloom_size, OPT_INT) // leveldb bloom bits per entry
+OPTION(leveldb_max_open_files, OPT_INT) // leveldb max open files
+OPTION(leveldb_compression, OPT_BOOL) // leveldb uses compression
+OPTION(leveldb_paranoid, OPT_BOOL) // leveldb paranoid flag
+OPTION(leveldb_log, OPT_STR) // enable leveldb log file
+OPTION(leveldb_compact_on_mount, OPT_BOOL)
+
+OPTION(kinetic_host, OPT_STR) // hostname or ip address of a kinetic drive to use
+OPTION(kinetic_port, OPT_INT) // port number of the kinetic drive
+OPTION(kinetic_user_id, OPT_INT) // kinetic user to authenticate as
+OPTION(kinetic_hmac_key, OPT_STR) // kinetic key to authenticate with
+OPTION(kinetic_use_ssl, OPT_BOOL) // whether to secure kinetic traffic with TLS
+
+
+OPTION(rocksdb_separate_wal_dir, OPT_BOOL) // use $path.wal for wal
+SAFE_OPTION(rocksdb_db_paths, OPT_STR) // path,size( path,size)*
+OPTION(rocksdb_log_to_ceph_log, OPT_BOOL) // log to ceph log
+OPTION(rocksdb_cache_size, OPT_U64) // rocksdb cache size (unless set by bluestore/etc)
+OPTION(rocksdb_cache_row_ratio, OPT_FLOAT) // ratio of cache for row (vs block)
+OPTION(rocksdb_cache_shard_bits, OPT_INT) // rocksdb block cache shard bits, 4 bit -> 16 shards
+OPTION(rocksdb_cache_type, OPT_STR) // 'lru' or 'clock'
+OPTION(rocksdb_block_size, OPT_INT) // default rocksdb block size
+OPTION(rocksdb_perf, OPT_BOOL) // Enabling this will have 5-10% impact on performance for the stats collection
+OPTION(rocksdb_collect_compaction_stats, OPT_BOOL) // For rocksdb, this behavior will be an overhead of 5%~10%, collected only if rocksdb_perf is enabled.
+OPTION(rocksdb_collect_extended_stats, OPT_BOOL) // For rocksdb, this behavior will be an overhead of 5%~10%, collected only if rocksdb_perf is enabled.
+OPTION(rocksdb_collect_memory_stats, OPT_BOOL) // For rocksdb, this behavior will be an overhead of 5%~10%, collected only if rocksdb_perf is enabled.
+OPTION(rocksdb_enable_rmrange, OPT_BOOL) // see https://github.com/facebook/rocksdb/blob/master/include/rocksdb/db.h#L253
+
+// rocksdb options that will be used for omap(if omap_backend is rocksdb)
+OPTION(filestore_rocksdb_options, OPT_STR)
+// rocksdb options that will be used in monstore
+OPTION(mon_rocksdb_options, OPT_STR)
+
+/**
+ * osd_*_priority adjust the relative priority of client io, recovery io,
+ * snaptrim io, etc
+ *
+ * osd_*_priority determines the ratio of available io between client and
+ * recovery. Each option may be set between
+ * 1..63.
+ */
+OPTION(osd_client_op_priority, OPT_U32)
+OPTION(osd_recovery_op_priority, OPT_U32)
+
+OPTION(osd_snap_trim_priority, OPT_U32)
+OPTION(osd_snap_trim_cost, OPT_U32) // set default cost equal to 1MB io
+
+OPTION(osd_scrub_priority, OPT_U32)
+// set default cost equal to 50MB io
+OPTION(osd_scrub_cost, OPT_U32)
+// set requested scrub priority higher than scrub priority to make the
+// requested scrubs jump the queue of scheduled scrubs
+OPTION(osd_requested_scrub_priority, OPT_U32)
+
+OPTION(osd_recovery_priority, OPT_U32)
+// set default cost equal to 20MB io
+OPTION(osd_recovery_cost, OPT_U32)
+
+/**
+ * osd_recovery_op_warn_multiple scales the normal warning threshold,
+ * osd_op_complaint_time, so that slow recovery ops won't cause noise
+ */
+OPTION(osd_recovery_op_warn_multiple, OPT_U32)
+
+// Max time to wait between notifying mon of shutdown and shutting down
+OPTION(osd_mon_shutdown_timeout, OPT_DOUBLE)
+OPTION(osd_shutdown_pgref_assert, OPT_BOOL) // crash if the OSD has stray PG refs on shutdown
+
+OPTION(osd_max_object_size, OPT_U64) // OSD's maximum object size
+OPTION(osd_max_object_name_len, OPT_U32) // max rados object name len
+OPTION(osd_max_object_namespace_len, OPT_U32) // max rados object namespace len
+OPTION(osd_max_attr_name_len, OPT_U32) // max rados attr name len; cannot go higher than 100 chars for file system backends
+OPTION(osd_max_attr_size, OPT_U64)
+
+OPTION(osd_max_omap_entries_per_request, OPT_U64)
+OPTION(osd_max_omap_bytes_per_request, OPT_U64)
+
+OPTION(osd_objectstore, OPT_STR) // ObjectStore backend type
+OPTION(osd_objectstore_tracing, OPT_BOOL) // true if LTTng-UST tracepoints should be enabled
+OPTION(osd_objectstore_fuse, OPT_BOOL)
+
+OPTION(osd_bench_small_size_max_iops, OPT_U32) // 100 IOPS
+OPTION(osd_bench_large_size_max_throughput, OPT_U64) // 100 MB/s
+OPTION(osd_bench_max_block_size, OPT_U64) // cap the block size at 64MB
+OPTION(osd_bench_duration, OPT_U32) // duration of 'osd bench', capped at 30s to avoid triggering timeouts
+
+OPTION(osd_blkin_trace_all, OPT_BOOL) // create a blkin trace for all osd requests
+OPTION(osdc_blkin_trace_all, OPT_BOOL) // create a blkin trace for all objecter requests
+
+OPTION(osd_discard_disconnected_ops, OPT_BOOL)
+
+OPTION(memstore_device_bytes, OPT_U64)
+OPTION(memstore_page_set, OPT_BOOL)
+OPTION(memstore_page_size, OPT_U64)
+
+OPTION(bdev_debug_inflight_ios, OPT_BOOL)
+OPTION(bdev_inject_crash, OPT_INT) // if N>0, then ~ 1/N IOs will complete before we crash on flush.
+OPTION(bdev_inject_crash_flush_delay, OPT_INT) // wait N more seconds on flush
+OPTION(bdev_aio, OPT_BOOL)
+OPTION(bdev_aio_poll_ms, OPT_INT) // milliseconds
+OPTION(bdev_aio_max_queue_depth, OPT_INT)
+OPTION(bdev_aio_reap_max, OPT_INT)
+OPTION(bdev_block_size, OPT_INT)
+OPTION(bdev_debug_aio, OPT_BOOL)
+OPTION(bdev_debug_aio_suicide_timeout, OPT_FLOAT)
+
+// if yes, osd will unbind all NVMe devices from kernel driver and bind them
+// to the uio_pci_generic driver. The purpose is to prevent the case where
+// NVMe driver is loaded while osd is running.
+OPTION(bdev_nvme_unbind_from_kernel, OPT_BOOL)
+OPTION(bdev_nvme_retry_count, OPT_INT) // -1 means by default which is 4
+
+OPTION(objectstore_blackhole, OPT_BOOL)
+
+OPTION(bluefs_alloc_size, OPT_U64)
+OPTION(bluefs_max_prefetch, OPT_U64)
+OPTION(bluefs_min_log_runway, OPT_U64) // alloc when we get this low
+OPTION(bluefs_max_log_runway, OPT_U64) // alloc this much at a time
+OPTION(bluefs_log_compact_min_ratio, OPT_FLOAT) // before we consider
+OPTION(bluefs_log_compact_min_size, OPT_U64) // before we consider
+OPTION(bluefs_min_flush_size, OPT_U64) // ignore flush until it's this big
+OPTION(bluefs_compact_log_sync, OPT_BOOL) // sync or async log compaction?
+OPTION(bluefs_buffered_io, OPT_BOOL)
+OPTION(bluefs_sync_write, OPT_BOOL)
+OPTION(bluefs_allocator, OPT_STR) // stupid | bitmap
+OPTION(bluefs_preextend_wal_files, OPT_BOOL) // this *requires* that rocksdb has recycling enabled
+
+OPTION(bluestore_bluefs, OPT_BOOL)
+OPTION(bluestore_bluefs_env_mirror, OPT_BOOL) // mirror to normal Env for debug
+OPTION(bluestore_bluefs_min, OPT_U64) // 1gb
+OPTION(bluestore_bluefs_min_ratio, OPT_FLOAT) // min fs free / total free
+OPTION(bluestore_bluefs_max_ratio, OPT_FLOAT) // max fs free / total free
+OPTION(bluestore_bluefs_gift_ratio, OPT_FLOAT) // how much to add at a time
+OPTION(bluestore_bluefs_reclaim_ratio, OPT_FLOAT) // how much to reclaim at a time
+OPTION(bluestore_bluefs_balance_interval, OPT_FLOAT) // how often (sec) to balance free space between bluefs and bluestore
+// If you want to use spdk driver, you need to specify NVMe serial number here
+// with "spdk:" prefix.
+// Users can use 'lspci -vvv -d 8086:0953 | grep "Device Serial Number"' to
+// get the serial number of Intel(R) Fultondale NVMe controllers.
+// Example:
+// bluestore_block_path = spdk:55cd2e404bd73932
+// If you want to run multiple SPDK instances per node, you must specify the
+// amount of dpdk memory size in MB each instance will use, to make sure each
+// instance uses its own dpdk memory
+OPTION(bluestore_spdk_mem, OPT_U32)
+// A hexadecimal bit mask of the cores to run on. Note the core numbering can change between platforms and should be determined beforehand.
+OPTION(bluestore_spdk_coremask, OPT_STR)
+// Specify the maximum number of I/Os to be completed in one batch while checking queue pair completions.
+// The default value 0 means that the SPDK nvme library determines the value.
+OPTION(bluestore_spdk_max_io_completion, OPT_U32)
+OPTION(bluestore_block_path, OPT_STR)
+OPTION(bluestore_block_size, OPT_U64) // 10gb for testing
+OPTION(bluestore_block_create, OPT_BOOL)
+OPTION(bluestore_block_db_path, OPT_STR)
+OPTION(bluestore_block_db_size, OPT_U64) // rocksdb ssts (hot/warm)
+OPTION(bluestore_block_db_create, OPT_BOOL)
+OPTION(bluestore_block_wal_path, OPT_STR)
+OPTION(bluestore_block_wal_size, OPT_U64) // rocksdb wal
+OPTION(bluestore_block_wal_create, OPT_BOOL)
+OPTION(bluestore_block_preallocate_file, OPT_BOOL) // whether to preallocate space if block/db_path/wal_path is a file rather than a block device.
+OPTION(bluestore_csum_type, OPT_STR) // none|xxhash32|xxhash64|crc32c|crc32c_16|crc32c_8
+OPTION(bluestore_csum_min_block, OPT_U32)
+OPTION(bluestore_csum_max_block, OPT_U32)
+OPTION(bluestore_min_alloc_size, OPT_U32)
+OPTION(bluestore_min_alloc_size_hdd, OPT_U32)
+OPTION(bluestore_min_alloc_size_ssd, OPT_U32)
+OPTION(bluestore_max_alloc_size, OPT_U32)
+OPTION(bluestore_prefer_deferred_size, OPT_U32)
+OPTION(bluestore_prefer_deferred_size_hdd, OPT_U32)
+OPTION(bluestore_prefer_deferred_size_ssd, OPT_U32)
+OPTION(bluestore_compression_mode, OPT_STR) // force|aggressive|passive|none
+OPTION(bluestore_compression_algorithm, OPT_STR)
+OPTION(bluestore_compression_min_blob_size, OPT_U32)
+OPTION(bluestore_compression_min_blob_size_hdd, OPT_U32)
+OPTION(bluestore_compression_min_blob_size_ssd, OPT_U32)
+OPTION(bluestore_compression_max_blob_size, OPT_U32)
+OPTION(bluestore_compression_max_blob_size_hdd, OPT_U32)
+OPTION(bluestore_compression_max_blob_size_ssd, OPT_U32)
+/*
+ * Specifies minimum expected amount of saved allocation units
+ * per single blob to enable compressed blobs garbage collection
+ *
+ */
+OPTION(bluestore_gc_enable_blob_threshold, OPT_INT)
+/*
+ * Specifies minimum expected amount of saved allocation units
+ * across all blobs to enable compressed blobs garbage collection
+ *
+ */
+OPTION(bluestore_gc_enable_total_threshold, OPT_INT)
+
+OPTION(bluestore_max_blob_size, OPT_U32)
+OPTION(bluestore_max_blob_size_hdd, OPT_U32)
+OPTION(bluestore_max_blob_size_ssd, OPT_U32)
+/*
+ * Require the net gain of compression at least to be at this ratio,
+ * otherwise we don't compress.
+ * And ask for compressing at least 12.5%(1/8) off, by default.
+ */
+OPTION(bluestore_compression_required_ratio, OPT_DOUBLE)
+OPTION(bluestore_extent_map_shard_max_size, OPT_U32)
+OPTION(bluestore_extent_map_shard_target_size, OPT_U32)
+OPTION(bluestore_extent_map_shard_min_size, OPT_U32)
+OPTION(bluestore_extent_map_shard_target_size_slop, OPT_DOUBLE)
+OPTION(bluestore_extent_map_inline_shard_prealloc_size, OPT_U32)
+OPTION(bluestore_cache_trim_interval, OPT_DOUBLE)
+OPTION(bluestore_cache_trim_max_skip_pinned, OPT_U32) // skip this many onodes pinned in cache before we give up
+OPTION(bluestore_cache_type, OPT_STR) // lru, 2q
+OPTION(bluestore_2q_cache_kin_ratio, OPT_DOUBLE) // kin page slot size / max page slot size
+OPTION(bluestore_2q_cache_kout_ratio, OPT_DOUBLE) // number of kout page slot / total number of page slot
+OPTION(bluestore_cache_size, OPT_U64)
+OPTION(bluestore_cache_size_hdd, OPT_U64)
+OPTION(bluestore_cache_size_ssd, OPT_U64)
+OPTION(bluestore_cache_meta_ratio, OPT_DOUBLE)
+OPTION(bluestore_cache_kv_ratio, OPT_DOUBLE)
+OPTION(bluestore_cache_kv_max, OPT_U64) // limit the maximum amount of cache for the kv store
+OPTION(bluestore_kvbackend, OPT_STR)
+OPTION(bluestore_allocator, OPT_STR) // stupid | bitmap
+OPTION(bluestore_freelist_blocks_per_key, OPT_INT)
+OPTION(bluestore_bitmapallocator_blocks_per_zone, OPT_INT) // must be power of 2 aligned, e.g., 512, 1024, 2048...
+OPTION(bluestore_bitmapallocator_span_size, OPT_INT) // must be power of 2 aligned, e.g., 512, 1024, 2048...
+OPTION(bluestore_max_deferred_txc, OPT_U64)
+OPTION(bluestore_rocksdb_options, OPT_STR)
+OPTION(bluestore_fsck_on_mount, OPT_BOOL)
+OPTION(bluestore_fsck_on_mount_deep, OPT_BOOL)
+OPTION(bluestore_fsck_on_umount, OPT_BOOL)
+OPTION(bluestore_fsck_on_umount_deep, OPT_BOOL)
+OPTION(bluestore_fsck_on_mkfs, OPT_BOOL)
+OPTION(bluestore_fsck_on_mkfs_deep, OPT_BOOL)
+OPTION(bluestore_sync_submit_transaction, OPT_BOOL) // submit kv txn in queueing thread (not kv_sync_thread)
+OPTION(bluestore_throttle_bytes, OPT_U64)
+OPTION(bluestore_throttle_deferred_bytes, OPT_U64)
+OPTION(bluestore_throttle_cost_per_io_hdd, OPT_U64)
+OPTION(bluestore_throttle_cost_per_io_ssd, OPT_U64)
+OPTION(bluestore_throttle_cost_per_io, OPT_U64)
+OPTION(bluestore_deferred_batch_ops, OPT_U64)
+OPTION(bluestore_deferred_batch_ops_hdd, OPT_U64)
+OPTION(bluestore_deferred_batch_ops_ssd, OPT_U64)
+OPTION(bluestore_nid_prealloc, OPT_INT)
+OPTION(bluestore_blobid_prealloc, OPT_U64)
+OPTION(bluestore_clone_cow, OPT_BOOL) // do copy-on-write for clones
+OPTION(bluestore_default_buffered_read, OPT_BOOL)
+OPTION(bluestore_default_buffered_write, OPT_BOOL)
+OPTION(bluestore_debug_misc, OPT_BOOL)
+OPTION(bluestore_debug_no_reuse_blocks, OPT_BOOL)
+OPTION(bluestore_debug_small_allocations, OPT_INT)
+OPTION(bluestore_debug_freelist, OPT_BOOL)
+OPTION(bluestore_debug_prefill, OPT_FLOAT)
+OPTION(bluestore_debug_prefragment_max, OPT_INT)
+OPTION(bluestore_debug_inject_read_err, OPT_BOOL)
+OPTION(bluestore_debug_randomize_serial_transaction, OPT_INT)
+OPTION(bluestore_debug_omit_block_device_write, OPT_BOOL)
+OPTION(bluestore_debug_fsck_abort, OPT_BOOL)
+OPTION(bluestore_debug_omit_kv_commit, OPT_BOOL)
+OPTION(bluestore_debug_permit_any_bdev_label, OPT_BOOL)
+OPTION(bluestore_shard_finishers, OPT_BOOL)
+OPTION(bluestore_debug_random_read_err, OPT_DOUBLE)
+
+OPTION(kstore_max_ops, OPT_U64)
+OPTION(kstore_max_bytes, OPT_U64)
+OPTION(kstore_backend, OPT_STR)
+OPTION(kstore_rocksdb_options, OPT_STR)
+OPTION(kstore_fsck_on_mount, OPT_BOOL)
+OPTION(kstore_fsck_on_mount_deep, OPT_BOOL)
+OPTION(kstore_nid_prealloc, OPT_U64)
+OPTION(kstore_sync_transaction, OPT_BOOL)
+OPTION(kstore_sync_submit_transaction, OPT_BOOL)
+OPTION(kstore_onode_map_size, OPT_U64)
+OPTION(kstore_default_stripe_size, OPT_INT)
+
+OPTION(filestore_omap_backend, OPT_STR)
+OPTION(filestore_omap_backend_path, OPT_STR)
+
+/// filestore wb throttle limits
+OPTION(filestore_wbthrottle_enable, OPT_BOOL)
+OPTION(filestore_wbthrottle_btrfs_bytes_start_flusher, OPT_U64)
+OPTION(filestore_wbthrottle_btrfs_bytes_hard_limit, OPT_U64)
+OPTION(filestore_wbthrottle_btrfs_ios_start_flusher, OPT_U64)
+OPTION(filestore_wbthrottle_btrfs_ios_hard_limit, OPT_U64)
+OPTION(filestore_wbthrottle_btrfs_inodes_start_flusher, OPT_U64)
+OPTION(filestore_wbthrottle_xfs_bytes_start_flusher, OPT_U64)
+OPTION(filestore_wbthrottle_xfs_bytes_hard_limit, OPT_U64)
+OPTION(filestore_wbthrottle_xfs_ios_start_flusher, OPT_U64)
+OPTION(filestore_wbthrottle_xfs_ios_hard_limit, OPT_U64)
+OPTION(filestore_wbthrottle_xfs_inodes_start_flusher, OPT_U64)
+
+/// These must be less than the fd limit
+OPTION(filestore_wbthrottle_btrfs_inodes_hard_limit, OPT_U64)
+OPTION(filestore_wbthrottle_xfs_inodes_hard_limit, OPT_U64)
+
+// Introduce an O_DSYNC write in the filestore
+OPTION(filestore_odsync_write, OPT_BOOL)
+
+// Tests index failure paths
+OPTION(filestore_index_retry_probability, OPT_DOUBLE)
+
+// Allow object read error injection
+OPTION(filestore_debug_inject_read_err, OPT_BOOL)
+OPTION(filestore_debug_random_read_err, OPT_DOUBLE)
+
+OPTION(filestore_debug_omap_check, OPT_BOOL) // Expensive debugging check on sync
+OPTION(filestore_omap_header_cache_size, OPT_INT)
+
+// Use omap for xattrs for attrs over
+// filestore_max_inline_xattr_size or
+OPTION(filestore_max_inline_xattr_size, OPT_U32) //Override
+OPTION(filestore_max_inline_xattr_size_xfs, OPT_U32)
+OPTION(filestore_max_inline_xattr_size_btrfs, OPT_U32)
+OPTION(filestore_max_inline_xattr_size_other, OPT_U32)
+
+// for more than filestore_max_inline_xattrs attrs
+OPTION(filestore_max_inline_xattrs, OPT_U32) //Override
+OPTION(filestore_max_inline_xattrs_xfs, OPT_U32)
+OPTION(filestore_max_inline_xattrs_btrfs, OPT_U32)
+OPTION(filestore_max_inline_xattrs_other, OPT_U32)
+
+// max xattr value size
+OPTION(filestore_max_xattr_value_size, OPT_U32) //Override
+OPTION(filestore_max_xattr_value_size_xfs, OPT_U32)
+OPTION(filestore_max_xattr_value_size_btrfs, OPT_U32)
+// ext4 allows 4k xattrs total including some smallish extra fields and the
+// keys. We're allowing 2 512 inline attrs in addition to some filestore
+// replay attrs. After accounting for those, we still need to fit up to
+// two attrs of this value. That means we need this value to be around 1k
+// to be safe. This is hacky, but it's not worth complicating the code
+// to work around ext4's total xattr limit.
+OPTION(filestore_max_xattr_value_size_other, OPT_U32)
+
+OPTION(filestore_sloppy_crc, OPT_BOOL) // track sloppy crcs
+OPTION(filestore_sloppy_crc_block_size, OPT_INT)
+
+OPTION(filestore_max_alloc_hint_size, OPT_U64) // bytes
+
+OPTION(filestore_max_sync_interval, OPT_DOUBLE) // seconds
+OPTION(filestore_min_sync_interval, OPT_DOUBLE) // seconds
+OPTION(filestore_btrfs_snap, OPT_BOOL)
+OPTION(filestore_btrfs_clone_range, OPT_BOOL)
+OPTION(filestore_zfs_snap, OPT_BOOL) // zfsonlinux is still unstable
+OPTION(filestore_fsync_flushes_journal_data, OPT_BOOL)
+OPTION(filestore_fiemap, OPT_BOOL) // (try to) use fiemap
+OPTION(filestore_punch_hole, OPT_BOOL)
+OPTION(filestore_seek_data_hole, OPT_BOOL) // (try to) use seek_data/hole
+OPTION(filestore_splice, OPT_BOOL)
+OPTION(filestore_fadvise, OPT_BOOL)
+//collect device partition information for management application to use
+OPTION(filestore_collect_device_partition_information, OPT_BOOL)
+
+// (try to) use extsize for alloc hint NOTE: extsize seems to trigger
+// data corruption in xfs prior to kernel 3.5. filestore will
+// implicitly disable this if it cannot confirm the kernel is newer
+// than that.
+// NOTE: This option involves a tradeoff: When disabled, fragmentation is
+// worse, but large sequential writes are faster. When enabled, large
+// sequential writes are slower, but fragmentation is reduced.
+OPTION(filestore_xfs_extsize, OPT_BOOL)
+
+OPTION(filestore_journal_parallel, OPT_BOOL)
+OPTION(filestore_journal_writeahead, OPT_BOOL)
+OPTION(filestore_journal_trailing, OPT_BOOL)
+OPTION(filestore_queue_max_ops, OPT_U64)
+OPTION(filestore_queue_max_bytes, OPT_U64)
+
+OPTION(filestore_caller_concurrency, OPT_INT)
+
+/// Expected filestore throughput in B/s
+OPTION(filestore_expected_throughput_bytes, OPT_DOUBLE)
+/// Expected filestore throughput in ops/s
+OPTION(filestore_expected_throughput_ops, OPT_DOUBLE)
+
+/// Filestore max delay multiple. Defaults to 0 (disabled)
+OPTION(filestore_queue_max_delay_multiple, OPT_DOUBLE)
+/// Filestore high delay multiple. Defaults to 0 (disabled)
+OPTION(filestore_queue_high_delay_multiple, OPT_DOUBLE)
+
+/// Use above to inject delays intended to keep the op queue between low and high
+OPTION(filestore_queue_low_threshhold, OPT_DOUBLE)
+OPTION(filestore_queue_high_threshhold, OPT_DOUBLE)
+
+OPTION(filestore_op_threads, OPT_INT)
+OPTION(filestore_op_thread_timeout, OPT_INT)
+OPTION(filestore_op_thread_suicide_timeout, OPT_INT)
+OPTION(filestore_commit_timeout, OPT_FLOAT)
+OPTION(filestore_fiemap_threshold, OPT_INT)
+OPTION(filestore_merge_threshold, OPT_INT)
+OPTION(filestore_split_multiple, OPT_INT)
+OPTION(filestore_split_rand_factor, OPT_U32) // randomize the split threshold by adding 16 * [0, rand_factor)
+OPTION(filestore_update_to, OPT_INT)
+OPTION(filestore_blackhole, OPT_BOOL) // drop any new transactions on the floor
+OPTION(filestore_fd_cache_size, OPT_INT) // FD lru size
+OPTION(filestore_fd_cache_shards, OPT_INT) // FD number of shards
+OPTION(filestore_ondisk_finisher_threads, OPT_INT)
+OPTION(filestore_apply_finisher_threads, OPT_INT)
+OPTION(filestore_dump_file, OPT_STR) // file onto which store transaction dumps
+OPTION(filestore_kill_at, OPT_INT) // inject a failure at the n'th opportunity
+OPTION(filestore_inject_stall, OPT_INT) // artificially stall for N seconds in op queue thread
+OPTION(filestore_fail_eio, OPT_BOOL) // fail/crash on EIO
+OPTION(filestore_debug_verify_split, OPT_BOOL)
+OPTION(journal_dio, OPT_BOOL)
+OPTION(journal_aio, OPT_BOOL)
+OPTION(journal_force_aio, OPT_BOOL)
+OPTION(journal_block_size, OPT_INT)
+
+// max bytes to search ahead in journal searching for corruption
+OPTION(journal_max_corrupt_search, OPT_U64)
+OPTION(journal_block_align, OPT_BOOL)
+OPTION(journal_write_header_frequency, OPT_U64)
+OPTION(journal_max_write_bytes, OPT_INT)
+OPTION(journal_max_write_entries, OPT_INT)
+
+/// Target range for journal fullness
+OPTION(journal_throttle_low_threshhold, OPT_DOUBLE)
+OPTION(journal_throttle_high_threshhold, OPT_DOUBLE)
+
+/// Multiple over expected at high_threshhold. Defaults to 0 (disabled).
+OPTION(journal_throttle_high_multiple, OPT_DOUBLE)
+/// Multiple over expected at max. Defaults to 0 (disabled).
+OPTION(journal_throttle_max_multiple, OPT_DOUBLE)
+
+OPTION(journal_align_min_size, OPT_INT) // align data payloads >= this.
+OPTION(journal_replay_from, OPT_INT)
+OPTION(journal_zero_on_create, OPT_BOOL)
+OPTION(journal_ignore_corruption, OPT_BOOL) // assume journal is not corrupt
+OPTION(journal_discard, OPT_BOOL) // when an SSD is used as the journal, whether to discard unused journal data
+
+OPTION(fio_dir, OPT_STR) // fio data directory for fio-objectstore
+
+OPTION(rados_mon_op_timeout, OPT_DOUBLE) // how many seconds to wait for a response from the monitor before returning an error from a rados operation. 0 means no limit.
+OPTION(rados_osd_op_timeout, OPT_DOUBLE) // how many seconds to wait for a response from osds before returning an error from a rados operation. 0 means no limit.
+OPTION(rados_tracing, OPT_BOOL) // true if LTTng-UST tracepoints should be enabled
+
+OPTION(rbd_op_threads, OPT_INT)
+OPTION(rbd_op_thread_timeout, OPT_INT)
+OPTION(rbd_non_blocking_aio, OPT_BOOL) // process AIO ops from a worker thread to prevent blocking
+OPTION(rbd_cache, OPT_BOOL) // whether to enable caching (writeback unless rbd_cache_max_dirty is 0)
+OPTION(rbd_cache_writethrough_until_flush, OPT_BOOL) // whether to make writeback caching writethrough until flush is called, to be sure the user of librbd will send flushes so that writeback is safe
+OPTION(rbd_cache_size, OPT_LONGLONG) // cache size in bytes
+OPTION(rbd_cache_max_dirty, OPT_LONGLONG) // dirty limit in bytes - set to 0 for write-through caching
+OPTION(rbd_cache_target_dirty, OPT_LONGLONG) // target dirty limit in bytes
+OPTION(rbd_cache_max_dirty_age, OPT_FLOAT) // seconds in cache before writeback starts
+OPTION(rbd_cache_max_dirty_object, OPT_INT) // dirty limit for objects - set to 0 for auto calculate from rbd_cache_size
+OPTION(rbd_cache_block_writes_upfront, OPT_BOOL) // whether to block writes to the cache before the aio_write call completes (true), or block before the aio completion is called (false)
+OPTION(rbd_concurrent_management_ops, OPT_INT) // how many operations can be in flight for a management operation like deleting or resizing an image
+OPTION(rbd_balance_snap_reads, OPT_BOOL)
+OPTION(rbd_localize_snap_reads, OPT_BOOL)
+OPTION(rbd_balance_parent_reads, OPT_BOOL)
+OPTION(rbd_localize_parent_reads, OPT_BOOL)
+OPTION(rbd_readahead_trigger_requests, OPT_INT) // number of sequential requests necessary to trigger readahead
+OPTION(rbd_readahead_max_bytes, OPT_LONGLONG) // set to 0 to disable readahead
+OPTION(rbd_readahead_disable_after_bytes, OPT_LONGLONG) // how many bytes are read in total before readahead is disabled
+OPTION(rbd_clone_copy_on_read, OPT_BOOL)
+OPTION(rbd_blacklist_on_break_lock, OPT_BOOL) // whether to blacklist clients whose lock was broken
+OPTION(rbd_blacklist_expire_seconds, OPT_INT) // number of seconds to blacklist - set to 0 for OSD default
+OPTION(rbd_request_timed_out_seconds, OPT_INT) // number of seconds before maint request times out
+OPTION(rbd_skip_partial_discard, OPT_BOOL) // when trying to discard a range inside an object, set to true to skip zeroing the range.
+OPTION(rbd_enable_alloc_hint, OPT_BOOL) // when writing an object, issue a hint to the osd backend indicating the expected object size
+OPTION(rbd_tracing, OPT_BOOL) // true if LTTng-UST tracepoints should be enabled
+OPTION(rbd_blkin_trace_all, OPT_BOOL) // create a blkin trace for all RBD requests
+OPTION(rbd_validate_pool, OPT_BOOL) // true if empty pools should be validated for RBD compatibility
+OPTION(rbd_validate_names, OPT_BOOL) // true if image specs should be validated
+OPTION(rbd_auto_exclusive_lock_until_manual_request, OPT_BOOL) // whether to automatically acquire/release exclusive lock until it is explicitly requested, i.e. before we know the user of librbd is properly using the lock API
+OPTION(rbd_mirroring_resync_after_disconnect, OPT_BOOL) // automatically start image resync after mirroring is disconnected due to being laggy
+OPTION(rbd_mirroring_replay_delay, OPT_INT) // time-delay in seconds for rbd-mirror asynchronous replication
+
+OPTION(rbd_default_pool, OPT_STR) // default pool for storing images
+
+/*
+ * The following options change the behavior for librbd's image creation methods that
+ * don't require all of the parameters. These are provided so that older programs
+ * can take advantage of newer features without being rewritten to use new versions
+ * of the image creation functions.
+ *
+ * rbd_create()/RBD::create() are affected by all of these options.
+ *
+ * rbd_create2()/RBD::create2() and rbd_clone()/RBD::clone() are affected by:
+ * - rbd_default_order
+ * - rbd_default_stripe_count
+ * - rbd_default_stripe_unit
+ *
+ * rbd_create3()/RBD::create3() and rbd_clone2/RBD::clone2() are only
+ * affected by rbd_default_order.
+ */
+OPTION(rbd_default_format, OPT_INT)
+OPTION(rbd_default_order, OPT_INT)
+OPTION(rbd_default_stripe_count, OPT_U64) // changing requires stripingv2 feature
+OPTION(rbd_default_stripe_unit, OPT_U64) // changing to non-object size requires stripingv2 feature
+OPTION(rbd_default_data_pool, OPT_STR) // optional default pool for storing image data blocks
+
+/**
+ * RBD features are only applicable for v2 images. This setting accepts either
+ * an integer bitmask value or comma-delimited string of RBD feature names.
+ * This setting is always internally stored as an integer bitmask value. The
+ * mapping between feature bitmask value and feature name is as follows:
+ *
+ * +1 -> layering
+ * +2 -> striping
+ * +4 -> exclusive-lock
+ * +8 -> object-map
+ * +16 -> fast-diff
+ * +32 -> deep-flatten
+ * +64 -> journaling
+ * +128 -> data-pool
+ */
+SAFE_OPTION(rbd_default_features, OPT_STR)
+
+OPTION(rbd_default_map_options, OPT_STR) // default rbd map -o / --options
+
+/**
+ * RBD journal options.
+ */
+OPTION(rbd_journal_order, OPT_U32) // bits to shift to compute journal object max size, between 12 and 64
+OPTION(rbd_journal_splay_width, OPT_U32) // number of active journal objects
+OPTION(rbd_journal_commit_age, OPT_DOUBLE) // commit time interval, seconds
+OPTION(rbd_journal_object_flush_interval, OPT_INT) // maximum number of pending commits per journal object
+OPTION(rbd_journal_object_flush_bytes, OPT_INT) // maximum number of pending bytes per journal object
+OPTION(rbd_journal_object_flush_age, OPT_DOUBLE) // maximum age (in seconds) for pending commits
+OPTION(rbd_journal_pool, OPT_STR) // pool for journal objects
+OPTION(rbd_journal_max_payload_bytes, OPT_U32) // maximum journal payload size before splitting
+OPTION(rbd_journal_max_concurrent_object_sets, OPT_INT) // maximum number of object sets a journal client can be behind before it is automatically unregistered
+
+/**
+ * RBD Mirror options
+ */
+OPTION(rbd_mirror_journal_commit_age, OPT_DOUBLE) // commit time interval, seconds
+OPTION(rbd_mirror_journal_poll_age, OPT_DOUBLE) // maximum age (in seconds) between successive journal polls
+OPTION(rbd_mirror_journal_max_fetch_bytes, OPT_U32) // maximum bytes to read from each journal data object per fetch
+OPTION(rbd_mirror_sync_point_update_age, OPT_DOUBLE) // number of seconds between each update of the image sync point object number
+OPTION(rbd_mirror_concurrent_image_syncs, OPT_U32) // maximum number of image syncs in parallel
+OPTION(rbd_mirror_pool_replayers_refresh_interval, OPT_INT) // interval to refresh peers in rbd-mirror daemon
+OPTION(rbd_mirror_delete_retry_interval, OPT_DOUBLE) // interval to check and retry the failed requests in deleter
+OPTION(rbd_mirror_image_state_check_interval, OPT_INT) // interval to get images from pool watcher and set sources in replayer
+OPTION(rbd_mirror_leader_heartbeat_interval, OPT_INT) // interval (in seconds) between mirror leader heartbeats
+OPTION(rbd_mirror_leader_max_missed_heartbeats, OPT_INT) // number of missed heartbeats for non-lock owner to attempt to acquire lock
+OPTION(rbd_mirror_leader_max_acquire_attempts_before_break, OPT_INT) // number of failed attempts to acquire lock after missing heartbeats before breaking lock
+
+OPTION(nss_db_path, OPT_STR) // path to nss db
+
+
+OPTION(rgw_max_chunk_size, OPT_INT)
+OPTION(rgw_put_obj_min_window_size, OPT_INT)
+OPTION(rgw_put_obj_max_window_size, OPT_INT)
+OPTION(rgw_max_put_size, OPT_U64)
+OPTION(rgw_max_put_param_size, OPT_U64) // max input size for PUT requests accepting json/xml params
+
+/**
+ * override max bucket index shards in zone configuration (if not zero)
+ *
+ * Represents the number of shards for the bucket index object, a value of zero
+ * indicates there is no sharding. By default (no sharding), the name of the object
+ * is '.dir.{marker}'; with sharding, the name is '.dir.{marker}.{sharding_id}',
+ * where sharding_id is a zero-based value. It is not recommended to set too large a value
+ * (e.g. a thousand) as it increases the cost of bucket listing.
+ */
+OPTION(rgw_override_bucket_index_max_shards, OPT_U32)
+
+/**
+ * Represents the maximum AIO pending requests for the bucket index object shards.
+ */
+OPTION(rgw_bucket_index_max_aio, OPT_U32)
+
+/**
+ * whether or not the quota/gc threads should be started
+ */
+OPTION(rgw_enable_quota_threads, OPT_BOOL)
+OPTION(rgw_enable_gc_threads, OPT_BOOL)
+OPTION(rgw_enable_lc_threads, OPT_BOOL)
+
+
+OPTION(rgw_data, OPT_STR)
+OPTION(rgw_enable_apis, OPT_STR)
+OPTION(rgw_cache_enabled, OPT_BOOL) // rgw cache enabled
+OPTION(rgw_cache_lru_size, OPT_INT) // num of entries in rgw cache
+OPTION(rgw_socket_path, OPT_STR) // path to unix domain socket, if not specified, rgw will not run as external fcgi
+OPTION(rgw_host, OPT_STR) // host for radosgw, can be an IP, default is 0.0.0.0
+OPTION(rgw_port, OPT_STR) // port to listen, format as "8080" "5000", if not specified, rgw will not run external fcgi
+OPTION(rgw_dns_name, OPT_STR) // hostname suffix on buckets
+OPTION(rgw_dns_s3website_name, OPT_STR) // hostname suffix on buckets for s3-website endpoint
+OPTION(rgw_content_length_compat, OPT_BOOL) // Check both HTTP_CONTENT_LENGTH and CONTENT_LENGTH in fcgi env
+OPTION(rgw_lifecycle_work_time, OPT_STR) //job process lc at 00:00-06:00s
+OPTION(rgw_lc_lock_max_time, OPT_INT) // total run time for a single lc processor work
+OPTION(rgw_lc_max_objs, OPT_INT)
+OPTION(rgw_lc_debug_interval, OPT_INT) // Debug run interval, in seconds
+OPTION(rgw_script_uri, OPT_STR) // alternative value for SCRIPT_URI if not set in request
+OPTION(rgw_request_uri, OPT_STR) // alternative value for REQUEST_URI if not set in request
+OPTION(rgw_swift_url, OPT_STR) // the swift url, being published by the internal swift auth
+OPTION(rgw_swift_url_prefix, OPT_STR) // entry point for which a url is considered a swift url
+OPTION(rgw_swift_auth_url, OPT_STR) // default URL to go and verify tokens for v1 auth (if not using internal swift auth)
+OPTION(rgw_swift_auth_entry, OPT_STR) // entry point for which a url is considered a swift auth url
+OPTION(rgw_swift_tenant_name, OPT_STR) // tenant name to use for swift access
+OPTION(rgw_swift_account_in_url, OPT_BOOL) // assume that URL always contain the account (aka tenant) part
+OPTION(rgw_swift_enforce_content_length, OPT_BOOL) // enforce generation of Content-Length even in cost of performance or scalability
+OPTION(rgw_keystone_url, OPT_STR) // url for keystone server
+OPTION(rgw_keystone_admin_token, OPT_STR) // keystone admin token (shared secret)
+OPTION(rgw_keystone_admin_user, OPT_STR) // keystone admin user name
+OPTION(rgw_keystone_admin_password, OPT_STR) // keystone admin user password
+OPTION(rgw_keystone_admin_tenant, OPT_STR) // keystone admin user tenant (for keystone v2.0)
+OPTION(rgw_keystone_admin_project, OPT_STR) // keystone admin user project (for keystone v3)
+OPTION(rgw_keystone_admin_domain, OPT_STR) // keystone admin user domain
+OPTION(rgw_keystone_barbican_user, OPT_STR) // keystone user to access barbican secrets
+OPTION(rgw_keystone_barbican_password, OPT_STR) // keystone password for barbican user
+OPTION(rgw_keystone_barbican_tenant, OPT_STR) // keystone barbican user tenant (for keystone v2.0)
+OPTION(rgw_keystone_barbican_project, OPT_STR) // keystone barbican user project (for keystone v3)
+OPTION(rgw_keystone_barbican_domain, OPT_STR) // keystone barbican user domain
+OPTION(rgw_keystone_api_version, OPT_INT) // Version of Keystone API to use (2 or 3)
+OPTION(rgw_keystone_accepted_roles, OPT_STR) // roles required to serve requests
+OPTION(rgw_keystone_accepted_admin_roles, OPT_STR) // list of roles allowing a user to gain admin privileges
+OPTION(rgw_keystone_token_cache_size, OPT_INT) // max number of entries in keystone token cache
+OPTION(rgw_keystone_revocation_interval, OPT_INT) // seconds between tokens revocation check
+OPTION(rgw_keystone_verify_ssl, OPT_BOOL) // should we try to verify keystone's ssl
+OPTION(rgw_keystone_implicit_tenants, OPT_BOOL) // create new users in their own tenants of the same name
+OPTION(rgw_cross_domain_policy, OPT_STR)
+OPTION(rgw_healthcheck_disabling_path, OPT_STR) // path whose existence causes the healthcheck to respond 503
+OPTION(rgw_s3_auth_use_rados, OPT_BOOL) // should we try to use the internal credentials for s3?
+OPTION(rgw_s3_auth_use_keystone, OPT_BOOL) // should we try to use keystone for s3?
+OPTION(rgw_s3_auth_aws4_force_boto2_compat, OPT_BOOL) // force aws4 auth boto2 compatibility
+OPTION(rgw_barbican_url, OPT_STR) // url for barbican server
+
+/* OpenLDAP-style LDAP parameter strings */
+/* rgw_ldap_uri space-separated list of LDAP servers in URI format */
+OPTION(rgw_ldap_uri, OPT_STR)
+/* rgw_ldap_binddn LDAP entry RGW will bind with (user match) */
+OPTION(rgw_ldap_binddn, OPT_STR)
+/* rgw_ldap_searchdn LDAP search base (basedn) */
+OPTION(rgw_ldap_searchdn, OPT_STR)
+/* rgw_ldap_dnattr LDAP attribute containing RGW user names (to form binddns)*/
+OPTION(rgw_ldap_dnattr, OPT_STR)
+/* rgw_ldap_secret file containing credentials for rgw_ldap_binddn */
+OPTION(rgw_ldap_secret, OPT_STR)
+/* rgw_s3_auth_use_ldap use LDAP for RGW auth? */
+OPTION(rgw_s3_auth_use_ldap, OPT_BOOL)
+/* rgw_ldap_searchfilter LDAP search filter */
+OPTION(rgw_ldap_searchfilter, OPT_STR)
+
+OPTION(rgw_admin_entry, OPT_STR) // entry point for which a url is considered an admin request
+OPTION(rgw_enforce_swift_acls, OPT_BOOL)
+OPTION(rgw_swift_token_expiration, OPT_INT) // time in seconds for swift token expiration
+OPTION(rgw_print_continue, OPT_BOOL) // enable if 100-Continue works
+OPTION(rgw_print_prohibited_content_length, OPT_BOOL) // violate RFC 7230 and send Content-Length in 204 and 304
+OPTION(rgw_remote_addr_param, OPT_STR) // e.g. X-Forwarded-For, if you have a reverse proxy
+OPTION(rgw_op_thread_timeout, OPT_INT)
+OPTION(rgw_op_thread_suicide_timeout, OPT_INT)
+OPTION(rgw_thread_pool_size, OPT_INT)
+OPTION(rgw_num_control_oids, OPT_INT)
+OPTION(rgw_num_rados_handles, OPT_U32)
+OPTION(rgw_verify_ssl, OPT_BOOL) // should http_client try to verify ssl when sent https request
+
+/* The following are tunables for caches of RGW NFS (and other file
+ * client) objects.
+ *
+ * The file handle cache is a partitioned hash table
+ * (fhcache_partitions), each with a closed hash part and backing
+ * b-tree mapping. The number of partitions is expected to be a small
+ * prime, the cache size something larger but less than 5K, the total
+ * size of the cache is n_part * cache_size.
+ */
+OPTION(rgw_nfs_lru_lanes, OPT_INT)
+OPTION(rgw_nfs_lru_lane_hiwat, OPT_INT)
+OPTION(rgw_nfs_fhcache_partitions, OPT_INT)
+OPTION(rgw_nfs_fhcache_size, OPT_INT) /* 3*2017=6051 */
+OPTION(rgw_nfs_namespace_expire_secs, OPT_INT) /* namespace invalidate
+ * timer */
+OPTION(rgw_nfs_max_gc, OPT_INT) /* max gc events per cycle */
+OPTION(rgw_nfs_write_completion_interval_s, OPT_INT) /* stateless (V3)
+ * commit
+ * delay */
+
+OPTION(rgw_zone, OPT_STR) // zone name
+OPTION(rgw_zone_root_pool, OPT_STR) // pool where zone specific info is stored
+OPTION(rgw_default_zone_info_oid, OPT_STR) // oid where default zone info is stored
+OPTION(rgw_region, OPT_STR) // region name
+OPTION(rgw_region_root_pool, OPT_STR) // pool where all region info is stored
+OPTION(rgw_default_region_info_oid, OPT_STR) // oid where default region info is stored
+OPTION(rgw_zonegroup, OPT_STR) // zone group name
+OPTION(rgw_zonegroup_root_pool, OPT_STR) // pool where all zone group info is stored
+OPTION(rgw_default_zonegroup_info_oid, OPT_STR) // oid where default zone group info is stored
+OPTION(rgw_realm, OPT_STR) // realm name
+OPTION(rgw_realm_root_pool, OPT_STR) // pool where all realm info is stored
+OPTION(rgw_default_realm_info_oid, OPT_STR) // oid where default realm info is stored
+OPTION(rgw_period_root_pool, OPT_STR) // pool where all period info is stored
+OPTION(rgw_period_latest_epoch_info_oid, OPT_STR) // oid where current period info is stored
+OPTION(rgw_log_nonexistent_bucket, OPT_BOOL)
+OPTION(rgw_log_object_name, OPT_STR) // man date to see codes (a subset are supported)
+OPTION(rgw_log_object_name_utc, OPT_BOOL)
+OPTION(rgw_usage_max_shards, OPT_INT)
+OPTION(rgw_usage_max_user_shards, OPT_INT)
+OPTION(rgw_enable_ops_log, OPT_BOOL) // enable logging every rgw operation
+OPTION(rgw_enable_usage_log, OPT_BOOL) // enable logging bandwidth usage
+OPTION(rgw_ops_log_rados, OPT_BOOL) // whether ops log should go to rados
+OPTION(rgw_ops_log_socket_path, OPT_STR) // path to unix domain socket where ops log can go
+OPTION(rgw_ops_log_data_backlog, OPT_INT) // max data backlog for ops log
+OPTION(rgw_fcgi_socket_backlog, OPT_INT) // socket backlog for fcgi
+OPTION(rgw_usage_log_flush_threshold, OPT_INT) // threshold to flush pending log data
+OPTION(rgw_usage_log_tick_interval, OPT_INT) // flush pending log data every X seconds
+OPTION(rgw_intent_log_object_name, OPT_STR) // man date to see codes (a subset are supported)
+OPTION(rgw_intent_log_object_name_utc, OPT_BOOL)
+OPTION(rgw_init_timeout, OPT_INT) // time in seconds
+OPTION(rgw_mime_types_file, OPT_STR)
+OPTION(rgw_gc_max_objs, OPT_INT)
+OPTION(rgw_gc_obj_min_wait, OPT_INT) // wait time before object may be handled by gc
+OPTION(rgw_gc_processor_max_time, OPT_INT) // total run time for a single gc processor work
+OPTION(rgw_gc_processor_period, OPT_INT) // gc processor cycle time
+OPTION(rgw_s3_success_create_obj_status, OPT_INT) // alternative success status response for create-obj (0 - default)
+OPTION(rgw_resolve_cname, OPT_BOOL) // should rgw try to resolve hostname as a dns cname record
+OPTION(rgw_obj_stripe_size, OPT_INT)
+OPTION(rgw_extended_http_attrs, OPT_STR) // list of extended attrs that can be set on objects (beyond the default)
+OPTION(rgw_exit_timeout_secs, OPT_INT) // how many seconds to wait for process to go down before exiting unconditionally
+OPTION(rgw_get_obj_window_size, OPT_INT) // window size in bytes for single get obj request
+OPTION(rgw_get_obj_max_req_size, OPT_INT) // max length of a single get obj rados op
+OPTION(rgw_relaxed_s3_bucket_names, OPT_BOOL) // enable relaxed bucket name rules for US region buckets
+OPTION(rgw_defer_to_bucket_acls, OPT_STR) // if the user has bucket perms, use those before key perms (recurse and full_control)
+OPTION(rgw_list_buckets_max_chunk, OPT_INT) // max buckets to retrieve in a single op when listing user buckets
+OPTION(rgw_md_log_max_shards, OPT_INT) // max shards for metadata log
+OPTION(rgw_num_zone_opstate_shards, OPT_INT) // max shards for keeping inter-region copy progress info
+OPTION(rgw_opstate_ratelimit_sec, OPT_INT) // min time between opstate updates on a single upload (0 for disabling ratelimit)
+OPTION(rgw_curl_wait_timeout_ms, OPT_INT) // timeout for certain curl calls
+OPTION(rgw_copy_obj_progress, OPT_BOOL) // should dump progress during long copy operations?
+OPTION(rgw_copy_obj_progress_every_bytes, OPT_INT) // min bytes between copy progress output
+OPTION(rgw_obj_tombstone_cache_size, OPT_INT) // how many objects in tombstone cache, which is used in multi-zone sync to keep
+ // track of removed objects' mtime
+
+OPTION(rgw_data_log_window, OPT_INT) // data log entries window (in seconds)
+OPTION(rgw_data_log_changes_size, OPT_INT) // number of in-memory entries to hold for data changes log
+OPTION(rgw_data_log_num_shards, OPT_INT) // number of objects to keep data changes log on
+OPTION(rgw_data_log_obj_prefix, OPT_STR) //
+OPTION(rgw_replica_log_obj_prefix, OPT_STR) //
+
+OPTION(rgw_bucket_quota_ttl, OPT_INT) // time for cached bucket stats to be cached within rgw instance
+OPTION(rgw_bucket_quota_soft_threshold, OPT_DOUBLE) // threshold from which we don't rely on cached info for quota decisions
+OPTION(rgw_bucket_quota_cache_size, OPT_INT) // number of entries in bucket quota cache
+OPTION(rgw_bucket_default_quota_max_objects, OPT_INT) // number of objects allowed
+OPTION(rgw_bucket_default_quota_max_size, OPT_LONGLONG) // Max size of object in bytes
+
+OPTION(rgw_expose_bucket, OPT_BOOL) // Return the bucket name in the 'Bucket' response header
+
+OPTION(rgw_frontends, OPT_STR) // rgw front ends
+
+OPTION(rgw_user_quota_bucket_sync_interval, OPT_INT) // time period for accumulating modified buckets before syncing stats
+OPTION(rgw_user_quota_sync_interval, OPT_INT) // time period for accumulating modified buckets before syncing entire user stats
+OPTION(rgw_user_quota_sync_idle_users, OPT_BOOL) // whether stats for idle users be fully synced
+OPTION(rgw_user_quota_sync_wait_time, OPT_INT) // min time between two full stats sync for non-idle users
+OPTION(rgw_user_default_quota_max_objects, OPT_INT) // number of objects allowed
+OPTION(rgw_user_default_quota_max_size, OPT_LONGLONG) // Max size of object in bytes
+
+OPTION(rgw_multipart_min_part_size, OPT_INT) // min size for each part (except for last one) in multipart upload
+OPTION(rgw_multipart_part_upload_limit, OPT_INT) // parts limit in multipart upload
+
+OPTION(rgw_max_slo_entries, OPT_INT) // default number of max entries in slo
+
+OPTION(rgw_olh_pending_timeout_sec, OPT_INT) // time until we retire a pending olh change
+OPTION(rgw_user_max_buckets, OPT_INT) // global option to set max buckets count for all users
+
+OPTION(rgw_objexp_gc_interval, OPT_U32) // maximum time between round of expired objects garbage collecting
+OPTION(rgw_objexp_time_step, OPT_U32) // number of seconds for rounding the timestamps
+OPTION(rgw_objexp_hints_num_shards, OPT_U32) // maximum number of parts in which the hint index is stored in
+OPTION(rgw_objexp_chunk_size, OPT_U32) // maximum number of entries in a single operation when processing objexp data
+
+OPTION(rgw_enable_static_website, OPT_BOOL) // enable static website feature
+OPTION(rgw_log_http_headers, OPT_STR) // list of HTTP headers to log when seen, ignores case (e.g., http_x_forwarded_for)
+
+OPTION(rgw_num_async_rados_threads, OPT_INT) // num of threads to use for async rados operations
+OPTION(rgw_md_notify_interval_msec, OPT_INT) // metadata changes notification interval to followers
+OPTION(rgw_run_sync_thread, OPT_BOOL) // whether radosgw (not radosgw-admin) spawns the sync thread
+OPTION(rgw_sync_lease_period, OPT_INT) // time in seconds for lease that rgw takes on a specific log (or log shard)
+OPTION(rgw_sync_log_trim_interval, OPT_INT) // time in seconds between attempts to trim sync logs
+
+OPTION(rgw_sync_data_inject_err_probability, OPT_DOUBLE) // range [0, 1]
+OPTION(rgw_sync_meta_inject_err_probability, OPT_DOUBLE) // range [0, 1]
+
+
+OPTION(rgw_period_push_interval, OPT_DOUBLE) // seconds to wait before retrying "period push"
+OPTION(rgw_period_push_interval_max, OPT_DOUBLE) // maximum interval after exponential backoff
+
+OPTION(rgw_safe_max_objects_per_shard, OPT_INT) // safe max loading
+OPTION(rgw_shard_warning_threshold, OPT_DOUBLE) // pct of safe max
+ // at which to warn
+
+OPTION(rgw_swift_versioning_enabled, OPT_BOOL) // whether swift object versioning feature is enabled
+
+OPTION(mgr_module_path, OPT_STR) // where to load python modules from
+OPTION(mgr_initial_modules, OPT_STR) // Which modules to load
+OPTION(mgr_data, OPT_STR) // where to find keyring etc
+OPTION(mgr_tick_period, OPT_INT) // How frequently to tick
+OPTION(mgr_stats_period, OPT_INT) // How frequently clients send stats
+OPTION(mgr_client_bytes, OPT_U64) // bytes from clients
+OPTION(mgr_client_messages, OPT_U64) // messages from clients
+OPTION(mgr_osd_bytes, OPT_U64) // bytes from osds
+OPTION(mgr_osd_messages, OPT_U64) // messages from osds
+OPTION(mgr_mds_bytes, OPT_U64) // bytes from mdss
+OPTION(mgr_mds_messages, OPT_U64) // messages from mdss
+OPTION(mgr_mon_bytes, OPT_U64) // bytes from mons
+OPTION(mgr_mon_messages, OPT_U64) // messages from mons
+
+OPTION(mgr_connect_retry_interval, OPT_DOUBLE)
+OPTION(mgr_service_beacon_grace, OPT_DOUBLE)
+
+OPTION(mon_mgr_digest_period, OPT_INT) // How frequently to send digests
+OPTION(mon_mgr_beacon_grace, OPT_INT) // How long to wait to failover
+OPTION(mon_mgr_inactive_grace, OPT_INT) // How long before health WARN -> ERR
+OPTION(mon_mgr_mkfs_grace, OPT_INT) // How long before we complain about MGR_DOWN
+OPTION(rgw_crypt_require_ssl, OPT_BOOL) // requests including encryption key headers must be sent over ssl
+OPTION(rgw_crypt_default_encryption_key, OPT_STR) // base64 encoded key for encryption of rgw objects
+OPTION(rgw_crypt_s3_kms_encryption_keys, OPT_STR) // extra keys that may be used for aws:kms
+ // defined as map "key1=YmluCmJvb3N0CmJvb3N0LQ== key2=b3V0CnNyYwpUZXN0aW5nCg=="
+OPTION(rgw_crypt_suppress_logs, OPT_BOOL) // suppress logs that might print customer key
+OPTION(rgw_list_bucket_min_readahead, OPT_INT) // minimum number of entries to read from rados for bucket listing
+
+OPTION(rgw_rest_getusage_op_compat, OPT_BOOL) // dump description of total stats for s3 GetUsage API
+
+OPTION(mutex_perf_counter, OPT_BOOL) // enable/disable mutex perf counter
+OPTION(throttler_perf_counter, OPT_BOOL) // enable/disable throttler perf counter
+
+/* The following are tunables for torrent data */
+OPTION(rgw_torrent_flag, OPT_BOOL) // produce torrent function flag
+OPTION(rgw_torrent_tracker, OPT_STR) // torrent field annouce and annouce list
+OPTION(rgw_torrent_createby, OPT_STR) // torrent field created by
+OPTION(rgw_torrent_comment, OPT_STR) // torrent field comment
+OPTION(rgw_torrent_encoding, OPT_STR) // torrent field encoding
+OPTION(rgw_torrent_origin, OPT_STR) // torrent origin
+OPTION(rgw_torrent_sha_unit, OPT_INT) // torrent field piece length 512K
+
+OPTION(event_tracing, OPT_BOOL) // true if LTTng-UST tracepoints should be enabled
+
+// This will be set to true when it is safe to start threads.
+// Once it is true, it will never change.
+OPTION(internal_safe_to_start_threads, OPT_BOOL)
+
+OPTION(debug_deliberately_leak_memory, OPT_BOOL)
+
+OPTION(rgw_swift_custom_header, OPT_STR) // option to enable swift custom headers
+
+OPTION(rgw_swift_need_stats, OPT_BOOL) // option to enable stats on bucket listing for swift
+
+/* resharding tunables */
+OPTION(rgw_reshard_num_logs, OPT_INT)
+OPTION(rgw_reshard_bucket_lock_duration, OPT_INT) // duration of lock on bucket obj during resharding
+OPTION(rgw_dynamic_resharding, OPT_BOOL)
+OPTION(rgw_max_objs_per_shard, OPT_INT)
+OPTION(rgw_reshard_thread_interval, OPT_U32) // maximum time between rounds of reshard thread processing
+
+OPTION(rgw_acl_grants_max_num, OPT_INT) // According to AWS S3(http://docs.aws.amazon.com/AmazonS3/latest/dev/acl-overview.html), An ACL can have up to 100 grants.
--- /dev/null
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#include "acconfig.h"
+#include "options.h"
+#include "common/Formatter.h"
+
+// Helpers for validators
+#include "include/stringify.h"
+#include <boost/algorithm/string.hpp>
+#include <boost/lexical_cast.hpp>
+#include <boost/regex.hpp>
+
+
+// Emit one option value `v` under the key `field_name` on formatter `f`,
+// choosing the Formatter call that matches this option's declared `type`.
+void Option::dump_value(const char *field_name,
+                        const Option::value_t &v, Formatter *f) const
+{
+  // A blank (unset) value is written as an empty string: it ought to be nil,
+  // but Formatter doesn't allow that.
+  const bool is_blank = boost::get<boost::blank>(&v) != nullptr;
+  if (is_blank) {
+    f->dump_string(field_name, "");
+    return;
+  }
+
+  switch (type) {
+  case TYPE_UINT:
+    f->dump_unsigned(field_name, boost::get<uint64_t>(v));
+    break;
+  case TYPE_INT:
+    f->dump_int(field_name, boost::get<int64_t>(v));
+    break;
+  case TYPE_STR:
+    f->dump_string(field_name, boost::get<std::string>(v));
+    break;
+  case TYPE_FLOAT:
+    f->dump_float(field_name, boost::get<double>(v));
+    break;
+  case TYPE_BOOL:
+    f->dump_bool(field_name, boost::get<bool>(v));
+    break;
+  default:
+    // Every other type is written through the value's stream operator.
+    f->dump_stream(field_name) << v;
+    break;
+  }
+}
+
+// Run the option's `validator` hook, when one is set, on a proposed value.
+// Returns whatever the hook returns, or 0 when no hook is set.
+int Option::pre_validate(std::string *new_value, std::string *err) const
+{
+  if (!validator) {
+    return 0;
+  }
+  return validator(new_value, err);
+}
+
+// Apply the generic checks (minimum, maximum, permitted string values) to a
+// proposed value.  Returns 0 when every applicable check passes; otherwise
+// writes a message into *err and returns -EINVAL.
+int Option::validate(const Option::value_t &new_value, std::string *err) const
+{
+  // Lower bound: skipped while `min` is still blank.
+  const bool has_min = boost::get<boost::blank>(&min) == nullptr;
+  if (has_min && new_value < min) {
+    std::ostringstream msg;
+    msg << "Value '" << new_value << "' is below minimum " << min;
+    *err = msg.str();
+    return -EINVAL;
+  }
+
+  // Upper bound: skipped while `max` is still blank.
+  const bool has_max = boost::get<boost::blank>(&max) == nullptr;
+  if (has_max && new_value > max) {
+    std::ostringstream msg;
+    msg << "Value '" << new_value << "' exceeds maximum " << max;
+    *err = msg.str();
+    return -EINVAL;
+  }
+
+  // Permitted values: only checked for string options that list some.
+  if (type == Option::TYPE_STR && !enum_allowed.empty()) {
+    const std::string &candidate = boost::get<std::string>(new_value);
+    const bool permitted =
+      std::find(enum_allowed.begin(), enum_allowed.end(), candidate)
+        != enum_allowed.end();
+    if (!permitted) {
+      std::ostringstream msg;
+      msg << "'" << new_value << "' is not one of the permitted values: "
+          << joinify(enum_allowed.begin(), enum_allowed.end(),
+                     std::string(", "));
+      *err = msg.str();
+      return -EINVAL;
+    }
+  }
+
+  return 0;
+}
+
+// Write this option's metadata to formatter `f` as a single "option" object
+// section: name, type, level, descriptions, both default values, the tag /
+// service / see-also lists, the permitted values (string options only) and
+// the min/max bounds.
+void Option::dump(Formatter *f) const
+{
+  f->open_object_section("option");
+  f->dump_string("name", name);
+
+  f->dump_string("type", type_to_str(type));
+  f->dump_string("level", level_to_str(level));
+
+  f->dump_string("desc", desc);
+  f->dump_string("long_desc", long_desc);
+
+  dump_value("default", value, f);
+  dump_value("daemon_default", daemon_value, f);
+
+  // Each list is walked by const reference so no element is copied.
+  f->open_array_section("tags");
+  for (const auto &t : tags) {
+    f->dump_string("tag", t);
+  }
+  f->close_section();
+
+  f->open_array_section("services");
+  for (const auto &s : services) {
+    f->dump_string("service", s);
+  }
+  f->close_section();
+
+  f->open_array_section("see_also");
+  for (const auto &sa : see_also) {
+    f->dump_string("see_also", sa);
+  }
+  f->close_section();
+
+  // The "enum_values" array is emitted for string options only (and may be
+  // empty).
+  if (type == TYPE_STR) {
+    f->open_array_section("enum_values");
+    for (const auto &ea : enum_allowed) {
+      f->dump_string("enum_value", ea);
+    }
+    f->close_section();
+  }
+
+  dump_value("min", min, f);
+  dump_value("max", max, f);
+
+  f->close_section();
+}
+
+
+std::vector<Option> global_options = {
+ Option("host", Option::TYPE_STR, Option::LEVEL_BASIC)
+ .set_description("local hostname")
+ .set_long_description("if blank, ceph assumes the short hostname (hostname -s)")
+ .add_service("common")
+ .add_tag("network"),
+
+ Option("fsid", Option::TYPE_UUID, Option::LEVEL_BASIC)
+ .set_description("cluster fsid (uuid)")
+ .add_service("common")
+ .add_tag("service"),
+
+ Option("public_addr", Option::TYPE_ADDR, Option::LEVEL_BASIC)
+ .set_description("public-facing address to bind to")
+ .add_service({"mon", "mds", "osd", "mgr"}),
+
+ Option("public_bind_addr", Option::TYPE_ADDR, Option::LEVEL_ADVANCED)
+ .set_default(entity_addr_t())
+ .add_service("mon")
+ .set_description(""),
+
+ Option("cluster_addr", Option::TYPE_ADDR, Option::LEVEL_BASIC)
+ .set_description("cluster-facing address to bind to")
+ .add_service("osd")
+ .add_tag("network"),
+
+ Option("public_network", Option::TYPE_STR, Option::LEVEL_ADVANCED)
+ .add_service({"mon", "mds", "osd", "mgr"})
+ .add_tag("network")
+ .set_description(""),
+
+ Option("cluster_network", Option::TYPE_STR, Option::LEVEL_ADVANCED)
+ .add_service("osd")
+ .add_tag("network")
+ .set_description(""),
+
+ Option("monmap", Option::TYPE_STR, Option::LEVEL_ADVANCED)
+ .set_description("path to MonMap file")
+ .set_long_description("This option is normally used during mkfs, but can also "
+ "be used to identify which monitors to connect to.")
+ .add_service("mon")
+ .add_tag("mkfs"),
+
+ Option("mon_host", Option::TYPE_STR, Option::LEVEL_BASIC)
+ .set_description("list of hosts or addresses to search for a monitor")
+ .set_long_description("This is a comma, whitespace, or semicolon separated "
+ "list of IP addresses or hostnames. Hostnames are "
+ "resolved via DNS and all A or AAAA records are "
+ "included in the search list.")
+ .add_service("common"),
+
+ Option("mon_dns_srv_name", Option::TYPE_STR, Option::LEVEL_ADVANCED)
+ .set_description("name of DNS SRV record to check for monitor addresses")
+ .add_service("common")
+ .add_tag("network")
+ .add_see_also("mon_host"),
+
+ // lockdep
+ Option("lockdep", Option::TYPE_BOOL, Option::LEVEL_DEV)
+ .set_description("enable lockdep lock dependency analyzer")
+ .add_service("common"),
+
+ Option("lockdep_force_backtrace", Option::TYPE_BOOL, Option::LEVEL_DEV)
+ .set_description("always gather current backtrace at every lock")
+ .add_service("common")
+ .add_see_also("lockdep"),
+
+ Option("run_dir", Option::TYPE_STR, Option::LEVEL_ADVANCED)
+ .set_default("/var/run/ceph")
+ .set_description("path for the 'run' directory for storing pid and socket files")
+ .add_service("common")
+ .add_see_also("admin_socket"),
+
+ Option("admin_socket", Option::TYPE_STR, Option::LEVEL_ADVANCED)
+ .set_default("")
+ .set_daemon_default("$run_dir/$cluster-$name.asok")
+ .set_description("path for the runtime control socket file, used by the 'ceph daemon' command")
+ .add_service("common"),
+
+ Option("admin_socket_mode", Option::TYPE_STR, Option::LEVEL_ADVANCED)
+ .set_description("file mode to set for the admin socket file, e.g, '0755'")
+ .add_service("common")
+ .add_see_also("admin_socket"),
+
+ Option("crushtool", Option::TYPE_STR, Option::LEVEL_ADVANCED)
+ .set_description("name of the 'crushtool' utility")
+ .add_service("mon"),
+
+ // daemon
+ Option("daemonize", Option::TYPE_BOOL, Option::LEVEL_ADVANCED)
+ .set_default(false)
+ .set_daemon_default(true)
+ .set_description("whether to daemonize (background) after startup")
+ .add_service({"mon", "mgr", "osd", "mds"})
+ .add_tag("service")
+ .add_see_also({"pid_file", "chdir"}),
+
+ Option("setuser", Option::TYPE_STR, Option::LEVEL_ADVANCED)
+ .set_description("uid or user name to switch to on startup")
+ .set_long_description("This is normally specified by the systemd unit file.")
+ .add_service({"mon", "mgr", "osd", "mds"})
+ .add_tag("service")
+ .add_see_also("setgroup"),
+
+ Option("setgroup", Option::TYPE_STR, Option::LEVEL_ADVANCED)
+ .set_description("gid or group name to switch to on startup")
+ .set_long_description("This is normally specified by the systemd unit file.")
+ .add_service({"mon", "mgr", "osd", "mds"})
+ .add_tag("service")
+ .add_see_also("setuser"),
+
+ Option("setuser_match_path", Option::TYPE_STR, Option::LEVEL_ADVANCED)
+ .set_description("if set, setuser/setgroup is condition on this path matching ownership")
+ .set_long_description("If setuser or setgroup are specified, and this option is non-empty, then the uid/gid of the daemon will only be changed if the file or directory specified by this option has a matching uid and/or gid. This exists primarily to allow switching to user ceph for OSDs to be conditional on whether the osd data contents have also been chowned after an upgrade. This is normally specified by the systemd unit file.")
+ .add_service({"mon", "mgr", "osd", "mds"})
+ .add_tag("service")
+ .add_see_also({"setuser", "setgroup"}),
+
+ Option("pid_file", Option::TYPE_STR, Option::LEVEL_ADVANCED)
+ .set_description("path to write a pid file (if any)")
+ .add_service({"mon", "mgr", "osd", "mds"})
+ .add_tag("service"),
+
+ Option("chdir", Option::TYPE_STR, Option::LEVEL_ADVANCED)
+ .set_description("path to chdir(2) to after daemonizing")
+ .add_service({"mon", "mgr", "osd", "mds"})
+ .add_tag("service")
+ .add_see_also("daemonize"),
+
+ Option("fatal_signal_handlers", Option::TYPE_BOOL, Option::LEVEL_ADVANCED)
+ .set_default(true)
+ .set_description("whether to register signal handlers for SIGABRT etc that dump a stack trace")
+ .set_long_description("This is normally true for daemons and values for libraries.")
+ .add_service({"mon", "mgr", "osd", "mds"})
+ .add_tag("service"),
+
+ // restapi
+ Option("restapi_log_level", Option::TYPE_STR, Option::LEVEL_ADVANCED)
+ .set_description("default set by python code"),
+
+ Option("restapi_base_url", Option::TYPE_STR, Option::LEVEL_ADVANCED)
+ .set_description("default set by python code"),
+
+ Option("erasure_code_dir", Option::TYPE_STR, Option::LEVEL_ADVANCED)
+ .set_default(CEPH_PKGLIBDIR"/erasure-code")
+ .set_description("directory where erasure-code plugins can be found")
+ .add_service({"mon", "osd"})
+ .set_safe(),
+
+ // logging
+ Option("log_file", Option::TYPE_STR, Option::LEVEL_BASIC)
+ .set_default("")
+ .set_daemon_default("/var/log/ceph/$cluster-$name.log")
+ .set_description("path to log file")
+ .add_see_also({"log_to_stderr",
+ "err_to_stderr",
+ "log_to_syslog",
+ "err_to_syslog"}),
+
+ Option("log_max_new", Option::TYPE_INT, Option::LEVEL_ADVANCED)
+ .set_default(1000)
+ .set_description("max unwritten log entries to allow before waiting to flush to the log")
+ .add_see_also("log_max_recent"),
+
+ Option("log_max_recent", Option::TYPE_INT, Option::LEVEL_ADVANCED)
+ .set_default(500)
+ .set_daemon_default(10000)
+ .set_description("recent log entries to keep in memory to dump in the event of a crash")
+ .set_long_description("The purpose of this option is to log at a higher debug level only to the in-memory buffer, and write out the detailed log messages only if there is a crash. Only log entries below the lower log level will be written unconditionally to the log. For example, debug_osd=1/5 will write everything <= 1 to the log unconditionally but keep entries at levels 2-5 in memory. If there is a seg fault or assertion failure, all entries will be dumped to the log."),
+
+ Option("log_to_stderr", Option::TYPE_BOOL, Option::LEVEL_BASIC)
+ .set_default(true)
+ .set_daemon_default(false)
+ .set_description("send log lines to stderr"),
+
+ Option("err_to_stderr", Option::TYPE_BOOL, Option::LEVEL_BASIC)
+ .set_default(false)
+ .set_daemon_default(true)
+ .set_description("send critical error log lines to stderr"),
+
+ Option("log_to_syslog", Option::TYPE_BOOL, Option::LEVEL_BASIC)
+ .set_default(false)
+ .set_description("send log lines to syslog facility"),
+
+ Option("err_to_syslog", Option::TYPE_BOOL, Option::LEVEL_BASIC)
+ .set_default(false)
+ .set_description("send critical error log lines to syslog facility"),
+
+ Option("log_flush_on_exit", Option::TYPE_BOOL, Option::LEVEL_ADVANCED)
+ .set_default(false)
+ .set_description("set a process exit handler to ensure the log is flushed on exit"),
+
+ Option("log_stop_at_utilization", Option::TYPE_FLOAT, Option::LEVEL_BASIC)
+ .set_default(.97)
+ .set_min_max(0.0, 1.0)
+ .set_description("stop writing to the log file when device utilization reaches this ratio")
+ .add_see_also("log_file"),
+
+ Option("log_to_graylog", Option::TYPE_BOOL, Option::LEVEL_BASIC)
+ .set_default(false)
+ .set_description("send log lines to remote graylog server")
+ .add_see_also({"err_to_graylog",
+ "log_graylog_host",
+ "log_graylog_port"}),
+
+ Option("err_to_graylog", Option::TYPE_BOOL, Option::LEVEL_BASIC)
+ .set_default(false)
+ .set_description("send critical error log lines to remote graylog server")
+ .add_see_also({"log_to_graylog",
+ "log_graylog_host",
+ "log_graylog_port"}),
+
+ Option("log_graylog_host", Option::TYPE_STR, Option::LEVEL_BASIC)
+ .set_default("127.0.0.1")
+ .set_description("address or hostname of graylog server to log to")
+ .add_see_also({"log_to_graylog",
+ "err_to_graylog",
+ "log_graylog_port"}),
+
+ Option("log_graylog_port", Option::TYPE_INT, Option::LEVEL_BASIC)
+ .set_default(12201)
+ .set_description("port number for the remote graylog server")
+ .add_see_also("log_graylog_host"),
+
+
+
+ // unmodified
+ Option("clog_to_monitors", Option::TYPE_STR, Option::LEVEL_ADVANCED)
+ .set_default("default=true")
+ .set_description(""),
+
+ Option("clog_to_syslog", Option::TYPE_STR, Option::LEVEL_ADVANCED)
+ .set_default("false")
+ .set_description(""),
+
+ Option("clog_to_syslog_level", Option::TYPE_STR, Option::LEVEL_ADVANCED)
+ .set_default("info")
+ .set_description(""),
+
+ Option("clog_to_syslog_facility", Option::TYPE_STR, Option::LEVEL_ADVANCED)
+ .set_default("default=daemon audit=local0")
+ .set_description(""),
+
+ Option("clog_to_graylog", Option::TYPE_STR, Option::LEVEL_ADVANCED)
+ .set_default("false")
+ .set_description(""),
+
+ Option("clog_to_graylog_host", Option::TYPE_STR, Option::LEVEL_ADVANCED)
+ .set_default("127.0.0.1")
+ .set_description(""),
+
+ Option("clog_to_graylog_port", Option::TYPE_STR, Option::LEVEL_ADVANCED)
+ .set_default("12201")
+ .set_description(""),
+
+ Option("mon_cluster_log_to_syslog", Option::TYPE_STR, Option::LEVEL_ADVANCED)
+ .set_default("default=false")
+ .set_description(""),
+
+ Option("mon_cluster_log_to_syslog_level", Option::TYPE_STR, Option::LEVEL_ADVANCED)
+ .set_default("info")
+ .set_description(""),
+
+ Option("mon_cluster_log_to_syslog_facility", Option::TYPE_STR, Option::LEVEL_ADVANCED)
+ .set_default("daemon")
+ .set_description(""),
+
+ Option("mon_cluster_log_file", Option::TYPE_STR, Option::LEVEL_ADVANCED)
+ .set_default("default=/var/log/ceph/$cluster.$channel.log cluster=/var/log/ceph/$cluster.log")
+ .set_description(""),
+
+ Option("mon_cluster_log_file_level", Option::TYPE_STR, Option::LEVEL_ADVANCED)
+ .set_default("info")
+ .set_description(""),
+
+ Option("mon_cluster_log_to_graylog", Option::TYPE_STR, Option::LEVEL_ADVANCED)
+ .set_default("false")
+ .set_description(""),
+
+ Option("mon_cluster_log_to_graylog_host", Option::TYPE_STR, Option::LEVEL_ADVANCED)
+ .set_default("127.0.0.1")
+ .set_description(""),
+
+ Option("mon_cluster_log_to_graylog_port", Option::TYPE_STR, Option::LEVEL_ADVANCED)
+ .set_default("12201")
+ .set_description(""),
+
+ Option("enable_experimental_unrecoverable_data_corrupting_features", Option::TYPE_STR, Option::LEVEL_ADVANCED)
+ .set_default("")
+ .set_description(""),
+
+ Option("plugin_dir", Option::TYPE_STR, Option::LEVEL_ADVANCED)
+ .set_default(CEPH_PKGLIBDIR)
+ .set_description("")
+ .set_safe(),
+
+ Option("xio_trace_mempool", Option::TYPE_BOOL, Option::LEVEL_ADVANCED)
+ .set_default(false)
+ .set_description(""),
+
+ Option("xio_trace_msgcnt", Option::TYPE_BOOL, Option::LEVEL_ADVANCED)
+ .set_default(false)
+ .set_description(""),
+
+ Option("xio_trace_xcon", Option::TYPE_BOOL, Option::LEVEL_ADVANCED)
+ .set_default(false)
+ .set_description(""),
+
+ Option("xio_queue_depth", Option::TYPE_INT, Option::LEVEL_ADVANCED)
+ .set_default(128)
+ .set_description(""),
+
+ Option("xio_mp_min", Option::TYPE_INT, Option::LEVEL_ADVANCED)
+ .set_default(128)
+ .set_description(""),
+
+ Option("xio_mp_max_64", Option::TYPE_INT, Option::LEVEL_ADVANCED)
+ .set_default(65536)
+ .set_description(""),
+
+ Option("xio_mp_max_256", Option::TYPE_INT, Option::LEVEL_ADVANCED)
+ .set_default(8192)
+ .set_description(""),
+
+ Option("xio_mp_max_1k", Option::TYPE_INT, Option::LEVEL_ADVANCED)
+ .set_default(8192)
+ .set_description(""),
+
+ Option("xio_mp_max_page", Option::TYPE_INT, Option::LEVEL_ADVANCED)
+ .set_default(4096)
+ .set_description(""),
+
+ Option("xio_mp_max_hint", Option::TYPE_INT, Option::LEVEL_ADVANCED)
+ .set_default(4096)
+ .set_description(""),
+
+ Option("xio_portal_threads", Option::TYPE_INT, Option::LEVEL_ADVANCED)
+ .set_default(2)
+ .set_description(""),
+
+ Option("xio_max_conns_per_portal", Option::TYPE_INT, Option::LEVEL_ADVANCED)
+ .set_default(32)
+ .set_description(""),
+
+ Option("xio_transport_type", Option::TYPE_STR, Option::LEVEL_ADVANCED)
+ .set_default("rdma")
+ .set_description(""),
+
+ Option("xio_max_send_inline", Option::TYPE_INT, Option::LEVEL_ADVANCED)
+ .set_default(512)
+ .set_description(""),
+
+ Option("compressor_zlib_isal", Option::TYPE_BOOL, Option::LEVEL_ADVANCED)
+ .set_default(false)
+ .set_description(""),
+
+ Option("compressor_zlib_level", Option::TYPE_INT, Option::LEVEL_ADVANCED)
+ .set_default(5)
+ .set_description(""),
+
+ Option("async_compressor_enabled", Option::TYPE_BOOL, Option::LEVEL_ADVANCED)
+ .set_default(false)
+ .set_description(""),
+
+ Option("async_compressor_type", Option::TYPE_STR, Option::LEVEL_ADVANCED)
+ .set_default("snappy")
+ .set_description(""),
+
+ Option("async_compressor_threads", Option::TYPE_INT, Option::LEVEL_ADVANCED)
+ .set_default(2)
+ .set_description(""),
+
+ Option("async_compressor_thread_timeout", Option::TYPE_INT, Option::LEVEL_ADVANCED)
+ .set_default(5)
+ .set_description(""),
+
+ Option("async_compressor_thread_suicide_timeout", Option::TYPE_INT, Option::LEVEL_ADVANCED)
+ .set_default(30)
+ .set_description(""),
+
+ Option("plugin_crypto_accelerator", Option::TYPE_STR, Option::LEVEL_ADVANCED)
+ .set_default("crypto_isal")
+ .set_description(""),
+
+ Option("mempool_debug", Option::TYPE_BOOL, Option::LEVEL_ADVANCED)
+ .set_default(false)
+ .set_description(""),
+
+ Option("key", Option::TYPE_STR, Option::LEVEL_ADVANCED)
+ .set_default("")
+ .set_description(""),
+
+ Option("keyfile", Option::TYPE_STR, Option::LEVEL_ADVANCED)
+ .set_default("")
+ .set_description(""),
+
+ Option("keyring", Option::TYPE_STR, Option::LEVEL_ADVANCED)
+ .set_default(
+ // Comma-separated list of keyring paths, assembled from adjacent string
+ // literals.
+ "/etc/ceph/$cluster.$name.keyring,/etc/ceph/$cluster.keyring,"
+ "/etc/ceph/keyring,/etc/ceph/keyring.bin,"
+ // The FreeBSD macro is spelled __FreeBSD__ (the ms_bind_retry_* entries in
+ // this table test the same name); the unsuffixed __FreeBSD is never
+ // defined, so testing it would always drop the paths below.
+#if defined(__FreeBSD__)
+ "/usr/local/etc/ceph/$cluster.$name.keyring,"
+ "/usr/local/etc/ceph/$cluster.keyring,"
+ "/usr/local/etc/ceph/keyring,/usr/local/etc/ceph/keyring.bin,"
+#endif
+ )
+ .set_description(""),
+
+ Option("heartbeat_interval", Option::TYPE_INT, Option::LEVEL_ADVANCED)
+ .set_default(5)
+ .set_description(""),
+
+ Option("heartbeat_file", Option::TYPE_STR, Option::LEVEL_ADVANCED)
+ .set_default("")
+ .set_description(""),
+
+ Option("heartbeat_inject_failure", Option::TYPE_INT, Option::LEVEL_DEV)
+ .set_default(0)
+ .set_description(""),
+
+ Option("perf", Option::TYPE_BOOL, Option::LEVEL_ADVANCED)
+ .set_default(true)
+ .set_description(""),
+
+ Option("ms_type", Option::TYPE_STR, Option::LEVEL_ADVANCED)
+ .set_default("async+posix")
+ .set_description("")
+ .set_safe(),
+
+ Option("ms_public_type", Option::TYPE_STR, Option::LEVEL_ADVANCED)
+ .set_default("")
+ .set_description(""),
+
+ Option("ms_cluster_type", Option::TYPE_STR, Option::LEVEL_ADVANCED)
+ .set_default("")
+ .set_description(""),
+
+ Option("ms_tcp_nodelay", Option::TYPE_BOOL, Option::LEVEL_ADVANCED)
+ .set_default(true)
+ .set_description(""),
+
+ Option("ms_tcp_rcvbuf", Option::TYPE_INT, Option::LEVEL_ADVANCED)
+ .set_default(0)
+ .set_description(""),
+
+ Option("ms_tcp_prefetch_max_size", Option::TYPE_INT, Option::LEVEL_ADVANCED)
+ .set_default(4096)
+ .set_description(""),
+
+ Option("ms_initial_backoff", Option::TYPE_FLOAT, Option::LEVEL_ADVANCED)
+ .set_default(.2)
+ .set_description(""),
+
+ Option("ms_max_backoff", Option::TYPE_FLOAT, Option::LEVEL_ADVANCED)
+ .set_default(15.0)
+ .set_description(""),
+
+ Option("ms_crc_data", Option::TYPE_BOOL, Option::LEVEL_ADVANCED)
+ .set_default(true)
+ .set_description(""),
+
+ Option("ms_crc_header", Option::TYPE_BOOL, Option::LEVEL_ADVANCED)
+ .set_default(true)
+ .set_description(""),
+
+ Option("ms_die_on_bad_msg", Option::TYPE_BOOL, Option::LEVEL_ADVANCED)
+ .set_default(false)
+ .set_description(""),
+
+ Option("ms_die_on_unhandled_msg", Option::TYPE_BOOL, Option::LEVEL_ADVANCED)
+ .set_default(false)
+ .set_description(""),
+
+ Option("ms_die_on_old_message", Option::TYPE_BOOL, Option::LEVEL_ADVANCED)
+ .set_default(false)
+ .set_description(""),
+
+ Option("ms_die_on_skipped_message", Option::TYPE_BOOL, Option::LEVEL_ADVANCED)
+ .set_default(false)
+ .set_description(""),
+
+ Option("ms_dispatch_throttle_bytes", Option::TYPE_UINT, Option::LEVEL_ADVANCED)
+ .set_default(100 << 20)
+ .set_description(""),
+
+ Option("ms_bind_ipv6", Option::TYPE_BOOL, Option::LEVEL_ADVANCED)
+ .set_default(false)
+ .set_description(""),
+
+ Option("ms_bind_port_min", Option::TYPE_INT, Option::LEVEL_ADVANCED)
+ .set_default(6800)
+ .set_description(""),
+
+ Option("ms_bind_port_max", Option::TYPE_INT, Option::LEVEL_ADVANCED)
+ .set_default(7300)
+ .set_description(""),
+
+ Option("ms_bind_retry_count", Option::TYPE_INT, Option::LEVEL_ADVANCED)
+ // The default is platform-dependent: 3, or 6 when built for FreeBSD.
+#if !defined(__FreeBSD__)
+ .set_default(3)
+#else
+ // FreeBSD does not use SO_REUSEADDR so allow for a bit more time per default
+ .set_default(6)
+#endif
+ .set_description(""),
+
+ Option("ms_bind_retry_delay", Option::TYPE_INT, Option::LEVEL_ADVANCED)
+ // The default is platform-dependent: 5, or 6 when built for FreeBSD.
+#if !defined(__FreeBSD__)
+ .set_default(5)
+#else
+ // FreeBSD does not use SO_REUSEADDR so allow for a bit more time per default
+ .set_default(6)
+#endif
+ .set_description(""),
+
+ Option("ms_bind_before_connect", Option::TYPE_BOOL, Option::LEVEL_ADVANCED)
+ .set_default(false)
+ .set_description(""),
+
+ Option("ms_tcp_listen_backlog", Option::TYPE_INT, Option::LEVEL_ADVANCED)
+ .set_default(512)
+ .set_description(""),
+
+ Option("ms_rwthread_stack_bytes", Option::TYPE_UINT, Option::LEVEL_ADVANCED)
+ .set_default(1024 << 10)
+ .set_description(""),
+
+ Option("ms_tcp_read_timeout", Option::TYPE_UINT, Option::LEVEL_ADVANCED)
+ .set_default(900)
+ .set_description(""),
+
+ Option("ms_pq_max_tokens_per_priority", Option::TYPE_UINT, Option::LEVEL_ADVANCED)
+ .set_default(16777216)
+ .set_description(""),
+
+ Option("ms_pq_min_cost", Option::TYPE_UINT, Option::LEVEL_ADVANCED)
+ .set_default(65536)
+ .set_description(""),
+
+ Option("ms_inject_socket_failures", Option::TYPE_UINT, Option::LEVEL_DEV)
+ .set_default(0)
+ .set_description(""),
+
+ Option("ms_inject_delay_type", Option::TYPE_STR, Option::LEVEL_DEV)
+ .set_default("")
+ .set_description("")
+ .set_safe(),
+
+ Option("ms_inject_delay_msg_type", Option::TYPE_STR, Option::LEVEL_DEV)
+ .set_default("")
+ .set_description(""),
+
+ Option("ms_inject_delay_max", Option::TYPE_FLOAT, Option::LEVEL_DEV)
+ .set_default(1)
+ .set_description(""),
+
+ Option("ms_inject_delay_probability", Option::TYPE_FLOAT, Option::LEVEL_DEV)
+ .set_default(0)
+ .set_description(""),
+
+ Option("ms_inject_internal_delays", Option::TYPE_FLOAT, Option::LEVEL_DEV)
+ .set_default(0)
+ .set_description(""),
+
+ Option("ms_dump_on_send", Option::TYPE_BOOL, Option::LEVEL_ADVANCED)
+ .set_default(false)
+ .set_description(""),
+
+ Option("ms_dump_corrupt_message_level", Option::TYPE_INT, Option::LEVEL_ADVANCED)
+ .set_default(1)
+ .set_description(""),
+
+ Option("ms_async_op_threads", Option::TYPE_UINT, Option::LEVEL_ADVANCED)
+ .set_default(3)
+ .set_description(""),
+
+ Option("ms_async_max_op_threads", Option::TYPE_UINT, Option::LEVEL_ADVANCED)
+ .set_default(5)
+ .set_description(""),
+
+ Option("ms_async_set_affinity", Option::TYPE_BOOL, Option::LEVEL_ADVANCED)
+ .set_default(true)
+ .set_description(""),
+
+ Option("ms_async_affinity_cores", Option::TYPE_STR, Option::LEVEL_ADVANCED)
+ .set_default("")
+ .set_description(""),
+
+ Option("ms_async_rdma_device_name", Option::TYPE_STR, Option::LEVEL_ADVANCED)
+ .set_default("")
+ .set_description(""),
+
+ Option("ms_async_rdma_enable_hugepage", Option::TYPE_BOOL, Option::LEVEL_ADVANCED)
+ .set_default(false)
+ .set_description(""),
+
+ Option("ms_async_rdma_buffer_size", Option::TYPE_INT, Option::LEVEL_ADVANCED)
+ .set_default(128 << 10)
+ .set_description(""),
+
+ Option("ms_async_rdma_send_buffers", Option::TYPE_UINT, Option::LEVEL_ADVANCED)
+ .set_default(1024)
+ .set_description(""),
+
+ Option("ms_async_rdma_receive_buffers", Option::TYPE_UINT, Option::LEVEL_ADVANCED)
+ .set_default(1024)
+ .set_description(""),
+
+ Option("ms_async_rdma_port_num", Option::TYPE_UINT, Option::LEVEL_ADVANCED)
+ .set_default(1)
+ .set_description(""),
+
+ Option("ms_async_rdma_polling_us", Option::TYPE_UINT, Option::LEVEL_ADVANCED)
+ .set_default(1000)
+ .set_description(""),
+
+ Option("ms_async_rdma_local_gid", Option::TYPE_STR, Option::LEVEL_ADVANCED)
+ .set_default("")
+ .set_description(""),
+
+ Option("ms_async_rdma_roce_ver", Option::TYPE_INT, Option::LEVEL_ADVANCED)
+ .set_default(1)
+ .set_description(""),
+
+ Option("ms_async_rdma_sl", Option::TYPE_INT, Option::LEVEL_ADVANCED)
+ .set_default(3)
+ .set_description(""),
+
+ Option("ms_async_rdma_dscp", Option::TYPE_INT, Option::LEVEL_ADVANCED)
+ .set_default(96)
+ .set_description(""),
+
+ Option("ms_dpdk_port_id", Option::TYPE_INT, Option::LEVEL_ADVANCED)
+ .set_default(0)
+ .set_description(""),
+
+ Option("ms_dpdk_coremask", Option::TYPE_STR, Option::LEVEL_ADVANCED)
+ .set_default("1")
+ .set_description("")
+ .set_safe(),
+
+ Option("ms_dpdk_memory_channel", Option::TYPE_STR, Option::LEVEL_ADVANCED)
+ .set_default("4")
+ .set_description(""),
+
+ Option("ms_dpdk_hugepages", Option::TYPE_STR, Option::LEVEL_ADVANCED)
+ .set_default("")
+ .set_description(""),
+
+ Option("ms_dpdk_pmd", Option::TYPE_STR, Option::LEVEL_ADVANCED)
+ .set_default("")
+ .set_description(""),
+
+ Option("ms_dpdk_host_ipv4_addr", Option::TYPE_STR, Option::LEVEL_ADVANCED)
+ .set_default("")
+ .set_description("")
+ .set_safe(),
+
+ Option("ms_dpdk_gateway_ipv4_addr", Option::TYPE_STR, Option::LEVEL_ADVANCED)
+ .set_default("")
+ .set_description("")
+ .set_safe(),
+
+ Option("ms_dpdk_netmask_ipv4_addr", Option::TYPE_STR, Option::LEVEL_ADVANCED)
+ .set_default("")
+ .set_description("")
+ .set_safe(),
+
+ Option("ms_dpdk_lro", Option::TYPE_BOOL, Option::LEVEL_ADVANCED)
+ .set_default(true)
+ .set_description(""),
+
+ Option("ms_dpdk_hw_flow_control", Option::TYPE_BOOL, Option::LEVEL_ADVANCED)
+ .set_default(true)
+ .set_description(""),
+
+ Option("ms_dpdk_hw_queue_weight", Option::TYPE_FLOAT, Option::LEVEL_ADVANCED)
+ .set_default(1)
+ .set_description(""),
+
+ Option("ms_dpdk_debug_allow_loopback", Option::TYPE_BOOL, Option::LEVEL_DEV)
+ .set_default(false)
+ .set_description(""),
+
+ Option("ms_dpdk_rx_buffer_count_per_core", Option::TYPE_INT, Option::LEVEL_ADVANCED)
+ .set_default(8192)
+ .set_description(""),
+
+ Option("inject_early_sigterm", Option::TYPE_BOOL, Option::LEVEL_DEV)
+ .set_default(false)
+ .set_description(""),
+
+ Option("mon_data", Option::TYPE_STR, Option::LEVEL_ADVANCED)
+ .set_default("/var/lib/ceph/mon/$cluster-$id")
+ .set_description(""),
+
+ Option("mon_initial_members", Option::TYPE_STR, Option::LEVEL_ADVANCED)
+ .set_default("")
+ .set_description(""),
+
+ Option("mon_compact_on_start", Option::TYPE_BOOL, Option::LEVEL_ADVANCED)
+ .set_default(false)
+ .set_description(""),
+
+ Option("mon_compact_on_bootstrap", Option::TYPE_BOOL, Option::LEVEL_ADVANCED)
+ .set_default(false)
+ .set_description(""),
+
+ Option("mon_compact_on_trim", Option::TYPE_BOOL, Option::LEVEL_ADVANCED)
+ .set_default(true)
+ .set_description(""),
+
+ Option("mon_osd_cache_size", Option::TYPE_INT, Option::LEVEL_ADVANCED)
+ .set_default(10)
+ .set_description(""),
+
+ Option("mon_cpu_threads", Option::TYPE_INT, Option::LEVEL_ADVANCED)
+ .set_default(4)
+ .set_description(""),
+
+ Option("mon_osd_mapping_pgs_per_chunk", Option::TYPE_INT, Option::LEVEL_ADVANCED)
+ .set_default(4096)
+ .set_description(""),
+
+ Option("mon_osd_max_creating_pgs", Option::TYPE_INT, Option::LEVEL_ADVANCED)
+ .set_default(1024)
+ .set_description(""),
+
+ Option("mon_tick_interval", Option::TYPE_INT, Option::LEVEL_ADVANCED)
+ .set_default(5)
+ .set_description(""),
+
+ Option("mon_session_timeout", Option::TYPE_INT, Option::LEVEL_ADVANCED)
+ .set_default(300)
+ .set_description(""),
+
+ Option("mon_subscribe_interval", Option::TYPE_FLOAT, Option::LEVEL_ADVANCED)
+ .set_default(24*3600) // evaluates to 86400; NOTE(review): presumably seconds (one day) - confirm unit
+ .set_description(""),
+
+ Option("mon_delta_reset_interval", Option::TYPE_FLOAT, Option::LEVEL_ADVANCED)
+ .set_default(10)
+ .set_description(""),
+
+ Option("mon_osd_laggy_halflife", Option::TYPE_INT, Option::LEVEL_ADVANCED)
+ .set_default(60*60) // evaluates to 3600
+ .set_description(""),
+
+ Option("mon_osd_laggy_weight", Option::TYPE_FLOAT, Option::LEVEL_ADVANCED)
+ .set_default(.3)
+ .set_description(""),
+
+ Option("mon_osd_laggy_max_interval", Option::TYPE_INT, Option::LEVEL_ADVANCED)
+ .set_default(300)
+ .set_description(""),
+
+ Option("mon_osd_adjust_heartbeat_grace", Option::TYPE_BOOL, Option::LEVEL_ADVANCED)
+ .set_default(true)
+ .set_description(""),
+
+ Option("mon_osd_adjust_down_out_interval", Option::TYPE_BOOL, Option::LEVEL_ADVANCED)
+ .set_default(true)
+ .set_description(""),
+
+ Option("mon_osd_auto_mark_in", Option::TYPE_BOOL, Option::LEVEL_ADVANCED)
+ .set_default(false)
+ .set_description(""),
+
+ Option("mon_osd_auto_mark_auto_out_in", Option::TYPE_BOOL, Option::LEVEL_ADVANCED)
+ .set_default(true)
+ .set_description(""),
+
+ Option("mon_osd_auto_mark_new_in", Option::TYPE_BOOL, Option::LEVEL_ADVANCED)
+ .set_default(true)
+ .set_description(""),
+
+ Option("mon_osd_destroyed_out_interval", Option::TYPE_INT, Option::LEVEL_ADVANCED)
+ .set_default(600)
+ .set_description(""),
+
+ Option("mon_osd_down_out_interval", Option::TYPE_INT, Option::LEVEL_ADVANCED)
+ .set_default(600)
+ .set_description(""),
+
+ Option("mon_osd_down_out_subtree_limit", Option::TYPE_STR, Option::LEVEL_ADVANCED)
+ .set_default("rack")
+ .set_description(""),
+
+ Option("mon_osd_min_up_ratio", Option::TYPE_FLOAT, Option::LEVEL_ADVANCED)
+ .set_default(.3)
+ .set_description(""),
+
+ Option("mon_osd_min_in_ratio", Option::TYPE_FLOAT, Option::LEVEL_ADVANCED)
+ .set_default(.75)
+ .set_description(""),
+
+ Option("mon_osd_warn_op_age", Option::TYPE_FLOAT, Option::LEVEL_ADVANCED)
+ .set_default(32)
+ .set_description(""),
+
+ Option("mon_osd_err_op_age_ratio", Option::TYPE_FLOAT, Option::LEVEL_ADVANCED)
+ .set_default(128)
+ .set_description(""),
+
+ Option("mon_osd_max_split_count", Option::TYPE_INT, Option::LEVEL_ADVANCED)
+ .set_default(32)
+ .set_description(""),
+
+ Option("mon_osd_allow_primary_temp", Option::TYPE_BOOL, Option::LEVEL_ADVANCED)
+ .set_default(false)
+ .set_description(""),
+
+ Option("mon_osd_allow_primary_affinity", Option::TYPE_BOOL, Option::LEVEL_ADVANCED)
+ .set_default(false)
+ .set_description(""),
+
+ Option("mon_osd_prime_pg_temp", Option::TYPE_BOOL, Option::LEVEL_ADVANCED)
+ .set_default(true)
+ .set_description(""),
+
+ Option("mon_osd_prime_pg_temp_max_time", Option::TYPE_FLOAT, Option::LEVEL_ADVANCED)
+ .set_default(.5)
+ .set_description(""),
+
+ Option("mon_osd_prime_pg_temp_max_estimate", Option::TYPE_FLOAT, Option::LEVEL_ADVANCED)
+ .set_default(.25)
+ .set_description(""),
+
+ Option("mon_osd_pool_ec_fast_read", Option::TYPE_BOOL, Option::LEVEL_ADVANCED)
+ .set_default(false)
+ .set_description(""),
+
+ Option("mon_stat_smooth_intervals", Option::TYPE_INT, Option::LEVEL_ADVANCED)
+ .set_default(6)
+ .set_description(""),
+
+ Option("mon_election_timeout", Option::TYPE_FLOAT, Option::LEVEL_ADVANCED)
+ .set_default(5)
+ .set_description(""),
+
+ Option("mon_lease", Option::TYPE_FLOAT, Option::LEVEL_ADVANCED)
+ .set_default(5)
+ .set_description(""),
+
+ Option("mon_lease_renew_interval_factor", Option::TYPE_FLOAT, Option::LEVEL_ADVANCED)
+ .set_default(.6)
+ .set_description(""),
+
+ Option("mon_lease_ack_timeout_factor", Option::TYPE_FLOAT, Option::LEVEL_ADVANCED)
+ .set_default(2.0)
+ .set_description(""),
+
+ Option("mon_accept_timeout_factor", Option::TYPE_FLOAT, Option::LEVEL_ADVANCED)
+ .set_default(2.0)
+ .set_description(""),
+
+ Option("mon_clock_drift_allowed", Option::TYPE_FLOAT, Option::LEVEL_ADVANCED)
+ .set_default(.050)
+ .set_description(""),
+
+ Option("mon_clock_drift_warn_backoff", Option::TYPE_FLOAT, Option::LEVEL_ADVANCED)
+ .set_default(5)
+ .set_description(""),
+
+ Option("mon_timecheck_interval", Option::TYPE_FLOAT, Option::LEVEL_ADVANCED)
+ .set_default(300.0)
+ .set_description(""),
+
+ Option("mon_timecheck_skew_interval", Option::TYPE_FLOAT, Option::LEVEL_ADVANCED)
+ .set_default(30.0)
+ .set_description(""),
+
+ Option("mon_pg_stuck_threshold", Option::TYPE_INT, Option::LEVEL_ADVANCED)
+ .set_default(60)
+ .set_description(""),
+
+ Option("mon_pg_min_inactive", Option::TYPE_UINT, Option::LEVEL_ADVANCED)
+ .set_default(1)
+ .set_description(""),
+
+ Option("mon_pg_warn_min_per_osd", Option::TYPE_INT, Option::LEVEL_ADVANCED)
+ .set_default(30)
+ .set_description(""),
+
+ Option("mon_pg_warn_max_per_osd", Option::TYPE_INT, Option::LEVEL_ADVANCED)
+ .set_default(300)
+ .set_description(""),
+
+ Option("mon_pg_warn_max_object_skew", Option::TYPE_FLOAT, Option::LEVEL_ADVANCED)
+ .set_default(10.0)
+ .set_description(""),
+
+ Option("mon_pg_warn_min_objects", Option::TYPE_INT, Option::LEVEL_ADVANCED)
+ .set_default(10000)
+ .set_description(""),
+
+ Option("mon_pg_warn_min_pool_objects", Option::TYPE_INT, Option::LEVEL_ADVANCED)
+ .set_default(1000)
+ .set_description(""),
+
+ Option("mon_pg_check_down_all_threshold", Option::TYPE_FLOAT, Option::LEVEL_ADVANCED)
+ .set_default(.5)
+ .set_description(""),
+
+ Option("mon_cache_target_full_warn_ratio", Option::TYPE_FLOAT, Option::LEVEL_ADVANCED)
+ .set_default(.66)
+ .set_description(""),
+
+ Option("mon_osd_full_ratio", Option::TYPE_FLOAT, Option::LEVEL_ADVANCED)
+ .set_default(.95)
+ .set_description(""),
+
+ Option("mon_osd_backfillfull_ratio", Option::TYPE_FLOAT, Option::LEVEL_ADVANCED)
+ .set_default(.90)
+ .set_description(""),
+
+ Option("mon_osd_nearfull_ratio", Option::TYPE_FLOAT, Option::LEVEL_ADVANCED)
+ .set_default(.85)
+ .set_description(""),
+
+ Option("mon_osd_initial_require_min_compat_client", Option::TYPE_STR, Option::LEVEL_ADVANCED)
+ .set_default("jewel")
+ .set_description(""),
+
+ Option("mon_allow_pool_delete", Option::TYPE_BOOL, Option::LEVEL_ADVANCED)
+ .set_default(false)
+ .set_description(""),
+
+ Option("mon_fake_pool_delete", Option::TYPE_BOOL, Option::LEVEL_ADVANCED)
+ .set_default(false)
+ .set_description(""),
+
+ Option("mon_globalid_prealloc", Option::TYPE_UINT, Option::LEVEL_ADVANCED)
+ .set_default(10000)
+ .set_description(""),
+
+ Option("mon_osd_report_timeout", Option::TYPE_INT, Option::LEVEL_ADVANCED)
+ .set_default(900)
+ .set_description(""),
+
+ Option("mon_force_standby_active", Option::TYPE_BOOL, Option::LEVEL_ADVANCED)
+ .set_default(true)
+ .set_description(""),
+
+ Option("mon_warn_on_legacy_crush_tunables", Option::TYPE_BOOL, Option::LEVEL_ADVANCED)
+ .set_default(true)
+ .set_description(""),
+
+ Option("mon_crush_min_required_version", Option::TYPE_STR, Option::LEVEL_ADVANCED)
+ .set_default("firefly")
+ .set_description(""),
+
+ Option("mon_warn_on_crush_straw_calc_version_zero", Option::TYPE_BOOL, Option::LEVEL_ADVANCED)
+ .set_default(true)
+ .set_description(""),
+
+ Option("mon_warn_on_osd_down_out_interval_zero", Option::TYPE_BOOL, Option::LEVEL_ADVANCED)
+ .set_default(true)
+ .set_description(""),
+
+ Option("mon_warn_on_cache_pools_without_hit_sets", Option::TYPE_BOOL, Option::LEVEL_ADVANCED)
+ .set_default(true)
+ .set_description(""),
+
+ Option("mon_min_osdmap_epochs", Option::TYPE_INT, Option::LEVEL_ADVANCED)
+ .set_default(500)
+ .set_description(""),
+
+ Option("mon_max_pgmap_epochs", Option::TYPE_INT, Option::LEVEL_ADVANCED)
+ .set_default(500)
+ .set_description(""),
+
+ Option("mon_max_log_epochs", Option::TYPE_INT, Option::LEVEL_ADVANCED)
+ .set_default(500)
+ .set_description(""),
+
+ Option("mon_max_mdsmap_epochs", Option::TYPE_INT, Option::LEVEL_ADVANCED)
+ .set_default(500)
+ .set_description(""),
+
+ Option("mon_max_osd", Option::TYPE_INT, Option::LEVEL_ADVANCED)
+ .set_default(10000)
+ .set_description(""),
+
+ Option("mon_probe_timeout", Option::TYPE_FLOAT, Option::LEVEL_ADVANCED)
+ .set_default(2.0)
+ .set_description(""),
+
+ Option("mon_client_bytes", Option::TYPE_UINT, Option::LEVEL_ADVANCED)
+ .set_default(100ul << 20) // 100 * 2^20 = 104857600; the ul suffix makes the shifted operand unsigned long
+ .set_description(""),
+
+ Option("mon_mgr_proxy_client_bytes_ratio", Option::TYPE_FLOAT, Option::LEVEL_ADVANCED)
+ .set_default(.3)
+ .set_description(""),
+
+ Option("mon_log_max_summary", Option::TYPE_UINT, Option::LEVEL_ADVANCED)
+ .set_default(50)
+ .set_description(""),
+
+ Option("mon_daemon_bytes", Option::TYPE_UINT, Option::LEVEL_ADVANCED)
+ .set_default(400ul << 20) // 400 * 2^20 = 419430400; the ul suffix makes the shifted operand unsigned long
+ .set_description(""),
+
+ Option("mon_max_log_entries_per_event", Option::TYPE_INT, Option::LEVEL_ADVANCED)
+ .set_default(4096)
+ .set_description(""),
+
+ Option("mon_reweight_min_pgs_per_osd", Option::TYPE_UINT, Option::LEVEL_ADVANCED)
+ .set_default(10)
+ .set_description(""),
+
+ Option("mon_reweight_min_bytes_per_osd", Option::TYPE_UINT, Option::LEVEL_ADVANCED)
+ .set_default(100*1024*1024) // evaluates to 104857600 (plain int arithmetic; fits in 32 bits)
+ .set_description(""),
+
+ Option("mon_reweight_max_osds", Option::TYPE_INT, Option::LEVEL_ADVANCED)
+ .set_default(4)
+ .set_description(""),
+
+ Option("mon_reweight_max_change", Option::TYPE_FLOAT, Option::LEVEL_ADVANCED)
+ .set_default(0.05)
+ .set_description(""),
+
+ Option("mon_health_data_update_interval", Option::TYPE_FLOAT, Option::LEVEL_ADVANCED)
+ .set_default(60.0)
+ .set_description(""),
+
+ Option("mon_health_to_clog", Option::TYPE_BOOL, Option::LEVEL_ADVANCED)
+ .set_default(true)
+ .set_description(""),
+
+ Option("mon_health_to_clog_interval", Option::TYPE_INT, Option::LEVEL_ADVANCED)
+ .set_default(3600)
+ .set_description(""),
+
+ Option("mon_health_to_clog_tick_interval", Option::TYPE_FLOAT, Option::LEVEL_ADVANCED)
+ .set_default(60.0)
+ .set_description(""),
+
+ Option("mon_health_preluminous_compat", Option::TYPE_BOOL, Option::LEVEL_ADVANCED)
+ .set_default(false)
+ .set_description(""),
+
+ Option("mon_health_max_detail", Option::TYPE_INT, Option::LEVEL_ADVANCED)
+ .set_default(50)
+ .set_description(""),
+
+ Option("mon_data_avail_crit", Option::TYPE_INT, Option::LEVEL_ADVANCED)
+ .set_default(5)
+ .set_description(""),
+
+ Option("mon_data_avail_warn", Option::TYPE_INT, Option::LEVEL_ADVANCED)
+ .set_default(30)
+ .set_description(""),
+
+ Option("mon_data_size_warn", Option::TYPE_UINT, Option::LEVEL_ADVANCED)
+ .set_default(15ull*1024*1024*1024) // 15 * 2^30 = 16106127360; ull forces unsigned long long math, the product does not fit in 32 bits
+ .set_description(""),
+
+ Option("mon_warn_not_scrubbed", Option::TYPE_INT, Option::LEVEL_ADVANCED)
+ .set_default(0)
+ .set_description(""),
+
+ Option("mon_warn_not_deep_scrubbed", Option::TYPE_INT, Option::LEVEL_ADVANCED)
+ .set_default(0)
+ .set_description(""),
+
+ Option("mon_scrub_interval", Option::TYPE_INT, Option::LEVEL_ADVANCED)
+ .set_default(3600*24) // evaluates to 86400
+ .set_description(""),
+
+ Option("mon_scrub_timeout", Option::TYPE_INT, Option::LEVEL_ADVANCED)
+ .set_default(60*5) // evaluates to 300
+ .set_description(""),
+
+ Option("mon_scrub_max_keys", Option::TYPE_INT, Option::LEVEL_ADVANCED)
+ .set_default(100)
+ .set_description(""),
+
+ Option("mon_scrub_inject_crc_mismatch", Option::TYPE_FLOAT, Option::LEVEL_DEV)
+ .set_default(0.0)
+ .set_description(""),
+
+ Option("mon_scrub_inject_missing_keys", Option::TYPE_FLOAT, Option::LEVEL_DEV)
+ .set_default(0.0)
+ .set_description(""),
+
+ Option("mon_config_key_max_entry_size", Option::TYPE_INT, Option::LEVEL_ADVANCED)
+ .set_default(4096)
+ .set_description(""),
+
+ Option("mon_sync_timeout", Option::TYPE_FLOAT, Option::LEVEL_ADVANCED)
+ .set_default(60.0)
+ .set_description(""),
+
+ Option("mon_sync_max_payload_size", Option::TYPE_UINT, Option::LEVEL_ADVANCED)
+ .set_default(1048576)
+ .set_description(""),
+
+ Option("mon_sync_debug", Option::TYPE_BOOL, Option::LEVEL_ADVANCED)
+ .set_default(false)
+ .set_description(""),
+
+ Option("mon_inject_sync_get_chunk_delay", Option::TYPE_FLOAT, Option::LEVEL_DEV)
+ .set_default(0)
+ .set_description(""),
+
+ Option("mon_osd_min_down_reporters", Option::TYPE_INT, Option::LEVEL_ADVANCED)
+ .set_default(2)
+ .set_description(""),
+
+ Option("mon_osd_reporter_subtree_level", Option::TYPE_STR, Option::LEVEL_ADVANCED)
+ .set_default("host")
+ .set_description(""),
+
+ Option("mon_osd_force_trim_to", Option::TYPE_INT, Option::LEVEL_ADVANCED)
+ .set_default(0)
+ .set_description(""),
+
+ Option("mon_mds_force_trim_to", Option::TYPE_INT, Option::LEVEL_ADVANCED)
+ .set_default(0)
+ .set_description(""),
+
+ Option("mon_mds_skip_sanity", Option::TYPE_BOOL, Option::LEVEL_ADVANCED)
+ .set_default(false)
+ .set_description(""),
+
+ Option("mon_debug_deprecated_as_obsolete", Option::TYPE_BOOL, Option::LEVEL_DEV)
+ .set_default(false)
+ .set_description(""),
+
+ Option("mon_debug_dump_transactions", Option::TYPE_BOOL, Option::LEVEL_DEV)
+ .set_default(false)
+ .set_description(""),
+
+ Option("mon_debug_dump_json", Option::TYPE_BOOL, Option::LEVEL_DEV)
+ .set_default(false)
+ .set_description(""),
+
+ Option("mon_debug_dump_location", Option::TYPE_STR, Option::LEVEL_DEV)
+ .set_default("/var/log/ceph/$cluster-$name.tdump")
+ .set_description(""),
+
+ Option("mon_debug_no_require_luminous", Option::TYPE_BOOL, Option::LEVEL_DEV)
+ .set_default(false)
+ .set_description(""),
+
+ Option("mon_debug_no_require_bluestore_for_ec_overwrites", Option::TYPE_BOOL, Option::LEVEL_DEV)
+ .set_default(false)
+ .set_description(""),
+
+ Option("mon_debug_no_initial_persistent_features", Option::TYPE_BOOL, Option::LEVEL_DEV)
+ .set_default(false)
+ .set_description(""),
+
+ Option("mon_inject_transaction_delay_max", Option::TYPE_FLOAT, Option::LEVEL_DEV)
+ .set_default(10.0)
+ .set_description(""),
+
+ Option("mon_inject_transaction_delay_probability", Option::TYPE_FLOAT, Option::LEVEL_DEV)
+ .set_default(0)
+ .set_description(""),
+
+ Option("mon_sync_provider_kill_at", Option::TYPE_INT, Option::LEVEL_DEV)
+ .set_default(0)
+ .set_description(""),
+
+ Option("mon_sync_requester_kill_at", Option::TYPE_INT, Option::LEVEL_DEV)
+ .set_default(0)
+ .set_description(""),
+
+ Option("mon_force_quorum_join", Option::TYPE_BOOL, Option::LEVEL_ADVANCED)
+ .set_default(false)
+ .set_description(""),
+
+ Option("mon_keyvaluedb", Option::TYPE_STR, Option::LEVEL_ADVANCED)
+ .set_default("rocksdb")
+ .set_description(""),
+
+ Option("mon_debug_unsafe_allow_tier_with_nonempty_snaps", Option::TYPE_BOOL, Option::LEVEL_DEV)
+ .set_default(false)
+ .set_description(""),
+
+ Option("mon_osd_blacklist_default_expire", Option::TYPE_FLOAT, Option::LEVEL_ADVANCED)
+ .set_default(60*60) // integer expression, evaluates to 3600, supplied for a TYPE_FLOAT option
+ .set_description(""),
+
+ Option("mon_osd_crush_smoke_test", Option::TYPE_BOOL, Option::LEVEL_ADVANCED)
+ .set_default(true)
+ .set_description(""),
+
+ Option("paxos_stash_full_interval", Option::TYPE_INT, Option::LEVEL_ADVANCED)
+ .set_default(25)
+ .set_description(""),
+
+ Option("paxos_max_join_drift", Option::TYPE_INT, Option::LEVEL_ADVANCED)
+ .set_default(10)
+ .set_description(""),
+
+ Option("paxos_propose_interval", Option::TYPE_FLOAT, Option::LEVEL_ADVANCED)
+ .set_default(1.0)
+ .set_description(""),
+
+ Option("paxos_min_wait", Option::TYPE_FLOAT, Option::LEVEL_ADVANCED)
+ .set_default(0.05)
+ .set_description(""),
+
+ Option("paxos_min", Option::TYPE_INT, Option::LEVEL_ADVANCED)
+ .set_default(500)
+ .set_description(""),
+
+ Option("paxos_trim_min", Option::TYPE_INT, Option::LEVEL_ADVANCED)
+ .set_default(250)
+ .set_description(""),
+
+ Option("paxos_trim_max", Option::TYPE_INT, Option::LEVEL_ADVANCED)
+ .set_default(500)
+ .set_description(""),
+
+ Option("paxos_service_trim_min", Option::TYPE_INT, Option::LEVEL_ADVANCED)
+ .set_default(250)
+ .set_description(""),
+
+ Option("paxos_service_trim_max", Option::TYPE_INT, Option::LEVEL_ADVANCED)
+ .set_default(500)
+ .set_description(""),
+
+ Option("paxos_kill_at", Option::TYPE_INT, Option::LEVEL_DEV)
+ .set_default(0)
+ .set_description(""),
+
+ Option("auth_cluster_required", Option::TYPE_STR, Option::LEVEL_ADVANCED)
+ .set_default("cephx")
+ .set_description(""),
+
+ Option("auth_service_required", Option::TYPE_STR, Option::LEVEL_ADVANCED)
+ .set_default("cephx")
+ .set_description(""),
+
+ Option("auth_client_required", Option::TYPE_STR, Option::LEVEL_ADVANCED)
+ .set_default("cephx, none")
+ .set_description(""),
+
+ Option("auth_supported", Option::TYPE_STR, Option::LEVEL_ADVANCED)
+ .set_default("")
+ .set_description(""),
+
+ Option("max_rotating_auth_attempts", Option::TYPE_INT, Option::LEVEL_ADVANCED)
+ .set_default(10)
+ .set_description(""),
+
+ Option("cephx_require_signatures", Option::TYPE_BOOL, Option::LEVEL_ADVANCED)
+ .set_default(false)
+ .set_description(""),
+
+ Option("cephx_cluster_require_signatures", Option::TYPE_BOOL, Option::LEVEL_ADVANCED)
+ .set_default(false)
+ .set_description(""),
+
+ Option("cephx_service_require_signatures", Option::TYPE_BOOL, Option::LEVEL_ADVANCED)
+ .set_default(false)
+ .set_description(""),
+
+ Option("cephx_sign_messages", Option::TYPE_BOOL, Option::LEVEL_ADVANCED)
+ .set_default(true)
+ .set_description(""),
+
+ Option("auth_mon_ticket_ttl", Option::TYPE_FLOAT, Option::LEVEL_ADVANCED)
+ .set_default(60*60*12) // evaluates to 43200; NOTE(review): presumably seconds (12 hours) - confirm unit
+ .set_description(""),
+
+ Option("auth_service_ticket_ttl", Option::TYPE_FLOAT, Option::LEVEL_ADVANCED)
+ .set_default(60*60) // evaluates to 3600; NOTE(review): presumably seconds (1 hour) - confirm unit
+ .set_description(""),
+
+ Option("auth_debug", Option::TYPE_BOOL, Option::LEVEL_DEV)
+ .set_default(false)
+ .set_description(""),
+
+ Option("mon_client_hunt_parallel", Option::TYPE_UINT, Option::LEVEL_ADVANCED)
+ .set_default(2)
+ .set_description(""),
+
+ Option("mon_client_hunt_interval", Option::TYPE_FLOAT, Option::LEVEL_ADVANCED)
+ .set_default(3.0)
+ .set_description(""),
+
+ Option("mon_client_ping_interval", Option::TYPE_FLOAT, Option::LEVEL_ADVANCED)
+ .set_default(10.0)
+ .set_description(""),
+
+ Option("mon_client_ping_timeout", Option::TYPE_FLOAT, Option::LEVEL_ADVANCED)
+ .set_default(30.0)
+ .set_description(""),
+
+ Option("mon_client_hunt_interval_backoff", Option::TYPE_FLOAT, Option::LEVEL_ADVANCED)
+ .set_default(2.0)
+ .set_description(""),
+
+ Option("mon_client_hunt_interval_min_multiple", Option::TYPE_FLOAT, Option::LEVEL_ADVANCED)
+ .set_default(1.0)
+ .set_description(""),
+
+ Option("mon_client_hunt_interval_max_multiple", Option::TYPE_FLOAT, Option::LEVEL_ADVANCED)
+ .set_default(10.0)
+ .set_description(""),
+
+ Option("mon_client_max_log_entries_per_message", Option::TYPE_INT, Option::LEVEL_ADVANCED)
+ .set_default(1000)
+ .set_description(""),
+
+ Option("mon_max_pool_pg_num", Option::TYPE_INT, Option::LEVEL_ADVANCED)
+ .set_default(65536)
+ .set_description(""),
+
+ Option("mon_pool_quota_warn_threshold", Option::TYPE_INT, Option::LEVEL_ADVANCED)
+ .set_default(0)
+ .set_description(""),
+
+ Option("mon_pool_quota_crit_threshold", Option::TYPE_INT, Option::LEVEL_ADVANCED)
+ .set_default(0)
+ .set_description(""),
+
+ Option("crush_location", Option::TYPE_STR, Option::LEVEL_ADVANCED)
+ .set_default("")
+ .set_description(""),
+
+ Option("crush_location_hook", Option::TYPE_STR, Option::LEVEL_ADVANCED)
+ .set_default("")
+ .set_description(""),
+
+ Option("crush_location_hook_timeout", Option::TYPE_INT, Option::LEVEL_ADVANCED)
+ .set_default(10)
+ .set_description(""),
+
+ Option("objecter_tick_interval", Option::TYPE_FLOAT, Option::LEVEL_ADVANCED)
+ .set_default(5.0)
+ .set_description(""),
+
+ Option("objecter_timeout", Option::TYPE_FLOAT, Option::LEVEL_ADVANCED)
+ .set_default(10.0)
+ .set_description(""),
+
+ Option("objecter_inflight_op_bytes", Option::TYPE_UINT, Option::LEVEL_ADVANCED)
+ .set_default(1024*1024*100) // evaluates to 104857600 (plain int arithmetic; fits in 32 bits)
+ .set_description(""),
+
+ Option("objecter_inflight_ops", Option::TYPE_UINT, Option::LEVEL_ADVANCED)
+ .set_default(1024)
+ .set_description(""),
+
+ Option("objecter_completion_locks_per_session", Option::TYPE_UINT, Option::LEVEL_ADVANCED)
+ .set_default(32)
+ .set_description(""),
+
+ Option("objecter_inject_no_watch_ping", Option::TYPE_BOOL, Option::LEVEL_DEV)
+ .set_default(false)
+ .set_description(""),
+
+ Option("objecter_retry_writes_after_first_reply", Option::TYPE_BOOL, Option::LEVEL_ADVANCED)
+ .set_default(false)
+ .set_description(""),
+
+ Option("objecter_debug_inject_relock_delay", Option::TYPE_BOOL, Option::LEVEL_DEV)
+ .set_default(false)
+ .set_description(""),
+
+ Option("filer_max_purge_ops", Option::TYPE_UINT, Option::LEVEL_ADVANCED)
+ .set_default(10)
+ .set_description(""),
+
+ Option("filer_max_truncate_ops", Option::TYPE_UINT, Option::LEVEL_ADVANCED)
+ .set_default(128)
+ .set_description(""),
+
+ Option("journaler_write_head_interval", Option::TYPE_INT, Option::LEVEL_ADVANCED)
+ .set_default(15)
+ .set_description(""),
+
+ Option("journaler_prefetch_periods", Option::TYPE_INT, Option::LEVEL_ADVANCED)
+ .set_default(10)
+ .set_description(""),
+
+ Option("journaler_prezero_periods", Option::TYPE_INT, Option::LEVEL_ADVANCED)
+ .set_default(5)
+ .set_description(""),
+
+ Option("osd_check_max_object_name_len_on_startup", Option::TYPE_BOOL, Option::LEVEL_ADVANCED)
+ .set_default(true)
+ .set_description(""),
+
+ Option("osd_max_backfills", Option::TYPE_UINT, Option::LEVEL_ADVANCED)
+ .set_default(1)
+ .set_description(""),
+
+ Option("osd_min_recovery_priority", Option::TYPE_INT, Option::LEVEL_ADVANCED)
+ .set_default(0)
+ .set_description(""),
+
+ Option("osd_backfill_retry_interval", Option::TYPE_FLOAT, Option::LEVEL_ADVANCED)
+ .set_default(30.0)
+ .set_description(""),
+
+ Option("osd_recovery_retry_interval", Option::TYPE_FLOAT, Option::LEVEL_ADVANCED)
+ .set_default(30.0)
+ .set_description(""),
+
+ Option("osd_agent_max_ops", Option::TYPE_INT, Option::LEVEL_ADVANCED)
+ .set_default(4)
+ .set_description(""),
+
+ Option("osd_agent_max_low_ops", Option::TYPE_INT, Option::LEVEL_ADVANCED)
+ .set_default(2)
+ .set_description(""),
+
+ Option("osd_agent_min_evict_effort", Option::TYPE_FLOAT, Option::LEVEL_ADVANCED)
+ .set_default(.1)
+ .set_description(""),
+
+ Option("osd_agent_quantize_effort", Option::TYPE_FLOAT, Option::LEVEL_ADVANCED)
+ .set_default(.1)
+ .set_description(""),
+
+ Option("osd_agent_delay_time", Option::TYPE_FLOAT, Option::LEVEL_ADVANCED)
+ .set_default(5.0)
+ .set_description(""),
+
+ Option("osd_find_best_info_ignore_history_les", Option::TYPE_BOOL, Option::LEVEL_ADVANCED)
+ .set_default(false)
+ .set_description(""),
+
+ Option("osd_agent_hist_halflife", Option::TYPE_INT, Option::LEVEL_ADVANCED)
+ .set_default(1000)
+ .set_description(""),
+
+ Option("osd_agent_slop", Option::TYPE_FLOAT, Option::LEVEL_ADVANCED)
+ .set_default(.02)
+ .set_description(""),
+
+ Option("osd_uuid", Option::TYPE_UUID, Option::LEVEL_ADVANCED)
+ .set_default(uuid_d())
+ .set_description(""),
+
+ Option("osd_data", Option::TYPE_STR, Option::LEVEL_ADVANCED)
+ .set_default("/var/lib/ceph/osd/$cluster-$id")
+ .set_description(""),
+
+ Option("osd_journal", Option::TYPE_STR, Option::LEVEL_ADVANCED)
+ .set_default("/var/lib/ceph/osd/$cluster-$id/journal")
+ .set_description(""),
+
+ Option("osd_journal_size", Option::TYPE_INT, Option::LEVEL_ADVANCED)
+ .set_default(5120)
+ .set_description(""),
+
+ Option("osd_journal_flush_on_shutdown", Option::TYPE_BOOL, Option::LEVEL_ADVANCED)
+ .set_default(true)
+ .set_description(""),
+
+ Option("osd_os_flags", Option::TYPE_UINT, Option::LEVEL_ADVANCED)
+ .set_default(0)
+ .set_description(""),
+
+ Option("osd_max_write_size", Option::TYPE_INT, Option::LEVEL_ADVANCED)
+ .set_default(90)
+ .set_description(""),
+
+ Option("osd_max_pgls", Option::TYPE_UINT, Option::LEVEL_ADVANCED)
+ .set_default(1024)
+ .set_description(""),
+
+ Option("osd_client_message_size_cap", Option::TYPE_UINT, Option::LEVEL_ADVANCED)
+ .set_default(500*1024L*1024L) // evaluates to 524288000; the L suffixes promote the arithmetic to long
+ .set_description(""),
+
+ Option("osd_client_message_cap", Option::TYPE_UINT, Option::LEVEL_ADVANCED)
+ .set_default(100)
+ .set_description(""),
+
+ Option("osd_pg_bits", Option::TYPE_INT, Option::LEVEL_ADVANCED)
+ .set_default(6)
+ .set_description(""),
+
+ Option("osd_pgp_bits", Option::TYPE_INT, Option::LEVEL_ADVANCED)
+ .set_default(6)
+ .set_description(""),
+
+ Option("osd_crush_update_weight_set", Option::TYPE_BOOL, Option::LEVEL_ADVANCED)
+ .set_default(true)
+ .set_description(""),
+
+ Option("osd_crush_chooseleaf_type", Option::TYPE_INT, Option::LEVEL_ADVANCED)
+ .set_default(1)
+ .set_description(""),
+
+ Option("osd_pool_use_gmt_hitset", Option::TYPE_BOOL, Option::LEVEL_ADVANCED)
+ .set_default(true)
+ .set_description(""),
+
+ Option("osd_crush_update_on_start", Option::TYPE_BOOL, Option::LEVEL_ADVANCED)
+ .set_default(true)
+ .set_description(""),
+
+ Option("osd_class_update_on_start", Option::TYPE_BOOL, Option::LEVEL_ADVANCED)
+ .set_default(true)
+ .set_description(""),
+
+ Option("osd_crush_initial_weight", Option::TYPE_FLOAT, Option::LEVEL_ADVANCED)
+ .set_default(-1)
+ .set_description(""),
+
+ Option("osd_pool_default_crush_rule", Option::TYPE_INT, Option::LEVEL_ADVANCED)
+ .set_default(-1)
+ .set_description(""),
+
+ Option("osd_pool_erasure_code_stripe_unit", Option::TYPE_UINT, Option::LEVEL_ADVANCED)
+ .set_default(4096)
+ .set_description(""),
+
+ Option("osd_pool_default_size", Option::TYPE_INT, Option::LEVEL_ADVANCED)
+ .set_default(3)
+ .set_description(""),
+
+ Option("osd_pool_default_min_size", Option::TYPE_INT, Option::LEVEL_ADVANCED)
+ .set_default(0)
+ .set_description(""),
+
+ Option("osd_pool_default_pg_num", Option::TYPE_INT, Option::LEVEL_ADVANCED)
+ .set_default(8)
+ .set_description(""),
+
+ Option("osd_pool_default_pgp_num", Option::TYPE_INT, Option::LEVEL_ADVANCED)
+ .set_default(8)
+ .set_description(""),
+
+ Option("osd_pool_default_type", Option::TYPE_STR, Option::LEVEL_ADVANCED)
+ .set_default("replicated")
+ .set_description(""),
+
+ Option("osd_pool_default_erasure_code_profile", Option::TYPE_STR, Option::LEVEL_ADVANCED)
+ .set_default("plugin=jerasure technique=reed_sol_van k=2 m=1")
+ .set_description(""),
+
+ Option("osd_erasure_code_plugins", Option::TYPE_STR, Option::LEVEL_ADVANCED)
+ .set_default("jerasure lrc" // base value; adjacent string literals below are concatenated by the compiler
+#ifdef HAVE_BETTER_YASM_ELF64
+ " isa" // appended only when HAVE_BETTER_YASM_ELF64 is defined, giving "jerasure lrc isa"
+#endif
+ )
+ .set_description(""),
+
+ Option("osd_allow_recovery_below_min_size", Option::TYPE_BOOL, Option::LEVEL_ADVANCED)
+ .set_default(true)
+ .set_description(""),
+
+ Option("osd_pool_default_flags", Option::TYPE_INT, Option::LEVEL_ADVANCED)
+ .set_default(0)
+ .set_description(""),
+
+ Option("osd_pool_default_flag_hashpspool", Option::TYPE_BOOL, Option::LEVEL_ADVANCED)
+ .set_default(true)
+ .set_description(""),
+
+ Option("osd_pool_default_flag_nodelete", Option::TYPE_BOOL, Option::LEVEL_ADVANCED)
+ .set_default(false)
+ .set_description(""),
+
+ Option("osd_pool_default_flag_nopgchange", Option::TYPE_BOOL, Option::LEVEL_ADVANCED)
+ .set_default(false)
+ .set_description(""),
+
+ Option("osd_pool_default_flag_nosizechange", Option::TYPE_BOOL, Option::LEVEL_ADVANCED)
+ .set_default(false)
+ .set_description(""),
+
+ Option("osd_pool_default_hit_set_bloom_fpp", Option::TYPE_FLOAT, Option::LEVEL_ADVANCED)
+ .set_default(.05)
+ .set_description(""),
+
+ Option("osd_pool_default_cache_target_dirty_ratio", Option::TYPE_FLOAT, Option::LEVEL_ADVANCED)
+ .set_default(.4)
+ .set_description(""),
+
+ Option("osd_pool_default_cache_target_dirty_high_ratio", Option::TYPE_FLOAT, Option::LEVEL_ADVANCED)
+ .set_default(.6)
+ .set_description(""),
+
+ Option("osd_pool_default_cache_target_full_ratio", Option::TYPE_FLOAT, Option::LEVEL_ADVANCED)
+ .set_default(.8)
+ .set_description(""),
+
+ Option("osd_pool_default_cache_min_flush_age", Option::TYPE_INT, Option::LEVEL_ADVANCED)
+ .set_default(0)
+ .set_description(""),
+
+ Option("osd_pool_default_cache_min_evict_age", Option::TYPE_INT, Option::LEVEL_ADVANCED)
+ .set_default(0)
+ .set_description(""),
+
+ Option("osd_pool_default_cache_max_evict_check_size", Option::TYPE_INT, Option::LEVEL_ADVANCED)
+ .set_default(10)
+ .set_description(""),
+
+ Option("osd_hit_set_min_size", Option::TYPE_INT, Option::LEVEL_ADVANCED)
+ .set_default(1000)
+ .set_description(""),
+
+ Option("osd_hit_set_max_size", Option::TYPE_INT, Option::LEVEL_ADVANCED)
+ .set_default(100000)
+ .set_description(""),
+
+ Option("osd_hit_set_namespace", Option::TYPE_STR, Option::LEVEL_ADVANCED)
+ .set_default(".ceph-internal")
+ .set_description(""),
+
+ Option("osd_tier_promote_max_objects_sec", Option::TYPE_UINT, Option::LEVEL_ADVANCED)
+ .set_default(25)
+ .set_description(""),
+
+ Option("osd_tier_promote_max_bytes_sec", Option::TYPE_UINT, Option::LEVEL_ADVANCED)
+ .set_default(5 * 1024*1024) // evaluates to 5242880 (5 * 2^20)
+ .set_description(""),
+
+ Option("osd_tier_default_cache_mode", Option::TYPE_STR, Option::LEVEL_ADVANCED)
+ .set_default("writeback")
+ .set_description(""),
+
+ Option("osd_tier_default_cache_hit_set_count", Option::TYPE_INT, Option::LEVEL_ADVANCED)
+ .set_default(4)
+ .set_description(""),
+
+ Option("osd_tier_default_cache_hit_set_period", Option::TYPE_INT, Option::LEVEL_ADVANCED)
+ .set_default(1200)
+ .set_description(""),
+
+ Option("osd_tier_default_cache_hit_set_type", Option::TYPE_STR, Option::LEVEL_ADVANCED)
+ .set_default("bloom")
+ .set_description(""),
+
+ Option("osd_tier_default_cache_min_read_recency_for_promote", Option::TYPE_INT, Option::LEVEL_ADVANCED)
+ .set_default(1)
+ .set_description(""),
+
+ Option("osd_tier_default_cache_min_write_recency_for_promote", Option::TYPE_INT, Option::LEVEL_ADVANCED)
+ .set_default(1)
+ .set_description(""),
+
+ Option("osd_tier_default_cache_hit_set_grade_decay_rate", Option::TYPE_INT, Option::LEVEL_ADVANCED)
+ .set_default(20)
+ .set_description(""),
+
+ Option("osd_tier_default_cache_hit_set_search_last_n", Option::TYPE_INT, Option::LEVEL_ADVANCED)
+ .set_default(1)
+ .set_description(""),
+
+ Option("osd_map_dedup", Option::TYPE_BOOL, Option::LEVEL_ADVANCED)
+ .set_default(true)
+ .set_description(""),
+
+ Option("osd_map_max_advance", Option::TYPE_INT, Option::LEVEL_ADVANCED)
+ .set_default(40)
+ .set_description(""),
+
+ Option("osd_map_cache_size", Option::TYPE_INT, Option::LEVEL_ADVANCED)
+ .set_default(50)
+ .set_description(""),
+
+ Option("osd_map_message_max", Option::TYPE_INT, Option::LEVEL_ADVANCED)
+ .set_default(40)
+ .set_description(""),
+
+ Option("osd_map_share_max_epochs", Option::TYPE_INT, Option::LEVEL_ADVANCED)
+ .set_default(40)
+ .set_description(""),
+
+ Option("osd_inject_bad_map_crc_probability", Option::TYPE_FLOAT, Option::LEVEL_DEV)
+ .set_default(0)
+ .set_description(""),
+
+ Option("osd_inject_failure_on_pg_removal", Option::TYPE_BOOL, Option::LEVEL_DEV)
+ .set_default(false)
+ .set_description(""),
+
+ Option("osd_max_markdown_period", Option::TYPE_INT, Option::LEVEL_ADVANCED)
+ .set_default(600)
+ .set_description(""),
+
+ Option("osd_max_markdown_count", Option::TYPE_INT, Option::LEVEL_ADVANCED)
+ .set_default(5)
+ .set_description(""),
+
+ Option("osd_peering_wq_threads", Option::TYPE_INT, Option::LEVEL_ADVANCED)
+ .set_default(2)
+ .set_description(""),
+
+ Option("osd_peering_wq_batch_size", Option::TYPE_UINT, Option::LEVEL_ADVANCED)
+ .set_default(20)
+ .set_description(""),
+
+ Option("osd_op_pq_max_tokens_per_priority", Option::TYPE_UINT, Option::LEVEL_ADVANCED)
+ .set_default(4194304)
+ .set_description(""),
+
+ Option("osd_op_pq_min_cost", Option::TYPE_UINT, Option::LEVEL_ADVANCED)
+ .set_default(65536)
+ .set_description(""),
+
+ Option("osd_disk_threads", Option::TYPE_INT, Option::LEVEL_ADVANCED)
+ .set_default(1)
+ .set_description(""),
+
+ Option("osd_disk_thread_ioprio_class", Option::TYPE_STR, Option::LEVEL_ADVANCED)
+ .set_default("")
+ .set_description(""),
+
+ Option("osd_disk_thread_ioprio_priority", Option::TYPE_INT, Option::LEVEL_ADVANCED)
+ .set_default(-1)
+ .set_description(""),
+
+ Option("osd_recover_clone_overlap", Option::TYPE_BOOL, Option::LEVEL_ADVANCED)
+ .set_default(true)
+ .set_description(""),
+
+ Option("osd_op_num_threads_per_shard", Option::TYPE_INT, Option::LEVEL_ADVANCED)
+ .set_default(0)
+ .set_description(""),
+
+ Option("osd_op_num_threads_per_shard_hdd", Option::TYPE_INT, Option::LEVEL_ADVANCED)
+ .set_default(1)
+ .set_description(""),
+
+ Option("osd_op_num_threads_per_shard_ssd", Option::TYPE_INT, Option::LEVEL_ADVANCED)
+ .set_default(2)
+ .set_description(""),
+
+ Option("osd_op_num_shards", Option::TYPE_INT, Option::LEVEL_ADVANCED)
+ .set_default(0)
+ .set_description(""),
+
+ Option("osd_op_num_shards_hdd", Option::TYPE_INT, Option::LEVEL_ADVANCED)
+ .set_default(5)
+ .set_description(""),
+
+ Option("osd_op_num_shards_ssd", Option::TYPE_INT, Option::LEVEL_ADVANCED)
+ .set_default(8)
+ .set_description(""),
+
+ Option("osd_op_queue", Option::TYPE_STR, Option::LEVEL_ADVANCED)
+ .set_default("wpq")
+ .set_description(""),
+
+ Option("osd_op_queue_cut_off", Option::TYPE_STR, Option::LEVEL_ADVANCED)
+ .set_default("low")
+ .set_description(""),
+
+ Option("osd_op_queue_mclock_client_op_res", Option::TYPE_FLOAT, Option::LEVEL_ADVANCED)
+ .set_default(1000.0)
+ .set_description(""),
+
+ Option("osd_op_queue_mclock_client_op_wgt", Option::TYPE_FLOAT, Option::LEVEL_ADVANCED)
+ .set_default(500.0)
+ .set_description(""),
+
+ Option("osd_op_queue_mclock_client_op_lim", Option::TYPE_FLOAT, Option::LEVEL_ADVANCED)
+ .set_default(0.0)
+ .set_description(""),
+
+ Option("osd_op_queue_mclock_osd_subop_res", Option::TYPE_FLOAT, Option::LEVEL_ADVANCED)
+ .set_default(1000.0)
+ .set_description(""),
+
+ Option("osd_op_queue_mclock_osd_subop_wgt", Option::TYPE_FLOAT, Option::LEVEL_ADVANCED)
+ .set_default(500.0)
+ .set_description(""),
+
+ Option("osd_op_queue_mclock_osd_subop_lim", Option::TYPE_FLOAT, Option::LEVEL_ADVANCED)
+ .set_default(0.0)
+ .set_description(""),
+
+ Option("osd_op_queue_mclock_snap_res", Option::TYPE_FLOAT, Option::LEVEL_ADVANCED)
+ .set_default(0.0)
+ .set_description(""),
+
+ Option("osd_op_queue_mclock_snap_wgt", Option::TYPE_FLOAT, Option::LEVEL_ADVANCED)
+ .set_default(1.0)
+ .set_description(""),
+
+ Option("osd_op_queue_mclock_snap_lim", Option::TYPE_FLOAT, Option::LEVEL_ADVANCED)
+ .set_default(0.001)
+ .set_description(""),
+
+ Option("osd_op_queue_mclock_recov_res", Option::TYPE_FLOAT, Option::LEVEL_ADVANCED)
+ .set_default(0.0)
+ .set_description(""),
+
+ Option("osd_op_queue_mclock_recov_wgt", Option::TYPE_FLOAT, Option::LEVEL_ADVANCED)
+ .set_default(1.0)
+ .set_description(""),
+
+ Option("osd_op_queue_mclock_recov_lim", Option::TYPE_FLOAT, Option::LEVEL_ADVANCED)
+ .set_default(0.001)
+ .set_description(""),
+
+ Option("osd_op_queue_mclock_scrub_res", Option::TYPE_FLOAT, Option::LEVEL_ADVANCED)
+ .set_default(0.0)
+ .set_description(""),
+
+ Option("osd_op_queue_mclock_scrub_wgt", Option::TYPE_FLOAT, Option::LEVEL_ADVANCED)
+ .set_default(1.0)
+ .set_description(""),
+
+ Option("osd_op_queue_mclock_scrub_lim", Option::TYPE_FLOAT, Option::LEVEL_ADVANCED)
+ .set_default(0.001)
+ .set_description(""),
+
+ Option("osd_ignore_stale_divergent_priors", Option::TYPE_BOOL, Option::LEVEL_ADVANCED)
+ .set_default(false)
+ .set_description(""),
+
+ Option("osd_read_ec_check_for_errors", Option::TYPE_BOOL, Option::LEVEL_ADVANCED)
+ .set_default(false)
+ .set_description(""),
+
+ Option("osd_recover_clone_overlap_limit", Option::TYPE_INT, Option::LEVEL_ADVANCED)
+ .set_default(10)
+ .set_description(""),
+
+ Option("osd_backfill_scan_min", Option::TYPE_INT, Option::LEVEL_ADVANCED)
+ .set_default(64)
+ .set_description(""),
+
+ Option("osd_backfill_scan_max", Option::TYPE_INT, Option::LEVEL_ADVANCED)
+ .set_default(512)
+ .set_description(""),
+
+ Option("osd_op_thread_timeout", Option::TYPE_INT, Option::LEVEL_ADVANCED)
+ .set_default(15)
+ .set_description(""),
+
+ Option("osd_op_thread_suicide_timeout", Option::TYPE_INT, Option::LEVEL_ADVANCED)
+ .set_default(150)
+ .set_description(""),
+
+ Option("osd_recovery_thread_timeout", Option::TYPE_INT, Option::LEVEL_ADVANCED)
+ .set_default(30)
+ .set_description(""),
+
+ Option("osd_recovery_thread_suicide_timeout", Option::TYPE_INT, Option::LEVEL_ADVANCED)
+ .set_default(300)
+ .set_description(""),
+
+ Option("osd_recovery_sleep", Option::TYPE_FLOAT, Option::LEVEL_ADVANCED)
+ .set_default(0)
+ .set_description(""),
+
+ Option("osd_recovery_sleep_hdd", Option::TYPE_FLOAT, Option::LEVEL_ADVANCED)
+ .set_default(0.1)
+ .set_description(""),
+
+ Option("osd_recovery_sleep_ssd", Option::TYPE_FLOAT, Option::LEVEL_ADVANCED)
+ .set_default(0)
+ .set_description(""),
+
+ Option("osd_snap_trim_sleep", Option::TYPE_FLOAT, Option::LEVEL_ADVANCED)
+ .set_default(0)
+ .set_description(""),
+
+ Option("osd_scrub_invalid_stats", Option::TYPE_BOOL, Option::LEVEL_ADVANCED)
+ .set_default(true)
+ .set_description(""),
+
+ Option("osd_remove_thread_timeout", Option::TYPE_INT, Option::LEVEL_ADVANCED)
+ .set_default(60*60) // evaluates to 3600; presumably seconds (1 hour) -- unit is not shown here, confirm
+ .set_description(""),
+
+ Option("osd_remove_thread_suicide_timeout", Option::TYPE_INT, Option::LEVEL_ADVANCED)
+ .set_default(10*60*60) // evaluates to 36000; presumably seconds (10 hours) -- unit is not shown here, confirm
+ .set_description(""),
+
+ Option("osd_command_thread_timeout", Option::TYPE_INT, Option::LEVEL_ADVANCED)
+ .set_default(10*60) // evaluates to 600; presumably seconds (10 minutes) -- unit is not shown here, confirm
+ .set_description(""),
+
+ Option("osd_command_thread_suicide_timeout", Option::TYPE_INT, Option::LEVEL_ADVANCED)
+ .set_default(15*60) // evaluates to 900; presumably seconds (15 minutes) -- unit is not shown here, confirm
+ .set_description(""),
+
+ Option("osd_heartbeat_addr", Option::TYPE_ADDR, Option::LEVEL_ADVANCED)
+ .set_default(entity_addr_t())
+ .set_description(""),
+
+ Option("osd_heartbeat_interval", Option::TYPE_INT, Option::LEVEL_ADVANCED)
+ .set_default(6)
+ .set_description(""),
+
+ Option("osd_heartbeat_grace", Option::TYPE_INT, Option::LEVEL_ADVANCED)
+ .set_default(20)
+ .set_description(""),
+
+ Option("osd_heartbeat_min_peers", Option::TYPE_INT, Option::LEVEL_ADVANCED)
+ .set_default(10)
+ .set_description(""),
+
+ Option("osd_heartbeat_use_min_delay_socket", Option::TYPE_BOOL, Option::LEVEL_ADVANCED)
+ .set_default(false)
+ .set_description(""),
+
+ Option("osd_heartbeat_min_size", Option::TYPE_INT, Option::LEVEL_ADVANCED)
+ .set_default(2000)
+ .set_description(""),
+
+ Option("osd_pg_max_concurrent_snap_trims", Option::TYPE_UINT, Option::LEVEL_ADVANCED)
+ .set_default(2)
+ .set_description(""),
+
+ Option("osd_max_trimming_pgs", Option::TYPE_UINT, Option::LEVEL_ADVANCED)
+ .set_default(2)
+ .set_description(""),
+
+ Option("osd_heartbeat_min_healthy_ratio", Option::TYPE_FLOAT, Option::LEVEL_ADVANCED)
+ .set_default(.33)
+ .set_description(""),
+
+ Option("osd_mon_heartbeat_interval", Option::TYPE_INT, Option::LEVEL_ADVANCED)
+ .set_default(30)
+ .set_description(""),
+
+ Option("osd_mon_report_interval_max", Option::TYPE_INT, Option::LEVEL_ADVANCED)
+ .set_default(600)
+ .set_description(""),
+
+ Option("osd_mon_report_interval_min", Option::TYPE_INT, Option::LEVEL_ADVANCED)
+ .set_default(5)
+ .set_description(""),
+
+ Option("osd_mon_report_max_in_flight", Option::TYPE_INT, Option::LEVEL_ADVANCED)
+ .set_default(2)
+ .set_description(""),
+
+ Option("osd_beacon_report_interval", Option::TYPE_INT, Option::LEVEL_ADVANCED)
+ .set_default(300)
+ .set_description(""),
+
+ Option("osd_pg_stat_report_interval_max", Option::TYPE_INT, Option::LEVEL_ADVANCED)
+ .set_default(500)
+ .set_description(""),
+
+ Option("osd_mon_ack_timeout", Option::TYPE_FLOAT, Option::LEVEL_ADVANCED)
+ .set_default(30.0)
+ .set_description(""),
+
+ Option("osd_stats_ack_timeout_factor", Option::TYPE_FLOAT, Option::LEVEL_ADVANCED)
+ .set_default(2.0)
+ .set_description(""),
+
+ Option("osd_stats_ack_timeout_decay", Option::TYPE_FLOAT, Option::LEVEL_ADVANCED)
+ .set_default(.9)
+ .set_description(""),
+
+ Option("osd_default_data_pool_replay_window", Option::TYPE_INT, Option::LEVEL_ADVANCED)
+ .set_default(45)
+ .set_description(""),
+
+ Option("osd_auto_mark_unfound_lost", Option::TYPE_BOOL, Option::LEVEL_ADVANCED)
+ .set_default(false)
+ .set_description(""),
+
+ Option("osd_recovery_delay_start", Option::TYPE_FLOAT, Option::LEVEL_ADVANCED)
+ .set_default(0)
+ .set_description(""),
+
+ Option("osd_recovery_max_active", Option::TYPE_UINT, Option::LEVEL_ADVANCED)
+ .set_default(3)
+ .set_description(""),
+
+ Option("osd_recovery_max_single_start", Option::TYPE_UINT, Option::LEVEL_ADVANCED)
+ .set_default(1)
+ .set_description(""),
+
+ Option("osd_recovery_max_chunk", Option::TYPE_UINT, Option::LEVEL_ADVANCED)
+ .set_default(8<<20) // evaluates to 8388608 (8 * 2^20); presumably a byte count, i.e. 8 MiB -- confirm
+ .set_description(""),
+
+ Option("osd_recovery_max_omap_entries_per_chunk", Option::TYPE_UINT, Option::LEVEL_ADVANCED)
+ .set_default(64000)
+ .set_description(""),
+
+ Option("osd_copyfrom_max_chunk", Option::TYPE_UINT, Option::LEVEL_ADVANCED)
+ .set_default(8<<20) // evaluates to 8388608 (8 * 2^20); presumably a byte count, i.e. 8 MiB -- confirm
+ .set_description(""),
+
+ Option("osd_push_per_object_cost", Option::TYPE_UINT, Option::LEVEL_ADVANCED)
+ .set_default(1000)
+ .set_description(""),
+
+ Option("osd_max_push_cost", Option::TYPE_UINT, Option::LEVEL_ADVANCED)
+ .set_default(8<<20) // evaluates to 8388608 (8 * 2^20); same literal as osd_recovery_max_chunk's default
+ .set_description(""),
+
+ Option("osd_max_push_objects", Option::TYPE_UINT, Option::LEVEL_ADVANCED)
+ .set_default(10)
+ .set_description(""),
+
+ Option("osd_recovery_forget_lost_objects", Option::TYPE_BOOL, Option::LEVEL_ADVANCED)
+ .set_default(false)
+ .set_description(""),
+
+ Option("osd_max_scrubs", Option::TYPE_INT, Option::LEVEL_ADVANCED)
+ .set_default(1)
+ .set_description(""),
+
+ Option("osd_scrub_during_recovery", Option::TYPE_BOOL, Option::LEVEL_ADVANCED)
+ .set_default(false)
+ .set_description(""),
+
+ Option("osd_scrub_begin_hour", Option::TYPE_INT, Option::LEVEL_ADVANCED)
+ .set_default(0)
+ .set_description(""),
+
+ Option("osd_scrub_end_hour", Option::TYPE_INT, Option::LEVEL_ADVANCED)
+ .set_default(24)
+ .set_description(""),
+
+ Option("osd_scrub_load_threshold", Option::TYPE_FLOAT, Option::LEVEL_ADVANCED)
+ .set_default(0.5)
+ .set_description(""),
+
+ Option("osd_scrub_min_interval", Option::TYPE_FLOAT, Option::LEVEL_ADVANCED)
+ .set_default(60*60*24) // integer expression (86400) given for a TYPE_FLOAT option; presumably seconds (1 day) -- confirm
+ .set_description(""),
+
+ Option("osd_scrub_max_interval", Option::TYPE_FLOAT, Option::LEVEL_ADVANCED)
+ .set_default(7*60*60*24) // integer expression (604800) given for a TYPE_FLOAT option; presumably seconds (7 days) -- confirm
+ .set_description(""),
+
+ Option("osd_scrub_interval_randomize_ratio", Option::TYPE_FLOAT, Option::LEVEL_ADVANCED)
+ .set_default(0.5)
+ .set_description(""),
+
+ Option("osd_scrub_backoff_ratio", Option::TYPE_FLOAT, Option::LEVEL_ADVANCED)
+ .set_default(.66)
+ .set_description(""),
+
+ Option("osd_scrub_chunk_min", Option::TYPE_INT, Option::LEVEL_ADVANCED)
+ .set_default(5)
+ .set_description(""),
+
+ Option("osd_scrub_chunk_max", Option::TYPE_INT, Option::LEVEL_ADVANCED)
+ .set_default(25)
+ .set_description(""),
+
+ Option("osd_scrub_sleep", Option::TYPE_FLOAT, Option::LEVEL_ADVANCED)
+ .set_default(0)
+ .set_description(""),
+
+ Option("osd_scrub_auto_repair", Option::TYPE_BOOL, Option::LEVEL_ADVANCED)
+ .set_default(false)
+ .set_description(""),
+
+ Option("osd_scrub_auto_repair_num_errors", Option::TYPE_UINT, Option::LEVEL_ADVANCED)
+ .set_default(5)
+ .set_description(""),
+
+ Option("osd_deep_scrub_interval", Option::TYPE_FLOAT, Option::LEVEL_ADVANCED)
+ .set_default(60*60*24*7) // evaluates to 604800, the same value as osd_scrub_max_interval's default; presumably seconds -- confirm
+ .set_description(""),
+
+ Option("osd_deep_scrub_randomize_ratio", Option::TYPE_FLOAT, Option::LEVEL_ADVANCED)
+ .set_default(0.15)
+ .set_description(""),
+
+ Option("osd_deep_scrub_stride", Option::TYPE_INT, Option::LEVEL_ADVANCED)
+ .set_default(524288)
+ .set_description(""),
+
+ Option("osd_deep_scrub_update_digest_min_age", Option::TYPE_INT, Option::LEVEL_ADVANCED)
+ .set_default(2*60*60)
+ .set_description(""),
+
+ Option("osd_class_dir", Option::TYPE_STR, Option::LEVEL_ADVANCED)
+ .set_default(CEPH_LIBDIR "/rados-classes")
+ .set_description(""),
+
+ Option("osd_open_classes_on_start", Option::TYPE_BOOL, Option::LEVEL_ADVANCED)
+ .set_default(true)
+ .set_description(""),
+
+ Option("osd_class_load_list", Option::TYPE_STR, Option::LEVEL_ADVANCED)
+ .set_default("cephfs hello journal lock log numops " "rbd refcount replica_log rgw statelog timeindex user version") // two adjacent literals are concatenated by the compiler; the trailing space in the first keeps "numops" and "rbd" separate
+ .set_description(""),
+
+ Option("osd_class_default_list", Option::TYPE_STR, Option::LEVEL_ADVANCED)
+ .set_default("cephfs hello journal lock log numops " "rbd refcount replica_log rgw statelog timeindex user version") // same space-separated list as osd_class_load_list's default; adjacent literals are concatenated by the compiler
+ .set_description(""),
+
+ Option("osd_check_for_log_corruption", Option::TYPE_BOOL, Option::LEVEL_ADVANCED)
+ .set_default(false)
+ .set_description(""),
+
+ Option("osd_use_stale_snap", Option::TYPE_BOOL, Option::LEVEL_ADVANCED)
+ .set_default(false)
+ .set_description(""),
+
+ Option("osd_rollback_to_cluster_snap", Option::TYPE_STR, Option::LEVEL_ADVANCED)
+ .set_default("")
+ .set_description(""),
+
+ Option("osd_default_notify_timeout", Option::TYPE_UINT, Option::LEVEL_ADVANCED)
+ .set_default(30)
+ .set_description(""),
+
+ Option("osd_kill_backfill_at", Option::TYPE_INT, Option::LEVEL_DEV)
+ .set_default(0)
+ .set_description(""),
+
+ Option("osd_pg_epoch_persisted_max_stale", Option::TYPE_UINT, Option::LEVEL_ADVANCED)
+ .set_default(40)
+ .set_description(""),
+
+ Option("osd_min_pg_log_entries", Option::TYPE_UINT, Option::LEVEL_ADVANCED)
+ .set_default(1500)
+ .set_description("minimum number of entries to maintain in the PG log")
+ .add_service("osd")
+ .add_see_also("osd_max_pg_log_entries")
+ .add_see_also("osd_pg_log_dups_tracked"),
+
+ Option("osd_max_pg_log_entries", Option::TYPE_UINT, Option::LEVEL_ADVANCED)
+ .set_default(10000)
+ .set_description("maximum number of entries to maintain in the PG log when degraded before we trim")
+ .add_service("osd")
+ .add_see_also("osd_min_pg_log_entries")
+ .add_see_also("osd_pg_log_dups_tracked"),
+
+ Option("osd_pg_log_dups_tracked", Option::TYPE_UINT, Option::LEVEL_ADVANCED)
+ .set_default(3000)
+ .set_description("how many versions back to track in order to detect duplicate ops; this is combined with both the regular pg log entries and additional minimal dup detection entries")
+ .add_service("osd")
+ .add_see_also("osd_min_pg_log_entries")
+ .add_see_also("osd_max_pg_log_entries"),
+
+ Option("osd_force_recovery_pg_log_entries_factor", Option::TYPE_FLOAT, Option::LEVEL_ADVANCED)
+ .set_default(1.3)
+ .set_description(""),
+
+ Option("osd_pg_log_trim_min", Option::TYPE_UINT, Option::LEVEL_ADVANCED)
+ .set_default(100)
+ .set_description(""),
+
+ Option("osd_op_complaint_time", Option::TYPE_FLOAT, Option::LEVEL_ADVANCED)
+ .set_default(30)
+ .set_description(""),
+
+ Option("osd_command_max_records", Option::TYPE_INT, Option::LEVEL_ADVANCED)
+ .set_default(256)
+ .set_description(""),
+
+ Option("osd_max_pg_blocked_by", Option::TYPE_UINT, Option::LEVEL_ADVANCED)
+ .set_default(16)
+ .set_description(""),
+
+ Option("osd_op_log_threshold", Option::TYPE_INT, Option::LEVEL_ADVANCED)
+ .set_default(5)
+ .set_description(""),
+
+ Option("osd_verify_sparse_read_holes", Option::TYPE_BOOL, Option::LEVEL_ADVANCED)
+ .set_default(false)
+ .set_description(""),
+
+ Option("osd_backoff_on_unfound", Option::TYPE_BOOL, Option::LEVEL_ADVANCED)
+ .set_default(true)
+ .set_description(""),
+
+ Option("osd_backoff_on_degraded", Option::TYPE_BOOL, Option::LEVEL_ADVANCED)
+ .set_default(false)
+ .set_description(""),
+
+ Option("osd_backoff_on_down", Option::TYPE_BOOL, Option::LEVEL_ADVANCED)
+ .set_default(true)
+ .set_description(""),
+
+ Option("osd_backoff_on_peering", Option::TYPE_BOOL, Option::LEVEL_ADVANCED)
+ .set_default(false)
+ .set_description(""),
+
+ Option("osd_debug_crash_on_ignored_backoff", Option::TYPE_BOOL, Option::LEVEL_DEV)
+ .set_default(false)
+ .set_description(""),
+
+ Option("osd_debug_inject_dispatch_delay_probability", Option::TYPE_FLOAT, Option::LEVEL_DEV)
+ .set_default(0)
+ .set_description(""),
+
+ Option("osd_debug_inject_dispatch_delay_duration", Option::TYPE_FLOAT, Option::LEVEL_DEV)
+ .set_default(.1)
+ .set_description(""),
+
+ Option("osd_debug_drop_ping_probability", Option::TYPE_FLOAT, Option::LEVEL_DEV)
+ .set_default(0)
+ .set_description(""),
+
+ Option("osd_debug_drop_ping_duration", Option::TYPE_INT, Option::LEVEL_DEV)
+ .set_default(0)
+ .set_description(""),
+
+ Option("osd_debug_op_order", Option::TYPE_BOOL, Option::LEVEL_DEV)
+ .set_default(false)
+ .set_description(""),
+
+ Option("osd_debug_verify_missing_on_start", Option::TYPE_BOOL, Option::LEVEL_DEV)
+ .set_default(false)
+ .set_description(""),
+
+ Option("osd_debug_scrub_chance_rewrite_digest", Option::TYPE_UINT, Option::LEVEL_DEV)
+ .set_default(0)
+ .set_description(""),
+
+ Option("osd_debug_verify_snaps_on_info", Option::TYPE_BOOL, Option::LEVEL_DEV)
+ .set_default(false)
+ .set_description(""),
+
+ Option("osd_debug_verify_stray_on_activate", Option::TYPE_BOOL, Option::LEVEL_DEV)
+ .set_default(false)
+ .set_description(""),
+
+ Option("osd_debug_skip_full_check_in_backfill_reservation", Option::TYPE_BOOL, Option::LEVEL_DEV)
+ .set_default(false)
+ .set_description(""),
+
+ Option("osd_debug_reject_backfill_probability", Option::TYPE_FLOAT, Option::LEVEL_DEV)
+ .set_default(0)
+ .set_description(""),
+
+ Option("osd_debug_inject_copyfrom_error", Option::TYPE_BOOL, Option::LEVEL_DEV)
+ .set_default(false)
+ .set_description(""),
+
+ Option("osd_debug_misdirected_ops", Option::TYPE_BOOL, Option::LEVEL_DEV)
+ .set_default(false)
+ .set_description(""),
+
+ Option("osd_debug_skip_full_check_in_recovery", Option::TYPE_BOOL, Option::LEVEL_DEV)
+ .set_default(false)
+ .set_description(""),
+
+ Option("osd_debug_random_push_read_error", Option::TYPE_FLOAT, Option::LEVEL_DEV)
+ .set_default(0)
+ .set_description(""),
+
+ Option("osd_debug_verify_cached_snaps", Option::TYPE_BOOL, Option::LEVEL_DEV)
+ .set_default(false)
+ .set_description(""),
+
+ Option("osd_enable_op_tracker", Option::TYPE_BOOL, Option::LEVEL_ADVANCED)
+ .set_default(true)
+ .set_description(""),
+
+ Option("osd_num_op_tracker_shard", Option::TYPE_UINT, Option::LEVEL_ADVANCED)
+ .set_default(32)
+ .set_description(""),
+
+ Option("osd_op_history_size", Option::TYPE_UINT, Option::LEVEL_ADVANCED)
+ .set_default(20)
+ .set_description(""),
+
+ Option("osd_op_history_duration", Option::TYPE_UINT, Option::LEVEL_ADVANCED)
+ .set_default(600)
+ .set_description(""),
+
+ Option("osd_op_history_slow_op_size", Option::TYPE_UINT, Option::LEVEL_ADVANCED)
+ .set_default(20)
+ .set_description(""),
+
+ Option("osd_op_history_slow_op_threshold", Option::TYPE_FLOAT, Option::LEVEL_ADVANCED)
+ .set_default(10.0)
+ .set_description(""),
+
+ Option("osd_target_transaction_size", Option::TYPE_INT, Option::LEVEL_ADVANCED)
+ .set_default(30)
+ .set_description(""),
+
+ Option("osd_failsafe_full_ratio", Option::TYPE_FLOAT, Option::LEVEL_ADVANCED)
+ .set_default(.97)
+ .set_description(""),
+
+ Option("osd_fast_fail_on_connection_refused", Option::TYPE_BOOL, Option::LEVEL_ADVANCED)
+ .set_default(true)
+ .set_description(""),
+
+ Option("osd_pg_object_context_cache_count", Option::TYPE_INT, Option::LEVEL_ADVANCED)
+ .set_default(64)
+ .set_description(""),
+
+ Option("osd_tracing", Option::TYPE_BOOL, Option::LEVEL_ADVANCED)
+ .set_default(false)
+ .set_description(""),
+
+ Option("osd_function_tracing", Option::TYPE_BOOL, Option::LEVEL_ADVANCED)
+ .set_default(false)
+ .set_description(""),
+
+ Option("osd_fast_info", Option::TYPE_BOOL, Option::LEVEL_ADVANCED)
+ .set_default(true)
+ .set_description(""),
+
+ Option("osd_debug_pg_log_writeout", Option::TYPE_BOOL, Option::LEVEL_DEV)
+ .set_default(false)
+ .set_description(""),
+
+ Option("osd_loop_before_reset_tphandle", Option::TYPE_UINT, Option::LEVEL_ADVANCED)
+ .set_default(64)
+ .set_description(""),
+
+ Option("threadpool_default_timeout", Option::TYPE_INT, Option::LEVEL_ADVANCED)
+ .set_default(60)
+ .set_description(""),
+
+ Option("threadpool_empty_queue_max_wait", Option::TYPE_INT, Option::LEVEL_ADVANCED)
+ .set_default(2)
+ .set_description(""),
+
+ Option("leveldb_log_to_ceph_log", Option::TYPE_BOOL, Option::LEVEL_ADVANCED)
+ .set_default(true)
+ .set_description(""),
+
+ Option("leveldb_write_buffer_size", Option::TYPE_UINT, Option::LEVEL_ADVANCED)
+ .set_default(8 *1024*1024) // evaluates to 8388608; presumably bytes (8 MiB) -- confirm
+ .set_description(""),
+
+ Option("leveldb_cache_size", Option::TYPE_UINT, Option::LEVEL_ADVANCED)
+ .set_default(128 *1024*1024) // evaluates to 134217728; presumably bytes (128 MiB) -- confirm
+ .set_description(""),
+
+ Option("leveldb_block_size", Option::TYPE_UINT, Option::LEVEL_ADVANCED)
+ .set_default(0)
+ .set_description(""),
+
+ Option("leveldb_bloom_size", Option::TYPE_INT, Option::LEVEL_ADVANCED)
+ .set_default(0)
+ .set_description(""),
+
+ Option("leveldb_max_open_files", Option::TYPE_INT, Option::LEVEL_ADVANCED)
+ .set_default(0)
+ .set_description(""),
+
+ Option("leveldb_compression", Option::TYPE_BOOL, Option::LEVEL_ADVANCED)
+ .set_default(true)
+ .set_description(""),
+
+ Option("leveldb_paranoid", Option::TYPE_BOOL, Option::LEVEL_ADVANCED)
+ .set_default(false)
+ .set_description(""),
+
+ Option("leveldb_log", Option::TYPE_STR, Option::LEVEL_ADVANCED)
+ .set_default("/dev/null")
+ .set_description(""),
+
+ Option("leveldb_compact_on_mount", Option::TYPE_BOOL, Option::LEVEL_ADVANCED)
+ .set_default(false)
+ .set_description(""),
+
+ Option("kinetic_host", Option::TYPE_STR, Option::LEVEL_ADVANCED)
+ .set_default("")
+ .set_description(""),
+
+ Option("kinetic_port", Option::TYPE_INT, Option::LEVEL_ADVANCED)
+ .set_default(8123)
+ .set_description(""),
+
+ Option("kinetic_user_id", Option::TYPE_INT, Option::LEVEL_ADVANCED)
+ .set_default(1)
+ .set_description(""),
+
+ Option("kinetic_hmac_key", Option::TYPE_STR, Option::LEVEL_ADVANCED)
+ .set_default("asdfasdf") // NOTE(review): a fixed placeholder string is the default for an option named as an HMAC key -- confirm deployments are expected to override it
+ .set_description(""),
+
+ Option("kinetic_use_ssl", Option::TYPE_BOOL, Option::LEVEL_ADVANCED)
+ .set_default(false)
+ .set_description(""),
+
+ Option("rocksdb_separate_wal_dir", Option::TYPE_BOOL, Option::LEVEL_ADVANCED)
+ .set_default(false)
+ .set_description(""),
+
+ Option("rocksdb_db_paths", Option::TYPE_STR, Option::LEVEL_ADVANCED)
+ .set_default("")
+ .set_description("")
+ .set_safe(),
+
+ Option("rocksdb_log_to_ceph_log", Option::TYPE_BOOL, Option::LEVEL_ADVANCED)
+ .set_default(true)
+ .set_description(""),
+
+ Option("rocksdb_cache_size", Option::TYPE_UINT, Option::LEVEL_ADVANCED)
+ .set_default(128*1024*1024) // evaluates to 134217728; presumably bytes (128 MiB) -- confirm
+ .set_description(""),
+
+ Option("rocksdb_cache_row_ratio", Option::TYPE_FLOAT, Option::LEVEL_ADVANCED)
+ .set_default(0)
+ .set_description(""),
+
+ Option("rocksdb_cache_shard_bits", Option::TYPE_INT, Option::LEVEL_ADVANCED)
+ .set_default(4)
+ .set_description(""),
+
+ Option("rocksdb_cache_type", Option::TYPE_STR, Option::LEVEL_ADVANCED)
+ .set_default("lru")
+ .set_description(""),
+
+ Option("rocksdb_block_size", Option::TYPE_INT, Option::LEVEL_ADVANCED)
+ .set_default(4*1024)
+ .set_description(""),
+
+ Option("rocksdb_perf", Option::TYPE_BOOL, Option::LEVEL_ADVANCED)
+ .set_default(false)
+ .set_description(""),
+
+ Option("rocksdb_collect_compaction_stats", Option::TYPE_BOOL, Option::LEVEL_ADVANCED)
+ .set_default(false)
+ .set_description(""),
+
+ Option("rocksdb_collect_extended_stats", Option::TYPE_BOOL, Option::LEVEL_ADVANCED)
+ .set_default(false)
+ .set_description(""),
+
+ Option("rocksdb_collect_memory_stats", Option::TYPE_BOOL, Option::LEVEL_ADVANCED)
+ .set_default(false)
+ .set_description(""),
+
+ Option("rocksdb_enable_rmrange", Option::TYPE_BOOL, Option::LEVEL_ADVANCED)
+ .set_default(false)
+ .set_description(""),
+
+ Option("rocksdb_bloom_bits_per_key", Option::TYPE_UINT, Option::LEVEL_ADVANCED)
+ .set_default(20)
+ .set_description("Number of bits per key to use for RocksDB's bloom filters.")
+ .set_long_description("RocksDB bloom filters can be used to quickly answer the question of whether or not a key may exist or definitely does not exist in a given RocksDB SST file without having to read all keys into memory. Using a higher bit value decreases the likelihood of false positives at the expense of additional disk space and memory consumption when the filter is loaded into RAM. The current default value of 20 was found to provide significant performance gains when getattr calls are made (such as during new object creation in bluestore) without significant memory overhead or cache pollution when combined with rocksdb partitioned index filters. See: https://github.com/facebook/rocksdb/wiki/Partitioned-Index-Filters for more information."),
+
+ Option("rocksdb_cache_index_and_filter_blocks", Option::TYPE_BOOL, Option::LEVEL_DEV) // DEV-level boolean, default true; the long description below explains the memory trade-off
+ .set_default(true)
+ .set_description("Whether to cache indices and filters in block cache")
+ .set_long_description("By default RocksDB will load an SST file's index and bloom filters into memory when it is opened and remove them from memory when an SST file is closed. Thus, memory consumption by indices and bloom filters is directly tied to the number of concurrent SST files allowed to be kept open. This option instead stores cached indices and filters in the block cache where they directly compete with other cached data. By default we set this option to true to better account for and bound rocksdb memory usage and keep filters in memory even when an SST file is closed."),
+
+ Option("rocksdb_cache_index_and_filter_blocks_with_high_priority", Option::TYPE_BOOL, Option::LEVEL_DEV)
+ .set_default(true)
+ .set_description("Whether to cache indices and filters in the block cache with high priority")
+ .set_long_description("A downside of setting rocksdb_cache_index_and_filter_blocks to true is that regular data can push indices and filters out of memory. Setting this option to true means they are cached with higher priority than other data and should typically stay in the block cache."),
+
+ Option("rocksdb_pin_l0_filter_and_index_blocks_in_cache", Option::TYPE_BOOL, Option::LEVEL_DEV)
+ .set_default(true)
+ .set_description("Whether to pin Level 0 indices and bloom filters in the block cache")
+ .set_long_description("A downside of setting rocksdb_cache_index_and_filter_blocks to true is that regular data can push indices and filters out of memory. Setting this option to true means that level 0 SST files will always have their indices and filters pinned in the block cache."),
+
+ Option("rocksdb_index_type", Option::TYPE_STR, Option::LEVEL_DEV) // DEV-level string; the accepted spellings are listed in the short description below
+ .set_default("binary_search")
+ .set_description("Type of index for SST files: binary_search, hash_search, two_level")
+ .set_long_description("This option controls the table index type. binary_search is a space efficient index block that is optimized for binary-search-based index. hash_search may improve prefix lookup performance at the expense of higher disk and memory usage and potentially slower compactions. two_level is an experimental index type that uses two binary search indexes and works in conjunction with partition filters. See: http://rocksdb.org/blog/2017/05/12/partitioned-index-filter.html"),
+
+ Option("rocksdb_partition_filters", Option::TYPE_BOOL, Option::LEVEL_DEV)
+ .set_default(false)
+ .set_description("(experimental) partition SST index/filters into smaller blocks")
+ .set_long_description("This is an experimental option for rocksdb that works in conjunction with two_level indices to avoid having to keep the entire filter/index in cache when cache_index_and_filter_blocks is true. The idea is to keep a much smaller top-level index in heap/cache and then opportunistically cache the lower level indices. See: https://github.com/facebook/rocksdb/wiki/Partitioned-Index-Filters"),
+
+ Option("rocksdb_metadata_block_size", Option::TYPE_UINT, Option::LEVEL_DEV)
+ .set_default(4096)
+ .set_description("The block size for index partitions. (0 = rocksdb default)"),
+
+ Option("mon_rocksdb_options", Option::TYPE_STR, Option::LEVEL_ADVANCED)
+ .set_default("write_buffer_size=33554432,compression=kNoCompression")
+ .set_description(""),
+
+ Option("osd_client_op_priority", Option::TYPE_UINT, Option::LEVEL_ADVANCED)
+ .set_default(63)
+ .set_description(""),
+
+ Option("osd_recovery_op_priority", Option::TYPE_UINT, Option::LEVEL_ADVANCED)
+ .set_default(3)
+ .set_description(""),
+
+ Option("osd_snap_trim_priority", Option::TYPE_UINT, Option::LEVEL_ADVANCED)
+ .set_default(5)
+ .set_description(""),
+
+ Option("osd_snap_trim_cost", Option::TYPE_UINT, Option::LEVEL_ADVANCED)
+ .set_default(1<<20)
+ .set_description(""),
+
+ Option("osd_scrub_priority", Option::TYPE_UINT, Option::LEVEL_ADVANCED)
+ .set_default(5)
+ .set_description(""),
+
+ Option("osd_scrub_cost", Option::TYPE_UINT, Option::LEVEL_ADVANCED)
+ .set_default(50<<20)
+ .set_description(""),
+
+ Option("osd_requested_scrub_priority", Option::TYPE_UINT, Option::LEVEL_ADVANCED)
+ .set_default(120)
+ .set_description(""),
+
+ Option("osd_recovery_priority", Option::TYPE_UINT, Option::LEVEL_ADVANCED)
+ .set_default(5)
+ .set_description(""),
+
+ Option("osd_recovery_cost", Option::TYPE_UINT, Option::LEVEL_ADVANCED)
+ .set_default(20<<20)
+ .set_description(""),
+
+ Option("osd_recovery_op_warn_multiple", Option::TYPE_UINT, Option::LEVEL_ADVANCED)
+ .set_default(16)
+ .set_description(""),
+
+ Option("osd_mon_shutdown_timeout", Option::TYPE_FLOAT, Option::LEVEL_ADVANCED)
+ .set_default(5)
+ .set_description(""),
+
+ Option("osd_shutdown_pgref_assert", Option::TYPE_BOOL, Option::LEVEL_ADVANCED)
+ .set_default(false)
+ .set_description(""),
+
+ Option("osd_max_object_size", Option::TYPE_UINT, Option::LEVEL_ADVANCED)
+ .set_default(128*1024L*1024L) // evaluates to 134217728; the L suffixes make the product a long rather than an int; presumably bytes (128 MiB) -- confirm
+ .set_description(""),
+
+ Option("osd_max_object_name_len", Option::TYPE_UINT, Option::LEVEL_ADVANCED)
+ .set_default(2048)
+ .set_description(""),
+
+ Option("osd_max_object_namespace_len", Option::TYPE_UINT, Option::LEVEL_ADVANCED)
+ .set_default(256)
+ .set_description(""),
+
+ Option("osd_max_attr_name_len", Option::TYPE_UINT, Option::LEVEL_ADVANCED)
+ .set_default(100)
+ .set_description(""),
+
+ Option("osd_max_attr_size", Option::TYPE_UINT, Option::LEVEL_ADVANCED)
+ .set_default(0)
+ .set_description(""),
+
+ Option("osd_max_omap_entries_per_request", Option::TYPE_UINT, Option::LEVEL_ADVANCED)
+ .set_default(131072)
+ .set_description(""),
+
+ Option("osd_max_omap_bytes_per_request", Option::TYPE_UINT, Option::LEVEL_ADVANCED)
+ .set_default(1<<30) // evaluates to 1073741824 (2^30); going by the option name this is a byte count, i.e. 1 GiB
+ .set_description(""),
+
+ Option("osd_objectstore", Option::TYPE_STR, Option::LEVEL_ADVANCED)
+ .set_default("filestore")
+ .set_description(""),
+
+ Option("osd_objectstore_tracing", Option::TYPE_BOOL, Option::LEVEL_ADVANCED)
+ .set_default(false)
+ .set_description(""),
+
+ Option("osd_objectstore_fuse", Option::TYPE_BOOL, Option::LEVEL_ADVANCED)
+ .set_default(false)
+ .set_description(""),
+
+ Option("osd_bench_small_size_max_iops", Option::TYPE_UINT, Option::LEVEL_ADVANCED)
+ .set_default(100)
+ .set_description(""),
+
+ Option("osd_bench_large_size_max_throughput", Option::TYPE_UINT, Option::LEVEL_ADVANCED)
+ .set_default(100 << 20) // evaluates to 104857600 (100 * 2^20); presumably bytes per second (100 MiB/s) -- confirm
+ .set_description(""),
+
+ Option("osd_bench_max_block_size", Option::TYPE_UINT, Option::LEVEL_ADVANCED)
+ .set_default(64 << 20) // evaluates to 67108864 (64 * 2^20); presumably bytes (64 MiB) -- confirm
+ .set_description(""),
+
+ Option("osd_bench_duration", Option::TYPE_UINT, Option::LEVEL_ADVANCED)
+ .set_default(30)
+ .set_description(""),
+
+ Option("osd_blkin_trace_all", Option::TYPE_BOOL, Option::LEVEL_ADVANCED)
+ .set_default(false)
+ .set_description(""),
+
+ Option("osdc_blkin_trace_all", Option::TYPE_BOOL, Option::LEVEL_ADVANCED)
+ .set_default(false)
+ .set_description(""),
+
+ Option("osd_discard_disconnected_ops", Option::TYPE_BOOL, Option::LEVEL_ADVANCED)
+ .set_default(true)
+ .set_description(""),
+
+ Option("memstore_device_bytes", Option::TYPE_UINT, Option::LEVEL_ADVANCED)
+ .set_default(1024*1024*1024) // evaluates to 1073741824 (2^30); going by the option name this is bytes, i.e. 1 GiB
+ .set_description(""),
+
+ Option("memstore_page_set", Option::TYPE_BOOL, Option::LEVEL_ADVANCED)
+ .set_default(true)
+ .set_description(""),
+
+ Option("memstore_page_size", Option::TYPE_UINT, Option::LEVEL_ADVANCED)
+ .set_default(64 << 10) // evaluates to 65536 (64 * 2^10); presumably bytes (64 KiB) -- confirm
+ .set_description(""),
+
+ Option("objectstore_blackhole", Option::TYPE_BOOL, Option::LEVEL_ADVANCED)
+ .set_default(false)
+ .set_description(""),
+
+ // --------------------------
+ // bluestore
+
+ Option("bdev_debug_inflight_ios", Option::TYPE_BOOL, Option::LEVEL_DEV) // bdev_* run: debug/inject entries are LEVEL_DEV, the rest LEVEL_ADVANCED
+ .set_default(false)
+ .set_description(""), // NOTE(review): every bdev_* description is still empty
+
+ Option("bdev_inject_crash", Option::TYPE_INT, Option::LEVEL_DEV)
+ .set_default(0)
+ .set_description(""),
+
+ Option("bdev_inject_crash_flush_delay", Option::TYPE_INT, Option::LEVEL_DEV)
+ .set_default(2) // unit not stated in this file — TODO confirm
+ .set_description(""),
+
+ Option("bdev_aio", Option::TYPE_BOOL, Option::LEVEL_ADVANCED)
+ .set_default(true)
+ .set_description(""),
+
+ Option("bdev_aio_poll_ms", Option::TYPE_INT, Option::LEVEL_ADVANCED)
+ .set_default(250) // the _ms suffix suggests milliseconds — TODO confirm
+ .set_description(""),
+
+ Option("bdev_aio_max_queue_depth", Option::TYPE_INT, Option::LEVEL_ADVANCED)
+ .set_default(1024)
+ .set_description(""),
+
+ Option("bdev_aio_reap_max", Option::TYPE_INT, Option::LEVEL_ADVANCED)
+ .set_default(16)
+ .set_description(""),
+
+ Option("bdev_block_size", Option::TYPE_INT, Option::LEVEL_ADVANCED)
+ .set_default(4096) // 4 * 2^10
+ .set_description(""),
+
+ Option("bdev_debug_aio", Option::TYPE_BOOL, Option::LEVEL_DEV)
+ .set_default(false)
+ .set_description(""),
+
+ Option("bdev_debug_aio_suicide_timeout", Option::TYPE_FLOAT, Option::LEVEL_DEV)
+ .set_default(60.0) // floating-point literal matches TYPE_FLOAT; unit not stated here
+ .set_description(""),
+
+ Option("bdev_nvme_unbind_from_kernel", Option::TYPE_BOOL, Option::LEVEL_ADVANCED)
+ .set_default(false)
+ .set_description(""),
+
+ Option("bdev_nvme_retry_count", Option::TYPE_INT, Option::LEVEL_ADVANCED)
+ .set_default(-1) // negative default; what -1 means is decided by the consumer, not visible here
+ .set_description(""),
+
+ Option("bluefs_alloc_size", Option::TYPE_UINT, Option::LEVEL_ADVANCED) // bluefs_* run; all descriptions still empty
+ .set_default(1048576) // 2^20
+ .set_description(""),
+
+ Option("bluefs_max_prefetch", Option::TYPE_UINT, Option::LEVEL_ADVANCED)
+ .set_default(1048576) // 2^20
+ .set_description(""),
+
+ Option("bluefs_min_log_runway", Option::TYPE_UINT, Option::LEVEL_ADVANCED)
+ .set_default(1048576) // 2^20
+ .set_description(""),
+
+ Option("bluefs_max_log_runway", Option::TYPE_UINT, Option::LEVEL_ADVANCED)
+ .set_default(4194304) // 4 * 2^20, i.e. four times the min_log_runway default
+ .set_description(""),
+
+ Option("bluefs_log_compact_min_ratio", Option::TYPE_FLOAT, Option::LEVEL_ADVANCED)
+ .set_default(5.0)
+ .set_description(""),
+
+ Option("bluefs_log_compact_min_size", Option::TYPE_UINT, Option::LEVEL_ADVANCED)
+ .set_default(16*1048576) // 16 * 2^20 = 16777216
+ .set_description(""),
+
+ Option("bluefs_min_flush_size", Option::TYPE_UINT, Option::LEVEL_ADVANCED)
+ .set_default(524288) // 512 * 2^10
+ .set_description(""),
+
+ Option("bluefs_compact_log_sync", Option::TYPE_BOOL, Option::LEVEL_ADVANCED)
+ .set_default(false)
+ .set_description(""),
+
+ Option("bluefs_buffered_io", Option::TYPE_BOOL, Option::LEVEL_ADVANCED)
+ .set_default(false)
+ .set_description(""),
+
+ Option("bluefs_sync_write", Option::TYPE_BOOL, Option::LEVEL_ADVANCED)
+ .set_default(false)
+ .set_description(""),
+
+ Option("bluefs_allocator", Option::TYPE_STR, Option::LEVEL_DEV) // the only LEVEL_DEV entry among the bluefs_* options
+ .set_default("bitmap") // same default string as bluestore_allocator; no set_enum_allowed list is attached
+ .set_description(""),
+
+ Option("bluefs_preextend_wal_files", Option::TYPE_BOOL, Option::LEVEL_ADVANCED)
+ .set_default(false)
+ .set_description(""),
+
+ Option("bluestore_bluefs", Option::TYPE_BOOL, Option::LEVEL_DEV) // master switch for the bluestore_bluefs_* family; on by default
+ .set_default(true)
+ .add_tag("mkfs") // tagged "mkfs", like bluestore_bluefs_env_mirror below
+ .set_description("Use BlueFS to back rocksdb")
+ .set_long_description("BlueFS allows rocksdb to share the same physical device(s) as the rest of BlueStore. It should be used in all cases unless testing/developing an alternative metadata database for BlueStore."),
+
+ Option("bluestore_bluefs_env_mirror", Option::TYPE_BOOL, Option::LEVEL_DEV)
+ .set_default(false)
+ .add_tag("mkfs")
+ .set_description("Mirror bluefs data to file system for testing/validation"),
+
+ Option("bluestore_bluefs_min", Option::TYPE_UINT, Option::LEVEL_ADVANCED)
+ .set_default(1*1024*1024*1024) // 2^30 = 1073741824
+ .set_description("minimum disk space allocated to BlueFS (e.g., at mkfs)"),
+
+ Option("bluestore_bluefs_min_ratio", Option::TYPE_FLOAT, Option::LEVEL_ADVANCED) // the four *_ratio options below are fractions of free space
+ .set_default(.02)
+ .set_description("Minimum fraction of free space devoted to BlueFS"),
+
+ Option("bluestore_bluefs_max_ratio", Option::TYPE_FLOAT, Option::LEVEL_ADVANCED)
+ .set_default(.90)
+ .set_description("Maximum fraction of free storage devoted to BlueFS"),
+
+ Option("bluestore_bluefs_gift_ratio", Option::TYPE_FLOAT, Option::LEVEL_ADVANCED)
+ .set_default(.02)
+ .set_description("Maximum fraction of free space to give to BlueFS at once"),
+
+ Option("bluestore_bluefs_reclaim_ratio", Option::TYPE_FLOAT, Option::LEVEL_ADVANCED)
+ .set_default(.20)
+ .set_description("Maximum fraction of free space to reclaim from BlueFS at once"),
+
+ Option("bluestore_bluefs_balance_interval", Option::TYPE_FLOAT, Option::LEVEL_ADVANCED)
+ .set_default(1) // integer literal on a TYPE_FLOAT option; the description gives the unit as seconds
+ .set_description("How frequently (in seconds) to balance free space between BlueFS and BlueStore"),
+
+ Option("bluestore_spdk_mem", Option::TYPE_UINT, Option::LEVEL_DEV) // bluestore_spdk_* run: all LEVEL_DEV, descriptions still empty
+ .set_default(512) // unit not stated in this file — TODO confirm
+ .set_description(""),
+
+ Option("bluestore_spdk_coremask", Option::TYPE_STR, Option::LEVEL_DEV)
+ .set_default("0x3") // stored as a string; as a hex mask, 0x3 has bits 0 and 1 set
+ .set_description(""),
+
+ Option("bluestore_spdk_max_io_completion", Option::TYPE_UINT, Option::LEVEL_DEV)
+ .set_default(0) // NOTE(review): whether 0 means "unlimited" is not visible here — confirm against the consumer
+ .set_description(""),
+
+ Option("bluestore_block_path", Option::TYPE_STR, Option::LEVEL_DEV) // main device: path / size / create triple; every entry in this run is LEVEL_DEV and tagged "mkfs"
+ .set_default("")
+ .add_tag("mkfs")
+ .set_description("Path to block device/file"),
+
+ Option("bluestore_block_size", Option::TYPE_UINT, Option::LEVEL_DEV)
+ .set_default(10ull * 1024*1024*1024) // 10 * 2^30; the ull suffix keeps the product 64-bit, since it does not fit in 32 bits
+ .add_tag("mkfs")
+ .set_description("Size of file to create for backing bluestore"),
+
+ Option("bluestore_block_create", Option::TYPE_BOOL, Option::LEVEL_DEV) // the only *_create flag that defaults to true
+ .set_default(true)
+ .add_tag("mkfs")
+ .set_description("Create bluestore_block_path if it doesn't exist")
+ .add_see_also("bluestore_block_path").add_see_also("bluestore_block_size"),
+
+ Option("bluestore_block_db_path", Option::TYPE_STR, Option::LEVEL_DEV) // db device: same path / size / create triple
+ .set_default("")
+ .add_tag("mkfs")
+ .set_description("Path for db block device"),
+
+ Option("bluestore_block_db_size", Option::TYPE_UINT, Option::LEVEL_DEV)
+ .set_default(0)
+ .add_tag("mkfs")
+ .set_description("Size of file to create for bluestore_block_db_path"),
+
+ Option("bluestore_block_db_create", Option::TYPE_BOOL, Option::LEVEL_DEV)
+ .set_default(false)
+ .add_tag("mkfs")
+ .set_description("Create bluestore_block_db_path if it doesn't exist")
+ .add_see_also("bluestore_block_db_path")
+ .add_see_also("bluestore_block_db_size"),
+
+ Option("bluestore_block_wal_path", Option::TYPE_STR, Option::LEVEL_DEV) // wal device: same path / size / create triple
+ .set_default("")
+ .add_tag("mkfs")
+ .set_description("Path to block device/file backing bluefs wal"),
+
+ Option("bluestore_block_wal_size", Option::TYPE_UINT, Option::LEVEL_DEV)
+ .set_default(96 * 1024*1024) // 96 * 2^20 = 100663296
+ .add_tag("mkfs")
+ .set_description("Size of file to create for bluestore_block_wal_path"),
+
+ Option("bluestore_block_wal_create", Option::TYPE_BOOL, Option::LEVEL_DEV)
+ .set_default(false)
+ .add_tag("mkfs")
+ .set_description("Create bluestore_block_wal_path if it doesn't exist")
+ .add_see_also("bluestore_block_wal_path")
+ .add_see_also("bluestore_block_wal_size"),
+
+ Option("bluestore_block_preallocate_file", Option::TYPE_BOOL, Option::LEVEL_DEV) // per its description, applies to files made by any of the three *_create flags
+ .set_default(false)
+ .add_tag("mkfs")
+ .set_description("Preallocate file created via bluestore_block*_create"),
+
+ Option("bluestore_csum_type", Option::TYPE_STR, Option::LEVEL_ADVANCED) // checksum family: all three entries call set_safe()
+ .set_default("crc32c") // must be one of the strings in the set_enum_allowed list on the next line
+ .set_enum_allowed({"none", "crc32c", "crc32c_16", "crc32c_8", "xxhash32", "xxhash64"})
+ .set_safe()
+ .set_description("Default checksum algorithm to use")
+ .set_long_description("crc32c, xxhash32, and xxhash64 are available. The _16 and _8 variants use only a subset of the bits for more compact (but less reliable) checksumming."),
+
+ Option("bluestore_csum_min_block", Option::TYPE_UINT, Option::LEVEL_ADVANCED)
+ .set_default(4096) // 4 * 2^10
+ .set_safe()
+ .set_description("Minimum block size to checksum")
+ .set_long_description("A larger checksum block means less checksum metadata to store, but results in read amplification when doing a read smaller than this size (because the entire block must be read to verify the checksum).")
+ .add_see_also("bluestore_csum_max_block"), // min and max cross-reference each other
+
+ Option("bluestore_csum_max_block", Option::TYPE_UINT, Option::LEVEL_ADVANCED)
+ .set_default(64*1024) // 2^16 = 65536
+ .set_safe()
+ .set_description("Maximum block size to checksum")
+ .add_see_also("bluestore_csum_min_block"),
+
+ Option("bluestore_min_alloc_size", Option::TYPE_UINT, Option::LEVEL_ADVANCED) // base option followed by _hdd / _ssd variants; the same shape recurs for prefer_deferred_size below
+ .set_default(0) // NOTE(review): 0 presumably means "use the _hdd/_ssd default below" — confirm against the consumer
+ .add_tag("mkfs")
+ .set_description("Minimum allocation size to allocate for an object")
+ .set_long_description("A smaller allocation size generally means less data is read and then rewritten when a copy-on-write operation is triggered (e.g., when writing to something that was recently snapshotted). Similarly, less data is journaled before performing an overwrite (writes smaller than min_alloc_size must first pass through the BlueStore journal). Larger values of min_alloc_size reduce the amount of metadata required to describe the on-disk layout and reduce overall fragmentation."),
+
+ Option("bluestore_min_alloc_size_hdd", Option::TYPE_UINT, Option::LEVEL_ADVANCED)
+ .set_default(64*1024) // 2^16 = 65536
+ .add_tag("mkfs")
+ .set_description("Default min_alloc_size value for rotational media"),
+
+ Option("bluestore_min_alloc_size_ssd", Option::TYPE_UINT, Option::LEVEL_ADVANCED)
+ .set_default(16*1024) // 2^14 = 16384
+ .add_tag("mkfs")
+ .set_description("Default min_alloc_size value for non-rotational (solid state) media"),
+
+ Option("bluestore_max_alloc_size", Option::TYPE_UINT, Option::LEVEL_ADVANCED)
+ .set_default(0) // per the description, 0 means no maximum
+ .add_tag("mkfs")
+ .set_description("Maximum size of a single allocation (0 for no max)"),
+
+ Option("bluestore_prefer_deferred_size", Option::TYPE_UINT, Option::LEVEL_ADVANCED) // unlike the alloc-size entries above, these three call set_safe() and carry no "mkfs" tag
+ .set_default(0)
+ .set_safe()
+ .set_description("Writes smaller than this size will be written to the journal and then asynchronously written to the device. This can be beneficial when using rotational media where seeks are expensive, and is helpful both with and without solid state journal/wal devices."),
+
+ Option("bluestore_prefer_deferred_size_hdd", Option::TYPE_UINT, Option::LEVEL_ADVANCED)
+ .set_default(32768) // 32 * 2^10
+ .set_safe()
+ .set_description("Default bluestore_prefer_deferred_size for rotational media"),
+
+ Option("bluestore_prefer_deferred_size_ssd", Option::TYPE_UINT, Option::LEVEL_ADVANCED)
+ .set_default(0)
+ .set_safe()
+ .set_description("Default bluestore_prefer_deferred_size for non-rotational (solid state) media"),
+
+ Option("bluestore_compression_mode", Option::TYPE_STR, Option::LEVEL_ADVANCED) // fallback compression policy, used when the pool sets none
+ .set_default("none")
+ .set_enum_allowed({"none", "passive", "aggressive", "force"}) // each value listed here is explained in the long description below
+ .set_safe()
+ .set_description("Default policy for using compression when pool does not specify")
+ .set_long_description("'none' means never use compression. 'passive' means use compression when clients hint that data is compressible. 'aggressive' means use compression unless clients hint that data is not compressible. 'force' means use compression in all cases, even if clients hint that data is not compressible. This option is used when the per-pool property for the compression mode is not present."),
+
+ Option("bluestore_compression_algorithm", Option::TYPE_STR, Option::LEVEL_ADVANCED) // every entry in this run calls set_safe()
+ .set_default("snappy")
+ .set_enum_allowed({"", "snappy", "zlib", "zstd", "lz4"}) // the empty string is an accepted value
+ .set_safe()
+ .set_description("Default compression algorithm to use when writing object data")
+ .set_long_description("This controls the default compressor to use (if any) if the per-pool property is not set. Note that zstd is *not* recommended for bluestore due to high CPU overhead when compressing small amounts of data."),
+
+ Option("bluestore_compression_min_blob_size", Option::TYPE_UINT, Option::LEVEL_ADVANCED) // base option plus _hdd / _ssd defaults
+ .set_default(0)
+ .set_safe()
+ .set_description("Chunks smaller than this are never compressed"),
+
+ Option("bluestore_compression_min_blob_size_hdd", Option::TYPE_UINT, Option::LEVEL_ADVANCED)
+ .set_default(128*1024) // 2^17 = 131072
+ .set_safe()
+ .set_description("Default value of bluestore_compression_min_blob_size for rotational media"),
+
+ Option("bluestore_compression_min_blob_size_ssd", Option::TYPE_UINT, Option::LEVEL_ADVANCED)
+ .set_default(8*1024) // 2^13 = 8192
+ .set_safe()
+ .set_description("Default value of bluestore_compression_min_blob_size for non-rotational (solid state) media"),
+
+ Option("bluestore_compression_max_blob_size", Option::TYPE_UINT, Option::LEVEL_ADVANCED) // base option plus _hdd / _ssd defaults
+ .set_default(0)
+ .set_safe()
+ .set_description("Chunks larger than this are broken into smaller chunks before being compressed"),
+
+ Option("bluestore_compression_max_blob_size_hdd", Option::TYPE_UINT, Option::LEVEL_ADVANCED)
+ .set_default(512*1024) // 2^19 = 524288
+ .set_safe()
+ .set_description("Default value of bluestore_compression_max_blob_size for rotational media"),
+
+ Option("bluestore_compression_max_blob_size_ssd", Option::TYPE_UINT, Option::LEVEL_ADVANCED)
+ .set_default(64*1024) // 2^16 = 65536
+ .set_safe()
+ .set_description("Default value of bluestore_compression_max_blob_size for non-rotational (solid state) media"),
+
+ Option("bluestore_gc_enable_blob_threshold", Option::TYPE_INT, Option::LEVEL_DEV) // the next five entries are LEVEL_DEV with empty descriptions
+ .set_default(0)
+ .set_safe()
+ .set_description(""),
+
+ Option("bluestore_gc_enable_total_threshold", Option::TYPE_INT, Option::LEVEL_DEV)
+ .set_default(0)
+ .set_safe()
+ .set_description(""),
+
+ Option("bluestore_max_blob_size", Option::TYPE_UINT, Option::LEVEL_DEV)
+ .set_default(0)
+ .set_safe()
+ .set_description(""),
+
+ Option("bluestore_max_blob_size_hdd", Option::TYPE_UINT, Option::LEVEL_DEV)
+ .set_default(512*1024) // 2^19 = 524288, same value as compression_max_blob_size_hdd
+ .set_safe()
+ .set_description(""),
+
+ Option("bluestore_max_blob_size_ssd", Option::TYPE_UINT, Option::LEVEL_DEV)
+ .set_default(64*1024) // 2^16 = 65536, same value as compression_max_blob_size_ssd
+ .set_safe()
+ .set_description(""),
+
+ Option("bluestore_compression_required_ratio", Option::TYPE_FLOAT, Option::LEVEL_ADVANCED)
+ .set_default(.875) // 7/8
+ .set_safe()
+ .set_description("Compression ratio required to store compressed data")
+ .set_long_description("If we compress data and get less than this we discard the result and store the original uncompressed data."),
+
+ Option("bluestore_extent_map_shard_max_size", Option::TYPE_UINT, Option::LEVEL_DEV) // shard sizing: defaults satisfy min (150) < target (500) < max (1200)
+ .set_default(1200)
+ .set_description("Max size (bytes) for a single extent map shard before splitting"),
+
+ Option("bluestore_extent_map_shard_target_size", Option::TYPE_UINT, Option::LEVEL_DEV)
+ .set_default(500)
+ .set_description("Target size (bytes) for a single extent map shard"),
+
+ Option("bluestore_extent_map_shard_min_size", Option::TYPE_UINT, Option::LEVEL_DEV)
+ .set_default(150)
+ .set_description("Min size (bytes) for a single extent map shard before merging"),
+
+ Option("bluestore_extent_map_shard_target_size_slop", Option::TYPE_FLOAT, Option::LEVEL_DEV)
+ .set_default(.2) // a ratio relative to the target size, per the description
+ .set_description("Ratio above/below target for a shard when trying to align to an existing extent or blob boundary"),
+
+ Option("bluestore_extent_map_inline_shard_prealloc_size", Option::TYPE_UINT, Option::LEVEL_DEV)
+ .set_default(256) // unit is not stated in the description; presumably bytes — TODO confirm
+ .set_description("Preallocated buffer for inline shards"),
+
+ Option("bluestore_cache_trim_interval", Option::TYPE_FLOAT, Option::LEVEL_ADVANCED)
+ .set_default(.2) // unit is not stated in the description; presumably seconds — TODO confirm
+ .set_description("How frequently we trim the bluestore cache"),
+
+ Option("bluestore_cache_trim_max_skip_pinned", Option::TYPE_UINT, Option::LEVEL_DEV)
+ .set_default(64)
+ .set_description("Max pinned cache entries we consider before giving up"),
+
+ Option("bluestore_cache_type", Option::TYPE_STR, Option::LEVEL_DEV)
+ .set_default("2q")
+ .set_enum_allowed({"2q", "lru"}) // only these two strings are accepted
+ .set_description("Cache replacement algorithm"),
+
+ Option("bluestore_2q_cache_kin_ratio", Option::TYPE_FLOAT, Option::LEVEL_DEV) // NOTE(review): kin and kout share the same description text; it does not say what each ratio controls
+ .set_default(.5)
+ .set_description("2Q paper suggests .5"),
+
+ Option("bluestore_2q_cache_kout_ratio", Option::TYPE_FLOAT, Option::LEVEL_DEV)
+ .set_default(.5)
+ .set_description("2Q paper suggests .5"),
+
+ Option("bluestore_cache_size", Option::TYPE_UINT, Option::LEVEL_ADVANCED) // base option plus _hdd / _ssd defaults
+ .set_default(0) // NOTE(review): 0 presumably selects the _hdd/_ssd default below — confirm against the consumer
+ .set_description("Cache size (in bytes) for BlueStore")
+ .set_long_description("This includes data and metadata cached by BlueStore as well as memory devoted to rocksdb's cache(s)."),
+
+ Option("bluestore_cache_size_hdd", Option::TYPE_UINT, Option::LEVEL_ADVANCED)
+ .set_default(1ull*1024*1024*1024) // 2^30 = 1073741824
+ .set_description("Default bluestore_cache_size for rotational media"),
+
+ Option("bluestore_cache_size_ssd", Option::TYPE_UINT, Option::LEVEL_ADVANCED)
+ .set_default(3ull*1024*1024*1024) // 3 * 2^30 = 3221225472; needs the ull suffix, since it exceeds a signed 32-bit int
+ .set_description("Default bluestore_cache_size for non-rotational (solid state) media"),
+
+ Option("bluestore_cache_meta_ratio", Option::TYPE_FLOAT, Option::LEVEL_ADVANCED)
+ .set_default(.01) // meta_ratio and kv_ratio defaults sum to 1.0
+ .set_description("Ratio of bluestore cache to devote to metadata"),
+
+ Option("bluestore_cache_kv_ratio", Option::TYPE_FLOAT, Option::LEVEL_ADVANCED)
+ .set_default(.99)
+ .set_description("Ratio of bluestore cache to devote to kv database (rocksdb)"),
+
+ Option("bluestore_cache_kv_max", Option::TYPE_UINT, Option::LEVEL_ADVANCED)
+ .set_default(512*1024*1024) // 2^29 = 536870912
+ .set_description("Max memory (bytes) to devote to kv database (rocksdb)"),
+
+ Option("bluestore_kvbackend", Option::TYPE_STR, Option::LEVEL_DEV)
+ .set_default("rocksdb") // free-form string: no set_enum_allowed list is attached
+ .add_tag("mkfs")
+ .set_description("Key value database to use for bluestore"),
+
+ Option("bluestore_allocator", Option::TYPE_STR, Option::LEVEL_DEV)
+ .set_default("bitmap") // same default string as bluefs_allocator
+ .add_tag("mkfs")
+ .set_description(""), // NOTE(review): description still empty
+
+ Option("bluestore_freelist_blocks_per_key", Option::TYPE_INT, Option::LEVEL_DEV)
+ .set_default(128)
+ .set_description("Block (and bits) per database key"),
+
+ Option("bluestore_bitmapallocator_blocks_per_zone", Option::TYPE_INT, Option::LEVEL_DEV)
+ .set_default(1024)
+ .set_description(""),
+
+ Option("bluestore_bitmapallocator_span_size", Option::TYPE_INT, Option::LEVEL_DEV)
+ .set_default(1024)
+ .set_description(""),
+
+ Option("bluestore_max_deferred_txc", Option::TYPE_UINT, Option::LEVEL_ADVANCED)
+ .set_default(32)
+ .set_description("Max transactions with deferred writes that can accumulate before we force flush deferred writes"),
+
+ Option("bluestore_rocksdb_options", Option::TYPE_STR, Option::LEVEL_ADVANCED) // default is a single comma-separated key=value string
+ .set_default("compression=kNoCompression,max_write_buffer_number=4,min_write_buffer_number_to_merge=1,recycle_log_file_num=4,write_buffer_size=268435456,writable_file_max_buffer_size=0,compaction_readahead_size=2097152")
+ .set_description("Rocksdb options"), // in the default string: write_buffer_size = 256 * 2^20, compaction_readahead_size = 2 * 2^20
+
+ Option("bluestore_fsck_on_mount", Option::TYPE_BOOL, Option::LEVEL_DEV) // fsck hooks come in pairs: on_<event> and on_<event>_deep, for mount, umount and mkfs
+ .set_default(false)
+ .set_description("Run fsck at mount"),
+
+ Option("bluestore_fsck_on_mount_deep", Option::TYPE_BOOL, Option::LEVEL_DEV)
+ .set_default(true) // NOTE(review): presumably only consulted when bluestore_fsck_on_mount is enabled — confirm
+ .set_description("Run deep fsck at mount"),
+
+ Option("bluestore_fsck_on_umount", Option::TYPE_BOOL, Option::LEVEL_DEV)
+ .set_default(false)
+ .set_description("Run fsck at umount"),
+
+ Option("bluestore_fsck_on_umount_deep", Option::TYPE_BOOL, Option::LEVEL_DEV)
+ .set_default(true)
+ .set_description("Run deep fsck at umount"),
+
+ Option("bluestore_fsck_on_mkfs", Option::TYPE_BOOL, Option::LEVEL_DEV) // mkfs is the only event whose plain fsck defaults to true and whose deep fsck defaults to false
+ .set_default(true)
+ .set_description("Run fsck after mkfs"),
+
+ Option("bluestore_fsck_on_mkfs_deep", Option::TYPE_BOOL, Option::LEVEL_DEV)
+ .set_default(false)
+ .set_description("Run deep fsck after mkfs"),
+
+ Option("bluestore_sync_submit_transaction", Option::TYPE_BOOL, Option::LEVEL_DEV) // not an fsck option; off by default
+ .set_default(false)
+ .set_description("Try to submit metadata transaction to rocksdb in queuing thread context"),
+
+ Option("bluestore_throttle_bytes", Option::TYPE_UINT, Option::LEVEL_ADVANCED) // throttle family: all three entries call set_safe()
+ .set_default(64*1024*1024) // 2^26 = 67108864
+ .set_safe()
+ .set_description("Maximum bytes in flight before we throttle IO submission"),
+
+ Option("bluestore_throttle_deferred_bytes", Option::TYPE_UINT, Option::LEVEL_ADVANCED)
+ .set_default(128*1024*1024) // 2^27 = 134217728, twice the bluestore_throttle_bytes default
+ .set_safe()
+ .set_description("Maximum bytes for deferred writes before we throttle IO submission"),
+
+ Option("bluestore_throttle_cost_per_io", Option::TYPE_UINT, Option::LEVEL_ADVANCED) // base option; the _hdd / _ssd defaults follow
+ .set_default(0) // NOTE(review): 0 presumably selects the _hdd/_ssd default — confirm against the consumer
+ .set_safe()
+ .set_description("Overhead added to transaction cost (in bytes) for each IO"),
+
+ Option("bluestore_throttle_cost_per_io_hdd", Option::TYPE_UINT, Option::LEVEL_ADVANCED) // rotational-media default for bluestore_throttle_cost_per_io
+ .set_default(670000)
+ .set_safe()
+ .set_description("Default bluestore_throttle_cost_per_io for rotational media"),
+
+ Option("bluestore_throttle_cost_per_io_ssd", Option::TYPE_UINT, Option::LEVEL_ADVANCED) // solid-state default for bluestore_throttle_cost_per_io
+ .set_default(4000)
+ .set_safe()
+ .set_description("Default bluestore_throttle_cost_per_io for non-rotational (solid state) media"),
+
+
+ Option("bluestore_deferred_batch_ops", Option::TYPE_UINT, Option::LEVEL_ADVANCED) // base option plus _hdd / _ssd defaults
+ .set_default(0) // NOTE(review): 0 presumably selects the _hdd/_ssd default below — confirm against the consumer
+ .set_safe()
+ .set_description("Max number of deferred writes before we flush the deferred write queue"),
+
+ Option("bluestore_deferred_batch_ops_hdd", Option::TYPE_UINT, Option::LEVEL_ADVANCED)
+ .set_default(64)
+ .set_safe()
+ .set_description("Default bluestore_deferred_batch_ops for rotational media"),
+
+ Option("bluestore_deferred_batch_ops_ssd", Option::TYPE_UINT, Option::LEVEL_ADVANCED)
+ .set_default(16)
+ .set_safe()
+ .set_description("Default bluestore_deferred_batch_ops for non-rotational (solid state) media"),
+
+ Option("bluestore_nid_prealloc", Option::TYPE_INT, Option::LEVEL_DEV) // NOTE(review): declared TYPE_INT while the sibling blobid_prealloc is TYPE_UINT
+ .set_default(1024)
+ .set_description("Number of unique object ids to preallocate at a time"),
+
+ Option("bluestore_blobid_prealloc", Option::TYPE_UINT, Option::LEVEL_DEV)
+ .set_default(10240) // ten times the nid_prealloc default
+ .set_description("Number of unique blob ids to preallocate at a time"),
+
+ Option("bluestore_clone_cow", Option::TYPE_BOOL, Option::LEVEL_ADVANCED)
+ .set_default(true)
+ .set_safe()
+ .set_description("Use copy-on-write when cloning objects (versus reading and rewriting them at clone time)"),
+
+ Option("bluestore_default_buffered_read", Option::TYPE_BOOL, Option::LEVEL_ADVANCED) // reads are cached by default, writes (next entry) are not
+ .set_default(true)
+ .set_safe()
+ .set_description("Cache read results by default (unless hinted NOCACHE or WONTNEED)"),
+
+ Option("bluestore_default_buffered_write", Option::TYPE_BOOL, Option::LEVEL_ADVANCED)
+ .set_default(false)
+ .set_safe()
+ .set_description("Cache writes by default (unless hinted NOCACHE or WONTNEED)"),
+
+ Option("bluestore_debug_misc", Option::TYPE_BOOL, Option::LEVEL_DEV) // debug run: every entry is LEVEL_DEV and defaults to off / zero, except prefragment_max
+ .set_default(false)
+ .set_description(""), // NOTE(review): only bluestore_debug_prefill has a description in this run
+
+ Option("bluestore_debug_no_reuse_blocks", Option::TYPE_BOOL, Option::LEVEL_DEV)
+ .set_default(false)
+ .set_description(""),
+
+ Option("bluestore_debug_small_allocations", Option::TYPE_INT, Option::LEVEL_DEV)
+ .set_default(0)
+ .set_description(""),
+
+ Option("bluestore_debug_freelist", Option::TYPE_BOOL, Option::LEVEL_DEV)
+ .set_default(false)
+ .set_description(""),
+
+ Option("bluestore_debug_prefill", Option::TYPE_FLOAT, Option::LEVEL_DEV)
+ .set_default(0) // integer literal on a TYPE_FLOAT option
+ .set_description("simulate fragmentation"),
+
+ Option("bluestore_debug_prefragment_max", Option::TYPE_INT, Option::LEVEL_DEV)
+ .set_default(1048576) // 2^20
+ .set_description(""),
+
+ Option("bluestore_debug_inject_read_err", Option::TYPE_BOOL, Option::LEVEL_DEV)
+ .set_default(false)
+ .set_description(""),
+
+ Option("bluestore_debug_randomize_serial_transaction", Option::TYPE_INT, Option::LEVEL_DEV)
+ .set_default(0)
+ .set_description(""),
+
+ Option("bluestore_debug_omit_block_device_write", Option::TYPE_BOOL, Option::LEVEL_DEV)
+ .set_default(false)
+ .set_description(""),
+
+ Option("bluestore_debug_fsck_abort", Option::TYPE_BOOL, Option::LEVEL_DEV)
+ .set_default(false)
+ .set_description(""),
+
+ Option("bluestore_debug_omit_kv_commit", Option::TYPE_BOOL, Option::LEVEL_DEV)
+ .set_default(false)
+ .set_description(""),
+
+ Option("bluestore_debug_permit_any_bdev_label", Option::TYPE_BOOL, Option::LEVEL_DEV)
+ .set_default(false)
+ .set_description(""),
+
+ Option("bluestore_shard_finishers", Option::TYPE_BOOL, Option::LEVEL_DEV) // not a bluestore_debug_* name, but sits inside the debug run
+ .set_default(false)
+ .set_description(""),
+
+ Option("bluestore_debug_random_read_err", Option::TYPE_FLOAT, Option::LEVEL_DEV)
+ .set_default(0) // integer literal on a TYPE_FLOAT option
+ .set_description(""),
+
+ // -----------------------------------------
+ // kstore
+
+ Option("kstore_max_ops", Option::TYPE_UINT, Option::LEVEL_ADVANCED) // kstore run: all LEVEL_ADVANCED, all descriptions still empty
+ .set_default(512)
+ .set_description(""),
+
+ Option("kstore_max_bytes", Option::TYPE_UINT, Option::LEVEL_ADVANCED)
+ .set_default(64*1024*1024) // 2^26 = 67108864
+ .set_description(""),
+
+ Option("kstore_backend", Option::TYPE_STR, Option::LEVEL_ADVANCED)
+ .set_default("rocksdb") // free-form string: no set_enum_allowed list is attached
+ .set_description(""),
+
+ Option("kstore_rocksdb_options", Option::TYPE_STR, Option::LEVEL_ADVANCED)
+ .set_default("compression=kNoCompression") // a single key=value pair, versus the longer bluestore_rocksdb_options default
+ .set_description(""),
+
+ Option("kstore_fsck_on_mount", Option::TYPE_BOOL, Option::LEVEL_ADVANCED)
+ .set_default(false)
+ .set_description(""),
+
+ Option("kstore_fsck_on_mount_deep", Option::TYPE_BOOL, Option::LEVEL_ADVANCED)
+ .set_default(true) // same false/true default pairing as bluestore_fsck_on_mount / _deep
+ .set_description(""),
+
+ Option("kstore_nid_prealloc", Option::TYPE_UINT, Option::LEVEL_ADVANCED)
+ .set_default(1024)
+ .set_description(""),
+
+ Option("kstore_sync_transaction", Option::TYPE_BOOL, Option::LEVEL_ADVANCED)
+ .set_default(false)
+ .set_description(""),
+
+ Option("kstore_sync_submit_transaction", Option::TYPE_BOOL, Option::LEVEL_ADVANCED)
+ .set_default(false)
+ .set_description(""),
+
+ Option("kstore_onode_map_size", Option::TYPE_UINT, Option::LEVEL_ADVANCED)
+ .set_default(1024)
+ .set_description(""),
+
+ Option("kstore_default_stripe_size", Option::TYPE_INT, Option::LEVEL_ADVANCED)
+ .set_default(65536) // 64 * 2^10
+ .set_description(""),
+
+ // ---------------------
+ // filestore
+
+ Option("filestore_rocksdb_options", Option::TYPE_STR, Option::LEVEL_ADVANCED) // filestore run: descriptions still empty throughout
+ .set_default("")
+ .set_description(""),
+
+ Option("filestore_omap_backend", Option::TYPE_STR, Option::LEVEL_ADVANCED)
+ .set_default("rocksdb") // free-form string: no set_enum_allowed list is attached
+ .set_description(""),
+
+ Option("filestore_omap_backend_path", Option::TYPE_STR, Option::LEVEL_ADVANCED)
+ .set_default("")
+ .set_description(""),
+
+ Option("filestore_wbthrottle_enable", Option::TYPE_BOOL, Option::LEVEL_ADVANCED) // the wbthrottle limits follow as parallel btrfs / xfs sets with identical defaults
+ .set_default(true)
+ .set_description(""),
+
+ Option("filestore_wbthrottle_btrfs_bytes_start_flusher", Option::TYPE_UINT, Option::LEVEL_ADVANCED)
+ .set_default(41943040) // 40 * 2^20
+ .set_description(""),
+
+ Option("filestore_wbthrottle_btrfs_bytes_hard_limit", Option::TYPE_UINT, Option::LEVEL_ADVANCED)
+ .set_default(419430400) // 400 * 2^20, ten times the start_flusher default
+ .set_description(""),
+
+ Option("filestore_wbthrottle_btrfs_ios_start_flusher", Option::TYPE_UINT, Option::LEVEL_ADVANCED)
+ .set_default(500)
+ .set_description(""),
+
+ Option("filestore_wbthrottle_btrfs_ios_hard_limit", Option::TYPE_UINT, Option::LEVEL_ADVANCED)
+ .set_default(5000)
+ .set_description(""),
+
+ Option("filestore_wbthrottle_btrfs_inodes_start_flusher", Option::TYPE_UINT, Option::LEVEL_ADVANCED) // its matching inodes_hard_limit entry appears after the xfs set below
+ .set_default(500)
+ .set_description(""),
+
+ Option("filestore_wbthrottle_xfs_bytes_start_flusher", Option::TYPE_UINT, Option::LEVEL_ADVANCED)
+ .set_default(41943040) // 40 * 2^20
+ .set_description(""),
+
+ Option("filestore_wbthrottle_xfs_bytes_hard_limit", Option::TYPE_UINT, Option::LEVEL_ADVANCED)
+ .set_default(419430400) // 400 * 2^20
+ .set_description(""),
+
+ Option("filestore_wbthrottle_xfs_ios_start_flusher", Option::TYPE_UINT, Option::LEVEL_ADVANCED)
+ .set_default(500)
+ .set_description(""),
+
+ Option("filestore_wbthrottle_xfs_ios_hard_limit", Option::TYPE_UINT, Option::LEVEL_ADVANCED)
+ .set_default(5000)
+ .set_description(""),
+
+ Option("filestore_wbthrottle_xfs_inodes_start_flusher", Option::TYPE_UINT, Option::LEVEL_ADVANCED)
+ .set_default(500)
+ .set_description(""),
+
+ Option("filestore_wbthrottle_btrfs_inodes_hard_limit", Option::TYPE_UINT, Option::LEVEL_ADVANCED) // btrfs inodes hard limit, listed out of sequence next to the xfs one
+ .set_default(5000)
+ .set_description(""),
+
+ Option("filestore_wbthrottle_xfs_inodes_hard_limit", Option::TYPE_UINT, Option::LEVEL_ADVANCED)
+ .set_default(5000)
+ .set_description(""),
+
+ Option("filestore_odsync_write", Option::TYPE_BOOL, Option::LEVEL_ADVANCED) // off by default; description still empty, as for the rest of this run
+ .set_default(false)
+ .set_description(""),
+
+ Option("filestore_index_retry_probability", Option::TYPE_FLOAT, Option::LEVEL_ADVANCED)
+ .set_default(0) // integer literal on a TYPE_FLOAT option
+ .set_description(""),
+
+ Option("filestore_debug_inject_read_err", Option::TYPE_BOOL, Option::LEVEL_DEV) // the three filestore_debug_* entries are LEVEL_DEV
+ .set_default(false)
+ .set_description(""),
+
+ Option("filestore_debug_random_read_err", Option::TYPE_FLOAT, Option::LEVEL_DEV)
+ .set_default(0) // integer literal on a TYPE_FLOAT option
+ .set_description(""),
+
+ Option("filestore_debug_omap_check", Option::TYPE_BOOL, Option::LEVEL_DEV)
+ .set_default(false)
+ .set_description(""),
+
+ Option("filestore_omap_header_cache_size", Option::TYPE_INT, Option::LEVEL_ADVANCED)
+ .set_default(1024) // unit (entries vs. bytes) is not stated here — TODO confirm
+ .set_description(""),
+
+ Option("filestore_max_inline_xattr_size", Option::TYPE_UINT, Option::LEVEL_ADVANCED) // three xattr families, each a base option plus _xfs / _btrfs / _other variants
+ .set_default(0) // NOTE(review): 0 on the base option presumably defers to the per-filesystem variant — confirm against the consumer
+ .set_description(""),
+
+ Option("filestore_max_inline_xattr_size_xfs", Option::TYPE_UINT, Option::LEVEL_ADVANCED)
+ .set_default(65536) // 64 * 2^10
+ .set_description(""),
+
+ Option("filestore_max_inline_xattr_size_btrfs", Option::TYPE_UINT, Option::LEVEL_ADVANCED)
+ .set_default(2048) // 2 * 2^10
+ .set_description(""),
+
+ Option("filestore_max_inline_xattr_size_other", Option::TYPE_UINT, Option::LEVEL_ADVANCED)
+ .set_default(512)
+ .set_description(""),
+
+ Option("filestore_max_inline_xattrs", Option::TYPE_UINT, Option::LEVEL_ADVANCED) // second family: a count rather than a size, going by the name
+ .set_default(0)
+ .set_description(""),
+
+ Option("filestore_max_inline_xattrs_xfs", Option::TYPE_UINT, Option::LEVEL_ADVANCED)
+ .set_default(10)
+ .set_description(""),
+
+ Option("filestore_max_inline_xattrs_btrfs", Option::TYPE_UINT, Option::LEVEL_ADVANCED)
+ .set_default(10)
+ .set_description(""),
+
+ Option("filestore_max_inline_xattrs_other", Option::TYPE_UINT, Option::LEVEL_ADVANCED)
+ .set_default(2)
+ .set_description(""),
+
+ Option("filestore_max_xattr_value_size", Option::TYPE_UINT, Option::LEVEL_ADVANCED) // third family: defaults written as shifts
+ .set_default(0)
+ .set_description(""),
+
+ Option("filestore_max_xattr_value_size_xfs", Option::TYPE_UINT, Option::LEVEL_ADVANCED)
+ .set_default(64<<10) // 64 * 2^10 = 65536
+ .set_description(""),
+
+ Option("filestore_max_xattr_value_size_btrfs", Option::TYPE_UINT, Option::LEVEL_ADVANCED)
+ .set_default(64<<10) // 64 * 2^10 = 65536
+ .set_description(""),
+
+ Option("filestore_max_xattr_value_size_other", Option::TYPE_UINT, Option::LEVEL_ADVANCED)
+ .set_default(1<<10) // 2^10 = 1024
+ .set_description(""),
+
+ Option("filestore_sloppy_crc", Option::TYPE_BOOL, Option::LEVEL_ADVANCED) // all LEVEL_ADVANCED with empty descriptions in this run
+ .set_default(false)
+ .set_description(""),
+
+ Option("filestore_sloppy_crc_block_size", Option::TYPE_INT, Option::LEVEL_ADVANCED)
+ .set_default(65536) // 64 * 2^10
+ .set_description(""),
+
+ Option("filestore_max_alloc_hint_size", Option::TYPE_UINT, Option::LEVEL_ADVANCED)
+ .set_default(1ULL << 20) // 2^20 = 1048576, as an unsigned long long literal
+ .set_description(""),
+
+ Option("filestore_max_sync_interval", Option::TYPE_FLOAT, Option::LEVEL_ADVANCED)
+ .set_default(5) // integer literal on a TYPE_FLOAT option; unit not stated here, presumably seconds — TODO confirm
+ .set_description(""),
+
+ Option("filestore_min_sync_interval", Option::TYPE_FLOAT, Option::LEVEL_ADVANCED)
+ .set_default(.01)
+ .set_description(""),
+
+ Option("filestore_btrfs_snap", Option::TYPE_BOOL, Option::LEVEL_ADVANCED) // the two btrfs toggles default to on; filestore_zfs_snap defaults to off
+ .set_default(true)
+ .set_description(""),
+
+ Option("filestore_btrfs_clone_range", Option::TYPE_BOOL, Option::LEVEL_ADVANCED)
+ .set_default(true)
+ .set_description(""),
+
+ Option("filestore_zfs_snap", Option::TYPE_BOOL, Option::LEVEL_ADVANCED)
+ .set_default(false)
+ .set_description(""),
+
+ Option("filestore_fsync_flushes_journal_data", Option::TYPE_BOOL, Option::LEVEL_ADVANCED)
+ .set_default(false)
+ .set_description(""),
+
+ Option("filestore_fiemap", Option::TYPE_BOOL, Option::LEVEL_ADVANCED) // fiemap, punch_hole, seek_data_hole and splice all default to off
+ .set_default(false)
+ .set_description(""),
+
+ Option("filestore_punch_hole", Option::TYPE_BOOL, Option::LEVEL_ADVANCED)
+ .set_default(false)
+ .set_description(""),
+
+ Option("filestore_seek_data_hole", Option::TYPE_BOOL, Option::LEVEL_ADVANCED)
+ .set_default(false)
+ .set_description(""),
+
+ Option("filestore_splice", Option::TYPE_BOOL, Option::LEVEL_ADVANCED)
+ .set_default(false)
+ .set_description(""),
+
+ Option("filestore_fadvise", Option::TYPE_BOOL, Option::LEVEL_ADVANCED)
+ .set_default(true)
+ .set_description(""),
+
+ Option("filestore_collect_device_partition_information", Option::TYPE_BOOL, Option::LEVEL_ADVANCED)
+ .set_default(true)
+ .set_description(""),
+
+ Option("filestore_xfs_extsize", Option::TYPE_BOOL, Option::LEVEL_ADVANCED)
+ .set_default(false)
+ .set_description(""),
+
+ Option("filestore_journal_parallel", Option::TYPE_BOOL, Option::LEVEL_ADVANCED) // three journal-mode booleans, all off by default; NOTE(review): how a mode is chosen when none is set is not visible here
+ .set_default(false)
+ .set_description(""),
+
+ Option("filestore_journal_writeahead", Option::TYPE_BOOL, Option::LEVEL_ADVANCED)
+ .set_default(false)
+ .set_description(""),
+
+ Option("filestore_journal_trailing", Option::TYPE_BOOL, Option::LEVEL_ADVANCED)
+ .set_default(false)
+ .set_description(""),
+
+ Option("filestore_queue_max_ops", Option::TYPE_UINT, Option::LEVEL_ADVANCED)
+ .set_default(50)
+ .set_description(""),
+
+ Option("filestore_queue_max_bytes", Option::TYPE_UINT, Option::LEVEL_ADVANCED)
+ .set_default(100 << 20) // == 104857600 (100 MiB, going by the _bytes suffix)
+ .set_description(""),
+
+ Option("filestore_caller_concurrency", Option::TYPE_INT, Option::LEVEL_ADVANCED)
+ .set_default(10)
+ .set_description(""),
+
+ Option("filestore_expected_throughput_bytes", Option::TYPE_FLOAT, Option::LEVEL_ADVANCED)
+ .set_default(200 << 20) // integer expression == 209715200, supplied to a TYPE_FLOAT option
+ .set_description(""),
+
+ Option("filestore_expected_throughput_ops", Option::TYPE_FLOAT, Option::LEVEL_ADVANCED)
+ .set_default(200)
+ .set_description(""),
+
+ Option("filestore_queue_max_delay_multiple", Option::TYPE_FLOAT, Option::LEVEL_ADVANCED)
+ .set_default(0)
+ .set_description(""),
+
+ Option("filestore_queue_high_delay_multiple", Option::TYPE_FLOAT, Option::LEVEL_ADVANCED)
+ .set_default(0)
+ .set_description(""),
+
+ Option("filestore_queue_low_threshhold", Option::TYPE_FLOAT, Option::LEVEL_ADVANCED)
+ .set_default(0.3)
+ .set_description(""),
+
+ Option("filestore_queue_high_threshhold", Option::TYPE_FLOAT, Option::LEVEL_ADVANCED)
+ .set_default(0.9)
+ .set_description(""),
+
+ Option("filestore_op_threads", Option::TYPE_INT, Option::LEVEL_ADVANCED)
+ .set_default(2)
+ .set_description(""),
+
+ Option("filestore_op_thread_timeout", Option::TYPE_INT, Option::LEVEL_ADVANCED)
+ .set_default(60)
+ .set_description(""),
+
+ Option("filestore_op_thread_suicide_timeout", Option::TYPE_INT, Option::LEVEL_ADVANCED)
+ .set_default(180)
+ .set_description(""),
+
+ Option("filestore_commit_timeout", Option::TYPE_FLOAT, Option::LEVEL_ADVANCED)
+ .set_default(600)
+ .set_description(""),
+
+ Option("filestore_fiemap_threshold", Option::TYPE_INT, Option::LEVEL_ADVANCED)
+ .set_default(4096)
+ .set_description(""),
+
+ Option("filestore_merge_threshold", Option::TYPE_INT, Option::LEVEL_ADVANCED)
+ .set_default(10)
+ .set_description(""),
+
+ Option("filestore_split_multiple", Option::TYPE_INT, Option::LEVEL_ADVANCED)
+ .set_default(2)
+ .set_description(""),
+
+ Option("filestore_split_rand_factor", Option::TYPE_UINT, Option::LEVEL_ADVANCED)
+ .set_default(20)
+ .set_description(""),
+
+ Option("filestore_update_to", Option::TYPE_INT, Option::LEVEL_ADVANCED)
+ .set_default(1000)
+ .set_description(""),
+
+ Option("filestore_blackhole", Option::TYPE_BOOL, Option::LEVEL_ADVANCED)
+ .set_default(false)
+ .set_description(""),
+
+ Option("filestore_fd_cache_size", Option::TYPE_INT, Option::LEVEL_ADVANCED)
+ .set_default(128)
+ .set_description(""),
+
+ Option("filestore_fd_cache_shards", Option::TYPE_INT, Option::LEVEL_ADVANCED)
+ .set_default(16)
+ .set_description(""),
+
+ Option("filestore_ondisk_finisher_threads", Option::TYPE_INT, Option::LEVEL_ADVANCED)
+ .set_default(1)
+ .set_description(""),
+
+ Option("filestore_apply_finisher_threads", Option::TYPE_INT, Option::LEVEL_ADVANCED)
+ .set_default(1)
+ .set_description(""),
+
+ Option("filestore_dump_file", Option::TYPE_STR, Option::LEVEL_ADVANCED)
+ .set_default("")
+ .set_description(""),
+
+ Option("filestore_kill_at", Option::TYPE_INT, Option::LEVEL_DEV)
+ .set_default(0)
+ .set_description(""),
+
+ Option("filestore_inject_stall", Option::TYPE_INT, Option::LEVEL_DEV)
+ .set_default(0)
+ .set_description(""),
+
+ Option("filestore_fail_eio", Option::TYPE_BOOL, Option::LEVEL_ADVANCED)
+ .set_default(true)
+ .set_description(""),
+
+ Option("filestore_debug_verify_split", Option::TYPE_BOOL, Option::LEVEL_DEV)
+ .set_default(false)
+ .set_description(""),
+
+ Option("journal_dio", Option::TYPE_BOOL, Option::LEVEL_ADVANCED)
+ .set_default(true)
+ .set_description(""),
+
+ Option("journal_aio", Option::TYPE_BOOL, Option::LEVEL_ADVANCED)
+ .set_default(true)
+ .set_description(""),
+
+ Option("journal_force_aio", Option::TYPE_BOOL, Option::LEVEL_ADVANCED)
+ .set_default(false)
+ .set_description(""),
+
+ Option("journal_block_size", Option::TYPE_INT, Option::LEVEL_ADVANCED)
+ .set_default(4096)
+ .set_description(""),
+
+ Option("journal_max_corrupt_search", Option::TYPE_UINT, Option::LEVEL_ADVANCED)
+ .set_default(10<<20) // == 10485760 (10 MiB if the unit is bytes -- TODO confirm)
+ .set_description(""),
+
+ Option("journal_block_align", Option::TYPE_BOOL, Option::LEVEL_ADVANCED)
+ .set_default(true)
+ .set_description(""),
+
+ Option("journal_write_header_frequency", Option::TYPE_UINT, Option::LEVEL_ADVANCED)
+ .set_default(0)
+ .set_description(""),
+
+ Option("journal_max_write_bytes", Option::TYPE_INT, Option::LEVEL_ADVANCED)
+ .set_default(10 << 20) // == 10485760 (10 MiB, going by the _bytes suffix)
+ .set_description(""),
+
+ Option("journal_max_write_entries", Option::TYPE_INT, Option::LEVEL_ADVANCED)
+ .set_default(100)
+ .set_description(""),
+
+ Option("journal_throttle_low_threshhold", Option::TYPE_FLOAT, Option::LEVEL_ADVANCED)
+ .set_default(0.6)
+ .set_description(""),
+
+ Option("journal_throttle_high_threshhold", Option::TYPE_FLOAT, Option::LEVEL_ADVANCED)
+ .set_default(0.9)
+ .set_description(""),
+
+ Option("journal_throttle_high_multiple", Option::TYPE_FLOAT, Option::LEVEL_ADVANCED)
+ .set_default(0)
+ .set_description(""),
+
+ Option("journal_throttle_max_multiple", Option::TYPE_FLOAT, Option::LEVEL_ADVANCED)
+ .set_default(0)
+ .set_description(""),
+
+ Option("journal_align_min_size", Option::TYPE_INT, Option::LEVEL_ADVANCED)
+ .set_default(64 << 10) // == 65536 (64 KiB if the unit is bytes -- TODO confirm)
+ .set_description(""),
+
+ Option("journal_replay_from", Option::TYPE_INT, Option::LEVEL_ADVANCED)
+ .set_default(0)
+ .set_description(""),
+
+ Option("journal_zero_on_create", Option::TYPE_BOOL, Option::LEVEL_ADVANCED)
+ .set_default(false)
+ .set_description(""),
+
+ Option("journal_ignore_corruption", Option::TYPE_BOOL, Option::LEVEL_ADVANCED)
+ .set_default(false)
+ .set_description(""),
+
+ Option("journal_discard", Option::TYPE_BOOL, Option::LEVEL_ADVANCED)
+ .set_default(false)
+ .set_description(""),
+
+ Option("fio_dir", Option::TYPE_STR, Option::LEVEL_ADVANCED)
+ .set_default("/tmp/fio")
+ .set_description(""),
+
+ Option("rados_mon_op_timeout", Option::TYPE_FLOAT, Option::LEVEL_ADVANCED)
+ .set_default(0)
+ .set_description(""),
+
+ Option("rados_osd_op_timeout", Option::TYPE_FLOAT, Option::LEVEL_ADVANCED)
+ .set_default(0)
+ .set_description(""),
+
+ Option("rados_tracing", Option::TYPE_BOOL, Option::LEVEL_ADVANCED)
+ .set_default(false)
+ .set_description(""),
+
+ Option("nss_db_path", Option::TYPE_STR, Option::LEVEL_ADVANCED)
+ .set_default("")
+ .set_description(""),
+
+ Option("mgr_module_path", Option::TYPE_STR, Option::LEVEL_ADVANCED)
+ .set_default(CEPH_PKGLIBDIR "/mgr") // adjacent-literal concatenation; CEPH_PKGLIBDIR is presumably a build-defined string macro -- confirm
+ .set_description(""),
+
+ Option("mgr_initial_modules", Option::TYPE_STR, Option::LEVEL_ADVANCED)
+ .set_default("restful status")
+ .set_description(""),
+
+ Option("mgr_data", Option::TYPE_STR, Option::LEVEL_ADVANCED)
+ .set_default("/var/lib/ceph/mgr/$cluster-$id")
+ .set_description(""),
+
+ Option("mgr_tick_period", Option::TYPE_INT, Option::LEVEL_ADVANCED)
+ .set_default(2)
+ .set_description(""),
+
+ Option("mgr_stats_period", Option::TYPE_INT, Option::LEVEL_ADVANCED)
+ .set_default(5)
+ .set_description(""),
+
+ Option("mgr_client_bytes", Option::TYPE_UINT, Option::LEVEL_ADVANCED)
+ .set_default(128*1048576) // == 134217728 (128 MiB, going by the _bytes suffix)
+ .set_description(""),
+
+ Option("mgr_client_messages", Option::TYPE_UINT, Option::LEVEL_ADVANCED)
+ .set_default(512)
+ .set_description(""),
+
+ Option("mgr_osd_bytes", Option::TYPE_UINT, Option::LEVEL_ADVANCED)
+ .set_default(512*1048576) // == 536870912 (512 MiB, going by the _bytes suffix); still below 2^31
+ .set_description(""),
+
+ Option("mgr_osd_messages", Option::TYPE_UINT, Option::LEVEL_ADVANCED)
+ .set_default(8192)
+ .set_description(""),
+
+ Option("mgr_mds_bytes", Option::TYPE_UINT, Option::LEVEL_ADVANCED)
+ .set_default(128*1048576) // == 134217728 (128 MiB, going by the _bytes suffix)
+ .set_description(""),
+
+ Option("mgr_mds_messages", Option::TYPE_UINT, Option::LEVEL_ADVANCED)
+ .set_default(128)
+ .set_description(""),
+
+ Option("mgr_mon_bytes", Option::TYPE_UINT, Option::LEVEL_ADVANCED)
+ .set_default(128*1048576) // == 134217728 (128 MiB, going by the _bytes suffix)
+ .set_description(""),
+
+ Option("mgr_mon_messages", Option::TYPE_UINT, Option::LEVEL_ADVANCED)
+ .set_default(128)
+ .set_description(""),
+
+ Option("mgr_connect_retry_interval", Option::TYPE_FLOAT, Option::LEVEL_ADVANCED)
+ .set_default(1.0)
+ .set_description(""),
+
+ Option("mgr_service_beacon_grace", Option::TYPE_FLOAT, Option::LEVEL_ADVANCED)
+ .set_default(60.0)
+ .set_description(""),
+
+ Option("mon_mgr_digest_period", Option::TYPE_INT, Option::LEVEL_ADVANCED)
+ .set_default(5)
+ .set_description(""),
+
+ Option("mon_mgr_beacon_grace", Option::TYPE_INT, Option::LEVEL_ADVANCED)
+ .set_default(30)
+ .set_description(""),
+
+ Option("mon_mgr_inactive_grace", Option::TYPE_INT, Option::LEVEL_ADVANCED)
+ .set_default(60)
+ .set_description(""),
+
+ Option("mon_mgr_mkfs_grace", Option::TYPE_INT, Option::LEVEL_ADVANCED)
+ .set_default(60)
+ .set_description(""),
+
+ Option("mutex_perf_counter", Option::TYPE_BOOL, Option::LEVEL_ADVANCED)
+ .set_default(false)
+ .set_description(""),
+
+ Option("throttler_perf_counter", Option::TYPE_BOOL, Option::LEVEL_ADVANCED)
+ .set_default(true)
+ .set_description(""),
+
+ Option("event_tracing", Option::TYPE_BOOL, Option::LEVEL_ADVANCED)
+ .set_default(false)
+ .set_description(""),
+
+ Option("internal_safe_to_start_threads", Option::TYPE_BOOL, Option::LEVEL_ADVANCED)
+ .set_default(false)
+ .set_description(""),
+
+ Option("debug_deliberately_leak_memory", Option::TYPE_BOOL, Option::LEVEL_DEV)
+ .set_default(false)
+ .set_description(""),
+};
+
+std::vector<Option> rgw_options = {
+ Option("rgw_acl_grants_max_num", Option::TYPE_INT, Option::LEVEL_ADVANCED)
+ .set_default(100)
+ .set_description(""),
+
+ Option("rgw_max_chunk_size", Option::TYPE_INT, Option::LEVEL_ADVANCED)
+ .set_default(4 * 1024 * 1024) // == 4194304 (4 MiB if the unit is bytes -- TODO confirm)
+ .set_description(""),
+
+ Option("rgw_put_obj_min_window_size", Option::TYPE_INT, Option::LEVEL_ADVANCED)
+ .set_default(16 * 1024 * 1024) // == 16777216 (16 MiB if the unit is bytes -- TODO confirm)
+ .set_description(""),
+
+ Option("rgw_put_obj_max_window_size", Option::TYPE_INT, Option::LEVEL_ADVANCED)
+ .set_default(64 * 1024 * 1024) // == 67108864 (64 MiB if the unit is bytes -- TODO confirm)
+ .set_description(""),
+
+ Option("rgw_max_put_size", Option::TYPE_UINT, Option::LEVEL_ADVANCED)
+ .set_default(5ULL*1024*1024*1024) // == 5368709120; the ULL suffix makes the product unsigned long long, since the value exceeds 32-bit int range
+ .set_description(""),
+
+ Option("rgw_max_put_param_size", Option::TYPE_UINT, Option::LEVEL_ADVANCED)
+ .set_default(1 * 1024 * 1024) // == 1048576 (1 MiB if the unit is bytes -- TODO confirm)
+ .set_description(""),
+
+ Option("rgw_override_bucket_index_max_shards", Option::TYPE_UINT, Option::LEVEL_ADVANCED)
+ .set_default(0)
+ .set_description(""),
+
+ Option("rgw_bucket_index_max_aio", Option::TYPE_UINT, Option::LEVEL_ADVANCED)
+ .set_default(8)
+ .set_description(""),
+
+ Option("rgw_enable_quota_threads", Option::TYPE_BOOL, Option::LEVEL_ADVANCED)
+ .set_default(true)
+ .set_description(""),
+
+ Option("rgw_enable_gc_threads", Option::TYPE_BOOL, Option::LEVEL_ADVANCED)
+ .set_default(true)
+ .set_description(""),
+
+ Option("rgw_enable_lc_threads", Option::TYPE_BOOL, Option::LEVEL_ADVANCED)
+ .set_default(true)
+ .set_description(""),
+
+ Option("rgw_data", Option::TYPE_STR, Option::LEVEL_ADVANCED)
+ .set_default("/var/lib/ceph/radosgw/$cluster-$id")
+ .set_description(""),
+
+ Option("rgw_enable_apis", Option::TYPE_STR, Option::LEVEL_ADVANCED)
+ .set_default("s3, s3website, swift, swift_auth, admin")
+ .set_description(""),
+
+ Option("rgw_cache_enabled", Option::TYPE_BOOL, Option::LEVEL_ADVANCED)
+ .set_default(true)
+ .set_description(""),
+
+ Option("rgw_cache_lru_size", Option::TYPE_INT, Option::LEVEL_ADVANCED)
+ .set_default(10000)
+ .set_description(""),
+
+ Option("rgw_socket_path", Option::TYPE_STR, Option::LEVEL_ADVANCED)
+ .set_default("")
+ .set_description(""),
+
+ Option("rgw_host", Option::TYPE_STR, Option::LEVEL_ADVANCED)
+ .set_default("")
+ .set_description(""),
+
+ Option("rgw_port", Option::TYPE_STR, Option::LEVEL_ADVANCED)
+ .set_default("")
+ .set_description(""),
+
+ Option("rgw_dns_name", Option::TYPE_STR, Option::LEVEL_ADVANCED)
+ .set_default("")
+ .set_description(""),
+
+ Option("rgw_dns_s3website_name", Option::TYPE_STR, Option::LEVEL_ADVANCED)
+ .set_default("")
+ .set_description(""),
+
+ Option("rgw_content_length_compat", Option::TYPE_BOOL, Option::LEVEL_ADVANCED)
+ .set_default(false)
+ .set_description(""),
+
+ Option("rgw_lifecycle_work_time", Option::TYPE_STR, Option::LEVEL_ADVANCED)
+ .set_default("00:00-06:00")
+ .set_description(""),
+
+ Option("rgw_lc_lock_max_time", Option::TYPE_INT, Option::LEVEL_ADVANCED)
+ .set_default(60)
+ .set_description(""),
+
+ Option("rgw_lc_max_objs", Option::TYPE_INT, Option::LEVEL_ADVANCED)
+ .set_default(32)
+ .set_description(""),
+
+ Option("rgw_lc_debug_interval", Option::TYPE_INT, Option::LEVEL_DEV)
+ .set_default(-1)
+ .set_description(""),
+
+ Option("rgw_script_uri", Option::TYPE_STR, Option::LEVEL_ADVANCED)
+ .set_default("")
+ .set_description(""),
+
+ Option("rgw_request_uri", Option::TYPE_STR, Option::LEVEL_ADVANCED)
+ .set_default("")
+ .set_description(""),
+
+ Option("rgw_swift_url", Option::TYPE_STR, Option::LEVEL_ADVANCED)
+ .set_default("")
+ .set_description(""),
+
+ Option("rgw_swift_url_prefix", Option::TYPE_STR, Option::LEVEL_ADVANCED)
+ .set_default("swift")
+ .set_description(""),
+
+ Option("rgw_swift_auth_url", Option::TYPE_STR, Option::LEVEL_ADVANCED)
+ .set_default("")
+ .set_description(""),
+
+ Option("rgw_swift_auth_entry", Option::TYPE_STR, Option::LEVEL_ADVANCED)
+ .set_default("auth")
+ .set_description(""),
+
+ Option("rgw_swift_tenant_name", Option::TYPE_STR, Option::LEVEL_ADVANCED)
+ .set_default("")
+ .set_description(""),
+
+ Option("rgw_swift_account_in_url", Option::TYPE_BOOL, Option::LEVEL_ADVANCED)
+ .set_default(false)
+ .set_description(""),
+
+ Option("rgw_swift_enforce_content_length", Option::TYPE_BOOL, Option::LEVEL_ADVANCED)
+ .set_default(false)
+ .set_description(""),
+
+ Option("rgw_keystone_url", Option::TYPE_STR, Option::LEVEL_ADVANCED)
+ .set_default("")
+ .set_description(""),
+
+ Option("rgw_keystone_admin_token", Option::TYPE_STR, Option::LEVEL_ADVANCED)
+ .set_default("")
+ .set_description(""),
+
+ Option("rgw_keystone_admin_user", Option::TYPE_STR, Option::LEVEL_ADVANCED)
+ .set_default("")
+ .set_description(""),
+
+ Option("rgw_keystone_admin_password", Option::TYPE_STR, Option::LEVEL_ADVANCED)
+ .set_default("")
+ .set_description(""),
+
+ Option("rgw_keystone_admin_tenant", Option::TYPE_STR, Option::LEVEL_ADVANCED)
+ .set_default("")
+ .set_description(""),
+
+ Option("rgw_keystone_admin_project", Option::TYPE_STR, Option::LEVEL_ADVANCED)
+ .set_default("")
+ .set_description(""),
+
+ Option("rgw_keystone_admin_domain", Option::TYPE_STR, Option::LEVEL_ADVANCED)
+ .set_default("")
+ .set_description(""),
+
+ Option("rgw_keystone_barbican_user", Option::TYPE_STR, Option::LEVEL_ADVANCED)
+ .set_default("")
+ .set_description(""),
+
+ Option("rgw_keystone_barbican_password", Option::TYPE_STR, Option::LEVEL_ADVANCED)
+ .set_default("")
+ .set_description(""),
+
+ Option("rgw_keystone_barbican_tenant", Option::TYPE_STR, Option::LEVEL_ADVANCED)
+ .set_default("")
+ .set_description(""),
+
+ Option("rgw_keystone_barbican_project", Option::TYPE_STR, Option::LEVEL_ADVANCED)
+ .set_default("")
+ .set_description(""),
+
+ Option("rgw_keystone_barbican_domain", Option::TYPE_STR, Option::LEVEL_ADVANCED)
+ .set_default("")
+ .set_description(""),
+
+ Option("rgw_keystone_api_version", Option::TYPE_INT, Option::LEVEL_ADVANCED)
+ .set_default(2)
+ .set_description(""),
+
+ Option("rgw_keystone_accepted_roles", Option::TYPE_STR, Option::LEVEL_ADVANCED)
+ .set_default("Member, admin")
+ .set_description(""),
+
+ Option("rgw_keystone_accepted_admin_roles", Option::TYPE_STR, Option::LEVEL_ADVANCED)
+ .set_default("")
+ .set_description(""),
+
+ Option("rgw_keystone_token_cache_size", Option::TYPE_INT, Option::LEVEL_ADVANCED)
+ .set_default(10000)
+ .set_description(""),
+
+ Option("rgw_keystone_revocation_interval", Option::TYPE_INT, Option::LEVEL_ADVANCED)
+ .set_default(15 * 60) // == 900 (15 minutes if the unit is seconds -- TODO confirm)
+ .set_description(""),
+
+ Option("rgw_keystone_verify_ssl", Option::TYPE_BOOL, Option::LEVEL_ADVANCED)
+ .set_default(true)
+ .set_description(""),
+
+ Option("rgw_keystone_implicit_tenants", Option::TYPE_BOOL, Option::LEVEL_ADVANCED)
+ .set_default(false)
+ .set_description(""),
+
+ Option("rgw_cross_domain_policy", Option::TYPE_STR, Option::LEVEL_ADVANCED)
+ .set_default("<allow-access-from domain=\"*\" secure=\"false\" />")
+ .set_description(""),
+
+ Option("rgw_healthcheck_disabling_path", Option::TYPE_STR, Option::LEVEL_ADVANCED)
+ .set_default("")
+ .set_description(""),
+
+ Option("rgw_s3_auth_use_rados", Option::TYPE_BOOL, Option::LEVEL_ADVANCED)
+ .set_default(true)
+ .set_description(""),
+
+ Option("rgw_s3_auth_use_keystone", Option::TYPE_BOOL, Option::LEVEL_ADVANCED)
+ .set_default(false)
+ .set_description(""),
+
+ Option("rgw_s3_auth_aws4_force_boto2_compat", Option::TYPE_BOOL, Option::LEVEL_ADVANCED)
+ .set_default(true)
+ .set_description(""),
+
+ Option("rgw_barbican_url", Option::TYPE_STR, Option::LEVEL_ADVANCED)
+ .set_default("")
+ .set_description(""),
+
+ Option("rgw_ldap_uri", Option::TYPE_STR, Option::LEVEL_ADVANCED)
+ .set_default("ldaps://<ldap.your.domain>")
+ .set_description(""),
+
+ Option("rgw_ldap_binddn", Option::TYPE_STR, Option::LEVEL_ADVANCED)
+ .set_default("uid=admin,cn=users,dc=example,dc=com")
+ .set_description(""),
+
+ Option("rgw_ldap_searchdn", Option::TYPE_STR, Option::LEVEL_ADVANCED)
+ .set_default("cn=users,cn=accounts,dc=example,dc=com")
+ .set_description(""),
+
+ Option("rgw_ldap_dnattr", Option::TYPE_STR, Option::LEVEL_ADVANCED)
+ .set_default("uid")
+ .set_description(""),
+
+ Option("rgw_ldap_secret", Option::TYPE_STR, Option::LEVEL_ADVANCED)
+ .set_default("/etc/openldap/secret")
+ .set_description(""),
+
+ Option("rgw_s3_auth_use_ldap", Option::TYPE_BOOL, Option::LEVEL_ADVANCED)
+ .set_default(false)
+ .set_description(""),
+
+ Option("rgw_ldap_searchfilter", Option::TYPE_STR, Option::LEVEL_ADVANCED)
+ .set_default("")
+ .set_description(""),
+
+ Option("rgw_admin_entry", Option::TYPE_STR, Option::LEVEL_ADVANCED)
+ .set_default("admin")
+ .set_description(""),
+
+ Option("rgw_enforce_swift_acls", Option::TYPE_BOOL, Option::LEVEL_ADVANCED)
+ .set_default(true)
+ .set_description(""),
+
+ Option("rgw_swift_token_expiration", Option::TYPE_INT, Option::LEVEL_ADVANCED)
+ .set_default(24 * 3600) // == 86400 (one day if the unit is seconds -- TODO confirm)
+ .set_description(""),
+
+ Option("rgw_print_continue", Option::TYPE_BOOL, Option::LEVEL_ADVANCED)
+ .set_default(true)
+ .set_description(""),
+
+ Option("rgw_print_prohibited_content_length", Option::TYPE_BOOL, Option::LEVEL_ADVANCED)
+ .set_default(false)
+ .set_description(""),
+
+ Option("rgw_remote_addr_param", Option::TYPE_STR, Option::LEVEL_ADVANCED)
+ .set_default("REMOTE_ADDR")
+ .set_description(""),
+
+ Option("rgw_op_thread_timeout", Option::TYPE_INT, Option::LEVEL_ADVANCED)
+ .set_default(10*60) // == 600 (10 minutes if the unit is seconds -- TODO confirm)
+ .set_description(""),
+
+ Option("rgw_op_thread_suicide_timeout", Option::TYPE_INT, Option::LEVEL_ADVANCED)
+ .set_default(0)
+ .set_description(""),
+
+ Option("rgw_thread_pool_size", Option::TYPE_INT, Option::LEVEL_ADVANCED)
+ .set_default(100)
+ .set_description(""),
+
+ Option("rgw_num_control_oids", Option::TYPE_INT, Option::LEVEL_ADVANCED)
+ .set_default(8)
+ .set_description(""),
+
+ Option("rgw_num_rados_handles", Option::TYPE_UINT, Option::LEVEL_ADVANCED)
+ .set_default(1)
+ .set_description(""),
+
+ Option("rgw_verify_ssl", Option::TYPE_BOOL, Option::LEVEL_ADVANCED)
+ .set_default(true)
+ .set_description(""),
+
+ Option("rgw_nfs_lru_lanes", Option::TYPE_INT, Option::LEVEL_ADVANCED)
+ .set_default(5)
+ .set_description(""),
+
+ Option("rgw_nfs_lru_lane_hiwat", Option::TYPE_INT, Option::LEVEL_ADVANCED)
+ .set_default(911)
+ .set_description(""),
+
+ Option("rgw_nfs_fhcache_partitions", Option::TYPE_INT, Option::LEVEL_ADVANCED)
+ .set_default(3)
+ .set_description(""),
+
+ Option("rgw_nfs_fhcache_size", Option::TYPE_INT, Option::LEVEL_ADVANCED)
+ .set_default(2017)
+ .set_description(""),
+
+ Option("rgw_nfs_namespace_expire_secs", Option::TYPE_INT, Option::LEVEL_ADVANCED)
+ .set_default(300)
+ .set_min(1)
+ .set_description(""),
+
+ Option("rgw_nfs_max_gc", Option::TYPE_INT, Option::LEVEL_ADVANCED)
+ .set_default(300)
+ .set_min(1)
+ .set_description(""),
+
+ Option("rgw_nfs_write_completion_interval_s", Option::TYPE_INT, Option::LEVEL_ADVANCED)
+ .set_default(10)
+ .set_description(""),
+
+ Option("rgw_zone", Option::TYPE_STR, Option::LEVEL_ADVANCED)
+ .set_default("")
+ .set_description(""),
+
+ Option("rgw_zone_root_pool", Option::TYPE_STR, Option::LEVEL_ADVANCED)
+ .set_default(".rgw.root")
+ .set_description(""),
+
+ Option("rgw_default_zone_info_oid", Option::TYPE_STR, Option::LEVEL_ADVANCED)
+ .set_default("default.zone")
+ .set_description(""),
+
+ Option("rgw_region", Option::TYPE_STR, Option::LEVEL_ADVANCED)
+ .set_default("")
+ .set_description(""),
+
+ Option("rgw_region_root_pool", Option::TYPE_STR, Option::LEVEL_ADVANCED)
+ .set_default(".rgw.root")
+ .set_description(""),
+
+ Option("rgw_default_region_info_oid", Option::TYPE_STR, Option::LEVEL_ADVANCED)
+ .set_default("default.region")
+ .set_description(""),
+
+ Option("rgw_zonegroup", Option::TYPE_STR, Option::LEVEL_ADVANCED)
+ .set_default("")
+ .set_description(""),
+
+ Option("rgw_zonegroup_root_pool", Option::TYPE_STR, Option::LEVEL_ADVANCED)
+ .set_default(".rgw.root")
+ .set_description(""),
+
+ Option("rgw_default_zonegroup_info_oid", Option::TYPE_STR, Option::LEVEL_ADVANCED)
+ .set_default("default.zonegroup")
+ .set_description(""),
+
+ Option("rgw_realm", Option::TYPE_STR, Option::LEVEL_ADVANCED)
+ .set_default("")
+ .set_description(""),
+
+ Option("rgw_realm_root_pool", Option::TYPE_STR, Option::LEVEL_ADVANCED)
+ .set_default(".rgw.root")
+ .set_description(""),
+
+ Option("rgw_default_realm_info_oid", Option::TYPE_STR, Option::LEVEL_ADVANCED)
+ .set_default("default.realm")
+ .set_description(""),
+
+ Option("rgw_period_root_pool", Option::TYPE_STR, Option::LEVEL_ADVANCED)
+ .set_default(".rgw.root")
+ .set_description(""),
+
+ Option("rgw_period_latest_epoch_info_oid", Option::TYPE_STR, Option::LEVEL_ADVANCED)
+ .set_default(".latest_epoch")
+ .set_description(""),
+
+ Option("rgw_log_nonexistent_bucket", Option::TYPE_BOOL, Option::LEVEL_ADVANCED)
+ .set_default(false)
+ .set_description(""),
+
+ Option("rgw_log_object_name", Option::TYPE_STR, Option::LEVEL_ADVANCED)
+ .set_default("%Y-%m-%d-%H-%i-%n")
+ .set_description(""),
+
+ Option("rgw_log_object_name_utc", Option::TYPE_BOOL, Option::LEVEL_ADVANCED)
+ .set_default(false)
+ .set_description(""),
+
+ Option("rgw_usage_max_shards", Option::TYPE_INT, Option::LEVEL_ADVANCED)
+ .set_default(32)
+ .set_description(""),
+
+ Option("rgw_usage_max_user_shards", Option::TYPE_INT, Option::LEVEL_ADVANCED)
+ .set_default(1)
+ .set_min(1)
+ .set_description(""),
+
+ Option("rgw_enable_ops_log", Option::TYPE_BOOL, Option::LEVEL_ADVANCED)
+ .set_default(false)
+ .set_description(""),
+
+ Option("rgw_enable_usage_log", Option::TYPE_BOOL, Option::LEVEL_ADVANCED)
+ .set_default(false)
+ .set_description(""),
+
+ Option("rgw_ops_log_rados", Option::TYPE_BOOL, Option::LEVEL_ADVANCED)
+ .set_default(true)
+ .set_description(""),
+
+ Option("rgw_ops_log_socket_path", Option::TYPE_STR, Option::LEVEL_ADVANCED)
+ .set_default("")
+ .set_description(""),
+
+ Option("rgw_ops_log_data_backlog", Option::TYPE_INT, Option::LEVEL_ADVANCED)
+ .set_default(5 << 20) // == 5242880 (5 MiB if the unit is bytes -- TODO confirm)
+ .set_description(""),
+
+ Option("rgw_fcgi_socket_backlog", Option::TYPE_INT, Option::LEVEL_ADVANCED)
+ .set_default(1024)
+ .set_description(""),
+
+ Option("rgw_usage_log_flush_threshold", Option::TYPE_INT, Option::LEVEL_ADVANCED)
+ .set_default(1024)
+ .set_description(""),
+
+ Option("rgw_usage_log_tick_interval", Option::TYPE_INT, Option::LEVEL_ADVANCED)
+ .set_default(30)
+ .set_description(""),
+
+ Option("rgw_intent_log_object_name", Option::TYPE_STR, Option::LEVEL_ADVANCED)
+ .set_default("%Y-%m-%d-%i-%n")
+ .set_description(""),
+
+ Option("rgw_intent_log_object_name_utc", Option::TYPE_BOOL, Option::LEVEL_ADVANCED)
+ .set_default(false)
+ .set_description(""),
+
+ Option("rgw_init_timeout", Option::TYPE_INT, Option::LEVEL_ADVANCED)
+ .set_default(300)
+ .set_description(""),
+
+ Option("rgw_mime_types_file", Option::TYPE_STR, Option::LEVEL_ADVANCED)
+ .set_default("/etc/mime.types")
+ .set_description(""),
+
+ Option("rgw_gc_max_objs", Option::TYPE_INT, Option::LEVEL_ADVANCED)
+ .set_default(32)
+ .set_description(""),
+
+ Option("rgw_gc_obj_min_wait", Option::TYPE_INT, Option::LEVEL_ADVANCED)
+ .set_default(2 * 3600) // == 7200 (two hours if the unit is seconds -- TODO confirm)
+ .set_description(""),
+
+ Option("rgw_gc_processor_max_time", Option::TYPE_INT, Option::LEVEL_ADVANCED)
+ .set_default(3600)
+ .set_description(""),
+
+ Option("rgw_gc_processor_period", Option::TYPE_INT, Option::LEVEL_ADVANCED)
+ .set_default(3600)
+ .set_description(""),
+
+ Option("rgw_s3_success_create_obj_status", Option::TYPE_INT, Option::LEVEL_ADVANCED)
+ .set_default(0)
+ .set_description(""),
+
+ Option("rgw_resolve_cname", Option::TYPE_BOOL, Option::LEVEL_ADVANCED)
+ .set_default(false)
+ .set_description(""),
+
+ Option("rgw_obj_stripe_size", Option::TYPE_INT, Option::LEVEL_ADVANCED)
+ .set_default(4 << 20) // == 4194304 (4 MiB if the unit is bytes -- TODO confirm)
+ .set_description(""),
+
+ Option("rgw_extended_http_attrs", Option::TYPE_STR, Option::LEVEL_ADVANCED)
+ .set_default("")
+ .set_description(""),
+
+ Option("rgw_exit_timeout_secs", Option::TYPE_INT, Option::LEVEL_ADVANCED)
+ .set_default(120)
+ .set_description(""),
+
+ Option("rgw_get_obj_window_size", Option::TYPE_INT, Option::LEVEL_ADVANCED)
+ .set_default(16 << 20) // == 16777216 (16 MiB if the unit is bytes -- TODO confirm)
+ .set_description(""),
+
+ Option("rgw_get_obj_max_req_size", Option::TYPE_INT, Option::LEVEL_ADVANCED)
+ .set_default(4 << 20) // == 4194304 (4 MiB if the unit is bytes -- TODO confirm)
+ .set_description(""),
+
+ Option("rgw_relaxed_s3_bucket_names", Option::TYPE_BOOL, Option::LEVEL_ADVANCED)
+ .set_default(false)
+ .set_description(""),
+
+ Option("rgw_defer_to_bucket_acls", Option::TYPE_STR, Option::LEVEL_ADVANCED)
+ .set_default("")
+ .set_description(""),
+
+ Option("rgw_list_buckets_max_chunk", Option::TYPE_INT, Option::LEVEL_ADVANCED)
+ .set_default(1000)
+ .set_description(""),
+
+ Option("rgw_md_log_max_shards", Option::TYPE_INT, Option::LEVEL_ADVANCED)
+ .set_default(64)
+ .set_description(""),
+
+ Option("rgw_num_zone_opstate_shards", Option::TYPE_INT, Option::LEVEL_ADVANCED)
+ .set_default(128)
+ .set_description(""),
+
+ Option("rgw_opstate_ratelimit_sec", Option::TYPE_INT, Option::LEVEL_ADVANCED)
+ .set_default(30)
+ .set_description(""),
+
+ Option("rgw_curl_wait_timeout_ms", Option::TYPE_INT, Option::LEVEL_ADVANCED)
+ .set_default(1000)
+ .set_description(""),
+
+ Option("rgw_copy_obj_progress", Option::TYPE_BOOL, Option::LEVEL_ADVANCED)
+ .set_default(true)
+ .set_description(""),
+
+ Option("rgw_copy_obj_progress_every_bytes", Option::TYPE_INT, Option::LEVEL_ADVANCED)
+ .set_default(1024 * 1024) // == 1048576 (1 MiB, going by the _bytes suffix)
+ .set_description(""),
+
+ Option("rgw_obj_tombstone_cache_size", Option::TYPE_INT, Option::LEVEL_ADVANCED)
+ .set_default(1000)
+ .set_description(""),
+
+ Option("rgw_data_log_window", Option::TYPE_INT, Option::LEVEL_ADVANCED)
+ .set_default(30)
+ .set_description(""),
+
+ Option("rgw_data_log_changes_size", Option::TYPE_INT, Option::LEVEL_ADVANCED)
+ .set_default(1000)
+ .set_description(""),
+
+ Option("rgw_data_log_num_shards", Option::TYPE_INT, Option::LEVEL_ADVANCED)
+ .set_default(128)
+ .set_description(""),
+
+ Option("rgw_data_log_obj_prefix", Option::TYPE_STR, Option::LEVEL_ADVANCED)
+ .set_default("data_log")
+ .set_description(""),
+
+ Option("rgw_replica_log_obj_prefix", Option::TYPE_STR, Option::LEVEL_ADVANCED)
+ .set_default("replica_log")
+ .set_description(""),
+
+ Option("rgw_bucket_quota_ttl", Option::TYPE_INT, Option::LEVEL_ADVANCED)
+ .set_default(600)
+ .set_description(""),
+
+ Option("rgw_bucket_quota_soft_threshold", Option::TYPE_FLOAT, Option::LEVEL_ADVANCED)
+ .set_default(0.95)
+ .set_description(""),
+
+ Option("rgw_bucket_quota_cache_size", Option::TYPE_INT, Option::LEVEL_ADVANCED)
+ .set_default(10000)
+ .set_description(""),
+
+ Option("rgw_bucket_default_quota_max_objects", Option::TYPE_INT, Option::LEVEL_ADVANCED)
+ .set_default(-1)
+ .set_description(""),
+
+ Option("rgw_bucket_default_quota_max_size", Option::TYPE_INT, Option::LEVEL_ADVANCED)
+ .set_default(-1)
+ .set_description(""),
+
+ Option("rgw_expose_bucket", Option::TYPE_BOOL, Option::LEVEL_ADVANCED)
+ .set_default(false)
+ .set_description(""),
+
+ Option("rgw_frontends", Option::TYPE_STR, Option::LEVEL_ADVANCED)
+ .set_default("civetweb port=7480")
+ .set_description(""),
+
+ Option("rgw_user_quota_bucket_sync_interval", Option::TYPE_INT, Option::LEVEL_ADVANCED)
+ .set_default(180)
+ .set_description(""),
+
+ Option("rgw_user_quota_sync_interval", Option::TYPE_INT, Option::LEVEL_ADVANCED)
+ .set_default(3600 * 24) // == 86400 (one day if the unit is seconds -- TODO confirm)
+ .set_description(""),
+
+ Option("rgw_user_quota_sync_idle_users", Option::TYPE_BOOL, Option::LEVEL_ADVANCED)
+ .set_default(false)
+ .set_description(""),
+
+ Option("rgw_user_quota_sync_wait_time", Option::TYPE_INT, Option::LEVEL_ADVANCED)
+ .set_default(3600 * 24) // == 86400 (one day if the unit is seconds -- TODO confirm)
+ .set_description(""),
+
+ Option("rgw_user_default_quota_max_objects", Option::TYPE_INT, Option::LEVEL_ADVANCED)
+ .set_default(-1)
+ .set_description(""),
+
+ Option("rgw_user_default_quota_max_size", Option::TYPE_INT, Option::LEVEL_ADVANCED)
+ .set_default(-1)
+ .set_description(""),
+
+ Option("rgw_multipart_min_part_size", Option::TYPE_INT, Option::LEVEL_ADVANCED)
+ .set_default(5 * 1024 * 1024) // == 5242880 (5 MiB if the unit is bytes -- TODO confirm)
+ .set_description(""),
+
+ Option("rgw_multipart_part_upload_limit", Option::TYPE_INT, Option::LEVEL_ADVANCED)
+ .set_default(10000)
+ .set_description(""),
+
+ Option("rgw_max_slo_entries", Option::TYPE_INT, Option::LEVEL_ADVANCED)
+ .set_default(1000)
+ .set_description(""),
+
+ Option("rgw_olh_pending_timeout_sec", Option::TYPE_INT, Option::LEVEL_ADVANCED)
+ .set_default(3600)
+ .set_description(""),
+
+ Option("rgw_user_max_buckets", Option::TYPE_INT, Option::LEVEL_ADVANCED)
+ .set_default(1000)
+ .set_description(""),
+
+ Option("rgw_objexp_gc_interval", Option::TYPE_UINT, Option::LEVEL_ADVANCED)
+ .set_default(60 * 10) // == 600 (10 minutes if the unit is seconds -- TODO confirm)
+ .set_description(""),
+
+ Option("rgw_objexp_time_step", Option::TYPE_UINT, Option::LEVEL_ADVANCED)
+ .set_default(4096)
+ .set_description(""),
+
+ Option("rgw_objexp_hints_num_shards", Option::TYPE_UINT, Option::LEVEL_ADVANCED)
+ .set_default(127)
+ .set_description(""),
+
+ Option("rgw_objexp_chunk_size", Option::TYPE_UINT, Option::LEVEL_ADVANCED)
+ .set_default(100)
+ .set_description(""),
+
+ Option("rgw_enable_static_website", Option::TYPE_BOOL, Option::LEVEL_ADVANCED)
+ .set_default(false)
+ .set_description(""),
+
+ Option("rgw_log_http_headers", Option::TYPE_STR, Option::LEVEL_ADVANCED)
+ .set_default("")
+ .set_description(""),
+
+ Option("rgw_num_async_rados_threads", Option::TYPE_INT, Option::LEVEL_ADVANCED)
+ .set_default(32)
+ .set_description(""),
+
+ Option("rgw_md_notify_interval_msec", Option::TYPE_INT, Option::LEVEL_ADVANCED)
+ .set_default(200)
+ .set_description(""),
+
+ Option("rgw_run_sync_thread", Option::TYPE_BOOL, Option::LEVEL_ADVANCED)
+ .set_default(true)
+ .set_description(""),
+
+ Option("rgw_sync_lease_period", Option::TYPE_INT, Option::LEVEL_ADVANCED)
+ .set_default(120)
+ .set_description(""),
+
+ Option("rgw_sync_log_trim_interval", Option::TYPE_INT, Option::LEVEL_ADVANCED)
+ .set_default(1200)
+ .set_description(""),
+
+ Option("rgw_sync_data_inject_err_probability", Option::TYPE_FLOAT, Option::LEVEL_DEV)
+ .set_default(0)
+ .set_description(""),
+
+ Option("rgw_sync_meta_inject_err_probability", Option::TYPE_FLOAT, Option::LEVEL_DEV)
+ .set_default(0)
+ .set_description(""),
+
+ Option("rgw_period_push_interval", Option::TYPE_FLOAT, Option::LEVEL_ADVANCED)
+ .set_default(2)
+ .set_description(""),
+
+ Option("rgw_period_push_interval_max", Option::TYPE_FLOAT, Option::LEVEL_ADVANCED)
+ .set_default(30)
+ .set_description(""),
+
+ Option("rgw_safe_max_objects_per_shard", Option::TYPE_INT, Option::LEVEL_ADVANCED)
+ .set_default(100*1024) // == 102400
+ .set_description(""),
+
+ Option("rgw_shard_warning_threshold", Option::TYPE_FLOAT, Option::LEVEL_ADVANCED)
+ .set_default(90)
+ .set_description(""),
+
+ Option("rgw_swift_versioning_enabled", Option::TYPE_BOOL, Option::LEVEL_ADVANCED)
+ .set_default(false)
+ .set_description(""),
+
+ Option("rgw_swift_custom_header", Option::TYPE_STR, Option::LEVEL_ADVANCED)
+ .set_default("")
+ .set_description(""),
+
+ Option("rgw_swift_need_stats", Option::TYPE_BOOL, Option::LEVEL_ADVANCED)
+ .set_default(true)
+ .set_description(""),
+
+ Option("rgw_reshard_num_logs", Option::TYPE_INT, Option::LEVEL_ADVANCED)
+ .set_default(16)
+ .set_description(""),
+
+ Option("rgw_reshard_bucket_lock_duration", Option::TYPE_INT, Option::LEVEL_ADVANCED)
+ .set_default(120)
+ .set_description(""),
+
+ Option("rgw_crypt_require_ssl", Option::TYPE_BOOL, Option::LEVEL_ADVANCED)
+ .set_default(true)
+ .set_description(""),
+
+ Option("rgw_crypt_default_encryption_key", Option::TYPE_STR, Option::LEVEL_ADVANCED)
+ .set_default("")
+ .set_description(""),
+
+ Option("rgw_crypt_s3_kms_encryption_keys", Option::TYPE_STR, Option::LEVEL_ADVANCED)
+ .set_default("")
+ .set_description(""),
+
+ Option("rgw_crypt_suppress_logs", Option::TYPE_BOOL, Option::LEVEL_ADVANCED)
+ .set_default(true)
+ .set_description(""),
+
+ Option("rgw_list_bucket_min_readahead", Option::TYPE_INT, Option::LEVEL_ADVANCED)
+ .set_default(1000)
+ .set_description(""),
+
+ Option("rgw_rest_getusage_op_compat", Option::TYPE_BOOL, Option::LEVEL_ADVANCED)
+ .set_default(false)
+ .set_description(""),
+
+ Option("rgw_torrent_flag", Option::TYPE_BOOL, Option::LEVEL_ADVANCED)
+ .set_default(false)
+ .set_description(""),
+
+ Option("rgw_torrent_tracker", Option::TYPE_STR, Option::LEVEL_ADVANCED)
+ .set_default("")
+ .set_description(""),
+
+ Option("rgw_torrent_createby", Option::TYPE_STR, Option::LEVEL_ADVANCED)
+ .set_default("")
+ .set_description(""),
+
+ Option("rgw_torrent_comment", Option::TYPE_STR, Option::LEVEL_ADVANCED)
+ .set_default("")
+ .set_description(""),
+
+ Option("rgw_torrent_encoding", Option::TYPE_STR, Option::LEVEL_ADVANCED)
+ .set_default("")
+ .set_description(""),
+
+ Option("rgw_torrent_origin", Option::TYPE_STR, Option::LEVEL_ADVANCED)
+ .set_default("")
+ .set_description(""),
+
+ Option("rgw_torrent_sha_unit", Option::TYPE_INT, Option::LEVEL_ADVANCED)
+ .set_default(512*1024)
+ .set_description(""),
+
+ Option("rgw_dynamic_resharding", Option::TYPE_BOOL, Option::LEVEL_ADVANCED)
+ .set_default(true)
+ .set_description(""),
+
+ Option("rgw_max_objs_per_shard", Option::TYPE_INT, Option::LEVEL_ADVANCED)
+ .set_default(100000)
+ .set_description(""),
+
+ Option("rgw_reshard_thread_interval", Option::TYPE_UINT, Option::LEVEL_ADVANCED)
+ .set_default(60 * 10)
+ .set_description(""),
+};
+
+std::vector<Option> rbd_options = {
+ Option("rbd_default_pool", Option::TYPE_STR, Option::LEVEL_ADVANCED)
+ .set_default("rbd")
+ .set_description("")
+ .set_validator([](std::string *value, std::string *error_message){
+   static const boost::regex pattern("^[^@/]+$");  // compiled once; non-empty, no '@' or '/'
+   if (!boost::regex_match (*value, pattern)) {
+     *value = "rbd";  // replace the invalid name with "rbd" rather than failing
+     *error_message = "invalid RBD default pool, resetting to 'rbd'";
+   }
+   return 0;  // always 0: bad input is replaced and reported via *error_message
+ }),
+
+ Option("rbd_default_data_pool", Option::TYPE_STR, Option::LEVEL_ADVANCED)
+ .set_default("")
+ .set_description("")
+ .set_validator([](std::string *value, std::string *error_message){
+   static const boost::regex pattern("^[^@/]*$");  // compiled once; may be empty, no '@' or '/'
+   if (!boost::regex_match (*value, pattern)) {
+     *value = "";  // discard the invalid name rather than failing
+     *error_message = "ignoring invalid RBD data pool";
+   }
+   return 0;  // always 0: bad input is cleared and reported via *error_message
+ }),
+
+ Option("rbd_default_features", Option::TYPE_STR, Option::LEVEL_ADVANCED)
+ .set_default("layering,exclusive-lock,object-map,fast-diff,deep-flatten")
+ .set_description("")
+ .set_safe()
+ .set_validator([](std::string *value, std::string *error_message){
+   static const std::map<std::string, uint64_t> FEATURE_MAP = {
+     {RBD_FEATURE_NAME_LAYERING, RBD_FEATURE_LAYERING},
+     {RBD_FEATURE_NAME_STRIPINGV2, RBD_FEATURE_STRIPINGV2},
+     {RBD_FEATURE_NAME_EXCLUSIVE_LOCK, RBD_FEATURE_EXCLUSIVE_LOCK},
+     {RBD_FEATURE_NAME_OBJECT_MAP, RBD_FEATURE_OBJECT_MAP},
+     {RBD_FEATURE_NAME_FAST_DIFF, RBD_FEATURE_FAST_DIFF},
+     {RBD_FEATURE_NAME_DEEP_FLATTEN, RBD_FEATURE_DEEP_FLATTEN},
+     {RBD_FEATURE_NAME_JOURNALING, RBD_FEATURE_JOURNALING},
+     {RBD_FEATURE_NAME_DATA_POOL, RBD_FEATURE_DATA_POOL},
+   };
+   static_assert((RBD_FEATURE_DATA_POOL << 1) > RBD_FEATURES_ALL,
+                 "new RBD feature added");  // presumably a reminder to extend FEATURE_MAP
+   // Rewrite *value in place as a stringified feature bitmask. The input is
+   // either a numeric mask or a user-friendly comma delimited list of feature
+   // names; problems are reported via *error_message and 0 is always returned.
+   uint64_t features = 0;
+   error_message->clear();
+
+   try {
+     features = boost::lexical_cast<decltype(features)>(*value);
+     // numeric input: strip, and report, any bits outside RBD_FEATURES_ALL
+     uint64_t unsupported_features = (features & ~RBD_FEATURES_ALL);
+     if (unsupported_features != 0ull) {
+       features &= RBD_FEATURES_ALL;
+       std::stringstream ss;
+       ss << "ignoring unknown feature mask 0x"
+          << std::hex << unsupported_features;
+       *error_message = ss.str();
+     }
+   } catch (const boost::bad_lexical_cast& ) {
+     // not a number: parse the value as a list of feature names
+     int r = 0;
+     std::vector<std::string> feature_names;
+     boost::split(feature_names, *value, boost::is_any_of(","));
+     for (auto& feature_name: feature_names) {
+       boost::trim(feature_name);
+       auto feature_it = FEATURE_MAP.find(feature_name);
+       if (feature_it != FEATURE_MAP.end()) {
+         features |= feature_it->second;  // OR, so a repeated name cannot carry into another bit
+       } else {
+         if (!error_message->empty()) {
+           *error_message += ", ";
+         }
+         *error_message += "ignoring unknown feature " + feature_name;
+         r = -EINVAL;
+       }
+     }
+     // no known feature named and at least one unknown: use the default set
+     if (features == 0 && r == -EINVAL) {
+       features = RBD_FEATURES_DEFAULT;
+     }
+   }
+   *value = stringify(features);
+   return 0;
+ }),
+
+ Option("rbd_op_threads", Option::TYPE_INT, Option::LEVEL_ADVANCED)
+ .set_default(1)
+ .set_description(""),
+
+ Option("rbd_op_thread_timeout", Option::TYPE_INT, Option::LEVEL_ADVANCED)
+ .set_default(60)
+ .set_description(""),
+
+ Option("rbd_non_blocking_aio", Option::TYPE_BOOL, Option::LEVEL_ADVANCED)
+ .set_default(true)
+ .set_description(""),
+
+ Option("rbd_cache", Option::TYPE_BOOL, Option::LEVEL_ADVANCED)
+ .set_default(true)
+ .set_description(""),
+
+ Option("rbd_cache_writethrough_until_flush", Option::TYPE_BOOL, Option::LEVEL_ADVANCED)
+ .set_default(true)
+ .set_description(""),
+
+ Option("rbd_cache_size", Option::TYPE_INT, Option::LEVEL_ADVANCED)
+ .set_default(32<<20)
+ .set_description(""),
+
+ Option("rbd_cache_max_dirty", Option::TYPE_INT, Option::LEVEL_ADVANCED)
+ .set_default(24<<20)
+ .set_description(""),
+
+ Option("rbd_cache_target_dirty", Option::TYPE_INT, Option::LEVEL_ADVANCED)
+ .set_default(16<<20)
+ .set_description(""),
+
+ Option("rbd_cache_max_dirty_age", Option::TYPE_FLOAT, Option::LEVEL_ADVANCED)
+ .set_default(1.0)
+ .set_description(""),
+
+ Option("rbd_cache_max_dirty_object", Option::TYPE_INT, Option::LEVEL_ADVANCED)
+ .set_default(0)
+ .set_description(""),
+
+ Option("rbd_cache_block_writes_upfront", Option::TYPE_BOOL, Option::LEVEL_ADVANCED)
+ .set_default(false)
+ .set_description(""),
+
+ Option("rbd_concurrent_management_ops", Option::TYPE_INT, Option::LEVEL_ADVANCED)
+ .set_default(10)
+ .set_min(1)
+ .set_description(""),
+
+ Option("rbd_balance_snap_reads", Option::TYPE_BOOL, Option::LEVEL_ADVANCED)
+ .set_default(false)
+ .set_description(""),
+
+ Option("rbd_localize_snap_reads", Option::TYPE_BOOL, Option::LEVEL_ADVANCED)
+ .set_default(false)
+ .set_description(""),
+
+ Option("rbd_balance_parent_reads", Option::TYPE_BOOL, Option::LEVEL_ADVANCED)
+ .set_default(false)
+ .set_description(""),
+
+ Option("rbd_localize_parent_reads", Option::TYPE_BOOL, Option::LEVEL_ADVANCED)
+ .set_default(true)
+ .set_description(""),
+
+ Option("rbd_readahead_trigger_requests", Option::TYPE_INT, Option::LEVEL_ADVANCED)
+ .set_default(10)
+ .set_description(""),
+
+ Option("rbd_readahead_max_bytes", Option::TYPE_INT, Option::LEVEL_ADVANCED)
+ .set_default(512 * 1024)
+ .set_description(""),
+
+ Option("rbd_readahead_disable_after_bytes", Option::TYPE_INT, Option::LEVEL_ADVANCED)
+ .set_default(50 * 1024 * 1024)
+ .set_description(""),
+
+ Option("rbd_clone_copy_on_read", Option::TYPE_BOOL, Option::LEVEL_ADVANCED)
+ .set_default(false)
+ .set_description(""),
+
+ Option("rbd_blacklist_on_break_lock", Option::TYPE_BOOL, Option::LEVEL_ADVANCED)
+ .set_default(true)
+ .set_description(""),
+
+ Option("rbd_blacklist_expire_seconds", Option::TYPE_INT, Option::LEVEL_ADVANCED)
+ .set_default(0)
+ .set_description(""),
+
+ Option("rbd_request_timed_out_seconds", Option::TYPE_INT, Option::LEVEL_ADVANCED)
+ .set_default(30)
+ .set_description(""),
+
+ Option("rbd_skip_partial_discard", Option::TYPE_BOOL, Option::LEVEL_ADVANCED)
+ .set_default(false)
+ .set_description(""),
+
+ Option("rbd_enable_alloc_hint", Option::TYPE_BOOL, Option::LEVEL_ADVANCED)
+ .set_default(true)
+ .set_description(""),
+
+ Option("rbd_tracing", Option::TYPE_BOOL, Option::LEVEL_ADVANCED)
+ .set_default(false)
+ .set_description(""),
+
+ Option("rbd_blkin_trace_all", Option::TYPE_BOOL, Option::LEVEL_ADVANCED)
+ .set_default(false)
+ .set_description(""),
+
+ Option("rbd_validate_pool", Option::TYPE_BOOL, Option::LEVEL_ADVANCED)
+ .set_default(true)
+ .set_description(""),
+
+ Option("rbd_validate_names", Option::TYPE_BOOL, Option::LEVEL_ADVANCED)
+ .set_default(true)
+ .set_description(""),
+
+ Option("rbd_auto_exclusive_lock_until_manual_request", Option::TYPE_BOOL, Option::LEVEL_ADVANCED)
+ .set_default(true)
+ .set_description(""),
+
+ Option("rbd_mirroring_resync_after_disconnect", Option::TYPE_BOOL, Option::LEVEL_ADVANCED)
+ .set_default(false)
+ .set_description(""),
+
+ Option("rbd_mirroring_replay_delay", Option::TYPE_INT, Option::LEVEL_ADVANCED)
+ .set_default(0)
+ .set_description(""),
+
+ Option("rbd_default_format", Option::TYPE_INT, Option::LEVEL_ADVANCED)
+ .set_default(2)
+ .set_description(""),
+
+ Option("rbd_default_order", Option::TYPE_INT, Option::LEVEL_ADVANCED)
+ .set_default(22)
+ .set_description(""),
+
+ Option("rbd_default_stripe_count", Option::TYPE_UINT, Option::LEVEL_ADVANCED)
+ .set_default(0)
+ .set_description(""),
+
+ Option("rbd_default_stripe_unit", Option::TYPE_UINT, Option::LEVEL_ADVANCED)
+ .set_default(0)
+ .set_description(""),
+
+ Option("rbd_default_map_options", Option::TYPE_STR, Option::LEVEL_ADVANCED)
+ .set_default("")
+ .set_description(""),
+
+ Option("rbd_journal_order", Option::TYPE_UINT, Option::LEVEL_ADVANCED)
+ .set_default(24)
+ .set_description(""),
+
+ Option("rbd_journal_splay_width", Option::TYPE_UINT, Option::LEVEL_ADVANCED)
+ .set_default(4)
+ .set_description(""),
+
+ Option("rbd_journal_commit_age", Option::TYPE_FLOAT, Option::LEVEL_ADVANCED)
+ .set_default(5)
+ .set_description(""),
+
+ Option("rbd_journal_object_flush_interval", Option::TYPE_INT, Option::LEVEL_ADVANCED)
+ .set_default(0)
+ .set_description(""),
+
+ Option("rbd_journal_object_flush_bytes", Option::TYPE_INT, Option::LEVEL_ADVANCED)
+ .set_default(0)
+ .set_description(""),
+
+ Option("rbd_journal_object_flush_age", Option::TYPE_FLOAT, Option::LEVEL_ADVANCED)
+ .set_default(0)
+ .set_description(""),
+
+ Option("rbd_journal_pool", Option::TYPE_STR, Option::LEVEL_ADVANCED)
+ .set_default("")
+ .set_description(""),
+
+ Option("rbd_journal_max_payload_bytes", Option::TYPE_UINT, Option::LEVEL_ADVANCED)
+ .set_default(16384)
+ .set_description(""),
+
+ Option("rbd_journal_max_concurrent_object_sets", Option::TYPE_INT, Option::LEVEL_ADVANCED)
+ .set_default(0)
+ .set_description(""),
+
+ Option("rbd_mirror_journal_commit_age", Option::TYPE_FLOAT, Option::LEVEL_ADVANCED)
+ .set_default(5)
+ .set_description(""),
+
+ Option("rbd_mirror_journal_poll_age", Option::TYPE_FLOAT, Option::LEVEL_ADVANCED)
+ .set_default(5)
+ .set_description(""),
+
+ Option("rbd_mirror_journal_max_fetch_bytes", Option::TYPE_UINT, Option::LEVEL_ADVANCED)
+ .set_default(32768)
+ .set_description(""),
+
+ Option("rbd_mirror_sync_point_update_age", Option::TYPE_FLOAT, Option::LEVEL_ADVANCED)
+ .set_default(30)
+ .set_description(""),
+
+ Option("rbd_mirror_concurrent_image_syncs", Option::TYPE_UINT, Option::LEVEL_ADVANCED)
+ .set_default(5)
+ .set_description(""),
+
+ Option("rbd_mirror_pool_replayers_refresh_interval", Option::TYPE_INT, Option::LEVEL_ADVANCED)
+ .set_default(30)
+ .set_description(""),
+
+ Option("rbd_mirror_delete_retry_interval", Option::TYPE_FLOAT, Option::LEVEL_ADVANCED)
+ .set_default(30)
+ .set_description(""),
+
+ Option("rbd_mirror_image_state_check_interval", Option::TYPE_INT, Option::LEVEL_ADVANCED)
+ .set_default(30)
+ .set_min(1)
+ .set_description(""),
+
+ Option("rbd_mirror_leader_heartbeat_interval", Option::TYPE_INT, Option::LEVEL_ADVANCED)
+ .set_default(5)
+ .set_min(1)
+ .set_description(""),
+
+ Option("rbd_mirror_leader_max_missed_heartbeats", Option::TYPE_INT, Option::LEVEL_ADVANCED)
+ .set_default(2)
+ .set_description(""),
+
+ Option("rbd_mirror_leader_max_acquire_attempts_before_break", Option::TYPE_INT, Option::LEVEL_ADVANCED)
+ .set_default(3)
+ .set_description(""),
+};
+
+std::vector<Option> mds_options = {
+ Option("mds_data", Option::TYPE_STR, Option::LEVEL_ADVANCED)
+ .set_default("/var/lib/ceph/mds/$cluster-$id")
+ .set_description(""),
+
+ Option("mds_max_file_size", Option::TYPE_UINT, Option::LEVEL_ADVANCED)
+ .set_default(1ULL << 40)
+ .set_description(""),
+
+ Option("mds_max_xattr_pairs_size", Option::TYPE_UINT, Option::LEVEL_ADVANCED)
+ .set_default(64 << 10)
+ .set_description(""),
+
+ Option("mds_cache_size", Option::TYPE_INT, Option::LEVEL_ADVANCED)
+ .set_default(100000)
+ .set_description(""),
+
+ Option("mds_cache_mid", Option::TYPE_FLOAT, Option::LEVEL_ADVANCED)
+ .set_default(.7)
+ .set_description(""),
+
+ Option("mds_max_file_recover", Option::TYPE_UINT, Option::LEVEL_ADVANCED)
+ .set_default(32)
+ .set_description(""),
+
+ Option("mds_dir_max_commit_size", Option::TYPE_INT, Option::LEVEL_ADVANCED)
+ .set_default(10)
+ .set_description(""),
+
+ Option("mds_dir_keys_per_op", Option::TYPE_INT, Option::LEVEL_ADVANCED)
+ .set_default(16384)
+ .set_description(""),
+
+ Option("mds_decay_halflife", Option::TYPE_FLOAT, Option::LEVEL_ADVANCED)
+ .set_default(5)
+ .set_description(""),
+
+ Option("mds_beacon_interval", Option::TYPE_FLOAT, Option::LEVEL_ADVANCED)
+ .set_default(4)
+ .set_description(""),
+
+ Option("mds_beacon_grace", Option::TYPE_FLOAT, Option::LEVEL_ADVANCED)
+ .set_default(15)
+ .set_description(""),
+
+ Option("mds_enforce_unique_name", Option::TYPE_BOOL, Option::LEVEL_ADVANCED)
+ .set_default(true)
+ .set_description(""),
+
+ Option("mds_blacklist_interval", Option::TYPE_FLOAT, Option::LEVEL_ADVANCED)
+ .set_default(24.0*60.0)
+ .set_description(""),
+
+ Option("mds_session_timeout", Option::TYPE_FLOAT, Option::LEVEL_ADVANCED)
+ .set_default(60)
+ .set_description(""),
+
+ Option("mds_session_blacklist_on_timeout", Option::TYPE_BOOL, Option::LEVEL_ADVANCED)
+ .set_default(true)
+ .set_description(""),
+
+ Option("mds_session_blacklist_on_evict", Option::TYPE_BOOL, Option::LEVEL_ADVANCED)
+ .set_default(true)
+ .set_description(""),
+
+ Option("mds_sessionmap_keys_per_op", Option::TYPE_UINT, Option::LEVEL_ADVANCED)
+ .set_default(1024)
+ .set_description(""),
+
+ Option("mds_revoke_cap_timeout", Option::TYPE_FLOAT, Option::LEVEL_ADVANCED)
+ .set_default(60)
+ .set_description(""),
+
+ Option("mds_recall_state_timeout", Option::TYPE_FLOAT, Option::LEVEL_ADVANCED)
+ .set_default(60)
+ .set_description(""),
+
+ Option("mds_freeze_tree_timeout", Option::TYPE_FLOAT, Option::LEVEL_ADVANCED)
+ .set_default(30)
+ .set_description(""),
+
+ Option("mds_session_autoclose", Option::TYPE_FLOAT, Option::LEVEL_ADVANCED)
+ .set_default(300)
+ .set_description(""),
+
+ Option("mds_health_summarize_threshold", Option::TYPE_INT, Option::LEVEL_ADVANCED)
+ .set_default(10)
+ .set_description(""),
+
+ Option("mds_health_cache_threshold", Option::TYPE_FLOAT, Option::LEVEL_ADVANCED)
+ .set_default(1.5)
+ .set_description(""),
+
+ Option("mds_reconnect_timeout", Option::TYPE_FLOAT, Option::LEVEL_ADVANCED)
+ .set_default(45)
+ .set_description(""),
+
+ Option("mds_tick_interval", Option::TYPE_FLOAT, Option::LEVEL_ADVANCED)
+ .set_default(5)
+ .set_description(""),
+
+ Option("mds_dirstat_min_interval", Option::TYPE_FLOAT, Option::LEVEL_ADVANCED)
+ .set_default(1)
+ .set_description(""),
+
+ Option("mds_scatter_nudge_interval", Option::TYPE_FLOAT, Option::LEVEL_ADVANCED)
+ .set_default(5)
+ .set_description(""),
+
+ Option("mds_client_prealloc_inos", Option::TYPE_INT, Option::LEVEL_ADVANCED)
+ .set_default(1000)
+ .set_description(""),
+
+ Option("mds_early_reply", Option::TYPE_BOOL, Option::LEVEL_ADVANCED)
+ .set_default(true)
+ .set_description(""),
+
+ Option("mds_default_dir_hash", Option::TYPE_INT, Option::LEVEL_ADVANCED)
+ .set_default(CEPH_STR_HASH_RJENKINS)
+ .set_description(""),
+
+ Option("mds_log_pause", Option::TYPE_BOOL, Option::LEVEL_ADVANCED)
+ .set_default(false)
+ .set_description(""),
+
+ Option("mds_log_skip_corrupt_events", Option::TYPE_BOOL, Option::LEVEL_ADVANCED)
+ .set_default(false)
+ .set_description(""),
+
+ Option("mds_log_max_events", Option::TYPE_INT, Option::LEVEL_ADVANCED)
+ .set_default(-1)
+ .set_description(""),
+
+ Option("mds_log_events_per_segment", Option::TYPE_INT, Option::LEVEL_ADVANCED)
+ .set_default(1024)
+ .set_description(""),
+
+ Option("mds_log_segment_size", Option::TYPE_INT, Option::LEVEL_ADVANCED)
+ .set_default(0)
+ .set_description(""),
+
+ Option("mds_log_max_segments", Option::TYPE_UINT, Option::LEVEL_ADVANCED)
+ .set_default(30)
+ .set_description(""),
+
+ Option("mds_log_max_expiring", Option::TYPE_INT, Option::LEVEL_ADVANCED)
+ .set_default(20)
+ .set_description(""),
+
+ Option("mds_bal_export_pin", Option::TYPE_BOOL, Option::LEVEL_ADVANCED)
+ .set_default(true)
+ .set_description(""),
+
+ Option("mds_bal_sample_interval", Option::TYPE_FLOAT, Option::LEVEL_ADVANCED)
+ .set_default(3.0)
+ .set_description(""),
+
+ Option("mds_bal_replicate_threshold", Option::TYPE_FLOAT, Option::LEVEL_ADVANCED)
+ .set_default(8000)
+ .set_description(""),
+
+ Option("mds_bal_unreplicate_threshold", Option::TYPE_FLOAT, Option::LEVEL_ADVANCED)
+ .set_default(0)
+ .set_description(""),
+
+ Option("mds_bal_frag", Option::TYPE_BOOL, Option::LEVEL_ADVANCED)
+ .set_default(true)
+ .set_description(""),
+
+ Option("mds_bal_split_size", Option::TYPE_INT, Option::LEVEL_ADVANCED)
+ .set_default(10000)
+ .set_description(""),
+
+ Option("mds_bal_split_rd", Option::TYPE_FLOAT, Option::LEVEL_ADVANCED)
+ .set_default(25000)
+ .set_description(""),
+
+ Option("mds_bal_split_wr", Option::TYPE_FLOAT, Option::LEVEL_ADVANCED)
+ .set_default(10000)
+ .set_description(""),
+
+ Option("mds_bal_split_bits", Option::TYPE_INT, Option::LEVEL_ADVANCED)
+ .set_default(3)
+ .set_description(""),
+
+ Option("mds_bal_merge_size", Option::TYPE_INT, Option::LEVEL_ADVANCED)
+ .set_default(50)
+ .set_description(""),
+
+ Option("mds_bal_interval", Option::TYPE_INT, Option::LEVEL_ADVANCED)
+ .set_default(10)
+ .set_description(""),
+
+ Option("mds_bal_fragment_interval", Option::TYPE_INT, Option::LEVEL_ADVANCED)
+ .set_default(5)
+ .set_description(""),
+
+ Option("mds_bal_fragment_size_max", Option::TYPE_INT, Option::LEVEL_ADVANCED)
+ .set_default(10000*10)
+ .set_description(""),
+
+ Option("mds_bal_fragment_fast_factor", Option::TYPE_FLOAT, Option::LEVEL_ADVANCED)
+ .set_default(1.5)
+ .set_description(""),
+
+ Option("mds_bal_idle_threshold", Option::TYPE_FLOAT, Option::LEVEL_ADVANCED)
+ .set_default(0)
+ .set_description(""),
+
+ Option("mds_bal_max", Option::TYPE_INT, Option::LEVEL_ADVANCED)
+ .set_default(-1)
+ .set_description(""),
+
+ Option("mds_bal_max_until", Option::TYPE_INT, Option::LEVEL_ADVANCED)
+ .set_default(-1)
+ .set_description(""),
+
+ Option("mds_bal_mode", Option::TYPE_INT, Option::LEVEL_ADVANCED)
+ .set_default(0)
+ .set_description(""),
+
+ Option("mds_bal_min_rebalance", Option::TYPE_FLOAT, Option::LEVEL_ADVANCED)
+ .set_default(.1)
+ .set_description(""),
+
+ Option("mds_bal_min_start", Option::TYPE_FLOAT, Option::LEVEL_ADVANCED)
+ .set_default(.2)
+ .set_description(""),
+
+ Option("mds_bal_need_min", Option::TYPE_FLOAT, Option::LEVEL_ADVANCED)
+ .set_default(.8)
+ .set_description(""),
+
+ Option("mds_bal_need_max", Option::TYPE_FLOAT, Option::LEVEL_ADVANCED)
+ .set_default(1.2)
+ .set_description(""),
+
+ Option("mds_bal_midchunk", Option::TYPE_FLOAT, Option::LEVEL_ADVANCED)
+ .set_default(.3)
+ .set_description(""),
+
+ Option("mds_bal_minchunk", Option::TYPE_FLOAT, Option::LEVEL_ADVANCED)
+ .set_default(.001)
+ .set_description(""),
+
+ Option("mds_bal_target_decay", Option::TYPE_FLOAT, Option::LEVEL_ADVANCED)
+ .set_default(10.0)
+ .set_description(""),
+
+ Option("mds_replay_interval", Option::TYPE_FLOAT, Option::LEVEL_ADVANCED)
+ .set_default(1.0)
+ .set_description(""),
+
+ Option("mds_shutdown_check", Option::TYPE_INT, Option::LEVEL_ADVANCED)
+ .set_default(0)
+ .set_description(""),
+
+ Option("mds_thrash_exports", Option::TYPE_INT, Option::LEVEL_ADVANCED)
+ .set_default(0)
+ .set_description(""),
+
+ Option("mds_thrash_fragments", Option::TYPE_INT, Option::LEVEL_ADVANCED)
+ .set_default(0)
+ .set_description(""),
+
+ Option("mds_dump_cache_on_map", Option::TYPE_BOOL, Option::LEVEL_ADVANCED)
+ .set_default(false)
+ .set_description(""),
+
+ Option("mds_dump_cache_after_rejoin", Option::TYPE_BOOL, Option::LEVEL_ADVANCED)
+ .set_default(false)
+ .set_description(""),
+
+ Option("mds_verify_scatter", Option::TYPE_BOOL, Option::LEVEL_ADVANCED)
+ .set_default(false)
+ .set_description(""),
+
+ Option("mds_debug_scatterstat", Option::TYPE_BOOL, Option::LEVEL_DEV)
+ .set_default(false)
+ .set_description(""),
+
+ Option("mds_debug_frag", Option::TYPE_BOOL, Option::LEVEL_DEV)
+ .set_default(false)
+ .set_description(""),
+
+ Option("mds_debug_auth_pins", Option::TYPE_BOOL, Option::LEVEL_DEV)
+ .set_default(false)
+ .set_description(""),
+
+ Option("mds_debug_subtrees", Option::TYPE_BOOL, Option::LEVEL_DEV)
+ .set_default(false)
+ .set_description(""),
+
+ Option("mds_kill_mdstable_at", Option::TYPE_INT, Option::LEVEL_DEV)
+ .set_default(0)
+ .set_description(""),
+
+ Option("mds_kill_export_at", Option::TYPE_INT, Option::LEVEL_DEV)
+ .set_default(0)
+ .set_description(""),
+
+ Option("mds_kill_import_at", Option::TYPE_INT, Option::LEVEL_DEV)
+ .set_default(0)
+ .set_description(""),
+
+ Option("mds_kill_link_at", Option::TYPE_INT, Option::LEVEL_DEV)
+ .set_default(0)
+ .set_description(""),
+
+ Option("mds_kill_rename_at", Option::TYPE_INT, Option::LEVEL_DEV)
+ .set_default(0)
+ .set_description(""),
+
+ Option("mds_kill_openc_at", Option::TYPE_INT, Option::LEVEL_DEV)
+ .set_default(0)
+ .set_description(""),
+
+ Option("mds_kill_journal_at", Option::TYPE_INT, Option::LEVEL_DEV)
+ .set_default(0)
+ .set_description(""),
+
+ Option("mds_kill_journal_expire_at", Option::TYPE_INT, Option::LEVEL_DEV)
+ .set_default(0)
+ .set_description(""),
+
+ Option("mds_kill_journal_replay_at", Option::TYPE_INT, Option::LEVEL_DEV)
+ .set_default(0)
+ .set_description(""),
+
+ Option("mds_journal_format", Option::TYPE_UINT, Option::LEVEL_ADVANCED)
+ .set_default(1)
+ .set_description(""),
+
+ Option("mds_kill_create_at", Option::TYPE_INT, Option::LEVEL_DEV)
+ .set_default(0)
+ .set_description(""),
+
+ Option("mds_inject_traceless_reply_probability", Option::TYPE_FLOAT, Option::LEVEL_DEV)
+ .set_default(0)
+ .set_description(""),
+
+ Option("mds_wipe_sessions", Option::TYPE_BOOL, Option::LEVEL_ADVANCED)
+ .set_default(false)  // bool literal for a TYPE_BOOL option (was the integer 0)
+ .set_description(""),
+
+ Option("mds_wipe_ino_prealloc", Option::TYPE_BOOL, Option::LEVEL_ADVANCED)
+ .set_default(false)  // bool literal for a TYPE_BOOL option (was the integer 0)
+ .set_description(""),
+
+ Option("mds_skip_ino", Option::TYPE_INT, Option::LEVEL_ADVANCED)
+ .set_default(0)
+ .set_description(""),
+
+ Option("mds_standby_for_name", Option::TYPE_STR, Option::LEVEL_ADVANCED)
+ .set_default("")
+ .set_description(""),
+
+ Option("mds_standby_for_rank", Option::TYPE_INT, Option::LEVEL_ADVANCED)
+ .set_default(-1)
+ .set_description(""),
+
+ Option("mds_standby_for_fscid", Option::TYPE_INT, Option::LEVEL_ADVANCED)
+ .set_default(-1)
+ .set_description(""),
+
+ Option("mds_standby_replay", Option::TYPE_BOOL, Option::LEVEL_ADVANCED)
+ .set_default(false)
+ .set_description(""),
+
+ Option("mds_enable_op_tracker", Option::TYPE_BOOL, Option::LEVEL_ADVANCED)
+ .set_default(true)
+ .set_description(""),
+
+ Option("mds_op_history_size", Option::TYPE_UINT, Option::LEVEL_ADVANCED)
+ .set_default(20)
+ .set_description(""),
+
+ Option("mds_op_history_duration", Option::TYPE_UINT, Option::LEVEL_ADVANCED)
+ .set_default(600)
+ .set_description(""),
+
+ Option("mds_op_complaint_time", Option::TYPE_FLOAT, Option::LEVEL_ADVANCED)
+ .set_default(30)
+ .set_description(""),
+
+ Option("mds_op_log_threshold", Option::TYPE_INT, Option::LEVEL_ADVANCED)
+ .set_default(5)
+ .set_description(""),
+
+ Option("mds_snap_min_uid", Option::TYPE_UINT, Option::LEVEL_ADVANCED)
+ .set_default(0)
+ .set_description(""),
+
+ Option("mds_snap_max_uid", Option::TYPE_UINT, Option::LEVEL_ADVANCED)
+ .set_default(4294967294)
+ .set_description(""),
+
+ Option("mds_snap_rstat", Option::TYPE_BOOL, Option::LEVEL_ADVANCED)
+ .set_default(false)
+ .set_description(""),
+
+ Option("mds_verify_backtrace", Option::TYPE_UINT, Option::LEVEL_ADVANCED)
+ .set_default(1)
+ .set_description(""),
+
+ Option("mds_max_completed_flushes", Option::TYPE_UINT, Option::LEVEL_ADVANCED)
+ .set_default(100000)
+ .set_description(""),
+
+ Option("mds_max_completed_requests", Option::TYPE_UINT, Option::LEVEL_ADVANCED)
+ .set_default(100000)
+ .set_description(""),
+
+ Option("mds_action_on_write_error", Option::TYPE_UINT, Option::LEVEL_ADVANCED)
+ .set_default(1)
+ .set_description(""),
+
+ Option("mds_mon_shutdown_timeout", Option::TYPE_FLOAT, Option::LEVEL_ADVANCED)
+ .set_default(5)
+ .set_description(""),
+
+ Option("mds_max_purge_files", Option::TYPE_UINT, Option::LEVEL_ADVANCED)
+ .set_default(64)
+ .set_description(""),
+
+ Option("mds_max_purge_ops", Option::TYPE_UINT, Option::LEVEL_ADVANCED)
+ .set_default(8192)
+ .set_description(""),
+
+ Option("mds_max_purge_ops_per_pg", Option::TYPE_FLOAT, Option::LEVEL_ADVANCED)
+ .set_default(0.5)
+ .set_description(""),
+
+ Option("mds_purge_queue_busy_flush_period", Option::TYPE_FLOAT, Option::LEVEL_ADVANCED)
+ .set_default(1.0)
+ .set_description(""),
+
+ Option("mds_root_ino_uid", Option::TYPE_INT, Option::LEVEL_ADVANCED)
+ .set_default(0)
+ .set_description(""),
+
+ Option("mds_root_ino_gid", Option::TYPE_INT, Option::LEVEL_ADVANCED)
+ .set_default(0)
+ .set_description(""),
+
+ Option("mds_max_scrub_ops_in_progress", Option::TYPE_INT, Option::LEVEL_ADVANCED)
+ .set_default(5)
+ .set_description(""),
+
+ Option("mds_damage_table_max_entries", Option::TYPE_INT, Option::LEVEL_ADVANCED)
+ .set_default(10000)
+ .set_description(""),
+
+ Option("mds_client_writeable_range_max_inc_objs", Option::TYPE_UINT, Option::LEVEL_ADVANCED)
+ .set_default(1024)
+ .set_description(""),
+};
+
+std::vector<Option> mds_client_options = {
+ Option("client_cache_size", Option::TYPE_INT, Option::LEVEL_ADVANCED)
+ .set_default(16384)
+ .set_description(""),
+
+ Option("client_cache_mid", Option::TYPE_FLOAT, Option::LEVEL_ADVANCED)
+ .set_default(.75)
+ .set_description(""),
+
+ Option("client_use_random_mds", Option::TYPE_BOOL, Option::LEVEL_ADVANCED)
+ .set_default(false)
+ .set_description(""),
+
+ Option("client_mount_timeout", Option::TYPE_FLOAT, Option::LEVEL_ADVANCED)
+ .set_default(300.0)
+ .set_description(""),
+
+ Option("client_tick_interval", Option::TYPE_FLOAT, Option::LEVEL_ADVANCED)
+ .set_default(1.0)
+ .set_description(""),
+
+ Option("client_trace", Option::TYPE_STR, Option::LEVEL_ADVANCED)
+ .set_default("")
+ .set_description(""),
+
+ Option("client_readahead_min", Option::TYPE_INT, Option::LEVEL_ADVANCED)
+ .set_default(128*1024)
+ .set_description(""),
+
+ Option("client_readahead_max_bytes", Option::TYPE_INT, Option::LEVEL_ADVANCED)
+ .set_default(0)
+ .set_description(""),
+
+ Option("client_readahead_max_periods", Option::TYPE_INT, Option::LEVEL_ADVANCED)
+ .set_default(4)
+ .set_description(""),
+
+ Option("client_reconnect_stale", Option::TYPE_BOOL, Option::LEVEL_ADVANCED)
+ .set_default(false)
+ .set_description(""),
+
+ Option("client_snapdir", Option::TYPE_STR, Option::LEVEL_ADVANCED)
+ .set_default(".snap")
+ .set_description(""),
+
+ Option("client_mountpoint", Option::TYPE_STR, Option::LEVEL_ADVANCED)
+ .set_default("/")
+ .set_description(""),
+
+ Option("client_mount_uid", Option::TYPE_INT, Option::LEVEL_ADVANCED)
+ .set_default(-1)
+ .set_description(""),
+
+ Option("client_mount_gid", Option::TYPE_INT, Option::LEVEL_ADVANCED)
+ .set_default(-1)
+ .set_description(""),
+
+ Option("client_notify_timeout", Option::TYPE_INT, Option::LEVEL_ADVANCED)
+ .set_default(10)
+ .set_description(""),
+
+ Option("osd_client_watch_timeout", Option::TYPE_INT, Option::LEVEL_ADVANCED)
+ .set_default(30)
+ .set_description(""),
+
+ Option("client_caps_release_delay", Option::TYPE_INT, Option::LEVEL_ADVANCED)
+ .set_default(5)
+ .set_description(""),
+
+ Option("client_quota_df", Option::TYPE_BOOL, Option::LEVEL_ADVANCED)
+ .set_default(true)
+ .set_description(""),
+
+ Option("client_oc", Option::TYPE_BOOL, Option::LEVEL_ADVANCED)
+ .set_default(true)
+ .set_description(""),
+
+ Option("client_oc_size", Option::TYPE_INT, Option::LEVEL_ADVANCED)
+ .set_default(1024*1024* 200)
+ .set_description(""),
+
+ Option("client_oc_max_dirty", Option::TYPE_INT, Option::LEVEL_ADVANCED)
+ .set_default(1024*1024* 100)
+ .set_description(""),
+
+ Option("client_oc_target_dirty", Option::TYPE_INT, Option::LEVEL_ADVANCED)
+ .set_default(1024*1024* 8)
+ .set_description(""),
+
+ Option("client_oc_max_dirty_age", Option::TYPE_FLOAT, Option::LEVEL_ADVANCED)
+ .set_default(5.0)
+ .set_description(""),
+
+ Option("client_oc_max_objects", Option::TYPE_INT, Option::LEVEL_ADVANCED)
+ .set_default(1000)
+ .set_description(""),
+
+ Option("client_debug_getattr_caps", Option::TYPE_BOOL, Option::LEVEL_DEV)
+ .set_default(false)
+ .set_description(""),
+
+ Option("client_debug_force_sync_read", Option::TYPE_BOOL, Option::LEVEL_DEV)
+ .set_default(false)
+ .set_description(""),
+
+ Option("client_debug_inject_tick_delay", Option::TYPE_INT, Option::LEVEL_DEV)
+ .set_default(0)
+ .set_description(""),
+
+ Option("client_max_inline_size", Option::TYPE_UINT, Option::LEVEL_ADVANCED)
+ .set_default(4096)
+ .set_description(""),
+
+ Option("client_inject_release_failure", Option::TYPE_BOOL, Option::LEVEL_DEV)
+ .set_default(false)
+ .set_description(""),
+
+ Option("client_inject_fixed_oldest_tid", Option::TYPE_BOOL, Option::LEVEL_DEV)
+ .set_default(false)
+ .set_description(""),
+
+ Option("client_metadata", Option::TYPE_STR, Option::LEVEL_ADVANCED)
+ .set_default("")
+ .set_description(""),
+
+ Option("client_acl_type", Option::TYPE_STR, Option::LEVEL_ADVANCED)
+ .set_default("")
+ .set_description(""),
+
+ Option("client_permissions", Option::TYPE_BOOL, Option::LEVEL_ADVANCED)
+ .set_default(true)
+ .set_description(""),
+
+ Option("client_dirsize_rbytes", Option::TYPE_BOOL, Option::LEVEL_ADVANCED)
+ .set_default(true)
+ .set_description(""),
+
+ Option("fuse_use_invalidate_cb", Option::TYPE_BOOL, Option::LEVEL_ADVANCED)
+ .set_default(true)
+ .set_description(""),
+
+ Option("fuse_disable_pagecache", Option::TYPE_BOOL, Option::LEVEL_ADVANCED)
+ .set_default(false)
+ .set_description(""),
+
+ Option("fuse_allow_other", Option::TYPE_BOOL, Option::LEVEL_ADVANCED)
+ .set_default(true)
+ .set_description(""),
+
+ Option("fuse_default_permissions", Option::TYPE_BOOL, Option::LEVEL_ADVANCED)
+ .set_default(false)
+ .set_description(""),
+
+ Option("fuse_big_writes", Option::TYPE_BOOL, Option::LEVEL_ADVANCED)
+ .set_default(true)
+ .set_description(""),
+
+ Option("fuse_atomic_o_trunc", Option::TYPE_BOOL, Option::LEVEL_ADVANCED)
+ .set_default(true)
+ .set_description(""),
+
+ Option("fuse_debug", Option::TYPE_BOOL, Option::LEVEL_ADVANCED)
+ .set_default(false)
+ .set_description(""),
+
+ Option("fuse_multithreaded", Option::TYPE_BOOL, Option::LEVEL_ADVANCED)
+ .set_default(true)
+ .set_description(""),
+
+ Option("fuse_require_active_mds", Option::TYPE_BOOL, Option::LEVEL_ADVANCED)
+ .set_default(true)
+ .set_description(""),
+
+ Option("fuse_syncfs_on_mksnap", Option::TYPE_BOOL, Option::LEVEL_ADVANCED)
+ .set_default(true)
+ .set_description(""),
+
+ Option("fuse_set_user_groups", Option::TYPE_BOOL, Option::LEVEL_ADVANCED)
+ .set_default(false)
+ .set_description(""),
+
+ Option("client_try_dentry_invalidate", Option::TYPE_BOOL, Option::LEVEL_ADVANCED)
+ .set_default(true)
+ .set_description(""),
+
+ Option("client_die_on_failed_remount", Option::TYPE_BOOL, Option::LEVEL_ADVANCED)
+ .set_default(true)
+ .set_description(""),
+
+ Option("client_check_pool_perm", Option::TYPE_BOOL, Option::LEVEL_ADVANCED)
+ .set_default(true)
+ .set_description(""),
+
+ Option("client_use_faked_inos", Option::TYPE_BOOL, Option::LEVEL_ADVANCED)
+ .set_default(false)
+ .set_description(""),
+
+ Option("client_mds_namespace", Option::TYPE_STR, Option::LEVEL_ADVANCED)
+ .set_default("")
+ .set_description(""),
+};
+
+
+// Assemble the full option table: start from a copy of global_options, then
+// append a copy of every entry of each per-service table, with the service
+// name recorded on the copy via Option::add_service().
+static std::vector<Option> build_options()
+{
+  std::vector<Option> all_options = global_options;
+
+  // Append copies of `group`'s entries, each tagged with service `svc`.
+  // The source table itself is left untouched.
+  auto ingest = [&all_options](std::vector<Option> &group, const char* svc) {
+    for (const Option &src : group) {
+      all_options.push_back(src);
+      all_options.back().add_service(svc);
+    }
+  };
+
+  ingest(rgw_options, "rgw");
+  ingest(rbd_options, "rbd");
+  ingest(mds_options, "mds");
+  ingest(mds_client_options, "mds_client");
+
+  return all_options;
+}
+
+const std::vector<Option> ceph_options = build_options();
--- /dev/null
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#pragma once
+
+#include <string>
+#include <list>
+#include <boost/variant.hpp>
+#include "include/str_list.h"
+#include "msg/msg_types.h"
+#include "include/uuid.h"
+
+/**
+ * Schema entry describing one configuration option: its name, value type,
+ * audience level, default value(s), bounds and advisory metadata.
+ *
+ * Every setter returns *this, so a definition can be written as one chain
+ * of calls, e.g. Option(...).set_default(...).set_description(...).
+ */
+struct Option {
+  // Kinds of value an option can hold.
+  enum type_t {
+    TYPE_UINT,
+    TYPE_INT,
+    TYPE_STR,
+    TYPE_FLOAT,
+    TYPE_BOOL,
+    TYPE_ADDR,
+    TYPE_UUID,
+  };
+
+  // Name of the C++ type used to store a value of kind t, or "unknown" if
+  // t is not one of the enumerators above.
+  const char *type_to_str(type_t t) const {
+    switch (t) {
+    case TYPE_UINT: return "uint64_t";
+    case TYPE_INT: return "int64_t";
+    case TYPE_STR: return "std::string";
+    case TYPE_FLOAT: return "double";
+    case TYPE_BOOL: return "bool";
+    case TYPE_ADDR: return "entity_addr_t";
+    case TYPE_UUID: return "uuid_d";
+    default: return "unknown";
+    }
+  }
+
+  /**
+   * Basic: for users, configures some externally visible functional aspect
+   * Advanced: for users, configures some internal behaviour
+   * Development: not for users.  May be dangerous, may not be documented.
+   */
+  enum level_t {
+    LEVEL_BASIC,
+    LEVEL_ADVANCED,
+    LEVEL_DEV,
+  };
+
+  // Human-readable name of level l, or "unknown" if l is not one of the
+  // enumerators above.
+  const char *level_to_str(level_t l) const {
+    switch (l) {
+    case LEVEL_BASIC: return "basic";
+    case LEVEL_ADVANCED: return "advanced";
+    case LEVEL_DEV: return "developer";
+    default: return "unknown";
+    }
+  }
+
+  // Storage for any option value.  boost::blank is the "not set" state.
+  using value_t = boost::variant<
+    boost::blank,
+    std::string,
+    uint64_t,
+    int64_t,
+    double,
+    bool,
+    entity_addr_t,
+    uuid_d>;
+
+  // Identity of the option; fixed at construction.
+  const std::string name;
+  const type_t type;
+  const level_t level;
+
+  // Short and long human-readable descriptions.
+  std::string desc;
+  std::string long_desc;
+
+  // Default value (always of the declared type, see the constructor) and
+  // the value written by set_daemon_default().  The constructor does not
+  // assign daemon_value.
+  value_t value;
+  value_t daemon_value;
+
+  // Items like mon, osd, rgw, rbd, ceph-fuse.  This is advisory metadata
+  // for presentation layers (like web dashboards, or generated docs), so that
+  // they know which options to display where.
+  // Additionally: "common" for settings that exist in any Ceph code.  Do
+  // not use common for settings that are just shared some places: for those
+  // places, list them.
+  std::list<const char*> services;
+
+  // Topics like:
+  // "service": a catchall for the boring stuff like log/asok paths.
+  // "network"
+  // "performance": a setting that may need adjustment depending on
+  //                environment/workload to get best performance.
+  std::list<const char*> tags;
+
+  // Names recorded by add_see_also().
+  std::list<const char*> see_also;
+
+  // Bounds written by set_min()/set_min_max(), and the list written by
+  // set_enum_allowed().  The constructor does not assign min or max.
+  value_t min, max;
+  std::list<std::string> enum_allowed;
+
+  // Explicit "safe" marker, see set_safe() and is_safe().
+  bool safe;
+
+  /**
+   * Return nonzero and set second argument to error string if the
+   * value is invalid.
+   *
+   * These callbacks are more than just validators, as they can also
+   * modify the value as it passes through.
+   */
+  using validator_fn_t = std::function<int(std::string *, std::string *)>;
+  validator_fn_t validator;
+
+  /**
+   * @param name option name
+   * @param t    value type; any value outside type_t aborts
+   * @param l    audience level
+   */
+  Option(std::string const &name, type_t t, level_t l)
+    : name(name), type(t), level(l), safe(false)
+  {
+    // While value_t is nullable (via boost::blank), we don't ever
+    // want it set that way in an Option instance: within an instance,
+    // the type of ::value should always match the declared type.
+    switch (type) {
+    case TYPE_INT:
+      value = int64_t(0);
+      break;
+    case TYPE_UINT:
+      value = uint64_t(0);
+      break;
+    case TYPE_STR:
+      value = std::string("");
+      break;
+    case TYPE_FLOAT:
+      value = 0.0;
+      break;
+    case TYPE_BOOL:
+      value = false;
+      break;
+    case TYPE_ADDR:
+      value = entity_addr_t();
+      break;
+    case TYPE_UUID:
+      value = uuid_d();
+      break;
+    default:
+      ceph_abort();
+    }
+  }
+
+  void dump_value(const char *field_name, const value_t &v, Formatter *f) const;
+
+  // Validate and potentially modify incoming string value
+  int pre_validate(std::string *new_value, std::string *err) const;
+
+  // Validate properly typed value against bounds
+  int validate(const Option::value_t &new_value, std::string *err) const;
+
+  // const char * must be explicit to avoid it being treated as an int
+  Option& set_value(value_t& v, const char *new_value) {
+    v = std::string(new_value);
+    return *this;
+  }
+
+  // bool is an integer, but we don't think so. teach it the hard way.
+  template<typename T>
+  using is_not_integer = std::enable_if<!std::is_integral<T>::value ||
+                                        std::is_same<T, bool>::value, int>;
+  template<typename T>
+  using is_integer = std::enable_if<std::is_integral<T>::value &&
+                                    !std::is_same<T, bool>::value, int>;
+
+  // Non-integer values (and bool) are stored as given.
+  template<typename T, typename is_not_integer<T>::type = 0>
+  Option& set_value(value_t& v, const T& new_value) {
+    v = new_value;
+    return *this;
+  }
+
+  // For potentially ambiguous types, inspect Option::type and
+  // do some casting.  This is necessary to make sure that setting
+  // a float option to "0" actually sets the double part of variant.
+  // An integer given to an option of any other type aborts.
+  template<typename T, typename is_integer<T>::type = 0>
+  Option& set_value(value_t& v, T new_value) {
+    if (type == TYPE_INT) {
+      v = int64_t(new_value);
+    } else if (type == TYPE_UINT) {
+      v = uint64_t(new_value);
+    } else if (type == TYPE_FLOAT) {
+      v = double(new_value);
+    } else if (type == TYPE_BOOL) {
+      v = bool(new_value);
+    } else {
+      std::cerr << "Bad type in set_value: " << name << ": "
+                << typeid(T).name() << std::endl;
+      ceph_abort();
+    }
+    return *this;
+  }
+
+  // Set the default value.
+  template<typename T>
+  Option& set_default(const T& v) {
+    return set_value(value, v);
+  }
+
+  // Set daemon_value.
+  template<typename T>
+  Option& set_daemon_default(const T& v) {
+    return set_value(daemon_value, v);
+  }
+
+  // Append one tag, or a list of tags.  Only the pointers are stored.
+  Option& add_tag(const char* tag) {
+    tags.push_back(tag);
+    return *this;
+  }
+  Option& add_tag(std::initializer_list<const char*> ts) {
+    tags.insert(tags.end(), ts);
+    return *this;
+  }
+
+  // Append one service, or a list of services.  Only the pointers are stored.
+  Option& add_service(const char* service) {
+    services.push_back(service);
+    return *this;
+  }
+  Option& add_service(std::initializer_list<const char*> ss) {
+    services.insert(services.end(), ss);
+    return *this;
+  }
+
+  // Append one see-also entry, or a list of them.  Only the pointers are
+  // stored.
+  Option& add_see_also(const char* t) {
+    see_also.push_back(t);
+    return *this;
+  }
+  Option& add_see_also(std::initializer_list<const char*> ts) {
+    see_also.insert(see_also.end(), ts);
+    return *this;
+  }
+
+  Option& set_description(const char* new_desc) {
+    desc = new_desc;
+    return *this;
+  }
+  Option& set_long_description(const char* new_desc) {
+    long_desc = new_desc;
+    return *this;
+  }
+
+  // Set the lower bound only.
+  template<typename T>
+  Option& set_min(const T& mi) {
+    set_value(min, mi);
+    return *this;
+  }
+
+  // Set both bounds.
+  template<typename T>
+  Option& set_min_max(const T& mi, const T& ma) {
+    set_value(min, mi);
+    set_value(max, ma);
+    return *this;
+  }
+
+  // Replace enum_allowed with a copy of `allowed`.  Taken by reference so
+  // the list is copied once rather than twice.
+  Option& set_enum_allowed(const std::list<std::string>& allowed)
+  {
+    enum_allowed = allowed;
+    return *this;
+  }
+
+  // Mark the option as safe, see is_safe().
+  Option &set_safe() {
+    safe = true;
+    return *this;
+  }
+
+  Option &set_validator(const validator_fn_t &validator_)
+  {
+    validator = validator_;
+    return *this;
+  }
+
+  void dump(Formatter *f) const;
+
+  /**
+   * A crude indicator of whether the value may be
+   * modified safely at runtime -- should be replaced
+   * with proper locking!
+   *
+   * True when set_safe() was called, or for the bool, int, uint and float
+   * types.
+   */
+  bool is_safe() const
+  {
+    return safe || type == TYPE_BOOL || type == TYPE_INT
+                || type == TYPE_UINT || type == TYPE_FLOAT;
+  }
+};
+
+extern const std::vector<Option> ceph_options;
+
pair<uint64_t, T> last;
pair<uint64_t, T> cur;
avg_tracker() : last(0, 0), cur(0, 0) {}
- T avg() const {
+ T current_avg() const {
if (cur.first == last.first)
- return cur.first ?
- cur.second / cur.first :
- 0; // no change, report avg over all time
+ return 0;
return (cur.second - last.second) / (cur.first - last.first);
}
void consume_next(const pair<uint64_t, T> &next) {
--- /dev/null
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+
+
+/**
+ * This header describes the subsystems (each one gets a "--debug-<subsystem>"
+ * log verbosity setting), along with their default verbosities.
+ */
+
+DEFAULT_SUBSYS(0, 5)
+SUBSYS(lockdep, 0, 1)
+SUBSYS(context, 0, 1)
+SUBSYS(crush, 1, 1)
+SUBSYS(mds, 1, 5)
+SUBSYS(mds_balancer, 1, 5)
+SUBSYS(mds_locker, 1, 5)
+SUBSYS(mds_log, 1, 5)
+SUBSYS(mds_log_expire, 1, 5)
+SUBSYS(mds_migrator, 1, 5)
+SUBSYS(buffer, 0, 1)
+SUBSYS(timer, 0, 1)
+SUBSYS(filer, 0, 1)
+SUBSYS(striper, 0, 1)
+SUBSYS(objecter, 0, 1)
+SUBSYS(rados, 0, 5)
+SUBSYS(rbd, 0, 5)
+SUBSYS(rbd_mirror, 0, 5)
+SUBSYS(rbd_replay, 0, 5)
+SUBSYS(journaler, 0, 5)
+SUBSYS(objectcacher, 0, 5)
+SUBSYS(client, 0, 5)
+SUBSYS(osd, 1, 5)
+SUBSYS(optracker, 0, 5)
+SUBSYS(objclass, 0, 5)
+SUBSYS(filestore, 1, 3)
+SUBSYS(journal, 1, 3)
+SUBSYS(ms, 0, 5)
+SUBSYS(mon, 1, 5)
+SUBSYS(monc, 0, 10)
+SUBSYS(paxos, 1, 5)
+SUBSYS(tp, 0, 5)
+SUBSYS(auth, 1, 5)
+SUBSYS(crypto, 1, 5)
+SUBSYS(finisher, 1, 1)
+SUBSYS(heartbeatmap, 1, 5)
+SUBSYS(perfcounter, 1, 5)
+SUBSYS(rgw, 1, 5) // log level for the Rados gateway
+SUBSYS(civetweb, 1, 10)
+SUBSYS(javaclient, 1, 5)
+SUBSYS(asok, 1, 5)
+SUBSYS(throttle, 1, 1)
+SUBSYS(refs, 0, 0)
+SUBSYS(xio, 1, 5)
+SUBSYS(compressor, 1, 5)
+SUBSYS(bluestore, 1, 5)
+SUBSYS(bluefs, 1, 5)
+SUBSYS(bdev, 1, 3)
+SUBSYS(kstore, 1, 5)
+SUBSYS(rocksdb, 4, 5)
+SUBSYS(leveldb, 4, 5)
+SUBSYS(memdb, 4, 5)
+SUBSYS(kinetic, 1, 5)
+SUBSYS(fuse, 1, 5)
+SUBSYS(mgr, 1, 5)
+SUBSYS(mgrc, 1, 5)
+SUBSYS(dpdk, 1, 5)
+SUBSYS(eventtrace, 1, 5)
return COMP_ALG_ZSTD;
if (s == "lz4")
return COMP_ALG_LZ4;
- if (s == "")
+ if (s == "" || s == "none")
return COMP_ALG_NONE;
return boost::optional<CompressionAlgorithm>();
if (crush.get_rule_name(i))
print_rule_name(out, i, crush);
out << " {\n";
- out << "\truleset " << crush.get_rule_mask_ruleset(i) << "\n";
+ out << "\tid " << i << "\n";
+ if (i != crush.get_rule_mask_ruleset(i)) {
+ out << "\t# WARNING: ruleset " << crush.get_rule_mask_ruleset(i) << " != id " << i << "; this will not recompile to the same map\n";
+ }
switch (crush.get_rule_mask_type(i)) {
case CEPH_PG_TYPE_REPLICATED:
int CrushCompiler::parse_rule(iter_t const& i)
{
- crush.populate_classes();
-
int start; // rule name is optional!
string rname = string_node(i->children[1]);
start = 3;
}
- int ruleset = int_node(i->children[start]);
+ int ruleno = int_node(i->children[start]);
string tname = string_node(i->children[start+2]);
int type;
int steps = i->children.size() - start - 8;
//err << "num steps " << steps << std::endl;
-
- int ruleno = crush.add_rule(steps, ruleset, type, minsize, maxsize, -1);
+
+ if (crush.rule_exists(ruleno)) {
+ err << "rule " << ruleno << " already exists" << std::endl;
+ return -1;
+ }
+ int r = crush.add_rule(ruleno, steps, type, minsize, maxsize);
+ if (r != ruleno) {
+ err << "unable to add rule id " << ruleno << " for rule '" << rname
+ << "'" << std::endl;
+ return -1;
+ }
if (rname.length()) {
crush.set_rule_name(ruleno, rname.c_str());
rule_id[rname] = ruleno;
int CrushCompiler::parse_crush(iter_t const& i)
{
find_used_bucket_ids(i);
-
+ bool saw_rule = false;
for (iter_t p = i->children.begin(); p != i->children.end(); p++) {
int r = 0;
switch (p->value.id().to_long()) {
case crush_grammar::_bucket_type:
r = parse_bucket_type(p);
break;
- case crush_grammar::_bucket:
+ case crush_grammar::_bucket:
+ if (saw_rule) {
+ err << "buckets must be defined before rules" << std::endl;
+ return -1;
+ }
r = parse_bucket(p);
break;
- case crush_grammar::_crushrule:
+ case crush_grammar::_crushrule:
+ if (!saw_rule) {
+ saw_rule = true;
+ crush.populate_classes();
+ }
r = parse_rule(p);
break;
case crush_grammar::_choose_args:
int max_id;
public:
CrushWalker(const CrushWrapper *crush, unsigned max_id)
- : Parent(crush), max_id(max_id) {}
+ : Parent(crush, CrushTreeDumper::name_map_t()), max_id(max_id) {}
void dump_item(const CrushTreeDumper::Item &qi, DumbFormatter *) override {
int type = -1;
if (qi.is_bucket()) {
// and see if the maps is also able to handle straying OSDs, whose id >= 0.
// "ceph osd tree" will try to print them, even they are not listed in the
// crush map.
- crush_walker.dump_item(CrushTreeDumper::Item(0, 0, 0), NULL);
+ crush_walker.dump_item(CrushTreeDumper::Item(0, 0, 0, 0), NULL);
} catch (const BadCrushMap& e) {
err << e.what() << ": item#" << e.item << std::endl;
return false;
#define CRUSH_TREE_DUMPER_H
#include "CrushWrapper.h"
+#include "include/stringify.h"
/**
* CrushTreeDumper:
struct Item {
int id;
+ int parent;
int depth;
float weight;
list<int> children;
- Item() : id(0), depth(0), weight(0) {}
- Item(int i, int d, float w) : id(i), depth(d), weight(w) {}
+ Item() : id(0), parent(0), depth(0), weight(0) {}
+ Item(int i, int p, int d, float w) : id(i), parent(p), depth(d), weight(w) {}
bool is_bucket() const { return id < 0; }
};
template <typename F>
class Dumper : public list<Item> {
public:
- explicit Dumper(const CrushWrapper *crush_) : crush(crush_) {
+ explicit Dumper(const CrushWrapper *crush_,
+ const name_map_t& weight_set_names_)
+ : crush(crush_), weight_set_names(weight_set_names_) {
crush->find_nonshadow_roots(roots);
root = roots.begin();
}
+ explicit Dumper(const CrushWrapper *crush_,
+ const name_map_t& weight_set_names_,
+ bool show_shadow)
+ : crush(crush_), weight_set_names(weight_set_names_) {
+ if (show_shadow) {
+ crush->find_roots(roots);
+ } else {
+ crush->find_nonshadow_roots(roots);
+ }
+ root = roots.begin();
+ }
virtual ~Dumper() {}
++root;
if (root == roots.end())
return false;
- push_back(Item(*root, 0, crush->get_bucket_weightf(*root)));
+ push_back(Item(*root, 0, 0, crush->get_bucket_weightf(*root)));
++root;
}
int id = crush->get_bucket_item(qi.id, k);
if (should_dump(id)) {
qi.children.push_back(id);
- push_front(Item(id, qi.depth + 1,
+ push_front(Item(id, qi.id, qi.depth + 1,
crush->get_bucket_item_weightf(qi.id, k)));
}
}
protected:
const CrushWrapper *crush;
+ const name_map_t &weight_set_names;
private:
set<int> touched;
};
inline void dump_item_fields(const CrushWrapper *crush,
+ const name_map_t& weight_set_names,
const Item &qi, Formatter *f) {
f->dump_int("id", qi.id);
const char *c = crush->get_item_class(qi.id);
- if (!c)
- c = "";
- f->dump_string("device_class", c);
+ if (c)
+ f->dump_string("device_class", c);
if (qi.is_bucket()) {
int type = crush->get_bucket_type(qi.id);
f->dump_string("name", crush->get_item_name(qi.id));
f->dump_float("crush_weight", qi.weight);
f->dump_unsigned("depth", qi.depth);
}
+ if (qi.parent < 0) {
+ f->open_object_section("pool_weights");
+ for (auto& p : crush->choose_args) {
+ const crush_choose_arg_map& cmap = p.second;
+ int bidx = -1 - qi.parent;
+ const crush_bucket *b = crush->get_bucket(qi.parent);
+ if (b &&
+ bidx < (int)cmap.size &&
+ cmap.args[bidx].weight_set &&
+ cmap.args[bidx].weight_set_size >= 1) {
+ int bpos;
+ for (bpos = 0;
+ bpos < (int)cmap.args[bidx].weight_set[0].size &&
+ b->items[bpos] != qi.id;
+ ++bpos) ;
+ string name;
+ if (p.first == CrushWrapper::DEFAULT_CHOOSE_ARGS) {
+ name = "(compat)";
+ } else {
+ auto q = weight_set_names.find(p.first);
+ name = q != weight_set_names.end() ? q->second :
+ stringify(p.first);
+ }
+ f->open_array_section(name.c_str());
+ for (unsigned opos = 0;
+ opos < cmap.args[bidx].weight_set_size;
+ ++opos) {
+ float w = (float)cmap.args[bidx].weight_set[opos].weights[bpos] /
+ (float)0x10000;
+ f->dump_float("weight", w);
+ }
+ f->close_section();
+ }
+ }
+ f->close_section();
+ }
}
inline void dump_bucket_children(const CrushWrapper *crush,
class FormattingDumper : public Dumper<Formatter> {
public:
- explicit FormattingDumper(const CrushWrapper *crush) : Dumper<Formatter>(crush) {}
+ explicit FormattingDumper(const CrushWrapper *crush,
+ const name_map_t& weight_set_names)
+ : Dumper<Formatter>(crush, weight_set_names) {}
+ explicit FormattingDumper(const CrushWrapper *crush,
+ const name_map_t& weight_set_names,
+ bool show_shadow)
+ : Dumper<Formatter>(crush, weight_set_names, show_shadow) {}
protected:
void dump_item(const Item &qi, Formatter *f) override {
}
virtual void dump_item_fields(const Item &qi, Formatter *f) {
- CrushTreeDumper::dump_item_fields(crush, qi, f);
+ CrushTreeDumper::dump_item_fields(crush, weight_set_names, qi, f);
}
virtual void dump_bucket_children(const Item &qi, Formatter *f) {
#include "common/debug.h"
#include "common/Formatter.h"
#include "common/errno.h"
+#include "common/TextTable.h"
#include "include/stringify.h"
#include "CrushWrapper.h"
return false;
}
+// Report whether any bucket in the map uses an algorithm other than
+// CRUSH_BUCKET_STRAW2.  Empty (null) bucket slots are ignored.
+bool CrushWrapper::has_non_straw2_buckets() const
+{
+  for (int slot = 0; slot < crush->max_buckets; ++slot) {
+    crush_bucket *bucket = crush->buckets[slot];
+    if (bucket && bucket->alg != CRUSH_BUCKET_STRAW2) {
+      return true;
+    }
+  }
+  return false;
+}
+
bool CrushWrapper::has_v2_rules() const
{
for (unsigned i=0; i<crush->max_rules; i++) {
return false;
if (choose_args.size() > 1)
return true;
+ if (choose_args.begin()->first != DEFAULT_CHOOSE_ARGS)
+ return true;
crush_choose_arg_map arg_map = choose_args.begin()->second;
for (__u32 i = 0; i < arg_map.size; i++) {
crush_choose_arg *arg = &arg_map.args[i];
}
}
-void CrushWrapper::find_nonshadow_roots(set<int>& roots) const
-{
- for (int i = 0; i < crush->max_buckets; i++) {
- if (!crush->buckets[i])
- continue;
- crush_bucket *b = crush->buckets[i];
- if (_search_item_exists(b->id))
- continue;
- const char *name = get_item_name(b->id);
- if (name && !is_valid_crush_name(name))
- continue;
- roots.insert(b->id);
- }
-}
-
bool CrushWrapper::subtree_contains(int root, int item) const
{
if (root == item)
ldout(cct, 5) << "_maybe_remove_last_instance removing name for item " << item << dendl;
name_map.erase(item);
have_rmaps = false;
- class_remove_item(item);
+ if (item >= 0 && !unlink_only) {
+ class_remove_item(item);
+ }
}
return true;
}
}
if (class_bucket.count(item) != 0)
class_bucket.erase(item);
+ class_remove_item(item);
return 0;
}
int CrushWrapper::remove_item(CephContext *cct, int item, bool unlink_only)
{
- ldout(cct, 5) << "remove_item " << item << (unlink_only ? " unlink_only":"") << dendl;
+ ldout(cct, 5) << "remove_item " << item
+ << (unlink_only ? " unlink_only":"") << dendl;
int ret = -ENOENT;
if (item < 0 && !unlink_only) {
crush_bucket *t = get_bucket(item);
if (IS_ERR(t)) {
- ldout(cct, 1) << "remove_item bucket " << item << " does not exist" << dendl;
+ ldout(cct, 1) << "remove_item bucket " << item << " does not exist"
+ << dendl;
return -ENOENT;
}
if (id == item) {
ldout(cct, 5) << "remove_item removing item " << item
<< " from bucket " << b->id << dendl;
+ for (auto& p : choose_args) {
+ // weight down each weight-set to 0 before we remove the item
+ vector<int> weightv(get_choose_args_positions(p.second), 0);
+ choose_args_adjust_item_weight(cct, p.second, item, weightv, nullptr);
+ }
bucket_remove_item(b, item);
adjust_item_weight(cct, b->id, b->weight);
ret = 0;
return false;
}
-int CrushWrapper::_remove_item_under(CephContext *cct, int item, int ancestor, bool unlink_only)
+int CrushWrapper::_remove_item_under(
+ CephContext *cct, int item, int ancestor, bool unlink_only)
{
ldout(cct, 5) << "_remove_item_under " << item << " under " << ancestor
<< (unlink_only ? " unlink_only":"") << dendl;
for (unsigned i=0; i<b->size; ++i) {
int id = b->items[i];
if (id == item) {
- ldout(cct, 5) << "_remove_item_under removing item " << item << " from bucket " << b->id << dendl;
+ ldout(cct, 5) << "_remove_item_under removing item " << item
+ << " from bucket " << b->id << dendl;
bucket_remove_item(b, item);
+ for (auto& p : choose_args) {
+ // weight down each weight-set to 0 before we remove the item
+ vector<int> weightv(get_choose_args_positions(p.second), 0);
+ _choose_args_adjust_item_weight_in_bucket(
+ cct, p.second, b->id, item, weightv, nullptr);
+ }
adjust_item_weight(cct, b->id, b->weight);
ret = 0;
} else if (id < 0) {
return ret;
}
-int CrushWrapper::remove_item_under(CephContext *cct, int item, int ancestor, bool unlink_only)
+int CrushWrapper::remove_item_under(
+ CephContext *cct, int item, int ancestor, bool unlink_only)
{
ldout(cct, 5) << "remove_item_under " << item << " under " << ancestor
<< (unlink_only ? " unlink_only":"") << dendl;
return 0;
}
-int CrushWrapper::insert_item(CephContext *cct, int item, float weight, string name,
- const map<string,string>& loc) // typename -> bucketname
+int CrushWrapper::insert_item(
+ CephContext *cct, int item, float weight, string name,
+ const map<string,string>& loc) // typename -> bucketname
{
ldout(cct, 5) << "insert_item item " << item << " weight " << weight
<< " name " << name << " loc " << loc << dendl;
int cur = item;
- // create locations if locations don't exist and add child in location with 0 weight
- // the more detail in the insert_item method declaration in CrushWrapper.h
- for (map<int,string>::iterator p = type_map.begin(); p != type_map.end(); ++p) {
+ // create locations if locations don't exist and add child in
+ // location with 0 weight the more detail in the insert_item method
+ // declaration in CrushWrapper.h
+ for (auto p = type_map.begin(); p != type_map.end(); ++p) {
// ignore device type
if (p->first == 0)
continue;
// skip types that are unspecified
map<string,string>::const_iterator q = loc.find(p->second);
if (q == loc.end()) {
- ldout(cct, 2) << "warning: did not specify location for '" << p->second << "' level (levels are "
+ ldout(cct, 2) << "warning: did not specify location for '"
+ << p->second << "' level (levels are "
<< type_map << ")" << dendl;
continue;
}
int r = add_bucket(0, 0,
CRUSH_HASH_DEFAULT, p->first, 1, &cur, &empty, &newid);
if (r < 0) {
- ldout(cct, 1) << "add_bucket failure error: " << cpp_strerror(r) << dendl;
+ ldout(cct, 1) << "add_bucket failure error: " << cpp_strerror(r)
+ << dendl;
return r;
}
set_item_name(newid, q->second);
// check that we aren't creating a cycle.
if (subtree_contains(id, cur)) {
- ldout(cct, 1) << "insert_item item " << cur << " already exists beneath " << id << dendl;
+ ldout(cct, 1) << "insert_item item " << cur << " already exists beneath "
+ << id << dendl;
return -EINVAL;
}
}
// adjust the item's weight in location
- if(adjust_item_weightf_in_loc(cct, item, weight, loc) > 0) {
+ if (adjust_item_weightf_in_loc(cct, item, weight, loc) > 0) {
if (item >= crush->max_devices) {
crush->max_devices = item + 1;
- ldout(cct, 5) << "insert_item max_devices now " << crush->max_devices << dendl;
+ ldout(cct, 5) << "insert_item max_devices now " << crush->max_devices
+ << dendl;
+ }
+ r = rebuild_roots_with_classes();
+ if (r < 0) {
+ ldout(cct, 0) << __func__ << " unable to rebuild roots with classes: "
+ << cpp_strerror(r) << dendl;
+ return r;
}
return 0;
}
- ldout(cct, 1) << "error: didn't find anywhere to add item " << item << " in " << loc << dendl;
+ ldout(cct, 1) << "error: didn't find anywhere to add item " << item
+ << " in " << loc << dendl;
return -EINVAL;
}
-int CrushWrapper::move_bucket(CephContext *cct, int id, const map<string,string>& loc)
+
+int CrushWrapper::move_bucket(
+ CephContext *cct, int id, const map<string,string>& loc)
{
// sorry this only works for buckets
if (id >= 0)
return insert_item(cct, id, bucket_weight / (float)0x10000, id_name, loc);
}
+// Unlink bucket `item` from its immediate parent.
+//
+// Returns the bucket's weight as it was before detaching, -EINVAL if there
+// is no crush map or `item` is not a bucket id (buckets are negative), or
+// the error carried by the parent lookup when that error is not -ENOENT.
+int CrushWrapper::detach_bucket(CephContext *cct, int item)
+{
+  if (!crush || item >= 0)
+    return (-EINVAL);
+
+  // check that the bucket that we want to detach exists
+  assert(bucket_exists(item));
+
+  // remember the bucket's weight: it is the value handed back on success
+  const unsigned bucket_weight = get_bucket(item)->weight;
+
+  // find where the bucket lives; the second member of the pair is the
+  // parent's name, which is resolved to the parent bucket
+  const pair<string, string> bucket_location = get_immediate_parent(item);
+  crush_bucket *parent_bucket =
+    get_bucket(get_item_id(bucket_location.second));
+
+  if (IS_ERR(parent_bucket)) {
+    // -ENOENT from the parent lookup is tolerated; other errors are returned
+    if (PTR_ERR(parent_bucket) != -ENOENT)
+      return PTR_ERR(parent_bucket);
+  } else {
+    // zero out the bucket weight inside the parent, then re-apply the
+    // parent's resulting weight
+    bucket_adjust_item_weight(cct, parent_bucket, item, 0);
+    adjust_item_weight(cct, parent_bucket->id, parent_bucket->weight);
+
+    // weight down each weight-set to 0 before we remove the item
+    for (auto& cmap : choose_args) {
+      vector<int> zeros(get_choose_args_positions(cmap.second), 0);
+      choose_args_adjust_item_weight(cct, cmap.second, item, zeros, nullptr);
+    }
+
+    // remove the bucket from the parent
+    bucket_remove_item(parent_bucket, item);
+  }
+
+  // check that we're happy: the item must no longer be found at its old
+  // location, and the weight reported by the check must be zero
+  int test_weight = 0;
+  map<string,string> test_location;
+  test_location[bucket_location.first] = bucket_location.second;
+
+  bool successful_detach =
+    !check_item_loc(cct, item, test_location, &test_weight);
+  assert(successful_detach);
+  assert(test_weight == 0);
+
+  return bucket_weight;
+}
+
int CrushWrapper::swap_bucket(CephContext *cct, int src, int dst)
{
if (src >= 0 || dst >= 0)
return 0;
}
-int CrushWrapper::link_bucket(CephContext *cct, int id, const map<string,string>& loc)
+int CrushWrapper::link_bucket(
+ CephContext *cct, int id, const map<string,string>& loc)
{
// sorry this only works for buckets
if (id >= 0)
return insert_item(cct, id, bucket_weight / (float)0x10000, id_name, loc);
}
-int CrushWrapper::create_or_move_item(CephContext *cct, int item, float weight, string name,
- const map<string,string>& loc) // typename -> bucketname
+int CrushWrapper::create_or_move_item(
+ CephContext *cct, int item, float weight, string name,
+ const map<string,string>& loc) // typename -> bucketname
{
int ret = 0;
int old_iweight;
return -EINVAL;
if (check_item_loc(cct, item, loc, &old_iweight)) {
- ldout(cct, 5) << "create_or_move_item " << item << " already at " << loc << dendl;
+ ldout(cct, 5) << "create_or_move_item " << item << " already at " << loc
+ << dendl;
} else {
if (_search_item_exists(item)) {
weight = get_item_weightf(item);
- ldout(cct, 10) << "create_or_move_item " << item << " exists with weight " << weight << dendl;
+ ldout(cct, 10) << "create_or_move_item " << item
+ << " exists with weight " << weight << dendl;
remove_item(cct, item, true);
}
- ldout(cct, 5) << "create_or_move_item adding " << item << " weight " << weight
+ ldout(cct, 5) << "create_or_move_item adding " << item
+ << " weight " << weight
<< " at " << loc << dendl;
ret = insert_item(cct, item, weight, name, loc);
if (ret == 0)
return ret;
}
-int CrushWrapper::update_item(CephContext *cct, int item, float weight, string name,
- const map<string,string>& loc) // typename -> bucketname
+int CrushWrapper::update_item(
+ CephContext *cct, int item, float weight, string name,
+ const map<string,string>& loc) // typename -> bucketname
{
ldout(cct, 5) << "update_item item " << item << " weight " << weight
<< " name " << name << " loc " << loc << dendl;
ldout(cct, 5) << "update_item " << item << " already at " << loc << dendl;
if (old_iweight != iweight) {
ldout(cct, 5) << "update_item " << item << " adjusting weight "
- << ((float)old_iweight/(float)0x10000) << " -> " << weight << dendl;
+ << ((float)old_iweight/(float)0x10000) << " -> " << weight
+ << dendl;
adjust_item_weight_in_loc(cct, item, iweight, loc);
ret = 1;
}
if (get_item_name(item) != name) {
- ldout(cct, 5) << "update_item setting " << item << " name to " << name << dendl;
+ ldout(cct, 5) << "update_item setting " << item << " name to " << name
+ << dendl;
set_item_name(item, name);
ret = 1;
}
for (unsigned i = 0; i < b->size; i++) {
if (b->items[i] == id) {
int diff = bucket_adjust_item_weight(cct, b, id, weight);
- ldout(cct, 5) << "adjust_item_weight " << id << " diff " << diff << " in bucket " << bidx << dendl;
+ ldout(cct, 5) << "adjust_item_weight " << id << " diff " << diff
+ << " in bucket " << bidx << dendl;
adjust_item_weight(cct, -1 - bidx, b->weight);
changed++;
}
int CrushWrapper::adjust_item_weight_in_loc(CephContext *cct, int id, int weight, const map<string,string>& loc)
{
- ldout(cct, 5) << "adjust_item_weight_in_loc " << id << " weight " << weight << " in " << loc << dendl;
+ ldout(cct, 5) << "adjust_item_weight_in_loc " << id << " weight " << weight
+ << " in " << loc << dendl;
int changed = 0;
- for (map<string,string>::const_iterator l = loc.begin(); l != loc.end(); ++l) {
+ for (auto l = loc.begin(); l != loc.end(); ++l) {
int bid = get_item_id(l->second);
if (!bucket_exists(bid))
continue;
for (unsigned int i = 0; i < b->size; i++) {
if (b->items[i] == id) {
int diff = bucket_adjust_item_weight(cct, b, id, weight);
- ldout(cct, 5) << "adjust_item_weight_in_loc " << id << " diff " << diff << " in bucket " << bid << dendl;
+ ldout(cct, 5) << "adjust_item_weight_in_loc " << id << " diff " << diff
+ << " in bucket " << bid << dendl;
adjust_item_weight(cct, bid, b->weight);
changed++;
}
crush_bucket *b = crush->buckets[bidx];
if (b == 0)
continue;
- const char *n = get_item_name(b->id);
- if (n && !is_valid_crush_name(n))
+ if (is_shadow_item(b->id))
continue;
for (unsigned i = 0; i < b->size; i++)
if (b->items[i] == id) {
crush_bucket *b = crush->buckets[bidx];
if (b == 0)
continue;
- const char *n = get_item_name(b->id);
- if (n && !is_valid_crush_name(n))
+ if (is_shadow_item(b->id))
continue;
for (unsigned i = 0; i < b->size; i++) {
if (b->items[i] == id) {
return item;
}
-bool CrushWrapper::class_is_in_use(int class_id)
-{
- for (auto &i : class_bucket)
- for (auto &j : i.second)
- if (j.first == class_id)
- return true;
- for (auto &i : class_map)
- if (i.second == class_id)
- return true;
-
- return false;
+// Report whether any rule has a TAKE step whose target is a bucket that
+// class_bucket records for `class_id`.
+//
+// When such rules exist and `ss` is non-null, a message listing their
+// names is written to *ss.
+bool CrushWrapper::class_is_in_use(int class_id, ostream *ss)
+{
+  // one entry is appended per matching (TAKE step, class_bucket entry) pair
+  list<unsigned> rules;
+  for (unsigned ruleno = 0; ruleno < crush->max_rules; ++ruleno) {
+    crush_rule *rule = crush->rules[ruleno];
+    if (!rule)
+      continue;
+    for (unsigned step = 0; step < rule->len; ++step) {
+      if (rule->steps[step].op != CRUSH_RULE_TAKE)
+        continue;
+      int root = rule->steps[step].arg1;
+      for (auto &entry : class_bucket) {
+        auto& per_class = entry.second;
+        auto hit = per_class.find(class_id);
+        if (hit != per_class.end() && hit->second == root) {
+          rules.push_back(ruleno);
+        }
+      }
+    }
+  }
+  if (rules.empty()) {
+    return false;
+  }
+  if (ss) {
+    ostringstream names;
+    for (auto &ruleno : rules) {
+      names << "'" << get_rule_name(ruleno) <<"',";
+    }
+    string out(names.str());
+    out.resize(out.size() - 1); // drop last ','
+    *ss << "still referenced by crush_rule(s): " << out;
+  }
+  return true;
}
int CrushWrapper::populate_classes()
{
set<int> roots;
- find_roots(roots);
+ find_nonshadow_roots(roots);
for (auto &r : roots) {
if (r >= 0)
continue;
- if (id_has_class(r))
- continue;
for (auto &c : class_name) {
int clone;
int res = device_class_clone(r, c.first, &clone);
int CrushWrapper::trim_roots_with_class(bool unused)
{
set<int> roots;
- find_roots(roots);
+ find_shadow_roots(roots);
for (auto &r : roots) {
if (r >= 0)
continue;
- if (!id_has_class(r))
- continue;
int res = remove_root(r, unused);
if (res)
return res;
return crush_bucket_adjust_item_weight(crush, bucket, item, weight);
}
+// Create a bucket, add it to the crush map and give it an entry in every
+// existing choose_args map.
+//
+// @param bucketno  requested bucket id, passed through to crush_add_bucket
+// @param alg       bucket algorithm; 0 selects get_default_bucket_alg()
+// @param hash,type,size,items,weights  passed through to crush_make_bucket
+// @param idout     optional; receives the id of the new bucket on success
+// @returns the result of crush_add_bucket, or -EINVAL when no bucket
+//          algorithm can be chosen
+int CrushWrapper::add_bucket(
+  int bucketno, int alg, int hash, int type, int size,
+  int *items, int *weights, int *idout)
+{
+  if (alg == 0) {
+    alg = get_default_bucket_alg();
+    if (alg == 0)
+      return -EINVAL;
+  }
+  crush_bucket *b = crush_make_bucket(crush, alg, hash, type, size, items,
+                                      weights);
+  assert(b);
+  // Collect the id in a local so the choose_args update below neither
+  // dereferences a null idout nor reads an id that was never written.
+  int id = 0;
+  int r = crush_add_bucket(crush, bucketno, b, &id);
+  if (r < 0) {
+    // NOTE(review): b is not released here; confirm who owns it on failure
+    return r;
+  }
+  if (idout)
+    *idout = id;
+
+  // choose_args entries are indexed by bucket position (-1 - id), like
+  // everywhere else in this file, not by the (negative) bucket id itself.
+  int bidx = -1 - id;
+  for (auto& p : choose_args) {
+    crush_choose_arg_map& cmap = p.second;
+    if (cmap.args) {
+      if ((int)cmap.size <= bidx) {
+        // grow the array and zero the newly added tail
+        cmap.args = (crush_choose_arg*)realloc(
+          cmap.args,
+          sizeof(crush_choose_arg) * (bidx + 1));
+        assert(cmap.args);
+        memset(&cmap.args[cmap.size], 0,
+               sizeof(crush_choose_arg) * (bidx + 1 - cmap.size));
+        cmap.size = bidx + 1;
+      }
+    } else {
+      cmap.args = (crush_choose_arg*)calloc(bidx + 1,
+                                            sizeof(crush_choose_arg));
+      assert(cmap.args);
+      cmap.size = bidx + 1;
+    }
+    if (size > 0) {
+      int positions = get_choose_args_positions(cmap);
+      crush_choose_arg& carg = cmap.args[bidx];
+      // one crush_weight_set per position (the array is indexed by ppos)
+      carg.weight_set = (crush_weight_set*)calloc(positions,
+                                                  sizeof(crush_weight_set));
+      assert(carg.weight_set);
+      carg.weight_set_size = positions;
+      for (int ppos = 0; ppos < positions; ++ppos) {
+        carg.weight_set[ppos].weights = (__u32*)calloc(size, sizeof(__u32));
+        assert(carg.weight_set[ppos].weights);
+        carg.weight_set[ppos].size = size;
+        // start every position from the weights the bucket was created with
+        for (int bpos = 0; bpos < size; ++bpos) {
+          carg.weight_set[ppos].weights[bpos] = weights[bpos];
+        }
+      }
+    }
+  }
+  return r;
+}
+
+
int CrushWrapper::bucket_add_item(crush_bucket *bucket, int item, int weight)
{
__u32 new_size = bucket->size + 1;
crush_choose_arg *arg = &arg_map.args[-1-bucket->id];
for (__u32 j = 0; j < arg->weight_set_size; j++) {
crush_weight_set *weight_set = &arg->weight_set[j];
- weight_set->weights = (__u32*)realloc(weight_set->weights, new_size * sizeof(__u32));
+ weight_set->weights = (__u32*)realloc(weight_set->weights,
+ new_size * sizeof(__u32));
assert(weight_set->size + 1 == new_size);
weight_set->weights[weight_set->size] = weight;
weight_set->size = new_size;
assert(weight_set->size - 1 == new_size);
for (__u32 k = position; k < new_size; k++)
weight_set->weights[k] = weight_set->weights[k+1];
- weight_set->weights = (__u32*)realloc(weight_set->weights, new_size * sizeof(__u32));
+ weight_set->weights = (__u32*)realloc(weight_set->weights,
+ new_size * sizeof(__u32));
weight_set->size = new_size;
}
if (arg->ids_size) {
const string& name,
ostream *ss)
{
+ assert(item_exists(id));
+ auto old_class_name = get_item_class(id);
+ if (old_class_name && old_class_name != class_name) {
+ *ss << "osd." << id << " has already bound to class '" << old_class_name
+ << "', can not reset class to '" << class_name << "'; "
+ << "use 'ceph osd crush rm-device-class <osd>' to "
+ << "remove old class first";
+ return -EBUSY;
+ }
+
int class_id = get_or_create_class_id(class_name);
if (id < 0) {
*ss << name << " id " << id << " is negative";
return -EINVAL;
}
- assert(item_exists(id));
if (class_map.count(id) != 0 && class_map[id] == class_id) {
*ss << name << " already set to class " << class_name;
return 1;
}
+// Unbind device `id` from its device class and rebuild the per-class roots.
+//
+// @param cct  ceph context (not used by the code visible here)
+// @param id   device id
+// @param ss   must be non-null; receives a human-readable status message
+// @returns 0 on success or when the device has no class, -ENOENT when the
+//          device has no name, or the negative error returned by
+//          rebuild_roots_with_classes()
+int CrushWrapper::remove_device_class(CephContext *cct, int id, ostream *ss)
+{
+  assert(ss);
+  const char *name = get_item_name(id);
+  if (!name) {
+    *ss << "osd." << id << " does not have a name";
+    return -ENOENT;
+  }
+
+  const char *bound_class = get_item_class(id);
+  if (!bound_class) {
+    *ss << "osd." << id << " has not been bound to a specific class yet";
+    return 0;
+  }
+  // Keep our own copy of the name: the pointer presumably refers to storage
+  // owned by the class-name table, which remove_class_name() below may
+  // release while the name is still needed for the error message.
+  const string class_name(bound_class);
+  class_remove_item(id);
+
+  // note that there is no need to remove ourselves from the shadow parent
+  // and reweight, because we are going to destroy all shadow trees and
+  // rebuild them all (if necessary) later.
+
+  // see if there are any osds that still reference this class
+  set<int> devices;
+  get_devices_by_class(class_name.c_str(), &devices);
+  if (devices.empty()) {
+    // class has no more devices
+    remove_class_name(class_name.c_str());
+  }
+
+  int r = rebuild_roots_with_classes();
+  if (r < 0) {
+    *ss << "unable to rebuild roots with class '" << class_name << "' "
+        << "of osd." << id << ": " << cpp_strerror(r);
+    return r;
+  }
+  return 0;
+}
+
+
int CrushWrapper::device_class_clone(int original_id, int device_class, int *clone)
{
const char *item_name = get_item_name(original_id);
::encode(class_name, bl);
::encode(class_bucket, bl);
+ // choose args
__u32 size = (__u32)choose_args.size();
::encode(size, bl);
for (auto c : choose_args) {
::decode(choose_args_index, blp);
crush_choose_arg_map arg_map;
arg_map.size = crush->max_buckets;
- arg_map.args = (crush_choose_arg*)calloc(arg_map.size, sizeof(crush_choose_arg));
+ arg_map.args = (crush_choose_arg*)calloc(
+ arg_map.size, sizeof(crush_choose_arg));
__u32 size;
::decode(size, blp);
for (__u32 j = 0; j < size; j++) {
assert(bucket_index < arg_map.size);
crush_choose_arg *arg = &arg_map.args[bucket_index];
::decode(arg->weight_set_size, blp);
- arg->weight_set = (crush_weight_set*)calloc(arg->weight_set_size, sizeof(crush_weight_set));
- for (__u32 k = 0; k < arg->weight_set_size; k++) {
- crush_weight_set *weight_set = &arg->weight_set[k];
- ::decode(weight_set->size, blp);
- weight_set->weights = (__u32*)calloc(weight_set->size, sizeof(__u32));
- for (__u32 l = 0; l < weight_set->size; l++)
- ::decode(weight_set->weights[l], blp);
+ if (arg->weight_set_size) {
+ arg->weight_set = (crush_weight_set*)calloc(
+ arg->weight_set_size, sizeof(crush_weight_set));
+ for (__u32 k = 0; k < arg->weight_set_size; k++) {
+ crush_weight_set *weight_set = &arg->weight_set[k];
+ ::decode(weight_set->size, blp);
+ weight_set->weights = (__u32*)calloc(
+ weight_set->size, sizeof(__u32));
+ for (__u32 l = 0; l < weight_set->size; l++)
+ ::decode(weight_set->weights[l], blp);
+ }
}
::decode(arg->ids_size, blp);
- arg->ids = (__s32 *)calloc(arg->ids_size, sizeof(__s32));
- for (__u32 k = 0; k < arg->ids_size; k++)
- ::decode(arg->ids[k], blp);
+ if (arg->ids_size) {
+ assert(arg->ids_size == crush->buckets[bucket_index]->size);
+ arg->ids = (__s32 *)calloc(arg->ids_size, sizeof(__s32));
+ for (__u32 k = 0; k < arg->ids_size; k++)
+ ::decode(arg->ids[k], blp);
+ }
}
choose_args[choose_args_index] = arg_map;
}
class TreeDumper {
typedef CrushTreeDumper::Item Item;
const CrushWrapper *crush;
+ const CrushTreeDumper::name_map_t& weight_set_names;
public:
- explicit TreeDumper(const CrushWrapper *crush)
- : crush(crush) {}
+ explicit TreeDumper(const CrushWrapper *crush,
+ const CrushTreeDumper::name_map_t& wsnames)
+ : crush(crush), weight_set_names(wsnames) {}
void dump(Formatter *f) {
set<int> roots;
crush->find_roots(roots);
for (set<int>::iterator root = roots.begin(); root != roots.end(); ++root) {
- dump_item(Item(*root, 0, crush->get_bucket_weightf(*root)), f);
+ dump_item(Item(*root, 0, 0, crush->get_bucket_weightf(*root)), f);
}
}
void dump_item(const Item& qi, Formatter* f) {
if (qi.is_bucket()) {
f->open_object_section("bucket");
- CrushTreeDumper::dump_item_fields(crush, qi, f);
+ CrushTreeDumper::dump_item_fields(crush, weight_set_names, qi, f);
dump_bucket_children(qi, f);
f->close_section();
} else {
f->open_object_section("device");
- CrushTreeDumper::dump_item_fields(crush, qi, f);
+ CrushTreeDumper::dump_item_fields(crush, weight_set_names, qi, f);
f->close_section();
}
}
for (int pos = 0; pos < max_pos; pos++) {
int id = crush->get_bucket_item(parent.id, pos);
float weight = crush->get_bucket_item_weightf(parent.id, pos);
- dump_item(Item(id, parent.depth + 1, weight), f);
+ dump_item(Item(id, parent.id, parent.depth + 1, weight), f);
}
f->close_section();
}
};
}
-void CrushWrapper::dump_tree(Formatter *f) const
+// Dump the crush tree to formatter f (which must be non-null) using
+// TreeDumper; weight_set_names is handed to the dumper unchanged.
+void CrushWrapper::dump_tree(
+ Formatter *f,
+ const CrushTreeDumper::name_map_t& weight_set_names) const
{
assert(f);
- TreeDumper(this).dump(f);
+ TreeDumper(this, weight_set_names).dump(f);
}
void CrushWrapper::dump_tunables(Formatter *f) const
}
}
-class CrushTreePlainDumper : public CrushTreeDumper::Dumper<ostream> {
-public:
- typedef CrushTreeDumper::Dumper<ostream> Parent;
-
- explicit CrushTreePlainDumper(const CrushWrapper *crush)
- : Parent(crush) {}
+// Write the name of every existing rule to *ss, one name per line.
+void CrushWrapper::list_rules(ostream *ss) const
+{
+  for (int ruleno = 0; ruleno < get_max_rules(); ++ruleno) {
+    if (rule_exists(ruleno)) {
+      *ss << get_rule_name(ruleno) << "\n";
+    }
+  }
+}
- void dump(ostream *out) {
- *out << "ID\tWEIGHT\tTYPE NAME\n";
- Parent::dump(out);
+// Renders the crush tree as a TextTable: one row per item with the columns
+// ID, WEIGHT, one column per choose_args map, and TYPE NAME.
+class CrushTreePlainDumper : public CrushTreeDumper::Dumper<TextTable> {
+public:
+  typedef CrushTreeDumper::Dumper<TextTable> Parent;
+
+  explicit CrushTreePlainDumper(const CrushWrapper *crush,
+                                const CrushTreeDumper::name_map_t& wsnames)
+    : Parent(crush, wsnames) {}
+  explicit CrushTreePlainDumper(const CrushWrapper *crush,
+                                const CrushTreeDumper::name_map_t& wsnames,
+                                bool show_shadow)
+    : Parent(crush, wsnames, show_shadow) {}
+
+
+  // Define the table columns, then let the parent walk the tree.
+  void dump(TextTable *tbl) {
+    tbl->define_column("ID", TextTable::LEFT, TextTable::RIGHT);
+    tbl->define_column("WEIGHT", TextTable::LEFT, TextTable::RIGHT);
+    for (auto& p : crush->choose_args) {
+      if (p.first == CrushWrapper::DEFAULT_CHOOSE_ARGS) {
+        tbl->define_column("(compat)", TextTable::LEFT, TextTable::RIGHT);
+      } else {
+        // use the weight-set name when one is known, else the numeric key
+        string name;
+        auto q = weight_set_names.find(p.first);
+        name = q != weight_set_names.end() ? q->second :
+          stringify(p.first);
+        tbl->define_column(name.c_str(), TextTable::LEFT, TextTable::RIGHT);
+      }
+    }
+    tbl->define_column("TYPE NAME", TextTable::LEFT, TextTable::LEFT);
+    Parent::dump(tbl);
}
protected:
- void dump_item(const CrushTreeDumper::Item &qi, ostream *out) override {
- *out << qi.id << "\t"
- << weightf_t(qi.weight) << "\t";
-
- for (int k=0; k < qi.depth; k++)
- *out << "\t";
-
- if (qi.is_bucket())
- {
- *out << crush->get_type_name(crush->get_bucket_type(qi.id)) << " "
- << crush->get_item_name(qi.id);
+  // Emit one table row for item qi. Exactly one cell is written per
+  // choose_args map so that the row lines up with the columns from dump().
+  void dump_item(const CrushTreeDumper::Item &qi, TextTable *tbl) override {
+    *tbl << qi.id
+         << weightf_t(qi.weight);
+    for (auto& p : crush->choose_args) {
+      if (qi.parent < 0) {
+        const crush_choose_arg_map cmap = crush->choose_args_get(p.first);
+        int bidx = -1 - qi.parent;
+        const crush_bucket *b = crush->get_bucket(qi.parent);
+        // get_bucket() reports failure through error-encoded pointers, so a
+        // plain null test is not enough before dereferencing b
+        if (b && !IS_ERR(b) &&
+            bidx < (int)cmap.size &&
+            cmap.args[bidx].weight_set &&
+            cmap.args[bidx].weight_set_size >= 1) {
+          const crush_weight_set& ws = cmap.args[bidx].weight_set[0];
+          // locate qi.id inside its parent, staying within both arrays
+          int pos = 0;
+          while (pos < (int)ws.size &&
+                 pos < (int)b->size &&
+                 b->items[pos] != qi.id) {
+            ++pos;
+          }
+          if (pos < (int)ws.size && pos < (int)b->size) {
+            *tbl << weightf_t((float)ws.weights[pos] / (float)0x10000);
+            continue;
+          }
+          // not found: fall through to the empty cell instead of reading
+          // one element past the end of the weights array
+        }
+      }
+      *tbl << "";
}
- else
- {
- *out << "osd." << qi.id;
+    ostringstream ss;
+    for (int k=0; k < qi.depth; k++) {
+      ss << " ";
}
- *out << "\n";
+    if (qi.is_bucket()) {
+      ss << crush->get_type_name(crush->get_bucket_type(qi.id)) << " "
+         << crush->get_item_name(qi.id);
+    } else {
+      ss << "osd." << qi.id;
+    }
+    *tbl << ss.str();
+    *tbl << TextTable::endrow;
}
};
public:
typedef CrushTreeDumper::FormattingDumper Parent;
- explicit CrushTreeFormattingDumper(const CrushWrapper *crush)
- : Parent(crush) {}
+ explicit CrushTreeFormattingDumper(
+ const CrushWrapper *crush,
+ const CrushTreeDumper::name_map_t& wsnames)
+ : Parent(crush, wsnames) {}
+
+ explicit CrushTreeFormattingDumper(
+ const CrushWrapper *crush,
+ const CrushTreeDumper::name_map_t& wsnames,
+ bool show_shadow)
+ : Parent(crush, wsnames, show_shadow) {}
void dump(Formatter *f) {
f->open_array_section("nodes");
};
-void CrushWrapper::dump_tree(ostream *out, Formatter *f) const
+// Dump the crush tree in up to two forms: as a text table written to *out
+// when out is non-null, and through formatter f when f is non-null.
+// weight_set_names and show_shadow are handed to both dumpers unchanged.
+void CrushWrapper::dump_tree(
+ ostream *out,
+ Formatter *f,
+ const CrushTreeDumper::name_map_t& weight_set_names,
+ bool show_shadow) const
{
- if (out)
- CrushTreePlainDumper(this).dump(out);
- if (f)
- CrushTreeFormattingDumper(this).dump(f);
+ if (out) {
+ // build the whole table first, then stream it out in one go
+ TextTable tbl;
+ CrushTreePlainDumper(this, weight_set_names, show_shadow).dump(&tbl);
+ *out << tbl;
+ }
+ if (f) {
+ CrushTreeFormattingDumper(this, weight_set_names, show_shadow).dump(f);
+ }
}
void CrushWrapper::generate_test_instances(list<CrushWrapper*>& o)
return 0;
}
+
+
+// Set the weight-set weights of item `id` inside bucket `bucketid` for the
+// choose_args map cmap, then push the bucket's new per-position totals up
+// to the buckets that contain it.
+//
+// @param cct       context used for logging
+// @param cmap      choose_args map whose weight sets are modified
+// @param bucketid  id (negative) of the bucket to look in
+// @param id        item whose weights are replaced
+// @param weight    one weight per weight-set position
+// @param ss        optional; receives a message when nothing can be done
+// @returns the number of entries of `id` that were updated in the bucket
+int CrushWrapper::_choose_args_adjust_item_weight_in_bucket(
+  CephContext *cct,
+  crush_choose_arg_map cmap,
+  int bucketid,
+  int id,
+  const vector<int>& weight,
+  ostream *ss)
+{
+  int changed = 0;
+  int bidx = -1 - bucketid;
+  // validate the bucket before touching crush->buckets[bidx] or b->id
+  if (bidx < 0 || bidx >= crush->max_buckets ||
+      crush->buckets[bidx] == nullptr) {
+    if (ss)
+      *ss << "bucket " << bucketid << " does not exist";
+    ldout(cct, 10) << __func__ << " bucket " << bucketid
+                   << " does not exist" << dendl;
+    return 0;
+  }
+  crush_bucket *b = crush->buckets[bidx];
+  if (bidx >= (int)cmap.size) {
+    if (ss)
+      *ss << "no weight-set for bucket " << b->id;
+    ldout(cct, 10) << __func__ << "  no crush_choose_arg for bucket " << b->id
+                   << dendl;
+    return 0;
+  }
+  crush_choose_arg *carg = &cmap.args[bidx];
+  if (carg->weight_set == NULL) {
+    if (ss)
+      *ss << "no weight-set for bucket " << b->id;
+    ldout(cct, 10) << __func__ << "  no weight_set for bucket " << b->id
+                   << dendl;
+    return 0;
+  }
+  if (carg->weight_set_size != weight.size()) {
+    if (ss)
+      *ss << "weight_set_size != " << weight.size() << " for bucket " << b->id;
+    ldout(cct, 10) << __func__ << "  weight_set_size != " << weight.size()
+                   << " for bucket " << b->id << dendl;
+    return 0;
+  }
+  for (unsigned i = 0; i < b->size; i++) {
+    if (b->items[i] == id) {
+      for (unsigned j = 0; j < weight.size(); ++j) {
+        carg->weight_set[j].weights[i] = weight[j];
+      }
+      ldout(cct, 5) << __func__ << "  set " << id << " to " << weight
+                    << " in bucket " << b->id << dendl;
+      changed++;
+    }
+  }
+  if (changed) {
+    // recompute this bucket's total per position and apply it wherever the
+    // bucket itself appears as an item; the result is ignored on purpose
+    // (a bucket without a parent is simply not found)
+    vector<int> bucket_weight(weight.size(), 0);
+    for (unsigned i = 0; i < b->size; i++) {
+      for (unsigned j = 0; j < weight.size(); ++j) {
+        bucket_weight[j] += carg->weight_set[j].weights[i];
+      }
+    }
+    choose_args_adjust_item_weight(cct, cmap, b->id, bucket_weight, nullptr);
+  }
+  return changed;
+}
+
+// Apply the per-position weights `weight` to item `id` in every bucket of
+// the map, using the weight sets of cmap.
+// Returns the number of entries updated, or -ENOENT (with a message in *ss
+// when ss is non-null) if nothing was updated.
+int CrushWrapper::choose_args_adjust_item_weight(
+  CephContext *cct,
+  crush_choose_arg_map cmap,
+  int id,
+  const vector<int>& weight,
+  ostream *ss)
+{
+  ldout(cct, 5) << __func__ << " " << id << " weight " << weight << dendl;
+  int updated = 0;
+  for (int slot = 0; slot < crush->max_buckets; slot++) {
+    crush_bucket *bucket = crush->buckets[slot];
+    if (bucket != nullptr) {
+      updated += _choose_args_adjust_item_weight_in_bucket(
+        cct, cmap, bucket->id, id, weight, ss);
+    }
+  }
+  if (updated) {
+    return updated;
+  }
+  if (ss)
+    *ss << "item " << id << " not found in crush map";
+  return -ENOENT;
+}
#include "include/assert.h"
#include "include/err.h"
#include "include/encoding.h"
-
+#include "include/mempool.h"
#include "common/Mutex.h"
class Formatter;
}
+namespace CrushTreeDumper {
+ typedef mempool::osdmap::map<int64_t,string> name_map_t;
+}
+
WRITE_RAW_ENCODER(crush_rule_mask) // it's all u8's
inline static void encode(const crush_rule_step &s, bufferlist &bl)
using namespace std;
class CrushWrapper {
public:
+ // magic value used by OSDMap for a "default" fallback choose_args, used if
+ // the choose_arg_map passed to do_rule does not exist. if this also
+ // doesn't exist, fall back to canonical weights.
+ enum {
+ DEFAULT_CHOOSE_ARGS = -1
+ };
+
std::map<int32_t, string> type_map; /* bucket/device type names */
std::map<int32_t, string> name_map; /* bucket/device names */
std::map<int32_t, string> rule_name_map;
std::map<int32_t, string> class_name; /* class id -> class name */
std::map<string, int32_t> class_rname; /* class name -> class id */
std::map<int32_t, map<int32_t, int32_t> > class_bucket; /* bucket[id][class] == id */
- std::map<uint64_t, crush_choose_arg_map> choose_args;
+ std::map<int64_t, crush_choose_arg_map> choose_args;
private:
struct crush_map *crush;
/// true if any ruleset has more than 1 rule
bool has_multirule_rulesets() const;
+ /// true if any buckets that aren't straw2
+ bool has_non_straw2_buckets() const;
+
// tunables
void set_tunables_argonaut() {
crush->choose_local_tries = 2;
name_rmap[bn] = a;
}
}
- bool id_has_class(int i) {
- int idout;
- int classout;
- if (split_id_class(i, &idout, &classout) != 0)
- return false;
- return classout != -1;
- }
int split_id_class(int i, int *idout, int *classout) const;
bool class_exists(const string& name) const {
return 0;
}
- int rename_class(const string& srcname, const string& dstname) {
- auto p = class_rname.find(srcname);
- if (p == class_rname.end())
- return -ENOENT;
- int class_id = p->second;
- auto q = class_name.find(class_id);
- if (q == class_name.end())
- return -ENOENT;
- class_rname.erase(srcname);
- class_name.erase(class_id);
- class_rname[dstname] = class_id;
- class_name[class_id] = dstname;
- return 0;
- }
-
int32_t _alloc_class_id() const;
int get_or_create_class_id(const string& name) {
if (have_rmaps)
rule_name_rmap[name] = i;
}
+ // True when the item has a name and that name is not a valid crush name.
+ bool is_shadow_item(int id) const {
+   const char *item_name = get_item_name(id);
+   if (!item_name) {
+     return false;
+   }
+   return !is_valid_crush_name(item_name);
+ }
/**
*/
void find_roots(set<int>& roots) const;
+
+ /**
+  * find tree roots that are shadow (device class) items
+  *
+  * Every root reported by find_roots() for which is_shadow_item() holds
+  * is inserted into roots.
+  */
+ void find_shadow_roots(set<int>& roots) const {
+   set<int> candidates;
+   find_roots(candidates);
+   for (set<int>::const_iterator it = candidates.begin();
+        it != candidates.end();
+        ++it) {
+     if (!is_shadow_item(*it)) {
+       continue;
+     }
+     roots.insert(*it);
+   }
+ }
+
/**
* find tree roots that are not shadow (device class) items
*
* These are parentless nodes in the map that are not shadow
* items for device classes.
*/
- void find_nonshadow_roots(set<int>& roots) const;
+ void find_nonshadow_roots(set<int>& roots) const {
+   // take every root and keep the ones that are not shadow items
+   set<int> candidates;
+   find_roots(candidates);
+   for (set<int>::const_iterator it = candidates.begin();
+        it != candidates.end();
+        ++it) {
+     if (is_shadow_item(*it)) {
+       continue;
+     }
+     roots.insert(*it);
+   }
+ }
/**
* see if an item is contained within a subtree
* FIXME: ambiguous for items that occur multiple times in the map
*/
pair<string,string> get_immediate_parent(int id, int *ret = NULL);
+
int get_immediate_parent_id(int id, int *parent) const;
/**
int get_rule_weight_osd_map(unsigned ruleno, map<int,float> *pmap);
/* modifiers */
- int add_rule(int len, int ruleset, int type, int minsize, int maxsize, int ruleno) {
+
+ int add_rule(int ruleno, int len, int type, int minsize, int maxsize) {
if (!crush) return -ENOENT;
- crush_rule *n = crush_make_rule(len, ruleset, type, minsize, maxsize);
+ crush_rule *n = crush_make_rule(len, ruleno, type, minsize, maxsize);
assert(n);
ruleno = crush_add_rule(crush, n, ruleno);
return ruleno;
/** buckets **/
-private:
const crush_bucket *get_bucket(int id) const {
if (!crush)
return (crush_bucket *)(-EINVAL);
return (crush_bucket *)(-ENOENT);
return ret;
}
+private:
crush_bucket *get_bucket(int id) {
if (!crush)
return (crush_bucket *)(-EINVAL);
*
* returns the weight of the detached bucket
**/
- int detach_bucket(CephContext *cct, int item){
- if (!crush)
- return (-EINVAL);
-
- if (item >= 0)
- return (-EINVAL);
-
- // check that the bucket that we want to detach exists
- assert(bucket_exists(item));
-
- // get the bucket's weight
- crush_bucket *b = get_bucket(item);
- unsigned bucket_weight = b->weight;
-
- // get where the bucket is located
- pair<string, string> bucket_location = get_immediate_parent(item);
-
- // get the id of the parent bucket
- int parent_id = get_item_id(bucket_location.second);
-
- // get the parent bucket
- crush_bucket *parent_bucket = get_bucket(parent_id);
-
- if (!IS_ERR(parent_bucket)) {
- // zero out the bucket weight
- bucket_adjust_item_weight(cct, parent_bucket, item, 0);
- adjust_item_weight(cct, parent_bucket->id, parent_bucket->weight);
-
- // remove the bucket from the parent
- bucket_remove_item(parent_bucket, item);
- } else if (PTR_ERR(parent_bucket) != -ENOENT) {
- return PTR_ERR(parent_bucket);
- }
-
- // check that we're happy
- int test_weight = 0;
- map<string,string> test_location;
- test_location[ bucket_location.first ] = (bucket_location.second);
-
- bool successful_detach = !(check_item_loc(cct, item, test_location, &test_weight));
- assert(successful_detach);
- assert(test_weight == 0);
-
- return bucket_weight;
- }
+ int detach_bucket(CephContext *cct, int item);
public:
int get_max_buckets() const {
/* modifiers */
int add_bucket(int bucketno, int alg, int hash, int type, int size,
- int *items, int *weights, int *idout) {
- if (alg == 0) {
- alg = get_default_bucket_alg();
- if (alg == 0)
- return -EINVAL;
- }
- crush_bucket *b = crush_make_bucket(crush, alg, hash, type, size, items, weights);
- assert(b);
- return crush_add_bucket(crush, bucketno, b, idout);
- }
-
+ int *items, int *weights, int *idout);
int bucket_add_item(crush_bucket *bucket, int item, int weight);
int bucket_remove_item(struct crush_bucket *bucket, int item);
int bucket_adjust_item_weight(CephContext *cct, struct crush_bucket *bucket, int item, int weight);
}
int update_device_class(int id, const string& class_name, const string& name, ostream *ss);
+ int remove_device_class(CephContext *cct, int id, ostream *ss);
int device_class_clone(int original, int device_class, int *clone);
- bool class_is_in_use(int class_id);
+ bool class_is_in_use(int class_id, ostream *ss = nullptr);
int populate_classes();
int rebuild_roots_with_classes();
/* remove unused roots generated for class devices */
* the original choose_total_tries value was off by one (it
* counted "retries" and not "tries"). add one to alloc.
*/
- crush->choose_tries = (__u32 *)malloc(sizeof(*crush->choose_tries) * (crush->choose_total_tries + 1));
+ crush->choose_tries = (__u32 *)calloc(sizeof(*crush->choose_tries),
+ (crush->choose_total_tries + 1));
memset(crush->choose_tries, 0,
sizeof(*crush->choose_tries) * (crush->choose_total_tries + 1));
}
return result;
}
- crush_choose_arg_map choose_args_get(uint64_t choose_args_index) const {
+ /// true if a choose_args map is stored under choose_args_index
+ bool have_choose_args(int64_t choose_args_index) const {
+   return choose_args.find(choose_args_index) != choose_args.end();
+ }
+
+ /// Return the choose_args map stored under choose_args_index; if there is
+ /// none, the one stored under DEFAULT_CHOOSE_ARGS; if that is missing
+ /// too, an empty map (args == NULL, size == 0).
+ crush_choose_arg_map choose_args_get_with_fallback(
+   int64_t choose_args_index) const {
+   auto found = choose_args.find(choose_args_index);
+   if (found == choose_args.end()) {
+     found = choose_args.find(DEFAULT_CHOOSE_ARGS);
+   }
+   if (found != choose_args.end()) {
+     return found->second;
+   }
+   crush_choose_arg_map empty_map;
+   empty_map.args = NULL;
+   empty_map.size = 0;
+   return empty_map;
+ }
+ crush_choose_arg_map choose_args_get(int64_t choose_args_index) const {
auto i = choose_args.find(choose_args_index);
if (i == choose_args.end()) {
crush_choose_arg_map arg_map;
}
free(arg_map.args);
}
-
+
+ /// Create the choose_args map `id` with `positions` weight sets per
+ /// straw2 bucket, each initialized from the bucket's item weights.
+ /// Does nothing if a map with that id already exists. Buckets that are
+ /// missing or not straw2 get no weight set.
+ void create_choose_args(int64_t id, int positions) {
+   if (choose_args.count(id))
+     return;
+   assert(positions);
+   auto &cmap = choose_args[id];
+   cmap.args = (crush_choose_arg*)calloc(crush->max_buckets,
+                                         sizeof(crush_choose_arg));
+   assert(cmap.args);
+   cmap.size = crush->max_buckets;
+   for (int bidx=0; bidx < crush->max_buckets; ++bidx) {
+     crush_bucket *b = crush->buckets[bidx];
+     auto &carg = cmap.args[bidx];
+     carg.ids = NULL;
+     carg.ids_size = 0;
+     if (b && b->alg == CRUSH_BUCKET_STRAW2) {
+       crush_bucket_straw2 *sb = (crush_bucket_straw2*)b;
+       carg.weight_set_size = positions;
+       carg.weight_set = (crush_weight_set*)calloc(carg.weight_set_size,
+                                                   sizeof(crush_weight_set));
+       assert(carg.weight_set);
+       // initialize with canonical weights
+       for (int pos = 0; pos < positions; ++pos) {
+         carg.weight_set[pos].size = b->size;
+         // element size spelled as sizeof(__u32) rather than a literal 4
+         carg.weight_set[pos].weights = (__u32*)calloc(b->size,
+                                                       sizeof(__u32));
+         assert(carg.weight_set[pos].weights);
+         for (unsigned i = 0; i < b->size; ++i) {
+           carg.weight_set[pos].weights[i] = sb->item_weights[i];
+         }
+       }
+     } else {
+       carg.weight_set = NULL;
+       carg.weight_set_size = 0;
+     }
+   }
+ }
+
+ /// Destroy and forget the choose_args map `id`; no-op if it is absent.
+ void rm_choose_args(int64_t id) {
+   auto entry = choose_args.find(id);
+   if (entry == choose_args.end()) {
+     return;
+   }
+   destroy_choose_args(entry->second);
+   choose_args.erase(entry);
+ }
+
void choose_args_clear() {
for (auto w : choose_args)
destroy_choose_args(w.second);
choose_args.clear();
}
+ // adjust choose_args_map weight, preserving the hierarchical summation
+ // property. used by callers optimizing layouts by tweaking weights.
+ int _choose_args_adjust_item_weight_in_bucket(
+ CephContext *cct,
+ crush_choose_arg_map cmap,
+ int bucketid,
+ int id,
+ const vector<int>& weight,
+ ostream *ss);
+ int choose_args_adjust_item_weight(
+ CephContext *cct,
+ crush_choose_arg_map cmap,
+ int id, const vector<int>& weight,
+ ostream *ss);
+ /// Floating-point front end for choose_args_adjust_item_weight(): each
+ /// weight is scaled by 0x10000 and truncated to int before forwarding.
+ int choose_args_adjust_item_weightf(
+   CephContext *cct,
+   crush_choose_arg_map cmap,
+   int id, const vector<double>& weightf,
+   ostream *ss) {
+   vector<int> fixed;
+   fixed.reserve(weightf.size());
+   for (vector<double>::const_iterator w = weightf.begin();
+        w != weightf.end();
+        ++w) {
+     fixed.push_back((int)(*w * (float)0x10000));
+   }
+   return choose_args_adjust_item_weight(cct, cmap, id, fixed, ss);
+ }
+
+ /// Number of weight-set positions used by cmap: the weight_set_size of
+ /// the first entry that has a non-zero one, or 1 if no entry does.
+ int get_choose_args_positions(crush_choose_arg_map cmap) {
+   unsigned idx = 0;
+   while (idx < cmap.size) {
+     if (cmap.args[idx].weight_set_size != 0) {
+       return cmap.args[idx].weight_set_size;
+     }
+     ++idx;
+   }
+   return 1;
+ }
+
template<typename WeightVector>
void do_rule(int rule, int x, vector<int>& out, int maxout,
const WeightVector& weight,
int rawout[maxout];
char work[crush_work_size(crush, maxout)];
crush_init_workspace(crush, work);
- crush_choose_arg_map arg_map = choose_args_get(choose_args_index);
+ crush_choose_arg_map arg_map = choose_args_get_with_fallback(
+ choose_args_index);
int numrep = crush_do_rule(crush, rule, x, rawout, maxout, &weight[0],
weight.size(), work, arg_map.args);
if (numrep < 0)
void dump_tunables(Formatter *f) const;
void dump_choose_args(Formatter *f) const;
void list_rules(Formatter *f) const;
- void dump_tree(ostream *out, Formatter *f) const;
- void dump_tree(Formatter *f) const;
+ void list_rules(ostream *ss) const;
+ void dump_tree(ostream *out,
+ Formatter *f,
+ const CrushTreeDumper::name_map_t& ws,
+ bool show_shadow = false) const;
+ /// Convenience overload: dump the tree with no weight-set names.
+ /// Kept const, like the declaration it replaces, so that callers holding
+ /// a const CrushWrapper continue to compile.
+ void dump_tree(ostream *out, Formatter *f) const {
+   dump_tree(out, f, CrushTreeDumper::name_map_t());
+ }
+ void dump_tree(Formatter *f,
+ const CrushTreeDumper::name_map_t& ws) const;
static void generate_test_instances(list<CrushWrapper*>& o);
int get_osd_pool_default_crush_replicated_ruleset(CephContext *cct);
step_chooseleaf |
step_emit );
crushrule = str_p("rule") >> !name >> '{'
- >> str_p("ruleset") >> posint
+ >> (str_p("id") | str_p("ruleset")) >> posint
>> str_p("type") >> ( str_p("replicated") | str_p("erasure") )
>> str_p("min_size") >> posint
>> str_p("max_size") >> posint
int position)
{
if ((arg == NULL) ||
- (arg->weight_set == NULL) ||
- (arg->weight_set_size == 0))
+ (arg->weight_set == NULL))
return bucket->item_weights;
if (position >= arg->weight_set_size)
position = arg->weight_set_size - 1;
root = crush.class_bucket[root][c];
}
- int rule = 0;
int rno = 0;
for (rno = 0; rno < crush.get_max_rules(); rno++) {
if (!crush.rule_exists(rno) && !crush.ruleset_exists(rno))
break;
}
- rule = rno;
int steps = 4 + rule_steps.size();
int min_rep = 3;
int max_rep = get_chunk_count();
int ret;
- ret = crush.add_rule(steps, rule, pg_pool_t::TYPE_ERASURE,
- min_rep, max_rep, rno);
+ ret = crush.add_rule(rno, steps, pg_pool_t::TYPE_ERASURE,
+ min_rep, max_rep);
assert(ret == rno);
int step = 0;
ret = crush.set_rule_step(rno, step++, CRUSH_RULE_EMIT, 0, 0);
assert(ret == 0);
crush.set_rule_name(rno, name);
- return rule;
+ return rno;
}
int ErasureCodeLrc::layers_description(const ErasureCodeProfile &profile,
page.h
crc32c.h
rados/objclass.h
- DESTINATION include/rados)
+ DESTINATION ${CMAKE_INSTALL_INCLUDEDIR}/rados)
install(FILES
radosstriper/libradosstriper.h
radosstriper/libradosstriper.hpp
- DESTINATION include/radosstriper)
+ DESTINATION ${CMAKE_INSTALL_INCLUDEDIR}/radosstriper)
if(WITH_RBD)
install(FILES
rbd/features.h
rbd/librbd.h
rbd/librbd.hpp
- DESTINATION include/rbd)
+ DESTINATION ${CMAKE_INSTALL_INCLUDEDIR}/rbd)
endif()
if(WITH_RADOSGW)
install(FILES
rados/librgw.h
rados/rgw_file.h
- DESTINATION include/rados)
+ DESTINATION ${CMAKE_INSTALL_INCLUDEDIR}/rados)
endif()
# include "acconfig.h"
#endif
+#ifdef __cplusplus
class CephContext;
-#ifdef __cplusplus
namespace ceph {
struct BackTrace;
# define __CEPH_ASSERT_FUNCTION ((__const char *) 0)
#endif
+#ifdef __cplusplus
extern void register_assert_context(CephContext *cct);
+#endif
+
extern void __ceph_assert_fail(const char *assertion, const char *file, int line, const char *function)
__attribute__ ((__noreturn__));
extern void __ceph_assertf_fail(const char *assertion, const char *file, int line, const char *function, const char* msg, ...)
__attribute__ ((__noreturn__));
extern void __ceph_assert_warn(const char *assertion, const char *file, int line, const char *function);
+#ifdef __cplusplus
+# define _CEPH_ASSERT_VOID_CAST static_cast<void>
+#else
+# define _CEPH_ASSERT_VOID_CAST (void)
+#endif
#define assert_warn(expr) \
((expr) \
- ? static_cast<void> (0) \
+ ? _CEPH_ASSERT_VOID_CAST (0) \
: __ceph_assert_warn (__STRING(expr), __FILE__, __LINE__, __CEPH_ASSERT_FUNCTION))
#ifdef __cplusplus
#define assert(expr) \
((expr) \
- ? static_cast<void> (0) \
+ ? _CEPH_ASSERT_VOID_CAST (0) \
: __ceph_assert_fail (__STRING(expr), __FILE__, __LINE__, __CEPH_ASSERT_FUNCTION))
#define ceph_assert(expr) \
((expr) \
- ? static_cast<void> (0) \
+ ? _CEPH_ASSERT_VOID_CAST (0) \
: __ceph_assert_fail (__STRING(expr), __FILE__, __LINE__, __CEPH_ASSERT_FUNCTION))
// this variant will *never* get compiled out to NDEBUG in the future.
// (ceph_assert currently doesn't either, but in the future it might.)
#define ceph_assert_always(expr) \
((expr) \
- ? static_cast<void> (0) \
+ ? _CEPH_ASSERT_VOID_CAST (0) \
: __ceph_assert_fail (__STRING(expr), __FILE__, __LINE__, __CEPH_ASSERT_FUNCTION))
// Named by analogy with printf. Along with an expression, takes a format
// string and parameters which are printed if the assertion fails.
#define assertf(expr, ...) \
((expr) \
- ? static_cast<void> (0) \
+ ? _CEPH_ASSERT_VOID_CAST (0) \
: __ceph_assertf_fail (__STRING(expr), __FILE__, __LINE__, __CEPH_ASSERT_FUNCTION, __VA_ARGS__))
#define ceph_assertf(expr, ...) \
((expr) \
- ? static_cast<void> (0) \
+ ? _CEPH_ASSERT_VOID_CAST (0) \
: __ceph_assertf_fail (__STRING(expr), __FILE__, __LINE__, __CEPH_ASSERT_FUNCTION, __VA_ARGS__))
// this variant will *never* get compiled out to NDEBUG in the future.
// (ceph_assertf currently doesn't either, but in the future it might.)
#define ceph_assertf_always(expr, ...) \
((expr) \
- ? static_cast<void> (0) \
+ ? _CEPH_ASSERT_VOID_CAST (0) \
: __ceph_assertf_fail (__STRING(expr), __FILE__, __LINE__, __CEPH_ASSERT_FUNCTION, __VA_ARGS__))
DEFINE_CEPH_FEATURE(59, 1, FS_BTIME)
DEFINE_CEPH_FEATURE(59, 1, FS_CHANGE_ATTR) // overlap
DEFINE_CEPH_FEATURE(59, 1, MSG_ADDR2) // overlap
-DEFINE_CEPH_FEATURE(60, 1, BLKIN_TRACING) // *do not share this bit*
+DEFINE_CEPH_FEATURE(60, 1, OSD_RECOVERY_DELETES) // *do not share this bit*
DEFINE_CEPH_FEATURE(61, 1, RESERVED2) // unused, but slow down!
DEFINE_CEPH_FEATURE(62, 1, RESERVED) // do not use; used as a sentinal
DEFINE_CEPH_FEATURE_DEPRECATED(63, 1, RESERVED_BROKEN, LUMINOUS) // client-facing
-/*
- * conditionally include blkin in CEPH_FEATURES_ALL/SUPPORTED_DEFAULT
- */
-#ifdef WITH_BLKIN
-#define CEPH_FEATURES_BLKIN CEPH_FEATURE_BLKIN_TRACING
-#else
-#define CEPH_FEATURES_BLKIN 0
-#endif
-
/*
* Features supported. Should be everything above.
*/
CEPH_FEATURE_SERVER_LUMINOUS | \
CEPH_FEATURE_RESEND_ON_SPLIT | \
CEPH_FEATURE_RADOS_BACKOFF | \
- CEPH_FEATURES_BLKIN | \
+ CEPH_FEATURE_OSD_RECOVERY_DELETES | \
0ULL)
#define CEPH_FEATURES_SUPPORTED_DEFAULT CEPH_FEATURES_ALL
CEPH_FEATURE_CRUSH_TUNABLES5 | \
CEPH_FEATURE_CRUSH_V2 | \
CEPH_FEATURE_CRUSH_V4 | \
- CEPH_FEATURE_CRUSH_CHOOSE_ARGS)
+ CEPH_FEATUREMASK_CRUSH_CHOOSE_ARGS)
/*
* make sure we don't try to use the reserved features
+#pragma once
+
#ifdef HAVE_SYS_PRCTL_H
#include <iostream>
#include <sys/prctl.h>
set_dumpable(saved_state);
}
};
+
#else
-struct PrCtl {};
+
+// Stand-in used when HAVE_SYS_PRCTL_H is not defined: it carries no state.
+struct PrCtl {
+ // user-provided empty constructor, so that declaring a PrCtl variable
+ // does not trigger the -Wunused-variable warning
+ PrCtl() {}
+};
+
#endif
}
-// array
-template<class A>
-inline void encode_array_nohead(const A a[], int n, bufferlist &bl)
-{
- for (int i=0; i<n; i++)
- encode(a[i], bl);
-}
-template<class A>
-inline void decode_array_nohead(A a[], int n, bufferlist::iterator &p)
-{
- for (int i=0; i<n; i++)
- decode(a[i], p);
-}
-
-
-
// -----------------------------
// buffers
typedef __u64 __bitwise__ __le64;
typedef __u64 __bitwise__ __be64;
+#ifndef BOOST_MPL_CFG_NO_PREPROCESSED_HEADERS
+#define BOOST_MPL_CFG_NO_PREPROCESSED_HEADERS
+#endif
+
+#ifndef BOOST_MPL_LIMIT_VECTOR_SIZE
+#define BOOST_MPL_LIMIT_VECTOR_SIZE 30 // or whatever you need
+#endif
+
+#ifndef BOOST_MPL_LIMIT_MAP_SIZE
+#define BOOST_MPL_LIMIT_MAP_SIZE 30 // or whatever you need
+#endif
+
#endif
#define CEPH_OSDMAP_REQUIRE_JEWEL (1<<16) /* require jewel for booting osds */
#define CEPH_OSDMAP_REQUIRE_KRAKEN (1<<17) /* require kraken for booting osds */
#define CEPH_OSDMAP_REQUIRE_LUMINOUS (1<<18) /* require l for booting osds */
+#define CEPH_OSDMAP_RECOVERY_DELETES (1<<19) /* deletes performed during recovery instead of peering */
/* these are hidden in 'ceph status' view */
#define CEPH_OSDMAP_SEMIHIDDEN_FLAGS (CEPH_OSDMAP_REQUIRE_JEWEL| \
CEPH_OSDMAP_REQUIRE_KRAKEN | \
CEPH_OSDMAP_REQUIRE_LUMINOUS | \
+ CEPH_OSDMAP_RECOVERY_DELETES | \
CEPH_OSDMAP_SORTBITWISE)
#define CEPH_OSDMAP_LEGACY_REQUIRE_FLAGS (CEPH_OSDMAP_REQUIRE_JEWEL | \
CEPH_OSDMAP_REQUIRE_KRAKEN | \
#define LIBRADOS_VERSION_CODE LIBRADOS_VERSION(LIBRADOS_VER_MAJOR, LIBRADOS_VER_MINOR, LIBRADOS_VER_EXTRA)
#define LIBRADOS_SUPPORTS_WATCH 1
+#define LIBRADOS_SUPPORTS_SERVICES 1
/* RADOS lock flags
* They are also defined in cls_lock_types.h. Keep them in sync!
* - log_file, log_to_stderr, err_to_stderr, and log_to_syslog
* - debug_rados, debug_objecter, debug_monc, debug_auth, or debug_ms
*
- * All possible options can be found in src/common/config_opts.h in ceph.git
+ * See docs.ceph.com for information about available configuration options.
*
* @{
*/
char *client_address,
uint32_t expire_seconds);
+/**
+ * Enable an application on a pool
+ *
+ * @param ioctx pool ioctx
+ * @param app_name application name
+ * @param force 0 if only single application per pool
+ * @returns 0 on success, negative error code on failure
+ */
+CEPH_RADOS_API int rados_application_enable(rados_ioctx_t io,
+ const char *app_name, int force);
+
+/**
+ * List all enabled applications
+ *
+ * If the provided buffer is too short, the required length is filled in and
+ * -ERANGE is returned. Otherwise, the buffer is filled with the application
+ * names, with a '\0' after each.
+ *
+ * @param ioctx pool ioctx
+ * @param values buffer in which to store application names
+ * @param values_len number of bytes in values buffer
+ * @returns 0 on success, negative error code on failure
+ * @returns -ERANGE if the buffer is too short
+ */
+CEPH_RADOS_API int rados_application_list(rados_ioctx_t io, char *values,
+ size_t *values_len);
+
+/**
+ * Get application metadata value from pool
+ *
+ * @param ioctx pool ioctx
+ * @param app_name application name
+ * @param key metadata key
+ * @param value result buffer
+ * @param value_len maximum len of value
+ * @returns 0 on success, negative error code on failure
+ */
+CEPH_RADOS_API int rados_application_metadata_get(rados_ioctx_t io,
+ const char *app_name,
+ const char *key, char *value,
+ size_t *value_len);
+
+/**
+ * Set application metadata on a pool
+ *
+ * @param ioctx pool ioctx
+ * @param app_name application name
+ * @param key metadata key
+ * @param value metadata value
+ * @returns 0 on success, negative error code on failure
+ */
+CEPH_RADOS_API int rados_application_metadata_set(rados_ioctx_t io,
+ const char *app_name,
+ const char *key,
+ const char *value);
+
+/**
+ * Remove application metadata from a pool
+ *
+ * @param ioctx pool ioctx
+ * @param app_name application name
+ * @param key metadata key
+ * @returns 0 on success, negative error code on failure
+ */
+CEPH_RADOS_API int rados_application_metadata_remove(rados_ioctx_t io,
+ const char *app_name,
+ const char *key);
+
+/**
+ * List all metadata key/value pairs associated with an application.
+ *
+ * This iterates over all metadata, key_len and vals_len are filled in
+ * with the number of bytes put into the keys and values buffers.
+ *
+ * If the provided buffers are too short, the required lengths are filled
+ * in and -ERANGE is returned. Otherwise, the buffers are filled with
+ * the keys and values of the metadata, with a '\0' after each.
+ *
+ * @param ioctx pool ioctx
+ * @param app_name application name
+ * @param keys buffer in which to store key names
+ * @param key_len number of bytes in keys buffer
+ * @param values buffer in which to store values
+ * @param vals_len number of bytes in values buffer
+ * @returns 0 on success, negative error code on failure
+ * @returns -ERANGE if either buffer is too short
+ */
+CEPH_RADOS_API int rados_application_metadata_list(rados_ioctx_t io,
+ const char *app_name,
+ char *keys, size_t *key_len,
+ char *values,
+ size_t *vals_len);
+
/**
* @name Mon/OSD/PG Commands
*
void set_osdmap_full_try();
void unset_osdmap_full_try();
+ int application_enable(const std::string& app_name, bool force);
+ int application_enable_async(const std::string& app_name,
+ bool force, PoolAsyncCompletion *c);
+ int application_list(std::set<std::string> *app_names);
+ int application_metadata_get(const std::string& app_name,
+ const std::string &key,
+ std::string *value);
+ int application_metadata_set(const std::string& app_name,
+ const std::string &key,
+ const std::string& value);
+ int application_metadata_remove(const std::string& app_name,
+ const std::string &key);
+ int application_metadata_list(const std::string& app_name,
+ std::map<std::string, std::string> *values);
+
private:
/* You can only get IoCtx instances from Rados */
IoCtx(IoCtxImpl *io_ctx_impl_);
#define LIBRGW_FILE_VER_MAJOR 1
#define LIBRGW_FILE_VER_MINOR 1
-#define LIBRGW_FILE_VER_EXTRA 3
+#define LIBRGW_FILE_VER_EXTRA 4
#define LIBRGW_FILE_VERSION(maj, min, extra) ((maj << 16) + (min << 8) + extra)
#define LIBRGW_FILE_VERSION_CODE LIBRGW_FILE_VERSION(LIBRGW_FILE_VER_MAJOR, LIBRGW_FILE_VER_MINOR, LIBRGW_FILE_VER_EXTRA)
rgw_readdir_cb rcb, void *cb_arg, bool *eof,
uint32_t flags);
+/* project offset of dirent name */
+#define RGW_DIRENT_OFFSET_FLAG_NONE 0x0000
+
+int rgw_dirent_offset(struct rgw_fs *rgw_fs,
+ struct rgw_file_handle *parent_fh,
+ const char *name, int64_t *offset,
+ uint32_t flags);
+
/*
get unix attributes for object
*/
#define LIBRBD_SUPPORTS_INVALIDATE 1
#define LIBRBD_SUPPORTS_IOVEC 1
#define LIBRBD_SUPPORTS_WATCH 0
+#define LIBRBD_SUPPORTS_WRITESAME 1
#if __GNUC__ >= 4
#define CEPH_RBD_API __attribute__ ((visibility ("default")))
CEPH_RBD_API int rbd_discard(rbd_image_t image, uint64_t ofs, uint64_t len);
CEPH_RBD_API ssize_t rbd_writesame(rbd_image_t image, uint64_t ofs, size_t len,
const char *buf, size_t data_len, int op_flags);
+CEPH_RBD_API ssize_t rbd_compare_and_write(rbd_image_t image, uint64_t ofs,
+ size_t len, const char *cmp_buf,
+ const char *buf, uint64_t *mismatch_off,
+ int op_flags);
+
CEPH_RBD_API int rbd_aio_write(rbd_image_t image, uint64_t off, size_t len,
const char *buf, rbd_completion_t c);
CEPH_RBD_API int rbd_aio_writesame(rbd_image_t image, uint64_t off, size_t len,
const char *buf, size_t data_len,
rbd_completion_t c, int op_flags);
+CEPH_RBD_API ssize_t rbd_aio_compare_and_write(rbd_image_t image,
+ uint64_t off, size_t len,
+ const char *cmp_buf, const char *buf,
+ rbd_completion_t c, uint64_t *mismatch_off,
+ int op_flags);
CEPH_RBD_API int rbd_aio_create_completion(void *cb_arg,
rbd_callback_t complete_cb,
ssize_t write2(uint64_t ofs, size_t len, ceph::bufferlist& bl, int op_flags);
int discard(uint64_t ofs, uint64_t len);
ssize_t writesame(uint64_t ofs, size_t len, ceph::bufferlist &bl, int op_flags);
+ ssize_t compare_and_write(uint64_t ofs, size_t len, ceph::bufferlist &cmp_bl,
+ ceph::bufferlist& bl, uint64_t *mismatch_off, int op_flags);
int aio_write(uint64_t off, size_t len, ceph::bufferlist& bl, RBD::AioCompletion *c);
/* @param op_flags see librados.h constants beginning with LIBRADOS_OP_FLAG */
RBD::AioCompletion *c, int op_flags);
int aio_writesame(uint64_t off, size_t len, ceph::bufferlist& bl,
RBD::AioCompletion *c, int op_flags);
+ int aio_compare_and_write(uint64_t off, size_t len, ceph::bufferlist& cmp_bl,
+ ceph::bufferlist& bl, RBD::AioCompletion *c,
+ uint64_t *mismatch_off, int op_flags);
/**
* read async from image
*
inline bool operator!=(const uuid_d& l, const uuid_d& r) {
return l.uuid != r.uuid;
}
+// Strict ordering for uuid_d: compares the to_string() representations
+// of the two values lexicographically.
+inline bool operator<(const uuid_d& l, const uuid_d& r) {
+ return l.to_string() < r.to_string();
+}
#endif
int r = 0;
std::map<std::string, bufferlist> kvmap;
- r = cls_cxx_map_get_vals(hctx, key_data(key).encoded(), "", 2, &kvmap);
+ bool more;
+
+ r = cls_cxx_map_get_vals(hctx, key_data(key).encoded(), "", 2, &kvmap, &more);
if (r < 0) {
CLS_LOG(20, "error reading index for range %s: %d", key.c_str(), r);
return r;
index_data &out_data) {
int r = 0;
std::map<std::string, bufferlist> kvs;
- r = cls_cxx_map_get_vals(hctx, idata.kdata.encoded(), "", 1, &kvs);
+ bool more;
+ r = cls_cxx_map_get_vals(hctx, idata.kdata.encoded(), "", 1, &kvs, &more);
if (r < 0){
CLS_LOG(20, "getting kvs failed with error %d", r);
return r;
index_data &out_data) {
int r = 0;
std::map<std::string, bufferlist> kvs;
- r = cls_cxx_map_get_vals(hctx, "", "", LONG_MAX, &kvs);
+ bool more;
+ r = cls_cxx_map_get_vals(hctx, "", "", LONG_MAX, &kvs, &more);
if (r < 0){
CLS_LOG(20, "getting kvs failed with error %d", r);
return r;
static int read_many(cls_method_context_t hctx, const set<string> &keys,
map<string, bufferlist> * out) {
int r = 0;
+ bool more;
CLS_ERR("reading from a map of size %d, first key encoded is %s",
(int)keys.size(), key_data(*keys.begin()).encoded().c_str());
r = cls_cxx_map_get_vals(hctx, key_data(*keys.begin()).encoded().c_str(),
- "", LONG_MAX, out);
+ "", LONG_MAX, out, &more);
if (r < 0) {
CLS_ERR("getting omap vals failed with error %d", r);
}
}
//if the assert succeeded, it needs to be balanced
- r = cls_cxx_map_get_vals(hctx, "", "", LONG_MAX, &odata.omap);
+ bool more;
+ r = cls_cxx_map_get_vals(hctx, "", "", LONG_MAX, &odata.omap, &more);
if (r < 0){
CLS_LOG(20, "rebalance read: getting kvs failed with error %d", r);
return r;
{
close();
delete logger;
- delete ceph_logger;
// Ensure db is destroyed before dependent db_cache and filterpolicy
db.reset();
+ delete ceph_logger;
}
void LevelDBStore::close()
#undef dout_prefix
#define dout_prefix *_dout << "rocksdb: "
-static rocksdb::SliceParts prepare_sliceparts(const bufferlist &bl, rocksdb::Slice *slices)
+// Point the elements of *slices at the buffers of bl, in order, and return
+// a SliceParts covering the whole vector. Elements are assigned by index,
+// never appended, so the caller must size *slices to at least
+// bl.buffers().size() before calling.
+static rocksdb::SliceParts prepare_sliceparts(const bufferlist &bl,
+ vector<rocksdb::Slice> *slices)
{
unsigned n = 0;
- for (std::list<buffer::ptr>::const_iterator p = bl.buffers().begin();
- p != bl.buffers().end(); ++p, ++n) {
- slices[n].data_ = p->c_str();
- slices[n].size_ = p->length();
+ for (auto& buf : bl.buffers()) {
+ (*slices)[n].data_ = buf.c_str();
+ (*slices)[n].size_ = buf.length();
+ n++;
}
- return rocksdb::SliceParts(slices, n);
+ return rocksdb::SliceParts(slices->data(), slices->size());
}
//
return new CephRocksdbLogger(g_ceph_context);
}
-int string2bool(string val, bool &b_val)
+static int string2bool(const string &val, bool &b_val)
{
if (strcasecmp(val.c_str(), "false") == 0) {
b_val = false;
}
}
-int RocksDBStore::tryInterpret(const string key, const string val, rocksdb::Options &opt)
+int RocksDBStore::tryInterpret(const string &key, const string &val, rocksdb::Options &opt)
{
if (key == "compaction_threads") {
std::string err;
return 0;
}
-int RocksDBStore::ParseOptionsFromString(const string opt_str, rocksdb::Options &opt)
+int RocksDBStore::ParseOptionsFromString(const string &opt_str, rocksdb::Options &opt)
{
map<string, string> str_map;
int r = get_str_map(opt_str, &str_map, ",\n;");
g_conf->rocksdb_cache_shard_bits);
} else {
derr << "unrecognized rocksdb_cache_type '" << g_conf->rocksdb_cache_type
- << "'" << dendl;
+ << "'" << dendl;
return -EINVAL;
}
}
if (row_cache_size > 0)
opt.row_cache = rocksdb::NewLRUCache(row_cache_size,
- g_conf->rocksdb_cache_shard_bits);
-
- if (g_conf->kstore_rocksdb_bloom_bits_per_key > 0) {
+ g_conf->rocksdb_cache_shard_bits);
+ uint64_t bloom_bits = g_conf->get_val<uint64_t>("rocksdb_bloom_bits_per_key");
+ if (bloom_bits > 0) {
dout(10) << __func__ << " set bloom filter bits per key to "
- << g_conf->kstore_rocksdb_bloom_bits_per_key << dendl;
- bbt_opts.filter_policy.reset(rocksdb::NewBloomFilterPolicy(
- g_conf->kstore_rocksdb_bloom_bits_per_key));
+ << bloom_bits << dendl;
+ bbt_opts.filter_policy.reset(rocksdb::NewBloomFilterPolicy(bloom_bits));
}
+ if (g_conf->get_val<std::string>("rocksdb_index_type") == "binary_search")
+ bbt_opts.index_type = rocksdb::BlockBasedTableOptions::IndexType::kBinarySearch;
+ if (g_conf->get_val<std::string>("rocksdb_index_type") == "hash_search")
+ bbt_opts.index_type = rocksdb::BlockBasedTableOptions::IndexType::kHashSearch;
+ if (g_conf->get_val<std::string>("rocksdb_index_type") == "two_level")
+ bbt_opts.index_type = rocksdb::BlockBasedTableOptions::IndexType::kTwoLevelIndexSearch;
+ bbt_opts.cache_index_and_filter_blocks =
+ g_conf->get_val<bool>("rocksdb_cache_index_and_filter_blocks");
+ bbt_opts.cache_index_and_filter_blocks_with_high_priority =
+ g_conf->get_val<bool>("rocksdb_cache_index_and_filter_blocks_with_high_priority");
+ bbt_opts.partition_filters = g_conf->get_val<bool>("rocksdb_partition_filters");
+ if (g_conf->get_val<uint64_t>("rocksdb_metadata_block_size") > 0)
+ bbt_opts.metadata_block_size = g_conf->get_val<uint64_t>("rocksdb_metadata_block_size");
+ bbt_opts.pin_l0_filter_and_index_blocks_in_cache =
+ g_conf->get_val<bool>("rocksdb_pin_l0_filter_and_index_blocks_in_cache");
+
opt.table_factory.reset(rocksdb::NewBlockBasedTableFactory(bbt_opts));
dout(10) << __func__ << " block size " << g_conf->rocksdb_block_size
<< ", block_cache size " << prettybyte_t(block_cache_size)
to_set_bl.length()));
} else {
rocksdb::Slice key_slice(key);
- rocksdb::Slice value_slices[to_set_bl.buffers().size()];
+ vector<rocksdb::Slice> value_slices(to_set_bl.buffers().size());
bat.Put(nullptr, rocksdb::SliceParts(&key_slice, 1),
- prepare_sliceparts(to_set_bl, value_slices));
+ prepare_sliceparts(to_set_bl, &value_slices));
}
}
to_set_bl.length()));
} else {
rocksdb::Slice key_slice(key);
- rocksdb::Slice value_slices[to_set_bl.buffers().size()];
+ vector<rocksdb::Slice> value_slices(to_set_bl.buffers().size());
bat.Put(nullptr, rocksdb::SliceParts(&key_slice, 1),
- prepare_sliceparts(to_set_bl, value_slices));
+ prepare_sliceparts(to_set_bl, &value_slices));
}
}
} else {
// make a copy
rocksdb::Slice key_slice(key);
- rocksdb::Slice value_slices[to_set_bl.buffers().size()];
+ vector<rocksdb::Slice> value_slices(to_set_bl.buffers().size());
bat.Merge(nullptr, rocksdb::SliceParts(&key_slice, 1),
- prepare_sliceparts(to_set_bl, value_slices));
+ prepare_sliceparts(to_set_bl, &value_slices));
}
}
bool enable_rmrange;
void compact() override;
- int tryInterpret(const string key, const string val, rocksdb::Options &opt);
- int ParseOptionsFromString(const string opt_str, rocksdb::Options &opt);
+ int tryInterpret(const string& key, const string& val, rocksdb::Options &opt);
+ int ParseOptionsFromString(const string& opt_str, rocksdb::Options &opt);
static int _test_init(const string& dir);
int init(string options_str) override;
/// compact rocksdb for all keys with a given prefix
cephd_cls
cephd_cls_kvs
cephd_rados
- cephd_rbd
- cephd_rgw
common
common_utf8
erasure_code
list(APPEND merge_libs ${ROCKSDB_LIBRARIES})
endif(NOT WITH_SYSTEM_ROCKSDB)
+# cephd_rgw is appended to merge_libs only when WITH_RADOSGW is enabled.
+if(WITH_RADOSGW)
+ list(APPEND merge_libs cephd_rgw)
+endif(WITH_RADOSGW)
+
+# cephd_rbd is appended to merge_libs only when WITH_RBD is enabled.
+if(WITH_RBD)
+ list(APPEND merge_libs cephd_rbd)
+endif(WITH_RBD)
+
if(HAVE_ARMV8_CRC)
list(APPEND merge_libs common_crc_aarch64)
endif(HAVE_ARMV8_CRC)
# TODO: install these libraries and add them to rpm and deb packages
#install(TARGETS cephd DESTINATION ${CMAKE_INSTALL_LIBDIR})
-#install(FILES ../include/cephd/libcephd.h DESTINATION include/cephd)
+#install(FILES ../include/cephd/libcephd.h
+# DESTINATION ${CMAKE_INSTALL_INCLUDEDIR}/cephd)
hobject_t::_reverse_bits(rev_finish), poolid, string());
}
+// Synchronous wrapper around application_enable_async(): allocates a
+// private completion, blocks until it fires and returns its result.
+// On success the return value is that of client->wait_for_latest_osdmap().
+int librados::IoCtxImpl::application_enable(const std::string& app_name,
+ bool force)
+{
+ auto c = new PoolAsyncCompletionImpl();
+ application_enable_async(app_name, force, c);
+
+ int r = c->wait();
+ assert(r == 0);
+
+ // read the command result before releasing the completion
+ r = c->get_return_value();
+ c->release();
+ if (r < 0) {
+ return r;
+ }
+
+ return client->wait_for_latest_osdmap();
+}
+
+// Asynchronously send the "osd pool application enable" monitor command
+// for this pool; c is completed through a C_PoolAsync_Safe context.
+// When force is set, "--yes-i-really-mean-it" is added to the command.
+void librados::IoCtxImpl::application_enable_async(const std::string& app_name,
+ bool force,
+ PoolAsyncCompletionImpl *c)
+{
+ // pre-Luminous clusters will return -EINVAL and application won't be
+ // preserved until Luminous is configured as minimum version.
+ // Instead of sending the command, complete c with -EOPNOTSUPP.
+ if (!client->get_required_monitor_features().contains_all(
+ ceph::features::mon::FEATURE_LUMINOUS)) {
+ client->finisher.queue(new C_PoolAsync_Safe(c), -EOPNOTSUPP);
+ return;
+ }
+
+ // NOTE(review): the pool name and app_name are inserted into the JSON
+ // text verbatim; quotes or backslashes in them are not escaped here.
+ std::stringstream cmd;
+ cmd << "{"
+ << "\"prefix\": \"osd pool application enable\","
+ << "\"pool\": \"" << get_cached_pool_name() << "\","
+ << "\"app\": \"" << app_name << "\"";
+ if (force) {
+ cmd << ",\"force\":\"--yes-i-really-mean-it\"";
+ }
+ cmd << "}";
+
+ std::vector<std::string> cmds;
+ cmds.push_back(cmd.str());
+ bufferlist inbl;
+ client->mon_command_async(cmds, inbl, nullptr, nullptr,
+ new C_PoolAsync_Safe(c));
+}
+
+// Replace *app_names with the names of all applications recorded in the
+// osdmap for this pool. Returns 0, or -ENOENT if the pool is not in the map.
+int librados::IoCtxImpl::application_list(std::set<std::string> *app_names)
+{
+  int ret = 0;
+  app_names->clear();
+  objecter->with_osdmap([&](const OSDMap& osdmap) {
+      const auto pool = osdmap.get_pg_pool(poolid);
+      if (pool != nullptr) {
+        // the keys of application_metadata are the application names
+        for (const auto &entry : pool->application_metadata) {
+          app_names->insert(entry.first);
+        }
+      } else {
+        ret = -ENOENT;
+      }
+    });
+  return ret;
+}
+
+// Look up a single application metadata value in the osdmap.
+// Stores the value in *value and returns 0 when found; returns -ENOENT
+// if the pool, the application or the key is missing (in which case
+// *value is left untouched).
+int librados::IoCtxImpl::application_metadata_get(const std::string& app_name,
+ const std::string &key,
+ std::string* value)
+{
+ int r = 0;
+ objecter->with_osdmap([&](const OSDMap& o) {
+ auto pg_pool = o.get_pg_pool(poolid);
+ if (pg_pool == nullptr) {
+ r = -ENOENT;
+ return;
+ }
+
+ // first level: application name -> key/value map
+ auto app_it = pg_pool->application_metadata.find(app_name);
+ if (app_it == pg_pool->application_metadata.end()) {
+ r = -ENOENT;
+ return;
+ }
+
+ // second level: metadata key -> value
+ auto it = app_it->second.find(key);
+ if (it == app_it->second.end()) {
+ r = -ENOENT;
+ return;
+ }
+
+ *value = it->second;
+ });
+ return r;
+}
+
+// Synchronously send the "osd pool application set" monitor command for
+// this pool. Returns the command's negative error code on failure,
+// otherwise the result of waiting for the latest osdmap.
+int librados::IoCtxImpl::application_metadata_set(const std::string& app_name,
+ const std::string &key,
+ const std::string& value)
+{
+ // NOTE(review): pool name, app_name, key and value are inserted into
+ // the JSON text verbatim; nothing here escapes quotes or backslashes.
+ std::stringstream cmd;
+ cmd << "{"
+ << "\"prefix\":\"osd pool application set\","
+ << "\"pool\":\"" << get_cached_pool_name() << "\","
+ << "\"app\":\"" << app_name << "\","
+ << "\"key\":\"" << key << "\","
+ << "\"value\":\"" << value << "\""
+ << "}";
+
+ std::vector<std::string> cmds;
+ cmds.push_back(cmd.str());
+ bufferlist inbl;
+ int r = client->mon_command(cmds, inbl, nullptr, nullptr);
+ if (r < 0) {
+ return r;
+ }
+
+ // ensure we have the latest osd map epoch before proceeding
+ return client->wait_for_latest_osdmap();
+}
+
+// Synchronously send the "osd pool application rm" monitor command for
+// this pool. Returns the command's negative error code on failure,
+// otherwise the result of waiting for the latest osdmap.
+int librados::IoCtxImpl::application_metadata_remove(const std::string& app_name,
+ const std::string &key)
+{
+ // NOTE(review): pool name, app_name and key are inserted into the JSON
+ // text verbatim; nothing here escapes quotes or backslashes.
+ std::stringstream cmd;
+ cmd << "{"
+ << "\"prefix\":\"osd pool application rm\","
+ << "\"pool\":\"" << get_cached_pool_name() << "\","
+ << "\"app\":\"" << app_name << "\","
+ << "\"key\":\"" << key << "\""
+ << "}";
+
+ std::vector<std::string> cmds;
+ cmds.push_back(cmd.str());
+ bufferlist inbl;
+ int r = client->mon_command(cmds, inbl, nullptr, nullptr);
+ if (r < 0) {
+ return r;
+ }
+
+ // ensure we have the latest osd map epoch before proceeding
+ return client->wait_for_latest_osdmap();
+}
+
+// Replace *values with the key/value metadata recorded in the osdmap for
+// application app_name on this pool. Returns 0, or -ENOENT when either
+// the pool or the application is missing (*values is then left empty).
+int librados::IoCtxImpl::application_metadata_list(const std::string& app_name,
+                                                   std::map<std::string, std::string> *values)
+{
+  int ret = 0;
+  values->clear();
+  objecter->with_osdmap([&](const OSDMap& osdmap) {
+      const auto pool = osdmap.get_pg_pool(poolid);
+      if (pool != nullptr) {
+        const auto found = pool->application_metadata.find(app_name);
+        if (found != pool->application_metadata.end()) {
+          *values = found->second;
+          return;
+        }
+      }
+      // pool or application not present in the map
+      ret = -ENOENT;
+    });
+  return ret;
+}
+
int cache_pin(const object_t& oid);
int cache_unpin(const object_t& oid);
+ int application_enable(const std::string& app_name, bool force);
+ void application_enable_async(const std::string& app_name, bool force,
+ PoolAsyncCompletionImpl *c);
+ int application_list(std::set<std::string> *app_names);
+ int application_metadata_get(const std::string& app_name,
+ const std::string &key,
+ std::string* value);
+ int application_metadata_set(const std::string& app_name,
+ const std::string &key,
+ const std::string& value);
+ int application_metadata_remove(const std::string& app_name,
+ const std::string &key);
+ int application_metadata_list(const std::string& app_name,
+ std::map<std::string, std::string> *values);
+
};
#endif
const bufferlist &inbl,
bufferlist *outbl, string *outs)
{
- Mutex mylock("RadosClient::mon_command::mylock");
- Cond cond;
- bool done;
- int rval;
+ C_SaferCond ctx;
+ mon_command_async(cmd, inbl, outbl, outs, &ctx);
+ return ctx.wait();
+}
+
+// Start a monitor command and return without waiting for it.
+// The client lock is held only around monclient.start_mon_command();
+// on_finish is passed through to that call as the completion context.
+void librados::RadosClient::mon_command_async(const vector<string>& cmd,
+ const bufferlist &inbl,
+ bufferlist *outbl, string *outs,
+ Context *on_finish)
+{
lock.Lock();
- monclient.start_mon_command(cmd, inbl, outbl, outs,
- new C_SafeCond(&mylock, &cond, &done, &rval));
+ monclient.start_mon_command(cmd, inbl, outbl, outs, on_finish);
lock.Unlock();
- mylock.Lock();
- while (!done)
- cond.Wait(mylock);
- mylock.Unlock();
- return rval;
}
-
int librados::RadosClient::mgr_command(const vector<string>& cmd,
const bufferlist &inbl,
bufferlist *outbl, string *outs)
}
return mgrclient.service_daemon_update_status(status);
}
+
+// Return the required feature set reported by the monitor client's monmap.
+mon_feature_t librados::RadosClient::get_required_monitor_features() const
+{
+ return monclient.monmap.get_required_features();
+}
#include "IoCtxImpl.h"
struct AuthAuthorizer;
+struct Context;
class CephContext;
struct Connection;
struct md_config_t;
int mon_command(const vector<string>& cmd, const bufferlist &inbl,
bufferlist *outbl, string *outs);
+ void mon_command_async(const vector<string>& cmd, const bufferlist &inbl,
+ bufferlist *outbl, string *outs, Context *on_finish);
int mon_command(int rank,
const vector<string>& cmd, const bufferlist &inbl,
bufferlist *outbl, string *outs);
const std::map<std::string,std::string>& metadata); ///< static metadata about daemon
int service_daemon_update_status(
const std::map<std::string,std::string>& status);
+
+ mon_feature_t get_required_monitor_features() const;
};
#endif
};
bufferlist inbl, outbl;
string outstring;
- int ret = client.mon_command(cmd, inbl, &outbl, &outstring);
+ int ret = client.mgr_command(cmd, inbl, &outbl, &outstring);
if (ret) {
return ret;
}
return radosp->blacklist_add(client_address, expire_seconds);
}
+// C API entry point: forwards to IoCtxImpl::application_enable(), treating
+// any non-zero force as true.
+extern "C" int rados_application_enable(rados_ioctx_t io, const char *app_name,
+ int force)
+{
+ librados::IoCtxImpl *ctx = (librados::IoCtxImpl *)io;
+ return ctx->application_enable(app_name, force != 0);
+}
+
+// C API entry point: copy the names of all applications enabled on the
+// pool into values, each followed by a '\0'.
+//
+// On entry *values_len is the capacity of values in bytes. On return it
+// holds the number of bytes required for the names, both on success and
+// when -ERANGE is returned because the buffer is too short. Other
+// negative error codes from application_list() are passed through and
+// leave *values_len untouched.
+//
+// An additional list-terminating '\0' is appended only when the buffer
+// has room for it beyond the required length. Writing it unconditionally
+// would overflow a buffer of exactly the required size, and would
+// dereference values when it is NULL and the list is empty.
+extern "C" int rados_application_list(rados_ioctx_t io, char *values,
+                                      size_t *values_len)
+{
+  librados::IoCtxImpl *ctx = (librados::IoCtxImpl *)io;
+  std::set<std::string> app_names;
+  int r = ctx->application_list(&app_names);
+  if (r < 0) {
+    return r;
+  }
+
+  size_t total_len = 0;
+  for (const auto &app_name : app_names) {
+    total_len += app_name.size() + 1;
+  }
+
+  // remember the caller's capacity before reporting the required length
+  const size_t capacity = *values_len;
+  *values_len = total_len;
+  if (capacity < total_len) {
+    return -ERANGE;
+  }
+
+  char *values_p = values;
+  for (const auto &app_name : app_names) {
+    size_t len = app_name.size() + 1;
+    strncpy(values_p, app_name.c_str(), len);
+    values_p += len;
+  }
+  if (capacity > total_len) {
+    // room for the extra terminator after the last name
+    *values_p = '\0';
+  }
+  return 0;
+}
+
+// C API entry point: fetch one application metadata value as a
+// NUL-terminated string.
+// On entry *value_len is the capacity of value; on return it is the
+// length of the value including its terminating '\0'. Returns -ERANGE
+// (with *value_len set to the required size) if the buffer is too short,
+// or the negative error code from application_metadata_get().
+extern "C" int rados_application_metadata_get(rados_ioctx_t io,
+ const char *app_name,
+ const char *key, char *value,
+ size_t *value_len)
+{
+ librados::IoCtxImpl *ctx = (librados::IoCtxImpl *)io;
+ std::string value_str;
+ int r = ctx->application_metadata_get(app_name, key, &value_str);
+ if (r < 0) {
+ return r;
+ }
+
+ // len counts the terminating '\0'
+ size_t len = value_str.size() + 1;
+ if (*value_len < len) {
+ *value_len = len;
+ return -ERANGE;
+ }
+
+ strncpy(value, value_str.c_str(), len);
+ *value_len = len;
+ return 0;
+}
+
+// C API entry point: forwards to IoCtxImpl::application_metadata_set().
+extern "C" int rados_application_metadata_set(rados_ioctx_t io,
+ const char *app_name,
+ const char *key,
+ const char *value)
+{
+ librados::IoCtxImpl *ctx = (librados::IoCtxImpl *)io;
+ return ctx->application_metadata_set(app_name, key, value);
+}
+
+// C API entry point: forwards to IoCtxImpl::application_metadata_remove().
+extern "C" int rados_application_metadata_remove(rados_ioctx_t io,
+ const char *app_name,
+ const char *key)
+{
+ librados::IoCtxImpl *ctx = (librados::IoCtxImpl *)io;
+ return ctx->application_metadata_remove(app_name, key);
+}
+
+// C API entry point: copy all metadata keys of an application into keys
+// and the matching values into values, each entry followed by a '\0'.
+//
+// On entry *keys_len / *vals_len are the capacities of the two buffers.
+// On return they hold the number of bytes required, both on success and
+// when -ERANGE is returned because either buffer is too short. Other
+// negative error codes from application_metadata_list() are passed
+// through and leave the lengths untouched.
+//
+// An additional list-terminating '\0' is appended to a buffer only when
+// it has room beyond the required length. Writing it unconditionally
+// would overflow a buffer of exactly the required size.
+extern "C" int rados_application_metadata_list(rados_ioctx_t io,
+                                               const char *app_name,
+                                               char *keys, size_t *keys_len,
+                                               char *values, size_t *vals_len)
+{
+  librados::IoCtxImpl *ctx = (librados::IoCtxImpl *)io;
+  std::map<std::string, std::string> metadata;
+  int r = ctx->application_metadata_list(app_name, &metadata);
+  if (r < 0) {
+    return r;
+  }
+
+  size_t total_key_len = 0;
+  size_t total_val_len = 0;
+  for (const auto &pair : metadata) {
+    total_key_len += pair.first.size() + 1;
+    total_val_len += pair.second.size() + 1;
+  }
+
+  // remember the caller's capacities before reporting the required lengths
+  const size_t key_capacity = *keys_len;
+  const size_t val_capacity = *vals_len;
+  *keys_len = total_key_len;
+  *vals_len = total_val_len;
+  if (key_capacity < total_key_len || val_capacity < total_val_len) {
+    return -ERANGE;
+  }
+
+  char *keys_p = keys;
+  char *vals_p = values;
+  for (const auto &pair : metadata) {
+    size_t key_len = pair.first.size() + 1;
+    strncpy(keys_p, pair.first.c_str(), key_len);
+    keys_p += key_len;
+
+    size_t val_len = pair.second.size() + 1;
+    strncpy(vals_p, pair.second.c_str(), val_len);
+    vals_p += val_len;
+  }
+  // extra terminators only where the buffers have spare room
+  if (key_capacity > total_key_len) {
+    *keys_p = '\0';
+  }
+  if (val_capacity > total_val_len) {
+    *vals_p = '\0';
+  }
+  return 0;
+}
+
extern "C" int rados_pool_list(rados_t cluster, char *buf, size_t len)
{
tracepoint(librados, rados_pool_list_enter, cluster, len);
return retval;
}
+
+// Parse a flat sequence of NUL-terminated strings, alternating key and
+// value, into *dict_map. Parsing stops at the first empty string found
+// where a key is expected. A repeated key overwrites the earlier value.
+// NOTE(review): dict is dereferenced without a NULL check, and a key
+// that is not followed by a value makes the loop step past the
+// terminator - callers must pass well-formed input.
+static void dict_to_map(const char *dict,
+ std::map<std::string, std::string>* dict_map)
+{
+ while (*dict != '\0') {
+ const char* key = dict;
+ dict += strlen(key) + 1;
+ const char* value = dict;
+ dict += strlen(value) + 1;
+ (*dict_map)[key] = value;
+ }
+}
+
+// C API entry point: convert metadata_dict (alternating NUL-terminated
+// key/value strings, as parsed by dict_to_map) into a map and forward to
+// RadosClient::service_daemon_register().
+CEPH_RADOS_API int rados_service_register(rados_t cluster, const char *service,
+ const char *daemon,
+ const char *metadata_dict)
+{
+ librados::RadosClient *client = (librados::RadosClient *)cluster;
+
+ std::map<std::string, std::string> metadata;
+ dict_to_map(metadata_dict, &metadata);
+
+ return client->service_daemon_register(service, daemon, metadata);
+}
+
+// C API entry point: convert status_dict (alternating NUL-terminated
+// key/value strings, as parsed by dict_to_map) into a map and forward to
+// RadosClient::service_daemon_update_status().
+CEPH_RADOS_API int rados_service_update_status(rados_t cluster,
+ const char *status_dict)
+{
+ librados::RadosClient *client = (librados::RadosClient *)cluster;
+
+ std::map<std::string, std::string> status;
+ dict_to_map(status_dict, &status);
+
+ return client->service_daemon_update_status(status);
+}
+
static void do_out_buffer(bufferlist& outbl, char **outbuf, size_t *outbuflen)
{
if (outbuf) {
(hobject_t*)(split_finish->c_cursor));
}
+// Forwards to IoCtxImpl::application_enable() and returns its result.
+int librados::IoCtx::application_enable(const std::string& app_name,
+ bool force)
+{
+ return io_ctx_impl->application_enable(app_name, force);
+}
+
+// Forwards to IoCtxImpl::application_enable_async() using the completion
+// wrapped by c. Always returns 0; the command's outcome is delivered
+// through the completion.
+int librados::IoCtx::application_enable_async(const std::string& app_name,
+ bool force,
+ PoolAsyncCompletion *c)
+{
+ io_ctx_impl->application_enable_async(app_name, force, c->pc);
+ return 0;
+}
+
+// Forwards to IoCtxImpl::application_list() and returns its result.
+int librados::IoCtx::application_list(std::set<std::string> *app_names)
+{
+ return io_ctx_impl->application_list(app_names);
+}
+
+// Forwards to IoCtxImpl::application_metadata_get() and returns its result.
+int librados::IoCtx::application_metadata_get(const std::string& app_name,
+ const std::string &key,
+ std::string* value)
+{
+ return io_ctx_impl->application_metadata_get(app_name, key, value);
+}
+
+// Forwards to IoCtxImpl::application_metadata_set() and returns its result.
+int librados::IoCtx::application_metadata_set(const std::string& app_name,
+ const std::string &key,
+ const std::string& value)
+{
+ return io_ctx_impl->application_metadata_set(app_name, key, value);
+}
+
+// Forwards to IoCtxImpl::application_metadata_remove() and returns its result.
+int librados::IoCtx::application_metadata_remove(const std::string& app_name,
+ const std::string &key)
+{
+ return io_ctx_impl->application_metadata_remove(app_name, key);
+}
+
+// Forwards to IoCtxImpl::application_metadata_list() and returns its result.
+int librados::IoCtx::application_metadata_list(const std::string& app_name,
+ std::map<std::string, std::string> *values)
+{
+ return io_ctx_impl->application_metadata_list(app_name, values);
+}
+
+
*
*/
+#include <boost/algorithm/string/replace.hpp>
+
#include "libradosstriper/RadosStriperImpl.h"
#include <errno.h>
// get list of extents to be read from
vector<ObjectExtent> *extents = new vector<ObjectExtent>();
if (read_len > 0) {
- std::string format = soid + RADOS_OBJECT_EXTENSION_FORMAT;
+ std::string format = soid;
+ boost::replace_all(format, "%", "%%");
+ format += RADOS_OBJECT_EXTENSION_FORMAT;
file_layout_t l;
l.from_legacy(layout);
Striper::file_to_extents(cct(), format.c_str(), &l, off, read_len,
if (len > 0) {
// get list of extents to be written to
vector<ObjectExtent> extents;
- std::string format = soid + RADOS_OBJECT_EXTENSION_FORMAT;
+ std::string format = soid;
+ boost::replace_all(format, "%", "%%");
+ format += RADOS_OBJECT_EXTENSION_FORMAT;
file_layout_t l;
l.from_legacy(layout);
Striper::file_to_extents(cct(), format.c_str(), &l, off, len, 0, extents);
plb.add_u64_counter(l_librbd_ws, "ws", "WriteSames");
plb.add_u64_counter(l_librbd_ws_bytes, "ws_bytes", "WriteSame data");
plb.add_time_avg(l_librbd_ws_latency, "ws_latency", "WriteSame latency");
+ plb.add_u64_counter(l_librbd_cmp, "cmp", "CompareAndWrites");
+ plb.add_u64_counter(l_librbd_cmp_bytes, "cmp_bytes", "Data size in cmps");
+ plb.add_time_avg(l_librbd_cmp_latency, "cmp_latency", "Latency of cmps");
plb.add_u64_counter(l_librbd_snap_create, "snap_create", "Snap creations");
plb.add_u64_counter(l_librbd_snap_remove, "snap_remove", "Snap removals");
plb.add_u64_counter(l_librbd_snap_rollback, "snap_rollback", "Snap rollbacks");
r = librbd::cls_client::mirror_image_status_list(&io_ctx, start_id, max,
&images_, &statuses_);
- if (r < 0) {
+ if (r < 0 && r != -ENOENT) {
lderr(cct) << "failed to list mirror image statuses: "
<< cpp_strerror(r) << dendl;
return r;
virtual void aio_writesame(uint64_t offset, uint64_t length,
ceph::bufferlist&& bl,
int fadvise_flags, Context *on_finish) = 0;
+ virtual void aio_compare_and_write(Extents&& image_extents,
+ ceph::bufferlist&& cmp_bl,
+ ceph::bufferlist&& bl,
+ uint64_t *mismatch_offset,
+ int fadvise_flags,
+ Context *on_finish) = 0;
/// internal state methods
virtual void init(Context *on_finish) = 0;
req.send();
}
+// Issue a compare-and-write against the image: builds an AioCompletion
+// around on_finish, constructs an ImageCompareAndWriteRequest from the
+// moved-in extents and buffers, marks it to bypass the image cache and
+// sends it. mismatch_offset is handed to the request unchanged.
+template <typename I>
+void ImageWriteback<I>::aio_compare_and_write(Extents &&image_extents,
+ ceph::bufferlist&& cmp_bl,
+ ceph::bufferlist&& bl,
+ uint64_t *mismatch_offset,
+ int fadvise_flags,
+ Context *on_finish) {
+ CephContext *cct = m_image_ctx.cct;
+ ldout(cct, 20) << "image_extents=" << image_extents << ", "
+ << "on_finish=" << on_finish << dendl;
+
+ auto aio_comp = io::AioCompletion::create_and_start(on_finish, &m_image_ctx,
+ io::AIO_TYPE_COMPARE_AND_WRITE);
+ io::ImageCompareAndWriteRequest<I> req(m_image_ctx, aio_comp,
+ std::move(image_extents),
+ std::move(cmp_bl), std::move(bl),
+ mismatch_offset, fadvise_flags, {});
+ req.set_bypass_image_cache();
+ req.send();
+}
+
} // namespace cache
} // namespace librbd
void aio_writesame(uint64_t offset, uint64_t length,
ceph::bufferlist&& bl,
int fadvise_flags, Context *on_finish);
-
+ void aio_compare_and_write(Extents &&image_extents,
+ ceph::bufferlist&& cmp_bl,
+ ceph::bufferlist&& bl,
+ uint64_t *mismatch_offset,
+ int fadvise_flags, Context *on_finish);
private:
ImageCtxT &m_image_ctx;
on_finish);
}
+// Pass-through implementation: logs the request and forwards every
+// argument unchanged to m_image_writeback.aio_compare_and_write().
+template <typename I>
+void PassthroughImageCache<I>::aio_compare_and_write(Extents &&image_extents,
+ bufferlist&& cmp_bl,
+ bufferlist&& bl,
+ uint64_t *mismatch_offset,
+ int fadvise_flags,
+ Context *on_finish) {
+ CephContext *cct = m_image_ctx.cct;
+ ldout(cct, 20) << "image_extents=" << image_extents << ", "
+ << "on_finish=" << on_finish << dendl;
+
+ m_image_writeback.aio_compare_and_write(
+ std::move(image_extents), std::move(cmp_bl), std::move(bl), mismatch_offset,
+ fadvise_flags, on_finish);
+}
+
template <typename I>
void PassthroughImageCache<I>::init(Context *on_finish) {
CephContext *cct = m_image_ctx.cct;
void aio_writesame(uint64_t offset, uint64_t length,
ceph::bufferlist&& bl,
int fadvise_flags, Context *on_finish) override;
+ void aio_compare_and_write(Extents&& image_extents,
+ ceph::bufferlist&& cmp_bl, ceph::bufferlist&& bl,
+ uint64_t *mismatch_offset,int fadvise_flags,
+ Context *on_finish) override;
/// internal state methods
void init(Context *on_finish) override;
ldout(m_cct, 20) << this << " " << __func__ << " r=" << r << dendl;
if (r == 0) {
- lderr(m_cct) << "failed to retrieve mirror mode: " << cpp_strerror(r)
- << dendl;
bufferlist::iterator it = m_out_bl.begin();
r = cls_client::mirror_mode_get_finish(&it, &m_mirror_mode);
}
if (r < 0 && r != -ENOENT) {
+ lderr(m_cct) << "failed to retrieve mirror mode: " << cpp_strerror(r)
+ << dendl;
+
m_r_saved = r;
send_remove_child();
} else {
return;
}
- if (!validate_layout(m_cct, m_size, m_layout)) {
+ if (((m_features & RBD_FEATURE_OBJECT_MAP) != 0) &&
+ (!validate_layout(m_cct, m_size, m_layout))) {
complete(-EINVAL);
return;
}
if (*result < 0) {
if (*result == -EOPNOTSUPP) {
*result = -ENOENT;
- } else if (*result == -ENOENT) {
- lderr(cct) << "image id " << m_image_ctx->id << " does not exist in rbd "
- << "trash, failed to retrieve image name: "
- << cpp_strerror(*result) << dendl;
+ }
+ if (*result == -ENOENT) {
+ ldout(cct, 5) << "failed to retrieve name for image id "
+ << m_image_ctx->id << dendl;
} else {
lderr(cct) << "failed to retreive name from trash: "
<< cpp_strerror(*result) << dendl;
CephContext *cct((CephContext *)io_ctx.cct());
ldout(cct, 20) << "trash_list " << &io_ctx << dendl;
- map<string, cls::rbd::TrashImageSpec> trash_entries;
- int r = cls_client::trash_list(&io_ctx, &trash_entries);
- if (r < 0) {
- if (r != -ENOENT) {
- lderr(cct) << "error listing rbd_trash entries: " << cpp_strerror(r)
+ bool more_entries;
+ uint32_t max_read = 1024;
+ std::string last_read = "";
+ do {
+ map<string, cls::rbd::TrashImageSpec> trash_entries;
+ int r = cls_client::trash_list(&io_ctx, last_read, max_read,
+ &trash_entries);
+ if (r < 0 && r != -ENOENT) {
+ lderr(cct) << "error listing rbd trash entries: " << cpp_strerror(r)
<< dendl;
- } else {
- r = 0;
+ return r;
+ } else if (r == -ENOENT) {
+ break;
}
- return r;
- }
- for (const auto &entry : trash_entries) {
- rbd_trash_image_source_t source =
- static_cast<rbd_trash_image_source_t>(entry.second.source);
- entries.push_back({entry.first, entry.second.name, source,
- entry.second.deletion_time.sec(),
- entry.second.deferment_end_time.sec()});
- }
+ if (trash_entries.empty()) {
+ break;
+ }
+
+ for (const auto &entry : trash_entries) {
+ rbd_trash_image_source_t source =
+ static_cast<rbd_trash_image_source_t>(entry.second.source);
+ entries.push_back({entry.first, entry.second.name, source,
+ entry.second.deletion_time.sec(),
+ entry.second.deferment_end_time.sec()});
+ }
+ last_read = trash_entries.rbegin()->first;
+ more_entries = (trash_entries.size() >= max_read);
+ } while (more_entries);
+
return 0;
}
l_librbd_ws_bytes,
l_librbd_ws_latency,
+ l_librbd_cmp,
+ l_librbd_cmp_bytes,
+ l_librbd_cmp_latency,
+
l_librbd_snap_create,
l_librbd_snap_remove,
l_librbd_snap_rollback,
ictx->perfcounter->tinc(l_librbd_aio_flush_latency, elapsed); break;
case AIO_TYPE_WRITESAME:
ictx->perfcounter->tinc(l_librbd_ws_latency, elapsed); break;
+ case AIO_TYPE_COMPARE_AND_WRITE:
+ ictx->perfcounter->tinc(l_librbd_cmp_latency, elapsed); break;
default:
lderr(cct) << "completed invalid aio_type: " << aio_type << dendl;
break;
std::move(bl), op_flags, parent_trace);
}
+// Factory: returns a heap-allocated ImageCompareAndWriteRequest built from
+// the supplied arguments (extents and buffers are moved into the request).
+template <typename I>
+ImageRequest<I>* ImageRequest<I>::create_compare_and_write_request(
+    I &image_ctx, AioCompletion *c, Extents &&image_extents,
+    bufferlist &&cmp_bl, bufferlist &&bl, uint64_t *mismatch_offset,
+    int op_flags, const ZTracer::Trace &parent_trace) {
+  auto *request = new ImageCompareAndWriteRequest<I>(
+    image_ctx, c, std::move(image_extents), std::move(cmp_bl), std::move(bl),
+    mismatch_offset, op_flags, parent_trace);
+  return request;
+}
+
template <typename I>
void ImageRequest<I>::aio_read(I *ictx, AioCompletion *c,
Extents &&image_extents,
req.send();
}
+// Builds a compare-and-write request on the stack and sends it immediately.
+template <typename I>
+void ImageRequest<I>::aio_compare_and_write(I *ictx, AioCompletion *c,
+                                            Extents &&image_extents,
+                                            bufferlist &&cmp_bl,
+                                            bufferlist &&bl,
+                                            uint64_t *mismatch_offset,
+                                            int op_flags,
+                                            const ZTracer::Trace &parent_trace) {
+  using Request = ImageCompareAndWriteRequest<I>;
+
+  Request request(*ictx, c, std::move(image_extents), std::move(cmp_bl),
+                  std::move(bl), mismatch_offset, op_flags, parent_trace);
+  request.send();
+}
+
+
template <typename I>
void ImageRequest<I>::send() {
I &image_ctx = this->m_image_ctx;
CephContext *cct = image_ctx.cct;
AioCompletion *aio_comp = this->m_aio_comp;
ldout(cct, 20) << get_request_type() << ": ictx=" << &image_ctx << ", "
- << "completion=" << aio_comp << dendl;
+ << "completion=" << aio_comp << dendl;
aio_comp->get();
int r = clip_request();
image_ctx.journal->is_journal_appending());
}
- prune_object_extents(object_extents);
+ int ret = prune_object_extents(object_extents);
+ if (ret < 0) {
+ aio_comp->fail(ret);
+ return;
+ }
if (!object_extents.empty()) {
uint64_t journal_tid = 0;
}
template <typename I>
-void ImageDiscardRequest<I>::prune_object_extents(ObjectExtents &object_extents) {
+int ImageDiscardRequest<I>::prune_object_extents(ObjectExtents &object_extents) {
I &image_ctx = this->m_image_ctx;
CephContext *cct = image_ctx.cct;
if (!this->m_skip_partial_discard) {
- return;
+ return 0;
}
for (auto p = object_extents.begin(); p != object_extents.end(); ) {
++p;
}
}
+
+ return 0;
}
template <typename I>
image_ctx.perfcounter->inc(l_librbd_ws_bytes, length);
}
+// Appends an AioCompareAndWriteEvent describing the single image extent of
+// this request to the journal, associates the resulting journal tid with
+// the AIO completion and returns that tid.
+template <typename I>
+uint64_t ImageCompareAndWriteRequest<I>::append_journal_event(
+    const ObjectRequests &requests, bool synchronous) {
+  // exactly one image extent is expected for this request type
+  assert(this->m_image_extents.size() == 1);
+  auto &extent = this->m_image_extents.front();
+
+  journal::EventEntry event_entry(
+    journal::AioCompareAndWriteEvent(extent.first, extent.second,
+                                     m_cmp_bl, m_bl));
+
+  I &image_ctx = this->m_image_ctx;
+  uint64_t tid = image_ctx.journal->append_io_event(std::move(event_entry),
+                                                    requests, extent.first,
+                                                    extent.second,
+                                                    synchronous);
+
+  this->m_aio_comp->associate_journal_event(tid);
+  return tid;
+}
+
+// When an object cacher is attached, discards the affected object extents
+// from it (under cache_lock); otherwise does nothing.  journal_tid is unused.
+template <typename I>
+void ImageCompareAndWriteRequest<I>::send_object_cache_requests(
+    const ObjectExtents &object_extents, uint64_t journal_tid) {
+  I &image_ctx = this->m_image_ctx;
+  if (image_ctx.object_cacher == NULL) {
+    return;
+  }
+
+  Mutex::Locker cache_locker(image_ctx.cache_lock);
+  image_ctx.object_cacher->discard_set(image_ctx.object_set, object_extents);
+}
+
+// Appends to *bl the slices of the write payload (m_bl) named by each
+// (offset, length) pair in object_extent.buffer_extents, in order.
+template <typename I>
+void ImageCompareAndWriteRequest<I>::assemble_extent(
+    const ObjectExtent &object_extent, bufferlist *bl) {
+  for (const auto &buffer_extent : object_extent.buffer_extents) {
+    bufferlist slice;
+    slice.substr_of(m_bl, buffer_extent.first, buffer_extent.second);
+    bl->claim_append(slice);
+  }
+}
+
+// Hands the whole request to the image cache as a single sub-request; the
+// extents and both buffers are moved out of this object.
+template <typename I>
+void ImageCompareAndWriteRequest<I>::send_image_cache_request() {
+  I &image_ctx = this->m_image_ctx;
+  assert(image_ctx.image_cache != nullptr);
+
+  AioCompletion *aio_comp = this->m_aio_comp;
+  aio_comp->set_request_count(1);
+
+  auto *req_comp = new C_AioRequest(aio_comp);
+  image_ctx.image_cache->aio_compare_and_write(std::move(this->m_image_extents),
+                                               std::move(m_cmp_bl),
+                                               std::move(m_bl),
+                                               m_mismatch_offset, m_op_flags,
+                                               req_comp);
+}
+
+// Creates the object-level compare-and-write request for one object extent.
+// The write payload is assembled from the buffer extents of that object
+// extent; the compare buffer is passed through as-is.
+template <typename I>
+ObjectRequestHandle *ImageCompareAndWriteRequest<I>::create_object_request(
+    const ObjectExtent &object_extent,
+    const ::SnapContext &snapc,
+    Context *on_finish) {
+  bufferlist write_bl;
+  assemble_extent(object_extent, &write_bl);
+
+  I &image_ctx = this->m_image_ctx;
+  return ObjectRequest<I>::create_compare_and_write(
+    &image_ctx, object_extent.oid.name, object_extent.objectno,
+    object_extent.offset, m_cmp_bl, write_bl, snapc, m_mismatch_offset,
+    m_op_flags, this->m_trace, on_finish);
+}
+
+// Bumps the compare-and-write op counter and adds length to its byte counter.
+template <typename I>
+void ImageCompareAndWriteRequest<I>::update_stats(size_t length) {
+  auto &perfcounter = this->m_image_ctx.perfcounter;
+  perfcounter->inc(l_librbd_cmp);
+  perfcounter->inc(l_librbd_cmp_bytes, length);
+}
+
+// Validates the object extents of a compare-and-write request.
+//
+// Returns 0 when the request may proceed and -EINVAL when it
+//   - maps to more than one object extent,
+//   - crosses a 512-byte sector boundary, or
+//   - crosses a stripe-unit boundary (when a stripe unit is configured).
+//
+// The extent list itself is never modified.
+template <typename I>
+int ImageCompareAndWriteRequest<I>::prune_object_extents(ObjectExtents &object_extents) {
+  // A zero-length request produces no object extents.  There is nothing to
+  // validate, and calling front() on an empty container is undefined
+  // behaviour; the caller skips dispatch when the list is empty.
+  if (object_extents.empty()) {
+    return 0;
+  }
+
+  if (object_extents.size() > 1) {
+    return -EINVAL;
+  }
+
+  I &image_ctx = this->m_image_ctx;
+  const uint64_t sector_size = 512ULL;
+  const uint64_t su = image_ctx.layout.stripe_unit;
+  // bind by reference: the extent is only inspected, no copy is needed
+  const ObjectExtent &object_extent = object_extents.front();
+  if (object_extent.offset % sector_size + object_extent.length > sector_size ||
+      (su != 0 && (object_extent.offset % su + object_extent.length > su))) {
+    return -EINVAL;
+  }
+
+  return 0;
+}
+
} // namespace io
} // namespace librbd
template class librbd::io::ImageDiscardRequest<librbd::ImageCtx>;
template class librbd::io::ImageFlushRequest<librbd::ImageCtx>;
template class librbd::io::ImageWriteSameRequest<librbd::ImageCtx>;
+template class librbd::io::ImageCompareAndWriteRequest<librbd::ImageCtx>;
uint64_t off, uint64_t len,
bufferlist &&bl, int op_flags,
const ZTracer::Trace &parent_trace);
+ static ImageRequest* create_compare_and_write_request(
+ ImageCtxT &image_ctx, AioCompletion *c, Extents &&image_extents,
+ bufferlist &&cmp_bl, bufferlist &&bl, uint64_t *mismatch_offset,
+ int op_flags, const ZTracer::Trace &parent_trace);
static void aio_read(ImageCtxT *ictx, AioCompletion *c,
Extents &&image_extents, ReadResult &&read_result,
uint64_t len, bufferlist &&bl, int op_flags,
const ZTracer::Trace &parent_trace);
+ static void aio_compare_and_write(ImageCtxT *ictx, AioCompletion *c,
+ Extents &&image_extents, bufferlist &&cmp_bl,
+ bufferlist &&bl, uint64_t *mismatch_offset,
+ int op_flags, const ZTracer::Trace &parent_trace);
+
virtual bool is_write_op() const {
return false;
}
void send_request() override;
- virtual void prune_object_extents(ObjectExtents &object_extents) {
+ virtual int prune_object_extents(ObjectExtents &object_extents) {
+ return 0;
}
virtual uint32_t get_object_cache_request_count(bool journaling) const {
return 0;
return "aio_discard";
}
- void prune_object_extents(ObjectExtents &object_extents) override;
+ int prune_object_extents(ObjectExtents &object_extents) override;
void send_image_cache_request() override;
int m_op_flags;
};
+// Image-level compare-and-write request.  Holds the compare buffer
+// (m_cmp_bl), the write buffer (m_bl), the op flags and a caller-supplied
+// pointer (m_mismatch_offset) that is forwarded to the object request / image
+// cache together with the buffers.
+template <typename ImageCtxT = ImageCtx>
+class ImageCompareAndWriteRequest : public AbstractImageWriteRequest<ImageCtxT> {
+public:
+ using typename ImageRequest<ImageCtxT>::ObjectRequests;
+ using typename AbstractImageWriteRequest<ImageCtxT>::ObjectExtents;
+
+ // Takes ownership of the extents and both buffers by moving from them.
+ ImageCompareAndWriteRequest(ImageCtxT &image_ctx, AioCompletion *aio_comp,
+ Extents &&image_extents, bufferlist &&cmp_bl,
+ bufferlist &&bl, uint64_t *mismatch_offset,
+ int op_flags, const ZTracer::Trace &parent_trace)
+ : AbstractImageWriteRequest<ImageCtxT>(
+ image_ctx, aio_comp, std::move(image_extents), "compare_and_write", parent_trace),
+ m_cmp_bl(std::move(cmp_bl)), m_bl(std::move(bl)),
+ m_mismatch_offset(mismatch_offset), m_op_flags(op_flags) {
+ }
+
+protected:
+ void send_image_cache_request() override;
+
+ void send_object_cache_requests(const ObjectExtents &object_extents,
+ uint64_t journal_tid) override;
+
+ // Appends the slices of m_bl selected by object_extent.buffer_extents to *bl.
+ void assemble_extent(const ObjectExtent &object_extent, bufferlist *bl);
+
+ ObjectRequestHandle *create_object_request(const ObjectExtent &object_extent,
+ const ::SnapContext &snapc,
+ Context *on_finish) override;
+
+ uint64_t append_journal_event(const ObjectRequests &requests,
+ bool synchronous) override;
+ void update_stats(size_t length) override;
+
+ aio_type_t get_aio_type() const override {
+ return AIO_TYPE_COMPARE_AND_WRITE;
+ }
+ const char *get_request_type() const override {
+ return "aio_compare_and_write";
+ }
+
+ // Returns a negative errno when the object extents are not acceptable for
+ // a compare-and-write; 0 otherwise.
+ int prune_object_extents(ObjectExtents &object_extents) override;
+private:
+ bufferlist m_cmp_bl; // data the on-disk content is compared against
+ bufferlist m_bl; // data to write
+ uint64_t *m_mismatch_offset; // caller-owned output pointer, may be forwarded as-is
+ int m_op_flags;
+};
+
} // namespace io
} // namespace librbd
extern template class librbd::io::ImageDiscardRequest<librbd::ImageCtx>;
extern template class librbd::io::ImageFlushRequest<librbd::ImageCtx>;
extern template class librbd::io::ImageWriteSameRequest<librbd::ImageCtx>;
+extern template class librbd::io::ImageCompareAndWriteRequest<librbd::ImageCtx>;
#endif // CEPH_LIBRBD_IO_IMAGE_REQUEST_H
return len;
}
+// Synchronous compare-and-write: clips the request to the image, issues the
+// asynchronous variant and blocks until it completes.  Returns the (possibly
+// clipped) length on success or a negative error code.
+template <typename I>
+ssize_t ImageRequestWQ<I>::compare_and_write(uint64_t off, uint64_t len,
+                                             bufferlist &&cmp_bl,
+                                             bufferlist &&bl,
+                                             uint64_t *mismatch_off,
+                                             int op_flags) {
+  CephContext *cct = m_image_ctx.cct;
+  ldout(cct, 20) << "compare_and_write ictx=" << &m_image_ctx << ", off="
+                 << off << ", " << "len = " << len << dendl;
+
+  int r;
+  {
+    // snap_lock is held shared only for the duration of the clip
+    RWLock::RLocker snap_locker(m_image_ctx.snap_lock);
+    r = clip_io(util::get_image_ctx(&m_image_ctx), off, &len);
+  }
+  if (r < 0) {
+    lderr(cct) << "invalid IO request: " << cpp_strerror(r) << dendl;
+    return r;
+  }
+
+  C_SaferCond cond;
+  AioCompletion *c = AioCompletion::create(&cond);
+  aio_compare_and_write(c, off, len, std::move(cmp_bl), std::move(bl),
+                        mismatch_off, op_flags, false);
+
+  r = cond.wait();
+  return (r < 0) ? r : len;
+}
+
template <typename I>
void ImageRequestWQ<I>::aio_read(AioCompletion *c, uint64_t off, uint64_t len,
ReadResult &&read_result, int op_flags,
trace.event("finish");
}
+// Asynchronous compare-and-write entry point of the work queue.  Depending
+// on the image state the request is either queued for later processing or
+// sent immediately from the calling thread.
+template <typename I>
+void ImageRequestWQ<I>::aio_compare_and_write(AioCompletion *c,
+ uint64_t off, uint64_t len,
+ bufferlist &&cmp_bl,
+ bufferlist &&bl,
+ uint64_t *mismatch_off,
+ int op_flags, bool native_async) {
+ CephContext *cct = m_image_ctx.cct;
+ // tracing is only initialised when rbd_blkin_trace_all is set
+ ZTracer::Trace trace;
+ if (cct->_conf->rbd_blkin_trace_all) {
+ trace.init("wq: compare_and_write", &m_image_ctx.trace_endpoint);
+ trace.event("init");
+ }
+
+ c->init_time(util::get_image_ctx(&m_image_ctx), AIO_TYPE_COMPARE_AND_WRITE);
+ ldout(cct, 20) << "ictx=" << &m_image_ctx << ", "
+ << "completion=" << c << ", off=" << off << ", "
+ << "len=" << len << dendl;
+
+ // event notification is only enabled for native async callers with a
+ // valid event socket
+ if (native_async && m_image_ctx.event_socket.is_valid()) {
+ c->set_event_notify(true);
+ }
+
+ // NOTE(review): presumably start_in_flight_io() fails the completion
+ // itself when it returns false -- confirm
+ if (!start_in_flight_io(c)) {
+ return;
+ }
+
+ RWLock::RLocker owner_locker(m_image_ctx.owner_lock);
+ if (m_image_ctx.non_blocking_aio || writes_blocked()) {
+ // defer: enqueue a heap-allocated request
+ queue(ImageRequest<I>::create_compare_and_write_request(
+ m_image_ctx, c, {{off, len}}, std::move(cmp_bl), std::move(bl),
+ mismatch_off, op_flags, trace));
+ } else {
+ // send the request inline, then release the in-flight IO slot
+ c->start_op();
+ ImageRequest<I>::aio_compare_and_write(&m_image_ctx, c, {{off, len}},
+ std::move(cmp_bl), std::move(bl),
+ mismatch_off, op_flags, trace);
+ finish_in_flight_io();
+ }
+ trace.event("finish");
+}
+
template <typename I>
void ImageRequestWQ<I>::shut_down(Context *on_shutdown) {
assert(m_image_ctx.owner_lock.is_locked());
ssize_t write(uint64_t off, uint64_t len, bufferlist &&bl, int op_flags);
ssize_t discard(uint64_t off, uint64_t len, bool skip_partial_discard);
ssize_t writesame(uint64_t off, uint64_t len, bufferlist &&bl, int op_flags);
+ ssize_t compare_and_write(uint64_t off, uint64_t len,
+ bufferlist &&cmp_bl, bufferlist &&bl,
+ uint64_t *mismatch_off, int op_flags);
void aio_read(AioCompletion *c, uint64_t off, uint64_t len,
ReadResult &&read_result, int op_flags, bool native_async=true);
void aio_flush(AioCompletion *c, bool native_async=true);
void aio_writesame(AioCompletion *c, uint64_t off, uint64_t len,
bufferlist &&bl, int op_flags, bool native_async=true);
+ void aio_compare_and_write(AioCompletion *c, uint64_t off,
+ uint64_t len, bufferlist &&cmp_bl,
+ bufferlist &&bl, uint64_t *mismatch_off,
+ int op_flags, bool native_async=true);
using ThreadPool::PointerWQ<ImageRequest<ImageCtxT> >::drain;
#include "common/RWLock.h"
#include "common/WorkQueue.h"
#include "include/Context.h"
+#include "include/err.h"
#include "librbd/ExclusiveLock.h"
#include "librbd/ImageCtx.h"
op_flags, parent_trace, completion);
}
+// Factory: returns a heap-allocated ObjectCompareAndWriteRequest for the
+// given object extent, compare data and write data.
+template <typename I>
+ObjectRequest<I>*
+ObjectRequest<I>::create_compare_and_write(I *ictx, const std::string &oid,
+                                           uint64_t object_no, uint64_t object_off,
+                                           const ceph::bufferlist &cmp_data,
+                                           const ceph::bufferlist &write_data,
+                                           const ::SnapContext &snapc,
+                                           uint64_t *mismatch_offset,
+                                           int op_flags,
+                                           const ZTracer::Trace &parent_trace,
+                                           Context *completion) {
+  auto *image_ctx = util::get_image_ctx(ictx);
+  return new ObjectCompareAndWriteRequest(image_ctx, oid, object_no,
+                                          object_off, cmp_data, write_data,
+                                          snapc, mismatch_offset, op_flags,
+                                          parent_trace, completion);
+}
+
template <typename I>
ObjectRequest<I>::ObjectRequest(ImageCtx *ictx, const std::string &oid,
uint64_t objectno, uint64_t off,
AbstractObjectWriteRequest::send_write();
}
+// Populates the write operation: optional allocation hint, then the cmpext
+// against the compare buffer, then the write itself (write_full when the
+// request covers the entire object), and finally the op flags.
+void ObjectCompareAndWriteRequest::add_write_ops(librados::ObjectWriteOperation *wr,
+                                                 bool set_hints) {
+  RWLock::RLocker snap_locker(m_ictx->snap_lock);
+
+  if (set_hints && m_ictx->enable_alloc_hint &&
+      (m_ictx->object_map == nullptr || !m_object_exist)) {
+    wr->set_alloc_hint(m_ictx->get_object_size(), m_ictx->get_object_size());
+  }
+
+  // add cmpext ops
+  wr->cmpext(m_object_off, m_cmp_bl, nullptr);
+
+  const bool covers_whole_object =
+    (m_object_off == 0 && m_object_len == m_ictx->get_object_size());
+  if (covers_whole_object) {
+    wr->write_full(m_write_bl);
+  } else {
+    wr->write(m_object_off, m_write_bl);
+  }
+
+  wr->set_op_flags2(m_op_flags);
+}
+
+// Logs the request, clears m_guard for a whole-object write on an image
+// without a parent, then delegates to the base class implementation.
+void ObjectCompareAndWriteRequest::send_write() {
+  const bool write_full = (m_object_off == 0 &&
+                           m_object_len == m_ictx->get_object_size());
+  ldout(m_ictx->cct, 20) << "send_write " << this << " " << m_oid << " "
+                         << m_object_off << "~" << m_object_len
+                         << " object exist " << m_object_exist
+                         << " write_full " << write_full << dendl;
+
+  // has_parent() is only consulted for whole-object writes
+  if (write_full) {
+    if (!has_parent()) {
+      m_guard = false;
+    }
+  }
+
+  AbstractObjectWriteRequest::send_write();
+}
+
+// Completion handler for the object-level compare-and-write.
+//
+// Nothing happens unless should_complete(r) returns true.  When it does, a
+// result at or below -MAX_ERRNO is treated as a compare mismatch: the
+// mismatch offset is decoded from r, translated through the striper,
+// stored through m_mismatch_offset (if non-null) and the result becomes
+// -EILSEQ.  The wrapped completion is then fired and this request deletes
+// itself.
+void ObjectCompareAndWriteRequest::complete(int r)
+{
+ if (should_complete(r)) {
+ ImageCtx *image_ctx = this->m_ictx;
+ ldout(m_ictx->cct, 20) << "complete " << this << dendl;
+
+ // -ENOENT is reported as success when m_hide_enoent is set
+ if (this->m_hide_enoent && r == -ENOENT) {
+ r = 0;
+ }
+
+ vector<pair<uint64_t,uint64_t> > file_extents;
+ if (r <= -MAX_ERRNO) {
+ // object extent compare mismatch
+ // the offset is encoded in the result as (-MAX_ERRNO - offset)
+ uint64_t offset = -MAX_ERRNO - r;
+ // NOTE(review): presumably maps the object-relative offset back to
+ // image (file) extents -- confirm against Striper::extent_to_file
+ Striper::extent_to_file(image_ctx->cct, &image_ctx->layout,
+ this->m_object_no, offset, this->m_object_len,
+ file_extents);
+
+ assert(file_extents.size() == 1);
+
+ uint64_t mismatch_offset = file_extents[0].first;
+ // the output pointer is optional
+ if (this->m_mismatch_offset)
+ *this->m_mismatch_offset = mismatch_offset;
+ r = -EILSEQ;
+ }
+
+ //compare and write object extent error
+ m_completion->complete(r);
+ // the request owns itself; no member may be touched after this point
+ delete this;
+ }
+}
+
} // namespace io
} // namespace librbd
int op_flags,
const ZTracer::Trace &parent_trace,
Context *completion);
+ static ObjectRequest* create_compare_and_write(ImageCtxT *ictx,
+ const std::string &oid,
+ uint64_t object_no,
+ uint64_t object_off,
+ const ceph::bufferlist &cmp_data,
+ const ceph::bufferlist &write_data,
+ const ::SnapContext &snapc,
+ uint64_t *mismatch_offset, int op_flags,
+ const ZTracer::Trace &parent_trace,
+ Context *completion);
ObjectRequest(ImageCtx *ictx, const std::string &oid,
uint64_t objectno, uint64_t off, uint64_t len,
bool set_hints) {
};
- void complete(int r) override;
+ virtual void complete(int r);
virtual bool should_complete(int r) = 0;
void send() override = 0;
int m_op_flags;
};
+// Object-level compare-and-write request: issues a cmpext against m_cmp_bl
+// followed by a write of m_write_bl (see add_write_ops).
+class ObjectCompareAndWriteRequest : public AbstractObjectWriteRequest {
+public:
+ typedef std::vector<std::pair<uint64_t, uint64_t> > Extents;
+
+ // Both bufferlists are copied into the request.  The object extent length
+ // handed to the base class is the length of the compare buffer.
+ ObjectCompareAndWriteRequest(ImageCtx *ictx, const std::string &oid,
+ uint64_t object_no, uint64_t object_off,
+ const ceph::bufferlist &cmp_bl,
+ const ceph::bufferlist &write_bl,
+ const ::SnapContext &snapc,
+ uint64_t *mismatch_offset, int op_flags,
+ const ZTracer::Trace &parent_trace,
+ Context *completion)
+ : AbstractObjectWriteRequest(ictx, oid, object_no, object_off,
+ cmp_bl.length(), snapc, false, "compare_and_write",
+ parent_trace, completion),
+ m_cmp_bl(cmp_bl), m_write_bl(write_bl),
+ m_mismatch_offset(mismatch_offset), m_op_flags(op_flags) {
+ }
+
+ const char *get_op_type() const override {
+ return "compare_and_write";
+ }
+
+ // Always requests an object map update to OBJECT_EXISTS.
+ bool pre_object_map_update(uint8_t *new_state) override {
+ *new_state = OBJECT_EXISTS;
+ return true;
+ }
+
+ // Translates a compare mismatch into -EILSEQ before completing.
+ void complete(int r) override;
+protected:
+ void add_write_ops(librados::ObjectWriteOperation *wr,
+ bool set_hints) override;
+
+ void send_write() override;
+
+private:
+ ceph::bufferlist m_cmp_bl; // data to compare against
+ ceph::bufferlist m_write_bl; // data to write
+ uint64_t *m_mismatch_offset; // optional output, written on mismatch
+ int m_op_flags;
+};
+
} // namespace io
} // namespace librbd
AIO_TYPE_DISCARD,
AIO_TYPE_FLUSH,
AIO_TYPE_WRITESAME,
+ AIO_TYPE_COMPARE_AND_WRITE,
} aio_type_t;
enum Direction {
bool flush_required;
auto aio_comp = create_aio_modify_completion(on_ready, on_safe,
io::AIO_TYPE_DISCARD,
- &flush_required);
+ &flush_required,
+ {});
if (aio_comp == nullptr) {
return;
}
bool flush_required;
auto aio_comp = create_aio_modify_completion(on_ready, on_safe,
io::AIO_TYPE_WRITE,
- &flush_required);
+ &flush_required,
+ {});
if (aio_comp == nullptr) {
return;
}
bool flush_required;
auto aio_comp = create_aio_modify_completion(on_ready, on_safe,
io::AIO_TYPE_WRITESAME,
- &flush_required);
+ &flush_required,
+ {});
if (aio_comp == nullptr) {
return;
}
}
}
+// Replays a journaled compare-and-write event against the image.
+//
+// -EILSEQ is passed as a filtered result, so a compare mismatch during
+// replay is reported as success by handle_aio_modify_complete().  No
+// mismatch offset is requested (nullptr) and the op flags are 0.
+template <typename I>
+void Replay<I>::handle_event(const journal::AioCompareAndWriteEvent &event,
+                             Context *on_ready, Context *on_safe) {
+  CephContext *cct = m_image_ctx.cct;
+  ldout(cct, 20) << ": AIO CompareAndWrite event" << dendl;
+
+  bool flush_required;
+  auto aio_comp = create_aio_modify_completion(on_ready, on_safe,
+                                               io::AIO_TYPE_COMPARE_AND_WRITE,
+                                               &flush_required,
+                                               {-EILSEQ});
+  // same guard as the discard/write/writesame handlers: no completion means
+  // there is nothing to issue, and it must not be dereferenced
+  if (aio_comp == nullptr) {
+    return;
+  }
+
+  // the event is const, so the buffers are copied before being moved into
+  // the request
+  bufferlist cmp_data = event.cmp_data;
+  bufferlist write_data = event.write_data;
+  io::ImageRequest<I>::aio_compare_and_write(&m_image_ctx, aio_comp,
+                                             {{event.offset, event.length}},
+                                             std::move(cmp_data),
+                                             std::move(write_data),
+                                             nullptr, 0, {});
+  if (flush_required) {
+    m_lock.Lock();
+    auto flush_comp = create_aio_flush_completion(nullptr);
+    m_lock.Unlock();
+
+    io::ImageRequest<I>::aio_flush(&m_image_ctx, flush_comp, {});
+  }
+}
+
template <typename I>
void Replay<I>::handle_event(const journal::OpFinishEvent &event,
Context *on_ready, Context *on_safe) {
template <typename I>
void Replay<I>::handle_aio_modify_complete(Context *on_ready, Context *on_safe,
- int r) {
+ int r, std::set<int> &filters) {
Mutex::Locker locker(m_lock);
CephContext *cct = m_image_ctx.cct;
ldout(cct, 20) << ": on_ready=" << on_ready << ", "
if (on_ready != nullptr) {
on_ready->complete(0);
}
+
+ if (filters.find(r) != filters.end())
+ r = 0;
+
if (r < 0) {
lderr(cct) << ": AIO modify op failed: " << cpp_strerror(r) << dendl;
on_safe->complete(r);
template <typename I>
io::AioCompletion *
-Replay<I>::create_aio_modify_completion(Context *on_ready, Context *on_safe,
+Replay<I>::create_aio_modify_completion(Context *on_ready,
+ Context *on_safe,
io::aio_type_t aio_type,
- bool *flush_required) {
+ bool *flush_required,
+ std::set<int> &&filters) {
Mutex::Locker locker(m_lock);
CephContext *cct = m_image_ctx.cct;
assert(m_on_aio_ready == nullptr);
// event. when flushed, the completion of the next flush will fire the
// on_safe callback
auto aio_comp = io::AioCompletion::create_and_start<Context>(
- new C_AioModifyComplete(this, on_ready, on_safe),
+ new C_AioModifyComplete(this, on_ready, on_safe, std::move(filters)),
util::get_image_ctx(&m_image_ctx), aio_type);
return aio_comp;
}
Replay *replay;
Context *on_ready;
Context *on_safe;
- C_AioModifyComplete(Replay *replay, Context *on_ready, Context *on_safe)
- : replay(replay), on_ready(on_ready), on_safe(on_safe) {
+ std::set<int> filters;
+ C_AioModifyComplete(Replay *replay, Context *on_ready,
+ Context *on_safe, std::set<int> &&filters)
+ : replay(replay), on_ready(on_ready), on_safe(on_safe),
+ filters(std::move(filters)) {
}
void finish(int r) override {
- replay->handle_aio_modify_complete(on_ready, on_safe, r);
+ replay->handle_aio_modify_complete(on_ready, on_safe, r, filters);
}
};
Context *on_safe);
void handle_event(const AioWriteSameEvent &event, Context *on_ready,
Context *on_safe);
+ void handle_event(const AioCompareAndWriteEvent &event, Context *on_ready,
+ Context *on_safe);
void handle_event(const AioFlushEvent &event, Context *on_ready,
Context *on_safe);
void handle_event(const OpFinishEvent &event, Context *on_ready,
void handle_event(const UnknownEvent &event, Context *on_ready,
Context *on_safe);
- void handle_aio_modify_complete(Context *on_ready, Context *on_safe, int r);
+ void handle_aio_modify_complete(Context *on_ready, Context *on_safe,
+ int r, std::set<int> &filters);
void handle_aio_flush_complete(Context *on_flush_safe, Contexts &on_safe_ctxs,
int r);
io::AioCompletion *create_aio_modify_completion(Context *on_ready,
Context *on_safe,
io::aio_type_t aio_type,
- bool *flush_required);
+ bool *flush_required,
+ std::set<int> &&filters);
io::AioCompletion *create_aio_flush_completion(Context *on_safe);
void handle_aio_completion(io::AioCompletion *aio_comp);
f->dump_unsigned("length", length);
}
+// Fixed (payload-independent) encoded size of the event.
+// NOTE(review): the constant 32 is larger than the two uint64_t fields named
+// in the inline comment (16 bytes); presumably the remainder accounts for
+// the encoding overhead of the two bufferlists -- confirm.
+uint32_t AioCompareAndWriteEvent::get_fixed_size() {
+ return EventEntry::get_fixed_size() + 32 /* offset, length */;
+}
+
+// Encodes the fields in the order: offset, length, cmp_data, write_data.
+void AioCompareAndWriteEvent::encode(bufferlist& bl) const {
+ ::encode(offset, bl);
+ ::encode(length, bl);
+ ::encode(cmp_data, bl);
+ ::encode(write_data, bl);
+}
+
+// Decodes the fields in the same order as encode(); version is unused.
+void AioCompareAndWriteEvent::decode(__u8 version, bufferlist::iterator& it) {
+ ::decode(offset, it);
+ ::decode(length, it);
+ ::decode(cmp_data, it);
+ ::decode(write_data, it);
+}
+
+// Dumps only offset and length; the data buffers are not dumped.
+void AioCompareAndWriteEvent::dump(Formatter *f) const {
+ f->dump_unsigned("offset", offset);
+ f->dump_unsigned("length", length);
+}
+
void AioFlushEvent::encode(bufferlist& bl) const {
}
case EVENT_TYPE_AIO_WRITESAME:
event = AioWriteSameEvent();
break;
+ case EVENT_TYPE_AIO_COMPARE_AND_WRITE:
+ event = AioCompareAndWriteEvent();
+ break;
default:
event = UnknownEvent();
break;
case EVENT_TYPE_AIO_WRITESAME:
out << "AioWriteSame";
break;
+ case EVENT_TYPE_AIO_COMPARE_AND_WRITE:
+ out << "AioCompareAndWrite";
+ break;
default:
out << "Unknown (" << static_cast<uint32_t>(type) << ")";
break;
#include <boost/none.hpp>
#include <boost/optional.hpp>
#include <boost/variant.hpp>
+#include <boost/mpl/vector.hpp>
namespace ceph {
class Formatter;
namespace journal {
enum EventType {
- EVENT_TYPE_AIO_DISCARD = 0,
- EVENT_TYPE_AIO_WRITE = 1,
- EVENT_TYPE_AIO_FLUSH = 2,
- EVENT_TYPE_OP_FINISH = 3,
- EVENT_TYPE_SNAP_CREATE = 4,
- EVENT_TYPE_SNAP_REMOVE = 5,
- EVENT_TYPE_SNAP_RENAME = 6,
- EVENT_TYPE_SNAP_PROTECT = 7,
- EVENT_TYPE_SNAP_UNPROTECT = 8,
- EVENT_TYPE_SNAP_ROLLBACK = 9,
- EVENT_TYPE_RENAME = 10,
- EVENT_TYPE_RESIZE = 11,
- EVENT_TYPE_FLATTEN = 12,
- EVENT_TYPE_DEMOTE_PROMOTE = 13,
- EVENT_TYPE_SNAP_LIMIT = 14,
- EVENT_TYPE_UPDATE_FEATURES = 15,
- EVENT_TYPE_METADATA_SET = 16,
- EVENT_TYPE_METADATA_REMOVE = 17,
- EVENT_TYPE_AIO_WRITESAME = 18,
+ EVENT_TYPE_AIO_DISCARD = 0,
+ EVENT_TYPE_AIO_WRITE = 1,
+ EVENT_TYPE_AIO_FLUSH = 2,
+ EVENT_TYPE_OP_FINISH = 3,
+ EVENT_TYPE_SNAP_CREATE = 4,
+ EVENT_TYPE_SNAP_REMOVE = 5,
+ EVENT_TYPE_SNAP_RENAME = 6,
+ EVENT_TYPE_SNAP_PROTECT = 7,
+ EVENT_TYPE_SNAP_UNPROTECT = 8,
+ EVENT_TYPE_SNAP_ROLLBACK = 9,
+ EVENT_TYPE_RENAME = 10,
+ EVENT_TYPE_RESIZE = 11,
+ EVENT_TYPE_FLATTEN = 12,
+ EVENT_TYPE_DEMOTE_PROMOTE = 13,
+ EVENT_TYPE_SNAP_LIMIT = 14,
+ EVENT_TYPE_UPDATE_FEATURES = 15,
+ EVENT_TYPE_METADATA_SET = 16,
+ EVENT_TYPE_METADATA_REMOVE = 17,
+ EVENT_TYPE_AIO_WRITESAME = 18,
+ EVENT_TYPE_AIO_COMPARE_AND_WRITE = 19,
};
struct AioDiscardEvent {
void dump(Formatter *f) const;
};
+// Journal event describing a compare-and-write: the image extent
+// (offset, length), the data to compare against and the data to write.
+struct AioCompareAndWriteEvent {
+ static const EventType TYPE = EVENT_TYPE_AIO_COMPARE_AND_WRITE;
+
+ uint64_t offset;
+ uint64_t length;
+ bufferlist cmp_data; // expected content
+ bufferlist write_data; // replacement content
+
+ static uint32_t get_fixed_size();
+
+ // Default-constructed events have a zero offset/length and empty buffers.
+ AioCompareAndWriteEvent() : offset(0), length(0) {
+ }
+ // Copies both bufferlists into the event.
+ AioCompareAndWriteEvent(uint64_t _offset, uint64_t _length,
+ const bufferlist &_cmp_data, const bufferlist &_write_data)
+ : offset(_offset), length(_length), cmp_data(_cmp_data), write_data(_write_data) {
+ }
+
+ void encode(bufferlist& bl) const;
+ void decode(__u8 version, bufferlist::iterator& it);
+ void dump(Formatter *f) const;
+};
+
struct AioFlushEvent {
static const EventType TYPE = EVENT_TYPE_AIO_FLUSH;
void dump(Formatter *f) const;
};
-typedef boost::variant<AioDiscardEvent,
- AioWriteEvent,
- AioFlushEvent,
- OpFinishEvent,
- SnapCreateEvent,
- SnapRemoveEvent,
- SnapRenameEvent,
- SnapProtectEvent,
- SnapUnprotectEvent,
- SnapRollbackEvent,
- RenameEvent,
- ResizeEvent,
- FlattenEvent,
- DemotePromoteEvent,
- SnapLimitEvent,
- UpdateFeaturesEvent,
- MetadataSetEvent,
- MetadataRemoveEvent,
- AioWriteSameEvent,
- UnknownEvent> Event;
+typedef boost::mpl::vector<AioDiscardEvent,
+ AioWriteEvent,
+ AioFlushEvent,
+ OpFinishEvent,
+ SnapCreateEvent,
+ SnapRemoveEvent,
+ SnapRenameEvent,
+ SnapProtectEvent,
+ SnapUnprotectEvent,
+ SnapRollbackEvent,
+ RenameEvent,
+ ResizeEvent,
+ FlattenEvent,
+ DemotePromoteEvent,
+ SnapLimitEvent,
+ UpdateFeaturesEvent,
+ MetadataSetEvent,
+ MetadataRemoveEvent,
+ AioWriteSameEvent,
+ AioCompareAndWriteEvent,
+ UnknownEvent> EventVector;
+typedef boost::make_variant_over<EventVector>::type Event;
struct EventEntry {
static uint32_t get_fixed_size() {
return r;
}
+  // Synchronous compare-and-write.
+  //
+  // Compares len bytes at image offset ofs against cmp_bl and, on a match,
+  // writes len bytes from bl.  Both buffers must hold at least len bytes,
+  // otherwise -EINVAL is returned without issuing any IO.  mismatch_off and
+  // op_flags are forwarded unchanged to the IO work queue, whose result is
+  // returned.
+  ssize_t Image::compare_and_write(uint64_t ofs, size_t len,
+                                   ceph::bufferlist &cmp_bl, ceph::bufferlist& bl,
+                                   uint64_t *mismatch_off, int op_flags)
+  {
+    ImageCtx *ictx = (ImageCtx *)ctx;
+    const bool cmp_bl_too_short = cmp_bl.length() < len;
+    const bool bl_too_short = bl.length() < len;
+    tracepoint(librbd, compare_and_write_enter, ictx, ictx->name.c_str(),
+               ictx->snap_name.c_str(),
+               ictx->read_only, ofs, len,
+               cmp_bl_too_short ? NULL : cmp_bl.c_str(),
+               bl_too_short ? NULL : bl.c_str(), op_flags);
+
+    // reject short buffers, and pair the enter tracepoint with the matching
+    // compare_and_write exit tracepoint (not the plain write one)
+    if (cmp_bl_too_short || bl_too_short) {
+      tracepoint(librbd, compare_and_write_exit, -EINVAL);
+      return -EINVAL;
+    }
+
+    // keep the full ssize_t result instead of narrowing it to int
+    ssize_t r = ictx->io_work_queue->compare_and_write(ofs, len,
+                                                       bufferlist{cmp_bl},
+                                                       bufferlist{bl},
+                                                       mismatch_off,
+                                                       op_flags);
+
+    tracepoint(librbd, compare_and_write_exit, r);
+
+    return r;
+  }
int Image::aio_write(uint64_t off, size_t len, bufferlist& bl,
RBD::AioCompletion *c)
{
return 0;
}
+  // Asynchronous compare-and-write.
+  //
+  // Queues a compare of len bytes at image offset off against cmp_bl
+  // followed, on a match, by a write of len bytes from bl; the outcome is
+  // delivered through completion c.  Both buffers must hold at least len
+  // bytes, otherwise -EINVAL is returned and nothing is queued.  Returns 0
+  // once the request has been handed to the IO work queue.
+  int Image::aio_compare_and_write(uint64_t off, size_t len,
+                                   ceph::bufferlist& cmp_bl, ceph::bufferlist& bl,
+                                   RBD::AioCompletion *c, uint64_t *mismatch_off,
+                                   int op_flags)
+  {
+    ImageCtx *ictx = (ImageCtx *)ctx;
+    const bool cmp_bl_too_short = cmp_bl.length() < len;
+    const bool bl_too_short = bl.length() < len;
+    tracepoint(librbd, aio_compare_and_write_enter, ictx, ictx->name.c_str(),
+               ictx->snap_name.c_str(),
+               ictx->read_only, off, len,
+               cmp_bl_too_short ? NULL : cmp_bl.c_str(),
+               bl_too_short ? NULL : bl.c_str(), c->pc, op_flags);
+
+    // reject short buffers, and pair the aio enter tracepoint with the aio
+    // exit tracepoint (not the synchronous one)
+    if (cmp_bl_too_short || bl_too_short) {
+      tracepoint(librbd, aio_compare_and_write_exit, -EINVAL);
+      return -EINVAL;
+    }
+
+    ictx->io_work_queue->aio_compare_and_write(get_aio_completion(c), off, len,
+                                               bufferlist{cmp_bl}, bufferlist{bl},
+                                               mismatch_off, op_flags, false);
+
+    tracepoint(librbd, aio_compare_and_write_exit, 0);
+
+    return 0;
+  }
int Image::invalidate_cache()
{
return r;
}
+// C API: synchronous compare-and-write.  Wraps cmp_buf and buf (len bytes
+// each) into bufferlists and forwards the request to the IO work queue.
+extern "C" ssize_t rbd_compare_and_write(rbd_image_t image,
+                                         uint64_t ofs, size_t len,
+                                         const char *cmp_buf,
+                                         const char *buf,
+                                         uint64_t *mismatch_off,
+                                         int op_flags)
+{
+  librbd::ImageCtx *ictx = (librbd::ImageCtx *)image;
+  tracepoint(librbd, compare_and_write_enter, ictx, ictx->name.c_str(),
+             ictx->snap_name.c_str(), ictx->read_only, ofs,
+             len, cmp_buf, buf, op_flags);
+
+  bufferlist expected_bl;
+  expected_bl.push_back(create_write_raw(ictx, cmp_buf, len));
+
+  bufferlist payload_bl;
+  payload_bl.push_back(create_write_raw(ictx, buf, len));
+
+  int result = ictx->io_work_queue->compare_and_write(ofs, len,
+                                                      std::move(expected_bl),
+                                                      std::move(payload_bl),
+                                                      mismatch_off,
+                                                      op_flags);
+  tracepoint(librbd, compare_and_write_exit, result);
+  return result;
+}
+
extern "C" int rbd_aio_create_completion(void *cb_arg,
rbd_callback_t complete_cb,
rbd_completion_t *c)
return 0;
}
+// C API: asynchronous compare-and-write.  Wraps cmp_buf and buf (len bytes
+// each) into bufferlists, queues the request on the IO work queue and
+// returns 0; the outcome is delivered through completion c.
+extern "C" ssize_t rbd_aio_compare_and_write(rbd_image_t image, uint64_t off,
+                                             size_t len, const char *cmp_buf,
+                                             const char *buf, rbd_completion_t c,
+                                             uint64_t *mismatch_off,
+                                             int op_flags)
+{
+  librbd::ImageCtx *ictx = (librbd::ImageCtx *)image;
+  librbd::RBD::AioCompletion *comp = (librbd::RBD::AioCompletion *)c;
+  tracepoint(librbd, aio_compare_and_write_enter, ictx, ictx->name.c_str(), ictx->snap_name.c_str(),
+             ictx->read_only, off, len, cmp_buf, buf, comp->pc, op_flags);
+
+  bufferlist expected_bl;
+  expected_bl.push_back(create_write_raw(ictx, cmp_buf, len));
+
+  bufferlist payload_bl;
+  payload_bl.push_back(create_write_raw(ictx, buf, len));
+
+  ictx->io_work_queue->aio_compare_and_write(get_aio_completion(comp), off, len,
+                                             std::move(expected_bl),
+                                             std::move(payload_bl),
+                                             mismatch_off, op_flags, false);
+
+  tracepoint(librbd, aio_compare_and_write_exit, 0);
+  return 0;
+}
+
extern "C" int rbd_invalidate_cache(rbd_image_t image)
{
librbd::ImageCtx *ictx = (librbd::ImageCtx *)image;
public:
SubsystemMap() : m_max_name_len(0) {}
- int get_num() const {
+ size_t get_num() const {
return m_subsys.size();
}
inode_t *pi = get_projected_inode();
fnode_t *pf = dir->project_fnode();
- pf->version = dir->pre_dirty();
const char *ename = 0;
switch (lock->get_type()) {
pf->rstat.version = pi->rstat.version;
pf->accounted_rstat = pf->rstat;
ename = "lock inest accounted scatter stat update";
+
+ if (!is_auth() && lock->get_state() == LOCK_MIX) {
+ dout(10) << "finish_scatter_update try to assimilate dirty rstat on "
+ << *dir << dendl;
+ dir->assimilate_dirty_rstat_inodes();
+ }
+
break;
default:
ceph_abort();
}
+ pf->version = dir->pre_dirty();
mut->add_projected_fnode(dir);
EUpdate *le = new EUpdate(mdlog, ename);
assert(!dir->is_frozen());
mut->auth_pin(dir);
+
+ if (lock->get_type() == CEPH_LOCK_INEST &&
+ !is_auth() && lock->get_state() == LOCK_MIX) {
+ dout(10) << "finish_scatter_update finish assimilating dirty rstat on "
+ << *dir << dendl;
+ dir->assimilate_dirty_rstat_inodes_finish(mut, &le->metablob);
+
+ if (!(pf->rstat == pf->accounted_rstat)) {
+ if (mut->wrlocks.count(&nestlock) == 0) {
+ mdcache->mds->locker->wrlock_force(&nestlock, mut);
+ }
+
+ mdcache->mds->locker->mark_updated_scatterlock(&nestlock);
+ mut->ls->dirty_dirfrag_nest.push_back(&item_dirty_dirfrag_nest);
+ }
+ }
mdlog->submit_entry(le, new C_Inode_FragUpdate(this, dir, mut));
} else {
{
dout(10) << "_finish_frag_update on " << *dir << dendl;
mut->apply();
+ mdcache->mds->locker->drop_locks(mut.get());
mut->cleanup();
}
new_fs->mds_map.standby_count_wanted = fs->mds_map.standby_count_wanted;
new_fs->mds_map.enabled = true;
+ // Remember mds ranks that have ever started. (They should load old inotable
+ // instead of creating new one if they start again.)
+ new_fs->mds_map.stopped.insert(fs->mds_map.in.begin(), fs->mds_map.in.end());
+ new_fs->mds_map.stopped.insert(fs->mds_map.stopped.begin(), fs->mds_map.stopped.end());
+ new_fs->mds_map.stopped.erase(mds_rank_t(0));
+
// Persist the new FSMap
filesystems[new_fs->fscid] = new_fs;
}
// okay, we actually need to kick the head's lock to get ourselves synced up.
CInode *head = mdcache->get_inode(in->ino());
assert(head);
- SimpleLock *hlock = head->get_lock(lock->get_type());
+ SimpleLock *hlock = head->get_lock(CEPH_LOCK_IFILE);
+ if (hlock->get_state() == LOCK_SYNC)
+ hlock = head->get_lock(lock->get_type());
+
if (hlock->get_state() != LOCK_SYNC) {
dout(10) << "rdlock_start trying head inode " << *head << dendl;
- if (!rdlock_start(head->get_lock(lock->get_type()), mut, true)) // ** as_anon, no rdlock on EXCL **
+ if (!rdlock_start(hlock, mut, true)) // ** as_anon, no rdlock on EXCL **
return false;
// oh, check our lock again then
}
-void Locker::_do_null_snapflush(CInode *head_in, client_t client)
+void Locker::_do_null_snapflush(CInode *head_in, client_t client, snapid_t last)
{
dout(10) << "_do_null_snapflush client." << client << " on " << *head_in << dendl;
- compact_map<snapid_t, set<client_t> >::iterator p = head_in->client_need_snapflush.begin();
- while (p != head_in->client_need_snapflush.end()) {
+ for (auto p = head_in->client_need_snapflush.begin();
+ p != head_in->client_need_snapflush.end() && p->first < last; ) {
snapid_t snapid = p->first;
set<client_t>& clients = p->second;
++p; // be careful, q loop below depends on this
// this cap now follows a later snap (i.e. the one initiating this flush, or later)
if (in == head_in)
cap->client_follows = snap < CEPH_NOSNAP ? snap : realm->get_newest_seq();
+ else if (head_in->client_need_snapflush.begin()->first < snap)
+ _do_null_snapflush(head_in, client, snap);
_do_snap_update(in, snap, m->get_dirty(), follows, client, m, ack);
mds->mdlog->flush();
break;
}
-
+
// ok
- lock->decode_locked_state(m->get_data());
lock->set_state(LOCK_MIX);
+ lock->decode_locked_state(m->get_data());
if (caps)
issue_caps(in);
void handle_client_caps(class MClientCaps *m);
void _update_cap_fields(CInode *in, int dirty, MClientCaps *m, inode_t *pi);
void _do_snap_update(CInode *in, snapid_t snap, int dirty, snapid_t follows, client_t client, MClientCaps *m, MClientCaps *ack);
- void _do_null_snapflush(CInode *head_in, client_t client);
+ void _do_null_snapflush(CInode *head_in, client_t client, snapid_t last=CEPH_NOSNAP);
bool _do_cap_update(CInode *in, Capability *cap, int dirty, snapid_t follows, MClientCaps *m,
MClientCaps *ack=0, bool *need_flush=NULL);
void handle_client_cap_release(class MClientCapRelease *m);
rejoin_imported_caps.clear();
}
+// Deferred context that re-evaluates the cap locks of one inode and, when
+// eval() reports false, issues caps for it. The inode is pinned with
+// PIN_PTRWAITER from construction until finish() has run.
+class C_MDC_ReIssueCaps : public MDCacheContext {
+  CInode *in;
+public:
+  C_MDC_ReIssueCaps(MDCache *mdc, CInode *i)
+    : MDCacheContext(mdc), in(i)
+  {
+    // Keep the inode referenced while this context is pending.
+    in->get(CInode::PIN_PTRWAITER);
+  }
+  void finish(int r) override {
+    auto &locker = mdcache->mds->locker;
+    if (!locker->eval(in, CEPH_CAP_LOCKS)) {
+      locker->issue_caps(in);
+    }
+    // Drop the pin taken in the constructor.
+    in->put(CInode::PIN_PTRWAITER);
+  }
+};
void MDCache::reissue_all_caps()
{
++p) {
CInode *in = p->second;
if (in->is_head() && in->is_any_caps()) {
+ // called by MDSRank::active_start(). There shouldn't be any frozen subtree.
+ if (in->is_frozen_inode()) {
+ in->add_waiter(CInode::WAIT_UNFREEZE, new C_MDC_ReIssueCaps(this, in));
+ continue;
+ }
if (!mds->locker->eval(in, CEPH_CAP_LOCKS))
mds->locker->issue_caps(in);
}
<< ", " << Capability::count() << " caps, " << caps_per_inode << " caps per inode"
<< dendl;
+ mds->update_mlogger();
mds->mlogger->set(l_mdm_rss, last.get_rss());
mds->mlogger->set(l_mdm_heap, last.get_heap());
assert(!migrator->is_exporting());
assert(!migrator->is_importing());
+ if ((myin && myin->is_auth_pinned()) ||
+ (mydir && mydir->is_auth_pinned())) {
+ dout(7) << "still have auth pinned objects" << dendl;
+ return false;
+ }
// flush what we can from the log
mds->mdlog->trim(0);
/* Stray/purge statistics */
pcb.add_u64(l_mdc_num_strays, "num_strays",
- "Stray dentries", "stry");
+ "Stray dentries", "stry", PerfCountersBuilder::PRIO_INTERESTING);
pcb.add_u64(l_mdc_num_strays_delayed, "num_strays_delayed", "Stray dentries delayed");
pcb.add_u64(l_mdc_num_strays_enqueuing, "num_strays_enqueuing", "Stray dentries enqueuing for purge");
/* Recovery queue statistics */
pcb.add_u64(l_mdc_num_recovering_processing, "num_recovering_processing", "Files currently being recovered");
pcb.add_u64(l_mdc_num_recovering_enqueued, "num_recovering_enqueued",
- "Files waiting for recovery", "recy");
+ "Files waiting for recovery", "recy", PerfCountersBuilder::PRIO_INTERESTING);
pcb.add_u64(l_mdc_num_recovering_prioritized, "num_recovering_prioritized", "Files waiting for recovery with elevated priority");
pcb.add_u64_counter(l_mdc_recovery_started, "recovery_started", "File recoveries started");
pcb.add_u64_counter(l_mdc_recovery_completed, "recovery_completed",
- "File recoveries completed", "recd");
+ "File recoveries completed", "recd", PerfCountersBuilder::PRIO_INTERESTING);
logger.reset(pcb.create_perf_counters());
g_ceph_context->get_perfcounters_collection()->add(logger.get());
PerfCountersBuilder plb(g_ceph_context, "mds_log", l_mdl_first, l_mdl_last);
plb.add_u64_counter(l_mdl_evadd, "evadd",
- "Events submitted", "subm");
+ "Events submitted", "subm", PerfCountersBuilder::PRIO_INTERESTING);
plb.add_u64_counter(l_mdl_evex, "evex", "Total expired events");
plb.add_u64_counter(l_mdl_evtrm, "evtrm", "Trimmed events");
plb.add_u64(l_mdl_ev, "ev",
- "Events", "evts");
+ "Events", "evts", PerfCountersBuilder::PRIO_INTERESTING);
plb.add_u64(l_mdl_evexg, "evexg", "Expiring events");
plb.add_u64(l_mdl_evexd, "evexd", "Current expired events");
plb.add_u64_counter(l_mdl_segex, "segex", "Total expired segments");
plb.add_u64_counter(l_mdl_segtrm, "segtrm", "Trimmed segments");
plb.add_u64(l_mdl_seg, "seg",
- "Segments", "segs");
+ "Segments", "segs", PerfCountersBuilder::PRIO_INTERESTING);
plb.add_u64(l_mdl_segexg, "segexg", "Expiring segments");
plb.add_u64(l_mdl_segexd, "segexd", "Current expired segments");
#include <unistd.h>
#include "include/compat.h"
-#include "global/signal_handler.h"
-
#include "include/types.h"
#include "include/str_list.h"
-#include "common/entity_name.h"
+
#include "common/Clock.h"
-#include "common/signal.h"
+#include "common/HeartbeatMap.h"
+#include "common/Timer.h"
+#include "common/backport14.h"
#include "common/ceph_argparse.h"
+#include "common/config.h"
+#include "common/entity_name.h"
#include "common/errno.h"
+#include "common/perf_counters.h"
+#include "common/signal.h"
+#include "common/version.h"
+
+#include "global/signal_handler.h"
#include "msg/Messenger.h"
#include "mon/MonClient.h"
#include "SnapServer.h"
#include "SnapClient.h"
-#include "common/HeartbeatMap.h"
-
-#include "common/perf_counters.h"
-
-#include "common/Timer.h"
-
#include "events/ESession.h"
#include "events/ESubtreeMap.h"
#include "auth/RotatingKeyRing.h"
#include "auth/KeyRing.h"
-#include "common/config.h"
-
#include "perfglue/cpu_profiler.h"
#include "perfglue/heap_profiler.h"
explicit C_MDS_Tick(MDSDaemon *m) : mds_daemon(m) {}
void finish(int r) override {
assert(mds_daemon->mds_lock.is_locked_by_me());
-
- mds_daemon->tick_event = 0;
mds_daemon->tick();
}
};
void MDSDaemon::tick()
{
- tick_event = 0;
-
// reschedule
reset_tick();
"name=injected_args,type=CephString,n=N",
"inject configuration arguments into running MDS",
"mds", "*", "cli,rest")
+COMMAND("config set " \
+ "name=key,type=CephString name=value,type=CephString",
+ "Set a configuration option at runtime (not persistent)",
+ "mds", "*", "cli,rest")
COMMAND("exit",
"Terminate this MDS",
"mds", "*", "cli,rest")
"List detected metadata damage", "mds", "r", "cli,rest")
COMMAND("damage rm name=damage_id,type=CephInt",
"Remove a damage table entry", "mds", "rw", "cli,rest")
+COMMAND("version", "report version of MDS", "mds", "r", "cli,rest")
COMMAND("heap " \
"name=heapcmd,type=CephChoices,strings=dump|start_profiler|stop_profiler|release|stats", \
"show heap usage info (available only if compiled with tcmalloc)", \
std::stringstream ds;
std::stringstream ss;
std::string prefix;
+ std::string format;
+ std::unique_ptr<Formatter> f(Formatter::create(format));
cmd_getval(cct, cmdmap, "prefix", prefix);
int r = 0;
if (prefix == "get_command_descriptions") {
int cmdnum = 0;
- JSONFormatter *f = new JSONFormatter();
+ std::unique_ptr<JSONFormatter> f(ceph::make_unique<JSONFormatter>());
f->open_object_section("command_descriptions");
for (MDSCommand *cp = mds_commands;
cp < &mds_commands[ARRAY_SIZE(mds_commands)]; cp++) {
ostringstream secname;
secname << "cmd" << setfill('0') << std::setw(3) << cmdnum;
- dump_cmddesc_to_json(f, secname.str(), cp->cmdstring, cp->helpstring,
+ dump_cmddesc_to_json(f.get(), secname.str(), cp->cmdstring, cp->helpstring,
cp->module, cp->perm, cp->availability, 0);
cmdnum++;
}
f->close_section(); // command_descriptions
f->flush(ds);
- delete f;
+ goto out;
+ }
+
+ cmd_getval(cct, cmdmap, "format", format);
+ if (prefix == "version") {
+ if (f) {
+ f->open_object_section("version");
+ f->dump_string("version", pretty_version_to_str());
+ f->close_section();
+ f->flush(ds);
+ } else {
+ ds << pretty_version_to_str();
+ }
} else if (prefix == "injectargs") {
vector<string> argsvec;
cmd_getval(cct, cmdmap, "injected_args", argsvec);
for (vector<string>::iterator a = ++argsvec.begin(); a != argsvec.end(); ++a)
args += " " + *a;
r = cct->_conf->injectargs(args, &ss);
+ } else if (prefix == "config set") {
+ std::string key;
+ cmd_getval(cct, cmdmap, "key", key);
+ std::string val;
+ cmd_getval(cct, cmdmap, "value", val);
+ r = cct->_conf->set_val(key, val, true, &ss);
} else if (prefix == "exit") {
// We will send response before executing
ss << "Exiting...";
*run_later = new SuicideLater(this);
- }
- else if (prefix == "respawn") {
+ } else if (prefix == "respawn") {
// We will send response before executing
ss << "Respawning...";
*run_later = new RespawnLater(this);
// see who i am
addr = messenger->get_myaddr();
- dout(10) << "map says i am " << addr << " mds." << whoami << "." << incarnation
+ dout(10) << "map says I am " << addr << " mds." << whoami << "." << incarnation
<< " state " << ceph_mds_state_name(new_state) << dendl;
if (whoami == MDS_RANK_NONE) {
if (mds_rank != NULL) {
+ const auto myid = monc->get_global_id();
// We have entered a rank-holding state, we shouldn't be back
// here!
if (g_conf->mds_enforce_unique_name) {
if (mds_gid_t existing = mdsmap->find_mds_gid_by_name(name)) {
const MDSMap::mds_info_t& i = mdsmap->get_info_gid(existing);
- if (i.global_id > monc->get_global_id()) {
- dout(1) << "handle_mds_map i (" << addr
- << ") dne in the mdsmap, new instance has larger gid " << i.global_id
- << ", suicide" << dendl;
+ if (i.global_id > myid) {
+ dout(1) << "map replaced me with another mds." << whoami
+ << " with gid (" << i.global_id << ") larger than myself ("
+ << myid << "); quitting!" << dendl;
// Call suicide() rather than respawn() because if someone else
// has taken our ID, we don't want to keep restarting and
// fighting them for the ID.
}
}
- dout(1) << "handle_mds_map i (" << addr
- << ") dne in the mdsmap, respawning myself" << dendl;
+ dout(1) << "map removed me (mds." << whoami << " gid:"
+ << myid << ") from cluster due to lost contact; respawning" << dendl;
respawn();
}
// MDSRank not active: process the map here to see if we have
EntityName name;
uint64_t global_id;
- is_valid = authorize_handler->verify_authorizer(
- cct, monc->rotating_secrets.get(),
- authorizer_data, authorizer_reply, name, global_id, caps_info, session_key);
+ RotatingKeyRing *keys = monc->rotating_secrets.get();
+ if (keys) {
+ is_valid = authorize_handler->verify_authorizer(
+ cct, keys,
+ authorizer_data, authorizer_reply, name, global_id, caps_info,
+ session_key);
+ } else {
+ dout(10) << __func__ << " no rotating_keys (yet), denied" << dendl;
+ is_valid = false;
+ }
if (is_valid) {
entity_name_t n(con->get_peer_type(), global_id);
fs_cluster_id_t standby_for_fscid;
bool standby_replay;
std::set<mds_rank_t> export_targets;
- uint64_t mds_features;
+ uint64_t mds_features = 0;
mds_info_t() : global_id(MDS_GID_NONE), rank(MDS_RANK_NONE), inc(0),
state(STATE_STANDBY), state_seq(0),
int64_t get_first_data_pool() const { return *data_pools.begin(); }
int64_t get_metadata_pool() const { return metadata_pool; }
bool is_data_pool(int64_t poolid) const {
- return std::binary_search(data_pools.begin(), data_pools.end(), poolid);
+ auto p = std::find(data_pools.begin(), data_pools.end(), poolid);
+ if (p == data_pools.end())
+ return false;
+ return true;
}
bool pool_in_use(int64_t poolid) const {
}
*/
+ update_mlogger();
+ return true;
+}
+
+void MDSRank::update_mlogger()
+{
if (mlogger) {
mlogger->set(l_mdm_ino, CInode::count());
mlogger->set(l_mdm_dir, CDir::count());
mlogger->set(l_mdm_dn, CDentry::count());
mlogger->set(l_mdm_cap, Capability::count());
-
mlogger->set(l_mdm_inoa, CInode::increments());
mlogger->set(l_mdm_inos, CInode::decrements());
mlogger->set(l_mdm_dira, CDir::increments());
mlogger->set(l_mdm_dns, CDentry::decrements());
mlogger->set(l_mdm_capa, Capability::increments());
mlogger->set(l_mdm_caps, Capability::decrements());
-
mlogger->set(l_mdm_buf, buffer::get_total_alloc());
}
-
- return true;
}
/*
purge_queue.handle_conf_change(conf, changed, *mdsmap);
}
+ void update_mlogger();
protected:
// Flag to indicate we entered shutdown: anyone seeing this to be true
// after taking mds_lock must drop out.
dout(7) << "must have aborted" << dendl;
} else {
assert(it->second.state == EXPORT_DISCOVERING);
- // release locks to avoid deadlock
- MDRequestRef mdr = static_cast<MDRequestImpl*>(it->second.mut.get());
- assert(mdr);
- mds->mdcache->request_finish(mdr);
- it->second.mut.reset();
- // freeze the subtree
- it->second.state = EXPORT_FREEZING;
- dir->auth_unpin(this);
- assert(g_conf->mds_kill_export_at != 3);
+
+ if (m->is_success()) {
+ // release locks to avoid deadlock
+ MDRequestRef mdr = static_cast<MDRequestImpl*>(it->second.mut.get());
+ assert(mdr);
+ mds->mdcache->request_finish(mdr);
+ it->second.mut.reset();
+ // freeze the subtree
+ it->second.state = EXPORT_FREEZING;
+ dir->auth_unpin(this);
+ assert(g_conf->mds_kill_export_at != 3);
+
+ } else {
+ dout(7) << "peer failed to discover (not active?), canceling" << dendl;
+ export_try_cancel(dir, false);
+ }
}
m->put(); // done
assert(it->second.state == EXPORT_PREPPING);
if (!m->is_success()) {
- dout(7) << "peer couldn't acquire all needed locks, canceling" << dendl;
+ dout(7) << "peer couldn't acquire all needed locks or wasn't active, canceling" << dendl;
export_try_cancel(dir, false);
m->put();
return;
// note import state
dirfrag_t df = m->get_dirfrag();
+
+ if (!mds->is_active()) {
+ dout(7) << " not active, send NACK " << dendl;
+ mds->send_message_mds(new MExportDirDiscoverAck(df, m->get_tid(), false), from);
+ m->put();
+ return;
+ }
+
// only start discovering on this message once.
map<dirfrag_t,import_state_t>::iterator it = import_state.find(df);
if (!m->started) {
mds->queue_waiters(finished);
- // open all bounds
- set<CDir*> import_bounds;
- for (map<inodeno_t,fragset_t>::iterator p = import_bound_fragset.begin();
- p != import_bound_fragset.end();
- ++p) {
- CInode *in = cache->get_inode(p->first);
- assert(in);
+ bool success = true;
+ if (mds->is_active()) {
+ // open all bounds
+ set<CDir*> import_bounds;
+ for (map<inodeno_t,fragset_t>::iterator p = import_bound_fragset.begin();
+ p != import_bound_fragset.end();
+ ++p) {
+ CInode *in = cache->get_inode(p->first);
+ assert(in);
- // map fragset into a frag_t list, based on the inode fragtree
- list<frag_t> fglist;
- for (set<frag_t>::iterator q = p->second.begin(); q != p->second.end(); ++q)
- in->dirfragtree.get_leaves_under(*q, fglist);
- dout(10) << " bound inode " << p->first << " fragset " << p->second << " maps to " << fglist << dendl;
-
- for (list<frag_t>::iterator q = fglist.begin();
- q != fglist.end();
- ++q) {
- CDir *bound = cache->get_dirfrag(dirfrag_t(p->first, *q));
- if (!bound) {
- dout(7) << " opening bounding dirfrag " << *q << " on " << *in << dendl;
- cache->open_remote_dirfrag(in, *q,
- new C_MDS_RetryMessage(mds, m));
- return;
- }
+ // map fragset into a frag_t list, based on the inode fragtree
+ list<frag_t> fglist;
+ for (set<frag_t>::iterator q = p->second.begin(); q != p->second.end(); ++q)
+ in->dirfragtree.get_leaves_under(*q, fglist);
+ dout(10) << " bound inode " << p->first << " fragset " << p->second << " maps to " << fglist << dendl;
+
+ for (list<frag_t>::iterator q = fglist.begin();
+ q != fglist.end();
+ ++q) {
+ CDir *bound = cache->get_dirfrag(dirfrag_t(p->first, *q));
+ if (!bound) {
+ dout(7) << " opening bounding dirfrag " << *q << " on " << *in << dendl;
+ cache->open_remote_dirfrag(in, *q,
+ new C_MDS_RetryMessage(mds, m));
+ return;
+ }
- if (!bound->state_test(CDir::STATE_IMPORTBOUND)) {
- dout(7) << " pinning import bound " << *bound << dendl;
- bound->get(CDir::PIN_IMPORTBOUND);
- bound->state_set(CDir::STATE_IMPORTBOUND);
- } else {
- dout(7) << " already pinned import bound " << *bound << dendl;
+ if (!bound->state_test(CDir::STATE_IMPORTBOUND)) {
+ dout(7) << " pinning import bound " << *bound << dendl;
+ bound->get(CDir::PIN_IMPORTBOUND);
+ bound->state_set(CDir::STATE_IMPORTBOUND);
+ } else {
+ dout(7) << " already pinned import bound " << *bound << dendl;
+ }
+ import_bounds.insert(bound);
}
- import_bounds.insert(bound);
}
- }
-
- dout(7) << " all ready, noting auth and freezing import region" << dendl;
- bool success = true;
- if (!mds->mdcache->is_readonly() &&
- dir->get_inode()->filelock.can_wrlock(-1) &&
- dir->get_inode()->nestlock.can_wrlock(-1)) {
- it->second.mut = new MutationImpl();
- // force some locks. hacky.
- mds->locker->wrlock_force(&dir->inode->filelock, it->second.mut);
- mds->locker->wrlock_force(&dir->inode->nestlock, it->second.mut);
-
- // note that i am an ambiguous auth for this subtree.
- // specify bounds, since the exporter explicitly defines the region.
- cache->adjust_bounded_subtree_auth(dir, import_bounds,
- pair<int,int>(oldauth, mds->get_nodeid()));
- cache->verify_subtree_bounds(dir, import_bounds);
- // freeze.
- dir->_freeze_tree();
- // note new state
- it->second.state = IMPORT_PREPPED;
+ dout(7) << " all ready, noting auth and freezing import region" << dendl;
+
+ if (!mds->mdcache->is_readonly() &&
+ dir->get_inode()->filelock.can_wrlock(-1) &&
+ dir->get_inode()->nestlock.can_wrlock(-1)) {
+ it->second.mut = new MutationImpl();
+ // force some locks. hacky.
+ mds->locker->wrlock_force(&dir->inode->filelock, it->second.mut);
+ mds->locker->wrlock_force(&dir->inode->nestlock, it->second.mut);
+
+ // note that i am an ambiguous auth for this subtree.
+ // specify bounds, since the exporter explicitly defines the region.
+ cache->adjust_bounded_subtree_auth(dir, import_bounds,
+ pair<int,int>(oldauth, mds->get_nodeid()));
+ cache->verify_subtree_bounds(dir, import_bounds);
+ // freeze.
+ dir->_freeze_tree();
+ // note new state
+ it->second.state = IMPORT_PREPPED;
+ } else {
+ dout(7) << " couldn't acquire all needed locks, failing. " << *dir << dendl;
+ success = false;
+ }
} else {
- dout(7) << " couldn't acquire all needed locks, failing. " << *dir << dendl;
+ dout(7) << " not active, failing. " << *dir << dendl;
success = false;
- import_reverse_prepping(dir);
}
+ if (!success)
+ import_reverse_prepping(dir);
+
// ok!
dout(7) << " sending export_prep_ack on " << *dir << dendl;
mds->send_message(new MExportDirPrepAck(dir->dirfrag(), success, m->get_tid()), m->get_connection());
"purge_queue", l_pq_first, l_pq_last);
pcb.add_u64(l_pq_executing_ops, "pq_executing_ops", "Purge queue ops in flight");
pcb.add_u64(l_pq_executing, "pq_executing", "Purge queue tasks in flight");
- pcb.add_u64_counter(l_pq_executed, "pq_executed", "Purge queue tasks executed", "purg");
+ pcb.add_u64_counter(l_pq_executed, "pq_executed", "Purge queue tasks executed", "purg",
+ PerfCountersBuilder::PRIO_INTERESTING);
logger.reset(pcb.create_perf_counters());
g_ceph_context->get_perfcounters_collection()->add(logger.get());
timer.init();
}
+/**
+ * Start consuming the purge queue if there is work and none is running.
+ *
+ * Under `lock`: does nothing when the journaler's read and write positions
+ * are equal, or when `in_flight` is non-empty. Otherwise it queues a
+ * callback on `finisher` that takes `lock` itself and calls _consume().
+ */
+void PurgeQueue::activate()
+{
+  Mutex::Locker guard(lock);
+
+  // Nothing to read, or something is already in flight: leave things alone.
+  if (journaler.get_read_pos() == journaler.get_write_pos() ||
+      !in_flight.empty())
+    return;
+
+  dout(4) << "start work (by drain)" << dendl;
+  finisher.queue(new FunctionContext([this](int r) {
+    // The callback acquires the lock on its own before consuming.
+    Mutex::Locker consume_guard(lock);
+    _consume();
+  }));
+}
+
void PurgeQueue::shutdown()
{
Mutex::Locker l(lock);
public:
void init();
+ void activate();
void shutdown();
void create_logger();
{
PerfCountersBuilder plb(g_ceph_context, "mds_server", l_mdss_first, l_mdss_last);
plb.add_u64_counter(l_mdss_handle_client_request,"handle_client_request",
- "Client requests", "hcr");
+ "Client requests", "hcr", PerfCountersBuilder::PRIO_INTERESTING);
plb.add_u64_counter(l_mdss_handle_slave_request, "handle_slave_request",
- "Slave requests", "hsr");
+ "Slave requests", "hsr", PerfCountersBuilder::PRIO_INTERESTING);
plb.add_u64_counter(l_mdss_handle_client_session, "handle_client_session",
- "Client session messages", "hcs");
+ "Client session messages", "hcs", PerfCountersBuilder::PRIO_INTERESTING);
plb.add_u64_counter(l_mdss_dispatch_client_request, "dispatch_client_request", "Client requests dispatched");
plb.add_u64_counter(l_mdss_dispatch_slave_request, "dispatch_server_request", "Server requests dispatched");
plb.add_u64_counter(l_mdss_req_lookuphash, "req_lookuphash",
CDir *dir = traverse_to_auth_dir(mdr, mdr->dn[n], refpath);
if (!dir) return 0;
- dout(10) << "rdlock_path_xlock_dentry dir " << *dir << dendl;
CInode *diri = dir->get_inode();
if (!mdr->reqid.name.is_mds()) {
return;
}
- if (!(req->head.args.open.flags & CEPH_O_EXCL)) {
+ bool excl = req->head.args.open.flags & CEPH_O_EXCL;
+
+ if (!excl) {
int r = mdcache->path_traverse(mdr, NULL, NULL, req->get_filepath(),
&mdr->dn[0], NULL, MDS_TRAVERSE_FORWARD);
if (r > 0) return;
}
return;
}
- // r == -ENOENT
}
- bool excl = (req->head.args.open.flags & CEPH_O_EXCL);
set<SimpleLock*> rdlocks, wrlocks, xlocks;
file_layout_t *dir_layout = NULL;
CDentry *dn = rdlock_path_xlock_dentry(mdr, 0, rdlocks, wrlocks, xlocks,
return;
}
+ // created null dn.
CDir *dir = dn->get_dir();
CInode *diri = dir->get_inode();
rdlocks.insert(&diri->authlock);
return;
}
- // created null dn.
-
// create inode.
SnapRealm *realm = diri->find_snaprealm(); // use directory's realm; inode isn't attached yet.
snapid_t follows = realm->get_newest_seq();
void StrayManager::_enqueue(CDentry *dn, bool trunc)
{
+ assert(started);
+
CInode *in = dn->get_linkage()->get_inode();
if (in->snaprealm &&
!in->snaprealm->have_past_parents_open() &&
return;
}
- if (!started) {
- // If the MDS is not yet active, defer executing this purge
- // in order to avoid the mdlog writes we do on purge completion.
- mds->wait_for_active(
- new MDSInternalContextWrapper(mds,
- new FunctionContext([this, dn, trunc](int r){
- // It is safe to hold on to this CDentry* pointer
- // because the dentry is pinned with PIN_PURGING
- _enqueue(dn, trunc);
- })
- )
- );
-
- return;
- }
-
if (trunc) {
truncate(dn);
} else {
void StrayManager::advance_delayed()
{
+ if (!started)
+ return;
+
for (elist<CDentry*>::iterator p = delayed_eval_stray.begin(); !p.end(); ) {
CDentry *dn = *p;
++p;
return false;
}
+ if (!started)
+ delay = true;
+
if (dn->item_stray.is_on_list()) {
if (delay)
return false;
{
dout(10) << __func__ << dendl;
started = true;
+ purge_queue.activate();
}
bool StrayManager::eval_stray(CDentry *dn, bool delay)
class EFragment : public LogEvent {
public:
EMetaBlob metablob;
- __u8 op;
+ __u8 op{0};
inodeno_t ino;
frag_t basefrag;
- __s32 bits; // positive for split (from basefrag), negative for merge (to basefrag)
+ __s32 bits{0}; // positive for split (from basefrag), negative for merge (to basefrag)
list<frag_t> orig_frags;
bufferlist rollback;
public:
EMetaBlob metablob;
bufferlist client_map; // encoded map<__u32,entity_inst_t>
- version_t cmapv;
+ version_t cmapv{0};
EImportStart(MDLog *log, dirfrag_t di, vector<dirfrag_t>& b, mds_rank_t f) :
LogEvent(EVENT_IMPORTSTART),
typedef compact_map<snapid_t, old_inode_t> old_inodes_t;
string dn; // dentry
snapid_t dnfirst, dnlast;
- version_t dnv;
+ version_t dnv{0};
inode_t inode; // if it's not
fragtree_t dirfragtree;
map<string,bufferptr> xattrs;
string symlink;
snapid_t oldest_snap;
bufferlist snapbl;
- __u8 state;
+ __u8 state{0};
old_inodes_t old_inodes;
fullbit(const fullbit& o);
protected:
entity_inst_t client_inst;
bool open; // open or close
- version_t cmapv; // client map version
+ version_t cmapv{0}; // client map version
interval_set<inodeno_t> inos;
- version_t inotablev;
+ version_t inotablev{0};
// Client metadata stored during open
std::map<std::string, std::string> client_metadata;
#define CEPH_MMGRBEACON_H
#include "messages/PaxosServiceMessage.h"
+#include "mon/MonCommand.h"
#include "include/types.h"
class MMgrBeacon : public PaxosServiceMessage {
- static const int HEAD_VERSION = 3;
+ static const int HEAD_VERSION = 5;
static const int COMPAT_VERSION = 1;
protected:
std::string name;
uuid_d fsid;
std::set<std::string> available_modules;
+ map<string,string> metadata; ///< misc metadata about this mgr
+
+ // Only populated during activation
+ std::vector<MonCommand> command_descs;
public:
MMgrBeacon()
MMgrBeacon(const uuid_d& fsid_, uint64_t gid_, const std::string &name_,
entity_addr_t server_addr_, bool available_,
- const std::set<std::string>& module_list)
+ const std::set<std::string>& module_list,
+ map<string,string>&& metadata)
: PaxosServiceMessage(MSG_MGR_BEACON, 0, HEAD_VERSION, COMPAT_VERSION),
gid(gid_), server_addr(server_addr_), available(available_), name(name_),
- fsid(fsid_), available_modules(module_list)
+ fsid(fsid_), available_modules(module_list), metadata(std::move(metadata))
{
}
const std::string& get_name() const { return name; }
const uuid_d& get_fsid() const { return fsid; }
std::set<std::string>& get_available_modules() { return available_modules; }
+ const std::map<std::string,std::string>& get_metadata() const {
+ return metadata;
+ }
+
+ void set_command_descs(const std::vector<MonCommand> &cmds)
+ {
+ command_descs = cmds;
+ }
+
+ const std::vector<MonCommand> &get_command_descs()
+ {
+ return command_descs;
+ }
private:
~MMgrBeacon() override {}
::encode(name, payload);
::encode(fsid, payload);
::encode(available_modules, payload);
+ ::encode(command_descs, payload);
+ ::encode(metadata, payload);
}
void decode_payload() override {
bufferlist::iterator p = payload.begin();
if (header.version >= 3) {
::decode(available_modules, p);
}
+ if (header.version >= 4) {
+ ::decode(command_descs, p);
+ }
+ if (header.version >= 5) {
+ ::decode(metadata, p);
+ }
}
};
--- /dev/null
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2017 OVH
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+
+
+#ifndef CEPH_MOSDFORCERECOVERY_H
+#define CEPH_MOSDFORCERECOVERY_H
+
+#include "msg/Message.h"
+
+/*
+ * instruct an OSD to boost/unboost recovery/backfill priority of some or all pg(s)
+ */
+
+// boost priority of recovery
+static const int OFR_RECOVERY = 1;
+
+// boost priority of backfill
+static const int OFR_BACKFILL = 2;
+
+// cancel priority boost, requeue if necessary
+static const int OFR_CANCEL = 4;
+
+/**
+ * Message carrying a recovery/backfill priority request for an OSD.
+ *
+ * `fsid` identifies the cluster, `forced_pgs` lists the placement groups the
+ * request applies to, and `options` is a bitmask of the OFR_* flags defined
+ * above. print() renders an empty `forced_pgs` as "osd".
+ *
+ * The payload is encoded as fsid, forced_pgs, options, in that order.
+ */
+struct MOSDForceRecovery : public Message {
+
+  static const int HEAD_VERSION = 1;
+  static const int COMPAT_VERSION = 1;
+
+  uuid_d fsid;
+  vector<pg_t> forced_pgs;
+  // Zero-initialized so that a default-constructed message never hands an
+  // indeterminate flag byte to print() or encode_payload().
+  uint8_t options = 0;
+
+  /// Empty message, normally filled in afterwards by decode_payload().
+  MOSDForceRecovery() : Message(MSG_OSD_FORCE_RECOVERY, HEAD_VERSION, COMPAT_VERSION) {}
+
+  /// Request with no explicit PG list.
+  MOSDForceRecovery(const uuid_d& f, char opts) :
+    Message(MSG_OSD_FORCE_RECOVERY, HEAD_VERSION, COMPAT_VERSION),
+    fsid(f), options(opts) {}
+
+  /// Request for the given PGs; `pgs` is copied, so it is taken by const
+  /// reference (callers passing a non-const lvalue still compile).
+  MOSDForceRecovery(const uuid_d& f, const vector<pg_t>& pgs, char opts) :
+    Message(MSG_OSD_FORCE_RECOVERY, HEAD_VERSION, COMPAT_VERSION),
+    fsid(f), forced_pgs(pgs), options(opts) {}
+private:
+  ~MOSDForceRecovery() override {}
+
+public:
+  const char *get_type_name() const override { return "force_recovery"; }
+
+  /// Writes "force_recovery(<pgs or 'osd'> [recovery] [backfill] [cancel])".
+  void print(ostream& out) const override {
+    out << "force_recovery(";
+    if (forced_pgs.empty())
+      out << "osd";
+    else
+      out << forced_pgs;
+    if (options & OFR_RECOVERY)
+      out << " recovery";
+    if (options & OFR_BACKFILL)
+      out << " backfill";
+    if (options & OFR_CANCEL)
+      out << " cancel";
+    out << ")";
+  }
+
+  /// Appends fsid, forced_pgs and options to the payload; `features` is unused.
+  void encode_payload(uint64_t features) override {
+    ::encode(fsid, payload);
+    ::encode(forced_pgs, payload);
+    ::encode(options, payload);
+  }
+
+  /// Reads the fields back in the order encode_payload() wrote them.
+  void decode_payload() override {
+    bufferlist::iterator p = payload.begin();
+    ::decode(fsid, p);
+    ::decode(forced_pgs, p);
+    ::decode(options, p);
+  }
+};
+
+#endif /* CEPH_MOSDFORCERECOVERY_H */
}
void encode_payload(uint64_t features) override {
- if (!HAVE_FEATURE(features, SERVER_LUMINOUS)) {
+ if (HAVE_FEATURE(features, SERVER_LUMINOUS)) {
+ header.version = HEAD_VERSION;
+ } else {
header.version = 4;
// for kraken+jewel only
::encode(missing, payload);
::encode(query_epoch, payload);
if (HAVE_FEATURE(features, SERVER_LUMINOUS)) {
+ header.version = HEAD_VERSION;
::encode(past_intervals, payload);
} else {
header.version = 4;
const char *get_type_name() const override { return "PGnot"; }
void encode_payload(uint64_t features) override {
- if (!HAVE_FEATURE(features, SERVER_LUMINOUS)) {
+ if (HAVE_FEATURE(features, SERVER_LUMINOUS)) {
+ header.version = HEAD_VERSION;
+ } else {
// for jewel+kraken compat only
header.version = 5;
}
void encode_payload(uint64_t features) override {
- if (!HAVE_FEATURE(features, SERVER_LUMINOUS)) {
+ if (HAVE_FEATURE(features, SERVER_LUMINOUS)) {
+ header.version = HEAD_VERSION;
+ } else {
// for kraken/jewel only
header.version = 3;
::encode(epoch, payload);
--- /dev/null
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#ifndef CEPH_MOSDPGRECOVERYDELETE_H
+#define CEPH_MOSDPGRECOVERYDELETE_H
+
+#include "MOSDFastDispatchOp.h"
+
+/*
+ * instruct non-primary to remove some objects during recovery
+ */
+
+// Message carrying a list of (object, version) pairs to be removed on the
+// target shard during recovery.
+//
+// Wire format (see encode_payload/decode_payload): from, pgid, map_epoch,
+// min_epoch, cost, objects -- in that order.
+struct MOSDPGRecoveryDelete : public MOSDFastDispatchOp {
+
+  static const int HEAD_VERSION = 1;
+  static const int COMPAT_VERSION = 1;
+
+  pg_shard_t from;
+  spg_t pgid;            ///< target spg_t
+  epoch_t map_epoch, min_epoch;
+  list<pair<hobject_t, eversion_t> > objects;    ///< objects to remove
+
+private:
+  uint64_t cost;
+
+public:
+  // NOTE: the stored cost is 64-bit but this accessor returns int, so the
+  // value is narrowed on return.
+  int get_cost() const override {
+    return cost;
+  }
+
+  epoch_t get_map_epoch() const override {
+    return map_epoch;
+  }
+  epoch_t get_min_epoch() const override {
+    return min_epoch;
+  }
+  spg_t get_spg() const override {
+    return pgid;
+  }
+
+  void set_cost(uint64_t c) {
+    cost = c;
+  }
+
+  // Default constructor. The epochs are zero-initialized so that a message
+  // that has not been decoded yet never exposes indeterminate values
+  // through get_map_epoch()/get_min_epoch(); this matches
+  // MOSDPGRecoveryDeleteReply.
+  MOSDPGRecoveryDelete()
+    : MOSDFastDispatchOp(MSG_OSD_PG_RECOVERY_DELETE, HEAD_VERSION,
+			 COMPAT_VERSION),
+      map_epoch(0),
+      min_epoch(0),
+      cost(0) {}
+
+  // Construct a message addressed to `pgid`, sent by `from`; the object
+  // list starts empty and the cost starts at 0.
+  MOSDPGRecoveryDelete(pg_shard_t from, spg_t pgid, epoch_t map_epoch,
+		       epoch_t min_epoch)
+    : MOSDFastDispatchOp(MSG_OSD_PG_RECOVERY_DELETE, HEAD_VERSION,
+			 COMPAT_VERSION),
+      from(from),
+      pgid(pgid),
+      map_epoch(map_epoch),
+      min_epoch(min_epoch),
+      cost(0) {}
+
+private:
+  ~MOSDPGRecoveryDelete() {}
+
+public:
+  // The four virtuals below are marked `override` like the accessors above
+  // and like the same methods in MOSDPGRecoveryDeleteReply, so a signature
+  // drift in the base class becomes a compile error instead of silently
+  // declaring a new, unrelated function.
+  const char *get_type_name() const override { return "recovery_delete"; }
+  void print(ostream& out) const override {
+    out << "MOSDPGRecoveryDelete(" << pgid << " e" << map_epoch << ","
+	<< min_epoch << " " << objects << ")";
+  }
+
+  // `features` is not consulted; the field order must stay in sync with
+  // decode_payload().
+  void encode_payload(uint64_t features) override {
+    ::encode(from, payload);
+    ::encode(pgid, payload);
+    ::encode(map_epoch, payload);
+    ::encode(min_epoch, payload);
+    ::encode(cost, payload);
+    ::encode(objects, payload);
+  }
+  void decode_payload() override {
+    bufferlist::iterator p = payload.begin();
+    ::decode(from, p);
+    ::decode(pgid, p);
+    ::decode(map_epoch, p);
+    ::decode(min_epoch, p);
+    ::decode(cost, p);
+    ::decode(objects, p);
+  }
+};
+
+
+
+#endif
--- /dev/null
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#ifndef MOSDRECOVERYDELETEREPLY_H
+#define MOSDRECOVERYDELETEREPLY_H
+
+#include "MOSDFastDispatchOp.h"
+
+// Reply message for MSG_OSD_PG_RECOVERY_DELETE_REPLY, carrying a list of
+// (object, version) pairs.
+//
+// Wire format: pgid.pgid, map_epoch, min_epoch, objects, pgid.shard, from.
+// Note that the two halves of `pgid` are encoded separately, with the shard
+// written after the object list.
+struct MOSDPGRecoveryDeleteReply : public MOSDFastDispatchOp {
+  static const int HEAD_VERSION = 1;
+  static const int COMPAT_VERSION = 1;
+
+  pg_shard_t from;
+  spg_t pgid;
+  epoch_t map_epoch, min_epoch;
+  list<pair<hobject_t, eversion_t> > objects;
+
+  // Both epochs start at zero; the remaining members are
+  // default-constructed.
+  MOSDPGRecoveryDeleteReply()
+    : MOSDFastDispatchOp(MSG_OSD_PG_RECOVERY_DELETE_REPLY,
+			 HEAD_VERSION,
+			 COMPAT_VERSION),
+      map_epoch(0),
+      min_epoch(0)
+  {}
+
+  // --- accessors ---
+  spg_t get_spg() const override {
+    return pgid;
+  }
+  epoch_t get_map_epoch() const override {
+    return map_epoch;
+  }
+  epoch_t get_min_epoch() const override {
+    return min_epoch;
+  }
+
+  // --- (de)serialization; the two field orders must stay identical ---
+  void encode_payload(uint64_t features) override {
+    ::encode(pgid.pgid, payload);
+    ::encode(map_epoch, payload);
+    ::encode(min_epoch, payload);
+    ::encode(objects, payload);
+    ::encode(pgid.shard, payload);
+    ::encode(from, payload);
+  }
+
+  void decode_payload() override {
+    bufferlist::iterator cursor = payload.begin();
+    ::decode(pgid.pgid, cursor);
+    ::decode(map_epoch, cursor);
+    ::decode(min_epoch, cursor);
+    ::decode(objects, cursor);
+    ::decode(pgid.shard, cursor);
+    ::decode(from, cursor);
+  }
+
+  // --- diagnostics ---
+  const char *get_type_name() const override { return "recovery_delete_reply"; }
+
+  void print(ostream& out) const override {
+    out << "MOSDPGRecoveryDeleteReply(" << pgid;
+    out << " e" << map_epoch << "," << min_epoch;
+    out << " " << objects << ")";
+  }
+};
+
+#endif
const char *get_type_name() const override { return "PGrm"; }
void encode_payload(uint64_t features) override {
- if (!HAVE_FEATURE(features, SERVER_LUMINOUS)) {
+ if (HAVE_FEATURE(features, SERVER_LUMINOUS)) {
+ header.version = HEAD_VERSION;
+ } else {
// for jewel+kraken
header.version = 2;
::encode(epoch, payload);
::encode(map_epoch, payload);
// with luminous, we drop peer_as_of_epoch and peer_stat
- if (!HAVE_FEATURE(features, SERVER_LUMINOUS)) {
+ if (HAVE_FEATURE(features, SERVER_LUMINOUS)) {
+ header.version = HEAD_VERSION;
+ ::encode(op, payload);
+ } else {
epoch_t dummy_epoch = {};
osd_peer_stat_t dummy_stat = {};
header.version = 3;
::encode(dummy_epoch, payload);
::encode(op, payload);
::encode(dummy_stat, payload);
- } else {
- ::encode(op, payload);
}
::encode(stamp, payload);
size_t s = 0;
void encode_payload(uint64_t features) override {
::encode(map_epoch, payload);
if (HAVE_FEATURE(features, SERVER_LUMINOUS)) {
+ header.version = HEAD_VERSION;
::encode(min_epoch, payload);
encode_trace(payload, features);
} else {
void encode_payload(uint64_t features) override {
::encode(map_epoch, payload);
if (HAVE_FEATURE(features, SERVER_LUMINOUS)) {
+ header.version = HEAD_VERSION;
::encode(min_epoch, payload);
encode_trace(payload, features);
} else {
#include "auth/RotatingKeyRing.h"
#include "json_spirit/json_spirit_writer.h"
+#include "mgr/mgr_commands.h"
+#include "mon/MonCommand.h"
+
#include "messages/MMgrOpen.h"
#include "messages/MMgrConfigure.h"
#include "messages/MMonMgrReport.h"
#include "messages/MCommandReply.h"
#include "messages/MPGStats.h"
#include "messages/MOSDScrub.h"
+#include "messages/MOSDForceRecovery.h"
#include "common/errno.h"
#define dout_context g_ceph_context
#undef dout_prefix
#define dout_prefix *_dout << "mgr.server " << __func__ << " "
+
+
DaemonServer::DaemonServer(MonClient *monc_,
Finisher &finisher_,
DaemonStateIndex &daemon_state_,
s->inst.addr = con->get_peer_addr();
AuthCapsInfo caps_info;
- is_valid = handler->verify_authorizer(
- cct, monc->rotating_secrets.get(),
- authorizer_data,
- authorizer_reply, s->entity_name,
- s->global_id, caps_info,
- session_key);
+ RotatingKeyRing *keys = monc->rotating_secrets.get();
+ if (keys) {
+ is_valid = handler->verify_authorizer(
+ cct, keys,
+ authorizer_data,
+ authorizer_reply, s->entity_name,
+ s->global_id, caps_info,
+ session_key);
+ } else {
+ dout(10) << __func__ << " no rotating_keys (yet), denied" << dendl;
+ is_valid = false;
+ }
if (is_valid) {
if (caps_info.allow_all) {
configure->stats_period = g_conf->mgr_stats_period;
m->get_connection()->send_message(configure);
+ DaemonStatePtr daemon;
if (daemon_state.exists(key)) {
+ daemon = daemon_state.get(key);
+ }
+ if (daemon) {
dout(20) << "updating existing DaemonState for " << m->daemon_name << dendl;
+ Mutex::Locker l(daemon->lock);
daemon_state.get(key)->perf_counters.clear();
}
if (m->service_daemon) {
- DaemonStatePtr daemon;
- if (daemon_state.exists(key)) {
- daemon = daemon_state.get(key);
- } else {
+ if (!daemon) {
dout(4) << "constructing new DaemonState for " << key << dendl;
daemon = std::make_shared<DaemonState>(daemon_state.types);
daemon->key = key;
}
daemon_state.insert(daemon);
}
+ Mutex::Locker l(daemon->lock);
daemon->service_daemon = true;
daemon->metadata = m->daemon_metadata;
daemon->service_status = m->daemon_status;
}
assert(daemon != nullptr);
auto &daemon_counters = daemon->perf_counters;
- daemon_counters.update(m);
+ {
+ Mutex::Locker l(daemon->lock);
+ daemon_counters.update(m);
+ }
+ // if there are any schema updates, notify the python modules
+ if (!m->declare_types.empty() || !m->undeclare_types.empty()) {
+ ostringstream oss;
+ oss << key.first << '.' << key.second;
+ py_modules.notify_all("perf_schema_update", oss.str());
+ }
if (daemon->service_daemon) {
utime_t now = ceph_clock_now();
return true;
}
-struct MgrCommand {
- string cmdstring;
- string helpstring;
- string module;
- string perm;
- string availability;
-
- bool requires_perm(char p) const {
- return (perm.find(p) != string::npos);
- }
-
-} mgr_commands[] = {
-
-#define COMMAND(parsesig, helptext, module, perm, availability) \
- {parsesig, helptext, module, perm, availability},
-#include "MgrCommands.h"
-#undef COMMAND
-};
void DaemonServer::_generate_command_map(
map<string,cmd_vartype>& cmdmap,
}
}
-const MgrCommand *DaemonServer::_get_mgrcommand(
+const MonCommand *DaemonServer::_get_mgrcommand(
const string &cmd_prefix,
- MgrCommand *cmds,
- int cmds_size)
+ const std::vector<MonCommand> &cmds)
{
- MgrCommand *this_cmd = NULL;
- for (MgrCommand *cp = cmds;
- cp < &cmds[cmds_size]; cp++) {
- if (cp->cmdstring.compare(0, cmd_prefix.size(), cmd_prefix) == 0) {
- this_cmd = cp;
+ const MonCommand *this_cmd = nullptr;
+ for (const auto &cmd : cmds) {
+ if (cmd.cmdstring.compare(0, cmd_prefix.size(), cmd_prefix) == 0) {
+ this_cmd = &cmd;
break;
}
}
const string &prefix,
const map<string,cmd_vartype>& cmdmap,
const map<string,string>& param_str_map,
- const MgrCommand *this_cmd) {
+ const MonCommand *this_cmd) {
if (s->entity_name.is_mon()) {
// mon is all-powerful. even when it is forwarding commands on behalf of
dout(4) << "prefix=" << prefix << dendl;
if (prefix == "get_command_descriptions") {
- int cmdnum = 0;
-
dout(10) << "reading commands from python modules" << dendl;
- auto py_commands = py_modules.get_commands();
+ const auto py_commands = py_modules.get_commands();
+ int cmdnum = 0;
JSONFormatter f;
f.open_object_section("command_descriptions");
- for (const auto &pyc : py_commands) {
+
+ auto dump_cmd = [&cmdnum, &f](const MonCommand &mc){
ostringstream secname;
secname << "cmd" << setfill('0') << std::setw(3) << cmdnum;
- dout(20) << "Dumping " << pyc.cmdstring << " (" << pyc.helpstring
- << ")" << dendl;
- dump_cmddesc_to_json(&f, secname.str(), pyc.cmdstring, pyc.helpstring,
- "mgr", pyc.perm, "cli", 0);
+ dump_cmddesc_to_json(&f, secname.str(), mc.cmdstring, mc.helpstring,
+ mc.module, mc.req_perms, mc.availability, 0);
cmdnum++;
+ };
+
+ for (const auto &pyc : py_commands) {
+ dump_cmd(pyc);
}
- for (const auto &cp : mgr_commands) {
- ostringstream secname;
- secname << "cmd" << setfill('0') << std::setw(3) << cmdnum;
- dump_cmddesc_to_json(&f, secname.str(), cp.cmdstring, cp.helpstring,
- cp.module, cp.perm, cp.availability, 0);
- cmdnum++;
+ for (const auto &mgr_cmd : mgr_commands) {
+ dump_cmd(mgr_cmd);
}
+
f.close_section(); // command_descriptions
f.flush(cmdctx->odata);
cmdctx->reply(0, ss);
}
// lookup command
- const MgrCommand *mgr_cmd = _get_mgrcommand(prefix, mgr_commands,
- ARRAY_SIZE(mgr_commands));
+ const MonCommand *mgr_cmd = _get_mgrcommand(prefix, mgr_commands);
_generate_command_map(cmdctx->cmdmap, param_str_map);
if (!mgr_cmd) {
- MgrCommand py_command = {"", "", "py", "rw", "cli"};
+ MonCommand py_command = {"", "", "py", "rw", "cli"};
if (!_allowed_command(session.get(), py_command.module, prefix, cmdctx->cmdmap,
param_str_map, &py_command)) {
dout(1) << " access denied" << dendl;
DaemonKey key(p.first, q.first);
assert(daemon_state.exists(key));
auto daemon = daemon_state.get(key);
+ Mutex::Locker l(daemon->lock);
f->dump_stream("status_stamp") << daemon->service_status_stamp;
f->dump_stream("last_beacon") << daemon->last_service_beacon;
f->open_object_section("status");
});
cmdctx->reply(r, "");
return true;
+ } else if (prefix == "pg force-recovery" ||
+ prefix == "pg force-backfill" ||
+ prefix == "pg cancel-force-recovery" ||
+ prefix == "pg cancel-force-backfill") {
+ string forceop = prefix.substr(3, string::npos);
+ list<pg_t> parsed_pgs;
+ map<int, list<pg_t> > osdpgs;
+
+ // figure out actual op just once
+ int actual_op = 0;
+ if (forceop == "force-recovery") {
+ actual_op = OFR_RECOVERY;
+ } else if (forceop == "force-backfill") {
+ actual_op = OFR_BACKFILL;
+ } else if (forceop == "cancel-force-backfill") {
+ actual_op = OFR_BACKFILL | OFR_CANCEL;
+ } else if (forceop == "cancel-force-recovery") {
+ actual_op = OFR_RECOVERY | OFR_CANCEL;
+ }
+
+ // convert pg names to pgs, discard any invalid ones while at it
+ {
+ // we don't want to keep pgidstr and pgidstr_nodup forever
+ vector<string> pgidstr;
+ // get pgids to process and prune duplicates
+ cmd_getval(g_ceph_context, cmdctx->cmdmap, "pgid", pgidstr);
+ set<string> pgidstr_nodup(pgidstr.begin(), pgidstr.end());
+ if (pgidstr.size() != pgidstr_nodup.size()) {
+ // move elements only when there were duplicates, as this
+ // reorders them
+ pgidstr.resize(pgidstr_nodup.size());
+ auto it = pgidstr_nodup.begin();
+ for (size_t i = 0 ; i < pgidstr_nodup.size(); i++) {
+ pgidstr[i] = std::move(*it++);
+ }
+ }
+
+ cluster_state.with_pgmap([&](const PGMap& pg_map) {
+ for (auto& pstr : pgidstr) {
+ pg_t parsed_pg;
+ if (!parsed_pg.parse(pstr.c_str())) {
+ ss << "invalid pgid '" << pstr << "'; ";
+ r = -EINVAL;
+ } else {
+ auto workit = pg_map.pg_stat.find(parsed_pg);
+ if (workit == pg_map.pg_stat.end()) {
+ ss << "pg " << pstr << " not exists; ";
+ r = -ENOENT;
+ } else {
+ pg_stat_t workpg = workit->second;
+
+ // discard pgs for which user requests are pointless
+ switch (actual_op)
+ {
+ case OFR_RECOVERY:
+ if ((workpg.state & (PG_STATE_DEGRADED | PG_STATE_RECOVERY_WAIT | PG_STATE_RECOVERING)) == 0) {
+ // don't return error, user script may be racing with cluster. not fatal.
+ ss << "pg " << pstr << " doesn't require recovery; ";
+ continue;
+ } else if (workpg.state & PG_STATE_FORCED_RECOVERY) {
+ ss << "pg " << pstr << " recovery already forced; ";
+ // return error, as it may be a bug in user script
+ r = -EINVAL;
+ continue;
+ }
+ break;
+ case OFR_BACKFILL:
+ if ((workpg.state & (PG_STATE_DEGRADED | PG_STATE_BACKFILL_WAIT | PG_STATE_BACKFILL)) == 0) {
+ ss << "pg " << pstr << " doesn't require backfilling; ";
+ continue;
+ } else if (workpg.state & PG_STATE_FORCED_BACKFILL) {
+ ss << "pg " << pstr << " backfill already forced; ";
+ r = -EINVAL;
+ continue;
+ }
+ break;
+ case OFR_BACKFILL | OFR_CANCEL:
+ if ((workpg.state & PG_STATE_FORCED_BACKFILL) == 0) {
+ ss << "pg " << pstr << " backfill not forced; ";
+ continue;
+ }
+ break;
+ case OFR_RECOVERY | OFR_CANCEL:
+ if ((workpg.state & PG_STATE_FORCED_RECOVERY) == 0) {
+ ss << "pg " << pstr << " recovery not forced; ";
+ continue;
+ }
+ break;
+ default:
+ assert(0 == "actual_op value is not supported");
+ }
+
+ parsed_pgs.push_back(std::move(parsed_pg));
+ }
+ }
+ }
+
+ // group pgs to process by osd
+ for (auto& pgid : parsed_pgs) {
+ auto workit = pg_map.pg_stat.find(pgid);
+ if (workit != pg_map.pg_stat.end()) {
+ pg_stat_t workpg = workit->second;
+ set<int32_t> osds(workpg.up.begin(), workpg.up.end());
+ osds.insert(workpg.acting.begin(), workpg.acting.end());
+ for (auto i : osds) {
+ osdpgs[i].push_back(pgid);
+ }
+ }
+ }
+
+ });
+ }
+
+ // respond with error only when no pgs are correct
+ // yes, in case of mixed errors, only the last one will be emitted,
+ // but the message presented will be fine
+ if (parsed_pgs.size() != 0) {
+ // clear error to not confuse users/scripts
+ r = 0;
+ }
+
+ // optimize the command -> messages conversion, use only one message per distinct OSD
+ cluster_state.with_osdmap([&](const OSDMap& osdmap) {
+ for (auto& i : osdpgs) {
+ if (osdmap.is_up(i.first)) {
+ vector<pg_t> pgvec(make_move_iterator(i.second.begin()), make_move_iterator(i.second.end()));
+ auto p = osd_cons.find(i.first);
+ if (p == osd_cons.end()) {
+ ss << "osd." << i.first << " is not currently connected";
+ r = -EAGAIN;
+ continue;
+ }
+ for (auto& con : p->second) {
+ con->send_message(new MOSDForceRecovery(monc->get_fsid(), pgvec, actual_op));
+ }
+ ss << "instructing pg(s) " << i.second << " on osd." << i.first << " to " << forceop << "; ";
+ }
+ }
+ });
+ ss << std::endl;
+ cmdctx->reply(r, ss);
+ return true;
} else {
r = cluster_state.with_pgmap([&](const PGMap& pg_map) {
return cluster_state.with_osdmap([&](const OSDMap& osdmap) {
// None of the special native commands,
MgrPyModule *handler = nullptr;
- auto py_commands = py_modules.get_commands();
+ auto py_commands = py_modules.get_py_commands();
for (const auto &pyc : py_commands) {
auto pyc_prefix = cmddesc_get_prefix(pyc.cmdstring);
dout(1) << "pyc_prefix: '" << pyc_prefix << "'" << dendl;
continue;
}
auto daemon = daemon_state.get(key);
+ Mutex::Locker l(daemon->lock);
if (daemon->last_service_beacon == utime_t()) {
// we must have just restarted; assume they are alive now.
daemon->last_service_beacon = ceph_clock_now();
}
auto m = new MMonMgrReport();
+ py_modules.get_health_checks(&m->health_checks);
+
cluster_state.with_pgmap([&](const PGMap& pg_map) {
cluster_state.update_delta_stats();
pg_map.get_health_checks(g_ceph_context, osdmap,
&m->health_checks);
+
dout(10) << m->health_checks.checks.size() << " health checks"
<< dendl;
dout(20) << "health checks:\n";
class MMgrOpen;
class MMonMgrReport;
class MCommand;
-struct MgrCommand;
+struct MonCommand;
/**
static void _generate_command_map(map<string,cmd_vartype>& cmdmap,
map<string,string> ¶m_str_map);
- static const MgrCommand *_get_mgrcommand(const string &cmd_prefix,
- MgrCommand *cmds, int cmds_size);
+ static const MonCommand *_get_mgrcommand(const string &cmd_prefix,
+ const std::vector<MonCommand> &commands);
bool _allowed_command(
MgrSession *s, const string &module, const string &prefix,
const map<string,cmd_vartype>& cmdmap,
const map<string,string>& param_str_map,
- const MgrCommand *this_cmd);
+ const MonCommand *this_cmd);
private:
friend class ReplyOnFinish;
return result;
}
-DaemonStateCollection DaemonStateIndex::get_by_server(const std::string &hostname) const
+DaemonStateCollection DaemonStateIndex::get_by_server(
+ const std::string &hostname) const
{
Mutex::Locker l(lock);
class DaemonState
{
public:
+ Mutex lock = {"DaemonState::lock"};
+
DaemonKey key;
// The hostname where daemon was last seen running (extracted
#include "global/signal_handler.h"
#include "mgr/MgrContext.h"
+#include "mgr/mgr_commands.h"
#include "MgrPyModule.h"
#include "DaemonServer.h"
DaemonStatePtr state;
if (daemon_state.exists(key)) {
state = daemon_state.get(key);
- // TODO lock state
+ Mutex::Locker l(state->lock);
daemon_meta.erase("name");
daemon_meta.erase("hostname");
state->metadata.clear();
dout(10) << "listing keys" << dendl;
JSONCommand cmd;
- cmd.run(monc, "{\"prefix\": \"config-key list\"}");
+ cmd.run(monc, "{\"prefix\": \"config-key ls\"}");
lock.Unlock();
cmd.wait();
lock.Lock();
if (daemon_state.exists(k)) {
auto metadata = daemon_state.get(k);
+ Mutex::Locker l(metadata->lock);
auto addr_iter = metadata->metadata.find("front_addr");
if (addr_iter != metadata->metadata.end()) {
const std::string &metadata_addr = addr_iter->second;
bool update = false;
if (daemon_state.exists(k)) {
auto metadata = daemon_state.get(k);
+ Mutex::Locker l(metadata->lock);
if (metadata->metadata.empty() ||
metadata->metadata.count("addr") == 0) {
update = true;
dout(10) << dendl;
server.send_report();
}
+
+// Return the full command list for this mgr: the entries of mgr_commands
+// first, followed by whatever py_modules.get_commands() reports.
+// The whole operation runs under Mgr::lock.
+std::vector<MonCommand> Mgr::get_command_set() const
+{
+  Mutex::Locker l(lock);
+
+  std::vector<MonCommand> all_cmds = mgr_commands;
+  const std::vector<MonCommand> module_cmds = py_modules.get_commands();
+  for (const auto &cmd : module_cmds) {
+    all_cmds.push_back(cmd);
+  }
+  return all_cmds;
+}
+
Client *client;
Messenger *client_messenger;
- Mutex lock;
+ mutable Mutex lock;
SafeTimer timer;
Finisher finisher;
void background_init(Context *completion);
void shutdown();
+
+ std::vector<MonCommand> get_command_set() const;
};
#endif
// Don't send an open if we're just a client (i.e. doing
// command-sending, not stats etc)
- if ((g_conf && !g_conf->name.is_client()) ||
- service_daemon) {
+ if (!cct->_conf->name.is_client() || service_daemon) {
_send_open();
}
open->service_name = service_name;
open->daemon_name = daemon_name;
} else {
- open->daemon_name = g_conf->name.get_id();
+ open->daemon_name = cct->_conf->name.get_id();
}
if (service_daemon) {
open->service_daemon = service_daemon;
if (daemon_name.size()) {
report->daemon_name = daemon_name;
} else {
- report->daemon_name = g_conf->name.get_id();
+ report->daemon_name = cct->_conf->name.get_id();
}
report->service_name = service_name;
daemon_dirty_status = true;
// late register?
- if (g_conf->name.is_client() && session && session->con) {
+ if (cct->_conf->name.is_client() && session && session->con) {
_send_open();
}
COMMAND("pg ls-by-pool " \
"name=poolstr,type=CephString " \
- "name=states,type=CephChoices,strings=active|clean|down|scrubbing|degraded|inconsistent|peering|repair|recovering|backfill_wait|incomplete|stale|remapped|deep_scrub|backfill|backfill_toofull|recovery_wait|undersized|activating|peered,n=N,req=false ", \
+ "name=states,type=CephString,n=N,req=false", \
"list pg with pool = [poolname]", "pg", "r", "cli,rest")
COMMAND("pg ls-by-primary " \
"name=osd,type=CephOsdName " \
"name=pool,type=CephInt,req=false " \
- "name=states,type=CephChoices,strings=active|clean|down|scrubbing|degraded|inconsistent|peering|repair|recovering|backfill_wait|incomplete|stale|remapped|deep_scrub|backfill|backfill_toofull|recovery_wait|undersized|activating|peered,n=N,req=false ", \
+ "name=states,type=CephString,n=N,req=false", \
"list pg with primary = [osd]", "pg", "r", "cli,rest")
COMMAND("pg ls-by-osd " \
"name=osd,type=CephOsdName " \
"name=pool,type=CephInt,req=false " \
- "name=states,type=CephChoices,strings=active|clean|down|scrubbing|degraded|inconsistent|peering|repair|recovering|backfill_wait|incomplete|stale|remapped|deep_scrub|backfill|backfill_toofull|recovery_wait|undersized|activating|peered,n=N,req=false ", \
+ "name=states,type=CephString,n=N,req=false", \
"list pg on osd [osd]", "pg", "r", "cli,rest")
COMMAND("pg ls " \
"name=pool,type=CephInt,req=false " \
- "name=states,type=CephChoices,strings=active|clean|down|scrubbing|degraded|inconsistent|peering|repair|recovering|backfill_wait|incomplete|stale|remapped|deep_scrub|backfill|backfill_toofull|recovery_wait|undersized|activating|peered,n=N,req=false ", \
+ "name=states,type=CephString,n=N,req=false", \
"list pg with specific pool, osd, state", "pg", "r", "cli,rest")
COMMAND("pg dump_stuck " \
"name=stuckops,type=CephChoices,strings=inactive|unclean|stale|undersized|degraded,n=N,req=false " \
COMMAND("pg repair name=pgid,type=CephPgid", "start repair on <pgid>", \
"pg", "rw", "cli,rest")
+COMMAND("pg force-recovery name=pgid,type=CephPgid,n=N", "force recovery of <pgid> first", \
+ "pg", "rw", "cli,rest")
+COMMAND("pg force-backfill name=pgid,type=CephPgid,n=N", "force backfill of <pgid> first", \
+ "pg", "rw", "cli,rest")
+COMMAND("pg cancel-force-recovery name=pgid,type=CephPgid,n=N", "restore normal recovery priority of <pgid>", \
+ "pg", "rw", "cli,rest")
+COMMAND("pg cancel-force-backfill name=pgid,type=CephPgid,n=N", "restore normal backfill priority of <pgid>", \
+ "pg", "rw", "cli,rest")
+
// stuff in osd namespace
COMMAND("osd perf", \
"print dump of OSD perf summary stats", \
"name=max_osds,type=CephInt,req=false " \
"name=no_increasing,type=CephChoices,strings=--no-increasing,req=false",\
"dry run of reweight OSDs by utilization [overload-percentage-for-consideration, default 120]", \
- "osd", "rw", "cli,rest")
+ "osd", "r", "cli,rest")
COMMAND("osd reweight-by-pg " \
"name=oload,type=CephInt,req=false " \
"name=max_change,type=CephFloat,req=false " \
"name=max_osds,type=CephInt,req=false " \
"name=pools,type=CephPoolname,n=N,req=false", \
"dry run of reweight OSDs by PG distribution [overload-percentage-for-consideration, default 120]", \
- "osd", "rw", "cli,rest")
+ "osd", "r", "cli,rest")
COMMAND("osd scrub " \
"name=who,type=CephString", \
return r;
}
+// Fold this module's stored health_checks into *checks by calling
+// checks->merge(). `checks` is dereferenced unconditionally, so it must not
+// be null.
+void MgrPyModule::get_health_checks(health_check_map_t *checks)
+{
+ checks->merge(health_checks);
+}
#include "common/cmdparse.h"
#include "common/LogEntry.h"
+#include "common/Mutex.h"
+#include "mon/health_check.h"
#include <vector>
#include <string>
PyThreadState *pMainThreadState;
PyThreadState *pMyThreadState = nullptr;
+ health_check_map_t health_checks;
+
std::vector<ModuleCommand> commands;
int load_commands();
const cmdmap_t &cmdmap,
std::stringstream *ds,
std::stringstream *ss);
+
+ // Replace this module's stored health checks with `c`, which is moved
+ // from (the caller's map should be treated as consumed).
+ void set_health_checks(health_check_map_t&& c) {
+ health_checks = std::move(c);
+ }
+ void get_health_checks(health_check_map_t *checks);
};
std::string handle_pyerror();
timer(g_ceph_context, lock),
active_mgr(nullptr),
orig_argc(argc),
- orig_argv(argv)
+ orig_argv(argv),
+ available_in_map(false)
{
}
set<string> modules;
PyModules::list_modules(&modules);
+
+ // Whether I think I am available (request MgrMonitor to set me
+ // as available in the map)
bool available = active_mgr != nullptr && active_mgr->is_initialized();
+
auto addr = available ? active_mgr->get_server_addr() : entity_addr_t();
dout(10) << "sending beacon as gid " << monc.get_global_id()
<< " modules " << modules << dendl;
+ map<string,string> metadata;
+ collect_sys_info(&metadata, g_ceph_context);
+
MMgrBeacon *m = new MMgrBeacon(monc.get_fsid(),
monc.get_global_id(),
g_conf->name.get_id(),
addr,
available,
- modules);
+ modules,
+ std::move(metadata));
+
+ if (available && !available_in_map) {
+ // We are informing the mon that we are done initializing: inform
+ // it of our command set. This has to happen after init() because
+ // it needs the python modules to have loaded.
+ m->set_command_descs(active_mgr->get_command_set());
+ dout(4) << "going active, including " << m->get_command_descs().size()
+ << " commands in beacon" << dendl;
+ }
+
monc.send_mon_message(m);
}
void MgrStandby::handle_mgr_map(MMgrMap* mmap)
{
- auto map = mmap->get_map();
+ auto &map = mmap->get_map();
dout(4) << "received map epoch " << map.get_epoch() << dendl;
const bool active_in_map = map.active_gid == monc.get_global_id();
dout(4) << "active in map: " << active_in_map
respawn();
}
}
+
+ if (!available_in_map && map.get_available()) {
+ dout(4) << "Map now says I am available" << dendl;
+ available_in_map = true;
+ }
} else {
if (active_mgr != nullptr) {
derr << "I was active but no longer am" << dendl;
void _update_log_config();
void send_beacon();
+ bool available_in_map;
+
public:
MgrStandby(int argc, const char **argv);
~MgrStandby() override;
std::string ceph_version;
for (const auto &i : dmc) {
+ Mutex::Locker l(i.second->lock);
const auto &key = i.first;
const std::string &str_type = key.first;
const std::string &svc_name = key.second;
const std::string &svc_id)
{
auto metadata = daemon_state.get(DaemonKey(svc_name, svc_id));
+ Mutex::Locker l(metadata->lock);
PyFormatter f;
f.dump_string("hostname", metadata->hostname);
for (const auto &i : metadata->metadata) {
const std::string &svc_id)
{
auto metadata = daemon_state.get(DaemonKey(svc_name, svc_id));
+ Mutex::Locker l(metadata->lock);
PyFormatter f;
for (const auto &i : metadata->service_status) {
f.dump_string(i.first.c_str(), i.second);
PyFormatter f;
auto dmc = daemon_state.get_by_service("osd");
for (const auto &i : dmc) {
+ Mutex::Locker l(i.second->lock);
f.open_object_section(i.first.second.c_str());
f.dump_string("hostname", i.second->hostname);
for (const auto &j : i.second->metadata) {
JSONFormatter jf;
jf.open_object_section("cmd");
- jf.dump_string("prefix", "config-key put");
+ jf.dump_string("prefix", "config-key set");
jf.dump_string("key", global_key);
jf.dump_string("val", val);
jf.close_section();
set_cmd.wait();
if (set_cmd.r != 0) {
- // config-key put will fail if mgr's auth key has insufficient
+ // config-key set will fail if mgr's auth key has insufficient
// permission to set config keys
// FIXME: should this somehow raise an exception back into Python land?
- dout(0) << "`config-key put " << global_key << " " << val << "` failed: "
+ dout(0) << "`config-key set " << global_key << " " << val << "` failed: "
<< cpp_strerror(set_cmd.r) << dendl;
dout(0) << "mon returned " << set_cmd.r << ": " << set_cmd.outs << dendl;
}
}
-std::vector<ModuleCommand> PyModules::get_commands()
+std::vector<ModuleCommand> PyModules::get_py_commands() const
{
Mutex::Locker l(lock);
std::vector<ModuleCommand> result;
- for (auto& i : modules) {
+ for (const auto& i : modules) {
auto module = i.second.get();
auto mod_commands = module->get_commands();
for (auto j : mod_commands) {
return result;
}
+// Convert every python-module command (as returned by get_py_commands())
+// into a MonCommand. The command and help strings and the permission are
+// taken from the module command; module is fixed to "mgr", availability to
+// "cli", and the flags to MonCommand::FLAG_MGR.
+std::vector<MonCommand> PyModules::get_commands() const
+{
+  const std::vector<ModuleCommand> module_cmds = get_py_commands();
+
+  std::vector<MonCommand> converted;
+  for (const auto &mc : module_cmds) {
+    converted.push_back(MonCommand{mc.cmdstring, mc.helpstring, "mgr",
+                                   mc.perm, "cli", MonCommand::FLAG_MGR});
+  }
+  return converted;
+}
+
void PyModules::insert_config(const std::map<std::string,
std::string> &new_config)
{
auto metadata = daemon_state.get(DaemonKey(svc_name, svc_id));
- // FIXME: this is unsafe, I need to either be inside DaemonStateIndex's
- // lock or put a lock on individual DaemonStates
+ Mutex::Locker l2(metadata->lock);
if (metadata) {
if (metadata->perf_counters.instances.count(path)) {
auto counter_instance = metadata->perf_counters.instances.at(path);
return f.get();
}
+// Build a Python object describing the perf-counter schema of one or more
+// daemons.
+//
+// Daemon selection:
+//   - svc_type == ""           : every daemon (daemon_state.get_all())
+//   - svc_id empty             : every daemon of service svc_type
+//   - both given               : the single daemon (svc_type, svc_id), if it
+//                                exists
+//
+// The result is a "perf_schema" section containing one "<type>.<id>"
+// section per daemon, each holding one section per declared counter with
+// "description", optional "nick" and "type". When no daemon matches, the
+// "perf_schema" section is empty and a message is logged at level 4.
+//
+// `handle` is not used by this function.
+PyObject* PyModules::get_perf_schema_python(
+    const std::string &handle,
+    const std::string svc_type,
+    const std::string &svc_id)
+{
+  // Release the GIL while waiting for our own lock, then take it back
+  // before touching any Python state.
+  PyThreadState *tstate = PyEval_SaveThread();
+  Mutex::Locker l(lock);
+  PyEval_RestoreThread(tstate);
+
+  DaemonStateCollection states;
+
+  if (svc_type == "") {
+    states = daemon_state.get_all();
+  } else if (svc_id.empty()) {
+    states = daemon_state.get_by_service(svc_type);
+  } else {
+    auto key = DaemonKey(svc_type, svc_id);
+    // so that the below can be a loop in all cases
+    if (daemon_state.exists(key)) {
+      states[key] = daemon_state.get(key);
+    }
+  }
+
+  PyFormatter f;
+  f.open_object_section("perf_schema");
+
+  if (!states.empty()) {
+    // Iterate by reference: copying each (key, shared pointer) pair buys
+    // nothing here.
+    for (const auto &statepair : states) {
+      const auto &key = statepair.first;
+      const auto &state = statepair.second;
+      if (!state) {
+        // Never dereference an empty entry (e.g. the daemon disappeared
+        // between exists() and get() above).
+        continue;
+      }
+
+      // Each DaemonState is protected by its own lock while it is read.
+      Mutex::Locker daemon_lock(state->lock);
+
+      std::ostringstream daemon_name;
+      daemon_name << key.first << "." << key.second;
+      f.open_object_section(daemon_name.str().c_str());
+
+      for (const auto &typestr : state->perf_counters.declared_types) {
+        f.open_object_section(typestr.c_str());
+        // Bind by reference instead of copying the whole type record.
+        const auto &type = state->perf_counters.types[typestr];
+        f.dump_string("description", type.description);
+        if (!type.nick.empty()) {
+          f.dump_string("nick", type.nick);
+        }
+        f.dump_unsigned("type", type.type);
+        f.close_section();
+      }
+      f.close_section();
+    }
+  } else {
+    dout(4) << __func__ << ": No daemon state found for "
+            << svc_type << "." << svc_id << dendl;
+  }
+  f.close_section();
+  return f.get();
+}
+
PyObject *PyModules::get_context()
{
PyThreadState *tstate = PyEval_SaveThread();
{
_list_modules(g_conf->mgr_module_path, modules);
}
+
+// Hand `checks` over to the module registered under `handle`.
+// Runs under PyModules::lock. If no module with that handle is loaded the
+// call is a no-op and `checks` is left untouched.
+void PyModules::set_health_checks(const std::string& handle,
+                                  health_check_map_t&& checks)
+{
+  Mutex::Locker l(lock);
+
+  auto found = modules.find(handle);
+  if (found == modules.end()) {
+    return;
+  }
+  found->second->set_health_checks(std::move(checks));
+}
+
+// Ask every loaded module, in turn, to add its health checks to *checks.
+// Runs under PyModules::lock.
+void PyModules::get_health_checks(health_check_map_t *checks)
+{
+  Mutex::Locker l(lock);
+
+  for (auto it = modules.begin(); it != modules.end(); ++it) {
+    it->second->get_health_checks(checks);
+  }
+}
#include "client/Client.h"
#include "common/LogClient.h"
#include "mon/MgrMap.h"
+#include "mon/MonCommand.h"
#include "DaemonState.h"
#include "ClusterState.h"
class ServeThread;
+class health_check_map_t;
class PyModules
{
Client &client;
Finisher &finisher;
- mutable Mutex lock{"PyModules"};
+ mutable Mutex lock{"PyModules::lock"};
std::string get_site_packages();
const std::string &svc_name,
const std::string &svc_id,
const std::string &path);
+ PyObject *get_perf_schema_python(
+ const std::string &handle,
+ const std::string svc_type,
+ const std::string &svc_id);
PyObject *get_context();
std::map<std::string, std::string> config_cache;
- std::vector<ModuleCommand> get_commands();
+ // Python command definitions, including callback
+ std::vector<ModuleCommand> get_py_commands() const;
+
+ // Monitor command definitions, suitable for CLI
+ std::vector<MonCommand> get_commands() const;
void insert_config(const std::map<std::string, std::string> &new_config);
void set_config(const std::string &handle,
const std::string &key, const std::string &val);
+ void set_health_checks(const std::string& handle,
+ health_check_map_t&& checks);
+ void get_health_checks(health_check_map_t *checks);
+
void log(const std::string &handle,
int level, const std::string &record);
Py_RETURN_NONE;
}
+// Python binding: set_health_checks(handle, checks).
+//
+// `checks` must be a dict mapping a check name (string) to a dict which may
+// contain the keys:
+//   "severity": string; "warning" -> HEALTH_WARN, "error" -> HEALTH_ERR,
+//               any other string leaves the default HEALTH_OK
+//   "summary":  string
+//   "detail":   list of strings
+// Malformed entries are logged with derr and skipped instead of failing the
+// whole call.  The resulting map is dumped to the log at level 10 and then
+// handed to global_handle->set_health_checks().
+//
+// Returns None on success (also when `checks` is not a dict, which is only
+// logged).  Returns NULL with a Python exception set when the arguments
+// cannot be parsed or when PyDict_Items() fails.
+static PyObject*
+ceph_set_health_checks(PyObject *self, PyObject *args)
+{
+  char *handle = nullptr;
+  PyObject *checks = NULL;
+  if (!PyArg_ParseTuple(args, "sO:ceph_set_health_checks", &handle, &checks)) {
+    return NULL;
+  }
+  if (!PyDict_Check(checks)) {
+    derr << __func__ << " arg not a dict" << dendl;
+    Py_RETURN_NONE;
+  }
+  // PyDict_Items() hands back a NEW reference; it is released once the
+  // outer loop is done with it.
+  PyObject *checksls = PyDict_Items(checks);
+  if (!checksls) {
+    return NULL;  // exception already set by PyDict_Items()
+  }
+  health_check_map_t out_checks;
+  for (int i = 0; i < PyList_Size(checksls); ++i) {
+    PyObject *kv = PyList_GET_ITEM(checksls, i);
+    char *check_name = nullptr;
+    PyObject *check_info = nullptr;
+    if (!PyArg_ParseTuple(kv, "sO:pair", &check_name, &check_info)) {
+      // This entry is skipped on purpose, so drop the exception raised by
+      // PyArg_ParseTuple(); otherwise we would return None with an error
+      // still pending.
+      PyErr_Clear();
+      derr << __func__ << " dict item " << i
+           << " not a size 2 tuple" << dendl;
+      continue;
+    }
+    if (!PyDict_Check(check_info)) {
+      derr << __func__ << " item " << i << " " << check_name
+           << " value not a dict" << dendl;
+      continue;
+    }
+    health_status_t severity = HEALTH_OK;
+    string summary;
+    list<string> detail;
+    // Another new reference, released right after the inner loop.
+    PyObject *infols = PyDict_Items(check_info);
+    if (!infols) {
+      Py_DECREF(checksls);
+      return NULL;  // exception already set by PyDict_Items()
+    }
+    for (int j = 0; j < PyList_Size(infols); ++j) {
+      PyObject *pair = PyList_GET_ITEM(infols, j);
+      if (!PyTuple_Check(pair)) {
+        derr << __func__ << " item " << i << " pair " << j
+             << " not a tuple" << dendl;
+        continue;
+      }
+      char *k = nullptr;
+      PyObject *v = nullptr;
+      if (!PyArg_ParseTuple(pair, "sO:pair", &k, &v)) {
+        PyErr_Clear();  // entry skipped, see above
+        derr << __func__ << " item " << i << " pair " << j
+             << " not a size 2 tuple" << dendl;
+        continue;
+      }
+      string ks(k);
+      if (ks == "severity") {
+        if (!PyString_Check(v)) {
+          derr << __func__ << " check " << check_name
+               << " severity value not string" << dendl;
+          continue;
+        }
+        string vs(PyString_AsString(v));
+        if (vs == "warning") {
+          severity = HEALTH_WARN;
+        } else if (vs == "error") {
+          severity = HEALTH_ERR;
+        }
+      } else if (ks == "summary") {
+        if (!PyString_Check(v)) {
+          derr << __func__ << " check " << check_name
+               << " summary value not string" << dendl;
+          continue;
+        }
+        summary = PyString_AsString(v);
+      } else if (ks == "detail") {
+        if (!PyList_Check(v)) {
+          derr << __func__ << " check " << check_name
+               << " detail value not list" << dendl;
+          continue;
+        }
+        // Index named `n` so it no longer shadows the key pointer `k`.
+        for (int n = 0; n < PyList_Size(v); ++n) {
+          PyObject *di = PyList_GET_ITEM(v, n);
+          if (!PyString_Check(di)) {
+            derr << __func__ << " check " << check_name
+                 << " detail item " << n << " not a string" << dendl;
+            continue;
+          }
+          detail.push_back(PyString_AsString(di));
+        }
+      } else {
+        derr << __func__ << " check " << check_name
+             << " unexpected key " << k << dendl;
+      }
+    }
+    // All strings borrowed from infols have been copied by now.
+    Py_DECREF(infols);
+    auto& d = out_checks.add(check_name, severity, summary);
+    d.detail.swap(detail);
+  }
+  // check_name pointed into objects kept alive by checksls; no longer needed.
+  Py_DECREF(checksls);
+
+  JSONFormatter jf(true);
+  dout(10) << "module " << handle << " health checks:\n";
+  out_checks.dump(&jf);
+  jf.flush(*_dout);
+  *_dout << dendl;
+
+  global_handle->set_health_checks(handle, std::move(out_checks));
+
+  Py_RETURN_NONE;
+}
+
static PyObject*
ceph_state_get(PyObject *self, PyObject *args)
dout(10) << "ceph_config_get " << what << " found: " << value.c_str() << dendl;
return PyString_FromString(value.c_str());
} else {
- derr << "ceph_config_get " << what << " not found " << dendl;
+ dout(4) << "ceph_config_get " << what << " not found " << dendl;
Py_RETURN_NONE;
}
}
handle, svc_name, svc_id, counter_path);
}
+// Python binding: get_perf_schema(handle, svc_type, svc_id).
+// Unpacks three string arguments and forwards them to
+// global_handle->get_perf_schema_python(), returning whatever it returns.
+// Yields nullptr when the arguments do not parse (PyArg_ParseTuple has then
+// set the Python error).
+static PyObject*
+get_perf_schema(PyObject *self, PyObject *args)
+{
+  char *module_handle = nullptr;
+  char *daemon_type = nullptr;
+  char *daemon_id = nullptr;
+  const int parsed_ok = PyArg_ParseTuple(args, "sss:get_perf_schema",
+                                         &module_handle, &daemon_type,
+                                         &daemon_id);
+  if (parsed_ok == 0) {
+    return nullptr;
+  }
+
+  return global_handle->get_perf_schema_python(module_handle, daemon_type,
+                                               daemon_id);
+}
+
PyMethodDef CephStateMethods[] = {
{"get", ceph_state_get, METH_VARARGS,
"Get a cluster object"},
"Get a service's status"},
{"send_command", ceph_send_command, METH_VARARGS,
"Send a mon command"},
+ {"set_health_checks", ceph_set_health_checks, METH_VARARGS,
+ "Set health checks for this module"},
{"get_mgr_id", ceph_get_mgr_id, METH_NOARGS,
"Get the mgr id"},
{"get_config", ceph_config_get, METH_VARARGS,
"Set a configuration value"},
{"get_counter", get_counter, METH_VARARGS,
"Get a performance counter"},
+ {"get_perf_schema", get_perf_schema, METH_VARARGS,
+ "Get the performance counter schema"},
{"log", ceph_log, METH_VARARGS,
"Emit a (local) log message"},
{"get_version", ceph_get_version, METH_VARARGS,
return "no daemons active";
}
std::ostringstream ss;
- ss << daemons.size() << (daemons.size() > 1 ? "daemonss" : "daemon")
+ ss << daemons.size() << (daemons.size() > 1 ? " daemons" : " daemon")
<< " active";
return ss.str();
}
+
+  // Tally the entries of `daemons` by the value stored under metadata key
+  // `field`.  Entries without that key are tallied under "unknown".
+  // Counts are added into *out; existing contents are kept.
+  void count_metadata(const string& field,
+                      std::map<std::string,int> *out) const {
+    for (auto& daemon : daemons) {
+      auto& md = daemon.second.metadata;
+      auto found = md.find(field);
+      if (found != md.end()) {
+        ++(*out)[found->second];
+      } else {
+        ++(*out)["unknown"];
+      }
+    }
+  }
+
};
epoch_t epoch = 0;
--- /dev/null
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#include "mgr_commands.h"
+
+/* The set of statically defined (C++-handled) commands. This
+ * does not include the Python-defined commands, which are loaded
+ * in PyModules.
+ *
+ * The table is built with an X-macro: COMMAND is defined just for the
+ * duration of the #include below, so that every COMMAND(...) line in
+ * MgrCommands.h expands to one brace-initialized MonCommand element
+ * (followed by a comma). The first five initializers are the macro
+ * arguments in order; the trailing 0 is a sixth initializer
+ * (presumably the MonCommand flags field -- confirm against
+ * mon/MonCommand.h). COMMAND is #undef'd again right afterwards so the
+ * macro does not leak into the rest of the translation unit. */
+const std::vector<MonCommand> mgr_commands = {
+#define COMMAND(parsesig, helptext, module, perm, availability) \
+ {parsesig, helptext, module, perm, availability, 0},
+#include "MgrCommands.h"
+#undef COMMAND
+};
--- /dev/null
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#pragma once
+
+#include "mon/MonCommand.h"
+#include <vector>
+
+extern const std::vector<MonCommand> mgr_commands;
auth.key.encode_plaintext(rdata);
}
r = 0;
- } else if (prefix == "auth list") {
+ } else if (prefix == "auth list" ||
+ prefix == "auth ls") {
if (f) {
mon->key_server.encode_formatted("auth", f.get(), rdata);
} else {
assert(r);
::encode("allow profile bootstrap-mgr", auth_inc.auth.caps["mon"]);
auth_inc.op = KeyServerData::AUTH_INC_ADD;
+ // generate key
+ auth_inc.auth.key.create(g_ceph_context, CEPH_CRYPTO_AES);
push_cephx_inc(auth_inc);
}
changed = true;
DataHealthService.cc
PGMonitor.cc
PGMap.cc
- ConfigKeyService.cc)
+ ConfigKeyService.cc
+ ../mgr/mgr_commands.cc)
add_library(mon STATIC
${lib_mon_srcs}
$<TARGET_OBJECTS:kv_objs>
}
ss << "obtained '" << key << "'";
- } else if (prefix == "config-key put") {
+ } else if (prefix == "config-key put" ||
+ prefix == "config-key set") {
if (!mon->is_leader()) {
mon->forward_request_leader(op);
// we forward the message; so return now.
ret = -ENOENT;
}
- } else if (prefix == "config-key list") {
+ } else if (prefix == "config-key list" ||
+ prefix == "config-key ls") {
stringstream tmp_ss;
store_list(tmp_ss);
rdata.append(tmp_ss);
class FsNewHandler : public FileSystemCommandHandler
{
public:
- FsNewHandler()
- : FileSystemCommandHandler("fs new")
+ FsNewHandler(Paxos *paxos)
+ : FileSystemCommandHandler("fs new"), m_paxos(paxos)
{
}
+ bool batched_propose() override {
+ return true;
+ }
+
int handle(
Monitor *mon,
FSMap &fsmap,
map<string, cmd_vartype> &cmdmap,
std::stringstream &ss) override
{
+ assert(m_paxos->is_plugged());
+
string metadata_name;
cmd_getval(g_ceph_context, cmdmap, "metadata", metadata_name);
int64_t metadata = mon->osdmon()->osdmap.lookup_pg_pool_name(metadata_name);
return -ENOENT;
}
- string force;
- cmd_getval(g_ceph_context,cmdmap, "force", force);
+ string force_str;
+ cmd_getval(g_ceph_context,cmdmap, "force", force_str);
+ bool force = (force_str == "--force");
const pool_stat_t *stat = mon->pgservice->get_pool_stat(metadata);
if (stat) {
int64_t metadata_num_objects = stat->stats.sum.num_objects;
- if (force != "--force" && metadata_num_objects > 0) {
+ if (!force && metadata_num_objects > 0) {
ss << "pool '" << metadata_name
<< "' already contains some objects. Use an empty pool instead.";
return -EINVAL;
ss << "pool '" << data_name << "' has id 0, which CephFS does not allow. Use another pool or recreate it to get a non-zero pool id.";
return -EINVAL;
}
-
+
string fs_name;
cmd_getval(g_ceph_context, cmdmap, "fs_name", fs_name);
if (fs_name.empty()) {
// we believe we must. bailing out *after* we request the proposal is
// bad business as we could have changed the osdmon's state and ending up
// returning an error to the user.
- int r = _check_pool(mon->osdmon()->osdmap, data, false, &ss);
+ int r = _check_pool(mon->osdmon()->osdmap, data, false, force, &ss);
if (r < 0) {
return r;
}
- r = _check_pool(mon->osdmon()->osdmap, metadata, true, &ss);
+ r = _check_pool(mon->osdmon()->osdmap, metadata, true, force, &ss);
if (r < 0) {
return r;
}
+ mon->osdmon()->do_application_enable(data,
+ pg_pool_t::APPLICATION_NAME_CEPHFS);
+ mon->osdmon()->do_application_enable(metadata,
+ pg_pool_t::APPLICATION_NAME_CEPHFS);
+
// All checks passed, go ahead and create.
fsmap.create_filesystem(fs_name, metadata, data,
mon->get_quorum_con_features());
ss << "new fs with metadata pool " << metadata << " and data pool " << data;
return 0;
}
+
+private:
+ Paxos *m_paxos;
};
class SetHandler : public FileSystemCommandHandler
class AddDataPoolHandler : public FileSystemCommandHandler
{
public:
- AddDataPoolHandler()
- : FileSystemCommandHandler("fs add_data_pool")
+ AddDataPoolHandler(Paxos *paxos)
+ : FileSystemCommandHandler("fs add_data_pool"), m_paxos(paxos)
{}
+ bool batched_propose() override {
+ return true;
+ }
+
int handle(
Monitor *mon,
FSMap &fsmap,
map<string, cmd_vartype> &cmdmap,
std::stringstream &ss) override
{
+ assert(m_paxos->is_plugged());
+
string poolname;
cmd_getval(g_ceph_context, cmdmap, "pool", poolname);
}
}
- int r = _check_pool(mon->osdmon()->osdmap, poolid, false, &ss);
+ int r = _check_pool(mon->osdmon()->osdmap, poolid, false, false, &ss);
if (r != 0) {
return r;
}
return 0;
}
+ mon->osdmon()->do_application_enable(poolid,
+ pg_pool_t::APPLICATION_NAME_CEPHFS);
+
fsmap.modify_filesystem(
fs->fscid,
[poolid](std::shared_ptr<Filesystem> fs)
return 0;
}
+
+private:
+ Paxos *m_paxos;
};
class SetDefaultHandler : public FileSystemCommandHandler
std::string legacy_prefix;
public:
- LegacyHandler(const std::string &new_prefix)
- : T()
+ template <typename... Args>
+ LegacyHandler(const std::string &new_prefix, Args&&... args)
+ : T(std::forward<Args>(args)...)
{
legacy_prefix = new_prefix;
}
};
-std::list<std::shared_ptr<FileSystemCommandHandler> > FileSystemCommandHandler::load()
+std::list<std::shared_ptr<FileSystemCommandHandler> >
+FileSystemCommandHandler::load(Paxos *paxos)
{
std::list<std::shared_ptr<FileSystemCommandHandler> > handlers;
handlers.push_back(std::make_shared<SetHandler>());
handlers.push_back(std::make_shared<LegacyHandler<SetHandler> >("mds set"));
handlers.push_back(std::make_shared<FlagSetHandler>());
- handlers.push_back(std::make_shared<AddDataPoolHandler>());
+ handlers.push_back(std::make_shared<AddDataPoolHandler>(paxos));
handlers.push_back(std::make_shared<LegacyHandler<AddDataPoolHandler> >(
- "mds add_data_pool"));
+ "mds add_data_pool", paxos));
handlers.push_back(std::make_shared<RemoveDataPoolHandler>());
handlers.push_back(std::make_shared<LegacyHandler<RemoveDataPoolHandler> >(
"mds remove_data_pool"));
handlers.push_back(std::make_shared<LegacyHandler<RemoveDataPoolHandler> >(
"mds rm_data_pool"));
- handlers.push_back(std::make_shared<FsNewHandler>());
+ handlers.push_back(std::make_shared<FsNewHandler>(paxos));
handlers.push_back(std::make_shared<RemoveFilesystemHandler>());
handlers.push_back(std::make_shared<ResetFilesystemHandler>());
OSDMap &osd_map,
const int64_t pool_id,
bool metadata,
+ bool force,
std::stringstream *ss) const
{
assert(ss != NULL);
return -EINVAL;
}
+ if (!force && !pool->application_metadata.empty() &&
+ pool->application_metadata.count(
+ pg_pool_t::APPLICATION_NAME_CEPHFS) == 0) {
+ *ss << " pool '" << pool_name << "' (id '" << pool_id
+ << "') has a non-CephFS application enabled.";
+ return -EINVAL;
+ }
+
// Nothing special about this pool, so it is permissible
return 0;
}
OSDMap &osd_map,
const int64_t pool_id,
bool metadata,
+ bool force,
std::stringstream *ss) const;
virtual std::string const &get_prefix() {return prefix;}
return get_prefix() == prefix_;
}
- static std::list<std::shared_ptr<FileSystemCommandHandler> > load();
+ static std::list<std::shared_ptr<FileSystemCommandHandler> > load(Paxos *paxos);
+
+ virtual bool batched_propose() {
+ return false;
+ }
virtual int handle(
Monitor *mon,
}
pending_health.merge(p.second);
}
- for (auto p : pending_health.checks) {
+ for (auto &p : pending_health.checks) {
p.second.summary = boost::regex_replace(
p.second.summary,
- boost::regex("%num%"), stringify(names[p.first].size()));
+ boost::regex("%hasorhave%"),
+ names[p.first].size() > 1 ? "have" : "has");
p.second.summary = boost::regex_replace(
p.second.summary,
boost::regex("%names%"), stringify(names[p.first]));
bool HealthMonitor::preprocess_query(MonOpRequestRef op)
{
- switch (op->get_req()->get_type()) {
+ return false;
+}
+
+bool HealthMonitor::prepare_update(MonOpRequestRef op)
+{
+ Message *m = op->get_req();
+ dout(7) << "prepare_update " << *m
+ << " from " << m->get_orig_source_inst() << dendl;
+ switch (m->get_type()) {
case MSG_MON_HEALTH:
{
MMonHealth *hm = static_cast<MMonHealth*>(op->get_req());
}
return services[service_type]->service_dispatch(op);
}
-
case MSG_MON_HEALTH_CHECKS:
- return preprocess_health_checks(op);
+ return prepare_health_checks(op);
+ default:
+ return false;
}
- return false;
-}
-
-bool HealthMonitor::prepare_update(MonOpRequestRef op)
-{
- return false;
}
-bool HealthMonitor::preprocess_health_checks(MonOpRequestRef op)
+bool HealthMonitor::prepare_health_checks(MonOpRequestRef op)
{
MMonHealthChecks *m = static_cast<MMonHealthChecks*>(op->get_req());
- quorum_checks[m->get_source().num()] = m->health_checks;
+ // no need to check if it's changed, the peon has done so
+ quorum_checks[m->get_source().num()] = std::move(m->health_checks);
return true;
}
if (check_member_health()) {
changed = true;
}
- if (mon->is_leader()) {
- if (check_leader_health()) {
- changed = true;
- }
+ if (!mon->is_leader()) {
+ return;
+ }
+ if (check_leader_health()) {
+ changed = true;
}
if (changed) {
propose_pending();
} else if (stats.fs_stats.avail_percent <= g_conf->mon_data_avail_warn) {
stringstream ss, ss2;
ss << "mon%plurals% %names% %isorare% low on available space";
- auto& d = next.add("MON_DISK_LOW", HEALTH_ERR, ss.str());
+ auto& d = next.add("MON_DISK_LOW", HEALTH_WARN, ss.str());
ss2 << "mon." << mon->name << " has " << stats.fs_stats.avail_percent
<< "% avail";
d.detail.push_back(ss2.str());
d.detail.push_back(ss2.str());
}
- auto p = quorum_checks.find(mon->rank);
- if (p == quorum_checks.end() ||
- p->second != next) {
- if (mon->is_leader()) {
- // prepare to propose
- quorum_checks[mon->rank] = next;
- changed = true;
- } else {
- // tell the leader
- mon->messenger->send_message(new MMonHealthChecks(next),
- mon->monmap->get_inst(mon->get_leader()));
- }
- }
-
// OSD_NO_DOWN_OUT_INTERVAL
{
// Warn if 'mon_osd_down_out_interval' is set to zero.
if (g_conf->mon_warn_on_osd_down_out_interval_zero &&
g_conf->mon_osd_down_out_interval == 0) {
ostringstream ss, ds;
- ss << "mon%plurals% %names %hasorhave% mon_osd_down_out_interval set to 0";
+ ss << "mon%plurals% %names% %hasorhave% mon_osd_down_out_interval set to 0";
auto& d = next.add("OSD_NO_DOWN_OUT_INTERVAL", HEALTH_WARN, ss.str());
ds << "mon." << mon->name << " has mon_osd_down_out_interval set to 0";
d.detail.push_back(ds.str());
}
}
+ auto p = quorum_checks.find(mon->rank);
+ if (p == quorum_checks.end()) {
+ if (next.empty()) {
+ return false;
+ }
+ } else {
+ if (p->second == next) {
+ return false;
+ }
+ }
+
+ if (mon->is_leader()) {
+ // prepare to propose
+ quorum_checks[mon->rank] = next;
+ changed = true;
+ } else {
+ // tell the leader
+ mon->messenger->send_message(new MMonHealthChecks(next),
+ mon->monmap->get_inst(mon->get_leader()));
+ }
+
return changed;
}
if (!warns.empty())
ss << ",";
}
- auto& d = next.add("MON_CLOCK_SKEW", HEALTH_WARN,
- "monitor clock skew detected");
+ auto& d = next.add("MON_CLOCK_SKEW", HEALTH_WARN, ss.str());
d.detail.swap(details);
}
}
bool preprocess_query(MonOpRequestRef op) override;
bool prepare_update(MonOpRequestRef op) override;
- bool preprocess_health_checks(MonOpRequestRef op);
bool prepare_health_checks(MonOpRequestRef op);
bool check_leader_health();
return entry.prio >= level && (entry.channel == channel || channel == "*");
};
- auto p = summary.tail.end();
- while (num > 0 && p != summary.tail.begin()) {
- if (match(*p)) {
+ auto rp = summary.tail.rbegin();
+ while (num > 0 && rp != summary.tail.rend()) {
+ if (match(*rp)) {
num--;
}
- --p;
+ ++rp;
}
ostringstream ss;
+ auto p = summary.tail.begin();
for ( ; p != summary.tail.end(); ++p) {
if (!match(*p)) {
continue;
if (leaderinfo && (leaderinfo->rank >= 0)) {
auto fscid = pending_fsmap.mds_roles.at(leaderinfo->global_id);
auto fs = pending_fsmap.get_filesystem(fscid);
- bool followable = fs->mds_map.is_followable(leaderinfo->rank);
- pending_fsmap.modify_daemon(gid, [fscid, leaderinfo, followable](
+ pending_fsmap.modify_daemon(gid, [fscid, leaderinfo](
MDSMap::mds_info_t *info) {
info->standby_for_rank = leaderinfo->rank;
info->standby_for_fscid = fscid;
} else {
mdsmap->print(ds);
r = 0;
- }
- if (r == 0) {
- rdata.append(ds);
- ss << "dumped fsmap epoch " << p->get_epoch();
}
+
+ rdata.append(ds);
+ ss << "dumped fsmap epoch " << p->get_epoch();
+
if (p != &fsmap) {
delete p;
}
} else {
p->print(ds);
r = 0;
- }
- if (r == 0) {
- rdata.append(ds);
- ss << "dumped fsmap epoch " << p->get_epoch();
}
+
+ rdata.append(ds);
+ ss << "dumped fsmap epoch " << p->get_epoch();
+
if (p != &fsmap)
delete p;
}
derr << "Unexpected error reading metadata: " << cpp_strerror(r)
<< dendl;
ss << get_err.str();
+ f->close_section();
break;
}
f->close_section();
return true;
}
+ bool batched_propose = false;
for (auto h : handlers) {
if (h->can_handle(prefix)) {
+ batched_propose = h->batched_propose();
+ if (batched_propose) {
+ paxos->plug();
+ }
r = h->handle(mon, pending_fsmap, op, cmdmap, ss);
+ if (batched_propose) {
+ paxos->unplug();
+ }
+
if (r == -EAGAIN) {
// message has been enqueued for retry; return.
dout(4) << __func__ << " enqueue for retry by prepare_command" << dendl;
// success.. delay reply
wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, r, rs,
get_last_committed() + 1));
+ if (batched_propose) {
+ force_immediate_propose();
+ }
return true;
} else {
// reply immediately
return 0;
}
-void MDSMonitor::count_metadata(const string& field, Formatter *f)
+void MDSMonitor::count_metadata(const string& field, map<string,int> *out)
{
- map<string,int> by_val;
map<mds_gid_t,Metadata> meta;
load_metadata(meta);
for (auto& p : meta) {
auto q = p.second.find(field);
if (q == p.second.end()) {
- by_val["unknown"]++;
+ (*out)["unknown"]++;
} else {
- by_val[q->second]++;
+ (*out)[q->second]++;
}
}
+}
+
+void MDSMonitor::count_metadata(const string& field, Formatter *f)
+{
+ map<string,int> by_val;
+ count_metadata(field, &by_val);
f->open_object_section(field.c_str());
for (auto& p : by_val) {
f->dump_int(p.first.c_str(), p.second);
* is available, fail this daemon (remove from map) and pass its
* role to another daemon.
*/
-void MDSMonitor::maybe_replace_gid(mds_gid_t gid,
- const beacon_info_t &beacon,
+void MDSMonitor::maybe_replace_gid(mds_gid_t gid, const MDSMap::mds_info_t& info,
bool *mds_propose, bool *osd_propose)
{
assert(mds_propose != nullptr);
assert(osd_propose != nullptr);
- const MDSMap::mds_info_t info = pending_fsmap.get_info_gid(gid);
const auto fscid = pending_fsmap.mds_roles.at(gid);
- dout(10) << "no beacon from " << gid << " " << info.addr << " mds."
- << info.rank << "." << info.inc
- << " " << ceph_mds_state_name(info.state)
- << " since " << beacon.stamp << dendl;
-
// We will only take decisive action (replacing/removing a daemon)
// if we have some indicating that some other daemon(s) are successfully
// getting beacons through recently.
}
}
- // If the OSDMap is writeable, we can blacklist things, so we can
- // try failing any laggy MDS daemons. Consider each one for failure.
- if (mon->osdmon()->is_writeable()) {
- bool propose_osdmap = false;
-
- map<mds_gid_t, beacon_info_t>::iterator p = last_beacon.begin();
- while (p != last_beacon.end()) {
- mds_gid_t gid = p->first;
- auto beacon_info = p->second;
- ++p;
-
- if (!pending_fsmap.gid_exists(gid)) {
- // clean it out
- last_beacon.erase(gid);
- continue;
- }
+ bool propose_osdmap = false;
+ bool osdmap_writeable = mon->osdmon()->is_writeable();
+ auto p = last_beacon.begin();
+ while (p != last_beacon.end()) {
+ mds_gid_t gid = p->first;
+ auto beacon_info = p->second;
+ ++p;
- if (beacon_info.stamp < cutoff) {
- maybe_replace_gid(gid, beacon_info, &do_propose, &propose_osdmap);
- }
+ if (!pending_fsmap.gid_exists(gid)) {
+ // clean it out
+ last_beacon.erase(gid);
+ continue;
}
- if (propose_osdmap) {
- request_proposal(mon->osdmon());
+ if (beacon_info.stamp < cutoff) {
+ auto &info = pending_fsmap.get_info_gid(gid);
+ dout(1) << "no beacon from mds." << info.rank << "." << info.inc
+ << " (gid: " << gid << " addr: " << info.addr
+ << " state: " << ceph_mds_state_name(info.state) << ")"
+ << " since " << beacon_info.stamp << dendl;
+ // If the OSDMap is writeable, we can blacklist things, so we can
+ // try failing any laggy MDS daemons. Consider each one for failure.
+ if (osdmap_writeable) {
+ maybe_replace_gid(gid, info, &do_propose, &propose_osdmap);
+ }
}
}
+ if (propose_osdmap) {
+ request_proposal(mon->osdmon());
+ }
for (auto i : pending_fsmap.filesystems) {
auto fs = i.second;
MDSMonitor::MDSMonitor(Monitor *mn, Paxos *p, string service_name)
: PaxosService(mn, p, service_name)
{
- handlers = FileSystemCommandHandler::load();
+ handlers = FileSystemCommandHandler::load(p);
}
void MDSMonitor::on_restart()
bool maybe_promote_standby(std::shared_ptr<Filesystem> fs);
bool maybe_expand_cluster(std::shared_ptr<Filesystem> fs);
- void maybe_replace_gid(mds_gid_t gid, const beacon_info_t &beacon,
+ void maybe_replace_gid(mds_gid_t gid, const MDSMap::mds_info_t& info,
bool *mds_propose, bool *osd_propose);
void tick() override; // check state, take actions
void remove_from_metadata(MonitorDBStore::TransactionRef t);
int load_metadata(map<mds_gid_t, Metadata>& m);
void count_metadata(const string& field, Formatter *f);
+public:
+ void count_metadata(const string& field, map<string,int> *out);
+protected:
// MDS daemon GID to latest health state from that GID
std::map<uint64_t, MDSHealth> pending_daemon_health;
return true;
}
+  // Report whether `name` is known to this map: true when it equals
+  // active_name or the name of any entry in `standbys`.
+  bool have_name(const string& name) const {
+    bool found = (active_name == name);
+    for (auto it = standbys.begin(); !found && it != standbys.end(); ++it) {
+      found = (it->second.name == name);
+    }
+    return found;
+  }
+
+  // Collect the names this map knows about: active_name (only when it is
+  // non-empty) plus the name of every entry in `standbys`.
+  std::set<std::string> get_all_names() const {
+    std::set<std::string> names;
+    if (!active_name.empty()) {
+      names.insert(active_name);
+    }
+    for (auto it = standbys.begin(); it != standbys.end(); ++it) {
+      names.insert(it->second.name);
+    }
+    return names;
+  }
+
void encode(bufferlist& bl, uint64_t features) const
{
ENCODE_START(2, 1, bl);
#include "PGStatService.h"
#include "include/stringify.h"
#include "mgr/MgrContext.h"
+#include "mgr/mgr_commands.h"
#include "OSDMonitor.h"
#include "MgrMonitor.h"
+#define MGR_METADATA_PREFIX "mgr_metadata"
+
#define dout_subsys ceph_subsys_mon
#undef dout_prefix
#define dout_prefix _prefix(_dout, mon, map)
<< ").mgr e" << mgrmap.get_epoch() << " ";
}
+// Prefix for mon store of active mgr's command descriptions
+const static std::string command_descs_prefix = "mgr_command_descs";
+
void MgrMonitor::create_initial()
{
for (auto& m : tok) {
pending_map.modules.insert(m);
}
- dout(10) << __func__ << " initial modules " << pending_map.modules << dendl;
+ pending_command_descs = mgr_commands;
+ dout(10) << __func__ << " initial modules " << pending_map.modules
+ << ", " << pending_command_descs.size() << " commands"
+ << dendl;
}
void MgrMonitor::update_from_paxos(bool *need_bootstrap)
int err = get_version(version, bl);
assert(err == 0);
+ bool old_available = map.get_available();
+ uint64_t old_gid = map.get_active_gid();
+
bufferlist::iterator p = bl.begin();
map.decode(p);
}
check_subs();
+
+ if (version == 1
+ || (map.get_available()
+ && (!old_available || old_gid != map.get_active_gid()))) {
+ dout(4) << "mkfs or daemon transitioned to available, loading commands"
+ << dendl;
+ bufferlist loaded_commands;
+ int r = mon->store->get(command_descs_prefix, "", loaded_commands);
+ if (r < 0) {
+ derr << "Failed to load mgr commands: " << cpp_strerror(r) << dendl;
+ } else {
+ auto p = loaded_commands.begin();
+ ::decode(command_descs, p);
+ }
+ }
}
// feed our pet MgrClient
put_version(t, pending_map.epoch, bl);
put_last_committed(t, pending_map.epoch);
+ for (auto& p : pending_metadata) {
+ dout(10) << __func__ << " set metadata for " << p.first << dendl;
+ t->put(MGR_METADATA_PREFIX, p.first, p.second);
+ }
+ for (auto& name : pending_metadata_rm) {
+ dout(10) << __func__ << " rm metadata for " << name << dendl;
+ t->erase(MGR_METADATA_PREFIX, name);
+ }
+ pending_metadata.clear();
+ pending_metadata_rm.clear();
+
health_check_map_t next;
if (pending_map.active_gid == 0) {
auto level = should_warn_about_mgr_down();
put_value(t, "ever_had_active_mgr", 1);
}
encode_health(next, t);
+
+ if (pending_command_descs.size()) {
+ dout(4) << __func__ << " encoding " << pending_command_descs.size()
+ << " command_descs" << dendl;
+ for (auto& p : pending_command_descs) {
+ p.set_flag(MonCommand::FLAG_MGR);
+ }
+ bufferlist bl;
+ ::encode(pending_command_descs, bl);
+ t->put(command_descs_prefix, "", bl);
+ pending_command_descs.clear();
+ }
}
bool MgrMonitor::check_caps(MonOpRequestRef op, const uuid_d& fsid)
dout(4) << "available " << m->get_gid() << dendl;
mon->clog->info() << "Manager daemon " << pending_map.active_name
<< " is now available";
+
+ // This beacon should include command descriptions
+ pending_command_descs = m->get_command_descs();
+ if (pending_command_descs.empty()) {
+ // This should not happen, but it also isn't fatal: we just
+ // won't successfully update our list of commands.
+ dout(4) << "First available beacon from " << pending_map.active_name
+ << "(" << m->get_gid() << ") does not include command descs"
+ << dendl;
+ } else {
+ dout(4) << "First available beacon from " << pending_map.active_name
+ << "(" << m->get_gid() << ") includes "
+ << pending_command_descs.size() << " command descs" << dendl;
+ }
+
pending_map.available = m->get_available();
updated = true;
}
pending_map.active_gid = m->get_gid();
pending_map.active_name = m->get_name();
pending_map.available_modules = m->get_available_modules();
+ ::encode(m->get_metadata(), pending_metadata[m->get_name()]);
+ pending_metadata_rm.erase(m->get_name());
mon->clog->info() << "Activating manager daemon "
<< pending_map.active_name;
dout(10) << "new standby " << m->get_gid() << dendl;
mon->clog->debug() << "Standby manager daemon " << m->get_name()
<< " started";
+ pending_map.standbys[m->get_gid()] = {m->get_gid(), m->get_name(),
+ m->get_available_modules()};
+ ::encode(m->get_metadata(), pending_metadata[m->get_name()]);
+ pending_metadata_rm.erase(m->get_name());
updated = true;
}
}
}
} else {
assert(sub->type == "mgrdigest");
+ if (sub->next == 0) {
+ // new registration; cancel previous timer
+ cancel_timer();
+ }
if (digest_event == nullptr) {
send_digests();
}
}
if (!pending_map.available &&
+ !ever_had_active_mgr &&
should_warn_about_mgr_down() != HEALTH_OK) {
dout(10) << " exceeded mon_mgr_mkfs_grace " << g_conf->mon_mgr_mkfs_grace
<< " seconds" << dendl;
last_beacon.erase(pending_map.active_gid);
}
+ pending_metadata_rm.insert(pending_map.active_name);
+ pending_metadata.erase(pending_map.active_name);
pending_map.active_name = "";
pending_map.active_gid = 0;
pending_map.available = false;
void MgrMonitor::drop_standby(uint64_t gid)
{
+ pending_metadata_rm.insert(pending_map.standbys[gid].name);
+ pending_metadata.erase(pending_map.standbys[gid].name);
pending_map.standbys.erase(gid);
if (last_beacon.count(gid) > 0) {
last_beacon.erase(gid);
}
-
}
bool MgrMonitor::preprocess_command(MonOpRequestRef op)
}
f->close_section();
f->flush(rdata);
+ } else if (prefix == "mgr metadata") {
+ string name;
+ cmd_getval(g_ceph_context, cmdmap, "id", name);
+ if (name.size() > 0 && !map.have_name(name)) {
+ ss << "mgr." << name << " does not exist";
+ r = -ENOENT;
+ goto reply;
+ }
+ string format;
+ cmd_getval(g_ceph_context, cmdmap, "format", format);
+ boost::scoped_ptr<Formatter> f(Formatter::create(format, "json-pretty", "json-pretty"));
+ if (name.size()) {
+ f->open_object_section("mgr_metadata");
+ f->dump_string("id", name);
+ r = dump_metadata(name, f.get(), &ss);
+ if (r < 0)
+ goto reply;
+ f->close_section();
+ } else {
+ r = 0;
+ f->open_array_section("mgr_metadata");
+ for (auto& i : map.get_all_names()) {
+ f->open_object_section("mgr");
+ f->dump_string("id", i);
+ r = dump_metadata(i, f.get(), NULL);
+ if (r == -EINVAL || r == -ENOENT) {
+ // Drop error, continue to get other daemons' metadata
+ dout(4) << "No metadata for mgr." << i << dendl;
+ r = 0;
+ } else if (r < 0) {
+ // Unexpected error
+ goto reply;
+ }
+ f->close_section();
+ }
+ f->close_section();
+ }
+ f->flush(rdata);
+ } else if (prefix == "mgr versions") {
+ if (!f)
+ f.reset(Formatter::create("json-pretty"));
+ count_metadata("ceph_version", f.get());
+ f->flush(rdata);
+ r = 0;
+ } else if (prefix == "mgr count-metadata") {
+ if (!f)
+ f.reset(Formatter::create("json-pretty"));
+ string field;
+ cmd_getval(g_ceph_context, cmdmap, "property", field);
+ count_metadata(field, f.get());
+ f->flush(rdata);
+ r = 0;
} else {
return false;
}
cancel_timer();
}
+// Read the metadata stored for mgr `name` under MGR_METADATA_PREFIX in the
+// mon store and decode it into `m`.
+//
+// Returns the (negative) store error if the read fails, -EIO if the stored
+// blob does not decode (a message is written to *err when err is non-null),
+// and 0 otherwise.
+int MgrMonitor::load_metadata(const string& name, std::map<string, string>& m,
+                              ostream *err)
+{
+  bufferlist raw;
+  const int rc = mon->store->get(MGR_METADATA_PREFIX, name, raw);
+  if (rc < 0) {
+    return rc;
+  }
+  try {
+    bufferlist::iterator it = raw.begin();
+    ::decode(m, it);
+  } catch (buffer::error& e) {
+    if (err != nullptr) {
+      *err << "mgr." << name << " metadata is corrupt";
+    }
+    return -EIO;
+  }
+  return 0;
+}
+
+// For every name returned by map.get_all_names(), load that mgr's metadata
+// and tally the value found under `field` into *out.  The result of
+// load_metadata() is deliberately not checked: a mgr whose metadata does
+// not contain `field` (for instance because nothing could be loaded) is
+// tallied under "unknown".
+void MgrMonitor::count_metadata(const string& field, std::map<string,int> *out)
+{
+  const std::set<string> names = map.get_all_names();
+  for (auto it = names.begin(); it != names.end(); ++it) {
+    std::map<string,string> md;
+    load_metadata(*it, md, nullptr);
+    auto found = md.find(field);
+    if (found != md.end()) {
+      ++(*out)[found->second];
+    } else {
+      ++(*out)["unknown"];
+    }
+  }
+}
+
+// Formatter flavour of count_metadata(): emits one object section named
+// after `field`, containing an integer entry per distinct value with the
+// number of mgrs reporting it.
+void MgrMonitor::count_metadata(const string& field, Formatter *f)
+{
+  std::map<string,int> tally;
+  count_metadata(field, &tally);
+  f->open_object_section(field.c_str());
+  for (auto it = tally.begin(); it != tally.end(); ++it) {
+    f->dump_int(it->first.c_str(), it->second);
+  }
+  f->close_section();
+}
+
+// Load the metadata of mgr `name` and dump each key/value pair as a string
+// into `f`.  No section is opened or closed here.  Returns the non-zero
+// result of load_metadata() unchanged if loading fails (errors are reported
+// through `err` by load_metadata()), otherwise 0.
+int MgrMonitor::dump_metadata(const string& name, Formatter *f, ostream *err)
+{
+  std::map<string,string> md;
+  const int rc = load_metadata(name, md, err);
+  if (rc != 0) {
+    return rc;
+  }
+  for (auto it = md.begin(); it != md.end(); ++it) {
+    f->dump_string(it->first.c_str(), it->second);
+  }
+  return 0;
+}
#ifndef CEPH_MGRMONITOR_H
#define CEPH_MGRMONITOR_H
+#include <map>
+#include <set>
+
#include "include/Context.h"
#include "MgrMap.h"
#include "PaxosService.h"
+#include "MonCommand.h"
class MgrMonitor: public PaxosService
{
MgrMap pending_map;
bool ever_had_active_mgr = false;
+ std::map<std::string, bufferlist> pending_metadata;
+ std::set<std::string> pending_metadata_rm;
+
utime_t first_seen_inactive;
std::map<uint64_t, ceph::coarse_mono_clock::time_point> last_beacon;
health_status_t should_warn_about_mgr_down();
+ // Command descriptions we've learned from the active mgr
+ std::vector<MonCommand> command_descs;
+ std::vector<MonCommand> pending_command_descs;
+
public:
MgrMonitor(Monitor *mn, Paxos *p, const string& service_name)
: PaxosService(mn, p, service_name)
void print_summary(Formatter *f, std::ostream *ss) const;
+ const std::vector<MonCommand> &get_command_descs() const {
+ return command_descs;
+ }
+
+ int load_metadata(const string& name, std::map<string, string>& m,
+ ostream *err);
+ int dump_metadata(const string& name, Formatter *f, ostream *err);
+ void count_metadata(const string& field, Formatter *f);
+ void count_metadata(const string& field, std::map<string,int> *out);
+
friend class C_Updated;
};
#include <algorithm>
+#include <boost/regex.hpp>
+#include "include/assert.h"
+
static inline bool is_not_alnum_space(char c)
{
return !(isalpha(c) || isdigit(c) || (c == '-') || (c == '_'));
ostream& operator<<(ostream& out, const StringConstraint& c)
{
- if (c.prefix.length())
- return out << "prefix " << c.prefix;
- else
+ switch (c.match_type) {
+ case StringConstraint::MATCH_TYPE_EQUAL:
return out << "value " << c.value;
+ case StringConstraint::MATCH_TYPE_PREFIX:
+ return out << "prefix " << c.value;
+ case StringConstraint::MATCH_TYPE_REGEX:
+ return out << "regex " << c.value;
+ default:
+ break;
+ }
+ return out;
}
ostream& operator<<(ostream& out, const MonCapGrant& m)
for (map<string,StringConstraint>::const_iterator p = m.command_args.begin();
p != m.command_args.end();
++p) {
- if (p->second.value.length())
- out << " " << maybe_quote_string(p->first) << "=" << maybe_quote_string(p->second.value);
- else
- out << " " << maybe_quote_string(p->first) << " prefix " << maybe_quote_string(p->second.prefix);
+ switch (p->second.match_type) {
+ case StringConstraint::MATCH_TYPE_EQUAL:
+ out << " " << maybe_quote_string(p->first) << "="
+ << maybe_quote_string(p->second.value);
+ break;
+ case StringConstraint::MATCH_TYPE_PREFIX:
+ out << " " << maybe_quote_string(p->first) << " prefix "
+ << maybe_quote_string(p->second.value);
+ break;
+ case StringConstraint::MATCH_TYPE_REGEX:
+ out << " " << maybe_quote_string(p->first) << " regex "
+ << maybe_quote_string(p->second.value);
+ break;
+ default:
+ break;
+ }
}
}
}
(mon_rwxa_t, allow))
BOOST_FUSION_ADAPT_STRUCT(StringConstraint,
- (std::string, value)
- (std::string, prefix))
+ (StringConstraint::MatchType, match_type)
+ (std::string, value))
// </magic>
profile_grants.push_back(MonCapGrant("osd", MON_CAP_R | MON_CAP_W));
profile_grants.push_back(MonCapGrant("auth", MON_CAP_R | MON_CAP_X));
profile_grants.push_back(MonCapGrant("config-key", MON_CAP_R | MON_CAP_W));
- string prefix = string("daemon-private/mgr/");
- profile_grants.push_back(MonCapGrant("config-key get", "key",
- StringConstraint("", prefix)));
- profile_grants.push_back(MonCapGrant("config-key put", "key",
- StringConstraint("", prefix)));
- profile_grants.push_back(MonCapGrant("config-key exists", "key",
- StringConstraint("", prefix)));
- profile_grants.push_back(MonCapGrant("config-key delete", "key",
- StringConstraint("", prefix)));
+ StringConstraint constraint(StringConstraint::MATCH_TYPE_PREFIX,
+ "daemon-private/mgr/");
+ profile_grants.push_back(MonCapGrant("config-key get", "key", constraint));
+ profile_grants.push_back(MonCapGrant("config-key set", "key", constraint));
+ profile_grants.push_back(MonCapGrant("config-key put", "key", constraint));
+ profile_grants.push_back(MonCapGrant("config-key exists", "key", constraint));
+ profile_grants.push_back(MonCapGrant("config-key delete", "key", constraint));
}
if (profile == "osd" || profile == "mds" || profile == "mon" ||
profile == "mgr") {
+ StringConstraint constraint(StringConstraint::MATCH_TYPE_PREFIX,
+ string("daemon-private/") + stringify(name) +
+ string("/"));
string prefix = string("daemon-private/") + stringify(name) + string("/");
- profile_grants.push_back(MonCapGrant("config-key get", "key", StringConstraint("", prefix)));
- profile_grants.push_back(MonCapGrant("config-key put", "key", StringConstraint("", prefix)));
- profile_grants.push_back(MonCapGrant("config-key exists", "key", StringConstraint("", prefix)));
- profile_grants.push_back(MonCapGrant("config-key delete", "key", StringConstraint("", prefix)));
+ profile_grants.push_back(MonCapGrant("config-key get", "key", constraint));
+ profile_grants.push_back(MonCapGrant("config-key put", "key", constraint));
+ profile_grants.push_back(MonCapGrant("config-key set", "key", constraint));
+ profile_grants.push_back(MonCapGrant("config-key exists", "key", constraint));
+ profile_grants.push_back(MonCapGrant("config-key delete", "key", constraint));
}
if (profile == "bootstrap-osd") {
- string prefix = "dm-crypt/osd";
- profile_grants.push_back(MonCapGrant("config-key put", "key", StringConstraint("", prefix)));
profile_grants.push_back(MonCapGrant("mon", MON_CAP_R)); // read monmap
profile_grants.push_back(MonCapGrant("osd", MON_CAP_R)); // read osdmap
profile_grants.push_back(MonCapGrant("mon getmap"));
- profile_grants.push_back(MonCapGrant("osd create"));
- profile_grants.push_back(MonCapGrant("auth get-or-create"));
- profile_grants.back().command_args["entity"] = StringConstraint("", "client.");
- prefix = "allow command \"config-key get\" with key=\"dm-crypt/osd/";
- profile_grants.back().command_args["caps_mon"] = StringConstraint("", prefix);
- profile_grants.push_back(MonCapGrant("auth add"));
- profile_grants.back().command_args["entity"] = StringConstraint("", "osd.");
- profile_grants.back().command_args["caps_mon"] = StringConstraint("allow profile osd", "");
- profile_grants.back().command_args["caps_osd"] = StringConstraint("allow *", "");
+ profile_grants.push_back(MonCapGrant("osd new"));
}
if (profile == "bootstrap-mds") {
profile_grants.push_back(MonCapGrant("mon", MON_CAP_R)); // read monmap
profile_grants.push_back(MonCapGrant("osd", MON_CAP_R)); // read osdmap
profile_grants.push_back(MonCapGrant("mon getmap"));
profile_grants.push_back(MonCapGrant("auth get-or-create")); // FIXME: this can expose other mds keys
- profile_grants.back().command_args["entity"] = StringConstraint("", "mds.");
- profile_grants.back().command_args["caps_mon"] = StringConstraint("allow profile mds", "");
- profile_grants.back().command_args["caps_osd"] = StringConstraint("allow rwx", "");
- profile_grants.back().command_args["caps_mds"] = StringConstraint("allow", "");
+ profile_grants.back().command_args["entity"] = StringConstraint(
+ StringConstraint::MATCH_TYPE_PREFIX, "mds.");
+ profile_grants.back().command_args["caps_mon"] = StringConstraint(
+ StringConstraint::MATCH_TYPE_EQUAL, "allow profile mds");
+ profile_grants.back().command_args["caps_osd"] = StringConstraint(
+ StringConstraint::MATCH_TYPE_EQUAL, "allow rwx");
+ profile_grants.back().command_args["caps_mds"] = StringConstraint(
+ StringConstraint::MATCH_TYPE_EQUAL, "allow");
}
if (profile == "bootstrap-mgr") {
profile_grants.push_back(MonCapGrant("mon", MON_CAP_R)); // read monmap
profile_grants.push_back(MonCapGrant("osd", MON_CAP_R)); // read osdmap
profile_grants.push_back(MonCapGrant("mon getmap"));
profile_grants.push_back(MonCapGrant("auth get-or-create")); // FIXME: this can expose other mgr keys
- profile_grants.back().command_args["entity"] = StringConstraint("", "mgr.");
- profile_grants.back().command_args["caps_mon"] = StringConstraint("allow profile mgr", "");
+ profile_grants.back().command_args["entity"] = StringConstraint(
+ StringConstraint::MATCH_TYPE_PREFIX, "mgr.");
+ profile_grants.back().command_args["caps_mon"] = StringConstraint(
+ StringConstraint::MATCH_TYPE_EQUAL, "allow profile mgr");
}
if (profile == "bootstrap-rgw") {
profile_grants.push_back(MonCapGrant("mon", MON_CAP_R)); // read monmap
profile_grants.push_back(MonCapGrant("osd", MON_CAP_R)); // read osdmap
profile_grants.push_back(MonCapGrant("mon getmap"));
profile_grants.push_back(MonCapGrant("auth get-or-create")); // FIXME: this can expose other mds keys
- profile_grants.back().command_args["entity"] = StringConstraint("", "client.rgw.");
- profile_grants.back().command_args["caps_mon"] = StringConstraint("allow rw", "");
- profile_grants.back().command_args["caps_osd"] = StringConstraint("allow rwx", "");
+ profile_grants.back().command_args["entity"] = StringConstraint(
+ StringConstraint::MATCH_TYPE_PREFIX, "client.rgw.");
+ profile_grants.back().command_args["caps_mon"] = StringConstraint(
+ StringConstraint::MATCH_TYPE_EQUAL, "allow rw");
+ profile_grants.back().command_args["caps_osd"] = StringConstraint(
+ StringConstraint::MATCH_TYPE_EQUAL, "allow rwx");
}
if (profile == "fs-client") {
profile_grants.push_back(MonCapGrant("mon", MON_CAP_R));
profile_grants.push_back(MonCapGrant("osd", MON_CAP_R));
profile_grants.push_back(MonCapGrant("pg", MON_CAP_R));
}
+ if (profile == "rbd") {
+   // read-only access to the mon, osd and pg services
+   profile_grants.push_back(MonCapGrant("mon", MON_CAP_R));
+   profile_grants.push_back(MonCapGrant("osd", MON_CAP_R));
+   profile_grants.push_back(MonCapGrant("pg", MON_CAP_R));
+
+   // exclusive lock dead-client blacklisting (IP+nonce required)
+   profile_grants.push_back(MonCapGrant("osd blacklist"));
+   profile_grants.back().command_args["blacklistop"] = StringConstraint(
+     StringConstraint::MATCH_TYPE_EQUAL, "add");
+   // "<addr>/<nonce>": at least one non-'/' character, a '/', then at least
+   // one digit.  Regex constraints are compiled with the POSIX basic grammar,
+   // so "one or more" is written as "x x*" instead of "x+".  The previous
+   // pattern allowed only a single character before the '/', which no real
+   // address can satisfy, and it accepted an empty nonce.
+   profile_grants.back().command_args["addr"] = StringConstraint(
+     StringConstraint::MATCH_TYPE_REGEX, "^[^/][^/]*/[0-9][0-9]*$");
+ }
if (profile == "role-definer") {
// grants ALL caps to the auth subsystem, read-only on the
// argument must be present if a constraint exists
if (q == c_args.end())
return 0;
- if (p->second.value.length()) {
- // match value
+ switch (p->second.match_type) {
+ case StringConstraint::MATCH_TYPE_EQUAL:
if (p->second.value != q->second)
return 0;
- } else {
- // match prefix
- if (q->second.find(p->second.prefix) != 0)
+ break;
+ case StringConstraint::MATCH_TYPE_PREFIX:
+ if (q->second.find(p->second.value) != 0)
return 0;
+ break;
+ case StringConstraint::MATCH_TYPE_REGEX:
+ {
+ boost::regex pattern(p->second.value,
+ boost::regex::basic | boost::regex::no_except);
+ if (pattern.empty() || !boost::regex_match(q->second, pattern))
+ return 0;
+ }
+ break;
+ default:
+ break;
}
}
return MON_CAP_ALL;
spaces = +(lit(' ') | lit('\n') | lit('\t'));
// command := command[=]cmd [k1=v1 k2=v2 ...]
- str_match = '=' >> str >> qi::attr(string());
- str_prefix = spaces >> lit("prefix") >> spaces >> qi::attr(string()) >> str;
- kv_pair = str >> (str_match | str_prefix);
+ str_match = '=' >> qi::attr(StringConstraint::MATCH_TYPE_EQUAL) >> str;
+ str_prefix = spaces >> lit("prefix") >> spaces >>
+ qi::attr(StringConstraint::MATCH_TYPE_PREFIX) >> str;
+ str_regex = spaces >> lit("regex") >> spaces >>
+ qi::attr(StringConstraint::MATCH_TYPE_REGEX) >> str;
+ kv_pair = str >> (str_match | str_prefix | str_regex);
kv_map %= kv_pair >> *(spaces >> kv_pair);
command_match = -spaces >> lit("allow") >> spaces >> lit("command") >> (lit('=') | spaces)
>> qi::attr(string()) >> qi::attr(string())
>> spaces >> rwxa;
// profile foo
- profile_match %= -spaces >> lit("allow") >> spaces >> lit("profile") >> (lit('=') | spaces)
+ profile_match %= -spaces >> -(lit("allow") >> spaces)
+ >> lit("profile") >> (lit('=') | spaces)
>> qi::attr(string())
>> str
>> qi::attr(string())
qi::rule<Iterator, string()> unquoted_word;
qi::rule<Iterator, string()> str;
- qi::rule<Iterator, StringConstraint()> str_match, str_prefix;
+ qi::rule<Iterator, StringConstraint()> str_match, str_prefix, str_regex;
qi::rule<Iterator, pair<string, StringConstraint>()> kv_pair;
qi::rule<Iterator, map<string, StringConstraint>()> kv_map;
ostream& operator<<(ostream& out, const mon_rwxa_t& p);
struct StringConstraint {
+ enum MatchType {
+ MATCH_TYPE_NONE,
+ MATCH_TYPE_EQUAL,
+ MATCH_TYPE_PREFIX,
+ MATCH_TYPE_REGEX
+ };
+
+ MatchType match_type = MATCH_TYPE_NONE;
string value;
- string prefix;
StringConstraint() {}
- StringConstraint(string a, string b)
- : value(std::move(a)), prefix(std::move(b)) {}
+ StringConstraint(MatchType match_type, string value)
+ : match_type(match_type), value(value) {
+ }
};
ostream& operator<<(ostream& out, const StringConstraint& c);
#include <random>
+#include "include/scope_guard.h"
+
#include "messages/MMonGetMap.h"
#include "messages/MMonGetVersion.h"
#include "messages/MMonGetVersionReply.h"
more_log_pending(false),
want_monmap(true),
had_a_connection(false),
- reopen_interval_multiplier(1.0),
+ reopen_interval_multiplier(
+ cct_->_conf->get_val<double>("mon_client_hunt_interval_min_multiple")),
last_mon_command_tid(0),
version_req_id(0)
{
}
had_a_connection = true;
- reopen_interval_multiplier /= 2.0;
- if (reopen_interval_multiplier < 1.0)
- reopen_interval_multiplier = 1.0;
+ _un_backoff();
}
void MonClient::tick()
{
ldout(cct, 10) << __func__ << dendl;
+ auto reschedule_tick = make_scope_guard([this] {
+ schedule_tick();
+ });
+
_check_auth_tickets();
if (_hunting()) {
ldout(cct, 1) << "continuing hunt" << dendl;
- _reopen_session();
+ return _reopen_session();
} else if (active_con) {
// just renew as needed
utime_t now = ceph_clock_now();
if (interval > cct->_conf->mon_client_ping_timeout) {
ldout(cct, 1) << "no keepalive since " << lk << " (" << interval
<< " seconds), reconnecting" << dendl;
- _reopen_session();
+ return _reopen_session();
}
-
send_log();
}
+
+ _un_backoff();
}
+}
- schedule_tick();
+// Relax the reconnect backoff: divide reopen_interval_multiplier by the
+// mon_client_hunt_interval_backoff option, but never let it drop below
+// mon_client_hunt_interval_min_multiple.  The new value is logged at level 20.
+void MonClient::_un_backoff()
+{
+  const double floor_multiple =
+    cct->_conf->get_val<double>("mon_client_hunt_interval_min_multiple");
+  const double backoff =
+    cct->_conf->get_val<double>("mon_client_hunt_interval_backoff");
+  const double relaxed = reopen_interval_multiplier / backoff;
+
+  // same selection as std::max(floor_multiple, relaxed)
+  reopen_interval_multiplier =
+    (floor_multiple < relaxed) ? relaxed : floor_multiple;
+
+  ldout(cct, 20) << __func__ << " reopen_interval_multipler now "
+                 << reopen_interval_multiplier << dendl;
}
void MonClient::schedule_tick()
void _finish_auth(int auth_err);
void _reopen_session(int rank = -1);
MonConnection& _add_conn(unsigned rank, uint64_t global_id);
+ void _un_backoff();
void _add_conns(uint64_t global_id);
void _send_mon_message(Message *m);
--- /dev/null
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2017 John Spray <john.spray@redhat.com>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ */
+
+#pragma once
+
+#include <string>
+#include "include/encoding.h"
+
+// Description of one command: parse signature, help text, owning module,
+// required permission characters, availability string and a FLAG_* bitmask.
+// Member order and the lack of default initializers are left untouched so
+// brace-initialised tables of MonCommand keep working.
+struct MonCommand {
+  std::string cmdstring;
+  std::string helpstring;
+  std::string module;
+  std::string req_perms;
+  std::string availability;
+  uint64_t flags;
+
+  // Bits that may be set in `flags`
+  static const uint64_t FLAG_NONE = 0;
+  static const uint64_t FLAG_NOFORWARD = 1 << 0;
+  static const uint64_t FLAG_OBSOLETE = 1 << 1;
+  static const uint64_t FLAG_DEPRECATED = 1 << 2;
+  static const uint64_t FLAG_MGR = 1 << 3;
+
+  // True when any bit of `flag` is set in `flags`.
+  bool has_flag(uint64_t flag) const {
+    const uint64_t hit = flags & flag;
+    return hit != 0;
+  }
+  void set_flag(uint64_t flag) { flags = flags | flag; }
+  void unset_flag(uint64_t flag) { flags = flags & ~flag; }
+
+  // Versioned encoding: the bare string fields followed by `flags`.
+  void encode(bufferlist &bl) const {
+    ENCODE_START(1, 1, bl);
+    encode_bare(bl);
+    ::encode(flags, bl);
+    ENCODE_FINISH(bl);
+  }
+
+  // Counterpart of encode(); reads the fields in the same order.
+  void decode(bufferlist::iterator &bl) {
+    DECODE_START(1, bl);
+    decode_bare(bl);
+    ::decode(flags, bl);
+    DECODE_FINISH(bl);
+  }
+
+  /**
+   * Unversioned encoding of the five string fields (no flags), used by
+   * encode_array.
+   */
+  void encode_bare(bufferlist &bl) const {
+    ::encode(cmdstring, bl);
+    ::encode(helpstring, bl);
+    ::encode(module, bl);
+    ::encode(req_perms, bl);
+    ::encode(availability, bl);
+  }
+  /**
+   * Counterpart of encode_bare(); `flags` is not touched.
+   */
+  void decode_bare(bufferlist::iterator &bl) {
+    ::decode(cmdstring, bl);
+    ::decode(helpstring, bl);
+    ::decode(module, bl);
+    ::decode(req_perms, bl);
+    ::decode(availability, bl);
+  }
+
+  // Two descriptions are compatible when signature, module, permissions and
+  // availability all match; help text and flags are not compared.
+  bool is_compat(const MonCommand* o) const {
+    if (cmdstring != o->cmdstring)
+      return false;
+    if (module != o->module)
+      return false;
+    if (req_perms != o->req_perms)
+      return false;
+    return availability == o->availability;
+  }
+
+  // Convenience predicates, one per flag bit.
+  bool is_noforward() const {
+    return has_flag(FLAG_NOFORWARD);
+  }
+
+  bool is_obsolete() const {
+    return has_flag(FLAG_OBSOLETE);
+  }
+
+  bool is_deprecated() const {
+    return has_flag(FLAG_DEPRECATED);
+  }
+
+  bool is_mgr() const {
+    return has_flag(FLAG_MGR);
+  }
+
+  // Encode `size` commands: a 16-bit count, every command's bare fields,
+  // and then every command's flags as a trailing group.
+  static void encode_array(const MonCommand *cmds, int size, bufferlist &bl) {
+    ENCODE_START(2, 1, bl);
+    uint16_t count = size;
+    ::encode(count, bl);
+    for (int idx = 0; idx < size; ++idx) {
+      const MonCommand &cmd = cmds[idx];
+      cmd.encode_bare(bl);
+    }
+    for (int idx = 0; idx < size; ++idx) {
+      const MonCommand &cmd = cmds[idx];
+      ::encode(cmd.flags, bl);
+    }
+    ENCODE_FINISH(bl);
+  }
+  // Decode an array written by encode_array.  `*cmds` receives a new[]
+  // allocation of `*size` entries.  Flags are read only when the decoded
+  // struct_v is at least 2; otherwise every entry's flags are set to 0.
+  static void decode_array(MonCommand **cmds, int *size,
+                           bufferlist::iterator &bl) {
+    DECODE_START(2, bl);
+    uint16_t count = 0;
+    ::decode(count, bl);
+    *size = count;
+    *cmds = new MonCommand[*size];
+    MonCommand *entries = *cmds;
+    const int total = *size;
+    for (int idx = 0; idx < total; ++idx) {
+      entries[idx].decode_bare(bl);
+    }
+    const bool have_flags = (struct_v >= 2);
+    for (int idx = 0; idx < total; ++idx) {
+      if (have_flags) {
+        ::decode(entries[idx].flags, bl);
+      } else {
+        entries[idx].flags = 0;
+      }
+    }
+    DECODE_FINISH(bl);
+  }
+
+  // True when permission character `p` occurs in req_perms.
+  bool requires_perm(char p) const {
+    const std::string::size_type pos = req_perms.find(p);
+    return pos != std::string::npos;
+  }
+};
+WRITE_CLASS_ENCODER(MonCommand)
"auth", "rx", "cli,rest")
COMMAND("auth print_key name=entity,type=CephString", "display requested key", \
"auth", "rx", "cli,rest")
-COMMAND("auth list", "list authentication state", "auth", "rx", "cli,rest")
+COMMAND_WITH_FLAG("auth list", "list authentication state", "auth", "rx", "cli,rest",
+ FLAG(DEPRECATED))
+COMMAND("auth ls", "list authentication state", "auth", "rx", "cli,rest")
COMMAND("auth import", "auth import: read keyring file from -i <file>", \
"auth", "rwx", "cli,rest")
COMMAND("auth add " \
"name=level,type=CephChoices,strings=debug|info|sec|warn|error,req=false "
"name=channel,type=CephChoices,strings=*|cluster|audit,req=false", \
"print last few lines of the cluster log", \
- "mon", "rw", "cli,rest")
+ "mon", "r", "cli,rest")
COMMAND_WITH_FLAG("injectargs " \
"name=injected_args,type=CephString,n=N", \
"inject config arguments into monitor", "mon", "rw", "cli,rest",
FLAG(NOFORWARD))
+COMMAND("config set " \
+ "name=key,type=CephString name=value,type=CephString",
+ "Set a configuration option at runtime (not persistent)",
+ "mon", "rw", "cli,rest")
COMMAND("status", "show cluster status", "mon", "r", "cli,rest")
COMMAND("health name=detail,type=CephChoices,strings=detail,req=false", \
"show cluster health", "mon", "r", "cli,rest")
COMMAND("mon versions",
"check running versions of monitors",
"mon", "r", "cli,rest")
+COMMAND("versions",
+ "check running versions of ceph daemons",
+ "mon", "r", "cli,rest")
+
/*
"print summary of OSD map", "osd", "r", "cli,rest")
COMMAND("osd tree " \
"name=epoch,type=CephInt,range=0,req=false " \
- "name=states,type=CephChoices,strings=up|down|in|out,n=N,req=false", \
+ "name=states,type=CephChoices,strings=up|down|in|out|destroyed,n=N,req=false", \
"print OSD tree", "osd", "r", "cli,rest")
COMMAND("osd ls " \
"name=epoch,type=CephInt,range=0,req=false", \
COMMAND("osd blacklist ls", "show blacklisted clients", "osd", "r", "cli,rest")
COMMAND("osd blacklist clear", "clear all blacklisted clients", "osd", "rw",
"cli,rest")
-COMMAND("osd crush rule list", "list crush rules", "osd", "r", "cli,rest")
+COMMAND_WITH_FLAG("osd crush rule list", "list crush rules", "osd", "r", "cli,rest",
+ FLAG(DEPRECATED))
COMMAND("osd crush rule ls", "list crush rules", "osd", "r", "cli,rest")
COMMAND("osd crush rule dump " \
"name=name,type=CephString,goodchars=[A-Za-z0-9-_.],req=false", \
"set the <class> of the osd(s) <id> [<id>...]," \
"or use <all|any|*> to set all.", \
"osd", "rw", "cli,rest")
+COMMAND("osd crush rm-device-class " \
+ "name=ids,type=CephString,n=N", \
+ "remove class of the osd(s) <id> [<id>...]," \
+ "or use <all|any|*> to remove all.", \
+ "osd", "rw", "cli,rest")
COMMAND("osd crush create-or-move " \
"name=id,type=CephOsdName " \
"name=weight,type=CephFloat,range=0.0 " \
COMMAND("osd crush get-tunable " \
"name=tunable,type=CephChoices,strings=straw_calc_version",
"get crush tunable <tunable>",
- "osd", "rw", "cli,rest")
+ "osd", "r", "cli,rest")
COMMAND("osd crush show-tunables", \
"show current crush tunables", "osd", "r", "cli,rest")
COMMAND("osd crush rule create-simple " \
COMMAND("osd crush rule rm " \
"name=name,type=CephString,goodchars=[A-Za-z0-9-_.] ", \
"remove crush rule <name>", "osd", "rw", "cli,rest")
-COMMAND("osd crush tree",
+COMMAND("osd crush tree "
+ "name=shadow,type=CephChoices,strings=--show-shadow,req=false", \
"dump crush buckets and items in a tree view",
"osd", "r", "cli,rest")
-COMMAND("osd crush class create " \
- "name=class,type=CephString,goodchars=[A-Za-z0-9-_]", \
- "create crush device class <class>", \
- "osd", "rw", "cli,rest")
COMMAND("osd crush class rm " \
"name=class,type=CephString,goodchars=[A-Za-z0-9-_]", \
"remove crush device class <class>", \
"osd", "rw", "cli,rest")
-COMMAND("osd crush class rename " \
- "name=srcname,type=CephString,goodchars=[A-Za-z0-9-_] " \
- "name=dstname,type=CephString,goodchars=[A-Za-z0-9-_]", \
- "rename crush device class <srcname> to <dstname>", \
- "osd", "rw", "cli,rest")
COMMAND("osd crush class ls", \
"list all crush device classes", \
"osd", "r", "cli,rest")
"name=class,type=CephString,goodchars=[A-Za-z0-9-_]", \
"list all osds belonging to the specific <class>", \
"osd", "r", "cli,rest")
+COMMAND("osd crush weight-set ls",
+ "list crush weight sets",
+ "osd", "r", "cli,rest")
+COMMAND("osd crush weight-set dump",
+ "dump crush weight sets",
+ "osd", "r", "cli,rest")
+COMMAND("osd crush weight-set create-compat",
+ "create a default backward-compatible weight-set",
+ "osd", "rw", "cli,rest")
+COMMAND("osd crush weight-set create " \
+ "name=pool,type=CephPoolname "\
+ "name=mode,type=CephChoices,strings=flat|positional",
+ "create a weight-set for a given pool",
+ "osd", "rw", "cli,rest")
+COMMAND("osd crush weight-set rm name=pool,type=CephPoolname",
+ "remove the weight-set for a given pool",
+ "osd", "rw", "cli,rest")
+COMMAND("osd crush weight-set rm-compat",
+ "remove the backward-compatible weight-set",
+ "osd", "rw", "cli,rest")
+COMMAND("osd crush weight-set reweight " \
+ "name=pool,type=CephPoolname " \
+ "name=item,type=CephString " \
+ "name=weight,type=CephFloat,range=0.0,n=N",
+ "set weight for an item (bucket or osd) in a pool's weight-set",
+ "osd", "rw", "cli,rest")
+COMMAND("osd crush weight-set reweight-compat " \
+ "name=item,type=CephString " \
+ "name=weight,type=CephFloat,range=0.0,n=N",
+ "set weight for an item (bucket or osd) in the backward-compatible weight-set",
+ "osd", "rw", "cli,rest")
COMMAND("osd setmaxosd " \
"name=newmax,type=CephInt,range=0", \
"set new maximum osd value", "osd", "rw", "cli,rest")
"list all erasure code profiles", \
"osd", "r", "cli,rest")
COMMAND("osd set " \
- "name=key,type=CephChoices,strings=full|pause|noup|nodown|noout|noin|nobackfill|norebalance|norecover|noscrub|nodeep-scrub|notieragent|sortbitwise|require_jewel_osds|require_kraken_osds", \
+ "name=key,type=CephChoices,strings=full|pause|noup|nodown|noout|noin|nobackfill|norebalance|norecover|noscrub|nodeep-scrub|notieragent|sortbitwise|recovery_deletes|require_jewel_osds|require_kraken_osds", \
"set <key>", "osd", "rw", "cli,rest")
COMMAND("osd unset " \
"name=key,type=CephChoices,strings=full|pause|noup|nodown|noout|noin|nobackfill|norebalance|norecover|noscrub|nodeep-scrub|notieragent", \
"name=weights,type=CephString",
"reweight osds with {<id>: <weight>,...})",
"osd", "rw", "cli,rest")
+COMMAND("osd force-create-pg " \
+ "name=pgid,type=CephPgid ",
+ "force creation of pg <pgid>",
+ "osd", "rw", "cli,rest")
COMMAND("osd pg-temp " \
"name=pgid,type=CephPgid " \
"name=id,type=CephOsdName,n=N,req=false", \
"name=pool,type=CephPoolname ",
"obtain object or byte limits for pool",
"osd", "r", "cli,rest")
+COMMAND("osd pool application enable " \
+ "name=pool,type=CephPoolname " \
+ "name=app,type=CephString,goodchars=[A-Za-z0-9-_.] " \
+ "name=force,type=CephChoices,strings=--yes-i-really-mean-it,req=false", \
+ "enable use of an application <app> [cephfs,rbd,rgw] on pool <poolname>",
+ "osd", "rw", "cli,rest")
+COMMAND("osd pool application disable " \
+ "name=pool,type=CephPoolname " \
+ "name=app,type=CephString " \
+ "name=force,type=CephChoices,strings=--yes-i-really-mean-it,req=false", \
+ "disables use of an application <app> on pool <poolname>",
+ "osd", "rw", "cli,rest")
+COMMAND("osd pool application set " \
+ "name=pool,type=CephPoolname " \
+ "name=app,type=CephString " \
+ "name=key,type=CephString,goodchars=[A-Za-z0-9-_.] " \
+ "name=value,type=CephString,goodchars=[A-Za-z0-9-_.=]",
+ "sets application <app> metadata key <key> to <value> on pool <poolname>",
+ "osd", "rw", "cli,rest")
+COMMAND("osd pool application rm " \
+ "name=pool,type=CephPoolname " \
+ "name=app,type=CephString " \
+ "name=key,type=CephString",
+ "removes application <app> metadata key <key> on pool <poolname>",
+ "osd", "rw", "cli,rest")
COMMAND("osd utilization",
"get basic pg distribution stats",
"osd", "r", "cli,rest")
COMMAND("config-key get " \
"name=key,type=CephString", \
"get <key>", "config-key", "r", "cli,rest")
-COMMAND("config-key put " \
+COMMAND("config-key set " \
"name=key,type=CephString " \
"name=val,type=CephString,req=false", \
- "put <key>, value <val>", "config-key", "rw", "cli,rest")
+ "set <key> to value <val>", "config-key", "rw", "cli,rest")
+COMMAND_WITH_FLAG("config-key put " \
+ "name=key,type=CephString " \
+ "name=val,type=CephString,req=false", \
+ "put <key>, value <val>", "config-key", "rw", "cli,rest",
+ FLAG(DEPRECATED))
COMMAND("config-key del " \
"name=key,type=CephString", \
"delete <key>", "config-key", "rw", "cli,rest")
COMMAND("config-key exists " \
"name=key,type=CephString", \
"check for <key>'s existence", "config-key", "r", "cli,rest")
-COMMAND("config-key list ", "list keys", "config-key", "r", "cli,rest")
+COMMAND_WITH_FLAG("config-key list ", "list keys", "config-key", "r", "cli,rest",
+ FLAG(DEPRECATED))
+COMMAND("config-key ls ", "list keys", "config-key", "r", "cli,rest")
COMMAND("config-key dump", "dump keys and values", "config-key", "r", "cli,rest")
COMMAND("mgr module disable " \
"name=module,type=CephString",
"disable mgr module", "mgr", "rw", "cli,rest")
+COMMAND("mgr metadata name=id,type=CephString,req=false",
+ "dump metadata for all daemons or a specific daemon",
+ "mgr", "r", "cli,rest")
+COMMAND("mgr count-metadata name=property,type=CephString",
+ "count ceph-mgr daemons by metadata field property",
+ "mgr", "r", "cli,rest")
+COMMAND("mgr versions", \
+ "check running versions of ceph-mgr daemons",
+ "mgr", "r", "cli,rest")
#include <mon/MonCommands.h>
#undef COMMAND
#undef COMMAND_WITH_FLAG
-
- // FIXME: slurp up the Mgr commands too
-
-#define COMMAND(parsesig, helptext, modulename, req_perms, avail) \
- {parsesig, helptext, modulename, req_perms, avail, FLAG(MGR)},
-#define COMMAND_WITH_FLAG(parsesig, helptext, modulename, req_perms, avail, flags) \
- {parsesig, helptext, modulename, req_perms, avail, flags | FLAG(MGR)},
-#include <mgr/MgrCommands.h>
-#undef COMMAND
-#undef COMMAND_WITH_FLAG
-
};
}
}
-const MonCommand *Monitor::_get_moncommand(const string &cmd_prefix,
- MonCommand *cmds, int cmds_size)
+const MonCommand *Monitor::_get_moncommand(
+ const string &cmd_prefix,
+ const MonCommand *cmds,
+ int cmds_size)
{
- MonCommand *this_cmd = NULL;
- for (MonCommand *cp = cmds;
+ const MonCommand *this_cmd = NULL;
+ for (const MonCommand *cp = cmds;
cp < &cmds[cmds_size]; cp++) {
if (cp->cmdstring.compare(0, cmd_prefix.size(), cmd_prefix) == 0) {
this_cmd = cp;
return capable;
}
-void Monitor::format_command_descriptions(const MonCommand *commands,
- unsigned commands_size,
+void Monitor::format_command_descriptions(const std::vector<MonCommand> &commands,
Formatter *f,
bufferlist *rdata,
bool hide_mgr_flag)
{
int cmdnum = 0;
f->open_object_section("command_descriptions");
- for (const MonCommand *cp = commands;
- cp < &commands[commands_size]; cp++) {
-
- unsigned flags = cp->flags;
+ for (const auto &cmd : commands) {
+ unsigned flags = cmd.flags;
if (hide_mgr_flag) {
flags &= ~MonCommand::FLAG_MGR;
}
ostringstream secname;
secname << "cmd" << setfill('0') << std::setw(3) << cmdnum;
dump_cmddesc_to_json(f, secname.str(),
- cp->cmdstring, cp->helpstring, cp->module,
- cp->req_perms, cp->availability, flags);
+ cmd.cmdstring, cmd.helpstring, cmd.module,
+ cmd.req_perms, cmd.availability, flags);
cmdnum++;
}
f->close_section(); // command_descriptions
// hide mgr commands until luminous upgrade is complete
bool hide_mgr_flag =
osdmon()->osdmap.require_osd_release < CEPH_RELEASE_LUMINOUS;
- format_command_descriptions(leader_supported_mon_commands,
- leader_supported_mon_commands_size, f, &rdata,
- hide_mgr_flag);
+
+ std::vector<MonCommand> commands;
+ commands = static_cast<MgrMonitor*>(
+ paxos_service[PAXOS_MGR])->get_command_descs();
+
+ for (int i = 0 ; i < leader_supported_mon_commands_size; ++i) {
+ commands.push_back(leader_supported_mon_commands[i]);
+ }
+
+ format_command_descriptions(commands, f, &rdata, hide_mgr_flag);
delete f;
reply_command(op, 0, "", rdata, 0);
return;
// validate command is in leader map
const MonCommand *leader_cmd;
+ const auto& mgr_cmds = mgrmon()->get_command_descs();
+ const MonCommand *mgr_cmd = nullptr;
+ if (!mgr_cmds.empty()) {
+ mgr_cmd = _get_moncommand(prefix, &mgr_cmds.at(0), mgr_cmds.size());
+ }
leader_cmd = _get_moncommand(prefix,
// the boost underlying this isn't const for some reason
const_cast<MonCommand*>(leader_supported_mon_commands),
leader_supported_mon_commands_size);
if (!leader_cmd) {
- reply_command(op, -EINVAL, "command not known", 0);
- return;
+ leader_cmd = mgr_cmd;
+ if (!leader_cmd) {
+ reply_command(op, -EINVAL, "command not known", 0);
+ return;
+ }
}
// validate command is in our map & matches, or forward if it is allowed
const MonCommand *mon_cmd = _get_moncommand(prefix, mon_commands,
ARRAY_SIZE(mon_commands));
+ if (!mon_cmd) {
+ mon_cmd = mgr_cmd;
+ }
if (!is_leader()) {
if (!mon_cmd) {
if (leader_cmd->is_noforward()) {
f->flush(rdata);
r = 0;
rs = "";
+ } else if (prefix == "config set") {
+ std::string key;
+ cmd_getval(cct, cmdmap, "key", key);
+ std::string val;
+ cmd_getval(cct, cmdmap, "value", val);
+ r = g_conf->set_val(key, val, true, &ss);
+ rs = ss.str();
+ goto out;
} else if (prefix == "status" ||
prefix == "health" ||
prefix == "df") {
rdata.append(ds);
rs = "";
r = 0;
+ } else if (prefix == "versions") {
+   // Report how many daemons of each type run each "ceph_version" value,
+   // plus an "overall" section summing every type.  Falls back to
+   // json-pretty when no formatter was requested.
+   if (!f)
+     f.reset(Formatter::create("json-pretty"));
+   map<string,int> overall;
+   f->open_object_section("version");
+   map<string,int> mon, mgr, osd, mds;
+
+   count_metadata("ceph_version", &mon);
+   f->open_object_section("mon");
+   for (auto& p : mon) {
+     f->dump_int(p.first.c_str(), p.second);
+     overall[p.first] += p.second;
+   }
+   f->close_section();
+
+   mgrmon()->count_metadata("ceph_version", &mgr);
+   f->open_object_section("mgr");
+   for (auto& p : mgr) {
+     f->dump_int(p.first.c_str(), p.second);
+     overall[p.first] += p.second;
+   }
+   f->close_section();
+
+   osdmon()->count_metadata("ceph_version", &osd);
+   f->open_object_section("osd");
+   for (auto& p : osd) {
+     f->dump_int(p.first.c_str(), p.second);
+     overall[p.first] += p.second;
+   }
+   f->close_section();
+
+   mdsmon()->count_metadata("ceph_version", &mds);
+   f->open_object_section("mds");
+   // iterate the mds tallies; this loop previously walked `mon`, which
+   // reported monitor versions under "mds" and counted them twice in
+   // `overall`
+   for (auto& p : mds) {
+     f->dump_int(p.first.c_str(), p.second);
+     overall[p.first] += p.second;
+   }
+   f->close_section();
+
+   // one section per entry in the mgrstat service map
+   for (auto& p : mgrstatmon()->get_service_map().services) {
+     f->open_object_section(p.first.c_str());
+     map<string,int> m;
+     p.second.count_metadata("ceph_version", &m);
+     for (auto& q : m) {
+       f->dump_int(q.first.c_str(), q.second);
+       overall[q.first] += q.second;
+     }
+     f->close_section();
+   }
+
+   f->open_object_section("overall");
+   for (auto& p : overall) {
+     f->dump_int(p.first.c_str(), p.second);
+   }
+   f->close_section();
+   f->close_section();
+   f->flush(rdata);
+   rs = "";
+   r = 0;
+ }
out:
return 0;
}
-void Monitor::count_metadata(const string& field, Formatter *f)
+void Monitor::count_metadata(const string& field, map<string,int> *out)
{
- map<string,int> by_val;
for (auto& p : mon_metadata) {
auto q = p.second.find(field);
if (q == p.second.end()) {
- by_val["unknown"]++;
+ (*out)["unknown"]++;
} else {
- by_val[q->second]++;
+ (*out)[q->second]++;
}
}
+}
+
+void Monitor::count_metadata(const string& field, Formatter *f)
+{
+ map<string,int> by_val;
+ count_metadata(field, &by_val);
f->open_object_section(field.c_str());
for (auto& p : by_val) {
f->dump_int(p.first.c_str(), p.second);
#include "Paxos.h"
#include "Session.h"
#include "PGStatService.h"
+#include "MonCommand.h"
#include "common/LogClient.h"
#include "auth/cephx/CephxKeyServer.h"
struct MForward;
struct MTimeCheck;
struct MMonHealth;
-struct MonCommand;
#define COMPAT_SET_LOC "feature_set"
static void _generate_command_map(map<string,cmd_vartype>& cmdmap,
map<string,string> ¶m_str_map);
- static const MonCommand *_get_moncommand(const string &cmd_prefix,
- MonCommand *cmds, int cmds_size);
+ static const MonCommand *_get_moncommand(
+ const string &cmd_prefix,
+ const MonCommand *cmds,
+ int cmds_size);
bool _allowed_command(MonSession *s, string &module, string &prefix,
const map<string,cmd_vartype>& cmdmap,
const map<string,string>& param_str_map,
void update_mon_metadata(int from, Metadata&& m);
int load_metadata();
void count_metadata(const string& field, Formatter *f);
+ void count_metadata(const string& field, map<string,int> *out);
// features
static CompatSet get_initial_supported_features();
Monitor& operator=(const Monitor &rhs);
public:
- static void format_command_descriptions(const MonCommand *commands,
- unsigned commands_size,
+ static void format_command_descriptions(const std::vector<MonCommand> &commands,
Formatter *f,
bufferlist *rdata,
bool hide_mgr_flag=false);
#define CEPH_MON_FEATURE_INCOMPAT_KRAKEN CompatSet::Feature(8, "support monmap features")
// make sure you add your feature to Monitor::get_supported_features
-struct MonCommand {
- string cmdstring;
- string helpstring;
- string module;
- string req_perms;
- string availability;
- uint64_t flags;
-
- // MonCommand flags
- static const uint64_t FLAG_NONE = 0;
- static const uint64_t FLAG_NOFORWARD = 1 << 0;
- static const uint64_t FLAG_OBSOLETE = 1 << 1;
- static const uint64_t FLAG_DEPRECATED = 1 << 2;
- static const uint64_t FLAG_MGR = 1 << 3;
-
- bool has_flag(uint64_t flag) const { return (flags & flag) != 0; }
- void set_flag(uint64_t flag) { flags |= flag; }
- void unset_flag(uint64_t flag) { flags &= ~flag; }
-
- void encode(bufferlist &bl) const {
- /*
- * very naughty: deliberately unversioned because individual commands
- * shouldn't be encoded standalone, only as a full set (which we do
- * version, see encode_array() below).
- */
- ::encode(cmdstring, bl);
- ::encode(helpstring, bl);
- ::encode(module, bl);
- ::encode(req_perms, bl);
- ::encode(availability, bl);
- }
- void decode(bufferlist::iterator &bl) {
- ::decode(cmdstring, bl);
- ::decode(helpstring, bl);
- ::decode(module, bl);
- ::decode(req_perms, bl);
- ::decode(availability, bl);
- }
- bool is_compat(const MonCommand* o) const {
- return cmdstring == o->cmdstring &&
- module == o->module && req_perms == o->req_perms &&
- availability == o->availability;
- }
-
- bool is_noforward() const {
- return has_flag(MonCommand::FLAG_NOFORWARD);
- }
-
- bool is_obsolete() const {
- return has_flag(MonCommand::FLAG_OBSOLETE);
- }
-
- bool is_deprecated() const {
- return has_flag(MonCommand::FLAG_DEPRECATED);
- }
- bool is_mgr() const {
- return has_flag(MonCommand::FLAG_MGR);
- }
-
- static void encode_array(const MonCommand *cmds, int size, bufferlist &bl) {
- ENCODE_START(2, 1, bl);
- uint16_t s = size;
- ::encode(s, bl);
- ::encode_array_nohead(cmds, size, bl);
- for (int i = 0; i < size; i++)
- ::encode(cmds[i].flags, bl);
- ENCODE_FINISH(bl);
- }
- static void decode_array(MonCommand **cmds, int *size,
- bufferlist::iterator &bl) {
- DECODE_START(2, bl);
- uint16_t s = 0;
- ::decode(s, bl);
- *size = s;
- *cmds = new MonCommand[*size];
- ::decode_array_nohead(*cmds, *size, bl);
- if (struct_v >= 2) {
- for (int i = 0; i < *size; i++)
- ::decode((*cmds)[i].flags, bl);
- } else {
- for (int i = 0; i < *size; i++)
- (*cmds)[i].flags = 0;
- }
- DECODE_FINISH(bl);
- }
-
- bool requires_perm(char p) const {
- return (req_perms.find(p) != string::npos);
- }
-};
-WRITE_CLASS_ENCODER(MonCommand)
#endif
#include "json_spirit/json_spirit_reader.h"
+#include <boost/algorithm/string/predicate.hpp>
+
#define dout_subsys ceph_subsys_mon
#define OSD_PG_CREATING_PREFIX "osd_pg_creating"
+namespace {
+
+const uint32_t MAX_POOL_APPLICATIONS = 4;
+const uint32_t MAX_POOL_APPLICATION_KEYS = 64;
+const uint32_t MAX_POOL_APPLICATION_LENGTH = 128;
+
+} // anonymous namespace
+
void LastEpochClean::Lec::report(ps_t ps, epoch_t last_epoch_clean)
{
if (epoch_by_pg.size() <= ps) {
derr << __func__ << " mon_debug_no_require_luminous=true" << dendl;
} else {
newmap.require_osd_release = CEPH_RELEASE_LUMINOUS;
+ newmap.flags |= CEPH_OSDMAP_RECOVERY_DELETES;
newmap.full_ratio = g_conf->mon_osd_full_ratio;
if (newmap.full_ratio > 1.0) newmap.full_ratio /= 100;
newmap.backfillfull_ratio = g_conf->mon_osd_backfillfull_ratio;
}
dout(10) << __func__ << " queue remaining: " << pending_creatings.queue.size()
<< " pools" << dendl;
- dout(10) << __func__ << " " << pending_creatings.pgs.size() - total
+ dout(10) << __func__
+ << " " << (pending_creatings.pgs.size() - total)
+ << "/" << pending_creatings.pgs.size()
<< " pgs added from queued pools" << dendl;
return pending_creatings;
}
int next_up_primary, next_acting_primary;
next.pg_to_up_acting_osds(pgid, &next_up, &next_up_primary,
&next_acting, &next_acting_primary);
- if (acting == next_acting)
+ if (acting == next_acting && next_up != next_acting)
return; // no change since last epoch
if (acting.empty())
if (pool && acting.size() < pool->min_size)
return; // can be no worse off than before
+ if (next_up == next_acting) {
+ // up and acting agree in the next map: empty 'acting' so that the value
+ // primed below is an empty set (per the log message, this is meant to
+ // clear the pg_temp entry).
+ acting.clear();
+ // leading space keeps the message from running into the function name
+ dout(20) << __func__ << " next_up == next_acting now, clear pg_temp"
+ << dendl;
+ }
+
dout(20) << __func__ << " " << pgid << " " << up << "/" << acting
<< " -> " << next_up << "/" << next_acting
<< ", priming " << acting
}
mapping_job.reset();
+ // ensure we don't have blank new_state updates. these are interpreted as
+ // CEPH_OSD_UP (and almost certainly not what we want!).
+ auto p = pending_inc.new_state.begin();
+ while (p != pending_inc.new_state.end()) {
+ if (p->second == 0) {
+ dout(10) << "new_state for osd." << p->first << " is 0, removing" << dendl;
+ p = pending_inc.new_state.erase(p);
+ } else {
+ ++p;
+ }
+ }
+
bufferlist bl;
{
pending_inc.new_erasure_code_profiles[p.first] = newprofile;
}
}
+
+ // auto-enable pool applications upon upgrade
+ // NOTE: this can be removed post-Luminous assuming upgrades need to
+ // proceed through Luminous
+ for (auto &pool_pair : tmp.pools) {
+ int64_t pool_id = pool_pair.first;
+ pg_pool_t pg_pool = pool_pair.second;
+ if (pg_pool.is_tier()) {
+ continue;
+ }
+
+ std::string pool_name = tmp.get_pool_name(pool_id);
+ uint32_t match_count = 0;
+
+ // CephFS
+ FSMap const &pending_fsmap = mon->mdsmon()->get_pending();
+ if (pending_fsmap.pool_in_use(pool_id)) {
+ dout(10) << __func__ << " auto-enabling CephFS on pool '"
+ << pool_name << "'" << dendl;
+ pg_pool.application_metadata.insert(
+ {pg_pool_t::APPLICATION_NAME_CEPHFS, {}});
+ ++match_count;
+ }
+
+ // RBD heuristics (default OpenStack pool names from docs and
+ // ceph-ansible)
+ if (boost::algorithm::contains(pool_name, "rbd") ||
+ pool_name == "images" || pool_name == "volumes" ||
+ pool_name == "backups" || pool_name == "vms") {
+ dout(10) << __func__ << " auto-enabling RBD on pool '"
+ << pool_name << "'" << dendl;
+ pg_pool.application_metadata.insert(
+ {pg_pool_t::APPLICATION_NAME_RBD, {}});
+ ++match_count;
+ }
+
+ // RGW heuristics
+ if (boost::algorithm::contains(pool_name, ".rgw") ||
+ boost::algorithm::contains(pool_name, ".log") ||
+ boost::algorithm::contains(pool_name, ".intent-log") ||
+ boost::algorithm::contains(pool_name, ".usage") ||
+ boost::algorithm::contains(pool_name, ".users")) {
+ dout(10) << __func__ << " auto-enabling RGW on pool '"
+ << pool_name << "'" << dendl;
+ pg_pool.application_metadata.insert(
+ {pg_pool_t::APPLICATION_NAME_RGW, {}});
+ ++match_count;
+ }
+
+ // OpenStack gnocchi (from ceph-ansible)
+ if (pool_name == "metrics" && match_count == 0) {
+ dout(10) << __func__ << " auto-enabling OpenStack Gnocchi on pool '"
+ << pool_name << "'" << dendl;
+ pg_pool.application_metadata.insert({"openstack_gnocchi", {}});
+ ++match_count;
+ }
+
+ if (match_count == 1) {
+ pg_pool.last_change = pending_inc.epoch;
+ pending_inc.new_pools[pool_id] = pg_pool;
+ } else if (match_count > 1) {
+ auto pstat = mon->pgservice->get_pool_stat(pool_id);
+ if (pstat != nullptr && pstat->stats.sum.num_objects > 0) {
+ mon->clog->info() << "unable to auto-enable application for pool "
+ << "'" << pool_name << "'";
+ }
+ }
+ }
}
}
}
return 0;
}
-void OSDMonitor::count_metadata(const string& field, Formatter *f)
+void OSDMonitor::count_metadata(const string& field, map<string,int> *out) // tally up OSDs by their metadata value for 'field'; counts are added to *out, which is not cleared first
{
- map<string,int> by_val;
for (int osd = 0; osd < osdmap.get_max_osd(); ++osd) {
if (osdmap.is_up(osd)) {
map<string,string> meta;
load_metadata(osd, meta, nullptr);
auto p = meta.find(field);
if (p == meta.end()) {
- by_val["unknown"]++;
+ (*out)["unknown"]++; // loaded metadata has no such field
} else {
- by_val[p->second]++;
+ (*out)[p->second]++; // bucket by the field's value
}
}
}
+}
+
+void OSDMonitor::count_metadata(const string& field, Formatter *f)
+{
+ map<string,int> by_val;
+ count_metadata(field, &by_val);
f->open_object_section(field.c_str());
for (auto& p : by_val) {
f->dump_int(p.first.c_str(), p.second);
goto ignore;
}
+ if (osdmap.test_flag(CEPH_OSDMAP_RECOVERY_DELETES) &&
+ !(m->osd_features & CEPH_FEATURE_OSD_RECOVERY_DELETES)) {
+ mon->clog->info() << "disallowing boot of OSD "
+ << m->get_orig_source_inst()
+ << " because 'recovery_deletes' osdmap flag is set and OSD lacks the OSD_RECOVERY_DELETES feature";
+ goto ignore;
+ }
+
if (any_of(osdmap.get_pools().begin(),
osdmap.get_pools().end(),
[](const std::pair<int64_t,pg_pool_t>& pool)
}
}
+// Stage enabling application 'app_name' on pool 'pool_id' in pending_inc.
+// Asserts that paxos is plugged and that the pool exists in the current
+// osdmap. Enabling an application that is already present leaves its
+// existing key/value metadata untouched (insert does not overwrite).
+void OSDMonitor::do_application_enable(int64_t pool_id,
+ const std::string &app_name)
+{
+ assert(paxos->is_plugged());
+
+ dout(20) << __func__ << ": pool_id=" << pool_id << ", app_name=" << app_name
+ << dendl;
+
+ auto committed = osdmap.get_pg_pool(pool_id);
+ assert(committed != nullptr);
+
+ // Begin with the committed pool, but prefer a copy that is already staged
+ // in the pending incremental so earlier pending changes are not lost.
+ pg_pool_t updated = *committed;
+ auto staged = pending_inc.new_pools.find(pool_id);
+ if (staged != pending_inc.new_pools.end()) {
+ updated = staged->second;
+ }
+
+ updated.application_metadata.insert({app_name, {}});
+ updated.last_change = pending_inc.epoch;
+ pending_inc.new_pools[pool_id] = updated;
+}
+
unsigned OSDMonitor::scan_for_creating_pgs(
const mempool::osdmap::map<int64_t,pg_pool_t>& pools,
const mempool::osdmap::set<int64_t>& removed_pools,
<< creating_pgs.queue.size() << " pools in queue" << dendl;
decltype(creating_pgs_by_osd_epoch) new_pgs_by_osd_epoch;
std::lock_guard<std::mutex> l(creating_pgs_lock);
- for (auto& pg : creating_pgs.pgs) {
+ for (const auto& pg : creating_pgs.pgs) {
int acting_primary = -1;
auto pgid = pg.first;
auto mapped = pg.second.first;
- dout(20) << __func__ << " looking up " << pgid << dendl;
+ dout(20) << __func__ << " looking up " << pgid << "@" << mapped << dendl;
mapping.get(pgid, nullptr, nullptr, nullptr, &acting_primary);
// check the previous creating_pgs, look for the target to whom the pg was
// previously mapped
}
}
dout(10) << __func__ << " will instruct osd." << acting_primary
- << " to create " << pgid << dendl;
+ << " to create " << pgid << "@" << mapped << dendl;
new_pgs_by_osd_epoch[acting_primary][mapped].insert(pgid);
}
creating_pgs_by_osd_epoch = std::move(new_pgs_by_osd_epoch);
creating_pgs_epoch = mapping.get_epoch();
}
-epoch_t OSDMonitor::send_pg_creates(int osd, Connection *con, epoch_t next)
+epoch_t OSDMonitor::send_pg_creates(int osd, Connection *con, epoch_t next) const
{
dout(30) << __func__ << " osd." << osd << " next=" << next
<< " " << creating_pgs_by_osd_epoch << dendl;
m = new MOSDPGCreate(creating_pgs_epoch);
// Need the create time from the monitor using its clock to set
// last_scrub_stamp upon pg creation.
- const auto& creation = creating_pgs.pgs[pg];
- m->mkpg.emplace(pg, pg_create_t{creation.first, pg, 0});
- m->ctimes.emplace(pg, creation.second);
+ auto create = creating_pgs.pgs.find(pg);
+ assert(create != creating_pgs.pgs.end());
+ m->mkpg.emplace(pg, pg_create_t{create->second.first, pg, 0});
+ m->ctimes.emplace(pg, create->second.second);
dout(20) << __func__ << " will create " << pg
- << " at " << creation.first << dendl;
+ << " at " << create->second.first << dendl;
}
}
if (!m) {
}
}
- if (g_conf->mon_osd_down_out_interval > 0 &&
- down.sec() >= grace) {
+ bool down_out = !osdmap.is_destroyed(o) &&
+ g_conf->mon_osd_down_out_interval > 0 && down.sec() >= grace;
+ bool destroyed_out = osdmap.is_destroyed(o) &&
+ g_conf->mon_osd_destroyed_out_interval > 0 &&
+ // this is not precise enough as we did not make a note when this osd
+ // was marked as destroyed, but let's not bother with that
+ // complexity for now.
+ down.sec() >= g_conf->mon_osd_destroyed_out_interval;
+ if (down_out || destroyed_out) {
dout(10) << "tick marking osd." << o << " OUT after " << down
<< " sec (target " << grace << " = " << orig_grace << " + " << my_grace << ")" << dendl;
pending_inc.new_weight[o] = CEPH_OSD_OUT;
for (int i=0; i < max_osd; ++i) {
dout(30) << __func__ << ": checking up on osd " << i << dendl;
+ if (!osdmap.exists(i)) {
+ last_osd_report.erase(i); // if any
+ continue;
+ }
if (!osdmap.is_up(i))
continue;
const std::map<int,utime_t>::const_iterator t = last_osd_report.find(i);
}
}
- if (osdmap.require_osd_release >= CEPH_RELEASE_LUMINOUS) {
- // An osd could configure failsafe ratio, to something different
- // but for now assume it is the same here.
- float fsr = g_conf->osd_failsafe_full_ratio;
- if (fsr > 1.0) fsr /= 100;
- float fr = osdmap.get_full_ratio();
- float br = osdmap.get_backfillfull_ratio();
- float nr = osdmap.get_nearfull_ratio();
-
- bool out_of_order = false;
- // These checks correspond to how OSDService::check_full_status() in an OSD
- // handles the improper setting of these values.
- if (br < nr) {
- out_of_order = true;
- if (detail) {
- ostringstream ss;
- ss << "backfillfull_ratio (" << br << ") < nearfull_ratio (" << nr << "), increased";
- detail->push_back(make_pair(HEALTH_ERR, ss.str()));
- }
- br = nr;
- }
- if (fr < br) {
- out_of_order = true;
- if (detail) {
- ostringstream ss;
- ss << "full_ratio (" << fr << ") < backfillfull_ratio (" << br << "), increased";
- detail->push_back(make_pair(HEALTH_ERR, ss.str()));
- }
- fr = br;
- }
- if (fsr < fr) {
- out_of_order = true;
- if (detail) {
- ostringstream ss;
- ss << "osd_failsafe_full_ratio (" << fsr << ") < full_ratio (" << fr << "), increased";
- detail->push_back(make_pair(HEALTH_ERR, ss.str()));
- }
- }
- if (out_of_order) {
- ostringstream ss;
- ss << "Full ratio(s) out of order";
- summary.push_back(make_pair(HEALTH_ERR, ss.str()));
- }
-
- set<int> full, backfillfull, nearfull;
- osdmap.get_full_osd_counts(&full, &backfillfull, &nearfull);
- if (full.size()) {
- ostringstream ss;
- ss << full.size() << " full osd(s)";
- summary.push_back(make_pair(HEALTH_ERR, ss.str()));
- }
- if (backfillfull.size()) {
- ostringstream ss;
- ss << backfillfull.size() << " backfillfull osd(s)";
- summary.push_back(make_pair(HEALTH_WARN, ss.str()));
- }
- if (nearfull.size()) {
- ostringstream ss;
- ss << nearfull.size() << " nearfull osd(s)";
- summary.push_back(make_pair(HEALTH_WARN, ss.str()));
- }
- if (detail) {
- for (auto& i: full) {
- ostringstream ss;
- ss << "osd." << i << " is full";
- detail->push_back(make_pair(HEALTH_ERR, ss.str()));
- }
- for (auto& i: backfillfull) {
- ostringstream ss;
- ss << "osd." << i << " is backfill full";
- detail->push_back(make_pair(HEALTH_WARN, ss.str()));
- }
- for (auto& i: nearfull) {
- ostringstream ss;
- ss << "osd." << i << " is near full";
- detail->push_back(make_pair(HEALTH_WARN, ss.str()));
- }
- }
-
- // warn if there is any noup osds.
- vector<int> noup_osds;
- osdmap.get_noup_osds(&noup_osds);
- if (noup_osds.size()) {
- ostringstream ss;
- ss << noup_osds.size() << " noup osd(s)";
- summary.push_back(make_pair(HEALTH_WARN, ss.str()));
- if (detail) {
- ss << ": " << noup_osds;
- detail->push_back(make_pair(HEALTH_WARN, ss.str()));
- }
- }
-
- // warn if there is any nodown osds.
- vector<int> nodown_osds;
- osdmap.get_nodown_osds(&nodown_osds);
- if (nodown_osds.size()) {
- ostringstream ss;
- ss << nodown_osds.size() << " nodown osd(s)";
- summary.push_back(make_pair(HEALTH_WARN, ss.str()));
- if (detail) {
- ss << ": " << nodown_osds;
- detail->push_back(make_pair(HEALTH_WARN, ss.str()));
- }
- }
-
- // warn if there is any noin osds.
- vector<int> noin_osds;
- osdmap.get_noin_osds(&noin_osds);
- if (noin_osds.size()) {
- ostringstream ss;
- ss << noin_osds.size() << " noin osd(s)";
- summary.push_back(make_pair(HEALTH_WARN, ss.str()));
- if (detail) {
- ss << ": " << noin_osds;
- detail->push_back(make_pair(HEALTH_WARN, ss.str()));
- }
- }
-
- // warn if there is any noout osds.
- vector<int> noout_osds;
- osdmap.get_noout_osds(&noout_osds);
- if (noout_osds.size()) {
- ostringstream ss;
- ss << noout_osds.size() << " noout osd(s)";
- summary.push_back(make_pair(HEALTH_WARN, ss.str()));
- if (detail) {
- ss << ": " << noout_osds;
- detail->push_back(make_pair(HEALTH_WARN, ss.str()));
- }
- }
- }
// note: we leave it to ceph-mgr to generate details health warnings
// with actual osd utilizations
filter |= OSDMap::DUMP_IN;
} else if (s == "out") {
filter |= OSDMap::DUMP_OUT;
+ } else if (s == "destroyed") {
+ filter |= OSDMap::DUMP_DESTROYED;
} else {
ss << "unrecognized state '" << s << "'";
r = -EINVAL;
}
}
if ((filter & (OSDMap::DUMP_IN|OSDMap::DUMP_OUT)) ==
- (OSDMap::DUMP_IN|OSDMap::DUMP_OUT) ||
- (filter & (OSDMap::DUMP_UP|OSDMap::DUMP_DOWN)) ==
- (OSDMap::DUMP_UP|OSDMap::DUMP_DOWN)) {
- ss << "cannot specify both up and down or both in and out";
+ (OSDMap::DUMP_IN|OSDMap::DUMP_OUT)) {
+ ss << "cannot specify both 'in' and 'out'";
+ r = -EINVAL;
+ goto reply;
+ }
+ if (((filter & (OSDMap::DUMP_UP|OSDMap::DUMP_DOWN)) ==
+ (OSDMap::DUMP_UP|OSDMap::DUMP_DOWN)) ||
+ ((filter & (OSDMap::DUMP_UP|OSDMap::DUMP_DESTROYED)) ==
+ (OSDMap::DUMP_UP|OSDMap::DUMP_DESTROYED)) ||
+ ((filter & (OSDMap::DUMP_DOWN|OSDMap::DUMP_DESTROYED)) ==
+ (OSDMap::DUMP_DOWN|OSDMap::DUMP_DESTROYED))) {
+ ss << "can specify only one of 'up', 'down' and 'destroyed'";
r = -EINVAL;
goto reply;
}
CACHE_TARGET_DIRTY_RATIO, CACHE_TARGET_DIRTY_HIGH_RATIO,
CACHE_MIN_FLUSH_AGE, CACHE_MIN_EVICT_AGE,
MIN_READ_RECENCY_FOR_PROMOTE,
+ MIN_WRITE_RECENCY_FOR_PROMOTE,
HIT_SET_GRADE_DECAY_RATE, HIT_SET_SEARCH_LAST_N
};
const choices_set_t ONLY_ERASURE_CHOICES = {
for(choices_set_t::const_iterator it = selected_choices.begin();
it != selected_choices.end(); ++it) {
choices_map_t::const_iterator i;
- f->open_object_section("pool");
- f->dump_string("pool", poolstr);
- f->dump_int("pool_id", pool);
+ for (i = ALL_CHOICES.begin(); i != ALL_CHOICES.end(); ++i) {
+ if (i->second == *it) {
+ break;
+ }
+ }
+ assert(i != ALL_CHOICES.end());
+ bool pool_opt = pool_opts_t::is_opt_name(i->first);
+ if (!pool_opt) {
+ f->open_object_section("pool");
+ f->dump_string("pool", poolstr);
+ f->dump_int("pool_id", pool);
+ }
switch(*it) {
case PG_NUM:
f->dump_int("pg_num", p->get_pg_num());
case WRITE_FADVISE_DONTNEED:
case NOSCRUB:
case NODEEP_SCRUB:
- for (i = ALL_CHOICES.begin(); i != ALL_CHOICES.end(); ++i) {
- if (i->second == *it)
- break;
- }
- assert(i != ALL_CHOICES.end());
f->dump_string(i->first.c_str(),
p->has_flag(pg_pool_t::get_flag_by_name(i->first)) ?
"true" : "false");
case CSUM_TYPE:
case CSUM_MAX_BLOCK:
case CSUM_MIN_BLOCK:
- for (i = ALL_CHOICES.begin(); i != ALL_CHOICES.end(); ++i) {
- if (i->second == *it)
- break;
- }
- assert(i != ALL_CHOICES.end());
- if(*it == CSUM_TYPE) {
- int val;
- p->opts.get(pool_opts_t::CSUM_TYPE, &val);
- f->dump_string(i->first.c_str(), Checksummer::get_csum_type_string(val));
- }
- else {
- p->opts.dump(i->first, f.get());
+ pool_opts_t::key_t key = pool_opts_t::get_opt_desc(i->first).key;
+ if (p->opts.is_set(key)) {
+ f->open_object_section("pool");
+ f->dump_string("pool", poolstr);
+ f->dump_int("pool_id", pool);
+ if(*it == CSUM_TYPE) {
+ int val;
+ p->opts.get(pool_opts_t::CSUM_TYPE, &val);
+ f->dump_string(i->first.c_str(), Checksummer::get_csum_type_string(val));
+ } else {
+ p->opts.dump(i->first, f.get());
+ }
+ f->close_section();
+ f->flush(rdata);
}
break;
}
- f->close_section();
- f->flush(rdata);
+ if (!pool_opt) {
+ f->close_section();
+ f->flush(rdata);
+ }
}
} else /* !f */ {
r = 0;
} else if (prefix == "osd crush rule list" ||
prefix == "osd crush rule ls") {
- string format;
- cmd_getval(g_ceph_context, cmdmap, "format", format);
- boost::scoped_ptr<Formatter> f(Formatter::create(format, "json-pretty", "json-pretty"));
- f->open_array_section("rules");
- osdmap.crush->list_rules(f.get());
- f->close_section();
- ostringstream rs;
- f->flush(rs);
- rs << "\n";
- rdata.append(rs.str());
+ if (f) {
+ f->open_array_section("rules");
+ osdmap.crush->list_rules(f.get());
+ f->close_section();
+ f->flush(rdata);
+ } else {
+ ostringstream ss;
+ osdmap.crush->list_rules(&ss);
+ rdata.append(ss.str());
+ }
} else if (prefix == "osd crush rule dump") {
string name;
cmd_getval(g_ceph_context, cmdmap, "name", name);
rs << "\n";
rdata.append(rs.str());
} else if (prefix == "osd crush tree") {
- boost::scoped_ptr<Formatter> f(Formatter::create(format, "json-pretty", "json-pretty"));
- f->open_array_section("crush_map_roots");
- osdmap.crush->dump_tree(f.get());
- f->close_section();
- f->flush(rdata);
+ string shadow;
+ cmd_getval(g_ceph_context, cmdmap, "shadow", shadow);
+ bool show_shadow = shadow == "--show-shadow";
+ boost::scoped_ptr<Formatter> f(Formatter::create(format));
+ if (f) {
+ osdmap.crush->dump_tree(nullptr,
+ f.get(),
+ osdmap.get_pool_names(),
+ show_shadow);
+ f->flush(rdata);
+ } else {
+ ostringstream ss;
+ osdmap.crush->dump_tree(&ss,
+ nullptr,
+ osdmap.get_pool_names(),
+ show_shadow);
+ rdata.append(ss.str());
+ }
} else if (prefix == "osd crush class ls") {
boost::scoped_ptr<Formatter> f(Formatter::create(format, "json-pretty", "json-pretty"));
f->open_array_section("crush_classes");
rs << "\n";
rdata.append(rs.str());
}
+ } else if (prefix == "osd crush weight-set ls") {
+ boost::scoped_ptr<Formatter> f(Formatter::create(format));
+ if (f) {
+ f->open_array_section("weight_sets");
+ if (osdmap.crush->have_choose_args(CrushWrapper::DEFAULT_CHOOSE_ARGS)) {
+ f->dump_string("pool", "(compat)");
+ }
+ for (auto& i : osdmap.crush->choose_args) {
+ if (i.first >= 0) {
+ f->dump_string("pool", osdmap.get_pool_name(i.first));
+ }
+ }
+ f->close_section();
+ f->flush(rdata);
+ } else {
+ ostringstream rs;
+ if (osdmap.crush->have_choose_args(CrushWrapper::DEFAULT_CHOOSE_ARGS)) {
+ rs << "(compat)\n";
+ }
+ for (auto& i : osdmap.crush->choose_args) {
+ if (i.first >= 0) {
+ rs << osdmap.get_pool_name(i.first) << "\n";
+ }
+ }
+ rdata.append(rs.str());
+ }
+ } else if (prefix == "osd crush weight-set dump") {
+ boost::scoped_ptr<Formatter> f(Formatter::create(format, "json-pretty",
+ "json-pretty"));
+ osdmap.crush->dump_choose_args(f.get());
+ f->flush(rdata);
} else if (prefix == "osd erasure-code-profile get") {
string name;
cmd_getval(g_ceph_context, cmdmap, "name", name);
return -EEXIST;
return 0;
}
+ if (n > (unsigned)g_conf->mon_max_pool_pg_num) {
+ ss << "'pg_num' must be greater than 0 and less than or equal to "
+ << g_conf->mon_max_pool_pg_num
+ << " (you may adjust 'mon max pool pg num' for higher values)";
+ return -ERANGE;
+ }
string force;
cmd_getval(g_ceph_context,cmdmap, "force", force);
if (p.cache_mode != pg_pool_t::CACHEMODE_NONE &&
return 0;
}
+// Handle the pool application sub-commands, selected by the suffix of
+// 'prefix': "enable", "disable", "set" (a key/value) and "rm" (a key).
+//
+// Reads "pool", "app", and depending on the sub-command "force", "key" and
+// "value" from cmdmap. Human-readable status or error text is written to ss.
+//
+// Returns 0 on success, in which case the modified pool is staged in
+// pending_inc.new_pools with last_change set to pending_inc.epoch. Also
+// returns 0, without staging anything, for the idempotent no-op cases
+// (disabling an application that is not enabled, removing a missing key).
+// Returns -ENOENT for an unknown pool or (set/rm) an application that is not
+// enabled, -EINVAL for missing/oversized arguments, exceeded limits or
+// tier pools, and -EPERM when --yes-i-really-mean-it is required but absent.
+int OSDMonitor::prepare_command_pool_application(const string &prefix,
+ map<string,cmd_vartype> &cmdmap,
+ stringstream& ss)
+{
+ string pool_name;
+ cmd_getval(g_ceph_context, cmdmap, "pool", pool_name);
+ int64_t pool = osdmap.lookup_pg_pool_name(pool_name.c_str());
+ if (pool < 0) {
+ ss << "unrecognized pool '" << pool_name << "'";
+ return -ENOENT;
+ }
+
+ // work on a copy; prefer the already-pending version of the pool if any
+ pg_pool_t p = *osdmap.get_pg_pool(pool);
+ if (pending_inc.new_pools.count(pool)) {
+ p = pending_inc.new_pools[pool];
+ }
+
+ string app;
+ cmd_getval(g_ceph_context, cmdmap, "app", app);
+ bool app_exists = (p.application_metadata.count(app) > 0);
+
+ if (boost::algorithm::ends_with(prefix, "enable")) {
+ if (app.empty()) {
+ ss << "application name must be provided";
+ return -EINVAL;
+ }
+
+ if (p.is_tier()) {
+ ss << "application must be enabled on base tier";
+ return -EINVAL;
+ }
+
+ string force;
+ cmd_getval(g_ceph_context, cmdmap, "force", force);
+
+ // adding a second, different application needs explicit confirmation
+ if (!app_exists && !p.application_metadata.empty() &&
+ force != "--yes-i-really-mean-it") {
+ ss << "Are you SURE? Pool '" << pool_name << "' already has an enabled "
+ << "application; pass --yes-i-really-mean-it to proceed anyway";
+ return -EPERM;
+ }
+
+ if (!app_exists && p.application_metadata.size() >= MAX_POOL_APPLICATIONS) {
+ ss << "too many enabled applications on pool '" << pool_name << "'; "
+ << "max " << MAX_POOL_APPLICATIONS;
+ return -EINVAL;
+ }
+
+ if (app.length() > MAX_POOL_APPLICATION_LENGTH) {
+ ss << "application name '" << app << "' too long; max length "
+ << MAX_POOL_APPLICATION_LENGTH;
+ return -EINVAL;
+ }
+
+ if (!app_exists) {
+ p.application_metadata[app] = {};
+ }
+ ss << "enabled application '" << app << "' on pool '" << pool_name << "'";
+
+ } else if (boost::algorithm::ends_with(prefix, "disable")) {
+ string force;
+ cmd_getval(g_ceph_context, cmdmap, "force", force);
+
+ if (force != "--yes-i-really-mean-it") {
+ ss << "Are you SURE? Disabling an application within a pool might result "
+ << "in loss of application functionality; pass "
+ << "--yes-i-really-mean-it to proceed anyway";
+ return -EPERM;
+ }
+
+ if (!app_exists) {
+ ss << "application '" << app << "' is not enabled on pool '" << pool_name
+ << "'";
+ return 0; // idempotent
+ }
+
+ p.application_metadata.erase(app);
+ ss << "disable application '" << app << "' on pool '" << pool_name << "'";
+
+ } else if (boost::algorithm::ends_with(prefix, "set")) {
+ if (p.is_tier()) {
+ ss << "application metadata must be set on base tier";
+ return -EINVAL;
+ }
+
+ if (!app_exists) {
+ ss << "application '" << app << "' is not enabled on pool '" << pool_name
+ << "'";
+ return -ENOENT;
+ }
+
+ string key;
+ cmd_getval(g_ceph_context, cmdmap, "key", key);
+
+ if (key.empty()) {
+ ss << "key must be provided";
+ return -EINVAL;
+ }
+
+ // the key-count limit only applies when a new key would be added
+ auto &app_keys = p.application_metadata[app];
+ if (app_keys.count(key) == 0 &&
+ app_keys.size() >= MAX_POOL_APPLICATION_KEYS) {
+ ss << "too many keys set for application '" << app << "' on pool '"
+ << pool_name << "'; max " << MAX_POOL_APPLICATION_KEYS;
+ return -EINVAL;
+ }
+
+ if (key.length() > MAX_POOL_APPLICATION_LENGTH) {
+ // report the offending key itself, not the application name
+ ss << "key '" << key << "' too long; max length "
+ << MAX_POOL_APPLICATION_LENGTH;
+ return -EINVAL;
+ }
+
+ string value;
+ cmd_getval(g_ceph_context, cmdmap, "value", value);
+ if (value.length() > MAX_POOL_APPLICATION_LENGTH) {
+ ss << "value '" << value << "' too long; max length "
+ << MAX_POOL_APPLICATION_LENGTH;
+ return -EINVAL;
+ }
+
+ p.application_metadata[app][key] = value;
+ ss << "set application '" << app << "' key '" << key << "' to '"
+ << value << "' on pool '" << pool_name << "'";
+ } else if (boost::algorithm::ends_with(prefix, "rm")) {
+ if (!app_exists) {
+ ss << "application '" << app << "' is not enabled on pool '" << pool_name
+ << "'";
+ return -ENOENT;
+ }
+
+ string key;
+ cmd_getval(g_ceph_context, cmdmap, "key", key);
+ auto it = p.application_metadata[app].find(key);
+ if (it == p.application_metadata[app].end()) {
+ ss << "application '" << app << "' on pool '" << pool_name
+ << "' does not have key '" << key << "'";
+ return 0; // idempotent
+ }
+
+ p.application_metadata[app].erase(it);
+ ss << "removed application '" << app << "' key '" << key << "' on pool '"
+ << pool_name << "'";
+ } else {
+ // the caller dispatched a prefix this function does not handle
+ assert(false);
+ }
+
+ // stage the modified pool for the next osdmap epoch
+ p.last_change = pending_inc.epoch;
+ pending_inc.new_pools[pool] = p;
+ return 0;
+}
+
int OSDMonitor::_prepare_command_osd_crush_remove(
CrushWrapper &newcrush,
int32_t id,
assert(osdmap.is_destroyed(id));
pending_inc.new_weight[id] = CEPH_OSD_OUT;
pending_inc.new_state[id] |= CEPH_OSD_DESTROYED | CEPH_OSD_NEW;
+ if (osdmap.get_state(id) & CEPH_OSD_UP) {
+ // due to http://tracker.ceph.com/issues/20751 some clusters may
+ // have UP set for non-existent OSDs; make sure it is cleared
+ // for a newly created osd.
+ pending_inc.new_state[id] |= CEPH_OSD_UP;
+ }
pending_inc.new_uuid[id] = uuid;
} else {
assert(id >= 0);
return true;
}
+ } else if (prefix == "osd crush rm-device-class") {
+ bool stop = false;
+ vector<string> idvec;
+ cmd_getval(g_ceph_context, cmdmap, "ids", idvec);
+ CrushWrapper newcrush;
+ _get_pending_crush(newcrush);
+ set<int> updated;
+
+ for (unsigned j = 0; j < idvec.size() && !stop; j++) {
+ set<int> osds;
+
+ // wildcard?
+ if (j == 0 &&
+ (idvec[0] == "any" || idvec[0] == "all" || idvec[0] == "*")) {
+ osdmap.get_all_osds(osds);
+ stop = true;
+ } else {
+ // try traditional single osd way
+ long osd = parse_osd_id(idvec[j].c_str(), &ss);
+ if (osd < 0) {
+ // ss has reason for failure
+ ss << ", unable to parse osd id:\"" << idvec[j] << "\". ";
+ err = -EINVAL;
+ goto reply;
+ }
+ osds.insert(osd);
+ }
+
+ // Strip the device class from each requested osd; osds that do not exist
+ // or have no class are reported in ss and skipped rather than failing.
+ for (auto &osd : osds) {
+ if (!osdmap.exists(osd)) {
+ ss << "osd." << osd << " does not exist. ";
+ continue;
+ }
+
+ auto class_name = newcrush.get_item_class(osd);
+ if (!class_name) {
+ ss << "osd." << osd << " belongs to no class, ";
+ continue;
+ }
+ // note that we do not verify class_is_in_use here, in case the
+ // device is misclassified and the user wants to forcibly reset it
+
+ err = newcrush.remove_device_class(g_ceph_context, osd, &ss);
+ if (err < 0) {
+ // ss has reason for failure
+ goto reply;
+ }
+ updated.insert(osd);
+ }
+ }
+
+ if (!updated.empty()) {
+ pending_inc.crush.clear();
+ newcrush.encode(pending_inc.crush, mon->get_quorum_con_features());
+ ss << "done removing class of osd(s): " << updated;
+ getline(ss, rs);
+ wait_for_finished_proposal(op,
+ new Monitor::C_Command(mon,op, 0, rs, get_last_committed() + 1));
+ return true;
+ }
+
} else if (prefix == "osd crush add-bucket") {
// os crush add-bucket <name> <type>
string name, typestr;
goto reply;
else
goto update;
- } else if (prefix == "osd crush class create") {
- string device_class;
- if (!cmd_getval(g_ceph_context, cmdmap, "class", device_class)) {
- err = -EINVAL; // no value!
- goto reply;
- }
- if (osdmap.require_osd_release < CEPH_RELEASE_LUMINOUS) {
- ss << "you must complete the upgrade and 'ceph osd require-osd-release "
- << "luminous' before using crush device classes";
- err = -EPERM;
- goto reply;
- }
- if (!_have_pending_crush() &&
- _get_stable_crush().class_exists(device_class)) {
- ss << "class '" << device_class << "' already exists";
- goto reply;
- }
-
- CrushWrapper newcrush;
- _get_pending_crush(newcrush);
-
- if (newcrush.class_exists(name)) {
- ss << "class '" << device_class << "' already exists";
- goto update;
- }
-
- int class_id = newcrush.get_or_create_class_id(device_class);
-
- pending_inc.crush.clear();
- newcrush.encode(pending_inc.crush, mon->get_quorum_con_features());
- ss << "created class " << device_class << " with id " << class_id
- << " to crush map";
- goto update;
-
} else if (prefix == "osd crush class rm") {
string device_class;
if (!cmd_getval(g_ceph_context, cmdmap, "class", device_class)) {
int class_id = newcrush.get_class_id(device_class);
- if (newcrush.class_is_in_use(class_id)) {
+ stringstream ts;
+ if (newcrush.class_is_in_use(class_id, &ts)) {
err = -EBUSY;
- ss << "class '" << device_class << "' is in use";
+ ss << "class '" << device_class << "' " << ts.str();
goto reply;
}
- err = newcrush.remove_class_name(device_class);
- if (err < 0) {
- ss << "class '" << device_class << "' cannot be removed '"
- << cpp_strerror(err) << "'";
- goto reply;
+ set<int> osds;
+ newcrush.get_devices_by_class(device_class, &osds);
+ for (auto& p: osds) {
+ err = newcrush.remove_device_class(g_ceph_context, p, &ss);
+ if (err < 0) {
+ // ss has reason for failure
+ goto reply;
+ }
+ }
+
+ if (osds.empty()) {
+ // empty class, remove directly
+ err = newcrush.remove_class_name(device_class);
+ if (err < 0) {
+ ss << "class '" << device_class << "' cannot be removed '"
+ << cpp_strerror(err) << "'";
+ goto reply;
+ }
}
pending_inc.crush.clear();
<< " from crush map";
goto update;
- } else if (prefix == "osd crush class rename") {
- string srcname, dstname;
- if (!cmd_getval(g_ceph_context, cmdmap, "srcname", srcname)) {
- err = -EINVAL;
- goto reply;
- }
- if (osdmap.require_osd_release < CEPH_RELEASE_LUMINOUS) {
- ss << "you must complete the upgrade and 'ceph osd require-osd-release "
- << "luminous' before using crush device classes";
+ } else if (prefix == "osd crush weight-set create" ||
+ prefix == "osd crush weight-set create-compat") {
+ CrushWrapper newcrush;
+ _get_pending_crush(newcrush);
+ int64_t pool;
+ int positions;
+ if (newcrush.has_non_straw2_buckets()) {
+ ss << "crush map contains one or more bucket(s) that are not straw2";
err = -EPERM;
goto reply;
}
-
- if (!cmd_getval(g_ceph_context, cmdmap, "dstname", dstname)) {
- err = -EINVAL;
- goto reply;
+ if (prefix == "osd crush weight-set create") {
+ if (osdmap.require_min_compat_client > 0 &&
+ osdmap.require_min_compat_client < CEPH_RELEASE_LUMINOUS) {
+ ss << "require_min_compat_client "
+ << ceph_release_name(osdmap.require_min_compat_client)
+ << " < luminous, which is required for per-pool weight-sets. "
+ << "Try 'ceph osd set-require-min-compat-client luminous' "
+ << "before using the new interface";
+ err = -EPERM;
+ goto reply;
+ }
+ string poolname, mode;
+ cmd_getval(g_ceph_context, cmdmap, "pool", poolname);
+ pool = osdmap.lookup_pg_pool_name(poolname.c_str());
+ if (pool < 0) {
+ ss << "pool '" << poolname << "' not found";
+ err = -ENOENT;
+ goto reply;
+ }
+ cmd_getval(g_ceph_context, cmdmap, "mode", mode);
+ if (mode != "flat" && mode != "positional") {
+ ss << "unrecognized weight-set mode '" << mode << "'";
+ err = -EINVAL;
+ goto reply;
+ }
+ positions = mode == "flat" ? 1 : osdmap.get_pg_pool(pool)->get_size();
+ } else {
+ pool = CrushWrapper::DEFAULT_CHOOSE_ARGS;
+ positions = 1;
}
+ newcrush.create_choose_args(pool, positions);
+ pending_inc.crush.clear();
+ newcrush.encode(pending_inc.crush, mon->get_quorum_con_features());
+ goto update;
+ } else if (prefix == "osd crush weight-set rm" ||
+ prefix == "osd crush weight-set rm-compat") {
CrushWrapper newcrush;
_get_pending_crush(newcrush);
-
- if (!newcrush.class_exists(srcname)) {
- err = -ENOENT;
- ss << "class '" << srcname << "' does not exist";
- goto reply;
+ int64_t pool;
+ if (prefix == "osd crush weight-set rm") {
+ string poolname;
+ cmd_getval(g_ceph_context, cmdmap, "pool", poolname);
+ pool = osdmap.lookup_pg_pool_name(poolname.c_str());
+ if (pool < 0) {
+ ss << "pool '" << poolname << "' not found";
+ err = -ENOENT;
+ goto reply;
+ }
+ } else {
+ pool = CrushWrapper::DEFAULT_CHOOSE_ARGS;
}
+ newcrush.rm_choose_args(pool);
+ pending_inc.crush.clear();
+ newcrush.encode(pending_inc.crush, mon->get_quorum_con_features());
+ goto update;
- if (newcrush.class_exists(dstname)) {
- err = -EEXIST;
- ss << "class '" << dstname << "' already exists";
- goto reply;
+ } else if (prefix == "osd crush weight-set reweight" ||
+ prefix == "osd crush weight-set reweight-compat") {
+ string poolname, item;
+ vector<double> weight;
+ cmd_getval(g_ceph_context, cmdmap, "pool", poolname);
+ cmd_getval(g_ceph_context, cmdmap, "item", item);
+ cmd_getval(g_ceph_context, cmdmap, "weight", weight);
+ CrushWrapper newcrush;
+ _get_pending_crush(newcrush);
+ int64_t pool;
+ if (prefix == "osd crush weight-set reweight") {
+ pool = osdmap.lookup_pg_pool_name(poolname.c_str());
+ if (pool < 0) {
+ ss << "pool '" << poolname << "' not found";
+ err = -ENOENT;
+ goto reply;
+ }
+ if (!newcrush.have_choose_args(pool)) {
+ ss << "no weight-set for pool '" << poolname << "'";
+ err = -ENOENT;
+ goto reply;
+ }
+ auto arg_map = newcrush.choose_args_get(pool);
+ int positions = newcrush.get_choose_args_positions(arg_map);
+ if (weight.size() != (size_t)positions) {
+ ss << "must specify exact " << positions << " weight values";
+ err = -EINVAL;
+ goto reply;
+ }
+ } else {
+ pool = CrushWrapper::DEFAULT_CHOOSE_ARGS;
+ if (!newcrush.have_choose_args(pool)) {
+ ss << "no backward-compatible weight-set";
+ err = -ENOENT;
+ goto reply;
+ }
}
-
- int class_id = newcrush.get_class_id(srcname);
-
- if (newcrush.class_is_in_use(class_id)) {
- err = -EBUSY;
- ss << "class '" << srcname << "' is in use";
+ if (!newcrush.name_exists(item)) {
+ ss << "item '" << item << "' does not exist";
+ err = -ENOENT;
goto reply;
}
-
- err = newcrush.rename_class(srcname, dstname);
+ err = newcrush.choose_args_adjust_item_weightf(
+ g_ceph_context,
+ newcrush.choose_args_get(pool),
+ newcrush.get_item_id(item),
+ weight,
+ &ss);
if (err < 0) {
- ss << "fail to rename '" << srcname << "' to '" << dstname << "':"
- << cpp_strerror(err);
goto reply;
}
-
+ err = 0;
pending_inc.crush.clear();
newcrush.encode(pending_inc.crush, mon->get_quorum_con_features());
- ss << "rename class '" << srcname << "' to '" << dstname << "'";
goto update;
-
} else if (osdid_present &&
(prefix == "osd crush set" || prefix == "osd crush add")) {
// <OsdName> is 'osd.<id>' or '<id>', passed as int64_t id
if (!osdmap.exists(osdid)) {
err = -ENOENT;
- ss << name << " does not exist. create it before updating the crush map";
+ ss << name << " does not exist. Create it before updating the crush map";
goto reply;
}
err = -EPERM;
goto reply;
}
+ } else if (key == "recovery_deletes") {
+ if (HAVE_FEATURE(osdmap.get_up_osd_features(), OSD_RECOVERY_DELETES)) {
+ return prepare_set_flag(op, CEPH_OSDMAP_RECOVERY_DELETES);
+ } else {
+ ss << "not all up OSDs have OSD_RECOVERY_DELETES feature";
+ err = -EPERM;
+ goto reply;
+ }
} else if (key == "require_jewel_osds") {
if (!osdmap.test_flag(CEPH_OSDMAP_SORTBITWISE)) {
ss << "the sortbitwise flag must be set before require_jewel_osds";
goto reply;
}
pending_inc.new_require_osd_release = rel;
+ if (rel >= CEPH_RELEASE_LUMINOUS &&
+ !osdmap.test_flag(CEPH_OSDMAP_RECOVERY_DELETES)) {
+ return prepare_set_flag(op, CEPH_OSDMAP_RECOVERY_DELETES);
+ }
goto update;
} else if (prefix == "osd cluster_snap") {
// ** DISABLE THIS FOR NOW **
err = -ENOENT;
goto reply;
}
- new_pg_upmap_items.push_back(make_pair(from, to));
+ pair<int32_t,int32_t> entry = make_pair(from, to);
+ auto it = std::find(new_pg_upmap_items.begin(),
+ new_pg_upmap_items.end(), entry);
+ if (it != new_pg_upmap_items.end()) {
+ ss << "osd." << from << " -> osd." << to << " already exists, ";
+ continue;
+ }
+ new_pg_upmap_items.push_back(entry);
items << from << "->" << to << ",";
}
string out(items.str());
wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, rs,
get_last_committed() + 1));
return true;
+ } else if (prefix == "osd pool application enable" ||
+ prefix == "osd pool application disable" ||
+ prefix == "osd pool application set" ||
+ prefix == "osd pool application rm") {
+ err = prepare_command_pool_application(prefix, cmdmap, ss);
+ if (err == -EAGAIN)
+ goto wait;
+ if (err < 0)
+ goto reply;
+ getline(ss, rs);
+ wait_for_finished_proposal(
+ op, new Monitor::C_Command(mon, op, 0, rs, get_last_committed() + 1));
+ return true;
} else if (prefix == "osd reweight-by-pg" ||
prefix == "osd reweight-by-utilization" ||
prefix == "osd test-reweight-by-pg" ||
new Monitor::C_Command(mon, op, 0, rs, rdata, get_last_committed() + 1));
return true;
}
+ } else if (prefix == "osd force-create-pg") {
+ pg_t pgid;
+ string pgidstr;
+ cmd_getval(g_ceph_context, cmdmap, "pgid", pgidstr);
+ if (!pgid.parse(pgidstr.c_str())) {
+ ss << "invalid pgid '" << pgidstr << "'";
+ err = -EINVAL;
+ goto reply;
+ }
+ bool creating_now;
+ {
+ std::lock_guard<std::mutex> l(creating_pgs_lock);
+ auto emplaced = creating_pgs.pgs.emplace(pgid,
+ make_pair(osdmap.get_epoch(),
+ ceph_clock_now()));
+ creating_now = emplaced.second;
+ }
+ if (creating_now) {
+ ss << "pg " << pgidstr << " now creating, ok";
+ err = 0;
+ goto update;
+ } else {
+ ss << "pg " << pgid << " already creating";
+ err = 0;
+ goto reply;
+ }
} else {
err = -EINVAL;
}
int load_metadata(int osd, map<string, string>& m, ostream *err);
void count_metadata(const string& field, Formatter *f);
+public:
+ void count_metadata(const string& field, map<string,int> *out);
+protected:
int get_osd_objectstore_type(int osd, std::string *type);
bool is_pool_currently_all_bluestore(int64_t pool_id, const pg_pool_t &pool,
ostream *err);
// the epoch when the pg mapping was calculated
epoch_t creating_pgs_epoch = 0;
creating_pgs_t creating_pgs;
- std::mutex creating_pgs_lock;
+ mutable std::mutex creating_pgs_lock;
creating_pgs_t update_pending_pgs(const OSDMap::Incremental& inc);
void trim_creating_pgs(creating_pgs_t *creating_pgs,
pair<int32_t, pg_t> get_parent_pg(pg_t pgid) const;
void update_creating_pgs();
void check_pg_creates_subs();
- epoch_t send_pg_creates(int osd, Connection *con, epoch_t next);
+ epoch_t send_pg_creates(int osd, Connection *con, epoch_t next) const;
int32_t _allocate_osd_id(int32_t* existing_id);
int prepare_command_pool_set(map<string,cmd_vartype> &cmdmap,
stringstream& ss);
+ int prepare_command_pool_application(const string &prefix,
+ map<string,cmd_vartype> &cmdmap,
+ stringstream& ss);
bool handle_osd_timeouts(const utime_t &now,
std::map<int,utime_t> &last_osd_report);
void check_osdmap_sub(Subscription *sub);
void check_pg_creates_sub(Subscription *sub);
+ void do_application_enable(int64_t pool_id, const std::string &app_name);
+
void add_flag(int flag) {
if (!(osdmap.flags & flag)) {
if (pending_inc.new_flags < 0)
const unsigned max = cct->_conf->mon_health_max_detail;
const auto& pools = osdmap.get_pools();
- checks->clear();
-
typedef enum pg_consequence_t {
UNAVAILABLE = 1, // Client IO to the pool may block
DEGRADED = 2, // Fewer than the requested number of replicas are present
std::map<unsigned, PgStateResponse> state_to_response = {
// Immediate reports
{ PG_STATE_INCONSISTENT, {DAMAGED, {}} },
- { PG_STATE_INCOMPLETE, {DEGRADED, {}} },
+ { PG_STATE_INCOMPLETE, {UNAVAILABLE, {}} },
{ PG_STATE_REPAIR, {DAMAGED, {}} },
{ PG_STATE_SNAPTRIM_ERROR, {DAMAGED, {}} },
- { PG_STATE_BACKFILL_TOOFULL, {DEGRADED, {}} },
- { PG_STATE_RECOVERY_TOOFULL, {DEGRADED, {}} },
+ { PG_STATE_BACKFILL_TOOFULL, {DEGRADED_FULL, {}} },
+ { PG_STATE_RECOVERY_TOOFULL, {DEGRADED_FULL, {}} },
{ PG_STATE_DEGRADED, {DEGRADED, {}} },
{ PG_STATE_DOWN, {UNAVAILABLE, {}} },
// Delayed (wait until stuck) reports
}
}
- // OSD_SKEWED_USAGE
- if (cct->_conf->mon_warn_osd_usage_min_max_delta) {
- int max_osd = -1, min_osd = -1;
- float max_osd_usage = 0.0, min_osd_usage = 1.0;
- for (auto p = osd_stat.begin(); p != osd_stat.end(); ++p) {
- // kb should never be 0, but avoid divide by zero in case of corruption
- if (p->second.kb <= 0)
- continue;
- float usage = ((float)p->second.kb_used) / ((float)p->second.kb);
- if (usage > max_osd_usage) {
- max_osd_usage = usage;
- max_osd = p->first;
- }
- if (usage < min_osd_usage) {
- min_osd_usage = usage;
- min_osd = p->first;
- }
- }
- float diff = max_osd_usage - min_osd_usage;
- if (diff > cct->_conf->mon_warn_osd_usage_min_max_delta) {
- auto& d = checks->add("OSD_SKEWED_USAGE", HEALTH_WARN,
- "skewed osd utilization");
- ostringstream ss;
- ss << "difference between min (osd." << min_osd << " at "
- << roundf(min_osd_usage*1000.0)/100.0
- << "%) and max (osd." << max_osd << " at "
- << roundf(max_osd_usage*1000.0)/100.0
- << "%) osd usage " << roundf(diff*1000.0)/100.0 << "% > "
- << roundf(cct->_conf->mon_warn_osd_usage_min_max_delta*1000.0)/100.0
- << " (mon_warn_osd_usage_min_max_delta)";
- d.detail.push_back(ss.str());
- }
- }
-
// OSD_SCRUB_ERRORS
if (pg_sum.stats.sum.num_scrub_errors) {
ostringstream ss;
ostringstream ss;
ss << pg_sum.stats.sum.num_objects_unfound
<< "/" << pg_sum.stats.sum.num_objects << " unfound (" << b << "%)";
- checks->add("OBJECT_UNFOUND", HEALTH_WARN, ss.str());
+ auto& d = checks->add("OBJECT_UNFOUND", HEALTH_WARN, ss.str());
+
+ for (auto& p : pg_stat) {
+ if (p.second.stats.sum.num_objects_unfound) {
+ ostringstream ss;
+ ss << "pg " << p.first
+ << " has " << p.second.stats.sum.num_objects_unfound
+ << " unfound objects";
+ d.detail.push_back(ss.str());
+ if (d.detail.size() > max) {
+ d.detail.push_back("(additional pgs left out for brevity)");
+ break;
+ }
+ }
+ }
}
// REQUEST_SLOW
// REQUEST_STUCK
if (cct->_conf->mon_osd_warn_op_age > 0 &&
- osd_sum.op_queue_age_hist.upper_bound() >
+ !osd_sum.op_queue_age_hist.h.empty() &&
+ osd_sum.op_queue_age_hist.upper_bound() / 1000.0 >
cct->_conf->mon_osd_warn_op_age) {
list<string> warn_detail, error_detail;
unsigned warn = 0, error = 0;
for (auto& p : warn_osd_by_max) {
ostringstream ss;
if (p.second.size() > 1) {
- ss << "osds " << p.second;
+ ss << "osds " << p.second
+ << " have blocked requests > " << p.first << " sec";
} else {
- ss << "osd." << *p.second.begin();
+ ss << "osd." << *p.second.begin()
+ << " has blocked requests > " << p.first << " sec";
}
- ss << " have blocked requests > " << p.first << " sec";
d.detail.push_back(ss.str());
if (--left == 0) {
break;
for (auto& p : error_osd_by_max) {
ostringstream ss;
if (p.second.size() > 1) {
- ss << "osds " << p.second;
+ ss << "osds " << p.second
+ << " have stuck requests > " << p.first << " sec";
} else {
- ss << "osd." << *p.second.begin();
+ ss << "osd." << *p.second.begin()
+ << " has stuck requests > " << p.first << " sec";
}
- ss << " have stuck requests > " << p.first << " sec";
d.detail.push_back(ss.str());
if (--left == 0) {
break;
// PG_NOT_SCRUBBED
// PG_NOT_DEEP_SCRUBBED
{
- list<string> detail, deep_detail;
- const double age = cct->_conf->mon_warn_not_scrubbed +
- cct->_conf->mon_scrub_interval;
- utime_t cutoff = now;
- cutoff -= age;
- const double deep_age = cct->_conf->mon_warn_not_deep_scrubbed +
- cct->_conf->osd_deep_scrub_interval;
- utime_t deep_cutoff = now;
- deep_cutoff -= deep_age;
- for (auto& p : pg_stat) {
- if (p.second.last_scrub_stamp < cutoff) {
- ostringstream ss;
- ss << "pg " << p.first << " not scrubbed since "
- << p.second.last_scrub_stamp;
- detail.push_back(ss.str());
+ if (cct->_conf->mon_warn_not_scrubbed ||
+ cct->_conf->mon_warn_not_deep_scrubbed) {
+ list<string> detail, deep_detail;
+ const double age = cct->_conf->mon_warn_not_scrubbed +
+ cct->_conf->mon_scrub_interval;
+ utime_t cutoff = now;
+ cutoff -= age;
+ const double deep_age = cct->_conf->mon_warn_not_deep_scrubbed +
+ cct->_conf->osd_deep_scrub_interval;
+ utime_t deep_cutoff = now;
+ deep_cutoff -= deep_age;
+ for (auto& p : pg_stat) {
+ if (cct->_conf->mon_warn_not_scrubbed &&
+ p.second.last_scrub_stamp < cutoff) {
+ ostringstream ss;
+ ss << "pg " << p.first << " not scrubbed since "
+ << p.second.last_scrub_stamp;
+ detail.push_back(ss.str());
+ }
+ if (cct->_conf->mon_warn_not_deep_scrubbed &&
+ p.second.last_deep_scrub_stamp < deep_cutoff) {
+ ostringstream ss;
+ ss << "pg " << p.first << " not deep-scrubbed since "
+ << p.second.last_deep_scrub_stamp;
+ deep_detail.push_back(ss.str());
+ }
}
- if (p.second.last_deep_scrub_stamp < deep_cutoff) {
- ostringstream ss;
- ss << "pg " << p.first << " not deep-scrubbed since "
- << p.second.last_deep_scrub_stamp;
- deep_detail.push_back(ss.str());
+ if (!detail.empty()) {
+ ostringstream ss;
+ ss << detail.size() << " pgs not scrubbed for " << age;
+ auto& d = checks->add("PG_NOT_SCRUBBED", HEALTH_WARN, ss.str());
+ d.detail.swap(detail);
+ }
+ if (!deep_detail.empty()) {
+ ostringstream ss;
+ ss << deep_detail.size() << " pgs not deep-scrubbed for " << deep_age;
+ auto& d = checks->add("PG_NOT_DEEP_SCRUBBED", HEALTH_WARN, ss.str());
+ d.detail.swap(deep_detail);
+ }
+ }
+ }
+
+ // POOL_APP
+ {
+ list<string> detail;
+ for (auto &it : pools) {
+ const pg_pool_t &pool = it.second;
+ const string& pool_name = osdmap.get_pool_name(it.first);
+ auto it2 = pg_pool_sum.find(it.first);
+ if (it2 == pg_pool_sum.end()) {
+ continue;
+ }
+ const pool_stat_t *pstat = &it2->second;
+ if (pstat == nullptr) {
+ continue;
+ }
+ const object_stat_sum_t& sum = pstat->stats.sum;
+ // application metadata is not encoded until luminous is minimum
+ // required release
+ if (osdmap.require_osd_release >= CEPH_RELEASE_LUMINOUS &&
+ sum.num_objects > 0 && pool.application_metadata.empty() &&
+ !pool.is_tier() && !g_conf->mon_debug_no_require_luminous) {
+ stringstream ss;
+ ss << "application not enabled on pool '" << pool_name << "'";
+ detail.push_back(ss.str());
}
}
if (!detail.empty()) {
ostringstream ss;
- ss << detail.size() << " pgs not scrubbed for " << age;
- auto& d = checks->add("PG_NOT_SCRUBBED", HEALTH_WARN, ss.str());
+ ss << "application not enabled on " << detail.size() << " pool(s)";
+ auto& d = checks->add("POOL_APP_NOT_ENABLED", HEALTH_WARN, ss.str());
+ stringstream tip;
+ tip << "use 'ceph osd pool application enable <pool-name> "
+ << "<app-name>', where <app-name> is 'cephfs', 'rbd', 'rgw', "
+ << "or freeform for custom applications.";
+ detail.push_back(tip.str());
d.detail.swap(detail);
}
- if (!deep_detail.empty()) {
- ostringstream ss;
- ss << deep_detail.size() << " pgs not deep-scrubbed for " << age;
- auto& d = checks->add("PG_NOT_DEEP_SCRUBBED", HEALTH_WARN, ss.str());
- d.detail.swap(deep_detail);
- }
}
}
// slow requests
if (cct->_conf->mon_osd_warn_op_age > 0 &&
- osd_sum.op_queue_age_hist.upper_bound() > cct->_conf->mon_osd_warn_op_age) {
+ osd_sum.op_queue_age_hist.upper_bound() / 1000.0 >
+ cct->_conf->mon_osd_warn_op_age) {
auto sum = _warn_slow_request_histogram(
cct, osd_sum.op_queue_age_hist, "", summary, NULL);
if (sum.first > 0 || sum.second > 0) {
}
if (sum.second > 0) {
ostringstream ss;
- ss << sum.first << " requests are blocked > "
+ ss << sum.second << " requests are blocked > "
<< (cct->_conf->mon_osd_warn_op_age *
cct->_conf->mon_osd_err_op_age_ratio)
<< " sec";
}
}
- if (cct->_conf->mon_warn_osd_usage_min_max_delta) {
- float max_osd_usage = 0.0, min_osd_usage = 1.0;
- for (auto p = osd_stat.begin(); p != osd_stat.end(); ++p) {
- // kb should never be 0, but avoid divide by zero in case of corruption
- if (p->second.kb <= 0)
- continue;
- float usage = ((float)p->second.kb_used) / ((float)p->second.kb);
- if (usage > max_osd_usage)
- max_osd_usage = usage;
- if (usage < min_osd_usage)
- min_osd_usage = usage;
- }
- float diff = max_osd_usage - min_osd_usage;
- if (diff > cct->_conf->mon_warn_osd_usage_min_max_delta) {
- ostringstream ss;
- ss << "difference between min (" << roundf(min_osd_usage*1000.0)/10.0
- << "%) and max (" << roundf(max_osd_usage*1000.0)/10.0
- << "%) osd usage " << roundf(diff*1000.0)/10.0 << "% > "
- << roundf(cct->_conf->mon_warn_osd_usage_min_max_delta*1000.0)/10.0
- << "% (mon_warn_osd_usage_min_max_delta)";
- summary.push_back(make_pair(HEALTH_WARN, ss.str()));
- if (detail)
- detail->push_back(make_pair(HEALTH_WARN, ss.str()));
- }
- }
-
// recovery
list<string> sl;
overall_recovery_summary(NULL, &sl);
break;
} else {
int filter = pg_string_state(state_str);
- assert(filter != -1);
+ if (filter < 0) {
+ *ss << "'" << state_str << "' is not a valid pg state,"
+ << " available choices: " << pg_state_string(0xFFFFFFFF);
+ return -EINVAL;
+ }
state |= filter;
}
}
ss << "pg " << pgidstr << " now creating, ok";
goto update;
+  } else if (prefix == "pg force-recovery" ||
+             prefix == "pg force-backfill" ||
+             prefix == "pg cancel-force-recovery" ||
+             prefix == "pg cancel-force-backfill") {
+    // Forced recovery/backfill is only usable once the cluster requires
+    // luminous OSDs, so refuse while require_osd_release is still older.
+    // (The check must be '<': the message below tells the user to finish
+    // the upgrade, which only makes sense on a pre-luminous cluster.)
+    if (mon->osdmon()->osdmap.require_osd_release < CEPH_RELEASE_LUMINOUS) {
+      ss << "you must complete the upgrade and 'ceph osd require-osd-release "
+         << "luminous' before using forced recovery";
+      r = -EPERM;
+      goto reply;
+    }
} else if (prefix == "pg set_full_ratio" ||
prefix == "pg set_nearfull_ratio") {
if (mon->osdmon()->osdmap.require_osd_release >= CEPH_RELEASE_LUMINOUS) {
}
/**
- * @note What we contirbute to the pending Paxos transaction is
+ * @note What we contribute to the pending Paxos transaction is
* obtained by calling a function that must be implemented by
* the class implementing us. I.e., the function
* encode_pending will be the one responsible to encode
void clear() {
checks.clear();
}
+ // Returns true when the underlying 'checks' container holds no entries.
+ bool empty() const {
+ return checks.empty();
+ }
void swap(health_check_map_t& other) {
checks.swap(other.checks);
}
cd $script_root/../build
script_root=$PWD
fi
+ceph_bin=$script_root/bin
vstart_path=`dirname $0`
[ "$#" -lt 2 ] && echo "usage: $0 <name> <port> [params...]" && exit 1
#include "messages/MOSDScrubReserve.h"
#include "messages/MOSDRepScrub.h"
#include "messages/MOSDRepScrubMap.h"
+#include "messages/MOSDForceRecovery.h"
#include "messages/MOSDPGScan.h"
#include "messages/MOSDPGBackfill.h"
#include "messages/MOSDBackoff.h"
#include "messages/MOSDPGBackfillRemove.h"
+#include "messages/MOSDPGRecoveryDelete.h"
+#include "messages/MOSDPGRecoveryDeleteReply.h"
#include "messages/MRemoveSnaps.h"
case MSG_OSD_RECOVERY_RESERVE:
m = new MRecoveryReserve;
break;
+ case MSG_OSD_FORCE_RECOVERY:
+ m = new MOSDForceRecovery;
+ break;
case MSG_ROUTE:
m = new MRoute;
case MSG_OSD_PG_PUSH_REPLY:
m = new MOSDPGPushReply;
break;
+ case MSG_OSD_PG_RECOVERY_DELETE:
+ m = new MOSDPGRecoveryDelete;
+ break;
+ case MSG_OSD_PG_RECOVERY_DELETE_REPLY:
+ m = new MOSDPGRecoveryDeleteReply;
+ break;
case MSG_OSD_EC_WRITE:
m = new MOSDECSubOpWrite;
break;
#define MSG_OSD_BACKFILL_RESERVE 99
#define MSG_OSD_RECOVERY_RESERVE 150
+#define MSG_OSD_FORCE_RECOVERY 151
#define MSG_OSD_PG_PUSH 105
#define MSG_OSD_PG_PULL 106
#define MSG_OSD_PG_CREATED 116
#define MSG_OSD_REP_SCRUBMAP 117
+#define MSG_OSD_PG_RECOVERY_DELETE 118
+#define MSG_OSD_PG_RECOVERY_DELETE_REPLY 119
// *** MDS ***
<< " - presumably this is the same node!" << dendl;
} else {
ldout(async_msgr->cct, 10) << __func__ << " connect claims to be "
- << paddr << " not " << peer_addr
- << " (peer is possibly using public_bind_addr?) " << dendl;
+ << paddr << " not " << peer_addr << dendl;
+ goto fail;
}
}
<< " off " << header.data_off << dendl;
if ((bl.length() <= ASYNC_COALESCE_THRESHOLD) && (bl.buffers().size() > 1)) {
- std::list<buffer::ptr>::const_iterator pb;
- for (pb = bl.buffers().begin(); pb != bl.buffers().end(); ++pb) {
- outcoming_bl.append((char*)pb->c_str(), pb->length());
+ for (const auto &pb : bl.buffers()) {
+ outcoming_bl.append((char*)pb.c_str(), pb.length());
}
} else {
outcoming_bl.claim_append(bl);
}
m->trace.event("async writing message");
- logger->inc(l_msgr_send_bytes, outcoming_bl.length() - original_bl_len);
ldout(async_msgr->cct, 20) << __func__ << " sending " << m->get_seq()
<< " " << m << dendl;
+ ssize_t total_send_size = outcoming_bl.length();
ssize_t rc = _try_send(more);
if (rc < 0) {
ldout(async_msgr->cct, 1) << __func__ << " error sending " << m << ", "
<< cpp_strerror(rc) << dendl;
} else if (rc == 0) {
+ logger->inc(l_msgr_send_bytes, total_send_size - original_bl_len);
ldout(async_msgr->cct, 10) << __func__ << " sending " << m << " done." << dendl;
} else {
+ logger->inc(l_msgr_send_bytes, total_send_size - outcoming_bl.length());
ldout(async_msgr->cct, 10) << __func__ << " sending " << m << " continuely." << dendl;
}
if (m->get_type() == CEPH_MSG_OSD_OP)
plb.add_u64_counter(l_msgr_recv_messages, "msgr_recv_messages", "Network received messages");
plb.add_u64_counter(l_msgr_send_messages, "msgr_send_messages", "Network sent messages");
plb.add_u64_counter(l_msgr_recv_bytes, "msgr_recv_bytes", "Network received bytes");
- plb.add_u64_counter(l_msgr_send_bytes, "msgr_send_bytes", "Network received bytes");
+ plb.add_u64_counter(l_msgr_send_bytes, "msgr_send_bytes", "Network sent bytes");
plb.add_u64_counter(l_msgr_active_connections, "msgr_active_connections", "Active connection number");
plb.add_u64_counter(l_msgr_created_connections, "msgr_created_connections", "Created connection number");
<< paddr << " not " << peer_addr << " - presumably this is the same node!" << dendl;
} else {
ldout(msgr->cct,10) << "connect claims to be "
- << paddr << " not " << peer_addr
- << " (peer is possibly using public_bind_addr?) " << dendl;
+ << paddr << " not " << peer_addr << dendl;
+ goto fail;
}
}
return (*pctx)->pg->do_osd_ops(*pctx, ops);
}
-int cls_cxx_map_get_all_vals(cls_method_context_t hctx, map<string, bufferlist>* vals)
+int cls_cxx_map_get_all_vals(cls_method_context_t hctx, map<string, bufferlist>* vals,
+ bool *more)
{
PrimaryLogPG::OpContext **pctx = (PrimaryLogPG::OpContext **)hctx;
vector<OSDOp> ops(1);
bufferlist::iterator iter = op.outdata.begin();
try {
::decode(*vals, iter);
+ ::decode(*more, iter);
} catch (buffer::error& err) {
return -EIO;
}
}
int cls_cxx_map_get_keys(cls_method_context_t hctx, const string &start_obj,
- uint64_t max_to_get, set<string> *keys)
+ uint64_t max_to_get, set<string> *keys,
+ bool *more)
{
PrimaryLogPG::OpContext **pctx = (PrimaryLogPG::OpContext **)hctx;
vector<OSDOp> ops(1);
bufferlist::iterator iter = op.outdata.begin();
try {
::decode(*keys, iter);
+ ::decode(*more, iter);
} catch (buffer::error& err) {
return -EIO;
}
int cls_cxx_map_get_vals(cls_method_context_t hctx, const string &start_obj,
const string &filter_prefix, uint64_t max_to_get,
- map<string, bufferlist> *vals)
+ map<string, bufferlist> *vals, bool *more)
{
PrimaryLogPG::OpContext **pctx = (PrimaryLogPG::OpContext **)hctx;
vector<OSDOp> ops(1);
bufferlist::iterator iter = op.outdata.begin();
try {
::decode(*vals, iter);
+ ::decode(*more, iter);
} catch (buffer::error& err) {
return -EIO;
}
extern int cls_cxx_snap_revert(cls_method_context_t hctx, snapid_t snapid);
extern int cls_cxx_map_clear(cls_method_context_t hctx);
extern int cls_cxx_map_get_all_vals(cls_method_context_t hctx,
- std::map<string, bufferlist> *vals);
+ std::map<string, bufferlist> *vals,
+ bool *more);
extern int cls_cxx_map_get_keys(cls_method_context_t hctx,
const string &start_after,
uint64_t max_to_get,
- std::set<string> *keys);
+ std::set<string> *keys,
+ bool *more);
extern int cls_cxx_map_get_vals(cls_method_context_t hctx,
const string &start_after,
const string &filter_prefix,
uint64_t max_to_get,
- std::map<string, bufferlist> *vals);
+ std::map<string, bufferlist> *vals,
+ bool *more);
extern int cls_cxx_map_read_header(cls_method_context_t hctx, bufferlist *outbl);
extern int cls_cxx_map_set_vals(cls_method_context_t hctx,
const std::map<string, bufferlist> *map);
memstore/MemStore.cc
kstore/KStore.cc
kstore/kstore_types.cc
- fs/FS.cc
- fs/aio.cc)
+ fs/FS.cc)
if(HAVE_LIBAIO)
list(APPEND libos_srcs
bluestore/StupidAllocator.cc
bluestore/BitMapAllocator.cc
bluestore/BitAllocator.cc
+ bluestore/aio.cc
)
endif(HAVE_LIBAIO)
#include <list>
#include "acconfig.h"
-#include "os/fs/aio.h"
+#include "aio.h"
#define SPDK_PREFIX "spdk:"
}
}
-int BlueFS::add_block_device(unsigned id, string path)
+int BlueFS::add_block_device(unsigned id, const string& path)
{
dout(10) << __func__ << " bdev " << id << " path " << path << dendl;
assert(id < bdev.size());
/// sync any uncommitted state to disk
void sync_metadata();
- int add_block_device(unsigned bdev, string path);
+ int add_block_device(unsigned bdev, const string& path);
bool bdev_support_label(unsigned id);
uint64_t get_block_device_size(unsigned bdev);
"bluestore_compression_max_blob_size",
"bluestore_compression_max_blob_size_ssd",
"bluestore_compression_max_blob_size_hdd",
+ "bluestore_compression_required_ratio",
"bluestore_max_alloc_size",
"bluestore_prefer_deferred_size",
"bluestore_deferred_batch_ops",
}
OpSequencerRef osr = txc->osr;
- CollectionRef c;
bool empty = false;
bool submit_deferred = false;
OpSequencer::q_list_t releasing_txc;
break;
}
- if (!c && txc->first_collection) {
- c = txc->first_collection;
- }
osr->q.pop_front();
releasing_txc.push_back(*txc);
notify = true;
t->set(PREFIX_SUPER, "blobid_max", bl);
dout(10) << __func__ << " new_blobid_max " << new_blobid_max << dendl;
}
- for (auto txc : kv_submitting) {
- assert(txc->state == TransContext::STATE_KV_QUEUED);
- txc->log_state_latency(logger, l_bluestore_state_kv_queued_lat);
- int r = cct->_conf->bluestore_debug_omit_kv_commit ? 0 : db->submit_transaction(txc->t);
- assert(r == 0);
- _txc_applied_kv(txc);
- --txc->osr->kv_committing_serially;
- txc->state = TransContext::STATE_KV_SUBMITTED;
- if (txc->osr->kv_submitted_waiters) {
- std::lock_guard<std::mutex> l(txc->osr->qlock);
- if (txc->osr->_is_all_kv_submitted()) {
- txc->osr->qcond.notify_all();
+
+ for (auto txc : kv_committing) {
+ if (txc->state == TransContext::STATE_KV_QUEUED) {
+ txc->log_state_latency(logger, l_bluestore_state_kv_queued_lat);
+ int r = cct->_conf->bluestore_debug_omit_kv_commit ? 0 : db->submit_transaction(txc->t);
+ assert(r == 0);
+ _txc_applied_kv(txc);
+ --txc->osr->kv_committing_serially;
+ txc->state = TransContext::STATE_KV_SUBMITTED;
+ if (txc->osr->kv_submitted_waiters) {
+ std::lock_guard<std::mutex> l(txc->osr->qlock);
+ if (txc->osr->_is_all_kv_submitted()) {
+ txc->osr->qcond.notify_all();
+ }
}
+
+ } else {
+ assert(txc->state == TransContext::STATE_KV_SUBMITTED);
+ txc->log_state_latency(logger, l_bluestore_state_kv_queued_lat);
}
- }
- for (auto txc : kv_committing) {
if (txc->had_ios) {
--txc->osr->txc_with_unstable_io;
}
- txc->log_state_latency(logger, l_bluestore_state_kv_queued_lat);
}
// release throttle *before* we commit. this allows new ops
for (vector<coll_t>::iterator p = i.colls.begin(); p != i.colls.end();
++p, ++j) {
cvec[j] = _get_collection(*p);
-
- // note first collection we reference
- if (!txc->first_collection)
- txc->first_collection = cvec[j];
}
vector<OnodeRef> ovec(i.objects.size());
IOContext ioc;
bool had_ios = false; ///< true if we submitted IOs before our kv txn
- CollectionRef first_collection; ///< first referenced collection
-
uint64_t seq = 0;
utime_t start;
utime_t last_stamp;
objectstore_perf_stat_t get_cur_stats() const {
objectstore_perf_stat_t ret;
- ret.os_commit_latency = os_commit_latency.avg();
- ret.os_apply_latency = os_apply_latency.avg();
+ ret.os_commit_latency = os_commit_latency.current_avg();
+ ret.os_apply_latency = os_apply_latency.current_avg();
return ret;
}
#include <atomic>
#include "os/fs/FS.h"
-#include "os/fs/aio.h"
#include "include/interval_set.h"
+#include "aio.h"
#include "BlockDevice.h"
class KernelDevice : public BlockDevice {
#include <atomic>
#include "os/fs/FS.h"
-#include "os/fs/aio.h"
#include "include/interval_set.h"
+#include "aio.h"
#include "BlockDevice.h"
class PMEMDevice : public BlockDevice {
if (skew)
skew = alloc_unit - skew;
*offset = p.get_start() + skew;
- *length = MIN(MAX(alloc_unit, want_size), p.get_len() - skew);
+ *length = MIN(MAX(alloc_unit, want_size), P2ALIGN((p.get_len() - skew), alloc_unit));
if (cct->_conf->bluestore_debug_small_allocations) {
uint64_t max =
alloc_unit * (rand() % cct->_conf->bluestore_debug_small_allocations);
--- /dev/null
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#include "aio.h"
+
+#if defined(HAVE_LIBAIO)
+
+
+int aio_queue_t::submit(aio_t &aio, int *retries)
+{
+ // 2^16 * 125us = ~8 seconds, so max sleep is ~16 seconds
+ int attempts = 16;
+ int delay = 125;
+ iocb *piocb = &aio.iocb;
+ int r;
+ while (true) {
+ r = io_submit(ctx, 1, &piocb);
+ if (r < 0) {
+ if (r == -EAGAIN && attempts-- > 0) {
+ usleep(delay);
+ delay *= 2;
+ (*retries)++;
+ continue;
+ }
+ }
+ assert(r == 1);
+ break;
+ }
+ return r;
+}
+
+int aio_queue_t::submit_batch(aio_iter begin, aio_iter end,
+ uint16_t aios_size, void *priv,
+ int *retries)
+{
+ // 2^16 * 125us = ~8 seconds, so max sleep is ~16 seconds
+ int attempts = 16;
+ int delay = 125;
+
+ aio_iter cur = begin;
+ struct iocb *piocb[aios_size];
+ int r, pos = 0;
+ while (cur != end) {
+ cur->priv = priv;
+ *(piocb+pos) = &cur->iocb;
+ ++pos;
+ ++cur;
+ }
+ while (true) {
+ r = io_submit(ctx, pos, piocb);
+ if (r < 0) {
+ if (r == -EAGAIN && attempts-- > 0) {
+ usleep(delay);
+ delay *= 2;
+ (*retries)++;
+ continue;
+ }
+ }
+ break;
+ }
+ return r;
+}
+
+int aio_queue_t::get_next_completed(int timeout_ms, aio_t **paio, int max)
+{
+ io_event event[max];
+ struct timespec t = {
+ timeout_ms / 1000,
+ (timeout_ms % 1000) * 1000 * 1000
+ };
+
+ int r = 0;
+ do {
+ r = io_getevents(ctx, 1, max, event, &t);
+ } while (r == -EINTR);
+
+ for (int i=0; i<r; ++i) {
+ paio[i] = (aio_t *)event[i].obj;
+ paio[i]->rval = event[i].res;
+ }
+ return r;
+}
+
+#endif
--- /dev/null
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#pragma once
+
+#include "acconfig.h"
+#ifdef HAVE_LIBAIO
+# include <libaio.h>
+
+#include <boost/intrusive/list.hpp>
+#include <boost/container/small_vector.hpp>
+
+#include "include/buffer.h"
+#include "include/types.h"
+
+struct aio_t {
+ struct iocb iocb; // must be first element; see shenanigans in aio_queue_t
+ void *priv;
+ int fd;
+ boost::container::small_vector<iovec,4> iov;
+ uint64_t offset, length;
+ int rval;
+ bufferlist bl; ///< write payload (so that it remains stable for duration)
+
+ boost::intrusive::list_member_hook<> queue_item;
+
+ aio_t(void *p, int f) : priv(p), fd(f), offset(0), length(0), rval(-1000) {
+ }
+
+ void pwritev(uint64_t _offset, uint64_t len) {
+ offset = _offset;
+ length = len;
+ io_prep_pwritev(&iocb, fd, &iov[0], iov.size(), offset);
+ }
+ void pread(uint64_t _offset, uint64_t len) {
+ offset = _offset;
+ length = len;
+ bufferptr p = buffer::create_page_aligned(length);
+ io_prep_pread(&iocb, fd, p.c_str(), length, offset);
+ bl.append(std::move(p));
+ }
+
+ int get_return_value() {
+ return rval;
+ }
+};
+
+typedef boost::intrusive::list<
+ aio_t,
+ boost::intrusive::member_hook<
+ aio_t,
+ boost::intrusive::list_member_hook<>,
+ &aio_t::queue_item> > aio_list_t;
+
+struct aio_queue_t {
+ int max_iodepth;
+ io_context_t ctx;
+
+ typedef list<aio_t>::iterator aio_iter;
+
+ explicit aio_queue_t(unsigned max_iodepth)
+ : max_iodepth(max_iodepth),
+ ctx(0) {
+ }
+ ~aio_queue_t() {
+ assert(ctx == 0);
+ }
+
+ int init() {
+ assert(ctx == 0);
+ int r = io_setup(max_iodepth, &ctx);
+ if (r < 0) {
+ if (ctx) {
+ io_destroy(ctx);
+ ctx = 0;
+ }
+ }
+ return r;
+ }
+ void shutdown() {
+ if (ctx) {
+ int r = io_destroy(ctx);
+ assert(r == 0);
+ ctx = 0;
+ }
+ }
+
+ int submit(aio_t &aio, int *retries);
+ int submit_batch(aio_iter begin, aio_iter end, uint16_t aios_size,
+ void *priv, int *retries);
+ int get_next_completed(int timeout_ms, aio_t **paio, int max);
+};
+
+#endif
cout << desc << std::endl;
}
+void validate_path(CephContext *cct, const string& path, bool bluefs)
+{
+ BlueStore bluestore(cct, path);
+ string type;
+ int r = bluestore.read_meta("type", &type);
+ if (r < 0) {
+ cerr << "failed to load os-type: " << cpp_strerror(r) << std::endl;
+ exit(EXIT_FAILURE);
+ }
+ if (type != "bluestore") {
+ cerr << "expected bluestore, but type is " << type << std::endl;
+ exit(EXIT_FAILURE);
+ }
+ if (!bluefs) {
+ return;
+ }
+
+ string kv_backend;
+ r = bluestore.read_meta("kv_backend", &kv_backend);
+ if (r < 0) {
+ cerr << "failed to load kv_backend: " << cpp_strerror(r) << std::endl;
+ exit(EXIT_FAILURE);
+ }
+ if (kv_backend != "rocksdb") {
+ cerr << "expect kv_backend to be rocksdb, but is " << kv_backend
+ << std::endl;
+ exit(EXIT_FAILURE);
+ }
+ string bluefs_enabled;
+ r = bluestore.read_meta("bluefs", &bluefs_enabled);
+ if (r < 0) {
+ cerr << "failed to load do_bluefs: " << cpp_strerror(r) << std::endl;
+ exit(EXIT_FAILURE);
+ }
+ if (bluefs_enabled != "1") {
+ cerr << "bluefs not enabled for rocksdb" << std::endl;
+ exit(EXIT_FAILURE);
+ }
+}
+
int main(int argc, char **argv)
{
string out_dir;
po::include_positional);
} catch(po::error &e) {
std::cerr << e.what() << std::endl;
- return 1;
+ exit(EXIT_FAILURE);
}
if (vm.count("help")) {
usage(po_all);
- return 1;
+ exit(EXIT_SUCCESS);
}
if (action.empty()) {
cerr << "must specify an action; --help for help" << std::endl;
- return 1;
+ exit(EXIT_FAILURE);
}
if (action == "fsck") {
if (path.empty()) {
cerr << "must specify bluestore path" << std::endl;
- exit(1);
+ exit(EXIT_FAILURE);
}
}
- if (action == "bluefs-export" ||
- action == "show-label") {
+ if (action == "show-label") {
if (devs.empty() && path.empty()) {
cerr << "must specify bluestore path *or* raw device(s)" << std::endl;
- exit(1);
+ exit(EXIT_FAILURE);
}
cout << "infering bluefs devices from bluestore path" << std::endl;
for (auto fn : {"block", "block.wal", "block.db"}) {
}
}
}
+ if (action == "bluefs-export") {
+ if (path.empty()) {
+ cerr << "must specify bluestore path" << std::endl;
+ exit(EXIT_FAILURE);
+ }
+ if (out_dir.empty()) {
+ cerr << "must specify out-dir to export bluefs" << std::endl;
+ exit(EXIT_FAILURE);
+ }
+ cout << "infering bluefs devices from bluestore path" << std::endl;
+ for (auto fn : {"block", "block.wal", "block.db"}) {
+ string p = path + "/" + fn;
+ struct stat st;
+ if (::stat(p.c_str(), &st) == 0) {
+ devs.push_back(p);
+ }
+ }
+ }
vector<const char*> args;
for (auto& i : ceph_option_strings) {
if (action == "fsck" ||
action == "fsck-deep") {
+ validate_path(cct.get(), path, false);
BlueStore bluestore(cct.get(), path);
int r = bluestore.fsck(fsck_deep);
if (r < 0) {
cerr << "error from fsck: " << cpp_strerror(r) << std::endl;
- return 1;
+ exit(EXIT_FAILURE);
}
}
else if (action == "show-label") {
if (r < 0) {
cerr << "unable to read label for " << i << ": "
<< cpp_strerror(r) << std::endl;
- exit(1);
+ exit(EXIT_FAILURE);
}
jf.open_object_section(i.c_str());
label.dump(&jf);
jf.flush(cout);
}
else if (action == "bluefs-export") {
- if (out_dir.empty()) {
- cerr << "must specify out-dir to export bluefs" << std::endl;
- exit(1);
- }
+ validate_path(cct.get(), path, true);
BlueFS fs(&(*cct));
string main;
set<int> got;
if (r < 0) {
cerr << "unable to read label for " << i << ": "
<< cpp_strerror(r) << std::endl;
- exit(1);
+ exit(EXIT_FAILURE);
}
int id = -1;
if (label.description == "main")
int r = fs.add_block_device(id, i);
if (r < 0) {
cerr << "unable to open " << i << ": " << cpp_strerror(r) << std::endl;
- exit(1);
+ exit(EXIT_FAILURE);
}
}
}
if (r < 0) {
cerr << "unable to open " << main << ": " << cpp_strerror(r)
<< std::endl;
- exit(1);
+ exit(EXIT_FAILURE);
}
}
if (r < 0) {
cerr << "unable to mount bluefs: " << cpp_strerror(r)
<< std::endl;
- exit(1);
+ exit(EXIT_FAILURE);
}
vector<string> dirs;
r = fs.readdir("", &dirs);
if (r < 0) {
cerr << "readdir in root failed: " << cpp_strerror(r) << std::endl;
- exit(1);
+ exit(EXIT_FAILURE);
}
for (auto& dir : dirs) {
if (dir[0] == '.')
r = fs.readdir(dir, &ls);
if (r < 0) {
cerr << "readdir " << dir << " failed: " << cpp_strerror(r) << std::endl;
- exit(1);
+ exit(EXIT_FAILURE);
}
string full = out_dir + "/" + dir;
r = ::mkdir(full.c_str(), 0755);
if (r < 0) {
+ r = -errno;
cerr << "mkdir " << full << " failed: " << cpp_strerror(r) << std::endl;
- exit(1);
+ exit(EXIT_FAILURE);
}
for (auto& file : ls) {
if (file[0] == '.')
r = fs.stat(dir, file, &size, &mtime);
if (r < 0) {
cerr << "stat " << file << " failed: " << cpp_strerror(r) << std::endl;
- exit(1);
+ exit(EXIT_FAILURE);
}
string path = out_dir + "/" + dir + "/" + file;
int fd = ::open(path.c_str(), O_CREAT|O_WRONLY|O_TRUNC, 0644);
if (fd < 0) {
r = -errno;
cerr << "open " << path << " failed: " << cpp_strerror(r) << std::endl;
- exit(1);
+ exit(EXIT_FAILURE);
}
assert(fd >= 0);
if (size > 0) {
if (r < 0) {
cerr << "open_for_read " << dir << "/" << file << " failed: "
<< cpp_strerror(r) << std::endl;
- exit(1);
+ exit(EXIT_FAILURE);
}
int pos = 0;
int left = size;
if (r <= 0) {
cerr << "read " << dir << "/" << file << " from " << pos
<< " failed: " << cpp_strerror(r) << std::endl;
- exit(1);
+ exit(EXIT_FAILURE);
}
int rc = bl.write_fd(fd);
if (rc < 0) {
cerr << "write to " << path << " failed: "
<< cpp_strerror(r) << std::endl;
- exit(1);
+ exit(EXIT_FAILURE);
}
pos += r;
left -= r;
objectstore_perf_stat_t get_cur_stats() const {
objectstore_perf_stat_t ret;
- ret.os_commit_latency = os_commit_latency.avg();
- ret.os_apply_latency = os_apply_latency.avg();
+ ret.os_commit_latency = os_commit_latency.current_avg();
+ ret.os_apply_latency = os_apply_latency.current_avg();
return ret;
}
*
*/
+#include "include/compat.h"
#include "include/types.h"
#include "include/buffer.h"
#include "osd/osd_types.h"
+++ /dev/null
-// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
-// vim: ts=8 sw=2 smarttab
-
-#include "aio.h"
-
-#if defined(HAVE_LIBAIO)
-
-
-int aio_queue_t::submit(aio_t &aio, int *retries)
-{
- // 2^16 * 125us = ~8 seconds, so max sleep is ~16 seconds
- int attempts = 16;
- int delay = 125;
- iocb *piocb = &aio.iocb;
- int r;
- while (true) {
- r = io_submit(ctx, 1, &piocb);
- if (r < 0) {
- if (r == -EAGAIN && attempts-- > 0) {
- usleep(delay);
- delay *= 2;
- (*retries)++;
- continue;
- }
- }
- assert(r == 1);
- break;
- }
- return r;
-}
-
-int aio_queue_t::submit_batch(aio_iter begin, aio_iter end,
- uint16_t aios_size, void *priv,
- int *retries)
-{
- // 2^16 * 125us = ~8 seconds, so max sleep is ~16 seconds
- int attempts = 16;
- int delay = 125;
-
- aio_iter cur = begin;
- struct iocb *piocb[aios_size];
- int r, pos = 0;
- while (cur != end) {
- cur->priv = priv;
- *(piocb+pos) = &cur->iocb;
- ++pos;
- ++cur;
- }
- while (true) {
- r = io_submit(ctx, pos, piocb);
- if (r < 0) {
- if (r == -EAGAIN && attempts-- > 0) {
- usleep(delay);
- delay *= 2;
- (*retries)++;
- continue;
- }
- }
- break;
- }
- return r;
-}
-
-int aio_queue_t::get_next_completed(int timeout_ms, aio_t **paio, int max)
-{
- io_event event[max];
- struct timespec t = {
- timeout_ms / 1000,
- (timeout_ms % 1000) * 1000 * 1000
- };
-
- int r = 0;
- do {
- r = io_getevents(ctx, 1, max, event, &t);
- } while (r == -EINTR);
-
- for (int i=0; i<r; ++i) {
- paio[i] = (aio_t *)event[i].obj;
- paio[i]->rval = event[i].res;
- }
- return r;
-}
-
-#endif
+++ /dev/null
-// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
-// vim: ts=8 sw=2 smarttab
-
-#pragma once
-
-#include "acconfig.h"
-#ifdef HAVE_LIBAIO
-# include <libaio.h>
-
-#include <boost/intrusive/list.hpp>
-#include <boost/container/small_vector.hpp>
-
-#include "include/buffer.h"
-#include "include/types.h"
-
-struct aio_t {
- struct iocb iocb; // must be first element; see shenanigans in aio_queue_t
- void *priv;
- int fd;
- boost::container::small_vector<iovec,4> iov;
- uint64_t offset, length;
- int rval;
- bufferlist bl; ///< write payload (so that it remains stable for duration)
-
- boost::intrusive::list_member_hook<> queue_item;
-
- aio_t(void *p, int f) : priv(p), fd(f), offset(0), length(0), rval(-1000) {
- }
-
- void pwritev(uint64_t _offset, uint64_t len) {
- offset = _offset;
- length = len;
- io_prep_pwritev(&iocb, fd, &iov[0], iov.size(), offset);
- }
- void pread(uint64_t _offset, uint64_t len) {
- offset = _offset;
- length = len;
- bufferptr p = buffer::create_page_aligned(length);
- io_prep_pread(&iocb, fd, p.c_str(), length, offset);
- bl.append(std::move(p));
- }
-
- int get_return_value() {
- return rval;
- }
-};
-
-typedef boost::intrusive::list<
- aio_t,
- boost::intrusive::member_hook<
- aio_t,
- boost::intrusive::list_member_hook<>,
- &aio_t::queue_item> > aio_list_t;
-
-struct aio_queue_t {
- int max_iodepth;
- io_context_t ctx;
-
- typedef list<aio_t>::iterator aio_iter;
-
- explicit aio_queue_t(unsigned max_iodepth)
- : max_iodepth(max_iodepth),
- ctx(0) {
- }
- ~aio_queue_t() {
- assert(ctx == 0);
- }
-
- int init() {
- assert(ctx == 0);
- int r = io_setup(max_iodepth, &ctx);
- if (r < 0) {
- if (ctx) {
- io_destroy(ctx);
- ctx = 0;
- }
- }
- return r;
- }
- void shutdown() {
- if (ctx) {
- int r = io_destroy(ctx);
- assert(r == 0);
- ctx = 0;
- }
- }
-
- int submit(aio_t &aio, int *retries);
- int submit_batch(aio_iter begin, aio_iter end, uint16_t aios_size,
- void *priv, int *retries);
- int get_next_completed(int timeout_ms, aio_t **paio, int max);
-};
-
-#endif
::encode(offset, bl);
}
void decode(bufferlist::iterator &p, size_t page_size) {
- ::decode_array_nohead(data, page_size, p);
+ p.copy(page_size, data);
::decode(offset, p);
}
op.soid,
op.recovery_info,
recovery_ops[op.soid].obc,
+ false,
&m->t);
} else {
get_parent()->on_local_recover(
op.soid,
op.recovery_info,
ObjectContextRef(),
+ false,
&m->t);
}
}
stat.num_bytes_recovered = op.recovery_info.size;
stat.num_keys_recovered = 0; // ??? op ... omap_entries.size(); ?
stat.num_objects_recovered = 1;
- get_parent()->on_global_recover(op.hoid, stat);
+ get_parent()->on_global_recover(op.hoid, stat, false);
dout(10) << __func__ << ": WRITING return " << op << dendl;
recovery_ops.erase(op.hoid);
return;
RecoveryOp &op = recovery_ops.insert(make_pair(i->hoid, *i)).first->second;
continue_recovery_op(op, &m);
}
+
dispatch_recovery_messages(m, priority);
+ send_recovery_deletes(priority, h->deletes);
delete _h;
}
return false;
}
-bool ECBackend::handle_message(
+bool ECBackend::_handle_message(
OpRequestRef _op)
{
dout(10) << __func__ << ": " << *_op->get_req() << dendl;
hinfo = get_hash_info(i->first);
if (!hinfo) {
r = -EIO;
- get_parent()->clog_error() << __func__ << ": No hinfo for " << i->first;
+ get_parent()->clog_error() << "Corruption detected: object " << i->first
+ << " is missing hash_info";
dout(5) << __func__ << ": No hinfo for " << i->first << dendl;
goto error;
}
j->get<1>(),
bl, j->get<2>());
if (r < 0) {
- get_parent()->clog_error() << __func__
- << ": Error " << r
- << " reading "
+ get_parent()->clog_error() << "Error " << r
+ << " reading object "
<< i->first;
dout(5) << __func__ << ": Error " << r
<< " reading " << i->first << dendl;
bufferhash h(-1);
h << bl;
if (h.digest() != hinfo->get_chunk_hash(shard)) {
- get_parent()->clog_error() << __func__ << ": Bad hash for " << i->first << " digest 0x"
+ get_parent()->clog_error() << "Bad hash for " << i->first << " digest 0x"
<< hex << h.digest() << " expected 0x" << hinfo->get_chunk_hash(shard) << dec;
dout(5) << __func__ << ": Bad hash for " << i->first << " digest 0x"
<< hex << h.digest() << " expected 0x" << hinfo->get_chunk_hash(shard) << dec << dendl;
}
// Couldn't read any additional shards so handle as completed with errors
}
- if (rop.complete[iter->first].errors.empty()) {
- dout(20) << __func__ << " simply not enough copies err=" << err << dendl;
- } else {
- // Grab the first error
- err = rop.complete[iter->first].errors.begin()->second;
- dout(20) << __func__ << ": Use one of the shard errors err=" << err << dendl;
- }
+ // We don't want to confuse clients / RBD with objectstore error
+ // values in particular ENOENT. We may have different error returns
+ // from different shards, so we'll return minimum_to_decode() error
+ // (usually EIO) to reader. It is likely an error here is due to a
+ // damaged pg.
rop.complete[iter->first].r = err;
++is_complete;
}
err = rop.complete[iter->first].errors.begin()->second;
rop.complete[iter->first].r = err;
} else {
- get_parent()->clog_error() << __func__ << ": Error(s) ignored for "
+ get_parent()->clog_warn() << "Error(s) ignored for "
<< iter->first << " enough copies available";
dout(10) << __func__ << " Error(s) ignored for " << iter->first
<< " enough copies available" << dendl;
RecoveryHandle *h
) override;
- bool handle_message(
+ bool _handle_message(
OpRequestRef op
) override;
bool can_handle_while_inactive(
#include "messages/MOSDPGBackfill.h"
#include "messages/MBackfillReserve.h"
#include "messages/MRecoveryReserve.h"
+#include "messages/MOSDForceRecovery.h"
#include "messages/MOSDECSubOpWrite.h"
#include "messages/MOSDECSubOpWriteReply.h"
#include "messages/MOSDECSubOpRead.h"
ceph_osd_feature_incompat.insert(CEPH_OSD_FEATURE_INCOMPAT_PGMETA);
ceph_osd_feature_incompat.insert(CEPH_OSD_FEATURE_INCOMPAT_MISSING);
ceph_osd_feature_incompat.insert(CEPH_OSD_FEATURE_INCOMPAT_FASTINFO);
+ ceph_osd_feature_incompat.insert(CEPH_OSD_FEATURE_INCOMPAT_RECOVERY_DELETES);
return CompatSet(ceph_osd_feature_compat, ceph_osd_feature_ro_compat,
ceph_osd_feature_incompat);
}
dout(10) << __func__ << " " << get_full_state_name(cur_state)
<< " -> " << get_full_state_name(new_state) << dendl;
if (new_state == FAILSAFE) {
- clog->error() << "failsafe engaged, dropping updates, now "
+ clog->error() << "full status failsafe engaged, dropping updates, now "
<< (int)roundf(ratio * 100) << "% full";
} else if (cur_state == FAILSAFE) {
- clog->error() << "failsafe disengaged, no longer dropping updates, now "
- << (int)roundf(ratio * 100) << "% full";
+ clog->error() << "full status failsafe disengaged, no longer dropping "
+ << "updates, now " << (int)roundf(ratio * 100) << "% full";
}
cur_state = new_state;
}
void OSDService::send_pg_created(pg_t pgid)
{
dout(20) << __func__ << dendl;
- monc->send_mon_message(new MOSDPGCreated(pgid));
+ if (osdmap->require_osd_release >= CEPH_RELEASE_LUMINOUS) {
+ monc->send_mon_message(new MOSDPGCreated(pgid));
+ }
}
// --------------------------------------
} else if (admin_command == "flush_journal") {
store->flush_journal();
} else if (admin_command == "dump_ops_in_flight" ||
- admin_command == "ops") {
- if (!op_tracker.dump_ops_in_flight(f)) {
- ss << "op_tracker tracking is not enabled now, so no ops are tracked currently, even those get stuck. \
- Please enable \"osd_enable_op_tracker\", and the tracker will start to track new ops received afterwards.";
- }
- } else if (admin_command == "dump_blocked_ops") {
- if (!op_tracker.dump_ops_in_flight(f, true)) {
- ss << "op_tracker tracking is not enabled now, so no ops are tracked currently, even those get stuck. \
- Please enable \"osd_enable_op_tracker\", and the tracker will start to track new ops received afterwards.";
- }
- } else if (admin_command == "dump_historic_ops") {
- if (!op_tracker.dump_historic_ops(f, false)) {
- ss << "op_tracker tracking is not enabled now, so no ops are tracked currently, even those get stuck. \
- Please enable \"osd_enable_op_tracker\", and the tracker will start to track new ops received afterwards.";
- }
- } else if (admin_command == "dump_historic_ops_by_duration") {
- if (!op_tracker.dump_historic_ops(f, true)) {
- ss << "op_tracker tracking is not enabled now, so no ops are tracked currently, even those get stuck. \
- Please enable \"osd_enable_op_tracker\", and the tracker will start to track new ops received afterwards.";
- }
- } else if (admin_command == "dump_historic_slow_ops") {
- if (!op_tracker.dump_historic_slow_ops(f)) {
- ss << "op_tracker tracking is not enabled now, so no ops are tracked currently, even those get stuck. \
- Please enable \"osd_enable_op_tracker\", and the tracker will start to track new ops received afterwards.";
+ admin_command == "ops" ||
+ admin_command == "dump_blocked_ops" ||
+ admin_command == "dump_historic_ops" ||
+ admin_command == "dump_historic_ops_by_duration" ||
+ admin_command == "dump_historic_slow_ops") {
+
+ const string error_str = "op_tracker tracking is not enabled now, so no ops are tracked currently, \
+even those get stuck. Please enable \"osd_enable_op_tracker\", and the tracker \
+will start to track new ops received afterwards.";
+
+ set<string> filters;
+ vector<string> filter_str;
+ if (cmd_getval(cct, cmdmap, "filterstr", filter_str)) {
+ copy(filter_str.begin(), filter_str.end(),
+ inserter(filters, filters.end()));
+ }
+
+ if (admin_command == "dump_ops_in_flight" ||
+ admin_command == "ops") {
+ if (!op_tracker.dump_ops_in_flight(f, false, filters)) {
+ ss << error_str;
+ }
+ }
+ if (admin_command == "dump_blocked_ops") {
+ if (!op_tracker.dump_ops_in_flight(f, true, filters)) {
+ ss << error_str;
+ }
+ }
+ if (admin_command == "dump_historic_ops") {
+ if (!op_tracker.dump_historic_ops(f, false, filters)) {
+ ss << error_str;
+ }
+ }
+ if (admin_command == "dump_historic_ops_by_duration") {
+ if (!op_tracker.dump_historic_ops(f, true, filters)) {
+ ss << error_str;
+ }
+ }
+ if (admin_command == "dump_historic_slow_ops") {
+ if (!op_tracker.dump_historic_slow_ops(f, filters)) {
+ ss << error_str;
+ }
}
} else if (admin_command == "dump_op_pq_state") {
f->open_object_section("pq");
delete fuse_store;
fuse_store = NULL;
r = ::rmdir(mntpath.c_str());
- if (r < 0)
- r = -errno;
if (r < 0) {
- derr << __func__ << " failed to rmdir " << mntpath << dendl;
+ r = -errno;
+ derr << __func__ << " failed to rmdir " << mntpath << ": "
+ << cpp_strerror(r) << dendl;
return r;
}
return 0;
return get_num_op_shards() * cct->_conf->osd_op_num_threads_per_shard_ssd;
}
+float OSD::get_osd_recovery_sleep()
+{
+ if (cct->_conf->osd_recovery_sleep)
+ return cct->_conf->osd_recovery_sleep;
+ if (store_is_rotational)
+ return cct->_conf->osd_recovery_sleep_hdd;
+ else
+ return cct->_conf->osd_recovery_sleep_ssd;
+}
+
int OSD::init()
{
CompatSet initial, diff;
r = monc->authenticate();
if (r < 0) {
+ derr << __func__ << " authentication failed: " << cpp_strerror(r)
+ << dendl;
osd_lock.Lock(); // locker is going to unlock this on function exit
if (is_stopping())
- r = 0;
+ r = 0;
goto monout;
}
derr << "unable to obtain rotating service keys; retrying" << dendl;
++rotating_auth_attempts;
if (rotating_auth_attempts > g_conf->max_rotating_auth_attempts) {
+ derr << __func__ << " wait_auth_rotating timed out" << dendl;
osd_lock.Lock(); // make locker happy
if (!is_stopping()) {
- r = - ETIMEDOUT;
+ r = -ETIMEDOUT;
}
goto monout;
}
r = update_crush_device_class();
if (r < 0) {
+ derr << __func__ <<" unable to update_crush_device_class: "
+ << cpp_strerror(r) << dendl;
osd_lock.Lock();
goto monout;
}
r = update_crush_location();
if (r < 0) {
+ derr << __func__ <<" unable to update_crush_location: "
+ << cpp_strerror(r) << dendl;
osd_lock.Lock();
goto monout;
}
return 0;
monout:
- mgrc.shutdown();
- monc->shutdown();
+ exit(1);
out:
enable_disable_fuse(true);
"flush the journal to permanent store");
assert(r == 0);
r = admin_socket->register_command("dump_ops_in_flight",
- "dump_ops_in_flight", asok_hook,
+ "dump_ops_in_flight " \
+ "name=filterstr,type=CephString,n=N,req=false",
+ asok_hook,
"show the ops currently in flight");
assert(r == 0);
r = admin_socket->register_command("ops",
- "ops", asok_hook,
+ "ops " \
+ "name=filterstr,type=CephString,n=N,req=false",
+ asok_hook,
"show the ops currently in flight");
assert(r == 0);
r = admin_socket->register_command("dump_blocked_ops",
- "dump_blocked_ops", asok_hook,
+ "dump_blocked_ops " \
+ "name=filterstr,type=CephString,n=N,req=false",
+ asok_hook,
"show the blocked ops currently in flight");
assert(r == 0);
- r = admin_socket->register_command("dump_historic_ops", "dump_historic_ops",
+ r = admin_socket->register_command("dump_historic_ops",
+ "dump_historic_ops " \
+ "name=filterstr,type=CephString,n=N,req=false",
asok_hook,
"show recent ops");
assert(r == 0);
- r = admin_socket->register_command("dump_historic_slow_ops", "dump_historic_slow_ops",
+ r = admin_socket->register_command("dump_historic_slow_ops",
+ "dump_historic_slow_ops " \
+ "name=filterstr,type=CephString,n=N,req=false",
asok_hook,
"show slowest recent ops");
assert(r == 0);
- r = admin_socket->register_command("dump_historic_ops_by_duration", "dump_historic_ops_by_duration",
+ r = admin_socket->register_command("dump_historic_ops_by_duration",
+ "dump_historic_ops_by_duration " \
+ "name=filterstr,type=CephString,n=N,req=false",
asok_hook,
"show slowest recent ops, sorted by duration");
assert(r == 0);
if (acting_primary != new_acting_primary) {
h->same_primary_since = e;
}
+ if (pgid.pgid.is_split(lastmap->get_pg_num(pgid.pgid.pool()),
+ osdmap->get_pg_num(pgid.pgid.pool()),
+ nullptr)) {
+ h->last_epoch_split = e;
+ }
lastmap = osdmap;
}
dout(20) << __func__ << " " << debug.str() << dendl;
}
OSDMapRef curmap = service.get_osdmap();
- assert(curmap);
+ if (!curmap) {
+ heartbeat_lock.Unlock();
+ m->put();
+ return;
+ }
switch (m->op) {
// if our map within recent history, try to add ourselves to the osdmap.
if (osdmap->get_epoch() == 0) {
derr << "waiting for initial osdmap" << dendl;
+ } else if (osdmap->is_destroyed(whoami)) {
+ derr << "osdmap says I am destroyed, exiting" << dendl;
+ exit(0);
} else if (osdmap->test_flag(CEPH_OSDMAP_NOUP) || osdmap->is_noup(whoami)) {
derr << "osdmap NOUP flag is set, waiting for it to clear" << dendl;
} else if (!osdmap->test_flag(CEPH_OSDMAP_SORTBITWISE)) {
{
// config info
(*pm)["osd_data"] = dev_path;
- (*pm)["osd_journal"] = journal_path;
+ if (store->get_type() == "filestore") {
+ // not applicable for bluestore
+ (*pm)["osd_journal"] = journal_path;
+ }
(*pm)["front_addr"] = stringify(client_messenger->get_myaddr());
(*pm)["back_addr"] = stringify(cluster_messenger->get_myaddr());
(*pm)["hb_front_addr"] = stringify(hb_front_server_messenger->get_myaddr());
// backend
(*pm)["osd_objectstore"] = store->get_type();
(*pm)["rotational"] = store_is_rotational ? "1" : "0";
+ (*pm)["default_device_class"] = store->get_default_device_class();
store->collect_metadata(pm);
collect_sys_info(pm, cct);
"name=injected_args,type=CephString,n=N",
"inject configuration arguments into running OSD",
"osd", "rw", "cli,rest")
+COMMAND("config set " \
+ "name=key,type=CephString name=value,type=CephString",
+ "Set a configuration option at runtime (not persistent)",
+ "osd", "rw", "cli,rest")
COMMAND("cluster_log " \
"name=level,type=CephChoices,strings=error,warning,info,debug " \
"name=message,type=CephString,n=N",
r = cct->_conf->injectargs(args, &ss);
osd_lock.Lock();
}
+ else if (prefix == "config set") {
+ std::string key;
+ std::string val;
+ cmd_getval(cct, cmdmap, "key", key);
+ cmd_getval(cct, cmdmap, "value", val);
+ osd_lock.Unlock();
+ r = cct->_conf->set_val(key, val, true, &ss);
+ osd_lock.Lock();
+ }
else if (prefix == "cluster_log") {
vector<string> msg;
cmd_getval(cct, cmdmap, "message", msg);
uint64_t global_id;
uint64_t auid = CEPH_AUTH_UID_DEFAULT;
- isvalid = authorize_handler->verify_authorizer(
- cct, monc->rotating_secrets.get(),
- authorizer_data, authorizer_reply, name, global_id, caps_info, session_key,
- &auid);
+ RotatingKeyRing *keys = monc->rotating_secrets.get();
+ if (keys) {
+ isvalid = authorize_handler->verify_authorizer(
+ cct, keys,
+ authorizer_data, authorizer_reply, name, global_id, caps_info, session_key,
+ &auid);
+ } else {
+ dout(10) << __func__ << " no rotating_keys (yet), denied" << dendl;
+ isvalid = false;
+ }
if (isvalid) {
Session *s = static_cast<Session *>(con->get_priv());
handle_scrub(static_cast<MOSDScrub*>(m));
break;
+ case MSG_OSD_FORCE_RECOVERY:
+ handle_force_recovery(m);
+ break;
+
// -- need OSDMap --
case MSG_OSD_PG_CREATE:
if (service.is_preparing_to_stop() || service.is_stopping()) {
service.got_stop_ack();
} else {
- clog->warn() << "map e" << osdmap->get_epoch()
- << " wrongly marked me down at e"
- << osdmap->get_down_at(whoami);
+ clog->warn() << "Monitor daemon marked osd." << whoami << " down, "
+ "but it is still running";
+ clog->debug() << "map e" << osdmap->get_epoch()
+ << " wrongly marked me down at e"
+ << osdmap->get_down_at(whoami);
}
} else if (!osdmap->get_addr(whoami).probably_equals(
client_messenger->get_myaddr())) {
} else if (!osdmap->get_hb_back_addr(whoami).probably_equals(
hb_back_server_messenger->get_myaddr())) {
clog->error() << "map e" << osdmap->get_epoch()
- << " had wrong hb back addr ("
+ << " had wrong heartbeat back addr ("
<< osdmap->get_hb_back_addr(whoami)
<< " != my " << hb_back_server_messenger->get_myaddr()
<< ")";
!osdmap->get_hb_front_addr(whoami).probably_equals(
hb_front_server_messenger->get_myaddr())) {
clog->error() << "map e" << osdmap->get_epoch()
- << " had wrong hb front addr ("
+ << " had wrong heartbeat front addr ("
<< osdmap->get_hb_front_addr(whoami)
<< " != my " << hb_front_server_messenger->get_myaddr()
<< ")";
pg->unlock();
}
+void OSD::handle_force_recovery(Message *m)
+{
+ MOSDForceRecovery *msg = static_cast<MOSDForceRecovery*>(m);
+ assert(msg->get_type() == MSG_OSD_FORCE_RECOVERY);
+ RWLock::RLocker l(pg_map_lock);
+
+ vector<PG*> local_pgs;
+ local_pgs.reserve(msg->forced_pgs.size());
+
+ for (auto& i : msg->forced_pgs) {
+ spg_t locpg;
+ if (osdmap->get_primary_shard(i, &locpg)) {
+ auto pg_map_entry = pg_map.find(locpg);
+ if (pg_map_entry != pg_map.end()) {
+ local_pgs.push_back(pg_map_entry->second);
+ }
+ }
+ }
+
+ if (local_pgs.size()) {
+ service.adjust_pg_priorities(local_pgs, msg->options);
+ }
+
+ msg->put();
+}
/** PGQuery
* from primary to replica | stray
return true;
}
+
+void OSDService::adjust_pg_priorities(vector<PG*> pgs, int newflags)
+{
+ if (!pgs.size() || !(newflags & (OFR_BACKFILL | OFR_RECOVERY)))
+ return;
+ int newstate = 0;
+
+ Mutex::Locker l(recovery_lock);
+
+ if (newflags & OFR_BACKFILL) {
+ newstate = PG_STATE_FORCED_BACKFILL;
+ } else if (newflags & OFR_RECOVERY) {
+ newstate = PG_STATE_FORCED_RECOVERY;
+ }
+
+ // debug output here may get large, don't generate it if debug level is below
+ // 10 and use abbreviated pg ids otherwise
+ if ((cct)->_conf->subsys.should_gather(ceph_subsys_osd, 10)) {
+ stringstream ss;
+
+ for (auto& i : pgs) {
+ ss << i->get_pgid() << " ";
+ }
+
+ dout(10) << __func__ << " working on " << ss.str() << dendl;
+ }
+
+ if (newflags & OFR_CANCEL) {
+ for (auto& i : pgs) {
+ i->change_recovery_force_mode(newstate, true);
+ }
+ } else {
+ for (auto& i : pgs) {
+ // make sure the PG is in correct state before forcing backfill or recovery, or
+ // else we'll make PG keeping FORCE_* flag forever, requiring osds restart
+ // or forcing somehow recovery/backfill.
+ int pgstate = i->get_state();
+ if ( ((newstate == PG_STATE_FORCED_RECOVERY) && (pgstate & (PG_STATE_DEGRADED | PG_STATE_RECOVERY_WAIT | PG_STATE_RECOVERING))) ||
+ ((newstate == PG_STATE_FORCED_BACKFILL) && (pgstate & (PG_STATE_DEGRADED | PG_STATE_BACKFILL_WAIT | PG_STATE_BACKFILL))) )
+ i->change_recovery_force_mode(newstate, false);
+ }
+ }
+}
+
void OSD::do_recovery(
PG *pg, epoch_t queued, uint64_t reserved_pushes,
ThreadPool::TPHandle &handle)
* recovery_requeue_callback event, which re-queues the recovery op using
* queue_recovery_after_sleep.
*/
- if (cct->_conf->osd_recovery_sleep > 0 && service.recovery_needs_sleep) {
+ float recovery_sleep = get_osd_recovery_sleep();
+ if (recovery_sleep > 0 && service.recovery_needs_sleep) {
PGRef pgref(pg);
auto recovery_requeue_callback = new FunctionContext([this, pgref, queued, reserved_pushes](int r) {
dout(20) << "do_recovery wake up at "
if (service.recovery_schedule_time < ceph_clock_now()) {
service.recovery_schedule_time = ceph_clock_now();
}
- service.recovery_schedule_time += cct->_conf->osd_recovery_sleep;
+ service.recovery_schedule_time += recovery_sleep;
service.recovery_sleep_timer.add_event_at(service.recovery_schedule_time,
recovery_requeue_callback);
dout(20) << "Recovery event scheduled at "
if (base_pool && base_pool->require_rollback()) {
if ((iter->op.op != CEPH_OSD_OP_READ) &&
(iter->op.op != CEPH_OSD_OP_CHECKSUM) &&
+ (iter->op.op != CEPH_OSD_OP_CMPEXT) &&
(iter->op.op != CEPH_OSD_OP_STAT) &&
(iter->op.op != CEPH_OSD_OP_ISDIRTY) &&
(iter->op.op != CEPH_OSD_OP_UNDIRTY) &&
}
}
// delayed pg activation
- void queue_for_recovery(PG *pg, bool front = false) {
+ void queue_for_recovery(PG *pg) {
Mutex::Locker l(recovery_lock);
- if (front) {
+
+ if (pg->get_state() & (PG_STATE_FORCED_RECOVERY | PG_STATE_FORCED_BACKFILL)) {
awaiting_throttle.push_front(make_pair(pg->get_osdmap()->get_epoch(), pg));
} else {
awaiting_throttle.push_back(make_pair(pg->get_osdmap()->get_epoch(), pg));
_queue_for_recovery(make_pair(queued, pg), reserved_pushes);
}
+ void adjust_pg_priorities(vector<PG*> pgs, int newflags);
// osd map cache (past osd maps)
Mutex map_cache_lock;
void handle_pg_backfill_reserve(OpRequestRef op);
void handle_pg_recovery_reserve(OpRequestRef op);
+ void handle_force_recovery(Message *m);
+
void handle_pg_remove(OpRequestRef op);
void _remove_pg(PG *pg);
case MSG_OSD_REP_SCRUBMAP:
case MSG_OSD_PG_UPDATE_LOG_MISSING:
case MSG_OSD_PG_UPDATE_LOG_MISSING_REPLY:
+ case MSG_OSD_PG_RECOVERY_DELETE:
+ case MSG_OSD_PG_RECOVERY_DELETE_REPLY:
return true;
default:
return false;
int get_num_op_shards();
int get_num_op_threads();
+ float get_osd_recovery_sleep();
+
public:
static int peek_meta(ObjectStore *store, string& magic,
uuid_d& cluster_fsid, uuid_d& osd_fsid, int& whoami);
using std::vector;
ostream& operator<<(ostream& out, const osd_rwxa_t& p)
-{
+{
if (p == OSD_CAP_ANY)
return out << "*";
return out;
}
+ostream& operator<<(ostream& out, const OSDCapPoolNamespace& pns)
+{
+  // Writes "pool <name> " when a pool is named, then "namespace <ns> " when a
+  // namespace is set; an empty namespace is written as "".  Each part keeps
+  // its trailing space, and nothing is written when neither is present.
+  const bool names_pool = !pns.pool_name.empty();
+  if (names_pool) {
+    out << "pool " << pns.pool_name << " ";
+  }
+  if (!pns.nspace) {
+    return out;
+  }
+
+  out << "namespace ";
+  if (pns.nspace->empty()) {
+    out << "\"\"";
+  } else {
+    out << *pns.nspace;
+  }
+  out << " ";
+  return out;
+}
+
ostream& operator<<(ostream& out, const OSDCapMatch& m)
{
 if (m.auid != -1LL) {
 out << "auid " << m.auid << " ";
+ } else { // auid and pool/namespace are alternatives: only one of the two is printed
+ out << m.pool_namespace; // prints nothing when neither pool nor namespace is set
 }
+
 if (m.object_prefix.length()) {
 out << "object_prefix " << m.object_prefix << " ";
 }
- if (m.pool_name.length()) {
- out << "pool " << m.pool_name << " ";
- }
- if (m.is_nspace) {
- out << "namespace ";
- if (m.nspace.length() == 0)
- out << "\"\"";
- else
- out << m.nspace;
- out << " ";
- }
 return out;
}
-bool OSDCapMatch::is_match(const string& pn, const string& ns, int64_t pool_auid, const string& object) const
+ostream& operator<<(ostream& out, const OSDCapProfile& m)
{
- if (auid >= 0) {
- if (auid != pool_auid)
+  // Writes "profile <name>", followed by the pool/namespace restriction when
+  // one is set.  The restriction's own text starts with "pool " or
+  // "namespace " and has no leading space, so a separator is written first;
+  // without it the name and the restriction run together
+  // (e.g. "profile rbdpool foo ").
+  out << "profile " << m.name;
+  if (!m.pool_namespace.is_match_all()) {
+    out << " " << m.pool_namespace;
+  }
+  return out;
+}
+
+bool OSDCapPoolNamespace::is_match(const std::string& pn,
+ const std::string& ns) const
+{ // True when pool name pn and namespace ns satisfy every constraint that is set; unset constraints accept anything.
+ if (!pool_name.empty()) { // an empty pool_name places no restriction on the pool
+ if (pool_name != pn) {
 return false;
+ }
 }
- if (pool_name.length()) {
- if (pool_name != pn)
+ if (nspace) { // an unset optional places no restriction; a set value, even "", must equal ns
+ if (*nspace != ns) {
 return false;
+ }
 }
- if (is_nspace) {
- if (nspace != ns)
+ return true;
+}
+
+bool OSDCapPoolNamespace::is_match_all() const
+{
+  // Match-all means no restriction at all: no pool name and no namespace set.
+  return pool_name.empty() && !nspace;
+}
+
+bool OSDCapMatch::is_match(const string& pn, const string& ns,
+ int64_t pool_auid, const string& object) const
+{
+ if (auid >= 0) {
+ if (auid != pool_auid)
return false;
+ } else if (!pool_namespace.is_match(pn, ns)) {
+ return false;
}
+
if (object_prefix.length()) {
if (object.find(object_prefix) != 0)
return false;
bool OSDCapMatch::is_match_all() const
{
- if (auid >= 0)
- return false;
- if (pool_name.length())
+ if (auid >= 0) { // restricted to one auid, so not match-all
 return false;
- if (is_nspace)
+ } else if (!pool_namespace.is_match_all()) { // restricted to a pool and/or a namespace
 return false;
- if (object_prefix.length())
+ }
+
+ if (object_prefix.length()) { // restricted to objects whose name starts with object_prefix
 return false;
+ }
 return true;
}
ostream& operator<<(ostream& out, const OSDCapGrant& g)
{
- return out << "grant(" << g.match << g.spec << ")";
+ out << "grant("; // output is always wrapped as grant( ... )
+ if (g.profile.is_valid()) { // profile grant: print the profile, then its expanded grants in brackets
+ out << g.profile << " [";
+ for (auto it = g.profile_grants.cbegin();
+ it != g.profile_grants.cend(); ++it) {
+ if (it != g.profile_grants.cbegin()) { // comma before every element except the first
+ out << ",";
+ }
+ out << *it; // recursive: each expanded grant is itself printed as grant( ... )
+ }
+ out << "]";
+ } else { // explicit grant: match followed by spec, with no separator between them
+ out << g.match << g.spec;
+ }
+ out << ")";
+ return out;
}
-
-bool OSDCap::allow_all() const
+bool OSDCapGrant::allow_all() const
{
- for (vector<OSDCapGrant>::const_iterator p = grants.begin(); p != grants.end(); ++p)
- if (p->match.is_match_all() && p->spec.allow_all())
- return true;
- return false;
-}
+ if (profile.is_valid()) { // profile grant: allow-all if any one of its expanded grants is allow-all
+ return std::any_of(profile_grants.cbegin(), profile_grants.cend(),
+ [](const OSDCapGrant& grant) {
+ return grant.allow_all();
+ });
+ }
-void OSDCap::set_allow_all()
-{
- grants.clear();
- grants.push_back(OSDCapGrant(OSDCapMatch(), OSDCapSpec(OSD_CAP_ANY)));
+ return (match.is_match_all() && spec.allow_all()); // explicit grant: unrestricted match AND allow-all spec
}
-bool OSDCap::is_capable(const string& pool_name, const string& ns, int64_t pool_auid,
- const string& object, bool op_may_read, bool op_may_write,
- const std::vector<OpRequest::ClassInfo>& classes) const
+bool OSDCapGrant::is_capable(const string& pool_name, const string& ns,
+ int64_t pool_auid, const string& object,
+ bool op_may_read, bool op_may_write,
+ const std::vector<OpRequest::ClassInfo>& classes,
+ std::vector<bool>* class_allowed) const
{
- const size_t num_classes = classes.size();
- std::vector<bool> class_allowed(num_classes, false);
osd_rwxa_t allow = 0;
- for (vector<OSDCapGrant>::const_iterator p = grants.begin();
- p != grants.end(); ++p) {
- if (p->match.is_match(pool_name, ns, pool_auid, object)) {
- allow = allow | p->spec.allow;
+ if (profile.is_valid()) {
+ return std::any_of(profile_grants.cbegin(), profile_grants.cend(),
+ [&](const OSDCapGrant& grant) {
+ return grant.is_capable(pool_name, ns, pool_auid, object, op_may_read,
+ op_may_write, classes, class_allowed);
+ });
+ } else {
+ if (match.is_match(pool_name, ns, pool_auid, object)) {
+ allow = allow | spec.allow;
if ((op_may_read && !(allow & OSD_CAP_R)) ||
- (op_may_write && !(allow & OSD_CAP_W)))
- continue;
- if (num_classes) {
+ (op_may_write && !(allow & OSD_CAP_W))) {
+ return false;
+ }
+ if (!classes.empty()) {
// check 'allow *'
- if (p->spec.allow_all())
+ if (spec.allow_all()) {
return true;
+ }
+
// compare this grant to each class in the operation
- for (size_t i = 0; i < num_classes; i++) {
+ for (size_t i = 0; i < classes.size(); ++i) {
// check 'allow class foo'
- if (classes[i].name == p->spec.class_name &&
- !p->spec.class_name.empty()) {
- class_allowed[i] = true;
+ if (!spec.class_name.empty() && classes[i].name == spec.class_name) {
+ (*class_allowed)[i] = true;
continue;
}
// check 'allow x | class-{rw}': must be on whitelist
- if (!classes[i].whitelisted)
+ if (!classes[i].whitelisted) {
continue;
+ }
if ((classes[i].read && !(allow & OSD_CAP_CLS_R)) ||
(classes[i].write && !(allow & OSD_CAP_CLS_W))) {
continue;
}
- class_allowed[i] = true;
+ (*class_allowed)[i] = true;
+ }
+ if (!std::all_of(class_allowed->cbegin(), class_allowed->cend(),
+ [](bool v) { return v; })) {
+ return false;
}
- if (std::all_of(class_allowed.cbegin(), class_allowed.cend(),
- [](bool v) { return v; }))
- return true;
- else
- continue;
}
return true;
}
return false;
}
+void OSDCapGrant::expand_profile()
+{
+  // Translate the named profile into the explicit grants it stands for and
+  // append them to profile_grants.  A name that matches none of the cases
+  // below adds nothing.
+  const std::string& which = profile.name;
+  const OSDCapPoolNamespace& scope = profile.pool_namespace;
+
+  if (which == "read-only") {
+    // read access within the profile's pool/namespace
+    profile_grants.emplace_back(OSDCapMatch(scope),
+                                OSDCapSpec(osd_rwxa_t(OSD_CAP_R)));
+  } else if (which == "read-write") {
+    // read and write access within the profile's pool/namespace
+    profile_grants.emplace_back(OSDCapMatch(scope),
+                                OSDCapSpec(osd_rwxa_t(OSD_CAP_R | OSD_CAP_W)));
+  } else if (which == "rbd") {
+    // class-read on the rbd_children / rbd_mirroring object prefixes (any
+    // pool, empty namespace), plus rwx within the profile's pool/namespace
+    profile_grants.emplace_back(OSDCapMatch("", "", "rbd_children"),
+                                OSDCapSpec(osd_rwxa_t(OSD_CAP_CLS_R)));
+    profile_grants.emplace_back(OSDCapMatch("", "", "rbd_mirroring"),
+                                OSDCapSpec(osd_rwxa_t(OSD_CAP_CLS_R)));
+    profile_grants.emplace_back(OSDCapMatch(scope),
+                                OSDCapSpec(osd_rwxa_t(OSD_CAP_R |
+                                                      OSD_CAP_W |
+                                                      OSD_CAP_X)));
+  } else if (which == "rbd-read-only") {
+    // read plus class-read within the profile's pool/namespace
+    profile_grants.emplace_back(OSDCapMatch(scope),
+                                OSDCapSpec(osd_rwxa_t(OSD_CAP_R |
+                                                      OSD_CAP_CLS_R)));
+  }
+}
+
+bool OSDCap::allow_all() const
+{
+  // The capability set is allow-all as soon as any single grant is.
+  return std::any_of(grants.cbegin(), grants.cend(),
+                     [](const OSDCapGrant& candidate) {
+                       return candidate.allow_all();
+                     });
+}
+
+void OSDCap::set_allow_all()
+{
+  // Discard whatever was granted before and leave exactly one grant: a
+  // default-constructed match paired with the OSD_CAP_ANY spec.
+  grants.clear();
+  grants.emplace_back(OSDCapMatch(), OSDCapSpec(OSD_CAP_ANY));
+}
+
+bool OSDCap::is_capable(const string& pool_name, const string& ns,
+ int64_t pool_auid, const string& object,
+ bool op_may_read, bool op_may_write,
+ const std::vector<OpRequest::ClassInfo>& classes) const
+{ // True as soon as one grant reports the operation as permitted; false when none does.
+ std::vector<bool> class_allowed(classes.size(), false); // one flag per entry of classes, shared by every grant below
+ for (auto &grant : grants) {
+ if (grant.is_capable(pool_name, ns, pool_auid, object, op_may_read,
+ op_may_write, classes, &class_allowed)) { // a grant may set flags in class_allowed; flags set by earlier grants stay set
+ return true;
+ }
+ }
+ return false;
+}
+
// grammar
namespace qi = boost::spirit::qi;
spaces = +ascii::space;
-
- // match := [pool[=]<poolname> [namespace[=]<namespace>] | auid <123>] [object_prefix <prefix>]
pool_name %= -(spaces >> lit("pool") >> (lit('=') | spaces) >> str);
nspace %= (spaces >> lit("namespace") >> (lit('=') | spaces) >> estr);
+
+ // match := [pool[=]<poolname> [namespace[=]<namespace>] | auid <123>] [object_prefix <prefix>]
auid %= (spaces >> lit("auid") >> spaces >> int_);
object_prefix %= -(spaces >> lit("object_prefix") >> spaces >> str);
- match = ( (auid >> object_prefix) [_val = phoenix::construct<OSDCapMatch>(_1, _2)] |
- (pool_name >> nspace >> object_prefix) [_val = phoenix::construct<OSDCapMatch>(_1, _2, _3)] |
- (pool_name >> object_prefix) [_val = phoenix::construct<OSDCapMatch>(_1, _2)]);
+ match = (
+ (auid >> object_prefix) [_val = phoenix::construct<OSDCapMatch>(_1, _2)] |
+ (pool_name >> nspace >> object_prefix) [_val = phoenix::construct<OSDCapMatch>(_1, _2, _3)] |
+ (pool_name >> object_prefix) [_val = phoenix::construct<OSDCapMatch>(_1, _2)]);
// rwxa := * | [r][w][x] [class-read] [class-write]
rwxa =
(spaces >> lit("class-write")[_val |= OSD_CAP_CLS_W]) ));
// capspec := * | rwx | class <name> [classcap]
- capspec =
- rwxa [_val = phoenix::construct<OSDCapSpec>(_1)] |
- ( spaces >> lit("class") >> spaces >> ((str >> spaces >> str) [_val = phoenix::construct<OSDCapSpec>(_1, _2)] |
- str [_val = phoenix::construct<OSDCapSpec>(_1, string())] ));
+ class_name %= (spaces >> lit("class") >> spaces >> str);
+ class_cap %= -(spaces >> str);
+ capspec = (
+ (rwxa) [_val = phoenix::construct<OSDCapSpec>(_1)] |
+ (class_name >> class_cap) [_val = phoenix::construct<OSDCapSpec>(_1, _2)]);
+
+ // profile := profile <name> [pool[=]<pool> [namespace[=]<namespace>]]
+ profile_name %= (lit("profile") >> (lit('=') | spaces) >> str);
+ profile = (
+ (profile_name >> pool_name >> nspace) [_val = phoenix::construct<OSDCapProfile>(_1, _2, _3)] |
+ (profile_name >> pool_name) [_val = phoenix::construct<OSDCapProfile>(_1, _2)]);
// grant := allow match capspec
- grant = (*ascii::blank >> lit("allow") >>
- ((capspec >> match) [_val = phoenix::construct<OSDCapGrant>(_2, _1)] |
- (match >> capspec) [_val = phoenix::construct<OSDCapGrant>(_1, _2)]) >>
- *ascii::blank);
+ grant = (*ascii::blank >>
+ ((lit("allow") >> capspec >> match) [_val = phoenix::construct<OSDCapGrant>(_2, _1)] |
+ (lit("allow") >> match >> capspec) [_val = phoenix::construct<OSDCapGrant>(_1, _2)] |
+ (profile) [_val = phoenix::construct<OSDCapGrant>(_1)]
+ ) >> *ascii::blank);
// osdcap := grant [grant ...]
grants %= (grant % (lit(';') | lit(',')));
- osdcap = grants [_val = phoenix::construct<OSDCap>(_1)];
+ osdcap = grants [_val = phoenix::construct<OSDCap>(_1)];
}
qi::rule<Iterator> spaces;
qi::rule<Iterator, unsigned()> rwxa;
qi::rule<Iterator, string()> unquoted_word;
qi::rule<Iterator, string()> str, estr;
qi::rule<Iterator, int()> auid;
+ qi::rule<Iterator, string()> class_name;
+ qi::rule<Iterator, string()> class_cap;
qi::rule<Iterator, OSDCapSpec()> capspec;
qi::rule<Iterator, string()> pool_name;
qi::rule<Iterator, string()> nspace;
qi::rule<Iterator, string()> object_prefix;
qi::rule<Iterator, OSDCapMatch()> match;
+ qi::rule<Iterator, string()> profile_name;
+ qi::rule<Iterator, OSDCapProfile()> profile;
qi::rule<Iterator, OSDCapGrant()> grant;
qi::rule<Iterator, std::vector<OSDCapGrant>()> grants;
qi::rule<Iterator, OSDCap()> osdcap;
-// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
// vim: ts=8 sw=2 smarttab
/*
* Ceph - scalable distributed file system
*
* This is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
- * License version 2.1, as published by the Free Software
+ * License version 2.1, as published by the Free Software
* Foundation. See file COPYING.
- *
- * OSDCaps: Hold the capabilities associated with a single authenticated
+ *
+ * OSDCaps: Hold the capabilities associated with a single authenticated
* user key. These are specified by text strings of the form
* "allow r" (which allows reading anything on the OSD)
* "allow rwx auid foo" (which allows full access to listed auids)
#include "include/types.h"
#include "OpRequest.h"
+#include <list>
+#include <vector>
+#include <boost/optional.hpp>
+
static const __u8 OSD_CAP_R = (1 << 1); // read
static const __u8 OSD_CAP_W = (1 << 2); // write
static const __u8 OSD_CAP_CLS_R = (1 << 3); // class read
ostream& operator<<(ostream& out, const OSDCapSpec& s);
+struct OSDCapPoolNamespace { // Optional pool-name / namespace restriction, embedded in OSDCapMatch and OSDCapProfile.
+ std::string pool_name; // empty: no restriction on the pool
+ boost::optional<std::string> nspace = boost::none; // boost::none: no restriction; a set value, even "", must compare equal
+
+ OSDCapPoolNamespace() {
+ }
+ OSDCapPoolNamespace(const std::string& pool_name,
+ const boost::optional<std::string>& nspace = boost::none)
+ : pool_name(pool_name), nspace(nspace) {
+ }
+
+ bool is_match(const std::string& pn, const std::string& ns) const; // does (pn, ns) satisfy the restriction?
+ bool is_match_all() const; // true when neither pool nor namespace is restricted
+};
+
+ostream& operator<<(ostream& out, const OSDCapPoolNamespace& pns);
+
struct OSDCapMatch {
// auid and pool_name/nspace are mutually exclusive
- int64_t auid;
- std::string pool_name;
- bool is_nspace; // true if nspace is defined; false if not constrained.
- std::string nspace;
-
+ int64_t auid = CEPH_AUTH_UID_DEFAULT;
+ OSDCapPoolNamespace pool_namespace;
std::string object_prefix;
- OSDCapMatch() : auid(CEPH_AUTH_UID_DEFAULT), is_nspace(false) {}
- OSDCapMatch(std::string pl, std::string pre) :
- auid(CEPH_AUTH_UID_DEFAULT), pool_name(pl), is_nspace(false), object_prefix(pre) {}
- OSDCapMatch(std::string pl, std::string ns, std::string pre) :
- auid(CEPH_AUTH_UID_DEFAULT), pool_name(pl), is_nspace(true), nspace(ns), object_prefix(pre) {}
- OSDCapMatch(uint64_t auid, std::string pre) : auid(auid), is_nspace(false), object_prefix(pre) {}
+ OSDCapMatch() {}
+ OSDCapMatch(const OSDCapPoolNamespace& pns) : pool_namespace(pns) {}
+ OSDCapMatch(const std::string& pl, const std::string& pre)
+ : pool_namespace(pl), object_prefix(pre) {}
+ OSDCapMatch(const std::string& pl, const std::string& ns,
+ const std::string& pre)
+ : pool_namespace(pl, ns), object_prefix(pre) {}
+ OSDCapMatch(uint64_t auid, const std::string& pre)
+ : auid(auid), object_prefix(pre) {}
/**
* check if given request parameters match our constraints
* @param object object name
* @return true if we match, false otherwise
*/
- bool is_match(const std::string& pool_name, const std::string& nspace_name, int64_t pool_auid, const std::string& object) const;
+ bool is_match(const std::string& pool_name, const std::string& nspace_name,
+ int64_t pool_auid, const std::string& object) const;
bool is_match_all() const;
};
ostream& operator<<(ostream& out, const OSDCapMatch& m);
+struct OSDCapProfile { // A named capability profile plus the pool/namespace it is limited to.
+ std::string name; // profile name; empty marks "no profile" (see is_valid)
+ OSDCapPoolNamespace pool_namespace; // restriction built from the constructor's pool_name / nspace arguments
+
+ OSDCapProfile() {
+ }
+ OSDCapProfile(const std::string& name,
+ const std::string& pool_name,
+ const boost::optional<std::string>& nspace = boost::none)
+ : name(name), pool_namespace(pool_name, nspace) {
+ }
+
+ inline bool is_valid() const { // a default-constructed profile has an empty name and is therefore not valid
+ return !name.empty();
+ }
+};
+
+ostream& operator<<(ostream& out, const OSDCapProfile& m);
+
struct OSDCapGrant {
 OSDCapMatch match;
 OSDCapSpec spec;
+ OSDCapProfile profile; // valid only for grants built from a profile; default-constructed otherwise
+
+ // explicit grants that a profile grant expands to; filled in by
+ // expand_profile(), which the profile constructor below calls.
+ std::list<OSDCapGrant> profile_grants;
 OSDCapGrant() {}
- OSDCapGrant(OSDCapMatch m, OSDCapSpec s) : match(m), spec(s) {}
+ OSDCapGrant(const OSDCapMatch& m, const OSDCapSpec& s) : match(m), spec(s) {} // explicit grant: profile stays empty
+ OSDCapGrant(const OSDCapProfile& profile) : profile(profile) { // profile grant: match and spec stay default-constructed
+ expand_profile();
+ }
+
+ bool allow_all() const; // true when this grant permits everything
+ bool is_capable(const string& pool_name, const string& ns, int64_t pool_auid,
+ const string& object, bool op_may_read, bool op_may_write,
+ const std::vector<OpRequest::ClassInfo>& classes,
+ std::vector<bool>* class_allowed) const; // class_allowed is an in/out vector with one flag per entry of classes
+
+ void expand_profile(); // appends the explicit grants for profile.name to profile_grants
};
ostream& operator<<(ostream& out, const OSDCapGrant& g);
for (auto &state : st)
f->dump_string("state", state);
f->close_section();
+ f->close_section();
}
f->close_section();
features |= CEPH_FEATURE_CRUSH_V4;
if (crush->has_nondefault_tunables5())
features |= CEPH_FEATURE_CRUSH_TUNABLES5;
- if (crush->has_incompat_choose_args())
- features |= CEPH_FEATURE_CRUSH_CHOOSE_ARGS;
+ if (crush->has_incompat_choose_args()) {
+ features |= CEPH_FEATUREMASK_CRUSH_CHOOSE_ARGS;
+ }
mask |= CEPH_FEATURES_CRUSH;
if (!pg_upmap.empty() || !pg_upmap_items.empty())
require_osd_release = inc.new_require_osd_release;
if (require_osd_release >= CEPH_RELEASE_LUMINOUS) {
flags &= ~(CEPH_OSDMAP_LEGACY_REQUIRE_FLAGS);
+ flags |= CEPH_OSDMAP_RECOVERY_DELETES;
}
}
auto q = pg_upmap_items.find(pg);
if (q != pg_upmap_items.end()) {
- // NOTE: this approach does not allow a bidirectional swap,
- // e.g., [[1,2],[2,1]] applied to [0,1,2] -> [0,2,1].
- for (auto& r : q->second) {
- // make sure the replacement value doesn't already appear
- bool exists = false;
- ssize_t pos = -1;
- for (unsigned i = 0; i < raw->size(); ++i) {
- int osd = (*raw)[i];
- if (osd == r.second) {
- exists = true;
- break;
- }
- // ignore mapping if target is marked out (or invalid osd id)
- if (osd == r.first &&
- pos < 0 &&
- !(r.second != CRUSH_ITEM_NONE && r.second < max_osd &&
- osd_weight[r.second] == 0)) {
- pos = i;
- }
- }
- if (!exists && pos >= 0) {
- (*raw)[pos] = r.second;
+ for (auto& i : *raw) {
+ for (auto& r : q->second) {
+ if (r.first != i) {
+ continue;
+ }
+ if (!(r.second != CRUSH_ITEM_NONE &&
+ r.second < max_osd &&
+ osd_weight[r.second] == 0)) {
+ i = r.second;
+ }
+ break;
}
}
}
if (v < 4) {
decltype(flags) f = flags;
if (require_osd_release >= CEPH_RELEASE_LUMINOUS)
- f |= CEPH_OSDMAP_REQUIRE_LUMINOUS;
+ f |= CEPH_OSDMAP_REQUIRE_LUMINOUS | CEPH_OSDMAP_RECOVERY_DELETES;
else if (require_osd_release == CEPH_RELEASE_KRAKEN)
f |= CEPH_OSDMAP_REQUIRE_KRAKEN;
else if (require_osd_release == CEPH_RELEASE_JEWEL)
::decode(require_osd_release, bl);
if (require_osd_release >= CEPH_RELEASE_LUMINOUS) {
flags &= ~(CEPH_OSDMAP_LEGACY_REQUIRE_FLAGS);
+ flags |= CEPH_OSDMAP_RECOVERY_DELETES;
}
} else {
if (flags & CEPH_OSDMAP_REQUIRE_LUMINOUS) {
// only for compat with post-kraken pre-luminous test clusters
require_osd_release = CEPH_RELEASE_LUMINOUS;
flags &= ~(CEPH_OSDMAP_LEGACY_REQUIRE_FLAGS);
+ flags |= CEPH_OSDMAP_RECOVERY_DELETES;
} else if (flags & CEPH_OSDMAP_REQUIRE_KRAKEN) {
require_osd_release = CEPH_RELEASE_KRAKEN;
} else if (flags & CEPH_OSDMAP_REQUIRE_JEWEL) {
s += ",require_kraken_osds";
if (f & CEPH_OSDMAP_REQUIRE_LUMINOUS)
s += ",require_luminous_osds";
+ if (f & CEPH_OSDMAP_RECOVERY_DELETES)
+ s += ",recovery_deletes";
if (s.length())
s.erase(0, 1);
return s;
return get_flag_string(flags);
}
-struct qi {
- int item;
- int depth;
- float weight;
- qi() : item(0), depth(0), weight(0) {}
- qi(int i, int d, float w) : item(i), depth(d), weight(w) {}
-};
-
void OSDMap::print_pools(ostream& out) const
{
for (const auto &pool : pools) {
OSDTreePlainDumper(const CrushWrapper *crush, const OSDMap *osdmap_,
unsigned f)
- : Parent(crush), osdmap(osdmap_), filter(f) { }
+ : Parent(crush, osdmap_->get_pool_names()), osdmap(osdmap_), filter(f) { }
bool should_dump_leaf(int i) const override {
- if (((filter & OSDMap::DUMP_UP) && !osdmap->is_up(i)) ||
- ((filter & OSDMap::DUMP_DOWN) && !osdmap->is_down(i)) ||
- ((filter & OSDMap::DUMP_IN) && !osdmap->is_in(i)) ||
- ((filter & OSDMap::DUMP_OUT) && !osdmap->is_out(i))) {
- return false;
+ if (!filter) {
+ return true; // normal case: no filter flags set, every leaf is dumped
+ }
+ if (((filter & OSDMap::DUMP_UP) && osdmap->is_up(i)) || // flags combine as OR: one matching state is enough
+ ((filter & OSDMap::DUMP_DOWN) && osdmap->is_down(i)) ||
+ ((filter & OSDMap::DUMP_IN) && osdmap->is_in(i)) ||
+ ((filter & OSDMap::DUMP_OUT) && osdmap->is_out(i)) ||
+ ((filter & OSDMap::DUMP_DESTROYED) && osdmap->is_destroyed(i))) {
+ return true;
 }
- return true;
+ return false; // filter flags were given and none of them matched osd i
}
bool should_dump_empty_bucket() const override {
tbl->define_column("CLASS", TextTable::LEFT, TextTable::RIGHT);
tbl->define_column("WEIGHT", TextTable::LEFT, TextTable::RIGHT);
tbl->define_column("TYPE NAME", TextTable::LEFT, TextTable::LEFT);
- tbl->define_column("UP/DOWN", TextTable::LEFT, TextTable::RIGHT);
+ tbl->define_column("STATUS", TextTable::LEFT, TextTable::RIGHT);
tbl->define_column("REWEIGHT", TextTable::LEFT, TextTable::RIGHT);
tbl->define_column("PRI-AFF", TextTable::LEFT, TextTable::RIGHT);
for (int i = 0; i < osdmap->get_max_osd(); i++) {
if (osdmap->exists(i) && !is_touched(i) && should_dump_leaf(i)) {
- dump_item(CrushTreeDumper::Item(i, 0, 0), tbl);
+ dump_item(CrushTreeDumper::Item(i, 0, 0, 0), tbl);
}
}
}
*tbl << "DNE"
<< 0;
} else {
- *tbl << (osdmap->is_up(qi.id) ? "up" : "down")
+ string s;
+ if (osdmap->is_up(qi.id)) {
+ s = "up";
+ } else if (osdmap->is_destroyed(qi.id)) {
+ s = "destroyed";
+ } else {
+ s = "down";
+ }
+ *tbl << s
<< weightf_t(osdmap->get_weightf(qi.id))
<< weightf_t(osdmap->get_primary_affinityf(qi.id));
}
OSDTreeFormattingDumper(const CrushWrapper *crush, const OSDMap *osdmap_,
unsigned f)
- : Parent(crush), osdmap(osdmap_), filter(f) { }
+ : Parent(crush, osdmap_->get_pool_names()), osdmap(osdmap_), filter(f) { }
bool should_dump_leaf(int i) const override {
- if (((filter & OSDMap::DUMP_UP) && !osdmap->is_up(i)) ||
- ((filter & OSDMap::DUMP_DOWN) && !osdmap->is_down(i)) ||
- ((filter & OSDMap::DUMP_IN) && !osdmap->is_in(i)) ||
- ((filter & OSDMap::DUMP_OUT) && !osdmap->is_out(i))) {
- return false;
+ if (!filter) {
+ return true; // normal case: no filter flags set, every leaf is dumped
+ }
+ if (((filter & OSDMap::DUMP_UP) && osdmap->is_up(i)) || // flags combine as OR: one matching state is enough
+ ((filter & OSDMap::DUMP_DOWN) && osdmap->is_down(i)) ||
+ ((filter & OSDMap::DUMP_IN) && osdmap->is_in(i)) ||
+ ((filter & OSDMap::DUMP_OUT) && osdmap->is_out(i)) ||
+ ((filter & OSDMap::DUMP_DESTROYED) && osdmap->is_destroyed(i))) {
+ return true;
 }
- return true;
+ return false; // filter flags were given and none of them matched osd i
}
bool should_dump_empty_bucket() const override {
f->open_array_section("stray");
for (int i = 0; i < osdmap->get_max_osd(); i++) {
if (osdmap->exists(i) && !is_touched(i) && should_dump_leaf(i))
- dump_item(CrushTreeDumper::Item(i, 0, 0), f);
+ dump_item(CrushTreeDumper::Item(i, 0, 0, 0), f);
}
f->close_section();
}
Parent::dump_item_fields(qi, f);
if (!qi.is_bucket())
{
+ string s;
+ if (osdmap->is_up(qi.id)) {
+ s = "up";
+ } else if (osdmap->is_destroyed(qi.id)) {
+ s = "destroyed";
+ } else {
+ s = "down";
+ }
f->dump_unsigned("exists", (int)osdmap->exists(qi.id));
- f->dump_string("status", osdmap->is_up(qi.id) ? "up" : "down");
+ f->dump_string("status", s);
f->dump_float("reweight", osdmap->get_weightf(qi.id));
f->dump_float("primary_affinity", osdmap->get_primary_affinityf(qi.id));
}
pools[pool].set_pg_num(poolbase << pg_bits);
pools[pool].set_pgp_num(poolbase << pgp_bits);
pools[pool].last_change = epoch;
+ pools[pool].application_metadata.insert(
+ {pg_pool_t::APPLICATION_NAME_RBD, {}});
pool_name[pool] = plname;
name_pool[plname] = pool;
}
OSDUtilizationDumper(const CrushWrapper *crush, const OSDMap *osdmap_,
const PGStatService *pgs_, bool tree_) :
- Parent(crush),
+ Parent(crush, osdmap_->get_pool_names()),
osdmap(osdmap_),
pgs(pgs_),
tree(tree_),
void dump_stray(F *f) {
for (int i = 0; i < osdmap->get_max_osd(); i++) {
if (osdmap->exists(i) && !this->is_touched(i))
- dump_item(CrushTreeDumper::Item(i, 0, 0), f);
+ dump_item(CrushTreeDumper::Item(i, 0, 0, 0), f);
}
}
const size_t num_pgs,
Formatter *f) override {
f->open_object_section("item");
- CrushTreeDumper::dump_item_fields(crush, qi, f);
+ CrushTreeDumper::dump_item_fields(crush, weight_set_names, qi, f);
f->dump_float("reweight", reweight);
f->dump_int("kb", kb);
f->dump_int("kb_used", kb_used);
assert(i != pool_name.end());
return i->second;
}
+ const mempool::osdmap::map<int64_t,string>& get_pool_names() const { // read-only reference to the pool_name member map (int64_t -> string)
+ return pool_name;
+ }
bool have_pg_pool(int64_t p) const {
return pools.count(p);
}
void print_oneline_summary(ostream& out) const;
enum {
- DUMP_IN = 1, // only 'in' osds
- DUMP_OUT = 2, // only 'out' osds
- DUMP_UP = 4, // only 'up' osds
- DUMP_DOWN = 8, // only 'down' osds
+ DUMP_IN = 1, // only 'in' osds
+ DUMP_OUT = 2, // only 'out' osds
+ DUMP_UP = 4, // only 'up' osds
+ DUMP_DOWN = 8, // only 'down' osds
+ DUMP_DESTROYED = 16, // only 'destroyed' osds
};
void print_tree(Formatter *f, ostream *out, unsigned dump_flags=0) const;
} else if (req->get_type() == MSG_OSD_REPOP) {
reqid = static_cast<MOSDRepOp*>(req)->reqid;
}
+ req_src_inst = req->get_source_inst();
mark_event("header_read", request->get_recv_stamp());
mark_event("throttled", request->get_throttle_stamp());
mark_event("all_read", request->get_recv_complete_stamp());
if (m->get_orig_source().is_client()) {
f->open_object_section("client_info");
stringstream client_name, client_addr;
- client_name << m->get_orig_source();
- client_addr << m->get_orig_source_addr();
+ client_name << req_src_inst.name;
+ client_addr << req_src_inst.addr;
f->dump_string("client", client_name.str());
f->dump_string("client_addr", client_addr.str());
f->dump_unsigned("tid", m->get_tid());
flag, s.c_str(), old_flags, hit_flag_points);
}
+bool OpRequest::filter_out(const set<string>& filters)
+{
+  // Keep only the filter strings that parse as addresses; strings that do
+  // not parse are ignored.
+  set<entity_addr_t> wanted;
+  for (const string& text : filters) {
+    entity_addr_t parsed;
+    if (parsed.parse(text.c_str())) {
+      wanted.insert(parsed);
+    }
+  }
+
+  // With no usable address filter there is nothing to compare against.
+  if (wanted.empty()) {
+    return true;
+  }
+
+  // Compare the request's source address three times, each time less
+  // specific: as recorded, with the nonce zeroed, then with the port
+  // zeroed as well.
+  entity_addr_t probe = req_src_inst.addr;
+  if (wanted.count(probe)) {
+    return true;
+  }
+
+  probe.set_nonce(0);
+  if (wanted.count(probe)) {
+    return true;
+  }
+
+  probe.set_port(0);
+  return wanted.count(probe) != 0;
+}
+
ostream& operator<<(ostream& out, const OpRequest::ClassInfo& i)
{
out << "class " << i.name << " rd " << i.read
private:
Message *request; /// the logical request we are tracking
osd_reqid_t reqid;
+ entity_inst_t req_src_inst;
uint8_t hit_flag_points;
uint8_t latest_flag_point;
utime_t dequeued_time;
protected:
void _dump_op_descriptor_unlocked(ostream& stream) const override;
void _unregistered() override;
+ bool filter_out(const set<string>& filters) override;
public:
~OpRequest() override {
#include "messages/MOSDSubOpReply.h"
#include "messages/MOSDRepOpReply.h"
#include "messages/MOSDRepScrubMap.h"
+#include "messages/MOSDPGRecoveryDelete.h"
+#include "messages/MOSDPGRecoveryDeleteReply.h"
#include "common/BackTrace.h"
#include "common/EventTrace.h"
}
#endif
+
void PGPool::update(OSDMapRef map)
{
const pg_pool_t *pi = map->get_pg_pool(id);
dirty_info(false), dirty_big_info(false),
info(p),
info_struct_v(0),
- coll(p), pg_log(cct),
+ coll(p),
+ pg_log(cct),
pgmeta_oid(p.make_pgmeta_oid()),
missing_loc(this),
past_intervals(
bool PG::MissingLoc::readable_with_acting(
const hobject_t &hoid,
const set<pg_shard_t> &acting) const {
- if (!needs_recovery(hoid)) return true;
+ if (!needs_recovery(hoid))
+ return true;
+ if (is_deleted(hoid))
+ return false;
auto missing_loc_entry = missing_loc.find(hoid);
- if (missing_loc_entry == missing_loc.end()) return false;
+ if (missing_loc_entry == missing_loc.end())
+ return false;
const set<pg_shard_t> &locs = missing_loc_entry->second;
ldout(pg->cct, 10) << __func__ << ": locs:" << locs << dendl;
set<pg_shard_t> have_acting;
handle->reset_tp_timeout();
loop = 0;
}
+ if (i->second.is_delete())
+ continue;
missing_loc[i->first].insert(sources.begin(), sources.end());
missing_loc_sources.insert(sources.begin(), sources.end());
}
handle->reset_tp_timeout();
loop = 0;
}
+ if (p->second.is_delete()) {
+ ldout(pg->cct, 10) << __func__ << " " << soid
+ << " delete, ignoring source" << dendl;
+ found_missing = true;
+ continue;
+ }
if (oinfo.last_update < need) {
ldout(pg->cct, 10) << "search_for_missing " << soid << " " << need
<< " also missing on osd." << fromosd
if (m && pi.last_backfill != hobject_t()) {
for (list<pg_log_entry_t>::iterator p = m->log.log.begin();
p != m->log.log.end();
- ++p)
+ ++p) {
if (p->soid <= pi.last_backfill &&
- !p->is_error())
- pm.add_next_event(*p);
+ !p->is_error()) {
+ if (perform_deletes_during_peering() && p->is_delete()) {
+ pm.rm(p->soid, p->version);
+ } else {
+ pm.add_next_event(*p);
+ }
+ }
+ }
}
-
+
if (m) {
dout(10) << "activate peer osd." << peer << " sending " << m->log << dendl;
//m->log.print(cout);
for (set<pg_shard_t>::iterator i = actingbackfill.begin();
i != actingbackfill.end();
++i) {
+ dout(20) << __func__ << " setting up missing_loc from shard " << *i << " " << dendl;
if (*i == get_primary()) {
missing_loc.add_active_missing(missing);
if (!missing.have_missing())
op->need_write_cap(),
op->classes());
- dout(20) << "op_has_sufficient_caps pool=" << pool.id << " (" << pool.name
- << " " << req->get_hobj().nspace
+ dout(20) << "op_has_sufficient_caps "
+ << "session=" << session
+ << " pool=" << pool.id << " (" << pool.name
+ << " " << req->get_hobj().nspace
<< ") owner=" << pool.auid
<< " need_read_cap=" << op->need_read_cap()
<< " need_write_cap=" << op->need_write_cap()
}
}
-void PG::queue_recovery(bool front)
+void PG::queue_recovery()
{
if (!is_primary() || !is_peered()) {
dout(10) << "queue_recovery -- not primary or not peered " << dendl;
} else {
dout(10) << "queue_recovery -- queuing" << dendl;
recovery_queued = true;
- osd->queue_for_recovery(this, front);
+ osd->queue_for_recovery(this);
}
}
void PG::mark_clean()
{
if (actingset.size() == get_osdmap()->get_pg_size(info.pgid.pgid)) {
+ state_clear(PG_STATE_FORCED_BACKFILL | PG_STATE_FORCED_RECOVERY);
state_set(PG_STATE_CLEAN);
info.history.last_epoch_clean = get_osdmap()->get_epoch();
info.history.last_interval_clean = info.history.same_interval_since;
kick_snap_trim();
}
-unsigned PG::get_recovery_priority()
+void PG::change_recovery_force_mode(int new_mode, bool clear) // clear == true: state_clear(new_mode); otherwise state_set(new_mode)
{
- // a higher value -> a higher priority
+ lock(true); // paired with unlock() below; NOTE(review): meaning of the 'true' argument is not visible here -- confirm
+ if (clear) {
+ state_clear(new_mode);
+ } else {
+ state_set(new_mode);
+ }
+ publish_stats_to_osd(); // runs after the state change and before the unlock
- int pool_recovery_priority = 0;
- pool.info.opts.get(pool_opts_t::RECOVERY_PRIORITY, &pool_recovery_priority);
+ unlock();
+}
- int ret = OSD_RECOVERY_PRIORITY_BASE + pool_recovery_priority;
+inline int PG::clamp_recovery_priority(int priority)
+{ // Returns priority limited to the range [OSD_RECOVERY_PRIORITY_MIN, OSD_RECOVERY_PRIORITY_MAX].
+ static_assert(OSD_RECOVERY_PRIORITY_MIN < OSD_RECOVERY_PRIORITY_MAX, "Invalid priority range");
+ static_assert(OSD_RECOVERY_PRIORITY_MIN >= 0, "Priority range must match unsigned type"); // get_recovery_priority() casts the clamped value to unsigned
 // Clamp to valid range
- if (ret > OSD_RECOVERY_PRIORITY_MAX) {
- ret = OSD_RECOVERY_PRIORITY_MAX;
- } else if (ret < OSD_RECOVERY_PRIORITY_MIN) {
- ret = OSD_RECOVERY_PRIORITY_MIN;
+ if (priority > OSD_RECOVERY_PRIORITY_MAX) { // above the range: pin to the maximum
+ return OSD_RECOVERY_PRIORITY_MAX;
+ } else if (priority < OSD_RECOVERY_PRIORITY_MIN) { // below the range: pin to the minimum
+ return OSD_RECOVERY_PRIORITY_MIN;
+ } else { // already inside the range: returned unchanged
+ return priority;
 }
+}
- static_assert(OSD_RECOVERY_PRIORITY_MIN < OSD_RECOVERY_PRIORITY_MAX, "Invalid priority range");
- static_assert(OSD_RECOVERY_PRIORITY_MIN >= 0, "Priority range must match unsigned type");
+unsigned PG::get_recovery_priority()
+{
+ // a higher value -> a higher priority
+ int ret = 0;
+ if (state & PG_STATE_FORCED_RECOVERY) {
+ ret = OSD_RECOVERY_PRIORITY_FORCED;
+ } else {
+ pool.info.opts.get(pool_opts_t::RECOVERY_PRIORITY, &ret);
+ ret = clamp_recovery_priority(OSD_RECOVERY_PRIORITY_BASE + ret);
+ }
+ dout(20) << __func__ << " recovery priority for " << *this << " is " << ret << ", state is " << state << dendl;
return static_cast<unsigned>(ret);
}
unsigned PG::get_backfill_priority()
{
// a higher value -> a higher priority
-
int ret = OSD_BACKFILL_PRIORITY_BASE;
- if (acting.size() < pool.info.min_size) {
- // inactive: no. of replicas < min_size, highest priority since it blocks IO
- ret = OSD_BACKFILL_INACTIVE_PRIORITY_BASE + (pool.info.min_size - acting.size());
+ if (state & PG_STATE_FORCED_BACKFILL) {
+ ret = OSD_RECOVERY_PRIORITY_FORCED;
+ } else {
+ if (acting.size() < pool.info.min_size) {
+ // inactive: no. of replicas < min_size, highest priority since it blocks IO
+ ret = OSD_BACKFILL_INACTIVE_PRIORITY_BASE + (pool.info.min_size - acting.size());
- } else if (is_undersized()) {
- // undersized: OSD_BACKFILL_DEGRADED_PRIORITY_BASE + num missing replicas
- assert(pool.info.size > actingset.size());
- ret = OSD_BACKFILL_DEGRADED_PRIORITY_BASE + (pool.info.size - actingset.size());
+ } else if (is_undersized()) {
+ // undersized: OSD_BACKFILL_DEGRADED_PRIORITY_BASE + num missing replicas
+ assert(pool.info.size > actingset.size());
+ ret = OSD_BACKFILL_DEGRADED_PRIORITY_BASE + (pool.info.size - actingset.size());
- } else if (is_degraded()) {
- // degraded: baseline degraded
- ret = OSD_BACKFILL_DEGRADED_PRIORITY_BASE;
- }
+ } else if (is_degraded()) {
+ // degraded: baseline degraded
+ ret = OSD_BACKFILL_DEGRADED_PRIORITY_BASE;
+ }
- // Adjust with pool's recovery priority
- int pool_recovery_priority = 0;
- pool.info.opts.get(pool_opts_t::RECOVERY_PRIORITY, &pool_recovery_priority);
- ret += pool_recovery_priority;
+ // Adjust with pool's recovery priority
+ int pool_recovery_priority = 0;
+ pool.info.opts.get(pool_opts_t::RECOVERY_PRIORITY, &pool_recovery_priority);
- // Clamp to valid range
- if (ret > OSD_RECOVERY_PRIORITY_MAX) {
- ret = OSD_RECOVERY_PRIORITY_MAX;
- } else if (ret < OSD_RECOVERY_PRIORITY_MIN) {
- ret = OSD_RECOVERY_PRIORITY_MIN;
+ ret = clamp_recovery_priority(pool_recovery_priority + ret);
}
return static_cast<unsigned>(ret);
auto last = logv.rbegin();
if (is_primary() && last != logv.rend()) {
projected_log.skip_can_rollback_to_to_head();
- projected_log.trim(cct, last->version, nullptr);
+ projected_log.trim(cct, last->version, nullptr, nullptr, nullptr);
}
if (transaction_applied && roll_forward_to > pg_log.get_can_rollback_to()) {
// sloppy check
if ((pg_log.get_log().log.begin()->version <= pg_log.get_tail()))
osd->clog->error() << info.pgid
- << " log bound mismatch, info (" << pg_log.get_tail() << ","
- << pg_log.get_head() << "]"
+ << " log bound mismatch, info (tail,head] ("
+ << pg_log.get_tail() << "," << pg_log.get_head() << "]"
<< " actual ["
<< pg_log.get_log().log.begin()->version << ","
<< pg_log.get_log().log.rbegin()->version << "]";
} else {
osd->clog->error() << "osd." << osd->whoami
<< " pg " << info.pgid
- << " Regular scrub request, losing deep-scrub details";
+ << " Regular scrub request, deep-scrub details will be lost";
}
}
queue_scrub();
assert(0);
}
if (bad_peer != primary) {
- peer_missing[bad_peer].add(soid, oi.version, eversion_t());
+ peer_missing[bad_peer].add(soid, oi.version, eversion_t(), false);
} else {
// We should only be scrubbing if the PG is clean.
assert(waiting_for_unreadable_object.empty());
return can_discard_replica_op<MOSDSubOpReply, MSG_OSD_SUBOPREPLY>(op);
case MSG_OSD_REPOPREPLY:
return can_discard_replica_op<MOSDRepOpReply, MSG_OSD_REPOPREPLY>(op);
+ case MSG_OSD_PG_RECOVERY_DELETE:
+ return can_discard_replica_op<MOSDPGRecoveryDelete, MSG_OSD_PG_RECOVERY_DELETE>(op);
+
+ case MSG_OSD_PG_RECOVERY_DELETE_REPLY:
+ return can_discard_replica_op<MOSDPGRecoveryDeleteReply, MSG_OSD_PG_RECOVERY_DELETE_REPLY>(op);
case MSG_OSD_EC_WRITE:
return can_discard_replica_op<MOSDECSubOpWrite, MSG_OSD_EC_WRITE>(op);
pg->backfill_reserved = false;
pg->backfill_reserving = false;
pg->state_clear(PG_STATE_BACKFILL);
+ pg->state_clear(PG_STATE_FORCED_BACKFILL | PG_STATE_FORCED_RECOVERY);
utime_t dur = ceph_clock_now() - enter_time;
pg->osd->recoverystate_perf->tinc(rs_backfilling_latency, dur);
}
{
PG *pg = context< RecoveryMachine >().pg;
pg->state_clear(PG_STATE_RECOVERING);
+ pg->state_clear(PG_STATE_FORCED_RECOVERY);
release_reservations();
return transit<Recovered>();
}
{
PG *pg = context< RecoveryMachine >().pg;
pg->state_clear(PG_STATE_RECOVERING);
+ pg->state_clear(PG_STATE_FORCED_RECOVERY);
release_reservations();
return transit<WaitRemoteBackfillReserved>();
}
if (pg->get_osdmap()->get_pg_size(pg->info.pgid.pgid) <=
pg->actingbackfill.size()) {
pg->state_clear(PG_STATE_DEGRADED);
+ pg->state_clear(PG_STATE_FORCED_BACKFILL | PG_STATE_FORCED_RECOVERY);
pg->publish_stats_to_osd();
}
pg->all_unfound_are_queried_or_lost(pg->get_osdmap())) {
if (pg->cct->_conf->osd_auto_mark_unfound_lost) {
pg->osd->clog->error() << pg->info.pgid.pgid << " has " << unfound
- << " objects unfound and apparently lost, would automatically marking lost but NOT IMPLEMENTED";
+ << " objects unfound and apparently lost, would automatically "
+ << "mark these objects lost but this feature is not yet implemented "
+ << "(osd_auto_mark_unfound_lost)";
} else
- pg->osd->clog->error() << pg->info.pgid.pgid << " has " << unfound << " objects unfound and apparently lost";
+ pg->osd->clog->error() << pg->info.pgid.pgid << " has "
+ << unfound << " objects unfound and apparently lost";
}
if (pg->is_active()) {
*v = i->second.need;
return true;
}
+ /// True when hoid has an entry in needs_recovery_map and that entry
+ /// reports is_delete(); objects without an entry yield false.
+ bool is_deleted(const hobject_t &hoid) const {
+   const auto entry = needs_recovery_map.find(hoid);
+   return entry != needs_recovery_map.end() && entry->second.is_delete();
+ }
bool is_unfound(const hobject_t &hoid) const {
- return needs_recovery(hoid) && (
+ return needs_recovery(hoid) && !is_deleted(hoid) && (
!missing_loc.count(hoid) ||
!(*is_recoverable)(missing_loc.find(hoid)->second));
}
if (j == needs_recovery_map.end()) {
needs_recovery_map.insert(*i);
} else {
+ lgeneric_dout(pg->cct, 0) << this << " " << pg->info.pgid << " unexpected need for "
+ << i->first << " have " << j->second
+ << " tried to add " << i->second << dendl;
assert(i->second.need == j->second.need);
}
}
return; // recovered!
needs_recovery_map[hoid] = *item;
+ if (item->is_delete())
+ return;
auto mliter =
missing_loc.insert(make_pair(hoid, set<pg_shard_t>())).first;
assert(info.last_backfill.is_max());
bool needs_recovery() const;
bool needs_backfill() const;
+ /// clip calculated priority to reasonable range
+ inline int clamp_recovery_priority(int priority);
/// get log recovery reservation priority
unsigned get_recovery_priority();
/// get backfill reservation priority
unsigned get_backfill_priority();
void mark_clean(); ///< mark an active pg clean
+ void change_recovery_force_mode(int new_mode, bool clear);
/// return [start,end) bounds for required past_intervals
static pair<epoch_t, epoch_t> get_required_past_interval_bounds(
struct NotRecovering : boost::statechart::state< NotRecovering, Active>, NamedState {
typedef boost::mpl::list<
- boost::statechart::transition< DoRecovery, WaitLocalRecoveryReserved >
+ boost::statechart::transition< DoRecovery, WaitLocalRecoveryReserved >,
+ boost::statechart::custom_reaction< CancelRecovery >
> reactions;
explicit NotRecovering(my_context ctx);
+ // CancelRecovery needs no handling while in NotRecovering: the event is
+ // simply discarded.
+ boost::statechart::result react(const CancelRecovery& evt) {
+ /* no-op */
+ return discard_event();
+ }
void exit();
};
uint64_t get_min_acting_features() const { return acting_features; }
uint64_t get_min_upacting_features() const { return upacting_features; }
+ /// Returns true only while the current OSDMap does not have the
+ /// CEPH_OSDMAP_RECOVERY_DELETES flag set.
+ bool perform_deletes_during_peering() const {
+   const bool recovery_deletes_flag =
+     get_osdmap()->test_flag(CEPH_OSDMAP_RECOVERY_DELETES);
+   return !recovery_deletes_flag;
+ }
void init_primary_up_acting(
const vector<int> &newup,
virtual void kick_snap_trim() = 0;
virtual void snap_trimmer_scrub_complete() = 0;
bool requeue_scrub(bool high_priority = false);
- void queue_recovery(bool front = false);
+ void queue_recovery();
bool queue_scrub();
unsigned get_scrub_priority();
#include "OSDMap.h"
#include "PGLog.h"
#include "common/LogClient.h"
+#include "messages/MOSDPGRecoveryDelete.h"
+#include "messages/MOSDPGRecoveryDeleteReply.h"
#define dout_context cct
#define dout_subsys ceph_subsys_osd
return *_dout << pgb->get_parent()->gen_dbg_prefix();
}
+// Schedule removal of oid (at version v) on every other acting/backfill
+// shard that still lists it as missing: the (oid, v) pair is appended to
+// h->deletes for that shard and peer recovery is started for it.
+void PGBackend::recover_delete_object(const hobject_t &oid, eversion_t v,
+                                      RecoveryHandle *h)
+{
+  assert(get_parent()->get_actingbackfill_shards().size() > 0);
+  for (const auto& peer : get_parent()->get_actingbackfill_shards()) {
+    // skip ourselves, and any peer that is not missing the object
+    const bool is_self = (peer == get_parent()->whoami_shard());
+    if (is_self || !get_parent()->get_shard_missing(peer).is_missing(oid))
+      continue;
+
+    dout(20) << __func__ << " will remove " << oid << " " << v << " from "
+             << peer << dendl;
+    h->deletes[peer].emplace_back(oid, v);
+    get_parent()->begin_peer_recover(peer, oid);
+  }
+}
+
+// Send the queued recovery deletes to their target shards.
+//
+// prio    - message priority applied to every MOSDPGRecoveryDelete sent
+// deletes - per-shard list of (object, version) pairs to remove
+//
+// Each shard's list is split into batches bounded by osd_max_push_cost
+// (accumulated as osd_push_per_object_cost per object) and
+// osd_max_push_objects.  Shards for which no cluster connection can be
+// obtained are skipped.
+void PGBackend::send_recovery_deletes(int prio,
+    const map<pg_shard_t, vector<pair<hobject_t, eversion_t> > > &deletes)
+{
+  epoch_t min_epoch = get_parent()->get_last_peering_reset_epoch();
+  for (const auto& p : deletes) {
+    const auto& shard = p.first;
+    const auto& objects = p.second;
+    ConnectionRef con = get_parent()->get_con_osd_cluster(
+      shard.osd,
+      get_osdmap()->get_epoch());
+    if (!con)
+      continue;
+    auto it = objects.begin();
+    while (it != objects.end()) {
+      uint64_t cost = 0;
+      // named so it does not shadow the 'deletes' parameter
+      uint64_t num_deletes = 0;
+      spg_t target_pg = spg_t(get_parent()->get_info().pgid.pgid, shard.shard);
+      MOSDPGRecoveryDelete *msg =
+        new MOSDPGRecoveryDelete(get_parent()->whoami_shard(),
+                                 target_pg,
+                                 get_osdmap()->get_epoch(),
+                                 min_epoch);
+      msg->set_priority(prio);
+
+      // Always place at least one object in each message so the outer loop
+      // makes progress even if a batching limit is configured as 0; with
+      // positive limits this batches exactly as a pre-checked loop would.
+      do {
+        dout(20) << __func__ << ": sending recovery delete " << it->first
+                 << " " << it->second << " to osd." << shard << dendl;
+        msg->objects.push_back(*it);
+        cost += cct->_conf->osd_push_per_object_cost;
+        ++num_deletes;
+        ++it;
+      } while (it != objects.end() &&
+               cost < cct->_conf->osd_max_push_cost &&
+               num_deletes < cct->_conf->osd_max_push_objects);
+
+      msg->set_cost(cost);
+      get_parent()->send_message_osd_cluster(msg, con);
+    }
+  }
+}
+
+// Common message entry point: recovery-delete messages and their replies
+// are handled here and reported as handled; every other message type is
+// passed to the backend-specific _handle_message().
+bool PGBackend::handle_message(OpRequestRef op)
+{
+  const auto msg_type = op->get_req()->get_type();
+
+  if (msg_type == MSG_OSD_PG_RECOVERY_DELETE) {
+    handle_recovery_delete(op);
+    return true;
+  }
+  if (msg_type == MSG_OSD_PG_RECOVERY_DELETE_REPLY) {
+    handle_recovery_delete_reply(op);
+    return true;
+  }
+
+  return _handle_message(op);
+}
+
+// Handle an incoming MSG_OSD_PG_RECOVERY_DELETE: ask the parent to remove
+// every listed (object, version) pair and, once the gathered removals have
+// finished, send an MOSDPGRecoveryDeleteReply echoing the object list back
+// over the connection the request arrived on.
+void PGBackend::handle_recovery_delete(OpRequestRef op)
+{
+  const MOSDPGRecoveryDelete *m = static_cast<const MOSDPGRecoveryDelete *>(op->get_req());
+  assert(m->get_type() == MSG_OSD_PG_RECOVERY_DELETE);
+  dout(20) << __func__ << " " << op << dendl;
+
+  op->mark_started();
+
+  // one sub-context per object; the finisher set below is the gather's
+  // completion callback
+  C_GatherBuilder gather(cct);
+  for (const auto &p : m->objects) {
+    get_parent()->remove_missing_object(p.first, p.second, gather.new_sub());
+  }
+
+  // Build the reply now, while the request message is at hand; it is only
+  // sent from the finisher.
+  MOSDPGRecoveryDeleteReply *reply = new MOSDPGRecoveryDeleteReply;
+  reply->from = get_parent()->whoami_shard();
+  reply->set_priority(m->get_priority());
+  reply->pgid = spg_t(get_parent()->get_info().pgid.pgid, m->from.shard);
+  reply->map_epoch = m->map_epoch;
+  reply->min_epoch = m->min_epoch;
+  reply->objects = m->objects;
+  ConnectionRef conn = m->get_connection();
+
+  gather.set_finisher(new FunctionContext(
+    [=](int r) {
+      if (r != -EAGAIN) {
+        get_parent()->send_message_osd_cluster(reply, conn.get());
+      } else {
+        // The reply is never handed off for sending on this path, so drop
+        // our reference here; otherwise the message would be leaked.
+        reply->put();
+      }
+    }));
+  gather.activate();
+}
+
+// Handle an MSG_OSD_PG_RECOVERY_DELETE_REPLY: record the sender as recovered
+// for each object in the reply, and report global recovery for an object
+// once no other acting/backfill shard and not the local shard is still
+// missing it.
+void PGBackend::handle_recovery_delete_reply(OpRequestRef op)
+{
+  const MOSDPGRecoveryDeleteReply *m = static_cast<const MOSDPGRecoveryDeleteReply *>(op->get_req());
+  assert(m->get_type() == MSG_OSD_PG_RECOVERY_DELETE_REPLY);
+  dout(20) << __func__ << " " << op << dendl;
+
+  for (const auto &obj : m->objects) {
+    const hobject_t &oid = obj.first;
+    ObjectRecoveryInfo recovery_info;
+    recovery_info.version = obj.second;
+    get_parent()->on_peer_recover(m->from, oid, recovery_info);
+
+    // look for any other shard that still needs this object
+    bool still_missing_on_peer = false;
+    for (const auto& shard : get_parent()->get_actingbackfill_shards()) {
+      if (shard != get_parent()->whoami_shard() &&
+          get_parent()->get_shard_missing(shard).is_missing(oid)) {
+        dout(20) << __func__ << " " << oid << " still missing on at least "
+                 << shard << dendl;
+        still_missing_on_peer = true;
+        break;
+      }
+    }
+
+    if (still_missing_on_peer ||
+        get_parent()->get_local_missing().is_missing(oid))
+      continue;
+
+    dout(20) << __func__ << " completed recovery, local_missing = "
+             << get_parent()->get_local_missing() << dendl;
+    object_stat_sum_t stat_diff;
+    stat_diff.num_objects_recovered = 1;
+    get_parent()->on_global_recover(oid, stat_diff, true);
+  }
+}
+
void PGBackend::rollback(
const pg_log_entry_t &entry,
ObjectStore::Transaction *t)
const hobject_t &oid,
const ObjectRecoveryInfo &recovery_info,
ObjectContextRef obc,
+ bool is_delete,
ObjectStore::Transaction *t
) = 0;
*/
virtual void on_global_recover(
const hobject_t &oid,
- const object_stat_sum_t &stat_diff
+ const object_stat_sum_t &stat_diff,
+ bool is_delete
) = 0;
/**
virtual void failed_push(const list<pg_shard_t> &from, const hobject_t &soid) = 0;
virtual void primary_failed(const hobject_t &soid) = 0;
virtual bool primary_error(const hobject_t& soid, eversion_t v) = 0;
-
virtual void cancel_pull(const hobject_t &soid) = 0;
virtual void apply_stats(
eversion_t v
) = 0;
+ virtual void remove_missing_object(const hobject_t &oid,
+ eversion_t v,
+ Context *on_complete) = 0;
/**
* Bless a context
virtual ceph_tid_t get_tid() = 0;
virtual LogClientTemp clog_error() = 0;
+ virtual LogClientTemp clog_warn() = 0;
virtual bool check_failsafe_full(ostream &ss) = 0;
*/
struct RecoveryHandle {
bool cache_dont_need;
+ map<pg_shard_t, vector<pair<hobject_t, eversion_t> > > deletes;
RecoveryHandle(): cache_dont_need(false) {}
virtual ~RecoveryHandle() {}
int priority ///< [in] msg priority
) = 0;
+ void recover_delete_object(const hobject_t &oid, eversion_t v,
+ RecoveryHandle *h);
+ void send_recovery_deletes(int prio,
+ const map<pg_shard_t, vector<pair<hobject_t, eversion_t> > > &deletes);
+
/**
* recover_object
*
virtual bool can_handle_while_inactive(OpRequestRef op) = 0;
/// gives PGBackend a crack at an incoming message
- virtual bool handle_message(
+ bool handle_message(
OpRequestRef op ///< [in] message received
- ) = 0; ///< @return true if the message was handled
+ ); ///< @return true if the message was handled
+
+ /// the variant of handle_message that is overridden by child classes
+ virtual bool _handle_message(OpRequestRef op) = 0;
virtual void check_recovery_sources(const OSDMapRef& osdmap) = 0;
ObjectStore::Transaction *t);
protected:
+
+ void handle_recovery_delete(OpRequestRef op);
+ void handle_recovery_delete_reply(OpRequestRef op);
+
/// Reapply old attributes
void rollback_setattrs(
const hobject_t &hoid,
-// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
// vim: ts=8 sw=2 smarttab
/*
* Ceph - scalable distributed file system
*
* This is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
- * License version 2.1, as published by the Free Software
+ * License version 2.1, as published by the Free Software
* Foundation. See file COPYING.
- *
+ *
*/
#include "PGLog.h"
PGLog::IndexedLog *target)
{
unindex();
- *target = pg_log_t::split_out_child(child_pgid, split_bits);
+ *target = IndexedLog(pg_log_t::split_out_child(child_pgid, split_bits));
index();
target->index();
reset_rollback_info_trimmed_to_riter();
void PGLog::IndexedLog::trim(
CephContext* cct,
eversion_t s,
- set<eversion_t> *trimmed)
+ set<eversion_t> *trimmed,
+ set<string>* trimmed_dups,
+ bool* dirty_dups)
{
if (complete_to != log.end() &&
complete_to->version <= s) {
assert(s <= can_rollback_to);
+ auto earliest_dup_version =
+ log.rbegin()->version.version < cct->_conf->osd_pg_log_dups_tracked
+ ? 0u
+ : log.rbegin()->version.version - cct->_conf->osd_pg_log_dups_tracked;
+
while (!log.empty()) {
- pg_log_entry_t &e = *log.begin();
+ const pg_log_entry_t &e = *log.begin();
if (e.version > s)
break;
generic_dout(20) << "trim " << e << dendl;
unindex(e); // remove from index,
+ // add to dup list
+ if (e.version.version >= earliest_dup_version) {
+ if (dirty_dups) *dirty_dups = true;
+ dups.push_back(pg_log_dup_t(e));
+ index(dups.back());
+ for (const auto& extra : e.extra_reqids) {
+ // note: extras have the same version as outer op
+ dups.push_back(pg_log_dup_t(e.version, extra.second,
+ extra.first, e.return_code));
+ index(dups.back());
+ }
+ }
+
if (rollback_info_trimmed_to_riter == log.rend() ||
e.version == rollback_info_trimmed_to_riter->version) {
log.pop_front();
}
}
+ while (!dups.empty()) {
+ const auto& e = *dups.begin();
+ if (e.version.version >= earliest_dup_version)
+ break;
+ generic_dout(20) << "trim dup " << e << dendl;
+ if (trimmed_dups)
+ trimmed_dups->insert(e.get_key_name());
+ if (indexed_data & PGLOG_INDEXED_DUPS) {
+ dup_index.erase(e.reqid);
+ }
+ dups.pop_front();
+ }
+
// raise tail?
if (tail < s)
tail = s;
for (list<pg_log_entry_t>::const_iterator p = log.begin();
p != log.end();
++p) {
- out << *p << " " << (logged_object(p->soid) ? "indexed":"NOT INDEXED") << std::endl;
+ out << *p << " " <<
+ (logged_object(p->soid) ? "indexed" : "NOT INDEXED") <<
+ std::endl;
assert(!p->reqid_is_indexed() || logged_req(p->reqid));
}
+
+ for (list<pg_log_dup_t>::const_iterator p = dups.begin();
+ p != dups.end();
+ ++p) {
+ out << *p << std::endl;
+ }
+
return out;
}
assert(trim_to <= info.last_complete);
dout(10) << "trim " << log << " to " << trim_to << dendl;
- log.trim(cct, trim_to, &trimmed);
+ log.trim(cct, trim_to, &trimmed, &trimmed_dups, &dirty_dups);
info.log_tail = log.tail;
}
}
} else {
oinfo.last_complete = oinfo.last_update;
}
-}
+} // proc_replica_log
/**
* rewind divergent entries at the head of the log
pg_info_t &info, LogEntryHandler *rollbacker,
bool &dirty_info, bool &dirty_big_info)
{
- dout(10) << "rewind_divergent_log truncate divergent future " << newhead << dendl;
+ dout(10) << "rewind_divergent_log truncate divergent future " <<
+ newhead << dendl;
if (info.last_complete > newhead)
// splice into our log.
log.log.splice(log.log.begin(),
olog.log, from, to);
-
+
info.log_tail = log.tail = olog.tail;
changed = true;
}
// extend on head?
if (olog.head > log.head) {
dout(10) << "merge_log extending head to " << olog.head << dendl;
-
+
// find start point in olog
list<pg_log_entry_t>::iterator to = olog.log.end();
list<pg_log_entry_t>::iterator from = olog.log.end();
changed = true;
}
-
- dout(10) << "merge_log result " << log << " " << missing << " changed=" << changed << dendl;
+
+ // now handle dups
+ if (merge_log_dups(olog)) {
+ dirty_dups = true;
+ changed = true;
+ }
+
+ dout(10) << "merge_log result " << log << " " << missing <<
+ " changed=" << changed << dendl;
if (changed) {
dirty_info = true;
}
}
+
+// Merge the dup entries of olog into log.dups.
+//
+// - If log.dups is empty, every olog dup is copied and indexed.
+// - Otherwise log.dups is extended with olog dups newer than its last
+//   entry and with olog dups older than its first entry.
+// - Finally, dups that overlap the log itself are dropped.  The log covers
+//   the half-open range (tail, head], so a dup whose version equals
+//   log.tail does not overlap and is kept.
+//
+// returns true if any changes were made to log.dups
+bool PGLog::merge_log_dups(const pg_log_t& olog) {
+  bool changed = false;
+
+  if (!olog.dups.empty()) {
+    if (log.dups.empty()) {
+      dout(10) << "merge_log copying olog dups to log " <<
+        olog.dups.front().version << " to " <<
+        olog.dups.back().version << dendl;
+      changed = true;
+      // since our log.dups is empty just copy them
+      for (const auto& i : olog.dups) {
+        log.dups.push_back(i);
+        log.index(log.dups.back());
+      }
+    } else {
+      // since our log.dups is not empty try to extend on each end
+
+      if (olog.dups.back().version > log.dups.back().version) {
+        // extend the dups's tail (i.e., newer dups)
+        dout(10) << "merge_log extending dups tail to " <<
+          olog.dups.back().version << dendl;
+        changed = true;
+
+        auto log_tail_version = log.dups.back().version;
+
+        auto insert_cursor = log.dups.end();
+        for (auto i = olog.dups.crbegin(); i != olog.dups.crend(); ++i) {
+          if (i->version <= log_tail_version) break;
+          log.dups.insert(insert_cursor, *i);
+
+          auto prev = insert_cursor;
+          --prev;
+          // be sure to pass reference of copy in log.dups
+          log.index(*prev);
+
+          --insert_cursor; // make sure we insert in reverse order
+        }
+      }
+
+      if (olog.dups.front().version < log.dups.front().version) {
+        // extend the dups's head (i.e., older dups)
+        dout(10) << "merge_log extending dups head to " <<
+          olog.dups.front().version << dendl;
+        changed = true;
+
+        auto insert_cursor = log.dups.begin();
+        for (auto i = olog.dups.cbegin(); i != olog.dups.cend(); ++i) {
+          if (i->version >= insert_cursor->version) break;
+          log.dups.insert(insert_cursor, *i);
+          auto prev = insert_cursor;
+          --prev;
+          // be sure to pass address of copy in log.dups
+          log.index(*prev);
+        }
+      }
+    }
+  }
+
+  // remove any dup entries that overlap with pglog; strictly greater than
+  // log.tail, because the entry at log.tail itself is not part of the log
+  if (!log.dups.empty() && log.dups.back().version > log.tail) {
+    dout(10) << "merge_log removed dups overlapping log entries (" <<
+      log.tail << "," << log.dups.back().version << "]" << dendl;
+    changed = true;
+
+    while (!log.dups.empty() && log.dups.back().version > log.tail) {
+      log.unindex(log.dups.back());
+      log.dups.pop_back();
+    }
+  }
+
+  return changed;
+}
+
void PGLog::check() {
if (!pg_log_debug)
return;
}
}
+// non-static
void PGLog::write_log_and_missing(
ObjectStore::Transaction& t,
map<string,bufferlist> *km,
- const coll_t& coll, const ghobject_t &log_oid,
+ const coll_t& coll,
+ const ghobject_t &log_oid,
bool require_rollback)
{
if (is_dirty()) {
<< ", dirty_from: " << dirty_from
<< ", writeout_from: " << writeout_from
<< ", trimmed: " << trimmed
+ << ", trimmed_dups: " << trimmed_dups
<< ", clear_divergent_priors: " << clear_divergent_priors
<< dendl;
_write_log_and_missing(
dirty_from,
writeout_from,
trimmed,
+ trimmed_dups,
missing,
!touched_log,
require_rollback,
clear_divergent_priors,
- (pg_log_debug ? &log_keys_debug : 0));
+ dirty_dups,
+ &rebuilt_missing_with_deletes,
+ (pg_log_debug ? &log_keys_debug : nullptr));
undirty();
} else {
dout(10) << "log is not dirty" << dendl;
}
}
+// static
void PGLog::write_log_and_missing_wo_missing(
ObjectStore::Transaction& t,
map<string,bufferlist> *km,
pg_log_t &log,
const coll_t& coll, const ghobject_t &log_oid,
map<eversion_t, hobject_t> &divergent_priors,
- bool require_rollback)
+ bool require_rollback,
+ bool dirty_dups)
{
_write_log_and_missing_wo_missing(
t, km, log, coll, log_oid,
divergent_priors, eversion_t::max(), eversion_t(), eversion_t(),
set<eversion_t>(),
- true, true, require_rollback, 0);
+ set<string>(),
+ true, true, require_rollback, dirty_dups, nullptr);
}
+// static
void PGLog::write_log_and_missing(
ObjectStore::Transaction& t,
map<string,bufferlist> *km,
const coll_t& coll,
const ghobject_t &log_oid,
const pg_missing_tracker_t &missing,
- bool require_rollback)
+ bool require_rollback,
+ bool dirty_dups,
+ bool *rebuilt_missing_with_deletes)
{
_write_log_and_missing(
t, km, log, coll, log_oid,
eversion_t(),
eversion_t(),
set<eversion_t>(),
+ set<string>(),
missing,
- true, require_rollback, false, 0);
+ true, require_rollback, false, dirty_dups, rebuilt_missing_with_deletes, nullptr);
}
+// static
void PGLog::_write_log_and_missing_wo_missing(
ObjectStore::Transaction& t,
map<string,bufferlist> *km,
eversion_t dirty_from,
eversion_t writeout_from,
const set<eversion_t> &trimmed,
+ const set<string> &trimmed_dups,
bool dirty_divergent_priors,
bool touch_log,
bool require_rollback,
+ bool dirty_dups,
set<string> *log_keys_debug
)
{
- set<string> to_remove;
+ set<string> to_remove(trimmed_dups);
for (set<eversion_t>::const_iterator i = trimmed.begin();
i != trimmed.end();
++i) {
}
}
-//dout(10) << "write_log_and_missing, clearing up to " << dirty_to << dendl;
+ // dout(10) << "write_log_and_missing, clearing up to " << dirty_to << dendl;
if (touch_log)
t.touch(coll, log_oid);
if (dirty_to != eversion_t()) {
clear_up_to(log_keys_debug, dirty_to.get_key_name());
}
if (dirty_to != eversion_t::max() && dirty_from != eversion_t::max()) {
- // dout(10) << "write_log_and_missing, clearing from " << dirty_from << dendl;
+ // dout(10) << "write_log_and_missing, clearing from " << dirty_from << dendl;
t.omap_rmkeyrange(
coll, log_oid,
dirty_from.get_key_name(), eversion_t::max().get_key_name());
}
}
+ // process dirty_dups after log_keys_debug is filled, so dups do not
+ // end up in that set
+ if (dirty_dups) {
+ pg_log_dup_t min;
+ t.omap_rmkeyrange(
+ coll, log_oid,
+ min.get_key_name(), log.dups.begin()->get_key_name());
+ for (const auto& entry : log.dups) {
+ bufferlist bl;
+ ::encode(entry, bl);
+ (*km)[entry.get_key_name()].claim(bl);
+ }
+ }
+
if (dirty_divergent_priors) {
//dout(10) << "write_log_and_missing: writing divergent_priors" << dendl;
::encode(divergent_priors, (*km)["divergent_priors"]);
t.omap_rmkeys(coll, log_oid, to_remove);
}
+// static
void PGLog::_write_log_and_missing(
ObjectStore::Transaction& t,
map<string,bufferlist>* km,
eversion_t dirty_from,
eversion_t writeout_from,
const set<eversion_t> &trimmed,
+ const set<string> &trimmed_dups,
const pg_missing_tracker_t &missing,
bool touch_log,
bool require_rollback,
bool clear_divergent_priors,
+ bool dirty_dups,
+ bool *rebuilt_missing_with_deletes, // in/out param
set<string> *log_keys_debug
) {
- set<string> to_remove;
+ set<string> to_remove(trimmed_dups);
for (set<eversion_t>::const_iterator i = trimmed.begin();
i != trimmed.end();
++i) {
}
}
+ // process dirty_dups after log_keys_debug is filled, so dups do not
+ // end up in that set
+ if (dirty_dups) {
+ pg_log_dup_t min;
+ t.omap_rmkeyrange(
+ coll, log_oid,
+ min.get_key_name(), log.dups.begin()->get_key_name());
+ for (const auto& entry : log.dups) {
+ bufferlist bl;
+ ::encode(entry, bl);
+ (*km)[entry.get_key_name()].claim(bl);
+ }
+ }
+
if (clear_divergent_priors) {
//dout(10) << "write_log_and_missing: writing divergent_priors" << dendl;
to_remove.insert("divergent_priors");
}
+ // since we encode individual missing items instead of a whole
+ // missing set, we need another key to store this bit of state
+ if (*rebuilt_missing_with_deletes) {
+ (*km)["may_include_deletes_in_missing"] = bufferlist();
+ *rebuilt_missing_with_deletes = false;
+ }
missing.get_changed(
[&](const hobject_t &obj) {
string key = string("missing/") + obj.to_str();
if (!missing.is_missing(obj, &item)) {
to_remove.insert(key);
} else {
- ::encode(make_pair(obj, item), (*km)[key]);
+ uint64_t features = missing.may_include_deletes ? CEPH_FEATURE_OSD_RECOVERY_DELETES : 0;
+ ::encode(make_pair(obj, item), (*km)[key], features);
}
});
if (require_rollback) {
if (!to_remove.empty())
t.omap_rmkeys(coll, log_oid, to_remove);
}
+
+// Rebuild the missing set from the log so that it may contain delete
+// entries.  Missing items whose object is not in the log are carried over
+// unchanged; everything else is recomputed by comparing each log entry
+// newer than info.last_complete against the object_info stored on disk.
+// Sets missing.may_include_deletes and rebuilt_missing_with_deletes.
+void PGLog::rebuild_missing_set_with_deletes(ObjectStore *store,
+                                             coll_t pg_coll,
+                                             const pg_info_t &info)
+{
+  // save entries not generated from the current log (e.g. added due
+  // to repair, EIO handling, or divergent_priors).
+  map<hobject_t, pg_missing_item> extra_missing;
+  for (const auto& entry : missing.get_items()) {
+    if (log.logged_object(entry.first))
+      continue;
+    dout(20) << __func__ << " extra missing entry: " << entry.first
+             << " " << entry.second << dendl;
+    extra_missing[entry.first] = entry.second;
+  }
+  missing.clear();
+  missing.may_include_deletes = true;
+
+  // Walk the log newest-to-oldest and add items that are absent on disk or
+  // present only at an older version, just as if the log + metadata were
+  // being read off disk originally.  Only the newest entry per object is
+  // considered.
+  set<hobject_t> did;
+  for (auto i = log.log.rbegin(); i != log.log.rend(); ++i) {
+    if (i->version <= info.last_complete)
+      break;
+
+    const bool skip = i->soid > info.last_backfill ||
+                      i->is_error() ||
+                      did.count(i->soid) > 0;
+    if (skip)
+      continue;
+    did.insert(i->soid);
+
+    bufferlist bv;
+    int r = store->getattr(
+      pg_coll,
+      ghobject_t(i->soid, ghobject_t::NO_GEN, info.pgid.shard),
+      OI_ATTR,
+      bv);
+    dout(20) << __func__ << " check for log entry: " << *i << " = " << r << dendl;
+
+    // version we currently have on disk; stays default when getattr failed
+    eversion_t have;
+    if (r >= 0) {
+      object_info_t oi(bv);
+      dout(20) << __func__ << " store version = " << oi.version << dendl;
+      if (!(oi.version < i->version))
+        continue;  // on-disk copy is not older than the log entry
+      have = oi.version;
+    }
+    missing.add(i->soid, i->version, have, i->is_delete());
+  }
+
+  // re-add the entries that did not come from the log
+  for (const auto& entry : extra_missing) {
+    missing.add(entry.first, entry.second.need, entry.second.have,
+                entry.second.is_delete());
+  }
+  rebuilt_missing_with_deletes = true;
+}
* Foundation. See file COPYING.
*
*/
-#ifndef CEPH_PG_LOG_H
-#define CEPH_PG_LOG_H
+#pragma once
// re-include our assert to clobber boost's
#include "include/assert.h"
#define PGLOG_INDEXED_OBJECTS (1 << 0)
#define PGLOG_INDEXED_CALLER_OPS (1 << 1)
#define PGLOG_INDEXED_EXTRA_CALLER_OPS (1 << 2)
-#define PGLOG_INDEXED_ALL (PGLOG_INDEXED_OBJECTS | PGLOG_INDEXED_CALLER_OPS | PGLOG_INDEXED_EXTRA_CALLER_OPS)
+#define PGLOG_INDEXED_DUPS (1 << 3)
+#define PGLOG_INDEXED_ALL (PGLOG_INDEXED_OBJECTS | \
+ PGLOG_INDEXED_CALLER_OPS | \
+ PGLOG_INDEXED_EXTRA_CALLER_OPS | \
+ PGLOG_INDEXED_DUPS)
class CephContext;
mutable ceph::unordered_map<hobject_t,pg_log_entry_t*> objects; // ptrs into log. be careful!
mutable ceph::unordered_map<osd_reqid_t,pg_log_entry_t*> caller_ops;
mutable ceph::unordered_multimap<osd_reqid_t,pg_log_entry_t*> extra_caller_ops;
+ mutable ceph::unordered_map<osd_reqid_t,pg_log_dup_t*> dup_index;
// recovery pointers
list<pg_log_entry_t>::iterator complete_to; // not inclusive of referenced item
last_requested(0),
indexed_data(0),
rollback_info_trimmed_to_riter(log.rbegin())
- {}
+ { }
template <typename... Args>
IndexedLog(Args&&... args) :
complete_to(log.end()),
last_requested(0),
indexed_data(0),
- rollback_info_trimmed_to_riter(log.rbegin()) {
+ rollback_info_trimmed_to_riter(log.rbegin())
+ {
reset_rollback_info_trimmed_to_riter();
index();
}
complete_to(log.end()),
last_requested(rhs.last_requested),
indexed_data(0),
- rollback_info_trimmed_to_riter(log.rbegin()) {
+ rollback_info_trimmed_to_riter(log.rbegin())
+ {
reset_rollback_info_trimmed_to_riter();
index(rhs.indexed_data);
}
+
IndexedLog &operator=(const IndexedLog &rhs) {
this->~IndexedLog();
new (this) IndexedLog(rhs);
const osd_reqid_t &r,
eversion_t *version,
version_t *user_version,
- int *return_code) const {
+ int *return_code) const
+ {
assert(version);
assert(user_version);
assert(return_code);
}
assert(0 == "in extra_caller_ops but not extra_reqids");
}
+
+ if (!(indexed_data & PGLOG_INDEXED_DUPS)) {
+ index_dups();
+ }
+ auto q = dup_index.find(r);
+ if (q != dup_index.end()) {
+ *version = q->second->version;
+ *user_version = q->second->user_version;
+ *return_code = q->second->return_code;
+ return true;
+ }
+
return false;
}
}
}
}
-
+
void index(__u16 to_index = PGLOG_INDEXED_ALL) const {
+ // if to_index is 0, no need to run any of this code, especially
+ // loop below; this can happen with copy constructor for
+ // IndexedLog (and indirectly through assignment operator)
+ if (!to_index) return;
+
if (to_index & PGLOG_INDEXED_OBJECTS)
objects.clear();
if (to_index & PGLOG_INDEXED_CALLER_OPS)
caller_ops.clear();
if (to_index & PGLOG_INDEXED_EXTRA_CALLER_OPS)
extra_caller_ops.clear();
+ if (to_index & PGLOG_INDEXED_DUPS) {
+ dup_index.clear();
+ for (auto& i : dups) {
+ dup_index[i.reqid] = const_cast<pg_log_dup_t*>(&i);
+ }
+ }
- for (list<pg_log_entry_t>::const_iterator i = log.begin();
- i != log.end();
- ++i) {
- if (to_index & PGLOG_INDEXED_OBJECTS) {
- if (i->object_is_indexed()) {
- objects[i->soid] = const_cast<pg_log_entry_t*>(&(*i));
+ constexpr __u16 any_log_entry_index =
+ PGLOG_INDEXED_OBJECTS |
+ PGLOG_INDEXED_CALLER_OPS |
+ PGLOG_INDEXED_EXTRA_CALLER_OPS;
+
+ if (to_index & any_log_entry_index) {
+ for (list<pg_log_entry_t>::const_iterator i = log.begin();
+ i != log.end();
+ ++i) {
+ if (to_index & PGLOG_INDEXED_OBJECTS) {
+ if (i->object_is_indexed()) {
+ objects[i->soid] = const_cast<pg_log_entry_t*>(&(*i));
+ }
}
- }
- if (to_index & PGLOG_INDEXED_CALLER_OPS) {
- if (i->reqid_is_indexed()) {
- caller_ops[i->reqid] = const_cast<pg_log_entry_t*>(&(*i));
+ if (to_index & PGLOG_INDEXED_CALLER_OPS) {
+ if (i->reqid_is_indexed()) {
+ caller_ops[i->reqid] = const_cast<pg_log_entry_t*>(&(*i));
+ }
}
- }
-
- if (to_index & PGLOG_INDEXED_EXTRA_CALLER_OPS) {
- for (auto j = i->extra_reqids.begin();
- j != i->extra_reqids.end();
- ++j) {
- extra_caller_ops.insert(
- make_pair(j->first, const_cast<pg_log_entry_t*>(&(*i))));
+
+ if (to_index & PGLOG_INDEXED_EXTRA_CALLER_OPS) {
+ for (auto j = i->extra_reqids.begin();
+ j != i->extra_reqids.end();
+ ++j) {
+ extra_caller_ops.insert(
+ make_pair(j->first, const_cast<pg_log_entry_t*>(&(*i))));
+ }
}
}
}
-
+
indexed_data |= to_index;
}
index(PGLOG_INDEXED_EXTRA_CALLER_OPS);
}
+ // Rebuild only the dup lookup table: delegates to index() with the
+ // PGLOG_INDEXED_DUPS flag, which clears dup_index and repopulates it
+ // from dups.
+ void index_dups() const {
+ index(PGLOG_INDEXED_DUPS);
+ }
+
void index(pg_log_entry_t& e) {
if ((indexed_data & PGLOG_INDEXED_OBJECTS) && e.object_is_indexed()) {
if (objects.count(e.soid) == 0 ||
}
}
}
+
void unindex() {
objects.clear();
caller_ops.clear();
extra_caller_ops.clear();
+ dup_index.clear();
indexed_data = 0;
}
- void unindex(pg_log_entry_t& e) {
+
+ void unindex(const pg_log_entry_t& e) {
// NOTE: this only works if we remove from the _tail_ of the log!
if (indexed_data & PGLOG_INDEXED_OBJECTS) {
if (objects.count(e.soid) && objects[e.soid]->version == e.version)
if (indexed_data & PGLOG_INDEXED_CALLER_OPS) {
// divergent merge_log indexes new before unindexing old
if (caller_ops.count(e.reqid) && caller_ops[e.reqid] == &e)
- caller_ops.erase(e.reqid);
+ caller_ops.erase(e.reqid);
}
}
if (indexed_data & PGLOG_INDEXED_EXTRA_CALLER_OPS) {
}
}
+ // Register a single dup entry in dup_index, keyed by its reqid.
+ //
+ // The entry is only recorded while the dup index is being maintained,
+ // i.e. while PGLOG_INDEXED_DUPS is set in indexed_data. Otherwise the
+ // table is left untouched; index_dups() rebuilds it from dups on demand.
+ //
+ // dup_index stores a raw pointer to e, so e must stay alive (and at the
+ // same address) for as long as it is indexed.
+ void index(pg_log_dup_t& e) {
+   // Test the live flag word. The bare PGLOG_INDEXED_DUPS constant is
+   // non-zero and would make this branch unconditional.
+   if (indexed_data & PGLOG_INDEXED_DUPS) {
+     dup_index[e.reqid] = &e;
+   }
+ }
+
+ // Remove a single dup entry from dup_index, looked up by its reqid.
+ //
+ // Only acts while the dup index is being maintained (PGLOG_INDEXED_DUPS
+ // set in indexed_data), matching the flag checks used by the other
+ // unindex() overload. A reqid that is not present is silently ignored.
+ void unindex(const pg_log_dup_t& e) {
+   // Test the live flag word. The bare PGLOG_INDEXED_DUPS constant is
+   // non-zero and would make this branch unconditional.
+   if (indexed_data & PGLOG_INDEXED_DUPS) {
+     auto i = dup_index.find(e.reqid);
+     if (i != dup_index.end()) {
+       dup_index.erase(i);
+     }
+   }
+ }
+
// actors
void add(const pg_log_entry_t& e, bool applied = true) {
if (!applied) {
caller_ops[e.reqid] = &(log.back());
}
}
-
+
if (indexed_data & PGLOG_INDEXED_EXTRA_CALLER_OPS) {
for (auto j = e.extra_reqids.begin();
j != e.extra_reqids.end();
if (!applied) {
skip_can_rollback_to_to_head();
}
- }
+ } // add
void trim(
CephContext* cct,
eversion_t s,
- set<eversion_t> *trimmed);
+ set<eversion_t> *trimmed,
+ set<string>* trimmed_dups,
+ bool* dirty_dups);
ostream& print(ostream& out) const;
- };
+ }; // IndexedLog
protected:
eversion_t dirty_from; ///< must clear/writeout all keys >= dirty_from
eversion_t writeout_from; ///< must writout keys >= writeout_from
set<eversion_t> trimmed; ///< must clear keys in trimmed
+ set<string> trimmed_dups; ///< must clear keys in trimmed_dups
CephContext *cct;
bool pg_log_debug;
/// Log is clean on [dirty_to, dirty_from)
bool touched_log;
bool clear_divergent_priors;
+ bool dirty_dups; /// log.dups is updated
+ bool rebuilt_missing_with_deletes = false;
void mark_dirty_to(eversion_t to) {
if (to > dirty_to)
(dirty_from != eversion_t::max()) ||
(writeout_from != eversion_t::max()) ||
!(trimmed.empty()) ||
- !missing.is_clean();
+ !missing.is_clean() ||
+ !(trimmed_dups.empty()) ||
+ dirty_dups ||
+ rebuilt_missing_with_deletes;
}
void mark_log_for_rewrite() {
mark_dirty_to(eversion_t::max());
mark_dirty_from(eversion_t());
touched_log = false;
}
+ bool get_rebuilt_missing_with_deletes() const {
+ return rebuilt_missing_with_deletes;
+ }
protected:
/// DEBUG
dirty_from = eversion_t::max();
touched_log = true;
trimmed.clear();
+ trimmed_dups.clear();
writeout_from = eversion_t::max();
check();
missing.flush();
+ dirty_dups = false;
}
public:
+
// cppcheck-suppress noExplicitConstructor
- PGLog(CephContext *cct, DoutPrefixProvider *dpp = 0) :
+ PGLog(CephContext *cct, DoutPrefixProvider *dpp = nullptr) :
prefix_provider(dpp),
dirty_from(eversion_t::max()),
writeout_from(eversion_t::max()),
cct(cct),
pg_log_debug(!(cct && !(cct->_conf->osd_debug_pg_log_writeout))),
touched_log(false),
- clear_divergent_priors(false) {}
-
+ clear_divergent_priors(false),
+ dirty_dups(false)
+ { }
void reset_backfill();
missing.revise_have(oid, have);
}
- void revise_need(hobject_t oid, eversion_t need) {
- missing.revise_need(oid, need);
- }
-
void missing_add(const hobject_t& oid, eversion_t need, eversion_t have) {
- missing.add(oid, need, have);
+ missing.add(oid, need, have, false);
}
//////////////////// get or set log ////////////////////
void split_into(
pg_t child_pgid,
unsigned split_bits,
- PGLog *opg_log) {
+ PGLog *opg_log) {
log.split_out_child(child_pgid, split_bits, &opg_log->log);
missing.split_into(child_pgid, split_bits, &(opg_log->missing));
opg_log->mark_dirty_to(eversion_t::max());
mark_dirty_to(eversion_t::max());
+ if (missing.may_include_deletes)
+ opg_log->rebuilt_missing_with_deletes = true;
}
void recover_got(hobject_t oid, eversion_t v, pg_info_t &info) {
if (missing.is_missing(oid, v)) {
missing.got(oid, v);
-
+
// raise last_complete?
if (missing.get_items().empty()) {
log.complete_to = log.log.end();
assert(log.get_can_rollback_to() >= v);
}
- void activate_not_complete(pg_info_t &info) {
+ void reset_complete_to(pg_info_t *info) {
log.complete_to = log.log.begin();
- while (log.complete_to->version <
+ while (!missing.get_items().empty() && log.complete_to->version <
missing.get_items().at(
missing.get_rmissing().begin()->second
- ).need)
+ ).need) {
++log.complete_to;
+ }
assert(log.complete_to != log.log.end());
if (log.complete_to == log.log.begin()) {
- info.last_complete = eversion_t();
+ if (info)
+ info->last_complete = eversion_t();
} else {
--log.complete_to;
- info.last_complete = log.complete_to->version;
+ if (info)
+ info->last_complete = log.complete_to->version;
++log.complete_to;
}
+ }
+
+ void activate_not_complete(pg_info_t &info) {
+ reset_complete_to(&info);
log.last_requested = 0;
}
const pg_log_t &olog,
pg_missing_t& omissing, pg_shard_t from) const;
+ void rebuild_missing_set_with_deletes(ObjectStore *store,
+ coll_t pg_coll,
+ const pg_info_t &info);
+
protected:
static void split_by_object(
mempool::osd_pglog::list<pg_log_entry_t> &entries,
const mempool::osd_pglog::list<pg_log_entry_t> &orig_entries, ///< [in] entries for hoid to merge
const pg_info_t &info, ///< [in] info for merging entries
eversion_t olog_can_rollback_to, ///< [in] rollback boundary
- missing_type &missing, ///< [in,out] missing to adjust, use
+ missing_type &missing, ///< [in,out] missing to adjust, use
LogEntryHandler *rollbacker, ///< [in] optional rollbacker object
const DoutPrefixProvider *dpp ///< [in] logging provider
) {
assert(objiter->second->version > last_divergent_update);
// ensure missing has been updated appropriately
- if (objiter->second->is_update()) {
+ if (objiter->second->is_update() ||
+ (missing.may_include_deletes && objiter->second->is_delete())) {
assert(missing.is_missing(hoid) &&
missing.get_items().at(hoid).need == objiter->second->version);
} else {
ldpp_dout(dpp, 10) << __func__ << ": hoid " << hoid
<< " missing.have is " << missing.get_items().at(hoid).have
<< ", adjusting" << dendl;
- missing.revise_need(hoid, prior_version);
+ missing.revise_need(hoid, prior_version, false);
if (prior_version <= info.log_tail) {
ldpp_dout(dpp, 10) << __func__ << ": hoid " << hoid
<< " prior_version " << prior_version
rollbacker->trim(i);
}
}
- missing.add(hoid, prior_version, eversion_t());
+ missing.add(hoid, prior_version, eversion_t(), false);
if (prior_version <= info.log_tail) {
ldpp_dout(dpp, 10) << __func__ << ": hoid " << hoid
<< " prior_version " << prior_version
rollbacker,
this);
}
+
+ bool merge_log_dups(const pg_log_t& olog);
+
public:
+
void rewind_divergent_log(eversion_t newhead,
pg_info_t &info,
LogEntryHandler *rollbacker,
}
if (p->soid <= last_backfill &&
!p->is_error()) {
- missing.add_next_event(*p);
- if (rollbacker) {
- // hack to match PG::mark_all_unfound_lost
- if (maintain_rollback && p->is_lost_delete() && p->can_rollback()) {
- rollbacker->try_stash(p->soid, p->version.version);
- } else if (p->is_delete()) {
- rollbacker->remove(p->soid);
+ if (missing.may_include_deletes) {
+ missing.add_next_event(*p);
+ } else {
+ if (p->is_delete()) {
+ missing.rm(p->soid, p->version);
+ } else {
+ missing.add_next_event(*p);
+ }
+ if (rollbacker) {
+ // hack to match PG::mark_all_unfound_lost
+ if (maintain_rollback && p->is_lost_delete() && p->can_rollback()) {
+ rollbacker->try_stash(p->soid, p->version.version);
+ } else if (p->is_delete()) {
+ rollbacker->remove(p->soid);
+ }
}
}
}
this);
if (!entries.empty()) {
mark_writeout_from(entries.begin()->version);
+ if (entries.begin()->is_lost_delete()) {
+ // hack: since lost deletes queue recovery directly, and don't
+ // go through activate_not_complete() again, our complete_to
+ // iterator may still point at log.end(). Reset it to point
+ // before these new lost_delete entries. This only occurs
+ // when lost+delete entries are initially added, which is
+ // always in a list of solely lost_delete entries, so it is
+ // sufficient to check whether the first entry is a
+ // lost_delete
+ reset_complete_to(nullptr);
+ }
}
return invalidate_stats;
}
- void write_log_and_missing(ObjectStore::Transaction& t,
- map<string,bufferlist> *km,
- const coll_t& coll,
- const ghobject_t &log_oid,
- bool require_rollback);
+ void write_log_and_missing(
+ ObjectStore::Transaction& t,
+ map<string,bufferlist> *km,
+ const coll_t& coll,
+ const ghobject_t &log_oid,
+ bool require_rollback);
static void write_log_and_missing_wo_missing(
ObjectStore::Transaction& t,
pg_log_t &log,
const coll_t& coll,
const ghobject_t &log_oid, map<eversion_t, hobject_t> &divergent_priors,
- bool require_rollback);
+ bool require_rollback,
+ bool dirty_dups);
static void write_log_and_missing(
ObjectStore::Transaction& t,
const coll_t& coll,
const ghobject_t &log_oid,
const pg_missing_tracker_t &missing,
- bool require_rollback);
+ bool require_rollback,
+ bool dirty_dups,
+ bool *rebuilt_missing_set_with_deletes);
static void _write_log_and_missing_wo_missing(
ObjectStore::Transaction& t,
eversion_t dirty_from,
eversion_t writeout_from,
const set<eversion_t> &trimmed,
+ const set<string> &trimmed_dups,
bool dirty_divergent_priors,
bool touch_log,
bool require_rollback,
+ bool dirty_dups,
set<string> *log_keys_debug
);
eversion_t dirty_from,
eversion_t writeout_from,
const set<eversion_t> &trimmed,
+ const set<string> &trimmed_dups,
const pg_missing_tracker_t &missing,
bool touch_log,
bool require_rollback,
bool clear_divergent_priors,
+ bool dirty_dups,
+ bool *rebuilt_missing_with_deletes,
set<string> *log_keys_debug
);
void read_log_and_missing(
- ObjectStore *store, coll_t pg_coll,
- coll_t log_coll, ghobject_t log_oid,
+ ObjectStore *store,
+ coll_t pg_coll,
+ coll_t log_coll,
+ ghobject_t log_oid,
const pg_info_t &info,
ostringstream &oss,
bool tolerate_divergent_missing_log,
tolerate_divergent_missing_log,
&clear_divergent_priors,
this,
- (pg_log_debug ? &log_keys_debug : 0),
+ (pg_log_debug ? &log_keys_debug : nullptr),
debug_verify_stored_missing);
}
template <typename missing_type>
- static void read_log_and_missing(ObjectStore *store, coll_t pg_coll,
- coll_t log_coll, ghobject_t log_oid,
+ static void read_log_and_missing(
+ ObjectStore *store,
+ coll_t pg_coll,
+ coll_t log_coll,
+ ghobject_t log_oid,
const pg_info_t &info,
IndexedLog &log,
- missing_type &missing, ostringstream &oss,
+ missing_type &missing,
+ ostringstream &oss,
bool tolerate_divergent_missing_log,
- bool *clear_divergent_priors = NULL,
- const DoutPrefixProvider *dpp = NULL,
- set<string> *log_keys_debug = 0,
+ bool *clear_divergent_priors = nullptr,
+ const DoutPrefixProvider *dpp = nullptr,
+ set<string> *log_keys_debug = nullptr,
bool debug_verify_stored_missing = false
) {
ldpp_dout(dpp, 20) << "read_log_and_missing coll " << pg_coll
ObjectMap::ObjectMapIterator p = store->get_omap_iterator(log_coll, log_oid);
map<eversion_t, hobject_t> divergent_priors;
bool has_divergent_priors = false;
+ missing.may_include_deletes = false;
list<pg_log_entry_t> entries;
+ list<pg_log_dup_t> dups;
if (p) {
for (p->seek_to_first(); p->valid() ; p->next(false)) {
// non-log pgmeta_oid keys are prefixed with _; skip those
::decode(on_disk_can_rollback_to, bp);
} else if (p->key() == "rollback_info_trimmed_to") {
::decode(on_disk_rollback_info_trimmed_to, bp);
+ } else if (p->key() == "may_include_deletes_in_missing") {
+ missing.may_include_deletes = true;
} else if (p->key().substr(0, 7) == string("missing")) {
- pair<hobject_t, pg_missing_item> p;
- ::decode(p, bp);
- missing.add(p.first, p.second.need, p.second.have);
+ hobject_t oid;
+ pg_missing_item item;
+ ::decode(oid, bp);
+ ::decode(item, bp);
+ if (item.is_delete()) {
+ assert(missing.may_include_deletes);
+ }
+ missing.add(oid, item.need, item.have, item.is_delete());
+ } else if (p->key().substr(0, 4) == string("dup_")) {
+ pg_log_dup_t dup;
+ ::decode(dup, bp);
+ if (!dups.empty()) {
+ assert(dups.back().version < dup.version);
+ }
+ dups.push_back(dup);
} else {
pg_log_entry_t e;
e.decode_with_checksum(bp);
info.log_tail,
on_disk_can_rollback_to,
on_disk_rollback_info_trimmed_to,
- std::move(entries));
+ std::move(entries),
+ std::move(dups));
if (has_divergent_priors || debug_verify_stored_missing) {
// build missing
if (debug_verify_stored_missing || info.last_complete < info.last_update) {
- ldpp_dout(dpp, 10) << "read_log_and_missing checking for missing items over interval ("
- << info.last_complete
- << "," << info.last_update << "]" << dendl;
+ ldpp_dout(dpp, 10)
+ << "read_log_and_missing checking for missing items over interval ("
+ << info.last_complete
+ << "," << info.last_update << "]" << dendl;
set<hobject_t> did;
set<hobject_t> checked;
if (did.count(i->soid)) continue;
did.insert(i->soid);
- if (i->is_delete()) continue;
+ if (!missing.may_include_deletes && i->is_delete())
+ continue;
bufferlist bv;
int r = store->getattr(
auto miter = missing.get_items().find(i->soid);
assert(miter != missing.get_items().end());
assert(miter->second.need == i->version);
- assert(miter->second.have == oi.version);
+ // the 'have' version is reset if an object is deleted,
+ // then created again
+ assert(miter->second.have == oi.version || miter->second.have == eversion_t());
checked.insert(i->soid);
} else {
- missing.add(i->soid, i->version, oi.version);
+ missing.add(i->soid, i->version, oi.version, i->is_delete());
}
}
} else {
ldpp_dout(dpp, 15) << "read_log_and_missing missing " << *i << dendl;
if (debug_verify_stored_missing) {
auto miter = missing.get_items().find(i->soid);
- assert(miter != missing.get_items().end());
- assert(miter->second.need == i->version);
- assert(miter->second.have == eversion_t());
+ if (i->is_delete()) {
+ assert(miter == missing.get_items().end() ||
+ (miter->second.need == i->version &&
+ miter->second.have == eversion_t()));
+ } else {
+ assert(miter != missing.get_items().end());
+ assert(miter->second.need == i->version);
+ assert(miter->second.have == eversion_t());
+ }
checked.insert(i->soid);
} else {
- missing.add(i->soid, i->version, eversion_t());
+ missing.add(i->soid, i->version, eversion_t(), i->is_delete());
}
}
}
for (auto &&i: missing.get_items()) {
if (checked.count(i.first))
continue;
- if (i.second.need > log.tail ||
- i.first > info.last_backfill) {
- ldpp_dout(dpp, -1) << __func__ << ": invalid missing set entry found "
- << i.first
- << dendl;
+ if (i.first > info.last_backfill) {
+ ldpp_dout(dpp, -1) << __func__ << ": invalid missing set entry "
+ << "found before last_backfill: "
+ << i.first << " " << i.second
+ << " last_backfill = " << info.last_backfill
+ << dendl;
assert(0 == "invalid missing set entry found");
}
bufferlist bv;
object_info_t oi(bv);
assert(oi.version == i.second.have);
} else {
- assert(eversion_t() == i.second.have);
+ assert(i.second.is_delete() || eversion_t() == i.second.have);
}
}
} else {
}
} else {
ldpp_dout(dpp, 15) << "read_log_and_missing missing " << *i << dendl;
- missing.add(i->second, i->first, eversion_t());
+ missing.add(i->second, i->first, eversion_t(), false);
}
}
}
missing.flush();
}
ldpp_dout(dpp, 10) << "read_log_and_missing done" << dendl;
- }
-};
-
-#endif // CEPH_PG_LOG_H
+ } // static read_log_and_missing
+}; // struct PGLog
PrimaryLogPG *pg,
PrimaryLogPG::OpContext *ctx) : pg(pg), opcontext(ctx) {}
void finish(int r) override {
- if (r < 0)
- opcontext->async_read_result = r;
opcontext->finish_read(pg);
}
~OnReadComplete() override {}
assert(pg->in_progress_async_reads.size());
assert(pg->in_progress_async_reads.front().second == this);
pg->in_progress_async_reads.pop_front();
- pg->complete_read_ctx(async_read_result, this);
+
+ // Restart the op context now that all reads have been
+ // completed. Read failures will be handled by the op finisher
+ pg->execute_ctx(this);
}
}
-class CopyFromCallback: public PrimaryLogPG::CopyCallback {
+class CopyFromCallback : public PrimaryLogPG::CopyCallback {
public:
- PrimaryLogPG::CopyResults *results;
- int retval;
+ PrimaryLogPG::CopyResults *results = nullptr;
PrimaryLogPG::OpContext *ctx;
- explicit CopyFromCallback(PrimaryLogPG::OpContext *ctx_)
- : results(NULL),
- retval(0),
- ctx(ctx_) {}
+ OSDOp &osd_op;
+
+ CopyFromCallback(PrimaryLogPG::OpContext *ctx, OSDOp &osd_op)
+ : ctx(ctx), osd_op(osd_op) {
+ }
~CopyFromCallback() override {}
void finish(PrimaryLogPG::CopyCallbackResults results_) override {
results = results_.get<1>();
int r = results_.get<0>();
- retval = r;
// for finish_copyfrom
ctx->user_at_version = results->user_version;
if (r >= 0) {
ctx->pg->execute_ctx(ctx);
- }
- ctx->copy_cb = NULL;
- if (r < 0) {
+ } else {
if (r != -ECANCELED) { // on cancel just toss it out; client resends
if (ctx->op)
ctx->pg->osd->reply_op_error(ctx->op, r);
uint64_t get_data_size() {
return results->object_size;
}
- int get_result() {
- return retval;
+};
+
+// Op finisher that completes a copy-from operation by handing the stored
+// callback to PrimaryLogPG::finish_copyfrom().
+struct CopyFromFinisher : public PrimaryLogPG::OpFinisher {
+ // Callback carrying the op context (ctx); not deleted by this struct.
+ CopyFromCallback *copy_from_callback;
+
+ CopyFromFinisher(CopyFromCallback *copy_from_callback)
+ : copy_from_callback(copy_from_callback) {
+ }
+
+ // Calls finish_copyfrom() on the PG reached through the callback's ctx.
+ // Always returns 0; the return value of finish_copyfrom(), if any, is
+ // not propagated.
+ int execute() override {
+ // instance will be destructed after this method completes
+ copy_from_callback->ctx->pg->finish_copyfrom(copy_from_callback);
+ return 0;
}
};
const hobject_t &hoid,
const ObjectRecoveryInfo &_recovery_info,
ObjectContextRef obc,
+ bool is_delete,
ObjectStore::Transaction *t
)
{
ObjectRecoveryInfo recovery_info(_recovery_info);
clear_object_snap_mapping(t, hoid);
- if (recovery_info.soid.is_snap()) {
+ if (!is_delete && recovery_info.soid.is_snap()) {
OSDriver::OSTransaction _t(osdriver.get_transaction(t));
set<snapid_t> snaps;
dout(20) << " snapset " << recovery_info.ss
snaps,
&_t);
}
- if (pg_log.get_missing().is_missing(recovery_info.soid) &&
+ if (!is_delete && pg_log.get_missing().is_missing(recovery_info.soid) &&
pg_log.get_missing().get_items().find(recovery_info.soid)->second.need > recovery_info.version) {
assert(is_primary());
const pg_log_entry_t *latest = pg_log.get_log().objects.find(recovery_info.soid)->second;
recover_got(recovery_info.soid, recovery_info.version);
if (is_primary()) {
- assert(obc);
- obc->obs.exists = true;
- obc->ondisk_write_lock();
-
- bool got = obc->get_recovery_read();
- assert(got);
+ if (!is_delete) {
+ obc->obs.exists = true;
+ obc->ondisk_write_lock();
- assert(recovering.count(obc->obs.oi.soid));
- recovering[obc->obs.oi.soid] = obc;
- obc->obs.oi = recovery_info.oi; // may have been updated above
+ bool got = obc->get_recovery_read();
+ assert(got);
+ assert(recovering.count(obc->obs.oi.soid));
+ recovering[obc->obs.oi.soid] = obc;
+ obc->obs.oi = recovery_info.oi; // may have been updated above
+ t->register_on_applied_sync(new C_OSD_OndiskWriteUnlock(obc));
+ }
t->register_on_applied(new C_OSD_AppliedRecoveredObject(this, obc));
- t->register_on_applied_sync(new C_OSD_OndiskWriteUnlock(obc));
publish_stats_to_osd();
assert(missing_loc.needs_recovery(hoid));
- missing_loc.add_location(hoid, pg_whoami);
+ if (!is_delete)
+ missing_loc.add_location(hoid, pg_whoami);
release_backoffs(hoid);
if (!is_unreadable_object(hoid)) {
auto unreadable_object_entry = waiting_for_unreadable_object.find(hoid);
void PrimaryLogPG::on_global_recover(
const hobject_t &soid,
- const object_stat_sum_t &stat_diff)
+ const object_stat_sum_t &stat_diff,
+ bool is_delete)
{
info.stats.stats.sum.add(stat_diff);
missing_loc.recovered(soid);
map<hobject_t, ObjectContextRef>::iterator i = recovering.find(soid);
assert(i != recovering.end());
- // recover missing won't have had an obc, but it gets filled in
- // during on_local_recover
- assert(i->second);
- list<OpRequestRef> requeue_list;
- i->second->drop_recovery_read(&requeue_list);
- requeue_ops(requeue_list);
+ if (!is_delete) {
+ // recover missing won't have had an obc, but it gets filled in
+ // during on_local_recover
+ assert(i->second);
+ list<OpRequestRef> requeue_list;
+ i->second->drop_recovery_read(&requeue_list);
+ requeue_ops(requeue_list);
+ }
backfills_in_flight.erase(soid);
PGBackend::RecoveryHandle *h = pgbackend->open_recovery_op();
if (is_missing_object(soid)) {
recover_missing(soid, v, cct->_conf->osd_client_op_priority, h);
+ } else if (missing_loc.is_deleted(soid)) {
+ prep_object_replica_deletes(soid, v, h);
} else {
prep_object_replica_pushes(soid, v, h);
}
ConnectionRef con,
ceph_tid_t tid)
{
- const pg_missing_t &missing = pg_log.get_missing();
+ const auto &missing = pg_log.get_missing();
string prefix;
string format;
if (candidate.get_namespace() == cct->_conf->osd_hit_set_namespace)
continue;
+ if (missing_loc.is_deleted(candidate))
+ continue;
+
// skip wrong namespace
if (m->get_hobj().nspace != librados::all_nspaces &&
candidate.get_namespace() != m->get_hobj().nspace)
if (candidate.get_namespace() != m->get_hobj().nspace)
continue;
+ if (missing_loc.is_deleted(candidate))
+ continue;
+
if (filter && !pgls_filter(filter, candidate, filter_out))
continue;
}
}
- OpContext *ctx = new OpContext(op, m->get_reqid(), m->ops, obc, this);
+ OpContext *ctx = new OpContext(op, m->get_reqid(), &m->ops, obc, this);
if (!obc->obs.exists)
ctx->snapset_obc = get_object_context(obc->obs.oi.soid.get_snapdir(), false);
case CEPH_OSD_OP_SYNC_READ:
case CEPH_OSD_OP_SPARSE_READ:
case CEPH_OSD_OP_CHECKSUM:
+ case CEPH_OSD_OP_CMPEXT:
op.flags = (op.flags | CEPH_OSD_OP_FLAG_FADVISE_SEQUENTIAL) &
~(CEPH_OSD_OP_FLAG_FADVISE_DONTNEED | CEPH_OSD_OP_FLAG_FADVISE_NOCACHE);
}
osd->logger->inc(l_osd_tier_proxy_read);
const MOSDOp *m = static_cast<const MOSDOp*>(op->get_req());
- OpContext *ctx = new OpContext(op, m->get_reqid(), prdop->ops, this);
+ OpContext *ctx = new OpContext(op, m->get_reqid(), &prdop->ops, this);
ctx->reply = new MOSDOpReply(m, 0, get_osdmap()->get_epoch(), 0, false);
ctx->user_at_version = prdop->user_version;
ctx->data_off = prdop->data_offset;
dout(10) << __func__ << " Start proxy write for " << *m << dendl;
ProxyWriteOpRef pwop(std::make_shared<ProxyWriteOp>(op, soid, m->ops, m->get_reqid()));
- pwop->ctx = new OpContext(op, m->get_reqid(), pwop->ops, this);
+ pwop->ctx = new OpContext(op, m->get_reqid(), &pwop->ops, this);
pwop->mtime = m->get_mtime();
ObjectOperation obj_op;
ctx->at_version = get_next_version();
ctx->mtime = m->get_mtime();
- dout(10) << __func__ << " " << soid << " " << ctx->ops
+ dout(10) << __func__ << " " << soid << " " << *ctx->ops
<< " ov " << obc->obs.oi.version << " av " << ctx->at_version
<< " snapc " << ctx->snapc
<< " snapset " << obc->ssc->snapset
<< dendl;
} else {
- dout(10) << __func__ << " " << soid << " " << ctx->ops
+ dout(10) << __func__ << " " << soid << " " << *ctx->ops
<< " ov " << obc->obs.oi.version
<< dendl;
}
obc->ondisk_read_unlock();
}
- if (result == -EINPROGRESS) {
+ bool pending_async_reads = !ctx->pending_async_reads.empty();
+ if (result == -EINPROGRESS || pending_async_reads) {
// come back later.
+ if (pending_async_reads) {
+ in_progress_async_reads.push_back(make_pair(op, ctx));
+ ctx->start_async_reads(this);
+ }
return;
}
if (result >= 0)
do_osd_op_effects(ctx, m->get_connection());
- if (ctx->pending_async_reads.empty()) {
- complete_read_ctx(result, ctx);
- } else {
- in_progress_async_reads.push_back(make_pair(op, ctx));
- ctx->start_async_reads(this);
- }
-
+ complete_read_ctx(result, ctx);
return;
}
// save just what we need from ctx
MOSDOpReply *reply = ctx->reply;
ctx->reply = nullptr;
- reply->claim_op_out_data(ctx->ops);
- reply->get_header().data_off = ctx->data_off;
+ reply->claim_op_out_data(*ctx->ops);
+ reply->get_header().data_off = (ctx->data_off ? *ctx->data_off : 0);
close_op_ctx(ctx);
if (result == -ENOENT) {
repop->put();
}
+// Tear down an op context: release its object locks, drop its pending
+// transaction, run every queued on_finish callback, then delete the
+// context. ctx must not be used after this call.
+void PrimaryLogPG::close_op_ctx(OpContext *ctx) {
+  release_object_locks(ctx->lock_manager);
+
+  ctx->op_t.reset();
+
+  // Invoke each completion callback in order, removing it from the queue
+  // once it has run; erase() yields the next callback to process.
+  auto cb = ctx->on_finish.begin();
+  while (cb != ctx->on_finish.end()) {
+    (*cb)();
+    cb = ctx->on_finish.erase(cb);
+  }
+
+  delete ctx;
+}
+
void PrimaryLogPG::reply_ctx(OpContext *ctx, int r)
{
if (ctx->op)
} else {
auto p = snapset.clone_snaps.find(coid.snap);
if (p == snapset.clone_snaps.end()) {
- osd->clog->error() << __func__ << " No clone_snaps in snapset " << snapset
- << " for " << coid << "\n";
+ osd->clog->error() << "No clone_snaps in snapset " << snapset
+ << " for object " << coid << "\n";
return -ENOENT;
}
old_snaps.insert(snapset.clone_snaps[coid.snap].begin(),
snapset.clone_snaps[coid.snap].end());
}
if (old_snaps.empty()) {
- osd->clog->error() << __func__ << " No object info snaps for " << coid;
+ osd->clog->error() << "No object info snaps for object " << coid;
return -ENOENT;
}
dout(10) << coid << " old_snaps " << old_snaps
<< " old snapset " << snapset << dendl;
if (snapset.seq == 0) {
- osd->clog->error() << __func__ << " No snapset.seq for " << coid;
+ osd->clog->error() << "No snapset.seq for object " << coid;
return -ENOENT;
}
if (new_snaps.empty()) {
p = std::find(snapset.clones.begin(), snapset.clones.end(), coid.snap);
if (p == snapset.clones.end()) {
- osd->clog->error() << __func__ << " Snap " << coid.snap << " not in clones";
+ osd->clog->error() << "Snap " << coid.snap << " not in clones";
return -ENOENT;
}
}
}
}
-int PrimaryLogPG::do_extent_cmp(OpContext *ctx, OSDOp& osd_op)
-{
- ceph_osd_op& op = osd_op.op;
- vector<OSDOp> read_ops(1);
- OSDOp& read_op = read_ops[0];
- int result = 0;
-
- read_op.op.op = CEPH_OSD_OP_SYNC_READ;
- read_op.op.extent.offset = op.extent.offset;
- read_op.op.extent.length = op.extent.length;
- read_op.op.extent.truncate_seq = op.extent.truncate_seq;
- read_op.op.extent.truncate_size = op.extent.truncate_size;
-
- result = do_osd_ops(ctx, read_ops);
- if (result < 0) {
- derr << "do_extent_cmp do_osd_ops failed " << result << dendl;
- return result;
- }
-
- if (read_op.outdata.length() != osd_op.indata.length())
- return -EINVAL;
-
- for (uint64_t p = 0; p < osd_op.indata.length(); p++) {
- if (read_op.outdata[p] != osd_op.indata[p]) {
- return (-MAX_ERRNO - p);
- }
- }
-
- return result;
-}
-
int PrimaryLogPG::do_writesame(OpContext *ctx, OSDOp& osd_op)
{
ceph_osd_op& op = osd_op.op;
r(r), rval(rv), outdatap(blp), maybe_crc(mc),
size(size), osd(osd), soid(soid), flags(flags) {}
void finish(int len) override {
- *rval = len;
*r = len;
- if (len < 0)
+ if (len < 0) {
+ *rval = len;
return;
+ }
+ *rval = 0;
+
// whole object? can we verify the checksum?
if (maybe_crc && *r == size) {
uint32_t crc = outdatap->crc32c(-1);
};
struct ToSparseReadResult : public Context {
- bufferlist& data_bl;
+ int* result;
+ bufferlist* data_bl;
uint64_t data_offset;
- ceph_le64& len;
- ToSparseReadResult(bufferlist& bl, uint64_t offset, ceph_le64& len):
- data_bl(bl), data_offset(offset),len(len) {}
+ ceph_le64* len;
+ ToSparseReadResult(int* result, bufferlist* bl, uint64_t offset,
+ ceph_le64* len)
+ : result(result), data_bl(bl), data_offset(offset),len(len) {}
void finish(int r) override {
- if (r < 0) return;
- len = r;
+ if (r < 0) {
+ *result = r;
+ return;
+ }
+ *result = 0;
+ *len = r;
bufferlist outdata;
map<uint64_t, uint64_t> extents = {{data_offset, r}};
::encode(extents, outdata);
- ::encode_destructively(data_bl, outdata);
- data_bl.swap(outdata);
+ ::encode_destructively(*data_bl, outdata);
+ data_bl->swap(outdata);
}
};
}
}
+// Op finisher that reports the result code recorded on an OSDOp.
+// Holds a reference only, so the OSDOp must outlive the finisher.
+struct ReadFinisher : public PrimaryLogPG::OpFinisher {
+  OSDOp& osd_op;  // sub-op whose rval is returned by execute()
+
+  ReadFinisher(OSDOp& osd_op) : osd_op(osd_op) {}
+
+  // Return whatever result code is currently stored in the sub-op.
+  int execute() override { return osd_op.rval; }
+};
+
struct C_ChecksumRead : public Context {
PrimaryLogPG *primary_log_pg;
OSDOp &osd_op;
&read_bl, maybe_crc, size,
osd, soid, flags)) {
}
+ ~C_ChecksumRead() override {
+ delete fill_extent_ctx;
+ }
void finish(int r) override {
fill_extent_ctx->complete(r);
+ fill_extent_ctx = nullptr;
if (osd_op.rval >= 0) {
bufferlist::iterator init_value_bl_it = init_value_bl.begin();
osd_op.rval = primary_log_pg->finish_checksum(osd_op, csum_type,
- &init_value_bl_it,
- read_bl);
+ &init_value_bl_it, read_bl);
}
}
};
int PrimaryLogPG::do_checksum(OpContext *ctx, OSDOp& osd_op,
- bufferlist::iterator *bl_it, bool *async_read)
+ bufferlist::iterator *bl_it)
{
dout(20) << __func__ << dendl;
auto checksum_ctx = new C_ChecksumRead(this, osd_op, csum_type,
std::move(init_value_bl), maybe_crc,
oi.size, osd, soid, op.flags);
+
ctx->pending_async_reads.push_back({
{op.checksum.offset, op.checksum.length, op.flags},
{&checksum_ctx->read_bl, checksum_ctx}});
dout(10) << __func__ << ": async_read noted for " << soid << dendl;
- *async_read = true;
- return 0;
+ ctx->op_finishers[ctx->current_osd_subop_num].reset(
+ new ReadFinisher(osd_op));
+ return -EINPROGRESS;
}
// sync read
- *async_read = false;
std::vector<OSDOp> read_ops(1);
auto& read_op = read_ops[0];
if (op.checksum.length > 0) {
return 0;
}
+struct C_ExtentCmpRead : public Context { // read-completion context for extent compare: settles the read result, then runs finish_extent_cmp
+ PrimaryLogPG *primary_log_pg;
+ OSDOp &osd_op;
+ ceph_le64 read_length; // passed to FillInVerifyExtent as its length output
+ bufferlist read_bl; // passed to FillInVerifyExtent and later compared against osd_op.indata
+ Context *fill_extent_ctx; // created in the constructor; finish() completes or deletes it and then nulls it
+
+ C_ExtentCmpRead(PrimaryLogPG *primary_log_pg, OSDOp &osd_op,
+ boost::optional<uint32_t> maybe_crc, uint64_t size,
+ OSDService *osd, hobject_t soid, __le32 flags)
+ : primary_log_pg(primary_log_pg), osd_op(osd_op),
+ fill_extent_ctx(new FillInVerifyExtent(&read_length, &osd_op.rval,
+ &read_bl, maybe_crc, size,
+ osd, soid, flags)) {
+ }
+ ~C_ExtentCmpRead() override {
+ delete fill_extent_ctx; // deleting nullptr is a no-op once finish() has cleared the pointer
+ }
+
+ void finish(int r) override {
+ if (r == -ENOENT) { // object not found: compare against an empty buffer instead of failing
+ osd_op.rval = 0;
+ read_bl.clear();
+ delete fill_extent_ctx;
+ } else {
+ fill_extent_ctx->complete(r); // presumably complete() frees the context, hence the nulling below -- confirm against Context
+ }
+ fill_extent_ctx = nullptr;
+
+ if (osd_op.rval >= 0) { // run the comparison only when rval is non-negative at this point
+ osd_op.rval = primary_log_pg->finish_extent_cmp(osd_op, read_bl);
+ }
+ }
+};
+
+// Execute an extent-compare op: fetch the object bytes covered by op.extent
+// and compare them against osd_op.indata through finish_extent_cmp().
+//
+// Returns finish_extent_cmp()'s result when the comparison runs immediately
+// (object absent or whiteout, or a synchronous read), the negative
+// do_osd_ops() result if the synchronous read fails, or -EINPROGRESS once an
+// async read has been queued and a ReadFinisher registered for this sub-op.
+int PrimaryLogPG::do_extent_cmp(OpContext *ctx, OSDOp& osd_op)
+{
+ dout(20) << __func__ << dendl;
+ ceph_osd_op& op = osd_op.op;
+
+ if (!ctx->obs->exists || ctx->obs->oi.is_whiteout()) {
+ // a nonexistent object is compared as an empty buffer
+ dout(20) << __func__ << " object DNE" << dendl;
+ return finish_extent_cmp(osd_op, {});
+ } else if (pool.info.require_rollback()) {
+ // If there is a data digest and it is possible we are reading
+ // entire object, pass the digest.
+ auto& oi = ctx->new_obs.oi;
+ boost::optional<uint32_t> maybe_crc;
+ // This op carries its range in op.extent (the same fields the read
+ // below is queued with), so test those rather than op.checksum.
+ if (oi.is_data_digest() && op.extent.offset == 0 &&
+ op.extent.length >= oi.size) {
+ maybe_crc = oi.data_digest;
+ }
+
+ // async read
+ auto& soid = oi.soid;
+ auto extent_cmp_ctx = new C_ExtentCmpRead(this, osd_op, maybe_crc, oi.size,
+ osd, soid, op.flags);
+ ctx->pending_async_reads.push_back({
+ {op.extent.offset, op.extent.length, op.flags},
+ {&extent_cmp_ctx->read_bl, extent_cmp_ctx}});
+
+ dout(10) << __func__ << ": async_read noted for " << soid << dendl;
+
+ // the finisher reports osd_op.rval when this sub-op is revisited
+ ctx->op_finishers[ctx->current_osd_subop_num].reset(
+ new ReadFinisher(osd_op));
+ return -EINPROGRESS;
+ }
+
+ // sync read
+ vector<OSDOp> read_ops(1);
+ OSDOp& read_op = read_ops[0];
+
+ // issue a one-element SYNC_READ covering the same extent and truncate info
+ read_op.op.op = CEPH_OSD_OP_SYNC_READ;
+ read_op.op.extent.offset = op.extent.offset;
+ read_op.op.extent.length = op.extent.length;
+ read_op.op.extent.truncate_seq = op.extent.truncate_seq;
+ read_op.op.extent.truncate_size = op.extent.truncate_size;
+
+ int result = do_osd_ops(ctx, read_ops);
+ if (result < 0) {
+ derr << __func__ << " failed " << result << dendl;
+ return result;
+ }
+ return finish_extent_cmp(osd_op, read_op.outdata);
+}
+
+int PrimaryLogPG::finish_extent_cmp(OSDOp& osd_op, const bufferlist &read_bl) // byte-wise compare of osd_op.indata against read_bl
+{
+ for (uint64_t idx = 0; idx < osd_op.indata.length(); ++idx) { // walks every byte of indata
+ char read_byte = (idx < read_bl.length() ? read_bl[idx] : 0); // positions past the end of read_bl compare as zero
+ if (osd_op.indata[idx] != read_byte) {
+ return (-MAX_ERRNO - idx); // first mismatching offset, encoded as -MAX_ERRNO minus that offset
+ }
+ }
+
+ return 0; // no byte of indata differed
+}
+
+int PrimaryLogPG::do_read(OpContext *ctx, OSDOp& osd_op) { // trims op.extent to the object size, then reads sync or queues an async read; returns 0 or a negative error
+ dout(20) << __func__ << dendl;
+ auto& op = osd_op.op;
+ auto& oi = ctx->new_obs.oi;
+ auto& soid = oi.soid;
+ __u32 seq = oi.truncate_seq;
+ uint64_t size = oi.size;
+ bool trimmed_read = false;
+
+ // are we beyond truncate_size?
+ if ( (seq < op.extent.truncate_seq) &&
+ (op.extent.offset + op.extent.length > op.extent.truncate_size) )
+ size = op.extent.truncate_size;
+
+ if (op.extent.length == 0) // a zero length means read the whole object
+ op.extent.length = size;
+
+ if (op.extent.offset >= size) {
+ op.extent.length = 0;
+ trimmed_read = true;
+ } else if (op.extent.offset + op.extent.length > size) {
+ op.extent.length = size - op.extent.offset;
+ trimmed_read = true;
+ }
+
+ // read into a buffer
+ int result = 0;
+ if (trimmed_read && op.extent.length == 0) {
+ // read size was trimmed to zero and it is expected to do nothing
+ // a read operation of 0 bytes does *not* do nothing, this is why
+ // the trimmed_read boolean is needed
+ } else if (pool.info.require_rollback()) {
+ boost::optional<uint32_t> maybe_crc;
+ // If there is a data digest and it is possible we are reading
+ // entire object, pass the digest. FillInVerifyExtent will
+ // check the oi.size again.
+ if (oi.is_data_digest() && op.extent.offset == 0 &&
+ op.extent.length >= oi.size)
+ maybe_crc = oi.data_digest;
+ ctx->pending_async_reads.push_back(
+ make_pair(
+ boost::make_tuple(op.extent.offset, op.extent.length, op.flags),
+ make_pair(&osd_op.outdata,
+ new FillInVerifyExtent(&op.extent.length, &osd_op.rval,
+ &osd_op.outdata, maybe_crc, oi.size,
+ osd, soid, op.flags))));
+ dout(10) << " async_read noted for " << soid << dendl;
+
+ ctx->op_finishers[ctx->current_osd_subop_num].reset( // registered under the current sub-op index; result stays 0 on this path
+ new ReadFinisher(osd_op));
+ } else {
+ int r = pgbackend->objects_read_sync(
+ soid, op.extent.offset, op.extent.length, op.flags, &osd_op.outdata);
+ if (r == -EIO) {
+ r = rep_repair_primary_object(soid, ctx->op); // on -EIO, r is replaced by the repair call's return value
+ }
+ if (r >= 0)
+ op.extent.length = r; // record the number of bytes actually read
+ else {
+ result = r;
+ op.extent.length = 0;
+ }
+ dout(10) << " read got " << r << " / " << op.extent.length
+ << " bytes from obj " << soid << dendl;
+
+ // whole object? can we verify the checksum?
+ if (op.extent.length == oi.size && oi.is_data_digest()) {
+ uint32_t crc = osd_op.outdata.crc32c(-1);
+ if (oi.data_digest != crc) {
+ osd->clog->error() << info.pgid << std::hex
+ << " full-object read crc 0x" << crc
+ << " != expected 0x" << oi.data_digest
+ << std::dec << " on " << soid;
+ // FIXME fall back to replica or something?
+ result = -EIO;
+ }
+ }
+ }
+
+ // XXX the op.extent.length is the requested length for async read
+ // On error this length is changed to 0 after the error comes back.
+ ctx->delta_stats.num_rd_kb += SHIFT_ROUND_UP(op.extent.length, 10);
+ ctx->delta_stats.num_rd++;
+ return result;
+}
+
+int PrimaryLogPG::do_sparse_read(OpContext *ctx, OSDOp& osd_op) { // encodes an extent map plus data into osd_op.outdata; returns 0 or a negative error
+ dout(20) << __func__ << dendl;
+ auto& op = osd_op.op;
+ auto& oi = ctx->new_obs.oi;
+ auto& soid = oi.soid;
+
+ if (op.extent.truncate_seq) {
+ dout(0) << "sparse_read does not support truncation sequence " << dendl;
+ return -EINVAL;
+ }
+
+ ++ctx->num_read;
+ if (pool.info.ec_pool()) {
+ // translate sparse read to a normal one if not supported
+ uint64_t offset = op.extent.offset;
+ uint64_t length = op.extent.length;
+ if (offset > oi.size) {
+ length = 0;
+ } else if (offset + length > oi.size) {
+ length = oi.size - offset;
+ }
+
+ if (length > 0) {
+ ctx->pending_async_reads.push_back(
+ make_pair(
+ boost::make_tuple(offset, length, op.flags),
+ make_pair(
+ &osd_op.outdata,
+ new ToSparseReadResult(&osd_op.rval, &osd_op.outdata, offset,
+ &op.extent.length))));
+ dout(10) << " async_read (was sparse_read) noted for " << soid << dendl;
+
+ ctx->op_finishers[ctx->current_osd_subop_num].reset( // registered under the current sub-op index; this path still returns 0 below
+ new ReadFinisher(osd_op));
+ } else {
+ dout(10) << " sparse read ended up empty for " << soid << dendl;
+ map<uint64_t, uint64_t> extents; // left empty, so only an empty extent map is encoded
+ ::encode(extents, osd_op.outdata);
+ }
+ } else {
+ // read into a buffer
+ map<uint64_t, uint64_t> m; // extent map filled by fiemap: offset -> length
+ uint32_t total_read = 0;
+ int r = osd->store->fiemap(ch, ghobject_t(soid, ghobject_t::NO_GEN,
+ info.pgid.shard),
+ op.extent.offset, op.extent.length, m);
+ if (r < 0) {
+ return r;
+ }
+
+ map<uint64_t, uint64_t>::iterator miter;
+ bufferlist data_bl;
+ uint64_t last = op.extent.offset; // end of the data consumed so far; a gap before the next extent is a hole
+ for (miter = m.begin(); miter != m.end(); ++miter) {
+ // verify hole?
+ if (cct->_conf->osd_verify_sparse_read_holes &&
+ last < miter->first) {
+ bufferlist t;
+ uint64_t len = miter->first - last;
+ r = pgbackend->objects_read_sync(soid, last, len, op.flags, &t);
+ if (r == -EIO) {
+ r = rep_repair_primary_object(soid, ctx->op);
+ }
+ if (r < 0) { // hole-check failures are only logged; r is overwritten by the data read below
+ osd->clog->error() << coll << " " << soid
+ << " sparse-read failed to read: "
+ << r;
+ } else if (!t.is_zero()) {
+ osd->clog->error() << coll << " " << soid
+ << " sparse-read found data in hole "
+ << last << "~" << len;
+ }
+ }
+
+ bufferlist tmpbl;
+ r = pgbackend->objects_read_sync(soid, miter->first, miter->second,
+ op.flags, &tmpbl);
+ if (r < 0) {
+ return r;
+ }
+
+ // this usually happens when we get an extent that exceeds the actual
+ // file size
+ if (r < (int)miter->second)
+ miter->second = r;
+ total_read += r;
+ dout(10) << "sparse-read " << miter->first << "@" << miter->second
+ << dendl;
+ data_bl.claim_append(tmpbl);
+ last = miter->first + r;
+ }
+
+ if (r < 0) {
+ return r;
+ }
+
+ // verify trailing hole?
+ if (cct->_conf->osd_verify_sparse_read_holes) {
+ uint64_t end = MIN(op.extent.offset + op.extent.length, oi.size);
+ if (last < end) {
+ bufferlist t;
+ uint64_t len = end - last;
+ r = pgbackend->objects_read_sync(soid, last, len, op.flags, &t);
+ if (r < 0) { // logged only; unlike the in-loop hole check, no repair is attempted on -EIO here
+ osd->clog->error() << coll << " " << soid
+ << " sparse-read failed to read: " << r;
+ } else if (!t.is_zero()) {
+ osd->clog->error() << coll << " " << soid
+ << " sparse-read found data in hole "
+ << last << "~" << len;
+ }
+ }
+ }
+
+ // Why does SPARSE_READ need a checksum? In fact, librbd always uses
+ // sparse-read. At first there may not be many whole objects, but with
+ // continued use more and more whole objects exist. From that point on,
+ // adding a checksum for sparse-read makes sense.
+ if (total_read == oi.size && oi.is_data_digest()) {
+ uint32_t crc = data_bl.crc32c(-1);
+ if (oi.data_digest != crc) {
+ osd->clog->error() << info.pgid << std::hex
+ << " full-object read crc 0x" << crc
+ << " != expected 0x" << oi.data_digest
+ << std::dec << " on " << soid;
+ // FIXME fall back to replica or something?
+ return -EIO;
+ }
+ }
+
+ op.extent.length = total_read;
+
+ ::encode(m, osd_op.outdata); // re-encode since it might be modified
+ ::encode_destructively(data_bl, osd_op.outdata);
+
+ dout(10) << " sparse_read got " << total_read << " bytes from object "
+ << soid << dendl;
+ }
+
+ ctx->delta_stats.num_rd_kb += SHIFT_ROUND_UP(op.extent.length, 10);
+ ctx->delta_stats.num_rd++;
+ return 0;
+}
+
int PrimaryLogPG::do_osd_ops(OpContext *ctx, vector<OSDOp>& ops)
{
int result = 0;
object_info_t& oi = obs.oi;
const hobject_t& soid = oi.soid;
- bool first_read = true;
-
PGTransaction* t = ctx->op_t.get();
dout(10) << "do_osd_op " << soid << " " << ops << dendl;
+ ctx->current_osd_subop_num = 0;
for (vector<OSDOp>::iterator p = ops.begin(); p != ops.end(); ++p, ctx->current_osd_subop_num++) {
OSDOp& osd_op = *p;
ceph_osd_op& op = osd_op.op;
+ OpFinisher* op_finisher = nullptr;
+ {
+ auto op_finisher_it = ctx->op_finishers.find(ctx->current_osd_subop_num);
+ if (op_finisher_it != ctx->op_finishers.end()) {
+ op_finisher = op_finisher_it->second.get();
+ }
+ }
+
// TODO: check endianness (__le32 vs uint32_t, etc.)
// The fields in ceph_osd_op are little-endian (according to the definition in rados.h),
// but the code in this function seems to treat them as native-endian. What should the
case CEPH_OSD_OP_CMPEXT:
++ctx->num_read;
- tracepoint(osd, do_osd_op_pre_extent_cmp, soid.oid.name.c_str(), soid.snap.val, oi.size, oi.truncate_seq, op.extent.offset, op.extent.length, op.extent.truncate_size, op.extent.truncate_seq);
- result = do_extent_cmp(ctx, osd_op);
+ tracepoint(osd, do_osd_op_pre_extent_cmp, soid.oid.name.c_str(),
+ soid.snap.val, oi.size, oi.truncate_seq, op.extent.offset,
+ op.extent.length, op.extent.truncate_size,
+ op.extent.truncate_seq);
+
+ if (op_finisher == nullptr) {
+ result = do_extent_cmp(ctx, osd_op);
+ } else {
+ result = op_finisher->execute();
+ }
break;
case CEPH_OSD_OP_SYNC_READ:
// fall through
case CEPH_OSD_OP_READ:
++ctx->num_read;
- {
- __u32 seq = oi.truncate_seq;
- uint64_t size = oi.size;
- tracepoint(osd, do_osd_op_pre_read, soid.oid.name.c_str(), soid.snap.val, size, seq, op.extent.offset, op.extent.length, op.extent.truncate_size, op.extent.truncate_seq);
- bool trimmed_read = false;
- // are we beyond truncate_size?
- if ( (seq < op.extent.truncate_seq) &&
- (op.extent.offset + op.extent.length > op.extent.truncate_size) )
- size = op.extent.truncate_size;
-
- if (op.extent.length == 0) //length is zero mean read the whole object
- op.extent.length = size;
-
- if (op.extent.offset >= size) {
- op.extent.length = 0;
- trimmed_read = true;
- } else if (op.extent.offset + op.extent.length > size) {
- op.extent.length = size - op.extent.offset;
- trimmed_read = true;
- }
-
- // read into a buffer
- bool async = false;
- if (trimmed_read && op.extent.length == 0) {
- // read size was trimmed to zero and it is expected to do nothing
- // a read operation of 0 bytes does *not* do nothing, this is why
- // the trimmed_read boolean is needed
- } else if (pool.info.require_rollback()) {
- async = true;
- boost::optional<uint32_t> maybe_crc;
- // If there is a data digest and it is possible we are reading
- // entire object, pass the digest. FillInVerifyExtent will
- // will check the oi.size again.
- if (oi.is_data_digest() && op.extent.offset == 0 &&
- op.extent.length >= oi.size)
- maybe_crc = oi.data_digest;
- ctx->pending_async_reads.push_back(
- make_pair(
- boost::make_tuple(op.extent.offset, op.extent.length, op.flags),
- make_pair(&osd_op.outdata,
- new FillInVerifyExtent(&op.extent.length, &osd_op.rval,
- &osd_op.outdata, maybe_crc, oi.size, osd,
- soid, op.flags))));
- dout(10) << " async_read noted for " << soid << dendl;
- } else {
- int r = pgbackend->objects_read_sync(
- soid, op.extent.offset, op.extent.length, op.flags, &osd_op.outdata);
- if (r == -EIO) {
- r = rep_repair_primary_object(soid, ctx->op);
- }
- if (r >= 0)
- op.extent.length = r;
- else {
- result = r;
- op.extent.length = 0;
- }
- dout(10) << " read got " << r << " / " << op.extent.length
- << " bytes from obj " << soid << dendl;
-
- // whole object? can we verify the checksum?
- if (op.extent.length == oi.size && oi.is_data_digest()) {
- uint32_t crc = osd_op.outdata.crc32c(-1);
- if (oi.data_digest != crc) {
- osd->clog->error() << info.pgid << std::hex
- << " full-object read crc 0x" << crc
- << " != expected 0x" << oi.data_digest
- << std::dec << " on " << soid;
- // FIXME fall back to replica or something?
- result = -EIO;
- }
- }
- }
- if (first_read) {
- first_read = false;
+ tracepoint(osd, do_osd_op_pre_read, soid.oid.name.c_str(),
+ soid.snap.val, oi.size, oi.truncate_seq, op.extent.offset,
+ op.extent.length, op.extent.truncate_size,
+ op.extent.truncate_seq);
+ if (op_finisher == nullptr) {
+ if (!ctx->data_off) {
ctx->data_off = op.extent.offset;
}
- // XXX the op.extent.length is the requested length for async read
- // On error this length is changed to 0 after the error comes back.
- ctx->delta_stats.num_rd_kb += SHIFT_ROUND_UP(op.extent.length, 10);
- ctx->delta_stats.num_rd++;
-
- // Skip checking the result and just proceed to the next operation
- if (async)
- continue;
-
+ result = do_read(ctx, osd_op);
+ } else {
+ result = op_finisher->execute();
}
break;
op.checksum.offset, op.checksum.length,
op.checksum.chunk_size);
- bool async_read;
- result = do_checksum(ctx, osd_op, &bp, &async_read);
- if (result == 0 && async_read) {
- continue;
+ if (op_finisher == nullptr) {
+ result = do_checksum(ctx, osd_op, &bp);
+ } else {
+ result = op_finisher->execute();
}
}
break;
/* map extents */
case CEPH_OSD_OP_SPARSE_READ:
- tracepoint(osd, do_osd_op_pre_sparse_read, soid.oid.name.c_str(), soid.snap.val, oi.size, oi.truncate_seq, op.extent.offset, op.extent.length, op.extent.truncate_size, op.extent.truncate_seq);
- if (op.extent.truncate_seq) {
- dout(0) << "sparse_read does not support truncation sequence " << dendl;
- result = -EINVAL;
- break;
- }
- ++ctx->num_read;
- if (pool.info.ec_pool()) {
- // translate sparse read to a normal one if not supported
- uint64_t offset = op.extent.offset;
- uint64_t length = op.extent.length;
- if (offset > oi.size) {
- length = 0;
- } else if (offset + length > oi.size) {
- length = oi.size - offset;
- }
- if (length > 0) {
- ctx->pending_async_reads.push_back(
- make_pair(
- boost::make_tuple(offset, length, op.flags),
- make_pair(
- &osd_op.outdata,
- new ToSparseReadResult(
- osd_op.outdata, offset,
- op.extent.length /* updated by the callback */))));
- dout(10) << " async_read (was sparse_read) noted for " << soid << dendl;
- } else {
- dout(10) << " sparse read ended up empty for " << soid << dendl;
- map<uint64_t, uint64_t> extents;
- ::encode(extents, osd_op.outdata);
- }
+ tracepoint(osd, do_osd_op_pre_sparse_read, soid.oid.name.c_str(),
+ soid.snap.val, oi.size, oi.truncate_seq, op.extent.offset,
+ op.extent.length, op.extent.truncate_size,
+ op.extent.truncate_seq);
+ if (op_finisher == nullptr) {
+ result = do_sparse_read(ctx, osd_op);
} else {
- // read into a buffer
- map<uint64_t, uint64_t> m;
- uint32_t total_read = 0;
- int r = osd->store->fiemap(ch, ghobject_t(soid, ghobject_t::NO_GEN,
- info.pgid.shard),
- op.extent.offset, op.extent.length, m);
- if (r < 0) {
- result = r;
- break;
- }
- map<uint64_t, uint64_t>::iterator miter;
- bufferlist data_bl;
- uint64_t last = op.extent.offset;
- for (miter = m.begin(); miter != m.end(); ++miter) {
- // verify hole?
- if (cct->_conf->osd_verify_sparse_read_holes &&
- last < miter->first) {
- bufferlist t;
- uint64_t len = miter->first - last;
- r = pgbackend->objects_read_sync(soid, last, len, op.flags, &t);
- if (r == -EIO) {
- r = rep_repair_primary_object(soid, ctx->op);
- }
- if (r < 0) {
- osd->clog->error() << coll << " " << soid
- << " sparse-read failed to read: "
- << r;
- } else if (!t.is_zero()) {
- osd->clog->error() << coll << " " << soid << " sparse-read found data in hole "
- << last << "~" << len;
- }
- }
-
- bufferlist tmpbl;
- r = pgbackend->objects_read_sync(soid, miter->first, miter->second, op.flags, &tmpbl);
- if (r < 0) {
- result = r;
- break;
- }
-
- if (r < (int)miter->second) /* this is usually happen when we get extent that exceeds the actual file size */
- miter->second = r;
- total_read += r;
- dout(10) << "sparse-read " << miter->first << "@" << miter->second << dendl;
- data_bl.claim_append(tmpbl);
- last = miter->first + r;
- }
-
- if (r < 0) {
- result = r;
- break;
- }
-
- // verify trailing hole?
- if (cct->_conf->osd_verify_sparse_read_holes) {
- uint64_t end = MIN(op.extent.offset + op.extent.length, oi.size);
- if (last < end) {
- bufferlist t;
- uint64_t len = end - last;
- r = pgbackend->objects_read_sync(soid, last, len, op.flags, &t);
- if (r < 0) {
- osd->clog->error() << coll << " " << soid
- << " sparse-read failed to read: "
- << r;
- } else if (!t.is_zero()) {
- osd->clog->error() << coll << " " << soid << " sparse-read found data in hole "
- << last << "~" << len;
- }
- }
- }
-
- // Why SPARSE_READ need checksum? In fact, librbd always use sparse-read.
- // Maybe at first, there is no much whole objects. With continued use, more and more whole object exist.
- // So from this point, for spare-read add checksum make sense.
- if (total_read == oi.size && oi.is_data_digest()) {
- uint32_t crc = data_bl.crc32c(-1);
- if (oi.data_digest != crc) {
- osd->clog->error() << info.pgid << std::hex
- << " full-object read crc 0x" << crc
- << " != expected 0x" << oi.data_digest
- << std::dec << " on " << soid;
- // FIXME fall back to replica or something?
- result = -EIO;
- break;
- }
- }
-
- op.extent.length = total_read;
-
- ::encode(m, osd_op.outdata); // re-encode since it might be modified
- ::encode_destructively(data_bl, osd_op.outdata);
-
- dout(10) << " sparse_read got " << total_read << " bytes from object " << soid << dendl;
+ result = op_finisher->execute();
}
- ctx->delta_stats.num_rd_kb += SHIFT_ROUND_UP(op.extent.length, 10);
- ctx->delta_stats.num_rd++;
break;
case CEPH_OSD_OP_CALL:
case CEPH_OSD_OP_COPY_GET:
++ctx->num_read;
- tracepoint(osd, do_osd_op_pre_copy_get, soid.oid.name.c_str(), soid.snap.val);
- result = fill_in_copy_get(ctx, bp, osd_op, ctx->obc);
+ tracepoint(osd, do_osd_op_pre_copy_get, soid.oid.name.c_str(),
+ soid.snap.val);
+ if (op_finisher == nullptr) {
+ result = do_copy_get(ctx, bp, osd_op, ctx->obc);
+ } else {
+ result = op_finisher->execute();
+ }
break;
case CEPH_OSD_OP_COPY_FROM:
src_oloc.hash,
src_snapid,
src_version);
- if (!ctx->copy_cb) {
+ if (op_finisher == nullptr) {
// start
pg_t raw_pg;
get_osdmap()->object_locator_to_pg(src_name, src_oloc, raw_pg);
result = -EINVAL;
break;
}
- CopyFromCallback *cb = new CopyFromCallback(ctx);
- ctx->copy_cb = cb;
+ CopyFromCallback *cb = new CopyFromCallback(ctx, osd_op);
+ ctx->op_finishers[ctx->current_osd_subop_num].reset(
+ new CopyFromFinisher(cb));
start_copy(cb, ctx->obc, src, src_oloc, src_version,
op.copy_from.flags,
false,
result = -EINPROGRESS;
} else {
// finish
- assert(ctx->copy_cb->get_result() >= 0);
- finish_copyfrom(ctx);
- result = 0;
+ result = op_finisher->execute();
+ assert(result == 0);
+
+ // COPY_FROM cannot be executed multiple times -- it must restart
+ ctx->op_finishers.erase(ctx->current_osd_subop_num);
}
}
break;
&rollback_to, false, false, &missing_oid);
if (ret == -EAGAIN) {
/* clone must be missing */
- assert(is_missing_object(missing_oid));
- dout(20) << "_rollback_to attempted to roll back to a missing object "
+ assert(is_degraded_or_backfilling_object(missing_oid));
+ dout(20) << "_rollback_to attempted to roll back to a missing or backfilling clone "
<< missing_oid << " (requested snapid: ) " << snapid << dendl;
block_write_on_degraded_snap(missing_oid, ctx->op);
return ret;
int PrimaryLogPG::prepare_transaction(OpContext *ctx)
{
- assert(!ctx->ops.empty());
-
+ assert(!ctx->ops->empty());
+
const hobject_t& soid = ctx->obs->oi.soid;
// valid snap context?
}
// prepare the actual mutation
- int result = do_osd_ops(ctx, ctx->ops);
+ int result = do_osd_ops(ctx, *ctx->ops);
if (result < 0) {
if (ctx->op->may_write() &&
get_osdmap()->require_osd_release >= CEPH_RELEASE_KRAKEN) {
const MOSDOp *m = static_cast<const MOSDOp*>(ctx->op->get_req());
assert(ctx->async_reads_complete());
- for (vector<OSDOp>::iterator p = ctx->ops.begin();
- p != ctx->ops.end() && result >= 0; ++p) {
+ for (vector<OSDOp>::iterator p = ctx->ops->begin();
+ p != ctx->ops->end() && result >= 0; ++p) {
if (p->rval < 0 && !(p->op.flags & CEPH_OSD_OP_FLAG_FAILOK)) {
result = p->rval;
break;
}
ctx->bytes_read += p->outdata.length();
}
- ctx->reply->claim_op_out_data(ctx->ops);
- ctx->reply->get_header().data_off = ctx->data_off;
+ ctx->reply->claim_op_out_data(*ctx->ops);
+ ctx->reply->get_header().data_off = (ctx->data_off ? *ctx->data_off : 0);
MOSDOpReply *reply = ctx->reply;
ctx->reply = nullptr;
C_CopyFrom_AsyncReadCb(OSDOp *osd_op, uint64_t features) :
osd_op(osd_op), features(features), len(0) {}
void finish(int r) override {
+ osd_op->rval = r;
+ if (r < 0) {
+ return;
+ }
+
assert(len > 0);
assert(len <= reply_obj.data.length());
bufferlist bl;
}
};
-int PrimaryLogPG::fill_in_copy_get(
- OpContext *ctx,
- bufferlist::iterator& bp,
- OSDOp& osd_op,
- ObjectContextRef &obc)
+int PrimaryLogPG::do_copy_get(OpContext *ctx, bufferlist::iterator& bp,
+ OSDOp& osd_op, ObjectContextRef &obc)
{
object_info_t& oi = obc->obs.oi;
hobject_t& soid = oi.soid;
make_pair(
boost::make_tuple(cursor.data_offset, max_read, osd_op.op.flags),
make_pair(&bl, cb)));
- result = max_read;
- cb->len = result;
+ cb->len = max_read;
+
+ ctx->op_finishers[ctx->current_osd_subop_num].reset(
+ new ReadFinisher(osd_op));
+ result = -EINPROGRESS;
+
+ dout(10) << __func__ << ": async_read noted for " << soid << dendl;
} else {
result = pgbackend->objects_read_sync(
- oi.soid, cursor.data_offset, left, osd_op.op.flags, &bl);
+ oi.soid, cursor.data_offset, max_read, osd_op.op.flags, &bl);
if (result < 0)
return result;
}
- assert(result <= left);
- left -= result;
- cursor.data_offset += result;
+ left -= max_read;
+ cursor.data_offset += max_read;
}
if (cursor.data_offset == oi.size) {
cursor.data_complete = true;
if (cb && !async_read_started) {
delete cb;
}
- result = 0;
+
+ if (result > 0) {
+ result = 0;
+ }
return result;
}
cop->temp_cursor = cop->cursor;
}
-void PrimaryLogPG::finish_copyfrom(OpContext *ctx)
+void PrimaryLogPG::finish_copyfrom(CopyFromCallback *cb)
{
+ OpContext *ctx = cb->ctx;
dout(20) << "finish_copyfrom on " << ctx->obs->oi.soid << dendl;
- ObjectState& obs = ctx->new_obs;
- CopyFromCallback *cb = static_cast<CopyFromCallback*>(ctx->copy_cb);
+ ObjectState& obs = ctx->new_obs;
if (obs.exists) {
dout(20) << __func__ << ": exists, removing" << dendl;
ctx->op_t->remove(obs.oi.soid);
last_update_applied = applied_version;
if (is_primary()) {
if (scrubber.active) {
- if (last_update_applied == scrubber.subset_last_update) {
+ if (last_update_applied >= scrubber.subset_last_update) {
if (ops_blocked_by_scrub()) {
requeue_scrub(true);
} else {
}
} else {
if (scrubber.active_rep_scrub) {
- if (last_update_applied == static_cast<const MOSDRepScrub*>(
+ if (last_update_applied >= static_cast<const MOSDRepScrub*>(
scrubber.active_rep_scrub->get_req())->scrub_to) {
osd->enqueue_back(
info.pgid,
PrimaryLogPG::OpContextUPtr PrimaryLogPG::simple_opc_create(ObjectContextRef obc)
{
dout(20) << __func__ << " " << obc->obs.oi.soid << dendl;
- vector<OSDOp> ops;
ceph_tid_t rep_tid = osd->get_tid();
osd_reqid_t reqid(osd->get_cluster_msgr_name(), 0, rep_tid);
- OpContextUPtr ctx(new OpContext(OpRequestRef(), reqid, ops, obc, this));
+ OpContextUPtr ctx(new OpContext(OpRequestRef(), reqid, nullptr, obc, this));
ctx->op_t.reset(new PGTransaction());
ctx->mtime = ceph_clock_now();
return ctx;
ObjectContextRef obc = get_object_context(soid, false);
if (!obc || !obc->obs.exists) {
- dout(20) << __func__ << " missing clone " << soid << dendl;
if (pmissing)
*pmissing = soid;
put_snapset_context(ssc);
- return -ENOENT;
+ if (is_degraded_or_backfilling_object(soid)) {
+ dout(20) << __func__ << " clone is degraded or backfilling " << soid << dendl;
+ return -EAGAIN;
+ } else {
+ dout(20) << __func__ << " missing clone " << soid << dendl;
+ return -ENOENT;
+ }
}
if (!obc->ssc) {
return PULL_NONE;
}
+ if (missing_loc.is_deleted(soid)) {
+ start_recovery_op(soid);
+ assert(!recovering.count(soid));
+ recovering.insert(make_pair(soid, ObjectContextRef()));
+ epoch_t cur_epoch = get_osdmap()->get_epoch();
+ remove_missing_object(soid, v, new FunctionContext(
+ [=](int) {
+ lock();
+ if (!pg_has_reset_since(cur_epoch)) {
+ bool object_missing = false;
+ for (const auto& shard : actingbackfill) {
+ if (shard == pg_whoami)
+ continue;
+ if (peer_missing[shard].is_missing(soid)) {
+ dout(20) << __func__ << ": soid " << soid << " needs to be deleted from replica " << shard << dendl;
+ object_missing = true;
+ break;
+ }
+ }
+ if (!object_missing) {
+ object_stat_sum_t stat_diff;
+ stat_diff.num_objects_recovered = 1;
+ on_global_recover(soid, stat_diff, true);
+ } else {
+ auto recovery_handle = pgbackend->open_recovery_op();
+ pgbackend->recover_delete_object(soid, v, recovery_handle);
+ pgbackend->run_recovery_op(recovery_handle, priority);
+ }
+ }
+ unlock();
+ }));
+ return PULL_YES;
+ }
+
// is this a snapped object? if so, consult the snapset.. we may not need the entire object!
ObjectContextRef obc;
ObjectContextRef head_obc;
osd->send_message_osd_cluster(peer.osd, subop, get_osdmap()->get_epoch());
}
+void PrimaryLogPG::remove_missing_object(const hobject_t &soid, // queues a local removal of soid, then a second transaction that ends in on_complete
+ eversion_t v, Context *on_complete)
+{
+ dout(20) << __func__ << " " << soid << " " << v << dendl;
+ assert(on_complete != nullptr); // a completion callback is mandatory
+ // delete locally
+ ObjectStore::Transaction t;
+ remove_snap_mapped_object(t, soid);
+
+ ObjectRecoveryInfo recovery_info;
+ recovery_info.soid = soid;
+ recovery_info.version = v;
+
+ epoch_t cur_epoch = get_osdmap()->get_epoch(); // captured so the callback can detect a PG reset
+ t.register_on_complete(new FunctionContext(
+ [=](int) {
+ lock();
+ if (!pg_has_reset_since(cur_epoch)) {
+ ObjectStore::Transaction t2;
+ on_local_recover(soid, recovery_info, ObjectContextRef(), true, &t2);
+ t2.register_on_complete(on_complete); // on_complete is attached to the second transaction
+ int r = osd->store->queue_transaction(osr.get(), std::move(t2), nullptr);
+ assert(r == 0);
+ unlock();
+ } else {
+ unlock();
+ on_complete->complete(-EAGAIN); // PG reset since cur_epoch: report -EAGAIN, after dropping the lock
+ }
+ }));
+ int r = osd->store->queue_transaction(osr.get(), std::move(t), nullptr);
+ assert(r == 0);
+}
void PrimaryLogPG::finish_degraded_object(const hobject_t& oid)
{
dout(10) << "finish_degraded_object " << oid << dendl;
- ObjectContextRef obc(object_contexts.lookup(oid));
if (callbacks_for_degraded_object.count(oid)) {
list<Context*> contexts;
contexts.swap(callbacks_for_degraded_object[oid]);
void PrimaryLogPG::_applied_recovered_object(ObjectContextRef obc)
{
lock();
- dout(10) << "_applied_recovered_object " << *obc << dendl;
-
+ dout(20) << __func__ << dendl;
+ if (obc) {
+ dout(20) << "obc = " << *obc << dendl;
+ }
assert(active_pushes >= 1);
--active_pushes;
requeue_scrub(false);
}
}
-
unlock();
}
void PrimaryLogPG::_applied_recovered_object_replica()
{
lock();
- dout(10) << "_applied_recovered_object_replica" << dendl;
-
+ dout(20) << __func__ << dendl;
assert(active_pushes >= 1);
--active_pushes;
PGQueueable(scrubber.active_rep_scrub, get_osdmap()->get_epoch()));
scrubber.active_rep_scrub = OpRequestRef();
}
-
unlock();
}
std::move(manager),
boost::optional<std::function<void(void)> >(
[this, oids, con, num_unfound, tid]() {
- for (auto oid: oids)
+ if (perform_deletes_during_peering()) {
+ for (auto oid : oids) {
+ // clear old locations - merge_new_log_entries will have
+ // handled rebuilding missing_loc for each of these
+ // objects if we have the RECOVERY_DELETES flag
missing_loc.recovered(oid);
+ }
+ }
+
for (auto& p : waiting_for_unreadable_object) {
release_backoffs(p.first);
}
on_shutdown();
}
+void PrimaryLogPG::clear_async_reads() // logs and closes the op context of every entry in in_progress_async_reads
+{
+ dout(10) << __func__ << dendl;
+ for(auto& i : in_progress_async_reads) { // i.first is logged as the OpRequestRef, i.second as the OpContext
+ dout(10) << "clear ctx: "
+ << "OpRequestRef " << i.first
+ << " OpContext " << i.second
+ << dendl;
+ close_op_ctx(i.second); // NOTE(review): the container itself is not emptied here -- confirm that is intended
+ }
+}
+
void PrimaryLogPG::on_shutdown()
{
dout(10) << "on_shutdown" << dendl;
context_registry_on_change();
object_contexts.clear();
+ clear_async_reads();
+
osd->remote_reserver.cancel_reservation(info.pgid);
osd->local_reserver.cancel_reservation(info.pgid);
void PrimaryLogPG::_on_new_interval() // brings the missing set's may_include_deletes in line with the osdmap RECOVERY_DELETES flag
{
+ dout(20) << __func__ << "checking missing set deletes flag. missing = " << pg_log.get_missing() << dendl;
+ if (!pg_log.get_missing().may_include_deletes && // rebuild only when the osdmap flag is set but the missing set lacks it
+ get_osdmap()->test_flag(CEPH_OSDMAP_RECOVERY_DELETES)) {
+ pg_log.rebuild_missing_set_with_deletes(osd->store, coll, info);
+ }
+ assert(pg_log.get_missing().may_include_deletes == get_osdmap()->test_flag(CEPH_OSDMAP_RECOVERY_DELETES)); // the two must agree from here on
}
void PrimaryLogPG::on_change(ObjectStore::Transaction *t)
return false;
}
- const pg_missing_t &missing = pg_log.get_missing();
+ const auto &missing = pg_log.get_missing();
unsigned int num_missing = missing.num_missing();
uint64_t num_unfound = get_num_unfound();
if (missing.num_missing() > 0) {
// this shouldn't happen!
- osd->clog->error() << info.pgid << " recovery ending with " << missing.num_missing()
- << ": " << missing.get_items();
+ osd->clog->error() << info.pgid << " Unexpected Error: recovery ending with "
+ << missing.num_missing() << ": " << missing.get_items();
return work_in_progress;
}
if (needs_recovery()) {
// this shouldn't happen!
// We already checked num_missing() so we must have missing replicas
- osd->clog->error() << info.pgid << " recovery ending with missing replicas";
+ osd->clog->error() << info.pgid
+ << " Unexpected Error: recovery ending with missing replicas";
return work_in_progress;
}
if (state_test(PG_STATE_RECOVERING)) {
state_clear(PG_STATE_RECOVERING);
+ state_clear(PG_STATE_FORCED_RECOVERY);
if (needs_backfill()) {
dout(10) << "recovery done, queuing backfill" << dendl;
queue_peering_event(
} else {
dout(10) << "recovery done, no backfill" << dendl;
eio_errors_to_process = false;
+ state_clear(PG_STATE_FORCED_BACKFILL);
queue_peering_event(
CephPeeringEvtRef(
std::make_shared<CephPeeringEvt>(
}
} else { // backfilling
state_clear(PG_STATE_BACKFILL);
+ state_clear(PG_STATE_FORCED_BACKFILL);
+ state_clear(PG_STATE_FORCED_RECOVERY);
dout(10) << "recovery done, backfill done" << dendl;
eio_errors_to_process = false;
queue_peering_event(
{
assert(is_primary());
- const pg_missing_t &missing = pg_log.get_missing();
+ const auto &missing = pg_log.get_missing();
dout(10) << "recover_primary recovering " << recovering.size()
<< " in pg" << dendl;
if (pg_log.get_log().objects.count(p->second)) {
latest = pg_log.get_log().objects.find(p->second)->second;
- assert(latest->is_update());
+ assert(latest->is_update() || latest->is_delete());
soid = latest->soid;
} else {
latest = 0;
return uhoh;
}
+int PrimaryLogPG::prep_object_replica_deletes(
+ const hobject_t& soid, eversion_t v,
+ PGBackend::RecoveryHandle *h)
+{
+ assert(is_primary());
+ dout(10) << __func__ << ": on " << soid << dendl;
+
+ start_recovery_op(soid);
+ assert(!recovering.count(soid));
+ recovering.insert(make_pair(soid, ObjectContextRef()));
+
+ pgbackend->recover_delete_object(soid, v, h);
+ return 1;
+}
+
int PrimaryLogPG::prep_object_replica_pushes(
const hobject_t& soid, eversion_t v,
PGBackend::RecoveryHandle *h)
continue;
}
+ if (missing_loc.is_deleted(soid)) {
+ dout(10) << __func__ << ": " << soid << " is a delete, removing" << dendl;
+ map<hobject_t,pg_missing_item>::const_iterator r = m.get_items().find(soid);
+ started += prep_object_replica_deletes(soid, r->second.need, h);
+ continue;
+ }
+
if (soid.is_snap() && pg_log.get_missing().is_missing(soid.get_head())) {
dout(10) << __func__ << ": " << soid.get_head()
<< " still missing on primary" << dendl;
for (unsigned int i = 0 ; i < peers.size(); ++i) {
map<pg_shard_t, pg_missing_t>::iterator bpm = peer_missing.find(peers[i]);
assert(bpm != peer_missing.end());
- bpm->second.add(oid, eversion_t(), eversion_t());
+ bpm->second.add(oid, eversion_t(), eversion_t(), false);
}
assert(!recovering.count(oid));
continue;
did.insert(p->soid);
- if (p->is_delete()) {
+ if (p->is_delete() && !is_missing_object(p->soid)) {
dout(10) << " checking " << p->soid
<< " at " << p->version << dendl;
struct stat st;
if (!allow_incomplete_clones) {
next_clone.snap = **curclone;
clog->error() << mode << " " << pgid << " " << head.get()
- << " expected clone " << next_clone;
+ << " expected clone " << next_clone << " " << missing
+ << " missing";
++scrubber.shallow_errors;
e.set_clone_missing(next_clone.snap);
}
ObjectContextRef obc = get_object_context(p->first, false);
if (!obc) {
osd->clog->error() << info.pgid << " " << mode
- << " cannot get object context for "
+ << " cannot get object context for object "
<< p->first;
continue;
} else if (obc->obs.oi.soid != p->first) {
ObjectContextRef obc = get_object_context(p.first, true);
if (!obc) {
osd->clog->error() << info.pgid << " " << mode
- << " cannot get object context for "
+ << " cannot get object context for object "
<< p.first;
continue;
} else if (obc->obs.oi.soid != p.first) {
typedef boost::tuple<int, CopyResults*> CopyCallbackResults;
friend class CopyFromCallback;
+ friend class CopyFromFinisher;
friend class PromoteCallback;
struct ProxyReadOp {
const hobject_t &oid,
const ObjectRecoveryInfo &recovery_info,
ObjectContextRef obc,
+ bool is_delete,
ObjectStore::Transaction *t
) override;
void on_peer_recover(
const hobject_t oid) override;
void on_global_recover(
const hobject_t &oid,
- const object_stat_sum_t &stat_diff) override;
+ const object_stat_sum_t &stat_diff,
+ bool is_delete) override;
void failed_push(const list<pg_shard_t> &from, const hobject_t &soid) override;
void primary_failed(const hobject_t &soid) override;
bool primary_error(const hobject_t& soid, eversion_t v) override;
const hobject_t &soid,
const object_stat_sum_t &delta_stats) override;
void on_primary_error(const hobject_t &oid, eversion_t v) override;
+ void remove_missing_object(const hobject_t &oid,
+ eversion_t v,
+ Context *on_complete) override;
template<class T> class BlessedGenContext;
class BlessedContext;
ceph_tid_t get_tid() override { return osd->get_tid(); }
LogClientTemp clog_error() override { return osd->clog->error(); }
+ LogClientTemp clog_warn() override { return osd->clog->warn(); }
struct watch_disconnect_t {
uint64_t cookie;
ObjectContextRef obc,
const list<watch_disconnect_t> &to_disconnect);
+ struct OpFinisher {
+ virtual ~OpFinisher() {
+ }
+
+ virtual int execute() = 0;
+ };
+
/*
* Capture all object state associated with an in-progress read or write.
*/
struct OpContext {
OpRequestRef op;
osd_reqid_t reqid;
- vector<OSDOp> &ops;
+ vector<OSDOp> *ops;
const ObjectState *obs; // Old objectstate
const SnapSet *snapset; // Old snapset
ObjectContextRef clone_obc; // if we created a clone
ObjectContextRef snapset_obc; // if we created/deleted a snapdir
- int data_off; // FIXME: we may want to kill this msgr hint off at some point!
+ // FIXME: we may want to kill this msgr hint off at some point!
+ boost::optional<int> data_off = boost::none;
MOSDOpReply *reply;
mempool::osd_pglog::vector<pair<osd_reqid_t, version_t> > extra_reqids;
- CopyFromCallback *copy_cb;
-
hobject_t new_temp_oid, discard_temp_oid; ///< temp objects we should start/stop tracking
list<std::function<void()>> on_applied;
// pending async reads <off, len, op_flags> -> <outbl, outr>
list<pair<boost::tuple<uint64_t, uint64_t, unsigned>,
pair<bufferlist*, Context*> > > pending_async_reads;
- int async_read_result;
int inflightreads;
friend struct OnReadComplete;
void start_async_reads(PrimaryLogPG *pg);
ObjectContext::RWState::State lock_type;
ObcLockManager lock_manager;
+ std::map<int, std::unique_ptr<OpFinisher>> op_finishers;
+
OpContext(const OpContext& other);
const OpContext& operator=(const OpContext& other);
- OpContext(OpRequestRef _op, osd_reqid_t _reqid, vector<OSDOp>& _ops,
+ OpContext(OpRequestRef _op, osd_reqid_t _reqid, vector<OSDOp>* _ops,
ObjectContextRef& obc,
PrimaryLogPG *_pg) :
op(_op), reqid(_reqid), ops(_ops),
bytes_written(0), bytes_read(0), user_at_version(0),
current_osd_subop_num(0),
obc(obc),
- data_off(0), reply(NULL), pg(_pg),
+ reply(NULL), pg(_pg),
num_read(0),
num_write(0),
- copy_cb(NULL),
sent_reply(false),
- async_read_result(0),
inflightreads(0),
lock_type(ObjectContext::RWState::RWNONE) {
if (obc->ssc) {
}
}
OpContext(OpRequestRef _op, osd_reqid_t _reqid,
- vector<OSDOp>& _ops, PrimaryLogPG *_pg) :
+ vector<OSDOp>* _ops, PrimaryLogPG *_pg) :
op(_op), reqid(_reqid), ops(_ops), obs(NULL), snapset(0),
modify(false), user_modify(false), undirty(false), cache_evict(false),
ignore_cache(false), ignore_log_op_stats(false), update_log_only(false),
bytes_written(0), bytes_read(0), user_at_version(0),
current_osd_subop_num(0),
- data_off(0), reply(NULL), pg(_pg),
+ reply(NULL), pg(_pg),
num_read(0),
num_write(0),
- copy_cb(NULL),
- async_read_result(0),
inflightreads(0),
lock_type(ObjectContext::RWState::RWNONE) {}
void reset_obs(ObjectContextRef obc) {
*
* @param ctx [in] ctx to clean up
*/
- void close_op_ctx(OpContext *ctx) {
- release_object_locks(ctx->lock_manager);
- ctx->op_t.reset();
- for (auto p = ctx->on_finish.begin();
- p != ctx->on_finish.end();
- ctx->on_finish.erase(p++)) {
- (*p)();
- }
- delete ctx;
- }
+ void close_op_ctx(OpContext *ctx);
/**
* Releases locks
int prep_object_replica_pushes(const hobject_t& soid, eversion_t v,
PGBackend::RecoveryHandle *h);
+ int prep_object_replica_deletes(const hobject_t& soid, eversion_t v,
+ PGBackend::RecoveryHandle *h);
void finish_degraded_object(const hobject_t& oid);
// -- copyfrom --
map<hobject_t, CopyOpRef> copy_ops;
- int fill_in_copy_get(
- OpContext *ctx,
- bufferlist::iterator& bp,
- OSDOp& op,
- ObjectContextRef& obc);
+ int do_copy_get(OpContext *ctx, bufferlist::iterator& bp, OSDOp& op,
+ ObjectContextRef& obc);
+ int finish_copy_get();
+
void fill_in_copy_get_noent(OpRequestRef& op, hobject_t oid,
OSDOp& osd_op);
return size;
}
void _copy_some(ObjectContextRef obc, CopyOpRef cop);
- void finish_copyfrom(OpContext *ctx);
+ void finish_copyfrom(CopyFromCallback *cb);
void finish_promote(int r, CopyResults *results, ObjectContextRef obc);
void cancel_copy(CopyOpRef cop, bool requeue);
void cancel_copy_ops(bool requeue);
int do_xattr_cmp_str(int op, string& v1s, bufferlist& xattr);
// -- checksum --
- int do_checksum(OpContext *ctx, OSDOp& osd_op, bufferlist::iterator *bl_it,
- bool *async_read);
+ int do_checksum(OpContext *ctx, OSDOp& osd_op, bufferlist::iterator *bl_it);
int finish_checksum(OSDOp& osd_op, Checksummer::CSumType csum_type,
bufferlist::iterator *init_value_bl_it,
const bufferlist &read_bl);
friend class C_ChecksumRead;
int do_extent_cmp(OpContext *ctx, OSDOp& osd_op);
+ int finish_extent_cmp(OSDOp& osd_op, const bufferlist &read_bl);
+
+ friend class C_ExtentCmpRead;
+
+ int do_read(OpContext *ctx, OSDOp& osd_op);
+ int do_sparse_read(OpContext *ctx, OSDOp& osd_op);
int do_writesame(OpContext *ctx, OSDOp& osd_op);
bool pgls_filter(PGLSFilter *filter, hobject_t& sobj, bufferlist& outdata);
void on_role_change() override;
void on_pool_change() override;
void _on_new_interval() override;
+ void clear_async_reads();
void on_change(ObjectStore::Transaction *t) override;
void on_activate() override;
void on_flushed() override;
RPGHandle *h = static_cast<RPGHandle *>(_h);
send_pushes(priority, h->pushes);
send_pulls(priority, h->pulls);
+ send_recovery_deletes(priority, h->deletes);
delete h;
}
}
}
-bool ReplicatedBackend::handle_message(
+bool ReplicatedBackend::_handle_message(
OpRequestRef op
)
{
bc->get_parent()->primary_error(i.hoid, obc->obs.oi.version);
} else if (!started) {
bc->get_parent()->on_global_recover(
- i.hoid, i.stat);
+ i.hoid, i.stat, false);
}
handle.reset_tp_timeout();
}
clear_pull_from(piter);
to_continue->push_back({hoid, pi.stat});
get_parent()->on_local_recover(
- hoid, pi.recovery_info, pi.obc, t);
+ hoid, pi.recovery_info, pi.obc, false, t);
return false;
} else {
response->soid = pop.soid;
pop.recovery_info.soid,
pop.recovery_info,
ObjectContextRef(), // ok, is replica
+ false,
t);
}
if (pushing[soid].empty()) {
if (!error)
- get_parent()->on_global_recover(soid, stat);
+ get_parent()->on_global_recover(soid, stat, false);
else
get_parent()->on_primary_error(soid, v);
-
pushing.erase(soid);
} else {
// This looks weird, but we erased the current peer and need to remember
bool can_handle_while_inactive(OpRequestRef op) override;
/// @see PGBackend::handle_message
- bool handle_message(
+ bool _handle_message(
OpRequestRef op
) override;
oss << "recovery_toofull+";
if (state & PG_STATE_RECOVERING)
oss << "recovering+";
+ if (state & PG_STATE_FORCED_RECOVERY)
+ oss << "forced_recovery+";
if (state & PG_STATE_DOWN)
oss << "down+";
if (state & PG_STATE_UNDERSIZED)
oss << "backfill_wait+";
if (state & PG_STATE_BACKFILL)
oss << "backfilling+";
+ if (state & PG_STATE_FORCED_BACKFILL)
+ oss << "forced_backfill+";
if (state & PG_STATE_BACKFILL_TOOFULL)
oss << "backfill_toofull+";
if (state & PG_STATE_INCOMPLETE)
type = PG_STATE_REPAIR;
else if (state == "recovering")
type = PG_STATE_RECOVERING;
+ else if (state == "forced_recovery")
+ type = PG_STATE_FORCED_RECOVERY;
else if (state == "backfill_wait")
type = PG_STATE_BACKFILL_WAIT;
else if (state == "incomplete")
type = PG_STATE_DEEP_SCRUB;
else if (state == "backfill")
type = PG_STATE_BACKFILL;
+ else if (state == "forced_backfill")
+ type = PG_STATE_FORCED_BACKFILL;
else if (state == "backfill_toofull")
type = PG_STATE_BACKFILL_TOOFULL;
else if (state == "recovery_wait")
// -- pg_pool_t --
+const char *pg_pool_t::APPLICATION_NAME_CEPHFS("cephfs");
+const char *pg_pool_t::APPLICATION_NAME_RBD("rbd");
+const char *pg_pool_t::APPLICATION_NAME_RGW("rgw");
+
void pg_pool_t::dump(Formatter *f) const
{
f->dump_unsigned("flags", get_flags());
f->open_object_section("options");
opts.dump(f);
f->close_section(); // options
+ f->open_object_section("application_metadata");
+ for (auto &app_pair : application_metadata) {
+ f->open_object_section(app_pair.first.c_str());
+ for (auto &kv_pair : app_pair.second) {
+ f->dump_string(kv_pair.first.c_str(), kv_pair.second);
+ }
+ f->close_section(); // application
+ }
+ f->close_section(); // application_metadata
}
void pg_pool_t::convert_to_pg_shards(const vector<int> &from, set<pg_shard_t>* to) const {
return;
}
- uint8_t v = 25;
+ uint8_t v = 26;
if (!(features & CEPH_FEATURE_NEW_OSDOP_ENCODING)) {
// this was the first post-hammer thing we added; if it's missing, encode
// like hammer.
v = 21;
}
- if ((features &
- (CEPH_FEATURE_RESEND_ON_SPLIT|CEPH_FEATURE_SERVER_JEWEL)) !=
- (CEPH_FEATURE_RESEND_ON_SPLIT|CEPH_FEATURE_SERVER_JEWEL)) {
+ if (!HAVE_FEATURE(features, SERVER_LUMINOUS)) {
v = 24;
}
if (v >= 25) {
::encode(last_force_op_resend, bl);
}
+ if (v >= 26) {
+ ::encode(application_metadata, bl);
+ }
ENCODE_FINISH(bl);
}
void pg_pool_t::decode(bufferlist::iterator& bl)
{
- DECODE_START_LEGACY_COMPAT_LEN(25, 5, 5, bl);
+ DECODE_START_LEGACY_COMPAT_LEN(26, 5, 5, bl);
::decode(type, bl);
::decode(size, bl);
::decode(crush_rule, bl);
} else {
last_force_op_resend = last_force_op_resend_preluminous;
}
+ if (struct_v >= 26) {
+ ::decode(application_metadata, bl);
+ }
DECODE_FINISH(bl);
calc_pg_masks();
calc_grade_table();
a.erasure_code_profile = "profile in osdmap";
a.expected_num_objects = 123456;
a.fast_read = false;
+ a.application_metadata = {{"rbd", {{"key", "value"}}}};
o.push_back(new pg_pool_t(a));
}
if (p.fast_read)
out << " fast_read " << p.fast_read;
out << p.opts;
+ if (!p.application_metadata.empty()) {
+ out << " application ";
+ for (auto it = p.application_metadata.begin();
+ it != p.application_metadata.end(); ++it) {
+ if (it != p.application_metadata.begin())
+ out << ",";
+ out << it->first;
+ }
+ }
return out;
}
unsigned new_pg_num,
bool old_sort_bitwise,
bool new_sort_bitwise,
+ bool old_recovery_deletes,
+ bool new_recovery_deletes,
pg_t pgid) {
return old_acting_primary != new_acting_primary ||
new_acting != old_acting ||
old_min_size != new_min_size ||
old_size != new_size ||
pgid.is_split(old_pg_num, new_pg_num, 0) ||
- old_sort_bitwise != new_sort_bitwise;
+ old_sort_bitwise != new_sort_bitwise ||
+ old_recovery_deletes != new_recovery_deletes;
}
bool PastIntervals::is_new_interval(
osdmap->get_pg_num(pgid.pool()),
lastmap->test_flag(CEPH_OSDMAP_SORTBITWISE),
osdmap->test_flag(CEPH_OSDMAP_SORTBITWISE),
+ lastmap->test_flag(CEPH_OSDMAP_RECOVERY_DELETES),
+ osdmap->test_flag(CEPH_OSDMAP_RECOVERY_DELETES),
pgid);
}
return out;
}
+// -- pg_log_dup_t --
+
+string pg_log_dup_t::get_key_name() const
+{
+ return "dup_" + version.get_key_name();
+}
+
+void pg_log_dup_t::encode(bufferlist &bl) const
+{
+ ENCODE_START(1, 1, bl);
+ ::encode(reqid, bl);
+ ::encode(version, bl);
+ ::encode(user_version, bl);
+ ::encode(return_code, bl);
+ ENCODE_FINISH(bl);
+}
+
+void pg_log_dup_t::decode(bufferlist::iterator &bl)
+{
+ DECODE_START(1, bl);
+ ::decode(reqid, bl);
+ ::decode(version, bl);
+ ::decode(user_version, bl);
+ ::decode(return_code, bl);
+ DECODE_FINISH(bl);
+}
+
+void pg_log_dup_t::dump(Formatter *f) const
+{
+ f->dump_stream("reqid") << reqid;
+ f->dump_stream("version") << version;
+ f->dump_stream("user_version") << user_version;
+ f->dump_stream("return_code") << return_code;
+}
+
+void pg_log_dup_t::generate_test_instances(list<pg_log_dup_t*>& o)
+{
+ o.push_back(new pg_log_dup_t());
+ o.push_back(new pg_log_dup_t(eversion_t(1,2),
+ 1,
+ osd_reqid_t(entity_name_t::CLIENT(777), 8, 999),
+ 0));
+ o.push_back(new pg_log_dup_t(eversion_t(1,2),
+ 2,
+ osd_reqid_t(entity_name_t::CLIENT(777), 8, 999),
+ -ENOENT));
+}
+
+
+std::ostream& operator<<(std::ostream& out, const pg_log_dup_t& e) {
+ return out << "log_dup(reqid=" << e.reqid <<
+ " v=" << e.version << " uv=" << e.user_version <<
+ " rc=" << e.return_code << ")";
+}
+
// -- pg_log_t --
void pg_log_t::encode(bufferlist& bl) const
{
- ENCODE_START(6, 3, bl);
+ ENCODE_START(7, 3, bl);
::encode(head, bl);
::encode(tail, bl);
::encode(log, bl);
::encode(can_rollback_to, bl);
::encode(rollback_info_trimmed_to, bl);
+ ::encode(dups, bl);
ENCODE_FINISH(bl);
}
void pg_log_t::decode(bufferlist::iterator &bl, int64_t pool)
{
- DECODE_START_LEGACY_COMPAT_LEN(6, 3, 3, bl);
+ DECODE_START_LEGACY_COMPAT_LEN(7, 3, 3, bl);
::decode(head, bl);
::decode(tail, bl);
if (struct_v < 2) {
::decode(rollback_info_trimmed_to, bl);
else
rollback_info_trimmed_to = tail;
+
+ if (struct_v >= 7)
+ ::decode(dups, bl);
+
DECODE_FINISH(bl);
// handle hobject_t format change
f->close_section();
}
f->close_section();
+ f->open_array_section("dups");
+ for (const auto& entry : dups) {
+ f->open_object_section("entry");
+ entry.dump(f);
+ f->close_section();
+ }
+ f->close_section();
}
void pg_log_t::generate_test_instances(list<pg_log_t*>& o)
}
}
-ostream& pg_log_t::print(ostream& out) const
+ostream& pg_log_t::print(ostream& out) const
{
out << *this << std::endl;
for (list<pg_log_entry_t>::const_iterator p = log.begin();
p != log.end();
- ++p)
+ ++p)
out << *p << std::endl;
+ for (const auto& entry : dups) {
+ out << " dup entry: " << entry << std::endl;
+ }
return out;
}
out << i.need;
if (i.have != eversion_t())
out << "(" << i.have << ")";
+ out << " flags = " << i.flag_str();
return out;
}
#define CEPH_OSD_FEATURE_INCOMPAT_PGMETA CompatSet::Feature(13, "pg meta object")
#define CEPH_OSD_FEATURE_INCOMPAT_MISSING CompatSet::Feature(14, "explicit missing set")
#define CEPH_OSD_FEATURE_INCOMPAT_FASTINFO CompatSet::Feature(15, "fastinfo pg attr")
+#define CEPH_OSD_FEATURE_INCOMPAT_RECOVERY_DELETES CompatSet::Feature(16, "deletes in missing set")
/// min recovery priority for MBackfillReserve
/// base backfill priority for MBackfillReserve (inactive PG)
#define OSD_BACKFILL_INACTIVE_PRIORITY_BASE 220
-/// max recovery priority for MBackfillReserve
-#define OSD_RECOVERY_PRIORITY_MAX 255
+/// max manually/automatically set recovery priority for MBackfillReserve
+#define OSD_RECOVERY_PRIORITY_MAX 254
+
+/// max recovery priority for MBackfillReserve, only when forced manually
+#define OSD_RECOVERY_PRIORITY_FORCED 255
typedef hobject_t collection_list_handle_t;
*/
struct osd_reqid_t {
entity_name_t name; // who
- ceph_tid_t tid;
+ ceph_tid_t tid;
int32_t inc; // incarnation
osd_reqid_t()
- : tid(0), inc(0) {}
+ : tid(0), inc(0)
+ {}
+ osd_reqid_t(const osd_reqid_t& other)
+ : name(other.name), tid(other.tid), inc(other.inc)
+ {}
osd_reqid_t(const entity_name_t& a, int i, ceph_tid_t t)
- : name(a), tid(t), inc(i) {}
+ : name(a), tid(t), inc(i)
+ {}
DENC(osd_reqid_t, v, p) {
DENC_START(2, 2, p);
eversion_t(epoch_t e, version_t v) : version(v), epoch(e), __pad(0) {}
// cppcheck-suppress noExplicitConstructor
- eversion_t(const ceph_eversion& ce) :
+ eversion_t(const ceph_eversion& ce) :
version(ce.version),
epoch(ce.epoch),
__pad(0) { }
#define PG_STATE_SNAPTRIM_WAIT (1<<27) // queued to trim snaps
#define PG_STATE_RECOVERY_TOOFULL (1<<28) // recovery can't proceed: too full
#define PG_STATE_SNAPTRIM_ERROR (1<<29) // error stopped trimming snaps
+#define PG_STATE_FORCED_RECOVERY (1<<30) // force recovery of this pg before any other
+#define PG_STATE_FORCED_BACKFILL (1<<31) // force backfill of this pg before any other
std::string pg_state_string(int state);
std::string pg_vector_string(const vector<int32_t> &a);
* pg_pool
*/
struct pg_pool_t {
+ static const char *APPLICATION_NAME_CEPHFS;
+ static const char *APPLICATION_NAME_RBD;
+ static const char *APPLICATION_NAME_RGW;
+
enum {
TYPE_REPLICATED = 1, // replication
//TYPE_RAID4 = 2, // raid4 (never implemented)
pool_opts_t opts; ///< options
+ /// application -> key/value metadata
+ map<string, std::map<string, string>> application_metadata;
+
private:
vector<uint32_t> grade_table;
unsigned new_pg_num,
bool old_sort_bitwise,
bool new_sort_bitwise,
+ bool old_recovery_deletes,
+ bool new_recovery_deletes,
pg_t pgid
);
PastIntervals *past_intervals, ///< [out] intervals
ostream *out = 0 ///< [out] debug ostream
);
+
friend ostream& operator<<(ostream& out, const PastIntervals &i);
template <typename F>
ostream& operator<<(ostream& out, const pg_log_entry_t& e);
+struct pg_log_dup_t {
+ osd_reqid_t reqid; // caller+tid to uniquely identify request
+ eversion_t version;
+ version_t user_version; // the user version for this entry
+ int32_t return_code; // only stored for ERRORs for dup detection
+
+ pg_log_dup_t()
+ : user_version(0), return_code(0)
+ {}
+ explicit pg_log_dup_t(const pg_log_entry_t& entry)
+ : reqid(entry.reqid), version(entry.version),
+ user_version(entry.user_version), return_code(entry.return_code)
+ {}
+ pg_log_dup_t(const eversion_t& v, version_t uv,
+ const osd_reqid_t& rid, int return_code)
+ : reqid(rid), version(v), user_version(uv),
+ return_code(return_code)
+ {}
+
+ string get_key_name() const;
+ void encode(bufferlist &bl) const;
+ void decode(bufferlist::iterator &bl);
+ void dump(Formatter *f) const;
+ static void generate_test_instances(list<pg_log_dup_t*>& o);
+ friend std::ostream& operator<<(std::ostream& out, const pg_log_dup_t& e);
+};
+WRITE_CLASS_ENCODER(pg_log_dup_t)
+
+std::ostream& operator<<(std::ostream& out, const pg_log_dup_t& e);
/**
* pg_log_t - incremental log of recent pg changes.
eversion_t rollback_info_trimmed_to;
public:
- mempool::osd_pglog::list<pg_log_entry_t> log; // the actual log.
-
+ // the actual log
+ mempool::osd_pglog::list<pg_log_entry_t> log;
+
+ // entries just for dup op detection ordered oldest to newest
+ mempool::osd_pglog::list<pg_log_dup_t> dups;
+
pg_log_t() = default;
pg_log_t(const eversion_t &last_update,
const eversion_t &log_tail,
const eversion_t &can_rollback_to,
const eversion_t &rollback_info_trimmed_to,
- mempool::osd_pglog::list<pg_log_entry_t> &&entries)
+ mempool::osd_pglog::list<pg_log_entry_t> &&entries,
+ mempool::osd_pglog::list<pg_log_dup_t> &&dup_entries)
: head(last_update), tail(log_tail), can_rollback_to(can_rollback_to),
rollback_info_trimmed_to(rollback_info_trimmed_to),
- log(std::move(entries)) {}
+ log(std::move(entries)), dups(std::move(dup_entries)) {}
pg_log_t(const eversion_t &last_update,
const eversion_t &log_tail,
const eversion_t &can_rollback_to,
const eversion_t &rollback_info_trimmed_to,
- const std::list<pg_log_entry_t> &entries)
+ const std::list<pg_log_entry_t> &entries,
+ const std::list<pg_log_dup_t> &dup_entries)
: head(last_update), tail(log_tail), can_rollback_to(can_rollback_to),
rollback_info_trimmed_to(rollback_info_trimmed_to) {
for (auto &&entry: entries) {
log.push_back(entry);
}
+ for (auto &&entry: dup_entries) {
+ dups.push_back(entry);
+ }
}
void clear() {
eversion_t z;
rollback_info_trimmed_to = can_rollback_to = head = tail = z;
log.clear();
+ dups.clear();
}
eversion_t get_rollback_info_trimmed_to() const {
oldlog.erase(i++);
}
+ // osd_reqid is unique, so it doesn't matter if there are extra
+ // dup entries in each pg. To avoid storing oid with the dup
+ // entries, just copy the whole list.
+ auto childdups(dups);
+
return pg_log_t(
head,
tail,
can_rollback_to,
rollback_info_trimmed_to,
- std::move(childlog));
- }
+ std::move(childlog),
+ std::move(childdups));
+ }
mempool::osd_pglog::list<pg_log_entry_t> rewind_from_head(eversion_t newhead) {
assert(newhead >= tail);
};
WRITE_CLASS_ENCODER(pg_log_t)
-inline ostream& operator<<(ostream& out, const pg_log_t& log)
+inline ostream& operator<<(ostream& out, const pg_log_t& log)
{
out << "log((" << log.tail << "," << log.head << "], crt="
<< log.get_can_rollback_to() << ")";
*/
struct pg_missing_item {
eversion_t need, have;
- pg_missing_item() {}
- explicit pg_missing_item(eversion_t n) : need(n) {} // have no old version
- pg_missing_item(eversion_t n, eversion_t h) : need(n), have(h) {}
+ enum missing_flags_t {
+ FLAG_NONE = 0,
+ FLAG_DELETE = 1,
+ } flags;
+ pg_missing_item() : flags(FLAG_NONE) {}
+ explicit pg_missing_item(eversion_t n) : need(n), flags(FLAG_NONE) {} // have no old version
+ pg_missing_item(eversion_t n, eversion_t h, bool is_delete=false) : need(n), have(h) {
+ set_delete(is_delete);
+ }
- void encode(bufferlist& bl) const {
- ::encode(need, bl);
- ::encode(have, bl);
+ void encode(bufferlist& bl, uint64_t features) const {
+ if (HAVE_FEATURE(features, OSD_RECOVERY_DELETES)) {
+ // encoding a zeroed eversion_t to differentiate between this and
+ // legacy unversioned encoding - a need value of 0'0 is not
+ // possible. This can be replaced with the legacy encoding
+ // macros post-luminous.
+ eversion_t e;
+ ::encode(e, bl);
+ ::encode(need, bl);
+ ::encode(have, bl);
+ ::encode(static_cast<uint8_t>(flags), bl);
+ } else {
+ // legacy unversioned encoding
+ ::encode(need, bl);
+ ::encode(have, bl);
+ }
}
void decode(bufferlist::iterator& bl) {
- ::decode(need, bl);
- ::decode(have, bl);
+ eversion_t e;
+ ::decode(e, bl);
+ if (e != eversion_t()) {
+ // legacy encoding, this is the need value
+ need = e;
+ ::decode(have, bl);
+ } else {
+ ::decode(need, bl);
+ ::decode(have, bl);
+ uint8_t f;
+ ::decode(f, bl);
+ flags = static_cast<missing_flags_t>(f);
+ }
+ }
+
+ void set_delete(bool is_delete) {
+ flags = is_delete ? FLAG_DELETE : FLAG_NONE;
+ }
+
+ bool is_delete() const {
+ return (flags & FLAG_DELETE) == FLAG_DELETE;
+ }
+
+ string flag_str() const {
+ if (flags == FLAG_NONE) {
+ return "none";
+ } else {
+ return "delete";
+ }
}
+
void dump(Formatter *f) const {
f->dump_stream("need") << need;
f->dump_stream("have") << have;
+ f->dump_stream("flags") << flag_str();
}
static void generate_test_instances(list<pg_missing_item*>& o) {
o.push_back(new pg_missing_item);
o.push_back(new pg_missing_item);
o.back()->need = eversion_t(1, 2);
o.back()->have = eversion_t(1, 1);
+ o.push_back(new pg_missing_item);
+ o.back()->need = eversion_t(3, 5);
+ o.back()->have = eversion_t(3, 4);
+ o.back()->flags = FLAG_DELETE;
}
bool operator==(const pg_missing_item &rhs) const {
- return need == rhs.need && have == rhs.have;
+ return need == rhs.need && have == rhs.have && flags == rhs.flags;
}
bool operator!=(const pg_missing_item &rhs) const {
return !(*this == rhs);
}
};
-WRITE_CLASS_ENCODER(pg_missing_item)
+WRITE_CLASS_ENCODER_FEATURES(pg_missing_item)
ostream& operator<<(ostream& out, const pg_missing_item &item);
class pg_missing_const_i {
virtual const map<hobject_t, pg_missing_item> &
get_items() const = 0;
virtual const map<version_t, hobject_t> &get_rmissing() const = 0;
+ virtual bool get_may_include_deletes() const = 0;
virtual unsigned int num_missing() const = 0;
virtual bool have_missing() const = 0;
virtual bool is_missing(const hobject_t& oid, pg_missing_item *out = nullptr) const = 0;
pg_missing_set(const missing_type &m) {
missing = m.get_items();
rmissing = m.get_rmissing();
+ may_include_deletes = m.get_may_include_deletes();
for (auto &&i: missing)
tracker.changed(i.first);
}
+ bool may_include_deletes = false;
+
const map<hobject_t, item> &get_items() const override {
return missing;
}
const map<version_t, hobject_t> &get_rmissing() const override {
return rmissing;
}
+ bool get_may_include_deletes() const override {
+ return may_include_deletes;
+ }
unsigned int num_missing() const override {
return missing.size();
}
* assumes missing is accurate up through the previous log entry.
*/
void add_next_event(const pg_log_entry_t& e) {
- if (e.is_update()) {
- map<hobject_t, item>::iterator missing_it;
- missing_it = missing.find(e.soid);
- bool is_missing_divergent_item = missing_it != missing.end();
- if (e.prior_version == eversion_t() || e.is_clone()) {
- // new object.
- if (is_missing_divergent_item) { // use iterator
- rmissing.erase((missing_it->second).need.version);
- missing_it->second = item(e.version, eversion_t()); // .have = nil
- } else // create new element in missing map
- missing[e.soid] = item(e.version, eversion_t()); // .have = nil
- } else if (is_missing_divergent_item) {
- // already missing (prior).
+ map<hobject_t, item>::iterator missing_it;
+ missing_it = missing.find(e.soid);
+ bool is_missing_divergent_item = missing_it != missing.end();
+ if (e.prior_version == eversion_t() || e.is_clone()) {
+ // new object.
+ if (is_missing_divergent_item) { // use iterator
rmissing.erase((missing_it->second).need.version);
- (missing_it->second).need = e.version; // leave .have unchanged.
- } else if (e.is_backlog()) {
- // May not have prior version
- assert(0 == "these don't exist anymore");
- } else {
- // not missing, we must have prior_version (if any)
- assert(!is_missing_divergent_item);
- missing[e.soid] = item(e.version, e.prior_version);
- }
- rmissing[e.version.version] = e.soid;
- } else if (e.is_delete()) {
- rm(e.soid, e.version);
+ missing_it->second = item(e.version, eversion_t(), e.is_delete()); // .have = nil
+ } else // create new element in missing map
+ missing[e.soid] = item(e.version, eversion_t(), e.is_delete()); // .have = nil
+ } else if (is_missing_divergent_item) {
+ // already missing (prior).
+ rmissing.erase((missing_it->second).need.version);
+ (missing_it->second).need = e.version; // leave .have unchanged.
+ missing_it->second.set_delete(e.is_delete());
+ } else if (e.is_backlog()) {
+ // May not have prior version
+ assert(0 == "these don't exist anymore");
+ } else {
+ // not missing, we must have prior_version (if any)
+ assert(!is_missing_divergent_item);
+ missing[e.soid] = item(e.version, e.prior_version, e.is_delete());
}
-
+ rmissing[e.version.version] = e.soid;
tracker.changed(e.soid);
}
- void revise_need(hobject_t oid, eversion_t need) {
+ void revise_need(hobject_t oid, eversion_t need, bool is_delete) {
if (missing.count(oid)) {
rmissing.erase(missing[oid].need.version);
missing[oid].need = need; // no not adjust .have
+ missing[oid].set_delete(is_delete);
} else {
- missing[oid] = item(need, eversion_t());
+ missing[oid] = item(need, eversion_t(), is_delete);
}
rmissing[need.version] = oid;
}
}
- void add(const hobject_t& oid, eversion_t need, eversion_t have) {
- missing[oid] = item(need, have);
+ void add(const hobject_t& oid, eversion_t need, eversion_t have,
+ bool is_delete) {
+ missing[oid] = item(need, have, is_delete);
rmissing[need.version] = oid;
tracker.changed(oid);
}
void got(const hobject_t& oid, eversion_t v) {
std::map<hobject_t, item>::iterator p = missing.find(oid);
assert(p != missing.end());
- assert(p->second.need <= v);
+ assert(p->second.need <= v || p->second.is_delete());
got(p);
}
pg_t child_pgid,
unsigned split_bits,
pg_missing_set *omissing) {
+ omissing->may_include_deletes = may_include_deletes;
unsigned mask = ~((~0)<<split_bits);
for (map<hobject_t, item>::iterator i = missing.begin();
i != missing.end();
) {
if ((i->first.get_hash() & mask) == child_pgid.m_seed) {
- omissing->add(i->first, i->second.need, i->second.have);
+ omissing->add(i->first, i->second.need, i->second.have,
+ i->second.is_delete());
rm(i++);
} else {
++i;
}
void encode(bufferlist &bl) const {
- ENCODE_START(3, 2, bl);
- ::encode(missing, bl);
+ ENCODE_START(4, 2, bl);
+ ::encode(missing, bl, may_include_deletes ? CEPH_FEATURE_OSD_RECOVERY_DELETES : 0);
+ ::encode(may_include_deletes, bl);
ENCODE_FINISH(bl);
}
void decode(bufferlist::iterator &bl, int64_t pool = -1) {
for (auto const &i: missing)
tracker.changed(i.first);
- DECODE_START_LEGACY_COMPAT_LEN(3, 2, 2, bl);
+ DECODE_START_LEGACY_COMPAT_LEN(4, 2, 2, bl);
::decode(missing, bl);
+ if (struct_v >= 4) {
+ ::decode(may_include_deletes, bl);
+ }
DECODE_FINISH(bl);
if (struct_v < 3) {
f->close_section();
}
f->close_section();
+ f->dump_bool("may_include_deletes", may_include_deletes);
}
template <typename F>
void filter_objects(F &&f) {
o.push_back(new pg_missing_set);
o.back()->add(
hobject_t(object_t("foo"), "foo", 123, 456, 0, ""),
- eversion_t(5, 6), eversion_t(5, 1));
+ eversion_t(5, 6), eversion_t(5, 1), false);
+ o.push_back(new pg_missing_set);
+ o.back()->add(
+ hobject_t(object_t("foo"), "foo", 123, 456, 0, ""),
+ eversion_t(5, 6), eversion_t(5, 1), true);
+ o.back()->may_include_deletes = true;
}
template <typename F>
void get_changed(F &&f) const {
template <bool TrackChanges>
ostream& operator<<(ostream& out, const pg_missing_set<TrackChanges> &missing)
{
- out << "missing(" << missing.num_missing();
+ out << "missing(" << missing.num_missing()
+ << " may_include_deletes = " << missing.may_include_deletes;
//if (missing.num_lost()) out << ", " << missing.num_lost() << " lost";
out << ")";
return out;
}
-
-
-
-
// ---------------------------------------
class OSDSuperblock {
WRITE_CLASS_ENCODER(ScrubMap::object)
WRITE_CLASS_ENCODER(ScrubMap)
-
struct OSDOp {
ceph_osd_op op;
sobject_t soid;
set(osdc_files
- Objecter.cc
- Filer.cc)
-set(osdc_rbd_files
+ Filer.cc
ObjectCacher.cc
+ Objecter.cc
Striper.cc)
-add_library(osdc_rbd_objs OBJECT ${osdc_rbd_files})
-add_library(osdc STATIC ${osdc_files} $<TARGET_OBJECTS:osdc_rbd_objs>)
+add_library(osdc STATIC ${osdc_files})
if(WITH_LTTNG AND WITH_EVENTTRACE)
add_dependencies(osdc eventtrace_tp)
endif()
osdmap->pg_to_up_acting_osds(pgid, &up, &up_primary,
&acting, &acting_primary);
bool sort_bitwise = osdmap->test_flag(CEPH_OSDMAP_SORTBITWISE);
+ bool recovery_deletes = osdmap->test_flag(CEPH_OSDMAP_RECOVERY_DELETES);
unsigned prev_seed = ceph_stable_mod(pgid.ps(), t->pg_num, t->pg_num_mask);
pg_t prev_pgid(prev_seed, pgid.pool());
if (any_change && PastIntervals::is_new_interval(
pg_num,
t->sort_bitwise,
sort_bitwise,
+ t->recovery_deletes,
+ recovery_deletes,
prev_pgid)) {
force_resend = true;
}
pg_t(ceph_stable_mod(pgid.ps(), t->pg_num, t->pg_num_mask), pgid.pool()),
&t->actual_pgid);
t->sort_bitwise = sort_bitwise;
+ t->recovery_deletes = recovery_deletes;
ldout(cct, 10) << __func__ << " "
<< " raw pgid " << pgid << " -> actual " << t->actual_pgid
<< " acting " << acting
if (rc == -EAGAIN) {
ldout(cct, 7) << " got -EAGAIN, resubmitting" << dendl;
-
- if ((op->target.flags & CEPH_OSD_FLAG_BALANCE_READS)
- && (op->target.acting_primary != op->target.osd)) {
- if (op->onfinish)
- num_in_flight--;
- _session_op_remove(s, op);
- sl.unlock();
- put_session(s);
-
- op->tid = 0;
- op->target.flags &= ~CEPH_OSD_FLAG_BALANCE_READS;
- op->target.pgid = pg_t();
- _op_submit(op, sul, NULL);
- m->put();
- return;
- }
-
- // new tid
- s->ops.erase(op->tid);
- op->tid = ++last_tid;
-
- _send_op(op);
+ if (op->onfinish)
+ num_in_flight--;
+ _session_op_remove(s, op);
sl.unlock();
put_session(s);
+
+ op->tid = 0;
+ op->target.flags &= ~(CEPH_OSD_FLAG_BALANCE_READS |
+ CEPH_OSD_FLAG_LOCALIZE_READS);
+ op->target.pgid = pg_t();
+ _op_submit(op, sul, NULL);
m->put();
return;
}
int size = -1; ///< the size of the pool when were were last mapped
int min_size = -1; ///< the min size of the pool when were were last mapped
bool sort_bitwise = false; ///< whether the hobject_t sort order is bitwise
+ bool recovery_deletes = false; ///< whether the deletes are performed during recovery instead of peering
bool used_replica = false;
bool paused = false;
DESTINATION ${PYTHON_INSTDIR})
if(WITH_MGR)
- # Needs to match src/common/config_opts.h, which has:
+ # Location needs to match default setting for mgr_module_path, currently:
# OPTION(mgr_module_path, OPT_STR, CEPH_PKGLIBDIR "/mgr")
install(DIRECTORY
${CMAKE_CURRENT_SOURCE_DIR}/mgr
::
- ceph config-key put mgr/dashboard/server_addr ::
+ ceph config-key set mgr/dashboard/server_addr ::
Restart the ceph-mgr daemon after modifying the setting to load the module.
href="/static/AdminLTE-2.3.7/dist/css/AdminLTE.min.css">
<link rel="stylesheet"
href="/static/AdminLTE-2.3.7/dist/css/skins/skin-blue.min.css">
+ <link rel="stylesheet"
+ href="/static/AdminLTE-2.3.7/plugins/datatables/jquery.dataTables.css">
<script src="/static/AdminLTE-2.3.7/plugins/jQuery/jquery-2.2.3.min.js"></script>
+ <script src="/static/AdminLTE-2.3.7/plugins/sparkline/jquery.sparkline.min.js"></script>
<script src="/static/rivets.bundled.min.js"></script>
<script src="/static/underscore-min.js"></script>
<script src="/static/AdminLTE-2.3.7/bootstrap/js/bootstrap.min.js"></script>
<script src="/static/AdminLTE-2.3.7/dist/js/app.min.js"></script>
+ <script src="/static/AdminLTE-2.3.7/plugins/datatables/jquery.dataTables.min.js"></script>
<script src="https://cdnjs.cloudflare.com/ajax/libs/moment.js/2.17.1/moment.min.js"></script>
<script src="https://cdnjs.cloudflare.com/ajax/libs/Chart.js/2.4.0/Chart.min.js"></script>
return format_number(n, 1024, ['B', 'KiB', 'MiB', 'GiB', 'TiB', 'PiB']);
};
+ rivets.formatters.block_health_color = function(rbd_mirroring) {
+ if (rbd_mirroring.errors > 0) {
+ return "color: #ff0000";
+ } else if (rbd_mirroring.warnings > 0) {
+ return "color: #ffc200";
+ }
+ return "";
+ };
+
+ rivets.formatters.short_version = function(version) {
+ // Expect "ceph version 1.2.3-g9asdasd (as98d7a0s8d7)"
+ var result = /ceph version\s+([^ ]+)\s+\(.+\)/.exec(version);
+ if (result) {
+ // Return the "1.2.3-g9asdasd" part
+ return result[1];
+ } else {
+ // Unexpected format, pass it through
+ return version;
+ }
+ return
+ };
+
/* This is useful if you need to display some alternative text
* when a collection is empty using rv-hide */
rivets.formatters.length = function(val) {
return val.length;
}
+ rivets.formatters.hide_count_box = function(value) {
+ value = +value
+ return (isNaN(value) || value == 0)
+ };
+
rivets.bind($("#health"), toplevel_data);
rivets.bind($("section.sidebar"), toplevel_data);
setTimeout(refresh, refresh_interval);
font-weight: bold;
}
+ .dataTables_wrapper .dataTables_filter {
+ color: #ddd
+ }
+
+ .dataTables_wrapper .dataTables_filter input {
+ color: #222d32
+ }
+
+ .dataTables_wrapper tbody td {
+ background-color: #424d52;
+ color: #ddd;
+ }
+
+ .dataTables_wrapper .dataTables_paginate .ellipsis {
+ color: #ddd !important
+ }
+
+ .dataTables_wrapper .dataTables_paginate .paginate_button {
+ color: #ddd !important
+ }
+
+ .dataTables_wrapper .dataTables_paginate .paginate_button.disabled {
+ color: #424d52 !important
+ }
+
+ .nav-tabs-custom {
+ background-color: #222d32;
+ color: #ddd;
+ }
+ .nav-tabs-custom>.nav-tabs>li{
+ margin-right: 0;
+ }
+ .nav-tabs-custom>.nav-tabs>li>a{
+ border-bottom-color: transparent;
+ color: #aaa;
+ }
+ .nav-tabs-custom>.nav-tabs>li.active>a{
+ background-color: #424d52;
+ color: #eee;
+ }
+ .nav-tabs-custom>.nav-tabs>li.active:hover>a{
+ background-color: #66777f;
+ color: #eee;
+ }
+ .nav-tabs-custom>.tab-content {
+ background-color: #424d52;
+ color: #eee;
+ }
+
</style>
</head>
<!-- Sidebar Menu -->
<ul class="sidebar-menu">
<!-- Optionally, you can add icons to the links -->
- <li><a href="/health">
+ <li class="{%if path_info=='/' or path_info.startswith('/health')%}active{%endif%}">
+ <a href="/health">
<i class="fa fa-heartbeat" rv-style="health_status | health_color"></i>
<span>Cluster health</span></a>
</li>
- <li><a href="/servers">
- <i class="fa fa-server"></i>
- <span>Servers</span></a>
+ <li class="treeview{%if path_info.startswith(('/server', '/osd'))%} active{%endif%}">
+ <a href="#"><i class="fa fa-server"></i> <span>Cluster</span>
+ <span class="pull-right-container">
+ <i class="fa fa-angle-left pull-right"></i>
+ </span>
+ </a>
+ <ul class="treeview-menu menu-open">
+ <li>
+ <a href="/servers">Servers</a>
+ </li>
+ <li>
+ <a href="/osd">OSDs</a>
+ </li>
+ </ul>
</li>
- <li class="treeview active">
- <a href="#"><i class="fa fa-hdd-o"></i> <span>Block</span>
+ <li class="treeview{%if path_info.startswith('/rbd')%} active{%endif%}">
+ <a href="#">
+ <i class="fa fa-hdd-o" rv-style="rbd_mirroring | block_health_color"></i> <span>Block</span>
<span class="pull-right-container">
<i class="fa fa-angle-left pull-right"></i>
</span>
</a>
<ul class="treeview-menu menu-open">
- <li rv-each-pool="rbd_pools">
- <a rv-href="pool.url">{pool.name}</a>
+ <li>
+ <a href="/rbd_mirroring">
+ <i class="fa fa-exchange"></i> Mirroring
+ <span class="pull-right-container">
+ <small rv-hide="rbd_mirroring.warnings | hide_count_box" class="label pull-right bg-yellow">{rbd_mirroring.warnings}</small>
+ <small rv-hide="rbd_mirroring.errors | hide_count_box" class="label pull-right bg-red">{rbd_mirroring.errors}</small>
+ </span>
+ </a>
+ </li>
+ <li>
+ <a href="/rbd_iscsi">
+ <i class="fa fa-upload"></i> iSCSI
+ <span class="pull-right-container"></span>
+ </a>
+ </li>
+ <li class="treeview{%if path_info.startswith('/rbd_pool')%} active menu-open{%endif%}">
+ <a href="#">
+ <i class="fa fa-dot-circle-o"></i> <span>Pools</span>
+ <span class="pull-right-container">
+ <i class="fa fa-angle-left pull-right"></i>
+ </span>
+ </a>
+ <ul class="treeview-menu">
+ <li rv-each-pool="rbd_pools">
+ <a rv-href="pool.url"><i class="fa fa-circle-o"></i> {pool.name}</a>
+ </li>
+ </ul>
</li>
<li class="ceph-none-found" rv-hide="rbd_pools | length">None found</li>
</ul>
</li>
- <li class="treeview active">
+ <li class="treeview{%if path_info.startswith(('/filesystem/', '/clients/'))%} active{%endif%}">
<a href="#"><i class="fa fa-folder"></i> <span>Filesystems</span>
<span class="pull-right-container">
<i class="fa fa-angle-left pull-right"></i>
PgSummary, Health, MonStatus
import rados
+import rbd_iscsi
+import rbd_mirroring
from rbd_ls import RbdLs, RbdPoolLs
from cephfs_clients import CephFSClients
-
log = logging.getLogger("dashboard")
# pools
self.rbd_pool_ls = RbdPoolLs(self)
+ # Stateful instance of RbdISCSI
+ self.rbd_iscsi = rbd_iscsi.Controller(self)
+
+ # Stateful instance of RbdMirroring, hold cached results.
+ self.rbd_mirroring = rbd_mirroring.Controller(self)
+
# Stateful instances of CephFSClients, hold cached results. Key to
# dict is FSCID
self.cephfs_clients = {}
self.log_primed = True
- class Root(object):
+ class EndPoint(object):
+ def _health_data(self):
+ health = global_instance().get_sync_object(Health).data
+ # Transform the `checks` dict into a list for the convenience
+ # of rendering from javascript.
+ checks = []
+ for k, v in health['checks'].iteritems():
+ v['type'] = k
+ checks.append(v)
+
+ checks = sorted(checks, cmp=lambda a, b: a['severity'] > b['severity'])
+
+ health['checks'] = checks
+
+ return health
+
def _toplevel_data(self):
"""
Data consumed by the base.html template
rbd_pools = sorted([
{
"name": name,
- "url": "/rbd/{0}/".format(name)
+ "url": "/rbd_pool/{0}/".format(name)
}
for name in data
], key=lambda k: k['name'])
+ status, rbd_mirroring = global_instance().rbd_mirroring.toplevel.get()
+ if rbd_mirroring is None:
+ log.warning("Failed to get RBD mirroring summary")
+ rbd_mirroring = {}
+
fsmap = global_instance().get_sync_object(FsMap)
filesystems = [
{
return {
'rbd_pools': rbd_pools,
+ 'rbd_mirroring': rbd_mirroring,
'health_status': self._health_data()['status'],
'filesystems': filesystems
}
+ class Root(EndPoint):
@cherrypy.expose
def filesystem(self, fs_id):
template = env.get_template("filesystem.html")
return template.render(
ceph_version=global_instance().version,
+ path_info=cherrypy.request.path_info,
toplevel_data=json.dumps(toplevel_data, indent=2),
content_data=json.dumps(content_data, indent=2)
)
def filesystem_data(self, fs_id):
return global_instance().fs_status(int(fs_id))
- def _osd(self, osd_id):
- #global_instance().fs_status(int(fs_id))
- osd_id = int(osd_id)
-
- osd_map = global_instance().get("osd_map")
-
- osd = None
- for o in osd_map['osds']:
- if o['osd'] == osd_id:
- osd = o
- break
-
- assert osd is not None # TODO 400
-
- osd_spec = "{0}".format(osd_id)
-
- osd_metadata = global_instance().get_metadata(
- "osd", osd_spec)
-
- result = CommandResult("")
- global_instance().send_command(result, "osd", osd_spec,
- json.dumps({
- "prefix": "perf histogram dump",
- }),
- "")
- r, outb, outs = result.wait()
- assert r == 0
- histogram = json.loads(outb)
-
- return {
- "osd": osd,
- "osd_metadata": osd_metadata,
- "osd_histogram": histogram
- }
-
- @cherrypy.expose
- def osd_perf(self, osd_id):
- template = env.get_template("osd_perf.html")
- toplevel_data = self._toplevel_data()
-
- return template.render(
- ceph_version=global_instance().version,
- toplevel_data=json.dumps(toplevel_data, indent=2),
- content_data=json.dumps(self._osd(osd_id), indent=2)
- )
-
- @cherrypy.expose
- @cherrypy.tools.json_out()
- def osd_perf_data(self, osd_id):
- return self._osd(osd_id)
-
def _clients(self, fs_id):
cephfs_clients = global_instance().cephfs_clients.get(fs_id, None)
if cephfs_clients is None:
template = env.get_template("clients.html")
return template.render(
ceph_version=global_instance().version,
+ path_info=cherrypy.request.path_info,
toplevel_data=json.dumps(self._toplevel_data(), indent=2),
content_data=json.dumps(content_data, indent=2)
)
def clients_data(self, fs_id):
return self._clients(int(fs_id))
- def _rbd(self, pool_name):
+ def _rbd_pool(self, pool_name):
rbd_ls = global_instance().rbd_ls.get(pool_name, None)
if rbd_ls is None:
rbd_ls = RbdLs(global_instance(), pool_name)
return value
@cherrypy.expose
- def rbd(self, pool_name):
- template = env.get_template("rbd.html")
+ def rbd_pool(self, pool_name):
+ template = env.get_template("rbd_pool.html")
toplevel_data = self._toplevel_data()
- images = self._rbd(pool_name)
+ images = self._rbd_pool(pool_name)
content_data = {
"images": images,
"pool_name": pool_name
return template.render(
ceph_version=global_instance().version,
+ path_info=cherrypy.request.path_info,
+ toplevel_data=json.dumps(toplevel_data, indent=2),
+ content_data=json.dumps(content_data, indent=2)
+ )
+
+ @cherrypy.expose
+ @cherrypy.tools.json_out()
+ def rbd_pool_data(self, pool_name):
+ return self._rbd_pool(pool_name)
+
+ def _rbd_mirroring(self):
+ status, data = global_instance().rbd_mirroring.content_data.get()
+ if data is None:
+ log.warning("Failed to get RBD mirroring status")
+ return {}
+ return data
+
+ @cherrypy.expose
+ def rbd_mirroring(self):
+ template = env.get_template("rbd_mirroring.html")
+
+ toplevel_data = self._toplevel_data()
+ content_data = self._rbd_mirroring()
+
+ return template.render(
+ ceph_version=global_instance().version,
+ path_info=cherrypy.request.path_info,
+ toplevel_data=json.dumps(toplevel_data, indent=2),
+ content_data=json.dumps(content_data, indent=2)
+ )
+
+ @cherrypy.expose
+ @cherrypy.tools.json_out()
+ def rbd_mirroring_data(self):
+ return self._rbd_mirroring()
+
+ def _rbd_iscsi(self):
+ status, data = global_instance().rbd_iscsi.content_data.get()
+ if data is None:
+ log.warning("Failed to get RBD iSCSI status")
+ return {}
+ return data
+
+ @cherrypy.expose
+ def rbd_iscsi(self):
+ template = env.get_template("rbd_iscsi.html")
+
+ toplevel_data = self._toplevel_data()
+ content_data = self._rbd_iscsi()
+
+ return template.render(
+ ceph_version=global_instance().version,
+ path_info=cherrypy.request.path_info,
toplevel_data=json.dumps(toplevel_data, indent=2),
content_data=json.dumps(content_data, indent=2)
)
@cherrypy.expose
@cherrypy.tools.json_out()
- def rbd_data(self, pool_name):
- return self._rbd(pool_name)
+ def rbd_iscsi_data(self):
+ return self._rbd_iscsi()
@cherrypy.expose
def health(self):
template = env.get_template("health.html")
return template.render(
ceph_version=global_instance().version,
+ path_info=cherrypy.request.path_info,
toplevel_data=json.dumps(self._toplevel_data(), indent=2),
content_data=json.dumps(self._health(), indent=2)
)
template = env.get_template("servers.html")
return template.render(
ceph_version=global_instance().version,
+ path_info=cherrypy.request.path_info,
toplevel_data=json.dumps(self._toplevel_data(), indent=2),
content_data=json.dumps(self._servers(), indent=2)
)
def servers_data(self):
return self._servers()
- def _health_data(self):
- health = global_instance().get_sync_object(Health).data
- # Transform the `checks` dict into a list for the convenience
- # of rendering from javascript.
- checks = []
- for k, v in health['checks'].iteritems():
- v['type'] = k
- checks.append(v)
-
- checks = sorted(checks, cmp=lambda a, b: a['severity'] > b['severity'])
-
- health['checks'] = checks
-
- return health
-
def _health(self):
# Fuse osdmap with pg_summary to get description of pools
# including their PG states
return dict(result)
+ @cherrypy.expose
+ @cherrypy.tools.json_out()
+ def get_counter(self, type, id, path):
+ return global_instance().get_counter(type, id, path)
+
+ @cherrypy.expose
+ @cherrypy.tools.json_out()
+ def get_perf_schema(self, **args):
+ type = args.get('type', '')
+ id = args.get('id', '')
+ schema = global_instance().get_perf_schema(type, id)
+ ret = dict()
+ for k1 in schema.keys(): # 'perf_schema'
+ ret[k1] = collections.OrderedDict()
+ for k2 in sorted(schema[k1].keys()):
+ sorted_dict = collections.OrderedDict(
+ sorted(schema[k1][k2].items(), key=lambda i: i[0])
+ )
+ ret[k1][k2] = sorted_dict
+ return ret
+
server_addr = self.get_localized_config('server_addr', '::')
server_port = self.get_localized_config('server_port', '7000')
if server_addr is None:
- raise RuntimeError('no server_addr configured; try "ceph config-key put mgr/dashboard/server_addr <ip>"')
+ raise RuntimeError('no server_addr configured; try "ceph config-key set mgr/dashboard/server_addr <ip>"')
log.info("server_addr: %s server_port: %s" % (server_addr, server_port))
cherrypy.config.update({
'server.socket_host': server_addr,
}
}
log.info("Serving static from {0}".format(static_dir))
+
+ class OSDEndpoint(EndPoint):
+ def _osd(self, osd_id):
+ osd_id = int(osd_id)
+
+ osd_map = global_instance().get("osd_map")
+
+ osd = None
+ for o in osd_map['osds']:
+ if o['osd'] == osd_id:
+ osd = o
+ break
+
+ assert osd is not None # TODO 400
+
+ osd_spec = "{0}".format(osd_id)
+
+ osd_metadata = global_instance().get_metadata(
+ "osd", osd_spec)
+
+ result = CommandResult("")
+ global_instance().send_command(result, "osd", osd_spec,
+ json.dumps({
+ "prefix": "perf histogram dump",
+ }),
+ "")
+ r, outb, outs = result.wait()
+ assert r == 0
+ histogram = json.loads(outb)
+
+ return {
+ "osd": osd,
+ "osd_metadata": osd_metadata,
+ "osd_histogram": histogram
+ }
+
+ @cherrypy.expose
+ def perf(self, osd_id):
+ template = env.get_template("osd_perf.html")
+ toplevel_data = self._toplevel_data()
+
+ return template.render(
+ ceph_version=global_instance().version,
+ path_info='/osd' + cherrypy.request.path_info,
+ toplevel_data=json.dumps(toplevel_data, indent=2),
+ content_data=json.dumps(self._osd(osd_id), indent=2)
+ )
+
+ @cherrypy.expose
+ @cherrypy.tools.json_out()
+ def perf_data(self, osd_id):
+ return self._osd(osd_id)
+
+ @cherrypy.expose
+ @cherrypy.tools.json_out()
+ def list_data(self):
+ return self._osds_by_server()
+
+ def _osd_summary(self, osd_id, osd_info):
+ """
+ The info used for displaying an OSD in a table
+ """
+
+ osd_spec = "{0}".format(osd_id)
+
+ result = {}
+ result['id'] = osd_id
+ result['stats'] = {}
+ result['stats_history'] = {}
+
+ # Counter stats
+ for s in ['osd.op_w', 'osd.op_in_bytes', 'osd.op_r', 'osd.op_out_bytes']:
+ result['stats'][s.split(".")[1]] = global_instance().get_rate('osd', osd_spec, s)
+ result['stats_history'][s.split(".")[1]] = \
+ global_instance().get_counter('osd', osd_spec, s)[s]
+
+ # Gauge stats
+ for s in ["osd.numpg", "osd.stat_bytes", "osd.stat_bytes_used"]:
+ result['stats'][s.split(".")[1]] = global_instance().get_latest('osd', osd_spec, s)
+
+ result['up'] = osd_info['up']
+ result['in'] = osd_info['in']
+
+ result['url'] = "/osd/perf/{0}".format(osd_id)
+
+ return result
+
+ def _osds_by_server(self):
+ result = defaultdict(list)
+ servers = global_instance().list_servers()
+
+ osd_map = global_instance().get_sync_object(OsdMap)
+
+ for server in servers:
+ hostname = server['hostname']
+ services = server['services']
+ first = True
+ for s in services:
+ if s["type"] == "osd":
+ osd_id = int(s["id"])
+ # If metadata doesn't tally with osdmap, drop it.
+ if osd_id not in osd_map.osds_by_id:
+ global_instance().log.warn(
+ "OSD service {0} missing in OSDMap, stale metadata?".format(osd_id))
+ continue
+ summary = self._osd_summary(osd_id,
+ osd_map.osds_by_id[osd_id])
+
+ if first:
+ # A little helper for rendering
+ summary['first'] = True
+ first = False
+ result[hostname].append(summary)
+
+ global_instance().log.warn("result.size {0} servers.size {1}".format(
+ len(result), len(servers)
+ ))
+
+ # Return list form for convenience of rendering
+ return result.items()
+
+ @cherrypy.expose
+ def index(self):
+ """
+ List of all OSDS grouped by host
+ :return:
+ """
+
+ template = env.get_template("osds.html")
+ toplevel_data = self._toplevel_data()
+
+ content_data = {
+ "osds_by_server": self._osds_by_server()
+ }
+
+ return template.render(
+ ceph_version=global_instance().version,
+ path_info='/osd' + cherrypy.request.path_info,
+ toplevel_data=json.dumps(toplevel_data, indent=2),
+ content_data=json.dumps(content_data, indent=2)
+ )
+
cherrypy.tree.mount(Root(), "/", conf)
+ cherrypy.tree.mount(OSDEndpoint(), "/osd", conf)
log.info("Starting engine...")
cherrypy.engine.start()
// Pre-populated initial data at page load
var content_data = {{ content_data }};
-
-
var hexdigits = function(v) {
var i = Math.floor(v * 255);
if (Math.floor(i) < 0x10) {
post_load();
var refresh = function() {
- $.get("/osd_perf_data/" + content_data.osd.osd + "/", function(data) {
+ $.get("/osd/perf_data/" + content_data.osd.osd + "/", function(data) {
_.extend(content_data.osd_histogram, data.osd_histogram);
_.extend(content_data.osd, data.osd);
_.extend(content_data.osd_metadata, data.osd_metadata);
--- /dev/null
+{% extends "base.html" %}
+
+{% block content %}
+
+<script>
+ $(document).ready(function(){
+ // Pre-populated initial data at page load
+ var content_data = {{ content_data }};
+
+ var refresh = function() {
+ $.get("/osd/list_data/", function(data) {
+ content_data.osds_by_server = data;
+ $('.inlinesparkline').sparkline();
+ setTimeout(refresh, 5000);
+ });
+ };
+
+ rivets.formatters.colored_up_in = function(osd){
+ var result = "";
+ if (osd.up) {
+ result += "<span style='color:#00bb00;'>up</span>";
+ } else {
+ result += "<span style='color:#bb0000;'>down</span>";
+ }
+
+ result += ", ";
+
+ if (osd.in) {
+ result += "<span style='color:#00bb00;'>in</span>";
+ } else {
+ result += "<span style='color:#bb0000;'>out</span>";
+ }
+
+ return result;
+ };
+
+ rivets.formatters.sparkline_data = function(time_series) {
+ result = "";
+ for (var i = 1; i < time_series.length; ++i) {
+ var delta_v = time_series[i][1] - time_series[i - 1][1];
+ var delta_t = time_series[i][0] - time_series[i - 1][0];
+ result += (delta_v / delta_t + ",");
+ }
+ return result;
+ };
+
+ rivets.bind($("div#content"), content_data);
+ $('.inlinesparkline').sparkline();
+ setTimeout(refresh, 5000);
+ });
+</script>
+
+<section class="content-header">
+ <h1>
+ OSD daemons
+ </h1>
+</section>
+
+<section class="content">
+ <div class="box">
+ <div class="box-body">
+
+ <table class="table table-condensed table-bordered">
+ <thead>
+ <tr>
+ <th>Host</th>
+ <th>ID</th>
+ <th>Status</th>
+ <th>PGs</th>
+ <th>Usage</th>
+ <th>Read bytes</th>
+ <th>Write bytes</th>
+ <th>Read ops</th>
+ <th>Write ops</th>
+ </tr>
+ </thead>
+
+ <tbody rv-each-server="osds_by_server">
+ <tr rv-each-osd="server.1">
+ <td rv-if="osd.first" rv-rowspan="server.1 | length">{server.0}</td>
+ <td><a rv-href="osd.url">{osd.id}</a></td>
+ <td rv-html="osd | colored_up_in"></td>
+ <td>{osd.stats.numpg}</td>
+ <td>{osd.stats.stat_bytes_used | dimless_binary} / {osd.stats.stat_bytes | dimless_binary}</td>
+ <td>{osd.stats.op_out_bytes | dimless_binary}/s <span class="inlinesparkline" rv-html="osd.stats_history.op_out_bytes | sparkline_data"></span></td>
+ <td>{osd.stats.op_in_bytes | dimless_binary}/s <span class="inlinesparkline" rv-html="osd.stats_history.op_in_bytes | sparkline_data"></span></td>
+ <td>{osd.stats.op_r | dimless}/s</td>
+ <td>{osd.stats.op_w | dimless}/s</td>
+ </tr>
+ </tbody>
+
+
+ </table>
+
+
+
+
+ </div>
+ </div>
+
+
+</section>
+<!-- /.content -->
+
+{% endblock %}
+++ /dev/null
-{% extends "base.html" %}
-
-{% block content %}
-
-<script>
- $(document).ready(function(){
- // Pre-populated initial data at page load
- var content_data = {{ content_data }};
-
- var refresh = function() {
- $.get("/rbd_data/" + content_data.pool_name + "/", function(data) {
- content_data.images = data;
- setTimeout(refresh, 5000);
- });
- };
-
- console.log(content_data);
-
- rivets.bind($("div#content"), content_data);
- setTimeout(refresh, 5000);
- });
-</script>
-
-
-<section class="content-header">
- <h1>
- RBD
- </h1>
-</section>
-
-<section class="content">
- <div class="box">
- <div class="box-header">
- <h3 class="box-title">Images</h3>
- </div>
- <div class="box-body">
- <table class="table table-bordered">
- <thead>
- <tr>
- <th>Name</th>
- <th>Size</th>
- <th>Objects</th>
- <th>Object size</th>
- <th>Parent</th>
- </tr>
- </thead>
- <tbody>
- <tr rv-each-image="images">
- <td>{image.name}</td>
- <td>{image.size | dimless_binary}</td>
- <td>{image.num_objs | dimless}</td>
- <td>{image.obj_size | dimless_binary}</td>
- <td>{image.parent}</td>
- </tr>
- </tbody>
- </table>
- </div>
- </div>
-
-
-</section>
-<!-- /.content -->
-
-{% endblock %}
--- /dev/null
+{% extends "base.html" %}
+
+{% block content %}
+
+<script>
+ $(document).ready(function(){
+ // Pre-populated initial data at page load
+ var content_data = {{ content_data }};
+
+ var refresh = function() {
+ $.get("/rbd_iscsi_data", function(data) {
+ _.extend(content_data, data);
+ setTimeout(refresh, 30000);
+ });
+ };
+
+ console.log(content_data);
+
+ rivets.bind($("div#content"), content_data);
+ setTimeout(refresh, 30000);
+
+ $('#daemons').DataTable({
+ 'paging' : true,
+ 'pageLength' : 5,
+ 'lengthChange': false,
+ 'info' : false,
+ 'autoWidth' : false,
+ 'searching' : false
+ });
+
+ $('#images').DataTable({
+ 'paging' : true,
+ 'pageLength' : 10,
+ 'lengthChange': false,
+ 'searching' : true,
+ 'ordering' : true,
+ 'info' : false
+ });
+ });
+</script>
+
+
+<section class="content-header">
+ <h1>
+ Block iSCSI
+ </h1>
+</section>
+
+<section class="content">
+ <div class="box">
+ <div class="box-header">
+ <h3 class="box-title">Daemons</h3>
+ </div>
+ <div class="box-body">
+ <table id="daemons" class="table table-condensed">
+ <thead>
+ <tr>
+ <th>Hostname</th>
+ <th># Active/Optimized</th>
+ <th># Active/Non-Optimized</th>
+ <th>Version</th>
+ </tr>
+ </thead>
+ <tbody>
+ <tr rv-each-daemon="daemons">
+ <td>{daemon.server_hostname}</td>
+ <td>{daemon.optimized_paths}</td>
+ <td>{daemon.non_optimized_paths}</td>
+ <td>{daemon.version | short_version}</td>
+ </tr>
+ </tbody>
+ </table>
+ </div>
+ </div>
+
+ <div class="box">
+ <div class="box-header">
+ <h3 class="box-title">Images</h3>
+ </div>
+ <div class="box-body">
+ <table id="images" class="table table-condensed">
+ <thead>
+ <tr>
+ <th>Pool</th>
+ <th>Image</th>
+ <th>Active/Optimized</th>
+ <th>Active/Non-Optimized</th>
+ </tr>
+ </thead>
+ <tbody>
+ <tr rv-each-image="images">
+ <td>{image.pool_name}</td>
+ <td>{image.name}</td>
+ <td>{image.optimized_paths}</td>
+ <td>{image.non_optimized_paths}</td>
+ </tr>
+ </tbody>
+ </table>
+ </div>
+ </div>
+</section>
+<!-- /.content -->
+
+{% endblock %}
--- /dev/null
+
+import rados
+import rbd
+from remote_view_cache import RemoteViewCache
+
+SERVICE_TYPE = 'tcmu-runner'
+
+class DaemonsAndImages(RemoteViewCache):
+ def _get(self):
+ daemons = {}
+ images = {}
+ for server in self._module.list_servers():
+ for service in server['services']:
+ if service['type'] == SERVICE_TYPE:
+ metadata = self._module.get_metadata(SERVICE_TYPE,
+ service['id'])
+ status = self._module.get_daemon_status(SERVICE_TYPE,
+ service['id'])
+
+ daemon = daemons.get(server['hostname'], None)
+ if daemon is None:
+ daemon = {
+ 'server_hostname': server['hostname'],
+ 'version': metadata['ceph_version'],
+ 'optimized_paths': 0,
+ 'non_optimized_paths': 0
+ }
+ daemons[server['hostname']] = daemon
+
+ image = images.get(service['id'])
+ if image is None:
+ image = {
+ 'id': service['id'],
+ 'pool_name': metadata['pool_name'],
+ 'name': metadata['image_name'],
+ 'optimized_paths': [],
+ 'non_optimized_paths': []
+ }
+ if status.get('lock_owner', 'false') == 'true':
+ daemon['optimized_paths'] += 1
+ image['optimized_paths'].append(server['hostname'])
+ else:
+ daemon['non_optimized_paths'] += 1
+ image['non_optimized_paths'].append(server['hostname'])
+ images[service['id']] = image
+
+ return {
+ 'daemons': [daemons[k] for k in sorted(daemons, key=daemons.get)],
+ 'images': [images[k] for k in sorted(images, key=images.get)]
+ }
+
+class Controller:
+ """Exposes a DaemonsAndImages view, built from the given module instance, as `content_data`."""
+ def __init__(self, module_inst):
+ self.content_data = DaemonsAndImages(module_inst)
i = rbd.Image(self.ioctx, name)
stat = i.stat()
stat['name'] = name
+ features = i.features()
+ stat['features'] = features
+ stat['features_name'] = self._format_bitmask(features)
try:
parent_info = i.parent_info()
pass
result.append(stat)
return result
+
+ def _format_bitmask(self, features):
+ names = ""
+ RBD_FEATURES_NAME_MAPPING = {
+ rbd.RBD_FEATURE_LAYERING: "layering",
+ rbd.RBD_FEATURE_STRIPINGV2: "striping",
+ rbd.RBD_FEATURE_EXCLUSIVE_LOCK: "exclusive-lock",
+ rbd.RBD_FEATURE_OBJECT_MAP: "object-map",
+ rbd.RBD_FEATURE_FAST_DIFF: "fast-diff",
+ rbd.RBD_FEATURE_DEEP_FLATTEN: "deep-flatten",
+ rbd.RBD_FEATURE_JOURNALING: "journaling",
+ rbd.RBD_FEATURE_DATA_POOL: "data-pool",
+ }
+
+ for key in RBD_FEATURES_NAME_MAPPING.keys():
+ if (key & features == 0):
+ continue
+
+ if names:
+ names = names + ", "
+ names = names + RBD_FEATURES_NAME_MAPPING.get(key)
+
+ return names
--- /dev/null
+{% extends "base.html" %}
+
+{% block content %}
+
+<script>
+ $(document).ready(function(){
+ // Pre-populated initial data at page load
+ var content_data = {{ content_data }};
+
+ var refresh = function() {
+ $.get("/rbd_mirroring_data", function(data) {
+ _.extend(content_data, data);
+ setTimeout(refresh, 30000);
+ });
+ };
+
+ console.log(content_data);
+
+ rivets.formatters.mirror_health_color = function(status_str) {
+ if (status_str == "warning") {
+ return "label label-warning";
+ } else if (status_str == "error") {
+ return "label label-danger";
+ } else if (status_str == "success") {
+ return "label label-success";
+ }
+ return "label label-info";
+ }
+ rivets.formatters.sync_progress_bar = function(progress){
+ var ratio = progress / 100.0;
+ return "width: " + Math.round(ratio * 100).toString() + "%";
+ };
+
+ rivets.bind($("div#content"), content_data);
+ setTimeout(refresh, 30000);
+
+ var table_ids = ["daemons", "pools"];
+ for (var i = 0; i < table_ids.length; ++i) {
+ $('#' + table_ids[i]).DataTable({
+ 'paging' : true,
+ 'pageLength' : 5,
+ 'lengthChange': false,
+ 'info' : false,
+ 'autoWidth' : false,
+ 'searching' : false
+ });
+ }
+
+ var table_ids = ["image_errors", "image_syncing", "image_ready"];
+ for (var i = 0; i < table_ids.length; ++i) {
+ $('#' + table_ids[i]).DataTable({
+ 'paging' : true,
+ 'pageLength' : 10,
+ 'lengthChange': false,
+ 'searching' : true,
+ 'ordering' : true,
+ 'info' : false
+ });
+ };
+ });
+</script>
+
+
+<section class="content-header">
+ <h1>
+ Block Mirroring
+ </h1>
+</section>
+
+<section class="content">
+ <div class="row">
+ <div class="col-sm-6">
+ <div class="box">
+ <div class="box-header">
+ <h3 class="box-title">Daemons</h3>
+ </div>
+ <div class="box-body">
+ <table id="daemons" class="table table-condensed">
+ <thead>
+ <tr>
+ <th>ID</th>
+ <th>Instance</th>
+ <th>Hostname</th>
+ <th>Version</th>
+ <th>Health</th>
+ </tr>
+ </thead>
+ <tbody>
+ <tr rv-each-daemon="daemons">
+ <td>{daemon.id}</td>
+ <td>{daemon.instance_id}</td>
+ <td>{daemon.server_hostname}</td>
+ <td>{daemon.version | short_version}</td>
+ <td><span rv-class="daemon.health_color | mirror_health_color">{daemon.health}</span></td>
+ </tr>
+ </tbody>
+ </table>
+ </div>
+ </div>
+ </div>
+
+ <div class="col-sm-6">
+ <div class="box">
+ <div class="box-header">
+ <h3 class="box-title">Pools</h3>
+ </div>
+ <div class="box-body">
+ <table id="pools" class="table table-condensed">
+ <thead>
+ <tr>
+ <th>Name</th>
+ <th>Mode</th>
+ <th>Leader</th>
+ <th># Local</th>
+ <th># Remote</th>
+ <th>Health</th>
+ </tr>
+ </thead>
+ <tbody>
+ <tr rv-each-pool="pools">
+ <td>{pool.name}</td>
+ <td>{pool.mirror_mode}</td>
+ <td>{pool.leader_id}</td>
+ <td>{pool.image_local_count}</td>
+ <td>{pool.image_remote_count}</td>
+ <td><span rv-class="pool.health_color | mirror_health_color">{pool.health}</span></td>
+ </tr>
+ </tbody>
+ </table>
+ </div>
+ </div>
+ </div>
+ </div>
+
+ <div class="box">
+ <div class="box-header">
+ <h3 class="box-title">Images</h3>
+ </div>
+ <div class="box-body">
+ <div class="nav-tabs-custom">
+ <ul class="nav nav-tabs">
+ <li class="active"><a href="#tab_1" data-toggle="tab">Issues</a></li>
+ <li><a href="#tab_2" data-toggle="tab">Syncing</a></li>
+ <li><a href="#tab_3" data-toggle="tab">Ready</a></li>
+ </ul>
+ <div class="tab-content">
+ <div class="tab-pane active" id="tab_1">
+ <table id="image_errors" class="table table-condensed">
+ <thead>
+ <tr>
+ <th>Pool</th>
+ <th>Image</th>
+ <th>Issue</th>
+ <th>State</th>
+ </tr>
+ </thead>
+ <tbody>
+ <tr rv-each-image="image_error">
+ <td>{image.pool_name}</td>
+ <td>{image.name}</td>
+ <td>{image.description}</td>
+ <td><span rv-class="image.state_color | mirror_health_color">{image.state}</span></td>
+ </tr>
+ </tbody>
+ </table>
+ </div>
+ <div class="tab-pane" id="tab_2">
+ <table id="image_syncing" class="table table-condensed">
+ <thead>
+ <tr>
+ <th>Pool</th>
+ <th>Image</th>
+ <th>Progress</th>
+ <th>State</th>
+ </tr>
+ </thead>
+ <tbody>
+ <tr rv-each-image="image_syncing">
+ <td>{image.pool_name}</td>
+ <td>{image.name}</td>
+ <td>
+ <div class="progress" style="width: 100px; height: 18px;">
+ <div class="progress-bar progress-bar-aqua" rv-style="image.progress | sync_progress_bar">
+ {image.progress}
+ </div>
+ </div>
+ </td>
+ <td><span class="label label-info">Syncing</span></td>
+ </tr>
+ </tbody>
+ </table>
+ </div>
+ <div class="tab-pane" id="tab_3">
+ <table id="image_ready" class="table table-condensed">
+ <thead>
+ <tr>
+ <th>Pool</th>
+ <th>Image</th>
+ <th>Description</th>
+ <th>State</th>
+ </tr>
+ </thead>
+ <tbody>
+ <tr rv-each-image="image_ready">
+ <td>{image.pool_name}</td>
+ <td>{image.name}</td>
+ <td>{image.description}</td>
+ <td><span rv-class="image.state_color | mirror_health_color">{image.state}</span></td>
+ </tr>
+ </tbody>
+ </table>
+ </div>
+ </div>
+ </div>
+ </div>
+ </div>
+</section>
+<!-- /.content -->
+
+{% endblock %}
--- /dev/null
+
+import json
+import re
+import rados
+import rbd
+from remote_view_cache import RemoteViewCache
+
+class DaemonsAndPools(RemoteViewCache):
+    """
+    Cached view of the rbd-mirror daemons and of the pools that have
+    mirroring enabled, each annotated with a derived health state.
+    """
+
+    def _get(self):
+        """
+        Build the cached value.
+
+        :return: dict with 'daemons' (list, see get_daemons) and 'pools'
+                 (dict keyed by pool name, see get_pools)
+        """
+        daemons = self.get_daemons()
+        return {
+            'daemons': daemons,
+            'pools': self.get_pools(daemons)
+        }
+
+    def get_daemons(self):
+        """
+        Collect every service of type 'rbd-mirror' from the server list.
+
+        :return: list of daemon dicts sorted by service id; each holds the
+                 raw service/server/metadata/status data plus the 'health'
+                 and 'health_color' keys computed by get_daemon_health
+        """
+        daemons = []
+        for server in self._module.list_servers():
+            for service in server['services']:
+                if service['type'] == 'rbd-mirror':
+                    metadata = self._module.get_metadata('rbd-mirror',
+                                                         service['id'])
+                    status = self._module.get_daemon_status('rbd-mirror',
+                                                            service['id'])
+                    try:
+                        status = json.loads(status['json'])
+                    except Exception:
+                        # missing or malformed status: treat as "no pools"
+                        status = {}
+
+                    # extract per-daemon service data and health
+                    daemon = {
+                        'id': service['id'],
+                        'instance_id': metadata['instance_id'],
+                        'version': metadata['ceph_version'],
+                        'server_hostname': server['hostname'],
+                        'service': service,
+                        'server': server,
+                        'metadata': metadata,
+                        'status': status
+                    }
+                    daemon = dict(daemon, **self.get_daemon_health(daemon))
+                    daemons.append(daemon)
+
+        return sorted(daemons, key=lambda k: k['id'])
+
+    def get_daemon_health(self, daemon):
+        """
+        Derive a daemon's health from the callouts of every pool listed in
+        its status.
+
+        The worst level wins: any 'error' callout yields Error, otherwise
+        any 'warning' callout yields Warning, otherwise OK. A daemon whose
+        status lists no pools stays Unknown.
+
+        :param daemon: daemon dict with a 'status' dict of per-pool data
+        :return: dict with 'health_color' and 'health'
+        """
+        health = {
+            'health_color': 'info',
+            'health': 'Unknown'
+        }
+        for pool_id, pool_data in daemon['status'].items():
+            # Compare against 'health_color' (lower-case level name); the
+            # 'health' label is capitalised and never equals 'error'.
+            if health['health_color'] == 'error':
+                # error is the worst state, nothing may downgrade it
+                break
+
+            levels = [v['level']
+                      for v in pool_data.get('callouts', {}).values()]
+            if 'error' in levels:
+                health = {
+                    'health_color': 'error',
+                    'health': 'Error'
+                }
+            elif 'warning' in levels:
+                health = {
+                    'health_color': 'warning',
+                    'health': 'Warning'
+                }
+            elif health['health_color'] == 'info':
+                health = {
+                    'health_color': 'success',
+                    'health': 'OK'
+                }
+        return health
+
+    def get_pools(self, daemons):
+        """
+        Build per-pool mirroring statistics for every RBD pool whose
+        mirror mode is not disabled.
+
+        :param daemons: list of daemon dicts as returned by get_daemons
+        :return: dict mapping pool name to a stats dict ('mirror_mode',
+                 'health', 'health_color' and, when a leader reported,
+                 'leader_id' and the image counts); empty dict when the
+                 pool list is unavailable
+        """
+        status, pool_names = self._module.rbd_pool_ls.get()
+        if pool_names is None:
+            self.log.warning("Failed to get RBD pool list")
+            return {}
+
+        pool_stats = {}
+        rbdctx = rbd.RBD()
+        for pool_name in pool_names:
+            self.log.debug("Constructing IOCtx " + pool_name)
+            try:
+                ioctx = self._module.rados.open_ioctx(pool_name)
+            except Exception:
+                self.log.exception("Failed to open pool " + pool_name)
+                continue
+
+            try:
+                mirror_mode = rbdctx.mirror_mode_get(ioctx)
+            except Exception:
+                self.log.exception("Failed to query mirror mode " + pool_name)
+                # without a mode there is nothing to report; do not fall
+                # through with an undefined or stale mirror_mode
+                continue
+
+            stats = {}
+            if mirror_mode == rbd.RBD_MIRROR_MODE_DISABLED:
+                continue
+            elif mirror_mode == rbd.RBD_MIRROR_MODE_IMAGE:
+                mirror_mode = "image"
+            elif mirror_mode == rbd.RBD_MIRROR_MODE_POOL:
+                mirror_mode = "pool"
+            else:
+                mirror_mode = "unknown"
+                stats['health_color'] = "warning"
+                stats['health'] = "Warning"
+
+            pool_stats[pool_name] = dict(stats, **{
+                'mirror_mode': mirror_mode
+            })
+
+        for daemon in daemons:
+            for pool_id, pool_data in daemon['status'].items():
+                stats = pool_stats.get(pool_data['name'], None)
+                if stats is None:
+                    continue
+
+                if pool_data.get('leader', False):
+                    # leader instance stores image counts
+                    stats['leader_id'] = daemon['metadata']['instance_id']
+                    stats['image_local_count'] = pool_data.get('image_local_count', 0)
+                    stats['image_remote_count'] = pool_data.get('image_remote_count', 0)
+
+                if (stats.get('health_color', '') != 'error' and
+                    pool_data.get('image_error_count', 0) > 0):
+                    stats['health_color'] = 'error'
+                    stats['health'] = 'Error'
+                elif (stats.get('health_color', '') != 'error' and
+                      pool_data.get('image_warning_count', 0) > 0):
+                    stats['health_color'] = 'warning'
+                    stats['health'] = 'Warning'
+                elif stats.get('health', None) is None:
+                    stats['health_color'] = 'success'
+                    stats['health'] = 'OK'
+
+        for name, stats in pool_stats.items():
+            if stats.get('health', None) is None:
+                # daemon doesn't know about pool
+                stats['health_color'] = 'error'
+                stats['health'] = 'Error'
+            elif stats.get('leader_id', None) is None:
+                # no daemons are managing the pool as leader instance
+                stats['health_color'] = 'warning'
+                stats['health'] = 'Warning'
+        return pool_stats
+
+
+class PoolDatum(RemoteViewCache):
+    """
+    Cached mirror image status list for a single pool.
+    """
+
+    def __init__(self, module_inst, pool_name):
+        """
+        :param module_inst: owning mgr module instance
+        :param pool_name: name of the pool whose images are listed
+        """
+        super(PoolDatum, self).__init__(module_inst)
+        self.pool_name = pool_name
+
+    def _get(self):
+        """
+        Query the mirror image status of every image in the pool.
+
+        :return: None when the pool cannot be opened; otherwise a dict
+                 that holds 'mirror_images' (list sorted by image name)
+                 when the status listing succeeded
+        """
+        data = {}
+        self.log.debug("Constructing IOCtx " + self.pool_name)
+        try:
+            ioctx = self._module.rados.open_ioctx(self.pool_name)
+        except Exception:
+            # use the instance attribute: there is no local 'pool_name'
+            self.log.exception("Failed to open pool " + self.pool_name)
+            return None
+
+        # presentation data merged into each image, keyed by image state;
+        # 'down' is used for images whose status is not 'up'
+        mirror_state = {
+            'down': {
+                'health': 'issue',
+                'state_color': 'warning',
+                'state': 'Unknown',
+                'description': None
+            },
+            rbd.MIRROR_IMAGE_STATUS_STATE_UNKNOWN: {
+                'health': 'issue',
+                'state_color': 'warning',
+                'state': 'Unknown'
+            },
+            rbd.MIRROR_IMAGE_STATUS_STATE_ERROR: {
+                'health': 'issue',
+                'state_color': 'error',
+                'state': 'Error'
+            },
+            rbd.MIRROR_IMAGE_STATUS_STATE_SYNCING: {
+                'health': 'syncing'
+            },
+            rbd.MIRROR_IMAGE_STATUS_STATE_STARTING_REPLAY: {
+                'health': 'ok',
+                'state_color': 'success',
+                'state': 'Starting'
+            },
+            rbd.MIRROR_IMAGE_STATUS_STATE_REPLAYING: {
+                'health': 'ok',
+                'state_color': 'success',
+                'state': 'Replaying'
+            },
+            rbd.MIRROR_IMAGE_STATUS_STATE_STOPPING_REPLAY: {
+                'health': 'ok',
+                'state_color': 'success',
+                'state': 'Stopping'
+            },
+            rbd.MIRROR_IMAGE_STATUS_STATE_STOPPED: {
+                'health': 'ok',
+                'state_color': 'info',
+                'state': 'Primary'
+            }
+        }
+
+        rbdctx = rbd.RBD()
+        try:
+            mirror_image_status = rbdctx.mirror_image_status_list(ioctx)
+            data['mirror_images'] = sorted([
+                dict({
+                    'name': image['name'],
+                    'description': image['description']
+                }, **mirror_state['down' if not image['up'] else image['state']])
+                for image in mirror_image_status
+            ], key=lambda k: k['name'])
+        except rbd.ImageNotFound:
+            pass
+        except Exception:
+            self.log.exception("Failed to list mirror image status " + self.pool_name)
+
+        return data
+
+class Toplevel(RemoteViewCache):
+    """
+    Cached count of rbd-mirror warnings and errors across all daemons
+    and pools.
+    """
+
+    def __init__(self, module_inst, daemons_and_pools):
+        """
+        :param module_inst: owning mgr module instance
+        :param daemons_and_pools: cache supplying the daemon/pool health
+        """
+        super(Toplevel, self).__init__(module_inst)
+        self.daemons_and_pools = daemons_and_pools
+
+    def _get(self):
+        """
+        Count daemons and pools by health colour.
+
+        :return: dict with integer 'warnings' and 'errors'; both are 0
+                 when the daemon/pool data is unavailable
+        """
+        status, data = self.daemons_and_pools.get()
+        if data is None:
+            self.log.warning("Failed to get rbd-mirror daemons and pools")
+            # fall back to an empty mapping so the lookups below work
+            data = {}
+        daemons = data.get('daemons', [])
+        pools = data.get('pools', {})
+
+        warnings = 0
+        errors = 0
+        for daemon in daemons:
+            if daemon['health_color'] == 'error':
+                errors += 1
+            elif daemon['health_color'] == 'warning':
+                warnings += 1
+        for pool_name, pool in pools.items():
+            if pool['health_color'] == 'error':
+                errors += 1
+            elif pool['health_color'] == 'warning':
+                warnings += 1
+        return {'warnings': warnings, 'errors': errors}
+
+
+class ContentData(RemoteViewCache):
+    """
+    Cached payload for the mirroring page: daemons, pools and the images
+    grouped into error / syncing / ready lists.
+    """
+
+    def __init__(self, module_inst, daemons_and_pools, pool_data):
+        super(ContentData, self).__init__(module_inst)
+
+        self.pool_data = pool_data
+        self.daemons_and_pools = daemons_and_pools
+
+    def _get(self):
+        """
+        Assemble the page payload.
+
+        :return: None when the RBD pool list is unavailable, otherwise a
+                 dict with 'daemons', 'pools', 'image_error',
+                 'image_syncing' and 'image_ready'
+        """
+        status, pool_names = self._module.rbd_pool_ls.get()
+        if pool_names is None:
+            self.log.warning("Failed to get RBD pool list")
+            return None
+
+        status, cached = self.daemons_and_pools.get()
+        if cached is None:
+            self.log.warning("Failed to get rbd-mirror daemons list")
+            cached = {}
+        daemons = cached.get('daemons', [])
+        pool_stats = cached.get('pools', {})
+
+        # pulls the percentage out of a sync status description
+        progress_re = re.compile("bootstrapping, IMAGE_COPY/COPY_OBJECT (.*)%")
+
+        pools = []
+        image_error = []
+        image_syncing = []
+        image_ready = []
+        for pool_name in pool_names:
+            pool = self.get_pool_datum(pool_name) or {}
+            stats = pool_stats.get(pool_name, {})
+            if stats.get('mirror_mode', None) is None:
+                # pool is not known to have mirroring enabled
+                continue
+
+            for mirror_image in pool.get('mirror_images', []):
+                image = {
+                    'pool_name': pool_name,
+                    'name': mirror_image['name']
+                }
+                health = mirror_image['health']
+
+                if health == 'syncing':
+                    matches = progress_re.findall(mirror_image['description'])
+                    image['progress'] = matches[0] if matches else 0
+                    image_syncing.append(image)
+                    continue
+
+                # 'ok' and every other health share the same fields
+                image['state_color'] = mirror_image['state_color']
+                image['state'] = mirror_image['state']
+                image['description'] = mirror_image['description']
+                if health == 'ok':
+                    image_ready.append(image)
+                else:
+                    image_error.append(image)
+
+            entry = {'name': pool_name}
+            entry.update(stats)
+            pools.append(entry)
+
+        return {
+            'daemons': daemons,
+            'pools' : pools,
+            'image_error': image_error,
+            'image_syncing': image_syncing,
+            'image_ready': image_ready
+        }
+
+    def get_pool_datum(self, pool_name):
+        """
+        Return the cached value for a pool, creating its PoolDatum cache
+        on first use.
+
+        :param pool_name: name of the pool
+        :return: second element of the tuple returned by PoolDatum.get()
+        """
+        if self.pool_data.get(pool_name, None) is None:
+            self.pool_data[pool_name] = PoolDatum(self._module, pool_name)
+
+        status, value = self.pool_data[pool_name].get()
+        return value
+
+class Controller:
+ # Wires together the caches used by the mirroring views.
+ def __init__(self, module_inst):
+ # daemon/pool health, shared by both caches created below
+ self.daemons_and_pools = DaemonsAndPools(module_inst)
+ # pool name -> PoolDatum; filled by ContentData.get_pool_datum
+ self.pool_data = {}
+ self.toplevel = Toplevel(module_inst, self.daemons_and_pools)
+ self.content_data = ContentData(module_inst, self.daemons_and_pools,
+ self.pool_data)
+
--- /dev/null
+{% extends "base.html" %}
+
+{% block content %}
+
+<script>
+    $(document).ready(function(){
+        // Pre-populated initial data at page load
+        var content_data = {{ content_data }};
+
+        // Poll the server for this pool's image list. The next poll is
+        // scheduled whether the request succeeded or failed, so a single
+        // failed request no longer stops the page from refreshing.
+        var refresh = function() {
+            $.get("/rbd_pool_data/" + content_data.pool_name + "/", function(data) {
+                content_data.images = data;
+            }).always(function() {
+                setTimeout(refresh, 10000);
+            });
+        };
+
+        console.log(content_data);
+
+        rivets.bind($("div#content"), content_data);
+        setTimeout(refresh, 10000);
+
+        $('#images').DataTable({
+            'paging'      : true,
+            'pageLength'  : 15,
+            'lengthChange': false,
+            'searching'   : true,
+            'ordering'    : true,
+            'info'        : false,
+            'autoWidth'   : false
+        });
+    });
+</script>
+
+
+<section class="content-header">
+ <h1>
+ Block Pool { pool_name }
+ </h1>
+</section>
+
+<section class="content">
+ <div class="box">
+ <div class="box-header">
+ <h3 class="box-title">Images</h3>
+ </div>
+ <div class="box-body">
+ <table id="images" class="table table-bordered">
+ <thead>
+ <tr>
+ <th>Name</th>
+ <th>Size</th>
+ <th>Objects</th>
+ <th>Object size</th>
+ <th>Features</th>
+ <th>Parent</th>
+ </tr>
+ </thead>
+ <tbody>
+ <tr rv-each-image="images">
+ <td>{image.name}</td>
+ <td>{image.size | dimless_binary}</td>
+ <td>{image.num_objs | dimless}</td>
+ <td>{image.obj_size | dimless_binary}</td>
+ <td>{image.features_name}</td>
+ <td>{image.parent}</td>
+ </tr>
+ </tbody>
+ </table>
+ </div>
+ </div>
+
+
+</section>
+<!-- /.content -->
+
+{% endblock %}
return strings.join(", ");
};
- rivets.formatters.short_version = function(version) {
- // Expect "ceph version 1.2.3-g9asdasd (as98d7a0s8d7)"
- var result = /ceph version\s+([^ ]+)\s+\(.+\)/.exec(version);
- if (result) {
- // Return the "1.2.3-g9asdasd" part
- return result[1];
- } else {
- // Unexpected format, pass it through
- return version;
- }
- return
- };
-
rivets.bind($("#content"), content_data);
});
"""
return ceph_state.get_server(self._handle, hostname)
+ def get_perf_schema(self, svc_type, svc_name):
+ """
+ Called by the plugin to fetch perf counter schema info.
+ Both svc_type and svc_name are forwarded unchanged to
+ ceph_state.get_perf_schema; an unset value acts as a wildcard.
+ NOTE(review): the original text said "nullptr"; confirm whether
+ None, an empty string, or both are accepted as the wildcard.
+
+ :param svc_type: service type to query, or the wildcard value
+ :param svc_name: service name to query, or the wildcard value
+ :return: schema describing the counters requested
+ (NOTE(review): previously documented as a list of dicts;
+ confirm the exact shape against ceph_state)
+ """
+ return ceph_state.get_perf_schema(self._handle, svc_type, svc_name)
+
def get_counter(self, svc_type, svc_name, path):
"""
Called by the plugin to fetch data for a particular perf counter
"""
ceph_state.send_command(self._handle, *args, **kwargs)
+ def set_health_checks(self, checks):
+ """
+ Set module's health checks
+
+ Set the module's current map of health checks. Argument is a
+ dict of check names to info, in this form:
+
+ {
+ 'CHECK_FOO': {
+ 'severity': 'warning', # or 'error'
+ 'summary': 'summary string',
+ 'detail': [ 'list', 'of', 'detail', 'strings' ],
+ },
+ 'CHECK_BAR': {
+ 'severity': 'error',
+ 'summary': 'bars are bad',
+ 'detail': [ 'too hard' ],
+ },
+ }
+
+ :param checks: dict of health check dicts
+ """
+ ceph_state.set_health_checks(self._handle, checks)
+
def handle_command(self, cmd):
"""
Called by ceph-mgr to request the plugin to handle one
--- /dev/null
+from module import * # NOQA
+
--- /dev/null
+import cherrypy
+import math
+import os
+import time
+from collections import OrderedDict
+from mgr_module import MgrModule
+
+# Defaults for the Prometheus HTTP server. Can also set in config-key
+# see https://github.com/prometheus/prometheus/wiki/Default-port-allocations
+# for Prometheus exporter port registry
+
+DEFAULT_ADDR = '::'
+DEFAULT_PORT = 9283
+
+
+# cherrypy likes to sys.exit on error.  don't let it take us down too!
+def os_exit_noop(*args, **kwargs):
+    """
+    Do-nothing replacement for os._exit.
+
+    Accepts and ignores any arguments: os._exit is called with an exit
+    status, so a zero-argument stub would raise TypeError when invoked.
+    """
+    pass
+
+
+os._exit = os_exit_noop
+
+
+# Holder for the running Module instance so that the nested Root class
+# can reach it.  A dict is mutated in place, so whoever stores the
+# instance does not need a 'global' declaration.
+
+_global_instance = {'plugin': None}
+
+
+def global_instance():
+    """Return the registered plugin instance; it must already be set."""
+    plugin = _global_instance['plugin']
+    assert plugin is not None
+    return plugin
+
+
+# counter value types
+PERFCOUNTER_TIME = 1
+PERFCOUNTER_U64 = 2
+
+# counter types
+PERFCOUNTER_LONGRUNAVG = 4
+PERFCOUNTER_COUNTER = 8
+PERFCOUNTER_HISTOGRAM = 0x10
+# Clear BOTH value-type bits (TIME | U64 == 3) so that only the counter
+# type remains; masking with ~2 left the TIME bit set and made every
+# time-valued counter fall through to the "unknown" case.
+PERFCOUNTER_TYPE_MASK = ~3
+
+
+def stattype_to_str(stattype):
+    """
+    Map a perf counter type bitmask to a metric type name.
+
+    :param stattype: integer combining one value-type bit with zero or
+                     one counter-type bit
+    :return: 'gauge', 'counter' or 'histogram'; '' when the counter-type
+             bits match none of the known types
+    """
+    typeonly = stattype & PERFCOUNTER_TYPE_MASK
+    if typeonly == 0:
+        return 'gauge'
+    if typeonly == PERFCOUNTER_LONGRUNAVG:
+        # this lie matches the DaemonState decoding: only val, no counts
+        return 'counter'
+    if typeonly == PERFCOUNTER_COUNTER:
+        return 'counter'
+    if typeonly == PERFCOUNTER_HISTOGRAM:
+        return 'histogram'
+
+    return ''
+
+
+class Metric(object):
+    """
+    One exported metric: its type, name, help text, optional label names
+    and the current values, keyed by label-value tuple.
+    """
+
+    def __init__(self, mtype, name, desc, labels=None):
+        self.mtype = mtype
+        self.name = name
+        self.desc = desc
+        self.labelnames = labels    # tuple if present
+        self.value = dict()         # indexed by label values
+
+    def set(self, value, labelvalues=None):
+        """Store value under labelvalues (a tuple); default key is ('',)."""
+        key = labelvalues if labelvalues else ('',)
+        self.value[key] = value
+
+    def str_expfmt(self):
+        """
+        Render the metric as text: a HELP line, a TYPE line and one
+        sample line per stored label-value tuple.
+        """
+
+        def promethize(path):
+            ''' replace illegal metric name characters '''
+            for illegal in ('.', '-'):
+                path = path.replace(illegal, '_')
+            return path
+
+        def floatstr(value):
+            ''' represent as Go-compatible float '''
+            if value == float('inf'):
+                return '+Inf'
+            elif value == float('-inf'):
+                return '-Inf'
+            elif math.isnan(value):
+                return 'NaN'
+            else:
+                return repr(float(value))
+
+        metric_name = promethize(self.name)
+        chunks = ['''
+# HELP {name} {desc}
+# TYPE {name} {mtype}'''.format(
+            name=metric_name,
+            desc=self.desc,
+            mtype=self.mtype,
+        )]
+
+        for labelvalues, value in self.value.items():
+            labels = ''
+            if self.labelnames:
+                pairs = zip(self.labelnames, labelvalues)
+                labels = ','.join('%s="%s"' % (k, v) for k, v in pairs)
+
+            # omit the braces entirely when there are no labels
+            template = '\n{name}{{{labels}}} {value}' if labels \
+                else '\n{name} {value}'
+            chunks.append(template.format(
+                name=metric_name,
+                labels=labels,
+                value=floatstr(value),
+            ))
+        return ''.join(chunks)
+
+
+class Module(MgrModule):
+    """
+    ceph-mgr plugin that serves the latest perf counter values as text
+    over a CherryPy HTTP server.
+    """
+
+    def __init__(self, *args, **kwargs):
+        super(Module, self).__init__(*args, **kwargs)
+        self.notified = False
+        self.serving = False
+        self.metrics = dict()
+        self.schema = OrderedDict()
+        _global_instance['plugin'] = self
+
+    def _get_ordered_schema(self, **kwargs):
+        '''
+        fetch an ordered-by-key performance counter schema
+        ['perf_schema'][daemontype.id][countername] with keys
+        'nick' (if present)
+        'description'
+        'type' (counter type....counter/gauge/avg/histogram/etc.)
+
+        Keyword arguments 'daemon_type' and 'daemon_id' select the
+        daemon; both default to ''.  The result is merged into
+        self.schema; nothing is returned.
+        '''
+
+        daemon_type = kwargs.get('daemon_type', '')
+        daemon_id = kwargs.get('daemon_id', '')
+
+        schema = self.get_perf_schema(daemon_type, daemon_id)
+        if not schema:
+            self.log.warning('_get_ordered_schema: no data')
+            return
+
+        new_schema = dict()
+        for k1 in schema.keys():    # 'perf_schema', but assume only one
+            for k2 in sorted(schema[k1].keys()):
+                sorted_dict = OrderedDict(
+                    sorted(schema[k1][k2].items(), key=lambda i: i[0])
+                )
+                new_schema[k2] = sorted_dict
+        for k in sorted(new_schema.keys()):
+            self.log.debug("updating schema for %s" % k)
+            self.schema[k] = new_schema[k]
+
+    def shutdown(self):
+        self.serving = False
+        pass
+
+    # XXX duplicated from dashboard; factor out?
+    def get_latest(self, daemon_type, daemon_name, stat):
+        """
+        Return the most recent value of a perf counter, or 0 when no
+        samples are available.
+        """
+        data = self.get_counter(daemon_type, daemon_name, stat)[stat]
+        if data:
+            return data[-1][1]
+        else:
+            return 0
+
+    def get_stat(self, daemon, path):
+        """
+        Record the latest value of one counter of one daemon in
+        self.metrics, creating the Metric on first use.
+
+        :param daemon: daemon name in '<type>.<id>' form
+        :param path: counter name, used as key into the daemon's schema
+        """
+
+        perfcounter = self.schema[daemon][path]
+        stattype = stattype_to_str(perfcounter['type'])
+        # XXX simplify first effort: no histograms
+        # averages are already collapsed to one value for us
+        if not stattype or stattype == 'histogram':
+            self.log.debug('ignoring %s, type %s' % (path, stattype))
+            return
+
+        if path not in self.metrics:
+            self.metrics[path] = Metric(
+                stattype,
+                path,
+                perfcounter['description'],
+                ('daemon',),
+            )
+
+        # split on the first '.' only: the id part may itself contain
+        # dots (e.g. when it is a host name), which would otherwise make
+        # the two-name unpacking raise ValueError
+        daemon_type, daemon_id = daemon.split('.', 1)
+
+        self.metrics[path].set(
+            self.get_latest(daemon_type, daemon_id, path),
+            (daemon,)
+        )
+
+    def collect(self):
+        """Refresh every known counter and return the metrics dict."""
+        for daemon in self.schema.keys():
+            for path in self.schema[daemon].keys():
+                self.get_stat(daemon, path)
+        return self.metrics
+
+    def notify(self, ntype, nid):
+        ''' Just try to sync and not run until we're notified once '''
+        if not self.notified:
+            self.serving = True
+            self.notified = True
+        if ntype == 'perf_schema_update':
+            # split on the first '.' only; see get_stat
+            daemon_type, daemon_id = nid.split('.', 1)
+            self._get_ordered_schema(
+                daemon_type=daemon_type,
+                daemon_id=daemon_id
+            )
+
+    def serve(self):
+        """
+        Wait for the first notification, then start the CherryPy server
+        on the configured address/port and block until the engine exits.
+        """
+
+        class Root(object):
+
+            # collapse everything to '/'
+            def _cp_dispatch(self, vpath):
+                cherrypy.request.path = ''
+                return self
+
+            def format_metrics(self, metrics):
+                formatted = ''
+                for m in metrics.values():
+                    formatted += m.str_expfmt()
+                return formatted + '\n'
+
+            @cherrypy.expose
+            def index(self):
+                metrics = global_instance().collect()
+                cherrypy.response.headers['Content-Type'] = 'text/plain'
+                if metrics:
+                    return self.format_metrics(metrics)
+
+        server_addr = self.get_localized_config('server_addr', DEFAULT_ADDR)
+        server_port = self.get_localized_config('server_port', DEFAULT_PORT)
+        self.log.info(
+            "server_addr: %s server_port: %s" %
+            (server_addr, server_port)
+        )
+        # wait for first notification (of any kind) to start up
+        while not self.serving:
+            time.sleep(1)
+
+        cherrypy.config.update({
+            'server.socket_host': server_addr,
+            # a value read from configuration may be a string, while the
+            # socket port must be an integer
+            'server.socket_port': int(server_port),
+            'engine.autoreload.on': False
+        })
+        cherrypy.tree.mount(Root(), "/")
+        cherrypy.engine.start()
+        cherrypy.engine.block()
return {
'api_version': 1,
'auth':
- 'Use "ceph tell mgr restful create_key <key>" to create a key pair, '
+ 'Use "ceph restful create-key <key>" to create a key pair, '
'pass it as HTTP Basic auth to authenticate',
'doc': 'See /doc endpoint',
'info': "Ceph Manager RESTful API server",
server_addr = self.get_localized_config('server_addr', '::')
if server_addr is None:
- raise RuntimeError('no server_addr configured; try "ceph config-key put mgr/restful/server_addr <ip>"')
+ raise RuntimeError('no server_addr configured; try "ceph config-key set mgr/restful/server_addr <ip>"')
server_port = int(self.get_localized_config('server_port', '8003'))
self.log.info('server_addr: %s server_port: %d',
server_addr, server_port)
else:
rank_table.add_row([
- rank, "failed", "", "", ""
+ rank, "failed", "", "", "", ""
])
# Find the standby replays
value = self.get_localized_config(key, default)
if value is None:
raise RuntimeError('Configuration key {0} not set; "ceph '
- 'config-key put mgr/zabbix/{0} '
+ 'config-key set mgr/zabbix/{0} '
'<value>"'.format(key))
self.set_config_option(key, value)
data = dict()
health = json.loads(self.get('health')['json'])
- data['overall_status'] = health['overall_status']
+ # 'status' is luminous+, 'overall_status' is legacy mode.
+ data['overall_status'] = health.get('status',
+ health.get('overall_status'))
data['overall_status_int'] = \
self.ceph_health_mapping.get(data['overall_status'])
int rados_cluster_stat(rados_t cluster, rados_cluster_stat_t *result)
int rados_cluster_fsid(rados_t cluster, char *buf, size_t len)
int rados_blacklist_add(rados_t cluster, char *client_address, uint32_t expire_seconds)
-
+ int rados_application_enable(rados_ioctx_t io, const char *app_name,
+ int force)
+ int rados_application_list(rados_ioctx_t io, char *values,
+ size_t *values_len)
+ int rados_application_metadata_get(rados_ioctx_t io, const char *app_name,
+ const char *key, char *value,
+ size_t *value_len)
+ int rados_application_metadata_set(rados_ioctx_t io, const char *app_name,
+ const char *key, const char *value)
+ int rados_application_metadata_remove(rados_ioctx_t io,
+ const char *app_name, const char *key)
+ int rados_application_metadata_list(rados_ioctx_t io,
+ const char *app_name, char *keys,
+ size_t *key_len, char *values,
+ size_t *value_len)
int rados_ping_monitor(rados_t cluster, const char *mon_id, char **outstr, size_t *outstrlen)
int rados_mon_command(rados_t cluster, const char **cmd, size_t cmdlen,
const char *inbuf, size_t inbuflen,
if ret < 0:
raise make_ex(ret, "Ioctx.rados_lock_exclusive(%s): failed to set lock %s on %s" % (self.name, name, key))
+ def application_enable(self, app_name, force=False):
+ """
+ Enable an application on an OSD pool
+
+ :param app_name: application name
+ :type app_name: str
+ :param force: False if only a single app should exist per pool
+ :type force: bool
+
+ :raises: :class:`Error`
+ """
+ app_name = cstr(app_name, 'app_name')
+ cdef:
+ char *_app_name = app_name
+ # the C API takes the flag as an int
+ int _force = (1 if force else 0)
+
+ with nogil:
+ ret = rados_application_enable(self.io, _app_name, _force)
+ if ret < 0:
+ raise make_ex(ret, "error enabling application")
+
+ def application_list(self):
+ """
+ Returns a list of enabled applications
+
+ :returns: list of app name string, or None when the underlying
+ call reports ENOENT
+ :raises: :class:`Error` on any other failure
+ """
+ cdef:
+ size_t length = 128
+ char *apps = NULL
+
+ try:
+ while True:
+ apps = <char *>realloc_chk(apps, length)
+ with nogil:
+ ret = rados_application_list(self.io, apps, &length)
+ if ret == 0:
+ # names are NUL-separated; empty fragments are dropped
+ return [decode_cstr(app) for app in
+ apps[:length].split(b'\0') if app]
+ elif ret == -errno.ENOENT:
+ return None
+ elif ret == -errno.ERANGE:
+ # buffer too small: retry; presumably the call has stored the
+ # required size in length -- TODO confirm
+ pass
+ else:
+ raise make_ex(ret, "error listing applications")
+ finally:
+ free(apps)
+
+ def application_metadata_set(self, app_name, key, value):
+ """
+ Sets application metadata on an OSD pool
+
+ :param app_name: application name
+ :type app_name: str
+ :param key: metadata key
+ :type key: str
+ :param value: metadata value
+ :type value: str
+
+ :raises: :class:`Error`
+ """
+ app_name = cstr(app_name, 'app_name')
+ key = cstr(key, 'key')
+ value = cstr(value, 'value')
+ # C pointers into the converted strings for use inside the nogil block
+ cdef:
+ char *_app_name = app_name
+ char *_key = key
+ char *_value = value
+
+ with nogil:
+ ret = rados_application_metadata_set(self.io, _app_name, _key,
+ _value)
+ if ret < 0:
+ raise make_ex(ret, "error setting application metadata")
+
+ def application_metadata_remove(self, app_name, key):
+ """
+ Remove application metadata from an OSD pool
+
+ :param app_name: application name
+ :type app_name: str
+ :param key: metadata key
+ :type key: str
+
+ :raises: :class:`Error`
+ """
+ app_name = cstr(app_name, 'app_name')
+ key = cstr(key, 'key')
+ # C pointers into the converted strings for use inside the nogil block
+ cdef:
+ char *_app_name = app_name
+ char *_key = key
+
+ with nogil:
+ ret = rados_application_metadata_remove(self.io, _app_name, _key)
+ if ret < 0:
+ raise make_ex(ret, "error removing application metadata")
+
+    def application_metadata_list(self, app_name):
+        """
+        Returns the metadata key/value pairs of an application on an OSD
+        pool
+
+        :param app_name: application name
+        :type app_name: str
+        :returns: list of key/value tuples
+        :raises: :class:`Error`
+        """
+        app_name = cstr(app_name, 'app_name')
+        cdef:
+            char *_app_name = app_name
+            size_t key_length = 128
+            size_t val_length = 128
+            char *c_keys = NULL
+            char *c_vals = NULL
+
+        try:
+            while True:
+                c_keys = <char *>realloc_chk(c_keys, key_length)
+                c_vals = <char *>realloc_chk(c_vals, val_length)
+                with nogil:
+                    ret = rados_application_metadata_list(self.io, _app_name,
+                                                          c_keys, &key_length,
+                                                          c_vals, &val_length)
+                if ret == 0:
+                    # Keep empty fragments: dropping an empty value would
+                    # shift every later value onto the wrong key.  Each
+                    # buffer ends with a separator, so the split yields
+                    # one trailing empty fragment per list; the resulting
+                    # bogus ('', '') pair is cut off below.
+                    keys = [decode_cstr(key) for key in
+                                c_keys[:key_length].split(b'\0')]
+                    vals = [decode_cstr(val) for val in
+                                c_vals[:val_length].split(b'\0')]
+                    return list(zip(keys, vals))[:-1]
+                elif ret == -errno.ERANGE:
+                    # buffers too small: retry with the lengths now held
+                    # in key_length / val_length
+                    pass
+                else:
+                    raise make_ex(ret, "error listing application metadata")
+        finally:
+            free(c_keys)
+            free(c_vals)
+
def set_object_locator(func):
def retfunc(self, *args, **kwargs):
def trash_move(self, ioctx, name, delay=0):
"""
- Moves an RBD image to the trash.
+ Move an RBD image to the trash.
:param ioctx: determines which RADOS pool the image is in
:type ioctx: :class:`rados.Ioctx`
:param name: the name of the image to remove
def trash_remove(self, ioctx, image_id, force=False):
"""
- Deletes an RBD image from trash. If image deferment time has not
+ Delete an RBD image from trash. If image deferment time has not
expired :class:`PermissionError` is raised.
:param ioctx: determines which RADOS pool the image is in
:type ioctx: :class:`rados.Ioctx`
def trash_list(self, ioctx):
"""
- Lists all entries from trash.
+ List all entries from trash.
:param ioctx: determines which RADOS pool the image is in
:type ioctx: :class:`rados.Ioctx`
:returns: :class:`TrashIterator`
def trash_restore(self, ioctx, image_id, name):
"""
- Restores an RBD image from trash.
+ Restore an RBD image from trash.
:param ioctx: determines which RADOS pool the image is in
:type ioctx: :class:`rados.Ioctx`
:param image_id: the id of the image to restore
def features(self):
"""
- Gets the features bitmask of the image.
+ Get the features bitmask of the image.
:returns: int - the features bitmask of the image
"""
def update_features(self, features, enabled):
"""
- Updates the features bitmask of the image by enabling/disabling
+ Update the features bitmask of the image by enabling/disabling
a single feature. The feature must support the ability to be
dynamically enabled/disabled.
def overlap(self):
"""
- Gets the number of overlapping bytes between the image and its parent
+ Get the number of overlapping bytes between the image and its parent
image. If open to a snapshot, returns the overlap between the snapshot
and the parent image.
def flags(self):
"""
- Gets the flags bitmask of the image.
+ Get the flags bitmask of the image.
:returns: int - the flags bitmask of the image
"""
def is_exclusive_lock_owner(self):
"""
- Gets the status of the image exclusive lock.
+ Get the status of the image exclusive lock.
:returns: bool - true if the image is exclusively locked
"""
def stripe_unit(self):
"""
- Returns the stripe unit used for the image.
+ Return the stripe unit used for the image.
"""
cdef uint64_t stripe_unit
with nogil:
def stripe_count(self):
"""
- Returns the stripe count used for the image.
+ Return the stripe count used for the image.
"""
cdef uint64_t stripe_count
with nogil:
def create_timestamp(self):
"""
- Returns the create timestamp for the image.
+ Return the create timestamp for the image.
"""
cdef:
timespec timestamp
def rebuild_object_map(self):
"""
- Rebuilds the object map for the image HEAD or currently set snapshot
+ Rebuild the object map for the image HEAD or currently set snapshot
"""
cdef librbd_progress_fn_t prog_cb = &no_op_progress_callback
with nogil:
auto expire_s = cct->_conf->rgw_nfs_namespace_expire_secs;
/* delay between gc cycles */
- auto delay_s = std::max(1, std::min(MIN_EXPIRE_S, expire_s/2));
+ auto delay_s = std::max(int64_t(1), std::min(int64_t(MIN_EXPIRE_S), expire_s/2));
unique_lock uniq(mtx);
restart:
cout << " bucket rm remove bucket\n";
cout << " bucket check check bucket index\n";
cout << " bucket reshard reshard bucket\n";
+ cout << " bucket sync disable disable bucket sync\n";
+ cout << " bucket sync enable enable bucket sync\n";
cout << " bi get retrieve bucket index object entries\n";
cout << " bi put store bucket index object entries\n";
cout << " bi list list raw bucket index entries\n";
cout << " --access=<access> Set access permissions for sub-user, should be one\n";
cout << " of read, write, readwrite, full\n";
cout << " --display-name=<name>\n";
- cout << " --max_buckets max number of buckets for a user\n";
+ cout << " --max-buckets max number of buckets for a user\n";
cout << " --admin set the admin flag on the user\n";
cout << " --system set the system flag on the user\n";
cout << " --bucket=<bucket>\n";
cout << " --tags-add=<list> list of tags to add for zonegroup placement modify command\n";
cout << " --tags-rm=<list> list of tags to remove for zonegroup placement modify command\n";
cout << " --endpoints=<list> zone endpoints\n";
- cout << " --index_pool=<pool> placement target index pool\n";
+ cout << " --index-pool=<pool> placement target index pool\n";
cout << " --data-pool=<pool> placement target data pool\n";
cout << " --data-extra-pool=<pool> placement target data extra (non-ec) pool\n";
cout << " --placement-index-type=<type>\n";
OPT_BUCKET_SYNC_STATUS,
OPT_BUCKET_SYNC_INIT,
OPT_BUCKET_SYNC_RUN,
+ OPT_BUCKET_SYNC_DISABLE,
+ OPT_BUCKET_SYNC_ENABLE,
OPT_BUCKET_RM,
OPT_BUCKET_REWRITE,
OPT_BUCKET_RESHARD,
} else if (prev_prev_cmd && strcmp(prev_prev_cmd, "bucket") == 0) {
if (strcmp(prev_cmd, "sync") == 0) {
if (strcmp(cmd, "status") == 0)
- return OPT_BUCKET_SYNC_STATUS;
+ return OPT_BUCKET_SYNC_STATUS;
if (strcmp(cmd, "init") == 0)
- return OPT_BUCKET_SYNC_INIT;
+ return OPT_BUCKET_SYNC_INIT;
if (strcmp(cmd, "run") == 0)
- return OPT_BUCKET_SYNC_RUN;
+ return OPT_BUCKET_SYNC_RUN;
+ if (strcmp(cmd, "disable") == 0)
+ return OPT_BUCKET_SYNC_DISABLE;
+ if (strcmp(cmd, "enable") == 0)
+ return OPT_BUCKET_SYNC_ENABLE;
} else if ((strcmp(prev_cmd, "limit") == 0) &&
(strcmp(cmd, "check") == 0)) {
return OPT_BUCKET_LIMIT_CHECK;
return 0;
}
+/**
+ * Enable or disable data sync for a single bucket.
+ *
+ * Sets or clears BUCKET_DATASYNC_DISABLED in the bucket's flags, stores the
+ * updated bucket instance info, writes a stop/resync marker into the bucket
+ * index log and finally adds a data log entry for every shard.
+ *
+ * @param store        RGW store used for all operations
+ * @param opt_cmd      OPT_BUCKET_SYNC_ENABLE or OPT_BUCKET_SYNC_DISABLE
+ * @param tenant_name  tenant owning the bucket
+ * @param bucket_name  name of the bucket
+ * @return 0 on success, negative error code on failure
+ */
+int set_bucket_sync_enabled(RGWRados *store, int opt_cmd, const string& tenant_name, const string& bucket_name)
+{
+  RGWBucketInfo bucket_info;
+  map<string, bufferlist> attrs;
+  RGWObjectCtx obj_ctx(store);
+
+  int r = store->get_bucket_info(obj_ctx, tenant_name, bucket_name, bucket_info, NULL, &attrs);
+  if (r < 0) {
+    cerr << "could not get bucket info for bucket=" << bucket_name << ": " << cpp_strerror(-r) << std::endl;
+    /* keep the error negative: the caller tests (ret < 0), and every other
+     * failure path in this function returns the negative code as well */
+    return r;
+  }
+
+  if (opt_cmd == OPT_BUCKET_SYNC_ENABLE) {
+    bucket_info.flags &= ~BUCKET_DATASYNC_DISABLED;
+  } else if (opt_cmd == OPT_BUCKET_SYNC_DISABLE) {
+    bucket_info.flags |= BUCKET_DATASYNC_DISABLED;
+  }
+
+  r = store->put_bucket_instance_info(bucket_info, false, real_time(), &attrs);
+  if (r < 0) {
+    cerr << "ERROR: failed writing bucket instance info: " << cpp_strerror(-r) << std::endl;
+    return r;
+  }
+
+  /* an unsharded bucket (num_shards == 0) is logged once with shard id -1 */
+  int shards_num = bucket_info.num_shards? bucket_info.num_shards : 1;
+  int shard_id = bucket_info.num_shards? 0 : -1;
+
+  if (opt_cmd == OPT_BUCKET_SYNC_DISABLE) {
+    r = store->stop_bi_log_entries(bucket_info, -1);
+    if (r < 0) {
+      lderr(store->ctx()) << "ERROR: failed writing stop bilog" << dendl;
+      return r;
+    }
+  } else {
+    r = store->resync_bi_log_entries(bucket_info, -1);
+    if (r < 0) {
+      lderr(store->ctx()) << "ERROR: failed writing resync bilog" << dendl;
+      return r;
+    }
+  }
+
+  /* record the change in the data log, one entry per shard */
+  for (int i = 0; i < shards_num; ++i, ++shard_id) {
+    r = store->data_log->add_entry(bucket_info.bucket, shard_id);
+    if (r < 0) {
+      lderr(store->ctx()) << "ERROR: failed writing data log" << dendl;
+      return r;
+    }
+  }
+
+  return 0;
+}
+
+
/// search for a matching zone/zonegroup id and return a connection if found
static boost::optional<RGWRESTConn> get_remote_conn(RGWRados *store,
const RGWZoneGroup& zonegroup,
}
}
+ if ((opt_cmd == OPT_BUCKET_SYNC_DISABLE) || (opt_cmd == OPT_BUCKET_SYNC_ENABLE)) {
+ if (bucket_name.empty()) {
+ cerr << "ERROR: bucket not specified" << std::endl;
+ return EINVAL;
+ }
+
+ if (ret < 0) {
+ cerr << "could not init realm " << ": " << cpp_strerror(-ret) << std::endl;
+ return ret;
+ }
+ RGWPeriod period;
+ ret = period.init(g_ceph_context, store, realm_id, realm_name, true);
+ if (ret < 0) {
+ cerr << "failed to init period " << ": " << cpp_strerror(-ret) << std::endl;
+ return ret;
+ }
+
+ if (!store->is_meta_master()) {
+ cerr << "failed to update bucket sync: only allowed on meta master zone " << std::endl;
+ cerr << period.get_master_zone() << " | " << period.get_realm() << std::endl;
+ return EINVAL;
+ }
+
+ rgw_obj obj(bucket, object);
+ ret = set_bucket_sync_enabled(store, opt_cmd, tenant, bucket_name);
+ if (ret < 0)
+ return -ret;
+}
+
if (opt_cmd == OPT_BUCKET_SYNC_STATUS) {
if (source_zone.empty()) {
cerr << "ERROR: source zone not specified" << std::endl;
"torrent",
"uploadId",
"uploads",
- "start-date",
- "end-date",
"versionId",
"versioning",
"versions",
if (ret < 0)
return ret;
- return bucket.remove(op_state, bypass_gc, keep_index_consistent);
+ std::string err_msg;
+ ret = bucket.remove(op_state, bypass_gc, keep_index_consistent, &err_msg);
+ if (!err_msg.empty()) {
+ lderr(store->ctx()) << "ERROR: " << err_msg << dendl;
+ }
+ return ret;
}
int RGWBucketAdminOp::remove_object(RGWRados *store, RGWBucketAdminOpState& op_state)
bci.info.placement_rule = old_bci.info.placement_rule;
}
+ if (exists && old_bci.info.datasync_flag_enabled() != bci.info.datasync_flag_enabled()) {
+ int shards_num = bci.info.num_shards? bci.info.num_shards : 1;
+ int shard_id = bci.info.num_shards? 0 : -1;
+
+ if (!bci.info.datasync_flag_enabled()) {
+ ret = store->stop_bi_log_entries(bci.info, -1);
+ if (ret < 0) {
+ lderr(store->ctx()) << "ERROR: failed writing bilog" << dendl;
+ return ret;
+ }
+ } else {
+ ret = store->resync_bi_log_entries(bci.info, -1);
+ if (ret < 0) {
+ lderr(store->ctx()) << "ERROR: failed writing bilog" << dendl;
+ return ret;
+ }
+ }
+
+ for (int i = 0; i < shards_num; ++i, ++shard_id) {
+ ret = store->data_log->add_entry(bci.info.bucket, shard_id);
+ if (ret < 0) {
+ lderr(store->ctx()) << "ERROR: failed writing data log" << dendl;
+ return ret;
+ }
+ }
+ }
+
// are we actually going to perform this put, or is it too old?
if (exists &&
!check_versions(old_bci.info.objv_tracker.read_version, orig_mtime,
{ ERR_INVALID_DIGEST, {400, "InvalidDigest" }},
{ ERR_BAD_DIGEST, {400, "BadDigest" }},
{ ERR_INVALID_LOCATION_CONSTRAINT, {400, "InvalidLocationConstraint" }},
+ { ERR_ZONEGROUP_DEFAULT_PLACEMENT_MISCONFIGURATION, {400, "ZonegroupDefaultPlacementMisconfiguration" }},
{ ERR_INVALID_BUCKET_NAME, {400, "InvalidBucketName" }},
{ ERR_INVALID_OBJECT_NAME, {400, "InvalidObjectName" }},
{ ERR_UNRESOLVABLE_EMAIL, {400, "UnresolvableGrantByEmailAddress" }},
{ ERR_MALFORMED_XML, {400, "MalformedXML" }},
{ ERR_AMZ_CONTENT_SHA256_MISMATCH, {400, "XAmzContentSHA256Mismatch" }},
{ ERR_INVALID_TAG, {400, "InvalidTag"}},
+ { ERR_MALFORMED_ACL_ERROR, {400, "MalformedACLError" }},
{ ERR_LENGTH_REQUIRED, {411, "MissingContentLength" }},
{ EACCES, {403, "AccessDenied" }},
{ EPERM, {403, "AccessDenied" }},
content_started = false;
format = 0;
formatter = NULL;
- bucket_acl = NULL;
- object_acl = NULL;
expect_cont = false;
obj_size = 0;
req_state::~req_state() {
delete formatter;
- delete bucket_acl;
- delete object_acl;
}
bool search_err(rgw_http_errors& errs, int err_no, bool is_website_redirect, int& http_ret, string& code)
*dest = buf;
}
+
+/* Render a utime_t through utime_t::asctime() and hand back the text. */
+string rgw_to_asctime(const utime_t& t)
+{
+  stringstream formatted;
+  t.asctime(formatted);
+  return formatted.str();
+}
+
/*
* calculate the sha1 value of a given msg and key
*/
return false;
return verify_bucket_permission_no_policy(s,
- s->user_acl.get(),
- s->bucket_acl,
- perm);
+ s->user_acl.get(),
+ s->bucket_acl.get(),
+ perm);
}
bool verify_bucket_permission(struct req_state * const s, const uint64_t op)
{
return verify_bucket_permission(s,
- s->bucket,
+ s->bucket,
s->user_acl.get(),
- s->bucket_acl,
- s->iam_policy,
+ s->bucket_acl.get(),
+ s->iam_policy,
op);
}
if (!verify_requester_payer_permission(s))
return false;
- return verify_object_permission_no_policy(s, s->user_acl.get(),
- s->bucket_acl, s->object_acl,
- perm);
+ return verify_object_permission_no_policy(s,
+ s->user_acl.get(),
+ s->bucket_acl.get(),
+ s->object_acl.get(),
+ perm);
}
bool verify_object_permission(struct req_state *s, uint64_t op)
{
return verify_object_permission(s,
- rgw_obj(s->bucket, s->object),
- s->user_acl.get(),
- s->bucket_acl,
- s->object_acl,
- s->iam_policy,
+ rgw_obj(s->bucket, s->object),
+ s->user_acl.get(),
+ s->bucket_acl.get(),
+ s->object_acl.get(),
+ s->iam_policy,
op);
}
#define ERR_TAG_CONFLICT 2209
#define ERR_INVALID_TAG 2210
#define ERR_ZERO_IN_URL 2211
+#define ERR_MALFORMED_ACL_ERROR 2212
+#define ERR_ZONEGROUP_DEFAULT_PLACEMENT_MISCONFIGURATION 2213
#define ERR_BUSY_RESHARDING 2300
BUCKET_SUSPENDED = 0x1,
BUCKET_VERSIONED = 0x2,
BUCKET_VERSIONS_SUSPENDED = 0x4,
+ BUCKET_DATASYNC_DISABLED = 0X8,
};
enum RGWBucketIndexType {
bool versioned() const { return (flags & BUCKET_VERSIONED) != 0; }
int versioning_status() { return flags & (BUCKET_VERSIONED | BUCKET_VERSIONS_SUSPENDED); }
bool versioning_enabled() { return versioning_status() == BUCKET_VERSIONED; }
+ bool datasync_flag_enabled() const { return (flags & BUCKET_DATASYNC_DISABLED) == 0; }
bool has_swift_versioning() const {
/* A bucket may be versioned through one mechanism only. */
} auth;
std::unique_ptr<RGWAccessControlPolicy> user_acl;
- RGWAccessControlPolicy *bucket_acl;
- RGWAccessControlPolicy *object_acl;
+ std::unique_ptr<RGWAccessControlPolicy> bucket_acl;
+ std::unique_ptr<RGWAccessControlPolicy> object_acl;
rgw::IAM::Environment env;
boost::optional<rgw::IAM::Policy> iam_policy;
extern void rgw_to_iso8601(const real_time& t, char *dest, int buf_size);
extern void rgw_to_iso8601(const real_time& t, string *dest);
+extern std::string rgw_to_asctime(const utime_t& t);
/** Check if the req_state's user has the necessary permissions
* to do the requested action */
{
}
-
RGWRadosSetOmapKeysCR::RGWRadosSetOmapKeysCR(RGWRados *_store,
const rgw_raw_obj& _obj,
map<string, bufferlist>& _entries) : RGWSimpleCoroutine(_store->ctx()),
return req->get_ret_status();
}
-
int RGWOmapAppend::operate() {
reenter(this) {
for (;;) {
const string& _name, const string& _cookie);
};
-
template <class T>
class RGWSimpleRadosReadCR : public RGWSimpleCoroutine {
RGWAsyncRadosProcessor *async_rados;
string bucket_ver;
string master_ver;
string max_marker;
+ bool syncstopped{false};
void decode_json(JSONObj *obj) {
JSONDecoder::decode_json("bucket_ver", bucket_ver, obj);
JSONDecoder::decode_json("master_ver", master_ver, obj);
JSONDecoder::decode_json("max_marker", max_marker, obj);
+ JSONDecoder::decode_json("syncstopped", syncstopped, obj);
}
};
return set_cr_error(retcode);
}
yield {
- status.state = rgw_bucket_shard_sync_info::StateFullSync;
- status.inc_marker.position = info.max_marker;
- map<string, bufferlist> attrs;
- status.encode_all_attrs(attrs);
auto store = sync_env->store;
- call(new RGWSimpleRadosWriteAttrsCR(sync_env->async_rados, store,
- rgw_raw_obj(store->get_zone_params().log_pool, sync_status_oid),
- attrs));
+ rgw_raw_obj obj(store->get_zone_params().log_pool, sync_status_oid);
+
+ if (info.syncstopped) {
+ call(new RGWRadosRemoveCR(store, obj));
+ } else {
+ status.state = rgw_bucket_shard_sync_info::StateFullSync;
+ status.inc_marker.position = info.max_marker;
+ map<string, bufferlist> attrs;
+ status.encode_all_attrs(attrs);
+ call(new RGWSimpleRadosWriteAttrsCR(sync_env->async_rados, store, obj, attrs));
+ }
}
return set_cr_done();
}
bool updated_status{false};
const string& status_oid;
const string& zone_id;
+ ceph::real_time sync_modify_time;
string cur_id;
RGWDataSyncDebugLogger logger;
int sync_status{0};
+ bool syncstopped{false};
public:
RGWBucketShardIncrementalSyncCR(RGWDataSyncEnv *_sync_env,
drain_all();
return set_cr_error(-ECANCELED);
}
- ldout(sync_env->cct, 20) << __func__ << "(): listing bilog for incremental sync" << dendl;
+ ldout(sync_env->cct, 20) << __func__ << "(): listing bilog for incremental sync" << inc_marker.position << dendl;
set_status() << "listing bilog; position=" << inc_marker.position;
yield call(new RGWListBucketIndexLogCR(sync_env, bs, inc_marker.position,
&list_result));
- if (retcode < 0 && retcode != -ENOENT) {
- /* wait for all operations to complete */
+ if (retcode < 0 && retcode != -ENOENT ) {
drain_all();
- return set_cr_error(retcode);
+ if (!syncstopped) {
+ /* wait for all operations to complete */
+ return set_cr_error(retcode);
+ } else {
+ /* no need to retry */
+ break;
+ }
}
squash_map.clear();
for (auto& e : list_result) {
+ if (e.op == RGWModifyOp::CLS_RGW_OP_SYNCSTOP && (sync_modify_time < e.timestamp)) {
+ ldout(sync_env->cct, 20) << " syncstop on " << e.timestamp << dendl;
+ sync_modify_time = e.timestamp;
+ syncstopped = true;
+ continue;
+ }
+ if (e.op == RGWModifyOp::CLS_RGW_OP_RESYNC && (sync_modify_time < e.timestamp)) {
+ ldout(sync_env->cct, 20) << " resync on " << e.timestamp << dendl;
+ sync_modify_time = e.timestamp;
+ syncstopped = false;
+ continue;
+ }
if (e.state != CLS_RGW_STATE_COMPLETE) {
continue;
}
squash_entry = make_pair<>(e.timestamp, e.op);
}
}
+
entries_iter = list_result.begin();
for (; entries_iter != list_result.end(); ++entries_iter) {
if (!lease_cr->is_locked()) {
}
inc_marker.position = cur_id;
+ if (entry->op == RGWModifyOp::CLS_RGW_OP_SYNCSTOP || entry->op == RGWModifyOp::CLS_RGW_OP_RESYNC) {
+ ldout(sync_env->cct, 20) << "detected syncstop or resync on " << entries_iter->timestamp << " , skipping entry" << dendl;
+ marker_tracker.try_update_high_marker(cur_id, 0, entry->timestamp);
+ continue;
+ }
+
if (!key.set(rgw_obj_index_key{entry->object, entry->instance})) {
set_status() << "parse_raw_oid() on " << entry->object << " returned false, skipping entry";
ldout(sync_env->cct, 20) << "parse_raw_oid() on " << entry->object << " returned false, skipping entry" << dendl;
}
} while (!list_result.empty() && sync_status == 0);
+ if (syncstopped) {
+ drain_all();
+
+ yield {
+ const string& oid = RGWBucketSyncStatusManager::status_oid(sync_env->source_zone, bs);
+ RGWRados *store = sync_env->store;
+ call(new RGWRadosRemoveCR(store, rgw_raw_obj{store->get_zone_params().log_pool, oid}));
+ }
+ lease_cr->abort();
+ return set_cr_done();
+ }
+
while (num_spawned()) {
yield wait_for_child();
bool again = true;
= get_context()->_conf->rgw_nfs_namespace_expire_secs;
/* max events to gc in one cycle */
- uint32_t max_ev =
- std::max(1, get_context()->_conf->rgw_nfs_max_gc);
+ uint32_t max_ev = get_context()->_conf->rgw_nfs_max_gc;
struct timespec now, expire_ts;
event_vector ve;
return rc;
}
+/*
+ * project offset of dirent name
+ *
+ * Resolves parent_fh to its RGWFileHandle and asks it for the offset of
+ * the entry called name; the result is stored in *offset.
+ *
+ * Returns 0 on success, -EINVAL when the parent handle cannot be resolved,
+ * when name or offset is NULL, or when offset_of() rejects the request.
+ */
+int rgw_dirent_offset(struct rgw_fs *rgw_fs,
+                      struct rgw_file_handle *parent_fh,
+                      const char *name, int64_t *offset,
+                      uint32_t flags)
+{
+  RGWFileHandle* parent = get_rgwfh(parent_fh);
+  if ((! parent)) {
+    /* bad parent */
+    return -EINVAL;
+  }
+  if ((! name) || (! offset)) {
+    /* building a std::string from a null pointer is undefined, and
+     * offset is written through unconditionally below */
+    return -EINVAL;
+  }
+  std::string sname{name};
+  int rc = parent->offset_of(sname, offset, flags);
+  return rc;
+}
+
/*
read data from file
*/
return nullptr;
}
+  /* Store the XXH64 hash of name (seeded with fh_key::seed) in *offset.
+   * Only valid on directory handles: anything else yields -EINVAL.
+   * The flags argument is not consulted here. */
+  int offset_of(const std::string& name, int64_t *offset, uint32_t flags) {
+    const bool dir = is_dir();
+    if (unlikely(! dir)) {
+      return -EINVAL;
+    }
+    const auto hashed = XXH64(name.c_str(), name.length(), fh_key::seed);
+    *offset = hashed;
+    return 0;
+  }
+
bool is_open() const { return flags & FLAG_OPEN; }
bool is_root() const { return flags & FLAG_ROOT; }
bool is_bucket() const { return flags & FLAG_BUCKET; }
cct = _cct;
store = _store;
- max_objs = min(cct->_conf->rgw_gc_max_objs, rgw_shards_max());
+ max_objs = min(static_cast<int>(cct->_conf->rgw_gc_max_objs), rgw_shards_max());
obj_names = new string[max_objs];
#include <utility>
#include <boost/regex.hpp>
-
+#include <iostream>
#include "rapidjson/reader.h"
#include "rgw_auth.h"
bool arraying = false;
bool objecting = false;
+ bool cond_ifexists = false;
void reset();
CephContext* cct;
const string& tenant;
Policy& policy;
+ std::set<TokenID> v;
uint32_t seen = 0;
}
void set(TokenID in) {
seen |= dex(in);
+ if (in == TokenID::Sid || in == TokenID::Effect || in == TokenID::Principal || in == TokenID::NotPrincipal ||
+ in == TokenID::Action || in == TokenID::NotAction || in == TokenID::Resource || in == TokenID::NotResource ||
+ in == TokenID::Condition || in == TokenID::AWS || in == TokenID::Federated || in == TokenID::Service ||
+ in == TokenID::CanonicalUser) {
+ v.insert(in);
+ }
}
void set(std::initializer_list<TokenID> l) {
for (auto in : l) {
seen |= dex(in);
+ if (in == TokenID::Sid || in == TokenID::Effect || in == TokenID::Principal || in == TokenID::NotPrincipal ||
+ in == TokenID::Action || in == TokenID::NotAction || in == TokenID::Resource || in == TokenID::NotResource ||
+ in == TokenID::Condition || in == TokenID::AWS || in == TokenID::Federated || in == TokenID::Service ||
+ in == TokenID::CanonicalUser) {
+ v.insert(in);
+ }
}
}
void reset(TokenID in) {
seen &= ~dex(in);
+ if (in == TokenID::Sid || in == TokenID::Effect || in == TokenID::Principal || in == TokenID::NotPrincipal ||
+ in == TokenID::Action || in == TokenID::NotAction || in == TokenID::Resource || in == TokenID::NotResource ||
+ in == TokenID::Condition || in == TokenID::AWS || in == TokenID::Federated || in == TokenID::Service ||
+ in == TokenID::CanonicalUser) {
+ v.erase(in);
+ }
}
void reset(std::initializer_list<TokenID> l) {
for (auto in : l) {
seen &= ~dex(in);
+ if (in == TokenID::Sid || in == TokenID::Effect || in == TokenID::Principal || in == TokenID::NotPrincipal ||
+ in == TokenID::Action || in == TokenID::NotAction || in == TokenID::Resource || in == TokenID::NotResource ||
+ in == TokenID::Condition || in == TokenID::AWS || in == TokenID::Federated || in == TokenID::Service ||
+ in == TokenID::CanonicalUser) {
+ v.erase(in);
+ }
+ }
+ }
+  /* Clear the seen bit of every token in the given set and drop those
+   * tokens from the member set.
+   *
+   * The parameter is a by-value copy that shadows the member v. Erasing
+   * from the parameter inside the range-for would invalidate the iterator
+   * the loop is using and would leave the member set untouched, so the
+   * erase is directed at this->v while the copy is iterated. */
+  void reset(std::set<TokenID> v) {
+    for (auto in : v) {
+      seen &= ~dex(in);
+      this->v.erase(in);
     }
   }
if (s.empty()) {
return false;
}
-
return s.back().obj_end();
}
bool Key(const char* str, SizeType length, bool copy) {
if (s.empty()) {
return false;
}
-
return s.back().key(str, length);
}
if (s.empty()) {
return false;
}
-
return s.back().do_string(cct, str, length);
}
bool RawNumber(const char* str, SizeType length, bool copy) {
}
bool ParseState::key(const char* s, size_t l) {
- auto k = pp->tokens.lookup(s, l);
+ auto token_len = l;
+ bool ifexists = false;
+ if (w->id == TokenID::Condition && w->kind == TokenKind::statement) {
+ static constexpr char IfExists[] = "IfExists";
+ if (boost::algorithm::ends_with(boost::string_view{s, l}, IfExists)) {
+ ifexists = true;
+ token_len -= sizeof(IfExists)-1;
+ }
+ }
+ auto k = pp->tokens.lookup(s, token_len);
if (!k) {
if (w->kind == TokenKind::cond_op) {
auto& t = pp->policy.statements.back();
pp->s.emplace_back(pp, cond_key);
- t.conditions.emplace_back(w->id, s, l);
+ t.conditions.emplace_back(w->id, s, l, cond_ifexists);
return true;
} else {
return false;
// If the token we're going with belongs within the condition at the
// top of the stack and we haven't already encountered it, push it
// on the stack
-
// Top
if ((((w->id == TokenID::Top) && (k->kind == TokenKind::top)) ||
// Statement
} else if ((w->id == TokenID::Condition) &&
(k->kind == TokenKind::cond_op)) {
pp->s.emplace_back(pp, k);
+ pp->s.back().cond_ifexists = ifexists;
return true;
}
return false;
}
void ParseState::reset() {
- pp->reset({TokenID::Sid, TokenID::Effect, TokenID::Principal,
- TokenID::NotPrincipal, TokenID::Action, TokenID::NotAction,
- TokenID::Resource, TokenID::NotResource, TokenID::Condition});
+ pp->reset(pp->v);
}
bool ParseState::obj_start() {
}
if (i == env.end()) {
- return false;
+ return ifexists;
}
const auto& s = i->second;
#include "fnmatch.h"
+#include "rgw_acl.h"
#include "rgw_basic_types.h"
#include "rgw_iam_policy_keywords.h"
std::vector<std::string> vals;
Condition() = default;
- Condition(TokenID op, const char* s, std::size_t len) : op(op) {
- static constexpr char ifexistr[] = "IfExists";
- auto l = static_cast<const char*>(memmem(static_cast<const void*>(s), len,
- static_cast<const void*>(ifexistr),
- sizeof(ifexistr) -1));
- if (l && ((l + sizeof(ifexistr) - 1 == (s + len)))) {
- ifexists = true;
- key.assign(s, static_cast<const char*>(l) - s);
- } else {
- key.assign(s, len);
- }
- }
+ Condition(TokenID op, const char* s, std::size_t len, bool ifexists)
+ : op(op), key(s, len), ifexists(ifexists) {}
bool eval(const Environment& e) const;
try {
double d = std::stod(s, &p);
if (p == s.length()) {
- return !((d == +0.0) || (d = -0.0) || std::isnan(d));
+ return !((d == +0.0) || (d == -0.0) || std::isnan(d));
}
} catch (const std::logic_error& e) {
// Fallthrough
// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
// vim: ts=8 sw=2 smarttab
%language=C++
+%compare-strncmp
%define class-name keyword_hash
%define lookup-function-name lookup
%struct-type
encode_json("log_pool", log_pool, f);
encode_json("intent_log_pool", intent_log_pool, f);
encode_json("usage_log_pool", usage_log_pool, f);
+ encode_json("reshard_pool", reshard_pool, f);
encode_json("user_keys_pool", user_keys_pool, f);
encode_json("user_email_pool", user_email_pool, f);
encode_json("user_swift_pool", user_swift_pool, f);
JSONDecoder::decode_json("lc_pool", lc_pool, obj);
JSONDecoder::decode_json("log_pool", log_pool, obj);
JSONDecoder::decode_json("intent_log_pool", intent_log_pool, obj);
+ JSONDecoder::decode_json("reshard_pool", reshard_pool, obj);
JSONDecoder::decode_json("usage_log_pool", usage_log_pool, obj);
JSONDecoder::decode_json("user_keys_pool", user_keys_pool, obj);
JSONDecoder::decode_json("user_email_pool", user_email_pool, obj);
#include "rgw_common.h"
#include "rgw_http_client.h"
#include "common/Cond.h"
+#include "global/global_init.h"
#include <atomic>
friend class TokenCache;
typedef RGWPostHTTPData RGWGetRevokedTokens;
- CephContext * const cct;
+ CephContext* const cct;
TokenCache* const cache;
const rgw::keystone::Config& config;
config(config),
lock("rgw::keystone::TokenCache::RevokeThread") {
}
+
void *entry() override;
void stop();
int check_revoked();
} revocator;
- CephContext * const cct;
+ const boost::intrusive_ptr<CephContext> cct;
std::string admin_token_id;
std::string barbican_token_id;
utime_t end = ceph_clock_now();
int secs = schedule_next_start_time(start, end);
- time_t next_time = end + secs;
- char buf[30];
- char *nt = ctime_r(&next_time, buf);
- dout(5) << "schedule life cycle next start time: " << nt <<dendl;
+ utime_t next;
+ next.set_from_double(end + secs);
+
+ dout(5) << "schedule life cycle next start time: " << rgw_to_asctime(next) <<dendl;
lock.Lock();
cond.WaitInterval(lock, utime_t(secs, 0));
void RGWLoadGenRequestEnv::set_date(utime_t& tm)
{
- stringstream s;
- tm.asctime(s);
- date_str = s.str();
+ date_str = rgw_to_asctime(tm);
}
int RGWLoadGenRequestEnv::sign(RGWAccessKey& access_key)
}
rgw_make_bucket_entry_name(s->bucket_tenant, s->bucket_name, entry.bucket);
- if (check_utf8(s->bucket_name.c_str(), entry.bucket.size()) != 0) {
+ if (check_utf8(entry.bucket.c_str(), entry.bucket.size()) != 0) {
ldout(s->cct, 5) << "not logging op on bucket with non-utf8 name" << dendl;
return 0;
}
#include "common/Clock.h"
#include "common/armor.h"
+#include "common/backport14.h"
#include "common/errno.h"
#include "common/mime.h"
#include "common/utf8.h"
}
if(s->dialect.compare("s3") == 0) {
- s->bucket_acl = new RGWAccessControlPolicy_S3(s->cct);
+ s->bucket_acl = ceph::make_unique<RGWAccessControlPolicy_S3>(s->cct);
} else if(s->dialect.compare("swift") == 0) {
/* We aren't allocating the account policy for those operations using
* the Swift's infrastructure that don't really need req_state::user.
* Typical example here is the implementation of /info. */
if (!s->user->user_id.empty()) {
- s->user_acl = std::unique_ptr<RGWAccessControlPolicy>(
- new RGWAccessControlPolicy_SWIFTAcct(s->cct));
+ s->user_acl = ceph::make_unique<RGWAccessControlPolicy_SWIFTAcct>(s->cct);
}
- s->bucket_acl = new RGWAccessControlPolicy_SWIFT(s->cct);
+ s->bucket_acl = ceph::make_unique<RGWAccessControlPolicy_SWIFT>(s->cct);
} else {
- s->bucket_acl = new RGWAccessControlPolicy(s->cct);
+ s->bucket_acl = ceph::make_unique<RGWAccessControlPolicy>(s->cct);
}
/* check if copy source is within the current domain */
s->bucket = s->bucket_info.bucket;
if (s->bucket_exists) {
- ret = read_bucket_policy(store, s, s->bucket_info, s->bucket_attrs, s->bucket_acl, s->bucket);
+ ret = read_bucket_policy(store, s, s->bucket_info, s->bucket_attrs,
+ s->bucket_acl.get(), s->bucket);
acct_acl_user = {
s->bucket_info.owner,
s->bucket_acl->get_owner().get_display_name(),
if (!s->bucket_exists) {
return -ERR_NO_SUCH_BUCKET;
}
- s->object_acl = new RGWAccessControlPolicy(s->cct);
+ s->object_acl = ceph::make_unique<RGWAccessControlPolicy>(s->cct);
rgw_obj obj(s->bucket, s->object);
if (prefetch_data) {
store->set_prefetch_data(s->obj_ctx, obj);
}
- ret = read_obj_policy(store, s, s->bucket_info, s->bucket_attrs, s->object_acl, s->iam_policy, s->bucket, s->object);
+ ret = read_obj_policy(store, s, s->bucket_info, s->bucket_attrs,
+ s->object_acl.get(), s->iam_policy, s->bucket,
+ s->object);
}
return ret;
} else {
bucket = s->bucket;
pbucket_info = &s->bucket_info;
- bucket_acl = s->bucket_acl;
+ bucket_acl = s->bucket_acl.get();
bucket_policy = &s->iam_policy;
}
}
} else {
bucket = s->bucket;
- bucket_acl = s->bucket_acl;
+ bucket_acl = s->bucket_acl.get();
bucket_policy = s->iam_policy.get_ptr();
}
op_ret = rgw_user_sync_all_stats(store, s->user->user_id);
if (op_ret < 0) {
ldout(store->ctx(), 0) << "ERROR: failed to sync user stats: " << dendl;
+ return;
+ }
+
+  /* collect per-bucket usage for the user into buckets_usage */
+  op_ret = rgw_user_get_all_buckets_stats(store, s->user->user_id, buckets_usage);
+  if (op_ret < 0) {
+    /* report through the log like the neighbouring error paths rather
+     * than on stderr, and name the step that actually failed */
+    ldout(store->ctx(), 0) << "ERROR: failed to get user buckets stats: ret=" << op_ret << dendl;
     return ;
   }
-
+
string user_str = s->user->user_id.to_str();
op_ret = store->cls_user_get_header(user_str, &header);
if (op_ret < 0) {
ldout(store->ctx(), 0) << "ERROR: can't read user header: " << dendl;
- return ;
+ return;
}
return;
return;
}
+ const auto& zonegroup = store->get_zonegroup();
+ if (!placement_rule.empty() &&
+ !zonegroup.placement_targets.count(placement_rule)) {
+ ldout(s->cct, 0) << "placement target (" << placement_rule << ")"
+ << " doesn't exist in the placement targets of zonegroup"
+ << " (" << store->get_zonegroup().api_name << ")" << dendl;
+ op_ret = -ERR_INVALID_LOCATION_CONSTRAINT;
+ s->err.message = "The specified placement target does not exist";
+ return;
+ }
+
/* we need to make sure we read bucket info, it's not read before for this
* specific request */
RGWObjectCtx& obj_ctx = *static_cast<RGWObjectCtx *>(s->obj_ctx);
* contain such keys yet. */
if (has_policy) {
if (s->dialect.compare("swift") == 0) {
- auto old_policy = static_cast<RGWAccessControlPolicy_SWIFT*>(s->bucket_acl);
+ auto old_policy = \
+ static_cast<RGWAccessControlPolicy_SWIFT*>(s->bucket_acl.get());
auto new_policy = static_cast<RGWAccessControlPolicy_SWIFT*>(&policy);
new_policy->filter_merge(policy_rw_mask, old_policy);
policy = *new_policy;
void RGWGetACLs::execute()
{
stringstream ss;
- RGWAccessControlPolicy *acl = (!s->object.empty() ? s->object_acl : s->bucket_acl);
- RGWAccessControlPolicy_S3 *s3policy = static_cast<RGWAccessControlPolicy_S3 *>(acl);
+ RGWAccessControlPolicy* const acl = \
+ (!s->object.empty() ? s->object_acl.get() : s->bucket_acl.get());
+ RGWAccessControlPolicy_S3* const s3policy = \
+ static_cast<RGWAccessControlPolicy_S3*>(acl);
s3policy->to_xml(ss);
acls = ss.str();
}
}
- RGWAccessControlPolicy *existing_policy = (s->object.empty() ? s->bucket_acl : s->object_acl);
+ RGWAccessControlPolicy* const existing_policy = \
+ (s->object.empty() ? s->bucket_acl.get() : s->object_acl.get());
owner = existing_policy->get_owner();
op_ret = get_params();
- if (op_ret < 0)
+ if (op_ret < 0) {
+ if (op_ret == -ERANGE) {
+ ldout(s->cct, 4) << "The size of request xml data is larger than the max limitation, data size = "
+ << s->length << dendl;
+ op_ret = -ERR_MALFORMED_XML;
+ s->err.message = "The XML you provided was larger than the maximum " +
+ std::to_string(s->cct->_conf->rgw_max_put_param_size) +
+ " bytes allowed.";
+ }
return;
+ }
ldout(s->cct, 15) << "read len=" << len << " data=" << (data ? data : "") << dendl;
return;
}
+ const RGWAccessControlList& req_acl = policy->get_acl();
+ const multimap<string, ACLGrant>& req_grant_map = req_acl.get_grant_map();
+#define ACL_GRANTS_MAX_NUM 100
+ int max_num = s->cct->_conf->rgw_acl_grants_max_num;
+ if (max_num < 0) {
+ max_num = ACL_GRANTS_MAX_NUM;
+ }
+
+ int grants_num = req_grant_map.size();
+ if (grants_num > max_num) {
+ ldout(s->cct, 4) << "An acl can have up to "
+ << max_num
+ << " grants, request acl grants num: "
+ << grants_num << dendl;
+ op_ret = -ERR_MALFORMED_ACL_ERROR;
+ s->err.message = "The request is rejected, because the acl grants number you requested is larger than the maximum "
+ + std::to_string(max_num)
+ + " grants allowed in an acl.";
+ return;
+ }
+
// forward bucket acl requests to meta master zone
if (s->object.empty() && !store->is_meta_master()) {
bufferlist in_data;
map<string, bool> categories;
map<rgw_user_bucket, rgw_usage_log_entry> usage;
map<string, rgw_usage_log_entry> summary_map;
+ map<string, cls_user_bucket_entry> buckets_usage;
cls_user_header header;
public:
RGWGetUsage() : sent_data(false), show_log_entries(true), show_log_sum(true){
return r;
} else if (!citer.end()) {
JSONObj *c = *citer;
- dout(0) << "adding simple_check: " << c->get_name() << " : " << c->get_data() << dendl;
+ dout(20) << "adding simple_check: " << c->get_name() << " : " << c->get_data() << dendl;
add_simple_check(c->get_name(), c->get_data());
} else {
void set_stats(const rgw_user& user, const rgw_bucket& bucket, RGWQuotaCacheStats& qs, RGWStorageStats& stats);
int async_refresh(const rgw_user& user, const rgw_bucket& bucket, RGWQuotaCacheStats& qs);
void async_refresh_response(const rgw_user& user, rgw_bucket& bucket, RGWStorageStats& stats);
+ void async_refresh_fail(const rgw_user& user, rgw_bucket& bucket);
class AsyncRefreshHandler {
protected:
return 0;
}
+// Failure counterpart of async_refresh_response(): the refresh handlers call
+// this when handle_response() receives r < 0. It only logs at level 20 and
+// drops one reference on async_refcount; no stats are touched.
+// NOTE(review): the log text still says "response" although this is the
+// failure path, and `user` is not used here.
+template<class T>
+void RGWQuotaCache<T>::async_refresh_fail(const rgw_user& user, rgw_bucket& bucket)
+{
+ ldout(store->ctx(), 20) << "async stats refresh response for bucket=" << bucket << dendl;
+
+ async_refcount->put();
+}
+
template<class T>
void RGWQuotaCache<T>::async_refresh_response(const rgw_user& user, rgw_bucket& bucket, RGWStorageStats& stats)
{
const uint64_t rounded_added = rgw_rounded_objsize(added_bytes);
const uint64_t rounded_removed = rgw_rounded_objsize(removed_bytes);
- entry->stats.size += added_bytes - removed_bytes;
- entry->stats.size_rounded += rounded_added - rounded_removed;
- entry->stats.num_objects += objs_delta;
+  /* Apply the deltas but never let a counter drop below zero.
+   * rounded_added/rounded_removed are uint64_t, so the sums are evaluated
+   * in unsigned arithmetic and a plain ">= 0" test is always true; the
+   * result is reinterpreted as signed so an underflow is detected.
+   * NOTE(review): assumes the counters stay below 2^63 -- confirm. */
+  if (static_cast<int64_t>(entry->stats.size + added_bytes - removed_bytes) >= 0) {
+    entry->stats.size += added_bytes - removed_bytes;
+  } else {
+    entry->stats.size = 0;
+  }
+
+  if (static_cast<int64_t>(entry->stats.size_rounded + rounded_added - rounded_removed) >= 0) {
+    entry->stats.size_rounded += rounded_added - rounded_removed;
+  } else {
+    entry->stats.size_rounded = 0;
+  }
+
+  if (static_cast<int64_t>(entry->stats.num_objects + objs_delta) >= 0) {
+    entry->stats.num_objects += objs_delta;
+  } else {
+    entry->stats.num_objects = 0;
+  }
return true;
}
{
if (r < 0) {
ldout(store->ctx(), 20) << "AsyncRefreshHandler::handle_response() r=" << r << dendl;
- return; /* nothing to do here */
+ cache->async_refresh_fail(user, bucket);
+ return;
}
RGWStorageStats bs;
{
if (r < 0) {
ldout(store->ctx(), 20) << "AsyncRefreshHandler::handle_response() r=" << r << dendl;
- return; /* nothing to do here */
+ cache->async_refresh_fail(user, bucket);
+ return;
}
cache->async_refresh_response(user, bucket, stats);
#include "cls/timeindex/cls_timeindex_client.h"
#include "cls/lock/cls_lock_client.h"
#include "cls/user/cls_user_client.h"
+#include "osd/osd_types.h"
#include "rgw_tools.h"
#include "rgw_coroutine.h"
}
r = rados->ioctx_create(pool.name.c_str(), ioctx);
- }
- if (r < 0) {
+ if (r < 0) {
+ return r;
+ }
+
+ r = ioctx.application_enable(pg_pool_t::APPLICATION_NAME_RGW, false);
+ if (r < 0 && r != -EOPNOTSUPP) {
+ return r;
+ }
+ } else if (r < 0) {
return r;
}
if (!pool.ns.empty()) {
delete async_rados;
}
+ delete lc;
+ lc = NULL;
+
delete gc;
gc = NULL;
delete obj_expirer;
obj_expirer = NULL;
-
- delete lc;
- lc = NULL;
delete rest_master_conn;
if (r < 0 && r != -EEXIST)
return r;
- return rgw_init_ioctx(rad, pool, io_ctx);
+ r = rgw_init_ioctx(rad, pool, io_ctx);
+ if (r < 0)
+ return r;
+
+ r = io_ctx.application_enable(pg_pool_t::APPLICATION_NAME_RGW, false);
+ if (r < 0 && r != -EOPNOTSUPP)
+ return r;
+ return 0;
}
void RGWRados::build_bucket_index_marker(const string& shard_id_str, const string& shard_marker,
uint32_t val = index;
if (!name.empty()) {
- int max_user_shards = max(cct->_conf->rgw_usage_max_user_shards, 1);
+ int max_user_shards = cct->_conf->rgw_usage_max_user_shards;
val %= max_user_shards;
val += ceph_str_hash_linux(name.c_str(), name.size());
}
char buf[17];
- int max_shards = max(cct->_conf->rgw_usage_max_shards, 1);
+ int max_shards = cct->_conf->rgw_usage_max_shards;
snprintf(buf, sizeof(buf), RGW_USAGE_OBJ_PREFIX "%u", (unsigned)(val % max_shards));
hash = buf;
}
* common_prefixes: if delim is filled in, any matching prefixes are placed here.
* is_truncated: if number of objects in the bucket is bigger than max, then truncated.
*/
-int RGWRados::Bucket::List::list_objects(int max, vector<rgw_bucket_dir_entry> *result,
+int RGWRados::Bucket::List::list_objects(int64_t max,
+ vector<rgw_bucket_dir_entry> *result,
map<string, bool> *common_prefixes,
bool *is_truncated)
{
if (ret < 0)
return ret;
+ librados::IoCtx io_ctx;
+ ret = rad->ioctx_create(pool.name.c_str(), io_ctx);
+ if (ret < 0)
+ return ret;
+
+ ret = io_ctx.application_enable(pg_pool_t::APPLICATION_NAME_RGW, false);
+ if (ret < 0 && ret != -EOPNOTSUPP)
+ return ret;
return 0;
}
*bucket_id = buf;
}
-/**
- * create a bucket with name bucket and the given list of attrs
- * returns 0 on success, -ERR# otherwise.
- */
int RGWRados::create_bucket(RGWUserInfo& owner, rgw_bucket& bucket,
const string& zonegroup_id,
const string& placement_rule,
string *pselected_rule_name, RGWZonePlacementInfo *rule_info)
{
- /* first check that rule exists within the specific zonegroup */
+ /* first check that zonegroup exists within current period. */
RGWZoneGroup zonegroup;
int ret = get_zonegroup(zonegroup_id, zonegroup);
if (ret < 0) {
return ret;
}
- /* now check that tag exists within zonegroup */
/* find placement rule. Hierarchy: request rule > user default rule > zonegroup default rule */
- string rule = request_rule;
- if (rule.empty()) {
- rule = user_info.default_placement;
- if (rule.empty())
- rule = zonegroup.default_placement;
- }
-
- if (rule.empty()) {
- ldout(cct, 0) << "misconfiguration, should not have an empty placement rule name" << dendl;
- return -EIO;
- }
-
- map<string, RGWZoneGroupPlacementTarget>::iterator titer = zonegroup.placement_targets.find(rule);
- if (titer == zonegroup.placement_targets.end()) {
- ldout(cct, 0) << "could not find placement rule " << rule << " within zonegroup " << dendl;
- return -EINVAL;
+ std::map<std::string, RGWZoneGroupPlacementTarget>::const_iterator titer;
+
+ if (!request_rule.empty()) {
+ titer = zonegroup.placement_targets.find(request_rule);
+ if (titer == zonegroup.placement_targets.end()) {
+ ldout(cct, 0) << "could not find requested placement id " << request_rule
+ << " within zonegroup " << dendl;
+ return -ERR_INVALID_LOCATION_CONSTRAINT;
+ }
+ } else if (!user_info.default_placement.empty()) {
+ titer = zonegroup.placement_targets.find(user_info.default_placement);
+ if (titer == zonegroup.placement_targets.end()) {
+ ldout(cct, 0) << "could not find user default placement id " << user_info.default_placement
+ << " within zonegroup " << dendl;
+ return -ERR_INVALID_LOCATION_CONSTRAINT;
+ }
+ } else {
+ if (zonegroup.default_placement.empty()) { // zonegroup default rule as fallback, it should not be empty.
+ ldout(cct, 0) << "misconfiguration, zonegroup default placement id should not be empty." << dendl;
+ return -ERR_ZONEGROUP_DEFAULT_PLACEMENT_MISCONFIGURATION;
+ } else {
+ titer = zonegroup.placement_targets.find(zonegroup.default_placement);
+ if (titer == zonegroup.placement_targets.end()) {
+ ldout(cct, 0) << "could not find zonegroup default placement id " << zonegroup.default_placement
+ << " within zonegroup " << dendl;
+ return -ERR_INVALID_LOCATION_CONSTRAINT;
+ }
+ }
}
/* now check tag for the rule, whether user is permitted to use rule */
- RGWZoneGroupPlacementTarget& target_rule = titer->second;
+ const auto& target_rule = titer->second;
if (!target_rule.user_permitted(user_info.placement_tags)) {
- ldout(cct, 0) << "user not permitted to use placement rule" << dendl;
+ ldout(cct, 0) << "user not permitted to use placement rule " << titer->first << dendl;
return -EPERM;
}
if (pselected_rule_name)
- *pselected_rule_name = rule;
+ *pselected_rule_name = titer->first;
- return select_bucket_location_by_rule(rule, rule_info);
+ return select_bucket_location_by_rule(titer->first, rule_info);
}
int RGWRados::select_bucket_location_by_rule(const string& location_rule, RGWZonePlacementInfo *rule_info)
vector<int>::iterator riter;
vector<librados::PoolAsyncCompletion *>::iterator citer;
+ bool error = false;
assert(rets.size() == completions.size());
for (riter = rets.begin(), citer = completions.begin(); riter != rets.end(); ++riter, ++citer) {
int r = *riter;
r = c->get_return_value();
if (r < 0) {
ldout(cct, 0) << "WARNING: async pool_create returned " << r << dendl;
+ error = true;
}
}
c->release();
retcodes.push_back(r);
}
+ if (error) {
+ return 0;
+ }
+
+ std::vector<librados::IoCtx> io_ctxs;
+ retcodes.clear();
+ for (auto pool : pools) {
+ io_ctxs.emplace_back();
+ int ret = rad->ioctx_create(pool.name.c_str(), io_ctxs.back());
+ if (ret < 0) {
+ ldout(cct, 0) << "WARNING: ioctx_create returned " << ret << dendl;
+ error = true;
+ }
+ retcodes.push_back(ret);
+ }
+ if (error) {
+ return 0;
+ }
+
+ completions.clear();
+ for (auto &io_ctx : io_ctxs) {
+ librados::PoolAsyncCompletion *c =
+ librados::Rados::pool_async_create_completion();
+ completions.push_back(c);
+ int ret = io_ctx.application_enable_async(pg_pool_t::APPLICATION_NAME_RGW,
+ false, c);
+ assert(ret == 0);
+ }
+
+ retcodes.clear();
+ for (auto c : completions) {
+ c->wait();
+ int ret = c->get_return_value();
+ if (ret == -EOPNOTSUPP) {
+ ret = 0;
+ } else if (ret < 0) {
+ ldout(cct, 0) << "WARNING: async application_enable returned " << ret
+ << dendl;
+ error = true;
+ }
+ c->release();
+ retcodes.push_back(ret);
+ }
return 0;
}
int ret = rest_master_conn->put_obj_init(user_id, dest_obj, astate->size, src_attrs, &out_stream_req);
if (ret < 0) {
- delete out_stream_req;
return ret;
}
return r;
}
- r = store->data_log->add_entry(bs->bucket, bs->shard_id);
- if (r < 0) {
- lderr(store->ctx()) << "ERROR: failed writing data log" << dendl;
- return r;
+ if (target->bucket_info.datasync_flag_enabled()) {
+ r = store->data_log->add_entry(bs->bucket, bs->shard_id);
+ if (r < 0) {
+ lderr(store->ctx()) << "ERROR: failed writing data log" << dendl;
+ return r;
+ }
}
return 0;
ret = store->cls_obj_complete_add(*bs, obj, optag, poolid, epoch, ent, category, remove_objs, bilog_flags, zones_trace);
- int r = store->data_log->add_entry(bs->bucket, bs->shard_id);
- if (r < 0) {
- lderr(store->ctx()) << "ERROR: failed writing data log" << dendl;
+ if (target->bucket_info.datasync_flag_enabled()) {
+ int r = store->data_log->add_entry(bs->bucket, bs->shard_id);
+ if (r < 0) {
+ lderr(store->ctx()) << "ERROR: failed writing data log" << dendl;
+ }
}
return ret;
ret = store->cls_obj_complete_del(*bs, optag, poolid, epoch, obj, removed_mtime, remove_objs, bilog_flags, zones_trace);
- int r = store->data_log->add_entry(bs->bucket, bs->shard_id);
- if (r < 0) {
- lderr(store->ctx()) << "ERROR: failed writing data log" << dendl;
+ if (target->bucket_info.datasync_flag_enabled()) {
+ int r = store->data_log->add_entry(bs->bucket, bs->shard_id);
+ if (r < 0) {
+ lderr(store->ctx()) << "ERROR: failed writing data log" << dendl;
+ }
}
return ret;
* for following the specific bucket shard log. Otherwise they end up staying behind, and users
* have no way to tell that they're all caught up
*/
- int r = store->data_log->add_entry(bs->bucket, bs->shard_id);
- if (r < 0) {
- lderr(store->ctx()) << "ERROR: failed writing data log" << dendl;
+ if (target->bucket_info.datasync_flag_enabled()) {
+ int r = store->data_log->add_entry(bs->bucket, bs->shard_id);
+ if (r < 0) {
+ lderr(store->ctx()) << "ERROR: failed writing data log" << dendl;
+ }
}
return ret;
}
int RGWRados::get_bucket_stats(RGWBucketInfo& bucket_info, int shard_id, string *bucket_ver, string *master_ver,
- map<RGWObjCategory, RGWStorageStats>& stats, string *max_marker)
+ map<RGWObjCategory, RGWStorageStats>& stats, string *max_marker, bool *syncstopped)
{
map<string, rgw_bucket_dir_header> headers;
map<int, string> bucket_instance_ids;
} else {
marker_mgr.add(viter->first, iter->second.max_marker);
}
+ if (syncstopped != NULL)
+ *syncstopped = iter->second.syncstopped;
}
ver_mgr.to_string(bucket_ver);
master_ver_mgr.to_string(master_ver);
int RGWRados::get_bucket_stats_async(RGWBucketInfo& bucket_info, int shard_id, RGWGetBucketStats_CB *ctx)
{
int num_aio = 0;
- RGWGetBucketStatsContext *get_ctx = new RGWGetBucketStatsContext(ctx, bucket_info.num_shards);
+ RGWGetBucketStatsContext *get_ctx = new RGWGetBucketStatsContext(ctx, bucket_info.num_shards ? : 1);
assert(get_ctx);
int r = cls_bucket_head_async(bucket_info, shard_id, get_ctx, &num_aio);
- get_ctx->put();
if (r < 0) {
ctx->put();
if (num_aio) {
get_ctx->unset_cb();
}
}
+ get_ctx->put();
return r;
}
return r;
}
+/*
+ * Open the bucket index for bucket_info/shard_id and run
+ * CLSRGWIssueResyncBucketBILog over the resulting index objects.
+ * Returns the (negative) error from open_bucket_index() if that fails,
+ * otherwise whatever the issued operation returns.
+ */
+int RGWRados::resync_bi_log_entries(RGWBucketInfo& bucket_info, int shard_id)
+{
+  librados::IoCtx index_ctx;
+  map<int, string> bucket_objs;
+
+  const int ret = open_bucket_index(bucket_info, index_ctx, bucket_objs, shard_id);
+  if (ret < 0) {
+    return ret;
+  }
+
+  CLSRGWIssueResyncBucketBILog resync_op(index_ctx, bucket_objs,
+                                         cct->_conf->rgw_bucket_index_max_aio);
+  return resync_op();
+}
+
+/*
+ * Open the bucket index for bucket_info/shard_id and run
+ * CLSRGWIssueBucketBILogStop over the resulting index objects.
+ * Returns the (negative) error from open_bucket_index() if that fails,
+ * otherwise whatever the issued operation returns.
+ */
+int RGWRados::stop_bi_log_entries(RGWBucketInfo& bucket_info, int shard_id)
+{
+  librados::IoCtx index_ctx;
+  map<int, string> bucket_objs;
+
+  const int ret = open_bucket_index(bucket_info, index_ctx, bucket_objs, shard_id);
+  if (ret < 0) {
+    return ret;
+  }
+
+  CLSRGWIssueBucketBILogStop stop_op(index_ctx, bucket_objs,
+                                     cct->_conf->rgw_bucket_index_max_aio);
+  return stop_op();
+}
+
int RGWRados::bi_get_instance(const RGWBucketInfo& bucket_info, rgw_obj& obj, rgw_bucket_dir_entry *dirent)
{
rgw_rados_ref ref;
bucket_info.bucket.convert(&entry.bucket);
- map<string, struct rgw_bucket_dir_header>::iterator hiter = headers.begin();
- for (; hiter != headers.end(); ++hiter) {
- map<uint8_t, struct rgw_bucket_category_stats>::iterator iter = hiter->second.stats.begin();
- for (; iter != hiter->second.stats.end(); ++iter) {
- struct rgw_bucket_category_stats& header_stats = iter->second;
+ for (const auto& hiter : headers) {
+ for (const auto& iter : hiter.second.stats) {
+ const struct rgw_bucket_category_stats& header_stats = iter.second;
entry.size += header_stats.total_size;
entry.size_rounded += header_stats.total_size_rounded;
entry.count += header_stats.num_entries;
return 0;
}
+/*
+ * Add the usage of `bucket` to `entry`.
+ *
+ * Looks up the bucket instance info, reads the index headers with
+ * cls_bucket_head(..., RGW_NO_SHARD, ...), and sums total_size,
+ * total_size_rounded and num_entries of every category in every header
+ * into entry.size, entry.size_rounded and entry.count. The counters are
+ * accumulated with +=, i.e. on top of whatever `entry` already holds.
+ *
+ * Returns 0 on success or the negative error of the failing lookup.
+ */
+int RGWRados::cls_user_get_bucket_stats(const rgw_bucket& bucket, cls_user_bucket_entry& entry)
+{
+  RGWBucketInfo bucket_info;
+  RGWObjectCtx obj_ctx(this);
+  int ret = get_bucket_instance_info(obj_ctx, bucket, bucket_info, NULL, NULL);
+  if (ret < 0) {
+    return ret;
+  }
+
+  map<string, struct rgw_bucket_dir_header> headers;
+  ret = cls_bucket_head(bucket_info, RGW_NO_SHARD, headers);
+  if (ret < 0) {
+    ldout(cct, 20) << "cls_bucket_header() returned " << ret << dendl;
+    return ret;
+  }
+
+  bucket.convert(&entry.bucket);
+
+  for (const auto& header : headers) {
+    for (const auto& category : header.second.stats) {
+      entry.size += category.second.total_size;
+      entry.size_rounded += category.second.total_size_rounded;
+      entry.count += category.second.num_entries;
+    }
+  }
+
+  return 0;
+}
+
int RGWRados::cls_user_list_buckets(rgw_raw_obj& obj,
const string& in_marker,
const string& end_marker,
rgw_obj_select(const rgw_obj& _obj) : obj(_obj), is_raw(false) {}
rgw_obj_select(const rgw_raw_obj& _raw_obj) : raw_obj(_raw_obj), is_raw(true) {}
rgw_obj_select(const rgw_obj_select& rhs) {
+ placement_rule = rhs.placement_rule;
is_raw = rhs.is_raw;
if (is_raw) {
raw_obj = rhs.raw_obj;
if (struct_v >= 10) {
::decode(reshard_pool, bl);
} else {
- reshard_pool = name + ".rgw.reshard";
+ reshard_pool = log_pool.name + ":reshard";
}
DECODE_FINISH(bl);
}
void decode_json(JSONObj *obj);
static void generate_test_instances(list<RGWZoneParams*>& o);
- bool find_placement(const rgw_data_placement_target& placement, string *placement_id) {
- for (const auto& pp : placement_pools) {
- const RGWZonePlacementInfo& info = pp.second;
- if (info.index_pool == placement.index_pool.to_str() &&
- info.data_pool == placement.data_pool.to_str() &&
- info.data_extra_pool == placement.data_extra_pool.to_str()) {
- *placement_id = pp.first;
- return true;
- }
- }
- return false;
- }
-
bool get_placement(const string& placement_id, RGWZonePlacementInfo *placement) const {
auto iter = placement_pools.find(placement_id);
if (iter == placement_pools.end()) {
string name;
set<string> tags;
- bool user_permitted(list<string>& user_tags) {
+ bool user_permitted(list<string>& user_tags) const {
if (tags.empty()) {
return true;
}
int create_pool(const rgw_pool& pool);
- /**
- * create a bucket with name bucket and the given list of attrs
- * returns 0 on success, -ERR# otherwise.
- */
int init_bucket_index(RGWBucketInfo& bucket_info, int num_shards);
int select_bucket_placement(RGWUserInfo& user_info, const string& zonegroup_id, const string& rule,
string *pselected_rule_name, RGWZonePlacementInfo *rule_info);
public:
explicit List(RGWRados::Bucket *_target) : target(_target) {}
- int list_objects(int max, vector<rgw_bucket_dir_entry> *result, map<string, bool> *common_prefixes, bool *is_truncated);
+ int list_objects(int64_t max, vector<rgw_bucket_dir_entry> *result, map<string, bool> *common_prefixes, bool *is_truncated);
rgw_obj_key& get_next_marker() {
return next_marker;
}
int decode_policy(bufferlist& bl, ACLOwner *owner);
int get_bucket_stats(RGWBucketInfo& bucket_info, int shard_id, string *bucket_ver, string *master_ver,
- map<RGWObjCategory, RGWStorageStats>& stats, string *max_marker);
+ map<RGWObjCategory, RGWStorageStats>& stats, string *max_marker, bool* syncstopped = NULL);
int get_bucket_stats_async(RGWBucketInfo& bucket_info, int shard_id, RGWGetBucketStats_CB *cb);
int get_user_stats(const rgw_user& user, RGWStorageStats& stats);
int get_user_stats_async(const rgw_user& user, RGWGetUserStats_CB *cb);
int cls_bucket_head_async(const RGWBucketInfo& bucket_info, int shard_id, RGWGetDirHeader_CB *ctx, int *num_aio);
int list_bi_log_entries(RGWBucketInfo& bucket_info, int shard_id, string& marker, uint32_t max, std::list<rgw_bi_log_entry>& result, bool *truncated);
int trim_bi_log_entries(RGWBucketInfo& bucket_info, int shard_id, string& marker, string& end_marker);
+ int resync_bi_log_entries(RGWBucketInfo& bucket_info, int shard_id);
+ int stop_bi_log_entries(RGWBucketInfo& bucket_info, int shard_id);
int get_bi_log_status(RGWBucketInfo& bucket_info, int shard_id, map<int, string>& max_marker);
int bi_get_instance(const RGWBucketInfo& bucket_info, rgw_obj& obj, rgw_bucket_dir_entry *dirent);
int complete_sync_user_stats(const rgw_user& user_id);
int cls_user_add_bucket(rgw_raw_obj& obj, list<cls_user_bucket_entry>& entries);
int cls_user_remove_bucket(rgw_raw_obj& obj, const cls_user_bucket& bucket);
+ int cls_user_get_bucket_stats(const rgw_bucket& bucket, cls_user_bucket_entry& entry);
int check_quota(const rgw_user& bucket_owner, rgw_bucket& bucket,
RGWQuotaInfo& user_quota, RGWQuotaInfo& bucket_quota, uint64_t obj_size);
if (reshard_log) {
ret = reshard_log->update(bucket_info, new_bucket_info);
if (ret < 0) {
+ unlock_bucket();
return ret;
}
}
get_logshard_oid(logshard_num, &logshard_oid);
int ret = cls_rgw_reshard_list(store->reshard_pool_ctx, logshard_oid, marker, max, entries, is_truncated);
- if (ret < 0 && ret != -ENOENT) {
+
+ if (ret < 0) {
+ if (ret == -ENOENT) {
+ *is_truncated = false;
+ ret = 0;
+ }
lderr(store->ctx()) << "ERROR: failed to list reshard log entries, oid=" << logshard_oid << dendl;
- return ret;
- }
- if (ret == -ENOENT) {
- *is_truncated = false;
+ if (ret == -EACCES) {
+ lderr(store->ctx()) << "access denied to pool " << store->get_zone_params().reshard_pool
+ << ". Fix the pool access permissions of your client" << dendl;
+ }
}
- return 0;
+
+ return ret;
}
int RGWReshard::get(cls_rgw_reshard_entry& entry)
static void get_new_date_str(string& date_str)
{
- utime_t tm = ceph_clock_now();
- stringstream s;
- tm.asctime(s);
- date_str = s.str();
+ date_str = rgw_to_asctime(ceph_clock_now());
}
int RGWRESTSimpleRequest::execute(RGWAccessKey& key, const char *method, const char *resource)
param_vec_t params;
populate_params(params, &uid, self_zone_group);
- *req = new RGWRESTStreamWriteRequest(cct, url, NULL, ¶ms);
- return (*req)->put_obj_init(key, obj, obj_size, attrs);
+ RGWRESTStreamWriteRequest *wr = new RGWRESTStreamWriteRequest(cct, url, NULL, ¶ms);
+ ret = wr->put_obj_init(key, obj, obj_size, attrs);
+ if (ret < 0) {
+ delete wr;
+ return ret;
+ }
+ *req = wr;
+ return 0;
}
int RGWRESTConn::complete_request(RGWRESTStreamWriteRequest *req, string& etag, real_time *mtime)
}
}
map<RGWObjCategory, RGWStorageStats> stats;
- int ret = store->get_bucket_stats(bucket_info, shard_id, &bucket_ver, &master_ver, stats, &max_marker);
+ int ret = store->get_bucket_stats(bucket_info, shard_id, &bucket_ver, &master_ver, stats, &max_marker, &syncstopped);
if (ret < 0 && ret != -ENOENT) {
http_ret = ret;
return;
encode_json("bucket_ver", bucket_ver, s->formatter);
encode_json("master_ver", master_ver, s->formatter);
encode_json("max_marker", max_marker, s->formatter);
+ encode_json("syncstopped", syncstopped, s->formatter);
s->formatter->close_section();
flusher.flush();
string bucket_ver;
string master_ver;
string max_marker;
+ bool syncstopped;
public:
- RGWOp_BILog_Info() : bucket_ver(), master_ver() {}
+ RGWOp_BILog_Info() : bucket_ver(), master_ver(), syncstopped(false) {}
~RGWOp_BILog_Info() override {}
int check_caps(RGWUserCaps& caps) override {
formatter->close_section(); // Category
}
+/*
+ * Emit one "Entry" object for a bucket: its name under "Bucket" and the
+ * entry's size / size_rounded under "Bytes" / "Bytes_Rounded".
+ */
+static void dump_usage_bucket_info(Formatter *formatter, const std::string& name, const cls_user_bucket_entry& entry)
+{
+ formatter->open_object_section("Entry");
+ formatter->dump_string("Bucket", name);
+ formatter->dump_int("Bytes", entry.size);
+ formatter->dump_int("Bytes_Rounded", entry.size_rounded);
+ formatter->close_section(); // entry
+}
+
void RGWGetUsage_ObjStore_S3::send_response()
{
if (op_ret < 0)
}
formatter->close_section(); // entries
}
-
+
if (show_log_sum) {
formatter->open_array_section("Summary");
map<string, rgw_usage_log_entry>::iterator siter;
formatter->dump_int("BytesReceived", total_usage.bytes_received);
formatter->dump_int("Ops", total_usage.ops);
formatter->dump_int("SuccessfulOps", total_usage.successful_ops);
- formatter->close_section(); // total
+ formatter->close_section(); // total
formatter->close_section(); // user
}
if (s->cct->_conf->rgw_rest_getusage_op_compat) {
- formatter->open_object_section("Stats");
- }
-
+ formatter->open_object_section("Stats");
+ }
+
formatter->dump_int("TotalBytes", header.stats.total_bytes);
formatter->dump_int("TotalBytesRounded", header.stats.total_bytes_rounded);
formatter->dump_int("TotalEntries", header.stats.total_entries);
-
- if (s->cct->_conf->rgw_rest_getusage_op_compat) {
+
+ if (s->cct->_conf->rgw_rest_getusage_op_compat) {
formatter->close_section(); //Stats
}
formatter->close_section(); // summary
}
- formatter->close_section(); // usage
- rgw_flush_formatter_and_reset(s, s->formatter);
+
+ formatter->open_array_section("CapacityUsed");
+ formatter->open_object_section("User");
+ formatter->open_array_section("Buckets");
+ for (const auto& biter : buckets_usage) {
+ const cls_user_bucket_entry& entry = biter.second;
+ dump_usage_bucket_info(formatter, biter.first, entry);
+ }
+ formatter->close_section(); // Buckets
+ formatter->close_section(); // User
+ formatter->close_section(); // CapacityUsed
+
+ formatter->close_section(); // usage
+ rgw_flush_formatter_and_reset(s, s->formatter);
}
int RGWListBucket_ObjStore_S3::get_params()
/* Populate the owner info. */
s->owner.set_id(s->user->user_id);
s->owner.set_name(s->user->display_name);
- ldout(s->cct, 0) << "Successful Signature Verification!" << dendl;
+ ldout(s->cct, 20) << "Successful Signature Verification!" << dendl;
}
ceph::bufferlist decoded_policy;
}
decoded_policy.append('\0'); // NULL terminate
- ldout(s->cct, 0) << "POST policy: " << decoded_policy.c_str() << dendl;
+ ldout(s->cct, 20) << "POST policy: " << decoded_policy.c_str() << dendl;
int r = post_policy.from_json(decoded_policy, err_msg);
dump_header(s, "X-Container-Bytes-Used-Actual", bucket.size_rounded);
if (s->object.empty()) {
- auto swift_policy = static_cast<RGWAccessControlPolicy_SWIFT*>(s->bucket_acl);
+ auto swift_policy = \
+ static_cast<RGWAccessControlPolicy_SWIFT*>(s->bucket_acl.get());
std::string read_acl, write_acl;
swift_policy->to_str(read_acl, write_acl);
return 0;
}
+/*
+ * Gather per-bucket usage for every bucket listed for user_id.
+ *
+ * Buckets are read in pages of rgw_list_buckets_max_chunk entries; for each
+ * bucket the stats are fetched with cls_user_get_bucket_stats() and stored
+ * in buckets_usage_map keyed by bucket name. Paging stops once a page comes
+ * back with fewer entries than requested.
+ *
+ * Returns 0 on success, or the first negative error encountered.
+ */
+int rgw_user_get_all_buckets_stats(RGWRados *store, const rgw_user& user_id, map<string, cls_user_bucket_entry>&buckets_usage_map)
+{
+  CephContext *cct = store->ctx();
+  size_t max_entries = cct->_conf->rgw_list_buckets_max_chunk;
+  bool is_truncated;
+  string marker;
+
+  for (;;) {
+    RGWUserBuckets user_buckets;
+    int ret = rgw_read_user_buckets(store, user_id, user_buckets, marker,
+                                    string(), max_entries, false, &is_truncated);
+    if (ret < 0) {
+      ldout(cct, 0) << "failed to read user buckets: ret=" << ret << dendl;
+      return ret;
+    }
+
+    map<string, RGWBucketEnt>& buckets = user_buckets.get_buckets();
+    for (const auto& kv : buckets) {
+      /* remember the last key seen so the next page resumes after it */
+      marker = kv.first;
+
+      cls_user_bucket_entry entry;
+      ret = store->cls_user_get_bucket_stats(kv.second.bucket, entry);
+      if (ret < 0) {
+        ldout(cct, 0) << "ERROR: could not get bucket stats: ret=" << ret << dendl;
+        return ret;
+      }
+      buckets_usage_map.emplace(kv.second.bucket.name, entry);
+    }
+
+    /* a short page means there is nothing left to list */
+    if (buckets.size() < max_entries) {
+      break;
+    }
+  }
+
+  return 0;
+}
+
/**
* Save the given user information to storage.
* Returns: 0 on success, -ERR# on failure.
WRITE_CLASS_ENCODER(RGWUID)
extern int rgw_user_sync_all_stats(RGWRados *store, const rgw_user& user_id);
+extern int rgw_user_get_all_buckets_stats(RGWRados *store, const rgw_user& user_id, map<string, cls_user_bucket_entry>&buckets_usage_map);
+
/**
* Get the anonymous (ie, unauthenticated) user info.
*/
}
void set_user_email(std::string& email) {
- if (email.empty())
- return;
-
- /* always lowercase email address */
+ /* always lowercase email address */
boost::algorithm::to_lower(email);
user_email = email;
user_email_specified = true;
#ifndef RGW_WEBSITE_H
#define RGW_WEBSITE_H
+#include <list>
+#include <string>
+
#include "rgw_xml.h"
struct RGWRedirectInfo
{
- string protocol;
- string hostname;
- uint16_t http_redirect_code;
+ std::string protocol;
+ std::string hostname;
+ uint16_t http_redirect_code = 0;
void encode(bufferlist& bl) const {
ENCODE_START(1, 1, bl);
struct RGWBWRedirectInfo
{
RGWRedirectInfo redirect;
- string replace_key_prefix_with;
- string replace_key_with;
+ std::string replace_key_prefix_with;
+ std::string replace_key_with;
void encode(bufferlist& bl) const {
ENCODE_START(1, 1, bl);
struct RGWBWRoutingRuleCondition
{
- string key_prefix_equals;
+ std::string key_prefix_equals;
uint16_t http_error_code_returned_equals;
void encode(bufferlist& bl) const {
void decode_json(JSONObj *obj);
void decode_xml(XMLObj *obj);
- bool check_key_condition(const string& key);
+ bool check_key_condition(const std::string& key);
bool check_error_code_condition(const int error_code) {
return (uint16_t)error_code == http_error_code_returned_equals;
}
void decode_json(JSONObj *obj);
void decode_xml(XMLObj *obj);
- bool check_key_condition(const string& key) {
+ bool check_key_condition(const std::string& key) {
return condition.check_key_condition(key);
}
bool check_error_code_condition(int error_code) {
return condition.check_error_code_condition(error_code);
}
- void apply_rule(const string& default_protocol, const string& default_hostname, const string& key, string *redirect, int *redirect_code);
+ void apply_rule(const std::string& default_protocol,
+ const std::string& default_hostname,
+ const std::string& key,
+ std::string *redirect,
+ int *redirect_code);
};
WRITE_CLASS_ENCODER(RGWBWRoutingRule)
struct RGWBWRoutingRules
{
- list<RGWBWRoutingRule> rules;
+ std::list<RGWBWRoutingRule> rules;
void encode(bufferlist& bl) const {
ENCODE_START(1, 1, bl);
void dump_xml(Formatter *f) const;
void decode_json(JSONObj *obj);
- bool check_key_condition(const string& key, RGWBWRoutingRule **rule);
+ bool check_key_condition(const std::string& key, RGWBWRoutingRule **rule);
bool check_error_code_condition(int error_code, RGWBWRoutingRule **rule);
- bool check_key_and_error_code_condition(const string& key, const int error_code, RGWBWRoutingRule **rule);
+ bool check_key_and_error_code_condition(const std::string& key,
+ const int error_code,
+ RGWBWRoutingRule **rule);
};
WRITE_CLASS_ENCODER(RGWBWRoutingRules)
--- /dev/null
+#!/bin/bash
+
+# This can be run from e.g. the senta machines which have docker available. You
+# may need to run this script with sudo.
+#
+# Once you have booted into the image, you should be able to debug the core file:
+# $ gdb -q /ceph/teuthology-archive/.../coredump/1500013578.8678.core
+#
+# You may want to install other packages (yum) as desired.
+#
+# Once you're finished, please delete old images in a timely fashion.
+
+set -e
+
+CACHE=""
+
+# Echo the command line (all arguments joined by spaces) and then execute it
+# with the arguments passed through unchanged.
+function run {
+ printf "%s\n" "$*"
+ "$@"
+}
+
+# Build a docker image with the ceph packages (and debug symbols) published
+# on shaman for a branch, then start an interactive container from it with
+# /ceph mounted read-only.
+#
+# Usage: main [--no-cache] [<branch> [<environment>]]
+#   --no-cache     passed through to `docker build`
+#   <branch>       ceph branch; prompted for when omitted, default "master"
+#   <environment>  base image; prompted for when omitted, default "centos:7".
+#                  Environments containing "ubuntu" use the apt-based
+#                  Dockerfile, everything else the yum-based one.
+function main {
+    # Capture getopt's output first so that a parse failure aborts the
+    # script, then quote it so eval sees getopt's own quoting untouched
+    # (no word splitting or globbing beforehand).
+    opts=$(getopt --name "$0" --options 'h' --longoptions 'help,no-cache' -- "$@") || exit 1
+    eval set -- "$opts"
+
+    while [ "$#" -gt 0 ]; do
+        case "$1" in
+            -h|--help)
+                printf '%s: [--no-cache] <branch> <environment>\n' "$0"
+                exit 0
+                ;;
+            --no-cache)
+                CACHE="--no-cache"
+                shift
+                ;;
+            --)
+                shift
+                break
+                ;;
+            *)
+                # Not reachable with the option list above; guards against
+                # looping forever if an option is added to getopt but not here.
+                printf '%s: unexpected argument: %s\n' "$0" "$1" >&2
+                exit 1
+                ;;
+        esac
+    done
+
+    if [ -z "$1" ]; then
+        printf "specify the branch [default \"master\"]: "
+        read -r branch
+        if [ -z "$branch" ]; then
+            branch=master
+        fi
+    else
+        branch="$1"
+    fi
+    printf "branch: %s\n" "$branch"
+
+    if [ -z "$2" ]; then
+        printf "specify the build environment [default \"centos:7\"]: "
+        read -r env
+        if [ -z "$env" ]; then
+            env=centos:7
+        fi
+    else
+        env="$2"
+    fi
+    printf "env: %s\n" "$env"
+
+    # Prefer the invoking user's name when running under sudo.
+    if [ -n "$SUDO_USER" ]; then
+        user="$SUDO_USER"
+    elif [ -n "$USER" ]; then
+        user="$USER"
+    else
+        user="$(whoami)"
+    fi
+
+    image="${user}/ceph-ci:${branch}-${env/:/-}"
+
+    T=$(mktemp -d)
+    # With `set -e` a failed build exits immediately; make sure the scratch
+    # directory is removed in that case too.
+    trap 'rm -rf -- "$T"' EXIT
+    pushd "$T"
+    # $CACHE is intentionally unquoted below: when empty it must expand to
+    # no argument at all.
+    if [[ "$env" == *ubuntu* ]]; then
+        # Docker makes it impossible to access anything outside the CWD : /
+        cp -- /ceph/shaman/cephdev.asc .
+        cat > Dockerfile <<EOF
+FROM ${env}
+
+WORKDIR /root
+RUN apt-get update --yes --quiet && \
+    apt-get install --yes --quiet screen wget gdb software-properties-common python-software-properties apt-transport-https
+COPY cephdev.asc cephdev.asc
+RUN apt-key add cephdev.asc
+RUN add-apt-repository "\$(wget --quiet -O - https://shaman.ceph.com/api/repos/ceph/${branch}/latest/${env/://}/repo)" && \
+    apt-get update --yes && \
+    apt-get install --yes --allow-unauthenticated ceph ceph-mds-dbg ceph-mgr-dbg ceph-mon-dbg ceph-fuse-dbg ceph-test-dbg radosgw-dbg
+EOF
+        time run docker build $CACHE --tag "$image" .
+    else # try RHEL flavor
+        time run docker build $CACHE --tag "$image" - <<EOF
+FROM ${env}
+
+WORKDIR /root
+RUN yum update -y && \
+    yum install -y screen epel-release wget psmisc ca-certificates gdb
+RUN wget -O /etc/yum.repos.d/ceph-dev.repo https://shaman.ceph.com/api/repos/ceph/${branch}/latest/centos/7/repo && \
+    yum clean all && \
+    yum upgrade -y && \
+    yum install -y ceph ceph-debuginfo ceph-fuse
+EOF
+    fi
+    popd
+    rm -rf -- "$T"
+    trap - EXIT
+
+    printf "built image %s\n" "$image"
+
+    run docker run -ti -v /ceph:/ceph:ro "$image"
+    return 0
+}
+
+main "$@"
fixes_re = re.compile(r"Fixes\:? #(\d+)")
+reviewed_by_re = re.compile(r"Rev(.*)By", re.IGNORECASE)
# labels is the list of relevant labels defined for github.com/ceph/ceph
labels = ['bluestore', 'build/ops', 'cephfs', 'common', 'core', 'mgr',
'mon', 'performance', 'pybind', 'rdma', 'rgw', 'rbd', 'tests',
'rbd', 'rbd-mirror', 'rbd-nbd', 'rgw', 'tests', 'tools']
signed_off_re = re.compile("Signed-off-by: (.+) <")
tracker_re = re.compile("http://tracker.ceph.com/issues/(\d+)")
+rst_link_re = re.compile(r"([a-zA-Z0-9])_(\W)")
tracker_uri = "http://tracker.ceph.com/issues/{0}.json"
else:
return 'UNKNOWN: ' + title
+def _title_message(commit, pr, strict):
+    """Derive the release-note title and optional body for a merged PR.
+
+    Returns a ``(title, message)`` tuple. ``title`` is the PR title unless
+    the merge commit body consists of exactly one usable line, in which
+    case that line replaces it. ``message`` is the remaining merge commit
+    body joined into one string when it has more than one usable line,
+    otherwise ``None``.
+
+    In ``strict`` mode the merge commit body is ignored entirely. Blank
+    lines and lines matching ``reviewed_by_re`` are never part of the body.
+    """
+    title = pr['title']
+    if strict:
+        return (title, None)
+    lines = []
+    # skip the merge summary line; only the body is of interest
+    for line in commit.message.split('\n')[1:]:
+        # strip first: the regex is anchored at the start of the string, so
+        # an indented "Reviewed-by:" trailer would otherwise slip through
+        line = line.strip()
+        if not line or reviewed_by_re.match(line):
+            continue
+        lines.append(line)
+    if not lines:
+        return (title, None)
+    duplicates_pr_title = lines[0] == pr['title'].strip()
+    if duplicates_pr_title:
+        return (title, None)
+    if len(lines) == 1:
+        # assume that a single line means the intention is to
+        # re-write the PR title
+        return (lines[0], None)
+    message = " " + "\n ".join(lines)
+    return (title, message)
def make_release_notes(gh, repo, ref, plaintext, verbose, strict, use_tags):
for commit in repo.iter_commits(ref, merges=True):
merge = merge_re.match(commit.summary)
- if merge:
- number = merge.group(1)
- print ("Considering PR#" + number)
- # do not pick up ceph/ceph-qa-suite.git PRs
- if int(number) < 1311:
- print ("Ignoring low-numbered PR, probably picked up from"
- " ceph/ceph-qa-suite.git")
- continue
- pr = gh.repos("ceph")("ceph").pulls(number).get()
- title = pr['title']
- message = None
- message_lines = commit.message.split('\n')
- if not strict and len(message_lines) > 1:
- lines = []
- for line in message_lines[1:]:
- if 'Reviewed-by' in line:
- continue
- line = line.strip()
- if line:
- lines.append(line)
- if len(lines) == 0:
- continue
- duplicates_pr_title = lines[0] == pr['title'].strip()
- if duplicates_pr_title:
- continue
- assert len(lines) > 0, "missing message content"
- if len(lines) == 1:
- # assume that a single line means the intention is to
- # re-write the PR title
- title = lines[0]
- message = None
- else:
- message = " " + "\n ".join(lines)
- issues = []
- if pr['body']:
- issues = fixes_re.findall(pr['body']) + tracker_re.findall(
- pr['body']
- )
-
- authors = {}
- for c in repo.iter_commits(
- "{sha1}^1..{sha1}^2".format(sha1=commit.hexsha)
- ):
- for author in re.findall(
- "Signed-off-by:\s*(.*?)\s*<", c.message
- ):
- authors[author] = 1
- issues.extend(fixes_re.findall(c.message) +
- tracker_re.findall(c.message))
- if authors:
- author = ", ".join(authors.keys())
- else:
- author = commit.parents[-1].author.name
+ if not merge:
+ continue
+ number = merge.group(1)
+ print ("Considering PR#" + number)
+ # do not pick up ceph/ceph-qa-suite.git PRs
+ if int(number) < 1311:
+ print ("Ignoring low-numbered PR, probably picked up from"
+ " ceph/ceph-qa-suite.git")
+ continue
+ pr = gh.repos("ceph")("ceph").pulls(number).get()
+ (title, message) = _title_message(commit, pr, strict)
+ issues = []
+ if pr['body']:
+ issues = fixes_re.findall(pr['body']) + tracker_re.findall(
+ pr['body']
+ )
- if strict and not issues:
+ authors = {}
+ for c in repo.iter_commits(
+ "{sha1}^1..{sha1}^2".format(sha1=commit.hexsha)
+ ):
+ for author in re.findall(
+ "Signed-off-by:\s*(.*?)\s*<", c.message
+ ):
+ authors[author] = 1
+ issues.extend(fixes_re.findall(c.message) +
+ tracker_re.findall(c.message))
+ if authors:
+ author = ", ".join(authors.keys())
+ else:
+ author = commit.parents[-1].author.name
+
+ if strict and not issues:
+ print ("ERROR: https://github.com/ceph/ceph/pull/" +
+ str(number) + " has no associated issue")
+ continue
+
+ if strict:
+ title_re = (
+ '^(?:hammer|infernalis|jewel|kraken):\s+(' +
+ '|'.join(prefixes) +
+ ')(:.*)'
+ )
+ match = re.match(title_re, title)
+ if not match:
print ("ERROR: https://github.com/ceph/ceph/pull/" +
- str(number) + " has no associated issue")
- continue
-
+ str(number) + " title " + title.encode("utf-8") +
+ " does not match " + title_re)
+ else:
+ title = match.group(1) + match.group(2)
+ if use_tags:
+ title = split_component(title, gh, number)
+
+ title = title.strip(' \t\n\r\f\v\.\,\;\:\-\=')
+ # escape asterisks, which are used by reStructuredText for inline
+ # emphasis
+ title = title.replace('*', '\*')
+ # and escape underscores that end a word, which reStructuredText would read as a link
+ title = rst_link_re.sub(r'\1\_\2', title)
+ pr2info[number] = (author, title, message)
+
+ for issue in set(issues):
if strict:
- title_re = (
- '^(?:hammer|infernalis|jewel|kraken):\s+(' +
- '|'.join(prefixes) +
- ')(:.*)'
- )
- match = re.match(title_re, title)
- if not match:
- print ("ERROR: https://github.com/ceph/ceph/pull/" +
- str(number) + " title " + title.encode("utf-8") +
- " does not match " + title_re)
- else:
- title = match.group(1) + match.group(2)
- if use_tags:
- title = split_component(title, gh, number)
-
- title = title.strip(' \t\n\r\f\v\.\,\;\:\-\=')
- # escape asterisks, which is used by reStructuredTextrst for inline
- # emphasis
- title = title.replace('*', '\*')
- pr2info[number] = (author, title, message)
-
- for issue in set(issues):
- if strict:
- issue = get_original_issue(issue, verbose)
- issue2prs.setdefault(issue, set([])).add(number)
- pr2issues.setdefault(number, set([])).add(issue)
- sys.stdout.write('.')
+ issue = get_original_issue(issue, verbose)
+ issue2prs.setdefault(issue, set([])).add(number)
+ pr2issues.setdefault(number, set([])).add(issue)
+ sys.stdout.write('.')
print (" done collecting merges.")
if strict:
- for (issue, prs) in issue2prs.iteritems():
+ for (issue, prs) in issue2prs.items():
if len(prs) > 1:
print (">>>>>>> " + str(len(prs)) + " pr for issue " +
issue + " " + str(prs))
for (pr, (author, title, message)) in sorted(
- pr2info.iteritems(), key=lambda (k, v): (v[2], v[1])
+ pr2info.items(), key=lambda title: title[1][1]
):
if pr in pr2issues:
if plaintext:
test -d dev/osd0/. && test -e dev/sudo && SUDO="sudo"
if [ -e CMakeCache.txt ]; then
- [ -z "$CEPH_BIN" ] && CEPH_BIN=src
-else
[ -z "$CEPH_BIN" ] && CEPH_BIN=bin
fi
pkill -u $MYUID -f valgrind.bin.\*ceph-mon
$SUDO pkill -u $MYUID -f valgrind.bin.\*ceph-osd
pkill -u $MYUID -f valgrind.bin.\*ceph-mds
+ asok_dir=`dirname $("${CEPH_BIN}"/ceph-conf --show-config-value admin_socket)`
+ rm -rf "${asok_dir}"
else
[ $stop_mon -eq 1 ] && do_killall ceph-mon
[ $stop_mds -eq 1 ] && do_killall ceph-mds
add_dependencies(tests ceph-detect-init)
endif(NOT FREEBSD)
-add_ceph_test(test-ceph-helpers.sh ${CMAKE_CURRENT_SOURCE_DIR}/test-ceph-helpers.sh)
-add_ceph_test(erasure-decode-non-regression.sh ${CMAKE_SOURCE_DIR}/qa/workunits/erasure-code/encode-decode-non-regression.sh)
-
-add_ceph_test(ceph_objectstore_tool.py ${CMAKE_CURRENT_SOURCE_DIR}/ceph_objectstore_tool.py)
-if(WITH_LIBCEPHFS)
- add_ceph_test(cephtool-test-mds.sh ${CMAKE_CURRENT_SOURCE_DIR}/cephtool-test-mds.sh)
-endif(WITH_LIBCEPHFS)
-add_ceph_test(cephtool-test-mon.sh ${CMAKE_CURRENT_SOURCE_DIR}/cephtool-test-mon.sh)
-add_ceph_test(cephtool-test-osd.sh ${CMAKE_CURRENT_SOURCE_DIR}/cephtool-test-osd.sh)
-add_ceph_test(cephtool-test-rados.sh ${CMAKE_CURRENT_SOURCE_DIR}/cephtool-test-rados.sh)
if(WITH_RBD)
add_ceph_test(run-rbd-unit-tests.sh ${CMAKE_CURRENT_SOURCE_DIR}/run-rbd-unit-tests.sh)
endif(WITH_RBD)
add_ceph_test(test_objectstore_memstore.sh ${CMAKE_CURRENT_SOURCE_DIR}/test_objectstore_memstore.sh)
add_ceph_test(test_pidfile.sh ${CMAKE_CURRENT_SOURCE_DIR}/test_pidfile.sh)
add_ceph_test(test_subman.sh ${CMAKE_CURRENT_SOURCE_DIR}/test_subman.sh)
+add_ceph_test(smoke.sh ${CMAKE_CURRENT_SOURCE_DIR}/smoke.sh)
add_ceph_test(unittest_bufferlist.sh ${CMAKE_SOURCE_DIR}/src/unittest_bufferlist.sh)
add_test(NAME run-tox-ceph-disk COMMAND bash ${CMAKE_SOURCE_DIR}/src/ceph-disk/run-tox.sh)
shell=True)
weights = []
for line in output.strip().split('\n'):
- osd_id, weight, osd_name = re.split('\s+', line)
- weights.append(float(weight))
+        print(line)
+        # raw string: '\s' in a plain literal is an invalid escape sequence
+        linev = re.split(r'\s+', line)
+        # a line with leading whitespace yields an empty first field;
+        # compare by value, `is` only tests object identity
+        if linev[0] == '':
+            linev.pop(0)
+        print('linev %s' % linev)
+        # the weight is the second whitespace-separated field
+        weights.append(float(linev[1]))
return weights
+++ /dev/null
-#!/bin/bash
-#
-# Copyright (C) 2014, 2015 Red Hat <contact@redhat.com>
-# Copyright (C) 2013 Cloudwatt <libre.licensing@cloudwatt.com>
-#
-# Author: Loic Dachary <loic@dachary.org>
-#
-# This program is free software; you can redistribute it and/or modify
-# it under the terms of the GNU Library Public License as published by
-# the Free Software Foundation; either version 2, or (at your option)
-# any later version.
-#
-# This program is distributed in the hope that it will be useful,
-# but WITHOUT ANY WARRANTY; without even the implied warranty of
-# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
-# GNU Library Public License for more details.
-#
-source $(dirname $0)/detect-build-env-vars.sh
-
-CEPH_CLI_TEST_DUP_COMMAND=1 \
-MDS=1 MON=1 OSD=3 MGR=1 CEPH_PORT=7200 CEPH_OBJECTSTORE="bluestore" $CEPH_ROOT/src/test/vstart_wrapper.sh \
- $CEPH_ROOT/qa/workunits/cephtool/test.sh \
- --test-mds \
- --asok-does-not-need-root
+++ /dev/null
-#!/bin/bash
-#
-# Copyright (C) 2014, 2015 Red Hat <contact@redhat.com>
-# Copyright (C) 2013 Cloudwatt <libre.licensing@cloudwatt.com>
-#
-# Author: Loic Dachary <loic@dachary.org>
-#
-# This program is free software; you can redistribute it and/or modify
-# it under the terms of the GNU Library Public License as published by
-# the Free Software Foundation; either version 2, or (at your option)
-# any later version.
-#
-# This program is distributed in the hope that it will be useful,
-# but WITHOUT ANY WARRANTY; without even the implied warranty of
-# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
-# GNU Library Public License for more details.
-#
-source $(dirname $0)/detect-build-env-vars.sh
-
-CEPH_CLI_TEST_DUP_COMMAND=1 \
-# uses CEPH_PORT going from 7202 7203 and 7204 because
-# it starts at 7202 and runs 3 mons (see vstart.sh)
-MON=3 OSD=4 MDS=0 MGR=1 CEPH_PORT=7202 $CEPH_ROOT/src/test/vstart_wrapper.sh \
- $CEPH_ROOT/qa/workunits/cephtool/test.sh \
- --test-mon \
- --asok-does-not-need-root
+++ /dev/null
-#!/bin/bash
-#
-# Copyright (C) 2014, 2015 Red Hat <contact@redhat.com>
-# Copyright (C) 2013 Cloudwatt <libre.licensing@cloudwatt.com>
-#
-# Author: Loic Dachary <loic@dachary.org>
-#
-# This program is free software; you can redistribute it and/or modify
-# it under the terms of the GNU Library Public License as published by
-# the Free Software Foundation; either version 2, or (at your option)
-# any later version.
-#
-# This program is distributed in the hope that it will be useful,
-# but WITHOUT ANY WARRANTY; without even the implied warranty of
-# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
-# GNU Library Public License for more details.
-#
-source $(dirname $0)/detect-build-env-vars.sh
-
-CEPH_CLI_TEST_DUP_COMMAND=1 \
-MON=1 OSD=3 MDS=0 MGR=1 CEPH_PORT=7201 $CEPH_ROOT/src/test/vstart_wrapper.sh \
- $CEPH_ROOT/qa/workunits/cephtool/test.sh \
- --test-osd \
- --asok-does-not-need-root
+++ /dev/null
-#!/bin/bash
-#
-# Copyright (C) 2015 Red Hat <contact@redhat.com>
-#
-# Author: David Zafman <dzafman@redhat.com>
-#
-# This program is free software; you can redistribute it and/or modify
-# it under the terms of the GNU Library Public License as published by
-# the Free Software Foundation; either version 2, or (at your option)
-# any later version.
-#
-# This program is distributed in the hope that it will be useful,
-# but WITHOUT ANY WARRANTY; without even the implied warranty of
-# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
-# GNU Library Public License for more details.
-#
-source $(dirname $0)/detect-build-env-vars.sh
-
-CEPH_CLI_TEST_DUP_COMMAND=1 \
-MON=1 OSD=3 MDS=0 MGR=1 CEPH_PORT=7205 $CEPH_ROOT/src/test/vstart_wrapper.sh \
- $CEPH_ROOT/src/test/test_rados_tool.sh
# rules
rule data {
- \truleset 0 (esc)
+ \tid 0 (esc)
\ttype replicated (esc)
\tmin_size 1 (esc)
\tmax_size 10 (esc)
\tstep emit (esc)
}
rule metadata {
- \truleset 1 (esc)
+ \tid 1 (esc)
\ttype replicated (esc)
\tmin_size 1 (esc)
\tmax_size 10 (esc)
\tstep emit (esc)
}
rule rbd {
- \truleset 2 (esc)
+ \tid 2 (esc)
\ttype replicated (esc)
\tmin_size 1 (esc)
\tmax_size 10 (esc)
\tstep emit (esc)
}
rule simple-rule {
- \truleset 3 (esc)
+ \tid 3 (esc)
\ttype replicated (esc)
\tmin_size 1 (esc)
\tmax_size 10 (esc)
# rules
rule data {
- \truleset 0 (esc)
+ \tid 0 (esc)
\ttype replicated (esc)
\tmin_size 1 (esc)
\tmax_size 10 (esc)
\tstep emit (esc)
}
rule metadata {
- \truleset 1 (esc)
+ \tid 1 (esc)
\ttype replicated (esc)
\tmin_size 1 (esc)
\tmax_size 10 (esc)
\tstep emit (esc)
}
rule rbd {
- \truleset 2 (esc)
+ \tid 2 (esc)
\ttype replicated (esc)
\tmin_size 1 (esc)
\tmax_size 10 (esc)
$ map="$TESTDIR/foo"
$ crushtool --outfn "$map" --build --set-chooseleaf-vary-r 0 --set-chooseleaf-stable 0 --num_osds 25 node straw 5 rack straw 1 root straw 0 --reweight-item osd.2 99 -o "$map" --tree
crushtool reweighting item osd.2 to 99
- ID\tWEIGHT\tTYPE NAME (esc)
- -11\t123.00000\troot root (esc)
- -6\t103.00000\t\track rack0 (esc)
- -1\t103.00000\t\t\tnode node0 (esc)
- 0\t1.00000\t\t\t\tosd.0 (esc)
- 1\t1.00000\t\t\t\tosd.1 (esc)
- 2\t99.00000\t\t\t\tosd.2 (esc)
- 3\t1.00000\t\t\t\tosd.3 (esc)
- 4\t1.00000\t\t\t\tosd.4 (esc)
- -7\t5.00000\t\track rack1 (esc)
- -2\t5.00000\t\t\tnode node1 (esc)
- 5\t1.00000\t\t\t\tosd.5 (esc)
- 6\t1.00000\t\t\t\tosd.6 (esc)
- 7\t1.00000\t\t\t\tosd.7 (esc)
- 8\t1.00000\t\t\t\tosd.8 (esc)
- 9\t1.00000\t\t\t\tosd.9 (esc)
- -8\t5.00000\t\track rack2 (esc)
- -3\t5.00000\t\t\tnode node2 (esc)
- 10\t1.00000\t\t\t\tosd.10 (esc)
- 11\t1.00000\t\t\t\tosd.11 (esc)
- 12\t1.00000\t\t\t\tosd.12 (esc)
- 13\t1.00000\t\t\t\tosd.13 (esc)
- 14\t1.00000\t\t\t\tosd.14 (esc)
- -9\t5.00000\t\track rack3 (esc)
- -4\t5.00000\t\t\tnode node3 (esc)
- 15\t1.00000\t\t\t\tosd.15 (esc)
- 16\t1.00000\t\t\t\tosd.16 (esc)
- 17\t1.00000\t\t\t\tosd.17 (esc)
- 18\t1.00000\t\t\t\tosd.18 (esc)
- 19\t1.00000\t\t\t\tosd.19 (esc)
- -10\t5.00000\t\track rack4 (esc)
- -5\t5.00000\t\t\tnode node4 (esc)
- 20\t1.00000\t\t\t\tosd.20 (esc)
- 21\t1.00000\t\t\t\tosd.21 (esc)
- 22\t1.00000\t\t\t\tosd.22 (esc)
- 23\t1.00000\t\t\t\tosd.23 (esc)
- 24\t1.00000\t\t\t\tosd.24 (esc)
+ ID WEIGHT TYPE NAME
+ -11 123.00000 root root
+ -6 103.00000 rack rack0
+ -1 103.00000 node node0
+ 0 1.00000 osd.0
+ 1 1.00000 osd.1
+ 2 99.00000 osd.2
+ 3 1.00000 osd.3
+ 4 1.00000 osd.4
+ -7 5.00000 rack rack1
+ -2 5.00000 node node1
+ 5 1.00000 osd.5
+ 6 1.00000 osd.6
+ 7 1.00000 osd.7
+ 8 1.00000 osd.8
+ 9 1.00000 osd.9
+ -8 5.00000 rack rack2
+ -3 5.00000 node node2
+ 10 1.00000 osd.10
+ 11 1.00000 osd.11
+ 12 1.00000 osd.12
+ 13 1.00000 osd.13
+ 14 1.00000 osd.14
+ -9 5.00000 rack rack3
+ -4 5.00000 node node3
+ 15 1.00000 osd.15
+ 16 1.00000 osd.16
+ 17 1.00000 osd.17
+ 18 1.00000 osd.18
+ 19 1.00000 osd.19
+ -10 5.00000 rack rack4
+ -5 5.00000 node node4
+ 20 1.00000 osd.20
+ 21 1.00000 osd.21
+ 22 1.00000 osd.22
+ 23 1.00000 osd.23
+ 24 1.00000 osd.24
$ crushtool -d "$map"
# begin crush map
tunable choose_local_tries 0
# rules
rule replicated_rule {
- \truleset 0 (esc)
+ \tid 0 (esc)
\ttype replicated (esc)
\tmin_size 1 (esc)
\tmax_size 10 (esc)
# rules
rule replicated_rule {
- \truleset 0 (esc)
+ \tid 0 (esc)
\ttype replicated (esc)
\tmin_size 1 (esc)
\tmax_size 10 (esc)
- $ crushtool -c "$TESTDIR/check-overlapped-rules.crushmap.txt" -o "$TESTDIR/check-overlapped-rules.crushmap"
$ crushtool -i "$TESTDIR/check-overlapped-rules.crushmap" --check
overlapped rules in ruleset 0: rule-r0, rule-r1, rule-r2
overlapped rules in ruleset 0: rule-r0, rule-r2, rule-r3
overlapped rules in ruleset 0: rule-r0, rule-r3
- $ rm -f "$TESTDIR/check-overlapped-rules.crushmap"
+ $ crushtool -c "$TESTDIR/check-overlapped-rules.crushmap.txt" -o "$TESTDIR/check-overlapped-rules.crushmap.new"
+ rule 0 already exists
+ [1]
# rules
rule data {
- ruleset 3
+ id 3
type replicated
min_size 2
max_size 2
],
"rules": [
{
- "rule_id": 0,
+ "rule_id": 3,
"rule_name": "data",
"ruleset": 3,
"type": 1,
# rules
rule data-ssd {
- ruleset 1
+ id 1
type replicated
min_size 2
max_size 2
step emit
}
rule data-hdd {
- ruleset 2
+ id 2
type replicated
min_size 2
max_size 2
step emit
}
rule data {
- ruleset 3
+ id 3
type replicated
min_size 2
max_size 2
# rules
rule data {
- ruleset 0
+ id 0
type replicated
min_size 1
max_size 10
step emit
}
rule metadata {
- ruleset 1
+ id 1
type replicated
min_size 1
max_size 10
step emit
}
rule rbd {
- ruleset 2
+ id 2
type replicated
min_size 1
max_size 10
# rules
rule data {
- ruleset 1
+ id 1
type replicated
min_size 2
max_size 2
# rules
rule data {
- \truleset 0 (esc)
+ \tid 0 (esc)
\ttype replicated (esc)
\tmin_size 1 (esc)
\tmax_size 10 (esc)
\tstep emit (esc)
}
rule foo {
- \truleset 1 (esc)
+ \tid 1 (esc)
\ttype replicated (esc)
\tmin_size 1 (esc)
\tmax_size 10 (esc)
# rules
rule data {
- \truleset 0 (esc)
+ \tid 0 (esc)
\ttype replicated (esc)
\tmin_size 1 (esc)
\tmax_size 10 (esc)
\tstep emit (esc)
}
rule foo-ssd {
- \truleset 1 (esc)
+ \tid 1 (esc)
\ttype replicated (esc)
\tmin_size 1 (esc)
\tmax_size 10 (esc)
# rules
rule choose {
- ruleset 1
+ id 0
type replicated
min_size 2
max_size 3
}
rule choose-two {
- ruleset 2
+ id 1
type replicated
min_size 2
max_size 3
}
rule chooseleaf {
- ruleset 3
+ id 2
type replicated
min_size 2
max_size 3
}
rule choose-set {
- ruleset 4
+ id 3
type replicated
min_size 2
max_size 3
}
rule choose-set-two {
- ruleset 5
+ id 4
type replicated
min_size 2
max_size 3
}
rule chooseleaf-set {
- ruleset 6
+ id 5
type replicated
min_size 2
max_size 3
# rules
rule data {
- ruleset 0
+ id 0
type replicated
min_size 1
max_size 10
step emit
}
rule metadata {
- ruleset 1
+ id 1
type replicated
min_size 1
max_size 10
step emit
}
rule rbd {
- ruleset 2
+ id 2
type replicated
min_size 1
max_size 10
# rules
rule data {
- ruleset 0
+ id 0
type replicated
min_size 1
max_size 10
step emit
}
rule metadata {
- ruleset 1
+ id 1
type replicated
min_size 1
max_size 10
step emit
}
rule rbd {
- ruleset 2
+ id 2
type replicated
min_size 1
max_size 10
# rules
rule data {
- ruleset 0
+ id 0
type replicated
min_size 1
max_size 10
step emit
}
rule metadata {
- ruleset 1
+ id 1
type replicated
min_size 1
max_size 10
step emit
}
rule rbd {
- ruleset 2
+ id 2
type replicated
min_size 1
max_size 10
# rules
rule data {
- ruleset 0
+ id 0
type replicated
min_size 1
max_size 10
step emit
}
rule metadata {
- ruleset 1
+ id 1
type replicated
min_size 1
max_size 10
step emit
}
rule rbd {
- ruleset 2
+ id 2
type replicated
min_size 1
max_size 10
# rules
rule data {
- ruleset 0
+ id 0
type replicated
min_size 1
max_size 10
step emit
}
rule metadata {
- ruleset 1
+ id 1
type replicated
min_size 1
max_size 10
step emit
}
rule rbd {
- ruleset 2
+ id 2
type replicated
min_size 1
max_size 10
# rules
rule data {
- ruleset 0
+ id 0
type replicated
min_size 1
max_size 10
step emit
}
rule metadata {
- ruleset 1
+ id 1
type replicated
min_size 1
max_size 10
step emit
}
rule rbd {
- ruleset 2
+ id 2
type replicated
min_size 1
max_size 10
# rules
rule replicated_ruleset {
- ruleset 0
+ id 0
type replicated
min_size 1
max_size 10
# rules
rule data {
- ruleset 0
+ id 0
type replicated
min_size 1
max_size 10
step emit
}
rule metadata {
- ruleset 1
+ id 1
type replicated
min_size 1
max_size 10
step emit
}
rule rbd {
- ruleset 2
+ id 2
type replicated
min_size 1
max_size 10
nearfull_ratio 0
min_compat_client jewel
- pool 1 'rbd' replicated size 3 min_size 2 crush_rule 0 object_hash rjenkins pg_num 192 pgp_num 192 last_change 0 flags hashpspool stripe_width 0
+ pool 1 'rbd' replicated size 3 min_size 2 crush_rule 0 object_hash rjenkins pg_num 192 pgp_num 192 last_change 0 flags hashpspool stripe_width 0 application rbd
max_osd 3
nearfull_ratio 0
min_compat_client jewel
- pool 1 'rbd' replicated size 3 min_size 2 crush_rule 0 object_hash rjenkins pg_num 64 pgp_num 64 last_change 0 flags hashpspool stripe_width 0
+ pool 1 'rbd' replicated size 3 min_size 2 crush_rule 0 object_hash rjenkins pg_num 64 pgp_num 64 last_change 0 flags hashpspool stripe_width 0 application rbd
max_osd 1
# rules
rule replicated_rule {
- \truleset 0 (esc)
+ \tid 0 (esc)
\ttype replicated (esc)
\tmin_size 1 (esc)
\tmax_size 10 (esc)
nearfull_ratio 0
min_compat_client jewel
- pool 1 'rbd' replicated size 3 min_size 2 crush_rule 0 object_hash rjenkins pg_num 192 pgp_num 192 last_change 0 flags hashpspool stripe_width 0
+ pool 1 'rbd' replicated size 3 min_size 2 crush_rule 0 object_hash rjenkins pg_num 192 pgp_num 192 last_change 0 flags hashpspool stripe_width 0 application rbd
max_osd 3
osdmaptool: writing epoch 1 to myosdmap
$ osdmaptool --print myosdmap | grep 'pool 1'
osdmaptool: osdmap file 'myosdmap'
- pool 1 'rbd' replicated size 3 min_size 2 crush_rule 0 object_hash rjenkins pg_num 192 pgp_num 192 last_change 0 flags hashpspool stripe_width 0
+ pool 1 'rbd' replicated size 3 min_size 2 crush_rule 0 object_hash rjenkins pg_num 192 pgp_num 192 last_change 0 flags hashpspool stripe_width 0 application rbd
$ rm -f myosdmap
# rules
rule replicated_rule {
- \truleset 0 (esc)
+ \tid 0 (esc)
\ttype replicated (esc)
\tmin_size 1 (esc)
\tmax_size 10 (esc)
nearfull_ratio 0
min_compat_client jewel
- pool 1 'rbd' replicated size 3 min_size 2 crush_rule 0 object_hash rjenkins pg_num 15296 pgp_num 15296 last_change 0 flags hashpspool stripe_width 0
+ pool 1 'rbd' replicated size 3 min_size 2 crush_rule 0 object_hash rjenkins pg_num 15296 pgp_num 15296 last_change 0 flags hashpspool stripe_width 0 application rbd
max_osd 239
osdmaptool: writing epoch 1 to om
$ osdmaptool --print om | grep 'pool 1'
osdmaptool: osdmap file 'om'
- pool 1 'rbd' replicated size 3 min_size 2 crush_rule 0 object_hash rjenkins pg_num 15296 pgp_num 15296 last_change 0 flags hashpspool stripe_width 0
+ pool 1 'rbd' replicated size 3 min_size 2 crush_rule 0 object_hash rjenkins pg_num 15296 pgp_num 15296 last_change 0 flags hashpspool stripe_width 0 application rbd
$ rm -f om
$ osdmaptool --tree=plain om
osdmaptool: osdmap file 'om'
- ID CLASS WEIGHT TYPE NAME UP/DOWN REWEIGHT PRI-AFF
- -1 3.00000 root default
- -3 3.00000 rack localrack
- -2 3.00000 host localhost
- 0 1.00000 osd.0 DNE 0
- 1 1.00000 osd.1 DNE 0
- 2 1.00000 osd.2 DNE 0
+ ID CLASS WEIGHT TYPE NAME STATUS REWEIGHT PRI-AFF
+ -1 3.00000 root default
+ -3 3.00000 rack localrack
+ -2 3.00000 host localhost
+ 0 1.00000 osd.0 DNE 0
+ 1 1.00000 osd.1 DNE 0
+ 2 1.00000 osd.2 DNE 0
- $ osdmaptool --tree=json om
+ $ osdmaptool --tree=json-pretty om
osdmaptool: osdmap file 'om'
- {"nodes":[{"id":-1,"device_class":"","name":"default","type":"root","type_id":10,"children":[-3]},{"id":-3,"device_class":"","name":"localrack","type":"rack","type_id":3,"children":[-2]},{"id":-2,"device_class":"","name":"localhost","type":"host","type_id":1,"children":[2,1,0]},{"id":0,"device_class":"","name":"osd.0","type":"osd","type_id":0,"crush_weight":1.000000,"depth":3,"exists":0,"status":"down","reweight":0.000000,"primary_affinity":1.000000},{"id":1,"device_class":"","name":"osd.1","type":"osd","type_id":0,"crush_weight":1.000000,"depth":3,"exists":0,"status":"down","reweight":0.000000,"primary_affinity":1.000000},{"id":2,"device_class":"","name":"osd.2","type":"osd","type_id":0,"crush_weight":1.000000,"depth":3,"exists":0,"status":"down","reweight":0.000000,"primary_affinity":1.000000}],"stray":[]}
+ {
+ "nodes": [
+ {
+ "id": -1,
+ "name": "default",
+ "type": "root",
+ "type_id": 10,
+ "children": [
+ -3
+ ]
+ },
+ {
+ "id": -3,
+ "name": "localrack",
+ "type": "rack",
+ "type_id": 3,
+ "pool_weights": {},
+ "children": [
+ -2
+ ]
+ },
+ {
+ "id": -2,
+ "name": "localhost",
+ "type": "host",
+ "type_id": 1,
+ "pool_weights": {},
+ "children": [
+ 2,
+ 1,
+ 0
+ ]
+ },
+ {
+ "id": 0,
+ "name": "osd.0",
+ "type": "osd",
+ "type_id": 0,
+ "crush_weight": 1.000000,
+ "depth": 3,
+ "pool_weights": {},
+ "exists": 0,
+ "status": "down",
+ "reweight": 0.000000,
+ "primary_affinity": 1.000000
+ },
+ {
+ "id": 1,
+ "name": "osd.1",
+ "type": "osd",
+ "type_id": 0,
+ "crush_weight": 1.000000,
+ "depth": 3,
+ "pool_weights": {},
+ "exists": 0,
+ "status": "down",
+ "reweight": 0.000000,
+ "primary_affinity": 1.000000
+ },
+ {
+ "id": 2,
+ "name": "osd.2",
+ "type": "osd",
+ "type_id": 0,
+ "crush_weight": 1.000000,
+ "depth": 3,
+ "pool_weights": {},
+ "exists": 0,
+ "status": "down",
+ "reweight": 0.000000,
+ "primary_affinity": 1.000000
+ }
+ ],
+ "stray": []
+ }
+
+
$ rm -f om
bucket rm remove bucket
bucket check check bucket index
bucket reshard reshard bucket
+ bucket sync disable disable bucket sync
+ bucket sync enable enable bucket sync
bi get retrieve bucket index object entries
bi put store bucket index object entries
bi list list raw bucket index entries
--access=<access> Set access permissions for sub-user, should be one
of read, write, readwrite, full
--display-name=<name>
- --max_buckets max number of buckets for a user
+ --max-buckets max number of buckets for a user
--admin set the admin flag on the user
--system set the system flag on the user
--bucket=<bucket>
--tags-add=<list> list of tags to add for zonegroup placement modify command
--tags-rm=<list> list of tags to remove for zonegroup placement modify command
--endpoints=<list> zone endpoints
- --index_pool=<pool> placement target index pool
+ --index-pool=<pool> placement target index pool
--data-pool=<pool> placement target data pool
--data-extra-pool=<pool> placement target data extra (non-ec) pool
--placement-index-type=<type>
nbd unmap Unmap a nbd device.
object-map check Verify the object map is correct.
object-map rebuild Rebuild an invalid object map.
+ pool init Initialize pool for use by RBD.
remove (rm) Delete an image.
rename (mv) Rename image within pool.
resize Resize (expand or shrink) image.
snap limit set Limit the number of snapshots.
snap list (snap ls) Dump list of image snapshots.
snap protect Prevent a snapshot from being deleted.
- snap purge Deletes all snapshots.
- snap remove (snap rm) Deletes a snapshot.
+ snap purge Delete all snapshots.
+ snap remove (snap rm) Delete a snapshot.
snap rename Rename a snapshot.
snap rollback (snap revert) Rollback image to snapshot.
snap unprotect Allow a snapshot to be deleted.
status Show the status of this image.
trash list (trash ls) List trash images.
- trash move (trash mv) Moves an image to the trash.
- trash remove (trash rm) Removes an image from trash.
- trash restore Restores an image from trash.
+ trash move (trash mv) Move an image to the trash.
+ trash remove (trash rm) Remove an image from trash.
+ trash restore Restore an image from trash.
unmap Unmap a rbd device that was used by the kernel.
watch Watch events on image.
--snap arg snapshot name
--no-progress disable progress output
+ rbd help pool init
+ usage: rbd pool init [--pool <pool>] [--force]
+ <pool-name>
+
+ Initialize pool for use by RBD.
+
+ Positional arguments
+ <pool-name> pool name
+
+ Optional arguments
+ -p [ --pool ] arg pool name
+ --force force initialize pool for RBD use if registered by
+ another application
+
rbd help remove
usage: rbd remove [--pool <pool>] [--image <image>] [--no-progress]
<image-spec>
[--image-id <image-id>] [--no-progress]
<image-spec>
- Deletes all snapshots.
+ Delete all snapshots.
Positional arguments
<image-spec> image specification
[--no-progress] [--image-id <image-id>] [--force]
<snap-spec>
- Deletes a snapshot.
+ Delete a snapshot.
Positional arguments
<snap-spec> snapshot specification
usage: rbd trash move [--pool <pool>] [--image <image>] [--delay <delay>]
<image-spec>
- Moves an image to the trash.
+ Move an image to the trash.
Positional arguments
<image-spec> image specification
[--no-progress] [--force]
<image-id>
- Removes an image from trash.
+ Remove an image from trash.
Positional arguments
<image-id> image id
[--image <image>]
<image-id>
- Restores an image from trash.
+ Restore an image from trash.
Positional arguments
<image-id> image id
string id2 = "123456780";
std::map<string, cls::rbd::TrashImageSpec> entries;
- ASSERT_EQ(-ENOENT, trash_list(&ioctx, &entries));
+ ASSERT_EQ(-ENOENT, trash_list(&ioctx, "", 1024, &entries));
utime_t now1 = ceph_clock_now();
utime_t now1_delay = now1;
ASSERT_EQ(0, trash_remove(&ioctx, id));
ASSERT_EQ(-ENOENT, trash_remove(&ioctx, id));
- ASSERT_EQ(0, trash_list(&ioctx, &entries));
+ ASSERT_EQ(0, trash_list(&ioctx, "", 1024, &entries));
ASSERT_TRUE(entries.empty());
ASSERT_EQ(0, trash_add(&ioctx, id, trash_spec2));
ASSERT_EQ(0, trash_add(&ioctx, id2, trash_spec));
- ASSERT_EQ(0, trash_list(&ioctx, &entries));
-
- for (auto& entry : entries) {
- if (entry.first == id) {
- ASSERT_EQ(entry.second.source, cls::rbd::TRASH_IMAGE_SOURCE_MIRRORING);
- ASSERT_EQ(entry.second.name, "name2");
- ASSERT_EQ(entry.second.deletion_time, now2);
- ASSERT_EQ(entry.second.deferment_end_time, now2_delay);
- } else if (entry.first == id2) {
- ASSERT_EQ(entry.second.source, cls::rbd::TRASH_IMAGE_SOURCE_USER);
- ASSERT_EQ(entry.second.name, "name");
- ASSERT_EQ(entry.second.deletion_time, now1);
- ASSERT_EQ(entry.second.deferment_end_time, now1_delay);
- }
- }
+ ASSERT_EQ(0, trash_list(&ioctx, "", 1, &entries));
+ ASSERT_TRUE(entries.find(id2) != entries.end());
+ ASSERT_EQ(cls::rbd::TRASH_IMAGE_SOURCE_USER, entries[id2].source);
+ ASSERT_EQ(std::string("name"), entries[id2].name);
+ ASSERT_EQ(now1, entries[id2].deletion_time);
+ ASSERT_EQ(now1_delay, entries[id2].deferment_end_time);
+
+ ASSERT_EQ(0, trash_list(&ioctx, id2, 1, &entries));
+ ASSERT_TRUE(entries.find(id) != entries.end());
+ ASSERT_EQ(cls::rbd::TRASH_IMAGE_SOURCE_MIRRORING, entries[id].source);
+ ASSERT_EQ(std::string("name2"), entries[id].name);
+ ASSERT_EQ(now2, entries[id].deletion_time);
+ ASSERT_EQ(now2_delay, entries[id].deferment_end_time);
+
+ ASSERT_EQ(0, trash_list(&ioctx, id, 1, &entries));
+ ASSERT_TRUE(entries.empty());
cls::rbd::TrashImageSpec spec_res1;
ASSERT_EQ(0, trash_get(&ioctx, id, &spec_res1));
ASSERT_EQ(0, destroy_one_pool_pp(pool_name, rados));
}
+/*
+ * similar to test_implicit, just changes the order of the tags removal:
+ * the same tag is put twice and the second put must be a harmless no-op
+ * see issue #20107
+ */
+TEST(cls_rgw, test_implicit_idempotent) /* test refcount using implicit referencing of newly created objects */
+{
+ librados::Rados rados;
+ librados::IoCtx ioctx;
+ string pool_name = get_temp_pool_name();
+
+ /* create pool */
+ ASSERT_EQ("", create_one_pool_pp(pool_name, rados));
+ ASSERT_EQ(0, rados.ioctx_create(pool_name.c_str(), ioctx));
+
+ /* object id and the two tags used as references below */
+ string oid = "obj";
+ string oldtag = "oldtag";
+ string newtag = "newtag";
+
+
+ /* get on a missing object will fail */
+ librados::ObjectWriteOperation *op = new_op();
+ cls_refcount_get(*op, newtag, true);
+ ASSERT_EQ(-ENOENT, ioctx.operate(oid, op));
+ delete op;
+
+ /* create object */
+ ASSERT_EQ(0, ioctx.create(oid, true));
+
+ /* read reference, should return a single wildcard entry */
+
+ list<string> refs;
+
+ ASSERT_EQ(0, cls_refcount_read(ioctx, oid, &refs, true));
+ ASSERT_EQ(1, (int)refs.size());
+
+ /* the wildcard entry is the empty (default-constructed) tag */
+ string wildcard_tag;
+ string tag = refs.front();
+
+ ASSERT_EQ(wildcard_tag, tag);
+
+ /* take another reference, verify that both the wildcard and newtag are listed */
+ op = new_op();
+ cls_refcount_get(*op, newtag, true);
+ ASSERT_EQ(0, ioctx.operate(oid, op));
+
+ ASSERT_EQ(0, cls_refcount_read(ioctx, oid, &refs, true));
+ ASSERT_EQ(2, (int)refs.size());
+
+ map<string, bool> refs_map;
+ for (list<string>::iterator iter = refs.begin(); iter != refs.end(); ++iter) {
+ refs_map[*iter] = true;
+ }
+
+ ASSERT_EQ(1, (int)refs_map.count(wildcard_tag));
+ ASSERT_EQ(1, (int)refs_map.count(newtag));
+
+ delete op;
+
+ /* drop reference to newtag, only the wildcard entry should remain */
+
+ op = new_op();
+ cls_refcount_put(*op, newtag, true);
+ ASSERT_EQ(0, ioctx.operate(oid, op));
+
+ ASSERT_EQ(0, cls_refcount_read(ioctx, oid, &refs, true));
+ ASSERT_EQ(1, (int)refs.size());
+
+ tag = refs.front();
+ ASSERT_EQ(string(), tag);
+
+ delete op;
+
+ /* drop newtag reference again: the op should still succeed and leave the refs unchanged */
+
+ op = new_op();
+ cls_refcount_put(*op, newtag, true);
+ ASSERT_EQ(0, ioctx.operate(oid, op));
+
+ ASSERT_EQ(0, cls_refcount_read(ioctx, oid, &refs, true));
+ ASSERT_EQ(1, (int)refs.size());
+
+ tag = refs.front();
+ ASSERT_EQ(string(), tag);
+
+ delete op;
+
+ /* drop oldtag reference, make sure object removed */
+ /* NOTE(review): oldtag was never taken explicitly; presumably this put consumes the wildcard reference - confirm */
+ op = new_op();
+ cls_refcount_put(*op, oldtag, true);
+ ASSERT_EQ(0, ioctx.operate(oid, op));
+
+ ASSERT_EQ(-ENOENT, ioctx.stat(oid, NULL, NULL));
+
+ delete op;
+
+ /* remove pool */
+ ioctx.close();
+ ASSERT_EQ(0, destroy_one_pool_pp(pool_name, rados));
+}
+
TEST(cls_rgw, test_put_snap) {
librados::Rados rados;
#include "common/Thread.h"
#include "common/Throttle.h"
#include "common/ceph_argparse.h"
+#include "common/backport14.h"
#include <thread>
#include <atomic>
return NULL;
}
};
-
};
TEST_F(ThrottleTest, Throttle) {
} while(!waited);
}
-TEST_F(ThrottleTest, destructor) {
- Thread_get *t;
- {
- int64_t throttle_max = 10;
- Throttle *throttle = new Throttle(g_ceph_context, "throttle", throttle_max);
-
- ASSERT_FALSE(throttle->get(5));
-
- t = new Thread_get(*throttle, 7);
- t->create("t_throttle");
- bool blocked;
- useconds_t delay = 1;
- do {
- usleep(delay);
- if (throttle->get_or_fail(1)) {
- throttle->put(1);
- blocked = false;
- } else {
- blocked = true;
- }
- delay *= 2;
- } while(!blocked);
- delete throttle;
- }
- { //
- // The thread is left hanging, otherwise it will abort().
- // Deleting the Throttle on which it is waiting creates a
- // inconsistency that will be detected: the Throttle object that
- // it references no longer exists.
- //
- pthread_t id = t->get_thread_id();
- ASSERT_EQ(pthread_kill(id, 0), 0);
- delete t;
- ASSERT_EQ(pthread_kill(id, 0), 0);
- }
+// Death test: the braces handed to EXPECT_DEATH form a scope, so on leaving
+// it `t` and then `throttle` are destroyed (reverse declaration order).
+// Presumably the Thread_get helper is still blocked on the throttle at that
+// point and the teardown aborts the process -- TODO confirm which destructor
+// actually triggers the death. Any death message is accepted (".*").
+TEST_F(ThrottleTest, destructor) {
+ EXPECT_DEATH({
+ int64_t throttle_max = 10;
+ auto throttle = ceph::make_unique<Throttle>(g_ceph_context, "throttle",
+ throttle_max);
+
+
+ // Take 5 of the 10 units up front; get() is expected to return false here.
+ ASSERT_FALSE(throttle->get(5));
+ // Helper thread constructed with the throttle and a count of 7
+ // (presumably it calls throttle.get(7) and blocks, since 5 + 7 > 10 --
+ // verify against Thread_get).
+ unique_ptr<Thread_get> t = ceph::make_unique<Thread_get>(*throttle, 7);
+ t->create("t_throttle");
+ bool blocked;
+ useconds_t delay = 1;
+ // Poll with a doubling sleep until a 1-unit get_or_fail() is refused.
+ // A successful probe is immediately put back so the count is unchanged.
+ do {
+ usleep(delay);
+ if (throttle->get_or_fail(1)) {
+ throttle->put(1);
+ blocked = false;
+ } else {
+ blocked = true;
+ }
+ delay *= 2;
+ } while (!blocked);
+ }, ".*");
}
std::pair<double, std::chrono::duration<double> > test_backoff(
wait_time / waits);
}
+// Death test: letting a BackoffThrottle go out of scope while another thread
+// is still waiting in get() is expected to kill the process (presumably via
+// an assertion in the throttle's destructor -- confirm against
+// BackoffThrottle). Any death message is accepted (".*").
+TEST(BackoffThrottle, destruct) {
+  EXPECT_DEATH({
+      auto throttle = ceph::make_unique<BackoffThrottle>(
+        g_ceph_context, "destructor test", 10);
+      ASSERT_TRUE(throttle->set_params(0.4, 0.6, 1000, 2, 10, 6, nullptr));
+
+      // Take 5 units so that the waiter's request for 6 more cannot be
+      // satisfied within the maximum of 10.
+      throttle->get(5);
+      {
+        auto& t = *throttle;
+        // The waiter must be detached: destroying a still-joinable
+        // std::thread calls std::terminate(), which would satisfy
+        // EXPECT_DEATH immediately and never exercise the throttle's
+        // destructor.
+        std::thread([&t]() {
+            usleep(5);
+            t.get(6);
+          }).detach();
+      }
+      // No equivalent of get_or_fail() to poll with, so simply give the
+      // waiter time to block before the throttle is destroyed.
+      std::this_thread::sleep_for(std::chrono::milliseconds(250));
+    }, ".*");
+}
+
TEST(BackoffThrottle, undersaturated)
{
auto results = test_backoff(
ASSERT_GT(results.second.count(), 0.0005);
}
+// Death test: letting an OrderedThrottle go out of scope while another
+// thread is still waiting in start_op() is expected to kill the process
+// (presumably via an assertion in the throttle's destructor -- confirm
+// against OrderedThrottle). Any death message is accepted (".*").
+TEST(OrderedThrottle, destruct) {
+  EXPECT_DEATH({
+      auto throttle = ceph::make_unique<OrderedThrottle>(1, false);
+      // Start one op on a throttle constructed with a limit argument of 1,
+      // so the second start_op() below presumably has to wait.
+      throttle->start_op(nullptr);
+      {
+        auto& t = *throttle;
+        // The waiter must be detached: destroying a still-joinable
+        // std::thread calls std::terminate(), which would satisfy
+        // EXPECT_DEATH immediately and never exercise the throttle's
+        // destructor.
+        std::thread([&t]() {
+            usleep(5);
+            t.start_op(nullptr);
+          }).detach();
+      }
+      // No equivalent of get_or_fail() to poll with, so simply give the
+      // waiter time to block before the throttle is destroyed.
+      std::this_thread::sleep_for(std::chrono::milliseconds(250));
+    }, ".*");
+}
+
/*
* Local Variables:
* compile-command: "cd ../.. ;
* "
* End:
*/
-
out << " get_command_descriptions --pull585\n";
}
-static void json_print(const MonCommand *mon_commands, int size)
+static void json_print(const std::vector<MonCommand> &mon_commands)
{
bufferlist rdata;
Formatter *f = Formatter::create("json");
- Monitor::format_command_descriptions(mon_commands, size, f, &rdata);
+ Monitor::format_command_descriptions(mon_commands, f, &rdata);
delete f;
string data(rdata.c_str(), rdata.length());
cout << data << std::endl;
#undef FLAG
#undef COMMAND
#undef COMMAND_WITH_FLAG
- MonCommand mon_commands[] = {
+ std::vector<MonCommand> mon_commands = {
#define FLAG(f) (MonCommand::FLAG_##f)
#define COMMAND(parsesig, helptext, modulename, req_perms, avail) \
{parsesig, helptext, modulename, req_perms, avail, 0},
#undef COMMAND
#undef COMMAND_WITH_FLAG
- // FIXME: slurp up the Mgr commands too
-
#define COMMAND(parsesig, helptext, modulename, req_perms, avail) \
{parsesig, helptext, modulename, req_perms, avail, FLAG(MGR)},
#define COMMAND_WITH_FLAG(parsesig, helptext, modulename, req_perms, avail, flags) \
#undef COMMAND_WITH_FLAG
};
- json_print(mon_commands, ARRAY_SIZE(mon_commands));
+ json_print(mon_commands);
}
// syntax error https://github.com/ceph/ceph/pull/585
static void pull585()
{
- MonCommand mon_commands[] = {
+ std::vector<MonCommand> mon_commands = {
{ "osd pool create "
"name=pool,type=CephPoolname "
"name=pg_num,type=CephInt,range=0 "
"create pool", "osd", "rw", "cli,rest" }
};
- json_print(mon_commands, ARRAY_SIZE(mon_commands));
+ json_print(mon_commands);
}
int main(int argc, char **argv) {
class test_md_config_t : public md_config_t, public ::testing::Test {
public:
+
+ test_md_config_t()
+ : md_config_t(true), Test()
+ {}
+
void test_expand_meta() {
Mutex::Locker l(lock);
// successfull meta expansion $run_dir and ${run_dir}
ostringstream oss;
std::string before = " BEFORE ";
std::string after = " AFTER ";
+
std::string val(before + "$run_dir${run_dir}" + after);
EXPECT_TRUE(expand_meta(val, &oss));
EXPECT_EQ(before + "/var/run/ceph/var/run/ceph" + after, val);
void test_expand_all_meta() {
Mutex::Locker l(lock);
int before_count = 0, data_dir = 0;
- for (auto& opt: *config_options) {
- std::string *str;
- opt.conf_ptr(str, this);
- if (str) {
- if (str->find("$") != string::npos)
+ for (const auto &i : schema) {
+ const Option &opt = i.second;
+ if (opt.type == Option::TYPE_STR) {
+ const auto &str = boost::get<std::string>(opt.value);
+ if (str.find("$") != string::npos)
before_count++;
- if (str->find("$data_dir") != string::npos)
+ if (str.find("$data_dir") != string::npos)
data_dir++;
}
}
+
// if there are no meta variables in the default configuration,
// something must be done to check the expected side effect
// of expand_all_meta
ASSERT_LT(0, before_count);
expand_all_meta();
int after_count = 0;
- for (auto& opt: *config_options) {
- std::string *str;
- opt.conf_ptr(str, this);
- if (str) {
+ for (auto& i : values) {
+ const auto &opt = schema.at(i.first);
+ if (opt.type == Option::TYPE_STR) {
+ const std::string &str = boost::get<std::string>(i.second);
size_t pos = 0;
- while ((pos = str->find("$", pos)) != string::npos) {
- if (str->substr(pos, 8) != "$channel") {
+ while ((pos = str.find("$", pos)) != string::npos) {
+ if (str.substr(pos, 8) != "$channel") {
std::cout << "unexpected meta-variable found at pos " << pos
- << " of '" << *str << "'" << std::endl;
+ << " of '" << str << "'" << std::endl;
after_count++;
}
pos++;
}
}
+// Exercises the three validation paths of Option visible here: a numeric
+// min/max range, an allowed-values list for strings, and a user-supplied
+// validator hook invoked through pre_validate().
+TEST(Option, validation)
+{
+  std::string err;
+
+  // Integer option restricted to [5, 10]: values outside are rejected.
+  Option ranged("foo", Option::TYPE_INT, Option::LEVEL_BASIC);
+  ranged.set_min_max(5, 10);
+  EXPECT_EQ(-EINVAL, ranged.validate(Option::value_t(int64_t(4)), &err));
+  EXPECT_EQ(-EINVAL, ranged.validate(Option::value_t(int64_t(11)), &err));
+  EXPECT_EQ(0, ranged.validate(Option::value_t(int64_t(7)), &err));
+
+  // String option restricted to an explicit set of values.
+  Option colour("foo", Option::TYPE_STR, Option::LEVEL_BASIC);
+  colour.set_enum_allowed({"red", "blue"});
+  EXPECT_EQ(0, colour.validate(Option::value_t(std::string("red")), &err));
+  EXPECT_EQ(0, colour.validate(Option::value_t(std::string("blue")), &err));
+  EXPECT_EQ(-EINVAL, colour.validate(Option::value_t(std::string("green")), &err));
+
+  // Option with a custom validator: "one" is rewritten to "1", "666" is
+  // refused, everything else passes through untouched.
+  Option hooked("foo", Option::TYPE_INT, Option::LEVEL_BASIC);
+  hooked.set_validator([](std::string *value, std::string *error_message){
+    if (*value == "one") {
+      *value = "1";
+      return 0;
+    }
+    return (*value == "666") ? -EINVAL : 0;
+  });
+
+  // An explicitly forbidden value is rejected and left unmodified.
+  std::string text = "666";
+  EXPECT_EQ(-EINVAL, hooked.pre_validate(&text, &err));
+  EXPECT_EQ(text, "666");
+
+  // A permitted value with no special behaviour is accepted as-is.
+  text = "123";
+  EXPECT_EQ(0, hooked.pre_validate(&text, &err));
+  EXPECT_EQ(text, "123");
+
+  // A value with a magic conversion is accepted and rewritten in place.
+  text = "one";
+  EXPECT_EQ(0, hooked.pre_validate(&text, &err));
+  EXPECT_EQ(text, "1");
+}
+
/*
* Local Variables:
* compile-command: "cd ../.. ;
TEST_P(CompressorTest, big_round_trip_randomish)
{
- unsigned len = 1048576 * 100;//269;
+ unsigned len = 1048576 * 10;//269;
bufferlist orig;
const char *alphabet = "abcdefghijklmnopqrstuvwxyz";
if (false) {
}
bufferlist in;
in.append(data, size);
- for (size_t t = 0; t < 100000; t++) {
+ for (size_t t = 0; t < 10000; t++) {
bufferlist out;
int res = compressor->compress(in, out);
EXPECT_EQ(res, 0);
in.append(data, size);
int res = compressor->compress(in, out);
EXPECT_EQ(res, 0);
- for (size_t t = 0; t < 100000; t++) {
+ for (size_t t = 0; t < 10000; t++) {
bufferlist out_dec;
int res = compressor->decompress(out, out_dec);
EXPECT_EQ(res, 0);
g_ceph_context->_conf->apply_changes(NULL);
CompressorRef zlib = Compressor::create(g_ceph_context, "zlib");
- for (int cnt=0; cnt<1000; cnt++)
+ for (int cnt=0; cnt<100; cnt++)
{
srand(cnt + 1000);
int log2 = (rand()%18) + 1;
g_ceph_context->_conf->apply_changes(NULL);
CompressorRef zlib = Compressor::create(g_ceph_context, "zlib");
- for (int cnt=0; cnt<1000; cnt++)
+ for (int cnt=0; cnt<100; cnt++)
{
srand(cnt + 1000);
int log2 = (rand()%18) + 1;
target_link_libraries(unittest_crush global m ${BLKID_LIBRARIES})
add_ceph_test(crush_weights.sh ${CMAKE_CURRENT_SOURCE_DIR}/crush_weights.sh)
-add_ceph_test(crush-classes.sh ${CMAKE_CURRENT_SOURCE_DIR}/crush-classes.sh)
-add_ceph_test(crush-choose-args.sh ${CMAKE_CURRENT_SOURCE_DIR}/crush-choose-args.sh)
#include "crush/CrushWrapper.h"
TEST(CrushWrapper, get_immediate_parent) {
- CrushWrapper *c = new CrushWrapper;
-
+ std::unique_ptr<CrushWrapper> c(new CrushWrapper);
+
const int ROOT_TYPE = 1;
c->set_type_name(ROOT_TYPE, "root");
const int OSD_TYPE = 0;
EXPECT_EQ(0, ret);
EXPECT_EQ("root", loc.first);
EXPECT_EQ("default", loc.second);
-
- delete c;
}
TEST(CrushWrapper, move_bucket) {
- CrushWrapper *c = new CrushWrapper;
+ std::unique_ptr<CrushWrapper> c(new CrushWrapper);
const int ROOT_TYPE = 2;
c->set_type_name(ROOT_TYPE, "root");
EXPECT_EQ("root", loc.first);
EXPECT_EQ("root1", loc.second);
}
-
- delete c;
}
TEST(CrushWrapper, swap_bucket) {
- CrushWrapper *c = new CrushWrapper;
+ std::unique_ptr<CrushWrapper> c(new CrushWrapper);
const int ROOT_TYPE = 2;
c->set_type_name(ROOT_TYPE, "root");
}
TEST(CrushWrapper, rename_bucket_or_item) {
- CrushWrapper *c = new CrushWrapper;
+ std::unique_ptr<CrushWrapper> c(new CrushWrapper);
const int ROOT_TYPE = 2;
c->set_type_name(ROOT_TYPE, "root");
int osd0id = c->get_item_id("osd0");
EXPECT_EQ(0, c->rename_item("osd.0", "osd0renamed", &ss));
EXPECT_EQ(osd0id, c->get_item_id("osd0renamed"));
-
- delete c;
}
TEST(CrushWrapper, check_item_loc) {
- CrushWrapper *c = new CrushWrapper;
+ std::unique_ptr<CrushWrapper> c(new CrushWrapper);
int item = 0;
float expected_weight = 1.0;
EXPECT_TRUE(c->check_item_loc(g_ceph_context, item, loc, &weight));
EXPECT_EQ(expected_weight, weight);
}
-
- delete c;
}
TEST(CrushWrapper, update_item) {
- CrushWrapper *c = new CrushWrapper;
+ std::unique_ptr<CrushWrapper> c(new CrushWrapper);
const int ROOT_TYPE = 2;
c->set_type_name(ROOT_TYPE, "root");
EXPECT_EQ(modified_weight, c->get_item_weightf(item));
EXPECT_FALSE(c->check_item_loc(g_ceph_context, item, loc, &weight));
EXPECT_TRUE(c->check_item_loc(g_ceph_context, item, other_loc, &weight));
-
- delete c;
}
TEST(CrushWrapper, adjust_item_weight) {
- CrushWrapper *c = new CrushWrapper;
+ std::unique_ptr<CrushWrapper> c(new CrushWrapper);
const int ROOT_TYPE = 2;
c->set_type_name(ROOT_TYPE, "root");
EXPECT_EQ(1, c->adjust_item_weightf_in_loc(g_ceph_context, item, modified_weight, loc_two));
EXPECT_EQ(original_weight, c->get_item_weightf_in_loc(item, loc_one));
EXPECT_EQ(modified_weight, c->get_item_weightf_in_loc(item, loc_two));
-
- delete c;
}
TEST(CrushWrapper, adjust_subtree_weight) {
- CrushWrapper *c = new CrushWrapper;
+ std::unique_ptr<CrushWrapper> c(new CrushWrapper);
const int ROOT_TYPE = 2;
c->set_type_name(ROOT_TYPE, "root");
ASSERT_EQ(c->get_bucket_weight(host0), 262144);
ASSERT_EQ(c->get_item_weight(host0), 262144);
ASSERT_EQ(c->get_bucket_weight(rootno), 262144 + 131072);
-
- delete c;
}
TEST(CrushWrapper, insert_item) {
- CrushWrapper *c = new CrushWrapper;
+ std::unique_ptr<CrushWrapper> c(new CrushWrapper);
const int ROOT_TYPE = 2;
c->set_type_name(ROOT_TYPE, "root");
EXPECT_EQ(-EINVAL, c->insert_item(g_ceph_context, item, 1.0,
"osd." + stringify(item), loc));
}
-
- delete c;
}
TEST(CrushWrapper, remove_item) {
- auto *c = new CrushWrapper;
+ std::unique_ptr<CrushWrapper> c(new CrushWrapper);
const int ROOT_TYPE = 2;
c->set_type_name(ROOT_TYPE, "root");
ASSERT_EQ(0, c->remove_item(g_ceph_context, item_to_remove, true));
float weight;
EXPECT_FALSE(c->check_item_loc(g_ceph_context, item_to_remove, loc, &weight));
-
- delete c;
}
TEST(CrushWrapper, item_bucket_names) {
- CrushWrapper *c = new CrushWrapper;
+ std::unique_ptr<CrushWrapper> c(new CrushWrapper);
int index = 123;
string name = "NAME";
EXPECT_EQ(-EINVAL, c->set_item_name(index, "\001"));
EXPECT_TRUE(c->item_exists(index));
EXPECT_EQ(index, c->get_item_id(name));
EXPECT_EQ(name, c->get_item_name(index));
- delete c;
}
TEST(CrushWrapper, bucket_types) {
- CrushWrapper *c = new CrushWrapper;
+ std::unique_ptr<CrushWrapper> c(new CrushWrapper);
int index = 123;
string name = "NAME";
c->set_type_name(index, name);
EXPECT_EQ(1, c->get_num_type_names());
EXPECT_EQ(index, c->get_type_id(name));
EXPECT_EQ(name, c->get_type_name(index));
- delete c;
}
TEST(CrushWrapper, is_valid_crush_name) {
}
TEST(CrushWrapper, dump_rules) {
- CrushWrapper *c = new CrushWrapper;
+ std::unique_ptr<CrushWrapper> c(new CrushWrapper);
const int ROOT_TYPE = 1;
c->set_type_name(ROOT_TYPE, "root");
c->get_rule_weight_osd_map(0, &wm);
ASSERT_TRUE(wm.size() == 1);
ASSERT_TRUE(wm[0] == 1.0);
-
- delete c;
}
TEST(CrushWrapper, distance) {
arg_map.args = choose_args;
uint64_t features = CEPH_FEATURE_CRUSH_TUNABLES5|CEPH_FEATURE_INCARNATION_2;
+ int64_t caid = CrushWrapper::DEFAULT_CHOOSE_ARGS;
// if the client is capable, encode choose_args
{
- c.choose_args[0] = arg_map;
+ c.choose_args[caid] = arg_map;
bufferlist bl;
c.encode(bl, features|CEPH_FEATURE_CRUSH_CHOOSE_ARGS);
bufferlist::iterator i(bl.begin());
CrushWrapper c_new;
c_new.decode(i);
ASSERT_EQ(1u, c_new.choose_args.size());
- ASSERT_EQ(1u, c_new.choose_args[0].args[-1-id].weight_set_size);
- ASSERT_EQ(weights, c_new.choose_args[0].args[-1-id].weight_set[0].weights[0]);
+ ASSERT_EQ(1u, c_new.choose_args[caid].args[-1-id].weight_set_size);
+ ASSERT_EQ(weights, c_new.choose_args[caid].args[-1-id].weight_set[0].weights[0]);
ASSERT_EQ(weight, c_new.get_bucket_item_weightf(id, 0));
}
// if the client is not compatible, copy choose_arg in the weights
{
- c.choose_args[0] = arg_map;
+ c.choose_args[caid] = arg_map;
bufferlist bl;
c.encode(bl, features);
c.choose_args.clear();
ASSERT_FALSE(c.name_exists("default~ssd"));
}
-TEST(CrushWrapper, class_is_in_use) {
- CrushWrapper c;
- c.create();
- c.set_type_name(1, "root");
-
- int weight = 1;
- map<string,string> loc;
- loc["root"] = "default";
-
- ASSERT_FALSE(c.class_is_in_use(0));
-
- int item = 1;
- c.insert_item(g_ceph_context, item, weight, "osd.1", loc);
- int class_id = c.get_or_create_class_id("ssd");
- c.class_map[item] = class_id;
-
- ASSERT_TRUE(c.class_is_in_use(c.get_class_id("ssd")));
- ASSERT_EQ(0, c.remove_class_name("ssd"));
- ASSERT_FALSE(c.class_is_in_use(c.get_class_id("ssd")));
-}
-
TEST(CrushWrapper, remove_class_name) {
CrushWrapper c;
c.create();
// choose + choose
{
cout << "take + choose + choose + choose + emit" << std::endl;
- int rule = c.add_rule(5, 2, 0, 1, 10, 2);
+ int rule = c.add_rule(2, 5, 0, 1, 10);
ASSERT_EQ(2, rule);
c.set_rule_step_take(rule, 0, bno);
c.set_rule_step_choose_indep(rule, 1, 2, 2);
# rules
rule replicated_rule {
- ruleset 0
+ id 0
type replicated
min_size 1
max_size 10
{
bucket_id -1
weight_set [
- [ 6.000 ]
- [ 7.000 ]
+ [ 2.000 ]
+ [ 1.000 ]
]
ids [ -10 ]
}
bucket_id -2
weight_set [
[ 2.000 0.000 ]
- [ 2.000 0.000 ]
+ [ 1.000 0.000 ]
]
ids [ -20 1 ]
}
# rules
rule replicated_rule {
- ruleset 0
+ id 0
type replicated
min_size 1
max_size 10
+++ /dev/null
-#!/bin/bash
-#
-# Copyright (C) 2017 Red Hat <contact@redhat.com>
-#
-# Author: Loic Dachary <loic@dachary.org>
-#
-# This program is free software; you can redistribute it and/or modify
-# it under the terms of the GNU Library Public License as published by
-# the Free Software Foundation; either version 2, or (at your option)
-# any later version.
-#
-# This program is distributed in the hope that it will be useful,
-# but WITHOUT ANY WARRANTY; without even the implied warranty of
-# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
-# GNU Library Public License for more details.
-#
-
-source $(dirname $0)/../detect-build-env-vars.sh
-source $CEPH_ROOT/qa/workunits/ceph-helpers.sh
-
-function run() {
- local dir=$1
- shift
-
- export CEPH_MON="127.0.0.1:7131" # git grep '\<7131\>' : there must be only one
- export CEPH_ARGS
- CEPH_ARGS+="--fsid=$(uuidgen) --auth-supported=none "
- CEPH_ARGS+="--mon-host=$CEPH_MON "
- CEPH_ARGS+="--crush-location=root=default,host=HOST "
- CEPH_ARGS+="--osd-crush-initial-weight=3 "
- #
- # Disable device auto class feature for now.
- # The device class is non-deterministic and will
- # crash the crushmap comparison below.
- #
- CEPH_ARGS+="--osd-class-update-on-start=false "
-
- local funcs=${@:-$(set | sed -n -e 's/^\(TEST_[0-9a-z_]*\) .*/\1/p')}
- for func in $funcs ; do
- setup $dir || return 1
- $func $dir || return 1
- teardown $dir || return 1
- done
-}
-
-function TEST_choose_args_update() {
- #
- # adding a weighted OSD updates the weight up to the top
- #
- local dir=$1
-
- run_mon $dir a || return 1
- run_osd $dir 0 || return 1
-
- ceph osd getcrushmap > $dir/map || return 1
- crushtool -d $dir/map -o $dir/map.txt || return 1
- sed -i -e '/end crush map/d' $dir/map.txt
- cat >> $dir/map.txt <<EOF
-# choose_args
-choose_args 0 {
- {
- bucket_id -1
- weight_set [
- [ 3.000 ]
- [ 3.000 ]
- ]
- ids [ -10 ]
- }
- {
- bucket_id -2
- weight_set [
- [ 2.000 ]
- [ 2.000 ]
- ]
- ids [ -20 ]
- }
-}
-
-# end crush map
-EOF
- crushtool -c $dir/map.txt -o $dir/map-new || return 1
- ceph osd setcrushmap -i $dir/map-new || return 1
-
- run_osd $dir 1 || return 1
- ceph osd getcrushmap > $dir/map-one-more || return 1
- crushtool -d $dir/map-one-more -o $dir/map-one-more.txt || return 1
- cat $dir/map-one-more.txt
- diff -u $dir/map-one-more.txt $CEPH_ROOT/src/test/crush/crush-choose-args-expected-one-more-3.txt || return 1
-
- destroy_osd $dir 1 || return 1
- ceph osd getcrushmap > $dir/map-one-less || return 1
- crushtool -d $dir/map-one-less -o $dir/map-one-less.txt || return 1
- diff -u $dir/map-one-less.txt $dir/map.txt || return 1
-}
-
-function TEST_no_update_weight_set() {
- #
- # adding a zero weight OSD does not update the weight set at all
- #
- local dir=$1
-
- ORIG_CEPH_ARGS="$CEPH_ARGS"
- CEPH_ARGS+="--osd-crush-update-weight-set=false "
-
- run_mon $dir a || return 1
- run_osd $dir 0 || return 1
-
- ceph osd getcrushmap > $dir/map || return 1
- crushtool -d $dir/map -o $dir/map.txt || return 1
- sed -i -e '/end crush map/d' $dir/map.txt
- cat >> $dir/map.txt <<EOF
-# choose_args
-choose_args 0 {
- {
- bucket_id -1
- weight_set [
- [ 6.000 ]
- [ 7.000 ]
- ]
- ids [ -10 ]
- }
- {
- bucket_id -2
- weight_set [
- [ 2.000 ]
- [ 2.000 ]
- ]
- ids [ -20 ]
- }
-}
-
-# end crush map
-EOF
- crushtool -c $dir/map.txt -o $dir/map-new || return 1
- ceph osd setcrushmap -i $dir/map-new || return 1
-
-
- run_osd $dir 1 || return 1
- ceph osd getcrushmap > $dir/map-one-more || return 1
- crushtool -d $dir/map-one-more -o $dir/map-one-more.txt || return 1
- cat $dir/map-one-more.txt
- diff -u $dir/map-one-more.txt $CEPH_ROOT/src/test/crush/crush-choose-args-expected-one-more-0.txt || return 1
-
- destroy_osd $dir 1 || return 1
- ceph osd getcrushmap > $dir/map-one-less || return 1
- crushtool -d $dir/map-one-less -o $dir/map-one-less.txt || return 1
- diff -u $dir/map-one-less.txt $dir/map.txt || return 1
-
- CEPH_ARGS="$ORIG_CEPH_ARGS"
-}
-
-main crush-choose-args "$@"
-
-# Local Variables:
-# compile-command: "cd ../../../build ; ln -sf ../src/ceph-disk/ceph_disk/main.py bin/ceph-disk && make -j4 && ../src/test/crush/crush-choose-args.sh"
-# End:
+++ /dev/null
-#!/bin/bash
-#
-# Copyright (C) 2017 Red Hat <contact@redhat.com>
-#
-# Author: Loic Dachary <loic@dachary.org>
-#
-# This program is free software; you can redistribute it and/or modify
-# it under the terms of the GNU Library Public License as published by
-# the Free Software Foundation; either version 2, or (at your option)
-# any later version.
-#
-# This program is distributed in the hope that it will be useful,
-# but WITHOUT ANY WARRANTY; without even the implied warranty of
-# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
-# GNU Library Public License for more details.
-#
-
-source $(dirname $0)/../detect-build-env-vars.sh
-source $CEPH_ROOT/qa/workunits/ceph-helpers.sh
-
-function run() {
- local dir=$1
- shift
-
- export CEPH_MON="127.0.0.1:7130" # git grep '\<7130\>' : there must be only one
- export CEPH_ARGS
- CEPH_ARGS+="--fsid=$(uuidgen) --auth-supported=none "
- CEPH_ARGS+="--mon-host=$CEPH_MON "
- #
- # Disable auto-class, so we can inject device class manually below
- #
- CEPH_ARGS+="--osd-class-update-on-start=false "
-
- local funcs=${@:-$(set | sed -n -e 's/^\(TEST_[0-9a-z_]*\) .*/\1/p')}
- for func in $funcs ; do
- setup $dir || return 1
- $func $dir || return 1
- teardown $dir || return 1
- done
-}
-
-function add_something() {
- local dir=$1
- local obj=${2:-SOMETHING}
-
- local payload=ABCDEF
- echo $payload > $dir/ORIGINAL
- rados --pool rbd put $obj $dir/ORIGINAL || return 1
-}
-
-function get_osds_up() {
- local poolname=$1
- local objectname=$2
-
- local osds=$(ceph --format xml osd map $poolname $objectname 2>/dev/null | \
- $XMLSTARLET sel -t -m "//up/osd" -v . -o ' ')
- # get rid of the trailing space
- echo $osds
-}
-
-function TEST_classes() {
- local dir=$1
-
- run_mon $dir a || return 1
- run_osd $dir 0 || return 1
- run_osd $dir 1 || return 1
- run_osd $dir 2 || return 1
-
- test "$(get_osds_up rbd SOMETHING)" == "1 2 0" || return 1
- add_something $dir SOMETHING || return 1
-
- #
- # osd.0 has class ssd and the rule is modified
- # to only take ssd devices.
- #
- ceph osd getcrushmap > $dir/map || return 1
- crushtool -d $dir/map -o $dir/map.txt || return 1
- ${SED} -i \
- -e '/device 0 osd.0/s/$/ class ssd/' \
- -e '/step take default/s/$/ class ssd/' \
- $dir/map.txt || return 1
- crushtool -c $dir/map.txt -o $dir/map-new || return 1
- ceph osd setcrushmap -i $dir/map-new || return 1
-
- #
- # There can only be one mapping since there only is
- # one device with ssd class.
- #
- ok=false
- for delay in 2 4 8 16 32 64 128 256 ; do
- if test "$(get_osds_up rbd SOMETHING_ELSE)" == "0" ; then
- ok=true
- break
- fi
- sleep $delay
- ceph osd dump # for debugging purposes
- ceph pg dump # for debugging purposes
- done
- $ok || return 1
- #
- # Writing keeps working because the pool is min_size 1 by
- # default.
- #
- add_something $dir SOMETHING_ELSE || return 1
-
- #
- # Sanity check that the rule indeed has ssd
- # generated bucket with a name including ~ssd.
- #
- ceph osd crush dump | grep -q '~ssd' || return 1
-}
-
-function TEST_set_device_class() {
- local dir=$1
-
- TEST_classes $dir || return 1
-
- ceph osd crush set-device-class ssd osd.0 || return 1
- ceph osd crush class ls-osd ssd | grep 0 || return 1
- ceph osd crush set-device-class ssd osd.1 || return 1
- ceph osd crush class ls-osd ssd | grep 1 || return 1
- ceph osd crush set-device-class ssd 0 1 || return 1 # should be idempotent
-
- ok=false
- for delay in 2 4 8 16 32 64 128 256 ; do
- if test "$(get_osds_up rbd SOMETHING_ELSE)" == "0 1" ; then
- ok=true
- break
- fi
- sleep $delay
- ceph osd crush dump
- ceph osd dump # for debugging purposes
- ceph pg dump # for debugging purposes
- done
- $ok || return 1
-}
-
-function TEST_mon_classes() {
- local dir=$1
-
- run_mon $dir a || return 1
- ceph osd crush class create CLASS || return 1
- ceph osd crush class create CLASS || return 1 # idempotent
- ceph osd crush class ls | grep CLASS || return 1
- ceph osd crush class rename CLASS TEMP || return 1
- ceph osd crush class ls | grep TEMP || return 1
- ceph osd crush class rename TEMP CLASS || return 1
- ceph osd crush class ls | grep CLASS || return 1
- ceph osd crush class rm CLASS || return 1
- expect_failure $dir ENOENT ceph osd crush class rm CLASS || return 1
-}
-
-main crush-classes "$@"
-
-# Local Variables:
-# compile-command: "cd ../../../build ; ln -sf ../src/ceph-disk/ceph_disk/main.py bin/ceph-disk && make -j4 && ../src/test/crush/crush-classes.sh"
-# End:
}
int ret;
int ruleno = 0;
- int rule = 0;
- ruleno = rule;
- ret = c->add_rule(4, rule, 123, 1, 20, ruleno);
+ ret = c->add_rule(ruleno, 4, 123, 1, 20);
assert(ret == ruleno);
ret = c->set_rule_step(ruleno, 0, CRUSH_RULE_SET_CHOOSELEAF_TRIES, 10, 0);
assert(ret == 0);
#!/bin/bash
source $(dirname $0)/../detect-build-env-vars.sh
-source $CEPH_ROOT/qa/workunits/ceph-helpers.sh
read -r -d '' cm <<'EOF'
# devices
TEST(DaemonConfig, SimpleSet) {
int ret;
- ret = g_ceph_context->_conf->set_val("num_client", "21");
+ ret = g_ceph_context->_conf->set_val("log_graylog_port", "21");
ASSERT_EQ(ret, 0);
g_ceph_context->_conf->apply_changes(NULL);
char buf[128];
memset(buf, 0, sizeof(buf));
char *tmp = buf;
- ret = g_ceph_context->_conf->get_val("num_client", &tmp, sizeof(buf));
+ ret = g_ceph_context->_conf->get_val("log_graylog_port", &tmp, sizeof(buf));
ASSERT_EQ(ret, 0);
ASSERT_EQ(string("21"), string(buf));
}
"false"));
int ret;
- const char *argv[] = { "foo", "--num-client", "22",
+ const char *argv[] = { "foo", "--log-graylog-port", "22",
"--keyfile", "/tmp/my-keyfile", NULL };
size_t argc = (sizeof(argv) / sizeof(argv[0])) - 1;
vector<const char*> args;
ASSERT_EQ(string("/tmp/my-keyfile"), string(buf));
memset(buf, 0, sizeof(buf));
- ret = g_ceph_context->_conf->get_val("num_client", &tmp, sizeof(buf));
+ ret = g_ceph_context->_conf->get_val("log_graylog_port", &tmp, sizeof(buf));
ASSERT_EQ(ret, 0);
ASSERT_EQ(string("22"), string(buf));
TEST(DaemonConfig, InjectArgs) {
int ret;
- std::string injection("--num-client 56 --max-open-files 42");
+ std::string injection("--log-graylog-port 56 --leveldb-max-open-files 42");
ret = g_ceph_context->_conf->injectargs(injection, &cout);
ASSERT_EQ(ret, 0);
char buf[128];
char *tmp = buf;
memset(buf, 0, sizeof(buf));
- ret = g_ceph_context->_conf->get_val("max_open_files", &tmp, sizeof(buf));
+ ret = g_ceph_context->_conf->get_val("leveldb_max_open_files", &tmp, sizeof(buf));
ASSERT_EQ(ret, 0);
ASSERT_EQ(string("42"), string(buf));
memset(buf, 0, sizeof(buf));
- ret = g_ceph_context->_conf->get_val("num_client", &tmp, sizeof(buf));
+ ret = g_ceph_context->_conf->get_val("log_graylog_port", &tmp, sizeof(buf));
ASSERT_EQ(ret, 0);
ASSERT_EQ(string("56"), string(buf));
- injection = "--num-client 57";
+ injection = "--log-graylog-port 57";
ret = g_ceph_context->_conf->injectargs(injection, &cout);
ASSERT_EQ(ret, 0);
- ret = g_ceph_context->_conf->get_val("num_client", &tmp, sizeof(buf));
+ ret = g_ceph_context->_conf->get_val("log_graylog_port", &tmp, sizeof(buf));
ASSERT_EQ(ret, 0);
ASSERT_EQ(string("57"), string(buf));
}
char *tmp2 = buf2;
// We should complain about the garbage in the input
- std::string injection("--random-garbage-in-injectargs 26 --num-client 28");
+ std::string injection("--random-garbage-in-injectargs 26 --log-graylog-port 28");
ret = g_ceph_context->_conf->injectargs(injection, &cout);
ASSERT_EQ(ret, -EINVAL);
// But, debug should still be set...
memset(buf, 0, sizeof(buf));
- ret = g_ceph_context->_conf->get_val("num_client", &tmp, sizeof(buf));
+ ret = g_ceph_context->_conf->get_val("log_graylog_port", &tmp, sizeof(buf));
ASSERT_EQ(ret, 0);
ASSERT_EQ(string("28"), string(buf));
// Injectargs shouldn't let us change this, since it is a string-valued
// variable and there isn't an observer for it.
- std::string injection2("--osd_data /tmp/some-other-directory --num-client 4");
+ std::string injection2("--osd_data /tmp/some-other-directory --log-graylog-port 4");
ret = g_ceph_context->_conf->injectargs(injection2, &cout);
ASSERT_EQ(ret, -ENOSYS);
char *tmp = buf;
// Change log_to_syslog
- std::string injection("--log_to_syslog --num-client 28");
+ std::string injection("--log_to_syslog --log-graylog-port 28");
ret = g_ceph_context->_conf->injectargs(injection, &cout);
ASSERT_EQ(ret, 0);
ASSERT_EQ(string("true"), string(buf));
// Turn off log_to_syslog
- injection = "--log_to_syslog=false --num-client 28";
+ injection = "--log_to_syslog=false --log-graylog-port 28";
ret = g_ceph_context->_conf->injectargs(injection, &cout);
ASSERT_EQ(ret, 0);
ASSERT_EQ(string("false"), string(buf));
// Turn on log_to_syslog
- injection = "--num-client 1 --log_to_syslog=true --max-open-files 40";
+ injection = "--log-graylog-port=1 --log_to_syslog=true --leveldb-max-open-files 40";
ret = g_ceph_context->_conf->injectargs(injection, &cout);
ASSERT_EQ(ret, 0);
ASSERT_EQ(string("true"), string(buf));
// parse error
- injection = "--num-client 1 --log_to_syslog=falsey --max-open-files 42";
+ injection = "--log-graylog-port 1 --log_to_syslog=falsey --leveldb-max-open-files 42";
ret = g_ceph_context->_conf->injectargs(injection, &cout);
ASSERT_EQ(ret, -EINVAL);
// debug-ms should still become 42...
memset(buf, 0, sizeof(buf));
- ret = g_ceph_context->_conf->get_val("max_open_files", &tmp, sizeof(buf));
+ ret = g_ceph_context->_conf->get_val("leveldb_max_open_files", &tmp, sizeof(buf));
ASSERT_EQ(ret, 0);
ASSERT_EQ(string("42"), string(buf));
}
TEST(DaemonConfig, InvalidIntegers) {
{
- long long bad_value = (long long)std::numeric_limits<int>::max() + 1;
- string str = boost::lexical_cast<string>(bad_value);
- int ret = g_ceph_context->_conf->set_val("num_client", str);
+ int ret = g_ceph_context->_conf->set_val("log_graylog_port", "rhubarb");
ASSERT_EQ(ret, -EINVAL);
}
+
{
- // 4G must be greater than INT_MAX
- ASSERT_GT(4LL * 1024 * 1024 * 1024, (long long)std::numeric_limits<int>::max());
- int ret = g_ceph_context->_conf->set_val("num_client", "4G");
+ int64_t max = std::numeric_limits<int64_t>::max();
+ string str = boost::lexical_cast<string>(max);
+ str = str + "999"; // some extra digits to take us out of bounds
+ int ret = g_ceph_context->_conf->set_val("log_graylog_port", str);
ASSERT_EQ(ret, -EINVAL);
}
}
#!/bin/bash -e
source $(dirname $0)/../detect-build-env-vars.sh
-source $CEPH_ROOT/qa/workunits/ceph-helpers.sh
+source $CEPH_ROOT/qa/standalone/ceph-helpers.sh
dir=$1
TYPE_FEATUREFUL(pg_query_t)
TYPE(pg_log_entry_t)
TYPE(pg_log_t)
-TYPE(pg_missing_item)
+TYPE_FEATUREFUL(pg_missing_item)
TYPE(pg_missing_t)
TYPE(pg_ls_response_t)
TYPE(pg_nls_response_t)
MESSAGE(MOSDPGQuery)
#include "messages/MOSDPGRemove.h"
MESSAGE(MOSDPGRemove)
+#include "messages/MOSDPGRecoveryDelete.h"
+MESSAGE(MOSDPGRecoveryDelete)
+#include "messages/MOSDPGRecoveryDeleteReply.h"
+MESSAGE(MOSDPGRecoveryDeleteReply)
#include "messages/MOSDPGScan.h"
MESSAGE(MOSDPGScan)
#include "messages/MOSDPGTemp.h"
MESSAGE(MOSDRepScrub)
#include "messages/MOSDScrub.h"
MESSAGE(MOSDScrub)
+#include "messages/MOSDForceRecovery.h"
+MESSAGE(MOSDForceRecovery)
#include "messages/MOSDSubOp.h"
MESSAGE(MOSDSubOp)
#include "messages/MOSDSubOpReply.h"
-add_ceph_test(test-erasure-code-plugins.sh ${CMAKE_CURRENT_SOURCE_DIR}/test-erasure-code-plugins.sh)
-
-add_ceph_test(test-erasure-code.sh ${CMAKE_CURRENT_SOURCE_DIR}/test-erasure-code.sh)
-add_ceph_test(test-erasure-eio.sh ${CMAKE_CURRENT_SOURCE_DIR}/test-erasure-eio.sh)
add_executable(ceph_erasure_code_benchmark
${CMAKE_SOURCE_DIR}/src/erasure-code/ErasureCode.cc
const char* env = getenv("CEPH_LIB");
string directory(env ? env : ".libs");
- g_conf->set_val("erasure_code_dir", directory, false);
+ g_conf->set_val_or_die("erasure_code_dir", directory, false);
::testing::InitGoogleTest(&argc, argv);
const char* env = getenv("CEPH_LIB");
std::string directory(env ? env : ".libs");
- g_conf->set_val("erasure_code_dir", directory, false);
+ g_conf->set_val_or_die("erasure_code_dir", directory, false);
::testing::InitGoogleTest(&argc, argv);
g_ceph_context->_conf->apply_changes(NULL);
const char* env = getenv("CEPH_LIB");
string directory(env ? env : ".libs");
- g_conf->set_val("erasure_code_dir", directory, false);
+ g_conf->set_val_or_die("erasure_code_dir", directory, false);
if (vm.count("help")) {
cout << desc << std::endl;
g_ceph_context->_conf->apply_changes(NULL);
const char* env = getenv("CEPH_LIB");
std::string libs_dir(env ? env : ".libs");
- g_conf->set_val("erasure_code_dir", libs_dir, false);
+ g_conf->set_val_or_die("erasure_code_dir", libs_dir, false);
if (vm.count("help")) {
cout << desc << std::endl;
+++ /dev/null
-#!/bin/bash -x
-
-source $(dirname $0)/../detect-build-env-vars.sh
-source $CEPH_ROOT/qa/workunits/ceph-helpers.sh
-
-arch=$(uname -m)
-
-case $arch in
- i[[3456]]86*|x86_64*|amd64*)
- legacy_jerasure_plugins=(jerasure_generic jerasure_sse3 jerasure_sse4)
- legacy_shec_plugins=(shec_generic shec_sse3 shec_sse4)
- plugins=(jerasure shec lrc isa)
- ;;
- aarch64*|arm*)
- legacy_jerasure_plugins=(jerasure_generic jerasure_neon)
- legacy_shec_plugins=(shec_generic shec_neon)
- plugins=(jerasure shec lrc)
- ;;
- *)
- echo "unsupported platform ${arch}."
- return 1
- ;;
-esac
-
-function run() {
- local dir=$1
- shift
-
- export CEPH_MON="127.0.0.1:17110" # git grep '\<17110\>' : there must be only one
- export CEPH_ARGS
- CEPH_ARGS+="--fsid=$(uuidgen) --auth-supported=none "
- CEPH_ARGS+="--mon-host=$CEPH_MON "
-
- local funcs=${@:-$(set | sed -n -e 's/^\(TEST_[0-9a-z_]*\) .*/\1/p')}
- for func in $funcs ; do
- $func $dir || return 1
- done
-}
-
-function TEST_preload_warning() {
- local dir=$1
-
- for plugin in ${legacy_jerasure_plugins[*]} ${legacy_shec_plugins[*]}; do
- setup $dir || return 1
- run_mon $dir a --osd_erasure_code_plugins="${plugin}" || return 1
- run_mgr $dir x || return 1
- CEPH_ARGS='' ceph --admin-daemon $dir/ceph-mon.a.asok log flush || return 1
- run_osd $dir 0 --osd_erasure_code_plugins="${plugin}" || return 1
- CEPH_ARGS='' ceph --admin-daemon $dir/ceph-osd.0.asok log flush || return 1
- grep "WARNING: osd_erasure_code_plugins contains plugin ${plugin}" $dir/mon.a.log || return 1
- grep "WARNING: osd_erasure_code_plugins contains plugin ${plugin}" $dir/osd.0.log || return 1
- teardown $dir || return 1
- done
- return 0
-}
-
-function TEST_preload_no_warning() {
- local dir=$1
-
- for plugin in ${plugins[*]}; do
- setup $dir || return 1
- run_mon $dir a --osd_erasure_code_plugins="${plugin}" || return 1
- run_mgr $dir x || return 1
- CEPH_ARGS='' ceph --admin-daemon $dir/ceph-mon.a.asok log flush || return 1
- run_osd $dir 0 --osd_erasure_code_plugins="${plugin}" || return 1
- CEPH_ARGS='' ceph --admin-daemon $dir/ceph-osd.0.asok log flush || return 1
- ! grep "WARNING: osd_erasure_code_plugins contains plugin" $dir/mon.a.log || return 1
- ! grep "WARNING: osd_erasure_code_plugins contains plugin" $dir/osd.0.log || return 1
- teardown $dir || return 1
- done
-
- return 0
-}
-
-function TEST_preload_no_warning_default() {
- local dir=$1
-
- setup $dir || return 1
- run_mon $dir a || return 1
- CEPH_ARGS='' ceph --admin-daemon $dir/ceph-mon.a.asok log flush || return 1
- run_mgr $dir x || return 1
- run_osd $dir 0 || return 1
- CEPH_ARGS='' ceph --admin-daemon $dir/ceph-osd.0.asok log flush || return 1
- ! grep "WARNING: osd_erasure_code_plugins" $dir/mon.a.log || return 1
- ! grep "WARNING: osd_erasure_code_plugins" $dir/osd.0.log || return 1
- teardown $dir || return 1
-
- return 0
-}
-
-function TEST_ec_profile_warning() {
- local dir=$1
-
- setup $dir || return 1
- run_mon $dir a || return 1
- run_mgr $dir x || return 1
- for id in $(seq 0 2) ; do
- run_osd $dir $id || return 1
- done
- wait_for_clean || return 1
-
- for plugin in ${legacy_jerasure_plugins[*]}; do
- ceph osd erasure-code-profile set prof-${plugin} crush-failure-domain=osd technique=reed_sol_van plugin=${plugin} || return 1
- CEPH_ARGS='' ceph --admin-daemon $dir/ceph-mon.a.asok log flush || return 1
- grep "WARNING: erasure coding profile prof-${plugin} uses plugin ${plugin}" $dir/mon.a.log || return 1
- done
-
- for plugin in ${legacy_shec_plugins[*]}; do
- ceph osd erasure-code-profile set prof-${plugin} crush-failure-domain=osd plugin=${plugin} || return 1
- CEPH_ARGS='' ceph --admin-daemon $dir/ceph-mon.a.asok log flush || return 1
- grep "WARNING: erasure coding profile prof-${plugin} uses plugin ${plugin}" $dir/mon.a.log || return 1
- done
-
- teardown $dir || return 1
-}
-
-main test-erasure-code-plugins "$@"
+++ /dev/null
-#!/bin/bash
-#
-# Copyright (C) 2014 Cloudwatt <libre.licensing@cloudwatt.com>
-# Copyright (C) 2014, 2015 Red Hat <contact@redhat.com>
-#
-# Author: Loic Dachary <loic@dachary.org>
-#
-# This program is free software; you can redistribute it and/or modify
-# it under the terms of the GNU Library Public License as published by
-# the Free Software Foundation; either version 2, or (at your option)
-# any later version.
-#
-# This program is distributed in the hope that it will be useful,
-# but WITHOUT ANY WARRANTY; without even the implied warranty of
-# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
-# GNU Library Public License for more details.
-#
-
-source $(dirname $0)/../detect-build-env-vars.sh
-source $CEPH_ROOT/qa/workunits/ceph-helpers.sh
-
-function run() {
- local dir=$1
- shift
-
- export CEPH_MON="127.0.0.1:7101" # git grep '\<7101\>' : there must be only one
- export CEPH_ARGS
- CEPH_ARGS+="--fsid=$(uuidgen) --auth-supported=none "
- CEPH_ARGS+="--mon-host=$CEPH_MON --mon-osd-prime-pg-temp=false"
-
- setup $dir || return 1
- run_mon $dir a || return 1
- run_mgr $dir x || return 1
- # check that erasure code plugins are preloaded
- CEPH_ARGS='' ceph --admin-daemon $dir/ceph-mon.a.asok log flush || return 1
- grep 'load: jerasure.*lrc' $dir/mon.a.log || return 1
- for id in $(seq 0 10) ; do
- run_osd $dir $id || return 1
- done
- wait_for_clean || return 1
- # check that erasure code plugins are preloaded
- CEPH_ARGS='' ceph --admin-daemon $dir/ceph-osd.0.asok log flush || return 1
- grep 'load: jerasure.*lrc' $dir/osd.0.log || return 1
- create_erasure_coded_pool ecpool || return 1
-
- local funcs=${@:-$(set | sed -n -e 's/^\(TEST_[0-9a-z_]*\) .*/\1/p')}
- for func in $funcs ; do
- $func $dir || return 1
- done
-
- delete_pool ecpool || return 1
- teardown $dir || return 1
-}
-
-function create_erasure_coded_pool() {
- local poolname=$1
-
- ceph osd erasure-code-profile set myprofile \
- crush-failure-domain=osd || return 1
- ceph osd pool create $poolname 12 12 erasure myprofile \
- || return 1
- wait_for_clean || return 1
-}
-
-function delete_pool() {
- local poolname=$1
-
- ceph osd pool delete $poolname $poolname --yes-i-really-really-mean-it
-}
-
-function rados_put_get() {
- local dir=$1
- local poolname=$2
- local objname=${3:-SOMETHING}
-
-
- for marker in AAA BBB CCCC DDDD ; do
- printf "%*s" 1024 $marker
- done > $dir/ORIGINAL
-
- #
- # get and put an object, compare they are equal
- #
- rados --pool $poolname put $objname $dir/ORIGINAL || return 1
- rados --pool $poolname get $objname $dir/COPY || return 1
- diff $dir/ORIGINAL $dir/COPY || return 1
- rm $dir/COPY
-
- #
- # take out an OSD used to store the object and
- # check the object can still be retrieved, which implies
- # recovery
- #
- local -a initial_osds=($(get_osds $poolname $objname))
- local last=$((${#initial_osds[@]} - 1))
- ceph osd out ${initial_osds[$last]} || return 1
- ! get_osds $poolname $objname | grep '\<'${initial_osds[$last]}'\>' || return 1
- rados --pool $poolname get $objname $dir/COPY || return 1
- diff $dir/ORIGINAL $dir/COPY || return 1
- ceph osd in ${initial_osds[$last]} || return 1
-
- rm $dir/ORIGINAL
-}
-
-function rados_osds_out_in() {
- local dir=$1
- local poolname=$2
- local objname=${3:-SOMETHING}
-
-
- for marker in FFFF GGGG HHHH IIII ; do
- printf "%*s" 1024 $marker
- done > $dir/ORIGINAL
-
- #
- # get and put an object, compare they are equal
- #
- rados --pool $poolname put $objname $dir/ORIGINAL || return 1
- rados --pool $poolname get $objname $dir/COPY || return 1
- diff $dir/ORIGINAL $dir/COPY || return 1
- rm $dir/COPY
-
- #
- # take out two OSDs used to store the object, wait for the cluster
- # to be clean (i.e. all PG are clean and active) again which
- # implies the PG have been moved to use the remaining OSDs. Check
- # the object can still be retrieved.
- #
- wait_for_clean || return 1
- local osds_list=$(get_osds $poolname $objname)
- local -a osds=($osds_list)
- for osd in 0 1 ; do
- ceph osd out ${osds[$osd]} || return 1
- done
- wait_for_clean || return 1
- #
- # verify the object is no longer mapped to the osds that are out
- #
- for osd in 0 1 ; do
- ! get_osds $poolname $objname | grep '\<'${osds[$osd]}'\>' || return 1
- done
- rados --pool $poolname get $objname $dir/COPY || return 1
- diff $dir/ORIGINAL $dir/COPY || return 1
- #
- # bring the osds back in, , wait for the cluster
- # to be clean (i.e. all PG are clean and active) again which
- # implies the PG go back to using the same osds as before
- #
- for osd in 0 1 ; do
- ceph osd in ${osds[$osd]} || return 1
- done
- wait_for_clean || return 1
- test "$osds_list" = "$(get_osds $poolname $objname)" || return 1
- rm $dir/ORIGINAL
-}
-
-function TEST_rados_put_get_lrc_advanced() {
- local dir=$1
- local poolname=pool-lrc-a
- local profile=profile-lrc-a
-
- ceph osd erasure-code-profile set $profile \
- plugin=lrc \
- mapping=DD_ \
- crush-steps='[ [ "chooseleaf", "osd", 0 ] ]' \
- layers='[ [ "DDc", "" ] ]' || return 1
- ceph osd pool create $poolname 12 12 erasure $profile \
- || return 1
-
- rados_put_get $dir $poolname || return 1
-
- delete_pool $poolname
- ceph osd erasure-code-profile rm $profile
-}
-
-function TEST_rados_put_get_lrc_kml() {
- local dir=$1
- local poolname=pool-lrc
- local profile=profile-lrc
-
- ceph osd erasure-code-profile set $profile \
- plugin=lrc \
- k=4 m=2 l=3 \
- crush-failure-domain=osd || return 1
- ceph osd pool create $poolname 12 12 erasure $profile \
- || return 1
-
- rados_put_get $dir $poolname || return 1
-
- delete_pool $poolname
- ceph osd erasure-code-profile rm $profile
-}
-
-function TEST_rados_put_get_isa() {
- if ! erasure_code_plugin_exists isa ; then
- echo "SKIP because plugin isa has not been built"
- return 0
- fi
- local dir=$1
- local poolname=pool-isa
-
- ceph osd erasure-code-profile set profile-isa \
- plugin=isa \
- crush-failure-domain=osd || return 1
- ceph osd pool create $poolname 1 1 erasure profile-isa \
- || return 1
-
- rados_put_get $dir $poolname || return 1
-
- delete_pool $poolname
-}
-
-function TEST_rados_put_get_jerasure() {
- local dir=$1
-
- rados_put_get $dir ecpool || return 1
-
- local poolname=pool-jerasure
- local profile=profile-jerasure
-
- ceph osd erasure-code-profile set $profile \
- plugin=jerasure \
- k=4 m=2 \
- crush-failure-domain=osd || return 1
- ceph osd pool create $poolname 12 12 erasure $profile \
- || return 1
-
- rados_put_get $dir $poolname || return 1
- rados_osds_out_in $dir $poolname || return 1
-
- delete_pool $poolname
- ceph osd erasure-code-profile rm $profile
-}
-
-function TEST_rados_put_get_shec() {
- local dir=$1
-
- local poolname=pool-shec
- local profile=profile-shec
-
- ceph osd erasure-code-profile set $profile \
- plugin=shec \
- k=2 m=1 c=1 \
- crush-failure-domain=osd || return 1
- ceph osd pool create $poolname 12 12 erasure $profile \
- || return 1
-
- rados_put_get $dir $poolname || return 1
-
- delete_pool $poolname
- ceph osd erasure-code-profile rm $profile
-}
-
-function TEST_alignment_constraints() {
- local payload=ABC
- echo "$payload" > $dir/ORIGINAL
- #
- # Verify that the rados command enforces alignment constraints
- # imposed by the stripe width
- # See http://tracker.ceph.com/issues/8622
- #
- local stripe_unit=$(ceph-conf --show-config-value osd_pool_erasure_code_stripe_unit)
- eval local $(ceph osd erasure-code-profile get myprofile | grep k=)
- local block_size=$((stripe_unit * k - 1))
- dd if=/dev/zero of=$dir/ORIGINAL bs=$block_size count=2
- rados --block-size=$block_size \
- --pool ecpool put UNALIGNED $dir/ORIGINAL || return 1
- rm $dir/ORIGINAL
-}
-
-function chunk_size() {
- echo $(ceph-conf --show-config-value osd_pool_erasure_code_stripe_unit)
-}
-
-#
-# By default an object will be split in two (k=2) with the first part
-# of the object in the first OSD of the up set and the second part in
-# the next OSD in the up set. This layout is defined by the mapping
-# parameter and this function helps verify that the first and second
-# part of the object are located in the OSD where they should be.
-#
-function verify_chunk_mapping() {
- local dir=$1
- local poolname=$2
- local first=$3
- local second=$4
-
- local payload=$(printf '%*s' $(chunk_size) FIRST$poolname ; printf '%*s' $(chunk_size) SECOND$poolname)
- echo -n "$payload" > $dir/ORIGINAL
-
- rados --pool $poolname put SOMETHING$poolname $dir/ORIGINAL || return 1
- rados --pool $poolname get SOMETHING$poolname $dir/COPY || return 1
- local -a osds=($(get_osds $poolname SOMETHING$poolname))
- for (( i = 0; i < ${#osds[@]}; i++ )) ; do
- ceph daemon osd.${osds[$i]} flush_journal
- done
- diff $dir/ORIGINAL $dir/COPY || return 1
- rm $dir/COPY
-
- local -a osds=($(get_osds $poolname SOMETHING$poolname))
- grep --quiet --recursive --text FIRST$poolname $dir/${osds[$first]} || return 1
- grep --quiet --recursive --text SECOND$poolname $dir/${osds[$second]} || return 1
-}
-
-function TEST_chunk_mapping() {
- local dir=$1
-
- #
- # mapping=DD_ is the default:
- # first OSD (i.e. 0) in the up set has the first part of the object
- # second OSD (i.e. 1) in the up set has the second part of the object
- #
- verify_chunk_mapping $dir ecpool 0 1 || return 1
-
- ceph osd erasure-code-profile set remap-profile \
- plugin=lrc \
- layers='[ [ "_DD", "" ] ]' \
- mapping='_DD' \
- crush-steps='[ [ "choose", "osd", 0 ] ]' || return 1
- ceph osd erasure-code-profile get remap-profile
- ceph osd pool create remap-pool 12 12 erasure remap-profile \
- || return 1
-
- #
- # mapping=_DD
- # second OSD (i.e. 1) in the up set has the first part of the object
- # third OSD (i.e. 2) in the up set has the second part of the object
- #
- verify_chunk_mapping $dir remap-pool 1 2 || return 1
-
- delete_pool remap-pool
- ceph osd erasure-code-profile rm remap-profile
-}
-
-main test-erasure-code "$@"
-
-# Local Variables:
-# compile-command: "cd ../.. ; make -j4 && test/erasure-code/test-erasure-code.sh"
-# End:
+++ /dev/null
-#!/bin/bash
-#
-# Copyright (C) 2015 Red Hat <contact@redhat.com>
-#
-#
-# Author: Kefu Chai <kchai@redhat.com>
-#
-# This program is free software; you can redistribute it and/or modify
-# it under the terms of the GNU Library Public License as published by
-# the Free Software Foundation; either version 2, or (at your option)
-# any later version.
-#
-# This program is distributed in the hope that it will be useful,
-# but WITHOUT ANY WARRANTY; without even the implied warranty of
-# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
-# GNU Library Public License for more details.
-#
-
-source $(dirname $0)/../detect-build-env-vars.sh
-source $CEPH_ROOT/qa/workunits/ceph-helpers.sh
-
-function run() {
- local dir=$1
- shift
-
- export CEPH_MON="127.0.0.1:7112" # git grep '\<7112\>' : there must be only one
- export CEPH_ARGS
- CEPH_ARGS+="--fsid=$(uuidgen) --auth-supported=none "
- CEPH_ARGS+="--mon-host=$CEPH_MON "
-
- local funcs=${@:-$(set | sed -n -e 's/^\(TEST_[0-9a-z_]*\) .*/\1/p')}
- for func in $funcs ; do
- setup $dir || return 1
- run_mon $dir a || return 1
- run_mgr $dir x || return 1
- # check that erasure code plugins are preloaded
- CEPH_ARGS='' ceph --admin-daemon $dir/ceph-mon.a.asok log flush || return 1
- grep 'load: jerasure.*lrc' $dir/mon.a.log || return 1
- $func $dir || return 1
- teardown $dir || return 1
- done
-}
-
-function setup_osds() {
- for id in $(seq 0 3) ; do
- run_osd $dir $id || return 1
- done
- wait_for_clean || return 1
-
- # check that erasure code plugins are preloaded
- CEPH_ARGS='' ceph --admin-daemon $dir/ceph-osd.0.asok log flush || return 1
- grep 'load: jerasure.*lrc' $dir/osd.0.log || return 1
-}
-
-function create_erasure_coded_pool() {
- local poolname=$1
-
- ceph osd erasure-code-profile set myprofile \
- plugin=jerasure \
- k=2 m=1 \
- crush-failure-domain=osd || return 1
- ceph osd pool create $poolname 1 1 erasure myprofile \
- || return 1
- wait_for_clean || return 1
-}
-
-function delete_pool() {
- local poolname=$1
-
- ceph osd pool delete $poolname $poolname --yes-i-really-really-mean-it
- ceph osd erasure-code-profile rm myprofile
-}
-
-function rados_put() {
- local dir=$1
- local poolname=$2
- local objname=${3:-SOMETHING}
-
- for marker in AAA BBB CCCC DDDD ; do
- printf "%*s" 1024 $marker
- done > $dir/ORIGINAL
- #
- # get and put an object, compare they are equal
- #
- rados --pool $poolname put $objname $dir/ORIGINAL || return 1
-}
-
-function rados_get() {
- local dir=$1
- local poolname=$2
- local objname=${3:-SOMETHING}
- local expect=${4:-ok}
-
- #
- # Expect a failure to get object
- #
- if [ $expect = "fail" ];
- then
- ! rados --pool $poolname get $objname $dir/COPY
- return
- fi
- #
- # get an object, compare with $dir/ORIGINAL
- #
- rados --pool $poolname get $objname $dir/COPY || return 1
- diff $dir/ORIGINAL $dir/COPY || return 1
- rm $dir/COPY
-}
-
-function rados_put_get() {
- local dir=$1
- local poolname=$2
- local objname=${3:-SOMETHING}
- local recovery=$4
-
- #
- # get and put an object, compare they are equal
- #
- rados_put $dir $poolname $objname || return 1
- # We can read even though caller injected read error on one of the shards
- rados_get $dir $poolname $objname || return 1
-
- if [ -n "$recovery" ];
- then
- #
- # take out the last OSD used to store the object,
- # bring it back, and check for clean PGs which means
- # recovery didn't crash the primary.
- #
- local -a initial_osds=($(get_osds $poolname $objname))
- local last=$((${#initial_osds[@]} - 1))
- # Kill OSD
- kill_daemons $dir TERM osd.${initial_osds[$last]} >&2 < /dev/null || return 1
- ceph osd out ${initial_osds[$last]} || return 1
- ! get_osds $poolname $objname | grep '\<'${initial_osds[$last]}'\>' || return 1
- ceph osd in ${initial_osds[$last]} || return 1
- run_osd $dir ${initial_osds[$last]} || return 1
- wait_for_clean || return 1
- fi
-
- rm $dir/ORIGINAL
-}
-
-function inject_eio() {
- local objname=$1
- shift
- local dir=$1
- shift
- local shard_id=$1
- shift
-
- local poolname=pool-jerasure
- local -a initial_osds=($(get_osds $poolname $objname))
- local osd_id=${initial_osds[$shard_id]}
- set_config osd $osd_id filestore_debug_inject_read_err true || return 1
- CEPH_ARGS='' ceph --admin-daemon $dir/ceph-osd.$osd_id.asok \
- injectdataerr $poolname $objname $shard_id || return 1
-}
-
-function rados_get_data_eio() {
- local dir=$1
- shift
- local shard_id=$1
- shift
- local recovery=$1
- shift
-
- # inject eio to speificied shard
- #
- local poolname=pool-jerasure
- local objname=obj-eio-$$-$shard_id
- inject_eio $objname $dir $shard_id || return 1
- rados_put_get $dir $poolname $objname $recovery || return 1
-
- shard_id=$(expr $shard_id + 1)
- inject_eio $objname $dir $shard_id || return 1
- # Now 2 out of 3 shards get EIO, so should fail
- rados_get $dir $poolname $objname fail || return 1
-}
-
-# Change the size of speificied shard
-#
-function set_size() {
- local objname=$1
- shift
- local dir=$1
- shift
- local shard_id=$1
- shift
- local bytes=$1
- shift
- local mode=${1}
-
- local poolname=pool-jerasure
- local -a initial_osds=($(get_osds $poolname $objname))
- local osd_id=${initial_osds[$shard_id]}
- ceph osd set noout
- if [ "$mode" = "add" ];
- then
- objectstore_tool $dir $osd_id $objname get-bytes $dir/CORRUPT || return 1
- dd if=/dev/urandom bs=$bytes count=1 >> $dir/CORRUPT
- elif [ "$bytes" = "0" ];
- then
- touch $dir/CORRUPT
- else
- dd if=/dev/urandom bs=$bytes count=1 of=$dir/CORRUPT
- fi
- objectstore_tool $dir $osd_id $objname set-bytes $dir/CORRUPT || return 1
- rm -f $dir/CORRUPT
- ceph osd unset noout
-}
-
-function rados_get_data_bad_size() {
- local dir=$1
- shift
- local shard_id=$1
- shift
- local bytes=$1
- shift
- local mode=${1:-set}
-
- local poolname=pool-jerasure
- local objname=obj-size-$$-$shard_id-$bytes
- rados_put $dir $poolname $objname || return 1
-
- # Change the size of speificied shard
- #
- set_size $objname $dir $shard_id $bytes $mode || return 1
-
- rados_get $dir $poolname $objname || return 1
-
- # Leave objname and modify another shard
- shard_id=$(expr $shard_id + 1)
- set_size $objname $dir $shard_id $bytes $mode || return 1
- rados_get $dir $poolname $objname fail || return 1
-}
-
-#
-# These two test cases try to validate the following behavior:
-# For object on EC pool, if there is one shard having read error (
-# either primary or replica), client can still read object.
-#
-# If 2 shards have read errors the client will get an error.
-#
-function TEST_rados_get_subread_eio_shard_0() {
- local dir=$1
- setup_osds || return 1
-
- local poolname=pool-jerasure
- create_erasure_coded_pool $poolname || return 1
- # inject eio on primary OSD (0) and replica OSD (1)
- local shard_id=0
- rados_get_data_eio $dir $shard_id || return 1
- delete_pool $poolname
-}
-
-function TEST_rados_get_subread_eio_shard_1() {
- local dir=$1
- setup_osds || return 1
-
- local poolname=pool-jerasure
- create_erasure_coded_pool $poolname || return 1
- # inject eio into replicas OSD (1) and OSD (2)
- local shard_id=1
- rados_get_data_eio $dir $shard_id || return 1
- delete_pool $poolname
-}
-
-#
-# These two test cases try to validate that following behavior:
-# For object on EC pool, if there is one shard which an incorrect
-# size this will cause an internal read error, client can still read object.
-#
-# If 2 shards have incorrect size the client will get an error.
-#
-function TEST_rados_get_bad_size_shard_0() {
- local dir=$1
- setup_osds || return 1
-
- local poolname=pool-jerasure
- create_erasure_coded_pool $poolname || return 1
- # Set incorrect size into primary OSD (0) and replica OSD (1)
- local shard_id=0
- rados_get_data_bad_size $dir $shard_id 10 || return 1
- rados_get_data_bad_size $dir $shard_id 0 || return 1
- rados_get_data_bad_size $dir $shard_id 256 add || return 1
- delete_pool $poolname
-}
-
-function TEST_rados_get_bad_size_shard_1() {
- local dir=$1
- setup_osds || return 1
-
- local poolname=pool-jerasure
- create_erasure_coded_pool $poolname || return 1
- # Set incorrect size into replicas OSD (1) and OSD (2)
- local shard_id=1
- rados_get_data_bad_size $dir $shard_id 10 || return 1
- rados_get_data_bad_size $dir $shard_id 0 || return 1
- rados_get_data_bad_size $dir $shard_id 256 add || return 1
- delete_pool $poolname
-}
-
-function TEST_rados_get_with_subreadall_eio_shard_0() {
- local dir=$1
- local shard_id=0
-
- setup_osds || return 1
-
- local poolname=pool-jerasure
- create_erasure_coded_pool $poolname || return 1
- # inject eio on primary OSD (0)
- local shard_id=0
- rados_get_data_eio $dir $shard_id recovery || return 1
-
- delete_pool $poolname
-}
-
-function TEST_rados_get_with_subreadall_eio_shard_1() {
- local dir=$1
- local shard_id=0
-
- setup_osds || return 1
-
- local poolname=pool-jerasure
- create_erasure_coded_pool $poolname || return 1
- # inject eio on replica OSD (1)
- local shard_id=1
- rados_get_data_eio $dir $shard_id recovery || return 1
-
- delete_pool $poolname
-}
-
-main test-erasure-eio "$@"
-
-# Local Variables:
-# compile-command: "cd ../.. ; make -j4 && test/erasure-code/test-erasure-eio.sh"
-# End:
int ret = ceph_create(&cmount, NULL);
ASSERT_EQ(ret, 0);
- ret = ceph_conf_set(cmount, "max_open_files", "21");
+ ret = ceph_conf_set(cmount, "leveldb_max_open_files", "21");
ASSERT_EQ(ret, 0);
char buf[128];
memset(buf, 0, sizeof(buf));
- ret = ceph_conf_get(cmount, "max_open_files", buf, sizeof(buf));
+ ret = ceph_conf_get(cmount, "leveldb_max_open_files", buf, sizeof(buf));
ASSERT_EQ(ret, 0);
ASSERT_EQ(string("21"), string(buf));
int ret = ceph_create(&cmount, NULL);
ASSERT_EQ(ret, 0);
- const char *argv[] = { "foo", "--max-open-files", "2",
+ const char *argv[] = { "foo", "--leveldb-max-open-files", "2",
"--keyfile", "/tmp/my-keyfile", NULL };
size_t argc = (sizeof(argv) / sizeof(argv[0])) - 1;
ceph_conf_parse_argv(cmount, argc, argv);
ASSERT_EQ(string("/tmp/my-keyfile"), string(buf));
memset(buf, 0, sizeof(buf));
- ret = ceph_conf_get(cmount, "max_open_files", buf, sizeof(buf));
+ ret = ceph_conf_get(cmount, "leveldb_max_open_files", buf, sizeof(buf));
ASSERT_EQ(ret, 0);
ASSERT_EQ(string("2"), string(buf));
ASSERT_EQ(0, test_data.m_cluster.pool_create(p.c_str()));
IoCtx ioctx;
ASSERT_EQ(0, test_data.m_cluster.ioctx_create(p.c_str(), ioctx));
+ ioctx.application_enable("rados", true);
bufferlist inbl;
ASSERT_EQ(0, test_data.m_cluster.mon_command(
#include "include/rados/librados.h"
#include "include/rados/librados.hpp"
#include "include/encoding.h"
+#include "include/err.h"
#include "include/scope_guard.h"
#include "test/librados/test.h"
#include "test/librados/TestCase.h"
}
}
}
+
+// cmpext guarding a write: when the compared bytes equal what was just
+// written, the compound operation is expected to succeed and the new payload
+// to be readable afterwards.
+TEST_F(LibRadosIoPP, CmpExtPP) {
+  // Seed the object with the lowercase payload.
+  bufferlist current;
+  current.append("ceph");
+  ObjectWriteOperation seed_op;
+  seed_op.write(0, current);
+  ASSERT_EQ(0, ioctx.operate("foo", &seed_op));
+
+  // Compare against the seeded bytes, then overwrite in the same operation.
+  bufferlist replacement;
+  replacement.append("CEPH");
+  ObjectWriteOperation guarded_op;
+  guarded_op.cmpext(0, current, nullptr);
+  guarded_op.write(0, replacement);
+  ASSERT_EQ(0, ioctx.operate("foo", &guarded_op));
+
+  // Read the object back and verify the uppercase payload landed.
+  ObjectReadOperation read_op;
+  read_op.read(0, current.length(), NULL, NULL);
+  ASSERT_EQ(0, ioctx.operate("foo", &read_op, &current));
+  ASSERT_EQ(0, memcmp(current.c_str(), "CEPH", 4));
+}
+
+// cmpext against four NUL bytes on an object this test never writes first
+// ("DNE" presumably means "does not exist" -- confirm against the fixture).
+// The guarded write is expected to succeed.
+TEST_F(LibRadosIoPP, CmpExtDNEPP) {
+  // Comparison payload: four zero bytes.
+  bufferlist zeroes;
+  zeroes.append(std::string(4, '\0'));
+
+  // Compare, then write the new payload in the same operation.
+  bufferlist replacement;
+  replacement.append("CEPH");
+  ObjectWriteOperation guarded_op;
+  guarded_op.cmpext(0, zeroes, nullptr);
+  guarded_op.write(0, replacement);
+  ASSERT_EQ(0, ioctx.operate("foo", &guarded_op));
+
+  // Read back and verify the payload was written.
+  ObjectReadOperation read_op;
+  read_op.read(0, zeroes.length(), NULL, NULL);
+  ASSERT_EQ(0, ioctx.operate("foo", &read_op, &zeroes));
+  ASSERT_EQ(0, memcmp(zeroes.c_str(), "CEPH", 4));
+}
+
+// cmpext with bytes that differ from the stored data: the compound operation
+// is expected to fail with -MAX_ERRNO and leave the object unchanged.
+TEST_F(LibRadosIoPP, CmpExtMismatchPP) {
+  // Seed the object with the lowercase payload.
+  bufferlist current;
+  current.append("ceph");
+  ObjectWriteOperation seed_op;
+  seed_op.write(0, current);
+  ASSERT_EQ(0, ioctx.operate("foo", &seed_op));
+
+  // Compare against the uppercase payload, which differs from the stored
+  // bytes starting at the first character, so the write must not happen.
+  bufferlist replacement;
+  replacement.append("CEPH");
+  ObjectWriteOperation guarded_op;
+  guarded_op.cmpext(0, replacement, nullptr);
+  guarded_op.write(0, replacement);
+  ASSERT_EQ(-MAX_ERRNO, ioctx.operate("foo", &guarded_op));
+
+  // The original lowercase payload must still be there.
+  ObjectReadOperation read_op;
+  read_op.read(0, current.length(), NULL, NULL);
+  ASSERT_EQ(0, ioctx.operate("foo", &read_op, &current));
+  ASSERT_EQ(0, memcmp(current.c_str(), "ceph", 4));
+}
+
+// Same scenario as the matching-cmpext case, run on the LibRadosIoECPP
+// fixture; the guarded overwrite uses write_full instead of an offset write.
+TEST_F(LibRadosIoECPP, CmpExtPP) {
+  // Seed the object with the lowercase payload.
+  bufferlist current;
+  current.append("ceph");
+  ObjectWriteOperation seed_op;
+  seed_op.write(0, current);
+  ASSERT_EQ(0, ioctx.operate("foo", &seed_op));
+
+  // Compare against the seeded bytes, then replace the whole object.
+  bufferlist replacement;
+  replacement.append("CEPH");
+  ObjectWriteOperation guarded_op;
+  guarded_op.cmpext(0, current, nullptr);
+  guarded_op.write_full(replacement);
+  ASSERT_EQ(0, ioctx.operate("foo", &guarded_op));
+
+  // Read the object back and verify the uppercase payload landed.
+  ObjectReadOperation read_op;
+  read_op.read(0, current.length(), NULL, NULL);
+  ASSERT_EQ(0, ioctx.operate("foo", &read_op, &current));
+  ASSERT_EQ(0, memcmp(current.c_str(), "CEPH", 4));
+}
+
+// cmpext against four NUL bytes on an object this test never writes first
+// ("DNE" presumably means "does not exist" -- confirm against the fixture),
+// followed by write_full. The operation is expected to succeed.
+TEST_F(LibRadosIoECPP, CmpExtDNEPP) {
+  // Comparison payload: four zero bytes.
+  bufferlist zeroes;
+  zeroes.append(std::string(4, '\0'));
+
+  // Compare, then replace the whole object in the same operation.
+  bufferlist replacement;
+  replacement.append("CEPH");
+  ObjectWriteOperation guarded_op;
+  guarded_op.cmpext(0, zeroes, nullptr);
+  guarded_op.write_full(replacement);
+  ASSERT_EQ(0, ioctx.operate("foo", &guarded_op));
+
+  // Read back and verify the payload was written.
+  ObjectReadOperation read_op;
+  read_op.read(0, zeroes.length(), NULL, NULL);
+  ASSERT_EQ(0, ioctx.operate("foo", &read_op, &zeroes));
+  ASSERT_EQ(0, memcmp(zeroes.c_str(), "CEPH", 4));
+}
+
+// cmpext with bytes that differ from the stored data, followed by write_full:
+// the operation is expected to fail with -MAX_ERRNO and leave the object
+// unchanged.
+TEST_F(LibRadosIoECPP, CmpExtMismatchPP) {
+  // Seed the object with the lowercase payload.
+  bufferlist current;
+  current.append("ceph");
+  ObjectWriteOperation seed_op;
+  seed_op.write(0, current);
+  ASSERT_EQ(0, ioctx.operate("foo", &seed_op));
+
+  // Compare against the uppercase payload, which differs from the stored
+  // bytes starting at the first character, so the replace must not happen.
+  bufferlist replacement;
+  replacement.append("CEPH");
+  ObjectWriteOperation guarded_op;
+  guarded_op.cmpext(0, replacement, nullptr);
+  guarded_op.write_full(replacement);
+  ASSERT_EQ(-MAX_ERRNO, ioctx.operate("foo", &guarded_op));
+
+  // The original lowercase payload must still be there.
+  ObjectReadOperation read_op;
+  read_op.read(0, current.length(), NULL, NULL);
+  ASSERT_EQ(0, ioctx.operate("foo", &read_op, &current));
+  ASSERT_EQ(0, memcmp(current.c_str(), "ceph", 4));
+}
int ret = rados_create(&cl, NULL);
ASSERT_EQ(ret, 0);
- ret = rados_conf_set(cl, "max_open_files", "21");
+ ret = rados_conf_set(cl, "leveldb_max_open_files", "21");
ASSERT_EQ(ret, 0);
char buf[128];
memset(buf, 0, sizeof(buf));
- ret = rados_conf_get(cl, "max_open_files", buf, sizeof(buf));
+ ret = rados_conf_get(cl, "leveldb_max_open_files", buf, sizeof(buf));
ASSERT_EQ(ret, 0);
ASSERT_EQ(string("21"), string(buf));
int ret = rados_create(&cl, NULL);
ASSERT_EQ(ret, 0);
- const char *argv[] = { "foo", "--max-open-files", "2",
+ const char *argv[] = { "foo", "--leveldb-max-open-files", "2",
"--keyfile", "/tmp/my-keyfile", NULL };
size_t argc = (sizeof(argv) / sizeof(argv[0])) - 1;
rados_conf_parse_argv(cl, argc, argv);
ASSERT_EQ(string("/tmp/my-keyfile"), string(buf));
memset(buf, 0, sizeof(buf));
- ret = rados_conf_get(cl, "max_open_files", buf, sizeof(buf));
+ ret = rados_conf_get(cl, "leveldb_max_open_files", buf, sizeof(buf));
ASSERT_EQ(ret, 0);
ASSERT_EQ(string("2"), string(buf));
#include <map>
#include <sstream>
#include <string>
-
+#include <boost/regex.hpp>
using namespace librados;
using std::map;
ASSERT_EQ("", create_one_ec_pool_pp(pool_name, s_cluster));
src_pool_name = get_temp_pool_name();
ASSERT_EQ(0, s_cluster.pool_create(src_pool_name.c_str()));
+
+ librados::IoCtx ioctx;
+ ASSERT_EQ(0, s_cluster.ioctx_create(pool_name.c_str(), ioctx));
+ ioctx.application_enable("rados", true);
+
+ librados::IoCtx src_ioctx;
+ ASSERT_EQ(0, s_cluster.ioctx_create(src_pool_name.c_str(), src_ioctx));
+ src_ioctx.application_enable("rados", true);
}
static void TearDownTestCase() {
ASSERT_EQ(0, s_cluster.pool_delete(src_pool_name.c_str()));
ASSERT_EQ(-MAX_ERRNO - 5,
rados_cmpext(ioctx, "cmpextpp", mismatch_str, sizeof(mismatch_str), 0));
}
+
+// Exercises the C application-metadata API on the fixture's pool: listing and
+// enabling applications, then listing, setting and removing per-application
+// key/value metadata. The test returns early (printing "SKIPPING") unless the
+// "osd dump" output matches "require_osd_release [l-z]".
+TEST_F(LibRadosMisc, Applications) {
+  const char *cmd[] = {"{\"prefix\":\"osd dump\"}", nullptr};
+  char *buf, *st;
+  size_t buflen, stlen;
+  ASSERT_EQ(0, rados_mon_command(cluster, (const char **)cmd, 1, "", 0, &buf,
+                                 &buflen, &st, &stlen));
+  ASSERT_LT(0u, buflen);
+  // The output buffer's size is reported through buflen and it is not
+  // guaranteed to be NUL-terminated, so build the string from the explicit
+  // length instead of scanning for a terminator.
+  string result(buf, buflen);
+  rados_buffer_free(buf);
+  rados_buffer_free(st);
+  if (!boost::regex_search(result, boost::regex("require_osd_release [l-z]"))) {
+    std::cout << "SKIPPING";
+    return;
+  }
+
+  char apps[128];
+  size_t app_len;
+
+  // Initially only "rados" is expected: "rados\0" is 6 bytes.
+  app_len = sizeof(apps);
+  ASSERT_EQ(0, rados_application_list(ioctx, apps, &app_len));
+  ASSERT_EQ(6U, app_len);
+  ASSERT_EQ(0, memcmp("rados\0", apps, app_len));
+
+  // Enabling a second application with a zero third argument (presumably the
+  // "force" flag -- confirm against librados.h) is expected to be refused.
+  ASSERT_EQ(0, rados_application_enable(ioctx, "app1", 1));
+  ASSERT_EQ(-EPERM, rados_application_enable(ioctx, "app2", 0));
+  ASSERT_EQ(0, rados_application_enable(ioctx, "app2", 1));
+
+  // app_len still holds 6 from the previous listing, so the first call is
+  // expected to fail with -ERANGE and report the required size; the retry
+  // with that size succeeds.
+  ASSERT_EQ(-ERANGE, rados_application_list(ioctx, apps, &app_len));
+  ASSERT_EQ(16U, app_len);
+  ASSERT_EQ(0, rados_application_list(ioctx, apps, &app_len));
+  ASSERT_EQ(16U, app_len);
+  ASSERT_EQ(0, memcmp("app1\0app2\0rados\0", apps, app_len));
+
+  char keys[128];
+  char vals[128];
+  size_t key_len;
+  size_t val_len;
+
+  // Unknown application -> -ENOENT; a known one with no metadata -> empty.
+  key_len = sizeof(keys);
+  val_len = sizeof(vals);
+  ASSERT_EQ(-ENOENT, rados_application_metadata_list(ioctx, "dne", keys,
+                                                     &key_len, vals, &val_len));
+  ASSERT_EQ(0, rados_application_metadata_list(ioctx, "app1", keys, &key_len,
+                                               vals, &val_len));
+  ASSERT_EQ(0U, key_len);
+  ASSERT_EQ(0U, val_len);
+
+  ASSERT_EQ(-ENOENT, rados_application_metadata_set(ioctx, "dne", "key",
+                                                    "value"));
+  ASSERT_EQ(0, rados_application_metadata_set(ioctx, "app1", "key1", "value1"));
+  ASSERT_EQ(0, rados_application_metadata_set(ioctx, "app1", "key2", "value2"));
+
+  // key_len/val_len are 0 from the empty listing above, so the first call is
+  // expected to fail with -ERANGE and report the required sizes.
+  ASSERT_EQ(-ERANGE, rados_application_metadata_list(ioctx, "app1", keys,
+                                                     &key_len, vals, &val_len));
+  ASSERT_EQ(10U, key_len);
+  ASSERT_EQ(14U, val_len);
+  ASSERT_EQ(0, rados_application_metadata_list(ioctx, "app1", keys, &key_len,
+                                               vals, &val_len));
+  ASSERT_EQ(10U, key_len);
+  ASSERT_EQ(14U, val_len);
+  ASSERT_EQ(0, memcmp("key1\0key2\0", keys, key_len));
+  ASSERT_EQ(0, memcmp("value1\0value2\0", vals, val_len));
+
+  // After removing key1 only key2/value2 should remain.
+  ASSERT_EQ(0, rados_application_metadata_remove(ioctx, "app1", "key1"));
+  ASSERT_EQ(0, rados_application_metadata_list(ioctx, "app1", keys, &key_len,
+                                               vals, &val_len));
+  ASSERT_EQ(5U, key_len);
+  ASSERT_EQ(7U, val_len);
+  ASSERT_EQ(0, memcmp("key2\0", keys, key_len));
+  ASSERT_EQ(0, memcmp("value2\0", vals, val_len));
+}
+
+// C++ counterpart of the application-metadata test: enable applications on
+// the fixture's pool, then list, set and remove per-application metadata.
+// Returns early (printing "SKIPPING") unless the "osd dump" output matches
+// "require_osd_release [l-z]".
+TEST_F(LibRadosMiscPP, Applications) {
+  bufferlist inbl, outbl;
+  string outs;
+  ASSERT_EQ(0, cluster.mon_command("{\"prefix\": \"osd dump\"}",
+                                   inbl, &outbl, &outs));
+  ASSERT_LT(0u, outbl.length());
+  ASSERT_LE(0u, outs.length());
+  const boost::regex release_re("require_osd_release [l-z]");
+  if (!boost::regex_search(outbl.to_str(), release_re)) {
+    std::cout << "SKIPPING";
+    return;
+  }
+
+  // Initially only "rados" is expected to be enabled.
+  std::set<std::string> want_apps = {"rados"};
+  std::set<std::string> got_apps;
+  ASSERT_EQ(0, ioctx.application_list(&got_apps));
+  ASSERT_EQ(want_apps, got_apps);
+
+  // A second application is expected to be refused unless the second
+  // argument is true.
+  ASSERT_EQ(0, ioctx.application_enable("app1", true));
+  ASSERT_EQ(-EPERM, ioctx.application_enable("app2", false));
+  ASSERT_EQ(0, ioctx.application_enable("app2", true));
+
+  want_apps = {"app1", "app2", "rados"};
+  ASSERT_EQ(0, ioctx.application_list(&got_apps));
+  ASSERT_EQ(want_apps, got_apps);
+
+  // Unknown application -> -ENOENT; a known one with no metadata -> empty map.
+  std::map<std::string, std::string> want_meta;
+  std::map<std::string, std::string> got_meta;
+  ASSERT_EQ(-ENOENT, ioctx.application_metadata_list("dne", &got_meta));
+  ASSERT_EQ(0, ioctx.application_metadata_list("app1", &got_meta));
+  ASSERT_EQ(want_meta, got_meta);
+
+  ASSERT_EQ(-ENOENT, ioctx.application_metadata_set("dne", "key1", "value1"));
+  ASSERT_EQ(0, ioctx.application_metadata_set("app1", "key1", "value1"));
+  ASSERT_EQ(0, ioctx.application_metadata_set("app1", "key2", "value2"));
+
+  want_meta = {{"key1", "value1"}, {"key2", "value2"}};
+  ASSERT_EQ(0, ioctx.application_metadata_list("app1", &got_meta));
+  ASSERT_EQ(want_meta, got_meta);
+
+  // After removing key1 only key2 should remain.
+  ASSERT_EQ(0, ioctx.application_metadata_remove("app1", "key1"));
+
+  want_meta = {{"key2", "value2"}};
+  ASSERT_EQ(0, ioctx.application_metadata_list("app1", &got_meta));
+  ASSERT_EQ(want_meta, got_meta);
+}
using namespace librados;
+// Registers a service daemon BEFORE connecting to the cluster, checks that a
+// second registration is rejected with -EEXIST, then connects and shuts down.
+TEST(LibRadosService, RegisterEarly) {
+ rados_t cluster;
+ ASSERT_EQ(0, rados_create(&cluster, "admin"));
+ ASSERT_EQ(0, rados_conf_read_file(cluster, NULL));
+ ASSERT_EQ(0, rados_conf_parse_env(cluster, NULL));
+
+ // Daemon name is derived from the current pid. The metadata argument is a
+ // sequence of NUL-separated strings.
+ string name = string("pid") + stringify(getpid());
+ ASSERT_EQ(0, rados_service_register(cluster, "laundry", name.c_str(),
+ "foo\0bar\0this\0that\0"));
+ ASSERT_EQ(-EEXIST, rados_service_register(cluster, "laundry", name.c_str(),
+ "foo\0bar\0this\0that\0"));
+
+ ASSERT_EQ(0, rados_connect(cluster));
+ // Stay connected for 5 seconds before tearing the handle down.
+ sleep(5);
+ rados_shutdown(cluster);
+}
+
+// Registers a service daemon AFTER connecting to the cluster and checks that
+// a second registration on the same handle is rejected with -EEXIST.
+TEST(LibRadosService, RegisterLate) {
+ rados_t cluster;
+ ASSERT_EQ(0, rados_create(&cluster, "admin"));
+ ASSERT_EQ(0, rados_conf_read_file(cluster, NULL));
+ ASSERT_EQ(0, rados_conf_parse_env(cluster, NULL));
+ ASSERT_EQ(0, rados_connect(cluster));
+
+ // Daemon name is derived from the current pid.
+ string name = string("pid") + stringify(getpid());
+ ASSERT_EQ(0, rados_service_register(cluster, "laundry", name.c_str(),
+ "foo\0bar\0this\0that\0"));
+ ASSERT_EQ(-EEXIST, rados_service_register(cluster, "laundry", name.c_str(),
+ "foo\0bar\0this\0that\0"));
+ rados_shutdown(cluster);
+}
+
+// Checks that a status update before connecting fails with -ENOTCONN, then
+// connects, registers a daemon and pushes 20 status updates one second apart.
+TEST(LibRadosService, Status) {
+ rados_t cluster;
+ ASSERT_EQ(0, rados_create(&cluster, "admin"));
+ ASSERT_EQ(0, rados_conf_read_file(cluster, NULL));
+ ASSERT_EQ(0, rados_conf_parse_env(cluster, NULL));
+
+ ASSERT_EQ(-ENOTCONN, rados_service_update_status(cluster,
+ "testing\0testing\0"));
+
+ ASSERT_EQ(0, rados_connect(cluster));
+ string name = string("pid") + stringify(getpid());
+ ASSERT_EQ(0, rados_service_register(cluster, "laundry", name.c_str(),
+ "foo\0bar\0this\0that\0"));
+
+ for (int i=0; i<20; ++i) {
+ char buffer[1024];
+ // Build "testing\0testing\0count\0<i>\0": the %c conversions emit the
+ // embedded NUL separators that a plain string literal could not carry
+ // alongside the formatted counter.
+ snprintf(buffer, sizeof(buffer), "%s%c%s%c%s%c%d%c",
+ "testing", '\0', "testing", '\0',
+ "count", '\0', i, '\0');
+ ASSERT_EQ(0, rados_service_update_status(cluster, buffer));
+ sleep(1);
+ }
+ rados_shutdown(cluster);
+}
+
TEST(LibRadosServicePP, RegisterEarly) {
Rados cluster;
cluster.init("admin");
return oss.str();
}
+ rados_ioctx_t ioctx;
+ ret = rados_ioctx_create(*cluster, pool_name.c_str(), &ioctx);
+ if (ret < 0) {
+ rados_shutdown(*cluster);
+ std::ostringstream oss;
+ oss << "rados_ioctx_create(" << pool_name << ") failed with error " << ret;
+ return oss.str();
+ }
+
+ rados_application_enable(ioctx, "rados", 1);
+ rados_ioctx_destroy(ioctx);
return "";
}
oss << "cluster.pool_create(" << pool_name << ") failed with error " << ret;
return oss.str();
}
+
+ IoCtx ioctx;
+ ret = cluster.ioctx_create(pool_name.c_str(), ioctx);
+ if (ret < 0) {
+ cluster.shutdown();
+ std::ostringstream oss;
+ oss << "cluster.ioctx_create(" << pool_name << ") failed with error "
+ << ret;
+ return oss.str();
+ }
+ ioctx.application_enable("rados", true);
return "";
}
cache_pool_name = get_temp_pool_name();
ASSERT_EQ(0, s_cluster.pool_create(cache_pool_name.c_str()));
RadosTestPP::SetUp();
+
ASSERT_EQ(0, cluster.ioctx_create(cache_pool_name.c_str(), cache_ioctx));
+ cache_ioctx.application_enable("rados", true);
cache_ioctx.set_namespace(nspace);
}
void TearDown() override {
ASSERT_EQ(0, cluster.pool_create(cache_pool_name.c_str()));
IoCtx cache_ioctx;
ASSERT_EQ(0, cluster.ioctx_create(cache_pool_name.c_str(), cache_ioctx));
+ cache_ioctx.application_enable("rados", true);
IoCtx ioctx;
ASSERT_EQ(0, cluster.ioctx_create(pool_name.c_str(), ioctx));
cache_pool_name = get_temp_pool_name();
ASSERT_EQ(0, s_cluster.pool_create(cache_pool_name.c_str()));
RadosTestECPP::SetUp();
+
ASSERT_EQ(0, cluster.ioctx_create(cache_pool_name.c_str(), cache_ioctx));
+ cache_ioctx.application_enable("rados", true);
cache_ioctx.set_namespace(nspace);
}
void TearDown() override {
ASSERT_EQ(0, cluster.pool_create(cache_pool_name.c_str()));
IoCtx cache_ioctx;
ASSERT_EQ(0, cluster.ioctx_create(cache_pool_name.c_str(), cache_ioctx));
+ cache_ioctx.application_enable("rados", true);
IoCtx ioctx;
ASSERT_EQ(0, cluster.ioctx_create(pool_name.c_str(), ioctx));
ASSERT_EQ(0, cluster.pool_create(cache_pool_name.c_str()));
IoCtx cache_ioctx;
ASSERT_EQ(0, cluster.ioctx_create(cache_pool_name.c_str(), cache_ioctx));
+ cache_ioctx.application_enable("rados", true);
IoCtx ioctx;
ASSERT_EQ(0, cluster.ioctx_create(pool_name.c_str(), ioctx));
ASSERT_EQ(-ENOTCONN, notify_err);
ASSERT_EQ(-ENOTCONN, rados_watch_check(ioctx, handle));
rados_unwatch2(ioctx, handle);
+ rados_watch_flush(cluster);
}
TEST_F(LibRadosWatchNotify, AioWatchDelete) {
ASSERT_EQ(0u, reply_buf_len);
rados_unwatch2(ioctx, handle);
+ rados_watch_flush(cluster);
}
TEST_F(LibRadosWatchNotify, AioWatchNotify2) {
ASSERT_EQ(0u, reply_buf_len);
rados_unwatch2(ioctx, handle);
+ rados_watch_flush(cluster);
}
TEST_P(LibRadosWatchNotifyPP, WatchNotify2) {
ASSERT_EQ(0u, missed_map.size());
ASSERT_GT(ioctx.watch_check(handle), 0);
ioctx.unwatch2(handle);
+ cluster.watch_flush();
}
// --
rados_buffer_free(reply_buf);
rados_unwatch2(ioctx, handle1);
rados_unwatch2(ioctx, handle2);
+ rados_watch_flush(cluster);
}
// --
ASSERT_EQ(it->timeout_seconds, timeout);
}
bufferlist bl2, bl_reply;
+ std::cout << "notify2" << std::endl;
ASSERT_EQ(0, ioctx.notify2(notify_oid, bl2, 300000, &bl_reply));
+ std::cout << "notify2 done" << std::endl;
bufferlist::iterator p = bl_reply.begin();
std::map<std::pair<uint64_t,uint64_t>,bufferlist> reply_map;
std::set<std::pair<uint64_t,uint64_t> > missed_map;
ASSERT_EQ(5u, reply_map.begin()->second.length());
ASSERT_EQ(0, strncmp("reply", reply_map.begin()->second.c_str(), 5));
ASSERT_EQ(0u, missed_map.size());
+ std::cout << "watch_check" << std::endl;
ASSERT_GT(ioctx.watch_check(handle), 0);
+ std::cout << "unwatch2" << std::endl;
ioctx.unwatch2(handle);
+
+ std::cout << " flushing" << std::endl;
+ cluster.watch_flush();
+ std::cout << "done" << std::endl;
}
TEST_F(LibRadosWatchNotify, Watch3Timeout) {
// re-watch
rados_unwatch2(ioctx, handle);
+ rados_watch_flush(cluster);
+
handle = 0;
ASSERT_EQ(0,
rados_watch2(ioctx, notify_oid, &handle,
rados_buffer_free(reply_buf);
rados_unwatch2(ioctx, handle);
+ rados_watch_flush(cluster);
}
TEST_F(LibRadosWatchNotify, AioWatchDelete2) {
ctx->get_snap_context()));
}
+// Test-stub IoCtx::cmpext: forwards the compare to TestIoCtxImpl::cmpext via
+// execute_operation(), binding the offset and compare buffer.
+int IoCtx::cmpext(const std::string& oid, uint64_t off, bufferlist& cmp_bl) {
+ TestIoCtxImpl *ctx = reinterpret_cast<TestIoCtxImpl*>(io_ctx_impl);
+ return ctx->execute_operation(
+ oid, boost::bind(&TestIoCtxImpl::cmpext, _1, _2, off, cmp_bl));
+}
+
+// Test-stub implementations of the pool application API. Only
+// application_enable() reports success (unconditionally, ignoring its
+// arguments); every other entry point returns -EOPNOTSUPP.
+int IoCtx::application_enable(const std::string& app_name, bool force) {
+ return 0;
+}
+
+// Not supported by the stub.
+int IoCtx::application_enable_async(const std::string& app_name,
+ bool force, PoolAsyncCompletion *c) {
+ return -EOPNOTSUPP;
+}
+
+// Not supported by the stub; app_names is left untouched.
+int IoCtx::application_list(std::set<std::string> *app_names) {
+ return -EOPNOTSUPP;
+}
+
+// Not supported by the stub; value is left untouched.
+int IoCtx::application_metadata_get(const std::string& app_name,
+ const std::string &key,
+ std::string *value) {
+ return -EOPNOTSUPP;
+}
+
+// Not supported by the stub.
+int IoCtx::application_metadata_set(const std::string& app_name,
+ const std::string &key,
+ const std::string& value) {
+ return -EOPNOTSUPP;
+}
+
+// Not supported by the stub.
+int IoCtx::application_metadata_remove(const std::string& app_name,
+ const std::string &key) {
+ return -EOPNOTSUPP;
+}
+
+// Not supported by the stub; values is left untouched.
+int IoCtx::application_metadata_list(const std::string& app_name,
+ std::map<std::string, std::string> *values) {
+ return -EOPNOTSUPP;
+}
+
static int save_operation_result(int result, int *pval) {
if (pval != NULL) {
*pval = result;
return o->ops.size();
}
+// Queues a compare-extent step on this compound operation. When prval is
+// non-NULL the step is wrapped so that save_operation_result() receives the
+// step's result together with prval.
+void ObjectOperation::cmpext(uint64_t off, bufferlist& cmp_bl, int *prval) {
+ TestObjectOperationImpl *o = reinterpret_cast<TestObjectOperationImpl*>(impl);
+ ObjectOperationTestImpl op = boost::bind(&TestIoCtxImpl::cmpext, _1, _2, off, cmp_bl);
+ if (prval != NULL) {
+ op = boost::bind(save_operation_result,
+ boost::bind(op, _1, _2, _3, _4), prval);
+ }
+ o->ops.push_back(op);
+}
+
void ObjectReadOperation::list_snaps(snap_set_t *out_snaps, int *prval) {
TestObjectOperationImpl *o = reinterpret_cast<TestObjectOperationImpl*>(impl);
return impl->mon_command(cmds, inbl, outbl, outs);
}
+// Test-stub Rados::service_daemon_register: delegates to the TestRadosClient
+// behind this handle and returns its result.
+int Rados::service_daemon_register(const std::string& service,
+ const std::string& name,
+ const std::map<std::string,std::string>& metadata) {
+ TestRadosClient *impl = reinterpret_cast<TestRadosClient*>(client);
+ return impl->service_daemon_register(service, name, metadata);
+}
+
+// Test-stub Rados::service_daemon_update_status: delegates to the
+// TestRadosClient behind this handle and returns its result.
+int Rados::service_daemon_update_status(const std::map<std::string,std::string>& status) {
+ TestRadosClient *impl = reinterpret_cast<TestRadosClient*>(client);
+ return impl->service_daemon_update_status(status);
+}
+
int Rados::pool_create(const char *name) {
TestRadosClient *impl = reinterpret_cast<TestRadosClient*>(client);
return impl->pool_create(name);
}
+// Test-stub cls_cxx_map_get_keys: clears *keys, fetches up to max_to_get omap
+// entries after start_obj with a single omap_get_vals2() call (which also
+// receives `more`), and copies the returned keys into *keys. Returns a
+// negative error from omap_get_vals2(), otherwise the number of keys stored.
int cls_cxx_map_get_keys(cls_method_context_t hctx, const string &start_obj,
- uint64_t max_to_get, std::set<string> *keys) {
+ uint64_t max_to_get, std::set<string> *keys, bool *more) {
librados::TestClassHandler::MethodContext *ctx =
reinterpret_cast<librados::TestClassHandler::MethodContext*>(hctx);
keys->clear();
std::map<string, bufferlist> vals;
- std::string last_key = start_obj;
- do {
- vals.clear();
- int r = ctx->io_ctx_impl->omap_get_vals(ctx->oid, last_key, "", 1024,
- &vals);
- if (r < 0) {
- return r;
- }
+ int r = ctx->io_ctx_impl->omap_get_vals2(ctx->oid, start_obj, "", max_to_get,
+ &vals, more);
+ if (r < 0) {
+ return r;
+ }
- for (std::map<string, bufferlist>::iterator it = vals.begin();
- it != vals.end(); ++it) {
- last_key = it->first;
- keys->insert(last_key);
- }
- } while (!vals.empty());
+ for (std::map<string, bufferlist>::iterator it = vals.begin();
+ it != vals.end(); ++it) {
+ keys->insert(it->first);
+ }
return keys->size();
}
int cls_cxx_map_get_vals(cls_method_context_t hctx, const string &start_obj,
const string &filter_prefix, uint64_t max_to_get,
- std::map<string, bufferlist> *vals) {
+ std::map<string, bufferlist> *vals, bool *more) {
librados::TestClassHandler::MethodContext *ctx =
reinterpret_cast<librados::TestClassHandler::MethodContext*>(hctx);
- int r = ctx->io_ctx_impl->omap_get_vals(ctx->oid, start_obj, filter_prefix,
- max_to_get, vals);
+ int r = ctx->io_ctx_impl->omap_get_vals2(ctx->oid, start_obj, filter_prefix,
+ max_to_get, vals, more);
if (r < 0) {
return r;
}
return TestMemRadosClient::blacklist_add(client_address, expire_seconds);
}
+ // gmock hook for service_daemon_register, paired with a passthrough that
+ // calls the TestMemRadosClient implementation.
+ MOCK_METHOD3(service_daemon_register,
+ int(const std::string&,
+ const std::string&,
+ const std::map<std::string,std::string>&));
+ int do_service_daemon_register(const std::string& service,
+ const std::string& name,
+ const std::map<std::string,std::string>& metadata) {
+ return TestMemRadosClient::service_daemon_register(service, name, metadata);
+ }
+
+ // gmock hook for service_daemon_update_status, paired with a passthrough
+ // that calls the TestMemRadosClient implementation.
+ MOCK_METHOD1(service_daemon_update_status,
+ int(const std::map<std::string,std::string>&));
+ int do_service_daemon_update_status(const std::map<std::string,std::string>& status) {
+ return TestMemRadosClient::service_daemon_update_status(status);
+ }
+
+ // Installs gmock default actions so that each mocked method invokes its
+ // do_* passthrough unless a test overrides it.
void default_to_dispatch() {
using namespace ::testing;
ON_CALL(*this, create_ioctx(_, _)).WillByDefault(Invoke(this, &MockTestMemRadosClient::do_create_ioctx));
ON_CALL(*this, blacklist_add(_, _)).WillByDefault(Invoke(this, &MockTestMemRadosClient::do_blacklist_add));
+ ON_CALL(*this, service_daemon_register(_, _, _)).WillByDefault(Invoke(this, &MockTestMemRadosClient::do_service_daemon_register));
+ ON_CALL(*this, service_daemon_update_status(_)).WillByDefault(Invoke(this, &MockTestMemRadosClient::do_service_daemon_update_status));
}
};
const std::string &filter_prefix,
uint64_t max_return,
std::map<std::string, bufferlist> *out_vals) = 0;
+ virtual int omap_get_vals2(const std::string& oid,
+ const std::string& start_after,
+ const std::string &filter_prefix,
+ uint64_t max_return,
+ std::map<std::string, bufferlist> *out_vals,
+ bool *pmore) = 0;
virtual int omap_rm_keys(const std::string& oid,
const std::set<std::string>& keys) = 0;
virtual int omap_set(const std::string& oid,
const SnapContext &snapc) = 0;
virtual int writesame(const std::string& oid, bufferlist& bl, size_t len,
uint64_t off, const SnapContext &snapc) = 0;
+ virtual int cmpext(const std::string& oid, uint64_t off, bufferlist& cmp_bl) = 0;
virtual int xattr_get(const std::string& oid,
std::map<std::string, bufferlist>* attrset) = 0;
virtual int xattr_set(const std::string& oid, const std::string &name,
#include "test/librados_test_stub/TestMemRadosClient.h"
#include "common/Clock.h"
#include "common/RWLock.h"
+#include "include/err.h"
#include <boost/algorithm/string/predicate.hpp>
#include <boost/bind.hpp>
#include <errno.h>
}
-int TestMemIoCtxImpl::omap_get_vals(const std::string& oid,
+int TestMemIoCtxImpl::omap_get_vals2(const std::string& oid,
const std::string& start_after,
const std::string &filter_prefix,
uint64_t max_return,
- std::map<std::string, bufferlist> *out_vals) {
+ std::map<std::string, bufferlist> *out_vals,
+ bool *pmore) {
if (out_vals == NULL) {
return -EINVAL;
} else if (m_client->is_blacklisted()) {
RWLock::RLocker l(file->lock);
TestMemCluster::FileOMaps::iterator o_it = m_pool->file_omaps.find(oid);
if (o_it == m_pool->file_omaps.end()) {
+ if (pmore) {
+ *pmore = false;
+ }
return 0;
}
}
++it;
}
+ if (pmore) {
+ *pmore = (it != omap.end());
+ }
return 0;
}
+// Legacy entry point without the "more" flag: forwards to omap_get_vals2()
+// with a null pmore.
+int TestMemIoCtxImpl::omap_get_vals(const std::string& oid,
+ const std::string& start_after,
+ const std::string &filter_prefix,
+ uint64_t max_return,
+ std::map<std::string, bufferlist> *out_vals) {
+ return omap_get_vals2(oid, start_after, filter_prefix, max_return, out_vals, nullptr);
+}
+
int TestMemIoCtxImpl::omap_rm_keys(const std::string& oid,
const std::set<std::string>& keys) {
if (get_snap_read() != CEPH_NOSNAP) {
return -EBLACKLISTED;
}
-
if (len == 0 || (len % bl.length())) {
return -EINVAL;
}
return 0;
}
+// Compares the object's bytes starting at `off` against `cmp_bl`.
+//
+// Returns 0 when all cmp_bl.length() bytes match, -EROFS when the read snap
+// is not CEPH_NOSNAP, -EBLACKLISTED when the client is blacklisted, -EINVAL
+// for an empty compare buffer, and (-MAX_ERRNO - N) on a mismatch, where N is
+// the index within `cmp_bl` of the first differing byte.
+int TestMemIoCtxImpl::cmpext(const std::string& oid, uint64_t off,
+                             bufferlist& cmp_bl) {
+  if (get_snap_read() != CEPH_NOSNAP) {
+    return -EROFS;
+  } else if (m_client->is_blacklisted()) {
+    return -EBLACKLISTED;
+  }
+
+  if (cmp_bl.length() == 0) {
+    return -EINVAL;
+  }
+
+  TestMemCluster::SharedFile file;
+  {
+    RWLock::WLocker l(m_pool->file_lock);
+    file = get_file(oid, true, get_snap_context());
+  }
+
+  // ensure_minimum_length() is handed a pointer to file->data and may extend
+  // it, so hold the file lock exclusively rather than shared.
+  RWLock::WLocker l(file->lock);
+  size_t len = cmp_bl.length();
+  ensure_minimum_length(off + len, &file->data);
+
+  // Walk the compare buffer from its first byte and the object from `off`;
+  // indexing both with the same absolute position would compare the wrong
+  // bytes (or none at all) whenever off != 0.
+  for (size_t p = 0; p < len; ++p) {
+    if (file->data[off + p] != cmp_bl[p]) {
+      return -MAX_ERRNO - static_cast<int>(p);
+    }
+  }
+  return 0;
+}
+
int TestMemIoCtxImpl::xattr_get(const std::string& oid,
std::map<std::string, bufferlist>* attrset) {
if (m_client->is_blacklisted()) {
const std::string &filter_prefix,
uint64_t max_return,
std::map<std::string, bufferlist> *out_vals) override;
+ int omap_get_vals2(const std::string& oid,
+ const std::string& start_after,
+ const std::string &filter_prefix,
+ uint64_t max_return,
+ std::map<std::string, bufferlist> *out_vals,
+ bool *pmore) override;
int omap_rm_keys(const std::string& oid,
const std::set<std::string>& keys) override;
int omap_set(const std::string& oid, const std::map<std::string,
const SnapContext &snapc) override;
int writesame(const std::string& oid, bufferlist& bl, size_t len,
uint64_t off, const SnapContext &snapc) override;
+ int cmpext(const std::string& oid, uint64_t off, bufferlist& cmp_bl) override;
int xattr_get(const std::string& oid,
std::map<std::string, bufferlist>* attrset) override;
int xattr_set(const std::string& oid, const std::string &name,
void object_list(int64_t pool_id,
std::list<librados::TestRadosClient::Object> *list) override;
+ // In-memory stub: ignores its arguments and reports success.
+ int service_daemon_register(const std::string& service,
+ const std::string& name,
+ const std::map<std::string,std::string>& metadata) override {
+ return 0;
+ }
+ // In-memory stub: ignores the status map and reports success.
+ int service_daemon_update_status(const std::map<std::string,std::string>& status) override {
+ return 0;
+ }
+
int pool_create(const std::string &pool_name) override;
int pool_delete(const std::string &pool_name) override;
int pool_get_base_tier(int64_t pool_id, int64_t* base_tier) override;
virtual void object_list(int64_t pool_id,
std::list<librados::TestRadosClient::Object> *list) = 0;
+ virtual int service_daemon_register(const std::string& service,
+ const std::string& name,
+ const std::map<std::string,std::string>& metadata) = 0;
+ virtual int service_daemon_update_status(const std::map<std::string,std::string>& status) = 0;
+
virtual int pool_create(const std::string &pool_name) = 0;
virtual int pool_delete(const std::string &pool_name) = 0;
virtual int pool_get_base_tier(int64_t pool_id, int64_t* base_tier) = 0;
install(TARGETS ceph_test_rados_striper_api_striping
DESTINATION ${CMAKE_INSTALL_BINDIR})
-add_ceph_test(rados-striper.sh ${CMAKE_SOURCE_DIR}/src/test/libradosstriper/rados-striper.sh)
-
add_executable(ceph_test_rados_striper_api_io
io.cc)
target_link_libraries(ceph_test_rados_striper_api_io librados radosstriper
+++ /dev/null
-#!/bin/bash
-#
-# Copyright (C) 2014 Red Hat <contact@redhat.com>
-#
-# Author: Sebastien Ponce <sebastien.ponce@cern.ch>
-#
-# This program is free software; you can redistribute it and/or modify
-# it under the terms of the GNU Library Public License as published by
-# the Free Software Foundation; either version 2, or (at your option)
-# any later version.
-#
-# This program is distributed in the hope that it will be useful,
-# but WITHOUT ANY WARRANTY; without even the implied warranty of
-# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
-# GNU Library Public License for more details.
-#
-source $(dirname $0)/../detect-build-env-vars.sh
-source $CEPH_ROOT/qa/workunits/ceph-helpers.sh
-
-function run() {
- local dir=$1
- shift
-
- export CEPH_MON="127.0.0.1:7116" # git grep '\<7116\>' : there must be only one
- export CEPH_ARGS
- CEPH_ARGS+="--fsid=$(uuidgen) --auth-supported=none "
- CEPH_ARGS+="--mon-host=$CEPH_MON "
-
- # setup
- setup $dir || return 1
-
- # create a cluster with one monitor and three osds
- run_mon $dir a || return 1
- run_osd $dir 0 || return 1
- run_osd $dir 1 || return 1
- run_osd $dir 2 || return 1
-
- # create toyfile
- dd if=/dev/urandom of=$dir/toyfile bs=1234 count=1
-
- # put a striped object
- rados --pool rbd --striper put toyfile $dir/toyfile || return 1
-
- # stat it, with and without striping
- rados --pool rbd --striper stat toyfile | cut -d ',' -f 2 > $dir/stripedStat || return 1
- rados --pool rbd stat toyfile.0000000000000000 | cut -d ',' -f 2 > $dir/stat || return 1
- echo ' size 1234' > $dir/refstat
- diff -w $dir/stripedStat $dir/refstat || return 1
- diff -w $dir/stat $dir/refstat || return 1
- rados --pool rbd stat toyfile >& $dir/staterror
- grep -q 'No such file or directory' $dir/staterror || return 1
-
- # get the file back with and without striping
- rados --pool rbd --striper get toyfile $dir/stripedGroup || return 1
- diff -w $dir/toyfile $dir/stripedGroup || return 1
- rados --pool rbd get toyfile.0000000000000000 $dir/nonSTripedGroup || return 1
- diff -w $dir/toyfile $dir/nonSTripedGroup || return 1
-
- # test truncate
- rados --pool rbd --striper truncate toyfile 12
- rados --pool rbd --striper stat toyfile | cut -d ',' -f 2 > $dir/stripedStat || return 1
- rados --pool rbd stat toyfile.0000000000000000 | cut -d ',' -f 2 > $dir/stat || return 1
- echo ' size 12' > $dir/reftrunc
- diff -w $dir/stripedStat $dir/reftrunc || return 1
- diff -w $dir/stat $dir/reftrunc || return 1
-
- # test xattrs
-
- rados --pool rbd --striper setxattr toyfile somexattr somevalue || return 1
- rados --pool rbd --striper getxattr toyfile somexattr > $dir/xattrvalue || return 1
- rados --pool rbd getxattr toyfile.0000000000000000 somexattr > $dir/xattrvalue2 || return 1
- echo 'somevalue' > $dir/refvalue
- diff -w $dir/xattrvalue $dir/refvalue || return 1
- diff -w $dir/xattrvalue2 $dir/refvalue || return 1
- rados --pool rbd --striper listxattr toyfile > $dir/xattrlist || return 1
- echo 'somexattr' > $dir/reflist
- diff -w $dir/xattrlist $dir/reflist || return 1
- rados --pool rbd listxattr toyfile.0000000000000000 | grep -v striper > $dir/xattrlist2 || return 1
- diff -w $dir/xattrlist2 $dir/reflist || return 1
- rados --pool rbd --striper rmxattr toyfile somexattr || return 1
-
- local attr_not_found_str="No data available"
- [ `uname` = FreeBSD ] && \
- attr_not_found_str="Attribute not found"
- expect_failure $dir "$attr_not_found_str" \
- rados --pool rbd --striper getxattr toyfile somexattr || return 1
- expect_failure $dir "$attr_not_found_str" \
- rados --pool rbd getxattr toyfile.0000000000000000 somexattr || return 1
-
- # test rm
- rados --pool rbd --striper rm toyfile || return 1
- expect_failure $dir 'No such file or directory' \
- rados --pool rbd --striper stat toyfile || return 1
- expect_failure $dir 'No such file or directory' \
- rados --pool rbd stat toyfile.0000000000000000 || return 1
-
- # cleanup
- teardown $dir || return 1
-}
-
-main rados-striper "$@"
* FALLOCATE: - 5
* PUNCH HOLE: - 6
* WRITESAME: - 7
+ * COMPAREANDWRITE: - 8
*
* When mapped read/writes are disabled, they are simply converted to normal
* reads and writes. When fallocate/fpunch calls are disabled, they are
#define OP_FALLOCATE 5
#define OP_PUNCH_HOLE 6
#define OP_WRITESAME 7
+#define OP_COMPARE_AND_WRITE 8
/* rbd-specific operations */
-#define OP_CLONE 8
-#define OP_FLATTEN 9
-#define OP_MAX_FULL 10
+#define OP_CLONE 9
+#define OP_FLATTEN 10
+#define OP_MAX_FULL 11
/* operation modifiers */
#define OP_CLOSEOPEN 100
int (*flatten)(struct rbd_ctx *ctx);
ssize_t (*writesame)(struct rbd_ctx *ctx, uint64_t off, size_t len,
const char *buf, size_t data_len);
+ ssize_t (*compare_and_write)(struct rbd_ctx *ctx, uint64_t off, size_t len,
+ const char *cmp_buf, const char *buf);
};
char *pool; /* name of the pool our test image is in */
return n;
}
+/*
+ * librbd backend for compare_and_write: calls rbd_compare_and_write() on the
+ * context's image and, on success, verifies the object map.
+ *
+ * Returns the value from rbd_compare_and_write() on success, -EINVAL
+ * silently (the caller handles it), any other negative error after logging
+ * it, or the negative result of librbd_verify_object_map().
+ */
+ssize_t
+librbd_compare_and_write(struct rbd_ctx *ctx, uint64_t off, size_t len,
+			 const char *cmp_buf, const char *buf)
+{
+	ssize_t n;
+	int ret;
+	uint64_t mismatch_off = 0;
+
+	n = rbd_compare_and_write(ctx->image, off, len, cmp_buf, buf, &mismatch_off, 0);
+	if (n == -EINVAL) {
+		return n;
+	} else if (n < 0) {
+		/* uint64_t is not necessarily unsigned long long; cast for %llu */
+		prt("rbd_compare_and_write mismatch(%llu, %zu, %llu) failed\n",
+		    (unsigned long long)off, len,
+		    (unsigned long long)mismatch_off);
+		return n;
+	}
+
+	ret = librbd_verify_object_map(ctx);
+	if (ret < 0) {
+		return ret;
+	}
+	return n;
+}
+
int
librbd_get_size(struct rbd_ctx *ctx, uint64_t *size)
{
librbd_clone,
librbd_flatten,
librbd_writesame,
+ librbd_compare_and_write,
};
int
badoff < lp->args[0] + lp->args[1])
prt("\t***WSWSWSWS");
break;
+ case OP_COMPARE_AND_WRITE:
+ prt("COMPARE_AND_WRITE 0x%x thru 0x%x\t(0x%x bytes)",
+ lp->args[0], lp->args[0] + lp->args[1] - 1,
+ lp->args[1]);
+ if (lp->args[0] > lp->args[2])
+ prt(" HOLE");
+ else if (lp->args[0] + lp->args[1] > lp->args[2])
+ prt(" EXTEND");
+ if ((badoff >= lp->args[0] || badoff >=lp->args[2]) &&
+ badoff < lp->args[0] + lp->args[1])
+ prt("\t***WWWW");
+ break;
case OP_CLONE:
prt("CLONE");
break;
simple_err("Error creating ioctx", r);
goto failed_krbd;
}
+ rados_application_enable(ioctx, "rbd", 1);
+
if (clone_calls || journal_replay) {
uint64_t features = 0;
if (clone_calls) {
doflush(offset, size);
}
+/*
+ * Issue one compare-and-write of `size` bytes at `offset`.
+ *
+ * The offset is aligned down to writebdy (and the size too under O_DIRECT).
+ * The bytes currently expected on disk are saved into temp_buf as the compare
+ * data, fresh data is generated into good_buf, and the backend is asked to
+ * swap them. Zero-sized requests and requests reaching past file_size are
+ * logged as skipped. If the backend returns -EINVAL the expected contents in
+ * good_buf are restored from temp_buf; any other short or failed result is
+ * reported as a failure.
+ */
+void
+docompareandwrite(unsigned offset, unsigned size)
+{
+	int ret;
+
+	offset -= offset % writebdy;
+	if (o_direct)
+		size -= size % writebdy;
+
+	if (size == 0) {
+		if (!quiet && testcalls > simulatedopcount && !o_direct)
+			prt("skipping zero size compare_and_write\n");
+		/* record the op that was actually skipped, not OP_READ */
+		log4(OP_SKIPPED, OP_COMPARE_AND_WRITE, offset, size);
+		return;
+	}
+
+	if (size + offset > file_size) {
+		if (!quiet && testcalls > simulatedopcount)
+			prt("skipping seek/compare past end of file\n");
+		log4(OP_SKIPPED, OP_COMPARE_AND_WRITE, offset, size);
+		return;
+	}
+
+	/* temp_buf keeps the old contents (compare data); good_buf gets the new */
+	memcpy(temp_buf + offset, good_buf + offset, size);
+	gendata(original_buf, good_buf, offset, size);
+	log4(OP_COMPARE_AND_WRITE, offset, size, 0);
+
+	if (testcalls <= simulatedopcount)
+		return;
+
+	if (!quiet &&
+	    ((progressinterval && testcalls % progressinterval == 0) ||
+	     (debug &&
+	      (monitorstart == -1 ||
+	       (static_cast<long>(offset + size) > monitorstart &&
+		(monitorend == -1 ||
+		 static_cast<long>(offset) <= monitorend))))))
+		prt("%lu compareandwrite\t0x%x thru\t0x%x\t(0x%x bytes)\n", testcalls,
+		    offset, offset + size - 1, size);
+
+	ret = ops->compare_and_write(&ctx, offset, size, temp_buf + offset,
+				     good_buf + offset);
+	if (ret != (ssize_t)size) {
+		if (ret == -EINVAL) {
+			/* nothing was written: roll the expected data back */
+			memcpy(good_buf + offset, temp_buf + offset, size);
+			return;
+		}
+		if (ret < 0)
+			prterrcode("docompareandwrite: ops->compare_and_write", ret);
+		else
+			prt("short write: 0x%x bytes instead of 0x%x\n", ret, size);
+		report_failure(151);
+		return;
+	}
+
+	if (flush_enabled)
+		doflush(offset, size);
+}
+
void clone_filename(char *buf, size_t len, int clones)
{
snprintf(buf, len, "%s/fsx-%s-parent%d",
log4(OP_SKIPPED, OP_WRITESAME, offset, size);
goto out;
}
+		/* keep the preceding case from falling into the check below */
+		break;
+	case OP_COMPARE_AND_WRITE:
+		/* compare_and_write not implemented */
+		if (!ops->compare_and_write) {
+			log4(OP_SKIPPED, OP_COMPARE_AND_WRITE, offset, size);
+			goto out;
+		}
+		break;
}
switch (op) {
TRIM_OFF_LEN(offset, size, maxfilelen);
dowritesame(offset, size);
break;
+ case OP_COMPARE_AND_WRITE:
+ TRIM_OFF_LEN(offset, size, file_size);
+ docompareandwrite(offset, size);
+ break;
case OP_CLONE:
do_clone();
return s_instance;
}
+ // Mock factory for compare-and-write object requests: requires that a mock
+ // instance already exists, records `completion` as its on_finish and
+ // returns that instance. All other arguments are ignored.
+ static ObjectRequest* create_compare_and_write(librbd::MockTestImageCtx *ictx,
+ const std::string &oid,
+ uint64_t object_no,
+ uint64_t object_off,
+ const ceph::bufferlist &cmp_data,
+ const ceph::bufferlist &write_data,
+ const ::SnapContext &snapc,
+ uint64_t *mismatch_offset,
+ int op_flags,
+ const ZTracer::Trace &parent_trace,
+ Context *completion) {
+ assert(s_instance != nullptr);
+ s_instance->on_finish = completion;
+ return s_instance;
+ }
+
ObjectRequest() {
assert(s_instance == nullptr);
s_instance = this;
typedef ImageDiscardRequest<librbd::MockTestImageCtx> MockImageDiscardRequest;
typedef ImageFlushRequest<librbd::MockTestImageCtx> MockImageFlushRequest;
typedef ImageWriteSameRequest<librbd::MockTestImageCtx> MockImageWriteSameRequest;
+ typedef ImageCompareAndWriteRequest<librbd::MockTestImageCtx> MockImageCompareAndWriteRequest;
typedef ObjectRequest<librbd::MockTestImageCtx> MockObjectRequest;
typedef ObjectReadRequest<librbd::MockTestImageCtx> MockObjectReadRequest;
ASSERT_EQ(0, aio_comp_ctx.wait());
}
+// With a journal attached but reporting is_journal_appending == false, a
+// compare-and-write image request is expected to go straight to the object
+// request and complete with 0.
+TEST_F(TestMockIoImageRequest, AioCompareAndWriteJournalAppendDisabled) {
+ REQUIRE_FEATURE(RBD_FEATURE_JOURNALING);
+
+ librbd::ImageCtx *ictx;
+ ASSERT_EQ(0, open_image(m_image_name, &ictx));
+
+ MockObjectRequest mock_aio_object_request;
+ MockTestImageCtx mock_image_ctx(*ictx);
+ MockJournal mock_journal;
+ mock_image_ctx.journal = &mock_journal;
+
+ // Expectations below must be satisfied in this order.
+ InSequence seq;
+ expect_is_journal_appending(mock_journal, false);
+ expect_object_request_send(mock_image_ctx, mock_aio_object_request, 0);
+
+ C_SaferCond aio_comp_ctx;
+ AioCompletion *aio_comp = AioCompletion::create_and_start(
+ &aio_comp_ctx, ictx, AIO_TYPE_COMPARE_AND_WRITE);
+
+ // One-byte compare and write buffers covering image extent {0, 1}.
+ bufferlist cmp_bl;
+ cmp_bl.append("1");
+ bufferlist write_bl;
+ write_bl.append("1");
+ uint64_t mismatch_offset;
+ MockImageCompareAndWriteRequest mock_aio_image_write(mock_image_ctx, aio_comp,
+ {{0, 1}}, std::move(cmp_bl),
+ std::move(write_bl),
+ &mismatch_offset,
+ 0, {});
+ {
+ // send() is issued while holding owner_lock for read.
+ RWLock::RLocker owner_locker(mock_image_ctx.owner_lock);
+ mock_aio_image_write.send();
+ }
+ ASSERT_EQ(0, aio_comp_ctx.wait());
+}
+
} // namespace io
} // namespace librbd
s_instance->aio_writesame(c, off, len, bl, op_flags);
}
+ // gmock hook plus the static entry point that forwards to it on the current
+ // mock instance. The image context and trace arguments are dropped.
+ MOCK_METHOD6(aio_compare_and_write, void(AioCompletion *c, const Extents &image_extents,
+ const bufferlist &cmp_bl, const bufferlist &bl,
+ uint64_t *mismatch_offset, int op_flags));
+ static void aio_compare_and_write(MockReplayImageCtx *ictx, AioCompletion *c,
+ Extents &&image_extents, bufferlist &&cmp_bl,
+ bufferlist &&bl, uint64_t *mismatch_offset,
+ int op_flags, const ZTracer::Trace &parent_trace) {
+ assert(s_instance != nullptr);
+ s_instance->aio_compare_and_write(c, image_extents, cmp_bl, bl,
+ mismatch_offset, op_flags);
+ }
+
ImageRequest() {
s_instance = this;
}
.WillOnce(SaveArg<0>(aio_comp));
}
+ // Expects one aio_compare_and_write call for extent {off, len} whose compare
+ // and write buffers equal cmp_data / data and whose mismatch pointer equals
+ // mismatch_offset; the completion passed to the call is saved in *aio_comp.
+ void expect_aio_compare_and_write(MockIoImageRequest &mock_io_image_request,
+ io::AioCompletion **aio_comp, uint64_t off,
+ uint64_t len, const char *cmp_data, const char *data,
+ uint64_t *mismatch_offset) {
+ EXPECT_CALL(mock_io_image_request,
+ aio_compare_and_write(_, io::Extents{{off, len}},
+ BufferlistEqual(cmp_data), BufferlistEqual(data), mismatch_offset, _))
+ .WillOnce(SaveArg<0>(aio_comp));
+ }
+
void expect_flatten(MockReplayImageCtx &mock_image_ctx, Context **on_finish) {
EXPECT_CALL(*mock_image_ctx.operations, execute_flatten(_, _))
.WillOnce(DoAll(SaveArg<1>(on_finish),
ASSERT_EQ(0, on_safe.wait());
}
+
+// Replays three journal events in turn, each through its own replay object:
+// a plain write, a compare-and-write whose compare data equals the written
+// data, and a compare-and-write whose compare data differs. Each phase
+// expects the matching image request, completes it with 0, then shuts the
+// replay down after an expected flush.
+TEST_F(TestMockJournalReplay, AioCompareAndWrite) {
+ REQUIRE_FEATURE(RBD_FEATURE_JOURNALING);
+
+ librbd::ImageCtx *ictx;
+ ASSERT_EQ(0, open_image(m_image_name, &ictx));
+
+ MockReplayImageCtx mock_image_ctx(*ictx);
+
+ MockExclusiveLock mock_exclusive_lock;
+ mock_image_ctx.exclusive_lock = &mock_exclusive_lock;
+ expect_accept_ops(mock_exclusive_lock, true);
+
+ MockJournalReplay mock_write_journal_replay(mock_image_ctx);
+ MockJournalReplay mock_compare_and_write_journal_replay(mock_image_ctx);
+ MockJournalReplay mock_mis_compare_and_write_journal_replay(mock_image_ctx);
+ MockIoImageRequest mock_io_image_request;
+ expect_op_work_queue(mock_image_ctx);
+
+ InSequence seq;
+ io::AioCompletion *aio_comp;
+ // NOTE(review): on_ready / on_safe are shared by all three phases below;
+ // presumably a C_SaferCond stays signalled once completed, in which case
+ // the later wait() calls would not actually block - confirm.
+ C_SaferCond on_ready;
+ C_SaferCond on_safe;
+ // Phase 1: plain write of "test" at offset 512.
+ expect_aio_write(mock_io_image_request, &aio_comp, 512, 512, "test");
+ when_process(mock_write_journal_replay,
+ EventEntry{AioWriteEvent(512, 512, to_bl("test"))},
+ &on_ready, &on_safe);
+
+ when_complete(mock_image_ctx, aio_comp, 0);
+ ASSERT_EQ(0, on_ready.wait());
+
+ expect_aio_flush(mock_image_ctx, mock_io_image_request, 0);
+ ASSERT_EQ(0, when_shut_down(mock_write_journal_replay, false));
+ ASSERT_EQ(0, on_safe.wait());
+
+ // Phase 2: compare-and-write with compare data "test" and write data
+ // "test"; a null mismatch-offset pointer is expected.
+ expect_aio_compare_and_write(mock_io_image_request, &aio_comp,
+ 512, 512, "test", "test", nullptr);
+ when_process(mock_compare_and_write_journal_replay,
+ EventEntry{AioCompareAndWriteEvent(512, 512, to_bl("test"),
+ to_bl("test"))}, &on_ready, &on_safe);
+
+ when_complete(mock_image_ctx, aio_comp, 0);
+ ASSERT_EQ(0, on_ready.wait());
+
+ expect_aio_flush(mock_image_ctx, mock_io_image_request, 0);
+ ASSERT_EQ(0, when_shut_down(mock_compare_and_write_journal_replay, false));
+ ASSERT_EQ(0, on_safe.wait());
+
+ // Phase 3: compare-and-write with compare data "111" and write data "test".
+ // The mocked request is still completed with 0.
+ expect_aio_compare_and_write(mock_io_image_request, &aio_comp,
+ 512, 512, "111", "test", nullptr);
+ when_process(mock_mis_compare_and_write_journal_replay,
+ EventEntry{AioCompareAndWriteEvent(512, 512, to_bl("111"),
+ to_bl("test"))}, &on_ready, &on_safe);
+
+ when_complete(mock_image_ctx, aio_comp, 0);
+ ASSERT_EQ(0, on_ready.wait());
+
+ expect_aio_flush(mock_image_ctx, mock_io_image_request, 0);
+ ASSERT_EQ(0, when_shut_down(mock_mis_compare_and_write_journal_replay, false));
+ ASSERT_EQ(0, on_safe.wait());
+
+}
+
TEST_F(TestMockJournalReplay, IOError) {
REQUIRE_FEATURE(RBD_FEATURE_JOURNALING);
int fadvise_flags, Context *on_finish) {
aio_writesame_mock(off, len, bl, fadvise_flags, on_finish);
}
+
+ // gmock cannot take the rvalue-reference parameters directly, so the real
+ // signature forwards its arguments to a mock that takes const references.
+ MOCK_METHOD6(aio_compare_and_write_mock, void(const Extents &,
+ const ceph::bufferlist &,
+ const ceph::bufferlist &,
+ uint64_t *, int, Context *));
+
+ void aio_compare_and_write(Extents&& image_extents, ceph::bufferlist&& cmp_bl,
+ ceph::bufferlist&& bl, uint64_t *mismatch_offset,
+ int fadvise_flags, Context *on_finish) {
+ aio_compare_and_write_mock(image_extents, cmp_bl, bl, mismatch_offset,
+ fadvise_flags, on_finish);
+ }
};
} // namespace cache
#include "include/rbd/librbd.h"
#include "include/rbd/librbd.hpp"
#include "include/event_type.h"
+#include "include/err.h"
#include "gtest/gtest.h"
*passed = true;
}
+// Test helper: issues one asynchronous compare-and-write through the C API
+// and blocks until it completes.
+//
+//   image     - open image to operate on
+//   cmp_data  - buffer handed to rbd_aio_compare_and_write as the compare data
+//   test_data - buffer handed to rbd_aio_compare_and_write as the write data
+//   off, len  - offset and length passed to the call
+//   iohint    - op flags passed through unchanged
+//   passed    - set to true only when the completion's return value is 0
+//
+// The mismatch offset reported by the call is kept in a local and not
+// inspected here.
+void aio_compare_and_write_test_data(rbd_image_t image, const char *cmp_data,
+                                     const char *test_data, uint64_t off,
+                                     size_t len, uint32_t iohint, bool *passed)
+{
+  rbd_completion_t comp;
+  rbd_aio_create_completion(NULL, (rbd_callback_t) simple_write_cb, &comp);
+  printf("created completion\n");
+
+  uint64_t mismatch_offset;
+  rbd_aio_compare_and_write(image, off, len, cmp_data, test_data, comp, &mismatch_offset, iohint);
+  printf("started aio compare and write\n");
+  rbd_aio_wait_for_complete(comp);
+  int r = rbd_aio_get_return_value(comp);
+  printf("return value is: %d\n", r);
+  // Release before asserting: a failing ASSERT_EQ returns from this function
+  // immediately, which previously skipped the release and leaked the
+  // completion.
+  rbd_aio_release(comp);
+  ASSERT_EQ(0, r);
+  printf("finished aio compare and write\n");
+  *passed = true;
+}
+
+// Test helper: performs one synchronous compare-and-write through the C API.
+//
+// Passes off/len, both buffers, mismatch_off and iohint straight to
+// rbd_compare_and_write, prints the result, and requires the returned byte
+// count to equal len. *passed is set to true only when that check holds.
+void compare_and_write_test_data(rbd_image_t image, const char *cmp_data,
+                                 const char *test_data, uint64_t off, size_t len,
+                                 uint64_t *mismatch_off, uint32_t iohint, bool *passed)
+{
+  printf("start compare and write\n");
+  const ssize_t nbytes = rbd_compare_and_write(image, off, len, cmp_data,
+                                               test_data, mismatch_off, iohint);
+  printf("compare and wrote: %d\n", static_cast<int>(nbytes));
+  // Compare as unsigned, exactly as the length parameter is typed.
+  ASSERT_EQ(len, static_cast<size_t>(nbytes));
+  *passed = true;
+}
+
+
TEST_F(TestLibRBD, TestIO)
{
rados_ioctx_t ioctx;
char test_data[TEST_IO_SIZE + 1];
char zero_data[TEST_IO_SIZE + 1];
+ char mismatch_data[TEST_IO_SIZE + 1];
int i;
+ uint64_t mismatch_offset;
for (i = 0; i < TEST_IO_SIZE; ++i) {
test_data[i] = (char) (rand() % (126 - 33) + 33);
}
test_data[TEST_IO_SIZE] = '\0';
memset(zero_data, 0, sizeof(zero_data));
+ memset(mismatch_data, 9, sizeof(mismatch_data));
for (i = 0; i < 5; ++i)
ASSERT_PASSED(write_test_data, image, test_data, TEST_IO_SIZE * i, TEST_IO_SIZE, 0);
for (i = 5; i < 10; ++i)
ASSERT_PASSED(aio_write_test_data, image, test_data, TEST_IO_SIZE * i, TEST_IO_SIZE, 0);
+ for (i = 0; i < 5; ++i)
+ ASSERT_PASSED(compare_and_write_test_data, image, test_data, test_data, TEST_IO_SIZE * i, TEST_IO_SIZE, &mismatch_offset, 0);
+
+ for (i = 5; i < 10; ++i)
+ ASSERT_PASSED(aio_compare_and_write_test_data, image, test_data, test_data, TEST_IO_SIZE * i, TEST_IO_SIZE, 0);
+
for (i = 0; i < 5; ++i)
ASSERT_PASSED(read_test_data, image, test_data, TEST_IO_SIZE * i, TEST_IO_SIZE, 0);
ASSERT_EQ(-EINVAL, rbd_aio_get_return_value(comp));
rbd_aio_release(comp);
+ ASSERT_PASSED(write_test_data, image, zero_data, 0, TEST_IO_SIZE, LIBRADOS_OP_FLAG_FADVISE_NOCACHE);
+ ASSERT_EQ(-EILSEQ, rbd_compare_and_write(image, 0, TEST_IO_SIZE, mismatch_data, mismatch_data, &mismatch_offset, 0));
+ ASSERT_EQ(0U, mismatch_offset);
+ rbd_aio_create_completion(NULL, (rbd_callback_t) simple_read_cb, &comp);
+ ASSERT_EQ(0, rbd_aio_compare_and_write(image, 0, TEST_IO_SIZE, mismatch_data, mismatch_data, comp, &mismatch_offset, 0));
+ ASSERT_EQ(0, rbd_aio_wait_for_complete(comp));
+ ASSERT_EQ(0U, mismatch_offset);
+ rbd_aio_release(comp);
+
ASSERT_PASSED(validate_object_map, image);
ASSERT_EQ(0, rbd_close(image));
char test_data[TEST_IO_SIZE + 1];
char zero_data[TEST_IO_SIZE + 1];
+ char mismatch_data[TEST_IO_SIZE + 1];
int i;
+ uint64_t mismatch_offset;
for (i = 0; i < TEST_IO_SIZE; ++i) {
test_data[i] = (char) (rand() % (126 - 33) + 33);
}
test_data[TEST_IO_SIZE] = '\0';
memset(zero_data, 0, sizeof(zero_data));
+ memset(mismatch_data, 9, sizeof(mismatch_data));
for (i = 0; i < 5; ++i)
ASSERT_PASSED(write_test_data, image, test_data, TEST_IO_SIZE * i,
ASSERT_PASSED(aio_write_test_data, image, test_data, TEST_IO_SIZE * i,
TEST_IO_SIZE, LIBRADOS_OP_FLAG_FADVISE_DONTNEED);
+ for (i = 0; i < 5; ++i)
+ ASSERT_PASSED(compare_and_write_test_data, image, test_data, test_data,
+ TEST_IO_SIZE * i, TEST_IO_SIZE, &mismatch_offset, LIBRADOS_OP_FLAG_FADVISE_DONTNEED);
+
+ for (i = 5; i < 10; ++i)
+ ASSERT_PASSED(aio_compare_and_write_test_data, image, test_data, test_data,
+ TEST_IO_SIZE * i, TEST_IO_SIZE, LIBRADOS_OP_FLAG_FADVISE_DONTNEED);
+
for (i = 0; i < 5; ++i)
ASSERT_PASSED(read_test_data, image, test_data, TEST_IO_SIZE * i, TEST_IO_SIZE,
LIBRADOS_OP_FLAG_FADVISE_SEQUENTIAL);
ASSERT_EQ(-EINVAL, rbd_aio_get_return_value(comp));
rbd_aio_release(comp);
+ ASSERT_PASSED(write_test_data, image, zero_data, 0, TEST_IO_SIZE, LIBRADOS_OP_FLAG_FADVISE_NOCACHE);
+ ASSERT_EQ(-EILSEQ, rbd_compare_and_write(image, 0, TEST_IO_SIZE, mismatch_data, mismatch_data,
+ &mismatch_offset, LIBRADOS_OP_FLAG_FADVISE_DONTNEED));
+ ASSERT_EQ(0U, mismatch_offset);
+ rbd_aio_create_completion(NULL, (rbd_callback_t) simple_read_cb, &comp);
+ ASSERT_EQ(0, rbd_aio_compare_and_write(image, 0, TEST_IO_SIZE, mismatch_data, mismatch_data,
+ comp, &mismatch_offset, LIBRADOS_OP_FLAG_FADVISE_DONTNEED));
+ ASSERT_EQ(0, rbd_aio_wait_for_complete(comp));
+ ASSERT_EQ(0U, mismatch_offset);
+ rbd_aio_release(comp);
+
ASSERT_PASSED(validate_object_map, image);
ASSERT_EQ(0, rbd_close(image));
*passed = true;
}
+// Test helper: issues one asynchronous compare-and-write through the C++ API
+// and blocks until it completes.
+//
+//   image     - image to operate on
+//   cmp_data  - compare data; copied into a bufferlist using strlen(), so it
+//               must be NUL-terminated
+//   test_data - write data; copied the same way, so it must be NUL-terminated
+//   off, len  - offset and length passed to the call
+//   iohint    - op flags passed through unchanged
+//   passed    - set to true only when the completion's return value is 0
+//
+// The mismatch offset reported by the call is kept in a local and not
+// inspected here.
+void aio_compare_and_write_test_data(librbd::Image& image, const char *cmp_data,
+                                     const char *test_data, off_t off, ssize_t len,
+                                     uint32_t iohint, bool *passed)
+{
+  ceph::bufferlist cmp_bl;
+  cmp_bl.append(cmp_data, strlen(cmp_data));
+  ceph::bufferlist test_bl;
+  test_bl.append(test_data, strlen(test_data));
+  librbd::RBD::AioCompletion *comp = new librbd::RBD::AioCompletion(NULL, (librbd::callback_t) simple_write_cb_pp);
+  printf("created completion\n");
+
+  uint64_t mismatch_offset;
+  image.aio_compare_and_write(off, len, cmp_bl, test_bl, comp, &mismatch_offset, iohint);
+  printf("started aio compare and write\n");
+  comp->wait_for_complete();
+  int r = comp->get_return_value();
+  printf("return value is: %d\n", r);
+  // Release before asserting: a failing ASSERT_EQ returns from this function
+  // immediately, which previously skipped the release and leaked the
+  // completion.
+  comp->release();
+  ASSERT_EQ(0, r);
+  printf("finished aio compare and write\n");
+  *passed = true;
+}
+
+// Test helper: performs one synchronous compare-and-write through the C++ API.
+//
+//   image        - image to operate on
+//   cmp_data     - compare data; copied into a bufferlist using strlen(), so
+//                  it must be NUL-terminated
+//   test_data    - write data; copied the same way, so it must be
+//                  NUL-terminated
+//   off, len     - offset and length passed to the call
+//   mismatch_off - passed through to Image::compare_and_write
+//   iohint       - op flags passed through unchanged
+//   passed       - set to true only when the call's result equals len
+void compare_and_write_test_data(librbd::Image& image, const char *cmp_data, const char *test_data,
+                                 off_t off, ssize_t len, uint64_t *mismatch_off, uint32_t iohint, bool *passed)
+{
+  // Keep the result signed: it is compared against the signed `len`, and
+  // storing it in an unsigned type only to cast it back obscured negative
+  // error returns.
+  ssize_t written;
+  ceph::bufferlist cmp_bl;
+  cmp_bl.append(cmp_data, strlen(cmp_data));
+  ceph::bufferlist test_bl;
+  test_bl.append(test_data, strlen(test_data));
+  printf("start compare and write\n");
+  written = image.compare_and_write(off, len, cmp_bl, test_bl, mismatch_off, iohint);
+  printf("compare and wrote: %d\n", (int) written);
+  ASSERT_EQ(len, written);
+  *passed = true;
+}
+
TEST_F(TestLibRBD, TestIOPP)
{
librados::IoCtx ioctx;
char test_data[TEST_IO_SIZE + 1];
char zero_data[TEST_IO_SIZE + 1];
int i;
+ uint64_t mismatch_offset;
for (i = 0; i < TEST_IO_SIZE; ++i) {
test_data[i] = (char) (rand() % (126 - 33) + 33);
for (i = 5; i < 10; ++i)
ASSERT_PASSED(aio_write_test_data, image, test_data, strlen(test_data) * i, 0);
+ for (i = 0; i < 5; ++i)
+ ASSERT_PASSED(compare_and_write_test_data, image, test_data, test_data, TEST_IO_SIZE * i,
+ TEST_IO_SIZE, &mismatch_offset, 0);
+
+ for (i = 5; i < 10; ++i)
+ ASSERT_PASSED(aio_compare_and_write_test_data, image, test_data, test_data, TEST_IO_SIZE * i,
+ TEST_IO_SIZE, 0);
+
for (i = 0; i < 5; ++i)
ASSERT_PASSED(read_test_data, image, test_data, strlen(test_data) * i, TEST_IO_SIZE, 0);
return 0;
}
+ librados::IoCtx ioctx;
+ r = rados.ioctx_create(pool.c_str(), ioctx);
+ if (r < 0) {
+ return r;
+ }
+ ioctx.application_enable("rbd", true);
+
return r;
}
# GNU Library Public License for more details.
#
source $(dirname $0)/../detect-build-env-vars.sh
-source $CEPH_ROOT/qa/workunits/ceph-helpers.sh
+source $CEPH_ROOT/qa/standalone/ceph-helpers.sh
function run() {
local dir=$1
timeout 360 ceph --mon-host $MON mon stat || return 1
export CEPH_ARGS="--mon_host $MON "
- ceph config-key put mgr/x/dashboard/server_port 7001
+ ceph config-key set mgr/x/dashboard/server_port 7001
MGR_ARGS+="--mgr_module_path=${CEPH_ROOT}/src/pybind/mgr "
run_mgr $dir x ${MGR_ARGS} || return 1
set_target_properties(ceph_test_mon_msg PROPERTIES COMPILE_FLAGS
${UNITTEST_CXX_FLAGS})
-#scripts
-add_ceph_test(misc.sh ${CMAKE_CURRENT_SOURCE_DIR}/misc.sh)
-add_ceph_test(mkfs.sh ${CMAKE_CURRENT_SOURCE_DIR}/mkfs.sh)
-add_ceph_test(mon-bind.sh ${CMAKE_CURRENT_SOURCE_DIR}/mon-bind.sh)
-add_ceph_test(mon-created-time.sh ${CMAKE_CURRENT_SOURCE_DIR}/mon-created-time.sh)
-add_ceph_test(mon-handle-forward.sh ${CMAKE_CURRENT_SOURCE_DIR}/mon-handle-forward.sh)
-add_ceph_test(mon-ping.sh ${CMAKE_CURRENT_SOURCE_DIR}/mon-ping.sh)
-add_ceph_test(mon-scrub.sh ${CMAKE_CURRENT_SOURCE_DIR}/mon-scrub.sh)
-add_ceph_test(osd-crush.sh ${CMAKE_CURRENT_SOURCE_DIR}/osd-crush.sh)
-add_ceph_test(osd-erasure-code-profile.sh ${CMAKE_CURRENT_SOURCE_DIR}/osd-erasure-code-profile.sh)
-add_ceph_test(osd-pool-create.sh ${CMAKE_CURRENT_SOURCE_DIR}/osd-pool-create.sh)
-add_ceph_test(test_pool_quota.sh ${CMAKE_CURRENT_SOURCE_DIR}/test_pool_quota.sh)
-
# unittest_mon_moncap
add_executable(unittest_mon_moncap
moncap.cc
+++ /dev/null
-#!/bin/bash
-#
-# Copyright (C) 2014 Cloudwatt <libre.licensing@cloudwatt.com>
-# Copyright (C) 2014, 2015 Red Hat <contact@redhat.com>
-#
-# Author: Loic Dachary <loic@dachary.org>
-#
-# This program is free software; you can redistribute it and/or modify
-# it under the terms of the GNU Library Public License as published by
-# the Free Software Foundation; either version 2, or (at your option)
-# any later version.
-#
-# This program is distributed in the hope that it will be useful,
-# but WITHOUT ANY WARRANTY; without even the implied warranty of
-# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
-# GNU Library Public License for more details.
-#
-source $(dirname $0)/../detect-build-env-vars.sh
-source $CEPH_ROOT/qa/workunits/ceph-helpers.sh
-
-function run() {
- local dir=$1
- shift
-
- export CEPH_MON="127.0.0.1:7102" # git grep '\<7102\>' : there must be only one
- export CEPH_ARGS
- CEPH_ARGS+="--fsid=$(uuidgen) --auth-supported=none "
- CEPH_ARGS+="--mon-host=$CEPH_MON "
-
- local funcs=${@:-$(set | sed -n -e 's/^\(TEST_[0-9a-z_]*\) .*/\1/p')}
- for func in $funcs ; do
- $func $dir || return 1
- done
-}
-
-TEST_POOL=rbd
-
-function TEST_osd_pool_get_set() {
- local dir=$1
-
- setup $dir || return 1
- run_mon $dir a || return 1
- ceph osd pool create $TEST_POOL 8
-
- local flag
- for flag in nodelete nopgchange nosizechange write_fadvise_dontneed noscrub nodeep-scrub; do
- ceph osd pool set $TEST_POOL $flag 0 || return 1
- ! ceph osd dump | grep 'pool ' | grep $flag || return 1
- ceph osd pool set $TEST_POOL $flag 1 || return 1
- ceph osd dump | grep 'pool ' | grep $flag || return 1
- ceph osd pool set $TEST_POOL $flag false || return 1
- ! ceph osd dump | grep 'pool ' | grep $flag || return 1
- ceph osd pool set $TEST_POOL $flag false || return 1
- # check that setting false twice does not toggle to true (bug)
- ! ceph osd dump | grep 'pool ' | grep $flag || return 1
- ceph osd pool set $TEST_POOL $flag true || return 1
- ceph osd dump | grep 'pool ' | grep $flag || return 1
- # cleanup
- ceph osd pool set $TEST_POOL $flag 0 || return 1
- done
-
- local size=$(ceph osd pool get $TEST_POOL size|awk '{print $2}')
- local min_size=$(ceph osd pool get $TEST_POOL min_size|awk '{print $2}')
-
- ceph osd pool set $TEST_POOL scrub_min_interval 123456 || return 1
- ceph osd dump | grep 'pool ' | grep 'scrub_min_interval 123456' || return 1
- ceph osd pool set $TEST_POOL scrub_min_interval 0 || return 1
- ceph osd dump | grep 'pool ' | grep 'scrub_min_interval' && return 1
- ceph osd pool set $TEST_POOL scrub_max_interval 123456 || return 1
- ceph osd dump | grep 'pool ' | grep 'scrub_max_interval 123456' || return 1
- ceph osd pool set $TEST_POOL scrub_max_interval 0 || return 1
- ceph osd dump | grep 'pool ' | grep 'scrub_max_interval' && return 1
- ceph osd pool set $TEST_POOL deep_scrub_interval 123456 || return 1
- ceph osd dump | grep 'pool ' | grep 'deep_scrub_interval 123456' || return 1
- ceph osd pool set $TEST_POOL deep_scrub_interval 0 || return 1
- ceph osd dump | grep 'pool ' | grep 'deep_scrub_interval' && return 1
-
- #replicated pool size restrict in 1 and 10
- ! ceph osd pool set $TEST_POOL 11 || return 1
- #replicated pool min_size must be between in 1 and size
- ! ceph osd pool set $TEST_POOL min_size $(expr $size + 1) || return 1
- ! ceph osd pool set $TEST_POOL min_size 0 || return 1
-
- local ecpool=erasepool
- ceph osd pool create $ecpool 12 12 erasure default || return 1
- #erasue pool size=k+m, min_size=k
- local size=$(ceph osd pool get $ecpool size|awk '{print $2}')
- local min_size=$(ceph osd pool get $ecpool min_size|awk '{print $2}')
- local k=$(expr $min_size - 1) # default min_size=k+1
- #erasure pool size can't change
- ! ceph osd pool set $ecpool size $(expr $size + 1) || return 1
- #erasure pool min_size must be between in k and size
- ceph osd pool set $ecpool min_size $(expr $k + 1) || return 1
- ! ceph osd pool set $ecpool min_size $(expr $k - 1) || return 1
- ! ceph osd pool set $ecpool min_size $(expr $size + 1) || return 1
-
- teardown $dir || return 1
-}
-
-function TEST_mon_add_to_single_mon() {
- local dir=$1
-
- fsid=$(uuidgen)
- MONA=127.0.0.1:7117 # git grep '\<7117\>' : there must be only one
- MONB=127.0.0.1:7118 # git grep '\<7118\>' : there must be only one
- CEPH_ARGS_orig=$CEPH_ARGS
- CEPH_ARGS="--fsid=$fsid --auth-supported=none "
- CEPH_ARGS+="--mon-initial-members=a "
- CEPH_ARGS+="--mon-host=$MONA "
-
- setup $dir || return 1
- run_mon $dir a --public-addr $MONA || return 1
- # wait for the quorum
- timeout 120 ceph -s > /dev/null || return 1
- run_mon $dir b --public-addr $MONB || return 1
- teardown $dir || return 1
-
- setup $dir || return 1
- run_mon $dir a --public-addr $MONA || return 1
- # without the fix of #5454, mon.a will assert failure at seeing the MMonJoin
- # from mon.b
- run_mon $dir b --public-addr $MONB || return 1
- # wait for the quorum
- timeout 120 ceph -s > /dev/null || return 1
- local num_mons
- num_mons=$(ceph mon dump --format=json 2>/dev/null | jq ".mons | length") || return 1
- [ $num_mons == 2 ] || return 1
- # no reason to take more than 120 secs to get this submitted
- timeout 120 ceph mon add b $MONB || return 1
- teardown $dir || return 1
-}
-
-function TEST_no_segfault_for_bad_keyring() {
- local dir=$1
- setup $dir || return 1
- # create a client.admin key and add it to ceph.mon.keyring
- ceph-authtool --create-keyring $dir/ceph.mon.keyring --gen-key -n mon. --cap mon 'allow *'
- ceph-authtool --create-keyring $dir/ceph.client.admin.keyring --gen-key -n client.admin --cap mon 'allow *'
- ceph-authtool $dir/ceph.mon.keyring --import-keyring $dir/ceph.client.admin.keyring
- CEPH_ARGS_TMP="--fsid=$(uuidgen) --mon-host=127.0.0.1:7102 --auth-supported=cephx "
- CEPH_ARGS_orig=$CEPH_ARGS
- CEPH_ARGS="$CEPH_ARGS_TMP --keyring=$dir/ceph.mon.keyring "
- run_mon $dir a
- # create a bad keyring and make sure no segfault occurs when using the bad keyring
- echo -e "[client.admin]\nkey = BQAUlgtWoFePIxAAQ9YLzJSVgJX5V1lh5gyctg==" > $dir/bad.keyring
- CEPH_ARGS="$CEPH_ARGS_TMP --keyring=$dir/bad.keyring"
- ceph osd dump 2> /dev/null
- # 139(11|128) means segfault and core dumped
- [ $? -eq 139 ] && return 1
- CEPH_ARGS=$CEPH_ARGS_orig
- teardown $dir || return 1
-}
-
-function TEST_mon_features() {
- local dir=$1
- setup $dir || return 1
-
- fsid=$(uuidgen)
- MONA=127.0.0.1:7127 # git grep '\<7127\>' ; there must be only one
- MONB=127.0.0.1:7128 # git grep '\<7128\>' ; there must be only one
- MONC=127.0.0.1:7129 # git grep '\<7129\>' ; there must be only one
- CEPH_ARGS_orig=$CEPH_ARGS
- CEPH_ARGS="--fsid=$fsid --auth-supported=none "
- CEPH_ARGS+="--mon-initial-members=a,b,c "
- CEPH_ARGS+="--mon-host=$MONA,$MONB,$MONC "
- CEPH_ARGS+="--mon-debug-no-initial-persistent-features "
- CEPH_ARGS+="--mon-debug-no-require-luminous "
-
- run_mon $dir a --public-addr $MONA || return 1
- run_mon $dir b --public-addr $MONB || return 1
- timeout 120 ceph -s > /dev/null || return 1
-
- # expect monmap to contain 3 monitors (a, b, and c)
- jqinput="$(ceph mon_status --format=json 2>/dev/null)"
- jq_success "$jqinput" '.monmap.mons | length == 3' || return 1
- # quorum contains two monitors
- jq_success "$jqinput" '.quorum | length == 2' || return 1
- # quorum's monitor features contain kraken and luminous
- jqfilter='.features.quorum_mon[]|select(. == "kraken")'
- jq_success "$jqinput" "$jqfilter" "kraken" || return 1
- jqfilter='.features.quorum_mon[]|select(. == "luminous")'
- jq_success "$jqinput" "$jqfilter" "luminous" || return 1
-
- # monmap must have no persistent features set, because we
- # don't currently have a quorum made out of all the monitors
- # in the monmap.
- jqfilter='.monmap.features.persistent | length == 0'
- jq_success "$jqinput" "$jqfilter" || return 1
-
- # nor do we have any optional features, for that matter.
- jqfilter='.monmap.features.optional | length == 0'
- jq_success "$jqinput" "$jqfilter" || return 1
-
- # validate 'mon feature ls'
-
- jqinput="$(ceph mon feature ls --format=json 2>/dev/null)"
- # 'kraken' and 'luminous' are supported
- jqfilter='.all.supported[] | select(. == "kraken")'
- jq_success "$jqinput" "$jqfilter" "kraken" || return 1
- jqfilter='.all.supported[] | select(. == "luminous")'
- jq_success "$jqinput" "$jqfilter" "luminous" || return 1
-
- # start third monitor
- run_mon $dir c --public-addr $MONC || return 1
-
- wait_for_quorum 300 3 || return 1
-
- timeout 300 ceph -s > /dev/null || return 1
-
- jqinput="$(ceph mon_status --format=json 2>/dev/null)"
- # expect quorum to have all three monitors
- jqfilter='.quorum | length == 3'
- jq_success "$jqinput" "$jqfilter" || return 1
- # quorum's monitor features contain kraken and luminous
- jqfilter='.features.quorum_mon[]|select(. == "kraken")'
- jq_success "$jqinput" "$jqfilter" "kraken" || return 1
- jqfilter='.features.quorum_mon[]|select(. == "luminous")'
- jq_success "$jqinput" "$jqfilter" "luminous" || return 1
-
- # monmap must have no both 'kraken' and 'luminous' persistent
- # features set.
- jqfilter='.monmap.features.persistent | length == 2'
- jq_success "$jqinput" "$jqfilter" || return 1
- jqfilter='.monmap.features.persistent[]|select(. == "kraken")'
- jq_success "$jqinput" "$jqfilter" "kraken" || return 1
- jqfilter='.monmap.features.persistent[]|select(. == "luminous")'
- jq_success "$jqinput" "$jqfilter" "luminous" || return 1
-
- CEPH_ARGS=$CEPH_ARGS_orig
- # that's all folks. thank you for tuning in.
- teardown $dir || return 1
-}
-
-main misc "$@"
-
-# Local Variables:
-# compile-command: "cd ../.. ; make -j4 && test/mon/misc.sh"
-# End:
+++ /dev/null
-#!/bin/bash
-#
-# Copyright (C) 2013 Cloudwatt <libre.licensing@cloudwatt.com>
-# Copyright (C) 2014 Red Hat <contact@redhat.com>
-#
-# Author: Loic Dachary <loic@dachary.org>
-#
-# This program is free software; you can redistribute it and/or modify
-# it under the terms of the GNU Library Public License as published by
-# the Free Software Foundation; either version 2, or (at your option)
-# any later version.
-#
-# This program is distributed in the hope that it will be useful,
-# but WITHOUT ANY WARRANTY; without even the implied warranty of
-# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
-# GNU Library Public License for more details.
-#
-set -xe
-PS4='${BASH_SOURCE[0]}:$LINENO: ${FUNCNAME[0]}: '
-
-source $(dirname $0)/../detect-build-env-vars.sh
-
-DIR=mkfs
-export CEPH_CONF=/dev/null
-unset CEPH_ARGS
-MON_ID=a
-MON_DIR=$DIR/$MON_ID
-CEPH_MON=127.0.0.1:7110 # git grep '\<7110\>' : there must be only one
-TIMEOUT=360
-
-function setup() {
- teardown
- mkdir $DIR
-}
-
-function teardown() {
- kill_daemons
- rm -fr $DIR
-}
-
-function mon_mkfs() {
- local fsid=$(uuidgen)
-
- ceph-mon \
- --id $MON_ID \
- --fsid $fsid \
- --erasure-code-dir=$CEPH_LIB \
- --mkfs \
- --mon-data=$MON_DIR \
- --mon-initial-members=$MON_ID \
- --mon-host=$CEPH_MON \
- "$@"
-}
-
-function mon_run() {
- ceph-mon \
- --id $MON_ID \
- --chdir= \
- --mon-osd-full-ratio=.99 \
- --mon-data-avail-crit=1 \
- --erasure-code-dir=$CEPH_LIB \
- --mon-data=$MON_DIR \
- --log-file=$MON_DIR/log \
- --mon-cluster-log-file=$MON_DIR/log \
- --run-dir=$MON_DIR \
- --pid-file=$MON_DIR/pidfile \
- --public-addr $CEPH_MON \
- "$@"
-}
-
-function kill_daemons() {
- for pidfile in $(find $DIR -name pidfile) ; do
- pid=$(cat $pidfile)
- for try in 0 1 1 1 2 3 ; do
- kill $pid || break
- sleep $try
- done
- done
-}
-
-function auth_none() {
- mon_mkfs --auth-supported=none
-
- ceph-mon \
- --id $MON_ID \
- --mon-osd-full-ratio=.99 \
- --mon-data-avail-crit=1 \
- --erasure-code-dir=$CEPH_LIB \
- --mon-data=$MON_DIR \
- --extract-monmap $MON_DIR/monmap
-
- [ -f $MON_DIR/monmap ] || return 1
-
- [ ! -f $MON_DIR/keyring ] || return 1
-
- mon_run --auth-supported=none
-
- timeout $TIMEOUT ceph --mon-host $CEPH_MON mon stat || return 1
-}
-
-function auth_cephx_keyring() {
- cat > $DIR/keyring <<EOF
-[mon.]
- key = AQDUS79S0AF9FRAA2cgRLFscVce0gROn/s9WMg==
- caps mon = "allow *"
-EOF
-
- mon_mkfs --keyring=$DIR/keyring
-
- [ -f $MON_DIR/keyring ] || return 1
-
- mon_run
-
- timeout $TIMEOUT ceph \
- --name mon. \
- --keyring $MON_DIR/keyring \
- --mon-host $CEPH_MON mon stat || return 1
-}
-
-function auth_cephx_key() {
- if [ -f /etc/ceph/keyring ] ; then
- echo "Please move /etc/ceph/keyring away for testing!"
- return 1
- fi
-
- local key=$(ceph-authtool --gen-print-key)
-
- if mon_mkfs --key='corrupted key' ; then
- return 1
- else
- rm -fr $MON_DIR/store.db
- rm -fr $MON_DIR/kv_backend
- fi
-
- mon_mkfs --key=$key
-
- [ -f $MON_DIR/keyring ] || return 1
- grep $key $MON_DIR/keyring
-
- mon_run
-
- timeout $TIMEOUT ceph \
- --name mon. \
- --keyring $MON_DIR/keyring \
- --mon-host $CEPH_MON mon stat || return 1
-}
-
-function makedir() {
- local toodeep=$MON_DIR/toodeep
-
- # fail if recursive directory creation is needed
- ceph-mon \
- --id $MON_ID \
- --mon-osd-full-ratio=.99 \
- --mon-data-avail-crit=1 \
- --erasure-code-dir=$CEPH_LIB \
- --mkfs \
- --mon-data=$toodeep 2>&1 | tee $DIR/makedir.log
- grep 'toodeep.*No such file' $DIR/makedir.log > /dev/null
- rm $DIR/makedir.log
-
- # an empty directory does not mean the mon exists
- mkdir $MON_DIR
- mon_mkfs --auth-supported=none 2>&1 | tee $DIR/makedir.log
- ! grep "$MON_DIR already exists" $DIR/makedir.log || return 1
-}
-
-function idempotent() {
- mon_mkfs --auth-supported=none
- mon_mkfs --auth-supported=none 2>&1 | tee $DIR/makedir.log
- grep "'$MON_DIR' already exists" $DIR/makedir.log > /dev/null || return 1
-}
-
-function run() {
- local actions
- actions+="makedir "
- actions+="idempotent "
- actions+="auth_cephx_key "
- actions+="auth_cephx_keyring "
- actions+="auth_none "
- for action in $actions ; do
- setup
- $action || return 1
- teardown
- done
-}
-
-run
-
-# Local Variables:
-# compile-command: "cd ../.. ; make TESTS=test/mon/mkfs.sh check"
-# End:
+++ /dev/null
-#!/bin/bash
-#
-# Copyright (C) 2017 Quantum Corp.
-#
-# This program is free software; you can redistribute it and/or modify
-# it under the terms of the GNU Library Public License as published by
-# the Free Software Foundation; either version 2, or (at your option)
-# any later version.
-#
-# This program is distributed in the hope that it will be useful,
-# but WITHOUT ANY WARRANTY; without even the implied warranty of
-# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
-# GNU Library Public License for more details.
-#
-source $(dirname $0)/../detect-build-env-vars.sh
-source $CEPH_ROOT/qa/workunits/ceph-helpers.sh
-
-SOCAT_PIDS=()
-
-function port_forward() {
- local source_port=$1
- local target_port=$2
-
- socat TCP-LISTEN:${source_port},fork,reuseaddr TCP:localhost:${target_port} &
- SOCAT_PIDS+=( $! )
-}
-
-function cleanup() {
- for p in "${SOCAT_PIDS[@]}"; do
- kill $p
- done
- SOCAT_PIDS=()
-}
-
-trap cleanup SIGTERM SIGKILL SIGQUIT SIGINT
-
-function run() {
- local dir=$1
- shift
-
- export MON_IP=127.0.0.1
- export MONA_PUBLIC=7132 # git grep '\<7132\>' ; there must be only one
- export MONB_PUBLIC=7133 # git grep '\<7133\>' ; there must be only one
- export MONC_PUBLIC=7134 # git grep '\<7134\>' ; there must be only one
- export MONA_BIND=7135 # git grep '\<7135\>' ; there must be only one
- export MONB_BIND=7136 # git grep '\<7136\>' ; there must be only one
- export MONC_BIND=7137 # git grep '\<7137\>' ; there must be only one
- export CEPH_ARGS
- CEPH_ARGS+="--fsid=$(uuidgen) --auth-supported=none "
-
- local funcs=${@:-$(set | sed -n -e 's/^\(TEST_[0-9a-z_]*\) .*/\1/p')}
- for func in $funcs ; do
- setup $dir || return 1
- $func $dir && cleanup || { cleanup; return 1; }
- teardown $dir
- done
-}
-
-function TEST_mon_client_connect_fails() {
- local dir=$1
-
- # start the mon with a public-bind-addr that is different
- # from the public-addr.
- CEPH_ARGS+="--mon-initial-members=a "
- CEPH_ARGS+="--mon-host=${MON_IP}:${MONA_PUBLIC} "
- run_mon_no_pool $dir a --mon-host=${MON_IP}:${MONA_PUBLIC} --public-bind-addr=${MON_IP}:${MONA_BIND} || return 1
-
- # now attempt to ping it that should fail.
- timeout 3 ceph ping mon.a || return 0
- return 1
-}
-
-function TEST_mon_client_connect() {
- local dir=$1
-
- # start the mon with a public-bind-addr that is different
- # from the public-addr.
- CEPH_ARGS+="--mon-initial-members=a "
- CEPH_ARGS+="--mon-host=${MON_IP}:${MONA_PUBLIC} "
- run_mon_no_pool $dir a --mon-host=${MON_IP}:${MONA_PUBLIC} --public-bind-addr=${MON_IP}:${MONA_BIND} || return 1
-
- # now forward the public port to the bind port.
- port_forward ${MONA_PUBLIC} ${MONA_BIND}
-
- # attempt to connect. we expect that to work
- ceph ping mon.a || return 1
-}
-
-function TEST_mon_quorum() {
- local dir=$1
-
- # start the mon with a public-bind-addr that is different
- # from the public-addr.
- CEPH_ARGS+="--mon-initial-members=a,b,c "
- CEPH_ARGS+="--mon-host=${MON_IP}:${MONA_PUBLIC},${MON_IP}:${MONB_PUBLIC},${MON_IP}:${MONC_PUBLIC} "
- run_mon_no_pool $dir a --public-addr=${MON_IP}:${MONA_PUBLIC} --public-bind-addr=${MON_IP}:${MONA_BIND} || return 1
- run_mon_no_pool $dir b --public-addr=${MON_IP}:${MONB_PUBLIC} --public-bind-addr=${MON_IP}:${MONB_BIND} || return 1
- run_mon_no_pool $dir c --public-addr=${MON_IP}:${MONC_PUBLIC} --public-bind-addr=${MON_IP}:${MONC_BIND} || return 1
-
- # now forward the public port to the bind port.
- port_forward ${MONA_PUBLIC} ${MONA_BIND}
- port_forward ${MONB_PUBLIC} ${MONB_BIND}
- port_forward ${MONC_PUBLIC} ${MONC_BIND}
-
- # expect monmap to contain 3 monitors (a, b, and c)
- jqinput="$(ceph mon_status --format=json 2>/dev/null)"
- jq_success "$jqinput" '.monmap.mons | length == 3' || return 1
-
- # quorum should form
- wait_for_quorum 300 3 || return 1
- # expect quorum to have all three monitors
- jqfilter='.quorum | length == 3'
- jq_success "$jqinput" "$jqfilter" || return 1
-}
-
-function TEST_put_get() {
- local dir=$1
-
- # start the mon with a public-bind-addr that is different
- # from the public-addr.
- CEPH_ARGS+="--mon-initial-members=a,b,c "
- CEPH_ARGS+="--mon-host=${MON_IP}:${MONA_PUBLIC},${MON_IP}:${MONB_PUBLIC},${MON_IP}:${MONC_PUBLIC} "
- run_mon_no_pool $dir a --public-addr=${MON_IP}:${MONA_PUBLIC} --public-bind-addr=${MON_IP}:${MONA_BIND} || return 1
- run_mon_no_pool $dir b --public-addr=${MON_IP}:${MONB_PUBLIC} --public-bind-addr=${MON_IP}:${MONB_BIND} || return 1
- run_mon_no_pool $dir c --public-addr=${MON_IP}:${MONC_PUBLIC} --public-bind-addr=${MON_IP}:${MONC_BIND} || return 1
-
- # now forward the public port to the bind port.
- port_forward ${MONA_PUBLIC} ${MONA_BIND}
- port_forward ${MONB_PUBLIC} ${MONB_BIND}
- port_forward ${MONC_PUBLIC} ${MONC_BIND}
-
- # quorum should form
- wait_for_quorum 300 3 || return 1
-
- run_mgr $dir x || return 1
- run_osd $dir 0 || return 1
- run_osd $dir 1 || return 1
- run_osd $dir 2 || return 1
-
- ceph osd pool create hello 8 || return 1
-
- echo "hello world" > $dir/hello
- rados --pool hello put foo $dir/hello || return 1
- rados --pool hello get foo $dir/hello2 || return 1
- diff $dir/hello $dir/hello2 || return 1
-}
-
-main mon-bind "$@"
+++ /dev/null
-#!/bin/bash
-#
-# Copyright (C) 2015 SUSE LINUX GmbH
-#
-# This program is free software; you can redistribute it and/or modify
-# it under the terms of the GNU Library Public License as published by
-# the Free Software Foundation; either version 2, or (at your option)
-# any later version.
-#
-# This program is distributed in the hope that it will be useful,
-# but WITHOUT ANY WARRANTY; without even the implied warranty of
-# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
-# GNU Library Public License for more details.
-#
-source $(dirname $0)/../detect-build-env-vars.sh
-source $CEPH_ROOT/qa/workunits/ceph-helpers.sh
-
-function run() {
- local dir=$1
- shift
-
- export CEPH_MON="127.0.0.1:7125" # git grep '\<7125\>' : there must be only one
- export CEPH_ARGS
- CEPH_ARGS+="--fsid=$(uuidgen) --auth-supported=none "
- CEPH_ARGS+="--mon-host=$CEPH_MON "
-
- local funcs=${@:-$(set | sed -n -e 's/^\(TEST_[0-9a-z_]*\) .*/\1/p')}
- for func in $funcs ; do
- setup $dir || return 1
- $func $dir || return 1
- teardown $dir || return 1
- done
-}
-
-function TEST_mon_created_time() {
- local dir=$1
-
- run_mon $dir a || return 1
-
- ceph mon dump || return 1
-
- if test "$(ceph mon dump 2>/dev/null | sed -n '/created/p' | awk '{print $NF}')"x = ""x ; then
- return 1
- fi
-
- if test "$(ceph mon dump 2>/dev/null | sed -n '/created/p' | awk '{print $NF}')"x = "0.000000"x ; then
- return 1
- fi
-}
-
-main mon-created-time "$@"
-
-# Local Variables:
-# compile-command: "cd ../.. ; make -j4 && test/mon/mon-created-time.sh"
-# End:
+++ /dev/null
-#!/bin/bash
-#
-# Copyright (C) 2013 Cloudwatt <libre.licensing@cloudwatt.com>
-# Copyright (C) 2014,2015 Red Hat <contact@redhat.com>
-#
-# Author: Loic Dachary <loic@dachary.org>
-#
-# This program is free software; you can redistribute it and/or modify
-# it under the terms of the GNU Library Public License as published by
-# the Free Software Foundation; either version 2, or (at your option)
-# any later version.
-#
-# This program is distributed in the hope that it will be useful,
-# but WITHOUT ANY WARRANTY; without even the implied warranty of
-# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
-# GNU Library Public License for more details.
-#
-source $(dirname $0)/../detect-build-env-vars.sh
-source $CEPH_ROOT/qa/workunits/ceph-helpers.sh
-
-function run() {
- local dir=$1
-
- setup $dir || return 1
-
- MONA=127.0.0.1:7300
- MONB=127.0.0.1:7301
- (
- FSID=$(uuidgen)
- export CEPH_ARGS
- CEPH_ARGS+="--fsid=$FSID --auth-supported=none "
- CEPH_ARGS+="--mon-initial-members=a,b --mon-host=$MONA,$MONB "
- run_mon $dir a --public-addr $MONA || return 1
- run_mon $dir b --public-addr $MONB || return 1
- )
-
- timeout 360 ceph --mon-host $MONA mon stat || return 1
- # check that MONB is indeed a peon
- ceph --admin-daemon $dir/ceph-mon.b.asok mon_status |
- grep '"peon"' || return 1
- # when the leader ( MONA ) is used, there is no message forwarding
- ceph --mon-host $MONA osd pool create POOL1 12
- CEPH_ARGS='' ceph --admin-daemon $dir/ceph-mon.a.asok log flush || return 1
- grep 'mon_command(.*"POOL1"' $dir/a/mon.a.log
- CEPH_ARGS='' ceph --admin-daemon $dir/ceph-mon.b.asok log flush || return 1
- grep 'mon_command(.*"POOL1"' $dir/mon.b.log && return 1
- # when the peon ( MONB ) is used, the message is forwarded to the leader
- ceph --mon-host $MONB osd pool create POOL2 12
- CEPH_ARGS='' ceph --admin-daemon $dir/ceph-mon.b.asok log flush || return 1
- grep 'forward_request.*mon_command(.*"POOL2"' $dir/mon.b.log
- CEPH_ARGS='' ceph --admin-daemon $dir/ceph-mon.a.asok log flush || return 1
- grep ' forward(mon_command(.*"POOL2"' $dir/mon.a.log
- # forwarded messages must retain features from the original connection
- features=$(sed -n -e 's|.*127.0.0.1:0.*accept features \([0-9][0-9]*\)|\1|p' < \
- $dir/mon.b.log)
- grep ' forward(mon_command(.*"POOL2".*con_features '$features $dir/mon.a.log
-
- teardown $dir || return 1
-}
-
-main mon-handle-forward "$@"
-
-# Local Variables:
-# compile-command: "cd ../.. ; make -j4 TESTS=test/mon/mon-handle-forward.sh check"
-# End:
+++ /dev/null
-#!/bin/bash
-#
-# Copyright (C) 2015 SUSE LINUX GmbH
-#
-# This program is free software; you can redistribute it and/or modify
-# it under the terms of the GNU Library Public License as published by
-# the Free Software Foundation; either version 2, or (at your option)
-# any later version.
-#
-# This program is distributed in the hope that it will be useful,
-# but WITHOUT ANY WARRANTY; without even the implied warranty of
-# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
-# GNU Library Public License for more details.
-#
-source $(dirname $0)/../detect-build-env-vars.sh
-source $CEPH_ROOT/qa/workunits/ceph-helpers.sh
-
-function run() {
- local dir=$1
- shift
-
- export CEPH_MON="127.0.0.1:7119" # git grep '\<7119\>' : there must be only one
- export CEPH_ARGS
- CEPH_ARGS+="--fsid=$(uuidgen) --auth-supported=none "
- CEPH_ARGS+="--mon-host=$CEPH_MON "
-
- local funcs=${@:-$(set | sed -n -e 's/^\(TEST_[0-9a-z_]*\) .*/\1/p')}
- for func in $funcs ; do
- setup $dir || return 1
- $func $dir || return 1
- teardown $dir || return 1
- done
-}
-
-function TEST_mon_ping() {
- local dir=$1
-
- run_mon $dir a || return 1
-
- ceph ping mon.a || return 1
-}
-
-main mon-ping "$@"
-
-# Local Variables:
-# compile-command: "cd ../.. ; make -j4 && test/mon/mon-ping.sh"
-# End:
+++ /dev/null
-#!/bin/bash
-#
-# Copyright (C) 2014 Cloudwatt <libre.licensing@cloudwatt.com>
-# Copyright (C) 2014, 2015 Red Hat <contact@redhat.com>
-#
-# Author: Loic Dachary <loic@dachary.org>
-#
-# This program is free software; you can redistribute it and/or modify
-# it under the terms of the GNU Library Public License as published by
-# the Free Software Foundation; either version 2, or (at your option)
-# any later version.
-#
-# This program is distributed in the hope that it will be useful,
-# but WITHOUT ANY WARRANTY; without even the implied warranty of
-# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
-# GNU Library Public License for more details.
-#
-source $(dirname $0)/../detect-build-env-vars.sh
-source $CEPH_ROOT/qa/workunits/ceph-helpers.sh
-
-function run() {
- local dir=$1
- shift
-
- export CEPH_MON="127.0.0.1:7120" # git grep '\<7120\>' : there must be only one
- export CEPH_ARGS
- CEPH_ARGS+="--fsid=$(uuidgen) --auth-supported=none "
- CEPH_ARGS+="--mon-host=$CEPH_MON "
-
- local funcs=${@:-$(set | sed -n -e 's/^\(TEST_[0-9a-z_]*\) .*/\1/p')}
- for func in $funcs ; do
- setup $dir || return 1
- $func $dir || return 1
- teardown $dir || return 1
- done
-}
-
-function TEST_mon_scrub() {
- local dir=$1
-
- run_mon $dir a || return 1
-
- ceph mon scrub || return 1
-}
-
-main mon-scrub "$@"
-
-# Local Variables:
-# compile-command: "cd ../.. ; make -j4 && test/mon/mon-scrub.sh"
-# End:
"allow command abc with arg=foo arg2=bar",
"allow command abc with arg=foo arg2 prefix bar arg3 prefix baz",
"allow command abc with arg=foo arg2 prefix \"bar bingo\" arg3 prefix baz",
+ "allow command abc with arg regex \"^[0-9a-z.]*$\"",
+ "allow command abc with arg regex \"\(invaluid regex\"",
"allow service foo x",
"allow service foo x; allow service bar x",
"allow service foo w ;allow service bar x",
"allow command abc.def with arg=foo arg2=bar, allow service foo r",
"allow command \"foo bar\" with arg=\"baz\"",
"allow command \"foo bar\" with arg=\"baz.xx\"",
+ "profile osd",
+ "profile \"mds-bootstrap\", profile foo",
0
};
name, "", "config-key get", ca, true, true, true));
ASSERT_TRUE(cap.is_capable(NULL, CEPH_ENTITY_TYPE_MON,
name, "", "config-key put", ca, true, true, true));
+ ASSERT_TRUE(cap.is_capable(NULL, CEPH_ENTITY_TYPE_MON,
+ name, "", "config-key set", ca, true, true, true));
ASSERT_TRUE(cap.is_capable(NULL, CEPH_ENTITY_TYPE_MON,
name, "", "config-key exists", ca, true, true, true));
ASSERT_TRUE(cap.is_capable(NULL, CEPH_ENTITY_TYPE_MON,
name, "", "config-key delete", ca, true, true, true));
}
+TEST(MonCap, CommandRegEx) {
+ MonCap cap;
+ ASSERT_FALSE(cap.is_allow_all());
+ ASSERT_TRUE(cap.parse("allow command abc with arg regex \"^[0-9a-z.]*$\"", NULL));
+
+ EntityName name;
+ name.from_str("osd.123");
+ ASSERT_TRUE(cap.is_capable(nullptr, CEPH_ENTITY_TYPE_OSD, name, "",
+ "abc", {{"arg", "12345abcde"}}, true, true, true));
+ ASSERT_FALSE(cap.is_capable(nullptr, CEPH_ENTITY_TYPE_OSD, name, "",
+ "abc", {{"arg", "~!@#$"}}, true, true, true));
+
+ ASSERT_TRUE(cap.parse("allow command abc with arg regex \"[*\"", NULL));
+ ASSERT_FALSE(cap.is_capable(nullptr, CEPH_ENTITY_TYPE_OSD, name, "",
+ "abc", {{"arg", ""}}, true, true, true));
+}
+++ /dev/null
-<?xml version="1.0" encoding="UTF-8"?>
-<grammar xmlns="http://relaxng.org/ns/structure/1.0">
- <start>
- <ref name="crush_map_roots"/>
- </start>
- <define name="item">
- <choice>
- <ref name="bucket"/>
- <ref name="device"/>
- </choice>
- </define>
- <define name="device">
- <element name="device">
- <element name="id">
- <text/>
- </element>
- <element name="device_class">
- <text/>
- </element>
- <element name="name">
- <text/>
- </element>
- <element name="type">
- <text/>
- </element>
- <element name="type_id">
- <text/>
- </element>
- <element name="crush_weight">
- <text/>
- </element>
- <element name="depth">
- <text/>
- </element>
- </element>
- </define>
- <define name="bucket">
- <element name="bucket">
- <element name="id">
- <text/>
- </element>
- <element name="device_class">
- <text/>
- </element>
- <element name="name">
- <text/>
- </element>
- <element name="type">
- <text/>
- </element>
- <element name="type_id">
- <text/>
- </element>
- <element name="items">
- <zeroOrMore>
- <ref name="item"/>
- </zeroOrMore>
- </element>
- </element>
- </define>
- <define name="crush_map_roots">
- <element name="crush_map_roots">
- <oneOrMore>
- <ref name="bucket"/>
- </oneOrMore>
- </element>
- </define>
-</grammar>
+++ /dev/null
-#!/bin/bash
-#
-# Copyright (C) 2014 Cloudwatt <libre.licensing@cloudwatt.com>
-# Copyright (C) 2014, 2015 Red Hat <contact@redhat.com>
-#
-# Author: Loic Dachary <loic@dachary.org>
-#
-# This program is free software; you can redistribute it and/or modify
-# it under the terms of the GNU Library Public License as published by
-# the Free Software Foundation; either version 2, or (at your option)
-# any later version.
-#
-# This program is distributed in the hope that it will be useful,
-# but WITHOUT ANY WARRANTY; without even the implied warranty of
-# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
-# GNU Library Public License for more details.
-#
-source $(dirname $0)/../detect-build-env-vars.sh
-source $CEPH_ROOT/qa/workunits/ceph-helpers.sh
-
-function run() {
- local dir=$1
- shift
-
- export CEPH_MON="127.0.0.1:7104" # git grep '\<7104\>' : there must be only one
- export CEPH_ARGS
- CEPH_ARGS+="--fsid=$(uuidgen) --auth-supported=none "
- CEPH_ARGS+="--mon-host=$CEPH_MON "
-
- local funcs=${@:-$(set | ${SED} -n -e 's/^\(TEST_[0-9a-z_]*\) .*/\1/p')}
- for func in $funcs ; do
- setup $dir || return 1
- $func $dir || return 1
- teardown $dir || return 1
- done
-}
-
-function TEST_crush_rule_create_simple() {
- local dir=$1
-
- run_mon $dir a || return 1
-
- ceph --format xml osd crush rule dump replicated_rule | \
- egrep '<op>take</op><item>[^<]+</item><item_name>default</item_name>' | \
- grep '<op>choose_firstn</op><num>0</num><type>osd</type>' || return 1
- local ruleset=ruleset0
- local root=host1
- ceph osd crush add-bucket $root host
- local failure_domain=osd
- ceph osd crush rule create-simple $ruleset $root $failure_domain || return 1
- ceph osd crush rule create-simple $ruleset $root $failure_domain 2>&1 | \
- grep "$ruleset already exists" || return 1
- ceph --format xml osd crush rule dump $ruleset | \
- egrep '<op>take</op><item>[^<]+</item><item_name>'$root'</item_name>' | \
- grep '<op>choose_firstn</op><num>0</num><type>'$failure_domain'</type>' || return 1
- ceph osd crush rule rm $ruleset || return 1
-}
-
-function TEST_crush_rule_dump() {
- local dir=$1
-
- run_mon $dir a || return 1
-
- local ruleset=ruleset1
- ceph osd crush rule create-erasure $ruleset || return 1
- test $(ceph --format json osd crush rule dump $ruleset | \
- jq ".rule_name == \"$ruleset\"") == true || return 1
- test $(ceph --format json osd crush rule dump | \
- jq "map(select(.rule_name == \"$ruleset\")) | length == 1") == true || return 1
- ! ceph osd crush rule dump non_existent_ruleset || return 1
- ceph osd crush rule rm $ruleset || return 1
-}
-
-function TEST_crush_rule_rm() {
- local ruleset=erasure2
-
- run_mon $dir a || return 1
-
- ceph osd crush rule create-erasure $ruleset default || return 1
- ceph osd crush rule ls | grep $ruleset || return 1
- ceph osd crush rule rm $ruleset || return 1
- ! ceph osd crush rule ls | grep $ruleset || return 1
-}
-
-function TEST_crush_rule_create_erasure() {
- local dir=$1
-
- run_mon $dir a || return 1
- # should have at least one OSD
- run_osd $dir 0 || return 1
-
- local ruleset=ruleset3
- #
- # create a new ruleset with the default profile, implicitly
- #
- ceph osd crush rule create-erasure $ruleset || return 1
- ceph osd crush rule create-erasure $ruleset 2>&1 | \
- grep "$ruleset already exists" || return 1
- ceph --format xml osd crush rule dump $ruleset | \
- egrep '<op>take</op><item>[^<]+</item><item_name>default</item_name>' | \
- grep '<op>chooseleaf_indep</op><num>0</num><type>host</type>' || return 1
- ceph osd crush rule rm $ruleset || return 1
- ! ceph osd crush rule ls | grep $ruleset || return 1
- #
- # create a new ruleset with the default profile, explicitly
- #
- ceph osd crush rule create-erasure $ruleset default || return 1
- ceph osd crush rule ls | grep $ruleset || return 1
- ceph osd crush rule rm $ruleset || return 1
- ! ceph osd crush rule ls | grep $ruleset || return 1
- #
- # create a new ruleset and the default profile, implicitly
- #
- ceph osd erasure-code-profile rm default || return 1
- ! ceph osd erasure-code-profile ls | grep default || return 1
- ceph osd crush rule create-erasure $ruleset || return 1
- CEPH_ARGS='' ceph --admin-daemon $dir/ceph-mon.a.asok log flush || return 1
- grep 'profile set default' $dir/mon.a.log || return 1
- ceph osd erasure-code-profile ls | grep default || return 1
- ceph osd crush rule rm $ruleset || return 1
- ! ceph osd crush rule ls | grep $ruleset || return 1
-}
-
-function check_ruleset_id_match_rule_id() {
- local rule_name=$1
- rule_id=`ceph osd crush rule dump $rule_name | grep "\"rule_id\":" | awk -F ":|," '{print int($2)}'`
- ruleset_id=`ceph osd crush rule dump $rule_name | grep "\"ruleset\":"| awk -F ":|," '{print int($2)}'`
- test $ruleset_id = $rule_id || return 1
-}
-
-function generate_manipulated_rules() {
- local dir=$1
- ceph osd crush add-bucket $root host
- ceph osd crush rule create-simple test_rule1 $root osd firstn || return 1
- ceph osd crush rule create-simple test_rule2 $root osd firstn || return 1
- ceph osd getcrushmap -o $dir/original_map
- crushtool -d $dir/original_map -o $dir/decoded_original_map
- #manipulate the rulesets , to make the rule_id != ruleset_id
- ${SED} -i 's/ruleset 0/ruleset 3/' $dir/decoded_original_map
- ${SED} -i 's/ruleset 2/ruleset 0/' $dir/decoded_original_map
- ${SED} -i 's/ruleset 1/ruleset 2/' $dir/decoded_original_map
-
- crushtool -c $dir/decoded_original_map -o $dir/new_map
- ceph osd setcrushmap -i $dir/new_map
-
- ceph osd crush rule dump
-}
-
-function TEST_crush_ruleset_match_rule_when_creating() {
- local dir=$1
-
- run_mon $dir a || return 1
-
- local root=host1
-
- generate_manipulated_rules $dir
-
- ceph osd crush rule create-simple special_rule_simple $root osd firstn || return 1
-
- ceph osd crush rule dump
- #show special_rule_simple has same rule_id and ruleset_id
- check_ruleset_id_match_rule_id special_rule_simple || return 1
-}
-
-function TEST_add_ruleset_failed() {
- local dir=$1
-
- run_mon $dir a || return 1
-
- local root=host1
-
- ceph osd crush add-bucket $root host
- ceph osd crush rule create-simple test_rule1 $root osd firstn || return 1
- ceph osd crush rule create-simple test_rule2 $root osd firstn || return 1
- ceph osd getcrushmap > $dir/crushmap || return 1
- crushtool --decompile $dir/crushmap > $dir/crushmap.txt || return 1
- for i in $(seq 3 255)
- do
- cat <<EOF
-rule test_rule$i {
- ruleset $i
- type replicated
- min_size 1
- max_size 10
- step take $root
- step choose firstn 0 type osd
- step emit
-}
-EOF
- done >> $dir/crushmap.txt
- crushtool --compile $dir/crushmap.txt -o $dir/crushmap || return 1
- ceph osd setcrushmap -i $dir/crushmap || return 1
- ceph osd crush rule create-simple test_rule_nospace $root osd firstn 2>&1 | grep "Error ENOSPC" || return 1
-
-}
-
-function TEST_crush_rename_bucket() {
- local dir=$1
-
- run_mon $dir a || return 1
-
- ceph osd crush add-bucket host1 host
- ceph osd tree
- ! ceph osd tree | grep host2 || return 1
- ceph osd crush rename-bucket host1 host2 || return 1
- ceph osd tree
- ceph osd tree | grep host2 || return 1
- ceph osd crush rename-bucket host1 host2 || return 1 # idempotency
- ceph osd crush rename-bucket nonexistent something 2>&1 | grep "Error ENOENT" || return 1
-}
-
-function TEST_crush_reject_empty() {
- local dir=$1
- run_mon $dir a || return 1
- # should have at least one OSD
- run_osd $dir 0 || return 1
-
- local empty_map=$dir/empty_map
- :> $empty_map.txt
- crushtool -c $empty_map.txt -o $empty_map.map || return 1
- expect_failure $dir "Error EINVAL" \
- ceph osd setcrushmap -i $empty_map.map || return 1
-}
-
-function TEST_crush_tree() {
- local dir=$1
- run_mon $dir a || return 1
-
- ceph osd crush tree --format=xml | \
- $XMLSTARLET val -e -r $CEPH_ROOT/src/test/mon/osd-crush-tree.rng - || return 1
-}
-
-main osd-crush "$@"
-
-# Local Variables:
-# compile-command: "cd ../.. ; make -j4 && test/mon/osd-crush.sh"
-# End:
+++ /dev/null
-#!/bin/bash
-#
-# Copyright (C) 2014 Cloudwatt <libre.licensing@cloudwatt.com>
-# Copyright (C) 2014, 2015 Red Hat <contact@redhat.com>
-#
-# Author: Loic Dachary <loic@dachary.org>
-#
-# This program is free software; you can redistribute it and/or modify
-# it under the terms of the GNU Library Public License as published by
-# the Free Software Foundation; either version 2, or (at your option)
-# any later version.
-#
-# This program is distributed in the hope that it will be useful,
-# but WITHOUT ANY WARRANTY; without even the implied warranty of
-# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
-# GNU Library Public License for more details.
-#
-source $(dirname $0)/../detect-build-env-vars.sh
-source $CEPH_ROOT/qa/workunits/ceph-helpers.sh
-
-function run() {
- local dir=$1
- shift
-
- export CEPH_MON="127.0.0.1:7220" # git grep '\<7220\>' : there must be only one
- export CEPH_ARGS
- CEPH_ARGS+="--fsid=$(uuidgen) --auth-supported=none "
- CEPH_ARGS+="--mon-host=$CEPH_MON "
-
- local funcs=${@:-$(set | sed -n -e 's/^\(TEST_[0-9a-z_]*\) .*/\1/p')}
- for func in $funcs ; do
- setup $dir || return 1
- $func $dir || return 1
- teardown $dir || return 1
- done
-}
-
-function TEST_set() {
- local dir=$1
- local id=$2
-
- run_mon $dir a || return 1
-
- local profile=myprofile
- #
- # no key=value pairs : use the default configuration
- #
- ceph osd erasure-code-profile set $profile 2>&1 || return 1
- ceph osd erasure-code-profile get $profile | \
- grep plugin=jerasure || return 1
- ceph osd erasure-code-profile rm $profile
- #
- # key=value pairs override the default
- #
- ceph osd erasure-code-profile set $profile \
- key=value plugin=example || return 1
- ceph osd erasure-code-profile get $profile | \
- grep -e key=value -e plugin=example || return 1
- #
- # --force is required to override an existing profile
- #
- ! ceph osd erasure-code-profile set $profile > $dir/out 2>&1 || return 1
- grep 'will not override' $dir/out || return 1
- ceph osd erasure-code-profile set $profile key=other --force || return 1
- ceph osd erasure-code-profile get $profile | \
- grep key=other || return 1
-
- ceph osd erasure-code-profile rm $profile # cleanup
-}
-
-function TEST_ls() {
- local dir=$1
- local id=$2
-
- run_mon $dir a || return 1
-
- local profile=myprofile
- ! ceph osd erasure-code-profile ls | grep $profile || return 1
- ceph osd erasure-code-profile set $profile 2>&1 || return 1
- ceph osd erasure-code-profile ls | grep $profile || return 1
- ceph --format xml osd erasure-code-profile ls | \
- grep "<profile>$profile</profile>" || return 1
-
- ceph osd erasure-code-profile rm $profile # cleanup
-}
-
-function TEST_rm() {
- local dir=$1
- local id=$2
-
- run_mon $dir a || return 1
-
- local profile=myprofile
- ceph osd erasure-code-profile set $profile 2>&1 || return 1
- ceph osd erasure-code-profile ls | grep $profile || return 1
- ceph osd erasure-code-profile rm $profile || return 1
- ! ceph osd erasure-code-profile ls | grep $profile || return 1
- ceph osd erasure-code-profile rm WRONG 2>&1 | \
- grep "WRONG does not exist" || return 1
-
- ceph osd erasure-code-profile set $profile || return 1
- ceph osd pool create poolname 12 12 erasure $profile || return 1
- ! ceph osd erasure-code-profile rm $profile > $dir/out 2>&1 || return 1
- grep "poolname.*using.*$profile" $dir/out || return 1
- ceph osd pool delete poolname poolname --yes-i-really-really-mean-it || return 1
- ceph osd erasure-code-profile rm $profile || return 1
-
- ceph osd erasure-code-profile rm $profile # cleanup
-}
-
-function TEST_get() {
- local dir=$1
- local id=$2
-
- run_mon $dir a || return 1
-
- local default_profile=default
- ceph osd erasure-code-profile get $default_profile | \
- grep plugin=jerasure || return 1
- ceph --format xml osd erasure-code-profile get $default_profile | \
- grep '<plugin>jerasure</plugin>' || return 1
- ! ceph osd erasure-code-profile get WRONG > $dir/out 2>&1 || return 1
- grep -q "unknown erasure code profile 'WRONG'" $dir/out || return 1
-}
-
-function TEST_set_idempotent() {
- local dir=$1
- local id=$2
-
- run_mon $dir a || return 1
- #
- # The default profile is set using a code path different from
- # ceph osd erasure-code-profile set: verify that it is idempotent,
- # as if it was using the same code path.
- #
- ceph osd erasure-code-profile set default k=2 m=1 2>&1 || return 1
- local profile
- #
- # Because plugin=jerasure is the default, it uses a slightly
- # different code path where defaults (m=1 for instance) are added
- # implicitly.
- #
- profile=profileidempotent1
- ! ceph osd erasure-code-profile ls | grep $profile || return 1
- ceph osd erasure-code-profile set $profile k=2 crush-failure-domain=osd 2>&1 || return 1
- ceph osd erasure-code-profile ls | grep $profile || return 1
- ceph osd erasure-code-profile set $profile k=2 crush-failure-domain=osd 2>&1 || return 1
- ceph osd erasure-code-profile rm $profile # cleanup
-
- #
- # In the general case the profile is exactly what is on
- #
- profile=profileidempotent2
- ! ceph osd erasure-code-profile ls | grep $profile || return 1
- ceph osd erasure-code-profile set $profile plugin=lrc k=4 m=2 l=3 crush-failure-domain=osd 2>&1 || return 1
- ceph osd erasure-code-profile ls | grep $profile || return 1
- ceph osd erasure-code-profile set $profile plugin=lrc k=4 m=2 l=3 crush-failure-domain=osd 2>&1 || return 1
- ceph osd erasure-code-profile rm $profile # cleanup
-}
-
-function TEST_format_invalid() {
- local dir=$1
-
- local profile=profile
- # osd_pool_default_erasure-code-profile is
- # valid JSON but not of the expected type
- run_mon $dir a \
- --osd_pool_default_erasure-code-profile 1 || return 1
- ! ceph osd erasure-code-profile set $profile > $dir/out 2>&1 || return 1
- cat $dir/out
- grep 'must be a JSON object' $dir/out || return 1
-}
-
-function TEST_format_json() {
- local dir=$1
-
- # osd_pool_default_erasure-code-profile is JSON
- expected='"plugin":"example"'
- run_mon $dir a \
- --osd_pool_default_erasure-code-profile "{$expected}" || return 1
- ceph --format json osd erasure-code-profile get default | \
- grep "$expected" || return 1
-}
-
-function TEST_format_plain() {
- local dir=$1
-
- # osd_pool_default_erasure-code-profile is plain text
- expected='"plugin":"example"'
- run_mon $dir a \
- --osd_pool_default_erasure-code-profile "plugin=example" || return 1
- ceph --format json osd erasure-code-profile get default | \
- grep "$expected" || return 1
-}
-
-function TEST_profile_k_sanity() {
- local dir=$1
- local profile=profile-sanity
-
- run_mon $dir a || return 1
-
- expect_failure $dir 'k must be a multiple of (k + m) / l' \
- ceph osd erasure-code-profile set $profile \
- plugin=lrc \
- l=1 \
- k=1 \
- m=1 || return 1
-
- if erasure_code_plugin_exists isa ; then
- expect_failure $dir 'k=1 must be >= 2' \
- ceph osd erasure-code-profile set $profile \
- plugin=isa \
- k=1 \
- m=1 || return 1
- else
- echo "SKIP because plugin isa has not been built"
- fi
-
- expect_failure $dir 'k=1 must be >= 2' \
- ceph osd erasure-code-profile set $profile \
- plugin=jerasure \
- k=1 \
- m=1 || return 1
-}
-
-main osd-erasure-code-profile "$@"
-
-# Local Variables:
-# compile-command: "cd ../.. ; make -j4 && test/mon/osd-erasure-code-profile.sh"
-# End:
+++ /dev/null
-#!/bin/bash
-#
-# Copyright (C) 2013, 2014 Cloudwatt <libre.licensing@cloudwatt.com>
-# Copyright (C) 2014, 2015 Red Hat <contact@redhat.com>
-#
-# Author: Loic Dachary <loic@dachary.org>
-#
-# This program is free software; you can redistribute it and/or modify
-# it under the terms of the GNU Library Public License as published by
-# the Free Software Foundation; either version 2, or (at your option)
-# any later version.
-#
-# This program is distributed in the hope that it will be useful,
-# but WITHOUT ANY WARRANTY; without even the implied warranty of
-# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
-# GNU Library Public License for more details.
-#
-source $(dirname $0)/../detect-build-env-vars.sh
-source $CEPH_ROOT/qa/workunits/ceph-helpers.sh
-
-function run() {
- local dir=$1
- shift
-
- export CEPH_MON="127.0.0.1:7105" # git grep '\<7105\>' : there must be only one
- export CEPH_ARGS
- CEPH_ARGS+="--fsid=$(uuidgen) --auth-supported=none "
- CEPH_ARGS+="--mon-host=$CEPH_MON "
-
- local funcs=${@:-$(set | sed -n -e 's/^\(TEST_[0-9a-z_]*\) .*/\1/p')}
- for func in $funcs ; do
- setup $dir || return 1
- $func $dir || return 1
- teardown $dir || return 1
- done
-}
-
-# Before http://tracker.ceph.com/issues/8307 the invalid profile was created
-function TEST_erasure_invalid_profile() {
- local dir=$1
- run_mon $dir a || return 1
- local poolname=pool_erasure
- local notaprofile=not-a-valid-erasure-code-profile
- ! ceph osd pool create $poolname 12 12 erasure $notaprofile || return 1
- ! ceph osd erasure-code-profile ls | grep $notaprofile || return 1
-}
-
-function TEST_erasure_crush_rule() {
- local dir=$1
- run_mon $dir a || return 1
- #
- # choose the crush ruleset used with an erasure coded pool
- #
- local crush_ruleset=myruleset
- ! ceph osd crush rule ls | grep $crush_ruleset || return 1
- ceph osd crush rule create-erasure $crush_ruleset
- ceph osd crush rule ls | grep $crush_ruleset
- local poolname
- poolname=pool_erasure1
- ! ceph --format json osd dump | grep '"crush_rule":1' || return 1
- ceph osd pool create $poolname 12 12 erasure default $crush_ruleset
- ceph --format json osd dump | grep '"crush_rule":1' || return 1
- #
- # a crush ruleset by the name of the pool is implicitly created
- #
- poolname=pool_erasure2
- ceph osd erasure-code-profile set myprofile
- ceph osd pool create $poolname 12 12 erasure myprofile
- ceph osd crush rule ls | grep $poolname || return 1
- #
- # a non existent crush ruleset given in argument is an error
- # http://tracker.ceph.com/issues/9304
- #
- poolname=pool_erasure3
- ! ceph osd pool create $poolname 12 12 erasure myprofile INVALIDRULESET || return 1
-}
-
-function TEST_erasure_code_profile_default() {
- local dir=$1
- run_mon $dir a || return 1
- ceph osd erasure-code-profile rm default || return 1
- ! ceph osd erasure-code-profile ls | grep default || return 1
- ceph osd pool create $poolname 12 12 erasure default
- ceph osd erasure-code-profile ls | grep default || return 1
-}
-
-function TEST_erasure_crush_stripe_unit() {
- local dir=$1
- # the default stripe unit is used to initialize the pool
- run_mon $dir a --public-addr $CEPH_MON
- stripe_unit=$(ceph-conf --show-config-value osd_pool_erasure_code_stripe_unit)
- eval local $(ceph osd erasure-code-profile get myprofile | grep k=)
- stripe_width = $((stripe_unit * k))
- ceph osd pool create pool_erasure 12 12 erasure
- ceph --format json osd dump | tee $dir/osd.json
- grep '"stripe_width":'$stripe_width $dir/osd.json > /dev/null || return 1
-}
-
-function TEST_erasure_crush_stripe_unit_padded() {
- local dir=$1
- # setting osd_pool_erasure_code_stripe_unit modifies the stripe_width
- # and it is padded as required by the default plugin
- profile+=" plugin=jerasure"
- profile+=" technique=reed_sol_van"
- k=4
- profile+=" k=$k"
- profile+=" m=2"
- actual_stripe_unit=2048
- desired_stripe_unit=$((actual_stripe_unit - 1))
- actual_stripe_width=$((actual_stripe_unit * k))
- run_mon $dir a \
- --osd_pool_erasure_code_stripe_unit $desired_stripe_unit \
- --osd_pool_default_erasure_code_profile "$profile" || return 1
- ceph osd pool create pool_erasure 12 12 erasure
- ceph osd dump | tee $dir/osd.json
- grep "stripe_width $actual_stripe_width" $dir/osd.json > /dev/null || return 1
-}
-
-function TEST_erasure_code_pool() {
- local dir=$1
- run_mon $dir a || return 1
- ceph --format json osd dump > $dir/osd.json
- local expected='"erasure_code_profile":"default"'
- ! grep "$expected" $dir/osd.json || return 1
- ceph osd pool create erasurecodes 12 12 erasure
- ceph --format json osd dump | tee $dir/osd.json
- grep "$expected" $dir/osd.json > /dev/null || return 1
-
- ceph osd pool create erasurecodes 12 12 erasure 2>&1 | \
- grep 'already exists' || return 1
- ceph osd pool create erasurecodes 12 12 2>&1 | \
- grep 'cannot change to type replicated' || return 1
-}
-
-function TEST_replicated_pool_with_ruleset() {
- local dir=$1
- run_mon $dir a
- local ruleset=ruleset0
- local root=host1
- ceph osd crush add-bucket $root host
- local failure_domain=osd
- local poolname=mypool
- ceph osd crush rule create-simple $ruleset $root $failure_domain || return 1
- ceph osd crush rule ls | grep $ruleset
- ceph osd pool create $poolname 12 12 replicated $ruleset 2>&1 | \
- grep "pool 'mypool' created" || return 1
- rule_id=`ceph osd crush rule dump $ruleset | grep "rule_id" | awk -F[' ':,] '{print $4}'`
- ceph osd pool get $poolname crush_rule 2>&1 | \
- grep "crush_rule: $rule_id" || return 1
- #non-existent crush ruleset
- ceph osd pool create newpool 12 12 replicated non-existent 2>&1 | \
- grep "doesn't exist" || return 1
-}
-
-function TEST_erasure_code_pool_lrc() {
- local dir=$1
- run_mon $dir a || return 1
-
- ceph osd erasure-code-profile set LRCprofile \
- plugin=lrc \
- mapping=DD_ \
- layers='[ [ "DDc", "" ] ]' || return 1
-
- ceph --format json osd dump > $dir/osd.json
- local expected='"erasure_code_profile":"LRCprofile"'
- local poolname=erasurecodes
- ! grep "$expected" $dir/osd.json || return 1
- ceph osd pool create $poolname 12 12 erasure LRCprofile
- ceph --format json osd dump | tee $dir/osd.json
- grep "$expected" $dir/osd.json > /dev/null || return 1
- ceph osd crush rule ls | grep $poolname || return 1
-}
-
-function TEST_replicated_pool() {
- local dir=$1
- run_mon $dir a || return 1
- ceph osd pool create replicated 12 12 replicated replicated_rule 2>&1 | \
- grep "pool 'replicated' created" || return 1
- ceph osd pool create replicated 12 12 replicated replicated_rule 2>&1 | \
- grep 'already exists' || return 1
- # default is replicated
- ceph osd pool create replicated1 12 12 2>&1 | \
- grep "pool 'replicated1' created" || return 1
- # default is replicated, pgp_num = pg_num
- ceph osd pool create replicated2 12 2>&1 | \
- grep "pool 'replicated2' created" || return 1
- ceph osd pool create replicated 12 12 erasure 2>&1 | \
- grep 'cannot change to type erasure' || return 1
-}
-
-function TEST_no_pool_delete() {
- local dir=$1
- run_mon $dir a || return 1
- ceph osd pool create foo 1 || return 1
- ceph tell mon.a injectargs -- --no-mon-allow-pool-delete || return 1
- ! ceph osd pool delete foo foo --yes-i-really-really-mean-it || return 1
- ceph tell mon.a injectargs -- --mon-allow-pool-delete || return 1
- ceph osd pool delete foo foo --yes-i-really-really-mean-it || return 1
-}
-
-function TEST_utf8_cli() {
- local dir=$1
- run_mon $dir a || return 1
- # Hopefully it's safe to include literal UTF-8 characters to test
- # the fix for http://tracker.ceph.com/issues/7387. If it turns out
- # to not be OK (when is the default encoding *not* UTF-8?), maybe
- # the character '黄' can be replaced with the escape $'\xe9\xbb\x84'
- ceph osd pool create 黄 1024 2>&1 | \
- grep "pool '黄' created" || return 1
- ceph osd lspools 2>&1 | \
- grep "黄" || return 1
- ceph -f json-pretty osd dump | \
- python -c "import json; import sys; json.load(sys.stdin)" || return 1
- ceph osd pool delete 黄 黄 --yes-i-really-really-mean-it
-}
-
-main osd-pool-create "$@"
-
-# Local Variables:
-# compile-command: "cd ../.. ; make -j4 && test/mon/osd-pool-create.sh"
-# End:
+++ /dev/null
-#!/bin/bash
-# vim: ts=8 sw=2 smarttab
-#
-# $0.sh - run mon workload generator
-
-if [[ ! -e "./ceph_ver.h" ]]; then
- echo "This script must be run from the repository's src/ directory"
- exit 1
-fi
-
-usage() {
- echo "usage: $1 [options..] <num-osds>"
- echo
- echo "options:"
- echo " -v, --verbose Be more verbose"
- echo " -c, --conf FILE ceph.conf location"
- echo " -d, --duration SECS Run test for SECS seconds (default: 300)"
- echo " --debug LEVEL Set the test's debug level (default: 0)"
- echo " -n, --new Make a fresh run by creating a new cluster"
- echo
- echo "environment variables:"
- echo " EXTRA_ARGS Pass additional ceph arguments to the test"
- echo
-}
-
-stop_ceph() {
- if [[ ! -e "init-ceph" ]]; then
- echo "could not find 'init-ceph'; killing only by hand and may bob have"
- echo "mercy on our souls"
- else
- ./init-ceph stop
- fi
-
- for i in mon osd mds; do
- killall -9 ceph-$i
- done
-}
-
-start_mon() {
- if [[ ! -e "init-ceph" ]]; then
- echo "could not find 'init-ceph'; attempting to start monitors by hand"
-
- for i in a b c; do
- ./ceph-mon -i $i -c ceph.conf -k keyring -d
- done
- else
- ./init-ceph start mon
- fi
-}
-
-make_fresh() {
- rm -fr dev/ out/ keyring
- mkdir dev
-
- if [[ ! -e "vstart.sh" ]]; then
- echo "could not find 'vstart.sh', which is weird; what have you done?"
- exit 1
- fi
-
- env MON=3 OSD=0 MDS=0 MGR=0 \
- ./vstart.sh -n -l -d mon
-}
-
-DEBUG=0
-TEST_CEPH_CONF=
-DURATION=
-LOADGEN_NUM_OSDS=
-ARGS=
-
-[[ ! -z $EXTRA_ARGS ]] && ARGS="$EXTRA_ARGS"
-
-fresh_run=0
-
-while [[ $# -gt 0 ]];
-do
- case "$1" in
- -v | --verbose)
- VERBOSE=1
- shift
- ;;
- -c | --conf)
- if [[ "$2" == "" ]]; then
- echo "'$1' expects an argument; none was given"
- exit 1
- fi
- TEST_CEPH_CONF="$2"
- shift 2
- ;;
- --debug)
- if [[ -z $2 ]]; then
- echo "'$1' expects an argument; none was given"
- exit 1
- fi
- ARGS="$ARGS --debug-none $2"
- shift 2
- ;;
- -d | --duration)
- if [[ -z $2 ]]; then
- echo "'$1' expects an argument; none was given"
- exit 1
- fi
- DURATION=$2
- shift 2
- ;;
- -n | --new)
- fresh_run=1
- shift
- ;;
- --)
- shift
- break
- ;;
- -*)
- echo "$1: unknown option" >&2
- usage $0
- exit 1
- ;;
- *)
- LOADGEN_NUM_OSDS=$1
- shift
- break
- ;;
- esac
-done
-
-if [[ -z $LOADGEN_NUM_OSDS ]]; then
- echo "must specify the number of osds"
- usage $0
- exit 1
-fi
-
-stop_ceph ;
-[[ $fresh_run -eq 1 ]] && make_fresh ;
-start_mon ;
-
-env VERBOSE=$VERBOSE TEST_CEPH_CONF="$TEST_CEPH_CONF" \
- DURATION=$DURATION EXTRA_ARGS="$ARGS" \
- LOADGEN_NUM_OSDS=$LOADGEN_NUM_OSDS \
- PATH="$PATH:`pwd`" ../qa/workunits/mon/workloadgen.sh
+++ /dev/null
-#!/bin/bash
-
-#
-# Generic pool quota test
-#
-
-# Includes
-
-source $(dirname $0)/../detect-build-env-vars.sh
-
-source $CEPH_ROOT/qa/workunits/ceph-helpers.sh
-
-function run() {
- local dir=$1
- shift
-
- export CEPH_MON="127.0.0.1:17108" # git grep '\<17108\>' : there must be only one
- export CEPH_ARGS
- CEPH_ARGS+="--fsid=$(uuidgen) --auth-supported=none "
- CEPH_ARGS+="--mon-host=$CEPH_MON "
-
- local funcs=${@:-$(set | sed -n -e 's/^\(TEST_[0-9a-z_]*\) .*/\1/p')}
- for func in $funcs ; do
- $func $dir || return 1
- done
-}
-
-function TEST_pool_quota() {
- local dir=$1
- setup $dir || return 1
-
- run_mon $dir a || return 1
- run_osd $dir 0 || return 1
- run_osd $dir 1 || return 1
- run_osd $dir 2 || return 1
-
- local poolname=testquoa
- ceph osd pool create $poolname 20
- local objects=`ceph df detail | grep -w $poolname|awk '{print $3}'`
- local bytes=`ceph df detail | grep -w $poolname|awk '{print $4}'`
-
- echo $objects
- echo $bytes
- if [ $objects != 'N/A' ] || [ $bytes != 'N/A' ] ;
- then
- return 1
- fi
-
- ceph osd pool set-quota $poolname max_objects 1000
- ceph osd pool set-quota $poolname max_bytes 1024
-
- objects=`ceph df detail | grep -w $poolname|awk '{print $3}'`
- bytes=`ceph df detail | grep -w $poolname|awk '{print $4}'`
-
- if [ $objects != '1000' ] || [ $bytes != '1024' ] ;
- then
- return 1
- fi
-
- ceph osd pool delete $poolname $poolname --yes-i-really-really-mean-it
- teardown $dir || return 1
-}
-
-main testpoolquota
std::cerr << "ioctx_create " << pool_name << " failed with " << ret << std::endl;
exit(1);
}
+ ioctx.application_enable("rados", true);
ret = ioctx.create(obj_name, false);
if (ret < 0) {
}
ioctx.close();
+ ret = cluster.pool_delete(pool_name.c_str());
+ if (ret < 0) {
+ std::cerr << "pool_delete failed with " << ret << std::endl;
+ exit(1);
+ }
}
#pragma GCC diagnostic pop
EXPECT_EQ(1, (int)extents.size());
}
+// Reserve and allocate 4 MiB in 1 MiB units from three free ranges where the
+// second range's length (1064960 bytes) is not a multiple of the allocation
+// unit, and expect the whole request to be satisfied.
+TEST_P(AllocTest, test_alloc_non_aligned_len)
+{
+ int64_t block_size = 1 << 12;
+ int64_t blocks = (1 << 20) * 100;
+ int64_t want_size = 1 << 22;
+ int64_t alloc_unit = 1 << 20;
+
+ init_alloc(blocks*block_size, block_size);
+ // Free space: 2 MiB at offset 0, then 1064960 bytes (1 MiB + 16 KiB) at
+ // offset 2 MiB, then 2 MiB at offset 3670016. The second range ends at
+ // 3162112, so there is a gap before the third range.
+ alloc->init_add_free(0, 2097152);
+ alloc->init_add_free(2097152, 1064960);
+ alloc->init_add_free(3670016, 2097152);
+
+ EXPECT_EQ(0, alloc->reserve(want_size));
+ AllocExtentVector extents;
+ EXPECT_EQ(want_size, alloc->allocate(want_size, alloc_unit, 0, &extents));
+}
+
INSTANTIATE_TEST_CASE_P(
Allocator,
class ObjectStore;
-class StoreTestFixture : public ::testing::Test {
+class StoreTestFixture : virtual public ::testing::Test {
const std::string type;
const std::string data_dir;
ceph_test_rados
DESTINATION ${CMAKE_INSTALL_BINDIR})
-# scripts
-add_ceph_test(osd-bench.sh ${CMAKE_CURRENT_SOURCE_DIR}/osd-bench.sh)
-add_ceph_test(osd-config.sh ${CMAKE_CURRENT_SOURCE_DIR}/osd-config.sh)
-add_ceph_test(osd-markdown.sh ${CMAKE_CURRENT_SOURCE_DIR}/osd-markdown.sh)
-add_ceph_test(osd-reactivate.sh ${CMAKE_CURRENT_SOURCE_DIR}/osd-reactivate.sh)
-add_ceph_test(osd-reuse-id.sh ${CMAKE_CURRENT_SOURCE_DIR}/osd-reuse-id.sh)
-add_ceph_test(osd-scrub-repair.sh ${CMAKE_CURRENT_SOURCE_DIR}/osd-scrub-repair.sh)
-add_ceph_test(osd-scrub-snaps.sh ${CMAKE_CURRENT_SOURCE_DIR}/osd-scrub-snaps.sh)
-add_ceph_test(osd-copy-from.sh ${CMAKE_CURRENT_SOURCE_DIR}/osd-copy-from.sh)
-add_ceph_test(osd-fast-mark-down.sh ${CMAKE_CURRENT_SOURCE_DIR}/osd-fast-mark-down.sh)
-if(HAVE_LIBAIO)
- add_ceph_test(osd-dup.sh ${CMAKE_CURRENT_SOURCE_DIR}/osd-dup.sh)
-endif()
-
# unittest_osdmap
add_executable(unittest_osdmap
TestOSDMap.cc
add_executable(unittest_pglog
TestPGLog.cc
$<TARGET_OBJECTS:unit-main>
+ $<TARGET_OBJECTS:store_test_fixture>
)
add_ceph_unittest(unittest_pglog ${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/unittest_pglog)
-target_link_libraries(unittest_pglog osd global ${CMAKE_DL_LIBS} ${BLKID_LIBRARIES})
+target_link_libraries(unittest_pglog osd os global ${CMAKE_DL_LIBS} ${BLKID_LIBRARIES})
# unittest_hitset
add_executable(unittest_hitset
#include "osd/PGLog.h"
#include "osd/OSDMap.h"
#include "include/coredumpctl.h"
+#include "../objectstore/store_test_fixture.h"
-class PGLogTest : public ::testing::Test, protected PGLog {
-public:
- PGLogTest() : PGLog(g_ceph_context) {}
- void SetUp() override { }
-
- void TearDown() override {
- clear();
- }
+struct PGLogTestBase {
static hobject_t mk_obj(unsigned id) {
hobject_t hoid;
stringstream ss;
ss << "obj_" << id;
hoid.oid = ss.str();
hoid.set_hash(id);
+ hoid.pool = 1;
return hoid;
}
static eversion_t mk_evt(unsigned ep, unsigned v) {
return eversion_t(ep, v);
}
static pg_log_entry_t mk_ple_mod(
- const hobject_t &hoid, eversion_t v, eversion_t pv) {
+ const hobject_t &hoid, eversion_t v, eversion_t pv, osd_reqid_t reqid) {
pg_log_entry_t e;
e.mark_unrollbackable();
e.op = pg_log_entry_t::MODIFY;
e.soid = hoid;
e.version = v;
e.prior_version = pv;
+ e.reqid = reqid;
return e;
}
static pg_log_entry_t mk_ple_dt(
- const hobject_t &hoid, eversion_t v, eversion_t pv) {
+ const hobject_t &hoid, eversion_t v, eversion_t pv, osd_reqid_t reqid) {
pg_log_entry_t e;
e.mark_unrollbackable();
e.op = pg_log_entry_t::DELETE;
e.soid = hoid;
e.version = v;
e.prior_version = pv;
+ e.reqid = reqid;
return e;
}
- static pg_log_entry_t mk_ple_mod_rb(
+ static pg_log_entry_t mk_ple_ldt(
const hobject_t &hoid, eversion_t v, eversion_t pv) {
pg_log_entry_t e;
+ e.mark_unrollbackable();
+ e.op = pg_log_entry_t::LOST_DELETE;
+ e.soid = hoid;
+ e.version = v;
+ e.prior_version = pv;
+ return e;
+ }
+ static pg_log_entry_t mk_ple_mod_rb(
+ const hobject_t &hoid, eversion_t v, eversion_t pv, osd_reqid_t reqid) {
+ pg_log_entry_t e;
e.op = pg_log_entry_t::MODIFY;
e.soid = hoid;
e.version = v;
e.prior_version = pv;
+ e.reqid = reqid;
return e;
}
static pg_log_entry_t mk_ple_dt_rb(
- const hobject_t &hoid, eversion_t v, eversion_t pv) {
+ const hobject_t &hoid, eversion_t v, eversion_t pv, osd_reqid_t reqid) {
pg_log_entry_t e;
e.op = pg_log_entry_t::DELETE;
e.soid = hoid;
e.version = v;
e.prior_version = pv;
+ e.reqid = reqid;
return e;
}
+ static pg_log_entry_t mk_ple_mod(
+ const hobject_t &hoid, eversion_t v, eversion_t pv) {
+ return mk_ple_mod(hoid, v, pv, osd_reqid_t());
+ }
+ static pg_log_entry_t mk_ple_dt(
+ const hobject_t &hoid, eversion_t v, eversion_t pv) {
+ return mk_ple_dt(hoid, v, pv, osd_reqid_t());
+ }
+ static pg_log_entry_t mk_ple_mod_rb(
+ const hobject_t &hoid, eversion_t v, eversion_t pv) {
+ return mk_ple_mod_rb(hoid, v, pv, osd_reqid_t());
+ }
+ static pg_log_entry_t mk_ple_dt_rb(
+ const hobject_t &hoid, eversion_t v, eversion_t pv) {
+ return mk_ple_dt_rb(hoid, v, pv, osd_reqid_t());
+ }
+}; // PGLogTestBase
+
+
+class PGLogTest : virtual public ::testing::Test, protected PGLog, public PGLogTestBase {
+public:
+ PGLogTest() : PGLog(g_ceph_context) {}
+ void SetUp() override {
+ missing.may_include_deletes = true;
+ }
+
+#include "common/ceph_context.h"
+#include "common/config.h"
+
+ void TearDown() override {
+ clear();
+ }
+
struct TestCase {
list<pg_log_entry_t> base;
set<hobject_t> toremove;
list<pg_log_entry_t> torollback;
+ bool deletes_during_peering;
private:
IndexedLog fullauth;
pg_info_t authinfo;
pg_info_t divinfo;
public:
+ TestCase() : deletes_during_peering(false) {}
void setup() {
+ init.may_include_deletes = !deletes_during_peering;
+ final.may_include_deletes = !deletes_during_peering;
fullauth.log.insert(fullauth.log.end(), base.begin(), base.end());
fullauth.log.insert(fullauth.log.end(), auth.begin(), auth.end());
fulldiv.log.insert(fulldiv.log.end(), base.begin(), base.end());
const IndexedLog &get_fulldiv() const { return fulldiv; }
const pg_info_t &get_authinfo() const { return authinfo; }
const pg_info_t &get_divinfo() const { return divinfo; }
- };
+ }; // struct TestCase
struct LogHandler : public PGLog::LogEntryHandler {
set<hobject_t> removed;
list<pg_log_entry_t> rolledback;
-
+
void rollback(
const pg_log_entry_t &entry) override {
rolledback.push_back(entry);
ASSERT_EQ(info.last_update, oinfo.last_update);
verify_missing(tcase, missing);
verify_sideeffects(tcase, h);
- };
+ }
+
void test_proc_replica_log(const TestCase &tcase) {
clear();
log = tcase.get_fullauth();
pg_info_t oinfo = tcase.get_divinfo();
proc_replica_log(
- oinfo, olog, omissing, pg_shard_t(1, shard_id_t(0)));
+ oinfo, olog, omissing, pg_shard_t(1, shard_id_t(0)));
assert(oinfo.last_update >= log.tail);
for (list<pg_log_entry_t>::const_iterator i = tcase.auth.begin();
i != tcase.auth.end();
++i) {
- if (i->version > oinfo.last_update)
- omissing.add_next_event(*i);
+ if (i->version > oinfo.last_update) {
+ if (i->is_delete() && tcase.deletes_during_peering) {
+ omissing.rm(i->soid, i->version);
+ } else {
+ omissing.add_next_event(*i);
+ }
+ }
}
verify_missing(tcase, omissing);
- }
+ } // test_proc_replica_log
+
void run_test_case(const TestCase &tcase) {
test_merge_log(tcase);
test_proc_replica_log(tcase);
}
-};
+}; // class PGLogTest
struct TestHandler : public PGLog::LogEntryHandler {
list<hobject_t> &removed;
// the old entry (from the log entry given in argument) is not a CLONE and
// the old entry (from the log entry given in argument) is not a DELETE and
// the old entry prior_version is lower than the tail of the log :
- // add the old object to the remove_snap list and
+ // add the old object to the remove_snap list and
// add the old object to divergent priors and
// add or update the prior_version of the object to missing and
// return false
oe.op = pg_log_entry_t::MODIFY;
oe.prior_version = eversion_t();
- missing.add(oe.soid, eversion_t(1,1), eversion_t());
+ missing.add(oe.soid, eversion_t(1,1), eversion_t(), false);
missing.flush();
EXPECT_FALSE(is_dirty());
oinfo.stats.reported_epoch = 1;
log.tail = olog.tail = eversion_t(1, 1);
log.head = olog.head = eversion_t(2, 1);
+ missing.may_include_deletes = false;
EXPECT_FALSE(missing.have_missing());
EXPECT_EQ(0U, log.log.size());
list<hobject_t> remove_snap;
bool dirty_info = false;
bool dirty_big_info = false;
+ missing.may_include_deletes = false;
{
pg_log_entry_t e;
EXPECT_TRUE(dirty_big_info);
}
+ /* +--------------------------+
+ | log olog |
+ +--------+-------+---------+
+ | |object | |
+ |version | hash | version |
+ | | | |
+ tail > (1,1) | x5 | (1,1) < tail
+ | | | |
+ | | | |
+ | (1,2) | x3 | (1,2) < lower_bound
+ | | | |
+ | | | |
+ head > (1,3) | x9 | |
+ | DELETE | | |
+ | | | |
+ | | x9 | (2,3) |
+ | | | MODIFY |
+ | | | |
+ | | x7 | (2,4) < head
+ | | | DELETE |
+ +--------+-------+---------+
+
+ The log entry (1,3) deletes the object x9 but the olog entry (2,3) modifies
+ it and is authoritative : the log entry (1,3) is divergent.
+
+ */
+ {
+ clear();
+
+ pg_log_t olog;
+ pg_info_t oinfo;
+ pg_shard_t fromosd;
+ pg_info_t info;
+ list<hobject_t> remove_snap;
+ bool dirty_info = false;
+ bool dirty_big_info = false;
+
+ hobject_t divergent_object;
+ missing.may_include_deletes = true;
+
+ {
+ pg_log_entry_t e;
+ e.mark_unrollbackable();
+
+ e.version = eversion_t(1, 1);
+ e.soid.set_hash(0x5);
+ log.tail = e.version;
+ log.log.push_back(e);
+ e.version = eversion_t(1, 2);
+ e.soid.set_hash(0x3);
+ log.log.push_back(e);
+ e.version = eversion_t(1,3);
+ e.soid.set_hash(0x9);
+ divergent_object = e.soid;
+ e.op = pg_log_entry_t::DELETE;
+ log.log.push_back(e);
+ log.head = e.version;
+ log.index();
+
+ info.last_update = log.head;
+
+ e.version = eversion_t(1, 1);
+ e.soid.set_hash(0x5);
+ olog.tail = e.version;
+ olog.log.push_back(e);
+ e.version = eversion_t(1, 2);
+ e.soid.set_hash(0x3);
+ olog.log.push_back(e);
+ e.version = eversion_t(2, 3);
+ e.soid.set_hash(0x9);
+ e.op = pg_log_entry_t::MODIFY;
+ olog.log.push_back(e);
+ e.version = eversion_t(2, 4);
+ e.soid.set_hash(0x7);
+ e.op = pg_log_entry_t::DELETE;
+ olog.log.push_back(e);
+ olog.head = e.version;
+ }
+
+ snapid_t purged_snap(1);
+ {
+ oinfo.last_update = olog.head;
+ oinfo.purged_snaps.insert(purged_snap);
+ }
+
+ EXPECT_FALSE(missing.have_missing());
+ EXPECT_EQ(1U, log.objects.count(divergent_object));
+ EXPECT_EQ(3U, log.log.size());
+ EXPECT_TRUE(remove_snap.empty());
+ EXPECT_EQ(log.head, info.last_update);
+ EXPECT_TRUE(info.purged_snaps.empty());
+ EXPECT_FALSE(is_dirty());
+ EXPECT_FALSE(dirty_info);
+ EXPECT_FALSE(dirty_big_info);
+
+ TestHandler h(remove_snap);
+ merge_log(oinfo, olog, fromosd, info, &h,
+ dirty_info, dirty_big_info);
+
+ /* When the divergent entry is a DELETE and the authoritative
+ entry is a MODIFY, the object will be added to missing : it is
+ a verifiable side effect proving the entry was identified
+ to be divergent.
+ */
+ EXPECT_TRUE(missing.is_missing(divergent_object));
+ EXPECT_EQ(1U, log.objects.count(divergent_object));
+ EXPECT_EQ(4U, log.log.size());
+ /* DELETE entries from olog that are appended to the head of the
+ log, and the divergent version of the object is removed (added
+ to remove_snap)
+ */
+ EXPECT_EQ(0x9U, remove_snap.front().get_hash());
+ EXPECT_EQ(log.head, info.last_update);
+ EXPECT_TRUE(info.purged_snaps.contains(purged_snap));
+ EXPECT_TRUE(is_dirty());
+ EXPECT_TRUE(dirty_info);
+ EXPECT_TRUE(dirty_big_info);
+ }
+
/* +--------------------------+
| log olog |
+--------+-------+---------+
EXPECT_FALSE(dirty_big_info);
TestHandler h(remove_snap);
+ missing.may_include_deletes = false;
merge_log(oinfo, olog, fromosd, info, &h,
dirty_info, dirty_big_info);
EXPECT_EQ(1U, log.objects.count(divergent_object));
EXPECT_EQ(4U, log.log.size());
/* DELETE entries from olog that are appended to the hed of the
- log are also added to remove_snap.
+ log, and the divergent version of the object is removed (added
+ to remove_snap). When peering handles deletes, it is the earlier
+ version that is in the removed list.
*/
EXPECT_EQ(0x7U, remove_snap.front().get_hash());
EXPECT_EQ(log.head, info.last_update);
EXPECT_FALSE(dirty_big_info);
TestHandler h(remove_snap);
+ missing.may_include_deletes = false;
merge_log(oinfo, olog, fromosd, info, &h,
dirty_info, dirty_big_info);
EXPECT_EQ(last_update, oinfo.last_update);
EXPECT_EQ(last_complete, oinfo.last_complete);
+ missing.may_include_deletes = false;
proc_replica_log(oinfo, olog, omissing, from);
EXPECT_FALSE(omissing.have_missing());
| | | DELETE |
| | | |
+--------+-------+---------+
-
+
The log entry (1,3) deletes the object x9 and the olog entry
(2,3) also deletes it : do nothing. The olog tail is ignored
because it is before the log tail.
-
+
*/
{
clear();
EXPECT_EQ(olog.head, oinfo.last_update);
EXPECT_EQ(olog.head, oinfo.last_complete);
+ missing.may_include_deletes = false;
proc_replica_log(oinfo, olog, omissing, from);
EXPECT_FALSE(omissing.have_missing());
EXPECT_EQ(olog.head, oinfo.last_update);
EXPECT_EQ(olog.head, oinfo.last_complete);
+ missing.may_include_deletes = false;
proc_replica_log(oinfo, olog, omissing, from);
EXPECT_TRUE(omissing.have_missing());
EXPECT_EQ(olog.head, oinfo.last_update);
EXPECT_EQ(olog.head, oinfo.last_complete);
+ missing.may_include_deletes = false;
proc_replica_log(oinfo, olog, omissing, from);
EXPECT_TRUE(omissing.have_missing());
e.prior_version = eversion_t(1, 1);
e.soid = divergent_object;
divergent_object = e.soid;
- omissing.add(divergent_object, e.version, eversion_t());
+ omissing.add(divergent_object, e.version, eversion_t(), false);
e.op = pg_log_entry_t::MODIFY;
olog.log.push_back(e);
olog.head = e.version;
EXPECT_EQ(olog.head, oinfo.last_update);
EXPECT_EQ(olog.head, oinfo.last_complete);
+ missing.may_include_deletes = false;
proc_replica_log(oinfo, olog, omissing, from);
EXPECT_TRUE(omissing.have_missing());
e.prior_version = eversion_t(1, 1);
e.soid.set_hash(0x9);
divergent_object = e.soid;
- omissing.add(divergent_object, e.version, eversion_t());
+ omissing.add(divergent_object, e.version, eversion_t(), false);
e.op = pg_log_entry_t::MODIFY;
olog.log.push_back(e);
olog.head = e.version;
EXPECT_EQ(olog.head, oinfo.last_update);
EXPECT_EQ(olog.head, oinfo.last_complete);
+ missing.may_include_deletes = false;
proc_replica_log(oinfo, olog, omissing, from);
EXPECT_TRUE(omissing.have_missing());
t.div.push_back(mk_ple_mod(mk_obj(1), mk_evt(10, 101), mk_evt(10, 100)));
- t.final.add(mk_obj(1), mk_evt(10, 100), mk_evt(0, 0));
+ t.final.add(mk_obj(1), mk_evt(10, 100), mk_evt(0, 0), false);
t.toremove.insert(mk_obj(1));
t.div.push_back(mk_ple_mod(mk_obj(1), mk_evt(10, 101), mk_evt(10, 100)));
t.div.push_back(mk_ple_mod_rb(mk_obj(1), mk_evt(10, 102), mk_evt(10, 101)));
- t.final.add(mk_obj(1), mk_evt(10, 100), mk_evt(0, 0));
+ t.final.add(mk_obj(1), mk_evt(10, 100), mk_evt(0, 0), false);
t.toremove.insert(mk_obj(1));
t.div.push_back(mk_ple_mod_rb(mk_obj(1), mk_evt(10, 101), mk_evt(10, 100)));
t.div.push_back(mk_ple_mod_rb(mk_obj(1), mk_evt(10, 102), mk_evt(10, 101)));
- t.init.add(mk_obj(1), mk_evt(10, 102), mk_evt(0, 0));
- t.final.add(mk_obj(1), mk_evt(10, 100), mk_evt(0, 0));
+ t.init.add(mk_obj(1), mk_evt(10, 102), mk_evt(0, 0), false);
+ t.final.add(mk_obj(1), mk_evt(10, 100), mk_evt(0, 0), false);
t.setup();
run_test_case(t);
t.auth.push_back(mk_ple_mod(mk_obj(1), mk_evt(11, 101), mk_evt(10, 100)));
- t.final.add(mk_obj(1), mk_evt(11, 101), mk_evt(0, 0));
+ t.final.add(mk_obj(1), mk_evt(11, 101), mk_evt(0, 0), false);
t.toremove.insert(mk_obj(1));
t.auth.push_back(mk_ple_mod(mk_obj(1), mk_evt(11, 101), mk_evt(10, 100)));
- t.final.add(mk_obj(1), mk_evt(11, 101), mk_evt(10, 100));
+ t.final.add(mk_obj(1), mk_evt(11, 101), mk_evt(10, 100), false);
t.setup();
run_test_case(t);
t.auth.push_back(mk_ple_mod(mk_obj(1), mk_evt(11, 101), mk_evt(10, 100)));
- t.init.add(mk_obj(1), mk_evt(10, 100), mk_evt(8, 80));
- t.final.add(mk_obj(1), mk_evt(11, 101), mk_evt(8, 80));
+ t.init.add(mk_obj(1), mk_evt(10, 100), mk_evt(8, 80), false);
+ t.final.add(mk_obj(1), mk_evt(11, 101), mk_evt(8, 80), false);
t.setup();
run_test_case(t);
t.auth.push_back(mk_ple_dt(mk_obj(1), mk_evt(11, 101), mk_evt(10, 100)));
- t.init.add(mk_obj(1), mk_evt(10, 100), mk_evt(8, 80));
+ t.init.add(mk_obj(1), mk_evt(10, 100), mk_evt(8, 80), false);
+ t.final.add(mk_obj(1), mk_evt(11, 101), mk_evt(8, 80), true);
+
+ t.setup();
+ run_test_case(t);
+}
+
+// The authoritative log deletes an object that starts out missing, with
+// deletes_during_peering enabled (TestCase::setup() then clears
+// may_include_deletes on the init and final missing sets). The object is
+// expected in toremove and no final missing entry is declared.
+TEST_F(PGLogTest, merge_log_9) {
+ TestCase t;
+ t.base.push_back(mk_ple_mod_rb(mk_obj(1), mk_evt(10, 100), mk_evt(8, 80)));
+
+ t.auth.push_back(mk_ple_dt(mk_obj(1), mk_evt(11, 101), mk_evt(10, 100)));
+ t.init.add(mk_obj(1), mk_evt(10, 100), mk_evt(8, 80), false);
t.toremove.insert(mk_obj(1));
+ t.deletes_during_peering = true;
+
+ t.setup();
+ run_test_case(t);
+}
+
+TEST_F(PGLogTest, merge_log_10) {
+ TestCase t;
+ t.base.push_back(mk_ple_mod_rb(mk_obj(1), mk_evt(10, 100), mk_evt(8, 80)));
+
+ t.auth.push_back(mk_ple_ldt(mk_obj(1), mk_evt(11, 101), mk_evt(10, 100)));
+
+ t.init.add(mk_obj(1), mk_evt(10, 100), mk_evt(8, 80), false);
+ t.final.add(mk_obj(1), mk_evt(11, 101), mk_evt(8, 80), true);
t.setup();
run_test_case(t);
t.div.push_back(mk_ple_mod(mk_obj(1), mk_evt(10, 101), mk_evt(10, 100)));
- t.init.add(mk_obj(1), mk_evt(10, 101), mk_evt(10, 100));
+ t.init.add(mk_obj(1), mk_evt(10, 101), mk_evt(10, 100), false);
t.setup();
run_test_case(t);
t.setup();
t.set_div_bounds(mk_evt(9, 79), mk_evt(8, 69));
t.set_auth_bounds(mk_evt(15, 160), mk_evt(9, 77));
- t.final.add(mk_obj(1), mk_evt(15, 150), mk_evt(8, 70));
+ t.final.add(mk_obj(1), mk_evt(15, 150), mk_evt(8, 70), false);
run_test_case(t);
}
t.setup();
t.set_div_bounds(mk_evt(15, 153), mk_evt(15, 151));
t.set_auth_bounds(mk_evt(15, 156), mk_evt(10, 99));
- t.final.add(mk_obj(1), mk_evt(15, 155), mk_evt(15, 150));
+ t.final.add(mk_obj(1), mk_evt(15, 155), mk_evt(15, 150), false);
run_test_case(t);
}
t.setup();
t.set_div_bounds(mk_evt(15, 153), mk_evt(15, 151));
t.set_auth_bounds(mk_evt(16, 156), mk_evt(10, 99));
- t.final.add(mk_obj(1), mk_evt(16, 155), mk_evt(0, 0));
+ t.final.add(mk_obj(1), mk_evt(16, 155), mk_evt(0, 0), false);
t.toremove.insert(mk_obj(1));
run_test_case(t);
}
EXPECT_EQ(del.reqid, entry->reqid);
}
+// split_into() must hand the parent's may_include_deletes setting to the
+// child log: the child's missing set and its rebuilt_missing_with_deletes
+// flag are both expected to equal the value the parent had at split time.
+TEST_F(PGLogTest, split_into_preserves_may_include_deletes) {
+  clear();
+
+  // Same scenario twice: first with the parent flag set, then cleared.
+  for (const bool parent_flag : {true, false}) {
+    rebuilt_missing_with_deletes = false;
+    missing.may_include_deletes = parent_flag;
+
+    pg_t child_pg;
+    PGLog child_log(cct, prefix_provider);
+    split_into(child_pg, 6, &child_log);
+
+    ASSERT_EQ(parent_flag, child_log.get_missing().may_include_deletes);
+    ASSERT_EQ(parent_flag, child_log.get_rebuilt_missing_with_deletes());
+  }
+}
+
+// Fixture for rebuild_missing_set_with_deletes(): combines the PGLog test
+// helpers with a "memstore"-backed object store that holds one collection and
+// one object (existing_oid) whose stored object_info_t version is (6, 2).
+// nonexistent_oid is never written to the store.
+class PGLogTestRebuildMissing : public PGLogTest, public StoreTestFixture {
+public:
+  PGLogTestRebuildMissing() : PGLogTest(), StoreTestFixture("memstore") {}
+
+  // Create the test collection, then create existing_oid with an encoded
+  // object_info_t stored under OI_ATTR. Both transactions must succeed,
+  // otherwise the test body would run against a half-initialised store.
+  void SetUp() override {
+    StoreTestFixture::SetUp();
+    ObjectStore::Sequencer osr(__func__);
+    ObjectStore::Transaction t;
+    test_coll = coll_t(spg_t(pg_t(1, 1)));
+    t.create_collection(test_coll, 0);
+    ASSERT_EQ(0u, store->apply_transaction(&osr, std::move(t)));
+    existing_oid = mk_obj(0);
+    nonexistent_oid = mk_obj(1);
+    const ghobject_t existing_ghobj(existing_oid);
+    object_info_t existing_info;
+    existing_info.version = eversion_t(6, 2);
+    bufferlist enc_oi;
+    ::encode(existing_info, enc_oi, 0);
+    ObjectStore::Transaction t2;
+    t2.touch(test_coll, existing_ghobj);
+    t2.setattr(test_coll, existing_ghobj, OI_ATTR, enc_oi);
+    ASSERT_EQ(0u, store->apply_transaction(&osr, std::move(t2)));
+    info.last_backfill = hobject_t::get_max();
+    info.last_complete = eversion_t();
+  }
+
+  // Clear the log, reset may_include_deletes and tear down the store.
+  void TearDown() override {
+    clear();
+    missing.may_include_deletes = false;
+    StoreTestFixture::TearDown();
+  }
+
+  pg_info_t info;
+  coll_t test_coll;
+  hobject_t existing_oid, nonexistent_oid;
+
+  // Rebuild the missing set from the current log and store contents and
+  // require that it equals expected_missing_items exactly.
+  void run_rebuild_missing_test(const map<hobject_t, pg_missing_item> &expected_missing_items) {
+    rebuild_missing_set_with_deletes(store.get(), test_coll, info);
+    ASSERT_EQ(expected_missing_items, missing.get_items());
+  }
+};
+
+// With no log entries, rebuilding is expected to leave the two pre-existing
+// missing items exactly as they were.
+TEST_F(PGLogTestRebuildMissing, EmptyLog) {
+ missing.add(existing_oid, mk_evt(6, 2), mk_evt(6, 3), false);
+ missing.add(nonexistent_oid, mk_evt(7, 4), mk_evt(0, 0), false);
+ map<hobject_t, pg_missing_item> orig_missing = missing.get_items();
+ run_rebuild_missing_test(orig_missing);
+}
+
+// The log holds a modify of existing_oid at (6, 2), the same version the
+// fixture's SetUp() stored for that object. The rebuilt missing set is
+// expected to be empty.
+TEST_F(PGLogTestRebuildMissing, SameVersionMod) {
+ missing.add(existing_oid, mk_evt(6, 2), mk_evt(6, 1), false);
+ log.add(mk_ple_mod(existing_oid, mk_evt(6, 2), mk_evt(6, 1)));
+ map<hobject_t, pg_missing_item> empty_missing;
+ run_rebuild_missing_test(empty_missing);
+}
+
+// The log deletes an object that exists in the store at (6, 2). The rebuilt
+// entry is expected to need (7, 5), have (6, 2), and carry `true` as the
+// third pg_missing_item argument (presumably the is-delete flag; confirm
+// against pg_missing_item).
+TEST_F(PGLogTestRebuildMissing, DelExisting) {
+ missing.add(existing_oid, mk_evt(6, 3), mk_evt(6, 2), false);
+ log.add(mk_ple_dt(existing_oid, mk_evt(7, 5), mk_evt(7, 4)));
+ map<hobject_t, pg_missing_item> expected;
+ expected[existing_oid] = pg_missing_item(mk_evt(7, 5), mk_evt(6, 2), true);
+ run_rebuild_missing_test(expected);
+}
+
+// The log deletes an object that was never written to the store. The rebuilt
+// entry is expected to need (7, 5) and have (0, 0), with the third
+// pg_missing_item argument set to true.
+TEST_F(PGLogTestRebuildMissing, DelNonexistent) {
+ log.add(mk_ple_dt(nonexistent_oid, mk_evt(7, 5), mk_evt(7, 4)));
+ map<hobject_t, pg_missing_item> expected;
+ expected[nonexistent_oid] = pg_missing_item(mk_evt(7, 5), mk_evt(0, 0), true);
+ run_rebuild_missing_test(expected);
+}
+
+// A missing item for an object that has no log entry (mk_obj(10)) is expected
+// to survive the rebuild unchanged, next to the entry produced by the logged
+// delete of nonexistent_oid.
+TEST_F(PGLogTestRebuildMissing, MissingNotInLog) {
+ missing.add(mk_obj(10), mk_evt(8, 12), mk_evt(8, 10), false);
+ log.add(mk_ple_dt(nonexistent_oid, mk_evt(7, 5), mk_evt(7, 4)));
+ map<hobject_t, pg_missing_item> expected;
+ expected[nonexistent_oid] = pg_missing_item(mk_evt(7, 5), mk_evt(0, 0), true);
+ expected[mk_obj(10)] = pg_missing_item(mk_evt(8, 12), mk_evt(8, 10), false);
+ run_rebuild_missing_test(expected);
+}
+
+
+// Fixture for the merge_log_dups() tests. It derives from PGLog so test
+// bodies can reach the log member directly, and provides helpers that build
+// dup entries, append them to a log, and verify log.dups afterwards.
+class PGLogMergeDupsTest : public ::testing::Test, protected PGLog {
+
+public:
+
+ PGLogMergeDupsTest() : PGLog(g_ceph_context) { }
+
+ void SetUp() override { }
+
+ // Calls clear() after every test.
+ void TearDown() override {
+ clear();
+ }
+
+ // Build a dup entry whose version is (a, b). `a` is also passed as the
+ // second pg_log_dup_t argument and 0 as the last one.
+ static pg_log_dup_t create_dup_entry(uint a, uint b) {
+ // make each dup_entry unique by using different client id's
+ static uint client_id = 777;
+ return pg_log_dup_t(eversion_t(a, b),
+ a,
+ osd_reqid_t(entity_name_t::CLIENT(client_id++), 8, 1),
+ 0);
+ }
+
+ // Five dup entries with versions from (10, 11) to (13, 99).
+ static std::vector<pg_log_dup_t> example_dups_1() {
+ std::vector<pg_log_dup_t> result = {
+ create_dup_entry(10, 11),
+ create_dup_entry(10, 12),
+ create_dup_entry(11, 1),
+ create_dup_entry(12, 3),
+ create_dup_entry(13, 99)
+ };
+ return result;
+ }
+
+ // Five dup entries with versions from (12, 3) to (16, 32). The first two
+ // versions also appear in example_dups_1().
+ static std::vector<pg_log_dup_t> example_dups_2() {
+ std::vector<pg_log_dup_t> result = {
+ create_dup_entry(12, 3),
+ create_dup_entry(13, 99),
+ create_dup_entry(15, 11),
+ create_dup_entry(16, 14),
+ create_dup_entry(16, 32)
+ };
+ return result;
+ }
+
+ // Append one new dup entry with version (a, b) to this fixture's log.
+ void add_dups(uint a, uint b) {
+ log.dups.push_back(create_dup_entry(a, b));
+ }
+
+ // Append every entry of l to this fixture's log, in order.
+ void add_dups(const std::vector<pg_log_dup_t>& l) {
+ for (auto& i : l) {
+ log.dups.push_back(i);
+ }
+ }
+
+ // Append every entry of dups to the given log, in order.
+ static void add_dups(IndexedLog& log, const std::vector<pg_log_dup_t>& dups) {
+ for (auto& i : dups) {
+ log.dups.push_back(i);
+ }
+ }
+
+ // Expect the versions in log.dups to be strictly increasing.
+ void check_order() {
+ eversion_t prev(0, 0);
+
+ for (auto& i : log.dups) {
+ EXPECT_LT(prev, i.version) << "verify versions monotonically increase";
+ prev = i.version;
+ }
+ }
+
+ // Expect dup_index to hold exactly one entry per element of log.dups,
+ // looked up by reqid.
+ void check_index() {
+ EXPECT_EQ(log.dups.size(), log.dup_index.size());
+ for (auto& i : log.dups) {
+ EXPECT_EQ(1u, log.dup_index.count(i.reqid));
+ }
+ }
+};
+
+// The local log holds example_dups_1 and the other log has no dups. The merge
+// is expected to report no change and leave the five local entries in place.
+TEST_F(PGLogMergeDupsTest, OtherEmpty) {
+ log.tail = eversion_t(14, 5);
+
+ IndexedLog olog;
+
+ add_dups(example_dups_1());
+ index();
+
+ bool changed = merge_log_dups(olog);
+
+ EXPECT_FALSE(changed);
+ EXPECT_EQ(5u, log.dups.size());
+
+ if (5 == log.dups.size()) {
+ EXPECT_EQ(10u, log.dups.front().version.epoch);
+ EXPECT_EQ(11u, log.dups.front().version.version);
+ EXPECT_EQ(13u, log.dups.back().version.epoch);
+ EXPECT_EQ(99u, log.dups.back().version.version);
+ }
+
+ check_order();
+ check_index();
+}
+
+// The local log has no dups and the other log holds example_dups_1, all older
+// than the local tail (14, 5). The merge is expected to report a change and
+// copy all five entries.
+TEST_F(PGLogMergeDupsTest, AmEmpty) {
+ log.tail = eversion_t(14, 5);
+ index();
+
+ IndexedLog olog;
+
+ add_dups(olog, example_dups_1());
+
+ bool changed = merge_log_dups(olog);
+
+ EXPECT_TRUE(changed);
+ EXPECT_EQ(5u, log.dups.size());
+
+ if (5 == log.dups.size()) {
+ EXPECT_EQ(10u, log.dups.front().version.epoch);
+ EXPECT_EQ(11u, log.dups.front().version.version);
+
+ EXPECT_EQ(13u, log.dups.back().version.epoch);
+ EXPECT_EQ(99u, log.dups.back().version.version);
+ }
+
+ check_order();
+ check_index();
+}
+
+// The local log has no dups and its tail is (12, 3), which falls inside the
+// version range of the other log's example_dups_1. Only the three entries
+// older than the tail, (10, 11) through (11, 1), are expected to be merged.
+TEST_F(PGLogMergeDupsTest, AmEmptyOverlap) {
+ log.tail = eversion_t(12, 3);
+ index();
+
+ IndexedLog olog;
+
+ add_dups(olog, example_dups_1());
+
+ bool changed = merge_log_dups(olog);
+
+ EXPECT_TRUE(changed);
+ EXPECT_EQ(3u, log.dups.size());
+
+ if (3 == log.dups.size()) {
+ EXPECT_EQ(10u, log.dups.front().version.epoch);
+ EXPECT_EQ(11u, log.dups.front().version.version);
+
+ EXPECT_EQ(11u, log.dups.back().version.epoch);
+ EXPECT_EQ(1u, log.dups.back().version.version);
+ }
+
+ check_order();
+ check_index();
+}
+
+// Both logs hold dups with the same five versions (example_dups_1 built
+// twice). The merge is expected to report no change and keep five entries.
+TEST_F(PGLogMergeDupsTest, Same) {
+ log.tail = eversion_t(14, 1);
+
+ IndexedLog olog;
+
+ add_dups(example_dups_1());
+ index();
+ add_dups(olog, example_dups_1());
+
+ bool changed = merge_log_dups(olog);
+
+ EXPECT_FALSE(changed);
+ EXPECT_EQ(5u, log.dups.size());
+
+ if (5 == log.dups.size()) {
+ EXPECT_EQ(10u, log.dups.front().version.epoch);
+ EXPECT_EQ(11u, log.dups.front().version.version);
+
+ EXPECT_EQ(13u, log.dups.back().version.epoch);
+ EXPECT_EQ(99u, log.dups.back().version.version);
+ }
+
+ check_order();
+ check_index();
+}
+
+
+// The local log holds example_dups_1 and the other log holds the later
+// example_dups_2, with the local tail at (16, 14). Six entries are expected
+// afterwards, the newest being (15, 11).
+TEST_F(PGLogMergeDupsTest, Later) {
+ log.tail = eversion_t(16, 14);
+
+ IndexedLog olog;
+
+ add_dups(example_dups_1());
+ index();
+ add_dups(olog, example_dups_2());
+
+ bool changed = merge_log_dups(olog);
+
+ EXPECT_TRUE(changed);
+ EXPECT_EQ(6u, log.dups.size());
+
+ if (6 == log.dups.size()) {
+ EXPECT_EQ(10u, log.dups.front().version.epoch);
+ EXPECT_EQ(11u, log.dups.front().version.version);
+
+ EXPECT_EQ(15u, log.dups.back().version.epoch);
+ EXPECT_EQ(11u, log.dups.back().version.version);
+ }
+
+ check_order();
+ check_index();
+}
+
+
+// The local log holds example_dups_2 and the other log holds the earlier
+// example_dups_1. The two sets share versions (12, 3) and (13, 99), so eight
+// distinct versions are expected afterwards, from (10, 11) to (16, 32).
+TEST_F(PGLogMergeDupsTest, Earlier) {
+ log.tail = eversion_t(17, 2);
+
+ IndexedLog olog;
+
+ add_dups(example_dups_2());
+ index();
+ add_dups(olog, example_dups_1());
+
+ bool changed = merge_log_dups(olog);
+
+ EXPECT_TRUE(changed);
+ EXPECT_EQ(8u, log.dups.size());
+
+ // The guard must match the expected size asserted above; otherwise the
+ // front/back checks below never run when the merge is correct.
+ if (8 == log.dups.size()) {
+ EXPECT_EQ(10u, log.dups.front().version.epoch);
+ EXPECT_EQ(11u, log.dups.front().version.version);
+
+ EXPECT_EQ(16u, log.dups.back().version.epoch);
+ EXPECT_EQ(32u, log.dups.back().version.version);
+ }
+
+ check_order();
+ check_index();
+}
+
+
+// The other log holds one entry older, (9, 5), and one newer, (15, 11), than
+// everything in the local example_dups_1. Both are expected to be merged,
+// giving seven entries that start at (9, 5) and end at (15, 11).
+TEST_F(PGLogMergeDupsTest, Superset) {
+ log.tail = eversion_t(17, 2);
+
+ IndexedLog olog;
+
+ add_dups(example_dups_1());
+ index();
+
+ olog.dups.push_back(create_dup_entry(9, 5));
+ olog.dups.push_back(create_dup_entry(15, 11));
+
+ bool changed = merge_log_dups(olog);
+
+ EXPECT_TRUE(changed);
+ EXPECT_EQ(7u, log.dups.size());
+
+ if (7 == log.dups.size()) {
+ EXPECT_EQ(9u, log.dups.front().version.epoch);
+ EXPECT_EQ(5u, log.dups.front().version.version);
+
+ EXPECT_EQ(15u, log.dups.back().version.epoch);
+ EXPECT_EQ(11u, log.dups.back().version.version);
+ }
+
+ check_order();
+ check_index();
+}
+
+
+// Fixture for the IndexedLog::trim() tests. It owns a private CephContext so
+// each test can set the pg-log size options without touching the global
+// configuration, plus a list of generated hobject_t test instances.
+struct PGLogTrimTest :
+ public ::testing::Test,
+ public PGLogTestBase,
+ public PGLog::IndexedLog
+{
+ std::list<hobject_t*> test_hobjects;
+ CephContext *cct;
+
+ // Create the private context and the hobject_t test instances.
+ // NOTE(review): get() takes a reference on top of whatever the constructor
+ // already holds, while TearDown() calls put() only once. Confirm that the
+ // context is not leaked.
+ void SetUp() override {
+ cct = (new CephContext(CEPH_ENTITY_TYPE_OSD))->get();
+
+ hobject_t::generate_test_instances(test_hobjects);
+ }
+
+ // Not a gtest hook: called explicitly by test bodies to set
+ // osd_min_pg_log_entries, osd_max_pg_log_entries and
+ // osd_pg_log_dups_tracked on the private context.
+ void SetUp(unsigned min_entries, unsigned max_entries, unsigned dup_track) {
+ constexpr size_t size = 10;
+
+ char min_entries_s[size];
+ char max_entries_s[size];
+ char dup_track_s[size];
+
+ // The values are passed to set_val_or_die() as decimal strings.
+ snprintf(min_entries_s, size, "%u", min_entries);
+ snprintf(max_entries_s, size, "%u", max_entries);
+ snprintf(dup_track_s, size, "%u", dup_track);
+
+ cct->_conf->set_val_or_die("osd_min_pg_log_entries", min_entries_s);
+ cct->_conf->set_val_or_die("osd_max_pg_log_entries", max_entries_s);
+ cct->_conf->set_val_or_die("osd_pg_log_dups_tracked", dup_track_s);
+}
+
+ // Delete the generated hobject_t instances and drop one context reference.
+ void TearDown() override {
+ while (!test_hobjects.empty()) {
+ delete test_hobjects.front();
+ test_hobjects.pop_front();
+ }
+
+ cct->put();
+ }
+}; // struct PGLogTrimTest
+
+
+# if 0
+TEST_F(PGLogTest, Trim1) {
+ TestCase t;
+
+ t.auth.push_back(mk_ple_mod(mk_obj(1), mk_evt(10, 100), mk_evt(8, 70)));
+ t.auth.push_back(mk_ple_mod(mk_obj(1), mk_evt(15, 150), mk_evt(10, 100)));
+ t.auth.push_back(mk_ple_mod(mk_obj(1), mk_evt(15, 155), mk_evt(15, 150)));
+ t.auth.push_back(mk_ple_mod(mk_obj(1), mk_evt(20, 160), mk_evt(25, 152)));
+ t.auth.push_back(mk_ple_mod(mk_obj(1), mk_evt(21, 165), mk_evt(26, 160)));
+ t.auth.push_back(mk_ple_mod(mk_obj(1), mk_evt(21, 165), mk_evt(31, 171)));
+
+ t.setup();
+}
+#endif
+
+
+// Checks that SetUp(min, max, dups) stores the three values in the private
+// context's configuration.
+TEST_F(PGLogTrimTest, TestMakingCephContext)
+{
+ SetUp(1, 2, 5);
+
+ EXPECT_EQ(1u, cct->_conf->osd_min_pg_log_entries);
+ EXPECT_EQ(2u, cct->_conf->osd_max_pg_log_entries);
+ EXPECT_EQ(5u, cct->_conf->osd_pg_log_dups_tracked);
+}
+
+
+// Two successive trims of a six-entry log. With osd_pg_log_dups_tracked at
+// 20, trimming to (19, 157) is expected to remove three log entries and keep
+// two dups. After lowering the option to 15, trimming to (20, 164) is
+// expected to remove one more log entry and trim one dup, still leaving two.
+TEST_F(PGLogTrimTest, TestPartialTrim)
+{
+ SetUp(1, 2, 20);
+ PGLog::IndexedLog log;
+ // NOTE(review): head is raised before skip_can_rollback_to_to_head() and
+ // lowered again before the entries are added. The reason is not visible
+ // here; confirm against IndexedLog.
+ log.head = mk_evt(24, 0);
+ log.skip_can_rollback_to_to_head();
+ log.head = mk_evt(9, 0);
+
+ log.add(mk_ple_mod(mk_obj(1), mk_evt(10, 100), mk_evt(8, 70)));
+ log.add(mk_ple_dt(mk_obj(2), mk_evt(15, 150), mk_evt(10, 100)));
+ log.add(mk_ple_mod_rb(mk_obj(3), mk_evt(15, 155), mk_evt(15, 150)));
+ log.add(mk_ple_mod(mk_obj(1), mk_evt(19, 160), mk_evt(25, 152)));
+ log.add(mk_ple_mod(mk_obj(4), mk_evt(21, 165), mk_evt(26, 160)));
+ log.add(mk_ple_dt_rb(mk_obj(5), mk_evt(21, 167), mk_evt(31, 166)));
+
+ std::set<eversion_t> trimmed;
+ std::set<std::string> trimmed_dups;
+ bool dirty_dups = false;
+
+ log.trim(cct, mk_evt(19, 157), &trimmed, &trimmed_dups, &dirty_dups);
+
+ EXPECT_EQ(true, dirty_dups);
+ EXPECT_EQ(3u, log.log.size());
+ EXPECT_EQ(3u, trimmed.size());
+ EXPECT_EQ(2u, log.dups.size());
+ EXPECT_EQ(0u, trimmed_dups.size());
+
+ // Second trim, with fewer dups tracked.
+ SetUp(1, 2, 15);
+
+ std::set<eversion_t> trimmed2;
+ std::set<std::string> trimmed_dups2;
+ bool dirty_dups2 = false;
+
+ log.trim(cct, mk_evt(20, 164), &trimmed2, &trimmed_dups2, &dirty_dups2);
+
+ EXPECT_EQ(true, dirty_dups2);
+ EXPECT_EQ(2u, log.log.size());
+ EXPECT_EQ(1u, trimmed2.size());
+ EXPECT_EQ(2u, log.dups.size());
+ EXPECT_EQ(1u, trimmed_dups2.size());
+}
+
+
+// trim() is called with null pointers for both output sets. The log is still
+// expected to shrink to three entries, keep two dups, and set dirty_dups.
+TEST_F(PGLogTrimTest, TestTrimNoTrimmed) {
+ SetUp(1, 2, 20);
+ PGLog::IndexedLog log;
+ log.head = mk_evt(20, 0);
+ log.skip_can_rollback_to_to_head();
+ log.head = mk_evt(9, 0);
+
+ log.add(mk_ple_mod(mk_obj(1), mk_evt(10, 100), mk_evt(8, 70)));
+ log.add(mk_ple_dt(mk_obj(2), mk_evt(15, 150), mk_evt(10, 100)));
+ log.add(mk_ple_mod_rb(mk_obj(3), mk_evt(15, 155), mk_evt(15, 150)));
+ log.add(mk_ple_mod(mk_obj(1), mk_evt(20, 160), mk_evt(25, 152)));
+ log.add(mk_ple_mod(mk_obj(4), mk_evt(21, 165), mk_evt(26, 160)));
+ log.add(mk_ple_dt_rb(mk_obj(5), mk_evt(21, 167), mk_evt(31, 166)));
+
+ bool dirty_dups = false;
+
+ log.trim(cct, mk_evt(19, 157), nullptr, nullptr, &dirty_dups);
+
+ EXPECT_EQ(true, dirty_dups);
+ EXPECT_EQ(3u, log.log.size());
+ EXPECT_EQ(2u, log.dups.size());
+}
+
+
+// With osd_pg_log_dups_tracked set to 10, trimming to (19, 157) is expected
+// to remove three log entries without keeping any dups and without setting
+// dirty_dups.
+TEST_F(PGLogTrimTest, TestTrimNoDups)
+{
+ SetUp(1, 2, 10);
+ PGLog::IndexedLog log;
+ log.head = mk_evt(20, 0);
+ log.skip_can_rollback_to_to_head();
+ log.head = mk_evt(9, 0);
+
+ log.add(mk_ple_mod(mk_obj(1), mk_evt(10, 100), mk_evt(8, 70)));
+ log.add(mk_ple_dt(mk_obj(2), mk_evt(15, 150), mk_evt(10, 100)));
+ log.add(mk_ple_mod_rb(mk_obj(3), mk_evt(15, 155), mk_evt(15, 150)));
+ log.add(mk_ple_mod(mk_obj(1), mk_evt(20, 160), mk_evt(25, 152)));
+ log.add(mk_ple_mod(mk_obj(4), mk_evt(21, 165), mk_evt(26, 160)));
+ log.add(mk_ple_dt_rb(mk_obj(5), mk_evt(21, 167), mk_evt(31, 166)));
+
+ std::set<eversion_t> trimmed;
+ std::set<std::string> trimmed_dups;
+ bool dirty_dups = false;
+
+ log.trim(cct, mk_evt(19, 157), &trimmed, &trimmed_dups, &dirty_dups);
+
+ EXPECT_FALSE(dirty_dups);
+ EXPECT_EQ(3u, log.log.size());
+ EXPECT_EQ(3u, trimmed.size());
+ EXPECT_EQ(0u, log.dups.size());
+ EXPECT_EQ(0u, trimmed_dups.size());
+}
+
+// Trimming to (9, 99), which is older than every entry in the log, is
+// expected to change nothing: six log entries remain, and no dups are created
+// or trimmed.
+TEST_F(PGLogTrimTest, TestNoTrim)
+{
+ SetUp(1, 2, 20);
+ PGLog::IndexedLog log;
+ log.head = mk_evt(24, 0);
+ log.skip_can_rollback_to_to_head();
+ log.head = mk_evt(9, 0);
+
+ log.add(mk_ple_mod(mk_obj(1), mk_evt(10, 100), mk_evt(8, 70)));
+ log.add(mk_ple_dt(mk_obj(2), mk_evt(15, 150), mk_evt(10, 100)));
+ log.add(mk_ple_mod_rb(mk_obj(3), mk_evt(15, 155), mk_evt(15, 150)));
+ log.add(mk_ple_mod(mk_obj(1), mk_evt(19, 160), mk_evt(25, 152)));
+ log.add(mk_ple_mod(mk_obj(4), mk_evt(21, 165), mk_evt(26, 160)));
+ log.add(mk_ple_dt_rb(mk_obj(5), mk_evt(21, 167), mk_evt(31, 166)));
+
+ std::set<eversion_t> trimmed;
+ std::set<std::string> trimmed_dups;
+ bool dirty_dups = false;
+
+ log.trim(cct, mk_evt(9, 99), &trimmed, &trimmed_dups, &dirty_dups);
+
+ EXPECT_FALSE(dirty_dups);
+ EXPECT_EQ(6u, log.log.size());
+ EXPECT_EQ(0u, trimmed.size());
+ EXPECT_EQ(0u, log.dups.size());
+ EXPECT_EQ(0u, trimmed_dups.size());
+}
+
+// Trimming to (22, 180), which is newer than every entry, is expected to
+// empty the log: all six versions are reported as trimmed, five dups are
+// kept, and none are dup-trimmed.
+TEST_F(PGLogTrimTest, TestTrimAll)
+{
+ SetUp(1, 2, 20);
+ PGLog::IndexedLog log;
+ log.head = mk_evt(24, 0);
+ log.skip_can_rollback_to_to_head();
+ log.head = mk_evt(9, 0);
+
+ log.add(mk_ple_mod(mk_obj(1), mk_evt(10, 100), mk_evt(8, 70)));
+ log.add(mk_ple_dt(mk_obj(2), mk_evt(15, 150), mk_evt(10, 100)));
+ log.add(mk_ple_mod_rb(mk_obj(3), mk_evt(15, 155), mk_evt(15, 150)));
+ log.add(mk_ple_mod(mk_obj(1), mk_evt(19, 160), mk_evt(25, 152)));
+ log.add(mk_ple_mod(mk_obj(4), mk_evt(21, 165), mk_evt(26, 160)));
+ log.add(mk_ple_dt_rb(mk_obj(5), mk_evt(21, 167), mk_evt(31, 166)));
+
+ std::set<eversion_t> trimmed;
+ std::set<std::string> trimmed_dups;
+ bool dirty_dups = false;
+
+ log.trim(cct, mk_evt(22, 180), &trimmed, &trimmed_dups, &dirty_dups);
+
+ EXPECT_EQ(true, dirty_dups);
+ EXPECT_EQ(0u, log.log.size());
+ EXPECT_EQ(6u, trimmed.size());
+ EXPECT_EQ(5u, log.dups.size());
+ EXPECT_EQ(0u, trimmed_dups.size());
+}
+
+
+// After trimming a log whose entries carry distinct request ids,
+// get_request() is expected to find a request that is still in the log, find
+// one that now survives only as a dup, and fail for one that was trimmed
+// without being kept as a dup.
+TEST_F(PGLogTrimTest, TestGetRequest) {
+ SetUp(1, 2, 20);
+ PGLog::IndexedLog log;
+ log.head = mk_evt(20, 0);
+ log.skip_can_rollback_to_to_head();
+ log.head = mk_evt(9, 0);
+
+ entity_name_t client = entity_name_t::CLIENT(777);
+
+ // The last osd_reqid_t argument (1..6) identifies each entry.
+ log.add(mk_ple_mod(mk_obj(1), mk_evt(10, 100), mk_evt(8, 70),
+ osd_reqid_t(client, 8, 1)));
+ log.add(mk_ple_dt(mk_obj(2), mk_evt(15, 150), mk_evt(10, 100),
+ osd_reqid_t(client, 8, 2)));
+ log.add(mk_ple_mod_rb(mk_obj(3), mk_evt(15, 155), mk_evt(15, 150),
+ osd_reqid_t(client, 8, 3)));
+ log.add(mk_ple_mod(mk_obj(1), mk_evt(20, 160), mk_evt(25, 152),
+ osd_reqid_t(client, 8, 4)));
+ log.add(mk_ple_mod(mk_obj(4), mk_evt(21, 165), mk_evt(26, 160),
+ osd_reqid_t(client, 8, 5)));
+ log.add(mk_ple_dt_rb(mk_obj(5), mk_evt(21, 167), mk_evt(31, 166),
+ osd_reqid_t(client, 8, 6)));
+
+ bool dirty_dups = false;
+
+ log.trim(cct, mk_evt(19, 157), nullptr, nullptr, &dirty_dups);
+
+ EXPECT_EQ(true, dirty_dups);
+ EXPECT_EQ(3u, log.log.size());
+ EXPECT_EQ(2u, log.dups.size());
+
+ eversion_t version;
+ version_t user_version;
+ int return_code;
+
+ // Request 5 was added at (21, 165), request 3 at (15, 155), and
+ // request 1 at (10, 100).
+ osd_reqid_t log_reqid = osd_reqid_t(client, 8, 5);
+ osd_reqid_t dup_reqid = osd_reqid_t(client, 8, 3);
+ osd_reqid_t bad_reqid = osd_reqid_t(client, 8, 1);
+
+ bool result;
+
+ result = log.get_request(log_reqid, &version, &user_version, &return_code);
+ EXPECT_EQ(true, result);
+ EXPECT_EQ(mk_evt(21, 165), version);
+
+ result = log.get_request(dup_reqid, &version, &user_version, &return_code);
+ EXPECT_EQ(true, result);
+ EXPECT_EQ(mk_evt(15, 155), version);
+
+ result = log.get_request(bad_reqid, &version, &user_version, &return_code);
+ EXPECT_FALSE(result);
+}
+
+
// Local Variables:
// compile-command: "cd ../.. ; make unittest_pglog ; ./unittest_pglog --log-to-stderr=true --debug-osd=20 # --gtest_filter=*.* "
// End:
<< "\tWrite stride min: " << min_stride_size << std::endl
<< "\tWrite stride max: " << max_stride_size << std::endl;
- if (min_stride_size > max_stride_size) {
- cerr << "Error: min_stride_size cannot be more than max_stride_size"
+ if (min_stride_size >= max_stride_size) {
+ cerr << "Error: max_stride_size must be more than min_stride_size"
<< std::endl;
return 1;
}
+++ /dev/null
-#!/bin/bash
-#
-# Copyright (C) 2014 Cloudwatt <libre.licensing@cloudwatt.com>
-# Copyright (C) 2014, 2015 Red Hat <contact@redhat.com>
-#
-# Author: Loic Dachary <loic@dachary.org>
-#
-# This program is free software; you can redistribute it and/or modify
-# it under the terms of the GNU Library Public License as published by
-# the Free Software Foundation; either version 2, or (at your option)
-# any later version.
-#
-# This program is distributed in the hope that it will be useful,
-# but WITHOUT ANY WARRANTY; without even the implied warranty of
-# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
-# GNU Library Public License for more details.
-#
-
-source $(dirname $0)/../detect-build-env-vars.sh
-source $CEPH_ROOT/qa/workunits/ceph-helpers.sh
-
-function run() {
- local dir=$1
- shift
-
- export CEPH_MON="127.0.0.1:7106" # git grep '\<7106\>' : there must be only one
- export CEPH_ARGS
- CEPH_ARGS+="--fsid=$(uuidgen) --auth-supported=none "
- CEPH_ARGS+="--mon-host=$CEPH_MON "
-
- local funcs=${@:-$(set | sed -n -e 's/^\(TEST_[0-9a-z_]*\) .*/\1/p')}
- for func in $funcs ; do
- setup $dir || return 1
- $func $dir || return 1
- teardown $dir || return 1
- done
-}
-
-function TEST_bench() {
- local dir=$1
-
- run_mon $dir a || return 1
- run_mgr $dir x || return 1
- run_osd $dir 0 || return 1
-
- local osd_bench_small_size_max_iops=$(CEPH_ARGS='' ceph-conf \
- --show-config-value osd_bench_small_size_max_iops)
- local osd_bench_large_size_max_throughput=$(CEPH_ARGS='' ceph-conf \
- --show-config-value osd_bench_large_size_max_throughput)
- local osd_bench_max_block_size=$(CEPH_ARGS='' ceph-conf \
- --show-config-value osd_bench_max_block_size)
- local osd_bench_duration=$(CEPH_ARGS='' ceph-conf \
- --show-config-value osd_bench_duration)
-
- #
- # block size too high
- #
- expect_failure $dir osd_bench_max_block_size \
- ceph tell osd.0 bench 1024 $((osd_bench_max_block_size + 1)) || return 1
-
- #
- # count too high for small (< 1MB) block sizes
- #
- local bsize=1024
- local max_count=$(($bsize * $osd_bench_duration * $osd_bench_small_size_max_iops))
- expect_failure $dir bench_small_size_max_iops \
- ceph tell osd.0 bench $(($max_count + 1)) $bsize || return 1
-
- #
- # count too high for large (>= 1MB) block sizes
- #
- local bsize=$((1024 * 1024 + 1))
- local max_count=$(($osd_bench_large_size_max_throughput * $osd_bench_duration))
- expect_failure $dir osd_bench_large_size_max_throughput \
- ceph tell osd.0 bench $(($max_count + 1)) $bsize || return 1
-
- #
- # default values should work
- #
- ceph tell osd.0 bench || return 1
-
- #
- # test object_size < block_size
- ceph tell osd.0 bench 10 14456 4444 3
- #
-
- #
- # test object_size < block_size & object_size = 0(default value)
- #
- ceph tell osd.0 bench 1 14456
-}
-
-main osd-bench "$@"
-
-# Local Variables:
-# compile-command: "cd ../.. ; make -j4 && test/osd/osd-bench.sh"
-# End:
+++ /dev/null
-#!/bin/bash
-#
-# Copyright (C) 2014 Cloudwatt <libre.licensing@cloudwatt.com>
-# Copyright (C) 2014, 2015 Red Hat <contact@redhat.com>
-#
-# Author: Loic Dachary <loic@dachary.org>
-#
-# This program is free software; you can redistribute it and/or modify
-# it under the terms of the GNU Library Public License as published by
-# the Free Software Foundation; either version 2, or (at your option)
-# any later version.
-#
-# This program is distributed in the hope that it will be useful,
-# but WITHOUT ANY WARRANTY; without even the implied warranty of
-# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
-# GNU Library Public License for more details.
-#
-
-source $(dirname $0)/../detect-build-env-vars.sh
-source $CEPH_ROOT/qa/workunits/ceph-helpers.sh
-
-function run() {
- local dir=$1
- shift
-
- export CEPH_MON="127.0.0.1:7100" # git grep '\<7100\>' : there must be only one
- export CEPH_ARGS
- CEPH_ARGS+="--fsid=$(uuidgen) --auth-supported=none "
- CEPH_ARGS+="--mon-host=$CEPH_MON "
-
- local funcs=${@:-$(set | sed -n -e 's/^\(TEST_[0-9a-z_]*\) .*/\1/p')}
- for func in $funcs ; do
- setup $dir || return 1
- $func $dir || return 1
- teardown $dir || return 1
- done
-}
-
-function TEST_config_init() {
- local dir=$1
-
- run_mon $dir a || return 1
- run_mgr $dir x || return 1
- local advance=1000
- local stale=1000
- local cache=500
- run_osd $dir 0 \
- --osd-map-max-advance $advance \
- --osd-map-cache-size $cache \
- --osd-pg-epoch-persisted-max-stale $stale \
- || return 1
- CEPH_ARGS='' ceph --admin-daemon $dir/ceph-osd.0.asok log flush || return 1
- grep 'is not > osd_map_max_advance' $dir/osd.0.log || return 1
- grep 'is not > osd_pg_epoch_persisted_max_stale' $dir/osd.0.log || return 1
-}
-
-function TEST_config_track() {
- local dir=$1
-
- run_mon $dir a || return 1
- run_mgr $dir x || return 1
- run_osd $dir 0 || return 1
-
- local osd_map_cache_size=$(CEPH_ARGS='' ceph-conf \
- --show-config-value osd_map_cache_size)
- local osd_map_max_advance=$(CEPH_ARGS='' ceph-conf \
- --show-config-value osd_map_max_advance)
- local osd_pg_epoch_persisted_max_stale=$(CEPH_ARGS='' ceph-conf \
- --show-config-value osd_pg_epoch_persisted_max_stale)
- #
- # lower cache_size under max_advance to trigger the warning
- #
- ! grep 'is not > osd_map_max_advance' $dir/osd.0.log || return 1
- local cache=$(($osd_map_max_advance / 2))
- ceph tell osd.0 injectargs "--osd-map-cache-size $cache" || return 1
- CEPH_ARGS='' ceph --admin-daemon $dir/ceph-osd.0.asok log flush || return 1
- grep 'is not > osd_map_max_advance' $dir/osd.0.log || return 1
- rm $dir/osd.0.log
- CEPH_ARGS='' ceph --admin-daemon $dir/ceph-osd.0.asok log reopen || return 1
-
- #
- # reset cache_size to the default and assert that it does not trigger the warning
- #
- ! grep 'is not > osd_map_max_advance' $dir/osd.0.log || return 1
- local cache=$osd_map_cache_size
- ceph tell osd.0 injectargs "--osd-map-cache-size $cache" || return 1
- CEPH_ARGS='' ceph --admin-daemon $dir/ceph-osd.0.asok log flush || return 1
- ! grep 'is not > osd_map_max_advance' $dir/osd.0.log || return 1
- rm $dir/osd.0.log
- CEPH_ARGS='' ceph --admin-daemon $dir/ceph-osd.0.asok log reopen || return 1
-
- #
- # increase the osd_map_max_advance above the default cache_size
- #
- ! grep 'is not > osd_map_max_advance' $dir/osd.0.log || return 1
- local advance=$(($osd_map_cache_size * 2))
- ceph tell osd.0 injectargs "--osd-map-max-advance $advance" || return 1
- CEPH_ARGS='' ceph --admin-daemon $dir/ceph-osd.0.asok log flush || return 1
- grep 'is not > osd_map_max_advance' $dir/osd.0.log || return 1
- rm $dir/osd.0.log
- CEPH_ARGS='' ceph --admin-daemon $dir/ceph-osd.0.asok log reopen || return 1
-
- #
- # increase the osd_pg_epoch_persisted_max_stale above the default cache_size
- #
- ! grep 'is not > osd_pg_epoch_persisted_max_stale' $dir/osd.0.log || return 1
- local stale=$(($osd_map_cache_size * 2))
- ceph tell osd.0 injectargs "--osd-pg-epoch-persisted-max-stale $stale" || return 1
- CEPH_ARGS='' ceph --admin-daemon $dir/ceph-osd.0.asok log flush || return 1
- grep 'is not > osd_pg_epoch_persisted_max_stale' $dir/osd.0.log || return 1
- rm $dir/osd.0.log
- CEPH_ARGS='' ceph --admin-daemon $dir/ceph-osd.0.asok log reopen || return 1
-}
-
-main osd-config "$@"
-
-# Local Variables:
-# compile-command: "cd ../.. ; make -j4 && test/osd/osd-config.sh"
-# End:
+++ /dev/null
-#!/bin/bash
-#
-# Copyright (C) 2014 Cloudwatt <libre.licensing@cloudwatt.com>
-# Copyright (C) 2014, 2015 Red Hat <contact@redhat.com>
-#
-# Author: Loic Dachary <loic@dachary.org>
-# Author: Sage Weil <sage@redhat.com>
-#
-# This program is free software; you can redistribute it and/or modify
-# it under the terms of the GNU Library Public License as published by
-# the Free Software Foundation; either version 2, or (at your option)
-# any later version.
-#
-# This program is distributed in the hope that it will be useful,
-# but WITHOUT ANY WARRANTY; without even the implied warranty of
-# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
-# GNU Library Public License for more details.
-#
-
-source $(dirname $0)/../detect-build-env-vars.sh
-source $CEPH_ROOT/qa/workunits/ceph-helpers.sh
-
-function run() {
- local dir=$1
- shift
-
- export CEPH_MON="127.0.0.1:7111" # git grep '\<7111\>' : there must be only one
- export CEPH_ARGS
- CEPH_ARGS+="--fsid=$(uuidgen) --auth-supported=none "
- CEPH_ARGS+="--mon-host=$CEPH_MON "
-
- local funcs=${@:-$(set | sed -n -e 's/^\(TEST_[0-9a-z_]*\) .*/\1/p')}
- for func in $funcs ; do
- setup $dir || return 1
- $func $dir || return 1
- teardown $dir || return 1
- done
-}
-
-function TEST_copy_from() {
- local dir=$1
-
- run_mon $dir a || return 1
- run_mgr $dir x || return 1
- run_osd $dir 0 || return 1
- run_osd $dir 1 || return 1
-
- # success
- rados -p rbd put foo $(which rados)
- rados -p rbd cp foo foo2
- rados -p rbd stat foo2
-
- # failure
- ceph tell osd.\* injectargs -- --osd-debug-inject-copyfrom-error
- ! rados -p rbd cp foo foo3
- ! rados -p rbd stat foo3
-
- # success again
- ceph tell osd.\* injectargs -- --no-osd-debug-inject-copyfrom-error
- ! rados -p rbd cp foo foo3
- rados -p rbd stat foo3
-}
-
-main osd-copy-from "$@"
-
-# Local Variables:
-# compile-command: "cd ../.. ; make -j4 && test/osd/osd-bench.sh"
-# End:
+++ /dev/null
-#!/bin/bash
-
-source $(dirname $0)/../detect-build-env-vars.sh
-source $CEPH_ROOT/qa/workunits/ceph-helpers.sh
-
-function run() {
- local dir=$1
- shift
-
- export CEPH_MON="127.0.0.1:7146" # git grep '\<7146\>' : there must be only one
- export CEPH_ARGS
- CEPH_ARGS+="--fsid=$(uuidgen) --auth-supported=none "
- CEPH_ARGS+="--mon-host=$CEPH_MON "
- # avoid running out of fds in rados bench
- CEPH_ARGS+="--filestore_wbthrottle_xfs_ios_hard_limit=900 "
- CEPH_ARGS+="--filestore_wbthrottle_btrfs_ios_hard_limit=900 "
- local funcs=${@:-$(set | sed -n -e 's/^\(TEST_[0-9a-z_]*\) .*/\1/p')}
- for func in $funcs ; do
- setup $dir || return 1
- $func $dir || return 1
- teardown $dir || return 1
- done
-}
-
-function TEST_filestore_to_bluestore() {
- local dir=$1
-
- local flimit=$(ulimit -n)
- if [ $flimit -lt 1536 ]; then
- echo "Low open file limit ($flimit), test may fail. Increase to 1536 or higher and retry if that happens."
- fi
-
- run_mon $dir a || return 1
- run_mgr $dir x || return 1
- run_osd $dir 0 || return 1
- osd_pid=$(cat $dir/osd.0.pid)
- run_osd $dir 1 || return 1
- run_osd $dir 2 || return 1
-
- sleep 5
-
- ceph osd pool create foo 16
-
- # write some objects
- rados bench -p foo 10 write -b 4096 --no-cleanup || return 1
-
- # kill
- while kill $osd_pid; do sleep 1 ; done
- ceph osd down 0
-
- mv $dir/0 $dir/0.old || return 1
- mkdir $dir/0 || return 1
- ofsid=$(cat $dir/0.old/fsid)
- echo "osd fsid $ofsid"
- O=$CEPH_ARGS
- CEPH_ARGS+="--log-file $dir/cot.log --log-max-recent 0 "
- ceph-objectstore-tool --type bluestore --data-path $dir/0 --fsid $ofsid \
- --op mkfs || return 1
- ceph-objectstore-tool --data-path $dir/0.old --target-data-path $dir/0 \
- --op dup || return 1
- CEPH_ARGS=$O
-
- run_osd_bluestore $dir 0 || return 1
-
- while ! ceph osd stat | grep '3 up' ; do sleep 1 ; done
- ceph osd metadata 0 | grep bluestore || return 1
-
- ceph osd scrub 0
-
- # give it some time
- sleep 15
- # and make sure mon is sync'ed
- flush_pg_stats
-
- wait_for_clean || return 1
-}
-
-main osd-dup "$@"
-
-# Local Variables:
-# compile-command: "cd ../.. ; make -j4 && test/osd/osd-dup.sh"
-# End:
+++ /dev/null
-#!/bin/bash
-#
-# Copyright (C) 2016 Piotr Dałek <git@predictor.org.pl>
-# Copyright (C) 2014, 2015 Red Hat <contact@redhat.com>
-#
-# Author: Piotr Dałek <git@predictor.org.pl>
-#
-# This program is free software; you can redistribute it and/or modify
-# it under the terms of the GNU Library Public License as published by
-# the Free Software Foundation; either version 2, or (at your option)
-# any later version.
-#
-# This program is distributed in the hope that it will be useful,
-# but WITHOUT ANY WARRANTY; without even the implied warranty of
-# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
-# GNU Library Public License for more details.
-#
-
-source $(dirname $0)/../detect-build-env-vars.sh
-source $CEPH_ROOT/qa/workunits/ceph-helpers.sh
-MAX_PROPAGATION_TIME=30
-
-function run() {
- local dir=$1
- shift
- rm -f $dir/*.pid
- export CEPH_MON="127.0.0.1:7126" # git grep '\<7126\>' : there must be only one
- export CEPH_ARGS
- CEPH_ARGS+="--fsid=$(uuidgen) --auth-supported=none "
- CEPH_ARGS+="--mon-host=$CEPH_MON "
- #
- # Disable device auto class feature for this testing,
- # as it will automatically make root clones based on new class types
- # and hence affect the down osd counting.
- # E.g.,
- #
- # ID WEIGHT TYPE NAME UP/DOWN REWEIGHT PRIMARY-AFFINITY
- # -4 3.00000 root default~hdd
- # -3 3.00000 host gitbuilder-ceph-rpm-centos7-amd64-basic~hdd
- # 0 1.00000 osd.0 down 1.00000 1.00000
- # 1 1.00000 osd.1 up 1.00000 1.00000
- # 2 1.00000 osd.2 up 1.00000 1.00000
- # -1 3.00000 root default
- # -2 3.00000 host gitbuilder-ceph-rpm-centos7-amd64-basic
- # 0 1.00000 osd.0 down 1.00000 1.00000
- # 1 1.00000 osd.1 up 1.00000 1.00000
- # 2 1.00000 osd.2 up 1.00000 1.00000
- #
- CEPH_ARGS+="--osd-class-update-on-start=false "
-
- OLD_ARGS=$CEPH_ARGS
- CEPH_ARGS+="--osd-fast-fail-on-connection-refused=false "
- echo "Ensuring old behavior is there..."
- test_fast_kill $dir && (echo "OSDs died too early! Old behavior doesn't work." ; return 1)
-
- CEPH_ARGS=$OLD_ARGS"--osd-fast-fail-on-connection-refused=true "
- OLD_ARGS=$CEPH_ARGS
-
- CEPH_ARGS+="--ms_type=simple"
- echo "Testing simple msgr..."
- test_fast_kill $dir || return 1
-
- CEPH_ARGS=$OLD_ARGS"--ms_type=async"
- echo "Testing async msgr..."
- test_fast_kill $dir || return 1
-
- return 0
-
-}
-
-function test_fast_kill() {
- # create cluster with 3 osds
- setup $dir || return 1
- run_mon $dir a --osd_pool_default_size=3 || return 1
- run_mgr $dir x || return 1
- for oi in {0..2}; do
- run_osd $dir $oi || return 1
- pids[$oi]=$(cat $dir/osd.$oi.pid)
- done
-
- # make some objects so osds to ensure connectivity between osds
- rados -p rbd bench 10 write -b 4096 --max-objects 128 --no-cleanup
- sleep 1
-
- killid=0
- previd=0
-
- # kill random osd and see if after max MAX_PROPAGATION_TIME, the osd count decreased.
- for i in {1..2}; do
- while [ $killid -eq $previd ]; do
- killid=${pids[$RANDOM%${#pids[@]}]}
- done
- previd=$killid
-
- kill -9 $killid
- time_left=$MAX_PROPAGATION_TIME
- down_osds=0
-
- while [ $time_left -gt 0 ]; do
- sleep 1
- time_left=$[$time_left - 1];
-
- grep -m 1 -c -F "ms_handle_refused" $dir/osd.*.log > /dev/null
- if [ $? -ne 0 ]; then
- continue
- fi
-
- down_osds=$(ceph osd tree | grep -c down)
- if [ $down_osds -lt $i ]; then
- # osds not marked down yet, try again in a second
- continue
- elif [ $down_osds -gt $i ]; then
- echo Too many \($down_osds\) osds died!
- return 1
- else
- break
- fi
- done
-
- if [ $down_osds -lt $i ]; then
- echo Killed the OSD, yet it is not marked down
- ceph osd tree
- return 1
- fi
- done
- pkill -SIGTERM rados
- teardown $dir || return 1
-}
-
-main osd-fast-mark-down "$@"
-
-# Local Variables:
-# compile-command: "cd ../.. ; make -j4 && test/osd/osd-fast-mark-down.sh"
-# End:
+++ /dev/null
-#!/bin/bash
-#
-# Copyright (C) 2015 Intel <contact@intel.com.com>
-# Copyright (C) 2014, 2015 Red Hat <contact@redhat.com>
-#
-# Author: Xiaoxi Chen <xiaoxi.chen@intel.com>
-#
-# This program is free software; you can redistribute it and/or modify
-# it under the terms of the GNU Library Public License as published by
-# the Free Software Foundation; either version 2, or (at your option)
-# any later version.
-#
-# This program is distributed in the hope that it will be useful,
-# but WITHOUT ANY WARRANTY; without even the implied warranty of
-# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
-# GNU Library Public License for more details.
-#
-
-source $(dirname $0)/../detect-build-env-vars.sh
-source $CEPH_ROOT/qa/workunits/ceph-helpers.sh
-
-function run() {
- local dir=$1
- shift
-
- export CEPH_MON="127.0.0.1:7108" # git grep '\<7108\>' : there must be only one
- export CEPH_ARGS
- CEPH_ARGS+="--fsid=$(uuidgen) --auth-supported=none "
- CEPH_ARGS+="--mon-host=$CEPH_MON "
-
- local funcs=${@:-$(set | sed -n -e 's/^\(TEST_[0-9a-z_]*\) .*/\1/p')}
- for func in $funcs ; do
- setup $dir || return 1
- $func $dir || return 1
- teardown $dir || return 1
- done
-}
-
-function markdown_N_impl() {
- markdown_times=$1
- total_time=$2
- sleeptime=$3
- for i in `seq 1 $markdown_times`
- do
- # check the OSD is UP
- ceph osd tree
- ceph osd tree | grep osd.0 |grep up || return 1
- # mark the OSD down.
- ceph osd down 0
- sleep $sleeptime
- done
-}
-
-
-function TEST_markdown_exceed_maxdown_count() {
- local dir=$1
-
- run_mon $dir a || return 1
- run_mgr $dir x || return 1
- run_osd $dir 0 || return 1
- run_osd $dir 1 || return 1
- run_osd $dir 2 || return 1
- # 3+1 times within 300s, osd should stay dead on the 4th time
- local count=3
- local sleeptime=10
- local period=300
- ceph tell osd.0 injectargs '--osd_max_markdown_count '$count'' || return 1
- ceph tell osd.0 injectargs '--osd_max_markdown_period '$period'' || return 1
-
- markdown_N_impl $(($count+1)) $period $sleeptime
- # down N+1 times ,the osd.0 shoud die
- ceph osd tree | grep down | grep osd.0 || return 1
-}
-
-function TEST_markdown_boot() {
- local dir=$1
-
- run_mon $dir a || return 1
- run_mgr $dir x || return 1
- run_osd $dir 0 || return 1
- run_osd $dir 1 || return 1
- run_osd $dir 2 || return 1
-
- # 3 times within 120s, should stay up
- local count=3
- local sleeptime=10
- local period=120
- ceph tell osd.0 injectargs '--osd_max_markdown_count '$count'' || return 1
- ceph tell osd.0 injectargs '--osd_max_markdown_period '$period'' || return 1
-
- markdown_N_impl $count $period $sleeptime
- #down N times, osd.0 should be up
- sleep 15 # give osd plenty of time to notice and come back up
- ceph osd tree | grep up | grep osd.0 || return 1
-}
-
-function TEST_markdown_boot_exceed_time() {
- local dir=$1
-
- run_mon $dir a || return 1
- run_mgr $dir x || return 1
- run_osd $dir 0 || return 1
- run_osd $dir 1 || return 1
- run_osd $dir 2 || return 1
-
-
- # 3+1 times, but over 40s, > 20s, so should stay up
- local count=3
- local period=20
- local sleeptime=10
- ceph tell osd.0 injectargs '--osd_max_markdown_count '$count'' || return 1
- ceph tell osd.0 injectargs '--osd_max_markdown_period '$period'' || return 1
-
- markdown_N_impl $(($count+1)) $period $sleeptime
- sleep 15 # give osd plenty of time to notice and come back up
- ceph osd tree | grep up | grep osd.0 || return 1
-}
-
-main osd-markdown "$@"
-
-# Local Variables:
-# compile-command: "cd ../.. ; make -j4 && test/osd/osd-bench.sh"
-# End:
+++ /dev/null
-#!/bin/bash
-#
-# Author: Vicente Cheng <freeze.bilsted@gmail.com>
-#
-# This program is free software; you can redistribute it and/or modify
-# it under the terms of the GNU Library Public License as published by
-# the Free Software Foundation; either version 2, or (at your option)
-# any later version.
-#
-# This program is distributed in the hope that it will be useful,
-# but WITHOUT ANY WARRANTY; without even the implied warranty of
-# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
-# GNU Library Public License for more details.
-#
-
-source $(dirname $0)/../detect-build-env-vars.sh
-source $CEPH_ROOT/qa/workunits/ceph-helpers.sh
-
-function run() {
- local dir=$1
- shift
-
- export CEPH_MON="127.0.0.1:7122" # git grep '\<7122\>' : there must be only one
- export CEPH_ARGS
- CEPH_ARGS+="--fsid=$(uuidgen) --auth-supported=none "
- CEPH_ARGS+="--mon-host=$CEPH_MON "
-
- local funcs=${@:-$(set | sed -n -e 's/^\(TEST_[0-9a-z_]*\) .*/\1/p')}
- for func in $funcs ; do
- setup $dir || return 1
- $func $dir || return 1
- teardown $dir || return 1
- done
-}
-
-function TEST_reactivate() {
- local dir=$1
-
- run_mon $dir a || return 1
- run_mgr $dir x || return 1
- run_osd $dir 0 || return 1
-
- kill_daemons $dir TERM osd || return 1
-
- ready_path=$dir"/0/ready"
- activate_path=$dir"/0/active"
- # trigger mkfs again
- rm -rf $ready_path $activate_path
- activate_osd $dir 0 || return 1
-
-}
-
-main osd-reactivate "$@"
-
-# Local Variables:
-# compile-command: "cd ../.. ; make -j4 && test/osd/osd-reactivate.sh"
-# End:
+++ /dev/null
-#! /bin/bash
-#
-# Copyright (C) 2015 Red Hat <contact@redhat.com>
-#
-# Author: Loic Dachary <loic@dachary.org>
-#
-# This program is free software; you can redistribute it and/or modify
-# it under the terms of the GNU Library Public License as published by
-# the Free Software Foundation; either version 2, or (at your option)
-# any later version.
-#
-# This program is distributed in the hope that it will be useful,
-# but WITHOUT ANY WARRANTY; without even the implied warranty of
-# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
-# GNU Library Public License for more details.
-#
-source $(dirname $0)/../detect-build-env-vars.sh
-source $CEPH_ROOT/qa/workunits/ceph-helpers.sh
-
-function run() {
- local dir=$1
- shift
-
- export CEPH_MON="127.0.0.1:7123" # git grep '\<7123\>' : there must be only one
- export CEPH_ARGS
- CEPH_ARGS+="--fsid=$(uuidgen) --auth-supported=none "
- CEPH_ARGS+="--mon-host=$CEPH_MON "
-
- local funcs=${@:-$(set | sed -n -e 's/^\(TEST_[0-9a-z_]*\) .*/\1/p')}
- for func in $funcs ; do
- $func $dir || return 1
- done
-}
-
-function TEST_reuse_id() {
- local dir=$1
-
- setup $dir || return 1
- run_mon $dir a --osd_pool_default_size=1 || return 1
- run_mgr $dir x || return 1
- run_osd $dir 0 || return 1
- run_osd $dir 1 || return 1
- wait_for_clean || return 1
- destroy_osd $dir 1 || return 1
- run_osd $dir 1 || return 1
-}
-
-main osd-reuse-id "$@"
-
-# Local Variables:
-# compile-command: "cd ../.. ; make -j4 && test/osd/osd-reuse-id.sh"
-# End:
-
+++ /dev/null
-#!/bin/bash -x
-#
-# Copyright (C) 2014 Red Hat <contact@redhat.com>
-#
-# Author: Loic Dachary <loic@dachary.org>
-#
-# This program is free software; you can redistribute it and/or modify
-# it under the terms of the GNU Library Public License as published by
-# the Free Software Foundation; either version 2, or (at your option)
-# any later version.
-#
-# This program is distributed in the hope that it will be useful,
-# but WITHOUT ANY WARRANTY; without even the implied warranty of
-# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
-# GNU Library Public License for more details.
-#
-source $(dirname $0)/../detect-build-env-vars.sh
-source $CEPH_ROOT/qa/workunits/ceph-helpers.sh
-
-if [ `uname` = FreeBSD ]; then
- # erasure coding overwrites are only tested on Bluestore
- # erasure coding on filestore is unsafe
- # http://docs.ceph.com/docs/master/rados/operations/erasure-code/#erasure-coding-with-overwrites
- use_ec_overwrite=false
-else
- use_ec_overwrite=true
-fi
-
-# Test development and debugging
-# Set to "yes" in order to ignore diff errors and save results to update test
-getjson="no"
-
-# Ignore the epoch and filter out the attr '_' value because it has date information and won't match
-jqfilter='.inconsistents | (.[].shards[].attrs[]? | select(.name == "_") | .value) |= "----Stripped-by-test----"'
-sortkeys='import json; import sys ; JSON=sys.stdin.read() ; ud = json.loads(JSON) ; print json.dumps(ud, sort_keys=True, indent=2)'
-
-# Remove items are not consistent across runs, the pg interval and client
-sedfilter='s/\([ ]*\"\(selected_\)*object_info\":.*head[(]\)[^[:space:]]* [^[:space:]]* \(.*\)/\1\3/'
-
-function run() {
- local dir=$1
- shift
-
- export CEPH_MON="127.0.0.1:7107" # git grep '\<7107\>' : there must be only one
- export CEPH_ARGS
- CEPH_ARGS+="--fsid=$(uuidgen) --auth-supported=none "
- CEPH_ARGS+="--mon-host=$CEPH_MON "
-
- local funcs=${@:-$(set | sed -n -e 's/^\(TEST_[0-9a-z_]*\) .*/\1/p')}
- for func in $funcs ; do
- $func $dir || return 1
- done
-}
-
-function add_something() {
- local dir=$1
- local poolname=$2
- local obj=${3:-SOMETHING}
- local scrub=${4:-noscrub}
-
- if [ "$scrub" = "noscrub" ];
- then
- ceph osd set noscrub || return 1
- ceph osd set nodeep-scrub || return 1
- else
- ceph osd unset noscrub || return 1
- ceph osd unset nodeep-scrub || return 1
- fi
-
- local payload=ABCDEF
- echo $payload > $dir/ORIGINAL
- rados --pool $poolname put $obj $dir/ORIGINAL || return 1
-}
-
-#
-# Corrupt one copy of a replicated pool
-#
-function TEST_corrupt_and_repair_replicated() {
- local dir=$1
- local poolname=rbd
-
- setup $dir || return 1
- run_mon $dir a --osd_pool_default_size=2 || return 1
- run_mgr $dir x || return 1
- run_osd $dir 0 || return 1
- run_osd $dir 1 || return 1
- wait_for_clean || return 1
-
- add_something $dir $poolname || return 1
- corrupt_and_repair_one $dir $poolname $(get_not_primary $poolname SOMETHING) || return 1
- # Reproduces http://tracker.ceph.com/issues/8914
- corrupt_and_repair_one $dir $poolname $(get_primary $poolname SOMETHING) || return 1
-
- teardown $dir || return 1
-}
-
-function corrupt_and_repair_two() {
- local dir=$1
- local poolname=$2
- local first=$3
- local second=$4
-
- #
- # 1) remove the corresponding file from the OSDs
- #
- pids=""
- run_in_background pids objectstore_tool $dir $first SOMETHING remove
- run_in_background pids objectstore_tool $dir $second SOMETHING remove
- wait_background pids
- return_code=$?
- if [ $return_code -ne 0 ]; then return $return_code; fi
-
- #
- # 2) repair the PG
- #
- local pg=$(get_pg $poolname SOMETHING)
- repair $pg
- #
- # 3) The files must be back
- #
- pids=""
- run_in_background pids objectstore_tool $dir $first SOMETHING list-attrs
- run_in_background pids objectstore_tool $dir $second SOMETHING list-attrs
- wait_background pids
- return_code=$?
- if [ $return_code -ne 0 ]; then return $return_code; fi
-
- rados --pool $poolname get SOMETHING $dir/COPY || return 1
- diff $dir/ORIGINAL $dir/COPY || return 1
-}
-
-#
-# 1) add an object
-# 2) remove the corresponding file from a designated OSD
-# 3) repair the PG
-# 4) check that the file has been restored in the designated OSD
-#
-function corrupt_and_repair_one() {
- local dir=$1
- local poolname=$2
- local osd=$3
-
- #
- # 1) remove the corresponding file from the OSD
- #
- objectstore_tool $dir $osd SOMETHING remove || return 1
- #
- # 2) repair the PG
- #
- local pg=$(get_pg $poolname SOMETHING)
- repair $pg
- #
- # 3) The file must be back
- #
- objectstore_tool $dir $osd SOMETHING list-attrs || return 1
- rados --pool $poolname get SOMETHING $dir/COPY || return 1
- diff $dir/ORIGINAL $dir/COPY || return 1
-}
-
-function corrupt_and_repair_erasure_coded() {
- local dir=$1
- local poolname=$2
-
- add_something $dir $poolname || return 1
-
- local primary=$(get_primary $poolname SOMETHING)
- local -a osds=($(get_osds $poolname SOMETHING | sed -e "s/$primary//"))
- local not_primary_first=${osds[0]}
- local not_primary_second=${osds[1]}
-
- # Reproduces http://tracker.ceph.com/issues/10017
- corrupt_and_repair_one $dir $poolname $primary || return 1
- # Reproduces http://tracker.ceph.com/issues/10409
- corrupt_and_repair_one $dir $poolname $not_primary_first || return 1
- corrupt_and_repair_two $dir $poolname $not_primary_first $not_primary_second || return 1
- corrupt_and_repair_two $dir $poolname $primary $not_primary_first || return 1
-
-}
-
-function create_ec_pool() {
- local pool_name=$1
- local allow_overwrites=$2
-
- ceph osd erasure-code-profile set myprofile crush-failure-domain=osd $3 $4 $5 $6 $7 || return 1
-
- ceph osd pool create "$poolname" 1 1 erasure myprofile || return 1
-
- if [ "$allow_overwrites" = "true" ]; then
- ceph osd pool set "$poolname" allow_ec_overwrites true || return 1
- fi
-
- wait_for_clean || return 1
- return 0
-}
-
-function auto_repair_erasure_coded() {
- local dir=$1
- local allow_overwrites=$2
- local poolname=ecpool
-
- # Launch a cluster with 5 seconds scrub interval
- setup $dir || return 1
- run_mon $dir a || return 1
- run_mgr $dir x || return 1
- local ceph_osd_args="--osd-scrub-auto-repair=true \
- --osd-deep-scrub-interval=5 \
- --osd-scrub-max-interval=5 \
- --osd-scrub-min-interval=5 \
- --osd-scrub-interval-randomize-ratio=0"
- for id in $(seq 0 2) ; do
- if [ "$allow_overwrites" = "true" ]; then
- run_osd_bluestore $dir $id $ceph_osd_args || return 1
- else
- run_osd $dir $id $ceph_osd_args || return 1
- fi
- done
- wait_for_clean || return 1
-
- # Create an EC pool
- create_ec_pool $poolname $allow_overwrites k=2 m=1 || return 1
-
- # Put an object
- local payload=ABCDEF
- echo $payload > $dir/ORIGINAL
- rados --pool $poolname put SOMETHING $dir/ORIGINAL || return 1
-
- # Remove the object from one shard physically
- # Restarted osd get $ceph_osd_args passed
- objectstore_tool $dir $(get_not_primary $poolname SOMETHING) SOMETHING remove || return 1
- # Wait for auto repair
- local pgid=$(get_pg $poolname SOMETHING)
- wait_for_scrub $pgid "$(get_last_scrub_stamp $pgid)"
- wait_for_clean || return 1
- # Verify - the file should be back
- # Restarted osd get $ceph_osd_args passed
- objectstore_tool $dir $(get_not_primary $poolname SOMETHING) SOMETHING list-attrs || return 1
- rados --pool $poolname get SOMETHING $dir/COPY || return 1
- diff $dir/ORIGINAL $dir/COPY || return 1
-
- # Tear down
- teardown $dir || return 1
-}
-
-function TEST_auto_repair_erasure_coded_appends() {
- auto_repair_erasure_coded $1 false
-}
-
-function TEST_auto_repair_erasure_coded_overwrites() {
- if [ "$use_ec_overwrite" = "true" ]; then
- auto_repair_erasure_coded $1 true
- fi
-}
-
-function corrupt_and_repair_jerasure() {
- local dir=$1
- local allow_overwrites=$2
- local poolname=ecpool
-
- setup $dir || return 1
- run_mon $dir a || return 1
- run_mgr $dir x || return 1
- for id in $(seq 0 3) ; do
- if [ "$allow_overwrites" = "true" ]; then
- run_osd_bluestore $dir $id || return 1
- else
- run_osd $dir $id || return 1
- fi
- done
- wait_for_clean || return 1
-
- create_ec_pool $poolname $allow_overwrites k=2 m=2 || return 1
- corrupt_and_repair_erasure_coded $dir $poolname || return 1
-
- teardown $dir || return 1
-}
-
-function TEST_corrupt_and_repair_jerasure_appends() {
- corrupt_and_repair_jerasure $1
-}
-
-function TEST_corrupt_and_repair_jerasure_overwrites() {
- if [ "$use_ec_overwrite" = "true" ]; then
- corrupt_and_repair_jerasure $1 true
- fi
-}
-
-function corrupt_and_repair_lrc() {
- local dir=$1
- local allow_overwrites=$2
- local poolname=ecpool
-
- setup $dir || return 1
- run_mon $dir a || return 1
- run_mgr $dir x || return 1
- for id in $(seq 0 9) ; do
- if [ "$allow_overwrites" = "true" ]; then
- run_osd_bluestore $dir $id || return 1
- else
- run_osd $dir $id || return 1
- fi
- done
- wait_for_clean || return 1
-
- create_ec_pool $poolname $allow_overwrites k=4 m=2 l=3 plugin=lrc || return 1
- corrupt_and_repair_erasure_coded $dir $poolname || return 1
-
- teardown $dir || return 1
-}
-
-function TEST_corrupt_and_repair_lrc_appends() {
- corrupt_and_repair_jerasure $1
-}
-
-function TEST_corrupt_and_repair_lrc_overwrites() {
- if [ "$use_ec_overwrite" = "true" ]; then
- corrupt_and_repair_jerasure $1 true
- fi
-}
-
-function unfound_erasure_coded() {
- local dir=$1
- local allow_overwrites=$2
- local poolname=ecpool
- local payload=ABCDEF
-
- setup $dir || return 1
- run_mon $dir a || return 1
- run_mgr $dir x || return 1
- for id in $(seq 0 3) ; do
- if [ "$allow_overwrites" = "true" ]; then
- run_osd_bluestore $dir $id || return 1
- else
- run_osd $dir $id || return 1
- fi
- done
- wait_for_clean || return 1
-
- create_ec_pool $poolname $allow_overwrites k=2 m=2 || return 1
-
- add_something $dir $poolname || return 1
-
- local primary=$(get_primary $poolname SOMETHING)
- local -a osds=($(get_osds $poolname SOMETHING | sed -e "s/$primary//"))
- local not_primary_first=${osds[0]}
- local not_primary_second=${osds[1]}
- local not_primary_third=${osds[2]}
-
- #
- # 1) remove the corresponding file from the OSDs
- #
- pids=""
- run_in_background pids objectstore_tool $dir $not_primary_first SOMETHING remove
- run_in_background pids objectstore_tool $dir $not_primary_second SOMETHING remove
- run_in_background pids objectstore_tool $dir $not_primary_third SOMETHING remove
- wait_background pids
- return_code=$?
- if [ $return_code -ne 0 ]; then return $return_code; fi
-
- #
- # 2) repair the PG
- #
- local pg=$(get_pg $poolname SOMETHING)
- repair $pg
- #
- # 3) check pg state
- #
- # it may take a bit to appear due to mon/mgr asynchrony
- for f in `seq 1 60`; do
- ceph -s | grep "1/1 unfound" && break
- sleep 1
- done
- ceph -s|grep "4 osds: 4 up, 4 in" || return 1
- ceph -s|grep "1/1 unfound" || return 1
-
- teardown $dir || return 1
-}
-
-function TEST_unfound_erasure_coded_appends() {
- unfound_erasure_coded $1
-}
-
-function TEST_unfound_erasure_coded_overwrites() {
- if [ "$use_ec_overwrite" = "true" ]; then
- unfound_erasure_coded $1 true
- fi
-}
-
-#
-# list_missing for EC pool
-#
-function list_missing_erasure_coded() {
- local dir=$1
- local allow_overwrites=$2
- local poolname=ecpool
-
- setup $dir || return 1
- run_mon $dir a || return 1
- run_mgr $dir x || return 1
- for id in $(seq 0 2) ; do
- if [ "$allow_overwrites" = "true" ]; then
- run_osd_bluestore $dir $id || return 1
- else
- run_osd $dir $id || return 1
- fi
- done
- wait_for_clean || return 1
-
- create_ec_pool $poolname $allow_overwrites k=2 m=1 || return 1
-
- # Put an object and remove the two shards (including primary)
- add_something $dir $poolname MOBJ0 || return 1
- local -a osds0=($(get_osds $poolname MOBJ0))
-
- # Put another object and remove two shards (excluding primary)
- add_something $dir $poolname MOBJ1 || return 1
- local -a osds1=($(get_osds $poolname MOBJ1))
-
- # Stop all osd daemons
- for id in $(seq 0 2) ; do
- kill_daemons $dir TERM osd.$id >&2 < /dev/null || return 1
- done
-
- id=${osds0[0]}
- ceph-objectstore-tool --data-path $dir/$id \
- MOBJ0 remove || return 1
- id=${osds0[1]}
- ceph-objectstore-tool --data-path $dir/$id \
- MOBJ0 remove || return 1
-
- id=${osds1[1]}
- ceph-objectstore-tool --data-path $dir/$id \
- MOBJ1 remove || return 1
- id=${osds1[2]}
- ceph-objectstore-tool --data-path $dir/$id \
- MOBJ1 remove || return 1
-
- for id in $(seq 0 2) ; do
- activate_osd $dir $id >&2 || return 1
- done
- wait_for_clean || return 1
-
- # Get get - both objects should in the same PG
- local pg=$(get_pg $poolname MOBJ0)
-
- # Repair the PG, which triggers the recovering,
- # and should mark the object as unfound
- repair $pg
-
- for i in $(seq 0 120) ; do
- [ $i -lt 60 ] || return 1
- matches=$(ceph pg $pg list_missing | egrep "MOBJ0|MOBJ1" | wc -l)
- [ $matches -eq 2 ] && break
- done
-
- teardown $dir || return 1
-}
-
-function TEST_list_missing_erasure_coded_appends() {
- list_missing_erasure_coded $1 false
-}
-
-function TEST_list_missing_erasure_coded_overwrites() {
- if [ "$use_ec_overwrite" = "true" ]; then
- list_missing_erasure_coded $1 true
- fi
-}
-
-#
-# Corrupt one copy of a replicated pool
-#
-function TEST_corrupt_scrub_replicated() {
- local dir=$1
- local poolname=csr_pool
- local total_objs=15
-
- setup $dir || return 1
- run_mon $dir a --osd_pool_default_size=2 || return 1
- run_mgr $dir x || return 1
- run_osd $dir 0 || return 1
- run_osd $dir 1 || return 1
- wait_for_clean || return 1
-
- ceph osd pool create $poolname 1 1 || return 1
- wait_for_clean || return 1
-
- for i in $(seq 1 $total_objs) ; do
- objname=ROBJ${i}
- add_something $dir $poolname $objname || return 1
-
- rados --pool $poolname setomapheader $objname hdr-$objname || return 1
- rados --pool $poolname setomapval $objname key-$objname val-$objname || return 1
- done
-
- local pg=$(get_pg $poolname ROBJ0)
-
- # Compute an old omap digest and save oi
- CEPH_ARGS='' ceph daemon $dir//ceph-osd.0.asok \
- config set osd_deep_scrub_update_digest_min_age 0
- CEPH_ARGS='' ceph daemon $dir//ceph-osd.1.asok \
- config set osd_deep_scrub_update_digest_min_age 0
- pg_deep_scrub $pg
-
- for i in $(seq 1 $total_objs) ; do
- objname=ROBJ${i}
-
- # Alternate corruption between osd.0 and osd.1
- local osd=$(expr $i % 2)
-
- case $i in
- 1)
- # Size (deep scrub data_digest too)
- local payload=UVWXYZZZ
- echo $payload > $dir/CORRUPT
- objectstore_tool $dir $osd $objname set-bytes $dir/CORRUPT || return 1
- ;;
-
- 2)
- # digest (deep scrub only)
- local payload=UVWXYZ
- echo $payload > $dir/CORRUPT
- objectstore_tool $dir $osd $objname set-bytes $dir/CORRUPT || return 1
- ;;
-
- 3)
- # missing
- objectstore_tool $dir $osd $objname remove || return 1
- ;;
-
- 4)
- # Modify omap value (deep scrub only)
- objectstore_tool $dir $osd $objname set-omap key-$objname $dir/CORRUPT || return 1
- ;;
-
- 5)
- # Delete omap key (deep scrub only)
- objectstore_tool $dir $osd $objname rm-omap key-$objname || return 1
- ;;
-
- 6)
- # Add extra omap key (deep scrub only)
- echo extra > $dir/extra-val
- objectstore_tool $dir $osd $objname set-omap key2-$objname $dir/extra-val || return 1
- rm $dir/extra-val
- ;;
-
- 7)
- # Modify omap header (deep scrub only)
- echo -n newheader > $dir/hdr
- objectstore_tool $dir $osd $objname set-omaphdr $dir/hdr || return 1
- rm $dir/hdr
- ;;
-
- 8)
- rados --pool $poolname setxattr $objname key1-$objname val1-$objname || return 1
- rados --pool $poolname setxattr $objname key2-$objname val2-$objname || return 1
-
- # Break xattrs
- echo -n bad-val > $dir/bad-val
- objectstore_tool $dir $osd $objname set-attr _key1-$objname $dir/bad-val || return 1
- objectstore_tool $dir $osd $objname rm-attr _key2-$objname || return 1
- echo -n val3-$objname > $dir/newval
- objectstore_tool $dir $osd $objname set-attr _key3-$objname $dir/newval || return 1
- rm $dir/bad-val $dir/newval
- ;;
-
- 9)
- objectstore_tool $dir $osd $objname get-attr _ > $dir/robj9-oi
- echo -n D > $dir/change
- rados --pool $poolname put $objname $dir/change
- objectstore_tool $dir $osd $objname set-attr _ $dir/robj9-oi
- rm $dir/oi $dir/change
- ;;
-
- # ROBJ10 must be handled after digests are re-computed by a deep scrub below
- # ROBJ11 must be handled with config change before deep scrub
- # ROBJ12 must be handled with config change before scrubs
- # ROBJ13 must be handled before scrubs
-
- 14)
- echo -n bad-val > $dir/bad-val
- objectstore_tool $dir 0 $objname set-attr _ $dir/bad-val || return 1
- objectstore_tool $dir 1 $objname rm-attr _ || return 1
- rm $dir/bad-val
- ;;
-
- 15)
- objectstore_tool $dir $osd $objname rm-attr _ || return 1
-
- esac
- done
-
- local pg=$(get_pg $poolname ROBJ0)
-
- set_config osd 0 filestore_debug_inject_read_err true || return 1
- set_config osd 1 filestore_debug_inject_read_err true || return 1
- CEPH_ARGS='' ceph --admin-daemon $dir/ceph-osd.1.asok \
- injectdataerr $poolname ROBJ11 || return 1
- CEPH_ARGS='' ceph --admin-daemon $dir/ceph-osd.0.asok \
- injectmdataerr $poolname ROBJ12 || return 1
- CEPH_ARGS='' ceph --admin-daemon $dir/ceph-osd.0.asok \
- injectmdataerr $poolname ROBJ13 || return 1
- CEPH_ARGS='' ceph --admin-daemon $dir/ceph-osd.1.asok \
- injectdataerr $poolname ROBJ13 || return 1
-
- pg_scrub $pg
-
- rados list-inconsistent-pg $poolname > $dir/json || return 1
- # Check pg count
- test $(jq '. | length' $dir/json) = "1" || return 1
- # Check pgid
- test $(jq -r '.[0]' $dir/json) = $pg || return 1
-
- rados list-inconsistent-obj $pg > $dir/json || return 1
- # Get epoch for repair-get requests
- epoch=$(jq .epoch $dir/json)
-
- jq "$jqfilter" << EOF | python -c "$sortkeys" | sed -e "$sedfilter" > $dir/checkcsjson
-{
- "inconsistents": [
- {
- "shards": [
- {
- "size": 7,
- "errors": [],
- "osd": 0
- },
- {
- "size": 9,
- "errors": [
- "size_mismatch_oi"
- ],
- "osd": 1
- }
- ],
- "selected_object_info": "3:ce3f1d6a:::ROBJ1:head(47'54 osd.0.0:53 dirty|omap|data_digest|omap_digest s 7 uv 3 dd 2ddbf8f5 od f5fba2c6 alloc_hint [0 0 0])",
- "union_shard_errors": [
- "size_mismatch_oi"
- ],
- "errors": [
- "size_mismatch"
- ],
- "object": {
- "version": 3,
- "snap": "head",
- "locator": "",
- "nspace": "",
- "name": "ROBJ1"
- }
- },
- {
- "shards": [
- {
- "errors": [
- "stat_error"
- ],
- "osd": 0
- },
- {
- "size": 7,
- "errors": [],
- "osd": 1
- }
- ],
- "selected_object_info": "3:bc819597:::ROBJ12:head(47'52 osd.0.0:51 dirty|omap|data_digest|omap_digest s 7 uv 36 dd 2ddbf8f5 od 67f306a alloc_hint [0 0 0])",
- "union_shard_errors": [
- "stat_error"
- ],
- "errors": [],
- "object": {
- "version": 36,
- "snap": "head",
- "locator": "",
- "nspace": "",
- "name": "ROBJ12"
- }
- },
- {
- "shards": [
- {
- "errors": [
- "stat_error"
- ],
- "osd": 0
- },
- {
- "size": 7,
- "errors": [],
- "osd": 1
- }
- ],
- "selected_object_info": "3:d60617f9:::ROBJ13:head(47'55 osd.0.0:54 dirty|omap|data_digest|omap_digest s 7 uv 39 dd 2ddbf8f5 od 6441854d alloc_hint [0 0 0])",
- "union_shard_errors": [
- "stat_error"
- ],
- "errors": [],
- "object": {
- "version": 39,
- "snap": "head",
- "locator": "",
- "nspace": "",
- "name": "ROBJ13"
- }
- },
- {
- "shards": [
- {
- "size": 7,
- "errors": [
- "oi_attr_corrupted"
- ],
- "osd": 0
- },
- {
- "size": 7,
- "errors": [
- "oi_attr_missing"
- ],
- "osd": 1
- }
- ],
- "union_shard_errors": [
- "oi_attr_missing",
- "oi_attr_corrupted"
- ],
- "errors": [],
- "object": {
- "version": 0,
- "snap": "head",
- "locator": "",
- "nspace": "",
- "name": "ROBJ14"
- }
- },
- {
- "shards": [
- {
- "attrs": [
- {
- "Base64": true,
- "value": "",
- "name": "_"
- },
- {
- "Base64": true,
- "value": "AwIdAAAAAAAAAAAAAAABAAAAAAAAAAAAAAAAAAAAAAAAAAA=",
- "name": "snapset"
- }
- ],
- "size": 7,
- "errors": [],
- "osd": 0
- },
- {
- "attrs": [
- {
- "Base64": true,
- "value": "AwIdAAAAAAAAAAAAAAABAAAAAAAAAAAAAAAAAAAAAAAAAAA=",
- "name": "snapset"
- }
- ],
- "size": 7,
- "errors": [
- "oi_attr_missing"
- ],
- "osd": 1
- }
- ],
- "selected_object_info": "3:30259878:::ROBJ15:head(47'46 osd.0.0:45 dirty|omap|data_digest|omap_digest s 7 uv 45 dd 2ddbf8f5 od 2d2a4d6e alloc_hint [0 0 0])",
- "union_shard_errors": [
- "oi_attr_missing"
- ],
- "errors": [
- "attr_name_mismatch"
- ],
- "object": {
- "version": 45,
- "snap": "head",
- "locator": "",
- "nspace": "",
- "name": "ROBJ15"
- }
- },
- {
- "shards": [
- {
- "size": 7,
- "errors": [],
- "osd": 0
- },
- {
- "errors": [
- "missing"
- ],
- "osd": 1
- }
- ],
- "selected_object_info": "3:f2a5b2a4:::ROBJ3:head(47'57 osd.0.0:56 dirty|omap|data_digest|omap_digest s 7 uv 9 dd 2ddbf8f5 od b35dfd alloc_hint [0 0 0])",
- "union_shard_errors": [
- "missing"
- ],
- "errors": [],
- "object": {
- "version": 9,
- "snap": "head",
- "locator": "",
- "nspace": "",
- "name": "ROBJ3"
- }
- },
- {
- "shards": [
- {
- "attrs": [
- {
- "Base64": true,
- "value": "",
- "name": "_"
- },
- {
- "Base64": false,
- "value": "bad-val",
- "name": "_key1-ROBJ8"
- },
- {
- "Base64": false,
- "value": "val3-ROBJ8",
- "name": "_key3-ROBJ8"
- },
- {
- "Base64": true,
- "value": "AwIdAAAAAAAAAAAAAAABAAAAAAAAAAAAAAAAAAAAAAAAAAA=",
- "name": "snapset"
- }
- ],
- "size": 7,
- "errors": [],
- "osd": 0
- },
- {
- "attrs": [
- {
- "Base64": true,
- "value": "",
- "name": "_"
- },
- {
- "Base64": false,
- "value": "val1-ROBJ8",
- "name": "_key1-ROBJ8"
- },
- {
- "Base64": false,
- "value": "val2-ROBJ8",
- "name": "_key2-ROBJ8"
- },
- {
- "Base64": true,
- "value": "AwIdAAAAAAAAAAAAAAABAAAAAAAAAAAAAAAAAAAAAAAAAAA=",
- "name": "snapset"
- }
- ],
- "size": 7,
- "errors": [],
- "osd": 1
- }
- ],
- "selected_object_info": "3:86586531:::ROBJ8:head(82'62 client.4351.0:1 dirty|omap|data_digest|omap_digest s 7 uv 62 dd 2ddbf8f5 od d6be81dc alloc_hint [0 0 0])",
- "union_shard_errors": [],
- "errors": [
- "attr_value_mismatch",
- "attr_name_mismatch"
- ],
- "object": {
- "version": 62,
- "snap": "head",
- "locator": "",
- "nspace": "",
- "name": "ROBJ8"
- }
- },
- {
- "shards": [
- {
- "attrs": [
- {
- "Base64": true,
- "value": "",
- "name": "_"
- },
- {
- "Base64": true,
- "value": "AwIdAAAAAAAAAAAAAAABAAAAAAAAAAAAAAAAAAAAAAAAAAA=",
- "name": "snapset"
- }
- ],
- "object_info": "3:ffdb2004:::ROBJ9:head(102'63 client.4433.0:1 dirty|omap|data_digest|omap_digest s 1 uv 63 dd 2b63260d od 2eecc539 alloc_hint [0 0 0])",
- "size": 1,
- "errors": [],
- "osd": 0
- },
- {
- "attrs": [
- {
- "Base64": true,
- "value": "",
- "name": "_"
- },
- {
- "Base64": true,
- "value": "AwIdAAAAAAAAAAAAAAABAAAAAAAAAAAAAAAAAAAAAAAAAAA=",
- "name": "snapset"
- }
- ],
- "object_info": "3:ffdb2004:::ROBJ9:head(47'60 osd.0.0:59 dirty|omap|data_digest|omap_digest s 7 uv 27 dd 2ddbf8f5 od 2eecc539 alloc_hint [0 0 0])",
- "size": 1,
- "errors": [],
- "osd": 1
- }
- ],
- "selected_object_info": "3:ffdb2004:::ROBJ9:head(102'63 client.4433.0:1 dirty|omap|data_digest|omap_digest s 1 uv 63 dd 2b63260d od 2eecc539 alloc_hint [0 0 0])",
- "union_shard_errors": [],
- "errors": [
- "object_info_inconsistency",
- "attr_value_mismatch"
- ],
- "object": {
- "version": 63,
- "snap": "head",
- "locator": "",
- "nspace": "",
- "name": "ROBJ9"
- }
- }
- ],
- "epoch": 0
-}
-EOF
-
- jq "$jqfilter" $dir/json | python -c "$sortkeys" | sed -e "$sedfilter" > $dir/csjson
- diff ${DIFFCOLOPTS} $dir/checkcsjson $dir/csjson || test $getjson = "yes" || return 1
- if test $getjson = "yes"
- then
- jq '.' $dir/json > save1.json
- fi
-
- if which jsonschema > /dev/null;
- then
- jsonschema -i $dir/json $CEPH_ROOT/doc/rados/command/list-inconsistent-obj.json || return 1
- fi
-
- objname=ROBJ9
- # Change data and size again because digest was recomputed
- echo -n ZZZ > $dir/change
- rados --pool $poolname put $objname $dir/change
- # Set one to an even older value
- objectstore_tool $dir 0 $objname set-attr _ $dir/robj9-oi
- rm $dir/oi $dir/change
-
- objname=ROBJ10
- objectstore_tool $dir 1 $objname get-attr _ > $dir/oi
- rados --pool $poolname setomapval $objname key2-$objname val2-$objname
- objectstore_tool $dir 0 $objname set-attr _ $dir/oi
- objectstore_tool $dir 1 $objname set-attr _ $dir/oi
- rm $dir/oi
-
- set_config osd 0 filestore_debug_inject_read_err true || return 1
- set_config osd 1 filestore_debug_inject_read_err true || return 1
- CEPH_ARGS='' ceph --admin-daemon $dir/ceph-osd.1.asok \
- injectdataerr $poolname ROBJ11 || return 1
- CEPH_ARGS='' ceph --admin-daemon $dir/ceph-osd.0.asok \
- injectmdataerr $poolname ROBJ12 || return 1
- CEPH_ARGS='' ceph --admin-daemon $dir/ceph-osd.0.asok \
- injectmdataerr $poolname ROBJ13 || return 1
- CEPH_ARGS='' ceph --admin-daemon $dir/ceph-osd.1.asok \
- injectdataerr $poolname ROBJ13 || return 1
- pg_deep_scrub $pg
-
- rados list-inconsistent-pg $poolname > $dir/json || return 1
- # Check pg count
- test $(jq '. | length' $dir/json) = "1" || return 1
- # Check pgid
- test $(jq -r '.[0]' $dir/json) = $pg || return 1
-
- rados list-inconsistent-obj $pg > $dir/json || return 1
- # Get epoch for repair-get requests
- epoch=$(jq .epoch $dir/json)
-
- jq "$jqfilter" << EOF | python -c "$sortkeys" | sed -e "$sedfilter" > $dir/checkcsjson
-{
- "inconsistents": [
- {
- "shards": [
- {
- "data_digest": "0x2ddbf8f5",
- "omap_digest": "0xf5fba2c6",
- "size": 7,
- "errors": [],
- "osd": 0
- },
- {
- "data_digest": "0x2d4a11c2",
- "omap_digest": "0xf5fba2c6",
- "size": 9,
- "errors": [
- "data_digest_mismatch_oi",
- "size_mismatch_oi"
- ],
- "osd": 1
- }
- ],
- "selected_object_info": "3:ce3f1d6a:::ROBJ1:head(47'54 osd.0.0:53 dirty|omap|data_digest|omap_digest s 7 uv 3 dd 2ddbf8f5 od f5fba2c6 alloc_hint [0 0 0])",
- "union_shard_errors": [
- "data_digest_mismatch_oi",
- "size_mismatch_oi"
- ],
- "errors": [
- "data_digest_mismatch",
- "size_mismatch"
- ],
- "object": {
- "version": 3,
- "snap": "head",
- "locator": "",
- "nspace": "",
- "name": "ROBJ1"
- }
- },
- {
- "shards": [
- {
- "data_digest": "0x2ddbf8f5",
- "omap_digest": "0xa8dd5adc",
- "size": 7,
- "errors": [
- "omap_digest_mismatch_oi"
- ],
- "osd": 0
- },
- {
- "data_digest": "0x2ddbf8f5",
- "omap_digest": "0xa8dd5adc",
- "size": 7,
- "errors": [
- "omap_digest_mismatch_oi"
- ],
- "osd": 1
- }
- ],
- "selected_object_info": "3:b1f19cbd:::ROBJ10:head(47'51 osd.0.0:50 dirty|omap|data_digest|omap_digest s 7 uv 30 dd 2ddbf8f5 od c2025a24 alloc_hint [0 0 0])",
- "union_shard_errors": [
- "omap_digest_mismatch_oi"
- ],
- "errors": [],
- "object": {
- "version": 30,
- "snap": "head",
- "locator": "",
- "nspace": "",
- "name": "ROBJ10"
- }
- },
- {
- "shards": [
- {
- "data_digest": "0x2ddbf8f5",
- "omap_digest": "0xa03cef03",
- "size": 7,
- "errors": [],
- "osd": 0
- },
- {
- "size": 7,
- "errors": [
- "read_error"
- ],
- "osd": 1
- }
- ],
- "selected_object_info": "3:87abbf36:::ROBJ11:head(47'48 osd.0.0:47 dirty|omap|data_digest|omap_digest s 7 uv 33 dd 2ddbf8f5 od a03cef03 alloc_hint [0 0 0])",
- "union_shard_errors": [
- "read_error"
- ],
- "errors": [],
- "object": {
- "version": 33,
- "snap": "head",
- "locator": "",
- "nspace": "",
- "name": "ROBJ11"
- }
- },
- {
- "shards": [
- {
- "errors": [
- "stat_error"
- ],
- "osd": 0
- },
- {
- "data_digest": "0x2ddbf8f5",
- "omap_digest": "0x067f306a",
- "size": 7,
- "errors": [],
- "osd": 1
- }
- ],
- "selected_object_info": "3:bc819597:::ROBJ12:head(47'52 osd.0.0:51 dirty|omap|data_digest|omap_digest s 7 uv 36 dd 2ddbf8f5 od 67f306a alloc_hint [0 0 0])",
- "union_shard_errors": [
- "stat_error"
- ],
- "errors": [],
- "object": {
- "version": 36,
- "snap": "head",
- "locator": "",
- "nspace": "",
- "name": "ROBJ12"
- }
- },
- {
- "shards": [
- {
- "errors": [
- "stat_error"
- ],
- "osd": 0
- },
- {
- "size": 7,
- "errors": [
- "read_error"
- ],
- "osd": 1
- }
- ],
- "union_shard_errors": [
- "stat_error",
- "read_error"
- ],
- "errors": [],
- "object": {
- "version": 0,
- "snap": "head",
- "locator": "",
- "nspace": "",
- "name": "ROBJ13"
- }
- },
- {
- "shards": [
- {
- "data_digest": "0x2ddbf8f5",
- "omap_digest": "0x4f14f849",
- "size": 7,
- "errors": [
- "oi_attr_corrupted"
- ],
- "osd": 0
- },
- {
- "data_digest": "0x2ddbf8f5",
- "omap_digest": "0x4f14f849",
- "size": 7,
- "errors": [
- "oi_attr_missing"
- ],
- "osd": 1
- }
- ],
- "union_shard_errors": [
- "oi_attr_missing",
- "oi_attr_corrupted"
- ],
- "errors": [],
- "object": {
- "version": 0,
- "snap": "head",
- "locator": "",
- "nspace": "",
- "name": "ROBJ14"
- }
- },
- {
- "shards": [
- {
- "attrs": [
- {
- "Base64": true,
- "value": "",
- "name": "_"
- },
- {
- "Base64": true,
- "value": "AwIdAAAAAAAAAAAAAAABAAAAAAAAAAAAAAAAAAAAAAAAAAA=",
- "name": "snapset"
- }
- ],
- "data_digest": "0x2ddbf8f5",
- "omap_digest": "0x2d2a4d6e",
- "size": 7,
- "errors": [],
- "osd": 0
- },
- {
- "attrs": [
- {
- "Base64": true,
- "value": "AwIdAAAAAAAAAAAAAAABAAAAAAAAAAAAAAAAAAAAAAAAAAA=",
- "name": "snapset"
- }
- ],
- "data_digest": "0x2ddbf8f5",
- "omap_digest": "0x2d2a4d6e",
- "size": 7,
- "errors": [
- "oi_attr_missing"
- ],
- "osd": 1
- }
- ],
- "selected_object_info": "3:30259878:::ROBJ15:head(47'46 osd.0.0:45 dirty|omap|data_digest|omap_digest s 7 uv 45 dd 2ddbf8f5 od 2d2a4d6e alloc_hint [0 0 0])",
- "union_shard_errors": [
- "oi_attr_missing"
- ],
- "errors": [
- "attr_name_mismatch"
- ],
- "object": {
- "version": 45,
- "snap": "head",
- "locator": "",
- "nspace": "",
- "name": "ROBJ15"
- }
- },
- {
- "shards": [
- {
- "data_digest": "0x578a4830",
- "omap_digest": "0xf8e11918",
- "size": 7,
- "errors": [
- "data_digest_mismatch_oi"
- ],
- "osd": 0
- },
- {
- "data_digest": "0x2ddbf8f5",
- "omap_digest": "0xf8e11918",
- "size": 7,
- "errors": [],
- "osd": 1
- }
- ],
- "selected_object_info": "3:e97ce31e:::ROBJ2:head(47'56 osd.0.0:55 dirty|omap|data_digest|omap_digest s 7 uv 6 dd 2ddbf8f5 od f8e11918 alloc_hint [0 0 0])",
- "union_shard_errors": [
- "data_digest_mismatch_oi"
- ],
- "errors": [
- "data_digest_mismatch"
- ],
- "object": {
- "version": 6,
- "snap": "head",
- "locator": "",
- "nspace": "",
- "name": "ROBJ2"
- }
- },
- {
- "shards": [
- {
- "data_digest": "0x2ddbf8f5",
- "omap_digest": "0x00b35dfd",
- "size": 7,
- "errors": [],
- "osd": 0
- },
- {
- "errors": [
- "missing"
- ],
- "osd": 1
- }
- ],
- "selected_object_info": "3:f2a5b2a4:::ROBJ3:head(47'57 osd.0.0:56 dirty|omap|data_digest|omap_digest s 7 uv 9 dd 2ddbf8f5 od b35dfd alloc_hint [0 0 0])",
- "union_shard_errors": [
- "missing"
- ],
- "errors": [],
- "object": {
- "version": 9,
- "snap": "head",
- "locator": "",
- "nspace": "",
- "name": "ROBJ3"
- }
- },
- {
- "shards": [
- {
- "data_digest": "0x2ddbf8f5",
- "omap_digest": "0xd7178dfe",
- "size": 7,
- "errors": [
- "omap_digest_mismatch_oi"
- ],
- "osd": 0
- },
- {
- "data_digest": "0x2ddbf8f5",
- "omap_digest": "0xe2d46ea4",
- "size": 7,
- "errors": [],
- "osd": 1
- }
- ],
- "selected_object_info": "3:f4981d31:::ROBJ4:head(47'58 osd.0.0:57 dirty|omap|data_digest|omap_digest s 7 uv 12 dd 2ddbf8f5 od e2d46ea4 alloc_hint [0 0 0])",
- "union_shard_errors": [
- "omap_digest_mismatch_oi"
- ],
- "errors": [
- "omap_digest_mismatch"
- ],
- "object": {
- "version": 12,
- "snap": "head",
- "locator": "",
- "nspace": "",
- "name": "ROBJ4"
- }
- },
- {
- "shards": [
- {
- "data_digest": "0x2ddbf8f5",
- "omap_digest": "0x1a862a41",
- "size": 7,
- "errors": [],
- "osd": 0
- },
- {
- "data_digest": "0x2ddbf8f5",
- "omap_digest": "0x06cac8f6",
- "size": 7,
- "errors": [
- "omap_digest_mismatch_oi"
- ],
- "osd": 1
- }
- ],
- "selected_object_info": "3:f4bfd4d1:::ROBJ5:head(47'59 osd.0.0:58 dirty|omap|data_digest|omap_digest s 7 uv 15 dd 2ddbf8f5 od 1a862a41 alloc_hint [0 0 0])",
- "union_shard_errors": [
- "omap_digest_mismatch_oi"
- ],
- "errors": [
- "omap_digest_mismatch"
- ],
- "object": {
- "version": 15,
- "snap": "head",
- "locator": "",
- "nspace": "",
- "name": "ROBJ5"
- }
- },
- {
- "shards": [
- {
- "data_digest": "0x2ddbf8f5",
- "omap_digest": "0x689ee887",
- "size": 7,
- "errors": [
- "omap_digest_mismatch_oi"
- ],
- "osd": 0
- },
- {
- "data_digest": "0x2ddbf8f5",
- "omap_digest": "0x179c919f",
- "size": 7,
- "errors": [],
- "osd": 1
- }
- ],
- "selected_object_info": "3:a53c12e8:::ROBJ6:head(47'50 osd.0.0:49 dirty|omap|data_digest|omap_digest s 7 uv 18 dd 2ddbf8f5 od 179c919f alloc_hint [0 0 0])",
- "union_shard_errors": [
- "omap_digest_mismatch_oi"
- ],
- "errors": [
- "omap_digest_mismatch"
- ],
- "object": {
- "version": 18,
- "snap": "head",
- "locator": "",
- "nspace": "",
- "name": "ROBJ6"
- }
- },
- {
- "shards": [
- {
- "data_digest": "0x2ddbf8f5",
- "omap_digest": "0xefced57a",
- "size": 7,
- "errors": [],
- "osd": 0
- },
- {
- "data_digest": "0x2ddbf8f5",
- "omap_digest": "0x6a73cc07",
- "size": 7,
- "errors": [
- "omap_digest_mismatch_oi"
- ],
- "osd": 1
- }
- ],
- "selected_object_info": "3:8b55fa4b:::ROBJ7:head(47'49 osd.0.0:48 dirty|omap|data_digest|omap_digest s 7 uv 21 dd 2ddbf8f5 od efced57a alloc_hint [0 0 0])",
- "union_shard_errors": [
- "omap_digest_mismatch_oi"
- ],
- "errors": [
- "omap_digest_mismatch"
- ],
- "object": {
- "version": 21,
- "snap": "head",
- "locator": "",
- "nspace": "",
- "name": "ROBJ7"
- }
- },
- {
- "shards": [
- {
- "attrs": [
- {
- "Base64": true,
- "value": "",
- "name": "_"
- },
- {
- "Base64": false,
- "value": "bad-val",
- "name": "_key1-ROBJ8"
- },
- {
- "Base64": false,
- "value": "val3-ROBJ8",
- "name": "_key3-ROBJ8"
- },
- {
- "Base64": true,
- "value": "AwIdAAAAAAAAAAAAAAABAAAAAAAAAAAAAAAAAAAAAAAAAAA=",
- "name": "snapset"
- }
- ],
- "data_digest": "0x2ddbf8f5",
- "omap_digest": "0xd6be81dc",
- "size": 7,
- "errors": [],
- "osd": 0
- },
- {
- "attrs": [
- {
- "Base64": true,
- "value": "",
- "name": "_"
- },
- {
- "Base64": false,
- "value": "val1-ROBJ8",
- "name": "_key1-ROBJ8"
- },
- {
- "Base64": false,
- "value": "val2-ROBJ8",
- "name": "_key2-ROBJ8"
- },
- {
- "Base64": true,
- "value": "AwIdAAAAAAAAAAAAAAABAAAAAAAAAAAAAAAAAAAAAAAAAAA=",
- "name": "snapset"
- }
- ],
- "data_digest": "0x2ddbf8f5",
- "omap_digest": "0xd6be81dc",
- "size": 7,
- "errors": [],
- "osd": 1
- }
- ],
- "selected_object_info": "3:86586531:::ROBJ8:head(82'62 client.4351.0:1 dirty|omap|data_digest|omap_digest s 7 uv 62 dd 2ddbf8f5 od d6be81dc alloc_hint [0 0 0])",
- "union_shard_errors": [],
- "errors": [
- "attr_value_mismatch",
- "attr_name_mismatch"
- ],
- "object": {
- "version": 62,
- "snap": "head",
- "locator": "",
- "nspace": "",
- "name": "ROBJ8"
- }
- },
- {
- "shards": [
- {
- "attrs": [
- {
- "Base64": true,
- "value": "",
- "name": "_"
- },
- {
- "Base64": true,
- "value": "AwIdAAAAAAAAAAAAAAABAAAAAAAAAAAAAAAAAAAAAAAAAAA=",
- "name": "snapset"
- }
- ],
- "object_info": "3:ffdb2004:::ROBJ9:head(47'60 osd.0.0:59 dirty|omap|data_digest|omap_digest s 7 uv 27 dd 2ddbf8f5 od 2eecc539 alloc_hint [0 0 0])",
- "data_digest": "0x1f26fb26",
- "omap_digest": "0x2eecc539",
- "size": 3,
- "errors": [],
- "osd": 0
- },
- {
- "attrs": [
- {
- "Base64": true,
- "value": "",
- "name": "_"
- },
- {
- "Base64": true,
- "value": "AwIdAAAAAAAAAAAAAAABAAAAAAAAAAAAAAAAAAAAAAAAAAA=",
- "name": "snapset"
- }
- ],
- "object_info": "3:ffdb2004:::ROBJ9:head(122'64 client.4532.0:1 dirty|omap|data_digest|omap_digest s 3 uv 64 dd 1f26fb26 od 2eecc539 alloc_hint [0 0 0])",
- "data_digest": "0x1f26fb26",
- "omap_digest": "0x2eecc539",
- "size": 3,
- "errors": [],
- "osd": 1
- }
- ],
- "selected_object_info": "3:ffdb2004:::ROBJ9:head(122'64 client.4532.0:1 dirty|omap|data_digest|omap_digest s 3 uv 64 dd 1f26fb26 od 2eecc539 alloc_hint [0 0 0])",
- "union_shard_errors": [],
- "errors": [
- "object_info_inconsistency",
- "attr_value_mismatch"
- ],
- "object": {
- "version": 64,
- "snap": "head",
- "locator": "",
- "nspace": "",
- "name": "ROBJ9"
- }
- }
- ],
- "epoch": 0
-}
-EOF
-
- jq "$jqfilter" $dir/json | python -c "$sortkeys" | sed -e "$sedfilter" > $dir/csjson
- diff ${DIFFCOLOPTS} $dir/checkcsjson $dir/csjson || test $getjson = "yes" || return 1
- if test $getjson = "yes"
- then
- jq '.' $dir/json > save2.json
- fi
-
- if which jsonschema > /dev/null;
- then
- jsonschema -i $dir/json $CEPH_ROOT/doc/rados/command/list-inconsistent-obj.json || return 1
- fi
-
- rados rmpool $poolname $poolname --yes-i-really-really-mean-it
- teardown $dir || return 1
-}
-
-
-#
-# Test scrub errors for an erasure coded pool
-#
-function corrupt_scrub_erasure() {
- local dir=$1
- local allow_overwrites=$2
- local poolname=ecpool
- local total_objs=5
-
- setup $dir || return 1
- run_mon $dir a || return 1
- run_mgr $dir x || return 1
- for id in $(seq 0 2) ; do
- if [ "$allow_overwrites" = "true" ]; then
- run_osd_bluestore $dir $id || return 1
- else
- run_osd $dir $id || return 1
- fi
- done
- wait_for_clean || return 1
-
- create_ec_pool $poolname $allow_overwrites k=2 m=1 stripe_unit=2K --force || return 1
-
- for i in $(seq 1 $total_objs) ; do
- objname=EOBJ${i}
- add_something $dir $poolname $objname || return 1
-
- local osd=$(expr $i % 2)
-
- case $i in
- 1)
- # Size (deep scrub data_digest too)
- local payload=UVWXYZZZ
- echo $payload > $dir/CORRUPT
- objectstore_tool $dir $osd $objname set-bytes $dir/CORRUPT || return 1
- ;;
-
- 2)
- # Corrupt EC shard
- dd if=/dev/urandom of=$dir/CORRUPT bs=2048 count=1
- objectstore_tool $dir $osd $objname set-bytes $dir/CORRUPT || return 1
- ;;
-
- 3)
- # missing
- objectstore_tool $dir $osd $objname remove || return 1
- ;;
-
- 4)
- rados --pool $poolname setxattr $objname key1-$objname val1-$objname || return 1
- rados --pool $poolname setxattr $objname key2-$objname val2-$objname || return 1
-
- # Break xattrs
- echo -n bad-val > $dir/bad-val
- objectstore_tool $dir $osd $objname set-attr _key1-$objname $dir/bad-val || return 1
- objectstore_tool $dir $osd $objname rm-attr _key2-$objname || return 1
- echo -n val3-$objname > $dir/newval
- objectstore_tool $dir $osd $objname set-attr _key3-$objname $dir/newval || return 1
- rm $dir/bad-val $dir/newval
- ;;
-
- 5)
- # Corrupt EC shard
- dd if=/dev/urandom of=$dir/CORRUPT bs=2048 count=2
- objectstore_tool $dir $osd $objname set-bytes $dir/CORRUPT || return 1
- ;;
-
- esac
- done
-
- local pg=$(get_pg $poolname EOBJ0)
-
- pg_scrub $pg
-
- rados list-inconsistent-pg $poolname > $dir/json || return 1
- # Check pg count
- test $(jq '. | length' $dir/json) = "1" || return 1
- # Check pgid
- test $(jq -r '.[0]' $dir/json) = $pg || return 1
-
- rados list-inconsistent-obj $pg > $dir/json || return 1
- # Get epoch for repair-get requests
- epoch=$(jq .epoch $dir/json)
-
- jq "$jqfilter" << EOF | python -c "$sortkeys" | sed -e "$sedfilter" > $dir/checkcsjson
-{
- "inconsistents": [
- {
- "shards": [
- {
- "size": 2048,
- "errors": [],
- "shard": 2,
- "osd": 0
- },
- {
- "size": 9,
- "shard": 0,
- "errors": [
- "size_mismatch_oi"
- ],
- "osd": 1
- },
- {
- "size": 2048,
- "shard": 1,
- "errors": [],
- "osd": 2
- }
- ],
- "selected_object_info": "3:9175b684:::EOBJ1:head(21'1 client.4179.0:1 dirty|data_digest|omap_digest s 7 uv 1 dd 2ddbf8f5 od ffffffff alloc_hint [0 0 0])",
- "union_shard_errors": [
- "size_mismatch_oi"
- ],
- "errors": [
- "size_mismatch"
- ],
- "object": {
- "version": 1,
- "snap": "head",
- "locator": "",
- "nspace": "",
- "name": "EOBJ1"
- }
- },
- {
- "shards": [
- {
- "size": 2048,
- "errors": [],
- "shard": 2,
- "osd": 0
- },
- {
- "shard": 0,
- "errors": [
- "missing"
- ],
- "osd": 1
- },
- {
- "size": 2048,
- "shard": 1,
- "errors": [],
- "osd": 2
- }
- ],
- "selected_object_info": "3:b197b25d:::EOBJ3:head(37'3 client.4251.0:1 dirty|data_digest|omap_digest s 7 uv 3 dd 2ddbf8f5 od ffffffff alloc_hint [0 0 0])",
- "union_shard_errors": [
- "missing"
- ],
- "errors": [],
- "object": {
- "version": 3,
- "snap": "head",
- "locator": "",
- "nspace": "",
- "name": "EOBJ3"
- }
- },
- {
- "shards": [
- {
- "attrs": [
- {
- "Base64": true,
- "value": "",
- "name": "_"
- },
- {
- "Base64": false,
- "value": "bad-val",
- "name": "_key1-EOBJ4"
- },
- {
- "Base64": false,
- "value": "val3-EOBJ4",
- "name": "_key3-EOBJ4"
- },
- {
- "Base64": true,
- "value": "AQEYAAAAAAgAAAAAAAADAAAAL6fPBLB8dlsvp88E",
- "name": "hinfo_key"
- },
- {
- "Base64": true,
- "value": "AwIdAAAAAAAAAAAAAAABAAAAAAAAAAAAAAAAAAAAAAAAAAA=",
- "name": "snapset"
- }
- ],
- "size": 2048,
- "errors": [],
- "shard": 2,
- "osd": 0
- },
- {
- "osd": 1,
- "shard": 0,
- "errors": [],
- "size": 2048,
- "attrs": [
- {
- "Base64": true,
- "value": "",
- "name": "_"
- },
- {
- "Base64": false,
- "value": "val1-EOBJ4",
- "name": "_key1-EOBJ4"
- },
- {
- "Base64": false,
- "value": "val2-EOBJ4",
- "name": "_key2-EOBJ4"
- },
- {
- "Base64": true,
- "value": "AQEYAAAAAAgAAAAAAAADAAAAL6fPBLB8dlsvp88E",
- "name": "hinfo_key"
- },
- {
- "Base64": true,
- "value": "AwIdAAAAAAAAAAAAAAABAAAAAAAAAAAAAAAAAAAAAAAAAAA=",
- "name": "snapset"
- }
- ]
- },
- {
- "osd": 2,
- "shard": 1,
- "errors": [],
- "size": 2048,
- "attrs": [
- {
- "Base64": true,
- "value": "",
- "name": "_"
- },
- {
- "Base64": false,
- "value": "val1-EOBJ4",
- "name": "_key1-EOBJ4"
- },
- {
- "Base64": false,
- "value": "val2-EOBJ4",
- "name": "_key2-EOBJ4"
- },
- {
- "Base64": true,
- "value": "AQEYAAAAAAgAAAAAAAADAAAAL6fPBLB8dlsvp88E",
- "name": "hinfo_key"
- },
- {
- "Base64": true,
- "value": "AwIdAAAAAAAAAAAAAAABAAAAAAAAAAAAAAAAAAAAAAAAAAA=",
- "name": "snapset"
- }
- ]
- }
- ],
- "selected_object_info": "3:5e723e06:::EOBJ4:head(45'6 client.4289.0:1 dirty|data_digest|omap_digest s 7 uv 6 dd 2ddbf8f5 od ffffffff alloc_hint [0 0 0])",
- "union_shard_errors": [],
- "errors": [
- "attr_value_mismatch",
- "attr_name_mismatch"
- ],
- "object": {
- "version": 6,
- "snap": "head",
- "locator": "",
- "nspace": "",
- "name": "EOBJ4"
- }
- },
- {
- "shards": [
- {
- "size": 2048,
- "errors": [],
- "shard": 2,
- "osd": 0
- },
- {
- "size": 4096,
- "shard": 0,
- "errors": [
- "size_mismatch_oi"
- ],
- "osd": 1
- },
- {
- "size": 2048,
- "shard": 1,
- "errors": [],
- "osd": 2
- }
- ],
- "selected_object_info": "3:8549dfb5:::EOBJ5:head(65'7 client.4441.0:1 dirty|data_digest|omap_digest s 7 uv 7 dd 2ddbf8f5 od ffffffff alloc_hint [0 0 0])",
- "union_shard_errors": [
- "size_mismatch_oi"
- ],
- "errors": [
- "size_mismatch"
- ],
- "object": {
- "version": 7,
- "snap": "head",
- "locator": "",
- "nspace": "",
- "name": "EOBJ5"
- }
- }
- ],
- "epoch": 0
-}
-EOF
-
- jq "$jqfilter" $dir/json | python -c "$sortkeys" | sed -e "$sedfilter" > $dir/csjson
- diff ${DIFFCOLOPTS} $dir/checkcsjson $dir/csjson || test $getjson = "yes" || return 1
- if test $getjson = "yes"
- then
- jq '.' $dir/json > save3.json
- fi
-
- if which jsonschema > /dev/null;
- then
- jsonschema -i $dir/json $CEPH_ROOT/doc/rados/command/list-inconsistent-obj.json || return 1
- fi
-
- pg_deep_scrub $pg
-
- rados list-inconsistent-pg $poolname > $dir/json || return 1
- # Check pg count
- test $(jq '. | length' $dir/json) = "1" || return 1
- # Check pgid
- test $(jq -r '.[0]' $dir/json) = $pg || return 1
-
- rados list-inconsistent-obj $pg > $dir/json || return 1
- # Get epoch for repair-get requests
- epoch=$(jq .epoch $dir/json)
-
- if [ "$allow_overwrites" = "true" ]
- then
- jq "$jqfilter" << EOF | python -c "$sortkeys" | sed -e "$sedfilter" > $dir/checkcsjson
-{
- "inconsistents": [
- {
- "shards": [
- {
- "data_digest": "0x00000000",
- "omap_digest": "0xffffffff",
- "size": 2048,
- "errors": [],
- "shard": 2,
- "osd": 0
- },
- {
- "size": 9,
- "shard": 0,
- "errors": [
- "read_error",
- "size_mismatch_oi"
- ],
- "osd": 1
- },
- {
- "data_digest": "0x00000000",
- "omap_digest": "0xffffffff",
- "size": 2048,
- "shard": 1,
- "errors": [],
- "osd": 2
- }
- ],
- "selected_object_info": "3:9175b684:::EOBJ1:head(27'1 client.4155.0:1 dirty|data_digest|omap_digest s 7 uv 1 dd 2ddbf8f5 od ffffffff alloc_hint [0 0 0])",
- "union_shard_errors": [
- "read_error",
- "size_mismatch_oi"
- ],
- "errors": [
- "size_mismatch"
- ],
- "object": {
- "version": 1,
- "snap": "head",
- "locator": "",
- "nspace": "",
- "name": "EOBJ1"
- }
- },
- {
- "shards": [
- {
- "data_digest": "0x00000000",
- "omap_digest": "0xffffffff",
- "size": 2048,
- "errors": [],
- "shard": 2,
- "osd": 0
- },
- {
- "shard": 0,
- "errors": [
- "missing"
- ],
- "osd": 1
- },
- {
- "data_digest": "0x00000000",
- "omap_digest": "0xffffffff",
- "size": 2048,
- "shard": 1,
- "errors": [],
- "osd": 2
- }
- ],
- "selected_object_info": "3:b197b25d:::EOBJ3:head(41'3 client.4199.0:1 dirty|data_digest|omap_digest s 7 uv 3 dd 2ddbf8f5 od ffffffff alloc_hint [0 0 0])",
- "union_shard_errors": [
- "missing"
- ],
- "errors": [],
- "object": {
- "version": 3,
- "snap": "head",
- "locator": "",
- "nspace": "",
- "name": "EOBJ3"
- }
- },
- {
- "shards": [
- {
- "attrs": [
- {
- "Base64": true,
- "value": "",
- "name": "_"
- },
- {
- "Base64": false,
- "value": "bad-val",
- "name": "_key1-EOBJ4"
- },
- {
- "Base64": false,
- "value": "val3-EOBJ4",
- "name": "_key3-EOBJ4"
- },
- {
- "Base64": true,
- "value": "AQEYAAAAAAgAAAAAAAADAAAAL6fPBLB8dlsvp88E",
- "name": "hinfo_key"
- },
- {
- "Base64": true,
- "value": "AwIdAAAAAAAAAAAAAAABAAAAAAAAAAAAAAAAAAAAAAAAAAA=",
- "name": "snapset"
- }
- ],
- "data_digest": "0x00000000",
- "omap_digest": "0xffffffff",
- "size": 2048,
- "errors": [],
- "shard": 2,
- "osd": 0
- },
- {
- "attrs": [
- {
- "Base64": true,
- "value": "",
- "name": "_"
- },
- {
- "Base64": false,
- "value": "val1-EOBJ4",
- "name": "_key1-EOBJ4"
- },
- {
- "Base64": false,
- "value": "val2-EOBJ4",
- "name": "_key2-EOBJ4"
- },
- {
- "Base64": true,
- "value": "AQEYAAAAAAgAAAAAAAADAAAAL6fPBLB8dlsvp88E",
- "name": "hinfo_key"
- },
- {
- "Base64": true,
- "value": "AwIdAAAAAAAAAAAAAAABAAAAAAAAAAAAAAAAAAAAAAAAAAA=",
- "name": "snapset"
- }
- ],
- "data_digest": "0x00000000",
- "omap_digest": "0xffffffff",
- "size": 2048,
- "errors": [],
- "shard": 0,
- "osd": 1
- },
- {
- "attrs": [
- {
- "Base64": true,
- "value": "",
- "name": "_"
- },
- {
- "Base64": false,
- "value": "val1-EOBJ4",
- "name": "_key1-EOBJ4"
- },
- {
- "Base64": false,
- "value": "val2-EOBJ4",
- "name": "_key2-EOBJ4"
- },
- {
- "Base64": true,
- "value": "AQEYAAAAAAgAAAAAAAADAAAAL6fPBLB8dlsvp88E",
- "name": "hinfo_key"
- },
- {
- "Base64": true,
- "value": "AwIdAAAAAAAAAAAAAAABAAAAAAAAAAAAAAAAAAAAAAAAAAA=",
- "name": "snapset"
- }
- ],
- "data_digest": "0x00000000",
- "omap_digest": "0xffffffff",
- "size": 2048,
- "errors": [],
- "shard": 1,
- "osd": 2
- }
- ],
- "selected_object_info": "3:5e723e06:::EOBJ4:head(48'6 client.4223.0:1 dirty|data_digest|omap_digest s 7 uv 6 dd 2ddbf8f5 od ffffffff alloc_hint [0 0 0])",
- "union_shard_errors": [],
- "errors": [
- "attr_value_mismatch",
- "attr_name_mismatch"
- ],
- "object": {
- "version": 6,
- "snap": "head",
- "locator": "",
- "nspace": "",
- "name": "EOBJ4"
- }
- },
- {
- "shards": [
- {
- "data_digest": "0x00000000",
- "omap_digest": "0xffffffff",
- "size": 2048,
- "errors": [],
- "shard": 2,
- "osd": 0
- },
- {
- "data_digest": "0x00000000",
- "omap_digest": "0xffffffff",
- "size": 4096,
- "errors": [
- "size_mismatch_oi"
- ],
- "shard": 0,
- "osd": 1
- },
- {
- "data_digest": "0x00000000",
- "omap_digest": "0xffffffff",
- "size": 2048,
- "errors": [],
- "shard": 1,
- "osd": 2
- }
- ],
- "selected_object_info": "3:8549dfb5:::EOBJ5:head(65'7 client.4288.0:1 dirty|data_digest|omap_digest s 7 uv 7 dd 2ddbf8f5 od ffffffff alloc_hint [0 0 0])",
- "union_shard_errors": [
- "size_mismatch_oi"
- ],
- "errors": [
- "size_mismatch"
- ],
- "object": {
- "version": 7,
- "snap": "head",
- "locator": "",
- "nspace": "",
- "name": "EOBJ5"
- }
- }
- ],
- "epoch": 0
-}
-EOF
-
- else
-
- jq "$jqfilter" << EOF | python -c "$sortkeys" | sed -e "$sedfilter" > $dir/checkcsjson
-{
- "inconsistents": [
- {
- "shards": [
- {
- "data_digest": "0x04cfa72f",
- "omap_digest": "0xffffffff",
- "size": 2048,
- "errors": [],
- "shard": 2,
- "osd": 0
- },
- {
- "size": 9,
- "shard": 0,
- "errors": [
- "read_error",
- "size_mismatch_oi"
- ],
- "osd": 1
- },
- {
- "data_digest": "0x04cfa72f",
- "omap_digest": "0xffffffff",
- "size": 2048,
- "shard": 1,
- "errors": [],
- "osd": 2
- }
- ],
- "selected_object_info": "3:9175b684:::EOBJ1:head(21'1 client.4179.0:1 dirty|data_digest|omap_digest s 7 uv 1 dd 2ddbf8f5 od ffffffff alloc_hint [0 0 0])",
- "union_shard_errors": [
- "read_error",
- "size_mismatch_oi"
- ],
- "errors": [
- "size_mismatch"
- ],
- "object": {
- "version": 1,
- "snap": "head",
- "locator": "",
- "nspace": "",
- "name": "EOBJ1"
- }
- },
- {
- "shards": [
- {
- "size": 2048,
- "errors": [
- "ec_hash_error"
- ],
- "shard": 2,
- "osd": 0
- },
- {
- "data_digest": "0x04cfa72f",
- "omap_digest": "0xffffffff",
- "size": 2048,
- "errors": [],
- "shard": 0,
- "osd": 1
- },
- {
- "data_digest": "0x04cfa72f",
- "omap_digest": "0xffffffff",
- "size": 2048,
- "errors": [],
- "shard": 1,
- "osd": 2
- }
- ],
- "selected_object_info": "3:9babd184:::EOBJ2:head(29'2 client.4217.0:1 dirty|data_digest|omap_digest s 7 uv 2 dd 2ddbf8f5 od ffffffff alloc_hint [0 0 0])",
- "union_shard_errors": [
- "ec_hash_error"
- ],
- "errors": [],
- "object": {
- "version": 2,
- "snap": "head",
- "locator": "",
- "nspace": "",
- "name": "EOBJ2"
- }
- },
- {
- "shards": [
- {
- "data_digest": "0x04cfa72f",
- "omap_digest": "0xffffffff",
- "size": 2048,
- "errors": [],
- "shard": 2,
- "osd": 0
- },
- {
- "osd": 1,
- "shard": 0,
- "errors": [
- "missing"
- ]
- },
- {
- "data_digest": "0x04cfa72f",
- "omap_digest": "0xffffffff",
- "size": 2048,
- "shard": 1,
- "errors": [],
- "osd": 2
- }
- ],
- "selected_object_info": "3:b197b25d:::EOBJ3:head(37'3 client.4251.0:1 dirty|data_digest|omap_digest s 7 uv 3 dd 2ddbf8f5 od ffffffff alloc_hint [0 0 0])",
- "union_shard_errors": [
- "missing"
- ],
- "errors": [],
- "object": {
- "version": 3,
- "snap": "head",
- "locator": "",
- "nspace": "",
- "name": "EOBJ3"
- }
- },
- {
- "shards": [
- {
- "attrs": [
- {
- "Base64": true,
- "value": "",
- "name": "_"
- },
- {
- "Base64": false,
- "value": "bad-val",
- "name": "_key1-EOBJ4"
- },
- {
- "Base64": false,
- "value": "val3-EOBJ4",
- "name": "_key3-EOBJ4"
- },
- {
- "Base64": true,
- "value": "AQEYAAAAAAgAAAAAAAADAAAAL6fPBLB8dlsvp88E",
- "name": "hinfo_key"
- },
- {
- "Base64": true,
- "value": "AwIdAAAAAAAAAAAAAAABAAAAAAAAAAAAAAAAAAAAAAAAAAA=",
- "name": "snapset"
- }
- ],
- "data_digest": "0x04cfa72f",
- "omap_digest": "0xffffffff",
- "size": 2048,
- "errors": [],
- "shard": 2,
- "osd": 0
- },
- {
- "osd": 1,
- "shard": 0,
- "errors": [],
- "size": 2048,
- "omap_digest": "0xffffffff",
- "data_digest": "0x04cfa72f",
- "attrs": [
- {
- "Base64": true,
- "value": "",
- "name": "_"
- },
- {
- "Base64": false,
- "value": "val1-EOBJ4",
- "name": "_key1-EOBJ4"
- },
- {
- "Base64": false,
- "value": "val2-EOBJ4",
- "name": "_key2-EOBJ4"
- },
- {
- "Base64": true,
- "value": "AQEYAAAAAAgAAAAAAAADAAAAL6fPBLB8dlsvp88E",
- "name": "hinfo_key"
- },
- {
- "Base64": true,
- "value": "AwIdAAAAAAAAAAAAAAABAAAAAAAAAAAAAAAAAAAAAAAAAAA=",
- "name": "snapset"
- }
- ]
- },
- {
- "osd": 2,
- "shard": 1,
- "errors": [],
- "size": 2048,
- "omap_digest": "0xffffffff",
- "data_digest": "0x04cfa72f",
- "attrs": [
- {
- "Base64": true,
- "value": "",
- "name": "_"
- },
- {
- "Base64": false,
- "value": "val1-EOBJ4",
- "name": "_key1-EOBJ4"
- },
- {
- "Base64": false,
- "value": "val2-EOBJ4",
- "name": "_key2-EOBJ4"
- },
- {
- "Base64": true,
- "value": "AQEYAAAAAAgAAAAAAAADAAAAL6fPBLB8dlsvp88E",
- "name": "hinfo_key"
- },
- {
- "Base64": true,
- "value": "AwIdAAAAAAAAAAAAAAABAAAAAAAAAAAAAAAAAAAAAAAAAAA=",
- "name": "snapset"
- }
- ]
- }
- ],
- "selected_object_info": "3:5e723e06:::EOBJ4:head(45'6 client.4289.0:1 dirty|data_digest|omap_digest s 7 uv 6 dd 2ddbf8f5 od ffffffff alloc_hint [0 0 0])",
- "union_shard_errors": [],
- "errors": [
- "attr_value_mismatch",
- "attr_name_mismatch"
- ],
- "object": {
- "version": 6,
- "snap": "head",
- "locator": "",
- "nspace": "",
- "name": "EOBJ4"
- }
- },
- {
- "shards": [
- {
- "data_digest": "0x04cfa72f",
- "omap_digest": "0xffffffff",
- "size": 2048,
- "errors": [],
- "shard": 2,
- "osd": 0
- },
- {
- "size": 4096,
- "shard": 0,
- "errors": [
- "size_mismatch_oi",
- "ec_size_error"
- ],
- "osd": 1
- },
- {
- "data_digest": "0x04cfa72f",
- "omap_digest": "0xffffffff",
- "size": 2048,
- "shard": 1,
- "errors": [],
- "osd": 2
- }
- ],
- "selected_object_info": "3:8549dfb5:::EOBJ5:head(65'7 client.4441.0:1 dirty|data_digest|omap_digest s 7 uv 7 dd 2ddbf8f5 od ffffffff alloc_hint [0 0 0])",
- "union_shard_errors": [
- "size_mismatch_oi",
- "ec_size_error"
- ],
- "errors": [
- "size_mismatch"
- ],
- "object": {
- "version": 7,
- "snap": "head",
- "locator": "",
- "nspace": "",
- "name": "EOBJ5"
- }
- }
- ],
- "epoch": 0
-}
-EOF
-
- fi
-
- jq "$jqfilter" $dir/json | python -c "$sortkeys" | sed -e "$sedfilter" > $dir/csjson
- diff ${DIFFCOLOPTS} $dir/checkcsjson $dir/csjson || test $getjson = "yes" || return 1
- if test $getjson = "yes"
- then
- if [ "$allow_overwrites" = "true" ]
- then
- num=4
- else
- num=5
- fi
- jq '.' $dir/json > save${num}.json
- fi
-
- if which jsonschema > /dev/null;
- then
- jsonschema -i $dir/json $CEPH_ROOT/doc/rados/command/list-inconsistent-obj.json || return 1
- fi
-
- rados rmpool $poolname $poolname --yes-i-really-really-mean-it
- teardown $dir || return 1
-}
-
-function TEST_corrupt_scrub_erasure_appends() {
- corrupt_scrub_erasure $1 false
-}
-
-function TEST_corrupt_scrub_erasure_overwrites() {
- if [ "$use_ec_overwrite" = "true" ]; then
- corrupt_scrub_erasure $1 true
- fi
-}
-
-#
-# Test to make sure that a periodic scrub won't cause deep-scrub info to be lost
-#
-function TEST_periodic_scrub_replicated() {
- local dir=$1
- local poolname=psr_pool
- local objname=POBJ
-
- setup $dir || return 1
- run_mon $dir a --osd_pool_default_size=2 || return 1
- run_mgr $dir x || return 1
- local ceph_osd_args="--osd-scrub-interval-randomize-ratio=0 --osd-deep-scrub-randomize-ratio=0"
- run_osd $dir 0 $ceph_osd_args || return 1
- run_osd $dir 1 $ceph_osd_args || return 1
- wait_for_clean || return 1
-
- ceph osd pool create $poolname 1 1 || return 1
- wait_for_clean || return 1
-
- local osd=0
- add_something $dir $poolname $objname scrub || return 1
- local primary=$(get_primary $poolname $objname)
- local pg=$(get_pg $poolname $objname)
-
- # Add deep-scrub only error
- local payload=UVWXYZ
- echo $payload > $dir/CORRUPT
- # Uses $ceph_osd_args for osd restart
- objectstore_tool $dir $osd $objname set-bytes $dir/CORRUPT || return 1
-
- # No scrub information available, so expect failure
- set -o pipefail
- ! rados list-inconsistent-obj $pg | jq '.' || return 1
- set +o pipefail
-
- pg_deep_scrub $pg || return 1
-
- # Make sure bad object found
- rados list-inconsistent-obj $pg | jq '.' | grep -q $objname || return 1
-
- local last_scrub=$(get_last_scrub_stamp $pg)
- # Fake a schedule scrub
- CEPH_ARGS='' ceph --admin-daemon $dir/ceph-osd.${primary}.asok \
- trigger_scrub $pg || return 1
- # Wait for schedule regular scrub
- wait_for_scrub $pg "$last_scrub"
-
- # It needed to be upgraded
- grep -q "Deep scrub errors, upgrading scrub to deep-scrub" $dir/osd.${primary}.log || return 1
-
- # Bad object still known
- rados list-inconsistent-obj $pg | jq '.' | grep -q $objname || return 1
-
- # Can't upgrade with this set
- ceph osd set nodeep-scrub
- # Let map change propagate to OSDs
- sleep 2
-
- # Fake a schedule scrub
- local last_scrub=$(get_last_scrub_stamp $pg)
- CEPH_ARGS='' ceph --admin-daemon $dir/ceph-osd.${primary}.asok \
- trigger_scrub $pg || return 1
- # Wait for schedule regular scrub
- # to notice scrub and skip it
- local found=false
- for i in $(seq 14 -1 0)
- do
- sleep 1
- ! grep -q "Regular scrub skipped due to deep-scrub errors and nodeep-scrub set" $dir/osd.${primary}.log || { found=true ; break; }
- echo Time left: $i seconds
- done
- test $found = "true" || return 1
-
- # Bad object still known
- rados list-inconsistent-obj $pg | jq '.' | grep -q $objname || return 1
-
- # Request a regular scrub and it will be done
- local scrub_backoff_ratio=$(get_config osd ${primary} osd_scrub_backoff_ratio)
- set_config osd ${primary} osd_scrub_backoff_ratio 0
- pg_scrub $pg
- sleep 1
- set_config osd ${primary} osd_scrub_backoff_ratio $scrub_backoff_ratio
- grep -q "Regular scrub request, losing deep-scrub details" $dir/osd.${primary}.log || return 1
-
- # deep-scrub error is no longer present
- rados list-inconsistent-obj $pg | jq '.' | grep -qv $objname || return 1
-}
-
-
-main osd-scrub-repair "$@"
-
-# Local Variables:
-# compile-command: "cd ../.. ; make -j4 && \
-# test/osd/osd-scrub-repair.sh # TEST_corrupt_and_repair_replicated"
-# End:
+++ /dev/null
-#! /bin/bash
-#
-# Copyright (C) 2015 Red Hat <contact@redhat.com>
-#
-# Author: David Zafman <dzafman@redhat.com>
-#
-# This program is free software; you can redistribute it and/or modify
-# it under the terms of the GNU Library Public License as published by
-# the Free Software Foundation; either version 2, or (at your option)
-# any later version.
-#
-# This program is distributed in the hope that it will be useful,
-# but WITHOUT ANY WARRANTY; without even the implied warranty of
-# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
-# GNU Library Public License for more details.
-#
-source $(dirname $0)/../detect-build-env-vars.sh
-source $CEPH_ROOT/qa/workunits/ceph-helpers.sh
-
-function run() {
- local dir=$1
- shift
-
- export CEPH_MON="127.0.0.1:7121" # git grep '\<7121\>' : there must be only one
- export CEPH_ARGS
- CEPH_ARGS+="--fsid=$(uuidgen) --auth-supported=none "
- CEPH_ARGS+="--mon-host=$CEPH_MON "
-
- local funcs=${@:-$(set | sed -n -e 's/^\(TEST_[0-9a-z_]*\) .*/\1/p')}
- for func in $funcs ; do
- $func $dir || return 1
- done
-}
-
-function TEST_scrub_snaps() {
- local dir=$1
- local poolname=test
-
- TESTDATA="testdata.$$"
-
- setup $dir || return 1
- run_mon $dir a --osd_pool_default_size=1 || return 1
- run_mgr $dir x || return 1
- run_osd $dir 0 || return 1
-
- wait_for_clean || return 1
-
- # Create a pool with a single pg
- ceph osd pool create $poolname 1 1
- poolid=$(ceph osd dump | grep "^pool.*[']test[']" | awk '{ print $2 }')
-
- dd if=/dev/urandom of=$TESTDATA bs=1032 count=1
- for i in `seq 1 15`
- do
- rados -p $poolname put obj${i} $TESTDATA
- done
-
- SNAP=1
- rados -p $poolname mksnap snap${SNAP}
- dd if=/dev/urandom of=$TESTDATA bs=256 count=${SNAP}
- rados -p $poolname put obj1 $TESTDATA
- rados -p $poolname put obj5 $TESTDATA
- rados -p $poolname put obj3 $TESTDATA
- for i in `seq 6 14`
- do rados -p $poolname put obj${i} $TESTDATA
- done
-
- SNAP=2
- rados -p $poolname mksnap snap${SNAP}
- dd if=/dev/urandom of=$TESTDATA bs=256 count=${SNAP}
- rados -p $poolname put obj5 $TESTDATA
-
- SNAP=3
- rados -p $poolname mksnap snap${SNAP}
- dd if=/dev/urandom of=$TESTDATA bs=256 count=${SNAP}
- rados -p $poolname put obj3 $TESTDATA
-
- SNAP=4
- rados -p $poolname mksnap snap${SNAP}
- dd if=/dev/urandom of=$TESTDATA bs=256 count=${SNAP}
- rados -p $poolname put obj5 $TESTDATA
- rados -p $poolname put obj2 $TESTDATA
-
- SNAP=5
- rados -p $poolname mksnap snap${SNAP}
- SNAP=6
- rados -p $poolname mksnap snap${SNAP}
- dd if=/dev/urandom of=$TESTDATA bs=256 count=${SNAP}
- rados -p $poolname put obj5 $TESTDATA
-
- SNAP=7
- rados -p $poolname mksnap snap${SNAP}
-
- rados -p $poolname rm obj4
- rados -p $poolname rm obj2
-
- kill_daemons $dir TERM osd || return 1
-
- # Don't need to ceph_objectstore_tool function because osd stopped
-
- JSON="$(ceph-objectstore-tool --data-path $dir/0 --journal-path $dir/0/journal --head --op list obj1)"
- ceph-objectstore-tool --data-path $dir/0 --journal-path $dir/0/journal "$JSON" --force remove
-
- JSON="$(ceph-objectstore-tool --data-path $dir/0 --journal-path $dir/0/journal --op list obj5 | grep \"snapid\":2)"
- ceph-objectstore-tool --data-path $dir/0 --journal-path $dir/0/journal "$JSON" remove
-
- JSON="$(ceph-objectstore-tool --data-path $dir/0 --journal-path $dir/0/journal --op list obj5 | grep \"snapid\":1)"
- OBJ5SAVE="$JSON"
- ceph-objectstore-tool --data-path $dir/0 --journal-path $dir/0/journal "$JSON" remove
-
- JSON="$(ceph-objectstore-tool --data-path $dir/0 --journal-path $dir/0/journal --op list obj5 | grep \"snapid\":4)"
- dd if=/dev/urandom of=$TESTDATA bs=256 count=18
- ceph-objectstore-tool --data-path $dir/0 --journal-path $dir/0/journal "$JSON" set-bytes $TESTDATA
-
- JSON="$(ceph-objectstore-tool --data-path $dir/0 --journal-path $dir/0/journal --head --op list obj3)"
- dd if=/dev/urandom of=$TESTDATA bs=256 count=15
- ceph-objectstore-tool --data-path $dir/0 --journal-path $dir/0/journal "$JSON" set-bytes $TESTDATA
-
- JSON="$(ceph-objectstore-tool --data-path $dir/0 --journal-path $dir/0/journal --op list obj4 | grep \"snapid\":7)"
- ceph-objectstore-tool --data-path $dir/0 --journal-path $dir/0/journal "$JSON" remove
-
- JSON="$(ceph-objectstore-tool --data-path $dir/0 --journal-path $dir/0/journal --head --op list obj2)"
- ceph-objectstore-tool --data-path $dir/0 --journal-path $dir/0/journal "$JSON" rm-attr snapset
-
- # Create a clone which isn't in snapset and doesn't have object info
- JSON="$(echo "$OBJ5SAVE" | sed s/snapid\":1/snapid\":7/)"
- dd if=/dev/urandom of=$TESTDATA bs=256 count=7
- ceph-objectstore-tool --data-path $dir/0 --journal-path $dir/0/journal "$JSON" set-bytes $TESTDATA
-
- rm -f $TESTDATA
-
- JSON="$(ceph-objectstore-tool --data-path $dir/0 --journal-path $dir/0/journal --head --op list obj6)"
- ceph-objectstore-tool --data-path $dir/0 --journal-path $dir/0/journal "$JSON" clear-snapset
- JSON="$(ceph-objectstore-tool --data-path $dir/0 --journal-path $dir/0/journal --head --op list obj7)"
- ceph-objectstore-tool --data-path $dir/0 --journal-path $dir/0/journal "$JSON" clear-snapset corrupt
- JSON="$(ceph-objectstore-tool --data-path $dir/0 --journal-path $dir/0/journal --head --op list obj8)"
- ceph-objectstore-tool --data-path $dir/0 --journal-path $dir/0/journal "$JSON" clear-snapset seq
- JSON="$(ceph-objectstore-tool --data-path $dir/0 --journal-path $dir/0/journal --head --op list obj9)"
- ceph-objectstore-tool --data-path $dir/0 --journal-path $dir/0/journal "$JSON" clear-snapset clone_size
- JSON="$(ceph-objectstore-tool --data-path $dir/0 --journal-path $dir/0/journal --head --op list obj10)"
- ceph-objectstore-tool --data-path $dir/0 --journal-path $dir/0/journal "$JSON" clear-snapset clone_overlap
- JSON="$(ceph-objectstore-tool --data-path $dir/0 --journal-path $dir/0/journal --head --op list obj11)"
- ceph-objectstore-tool --data-path $dir/0 --journal-path $dir/0/journal "$JSON" clear-snapset clones
- JSON="$(ceph-objectstore-tool --data-path $dir/0 --journal-path $dir/0/journal --head --op list obj12)"
- ceph-objectstore-tool --data-path $dir/0 --journal-path $dir/0/journal "$JSON" clear-snapset head
- JSON="$(ceph-objectstore-tool --data-path $dir/0 --journal-path $dir/0/journal --head --op list obj13)"
- ceph-objectstore-tool --data-path $dir/0 --journal-path $dir/0/journal "$JSON" clear-snapset snaps
- JSON="$(ceph-objectstore-tool --data-path $dir/0 --journal-path $dir/0/journal --head --op list obj14)"
- ceph-objectstore-tool --data-path $dir/0 --journal-path $dir/0/journal "$JSON" clear-snapset size
-
- echo "garbage" > $dir/bad
- JSON="$(ceph-objectstore-tool --data-path $dir/0 --journal-path $dir/0/journal --head --op list obj15)"
- ceph-objectstore-tool --data-path $dir/0 --journal-path $dir/0/journal "$JSON" set-attr snapset $dir/bad
- rm -f $dir/bad
-
- run_osd $dir 0 || return 1
- wait_for_clean || return 1
-
- local pgid="${poolid}.0"
- if ! pg_scrub "$pgid" ; then
- cat $dir/osd.0.log
- return 1
- fi
- grep 'log_channel' $dir/osd.0.log
-
- rados list-inconsistent-pg $poolname > $dir/json || return 1
- # Check pg count
- test $(jq '. | length' $dir/json) = "1" || return 1
- # Check pgid
- test $(jq -r '.[0]' $dir/json) = $pgid || return 1
-
- rados list-inconsistent-snapset $pgid > $dir/json || return 1
- test $(jq '.inconsistents | length' $dir/json) = "21" || return 1
-
- local jqfilter='.inconsistents'
- local sortkeys='import json; import sys ; JSON=sys.stdin.read() ; ud = json.loads(JSON) ; print json.dumps(ud, sort_keys=True, indent=2)'
-
- jq "$jqfilter" << EOF | python -c "$sortkeys" > $dir/checkcsjson
-{
- "inconsistents": [
- {
- "errors": [
- "headless"
- ],
- "snap": 1,
- "locator": "",
- "nspace": "",
- "name": "obj1"
- },
- {
- "errors": [
- "size_mismatch"
- ],
- "snap": 1,
- "locator": "",
- "nspace": "",
- "name": "obj10"
- },
- {
- "errors": [
- "headless"
- ],
- "snap": 1,
- "locator": "",
- "nspace": "",
- "name": "obj11"
- },
- {
- "errors": [
- "size_mismatch"
- ],
- "snap": 1,
- "locator": "",
- "nspace": "",
- "name": "obj14"
- },
- {
- "errors": [
- "headless"
- ],
- "snap": 1,
- "locator": "",
- "nspace": "",
- "name": "obj6"
- },
- {
- "errors": [
- "headless"
- ],
- "snap": 1,
- "locator": "",
- "nspace": "",
- "name": "obj7"
- },
- {
- "errors": [
- "size_mismatch"
- ],
- "snap": 1,
- "locator": "",
- "nspace": "",
- "name": "obj9"
- },
- {
- "errors": [
- "headless"
- ],
- "snap": 4,
- "locator": "",
- "nspace": "",
- "name": "obj2"
- },
- {
- "errors": [
- "size_mismatch"
- ],
- "snap": 4,
- "locator": "",
- "nspace": "",
- "name": "obj5"
- },
- {
- "errors": [
- "headless"
- ],
- "snap": 7,
- "locator": "",
- "nspace": "",
- "name": "obj2"
- },
- {
- "errors": [
- "oi_attr_missing",
- "headless"
- ],
- "snap": 7,
- "locator": "",
- "nspace": "",
- "name": "obj5"
- },
- {
- "extra clones": [
- 1
- ],
- "errors": [
- "extra_clones"
- ],
- "snap": "head",
- "locator": "",
- "nspace": "",
- "name": "obj11"
- },
- {
- "errors": [
- "head_mismatch"
- ],
- "snap": "head",
- "locator": "",
- "nspace": "",
- "name": "obj12"
- },
- {
- "errors": [
- "ss_attr_corrupted"
- ],
- "snap": "head",
- "locator": "",
- "nspace": "",
- "name": "obj15"
- },
- {
- "extra clones": [
- 7,
- 4
- ],
- "errors": [
- "ss_attr_missing",
- "extra_clones"
- ],
- "snap": "head",
- "locator": "",
- "nspace": "",
- "name": "obj2"
- },
- {
- "errors": [
- "size_mismatch"
- ],
- "snap": "head",
- "locator": "",
- "nspace": "",
- "name": "obj3"
- },
- {
- "missing": [
- 7
- ],
- "errors": [
- "clone_missing"
- ],
- "snap": "head",
- "locator": "",
- "nspace": "",
- "name": "obj4"
- },
- {
- "missing": [
- 2,
- 1
- ],
- "extra clones": [
- 7
- ],
- "errors": [
- "extra_clones",
- "clone_missing"
- ],
- "snap": "head",
- "locator": "",
- "nspace": "",
- "name": "obj5"
- },
- {
- "extra clones": [
- 1
- ],
- "errors": [
- "extra_clones"
- ],
- "snap": "head",
- "locator": "",
- "nspace": "",
- "name": "obj6"
- },
- {
- "extra clones": [
- 1
- ],
- "errors": [
- "head_mismatch",
- "extra_clones"
- ],
- "snap": "head",
- "locator": "",
- "nspace": "",
- "name": "obj7"
- },
- {
- "errors": [
- "snapset_mismatch"
- ],
- "snap": "head",
- "locator": "",
- "nspace": "",
- "name": "obj8"
- }
- ],
- "epoch": 20
-}
-EOF
-
- jq "$jqfilter" $dir/json | python -c "$sortkeys" > $dir/csjson
- diff ${DIFFCOLOPTS} $dir/checkcsjson $dir/csjson || return 1
-
- if which jsonschema > /dev/null;
- then
- jsonschema -i $dir/json $CEPH_ROOT/doc/rados/command/list-inconsistent-snap.json || return 1
- fi
-
- for i in `seq 1 7`
- do
- rados -p $poolname rmsnap snap$i
- done
-
- ERRORS=0
-
- pidfile=$(find $dir 2>/dev/null | grep $name_prefix'[^/]*\.pid')
- pid=$(cat $pidfile)
- if ! kill -0 $pid
- then
- echo "OSD crash occurred"
- tail -100 $dir/osd.0.log
- ERRORS=$(expr $ERRORS + 1)
- fi
-
- kill_daemons $dir || return 1
-
- declare -a err_strings
- err_strings[0]="log_channel[(]cluster[)] log [[]ERR[]] : scrub [0-9]*[.]0 .*::obj10:.* is missing in clone_overlap"
- err_strings[1]="log_channel[(]cluster[)] log [[]ERR[]] : scrub [0-9]*[.]0 .*::obj5:7 no '_' attr"
- err_strings[2]="log_channel[(]cluster[)] log [[]ERR[]] : scrub [0-9]*[.]0 .*::obj5:7 is an unexpected clone"
- err_strings[3]="log_channel[(]cluster[)] log [[]ERR[]] : scrub [0-9]*[.]0 .*::obj5:4 on disk size [(]4608[)] does not match object info size [(]512[)] adjusted for ondisk to [(]512[)]"
- err_strings[4]="log_channel[(]cluster[)] log [[]ERR[]] : scrub [0-9]*[.]0 .*:::obj5:head expected clone .*:::obj5:2"
- err_strings[5]="log_channel[(]cluster[)] log [[]ERR[]] : scrub [0-9]*[.]0 .*:::obj5:head expected clone .*:::obj5:1"
- err_strings[6]="log_channel[(]cluster[)] log [[]INF[]] : scrub [0-9]*[.]0 .*:::obj5:head 2 missing clone[(]s[)]"
- err_strings[7]="log_channel[(]cluster[)] log [[]ERR[]] : scrub [0-9]*[.]0 .*:::obj12:head snapset.head_exists=false, but head exists"
- err_strings[8]="log_channel[(]cluster[)] log [[]ERR[]] : scrub [0-9]*[.]0 .*:::obj8:head snaps.seq not set"
- err_strings[9]="log_channel[(]cluster[)] log [[]ERR[]] : scrub [0-9]*[.]0 .*:::obj7:head snapset.head_exists=false, but head exists"
- err_strings[10]="log_channel[(]cluster[)] log [[]ERR[]] : scrub [0-9]*[.]0 .*:::obj7:1 is an unexpected clone"
- err_strings[11]="log_channel[(]cluster[)] log [[]ERR[]] : scrub [0-9]*[.]0 .*:::obj3:head on disk size [(]3840[)] does not match object info size [(]768[)] adjusted for ondisk to [(]768[)]"
- err_strings[12]="log_channel[(]cluster[)] log [[]ERR[]] : scrub [0-9]*[.]0 .*:::obj6:1 is an unexpected clone"
- err_strings[13]="log_channel[(]cluster[)] log [[]ERR[]] : scrub [0-9]*[.]0 .*:::obj2:head no 'snapset' attr"
- err_strings[14]="log_channel[(]cluster[)] log [[]ERR[]] : scrub [0-9]*[.]0 .*:::obj2:7 clone ignored due to missing snapset"
- err_strings[15]="log_channel[(]cluster[)] log [[]ERR[]] : scrub [0-9]*[.]0 .*:::obj2:4 clone ignored due to missing snapset"
- err_strings[16]="log_channel[(]cluster[)] log [[]ERR[]] : scrub [0-9]*[.]0 .*:::obj4:head expected clone .*:::obj4:7"
- err_strings[17]="log_channel[(]cluster[)] log [[]INF[]] : scrub [0-9]*[.]0 .*:::obj4:head 1 missing clone[(]s[)]"
- err_strings[18]="log_channel[(]cluster[)] log [[]ERR[]] : scrub [0-9]*[.]0 .*:::obj1:1 is an unexpected clone"
- err_strings[19]="log_channel[(]cluster[)] log [[]ERR[]] : scrub [0-9]*[.]0 .*:::obj9:1 is missing in clone_size"
- err_strings[20]="log_channel[(]cluster[)] log [[]ERR[]] : scrub [0-9]*[.]0 .*:::obj11:1 is an unexpected clone"
- err_strings[21]="log_channel[(]cluster[)] log [[]ERR[]] : scrub [0-9]*[.]0 .*:::obj14:1 size 1032 != clone_size 1033"
- err_strings[22]="log_channel[(]cluster[)] log [[]ERR[]] : [0-9]*[.]0 scrub 23 errors"
- err_strings[23]="log_channel[(]cluster[)] log [[]ERR[]] : scrub [0-9]*[.]0 .*:::obj15:head can't decode 'snapset' attr buffer"
- err_strings[24]="log_channel[(]cluster[)] log [[]ERR[]] : scrub [0-9]*[.]0 .*:::obj12:1 has no oi or legacy_snaps; cannot convert 1=[[]1[]]:[[]1[]].stray_clone_snaps=[{]1=[[]1[]][}]"
-
- for i in `seq 0 ${#err_strings[@]}`
- do
- if ! grep "${err_strings[$i]}" $dir/osd.0.log > /dev/null;
- then
- echo "Missing log message '${err_strings[$i]}'"
- ERRORS=$(expr $ERRORS + 1)
- fi
- done
-
- teardown $dir || return 1
-
- if [ $ERRORS != "0" ];
- then
- echo "TEST FAILED WITH $ERRORS ERRORS"
- return 1
- fi
-
- echo "TEST PASSED"
- return 0
-}
-
-main osd-scrub-snaps "$@"
-
-# Local Variables:
-# compile-command: "cd ../.. ; make -j4 && \
-# test/osd/osd-scrub-snaps.sh"
"allow pool foo namespace=nfoo rwx; allow pool bar namespace nbar object_prefix rbd r",
"allow pool foo namespace=\"\" rwx; allow pool bar namespace='' object_prefix rbd r",
"allow pool foo namespace \"\" rwx; allow pool bar namespace '' object_prefix rbd r",
+ "profile abc, profile abc pool=bar, profile abc pool=bar namespace=foo",
0
};
ASSERT_FALSE(cap.is_capable("bar", "", 0, "foo", false, false, {{"foo", false, false, false}, {"bar", false, true, false}}));
ASSERT_FALSE(cap.is_capable("bar", "", 0, "foo", false, false, {{"foo", false, false, false}, {"bar", false, false, false}}));
}
+
+TEST(OSDCap, AllowProfile) {
+ OSDCap cap;
+ ASSERT_TRUE(cap.parse("profile read-only, profile read-write pool abc", NULL));
+ ASSERT_FALSE(cap.allow_all());
+ ASSERT_FALSE(cap.is_capable("foo", "", 0, "asdf", true, true, {}));
+ ASSERT_TRUE(cap.is_capable("foo", "", 0, "asdf", true, false, {}));
+ ASSERT_TRUE(cap.is_capable("abc", "", 0, "asdf", false, true, {}));
+
+ // RBD
+ cap.grants.clear();
+ ASSERT_TRUE(cap.parse("profile rbd pool abc", NULL));
+ ASSERT_FALSE(cap.allow_all());
+ ASSERT_FALSE(cap.is_capable("foo", "", 0, "asdf", true, true, {}));
+ ASSERT_FALSE(cap.is_capable("foo", "", 0, "rbd_children", true, false, {}));
+ ASSERT_TRUE(cap.is_capable("foo", "", 0, "rbd_children", false, false,
+ {{"rbd", true, false, true}}));
+ ASSERT_TRUE(cap.is_capable("abc", "", 0, "asdf", true, true,
+ {{"rbd", true, true, true}}));
+
+ cap.grants.clear();
+ ASSERT_TRUE(cap.parse("profile rbd-read-only pool abc", NULL));
+ ASSERT_FALSE(cap.allow_all());
+ ASSERT_FALSE(cap.is_capable("foo", "", 0, "rbd_children", true, false, {}));
+ ASSERT_TRUE(cap.is_capable("abc", "", 0, "asdf", true, false,
+ {{"rbd", true, false, true}}));
+}
+
hobject_t oid(object_t("objname"), "key", 123, 456, 0, "");
pg_missing_t missing;
EXPECT_FALSE(missing.have_missing());
- missing.add(oid, eversion_t(), eversion_t());
+ missing.add(oid, eversion_t(), eversion_t(), false);
EXPECT_TRUE(missing.have_missing());
}
hobject_t oid(object_t("objname"), "key", 123, 456, 0, "");
pg_missing_t missing;
EXPECT_FALSE(missing.have_missing());
- missing.add(oid, eversion_t(), eversion_t());
+ missing.add(oid, eversion_t(), eversion_t(), false);
EXPECT_TRUE(missing.have_missing());
pg_missing_t other;
hobject_t oid(object_t("objname"), "key", 123, 456, 0, "");
pg_missing_t missing;
EXPECT_FALSE(missing.is_missing(oid));
- missing.add(oid, eversion_t(), eversion_t());
+ missing.add(oid, eversion_t(), eversion_t(), false);
EXPECT_TRUE(missing.is_missing(oid));
}
pg_missing_t missing;
eversion_t need(10,5);
EXPECT_FALSE(missing.is_missing(oid, eversion_t()));
- missing.add(oid, need, eversion_t());
+ missing.add(oid, need, eversion_t(), false);
EXPECT_TRUE(missing.is_missing(oid));
EXPECT_FALSE(missing.is_missing(oid, eversion_t()));
EXPECT_TRUE(missing.is_missing(oid, need));
hobject_t oid(object_t("objname"), "key", 123, 456, 0, "");
pg_missing_t missing;
EXPECT_EQ(eversion_t(), missing.have_old(oid));
- missing.add(oid, eversion_t(), eversion_t());
+ missing.add(oid, eversion_t(), eversion_t(), false);
EXPECT_EQ(eversion_t(), missing.have_old(oid));
eversion_t have(1,1);
missing.revise_have(oid, have);
e.op = pg_log_entry_t::DELETE;
EXPECT_TRUE(e.is_delete());
missing.add_next_event(e);
- EXPECT_FALSE(missing.have_missing());
+ EXPECT_TRUE(missing.is_missing(oid));
+ EXPECT_TRUE(missing.get_items().at(oid).is_delete());
+ EXPECT_EQ(prior_version, missing.get_items().at(oid).have);
+ EXPECT_EQ(version, missing.get_items().at(oid).need);
+ EXPECT_EQ(oid, missing.get_rmissing().at(e.version.version));
+ EXPECT_EQ(1U, missing.num_missing());
+ EXPECT_EQ(1U, missing.get_rmissing().size());
}
- // ERROR op should only be used for dup detection
+ // adding a LOST_DELETE after an existing event
{
pg_missing_t missing;
pg_log_entry_t e = sample_e;
- e.op = pg_log_entry_t::ERROR;
- e.return_code = -ENOENT;
- EXPECT_FALSE(e.is_update());
- EXPECT_FALSE(e.object_is_indexed());
+ e.op = pg_log_entry_t::MODIFY;
+ EXPECT_TRUE(e.is_update());
+ EXPECT_TRUE(e.object_is_indexed());
EXPECT_TRUE(e.reqid_is_indexed());
EXPECT_FALSE(missing.is_missing(oid));
missing.add_next_event(e);
- EXPECT_FALSE(missing.is_missing(oid));
- EXPECT_FALSE(e.object_is_indexed());
- EXPECT_TRUE(e.reqid_is_indexed());
- }
-
- // ERROR op should not affect previous entries
- {
- pg_missing_t missing;
- pg_log_entry_t modify = sample_e;
-
- modify.op = pg_log_entry_t::MODIFY;
- EXPECT_FALSE(missing.is_missing(oid));
- missing.add_next_event(modify);
EXPECT_TRUE(missing.is_missing(oid));
- EXPECT_EQ(missing.get_items().at(oid).need, version);
+ EXPECT_FALSE(missing.get_items().at(oid).is_delete());
- pg_log_entry_t error = sample_e;
- error.op = pg_log_entry_t::ERROR;
- error.return_code = -ENOENT;
- error.version = eversion_t(11, 5);
- missing.add_next_event(error);
+ e.op = pg_log_entry_t::LOST_DELETE;
+ e.version.version++;
+ EXPECT_TRUE(e.is_delete());
+ missing.add_next_event(e);
EXPECT_TRUE(missing.is_missing(oid));
- EXPECT_EQ(missing.get_items().at(oid).need, version);
+ EXPECT_TRUE(missing.get_items().at(oid).is_delete());
+ EXPECT_EQ(prior_version, missing.get_items().at(oid).have);
+ EXPECT_EQ(e.version, missing.get_items().at(oid).need);
+ EXPECT_EQ(oid, missing.get_rmissing().at(e.version.version));
+ EXPECT_EQ(1U, missing.num_missing());
+ EXPECT_EQ(1U, missing.get_rmissing().size());
}
}
// create a new entry
EXPECT_FALSE(missing.is_missing(oid));
eversion_t need(10,10);
- missing.revise_need(oid, need);
+ missing.revise_need(oid, need, false);
EXPECT_TRUE(missing.is_missing(oid));
EXPECT_EQ(eversion_t(), missing.get_items().at(oid).have);
EXPECT_EQ(need, missing.get_items().at(oid).need);
missing.revise_have(oid, have);
eversion_t new_need(10,12);
EXPECT_EQ(have, missing.get_items().at(oid).have);
- missing.revise_need(oid, new_need);
+ missing.revise_need(oid, new_need, false);
EXPECT_EQ(have, missing.get_items().at(oid).have);
EXPECT_EQ(new_need, missing.get_items().at(oid).need);
}
EXPECT_FALSE(missing.is_missing(oid));
// update an existing entry
eversion_t need(10,12);
- missing.add(oid, need, have);
+ missing.add(oid, need, have, false);
EXPECT_TRUE(missing.is_missing(oid));
eversion_t new_have(2,2);
EXPECT_EQ(have, missing.get_items().at(oid).have);
EXPECT_FALSE(missing.is_missing(oid));
eversion_t have(1,1);
eversion_t need(10,10);
- missing.add(oid, need, have);
+ missing.add(oid, need, have, false);
EXPECT_TRUE(missing.is_missing(oid));
EXPECT_EQ(have, missing.get_items().at(oid).have);
EXPECT_EQ(need, missing.get_items().at(oid).need);
EXPECT_FALSE(missing.is_missing(oid));
epoch_t epoch = 10;
eversion_t need(epoch,10);
- missing.add(oid, need, eversion_t());
+ missing.add(oid, need, eversion_t(), false);
EXPECT_TRUE(missing.is_missing(oid));
// rm of an older version is a noop
missing.rm(oid, eversion_t(epoch / 2,20));
hobject_t oid(object_t("objname"), "key", 123, 456, 0, "");
pg_missing_t missing;
EXPECT_FALSE(missing.is_missing(oid));
- missing.add(oid, eversion_t(), eversion_t());
+ missing.add(oid, eversion_t(), eversion_t(), false);
EXPECT_TRUE(missing.is_missing(oid));
auto m = missing.get_items().find(oid);
missing.rm(m);
EXPECT_FALSE(missing.is_missing(oid));
epoch_t epoch = 10;
eversion_t need(epoch,10);
- missing.add(oid, need, eversion_t());
+ missing.add(oid, need, eversion_t(), false);
EXPECT_TRUE(missing.is_missing(oid));
// assert if that the version to be removed is lower than the version of the object
{
hobject_t oid(object_t("objname"), "key", 123, 456, 0, "");
pg_missing_t missing;
EXPECT_FALSE(missing.is_missing(oid));
- missing.add(oid, eversion_t(), eversion_t());
+ missing.add(oid, eversion_t(), eversion_t(), false);
EXPECT_TRUE(missing.is_missing(oid));
auto m = missing.get_items().find(oid);
missing.got(m);
uint32_t hash2 = 2;
hobject_t oid2(object_t("objname"), "key2", 123, hash2, 0, "");
pg_missing_t missing;
- missing.add(oid1, eversion_t(), eversion_t());
- missing.add(oid2, eversion_t(), eversion_t());
+ missing.add(oid1, eversion_t(), eversion_t(), false);
+ missing.add(oid2, eversion_t(), eversion_t(), false);
pg_t child_pgid;
child_pgid.m_seed = 1;
pg_missing_t child;
assert_equal({}, validate_command(sigdict, ['pg', 'debug',
'invalid']))
- def test_force_create_pg(self):
- self.one_pgid('force_create_pg')
-
class TestAuth(TestArgparse):
from __future__ import print_function
+from nose import SkipTest
from nose.tools import eq_ as eq, ok_ as ok, assert_raises
from rados import (Rados, Error, RadosStateError, Object, ObjectExists,
ObjectNotFound, ObjectBusy, requires, opt,
[i.remove() for i in self.ioctx.list_objects()]
+ def test_applications(self):  # Exercise the ioctx application_* calls: enable, list, and per-application metadata set/list/remove.
+ cmd = {"prefix":"osd dump", "format":"json"}  # mon command asking for "osd dump" output as JSON
+ ret, buf, errs = self.rados.mon_command(json.dumps(cmd), b'')
+ eq(ret, 0)
+ assert len(buf) > 0
+ release = json.loads(buf.decode("utf-8")).get("require_osd_release",
+ None)
+ if not release or release[0] < 'l':  # skip unless require_osd_release is set and its first letter is 'l' or later
+ raise SkipTest
+
+ eq([], self.ioctx.application_list())  # expects no application to be listed for the ioctx yet
+
+ self.ioctx.application_enable("app1")
+ assert_raises(Error, self.ioctx.application_enable, "app2")  # enabling a second application without the extra argument must raise Error
+ self.ioctx.application_enable("app2", True)  # the same call is expected to succeed with True passed (presumably a force flag -- confirm against the binding)
+
+ assert_raises(Error, self.ioctx.application_metadata_list, "dne")  # "dne" was never enabled above, so listing its metadata must raise Error
+ eq([], self.ioctx.application_metadata_list("app1"))
+
+ assert_raises(Error, self.ioctx.application_metadata_set, "dne", "key",
+ "key")
+ self.ioctx.application_metadata_set("app1", "key1", "val1")
+ self.ioctx.application_metadata_set("app1", "key2", "val2")
+ self.ioctx.application_metadata_set("app2", "key1", "val1")
+
+ eq([("key1", "val1"), ("key2", "val2")],  # expected as (key, value) tuples; app2's entry must not appear under app1
+ self.ioctx.application_metadata_list("app1"))
+
+ self.ioctx.application_metadata_remove("app1", "key1")
+ eq([("key2", "val2")], self.ioctx.application_metadata_list("app1"))  # only key2 is expected to remain after the removal
+
class TestObject(object):
def setUp(self):
// template definitions
#include "tools/rbd_mirror/image_replayer/BootstrapRequest.cc"
-template class rbd::mirror::image_replayer::BootstrapRequest<librbd::MockTestImageCtx>;
namespace rbd {
namespace mirror {
#include "librbd/internal.h"
#include "librbd/api/Mirror.h"
#include "tools/rbd_mirror/ClusterWatcher.h"
+#include "tools/rbd_mirror/ServiceDaemon.h"
#include "tools/rbd_mirror/types.h"
#include "test/rbd_mirror/test_fixture.h"
#include "test/librados/test.h"
{
m_cluster = std::make_shared<librados::Rados>();
EXPECT_EQ("", connect_cluster_pp(*m_cluster));
- m_cluster_watcher.reset(new ClusterWatcher(m_cluster, m_lock));
}
~TestClusterWatcher() override {
}
}
+ void SetUp() override {  // Per-test setup: base fixture first, then the ServiceDaemon, then the ClusterWatcher that receives a pointer to it.
+ TestFixture::SetUp();
+ m_service_daemon.reset(new rbd::mirror::ServiceDaemon<>(g_ceph_context,
+ m_cluster,
+ m_threads));
+ m_cluster_watcher.reset(new ClusterWatcher(m_cluster, m_lock,
+ m_service_daemon.get()));  // raw, non-owning pointer; m_service_daemon keeps ownership
+ }
+
+ // Per-test cleanup, in reverse order of SetUp(): the ClusterWatcher was
+ // constructed with a raw pointer to the ServiceDaemon, so it is destroyed
+ // first and never outlives the object it points at.
+ void TearDown() override {
+ m_cluster_watcher.reset();
+ m_service_daemon.reset();
+ TestFixture::TearDown();
+ }
+
void create_pool(bool enable_mirroring, const peer_t &peer,
string *uuid = nullptr, string *name=nullptr) {
string pool_name = get_temp_pool_name("test-rbd-mirror-");
int64_t pool_id = m_cluster->pool_lookup(pool_name.c_str());
ASSERT_GE(pool_id, 0);
+
+ librados::IoCtx ioctx;
+ ASSERT_EQ(0, m_cluster->ioctx_create2(pool_id, ioctx));
+ ioctx.application_enable("rbd", true);
+
m_pools.insert(pool_name);
if (enable_mirroring) {
- librados::IoCtx ioctx;
- ASSERT_EQ(0, m_cluster->ioctx_create2(pool_id, ioctx));
ASSERT_EQ(0, librbd::api::Mirror<>::mode_set(ioctx,
RBD_MIRROR_MODE_POOL));
ASSERT_EQ(m_pool_peers, m_cluster_watcher->get_pool_peers());
}
- Mutex m_lock;
RadosRef m_cluster;
+ Mutex m_lock;
+ unique_ptr<rbd::mirror::ServiceDaemon<>> m_service_daemon;
unique_ptr<ClusterWatcher> m_cluster_watcher;
set<string> m_pools;
#include "cls/rbd/cls_rbd_client.h"
#include "tools/rbd_mirror/ImageDeleter.h"
#include "tools/rbd_mirror/ImageReplayer.h"
+#include "tools/rbd_mirror/ServiceDaemon.h"
#include "tools/rbd_mirror/Threads.h"
#include "librbd/ImageCtx.h"
#include "librbd/ImageState.h"
#include "librbd/internal.h"
#include "librbd/Utils.h"
#include "librbd/api/Mirror.h"
+#include "librbd/journal/DisabledPolicy.h"
#include "test/rbd_mirror/test_fixture.h"
#include "test/librados/test.h"
void SetUp() override {
TestFixture::SetUp();
+ m_service_daemon.reset(new rbd::mirror::ServiceDaemon<>(g_ceph_context,
+ _rados, m_threads));
librbd::api::Mirror<>::mode_set(m_local_io_ctx, RBD_MIRROR_MODE_IMAGE);
- m_deleter = new rbd::mirror::ImageDeleter(m_threads->work_queue,
- m_threads->timer,
- &m_threads->timer_lock);
-
- EXPECT_EQ(0, create_image(rbd, m_local_io_ctx, m_image_name, 1 << 20));
- ImageCtx *ictx = new ImageCtx(m_image_name, "", "", m_local_io_ctx,
- false);
- EXPECT_EQ(0, ictx->state->open(false));
- m_local_image_id = ictx->id;
-
- cls::rbd::MirrorImage mirror_image(GLOBAL_IMAGE_ID,
- MirrorImageState::MIRROR_IMAGE_STATE_ENABLED);
- EXPECT_EQ(0, cls_client::mirror_image_set(&m_local_io_ctx, ictx->id,
+ m_deleter = new rbd::mirror::ImageDeleter<>(m_threads->work_queue,
+ m_threads->timer,
+ &m_threads->timer_lock,
+ m_service_daemon.get());
+
+ m_local_image_id = librbd::util::generate_image_id(m_local_io_ctx);
+ librbd::ImageOptions image_opts;
+ image_opts.set(RBD_IMAGE_OPTION_FEATURES, RBD_FEATURES_ALL);
+ EXPECT_EQ(0, librbd::create(m_local_io_ctx, m_image_name, m_local_image_id,
+ 1 << 20, image_opts, GLOBAL_IMAGE_ID,
+ m_remote_mirror_uuid, true));
+
+ cls::rbd::MirrorImage mirror_image(
+ GLOBAL_IMAGE_ID, MirrorImageState::MIRROR_IMAGE_STATE_ENABLED);
+ EXPECT_EQ(0, cls_client::mirror_image_set(&m_local_io_ctx, m_local_image_id,
mirror_image));
-
- demote_image(ictx);
- EXPECT_EQ(0, ictx->state->close());
}
void TearDown() override {
remove_image();
- TestFixture::TearDown();
delete m_deleter;
+ m_service_daemon.reset();
+
+ TestFixture::TearDown();
}
void remove_image(bool force=false) {
ImageCtx *ictx = new ImageCtx("", m_local_image_id, "", m_local_io_ctx,
false);
EXPECT_EQ(0, ictx->state->open(false));
- promote_image(ictx);
+ {
+ RWLock::WLocker snap_locker(ictx->snap_lock);
+ ictx->set_journal_policy(new librbd::journal::DisabledPolicy());
+ }
- EXPECT_EQ(0, ictx->operations->snap_create(cls::rbd::UserSnapshotNamespace(),
- snap_name.c_str()));
+ EXPECT_EQ(0, ictx->operations->snap_create(
+ cls::rbd::UserSnapshotNamespace(), snap_name.c_str()));
if (protect) {
- EXPECT_EQ(0, ictx->operations->snap_protect(cls::rbd::UserSnapshotNamespace(),
- snap_name.c_str()));
+ EXPECT_EQ(0, ictx->operations->snap_protect(
+ cls::rbd::UserSnapshotNamespace(), snap_name.c_str()));
}
- demote_image(ictx);
EXPECT_EQ(0, ictx->state->close());
}
ImageCtx *ictx = new ImageCtx("", m_local_image_id, "", m_local_io_ctx,
false);
EXPECT_EQ(0, ictx->state->open(false));
- promote_image(ictx);
-
- EXPECT_EQ(0, ictx->operations->snap_create(cls::rbd::UserSnapshotNamespace(),
- "snap1"));
- EXPECT_EQ(0, ictx->operations->snap_protect(cls::rbd::UserSnapshotNamespace(),
- "snap1"));
- int order = 20;
- EXPECT_EQ(0, librbd::clone(m_local_io_ctx, ictx->name.c_str(), "snap1",
- m_local_io_ctx, "clone1", ictx->features,
- &order, 0, 0));
- std::string clone_id;
- ImageCtx *ictx_clone = new ImageCtx("clone1", "", "", m_local_io_ctx,
- false);
- EXPECT_EQ(0, ictx_clone->state->open(false));
- clone_id = ictx_clone->id;
- cls::rbd::MirrorImage mirror_image(GLOBAL_CLONE_IMAGE_ID,
- MirrorImageState::MIRROR_IMAGE_STATE_ENABLED);
+ {
+ RWLock::WLocker snap_locker(ictx->snap_lock);
+ ictx->set_journal_policy(new librbd::journal::DisabledPolicy());
+ }
+
+ EXPECT_EQ(0, ictx->operations->snap_create(
+ cls::rbd::UserSnapshotNamespace(), "snap1"));
+ EXPECT_EQ(0, ictx->operations->snap_protect(
+ cls::rbd::UserSnapshotNamespace(), "snap1"));
+ EXPECT_EQ(0, librbd::snap_set(ictx, cls::rbd::UserSnapshotNamespace(),
+ "snap1"));
+
+ std::string clone_id = librbd::util::generate_image_id(m_local_io_ctx);
+ librbd::ImageOptions clone_opts;
+ clone_opts.set(RBD_IMAGE_OPTION_FEATURES, ictx->features);
+ EXPECT_EQ(0, librbd::clone(ictx, m_local_io_ctx, "clone1", clone_id,
+ clone_opts, GLOBAL_CLONE_IMAGE_ID,
+ m_remote_mirror_uuid));
+
+ cls::rbd::MirrorImage mirror_image(
+ GLOBAL_CLONE_IMAGE_ID, MirrorImageState::MIRROR_IMAGE_STATE_ENABLED);
EXPECT_EQ(0, cls_client::mirror_image_set(&m_local_io_ctx, clone_id,
mirror_image));
- demote_image(ictx_clone);
- EXPECT_EQ(0, ictx_clone->state->close());
-
- demote_image(ictx);
EXPECT_EQ(0, ictx->state->close());
-
return clone_id;
}
librbd::RBD rbd;
std::string m_local_image_id;
- rbd::mirror::ImageDeleter *m_deleter;
+ std::unique_ptr<rbd::mirror::ServiceDaemon<>> m_service_daemon;
+ rbd::mirror::ImageDeleter<> *m_deleter;
};
int64_t TestImageDeleter::m_local_pool_id;
TEST_F(TestImageDeleter, Delete_NonPrimary_Image) {
- m_deleter->schedule_image_delete(_rados, m_local_pool_id, GLOBAL_IMAGE_ID);
+ m_deleter->schedule_image_delete(_rados, m_local_pool_id, GLOBAL_IMAGE_ID,
+ false);
+
+ C_SaferCond ctx;
+ m_deleter->wait_for_scheduled_deletion(m_local_pool_id, GLOBAL_IMAGE_ID,
+ &ctx);
+ EXPECT_EQ(0, ctx.wait());
+
+ ASSERT_EQ(0u, m_deleter->get_delete_queue_items().size());
+ ASSERT_EQ(0u, m_deleter->get_failed_queue_items().size());
+
+ check_image_deleted();
+}
+
+TEST_F(TestImageDeleter, Delete_Split_Brain_Image) {
+ promote_image();
+ demote_image();
+
+ m_deleter->schedule_image_delete(_rados, m_local_pool_id, GLOBAL_IMAGE_ID,
+ true);
C_SaferCond ctx;
m_deleter->wait_for_scheduled_deletion(m_local_pool_id, GLOBAL_IMAGE_ID,
TEST_F(TestImageDeleter, Fail_Delete_Primary_Image) {
promote_image();
- m_deleter->schedule_image_delete(_rados, m_local_pool_id, GLOBAL_IMAGE_ID);
+ m_deleter->schedule_image_delete(_rados, m_local_pool_id, GLOBAL_IMAGE_ID,
+ false);
+
+ C_SaferCond ctx;
+ m_deleter->wait_for_scheduled_deletion(m_local_pool_id, GLOBAL_IMAGE_ID,
+ &ctx);
+ EXPECT_EQ(-rbd::mirror::ImageDeleter<>::EISPRM, ctx.wait());
+
+ ASSERT_EQ(0u, m_deleter->get_delete_queue_items().size());
+ ASSERT_EQ(0u, m_deleter->get_failed_queue_items().size());
+}
+
+TEST_F(TestImageDeleter, Fail_Delete_Orphan_Image) {
+ promote_image();
+ demote_image();
+
+ m_deleter->schedule_image_delete(_rados, m_local_pool_id, GLOBAL_IMAGE_ID,
+ false);
C_SaferCond ctx;
m_deleter->wait_for_scheduled_deletion(m_local_pool_id, GLOBAL_IMAGE_ID,
&ctx);
- EXPECT_EQ(-rbd::mirror::ImageDeleter::EISPRM, ctx.wait());
+ EXPECT_EQ(-rbd::mirror::ImageDeleter<>::EISPRM, ctx.wait());
ASSERT_EQ(0u, m_deleter->get_delete_queue_items().size());
ASSERT_EQ(0u, m_deleter->get_failed_queue_items().size());
TEST_F(TestImageDeleter, Delete_Image_With_Child) {
create_snapshot();
- m_deleter->schedule_image_delete(_rados, m_local_pool_id, GLOBAL_IMAGE_ID);
+ m_deleter->schedule_image_delete(_rados, m_local_pool_id, GLOBAL_IMAGE_ID,
+ false);
C_SaferCond ctx;
m_deleter->wait_for_scheduled_deletion(m_local_pool_id, GLOBAL_IMAGE_ID,
create_snapshot("snap1");
create_snapshot("snap2");
- m_deleter->schedule_image_delete(_rados, m_local_pool_id, GLOBAL_IMAGE_ID);
+ m_deleter->schedule_image_delete(_rados, m_local_pool_id, GLOBAL_IMAGE_ID,
+ false);
C_SaferCond ctx;
m_deleter->wait_for_scheduled_deletion(m_local_pool_id, GLOBAL_IMAGE_ID,
TEST_F(TestImageDeleter, Delete_Image_With_ProtectedChild) {
create_snapshot("snap1", true);
- m_deleter->schedule_image_delete(_rados, m_local_pool_id, GLOBAL_IMAGE_ID);
+ m_deleter->schedule_image_delete(_rados, m_local_pool_id, GLOBAL_IMAGE_ID,
+ false);
C_SaferCond ctx;
m_deleter->wait_for_scheduled_deletion(m_local_pool_id, GLOBAL_IMAGE_ID,
create_snapshot("snap1", true);
create_snapshot("snap2", true);
- m_deleter->schedule_image_delete(_rados, m_local_pool_id, GLOBAL_IMAGE_ID);
+ m_deleter->schedule_image_delete(_rados, m_local_pool_id, GLOBAL_IMAGE_ID,
+ false);
C_SaferCond ctx;
m_deleter->wait_for_scheduled_deletion(m_local_pool_id, GLOBAL_IMAGE_ID,
TEST_F(TestImageDeleter, Delete_Image_With_Clone) {
std::string clone_id = create_clone();
- m_deleter->schedule_image_delete(_rados, m_local_pool_id, GLOBAL_IMAGE_ID);
+ m_deleter->schedule_image_delete(_rados, m_local_pool_id, GLOBAL_IMAGE_ID,
+ false);
C_SaferCond ctx;
m_deleter->wait_for_scheduled_deletion(m_local_pool_id, GLOBAL_IMAGE_ID,
ASSERT_EQ(0u, m_deleter->get_failed_queue_items().size());
m_deleter->schedule_image_delete(_rados, m_local_pool_id,
- GLOBAL_CLONE_IMAGE_ID);
+ GLOBAL_CLONE_IMAGE_ID, false);
C_SaferCond ctx2;
m_deleter->wait_for_scheduled_deletion(m_local_pool_id, GLOBAL_CLONE_IMAGE_ID,
EXPECT_EQ(0, cls_client::mirror_image_set(&m_local_io_ctx, m_local_image_id,
mirror_image));
- m_deleter->schedule_image_delete(_rados, m_local_pool_id, GLOBAL_IMAGE_ID);
+ m_deleter->schedule_image_delete(_rados, m_local_pool_id, GLOBAL_IMAGE_ID,
+ false);
C_SaferCond ctx;
m_deleter->wait_for_scheduled_deletion(m_local_pool_id, GLOBAL_IMAGE_ID,
EXPECT_EQ(0, cls_client::mirror_image_set(&m_local_io_ctx, m_local_image_id,
mirror_image));
- m_deleter->schedule_image_delete(_rados, m_local_pool_id, GLOBAL_IMAGE_ID);
+ m_deleter->schedule_image_delete(_rados, m_local_pool_id, GLOBAL_IMAGE_ID,
+ false);
C_SaferCond ctx;
m_deleter->wait_for_scheduled_deletion(m_local_pool_id, GLOBAL_IMAGE_ID,
TEST_F(TestImageDeleter, Delete_NonExistent_Image_Without_MirroringState) {
remove_image();
- m_deleter->schedule_image_delete(_rados, m_local_pool_id, GLOBAL_IMAGE_ID);
+ m_deleter->schedule_image_delete(_rados, m_local_pool_id, GLOBAL_IMAGE_ID,
+ false);
C_SaferCond ctx;
m_deleter->wait_for_scheduled_deletion(m_local_pool_id, GLOBAL_IMAGE_ID,
false);
EXPECT_EQ(0, ictx->state->open(false));
- m_deleter->schedule_image_delete(_rados, m_local_pool_id, GLOBAL_IMAGE_ID);
+ m_deleter->schedule_image_delete(_rados, m_local_pool_id, GLOBAL_IMAGE_ID,
+ false);
C_SaferCond ctx;
m_deleter->wait_for_scheduled_deletion(m_local_pool_id, GLOBAL_IMAGE_ID,
m_deleter->set_failed_timer_interval(2);
- m_deleter->schedule_image_delete(_rados, m_local_pool_id, GLOBAL_IMAGE_ID);
+ m_deleter->schedule_image_delete(_rados, m_local_pool_id, GLOBAL_IMAGE_ID,
+ false);
C_SaferCond ctx;
m_deleter->wait_for_scheduled_deletion(m_local_pool_id, GLOBAL_IMAGE_ID,
false);
EXPECT_EQ(0, ictx->state->open(false));
- m_deleter->schedule_image_delete(_rados, m_local_pool_id, GLOBAL_IMAGE_ID);
+ m_deleter->schedule_image_delete(_rados, m_local_pool_id, GLOBAL_IMAGE_ID,
+ false);
C_SaferCond ctx;
m_deleter->wait_for_scheduled_deletion(m_local_pool_id, GLOBAL_IMAGE_ID,
ASSERT_EQ(0u, m_deleter->get_delete_queue_items().size());
ASSERT_EQ(1u, m_deleter->get_failed_queue_items().size());
- m_deleter->schedule_image_delete(_rados, m_local_pool_id, GLOBAL_IMAGE_ID);
+ m_deleter->schedule_image_delete(_rados, m_local_pool_id, GLOBAL_IMAGE_ID,
+ false);
ASSERT_EQ(0u, m_deleter->get_delete_queue_items().size());
ASSERT_EQ(1u, m_deleter->get_failed_queue_items().size());
#include "librbd/io/ImageRequestWQ.h"
#include "librbd/io/ReadResult.h"
#include "tools/rbd_mirror/types.h"
+#include "tools/rbd_mirror/ImageDeleter.h"
#include "tools/rbd_mirror/ImageReplayer.h"
#include "tools/rbd_mirror/InstanceWatcher.h"
+#include "tools/rbd_mirror/ServiceDaemon.h"
#include "tools/rbd_mirror/Threads.h"
-#include "tools/rbd_mirror/ImageDeleter.h"
#include "test/librados/test.h"
#include "gtest/gtest.h"
EXPECT_EQ(0, m_local_cluster->pool_create(m_local_pool_name.c_str()));
EXPECT_EQ(0, m_local_cluster->ioctx_create(m_local_pool_name.c_str(),
m_local_ioctx));
+ m_local_ioctx.application_enable("rbd", true);
EXPECT_EQ("", connect_cluster_pp(m_remote_cluster));
EXPECT_EQ(0, m_remote_cluster.conf_set("rbd_cache", "false"));
EXPECT_EQ(0, m_remote_cluster.ioctx_create(m_remote_pool_name.c_str(),
m_remote_ioctx));
+ m_remote_ioctx.application_enable("rbd", true);
+
EXPECT_EQ(0, librbd::api::Mirror<>::mode_set(m_remote_ioctx,
RBD_MIRROR_MODE_POOL));
false, features, &order, 0, 0));
m_remote_image_id = get_image_id(m_remote_ioctx, m_image_name);
- m_threads = new rbd::mirror::Threads<>(reinterpret_cast<CephContext*>(
- m_local_ioctx.cct()));
+ m_threads.reset(new rbd::mirror::Threads<>(reinterpret_cast<CephContext*>(
+ m_local_ioctx.cct())));
- m_image_deleter.reset(new rbd::mirror::ImageDeleter(m_threads->work_queue,
- m_threads->timer,
- &m_threads->timer_lock));
+ m_service_daemon.reset(new rbd::mirror::ServiceDaemon<>(g_ceph_context,
+ m_local_cluster,
+ m_threads.get()));
+ m_image_deleter.reset(new rbd::mirror::ImageDeleter<>(
+ m_threads->work_queue, m_threads->timer, &m_threads->timer_lock,
+ m_service_daemon.get()));
m_instance_watcher = rbd::mirror::InstanceWatcher<>::create(
m_local_ioctx, m_threads->work_queue, nullptr);
m_instance_watcher->handle_acquire_leader();
delete m_replayer;
delete m_instance_watcher;
- delete m_threads;
EXPECT_EQ(0, m_remote_cluster.pool_delete(m_remote_pool_name.c_str()));
EXPECT_EQ(0, m_local_cluster->pool_delete(m_local_pool_name.c_str()));
template <typename ImageReplayerT = rbd::mirror::ImageReplayer<> >
void create_replayer() {
m_replayer = new ImageReplayerT(
- m_threads, m_image_deleter, m_instance_watcher,
+ m_threads.get(), m_image_deleter.get(), m_instance_watcher,
rbd::mirror::RadosRef(new librados::Rados(m_local_ioctx)),
m_local_mirror_uuid, m_local_ioctx.get_id(), "global image id");
m_replayer->add_remote_image(m_remote_mirror_uuid, m_remote_image_id,
static int _image_number;
- rbd::mirror::Threads<> *m_threads = nullptr;
- std::shared_ptr<rbd::mirror::ImageDeleter> m_image_deleter;
std::shared_ptr<librados::Rados> m_local_cluster;
+ std::unique_ptr<rbd::mirror::Threads<>> m_threads;
+ std::unique_ptr<rbd::mirror::ServiceDaemon<>> m_service_daemon;
+ std::unique_ptr<rbd::mirror::ImageDeleter<>> m_image_deleter;
librados::Rados m_remote_cluster;
rbd::mirror::InstanceWatcher<> *m_instance_watcher;
std::string m_local_mirror_uuid = "local mirror uuid";
create_replayer<>();
C_SaferCond cond;
m_replayer->start(&cond);
- ASSERT_EQ(0, cond.wait());
+ ASSERT_EQ(-EREMOTEIO, cond.wait());
ASSERT_TRUE(m_replayer->is_stopped());
}
create_replayer<>();
C_SaferCond cond;
m_replayer->start(&cond);
- ASSERT_EQ(0, cond.wait());
+ ASSERT_EQ(-EREMOTEIO, cond.wait());
ASSERT_TRUE(m_replayer->is_stopped());
}
librados::IoCtx ioctx;
ASSERT_EQ(0, m_cluster->ioctx_create2(pool_id, ioctx));
+ ioctx.application_enable("rbd", true);
m_pool_watcher.reset(new PoolWatcher<>(m_threads, ioctx,
m_pool_watcher_listener));
_local_pool_name = get_temp_pool_name("test-rbd-mirror-");
ASSERT_EQ(0, _rados->pool_create(_local_pool_name.c_str()));
+ librados::IoCtx local_ioctx;
+ ASSERT_EQ(0, _rados->ioctx_create(_local_pool_name.c_str(), local_ioctx));
+ local_ioctx.application_enable("rbd", true);
+
_remote_pool_name = get_temp_pool_name("test-rbd-mirror-");
ASSERT_EQ(0, _rados->pool_create(_remote_pool_name.c_str()));
+ librados::IoCtx remote_ioctx;
+ ASSERT_EQ(0, _rados->ioctx_create(_remote_pool_name.c_str(), remote_ioctx));
+ remote_ioctx.application_enable("rbd", true);
+
ASSERT_EQ(0, create_image_data_pool(_data_pool));
if (!_data_pool.empty()) {
printf("using image data pool: %s\n", _data_pool.c_str());
#include "cls/journal/cls_journal_types.h"
#include "librbd/journal/Replay.h"
#include "librbd/journal/Types.h"
+#include "tools/rbd_mirror/ImageDeleter.h"
#include "tools/rbd_mirror/ImageReplayer.h"
#include "tools/rbd_mirror/InstanceWatcher.h"
#include "tools/rbd_mirror/image_replayer/BootstrapRequest.h"
namespace rbd {
namespace mirror {
+template <>
+struct ImageDeleter<librbd::MockTestImageCtx> {  // gmock specialization for MockTestImageCtx; schedule_image_delete is the only member mocked here
+ MOCK_METHOD4(schedule_image_delete, void(RadosRef, int64_t,
+ const std::string&, bool));
+};
+
template<>
class InstanceWatcher<librbd::MockTestImageCtx> {
};
void get() {
}
+ inline bool is_syncing() const {  // fixed stub (not a gmock method): always returns false
+ return false;
+ }
+
MOCK_METHOD0(send, void());
MOCK_METHOD0(cancel, void());
};
class TestMockImageReplayer : public TestMockFixture {
public:
+ typedef ImageDeleter<librbd::MockTestImageCtx> MockImageDeleter;
typedef BootstrapRequest<librbd::MockTestImageCtx> MockBootstrapRequest;
typedef CloseImageRequest<librbd::MockTestImageCtx> MockCloseImageRequest;
typedef EventPreprocessor<librbd::MockTestImageCtx> MockEventPreprocessor;
librbd::RBD rbd;
ASSERT_EQ(0, create_image(rbd, m_remote_io_ctx, m_image_name, m_image_size));
ASSERT_EQ(0, open_image(m_remote_io_ctx, m_image_name, &m_remote_image_ctx));
-
- m_image_deleter.reset(new rbd::mirror::ImageDeleter(m_threads->work_queue,
- m_threads->timer,
- &m_threads->timer_lock));
- m_image_replayer = new MockImageReplayer(
- m_threads, m_image_deleter, &m_instance_watcher,
- rbd::mirror::RadosRef(new librados::Rados(m_local_io_ctx)),
- "local_mirror_uuid", m_local_io_ctx.get_id(), "global image id");
- m_image_replayer->add_remote_image(
- "remote_mirror_uuid", m_remote_image_ctx->id, m_remote_io_ctx);
}
void TearDown() override {
WithArg<2>(CompleteContext(on_commit_r))));
}
+ void create_image_replayer(MockImageDeleter &mock_image_deleter) {  // Allocates m_image_replayer around the caller's mock deleter and registers the remote image with it.
+ m_image_replayer = new MockImageReplayer(
+ m_threads, &mock_image_deleter, &m_instance_watcher,  // the mock deleter is passed by address, not copied
+ rbd::mirror::RadosRef(new librados::Rados(m_local_io_ctx)),
+ "local_mirror_uuid", m_local_io_ctx.get_id(), "global image id");
+ m_image_replayer->add_remote_image(
+ "remote_mirror_uuid", m_remote_image_ctx->id, m_remote_io_ctx);
+ }
+
librbd::ImageCtx *m_remote_image_ctx;
librbd::ImageCtx *m_local_image_ctx = nullptr;
- std::shared_ptr<rbd::mirror::ImageDeleter> m_image_deleter;
MockInstanceWatcher m_instance_watcher;
- MockImageReplayer *m_image_replayer;
+ MockImageReplayer *m_image_replayer = nullptr;
};
TEST_F(TestMockImageReplayer, StartStop) {
EXPECT_CALL(mock_remote_journaler, start_live_replay(_, _));
+ MockImageDeleter mock_image_deleter;
+ create_image_replayer(mock_image_deleter);
+
C_SaferCond start_ctx;
m_image_replayer->start(&start_ctx);
ASSERT_EQ(0, start_ctx.wait());
+ ASSERT_EQ(image_replayer::HEALTH_STATE_OK,
+ m_image_replayer->get_health_state());
// STOP
C_SaferCond stop_ctx;
m_image_replayer->stop(&stop_ctx);
ASSERT_EQ(0, stop_ctx.wait());
+ ASSERT_EQ(image_replayer::HEALTH_STATE_OK,
+ m_image_replayer->get_health_state());
}
TEST_F(TestMockImageReplayer, LocalImagePrimary) {
expect_send(mock_prepare_local_image_request, mock_local_image_ctx.id,
"", 0);
+ MockImageDeleter mock_image_deleter;
+ create_image_replayer(mock_image_deleter);
+
C_SaferCond start_ctx;
m_image_replayer->start(&start_ctx);
ASSERT_EQ(0, start_ctx.wait());
EXPECT_CALL(mock_remote_journaler, remove_listener(_));
expect_shut_down(mock_remote_journaler, 0);
+ MockImageDeleter mock_image_deleter;
+ create_image_replayer(mock_image_deleter);
+
C_SaferCond start_ctx;
m_image_replayer->start(&start_ctx);
- ASSERT_EQ(0, start_ctx.wait());
+ ASSERT_EQ(-EREMOTEIO, start_ctx.wait());
}
TEST_F(TestMockImageReplayer, PrepareLocalImageError) {
expect_send(mock_prepare_local_image_request, mock_local_image_ctx.id,
"remote mirror uuid", -EINVAL);
+ MockImageDeleter mock_image_deleter;
+ create_image_replayer(mock_image_deleter);
+
C_SaferCond start_ctx;
m_image_replayer->start(&start_ctx);
ASSERT_EQ(-EINVAL, start_ctx.wait());
EXPECT_CALL(mock_remote_journaler, remove_listener(_));
expect_shut_down(mock_remote_journaler, 0);
+ MockImageDeleter mock_image_deleter;
+ create_image_replayer(mock_image_deleter);
+
C_SaferCond start_ctx;
m_image_replayer->start(&start_ctx);
ASSERT_EQ(-EINVAL, start_ctx.wait());
EXPECT_CALL(mock_remote_journaler, remove_listener(_));
expect_shut_down(mock_remote_journaler, 0);
+ MockImageDeleter mock_image_deleter;
+ create_image_replayer(mock_image_deleter);
+
C_SaferCond start_ctx;
m_image_replayer->start(&start_ctx);
ASSERT_EQ(-EINVAL, start_ctx.wait());
+ ASSERT_EQ(image_replayer::HEALTH_STATE_ERROR,
+ m_image_replayer->get_health_state());
}
TEST_F(TestMockImageReplayer, StopError) {
EXPECT_CALL(mock_remote_journaler, start_live_replay(_, _));
+ MockImageDeleter mock_image_deleter;
+ create_image_replayer(mock_image_deleter);
+
C_SaferCond start_ctx;
m_image_replayer->start(&start_ctx);
ASSERT_EQ(0, start_ctx.wait());
EXPECT_CALL(mock_remote_journaler, start_live_replay(_, _));
+ MockImageDeleter mock_image_deleter;
+ create_image_replayer(mock_image_deleter);
+
C_SaferCond start_ctx;
m_image_replayer->start(&start_ctx);
ASSERT_EQ(0, start_ctx.wait());
EXPECT_CALL(mock_remote_journaler, start_live_replay(_, _));
+ MockImageDeleter mock_image_deleter;
+ create_image_replayer(mock_image_deleter);
+
C_SaferCond start_ctx;
m_image_replayer->start(&start_ctx);
ASSERT_EQ(0, start_ctx.wait());
EXPECT_CALL(mock_remote_journaler, start_live_replay(_, _));
+ MockImageDeleter mock_image_deleter;
+ create_image_replayer(mock_image_deleter);
+
C_SaferCond start_ctx;
m_image_replayer->start(&start_ctx);
ASSERT_EQ(0, start_ctx.wait());
#include "test/librbd/mock/MockImageCtx.h"
#include "test/rbd_mirror/test_mock_fixture.h"
+#include "tools/rbd_mirror/ImageDeleter.h"
#include "tools/rbd_mirror/ImageReplayer.h"
#include "tools/rbd_mirror/InstanceWatcher.h"
#include "tools/rbd_mirror/InstanceReplayer.h"
+#include "tools/rbd_mirror/ServiceDaemon.h"
#include "tools/rbd_mirror/Threads.h"
+#include "tools/rbd_mirror/image_replayer/Types.h"
namespace librbd {
}
};
+template <>
+struct ImageDeleter<librbd::MockTestImageCtx> {  // gmock specialization for MockTestImageCtx; mocks the two members declared below
+ MOCK_METHOD4(schedule_image_delete, void(RadosRef, int64_t,
+ const std::string&, bool));
+ MOCK_METHOD4(wait_for_scheduled_deletion,
+ void(int64_t, const std::string&, Context*, bool));  // argument index 2 is the Context*
+};
+
+template<>
+struct ServiceDaemon<librbd::MockTestImageCtx> {  // gmock specialization for MockTestImageCtx; add_or_update_attribute is the only member mocked here
+ MOCK_METHOD3(add_or_update_attribute,
+ void(int64_t, const std::string&,
+ const service_daemon::AttributeValue&));
+};
+
template<>
struct InstanceWatcher<librbd::MockTestImageCtx> {
};
static ImageReplayer *create(
Threads<librbd::MockTestImageCtx> *threads,
- std::shared_ptr<ImageDeleter> image_deleter,
+ ImageDeleter<librbd::MockTestImageCtx>* image_deleter,
InstanceWatcher<librbd::MockTestImageCtx> *instance_watcher,
RadosRef local, const std::string &local_mirror_uuid, int64_t local_pool_id,
const std::string &global_image_id) {
MOCK_METHOD0(is_running, bool());
MOCK_METHOD0(is_stopped, bool());
MOCK_METHOD0(is_blacklisted, bool());
+
+ MOCK_CONST_METHOD0(get_health_state, image_replayer::HealthState());
};
ImageReplayer<librbd::MockTestImageCtx>* ImageReplayer<librbd::MockTestImageCtx>::s_instance = nullptr;
using ::testing::Invoke;
using ::testing::Return;
using ::testing::ReturnRef;
+using ::testing::WithArg;
class TestMockInstanceReplayer : public TestMockFixture {
public:
+ typedef ImageDeleter<librbd::MockTestImageCtx> MockImageDeleter;
typedef ImageReplayer<librbd::MockTestImageCtx> MockImageReplayer;
typedef InstanceReplayer<librbd::MockTestImageCtx> MockInstanceReplayer;
typedef InstanceWatcher<librbd::MockTestImageCtx> MockInstanceWatcher;
+ typedef ServiceDaemon<librbd::MockTestImageCtx> MockServiceDaemon;
typedef Threads<librbd::MockTestImageCtx> MockThreads;
void SetUp() override {
TestMockFixture::SetUp();
m_mock_threads = new MockThreads(m_threads);
-
- m_image_deleter.reset(
- new rbd::mirror::ImageDeleter(m_threads->work_queue, m_threads->timer,
- &m_threads->timer_lock));
}
void TearDown() override {
TestMockFixture::TearDown();
}
+ void expect_wait_for_scheduled_deletion(MockImageDeleter& mock_image_deleter,  // Expects one wait_for_scheduled_deletion() call for global_image_id and queues its Context with result r.
+ const std::string& global_image_id,
+ int r) {
+ EXPECT_CALL(mock_image_deleter,
+ wait_for_scheduled_deletion(_, global_image_id, _, false))  // first argument and Context are wildcards; the last argument must be false
+ .WillOnce(WithArg<2>(Invoke([this, r](Context *ctx) {  // WithArg<2> hands only the Context* argument to the lambda
+ m_threads->work_queue->queue(ctx, r);  // handed to the work queue with r instead of being completed inline
+ })));
+ }
+
MockThreads *m_mock_threads;
- std::shared_ptr<rbd::mirror::ImageDeleter> m_image_deleter;
};
TEST_F(TestMockInstanceReplayer, AcquireReleaseImage) {
+ MockServiceDaemon mock_service_daemon;
+ MockImageDeleter mock_image_deleter;
MockInstanceWatcher mock_instance_watcher;
MockImageReplayer mock_image_replayer;
MockInstanceReplayer instance_replayer(
- m_mock_threads, m_image_deleter,
+ m_mock_threads, &mock_service_daemon, &mock_image_deleter,
rbd::mirror::RadosRef(new librados::Rados(m_local_io_ctx)),
"local_mirror_uuid", m_local_io_ctx.get_id());
EXPECT_CALL(mock_image_replayer, is_blacklisted())
.WillRepeatedly(Return(false));
+ expect_wait_for_scheduled_deletion(mock_image_deleter, "global_image_id", 0);
+
InSequence seq;
instance_replayer.init();
assert False, 'finished bucket checkpoint for target_zone=%s source_zone=%s bucket=%s' % \
(target_zone.name, source_zone.name, bucket_name)
+def zonegroup_bucket_checkpoint(zonegroup_conns, bucket_name):  # For each ordered pair of distinct zones: run zone_bucket_checkpoint, then check_bucket_eq on bucket_name.
+ for source_conn in zonegroup_conns.zones:
+ for target_conn in zonegroup_conns.zones:
+ if source_conn.zone == target_conn.zone:  # a zone is never compared against itself
+ continue
+ zone_bucket_checkpoint(target_conn.zone, source_conn.zone, bucket_name)
+ target_conn.check_bucket_eq(source_conn, bucket_name)
+
def set_master_zone(zone):
zone.modify(zone.cluster, ['--master'])
zonegroup = zone.zonegroup
log.info('Set master zone=%s, waiting %ds for reconfiguration..', zone.name, config.reconfigure_delay)
time.sleep(config.reconfigure_delay)
+def enable_bucket_sync(zone, bucket_name):  # Runs the admin command "bucket sync enable --bucket <bucket_name>" followed by zone.zone_args() on the zone's cluster.
+ cmd = ['bucket', 'sync', 'enable', '--bucket', bucket_name] + zone.zone_args()
+ zone.cluster.admin(cmd)  # return value is not inspected here
+
+def disable_bucket_sync(zone, bucket_name):  # Runs the admin command "bucket sync disable --bucket <bucket_name>" followed by zone.zone_args() on the zone's cluster.
+ cmd = ['bucket', 'sync', 'disable', '--bucket', bucket_name] + zone.zone_args()
+ zone.cluster.admin(cmd)  # return value is not inspected here
+
+def check_buckets_sync_status_obj_not_exist(zone, buckets):  # Polls the "log list" admin output, up to config.checkpoint_retries times, searching it for ':<bucket>:' for each bucket.
+ for _ in range(config.checkpoint_retries):
+ cmd = ['log', 'list'] + zone.zone_arg()  # NOTE(review): zone_arg() here, while enable/disable_bucket_sync use zone_args() -- confirm this is intended
+ log_list, ret = zone.cluster.admin(cmd, check_retcode=False, read_only=True)  # ret is not inspected
+ for bucket in buckets:
+ if log_list.find(':'+bucket+":") >= 0:
+ break
+ else:  # indentation is lost in this listing; presumably the inner for-loop's else (no bucket matched) -- confirm
+ return
+ time.sleep(config.checkpoint_delay)
+ assert False  # presumably reached only when every retry still found a matching entry -- confirm
+
def gen_bucket_name():
global num_buckets
for _, bucket in zone_bucket:
bucket.set_policy(policy)
assert(bucket.get_policy() == policy)
+
+def test_bucket_sync_disable():
+    """Disable sync on every per-zone bucket, then check each zone's log list."""
+    master_zonegroup = realm.master_zonegroup()
+    conns = ZonegroupConns(master_zonegroup)
+    buckets, _zone_bucket = create_bucket_per_zone(conns)
+
+    for name in buckets:
+        disable_bucket_sync(realm.meta_master_zone(), name)
+
+    for member_zone in master_zonegroup.zones:
+        check_buckets_sync_status_obj_not_exist(member_zone, buckets)
+
+def test_bucket_sync_enable_right_after_disable():
+    """Write objects, toggle bucket sync off and straight back on, write more.
+
+    Every bucket is checkpointed across the zonegroup after each batch of
+    writes.
+    """
+    zonegroup = realm.master_zonegroup()
+    zonegroup_conns = ZonegroupConns(zonegroup)
+    buckets, zone_bucket = create_bucket_per_zone(zonegroup_conns)
+    content = 'asdasd'
+
+    def upload(objnames):
+        # store each named object in every (zone, bucket) pair
+        for zone, bucket in zone_bucket:
+            for objname in objnames:
+                new_key(zone, bucket.name, objname).set_contents_from_string(content)
+
+    def checkpoint_all():
+        for bucket_name in buckets:
+            zonegroup_bucket_checkpoint(zonegroup_conns, bucket_name)
+
+    upload(['obj1', 'obj2', 'obj3', 'obj4'])
+    checkpoint_all()
+
+    # disable and immediately re-enable, one bucket at a time
+    for bucket_name in buckets:
+        disable_bucket_sync(realm.meta_master_zone(), bucket_name)
+        enable_bucket_sync(realm.meta_master_zone(), bucket_name)
+
+    upload(['obj5', 'obj6', 'obj7', 'obj8'])
+    checkpoint_all()
+
+def test_bucket_sync_disable_enable():
+    """Write objects while bucket sync is disabled, re-enable it, then checkpoint.
+
+    A first batch is written and checkpointed with sync on. Sync is then
+    disabled for every bucket, a second batch is written, sync is enabled
+    again and every bucket is checkpointed across the zonegroup.
+    """
+    zonegroup = realm.master_zonegroup()
+    zonegroup_conns = ZonegroupConns(zonegroup)
+    buckets, zone_bucket = create_bucket_per_zone(zonegroup_conns)
+    content = 'asdasd'
+
+    def upload(objnames):
+        # store each named object in every (zone, bucket) pair
+        for zone, bucket in zone_bucket:
+            for objname in objnames:
+                new_key(zone, bucket.name, objname).set_contents_from_string(content)
+
+    def checkpoint_all():
+        for bucket_name in buckets:
+            zonegroup_bucket_checkpoint(zonegroup_conns, bucket_name)
+
+    upload([ 'obj1', 'obj2', 'obj3', 'obj4' ])
+    zonegroup_meta_checkpoint(zonegroup)
+    checkpoint_all()
+
+    for bucket_name in buckets:
+        disable_bucket_sync(realm.meta_master_zone(), bucket_name)
+    zonegroup_meta_checkpoint(zonegroup)
+
+    # these writes happen while sync is switched off
+    upload([ 'obj5', 'obj6', 'obj7', 'obj8' ])
+
+    for bucket_name in buckets:
+        enable_bucket_sync(realm.meta_master_zone(), bucket_name)
+    checkpoint_all()
--- /dev/null
+#!/bin/bash
+
+source $CEPH_ROOT/qa/standalone/ceph-helpers.sh
+
+function run() {
+    local dir=$1
+    shift
+
+    export CEPH_MON="127.0.0.1:7224" # git grep '\<7224\>' : there must be only one
+    export CEPH_ARGS
+    CEPH_ARGS+="--fsid=$(uuidgen) --auth-supported=none --mon-host=$CEPH_MON "
+    set -e
+
+    # run the functions named on the command line, or else every TEST_* function
+    local funcs=${@:-$(set | sed -n -e 's/^\(TEST_[0-9a-z_]*\) .*/\1/p')}
+    for func in $funcs ; do
+        # each test gets its own setup/teardown; stop at the first failure
+        if ! setup $dir ; then
+            return 1
+        fi
+        if ! $func $dir ; then
+            return 1
+        fi
+        if ! teardown $dir ; then
+            return 1
+        fi
+    done
+}
+
+function TEST_minimal() {
+    local dir=$1
+    local idx
+
+    # one mon, one mgr and three OSDs, then wait for the rbd pool to go clean
+    run_mon $dir a
+    run_mgr $dir x
+    for idx in 0 1 2 ; do
+        run_osd $dir $idx
+    done
+    create_rbd_pool
+    wait_for_clean
+}
+
+function TEST_multimon() {
+    local dir=$1
+
+    # per-monitor addresses; declared local so they do not leak into other tests
+    local MONA="127.0.0.1:7224" # git grep '\<7224\>' : there must be only one
+    local MONB="127.0.0.1:7225" # git grep '\<7225\>' : there must be only one
+    local MONC="127.0.0.1:7226" # git grep '\<7226\>' : there must be only one
+
+    # three monitors, two managers, three OSDs
+    run_mon $dir a --public-addr $MONA
+    run_mon $dir b --public-addr $MONB
+    run_mon $dir c --public-addr $MONC
+    run_mgr $dir x
+    run_mgr $dir y
+    run_osd $dir 0
+    run_osd $dir 1
+    run_osd $dir 2
+
+    # mark osd.0 out and wait for the new pool to settle
+    ceph osd pool create foo 32
+    ceph osd out 0
+    wait_for_clean
+
+    # write for 4 seconds with 4096-byte objects and keep them
+    rados -p foo bench 4 write -b 4096 --no-cleanup
+    wait_for_clean
+
+    # bring osd.0 back in and wait for the cluster to settle again
+    ceph osd in 0
+    flush_pg_stats
+    wait_for_clean
+}
+
+main smoke "$@"
rados_pool_create(cl, m_pool_name.c_str());
rados_ioctx_t io_ctx;
printf("%s: rados_ioctx_create.\n", get_id_str());
- RETURN1_IF_NOT_VAL(0, rados_ioctx_create(cl, m_pool_name.c_str(), &io_ctx));
+ RETURN1_IF_NONZERO(rados_ioctx_create(cl, m_pool_name.c_str(), &io_ctx));
if (m_open_pool_sem)
m_open_pool_sem->post();
rados_ioctx_destroy(io_ctx);
+ rados_pool_delete(cl, m_pool_name.c_str());
rados_shutdown(cl);
return 0;
}
RETURN1_IF_NONZERO(rados_ioctx_create(cl, m_pool_name.c_str(), &io_ctx));
printf("%s: watching object %s\n", get_id_str(), m_obj_name.c_str());
- RETURN1_IF_NOT_VAL(
+ RETURN1_IF_NOT_VAL(m_watch_retcode,
rados_watch(io_ctx, m_obj_name.c_str(), 0, &handle,
reinterpret_cast<rados_watchcb_t>(notify_cb),
- reinterpret_cast<void*>(&num_notifies)),
- m_watch_retcode
+ reinterpret_cast<void*>(&num_notifies))
);
if (m_watch_sem) {
m_watch_sem->post();
+++ /dev/null
-#!/bin/bash
-#
-# Copyright (C) 2013,2014 Cloudwatt <libre.licensing@cloudwatt.com>
-# Copyright (C) 2014 Red Hat <contact@redhat.com>
-# Copyright (C) 2014 Federico Gimenez <fgimenez@coit.es>
-#
-# Author: Loic Dachary <loic@dachary.org>
-# Author: Federico Gimenez <fgimenez@coit.es>
-#
-# This program is free software; you can redistribute it and/or modify
-# it under the terms of the GNU Library Public License as published by
-# the Free Software Foundation; either version 2, or (at your option)
-# any later version.
-#
-# This program is distributed in the hope that it will be useful,
-# but WITHOUT ANY WARRANTY; without even the implied warranty of
-# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
-# GNU Library Public License for more details.
-#
-source $(dirname $0)/detect-build-env-vars.sh
-
-$CEPH_ROOT/qa/workunits/ceph-helpers.sh TESTS
# Includes
source $(dirname $0)/detect-build-env-vars.sh
-source ../qa/workunits/ceph-helpers.sh
+source ../qa/standalone/ceph-helpers.sh
function run() {
local dir=$1
shift
<< "': error " << ret << std::endl;
return 1;
}
+ ioctx.application_enable("rados", true);
+
librados::ObjectWriteOperation op;
op.create(true);
ret = ioctx.operate(oid, &op);
# Includes
source $(dirname $0)/detect-build-env-vars.sh
-source $CEPH_ROOT/qa/workunits/ceph-helpers.sh
+source $CEPH_ROOT/qa/standalone/ceph-helpers.sh
function run() {
local dir=$1
--mkfs \
--mon-data=$data \
--run-dir=$dir || return 1
+ sleep 1
expect_failure $dir "ignore empty --pid-file" ceph-mon \
-f \
--log-to-stderr \
# no daemon can use a pidfile that is owned by another daemon
run_mon $dir a || return 1
- run_mon $dir a 2>&1 | grep "failed to lock pidfile" || return 1
+ run_mon $dir a --log-to-stderr -f 2>&1 | grep "failed to lock pidfile" || return 1
run_osd $dir 0 || return 1
- run_osd $dir 0 2>&1 | grep "failed to lock pidfile" || return 1
+ activate_osd $dir 0 --log-to-stderr -f 2>&1 | grep "failed to lock pidfile" || return 1
# when a daemon shutdown, it will not unlink a path different from
# the one it owns
#
# Includes
-source ../qa/workunits/ceph-helpers.sh
+source ../qa/standalone/ceph-helpers.sh
function run() {
local dir=$1
const char* env = getenv("CEPH_LIB");
if (env) {
- g_conf->set_val("erasure_code_dir", env, false);
- g_conf->set_val("plugin_dir", env, false);
+ g_conf->set_val_or_die("erasure_code_dir", env, false);
+ g_conf->set_val_or_die("plugin_dir", env, false);
}
::testing::InitGoogleTest(&argc, argv);
# GNU Library Public License for more details.
#
-source $CEPH_ROOT/qa/workunits/ceph-helpers.sh
+source $CEPH_ROOT/qa/standalone/ceph-helpers.sh
export CEPH_VSTART_WRAPPER=1
export CEPH_DIR="${TMPDIR:-$PWD}/td/t-$CEPH_PORT"
export CEPH_DEV_DIR="$CEPH_DIR/dev"
export CEPH_OUT_DIR="$CEPH_DIR/out"
+export CEPH_ASOK_DIR="$CEPH_DIR/out"
export MGR_PYTHON_PATH=$CEPH_ROOT/src/pybind/mgr
if (!divergent.empty()) {
assert(missing.get_items().empty());
PGLog::write_log_and_missing_wo_missing(
- t, &km, log, coll, info.pgid.make_pgmeta_oid(), divergent, true);
+ t, &km, log, coll, info.pgid.make_pgmeta_oid(), divergent, true, true);
} else {
pg_missing_tracker_t tmissing(missing);
+ bool rebuilt_missing_set_with_deletes = missing.may_include_deletes;
PGLog::write_log_and_missing(
- t, &km, log, coll, info.pgid.make_pgmeta_oid(), tmissing, true);
+ t, &km, log, coll, info.pgid.make_pgmeta_oid(), tmissing, true, true,
+ &rebuilt_missing_set_with_deletes);
}
t.omap_setkeys(coll, info.pgid.make_pgmeta_oid(), km);
return 0;
read_keys.insert(key);
}
+ list<EMetaBlob::nullbit> const &nb_list = lump.get_dnull();
+ for (auto& nb : nb_list) {
+ // Get a key like "foobar_head"
+ std::string key;
+ dentry_key_t dn_key(nb.dnlast, nb.dn.c_str());
+ dn_key.encode(key);
+ read_keys.insert(key);
+ }
+
// Perform bulk read of existing dentries
std::map<std::string, bufferlist> read_vals;
r = input.omap_get_vals_by_keys(frag_oid.name, read_keys, &read_vals);
}
}
+ std::set<std::string> null_vals;
+ for (auto& nb : nb_list) {
+ std::string key;
+ dentry_key_t dn_key(nb.dnlast, nb.dn.c_str());
+ dn_key.encode(key);
+
+ dout(4) << "inspecting nullbit " << frag_oid.name << "/" << nb.dn
+ << dendl;
+
+ auto it = read_vals.find(key);
+ if (it != read_vals.end()) {
+ dout(4) << "dentry exists, will remove" << dendl;
+
+ bufferlist::iterator q = it->second.begin();
+ snapid_t dnfirst;
+ ::decode(dnfirst, q);
+ char dentry_type;
+ ::decode(dentry_type, q);
+
+ bool remove_dentry = false;
+ if (dentry_type == 'L') {
+ dout(10) << "Existing hardlink inode in slot to be (maybe) removed "
+ << "by null journal dn '" << nb.dn.c_str()
+ << "' with lump fnode version " << lump.fnode.version
+ << "vs existing fnode version " << old_fnode_version << dendl;
+ remove_dentry = old_fnode_version < lump.fnode.version;
+ } else if (dentry_type == 'I') {
+ dout(10) << "Existing full inode in slot to be (maybe) removed "
+ << "by null journal dn '" << nb.dn.c_str()
+ << "' with lump fnode version " << lump.fnode.version
+ << "vs existing fnode version " << old_fnode_version << dendl;
+ remove_dentry = old_fnode_version < lump.fnode.version;
+ } else {
+ dout(4) << "corrupt dentry in backing store, will remove" << dendl;
+ remove_dentry = true;
+ }
+
+ if (remove_dentry)
+ null_vals.insert(key);
+ }
+ }
+
// Write back any new/changed dentries
if (!write_vals.empty()) {
r = output.omap_set(frag_oid.name, write_vals);
return r;
}
}
+
+ // remove any null dentries
+ if (!null_vals.empty()) {
+ r = output.omap_rm_keys(frag_oid.name, null_vals);
+ if (r != 0) {
+ derr << "error removing dentries from " << frag_oid.name
+ << ": " << cpp_strerror(r) << dendl;
+ return r;
+ }
+ }
}
/* Now that we've looked at the dirlumps, we finally pay attention to
action/MirrorImage.cc
action/Nbd.cc
action/ObjectMap.cc
+ action/Pool.cc
action/Remove.cc
action/Rename.cc
action/Resize.cc
fd = STDOUT_FILENO;
max_concurrent_ops = 1;
} else {
- max_concurrent_ops = max(g_conf->rbd_concurrent_management_ops, 1);
+ max_concurrent_ops = g_conf->rbd_concurrent_management_ops;
fd = open(path, O_WRONLY | O_CREAT | O_EXCL, 0644);
if (fd < 0) {
return -errno;
ImportDiffContext(librbd::Image *image, int fd, size_t size, bool no_progress)
: image(image), fd(fd), size(size), pc("Importing image diff", no_progress),
throttle((fd == STDIN_FILENO) ? 1 :
- max(g_conf->rbd_concurrent_management_ops, 1), false), last_offset(0) {
+ g_conf->rbd_concurrent_management_ops, false), last_offset(0) {
}
void update_size(size_t new_size)
throttle.reset(new SimpleThrottle(1, false));
} else {
throttle.reset(new SimpleThrottle(
- max(g_conf->rbd_concurrent_management_ops, 1), false));
+ g_conf->rbd_concurrent_management_ops, false));
}
reqlen = min<uint64_t>(reqlen, size);
done:
if (pd > 2)
close(pd);
- if (sd)
+ if (sd > 2)
close(sd);
if (fd > 2)
close(fd);
if (r < 0) {
return r;
}
+
+ librbd::RBD rbd;
+ rbd_mirror_mode_t mirror_mode;
+ r = rbd.mirror_mode_get(io_ctx, &mirror_mode);
+ if (r < 0) {
+ std::cerr << "rbd: failed to retrieve mirror mode: "
+ << cpp_strerror(r) << std::endl;
+ return r;
+ }
+
+ if (mirror_mode == RBD_MIRROR_MODE_DISABLED) {
+ std::cerr << "rbd: failed to add mirror peer: "
+ << "mirroring must be enabled on the pool "
+ << pool_name << std::endl;
+ return -EINVAL;
+ }
// TODO: temporary restriction to prevent adding multiple peers
// until rbd-mirror daemon can properly handle the scenario
- librbd::RBD rbd;
std::vector<librbd::mirror_peer_t> mirror_peers;
r = rbd.mirror_peer_list(io_ctx, &mirror_peers);
if (r < 0) {
--- /dev/null
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#include "tools/rbd/ArgumentTypes.h"
+#include "tools/rbd/Shell.h"
+#include "tools/rbd/Utils.h"
+#include "include/stringify.h"
+#include "common/errno.h"
+#include "osd/osd_types.h"
+#include <iostream>
+#include <boost/program_options.hpp>
+
+namespace rbd {
+namespace action {
+namespace pool {
+
+namespace at = argument_types;
+namespace po = boost::program_options;
+
+// Declare the arguments of the "pool init" action: the common pool options
+// plus a boolean --force switch.
+void get_arguments_init(po::options_description *positional,
+                        po::options_description *options) {
+  at::add_pool_options(positional, options);
+
+  const char *force_help =
+    "force initialize pool for RBD use if registered by another application";
+  options->add_options()("force", po::bool_switch(), force_help);
+}
+
+// Handler for the "pool init" action: enables the RBD application on the
+// pool named in vm, passing through the --force switch.
+// Returns 0 on success, or the negative error code from pool setup or from
+// application_enable() so the caller can report the failure.
+int execute_init(const po::variables_map &vm) {
+  size_t arg_index = 0;
+  std::string pool_name = utils::get_pool_name(vm, &arg_index);
+
+  librados::Rados rados;
+  librados::IoCtx io_ctx;
+  int r = utils::init(pool_name, &rados, &io_ctx);
+  if (r < 0) {
+    return r;
+  }
+
+  r = io_ctx.application_enable(pg_pool_t::APPLICATION_NAME_RBD,
+                                vm["force"].as<bool>());
+  if (r == -EOPNOTSUPP) {
+    std::cerr << "rbd: luminous or later release required." << std::endl;
+  } else if (r == -EPERM) {
+    std::cerr << "rbd: pool already registered to a different application."
+              << std::endl;
+  } else if (r < 0) {
+    std::cerr << "rbd: error registered application: " << cpp_strerror(r)
+              << std::endl;
+  }
+
+  // propagate the failure instead of reporting success after printing an error
+  return (r < 0) ? r : 0;
+}
+
+Shell::Action action(
+ {"pool", "init"}, {}, "Initialize pool for use by RBD.", "",
+ &get_arguments_init, &execute_init);
+
+} // namespace pool
+} // namespace action
+} // namespace rbd
{"snap", "create"}, {"snap", "add"}, "Create a snapshot.", "",
&get_create_arguments, &execute_create);
Shell::Action action_remove(
- {"snap", "remove"}, {"snap", "rm"}, "Deletes a snapshot.", "",
+ {"snap", "remove"}, {"snap", "rm"}, "Delete a snapshot.", "",
&get_remove_arguments, &execute_remove);
Shell::Action action_purge(
- {"snap", "purge"}, {}, "Deletes all snapshots.", "",
+ {"snap", "purge"}, {}, "Delete all snapshots.", "",
&get_purge_arguments, &execute_purge);
Shell::Action action_rollback(
{"snap", "rollback"}, {"snap", "revert"}, "Rollback image to snapshot.", "",
Shell::Action action_move(
- {"trash", "move"}, {"trash", "mv"}, "Moves an image to the trash.", "",
+ {"trash", "move"}, {"trash", "mv"}, "Move an image to the trash.", "",
&get_move_arguments, &execute_move);
Shell::Action action_remove(
- {"trash", "remove"}, {"trash", "rm"}, "Removes an image from trash.", "",
+ {"trash", "remove"}, {"trash", "rm"}, "Remove an image from trash.", "",
&get_remove_arguments, &execute_remove);
Shell::SwitchArguments switched_arguments({"long", "l"});
&get_list_arguments, &execute_list);
Shell::Action action_restore(
- {"trash", "restore"}, {}, "Restores an image from trash.", "",
+ {"trash", "restore"}, {}, "Restore an image from trash.", "",
&get_restore_arguments, &execute_restore);
} // namespace trash
MirrorStatusWatcher.cc
PoolReplayer.cc
PoolWatcher.cc
+ ServiceDaemon.cc
Threads.cc
types.cc
image_replayer/BootstrapRequest.cc
image_sync/SnapshotCreateRequest.cc
image_sync/SyncPointCreateRequest.cc
image_sync/SyncPointPruneRequest.cc
- pool_watcher/RefreshImagesRequest.cc)
+ pool_watcher/RefreshImagesRequest.cc
+ service_daemon/Types.cc)
add_library(rbd_mirror_internal STATIC
${rbd_mirror_internal})
#include "cls/rbd/cls_rbd_client.h"
#include "librbd/internal.h"
#include "librbd/api/Mirror.h"
+#include "tools/rbd_mirror/ServiceDaemon.h"
#define dout_context g_ceph_context
#define dout_subsys ceph_subsys_rbd_mirror
namespace rbd {
namespace mirror {
-ClusterWatcher::ClusterWatcher(RadosRef cluster, Mutex &lock) :
- m_lock(lock),
- m_cluster(cluster)
+ClusterWatcher::ClusterWatcher(RadosRef cluster, Mutex &lock,
+ ServiceDaemon<librbd::ImageCtx>* service_daemon)
+ : m_cluster(cluster), m_lock(lock), m_service_daemon(service_daemon)
{
}
return;
}
- for (auto kv : pools) {
+ std::set<int64_t> service_pool_ids;
+ for (auto& kv : pools) {
int64_t pool_id = kv.first;
- string pool_name = kv.second;
+ auto& pool_name = kv.second;
int64_t base_tier;
r = m_cluster->pool_get_base_tier(pool_id, &base_tier);
if (r == -ENOENT) {
cls::rbd::MirrorMode mirror_mode_internal;
r = librbd::cls_client::mirror_mode_get(&ioctx, &mirror_mode_internal);
+ if (r == 0 && mirror_mode_internal == cls::rbd::MIRROR_MODE_DISABLED) {
+ dout(10) << "mirroring is disabled for pool " << pool_name << dendl;
+ continue;
+ }
+
+ service_pool_ids.insert(pool_id);
+ if (m_service_pools.find(pool_id) == m_service_pools.end()) {
+ m_service_pools[pool_id] = {};
+ m_service_daemon->add_pool(pool_id, pool_name);
+ }
+
if (r == -EPERM) {
dout(10) << "access denied querying pool " << pool_name << dendl;
+ m_service_pools[pool_id] = m_service_daemon->add_or_update_callout(
+ pool_id, m_service_pools[pool_id],
+ service_daemon::CALLOUT_LEVEL_WARNING, "access denied");
continue;
} else if (r < 0) {
derr << "could not tell whether mirroring was enabled for " << pool_name
<< " : " << cpp_strerror(r) << dendl;
- continue;
- }
- if (mirror_mode_internal == cls::rbd::MIRROR_MODE_DISABLED) {
- dout(10) << "mirroring is disabled for pool " << pool_name << dendl;
+ m_service_pools[pool_id] = m_service_daemon->add_or_update_callout(
+ pool_id, m_service_pools[pool_id],
+ service_daemon::CALLOUT_LEVEL_WARNING, "mirroring mode query failed");
continue;
}
if (r < 0) {
derr << "error reading mirroring config for pool " << pool_name
<< cpp_strerror(r) << dendl;
+ m_service_pools[pool_id] = m_service_daemon->add_or_update_callout(
+ pool_id, m_service_pools[pool_id],
+ service_daemon::CALLOUT_LEVEL_ERROR, "mirroring peer list failed");
continue;
}
+ if (m_service_pools[pool_id] != service_daemon::CALLOUT_ID_NONE) {
+ m_service_daemon->remove_callout(pool_id, m_service_pools[pool_id]);
+ m_service_pools[pool_id] = service_daemon::CALLOUT_ID_NONE;
+ }
+
pool_peers->insert({pool_id, Peers{configs.begin(), configs.end()}});
pool_names->insert(pool_name);
}
+
+ for (auto it = m_service_pools.begin(); it != m_service_pools.end(); ) {
+ auto current_it(it++);
+ if (service_pool_ids.find(current_it->first) == service_pool_ids.end()) {
+ m_service_daemon->remove_pool(current_it->first);
+ m_service_pools.erase(current_it->first);
+ }
+ }
}
} // namespace mirror
#include "common/Timer.h"
#include "include/rados/librados.hpp"
#include "types.h"
+#include "tools/rbd_mirror/service_daemon/Types.h"
+#include <unordered_map>
+
+namespace librbd { struct ImageCtx; }
namespace rbd {
namespace mirror {
+template <typename> class ServiceDaemon;
+
/**
* Tracks mirroring configuration for pools in a single
* cluster.
typedef std::map<int64_t, Peers> PoolPeers;
typedef std::set<std::string> PoolNames;
- ClusterWatcher(RadosRef cluster, Mutex &lock);
+ ClusterWatcher(RadosRef cluster, Mutex &lock,
+ ServiceDaemon<librbd::ImageCtx>* service_daemon);
~ClusterWatcher() = default;
ClusterWatcher(const ClusterWatcher&) = delete;
ClusterWatcher& operator=(const ClusterWatcher&) = delete;
const PoolPeers& get_pool_peers() const;
private:
- Mutex &m_lock;
+ typedef std::unordered_map<int64_t, service_daemon::CalloutId> ServicePools;
+
RadosRef m_cluster;
+ Mutex &m_lock;
+ ServiceDaemon<librbd::ImageCtx>* m_service_daemon;
+
+ ServicePools m_service_pools;
PoolPeers m_pool_peers;
void read_pool_peers(PoolPeers *pool_peers, PoolNames *pool_names);
* Foundation. See file COPYING.
*
*/
+
#include <boost/bind.hpp>
#include <map>
#include <set>
virtual bool call(Formatter *f, stringstream *ss) = 0;
};
+template <typename I>
class StatusCommand : public ImageDeleterAdminSocketCommand {
public:
- explicit StatusCommand(ImageDeleter *image_del) : image_del(image_del) {}
+ explicit StatusCommand(ImageDeleter<I> *image_del) : image_del(image_del) {}
bool call(Formatter *f, stringstream *ss) override {
image_del->print_status(f, ss);
}
private:
- ImageDeleter *image_del;
+ ImageDeleter<I> *image_del;
};
struct DeleteJournalPolicy : public librbd::journal::Policy {
} // anonymous namespace
+template <typename I>
class ImageDeleterAdminSocketHook : public AdminSocketHook {
public:
- ImageDeleterAdminSocketHook(CephContext *cct, ImageDeleter *image_del) :
+ ImageDeleterAdminSocketHook(CephContext *cct, ImageDeleter<I> *image_del) :
admin_socket(cct->get_admin_socket()) {
std::string command;
r = admin_socket->register_command(command, command, this,
"get status for image deleter");
if (r == 0) {
- commands[command] = new StatusCommand(image_del);
+ commands[command] = new StatusCommand<I>(image_del);
}
}
Commands commands;
};
-ImageDeleter::ImageDeleter(ContextWQ *work_queue, SafeTimer *timer,
- Mutex *timer_lock)
- : m_running(true),
- m_work_queue(work_queue),
+template <typename I>
+ImageDeleter<I>::ImageDeleter(ContextWQ *work_queue, SafeTimer *timer,
+ Mutex *timer_lock,
+ ServiceDaemon<librbd::ImageCtx>* service_daemon)
+ : m_work_queue(work_queue),
+ m_service_daemon(service_daemon),
m_delete_lock("rbd::mirror::ImageDeleter::Delete"),
m_image_deleter_thread(this),
m_failed_timer(timer),
m_failed_timer_lock(timer_lock),
- m_asok_hook(new ImageDeleterAdminSocketHook(g_ceph_context, this))
+ m_asok_hook(new ImageDeleterAdminSocketHook<I>(g_ceph_context, this))
{
set_failed_timer_interval(g_ceph_context->_conf->rbd_mirror_delete_retry_interval);
m_image_deleter_thread.create("image_deleter");
}
-ImageDeleter::~ImageDeleter() {
+template <typename I>
+ImageDeleter<I>::~ImageDeleter() {
dout(20) << "enter" << dendl;
m_running = false;
dout(20) << "return" << dendl;
}
-void ImageDeleter::run() {
+template <typename I>
+void ImageDeleter<I>::run() {
dout(20) << "enter" << dendl;
while(m_running) {
m_delete_lock.Lock();
}
}
-void ImageDeleter::schedule_image_delete(RadosRef local_rados,
- int64_t local_pool_id,
- const std::string& global_image_id) {
+template <typename I>
+void ImageDeleter<I>::schedule_image_delete(RadosRef local_rados,
+ int64_t local_pool_id,
+ const std::string& global_image_id,
+ bool ignore_orphaned) {
dout(20) << "enter" << dendl;
Mutex::Locker locker(m_delete_lock);
if (del_info != nullptr) {
dout(20) << "image " << global_image_id << " "
<< "was already scheduled for deletion" << dendl;
+ if (ignore_orphaned) {
+ (*del_info)->ignore_orphaned = true;
+ }
return;
}
m_delete_queue.push_front(
unique_ptr<DeleteInfo>(new DeleteInfo(local_rados, local_pool_id,
- global_image_id)));
+ global_image_id, ignore_orphaned)));
m_delete_queue_cond.Signal();
}
-void ImageDeleter::wait_for_scheduled_deletion(int64_t local_pool_id,
- const std::string &global_image_id,
- Context *ctx,
- bool notify_on_failed_retry) {
+template <typename I>
+void ImageDeleter<I>::wait_for_scheduled_deletion(int64_t local_pool_id,
+ const std::string &global_image_id,
+ Context *ctx,
+ bool notify_on_failed_retry) {
ctx = new FunctionContext([this, ctx](int r) {
m_work_queue->queue(ctx, r);
(*del_info)->notify_on_failed_retry = notify_on_failed_retry;
}
-void ImageDeleter::cancel_waiter(int64_t local_pool_id,
- const std::string &global_image_id) {
+template <typename I>
+void ImageDeleter<I>::cancel_waiter(int64_t local_pool_id,
+ const std::string &global_image_id) {
Mutex::Locker locker(m_delete_lock);
auto del_info = find_delete_info(local_pool_id, global_image_id);
if (!del_info) {
}
}
-bool ImageDeleter::process_image_delete() {
-
+template <typename I>
+bool ImageDeleter<I>::process_image_delete() {
stringstream ss;
m_active_delete->to_string(ss);
std::string del_info_str = ss.str();
return true;
}
- bool is_primary = false;
+ std::string mirror_uuid;
C_SaferCond tag_owner_ctx;
- Journal<>::is_tag_owner(ioctx, local_image_id, &is_primary,
- m_work_queue, &tag_owner_ctx);
+ Journal<>::get_tag_owner(ioctx, local_image_id, &mirror_uuid, m_work_queue,
+ &tag_owner_ctx);
r = tag_owner_ctx.wait();
if (r < 0 && r != -ENOENT) {
derr << "error retrieving image primary info for image " << global_image_id
<< ": " << cpp_strerror(r) << dendl;
enqueue_failed_delete(r);
return true;
- }
- if (is_primary) {
- dout(10) << "image " << global_image_id << " is local primary" << dendl;
- complete_active_delete(-EISPRM);
- return true;
+ } else if (r != -ENOENT) {
+ if (mirror_uuid == Journal<>::LOCAL_MIRROR_UUID) {
+ dout(10) << "image " << global_image_id << " is local primary" << dendl;
+ complete_active_delete(-EISPRM);
+ return true;
+ } else if (mirror_uuid == Journal<>::ORPHAN_MIRROR_UUID &&
+ !m_active_delete->ignore_orphaned) {
+ dout(10) << "image " << global_image_id << " is orphaned" << dendl;
+ complete_active_delete(-EISPRM);
+ return true;
+ }
}
dout(20) << "local image is not the primary" << dendl;
-
bool has_snapshots;
r = image_has_snapshots_and_children(&ioctx, local_image_id, &has_snapshots);
if (r < 0) {
return true;
}
-int ImageDeleter::image_has_snapshots_and_children(IoCtx *ioctx,
- string& image_id,
- bool *has_snapshots) {
-
+template <typename I>
+int ImageDeleter<I>::image_has_snapshots_and_children(IoCtx *ioctx,
+ string& image_id,
+ bool *has_snapshots) {
string header_oid = librbd::util::header_name(image_id);
::SnapContext snapc;
int r = cls_client::get_snapcontext(ioctx, header_oid, &snapc);
return 0;
}
-void ImageDeleter::complete_active_delete(int r) {
+template <typename I>
+void ImageDeleter<I>::complete_active_delete(int r) {
dout(20) << dendl;
Mutex::Locker delete_locker(m_delete_lock);
m_active_delete.reset();
}
-void ImageDeleter::enqueue_failed_delete(int error_code) {
+template <typename I>
+void ImageDeleter<I>::enqueue_failed_delete(int error_code) {
dout(20) << "enter" << dendl;
if (error_code == -EBLACKLISTED) {
m_delete_lock.Unlock();
if (was_empty) {
FunctionContext *ctx = new FunctionContext(
- boost::bind(&ImageDeleter::retry_failed_deletions, this));
+ boost::bind(&ImageDeleter<I>::retry_failed_deletions, this));
Mutex::Locker l(*m_failed_timer_lock);
m_failed_timer->add_event_after(m_failed_interval, ctx);
}
}
-void ImageDeleter::retry_failed_deletions() {
+template <typename I>
+void ImageDeleter<I>::retry_failed_deletions() {
dout(20) << "enter" << dendl;
Mutex::Locker l(m_delete_lock);
}
}
-unique_ptr<ImageDeleter::DeleteInfo> const* ImageDeleter::find_delete_info(
- int64_t local_pool_id, const std::string &global_image_id) {
+template <typename I>
+unique_ptr<typename ImageDeleter<I>::DeleteInfo> const*
+ImageDeleter<I>::find_delete_info(int64_t local_pool_id,
+ const std::string &global_image_id) {
assert(m_delete_lock.is_locked());
if (m_active_delete && m_active_delete->match(local_pool_id,
return nullptr;
}
-void ImageDeleter::print_status(Formatter *f, stringstream *ss) {
+template <typename I>
+void ImageDeleter<I>::print_status(Formatter *f, stringstream *ss) {
dout(20) << "enter" << dendl;
if (f) {
}
}
-void ImageDeleter::DeleteInfo::notify(int r) {
+template <typename I>
+void ImageDeleter<I>::DeleteInfo::notify(int r) {
if (on_delete) {
dout(20) << "executing image deletion handler r=" << r << dendl;
}
}
-void ImageDeleter::DeleteInfo::to_string(stringstream& ss) {
+template <typename I>
+void ImageDeleter<I>::DeleteInfo::to_string(stringstream& ss) {
ss << "[" << "local_pool_id=" << local_pool_id << ", ";
ss << "global_image_id=" << global_image_id << "]";
}
-void ImageDeleter::DeleteInfo::print_status(Formatter *f, stringstream *ss,
- bool print_failure_info) {
+template <typename I>
+void ImageDeleter<I>::DeleteInfo::print_status(Formatter *f, stringstream *ss,
+ bool print_failure_info) {
if (f) {
f->open_object_section("delete_info");
f->dump_int("local_pool_id", local_pool_id);
}
}
-vector<string> ImageDeleter::get_delete_queue_items() {
+template <typename I>
+vector<string> ImageDeleter<I>::get_delete_queue_items() {
vector<string> items;
Mutex::Locker l(m_delete_lock);
return items;
}
-vector<pair<string, int> > ImageDeleter::get_failed_queue_items() {
+template <typename I>
+vector<pair<string, int> > ImageDeleter<I>::get_failed_queue_items() {
vector<pair<string, int> > items;
Mutex::Locker l(m_delete_lock);
return items;
}
-void ImageDeleter::set_failed_timer_interval(double interval) {
+template <typename I>
+void ImageDeleter<I>::set_failed_timer_interval(double interval) {
this->m_failed_interval = interval;
}
} // namespace mirror
} // namespace rbd
+
+template class rbd::mirror::ImageDeleter<librbd::ImageCtx>;
#include <vector>
#include <atomic>
+class AdminSocketHook;
class ContextWQ;
+namespace librbd { struct ImageCtx; }
namespace rbd {
namespace mirror {
-class ImageDeleterAdminSocketHook;
+template <typename> class ServiceDaemon;
/**
* Manage deletion of non-primary images.
*/
+template <typename ImageCtxT = librbd::ImageCtx>
class ImageDeleter {
public:
static const int EISPRM = 1000;
- ImageDeleter(ContextWQ *work_queue, SafeTimer *timer, Mutex *timer_lock);
+ ImageDeleter(ContextWQ *work_queue, SafeTimer *timer, Mutex *timer_lock,
+ ServiceDaemon<librbd::ImageCtx>* service_daemon);
~ImageDeleter();
ImageDeleter(const ImageDeleter&) = delete;
ImageDeleter& operator=(const ImageDeleter&) = delete;
void schedule_image_delete(RadosRef local_rados,
int64_t local_pool_id,
- const std::string& global_image_id);
+ const std::string& global_image_id,
+ bool ignore_orphaned);
void wait_for_scheduled_deletion(int64_t local_pool_id,
const std::string &global_image_id,
Context *ctx,
RadosRef local_rados;
int64_t local_pool_id;
std::string global_image_id;
+ bool ignore_orphaned;
int error_code = 0;
int retries = 0;
bool notify_on_failed_retry = true;
Context *on_delete = nullptr;
DeleteInfo(RadosRef local_rados, int64_t local_pool_id,
- const std::string& global_image_id) :
- local_rados(local_rados), local_pool_id(local_pool_id),
- global_image_id(global_image_id) {
+ const std::string& global_image_id,
+ bool ignore_orphaned)
+ : local_rados(local_rados), local_pool_id(local_pool_id),
+ global_image_id(global_image_id), ignore_orphaned(ignore_orphaned) {
}
bool match(int64_t local_pool_id, const std::string &global_image_id) {
bool print_failure_info=false);
};
- std::atomic<unsigned> m_running { 0 };
+ std::atomic<unsigned> m_running { 1 };
ContextWQ *m_work_queue;
+ ServiceDaemon<librbd::ImageCtx>* m_service_daemon;
std::deque<std::unique_ptr<DeleteInfo> > m_delete_queue;
Mutex m_delete_lock;
SafeTimer *m_failed_timer;
Mutex *m_failed_timer_lock;
- ImageDeleterAdminSocketHook *m_asok_hook;
+ AdminSocketHook *m_asok_hook;
void run();
bool process_image_delete();
} // namespace mirror
} // namespace rbd
+extern template class rbd::mirror::ImageDeleter<librbd::ImageCtx>;
+
#endif // CEPH_RBD_MIRROR_IMAGEDELETER_H
template <typename I>
ImageReplayer<I>::ImageReplayer(Threads<librbd::ImageCtx> *threads,
- shared_ptr<ImageDeleter> image_deleter,
+ ImageDeleter<I>* image_deleter,
InstanceWatcher<I> *instance_watcher,
RadosRef local,
const std::string &local_mirror_uuid,
delete m_asok_hook;
}
+// Map the cached mirror image status state to a health level:
+// no state recorded -> OK, SYNCING or UNKNOWN -> WARNING, anything else -> ERROR.
+template <typename I>
+image_replayer::HealthState ImageReplayer<I>::get_health_state() const {
+  Mutex::Locker locker(m_lock);
+
+  if (!m_mirror_image_status_state) {
+    return image_replayer::HEALTH_STATE_OK;
+  }
+
+  const auto state = *m_mirror_image_status_state;
+  const bool warn_only =
+    (state == cls::rbd::MIRROR_IMAGE_STATUS_STATE_SYNCING ||
+     state == cls::rbd::MIRROR_IMAGE_STATUS_STATE_UNKNOWN);
+  if (warn_only) {
+    return image_replayer::HEALTH_STATE_WARNING;
+  }
+  return image_replayer::HEALTH_STATE_ERROR;
+}
+
template <typename I>
void ImageReplayer<I>::add_remote_image(const std::string &mirror_uuid,
const std::string &image_id,
dout(20) << dendl;
if (m_remote_images.empty()) {
- on_start_fail(0, "waiting for primary remote image");
+ on_start_fail(-EREMOTEIO, "waiting for primary remote image");
return;
}
if (r == -EREMOTEIO) {
m_local_image_tag_owner = "";
- dout(5) << "remote image is non-primary or local image is primary" << dendl;
- on_start_fail(0, "remote image is non-primary or local image is primary");
+ dout(5) << "remote image is non-primary" << dendl;
+ on_start_fail(-EREMOTEIO, "remote image is non-primary");
return;
} else if (r == -EEXIST) {
m_local_image_tag_owner = "";
Mutex::Locker locker(m_lock);
assert(m_state == STATE_STARTING);
m_state = STATE_STOPPING;
- if (r < 0 && r != -ECANCELED) {
+ if (r < 0 && r != -ECANCELED && r != -EREMOTEIO) {
derr << "start failed: " << cpp_strerror(r) << dendl;
} else {
dout(20) << "start canceled" << dendl;
State state;
std::string state_desc;
int last_r;
- bool bootstrapping;
bool stopping_replay;
+
+ OptionalMirrorImageStatusState mirror_image_status_state{
+ boost::make_optional(false, cls::rbd::MirrorImageStatusState{})};
+ image_replayer::BootstrapRequest<I>* bootstrap_request = nullptr;
{
Mutex::Locker locker(m_lock);
state = m_state;
state_desc = m_state_desc;
+ mirror_image_status_state = m_mirror_image_status_state;
last_r = m_last_r;
- bootstrapping = (m_bootstrap_request != nullptr);
stopping_replay = (m_local_image_ctx != nullptr);
+
+ if (m_bootstrap_request != nullptr) {
+ bootstrap_request = m_bootstrap_request;
+ bootstrap_request->get();
+ }
+ }
+
+ bool syncing = false;
+ if (bootstrap_request != nullptr) {
+ syncing = bootstrap_request->is_syncing();
+ bootstrap_request->put();
+ bootstrap_request = nullptr;
}
if (opt_state) {
status.up = true;
switch (state) {
case STATE_STARTING:
- if (bootstrapping) {
+ if (syncing) {
status.state = cls::rbd::MIRROR_IMAGE_STATUS_STATE_SYNCING;
status.description = state_desc.empty() ? "syncing" : state_desc;
+ mirror_image_status_state = status.state;
} else {
status.state = cls::rbd::MIRROR_IMAGE_STATUS_STATE_STARTING_REPLAY;
status.description = "starting replay";
return;
}
status.description = "replaying, " + desc;
+ mirror_image_status_state = boost::none;
}
break;
case STATE_STOPPING:
}
// FALLTHROUGH
case STATE_STOPPED:
- if (last_r < 0) {
+ if (last_r == -EREMOTEIO) {
+ status.state = cls::rbd::MIRROR_IMAGE_STATUS_STATE_UNKNOWN;
+ status.description = state_desc;
+ mirror_image_status_state = status.state;
+ } else if (last_r < 0) {
status.state = cls::rbd::MIRROR_IMAGE_STATUS_STATE_ERROR;
status.description = state_desc;
+ mirror_image_status_state = status.state;
} else {
status.state = cls::rbd::MIRROR_IMAGE_STATUS_STATE_STOPPED;
status.description = state_desc.empty() ? "stopped" : state_desc;
+ mirror_image_status_state = boost::none;
}
break;
default:
assert(!"invalid state");
}
+ {
+ Mutex::Locker locker(m_lock);
+ m_mirror_image_status_state = mirror_image_status_state;
+ }
+
+ // prevent the status from ping-ponging when failed replays are restarted
+ if (mirror_image_status_state &&
+ *mirror_image_status_state == cls::rbd::MIRROR_IMAGE_STATUS_STATE_ERROR) {
+ status.state = *mirror_image_status_state;
+ }
+
dout(20) << "status=" << status << dendl;
librados::ObjectWriteOperation op;
librbd::cls_client::mirror_image_status_set(&op, m_global_image_id, status);
if (m_stopping_for_resync) {
m_image_deleter->schedule_image_delete(m_local,
m_local_pool_id,
- m_global_image_id);
+ m_global_image_id,
+ true);
m_stopping_for_resync = false;
}
}
#include "ImageDeleter.h"
#include "ProgressContext.h"
#include "types.h"
+#include "tools/rbd_mirror/image_replayer/Types.h"
#include <boost/noncopyable.hpp>
#include <boost/optional.hpp>
template <typename ImageCtxT = librbd::ImageCtx>
class ImageReplayer {
public:
- typedef typename librbd::journal::TypeTraits<ImageCtxT>::ReplayEntry ReplayEntry;
-
- enum State {
- STATE_UNKNOWN,
- STATE_STARTING,
- STATE_REPLAYING,
- STATE_REPLAY_FLUSHING,
- STATE_STOPPING,
- STATE_STOPPED,
- };
-
static ImageReplayer *create(
- Threads<librbd::ImageCtx> *threads,
- std::shared_ptr<ImageDeleter> image_deleter,
+ Threads<librbd::ImageCtx> *threads, ImageDeleter<ImageCtxT>* image_deleter,
InstanceWatcher<ImageCtxT> *instance_watcher,
RadosRef local, const std::string &local_mirror_uuid, int64_t local_pool_id,
const std::string &global_image_id) {
}
ImageReplayer(Threads<librbd::ImageCtx> *threads,
- std::shared_ptr<ImageDeleter> image_deleter,
+ ImageDeleter<ImageCtxT>* image_deleter,
InstanceWatcher<ImageCtxT> *instance_watcher,
RadosRef local, const std::string &local_mirror_uuid,
int64_t local_pool_id, const std::string &global_image_id);
ImageReplayer(const ImageReplayer&) = delete;
ImageReplayer& operator=(const ImageReplayer&) = delete;
- State get_state() { Mutex::Locker l(m_lock); return get_state_(); }
bool is_stopped() { Mutex::Locker l(m_lock); return is_stopped_(); }
bool is_running() { Mutex::Locker l(m_lock); return is_running_(); }
bool is_replaying() { Mutex::Locker l(m_lock); return is_replaying_(); }
return (m_last_r == -EBLACKLISTED);
}
+ image_replayer::HealthState get_health_state() const;
+
void add_remote_image(const std::string &remote_mirror_uuid,
const std::string &remote_image_id,
librados::IoCtx &remote_io_ctx);
bool on_replay_interrupted();
private:
+ typedef typename librbd::journal::TypeTraits<ImageCtxT>::ReplayEntry ReplayEntry;
+
+ enum State {
+ STATE_UNKNOWN,
+ STATE_STARTING,
+ STATE_REPLAYING,
+ STATE_REPLAY_FLUSHING,
+ STATE_STOPPING,
+ STATE_STOPPED,
+ };
+
struct RemoteImage {
std::string mirror_uuid;
std::string image_id;
typedef typename librbd::journal::TypeTraits<ImageCtxT>::Journaler Journaler;
typedef boost::optional<State> OptionalState;
+ typedef boost::optional<cls::rbd::MirrorImageStatusState>
+ OptionalMirrorImageStatusState;
struct JournalListener : public librbd::journal::Listener {
ImageReplayer *img_replayer;
};
Threads<librbd::ImageCtx> *m_threads;
- std::shared_ptr<ImageDeleter> m_image_deleter;
+ ImageDeleter<ImageCtxT>* m_image_deleter;
InstanceWatcher<ImageCtxT> *m_instance_watcher;
RemoteImages m_remote_images;
std::string m_name;
mutable Mutex m_lock;
State m_state = STATE_STOPPED;
- int m_last_r = 0;
std::string m_state_desc;
+
+ OptionalMirrorImageStatusState m_mirror_image_status_state = boost::none;
+ int m_last_r = 0;
+
BootstrapProgressContext m_progress_cxt;
bool m_do_resync{false};
image_replayer::EventPreprocessor<ImageCtxT> *m_event_preprocessor = nullptr;
static std::string to_string(const State state);
- State get_state_() const {
- return m_state;
- }
bool is_stopped_() const {
return m_state == STATE_STOPPED;
}
#include "librbd/Utils.h"
#include "ImageReplayer.h"
#include "InstanceReplayer.h"
+#include "ServiceDaemon.h"
#include "Threads.h"
#define dout_context g_ceph_context
namespace rbd {
namespace mirror {
+namespace {
+
+const std::string SERVICE_DAEMON_ASSIGNED_COUNT_KEY("image_assigned_count");
+const std::string SERVICE_DAEMON_WARNING_COUNT_KEY("image_warning_count");
+const std::string SERVICE_DAEMON_ERROR_COUNT_KEY("image_error_count");
+
+} // anonymous namespace
+
using librbd::util::create_async_context_callback;
using librbd::util::create_context_callback;
template <typename I>
InstanceReplayer<I>::InstanceReplayer(
- Threads<I> *threads, std::shared_ptr<ImageDeleter> image_deleter,
- RadosRef local_rados, const std::string &local_mirror_uuid,
- int64_t local_pool_id)
- : m_threads(threads), m_image_deleter(image_deleter),
- m_local_rados(local_rados), m_local_mirror_uuid(local_mirror_uuid),
- m_local_pool_id(local_pool_id),
+ Threads<I> *threads, ServiceDaemon<I>* service_daemon,
+ ImageDeleter<I>* image_deleter, RadosRef local_rados,
+ const std::string &local_mirror_uuid, int64_t local_pool_id)
+ : m_threads(threads), m_service_daemon(service_daemon),
+ m_image_deleter(image_deleter), m_local_rados(local_rados),
+ m_local_mirror_uuid(local_mirror_uuid), m_local_pool_id(local_pool_id),
m_lock("rbd::mirror::InstanceReplayer " + stringify(local_pool_id)) {
}
[this, image_replayer, on_finish] (int r) {
auto global_image_id = image_replayer->get_global_image_id();
m_image_deleter->schedule_image_delete(
- m_local_rados, m_local_pool_id, global_image_id);
+ m_local_rados, m_local_pool_id, global_image_id, false);
on_finish->complete(0);
});
}
}
template <typename I>
-void InstanceReplayer<I>::start_image_replayers() {
+void InstanceReplayer<I>::queue_start_image_replayers() {
dout(20) << dendl;
- Context *ctx = new FunctionContext(
- [this] (int r) {
- Mutex::Locker locker(m_lock);
- m_async_op_tracker.finish_op();
- if (m_on_shut_down != nullptr) {
- return;
- }
- for (auto &it : m_image_replayers) {
- start_image_replayer(it.second);
- }
- });
-
+ Context *ctx = create_context_callback<
+ InstanceReplayer, &InstanceReplayer<I>::start_image_replayers>(this);
m_async_op_tracker.start_op();
m_threads->work_queue->queue(ctx, 0);
}
+// Work queue callback scheduled by queue_start_image_replayers(): invokes
+// start_image_replayer() for every entry in m_image_replayers and publishes
+// the assigned / warning / error image counts for the local pool to the
+// service daemon.
+//
+// @param r result handed over by the context callback (not inspected)
+template <typename I>
+void InstanceReplayer<I>::start_image_replayers(int r) {
+  dout(20) << dendl;
+
+  Mutex::Locker locker(m_lock);
+  if (m_on_shut_down != nullptr) {
+    // shut down was requested: skip the restart pass, but still release the
+    // op started in queue_start_image_replayers() -- otherwise the async op
+    // tracker never drains
+    m_async_op_tracker.finish_op();
+    return;
+  }
+
+  // NOTE(review): uint64_t rather than size_t so the counters bind to the
+  // unsigned attribute type regardless of how size_t is defined on the
+  // platform -- confirm against service_daemon::AttributeValue
+  uint64_t image_count = 0;
+  uint64_t warning_count = 0;
+  uint64_t error_count = 0;
+  for (auto &it : m_image_replayers) {
+    ++image_count;
+
+    // sample the health before (re)starting the replayer
+    auto health_state = it.second->get_health_state();
+    if (health_state == image_replayer::HEALTH_STATE_WARNING) {
+      ++warning_count;
+    } else if (health_state == image_replayer::HEALTH_STATE_ERROR) {
+      ++error_count;
+    }
+
+    start_image_replayer(it.second);
+  }
+
+  m_service_daemon->add_or_update_attribute(
+    m_local_pool_id, SERVICE_DAEMON_ASSIGNED_COUNT_KEY, image_count);
+  m_service_daemon->add_or_update_attribute(
+    m_local_pool_id, SERVICE_DAEMON_WARNING_COUNT_KEY, warning_count);
+  m_service_daemon->add_or_update_attribute(
+    m_local_pool_id, SERVICE_DAEMON_ERROR_COUNT_KEY, error_count);
+
+  m_async_op_tracker.finish_op();
+}
template <typename I>
void InstanceReplayer<I>::stop_image_replayer(ImageReplayer<I> *image_replayer,
assert(m_threads->timer_lock.is_locked());
m_image_state_check_task = nullptr;
schedule_image_state_check_task();
- start_image_replayers();
+ queue_start_image_replayers();
});
- int after =
- max(1, g_ceph_context->_conf->rbd_mirror_image_state_check_interval);
+ int after = g_ceph_context->_conf->rbd_mirror_image_state_check_interval;
dout(20) << "scheduling image state check after " << after << " sec (task "
<< m_image_state_check_task << ")" << dendl;
namespace rbd {
namespace mirror {
-class ImageDeleter;
-
+template <typename> class ImageDeleter;
template <typename> class ImageReplayer;
template <typename> class InstanceWatcher;
+template <typename> class ServiceDaemon;
template <typename> struct Threads;
template <typename ImageCtxT = librbd::ImageCtx>
class InstanceReplayer {
public:
static InstanceReplayer* create(
- Threads<ImageCtxT> *threads, std::shared_ptr<ImageDeleter> image_deleter,
+ Threads<ImageCtxT> *threads,
+ ServiceDaemon<ImageCtxT>* service_daemon,
+ ImageDeleter<ImageCtxT>* image_deleter,
RadosRef local_rados, const std::string &local_mirror_uuid,
int64_t local_pool_id) {
- return new InstanceReplayer(threads, image_deleter, local_rados,
- local_mirror_uuid, local_pool_id);
+ return new InstanceReplayer(threads, service_daemon, image_deleter,
+ local_rados, local_mirror_uuid, local_pool_id);
}
void destroy() {
delete this;
}
InstanceReplayer(Threads<ImageCtxT> *threads,
- std::shared_ptr<ImageDeleter> image_deleter,
+ ServiceDaemon<ImageCtxT>* service_daemon,
+ ImageDeleter<ImageCtxT>* image_deleter,
RadosRef local_rados, const std::string &local_mirror_uuid,
int64_t local_pool_id);
~InstanceReplayer();
typedef std::set<Peer> Peers;
Threads<ImageCtxT> *m_threads;
- std::shared_ptr<ImageDeleter> m_image_deleter;
+ ServiceDaemon<ImageCtxT>* m_service_daemon;
+ ImageDeleter<ImageCtxT>* m_image_deleter;
RadosRef m_local_rados;
std::string m_local_mirror_uuid;
int64_t m_local_pool_id;
void handle_wait_for_ops(int r);
void start_image_replayer(ImageReplayer<ImageCtxT> *image_replayer);
- void start_image_replayers();
+ void queue_start_image_replayers();
+ void start_image_replayers(int r);
void stop_image_replayer(ImageReplayer<ImageCtxT> *image_replayer,
Context *on_finish);
cancel_remove_task(instance);
- int after = max(1, m_cct->_conf->rbd_mirror_leader_heartbeat_interval) *
+ int after = m_cct->_conf->rbd_mirror_leader_heartbeat_interval *
(1 + m_cct->_conf->rbd_mirror_leader_max_missed_heartbeats +
m_cct->_conf->rbd_mirror_leader_max_acquire_attempts_before_break);
m_timer_gate->timer_callback = timer_callback;
});
- int after = delay_factor *
- max(1, m_cct->_conf->rbd_mirror_leader_heartbeat_interval);
+ int after = delay_factor * m_cct->_conf->rbd_mirror_leader_heartbeat_interval;
dout(20) << "scheduling " << name << " after " << after << " sec (task "
<< m_timer_task << ")" << dendl;
#include "common/errno.h"
#include "librbd/ImageCtx.h"
#include "Mirror.h"
+#include "ServiceDaemon.h"
#include "Threads.h"
-#include "ImageSync.h"
#define dout_context g_ceph_context
#define dout_subsys ceph_subsys_rbd_mirror
{
cct->lookup_or_create_singleton_object<Threads<librbd::ImageCtx> >(
m_threads, "rbd_mirror::threads");
+ m_service_daemon.reset(new ServiceDaemon<>(m_cct, m_local, m_threads));
}
Mirror::~Mirror()
return r;
}
- m_local_cluster_watcher.reset(new ClusterWatcher(m_local, m_lock));
+ r = m_service_daemon->init();
+ if (r < 0) {
+ derr << "error registering service daemon: " << cpp_strerror(r) << dendl;
+ return r;
+ }
- m_image_deleter.reset(new ImageDeleter(m_threads->work_queue,
- m_threads->timer,
- &m_threads->timer_lock));
+ m_local_cluster_watcher.reset(new ClusterWatcher(m_local, m_lock,
+ m_service_daemon.get()));
+ m_image_deleter.reset(new ImageDeleter<>(m_threads->work_queue,
+ m_threads->timer,
+ &m_threads->timer_lock,
+ m_service_daemon.get()));
return r;
}
for (auto it = m_pool_replayers.begin(); it != m_pool_replayers.end();) {
auto &peer = it->first.second;
auto pool_peer_it = pool_peers.find(it->first.first);
- if (it->second->is_blacklisted()) {
- derr << "removing blacklisted pool replayer for " << peer << dendl;
- // TODO: make async
- it = m_pool_replayers.erase(it);
- } else if (pool_peer_it == pool_peers.end() ||
- pool_peer_it->second.find(peer) == pool_peer_it->second.end()) {
+ if (pool_peer_it == pool_peers.end() ||
+ pool_peer_it->second.find(peer) == pool_peer_it->second.end()) {
dout(20) << "removing pool replayer for " << peer << dendl;
// TODO: make async
+ it->second->shut_down();
it = m_pool_replayers.erase(it);
} else {
++it;
for (auto &kv : pool_peers) {
for (auto &peer : kv.second) {
PoolPeer pool_peer(kv.first, peer);
- if (m_pool_replayers.find(pool_peer) == m_pool_replayers.end()) {
+
+ auto pool_replayers_it = m_pool_replayers.find(pool_peer);
+ if (pool_replayers_it != m_pool_replayers.end()) {
+ auto& pool_replayer = pool_replayers_it->second;
+ if (pool_replayer->is_blacklisted()) {
+ derr << "restarting blacklisted pool replayer for " << peer << dendl;
+ // TODO: make async
+ pool_replayer->shut_down();
+ pool_replayer->init();
+ } else if (!pool_replayer->is_running()) {
+ derr << "restarting failed pool replayer for " << peer << dendl;
+ // TODO: make async
+ pool_replayer->shut_down();
+ pool_replayer->init();
+ }
+ } else {
dout(20) << "starting pool replayer for " << peer << dendl;
unique_ptr<PoolReplayer> pool_replayer(new PoolReplayer(
- m_threads, m_image_deleter, kv.first, peer, m_args));
+ m_threads, m_service_daemon.get(), m_image_deleter.get(), kv.first,
+ peer, m_args));
- // TODO: make async, and retry connecting within pool replayer
- int r = pool_replayer->init();
- if (r < 0) {
- continue;
- }
+ // TODO: make async
+ pool_replayer->init();
m_pool_replayers.emplace(pool_peer, std::move(pool_replayer));
}
}
namespace rbd {
namespace mirror {
+// forward declaration; class-key matches the definition (class ServiceDaemon)
+template <typename> class ServiceDaemon;
template <typename> struct Threads;
class MirrorAdminSocketHook;
Mutex m_lock;
Cond m_cond;
RadosRef m_local;
+ std::unique_ptr<ServiceDaemon<librbd::ImageCtx>> m_service_daemon;
// monitor local cluster for config changes in peers
std::unique_ptr<ClusterWatcher> m_local_cluster_watcher;
- std::shared_ptr<ImageDeleter> m_image_deleter;
+ std::unique_ptr<ImageDeleter<>> m_image_deleter;
std::map<PoolPeer, std::unique_ptr<PoolReplayer> > m_pool_replayers;
std::atomic<bool> m_stopping = { false };
bool m_manual_stop = false;
#include "InstanceReplayer.h"
#include "InstanceWatcher.h"
#include "LeaderWatcher.h"
+#include "ServiceDaemon.h"
#include "Threads.h"
#define dout_context g_ceph_context
namespace {
+const std::string SERVICE_DAEMON_LEADER_KEY("leader");
+const std::string SERVICE_DAEMON_LOCAL_COUNT_KEY("image_local_count");
+const std::string SERVICE_DAEMON_REMOTE_COUNT_KEY("image_remote_count");
+
class PoolReplayerAdminSocketCommand {
public:
PoolReplayerAdminSocketCommand(PoolReplayer *pool_replayer)
} // anonymous namespace
PoolReplayer::PoolReplayer(Threads<librbd::ImageCtx> *threads,
- std::shared_ptr<ImageDeleter> image_deleter,
+ ServiceDaemon<librbd::ImageCtx>* service_daemon,
+ ImageDeleter<>* image_deleter,
int64_t local_pool_id, const peer_t &peer,
const std::vector<const char*> &args) :
m_threads(threads),
+ m_service_daemon(service_daemon),
m_image_deleter(image_deleter),
- m_lock(stringify("rbd::mirror::PoolReplayer ") + stringify(peer)),
+ m_local_pool_id(local_pool_id),
m_peer(peer),
m_args(args),
- m_local_pool_id(local_pool_id),
+ m_lock(stringify("rbd::mirror::PoolReplayer ") + stringify(peer)),
m_local_pool_watcher_listener(this, true),
m_remote_pool_watcher_listener(this, false),
- m_asok_hook(nullptr),
m_pool_replayer_thread(this),
m_leader_listener(this)
{
PoolReplayer::~PoolReplayer()
{
delete m_asok_hook;
-
- m_stopping = true;
- {
- Mutex::Locker l(m_lock);
- m_cond.Signal();
- }
- if (m_pool_replayer_thread.is_started()) {
- m_pool_replayer_thread.join();
- }
- if (m_leader_watcher) {
- m_leader_watcher->shut_down();
- }
- if (m_instance_watcher) {
- m_instance_watcher->shut_down();
- }
- if (m_instance_replayer) {
- m_instance_replayer->shut_down();
- }
-
- assert(!m_local_pool_watcher);
- assert(!m_remote_pool_watcher);
+ shut_down();
}
bool PoolReplayer::is_blacklisted() const {
return m_leader_watcher && m_leader_watcher->is_leader();
}
-int PoolReplayer::init()
+// A pool replayer counts as running while its replayer thread is started.
+bool PoolReplayer::is_running() const {
+  const bool thread_started = m_pool_replayer_thread.is_started();
+  return thread_started;
+}
+
+void PoolReplayer::init()
{
- dout(20) << "replaying for " << m_peer << dendl;
+ assert(!m_pool_replayer_thread.is_started());
+
+ // reset state
+ m_stopping = false;
+ m_blacklisted = false;
+ dout(20) << "replaying for " << m_peer << dendl;
int r = init_rados(g_ceph_context->_conf->cluster,
g_ceph_context->_conf->name.to_str(),
"local cluster", &m_local_rados);
if (r < 0) {
- return r;
+ m_callout_id = m_service_daemon->add_or_update_callout(
+ m_local_pool_id, m_callout_id, service_daemon::CALLOUT_LEVEL_ERROR,
+ "unable to connect to local cluster");
+ return;
}
r = init_rados(m_peer.cluster_name, m_peer.client_name,
std::string("remote peer ") + stringify(m_peer),
&m_remote_rados);
if (r < 0) {
- return r;
+ m_callout_id = m_service_daemon->add_or_update_callout(
+ m_local_pool_id, m_callout_id, service_daemon::CALLOUT_LEVEL_ERROR,
+ "unable to connect to remote cluster");
+ return;
}
r = m_local_rados->ioctx_create2(m_local_pool_id, m_local_io_ctx);
if (r < 0) {
derr << "error accessing local pool " << m_local_pool_id << ": "
<< cpp_strerror(r) << dendl;
- return r;
+ return;
}
std::string local_mirror_uuid;
if (r < 0) {
derr << "failed to retrieve local mirror uuid from pool "
<< m_local_io_ctx.get_pool_name() << ": " << cpp_strerror(r) << dendl;
- return r;
+ m_callout_id = m_service_daemon->add_or_update_callout(
+ m_local_pool_id, m_callout_id, service_daemon::CALLOUT_LEVEL_ERROR,
+ "unable to query local mirror uuid");
+ return;
}
r = m_remote_rados->ioctx_create(m_local_io_ctx.get_pool_name().c_str(),
if (r < 0) {
derr << "error accessing remote pool " << m_local_io_ctx.get_pool_name()
<< ": " << cpp_strerror(r) << dendl;
- return r;
+ m_callout_id = m_service_daemon->add_or_update_callout(
+ m_local_pool_id, m_callout_id, service_daemon::CALLOUT_LEVEL_WARNING,
+ "unable to access remote pool");
+ return;
}
dout(20) << "connected to " << m_peer << dendl;
- m_instance_replayer.reset(
- InstanceReplayer<>::create(m_threads, m_image_deleter, m_local_rados,
- local_mirror_uuid, m_local_pool_id));
+ m_instance_replayer.reset(InstanceReplayer<>::create(
+ m_threads, m_service_daemon, m_image_deleter, m_local_rados,
+ local_mirror_uuid, m_local_pool_id));
m_instance_replayer->init();
m_instance_replayer->add_peer(m_peer.uuid, m_remote_io_ctx);
- m_instance_watcher.reset(InstanceWatcher<>::create(m_local_io_ctx,
- m_threads->work_queue,
- m_instance_replayer.get()));
+ m_instance_watcher.reset(InstanceWatcher<>::create(
+ m_local_io_ctx, m_threads->work_queue, m_instance_replayer.get()));
r = m_instance_watcher->init();
if (r < 0) {
derr << "error initializing instance watcher: " << cpp_strerror(r) << dendl;
- return r;
+ m_callout_id = m_service_daemon->add_or_update_callout(
+ m_local_pool_id, m_callout_id, service_daemon::CALLOUT_LEVEL_ERROR,
+ "unable to initialize instance messenger object");
+ return;
}
m_leader_watcher.reset(new LeaderWatcher<>(m_threads, m_local_io_ctx,
&m_leader_listener));
-
r = m_leader_watcher->init();
if (r < 0) {
derr << "error initializing leader watcher: " << cpp_strerror(r) << dendl;
- return r;
+ m_callout_id = m_service_daemon->add_or_update_callout(
+ m_local_pool_id, m_callout_id, service_daemon::CALLOUT_LEVEL_ERROR,
+ "unable to initialize leader messenger object");
+ return;
+ }
+
+ if (m_callout_id != service_daemon::CALLOUT_ID_NONE) {
+ m_service_daemon->remove_callout(m_local_pool_id, m_callout_id);
+ m_callout_id = service_daemon::CALLOUT_ID_NONE;
}
m_pool_replayer_thread.create("pool replayer");
+}
- return 0;
+// Stops the replayer thread and tears down the watchers / instance replayer
+// created by init(), then drops the cluster connections.
+//
+// This may run more than once on the same object: the destructor always
+// calls it, even when the owner already invoked it explicitly. Each component
+// is therefore released right after it has been shut down so that a repeated
+// call does not shut the same component down a second time.
+void PoolReplayer::shut_down() {
+  m_stopping = true;
+  {
+    // wake the replayer thread so that it observes m_stopping
+    Mutex::Locker l(m_lock);
+    m_cond.Signal();
+  }
+  if (m_pool_replayer_thread.is_started()) {
+    m_pool_replayer_thread.join();
+  }
+  if (m_leader_watcher) {
+    m_leader_watcher->shut_down();
+    m_leader_watcher.reset();
+  }
+  if (m_instance_watcher) {
+    // released before the instance replayer it was created with
+    m_instance_watcher->shut_down();
+    m_instance_watcher.reset();
+  }
+  if (m_instance_replayer) {
+    m_instance_replayer->shut_down();
+    m_instance_replayer.reset();
+  }
+
+  assert(!m_local_pool_watcher);
+  assert(!m_remote_pool_watcher);
+  m_local_rados.reset();
+  m_remote_rados.reset();
+}
int PoolReplayer::init_rados(const std::string &cluster_name,
return;
}
+ m_service_daemon->add_or_update_attribute(
+ m_local_pool_id, SERVICE_DAEMON_LOCAL_COUNT_KEY,
+ m_local_pool_watcher->get_image_count());
+ if (m_remote_pool_watcher) {
+ m_service_daemon->add_or_update_attribute(
+ m_local_pool_id, SERVICE_DAEMON_REMOTE_COUNT_KEY,
+ m_remote_pool_watcher->get_image_count());
+ }
+
+ std::string removed_remote_peer_id;
+ ImageIds removed_remote_image_ids;
if (m_initial_mirror_image_ids.find(mirror_uuid) ==
m_initial_mirror_image_ids.end() &&
m_initial_mirror_image_ids.size() < 2) {
// removal notifications for the remote pool
auto &local_image_ids = m_initial_mirror_image_ids.begin()->second;
auto &remote_image_ids = m_initial_mirror_image_ids.rbegin()->second;
+ removed_remote_peer_id = m_initial_mirror_image_ids.rbegin()->first;
for (auto &local_image_id : local_image_ids) {
if (remote_image_ids.find(local_image_id) == remote_image_ids.end()) {
- removed_image_ids.emplace(local_image_id.global_id, "");
+ removed_remote_image_ids.emplace(local_image_id.global_id, "");
}
}
local_image_ids.clear();
gather_ctx->new_sub());
}
+ // derived removal events for remote after initial image listing
+ for (auto& image_id : removed_remote_image_ids) {
+ // for now always send to myself (the leader)
+ std::string &instance_id = m_instance_watcher->get_instance_id();
+ m_instance_watcher->notify_image_release(instance_id, image_id.global_id,
+ removed_remote_peer_id,
+ image_id.id, true,
+ gather_ctx->new_sub());
+ }
+
gather_ctx->activate();
}
void PoolReplayer::handle_post_acquire_leader(Context *on_finish) {
dout(20) << dendl;
+ m_service_daemon->add_or_update_attribute(m_local_pool_id,
+ SERVICE_DAEMON_LEADER_KEY, true);
m_instance_watcher->handle_acquire_leader();
init_local_pool_watcher(on_finish);
}
void PoolReplayer::handle_pre_release_leader(Context *on_finish) {
dout(20) << dendl;
+ m_service_daemon->remove_attribute(m_local_pool_id, SERVICE_DAEMON_LEADER_KEY);
m_instance_watcher->handle_release_leader();
shut_down_pool_watchers(on_finish);
}
#include "PoolWatcher.h"
#include "ImageDeleter.h"
#include "types.h"
+#include "tools/rbd_mirror/service_daemon/Types.h"
#include <set>
#include <map>
namespace rbd {
namespace mirror {
-template <typename> struct Threads;
template <typename> class InstanceReplayer;
template <typename> class InstanceWatcher;
+template <typename> class ServiceDaemon;
+template <typename> struct Threads;
/**
* Controls mirroring for a single remote cluster.
class PoolReplayer {
public:
PoolReplayer(Threads<librbd::ImageCtx> *threads,
- std::shared_ptr<ImageDeleter> image_deleter,
+ ServiceDaemon<librbd::ImageCtx>* service_daemon,
+ ImageDeleter<>* image_deleter,
int64_t local_pool_id, const peer_t &peer,
const std::vector<const char*> &args);
~PoolReplayer();
bool is_blacklisted() const;
bool is_leader() const;
+ bool is_running() const;
+
+ void init();
+ void shut_down();
- int init();
void run();
void print_status(Formatter *f, stringstream *ss);
void handle_update_leader(const std::string &leader_instance_id);
Threads<librbd::ImageCtx> *m_threads;
- std::shared_ptr<ImageDeleter> m_image_deleter;
+ ServiceDaemon<librbd::ImageCtx>* m_service_daemon;
+ ImageDeleter<>* m_image_deleter;
+ int64_t m_local_pool_id = -1;
+ peer_t m_peer;
+ std::vector<const char*> m_args;
+
mutable Mutex m_lock;
Cond m_cond;
std::atomic<bool> m_stopping = { false };
bool m_manual_stop = false;
bool m_blacklisted = false;
- peer_t m_peer;
- std::vector<const char*> m_args;
RadosRef m_local_rados;
RadosRef m_remote_rados;
librados::IoCtx m_local_io_ctx;
librados::IoCtx m_remote_io_ctx;
- int64_t m_local_pool_id = -1;
-
PoolWatcherListener m_local_pool_watcher_listener;
std::unique_ptr<PoolWatcher<> > m_local_pool_watcher;
std::unique_ptr<InstanceReplayer<librbd::ImageCtx>> m_instance_replayer;
std::string m_asok_hook_name;
- AdminSocketHook *m_asok_hook;
+ AdminSocketHook *m_asok_hook = nullptr;
std::map<std::string, ImageIds> m_initial_mirror_image_ids;
+ service_daemon::CalloutId m_callout_id = service_daemon::CALLOUT_ID_NONE;
+
class PoolReplayerThread : public Thread {
PoolReplayer *m_pool_replayer;
public:
void init(Context *on_finish = nullptr);
void shut_down(Context *on_finish);
+ // Number of entries currently held in m_image_ids (read under m_lock).
+ inline size_t get_image_count() const {
+   Mutex::Locker guard(m_lock);
+   const size_t tracked_images = m_image_ids.size();
+   return tracked_images;
+ }
+
private:
/**
* @verbatim
--- /dev/null
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#include "tools/rbd_mirror/ServiceDaemon.h"
+#include "include/Context.h"
+#include "include/stringify.h"
+#include "common/ceph_context.h"
+#include "common/config.h"
+#include "common/debug.h"
+#include "common/errno.h"
+#include "common/Formatter.h"
+#include "common/Timer.h"
+#include "tools/rbd_mirror/Threads.h"
+#include <sstream>
+
+#define dout_context g_ceph_context
+#define dout_subsys ceph_subsys_rbd_mirror
+#undef dout_prefix
+#define dout_prefix *_dout << "rbd::mirror::ServiceDaemon: " << this << " " \
+ << __func__ << ": "
+
+namespace rbd {
+namespace mirror {
+
+namespace {
+
+const std::string RBD_MIRROR_AUTH_ID_PREFIX("rbd-mirror.");
+
+// Visitor that writes a single attribute value to a formatter under the
+// given key, picking the dump call that matches the value's type.
+struct AttributeDumpVisitor : public boost::static_visitor<void> {
+ ceph::Formatter *f;
+ // held by reference: the caller's string must outlive the visitor
+ const std::string& name;
+
+ AttributeDumpVisitor(ceph::Formatter *f, const std::string& name)
+ : f(f), name(name) {
+ }
+
+ // one overload per value type handled by this visitor
+ void operator()(bool val) const {
+ f->dump_bool(name.c_str(), val);
+ }
+ void operator()(uint64_t val) const {
+ f->dump_unsigned(name.c_str(), val);
+ }
+ void operator()(const std::string& val) const {
+ f->dump_string(name.c_str(), val);
+ }
+};
+
+} // anonymous namespace
+
+using namespace service_daemon;
+
+// Stores the context, cluster handle and thread pool references; nothing is
+// registered with the cluster here (see init()).
+template <typename I>
+ServiceDaemon<I>::ServiceDaemon(CephContext *cct, RadosRef rados,
+ Threads<I>* threads)
+ : m_cct(cct), m_rados(rados), m_threads(threads),
+ m_lock("rbd::mirror::ServiceDaemon") {
+ dout(20) << dendl;
+}
+
+// Publishes a still-pending status update before the object goes away.
+template <typename I>
+ServiceDaemon<I>::~ServiceDaemon() {
+ dout(20) << dendl;
+ // update_status() asserts that the timer lock is held
+ Mutex::Locker timer_locker(m_threads->timer_lock);
+ if (m_timer_ctx != nullptr) {
+ // a deferred update is queued: cancel the timer event and publish now
+ m_threads->timer->cancel_event(m_timer_ctx);
+ update_status();
+ }
+}
+
+// Registers this process as an "rbd-mirror" service daemon.
+//
+// The daemon name is the configured entity id with a leading "rbd-mirror."
+// prefix stripped when present; the rados instance id is attached as
+// "instance_id" metadata.
+//
+// @return 0 on success, the negative result of the registration on failure
+template <typename I>
+int ServiceDaemon<I>::init() {
+  dout(20) << dendl;
+
+  std::string daemon_name = m_cct->_conf->name.get_id();
+  const auto prefix_len = RBD_MIRROR_AUTH_ID_PREFIX.size();
+  if (daemon_name.compare(0, prefix_len, RBD_MIRROR_AUTH_ID_PREFIX) == 0) {
+    daemon_name.erase(0, prefix_len);
+  }
+
+  std::map<std::string, std::string> service_metadata;
+  service_metadata.emplace("instance_id",
+                           stringify(m_rados->get_instance_id()));
+
+  const int r = m_rados->service_daemon_register("rbd-mirror", daemon_name,
+                                                 service_metadata);
+  // only negative results are propagated
+  return (r < 0) ? r : 0;
+}
+
+// Starts tracking a pool under the given id and schedules a status update.
+// An id that is already tracked keeps its existing entry.
+template <typename I>
+void ServiceDaemon<I>::add_pool(int64_t pool_id, const std::string& pool_name) {
+  dout(20) << "pool_id=" << pool_id << ", pool_name=" << pool_name << dendl;
+
+  {
+    Mutex::Locker guard(m_lock);
+    m_pools.emplace(pool_id, Pool(pool_name));
+  }
+
+  schedule_update_status();
+}
+
+// Stops tracking a pool (no-op for an unknown id) and schedules a status
+// update in either case.
+template <typename I>
+void ServiceDaemon<I>::remove_pool(int64_t pool_id) {
+  dout(20) << "pool_id=" << pool_id << dendl;
+
+  {
+    Mutex::Locker guard(m_lock);
+    auto pool_it = m_pools.find(pool_id);
+    if (pool_it != m_pools.end()) {
+      m_pools.erase(pool_it);
+    }
+  }
+
+  schedule_update_status();
+}
+
+// Attaches a callout (level + text) to a tracked pool, or overwrites an
+// existing one.
+//
+// @param pool_id        pool the callout belongs to
+// @param callout_id     id of the callout to overwrite, or CALLOUT_ID_NONE
+//                       to have a fresh id allocated
+// @param callout_level  level stored with the callout
+// @param text           text stored with the callout
+// @return the id the callout is stored under, or CALLOUT_ID_NONE when the
+//         pool is not tracked (nothing is stored or scheduled in that case)
+template <typename I>
+uint64_t ServiceDaemon<I>::add_or_update_callout(int64_t pool_id,
+                                                 uint64_t callout_id,
+                                                 CalloutLevel callout_level,
+                                                 const std::string& text) {
+  dout(20) << "pool_id=" << pool_id << ", "
+           << "callout_id=" << callout_id << ", "
+           << "callout_level=" << callout_level << ", "
+           << "text=" << text << dendl;
+
+  {
+    Mutex::Locker guard(m_lock);
+    const auto pool_it = m_pools.find(pool_id);
+    if (pool_it == m_pools.end()) {
+      return CALLOUT_ID_NONE;
+    }
+
+    if (callout_id == CALLOUT_ID_NONE) {
+      // allocate the next id from the daemon-wide counter
+      ++m_callout_id;
+      callout_id = m_callout_id;
+    }
+
+    Callouts& pool_callouts = pool_it->second.callouts;
+    pool_callouts[callout_id] = Callout(callout_level, text);
+  }
+
+  schedule_update_status();
+  return callout_id;
+}
+
+// Drops a callout from a tracked pool and schedules a status update; does
+// nothing at all when the pool is not tracked.
+template <typename I>
+void ServiceDaemon<I>::remove_callout(int64_t pool_id, uint64_t callout_id) {
+  dout(20) << "pool_id=" << pool_id << ", "
+           << "callout_id=" << callout_id << dendl;
+
+  bool pool_tracked = false;
+  {
+    Mutex::Locker guard(m_lock);
+    auto pool_it = m_pools.find(pool_id);
+    pool_tracked = (pool_it != m_pools.end());
+    if (pool_tracked) {
+      pool_it->second.callouts.erase(callout_id);
+    }
+  }
+
+  if (!pool_tracked) {
+    return;
+  }
+  schedule_update_status();
+}
+
+// Stores (or overwrites) a keyed attribute on a tracked pool and schedules a
+// status update; does nothing at all when the pool is not tracked.
+template <typename I>
+void ServiceDaemon<I>::add_or_update_attribute(int64_t pool_id,
+                                               const std::string& key,
+                                               const AttributeValue& value) {
+  dout(20) << "pool_id=" << pool_id << ", "
+           << "key=" << key << ", "
+           << "value=" << value << dendl;
+
+  bool pool_tracked = false;
+  {
+    Mutex::Locker guard(m_lock);
+    auto pool_it = m_pools.find(pool_id);
+    pool_tracked = (pool_it != m_pools.end());
+    if (pool_tracked) {
+      Attributes& pool_attributes = pool_it->second.attributes;
+      pool_attributes[key] = value;
+    }
+  }
+
+  if (!pool_tracked) {
+    return;
+  }
+  schedule_update_status();
+}
+
+// Drops a keyed attribute from a tracked pool and schedules a status update;
+// does nothing at all when the pool is not tracked.
+template <typename I>
+void ServiceDaemon<I>::remove_attribute(int64_t pool_id,
+                                        const std::string& key) {
+  dout(20) << "pool_id=" << pool_id << ", "
+           << "key=" << key << dendl;
+
+  bool pool_tracked = false;
+  {
+    Mutex::Locker guard(m_lock);
+    auto pool_it = m_pools.find(pool_id);
+    pool_tracked = (pool_it != m_pools.end());
+    if (pool_tracked) {
+      pool_it->second.attributes.erase(key);
+    }
+  }
+
+  if (!pool_tracked) {
+    return;
+  }
+  schedule_update_status();
+}
+
+// Arms a one-shot timer event that publishes the status. Calls made while an
+// event is already pending return immediately, so a burst of changes results
+// in a single update.
+template <typename I>
+void ServiceDaemon<I>::schedule_update_status() {
+ // takes the timer lock itself
+ Mutex::Locker timer_locker(m_threads->timer_lock);
+ if (m_timer_ctx != nullptr) {
+ return;
+ }
+
+ m_timer_ctx = new FunctionContext([this](int) {
+ // clear the marker first so later changes can arm a new event
+ m_timer_ctx = nullptr;
+ update_status();
+ });
+ // delay of 1 -- presumably seconds; confirm against the timer API
+ m_threads->timer->add_event_after(1, m_timer_ctx);
+}
+
+// Serializes the tracked pools to JSON and sends the result to the cluster
+// as the daemon status under the "json" key. Must be called with the timer
+// lock held. A failed update is logged and otherwise ignored.
+//
+// Layout: one object per pool (keyed by pool id) holding "name", a
+// "callouts" object (keyed by callout id, each with "level" and "text") and
+// the pool's attributes dumped as direct members of the pool object.
+template <typename I>
+void ServiceDaemon<I>::update_status() {
+ dout(20) << dendl;
+ assert(m_threads->timer_lock.is_locked());
+
+ ceph::JSONFormatter f;
+ {
+ // m_lock only guards the snapshot of m_pools; it is released before the
+ // status is sent
+ Mutex::Locker locker(m_lock);
+ f.open_object_section("pools");
+ for (auto& pool_pair : m_pools) {
+ f.open_object_section(stringify(pool_pair.first).c_str());
+ f.dump_string("name", pool_pair.second.name);
+ f.open_object_section("callouts");
+ for (auto& callout : pool_pair.second.callouts) {
+ f.open_object_section(stringify(callout.first).c_str());
+ f.dump_string("level", stringify(callout.second.level).c_str());
+ f.dump_string("text", callout.second.text.c_str());
+ f.close_section();
+ }
+ f.close_section(); // callouts
+
+ // attributes are emitted at pool level, typed via the visitor
+ for (auto& attribute : pool_pair.second.attributes) {
+ AttributeDumpVisitor attribute_dump_visitor(&f, attribute.first);
+ boost::apply_visitor(attribute_dump_visitor, attribute.second);
+ }
+ f.close_section(); // pool
+ }
+ f.close_section(); // pools
+ }
+
+ std::stringstream ss;
+ f.flush(ss);
+
+ int r = m_rados->service_daemon_update_status({{"json", ss.str()}});
+ if (r < 0) {
+ derr << "failed to update service daemon status: " << cpp_strerror(r)
+ << dendl;
+ }
+}
+
+} // namespace mirror
+} // namespace rbd
+
+template class rbd::mirror::ServiceDaemon<librbd::ImageCtx>;
--- /dev/null
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#ifndef CEPH_RBD_MIRROR_SERVICE_DAEMON_H
+#define CEPH_RBD_MIRROR_SERVICE_DAEMON_H
+
+#include "common/Mutex.h"
+#include "tools/rbd_mirror/types.h"
+#include "tools/rbd_mirror/service_daemon/Types.h"
+#include <map>
+#include <string>
+
+struct CephContext;
+struct Context;
+namespace librbd { struct ImageCtx; }
+
+namespace rbd {
+namespace mirror {
+
+template <typename> struct Threads;
+
+/**
+ * Tracks per-pool callouts and attributes and reports them as the status of
+ * this rbd-mirror service daemon.
+ */
+template <typename ImageCtxT = librbd::ImageCtx>
+class ServiceDaemon {
+public:
+ ServiceDaemon(CephContext *cct, RadosRef rados, Threads<ImageCtxT>* threads);
+ ~ServiceDaemon();
+
+ // registers the daemon; returns a negative value on failure
+ int init();
+
+ // start / stop tracking a pool
+ void add_pool(int64_t pool_id, const std::string& pool_name);
+ void remove_pool(int64_t pool_id);
+
+ // pass service_daemon::CALLOUT_ID_NONE as callout_id to allocate a new id;
+ // returns the id in use, or CALLOUT_ID_NONE if the pool is not tracked
+ uint64_t add_or_update_callout(int64_t pool_id, uint64_t callout_id,
+ service_daemon::CalloutLevel callout_level,
+ const std::string& text);
+ void remove_callout(int64_t pool_id, uint64_t callout_id);
+
+ // keyed per-pool attributes; ignored if the pool is not tracked
+ void add_or_update_attribute(int64_t pool_id, const std::string& key,
+ const service_daemon::AttributeValue& value);
+ void remove_attribute(int64_t pool_id, const std::string& key);
+
+private:
+ // a message with an associated level attached to a pool
+ struct Callout {
+ service_daemon::CalloutLevel level;
+ std::string text;
+
+ Callout() : level(service_daemon::CALLOUT_LEVEL_INFO) {
+ }
+ Callout(service_daemon::CalloutLevel level, const std::string& text)
+ : level(level), text(text) {
+ }
+ };
+ // callout id -> callout
+ typedef std::map<uint64_t, Callout> Callouts;
+ // attribute key -> value
+ typedef std::map<std::string, service_daemon::AttributeValue> Attributes;
+
+ // everything tracked for a single pool
+ struct Pool {
+ std::string name;
+ Callouts callouts;
+ Attributes attributes;
+
+ Pool(const std::string& name) : name(name) {
+ }
+ };
+
+ // pool id -> pool state
+ typedef std::map<int64_t, Pool> Pools;
+
+ CephContext *m_cct;
+ RadosRef m_rados;
+ Threads<ImageCtxT>* m_threads;
+
+ // m_lock guards m_pools and m_callout_id
+ Mutex m_lock;
+ Pools m_pools;
+ // last callout id handed out
+ uint64_t m_callout_id = service_daemon::CALLOUT_ID_NONE;
+
+ // pending status-update timer event, nullptr when none is scheduled
+ Context* m_timer_ctx = nullptr;
+
+ void schedule_update_status();
+ void update_status();
+};
+
+} // namespace mirror
+} // namespace rbd
+
+extern template class rbd::mirror::ServiceDaemon<librbd::ImageCtx>;
+
+#endif // CEPH_RBD_MIRROR_SERVICE_DAEMON_H
assert(m_remote_image_ctx == nullptr);
}
+// Reports whether m_image_sync is currently set, i.e. an image sync request
+// exists; the check is made under m_lock.
+template <typename I>
+bool BootstrapRequest<I>::is_syncing() const {
+  Mutex::Locker guard(m_lock);
+  const bool sync_in_flight = (m_image_sync != nullptr);
+  return sync_in_flight;
+}
+
template <typename I>
void BootstrapRequest<I>::send() {
*m_do_resync = false;
}
dout(20) << dendl;
- update_progress("IMAGE_SYNC");
-
- Context *ctx = create_context_callback<
- BootstrapRequest<I>, &BootstrapRequest<I>::handle_image_sync>(
- this);
-
{
Mutex::Locker locker(m_lock);
if (m_canceled) {
m_ret_val = -ECANCELED;
} else {
assert(m_image_sync == nullptr);
+
+ Context *ctx = create_context_callback<
+ BootstrapRequest<I>, &BootstrapRequest<I>::handle_image_sync>(this);
m_image_sync = ImageSync<I>::create(
*m_local_image_ctx, m_remote_image_ctx, m_timer, m_timer_lock,
m_local_mirror_uuid, m_journaler, m_client_meta, m_work_queue,
m_instance_watcher, ctx, m_progress_ctx);
m_image_sync->get();
+
+ m_lock.Unlock();
+ update_progress("IMAGE_SYNC");
+ m_lock.Lock();
+
m_image_sync->send();
return;
}
{
Mutex::Locker locker(m_lock);
-
m_image_sync->put();
m_image_sync = nullptr;
bool *do_resync, ProgressContext *progress_ctx = nullptr);
~BootstrapRequest() override;
+ bool is_syncing() const;
+
void send() override;
void cancel() override;
ProgressContext *m_progress_ctx;
bool *m_do_resync;
- Mutex m_lock;
+ mutable Mutex m_lock;
bool m_canceled = false;
Tags m_remote_tags;
--- /dev/null
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#ifndef CEPH_RBD_MIRROR_IMAGE_REPLAYER_TYPES_H
+#define CEPH_RBD_MIRROR_IMAGE_REPLAYER_TYPES_H
+
+namespace rbd {
+namespace mirror {
+namespace image_replayer {
+
+// Health states declared for the image_replayer namespace. No explicit
+// values are assigned, so the enumerators take 0, 1 and 2 in declaration
+// order.
+enum HealthState {
+ HEALTH_STATE_OK,
+ HEALTH_STATE_WARNING,
+ HEALTH_STATE_ERROR
+};
+
+} // namespace image_replayer
+} // namespace mirror
+} // namespace rbd
+
+#endif // CEPH_RBD_MIRROR_IMAGE_REPLAYER_TYPES_H
--- /dev/null
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#include "tools/rbd_mirror/service_daemon/Types.h"
+#include <iostream>
+
+namespace rbd {
+namespace mirror {
+namespace service_daemon {
+
+// Streams the lowercase label of a callout level ("info", "warning" or
+// "error"). A value that matches none of the enumerators writes nothing.
+std::ostream& operator<<(std::ostream& os, const CalloutLevel& callout_level) {
+  const char* label = nullptr;
+  switch (callout_level) {
+  case CALLOUT_LEVEL_INFO:
+    label = "info";
+    break;
+  case CALLOUT_LEVEL_WARNING:
+    label = "warning";
+    break;
+  case CALLOUT_LEVEL_ERROR:
+    label = "error";
+    break;
+  }
+
+  if (label != nullptr) {
+    os << label;
+  }
+  return os;
+}
+
+} // namespace service_daemon
+} // namespace mirror
+} // namespace rbd
+
--- /dev/null
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#ifndef CEPH_RBD_MIRROR_SERVICE_DAEMON_TYPES_H
+#define CEPH_RBD_MIRROR_SERVICE_DAEMON_TYPES_H
+
+#include "include/int_types.h"
+#include <iosfwd>
+#include <string>
+#include <boost/variant.hpp>
+
+namespace rbd {
+namespace mirror {
+namespace service_daemon {
+
+// Callout identifiers are 64-bit unsigned integers; CALLOUT_ID_NONE is the
+// zero id.
+using CalloutId = uint64_t;
+const CalloutId CALLOUT_ID_NONE{0};
+
+// Levels a callout can be raised at.
+enum CalloutLevel {
+  CALLOUT_LEVEL_INFO,
+  CALLOUT_LEVEL_WARNING,
+  CALLOUT_LEVEL_ERROR
+};
+
+// Stream insertion for CalloutLevel; only declared in this header.
+std::ostream& operator<<(std::ostream& os, const CalloutLevel& callout_level);
+
+// An attribute value holds exactly one of: bool, uint64_t or std::string.
+using AttributeValue = boost::variant<bool, uint64_t, std::string>;
+
+} // namespace service_daemon
+} // namespace mirror
+} // namespace rbd
+
+#endif // CEPH_RBD_MIRROR_SERVICE_DAEMON_TYPES_H
set(osd_traces oprequest.tp osd.tp pg.tp)
add_tracing_library(osd_tp "${osd_traces}" 1.0.0)
add_tracing_library(rados_tp librados.tp 2.0.0)
-add_tracing_library(rbd_tp librbd.tp 1.0.0)
add_tracing_library(os_tp objectstore.tp 1.0.0)
-install(TARGETS rados_tp osd_tp rbd_tp os_tp DESTINATION ${CMAKE_INSTALL_LIBDIR})
+install(TARGETS rados_tp osd_tp os_tp DESTINATION ${CMAKE_INSTALL_LIBDIR})
+# The librbd tracepoint provider is created and installed only when RBD
+# support is enabled.
+if(WITH_RBD)
+  add_tracing_library(rbd_tp librbd.tp 1.0.0)
+  install(TARGETS rbd_tp DESTINATION ${CMAKE_INSTALL_LIBDIR})
+endif()
if(WITH_OSD_INSTRUMENT_FUNCTIONS)
add_tracing_library(cyg_profile_tp cyg_profile.tp 1.0.0)
install(TARGETS cyg_profile_tp DESTINATION ${CMAKE_INSTALL_LIBDIR})
)
)
+/*
+ * librbd:compare_and_write_enter -- records the image context pointer (hex),
+ * image name, snapshot name, read_only flag, offset, the cmp_buf and buf
+ * byte sequences (both passed len as their length argument) and op_flags.
+ * NOTE(review): unlike aio_compare_and_write_enter, len is not recorded as
+ * its own integer field here -- confirm that this is intentional.
+ */
+TRACEPOINT_EVENT(librbd, compare_and_write_enter,
+ TP_ARGS(
+ void*, imagectx,
+ const char*, name,
+ const char*, snap_name,
+ char, read_only,
+ uint64_t, off,
+ size_t, len,
+ const char*, cmp_buf,
+ const char*, buf,
+ int, op_flags),
+ TP_FIELDS(
+ ctf_integer_hex(void*, imagectx, imagectx)
+ ctf_string(name, name)
+ ctf_string(snap_name, snap_name)
+ ctf_integer(char, read_only, read_only)
+ ctf_integer(uint64_t, off, off)
+ ceph_ctf_sequence(unsigned char, cmp_buf, cmp_buf, size_t, len)
+ ceph_ctf_sequence(unsigned char, buf, buf, size_t, len)
+ ctf_integer(int, op_flags, op_flags)
+ )
+)
+
+/*
+ * librbd:compare_and_write_exit -- records a single ssize_t field, retval.
+ */
+TRACEPOINT_EVENT(librbd, compare_and_write_exit,
+ TP_ARGS(
+ ssize_t, retval),
+ TP_FIELDS(
+ ctf_integer(ssize_t, retval, retval)
+ )
+)
+
TRACEPOINT_EVENT(librbd, open_image_by_id_enter,
TP_ARGS(
void*, imagectx,
)
)
+/*
+ * librbd:aio_compare_and_write_enter -- records the image context pointer
+ * (hex), image name, snapshot name, read_only flag, offset, len, the cmp_buf
+ * and buf byte sequences (both passed len as their length argument), the
+ * completion pointer (hex) and op_flags.
+ */
+TRACEPOINT_EVENT(librbd, aio_compare_and_write_enter,
+ TP_ARGS(
+ void*, imagectx,
+ const char*, name,
+ const char*, snap_name,
+ char, read_only,
+ uint64_t, off,
+ size_t, len,
+ const char*, cmp_buf,
+ const char*, buf,
+ const void*, completion,
+ int, op_flags),
+ TP_FIELDS(
+ ctf_integer_hex(void*, imagectx, imagectx)
+ ctf_string(name, name)
+ ctf_string(snap_name, snap_name)
+ ctf_integer(char, read_only, read_only)
+ ctf_integer(uint64_t, off, off)
+ ctf_integer(size_t, len, len)
+ ceph_ctf_sequence(unsigned char, cmp_buf, cmp_buf, size_t, len)
+ ceph_ctf_sequence(unsigned char, buf, buf, size_t, len)
+ ctf_integer_hex(const void*, completion, completion)
+ ctf_integer(int, op_flags, op_flags)
+ )
+)
+/*
+ * librbd:aio_compare_and_write_exit -- records a single int field, retval.
+ */
+TRACEPOINT_EVENT(librbd, aio_compare_and_write_exit,
+ TP_ARGS(
+ int, retval),
+ TP_FIELDS(
+ ctf_integer(int, retval, retval)
+ )
+)
+
+
TRACEPOINT_EVENT(librbd, clone_enter,
TP_ARGS(
const char*, parent_pool_name,
fi
if [ "$overwrite_conf" -eq 0 ]; then
+ CEPH_ASOK_DIR=`dirname $($CEPH_BIN/ceph-conf --show-config-value admin_socket)`
+ mkdir -p $CEPH_ASOK_DIR
MON=`$CEPH_BIN/ceph-conf -c $conf_fn --name $VSTART_SEC num_mon 2>/dev/null` && \
CEPH_NUM_MON="$MON"
OSD=`$CEPH_BIN/ceph-conf -c $conf_fn --name $VSTART_SEC num_osd 2>/dev/null` && \
else
if [ "$new" -ne 0 ]; then
# only delete if -n
+        # Look up the admin socket path that ceph-conf currently reports and
+        # reduce it to its directory. An empty answer is kept empty rather
+        # than passed to dirname, which would turn it into ".".
+        asok_dir=`$CEPH_BIN/ceph-conf --show-config-value admin_socket`
+        if [ -n "$asok_dir" ]; then
+            asok_dir=`dirname "$asok_dir"`
+        fi
+        # Remove that directory and the files in it, unless nothing was
+        # reported or it is /var/run/ceph. Every expansion is quoted so a
+        # path containing spaces or glob characters stays a single word.
+        if [ -n "$asok_dir" ] && [ "$asok_dir" != /var/run/ceph ]; then
+            [ -d "$asok_dir" ] && rm -f "$asok_dir"/* && rmdir "$asok_dir"
+        fi
+        # Fall back to a fresh temporary directory name when the caller did
+        # not provide CEPH_ASOK_DIR; mktemp -u only prints the name.
+        if [ -z "$CEPH_ASOK_DIR" ]; then
+            CEPH_ASOK_DIR=`mktemp -u -d "${TMPDIR:-/tmp}/ceph-asok.XXXXXX"`
+        fi
[ -e "$conf_fn" ] && rm -- "$conf_fn"
else
+ CEPH_ASOK_DIR=`dirname $($CEPH_BIN/ceph-conf --show-config-value admin_socket)`
# -k is implied... (doesn't make sense otherwise)
overwrite_conf=0
fi
prepare_conf() {
local DAEMONOPTS="
log file = $CEPH_OUT_DIR/\$name.log
- admin socket = $CEPH_OUT_DIR/\$name.asok
+ admin socket = $CEPH_ASOK_DIR/\$name.asok
chdir = \"\"
pid file = $CEPH_OUT_DIR/\$name.pid
heartbeat file = $CEPH_OUT_DIR/\$name.heartbeat
osd crush chooseleaf type = 0
osd pool default min size = 1
osd failsafe full ratio = .99
+ mon osd nearfull ratio = .99
+ mon osd backfillfull ratio = .99
mon osd reporter subtree level = osd
mon osd full ratio = .99
- mon data avail warn = 10
+ mon data avail warn = 2
mon data avail crit = 1
erasure code dir = $EC_PATH
plugin dir = $CEPH_LIB
[client]
keyring = $keyring_fn
log file = $CEPH_OUT_DIR/\$name.\$pid.log
- admin socket = $CEPH_OUT_DIR/\$name.\$pid.asok
+ admin socket = $CEPH_ASOK_DIR/\$name.\$pid.asok
[client.rgw]
host = $HOSTNAME
EOF
- ceph_adm config-key put mgr/dashboard/$name/server_port $MGR_PORT
+ ceph_adm config-key set mgr/dashboard/$name/server_port $MGR_PORT
DASH_URLS+="http://$IP:$MGR_PORT/"
MGR_PORT=$(($MGR_PORT + 1000))
- ceph_adm config-key put mgr/restful/$name/server_port $MGR_PORT
+ ceph_adm config-key set mgr/restful/$name/server_port $MGR_PORT
RESTFUL_URLS+="https://$IP:$MGR_PORT"
MGR_PORT=$(($MGR_PORT + 1000))
run 'mgr' $CEPH_BIN/ceph-mgr -i $name $ARGS
done
+ # use "tell mgr" here because the first mgr might not have activated yet
+ # and so might not have registered the python module commands.
if ceph_adm tell mgr restful create-self-signed-cert; then
SF=`mktemp`
- ceph_adm tell mgr restful create-key admin -o $SF
+ ceph_adm restful create-key admin -o $SF
RESTFUL_SECRET=`cat $SF`
rm $SF
else
prun $SUDO rm -f core*
+test -d $CEPH_ASOK_DIR || mkdir $CEPH_ASOK_DIR
test -d $CEPH_OUT_DIR || mkdir $CEPH_OUT_DIR
test -d $CEPH_DEV_DIR || mkdir $CEPH_DEV_DIR
$SUDO rm -rf $CEPH_OUT_DIR/*
start_mon
fi
+# mgr (started only when at least one mgr is requested, ahead of the osd
+# section below)
+if [ $CEPH_NUM_MGR -gt 0 ]; then
+ start_mgr
+fi
+
# osd
if [ $CEPH_NUM_OSD -gt 0 ]; then
start_osd
# mgr
-if [ $CEPH_NUM_MGR -gt 0 ]; then
- start_mgr
-fi
-
if [ "$ec" -eq 1 ]; then
ceph_adm <<EOF
osd erasure-code-profile set ec-profile m=2 k=2