RELEASE=5.2
PACKAGE=ceph
-VER=12.2.7
+VER=12.2.8
DEBREL=pve1
SRCDIR=ceph
cmake_minimum_required(VERSION 2.8.11)
project(ceph)
-set(VERSION 12.2.7)
+set(VERSION 12.2.8)
if(POLICY CMP0046)
# Tweak policies (this one disables "missing" dependency warning)
message(FATAL_ERROR "Cannot have WITH_KRBD with WITH_RBD.")
endif()
-# embedded ceph daemon static library
-# NOTE: Ceph is mostly LGPL (see COPYING), which means that
-# static linking brings with it restrictions. Please be sure
-# to look at the LGPL license carefully before linking this library to
-# your code. See http://www.gnu.org/licenses/gpl-faq.html#LGPLStaticVsDynamic.
-option(WITH_EMBEDDED "build the embedded ceph daemon library" ON)
-
option(WITH_LEVELDB "LevelDB is here" ON)
if(WITH_LEVELDB)
if(LEVELDB_PREFIX)
### Build
```
-./run-make-check.sh -DWITH_EMBEDDED=OFF -DWITH_SYSTEM_BOOST=ON -DWITH_LTTNG=OFF -DWITH_REENTRANT_STRSIGNAL=ON -DWITH_THREAD_SAFE_RES_QUERY=ON
+./run-make-check.sh -DWITH_SYSTEM_BOOST=ON -DWITH_LTTNG=OFF -DWITH_REENTRANT_STRSIGNAL=ON -DWITH_THREAD_SAFE_RES_QUERY=ON
```
### Packaging
or
-./test/docker-test.sh --os-type alpine --os-version edge -- ./run-make-check.sh -DWITH_EMBEDDED=OFF -DWITH_SYSTEM_BOOST=ON -DWITH_LTTNG=OFF -DWITH_REENTRANT_STRSIGNAL=ON -DWITH_THREAD_SAFE_RES_QUERY=ON
+./test/docker-test.sh --os-type alpine --os-version edge -- ./run-make-check.sh -DWITH_SYSTEM_BOOST=ON -DWITH_LTTNG=OFF -DWITH_REENTRANT_STRSIGNAL=ON -DWITH_THREAD_SAFE_RES_QUERY=ON
```
# Contributor: John Coyle <dx9err@gmail.com>
# Maintainer: John Coyle <dx9err@gmail.com>
pkgname=ceph
-pkgver=12.2.7
+pkgver=12.2.8
pkgrel=0
pkgdesc="Ceph is a distributed object store and file system"
pkgusers="ceph"
xmlstarlet
yasm
"
-source="ceph-12.2.7.tar.bz2"
+source="ceph-12.2.8.tar.bz2"
subpackages="
$pkgname-base
$pkgname-common
_udevrulesdir=/etc/udev/rules.d
_python_sitelib=/usr/lib/python2.7/site-packages
-builddir=$srcdir/ceph-12.2.7
+builddir=$srcdir/ceph-12.2.8
build() {
export CEPH_BUILD_VIRTUALENV=$builddir
-DWITH_PYTHON3=OFF \
-DWITH_LTTNG=OFF \
-DWITH_SYSTEM_BOOST=ON \
- -DWITH_EMBEDDED=OFF \
-DWITH_TESTS=${_with_tests:-OFF} \
|| return 1
make -j${JOBS:-2} || return 1
-DWITH_PYTHON3=OFF \
-DWITH_LTTNG=OFF \
-DWITH_SYSTEM_BOOST=ON \
- -DWITH_EMBEDDED=OFF \
-DWITH_TESTS=${_with_tests:-OFF} \
|| return 1
make -j${JOBS:-2} || return 1
# main package definition
#################################################################################
Name: ceph
-Version: 12.2.7
+Version: 12.2.8
Release: 0%{?dist}
%if 0%{?fedora} || 0%{?rhel}
Epoch: 2
Group: System/Filesystems
%endif
URL: http://ceph.com/
-Source0: http://ceph.com/download/ceph-12.2.7.tar.bz2
+Source0: http://ceph.com/download/ceph-12.2.8.tar.bz2
%if 0%{?suse_version}
%if 0%{?is_opensuse}
ExclusiveArch: x86_64 aarch64 ppc64 ppc64le
BuildRequires: python-devel
BuildRequires: python-nose
BuildRequires: python-requests
+BuildRequires: python-six
BuildRequires: python-virtualenv
BuildRequires: snappy-devel
BuildRequires: udev
Group: System/Filesystems
%endif
Requires: ceph-base = %{_epoch_prefix}%{version}-%{release}
+Requires: python-six
%if 0%{?fedora} || 0%{?rhel}
Requires: python-cherrypy
Requires: python-jinja2
# common
#################################################################################
%prep
-%autosetup -p1 -n ceph-12.2.7
+%autosetup -p1 -n ceph-12.2.8
%build
%if 0%{with cephfs_java}
-DCMAKE_INSTALL_MANDIR=%{_mandir} \
-DCMAKE_INSTALL_DOCDIR=%{_docdir}/ceph \
-DCMAKE_INSTALL_INCLUDEDIR=%{_includedir} \
- -DWITH_EMBEDDED=OFF \
-DWITH_MANPAGE=ON \
-DWITH_PYTHON3=ON \
-DWITH_SYSTEMD=ON \
/usr/lib/systemd/systemd-sysctl %{_sysctldir}/90-ceph-osd.conf > /dev/null 2>&1 || :
%endif
# work around https://tracker.ceph.com/issues/24903
-chown -h ceph:ceph /var/lib/ceph/osd/*/block* 2>&1 > /dev/null || :
+chown -f -h ceph:ceph /var/lib/ceph/osd/*/block* 2>&1 > /dev/null || :
%preun osd
%if 0%{?suse_version}
BuildRequires: python-devel
BuildRequires: python-nose
BuildRequires: python-requests
+BuildRequires: python-six
BuildRequires: python-virtualenv
BuildRequires: snappy-devel
BuildRequires: udev
Group: System/Filesystems
%endif
Requires: ceph-base = %{_epoch_prefix}%{version}-%{release}
+Requires: python-six
%if 0%{?fedora} || 0%{?rhel}
Requires: python-cherrypy
Requires: python-jinja2
-DCMAKE_INSTALL_MANDIR=%{_mandir} \
-DCMAKE_INSTALL_DOCDIR=%{_docdir}/ceph \
-DCMAKE_INSTALL_INCLUDEDIR=%{_includedir} \
- -DWITH_EMBEDDED=OFF \
-DWITH_MANPAGE=ON \
-DWITH_PYTHON3=ON \
-DWITH_SYSTEMD=ON \
/usr/lib/systemd/systemd-sysctl %{_sysctldir}/90-ceph-osd.conf > /dev/null 2>&1 || :
%endif
# work around https://tracker.ceph.com/issues/24903
-chown -h ceph:ceph /var/lib/ceph/osd/*/block* 2>&1 > /dev/null || :
+chown -f -h ceph:ceph /var/lib/ceph/osd/*/block* 2>&1 > /dev/null || :
%preun osd
%if 0%{?suse_version}
+++ /dev/null
-# This function is a helper that will merge static libraries.
-# For example,
-#
-# merge_static_libraries(mylib staticlibX staticlibY)
-#
-# mylib.a will generate a new static library mylib that is
-# a combination of staticlibX and staticlibY
-#
-function(merge_static_libraries target)
-
- set(dummy_source ${CMAKE_CURRENT_BINARY_DIR}/${target}_dummy.c)
- add_library(${target} STATIC ${dummy_source})
-
- # remove duplicates
- set(libs ${ARGN})
- list(REMOVE_DUPLICATES libs)
-
- # validate that all libs are static
- foreach(lib ${libs})
- if (NOT TARGET ${lib})
- message(FATAL_ERROR "${lib} not a valid target")
- endif()
-
- get_target_property(libtype ${lib} TYPE)
- if(NOT libtype STREQUAL "STATIC_LIBRARY")
- message(FATAL_ERROR "${lib} not a static library")
- endif()
-
- # add a dependency on the lib
- add_dependencies(${target} ${lib})
- endforeach()
-
- # Force the merged Make the generated dummy source file depended on all static input
- # libs. If input lib changes,the source file is touched
- # which causes the desired effect (relink).
- add_custom_command(
- OUTPUT ${dummy_source}
- COMMAND ${CMAKE_COMMAND} -E touch ${dummy_source}
- DEPENDS ${libs})
-
- # only LINUX is currently supported. OSX's libtool and windows lib.exe
- # have native support for merging static libraries, and support for them
- # can be easily added if required.
- if(LINUX)
- # generate a script to merge the static libraries in to the target
- # library. see https://sourceware.org/binutils/docs/binutils/ar-scripts.html
- set(mri_script "open $<TARGET_FILE:${target}>=")
- foreach(lib ${libs})
- # we use the generator expression TARGET_FILE to get the location
- # of the library. this will not be expanded until the script file
- # is written below
- set(mri_script "${mri_script} addlib $<TARGET_FILE:${lib}>=")
- endforeach()
- set(mri_script "${mri_script} save=end")
-
- add_custom_command(
- TARGET ${target} POST_BUILD
- COMMAND echo ${mri_script} | tr = \\\\n | ${CMAKE_AR} -M)
- endif(LINUX)
-
- message("-- MergeStaticLibraries: ${target}: merged ${libs}")
-
- # we want to set the target_link_libraries correctly for the new merged
- # static library. First we get the list of link libraries for each
- # of the libs we are merging
- set(link_libs)
- foreach(lib ${libs})
- get_property(trans TARGET ${lib} PROPERTY LINK_LIBRARIES)
- list(APPEND link_libs ${trans})
- endforeach()
-
- if (link_libs)
- # now remove the duplicates and any of the libraries we already merged
- list(REMOVE_DUPLICATES link_libs)
- foreach(lib ${libs})
- list(REMOVE_ITEM link_libs ${lib})
- endforeach()
-
- # set the target link libraries
- target_link_libraries(${target} ${link_libs})
-
- message("-- MergeStaticLibraries: ${target}: remaining ${link_libs}")
- endif()
-
-endfunction()
+ceph (12.2.8-1) stable; urgency=medium
+
+ * New upstream release
+
+ -- Ceph Release Team <ceph-maintainers@ceph.com> Thu, 30 Aug 2018 17:24:37 +0000
+
ceph (12.2.7-1) stable; urgency=medium
* New upstream release
python-pecan,
python-prettytable,
python-setuptools,
+ python-six,
python-sphinx,
python-werkzeug,
python3-all-dev,
export DEB_HOST_ARCH ?= $(shell dpkg-architecture -qDEB_HOST_ARCH)
-extraopts += -DUSE_CRYPTOPP=OFF -DWITH_OCF=ON -DWITH_LTTNG=ON -DWITH_PYTHON3=ON -DWITH_EMBEDDED=OFF
+extraopts += -DUSE_CRYPTOPP=OFF -DWITH_OCF=ON -DWITH_LTTNG=ON -DWITH_PYTHON3=ON
extraopts += -DWITH_CEPHFS_JAVA=ON
# assumes that ceph is exmpt from multiarch support, so we override the libdir.
extraopts += -DCMAKE_INSTALL_LIBDIR=/usr/lib
-D CEPH_MAN_DIR=man \
-D WITH_LIBCEPHFS=OFF \
-D WITH_CEPHFS=OFF \
- -D WITH_EMBEDDED=OFF \
-D WITH_MGR=YES \
2>&1 | tee cmake.log
Migrating
---------
-Starting on Ceph version 12.2.2, ``ceph-disk`` is deprecated. Deprecation
+Starting on Ceph version 13.0.0, ``ceph-disk`` is deprecated. Deprecation
warnings will show up that will link to this page. It is strongly suggested
that users start consuming ``ceph-volume``. There are two paths for migrating:
systemd
lvm/index
lvm/activate
+ lvm/batch
lvm/encryption
lvm/prepare
lvm/scan
--- /dev/null
+.. _ceph-volume-lvm-batch:
+
+``batch``
+===========
+This subcommand allows for multiple OSDs to be created at the same time given
+an input of devices. Depending on the device type (spinning drive, or solid
+state), the internal engine will decide the best approach to create the OSDs.
+
+This decision abstracts away the many nuances when creating an OSD: how large
+should a ``block.db`` be? How can one mix a solid state device with spinning
+devices in an efficient way?
+
+The process is similar to :ref:`ceph-volume-lvm-create`, and will do the
+preparation and activation at once, following the same workflow for each OSD.
+
+All the features that ``ceph-volume lvm create`` supports, like ``dmcrypt``,
+avoiding ``systemd`` units from starting, defining bluestore or filestore,
+are supported. Any fine-grained option that may affect a single OSD is not
+supported, for example: specifying where journals should be placed.
+
+
+.. _ceph-volume-lvm-batch_bluestore:
+
+``bluestore``
+-------------
+The :term:`bluestore` objectstore (the default) is used when creating multiple OSDs
+with the ``batch`` sub-command. It allows a few different scenarios depending
+on the input of devices:
+
+#. Devices are all spinning HDDs: 1 OSD is created per device
+#. Devices are all solid state SSDs: 2 OSDs are created per device
+#. Devices are a mix of HDDs and SSDs: data is placed on the spinning device,
+ the ``block.db`` is created on the SSD, as large as possible.
+
+
+.. note:: Although operations in ``ceph-volume lvm create`` allow usage of
+ ``block.wal`` it isn't supported with the ``batch`` sub-command
+
+
+.. _ceph-volume-lvm-batch_filestore:
+
+``filestore``
+-------------
+The :term:`filestore` objectstore can be used when creating multiple OSDs
+with the ``batch`` sub-command. It allows two different scenarios depending
+on the input of devices:
+
+#. Devices are all the same type (for example all spinning HDD or all SSDs):
+ 1 OSD is created per device, collocating the journal in the same HDD.
+#. Devices are a mix of HDDs and SSDs: data is placed on the spinning device,
+ while the journal is created on the SSD using the sizing options from
+ ceph.conf and falling back to the default journal size of 5GB.
+
+
+When a mix of solid and spinning devices are used, ``ceph-volume`` will try to
+detect existing volume groups on the solid devices. If a VG is found, it will
+try to create the logical volume from there, otherwise raising an error if
+space is insufficient.
+
+If a raw solid device is used along with a device that has a volume group in
+addition to some spinning devices, ``ceph-volume`` will try to extend the
+existing volume group and then create a logical volume.
+
+.. _ceph-volume-lvm-batch_report:
+
+Reporting
+=========
+When a call is received to create OSDs, the tool will prompt the user to
+continue if the pre-computed output is acceptable. This output is useful to
+understand the outcome of the received devices. Once confirmation is accepted,
+the process continues.
+
+Although prompts are good to understand outcomes, it is incredibly useful to
+try different inputs to find the best product possible. With the ``--report``
+flag, one can prevent any actual operations and just verify outcomes from
+inputs.
+
+**pretty reporting**
+For two spinning devices, this is how the ``pretty`` report (the default) would
+look::
+
+ $ ceph-volume lvm batch --report /dev/sdb /dev/sdc
+
+ Total OSDs: 2
+
+ Type Path LV Size % of device
+ --------------------------------------------------------------------------------
+ [data] /dev/sdb 10.74 GB 100%
+ --------------------------------------------------------------------------------
+ [data] /dev/sdc 10.74 GB 100%
+
+
+
+**JSON reporting**
+Reporting can produce a richer output with ``JSON``, which gives a few more
+hints on sizing. This feature might be better for other tooling to consume
+information that will need to be transformed.
+
+For two spinning devices, this is how the ``JSON`` report would look::
+
+ $ ceph-volume lvm batch --report --format=json /dev/sdb /dev/sdc
+ {
+ "osds": [
+ {
+ "block.db": {},
+ "data": {
+ "human_readable_size": "10.74 GB",
+ "parts": 1,
+ "path": "/dev/sdb",
+ "percentage": 100,
+ "size": 11534336000.0
+ }
+ },
+ {
+ "block.db": {},
+ "data": {
+ "human_readable_size": "10.74 GB",
+ "parts": 1,
+ "path": "/dev/sdc",
+ "percentage": 100,
+ "size": 11534336000.0
+ }
+ }
+ ],
+ "vgs": [
+ {
+ "devices": [
+ "/dev/sdb"
+ ],
+ "parts": 1
+ },
+ {
+ "devices": [
+ "/dev/sdc"
+ ],
+ "parts": 1
+ }
+ ]
+ }
Commands
========
-.. option:: help
+:command:`help`
show help
-.. option:: fsck
+:command:`fsck` [ --deep ]
run consistency check on BlueStore metadata. If *--deep* is specified, also read all object data and verify checksums.
-.. option:: repair
+:command:`repair`
Run a consistency check *and* repair any errors we can.
-.. option:: bluefs-export
+:command:`bluefs-export`
Export the contents of BlueFS (i.e., rocksdb files) to an output directory.
-.. option:: bluefs-bdev-sizes --path *osd path*
+:command:`bluefs-bdev-sizes` --path *osd path*
Print the device sizes, as understood by BlueFS, to stdout.
-.. option:: bluefs-bdev-expand --path *osd path*
+:command:`bluefs-bdev-expand` --path *osd path*
Instruct BlueFS to check the size of its block devices and, if they have expanded, make use of the additional space.
-.. option:: show-label --dev *device* [...]
+:command:`show-label` --dev *device* [...]
Show device label(s).
| [--log-path LOG_PATH]
| **ceph-volume** **lvm** [ *trigger* | *create* | *activate* | *prepare*
-| *zap* | *list*]
+| *zap* | *list* | *batch*]
| **ceph-volume** **simple** [ *trigger* | *scan* | *activate* ]
Subcommands:
+**batch**
+Creates OSDs from a list of devices using a ``filestore``
+or ``bluestore`` (default) setup. It will create all necessary volume groups
+and logical volumes required to have a working OSD.
+
+Example usage with three devices::
+
+ ceph-volume lvm batch --bluestore /dev/sda /dev/sdb /dev/sdc
+
+Optional arguments:
+
+* [-h, --help] show the help message and exit
+* [--bluestore] Use the bluestore objectstore (default)
+* [--filestore] Use the filestore objectstore
+* [--yes] Skip the report and prompt to continue provisioning
+* [--dmcrypt] Enable encryption for the underlying OSD devices
+* [--crush-device-class] Define a CRUSH device class to assign the OSD to
+* [--no-systemd] Do not enable or create any systemd units
+* [--report] Report what the potential outcome would be for the
+ current input (requires devices to be passed in)
+* [--format] Output format when reporting (used along with
+ --report), can be one of 'pretty' (default) or 'json'
+
+Required positional arguments:
+
+* <DEVICE> Full path to a raw device, like ``/dev/sda``. Multiple
+ ``<DEVICE>`` paths can be passed in.
+
+
**activate**
Enables a systemd unit that persists the OSD ID and its UUID (also called
``fsid`` in Ceph CLI tools), so that at boot time it can understand what OSD is
NOTE: A negative value means to disable subdir merging
:Type: Integer
:Required: No
-:Default: ``10``
+:Default: ``-10``
``filestore split multiple``
file. For example:
.. code-block:: ini
-
+
[osd]
- osd journal size = 1024
-
+ osd journal size = 5120
+
[osd.0]
host = osd-host-a
-
+
[osd.1]
host = osd-host-b
The following settings provide an Ceph OSD Daemon's ID, and determine paths to
data and journals. Ceph deployment scripts typically generate the UUID
-automatically. We **DO NOT** recommend changing the default paths for data or
-journals, as it makes it more problematic to troubleshoot Ceph later.
+automatically.
+
+.. warning:: **DO NOT** change the default paths for data or journals, as it
+ makes it more problematic to troubleshoot Ceph later.
The journal size should be at least twice the product of the expected drive
speed multiplied by ``filestore max sync interval``. However, the most common
:Description: The universally unique identifier (UUID) for the Ceph OSD Daemon.
:Type: UUID
:Default: The UUID.
-:Note: The ``osd uuid`` applies to a single Ceph OSD Daemon. The ``fsid``
+:Note: The ``osd uuid`` applies to a single Ceph OSD Daemon. The ``fsid``
applies to the entire cluster.
-``osd data``
+``osd data``
-:Description: The path to the OSDs data. You must create the directory when
- deploying Ceph. You should mount a drive for OSD data at this
- mount point. We do not recommend changing the default.
+:Description: The path to the OSDs data. You must create the directory when
+ deploying Ceph. You should mount a drive for OSD data at this
+ mount point. We do not recommend changing the default.
:Type: String
:Default: ``/var/lib/ceph/osd/$cluster-$id``
-``osd max write size``
+``osd max write size``
:Description: The maximum size of a write in megabytes.
:Type: 32-bit Integer
:Description: The largest client data message allowed in memory.
:Type: 64-bit Unsigned Integer
-:Default: 500MB default. ``500*1024L*1024L``
+:Default: 500MB default. ``500*1024L*1024L``
-``osd class dir``
+``osd class dir``
:Description: The class path for RADOS class plug-ins.
:Type: String
====================
Ceph builds and mounts file systems which are used for Ceph OSDs.
-``osd mkfs options {fs-type}``
+``osd mkfs options {fs-type}``
:Description: Options used when creating a new Ceph OSD of type {fs-type}.
For example::
``osd mkfs options xfs = -f -d agcount=24``
-``osd mount options {fs-type}``
+``osd mount options {fs-type}``
:Description: Options used when mounting a Ceph OSD of type {fs-type}.
/var/lib/ceph/osd/$cluster-$id/journal
-Without performance optimization, Ceph stores the journal on the same disk as
-the Ceph OSD Daemons data. An Ceph OSD Daemon optimized for performance may use
-a separate disk to store journal data (e.g., a solid state drive delivers high
-performance journaling).
+When using a single device type (for example, spinning drives), the journals
+should be *colocated*: the logical volume (or partition) should be in the same
+device as the ``data`` logical volume.
-Ceph's default ``osd journal size`` is 0, so you will need to set this in your
-``ceph.conf`` file. A journal size should find the product of the ``filestore
-max sync interval`` and the expected throughput, and multiply the product by
-two (2)::
-
- osd journal size = {2 * (expected throughput * filestore max sync interval)}
+When using a mix of fast (SSDs, NVMe) devices with slower ones (like spinning
+drives) it makes sense to place the journal on the faster device, while
+``data`` occupies the slower device fully.
-The expected throughput number should include the expected disk throughput
-(i.e., sustained data transfer rate), and network throughput. For example,
-a 7200 RPM disk will likely have approximately 100 MB/s. Taking the ``min()``
-of the disk and network throughput should provide a reasonable expected
-throughput. Some users just start off with a 10GB journal size. For
-example::
+The default ``osd journal size`` value is 5120 (5 gigabytes), but it can be
+larger, in which case it will need to be set in the ``ceph.conf`` file::
- osd journal size = 10000
+ osd journal size = 10240
-``osd journal``
+
+``osd journal``
:Description: The path to the OSD's journal. This may be a path to a file or a
- block device (such as a partition of an SSD). If it is a file,
+ block device (such as a partition of an SSD). If it is a file,
you must create the directory to contain it. We recommend using a
drive separate from the ``osd data`` drive.
:Default: ``/var/lib/ceph/osd/$cluster-$id/journal``
-``osd journal size``
+``osd journal size``
-:Description: The size of the journal in megabytes. If this is 0, and the
- journal is a block device, the entire block device is used.
- Since v0.54, this is ignored if the journal is a block device,
- and the entire block device is used.
+:Description: The size of the journal in megabytes.
:Type: 32-bit Integer
:Default: ``5120``
-:Recommended: Begin with 1GB. Should be at least twice the product of the
- expected speed multiplied by ``filestore max sync interval``.
See `Journal Config Reference`_ for additional details.
scrubbing operations.
-``osd max scrubs``
+``osd max scrubs``
-:Description: The maximum number of simultaneous scrub operations for
+:Description: The maximum number of simultaneous scrub operations for
a Ceph OSD Daemon.
:Type: 32-bit Int
-:Default: ``1``
+:Default: ``1``
``osd scrub begin hour``
:Default: ``true``
-``osd scrub thread timeout``
+``osd scrub thread timeout``
:Description: The maximum time in seconds before timing out a scrub thread.
:Type: 32-bit Integer
-:Default: ``60``
+:Default: ``60``
-``osd scrub finalize thread timeout``
+``osd scrub finalize thread timeout``
-:Description: The maximum time in seconds before timing out a scrub finalize
+:Description: The maximum time in seconds before timing out a scrub finalize
thread.
:Type: 32-bit Integer
:Default: ``60*10``
-``osd scrub load threshold``
+``osd scrub load threshold``
:Description: The maximum load. Ceph will not scrub when the system load
(as defined by ``getloadavg()``) is higher than this number.
Default is ``0.5``.
:Type: Float
-:Default: ``0.5``
+:Default: ``0.5``
-``osd scrub min interval``
+``osd scrub min interval``
:Description: The minimal interval in seconds for scrubbing the Ceph OSD Daemon
when the Ceph Storage Cluster load is low.
:Default: Once per day. ``60*60*24``
-``osd scrub max interval``
+``osd scrub max interval``
-:Description: The maximum interval in seconds for scrubbing the Ceph OSD Daemon
+:Description: The maximum interval in seconds for scrubbing the Ceph OSD Daemon
irrespective of cluster load.
:Type: Float
``osd deep scrub interval``
-:Description: The interval for "deep" scrubbing (fully reading all data). The
+:Description: The interval for "deep" scrubbing (fully reading all data). The
``osd scrub load threshold`` does not affect this setting.
:Type: Float
``osd client op priority``
-:Description: The priority set for client operations. It is relative to
+:Description: The priority set for client operations. It is relative to
``osd recovery op priority``.
:Type: 32-bit Integer
-:Default: ``63``
+:Default: ``63``
:Valid Range: 1-63
``osd recovery op priority``
-:Description: The priority set for recovery operations. It is relative to
+:Description: The priority set for recovery operations. It is relative to
``osd client op priority``.
:Type: 32-bit Integer
-:Default: ``3``
+:Default: ``3``
:Valid Range: 1-63
:Valid Range: 1-63
-``osd op thread timeout``
+``osd op thread timeout``
:Description: The Ceph OSD Daemon operation thread timeout in seconds.
:Type: 32-bit Integer
-:Default: ``15``
+:Default: ``15``
-``osd op complaint time``
+``osd op complaint time``
:Description: An operation becomes complaint worthy after the specified number
of seconds have elapsed.
:Type: Float
-:Default: ``30``
+:Default: ``30``
-``osd disk threads``
+``osd disk threads``
-:Description: The number of disk threads, which are used to perform background
- disk intensive OSD operations such as scrubbing and snap
+:Description: The number of disk threads, which are used to perform background
+ disk intensive OSD operations such as scrubbing and snap
trimming.
:Type: 32-bit Integer
-:Default: ``1``
+:Default: ``1``
``osd disk thread ioprio class``
operations. ``be`` is the default and is the same
priority as all other threads in the OSD. ``rt`` means
the disk thread will have precendence over all other
- threads in the OSD. Note: Only works with the Linux Kernel
+ threads in the OSD. Note: Only works with the Linux Kernel
CFQ scheduler. Since Jewel scrubbing is no longer carried
out by the disk iothread, see osd priority options instead.
:Type: String
the objects they contain can reduce the cluster's operational performance
considerably. To maintain operational performance, Ceph performs this migration
with 'backfilling', which allows Ceph to set backfill operations to a lower
-priority than requests to read or write data.
+priority than requests to read or write data.
``osd max backfills``
:Default: ``1``
-``osd backfill scan min``
+``osd backfill scan min``
:Description: The minimum number of objects per backfill scan.
:Type: 32-bit Integer
-:Default: ``64``
+:Default: ``64``
-``osd backfill scan max``
+``osd backfill scan max``
:Description: The maximum number of objects per backfill scan.
:Type: 32-bit Integer
-:Default: ``512``
+:Default: ``512``
``osd backfill retry interval``
OSD Map
=======
-OSD maps reflect the OSD daemons operating in the cluster. Over time, the
+OSD maps reflect the OSD daemons operating in the cluster. Over time, the
number of map epochs increases. Ceph provides some settings to ensure that
Ceph performs well as the OSD map grows larger.
``osd map dedup``
-:Description: Enable removing duplicates in the OSD map.
+:Description: Enable removing duplicates in the OSD map.
:Type: Boolean
:Default: ``true``
-``osd map cache size``
+``osd map cache size``
:Description: The number of OSD maps to keep cached.
:Type: 32-bit Integer
``osd map cache bl size``
-:Description: The size of the in-memory OSD map cache in OSD daemons.
+:Description: The size of the in-memory OSD map cache in OSD daemons.
:Type: 32-bit Integer
:Default: ``50``
``osd map cache bl inc size``
-:Description: The size of the in-memory OSD map cache incrementals in
+:Description: The size of the in-memory OSD map cache incrementals in
OSD daemons.
:Type: 32-bit Integer
:Default: ``100``
-``osd map message max``
+``osd map message max``
:Description: The maximum map entries allowed per MOSDMap message.
:Type: 32-bit Integer
To maintain operational performance, Ceph performs recovery with limitations on
the number recovery requests, threads and object chunk sizes which allows Ceph
-perform well in a degraded state.
+perform well in a degraded state.
-``osd recovery delay start``
+``osd recovery delay start``
-:Description: After peering completes, Ceph will delay for the specified number
+:Description: After peering completes, Ceph will delay for the specified number
of seconds before starting to recover objects.
:Type: Float
-:Default: ``0``
+:Default: ``0``
-``osd recovery max active``
+``osd recovery max active``
-:Description: The number of active recovery requests per OSD at one time. More
- requests will accelerate recovery, but the requests places an
+:Description: The number of active recovery requests per OSD at one time. More
+ requests will accelerate recovery, but the requests places an
increased load on the cluster.
:Type: 32-bit Integer
:Default: ``3``
-``osd recovery max chunk``
+``osd recovery max chunk``
-:Description: The maximum size of a recovered chunk of data to push.
+:Description: The maximum size of a recovered chunk of data to push.
:Type: 64-bit Unsigned Integer
-:Default: ``8 << 20``
+:Default: ``8 << 20``
``osd recovery max single start``
:Default: ``1``
-``osd recovery thread timeout``
+``osd recovery thread timeout``
:Description: The maximum time in seconds before timing out a recovery thread.
:Type: 32-bit Integer
``osd recover clone overlap``
-:Description: Preserves clone overlap during recovery. Should always be set
+:Description: Preserves clone overlap during recovery. Should always be set
to ``true``.
:Type: Boolean
=============
-``osd snap trim thread timeout``
+``osd snap trim thread timeout``
:Description: The maximum time in seconds before timing out a snap trim thread.
:Type: 32-bit Integer
-:Default: ``60*60*1``
+:Default: ``60*60*1``
-``osd backlog thread timeout``
+``osd backlog thread timeout``
:Description: The maximum time in seconds before timing out a backlog thread.
:Type: 32-bit Integer
-:Default: ``60*60*1``
+:Default: ``60*60*1``
-``osd default notify timeout``
+``osd default notify timeout``
:Description: The OSD default notification timeout (in seconds).
:Type: 32-bit Unsigned Integer
-:Default: ``30``
+:Default: ``30``
-``osd check for log corruption``
+``osd check for log corruption``
:Description: Check log files for corruption. Can be computationally expensive.
:Type: Boolean
-:Default: ``false``
+:Default: ``false``
-``osd remove thread timeout``
+``osd remove thread timeout``
:Description: The maximum time in seconds before timing out a remove OSD thread.
:Type: 32-bit Integer
:Default: ``60*60``
-``osd command thread timeout``
+``osd command thread timeout``
:Description: The maximum time in seconds before timing out a command thread.
:Type: 32-bit Integer
-:Default: ``10*60``
+:Default: ``10*60``
-``osd command max records``
+``osd command max records``
-:Description: Limits the number of lost objects to return.
+:Description: Limits the number of lost objects to return.
:Type: 32-bit Integer
-:Default: ``256``
+:Default: ``256``
-``osd auto upgrade tmap``
+``osd auto upgrade tmap``
:Description: Uses ``tmap`` for ``omap`` on old objects.
:Type: Boolean
:Default: ``true``
-
-``osd tmapput sets users tmap``
+
+``osd tmapput sets users tmap``
:Description: Uses ``tmap`` for debugging only.
:Type: Boolean
-:Default: ``false``
+:Default: ``false``
``osd fast fail on connection refused``
Parameters
~~~~~~~~~~
-+-----------------+-----------+-----------------------------------------------------------------------+
-| Name | Type | Description |
-+=================+===========+=======================================================================+
-| ``prefix`` | String | Only returns objects that contain the specified prefix. |
-+-----------------+-----------+-----------------------------------------------------------------------+
-| ``delimiter`` | String | The delimiter between the prefix and the rest of the object name. |
-+-----------------+-----------+-----------------------------------------------------------------------+
-| ``marker`` | String | A beginning index for the list of objects returned. |
-+-----------------+-----------+-----------------------------------------------------------------------+
-| ``max-keys`` | Integer | The maximum number of keys to return. Default is 1000. |
-+-----------------+-----------+-----------------------------------------------------------------------+
-
++---------------------+-----------+-------------------------------------------------------------------------------------------------+
+| Name | Type | Description |
++=====================+===========+=================================================================================================+
+| ``prefix`` | String | Only returns objects that contain the specified prefix. |
++---------------------+-----------+-------------------------------------------------------------------------------------------------+
+| ``delimiter`` | String | The delimiter between the prefix and the rest of the object name. |
++---------------------+-----------+-------------------------------------------------------------------------------------------------+
+| ``marker`` | String | A beginning index for the list of objects returned. |
++---------------------+-----------+-------------------------------------------------------------------------------------------------+
+| ``max-keys`` | Integer | The maximum number of keys to return. Default is 1000. |
++---------------------+-----------+-------------------------------------------------------------------------------------------------+
+| ``allow-unordered`` | Boolean | Non-standard extension. Allows results to be returned unordered. Cannot be used with delimiter. |
++---------------------+-----------+-------------------------------------------------------------------------------------------------+
HTTP Response
~~~~~~~~~~~~~
:Type: String
:Required: No
+``allow_unordered``
+
+:Description: Allows the results to be returned unordered to reduce computation overhead. Cannot be used with ``delimiter``.
+:Type: Boolean
+:Required: No
+:Non-Standard Extension: Yes
+
Response Entities
~~~~~~~~~~~~~~~~~
CXX?=g++
CXX_FLAGS?=-std=c++11 -Wall -Wextra -Werror -g
-CXX_LIBS?=-lboost_system -lrados -lradosstriper
+CXX_LIBS?=-lrados -lradosstriper
CXX_INC?=$(LOCAL_LIBRADOS_INC)
CXX_CC=$(CXX) $(CXX_FLAGS) $(CXX_INC) $(LOCAL_LIBRADOS) $(CXX_LIBS)
$SUDO $builddepcmd $DIR/ceph.spec 2>&1 | tee $DIR/yum-builddep.out
! grep -q -i error: $DIR/yum-builddep.out || exit 1
;;
- opensuse|suse|sles)
+ opensuse*|suse|sles)
echo "Using zypper to install dependencies"
$SUDO zypper --gpg-auto-import-keys --non-interactive install lsb-release systemd-rpm-macros
munge_ceph_spec_in $DIR/ceph.spec
else
sudo sysctl -w ${KERNCORE}=${COREPATTERN}
fi
+# Clean out any cores in core target directory (currently .)
+if ls $(dirname $(sysctl -n $KERNCORE)) | grep -q '^core\|core$' ; then
+ mkdir found.cores.$$ 2> /dev/null || true
+ for i in $(ls $(dirname $(sysctl -n $KERNCORE)) | grep '^core\|core$'); do
+ mv $i found.cores.$$
+ done
+ echo "Stray cores put in $(pwd)/found.cores.$$"
+fi
+
ulimit -c unlimited
for f in $(cd $location ; find . -perm $exec_mode -type f)
do
# subvolumes that relate to it.
#
# @param dir path name of the environment
+# @param dumplogs pass "1" to dump logs otherwise it will only if cores found
# @return 0 on success, 1 on error
#
function teardown() {
pattern=""
fi
# Local we start with core and teuthology ends with core
- if ls $(dirname $pattern) | grep -q '^core\|core$' ; then
+ if ls $(dirname "$pattern") | grep -q '^core\|core$' ; then
cores="yes"
if [ -n "$LOCALRUN" ]; then
mkdir /tmp/cores.$$ 2> /dev/null || true
fi
fi
if [ "$cores" = "yes" -o "$dumplogs" = "1" ]; then
- display_logs $dir
+ if [ -n "$LOCALRUN" ]; then
+ display_logs $dir
+ else
+ # Move logs to where Teuthology will archive it
+ mkdir -p $TESTDIR/archive/log
+ mv $dir/*.log $TESTDIR/archive/log
+ fi
fi
rm -fr $dir
rm -rf $(get_asok_dir)
#######################################################################
##
-# Wait until the cluster becomes HEALTH_OK again or if it does not make progress
-# for $TIMEOUT seconds.
+# Wait until the cluster health detail matches the condition passed as
+# an argument, polling for up to $TIMEOUT seconds.
#
-# @return 0 if the cluster is HEALTHY, 1 otherwise
+# @param string to grep for in health detail
+# @return 0 if the cluster health matches request, 1 otherwise
#
function wait_for_health() {
local grepstr=$1
done
}
+##
+# Wait until the cluster becomes HEALTH_OK again or if it does not make progress
+# for $TIMEOUT seconds.
+#
+# @return 0 if the cluster is HEALTHY, 1 otherwise
+#
function wait_for_health_ok() {
wait_for_health "HEALTH_OK" || return 1
}
done
}
+# Run diff on the given arguments; when the files differ and $DIFFCOLOPTS is
+# set, re-run diff with those options (presumably display/color options —
+# TODO confirm) so the mismatch is easier to read.
+# Returns non-zero when the inputs differ.
+function multidiff() {
+    if ! diff $@ ; then
+        # No extra diff options configured: just report the plain failure
+        if [ "$DIFFCOLOPTS" = "" ]; then
+            return 1
+        fi
+        # Re-run with the configured options; its exit status is propagated
+        diff $DIFFCOLOPTS $@
+    fi
+}
+
# Local Variables:
# compile-command: "cd ../../src ; make -j4 && ../qa/standalone/ceph-helpers.sh TESTS # test_get_config"
# End:
setup $dir || return 1
# disable pg dir merge
- CEPH_ARGS+="--filestore-merge-threshold=-10 "
export CEPH_ARGS
run_mon $dir a || return 1
run_osd $dir 0 || return 1
}
-main recout "$@"
+main osd-backfill-stats "$@"
# Local Variables:
# compile-command: "make -j4 && ../qa/run-standalone.sh osd-backfill-stats.sh"
kill_daemons $dir || return 1
}
-main recout "$@"
+main osd-recovery-stats "$@"
# Local Variables:
# compile-command: "make -j4 && ../qa/run-standalone.sh osd-recovery-stats.sh"
ceph tell osd.\* injectargs -- --osd-pg-log-trim-min 10 || return 1
ceph tell osd.\* injectargs -- --osd-pg-log-dups-tracked 10 || return 1
- touch foo
+ touch $dir/foo
for i in $(seq 1 20)
do
- rados -p test put foo foo || return 1
+ rados -p test put foo $dir/foo || return 1
done
test_log_size $PGID 20 || return 1
setup_log_test $dir || return 1
# regular write should trim the log
- rados -p test put foo foo || return 1
+ rados -p test put foo $dir/foo || return 1
test_log_size $PGID 22 || return 1
}
--- /dev/null
+#!/bin/bash -x
+#
+# Copyright (C) 2014 Red Hat <contact@redhat.com>
+#
+# Author: Loic Dachary <loic@dachary.org>
+#
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU Library Public License as published by
+# the Free Software Foundation; either version 2, or (at your option)
+# any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU Library Public License for more details.
+#
+source $CEPH_ROOT/qa/standalone/ceph-helpers.sh
+
+# Test development and debugging
+# Set to "yes" in order to ignore diff errors and save results to update test
+getjson="yes"
+
+# Filter out mtime and local_mtime dates, version, prior_version and last_reqid (client) from any object_info.
+jqfilter='def walk(f):
+ . as $in
+ | if type == "object" then
+ reduce keys[] as $key
+ ( {}; . + { ($key): ($in[$key] | walk(f)) } ) | f
+ elif type == "array" then map( walk(f) ) | f
+ else f
+ end;
+walk(if type == "object" then del(.mtime) else . end)
+| walk(if type == "object" then del(.local_mtime) else . end)
+| walk(if type == "object" then del(.last_reqid) else . end)
+| walk(if type == "object" then del(.version) else . end)
+| walk(if type == "object" then del(.prior_version) else . end)
+| walk(if type == "object" then del(.redirect_target) else . end)
+| walk(if type == "object" then del(.legacy_snaps) else . end)'
+
+sortkeys='import json; import sys ; JSON=sys.stdin.read() ; ud = json.loads(JSON) ; print json.dumps(ud, sort_keys=True, indent=2)'
+
+# Entry point: set up CEPH_ARGS for a single-mon test environment and run
+# every TEST_* function defined in this script (or only the functions passed
+# as extra arguments).
+# @param dir path name of the environment
+# @return 0 on success, 1 as soon as any test function fails
+function run() {
+    local dir=$1
+    shift
+
+    export CEPH_MON="127.0.0.1:7107" # git grep '\<7107\>' : there must be only one
+    export CEPH_ARGS
+    CEPH_ARGS+="--fsid=$(uuidgen) --auth-supported=none "
+    CEPH_ARGS+="--mon-host=$CEPH_MON "
+    # osd-distrust-data-digest is the option under test in this script
+    CEPH_ARGS+="--osd-distrust-data-digest=true "
+
+    # Default to all TEST_* functions currently defined in the shell
+    local funcs=${@:-$(set | sed -n -e 's/^\(TEST_[0-9a-z_]*\) .*/\1/p')}
+    for func in $funcs ; do
+        $func $dir || return 1
+    done
+}
+
+# Write a small object into a pool, first setting or unsetting the
+# noscrub/nodeep-scrub cluster flags.
+# @param dir path name of the environment (used for the scratch payload file)
+# @param poolname pool to write the object into
+# @param obj object name (default SOMETHING)
+# @param scrub "noscrub" (default) to set the scrub-blocking flags,
+#        any other value to unset them
+# @return 0 on success, 1 on error
+function add_something() {
+    local dir=$1
+    local poolname=$2
+    local obj=${3:-SOMETHING}
+    local scrub=${4:-noscrub}
+
+    if [ "$scrub" = "noscrub" ];
+    then
+        ceph osd set noscrub || return 1
+        ceph osd set nodeep-scrub || return 1
+    else
+        ceph osd unset noscrub || return 1
+        ceph osd unset nodeep-scrub || return 1
+    fi
+
+    # Fixed payload so the resulting data digest is deterministic
+    local payload=ABCDEF
+    echo $payload > $dir/ORIGINAL
+    rados --pool $poolname put $obj $dir/ORIGINAL || return 1
+}
+
+#
+# Test automatic repair with distrust set
+#
+function TEST_distrust_scrub_replicated() {
+ local dir=$1
+ local poolname=dsr_pool
+ local total_objs=2
+
+ setup $dir || return 1
+ run_mon $dir a --osd_pool_default_size=2 || return 1
+ run_mgr $dir x || return 1
+ run_osd $dir 0 || return 1
+ run_osd $dir 1 || return 1
+ create_rbd_pool || return 1
+ wait_for_clean || return 1
+
+ create_pool foo 1 || return 1
+ create_pool $poolname 1 1 || return 1
+ wait_for_clean || return 1
+
+ for i in $(seq 1 $total_objs) ; do
+ objname=ROBJ${i}
+ add_something $dir $poolname $objname || return 1
+ done
+
+ local pg=$(get_pg $poolname ROBJ0)
+
+ for i in $(seq 1 $total_objs) ; do
+ objname=ROBJ${i}
+
+ case $i in
+ 1)
+            # Deep-scrub only (all replicas are different than the object info)
+ local payload=XROBJ1
+ echo $payload > $dir/new.ROBJ1
+ objectstore_tool $dir 0 $objname set-bytes $dir/new.ROBJ1 || return 1
+ objectstore_tool $dir 1 $objname set-bytes $dir/new.ROBJ1 || return 1
+ ;;
+
+ 2)
+            # Deep-scrub only (all replicas are different than the object info)
+ local payload=XROBJ2
+ echo $payload > $dir/new.ROBJ2
+ objectstore_tool $dir 0 $objname set-bytes $dir/new.ROBJ2 || return 1
+ objectstore_tool $dir 1 $objname set-bytes $dir/new.ROBJ2 || return 1
+ # Make one replica have a different object info, so a full repair must happen too
+ objectstore_tool $dir 0 $objname corrupt-info || return 1
+ ;;
+ esac
+ done
+
+ # This should fix the data_digest because osd-distrust-data-digest is true
+ pg_deep_scrub $pg
+
+ # This hangs if the scrub didn't repair the data_digest
+ timeout 30 rados -p $poolname get ROBJ1 $dir/robj1.out || return 1
+ diff -q $dir/new.ROBJ1 $dir/robj1.out || return 1
+ rm -f $dir/new.ROBJ1 $dir/robj1.out || return 1
+
+ rados list-inconsistent-pg $poolname > $dir/json || return 1
+ # Check pg count
+ test $(jq '. | length' $dir/json) = "1" || return 1
+ # Check pgid
+ test $(jq -r '.[0]' $dir/json) = $pg || return 1
+
+ rados list-inconsistent-obj $pg > $dir/json || return 1
+ # Get epoch for repair-get requests
+ epoch=$(jq .epoch $dir/json)
+
+ jq "$jqfilter" << EOF | jq '.inconsistents' | python -c "$sortkeys" > $dir/checkcsjson
+{
+ "inconsistents": [
+ {
+ "shards": [
+ {
+ "object_info": {
+ "watchers": {},
+ "manifest": {
+ "redirect_target": {
+ "namespace": "",
+ "pool": -9223372036854776000,
+ "max": 0,
+ "hash": 0,
+ "snapid": 0,
+ "key": "",
+ "oid": ""
+ },
+ "type": 0
+ },
+ "alloc_hint_flags": 255,
+ "expected_write_size": 0,
+ "local_mtime": "2018-07-24 15:05:56.027234",
+ "mtime": "2018-07-24 15:05:56.021775",
+ "size": 7,
+ "user_version": 2,
+ "last_reqid": "client.4137.0:1",
+ "prior_version": "0'0",
+ "version": "23'2",
+ "oid": {
+ "namespace": "",
+ "pool": 3,
+ "max": 0,
+ "hash": 2026323607,
+ "snapid": -2,
+ "key": "",
+ "oid": "ROBJ2"
+ },
+ "lost": 0,
+ "flags": [
+ "dirty",
+ "data_digest"
+ ],
+ "legacy_snaps": [],
+ "truncate_seq": 0,
+ "truncate_size": 0,
+ "data_digest": "0x2ddbf8f5",
+ "omap_digest": "0xffffffff",
+ "expected_object_size": 0
+ },
+ "data_digest": "0x0bb7ab52",
+ "omap_digest": "0xffffffff",
+ "size": 7,
+ "errors": [],
+ "primary": false,
+ "osd": 0
+ },
+ {
+ "object_info": {
+ "watchers": {},
+ "manifest": {
+ "redirect_target": {
+ "namespace": "",
+ "pool": -9223372036854776000,
+ "max": 0,
+ "hash": 0,
+ "snapid": 0,
+ "key": "",
+ "oid": ""
+ },
+ "type": 0
+ },
+ "alloc_hint_flags": 0,
+ "expected_write_size": 0,
+ "local_mtime": "2018-07-24 15:05:56.027234",
+ "mtime": "2018-07-24 15:05:56.021775",
+ "size": 7,
+ "user_version": 2,
+ "last_reqid": "client.4137.0:1",
+ "prior_version": "0'0",
+ "version": "23'2",
+ "oid": {
+ "namespace": "",
+ "pool": 3,
+ "max": 0,
+ "hash": 2026323607,
+ "snapid": -2,
+ "key": "",
+ "oid": "ROBJ2"
+ },
+ "lost": 0,
+ "flags": [
+ "dirty",
+ "data_digest"
+ ],
+ "legacy_snaps": [],
+ "truncate_seq": 0,
+ "truncate_size": 0,
+ "data_digest": "0x2ddbf8f5",
+ "omap_digest": "0xffffffff",
+ "expected_object_size": 0
+ },
+ "data_digest": "0x0bb7ab52",
+ "omap_digest": "0xffffffff",
+ "size": 7,
+ "errors": [],
+ "primary": true,
+ "osd": 1
+ }
+ ],
+ "selected_object_info": {
+ "watchers": {},
+ "manifest": {
+ "redirect_target": {
+ "namespace": "",
+ "pool": -9223372036854776000,
+ "max": 0,
+ "hash": 0,
+ "snapid": 0,
+ "key": "",
+ "oid": ""
+ },
+ "type": 0
+ },
+ "alloc_hint_flags": 0,
+ "expected_write_size": 0,
+ "local_mtime": "2018-07-24 15:05:56.027234",
+ "mtime": "2018-07-24 15:05:56.021775",
+ "size": 7,
+ "user_version": 2,
+ "last_reqid": "client.4137.0:1",
+ "prior_version": "0'0",
+ "version": "23'2",
+ "oid": {
+ "namespace": "",
+ "pool": 3,
+ "max": 0,
+ "hash": 2026323607,
+ "snapid": -2,
+ "key": "",
+ "oid": "ROBJ2"
+ },
+ "lost": 0,
+ "flags": [
+ "dirty",
+ "data_digest"
+ ],
+ "legacy_snaps": [],
+ "truncate_seq": 0,
+ "truncate_size": 0,
+ "data_digest": "0x2ddbf8f5",
+ "omap_digest": "0xffffffff",
+ "expected_object_size": 0
+ },
+ "union_shard_errors": [],
+ "errors": [
+ "object_info_inconsistency"
+ ],
+ "object": {
+ "version": 2,
+ "snap": "head",
+ "locator": "",
+ "nspace": "",
+ "name": "ROBJ2"
+ }
+ }
+ ],
+ "epoch": 42
+}
+EOF
+
+ jq "$jqfilter" $dir/json | jq '.inconsistents' | python -c "$sortkeys" > $dir/csjson
+ diff ${DIFFCOLOPTS} $dir/checkcsjson $dir/csjson || test $getjson = "yes" || return 1
+ if test $getjson = "yes"
+ then
+ jq '.' $dir/json > save1.json
+ fi
+
+ if test "$LOCALRUN" = "yes" && which jsonschema > /dev/null;
+ then
+ jsonschema -i $dir/json $CEPH_ROOT/doc/rados/command/list-inconsistent-obj.json || return 1
+ fi
+
+ repair $pg
+ wait_for_clean
+
+ timeout 30 rados -p $poolname get ROBJ2 $dir/robj2.out || return 1
+ diff -q $dir/new.ROBJ2 $dir/robj2.out || return 1
+ rm -f $dir/new.ROBJ2 $dir/robj2.out || return 1
+
+ rados rmpool $poolname $poolname --yes-i-really-really-mean-it
+ teardown $dir || return 1
+}
+
+main osd-scrub-distrust "$@"
+
+# Local Variables:
+# compile-command: "cd build ; make -j4 && \
+# ../qa/run-standalone.sh osd-scrub-distrust"
+# End:
function TEST_corrupt_scrub_replicated() {
local dir=$1
local poolname=csr_pool
- local total_objs=16
+ local total_objs=18
setup $dir || return 1
run_mon $dir a --osd_pool_default_size=2 || return 1
objectstore_tool $dir 0 $objname rm-attr snapset || return 1
echo -n bad-val > $dir/bad-val
objectstore_tool $dir 1 $objname set-attr snapset $dir/bad-val || return 1
+ ;;
+
+ 17)
+            # Deep-scrub only (all replicas are different than the object info)
+ local payload=ROBJ17
+ echo $payload > $dir/new.ROBJ17
+ objectstore_tool $dir 0 $objname set-bytes $dir/new.ROBJ17 || return 1
+ objectstore_tool $dir 1 $objname set-bytes $dir/new.ROBJ17 || return 1
+ ;;
+
+ 18)
+            # Deep-scrub only (all replicas are different than the object info)
+ local payload=ROBJ18
+ echo $payload > $dir/new.ROBJ18
+ objectstore_tool $dir 0 $objname set-bytes $dir/new.ROBJ18 || return 1
+ objectstore_tool $dir 1 $objname set-bytes $dir/new.ROBJ18 || return 1
+ # Make one replica have a different object info, so a full repair must happen too
+ objectstore_tool $dir $osd $objname corrupt-info || return 1
esac
done
]
},
{
+ "errors": [
+ "object_info_inconsistency"
+ ],
+ "object": {
+ "locator": "",
+ "name": "ROBJ18",
+ "nspace": "",
+ "snap": "head"
+ },
+ "selected_object_info": {
+ "alloc_hint_flags": 255,
+ "data_digest": "0x2ddbf8f5",
+ "expected_object_size": 0,
+ "expected_write_size": 0,
+ "flags": [
+ "dirty",
+ "omap",
+ "data_digest",
+ "omap_digest"
+ ],
+ "lost": 0,
+ "manifest": {
+ "type": 0
+ },
+ "oid": {
+ "hash": 1629828556,
+ "key": "",
+ "max": 0,
+ "namespace": "",
+ "oid": "ROBJ18",
+ "pool": 3,
+ "snapid": -2
+ },
+ "omap_digest": "0xddc3680f",
+ "size": 7,
+ "truncate_seq": 0,
+ "truncate_size": 0,
+ "user_version": 54,
+ "watchers": {}
+ },
+ "shards": [
+ {
+ "errors": [],
+ "object_info": {
+ "alloc_hint_flags": 0,
+ "data_digest": "0x2ddbf8f5",
+ "expected_object_size": 0,
+ "expected_write_size": 0,
+ "flags": [
+ "dirty",
+ "omap",
+ "data_digest",
+ "omap_digest"
+ ],
+ "lost": 0,
+ "manifest": {
+ "type": 0
+ },
+ "oid": {
+ "hash": 1629828556,
+ "key": "",
+ "max": 0,
+ "namespace": "",
+ "oid": "ROBJ18",
+ "pool": 3,
+ "snapid": -2
+ },
+ "omap_digest": "0xddc3680f",
+ "size": 7,
+ "truncate_seq": 0,
+ "truncate_size": 0,
+ "user_version": 54,
+ "watchers": {}
+ },
+ "osd": 0,
+ "primary": false,
+ "size": 7
+ },
+ {
+ "errors": [],
+ "object_info": {
+ "alloc_hint_flags": 255,
+ "data_digest": "0x2ddbf8f5",
+ "expected_object_size": 0,
+ "expected_write_size": 0,
+ "flags": [
+ "dirty",
+ "omap",
+ "data_digest",
+ "omap_digest"
+ ],
+ "lost": 0,
+ "manifest": {
+ "type": 0
+ },
+ "oid": {
+ "hash": 1629828556,
+ "key": "",
+ "max": 0,
+ "namespace": "",
+ "oid": "ROBJ18",
+ "pool": 3,
+ "snapid": -2
+ },
+ "omap_digest": "0xddc3680f",
+ "size": 7,
+ "truncate_seq": 0,
+ "truncate_size": 0,
+ "user_version": 54,
+ "watchers": {}
+ },
+ "osd": 1,
+ "primary": true,
+ "size": 7
+ }
+ ],
+ "union_shard_errors": []
+ },
+ {
"shards": [
{
"size": 7,
"version": "79'66",
"prior_version": "79'65",
"last_reqid": "client.4554.0:1",
- "user_version": 66,
+ "user_version": 74,
"size": 7,
"mtime": "",
"local_mtime": "",
"version": "95'67",
"prior_version": "51'64",
"last_reqid": "client.4649.0:1",
- "user_version": 67,
+ "user_version": 75,
"size": 1,
"mtime": "",
"local_mtime": "",
"version": "95'67",
"prior_version": "51'64",
"last_reqid": "client.4649.0:1",
- "user_version": 67,
+ "user_version": 75,
"size": 1,
"mtime": "",
"local_mtime": "",
EOF
jq "$jqfilter" $dir/json | jq '.inconsistents' | python -c "$sortkeys" > $dir/csjson
- diff ${DIFFCOLOPTS} $dir/checkcsjson $dir/csjson || test $getjson = "yes" || return 1
+ multidiff $dir/checkcsjson $dir/csjson || test $getjson = "yes" || return 1
if test $getjson = "yes"
then
jq '.' $dir/json > save1.json
]
},
{
- "shards": [
+ "errors": [],
+ "object": {
+ "locator": "",
+ "name": "ROBJ17",
+ "nspace": "",
+ "snap": "head"
+ },
+ "selected_object_info": {
+ "alloc_hint_flags": 0,
+ "data_digest": "0x2ddbf8f5",
+ "expected_object_size": 0,
+ "expected_write_size": 0,
+ "flags": [
+ "dirty",
+ "omap",
+ "data_digest",
+ "omap_digest"
+ ],
+ "lost": 0,
+ "manifest": {
+ "type": 0
+ },
+ "oid": {
+ "hash": 1884071249,
+ "key": "",
+ "max": 0,
+ "namespace": "",
+ "oid": "ROBJ17",
+ "pool": 3,
+ "snapid": -2
+ },
+ "omap_digest": "0xe9572720",
+ "size": 7,
+ "truncate_seq": 0,
+ "truncate_size": 0,
+ "user_version": 51,
+ "watchers": {}
+ },
+ "shards": [
+ {
+ "data_digest": "0x5af0c3ef",
+ "errors": [
+ "data_digest_mismatch_info"
+ ],
+ "omap_digest": "0xe9572720",
+ "osd": 0,
+ "primary": false,
+ "size": 7
+ },
+ {
+ "data_digest": "0x5af0c3ef",
+ "errors": [
+ "data_digest_mismatch_info"
+ ],
+ "omap_digest": "0xe9572720",
+ "osd": 1,
+ "primary": true,
+ "size": 7
+ }
+ ],
+ "union_shard_errors": [
+ "data_digest_mismatch_info"
+ ]
+ },
+ {
+ "errors": [
+ "object_info_inconsistency"
+ ],
+ "object": {
+ "locator": "",
+ "name": "ROBJ18",
+ "nspace": "",
+ "snap": "head"
+ },
+ "selected_object_info": {
+ "alloc_hint_flags": 255,
+ "data_digest": "0x2ddbf8f5",
+ "expected_object_size": 0,
+ "expected_write_size": 0,
+ "flags": [
+ "dirty",
+ "omap",
+ "data_digest",
+ "omap_digest"
+ ],
+ "lost": 0,
+ "manifest": {
+ "type": 0
+ },
+ "oid": {
+ "hash": 1629828556,
+ "key": "",
+ "max": 0,
+ "namespace": "",
+ "oid": "ROBJ18",
+ "pool": 3,
+ "snapid": -2
+ },
+ "omap_digest": "0xddc3680f",
+ "size": 7,
+ "truncate_seq": 0,
+ "truncate_size": 0,
+ "user_version": 54,
+ "watchers": {}
+ },
+ "shards": [
+ {
+ "data_digest": "0xbd89c912",
+ "errors": [
+ "data_digest_mismatch_info"
+ ],
+ "object_info": {
+ "alloc_hint_flags": 0,
+ "data_digest": "0x2ddbf8f5",
+ "expected_object_size": 0,
+ "expected_write_size": 0,
+ "flags": [
+ "dirty",
+ "omap",
+ "data_digest",
+ "omap_digest"
+ ],
+ "lost": 0,
+ "manifest": {
+ "type": 0
+ },
+ "oid": {
+ "hash": 1629828556,
+ "key": "",
+ "max": 0,
+ "namespace": "",
+ "oid": "ROBJ18",
+ "pool": 3,
+ "snapid": -2
+ },
+ "omap_digest": "0xddc3680f",
+ "size": 7,
+ "truncate_seq": 0,
+ "truncate_size": 0,
+ "user_version": 54,
+ "watchers": {}
+ },
+ "omap_digest": "0xddc3680f",
+ "osd": 0,
+ "primary": false,
+ "size": 7
+ },
+ {
+ "data_digest": "0xbd89c912",
+ "errors": [
+ "data_digest_mismatch_info"
+ ],
+ "object_info": {
+ "alloc_hint_flags": 255,
+ "data_digest": "0x2ddbf8f5",
+ "expected_object_size": 0,
+ "expected_write_size": 0,
+ "flags": [
+ "dirty",
+ "omap",
+ "data_digest",
+ "omap_digest"
+ ],
+ "lost": 0,
+ "manifest": {
+ "type": 0
+ },
+ "oid": {
+ "hash": 1629828556,
+ "key": "",
+ "max": 0,
+ "namespace": "",
+ "oid": "ROBJ18",
+ "pool": 3,
+ "snapid": -2
+ },
+ "omap_digest": "0xddc3680f",
+ "size": 7,
+ "truncate_seq": 0,
+ "truncate_size": 0,
+ "user_version": 54,
+ "watchers": {}
+ },
+ "omap_digest": "0xddc3680f",
+ "osd": 1,
+ "primary": true,
+ "size": 7
+ }
+ ],
+ "union_shard_errors": [
+ "data_digest_mismatch_info"
+ ]
+ },
+ {
+ "shards": [
{
"data_digest": "0x578a4830",
"omap_digest": "0xf8e11918",
"version": "79'66",
"prior_version": "79'65",
"last_reqid": "client.4554.0:1",
- "user_version": 66,
+ "user_version": 74,
"size": 7,
"mtime": "2018-04-05 14:34:05.598688",
"local_mtime": "2018-04-05 14:34:05.599698",
"version": "119'68",
"prior_version": "51'64",
"last_reqid": "client.4834.0:1",
- "user_version": 68,
+ "user_version": 76,
"size": 3,
"mtime": "2018-04-05 14:35:01.500659",
"local_mtime": "2018-04-05 14:35:01.502117",
"version": "119'68",
"prior_version": "51'64",
"last_reqid": "client.4834.0:1",
- "user_version": 68,
+ "user_version": 76,
"size": 3,
"mtime": "2018-04-05 14:35:01.500659",
"local_mtime": "2018-04-05 14:35:01.502117",
EOF
jq "$jqfilter" $dir/json | jq '.inconsistents' | python -c "$sortkeys" > $dir/csjson
- diff ${DIFFCOLOPTS} $dir/checkcsjson $dir/csjson || test $getjson = "yes" || return 1
+ multidiff $dir/checkcsjson $dir/csjson || test $getjson = "yes" || return 1
if test $getjson = "yes"
then
jq '.' $dir/json > save2.json
jsonschema -i $dir/json $CEPH_ROOT/doc/rados/command/list-inconsistent-obj.json || return 1
fi
+ repair $pg
+ wait_for_clean
+
+ # This hangs if the repair doesn't work
+ timeout 30 rados -p $poolname get ROBJ17 $dir/robj17.out || return 1
+ timeout 30 rados -p $poolname get ROBJ18 $dir/robj18.out || return 1
+ # Even though we couldn't repair all of the introduced errors, we can fix ROBJ17
+ diff -q $dir/new.ROBJ17 $dir/robj17.out || return 1
+ rm -f $dir/new.ROBJ17 $dir/robj17.out || return 1
+ diff -q $dir/new.ROBJ18 $dir/robj18.out || return 1
+ rm -f $dir/new.ROBJ18 $dir/robj18.out || return 1
+
rados rmpool $poolname $poolname --yes-i-really-really-mean-it
teardown $dir || return 1
}
EOF
jq "$jqfilter" $dir/json | jq '.inconsistents' | python -c "$sortkeys" > $dir/csjson
- diff ${DIFFCOLOPTS} $dir/checkcsjson $dir/csjson || test $getjson = "yes" || return 1
+ multidiff $dir/checkcsjson $dir/csjson || test $getjson = "yes" || return 1
if test $getjson = "yes"
then
jq '.' $dir/json > save3.json
fi
jq "$jqfilter" $dir/json | jq '.inconsistents' | python -c "$sortkeys" > $dir/csjson
- diff ${DIFFCOLOPTS} $dir/checkcsjson $dir/csjson || test $getjson = "yes" || return 1
+ multidiff $dir/checkcsjson $dir/csjson || test $getjson = "yes" || return 1
if test $getjson = "yes"
then
if [ "$allow_overwrites" = "true" ]
EOF
jq "$jqfilter" $dir/json | jq '.inconsistents' | python -c "$sortkeys" > $dir/csjson
- diff ${DIFFCOLOPTS} $dir/checkcsjson $dir/csjson || test $getjson = "yes" || return 1
+ multidiff $dir/checkcsjson $dir/csjson || test $getjson = "yes" || return 1
if test $getjson = "yes"
then
jq '.' $dir/json > save6.json
EOF
jq "$jqfilter" $dir/json | python -c "$sortkeys" > $dir/csjson
- diff ${DIFFCOLOPTS} $dir/checkcsjson $dir/csjson || test $getjson = "yes" || return 1
+ multidiff $dir/checkcsjson $dir/csjson || test $getjson = "yes" || return 1
rados list-inconsistent-snapset $pgid > $dir/json || return 1
EOF
jq "$jqfilter" $dir/json | python -c "$sortkeys" > $dir/csjson
- diff ${DIFFCOLOPTS} $dir/checkcsjson $dir/csjson || test $getjson = "yes" || return 1
+ multidiff $dir/checkcsjson $dir/csjson || test $getjson = "yes" || return 1
if test $getjson = "yes"
then
jq '.' $dir/json > save1.json
fi
jq "$jqfilter" $dir/json | python -c "$sortkeys" > $dir/csjson
- diff ${DIFFCOLOPTS} $dir/checkcsjson $dir/csjson || test $getjson = "yes" || return 1
+ multidiff $dir/checkcsjson $dir/csjson || test $getjson = "yes" || return 1
if test $getjson = "yes"
then
jq '.' $dir/json > save1.json
--- /dev/null
+#!/usr/bin/env bash
+#
+# Copyright (C) 2015 Intel <contact@intel.com.com>
+# Copyright (C) 2014, 2015 Red Hat <contact@redhat.com>
+#
+# Author: Xiaoxi Chen <xiaoxi.chen@intel.com>
+#
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU Library Public License as published by
+# the Free Software Foundation; either version 2, or (at your option)
+# any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU Library Public License for more details.
+#
+
+source $CEPH_ROOT/qa/standalone/ceph-helpers.sh
+
+# Entry point: set up CEPH_ARGS for a single-mon test environment, then run
+# each TEST_* function (or only those passed as extra arguments) wrapped in
+# its own setup/teardown.
+# @param dir path name of the environment
+# @return 0 on success, 1 as soon as setup, a test, or teardown fails
+function run() {
+    local dir=$1
+    shift
+
+    export CEPH_MON="127.0.0.1:7144" # git grep '\<7144\>' : there must be only one
+    export CEPH_ARGS
+    CEPH_ARGS+="--fsid=$(uuidgen) --auth-supported=none "
+    CEPH_ARGS+="--mon-host=$CEPH_MON "
+
+    # Default to all TEST_* functions currently defined in the shell
+    local funcs=${@:-$(set | sed -n -e 's/^\(TEST_[0-9a-z_]*\) .*/\1/p')}
+    for func in $funcs ; do
+        setup $dir || return 1
+        $func $dir || return 1
+        teardown $dir || return 1
+    done
+}
+
+# Resurrect a snapshot clone behind the OSDs' backs after its snapshot was
+# removed, then trigger a pg repair.  The test only asserts that the OSDs
+# survive (respond to `tell`) afterwards — NOTE(review): the repair outcome
+# itself is not checked, just daemon liveness.
+# @param dir path name of the environment
+# @return 0 on success, 1 on error
+function TEST_recover_unexpected() {
+    local dir=$1
+
+    run_mon $dir a || return 1
+    run_mgr $dir x || return 1
+    run_osd $dir 0 || return 1
+    run_osd $dir 1 || return 1
+    run_osd $dir 2 || return 1
+
+    # Write, snapshot, then overwrite so a clone object exists
+    ceph osd pool create foo 1
+    rados -p foo put foo /etc/passwd
+    rados -p foo mksnap snap
+    rados -p foo put foo /etc/group
+
+    wait_for_clean || return 1
+
+    local osd=$(get_primary foo foo)
+
+    # Capture the clone's objectstore JSON handle plus its attrs and bytes
+    JSON=`objectstore_tool $dir $osd --op list foo | grep snapid.:1`
+    echo "JSON is $JSON"
+    rm -f $dir/_ $dir/data
+    objectstore_tool $dir $osd "$JSON" get-attr _ > $dir/_ || return 1
+    objectstore_tool $dir $osd "$JSON" get-bytes $dir/data || return 1
+
+    # Remove the snapshot, then put the clone back via the objectstore tool,
+    # creating a clone the OSD no longer expects
+    rados -p foo rmsnap snap
+
+    sleep 5
+
+    objectstore_tool $dir $osd "$JSON" set-bytes $dir/data || return 1
+    objectstore_tool $dir $osd "$JSON" set-attr _ $dir/_ || return 1
+
+    sleep 5
+
+    ceph pg repair 1.0 || return 1
+
+    sleep 10
+
+    ceph log last
+
+    # make sure osds are still up
+    timeout 60 ceph tell osd.0 version || return 1
+    timeout 60 ceph tell osd.1 version || return 1
+    timeout 60 ceph tell osd.2 version || return 1
+}
+
+
+main osd-unexpected-clone "$@"
+
+# Local Variables:
+# compile-command: "make -j4 && ../qa/run-standalone.sh osd-unexpected-clone.sh"
+# End:
ms die on skipped message: false
client:
rbd default features: 5
+ log-whitelist:
+ - slow request
ms die on skipped message: false
client:
rbd default features: 5
+ log-whitelist:
+ - slow request
ms die on skipped message: false
client:
rbd default features: 5
+ log-whitelist:
+ - slow request
ms die on skipped message: false
client:
rbd default features: 5
+ log-whitelist:
+ - slow request
ms die on skipped message: false
client:
rbd default features: 5
+ log-whitelist:
+ - slow request
ms die on skipped message: false
client:
rbd default features: 5
+ log-whitelist:
+ - slow request
+overrides:
+ ceph:
+ conf:
+ global:
+ osd_pg_log_dups_tracked: 10000
+
tasks:
- ceph-fuse:
- workunit:
ceph:
log-whitelist:
- \(MDS_TRIM\)
+ - \(MDS_SLOW_REQUEST\)
- Behind on trimming
--- /dev/null
+../parallel/slow_requests.yaml
\ No newline at end of file
--- /dev/null
+overrides:
+ ceph:
+ log-whitelist:
+ - slow request
--- /dev/null
+../parallel/slow_requests.yaml
\ No newline at end of file
--- /dev/null
+../parallel/slow_requests.yaml
\ No newline at end of file
- wrongly marked
- \(POOL_APP_NOT_ENABLED\)
- overall HEALTH_
+ - \(REQUEST_SLOW\)
+ - slow request
conf:
global:
enable experimental unrecoverable data corrupting features: "*"
- overall HEALTH_
- \(MON_DOWN\)
- \(MGR_DOWN\)
+ - \(REQUEST_SLOW\)
+ - slow request
conf:
global:
enable experimental unrecoverable data corrupting features: "*"
run workload and upgrade-sequence in parallel
install ceph/luminous v12.2.5 point version
run workload and upgrade-sequence in parallel
+ install ceph/luminous v12.2.7 point version
+ run workload and upgrade-sequence in parallel
install ceph/luminous latest version
run workload and upgrade-sequence in parallel
overrides:
- PG_AVAILABILITY
- PG_DEGRADED
- application not enabled
+ - overall HEALTH_
fs: xfs
conf:
mon:
- workload_luminous
- upgrade-sequence_luminous
- print: "**** done parallel luminous v12.2.5"
+
+#### upgrade to v12.2.7
+- install.upgrade:
+ #exclude_packages: ['ceph-mgr','libcephfs2','libcephfs-devel','libcephfs-dev']
+ mon.a:
+ tag: v12.2.7
+ mon.b:
+ tag: v12.2.7
+ # Note that client.a IS NOT upgraded at this point
+- parallel:
+ - workload_luminous
+ - upgrade-sequence_luminous
+- print: "**** done parallel luminous v12.2.7"
+
#### upgrade to latest luminous
- install.upgrade:
#exclude_packages: ['ceph-mgr','libcephfs2','libcephfs-devel','libcephfs-dev']
mon.a:
- branch: luminous
mon.b:
- branch: luminous
# Note that client.a IS NOT upgraded at this point
- parallel:
- workload_luminous
+++ /dev/null
-openstack:
- - volumes: # attached to each instance
- count: 3
- size: 30 # GB
+++ /dev/null
-meta:
-- desc: |
- Run ceph on two nodes,
- with a separate client 0,1,2 third node.
- Use xfs beneath the osds.
- CephFS tests running on client 2,3
-roles:
-- - mon.a
- - mgr.x
- - mds.a
- - osd.0
- - osd.1
-- - mon.b
- - mon.c
- - osd.2
- - osd.3
-- - client.0
- - client.1
- - client.2
- - client.3
-- - client.4
-overrides:
- ceph:
- log-whitelist:
- - scrub mismatch
- - ScrubResult
- - wrongly marked
- - \(POOL_APP_NOT_ENABLED\)
- - overall HEALTH_
- conf:
- global:
- enable experimental unrecoverable data corrupting features: "*"
- mon:
- mon warn on osd down out interval zero: false
- osd:
- osd_class_load_list: "cephfs hello journal lock log numops rbd refcount
- replica_log rgw sdk statelog timeindex user version"
- osd_class_default_list: "cephfs hello journal lock log numops rbd refcount
- replica_log rgw sdk statelog timeindex user version"
- fs: xfs
+++ /dev/null
-meta:
-- desc: |
- install ceph/luminous latest
- run workload and upgrade-sequence in parallel
- upgrade the client node
-tasks:
-- install:
- branch: luminous
-- print: "**** done installing luminous"
-- ceph:
- log-whitelist:
- - overall HEALTH_
- - \(FS_
- - \(MDS_
- - \(OSD_
- - \(MON_DOWN\)
- - \(CACHE_POOL_
- - \(POOL_
- - \(MGR_DOWN\)
- - \(PG_
- - \(SMALLER_PGP_NUM\)
- - Monitor daemon marked osd
- - Behind on trimming
- - Manager daemon
- conf:
- global:
- mon warn on pool no app: false
-- exec:
- osd.0:
- - ceph osd require-osd-release luminous
- - ceph osd set-require-min-compat-client luminous
-- print: "**** done ceph"
-- install.upgrade:
- mon.a:
- mon.b:
-- print: "**** done install.upgrade both hosts"
-- parallel:
- - workload
- - upgrade-sequence
-- print: "**** done parallel"
-- install.upgrade:
- client.0:
-- print: "**** done install.upgrade on client.0"
+++ /dev/null
-meta:
-- desc: |
- run a cephfs stress test
- mount ceph-fuse on client.2 before running workunit
-workload:
- full_sequential:
- - sequential:
- - ceph-fuse:
- - print: "**** done ceph-fuse 2-workload"
- - workunit:
- clients:
- client.2:
- - suites/blogbench.sh
- - print: "**** done suites/blogbench.sh 2-workload"
+++ /dev/null
-meta:
-- desc: |
- run run randomized correctness test for rados operations
- on an erasure-coded pool
-workload:
- full_sequential:
- - rados:
- clients: [client.0]
- ops: 4000
- objects: 50
- ec_pool: true
- write_append_excl: false
- op_weights:
- read: 100
- write: 0
- append: 100
- delete: 50
- snap_create: 50
- snap_remove: 50
- rollback: 50
- copy_from: 50
- setattr: 25
- rmattr: 25
- - print: "**** done rados ec task"
+++ /dev/null
-meta:
-- desc: |
- object class functional tests
-workload:
- full_sequential:
- - workunit:
- branch: luminous
- clients:
- client.0:
- - cls
- - print: "**** done cls 2-workload"
+++ /dev/null
-meta:
-- desc: |
- generate read/write load with rados objects ranging from 1MB to 25MB
-workload:
- full_sequential:
- - workunit:
- branch: luminous
- clients:
- client.0:
- - rados/load-gen-big.sh
- - print: "**** done rados/load-gen-big.sh 2-workload"
+++ /dev/null
-meta:
-- desc: |
- librbd C and C++ api tests
-workload:
- full_sequential:
- - workunit:
- branch: luminous
- clients:
- client.0:
- - rbd/test_librbd.sh
- - print: "**** done rbd/test_librbd.sh 2-workload"
+++ /dev/null
-meta:
-- desc: |
- librbd python api tests
-workload:
- full_sequential:
- - workunit:
- branch: luminous
- clients:
- client.0:
- - rbd/test_librbd_python.sh
- - print: "**** done rbd/test_librbd_python.sh 2-workload"
+++ /dev/null
-meta:
-- desc: |
- upgrade the ceph cluster
-upgrade-sequence:
- sequential:
- - ceph.restart:
- daemons: [mon.a, mon.b, mon.c, mgr.x]
- - ceph.restart:
- daemons: [osd.0, osd.1, osd.2, osd.3]
- wait-for-healthy: false
- wait-for-osds-up: true
- - ceph.restart:
- daemons: [mds.a]
- wait-for-healthy: false
- wait-for-osds-up: true
- - print: "**** done ceph.restart all"
+++ /dev/null
-meta:
-- desc: |
- upgrade the ceph cluster,
- upgrate in two steps
- step one ordering: mon.a, osd.0, osd.1, mds.a
- step two ordering: mon.b, mon.c, osd.2, osd.3
- ceph expected to be healthy state after each step
-upgrade-sequence:
- sequential:
- - ceph.restart:
- daemons: [mon.a]
- wait-for-healthy: true
- - sleep:
- duration: 60
- - ceph.restart:
- daemons: [mon.b, mon.c, mgr.x]
- wait-for-healthy: true
- - sleep:
- duration: 60
- - ceph.restart:
- daemons: [osd.0, osd.1]
- wait-for-healthy: true
- - sleep:
- duration: 60
- - ceph.restart: [mds.a]
- - sleep:
- duration: 60
- - sleep:
- duration: 60
- - ceph.restart:
- daemons: [osd.2, osd.3]
- wait-for-healthy: false
- wait-for-osds-up: true
- - sleep:
- duration: 60
+++ /dev/null
-meta:
-- desc: |
- run a cephfs stress test
- mount ceph-fuse on client.3 before running workunit
-tasks:
-- sequential:
- - ceph-fuse:
- - print: "**** done ceph-fuse 5-final-workload"
- - workunit:
- clients:
- client.3:
- - suites/blogbench.sh
- - print: "**** done suites/blogbench.sh 5-final-workload"
+++ /dev/null
-meta:
-- desc: |
- randomized correctness test for rados operations on a replicated pool with snapshots
-tasks:
- - rados:
- clients: [client.1]
- ops: 4000
- objects: 50
- write_append_excl: false
- op_weights:
- read: 100
- write: 100
- delete: 50
- snap_create: 50
- snap_remove: 50
- rollback: 50
- - print: "**** done rados 4-final-workload"
+++ /dev/null
-meta:
-- desc: |
- generate read/write load with rados objects ranging from 1 byte to 1MB
-tasks:
- - workunit:
- clients:
- client.1:
- - rados/load-gen-mix.sh
- - print: "**** done rados/load-gen-mix.sh 4-final-workload"
+++ /dev/null
-meta:
-- desc: |
- librados C and C++ api tests
-overrides:
- ceph:
- log-whitelist:
- - reached quota
-tasks:
- - mon_thrash:
- revive_delay: 20
- thrash_delay: 1
- - print: "**** done mon_thrash 4-final-workload"
- - workunit:
- branch: luminous
- clients:
- client.1:
- - rados/test.sh
- - print: "**** done rados/test.sh 4-final-workload"
+++ /dev/null
-meta:
-- desc: |
- rbd object class functional tests
-tasks:
- - workunit:
- clients:
- client.1:
- - cls/test_cls_rbd.sh
- - print: "**** done cls/test_cls_rbd.sh 4-final-workload"
+++ /dev/null
-meta:
-- desc: |
- run basic import/export cli tests for rbd
- on NO upgrated client
-tasks:
- - workunit:
- branch: luminous
- clients:
- client.4:
- - rbd/import_export.sh
- env:
- RBD_CREATE_ARGS: --new-format
- - print: "**** done rbd/import_export.sh 4-final-workload on NO upgrated client"
+++ /dev/null
-meta:
-- desc: |
- run basic import/export cli tests for rbd
- on upgrated client
-tasks:
- - workunit:
- clients:
- client.1:
- - rbd/import_export.sh
- env:
- RBD_CREATE_ARGS: --new-format
- - print: "**** done rbd/import_export.sh 4-final-workload on upgrated client"
+++ /dev/null
-meta:
-- desc: |
- swift api tests for rgw
-overrides:
- rgw:
- frontend: civetweb
-tasks:
- - rgw: [client.1]
- - print: "**** done rgw 4-final-workload"
- - swift:
- client.1:
- rgw_server: client.1
- - print: "**** done swift 4-final-workload"
+++ /dev/null
-../../../../distros/supported/
\ No newline at end of file
+++ /dev/null
-../stress-split/objectstore/
\ No newline at end of file
+++ /dev/null
-../stress-split/0-cluster/
\ No newline at end of file
+++ /dev/null
-../stress-split/1-ceph-install/
\ No newline at end of file
+++ /dev/null
-../stress-split/2-partial-upgrade/
\ No newline at end of file
+++ /dev/null
-meta:
-- desc: |
- randomly kill and revive osd
- small chance to increase the number of pgs
-overrides:
- ceph:
- log-whitelist:
- - but it is still running
- - wrongly marked me down
- - objects unfound and apparently lost
- - log bound mismatch
-tasks:
-- parallel:
- - stress-tasks
-stress-tasks:
-- thrashosds:
- timeout: 1200
- chance_pgnum_grow: 1
- chance_pgpnum_fix: 1
- min_in: 4
- chance_thrash_cluster_full: 0
- chance_thrash_pg_upmap: 0
- chance_thrash_pg_upmap_items: 0
- chance_force_recovery: 0
-- print: "**** done thrashosds 3-thrash"
+++ /dev/null
-meta:
-- desc: |
- randomized correctness test for rados operations on an erasure coded pool
-stress-tasks:
- - rados:
- clients: [client.0]
- ops: 4000
- objects: 50
- ec_pool: true
- write_append_excl: false
- op_weights:
- read: 100
- write: 0
- append: 100
- delete: 50
- snap_create: 50
- snap_remove: 50
- rollback: 50
- copy_from: 50
- setattr: 25
- rmattr: 25
- - print: "**** done rados ec task"
+++ /dev/null
-../stress-split/5-finish-upgrade.yaml
\ No newline at end of file
+++ /dev/null
-#
-# k=3 implies a stripe_width of 1376*3 = 4128 which is different from
-# the default value of 4096 It is also not a multiple of 1024*1024 and
-# creates situations where rounding rules during recovery becomes
-# necessary.
-#
-meta:
-- desc: |
- randomized correctness test for rados operations on an erasure coded pool
- using the jerasure plugin with k=3 and m=1
-tasks:
-- rados:
- clients: [client.0]
- ops: 4000
- objects: 50
- ec_pool: true
- write_append_excl: false
- erasure_code_profile:
- name: jerasure31profile
- plugin: jerasure
- k: 3
- m: 1
- technique: reed_sol_van
- crush-failure-domain: osd
- op_weights:
- read: 100
- write: 0
- append: 100
- delete: 50
- snap_create: 50
- snap_remove: 50
- rollback: 50
- copy_from: 50
- setattr: 25
- rmattr: 25
+++ /dev/null
-../../../../distros/supported/
\ No newline at end of file
+++ /dev/null
-../stress-split/objectstore/
\ No newline at end of file
+++ /dev/null
-../../../../tasks/thrashosds-health.yaml
\ No newline at end of file
+++ /dev/null
-openstack:
- - machine:
- disk: 100 # GB
- - volumes: # attached to each instance
- count: 3
- size: 30 # GB
+++ /dev/null
-meta:
-- desc: |
- Run ceph on two nodes,
- with a separate client-only node.
- Use xfs beneath the osds.
-overrides:
- ceph:
- fs: xfs
- log-whitelist:
- - overall HEALTH_
- - \(MON_DOWN\)
- - \(MGR_DOWN\)
- conf:
- global:
- enable experimental unrecoverable data corrupting features: "*"
- mon:
- mon warn on osd down out interval zero: false
-roles:
-- - mon.a
- - mon.b
- - mon.c
- - mgr.x
- - osd.0
- - osd.1
- - osd.2
-- - osd.3
- - osd.4
- - osd.5
-- - client.0
+++ /dev/null
-meta:
-- desc: install ceph/luminous latest
-tasks:
-- install:
- branch: luminous
-- print: "**** done install luminous"
-- ceph:
-- exec:
- osd.0:
- - ceph osd require-osd-release luminous
- - ceph osd set-require-min-compat-client luminous
-- print: "**** done ceph "
-overrides:
- ceph:
- conf:
- mon:
- mon warn on osd down out interval zero: false
+++ /dev/null
-meta:
-- desc: |
- install upgrade ceph/-x on one node only
- 1st half
- restart : osd.0,1,2
-tasks:
-- install.upgrade:
- osd.0:
-- print: "**** done install.upgrade osd.0"
-- ceph.restart:
- daemons: [mon.a,mon.b,mon.c,mgr.x,osd.0,osd.1,osd.2]
-- print: "**** done ceph.restart 1st half"
+++ /dev/null
-meta:
-- desc: |
- randomly kill and revive osd
- small chance to increase the number of pgs
-overrides:
- ceph:
- log-whitelist:
- - but it is still running
- - wrongly marked me down
- - objects unfound and apparently lost
- - log bound mismatch
-tasks:
-- parallel:
- - stress-tasks
-stress-tasks:
-- thrashosds:
- timeout: 1200
- chance_pgnum_grow: 1
- chance_pgpnum_fix: 1
- chance_thrash_cluster_full: 0
- chance_thrash_pg_upmap: 0
- chance_thrash_pg_upmap_items: 0
- disable_objectstore_tool_tests: true
- chance_force_recovery: 0
-- print: "**** done thrashosds 3-thrash"
+++ /dev/null
-meta:
-- desc: |
- run randomized correctness test for rados operations
- generate write load with rados bench
-stress-tasks:
-- full_sequential:
- - radosbench:
- clients: [client.0]
- time: 150
- - radosbench:
- clients: [client.0]
- time: 150
- - radosbench:
- clients: [client.0]
- time: 150
- - radosbench:
- clients: [client.0]
- time: 150
- - radosbench:
- clients: [client.0]
- time: 150
- - radosbench:
- clients: [client.0]
- time: 150
- - radosbench:
- clients: [client.0]
- time: 150
- - radosbench:
- clients: [client.0]
- time: 150
- - radosbench:
- clients: [client.0]
- time: 150
- - radosbench:
- clients: [client.0]
- time: 150
- - radosbench:
- clients: [client.0]
- time: 150
-- print: "**** done radosbench 7-workload"
+++ /dev/null
-meta:
-- desc: |
- run basic cls tests for rbd
-stress-tasks:
-- workunit:
- branch: luminous
- clients:
- client.0:
- - cls/test_cls_rbd.sh
-- print: "**** done cls/test_cls_rbd.sh 5-workload"
+++ /dev/null
-meta:
-- desc: |
- run basic import/export cli tests for rbd
-stress-tasks:
-- workunit:
- branch: luminous
- clients:
- client.0:
- - rbd/import_export.sh
- env:
- RBD_CREATE_ARGS: --new-format
-- print: "**** done rbd/import_export.sh 5-workload"
+++ /dev/null
-meta:
-- desc: |
- librbd C and C++ api tests
-stress-tasks:
-- workunit:
- branch: luminous
- clients:
- client.0:
- - rbd/test_librbd.sh
-- print: "**** done rbd/test_librbd.sh 7-workload"
+++ /dev/null
-meta:
-- desc: |
- randomized correctness test for rados operations on a replicated pool,
- using only reads, writes, and deletes
-stress-tasks:
-- full_sequential:
- - rados:
- clients: [client.0]
- ops: 4000
- objects: 500
- write_append_excl: false
- op_weights:
- read: 45
- write: 45
- delete: 10
-- print: "**** done rados/readwrite 5-workload"
+++ /dev/null
-meta:
-- desc: |
- randomized correctness test for rados operations on a replicated pool with snapshot operations
-stress-tasks:
-- full_sequential:
- - rados:
- clients: [client.0]
- ops: 4000
- objects: 50
- write_append_excl: false
- op_weights:
- read: 100
- write: 100
- delete: 50
- snap_create: 50
- snap_remove: 50
- rollback: 50
-- print: "**** done rados/snaps-few-objects 5-workload"
+++ /dev/null
-tasks:
-- install.upgrade:
- osd.3:
- client.0:
-- ceph.restart:
- daemons: [osd.3, osd.4, osd.5]
- wait-for-healthy: false
- wait-for-osds-up: true
-
+++ /dev/null
-meta:
-- desc: |
- librbd python api tests
-tasks:
-- workunit:
- branch: luminous
- clients:
- client.0:
- - rbd/test_librbd_python.sh
-- print: "**** done rbd/test_librbd_python.sh 9-workload"
+++ /dev/null
-meta:
-- desc: |
- swift api tests for rgw
-tasks:
-- rgw:
- client.0:
-- print: "**** done rgw 9-workload"
-- swift:
- client.0:
- rgw_server: client.0
-- print: "**** done swift 9-workload"
+++ /dev/null
-meta:
-- desc: |
- randomized correctness test for rados operations on a replicated pool with snapshot operations
-tasks:
-- rados:
- clients: [client.0]
- ops: 4000
- objects: 500
- write_append_excl: false
- op_weights:
- read: 100
- write: 100
- delete: 50
- snap_create: 50
- snap_remove: 50
- rollback: 50
+++ /dev/null
-../../../../distros/supported/
\ No newline at end of file
+++ /dev/null
-../../../../../objectstore/bluestore.yaml
\ No newline at end of file
+++ /dev/null
-../../../../../objectstore/filestore-xfs.yaml
\ No newline at end of file
+++ /dev/null
-../../../../tasks/thrashosds-health.yaml
\ No newline at end of file
from teuthology import misc as teuthology
from cephfs.fuse_mount import FuseMount
+from tasks.cephfs.filesystem import MDSCluster
+from tasks.cephfs.filesystem import Filesystem
log = logging.getLogger(__name__)
all_mounts = getattr(ctx, 'mounts', {})
mounted_by_me = {}
+ log.info('Wait for MDS to reach steady state...')
+ mds_cluster = MDSCluster(ctx)
+ status = mds_cluster.status()
+ for filesystem in status.get_filesystems():
+ fs = Filesystem(ctx, fscid=filesystem['id'])
+ fs.wait_for_daemons()
+ log.info('Ready to start ceph-fuse...')
+
# Construct any new FuseMount instances
for id_, remote in clients:
client_config = config.get("client.%s" % id_)
self._one_or_all(mds_id, _fail_restart)
+ def mds_signal(self, mds_id, sig, silent=False):
+ """
+ signal a MDS daemon
+ """
+ self.mds_daemons[mds_id].signal(sig, silent);
+
def newfs(self, name='cephfs', create=True):
return Filesystem(self._ctx, name=name, create=create)
+import time
+import signal
import json
import logging
from unittest import case, SkipTest
self.fs.mon_manager.raw_cluster_cmd('fs', 'set', self.fs.name, 'standby_count_wanted', '0')
self.wait_for_health_clear(timeout=30)
+ def test_discontinuous_mdsmap(self):
+ """
+ That discontinuous mdsmap does not affect failover.
+ See http://tracker.ceph.com/issues/24856.
+ """
+ mds_ids = sorted(self.mds_cluster.mds_ids)
+ mds_a, mds_b = mds_ids[0:2]
+ # Assign mds to fixed ranks. To prevent standby mds from replacing frozen mds
+ rank = 0;
+ for mds_id in mds_ids:
+ self.set_conf("mds.{0}".format(mds_id), "mds_standby_for_rank", str(rank))
+ rank += 1
+ self.mds_cluster.mds_restart()
+ self.fs.wait_for_daemons()
+
+ self.fs.set_max_mds(2)
+ self.fs.wait_for_state('up:active', rank=1)
+
+ # Drop 'export prep' message, make import stay in 'discovered' state
+ self.fs.mds_asok(['config', 'set', 'mds_inject_migrator_message_loss', '82'], mds_id=mds_b)
+
+ self.mount_a.run_shell(["mkdir", "a"])
+ self.mount_a.setfattr("a", "ceph.dir.pin", "1")
+ self.mount_a.umount_wait()
+
+ # Should be long enough for start the export
+ time.sleep(30)
+
+ grace = float(self.fs.get_config("mds_beacon_grace", service_type="mon"))
+ monc_timeout = float(self.fs.get_config("mon_client_ping_timeout", service_type="mds"))
+ # Freeze mds_b
+ self.mds_cluster.mds_signal(mds_b, signal.SIGSTOP)
+ self.wait_until_true(
+ lambda: "laggy_since" in self.fs.mon_manager.get_mds_status(mds_b),
+ timeout=grace * 2
+ )
+
+ self.mds_cluster.mds_restart(mds_a)
+ self.fs.wait_for_state('up:resolve', rank=0, timeout=30)
+
+ # Make sure of mds_b's monitor connection gets reset
+ time.sleep(monc_timeout * 2)
+
+ # Unfreeze mds_b, it will get discontinuous mdsmap
+ self.mds_cluster.mds_signal(mds_b, signal.SIGCONT)
+ self.wait_until_true(
+ lambda: "laggy_since" not in self.fs.mon_manager.get_mds_status(mds_b),
+ timeout=grace * 2
+ )
+ # Check if mds_b sends 'resolve' message to mds_a. If not, mds_a can't become active
+ self.fs.wait_for_state('up:active', rank=0, timeout=30)
class TestStandbyReplay(CephFSTestCase):
MDSS_REQUIRED = 4
self.mount_a.umount_wait()
with self.assert_cluster_log("inode table repaired", invert_match=True):
- self.fs.mds_asok(["scrub_path", "/", "repair", "recursive"])
+ out_json = self.fs.mds_asok(["scrub_path", "/", "repair", "recursive"])
+ self.assertNotEqual(out_json, None)
self.mds_cluster.mds_stop()
self.mds_cluster.mds_fail()
self.fs.wait_for_daemons()
with self.assert_cluster_log("inode table repaired"):
- self.fs.mds_asok(["scrub_path", "/", "repair", "recursive"])
+ out_json = self.fs.mds_asok(["scrub_path", "/", "repair", "recursive"])
+ self.assertNotEqual(out_json, None)
self.mds_cluster.mds_stop()
table_text = self.fs.table_tool(["0", "show", "inode"])
"oh i'm sorry did i overwrite your xattr?")
with self.assert_cluster_log("bad backtrace on inode"):
- self.fs.mds_asok(["scrub_path", "/", "repair", "recursive"])
+ out_json = self.fs.mds_asok(["scrub_path", "/", "repair", "recursive"])
+ self.assertNotEqual(out_json, None)
self.fs.mds_asok(["flush", "journal"])
backtrace = self.fs.read_backtrace(file_ino)
self.assertEqual(['alpha', 'parent_a'],
ValidationError = namedtuple("ValidationError", ["exception", "backtrace"])
-class Workload(object):
+class Workload(CephFSTestCase):
def __init__(self, filesystem, mount):
self._mount = mount
self._filesystem = filesystem
# a string
self._errors = []
- def assert_equal(self, a, b):
- try:
- if a != b:
- raise AssertionError("{0} != {1}".format(a, b))
- except AssertionError as e:
- self._errors.append(
- ValidationError(e, traceback.format_exc(3))
- )
-
def write(self):
"""
Write the workload files to the mount
self._filesystem.mds_asok(["flush", "journal"])
bt = self._filesystem.read_backtrace(st['st_ino'])
parent = bt['ancestors'][0]['dname']
- self.assert_equal(parent, "sixmegs")
+ self.assertEqual(parent, 'sixmegs')
return self._errors
def damage(self):
self._filesystem.wait_for_daemons()
def validate(self):
- self._filesystem.mds_asok(["scrub_path", "/", "recursive", "repair"])
- self.assert_equal(self._filesystem.are_daemons_healthy(), True)
+ out_json = self._filesystem.mds_asok(["scrub_path", "/", "recursive", "repair"])
+ self.assertNotEqual(out_json, None)
+ self.assertTrue(self._filesystem.are_daemons_healthy())
return self._errors
# Apply any data damage the workload wants
workload.damage()
- self.fs.mds_asok(["scrub_path", "/", "recursive", "repair"])
+ out_json = self.fs.mds_asok(["scrub_path", "/", "recursive", "repair"])
+ self.assertNotEqual(out_json, None)
# See that the files are present and correct
errors = workload.validate()
- \(REQUEST_SLOW\)
- \(TOO_FEW_PGS\)
- \(MON_DOWN\)
+ - slow requests
self.proc = self.controller.run([os.path.join(BIN_PREFIX, "./ceph-{0}".format(self.daemon_type)), "-i", self.daemon_id])
+ def signal(self, sig, silent=False):
+ if not self.running():
+ raise RuntimeError("Can't send signal to non-running daemon")
+
+ os.kill(self._get_pid(), sig)
+ if not silent:
+ log.info("Sent signal {0} to {1}.{2}".format(sig, self.daemon_type, self.daemon_id))
+
def safe_kill(pid):
"""
# We only aim at testing the units are parsed accordingly
# and don't intend to test whether the options being set
# actually expect SI units to be passed.
- # Keep in mind that all integer based options (i.e., INT,
- # LONG, U32, U64) will accept SI unit modifiers.
+ # Keep in mind that all integer based options that are not based on bytes
+ # (i.e., INT, LONG, U32, U64) will accept SI unit modifiers and be parsed to
+ # base 10.
initial_value=$(get_config_value_or_die "mon.a" "mon_pg_warn_min_objects")
$SUDO ceph daemon mon.a config set mon_pg_warn_min_objects 10
expect_config_value "mon.a" "mon_pg_warn_min_objects" 10
$SUDO ceph daemon mon.a config set mon_pg_warn_min_objects 10K
- expect_config_value "mon.a" "mon_pg_warn_min_objects" 10240
+ expect_config_value "mon.a" "mon_pg_warn_min_objects" 10000
$SUDO ceph daemon mon.a config set mon_pg_warn_min_objects 1G
- expect_config_value "mon.a" "mon_pg_warn_min_objects" 1073741824
+ expect_config_value "mon.a" "mon_pg_warn_min_objects" 1000000000
$SUDO ceph daemon mon.a config set mon_pg_warn_min_objects 10F > $TMPFILE || true
check_response "'10F': (22) Invalid argument"
# now test with injectargs
ceph tell mon.a injectargs '--mon_pg_warn_min_objects 10'
expect_config_value "mon.a" "mon_pg_warn_min_objects" 10
ceph tell mon.a injectargs '--mon_pg_warn_min_objects 10K'
- expect_config_value "mon.a" "mon_pg_warn_min_objects" 10240
+ expect_config_value "mon.a" "mon_pg_warn_min_objects" 10000
ceph tell mon.a injectargs '--mon_pg_warn_min_objects 1G'
- expect_config_value "mon.a" "mon_pg_warn_min_objects" 1073741824
+ expect_config_value "mon.a" "mon_pg_warn_min_objects" 1000000000
expect_false ceph tell mon.a injectargs '--mon_pg_warn_min_objects 10F'
expect_false ceph tell mon.a injectargs '--mon_globalid_prealloc -1'
$SUDO ceph daemon mon.a config set mon_pg_warn_min_objects $initial_value
}
+function test_mon_injectargs_IEC()
+{
+ # Test IEC units during injectargs and 'config set'
+ # We only aim at testing the units are parsed accordingly
+ # and don't intend to test whether the options being set
+ # actually expect IEC units to be passed.
+ # Keep in mind that all integer based options that are based on bytes
+ # (i.e., INT, LONG, U32, U64) will accept IEC unit modifiers, as well as SI
+ # unit modifiers (for backwards compatibility and convinience) and be parsed
+ # to base 2.
+ initial_value=$(get_config_value_or_die "mon.a" "mon_data_size_warn")
+ $SUDO ceph daemon mon.a config set mon_data_size_warn 15000000000
+ expect_config_value "mon.a" "mon_data_size_warn" 15000000000
+ $SUDO ceph daemon mon.a config set mon_data_size_warn 15G
+ expect_config_value "mon.a" "mon_data_size_warn" 16106127360
+ $SUDO ceph daemon mon.a config set mon_data_size_warn 16Gi
+ expect_config_value "mon.a" "mon_data_size_warn" 17179869184
+ $SUDO ceph daemon mon.a config set mon_data_size_warn 10F > $TMPFILE || true
+ check_response "'10F': (22) Invalid argument"
+ # now test with injectargs
+ ceph tell mon.a injectargs '--mon_data_size_warn 15000000000'
+ expect_config_value "mon.a" "mon_data_size_warn" 15000000000
+ ceph tell mon.a injectargs '--mon_data_size_warn 15G'
+ expect_config_value "mon.a" "mon_data_size_warn" 16106127360
+ ceph tell mon.a injectargs '--mon_data_size_warn 16Gi'
+ expect_config_value "mon.a" "mon_data_size_warn" 17179869184
+ expect_false ceph tell mon.a injectargs '--mon_data_size_warn 10F'
+ $SUDO ceph daemon mon.a config set mon_data_size_warn $initial_value
+}
+
function test_tiering_agent()
{
local slow=slow_eviction
ceph osd pool set-quota tmp-quota-pool max_bytes 10
ceph osd pool set-quota tmp-quota-pool max_objects 10M
#
- # get quotas
- #
- ceph osd pool get-quota tmp-quota-pool | grep 'max bytes.*10B'
- ceph osd pool get-quota tmp-quota-pool | grep 'max objects.*10240k objects'
- #
# get quotas in json-pretty format
#
ceph osd pool get-quota tmp-quota-pool --format=json-pretty | \
- grep '"quota_max_objects":.*10485760'
+ grep '"quota_max_objects":.*10000000'
ceph osd pool get-quota tmp-quota-pool --format=json-pretty | \
grep '"quota_max_bytes":.*10'
#
+ # get quotas
+ #
+ ceph osd pool get-quota tmp-quota-pool | grep 'max bytes.*10B'
+ ceph osd pool get-quota tmp-quota-pool | grep 'max objects.*10M objects'
+ #
+ # set valid quotas with unit prefix
+ #
+ ceph osd pool set-quota tmp-quota-pool max_bytes 10K
+ #
+ # get quotas
+ #
+ ceph osd pool get-quota tmp-quota-pool | grep 'max bytes.*10Ki'
+ #
+ # set valid quotas with unit prefix
+ #
+ ceph osd pool set-quota tmp-quota-pool max_bytes 10Ki
+ #
+ # get quotas
+ #
+ ceph osd pool get-quota tmp-quota-pool | grep 'max bytes.*10Ki'
+ #
+ #
# reset pool quotas
#
ceph osd pool set-quota tmp-quota-pool max_bytes 0
rbd export testimg1 /tmp/img3
# info
- rbd info testimg1 | grep 'size 128 MB'
- rbd info --snap=snap1 testimg1 | grep 'size 256 MB'
+ rbd info testimg1 | grep 'size 128MiB'
+ rbd info --snap=snap1 testimg1 | grep 'size 256MiB'
# export-diff
rm -rf /tmp/diff-testimg1-1 /tmp/diff-testimg1-2
rbd import-diff --sparse-size 8K /tmp/diff-testimg1-2 testimg-diff1
# info
- rbd info testimg1 | grep 'size 128 MB'
- rbd info --snap=snap1 testimg1 | grep 'size 256 MB'
- rbd info testimg-diff1 | grep 'size 128 MB'
- rbd info --snap=snap1 testimg-diff1 | grep 'size 256 MB'
+ rbd info testimg1 | grep 'size 128MiB'
+ rbd info --snap=snap1 testimg1 | grep 'size 256MiB'
+ rbd info testimg-diff1 | grep 'size 128MiB'
+ rbd info --snap=snap1 testimg-diff1 | grep 'size 256MiB'
# make copies
rbd copy testimg1 --snap=snap1 testimg2
rbd copy testimg-diff1 --sparse-size 768K testimg-diff3
# verify the result
- rbd info testimg2 | grep 'size 256 MB'
- rbd info testimg3 | grep 'size 128 MB'
- rbd info testimg-diff2 | grep 'size 256 MB'
- rbd info testimg-diff3 | grep 'size 128 MB'
+ rbd info testimg2 | grep 'size 256MiB'
+ rbd info testimg3 | grep 'size 128MiB'
+ rbd info testimg-diff2 | grep 'size 256MiB'
+ rbd info testimg-diff3 | grep 'size 128MiB'
rbd export testimg1 /tmp/img1.new
rbd export testimg2 /tmp/img2.new
# rollback
rbd snap rollback --snap=snap1 testimg1
rbd snap rollback --snap=snap1 testimg-diff1
- rbd info testimg1 | grep 'size 256 MB'
- rbd info testimg-diff1 | grep 'size 256 MB'
+ rbd info testimg1 | grep 'size 256MiB'
+ rbd info testimg-diff1 | grep 'size 256MiB'
rbd export testimg1 /tmp/img1.snap1
rbd export testimg-diff1 /tmp/img-diff1.snap1
cmp /tmp/img2 /tmp/img1.snap1
rbd ls | grep test2
rbd ls | wc -l | grep 2
# look for fields in output of ls -l without worrying about space
- rbd ls -l | grep 'test1.*1024k.*1'
- rbd ls -l | grep 'test2.*1024k.*1'
+ rbd ls -l | grep 'test1.*1MiB.*1'
+ rbd ls -l | grep 'test2.*1MiB.*1'
rbd rm test1
rbd rm test2
rbd ls | grep test1
rbd ls | grep test2
rbd ls | wc -l | grep 2
- rbd ls -l | grep 'test1.*1024k.*2'
- rbd ls -l | grep 'test2.*1024k.*2'
+ rbd ls -l | grep 'test1.*1MiB.*2'
+ rbd ls -l | grep 'test2.*1MiB.*2'
rbd rm test1
rbd rm test2
rbd ls | grep test1
rbd ls | grep test2
rbd ls | wc -l | grep 2
- rbd ls -l | grep 'test1.*1024k.*2'
- rbd ls -l | grep 'test2.*1024k.*1'
+ rbd ls -l | grep 'test1.*1MiB.*2'
+ rbd ls -l | grep 'test2.*1MiB.*1'
remove_images
# test that many images can be shown by ls
wget http://download.ceph.com/qa/blogbench-1.0.tar.bz2
#cp /home/gregf/src/blogbench-1.0.tar.bz2 .
tar -xvf blogbench-1.0.tar.bz2
-cd blogbench*
+cd blogbench-1.0/
echo "making blogbench"
./configure
make
--- /dev/null
+diff -urp 1/parser.c 2/parser.c
+--- 1/parser.c 2008-10-28 04:17:05.000000000 +0800
++++ 2/parser.c 2018-06-26 20:25:59.000000000 +0800
+@@ -203,7 +203,7 @@ static char *get_optstr(char *buf, char
+ len = strnlen(string, BUFSIZE);
+ sprintf(search_str, "%s=%%%ds\\n", string, BUFSIZE - len-1);
+ if (1 == sscanf(line, search_str, &temp)) {
+- len = strnlen(temp, 4096);
++ len = strnlen(temp, 4095) + 1;
+ ret_buf = malloc(len);
+ strncpy(ret_buf, temp, len);
+ return ret_buf;
wget http://download.ceph.com/qa/ffsb.tar.bz2
tar jxvf ffsb.tar.bz2
-cd ffsb-*
+cd ffsb-6.0-rc2
+patch -p1 < $mydir/ffsb.patch
./configure
make
cd ..
echo "getting iogen"
wget http://download.ceph.com/qa/iogen_3.1p0.tar
tar -xvzf iogen_3.1p0.tar
-cd iogen*
+cd iogen_3.1p0
echo "making iogen"
make
echo "running iogen"
wget http://download.ceph.com/qa/pjd-fstest-20090130-RC-aclfixes.tgz
tar zxvf pjd*.tgz
-cd pjd*
+cd pjd-fstest-20090130-RC
make clean
make
cd ..
-3ec878d1e53e1aeb47a9f619c49d9e7c0aa384d5
-v12.2.7
+ae699615bac534ea496ee965ac6192cb7e0e07c0
+v12.2.8
endif()
add_subdirectory(script)
-
-if(WITH_EMBEDDED)
- add_subdirectory(libcephd)
-endif()
from collections import namedtuple
+sys_info = namedtuple('sys_info', ['devices'])
+sys_info.devices = dict()
+
+
class UnloadedConfig(object):
"""
This class is used as the default value for conf.ceph so that if
conf.ceph = UnloadedConfig()
__version__ = "1.0.0"
+
+__release__ = "luminous"
"""
import logging
import os
-from ceph_volume import process
-from ceph_volume.exceptions import MultipleLVsError, MultipleVGsError, MultiplePVsError
+import uuid
+from math import floor
+from ceph_volume import process, util
+from ceph_volume.exceptions import (
+ MultipleLVsError, MultipleVGsError,
+ MultiplePVsError, SizeAllocationError
+)
logger = logging.getLogger(__name__)
return report
+def _splitname_parser(line):
+ """
+ Parses the output from ``dmsetup splitname``, that should contain prefixes
+ (--nameprefixes) and set the separator to ";"
+
+ Output for /dev/mapper/vg-lv will usually look like::
+
+ DM_VG_NAME='/dev/mapper/vg';DM_LV_NAME='lv';DM_LV_LAYER=''
+
+
+ The ``VG_NAME`` will usually not be what other callers need (e.g. just 'vg'
+ in the example), so this utility will split ``/dev/mapper/`` out, so that
+ the actual volume group name is kept
+
+ :returns: dictionary with stripped prefixes
+ """
+ parts = line[0].split(';')
+ parsed = {}
+ for part in parts:
+ part = part.replace("'", '')
+ key, value = part.split('=')
+ if 'DM_VG_NAME' in key:
+ value = value.split('/dev/mapper/')[-1]
+ key = key.split('DM_')[-1]
+ parsed[key] = value
+
+ return parsed
+
+
+def sizing(device_size, parts=None, size=None):
+ """
+ Calculate proper sizing to fully utilize the volume group in the most
+ efficient way possible. To prevent situations where LVM might accept
+ a percentage that is beyond the vg's capabilities, it will refuse with
+ an error when requesting a larger-than-possible parameter, in addition
+ to rounding down calculations.
+
+ A dictionary with different sizing parameters is returned, to make it
+ easier for others to choose what they need in order to create logical
+ volumes::
+
+ >>> sizing(100, parts=2)
+ >>> {'parts': 2, 'percentages': 50, 'sizes': 50}
+
+ """
+ if parts is not None and size is not None:
+ raise ValueError(
+ "Cannot process sizing with both parts (%s) and size (%s)" % (parts, size)
+ )
+
+ if size and size > device_size:
+ raise SizeAllocationError(size, device_size)
+
+ def get_percentage(parts):
+ return int(floor(100 / float(parts)))
+
+ if parts is not None:
+ # Prevent parts being 0, falling back to 1 (100% usage)
+ parts = parts or 1
+ percentages = get_percentage(parts)
+
+ if size:
+ parts = int(device_size / size) or 1
+ percentages = get_percentage(parts)
+
+ sizes = device_size / parts if parts else int(floor(device_size))
+
+ return {
+ 'parts': parts,
+ 'percentages': percentages,
+ 'sizes': int(sizes),
+ }
+
+
def parse_tags(lv_tags):
"""
Return a dictionary mapping of all the tags associated with
return '0'
+def dmsetup_splitname(dev):
+ """
+ Run ``dmsetup splitname`` and parse the results.
+
+ .. warning:: This call does not ensure that the device is correct or that
+ it exists. ``dmsetup`` will happily take a non existing path and still
+ return a 0 exit status.
+ """
+ command = [
+ 'dmsetup', 'splitname', '--noheadings',
+ "--separator=';'", '--nameprefixes', dev
+ ]
+ out, err, rc = process.call(command)
+ return _splitname_parser(out)
+
+
+def is_lv(dev, lvs=None):
+ """
+ Boolean to detect if a device is an LV or not.
+ """
+ splitname = dmsetup_splitname(dev)
+ # Allowing to optionally pass `lvs` can help reduce repetitive checks for
+ # multiple devices at once.
+ lvs = lvs if lvs is not None else Volumes()
+ if splitname.get('LV_NAME'):
+ lvs.filter(lv_name=splitname['LV_NAME'], vg_name=splitname['VG_NAME'])
+ return len(lvs) > 0
+ return False
+
+
def get_api_vgs():
"""
Return the list of group volumes available in the system using flags to
Command and sample delimited output should look like::
- $ vgs --noheadings --readonly --separator=';' \
+ $ vgs --noheadings --units=g --readonly --separator=';' \
-o vg_name,pv_count,lv_count,snap_count,vg_attr,vg_size,vg_free
ubuntubox-vg;1;2;0;wz--n-;299.52g;12.00m
osd_vg;3;1;0;wz--n-;29.21g;9.21g
+ To normalize sizing, the units are forced in 'g' which is equivalent to
+ gigabytes, which uses multiples of 1024 (as opposed to 1000)
"""
- fields = 'vg_name,pv_count,lv_count,snap_count,vg_attr,vg_size,vg_free'
+ fields = 'vg_name,pv_count,lv_count,snap_count,vg_attr,vg_size,vg_free,vg_free_count'
stdout, stderr, returncode = process.call(
- ['vgs', '--noheadings', '--readonly', '--separator=";"', '-o', fields]
+ ['vgs', '--noheadings', '--readonly', '--units=g', '--separator=";"', '-o', fields]
)
return _output_parser(stdout, fields)
;/dev/ubuntubox-vg/swap_1;swap_1;ubuntubox-vg
"""
- fields = 'lv_tags,lv_path,lv_name,vg_name,lv_uuid'
+ fields = 'lv_tags,lv_path,lv_name,vg_name,lv_uuid,lv_size'
stdout, stderr, returncode = process.call(
['lvs', '--noheadings', '--readonly', '--separator=";"', '-o', fields]
)
])
-def create_vg(name, *devices):
+def create_vg(devices, name=None, name_prefix=None):
"""
Create a Volume Group. Command looks like::
vgcreate --force --yes group_name device
Once created the volume group is returned as a ``VolumeGroup`` object
+
+ :param devices: A list of devices to create a VG. Optionally, a single
+ device (as a string) can be used.
+ :param name: Optionally set the name of the VG, defaults to 'ceph-{uuid}'
+ :param name_prefix: Optionally prefix the name of the VG, which will get combined
+ with a UUID string
"""
+ if isinstance(devices, set):
+ devices = list(devices)
+ if not isinstance(devices, list):
+ devices = [devices]
+ if name_prefix:
+ name = "%s-%s" % (name_prefix, str(uuid.uuid4()))
+ elif name is None:
+ name = "ceph-%s" % str(uuid.uuid4())
process.run([
'vgcreate',
'--force',
'--yes',
- name] + list(devices)
+ name] + devices
)
vg = get_vg(vg_name=name)
return vg
+def extend_vg(vg, devices):
+ """
+ Extend a Volume Group. Command looks like::
+
+ vgextend --force --yes group_name [device, ...]
+
+ Once created the volume group is extended and returned as a ``VolumeGroup`` object
+
+ :param vg: A VolumeGroup object
+ :param devices: A list of devices to extend the VG. Optionally, a single
+ device (as a string) can be used.
+ """
+ if not isinstance(devices, list):
+ devices = [devices]
+ process.run([
+ 'vgextend',
+ '--force',
+ '--yes',
+ vg.name] + devices
+ )
+
+ vg = get_vg(vg_name=vg.name)
+ return vg
+
+
def remove_vg(vg_name):
"""
Removes a volume group.
return True
-def create_lv(name, group, size=None, tags=None):
+def create_lv(name, group, extents=None, size=None, tags=None, uuid_name=False):
"""
Create a Logical Volume in a Volume Group. Command looks like::
conform to the convention of prefixing them with "ceph." like::
{"ceph.block_device": "/dev/ceph/osd-1"}
+
+ :param uuid_name: Optionally combine the ``name`` with UUID to ensure uniqueness
"""
+ if uuid_name:
+ name = '%s-%s' % (name, uuid.uuid4())
+ if tags is None:
+ tags = {
+ "ceph.osd_id": "null",
+ "ceph.type": "null",
+ "ceph.cluster_fsid": "null",
+ "ceph.osd_fsid": "null",
+ }
+
# XXX add CEPH_VOLUME_LVM_DEBUG to enable -vvvv on lv operations
type_path_tag = {
'journal': 'ceph.journal_device',
'%s' % size,
'-n', name, group
])
+ elif extents:
+ process.run([
+ 'lvcreate',
+ '--yes',
+ '-l',
+ '%s' % extents,
+ '-n', name, group
+ ])
# create the lv with all the space available, this is needed because the
# system call is different for LVM
else:
return lv
+def create_lvs(volume_group, parts=None, size=None, name_prefix='ceph-lv'):
+ """
+ Create multiple Logical Volumes from a Volume Group by calculating the
+ proper extents from ``parts`` or ``size``. A custom prefix can be used
+ (defaults to ``ceph-lv``), these names are always suffixed with a uuid.
+
+ LV creation in ceph-volume will require tags, this is expected to be
+ pre-computed by callers who know Ceph metadata like OSD IDs and FSIDs. It
+ will probably not be the case when mass-creating LVs, so common/default
+ tags will be set to ``"null"``.
+
+ .. note:: LVs that are not in use can be detected by querying LVM for tags that are
+ set to ``"null"``.
+
+ :param volume_group: The volume group (vg) to use for LV creation
+ :type group: ``VolumeGroup()`` object
+ :param parts: Number of LVs to create *instead of* ``size``.
+ :type parts: int
+ :param size: Size (in gigabytes) of LVs to create, e.g. "as many 10gb LVs as possible"
+ :type size: int
+ :param extents: The number of LVM extents to use to create the LV. Useful if looking to have
+ accurate LV sizes (LVM rounds sizes otherwise)
+ """
+ if parts is None and size is None:
+ # fallback to just one part (using 100% of the vg)
+ parts = 1
+ lvs = []
+ tags = {
+ "ceph.osd_id": "null",
+ "ceph.type": "null",
+ "ceph.cluster_fsid": "null",
+ "ceph.osd_fsid": "null",
+ }
+ sizing = volume_group.sizing(parts=parts, size=size)
+ for part in range(0, sizing['parts']):
+ size = sizing['sizes']
+ extents = sizing['extents']
+ lv_name = '%s-%s' % (name_prefix, uuid.uuid4())
+ lvs.append(
+ create_lv(lv_name, volume_group.name, extents=extents, tags=tags)
+ )
+ return lvs
+
+
def get_vg(vg_name=None, vg_tags=None):
"""
Return a matching vg for the current system, requires ``vg_name`` or
)
if not pvs:
return None
- if len(pvs) > 1:
+ if len(pvs) > 1 and pv_tags:
raise MultiplePVsError(pv_name)
return pvs[0]
def __repr__(self):
return self.__str__()
+ def _parse_size(self, size):
+ error_msg = "Unable to convert vg size to integer: '%s'" % str(size)
+ try:
+ integer, _ = size.split('g')
+ except ValueError:
+ logger.exception(error_msg)
+ raise RuntimeError(error_msg)
+
+ return util.str_to_int(integer)
+
+ @property
+ def free(self):
+ """
+ Parse the available size in gigabytes from the ``vg_free`` attribute, that
+ will be a string with a character ('g') to indicate gigabytes in size.
+ Returns a rounded down integer to ease internal operations::
+
+ >>> data_vg.vg_free
+ '0.01g'
+        >>> data_vg.free
+ 0
+ """
+ return self._parse_size(self.vg_free)
+
+ @property
+ def size(self):
+ """
+ Parse the size in gigabytes from the ``vg_size`` attribute, that
+ will be a string with a character ('g') to indicate gigabytes in size.
+ Returns a rounded down integer to ease internal operations::
+
+ >>> data_vg.vg_size
+ '1024.9g'
+ >>> data_vg.size
+ 1024
+ """
+ return self._parse_size(self.vg_size)
+
+ def sizing(self, parts=None, size=None):
+ """
+ Calculate proper sizing to fully utilize the volume group in the most
+ efficient way possible. To prevent situations where LVM might accept
+ a percentage that is beyond the vg's capabilities, it will refuse with
+ an error when requesting a larger-than-possible parameter, in addition
+ to rounding down calculations.
+
+ A dictionary with different sizing parameters is returned, to make it
+ easier for others to choose what they need in order to create logical
+ volumes::
+
+ >>> data_vg.free
+ 1024
+ >>> data_vg.sizing(parts=4)
+ {'parts': 4, 'sizes': 256, 'percentages': 25}
+ >>> data_vg.sizing(size=512)
+ {'parts': 2, 'sizes': 512, 'percentages': 50}
+
+
+ :param parts: Number of parts to create LVs from
+ :param size: Size in gigabytes to divide the VG into
+
+ :raises SizeAllocationError: When requested size cannot be allocated with
+ :raises ValueError: If both ``parts`` and ``size`` are given
+ """
+ if parts is not None and size is not None:
+ raise ValueError(
+ "Cannot process sizing with both parts (%s) and size (%s)" % (parts, size)
+ )
+
+ # if size is given we need to map that to extents so that we avoid
+ # issues when trying to get this right with a size in gigabytes find
+ # the percentage first, cheating, because these values are thrown out
+ vg_free_count = util.str_to_int(self.vg_free_count)
+
+ if size:
+ extents = int(size * vg_free_count / self.free)
+ disk_sizing = sizing(self.free, size=size, parts=parts)
+ else:
+ if parts is not None:
+ # Prevent parts being 0, falling back to 1 (100% usage)
+ parts = parts or 1
+ size = int(self.free / parts)
+ extents = size * vg_free_count / self.free
+ disk_sizing = sizing(self.free, parts=parts)
+
+ extent_sizing = sizing(vg_free_count, size=extents)
+
+ disk_sizing['extents'] = int(extents)
+ disk_sizing['percentages'] = extent_sizing['percentages']
+ return disk_sizing
+
class Volume(object):
"""
import logging
import os
from textwrap import dedent
-from ceph_volume import process, conf, decorators, terminal
+from ceph_volume import process, conf, decorators, terminal, __release__
from ceph_volume.util import system, disk
from ceph_volume.util import prepare as prepare_utils
from ceph_volume.util import encryption as encryption_utils
# enable the ceph-volume unit for this OSD
systemctl.enable_volume(osd_id, osd_fsid, 'lvm')
+ # enable the OSD
+ systemctl.enable_osd(osd_id)
+
# start the OSD
systemctl.start_osd(osd_id)
terminal.success("ceph-volume lvm activate successful for osd ID: %s" % osd_id)
wal_device_path = get_osd_device_path(osd_lv, lvs, 'wal', dmcrypt_secret=dmcrypt_secret)
# Once symlinks are removed, the osd dir can be 'primed again.
- process.run([
+ prime_command = [
'ceph-bluestore-tool', '--cluster=%s' % conf.cluster,
'prime-osd-dir', '--dev', osd_lv_path,
- '--path', osd_path])
+ '--path', osd_path]
+
+ if __release__ != "luminous":
+ # mon-config changes are not available in Luminous
+ prime_command.append('--no-mon-config')
+
+ process.run(prime_command)
# always re-do the symlink regardless if it exists, so that the block,
# block.wal, and block.db devices that may have changed can be mapped
# correctly every time
# enable the ceph-volume unit for this OSD
systemctl.enable_volume(osd_id, osd_fsid, 'lvm')
+ # enable the OSD
+ systemctl.enable_osd(osd_id)
+
# start the OSD
systemctl.start_osd(osd_id)
terminal.success("ceph-volume lvm activate successful for osd ID: %s" % osd_id)
--- /dev/null
+import argparse
+from textwrap import dedent
+from ceph_volume import terminal, decorators
+from ceph_volume.util import disk, prompt_bool
+from ceph_volume.util import arg_validators
+from . import strategies
+
+
+device_list_template = """
+ * {path: <25} {size: <10} {state}"""
+
+
+def device_formatter(devices):
+ lines = []
+ for path, details in devices:
+ lines.append(device_list_template.format(
+ path=path, size=details['human_readable_size'],
+ state='solid' if details['rotational'] == '0' else 'rotational')
+ )
+
+ return ''.join(lines)
+
+
+# Scenario filtering/detection
+def bluestore_single_type(device_facts):
+ """
+ Detect devices that are just HDDs or solid state so that a 1:1
+ device-to-osd provisioning can be done
+ """
+ types = [device.sys_api['rotational'] for device in device_facts]
+ if len(set(types)) == 1:
+ return strategies.bluestore.SingleType
+
+
+def bluestore_mixed_type(device_facts):
+ """
+ Detect if devices are HDDs as well as solid state so that block.db can be
+ placed in solid devices while data is kept in the spinning drives.
+ """
+ types = [device.sys_api['rotational'] for device in device_facts]
+ if len(set(types)) > 1:
+ return strategies.bluestore.MixedType
+
+
+def filestore_single_type(device_facts):
+ """
+ Detect devices that are just HDDs or solid state so that a 1:1
+ device-to-osd provisioning can be done, keeping the journal on the OSD
+ """
+ types = [device.sys_api['rotational'] for device in device_facts]
+ if len(set(types)) == 1:
+ return strategies.filestore.SingleType
+
+
+def filestore_mixed_type(device_facts):
+ """
+ Detect if devices are HDDs as well as solid state so that the journal can be
+ placed in solid devices while data is kept in the spinning drives.
+ """
+ types = [device.sys_api['rotational'] for device in device_facts]
+ if len(set(types)) > 1:
+ return strategies.filestore.MixedType
+
+
+def get_strategy(args):
+ """
+ Given a set of devices as input, go through the different detection
+ mechanisms to narrow down on a strategy to use. The strategies are 4 in
+ total:
+
+ * Single device type on Bluestore
+ * Mixed device types on Bluestore
+ * Single device type on Filestore
+ * Mixed device types on Filestore
+
+ When the function matches to a scenario it returns the strategy class. This
+ allows for dynamic loading of the conditions needed for each scenario, with
+ normalized classes
+ """
+ bluestore_strategies = [bluestore_mixed_type, bluestore_single_type]
+ filestore_strategies = [filestore_mixed_type, filestore_single_type]
+ if args.bluestore:
+ strategies = bluestore_strategies
+ else:
+ strategies = filestore_strategies
+
+ for strategy in strategies:
+ backend = strategy(args.devices)
+ if backend:
+ return backend(args.devices, args)
+
+
+class Batch(object):
+
+ help = 'Automatically size devices for multi-OSD provisioning with minimal interaction'
+
+ _help = dedent("""
+ Automatically size devices ready for OSD provisioning based on default strategies.
+
+ Detected devices:
+ {detected_devices}
+
+ Usage:
+
+ ceph-volume lvm batch [DEVICE...]
+
+ Optional reporting on possible outcomes is enabled with --report
+
+ ceph-volume lvm batch --report [DEVICE...]
+ """)
+
+ def __init__(self, argv):
+ self.argv = argv
+
+ def get_devices(self):
+ all_devices = disk.get_devices()
+ # remove devices with partitions
+ # XXX Should be optional when getting device info
+ for device, detail in all_devices.items():
+ if detail.get('partitions') != {}:
+ del all_devices[device]
+ devices = sorted(all_devices.items(), key=lambda x: (x[0], x[1]['size']))
+ return device_formatter(devices)
+
+ def print_help(self):
+ return self._help.format(
+ detected_devices=self.get_devices(),
+ )
+
+ def report(self, args):
+ strategy = get_strategy(args)
+ if args.format == 'pretty':
+ strategy.report_pretty()
+ elif args.format == 'json':
+ strategy.report_json()
+ else:
+ raise RuntimeError('report format must be "pretty" or "json"')
+
+ def execute(self, args):
+ strategy = get_strategy(args)
+ if not args.yes:
+ strategy.report_pretty()
+ terminal.info('The above OSDs would be created if the operation continues')
+ if not prompt_bool('do you want to proceed? (yes/no)'):
+ terminal.error('aborting OSD provisioning for %s' % ','.join(args.devices))
+ raise SystemExit(0)
+
+ strategy.execute()
+
+ @decorators.needs_root
+ def main(self):
+ parser = argparse.ArgumentParser(
+ prog='ceph-volume lvm batch',
+ formatter_class=argparse.RawDescriptionHelpFormatter,
+ description=self.print_help(),
+ )
+
+ parser.add_argument(
+ 'devices',
+ metavar='DEVICES',
+ nargs='*',
+ type=arg_validators.ValidDevice(),
+ default=[],
+ help='Devices to provision OSDs',
+ )
+ parser.add_argument(
+ '--bluestore',
+ action='store_true',
+ help='bluestore objectstore (default)',
+ )
+ parser.add_argument(
+ '--filestore',
+ action='store_true',
+ help='filestore objectstore',
+ )
+ parser.add_argument(
+ '--report',
+ action='store_true',
+        help='Only report on OSDs that would be created, without creating them',
+ )
+ parser.add_argument(
+ '--yes',
+ action='store_true',
+ help='Avoid prompting for confirmation when provisioning',
+ )
+ parser.add_argument(
+ '--format',
+ help='output format, defaults to "pretty"',
+ default='pretty',
+ choices=['json', 'pretty'],
+ )
+ parser.add_argument(
+ '--dmcrypt',
+ action='store_true',
+ help='Enable device encryption via dm-crypt',
+ )
+ parser.add_argument(
+ '--crush-device-class',
+ dest='crush_device_class',
+ help='Crush device class to assign this OSD to',
+ )
+ parser.add_argument(
+ '--no-systemd',
+ dest='no_systemd',
+ action='store_true',
+ help='Skip creating and enabling systemd units and starting OSD services',
+ )
+ args = parser.parse_args(self.argv)
+
+ if not args.devices:
+ return parser.print_help()
+
+ # Default to bluestore here since defaulting it in add_argument may
+ # cause both to be True
+ if not args.bluestore and not args.filestore:
+ args.bluestore = True
+
+ if args.report:
+ self.report(args)
+ else:
+ self.execute(args)
value=value
)
)
- output.append(
- device_metadata_item_template.format(tag_name='devices', value=','.join(device['devices'])))
+ if not device.get('devices'):
+ continue
+ else:
+ output.append(
+ device_metadata_item_template.format(
+ tag_name='devices',
+ value=','.join(device['devices'])
+ )
+ )
print(''.join(output))
from . import trigger
from . import listing
from . import zap
+from . import batch
class LVM(object):
mapper = {
'activate': activate.Activate,
+ 'batch': batch.Batch,
'prepare': prepare.Prepare,
'create': create.Create,
'trigger': trigger.Trigger,
from __future__ import print_function
import json
import logging
-import uuid
from textwrap import dedent
from ceph_volume.util import prepare as prepare_utils
from ceph_volume.util import encryption as encryption_utils
# get the latest monmap
prepare_utils.get_monmap(osd_id)
# prepare the osd filesystem
- prepare_utils.osd_mkfs_filestore(osd_id, fsid)
+ prepare_utils.osd_mkfs_filestore(osd_id, fsid, cephx_secret)
# write the OSD keyring if it doesn't exist already
prepare_utils.write_keyring(osd_id, cephx_secret)
if secrets.get('dmcrypt_key'):
"""
if disk.is_partition(arg) or disk.is_device(arg):
# we must create a vg, and then a single lv
- vg_name = "ceph-%s" % str(uuid.uuid4())
- api.create_vg(vg_name, arg)
+ vg = api.create_vg(arg)
lv_name = "osd-%s-%s" % (device_type, osd_fsid)
return api.create_lv(
lv_name,
- vg_name, # the volume group
+ vg.name, # the volume group
tags={'ceph.type': device_type})
else:
error = [
--- /dev/null
+from . import bluestore, filestore # noqa
--- /dev/null
+from __future__ import print_function
+import json
+from uuid import uuid4
+from ceph_volume.util import disk
+from ceph_volume.api import lvm
+from . import validators
+from ceph_volume.devices.lvm.create import Create
+from ceph_volume.util import templates
+
+
+class SingleType(object):
+ """
+ Support for all SSDs, or all HDDS
+ """
+
+ def __init__(self, devices, args):
+ self.args = args
+ self.devices = devices
+ self.hdds = [device for device in devices if device.sys_api['rotational'] == '1']
+ self.ssds = [device for device in devices if device.sys_api['rotational'] == '0']
+ self.computed = {'osds': [], 'vgs': []}
+ self.validate()
+ self.compute()
+
+ def report_json(self):
+ print(json.dumps(self.computed, indent=4, sort_keys=True))
+
+ def report_pretty(self):
+ string = ""
+ string += templates.total_osds.format(
+ total_osds=len(self.hdds) or len(self.ssds) * 2
+ )
+ string += templates.osd_component_titles
+
+ for osd in self.computed['osds']:
+ string += templates.osd_header
+ string += templates.osd_component.format(
+ _type='[data]',
+ path=osd['data']['path'],
+ size=osd['data']['human_readable_size'],
+ percent=osd['data']['percentage'],
+ )
+
+ print(string)
+
+ def validate(self):
+ """
+        Ensure that the minimum requirements for this type of scenario are
+ met, raise an error if the provided devices would not work
+ """
+ # validate minimum size for all devices
+ validators.minimum_device_size(self.devices)
+
+ def compute(self):
+ """
+ Go through the rules needed to properly size the lvs, return
+ a dictionary with the result
+ """
+ osds = self.computed['osds']
+ vgs = self.computed['vgs']
+ for device in self.hdds:
+ vgs.append({'devices': [device.abspath], 'parts': 1})
+ osd = {'data': {}, 'block.db': {}}
+ osd['data']['path'] = device.abspath
+ osd['data']['size'] = device.sys_api['size']
+ osd['data']['parts'] = 1
+ osd['data']['percentage'] = 100
+ osd['data']['human_readable_size'] = str(disk.Size(b=device.sys_api['size']))
+ osds.append(osd)
+
+ for device in self.ssds:
+ # TODO: creates 2 OSDs per device, make this configurable (env var?)
+ extents = lvm.sizing(device.sys_api['size'], parts=2)
+ vgs.append({'devices': [device.abspath], 'parts': 2})
+ for ssd in range(2):
+ osd = {'data': {}, 'block.db': {}}
+ osd['data']['path'] = device.abspath
+ osd['data']['size'] = extents['sizes']
+ osd['data']['parts'] = extents['parts']
+ osd['data']['percentage'] = 50
+ osd['data']['human_readable_size'] = str(disk.Size(b=extents['sizes']))
+ osds.append(osd)
+
+ def execute(self):
+ """
+ Create vgs/lvs from the incoming set of devices, assign their roles
+ (block, block.db, block.wal, etc..) and offload the OSD creation to
+ ``lvm create``
+ """
+ osd_vgs = dict([(osd['data']['path'], None) for osd in self.computed['osds']])
+
+ # create the vgs first, mapping them to the device path
+ for osd in self.computed['osds']:
+ vg = osd_vgs.get(osd['data']['path'])
+ if not vg:
+ vg = lvm.create_vg(osd['data']['path'])
+ osd_vgs[osd['data']['path']] = {'vg': vg, 'parts': osd['data']['parts']}
+
+ # create the lvs from the vgs captured in the beginning
+ for create in osd_vgs.values():
+ lvs = lvm.create_lvs(create['vg'], parts=create['parts'], name_prefix='osd-data')
+ vg_name = create['vg'].name
+ for lv in lvs:
+ command = ['--bluestore', '--data']
+ command.append('%s/%s' % (vg_name, lv.name))
+ if self.args.dmcrypt:
+ command.append('--dmcrypt')
+ if self.args.no_systemd:
+ command.append('--no-systemd')
+ if self.args.crush_device_class:
+ command.extend(['--crush-device-class', self.args.crush_device_class])
+
+ Create(command).main()
+
+
+class MixedType(object):
+
+ def __init__(self, devices, args):
+ self.args = args
+ self.devices = devices
+ self.hdds = [device for device in devices if device.sys_api['rotational'] == '1']
+ self.ssds = [device for device in devices if device.sys_api['rotational'] == '0']
+ self.computed = {'osds': [], 'vgs': []}
+ self.block_db_size = None
+ # For every HDD we get 1 block.db
+ self.db_lvs = len(self.hdds)
+ self.validate()
+ self.compute()
+
+ def report_json(self):
+ print(json.dumps(self.computed, indent=4, sort_keys=True))
+
+ def report_pretty(self):
+ vg_extents = lvm.sizing(self.total_ssd_size.b, parts=self.db_lvs)
+ db_size = str(disk.Size(b=(vg_extents['sizes'])))
+
+ string = ""
+ string += templates.total_osds.format(
+ total_osds=len(self.hdds)
+ )
+
+ string += templates.ssd_volume_group.format(
+ target='block.db',
+ total_lv_size=str(self.total_ssd_size),
+ total_lvs=vg_extents['parts'],
+ block_lv_size=db_size,
+ block_db_devices=', '.join([ssd.abspath for ssd in self.ssds]),
+ lv_size=str(disk.Size(b=(vg_extents['sizes']))),
+ total_osds=len(self.hdds)
+ )
+
+ string += templates.osd_component_titles
+ for osd in self.computed['osds']:
+ string += templates.osd_header
+ string += templates.osd_component.format(
+ _type='[data]',
+ path=osd['data']['path'],
+ size=osd['data']['human_readable_size'],
+ percent=osd['data']['percentage'])
+
+ string += templates.osd_component.format(
+ _type='[block.db]',
+ path='(volume-group/lv)',
+ size=osd['block.db']['human_readable_size'],
+ percent=osd['block.db']['percentage'])
+
+ print(string)
+
+ def compute(self):
+ osds = self.computed['osds']
+ for device in self.hdds:
+ osd = {'data': {}, 'block.db': {}}
+ osd['data']['path'] = device.abspath
+ osd['data']['size'] = device.sys_api['size']
+ osd['data']['percentage'] = 100
+ osd['data']['human_readable_size'] = str(disk.Size(b=(device.sys_api['size'])))
+ osd['block.db']['path'] = None
+ osd['block.db']['size'] = int(self.block_db_size.b)
+ osd['block.db']['human_readable_size'] = str(self.block_db_size)
+ osd['block.db']['percentage'] = self.vg_extents['percentages']
+ osds.append(osd)
+
+ self.computed['vgs'] = [{
+ 'devices': [d.abspath for d in self.ssds],
+ 'parts': self.db_lvs,
+ 'percentages': self.vg_extents['percentages'],
+ 'sizes': self.vg_extents['sizes'],
+ 'size': int(self.total_ssd_size.b),
+ 'human_readable_sizes': str(disk.Size(b=self.vg_extents['sizes'])),
+ 'human_readable_size': str(self.total_ssd_size),
+ }]
+
+ def execute(self):
+ """
+ Create vgs/lvs from the incoming set of devices, assign their roles
+ (block, block.db, block.wal, etc..) and offload the OSD creation to
+ ``lvm create``
+ """
+ # create the single vg for all block.db lv's first
+ vg_info = self.computed['vgs'][0]
+ vg = lvm.create_vg(vg_info['devices'])
+
+ # now produce all the block.db lvs needed from that single vg
+ db_lvs = lvm.create_lvs(vg, parts=vg_info['parts'], name_prefix='osd-block-db')
+
+ # create the data lvs, and create the OSD with the matching block.db lvs from before
+ for osd in self.computed['osds']:
+ vg = lvm.create_vg(osd['data']['path'])
+ data_lv = lvm.create_lv('osd-data-%s' % str(uuid4()), vg.name)
+ db_lv = db_lvs.pop()
+ command = [
+ '--bluestore',
+ '--data', "%s/%s" % (data_lv.vg_name, data_lv.name),
+ '--block.db', '%s/%s' % (db_lv.vg_name, db_lv.name)
+ ]
+ if self.args.dmcrypt:
+ command.append('--dmcrypt')
+ if self.args.no_systemd:
+ command.append('--no-systemd')
+ if self.args.crush_device_class:
+ command.extend(['--crush-device-class', self.args.crush_device_class])
+
+ Create(command).main()
+
+ def validate(self):
+ """
+ HDDs represent data devices, and solid state devices are for block.db,
+ make sure that the number of data devices would have enough LVs and
+ those LVs would be large enough to accommodate a block.db
+ """
+ # validate minimum size for all devices
+ validators.minimum_device_size(self.devices)
+
+ # add all the size available in solid drives and divide it by the
+ # expected number of osds, the expected output should be larger than
+        # the minimum allowed for block.db
+ self.total_ssd_size = disk.Size(b=0)
+ for ssd in self.ssds:
+ self.total_ssd_size += disk.Size(b=ssd.sys_api['size'])
+
+ self.block_db_size = self.total_ssd_size / self.db_lvs
+ self.vg_extents = lvm.sizing(self.total_ssd_size.b, parts=self.db_lvs)
+
+ # min 2GB of block.db is allowed
+ msg = 'Total solid size (%s) is not enough for block.db LVs larger than 2 GB'
+ if self.block_db_size < disk.Size(gb=2):
+ # use ad-hoc exception here
+ raise RuntimeError(msg % self.total_ssd_size)
--- /dev/null
+from __future__ import print_function
+import json
+from ceph_volume.util import disk, prepare
+from ceph_volume.api import lvm
+from . import validators
+from ceph_volume.devices.lvm.create import Create
+from ceph_volume.util import templates
+
+
+class SingleType(object):
+ """
+ Support for all SSDs, or all HDDs, data and journal LVs will be colocated
+ in the same device
+ """
+
+ def __init__(self, devices, args):
+ self.args = args
+ self.devices = devices
+ self.hdds = [device for device in devices if device.sys_api['rotational'] == '1']
+ self.ssds = [device for device in devices if device.sys_api['rotational'] == '0']
+ self.computed = {'osds': [], 'vgs': []}
+ self.validate()
+ self.compute()
+
+ def report_json(self):
+ print(json.dumps(self.computed, indent=4, sort_keys=True))
+
+ def report_pretty(self):
+ string = ""
+ string += templates.total_osds.format(
+ total_osds=len(self.hdds) or len(self.ssds) * 2
+ )
+ string += templates.osd_component_titles
+
+ for osd in self.computed['osds']:
+ string += templates.osd_header
+ string += templates.osd_component.format(
+ _type='[data]',
+ path=osd['data']['path'],
+ size=osd['data']['human_readable_size'],
+ percent=osd['data']['percentage'],
+ )
+ string += templates.osd_component.format(
+ _type='[journal]',
+ path=osd['journal']['path'],
+ size=osd['journal']['human_readable_size'],
+ percent=osd['journal']['percentage'],
+ )
+
+ print(string)
+
+ def validate(self):
+ """
+        Ensure that the minimum requirements for this type of scenario are
+ met, raise an error if the provided devices would not work
+ """
+ # validate minimum size for all devices
+ validators.minimum_device_size(self.devices)
+
+ def compute(self):
+ """
+ Go through the rules needed to properly size the lvs, return
+ a dictionary with the result
+ """
+ # chose whichever is the one group we have to compute against
+ devices = self.hdds or self.ssds
+ osds = self.computed['osds']
+ vgs = self.computed['vgs']
+ for device in devices:
+ device_size = disk.Size(b=device.sys_api['size'])
+ journal_size = prepare.get_journal_size(lv_format=False)
+ data_size = device_size - journal_size
+ data_percentage = data_size * 100 / device_size
+ vgs.append({'devices': [device.abspath], 'parts': 2})
+ osd = {'data': {}, 'journal': {}}
+ osd['data']['path'] = device.abspath
+ osd['data']['size'] = data_size.b
+ osd['data']['percentage'] = int(data_percentage)
+ osd['data']['human_readable_size'] = str(data_size)
+ osd['journal']['path'] = device.abspath
+ osd['journal']['size'] = journal_size.b
+ osd['journal']['percentage'] = int(100 - data_percentage)
+ osd['journal']['human_readable_size'] = str(journal_size)
+ osds.append(osd)
+
+ def execute(self):
+ """
+ Create vgs/lvs from the incoming set of devices, assign their roles
+ (data, journal) and offload the OSD creation to ``lvm create``
+ """
+ osd_vgs = []
+
+ # create the vgs first, one per device (since this is colocating, it
+ # picks the 'data' path)
+ for osd in self.computed['osds']:
+ vg = lvm.create_vg(osd['data']['path'])
+ osd_vgs.append(vg)
+
+ journal_size = prepare.get_journal_size()
+
+ # create the lvs from the vgs captured in the beginning
+ for vg in osd_vgs:
+ # this is called again, getting us the LVM formatted string
+ journal_lv = lvm.create_lv(
+ 'osd-journal', vg.name, size=journal_size, uuid_name=True
+ )
+ # no extents or size means it will use 100%FREE
+ data_lv = lvm.create_lv('osd-data', vg.name)
+
+ command = ['--filestore', '--data']
+ command.append('%s/%s' % (vg.name, data_lv.name))
+ command.extend(['--journal', '%s/%s' % (vg.name, journal_lv.name)])
+ if self.args.dmcrypt:
+ command.append('--dmcrypt')
+ if self.args.no_systemd:
+ command.append('--no-systemd')
+ if self.args.crush_device_class:
+ command.extend(['--crush-device-class', self.args.crush_device_class])
+
+ Create(command).main()
+
+
+class MixedType(object):
+ """
+ Supports HDDs with SSDs, journals will be placed on SSDs, while HDDs will
+ be used fully for data.
+
+ If an existing common VG is detected on SSDs, it will be extended if blank
+ SSDs are used, otherwise it will be used directly.
+ """
+
+ def __init__(self, devices, args):
+ self.args = args
+ self.devices = devices
+ self.hdds = [device for device in devices if device.sys_api['rotational'] == '1']
+ self.ssds = [device for device in devices if device.sys_api['rotational'] == '0']
+ self.computed = {'osds': [], 'vg': None}
+ self.blank_ssds = []
+ self.journals_needed = len(self.hdds)
+ self.journal_size = prepare.get_journal_size(lv_format=False)
+ self.system_vgs = lvm.VolumeGroups()
+ self.validate()
+ self.compute()
+
+ def report_json(self):
+ print(json.dumps(self.computed, indent=4, sort_keys=True))
+
+ def report_pretty(self):
+ string = ""
+ string += templates.total_osds.format(
+ total_osds=len(self.hdds) or len(self.ssds) * 2
+ )
+
+ string += templates.ssd_volume_group.format(
+ target='journal',
+ total_lv_size=str(self.total_available_journal_space),
+ total_lvs=self.journals_needed,
+ block_db_devices=', '.join([d.path for d in self.ssds]),
+ lv_size=str(self.journal_size),
+ total_osds=self.journals_needed
+ )
+
+ string += templates.osd_component_titles
+
+ for osd in self.computed['osds']:
+ string += templates.osd_header
+ string += templates.osd_component.format(
+ _type='[data]',
+ path=osd['data']['path'],
+ size=osd['data']['human_readable_size'],
+ percent=osd['data']['percentage'],
+ )
+ string += templates.osd_component.format(
+ _type='[journal]',
+ path=osd['journal']['path'],
+ size=osd['journal']['human_readable_size'],
+ percent=osd['journal']['percentage'],
+ )
+
+ print(string)
+
+ def get_common_vg(self):
+ # find all the vgs associated with the current device
+ for ssd in self.ssds:
+ for pv in ssd.pvs_api:
+ vg = self.system_vgs.get(vg_name=pv.vg_name)
+ if not vg:
+ continue
+ # this should give us just one VG, it would've been caught by
+ # the validator otherwise
+ return vg
+
+ def validate(self):
+ """
+        Ensure that the minimum requirements for this type of scenario are
+ met, raise an error if the provided devices would not work
+ """
+ # validate minimum size for all devices
+ validators.minimum_device_size(self.devices)
+
+ # make sure that data devices do not have any LVs
+ validators.no_lvm_membership(self.hdds)
+
+ # do not allow non-common VG to continue
+ validators.has_common_vg(self.ssds)
+
+ # find the common VG to calculate how much is available
+ self.common_vg = self.get_common_vg()
+
+ # find how many journals are possible from the common VG
+ if self.common_vg:
+ common_vg_size = disk.Size(gb=self.common_vg.free)
+ else:
+ common_vg_size = disk.Size(gb=0)
+
+ # non-VG SSDs
+ self.vg_ssds = set([d for d in self.ssds if d.is_lvm_member])
+ self.blank_ssds = set(self.ssds).difference(self.vg_ssds)
+ self.total_blank_ssd_size = disk.Size(b=0)
+ for blank_ssd in self.blank_ssds:
+ self.total_blank_ssd_size += disk.Size(b=blank_ssd.sys_api['size'])
+
+ self.total_available_journal_space = self.total_blank_ssd_size + common_vg_size
+
+ try:
+ self.vg_extents = lvm.sizing(
+ self.total_available_journal_space.b, size=self.journal_size.b
+ )
+ # FIXME with real exception catching from sizing that happens when the
+ # journal space is not enough
+ except Exception:
+ self.vg_extents = {'parts': 0, 'percentages': 0, 'sizes': 0}
+
+ # validate that number of journals possible are enough for number of
+ # OSDs proposed
+ total_journals_possible = self.total_available_journal_space / self.journal_size
+ if len(self.hdds) > total_journals_possible:
+ msg = "Not enough %s journals (%s) can be created for %s OSDs" % (
+ self.journal_size, total_journals_possible, len(self.hdds)
+ )
+ raise RuntimeError(msg)
+
+ def compute(self):
+ """
+ Go through the rules needed to properly size the lvs, return
+ a dictionary with the result
+ """
+ osds = self.computed['osds']
+
+ vg_free = int(self.total_available_journal_space.gb)
+ if not self.common_vg:
+ # there isn't a common vg, so a new one must be created with all
+ # the blank SSDs
+ self.computed['vg'] = {
+ 'devices': self.blank_ssds,
+ 'parts': self.journals_needed,
+ 'percentages': self.vg_extents['percentages'],
+ 'sizes': self.journal_size.b,
+ 'size': int(self.total_blank_ssd_size.b),
+ 'human_readable_sizes': str(self.journal_size),
+ 'human_readable_size': str(self.total_available_journal_space),
+ }
+ vg_name = 'lv/vg'
+ else:
+ vg_name = self.common_vg.name
+
+ for device in self.hdds:
+ device_size = disk.Size(b=device.sys_api['size'])
+ data_size = device_size - self.journal_size
+ osd = {'data': {}, 'journal': {}}
+ osd['data']['path'] = device.path
+ osd['data']['size'] = data_size.b
+ osd['data']['percentage'] = 100
+ osd['data']['human_readable_size'] = str(device_size)
+ osd['journal']['path'] = 'vg: %s' % vg_name
+ osd['journal']['size'] = self.journal_size.b
+ osd['journal']['percentage'] = int(self.journal_size.gb * 100 / vg_free)
+ osd['journal']['human_readable_size'] = str(self.journal_size)
+ osds.append(osd)
+
+ def execute(self):
+ """
+ Create vgs/lvs from the incoming set of devices, assign their roles
+ (data, journal) and offload the OSD creation to ``lvm create``
+ """
+ ssd_paths = [d.abspath for d in self.blank_ssds]
+
+ # no common vg is found, create one with all the blank SSDs
+ if not self.common_vg:
+ journal_vg = lvm.create_vg(ssd_paths, name_prefix='ceph-journals')
+ # a vg exists that can be extended
+ elif self.common_vg and ssd_paths:
+ journal_vg = lvm.extend_vg(self.common_vg, ssd_paths)
+ # one common vg with nothing else to extend can be used directly
+ else:
+ journal_vg = self.common_vg
+
+ journal_size = prepare.get_journal_size(lv_format=True)
+
+ for osd in self.computed['osds']:
+ data_vg = lvm.create_vg(osd['data']['path'], name_prefix='ceph-data')
+ # no extents or size means it will use 100%FREE
+ data_lv = lvm.create_lv('osd-data', data_vg.name)
+ journal_lv = lvm.create_lv(
+ 'osd-journal', journal_vg.name, size=journal_size, uuid_name=True
+ )
+
+ command = ['--filestore', '--data']
+ command.append('%s/%s' % (data_vg.name, data_lv.name))
+ command.extend(['--journal', '%s/%s' % (journal_vg.name, journal_lv.name)])
+ if self.args.dmcrypt:
+ command.append('--dmcrypt')
+ if self.args.no_systemd:
+ command.append('--no-systemd')
+ if self.args.crush_device_class:
+ command.extend(['--crush-device-class', self.args.crush_device_class])
+
+ Create(command).main()
--- /dev/null
+from ceph_volume.util import disk
+from ceph_volume.api import lvm
+
+
+def minimum_device_size(devices):
+ """
+    Ensure that the minimum requirements for this type of scenario are
+ met, raise an error if the provided devices would not work
+ """
+ msg = 'Unable to use device smaller than 5GB: %s (%s)'
+ for device in devices:
+ device_size = disk.Size(b=device.sys_api['size'])
+ if device_size < disk.Size(gb=5):
+ raise RuntimeError(msg % (device, device_size))
+
+
+def no_lvm_membership(devices):
+ """
+ Do not allow devices that are part of LVM
+ """
+ msg = 'Unable to use device, already a member of LVM: %s'
+ for device in devices:
+ if device.is_lvm_member:
+ raise RuntimeError(msg % device.abspath)
+
+
+def has_common_vg(ssd_devices):
+ """
+ Ensure that devices have a common VG between them
+ """
+ msg = 'Could not find a common VG between devices: %s'
+ system_vgs = lvm.VolumeGroups()
+ ssd_vgs = {}
+
+ for ssd_device in ssd_devices:
+ for pv in ssd_device.pvs_api:
+ vg = system_vgs.get(vg_name=pv.vg_name)
+ if not vg:
+ continue
+ try:
+ ssd_vgs[vg.name].append(ssd_device.abspath)
+ except KeyError:
+ ssd_vgs[vg.name] = [ssd_device.abspath]
+ # len of 1 means they all have a common vg, and len of 0 means that these
+ # are blank
+ if len(ssd_vgs) <= 1:
+ return
+ raise RuntimeError(msg % ', '.join(ssd_vgs.keys()))
def __init__(self, argv):
self.argv = argv
- @decorators.needs_root
- def zap(self, args):
- device = args.device
- lv = api.get_lv_from_argument(device)
- if lv:
- # we are zapping a logical volume
- path = lv.lv_path
- else:
- # we are zapping a partition
- #TODO: ensure device is a partition
- path = device
-
- mlogger.info("Zapping: %s", path)
-
- # check if there was a pv created with the
- # name of device
- pv = api.get_pv(pv_name=device)
- if pv:
- vg_name = pv.vg_name
- lv = api.get_lv(vg_name=vg_name)
-
- dmcrypt = False
- dmcrypt_uuid = None
- if lv:
- osd_path = "/var/lib/ceph/osd/{}-{}".format(lv.tags['ceph.cluster_name'], lv.tags['ceph.osd_id'])
- dmcrypt_uuid = lv.lv_uuid
- dmcrypt = lv.encrypted
- if system.path_is_mounted(osd_path):
- mlogger.info("Unmounting %s", osd_path)
- system.unmount(osd_path)
+ def unmount_lv(self, lv):
+ if lv.tags.get('ceph.cluster_name') and lv.tags.get('ceph.osd_id'):
+ lv_path = "/var/lib/ceph/osd/{}-{}".format(lv.tags['ceph.cluster_name'], lv.tags['ceph.osd_id'])
else:
- # we're most likely dealing with a partition here, check to
- # see if it was encrypted
- partuuid = disk.get_partuuid(device)
- if encryption.status("/dev/mapper/{}".format(partuuid)):
- dmcrypt_uuid = partuuid
- dmcrypt = True
-
+ lv_path = lv.lv_path
+ dmcrypt_uuid = lv.lv_uuid
+ dmcrypt = lv.encrypted
+ if system.path_is_mounted(lv_path):
+ mlogger.info("Unmounting %s", lv_path)
+ system.unmount(lv_path)
if dmcrypt and dmcrypt_uuid:
- dmcrypt_path = "/dev/mapper/{}".format(dmcrypt_uuid)
- mlogger.info("Closing encrypted path %s", dmcrypt_path)
- encryption.dmcrypt_close(dmcrypt_path)
-
- if args.destroy and pv:
- logger.info("Found a physical volume created from %s, will destroy all it's vgs and lvs", device)
- vg_name = pv.vg_name
- mlogger.info("Destroying volume group %s because --destroy was given", vg_name)
- api.remove_vg(vg_name)
- mlogger.info("Destroying physical volume %s because --destroy was given", device)
- api.remove_pv(device)
- elif args.destroy and not pv:
- mlogger.info("Skipping --destroy because no associated physical volumes are found for %s", device)
-
- wipefs(path)
- zap_data(path)
+ self.dmcrypt_close(dmcrypt_uuid)
- if lv and not pv:
- # remove all lvm metadata
- lv.clear_tags()
-
- terminal.success("Zapping successful for: %s" % path)
+ @decorators.needs_root
+ def zap(self, args):
+ for device in args.devices:
+ if disk.is_mapper_device(device):
+ terminal.error("Refusing to zap the mapper device: {}".format(device))
+ raise SystemExit(1)
+ lv = api.get_lv_from_argument(device)
+ if lv:
+ # we are zapping a logical volume
+ path = lv.lv_path
+ self.unmount_lv(lv)
+ else:
+ # we are zapping a partition
+ #TODO: ensure device is a partition
+ path = device
+                # check if the partition is encrypted so it can be closed
+ partuuid = disk.get_partuuid(device)
+ if encryption.status("/dev/mapper/{}".format(partuuid)):
+ dmcrypt_uuid = partuuid
+ self.dmcrypt_close(dmcrypt_uuid)
+
+ mlogger.info("Zapping: %s", path)
+
+ # check if there was a pv created with the
+ # name of device
+ pvs = api.PVolumes()
+ pvs.filter(pv_name=device)
+ vgs = set([pv.vg_name for pv in pvs])
+ for pv in pvs:
+ vg_name = pv.vg_name
+ lv = None
+ if pv.lv_uuid:
+ lv = api.get_lv(vg_name=vg_name, lv_uuid=pv.lv_uuid)
+
+ if lv:
+ self.unmount_lv(lv)
+
+ if args.destroy:
+ for vg_name in vgs:
+ mlogger.info("Destroying volume group %s because --destroy was given", vg_name)
+ api.remove_vg(vg_name)
+ mlogger.info("Destroying physical volume %s because --destroy was given", device)
+ api.remove_pv(device)
+
+ wipefs(path)
+ zap_data(path)
+
+ if lv and not pvs:
+ # remove all lvm metadata
+ lv.clear_tags()
+
+ terminal.success("Zapping successful for: %s" % ", ".join(args.devices))
+
+ def dmcrypt_close(self, dmcrypt_uuid):
+ dmcrypt_path = "/dev/mapper/{}".format(dmcrypt_uuid)
+ mlogger.info("Closing encrypted path %s", dmcrypt_path)
+ encryption.dmcrypt_close(dmcrypt_path)
def main(self):
sub_command_help = dedent("""
- Zaps the given logical volume, raw device or partition for reuse by ceph-volume.
+ Zaps the given logical volume(s), raw device(s) or partition(s) for reuse by ceph-volume.
If given a path to a logical volume it must be in the format of vg/lv. Any
filesystems present on the given device, vg/lv, or partition will be removed and
all data will be purged.
ceph-volume lvm zap /dev/sdc1
+ Zapping many raw devices:
+
+          ceph-volume lvm zap /dev/sda /dev/sdb /dev/sdc
+
If the --destroy flag is given and you are zapping a raw device or partition
then all vgs and lvs that exist on that raw device or partition will be destroyed.
)
parser.add_argument(
- 'device',
- metavar='DEVICE',
- nargs='?',
- help='Path to an lv (as vg/lv), partition (as /dev/sda1) or device (as /dev/sda)'
+ 'devices',
+ metavar='DEVICES',
+ nargs='*',
+ default=[],
+ help='Path to one or many lv (as vg/lv), partition (as /dev/sda1) or device (as /dev/sda)'
)
parser.add_argument(
'--destroy',
def __str__(self):
msg = "Got more than 1 result looking for volume group: %s" % self.vg_name
return msg
+
+
+class SizeAllocationError(Exception):
+
+ def __init__(self, requested, available):
+ self.requested = requested
+ self.available = available
+
+ def __str__(self):
+ msg = 'Unable to allocate size (%s), not enough free space (%s)' % (
+ self.requested, self.available
+ )
+ return msg
"""
def __init__(self, argv=None, parse=True):
- self.mapper = {'lvm': devices.lvm.LVM, 'simple': devices.simple.Simple}
+ self.mapper = {
+ 'lvm': devices.lvm.LVM,
+ 'simple': devices.simple.Simple,
+ }
self.plugin_help = "No plugins found/loaded"
if argv is None:
self.argv = sys.argv
import subprocess
from select import select
from ceph_volume import terminal
+from ceph_volume.util import as_bytes
import logging
for descriptor in reads:
descriptor_name = descriptor_names[descriptor]
try:
- log_output(descriptor_name, read(descriptor, 1024), terminal_logging, True)
+ message = read(descriptor, 1024)
+ if not isinstance(message, str):
+ message = message.decode('utf-8')
+ log_output(descriptor_name, message, terminal_logging, True)
except (IOError, OSError):
# nothing else to log
pass
close_fds=True,
**kw
)
+
if stdin:
- stdout_stream, stderr_stream = process.communicate(stdin)
+ stdout_stream, stderr_stream = process.communicate(as_bytes(stdin))
else:
stdout_stream = process.stdout.read()
stderr_stream = process.stderr.read()
process.run(['systemctl', 'stop', unit])
-def enable(unit):
- process.run(['systemctl', 'enable', unit])
+def enable(unit, runtime=False):
+ if runtime:
+ process.run(['systemctl', 'enable', '--runtime', unit])
+ else:
+ process.run(['systemctl', 'enable', unit])
def disable(unit):
def enable_osd(id_):
- return enable(osd_unit % id_)
+ return enable(osd_unit % id_, runtime=True)
def disable_osd(id_):
return volumes
-@pytest.fixture
-def pvolumes(monkeypatch):
- monkeypatch.setattr(process, 'call', lambda x: ('', '', 0))
- pvolumes = api.PVolumes()
- pvolumes._purge()
- return pvolumes
-
-
@pytest.fixture
def volume_groups(monkeypatch):
monkeypatch.setattr(process, 'call', lambda x: ('', '', 0))
monkeypatch.setattr(api, 'PVolumes', lambda: pvolumes)
assert api.get_pv(pv_uuid='0000') == FooPVolume
+ def test_multiple_pvs_is_matched_by_uuid(self, pvolumes, monkeypatch):
+ FooPVolume = api.PVolume(vg_name="vg", pv_name='/dev/sda', pv_uuid="0000", pv_tags={}, lv_uuid="0000000")
+ BarPVolume = api.PVolume(vg_name="vg", pv_name='/dev/sda', pv_uuid="0000", pv_tags={})
+ pvolumes.append(FooPVolume)
+ pvolumes.append(BarPVolume)
+ monkeypatch.setattr(api, 'PVolumes', lambda: pvolumes)
+ assert api.get_pv(pv_uuid='0000') == FooPVolume
+
+ def test_multiple_pvs_is_matched_by_name(self, pvolumes, monkeypatch):
+ FooPVolume = api.PVolume(vg_name="vg", pv_name='/dev/sda', pv_uuid="0000", pv_tags={}, lv_uuid="0000000")
+ BarPVolume = api.PVolume(vg_name="vg", pv_name='/dev/sda', pv_uuid="0000", pv_tags={})
+ pvolumes.append(FooPVolume)
+ pvolumes.append(BarPVolume)
+ monkeypatch.setattr(api, 'PVolumes', lambda: pvolumes)
+ assert api.get_pv(pv_name='/dev/sda') == FooPVolume
+
+ def test_multiple_pvs_is_matched_by_tags(self, pvolumes, monkeypatch):
+ FooPVolume = api.PVolume(vg_name="vg1", pv_name='/dev/sdc', pv_uuid="1000", pv_tags="ceph.foo=bar", lv_uuid="0000000")
+ BarPVolume = api.PVolume(vg_name="vg", pv_name='/dev/sda', pv_uuid="0000", pv_tags="ceph.foo=bar")
+ pvolumes.append(FooPVolume)
+ pvolumes.append(BarPVolume)
+ monkeypatch.setattr(api, 'PVolumes', lambda: pvolumes)
+ with pytest.raises(exceptions.MultiplePVsError):
+ api.get_pv(pv_tags={"ceph.foo": "bar"})
+
def test_single_pv_is_matched_by_uuid(self, pvolumes, monkeypatch):
FooPVolume = api.PVolume(
pv_name='/dev/vg/foo',
volume_groups.filter()
+class TestVolumeGroupFree(object):
+
+ def test_no_g_in_output(self):
+ vg = api.VolumeGroup(vg_name='nosize', vg_free='')
+ with pytest.raises(RuntimeError):
+ vg.free
+
+ def test_g_without_size(self):
+ vg = api.VolumeGroup(vg_name='nosize', vg_free='g')
+ with pytest.raises(RuntimeError):
+ vg.free
+
+ def test_size_without_g(self):
+ vg = api.VolumeGroup(vg_name='nosize', vg_free='1')
+ with pytest.raises(RuntimeError):
+ vg.free
+
+ def test_error_message(self):
+ vg = api.VolumeGroup(vg_name='nosize', vg_free='F')
+ with pytest.raises(RuntimeError) as error:
+ vg.free
+ assert "Unable to convert vg size to integer: 'F'" in str(error)
+
+ def test_invalid_float(self):
+ vg = api.VolumeGroup(vg_name='nosize', vg_free=' g')
+ with pytest.raises(RuntimeError) as error:
+ vg.free
+ assert "Unable to convert to integer: ' '" in str(error.value)
+
+ def test_integer_gets_produced(self):
+ vg = api.VolumeGroup(vg_name='nosize', vg_free='100g')
+ assert vg.free == 100
+
+ def test_integer_gets_produced_whitespace(self):
+ vg = api.VolumeGroup(vg_name='nosize', vg_free=' 100g ')
+ assert vg.free == 100
+
+ def test_integer_gets_rounded_down(self):
+ vg = api.VolumeGroup(vg_name='nosize', vg_free='100.99g')
+ assert vg.free == 100
+
+
+class TestCreateLVs(object):
+
+ def test_creates_correct_lv_number_from_parts(self, monkeypatch):
+ monkeypatch.setattr('ceph_volume.api.lvm.create_lv', lambda *a, **kw: (a, kw))
+ vg = api.VolumeGroup(
+ vg_name='ceph', vg_free='1024g',
+ vg_size='99999999g', vg_free_count='999'
+ )
+ lvs = api.create_lvs(vg, parts=4)
+ assert len(lvs) == 4
+
+ def test_suffixes_the_size_arg(self, monkeypatch):
+ monkeypatch.setattr('ceph_volume.api.lvm.create_lv', lambda *a, **kw: (a, kw))
+ vg = api.VolumeGroup(
+ vg_name='ceph', vg_free='1024g',
+ vg_size='99999999g', vg_free_count='999'
+ )
+ lvs = api.create_lvs(vg, parts=4)
+ assert lvs[0][1]['extents'] == 249
+
+ def test_only_uses_free_size(self, monkeypatch):
+ monkeypatch.setattr('ceph_volume.api.lvm.create_lv', lambda *a, **kw: (a, kw))
+ vg = api.VolumeGroup(
+ vg_name='ceph', vg_free='1024g',
+ vg_size='99999999g', vg_free_count='1000'
+ )
+ lvs = api.create_lvs(vg, parts=4)
+ assert lvs[0][1]['extents'] == 250
+
+ def test_null_tags_are_set_by_default(self, monkeypatch):
+ monkeypatch.setattr('ceph_volume.api.lvm.create_lv', lambda *a, **kw: (a, kw))
+ vg = api.VolumeGroup(
+ vg_name='ceph', vg_free='1024g',
+ vg_size='99999999g', vg_free_count='999'
+ )
+ kwargs = api.create_lvs(vg, parts=4)[0][1]
+ assert list(kwargs['tags'].values()) == ['null', 'null', 'null', 'null']
+
+ def test_fallback_to_one_part(self, monkeypatch):
+ monkeypatch.setattr('ceph_volume.api.lvm.create_lv', lambda *a, **kw: (a, kw))
+ vg = api.VolumeGroup(
+ vg_name='ceph', vg_free='1024g',
+ vg_size='99999999g', vg_free_count='999'
+ )
+ lvs = api.create_lvs(vg)
+ assert len(lvs) == 1
+
+
+class TestVolumeGroupSizing(object):
+
+ def setup(self):
+ self.vg = api.VolumeGroup(
+ vg_name='ceph', vg_free='1024g',
+ vg_free_count='261129'
+ )
+
+ def test_parts_and_size_errors(self):
+ with pytest.raises(ValueError) as error:
+ self.vg.sizing(parts=4, size=10)
+ assert "Cannot process sizing" in str(error)
+
+ def test_zero_parts_produces_100_percent(self):
+ result = self.vg.sizing(parts=0)
+ assert result['percentages'] == 100
+
+ def test_two_parts_produces_50_percent(self):
+ result = self.vg.sizing(parts=2)
+ assert result['percentages'] == 50
+
+ def test_two_parts_produces_half_size(self):
+ result = self.vg.sizing(parts=2)
+ assert result['sizes'] == 512
+
+ def test_half_size_produces_round_sizes(self):
+ result = self.vg.sizing(size=512)
+ assert result['sizes'] == 512
+ assert result['percentages'] == 50
+ assert result['parts'] == 2
+
+ def test_bit_more_than_half_size_allocates_full_size(self):
+        # 513 can't allocate more than 1, so it just falls back to using the
+ # whole device
+ result = self.vg.sizing(size=513)
+ assert result['sizes'] == 1024
+ assert result['percentages'] == 100
+ assert result['parts'] == 1
+
+ def test_extents_are_halfed_rounded_down(self):
+ result = self.vg.sizing(size=512)
+ # the real extents would've given 130564.5
+ assert result['extents'] == 130564
+
+ def test_bit_less_size_rounds_down(self):
+ result = self.vg.sizing(size=129)
+ assert result['sizes'] == 146
+ assert result['percentages'] == 14
+ assert result['parts'] == 7
+
+ def test_unable_to_allocate_past_free_size(self):
+ with pytest.raises(exceptions.SizeAllocationError):
+ self.vg.sizing(size=2048)
+
+
class TestGetLVFromArgument(object):
def setup(self):
data_tag = ['lvchange', '--addtag', 'ceph.data_device=/path', '/path']
assert capture.calls[2]['args'][0] == data_tag
+ def test_uses_uuid(self, monkeypatch, capture):
+ monkeypatch.setattr(process, 'run', capture)
+ monkeypatch.setattr(process, 'call', capture)
+ monkeypatch.setattr(api, 'get_lv', lambda *a, **kw: self.foo_volume)
+ api.create_lv('foo', 'foo_group', size='5G', tags={'ceph.type': 'data'}, uuid_name=True)
+ result = capture.calls[0]['args'][0][5]
+ assert result.startswith('foo-')
+ assert len(result) == 40
+
+
+class TestExtendVG(object):
+
+ def setup(self):
+ self.foo_volume = api.VolumeGroup(vg_name='foo', lv_tags='')
+
+ def test_uses_single_device_in_list(self, monkeypatch, fake_run):
+ monkeypatch.setattr(api, 'get_vg', lambda **kw: True)
+ api.extend_vg(self.foo_volume, ['/dev/sda'])
+ expected = ['vgextend', '--force', '--yes', 'foo', '/dev/sda']
+ assert fake_run.calls[0]['args'][0] == expected
+
+ def test_uses_single_device(self, monkeypatch, fake_run):
+ monkeypatch.setattr(api, 'get_vg', lambda **kw: True)
+ api.extend_vg(self.foo_volume, '/dev/sda')
+ expected = ['vgextend', '--force', '--yes', 'foo', '/dev/sda']
+ assert fake_run.calls[0]['args'][0] == expected
+
+ def test_uses_multiple_devices(self, monkeypatch, fake_run):
+ monkeypatch.setattr(api, 'get_vg', lambda **kw: True)
+ api.extend_vg(self.foo_volume, ['/dev/sda', '/dev/sdb'])
+ expected = ['vgextend', '--force', '--yes', 'foo', '/dev/sda', '/dev/sdb']
+ assert fake_run.calls[0]['args'][0] == expected
+
+
+class TestCreateVG(object):
+
+ def setup(self):
+ self.foo_volume = api.VolumeGroup(vg_name='foo', lv_tags='')
+
+ def test_no_name(self, monkeypatch, fake_run):
+ monkeypatch.setattr(api, 'get_vg', lambda **kw: True)
+ api.create_vg('/dev/sda')
+ result = fake_run.calls[0]['args'][0]
+ assert '/dev/sda' in result
+ assert result[-2].startswith('ceph-')
+
+ def test_devices_list(self, monkeypatch, fake_run):
+ monkeypatch.setattr(api, 'get_vg', lambda **kw: True)
+ api.create_vg(['/dev/sda', '/dev/sdb'], name='ceph')
+ result = fake_run.calls[0]['args'][0]
+ expected = ['vgcreate', '--force', '--yes', 'ceph', '/dev/sda', '/dev/sdb']
+ assert result == expected
+
+ def test_name_prefix(self, monkeypatch, fake_run):
+ monkeypatch.setattr(api, 'get_vg', lambda **kw: True)
+ api.create_vg('/dev/sda', name_prefix='master')
+ result = fake_run.calls[0]['args'][0]
+ assert '/dev/sda' in result
+ assert result[-2].startswith('master-')
+
+ def test_specific_name(self, monkeypatch, fake_run):
+ monkeypatch.setattr(api, 'get_vg', lambda **kw: True)
+ api.create_vg('/dev/sda', name='master')
+ result = fake_run.calls[0]['args'][0]
+ assert '/dev/sda' in result
+ assert result[-2] == 'master'
#
# The following tests are pretty gnarly. VDO detection is very convoluted and
'/sys/block': block_path})
result = api._vdo_parents(['dm-3'])
assert result == []
+
+
+class TestSplitNameParser(object):
+
+ def test_keys_are_parsed_without_prefix(self):
+ line = ["DM_VG_NAME='/dev/mapper/vg';DM_LV_NAME='lv';DM_LV_LAYER=''"]
+ result = api._splitname_parser(line)
+ assert result['VG_NAME'] == 'vg'
+ assert result['LV_NAME'] == 'lv'
+ assert result['LV_LAYER'] == ''
+
+ def test_vg_name_sans_mapper(self):
+ line = ["DM_VG_NAME='/dev/mapper/vg';DM_LV_NAME='lv';DM_LV_LAYER=''"]
+ result = api._splitname_parser(line)
+ assert '/dev/mapper' not in result['VG_NAME']
+
+
+class TestIsLV(object):
+
+ def test_is_not_an_lv(self, monkeypatch):
+ monkeypatch.setattr(api, 'dmsetup_splitname', lambda x: {})
+ assert api.is_lv('/dev/sda1', lvs=[]) is False
+
+ def test_lvs_not_found(self, monkeypatch, volumes):
+ CephVolume = api.Volume(lv_name='foo', lv_path='/dev/vg/foo', lv_tags="ceph.type=data")
+ volumes.append(CephVolume)
+ splitname = {'LV_NAME': 'data', 'VG_NAME': 'ceph'}
+ monkeypatch.setattr(api, 'dmsetup_splitname', lambda x: splitname)
+ assert api.is_lv('/dev/sda1', lvs=volumes) is False
+
+ def test_is_lv(self, monkeypatch, volumes):
+ CephVolume = api.Volume(
+ vg_name='ceph', lv_name='data',
+ lv_path='/dev/vg/foo', lv_tags="ceph.type=data"
+ )
+ volumes.append(CephVolume)
+ splitname = {'LV_NAME': 'data', 'VG_NAME': 'ceph'}
+ monkeypatch.setattr(api, 'dmsetup_splitname', lambda x: splitname)
+ assert api.is_lv('/dev/sda1', lvs=volumes) is True
return vgs
+@pytest.fixture
+def pvolumes(monkeypatch):
+ monkeypatch.setattr('ceph_volume.process.call', lambda x: ('', '', 0))
+ pvolumes = lvm_api.PVolumes()
+ pvolumes._purge()
+ return pvolumes
+
+
@pytest.fixture
def is_root(monkeypatch):
"""
Create a temporary file, optionally filling it with contents, returns an
absolute path to the file when called
"""
- def generate_file(name='file', contents=''):
- path = os.path.join(str(tmpdir), name)
+ def generate_file(name='file', contents='', directory=None):
+ directory = directory or str(tmpdir)
+ path = os.path.join(directory, name)
with open(path, 'w') as fp:
fp.write(contents)
return path
return generate_file
+
+
+@pytest.fixture
+def device_info(monkeypatch):
+ def apply(devices=None, lsblk=None, lv=None):
+ devices = devices if devices else {}
+ lsblk = lsblk if lsblk else {}
+ lv = Factory(**lv) if lv else None
+ monkeypatch.setattr("ceph_volume.sys_info.devices", {})
+ monkeypatch.setattr("ceph_volume.util.device.disk.get_devices", lambda: devices)
+ monkeypatch.setattr("ceph_volume.util.device.lvm.get_lv_from_argument", lambda path: lv)
+ monkeypatch.setattr("ceph_volume.util.device.disk.lsblk", lambda path: lsblk)
+ return apply
assert result['0'][0]['path'] == '/dev/VolGroup/lv'
assert result['0'][0]['devices'] == ['/dev/sda1', '/dev/sdb1']
+ def test_report_a_ceph_lv_with_multiple_pvs_of_same_name(self, pvolumes, monkeypatch):
+ tags = 'ceph.osd_id=0,ceph.journal_uuid=x,ceph.type=data'
+ lv = api.Volume(
+ lv_name='lv', vg_name='VolGroup',
+ lv_uuid='aaaa', lv_path='/dev/VolGroup/lv', lv_tags=tags
+ )
+ monkeypatch.setattr(api, 'get_lv_from_argument', lambda device: None)
+ monkeypatch.setattr(api, 'get_lv', lambda vg_name: lv)
+ FooPVolume = api.PVolume(vg_name="vg", pv_name='/dev/sda', pv_uuid="0000", pv_tags={}, lv_uuid="aaaa")
+ BarPVolume = api.PVolume(vg_name="vg", pv_name='/dev/sda', pv_uuid="0000", pv_tags={})
+ pvolumes.append(FooPVolume)
+ pvolumes.append(BarPVolume)
+ monkeypatch.setattr(api, 'PVolumes', lambda: pvolumes)
+ listing = lvm.listing.List([])
+ result = listing.single_report('/dev/sda')
+ assert result['0'][0]['name'] == 'lv'
+ assert result['0'][0]['lv_tags'] == tags
+ assert result['0'][0]['path'] == '/dev/VolGroup/lv'
+ assert len(result) == 1
+
def test_report_a_ceph_lv_with_no_matching_devices(self, volumes, monkeypatch):
tags = 'ceph.osd_id=0,ceph.journal_uuid=x,ceph.type=data'
lv = api.Volume(
def test_main_spits_help_with_no_arguments(self, capsys):
lvm.zap.Zap([]).main()
stdout, stderr = capsys.readouterr()
- assert 'Zaps the given logical volume, raw device or partition' in stdout
+ assert 'Zaps the given logical volume(s), raw device(s) or partition(s)' in stdout
def test_main_shows_full_help(self, capsys):
with pytest.raises(SystemExit):
lvm.zap.Zap(argv=['--help']).main()
stdout, stderr = capsys.readouterr()
assert 'optional arguments' in stdout
- assert 'positional arguments' in stdout
+
+ @pytest.mark.parametrize('device_name', [
+ '/dev/mapper/foo',
+ '/dev/dm-0',
+ ])
+ def test_can_not_zap_mapper_device(self, capsys, is_root, device_name):
+ with pytest.raises(SystemExit):
+ lvm.zap.Zap(argv=[device_name]).main()
+ stdout, stderr = capsys.readouterr()
+ assert 'Refusing to zap' in stdout
--- /dev/null
+../../../../Vagrantfile
\ No newline at end of file
--- /dev/null
+---
+
+ceph_dev: True
+cluster: ceph
+public_network: "192.168.3.0/24"
+cluster_network: "192.168.4.0/24"
+monitor_interface: eth1
+osd_objectstore: "bluestore"
+osd_scenario: lvm
+dmcrypt: true
+ceph_origin: 'repository'
+ceph_repository: 'dev'
+copy_admin_key: false
+devices:
+ - /dev/sdb
+ - /dev/sdc
+os_tuning_params:
+ - { name: kernel.pid_max, value: 4194303 }
+ - { name: fs.file-max, value: 26234859 }
+ceph_conf_overrides:
+ global:
+ osd_pool_default_pg_num: 8
+ osd_pool_default_size: 1
--- /dev/null
+[mons]
+mon0
+
+[osds]
+osd0
+
+[mgrs]
+mon0
--- /dev/null
+../../../playbooks/test_bluestore_dmcrypt.yml
\ No newline at end of file
--- /dev/null
+---
+
+# DEFINE THE NUMBER OF VMS TO RUN
+mon_vms: 1
+osd_vms: 1
+mds_vms: 0
+rgw_vms: 0
+nfs_vms: 0
+rbd_mirror_vms: 0
+client_vms: 0
+iscsi_gw_vms: 0
+mgr_vms: 0
+
+# SUBNETS TO USE FOR THE VMS
+public_subnet: 192.168.3
+cluster_subnet: 192.168.4
+
+# MEMORY
+# set 1024 for CentOS
+memory: 512
+
+# Ethernet interface name
+# use eth1 for libvirt and ubuntu precise, enp0s8 for CentOS and ubuntu xenial
+eth: 'eth1'
+
+
+# VAGRANT BOX
+# Ceph boxes are *strongly* suggested. They are under better control and will
+# not get updated frequently unless required for build systems. These are (for
+# now):
+#
+# * ceph/ubuntu-xenial
+#
+# Ubuntu: ceph/ubuntu-xenial bento/ubuntu-16.04 or ubuntu/trusty64 or ubuntu/wily64
+# CentOS: bento/centos-7.1 or puppetlabs/centos-7.0-64-puppet
+# libvirt CentOS: centos/7
+# parallels Ubuntu: parallels/ubuntu-14.04
+# Debian: deb/jessie-amd64 - be careful the storage controller is named 'SATA Controller'
+# For more boxes have a look at:
+# - https://atlas.hashicorp.com/boxes/search?utf8=✓&sort=&provider=virtualbox&q=
+# - https://download.gluster.org/pub/gluster/purpleidea/vagrant/
+vagrant_box: centos/7
+#ssh_private_key_path: "~/.ssh/id_rsa"
+# The sync directory changes based on vagrant box
+# Set to /home/vagrant/sync for Centos/7, /home/{ user }/vagrant for openstack and defaults to /vagrant
+#vagrant_sync_dir: /home/vagrant/sync
+#vagrant_sync_dir: /
+# Disables synced folder creation. Not needed for testing, will skip mounting
+# the vagrant directory on the remote box regardless of the provider.
+vagrant_disable_synced_folder: true
+# VAGRANT URL
+# This is a URL to download an image from an alternate location. vagrant_box
+# above should be set to the filename of the image.
+# Fedora virtualbox: https://download.fedoraproject.org/pub/fedora/linux/releases/22/Cloud/x86_64/Images/Fedora-Cloud-Base-Vagrant-22-20150521.x86_64.vagrant-virtualbox.box
+# Fedora libvirt: https://download.fedoraproject.org/pub/fedora/linux/releases/22/Cloud/x86_64/Images/Fedora-Cloud-Base-Vagrant-22-20150521.x86_64.vagrant-libvirt.box
+# vagrant_box_url: https://download.fedoraproject.org/pub/fedora/linux/releases/22/Cloud/x86_64/Images/Fedora-Cloud-Base-Vagrant-22-20150521.x86_64.vagrant-virtualbox.box
--- /dev/null
+../../../../Vagrantfile
\ No newline at end of file
--- /dev/null
+---
+
+ceph_dev: True
+cluster: ceph
+public_network: "192.168.3.0/24"
+cluster_network: "192.168.4.0/24"
+monitor_interface: eth1
+osd_objectstore: "bluestore"
+osd_scenario: lvm
+ceph_origin: 'repository'
+ceph_repository: 'dev'
+copy_admin_key: false
+devices:
+ - /dev/sdb
+ - /dev/sdc
+os_tuning_params:
+ - { name: kernel.pid_max, value: 4194303 }
+ - { name: fs.file-max, value: 26234859 }
+ceph_conf_overrides:
+ global:
+ osd_pool_default_pg_num: 8
+ osd_pool_default_size: 1
--- /dev/null
+[mons]
+mon0
+
+[osds]
+osd0
+
+[mgrs]
+mon0
--- /dev/null
+../../../playbooks/test_bluestore.yml
\ No newline at end of file
--- /dev/null
+---
+
+# DEFINE THE NUMBER OF VMS TO RUN
+mon_vms: 1
+osd_vms: 1
+mds_vms: 0
+rgw_vms: 0
+nfs_vms: 0
+rbd_mirror_vms: 0
+client_vms: 0
+iscsi_gw_vms: 0
+mgr_vms: 0
+
+# SUBNETS TO USE FOR THE VMS
+public_subnet: 192.168.3
+cluster_subnet: 192.168.4
+
+# MEMORY
+# set 1024 for CentOS
+memory: 512
+
+# Ethernet interface name
+# use eth1 for libvirt and ubuntu precise, enp0s8 for CentOS and ubuntu xenial
+eth: 'eth1'
+
+
+# VAGRANT BOX
+# Ceph boxes are *strongly* suggested. They are under better control and will
+# not get updated frequently unless required for build systems. These are (for
+# now):
+#
+# * ceph/ubuntu-xenial
+#
+# Ubuntu: ceph/ubuntu-xenial bento/ubuntu-16.04 or ubuntu/trusty64 or ubuntu/wily64
+# CentOS: bento/centos-7.1 or puppetlabs/centos-7.0-64-puppet
+# libvirt CentOS: centos/7
+# parallels Ubuntu: parallels/ubuntu-14.04
+# Debian: deb/jessie-amd64 - be careful the storage controller is named 'SATA Controller'
+# For more boxes have a look at:
+# - https://atlas.hashicorp.com/boxes/search?utf8=✓&sort=&provider=virtualbox&q=
+# - https://download.gluster.org/pub/gluster/purpleidea/vagrant/
+vagrant_box: centos/7
+#ssh_private_key_path: "~/.ssh/id_rsa"
+# The sync directory changes based on vagrant box
+# Set to /home/vagrant/sync for Centos/7, /home/{ user }/vagrant for openstack and defaults to /vagrant
+#vagrant_sync_dir: /home/vagrant/sync
+#vagrant_sync_dir: /
+# Disables synced folder creation. Not needed for testing, will skip mounting
+# the vagrant directory on the remote box regardless of the provider.
+vagrant_disable_synced_folder: true
+# VAGRANT URL
+# This is a URL to download an image from an alternate location. vagrant_box
+# above should be set to the filename of the image.
+# Fedora virtualbox: https://download.fedoraproject.org/pub/fedora/linux/releases/22/Cloud/x86_64/Images/Fedora-Cloud-Base-Vagrant-22-20150521.x86_64.vagrant-virtualbox.box
+# Fedora libvirt: https://download.fedoraproject.org/pub/fedora/linux/releases/22/Cloud/x86_64/Images/Fedora-Cloud-Base-Vagrant-22-20150521.x86_64.vagrant-libvirt.box
+# vagrant_box_url: https://download.fedoraproject.org/pub/fedora/linux/releases/22/Cloud/x86_64/Images/Fedora-Cloud-Base-Vagrant-22-20150521.x86_64.vagrant-virtualbox.box
--- /dev/null
+../../../../Vagrantfile
\ No newline at end of file
--- /dev/null
+---
+
+ceph_dev: True
+cluster: ceph
+public_network: "192.168.3.0/24"
+cluster_network: "192.168.4.0/24"
+monitor_interface: eth1
+osd_objectstore: "filestore"
+osd_scenario: lvm
+dmcrypt: true
+ceph_origin: 'repository'
+ceph_repository: 'dev'
+copy_admin_key: false
+devices:
+ - /dev/sdb
+ - /dev/sdc
+os_tuning_params:
+ - { name: kernel.pid_max, value: 4194303 }
+ - { name: fs.file-max, value: 26234859 }
+ceph_conf_overrides:
+ global:
+ osd_pool_default_pg_num: 8
+ osd_pool_default_size: 1
+ osd:
+ osd_journal_size: 2048
--- /dev/null
+[mons]
+mon0
+
+[osds]
+osd0
+
+[mgrs]
+mon0
--- /dev/null
+../../../playbooks/test_filestore_dmcrypt.yml
\ No newline at end of file
--- /dev/null
+---
+
+# DEFINE THE NUMBER OF VMS TO RUN
+mon_vms: 1
+osd_vms: 1
+mds_vms: 0
+rgw_vms: 0
+nfs_vms: 0
+rbd_mirror_vms: 0
+client_vms: 0
+iscsi_gw_vms: 0
+mgr_vms: 0
+
+# SUBNETS TO USE FOR THE VMS
+public_subnet: 192.168.3
+cluster_subnet: 192.168.4
+
+# MEMORY
+# set 1024 for CentOS
+memory: 512
+
+# Ethernet interface name
+# use eth1 for libvirt and ubuntu precise, enp0s8 for CentOS and ubuntu xenial
+eth: 'eth1'
+
+
+# VAGRANT BOX
+# Ceph boxes are *strongly* suggested. They are under better control and will
+# not get updated frequently unless required for build systems. These are (for
+# now):
+#
+# * ceph/ubuntu-xenial
+#
+# Ubuntu: ceph/ubuntu-xenial bento/ubuntu-16.04 or ubuntu/trusty64 or ubuntu/wily64
+# CentOS: bento/centos-7.1 or puppetlabs/centos-7.0-64-puppet
+# libvirt CentOS: centos/7
+# parallels Ubuntu: parallels/ubuntu-14.04
+# Debian: deb/jessie-amd64 - be careful the storage controller is named 'SATA Controller'
+# For more boxes have a look at:
+# - https://atlas.hashicorp.com/boxes/search?utf8=✓&sort=&provider=virtualbox&q=
+# - https://download.gluster.org/pub/gluster/purpleidea/vagrant/
+vagrant_box: centos/7
+#ssh_private_key_path: "~/.ssh/id_rsa"
+# The sync directory changes based on vagrant box
+# Set to /home/vagrant/sync for Centos/7, /home/{ user }/vagrant for openstack and defaults to /vagrant
+#vagrant_sync_dir: /home/vagrant/sync
+#vagrant_sync_dir: /
+# Disables synced folder creation. Not needed for testing, will skip mounting
+# the vagrant directory on the remote box regardless of the provider.
+vagrant_disable_synced_folder: true
+# VAGRANT URL
+# This is a URL to download an image from an alternate location. vagrant_box
+# above should be set to the filename of the image.
+# Fedora virtualbox: https://download.fedoraproject.org/pub/fedora/linux/releases/22/Cloud/x86_64/Images/Fedora-Cloud-Base-Vagrant-22-20150521.x86_64.vagrant-virtualbox.box
+# Fedora libvirt: https://download.fedoraproject.org/pub/fedora/linux/releases/22/Cloud/x86_64/Images/Fedora-Cloud-Base-Vagrant-22-20150521.x86_64.vagrant-libvirt.box
+# vagrant_box_url: https://download.fedoraproject.org/pub/fedora/linux/releases/22/Cloud/x86_64/Images/Fedora-Cloud-Base-Vagrant-22-20150521.x86_64.vagrant-virtualbox.box
--- /dev/null
+../../../../Vagrantfile
\ No newline at end of file
--- /dev/null
+---
+
+ceph_dev: True
+cluster: ceph
+public_network: "192.168.3.0/24"
+cluster_network: "192.168.4.0/24"
+monitor_interface: eth1
+osd_objectstore: "filestore"
+osd_scenario: lvm
+ceph_origin: 'repository'
+ceph_repository: 'dev'
+copy_admin_key: false
+devices:
+ - /dev/sdb
+ - /dev/sdc
+os_tuning_params:
+ - { name: kernel.pid_max, value: 4194303 }
+ - { name: fs.file-max, value: 26234859 }
+ceph_conf_overrides:
+ global:
+ osd_pool_default_pg_num: 8
+ osd_pool_default_size: 1
--- /dev/null
+[mons]
+mon0
+
+[osds]
+osd0
+
+[mgrs]
+mon0
--- /dev/null
+../../../playbooks/test_filestore.yml
\ No newline at end of file
--- /dev/null
+---
+
+# DEFINE THE NUMBER OF VMS TO RUN
+mon_vms: 1
+osd_vms: 1
+mds_vms: 0
+rgw_vms: 0
+nfs_vms: 0
+rbd_mirror_vms: 0
+client_vms: 0
+iscsi_gw_vms: 0
+mgr_vms: 0
+
+# SUBNETS TO USE FOR THE VMS
+public_subnet: 192.168.3
+cluster_subnet: 192.168.4
+
+# MEMORY
+# set 1024 for CentOS
+memory: 512
+
+# Ethernet interface name
+# use eth1 for libvirt and ubuntu precise, enp0s8 for CentOS and ubuntu xenial
+eth: 'eth1'
+
+
+# VAGRANT BOX
+# Ceph boxes are *strongly* suggested. They are under better control and will
+# not get updated frequently unless required for build systems. These are (for
+# now):
+#
+# * ceph/ubuntu-xenial
+#
+# Ubuntu: ceph/ubuntu-xenial bento/ubuntu-16.04 or ubuntu/trusty64 or ubuntu/wily64
+# CentOS: bento/centos-7.1 or puppetlabs/centos-7.0-64-puppet
+# libvirt CentOS: centos/7
+# parallels Ubuntu: parallels/ubuntu-14.04
+# Debian: deb/jessie-amd64 - be careful the storage controller is named 'SATA Controller'
+# For more boxes have a look at:
+# - https://atlas.hashicorp.com/boxes/search?utf8=✓&sort=&provider=virtualbox&q=
+# - https://download.gluster.org/pub/gluster/purpleidea/vagrant/
+vagrant_box: centos/7
+#ssh_private_key_path: "~/.ssh/id_rsa"
+# The sync directory changes based on vagrant box
+# Set to /home/vagrant/sync for Centos/7, /home/{ user }/vagrant for openstack and defaults to /vagrant
+#vagrant_sync_dir: /home/vagrant/sync
+#vagrant_sync_dir: /
+# Disables synced folder creation. Not needed for testing, will skip mounting
+# the vagrant directory on the remote box regardless of the provider.
+vagrant_disable_synced_folder: true
+# VAGRANT URL
+# This is a URL to download an image from an alternate location. vagrant_box
+# above should be set to the filename of the image.
+# Fedora virtualbox: https://download.fedoraproject.org/pub/fedora/linux/releases/22/Cloud/x86_64/Images/Fedora-Cloud-Base-Vagrant-22-20150521.x86_64.vagrant-virtualbox.box
+# Fedora libvirt: https://download.fedoraproject.org/pub/fedora/linux/releases/22/Cloud/x86_64/Images/Fedora-Cloud-Base-Vagrant-22-20150521.x86_64.vagrant-libvirt.box
+# vagrant_box_url: https://download.fedoraproject.org/pub/fedora/linux/releases/22/Cloud/x86_64/Images/Fedora-Cloud-Base-Vagrant-22-20150521.x86_64.vagrant-virtualbox.box
--- /dev/null
+
+- hosts: osds
+ become: yes
+ tasks:
+
+ - name: stop ceph-osd@1 daemon
+ service:
+ name: ceph-osd@1
+ state: stopped
+
+ - name: stop ceph-osd@0 daemon
+ service:
+ name: ceph-osd@0
+ state: stopped
+
+
+- hosts: mons
+ become: yes
+ tasks:
+
+ - name: destroy osd.1
+ command: "ceph osd purge osd.1 --yes-i-really-mean-it"
+
+ - name: destroy osd.0
+ command: "ceph osd purge osd.0 --yes-i-really-mean-it"
+
+
+- hosts: osds
+ become: yes
+ tasks:
+
+  - name: zap /dev/sdb
+ command: "ceph-volume lvm zap /dev/sdb --destroy"
+ environment:
+ CEPH_VOLUME_DEBUG: 1
+
+
+ - name: zap /dev/sdc
+ command: "ceph-volume lvm zap /dev/sdc --destroy"
+ environment:
+ CEPH_VOLUME_DEBUG: 1
+
+ - name: batch create /dev/sdb and /dev/sdc again
+ command: "ceph-volume lvm batch --yes --bluestore /dev/sdb /dev/sdc"
+ environment:
+ CEPH_VOLUME_DEBUG: 1
--- /dev/null
+
+- hosts: osds
+ become: yes
+ tasks:
+
+ - name: stop ceph-osd@1 daemon
+ service:
+ name: ceph-osd@1
+ state: stopped
+
+ - name: stop ceph-osd@0 daemon
+ service:
+ name: ceph-osd@0
+ state: stopped
+
+
+- hosts: mons
+ become: yes
+ tasks:
+
+ - name: destroy osd.1
+ command: "ceph osd purge osd.1 --yes-i-really-mean-it"
+
+ - name: destroy osd.0
+ command: "ceph osd purge osd.0 --yes-i-really-mean-it"
+
+
+- hosts: osds
+ become: yes
+ tasks:
+
+  - name: zap /dev/sdb
+ command: "ceph-volume lvm zap /dev/sdb --destroy"
+ environment:
+ CEPH_VOLUME_DEBUG: 1
+
+
+ - name: zap /dev/sdc
+ command: "ceph-volume lvm zap /dev/sdc --destroy"
+ environment:
+ CEPH_VOLUME_DEBUG: 1
+
+ - name: batch create /dev/sdb and /dev/sdc again
+ command: "ceph-volume lvm batch --yes --bluestore --dmcrypt /dev/sdb /dev/sdc"
+ environment:
+ CEPH_VOLUME_DEBUG: 1
--- /dev/null
+
+- hosts: osds
+ become: yes
+ tasks:
+
+ - name: stop ceph-osd@1 daemon
+ service:
+ name: ceph-osd@1
+ state: stopped
+
+ - name: stop ceph-osd@0 daemon
+ service:
+ name: ceph-osd@0
+ state: stopped
+
+
+- hosts: mons
+ become: yes
+ tasks:
+
+ - name: destroy osd.1
+ command: "ceph osd purge osd.1 --yes-i-really-mean-it"
+
+ - name: destroy osd.0
+ command: "ceph osd purge osd.0 --yes-i-really-mean-it"
+
+
+- hosts: osds
+ become: yes
+ tasks:
+
+  - name: zap /dev/sdb
+ command: "ceph-volume lvm zap /dev/sdb --destroy"
+ environment:
+ CEPH_VOLUME_DEBUG: 1
+
+
+ - name: zap /dev/sdc
+ command: "ceph-volume lvm zap /dev/sdc --destroy"
+ environment:
+ CEPH_VOLUME_DEBUG: 1
+
+ - name: batch create /dev/sdb and /dev/sdc again
+ command: "ceph-volume lvm batch --yes --filestore /dev/sdb /dev/sdc"
+ environment:
+ CEPH_VOLUME_DEBUG: 1
--- /dev/null
+
+- hosts: osds
+ become: yes
+ tasks:
+
+ - name: stop ceph-osd@1 daemon
+ service:
+ name: ceph-osd@1
+ state: stopped
+
+ - name: stop ceph-osd@0 daemon
+ service:
+ name: ceph-osd@0
+ state: stopped
+
+
+- hosts: mons
+ become: yes
+ tasks:
+
+ - name: destroy osd.1
+ command: "ceph osd purge osd.1 --yes-i-really-mean-it"
+
+ - name: destroy osd.0
+ command: "ceph osd purge osd.0 --yes-i-really-mean-it"
+
+
+- hosts: osds
+ become: yes
+ tasks:
+
+  - name: zap /dev/sdb
+ command: "ceph-volume lvm zap /dev/sdb --destroy"
+ environment:
+ CEPH_VOLUME_DEBUG: 1
+
+
+ - name: zap /dev/sdc
+ command: "ceph-volume lvm zap /dev/sdc --destroy"
+ environment:
+ CEPH_VOLUME_DEBUG: 1
+
+ - name: batch create /dev/sdb and /dev/sdc again
+ command: "ceph-volume lvm batch --yes --filestore --dmcrypt /dev/sdb /dev/sdc"
+ environment:
+ CEPH_VOLUME_DEBUG: 1
--- /dev/null
+[tox]
+envlist = {centos7,xenial}-{bluestore,filestore}-{single_type,single_type_dmcrypt}
+skipsdist = True
+
+[testenv]
+whitelist_externals =
+ vagrant
+ bash
+ git
+ cp
+passenv=*
+setenv=
+ ANSIBLE_SSH_ARGS = -F {changedir}/vagrant_ssh_config
+ ANSIBLE_ACTION_PLUGINS = {envdir}/tmp/ceph-ansible/plugins/actions
+ ANSIBLE_STDOUT_CALLBACK = debug
+ ANSIBLE_RETRY_FILES_ENABLED = False
+ ANSIBLE_SSH_RETRIES = 5
+ VAGRANT_CWD = {changedir}
+ CEPH_VOLUME_DEBUG = 1
+deps=
+ ansible~=2.6,<2.7
+ testinfra
+ pytest-xdist
+ notario>=0.0.13
+changedir=
+ centos7-filestore-single_type: {toxinidir}/centos7/filestore/single-type
+ centos7-filestore-single_type_dmcrypt: {toxinidir}/centos7/filestore/single-type-dmcrypt
+ centos7-bluestore-single_type: {toxinidir}/centos7/bluestore/single-type
+ centos7-bluestore-single_type_dmcrypt: {toxinidir}/centos7/bluestore/single-type-dmcrypt
+ xenial-filestore-single_type: {toxinidir}/xenial/filestore/single-type
+ xenial-filestore-single_type_dmcrypt: {toxinidir}/xenial/filestore/single-type-dmcrypt
+ xenial-bluestore-single_type: {toxinidir}/xenial/bluestore/single-type
+ xenial-bluestore-single_type_dmcrypt: {toxinidir}/xenial/bluestore/single-type-dmcrypt
+commands=
+ git clone -b {env:CEPH_ANSIBLE_BRANCH:master} --single-branch https://github.com/ceph/ceph-ansible.git {envdir}/tmp/ceph-ansible
+
+ bash {toxinidir}/../scripts/vagrant_up.sh {env:VAGRANT_UP_FLAGS:"--no-provision"} {posargs:--provider=virtualbox}
+ bash {toxinidir}/../scripts/generate_ssh_config.sh {changedir}
+
+ cp {toxinidir}/../playbooks/deploy.yml {envdir}/tmp/ceph-ansible
+
+ # use ceph-ansible to deploy a ceph cluster on the vms
+ ansible-playbook -vv -i {changedir}/hosts {envdir}/tmp/ceph-ansible/deploy.yml --extra-vars "fetch_directory={changedir}/fetch ceph_dev_branch={env:CEPH_DEV_BRANCH:master} ceph_dev_sha1={env:CEPH_DEV_SHA1:latest} toxinidir={toxinidir}"
+
+ # prepare nodes for testing with testinfra
+ ansible-playbook -vv -i {changedir}/hosts {envdir}/tmp/ceph-ansible/tests/functional/setup.yml
+
+ # test cluster state using ceph-ansible tests
+ testinfra -n 4 --sudo -v --connection=ansible --ansible-inventory={changedir}/hosts {envdir}/tmp/ceph-ansible/tests/functional/tests
+
+ # reboot all vms - attempt
+ bash {toxinidir}/../scripts/vagrant_reload.sh {env:VAGRANT_UP_FLAGS:"--no-provision"} {posargs:--provider=virtualbox}
+
+ # retest to ensure cluster came back up correctly after rebooting
+ testinfra -n 4 --sudo -v --connection=ansible --ansible-inventory={changedir}/hosts {envdir}/tmp/ceph-ansible/tests/functional/tests
+
+  # destroy an OSD, zap its device and recreate it using its ID
+ ansible-playbook -vv -i {changedir}/hosts {changedir}/test.yml
+
+ # retest to ensure cluster came back up correctly
+ testinfra -n 4 --sudo -v --connection=ansible --ansible-inventory={changedir}/hosts {envdir}/tmp/ceph-ansible/tests/functional/tests
+
+ vagrant destroy {env:VAGRANT_DESTROY_FLAGS:"--force"}
--- /dev/null
+../../../../Vagrantfile
\ No newline at end of file
--- /dev/null
+---
+
+dmcrypt: True
+ceph_dev: True
+cluster: ceph
+public_network: "192.168.3.0/24"
+cluster_network: "192.168.4.0/24"
+monitor_interface: eth1
+osd_objectstore: "bluestore"
+osd_scenario: lvm
+ceph_origin: 'repository'
+ceph_repository: 'dev'
+copy_admin_key: false
+devices:
+ - /dev/sdb
+ - /dev/sdc
+os_tuning_params:
+ - { name: kernel.pid_max, value: 4194303 }
+ - { name: fs.file-max, value: 26234859 }
+ceph_conf_overrides:
+ global:
+ osd_pool_default_pg_num: 8
+ osd_pool_default_size: 1
--- /dev/null
+[mons]
+mon0
+
+[osds]
+osd0
+
+[mgrs]
+mon0
--- /dev/null
+../../../playbooks/test_bluestore_dmcrypt.yml
\ No newline at end of file
--- /dev/null
+---
+
+# DEFINE THE NUMBER OF VMS TO RUN
+mon_vms: 1
+osd_vms: 1
+mds_vms: 0
+rgw_vms: 0
+nfs_vms: 0
+rbd_mirror_vms: 0
+client_vms: 0
+iscsi_gw_vms: 0
+mgr_vms: 0
+
+# SUBNETS TO USE FOR THE VMS
+public_subnet: 192.168.3
+cluster_subnet: 192.168.4
+
+# MEMORY
+# set 1024 for CentOS
+memory: 512
+
+# Ethernet interface name
+# use eth1 for libvirt and ubuntu precise, enp0s8 for CentOS and ubuntu xenial
+eth: 'eth1'
+
+
+# VAGRANT BOX
+# Ceph boxes are *strongly* suggested. They are under better control and will
+# not get updated frequently unless required for build systems. These are (for
+# now):
+#
+# * ceph/ubuntu-xenial
+#
+# Ubuntu: ceph/ubuntu-xenial bento/ubuntu-16.04 or ubuntu/trusty64 or ubuntu/wily64
+# CentOS: bento/centos-7.1 or puppetlabs/centos-7.0-64-puppet
+# libvirt CentOS: centos/7
+# parallels Ubuntu: parallels/ubuntu-14.04
+# Debian: deb/jessie-amd64 - be careful the storage controller is named 'SATA Controller'
+# For more boxes have a look at:
+# - https://atlas.hashicorp.com/boxes/search?utf8=✓&sort=&provider=virtualbox&q=
+# - https://download.gluster.org/pub/gluster/purpleidea/vagrant/
+vagrant_box: ceph/ubuntu-xenial
+#ssh_private_key_path: "~/.ssh/id_rsa"
+# The sync directory changes based on vagrant box
+# Set to /home/vagrant/sync for Centos/7, /home/{ user }/vagrant for openstack and defaults to /vagrant
+#vagrant_sync_dir: /home/vagrant/sync
+#vagrant_sync_dir: /
+# Disables synced folder creation. Not needed for testing, will skip mounting
+# the vagrant directory on the remote box regardless of the provider.
+vagrant_disable_synced_folder: true
+# VAGRANT URL
+# This is a URL to download an image from an alternate location. vagrant_box
+# above should be set to the filename of the image.
+# Fedora virtualbox: https://download.fedoraproject.org/pub/fedora/linux/releases/22/Cloud/x86_64/Images/Fedora-Cloud-Base-Vagrant-22-20150521.x86_64.vagrant-virtualbox.box
+# Fedora libvirt: https://download.fedoraproject.org/pub/fedora/linux/releases/22/Cloud/x86_64/Images/Fedora-Cloud-Base-Vagrant-22-20150521.x86_64.vagrant-libvirt.box
+# vagrant_box_url: https://download.fedoraproject.org/pub/fedora/linux/releases/22/Cloud/x86_64/Images/Fedora-Cloud-Base-Vagrant-22-20150521.x86_64.vagrant-virtualbox.box
--- /dev/null
+../../../../Vagrantfile
\ No newline at end of file
--- /dev/null
+---
+
+ceph_dev: True
+cluster: ceph
+public_network: "192.168.3.0/24"
+cluster_network: "192.168.4.0/24"
+monitor_interface: eth1
+osd_objectstore: "bluestore"
+osd_scenario: lvm
+ceph_origin: 'repository'
+ceph_repository: 'dev'
+copy_admin_key: false
+devices:
+ - /dev/sdb
+ - /dev/sdc
+os_tuning_params:
+ - { name: kernel.pid_max, value: 4194303 }
+ - { name: fs.file-max, value: 26234859 }
+ceph_conf_overrides:
+ global:
+ osd_pool_default_pg_num: 8
+ osd_pool_default_size: 1
--- /dev/null
+[mons]
+mon0
+
+[osds]
+osd0
+
+[mgrs]
+mon0
--- /dev/null
+../../../playbooks/test_bluestore.yml
\ No newline at end of file
--- /dev/null
+---
+
+# DEFINE THE NUMBER OF VMS TO RUN
+mon_vms: 1
+osd_vms: 1
+mds_vms: 0
+rgw_vms: 0
+nfs_vms: 0
+rbd_mirror_vms: 0
+client_vms: 0
+iscsi_gw_vms: 0
+mgr_vms: 0
+
+# SUBNETS TO USE FOR THE VMS
+public_subnet: 192.168.3
+cluster_subnet: 192.168.4
+
+# MEMORY
+# set 1024 for CentOS
+memory: 512
+
+# Ethernet interface name
+# use eth1 for libvirt and ubuntu precise, enp0s8 for CentOS and ubuntu xenial
+eth: 'eth1'
+
+
+# VAGRANT BOX
+# Ceph boxes are *strongly* suggested. They are under better control and will
+# not get updated frequently unless required for build systems. These are (for
+# now):
+#
+# * ceph/ubuntu-xenial
+#
+# Ubuntu: ceph/ubuntu-xenial bento/ubuntu-16.04 or ubuntu/trusty64 or ubuntu/wily64
+# CentOS: bento/centos-7.1 or puppetlabs/centos-7.0-64-puppet
+# libvirt CentOS: centos/7
+# parallels Ubuntu: parallels/ubuntu-14.04
+# Debian: deb/jessie-amd64 - be careful the storage controller is named 'SATA Controller'
+# For more boxes have a look at:
+# - https://atlas.hashicorp.com/boxes/search?utf8=✓&sort=&provider=virtualbox&q=
+# - https://download.gluster.org/pub/gluster/purpleidea/vagrant/
+vagrant_box: ceph/ubuntu-xenial
+#ssh_private_key_path: "~/.ssh/id_rsa"
+# The sync directory changes based on vagrant box
+# Set to /home/vagrant/sync for Centos/7, /home/{ user }/vagrant for openstack and defaults to /vagrant
+#vagrant_sync_dir: /home/vagrant/sync
+#vagrant_sync_dir: /
+# Disables synced folder creation. Not needed for testing, will skip mounting
+# the vagrant directory on the remote box regardless of the provider.
+vagrant_disable_synced_folder: true
+# VAGRANT URL
+# This is a URL to download an image from an alternate location. vagrant_box
+# above should be set to the filename of the image.
+# Fedora virtualbox: https://download.fedoraproject.org/pub/fedora/linux/releases/22/Cloud/x86_64/Images/Fedora-Cloud-Base-Vagrant-22-20150521.x86_64.vagrant-virtualbox.box
+# Fedora libvirt: https://download.fedoraproject.org/pub/fedora/linux/releases/22/Cloud/x86_64/Images/Fedora-Cloud-Base-Vagrant-22-20150521.x86_64.vagrant-libvirt.box
+# vagrant_box_url: https://download.fedoraproject.org/pub/fedora/linux/releases/22/Cloud/x86_64/Images/Fedora-Cloud-Base-Vagrant-22-20150521.x86_64.vagrant-virtualbox.box
--- /dev/null
+../../../../Vagrantfile
\ No newline at end of file
--- /dev/null
+---
+
+dmcrypt: True
+ceph_dev: True
+cluster: ceph
+public_network: "192.168.3.0/24"
+cluster_network: "192.168.4.0/24"
+monitor_interface: eth1
+osd_objectstore: "filestore"
+osd_scenario: lvm
+ceph_origin: 'repository'
+ceph_repository: 'dev'
+copy_admin_key: false
+devices:
+ - /dev/sdb
+ - /dev/sdc
+os_tuning_params:
+ - { name: kernel.pid_max, value: 4194303 }
+ - { name: fs.file-max, value: 26234859 }
+ceph_conf_overrides:
+ global:
+ osd_pool_default_pg_num: 8
+ osd_pool_default_size: 1
+ osd:
+ osd_journal_size: 2048
--- /dev/null
+[mons]
+mon0
+
+[osds]
+osd0
+
+[mgrs]
+mon0
--- /dev/null
+../../../playbooks/test_filestore_dmcrypt.yml
\ No newline at end of file
--- /dev/null
+---
+
+# DEFINE THE NUMBER OF VMS TO RUN
+mon_vms: 1
+osd_vms: 1
+mds_vms: 0
+rgw_vms: 0
+nfs_vms: 0
+rbd_mirror_vms: 0
+client_vms: 0
+iscsi_gw_vms: 0
+mgr_vms: 0
+
+# SUBNETS TO USE FOR THE VMS
+public_subnet: 192.168.3
+cluster_subnet: 192.168.4
+
+# MEMORY
+# set 1024 for CentOS
+memory: 512
+
+# Ethernet interface name
+# use eth1 for libvirt and ubuntu precise, enp0s8 for CentOS and ubuntu xenial
+eth: 'eth1'
+
+
+# VAGRANT BOX
+# Ceph boxes are *strongly* suggested. They are under better control and will
+# not get updated frequently unless required for build systems. These are (for
+# now):
+#
+# * ceph/ubuntu-xenial
+#
+# Ubuntu: ceph/ubuntu-xenial bento/ubuntu-16.04 or ubuntu/trusty64 or ubuntu/wily64
+# CentOS: bento/centos-7.1 or puppetlabs/centos-7.0-64-puppet
+# libvirt CentOS: centos/7
+# parallels Ubuntu: parallels/ubuntu-14.04
+# Debian: deb/jessie-amd64 - be careful the storage controller is named 'SATA Controller'
+# For more boxes have a look at:
+# - https://atlas.hashicorp.com/boxes/search?utf8=✓&sort=&provider=virtualbox&q=
+# - https://download.gluster.org/pub/gluster/purpleidea/vagrant/
+vagrant_box: ceph/ubuntu-xenial
+#ssh_private_key_path: "~/.ssh/id_rsa"
+# The sync directory changes based on vagrant box
+# Set to /home/vagrant/sync for Centos/7, /home/{ user }/vagrant for openstack and defaults to /vagrant
+#vagrant_sync_dir: /home/vagrant/sync
+#vagrant_sync_dir: /
+# Disables synced folder creation. Not needed for testing, will skip mounting
+# the vagrant directory on the remote box regardless of the provider.
+vagrant_disable_synced_folder: true
+# VAGRANT URL
+# This is a URL to download an image from an alternate location. vagrant_box
+# above should be set to the filename of the image.
+# Fedora virtualbox: https://download.fedoraproject.org/pub/fedora/linux/releases/22/Cloud/x86_64/Images/Fedora-Cloud-Base-Vagrant-22-20150521.x86_64.vagrant-virtualbox.box
+# Fedora libvirt: https://download.fedoraproject.org/pub/fedora/linux/releases/22/Cloud/x86_64/Images/Fedora-Cloud-Base-Vagrant-22-20150521.x86_64.vagrant-libvirt.box
+# vagrant_box_url: https://download.fedoraproject.org/pub/fedora/linux/releases/22/Cloud/x86_64/Images/Fedora-Cloud-Base-Vagrant-22-20150521.x86_64.vagrant-virtualbox.box
--- /dev/null
+../../../../Vagrantfile
\ No newline at end of file
--- /dev/null
+---
+
+ceph_dev: True
+cluster: ceph
+public_network: "192.168.3.0/24"
+cluster_network: "192.168.4.0/24"
+monitor_interface: eth1
+osd_objectstore: "filestore"
+osd_scenario: lvm
+ceph_origin: 'repository'
+ceph_repository: 'dev'
+copy_admin_key: false
+devices:
+ - /dev/sdb
+ - /dev/sdc
+os_tuning_params:
+ - { name: kernel.pid_max, value: 4194303 }
+ - { name: fs.file-max, value: 26234859 }
+ceph_conf_overrides:
+ global:
+ osd_pool_default_pg_num: 8
+ osd_pool_default_size: 1
--- /dev/null
+[mons]
+mon0
+
+[osds]
+osd0
+
+[mgrs]
+mon0
--- /dev/null
+../../../playbooks/test_filestore.yml
\ No newline at end of file
--- /dev/null
+---
+
+# DEFINE THE NUMBER OF VMS TO RUN
+mon_vms: 1
+osd_vms: 1
+mds_vms: 0
+rgw_vms: 0
+nfs_vms: 0
+rbd_mirror_vms: 0
+client_vms: 0
+iscsi_gw_vms: 0
+mgr_vms: 0
+
+# SUBNETS TO USE FOR THE VMS
+public_subnet: 192.168.3
+cluster_subnet: 192.168.4
+
+# MEMORY
+# set 1024 for CentOS
+memory: 512
+
+# Ethernet interface name
+# use eth1 for libvirt and ubuntu precise, enp0s8 for CentOS and ubuntu xenial
+eth: 'eth1'
+
+
+# VAGRANT BOX
+# Ceph boxes are *strongly* suggested. They are under better control and will
+# not get updated frequently unless required for build systems. These are (for
+# now):
+#
+# * ceph/ubuntu-xenial
+#
+# Ubuntu: ceph/ubuntu-xenial bento/ubuntu-16.04 or ubuntu/trusty64 or ubuntu/wily64
+# CentOS: bento/centos-7.1 or puppetlabs/centos-7.0-64-puppet
+# libvirt CentOS: centos/7
+# parallels Ubuntu: parallels/ubuntu-14.04
+# Debian: deb/jessie-amd64 - be careful the storage controller is named 'SATA Controller'
+# For more boxes have a look at:
+# - https://atlas.hashicorp.com/boxes/search?utf8=✓&sort=&provider=virtualbox&q=
+# - https://download.gluster.org/pub/gluster/purpleidea/vagrant/
+vagrant_box: ceph/ubuntu-xenial
+#ssh_private_key_path: "~/.ssh/id_rsa"
+# The sync directory changes based on vagrant box
+# Set to /home/vagrant/sync for Centos/7, /home/{ user }/vagrant for openstack and defaults to /vagrant
+#vagrant_sync_dir: /home/vagrant/sync
+#vagrant_sync_dir: /
+# Disables synced folder creation. Not needed for testing, will skip mounting
+# the vagrant directory on the remote box regardless of the provider.
+vagrant_disable_synced_folder: true
+# VAGRANT URL
+# This is a URL to download an image from an alternate location. vagrant_box
+# above should be set to the filename of the image.
+# Fedora virtualbox: https://download.fedoraproject.org/pub/fedora/linux/releases/22/Cloud/x86_64/Images/Fedora-Cloud-Base-Vagrant-22-20150521.x86_64.vagrant-virtualbox.box
+# Fedora libvirt: https://download.fedoraproject.org/pub/fedora/linux/releases/22/Cloud/x86_64/Images/Fedora-Cloud-Base-Vagrant-22-20150521.x86_64.vagrant-libvirt.box
+# vagrant_box_url: https://download.fedoraproject.org/pub/fedora/linux/releases/22/Cloud/x86_64/Images/Fedora-Cloud-Base-Vagrant-22-20150521.x86_64.vagrant-virtualbox.box
[osds]
osd0
+
+[mgrs]
+mon0
[osds]
osd0
+
+[mgrs]
+mon0
command: "ceph-volume lvm activate --all"
environment:
CEPH_VOLUME_DEBUG: 1
+
+ - name: list all OSDs
+ command: "ceph-volume lvm list"
+ environment:
+ CEPH_VOLUME_DEBUG: 1
[osds]
osd0
+
+[mgrs]
+mon0
[osds]
osd0
+
+[mgrs]
+mon0
command: "ceph-volume lvm activate --filestore --all"
environment:
CEPH_VOLUME_DEBUG: 1
+
+ - name: list all OSDs
+ command: "ceph-volume lvm list"
+ environment:
+ CEPH_VOLUME_DEBUG: 1
command: "ceph-volume lvm activate --all"
environment:
CEPH_VOLUME_DEBUG: 1
+
+ - name: list all OSDs
+ command: "ceph-volume lvm list"
+ environment:
+ CEPH_VOLUME_DEBUG: 1
command: "ceph-volume lvm activate --filestore --all"
environment:
CEPH_VOLUME_DEBUG: 1
+
+ - name: list all OSDs
+ command: "ceph-volume lvm list"
+ environment:
+ CEPH_VOLUME_DEBUG: 1
vagrant
bash
git
+ cp
+ sleep
passenv=*
setenv=
ANSIBLE_SSH_ARGS = -F {changedir}/vagrant_ssh_config
VAGRANT_CWD = {changedir}
CEPH_VOLUME_DEBUG = 1
deps=
- ansible==2.4.1
- testinfra==1.7.1
+ ansible~=2.6,<2.7
+ testinfra
pytest-xdist
notario>=0.0.13
changedir=
# but the master branch doesn't pin dependencies so we can't guarantee to work correctly
#pip install -r {envdir}/tmp/ceph-ansible/requirements.txt
- vagrant up --no-provision {posargs:--provider=virtualbox}
+ bash {toxinidir}/../scripts/vagrant_up.sh {env:VAGRANT_UP_FLAGS:"--no-provision"} {posargs:--provider=virtualbox}
bash {toxinidir}/../scripts/generate_ssh_config.sh {changedir}
# create logical volumes to test with on the vms
# ad-hoc/local test setup for lvm
ansible-playbook -vv -i {changedir}/hosts {changedir}/setup.yml
+ cp {toxinidir}/../playbooks/deploy.yml {envdir}/tmp/ceph-ansible
+
# use ceph-ansible to deploy a ceph cluster on the vms
- ansible-playbook -vv -i {changedir}/hosts {envdir}/tmp/ceph-ansible/site.yml.sample --extra-vars "fetch_directory={changedir}/fetch ceph_dev_branch={env:CEPH_DEV_BRANCH:master} ceph_dev_sha1={env:CEPH_DEV_SHA1:latest}"
+ ansible-playbook -vv -i {changedir}/hosts {envdir}/tmp/ceph-ansible/deploy.yml --extra-vars "fetch_directory={changedir}/fetch ceph_dev_branch={env:CEPH_DEV_BRANCH:master} ceph_dev_sha1={env:CEPH_DEV_SHA1:latest} toxinidir={toxinidir}"
# prepare nodes for testing with testinfra
ansible-playbook -vv -i {changedir}/hosts {envdir}/tmp/ceph-ansible/tests/functional/setup.yml
[osds]
osd0
+
+[mgrs]
+mon0
[osds]
osd0
+
+[mgrs]
+mon0
command: "ceph-volume lvm activate --all"
environment:
CEPH_VOLUME_DEBUG: 1
+
+ - name: list all OSDs
+ command: "ceph-volume lvm list"
+ environment:
+ CEPH_VOLUME_DEBUG: 1
[osds]
osd0
+
+[mgrs]
+mon0
[osds]
osd0
+
+[mgrs]
+mon0
command: "ceph-volume lvm activate --filestore --all"
environment:
CEPH_VOLUME_DEBUG: 1
+
+ - name: list all OSDs
+ command: "ceph-volume lvm list"
+ environment:
+ CEPH_VOLUME_DEBUG: 1
--- /dev/null
+---
+# Defines deployment design and assigns role to server groups
+
+- hosts:
+ - mons
+ - osds
+ - mgrs
+
+ gather_facts: false
+ any_errors_fatal: true
+ become: true
+
+ tags:
+ - always
+
+ vars:
+ delegate_facts_host: True
+
+ pre_tasks:
+ # If we can't get python2 installed before any module is used we will fail
+ # so just try what we can to get it installed
+ - name: check for python2
+ stat:
+ path: /usr/bin/python
+ ignore_errors: yes
+ register: systempython2
+
+ - name: install python2 for debian based systems
+ raw: sudo apt-get -y install python-simplejson
+ ignore_errors: yes
+ when:
+ - systempython2.stat is undefined or systempython2.stat.exists == false
+
+ - name: install python2 for fedora
+ raw: sudo dnf -y install python creates=/usr/bin/python
+ ignore_errors: yes
+ when:
+ - systempython2.stat is undefined or systempython2.stat.exists == false
+
+ - name: install python2 for opensuse
+ raw: sudo zypper -n install python-base creates=/usr/bin/python2.7
+ ignore_errors: yes
+ when:
+ - systempython2.stat is undefined or systempython2.stat.exists == false
+
+ - name: gather facts
+ setup:
+ when:
+ - not delegate_facts_host | bool
+
+ - name: gather and delegate facts
+ setup:
+ delegate_to: "{{ item }}"
+ delegate_facts: True
+ with_items: "{{ groups['all'] }}"
+ run_once: true
+ when:
+ - delegate_facts_host | bool
+
+ - name: install required packages for fedora > 23
+ raw: sudo dnf -y install python2-dnf libselinux-python ntp
+ when:
+ - ansible_distribution == 'Fedora'
+ - ansible_distribution_major_version|int >= 23
+
+ roles:
+ - ceph-defaults
+ - ceph-validate
+
+- hosts:
+ - mons
+ - osds
+ - mgrs
+ gather_facts: false
+ become: True
+ roles:
+ - role: ceph-defaults
+ tags: ['ceph_update_config']
+ - role: ceph-common
+ - role: ceph-config
+ tags: ['ceph_update_config']
+
+- hosts: mons
+ gather_facts: false
+ become: True
+ roles:
+ - role: ceph-defaults
+ - role: ceph-common
+ - role: ceph-mon
+
+- hosts: mgrs
+ gather_facts: false
+ become: True
+ roles:
+ - role: ceph-defaults
+ - role: ceph-common
+ - role: ceph-mgr
+
+- hosts: osds
+ gather_facts: false
+ become: True
+ tasks:
+ - name: rsync ceph-volume to test nodes on centos
+ synchronize:
+ src: "{{ toxinidir}}/../../../../ceph_volume"
+ dest: "/usr/lib/python2.7/site-packages"
+ use_ssh_args: true
+ when: ansible_os_family == "RedHat"
+
+ - name: rsync ceph-volume to test nodes on ubuntu
+ synchronize:
+ src: "{{ toxinidir}}/../../../../ceph_volume"
+ dest: "/usr/lib/python2.7/dist-packages"
+ use_ssh_args: true
+ when: ansible_os_family == "Debian"
+
+- hosts: osds
+ gather_facts: false
+ become: True
+ roles:
+ - role: ceph-defaults
+ - role: ceph-common
+ - role: ceph-osd
--- /dev/null
+#!/bin/bash
+
+retries=0
+until [ $retries -ge 5 ]
+do
+ echo "Attempting to start VMs. Attempts: $retries"
+ timeout 10m vagrant up "$@" && break
+ retries=$[$retries+1]
+ sleep 5
+done
+
+sleep 10
bash
git
sleep
+ cp
passenv=*
setenv=
ANSIBLE_SSH_ARGS = -F {changedir}/vagrant_ssh_config
VAGRANT_CWD = {changedir}
CEPH_VOLUME_DEBUG = 1
deps=
- ansible==2.4.1
- testinfra==1.7.1
+ ansible~=2.6,<2.7
+ testinfra
pytest-xdist
notario>=0.0.13
changedir=
# but the master branch doesn't pin dependencies so we can't guarantee to work correctly
#pip install -r {envdir}/tmp/ceph-ansible/requirements.txt
- vagrant up --no-provision {posargs:--provider=virtualbox}
+ bash {toxinidir}/../scripts/vagrant_up.sh {env:VAGRANT_UP_FLAGS:"--no-provision"} {posargs:--provider=virtualbox}
bash {toxinidir}/../scripts/generate_ssh_config.sh {changedir}
+ cp {toxinidir}/../playbooks/deploy.yml {envdir}/tmp/ceph-ansible
+
# use ceph-ansible to deploy a ceph cluster on the vms
- ansible-playbook -vv -i {changedir}/hosts {envdir}/tmp/ceph-ansible/site.yml.sample --extra-vars "fetch_directory={changedir}/fetch ceph_dev_branch={env:CEPH_DEV_BRANCH:master} ceph_dev_sha1={env:CEPH_DEV_SHA1:latest}"
+ ansible-playbook -vv -i {changedir}/hosts {envdir}/tmp/ceph-ansible/deploy.yml --extra-vars "fetch_directory={changedir}/fetch ceph_dev_branch={env:CEPH_DEV_BRANCH:master} ceph_dev_sha1={env:CEPH_DEV_SHA1:latest} toxinidir={toxinidir}"
# prepare nodes for testing with testinfra
ansible-playbook -vv -i {changedir}/hosts {envdir}/tmp/ceph-ansible/tests/functional/setup.yml
assert 'ls' in log_lines
assert 'stderr' in log_lines
assert out == ''
+
+
+class TestFunctionalCall(object):
+
+ def test_stdin(self):
+ process.call(['xargs', 'ls'], stdin="echo '/'")
+
+ def test_unicode_encoding(self):
+ process.call(['echo', u'\xd0'])
+
+ def test_unicode_encoding_stdin(self):
+ process.call(['echo'], stdin=u'\xd0'.encode('utf-8'))
+
+
+class TestFunctionalRun(object):
+
+ def test_log_descriptors(self):
+ process.run(['ls', '-l'])
)
stdout, stderr = capsys.readouterr()
assert 'Cannot use --filestore (filestore) with --bluestore (bluestore)' in stdout
+
+
+class TestValidDevice(object):
+
+ def setup(self):
+ self.validator = arg_validators.ValidDevice()
+
+ def test_path_is_valid(self, fake_call):
+ result = self.validator('/')
+ assert result.abspath == '/'
+
+ def test_path_is_invalid(self, fake_call):
+ with pytest.raises(argparse.ArgumentError):
+ self.validator('/device/does/not/exist')
--- /dev/null
+from ceph_volume.util import device
+from ceph_volume.api import lvm as api
+
+
+class TestDevice(object):
+
+ def test_sys_api(self, device_info):
+ data = {"/dev/sda": {"foo": "bar"}}
+ device_info(devices=data)
+ disk = device.Device("/dev/sda")
+ assert disk.sys_api
+ assert "foo" in disk.sys_api
+
+ def test_is_lv(self, device_info):
+ data = {"lv_path": "vg/lv"}
+ device_info(lv=data)
+ disk = device.Device("vg/lv")
+ assert disk.is_lv
+
+ def test_is_device(self, device_info):
+ data = {"/dev/sda": {"foo": "bar"}}
+ lsblk = {"TYPE": "device"}
+ device_info(devices=data, lsblk=lsblk)
+ disk = device.Device("/dev/sda")
+ assert disk.is_device
+
+ def test_is_partition(self, device_info, pvolumes):
+ data = {"/dev/sda": {"foo": "bar"}}
+ lsblk = {"TYPE": "part"}
+ device_info(devices=data, lsblk=lsblk)
+ disk = device.Device("/dev/sda")
+ assert disk.is_partition
+
+ def test_is_not_lvm_memeber(self, device_info, pvolumes):
+ data = {"/dev/sda": {"foo": "bar"}}
+ lsblk = {"TYPE": "part"}
+ device_info(devices=data, lsblk=lsblk)
+ disk = device.Device("/dev/sda")
+ assert not disk.is_lvm_member
+
+ def test_is_lvm_memeber(self, device_info, pvolumes):
+ data = {"/dev/sda": {"foo": "bar"}}
+ lsblk = {"TYPE": "part"}
+ device_info(devices=data, lsblk=lsblk)
+ disk = device.Device("/dev/sda")
+ assert not disk.is_lvm_member
+
+ def test_is_mapper_device(self, device_info):
+ device_info()
+ disk = device.Device("/dev/mapper/foo")
+ assert disk.is_mapper
+
+ def test_is_not_mapper_device(self, device_info):
+ device_info()
+ disk = device.Device("/dev/sda")
+ assert not disk.is_mapper
+
+ def test_pv_api(self, device_info, pvolumes, monkeypatch):
+ FooPVolume = api.PVolume(pv_name='/dev/sda', pv_uuid="0000", pv_tags={}, vg_name="vg")
+ pvolumes.append(FooPVolume)
+ monkeypatch.setattr(api, 'PVolumes', lambda: pvolumes)
+ data = {"/dev/sda": {"foo": "bar"}}
+ lsblk = {"TYPE": "part"}
+ device_info(devices=data, lsblk=lsblk)
+ disk = device.Device("/dev/sda")
+ assert disk.pvs_api
+import os
+import pytest
from ceph_volume.util import disk
result = disk.device_family('sdaa5')
for parsed in result:
assert parsed['NAME'] in names
+
+
+class TestMapDevPaths(object):
+
+ def test_errors_return_empty_mapping(self, tmpdir):
+ bad_dir = os.path.join(str(tmpdir), 'nonexisting')
+ assert disk._map_dev_paths(bad_dir) == {}
+
+ def test_base_name_and_abspath(self, tmpfile):
+ sda_path = tmpfile(name='sda', contents='')
+ directory = os.path.dirname(sda_path)
+ result = disk._map_dev_paths(directory)
+ assert len(result.keys()) == 1
+ assert result['sda'] == sda_path
+
+ def test_abspath_included(self, tmpfile):
+ sda_path = tmpfile(name='sda', contents='')
+ directory = os.path.dirname(sda_path)
+ result = disk._map_dev_paths(directory, include_abspath=True)
+ assert sorted(result.keys()) == sorted(['sda', sda_path])
+ assert result['sda'] == sda_path
+ assert result[sda_path] == 'sda'
+
+ def test_realpath_included(self, tmpfile):
+ sda_path = tmpfile(name='sda', contents='')
+ directory = os.path.dirname(sda_path)
+ dm_path = os.path.join(directory, 'dm-0')
+ os.symlink(sda_path, os.path.join(directory, 'dm-0'))
+ result = disk._map_dev_paths(directory, include_realpath=True)
+ assert sorted(result.keys()) == sorted(['sda', 'dm-0'])
+ assert result['sda'] == dm_path
+ assert result['dm-0'] == dm_path
+
+ def test_absolute_and_realpath_included(self, tmpfile):
+ dm_path = tmpfile(name='dm-0', contents='')
+ directory = os.path.dirname(dm_path)
+ sda_path = os.path.join(directory, 'sda')
+ os.symlink(sda_path, os.path.join(directory, 'sda'))
+ result = disk._map_dev_paths(directory, include_realpath=True, include_abspath=True)
+ assert sorted(result.keys()) == sorted([dm_path, sda_path, 'sda', 'dm-0'])
+ assert result['sda'] == sda_path
+ assert result['dm-0'] == dm_path
+ assert result[sda_path] == sda_path
+ assert result[dm_path] == 'dm-0'
+
+
+class TestGetBlockDevs(object):
+
+ def test_loop_devices_are_missing(self, tmpfile):
+ path = os.path.dirname(tmpfile(name='loop0', contents=''))
+ result = disk.get_block_devs(sys_block_path=path)
+ assert result == []
+
+ def test_loop_devices_are_included(self, tmpfile):
+ path = os.path.dirname(tmpfile(name='loop0', contents=''))
+ result = disk.get_block_devs(sys_block_path=path, skip_loop=False)
+ assert len(result) == 1
+ assert result == ['loop0']
+
+
+class TestGetDevDevs(object):
+
+ def test_abspaths_are_included(self, tmpfile):
+ sda_path = tmpfile(name='sda', contents='')
+ directory = os.path.dirname(sda_path)
+ result = disk.get_dev_devs(directory)
+ assert sorted(result.keys()) == sorted(['sda', sda_path])
+ assert result['sda'] == sda_path
+ assert result[sda_path] == 'sda'
+
+
+class TestGetMapperDevs(object):
+
+ def test_abspaths_and_realpaths_are_included(self, tmpfile):
+ dm_path = tmpfile(name='dm-0', contents='')
+ directory = os.path.dirname(dm_path)
+ sda_path = os.path.join(directory, 'sda')
+ os.symlink(sda_path, os.path.join(directory, 'sda'))
+ result = disk.get_mapper_devs(directory)
+ assert sorted(result.keys()) == sorted([dm_path, sda_path, 'sda', 'dm-0'])
+ assert result['sda'] == sda_path
+ assert result['dm-0'] == dm_path
+ assert result[sda_path] == sda_path
+ assert result[dm_path] == 'dm-0'
+
+
+class TestHumanReadableSize(object):
+
+ def test_bytes(self):
+ result = disk.human_readable_size(800)
+ assert result == '800.00 B'
+
+ def test_kilobytes(self):
+ result = disk.human_readable_size(800*1024)
+ assert result == '800.00 KB'
+
+ def test_megabytes(self):
+ result = disk.human_readable_size(800*1024*1024)
+ assert result == '800.00 MB'
+
+ def test_gigabytes(self):
+ result = disk.human_readable_size(8.19*1024*1024*1024)
+ assert result == '8.19 GB'
+
+ def test_terabytes(self):
+ result = disk.human_readable_size(81.2*1024*1024*1024*1024)
+ assert result == '81.20 TB'
+
+
+class TestGetDevices(object):
+
+ def setup_paths(self, tmpdir):
+ paths = []
+ for directory in ['block', 'dev', 'mapper']:
+ path = os.path.join(str(tmpdir), directory)
+ paths.append(path)
+ os.makedirs(path)
+ return paths
+
+ def test_no_devices_are_found(self, tmpdir):
+ result = disk.get_devices(
+ _sys_block_path=str(tmpdir),
+ _dev_path=str(tmpdir),
+ _mapper_path=str(tmpdir))
+ assert result == {}
+
+ def test_sda_block_is_found(self, tmpfile, tmpdir):
+ block_path, dev_path, mapper_path = self.setup_paths(tmpdir)
+ dev_sda_path = os.path.join(dev_path, 'sda')
+ os.makedirs(os.path.join(block_path, 'sda'))
+ os.makedirs(dev_sda_path)
+ result = disk.get_devices(
+ _sys_block_path=block_path,
+ _dev_path=dev_path,
+ _mapper_path=mapper_path)
+ assert len(result.keys()) == 1
+ assert result[dev_sda_path]['human_readable_size'] == '0.00 B'
+ assert result[dev_sda_path]['model'] == ''
+ assert result[dev_sda_path]['partitions'] == {}
+
+ def test_sda_is_removable_gets_skipped(self, tmpfile, tmpdir):
+ block_path, dev_path, mapper_path = self.setup_paths(tmpdir)
+ dev_sda_path = os.path.join(dev_path, 'sda')
+ block_sda_path = os.path.join(block_path, 'sda')
+ os.makedirs(block_sda_path)
+ os.makedirs(dev_sda_path)
+
+ tmpfile('removable', contents='1', directory=block_sda_path)
+ result = disk.get_devices(
+ _sys_block_path=block_path,
+ _dev_path=dev_path,
+ _mapper_path=mapper_path)
+ assert result == {}
+
+ def test_dm_device_is_not_used(self, monkeypatch, tmpdir):
+ # the link to the mapper is used instead
+ monkeypatch.setattr(disk.lvm, 'is_lv', lambda: True)
+ block_path, dev_path, mapper_path = self.setup_paths(tmpdir)
+ dev_dm_path = os.path.join(dev_path, 'dm-0')
+ ceph_data_path = os.path.join(mapper_path, 'ceph-data')
+ os.symlink(dev_dm_path, ceph_data_path)
+ block_dm_path = os.path.join(block_path, 'dm-0')
+ os.makedirs(block_dm_path)
+
+ result = disk.get_devices(
+ _sys_block_path=block_path,
+ _dev_path=dev_path,
+ _mapper_path=mapper_path)
+ result = list(result.keys())
+ assert len(result) == 1
+ assert result == [ceph_data_path]
+
+ def test_sda_size(self, tmpfile, tmpdir):
+ block_path, dev_path, mapper_path = self.setup_paths(tmpdir)
+ block_sda_path = os.path.join(block_path, 'sda')
+ dev_sda_path = os.path.join(dev_path, 'sda')
+ os.makedirs(block_sda_path)
+ os.makedirs(dev_sda_path)
+ tmpfile('size', '1024', directory=block_sda_path)
+ result = disk.get_devices(
+ _sys_block_path=block_path,
+ _dev_path=dev_path,
+ _mapper_path=mapper_path)
+ assert list(result.keys()) == [dev_sda_path]
+ assert result[dev_sda_path]['human_readable_size'] == '512.00 KB'
+
+ def test_sda_sectorsize_fallsback(self, tmpfile, tmpdir):
+ # if no sectorsize, it will use queue/hw_sector_size
+ block_path, dev_path, mapper_path = self.setup_paths(tmpdir)
+ block_sda_path = os.path.join(block_path, 'sda')
+ sda_queue_path = os.path.join(block_sda_path, 'queue')
+ dev_sda_path = os.path.join(dev_path, 'sda')
+ os.makedirs(block_sda_path)
+ os.makedirs(sda_queue_path)
+ os.makedirs(dev_sda_path)
+ tmpfile('hw_sector_size', contents='1024', directory=sda_queue_path)
+ result = disk.get_devices(
+ _sys_block_path=block_path,
+ _dev_path=dev_path,
+ _mapper_path=mapper_path)
+ assert list(result.keys()) == [dev_sda_path]
+ assert result[dev_sda_path]['sectorsize'] == '1024'
+
+ def test_sda_sectorsize_from_logical_block(self, tmpfile, tmpdir):
+ block_path, dev_path, mapper_path = self.setup_paths(tmpdir)
+ block_sda_path = os.path.join(block_path, 'sda')
+ sda_queue_path = os.path.join(block_sda_path, 'queue')
+ dev_sda_path = os.path.join(dev_path, 'sda')
+ os.makedirs(block_sda_path)
+ os.makedirs(sda_queue_path)
+ os.makedirs(dev_sda_path)
+ tmpfile('logical_block_size', contents='99', directory=sda_queue_path)
+ result = disk.get_devices(
+ _sys_block_path=block_path,
+ _dev_path=dev_path,
+ _mapper_path=mapper_path)
+ assert result[dev_sda_path]['sectorsize'] == '99'
+
+ def test_sda_sectorsize_does_not_fallback(self, tmpfile, tmpdir):
+ block_path, dev_path, mapper_path = self.setup_paths(tmpdir)
+ block_sda_path = os.path.join(block_path, 'sda')
+ sda_queue_path = os.path.join(block_sda_path, 'queue')
+ dev_sda_path = os.path.join(dev_path, 'sda')
+ os.makedirs(block_sda_path)
+ os.makedirs(sda_queue_path)
+ os.makedirs(dev_sda_path)
+ tmpfile('logical_block_size', contents='99', directory=sda_queue_path)
+ tmpfile('hw_sector_size', contents='1024', directory=sda_queue_path)
+ result = disk.get_devices(
+ _sys_block_path=block_path,
+ _dev_path=dev_path,
+ _mapper_path=mapper_path)
+ assert result[dev_sda_path]['sectorsize'] == '99'
+
+ def test_is_rotational(self, tmpfile, tmpdir):
+ block_path, dev_path, mapper_path = self.setup_paths(tmpdir)
+ block_sda_path = os.path.join(block_path, 'sda')
+ sda_queue_path = os.path.join(block_sda_path, 'queue')
+ dev_sda_path = os.path.join(dev_path, 'sda')
+ os.makedirs(block_sda_path)
+ os.makedirs(sda_queue_path)
+ os.makedirs(dev_sda_path)
+ tmpfile('rotational', contents='1', directory=sda_queue_path)
+ result = disk.get_devices(
+ _sys_block_path=block_path,
+ _dev_path=dev_path,
+ _mapper_path=mapper_path)
+ assert result[dev_sda_path]['rotational'] == '1'
+
+
+class TestSizeCalculations(object):
+
+ @pytest.mark.parametrize('aliases', [
+ ('b', 'bytes'),
+ ('kb', 'kilobytes'),
+ ('mb', 'megabytes'),
+ ('gb', 'gigabytes'),
+ ('tb', 'terabytes'),
+ ])
+ def test_aliases(self, aliases):
+ short_alias, long_alias = aliases
+ s = disk.Size(b=1)
+ short_alias = getattr(s, short_alias)
+ long_alias = getattr(s, long_alias)
+ assert short_alias == long_alias
+
+ @pytest.mark.parametrize('values', [
+ ('b', 857619069665.28),
+ ('kb', 837518622.72),
+ ('mb', 817889.28),
+ ('gb', 798.72),
+ ('tb', 0.78),
+ ])
+ def test_terabytes(self, values):
+ # regardless of the input value, all the other values correlate to each
+ # other the same, every time
+ unit, value = values
+ s = disk.Size(**{unit: value})
+ assert s.b == 857619069665.28
+ assert s.kb == 837518622.72
+ assert s.mb == 817889.28
+ assert s.gb == 798.72
+ assert s.tb == 0.78
+
+
+class TestSizeOperators(object):
+
+ @pytest.mark.parametrize('larger', [1025, 1024.1, 1024.001])
+ def test_gigabytes_is_smaller(self, larger):
+ assert disk.Size(gb=1) < disk.Size(mb=larger)
+
+ @pytest.mark.parametrize('smaller', [1023, 1023.9, 1023.001])
+ def test_gigabytes_is_larger(self, smaller):
+ assert disk.Size(gb=1) > disk.Size(mb=smaller)
+
+ @pytest.mark.parametrize('larger', [1025, 1024.1, 1024.001, 1024])
+ def test_gigabytes_is_smaller_or_equal(self, larger):
+ assert disk.Size(gb=1) <= disk.Size(mb=larger)
+
+ @pytest.mark.parametrize('smaller', [1023, 1023.9, 1023.001, 1024])
+ def test_gigabytes_is_larger_or_equal(self, smaller):
+ assert disk.Size(gb=1) >= disk.Size(mb=smaller)
+
+ @pytest.mark.parametrize('values', [
+ ('b', 857619069665.28),
+ ('kb', 837518622.72),
+ ('mb', 817889.28),
+ ('gb', 798.72),
+ ('tb', 0.78),
+ ])
+ def test_equality(self, values):
+ unit, value = values
+ s = disk.Size(**{unit: value})
+ # both tb and b, since b is always calculated regardless, and is useful
+ # when testing tb
+ assert disk.Size(tb=0.78) == s
+ assert disk.Size(b=857619069665.28) == s
+
+ @pytest.mark.parametrize('values', [
+ ('b', 857619069665.28),
+ ('kb', 837518622.72),
+ ('mb', 817889.28),
+ ('gb', 798.72),
+ ('tb', 0.78),
+ ])
+ def test_inequality(self, values):
+ unit, value = values
+ s = disk.Size(**{unit: value})
+ # both tb and b, since b is always calculated regardless, and is useful
+ # when testing tb
+ assert disk.Size(tb=1) != s
+ assert disk.Size(b=100) != s
+
+
+class TestSizeOperations(object):
+
+ def test_assignment_addition_with_size_objects(self):
+ result = disk.Size(mb=256) + disk.Size(gb=1)
+ assert result.gb == 1.25
+ assert result.gb.as_int() == 1
+ assert result.gb.as_float() == 1.25
+
+ def test_self_addition_with_size_objects(self):
+ base = disk.Size(mb=256)
+ base += disk.Size(gb=1)
+ assert base.gb == 1.25
+
+ def test_self_addition_does_not_alter_state(self):
+ base = disk.Size(mb=256)
+ base + disk.Size(gb=1)
+ assert base.mb == 256
+
+ def test_addition_with_non_size_objects(self):
+ with pytest.raises(TypeError):
+ disk.Size(mb=100) + 4
+
+ def test_assignment_subtraction_with_size_objects(self):
+ base = disk.Size(gb=1)
+ base -= disk.Size(mb=256)
+ assert base.mb == 768
+
+ def test_self_subtraction_does_not_alter_state(self):
+ base = disk.Size(gb=1)
+ base - disk.Size(mb=256)
+ assert base.gb == 1
+
+ def test_subtraction_with_size_objects(self):
+ result = disk.Size(gb=1) - disk.Size(mb=256)
+ assert result.mb == 768
+
+ def test_subtraction_with_non_size_objects(self):
+ with pytest.raises(TypeError):
+ disk.Size(mb=100) - 4
+
+ def test_multiplication_with_size_objects(self):
+ with pytest.raises(TypeError):
+ disk.Size(mb=100) * disk.Size(mb=1)
+
+ def test_multiplication_with_non_size_objects(self):
+ base = disk.Size(gb=1)
+ result = base * 2
+ assert result.gb == 2
+ assert result.gb.as_int() == 2
+
+ def test_division_with_size_objects(self):
+ result = disk.Size(gb=1) / disk.Size(mb=1)
+ assert int(result) == 1024
+
+ def test_division_with_non_size_objects(self):
+ base = disk.Size(gb=1)
+ base / 2
+ assert base.mb == 512
+ assert base.mb.as_int() == 512
+
+
+class TestSizeAttributes(object):
+
+ def test_attribute_does_not_exist(self):
+ with pytest.raises(AttributeError):
+ disk.Size(mb=1).exabytes
+
+
+class TestSizeFormatting(object):
+
+ def test_default_formatting_tb_to_b(self):
+ size = disk.Size(tb=0.0000000001)
+ result = "%s" % size
+ assert result == "109.95 B"
+
+ def test_default_formatting_tb_to_kb(self):
+ size = disk.Size(tb=0.00000001)
+ result = "%s" % size
+ assert result == "10.74 KB"
+
+ def test_default_formatting_tb_to_mb(self):
+ size = disk.Size(tb=0.000001)
+ result = "%s" % size
+ assert result == "1.05 MB"
+
+ def test_default_formatting_tb_to_gb(self):
+ size = disk.Size(tb=0.001)
+ result = "%s" % size
+ assert result == "1.02 GB"
+
+ def test_default_formatting_tb_to_tb(self):
+ size = disk.Size(tb=10)
+ result = "%s" % size
+ assert result == "10.00 TB"
+
+
+class TestSizeSpecificFormatting(object):
+
+ def test_formatting_b(self):
+ size = disk.Size(b=2048)
+ result = "%s" % size.b
+ assert "%s" % size.b == "%s" % size.bytes
+ assert result == "2048.00 B"
+
+ def test_formatting_kb(self):
+ size = disk.Size(kb=5700)
+ result = "%s" % size.kb
+ assert "%s" % size.kb == "%s" % size.kilobytes
+ assert result == "5700.00 KB"
+
+ def test_formatting_mb(self):
+ size = disk.Size(mb=4000)
+ result = "%s" % size.mb
+ assert "%s" % size.mb == "%s" % size.megabytes
+ assert result == "4000.00 MB"
+
+ def test_formatting_gb(self):
+ size = disk.Size(gb=77777)
+ result = "%s" % size.gb
+ assert "%s" % size.gb == "%s" % size.gigabytes
+ assert result == "77777.00 GB"
+
+ def test_formatting_tb(self):
+ size = disk.Size(tb=1027)
+ result = "%s" % size.tb
+ assert "%s" % size.tb == "%s" % size.terabytes
+ assert result == "1027.00 TB"
out = ['some line here', ' ']
stub_call((out, '', 0))
assert encryption.status('/dev/sdc1') == {}
+
+
+class TestDmcryptClose(object):
+
+ def test_mapper_exists(self, fake_run, tmpfile):
+ file_name = tmpfile(name='mapper-device')
+ encryption.dmcrypt_close(file_name)
+ arguments = fake_run.calls[0]['args'][0]
+ assert arguments[0] == 'cryptsetup'
+ assert arguments[1] == 'remove'
+ assert arguments[2].startswith('/')
+
+ def test_mapper_does_not_exist(self, fake_run):
+ file_name = '/path/does/not/exist'
+ encryption.dmcrypt_close(file_name)
+ assert fake_run.calls == []
from ceph_volume.tests.conftest import Factory
-class TestCheckID(object):
+class TestOSDIDAvailable(object):
def test_false_if_id_is_none(self):
- assert not prepare.check_id(None)
+ assert not prepare.osd_id_available(None)
def test_returncode_is_not_zero(self, monkeypatch):
monkeypatch.setattr('ceph_volume.process.call', lambda *a, **kw: ('', '', 1))
with pytest.raises(RuntimeError):
- prepare.check_id(1)
+ prepare.osd_id_available(1)
- def test_id_does_exist(self, monkeypatch):
+ def test_id_does_exist_but_not_available(self, monkeypatch):
stdout = dict(nodes=[
- dict(id=0),
+ dict(id=0, status="up"),
])
stdout = ['', json.dumps(stdout)]
monkeypatch.setattr('ceph_volume.process.call', lambda *a, **kw: (stdout, '', 0))
- result = prepare.check_id(0)
- assert result
+ result = prepare.osd_id_available(0)
+ assert not result
def test_id_does_not_exist(self, monkeypatch):
stdout = dict(nodes=[
])
stdout = ['', json.dumps(stdout)]
monkeypatch.setattr('ceph_volume.process.call', lambda *a, **kw: (stdout, '', 0))
- result = prepare.check_id(1)
+ result = prepare.osd_id_available(1)
assert not result
def test_invalid_osd_id(self, monkeypatch):
])
stdout = ['', json.dumps(stdout)]
monkeypatch.setattr('ceph_volume.process.call', lambda *a, **kw: (stdout, '', 0))
- result = prepare.check_id("foo")
+ result = prepare.osd_id_available("foo")
assert not result
+ def test_returns_true_when_id_is_destroyed(self, monkeypatch):
+ stdout = dict(nodes=[
+ dict(id=0, status="destroyed"),
+ ])
+ stdout = ['', json.dumps(stdout)]
+ monkeypatch.setattr('ceph_volume.process.call', lambda *a, **kw: (stdout, '', 0))
+ result = prepare.osd_id_available(0)
+ assert result
+
class TestFormatDevice(object):
assert expected == fake_run.calls[0]['args'][0]
+mkfs_filestore_flags = [
+ 'ceph-osd',
+ '--cluster',
+ '--osd-objectstore', 'filestore',
+ '--mkfs',
+ '-i',
+ '--monmap',
+ '--keyfile', '-', # goes through stdin
+ '--osd-data',
+ '--osd-journal',
+ '--osd-uuid',
+ '--setuser', 'ceph',
+ '--setgroup', 'ceph'
+]
+
+
+class TestOsdMkfsFilestore(object):
+
+ @pytest.mark.parametrize('flag', mkfs_filestore_flags)
+ def test_keyring_is_used(self, fake_call, monkeypatch, flag):
+ monkeypatch.setattr(prepare, '__release__', 'mimic')
+ monkeypatch.setattr(system, 'chown', lambda path: True)
+ prepare.osd_mkfs_filestore(1, 'asdf', keyring='secret')
+ assert flag in fake_call.calls[0]['args'][0]
+
+ def test_keyring_is_used_luminous(self, fake_call, monkeypatch):
+ monkeypatch.setattr(prepare, '__release__', 'luminous')
+ monkeypatch.setattr(system, 'chown', lambda path: True)
+ prepare.osd_mkfs_filestore(1, 'asdf', keyring='secret')
+ assert '--keyfile' not in fake_call.calls[0]['args'][0]
+
+
class TestOsdMkfsBluestore(object):
def test_keyring_is_added(self, fake_call, monkeypatch):
prepare.osd_mkfs_bluestore(1, 'asdf')
assert '--keyfile' not in fake_call.calls[0]['args'][0]
+ def test_keyring_is_not_added_luminous(self, fake_call, monkeypatch):
+ monkeypatch.setattr(system, 'chown', lambda path: True)
+ prepare.osd_mkfs_bluestore(1, 'asdf')
+ monkeypatch.setattr(prepare, '__release__', 'luminous')
+ assert '--keyfile' not in fake_call.calls[0]['args'][0]
+
def test_wal_is_added(self, fake_call, monkeypatch):
monkeypatch.setattr(system, 'chown', lambda path: True)
prepare.osd_mkfs_bluestore(1, 'asdf', wal='/dev/smm1')
'--osd-uuid', 'asdf-1234',
'--setuser', 'ceph', '--setgroup', 'ceph'])
assert expected in str(error)
+
+
+class TestGetJournalSize(object):
+
+ def test_undefined_size_fallbacks_formatted(self, conf_ceph_stub):
+ conf_ceph_stub(dedent("""
+ [global]
+ fsid = a25d19a6-7d57-4eda-b006-78e35d2c4d9f
+ """))
+ result = prepare.get_journal_size()
+ assert result == '5G'
+
+ def test_undefined_size_fallbacks_unformatted(self, conf_ceph_stub):
+ conf_ceph_stub(dedent("""
+ [global]
+ fsid = a25d19a6-7d57-4eda-b006-78e35d2c4d9f
+ """))
+ result = prepare.get_journal_size(lv_format=False)
+ assert result.gb.as_int() == 5
+
+ def test_defined_size_unformatted(self, conf_ceph_stub):
+ conf_ceph_stub(dedent("""
+ [global]
+ fsid = a25d19a6-7d57-4eda-b006-78e35d2c4d9f
+
+ [osd]
+ osd journal size = 10240
+ """))
+ result = prepare.get_journal_size(lv_format=False)
+ assert result.gb.as_int() == 10
+
+ def test_defined_size_formatted(self, conf_ceph_stub):
+ conf_ceph_stub(dedent("""
+ [global]
+ fsid = a25d19a6-7d57-4eda-b006-78e35d2c4d9f
+
+ [osd]
+ osd journal size = 10240
+ """))
+ result = prepare.get_journal_size()
+ assert result == '10G'
+
+ def test_refuse_tiny_journals(self, conf_ceph_stub):
+ conf_ceph_stub(dedent("""
+ [global]
+ fsid = a25d19a6-7d57-4eda-b006-78e35d2c4d9f
+
+ [osd]
+ osd journal size = 1024
+ """))
+ with pytest.raises(RuntimeError) as error:
+ prepare.get_journal_size()
+ assert 'journal sizes must be larger' in str(error)
+ assert 'detected: 1024.00 MB' in str(error)
assert system.is_binary(binary_path) is False
+class TestGetFileContents(object):
+
+ def test_path_does_not_exist(self, tmpdir):
+ filepath = os.path.join(str(tmpdir), 'doesnotexist')
+ assert system.get_file_contents(filepath, 'default') == 'default'
+
+ def test_path_has_contents(self, tmpfile):
+ interesting_file = tmpfile(contents="1")
+ result = system.get_file_contents(interesting_file)
+ assert result == "1"
+
+ def test_path_has_multiline_contents(self, tmpfile):
+ interesting_file = tmpfile(contents="0\n1")
+ result = system.get_file_contents(interesting_file)
+ assert result == "0\n1"
+
+ def test_exception_returns_default(self, tmpfile):
+ interesting_file = tmpfile(contents="0")
+ # remove read, causes IOError
+ os.chmod(interesting_file, 0o000)
+ result = system.get_file_contents(interesting_file)
+ assert result == ''
+
+
class TestWhich(object):
def test_executable_exists_but_is_not_file(self, monkeypatch):
--- /dev/null
+import pytest
+from ceph_volume import util
+
+
+class TestAsBytes(object):
+
+ def test_bytes_just_gets_returned(self):
+ bytes_string = "contents".encode('utf-8')
+ assert util.as_bytes(bytes_string) == bytes_string
+
+ def test_string_gets_converted_to_bytes(self):
+ result = util.as_bytes('contents')
+ assert isinstance(result, bytes)
+
+
+class TestStrToInt(object):
+
+ def test_passing_a_float_str(self):
+ result = util.str_to_int("1.99")
+ assert result == 1
+
+ def test_passing_a_float_does_not_round(self):
+ result = util.str_to_int("1.99", round_down=False)
+ assert result == 2
+
+ def test_text_is_not_an_integer_like(self):
+ with pytest.raises(RuntimeError) as error:
+ util.str_to_int("1.4GB")
+ assert str(error.value) == "Unable to convert to integer: '1.4GB'"
+
+
+def true_responses(upper_casing=False):
+ if upper_casing:
+ return ['Y', 'YES', '']
+ return ['y', 'yes', '']
+
+
+def false_responses(upper_casing=False):
+ if upper_casing:
+ return ['N', 'NO']
+ return ['n', 'no']
+
+
+def invalid_responses():
+ return [9, 0.1, 'h', [], {}, None]
+
+
+class TestStrToBool(object):
+
+ @pytest.mark.parametrize('response', true_responses())
+ def test_trueish(self, response):
+ assert util.str_to_bool(response) is True
+
+ @pytest.mark.parametrize('response', false_responses())
+ def test_falseish(self, response):
+ assert util.str_to_bool(response) is False
+
+ @pytest.mark.parametrize('response', true_responses(True))
+ def test_trueish_upper(self, response):
+ assert util.str_to_bool(response) is True
+
+ @pytest.mark.parametrize('response', false_responses(True))
+ def test_falseish_upper(self, response):
+ assert util.str_to_bool(response) is False
+
+ @pytest.mark.parametrize('response', invalid_responses())
+ def test_invalid(self, response):
+ with pytest.raises(ValueError):
+ util.str_to_bool(response)
+
+
+class TestPromptBool(object):
+
+ @pytest.mark.parametrize('response', true_responses())
+ def test_trueish(self, response):
+ fake_input = lambda x: response
+ qx = 'what the what?'
+ assert util.prompt_bool(qx, _raw_input=fake_input) is True
+
+ @pytest.mark.parametrize('response', false_responses())
+ def test_falseish(self, response):
+ fake_input = lambda x: response
+ qx = 'what the what?'
+ assert util.prompt_bool(qx, _raw_input=fake_input) is False
+
+ def test_try_again_true(self):
+ responses = ['g', 'h', 'y']
+ fake_input = lambda x: responses.pop(0)
+ qx = 'what the what?'
+ assert util.prompt_bool(qx, _raw_input=fake_input) is True
+
+ def test_try_again_false(self):
+ responses = ['g', 'h', 'n']
+ fake_input = lambda x: responses.pop(0)
+ qx = 'what the what?'
+ assert util.prompt_bool(qx, _raw_input=fake_input) is False
+import logging
+from math import floor
+from ceph_volume import terminal
+
+
+logger = logging.getLogger(__name__)
+
def as_string(string):
"""
# we really ignore here if we can't properly decode with utf-8
return string.decode('utf-8', 'ignore')
return string
+
+
+def as_bytes(string):
+ """
+ Ensure that whatever type of string is incoming, it is returned as bytes,
+ encoding to utf-8 otherwise
+ """
+ if isinstance(string, bytes):
+ return string
+ return string.encode('utf-8', errors='ignore')
+
+
+def str_to_int(string, round_down=True):
+ """
+ Parses a string number into an integer, optionally converting to a float
+ and rounding down.
+ """
+ error_msg = "Unable to convert to integer: '%s'" % str(string)
+ try:
+ integer = float(string)
+ except (TypeError, ValueError):
+ logger.exception(error_msg)
+ raise RuntimeError(error_msg)
+
+ if round_down:
+ integer = floor(integer)
+ else:
+ integer = round(integer)
+ return int(integer)
+
+
+def str_to_bool(val):
+ """
+ Convert a string representation of truth to True or False
+
+ True values are 'y', 'yes', or ''; case-insensitive
+ False values are 'n', or 'no'; case-insensitive
+ Raises ValueError if 'val' is anything else.
+ """
+ true_vals = ['yes', 'y', '']
+ false_vals = ['no', 'n']
+ try:
+ val = val.lower()
+ except AttributeError:
+ val = str(val).lower()
+ if val in true_vals:
+ return True
+ elif val in false_vals:
+ return False
+ else:
+ raise ValueError("Invalid input value: %s" % val)
+
+
+def prompt_bool(question, _raw_input=None):
+ """
+ Interface to prompt a boolean (or boolean-like) response from a user.
+ Usually a confirmation.
+ """
+ input_prompt = _raw_input or raw_input
+ prompt_format = '--> {question} '.format(question=question)
+ response = input_prompt(prompt_format)
+ try:
+ return str_to_bool(response)
+ except ValueError:
+ terminal.error('Valid true responses are: y, yes, <Enter>')
+ terminal.error('Valid false responses are: n, no')
+ terminal.error('That response was invalid, please try again')
+ return prompt_bool(question, _raw_input=input_prompt)
from ceph_volume import terminal
from ceph_volume import decorators
from ceph_volume.util import disk
+from ceph_volume.util.device import Device
class LVPath(object):
return string
+class ValidDevice(object):
+
+ def __call__(self, string):
+ device = Device(string)
+ if not device.exists:
+ raise argparse.ArgumentError(
+ None, "Unable to proceed with non-existing device: %s" % string
+ )
+
+ return device
+
+
class OSDPath(object):
"""
Validate path exists and it looks like an OSD directory.
--- /dev/null
+import os
+from ceph_volume import sys_info
+from ceph_volume.api import lvm
+from ceph_volume.util import disk
+
+
+class Device(object):
+
+ def __init__(self, path):
+ self.path = path
+ # LVs can have a vg/lv path, while disks will have /dev/sda
+ self.abspath = path
+ self.lv_api = None
+ self.pvs_api = []
+ self.disk_api = {}
+ self.sys_api = {}
+ self._exists = None
+ self._is_lvm_member = None
+ self._parse()
+
+ def _parse(self):
+ # start with lvm since it can use an absolute or relative path
+ lv = lvm.get_lv_from_argument(self.path)
+ if lv:
+ self.lv_api = lv
+ self.abspath = lv.lv_path
+ else:
+ dev = disk.lsblk(self.path)
+ self.disk_api = dev
+ device_type = dev.get('TYPE', '')
+        # always check if this is an lvm member
+ if device_type in ['part', 'disk']:
+ self._set_lvm_membership()
+
+ if not sys_info.devices:
+ sys_info.devices = disk.get_devices()
+ self.sys_api = sys_info.devices.get(self.abspath, {})
+
+ def __repr__(self):
+ prefix = 'Unknown'
+ if self.is_lv:
+ prefix = 'LV'
+ elif self.is_partition:
+ prefix = 'Partition'
+ elif self.is_device:
+ prefix = 'Raw Device'
+ return '<%s: %s>' % (prefix, self.abspath)
+
+ def _set_lvm_membership(self):
+ if self._is_lvm_member is None:
+ # check if there was a pv created with the
+ # name of device
+ pvs = lvm.PVolumes()
+ pvs.filter(pv_name=self.abspath)
+ if not pvs:
+ self._is_lvm_member = False
+ return self._is_lvm_member
+ has_vgs = [pv.vg_name for pv in pvs if pv.vg_name]
+ if has_vgs:
+ self._is_lvm_member = True
+ self.pvs_api = pvs
+ else:
+ # this is contentious, if a PV is recognized by LVM but has no
+ # VGs, should we consider it as part of LVM? We choose not to
+ # here, because most likely, we need to use VGs from this PV.
+ self._is_lvm_member = False
+
+ return self._is_lvm_member
+
+ @property
+ def exists(self):
+ return os.path.exists(self.abspath)
+
+ @property
+ def is_lvm_member(self):
+ if self._is_lvm_member is None:
+ self._set_lvm_membership()
+ return self._is_lvm_member
+
+ @property
+ def is_mapper(self):
+ return self.path.startswith('/dev/mapper')
+
+ @property
+ def is_lv(self):
+ return self.lv_api is not None
+
+ @property
+ def is_partition(self):
+ if self.disk_api:
+ return self.disk_api['TYPE'] == 'part'
+ return False
+
+ @property
+ def is_device(self):
+ if self.disk_api:
+ return self.disk_api['TYPE'] == 'device'
+ return False
+import logging
import os
+import re
import stat
from ceph_volume import process
+from ceph_volume.api import lvm
+from ceph_volume.util.system import get_file_contents
+
+
+logger = logging.getLogger(__name__)
# The blkid CLI tool has some oddities which prevents having one common call
if os.path.exists('/sys/dev/block/%d:%d/partition' % (major, minor)):
return True
return False
+
+
+def _map_dev_paths(_path, include_abspath=False, include_realpath=False):
+ """
+ Go through all the items in ``_path`` and map them to their absolute path::
+
+ {'sda': '/dev/sda'}
+
+ If ``include_abspath`` is set, then a reverse mapping is set as well::
+
+ {'sda': '/dev/sda', '/dev/sda': 'sda'}
+
+ If ``include_realpath`` is set then the same operation is done for any
+ links found when listing, these are *not* reversed to avoid clashing on
+ existing keys, but both abspath and basename can be included. For example::
+
+ {
+ 'ceph-data': '/dev/mapper/ceph-data',
+ '/dev/mapper/ceph-data': 'ceph-data',
+ '/dev/dm-0': '/dev/mapper/ceph-data',
+ 'dm-0': '/dev/mapper/ceph-data'
+ }
+
+
+ In case of possible exceptions the mapping is returned empty, and the
+ exception is logged.
+ """
+ mapping = {}
+ try:
+ dev_names = os.listdir(_path)
+ except (OSError, IOError):
+ logger.exception('unable to list block devices from: %s' % _path)
+ return {}
+
+ for dev_name in dev_names:
+ mapping[dev_name] = os.path.join(_path, dev_name)
+
+ if include_abspath:
+ for k, v in list(mapping.items()):
+ mapping[v] = k
+
+ if include_realpath:
+ for abspath in list(mapping.values()):
+ if not os.path.islink(abspath):
+ continue
+
+ realpath = os.path.realpath(abspath)
+ basename = os.path.basename(realpath)
+ mapping[basename] = abspath
+ if include_abspath:
+ mapping[realpath] = abspath
+
+ return mapping
+
+
+def get_block_devs(sys_block_path="/sys/block", skip_loop=True):
+ """
+ Go through all the items in /sys/block and return them as a list.
+
+ The ``sys_block_path`` argument is set for easier testing and is not
+ required for proper operation.
+ """
+ devices = _map_dev_paths(sys_block_path).keys()
+ if skip_loop:
+ return [d for d in devices if not d.startswith('loop')]
+ return list(devices)
+
+
+def get_dev_devs(dev_path="/dev"):
+ """
+    Go through all the items in /dev and return them as a mapping of names
+    to absolute paths (and the reverse).
+
+ The ``dev_path`` argument is set for easier testing and is not
+ required for proper operation.
+ """
+ return _map_dev_paths(dev_path, include_abspath=True)
+
+
+def get_mapper_devs(mapper_path="/dev/mapper"):
+ """
+    Go through all the items in /dev/mapper and return them as a mapping.
+
+    The ``mapper_path`` argument is set for easier testing and is not
+    required for proper operation.
+ """
+ return _map_dev_paths(mapper_path, include_abspath=True, include_realpath=True)
+
+
+class BaseFloatUnit(float):
+ """
+ Base class to support float representations of size values. Suffix is
+ computed on child classes by inspecting the class name
+ """
+
+ def __repr__(self):
+ return "<%s(%s)>" % (self.__class__.__name__, self.__float__())
+
+ def __str__(self):
+ return "{size:.2f} {suffix}".format(
+ size=self.__float__(),
+ suffix=self.__class__.__name__.split('Float')[-1]
+ )
+
+ def as_int(self):
+ return int(self.real)
+
+ def as_float(self):
+ return self.real
+
+
+class FloatB(BaseFloatUnit):
+ pass
+
+
+class FloatMB(BaseFloatUnit):
+ pass
+
+
+class FloatGB(BaseFloatUnit):
+ pass
+
+
+class FloatKB(BaseFloatUnit):
+ pass
+
+
+class FloatTB(BaseFloatUnit):
+ pass
+
+
+class Size(object):
+ """
+ Helper to provide an interface for different sizes given a single initial
+ input. Allows for comparison between different size objects, which avoids
+ the need to convert sizes before comparison (e.g. comparing megabytes
+ against gigabytes).
+
+ Common comparison operators are supported::
+
+ >>> hd1 = Size(gb=400)
+ >>> hd2 = Size(gb=500)
+ >>> hd1 > hd2
+ False
+ >>> hd1 < hd2
+ True
+ >>> hd1 == hd2
+ False
+ >>> hd1 == Size(gb=400)
+ True
+
+ The Size object can also be multiplied or divided::
+
+ >>> hd1
+ <Size(400.00 GB)>
+ >>> hd1 * 2
+ <Size(800.00 GB)>
+ >>> hd1
+ <Size(800.00 GB)>
+
+ Additions and subtractions are only supported between Size objects::
+
+ >>> Size(gb=224) - Size(gb=100)
+ <Size(124.00 GB)>
+ >>> Size(gb=1) + Size(mb=300)
+ <Size(1.29 GB)>
+
+ Can also display a human-readable representation, with automatic detection
+ on best suited unit, or alternatively, specific unit representation::
+
+ >>> s = Size(mb=2211)
+ >>> s
+ <Size(2.16 GB)>
+ >>> s.mb
+ <FloatMB(2211.0)>
+ >>> print "Total size: %s" % s.mb
+ Total size: 2211.00 MB
+ >>> print "Total size: %s" % s
+ Total size: 2.16 GB
+ """
+
+ def __init__(self, multiplier=1024, **kw):
+ self._multiplier = multiplier
+ # create a mapping of units-to-multiplier, skip bytes as that is
+ # calculated initially always and does not need to convert
+ aliases = [
+ [('kb', 'kilobytes'), self._multiplier],
+ [('mb', 'megabytes'), self._multiplier ** 2],
+ [('gb', 'gigabytes'), self._multiplier ** 3],
+ [('tb', 'terabytes'), self._multiplier ** 4],
+ ]
+ # and mappings for units-to-formatters, including bytes and aliases for
+ # each
+ format_aliases = [
+ [('b', 'bytes'), FloatB],
+ [('kb', 'kilobytes'), FloatKB],
+ [('mb', 'megabytes'), FloatMB],
+ [('gb', 'gigabytes'), FloatGB],
+ [('tb', 'terabytes'), FloatTB],
+ ]
+ self._formatters = {}
+ for key, value in format_aliases:
+ for alias in key:
+ self._formatters[alias] = value
+ self._factors = {}
+ for key, value in aliases:
+ for alias in key:
+ self._factors[alias] = value
+
+ for k, v in kw.items():
+ self._convert(v, k)
+            # only pursue the first occurrence
+ break
+
+ def _convert(self, size, unit):
+ """
+ Convert any size down to bytes so that other methods can rely on bytes
+ being available always, regardless of what they pass in, avoiding the
+ need for a mapping of every permutation.
+ """
+ if unit in ['b', 'bytes']:
+ self._b = size
+ return
+ factor = self._factors[unit]
+ self._b = float(size * factor)
+
+ def _get_best_format(self):
+ """
+        Go through all the supported units, and use the first one whose value
+        does not exceed 1024. This allows representing the size in the most
+        readable format available
+ """
+ for unit in ['b', 'kb', 'mb', 'gb', 'tb']:
+ if getattr(self, unit) > 1024:
+ continue
+ return getattr(self, unit)
+
+ def __repr__(self):
+ return "<Size(%s)>" % self._get_best_format()
+
+ def __str__(self):
+ return "%s" % self._get_best_format()
+
+ def __lt__(self, other):
+ return self._b < other._b
+
+ def __le__(self, other):
+ return self._b <= other._b
+
+ def __eq__(self, other):
+ return self._b == other._b
+
+ def __ne__(self, other):
+ return self._b != other._b
+
+ def __ge__(self, other):
+ return self._b >= other._b
+
+ def __gt__(self, other):
+ return self._b > other._b
+
+ def __add__(self, other):
+ if isinstance(other, Size):
+ _b = self._b + other._b
+ return Size(b=_b)
+ raise TypeError('Cannot add "Size" object with int')
+
+ def __sub__(self, other):
+ if isinstance(other, Size):
+ _b = self._b - other._b
+ return Size(b=_b)
+ raise TypeError('Cannot subtract "Size" object from int')
+
+ def __mul__(self, other):
+ if isinstance(other, Size):
+ raise TypeError('Cannot multiply with "Size" object')
+ _b = self._b * other
+ return Size(b=_b)
+
+ def __truediv__(self, other):
+ if isinstance(other, Size):
+ return self._b / other._b
+ self._b = self._b / other
+ return self
+
+ def __div__(self, other):
+ if isinstance(other, Size):
+ return self._b / other._b
+ self._b = self._b / other
+ return self
+
+ def __getattr__(self, unit):
+ """
+ Calculate units on the fly, relies on the fact that ``bytes`` has been
+ converted at instantiation. Units that don't exist will trigger an
+ ``AttributeError``
+ """
+ try:
+ formatter = self._formatters[unit]
+ except KeyError:
+            raise AttributeError('Size object has no attribute "%s"' % unit)
+ if unit in ['b', 'bytes']:
+ return formatter(self._b)
+ try:
+ factor = self._factors[unit]
+ except KeyError:
+            raise AttributeError('Size object has no attribute "%s"' % unit)
+ return formatter(float(self._b) / factor)
+
+
+def human_readable_size(size):
+ """
+ Take a size in bytes, and transform it into a human readable size with up
+ to two decimals of precision.
+ """
+ suffixes = ['B', 'KB', 'MB', 'GB', 'TB']
+ suffix_index = 0
+ while size > 1024:
+ suffix_index += 1
+ size = size / 1024.0
+ return "{size:.2f} {suffix}".format(
+ size=size,
+ suffix=suffixes[suffix_index])
+
+
+def get_partitions_facts(sys_block_path):
+ partition_metadata = {}
+ for folder in os.listdir(sys_block_path):
+ folder_path = os.path.join(sys_block_path, folder)
+ if os.path.exists(os.path.join(folder_path, 'partition')):
+ contents = get_file_contents(os.path.join(folder_path, 'partition'))
+ if '1' in contents:
+ part = {}
+ partname = folder
+ part_sys_block_path = os.path.join(sys_block_path, partname)
+
+ part['start'] = get_file_contents(part_sys_block_path + "/start", 0)
+ part['sectors'] = get_file_contents(part_sys_block_path + "/size", 0)
+
+ part['sectorsize'] = get_file_contents(
+ part_sys_block_path + "/queue/logical_block_size")
+ if not part['sectorsize']:
+ part['sectorsize'] = get_file_contents(
+ part_sys_block_path + "/queue/hw_sector_size", 512)
+ part['size'] = human_readable_size(float(part['sectors']) * 512)
+
+ partition_metadata[partname] = part
+ return partition_metadata
+
+
+def is_mapper_device(device_name):
+ return device_name.startswith(('/dev/mapper', '/dev/dm-'))
+
+
+def get_devices(_sys_block_path='/sys/block', _dev_path='/dev', _mapper_path='/dev/mapper'):
+ """
+ Captures all available devices from /sys/block/, including its partitions,
+ along with interesting metadata like sectors, size, vendor,
+ solid/rotational, etc...
+
+ Returns a dictionary, where keys are the full paths to devices.
+
+    .. note:: device mapper devices get their path updated to what they link from, if
+ /dev/dm-0 is linked by /dev/mapper/ceph-data, then the latter gets
+ used as the key.
+
+    .. note:: loop devices, removable media, and logical volumes are never included.
+ """
+ # Portions of this detection process are inspired by some of the fact
+ # gathering done by Ansible in module_utils/facts/hardware/linux.py. The
+ # processing of metadata and final outcome *is very different* and fully
+    # incompatible. There are ignored devices, and paths get resolved depending
+ # on dm devices, loop, and removable media
+
+ device_facts = {}
+
+ block_devs = get_block_devs(_sys_block_path)
+ dev_devs = get_dev_devs(_dev_path)
+ mapper_devs = get_mapper_devs(_mapper_path)
+
+ for block in block_devs:
+ sysdir = os.path.join(_sys_block_path, block)
+ metadata = {}
+
+ # Ensure that the diskname is an absolute path and that it never points
+ # to a /dev/dm-* device
+ diskname = mapper_devs.get(block) or dev_devs.get(block)
+
+ # If the mapper device is a logical volume it gets excluded
+ if is_mapper_device(diskname):
+ if lvm.is_lv(diskname):
+ continue
+
+ # If the device reports itself as 'removable', get it excluded
+ metadata['removable'] = get_file_contents(os.path.join(sysdir, 'removable'))
+ if metadata['removable'] == '1':
+ continue
+
+ for key in ['vendor', 'model', 'sas_address', 'sas_device_handle']:
+ metadata[key] = get_file_contents(sysdir + "/device/" + key)
+
+ for key in ['sectors', 'size']:
+ metadata[key] = get_file_contents(os.path.join(sysdir, key), 0)
+
+ for key, _file in [('support_discard', '/queue/discard_granularity')]:
+ metadata[key] = get_file_contents(os.path.join(sysdir, _file))
+
+ metadata['partitions'] = get_partitions_facts(sysdir)
+
+ metadata['rotational'] = get_file_contents(sysdir + "/queue/rotational")
+ metadata['scheduler_mode'] = ""
+ scheduler = get_file_contents(sysdir + "/queue/scheduler")
+ if scheduler is not None:
+ m = re.match(r".*?(\[(.*)\])", scheduler)
+ if m:
+ metadata['scheduler_mode'] = m.group(2)
+
+ if not metadata['sectors']:
+ metadata['sectors'] = 0
+ size = metadata['sectors'] or metadata['size']
+ metadata['sectorsize'] = get_file_contents(sysdir + "/queue/logical_block_size")
+ if not metadata['sectorsize']:
+ metadata['sectorsize'] = get_file_contents(sysdir + "/queue/hw_sector_size", 512)
+ metadata['human_readable_size'] = human_readable_size(float(size) * 512)
+ metadata['size'] = float(size) * 512
+ metadata['path'] = diskname
+
+ device_facts[diskname] = metadata
+ return device_facts
:param mapping:
"""
+ if not os.path.exists(mapping):
+ logger.debug('device mapper path does not exist %s' % mapping)
+ logger.debug('will skip cryptsetup removal')
+ return
process.run(['cryptsetup', 'remove', mapping])
import os
import logging
import json
-from ceph_volume import process, conf
-from ceph_volume.util import system, constants
+from ceph_volume import process, conf, __release__, terminal
+from ceph_volume.util import system, constants, str_to_int, disk
logger = logging.getLogger(__name__)
+mlogger = terminal.MultiLogger(__name__)
def create_key():
system.chown(osd_keyring)
+def get_journal_size(lv_format=True):
+ """
+ Helper to retrieve the size (defined in megabytes in ceph.conf) to create
+ the journal logical volume, it "translates" the string into a float value,
+ then converts that into gigabytes, and finally (optionally) it formats it
+ back as a string so that it can be used for creating the LV.
+
+ :param lv_format: Return a string to be used for ``lv_create``. A 5 GB size
+ would result in '5G', otherwise it will return a ``Size`` object.
+ """
+ conf_journal_size = conf.ceph.get_safe('osd', 'osd_journal_size', '5120')
+ logger.debug('osd_journal_size set to %s' % conf_journal_size)
+ journal_size = disk.Size(mb=str_to_int(conf_journal_size))
+
+ if journal_size < disk.Size(gb=2):
+ mlogger.error('Refusing to continue with configured size for journal')
+ raise RuntimeError('journal sizes must be larger than 2GB, detected: %s' % journal_size)
+ if lv_format:
+ return '%sG' % journal_size.gb.as_int()
+ return journal_size
+
+
def create_id(fsid, json_secrets, osd_id=None):
"""
:param fsid: The osd fsid to create, always required
'-i', '-',
'osd', 'new', fsid
]
- if check_id(osd_id):
- cmd.append(osd_id)
+ if osd_id is not None:
+ if osd_id_available(osd_id):
+ cmd.append(osd_id)
+ else:
+ raise RuntimeError("The osd ID {} is already in use or does not exist.".format(osd_id))
stdout, stderr, returncode = process.call(
cmd,
stdin=json_secrets,
return ' '.join(stdout).strip()
-def check_id(osd_id):
+def osd_id_available(osd_id):
"""
- Checks to see if an osd ID exists or not. Returns True
- if it does exist, False if it doesn't.
+ Checks to see if an osd ID exists and if it's available for
+ reuse. Returns True if it is, False if it isn't.
:param osd_id: The osd ID to check
"""
output = json.loads(''.join(stdout).strip())
osds = output['nodes']
- return any([str(osd['id']) == str(osd_id) for osd in osds])
+ osd = [osd for osd in osds if str(osd['id']) == str(osd_id)]
+ if osd and osd[0].get('status') == "destroyed":
+ return True
+ return False
def mount_tmpfs(path):
path
])
+ # Restore SELinux context
+ system.set_context(path)
+
def create_osd_path(osd_id, tmpfs=False):
path = '/var/lib/ceph/osd/%s-%s' % (conf.cluster, osd_id)
command.append(destination)
process.run(command)
+ # Restore SELinux context
+ system.set_context(destination)
+
def _link_device(device, device_type, osd_id):
"""
raise RuntimeError('Command failed with exit code %s: %s' % (returncode, ' '.join(command)))
-def osd_mkfs_filestore(osd_id, fsid):
+def osd_mkfs_filestore(osd_id, fsid, keyring):
"""
Create the files for the OSD to function. A normal call will look like:
system.chown(journal)
system.chown(path)
- process.run([
+ command = [
'ceph-osd',
'--cluster', conf.cluster,
# undocumented flag, sets the `type` file to contain 'filestore'
'--mkfs',
'-i', osd_id,
'--monmap', monmap,
+ ]
+
+ if __release__ != 'luminous':
+ # goes through stdin
+ command.extend(['--keyfile', '-'])
+
+ command.extend([
'--osd-data', path,
'--osd-journal', journal,
'--osd-uuid', fsid,
'--setuser', 'ceph',
'--setgroup', 'ceph'
])
+
+ _, _, returncode = process.call(
+ command, stdin=keyring, terminal_verbose=True, show_command=True
+ )
+ if returncode != 0:
+ raise RuntimeError('Command failed with exit code %s: %s' % (returncode, ' '.join(command)))
return user[2], user[3]
+def get_file_contents(path, default=''):
+ contents = default
+ if not os.path.exists(path):
+ return contents
+ try:
+ with open(path, 'r') as open_file:
+ contents = open_file.read().strip()
+ except Exception:
+ logger.exception('Failed to read contents from: %s' % path)
+
+ return contents
+
+
def mkdir_p(path, chown=True):
"""
A `mkdir -p` that defaults to chown the path to the ceph user
return devices_mounted
else:
return paths_mounted
+
+
+def set_context(path, recursive = False):
+ # restore selinux context to default policy values
+ if which('restorecon').startswith('/'):
+ if recursive:
+ process.run(['restorecon', '-R', path])
+ else:
+ process.run(['restorecon', path])
--- /dev/null
+
+osd_header = """
+{:-^100}""".format('')
+
+
+osd_component_titles = """
+ Type Path LV Size % of device"""
+
+
+osd_component = """
+ {_type: <15} {path: <55} {size: <15} {percent}%"""
+
+
+total_osds = """
+Total OSDs: {total_osds}
+"""
+
+ssd_volume_group = """
+Solid State VG:
+ Targets: {target: <25} Total size: {total_lv_size: <25}
+ Total LVs: {total_lvs: <25} Size per LV: {lv_size: <25}
+ Devices: {block_db_devices}
+"""
+
+
mds->handle_signal(signum);
}
-#ifdef BUILDING_FOR_EMBEDDED
-extern "C" int cephd_mds(int argc, const char **argv)
-#else
int main(int argc, const char **argv)
-#endif
{
ceph_pthread_setname(pthread_self(), "ceph-mds");
generic_server_usage();
}
-#ifdef BUILDING_FOR_EMBEDDED
-void cephd_preload_embedded_plugins();
-extern "C" int cephd_mon(int argc, const char **argv)
-#else
int main(int argc, const char **argv)
-#endif
{
int err;
if (stats.avail_percent <= g_conf->mon_data_avail_crit) {
derr << "error: monitor data filesystem reached concerning levels of"
<< " available storage space (available: "
- << stats.avail_percent << "% " << prettybyte_t(stats.byte_avail)
+ << stats.avail_percent << "% " << byte_u_t(stats.byte_avail)
<< ")\nyou may adjust 'mon data avail crit' to a lower value"
<< " to make this go away (default: " << g_conf->mon_data_avail_crit
<< "%)\n" << dendl;
}
common_init_finish(g_ceph_context);
global_init_chdir(g_ceph_context);
-#ifndef BUILDING_FOR_EMBEDDED
if (global_init_preload_erasure_code(g_ceph_context) < 0)
prefork.exit(1);
-#else
- cephd_preload_embedded_plugins();
-#endif
}
MonitorDBStore *store = new MonitorDBStore(g_conf->mon_data);
generic_server_usage();
}
-#ifdef BUILDING_FOR_EMBEDDED
-void cephd_preload_embedded_plugins();
-void cephd_preload_rados_classes(OSD *osd);
-extern "C" int cephd_osd(int argc, const char **argv)
-#else
int main(int argc, const char **argv)
-#endif
{
vector<const char*> args;
argv_to_vec(argc, argv, args);
return -ENODEV;
}
-#ifdef BUILDING_FOR_EMBEDDED
- cephd_preload_embedded_plugins();
-#endif
if (mkkey) {
common_init_finish(g_ceph_context);
return -1;
global_init_chdir(g_ceph_context);
-#ifndef BUILDING_FOR_EMBEDDED
if (global_init_preload_erasure_code(g_ceph_context) < 0)
return -1;
-#endif
srand(time(NULL) + getpid());
return 1;
}
-#ifdef BUILDING_FOR_EMBEDDED
- cephd_preload_rados_classes(osd);
-#endif
-
// install signal handlers
init_async_signal_handler();
register_async_signal_handler(SIGHUP, sighup_handler);
f->dump_int("dentry_count", lru.lru_get_size());
f->dump_int("dentry_pinned_count", lru.lru_get_num_pinned());
f->dump_int("id", get_nodeid().v);
+ entity_inst_t inst(messenger->get_myname(), messenger->get_myaddr());
+ f->dump_object("inst", inst);
+ f->dump_stream("inst_str") << inst;
+ f->dump_stream("addr_str") << inst.addr;
f->dump_int("inode_count", inode_map.size());
f->dump_int("mds_epoch", mdsmap->get_epoch());
f->dump_int("osd_epoch", osd_epoch);
}
-void Client::update_inode_file_bits(Inode *in,
- uint64_t truncate_seq, uint64_t truncate_size,
- uint64_t size, uint64_t change_attr,
- uint64_t time_warp_seq, utime_t ctime,
- utime_t mtime,
- utime_t atime,
- version_t inline_version,
- bufferlist& inline_data,
- int issued)
+void Client::update_inode_file_size(Inode *in, int issued, uint64_t size,
+ uint64_t truncate_seq, uint64_t truncate_size)
{
- bool warn = false;
- ldout(cct, 10) << "update_inode_file_bits " << *in << " " << ccap_string(issued)
- << " mtime " << mtime << dendl;
- ldout(cct, 25) << "truncate_seq: mds " << truncate_seq << " local "
- << in->truncate_seq << " time_warp_seq: mds " << time_warp_seq
- << " local " << in->time_warp_seq << dendl;
uint64_t prior_size = in->size;
- if (inline_version > in->inline_version) {
- in->inline_data = inline_data;
- in->inline_version = inline_version;
- }
-
- /* always take a newer change attr */
- if (change_attr > in->change_attr)
- in->change_attr = change_attr;
-
if (truncate_seq > in->truncate_seq ||
(truncate_seq == in->truncate_seq && size > in->size)) {
ldout(cct, 10) << "size " << in->size << " -> " << size << dendl;
ldout(cct, 0) << "Hmmm, truncate_seq && truncate_size changed on non-file inode!" << dendl;
}
}
-
+}
+
+void Client::update_inode_file_time(Inode *in, int issued, uint64_t time_warp_seq,
+ utime_t ctime, utime_t mtime, utime_t atime)
+{
+ ldout(cct, 10) << __func__ << " " << *in << " " << ccap_string(issued)
+ << " ctime " << ctime << " mtime " << mtime << dendl;
+
+ if (time_warp_seq > in->time_warp_seq)
+ ldout(cct, 10) << " mds time_warp_seq " << time_warp_seq
+ << " is higher than local time_warp_seq "
+ << in->time_warp_seq << dendl;
+
+ int warn = false;
// be careful with size, mtime, atime
if (issued & (CEPH_CAP_FILE_EXCL|
CEPH_CAP_FILE_WR|
if (ctime > in->ctime)
in->ctime = ctime;
if (time_warp_seq > in->time_warp_seq) {
- ldout(cct, 10) << "mds time_warp_seq " << time_warp_seq << " on inode " << *in
- << " is higher than local time_warp_seq "
- << in->time_warp_seq << dendl;
//the mds updated times, so take those!
in->mtime = mtime;
in->atime = atime;
if (in->is_symlink())
in->symlink = st->symlink;
- if (was_new)
- ldout(cct, 12) << "add_update_inode adding " << *in << " caps " << ccap_string(st->cap.caps) << dendl;
-
- if (!st->cap.caps)
- return in; // as with readdir returning indoes in different snaprealms (no caps!)
-
// only update inode if mds info is strictly newer, or it is the same and projected (odd).
- bool updating_inode = false;
- int issued = 0;
- if (st->version == 0 ||
- (in->version & ~1) < st->version) {
- updating_inode = true;
+ bool new_version = false;
+ if (in->version == 0 ||
+ ((st->cap.flags & CEPH_CAP_FLAG_AUTH) &&
+ (in->version & ~1) < st->version))
+ new_version = true;
- int implemented = 0;
- issued = in->caps_issued(&implemented) | in->caps_dirty();
- issued |= implemented;
+ int issued;
+ in->caps_issued(&issued);
+ issued |= in->caps_dirty();
+ int new_issued = ~issued & (int)st->cap.caps;
- in->version = st->version;
+ if ((new_version || (new_issued & CEPH_CAP_AUTH_SHARED)) &&
+ !(issued & CEPH_CAP_AUTH_EXCL)) {
+ in->mode = st->mode;
+ in->uid = st->uid;
+ in->gid = st->gid;
+ in->btime = st->btime;
+ }
- if ((issued & CEPH_CAP_AUTH_EXCL) == 0) {
- in->mode = st->mode;
- in->uid = st->uid;
- in->gid = st->gid;
- in->btime = st->btime;
- }
+ if ((new_version || (new_issued & CEPH_CAP_LINK_SHARED)) &&
+ !(issued & CEPH_CAP_LINK_EXCL)) {
+ in->nlink = st->nlink;
+ }
- if ((issued & CEPH_CAP_LINK_EXCL) == 0) {
- in->nlink = st->nlink;
- }
+ if (new_version || (new_issued & CEPH_CAP_ANY_RD)) {
+ update_inode_file_time(in, issued, st->time_warp_seq,
+ st->ctime, st->mtime, st->atime);
+ }
- in->dirstat = st->dirstat;
- in->rstat = st->rstat;
- in->quota = st->quota;
+ if (new_version ||
+ (new_issued & (CEPH_CAP_ANY_FILE_RD | CEPH_CAP_ANY_FILE_WR))) {
in->layout = st->layout;
+ update_inode_file_size(in, issued, st->size, st->truncate_seq, st->truncate_size);
+ }
- if (in->is_dir()) {
+ if (in->is_dir()) {
+ if (new_version || (new_issued & CEPH_CAP_FILE_SHARED)) {
+ in->dirstat = st->dirstat;
+ }
+ // dir_layout/rstat/quota are not tracked by capability, update them only if
+ // the inode stat is from auth mds
+ if (new_version || (st->cap.flags & CEPH_CAP_FLAG_AUTH)) {
in->dir_layout = st->dir_layout;
ldout(cct, 20) << " dir hash is " << (int)in->dir_layout.dl_dir_hash << dendl;
+ in->rstat = st->rstat;
+ in->quota = st->quota;
+ }
+ // move me if/when version reflects fragtree changes.
+ if (in->dirfragtree != st->dirfragtree) {
+ in->dirfragtree = st->dirfragtree;
+ _fragmap_remove_non_leaves(in);
}
-
- update_inode_file_bits(in, st->truncate_seq, st->truncate_size, st->size,
- st->change_attr, st->time_warp_seq, st->ctime,
- st->mtime, st->atime, st->inline_version,
- st->inline_data, issued);
- } else if (st->inline_version > in->inline_version) {
- in->inline_data = st->inline_data;
- in->inline_version = st->inline_version;
}
if ((in->xattr_version == 0 || !(issued & CEPH_CAP_XATTR_EXCL)) &&
in->xattr_version = st->xattr_version;
}
- // move me if/when version reflects fragtree changes.
- if (in->dirfragtree != st->dirfragtree) {
- in->dirfragtree = st->dirfragtree;
- _fragmap_remove_non_leaves(in);
+ if (st->inline_version > in->inline_version) {
+ in->inline_data = st->inline_data;
+ in->inline_version = st->inline_version;
}
+ /* always take a newer change attr */
+ if (st->change_attr > in->change_attr)
+ in->change_attr = st->change_attr;
+
+ if (st->version > in->version)
+ in->version = st->version;
+
+ if (was_new)
+ ldout(cct, 12) << __func__ << " adding " << *in << " caps " << ccap_string(st->cap.caps) << dendl;
+
+ if (!st->cap.caps)
+    return in; // as with readdir returning inodes in different snaprealms (no caps!)
+
if (in->snapid == CEPH_NOSNAP) {
add_update_cap(in, session, st->cap.cap_id, st->cap.caps, st->cap.seq,
st->cap.mseq, inodeno_t(st->cap.realm), st->cap.flags,
in->max_size = st->max_size;
in->rstat = st->rstat;
}
- } else
- in->snap_caps |= st->cap.caps;
- // setting I_COMPLETE needs to happen after adding the cap
- if (updating_inode &&
- in->is_dir() &&
- (st->cap.caps & CEPH_CAP_FILE_SHARED) &&
- (issued & CEPH_CAP_FILE_EXCL) == 0 &&
- in->dirstat.nfiles == 0 &&
- in->dirstat.nsubdirs == 0) {
- ldout(cct, 10) << " marking (I_COMPLETE|I_DIR_ORDERED) on empty dir " << *in << dendl;
- in->flags |= I_COMPLETE | I_DIR_ORDERED;
- if (in->dir) {
- ldout(cct, 10) << " dir is open on empty dir " << in->ino << " with "
- << in->dir->dentries.size() << " entries, marking all dentries null" << dendl;
- in->dir->readdir_cache.clear();
- for (auto p = in->dir->dentries.begin();
- p != in->dir->dentries.end();
- ++p) {
- unlink(p->second, true, true); // keep dir, keep dentry
+ // setting I_COMPLETE needs to happen after adding the cap
+ if (in->is_dir() &&
+ (st->cap.caps & CEPH_CAP_FILE_SHARED) &&
+ (issued & CEPH_CAP_FILE_EXCL) == 0 &&
+ in->dirstat.nfiles == 0 &&
+ in->dirstat.nsubdirs == 0) {
+ ldout(cct, 10) << " marking (I_COMPLETE|I_DIR_ORDERED) on empty dir " << *in << dendl;
+ in->flags |= I_COMPLETE | I_DIR_ORDERED;
+ if (in->dir) {
+ ldout(cct, 10) << " dir is open on empty dir " << in->ino << " with "
+ << in->dir->dentries.size() << " entries, marking all dentries null" << dendl;
+ in->dir->readdir_cache.clear();
+ for (const auto& p : in->dir->dentries) {
+ unlink(p.second, true, true); // keep dir, keep dentry
+ }
+ if (in->dir->dentries.empty())
+ close_dir(in->dir);
}
- if (in->dir->dentries.empty())
- close_dir(in->dir);
}
+ } else {
+ in->snap_caps |= st->cap.caps;
}
return in;
void Client::dump_mds_sessions(Formatter *f)
{
f->dump_int("id", get_nodeid().v);
+ entity_inst_t inst(messenger->get_myname(), messenger->get_myaddr());
+ f->dump_object("inst", inst);
+ f->dump_stream("inst_str") << inst;
+ f->dump_stream("addr_str") << inst.addr;
f->open_array_section("sessions");
for (map<mds_rank_t,MetaSession*>::const_iterator p = mds_sessions.begin(); p != mds_sessions.end(); ++p) {
f->open_object_section("session");
}
if (objecter->osdmap_pool_full(in->layout.pool_id)) {
- ldout(cct, 1) << __func__ << ": FULL, purging for ENOSPC" << dendl;
+ ldout(cct, 8) << __func__ << ": FULL, purging for ENOSPC" << dendl;
objectcacher->purge_set(&in->oset);
if (onfinish) {
onfinish->complete(-ENOSPC);
<< " size " << in->size << " -> " << m->get_size()
<< dendl;
- int implemented = 0;
- int issued = in->caps_issued(&implemented) | in->caps_dirty();
- issued |= implemented;
- update_inode_file_bits(in, m->get_truncate_seq(), m->get_truncate_size(),
- m->get_size(), m->get_change_attr(), m->get_time_warp_seq(),
- m->get_ctime(), m->get_mtime(), m->get_atime(),
- m->inline_version, m->inline_data, issued);
+ int issued;
+ in->caps_issued(&issued);
+ issued |= in->caps_dirty();
+ update_inode_file_size(in, issued, m->get_size(),
+ m->get_truncate_seq(), m->get_truncate_size());
m->put();
}
cap->seq = m->get_seq();
cap->gen = session->cap_gen;
- in->layout = m->get_layout();
-
// update inode
- int implemented = 0;
- int issued = in->caps_issued(&implemented) | in->caps_dirty();
- issued |= implemented;
+ int issued;
+ in->caps_issued(&issued);
+ issued |= in->caps_dirty();
- if ((issued & CEPH_CAP_AUTH_EXCL) == 0) {
+ if ((new_caps & CEPH_CAP_AUTH_SHARED) &&
+ !(issued & CEPH_CAP_AUTH_EXCL)) {
in->mode = m->head.mode;
in->uid = m->head.uid;
in->gid = m->head.gid;
in->btime = m->btime;
}
bool deleted_inode = false;
- if ((issued & CEPH_CAP_LINK_EXCL) == 0) {
+ if ((new_caps & CEPH_CAP_LINK_SHARED) &&
+ !(issued & CEPH_CAP_LINK_EXCL)) {
in->nlink = m->head.nlink;
if (in->nlink == 0 &&
(new_caps & (CEPH_CAP_LINK_SHARED | CEPH_CAP_LINK_EXCL)))
deleted_inode = true;
}
- if ((issued & CEPH_CAP_XATTR_EXCL) == 0 &&
+ if (!(issued & CEPH_CAP_XATTR_EXCL) &&
m->xattrbl.length() &&
m->head.xattr_version > in->xattr_version) {
bufferlist::iterator p = m->xattrbl.begin();
in->dirstat.nsubdirs = m->get_nsubdirs();
}
- update_inode_file_bits(in, m->get_truncate_seq(), m->get_truncate_size(), m->get_size(),
- m->get_change_attr(), m->get_time_warp_seq(), m->get_ctime(),
- m->get_mtime(), m->get_atime(),
- m->inline_version, m->inline_data, issued);
+ if (new_caps & CEPH_CAP_ANY_RD) {
+ update_inode_file_time(in, issued, m->get_time_warp_seq(),
+ m->get_ctime(), m->get_mtime(), m->get_atime());
+ }
+
+ if (new_caps & (CEPH_CAP_ANY_FILE_RD | CEPH_CAP_ANY_FILE_WR)) {
+ in->layout = m->get_layout();
+ update_inode_file_size(in, issued, m->get_size(),
+ m->get_truncate_seq(), m->get_truncate_size());
+ }
+
+ if (m->inline_version > in->inline_version) {
+ in->inline_data = m->inline_data;
+ in->inline_version = m->inline_version;
+ }
+
+ /* always take a newer change attr */
+ if (m->get_change_attr() > in->change_attr)
+ in->change_attr = m->get_change_attr();
// max_size
if (cap == in->auth_cap &&
- m->get_max_size() != in->max_size) {
+ (new_caps & CEPH_CAP_ANY_FILE_WR) &&
+ (m->get_max_size() != in->max_size)) {
ldout(cct, 10) << "max_size " << in->max_size << " -> " << m->get_max_size() << dendl;
in->max_size = m->get_max_size();
if (in->max_size > in->wanted_max_size) {
r = inode_permission(in, perms, want);
}
out:
- ldout(cct, 3) << __func__ << " " << in << " = " << r << dendl;
+ ldout(cct, 5) << __func__ << " " << in << " = " << r << dendl;
return r;
}
return -ENOTDIR;
*dirpp = new dir_result_t(in, perms);
opened_dirs.insert(*dirpp);
- ldout(cct, 3) << "_opendir(" << in->ino << ") = " << 0 << " (" << *dirpp << ")" << dendl;
+ ldout(cct, 8) << "_opendir(" << in->ino << ") = " << 0 << " (" << *dirpp << ")" << dendl;
return 0;
}
* the resulting Inode object in one operation, so that caller
* can safely assume inode will still be there after return.
*/
-int Client::lookup_ino(inodeno_t ino, const UserPerm& perms, Inode **inode)
+int Client::_lookup_ino(inodeno_t ino, const UserPerm& perms, Inode **inode)
{
- Mutex::Locker lock(client_lock);
- ldout(cct, 3) << "lookup_ino enter(" << ino << ")" << dendl;
+ ldout(cct, 8) << "lookup_ino enter(" << ino << ")" << dendl;
if (unmounting)
return -ENOTCONN;
*inode = p->second;
_ll_get(*inode);
}
- ldout(cct, 3) << "lookup_ino exit(" << ino << ") = " << r << dendl;
+ ldout(cct, 8) << "lookup_ino exit(" << ino << ") = " << r << dendl;
return r;
}
-
+int Client::lookup_ino(inodeno_t ino, const UserPerm& perms, Inode **inode)
+{
+ Mutex::Locker lock(client_lock);
+ return _lookup_ino(ino, perms, inode);
+}
/**
* Find the parent inode of `ino` and insert it into
* our cache. Conditionally also set `parent` to a referenced
* Inode* if caller provides non-NULL value.
*/
-int Client::lookup_parent(Inode *ino, const UserPerm& perms, Inode **parent)
+int Client::_lookup_parent(Inode *ino, const UserPerm& perms, Inode **parent)
{
- Mutex::Locker lock(client_lock);
- ldout(cct, 3) << "lookup_parent enter(" << ino->ino << ")" << dendl;
+ ldout(cct, 8) << "lookup_parent enter(" << ino->ino << ")" << dendl;
if (unmounting)
return -ENOTCONN;
if (!ino->dn_set.empty()) {
// if we exposed the parent here, we'd need to check permissions,
// but right now we just rely on the MDS doing so in make_request
- ldout(cct, 3) << "lookup_parent dentry already present" << dendl;
+ ldout(cct, 8) << "lookup_parent dentry already present" << dendl;
return 0;
}
if (ino->is_root()) {
*parent = NULL;
- ldout(cct, 3) << "ino is root, no parent" << dendl;
+ ldout(cct, 8) << "ino is root, no parent" << dendl;
return -EINVAL;
}
if (r == 0) {
*parent = target.get();
_ll_get(*parent);
- ldout(cct, 3) << "lookup_parent found parent " << (*parent)->ino << dendl;
+ ldout(cct, 8) << "lookup_parent found parent " << (*parent)->ino << dendl;
} else {
*parent = NULL;
}
}
- ldout(cct, 3) << "lookup_parent exit(" << ino->ino << ") = " << r << dendl;
+ ldout(cct, 8) << "lookup_parent exit(" << ino->ino << ") = " << r << dendl;
return r;
}
+int Client::lookup_parent(Inode *ino, const UserPerm& perms, Inode **parent)
+{
+ Mutex::Locker lock(client_lock);
+ return _lookup_parent(ino, perms, parent);
+}
/**
* Populate the parent dentry for `ino`, provided it is
* a child of `parent`.
*/
-int Client::lookup_name(Inode *ino, Inode *parent, const UserPerm& perms)
+int Client::_lookup_name(Inode *ino, Inode *parent, const UserPerm& perms)
{
assert(parent->is_dir());
-
- Mutex::Locker lock(client_lock);
ldout(cct, 3) << "lookup_name enter(" << ino->ino << ")" << dendl;
if (unmounting)
return r;
}
+int Client::lookup_name(Inode *ino, Inode *parent, const UserPerm& perms)
+{
+ Mutex::Locker lock(client_lock);
+ return _lookup_name(ino, parent, perms);
+}
Fh *Client::_create_fh(Inode *in, int flags, int cmode, const UserPerm& perms)
{
//ldout(cct, 3) << "op: client->close(open_files[ " << fh << " ]);" << dendl;
//ldout(cct, 3) << "op: open_files.erase( " << fh << " );" << dendl;
Inode *in = f->inode.get();
- ldout(cct, 5) << "_release_fh " << f << " mode " << f->mode << " on " << *in << dendl;
+ ldout(cct, 8) << "_release_fh " << f << " mode " << f->mode << " on " << *in << dendl;
in->unset_deleg(f);
result = get_caps(in, need, want, &have, -1);
if (result < 0) {
- ldout(cct, 1) << "Unable to get caps after open of inode " << *in <<
+ ldout(cct, 8) << "Unable to get caps after open of inode " << *in <<
" . Denying open: " <<
cpp_strerror(result) << dendl;
in->put_open_ref(cmode);
ceph_abort();
}
- ldout(cct, 3) << "_lseek(" << f << ", " << offset << ", " << whence << ") = " << f->pos << dendl;
+ ldout(cct, 8) << "_lseek(" << f << ", " << offset << ", " << whence << ") = " << f->pos << dendl;
return f->pos;
}
// The IOs in this fsync were okay, but maybe something happened
// in the background that we should be reporting?
r = f->take_async_err();
- ldout(cct, 3) << "fsync(" << fd << ", " << syncdataonly
+ ldout(cct, 5) << "fsync(" << fd << ", " << syncdataonly
<< ") = 0, async_err = " << r << dendl;
} else {
// Assume that an error we encountered during fsync, even reported
// synchronously, would also have applied the error to the Fh, and we
// should clear it here to avoid returning the same error again on next
// call.
- ldout(cct, 3) << "fsync(" << fd << ", " << syncdataonly << ") = "
+ ldout(cct, 5) << "fsync(" << fd << ", " << syncdataonly << ") = "
<< r << dendl;
f->take_async_err();
}
ceph_tid_t flush_tid = 0;
InodeRef tmp_ref;
- ldout(cct, 3) << "_fsync on " << *in << " " << (syncdataonly ? "(dataonly)":"(data+metadata)") << dendl;
+ ldout(cct, 8) << "_fsync on " << *in << " " << (syncdataonly ? "(dataonly)":"(data+metadata)") << dendl;
if (cct->_conf->client_oc) {
object_cacher_completion = new C_SafeCond(&lock, &cond, &done, &r);
ldout(cct, 10) << "ino " << in->ino << " has no uncommitted writes" << dendl;
} else {
- ldout(cct, 1) << "ino " << in->ino << " failed to commit to disk! "
+ ldout(cct, 8) << "ino " << in->ino << " failed to commit to disk! "
<< cpp_strerror(-r) << dendl;
}
int Client::_fsync(Fh *f, bool syncdataonly)
{
- ldout(cct, 3) << "_fsync(" << f << ", " << (syncdataonly ? "dataonly)":"data+metadata)") << dendl;
+ ldout(cct, 8) << "_fsync(" << f << ", " << (syncdataonly ? "dataonly)":"data+metadata)") << dendl;
return _fsync(f->inode.get(), syncdataonly);
}
if (r < 0)
return r;
fill_stat(f->inode, stbuf, NULL);
- ldout(cct, 3) << "fstat(" << fd << ", " << stbuf << ") = " << r << dendl;
+ ldout(cct, 5) << "fstat(" << fd << ", " << stbuf << ") = " << r << dendl;
return r;
}
return r;
}
+int Client::ll_lookup_inode(
+ struct inodeno_t ino,
+ const UserPerm& perms,
+ Inode **inode)
+{
+ Mutex::Locker lock(client_lock);
+ ldout(cct, 3) << "ll_lookup_inode " << ino << dendl;
+
+ // Num1: look up the inode by ino and return a referenced pointer in *inode
+ int r = _lookup_ino(ino, perms, inode);
+ if (r) {
+ return r;
+ }
+ assert(inode != NULL);
+ assert(*inode != NULL);
+
+ // Num2: Request the parent inode, so that we can look up the name
+ Inode *parent;
+ r = _lookup_parent(*inode, perms, &parent);
+ if (r && r != -EINVAL) {
+ // Unexpected error
+ _ll_forget(*inode, 1);
+ return r;
+ } else if (r == -EINVAL) {
+ // EINVAL indicates node without parents (root), drop out now
+ // and don't try to look up the non-existent dentry.
+ return 0;
+ }
+ // FIXME: I don't think this works; lookup_parent() returns 0 if the parent
+ // is already in cache
+ assert(parent != NULL);
+
+ // Num3: Finally, get the name (dentry) of the requested inode
+ r = _lookup_name(*inode, parent, perms);
+ if (r) {
+ // Unexpected error
+ _ll_forget(parent, 1);
+ _ll_forget(*inode, 1);
+ return r;
+ }
+
+ _ll_forget(parent, 1);
+ return 0;
+}
+
int Client::ll_lookupx(Inode *parent, const char *name, Inode **out,
struct ceph_statx *stx, unsigned want, unsigned flags,
const UserPerm& perms)
void Client::_ll_drop_pins()
{
ldout(cct, 10) << "_ll_drop_pins" << dendl;
+ std::set<InodeRef> to_be_put; // holds extra inode refs; destroyed on function exit, releasing each held ref one by one
ceph::unordered_map<vinodeno_t, Inode*>::iterator next;
for (ceph::unordered_map<vinodeno_t, Inode*>::iterator it = inode_map.begin();
it != inode_map.end();
Inode *in = it->second;
next = it;
++next;
- if (in->ll_ref)
+ if (in->ll_ref){
+ to_be_put.insert(in);
_ll_put(in, in->ll_ref);
+ }
}
}
-bool Client::ll_forget(Inode *in, int count)
+bool Client::_ll_forget(Inode *in, int count)
{
- Mutex::Locker lock(client_lock);
inodeno_t ino = _get_inodeno(in);
- ldout(cct, 3) << "ll_forget " << ino << " " << count << dendl;
+ ldout(cct, 8) << "ll_forget " << ino << " " << count << dendl;
tout(cct) << "ll_forget" << std::endl;
tout(cct) << ino.val << std::endl;
tout(cct) << count << std::endl;
return last;
}
+bool Client::ll_forget(Inode *in, int count)
+{
+ Mutex::Locker lock(client_lock);
+ return _ll_forget(in, count);
+}
+
bool Client::ll_put(Inode *in)
{
/* ll_forget already takes the lock */
{
vinodeno_t vino = _get_vino(in);
- ldout(cct, 3) << "ll_getattr " << vino << dendl;
+ ldout(cct, 8) << "ll_getattr " << vino << dendl;
tout(cct) << "ll_getattr" << std::endl;
tout(cct) << vino.ino.val << std::endl;
{
vinodeno_t vino = _get_vino(in);
- ldout(cct, 3) << "ll_setattrx " << vino << " mask " << hex << mask << dec
+ ldout(cct, 8) << "ll_setattrx " << vino << " mask " << hex << mask << dec
<< dendl;
tout(cct) << "ll_setattrx" << std::endl;
tout(cct) << vino.ino.val << std::endl;
}
}
out:
- ldout(cct, 3) << "_getxattr(" << in->ino << ", \"" << name << "\", " << size << ") = " << r << dendl;
+ ldout(cct, 8) << "_getxattr(" << in->ino << ", \"" << name << "\", " << size << ") = " << r << dendl;
return r;
}
r = -ERANGE;
}
}
- ldout(cct, 3) << "_listxattr(" << in->ino << ", " << size << ") = " << r << dendl;
+ ldout(cct, 8) << "_listxattr(" << in->ino << ", " << size << ") = " << r << dendl;
return r;
}
int res = make_request(req, perms);
trim_cache();
- ldout(cct, 3) << "_removexattr(" << in->ino << ", \"" << name << "\") = " << res << dendl;
+ ldout(cct, 8) << "_removexattr(" << in->ino << ", \"" << name << "\") = " << res << dendl;
return res;
}
int Client::_mknod(Inode *dir, const char *name, mode_t mode, dev_t rdev,
const UserPerm& perms, InodeRef *inp)
{
- ldout(cct, 3) << "_mknod(" << dir->ino << " " << name << ", 0" << oct
+ ldout(cct, 8) << "_mknod(" << dir->ino << " " << name << ", 0" << oct
<< mode << dec << ", " << rdev << ", uid " << perms.uid()
<< ", gid " << perms.gid() << ")" << dendl;
trim_cache();
- ldout(cct, 3) << "mknod(" << path << ", 0" << oct << mode << dec << ") = " << res << dendl;
+ ldout(cct, 8) << "mknod(" << path << ", 0" << oct << mode << dec << ") = " << res << dendl;
return res;
fail:
int object_size, const char *data_pool, bool *created,
const UserPerm& perms)
{
- ldout(cct, 3) << "_create(" << dir->ino << " " << name << ", 0" << oct <<
+ ldout(cct, 8) << "_create(" << dir->ino << " " << name << ", 0" << oct <<
mode << dec << ")" << dendl;
if (strlen(name) > NAME_MAX)
reply_error:
trim_cache();
- ldout(cct, 3) << "create(" << path << ", 0" << oct << mode << dec
+ ldout(cct, 8) << "create(" << path << ", 0" << oct << mode << dec
<< " layout " << stripe_unit
<< ' ' << stripe_count
<< ' ' << object_size
int Client::_mkdir(Inode *dir, const char *name, mode_t mode, const UserPerm& perm,
InodeRef *inp)
{
- ldout(cct, 3) << "_mkdir(" << dir->ino << " " << name << ", 0" << oct
+ ldout(cct, 8) << "_mkdir(" << dir->ino << " " << name << ", 0" << oct
<< mode << dec << ", uid " << perm.uid()
<< ", gid " << perm.gid() << ")" << dendl;
trim_cache();
- ldout(cct, 3) << "_mkdir(" << path << ", 0" << oct << mode << dec << ") = " << res << dendl;
+ ldout(cct, 8) << "_mkdir(" << path << ", 0" << oct << mode << dec << ") = " << res << dendl;
return res;
fail:
int Client::_symlink(Inode *dir, const char *name, const char *target,
const UserPerm& perms, InodeRef *inp)
{
- ldout(cct, 3) << "_symlink(" << dir->ino << " " << name << ", " << target
+ ldout(cct, 8) << "_symlink(" << dir->ino << " " << name << ", " << target
<< ", uid " << perms.uid() << ", gid " << perms.gid() << ")"
<< dendl;
res = make_request(req, perms, inp);
trim_cache();
- ldout(cct, 3) << "_symlink(\"" << path << "\", \"" << target << "\") = " <<
+ ldout(cct, 8) << "_symlink(\"" << path << "\", \"" << target << "\") = " <<
res << dendl;
return res;
int Client::_unlink(Inode *dir, const char *name, const UserPerm& perm)
{
- ldout(cct, 3) << "_unlink(" << dir->ino << " " << name
+ ldout(cct, 8) << "_unlink(" << dir->ino << " " << name
<< " uid " << perm.uid() << " gid " << perm.gid()
<< ")" << dendl;
res = make_request(req, perm);
trim_cache();
- ldout(cct, 3) << "unlink(" << path << ") = " << res << dendl;
+ ldout(cct, 8) << "unlink(" << path << ") = " << res << dendl;
return res;
fail:
int Client::_rmdir(Inode *dir, const char *name, const UserPerm& perms)
{
- ldout(cct, 3) << "_rmdir(" << dir->ino << " " << name << " uid "
+ ldout(cct, 8) << "_rmdir(" << dir->ino << " " << name << " uid "
<< perms.uid() << " gid " << perms.gid() << ")" << dendl;
if (dir->snapid != CEPH_NOSNAP && dir->snapid != CEPH_SNAPDIR) {
res = make_request(req, perms);
trim_cache();
- ldout(cct, 3) << "rmdir(" << path << ") = " << res << dendl;
+ ldout(cct, 8) << "rmdir(" << path << ") = " << res << dendl;
return res;
fail:
int Client::_rename(Inode *fromdir, const char *fromname, Inode *todir, const char *toname, const UserPerm& perm)
{
- ldout(cct, 3) << "_rename(" << fromdir->ino << " " << fromname << " to "
+ ldout(cct, 8) << "_rename(" << fromdir->ino << " " << fromname << " to "
<< todir->ino << " " << toname
<< " uid " << perm.uid() << " gid " << perm.gid() << ")"
<< dendl;
// renamed item from our cache
trim_cache();
- ldout(cct, 3) << "_rename(" << from << ", " << to << ") = " << res << dendl;
+ ldout(cct, 8) << "_rename(" << from << ", " << to << ") = " << res << dendl;
return res;
fail:
int Client::_link(Inode *in, Inode *dir, const char *newname, const UserPerm& perm, InodeRef *inp)
{
- ldout(cct, 3) << "_link(" << in->ino << " to " << dir->ino << " " << newname
+ ldout(cct, 8) << "_link(" << in->ino << " to " << dir->ino << " " << newname
<< " uid " << perm.uid() << " gid " << perm.gid() << ")" << dendl;
if (strlen(newname) > NAME_MAX)
ldout(cct, 10) << "link result is " << res << dendl;
trim_cache();
- ldout(cct, 3) << "link(" << existing << ", " << path << ") = " << res << dendl;
+ ldout(cct, 8) << "link(" << existing << ", " << path << ") = " << res << dendl;
return res;
fail:
vinodeno_t vparent = _get_vino(parent);
- ldout(cct, 3) << "_ll_create " << vparent << " " << name << " 0" << oct <<
+ ldout(cct, 8) << "_ll_create " << vparent << " " << name << " 0" << oct <<
mode << dec << " " << ceph_flags_sys2wire(flags) << ", uid " << perms.uid()
<< ", gid " << perms.gid() << dendl;
tout(cct) << "ll_create" << std::endl;
tout(cct) << (unsigned long)*fhp << std::endl;
tout(cct) << ino << std::endl;
- ldout(cct, 3) << "_ll_create " << vparent << " " << name << " 0" << oct <<
+ ldout(cct, 8) << "_ll_create " << vparent << " " << name << " 0" << oct <<
mode << dec << " " << ceph_flags_sys2wire(flags) << " = " << r << " (" <<
*fhp << " " << hex << ino << dec << ")" << dendl;
void clear_dir_complete_and_ordered(Inode *diri, bool complete);
void insert_readdir_results(MetaRequest *request, MetaSession *session, Inode *diri);
Inode* insert_trace(MetaRequest *request, MetaSession *session);
- void update_inode_file_bits(Inode *in, uint64_t truncate_seq, uint64_t truncate_size, uint64_t size,
- uint64_t change_attr, uint64_t time_warp_seq, utime_t ctime,
- utime_t mtime, utime_t atime, version_t inline_version,
- bufferlist& inline_data, int issued);
+ void update_inode_file_size(Inode *in, int issued, uint64_t size,
+ uint64_t truncate_seq, uint64_t truncate_size);
+ void update_inode_file_time(Inode *in, int issued, uint64_t time_warp_seq,
+ utime_t ctime, utime_t mtime, utime_t atime);
+
Inode *add_update_inode(InodeStat *st, utime_t ttl, MetaSession *session,
const UserPerm& request_perms);
Dentry *insert_dentry_inode(Dir *dir, const string& dname, LeaseStat *dlease,
mds_rank_t _get_random_up_mds() const;
int _ll_getattr(Inode *in, int caps, const UserPerm& perms);
+ int _lookup_parent(Inode *in, const UserPerm& perms, Inode **parent=NULL);
+ int _lookup_name(Inode *in, Inode *parent, const UserPerm& perms);
+ int _lookup_ino(inodeno_t ino, const UserPerm& perms, Inode **inode=NULL);
+ bool _ll_forget(Inode *in, int count);
public:
int mount(const std::string &mount_root, const UserPerm& perms,
Inode *ll_get_inode(vinodeno_t vino);
int ll_lookup(Inode *parent, const char *name, struct stat *attr,
Inode **out, const UserPerm& perms);
+ int ll_lookup_inode(struct inodeno_t ino, const UserPerm& perms, Inode **inode);
int ll_lookupx(Inode *parent, const char *name, Inode **out,
struct ceph_statx *stx, unsigned want, unsigned flags,
const UserPerm& perms);
fcntl_locks(NULL), flock_locks(NULL)
{
memset(&dir_layout, 0, sizeof(dir_layout));
- memset("a, 0, sizeof(quota));
}
~Inode();
## Rados object classes
set(cls_dir ${CMAKE_INSTALL_LIBDIR}/rados-classes)
-set(cls_embedded_srcs)
# cls_sdk
add_library(cls_sdk SHARED sdk/cls_sdk.cc)
SOVERSION "1"
INSTALL_RPATH "")
install(TARGETS cls_hello DESTINATION ${cls_dir})
-list(APPEND cls_embedded_srcs ${cls_hello_srcs})
# cls_numops
set(cls_numops_srcs numops/cls_numops.cc)
set(cls_numops_client_srcs numops/cls_numops_client.cc)
add_library(cls_numops_client STATIC ${cls_numops_client_srcs})
-list(APPEND cls_embedded_srcs ${cls_numops_srcs} ${cls_numops_client_srcs})
# cls_rbd
if (WITH_RBD)
add_library(cls_rbd_client STATIC ${cls_rbd_client_srcs})
target_link_libraries(cls_rbd_client cls_lock_client)
- list(APPEND cls_embedded_srcs ${cls_rbd_srcs} ${cls_rbd_client_srcs})
endif (WITH_RBD)
# cls_lock
lock/cls_lock_ops.cc)
add_library(cls_lock_client STATIC ${cls_lock_client_srcs})
-list(APPEND cls_embedded_srcs ${cls_lock_srcs} ${cls_lock_client_srcs})
# cls_refcount
set(cls_refcount_srcs
refcount/cls_refcount_ops.cc)
add_library(cls_refcount_client STATIC ${cls_refcount_client_srcs})
-list(APPEND cls_embedded_srcs ${cls_refcount_srcs} ${cls_refcount_client_srcs})
# cls_version
set(cls_version_srcs version/cls_version.cc)
version/cls_version_types.cc)
add_library(cls_version_client STATIC ${cls_version_client_srcs})
-list(APPEND cls_embedded_srcs ${cls_version_srcs} ${cls_version_client_srcs})
# cls_log
set(cls_log_srcs log/cls_log.cc)
set(cls_log_client_srcs log/cls_log_client.cc)
add_library(cls_log_client STATIC ${cls_log_client_srcs})
-list(APPEND cls_embedded_srcs ${cls_log_srcs} ${cls_log_client_srcs})
# cls_statelog
set(cls_statelog_srcs statelog/cls_statelog.cc)
set(cls_statelog_client_srcs statelog/cls_statelog_client.cc)
add_library(cls_statelog_client STATIC ${cls_statelog_client_srcs})
-list(APPEND cls_embedded_srcs ${cls_statelog_srcs} ${cls_statelog_client_srcs})
# cls_timeindex
set(cls_timeindex_srcs timeindex/cls_timeindex.cc)
set(cls_timeindex_client_srcs timeindex/cls_timeindex_client.cc)
add_library(cls_timeindex_client STATIC ${cls_timeindex_client_srcs})
-list(APPEND cls_embedded_srcs ${cls_timeindex_srcs} ${cls_timeindex_client_srcs})
# cls_replica_log
set(cls_replica_log_srcs replica_log/cls_replica_log.cc)
user/cls_user_ops.cc)
add_library(cls_user_client STATIC ${cls_user_client_srcs})
-list(APPEND cls_embedded_srcs ${cls_user_srcs} ${cls_user_client_srcs})
# cls_journal
set(cls_journal_srcs
journal/cls_journal_types.cc)
add_library(cls_journal_client STATIC ${cls_journal_client_srcs})
-list(APPEND cls_embedded_srcs ${cls_journal_srcs} ${cls_journal_client_srcs})
# cls_rgw
if (WITH_RADOSGW)
rgw/cls_rgw_ops.cc)
add_library(cls_rgw_client STATIC ${cls_rgw_client_srcs})
- list(APPEND cls_embedded_srcs ${cls_rgw_srcs} ${cls_rgw_client_srcs})
endif (WITH_RADOSGW)
# cls_cephfs
cephfs/cls_cephfs_client.cc)
add_library(cls_cephfs_client STATIC ${cls_cephfs_client_srcs})
- list(APPEND cls_embedded_srcs ${cls_cephfs_srcs} ${cls_cephfs_client_srcs})
endif (WITH_CEPHFS)
# cls_lua
lua/cls_lua_client.cc)
add_library(cls_lua_client STATIC ${cls_lua_client_srcs})
-list(APPEND cls_embedded_srcs ${cls_lua_srcs} ${cls_lua_client_srcs})
-
-if(WITH_EMBEDDED)
- include(MergeStaticLibraries)
- list(REMOVE_DUPLICATES cls_embedded_srcs)
- add_library(cephd_cls_base STATIC ${cls_embedded_srcs})
- # while not necessary this seems to bring in the lua's include directories
- # so that cls_lua srcs build correctly
- target_link_libraries(cephd_cls_base liblua)
- set_target_properties(cephd_cls_base PROPERTIES COMPILE_DEFINITIONS BUILDING_FOR_EMBEDDED)
- merge_static_libraries(cephd_cls cephd_cls_base liblua)
-endif()
if (pkeys->empty())
return 0;
- map<string, bufferlist>::reverse_iterator last_element = pkeys->rbegin();
+ auto last_element = pkeys->rbegin();
if ((unsigned char)last_element->first[0] < BI_PREFIX_CHAR) {
/* nothing to see here, move along */
return 0;
}
- map<string, bufferlist>::iterator first_element = pkeys->begin();
+ auto first_element = pkeys->begin();
if ((unsigned char)first_element->first[0] > BI_PREFIX_CHAR) {
return 0;
}
/* let's rebuild the list, only keep entries we're interested in */
- map<string, bufferlist> old_keys;
- old_keys.swap(*pkeys);
+ auto comp = [](const pair<string, bufferlist>& l, const string &r) { return l.first < r; };
+ string new_start = {static_cast<char>(BI_PREFIX_CHAR + 1)};
- for (map<string, bufferlist>::iterator iter = old_keys.begin(); iter != old_keys.end(); ++iter) {
- if ((unsigned char)iter->first[0] != BI_PREFIX_CHAR) {
- (*pkeys)[iter->first] = iter->second;
- }
- }
+ auto lower = pkeys->lower_bound(string{static_cast<char>(BI_PREFIX_CHAR)});
+ auto upper = std::lower_bound(lower, pkeys->end(), new_start, comp);
+ pkeys->erase(lower, upper);
if (num_entries == (int)pkeys->size())
return 0;
map<string, bufferlist> new_keys;
- char c[] = { (char)(BI_PREFIX_CHAR + 1), 0 };
- string new_start = c;
/* now get some more keys */
ret = cls_cxx_map_get_vals(hctx, new_start, filter_prefix, num_entries - pkeys->size(), &new_keys, pmore);
if (ret < 0)
return ret;
- for (map<string, bufferlist>::iterator iter = new_keys.begin(); iter != new_keys.end(); ++iter) {
- (*pkeys)[iter->first] = iter->second;
- }
-
+ pkeys->insert(std::make_move_iterator(new_keys.begin()),
+ std::make_move_iterator(new_keys.end()));
return 0;
}
if (ret < 0 && ret != -ENOENT)
return -EINVAL;
+ if (ret == -ENOENT) {
+ continue;
+ }
+
if (cur_disk_bl.length()) {
bufferlist::iterator cur_disk_iter = cur_disk_bl.begin();
try {
}
break;
case CEPH_RGW_UPDATE:
- if (!cur_disk.exists) {
- // this update would only have been sent by the rgw client
- // if the rgw_bucket_dir_entry existed, however between that
- // check and now the entry has diappeared, so we were likely
- // in the midst of a delete op, and we will not recreate the
- // entry
- CLS_LOG(10,
- "CEPH_RGW_UPDATE not applied because rgw_bucket_dir_entry"
- " no longer exists\n");
- break;
- }
-
CLS_LOG(10, "CEPH_RGW_UPDATE name=%s instance=%s total_entries: %" PRId64 " -> %" PRId64 "\n",
cur_change.key.name.c_str(), cur_change.key.instance.c_str(), stats.num_entries, stats.num_entries + 1);
}
op_ret.is_truncated = (count >= max) || more;
- while (count >= max) {
+ while (count > max) {
op_ret.entries.pop_back();
count--;
}
CLS_LOG(10, "gc_iterate_entries key=%s\n", key.c_str());
- if (!end_key.empty() && key.compare(end_key) >= 0)
+ if (!end_key.empty() && key.compare(end_key) >= 0) {
+ if (truncated)
+ *truncated = false;
return 0;
+ }
if (!key_in_index(key, GC_OBJ_TIME_INDEX))
return 0;
};
WRITE_CLASS_ENCODER(rgw_cls_list_op)
-struct rgw_cls_list_ret
-{
+struct rgw_cls_list_ret {
rgw_bucket_dir dir;
bool is_truncated;
void cls_rgw_bucket_instance_entry::dump(Formatter *f) const
{
- encode_json("reshard_status", (int)reshard_status, f);
+ string status_str;
+ switch(reshard_status) {
+ case CLS_RGW_RESHARD_NONE:
+ status_str= "none";
+ break;
+ case CLS_RGW_RESHARD_IN_PROGRESS:
+ status_str = "in-progress";
+ break;
+ case CLS_RGW_RESHARD_DONE:
+ status_str = "done";
+ break;
+ default:
+ status_str = "invalid";
+ }
+ encode_json("reshard_status", status_str, f);
encode_json("new_bucket_instance_id", new_bucket_instance_id, f);
encode_json("num_shards", num_shards, f);
::decode(val, p);
::decode(delta, p);
::decode(vel, p);
+ last_decay = t;
DECODE_FINISH(p);
}
auto i = sdata->ops_in_flight_sharded.begin();
while (i != sdata->ops_in_flight_sharded.end() &&
i->get_initiated() < too_old) {
+
+ if (!i->warn_interval_multiplier)
+ continue;
+
(*slow)++;
// exponential backoff of warning intervals
void mark_event(const char *event,
utime_t stamp=ceph_clock_now());
+ void mark_nowarn() {
+ warn_interval_multiplier = 0;
+ }
+
virtual const char *state_string() const {
Mutex::Locker l(lock);
return events.rbegin()->c_str();
_show_config(NULL, f);
}
+void md_config_t::config_options(Formatter *f)
+{
+ Mutex::Locker l(lock);
+ f->open_array_section("options");
+ for (const auto& i: schema) {
+ const Option &opt = i.second;
+ opt.dump(f);
+ }
+ f->close_section();
+}
+
void md_config_t::_show_config(std::ostream *out, Formatter *f)
{
if (out) {
void show_config(std::ostream& out);
/// dump all config values to a formatter
void show_config(Formatter *f);
+
+ /// dump all config settings to a formatter
+ void config_options(Formatter *f);
/// obtain a diff between our config values and another md_config_t values
void diff(const md_config_t *other,
namespace std {
template<> struct hash<hobject_t> {
size_t operator()(const hobject_t &r) const {
- static rjhash<uint64_t> I;
- return r.get_hash() ^ I(r.snap);
+ static rjhash<uint64_t> RJ;
+ return RJ(r.get_hash() ^ r.snap);
}
};
} // namespace std
namespace std {
template<> struct hash<ghobject_t> {
size_t operator()(const ghobject_t &r) const {
- static rjhash<uint64_t> I;
- return r.hobj.get_hash() ^ I(r.hobj.snap);
+ static rjhash<uint64_t> RJ;
+ static hash<hobject_t> HO;
+ size_t hash = HO(r.hobj);
+ hash = RJ(hash ^ r.generation);
+ hash = hash ^ r.shard_id.id;
+ return hash;
}
};
} // namespace std
OPTION(rgw_healthcheck_disabling_path, OPT_STR) // path that existence causes the healthcheck to respond 503
OPTION(rgw_s3_auth_use_rados, OPT_BOOL) // should we try to use the internal credentials for s3?
OPTION(rgw_s3_auth_use_keystone, OPT_BOOL) // should we try to use keystone for s3?
+OPTION(rgw_s3_auth_order, OPT_STR) // s3 authentication order to try
OPTION(rgw_barbican_url, OPT_STR) // url for barbican server
/* OpenLDAP-style LDAP parameter strings */
OPTION(rgw_num_zone_opstate_shards, OPT_INT) // max shards for keeping inter-region copy progress info
OPTION(rgw_opstate_ratelimit_sec, OPT_INT) // min time between opstate updates on a single upload (0 for disabling ratelimit)
OPTION(rgw_curl_wait_timeout_ms, OPT_INT) // timeout for certain curl calls
+OPTION(rgw_curl_low_speed_limit, OPT_INT) // low speed limit for certain curl calls
+OPTION(rgw_curl_low_speed_time, OPT_INT) // low speed time for certain curl calls
OPTION(rgw_copy_obj_progress, OPT_BOOL) // should dump progress during long copy operations?
OPTION(rgw_copy_obj_progress_every_bytes, OPT_INT) // min bytes between copy progress output
OPTION(rgw_obj_tombstone_cache_size, OPT_INT) // how many objects in tombstone cache, which is used in multi-zone sync to keep
.set_description(""),
Option("filestore_merge_threshold", Option::TYPE_INT, Option::LEVEL_ADVANCED)
- .set_default(10)
+ .set_default(-10)
.set_description(""),
Option("filestore_split_multiple", Option::TYPE_INT, Option::LEVEL_ADVANCED)
.set_default(false)
.set_description("Should S3 authentication use Keystone."),
+ Option("rgw_s3_auth_order", Option::TYPE_STR, Option::LEVEL_ADVANCED)
+ .set_default("external, local")
+ .set_description("Authentication strategy order to use for s3 authentication")
+ .set_long_description(
+ "Order of authentication strategies to try for s3 authentication, the allowed "
+ "options are a comma separated list of engines external, local. The "
+ "default order is to try all the externally configured engines before "
+ "attempting local rados based authentication"),
+
Option("rgw_barbican_url", Option::TYPE_STR, Option::LEVEL_ADVANCED)
.set_default("")
.set_description("URL to barbican server."),
.set_default(1000)
.set_description(""),
+ Option("rgw_curl_low_speed_limit", Option::TYPE_INT, Option::LEVEL_ADVANCED)
+ .set_default(1024)
+ .set_long_description(
+ "It contains the average transfer speed in bytes per second that the "
+ "transfer should be below during rgw_curl_low_speed_time seconds for libcurl "
+ "to consider it to be too slow and abort. Set it zero to disable this."),
+
+ Option("rgw_curl_low_speed_time", Option::TYPE_INT, Option::LEVEL_ADVANCED)
+ .set_default(30)
+ .set_long_description(
+ "It contains the time in number seconds that the transfer speed should be below "
+ "the rgw_curl_low_speed_limit for the library to consider it too slow and abort. "
+ "Set it zero to disable this."),
+
Option("rgw_copy_obj_progress", Option::TYPE_BOOL, Option::LEVEL_ADVANCED)
.set_default(true)
.set_description("Send progress report through copy operation")
Option("mds_inject_migrator_session_race", Option::TYPE_BOOL, Option::LEVEL_DEV)
.set_default(false),
+
+ Option("mds_inject_migrator_message_loss", Option::TYPE_INT, Option::LEVEL_DEV)
+ .set_default(0)
+ .set_description(""),
+
});
}
f->dump_string("nick", "");
}
f->dump_int("priority", get_adjusted_priority(d->prio));
+
+ if (d->unit == NONE) {
+ f->dump_string("units", "none");
+ } else if (d->unit == BYTES) {
+ f->dump_string("units", "bytes");
+ }
f->close_section();
} else {
if (d->type & PERFCOUNTER_LONGRUNAVG) {
void PerfCountersBuilder::add_u64_counter(
int idx, const char *name,
- const char *description, const char *nick, int prio)
+ const char *description, const char *nick, int prio, int unit)
{
add_impl(idx, name, description, nick, prio,
- PERFCOUNTER_U64 | PERFCOUNTER_COUNTER);
+ PERFCOUNTER_U64 | PERFCOUNTER_COUNTER, unit);
}
void PerfCountersBuilder::add_u64(
int idx, const char *name,
- const char *description, const char *nick, int prio)
+ const char *description, const char *nick, int prio, int unit)
{
- add_impl(idx, name, description, nick, prio, PERFCOUNTER_U64);
+ add_impl(idx, name, description, nick, prio, PERFCOUNTER_U64, unit);
}
void PerfCountersBuilder::add_u64_avg(
int idx, const char *name,
- const char *description, const char *nick, int prio)
+ const char *description, const char *nick, int prio, int unit)
{
add_impl(idx, name, description, nick, prio,
- PERFCOUNTER_U64 | PERFCOUNTER_LONGRUNAVG);
+ PERFCOUNTER_U64 | PERFCOUNTER_LONGRUNAVG, unit);
}
void PerfCountersBuilder::add_time(
int idx, const char *name,
PerfHistogramCommon::axis_config_d x_axis_config,
PerfHistogramCommon::axis_config_d y_axis_config,
- const char *description, const char *nick, int prio)
+ const char *description, const char *nick, int prio, int unit)
{
add_impl(idx, name, description, nick, prio,
- PERFCOUNTER_U64 | PERFCOUNTER_HISTOGRAM | PERFCOUNTER_COUNTER,
+ PERFCOUNTER_U64 | PERFCOUNTER_HISTOGRAM | PERFCOUNTER_COUNTER, unit,
unique_ptr<PerfHistogram<>>{new PerfHistogram<>{x_axis_config, y_axis_config}});
}
void PerfCountersBuilder::add_impl(
int idx, const char *name,
- const char *description, const char *nick, int prio, int ty,
+ const char *description, const char *nick, int prio, int ty, int unit,
unique_ptr<PerfHistogram<>> histogram)
{
assert(idx > m_perf_counters->m_lower_bound);
data.nick = nick;
data.prio = prio ? prio : prio_default;
data.type = (enum perfcounter_type_d)ty;
+ data.unit = (enum unit_t) unit;
data.histogram = std::move(histogram);
}
PERFCOUNTER_HISTOGRAM = 0x10, // histogram (vector) of values
};
+enum unit_t : uint8_t
+{
+ BYTES,
+ NONE
+};
/* Class for constructing a PerfCounters object.
*
};
void add_u64(int key, const char *name,
const char *description=NULL, const char *nick = NULL,
- int prio=0);
+ int prio=0, int unit=NONE);
void add_u64_counter(int key, const char *name,
const char *description=NULL,
const char *nick = NULL,
- int prio=0);
+ int prio=0, int unit=NONE);
void add_u64_avg(int key, const char *name,
const char *description=NULL,
const char *nick = NULL,
- int prio=0);
+ int prio=0, int unit=NONE);
void add_time(int key, const char *name,
const char *description=NULL,
const char *nick = NULL,
PerfHistogramCommon::axis_config_d y_axis_config,
const char *description=NULL,
const char* nick = NULL,
- int prio=0);
+ int prio=0, int unit=NONE);
void set_prio_default(int prio_)
{
PerfCountersBuilder(const PerfCountersBuilder &rhs);
PerfCountersBuilder& operator=(const PerfCountersBuilder &rhs);
void add_impl(int idx, const char *name,
- const char *description, const char *nick, int prio, int ty,
+ const char *description, const char *nick, int prio, int ty, int unit=NONE,
unique_ptr<PerfHistogram<>> histogram = nullptr);
PerfCounters *m_perf_counters;
: name(NULL),
description(NULL),
nick(NULL),
- type(PERFCOUNTER_NONE)
+ type(PERFCOUNTER_NONE),
+ unit(NONE)
{}
perf_counter_data_any_d(const perf_counter_data_any_d& other)
: name(other.name),
description(other.description),
nick(other.nick),
- type(other.type),
- u64(other.u64.load()) {
+ type(other.type),
+ unit(other.unit),
+ u64(other.u64.load()) {
pair<uint64_t,uint64_t> a = other.read_avg();
u64 = a.first;
avgcount = a.second;
const char *nick;
uint8_t prio = 0;
enum perfcounter_type_d type;
+ enum unit_t unit;
std::atomic<uint64_t> u64 = { 0 };
std::atomic<uint64_t> avgcount = { 0 };
std::atomic<uint64_t> avgcount2 = { 0 };
void set_hinfo_corrupted() {
errors |= err_t::HINFO_CORRUPTED;
}
+ bool only_data_digest_mismatch_info() const {
+ return errors == err_t::DATA_DIGEST_MISMATCH_INFO;
+ }
+ void clear_data_digest_mismatch_info() {
+ errors &= ~err_t::DATA_DIGEST_MISMATCH_INFO;
+ }
void encode(bufferlist& bl) const;
void decode(bufferlist::iterator& bp);
};
#include <climits>
#include <limits>
+#include <cmath>
#include <sstream>
+#include <boost/utility/string_view.hpp>
using std::ostringstream;
-long long strict_strtoll(const char *str, int base, std::string *err)
+long long strict_strtoll(const boost::string_view str, int base, std::string *err)
{
char *endptr;
- std::string errStr;
errno = 0; /* To distinguish success/failure after call (see man page) */
- long long ret = strtoll(str, &endptr, base);
-
- if (endptr == str) {
- errStr = "Expected option value to be integer, got '";
- errStr.append(str);
- errStr.append("'");
- *err = errStr;
+ long long ret = strtoll(str.data(), &endptr, base);
+ if (endptr == str.data() || endptr != str.data() + str.size()) {
+ *err = (std::string{"Expected option value to be integer, got '"} +
+ std::string{str} + "'");
return 0;
}
- if ((errno == ERANGE && (ret == LLONG_MAX || ret == LLONG_MIN))
- || (errno != 0 && ret == 0)) {
- errStr = "The option value '";
- errStr.append(str);
- errStr.append("'");
- errStr.append(" seems to be invalid");
- *err = errStr;
- return 0;
- }
- if (*endptr != '\0') {
- errStr = "The option value '";
- errStr.append(str);
- errStr.append("'");
- errStr.append(" contains invalid digits");
- *err = errStr;
+ if (errno) {
+ *err = (std::string{"The option value '"} + std::string{str} +
+ "' seems to be invalid");
return 0;
}
*err = "";
return ret;
}
-int strict_strtol(const char *str, int base, std::string *err)
+long long strict_strtoll(const char *str, int base, std::string *err)
+{
+ return strict_strtoll(boost::string_view(str), base, err);
+}
+
+int strict_strtol(const boost::string_view str, int base, std::string *err)
{
- std::string errStr;
long long ret = strict_strtoll(str, base, err);
if (!err->empty())
return 0;
if ((ret <= INT_MIN) || (ret >= INT_MAX)) {
- errStr = "The option value '";
- errStr.append(str);
- errStr.append("'");
- errStr.append(" seems to be invalid");
- *err = errStr;
+ ostringstream errStr;
+ errStr << "The option value '" << str << "' seems to be invalid";
+ *err = errStr.str();
return 0;
}
return static_cast<int>(ret);
}
-double strict_strtod(const char *str, std::string *err)
+int strict_strtol(const char *str, int base, std::string *err)
+{
+ return strict_strtol(boost::string_view(str), base, err);
+}
+
+double strict_strtod(const boost::string_view str, std::string *err)
{
char *endptr;
errno = 0; /* To distinguish success/failure after call (see man page) */
- double ret = strtod(str, &endptr);
+ double ret = strtod(str.data(), &endptr);
if (errno == ERANGE) {
ostringstream oss;
oss << "strict_strtod: floating point overflow or underflow parsing '"
return ret;
}
-float strict_strtof(const char *str, std::string *err)
+double strict_strtod(const char *str, std::string *err)
+{
+ return strict_strtod(boost::string_view(str), err);
+}
+
+float strict_strtof(const boost::string_view str, std::string *err)
{
char *endptr;
errno = 0; /* To distinguish success/failure after call (see man page) */
- float ret = strtof(str, &endptr);
+ float ret = strtof(str.data(), &endptr);
if (errno == ERANGE) {
ostringstream oss;
oss << "strict_strtof: floating point overflow or underflow parsing '"
return ret;
}
+float strict_strtof(const char *str, std::string *err)
+{
+ return strict_strtof(boost::string_view(str), err);
+}
+
template<typename T>
-T strict_si_cast(const char *str, std::string *err)
+T strict_iec_cast(const boost::string_view str, std::string *err)
{
- std::string s(str);
- if (s.empty()) {
- *err = "strict_sistrtoll: value not specified";
+ if (str.empty()) {
+ *err = "strict_iecstrtoll: value not specified";
return 0;
}
- const char &u = s.back();
+ // get a view of the unit and of the value
+ boost::string_view unit;
+ boost::string_view n = str;
+ size_t u = str.find_first_not_of("0123456789-+");
int m = 0;
- if (u == 'B')
- m = 0;
- else if (u == 'K')
- m = 10;
- else if (u == 'M')
- m = 20;
- else if (u == 'G')
- m = 30;
- else if (u == 'T')
- m = 40;
- else if (u == 'P')
- m = 50;
- else if (u == 'E')
- m = 60;
- else
- m = -1;
-
- if (m >= 0)
- s.pop_back();
- else
- m = 0;
-
- long long ll = strict_strtoll(s.c_str(), 10, err);
+ // deal with unit prefix if there is one
+ if (u != boost::string_view::npos) {
+ n = str.substr(0, u);
+ unit = str.substr(u, str.length() - u);
+ // we accept both old SI prefixes as well as the proper IEC prefixes
+ // i.e. K, M, ... and Ki, Mi, ...
+ if (unit.back() == 'i') {
+ if (unit.front() == 'B') {
+ *err = "strict_iecstrtoll: illegal prefix \"Bi\"";
+ return 0;
+ }
+ }
+ if (unit.length() > 2) {
+ *err = "strict_iecstrtoll: illegal prefix (length > 2)";
+ return 0;
+ }
+ switch(unit.front()) {
+ case 'K':
+ m = 10;
+ break;
+ case 'M':
+ m = 20;
+ break;
+ case 'G':
+ m = 30;
+ break;
+ case 'T':
+ m = 40;
+ break;
+ case 'P':
+ m = 50;
+ break;
+ case 'E':
+ m = 60;
+ break;
+ case 'B':
+ break;
+ default:
+ *err = "strict_iecstrtoll: unit prefix not recognized";
+ return 0;
+ }
+ }
+
+ long long ll = strict_strtoll(n, 10, err);
if (ll < 0 && !std::numeric_limits<T>::is_signed) {
- *err = "strict_sistrtoll: value should not be negative";
+ *err = "strict_iecstrtoll: value should not be negative";
return 0;
}
if (static_cast<unsigned>(m) >= sizeof(T) * CHAR_BIT) {
- *err = ("strict_sistrtoll: the SI prefix is too large for the designated "
- "type");
+ *err = ("strict_iecstrtoll: the IEC prefix is too large for the designated "
+ "type");
return 0;
}
using promoted_t = typename std::common_type<decltype(ll), T>::type;
if (static_cast<promoted_t>(ll) <
static_cast<promoted_t>(std::numeric_limits<T>::min()) >> m) {
- *err = "strict_sistrtoll: value seems to be too small";
+ *err = "strict_iecstrtoll: value seems to be too small";
return 0;
}
if (static_cast<promoted_t>(ll) >
static_cast<promoted_t>(std::numeric_limits<T>::max()) >> m) {
- *err = "strict_sistrtoll: value seems to be too large";
+ *err = "strict_iecstrtoll: value seems to be too large";
return 0;
}
return (ll << m);
}
-template int strict_si_cast<int>(const char *str, std::string *err);
-template long strict_si_cast<long>(const char *str, std::string *err);
-template long long strict_si_cast<long long>(const char *str, std::string *err);
-template uint64_t strict_si_cast<uint64_t>(const char *str, std::string *err);
-template uint32_t strict_si_cast<uint32_t>(const char *str, std::string *err);
+template int strict_iec_cast<int>(const boost::string_view str, std::string *err);
+template long strict_iec_cast<long>(const boost::string_view str, std::string *err);
+template long long strict_iec_cast<long long>(const boost::string_view str, std::string *err);
+template uint64_t strict_iec_cast<uint64_t>(const boost::string_view str, std::string *err);
+template uint32_t strict_iec_cast<uint32_t>(const boost::string_view str, std::string *err);
+
+uint64_t strict_iecstrtoll(const boost::string_view str, std::string *err)
+{
+ return strict_iec_cast<uint64_t>(str, err);
+}
+
+uint64_t strict_iecstrtoll(const char *str, std::string *err)
+{
+ return strict_iec_cast<uint64_t>(boost::string_view(str), err);
+}
+
+template<typename T>
+T strict_iec_cast(const char *str, std::string *err)
+{
+ return strict_iec_cast<T>(boost::string_view(str), err);
+}
+
+template int strict_iec_cast<int>(const char *str, std::string *err);
+template long strict_iec_cast<long>(const char *str, std::string *err);
+template long long strict_iec_cast<long long>(const char *str, std::string *err);
+template uint64_t strict_iec_cast<uint64_t>(const char *str, std::string *err);
+template uint32_t strict_iec_cast<uint32_t>(const char *str, std::string *err);
+
+template<typename T>
+T strict_si_cast(const boost::string_view str, std::string *err)
+{
+ if (str.empty()) {
+ *err = "strict_sistrtoll: value not specified";
+ return 0;
+ }
+ boost::string_view n = str;
+ int m = 0;
+ // deal with unit prefix if there is one
+ if (str.find_first_not_of("0123456789+-") != boost::string_view::npos) {
+ const char &u = str.back();
+ if (u == 'K')
+ m = 3;
+ else if (u == 'M')
+ m = 6;
+ else if (u == 'G')
+ m = 9;
+ else if (u == 'T')
+ m = 12;
+ else if (u == 'P')
+ m = 15;
+ else if (u == 'E')
+ m = 18;
+ else if (u != 'B') {
+ *err = "strict_si_cast: unit prefix not recognized";
+ return 0;
+ }
+
+ if (m >= 3)
+ n = str.substr(0, str.length() -1);
+ }
+
+ long long ll = strict_strtoll(n, 10, err);
+ if (ll < 0 && !std::numeric_limits<T>::is_signed) {
+ *err = "strict_sistrtoll: value should not be negative";
+ return 0;
+ }
+ using promoted_t = typename std::common_type<decltype(ll), T>::type;
+ if (static_cast<promoted_t>(ll) <
+ static_cast<promoted_t>(std::numeric_limits<T>::min()) / pow (10, m)) {
+ *err = "strict_sistrtoll: value seems to be too small";
+ return 0;
+ }
+ if (static_cast<promoted_t>(ll) >
+ static_cast<promoted_t>(std::numeric_limits<T>::max()) / pow (10, m)) {
+ *err = "strict_sistrtoll: value seems to be too large";
+ return 0;
+ }
+ return (ll * pow (10, m));
+}
+
+template int strict_si_cast<int>(const boost::string_view str, std::string *err);
+template long strict_si_cast<long>(const boost::string_view str, std::string *err);
+template long long strict_si_cast<long long>(const boost::string_view str, std::string *err);
+template uint64_t strict_si_cast<uint64_t>(const boost::string_view str, std::string *err);
+template uint32_t strict_si_cast<uint32_t>(const boost::string_view str, std::string *err);
+
+uint64_t strict_sistrtoll(const boost::string_view str, std::string *err)
+{
+ return strict_si_cast<uint64_t>(str, err);
+}
uint64_t strict_sistrtoll(const char *str, std::string *err)
{
return strict_si_cast<uint64_t>(str, err);
}
+
+template<typename T>
+T strict_si_cast(const char *str, std::string *err)
+{
+ return strict_si_cast<T>(boost::string_view(str), err);
+}
+
+template int strict_si_cast<int>(const char *str, std::string *err);
+template long strict_si_cast<long>(const char *str, std::string *err);
+template long long strict_si_cast<long long>(const char *str, std::string *err);
+template uint64_t strict_si_cast<uint64_t>(const char *str, std::string *err);
+template uint32_t strict_si_cast<uint32_t>(const char *str, std::string *err);
float strict_strtof(const char *str, std::string *err);
+uint64_t strict_iecstrtoll(const char *str, std::string *err);
+
+template<typename T>
+T strict_iec_cast(const char *str, std::string *err);
+
uint64_t strict_sistrtoll(const char *str, std::string *err);
template<typename T>
#include <stdio.h>
-int64_t unit_to_bytesize(string val, ostream *pss)
-{
- if (val.empty()) {
- if (pss)
- *pss << "value is empty!";
- return -EINVAL;
- }
-
- char c = val[val.length()-1];
- int modifier = 0;
- if (!::isdigit(c)) {
- if (val.length() < 2) {
- if (pss)
- *pss << "invalid value: " << val;
- return -EINVAL;
- }
- val = val.substr(0,val.length()-1);
- switch (c) {
- case 'B':
- break;
- case 'k':
- case 'K':
- modifier = 10;
- break;
- case 'M':
- modifier = 20;
- break;
- case 'G':
- modifier = 30;
- break;
- case 'T':
- modifier = 40;
- break;
- case 'P':
- modifier = 50;
- break;
- case 'E':
- modifier = 60;
- break;
- default:
- if (pss)
- *pss << "unrecognized modifier '" << c << "'" << std::endl;
- return -EINVAL;
- }
- }
-
- if (val[0] == '+' || val[0] == '-') {
- if (pss)
- *pss << "expected numerical value, got: " << val;
- return -EINVAL;
- }
-
- string err;
- int64_t r = strict_strtoll(val.c_str(), 10, &err);
- if ((r == 0) && !err.empty()) {
- if (pss)
- *pss << err;
- return -1;
- }
- if (r < 0) {
- if (pss)
- *pss << "unable to parse positive integer '" << val << "'";
- return -1;
- }
- return (r * (1LL << modifier));
-}
-
int get_fs_stats(ceph_data_stats_t &stats, const char *path)
{
if (!path)
add_custom_target(compressor_plugins DEPENDS
${ceph_compressor_libs})
-
-if(WITH_EMBEDDED)
- include(MergeStaticLibraries)
- add_library(cephd_compressor_base STATIC ${compressor_srcs})
- set_target_properties(cephd_compressor_base PROPERTIES COMPILE_DEFINITIONS BUILDING_FOR_EMBEDDED)
- set(cephd_compressor_libs
- cephd_compressor_base
- cephd_compressor_snappy
- cephd_compressor_zlib
- cephd_compressor_zstd)
- if (HAVE_LZ4)
- list(APPEND cephd_compressor_libs cephd_compressor_lz4)
- endif()
- merge_static_libraries(cephd_compressor ${cephd_compressor_libs})
-endif()
SOVERSION 2
INSTALL_RPATH "")
install(TARGETS ceph_lz4 DESTINATION ${compressor_plugin_dir})
-
-if(WITH_EMBEDDED)
- add_library(cephd_compressor_lz4 STATIC ${lz4_sources})
- set_target_properties(cephd_compressor_lz4 PROPERTIES COMPILE_DEFINITIONS BUILDING_FOR_EMBEDDED)
-endif()
#include "ceph_ver.h"
#include "CompressionPluginLZ4.h"
-#ifndef BUILDING_FOR_EMBEDDED
-
// -----------------------------------------------------------------------------
const char *__ceph_plugin_version()
return instance->add(type, name, new CompressionPluginLZ4(cct));
}
-
-#endif // !BUILDING_FOR_EMBEDDED
SOVERSION 2
INSTALL_RPATH "")
install(TARGETS ceph_snappy DESTINATION ${compressor_plugin_dir})
-
-if(WITH_EMBEDDED)
- add_library(cephd_compressor_snappy STATIC ${snappy_sources})
- set_target_properties(cephd_compressor_snappy PROPERTIES COMPILE_DEFINITIONS BUILDING_FOR_EMBEDDED)
-endif()
#include "ceph_ver.h"
#include "CompressionPluginSnappy.h"
-#ifndef BUILDING_FOR_EMBEDDED
-
// -----------------------------------------------------------------------------
const char *__ceph_plugin_version()
return instance->add(type, name, new CompressionPluginSnappy(cct));
}
-
-#endif // !BUILDING_FOR_EMBEDDED
SOVERSION 2
INSTALL_RPATH "")
install(TARGETS ceph_zlib DESTINATION ${compressor_plugin_dir})
-
-if(WITH_EMBEDDED)
- add_library(cephd_compressor_zlib STATIC ${zlib_sources})
- target_include_directories(cephd_compressor_zlib PRIVATE "${CMAKE_SOURCE_DIR}/src/isa-l/include")
- set_target_properties(cephd_compressor_zlib PROPERTIES COMPILE_DEFINITIONS BUILDING_FOR_EMBEDDED)
-endif()
#include "ceph_ver.h"
#include "CompressionPluginZlib.h"
-#ifndef BUILDING_FOR_EMBEDDED
// -----------------------------------------------------------------------------
const char *__ceph_plugin_version()
return instance->add(type, name, new CompressionPluginZlib(cct));
}
-
-#endif // !BUILDING_FOR_EMBEDDED
target_link_libraries(ceph_zstd zstd)
set_target_properties(ceph_zstd PROPERTIES VERSION 2.0.0 SOVERSION 2)
install(TARGETS ceph_zstd DESTINATION ${compressor_plugin_dir})
-
-if(WITH_EMBEDDED)
- add_library(cephd_compressor_zstd STATIC ${zstd_sources})
- set_target_properties(cephd_compressor_zstd PROPERTIES COMPILE_DEFINITIONS BUILDING_FOR_EMBEDDED)
-endif()
#include "ceph_ver.h"
#include "CompressionPluginZstd.h"
-#ifndef BUILDING_FOR_EMBEDDED
-
// -----------------------------------------------------------------------------
const char *__ceph_plugin_version()
return instance->add(type, name, new CompressionPluginZstd(cct));
}
-
-#endif // !BUILDING_FOR_EMBEDDED
if (HAVE_BETTER_YASM_ELF64)
add_subdirectory(isa)
set(EC_ISA_LIB ec_isa)
- set(EC_ISA_EMBEDDED_LIB cephd_ec_isa)
endif (HAVE_BETTER_YASM_ELF64)
add_library(erasure_code STATIC ErasureCodePlugin.cc)
ec_lrc
ec_jerasure
ec_shec)
-
-if(WITH_EMBEDDED)
- include(MergeStaticLibraries)
- add_library(cephd_ec_base STATIC $<TARGET_OBJECTS:erasure_code_objs>)
- set_target_properties(cephd_ec_base PROPERTIES COMPILE_DEFINITIONS BUILDING_FOR_EMBEDDED)
- merge_static_libraries(cephd_ec cephd_ec_base ${EC_ISA_EMBEDDED_LIB} cephd_ec_jerasure cephd_ec_lrc cephd_ec_shec)
-endif()
set_target_properties(ec_isa PROPERTIES
INSTALL_RPATH "")
install(TARGETS ec_isa DESTINATION ${erasure_plugin_dir})
-
-if(WITH_EMBEDDED)
- add_library(cephd_ec_isa STATIC ${isa_srcs})
- set_target_properties(cephd_ec_isa PROPERTIES COMPILE_DEFINITIONS BUILDING_FOR_EMBEDDED)
-endif()
return 0;
}
-#ifndef BUILDING_FOR_EMBEDDED
-
// -----------------------------------------------------------------------------
const char *__erasure_code_version()
return instance.add(plugin_name, new ErasureCodePluginIsa());
}
-
-#endif
install(TARGETS ${plugin_name} DESTINATION ${erasure_plugin_dir})
add_dependencies(ec_jerasure ${plugin_name})
endforeach()
-
-if(WITH_EMBEDDED)
- add_library(cephd_ec_jerasure STATIC
- $<TARGET_OBJECTS:gf-complete_objs>
- $<TARGET_OBJECTS:jerasure_objs>
- ${jerasure_utils_src})
- set_target_properties(cephd_ec_jerasure PROPERTIES COMPILE_DEFINITIONS BUILDING_FOR_EMBEDDED)
-endif()
return 0;
}
-#ifndef BUILDING_FOR_EMBEDDED
-
const char *__erasure_code_version() { return CEPH_GIT_NICE_VER; }
int __erasure_code_init(char *plugin_name, char *directory)
}
return instance.add(plugin_name, new ErasureCodePluginJerasure());
}
-
-#endif
\ No newline at end of file
INSTALL_RPATH "")
target_link_libraries(ec_lrc json_spirit)
install(TARGETS ec_lrc DESTINATION ${erasure_plugin_dir})
-
-if(WITH_EMBEDDED)
- add_library(cephd_ec_lrc STATIC ${lrc_srcs})
- set_target_properties(cephd_ec_lrc PROPERTIES COMPILE_DEFINITIONS BUILDING_FOR_EMBEDDED)
-endif()
return 0;
};
-#ifndef BUILDING_FOR_EMBEDDED
-
const char *__erasure_code_version() { return CEPH_GIT_NICE_VER; }
int __erasure_code_init(char *plugin_name, char *directory)
ErasureCodePluginRegistry &instance = ErasureCodePluginRegistry::instance();
return instance.add(plugin_name, new ErasureCodePluginLrc());
}
-
-#endif
install(TARGETS ${plugin_name} DESTINATION ${erasure_plugin_dir})
add_dependencies(ec_shec ${plugin_name})
endforeach()
-
-if(WITH_EMBEDDED)
- # note we rely on the fact this will always be statically linked with jerasure
- add_library(cephd_ec_shec STATIC ${shec_utils_srcs})
- set_target_properties(cephd_ec_shec PROPERTIES COMPILE_DEFINITIONS BUILDING_FOR_EMBEDDED)
-endif()
return 0;
}
-#ifndef BUILDING_FOR_EMBEDDED
-
const char *__erasure_code_version() { return CEPH_GIT_NICE_VER; }
int __erasure_code_init(char *plugin_name, char *directory = (char *)"")
}
return instance.add(plugin_name, new ErasureCodePluginShec());
}
-
-#endif
\ No newline at end of file
#endif
}
+int reopen_as_null(CephContext *cct, int fd)
+{
+ int newfd = open("/dev/null", O_RDONLY);
+ if (newfd < 0) {
+ int err = errno;
+ lderr(cct) << __func__ << " failed to open /dev/null: " << cpp_strerror(err)
+ << dendl;
+ return -1;
+ }
+ // atomically dup newfd to target fd. target fd is implicitly closed if
+ // open and atomically replaced; see man dup2
+ int r = dup2(newfd, fd);
+ if (r < 0) {
+ int err = errno;
+ lderr(cct) << __func__ << " failed to dup2 " << fd << ": "
+ << cpp_strerror(err) << dendl;
+ return -1;
+ }
+ // close newfd (we cloned it to target fd)
+ VOID_TEMP_FAILURE_RETRY(close(newfd));
+ return 0;
+}
+
void global_init_postfork_start(CephContext *cct)
{
// restart log thread
* guarantee that nobody ever writes to stdout, even though they're not
* supposed to.
*/
- VOID_TEMP_FAILURE_RETRY(close(STDIN_FILENO));
- if (open("/dev/null", O_RDONLY) < 0) {
- int err = errno;
- derr << "global_init_daemonize: open(/dev/null) failed: error "
- << err << dendl;
- exit(1);
- }
+ reopen_as_null(cct, STDIN_FILENO);
const md_config_t *conf = cct->_conf;
if (pidfile_write(conf) < 0)
}
}
- VOID_TEMP_FAILURE_RETRY(close(STDOUT_FILENO));
- if (open("/dev/null", O_RDONLY) < 0) {
- int err = errno;
- derr << "global_init_daemonize: open(/dev/null) failed: error "
- << err << dendl;
- exit(1);
- }
+ reopen_as_null(cct, STDOUT_FILENO);
ldout(cct, 1) << "finished global_init_daemonize" << dendl;
}
*/
int global_init_shutdown_stderr(CephContext *cct)
{
- VOID_TEMP_FAILURE_RETRY(close(STDERR_FILENO));
- if (open("/dev/null", O_RDONLY) < 0) {
- int err = errno;
- derr << "global_init_shutdown_stderr: open(/dev/null) failed: error "
- << err << dendl;
- return 1;
- }
+ reopen_as_null(cct, STDERR_FILENO);
cct->_log->set_stderr_level(-1, -1);
return 0;
}
CEPH_CAP_LINK_EXCL | \
CEPH_CAP_XATTR_EXCL | \
CEPH_CAP_FILE_EXCL)
+#define CEPH_CAP_ANY_FILE_RD (CEPH_CAP_FILE_RD | CEPH_CAP_FILE_CACHE | \
+ CEPH_CAP_FILE_SHARED)
#define CEPH_CAP_ANY_FILE_WR (CEPH_CAP_FILE_WR | CEPH_CAP_FILE_BUFFER | \
CEPH_CAP_FILE_EXCL)
#define CEPH_CAP_ANY_WR (CEPH_CAP_ANY_EXCL | CEPH_CAP_ANY_FILE_WR)
+++ /dev/null
-#pragma once
-
-#ifdef __cplusplus
-extern "C" {
-#endif
-
-#define LIBCEPHD_VER_MAJOR 0
-#define LIBCEPHD_VER_MINOR 1
-#define LIBCEPHD_VER_PATCH 0
-
-#define LIBCEPHFD_VERSION(maj, min, extra) ((maj << 16) + (min << 8) + extra)
-#define LIBCEPHFD_VERSION_CODE LIBCEPHD_VERSION(LIBCEPHD_VER_MAJOR, LIBCEPHD_VER_MINOR, LIBCEPHD_VER_PATCH)
-
-#define CEPH_LIBCEPHD_API __attribute__ ((visibility ("default")))
-
-/**
- * Get the API version of libcephd. We use semantic versioning
- * for the API:
- *
- * - incrementing major is for backwards-incompatible changes
- * - incrementing minor is for backwards-compatible changes
- * - incrementing extra is for bug fixes
- *
- * @param pmajor where to store the major version number
- * @param pminor where to store the minor version number
- * @param ppatch where to store the patch version number
- */
-CEPH_LIBCEPHD_API void cephd_version(int *pmajor, int *pminor, int *ppatch);
-
-/**
- * Gets the runtime version of ceph.
- *
- * @param pmajor where to store the major version number
- * @param pminor where to store the minor version number
- * @param ppatch where to store the patch version number
- */
-CEPH_LIBCEPHD_API const char *ceph_version(int *pmajor, int *pminor, int *ppatch);
-
-/**
- * Generates a new cluster id (fsid) and returns a hexadecimal string.
- *
- * @param context where to the store the handle
- * @param buf where to write the fsid
- * @param len the size of buf in bytes (should be at least 37)
- * @returns 0 on success, negative error code on failure
- * @returns -ERANGE if the buffer is too short to contain the key
- */
-CEPH_LIBCEPHD_API int cephd_generate_fsid(char *buf, size_t len);
-
-/**
- * Generates a new secret key and returns a base64 encoded string.
- *
- * @param context where to the store the handle
- * @param buf where to write the fsid
- * @param len the size of buf in bytes
- * @returns 0 on success, negative error code on failure
- * @returns -ERANGE if the buffer is too short to contain the key
- */
-CEPH_LIBCEPHD_API int cephd_generate_secret_key(char *buf, size_t len);
-
-/**
- * Runs ceph-mon passing in command line args
- *
- * @param argc number of parameters
- * @param argv array of string arguments
- * @returns 0 on success, negative error code on failure
- */
-CEPH_LIBCEPHD_API int cephd_run_mon(int argc, const char **argv);
-
-/**
- * Runs ceph-osd passing in command line args
- *
- * @param argc number of parameters
- * @param argv array of string arguments
- * @returns 0 on success, negative error code on failure
- */
-CEPH_LIBCEPHD_API int cephd_run_osd(int argc, const char **argv);
-
-/**
- * Runs ceph-mds passing in command line args
- *
- * @param argc number of parameters
- * @param argv array of string arguments
- * @returns 0 on success, negative error code on failure
- */
-CEPH_LIBCEPHD_API int cephd_run_mds(int argc, const char **argv);
-
-/**
- * Runs ceph-rgw passing in command line args
- *
- * @param argc number of parameters
- * @param argv array of string arguments
- * @returns 0 on success, negative error code on failure
- */
-CEPH_LIBCEPHD_API int cephd_run_rgw(int argc, const char **argv);
-
-/**
- * Runs radosgw-admin passing in command line args
- *
- * @param argc number of parameters
- * @param argv array of string arguments
- * @returns 0 on success, negative error code on failure
- */
-CEPH_LIBCEPHD_API int cephd_run_rgw_admin(int argc, const char **argv);
-
-#ifdef __cplusplus
-}
-#endif
/* ibverbs experimental conditional compilation */
#cmakedefine HAVE_IBV_EXP
-/* define if embedded enabled */
-#cmakedefine WITH_EMBEDDED
-
/* define if cephfs enabled */
#cmakedefine WITH_CEPHFS
extern "C" {
#endif
-#ifndef BUILDING_FOR_EMBEDDED
#define CLS_VER(maj,min) \
int __cls_ver__## maj ## _ ##min = 0; \
int __cls_ver_maj = maj; \
const char *__cls_name = #name;
#define CLS_INIT(name) \
void CEPH_CLS_API __cls_init()
-#else
-#define CLS_VER(maj,min)
-#define CLS_NAME(name)
-#define CLS_INIT(name) \
-void CEPH_CLS_API name##_cls_init()
-#endif
#define CLS_METHOD_RD 0x1 /// method executes read operations
#define CLS_METHOD_WR 0x2 /// method executes write operations
// --
-struct prettybyte_t {
- uint64_t v;
- // cppcheck-suppress noExplicitConstructor
- prettybyte_t(uint64_t _v) : v(_v) {}
-};
-
-inline ostream& operator<<(ostream& out, const prettybyte_t& b)
-{
- uint64_t bump_after = 100;
- if (b.v > bump_after << 60)
- return out << (b.v >> 60) << " EB";
- if (b.v > bump_after << 50)
- return out << (b.v >> 50) << " PB";
- if (b.v > bump_after << 40)
- return out << (b.v >> 40) << " TB";
- if (b.v > bump_after << 30)
- return out << (b.v >> 30) << " GB";
- if (b.v > bump_after << 20)
- return out << (b.v >> 20) << " MB";
- if (b.v > bump_after << 10)
- return out << (b.v >> 10) << " kB";
- return out << b.v << " bytes";
+namespace {
+ inline ostream& format_u(ostream& out, const uint64_t v, const uint64_t n,
+ const int index, const uint64_t mult, const char* u)
+ {
+ char buffer[32];
+
+ if (index == 0) {
+ (void) snprintf(buffer, sizeof(buffer), "%" PRId64 "%s", n, u);
+ } else if ((v % mult) == 0) {
+ // If this is an even multiple of the base, always display
+ // without any decimal fraction.
+ (void) snprintf(buffer, sizeof(buffer), "%" PRId64 "%s", n, u);
+ } else {
+ // We want to choose a precision that reflects the best choice
+ // for fitting in 5 characters. This can get rather tricky when
+ // we have numbers that are very close to an order of magnitude.
+ // For example, when displaying 10239 (which is really 9.999K),
+ // we want only a single place of precision for 10.0K. We could
+ // develop some complex heuristics for this, but it's much
+ // easier just to try each combination in turn.
+ int i;
+ for (i = 2; i >= 0; i--) {
+ if (snprintf(buffer, sizeof(buffer), "%.*f%s", i,
+ static_cast<double>(v) / mult, u) <= 7)
+ break;
+ }
+ }
+
+ return out << buffer;
+ }
}
-struct si_t {
+/*
+ * Use this struct to pretty print values that should be formatted with a
+ * decimal unit prefix (the classic SI units). No actual unit will be added.
+ */
+struct si_u_t {
uint64_t v;
- // cppcheck-suppress noExplicitConstructor
- si_t(uint64_t _v) : v(_v) {}
+ explicit si_u_t(uint64_t _v) : v(_v) {};
};
-inline ostream& operator<<(ostream& out, const si_t& b)
+inline ostream& operator<<(ostream& out, const si_u_t& b)
{
- uint64_t bump_after = 100;
- if (b.v > bump_after << 60)
- return out << (b.v >> 60) << "E";
- if (b.v > bump_after << 50)
- return out << (b.v >> 50) << "P";
- if (b.v > bump_after << 40)
- return out << (b.v >> 40) << "T";
- if (b.v > bump_after << 30)
- return out << (b.v >> 30) << "G";
- if (b.v > bump_after << 20)
- return out << (b.v >> 20) << "M";
- if (b.v > bump_after << 10)
- return out << (b.v >> 10) << "k";
- return out << b.v;
+ uint64_t n = b.v;
+ int index = 0;
+ uint64_t mult = 1;
+ const char* u[] = {"", "k", "M", "G", "T", "P", "E"};
+
+ while (n >= 1000 && index < 7) {
+ n /= 1000;
+ index++;
+ mult *= 1000;
+ }
+
+ return format_u(out, b.v, n, index, mult, u[index]);
}
-struct pretty_si_t {
+/*
+ * Use this struct to pretty print values that should be formatted with a
+ * binary unit prefix (IEC units). Since binary unit prefixes are to be used for
+ * "multiples of units in data processing, data transmission, and digital
+ * information" (so bits and bytes) and so far bits are not printed, the unit
+ * "B" for "byte" is added besides the multiplier.
+ */
+struct byte_u_t {
uint64_t v;
- // cppcheck-suppress noExplicitConstructor
- pretty_si_t(uint64_t _v) : v(_v) {}
+ explicit byte_u_t(uint64_t _v) : v(_v) {};
};
-inline ostream& operator<<(ostream& out, const pretty_si_t& b)
+inline ostream& operator<<(ostream& out, const byte_u_t& b)
{
- uint64_t bump_after = 100;
- if (b.v > bump_after << 60)
- return out << (b.v >> 60) << " E";
- if (b.v > bump_after << 50)
- return out << (b.v >> 50) << " P";
- if (b.v > bump_after << 40)
- return out << (b.v >> 40) << " T";
- if (b.v > bump_after << 30)
- return out << (b.v >> 30) << " G";
- if (b.v > bump_after << 20)
- return out << (b.v >> 20) << " M";
- if (b.v > bump_after << 10)
- return out << (b.v >> 10) << " k";
- return out << b.v << " ";
-}
+ uint64_t n = b.v;
+ int index = 0;
+ const char* u[] = {"B", "KiB", "MiB", "GiB", "TiB", "PiB", "EiB"};
-struct kb_t {
- uint64_t v;
- // cppcheck-suppress noExplicitConstructor
- kb_t(uint64_t _v) : v(_v) {}
-};
+ while (n >= 1024 && index < 7) {
+ n /= 1024;
+ index++;
+ }
-inline ostream& operator<<(ostream& out, const kb_t& kb)
-{
- uint64_t bump_after = 100;
- if (kb.v > bump_after << 40)
- return out << (kb.v >> 40) << " PB";
- if (kb.v > bump_after << 30)
- return out << (kb.v >> 30) << " TB";
- if (kb.v > bump_after << 20)
- return out << (kb.v >> 20) << " GB";
- if (kb.v > bump_after << 10)
- return out << (kb.v >> 10) << " MB";
- return out << kb.v << " kB";
+ return format_u(out, b.v, n, index, 1ULL << (10 * index), u[index]);
}
inline ostream& operator<<(ostream& out, const ceph_mon_subscribe_item& i)
#include "common/Formatter.h"
#include "include/types.h"
-int64_t unit_to_bytesize(string val, ostream *pss);
-
std::string bytes2str(uint64_t count);
struct ceph_data_stats
SOVERSION "1"
INSTALL_RPATH "")
install(TARGETS cls_kvs DESTINATION ${CMAKE_INSTALL_LIBDIR}/rados-classes)
-
-if(WITH_EMBEDDED)
- add_library(cephd_cls_kvs STATIC ${kvs_srcs})
- set_target_properties(cephd_cls_kvs PROPERTIES COMPILE_DEFINITIONS BUILDING_FOR_EMBEDDED)
-endif()
{
if (key == "compaction_threads") {
std::string err;
- int f = strict_sistrtoll(val.c_str(), &err);
+ int f = strict_iecstrtoll(val.c_str(), &err);
if (!err.empty())
return -EINVAL;
//Low priority threadpool is used for compaction
opt.env->SetBackgroundThreads(f, rocksdb::Env::Priority::LOW);
} else if (key == "flusher_threads") {
std::string err;
- int f = strict_sistrtoll(val.c_str(), &err);
+ int f = strict_iecstrtoll(val.c_str(), &err);
if (!err.empty())
return -EINVAL;
//High priority threadpool is used for flusher
opt.table_factory.reset(rocksdb::NewBlockBasedTableFactory(bbt_opts));
dout(10) << __func__ << " block size " << g_conf->rocksdb_block_size
- << ", block_cache size " << prettybyte_t(block_cache_size)
- << ", row_cache size " << prettybyte_t(row_cache_size)
+ << ", block_cache size " << byte_u_t(block_cache_size)
+ << ", row_cache size " << byte_u_t(row_cache_size)
<< "; shards "
<< (1 << g_conf->rocksdb_cache_shard_bits)
<< ", type " << g_conf->rocksdb_cache_type
+++ /dev/null
-include(MergeStaticLibraries)
-
-add_library(cephd_base STATIC
- libcephd.cc
- ../ceph_mon.cc
- ../ceph_osd.cc
- ../ceph_mds.cc)
-
-set_target_properties(cephd_base PROPERTIES COMPILE_DEFINITIONS BUILDING_FOR_EMBEDDED)
-
-set(merge_libs
- cephd_base
- cephd_compressor
- cephd_ec
- cephd_cls
- cephd_cls_kvs
- cephd_rados
- common
- common_utf8
- erasure_code
- global
- json_spirit
- kv
- mds
- mon
- os
- osd
- osdc)
-
-if(NOT WITH_SYSTEM_ROCKSDB)
- list(APPEND merge_libs ${ROCKSDB_LIBRARIES})
-endif(NOT WITH_SYSTEM_ROCKSDB)
-
-if(WITH_RADOSGW)
- list(APPEND merge_libs cephd_rgw)
-endif(WITH_RADOSGW)
-
-if(WITH_RBD)
- list(APPEND merge_libs cephd_rbd)
-endif(WITH_RBD)
-
-if(HAVE_ARMV8_CRC)
- list(APPEND merge_libs common_crc_aarch64)
-endif(HAVE_ARMV8_CRC)
-
-merge_static_libraries(cephd ${merge_libs})
-
-# TODO: install these libraries and add them to rpm and deb packages
-#install(TARGETS cephd DESTINATION ${CMAKE_INSTALL_LIBDIR})
-#install(FILES ../include/cephd/libcephd.h
-# DESTINATION ${CMAKE_INSTALL_INCLUDEDIR}/cephd)
+++ /dev/null
-#include "acconfig.h"
-#include "auth/Auth.h"
-#include "auth/Crypto.h"
-#include "auth/KeyRing.h"
-#include "common/ceph_argparse.h"
-#include "common/version.h"
-#include "common/PluginRegistry.h"
-#include "compressor/snappy/CompressionPluginSnappy.h"
-#include "compressor/zlib/CompressionPluginZlib.h"
-#include "compressor/zstd/CompressionPluginZstd.h"
-#include "erasure-code/ErasureCodePlugin.h"
-#if __x86_64__ && defined(HAVE_BETTER_YASM_ELF64)
-#include "erasure-code/isa/ErasureCodePluginIsa.h"
-#endif
-#include "erasure-code/jerasure/ErasureCodePluginJerasure.h"
-#include "erasure-code/jerasure/jerasure_init.h"
-#include "erasure-code/lrc/ErasureCodePluginLrc.h"
-#include "erasure-code/shec/ErasureCodePluginShec.h"
-#include "include/cephd/libcephd.h"
-#include "global/global_context.h"
-#include "global/global_init.h"
-#include "objclass/objclass.h"
-#include "osd/OSD.h"
-#include "osd/ClassHandler.h"
-
-// forward declarations of RADOS class init functions
-CLS_INIT(cephfs);
-CLS_INIT(hello);
-CLS_INIT(journal);
-CLS_INIT(kvs);
-CLS_INIT(lock);
-CLS_INIT(log);
-CLS_INIT(lua);
-CLS_INIT(numops);
-CLS_INIT(rbd);
-CLS_INIT(refcount);
-CLS_INIT(replica_log);
-CLS_INIT(rgw);
-CLS_INIT(statelog);
-CLS_INIT(timeindex);
-CLS_INIT(user);
-CLS_INIT(version);
-
-extern "C" void cephd_version(int *pmajor, int *pminor, int *ppatch)
-{
- if (pmajor)
- *pmajor = LIBCEPHD_VER_MAJOR;
- if (pminor)
- *pminor = LIBCEPHD_VER_MINOR;
- if (ppatch)
- *ppatch = LIBCEPHD_VER_PATCH;
-}
-
-extern "C" const char *ceph_version(int *pmajor, int *pminor, int *ppatch)
-{
- int major, minor, patch;
- const char *v = ceph_version_to_str();
-
- int n = sscanf(v, "%d.%d.%d", &major, &minor, &patch);
- if (pmajor)
- *pmajor = (n >= 1) ? major : 0;
- if (pminor)
- *pminor = (n >= 2) ? minor : 0;
- if (ppatch)
- *ppatch = (n >= 3) ? patch : 0;
- return v;
-}
-
-extern "C" int cephd_generate_fsid(char *buf, size_t len)
-{
- if (len < sizeof("b06ad912-70d7-4263-a5ff-011462a5929a")) {
- return -ERANGE;
- }
-
- uuid_d fsid;
- fsid.generate_random();
- fsid.print(buf);
-
- return 0;
-}
-
-extern "C" int cephd_generate_secret_key(char *buf, size_t len)
-{
- CephInitParameters iparams(CEPH_ENTITY_TYPE_MON);
- CephContext *cct = common_preinit(iparams, CODE_ENVIRONMENT_LIBRARY, 0);
- cct->_conf->apply_changes(NULL);
- cct->init_crypto();
-
- CryptoKey key;
- key.create(cct, CEPH_CRYPTO_AES);
-
- cct->put();
-
- string keystr;
- key.encode_base64(keystr);
- if (keystr.length() >= len) {
- return -ERANGE;
- }
- strcpy(buf, keystr.c_str());
- return keystr.length();
-}
-
-// load the embedded plugins. This is safe to call multiple
-// times in the same process
-void cephd_preload_embedded_plugins()
-{
- int r;
-
- // load erasure coding plugins
- {
- ErasureCodePlugin* plugin;
- ErasureCodePluginRegistry& reg = ErasureCodePluginRegistry::instance();
- Mutex::Locker l(reg.lock);
- reg.disable_dlclose = true;
-
- // initialize jerasure (and gf-complete)
- int w[] = { 4, 8, 16, 32 };
- r = jerasure_init(4, w);
- assert(r == 0);
-
- plugin = new ErasureCodePluginJerasure();
- r = reg.add("jerasure", plugin);
- if (r == -EEXIST) {
- delete plugin;
- }
- assert(r == 0);
-
- plugin = new ErasureCodePluginLrc();
- r = reg.add("lrc", plugin);
- if (r == -EEXIST) {
- delete plugin;
- }
- assert(r == 0);
-
- plugin = new ErasureCodePluginShec();
- r = reg.add("shec", plugin);
- if (r == -EEXIST) {
- delete plugin;
- }
- assert(r == 0);
-
-#if __x86_64__ && defined(HAVE_BETTER_YASM_ELF64)
- plugin = new ErasureCodePluginIsa();
- r = reg.add("isa", plugin);
- if (r == -EEXIST) {
- delete plugin;
- }
- assert(r == 0);
-#endif
- }
-
- // now load the compression plugins
- {
- Plugin *plugin;
- PluginRegistry *reg = g_ceph_context->get_plugin_registry();
- Mutex::Locker l(reg->lock);
- reg->disable_dlclose = true;
-
- plugin = new CompressionPluginSnappy(g_ceph_context);
- r = reg->add("compressor", "snappy", plugin);
- if (r == -EEXIST) {
- delete plugin;
- }
- assert(r == 0);
-
- plugin = new CompressionPluginZlib(g_ceph_context);
- r = reg->add("compressor", "zlib", plugin);
- if (r == -EEXIST) {
- delete plugin;
- }
- assert(r == 0);
-
- plugin = new CompressionPluginZstd(g_ceph_context);
- r = reg->add("compressor", "zstd", plugin);
- if (r == -EEXIST) {
- delete plugin;
- }
- assert(r == 0);
- }
-}
-
-void cephd_preload_rados_classes(OSD *osd)
-{
- // intialize RADOS classes
- {
- ClassHandler *class_handler = osd->class_handler;
- Mutex::Locker l(class_handler->mutex);
-
-#ifdef WITH_CEPHFS
- class_handler->add_embedded_class("cephfs");
- cephfs_cls_init();
-#endif
- class_handler->add_embedded_class("hello");
- hello_cls_init();
- class_handler->add_embedded_class("journal");
- journal_cls_init();
-#ifdef WITH_KVS
- class_handler->add_embedded_class("kvs");
- kvs_cls_init();
-#endif
- class_handler->add_embedded_class("lock");
- lock_cls_init();
- class_handler->add_embedded_class("log");
- log_cls_init();
- class_handler->add_embedded_class("lua");
- lua_cls_init();
- class_handler->add_embedded_class("numops");
- numops_cls_init();
-#ifdef WITH_RBD
- class_handler->add_embedded_class("rbd");
- rbd_cls_init();
-#endif
- class_handler->add_embedded_class("refcount");
- refcount_cls_init();
- class_handler->add_embedded_class("replica_log");
- replica_log_cls_init();
-#ifdef WITH_RADOSGW
- class_handler->add_embedded_class("rgw");
- rgw_cls_init();
-#endif
- class_handler->add_embedded_class("statelog");
- statelog_cls_init();
- class_handler->add_embedded_class("timeindex");
- timeindex_cls_init();
- class_handler->add_embedded_class("user");
- user_cls_init();
- class_handler->add_embedded_class("version");
- version_cls_init();
- }
-}
-
-extern "C" int cephd_mon(int argc, const char **argv);
-extern "C" int cephd_osd(int argc, const char **argv);
-extern "C" int cephd_mds(int argc, const char **argv);
-extern "C" int cephd_rgw(int argc, const char **argv);
-extern "C" int cephd_rgw_admin(int argc, const char **argv);
-
-int cephd_run_mon(int argc, const char **argv)
-{
- return cephd_mon(argc, argv);
-}
-
-int cephd_run_osd(int argc, const char **argv)
-{
- return cephd_osd(argc, argv);
-}
-
-int cephd_run_mds(int argc, const char **argv)
-{
- return cephd_mds(argc, argv);
-}
-
-
-int cephd_run_rgw(int argc, const char **argv)
-{
- return cephd_rgw(argc, argv);
-}
-
-int cephd_run_rgw_admin(int argc, const char **argv)
-{
- return cephd_rgw_admin(argc, argv);
-}
struct inodeno_t ino,
Inode **inode)
{
- int r = (cmount->get_client())->lookup_ino(ino, cmount->default_perms, inode);
- if (r) {
- return r;
- }
-
- assert(inode != NULL);
- assert(*inode != NULL);
-
- // Request the parent inode, so that we can look up the name
- Inode *parent;
- r = (cmount->get_client())->lookup_parent(*inode, cmount->default_perms, &parent);
- if (r && r != -EINVAL) {
- // Unexpected error
- (cmount->get_client())->ll_forget(*inode, 1);
- return r;
- } else if (r == -EINVAL) {
- // EINVAL indicates node without parents (root), drop out now
- // and don't try to look up the non-existent dentry.
- return 0;
- }
- // FIXME: I don't think this works; lookup_parent() returns 0 if the parent
- // is already in cache
- assert(parent != NULL);
-
- // Finally, get the name (dentry) of the requested inode
- r = (cmount->get_client())->lookup_name(*inode, parent, cmount->default_perms);
- if (r) {
- // Unexpected error
- (cmount->get_client())->ll_forget(parent, 1);
- (cmount->get_client())->ll_forget(*inode, 1);
- return r;
- }
-
- (cmount->get_client())->ll_forget(parent, 1);
- return 0;
+ return (cmount->get_client())->ll_lookup_inode(ino, cmount->default_perms, inode);
}
extern "C" int ceph_ll_lookup(struct ceph_mount_info *cmount,
endif(ENABLE_SHARED)
install(TARGETS librados DESTINATION ${CMAKE_INSTALL_LIBDIR})
-if(WITH_EMBEDDED)
- add_library(cephd_rados STATIC
- $<TARGET_OBJECTS:librados_api_obj>
- $<TARGET_OBJECTS:librados_objs>)
-endif()
if(WITH_LTTNG AND WITH_EVENTTRACE)
add_dependencies(librados_api_obj eventtrace_tp)
endif()
c->cond.Signal();
if (r == 0 && c->blp && c->blp->length() > 0) {
- if (c->out_buf && !c->blp->is_provided_buffer(c->out_buf))
- c->blp->copy(0, c->blp->length(), c->out_buf);
- c->rval = c->blp->length();
+ if (c->out_buf && !c->blp->is_contiguous()) {
+ c->rval = -ERANGE;
+ } else {
+ c->rval = c->blp->length();
+ }
}
if (c->callback_complete ||
LINK_FLAGS "-Wl,--exclude-libs,ALL")
endif(ENABLE_SHARED)
install(TARGETS librbd DESTINATION ${CMAKE_INSTALL_LIBDIR})
-
-if(WITH_EMBEDDED)
- add_library(cephd_rbd_base STATIC librbd.cc ${CMAKE_SOURCE_DIR}/src/common/ContextCompletion.cc)
- merge_static_libraries(cephd_rbd cephd_rbd_base rbd_internal rbd_types journal)
-endif()
plb.add_u64_counter(l_librbd_rd, "rd", "Reads", "r", perf_prio);
plb.add_u64_counter(l_librbd_rd_bytes, "rd_bytes", "Data size in reads",
- "rb", perf_prio);
+ "rb", perf_prio, unit_t(BYTES));
plb.add_time_avg(l_librbd_rd_latency, "rd_latency", "Latency of reads",
"rl", perf_prio);
plb.add_u64_counter(l_librbd_wr, "wr", "Writes", "w", perf_prio);
plb.add_u64_counter(l_librbd_wr_bytes, "wr_bytes", "Written data",
- "wb", perf_prio);
+ "wb", perf_prio, unit_t(BYTES));
plb.add_time_avg(l_librbd_wr_latency, "wr_latency", "Write latency",
"wl", perf_prio);
plb.add_u64_counter(l_librbd_discard, "discard", "Discards");
- plb.add_u64_counter(l_librbd_discard_bytes, "discard_bytes", "Discarded data");
+ plb.add_u64_counter(l_librbd_discard_bytes, "discard_bytes", "Discarded data", NULL, 0, unit_t(BYTES));
plb.add_time_avg(l_librbd_discard_latency, "discard_latency", "Discard latency");
plb.add_u64_counter(l_librbd_flush, "flush", "Flushes");
plb.add_u64_counter(l_librbd_aio_flush, "aio_flush", "Async flushes");
plb.add_time_avg(l_librbd_aio_flush_latency, "aio_flush_latency", "Latency of async flushes");
plb.add_u64_counter(l_librbd_ws, "ws", "WriteSames");
- plb.add_u64_counter(l_librbd_ws_bytes, "ws_bytes", "WriteSame data");
+ plb.add_u64_counter(l_librbd_ws_bytes, "ws_bytes", "WriteSame data", NULL, 0, unit_t(BYTES));
plb.add_time_avg(l_librbd_ws_latency, "ws_latency", "WriteSame latency");
plb.add_u64_counter(l_librbd_cmp, "cmp", "CompareAndWrites");
- plb.add_u64_counter(l_librbd_cmp_bytes, "cmp_bytes", "Data size in cmps");
+ plb.add_u64_counter(l_librbd_cmp_bytes, "cmp_bytes", "Data size in cmps", NULL, 0, unit_t(BYTES));
plb.add_time_avg(l_librbd_cmp_latency, "cmp_latency", "Latency of cmps");
plb.add_u64_counter(l_librbd_snap_create, "snap_create", "Snap creations");
plb.add_u64_counter(l_librbd_snap_remove, "snap_remove", "Snap removals");
plb.add_u64_counter(l_librbd_notify, "notify", "Updated header notifications");
plb.add_u64_counter(l_librbd_resize, "resize", "Resizes");
plb.add_u64_counter(l_librbd_readahead, "readahead", "Read ahead");
- plb.add_u64_counter(l_librbd_readahead_bytes, "readahead_bytes", "Data size in read ahead");
+ plb.add_u64_counter(l_librbd_readahead_bytes, "readahead_bytes", "Data size in read ahead", NULL, 0, unit_t(BYTES));
plb.add_u64_counter(l_librbd_invalidate_cache, "invalidate_cache", "Cache invalidates");
plb.add_time(l_librbd_opened_time, "opened_time", "Opened time",
#include "librbd/ObjectMap.h"
#include "librbd/ExclusiveLock.h"
#include "librbd/MirroringWatcher.h"
+#include "librbd/journal/DisabledPolicy.h"
#include "librbd/journal/RemoveRequest.h"
#include "librbd/image/RemoveRequest.h"
#include "librbd/operation/TrimRequest.h"
void RemoveRequest<I>::acquire_exclusive_lock() {
ldout(m_cct, 20) << dendl;
+ // do not attempt to open the journal when removing the image in case
+ // it's corrupt
+ if (m_image_ctx->test_features(RBD_FEATURE_JOURNALING)) {
+ RWLock::WLocker snap_locker(m_image_ctx->snap_lock);
+ m_image_ctx->set_journal_policy(new journal::DisabledPolicy());
+ }
+
using klass = RemoveRequest<I>;
if (m_force) {
Context *ctx = create_context_callback<
Context *ctx = create_context_callback<
klass, &klass::handle_invalidate>(this);
InvalidateRequest<I> *req = InvalidateRequest<I>::create(
- m_image_ctx, m_snap_id, false, ctx);
+ m_image_ctx, m_snap_id, true, ctx);
RWLock::RLocker owner_locker(m_image_ctx.owner_lock);
RWLock::WLocker snap_locker(m_image_ctx.snap_lock);
Context *ctx = create_context_callback<
klass, &klass::handle_resize_invalidate>(this);
InvalidateRequest<I> *req = InvalidateRequest<I>::create(
- m_image_ctx, m_snap_id, false, ctx);
+ m_image_ctx, m_snap_id, true, ctx);
RWLock::RLocker owner_locker(m_image_ctx.owner_lock);
RWLock::WLocker snap_locker(m_image_ctx.snap_lock);
if (mdsmap->get_epoch() != epoch) {
epoch = mdsmap->get_epoch();
- compat = get_mdsmap_compat_set_default();
+ compat = MDSMap::get_compat_set_default();
compat.merge(mdsmap->compat);
}
}
CDir *f = new CDir(inode, *p, cache, is_auth());
f->state_set(state & (MASK_STATE_FRAGMENT_KEPT | STATE_COMPLETE));
f->get_replicas() = get_replicas();
- f->dir_auth = dir_auth;
- f->init_fragment_pins();
f->set_version(get_version());
-
f->pop_me = pop_me;
f->pop_me.scale(fac);
f->set_dir_auth(get_dir_auth());
f->prepare_new_fragment(replay);
+ f->init_fragment_pins();
}
// repartition dentries
// new subtree root?
if (!was_subtree && is_subtree_root()) {
dout(10) << " new subtree root, adjusting auth_pins" << dendl;
-
+
+ inode->num_subtree_roots++;
+
// adjust nested auth pins
if (get_cum_auth_pins())
inode->adjust_nested_auth_pins(-1, NULL);
}
if (was_subtree && !is_subtree_root()) {
dout(10) << " old subtree root, adjusting auth_pins" << dendl;
-
+
+ inode->num_subtree_roots--;
+
// adjust nested auth pins
if (get_cum_auth_pins())
inode->adjust_nested_auth_pins(1, NULL);
void assimilate_dirty_rstat_inodes();
void assimilate_dirty_rstat_inodes_finish(MutationRef& mut, EMetaBlob *blob);
+ void mark_exporting() {
+ state_set(CDir::STATE_EXPORTING);
+ inode->num_exporting_dirs++;
+ }
+ void clear_exporting() {
+ state_clear(CDir::STATE_EXPORTING);
+ inode->num_exporting_dirs--;
+ }
+
protected:
version_t projected_version;
mempool::mds_co::list<fnode_t> projected_fnode;
dir->state_clear(CDir::STATE_STICKY);
dir->put(CDir::PIN_STICKY);
}
+
+ if (dir->is_subtree_root())
+ num_subtree_roots--;
// dump any remaining dentries, for debugging purposes
for (const auto &p : dir->items)
bool CInode::has_subtree_root_dirfrag(int auth)
{
- for (const auto &p : dirfrags) {
- if (p.second->is_subtree_root() &&
- (auth == -1 || p.second->dir_auth.first == auth))
+ if (num_subtree_roots > 0) {
+ if (auth == -1)
return true;
+ for (const auto &p : dirfrags) {
+ if (p.second->is_subtree_root() &&
+ p.second->dir_auth.first == auth)
+ return true;
+ }
}
return false;
}
bool CInode::has_subtree_or_exporting_dirfrag()
{
- for (const auto &p : dirfrags) {
- if (p.second->is_subtree_root() ||
- p.second->state_test(CDir::STATE_EXPORTING))
- return true;
- }
+ if (num_subtree_roots > 0 || num_exporting_dirs > 0)
+ return true;
return false;
}
// -- cache infrastructure --
private:
mempool::mds_co::compact_map<frag_t,CDir*> dirfrags; // cached dir fragments under this Inode
+
+  // For quickly determining whether this inode has a subtree root or an exporting dirfrag
+ int num_subtree_roots = 0;
+ int num_exporting_dirs = 0;
+
int stickydir_ref = 0;
scrub_info_t *scrub_infop = nullptr;
clear_file_locks();
assert(num_projected_xattrs == 0);
assert(num_projected_srnodes == 0);
+ assert(num_subtree_roots == 0);
+ assert(num_exporting_dirs == 0);
}
if (ev >= 3)
::decode(legacy_mds_map.compat, p);
else
- legacy_mds_map.compat = get_mdsmap_compat_set_base();
+ legacy_mds_map.compat = MDSMap::get_compat_set_base();
if (ev < 5) {
__u32 n;
::decode(n, p);
*/
class Filesystem
{
- public:
- fs_cluster_id_t fscid;
- MDSMap mds_map;
-
+public:
void encode(bufferlist& bl, uint64_t features) const;
void decode(bufferlist::iterator& p);
- Filesystem()
- :
- fscid(FS_CLUSTER_ID_NONE)
- {
- }
-
void dump(Formatter *f) const;
void print(std::ostream& out) const;
return false;
}
+
+ fs_cluster_id_t fscid = FS_CLUSTER_ID_NONE;
+ MDSMap mds_map;
};
WRITE_CLASS_ENCODER_FEATURES(Filesystem)
class FSMap {
protected:
- epoch_t epoch;
- uint64_t next_filesystem_id;
- fs_cluster_id_t legacy_client_fscid;
+ epoch_t epoch = 0;
+ uint64_t next_filesystem_id = FS_CLUSTER_ID_ANONYMOUS + 1;
+ fs_cluster_id_t legacy_client_fscid = FS_CLUSTER_ID_NONE;
CompatSet compat;
- bool enable_multiple;
- bool ever_enabled_multiple; // < the cluster had multiple MDSes enabled once
+ bool enable_multiple = false;
+ bool ever_enabled_multiple = false; // < the cluster had multiple MDSes enabled once
std::map<fs_cluster_id_t, std::shared_ptr<Filesystem> > filesystems;
friend class MDSMonitor;
friend class PaxosFSMap;
- FSMap()
- : epoch(0),
- next_filesystem_id(FS_CLUSTER_ID_ANONYMOUS + 1),
- legacy_client_fscid(FS_CLUSTER_ID_NONE),
- compat(get_mdsmap_compat_set_default()),
- enable_multiple(false), ever_enabled_multiple(false)
- { }
+ FSMap() : compat(MDSMap::get_compat_set_default()) {}
FSMap(const FSMap &rhs)
:
size_t filesystem_count() const {return filesystems.size();}
bool filesystem_exists(fs_cluster_id_t fscid) const {return filesystems.count(fscid) > 0;}
+ const std::shared_ptr<Filesystem> &get_filesystem(fs_cluster_id_t fscid) {return filesystems.at(fscid);}
std::shared_ptr<const Filesystem> get_filesystem(fs_cluster_id_t fscid) const {return std::const_pointer_cast<const Filesystem>(filesystems.at(fscid));}
std::shared_ptr<const Filesystem> get_filesystem(void) const {return std::const_pointer_cast<const Filesystem>(filesystems.begin()->second);}
std::shared_ptr<const Filesystem> get_filesystem(boost::string_view name) const
mdcache->wait_replay_cap_reconnect(m->get_ino(), new C_MDS_RetryMessage(mds, m));
return;
}
- dout(1) << "handle_client_caps on unknown ino " << m->get_ino() << ", dropping" << dendl;
+
+  /*
+   * "handle_client_caps on unknown ino xxx" is normal after migrating a subtree
+   * The sequence of events that causes this is:
+   * - client sends caps message to mds.a
+   * - mds finishes subtree migration, sends cap export to client
+   * - mds trims its cache
+   * - mds receives cap messages from client
+   */
+ dout(7) << "handle_client_caps on unknown ino " << m->get_ino() << ", dropping" << dendl;
m->put();
return;
}
}
assert(subtrees.empty());
- if (myin)
+ if (myin) {
remove_inode(myin);
-
+ assert(!myin);
+ }
+
// done!
dout(2) << "shutdown done." << dendl;
return true;
{
set<mds_rank_t> all, active;
mds->mdsmap->get_mds_set(all);
- mds->mdsmap->get_clientreplay_or_active_or_stopping_mds_set(active);
if (mds->get_state() == MDSMap::STATE_REJOIN)
- mds->mdsmap->get_mds_set(active, MDSMap::STATE_REJOIN);
+ mds->mdsmap->get_mds_set_lower_bound(active, MDSMap::STATE_REJOIN);
+ else
+ mds->mdsmap->get_mds_set_lower_bound(active, MDSMap::STATE_CLIENTREPLAY);
dout(10) << "do_open_ino_peer " << ino << " active " << active
<< " all " << all << " checked " << info.checked << dendl;
{
set<mds_rank_t> all, active;
mds->mdsmap->get_mds_set(all);
- mds->mdsmap->get_clientreplay_or_active_or_stopping_mds_set(active);
+ mds->mdsmap->get_mds_set_lower_bound(active, MDSMap::STATE_CLIENTREPLAY);
dout(10) << "_do_find_ino_peer " << fip.tid << " " << fip.ino
<< " active " << active << " all " << all
}
show_subtrees(10);
-
- // dir has no PIN_SUBTREE; CDir::purge_stolen() drops it.
- dir->dir_auth = CDIR_AUTH_DEFAULT;
}
diri->close_dirfrag(dir->get_frag());
mdr->internal_op_finish = cs;
enqueue_scrub_work(mdr);
+
+ // since recursive scrub is asynchronous, dump minimal output
+ // to not upset cli tools.
+ if (recursive) {
+ f->open_object_section("results");
+ f->close_section(); // results
+ }
}
void MDCache::enqueue_scrub_work(MDRequestRef& mdr)
"mds_max_purge_ops_per_pg",
"mds_max_purge_files",
"mds_inject_migrator_session_race",
+ "mds_inject_migrator_message_loss",
"clog_to_graylog",
"clog_to_graylog_host",
"clog_to_graylog_port",
void MDSDaemon::handle_mds_map(MMDSMap *m)
{
version_t epoch = m->get_epoch();
- dout(5) << "handle_mds_map epoch " << epoch << " from " << m->get_source() << dendl;
// is it new?
if (epoch <= mdsmap->get_epoch()) {
- dout(5) << " old map epoch " << epoch << " <= " << mdsmap->get_epoch()
- << ", discarding" << dendl;
+ dout(5) << "handle_mds_map old map epoch " << epoch << " <= "
+ << mdsmap->get_epoch() << ", discarding" << dendl;
m->put();
return;
}
+ dout(1) << "Updating MDS map to version " << epoch << " from " << m->get_source() << dendl;
+
entity_addr_t addr;
// keep old map, for a moment
mds_rank_t whoami = mdsmap->get_rank_gid(mds_gid_t(monc->get_global_id()));
// verify compatset
- CompatSet mdsmap_compat(get_mdsmap_compat_set_all());
+ CompatSet mdsmap_compat(MDSMap::get_compat_set_all());
dout(10) << " my compat " << mdsmap_compat << dendl;
dout(10) << " mdsmap compat " << mdsmap->compat << dendl;
if (!mdsmap_compat.writeable(mdsmap->compat)) {
if (mds_gid_t existing = mdsmap->find_mds_gid_by_name(name)) {
const MDSMap::mds_info_t& i = mdsmap->get_info_gid(existing);
if (i.global_id > myid) {
- dout(1) << "map replaced me with another mds." << whoami
+ dout(1) << "Map replaced me with another mds." << whoami
<< " with gid (" << i.global_id << ") larger than myself ("
<< myid << "); quitting!" << dendl;
// Call suicide() rather than respawn() because if someone else
}
}
- dout(1) << "map removed me (mds." << whoami << " gid:"
+ dout(1) << "Map removed me (mds." << whoami << " gid:"
<< myid << ") from cluster due to lost contact; respawning" << dendl;
respawn();
}
// Normal rankless case, we're marked as standby
if (new_state == MDSMap::STATE_STANDBY) {
beacon.set_want_state(mdsmap, new_state);
- dout(1) << "handle_mds_map standby" << dendl;
+ dout(1) << "Map has assigned me to become a standby" << dendl;
return;
}
assert(stopping == false);
stopping = true;
- dout(1) << "suicide. wanted state "
+ dout(1) << "suicide! Wanted state "
<< ceph_mds_state_name(beacon.get_want_state()) << dendl;
if (tick_event) {
void MDSDaemon::respawn()
{
- dout(1) << "respawn" << dendl;
+ dout(1) << "respawn!" << dendl;
+
+ /* Dump recent in case the MDS was stuck doing something which caused it to
+ * be removed from the MDSMap leading to respawn. */
+ g_ceph_context->_log->dump_recent();
char *new_argv[orig_argc+1];
dout(1) << " e: '" << orig_argv[0] << "'" << dendl;
#include "MDSMap.h"
#include "MDSRank.h"
-#define CEPH_MDS_PROTOCOL 30 /* cluster internal */
+#define CEPH_MDS_PROTOCOL 31 /* cluster internal */
class AuthAuthorizeHandlerRegistry;
class Message;
#define dout_subsys ceph_subsys_
// features
-CompatSet get_mdsmap_compat_set_all() {
+CompatSet MDSMap::get_compat_set_all() {
CompatSet::FeatureSet feature_compat;
CompatSet::FeatureSet feature_ro_compat;
CompatSet::FeatureSet feature_incompat;
return CompatSet(feature_compat, feature_ro_compat, feature_incompat);
}
-CompatSet get_mdsmap_compat_set_default() {
+CompatSet MDSMap::get_compat_set_default() {
CompatSet::FeatureSet feature_compat;
CompatSet::FeatureSet feature_ro_compat;
CompatSet::FeatureSet feature_incompat;
}
// base (pre v0.20)
-CompatSet get_mdsmap_compat_set_base() {
+CompatSet MDSMap::get_compat_set_base() {
CompatSet::FeatureSet feature_compat_base;
CompatSet::FeatureSet feature_incompat_base;
feature_incompat_base.insert(MDS_FEATURE_INCOMPAT_BASE);
m->data_pools.push_back(0);
m->metadata_pool = 1;
m->cas_pool = 2;
- m->compat = get_mdsmap_compat_set_all();
+ m->compat = get_compat_set_all();
// these aren't the defaults, just in case anybody gets confused
m->session_timeout = 61;
if (ev >= 3)
::decode(compat, p);
else
- compat = get_mdsmap_compat_set_base();
+ compat = get_compat_set_base();
if (ev < 5) {
__u32 n;
::decode(n, p);
state_valid = false;
}
} else if (prev == MDSMap::STATE_REJOIN) {
- if (next != MDSMap::STATE_ACTIVE
- && next != MDSMap::STATE_CLIENTREPLAY
- && next != MDSMap::STATE_STOPPED) {
+ if (next != MDSMap::STATE_ACTIVE &&
+ next != MDSMap::STATE_CLIENTREPLAY &&
+ next != MDSMap::STATE_STOPPED) {
state_valid = false;
}
- } else if (prev >= MDSMap::STATE_RECONNECT && prev < MDSMap::STATE_ACTIVE) {
+ } else if (prev >= MDSMap::STATE_RESOLVE && prev < MDSMap::STATE_ACTIVE) {
// Once I have entered replay, the only allowable transitions are to
// the next next along in the sequence.
if (next != prev + 1) {
class CephContext;
class health_check_map_t;
-extern CompatSet get_mdsmap_compat_set_all();
-extern CompatSet get_mdsmap_compat_set_default();
-extern CompatSet get_mdsmap_compat_set_base(); // pre v0.20
-
#define MDS_FEATURE_INCOMPAT_BASE CompatSet::Feature(1, "base v0.20")
#define MDS_FEATURE_INCOMPAT_CLIENTRANGES CompatSet::Feature(2, "client writeable ranges")
#define MDS_FEATURE_INCOMPAT_FILELAYOUT CompatSet::Feature(3, "default file layouts on dirs")
void encode_unversioned(bufferlist& bl) const;
};
+ static CompatSet get_compat_set_all();
+ static CompatSet get_compat_set_default();
+ static CompatSet get_compat_set_base(); // pre v0.20
protected:
// base map
- epoch_t epoch;
- bool enabled;
- std::string fs_name;
- uint32_t flags; // flags
- epoch_t last_failure; // mds epoch of last failure
- epoch_t last_failure_osd_epoch; // osd epoch of last failure; any mds entering replay needs
+ epoch_t epoch = 0;
+ bool enabled = false;
+ std::string fs_name = MDS_FS_NAME_DEFAULT;
+ uint32_t flags = CEPH_MDSMAP_DEFAULTS; // flags
+ epoch_t last_failure = 0; // mds epoch of last failure
+ epoch_t last_failure_osd_epoch = 0; // osd epoch of last failure; any mds entering replay needs
// at least this osdmap to ensure the blacklist propagates.
- utime_t created, modified;
+ utime_t created;
+ utime_t modified;
- mds_rank_t tableserver; // which MDS has snaptable
- mds_rank_t root; // which MDS has root directory
+ mds_rank_t tableserver = 0; // which MDS has snaptable
+ mds_rank_t root = 0; // which MDS has root directory
- __u32 session_timeout;
- __u32 session_autoclose;
- uint64_t max_file_size;
+ __u32 session_timeout = 60;
+ __u32 session_autoclose = 300;
+ uint64_t max_file_size = 1ULL<<40; /* 1TB */
std::vector<int64_t> data_pools; // file data pools available to clients (via an ioctl). first is the default.
- int64_t cas_pool; // where CAS objects go
- int64_t metadata_pool; // where fs metadata objects go
+ int64_t cas_pool = -1; // where CAS objects go
+ int64_t metadata_pool = -1; // where fs metadata objects go
/*
* in: the set of logical mds #'s that define the cluster. this is the set
* @up + @failed = @in. @in * @stopped = {}.
*/
- mds_rank_t max_mds; /* The maximum number of active MDSes. Also, the maximum rank. */
- mds_rank_t standby_count_wanted;
+ mds_rank_t max_mds = 1; /* The maximum number of active MDSes. Also, the maximum rank. */
+ mds_rank_t standby_count_wanted = -1;
string balancer; /* The name/version of the mantle balancer (i.e. the rados obj name) */
std::set<mds_rank_t> in; // currently defined cluster
std::map<mds_rank_t, mds_gid_t> up; // who is in those roles
std::map<mds_gid_t, mds_info_t> mds_info;
- uint8_t ever_allowed_features; //< bitmap of features the cluster has allowed
- uint8_t explicitly_allowed_features; //< bitmap of features explicitly enabled
+ uint8_t ever_allowed_features = 0; //< bitmap of features the cluster has allowed
+ uint8_t explicitly_allowed_features = 0; //< bitmap of features explicitly enabled
- bool inline_data_enabled;
+ bool inline_data_enabled = false;
- uint64_t cached_up_features;
+ uint64_t cached_up_features = 0;
public:
CompatSet compat;
friend class FSMap;
public:
- MDSMap()
- : epoch(0), enabled(false), fs_name(MDS_FS_NAME_DEFAULT),
- flags(CEPH_MDSMAP_DEFAULTS), last_failure(0),
- last_failure_osd_epoch(0),
- tableserver(0), root(0),
- session_timeout(0),
- session_autoclose(0),
- max_file_size(0),
- cas_pool(-1),
- metadata_pool(-1),
- max_mds(0),
- standby_count_wanted(-1),
- ever_allowed_features(0),
- explicitly_allowed_features(0),
- inline_data_enabled(false),
- cached_up_features(0)
- { }
-
bool get_inline_data_enabled() const { return inline_data_enabled; }
void set_inline_data_enabled(bool enabled) { inline_data_enabled = enabled; }
s.insert(p.second.rank);
}
- void
- get_clientreplay_or_active_or_stopping_mds_set(std::set<mds_rank_t>& s) const {
+ void get_mds_set_lower_bound(std::set<mds_rank_t>& s, DaemonState first) const {
for (std::map<mds_gid_t, mds_info_t>::const_iterator p = mds_info.begin();
p != mds_info.end();
++p)
- if (p->second.state >= STATE_CLIENTREPLAY && p->second.state <= STATE_STOPPING)
+ if (p->second.state >= first && p->second.state <= STATE_STOPPING)
s.insert(p->second.rank);
}
void get_mds_set(std::set<mds_rank_t>& s, DaemonState state) const {
// kick snaptable (resent AGREEs)
if (mdsmap->get_tableserver() == whoami) {
set<mds_rank_t> active;
- mdsmap->get_clientreplay_or_active_or_stopping_mds_set(active);
+ mdsmap->get_mds_set_lower_bound(active, MDSMap::STATE_CLIENTREPLAY);
snapserver->finish_recovery(active);
}
if (g_conf->mds_dump_cache_on_map)
mdcache->dump_cache();
+ cluster_degraded = mdsmap->is_degraded();
+
+ // mdsmap and oldmap can be discontinuous. failover might happen in the missing mdsmap.
+ // the 'restart' set tracks ranks that have restarted since the old mdsmap
+ set<mds_rank_t> restart;
+ // replaying mds does not communicate with other ranks
+ if (state >= MDSMap::STATE_RESOLVE) {
+ // did someone fail?
+ // new down?
+ set<mds_rank_t> olddown, down;
+ oldmap->get_down_mds_set(&olddown);
+ mdsmap->get_down_mds_set(&down);
+ for (const auto& r : down) {
+ if (oldmap->have_inst(r) && olddown.count(r) == 0) {
+ messenger->mark_down(oldmap->get_inst(r).addr);
+ handle_mds_failure(r);
+ }
+ }
+
+ // did someone fail?
+ // did their addr/inst change?
+ set<mds_rank_t> up;
+ mdsmap->get_up_mds_set(up);
+ for (const auto& r : up) {
+ auto& info = mdsmap->get_info(r);
+ if (oldmap->have_inst(r)) {
+ auto& oldinfo = oldmap->get_info(r);
+ if (info.inc != oldinfo.inc) {
+ messenger->mark_down(oldinfo.addr);
+ if (info.state == MDSMap::STATE_REPLAY ||
+ info.state == MDSMap::STATE_RESOLVE) {
+ restart.insert(r);
+ handle_mds_failure(r);
+ } else {
+ assert(info.state == MDSMap::STATE_STARTING ||
+ info.state == MDSMap::STATE_ACTIVE);
+ // -> stopped (missing) -> starting -> active
+ restart.insert(r);
+ mdcache->migrator->handle_mds_failure_or_stop(r);
+ }
+ }
+ } else {
+ if (info.state == MDSMap::STATE_REPLAY ||
+ info.state == MDSMap::STATE_RESOLVE) {
+ // -> starting/creating (missing) -> active (missing) -> replay -> resolve
+ restart.insert(r);
+ handle_mds_failure(r);
+ } else {
+ assert(info.state == MDSMap::STATE_CREATING ||
+ info.state == MDSMap::STATE_STARTING ||
+ info.state == MDSMap::STATE_ACTIVE);
+ }
+ }
+ }
+ }
+
// did it change?
if (oldstate != state) {
dout(1) << "handle_mds_map state change "
// RESOLVE
// is someone else newly resolving?
- if (is_resolve() || is_reconnect() || is_rejoin() ||
- is_clientreplay() || is_active() || is_stopping()) {
- if (!oldmap->is_resolving() && mdsmap->is_resolving()) {
+ if (state >= MDSMap::STATE_RESOLVE) {
+ if ((!oldmap->is_resolving() || !restart.empty()) && mdsmap->is_resolving()) {
set<mds_rank_t> resolve;
mdsmap->get_mds_set(resolve, MDSMap::STATE_RESOLVE);
dout(10) << " resolve set is " << resolve << dendl;
// REJOIN
// is everybody finally rejoining?
- if (is_rejoin() || is_clientreplay() || is_active() || is_stopping()) {
+ if (state >= MDSMap::STATE_REJOIN) {
// did we start?
if (!oldmap->is_rejoining() && mdsmap->is_rejoining())
rejoin_joint_start();
oldstate == MDSMap::STATE_STARTING) {
// ACTIVE|CLIENTREPLAY|REJOIN => we can discover from them.
set<mds_rank_t> olddis, dis;
- oldmap->get_mds_set(olddis, MDSMap::STATE_ACTIVE);
- oldmap->get_mds_set(olddis, MDSMap::STATE_CLIENTREPLAY);
- oldmap->get_mds_set(olddis, MDSMap::STATE_REJOIN);
- mdsmap->get_mds_set(dis, MDSMap::STATE_ACTIVE);
- mdsmap->get_mds_set(dis, MDSMap::STATE_CLIENTREPLAY);
- mdsmap->get_mds_set(dis, MDSMap::STATE_REJOIN);
- for (set<mds_rank_t>::iterator p = dis.begin(); p != dis.end(); ++p)
- if (*p != whoami && // not me
- olddis.count(*p) == 0) { // newly so?
- mdcache->kick_discovers(*p);
- mdcache->kick_open_ino_peers(*p);
+ oldmap->get_mds_set_lower_bound(olddis, MDSMap::STATE_REJOIN);
+ mdsmap->get_mds_set_lower_bound(dis, MDSMap::STATE_REJOIN);
+ for (const auto& r : dis) {
+ if (r == whoami)
+ continue; // not me
+ if (!olddis.count(r) || restart.count(r)) { // newly so?
+ mdcache->kick_discovers(r);
+ mdcache->kick_open_ino_peers(r);
}
+ }
}
}
- cluster_degraded = mdsmap->is_degraded();
if (oldmap->is_degraded() && !cluster_degraded && state >= MDSMap::STATE_ACTIVE) {
dout(1) << "cluster recovered." << dendl;
auto it = waiting_for_active_peer.find(MDS_RANK_NONE);
}
// did someone go active?
- if (oldstate >= MDSMap::STATE_CLIENTREPLAY &&
- (is_clientreplay() || is_active() || is_stopping())) {
+ if (state >= MDSMap::STATE_CLIENTREPLAY &&
+ oldstate >= MDSMap::STATE_CLIENTREPLAY) {
set<mds_rank_t> oldactive, active;
- oldmap->get_mds_set(oldactive, MDSMap::STATE_ACTIVE);
- oldmap->get_mds_set(oldactive, MDSMap::STATE_CLIENTREPLAY);
- mdsmap->get_mds_set(active, MDSMap::STATE_ACTIVE);
- mdsmap->get_mds_set(active, MDSMap::STATE_CLIENTREPLAY);
- for (set<mds_rank_t>::iterator p = active.begin(); p != active.end(); ++p)
- if (*p != whoami && // not me
- oldactive.count(*p) == 0) // newly so?
- handle_mds_recovery(*p);
- }
-
- // did someone fail?
- // new down?
- {
- set<mds_rank_t> olddown, down;
- oldmap->get_down_mds_set(&olddown);
- mdsmap->get_down_mds_set(&down);
- for (set<mds_rank_t>::iterator p = down.begin(); p != down.end(); ++p) {
- if (oldmap->have_inst(*p) && olddown.count(*p) == 0) {
- messenger->mark_down(oldmap->get_inst(*p).addr);
- handle_mds_failure(*p);
- }
+ oldmap->get_mds_set_lower_bound(oldactive, MDSMap::STATE_CLIENTREPLAY);
+ mdsmap->get_mds_set_lower_bound(active, MDSMap::STATE_CLIENTREPLAY);
+ for (const auto& r : active) {
+ if (r == whoami)
+ continue; // not me
+ if (!oldactive.count(r) || restart.count(r)) // newly so?
+ handle_mds_recovery(r);
}
}
- // did someone fail?
- // did their addr/inst change?
- {
- set<mds_rank_t> up;
- mdsmap->get_up_mds_set(up);
- for (set<mds_rank_t>::iterator p = up.begin(); p != up.end(); ++p) {
- if (oldmap->have_inst(*p) &&
- oldmap->get_inst(*p) != mdsmap->get_inst(*p)) {
- messenger->mark_down(oldmap->get_inst(*p).addr);
- handle_mds_failure(*p);
- }
- }
- }
-
- if (is_clientreplay() || is_active() || is_stopping()) {
+ if (state >= MDSMap::STATE_CLIENTREPLAY) {
// did anyone stop?
set<mds_rank_t> oldstopped, stopped;
oldmap->get_stopped_mds_set(oldstopped);
mdsmap->get_stopped_mds_set(stopped);
- for (set<mds_rank_t>::iterator p = stopped.begin(); p != stopped.end(); ++p)
- if (oldstopped.count(*p) == 0) // newly so?
- mdcache->migrator->handle_mds_failure_or_stop(*p);
+ for (const auto& r : stopped)
+ if (oldstopped.count(r) == 0) // newly so?
+ mdcache->migrator->handle_mds_failure_or_stop(r);
}
{
/* This function DOES put the passed message before returning*/
void Migrator::dispatch(Message *m)
{
+ if (unlikely(inject_message_loss)) {
+ if (inject_message_loss == m->get_type() - MDS_PORT_MIGRATOR) {
+ dout(0) << "inject message loss " << *m << dendl;
+ m->put();
+ return;
+ }
+ }
+
switch (m->get_type()) {
// import
case MSG_MDS_EXPORTDIRDISCOVER:
case MSG_MDS_EXPORTCAPS:
handle_export_caps(static_cast<MExportCaps*>(m));
break;
+ case MSG_MDS_EXPORTCAPSACK:
+ handle_export_caps_ack(static_cast<MExportCapsAck*>(m));
+ break;
case MSG_MDS_GATHERCAPS:
handle_gather_caps(static_cast<MGatherCaps*>(m));
break;
if (it->second.state == EXPORT_CANCELLED) {
export_state.erase(it);
- dir->state_clear(CDir::STATE_EXPORTING);
+ dir->clear_exporting();
// send pending import_maps?
cache->maybe_send_pending_resolves();
}
void Migrator::export_cancel_finish(CDir *dir)
{
assert(dir->state_test(CDir::STATE_EXPORTING));
- dir->state_clear(CDir::STATE_EXPORTING);
+ dir->clear_exporting();
// pinned by Migrator::export_notify_abort()
dir->auth_unpin(this);
mds->hit_export_target(ceph_clock_now(), dest, -1);
dir->auth_pin(this);
- dir->state_set(CDir::STATE_EXPORTING);
+ dir->mark_exporting();
MDRequestRef mdr = mds->mdcache->request_start_internal(CEPH_MDS_OP_EXPORTDIR);
mdr->more()->export_dir = dir;
mds->send_message_mds(new MExportDirCancel(dir->dirfrag(), it->second.tid), it->second.peer);
export_state.erase(it);
- dir->state_clear(CDir::STATE_EXPORTING);
+ dir->clear_exporting();
cache->maybe_send_pending_resolves();
return;
}
MutationRef mut = it->second.mut;
// remove from exporting list, clean up state
export_state.erase(it);
- dir->state_clear(CDir::STATE_EXPORTING);
+ dir->clear_exporting();
cache->show_subtrees();
audit();
cap->mark_importing();
}
- Capability::Import& im = import_map[it.first];
- im.cap_id = cap->get_cap_id();
- im.mseq = auth_cap ? it.second.mseq : cap->get_mseq();
- im.issue_seq = cap->get_last_seq() + 1;
+ // Always ask exporter mds to send cap export messages for auth caps.
+ // For non-auth caps, ask exporter mds to send cap export messages to
+ // clients who haven't opened sessions. The cap export messages will
+ // make clients open sessions.
+ if (auth_cap || session->connection == nullptr) {
+ Capability::Import& im = import_map[it.first];
+ im.cap_id = cap->get_cap_id();
+ im.mseq = auth_cap ? it.second.mseq : cap->get_mseq();
+ im.issue_seq = cap->get_last_seq() + 1;
+ }
if (peer >= 0) {
cap->merge(it.second, auth_cap);
mds->send_message_mds(ex, dest);
}
+/* This function DOES put the passed message before returning*/
+void Migrator::handle_export_caps_ack(MExportCapsAck *ack)
+{
+ mds_rank_t from = ack->get_source().num();
+ CInode *in = cache->get_inode(ack->ino);
+ if (in) {
+ assert(!in->is_auth());
+
+ dout(10) << "handle_export_caps_ack " << *ack << " from "
+ << ack->get_source() << " on " << *in << dendl;
+
+ map<client_t,Capability::Import> imported_caps;
+ map<client_t,uint64_t> caps_ids;
+ auto blp = ack->cap_bl.begin();
+ ::decode(imported_caps, blp);
+ ::decode(caps_ids, blp);
+
+ for (auto& it : imported_caps) {
+ Capability *cap = in->get_client_cap(it.first);
+ if (!cap || cap->get_cap_id() != caps_ids.at(it.first))
+ continue;
+
+ dout(7) << __func__ << " telling client." << it.first
+ << " exported caps on " << *in << dendl;
+ MClientCaps *m = new MClientCaps(CEPH_CAP_OP_EXPORT, in->ino(), 0,
+ cap->get_cap_id(), cap->get_mseq(),
+ mds->get_osd_epoch_barrier());
+ m->set_cap_peer(it.second.cap_id, it.second.issue_seq, it.second.mseq, from, 0);
+ mds->send_message_client_counted(m, it.first);
+
+ in->remove_client_cap(it.first);
+ }
+
+ mds->locker->request_inode_file_caps(in);
+ mds->locker->try_eval(in, CEPH_CAP_LOCKS);
+ }
+
+ ack->put();
+}
+
void Migrator::handle_gather_caps(MGatherCaps *m)
{
CInode *in = cache->get_inode(m->ino);
-
if (!in)
goto out;
dout(10) << "handle_gather_caps " << *m << " from " << m->get_source()
- << " on " << *in
- << dendl;
+ << " on " << *in << dendl;
+
if (in->is_any_caps() &&
!in->is_auth() &&
!in->is_ambiguous_auth() &&
// force open client sessions and finish cap import
mds->server->finish_force_open_sessions(imported_session_map);
- map<client_t,Capability::Import> imported_caps;
-
auto it = peer_exports.find(in);
assert(it != peer_exports.end());
// clients will release caps from the exporter when they receive the cap import message.
+ map<client_t,Capability::Import> imported_caps;
finish_import_inode_caps(in, from, false, imported_session_map, it->second, imported_caps);
mds->locker->eval(in, CEPH_CAP_LOCKS, true);
+
+ if (!imported_caps.empty()) {
+ MExportCapsAck *ack = new MExportCapsAck(in->ino());
+ map<client_t,uint64_t> peer_caps_ids;
+ for (auto &p : imported_caps )
+ peer_caps_ids[p.first] = it->second.at(p.first).cap_id;
+
+ ::encode(imported_caps, ack->cap_bl);
+ ::encode(peer_caps_ids, ack->cap_bl);
+ mds->send_message_mds(ack, from);
+ }
+
in->auth_unpin(this);
}
inject_session_race = conf->get_val<bool>("mds_inject_migrator_session_race");
dout(0) << "mds_inject_migrator_session_race is " << inject_session_race << dendl;
}
+
+ if (changed.count("mds_inject_migrator_message_loss")) {
+ inject_message_loss = g_conf->get_val<int64_t>("mds_inject_migrator_message_loss");
+ dout(0) << "mds_inject_migrator_message_loss is " << inject_message_loss << dendl;
+ }
}
// -- cons --
Migrator(MDSRank *m, MDCache *c) : mds(m), cache(c) {
inject_session_race = g_conf->get_val<bool>("mds_inject_migrator_session_race");
+ inject_message_loss = g_conf->get_val<int64_t>("mds_inject_migrator_message_loss");
}
void handle_conf_change(const struct md_config_t *conf,
void handle_export_finish(MExportDirFinish *m);
void handle_export_caps(MExportCaps *m);
+ void handle_export_caps_ack(MExportCapsAck *m);
void logged_import_caps(CInode *in,
mds_rank_t from,
map<client_t,pair<Session*,uint64_t> >& imported_session_map,
MDSRank *mds;
MDCache *cache;
bool inject_session_race = false;
+ int inject_message_loss = 0;
};
#endif
delayed_flush = nullptr;
}
+ if (int r = journaler.get_error()) {
+ derr << "Error " << r << " recovering write_pos" << dendl;
+ on_error->complete(r);
+ return could_consume;
+ }
+
if (!journaler.is_readable()) {
dout(10) << " not readable right now" << dendl;
// Because we are the writer and the reader of the journal
Mutex::Locker l(lock);
if (r == 0) {
_consume();
+ } else if (r != -EAGAIN) {
+ on_error->complete(r);
}
}));
}
mdr->more()->flock_was_waiting = true;
mds->locker->drop_locks(mdr.get());
mdr->drop_local_auth_pins();
+ mdr->mark_event("failed to add lock, waiting");
+ mdr->mark_nowarn();
cur->add_waiter(CInode::WAIT_FLOCK, new C_MDS_RetryRequest(mdcache, mdr));
}
} else
if (features & CEPH_FEATURE_MDS_QUOTA)
::decode(quota, p);
else
- memset(&quota, 0, sizeof(quota));
+ quota = quota_info_t{};
if ((features & CEPH_FEATURE_FS_FILE_LAYOUT_V2))
::decode(layout.pool_ns, p);
class MExportCapsAck : public Message {
public:
inodeno_t ino;
+ bufferlist cap_bl;
MExportCapsAck() :
Message(MSG_MDS_EXPORTCAPSACK) {}
void encode_payload(uint64_t features) override {
::encode(ino, payload);
+ ::encode(cap_bl, payload);
}
void decode_payload() override {
- bufferlist::iterator p = payload.begin();
+ auto p = payload.begin();
::decode(ino, p);
+ ::decode(cap_bl, p);
}
-
};
#endif
// is "useful" so that mgr plugins filtering on prio will get some
// data (albeit probably more than they wanted)
uint8_t priority = PerfCountersBuilder::PRIO_USEFUL;
+ enum unit_t unit;
void encode(bufferlist &bl) const
{
// TODO: decide whether to drop the per-type
// encoding here, we could rely on the MgrReport
// verisoning instead.
- ENCODE_START(2, 1, bl);
+ ENCODE_START(3, 1, bl);
::encode(path, bl);
::encode(description, bl);
::encode(nick, bl);
static_assert(sizeof(type) == 1, "perfcounter_type_d must be one byte");
::encode((uint8_t)type, bl);
::encode(priority, bl);
+ ::encode((uint8_t)unit, bl);
ENCODE_FINISH(bl);
}
void decode(bufferlist::iterator &p)
{
- DECODE_START(2, p);
+ DECODE_START(3, p);
::decode(path, p);
::decode(description, p);
::decode(nick, p);
if (struct_v >= 2) {
::decode(priority, p);
}
+ if (struct_v >= 3) {
+ ::decode((uint8_t&)unit, p);
+ }
DECODE_FINISH(p);
}
};
MMonSubscribeAck() : Message(CEPH_MSG_MON_SUBSCRIBE_ACK),
interval(0) {
- memset(&fsid, 0, sizeof(fsid));
}
MMonSubscribeAck(uuid_d& f, int i) : Message(CEPH_MSG_MON_SUBSCRIBE_ACK),
interval(i), fsid(f) { }
}
});
return f.get();
- } else if (what == "config") {
+ } else if (what.substr(0, 6) == "config") {
PyFormatter f;
- g_conf->show_config(&f);
+ if (what == "config_options") {
+ g_conf->config_options(&f);
+ } else if (what == "config") {
+ g_conf->show_config(&f);
+ }
return f.get();
} else if (what == "mon_map") {
PyFormatter f;
}
f.dump_unsigned("type", type.type);
f.dump_unsigned("priority", type.priority);
+ f.dump_unsigned("units", type.unit);
f.close_section();
}
f.close_section();
auto p = pg_map.osd_stat.find(osd);
if (p == pg_map.osd_stat.end()) {
missing_stats.insert(osd);
- }
- if (p->second.num_pgs > 0) {
+ } else if (p->second.num_pgs > 0) {
stored_pgs.insert(osd);
}
}
type.nick = data.nick;
}
type.type = data.type;
- type.priority = perf_counters.get_adjusted_priority(data.prio);
+ type.priority = perf_counters.get_adjusted_priority(data.prio);
+ type.unit = data.unit;
report->declare_types.push_back(std::move(type));
session->declared.insert(path);
}
void MgrClient::update_osd_health(std::vector<OSDHealthMetric>&& metrics)
{
+ Mutex::Locker l(lock);
osd_health_metrics = std::move(metrics);
}
health_detail.append("; ");
stringstream ss;
ss << "store is getting too big! "
- << prettybyte_t(stats.store_stats.bytes_total)
- << " >= " << prettybyte_t(g_conf->mon_data_size_warn);
+ << byte_u_t(stats.store_stats.bytes_total)
+ << " >= " << byte_u_t(g_conf->mon_data_size_warn);
health_detail.append(ss.str());
}
return err;
}
dout(0) << __func__ << " avail " << ours.fs_stats.avail_percent << "%"
- << " total " << prettybyte_t(ours.fs_stats.byte_total)
- << ", used " << prettybyte_t(ours.fs_stats.byte_used)
- << ", avail " << prettybyte_t(ours.fs_stats.byte_avail) << dendl;
+ << " total " << byte_u_t(ours.fs_stats.byte_total)
+ << ", used " << byte_u_t(ours.fs_stats.byte_used)
+ << ", avail " << byte_u_t(ours.fs_stats.byte_avail) << dendl;
ours.last_update = ceph_clock_now();
return update_store_stats(ours);
for (const auto &gid : to_fail) {
// Standby replays don't write, so it isn't important to
// wait for an osdmap propose here: ignore return value.
- mon->mdsmon()->fail_mds_gid(gid);
+ mon->mdsmon()->fail_mds_gid(fsmap, gid);
}
fsmap.erase_filesystem(fs->fscid);
stats.store_stats.bytes_misc = extra["misc"];
stats.last_update = ceph_clock_now();
dout(10) << __func__ << " avail " << stats.fs_stats.avail_percent << "%"
- << " total " << prettybyte_t(stats.fs_stats.byte_total)
- << ", used " << prettybyte_t(stats.fs_stats.byte_used)
- << ", avail " << prettybyte_t(stats.fs_stats.byte_avail) << dendl;
+ << " total " << byte_u_t(stats.fs_stats.byte_total)
+ << ", used " << byte_u_t(stats.fs_stats.byte_used)
+ << ", avail " << byte_u_t(stats.fs_stats.byte_avail) << dendl;
// MON_DISK_{LOW,CRIT,BIG}
health_check_map_t next;
ss << "mon%plurals% %names% %isorare% using a lot of disk space";
auto& d = next.add("MON_DISK_BIG", HEALTH_WARN, ss.str());
ss2 << "mon." << mon->name << " is "
- << prettybyte_t(stats.store_stats.bytes_total)
+ << byte_u_t(stats.store_stats.bytes_total)
<< " >= mon_data_size_warn ("
- << prettybyte_t(g_conf->mon_data_size_warn) << ")";
+ << byte_u_t(g_conf->mon_data_size_warn) << ")";
d.detail.push_back(ss2.str());
}
quorum_checks[mon->rank] = next;
changed = true;
} else {
- // tell the leader
- mon->messenger->send_message(new MMonHealthChecks(next),
- mon->monmap->get_inst(mon->get_leader()));
+ // tell the leader, but only if the quorum is luminous
+ if (mon->quorum_mon_features.contains_all(
+ ceph::features::mon::FEATURE_LUMINOUS)) {
+ mon->messenger->send_message(new MMonHealthChecks(next),
+ mon->monmap->get_inst(mon->get_leader()));
+ }
}
return changed;
{
dout(10) << "create_initial -- creating initial map" << dendl;
LogEntry e;
- memset(&e.who, 0, sizeof(e.who));
e.name = g_conf->name;
e.stamp = ceph_clock_now();
e.prio = CLOG_INFO;
t->erase(MDS_HEALTH_PREFIX, stringify(*i));
}
pending_daemon_health_rm.clear();
- remove_from_metadata(t);
+ remove_from_metadata(pending, t);
// health
health_check_map_t new_checks;
version_t seq = m->get_seq();
dout(15) << "_note_beacon " << *m << " noting time" << dendl;
- last_beacon[gid].stamp = ceph_clock_now();
- last_beacon[gid].seq = seq;
+ auto &beacon = last_beacon[gid];
+ beacon.stamp = mono_clock::now();
+ beacon.seq = seq;
}
bool MDSMonitor::preprocess_beacon(MonOpRequestRef op)
MDSMap::mds_info_t info;
epoch_t effective_epoch = 0;
- const auto &fsmap = get_working_fsmap();
+ const auto &fsmap = get_fsmap();
// check privileges, ignore if fails
MonSession *session = m->get_session();
dout(7) << "mds_beacon " << *m << " is not in fsmap (state "
<< ceph_mds_state_name(state) << ")" << dendl;
+ /* We can't send an MDSMap this MDS was a part of because we no longer
+ * know which FS it was part of. Nor does this matter. Sending an empty
+ * MDSMap is sufficient for getting the MDS to respawn.
+ */
MDSMap null_map;
null_map.epoch = fsmap.epoch;
null_map.compat = fsmap.compat;
MMDSLoadTargets *m = static_cast<MMDSLoadTargets*>(op->get_req());
dout(10) << "preprocess_offload_targets " << *m << " from " << m->get_orig_source() << dendl;
- auto &fsmap = get_working_fsmap();
+ const auto &fsmap = get_fsmap();
// check privileges, ignore message if fails
MonSession *session = m->get_session();
if (!session)
- goto done;
+ goto ignore;
if (!session->is_capable("mds", MON_CAP_X)) {
dout(0) << "preprocess_offload_targets got MMDSLoadTargets from entity with insufficient caps "
<< session->caps << dendl;
- goto done;
+ goto ignore;
}
if (fsmap.gid_exists(m->global_id) &&
m->targets == fsmap.get_info_gid(m->global_id).export_targets)
- goto done;
+ goto ignore;
return false;
- done:
+ ignore:
+ mon->no_reply(op);
return true;
}
const MDSMap::mds_info_t &existing_info =
pending.get_info_gid(existing);
mon->clog->info() << existing_info.human_name() << " restarted";
- fail_mds_gid(existing);
+ fail_mds_gid(pending, existing);
failed_mds = true;
}
if (failed_mds) {
info.standby_for_name);
if (leaderinfo && (leaderinfo->rank >= 0)) {
const auto &fscid = pending.mds_roles.at(leaderinfo->global_id);
- const auto &fs = pending.get_filesystem(fscid);
pending.modify_daemon(gid, [fscid, leaderinfo](
MDSMap::mds_info_t *info) {
}
// initialize the beacon timer
- last_beacon[gid].stamp = ceph_clock_now();
- last_beacon[gid].seq = seq;
+ auto &beacon = last_beacon[gid];
+ beacon.stamp = mono_clock::now();
+ beacon.seq = seq;
// new incompat?
if (!pending.compat.writeable(m->get_compat())) {
return false;
}
- fail_mds_gid(gid);
+ fail_mds_gid(pending, gid);
assert(mon->osdmon()->is_writeable());
request_proposal(mon->osdmon());
stringstream ss, ds;
map<string, cmd_vartype> cmdmap;
- const auto &fsmap = get_working_fsmap();
+ const auto &fsmap = get_fsmap();
if (!cmdmap_from_json(m->cmd, &cmdmap, ss)) {
// ss has reason for failure
cmd_getval(g_ceph_context, cmdmap, "prefix", prefix);
string format;
cmd_getval(g_ceph_context, cmdmap, "format", format, string("plain"));
- boost::scoped_ptr<Formatter> f(Formatter::create(format));
+ std::unique_ptr<Formatter> f(Formatter::create(format));
MonSession *session = m->get_session();
if (!session) {
int64_t epocharg;
epoch_t epoch;
- const FSMap *fsmapp = &get_fsmap();
+ const FSMap *fsmapp = &fsmap;
FSMap dummy;
if (cmd_getval(g_ceph_context, cmdmap, "epoch", epocharg)) {
epoch = epocharg;
f->open_object_section("mds");
f->dump_string("name", info.name);
std::ostringstream get_err;
- r = dump_metadata(info.name, f.get(), get_err);
+ r = dump_metadata(fsmap, info.name, f.get(), get_err);
if (r == -EINVAL || r == -ENOENT) {
// Drop error, list what metadata we do have
dout(1) << get_err.str() << dendl;
} else {
// Dump a single daemon's metadata
f->open_object_section("mds_metadata");
- r = dump_metadata(who, f.get(), ss);
+ r = dump_metadata(fsmap, who, f.get(), ss);
f->close_section();
}
f->flush(ds);
} else if (prefix == "fs ls") {
if (f) {
f->open_array_section("filesystems");
- {
- for (const auto &p : fsmap.filesystems) {
- const auto &fs = p.second;
- f->open_object_section("filesystem");
- {
- const MDSMap &mds_map = fs->mds_map;
- f->dump_string("name", mds_map.fs_name);
- /* Output both the names and IDs of pools, for use by
- * humans and machines respectively */
- f->dump_string("metadata_pool", mon->osdmon()->osdmap.get_pool_name(
- mds_map.metadata_pool));
- f->dump_int("metadata_pool_id", mds_map.metadata_pool);
- f->open_array_section("data_pool_ids");
- {
- for (auto dpi = mds_map.data_pools.begin();
- dpi != mds_map.data_pools.end(); ++dpi) {
- f->dump_int("data_pool_id", *dpi);
- }
- }
- f->close_section();
-
- f->open_array_section("data_pools");
- {
- for (auto dpi = mds_map.data_pools.begin();
- dpi != mds_map.data_pools.end(); ++dpi) {
- const auto &name = mon->osdmon()->osdmap.get_pool_name(
- *dpi);
- f->dump_string("data_pool", name);
- }
- }
+ for (const auto &p : fsmap.filesystems) {
+ const auto &fs = p.second;
+ f->open_object_section("filesystem");
+ {
+ const MDSMap &mds_map = fs->mds_map;
+ f->dump_string("name", mds_map.fs_name);
+ /* Output both the names and IDs of pools, for use by
+ * humans and machines respectively */
+ f->dump_string("metadata_pool", mon->osdmon()->osdmap.get_pool_name(
+ mds_map.metadata_pool));
+ f->dump_int("metadata_pool_id", mds_map.metadata_pool);
+ f->open_array_section("data_pool_ids");
+ for (const auto &id : mds_map.data_pools) {
+ f->dump_int("data_pool_id", id);
+ }
+ f->close_section();
- f->close_section();
+ f->open_array_section("data_pools");
+ for (const auto &id : mds_map.data_pools) {
+ const auto &name = mon->osdmon()->osdmap.get_pool_name(id);
+ f->dump_string("data_pool", name);
}
f->close_section();
}
+ f->close_section();
}
f->close_section();
f->flush(ds);
ds << "name: " << mds_map.fs_name << ", metadata pool: "
<< md_pool_name << ", data pools: [";
- for (auto dpi : mds_map.data_pools) {
- const string &pool_name = mon->osdmon()->osdmap.get_pool_name(dpi);
+ for (const auto &id : mds_map.data_pools) {
+ const string &pool_name = mon->osdmon()->osdmap.get_pool_name(id);
ds << pool_name << " ";
}
ds << "]" << std::endl;
return false;
}
-bool MDSMonitor::fail_mds_gid(mds_gid_t gid)
+bool MDSMonitor::fail_mds_gid(FSMap &fsmap, mds_gid_t gid)
{
- auto &pending = get_pending_fsmap_writeable();
-
- const MDSMap::mds_info_t &info = pending.get_info_gid(gid);
+ const MDSMap::mds_info_t &info = fsmap.get_info_gid(gid);
dout(10) << "fail_mds_gid " << gid << " mds." << info.name << " role " << info.rank << dendl;
epoch_t blacklist_epoch = 0;
blacklist_epoch = mon->osdmon()->blacklist(info.addr, until);
}
- pending.erase(gid, blacklist_epoch);
+ fsmap.erase(gid, blacklist_epoch);
last_beacon.erase(gid);
if (pending_daemon_health.count(gid)) {
pending_daemon_health.erase(gid);
return blacklist_epoch != 0;
}
-mds_gid_t MDSMonitor::gid_from_arg(const std::string& arg, std::ostream &ss)
+mds_gid_t MDSMonitor::gid_from_arg(const FSMap &fsmap, const std::string &arg, std::ostream &ss)
{
- const auto &fsmap = get_working_fsmap();
-
// Try parsing as a role
mds_role_t role;
std::ostringstream ignore_err; // Don't spam 'ss' with parse_role errors
- int r = parse_role(arg, &role, ignore_err);
+ int r = fsmap.parse_role(arg, &role, ignore_err);
if (r == 0) {
// See if a GID is assigned to this role
const auto &fs = fsmap.get_filesystem(role.fscid);
return MDS_GID_NONE;
}
-int MDSMonitor::fail_mds(std::ostream &ss, const std::string &arg,
- MDSMap::mds_info_t *failed_info)
+int MDSMonitor::fail_mds(FSMap &fsmap, std::ostream &ss,
+ const std::string &arg, MDSMap::mds_info_t *failed_info)
{
assert(failed_info != nullptr);
- mds_gid_t gid = gid_from_arg(arg, ss);
+ mds_gid_t gid = gid_from_arg(fsmap, arg, ss);
if (gid == MDS_GID_NONE) {
return 0;
}
// Take a copy of the info before removing the MDS from the map,
// so that the caller knows which mds (if any) they ended up removing.
- *failed_info = get_pending_fsmap().get_info_gid(gid);
+ *failed_info = fsmap.get_info_gid(gid);
- fail_mds_gid(gid);
+ fail_mds_gid(fsmap, gid);
ss << "failed mds gid " << gid;
assert(mon->osdmon()->is_writeable());
request_proposal(mon->osdmon());
}
}
- r = filesystem_command(op, prefix, cmdmap, ss);
+ r = filesystem_command(pending, op, prefix, cmdmap, ss);
if (r >= 0) {
goto out;
} else if (r == -EAGAIN) {
goto out;
}
- r = legacy_filesystem_command(op, prefix, cmdmap, ss);
+ r = legacy_filesystem_command(pending, op, prefix, cmdmap, ss);
if (r == -ENOSYS && ss.str().empty()) {
ss << "unrecognized command";
}
}
-
-/**
- * Given one of the following forms:
- * <fs name>:<rank>
- * <fs id>:<rank>
- * <rank>
- *
- * Parse into a mds_role_t. The rank-only form is only valid
- * if legacy_client_ns is set.
- */
-int MDSMonitor::parse_role(
- const std::string &role_str,
- mds_role_t *role,
- std::ostream &ss)
-{
- return get_working_fsmap().parse_role(role_str, role, ss);
-}
-
int MDSMonitor::filesystem_command(
+ FSMap &fsmap,
MonOpRequestRef op,
std::string const &prefix,
map<string, cmd_vartype> &cmdmap,
string whostr;
cmd_getval(g_ceph_context, cmdmap, "who", whostr);
- auto &pending = get_pending_fsmap_writeable();
if (prefix == "mds stop" ||
prefix == "mds deactivate") {
mds_role_t role;
- r = parse_role(whostr, &role, ss);
+ r = fsmap.parse_role(whostr, &role, ss);
if (r < 0 ) {
return r;
}
- const auto &fs = pending.get_filesystem(role.fscid);
+ const auto &fs = fsmap.get_filesystem(role.fscid);
if (!fs->mds_map.is_active(role.rank)) {
r = -EEXIST;
r = 0;
mds_gid_t gid = fs->mds_map.up.at(role.rank);
ss << "telling mds." << role << " "
- << pending.get_info_gid(gid).addr << " to deactivate";
+ << fsmap.get_info_gid(gid).addr << " to deactivate";
- pending.modify_daemon(gid, [](MDSMap::mds_info_t *info) {
+ fsmap.modify_daemon(gid, [](MDSMap::mds_info_t *info) {
info->state = MDSMap::STATE_STOPPING;
});
}
<< cmd_vartype_stringify(cmdmap["state"]) << "'";
return -EINVAL;
}
- if (pending.gid_exists(gid)) {
- pending.modify_daemon(gid, [state](MDSMap::mds_info_t *info) {
+ if (fsmap.gid_exists(gid)) {
+ fsmap.modify_daemon(gid, [state](MDSMap::mds_info_t *info) {
info->state = state;
});
ss << "set mds gid " << gid << " to state " << state << " "
cmd_getval(g_ceph_context, cmdmap, "who", who);
MDSMap::mds_info_t failed_info;
- r = fail_mds(ss, who, &failed_info);
+ r = fail_mds(fsmap, ss, who, &failed_info);
if (r < 0 && r == -EAGAIN) {
mon->osdmon()->wait_for_writeable(op, new C_RetryMessage(this, op));
return -EAGAIN; // don't propose yet; wait for message to be retried
<< cmd_vartype_stringify(cmdmap["gid"]) << "'";
return -EINVAL;
}
- if (!pending.gid_exists(gid)) {
+ if (!fsmap.gid_exists(gid)) {
ss << "mds gid " << gid << " dne";
r = 0;
} else {
- const auto &info = pending.get_info_gid(gid);
+ const auto &info = fsmap.get_info_gid(gid);
MDSMap::DaemonState state = info.state;
if (state > 0) {
ss << "cannot remove active mds." << info.name
<< " rank " << info.rank;
return -EBUSY;
} else {
- pending.erase(gid, {});
+ fsmap.erase(gid, {});
ss << "removed mds gid " << gid;
return 0;
}
std::string role_str;
cmd_getval(g_ceph_context, cmdmap, "who", role_str);
mds_role_t role;
- int r = parse_role(role_str, &role, ss);
+ int r = fsmap.parse_role(role_str, &role, ss);
if (r < 0) {
ss << "invalid role '" << role_str << "'";
return -EINVAL;
}
- pending.modify_filesystem(
+ fsmap.modify_filesystem(
role.fscid,
[role](std::shared_ptr<Filesystem> fs)
{
<< cmd_vartype_stringify(cmdmap["feature"]) << "'";
return -EINVAL;
}
- if (pending.compat.compat.contains(f)) {
+ if (fsmap.compat.compat.contains(f)) {
ss << "removing compat feature " << f;
- CompatSet modified = pending.compat;
+ CompatSet modified = fsmap.compat;
modified.compat.remove(f);
- pending.update_compat(modified);
+ fsmap.update_compat(modified);
} else {
- ss << "compat feature " << f << " not present in " << pending.compat;
+ ss << "compat feature " << f << " not present in " << fsmap.compat;
}
r = 0;
} else if (prefix == "mds compat rm_incompat") {
<< cmd_vartype_stringify(cmdmap["feature"]) << "'";
return -EINVAL;
}
- if (pending.compat.incompat.contains(f)) {
+ if (fsmap.compat.incompat.contains(f)) {
ss << "removing incompat feature " << f;
- CompatSet modified = pending.compat;
+ CompatSet modified = fsmap.compat;
modified.incompat.remove(f);
- pending.update_compat(modified);
+ fsmap.update_compat(modified);
} else {
- ss << "incompat feature " << f << " not present in " << pending.compat;
+ ss << "incompat feature " << f << " not present in " << fsmap.compat;
}
r = 0;
} else if (prefix == "mds repaired") {
std::string role_str;
cmd_getval(g_ceph_context, cmdmap, "rank", role_str);
mds_role_t role;
- r = parse_role(role_str, &role, ss);
+ r = fsmap.parse_role(role_str, &role, ss);
if (r < 0) {
return r;
}
- bool modified = pending.undamaged(role.fscid, role.rank);
+ bool modified = fsmap.undamaged(role.fscid, role.rank);
if (modified) {
dout(4) << "repaired: restoring rank " << role << dendl;
} else {
/**
* Helper to legacy_filesystem_command
*/
-void MDSMonitor::modify_legacy_filesystem(
+void MDSMonitor::modify_legacy_filesystem(FSMap &fsmap,
std::function<void(std::shared_ptr<Filesystem> )> fn)
{
- auto &pending_fsmap = get_pending_fsmap_writeable();
- pending_fsmap.modify_filesystem(
- pending_fsmap.legacy_client_fscid,
+ fsmap.modify_filesystem(
+ fsmap.legacy_client_fscid,
fn
);
}
* @retval < 0 An error has occurred; **ss** may have been set.
*/
int MDSMonitor::legacy_filesystem_command(
+ FSMap &fsmap,
MonOpRequestRef op,
std::string const &prefix,
map<string, cmd_vartype> &cmdmap,
string whostr;
cmd_getval(g_ceph_context, cmdmap, "who", whostr);
- auto &pending_fsmap = get_pending_fsmap_writeable();
-
- assert (pending_fsmap.legacy_client_fscid != FS_CLUSTER_ID_NONE);
+ assert (fsmap.legacy_client_fscid != FS_CLUSTER_ID_NONE);
if (prefix == "mds set_max_mds") {
// NOTE: deprecated by "fs set max_mds"
}
const MDSMap& mdsmap =
- pending_fsmap.filesystems.at(pending_fsmap.legacy_client_fscid)->mds_map;
+ fsmap.filesystems.at(fsmap.legacy_client_fscid)->mds_map;
if (!mdsmap.allows_multimds() &&
maxmds > mdsmap.get_max_mds() &&
return -EINVAL;
}
- modify_legacy_filesystem(
+ modify_legacy_filesystem(fsmap,
[maxmds](std::shared_ptr<Filesystem> fs)
{
fs->mds_map.set_max_mds(maxmds);
ss << "max_mds = " << maxmds;
} else if (prefix == "mds cluster_down") {
// NOTE: deprecated by "fs set cluster_down"
- modify_legacy_filesystem(
+ modify_legacy_filesystem(fsmap,
[](std::shared_ptr<Filesystem> fs)
{
fs->mds_map.set_flag(CEPH_MDSMAP_DOWN);
r = 0;
} else if (prefix == "mds cluster_up") {
// NOTE: deprecated by "fs set cluster_up"
- modify_legacy_filesystem(
+ modify_legacy_filesystem(fsmap,
[](std::shared_ptr<Filesystem> fs)
{
fs->mds_map.clear_flag(CEPH_MDSMAP_DOWN);
if (is_mds) {
// What (if any) namespace are you assigned to?
auto mds_info = fsmap.get_mds_info();
- for (const auto &i : mds_info) {
- if (i.second.addr == sub->session->inst.addr) {
- mds_gid = i.first;
+ for (const auto &p : mds_info) {
+ if (p.second.addr == sub->session->inst.addr) {
+ mds_gid = p.first;
fscid = fsmap.mds_roles.at(mds_gid);
}
}
paxos->trigger_propose();
}
-void MDSMonitor::remove_from_metadata(MonitorDBStore::TransactionRef t)
+void MDSMonitor::remove_from_metadata(const FSMap &fsmap, MonitorDBStore::TransactionRef t)
{
bool update = false;
- for (map<mds_gid_t, Metadata>::iterator i = pending_metadata.begin();
- i != pending_metadata.end(); ) {
- if (!get_pending_fsmap().gid_exists(i->first)) {
- pending_metadata.erase(i++);
+ for (auto it = pending_metadata.begin(); it != pending_metadata.end(); ) {
+ if (!fsmap.gid_exists(it->first)) {
+ it = pending_metadata.erase(it);
update = true;
} else {
- ++i;
+ ++it;
}
}
if (!update)
return 0;
}
-void MDSMonitor::count_metadata(const string& field, map<string,int> *out)
+void MDSMonitor::count_metadata(const std::string &field, map<string,int> *out)
{
map<mds_gid_t,Metadata> meta;
load_metadata(meta);
}
}
-void MDSMonitor::count_metadata(const string& field, Formatter *f)
+void MDSMonitor::count_metadata(const std::string &field, Formatter *f)
{
map<string,int> by_val;
count_metadata(field, &by_val);
f->close_section();
}
-int MDSMonitor::dump_metadata(const std::string &who, Formatter *f, ostream& err)
+int MDSMonitor::dump_metadata(const FSMap& fsmap, const std::string &who,
+ Formatter *f, ostream& err)
{
assert(f);
- mds_gid_t gid = gid_from_arg(who, err);
+ mds_gid_t gid = gid_from_arg(fsmap, who, err);
if (gid == MDS_GID_NONE) {
return -EINVAL;
}
{
assert(f);
+ const auto &fsmap = get_fsmap();
+
map<mds_gid_t, Metadata> metadata;
if (int r = load_metadata(metadata)) {
return r;
}
map<string, list<int> > mdses; // hostname => rank
- for (map<mds_gid_t, Metadata>::iterator it = metadata.begin();
- it != metadata.end(); ++it) {
- const Metadata& m = it->second;
+ for (const auto &p : metadata) {
+ const mds_gid_t& gid = p.first;
+ const Metadata& m = p.second;
Metadata::const_iterator hostname = m.find("hostname");
if (hostname == m.end()) {
// not likely though
continue;
}
- const mds_gid_t gid = it->first;
- if (!get_fsmap().gid_exists(gid)) {
+ if (!fsmap.gid_exists(gid)) {
dout(5) << __func__ << ": GID " << gid << " not existent" << dendl;
continue;
}
- const MDSMap::mds_info_t& mds_info = get_fsmap().get_info_gid(gid);
+ const MDSMap::mds_info_t& mds_info = fsmap.get_info_gid(gid);
// FIXME: include filesystem name with rank here
mdses[hostname->second].push_back(mds_info.rank);
}
* If a cluster is undersized (with respect to max_mds), then
* attempt to find daemons to grow it.
*/
-bool MDSMonitor::maybe_expand_cluster(std::shared_ptr<Filesystem> &fs)
+bool MDSMonitor::maybe_expand_cluster(FSMap &fsmap, fs_cluster_id_t fscid)
{
- bool do_propose = false;
- auto &pending = get_pending_fsmap_writeable();
+ auto fs = fsmap.get_filesystem(fscid);
+ auto &mds_map = fs->mds_map;
if (fs->mds_map.test_flag(CEPH_MDSMAP_DOWN)) {
- return do_propose;
+ return false;
}
- while (fs->mds_map.get_num_in_mds() < size_t(fs->mds_map.get_max_mds()) &&
- !fs->mds_map.is_degraded()) {
+ int in = mds_map.get_num_in_mds();
+ int max = mds_map.get_max_mds();
+
+ dout(20) << __func__ << " in " << in << " max " << max << dendl;
+
+ if (in < max) {
mds_rank_t mds = mds_rank_t(0);
string name;
- while (fs->mds_map.is_in(mds)) {
+ while (mds_map.is_in(mds)) {
mds++;
}
- mds_gid_t newgid = pending.find_replacement_for({fs->fscid, mds},
+ mds_gid_t newgid = fsmap.find_replacement_for({fscid, mds},
name, g_conf->mon_force_standby_active);
if (newgid == MDS_GID_NONE) {
- break;
+ return false;
}
- const auto &new_info = pending.get_info_gid(newgid);
+ const auto &new_info = fsmap.get_info_gid(newgid);
dout(1) << "assigned standby " << new_info.addr
<< " as mds." << mds << dendl;
mon->clog->info() << new_info.human_name() << " assigned to "
- "filesystem " << fs->mds_map.fs_name << " as rank "
- << mds << " (now has " << fs->mds_map.get_num_in_mds() + 1
+ "filesystem " << mds_map.fs_name << " as rank "
+ << mds << " (now has " << mds_map.get_num_in_mds() + 1
<< " ranks)";
- pending.promote(newgid, fs, mds);
- do_propose = true;
+ fsmap.promote(newgid, fs, mds);
+ return true;
}
- return do_propose;
+ return false;
}
* is available, fail this daemon (remove from map) and pass its
* role to another daemon.
*/
-void MDSMonitor::maybe_replace_gid(mds_gid_t gid, const MDSMap::mds_info_t& info,
- bool *mds_propose, bool *osd_propose)
+void MDSMonitor::maybe_replace_gid(FSMap &fsmap, mds_gid_t gid,
+ const MDSMap::mds_info_t& info, bool *mds_propose, bool *osd_propose)
{
assert(mds_propose != nullptr);
assert(osd_propose != nullptr);
- auto &pending = get_pending_fsmap_writeable();
- const auto fscid = pending.mds_roles.at(gid);
+ const auto fscid = fsmap.mds_roles.at(gid);
// We will only take decisive action (replacing/removing a daemon)
// if we have some indicating that some other daemon(s) are successfully
// getting beacons through recently.
- utime_t latest_beacon;
- for (const auto & i : last_beacon) {
- latest_beacon = MAX(i.second.stamp, latest_beacon);
+ mono_time latest_beacon = mono_clock::zero();
+ for (const auto &p : last_beacon) {
+ latest_beacon = std::max(p.second.stamp, latest_beacon);
}
- const bool may_replace = latest_beacon >
- (ceph_clock_now() -
- MAX(g_conf->mds_beacon_interval, g_conf->mds_beacon_grace * 0.5));
+ mono_time now = mono_clock::now();
+ chrono::duration<double> since = now-latest_beacon;
+ const bool may_replace = since.count() <
+ std::max(g_conf->mds_beacon_interval, g_conf->mds_beacon_grace * 0.5);
// are we in?
// and is there a non-laggy standby that can take over for us?
info.state != MDSMap::STATE_STANDBY &&
info.state != MDSMap::STATE_STANDBY_REPLAY &&
may_replace &&
- !pending.get_filesystem(fscid)->mds_map.test_flag(CEPH_MDSMAP_DOWN) &&
- (sgid = pending.find_replacement_for({fscid, info.rank}, info.name,
+ !fsmap.get_filesystem(fscid)->mds_map.test_flag(CEPH_MDSMAP_DOWN) &&
+ (sgid = fsmap.find_replacement_for({fscid, info.rank}, info.name,
g_conf->mon_force_standby_active)) != MDS_GID_NONE)
{
- MDSMap::mds_info_t si = pending.get_info_gid(sgid);
+ MDSMap::mds_info_t si = fsmap.get_info_gid(sgid);
dout(10) << " replacing " << gid << " " << info.addr << " mds."
<< info.rank << "." << info.inc
<< " " << ceph_mds_state_name(info.state)
<< " with standby " << si.human_name();
// Remember what NS the old one was in
- const fs_cluster_id_t fscid = pending.mds_roles.at(gid);
+ const fs_cluster_id_t fscid = fsmap.mds_roles.at(gid);
// Remove the old one
- *osd_propose |= fail_mds_gid(gid);
+ *osd_propose |= fail_mds_gid(fsmap, gid);
// Promote the replacement
- auto fs = pending.filesystems.at(fscid);
- pending.promote(sgid, fs, info.rank);
+ auto fs = fsmap.filesystems.at(fscid);
+ fsmap.promote(sgid, fs, info.rank);
*mds_propose = true;
} else if ((info.state == MDSMap::STATE_STANDBY_REPLAY ||
<< dendl;
mon->clog->info() << "Standby " << info.human_name() << " is not "
"responding, dropping it";
- fail_mds_gid(gid);
+ fail_mds_gid(fsmap, gid);
*mds_propose = true;
} else if (!info.laggy()) {
dout(10) << " marking " << gid << " " << info.addr << " mds." << info.rank << "." << info.inc
<< " " << ceph_mds_state_name(info.state)
<< " laggy" << dendl;
- pending.modify_daemon(info.global_id, [](MDSMap::mds_info_t *info) {
+ fsmap.modify_daemon(info.global_id, [](MDSMap::mds_info_t *info) {
info->laggy_since = ceph_clock_now();
});
*mds_propose = true;
}
}
-bool MDSMonitor::maybe_promote_standby(std::shared_ptr<Filesystem> &fs)
+bool MDSMonitor::maybe_promote_standby(FSMap &fsmap, std::shared_ptr<Filesystem> &fs)
{
assert(!fs->mds_map.test_flag(CEPH_MDSMAP_DOWN));
- auto &pending = get_pending_fsmap_writeable();
-
bool do_propose = false;
// have a standby take over?
set<mds_rank_t>::iterator p = failed.begin();
while (p != failed.end()) {
mds_rank_t f = *p++;
- mds_gid_t sgid = pending.find_replacement_for({fs->fscid, f}, {},
+ mds_gid_t sgid = fsmap.find_replacement_for({fs->fscid, f}, {},
g_conf->mon_force_standby_active);
if (sgid) {
- const MDSMap::mds_info_t si = pending.get_info_gid(sgid);
+ const MDSMap::mds_info_t si = fsmap.get_info_gid(sgid);
dout(0) << " taking over failed mds." << f << " with " << sgid
<< "/" << si.name << " " << si.addr << dendl;
mon->clog->info() << "Standby " << si.human_name()
<< " assigned to filesystem " << fs->mds_map.fs_name
<< " as rank " << f;
- pending.promote(sgid, fs, f);
+ fsmap.promote(sgid, fs, f);
do_propose = true;
}
}
// them while perhaps-modifying standby_daemons during the loop
// (if we promote anyone they are removed from standby_daemons)
std::vector<mds_gid_t> standby_gids;
- for (const auto &j : pending.standby_daemons) {
+ for (const auto &j : fsmap.standby_daemons) {
standby_gids.push_back(j.first);
}
for (const auto &gid : standby_gids) {
- const auto &info = pending.standby_daemons.at(gid);
+ const auto &info = fsmap.standby_daemons.at(gid);
assert(info.state == MDSMap::STATE_STANDBY);
if (!info.standby_replay) {
// the standby_for_rank refers to: lookup via legacy_client_fscid
mds_role_t target_role = {
info.standby_for_fscid == FS_CLUSTER_ID_NONE ?
- pending.legacy_client_fscid : info.standby_for_fscid,
+ fsmap.legacy_client_fscid : info.standby_for_fscid,
info.standby_for_rank};
// It is possible that the map contains a standby_for_fscid
// that doesn't correspond to an existing filesystem, especially
// if we loaded from a version with a bug (#17466)
if (info.standby_for_fscid != FS_CLUSTER_ID_NONE
- && !pending.filesystem_exists(info.standby_for_fscid)) {
+ && !fsmap.filesystem_exists(info.standby_for_fscid)) {
derr << "gid " << gid << " has invalid standby_for_fscid "
<< info.standby_for_fscid << dendl;
continue;
// If we managed to resolve a full target role
if (target_role.fscid != FS_CLUSTER_ID_NONE) {
- const auto &fs = pending.get_filesystem(target_role.fscid);
+ const auto &fs = fsmap.get_filesystem(target_role.fscid);
if (fs->mds_map.is_followable(target_role.rank)) {
- do_propose |= try_standby_replay(
- info,
- *fs,
+ do_propose |= try_standby_replay(fsmap, info, *fs,
fs->mds_map.get_info(target_role.rank));
}
}
}
// check everyone
- for (const auto &p : pending.filesystems) {
+ for (const auto &p : fsmap.filesystems) {
if (info.standby_for_fscid != FS_CLUSTER_ID_NONE &&
info.standby_for_fscid != p.first)
continue;
continue; // we're supposed to follow someone else
}
- if (try_standby_replay(info, *fs, cand_info)) {
+ if (try_standby_replay(fsmap, info, *fs, cand_info)) {
assigned = true;
break;
}
// make sure mds's are still alive
// ...if i am an active leader
- if (!is_active()) return;
-
- dout(10) << get_working_fsmap() << dendl;
-
- if (!is_leader()) return;
+ if (!is_active() || !is_leader()) return;
auto &pending = get_pending_fsmap_writeable();
// expand mds cluster (add new nodes to @in)?
for (auto &p : pending.filesystems) {
- do_propose |= maybe_expand_cluster(p.second);
+ do_propose |= maybe_expand_cluster(pending, p.second->fscid);
}
- const auto now = ceph_clock_now();
- if (last_tick.is_zero()) {
+ mono_time now = mono_clock::now();
+ if (last_tick == decltype(last_tick)::min()) {
last_tick = now;
}
+ chrono::duration<double> since_last = now-last_tick;
- if (now - last_tick > (g_conf->mds_beacon_grace - g_conf->mds_beacon_interval)) {
+ if (since_last.count() >
+ (g_conf->mds_beacon_grace - g_conf->mds_beacon_interval)) {
// This case handles either local slowness (calls being delayed
// for whatever reason) or cluster election slowness (a long gap
// between calls while an election happened)
dout(4) << __func__ << ": resetting beacon timeouts due to mon delay "
"(slow election?) of " << now - last_tick << " seconds" << dendl;
- for (auto &i : last_beacon) {
- i.second.stamp = now;
+ for (auto &p : last_beacon) {
+ p.second.stamp = now;
}
}
last_tick = now;
- // check beacon timestamps
- utime_t cutoff = now;
- cutoff -= g_conf->mds_beacon_grace;
-
// make sure last_beacon is fully populated
for (auto &p : pending.mds_roles) {
auto &gid = p.first;
- if (last_beacon.count(gid) == 0) {
- last_beacon[gid].stamp = now;
- last_beacon[gid].seq = 0;
- }
+ last_beacon.emplace(std::piecewise_construct,
+ std::forward_as_tuple(gid),
+ std::forward_as_tuple(mono_clock::now(), 0));
}
+
+ // check beacon timestamps
bool propose_osdmap = false;
bool osdmap_writeable = mon->osdmon()->is_writeable();
- auto p = last_beacon.begin();
- while (p != last_beacon.end()) {
- mds_gid_t gid = p->first;
- auto beacon_info = p->second;
- ++p;
+ for (auto it = last_beacon.begin(); it != last_beacon.end(); ) {
+ mds_gid_t gid = it->first;
+ auto beacon_info = it->second;
+ chrono::duration<double> since_last = now-beacon_info.stamp;
if (!pending.gid_exists(gid)) {
// clean it out
- last_beacon.erase(gid);
+ it = last_beacon.erase(it);
continue;
}
- if (beacon_info.stamp < cutoff) {
+
+ if (since_last.count() >= g_conf->mds_beacon_grace) {
auto &info = pending.get_info_gid(gid);
dout(1) << "no beacon from mds." << info.rank << "." << info.inc
<< " (gid: " << gid << " addr: " << info.addr
<< " state: " << ceph_mds_state_name(info.state) << ")"
- << " since " << beacon_info.stamp << dendl;
+ << " since " << since_last.count() << "s" << dendl;
// If the OSDMap is writeable, we can blacklist things, so we can
// try failing any laggy MDS daemons. Consider each one for failure.
if (osdmap_writeable) {
- maybe_replace_gid(gid, info, &do_propose, &propose_osdmap);
+ maybe_replace_gid(pending, gid, info, &do_propose, &propose_osdmap);
}
}
+
+ ++it;
}
if (propose_osdmap) {
request_proposal(mon->osdmon());
for (auto &p : pending.filesystems) {
auto &fs = p.second;
if (!fs->mds_map.test_flag(CEPH_MDSMAP_DOWN)) {
- do_propose |= maybe_promote_standby(fs);
+ do_propose |= maybe_promote_standby(pending, fs);
}
}
* ainfo: the would-be leader
*/
bool MDSMonitor::try_standby_replay(
+ FSMap &fsmap,
const MDSMap::mds_info_t& finfo,
const Filesystem &leader_fs,
const MDSMap::mds_info_t& ainfo)
} else {
// Assign the new role to the standby
dout(10) << " setting to follow mds rank " << ainfo.rank << dendl;
- get_pending_fsmap_writeable().assign_standby_replay(finfo.global_id, leader_fs.fscid, ainfo.rank);
+ fsmap.assign_standby_replay(finfo.global_id, leader_fs.fscid, ainfo.rank);
return true;
}
}
void MDSMonitor::on_restart()
{
// Clear out the leader-specific state.
- last_tick = utime_t();
+ last_tick = mono_clock::now();
last_beacon.clear();
}
/**
* Return true if a blacklist was done (i.e. OSD propose needed)
*/
- bool fail_mds_gid(mds_gid_t gid);
+ bool fail_mds_gid(FSMap &fsmap, mds_gid_t gid);
bool is_leader() const override { return mon->is_leader(); }
void get_health(list<pair<health_status_t,string> >& summary,
list<pair<health_status_t,string> > *detail,
CephContext *cct) const override;
- int fail_mds(std::ostream &ss, const std::string &arg,
+ int fail_mds(FSMap &fsmap, std::ostream &ss,
+ const std::string &arg,
MDSMap::mds_info_t *failed_info);
bool preprocess_command(MonOpRequestRef op);
bool prepare_command(MonOpRequestRef op);
- int parse_role(
- const std::string &role_str,
- mds_role_t *role,
- std::ostream &ss);
-
void modify_legacy_filesystem(
+ FSMap &fsmap,
std::function<void(std::shared_ptr<Filesystem> )> fn);
int legacy_filesystem_command(
+ FSMap &fsmap,
MonOpRequestRef op,
std::string const &prefix,
map<string, cmd_vartype> &cmdmap,
std::stringstream &ss);
int filesystem_command(
+ FSMap &fsmap,
MonOpRequestRef op,
std::string const &prefix,
map<string, cmd_vartype> &cmdmap,
// beacons
struct beacon_info_t {
- utime_t stamp;
- uint64_t seq;
+ mono_time stamp = mono_clock::zero();
+ uint64_t seq = 0;
+ beacon_info_t() {}
+ beacon_info_t(mono_time stamp, uint64_t seq) : stamp(stamp), seq(seq) {}
};
map<mds_gid_t, beacon_info_t> last_beacon;
- bool try_standby_replay(
- const MDSMap::mds_info_t& finfo,
- const Filesystem &leader_fs,
- const MDSMap::mds_info_t& ainfo);
+ bool try_standby_replay(FSMap &fsmap, const MDSMap::mds_info_t& finfo,
+ const Filesystem &leader_fs, const MDSMap::mds_info_t& ainfo);
std::list<std::shared_ptr<FileSystemCommandHandler> > handlers;
- bool maybe_promote_standby(std::shared_ptr<Filesystem> &fs);
- bool maybe_expand_cluster(std::shared_ptr<Filesystem> &fs);
- void maybe_replace_gid(mds_gid_t gid, const MDSMap::mds_info_t& info,
- bool *mds_propose, bool *osd_propose);
+ bool maybe_promote_standby(FSMap &fsmap, std::shared_ptr<Filesystem> &fs);
+ bool maybe_expand_cluster(FSMap &fsmap, fs_cluster_id_t fscid);
+ void maybe_replace_gid(FSMap &fsmap, mds_gid_t gid,
+ const MDSMap::mds_info_t& info, bool *mds_propose, bool *osd_propose);
void tick() override; // check state, take actions
- int dump_metadata(const string& who, Formatter *f, ostream& err);
+ int dump_metadata(const FSMap &fsmap, const std::string &who, Formatter *f,
+ ostream& err);
void update_metadata(mds_gid_t gid, const Metadata& metadata);
- void remove_from_metadata(MonitorDBStore::TransactionRef t);
+ void remove_from_metadata(const FSMap &fsmap, MonitorDBStore::TransactionRef t);
int load_metadata(map<mds_gid_t, Metadata>& m);
- void count_metadata(const string& field, Formatter *f);
+ void count_metadata(const std::string& field, Formatter *f);
public:
- void count_metadata(const string& field, map<string,int> *out);
+ void count_metadata(const std::string& field, map<string,int> *out);
protected:
// MDS daemon GID to latest health state from that GID
std::map<uint64_t, MDSHealth> pending_daemon_health;
std::set<uint64_t> pending_daemon_health_rm;
-
map<mds_gid_t, Metadata> pending_metadata;
- mds_gid_t gid_from_arg(const std::string& arg, std::ostream& err);
+ mds_gid_t gid_from_arg(const FSMap &fsmap, const std::string &arg, std::ostream& err);
// When did the mon last call into our tick() method? Used for detecting
// when the mon was not updating us for some period (e.g. during slow
// election) to reset last_beacon timeouts
- utime_t last_tick;
+ mono_time last_tick = mono_clock::zero();
};
#endif
MonMap()
: epoch(0) {
- memset(&fsid, 0, sizeof(fsid));
}
uuid_d& get_fsid() { return fsid; }
pcb.add_u64(l_cluster_num_osd_up, "num_osd_up", "OSDs that are up");
pcb.add_u64(l_cluster_num_osd_in, "num_osd_in", "OSD in state \"in\" (they are in cluster)");
pcb.add_u64(l_cluster_osd_epoch, "osd_epoch", "Current epoch of OSD map");
- pcb.add_u64(l_cluster_osd_bytes, "osd_bytes", "Total capacity of cluster");
- pcb.add_u64(l_cluster_osd_bytes_used, "osd_bytes_used", "Used space");
- pcb.add_u64(l_cluster_osd_bytes_avail, "osd_bytes_avail", "Available space");
+ pcb.add_u64(l_cluster_osd_bytes, "osd_bytes", "Total capacity of cluster", NULL, 0, unit_t(BYTES));
+ pcb.add_u64(l_cluster_osd_bytes_used, "osd_bytes_used", "Used space", NULL, 0, unit_t(BYTES));
+ pcb.add_u64(l_cluster_osd_bytes_avail, "osd_bytes_avail", "Available space", NULL, 0, unit_t(BYTES));
pcb.add_u64(l_cluster_num_pool, "num_pool", "Pools");
pcb.add_u64(l_cluster_num_pg, "num_pg", "Placement groups");
pcb.add_u64(l_cluster_num_pg_active_clean, "num_pg_active_clean", "Placement groups in active+clean state");
if (p->quota_max_objects == 0)
rs << "N/A";
else
- rs << si_t(p->quota_max_objects) << " objects";
+ rs << si_u_t(p->quota_max_objects) << " objects";
rs << "\n"
<< " max bytes : ";
if (p->quota_max_bytes == 0)
rs << "N/A";
else
- rs << si_t(p->quota_max_bytes) << "B";
+ rs << byte_u_t(p->quota_max_bytes);
rdata.append(rs.str());
}
rdata.append("\n");
(uint64_t)sum.num_bytes >= pool.quota_max_bytes) {
mon->clog->warn() << "pool '" << pool_name << "' is full"
<< " (reached quota's max_bytes: "
- << si_t(pool.quota_max_bytes) << ")";
+ << byte_u_t(pool.quota_max_bytes) << ")";
}
if (pool.quota_max_objects > 0 &&
(uint64_t)sum.num_objects >= pool.quota_max_objects) {
auto it = profile.find("stripe_unit");
if (it != profile.end()) {
string err_str;
- uint32_t stripe_unit = strict_si_cast<uint32_t>(it->second.c_str(), &err_str);
+ uint32_t stripe_unit = strict_iecstrtoll(it->second.c_str(), &err_str);
if (!err_str.empty()) {
*ss << "could not parse stripe_unit '" << it->second
<< "': " << err_str << std::endl;
auto it = profile.find("stripe_unit");
if (it != profile.end()) {
string err_str;
- stripe_unit = strict_si_cast<uint32_t>(it->second.c_str(), &err_str);
+ stripe_unit = strict_iecstrtoll(it->second.c_str(), &err_str);
assert(err_str.empty());
}
*stripe_width = data_chunks *
// val could contain unit designations, so we treat as a string
string val;
cmd_getval(g_ceph_context, cmdmap, "val", val);
- stringstream tss;
- int64_t value = unit_to_bytesize(val, &tss);
- if (value < 0) {
- ss << "error parsing value '" << value << "': " << tss.str();
- err = value;
+ string tss;
+ int64_t value;
+ if (field == "max_objects") {
+ value = strict_sistrtoll(val.c_str(), &tss);
+ } else if (field == "max_bytes") {
+ value = strict_iecstrtoll(val.c_str(), &tss);
+ } else {
+ assert(0 == "unrecognized option");
+ }
+ if (!tss.empty()) {
+ ss << "error parsing value '" << val << "': " << tss;
+ err = -EINVAL;
goto reply;
}
} else {
*out << " pools: " << pg_pool_sum.size() << " pools, "
<< num_pg << " pgs\n";
- *out << " objects: " << si_t(pg_sum.stats.sum.num_objects) << " objects, "
- << prettybyte_t(pg_sum.stats.sum.num_bytes) << "\n";
+ *out << " objects: " << si_u_t(pg_sum.stats.sum.num_objects) << " objects, "
+ << byte_u_t(pg_sum.stats.sum.num_bytes) << "\n";
*out << " usage: "
- << kb_t(osd_sum.kb_used) << " used, "
- << kb_t(osd_sum.kb_avail) << " / "
- << kb_t(osd_sum.kb) << " avail\n";
+ << byte_u_t(osd_sum.kb_used << 10) << " used, "
+ << byte_u_t(osd_sum.kb_avail << 10) << " / "
+ << byte_u_t(osd_sum.kb << 10) << " avail\n";
*out << " pgs: ";
}
if (out)
*out << num_pg << " pgs: "
<< states << "; "
- << prettybyte_t(pg_sum.stats.sum.num_bytes) << " data, "
- << kb_t(osd_sum.kb_used) << " used, "
- << kb_t(osd_sum.kb_avail) << " / "
- << kb_t(osd_sum.kb) << " avail";
+ << byte_u_t(pg_sum.stats.sum.num_bytes) << " data, "
+ << byte_u_t(osd_sum.kb_used << 10) << " used, "
+ << byte_u_t(osd_sum.kb_avail << 10) << " / "
+ << byte_u_t(osd_sum.kb << 10) << " avail";
if (f) {
f->dump_unsigned("num_pgs", num_pg);
f->dump_unsigned("num_bytes", pg_sum.stats.sum.num_bytes);
if (pos_delta.stats.sum.num_rd) {
int64_t rd = (pos_delta.stats.sum.num_rd_kb << 10) / (double)stamp_delta;
if (out)
- *out << pretty_si_t(rd) << "B/s rd, ";
+ *out << byte_u_t(rd) << "/s rd, ";
if (f)
f->dump_unsigned("read_bytes_sec", rd);
}
if (pos_delta.stats.sum.num_wr) {
int64_t wr = (pos_delta.stats.sum.num_wr_kb << 10) / (double)stamp_delta;
if (out)
- *out << pretty_si_t(wr) << "B/s wr, ";
+ *out << byte_u_t(wr) << "/s wr, ";
if (f)
f->dump_unsigned("write_bytes_sec", wr);
}
int64_t iops = (pos_delta.stats.sum.num_rd + pos_delta.stats.sum.num_wr) / (double)stamp_delta;
if (out)
- *out << pretty_si_t(iops) << "op/s";
+ *out << si_u_t(iops) << "op/s";
if (f)
f->dump_unsigned("io_sec", iops);
}
f->dump_int("num_bytes_recovered", pos_delta.stats.sum.num_bytes_recovered);
f->dump_int("num_keys_recovered", pos_delta.stats.sum.num_keys_recovered);
} else {
- *out << pretty_si_t(bps) << "B/s";
+ *out << byte_u_t(bps) << "/s";
if (pos_delta.stats.sum.num_keys_recovered)
- *out << ", " << pretty_si_t(kps) << "keys/s";
- *out << ", " << pretty_si_t(objps) << "objects/s";
+ *out << ", " << si_u_t(kps) << "keys/s";
+ *out << ", " << si_u_t(objps) << "objects/s";
}
}
}
if (f) {
f->dump_int("read_bytes_sec", rd);
} else {
- *out << pretty_si_t(rd) << "B/s rd, ";
+ *out << byte_u_t(rd) << "/s rd, ";
}
}
if (pos_delta.stats.sum.num_wr) {
if (f) {
f->dump_int("write_bytes_sec", wr);
} else {
- *out << pretty_si_t(wr) << "B/s wr, ";
+ *out << byte_u_t(wr) << "/s wr, ";
}
}
int64_t iops_rd = pos_delta.stats.sum.num_rd / (double)delta_stamp;
f->dump_int("read_op_per_sec", iops_rd);
f->dump_int("write_op_per_sec", iops_wr);
} else {
- *out << pretty_si_t(iops_rd) << "op/s rd, " << pretty_si_t(iops_wr) << "op/s wr";
+ *out << si_u_t(iops_rd) << "op/s rd, " << si_u_t(iops_wr) << "op/s wr";
}
}
}
if (f) {
f->dump_int("flush_bytes_sec", flush);
} else {
- *out << pretty_si_t(flush) << "B/s flush";
+ *out << byte_u_t(flush) << "/s flush";
have_output = true;
}
}
} else {
if (have_output)
*out << ", ";
- *out << pretty_si_t(evict) << "B/s evict";
+ *out << byte_u_t(evict) << "/s evict";
have_output = true;
}
}
} else {
if (have_output)
*out << ", ";
- *out << pretty_si_t(promote) << "op/s promote";
+ *out << si_u_t(promote) << "op/s promote";
have_output = true;
}
}
} else {
if (have_output)
*out << ", ";
- *out << pretty_si_t(pos_delta.stats.sum.num_flush_mode_low) << "PG(s) flushing";
+ *out << si_u_t(pos_delta.stats.sum.num_flush_mode_low) << "PG(s) flushing";
have_output = true;
}
}
} else {
if (have_output)
*out << ", ";
- *out << pretty_si_t(pos_delta.stats.sum.num_flush_mode_high) << "PG(s) flushing (high)";
+ *out << si_u_t(pos_delta.stats.sum.num_flush_mode_high) << "PG(s) flushing (high)";
have_output = true;
}
}
} else {
if (have_output)
*out << ", ";
- *out << pretty_si_t(pos_delta.stats.sum.num_evict_mode_some) << "PG(s) evicting";
+ *out << si_u_t(pos_delta.stats.sum.num_evict_mode_some) << "PG(s) evicting";
have_output = true;
}
}
} else {
if (have_output)
*out << ", ";
- *out << pretty_si_t(pos_delta.stats.sum.num_evict_mode_full) << "PG(s) evicting (full)";
+ *out << si_u_t(pos_delta.stats.sum.num_evict_mode_full) << "PG(s) evicting (full)";
}
}
}
if (pool->quota_max_objects == 0)
tbl << "N/A";
else
- tbl << si_t(pool->quota_max_objects);
+ tbl << si_u_t(pool->quota_max_objects);
if (pool->quota_max_bytes == 0)
tbl << "N/A";
else
- tbl << si_t(pool->quota_max_bytes);
+ tbl << byte_u_t(pool->quota_max_bytes);
}
}
if (verbose) {
tbl.define_column("OBJECTS", TextTable::LEFT, TextTable::RIGHT);
}
- tbl << stringify(si_t(osd_sum.kb*1024))
- << stringify(si_t(osd_sum.kb_avail*1024))
- << stringify(si_t(osd_sum.kb_used*1024));
+ tbl << stringify(byte_u_t(osd_sum.kb*1024))
+ << stringify(byte_u_t(osd_sum.kb_avail*1024))
+ << stringify(byte_u_t(osd_sum.kb_used*1024));
float used = 0.0;
if (osd_sum.kb > 0) {
used = ((float)osd_sum.kb_used / osd_sum.kb);
}
tbl << percentify(used*100);
if (verbose) {
- tbl << stringify(si_t(pg_sum.stats.sum.num_objects));
+ tbl << stringify(si_u_t(pg_sum.stats.sum.num_objects));
}
tbl << TextTable::endrow;
f->dump_int("raw_bytes_used", sum.num_bytes * raw_used_rate * curr_object_copies_rate);
}
} else {
- tbl << stringify(si_t(sum.num_bytes));
+ tbl << stringify(byte_u_t(sum.num_bytes));
tbl << percentify(used*100);
- tbl << si_t(avail / raw_used_rate);
+ tbl << byte_u_t(avail / raw_used_rate);
tbl << sum.num_objects;
if (verbose) {
- tbl << stringify(si_t(sum.num_objects_dirty))
- << stringify(si_t(sum.num_rd))
- << stringify(si_t(sum.num_wr))
- << stringify(si_t(sum.num_bytes * raw_used_rate * curr_object_copies_rate));
+ tbl << stringify(si_u_t(sum.num_objects_dirty))
+ << stringify(byte_u_t(sum.num_rd))
+ << stringify(byte_u_t(sum.num_wr))
+ << stringify(byte_u_t(sum.num_bytes * raw_used_rate * curr_object_copies_rate));
}
}
}
p != osd_stat.end();
++p) {
tab << p->first
- << si_t(p->second.kb_used << 10)
- << si_t(p->second.kb_avail << 10)
- << si_t(p->second.kb << 10)
+ << byte_u_t(p->second.kb_used << 10)
+ << byte_u_t(p->second.kb_avail << 10)
+ << byte_u_t(p->second.kb << 10)
<< p->second.hb_peers
<< get_num_pg_by_osd(p->first)
<< get_num_primary_pg_by_osd(p->first)
}
tab << "sum"
- << si_t(osd_sum.kb_used << 10)
- << si_t(osd_sum.kb_avail << 10)
- << si_t(osd_sum.kb << 10)
+ << byte_u_t(osd_sum.kb_used << 10)
+ << byte_u_t(osd_sum.kb_avail << 10)
+ << byte_u_t(osd_sum.kb << 10)
<< TextTable::endrow;
ss << tab;
tab.define_column("TOTAL", TextTable::LEFT, TextTable::RIGHT);
tab << "sum"
- << si_t(osd_sum.kb_used << 10)
- << si_t(osd_sum.kb_avail << 10)
- << si_t(osd_sum.kb << 10)
+ << byte_u_t(osd_sum.kb_used << 10)
+ << byte_u_t(osd_sum.kb_avail << 10)
+ << byte_u_t(osd_sum.kb << 10)
<< TextTable::endrow;
ss << tab;
p.second.target_max_objects * (ratio / 1000000.0)) {
ostringstream ss;
ss << "cache pool '" << name << "' with "
- << si_t(st.stats.sum.num_objects)
+ << si_u_t(st.stats.sum.num_objects)
<< " objects at/near target max "
- << si_t(p.second.target_max_objects) << " objects";
+ << si_u_t(p.second.target_max_objects) << " objects";
detail.push_back(ss.str());
nearfull = true;
}
p.second.target_max_bytes * (ratio / 1000000.0)) {
ostringstream ss;
ss << "cache pool '" << name
- << "' with " << si_t(st.stats.sum.num_bytes)
- << "B at/near target max "
- << si_t(p.second.target_max_bytes) << "B";
+ << "' with " << byte_u_t(st.stats.sum.num_bytes)
+ << " at/near target max "
+ << byte_u_t(p.second.target_max_bytes);
detail.push_back(ss.str());
nearfull = true;
}
} else if (crit_threshold > 0 &&
sum.num_bytes >= pool.quota_max_bytes*crit_threshold) {
ss << "pool '" << pool_name
- << "' has " << si_t(sum.num_bytes) << " bytes"
- << " (max " << si_t(pool.quota_max_bytes) << ")";
+ << "' has " << byte_u_t(sum.num_bytes)
+ << " (max " << byte_u_t(pool.quota_max_bytes) << ")";
full_detail.push_back(ss.str());
full = true;
} else if (warn_threshold > 0 &&
sum.num_bytes >= pool.quota_max_bytes*warn_threshold) {
ss << "pool '" << pool_name
- << "' has " << si_t(sum.num_bytes) << " bytes"
- << " (max " << si_t(pool.quota_max_bytes) << ")";
+ << "' has " << byte_u_t(sum.num_bytes)
+ << " (max " << byte_u_t(pool.quota_max_bytes) << ")";
nearfull_detail.push_back(ss.str());
nearfull = true;
}
if (detail) {
ostringstream ss;
ss << "cache pool '" << name << "' with "
- << si_t(st.stats.sum.num_objects)
+ << si_u_t(st.stats.sum.num_objects)
<< " objects at/near target max "
- << si_t(p->second.target_max_objects) << " objects";
+ << si_u_t(p->second.target_max_objects) << " objects";
detail->push_back(make_pair(HEALTH_WARN, ss.str()));
}
}
if (detail) {
ostringstream ss;
ss << "cache pool '" << name
- << "' with " << si_t(st.stats.sum.num_bytes)
+ << "' with " << byte_u_t(st.stats.sum.num_bytes)
<< "B at/near target max "
- << si_t(p->second.target_max_bytes) << "B";
+ << byte_u_t(p->second.target_max_bytes) << "B";
detail->push_back(make_pair(HEALTH_WARN, ss.str()));
}
}
} else if (crit_threshold > 0 &&
sum.num_bytes >= pool.quota_max_bytes*crit_threshold) {
ss << "pool '" << pool_name
- << "' has " << si_t(sum.num_bytes) << " bytes"
- << " (max " << si_t(pool.quota_max_bytes) << ")";
+ << "' has " << byte_u_t(sum.num_bytes) << " bytes"
+ << " (max " << byte_u_t(pool.quota_max_bytes) << ")";
status = HEALTH_ERR;
} else if (warn_threshold > 0 &&
sum.num_bytes >= pool.quota_max_bytes*warn_threshold) {
ss << "pool '" << pool_name
- << "' has " << si_t(sum.num_bytes) << " bytes"
- << " (max " << si_t(pool.quota_max_bytes) << ")";
+ << "' has " << byte_u_t(sum.num_bytes) << " bytes"
+ << " (max " << byte_u_t(pool.quota_max_bytes) << ")";
status = HEALTH_WARN;
}
if (status != HEALTH_OK) {
pcb.add_time_avg(l_paxos_refresh_latency, "refresh_latency", "Refresh latency");
pcb.add_u64_counter(l_paxos_begin, "begin", "Started and handled begins");
pcb.add_u64_avg(l_paxos_begin_keys, "begin_keys", "Keys in transaction on begin");
- pcb.add_u64_avg(l_paxos_begin_bytes, "begin_bytes", "Data in transaction on begin");
+ pcb.add_u64_avg(l_paxos_begin_bytes, "begin_bytes", "Data in transaction on begin", NULL, 0, unit_t(BYTES));
pcb.add_time_avg(l_paxos_begin_latency, "begin_latency", "Latency of begin operation");
pcb.add_u64_counter(l_paxos_commit, "commit",
"Commits", "cmt");
pcb.add_u64_avg(l_paxos_commit_keys, "commit_keys", "Keys in transaction on commit");
- pcb.add_u64_avg(l_paxos_commit_bytes, "commit_bytes", "Data in transaction on commit");
+ pcb.add_u64_avg(l_paxos_commit_bytes, "commit_bytes", "Data in transaction on commit", NULL, 0, unit_t(BYTES));
pcb.add_time_avg(l_paxos_commit_latency, "commit_latency",
"Commit latency", "clat");
pcb.add_u64_counter(l_paxos_collect, "collect", "Peon collects");
pcb.add_u64_avg(l_paxos_collect_keys, "collect_keys", "Keys in transaction on peon collect");
- pcb.add_u64_avg(l_paxos_collect_bytes, "collect_bytes", "Data in transaction on peon collect");
+ pcb.add_u64_avg(l_paxos_collect_bytes, "collect_bytes", "Data in transaction on peon collect", NULL, 0, unit_t(BYTES));
pcb.add_time_avg(l_paxos_collect_latency, "collect_latency", "Peon collect latency");
pcb.add_u64_counter(l_paxos_collect_uncommitted, "collect_uncommitted", "Uncommitted values in started and handled collects");
pcb.add_u64_counter(l_paxos_collect_timeout, "collect_timeout", "Collect timeouts");
pcb.add_u64_counter(l_paxos_lease_timeout, "lease_timeout", "Lease timeouts");
pcb.add_u64_counter(l_paxos_store_state, "store_state", "Store a shared state on disk");
pcb.add_u64_avg(l_paxos_store_state_keys, "store_state_keys", "Keys in transaction in stored state");
- pcb.add_u64_avg(l_paxos_store_state_bytes, "store_state_bytes", "Data in transaction in stored state");
+ pcb.add_u64_avg(l_paxos_store_state_bytes, "store_state_bytes", "Data in transaction in stored state", NULL, 0, unit_t(BYTES));
pcb.add_time_avg(l_paxos_store_state_latency, "store_state_latency", "Storing state latency");
pcb.add_u64_counter(l_paxos_share_state, "share_state", "Sharings of state");
pcb.add_u64_avg(l_paxos_share_state_keys, "share_state_keys", "Keys in shared state");
- pcb.add_u64_avg(l_paxos_share_state_bytes, "share_state_bytes", "Data in shared state");
+ pcb.add_u64_avg(l_paxos_share_state_bytes, "share_state_bytes", "Data in shared state", NULL, 0, unit_t(BYTES));
pcb.add_u64_counter(l_paxos_new_pn, "new_pn", "New proposal number queries");
pcb.add_time_avg(l_paxos_new_pn_latency, "new_pn_latency", "New proposal number getting latency");
logger = pcb.create_perf_counters();
protected:
FSMap &get_pending_fsmap_writeable() { assert(is_leader()); return pending_fsmap; }
- /* get_working_fsmap returns the "relevant" version of the fsmap (see MDSMonitor.cc history)
- * used depending in helper methods of MDSMonitor.cc.
- *
- * This is technically evil and will be removed in the future.
- *
- * See discussion: https://github.com/ceph/ceph/pull/21458#discussion_r182081366
- */
- const FSMap &get_working_fsmap() const { return is_leader() ? pending_fsmap : fsmap; }
-
FSMap &create_pending() {
assert(is_leader());
pending_fsmap = fsmap;
inline ostream& operator<<(ostream& out, const MonSession& s)
{
- out << "MonSession(" << s.inst << " is "
- << (s.closed ? "closed" : "open");
- out << " " << s.caps << ")";
+ out << "MonSession(" << s.inst << " is " << (s.closed ? "closed" : "open")
+ << " " << s.caps << ", features 0x" << std::hex << s.con_features << std::dec
+ << " (" << ceph_release_name(ceph_release_from_features(s.con_features))
+ << "))";
return out;
}
plb.add_u64_counter(l_msgr_recv_messages, "msgr_recv_messages", "Network received messages");
plb.add_u64_counter(l_msgr_send_messages, "msgr_send_messages", "Network sent messages");
- plb.add_u64_counter(l_msgr_recv_bytes, "msgr_recv_bytes", "Network received bytes");
- plb.add_u64_counter(l_msgr_send_bytes, "msgr_send_bytes", "Network sent bytes");
+ plb.add_u64_counter(l_msgr_recv_bytes, "msgr_recv_bytes", "Network received bytes", NULL, 0, unit_t(BYTES));
+ plb.add_u64_counter(l_msgr_send_bytes, "msgr_send_bytes", "Network sent bytes", NULL, 0, unit_t(BYTES));
plb.add_u64_counter(l_msgr_active_connections, "msgr_active_connections", "Active connection number");
plb.add_u64_counter(l_msgr_created_connections, "msgr_created_connections", "Created connection number");
plb.add_u64_counter(l_dpdk_qp_tx_packets, "dpdk_send_packets", "DPDK sendd packets");
plb.add_u64_counter(l_dpdk_qp_rx_bad_checksum_errors, "dpdk_receive_bad_checksum_errors", "DPDK received bad checksum packets");
plb.add_u64_counter(l_dpdk_qp_rx_no_memory_errors, "dpdk_receive_no_memory_errors", "DPDK received no memory packets");
- plb.add_u64_counter(l_dpdk_qp_rx_bytes, "dpdk_receive_bytes", "DPDK received bytes");
- plb.add_u64_counter(l_dpdk_qp_tx_bytes, "dpdk_send_bytes", "DPDK sendd bytes");
+ plb.add_u64_counter(l_dpdk_qp_rx_bytes, "dpdk_receive_bytes", "DPDK received bytes", NULL, 0, unit_t(BYTES));
+ plb.add_u64_counter(l_dpdk_qp_tx_bytes, "dpdk_send_bytes", "DPDK sendd bytes", NULL, 0, unit_t(BYTES));
plb.add_u64_counter(l_dpdk_qp_rx_last_bunch, "dpdk_receive_last_bunch", "DPDK last received bunch");
plb.add_u64_counter(l_dpdk_qp_tx_last_bunch, "dpdk_send_last_bunch", "DPDK last send bunch");
plb.add_u64_counter(l_dpdk_qp_rx_fragments, "dpdk_receive_fragments", "DPDK received total fragments");
plb.add_u64_counter(l_dpdk_qp_tx_fragments, "dpdk_send_fragments", "DPDK sendd total fragments");
plb.add_u64_counter(l_dpdk_qp_rx_copy_ops, "dpdk_receive_copy_ops", "DPDK received copy operations");
plb.add_u64_counter(l_dpdk_qp_tx_copy_ops, "dpdk_send_copy_ops", "DPDK sendd copy operations");
- plb.add_u64_counter(l_dpdk_qp_rx_copy_bytes, "dpdk_receive_copy_bytes", "DPDK received copy bytes");
- plb.add_u64_counter(l_dpdk_qp_tx_copy_bytes, "dpdk_send_copy_bytes", "DPDK send copy bytes");
+ plb.add_u64_counter(l_dpdk_qp_rx_copy_bytes, "dpdk_receive_copy_bytes", "DPDK received copy bytes", NULL, 0, unit_t(BYTES));
+ plb.add_u64_counter(l_dpdk_qp_tx_copy_bytes, "dpdk_send_copy_bytes", "DPDK send copy bytes", NULL, 0, unit_t(BYTES));
plb.add_u64_counter(l_dpdk_qp_rx_linearize_ops, "dpdk_receive_linearize_ops", "DPDK received linearize operations");
plb.add_u64_counter(l_dpdk_qp_tx_linearize_ops, "dpdk_send_linearize_ops", "DPDK send linearize operations");
plb.add_u64_counter(l_dpdk_qp_tx_queue_length, "dpdk_send_queue_length", "DPDK send queue length");
plb.add_u64_counter(l_msgr_rdma_rx_no_registered_mem, "rx_no_registered_mem", "The count of no registered buffer when receiving");
plb.add_u64_counter(l_msgr_rdma_tx_chunks, "tx_chunks", "The number of tx chunks transmitted");
- plb.add_u64_counter(l_msgr_rdma_tx_bytes, "tx_bytes", "The bytes of tx chunks transmitted");
+ plb.add_u64_counter(l_msgr_rdma_tx_bytes, "tx_bytes", "The bytes of tx chunks transmitted", NULL, 0, unit_t(BYTES));
plb.add_u64_counter(l_msgr_rdma_rx_chunks, "rx_chunks", "The number of rx chunks transmitted");
- plb.add_u64_counter(l_msgr_rdma_rx_bytes, "rx_bytes", "The bytes of rx chunks transmitted");
+ plb.add_u64_counter(l_msgr_rdma_rx_bytes, "rx_bytes", "The bytes of rx chunks transmitted", NULL, 0, unit_t(BYTES));
plb.add_u64_counter(l_msgr_rdma_pending_sent_conns, "pending_sent_conns", "The count of pending sent conns");
perf_logger = plb.create_perf_counters();
PerfCountersBuilder b(cct, "bluefs",
l_bluefs_first, l_bluefs_last);
b.add_u64_counter(l_bluefs_gift_bytes, "gift_bytes",
- "Bytes gifted from BlueStore");
+ "Bytes gifted from BlueStore", NULL, 0, unit_t(BYTES));
b.add_u64_counter(l_bluefs_reclaim_bytes, "reclaim_bytes",
- "Bytes reclaimed by BlueStore");
+ "Bytes reclaimed by BlueStore", NULL, 0, unit_t(BYTES));
b.add_u64(l_bluefs_db_total_bytes, "db_total_bytes",
"Total bytes (main db device)",
- "b", PerfCountersBuilder::PRIO_USEFUL);
+ "b", PerfCountersBuilder::PRIO_USEFUL, unit_t(BYTES));
b.add_u64(l_bluefs_db_used_bytes, "db_used_bytes",
"Used bytes (main db device)",
- "u", PerfCountersBuilder::PRIO_USEFUL);
+ "u", PerfCountersBuilder::PRIO_USEFUL, unit_t(BYTES));
b.add_u64(l_bluefs_wal_total_bytes, "wal_total_bytes",
"Total bytes (wal device)",
- "walb", PerfCountersBuilder::PRIO_USEFUL);
+ "walb", PerfCountersBuilder::PRIO_USEFUL, unit_t(BYTES));
b.add_u64(l_bluefs_wal_used_bytes, "wal_used_bytes",
"Used bytes (wal device)",
- "walu", PerfCountersBuilder::PRIO_USEFUL);
+ "walu", PerfCountersBuilder::PRIO_USEFUL, unit_t(BYTES));
b.add_u64(l_bluefs_slow_total_bytes, "slow_total_bytes",
"Total bytes (slow device)",
- "slob", PerfCountersBuilder::PRIO_USEFUL);
+ "slob", PerfCountersBuilder::PRIO_USEFUL, unit_t(BYTES));
b.add_u64(l_bluefs_slow_used_bytes, "slow_used_bytes",
"Used bytes (slow device)",
- "slou", PerfCountersBuilder::PRIO_USEFUL);
+ "slou", PerfCountersBuilder::PRIO_USEFUL, unit_t(BYTES));
b.add_u64(l_bluefs_num_files, "num_files", "File count",
"f", PerfCountersBuilder::PRIO_USEFUL);
b.add_u64(l_bluefs_log_bytes, "log_bytes", "Size of the metadata log",
- "jlen", PerfCountersBuilder::PRIO_INTERESTING);
+ "jlen", PerfCountersBuilder::PRIO_INTERESTING, unit_t(BYTES));
b.add_u64_counter(l_bluefs_log_compactions, "log_compactions",
"Compactions of the metadata log");
b.add_u64_counter(l_bluefs_logged_bytes, "logged_bytes",
"Bytes written to the metadata log", "j",
- PerfCountersBuilder::PRIO_CRITICAL);
+ PerfCountersBuilder::PRIO_CRITICAL, unit_t(BYTES));
b.add_u64_counter(l_bluefs_files_written_wal, "files_written_wal",
"Files written to WAL");
b.add_u64_counter(l_bluefs_files_written_sst, "files_written_sst",
PerfCountersBuilder::PRIO_CRITICAL);
b.add_u64_counter(l_bluefs_bytes_written_sst, "bytes_written_sst",
"Bytes written to SSTs", "sst",
- PerfCountersBuilder::PRIO_CRITICAL);
+ PerfCountersBuilder::PRIO_CRITICAL, unit_t(BYTES));
logger = b.create_perf_counters();
cct->get_perfcounters_collection()->add(logger);
}
return r;
}
dout(1) << __func__ << " bdev " << id << " path " << path
- << " size " << pretty_si_t(b->get_size()) << "B" << dendl;
+ << " size " << byte_u_t(b->get_size()) << dendl;
bdev[id] = b;
ioc[id] = new IOContext(cct, NULL);
return 0;
(block_total[id] - (*usage)[id].first) * 100 / block_total[id];
dout(10) << __func__ << " bdev " << id
<< " free " << (*usage)[id].first
- << " (" << pretty_si_t((*usage)[id].first) << "B)"
+ << " (" << byte_u_t((*usage)[id].first) << ")"
<< " / " << (*usage)[id].second
- << " (" << pretty_si_t((*usage)[id].second) << "B)"
+ << " (" << byte_u_t((*usage)[id].second) << ")"
<< ", used " << used << "%"
<< dendl;
}
target_buffer = min(target_bytes - target_meta, target_buffer);
if (current <= target_bytes) {
- dout(10) << __func__
- << " shard target " << pretty_si_t(target_bytes)
+ dout(30) << __func__
+ << " shard target " << byte_u_t(target_bytes)
<< " meta/data ratios " << target_meta_ratio
<< " + " << target_data_ratio << " ("
- << pretty_si_t(target_meta) << " + "
- << pretty_si_t(target_buffer) << "), "
- << " current " << pretty_si_t(current) << " ("
- << pretty_si_t(current_meta) << " + "
- << pretty_si_t(current_buffer) << ")"
+ << byte_u_t(target_meta) << " + "
+ << byte_u_t(target_buffer) << "), "
+ << " current " << byte_u_t(current) << " ("
+ << byte_u_t(current_meta) << " + "
+ << byte_u_t(current_buffer) << ")"
<< dendl;
return;
}
uint64_t max_meta = current_meta - free_meta;
uint64_t max_onodes = max_meta / bytes_per_onode;
- dout(10) << __func__
- << " shard target " << pretty_si_t(target_bytes)
+ dout(20) << __func__
+ << " shard target " << byte_u_t(target_bytes)
<< " ratio " << target_meta_ratio << " ("
- << pretty_si_t(target_meta) << " + "
- << pretty_si_t(target_buffer) << "), "
- << " current " << pretty_si_t(current) << " ("
- << pretty_si_t(current_meta) << " + "
- << pretty_si_t(current_buffer) << "),"
- << " need_to_free " << pretty_si_t(need_to_free) << " ("
- << pretty_si_t(free_meta) << " + "
- << pretty_si_t(free_buffer) << ")"
+ << byte_u_t(target_meta) << " + "
+ << byte_u_t(target_buffer) << "), "
+ << " current " << byte_u_t(current) << " ("
+ << byte_u_t(current_meta) << " + "
+ << byte_u_t(current_buffer) << "),"
+ << " need_to_free " << byte_u_t(need_to_free) << " ("
+ << byte_u_t(free_meta) << " + "
+ << byte_u_t(free_buffer) << ")"
<< " -> max " << max_onodes << " onodes + "
<< max_buffer << " buffer"
<< dendl;
}
if (evicted > 0) {
- dout(20) << __func__ << " evicted " << prettybyte_t(evicted)
+ dout(20) << __func__ << " evicted " << byte_u_t(evicted)
<< " from warm_in list, done evicting warm_in buffers"
<< dendl;
}
}
if (evicted > 0) {
- dout(20) << __func__ << " evicted " << prettybyte_t(evicted)
+ dout(20) << __func__ << " evicted " << byte_u_t(evicted)
<< " from hot list, done evicting hot buffers"
<< dendl;
}
BlueStore::SharedBlob::~SharedBlob()
{
- if (get_cache()) { // the dummy instances have a nullptr
- std::lock_guard<std::recursive_mutex> l(get_cache()->lock);
- bc._clear(get_cache());
- get_cache()->rm_blob();
- }
if (loaded && persistent) {
delete persistent;
}
ldout(coll->store->cct, 20) << __func__ << " " << this
<< " removing self from set " << get_parent()
<< dendl;
- if (get_parent()) {
- get_parent()->remove(this);
+ again:
+ auto coll_snap = coll;
+ if (coll_snap) {
+ std::lock_guard<std::recursive_mutex> l(coll_snap->cache->lock);
+ if (coll_snap != coll) {
+ goto again;
+ }
+ coll_snap->shared_blob_set.remove(this);
+
+ bc._clear(coll_snap->cache);
+ coll_snap->cache->rm_blob();
}
delete this;
}
<< std::dec << dendl;
}
+void BlueStore::_set_finisher_num()
+{
+ if (cct->_conf->bluestore_shard_finishers) {
+ if (cct->_conf->osd_op_num_shards) {
+ m_finisher_num = cct->_conf->osd_op_num_shards;
+ } else {
+ assert(bdev);
+ if (bdev->is_rotational()) {
+ m_finisher_num = cct->_conf->osd_op_num_shards_hdd;
+ } else {
+ m_finisher_num = cct->_conf->osd_op_num_shards_ssd;
+ }
+ }
+ }
+ assert(m_finisher_num != 0);
+}
+
int BlueStore::_set_cache_sizes()
{
assert(bdev);
b.add_u64_counter(l_bluestore_compress_rejected_count, "compress_rejected_count",
"Sum for compress ops rejected due to low net gain of space");
b.add_u64_counter(l_bluestore_write_pad_bytes, "write_pad_bytes",
- "Sum for write-op padded bytes");
+ "Sum for write-op padded bytes", NULL, 0, unit_t(BYTES));
b.add_u64_counter(l_bluestore_deferred_write_ops, "deferred_write_ops",
"Sum for deferred write op");
b.add_u64_counter(l_bluestore_deferred_write_bytes, "deferred_write_bytes",
- "Sum for deferred write bytes", "def");
+ "Sum for deferred write bytes", "def", 0, unit_t(BYTES));
b.add_u64_counter(l_bluestore_write_penalty_read_ops, "write_penalty_read_ops",
"Sum for write penalty read ops");
b.add_u64(l_bluestore_allocated, "bluestore_allocated",
b.add_u64(l_bluestore_buffers, "bluestore_buffers",
"Number of buffers in cache");
b.add_u64(l_bluestore_buffer_bytes, "bluestore_buffer_bytes",
- "Number of buffer bytes in cache");
+ "Number of buffer bytes in cache", NULL, 0, unit_t(BYTES));
b.add_u64(l_bluestore_buffer_hit_bytes, "bluestore_buffer_hit_bytes",
- "Sum for bytes of read hit in the cache");
+ "Sum for bytes of read hit in the cache", NULL, 0, unit_t(BYTES));
b.add_u64(l_bluestore_buffer_miss_bytes, "bluestore_buffer_miss_bytes",
- "Sum for bytes of read missed in the cache");
+ "Sum for bytes of read missed in the cache", NULL, 0, unit_t(BYTES));
b.add_u64_counter(l_bluestore_write_big, "bluestore_write_big",
"Large aligned writes into fresh blobs");
b.add_u64_counter(l_bluestore_write_big_bytes, "bluestore_write_big_bytes",
- "Large aligned writes into fresh blobs (bytes)");
+ "Large aligned writes into fresh blobs (bytes)", NULL, 0, unit_t(BYTES));
b.add_u64_counter(l_bluestore_write_big_blobs, "bluestore_write_big_blobs",
"Large aligned writes into fresh blobs (blobs)");
b.add_u64_counter(l_bluestore_write_small, "bluestore_write_small",
"Small writes into existing or sparse small blobs");
b.add_u64_counter(l_bluestore_write_small_bytes, "bluestore_write_small_bytes",
- "Small writes into existing or sparse small blobs (bytes)");
+ "Small writes into existing or sparse small blobs (bytes)", NULL, 0, unit_t(BYTES));
b.add_u64_counter(l_bluestore_write_small_unused,
"bluestore_write_small_unused",
"Small writes into unused portion of existing blob");
bytes += length;
}
fm->enumerate_reset();
- dout(1) << __func__ << " loaded " << pretty_si_t(bytes)
+ dout(1) << __func__ << " loaded " << byte_u_t(bytes)
<< " in " << num << " extents"
<< dendl;
float bluefs_ratio = (float)bluefs_free / (float)total_free;
dout(10) << __func__
- << " bluefs " << pretty_si_t(bluefs_free)
+ << " bluefs " << byte_u_t(bluefs_free)
<< " free (" << bluefs_free_ratio
- << ") bluestore " << pretty_si_t(my_free)
+ << ") bluestore " << byte_u_t(my_free)
<< " free (" << my_free_ratio
<< "), bluefs_ratio " << bluefs_ratio
<< dendl;
gift = cct->_conf->bluestore_bluefs_gift_ratio * total_free;
dout(10) << __func__ << " bluefs_ratio " << bluefs_ratio
<< " < min_ratio " << cct->_conf->bluestore_bluefs_min_ratio
- << ", should gift " << pretty_si_t(gift) << dendl;
+ << ", should gift " << byte_u_t(gift) << dendl;
} else if (bluefs_ratio > cct->_conf->bluestore_bluefs_max_ratio) {
reclaim = cct->_conf->bluestore_bluefs_reclaim_ratio * total_free;
if (bluefs_total - reclaim < cct->_conf->bluestore_bluefs_min)
reclaim = bluefs_total - cct->_conf->bluestore_bluefs_min;
dout(10) << __func__ << " bluefs_ratio " << bluefs_ratio
<< " > max_ratio " << cct->_conf->bluestore_bluefs_max_ratio
- << ", should reclaim " << pretty_si_t(reclaim) << dendl;
+ << ", should reclaim " << byte_u_t(reclaim) << dendl;
}
// don't take over too much of the freespace
uint64_t g = cct->_conf->bluestore_bluefs_min - bluefs_total;
dout(10) << __func__ << " bluefs_total " << bluefs_total
<< " < min " << cct->_conf->bluestore_bluefs_min
- << ", should gift " << pretty_si_t(g) << dendl;
+ << ", should gift " << byte_u_t(g) << dendl;
if (g > gift)
gift = g;
reclaim = 0;
uint64_t g = min_free - bluefs_free;
dout(10) << __func__ << " bluefs_free " << bluefs_total
<< " < min " << min_free
- << ", should gift " << pretty_si_t(g) << dendl;
+ << ", should gift " << byte_u_t(g) << dendl;
if (g > gift)
gift = g;
reclaim = 0;
// hard cap to fit into 32 bits
gift = MIN(gift, 1ull<<31);
dout(10) << __func__ << " gifting " << gift
- << " (" << pretty_si_t(gift) << ")" << dendl;
+ << " (" << byte_u_t(gift) << ")" << dendl;
// fixme: just do one allocation to start...
int r = alloc->reserve(gift);
// hard cap to fit into 32 bits
reclaim = MIN(reclaim, 1ull<<31);
dout(10) << __func__ << " reclaiming " << reclaim
- << " (" << pretty_si_t(reclaim) << ")" << dendl;
+ << " (" << byte_u_t(reclaim) << ")" << dendl;
while (reclaim > 0) {
// NOTE: this will block and do IO.
}
}
dout(1) << __func__ << " resized " << name << " file to "
- << pretty_si_t(size) << "B" << dendl;
+ << byte_u_t(size) << dendl;
}
VOID_TEMP_FAILURE_RETRY(::close(fd));
} else {
_set_compression();
_set_blob_size();
+ _set_finisher_num();
+
return 0;
}
}
return;
case TransContext::STATE_KV_SUBMITTED:
- txc->log_state_latency(logger, l_bluestore_state_kv_committing_lat);
- txc->state = TransContext::STATE_KV_DONE;
_txc_committed_kv(txc);
// ** fall-thru **
txc->onreadable = NULL;
}
- if (!txc->oncommits.empty()) {
- finishers[n]->queue(txc->oncommits);
+ {
+ std::lock_guard<std::mutex> l(txc->osr->qlock);
+ txc->state = TransContext::STATE_KV_DONE;
+ if (!txc->oncommits.empty()) {
+ finishers[n]->queue(txc->oncommits);
+ }
}
+ txc->log_state_latency(logger, l_bluestore_state_kv_committing_lat);
}
void BlueStore::_txc_finish(TransContext *txc)
{
dout(10) << __func__ << dendl;
- if (cct->_conf->bluestore_shard_finishers) {
- if (cct->_conf->osd_op_num_shards) {
- m_finisher_num = cct->_conf->osd_op_num_shards;
- } else {
- assert(bdev);
- if (bdev->is_rotational()) {
- m_finisher_num = cct->_conf->osd_op_num_shards_hdd;
- } else {
- m_finisher_num = cct->_conf->osd_op_num_shards_ssd;
- }
- }
- }
-
- assert(m_finisher_num != 0);
-
for (int i = 0; i < m_finisher_num; ++i) {
ostringstream oss;
oss << "finisher-" << i;
{
auto& extents_to_collect = gc.get_extents_to_collect();
+ bool dirty_range_updated = false;
WriteContext wctx_gc;
wctx_gc.fork(wctx); // make a clone for garbage collection
if (*dirty_start > it->offset) {
*dirty_start = it->offset;
+ dirty_range_updated = true;
}
if (*dirty_end < it->offset + it->length) {
*dirty_end = it->offset + it->length;
+ dirty_range_updated = true;
}
}
+ if (dirty_range_updated) {
+ o->extent_map.fault_range(db, *dirty_start, *dirty_end);
+ }
dout(30) << __func__ << " alloc write" << dendl;
int r = _do_alloc_write(txc, c, o, &wctx_gc);
<< dendl;
goto out;
}
+ dout(20)<<__func__<<" gc range is " << std::hex << dirty_start
+ << "~" << dirty_end - dirty_start << std::dec << dendl;
}
}
-
o->extent_map.compress_extent_map(dirty_start, dirty_end - dirty_start);
o->extent_map.dirty_range(dirty_start, dirty_end - dirty_start);
void _close_fsid();
void _set_alloc_sizes();
void _set_blob_size();
+ void _set_finisher_num();
int _open_bdev(bool create);
void _close_bdev();
dout(1) << __func__
<< " size " << size
<< " (0x" << std::hex << size << std::dec << ", "
- << pretty_si_t(size) << "B)"
+ << byte_u_t(size) << ")"
<< " block_size " << block_size
- << " (" << pretty_si_t(block_size) << "B)"
+ << " (" << byte_u_t(block_size) << ")"
<< " " << (rotational ? "rotational" : "non-rotational")
<< dendl;
return 0;
// round size down to an even block
size &= ~(block_size - 1);
- dout(1) << __func__ << " size " << size << " (" << pretty_si_t(size) << "B)"
- << " block_size " << block_size << " (" << pretty_si_t(block_size)
- << "B)" << dendl;
+ dout(1) << __func__ << " size " << size << " (" << byte_u_t(size) << ")"
+ << " block_size " << block_size << " (" << byte_u_t(block_size)
+ << ")" << dendl;
return 0;
}
{
dout(1) << __func__ << dendl;
+ delete queue_t;
+ queue_t = nullptr;
name.clear();
driver->remove_device(this);
dout(1) << __func__
<< " size " << size
- << " (" << pretty_si_t(size) << "B)"
+ << " (" << byte_u_t(size) << ")"
<< " block_size " << block_size
- << " (" << pretty_si_t(block_size) << "B)"
+ << " (" << byte_u_t(block_size) << ")"
<< dendl;
return 0;
uint64_t expected_num_objs ///< [in] expected number of objects this collection has
) { ceph_abort(); return 0; }
- virtual int apply_layout_settings() { ceph_abort(); return 0; }
+ virtual int apply_layout_settings(int target_level) { ceph_abort(); return 0; }
/// Read index-wide settings (should be called after construction)
virtual int read_settings() { return 0; }
return res;
}
-int FileStore::apply_layout_settings(const coll_t &cid)
+int FileStore::apply_layout_settings(const coll_t &cid, int target_level)
{
- dout(20) << __FUNC__ << ": " << cid << dendl;
+ dout(20) << __FUNC__ << ": " << cid << " target level: "
+ << target_level << dendl;
Index index;
int r = get_index(cid, &index);
if (r < 0) {
return r;
}
- return index->apply_layout_settings();
+ return index->apply_layout_settings(target_level);
}
void dump_stop();
void dump_transactions(vector<Transaction>& ls, uint64_t seq, OpSequencer *osr);
- virtual int apply_layout_settings(const coll_t &cid);
+ virtual int apply_layout_settings(const coll_t &cid, int target_level);
private:
void _inject_failure();
&mkdirred);
}
-int HashIndex::split_dirs(const vector<string> &path) {
- dout(20) << __func__ << " " << path << dendl;
+int HashIndex::split_dirs(const vector<string> &path, int target_level) {
+ dout(20) << __func__ << " " << path << " target level: "
+ << target_level << dendl;
subdir_info_s info;
int r = get_info(path, &info);
if (r < 0) {
return r;
}
- if (must_split(info)) {
+ if (must_split(info, target_level)) {
dout(1) << __func__ << " " << path << " has " << info.objs
- << " objects, starting split." << dendl;
+ << " objects, " << info.hash_level
+ << " level, starting split in pg " << coll() << "." << dendl;
r = initiate_split(path, info);
if (r < 0) {
dout(10) << "error initiating split on " << path << ": "
}
r = complete_split(path, info);
- dout(1) << __func__ << " " << path << " split completed."
+ dout(1) << __func__ << " " << path << " split completed in pg " << coll() << "."
<< dendl;
if (r < 0) {
dout(10) << "error completing split on " << path << ": "
it != subdirs.end(); ++it) {
vector<string> subdir_path(path);
subdir_path.push_back(*it);
- r = split_dirs(subdir_path);
+ r = split_dirs(subdir_path, target_level);
if (r < 0) {
return r;
}
return r;
}
-int HashIndex::apply_layout_settings() {
+int HashIndex::apply_layout_settings(int target_level) {
vector<string> path;
dout(10) << __func__ << " split multiple = " << split_multiplier
<< " merge threshold = " << merge_threshold
<< " split rand factor = " << cct->_conf->filestore_split_rand_factor
+ << " target level = " << target_level
<< dendl;
int r = write_settings();
if (r < 0)
return r;
- return split_dirs(path);
+ return split_dirs(path, target_level);
}
int HashIndex::_init() {
if (must_split(info)) {
dout(1) << __func__ << " " << path << " has " << info.objs
- << " objects, starting split." << dendl;
+ << " objects, starting split in pg " << coll() << "." << dendl;
int r = initiate_split(path, info);
if (r < 0)
return r;
r = complete_split(path, info);
- dout(1) << __func__ << " " << path << " split completed."
+ dout(1) << __func__ << " " << path << " split completed in pg " << coll() << "."
<< dendl;
return r;
} else {
info.subdirs == 0);
}
-bool HashIndex::must_split(const subdir_info_s &info) {
+bool HashIndex::must_split(const subdir_info_s &info, int target_level) {
+ // target_level is used for ceph-objectstore-tool to split dirs offline.
+ // if it is set (default is 0) and current hash level < target_level,
+ // this dir would be split no matter how many objects it has.
return (info.hash_level < (unsigned)MAX_HASH_LEVEL &&
- info.objs > ((unsigned)(abs(merge_threshold) * split_multiplier + settings.split_rand_factor) * 16));
-
+ ((target_level > 0 && info.hash_level < (unsigned)target_level) ||
+ (info.objs > ((unsigned)(abs(merge_threshold) * split_multiplier + settings.split_rand_factor) * 16))));
}
int HashIndex::initiate_merge(const vector<string> &path, subdir_info_s info) {
) override;
/// @see CollectionIndex
- int apply_layout_settings() override;
+ int apply_layout_settings(int target_level) override;
protected:
int _init() override;
/// Encapsulates logic for when to merge.
bool must_split(
- const subdir_info_s &info ///< [in] Info to check
+ const subdir_info_s &info, ///< [in] Info to check
+ int target_level = 0
); /// @return True if info must be split, False otherwise
/// Initiates merge
int recursive_create_path(vector<string>& path, int level);
/// split each dir below the given path
- int split_dirs(const vector<string> &path);
+ int split_dirs(const vector<string> &path, int target_level = 0);
int write_settings();
};
PerfCountersBuilder b(
cct, string("WBThrottle"),
l_wbthrottle_first, l_wbthrottle_last);
- b.add_u64(l_wbthrottle_bytes_dirtied, "bytes_dirtied", "Dirty data");
- b.add_u64(l_wbthrottle_bytes_wb, "bytes_wb", "Written data");
+ b.add_u64(l_wbthrottle_bytes_dirtied, "bytes_dirtied", "Dirty data", NULL, 0, unit_t(BYTES));
+ b.add_u64(l_wbthrottle_bytes_wb, "bytes_wb", "Written data", NULL, 0, unit_t(BYTES));
b.add_u64(l_wbthrottle_ios_dirtied, "ios_dirtied", "Dirty operations");
b.add_u64(l_wbthrottle_ios_wb, "ios_wb", "Written operations");
b.add_u64(l_wbthrottle_inodes_dirtied, "inodes_dirtied", "Entries waiting for write");
uint64_t attempts, obj, bytes;
promote_counter.sample_and_attenuate(&attempts, &obj, &bytes);
dout(10) << __func__ << " " << attempts << " attempts, promoted "
- << obj << " objects and " << pretty_si_t(bytes) << " bytes; target "
+ << obj << " objects and " << byte_u_t(bytes) << "; target "
<< target_obj_sec << " obj/sec or "
- << pretty_si_t(target_bytes_sec) << " bytes/sec"
+ << byte_u_t(target_bytes_sec) << "/sec"
<< dendl;
// calculate what the probability *should* be, given the targets
osd_plb.add_u64_counter(
l_osd_op_inb, "op_in_bytes",
"Client operations total write size",
- "wr", PerfCountersBuilder::PRIO_INTERESTING);
+ "wr", PerfCountersBuilder::PRIO_INTERESTING, unit_t(BYTES));
osd_plb.add_u64_counter(
l_osd_op_outb, "op_out_bytes",
"Client operations total read size",
- "rd", PerfCountersBuilder::PRIO_INTERESTING);
+ "rd", PerfCountersBuilder::PRIO_INTERESTING, unit_t(BYTES));
osd_plb.add_time_avg(
l_osd_op_lat, "op_latency",
"Latency of client operations (including queue time)",
osd_plb.add_u64_counter(
l_osd_op_r, "op_r", "Client read operations");
osd_plb.add_u64_counter(
- l_osd_op_r_outb, "op_r_out_bytes", "Client data read");
+ l_osd_op_r_outb, "op_r_out_bytes", "Client data read", NULL, PerfCountersBuilder::PRIO_USEFUL, unit_t(BYTES));
osd_plb.add_time_avg(
l_osd_op_r_lat, "op_r_latency",
"Latency of read operation (including queue time)");
"Client read-modify-write operations");
osd_plb.add_u64_counter(
l_osd_op_rw_inb, "op_rw_in_bytes",
- "Client read-modify-write operations write in");
+ "Client read-modify-write operations write in", NULL, PerfCountersBuilder::PRIO_USEFUL, unit_t(BYTES));
osd_plb.add_u64_counter(
l_osd_op_rw_outb,"op_rw_out_bytes",
- "Client read-modify-write operations read out ");
+ "Client read-modify-write operations read out ", NULL, PerfCountersBuilder::PRIO_USEFUL, unit_t(BYTES));
osd_plb.add_time_avg(
l_osd_op_rw_lat, "op_rw_latency",
"Latency of read-modify-write operation (including queue time)");
osd_plb.add_u64_counter(
l_osd_sop, "subop", "Suboperations");
osd_plb.add_u64_counter(
- l_osd_sop_inb, "subop_in_bytes", "Suboperations total size");
+ l_osd_sop_inb, "subop_in_bytes", "Suboperations total size", NULL, 0, unit_t(BYTES));
osd_plb.add_time_avg(l_osd_sop_lat, "subop_latency", "Suboperations latency");
osd_plb.add_u64_counter(l_osd_sop_w, "subop_w", "Replicated writes");
osd_plb.add_u64_counter(
- l_osd_sop_w_inb, "subop_w_in_bytes", "Replicated written data size");
+ l_osd_sop_w_inb, "subop_w_in_bytes", "Replicated written data size", NULL, 0, unit_t(BYTES));
osd_plb.add_time_avg(
l_osd_sop_w_lat, "subop_w_latency", "Replicated writes latency");
osd_plb.add_u64_counter(
osd_plb.add_u64_counter(
l_osd_sop_push, "subop_push", "Suboperations push messages");
osd_plb.add_u64_counter(
- l_osd_sop_push_inb, "subop_push_in_bytes", "Suboperations pushed size");
+ l_osd_sop_push_inb, "subop_push_in_bytes", "Suboperations pushed size", NULL, 0, unit_t(BYTES));
osd_plb.add_time_avg(
l_osd_sop_push_lat, "subop_push_latency", "Suboperations push latency");
osd_plb.add_u64_counter(l_osd_pull, "pull", "Pull requests sent");
osd_plb.add_u64_counter(l_osd_push, "push", "Push messages sent");
- osd_plb.add_u64_counter(l_osd_push_outb, "push_out_bytes", "Pushed size");
+ osd_plb.add_u64_counter(l_osd_push_outb, "push_out_bytes", "Pushed size", NULL, 0, unit_t(BYTES));
osd_plb.add_u64_counter(
l_osd_rop, "recovery_ops",
"rop", PerfCountersBuilder::PRIO_INTERESTING);
osd_plb.add_u64(l_osd_loadavg, "loadavg", "CPU load");
- osd_plb.add_u64(l_osd_buf, "buffer_bytes", "Total allocated buffer size");
- osd_plb.add_u64(l_osd_history_alloc_bytes, "history_alloc_Mbytes");
+ osd_plb.add_u64(l_osd_buf, "buffer_bytes", "Total allocated buffer size", NULL, 0, unit_t(BYTES));
+ osd_plb.add_u64(l_osd_history_alloc_bytes, "history_alloc_Mbytes", NULL, 0, unit_t(BYTES));
osd_plb.add_u64(l_osd_history_alloc_num, "history_alloc_num");
osd_plb.add_u64(
l_osd_cached_crc, "cached_crc", "Total number getting crc from crc_cache");
osd_plb.add_u64(
l_osd_stat_bytes, "stat_bytes", "OSD size", "size",
- PerfCountersBuilder::PRIO_USEFUL);
+ PerfCountersBuilder::PRIO_USEFUL, unit_t(BYTES));
osd_plb.add_u64(
l_osd_stat_bytes_used, "stat_bytes_used", "Used space", "used",
- PerfCountersBuilder::PRIO_USEFUL);
- osd_plb.add_u64(l_osd_stat_bytes_avail, "stat_bytes_avail", "Available space");
+ PerfCountersBuilder::PRIO_USEFUL, unit_t(BYTES));
+ osd_plb.add_u64(l_osd_stat_bytes_avail, "stat_bytes_avail", "Available space", NULL, 0, unit_t(BYTES));
osd_plb.add_u64_counter(
l_osd_copyfrom, "copyfrom", "Rados \"copy-from\" operations");
bool is_primary = osdmap->get_pg_acting_rank(pgid.pgid, whoami) == 0;
pending_creates_from_osd.emplace(pgid.pgid, is_primary);
}
- dout(5) << __func__ << " withhold creation of pg " << pgid
+ dout(1) << __func__ << " withhold creation of pg " << pgid
<< ": " << pg_map.size() << " >= "<< max_pgs_per_osd << dendl;
return true;
}
// having a sane value. If we allow any block size to be set things
// can still go sideways.
ss << "block 'size' values are capped at "
- << prettybyte_t(cct->_conf->osd_bench_max_block_size) << ". If you wish to use"
+ << byte_u_t(cct->_conf->osd_bench_max_block_size) << ". If you wish to use"
<< " a higher value, please adjust 'osd_bench_max_block_size'";
r = -EINVAL;
goto out;
bsize * duration * cct->_conf->osd_bench_small_size_max_iops;
if (count > max_count) {
ss << "'count' values greater than " << max_count
- << " for a block size of " << prettybyte_t(bsize) << ", assuming "
+ << " for a block size of " << byte_u_t(bsize) << ", assuming "
<< cct->_conf->osd_bench_small_size_max_iops << " IOPS,"
<< " for " << duration << " seconds,"
<< " can cause ill effects on osd. "
cct->_conf->osd_bench_large_size_max_throughput * duration;
if (count > max_count) {
ss << "'count' values greater than " << max_count
- << " for a block size of " << prettybyte_t(bsize) << ", assuming "
- << prettybyte_t(cct->_conf->osd_bench_large_size_max_throughput) << "/s,"
+ << " for a block size of " << byte_u_t(bsize) << ", assuming "
+ << byte_u_t(cct->_conf->osd_bench_large_size_max_throughput) << "/s,"
<< " for " << duration << " seconds,"
<< " can cause ill effects on osd. "
<< " Please adjust 'osd_bench_large_size_max_throughput'"
bsize = osize;
dout(1) << " bench count " << count
- << " bsize " << prettybyte_t(bsize) << dendl;
+ << " bsize " << byte_u_t(bsize) << dendl;
ObjectStore::Transaction cleanupt;
f->close_section();
f->flush(ss);
} else {
- ss << "bench: wrote " << prettybyte_t(count)
- << " in blocks of " << prettybyte_t(bsize) << " in "
- << (end-start) << " sec at " << prettybyte_t(rate) << "/sec";
+ ss << "bench: wrote " << byte_u_t(count)
+ << " in blocks of " << byte_u_t(bsize) << " in "
+ << (end-start) << " sec at " << byte_u_t(rate) << "/sec";
}
}
}
if (require_osd_release < CEPH_RELEASE_KRAKEN) {
f &= ~(CEPH_FEATURE_SERVER_KRAKEN |
- CEPH_FEATURE_MSG_ADDR2 |
- CEPH_FEATURE_CRUSH_TUNABLES5);
+ CEPH_FEATURE_MSG_ADDR2);
}
if (require_osd_release < CEPH_RELEASE_JEWEL) {
f &= ~(CEPH_FEATURE_SERVER_JEWEL |
- CEPH_FEATURE_NEW_OSDOP_ENCODING);
+ CEPH_FEATURE_NEW_OSDOP_ENCODING |
+ CEPH_FEATURE_CRUSH_TUNABLES5);
}
return f;
}
*tbl << ""
<< ""
<< "" << "TOTAL"
- << si_t(pgs->get_osd_sum().kb << 10)
- << si_t(pgs->get_osd_sum().kb_used << 10)
- << si_t(pgs->get_osd_sum().kb_avail << 10)
+ << byte_u_t(pgs->get_osd_sum().kb << 10)
+ << byte_u_t(pgs->get_osd_sum().kb_used << 10)
+ << byte_u_t(pgs->get_osd_sum().kb_avail << 10)
<< lowprecision_t(average_util)
<< ""
<< TextTable::endrow;
<< c
<< weightf_t(qi.weight)
<< weightf_t(reweight)
- << si_t(kb << 10)
- << si_t(kb_used << 10)
- << si_t(kb_avail << 10)
+ << byte_u_t(kb << 10)
+ << byte_u_t(kb_used << 10)
+ << byte_u_t(kb_avail << 10)
<< lowprecision_t(util)
<< lowprecision_t(var);
encode_features(0),
epoch(e), new_pool_max(-1), new_flags(-1), new_max_osd(-1),
have_crc(false), full_crc(0), inc_crc(0) {
- memset(&fsid, 0, sizeof(fsid));
}
explicit Incremental(bufferlist &bl) {
bufferlist::iterator p = bl.begin();
cached_up_osd_features(0),
crc_defined(false), crc(0),
crush(std::make_shared<CrushWrapper>()) {
- memset(&fsid, 0, sizeof(fsid));
}
// no copying
from, oinfo, omissing, ctx->handle);
if (found_missing && num_unfound_before != missing_loc.num_unfound())
publish_stats_to_osd();
+ // avoid doing this if the peer is empty. This is a bit of paranoia
+ // to avoid doing something rash if add_source_info() above
+ // incorrectly decided we found something new. (if the peer has
+ // last_update=0'0 that's impossible.)
if (found_missing &&
(get_osdmap()->get_features(CEPH_ENTITY_TYPE_OSD, NULL) &
- CEPH_FEATURE_OSD_ERASURE_CODES)) {
+ CEPH_FEATURE_OSD_ERASURE_CODES) &&
+ oinfo.last_update != eversion_t()) {
pg_info_t tinfo(oinfo);
tinfo.pgid.shard = pg_whoami.shard;
(*(ctx->info_map))[from.osd].push_back(
if (p->second.is_delete()) {
ldout(pg->cct, 10) << __func__ << " " << soid
<< " delete, ignoring source" << dendl;
- found_missing = true;
continue;
}
if (oinfo.last_update < need) {
osd->send_message_osd_cluster(mlog, con.get());
}
+void PG::fulfill_query(const MQuery& query, RecoveryCtx *rctx)
+{
+ if (query.query.type == pg_query_t::INFO) {
+ pair<pg_shard_t, pg_info_t> notify_info;
+ update_history(query.query.history);
+ fulfill_info(query.from, query.query, notify_info);
+ rctx->send_notify(
+ notify_info.first,
+ pg_notify_t(
+ notify_info.first.shard, pg_whoami.shard,
+ query.query_epoch,
+ get_osdmap()->get_epoch(),
+ notify_info.second),
+ past_intervals);
+ } else {
+ update_history(query.query.history);
+ fulfill_log(query.from, query.query, query.query_epoch);
+ }
+}
+
void PG::check_full_transition(OSDMapRef lastmap, OSDMapRef osdmap)
{
bool changed = false;
dout(20) << "new interval newup " << newup
<< " newacting " << newacting << dendl;
return true;
- } else {
- return false;
}
+ if (!lastmap->is_up(osd->whoami) && osdmap->is_up(osd->whoami)) {
+ dout(10) << __func__ << " osd transitioned from down -> up" << dendl;
+ return true;
+ }
+ return false;
}
bool PG::old_peering_msg(epoch_t reply_epoch, epoch_t query_epoch)
return discard_event();
}
-boost::statechart::result PG::RecoveryState::ReplicaActive::react(const MQuery& query)
+boost::statechart::result PG::RecoveryState::ReplicaActive::react(
+ const MQuery& query)
{
PG *pg = context< RecoveryMachine >().pg;
- if (query.query.type == pg_query_t::MISSING) {
- pg->update_history(query.query.history);
- pg->fulfill_log(query.from, query.query, query.query_epoch);
- } // else: from prior to activation, safe to ignore
+ pg->fulfill_query(query, context<RecoveryMachine>().get_recovery_ctx());
return discard_event();
}
boost::statechart::result PG::RecoveryState::Stray::react(const MQuery& query)
{
PG *pg = context< RecoveryMachine >().pg;
- if (query.query.type == pg_query_t::INFO) {
- pair<pg_shard_t, pg_info_t> notify_info;
- pg->update_history(query.query.history);
- pg->fulfill_info(query.from, query.query, notify_info);
- context< RecoveryMachine >().send_notify(
- notify_info.first,
- pg_notify_t(
- notify_info.first.shard, pg->pg_whoami.shard,
- query.query_epoch,
- pg->get_osdmap()->get_epoch(),
- notify_info.second),
- pg->past_intervals);
- } else {
- pg->fulfill_log(query.from, query.query, query.query_epoch);
- }
+ pg->fulfill_query(query, context<RecoveryMachine>().get_recovery_ctx());
return discard_event();
}
ovec.insert(ovec.end(), i->second.begin(), i->second.end());
}
}
+
+ void send_notify(pg_shard_t to,
+ const pg_notify_t &info, const PastIntervals &pi) {
+ assert(notify_list);
+ (*notify_list)[to.osd].push_back(make_pair(info, pi));
+ }
};
void send_notify(pg_shard_t to,
const pg_notify_t &info, const PastIntervals &pi) {
assert(state->rctx);
- assert(state->rctx->notify_list);
- (*state->rctx->notify_list)[to.osd].push_back(make_pair(info, pi));
+ state->rctx->send_notify(to, info, pi);
}
};
friend class RecoveryMachine;
void fulfill_info(pg_shard_t from, const pg_query_t &query,
pair<pg_shard_t, pg_info_t> ¬ify_info);
void fulfill_log(pg_shard_t from, const pg_query_t &query, epoch_t query_epoch);
-
+ void fulfill_query(const MQuery& q, RecoveryCtx *rctx);
void check_full_transition(OSDMapRef lastmap, OSDMapRef osdmap);
bool should_restart_peering(
return error == FOUND_ERROR;
}
-static int dcount(const object_info_t &oi)
+static int dcount(const object_info_t &oi, bool prioritize)
{
int count = 0;
+ // Prioritize bluestore objects when osd_distrust_data_digest is set
+ if (prioritize)
+ count += 1000;
if (oi.is_data_digest())
count++;
if (oi.is_omap_digest())
const map<pg_shard_t,ScrubMap*> &maps,
object_info_t *auth_oi,
map<pg_shard_t, shard_info_wrapper> &shard_map,
- inconsistent_obj_wrapper &object_error)
+ inconsistent_obj_wrapper &object_error,
+ bool &digest_match)
{
eversion_t auth_version;
+ bool auth_prio = false;
bufferlist first_oi_bl, first_ss_bl, first_hk_bl;
// Create list of shards with primary first so it will be auth copy all
shards.push_front(get_parent()->whoami_shard());
map<pg_shard_t, ScrubMap *>::const_iterator auth = maps.end();
+ digest_match = true;
for (auto &l : shards) {
+ bool oi_prio = false;
map<pg_shard_t, ScrubMap *>::const_iterator j = maps.find(l);
map<hobject_t, ScrubMap::object>::iterator i =
j->second->objects.find(obj);
error_string += " obj_size_info_mismatch";
}
+ // digest_match will only be true if computed digests are the same
+ if (auth_version != eversion_t()
+ && auth->second->objects[obj].digest_present
+ && i->second.digest_present
+ && auth->second->objects[obj].digest != i->second.digest) {
+ digest_match = false;
+ dout(10) << __func__ << " digest_match = false, " << obj << " data_digest 0x" << std::hex << i->second.digest
+ << " != data_digest 0x" << auth->second->objects[obj].digest << std::dec
+ << dendl;
+ }
+
// Don't use this particular shard due to previous errors
// XXX: For now we can't pick one shard for repair and another's object info or snapset
if (shard_info.errors)
goto out;
+ // XXX: Do I want replicated only?
+ if (parent->get_pool().is_replicated() && cct->_conf->osd_distrust_data_digest) {
+ // This is a boost::optional<bool> so see if option set AND it has the value true
+ // We give priority to a replica where the ObjectStore like BlueStore has builtin checksum
+ if (j->second->has_builtin_csum && j->second->has_builtin_csum == true) {
+ oi_prio = true;
+ }
+ }
+
if (auth_version == eversion_t() || oi.version > auth_version ||
- (oi.version == auth_version && dcount(oi) > dcount(*auth_oi))) {
+ (oi.version == auth_version && dcount(oi, oi_prio) > dcount(*auth_oi, auth_prio))) {
auth = j;
*auth_oi = oi;
auth_version = oi.version;
+ auth_prio = oi_prio;
}
out:
inconsistent_obj_wrapper object_error{*k};
+ bool digest_match;
map<pg_shard_t, ScrubMap *>::const_iterator auth =
- be_select_auth_object(*k, maps, &auth_oi, shard_map, object_error);
+ be_select_auth_object(*k, maps, &auth_oi, shard_map, object_error,
+ digest_match);
list<pg_shard_t> auth_list;
set<pg_shard_t> object_errors;
ScrubMap::object& auth_object = auth->second->objects[*k];
set<pg_shard_t> cur_missing;
set<pg_shard_t> cur_inconsistent;
+ bool fix_digest = false;
for (auto j = maps.cbegin(); j != maps.cend(); ++j) {
if (j == auth)
shard_map[j->first],
object_error,
ss);
+
+ dout(20) << __func__ << (repair ? " repair " : " ") << (parent->get_pool().is_replicated() ? "replicated " : "")
+ << (j == auth ? "auth " : "") << "shards " << shard_map.size() << (digest_match ? " digest_match " : " ")
+ << (shard_map[j->first].has_data_digest_mismatch_info() ? "info_mismatch " : "")
+ << (shard_map[j->first].only_data_digest_mismatch_info() ? "only" : "")
+ << dendl;
+
+ if (cct->_conf->osd_distrust_data_digest) {
+ if (digest_match && parent->get_pool().is_replicated()
+ && shard_map[j->first].has_data_digest_mismatch_info()) {
+ fix_digest = true;
+ }
+ shard_map[j->first].clear_data_digest_mismatch_info();
+ // If all replicas match, but they don't match object_info we can
+ // repair it by using missing_digest mechanism
+ } else if (repair && parent->get_pool().is_replicated() && j == auth && shard_map.size() > 1
+ && digest_match && shard_map[j->first].only_data_digest_mismatch_info()
+ && auth_object.digest_present) {
+ // Set in missing_digests
+ fix_digest = true;
+ // Clear the error
+ shard_map[j->first].clear_data_digest_mismatch_info();
+ errorstream << pgid << " : soid " << *k << " repairing object info data_digest" << "\n";
+ }
// Some errors might have already been set in be_select_auth_object()
if (shard_map[j->first].errors != 0) {
cur_inconsistent.insert(j->first);
if (found)
errorstream << pgid << " shard " << j->first << ": soid " << *k
<< " " << ss.str() << "\n";
- } else if (found) {
+ } else if (object_error.errors != 0) {
// Track possible shard to use as authoritative, if needed
// There are errors, without identifying the shard
object_errors.insert(j->first);
- errorstream << pgid << " : soid " << *k << " " << ss.str() << "\n";
+ if (found)
+ errorstream << pgid << " : soid " << *k << " " << ss.str() << "\n";
} else {
// XXX: The auth shard might get here that we don't know
// that it has the "correct" data.
if (!cur_inconsistent.empty()) {
inconsistent[*k] = cur_inconsistent;
}
+
+ if (fix_digest) {
+ boost::optional<uint32_t> data_digest, omap_digest;
+ assert(auth_object.digest_present);
+ data_digest = auth_object.digest;
+ if (auth_object.omap_digest_present) {
+ omap_digest = auth_object.omap_digest;
+ }
+ missing_digest[*k] = make_pair(data_digest, omap_digest);
+ }
+ // Special handling of this particular type of inconsistency
+ // This can over-ride a data_digest or set an omap_digest
+ // when all replicas match but the object info is wrong.
if (!cur_inconsistent.empty() || !cur_missing.empty()) {
authoritative[*k] = auth_list;
- } else if (parent->get_pool().is_replicated()) {
+ } else if (!fix_digest && parent->get_pool().is_replicated()) {
enum {
NO = 0,
MAYBE = 1,
// recorded digest != actual digest?
if (auth_oi.is_data_digest() && auth_object.digest_present &&
auth_oi.data_digest != auth_object.digest) {
- assert(shard_map[auth->first].has_data_digest_mismatch_info());
+ assert(cct->_conf->osd_distrust_data_digest
+ || shard_map[auth->first].has_data_digest_mismatch_info());
errorstream << pgid << " recorded data digest 0x"
<< std::hex << auth_oi.data_digest << " != on disk 0x"
<< auth_object.digest << std::dec << " on " << auth_oi.soid
const map<pg_shard_t,ScrubMap*> &maps,
object_info_t *auth_oi,
map<pg_shard_t, shard_info_wrapper> &shard_map,
- inconsistent_obj_wrapper &object_error);
+ inconsistent_obj_wrapper &object_error,
+ bool &digest_match);
void be_compare_scrubmaps(
const map<pg_shard_t,ScrubMap*> &maps,
const set<hobject_t> &master_set,
set<snapid_t> snaps;
dout(20) << " snapset " << recovery_info.ss
<< " legacy_snaps " << recovery_info.oi.legacy_snaps << dendl;
+ bool error = false;
if (recovery_info.ss.is_legacy() ||
recovery_info.ss.seq == 0 /* jewel osd doesn't populate this */) {
assert(recovery_info.oi.legacy_snaps.size());
recovery_info.oi.legacy_snaps.end());
} else {
auto p = recovery_info.ss.clone_snaps.find(hoid.snap);
- assert(p != recovery_info.ss.clone_snaps.end()); // hmm, should we warn?
- snaps.insert(p->second.begin(), p->second.end());
+ if (p != recovery_info.ss.clone_snaps.end()) {
+ snaps.insert(p->second.begin(), p->second.end());
+ } else {
+ derr << __func__ << " " << hoid << " had no clone_snaps" << dendl;
+ error = true;
+ }
+ }
+ if (!error) {
+ dout(20) << " snaps " << snaps << dendl;
+ snap_mapper.add_oid(
+ recovery_info.soid,
+ snaps,
+ &_t);
}
- dout(20) << " snaps " << snaps << dendl;
- snap_mapper.add_oid(
- recovery_info.soid,
- snaps,
- &_t);
}
if (!is_delete && pg_log.get_missing().is_missing(recovery_info.soid) &&
pg_log.get_missing().get_items().find(recovery_info.soid)->second.need > recovery_info.version) {
} else {
int r = pgbackend->objects_read_sync(
soid, op.extent.offset, op.extent.length, op.flags, &osd_op.outdata);
+ // whole object? can we verify the checksum?
+ if (!skip_data_digest && r >= 0 && op.extent.offset == 0 &&
+ (uint64_t)r == oi.size && oi.is_data_digest()) {
+ uint32_t crc = osd_op.outdata.crc32c(-1);
+ if (oi.data_digest != crc) {
+ osd->clog->error() << info.pgid << std::hex
+ << " full-object read crc 0x" << crc
+ << " != expected 0x" << oi.data_digest
+ << std::dec << " on " << soid;
+ r = -EIO; // try repair later
+ }
+ }
if (r == -EIO) {
r = rep_repair_primary_object(soid, ctx->op);
}
}
dout(10) << " read got " << r << " / " << op.extent.length
<< " bytes from obj " << soid << dendl;
-
- // whole object? can we verify the checksum?
- if (!skip_data_digest &&
- op.extent.length == oi.size && oi.is_data_digest()) {
- uint32_t crc = osd_op.outdata.crc32c(-1);
- if (oi.data_digest != crc) {
- osd->clog->error() << info.pgid << std::hex
- << " full-object read crc 0x" << crc
- << " != expected 0x" << oi.data_digest
- << std::dec << " on " << soid;
- // FIXME fall back to replica or something?
- result = -EIO;
- }
- }
}
// XXX the op.extent.length is the requested length for async read
<< " full-object read crc 0x" << crc
<< " != expected 0x" << oi.data_digest
<< std::dec << " on " << soid;
- // FIXME fall back to replica or something?
- return -EIO;
+ r = rep_repair_primary_object(soid, ctx->op);
+ if (r < 0) {
+ return r;
+ }
}
}
object_info_t& oi = obs.oi;
const hobject_t& soid = oi.soid;
bool skip_data_digest =
- (osd->store->has_builtin_csum() && g_conf->osd_skip_data_digest) ||
- g_conf->osd_distrust_data_digest;
+ osd->store->has_builtin_csum() && g_conf->osd_skip_data_digest;
PGTransaction* t = ctx->op_t.get();
}
auto p = i->second.lower_bound(oid);
if (p != i->second.begin() &&
- p->first > oid) {
+ (p == i->second.end() || p->first > oid)) {
--p;
}
if (p != i->second.end()) {
inline ostream& operator<<(ostream& out, const osd_stat_t& s) {
- return out << "osd_stat(" << kb_t(s.kb_used) << " used, "
- << kb_t(s.kb_avail) << " avail, "
- << kb_t(s.kb) << " total, "
+ return out << "osd_stat(" << byte_u_t(s.kb_used << 10) << " used, "
+ << byte_u_t(s.kb_avail << 10) << " avail, "
+ << byte_u_t(s.kb << 10) << " total, "
<< "peers " << s.hb_peers
<< " op hist " << s.op_queue_age_hist.h
<< ")";
eversion_t valid_through;
eversion_t incr_since;
bool has_large_omap_object_errors:1;
+ boost::optional<bool> has_builtin_csum;
void merge_incr(const ScrubMap &l);
void clear_from(const hobject_t& start) {
plb.add_u64_counter(l_objectcacher_cache_ops_miss,
"cache_ops_miss", "Miss operations");
plb.add_u64_counter(l_objectcacher_cache_bytes_hit,
- "cache_bytes_hit", "Hit data");
+ "cache_bytes_hit", "Hit data", NULL, 0, unit_t(BYTES));
plb.add_u64_counter(l_objectcacher_cache_bytes_miss,
- "cache_bytes_miss", "Miss data");
+ "cache_bytes_miss", "Miss data", NULL, 0, unit_t(BYTES));
plb.add_u64_counter(l_objectcacher_data_read,
"data_read", "Read data");
plb.add_u64_counter(l_objectcacher_data_written,
"Write operations, delayed due to dirty limits");
plb.add_u64_counter(l_objectcacher_write_bytes_blocked,
"write_bytes_blocked",
- "Write data blocked on dirty limit");
+ "Write data blocked on dirty limit", NULL, 0, unit_t(BYTES));
plb.add_time(l_objectcacher_write_time_blocked, "write_time_blocked",
"Time spent blocking a write due to dirty limits");
ldout(cct, 10) << "writex writing " << f_it->first << "~"
<< f_it->second << " into " << *bh << " at " << opos
<< dendl;
- uint64_t bhoff = bh->start() - opos;
+ uint64_t bhoff = opos - bh->start();
assert(f_it->second <= bh->length() - bhoff);
// get the frag we're mapping in
PerfCountersBuilder::PRIO_CRITICAL);
pcb.add_u64(l_osdc_op_laggy, "op_laggy", "Laggy operations");
pcb.add_u64_counter(l_osdc_op_send, "op_send", "Sent operations");
- pcb.add_u64_counter(l_osdc_op_send_bytes, "op_send_bytes", "Sent data");
+ pcb.add_u64_counter(l_osdc_op_send_bytes, "op_send_bytes", "Sent data", NULL, 0, unit_t(BYTES));
pcb.add_u64_counter(l_osdc_op_resend, "op_resend", "Resent operations");
pcb.add_u64_counter(l_osdc_op_reply, "op_reply", "Operation reply");
import json
import math
import random
+import six
import time
from mgr_module import MgrModule, CommandResult
from threading import Event
self.pg_up_by_poolid = {}
for poolid in self.poolids:
self.pg_up_by_poolid[poolid] = osdmap.map_pool_pgs_up(poolid)
- for a,b in self.pg_up_by_poolid[poolid].iteritems():
+ for a,b in six.iteritems(self.pg_up_by_poolid[poolid]):
self.pg_up[a] = b
def calc_misplaced_from(self, other_ms):
num = len(other_ms.pg_up)
misplaced = 0
- for pgid, before in other_ms.pg_up.iteritems():
+ for pgid, before in six.iteritems(other_ms.pg_up):
if before != self.pg_up.get(pgid, []):
misplaced += 1
if num > 0:
if len(self.compat_ws) and \
'-1' not in self.initial.crush_dump.get('choose_args', {}):
ls.append('ceph osd crush weight-set create-compat')
- for osd, weight in self.compat_ws.iteritems():
+ for osd, weight in six.iteritems(self.compat_ws):
ls.append('ceph osd crush weight-set reweight-compat %s %f' %
(osd, weight))
- for osd, weight in self.osd_weights.iteritems():
+ for osd, weight in six.iteritems(self.osd_weights):
ls.append('ceph osd reweight osd.%d %f' % (osd, weight))
incdump = self.inc.dump()
for pgid in incdump.get('old_pg_upmap_items', []):
score = 0.0
sum_weight = 0.0
- for k, v in count[t].iteritems():
+ for k, v in six.iteritems(count[t]):
# adjust/normalize by weight
if target[k]:
adjusted = float(v) / target[k] / float(num)
weight_map = ms.crush.get_take_weight_osd_map(rootid)
adjusted_map = {
osd: cw * osd_weight[osd]
- for osd,cw in weight_map.iteritems() if osd in osd_weight and cw > 0
+ for osd,cw in six.iteritems(weight_map) if osd in osd_weight and cw > 0
}
sum_w = sum(adjusted_map.values())
assert len(adjusted_map) == 0 or sum_w > 0
pe.target_by_root[root] = { osd: w / sum_w
- for osd,w in adjusted_map.iteritems() }
+ for osd,w in six.iteritems(adjusted_map) }
actual_by_root[root] = {
'pgs': {},
'objects': {},
self.log.debug('target_by_root %s' % pe.target_by_root)
# pool and root actual
- for pool, pi in pool_info.iteritems():
+ for pool, pi in six.iteritems(pool_info):
poolid = pi['pool']
pm = ms.pg_up_by_poolid[poolid]
pgs = 0
pgs_by_osd[osd] = 0
objects_by_osd[osd] = 0
bytes_by_osd[osd] = 0
- for pgid, up in pm.iteritems():
+ for pgid, up in six.iteritems(pm):
for osd in [int(osd) for osd in up]:
if osd == CRUSHMap.ITEM_NONE:
continue
pe.count_by_pool[pool] = {
'pgs': {
k: v
- for k, v in pgs_by_osd.iteritems()
+ for k, v in six.iteritems(pgs_by_osd)
},
'objects': {
k: v
- for k, v in objects_by_osd.iteritems()
+ for k, v in six.iteritems(objects_by_osd)
},
'bytes': {
k: v
- for k, v in bytes_by_osd.iteritems()
+ for k, v in six.iteritems(bytes_by_osd)
},
}
pe.actual_by_pool[pool] = {
'pgs': {
k: float(v) / float(max(pgs, 1))
- for k, v in pgs_by_osd.iteritems()
+ for k, v in six.iteritems(pgs_by_osd)
},
'objects': {
k: float(v) / float(max(objects, 1))
- for k, v in objects_by_osd.iteritems()
+ for k, v in six.iteritems(objects_by_osd)
},
'bytes': {
k: float(v) / float(max(bytes, 1))
- for k, v in bytes_by_osd.iteritems()
+ for k, v in six.iteritems(bytes_by_osd)
},
}
pe.total_by_pool[pool] = {
pe.count_by_root[root] = {
'pgs': {
k: float(v)
- for k, v in actual_by_root[root]['pgs'].iteritems()
+ for k, v in six.iteritems(actual_by_root[root]['pgs'])
},
'objects': {
k: float(v)
- for k, v in actual_by_root[root]['objects'].iteritems()
+ for k, v in six.iteritems(actual_by_root[root]['objects'])
},
'bytes': {
k: float(v)
- for k, v in actual_by_root[root]['bytes'].iteritems()
+ for k, v in six.iteritems(actual_by_root[root]['bytes'])
},
}
pe.actual_by_root[root] = {
'pgs': {
k: float(v) / float(max(pe.total_by_root[root]['pgs'], 1))
- for k, v in actual_by_root[root]['pgs'].iteritems()
+ for k, v in six.iteritems(actual_by_root[root]['pgs'])
},
'objects': {
k: float(v) / float(max(pe.total_by_root[root]['objects'], 1))
- for k, v in actual_by_root[root]['objects'].iteritems()
+ for k, v in six.iteritems(actual_by_root[root]['objects'])
},
'bytes': {
k: float(v) / float(max(pe.total_by_root[root]['bytes'], 1))
- for k, v in actual_by_root[root]['bytes'].iteritems()
+ for k, v in six.iteritems(actual_by_root[root]['bytes'])
},
}
self.log.debug('actual_by_pool %s' % pe.actual_by_pool)
b,
pe.target_by_root[a],
pe.total_by_root[a]
- ) for a, b in pe.count_by_root.iteritems()
+ ) for a, b in six.iteritems(pe.count_by_root)
}
self.log.debug('stats_by_root %s' % pe.stats_by_root)
# total score is just average of normalized stddevs
pe.score = 0.0
- for r, vs in pe.score_by_root.iteritems():
- for k, v in vs.iteritems():
+ for r, vs in six.iteritems(pe.score_by_root):
+ for k, v in six.iteritems(vs):
pe.score += v
pe.score /= 3 * len(roots)
return pe
# get current osd reweights
orig_osd_weight = { a['osd']: a['weight']
for a in ms.osdmap_dump.get('osds',[]) }
- reweighted_osds = [ a for a,b in orig_osd_weight.iteritems()
+ reweighted_osds = [ a for a,b in six.iteritems(orig_osd_weight)
if b < 1.0 and b > 0.0 ]
# get current compat weight-set weights
orig_ws = self.get_compat_weight_set_weights(ms)
if not orig_ws:
return -errno.EAGAIN, 'compat weight-set not available'
- orig_ws = { a: b for a, b in orig_ws.iteritems() if a >= 0 }
+ orig_ws = { a: b for a, b in six.iteritems(orig_ws) if a >= 0 }
# Make sure roots don't overlap their devices. If so, we
# can't proceed.
visited = {}
overlap = {}
root_ids = {}
- for root, wm in pe.target_by_root.iteritems():
+ for root, wm in six.iteritems(pe.target_by_root):
for osd in wm.iterkeys():
if osd in visited:
overlap[osd] = 1
# normalize weights under this root
root_weight = crush.get_item_weight(pe.root_ids[root])
- root_sum = sum(b for a,b in next_ws.iteritems()
+ root_sum = sum(b for a,b in six.iteritems(next_ws)
if a in target.keys())
if root_sum > 0 and root_weight > 0:
factor = root_sum / root_weight
if best_pe.score < pe.score + fudge:
self.log.info('Success, score %f -> %f', pe.score, best_pe.score)
plan.compat_ws = best_ws
- for osd, w in best_ow.iteritems():
+ for osd, w in six.iteritems(best_ow):
if w != orig_osd_weight[osd]:
self.log.debug('osd.%d reweight %f', osd, w)
plan.osd_weights[osd] = w
self.log.error('Error creating compat weight-set')
return r, outs
- for osd, weight in plan.compat_ws.iteritems():
+ for osd, weight in six.iteritems(plan.compat_ws):
self.log.info('ceph osd crush weight-set reweight-compat osd.%d %f',
osd, weight)
result = CommandResult('')
# new_weight
reweightn = {}
- for osd, weight in plan.osd_weights.iteritems():
+ for osd, weight in six.iteritems(plan.osd_weights):
reweightn[str(osd)] = str(int(weight * float(0x10000)))
if len(reweightn):
self.log.info('ceph osd reweightn %s', reweightn)
<i class="fa fa-heartbeat" rv-style="health_status | health_color"></i>
<span>Cluster health</span></a>
</li>
- <li class="treeview{%if path_info.startswith(('/server', '/osd'))%} active{%endif%}">
+ <li class="treeview{%if path_info.startswith(('/server', '/osd','/config_options'))%} active{%endif%}">
<a href="#"><i class="fa fa-server"></i> <span>Cluster</span>
<span class="pull-right-container">
<i class="fa fa-angle-left pull-right"></i>
<li>
<a href="{{ url_prefix }}/osd">OSDs</a>
</li>
+ <li>
+ <a href="{{ url_prefix }}/config_options">Configuration</a>
+ </li>
</ul>
</li>
<li class="treeview{%if path_info.startswith('/rbd')%} active{%endif%}">
--- /dev/null
+
+{% extends "base.html" %}
+
+{% block content %}
+
+
+<script>
+ $(document).ready(function(){
+ // Pre-populated initial data at page load
+ var content_data = {{ content_data }};
+
+ rivets.formatters.display_arrays = function(arr) {
+ result = arr.join().replace(/,/g, "<br/>");
+ return "<div style='width:90px;word-break:break-all'>" + result + "</div>";
+ };
+
+ str_to_level = function(str) {
+ if (str == "basic")
+ return "0"
+ else if (str == "advanced")
+ return "1"
+ else if (str == "developer")
+ return "2"
+ };
+
+ apply_filters = function() {
+ content_data.options_list = [];
+ var selection = "#" + content_data.service;
+ $(selection).attr('selected','selected');
+ var level = $("#level").val();
+ var service = $("#service").val();
+ if (level == "developer" && service == "any") {
+ content_data.options_list = content_data.options.options;
+ }
+ for (var opt of content_data.options.options) {
+ if (service == "any" && str_to_level(opt.level) <= level) {
+ content_data.options_list.push(opt);
+ } else if (opt.services.includes(service) && str_to_level(opt.level) <= level) {
+ content_data.options_list.push(opt);
+ }
+ }
+
+ };
+
+ rivets.bind($("#content"), content_data);
+ apply_filters();
+
+ });
+</script>
+
+<!-- Page Header -->
+<section class="content-header">
+ <h1 style="font-weight:bold">
+ Configuration Options
+ <div class="pull-right" style="font-size:17px">
+ <label>Services:</label>
+ <select id="service" style="color:grey" onchange="apply_filters()">
+ <option id="mon">mon</option>
+ <option id="mgr">mgr</option>
+ <option id="osd">osd</option>
+ <option id="mds">mds</option>
+ <option id="common">common</option>
+ <option id="mds_client">mds_client</option>
+ <option id="rgw">rgw</option>
+ <option id="any">any</option>
+ </select>
+ <label>Level:</label>
+ <select id="level" style="color:grey" onchange="apply_filters()">
+ <option value="0">basic</option>
+ <option value="1">advanced</option>
+ <option value="2">developer</option>
+ </select>
+ </div>
+ </h1>
+</section>
+
+<!-- Main content -->
+<section class="content">
+ <div class="box" style="overflow:auto">
+ <div class="box-body">
+ <table class="table table-bordered">
+ <thead>
+ <tr>
+ <th>Name</th>
+ <th>Description</th>
+ <th>Long description</th>
+ <th>Type</th>
+ <th>Level</th>
+ <th>Default</th>
+ <th>Daemon default</th>
+ <th>Tags</th>
+ <th>Services</th>
+ <th>See_also</th>
+ <th>Max</th>
+ <th>Min</th>
+ </tr>
+ </thead>
+ <tbody>
+ <tr rv-each-opt="options_list">
+ <td><div style="width:120px;word-break:break-all">{opt.name}</div></td>
+ <td><div style="width:80px;word-break:break-all">{opt.desc}</div></td>
+ <td><div style="width:120px;word-break:break-all">{opt.long_desc}</div></td>
+ <td><div style="width:70px;word-break:break-all">{opt.type}</div></td>
+ <td>{opt.level}</td>
+ <td><div style="width:80px;word-break:break-all">{opt.default}</div></td>
+ <td><div style="width:120px;word-break:break-all">{opt.daemon_default}</div></td>
+ <td rv-html="opt.tags | display_arrays"></td>
+ <td rv-html="opt.services | display_arrays"></td>
+ <td rv-html="opt.see_also | display_arrays"></td>
+ <td>{opt.max}</td>
+ <td>{opt.min}</td>
+ </tr>
+ </tbody>
+ </table>
+ </div>
+ </div>
+</section>
+
+
+{% endblock %}
toplevel_data=json.dumps(self._toplevel_data(), indent=2),
content_data=json.dumps(self._servers(), indent=2)
)
+
+ @cherrypy.expose
+ def config_options(self, service="any"):
+ template = env.get_template("config_options.html")
+ return template.render(
+ url_prefix = global_instance().url_prefix,
+ ceph_version=global_instance().version,
+ path_info=cherrypy.request.path_info,
+ toplevel_data=json.dumps(self._toplevel_data(), indent=2),
+ content_data=json.dumps(self.config_options_data(service), indent=2)
+ )
+
+ @cherrypy.expose
+ @cherrypy.tools.json_out()
+ def config_options_data(self, service):
+ options = {}
+ options = global_instance().get("config_options")
+
+ return {
+ 'options': options,
+ 'service': service,
+ }
def _servers(self):
return {
def servers_data(self):
return self._servers()
+ @cherrypy.expose
+ def perf_counters(self, service_type, service_id):
+ template = env.get_template("perf_counters.html")
+ toplevel_data = self._toplevel_data()
+
+ return template.render(
+ url_prefix = global_instance().url_prefix,
+ ceph_version=global_instance().version,
+ path_info=cherrypy.request.path_info,
+ toplevel_data=json.dumps(toplevel_data, indent=2),
+ content_data=json.dumps(self.perf_counters_data(service_type, service_id), indent=2)
+ )
+
+ @cherrypy.expose
+ @cherrypy.tools.json_out()
+ def perf_counters_data(self, service_type, service_id):
+ schema = global_instance().get_perf_schema(service_type, str(service_id)).values()[0]
+ counters = []
+
+ for key, value in sorted(schema.items()):
+ counter = dict()
+ counter["name"] = str(key)
+ counter["description"] = value["description"]
+ if global_instance()._stattype_to_str(value["type"]) == 'counter':
+ counter["value"] = global_instance().get_rate(service_type, service_id, key)
+ counter["unit"] = global_instance()._unit_to_str(value["units"])
+ else:
+ counter["value"] = global_instance().get_latest(service_type, service_id, key)
+ counter["unit"] = ""
+ counters.append(counter)
+
+ return {
+ 'service_type': service_type,
+ 'service_id': service_id,
+ 'counters': counters,
+ }
+
def _health(self):
# Fuse osdmap with pg_summary to get description of pools
# including their PG states
<section class="content-header">
<h1>
osd.{osd.osd}
+ <button class="pull-right btn btn-default" style="margin-right:5px"><a href="{{url_prefix}}/config_options/osd">Configuration</a></button>
</h1>
</section>
from threading import Event
import json
import errno
+import six
import time
from mgr_module import MgrModule
now = datetime.utcnow().isoformat() + 'Z'
- for daemon, counters in self.get_all_perf_counters().iteritems():
+ for daemon, counters in six.iteritems(self.get_all_perf_counters()):
svc_type, svc_id = daemon.split(".", 1)
metadata = self.get_metadata(svc_type, svc_id)
import json
import logging
+import six
import threading
from collections import defaultdict
def get_take_weight_osd_map(self, root):
uglymap = self._get_take_weight_osd_map(root)
- return { int(k): v for k, v in uglymap.get('weights', {}).iteritems() }
+ return { int(k): v for k, v in six.iteritems(uglymap.get('weights', {})) }
class MgrStandbyModule(ceph_module.BaseMgrStandbyModule):
"""
PERFCOUNTER_HISTOGRAM = 0x10
PERFCOUNTER_TYPE_MASK = ~3
+ # units supported
+ BYTES = 0
+ NONE = 1
+
def __init__(self, module_name, py_modules_ptr, this_ptr):
self.module_name = module_name
else:
return value
+ def _unit_to_str(self, unit):
+ if unit == self.NONE:
+ return "/s"
+ elif unit == self.BYTES:
+ return "B/s"
+
def get_server(self, hostname):
"""
Called by the plugin to load information about a particular
('quota_max_objects', 'max_objects'),
]
-POOL_ARGS = POOL_PROPERTIES + map(
- lambda x: x[0],
- POOL_QUOTA_PROPERTIES
-)
+POOL_ARGS = POOL_PROPERTIES + [x for x,_ in POOL_QUOTA_PROPERTIES]
# Transform command to a human readable form
import tempfile
import threading
import traceback
+import six
import socket
import common
from hooks import ErrorHook
from mgr_module import MgrModule, CommandResult
+
# Global instance to share
instance = None
self.id = str(id(self))
# Filter out empty sub-requests
- commands_arrays = filter(
- lambda x: len(x) != 0,
- commands_arrays,
- )
+ commands_arrays = [x for x in commands_arrays
+ if len(x) != 0]
self.running = []
self.waiting = commands_arrays[1:]
self._serve()
self.server.socket.close()
except CannotServe as cs:
- self.log.warn("server not running: {0}".format(cs.message))
+ self.log.warn("server not running: %s", cs)
except:
self.log.error(str(traceback.format_exc()))
def refresh_keys(self):
self.keys = {}
rawkeys = self.get_config_prefix('keys/') or {}
- for k, v in rawkeys.iteritems():
+ for k, v in six.iteritems(rawkeys):
self.keys[k[5:]] = v # strip of keys/ prefix
def _serve(self):
cert = self.get_localized_config("crt")
if cert is not None:
cert_tmp = tempfile.NamedTemporaryFile()
- cert_tmp.write(cert)
+ cert_tmp.write(cert.encode('utf-8'))
cert_tmp.flush()
cert_fname = cert_tmp.name
else:
pkey = self.get_localized_config("key")
if pkey is not None:
pkey_tmp = tempfile.NamedTemporaryFile()
- pkey_tmp.write(pkey)
+ pkey_tmp.write(pkey.encode('utf-8'))
pkey_tmp.flush()
pkey_fname = pkey_tmp.name
else:
if tag == 'seq':
return
- request = filter(
- lambda x: x.is_running(tag),
- self.requests)
-
+ request = [x for x in self.requests if x.is_running(tag)]
if len(request) != 1:
self.log.warn("Unknown request '%s'" % str(tag))
return
elif command['prefix'] == "restful create-self-signed-cert":
cert, pkey = self.create_self_signed_cert()
-
- self.set_config(self.get_mgr_id() + '/crt', cert)
- self.set_config(self.get_mgr_id() + '/key', pkey)
+ self.set_config(self.get_mgr_id() + '/crt', cert.decode('utf-8'))
+ self.set_config(self.get_mgr_id() + '/key', pkey.decode('utf-8'))
self.restart()
return (
# Filter by osd ids
if ids is not None:
- osds = filter(
- lambda x: str(x['osd']) in ids,
- osds
- )
+ osds = [x for x in osds if str(x['osd']) in ids]
# Get list of pools per osd node
pools_map = self.get_osd_pools()
# Filter by pool
if pool_id:
pool_id = int(pool_id)
- osds = filter(
- lambda x: pool_id in x['pools'],
- osds
- )
+ osds = [x for x in osds if pool_id in x['pools']]
return osds
def get_osd_by_id(self, osd_id):
- osd = filter(
- lambda x: x['osd'] == osd_id,
- self.get('osd_map')['osds']
- )
+ osd = [x for x in self.get('osd_map')['osds']
+ if x['osd'] == osd_id]
if len(osd) != 1:
return None
def get_pool_by_id(self, pool_id):
- pool = filter(
- lambda x: x['pool'] == pool_id,
- self.get('osd_map')['pools'],
- )
+ pool = [x for x in self.get('osd_map')['pools']
+ if x['pool'] == pool_id]
if len(pool) != 1:
return None
from collections import defaultdict
from prettytable import PrettyTable
-import prettytable
-import fnmatch
import errno
+import fnmatch
+import prettytable
+import six
from mgr_module import MgrModule
])
# Find the standby replays
- for gid_str, daemon_info in mdsmap['info'].iteritems():
+ for gid_str, daemon_info in six.iteritems(mdsmap['info']):
if daemon_info['state'] != "up:standby-replay":
continue
output += "MDS version: {0}".format(mds_versions.keys()[0])
else:
version_table = PrettyTable(["version", "daemons"])
- for version, daemons in mds_versions.iteritems():
+ for version, daemons in six.iteritems(mds_versions):
version_table.add_row([
version,
", ".join(daemons)
class Error(Exception):
""" `Error` class, derived from `Exception` """
- pass
-
-
-class InvalidArgumentError(Error):
- pass
-
-
-class OSError(Error):
- """ `OSError` class, derived from `Error` """
def __init__(self, message, errno=None):
- super(OSError, self).__init__(message)
+ super(Exception, self).__init__(message)
self.errno = errno
def __str__(self):
- msg = super(OSError, self).__str__()
+ msg = super(Exception, self).__str__()
if self.errno is None:
return msg
return '[errno {0}] {1}'.format(self.errno, msg)
def __reduce__(self):
return (self.__class__, (self.message, self.errno))
+class InvalidArgumentError(Error):
+ pass
+
+class OSError(Error):
+ """ `OSError` class, derived from `Error` """
+ pass
+
class InterruptedOrTimeoutError(OSError):
""" `InterruptedOrTimeoutError` class, derived from `OSError` """
pass
set_target_properties(rgw PROPERTIES OUTPUT_NAME rgw VERSION 2.0.0
SOVERSION 2)
install(TARGETS rgw DESTINATION ${CMAKE_INSTALL_LIBDIR})
-
-if(WITH_EMBEDDED)
- include(MergeStaticLibraries)
- add_library(cephd_rgw_base STATIC rgw_main.cc ${radosgw_admin_srcs})
- if(WITH_RADOSGW_FCGI_FRONTEND)
- target_include_directories(cephd_rgw_base PUBLIC ${FCGI_INCLUDE_DIR})
- endif()
- set_target_properties(cephd_rgw_base PROPERTIES COMPILE_DEFINITIONS BUILDING_FOR_EMBEDDED)
- merge_static_libraries(cephd_rgw cephd_rgw_base rgw_a radosgw_a)
- if(WITH_RADOSGW_FCGI_FRONTEND)
- target_link_libraries(cephd_rgw ${FCGI_LIBRARY})
- endif()
-endif()
}
-#ifdef BUILDING_FOR_EMBEDDED
-extern "C" int cephd_rgw_admin(int argc, const char **argv)
-#else
int main(int argc, const char **argv)
-#endif
{
vector<const char*> args;
argv_to_vec(argc, (const char **)argv, args);
return EINVAL;
}
} else if (ceph_argparse_witharg(args, i, &val, "--max-size", (char*)NULL)) {
- max_size = strict_si_cast<int64_t>(val.c_str(), &err);
+ max_size = strict_iecstrtoll(val.c_str(), &err);
if (!err.empty()) {
cerr << "ERROR: failed to parse max size: " << err << std::endl;
return EINVAL;
{
RGWRealm realm(realm_id, realm_name);
int ret = realm.init(g_ceph_context, store);
- if (ret < 0) {
+ bool default_realm_not_exist = (ret == -ENOENT && realm_id.empty() && realm_name.empty());
+
+ if (ret < 0 && !default_realm_not_exist ) {
cerr << "failed to init realm: " << cpp_strerror(-ret) << std::endl;
return -ret;
}
if (ret < 0) {
return 1;
}
- if (zonegroup.realm_id.empty()) {
+ if (zonegroup.realm_id.empty() && !default_realm_not_exist) {
zonegroup.realm_id = realm.get_id();
}
ret = zonegroup.create();
}
if (opt_cmd == OPT_OBJECTS_EXPIRE) {
- int ret = store->process_expire_objects();
- if (ret < 0) {
- cerr << "ERROR: process_expire_objects() processing returned error: " << cpp_strerror(-ret) << std::endl;
+ if (!store->process_expire_objects()) {
+ cerr << "ERROR: process_expire_objects() processing returned error." << std::endl;
return 1;
}
}
formatter->open_array_section("objects");
while (is_truncated) {
map<string, rgw_bucket_dir_entry> result;
- int r = store->cls_bucket_list(bucket_info, RGW_NO_SHARD, marker, prefix, 1000, true,
- result, &is_truncated, &marker,
- bucket_object_check_filter);
+ int r =
+ store->cls_bucket_list_ordered(bucket_info, RGW_NO_SHARD, marker,
+ prefix, 1000, true,
+ result, &is_truncated, &marker,
+ bucket_object_check_filter);
if (r < 0 && r != -ENOENT) {
cerr << "ERROR: failed operation r=" << r << std::endl;
}
if (opt_cmd == OPT_ORPHANS_FIND) {
+ if (!yes_i_really_mean_it) {
+ cerr << "accidental removal of active objects can not be reversed; "
+ << "do you really mean it? (requires --yes-i-really-mean-it)"
+ << std::endl;
+ return EINVAL;
+ }
+
RGWOrphanSearch search(store, max_concurrent_ios, orphan_stale_secs);
if (job_id.empty()) {
}
public:
+ using engine_map_t = std::map <std::string, std::reference_wrapper<const Engine>>;
+ void add_engines(const std::vector <std::string>& auth_order,
+ engine_map_t eng_map)
+ {
+ auto ctrl_flag = Control::SUFFICIENT;
+ for (const auto &eng : auth_order) {
+ // fallback to the last engine, in case of multiple engines, since ctrl
+ // flag is sufficient for others, error from earlier engine is returned
+ if (&eng == &auth_order.back() && eng_map.size() > 1) {
+ ctrl_flag = Control::FALLBACK;
+ }
+ const auto kv = eng_map.find(eng);
+ if (kv != eng_map.end()) {
+ add_engine(ctrl_flag, kv->second);
+ }
+ }
+ }
+
+ std::vector<std::string> parse_auth_order(CephContext* const cct)
+ {
+ std::vector <std::string> result;
+
+ const std::set <boost::string_view> allowed_auth = { "external", "local" };
+ std::vector <std::string> default_order = { "external", "local"};
+ // supplied strings may contain a space, so let's bypass that
+ boost::split(result, cct->_conf->rgw_s3_auth_order,
+ boost::is_any_of(", "), boost::token_compress_on);
+
+ if (std::any_of(result.begin(), result.end(),
+ [allowed_auth](boost::string_view s)
+ { return allowed_auth.find(s) == allowed_auth.end();})){
+ return default_order;
+ }
+ return result;
+ }
+
AWSAuthStrategy(CephContext* const cct,
rgw::auth::ImplicitTenants& implicit_tenant_context,
RGWRados* const store)
add_engine(Control::SUFFICIENT, anonymous_engine);
}
+ auto auth_order = parse_auth_order(cct);
+ engine_map_t engine_map;
/* The external auth. */
- Control local_engine_mode;
if (! external_engines.is_empty()) {
- add_engine(Control::SUFFICIENT, external_engines);
-
- local_engine_mode = Control::FALLBACK;
- } else {
- local_engine_mode = Control::SUFFICIENT;
+ engine_map.insert(std::make_pair("external", std::cref(external_engines)));
}
-
/* The local auth. */
if (cct->_conf->rgw_s3_auth_use_rados) {
- add_engine(local_engine_mode, local_engine);
+ engine_map.insert(std::make_pair("local", std::cref(local_engine)));
}
+ add_engines(auth_order, engine_map);
}
const char* get_name() const noexcept override {
while (is_truncated) {
map<string, rgw_bucket_dir_entry> result;
- int r = store->cls_bucket_list(bucket_info, RGW_NO_SHARD, marker, prefix, 1000, true,
- result, &is_truncated, &marker,
- bucket_object_check_filter);
+ int r = store->cls_bucket_list_ordered(bucket_info, RGW_NO_SHARD,
+ marker, prefix, 1000, true,
+ result, &is_truncated, &marker,
+ bucket_object_check_filter);
if (r == -ENOENT) {
break;
} else if (r < 0 && r != -ENOENT) {
set_err_msg(err_msg, "ERROR: failed operation r=" + cpp_strerror(-r));
}
-
dump_bucket_index(result, formatter);
flusher.flush();
-
}
formatter->close_section();
}
auto iter = cache_map.find(name);
- if (iter == cache_map.end() ||
- (expiry.count() &&
- (ceph::coarse_mono_clock::now() - iter->second.info.time_added) > expiry)) {
+ if (iter == cache_map.end()) {
ldout(cct, 10) << "cache get: name=" << name << " : miss" << dendl;
if (perfcounter)
perfcounter->inc(l_rgw_cache_miss);
return -ENOENT;
}
+ if (expiry.count() &&
+ (ceph::coarse_mono_clock::now() - iter->second.info.time_added) > expiry) {
+ ldout(cct, 10) << "cache get: name=" << name << " : expiry miss" << dendl;
+ lock.unlock();
+ lock.get_write();
+ // check that wasn't already removed by other thread
+ iter = cache_map.find(name);
+ if (iter != cache_map.end()) {
+ for (auto &kv : iter->second.chained_entries)
+ kv.first->invalidate(kv.second);
+ remove_lru(name, iter->second.lru_iter);
+ cache_map.erase(iter);
+ }
+ if(perfcounter)
+ perfcounter->inc(l_rgw_cache_miss);
+ return -ENOENT;
+ }
ObjectCacheEntry *entry = &iter->second;
return false;
}
-static ssize_t unescape_str(const string& s, ssize_t ofs, char esc_char, char special_char, string *dest)
-{
- const char *src = s.c_str();
- char dest_buf[s.size() + 1];
- char *destp = dest_buf;
- bool esc = false;
-
- dest_buf[0] = '\0';
-
- for (size_t i = ofs; i < s.size(); i++) {
- char c = src[i];
- if (!esc && c == esc_char) {
- esc = true;
- continue;
- }
- if (!esc && c == special_char) {
- *destp = '\0';
- *dest = dest_buf;
- return (ssize_t)i + 1;
- }
- *destp++ = c;
- esc = false;
- }
- *destp = '\0';
- *dest = dest_buf;
- return string::npos;
-}
-
-static void escape_str(const string& s, char esc_char, char special_char, string *dest)
-{
- const char *src = s.c_str();
- char dest_buf[s.size() * 2 + 1];
- char *destp = dest_buf;
-
- for (size_t i = 0; i < s.size(); i++) {
- char c = src[i];
- if (c == esc_char || c == special_char) {
- *destp++ = esc_char;
- }
- *destp++ = c;
- }
- *destp++ = '\0';
- *dest = dest_buf;
-}
-
void rgw_pool::from_str(const string& s)
{
- size_t pos = unescape_str(s, 0, '\\', ':', &name);
+ size_t pos = rgw_unescape_str(s, 0, '\\', ':', &name);
if (pos != string::npos) {
- pos = unescape_str(s, pos, '\\', ':', &ns);
+ pos = rgw_unescape_str(s, pos, '\\', ':', &ns);
/* ignore return; if pos != string::npos it means that we had a colon
* in the middle of ns that wasn't escaped, we're going to stop there
*/
string rgw_pool::to_str() const
{
string esc_name;
- escape_str(name, '\\', ':', &esc_name);
+ rgw_escape_str(name, '\\', ':', &esc_name);
if (ns.empty()) {
return esc_name;
}
string esc_ns;
- escape_str(ns, '\\', ':', &esc_ns);
+ rgw_escape_str(ns, '\\', ':', &esc_ns);
return esc_name + ":" + esc_ns;
}
};
/** Stores the XML arguments associated with the HTTP request in req_state*/
-class RGWHTTPArgs
-{
+class RGWHTTPArgs {
string str, empty_str;
map<string, string> val_map;
map<string, string> sys_val_map;
const string& get_str() {
return str;
}
-};
+}; // RGWHTTPArgs
const char *rgw_conf_get(const map<string, string, ltstr_nocase>& conf_map, const char *name, const char *def_val);
int rgw_conf_get_int(const map<string, string, ltstr_nocase>& conf_map, const char *name, int def_val);
}
}
-struct RGWBucketInfo
-{
+struct RGWBucketInfo {
enum BIShardsHashType {
MOD = 0
};
void rgw_setup_saved_curl_handles();
void rgw_release_all_curl_handles();
+static inline void rgw_escape_str(const string& s, char esc_char,
+ char special_char, string *dest)
+{
+ const char *src = s.c_str();
+ char dest_buf[s.size() * 2 + 1];
+ char *destp = dest_buf;
+
+ for (size_t i = 0; i < s.size(); i++) {
+ char c = src[i];
+ if (c == esc_char || c == special_char) {
+ *destp++ = esc_char;
+ }
+ *destp++ = c;
+ }
+ *destp++ = '\0';
+ *dest = dest_buf;
+}
+
+static inline ssize_t rgw_unescape_str(const string& s, ssize_t ofs,
+ char esc_char, char special_char,
+ string *dest)
+{
+ const char *src = s.c_str();
+ char dest_buf[s.size() + 1];
+ char *destp = dest_buf;
+ bool esc = false;
+
+ dest_buf[0] = '\0';
+
+ for (size_t i = ofs; i < s.size(); i++) {
+ char c = src[i];
+ if (!esc && c == esc_char) {
+ esc = true;
+ continue;
+ }
+ if (!esc && c == special_char) {
+ *destp = '\0';
+ *dest = dest_buf;
+ return (ssize_t)i + 1;
+ }
+ *destp++ = c;
+ esc = false;
+ }
+ *destp = '\0';
+ *dest = dest_buf;
+ return string::npos;
+}
+
#endif
/* we have reported this error */
}
}
+ if (sync_status != 0)
+ break;
+ }
+ if (sync_status != 0) {
+ /* get error, stop */
+ break;
}
if (!marker_tracker.index_key_to_marker(key, cur_id)) {
set_status() << "can't do op, sync already in progress for object";
int RGWGC::tag_index(const string& tag)
{
- return rgw_shards_hash(tag, max_objs);
+ return rgw_shard_id(tag, max_objs);
}
void RGWGC::add_chain(ObjectWriteOperation& op, cls_rgw_obj_chain& chain, const string& tag)
curl_easy_setopt(curl_handle, CURLOPT_WRITEFUNCTION, simple_receive_http_data);
curl_easy_setopt(curl_handle, CURLOPT_WRITEDATA, (void *)this);
curl_easy_setopt(curl_handle, CURLOPT_ERRORBUFFER, (void *)error_buf);
+ curl_easy_setopt(curl_handle, CURLOPT_LOW_SPEED_TIME, cct->_conf->rgw_curl_low_speed_time);
+ curl_easy_setopt(curl_handle, CURLOPT_LOW_SPEED_LIMIT, cct->_conf->rgw_curl_low_speed_limit);
if (h) {
curl_easy_setopt(curl_handle, CURLOPT_HTTPHEADER, (void *)h);
}
curl_easy_setopt(easy_handle, CURLOPT_WRITEFUNCTION, receive_http_data);
curl_easy_setopt(easy_handle, CURLOPT_WRITEDATA, (void *)req_data);
curl_easy_setopt(easy_handle, CURLOPT_ERRORBUFFER, (void *)req_data->error_buf);
+ curl_easy_setopt(easy_handle, CURLOPT_LOW_SPEED_TIME, cct->_conf->rgw_curl_low_speed_time);
+ curl_easy_setopt(easy_handle, CURLOPT_LOW_SPEED_LIMIT, cct->_conf->rgw_curl_low_speed_limit);
if (h) {
curl_easy_setopt(easy_handle, CURLOPT_HTTPHEADER, (void *)h);
}
switch (result) {
case CURLE_OK:
break;
+ case CURLE_OPERATION_TIMEDOUT:
+ dout(0) << "WARNING: curl operation timed out, network average transfer speed less than "
+ << cct->_conf->rgw_curl_low_speed_limit << " Bytes per second during " << cct->_conf->rgw_curl_low_speed_time << " seconds." << dendl;
+ // fall through: also log the generic curl result details below
default:
dout(20) << "ERROR: msg->data.result=" << result << " req_data->id=" << id << " http_status=" << http_status << dendl;
break;
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
#include <string.h>
#include <iostream>
#include <map>
continue;
}
if (prefix_iter != prefix_map.begin() &&
- (prefix_iter->first.compare(0, prev(prefix_iter)->first.length(), prev(prefix_iter)->first) == 0)) {
- list_op.next_marker = pre_marker;
+ (prefix_iter->first.compare(0, prev(prefix_iter)->first.length(), prev(prefix_iter)->first) == 0)) {
+ list_op.get_next_marker() = pre_marker;
} else {
- pre_marker = list_op.get_next_marker();
+ pre_marker = list_op.get_next_marker();
}
list_op.params.prefix = prefix_iter->first;
rgw_bucket_dir_entry pre_obj;
if ((obj_iter + 1)==objs.end()) {
if (is_truncated) {
//deal with it in next round because we can't judge whether this marker is the only version
- list_op.next_marker = obj_iter->key;
+ list_op.get_next_marker() = obj_iter->key;
break;
}
} else if (obj_iter->key.name.compare((obj_iter + 1)->key.name) == 0) { //*obj_iter is delete marker and isn't the only version, do nothing.
/*
* start up the RADOS connection and then handle HTTP messages as they come in
*/
-#ifdef BUILDING_FOR_EMBEDDED
-extern "C" int cephd_rgw(int argc, const char **argv)
-#else
int main(int argc, const char **argv)
-#endif
{
// dout() messages will be sent to stderr, but FCGX wants messages on stdout
// Redirect stderr to stdout.
else if (strcmp(req_meth, "DELETE") == 0) flags = RGW_CORS_DELETE;
else if (strcmp(req_meth, "HEAD") == 0) flags = RGW_CORS_HEAD;
- if ((rule->get_allowed_methods() & flags) == flags) {
+ if (rule->get_allowed_methods() & flags) {
dout(10) << "Method " << req_meth << " is supported" << dendl;
} else {
dout(5) << "Method " << req_meth << " is not supported" << dendl;
return;
}
+ if (allow_unordered && !delimiter.empty()) {
+ ldout(s->cct, 0) <<
+ "ERROR: unordered bucket listing requested with a delimiter" << dendl;
+ op_ret = -EINVAL;
+ return;
+ }
+
if (need_container_stats()) {
map<string, RGWBucketEnt> m;
m[s->bucket.name] = RGWBucketEnt();
list_op.params.marker = marker;
list_op.params.end_marker = end_marker;
list_op.params.list_versions = list_versions;
+ list_op.params.allow_unordered = allow_unordered;
op_ret = list_op.list_objects(max, &objs, &common_prefixes, &is_truncated);
if (op_ret >= 0) {
op_ret = put_data_and_throttle(filter, data, ofs, need_to_wait);
if (op_ret < 0) {
- if (!need_to_wait || op_ret != -EEXIST) {
+ if (op_ret != -EEXIST) {
ldout(s->cct, 20) << "processor->thottle_data() returned ret="
<< op_ret << dendl;
goto done;
const string name() override { return "list_buckets"; }
RGWOpType get_type() override { return RGW_OP_LIST_BUCKETS; }
uint32_t op_mask() override { return RGW_OP_TYPE_READ; }
-};
+}; // class RGWListBuckets
class RGWGetUsage : public RGWOp {
protected:
int default_max;
bool is_truncated;
+ bool allow_unordered;
int shard_id;
public:
RGWListBucket() : list_versions(false), max(0),
- default_max(0), is_truncated(false), shard_id(-1) {}
+ default_max(0), is_truncated(false),
+ allow_unordered(false), shard_id(-1) {}
int verify_permission() override;
void pre_exec() override;
void execute() override;
uint64_t interval_msec() override {
return cct->_conf->rgw_md_notify_interval_msec;
}
+ void stop_process() override {
+ notify_mgr.stop();
+ }
public:
RGWMetaNotifier(RGWRados *_store, RGWMetadataLog* log)
: RGWRadosThread(_store, "meta-notifier"), notify_mgr(_store), log(log) {}
uint64_t interval_msec() override {
return cct->_conf->get_val<int64_t>("rgw_data_notify_interval_msec");
}
+ void stop_process() override {
+ notify_mgr.stop();
+ }
public:
RGWDataNotifier(RGWRados *_store) : RGWRadosThread(_store, "data-notifier"), notify_mgr(_store) {}
int RGWRados::key_to_shard_id(const string& key, int max_shards)
{
- return rgw_shards_hash(key, max_shards);
+ return rgw_shard_id(key, max_shards);
}
void RGWRados::shard_name(const string& prefix, unsigned max_shards, const string& key, string& name, int *shard_id)
{
string obj_key = key.name + key.instance;
int num_shards = cct->_conf->rgw_objexp_hints_num_shards;
- uint32_t sid = ceph_str_hash_linux(obj_key.c_str(), obj_key.size());
- uint32_t sid2 = sid ^ ((sid & 0xFF) << 24);
- sid = rgw_shards_mod(sid2, num_shards);
- return sid;
+ return rgw_bucket_shard_index(obj_key, num_shards);
}
static string objexp_hint_get_keyext(const string& tenant_name,
return 0;
}
-/**
- * get listing of the objects in a bucket.
+
+/**
+ * Get ordered listing of the objects in a bucket.
*
* max: maximum number of results to return
* bucket: bucket to list contents of
* common_prefixes: if delim is filled in, any matching prefixes are placed here.
* is_truncated: if number of objects in the bucket is bigger than max, then truncated.
*/
-int RGWRados::Bucket::List::list_objects(int64_t max,
- vector<rgw_bucket_dir_entry> *result,
- map<string, bool> *common_prefixes,
- bool *is_truncated)
+int RGWRados::Bucket::List::list_objects_ordered(int64_t max,
+ vector<rgw_bucket_dir_entry> *result,
+ map<string, bool> *common_prefixes,
+ bool *is_truncated)
{
RGWRados *store = target->get_store();
CephContext *cct = store->ctx();
string bigger_than_delim;
if (!params.delim.empty()) {
- unsigned long val = decode_utf8((unsigned char *)params.delim.c_str(), params.delim.size());
+ unsigned long val = decode_utf8((unsigned char *)params.delim.c_str(),
+ params.delim.size());
char buf[params.delim.size() + 16];
int r = encode_utf8(val + 1, (unsigned char *)buf);
if (r < 0) {
cur_marker = s;
}
}
-
+
string skip_after_delim;
while (truncated && count <= max) {
if (skip_after_delim > cur_marker.name) {
ldout(cct, 20) << "setting cur_marker=" << cur_marker.name << "[" << cur_marker.instance << "]" << dendl;
}
std::map<string, rgw_bucket_dir_entry> ent_map;
- int r = store->cls_bucket_list(target->get_bucket_info(), shard_id, cur_marker, cur_prefix,
- read_ahead + 1 - count, params.list_versions, ent_map,
- &truncated, &cur_marker);
+ int r = store->cls_bucket_list_ordered(target->get_bucket_info(),
+ shard_id,
+ cur_marker,
+ cur_prefix,
+ read_ahead + 1 - count,
+ params.list_versions,
+ ent_map,
+ &truncated,
+ &cur_marker);
if (r < 0)
return r;
- std::map<string, rgw_bucket_dir_entry>::iterator eiter;
- for (eiter = ent_map.begin(); eiter != ent_map.end(); ++eiter) {
+ for (auto eiter = ent_map.begin(); eiter != ent_map.end(); ++eiter) {
rgw_bucket_dir_entry& entry = eiter->second;
rgw_obj_index_key index_key = entry.key;
rgw_obj_key obj(index_key);
- /* note that parse_raw_oid() here will not set the correct object's instance, as
- * rgw_obj_index_key encodes that separately. We don't need to set the instance because it's
- * not needed for the checks here and we end up using the raw entry for the return vector
+ /* note that parse_raw_oid() here will not set the correct
+ * object's instance, as rgw_obj_index_key encodes that
+ * separately. We don't need to set the instance because it's
+ * not needed for the checks here and we end up using the raw
+ * entry for the return vector
*/
bool valid = rgw_obj_key::parse_raw_oid(index_key.name, &obj);
if (!valid) {
if (params.filter && !params.filter->filter(obj.name, index_key.name))
continue;
- if (params.prefix.size() && (obj.name.compare(0, params.prefix.size(), params.prefix) != 0))
+ if (params.prefix.size() &&
+ (obj.name.compare(0, params.prefix.size(), params.prefix) != 0))
continue;
if (!params.delim.empty()) {
result->emplace_back(std::move(entry));
count++;
}
-
- // Either the back-end telling us truncated, or we don't consume all
- // items returned per the amount caller request
- truncated = (truncated || eiter != ent_map.end());
}
done:
*is_truncated = truncated;
return 0;
-}
+} // list_objects_ordered
+
+
+/**
+ * Get listing of the objects in a bucket and allow the results to be out
+ * of order.
+ *
+ * Even though there are key differences with the ordered counterpart,
+ * the parameters are the same to maintain some compatibility.
+ *
+ * max: maximum number of results to return
+ * bucket: bucket to list contents of
+ * prefix: only return results that match this prefix
+ * delim: should not be set; if it is, we should have indicated an error
+ * marker: if filled in, begin the listing with this object.
+ * end_marker: if filled in, end the listing with this object.
+ * result: the objects are put in here.
+ * common_prefixes: this is never filled with an unordered list; the param
+ * is maintained for compatibility
+ * is_truncated: set to true if the number of objects in the bucket
+ * is bigger than max.
+ */
+int RGWRados::Bucket::List::list_objects_unordered(int64_t max,
+ vector<rgw_bucket_dir_entry> *result,
+ map<string, bool> *common_prefixes,
+ bool *is_truncated)
+{
+ RGWRados *store = target->get_store();
+ CephContext *cct = store->ctx();
+ int shard_id = target->get_shard_id();
+
+ int count = 0;
+ bool truncated = true;
+
+ // read a few extra in each call to cls_bucket_list_unordered in
+ // case some are filtered out due to namespace matching, versioning,
+ // filtering, etc.
+ const int64_t max_read_ahead = 100;
+ const uint32_t read_ahead = uint32_t(max + std::min(max, max_read_ahead));
+
+ result->clear();
+
+ rgw_obj_key marker_obj(params.marker.name, params.marker.instance, params.ns);
+ rgw_obj_index_key cur_marker;
+ marker_obj.get_index_key(&cur_marker);
+
+ rgw_obj_key end_marker_obj(params.end_marker.name, params.end_marker.instance,
+ params.ns);
+ rgw_obj_index_key cur_end_marker;
+ end_marker_obj.get_index_key(&cur_end_marker);
+ const bool cur_end_marker_valid = !params.end_marker.empty();
+
+ rgw_obj_key prefix_obj(params.prefix);
+ prefix_obj.ns = params.ns;
+ string cur_prefix = prefix_obj.get_index_key_name();
+
+ while (truncated && count <= max) {
+ std::vector<rgw_bucket_dir_entry> ent_list;
+ int r = store->cls_bucket_list_unordered(target->get_bucket_info(),
+ shard_id,
+ cur_marker,
+ cur_prefix,
+ read_ahead,
+ params.list_versions,
+ ent_list,
+ &truncated,
+ &cur_marker);
+ if (r < 0)
+ return r;
+
+ // NB: while regions of ent_list will be sorted, we have no
+ // guarantee that all items will be sorted since they can cross
+ // shard boundaries
+
+ for (auto& entry : ent_list) {
+ rgw_obj_index_key index_key = entry.key;
+ rgw_obj_key obj(index_key);
+
+ /* note that parse_raw_oid() here will not set the correct
+ * object's instance, as rgw_obj_index_key encodes that
+ * separately. We don't need to set the instance because it's
+ * not needed for the checks here and we end up using the raw
+ * entry for the return vector
+ */
+ bool valid = rgw_obj_key::parse_raw_oid(index_key.name, &obj);
+ if (!valid) {
+ ldout(cct, 0) << "ERROR: could not parse object name: " <<
+ obj.name << dendl;
+ continue;
+ }
+
+ if (!params.list_versions && !entry.is_visible()) {
+ continue;
+ }
+
+ if (params.enforce_ns && obj.ns != params.ns) {
+ continue;
+ }
+
+ if (cur_end_marker_valid && cur_end_marker <= index_key) {
+ // we're not guaranteed items will come in order, so we have
+ // to loop through all
+ continue;
+ }
+
+ if (count < max) {
+ params.marker = index_key;
+ next_marker = index_key;
+ }
+
+ if (params.filter && !params.filter->filter(obj.name, index_key.name))
+ continue;
+
+ if (params.prefix.size() &&
+ (0 != obj.name.compare(0, params.prefix.size(), params.prefix)))
+ continue;
+
+ if (count >= max) {
+ truncated = true;
+ goto done;
+ }
+
+ result->emplace_back(std::move(entry));
+ count++;
+ } // for (auto& entry : ent_list)
+ } // while (truncated && count <= max)
+
+done:
+ if (is_truncated)
+ *is_truncated = truncated;
+
+ return 0;
+} // list_objects_unordered
+
/**
* create a rados pool, associated meta info
int RGWRados::check_bucket_empty(RGWBucketInfo& bucket_info)
{
- std::map<string, rgw_bucket_dir_entry> ent_map;
+ std::vector<rgw_bucket_dir_entry> ent_list;
rgw_obj_index_key marker;
string prefix;
bool is_truncated;
do {
-#define NUM_ENTRIES 1000
- int r = cls_bucket_list(bucket_info, RGW_NO_SHARD, marker, prefix, NUM_ENTRIES, true, ent_map,
- &is_truncated, &marker);
+ constexpr uint NUM_ENTRIES = 1000u;
+ int r = cls_bucket_list_unordered(bucket_info,
+ RGW_NO_SHARD,
+ marker,
+ prefix,
+ NUM_ENTRIES,
+ true,
+ ent_list,
+ &is_truncated,
+ &marker);
if (r < 0)
return r;
string ns;
- std::map<string, rgw_bucket_dir_entry>::iterator eiter;
- for (eiter = ent_map.begin(); eiter != ent_map.end(); ++eiter) {
+ for (auto const& dirent : ent_list) {
rgw_obj_key obj;
- if (rgw_obj_key::oid_to_key_in_ns(eiter->second.key.name, &obj, ns))
+ if (rgw_obj_key::oid_to_key_in_ns(dirent.key.name, &obj, ns))
return -ENOTEMPTY;
}
} while (is_truncated);
+
return 0;
}
return gc->send_chain(chain, tag, sync);
}
-int RGWRados::open_bucket_index(const RGWBucketInfo& bucket_info, librados::IoCtx& index_ctx, string& bucket_oid)
+int RGWRados::open_bucket_index(const RGWBucketInfo& bucket_info,
+ librados::IoCtx& index_ctx,
+ string& bucket_oid)
{
const rgw_bucket& bucket = bucket_info.bucket;
int r = open_bucket_index_ctx(bucket_info, index_ctx);
return 0;
}
-int RGWRados::open_bucket_index_base(const RGWBucketInfo& bucket_info, librados::IoCtx& index_ctx,
- string& bucket_oid_base) {
+int RGWRados::open_bucket_index_base(const RGWBucketInfo& bucket_info,
+ librados::IoCtx& index_ctx,
+ string& bucket_oid_base) {
const rgw_bucket& bucket = bucket_info.bucket;
int r = open_bucket_index_ctx(bucket_info, index_ctx);
if (r < 0)
}
-int RGWRados::open_bucket_index(const RGWBucketInfo& bucket_info, librados::IoCtx& index_ctx,
- map<int, string>& bucket_objs, int shard_id, map<int, string> *bucket_instance_ids) {
+int RGWRados::open_bucket_index(const RGWBucketInfo& bucket_info,
+ librados::IoCtx& index_ctx,
+ map<int, string>& bucket_objs,
+ int shard_id,
+ map<int, string> *bucket_instance_ids) {
string bucket_oid_base;
int ret = open_bucket_index_base(bucket_info, index_ctx, bucket_oid_base);
if (ret < 0) {
rgw_zone_set zones_trace;
if (_zones_trace) {
zones_trace = *_zones_trace;
- } else {
- zones_trace.insert(get_zone().id);
}
+ zones_trace.insert(get_zone().id);
BucketShard bs(this);
return lc->process();
}
-int RGWRados::process_expire_objects()
+bool RGWRados::process_expire_objects()
{
- obj_expirer->inspect_all_shards(utime_t(), ceph_clock_now());
- return 0;
+ return obj_expirer->inspect_all_shards(utime_t(), ceph_clock_now());
}
int RGWRados::cls_rgw_init_index(librados::IoCtx& index_ctx, librados::ObjectWriteOperation& op, string& oid)
if (_zones_trace) {
zones_trace = *_zones_trace;
}
- else {
- zones_trace.insert(get_zone().id);
- }
-
+ zones_trace.insert(get_zone().id);
+
ObjectWriteOperation o;
cls_rgw_obj_key key(obj.key.get_index_key_name(), obj.key.instance);
cls_rgw_guard_bucket_resharding(o, -ERR_BUSY_RESHARDING);
dir_meta = ent.meta;
dir_meta.category = category;
+ rgw_zone_set zones_trace;
+ if (_zones_trace) {
+ zones_trace = *_zones_trace;
+ }
+ zones_trace.insert(get_zone().id);
+
rgw_bucket_entry_ver ver;
ver.pool = pool;
ver.epoch = epoch;
cls_rgw_obj_key key(ent.key.name, ent.key.instance);
cls_rgw_guard_bucket_resharding(o, -ERR_BUSY_RESHARDING);
cls_rgw_bucket_complete_op(o, op, tag, ver, key, dir_meta, remove_objs,
- get_zone().log_data, bilog_flags, _zones_trace);
+ get_zone().log_data, bilog_flags, &zones_trace);
complete_op_data *arg;
index_completion_manager->create_completion(obj, op, tag, ver, key, dir_meta, remove_objs,
- get_zone().log_data, bilog_flags, _zones_trace, &arg);
+ get_zone().log_data, bilog_flags, &zones_trace, &arg);
librados::AioCompletion *completion = arg->rados_completion;
int ret = bs.index_ctx.aio_operate(bs.bucket_obj, arg->rados_completion, &o);
completion->release(); /* can't reference arg here, as it might have already been released */
return CLSRGWIssueSetTagTimeout(index_ctx, bucket_objs, cct->_conf->rgw_bucket_index_max_aio, timeout)();
}
-int RGWRados::cls_bucket_list(RGWBucketInfo& bucket_info, int shard_id, rgw_obj_index_key& start, const string& prefix,
- uint32_t num_entries, bool list_versions, map<string, rgw_bucket_dir_entry>& m,
- bool *is_truncated, rgw_obj_index_key *last_entry,
- bool (*force_check_filter)(const string& name))
+
+int RGWRados::cls_bucket_list_ordered(RGWBucketInfo& bucket_info,
+ int shard_id,
+ rgw_obj_index_key& start,
+ const string& prefix,
+ uint32_t num_entries,
+ bool list_versions,
+ map<string, rgw_bucket_dir_entry>& m,
+ bool *is_truncated,
+ rgw_obj_index_key *last_entry,
+ bool (*force_check_filter)(const string& name))
{
- ldout(cct, 10) << "cls_bucket_list " << bucket_info.bucket << " start " << start.name << "[" << start.instance << "] num_entries " << num_entries << dendl;
+ ldout(cct, 10) << "cls_bucket_list_ordered " << bucket_info.bucket <<
+ " start " << start.name << "[" << start.instance << "] num_entries " <<
+ num_entries << dendl;
librados::IoCtx index_ctx;
// key - oid (for different shards if there is any)
- // value - list result for the corresponding oid (shard), it is filled by the AIO callback
+ // value - list result for the corresponding oid (shard), it is filled by
+ // the AIO callback
map<int, string> oids;
map<int, struct rgw_cls_list_ret> list_results;
int r = open_bucket_index(bucket_info, index_ctx, oids, shard_id);
return r;
cls_rgw_obj_key start_key(start.name, start.instance);
- r = CLSRGWIssueBucketList(index_ctx, start_key, prefix, num_entries, list_versions,
- oids, list_results, cct->_conf->rgw_bucket_index_max_aio)();
+ r = CLSRGWIssueBucketList(index_ctx, start_key, prefix, num_entries,
+ list_versions, oids, list_results,
+ cct->_conf->rgw_bucket_index_max_aio)();
if (r < 0)
return r;
* and if the tags are old we need to do cleanup as well. */
librados::IoCtx sub_ctx;
sub_ctx.dup(index_ctx);
- r = check_disk_state(sub_ctx, bucket_info, dirent, dirent, updates[vnames[pos]]);
+ r = check_disk_state(sub_ctx, bucket_info, dirent, dirent,
+ updates[vnames[pos]]);
if (r < 0 && r != -ENOENT) {
return r;
}
}
if (r >= 0) {
- ldout(cct, 10) << "RGWRados::cls_bucket_list: got " << dirent.key.name << "[" << dirent.key.instance << "]" << dendl;
+ ldout(cct, 10) << "RGWRados::cls_bucket_list_ordered: got " <<
+ dirent.key.name << "[" << dirent.key.instance << "]" << dendl;
m[name] = std::move(dirent);
++count;
}
// we don't care if we lose suggested updates, send them off blindly
AioCompletion *c = librados::Rados::aio_create_completion(NULL, NULL, NULL);
index_ctx.aio_operate(miter->first, c, &o);
- c->release();
+ c->release();
}
}
// Check if all the returned entries are consumed or not
for (size_t i = 0; i < vcurrents.size(); ++i) {
- if (vcurrents[i] != vends[i])
+ if (vcurrents[i] != vends[i]) {
*is_truncated = true;
+ break;
+ }
}
if (!m.empty())
*last_entry = m.rbegin()->first;
return 0;
}
-int RGWRados::cls_obj_usage_log_add(const string& oid, rgw_usage_log_info& info)
+
+int RGWRados::cls_bucket_list_unordered(RGWBucketInfo& bucket_info,
+ int shard_id,
+ rgw_obj_index_key& start,
+ const string& prefix,
+ uint32_t num_entries,
+ bool list_versions,
+ std::vector<rgw_bucket_dir_entry>& ent_list,
+ bool *is_truncated,
+ rgw_obj_index_key *last_entry,
+ bool (*force_check_filter)(const string& name)) {
+ ldout(cct, 10) << "cls_bucket_list_unordered " << bucket_info.bucket <<
+ " start " << start.name << "[" << start.instance <<
+ "] num_entries " << num_entries << dendl;
+
+ *is_truncated = false;
+ librados::IoCtx index_ctx;
+
+ rgw_obj_index_key my_start = start;
+
+ map<int, string> oids;
+ int r = open_bucket_index(bucket_info, index_ctx, oids, shard_id);
+ if (r < 0)
+ return r;
+ const uint32_t num_shards = oids.size();
+
+ uint32_t current_shard;
+ if (shard_id >= 0) {
+ current_shard = shard_id;
+ } else if (my_start.empty()) {
+ current_shard = 0u;
+ } else {
+ current_shard =
+ rgw_bucket_shard_index(my_start.name, num_shards);
+ }
+
+ uint32_t count = 0u;
+ map<string, bufferlist> updates;
+ std::string last_added_entry;
+ while (count <= num_entries &&
+ ((shard_id >= 0 && current_shard == uint32_t(shard_id)) ||
+ current_shard < num_shards)) {
+ // key - oid (for different shards if there is any)
+ // value - list result for the corresponding oid (shard), it is filled by
+ // the AIO callback
+ map<int, struct rgw_cls_list_ret> list_results;
+ r = CLSRGWIssueBucketList(index_ctx, my_start, prefix, num_entries,
+ list_versions, oids, list_results,
+ cct->_conf->rgw_bucket_index_max_aio)();
+ if (r < 0)
+ return r;
+
+ const std::string& oid = oids[current_shard];
+ assert(list_results.find(current_shard) != list_results.end());
+ auto& result = list_results[current_shard];
+ for (auto& entry : result.dir.m) {
+ rgw_bucket_dir_entry& dirent = entry.second;
+
+ bool force_check = force_check_filter &&
+ force_check_filter(dirent.key.name);
+ if ((!dirent.exists && !dirent.is_delete_marker()) ||
+ !dirent.pending_map.empty() ||
+ force_check) {
+ /* there are uncommitted ops. We need to check the current state,
+ * and if the tags are old we need to do cleanup as well. */
+ librados::IoCtx sub_ctx;
+ sub_ctx.dup(index_ctx);
+ r = check_disk_state(sub_ctx, bucket_info, dirent, dirent, updates[oid]);
+ if (r < 0 && r != -ENOENT) {
+ return r;
+ }
+ }
+
+ // at this point either r >=0 or r == -ENOENT
+ if (r >= 0) { // i.e., if r != -ENOENT
+ ldout(cct, 10) << "RGWRados::cls_bucket_list_unordered: got " <<
+ dirent.key.name << "[" << dirent.key.instance << "]" << dendl;
+
+ if (count < num_entries) {
+ last_added_entry = entry.first;
+ my_start = dirent.key;
+ ent_list.emplace_back(std::move(dirent));
+ ++count;
+ } else {
+ *is_truncated = true;
+ goto check_updates;
+ }
+ } else { // r == -ENOENT
+ // in the case of -ENOENT, make sure we're advancing marker
+ // for possible next call to CLSRGWIssueBucketList
+ my_start = dirent.key;
+ }
+ } // entry for loop
+
+ if (!result.is_truncated) {
+ // if we reached the end of the shard read next shard
+ ++current_shard;
+ my_start = rgw_obj_index_key();
+ }
+ } // shard loop
+
+check_updates:
+ // suggest updates if there is any
+ map<string, bufferlist>::iterator miter = updates.begin();
+ for (; miter != updates.end(); ++miter) {
+ if (miter->second.length()) {
+ ObjectWriteOperation o;
+ cls_rgw_suggest_changes(o, miter->second);
+ // we don't care if we lose suggested updates, send them off blindly
+ AioCompletion *c = librados::Rados::aio_create_completion(NULL, NULL, NULL);
+ index_ctx.aio_operate(miter->first, c, &o);
+ c->release();
+ }
+ }
+
+ if (last_entry && !ent_list.empty()) {
+ *last_entry = last_added_entry;
+ }
+
+ return 0;
+}
+
+
+int RGWRados::cls_obj_usage_log_add(const string& oid,
+ rgw_usage_log_info& info)
{
rgw_raw_obj obj(get_zone_params().usage_log_pool, oid);
}
void RGWRados::get_bucket_index_objects(const string& bucket_oid_base,
- uint32_t num_shards, map<int, string>& bucket_objects, int shard_id)
-{
+ uint32_t num_shards,
+ map<int, string>& bucket_objects,
+ int shard_id) {
if (!num_shards) {
bucket_objects[0] = bucket_oid_base;
} else {
*shard_id = -1;
}
} else {
- uint32_t sid = ceph_str_hash_linux(obj_key.c_str(), obj_key.size());
- uint32_t sid2 = sid ^ ((sid & 0xFF) << 24);
- sid = rgw_shards_mod(sid2, bucket_info.num_shards);
+ uint32_t sid = rgw_bucket_shard_index(obj_key, bucket_info.num_shards);
if (shard_id) {
*shard_id = (int)sid;
}
*shard_id = -1;
}
} else {
- uint32_t sid = ceph_str_hash_linux(obj_key.c_str(), obj_key.size());
- uint32_t sid2 = sid ^ ((sid & 0xFF) << 24);
- sid = rgw_shards_mod(sid2, num_shards);
+ uint32_t sid = rgw_bucket_shard_index(obj_key, num_shards);
char buf[bucket_oid_base.size() + 32];
snprintf(buf, sizeof(buf), "%s.%d", bucket_oid_base.c_str(), sid);
(*bucket_obj) = buf;
#define RGW_SHARDS_PRIME_0 7877
#define RGW_SHARDS_PRIME_1 65521
+// only called by rgw_shard_id and rgw_bucket_shard_index
static inline int rgw_shards_mod(unsigned hval, int max_shards)
{
if (max_shards <= RGW_SHARDS_PRIME_0) {
return hval % RGW_SHARDS_PRIME_1 % max_shards;
}
-static inline int rgw_shards_hash(const string& key, int max_shards)
+// used for logging and tagging
+static inline int rgw_shard_id(const string& key, int max_shards)
{
- return rgw_shards_mod(ceph_str_hash_linux(key.c_str(), key.size()), max_shards);
+ return rgw_shards_mod(ceph_str_hash_linux(key.c_str(), key.size()),
+ max_shards);
+}
+
+// used for bucket indices
+static inline uint32_t rgw_bucket_shard_index(const std::string& key,
+ int num_shards) {
+ uint32_t sid = ceph_str_hash_linux(key.c_str(), key.size());
+ uint32_t sid2 = sid ^ ((sid & 0xFF) << 24);
+ return rgw_shards_mod(sid2, num_shards);
}
static inline int rgw_shards_max()
const string *get_optag() { return &optag; }
bool is_prepared() { return prepared; }
- };
+ }; // class UpdateIndex
+
+ class List {
+ protected:
- struct List {
RGWRados::Bucket *target;
rgw_obj_key next_marker;
+ int list_objects_ordered(int64_t max,
+ vector<rgw_bucket_dir_entry> *result,
+ map<string, bool> *common_prefixes,
+ bool *is_truncated);
+ int list_objects_unordered(int64_t max,
+ vector<rgw_bucket_dir_entry> *result,
+ map<string, bool> *common_prefixes,
+ bool *is_truncated);
+
+ public:
+
struct Params {
string prefix;
string delim;
bool enforce_ns;
RGWAccessListFilter *filter;
bool list_versions;
-
- Params() : enforce_ns(true), filter(NULL), list_versions(false) {}
+ bool allow_unordered;
+
+ Params() :
+ enforce_ns(true),
+ filter(NULL),
+ list_versions(false),
+ allow_unordered(false)
+ {}
} params;
- public:
explicit List(RGWRados::Bucket *_target) : target(_target) {}
- int list_objects(int64_t max, vector<rgw_bucket_dir_entry> *result, map<string, bool> *common_prefixes, bool *is_truncated);
+ int list_objects(int64_t max,
+ vector<rgw_bucket_dir_entry> *result,
+ map<string, bool> *common_prefixes,
+ bool *is_truncated) {
+ if (params.allow_unordered) {
+ return list_objects_unordered(max, result, common_prefixes,
+ is_truncated);
+ } else {
+ return list_objects_ordered(max, result, common_prefixes,
+ is_truncated);
+ }
+ }
rgw_obj_key& get_next_marker() {
return next_marker;
}
- };
- };
+ }; // class List
+ }; // class Bucket
/** Write/overwrite an object to the bucket storage. */
virtual int put_system_obj_impl(rgw_raw_obj& obj, uint64_t size, ceph::real_time *mtime,
ceph::real_time& removed_mtime, list<rgw_obj_index_key> *remove_objs, uint16_t bilog_flags, rgw_zone_set *zones_trace = nullptr);
int cls_obj_complete_cancel(BucketShard& bs, string& tag, rgw_obj& obj, uint16_t bilog_flags, rgw_zone_set *zones_trace = nullptr);
int cls_obj_set_bucket_tag_timeout(RGWBucketInfo& bucket_info, uint64_t timeout);
- int cls_bucket_list(RGWBucketInfo& bucket_info, int shard_id, rgw_obj_index_key& start, const string& prefix,
- uint32_t num_entries, bool list_versions, map<string, rgw_bucket_dir_entry>& m,
- bool *is_truncated, rgw_obj_index_key *last_entry,
- bool (*force_check_filter)(const string& name) = NULL);
+ int cls_bucket_list_ordered(RGWBucketInfo& bucket_info, int shard_id,
+ rgw_obj_index_key& start, const string& prefix,
+ uint32_t num_entries, bool list_versions,
+ map<string, rgw_bucket_dir_entry>& m,
+ bool *is_truncated,
+ rgw_obj_index_key *last_entry,
+ bool (*force_check_filter)(const string& name) = nullptr);
+ int cls_bucket_list_unordered(RGWBucketInfo& bucket_info, int shard_id,
+ rgw_obj_index_key& start, const string& prefix,
+ uint32_t num_entries, bool list_versions,
+ vector<rgw_bucket_dir_entry>& ent_list,
+ bool *is_truncated, rgw_obj_index_key *last_entry,
+ bool (*force_check_filter)(const string& name) = nullptr);
int cls_bucket_head(const RGWBucketInfo& bucket_info, int shard_id, map<string, struct rgw_bucket_dir_header>& headers, map<int, string> *bucket_instance_ids = NULL);
int cls_bucket_head_async(const RGWBucketInfo& bucket_info, int shard_id, RGWGetDirHeader_CB *ctx, int *num_aio);
int list_bi_log_entries(RGWBucketInfo& bucket_info, int shard_id, string& marker, uint32_t max, std::list<rgw_bi_log_entry>& result, bool *truncated);
int list_gc_objs(int *index, string& marker, uint32_t max, bool expired_only, std::list<cls_rgw_gc_obj_info>& result, bool *truncated);
int process_gc();
- int process_expire_objects();
+ bool process_expire_objects();
int defer_gc(void *ctx, const RGWBucketInfo& bucket_info, const rgw_obj& obj);
int process_lc();
uint32_t sid = ceph_str_hash_linux(key.c_str(), key.size());
uint32_t sid2 = sid ^ ((sid & 0xFF) << 24);
sid = sid2 % MAX_RESHARD_LOGSHARDS_PRIME % num_logshards;
- int logshard = sid % num_logshards;
- get_logshard_oid(logshard, oid);
+ get_logshard_oid(int(sid), oid);
}
int RGWReshard::add(cls_rgw_reshard_entry& entry)
#include "rgw_client_io.h"
#include "common/errno.h"
#include "common/strtol.h"
+#include "rgw/rgw_b64.h"
#include "include/assert.h"
#define dout_context g_ceph_context
}
void RGWOp_Metadata_List::execute() {
- string marker = s->info.args.get("marker");
+ string marker;
+ ldout(s->cct, 16) << __func__
+ << " raw marker " << s->info.args.get("marker")
+ << dendl;
+
+ try {
+ marker = s->info.args.get("marker");
+ if (!marker.empty()) {
+ marker = rgw::from_base64(marker);
+ }
+ ldout(s->cct, 16) << __func__
+ << " marker " << marker << dendl;
+ } catch (...) {
+ marker = std::string("");
+ }
+
bool max_entries_specified;
- string max_entries_str = s->info.args.get("max-entries", &max_entries_specified);
+ string max_entries_str =
+ s->info.args.get("max-entries", &max_entries_specified);
bool extended_response = (max_entries_specified); /* for backward compatibility, if max-entries is not specified
we will send the old response format */
void *handle;
int max = 1000;
+ /* example markers:
+ marker = "3:b55a9110:root::bu_9:head";
+ marker = "3:b9a8b2a6:root::sorry_janefonda_890:head";
+ marker = "3:bf885d8f:root::sorry_janefonda_665:head";
+ */
+
http_ret = store->meta_mgr->list_keys_init(metadata_key, marker, &handle);
if (http_ret < 0) {
dout(5) << "ERROR: can't get key: " << cpp_strerror(http_ret) << dendl;
encode_json("truncated", truncated, s->formatter);
encode_json("count", count, s->formatter);
if (truncated) {
- encode_json("marker", store->meta_mgr->get_marker(handle), s->formatter);
+ string esc_marker =
+ rgw::to_base64(store->meta_mgr->get_marker(handle));
+ encode_json("marker", esc_marker, s->formatter);
}
s->formatter->close_section();
}
marker.name = s->info.args.get("key-marker");
marker.instance = s->info.args.get("version-id-marker");
}
+
+ // non-standard
+ s->info.args.get_bool("allow-unordered", &allow_unordered, false);
+
+ delimiter = s->info.args.get("delimiter");
+
max_keys = s->info.args.get("max-keys");
op_ret = parse_max_keys();
if (op_ret < 0) {
return op_ret;
}
- delimiter = s->info.args.get("delimiter");
+
encoding_type = s->info.args.get("encoding-type");
if (s->system_request) {
s->info.args.get_bool("objs-container", &objs_container, false);
shard_id = s->bucket_instance_shard_id;
}
}
+
return 0;
}
marker = s->info.args.get("marker");
end_marker = s->info.args.get("end_marker");
max_keys = s->info.args.get("limit");
+
+ // non-standard
+ s->info.args.get_bool("allow_unordered", &allow_unordered, false);
+
+ delimiter = s->info.args.get("delimiter");
+
op_ret = parse_max_keys();
if (op_ret < 0) {
return op_ret;
if (max > default_max)
return -ERR_PRECONDITION_FAILED;
- delimiter = s->info.args.get("delimiter");
-
string path_args;
if (s->info.args.exists("path")) { // should handle empty path
path_args = s->info.args.get("path");
dump_container_metadata(s, bucket, bucket_quota,
s->bucket_info.website_conf);
- s->formatter->open_array_section_with_attrs("container", FormatterAttrs("name", s->bucket.name.c_str(), NULL));
+ s->formatter->open_array_section_with_attrs("container",
+ FormatterAttrs("name",
+ s->bucket.name.c_str(),
+ NULL));
while (iter != objs.end() || pref_iter != common_prefixes.end()) {
bool do_pref = false;
else
do_pref = true;
- if (do_objs && (marker.empty() || marker < key)) {
+ if (do_objs && (allow_unordered || marker.empty() || marker < key)) {
if (key.name.compare(path) == 0)
goto next;
}
rgw_flush_formatter_and_reset(s, s->formatter);
-}
+} // RGWListBucket_ObjStore_SWIFT::send_response
static void dump_container_metadata(struct req_state *s,
const RGWBucketEnt& bucket,
log_error() << "failed to send http operation: " << http_op->to_str() << " ret=" << ret << std::endl;
http_op->put();
http_op = NULL;
- return ret;
+ return set_cr_error(ret);
}
return io_block(0);
# Type: Integer
# Required: No
# Default: 10
- ;filestore merge threshold = 10
+ ;filestore merge threshold = -10
# filestore_split_multiple * abs(filestore_merge_threshold) * 16 is the maximum number of files in a subdirectory before splitting into child directories.
# Type: Integer
add_subdirectory(filestore)
add_subdirectory(fs)
add_subdirectory(journal)
-if(WITH_EMBEDDED)
- add_subdirectory(libcephd)
-endif(WITH_EMBEDDED)
add_subdirectory(libcephfs)
add_subdirectory(librados)
add_subdirectory(librados_test_stub)
For now, use a more inclusive regex.
$ rbd info foo
rbd image 'foo':
- \tsize 1024 MB in 256 objects (esc)
- \torder 22 (4096 kB objects) (esc)
+ \tsize 1GiB in 256 objects (esc)
+ \torder 22 (4MiB objects) (esc)
[^^]+ (re)
\tformat: 1 (esc)
$ rbd info foo --format json | python -mjson.tool | sed 's/,$/, /'
</image>
$ rbd info foo@snap
rbd image 'foo':
- \tsize 1024 MB in 256 objects (esc)
- \torder 22 (4096 kB objects) (esc)
+ \tsize 1GiB in 256 objects (esc)
+ \torder 22 (4MiB objects) (esc)
[^^]+ (re)
\tformat: 1 (esc)
\tprotected: False (esc)
</image>
$ rbd info bar
rbd image 'bar':
- \tsize 1024 MB in 256 objects (esc)
- \torder 22 (4096 kB objects) (esc)
+ \tsize 1GiB in 256 objects (esc)
+ \torder 22 (4MiB objects) (esc)
[^^]+ (re)
\tformat: 2 (esc)
\tfeatures: layering, exclusive-lock, object-map, fast-diff, deep-flatten (esc)
</image>
$ rbd info bar@snap
rbd image 'bar':
- \tsize 512 MB in 128 objects (esc)
- \torder 22 (4096 kB objects) (esc)
+ \tsize 512MiB in 128 objects (esc)
+ \torder 22 (4MiB objects) (esc)
[^^]+ (re)
\tformat: 2 (esc)
\tfeatures: layering, exclusive-lock, object-map, fast-diff, deep-flatten (esc)
</image>
$ rbd info bar@snap2
rbd image 'bar':
- \tsize 1024 MB in 256 objects (esc)
- \torder 22 (4096 kB objects) (esc)
+ \tsize 1GiB in 256 objects (esc)
+ \torder 22 (4MiB objects) (esc)
[^^]+ (re)
\tformat: 2 (esc)
\tfeatures: layering, exclusive-lock, object-map, fast-diff, deep-flatten (esc)
</image>
$ rbd info baz
rbd image 'baz':
- \tsize 2048 MB in 512 objects (esc)
- \torder 22 (4096 kB objects) (esc)
+ \tsize 2GiB in 512 objects (esc)
+ \torder 22 (4MiB objects) (esc)
[^^]+ (re)
\tformat: 2 (esc)
\tfeatures: layering (esc)
</image>
$ rbd info quux
rbd image 'quux':
- \tsize 1024 kB in 1 objects (esc)
- \torder 22 (4096 kB objects) (esc)
+ \tsize 1MiB in 1 objects (esc)
+ \torder 22 (4MiB objects) (esc)
[^^]+ (re)
\tformat: 1 (esc)
$ rbd info quux --format json | python -mjson.tool | sed 's/,$/, /'
</image>
$ rbd info rbd_other/child
rbd image 'child':
- \tsize 512 MB in 128 objects (esc)
- \torder 22 (4096 kB objects) (esc)
+ \tsize 512MiB in 128 objects (esc)
+ \torder 22 (4MiB objects) (esc)
[^^]+ (re)
\tformat: 2 (esc)
\tfeatures: layering, exclusive-lock, object-map, fast-diff (esc)
</image>
$ rbd info rbd_other/child@snap
rbd image 'child':
- \tsize 512 MB in 128 objects (esc)
- \torder 22 (4096 kB objects) (esc)
+ \tsize 512MiB in 128 objects (esc)
+ \torder 22 (4MiB objects) (esc)
[^^]+ (re)
\tformat: 2 (esc)
\tfeatures: layering, exclusive-lock, object-map, fast-diff (esc)
\tcreate_timestamp:* (glob)
\tprotected: False (esc)
\tparent: rbd/bar@snap (esc)
- \toverlap: 512 MB (esc)
+ \toverlap: 512MiB (esc)
$ rbd info rbd_other/child@snap --format json | python -mjson.tool | sed 's/,$/, /'
{
"block_name_prefix": "rbd_data.*", (glob)
</image>
$ rbd info rbd_other/deep-flatten-child
rbd image 'deep-flatten-child':
- \tsize 512 MB in 128 objects (esc)
- \torder 22 (4096 kB objects) (esc)
+ \tsize 512MiB in 128 objects (esc)
+ \torder 22 (4MiB objects) (esc)
[^^]+ (re)
\tformat: 2 (esc)
\tfeatures: layering, exclusive-lock, object-map, fast-diff, deep-flatten (esc)
</image>
$ rbd info rbd_other/deep-flatten-child@snap
rbd image 'deep-flatten-child':
- \tsize 512 MB in 128 objects (esc)
- \torder 22 (4096 kB objects) (esc)
+ \tsize 512MiB in 128 objects (esc)
+ \torder 22 (4MiB objects) (esc)
[^^]+ (re)
\tformat: 2 (esc)
\tfeatures: layering, exclusive-lock, object-map, fast-diff, deep-flatten (esc)
<name>quuy</name>
</images>
$ rbd list -l
- NAME SIZE PARENT FMT PROT LOCK
- foo 1024M 1
- foo@snap 1024M 1
- quux 1024k 1 excl
- bar 1024M 2
- bar@snap 512M 2 yes
- bar@snap2 1024M 2
- baz 2048M 2 shr
- quuy 2048M 2
+ NAME SIZE PARENT FMT PROT LOCK
+ foo 1GiB 1
+ foo@snap 1GiB 1
+ quux 1MiB 1 excl
+ bar 1GiB 2
+ bar@snap 512MiB 2 yes
+ bar@snap2 1GiB 2
+ baz 2GiB 2 shr
+ quuy 2GiB 2
$ rbd list -l --format json | python -mjson.tool | sed 's/,$/, /'
[
{
<name>deep-flatten-child</name>
</images>
$ rbd list rbd_other -l
- NAME SIZE PARENT FMT PROT LOCK
- child 512M 2
- child@snap 512M rbd/bar@snap 2
- deep-flatten-child 512M 2
- deep-flatten-child@snap 512M 2
+ NAME SIZE PARENT FMT PROT LOCK
+ child 512MiB 2
+ child@snap 512MiB rbd/bar@snap 2
+ deep-flatten-child 512MiB 2
+ deep-flatten-child@snap 512MiB 2
$ rbd list rbd_other -l --format json | python -mjson.tool | sed 's/,$/, /'
[
{
</id*> (glob)
</locks>
$ rbd snap list foo
- SNAPID NAME SIZE TIMESTAMP
- *snap*1024*MB* (glob)
+ SNAPID NAME SIZE TIMESTAMP
+ *snap*1GiB* (glob)
$ rbd snap list foo --format json | python -mjson.tool | sed 's/,$/, /'
[
{
</snapshot>
</snapshots>
$ rbd snap list bar
- SNAPID NAME SIZE TIMESTAMP
- *snap*512*MB* (glob)
- *snap2*1024*MB* (glob)
+ SNAPID NAME SIZE TIMESTAMP
+ *snap*512MiB* (glob)
+ *snap2*1GiB* (glob)
$ rbd snap list bar --format json | python -mjson.tool | sed 's/,$/, /'
[
{
<snapshots></snapshots>
$ rbd snap list rbd_other/child
SNAPID NAME SIZE TIMESTAMP
- *snap*512*MB* (glob)
+ *snap*512MiB* (glob)
$ rbd snap list rbd_other/child --format json | python -mjson.tool | sed 's/,$/, /'
[
{
</snapshot>
</snapshots>
$ rbd disk-usage --pool rbd_other 2>/dev/null
- NAME PROVISIONED USED
- child@snap 512M 0
- child 512M 4096k
- deep-flatten-child@snap 512M 0
- deep-flatten-child 512M 0
- <TOTAL> 1024M 4096k
+ NAME PROVISIONED USED
+ child@snap 512MiB 0B
+ child 512MiB 4MiB
+ deep-flatten-child@snap 512MiB 0B
+ deep-flatten-child 512MiB 0B
+ <TOTAL> 1GiB 4MiB
$ rbd disk-usage --pool rbd_other --format json | python -mjson.tool | sed 's/,$/, /'
{
"images": [
#include "gtest/gtest.h"
#include "test/librados/test.h"
+#include "global/global_context.h"
#include <errno.h>
#include <string>
ASSERT_EQ(num_entries, entries);
}
-void index_prepare(OpMgr& mgr, librados::IoCtx& ioctx, string& oid, RGWModifyOp index_op, string& tag, string& obj, string& loc)
+void index_prepare(OpMgr& mgr, librados::IoCtx& ioctx, string& oid, RGWModifyOp index_op, string& tag,
+ string& obj, string& loc, uint16_t bi_flags = 0)
{
ObjectWriteOperation *op = mgr.write_op();
cls_rgw_obj_key key(obj, string());
rgw_zone_set zones_trace;
- cls_rgw_bucket_prepare_op(*op, index_op, tag, key, loc, true, 0, zones_trace);
+ cls_rgw_bucket_prepare_op(*op, index_op, tag, key, loc, true, bi_flags, zones_trace);
ASSERT_EQ(0, ioctx.operate(oid, op));
}
-void index_complete(OpMgr& mgr, librados::IoCtx& ioctx, string& oid, RGWModifyOp index_op, string& tag, int epoch, string& obj, rgw_bucket_dir_entry_meta& meta)
+void index_complete(OpMgr& mgr, librados::IoCtx& ioctx, string& oid, RGWModifyOp index_op, string& tag,
+ int epoch, string& obj, rgw_bucket_dir_entry_meta& meta, uint16_t bi_flags = 0)
{
ObjectWriteOperation *op = mgr.write_op();
cls_rgw_obj_key key(obj, string());
ver.pool = ioctx.get_id();
ver.epoch = epoch;
meta.accounted_size = meta.size;
- cls_rgw_bucket_complete_op(*op, index_op, tag, ver, key, meta, nullptr, true, 0, nullptr);
+ cls_rgw_bucket_complete_op(*op, index_op, tag, ver, key, meta, nullptr, true, bi_flags, nullptr);
ASSERT_EQ(0, ioctx.operate(oid, op));
}
test_stats(ioctx, bucket_oid, 0, num_objs / 2, total_size);
}
+/*
+ * This case is used to test whether get_obj_vals will
+ * return all valid utf8 objnames and filter out those
+ * in BI_PREFIX_CHAR private namespace.
+ */
+TEST(cls_rgw, index_list)
+{
+ string bucket_oid = str_int("bucket", 4);
+
+ OpMgr mgr;
+
+ ObjectWriteOperation *op = mgr.write_op();
+ cls_rgw_bucket_init(*op);
+ ASSERT_EQ(0, ioctx.operate(bucket_oid, op));
+
+ uint64_t epoch = 1;
+ uint64_t obj_size = 1024;
+ const int num_objs = 5;
+ const string keys[num_objs] = {
+ /* single byte utf8 character */
+ { static_cast<char>(0x41) },
+ /* double byte utf8 character */
+ { static_cast<char>(0xCF), static_cast<char>(0x8F) },
+    /* three-byte utf8 character */
+ { static_cast<char>(0xDF), static_cast<char>(0x8F), static_cast<char>(0x8F) },
+    /* four-byte utf8 character */
+ { static_cast<char>(0xF7), static_cast<char>(0x8F), static_cast<char>(0x8F), static_cast<char>(0x8F) },
+ /* BI_PREFIX_CHAR private namespace, for test only */
+ { static_cast<char>(0x80), static_cast<char>(0x41) }
+ };
+
+ for (int i = 0; i < num_objs; i++) {
+ string obj = keys[i];
+ string tag = str_int("tag", i);
+ string loc = str_int("loc", i);
+
+ index_prepare(mgr, ioctx, bucket_oid, CLS_RGW_OP_ADD, tag, obj, loc);
+
+ op = mgr.write_op();
+ rgw_bucket_dir_entry_meta meta;
+ meta.category = 0;
+ meta.size = obj_size;
+ index_complete(mgr, ioctx, bucket_oid, CLS_RGW_OP_ADD, tag, epoch, obj, meta);
+ }
+
+ test_stats(ioctx, bucket_oid, 0, num_objs, obj_size * num_objs);
+
+ map<int, string> oids = { {0, bucket_oid} };
+ map<int, struct rgw_cls_list_ret> list_results;
+ cls_rgw_obj_key start_key("", "");
+ int r = CLSRGWIssueBucketList(ioctx, start_key, "", 1000, true, oids, list_results, 1)();
+
+ ASSERT_EQ(r, 0);
+ ASSERT_EQ(1, list_results.size());
+
+ auto it = list_results.begin();
+ auto m = (it->second).dir.m;
+
+ ASSERT_EQ(4, m.size());
+ int i = 0;
+ for(auto it2 = m.begin(); it2 != m.end(); it2++, i++)
+ ASSERT_EQ(it2->first.compare(keys[i]), 0);
+}
+
+
+TEST(cls_rgw, bi_list)
+{
+ string bucket_oid = str_int("bucket", 5);
+
+ CephContext *cct = reinterpret_cast<CephContext *>(ioctx.cct());
+
+ OpMgr mgr;
+
+ ObjectWriteOperation *op = mgr.write_op();
+ cls_rgw_bucket_init(*op);
+ ASSERT_EQ(0, ioctx.operate(bucket_oid, op));
+
+ string name;
+ string marker;
+ uint64_t max = 10;
+ list<rgw_cls_bi_entry> entries;
+ bool is_truncated;
+
+ int ret = cls_rgw_bi_list(ioctx, bucket_oid, name, marker, max, &entries,
+ &is_truncated);
+ ASSERT_EQ(ret, 0);
+ ASSERT_EQ(entries.size(), 0);
+ ASSERT_EQ(is_truncated, false);
+
+ uint64_t epoch = 1;
+ uint64_t obj_size = 1024;
+ uint64_t num_objs = 35;
+
+ for (uint64_t i = 0; i < num_objs; i++) {
+ string obj = str_int("obj", i);
+ string tag = str_int("tag", i);
+ string loc = str_int("loc", i);
+ index_prepare(mgr, ioctx, bucket_oid, CLS_RGW_OP_ADD, tag, obj, loc, RGW_BILOG_FLAG_VERSIONED_OP);
+ op = mgr.write_op();
+ rgw_bucket_dir_entry_meta meta;
+ meta.category = 0;
+ meta.size = obj_size;
+ index_complete(mgr, ioctx, bucket_oid, CLS_RGW_OP_ADD, tag, epoch, obj, meta, RGW_BILOG_FLAG_VERSIONED_OP);
+ }
+
+ ret = cls_rgw_bi_list(ioctx, bucket_oid, name, marker, num_objs + 10, &entries,
+ &is_truncated);
+ ASSERT_EQ(ret, 0);
+ if (cct->_conf->osd_max_omap_entries_per_request < num_objs) {
+ ASSERT_EQ(entries.size(), cct->_conf->osd_max_omap_entries_per_request);
+ } else {
+ ASSERT_EQ(entries.size(), num_objs);
+ }
+
+ uint64_t num_entries = 0;
+
+ is_truncated = true;
+ while(is_truncated) {
+ ret = cls_rgw_bi_list(ioctx, bucket_oid, name, marker, max, &entries,
+ &is_truncated);
+ ASSERT_EQ(ret, 0);
+ if (is_truncated) {
+ ASSERT_EQ(entries.size(), std::min(max, cct->_conf->osd_max_omap_entries_per_request));
+ } else {
+ ASSERT_EQ(entries.size(), num_objs - num_entries);
+ }
+ num_entries += entries.size();
+ marker = entries.back().idx;
+ }
+
+ ret = cls_rgw_bi_list(ioctx, bucket_oid, name, marker, max, &entries,
+ &is_truncated);
+ ASSERT_EQ(ret, 0);
+ ASSERT_EQ(entries.size(), 0);
+ ASSERT_EQ(is_truncated, false);
+
+ if (cct->_conf->osd_max_omap_entries_per_request < 15) {
+ num_entries = 0;
+ max = 15;
+ is_truncated = true;
+ marker.clear();
+ while(is_truncated) {
+ ret = cls_rgw_bi_list(ioctx, bucket_oid, name, marker, max, &entries,
+ &is_truncated);
+ ASSERT_EQ(ret, 0);
+ if (is_truncated) {
+ ASSERT_EQ(entries.size(), cct->_conf->osd_max_omap_entries_per_request);
+ } else {
+ ASSERT_EQ(entries.size(), num_objs - num_entries);
+ }
+ num_entries += entries.size();
+ marker = entries.back().idx;
+ }
+ }
+
+ ret = cls_rgw_bi_list(ioctx, bucket_oid, name, marker, max, &entries,
+ &is_truncated);
+ ASSERT_EQ(ret, 0);
+ ASSERT_EQ(entries.size(), 0);
+ ASSERT_EQ(is_truncated, false);
+}
+
/* test garbage collection */
static void create_obj(cls_rgw_obj& obj, int i, int j)
{
#include <sstream>
-TEST(util, unit_to_bytesize)
-{
- ASSERT_EQ(1234ll, unit_to_bytesize("1234", &cerr));
- ASSERT_EQ(1024ll, unit_to_bytesize("1K", &cerr));
- ASSERT_EQ(1024ll, unit_to_bytesize("1k", &cerr));
- ASSERT_EQ(1048576ll, unit_to_bytesize("1M", &cerr));
- ASSERT_EQ(1073741824ll, unit_to_bytesize("1G", &cerr));
- ASSERT_EQ(1099511627776ll, unit_to_bytesize("1T", &cerr));
- ASSERT_EQ(1125899906842624ll, unit_to_bytesize("1P", &cerr));
- ASSERT_EQ(1152921504606846976ll, unit_to_bytesize("1E", &cerr));
-
- ASSERT_EQ(65536ll, unit_to_bytesize(" 64K", &cerr));
-}
-
#if defined(__linux__)
TEST(util, collect_sys_info)
{
+++ /dev/null
-# cephdtest
-set(cephdtest_srcs
- test.cc)
-add_library(cephdtest STATIC ${cephdtest_srcs})
-set_target_properties(cephdtest PROPERTIES COMPILE_FLAGS ${UNITTEST_CXX_FLAGS})
-
-#Enable spdk
-if(HAVE_SPDK)
-link_directories("${CMAKE_SOURCE_DIR}/src/spdk/build/lib/")
-endif(HAVE_SPDK)
-
-# ceph_test_cephd_api_misc
-add_executable(ceph_test_cephd_api_misc
- misc.cc
- )
-set_target_properties(ceph_test_cephd_api_misc PROPERTIES COMPILE_FLAGS
- ${UNITTEST_CXX_FLAGS})
-target_link_libraries(ceph_test_cephd_api_misc
- cephd global ${UNITTEST_LIBS} cephdtest z snappy ceph_zstd)
-
-install(TARGETS
- ceph_test_cephd_api_misc
- DESTINATION ${CMAKE_INSTALL_BINDIR})
+++ /dev/null
-#include "gtest/gtest.h"
-#include "include/cephd/libcephd.h"
-
-TEST(LibCephdMiscVersion, Version) {
- int major, minor, extra;
- cephd_version(&major, &minor, &extra);
-}
+++ /dev/null
-void doNothing() {
-}
#include "librbd/image/RemoveRequest.cc"
template class librbd::image::RemoveRequest<librbd::MockImageCtx>;
+ACTION_P(TestFeatures, image_ctx) {
+ return ((image_ctx->features & arg0) != 0);
+}
+
+ACTION_P(ShutDownExclusiveLock, image_ctx) {
+ // shutting down exclusive lock will close object map and journal
+ image_ctx->exclusive_lock = nullptr;
+ image_ctx->object_map = nullptr;
+ image_ctx->journal = nullptr;
+}
+
namespace librbd {
namespace image {
_, _, _))
.WillOnce(Return(r));
}
+
+ void expect_test_features(MockImageCtx &mock_image_ctx) {
+ if (m_mock_imctx->exclusive_lock != nullptr) {
+ EXPECT_CALL(mock_image_ctx, test_features(_))
+ .WillRepeatedly(TestFeatures(&mock_image_ctx));
+ }
+ }
+
+ void expect_set_journal_policy(MockImageCtx &mock_image_ctx) {
+ if (m_test_imctx->test_features(RBD_FEATURE_JOURNALING)) {
+ EXPECT_CALL(mock_image_ctx, set_journal_policy(_))
+ .WillOnce(Invoke([](journal::Policy* policy) {
+ ASSERT_TRUE(policy->journal_disabled());
+ delete policy;
+ }));
+ }
+ }
+
+ void expect_shut_down_exclusive_lock(MockImageCtx &mock_image_ctx,
+ MockExclusiveLock &mock_exclusive_lock,
+ int r) {
+ if (m_mock_imctx->exclusive_lock != nullptr) {
+ EXPECT_CALL(mock_exclusive_lock, shut_down(_))
+ .WillOnce(DoAll(ShutDownExclusiveLock(&mock_image_ctx),
+ CompleteContext(r, mock_image_ctx.image_ctx->op_work_queue)));
+ }
+ }
+
};
TEST_F(TestMockImageRemoveRequest, SuccessV1) {
REQUIRE_FEATURE(RBD_FEATURE_JOURNALING);
TestImageRemoveSetUp();
+ MockExclusiveLock *mock_exclusive_lock = nullptr;
+ if (m_test_imctx->test_features(RBD_FEATURE_EXCLUSIVE_LOCK)) {
+ mock_exclusive_lock = new MockExclusiveLock();
+ m_mock_imctx->exclusive_lock = mock_exclusive_lock;
+ }
+
C_SaferCond ctx;
librbd::NoOpProgressContext no_op;
ContextWQ op_work_queue;
InSequence seq;
expect_state_open(*m_mock_imctx, 0);
+
+ expect_test_features(*m_mock_imctx);
+ expect_set_journal_policy(*m_mock_imctx);
+ expect_shut_down_exclusive_lock(*m_mock_imctx, *mock_exclusive_lock, 0);
+
expect_mirror_image_get(*m_mock_imctx, 0);
expect_get_group(*m_mock_imctx, 0);
expect_trim(*m_mock_imctx, mock_trim_request, 0);
expect_remove_mirror_image(m_ioctx, 0);
expect_dir_remove_image(m_ioctx, 0);
- MockRemoveRequest *req = MockRemoveRequest::create(m_ioctx, m_image_name, "",
- true, false, no_op, &op_work_queue, &ctx);
+ MockRemoveRequest *req = MockRemoveRequest::create(
+ m_ioctx, m_image_name, "", true, false, no_op, &op_work_queue, &ctx);
req->send();
ASSERT_EQ(0, ctx.wait());
REQUIRE_FEATURE(RBD_FEATURE_JOURNALING);
TestImageRemoveSetUp();
+ MockExclusiveLock *mock_exclusive_lock = nullptr;
+ if (m_test_imctx->test_features(RBD_FEATURE_EXCLUSIVE_LOCK)) {
+ mock_exclusive_lock = new MockExclusiveLock();
+ m_mock_imctx->exclusive_lock = mock_exclusive_lock;
+ }
+
C_SaferCond ctx;
librbd::NoOpProgressContext no_op;
ContextWQ op_work_queue;
InSequence seq;
expect_state_open(*m_mock_imctx, 0);
+
+ expect_test_features(*m_mock_imctx);
+ expect_set_journal_policy(*m_mock_imctx);
+ expect_shut_down_exclusive_lock(*m_mock_imctx, *mock_exclusive_lock, 0);
+
expect_mirror_image_get(*m_mock_imctx, 0);
expect_get_group(*m_mock_imctx, 0);
expect_trim(*m_mock_imctx, mock_trim_request, 0);
expect_remove_mirror_image(m_ioctx, 0);
expect_dir_remove_image(m_ioctx, -ENOENT);
- MockRemoveRequest *req = MockRemoveRequest::create(m_ioctx, m_image_name, "",
- true, false, no_op, &op_work_queue, &ctx);
+ MockRemoveRequest *req = MockRemoveRequest::create(
+ m_ioctx, m_image_name, "", true, false, no_op, &op_work_queue, &ctx);
req->send();
ASSERT_EQ(-ENOENT, ctx.wait());
#include "test/librbd/test_support.h"
#include "librbd/ExclusiveLock.h"
#include "librbd/ImageCtx.h"
+#include "librbd/ImageState.h"
#include "librbd/ImageWatcher.h"
#include "librbd/internal.h"
#include "librbd/ObjectMap.h"
ASSERT_TRUE(flags_set);
}
-TEST_F(TestObjectMap, InvalidateFlagInMemoryOnly) {
+TEST_F(TestObjectMap, AcquireLockInvalidatesWhenTooSmall) {
REQUIRE_FEATURE(RBD_FEATURE_OBJECT_MAP);
librbd::ImageCtx *ictx;
ASSERT_EQ(0, ictx->test_flags(RBD_FLAG_OBJECT_MAP_INVALID, &flags_set));
ASSERT_FALSE(flags_set);
+ librados::ObjectWriteOperation op;
+ librbd::cls_client::object_map_resize(&op, 0, OBJECT_NONEXISTENT);
+
std::string oid = librbd::ObjectMap<>::object_map_name(ictx->id, CEPH_NOSNAP);
- bufferlist valid_bl;
- ASSERT_LT(0, ictx->md_ctx.read(oid, valid_bl, 0, 0));
+ ASSERT_EQ(0, ictx->md_ctx.operate(oid, &op));
- bufferlist corrupt_bl;
- corrupt_bl.append("corrupt");
- ASSERT_EQ(0, ictx->md_ctx.write_full(oid, corrupt_bl));
+ C_SaferCond lock_ctx;
+ {
+ RWLock::WLocker owner_locker(ictx->owner_lock);
+ ictx->exclusive_lock->try_acquire_lock(&lock_ctx);
+ }
+ ASSERT_EQ(0, lock_ctx.wait());
- ASSERT_EQ(0, when_open_object_map(ictx));
ASSERT_EQ(0, ictx->test_flags(RBD_FLAG_OBJECT_MAP_INVALID, &flags_set));
ASSERT_TRUE(flags_set);
- ASSERT_EQ(0, ictx->md_ctx.write_full(oid, valid_bl));
- ASSERT_EQ(0, open_image(m_image_name, &ictx));
- ASSERT_EQ(0, ictx->test_flags(RBD_FLAG_OBJECT_MAP_INVALID, &flags_set));
- ASSERT_FALSE(flags_set);
+ // Test the flag is stored on disk
+ ASSERT_EQ(0, ictx->state->refresh());
+ ASSERT_EQ(0, ictx->test_flags(RBD_FLAG_OBJECT_MAP_INVALID,
+ &flags_set));
+ ASSERT_TRUE(flags_set);
}
-
pool.type = pg_pool_t::TYPE_REPLICATED;
PGMap::dump_object_stat_sum(tbl, nullptr, sum, avail,
pool.get_size(), verbose, &pool);
- ASSERT_EQ(stringify(si_t(sum.num_bytes)), tbl.get(0, 0));
+ ASSERT_EQ(stringify(byte_u_t(sum.num_bytes)), tbl.get(0, 0));
float copies_rate =
(static_cast<float>(sum.num_object_copies - sum.num_objects_degraded) /
sum.num_object_copies);
float used_bytes = sum.num_bytes * copies_rate * pool.get_size();
float used_percent = used_bytes / (used_bytes + avail) * 100;
unsigned col = 0;
- ASSERT_EQ(stringify(si_t(sum.num_bytes)), tbl.get(0, col++));
+ ASSERT_EQ(stringify(byte_u_t(sum.num_bytes)), tbl.get(0, col++));
ASSERT_EQ(percentify(used_percent), tbl.get(0, col++));
- ASSERT_EQ(stringify(si_t(avail/pool.size)), tbl.get(0, col++));
+ ASSERT_EQ(stringify(byte_u_t(avail/pool.size)), tbl.get(0, col++));
ASSERT_EQ(stringify(sum.num_objects), tbl.get(0, col++));
- ASSERT_EQ(stringify(si_t(sum.num_objects_dirty)), tbl.get(0, col++));
- ASSERT_EQ(stringify(si_t(sum.num_rd)), tbl.get(0, col++));
- ASSERT_EQ(stringify(si_t(sum.num_wr)), tbl.get(0, col++));
+ ASSERT_EQ(stringify(si_u_t(sum.num_objects_dirty)), tbl.get(0, col++));
+ ASSERT_EQ(stringify(byte_u_t(sum.num_rd)), tbl.get(0, col++));
+ ASSERT_EQ(stringify(byte_u_t(sum.num_wr)), tbl.get(0, col++));
// we can use pool.size for raw_used_rate if it is a replica pool
uint64_t raw_bytes_used = sum.num_bytes * pool.get_size() * copies_rate;
- ASSERT_EQ(stringify(si_t(raw_bytes_used)), tbl.get(0, col++));
+ ASSERT_EQ(stringify(byte_u_t(raw_bytes_used)), tbl.get(0, col++));
}
// with table, without formatter, verbose = true, empty, avail > 0
pool.type = pg_pool_t::TYPE_REPLICATED;
PGMap::dump_object_stat_sum(tbl, nullptr, sum, avail,
pool.get_size(), verbose, &pool);
- ASSERT_EQ(stringify(si_t(0)), tbl.get(0, 0));
+ ASSERT_EQ(stringify(byte_u_t(0)), tbl.get(0, 0));
unsigned col = 0;
- ASSERT_EQ(stringify(si_t(0)), tbl.get(0, col++));
+ ASSERT_EQ(stringify(byte_u_t(0)), tbl.get(0, col++));
ASSERT_EQ(percentify(0), tbl.get(0, col++));
- ASSERT_EQ(stringify(si_t(avail/pool.size)), tbl.get(0, col++));
+ ASSERT_EQ(stringify(byte_u_t(avail/pool.size)), tbl.get(0, col++));
ASSERT_EQ(stringify(0), tbl.get(0, col++));
- ASSERT_EQ(stringify(si_t(0)), tbl.get(0, col++));
- ASSERT_EQ(stringify(si_t(0)), tbl.get(0, col++));
- ASSERT_EQ(stringify(si_t(0)), tbl.get(0, col++));
- ASSERT_EQ(stringify(si_t(0)), tbl.get(0, col++));
+ ASSERT_EQ(stringify(si_u_t(0)), tbl.get(0, col++));
+ ASSERT_EQ(stringify(byte_u_t(0)), tbl.get(0, col++));
+ ASSERT_EQ(stringify(byte_u_t(0)), tbl.get(0, col++));
+ ASSERT_EQ(stringify(byte_u_t(0)), tbl.get(0, col++));
}
// with table, without formatter, verbose = false, empty, avail = 0
PGMap::dump_object_stat_sum(tbl, nullptr, sum, avail,
pool.get_size(), verbose, &pool);
- ASSERT_EQ(stringify(si_t(0)), tbl.get(0, 0));
+ ASSERT_EQ(stringify(byte_u_t(0)), tbl.get(0, 0));
unsigned col = 0;
- ASSERT_EQ(stringify(si_t(0)), tbl.get(0, col++));
+ ASSERT_EQ(stringify(byte_u_t(0)), tbl.get(0, col++));
ASSERT_EQ(percentify(0), tbl.get(0, col++));
- ASSERT_EQ(stringify(si_t(avail/pool.size)), tbl.get(0, col++));
+ ASSERT_EQ(stringify(byte_u_t(avail/pool.size)), tbl.get(0, col++));
ASSERT_EQ(stringify(0), tbl.get(0, col++));
}
dout(0) << __func__
<< " written: " << m_stats_total_written
<< " duration: " << duration << " sec"
- << " bandwidth: " << prettybyte_t(throughput) << "/s"
+ << " bandwidth: " << byte_u_t(throughput) << "/s"
<< " iops: " << tx_throughput << "/s"
<< dendl;
bool byte_units::parse(const std::string &val, std::string *err)
{
- v = strict_sistrtoll(val.c_str(), err);
+ v = strict_iecstrtoll(val.c_str(), err);
return err->empty();
}
ASSERT_EQ(sd("{\"test_perfcounter_1\":{\"element1\":13,\"element2\":0.000000000,"
"\"element3\":{\"avgcount\":0,\"sum\":0.000000000,\"avgtime\":0.000000000}}}"), msg);
ASSERT_EQ("", client.do_request("{ \"prefix\": \"perf schema\", \"format\": \"json\" }", &msg));
- ASSERT_EQ(sd("{\"test_perfcounter_1\":{\"element1\":{\"type\":2,\"metric_type\":\"gauge\",\"value_type\":\"integer\",\"description\":\"\",\"nick\":\"\",\"priority\":0},\"element2\":{\"type\":1,\"metric_type\":\"gauge\",\"value_type\":\"real\",\"description\":\"\",\"nick\":\"\",\"priority\":0},\"element3\":{\"type\":5,\"metric_type\":\"gauge\",\"value_type\":\"real-integer-pair\",\"description\":\"\",\"nick\":\"\",\"priority\":0}}}"), msg);
+ ASSERT_EQ(sd("{\"test_perfcounter_1\":{\"element1\":{\"type\":2,\"metric_type\":\"gauge\",\"value_type\":\"integer\",\"description\":\"\",\"nick\":\"\",\"priority\":0,\"units\":\"none\"},\"element2\":{\"type\":1,\"metric_type\":\"gauge\",\"value_type\":\"real\",\"description\":\"\",\"nick\":\"\",\"priority\":0,\"units\":\"none\"},\"element3\":{\"type\":5,\"metric_type\":\"gauge\",\"value_type\":\"real-integer-pair\",\"description\":\"\",\"nick\":\"\",\"priority\":0,\"units\":\"none\"}}}"), msg);
coll->clear();
ASSERT_EQ("", client.do_request("{ \"prefix\": \"perf dump\", \"format\": \"json\" }", &msg));
ASSERT_EQ("{}", msg);
import re
import sys
import json
-from StringIO import StringIO
+try:
+ from StringIO import StringIO
+except ImportError:
+ from io import StringIO
def get_command_descriptions(what):
CEPH_BIN = os.environ['CEPH_BIN']
*/
#include "common/strtol.h"
+#include <math.h>
#include <string>
#include <map>
#include "gtest/gtest.h"
-static void test_strict_strtoll(const char *str, long long expected)
+static void test_strict_strtoll(const char *str, long long expected, int base)
{
std::string err;
- long long val = strict_strtoll(str, 10, &err);
+ long long val = strict_strtoll(str, base, &err);
if (!err.empty()) {
ASSERT_EQ(err, "");
}
}
TEST(StrToL, Simple1) {
- test_strict_strtoll("123", 123);
- test_strict_strtoll("0", 0);
- test_strict_strtoll("-123", -123);
- test_strict_strtoll("8796093022208", 8796093022208LL);
- test_strict_strtoll("-8796093022208", -8796093022208LL);
+ test_strict_strtoll("123", 123, 10);
+ test_strict_strtoll("0", 0, 10);
+ test_strict_strtoll("-123", -123, 10);
+ test_strict_strtoll("8796093022208", 8796093022208LL, 10);
+ test_strict_strtoll("-8796093022208", -8796093022208LL, 10);
+ test_strict_strtoll("123", 123, 0);
+ test_strict_strtoll("0x7b", 123, 0);
+ test_strict_strtoll("4d2", 1234, 16);
test_strict_strtol("208", 208);
test_strict_strtol("-4", -4);
test_strict_strtoll_err("604462909807314587353088"); // overflow
test_strict_strtoll_err("aw shucks"); // invalid
test_strict_strtoll_err("343245 aw shucks"); // invalid chars at end
+ test_strict_strtoll_err("-"); // invalid
test_strict_strtol_err("35 aw shucks"); // invalid chars at end
test_strict_strtol_err("--0");
+ test_strict_strtol_err("-");
test_strict_strtod_err("345345.0-");
test_strict_strtod_err("34.0 garbo");
}
+static void test_strict_iecstrtoll(const char *str)
+{
+ std::string err;
+ strict_iecstrtoll(str, &err);
+ ASSERT_EQ(err, "");
+}
+
+static void test_strict_iecstrtoll_units(const std::string& foo,
+ std::string u, const int m)
+{
+ std::string s(foo);
+ s.append(u);
+ const char *str = s.c_str();
+ std::string err;
+ uint64_t r = strict_iecstrtoll(str, &err);
+ ASSERT_EQ(err, "");
+
+ str = foo.c_str();
+ std::string err2;
+ long long tmp = strict_strtoll(str, 10, &err2);
+ ASSERT_EQ(err2, "");
+ tmp = (tmp << m);
+ ASSERT_EQ(tmp, (long long)r);
+}
+
+TEST(IECStrToLL, WithUnits) {
+ std::map<std::string,int> units;
+ units["B"] = 0;
+ units["K"] = 10;
+ units["M"] = 20;
+ units["G"] = 30;
+ units["T"] = 40;
+ units["P"] = 50;
+ units["E"] = 60;
+ units["Ki"] = 10;
+ units["Mi"] = 20;
+ units["Gi"] = 30;
+ units["Ti"] = 40;
+ units["Pi"] = 50;
+ units["Ei"] = 60;
+
+ for (std::map<std::string,int>::iterator p = units.begin();
+ p != units.end(); ++p) {
+ // the upper bound of uint64_t is 2^64 = 4E
+ test_strict_iecstrtoll_units("4", p->first, p->second);
+ test_strict_iecstrtoll_units("1", p->first, p->second);
+ test_strict_iecstrtoll_units("0", p->first, p->second);
+ }
+}
+
+TEST(IECStrToLL, WithoutUnits) {
+ test_strict_iecstrtoll("1024");
+ test_strict_iecstrtoll("1152921504606846976");
+ test_strict_iecstrtoll("0");
+}
+
+static void test_strict_iecstrtoll_err(const char *str)
+{
+ std::string err;
+ strict_iecstrtoll(str, &err);
+ ASSERT_NE(err, "");
+}
+
+TEST(IECStrToLL, Error) {
+ test_strict_iecstrtoll_err("1024F");
+ test_strict_iecstrtoll_err("QDDSA");
+ test_strict_iecstrtoll_err("1b");
+ test_strict_iecstrtoll_err("100k");
+ test_strict_iecstrtoll_err("1000m");
+ test_strict_iecstrtoll_err("1g");
+ test_strict_iecstrtoll_err("20t");
+ test_strict_iecstrtoll_err("100p");
+ test_strict_iecstrtoll_err("1000e");
+ test_strict_iecstrtoll_err("B");
+ test_strict_iecstrtoll_err("M");
+ test_strict_iecstrtoll_err("BM");
+ test_strict_iecstrtoll_err("B0wef");
+ test_strict_iecstrtoll_err("0m");
+ test_strict_iecstrtoll_err("-1"); // it returns uint64_t
+ test_strict_iecstrtoll_err("-1K");
+ test_strict_iecstrtoll_err("1Bi");
+ test_strict_iecstrtoll_err("Bi");
+ test_strict_iecstrtoll_err("bi");
+ test_strict_iecstrtoll_err("gi");
+ test_strict_iecstrtoll_err("100ki");
+ test_strict_iecstrtoll_err("1000mi");
+ test_strict_iecstrtoll_err("1gi");
+ test_strict_iecstrtoll_err("20ti");
+ test_strict_iecstrtoll_err("100pi");
+ test_strict_iecstrtoll_err("1000ei");
+ // the upper bound of uint64_t is 2^64 = 4E, so 1024E overflows
+ test_strict_iecstrtoll_err("1024E"); // overflows after adding the suffix
+}
+
+// since strict_iecstrtoll is an alias of strict_iec_cast<uint64_t>(), quite a few
+// of cases are covered by existing test cases of strict_iecstrtoll already.
+TEST(StrictIECCast, Error) {
+ {
+ std::string err;
+ // the SI prefix is way too large for `int`.
+ (void)strict_iec_cast<int>("2E", &err);
+ ASSERT_NE(err, "");
+ }
+ {
+ std::string err;
+ (void)strict_iec_cast<int>("-2E", &err);
+ ASSERT_NE(err, "");
+ }
+ {
+ std::string err;
+ (void)strict_iec_cast<int>("1T", &err);
+ ASSERT_NE(err, "");
+ }
+ {
+ std::string err;
+ (void)strict_iec_cast<int64_t>("2E", &err);
+ ASSERT_EQ(err, "");
+ }
+ {
+ std::string err;
+ (void)strict_iec_cast<int64_t>("-2E", &err);
+ ASSERT_EQ(err, "");
+ }
+ {
+ std::string err;
+ (void)strict_iec_cast<int64_t>("1T", &err);
+ ASSERT_EQ(err, "");
+ }
+}
+
+
static void test_strict_sistrtoll(const char *str)
{
std::string err;
}
static void test_strict_sistrtoll_units(const std::string& foo,
- char u, const int m)
+ std::string u, const long long m)
{
std::string s(foo);
- s.push_back(u);
+ s.append(u);
const char *str = s.c_str();
std::string err;
uint64_t r = strict_sistrtoll(str, &err);
std::string err2;
long long tmp = strict_strtoll(str, 10, &err2);
ASSERT_EQ(err2, "");
- tmp = (tmp << m);
+ tmp = (tmp * m);
ASSERT_EQ(tmp, (long long)r);
}
TEST(SIStrToLL, WithUnits) {
- std::map<char,int> units;
- units['B'] = 0;
- units['K'] = 10;
- units['M'] = 20;
- units['G'] = 30;
- units['T'] = 40;
- units['P'] = 50;
- units['E'] = 60;
-
- for (std::map<char,int>::iterator p = units.begin();
+ std::map<std::string,long long> units;
+ units["K"] = pow(10, 3);
+ units["M"] = pow(10, 6);
+ units["G"] = pow(10, 9);
+ units["T"] = pow(10, 12);
+ units["P"] = pow(10, 15);
+ units["E"] = pow(10, 18);
+
+ for (std::map<std::string,long long>::iterator p = units.begin();
p != units.end(); ++p) {
// the upper bound of uint64_t is 2^64 = 4E
test_strict_sistrtoll_units("4", p->first, p->second);
test_strict_sistrtoll_err("0m");
test_strict_sistrtoll_err("-1"); // it returns uint64_t
test_strict_sistrtoll_err("-1K");
+ test_strict_sistrtoll_err("1Bi");
+ test_strict_sistrtoll_err("Bi");
+ test_strict_sistrtoll_err("bi");
+ test_strict_sistrtoll_err("gi");
+ test_strict_sistrtoll_err("100ki");
+ test_strict_sistrtoll_err("1000mi");
+ test_strict_sistrtoll_err("1gi");
+ test_strict_sistrtoll_err("20ti");
+ test_strict_sistrtoll_err("100pi");
+ test_strict_sistrtoll_err("1000ei");
+ test_strict_sistrtoll_err("1B");
// the upper bound of uint64_t is 2^64 = 4E, so 1024E overflows
test_strict_sistrtoll_err("1024E"); // overflows after adding the suffix
}
utime_t cur_duration = ceph_clock_now() - started_at;
std::cout << "ts = " << cur_duration << "s, copied " << total_keys
- << " keys so far (" << stringify(si_t(total_size)) << ")"
+ << " keys so far (" << stringify(si_u_t(total_size)) << ")"
<< std::endl;
} while (it->valid());
std::cout << "summary:" << std::endl;
std::cout << " copied " << total_keys << " keys" << std::endl;
std::cout << " used " << total_txs << " transactions" << std::endl;
- std::cout << " total size " << stringify(si_t(total_size)) << std::endl;
+ std::cout << " total size " << stringify(si_u_t(total_size)) << std::endl;
std::cout << " from '" << store_path << "' to '" << other_path << "'"
<< std::endl;
std::cout << " duration " << time_taken << " seconds" << std::endl;
return 1;
}
std::cout << "(" << url_escape(prefix) << "," << url_escape(key)
- << ") size " << si_t(bl.length()) << std::endl;
+ << ") size " << si_u_t(bl.length()) << std::endl;
} else if (cmd == "set") {
if (argc < 8) {
out_store.apply_transaction(tx);
std::cout << "copied " << total_keys << " keys so far ("
- << stringify(si_t(total_size)) << ")" << std::endl;
+ << stringify(byte_u_t(total_size)) << ")" << std::endl;
} while (it->valid());
out_store.close();
std::cout << "summary: copied " << total_keys << " keys, using "
<< total_tx << " transactions, totalling "
- << stringify(si_t(total_size)) << std::endl;
+ << stringify(byte_u_t(total_size)) << std::endl;
std::cout << "from '" << store_path << "' to '" << out_path << "'"
<< std::endl;
} else if (cmd == "rewrite-crush") {
return r;
}
+int corrupt_info(ObjectStore *store, coll_t coll, ghobject_t &ghobj, Formatter* formatter,
+ ObjectStore::Sequencer &osr)
+{
+ bufferlist attr;
+ int r = store->getattr(coll, ghobj, OI_ATTR, attr);
+ if (r < 0) {
+ cerr << "Error getting attr on : " << make_pair(coll, ghobj) << ", "
+ << cpp_strerror(r) << std::endl;
+ return r;
+ }
+ object_info_t oi;
+ bufferlist::iterator bp = attr.begin();
+ try {
+ ::decode(oi, bp);
+ } catch (...) {
+ r = -EINVAL;
+ cerr << "Error getting attr on : " << make_pair(coll, ghobj) << ", "
+ << cpp_strerror(r) << std::endl;
+ return r;
+ }
+ cout << "Corrupting info" << std::endl;
+ if (!dry_run) {
+ attr.clear();
+ oi.alloc_hint_flags += 0xff;
+ ObjectStore::Transaction t;
+ ::encode(oi, attr, -1); /* fixme: using full features */
+ t.setattr(coll, ghobj, OI_ATTR, attr);
+ r = store->apply_transaction(&osr, std::move(t));
+ if (r < 0) {
+ cerr << "Error writing object info: " << make_pair(coll, ghobj) << ", "
+ << cpp_strerror(r) << std::endl;
+ return r;
+ }
+ }
+ return 0;
+}
+
int set_size(ObjectStore *store, coll_t coll, ghobject_t &ghobj, uint64_t setsize, Formatter* formatter,
ObjectStore::Sequencer &osr, bool corrupt)
{
}
int apply_layout_settings(ObjectStore *os, const OSDSuperblock &superblock,
- const string &pool_name, const spg_t &pgid, bool dry_run)
+ const string &pool_name, const spg_t &pgid, bool dry_run,
+ int target_level)
{
int r = 0;
cerr << "Would apply layout settings to " << coll << std::endl;
} else {
cerr << "Finished " << done << "/" << total << " collections" << "\r";
- r = fs->apply_layout_settings(coll);
+ r = fs->apply_layout_settings(coll, target_level);
if (r < 0) {
cerr << "Error applying layout settings to " << coll << std::endl;
return r;
positional.add_options()
("object", po::value<string>(&object), "'' for pgmeta_oid, object name or ghobject in json")
("objcmd", po::value<string>(&objcmd), "command [(get|set)-bytes, (get|set|rm)-(attr|omap), (get|set)-omaphdr, list-attrs, list-omap, remove]")
- ("arg1", po::value<string>(&arg1), "arg1 based on cmd")
+ ("arg1", po::value<string>(&arg1), "arg1 based on cmd, "
+ "for apply-layout-settings: target hash level split to")
("arg2", po::value<string>(&arg2), "arg2 based on cmd")
;
}
if (op == "apply-layout-settings") {
- ret = apply_layout_settings(fs, superblock, pool, pgid, dry_run);
+ int target_level = 0;
+ if (vm.count("arg1") && isdigit(arg1[0])) {
+ target_level = atoi(arg1.c_str());
+ }
+ ret = apply_layout_settings(fs, superblock, pool, pgid, dry_run, target_level);
goto out;
}
}
ret = print_obj_info(fs, coll, ghobj, formatter);
goto out;
+ } else if (objcmd == "corrupt-info") { // Undocumented testing feature
+ // There should not be any other arguments
+ if (vm.count("arg1") || vm.count("arg2")) {
+ usage(desc);
+ ret = 1;
+ goto out;
+ }
+ ret = corrupt_info(fs, coll, ghobj, formatter, *osr);
+ goto out;
} else if (objcmd == "set-size" || objcmd == "corrupt-size") {
// Undocumented testing feature
bool corrupt = (objcmd == "corrupt-size");
{
uint32_t const object_size = h.layout.object_size;
assert(object_size > 0);
- uint64_t const last_obj = h.write_pos / object_size;
- uint64_t const purge_count = 2;
+ uint64_t last_obj = h.write_pos / object_size;
+ uint64_t purge_count = 2;
+ /* When the length is zero, the last_obj should be zeroed
+ * from the offset determined by the new write_pos instead of being purged.
+ */
+ if (!len) {
+ purge_count = 1;
+ ++last_obj;
+ }
C_SaferCond purge_cond;
cout << "Purging " << purge_count << " objects from " << last_obj << std::endl;
lock.Lock();
lock.Unlock();
purge_cond.wait();
}
+ /* When the length is zero, zero the last object
+ * from the offset determined by the new write_pos.
+ */
+ if (!len) {
+ uint64_t offset_in_obj = h.write_pos % h.layout.object_size;
+ uint64_t len = h.layout.object_size - offset_in_obj;
+ C_SaferCond zero_cond;
+ cout << "Zeroing " << len << " bytes in the last object." << std::endl;
+
+ lock.Lock();
+ filer.zero(ino, &h.layout, snapc, h.write_pos, len, ceph::real_clock::now(), 0, &zero_cond);
+ lock.Unlock();
+ zero_cond.wait();
+ }
// Stream from `fd` to `filer`
uint64_t pos = start;
template <typename I, typename T>
static int rados_sistrtoll(I &i, T *val) {
std::string err;
- *val = strict_sistrtoll(i->second.c_str(), &err);
+ *val = strict_iecstrtoll(i->second.c_str(), &err);
if (err != "") {
cerr << "Invalid value for " << i->first << ": " << err << std::endl;
return -EINVAL;
librados::pool_stat_t& s = i->second;
if (!formatter) {
tab << pool_name
- << si_t(s.num_bytes)
+ << byte_u_t(s.num_bytes)
<< s.num_objects
<< s.num_object_clones
<< s.num_object_copies
<< s.num_objects_unfound
<< s.num_objects_degraded
<< s.num_rd
- << si_t(s.num_rd_kb << 10)
+ << byte_u_t(s.num_rd_kb << 10)
<< s.num_wr
- << si_t(s.num_wr_kb << 10)
+ << byte_u_t(s.num_wr_kb << 10)
<< TextTable::endrow;
} else {
formatter->open_object_section("pool");
cout << std::endl;
cout << "total_objects " << tstats.num_objects
<< std::endl;
- cout << "total_used " << si_t(tstats.kb_used << 10)
+ cout << "total_used " << byte_u_t(tstats.kb_used << 10)
<< std::endl;
- cout << "total_avail " << si_t(tstats.kb_avail << 10)
+ cout << "total_avail " << byte_u_t(tstats.kb_avail << 10)
<< std::endl;
- cout << "total_space " << si_t(tstats.kb << 10)
+ cout << "total_space " << byte_u_t(tstats.kb << 10)
<< std::endl;
} else {
formatter->close_section();
const std::string &s = po::validators::get_single_string(values);
std::string parse_error;
- uint64_t size = strict_sistrtoll(s.c_str(), &parse_error);
+ uint64_t size = strict_iecstrtoll(s.c_str(), &parse_error);
if (!parse_error.empty()) {
throw po::validation_error(po::validation_error::invalid_option_value);
}
const std::string &s = po::validators::get_single_string(values);
std::string parse_error;
- uint64_t objectsize = strict_sistrtoll(s.c_str(), &parse_error);
+ uint64_t objectsize = strict_iecstrtoll(s.c_str(), &parse_error);
if (!parse_error.empty()) {
throw po::validation_error(po::validation_error::invalid_option_value);
}
const std::string &s = po::validators::get_single_string(values);
std::string parse_error;
- uint64_t size = strict_sistrtoll(s.c_str(), &parse_error);
+ uint64_t size = strict_iecstrtoll(s.c_str(), &parse_error);
if (parse_error.empty() && (size >= (1 << 12))) {
v = boost::any(size);
return;
const std::string &s = po::validators::get_single_string(values);
std::string parse_error;
- uint64_t format = strict_sistrtoll(s.c_str(), &parse_error);
+ uint64_t format = strict_iecstrtoll(s.c_str(), &parse_error);
if (!parse_error.empty() || (format != 1 && format != 2)) {
throw po::validation_error(po::validation_error::invalid_option_value);
}
const std::string &s = po::validators::get_single_string(values);
std::string parse_error;
- uint64_t size = strict_sistrtoll(s.c_str(), &parse_error);
+ uint64_t size = strict_iecstrtoll(s.c_str(), &parse_error);
if (!parse_error.empty()) {
throw po::validation_error(po::validation_error::invalid_option_value);
}
uint64_t size = 0;
image.size(&size);
if (io_size > size) {
- std::cerr << "rbd: io-size " << prettybyte_t(io_size) << " "
- << "larger than image size " << prettybyte_t(size) << std::endl;
+ std::cerr << "rbd: io-size " << byte_u_t(io_size) << " "
+ << "larger than image size " << byte_u_t(size) << std::endl;
return -EINVAL;
}
full_name += "@" + snap_name;
}
tbl << full_name
- << stringify(si_t(size))
- << stringify(si_t(*used_size))
+ << stringify(byte_u_t(size))
+ << stringify(byte_u_t(*used_size))
<< TextTable::endrow;
}
return 0;
} else {
if (count > 1) {
tbl << "<TOTAL>"
- << stringify(si_t(total_prov))
- << stringify(si_t(total_used))
+ << stringify(byte_u_t(total_prov))
+ << stringify(byte_u_t(total_used))
<< TextTable::endrow;
}
std::cout << tbl;
f->dump_int("format", (old_format ? 1 : 2));
} else {
std::cout << "rbd image '" << (imgname.empty() ? imgid : imgname) << "':\n"
- << "\tsize " << prettybyte_t(info.size) << " in "
+ << "\tsize " << byte_u_t(info.size) << " in "
<< info.num_objs << " objects"
<< std::endl
<< "\torder " << info.order
- << " (" << prettybyte_t(info.obj_size) << " objects)"
+ << " (" << byte_u_t(info.obj_size) << " objects)"
<< std::endl;
if (!data_pool.empty()) {
std::cout << "\tdata_pool: " << data_pool << std::endl;
std::cout << " (trash " << parent_id << ")";
}
std::cout << std::endl;
- std::cout << "\toverlap: " << prettybyte_t(overlap) << std::endl;
+ std::cout << "\toverlap: " << byte_u_t(overlap) << std::endl;
}
}
f->dump_unsigned("stripe_unit", image.get_stripe_unit());
f->dump_unsigned("stripe_count", image.get_stripe_count());
} else {
- std::cout << "\tstripe unit: " << prettybyte_t(image.get_stripe_unit())
+ std::cout << "\tstripe unit: " << byte_u_t(image.get_stripe_unit())
<< std::endl
<< "\tstripe count: " << image.get_stripe_count() << std::endl;
}
std::cout << "\theader_oid: " << header_oid << std::endl;
std::cout << "\tobject_oid_prefix: " << object_oid_prefix << std::endl;
std::cout << "\torder: " << static_cast<int>(order) << " ("
- << prettybyte_t(1ull << order) << " objects)"<< std::endl;
+ << byte_u_t(1ull << order) << " objects)"<< std::endl;
std::cout << "\tsplay_width: " << static_cast<int>(splay_width) << std::endl;
if (!object_pool_name.empty()) {
std::cout << "\tobject_pool: " << object_pool_name << std::endl;
f->close_section();
} else {
tbl << w->name
- << stringify(si_t(info.size))
+ << stringify(byte_u_t(info.size))
<< parent
<< ((old_format) ? '1' : '2')
<< "" // protect doesn't apply to images
f->close_section();
} else {
tbl << w->name + "@" + s->name
- << stringify(si_t(s->size))
+ << stringify(byte_u_t(s->size))
<< parent
<< ((old_format) ? '1' : '2')
<< (is_protected ? "yes" : "")
f->dump_string("timestamp", tt_str);
f->close_section();
} else {
- t << s->id << s->name << stringify(prettybyte_t(s->size)) << tt_str
+ t << s->id << s->name << stringify(byte_u_t(s->size)) << tt_str
<< TextTable::endrow;
}
}
if (info.size > ULONG_MAX) {
r = -EFBIG;
- cerr << "rbd-nbd: image is too large (" << prettybyte_t(info.size)
- << ", max is " << prettybyte_t(ULONG_MAX) << ")" << std::endl;
+ cerr << "rbd-nbd: image is too large (" << byte_u_t(info.size)
+ << ", max is " << byte_u_t(ULONG_MAX) << ")" << std::endl;
goto close_nbd;
}
[Unit]
Description=Map RBD devices
-After=network-online.target local-fs.target
-Wants=network-online.target local-fs.target
+After=network-online.target
+Before=remote-fs-pre.target
+Wants=network-online.target remote-fs-pre.target
[Service]
EnvironmentFile=-/etc/sysconfig/ceph